Diffstat (limited to 'sys/contrib/openzfs/module')
-rw-r--r--sys/contrib/openzfs/module/.gitignore26
-rw-r--r--sys/contrib/openzfs/module/Kbuild.in47
-rw-r--r--sys/contrib/openzfs/module/Makefile.bsd368
-rw-r--r--sys/contrib/openzfs/module/Makefile.in135
-rw-r--r--sys/contrib/openzfs/module/avl/Makefile.in10
-rw-r--r--sys/contrib/openzfs/module/avl/avl.c1093
-rw-r--r--sys/contrib/openzfs/module/icp/Makefile.in96
-rw-r--r--sys/contrib/openzfs/module/icp/algs/aes/aes_impl.c443
-rw-r--r--sys/contrib/openzfs/module/icp/algs/aes/aes_impl_aesni.c124
-rw-r--r--sys/contrib/openzfs/module/icp/algs/aes/aes_impl_generic.c1242
-rw-r--r--sys/contrib/openzfs/module/icp/algs/aes/aes_impl_x86-64.c63
-rw-r--r--sys/contrib/openzfs/module/icp/algs/aes/aes_modes.c135
-rw-r--r--sys/contrib/openzfs/module/icp/algs/edonr/edonr.c746
-rw-r--r--sys/contrib/openzfs/module/icp/algs/edonr/edonr_byteorder.h216
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/cbc.c273
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/ccm.c907
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/ctr.c228
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/ecb.c128
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/gcm.c1587
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/gcm_generic.c83
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/gcm_pclmulqdq.c64
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/modes.c165
-rw-r--r--sys/contrib/openzfs/module/icp/algs/sha1/sha1.c835
-rw-r--r--sys/contrib/openzfs/module/icp/algs/sha2/sha2.c956
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE3
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE.descrip1
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/skein.c911
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/skein_block.c790
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/skein_impl.h292
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/skein_iv.c185
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/skein_port.h116
-rw-r--r--sys/contrib/openzfs/module/icp/api/kcf_cipher.c930
-rw-r--r--sys/contrib/openzfs/module/icp/api/kcf_ctxops.c151
-rw-r--r--sys/contrib/openzfs/module/icp/api/kcf_digest.c491
-rw-r--r--sys/contrib/openzfs/module/icp/api/kcf_mac.c645
-rw-r--r--sys/contrib/openzfs/module/icp/api/kcf_miscapi.c127
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman23
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip1
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl127
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip1
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_aesni.S748
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S906
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aeskey.c580
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aesopt.h770
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab.h165
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab2.h594
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams36
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip1
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl177
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip1
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S1261
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S254
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S714
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/sha1/sha1-x86_64.S1353
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S2063
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S2088
-rw-r--r--sys/contrib/openzfs/module/icp/core/kcf_callprov.c1567
-rw-r--r--sys/contrib/openzfs/module/icp/core/kcf_mech_tabs.c791
-rw-r--r--sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c227
-rw-r--r--sys/contrib/openzfs/module/icp/core/kcf_prov_tabs.c645
-rw-r--r--sys/contrib/openzfs/module/icp/core/kcf_sched.c1780
-rw-r--r--sys/contrib/openzfs/module/icp/illumos-crypto.c158
-rw-r--r--sys/contrib/openzfs/module/icp/include/aes/aes_impl.h227
-rw-r--r--sys/contrib/openzfs/module/icp/include/modes/gcm_impl.h75
-rw-r--r--sys/contrib/openzfs/module/icp/include/modes/modes.h411
-rw-r--r--sys/contrib/openzfs/module/icp/include/sha1/sha1.h61
-rw-r--r--sys/contrib/openzfs/module/icp/include/sha1/sha1_consts.h65
-rw-r--r--sys/contrib/openzfs/module/icp/include/sha1/sha1_impl.h73
-rw-r--r--sys/contrib/openzfs/module/icp/include/sha2/sha2_consts.h219
-rw-r--r--sys/contrib/openzfs/module/icp/include/sha2/sha2_impl.h64
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/asm_linkage.h46
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/bitmap.h183
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/elfsign.h137
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/impl.h1363
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/ioctl.h1480
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/ioctladmin.h136
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/ops_impl.h630
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/sched_impl.h531
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/spi.h726
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h307
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/ia32/stack.h160
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/ia32/trap.h107
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/modctl.h477
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/modhash.h147
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/modhash_impl.h108
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/stack.h36
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/trap.h36
-rw-r--r--sys/contrib/openzfs/module/icp/io/aes.c1457
-rw-r--r--sys/contrib/openzfs/module/icp/io/edonr_mod.c63
-rw-r--r--sys/contrib/openzfs/module/icp/io/sha1_mod.c1230
-rw-r--r--sys/contrib/openzfs/module/icp/io/sha2_mod.c1399
-rw-r--r--sys/contrib/openzfs/module/icp/io/skein_mod.c729
-rw-r--r--sys/contrib/openzfs/module/icp/os/modconf.c173
-rw-r--r--sys/contrib/openzfs/module/icp/os/modhash.c927
-rw-r--r--sys/contrib/openzfs/module/icp/spi/kcf_spi.c925
-rw-r--r--sys/contrib/openzfs/module/lua/Makefile.in39
-rw-r--r--sys/contrib/openzfs/module/lua/README.zfs80
-rw-r--r--sys/contrib/openzfs/module/lua/lapi.c1345
-rw-r--r--sys/contrib/openzfs/module/lua/lapi.h26
-rw-r--r--sys/contrib/openzfs/module/lua/lauxlib.c800
-rw-r--r--sys/contrib/openzfs/module/lua/lbaselib.c296
-rw-r--r--sys/contrib/openzfs/module/lua/lcode.c884
-rw-r--r--sys/contrib/openzfs/module/lua/lcode.h85
-rw-r--r--sys/contrib/openzfs/module/lua/lcompat.c101
-rw-r--r--sys/contrib/openzfs/module/lua/lcorolib.c159
-rw-r--r--sys/contrib/openzfs/module/lua/lctype.c52
-rw-r--r--sys/contrib/openzfs/module/lua/lctype.h94
-rw-r--r--sys/contrib/openzfs/module/lua/ldebug.c608
-rw-r--r--sys/contrib/openzfs/module/lua/ldebug.h36
-rw-r--r--sys/contrib/openzfs/module/lua/ldo.c749
-rw-r--r--sys/contrib/openzfs/module/lua/ldo.h47
-rw-r--r--sys/contrib/openzfs/module/lua/lfunc.c160
-rw-r--r--sys/contrib/openzfs/module/lua/lfunc.h35
-rw-r--r--sys/contrib/openzfs/module/lua/lgc.c1218
-rw-r--r--sys/contrib/openzfs/module/lua/lgc.h159
-rw-r--r--sys/contrib/openzfs/module/lua/llex.c531
-rw-r--r--sys/contrib/openzfs/module/lua/llex.h83
-rw-r--r--sys/contrib/openzfs/module/lua/llimits.h314
-rw-r--r--sys/contrib/openzfs/module/lua/lmem.c98
-rw-r--r--sys/contrib/openzfs/module/lua/lmem.h56
-rw-r--r--sys/contrib/openzfs/module/lua/lobject.c282
-rw-r--r--sys/contrib/openzfs/module/lua/lobject.h605
-rw-r--r--sys/contrib/openzfs/module/lua/lopcodes.c108
-rw-r--r--sys/contrib/openzfs/module/lua/lopcodes.h290
-rw-r--r--sys/contrib/openzfs/module/lua/lparser.c1643
-rw-r--r--sys/contrib/openzfs/module/lua/lparser.h121
-rw-r--r--sys/contrib/openzfs/module/lua/lstate.c320
-rw-r--r--sys/contrib/openzfs/module/lua/lstate.h230
-rw-r--r--sys/contrib/openzfs/module/lua/lstring.c186
-rw-r--r--sys/contrib/openzfs/module/lua/lstring.h48
-rw-r--r--sys/contrib/openzfs/module/lua/lstrlib.c1040
-rw-r--r--sys/contrib/openzfs/module/lua/ltable.c592
-rw-r--r--sys/contrib/openzfs/module/lua/ltable.h47
-rw-r--r--sys/contrib/openzfs/module/lua/ltablib.c289
-rw-r--r--sys/contrib/openzfs/module/lua/ltm.c76
-rw-r--r--sys/contrib/openzfs/module/lua/ltm.h59
-rw-r--r--sys/contrib/openzfs/module/lua/lvm.c932
-rw-r--r--sys/contrib/openzfs/module/lua/lvm.h46
-rw-r--r--sys/contrib/openzfs/module/lua/lzio.c74
-rw-r--r--sys/contrib/openzfs/module/lua/lzio.h67
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp.S19
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_aarch64.S86
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_arm.S84
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_i386.S69
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_mips.S105
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_ppc.S165
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_rv64g.S91
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_s390x.S64
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_sparc64.S105
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S77
-rw-r--r--sys/contrib/openzfs/module/nvpair/Makefile.in13
-rw-r--r--sys/contrib/openzfs/module/nvpair/fnvpair.c660
-rw-r--r--sys/contrib/openzfs/module/nvpair/nvpair.c3738
-rw-r--r--sys/contrib/openzfs/module/nvpair/nvpair_alloc_fixed.c115
-rw-r--r--sys/contrib/openzfs/module/nvpair/nvpair_alloc_spl.c96
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c1709
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/callb.c373
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/list.c244
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha224.h96
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha256.h99
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha256c.c378
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha384.h96
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha512.h101
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha512c.c508
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha512t.h143
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c222
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c123
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c77
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c38
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c352
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c575
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c113
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c438
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c161
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c107
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c75
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c262
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c444
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c100
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c287
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c75
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c242
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c260
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c487
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c255
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c611
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c349
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/hkdf.c102
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c375
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c281
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c700
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c354
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c1214
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c74
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c2700
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c1360
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c251
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c968
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c308
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c363
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c161
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c2301
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c5888
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c2067
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c1839
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c1525
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/Makefile.in17
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/README.md16
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2339
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip1
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c35
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c509
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-cred.c195
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-err.c123
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-generic.c841
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c1468
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c617
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c781
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-proc.c790
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c284
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c1428
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-thread.c160
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-trace.c33
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c719
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-vmem.c90
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c512
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c217
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/Makefile.in37
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/abd_os.c1073
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/arc_os.c530
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c41
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/policy.c375
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/qat.c105
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c569
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c630
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c110
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/trace.c55
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c919
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c382
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c2932
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c1260
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c255
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c1225
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c440
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c329
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c662
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c333
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c2176
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c4010
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c2244
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c2049
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c552
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c154
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c1069
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c745
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c365
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c1486
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c1098
-rw-r--r--sys/contrib/openzfs/module/spl/Makefile.in13
-rw-r--r--sys/contrib/openzfs/module/unicode/Makefile.in11
-rw-r--r--sys/contrib/openzfs/module/unicode/u8_textprep.c2151
-rw-r--r--sys/contrib/openzfs/module/unicode/uconv.c863
-rw-r--r--sys/contrib/openzfs/module/zcommon/Makefile.in28
-rw-r--r--sys/contrib/openzfs/module/zcommon/cityhash.c67
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfeature_common.c609
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_comutil.c263
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_deleg.c249
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher.c991
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_aarch64_neon.c215
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_avx512.c225
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_intel.c173
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_sse.c232
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c163
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c229
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_namecheck.c473
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_prop.c1052
-rw-r--r--sys/contrib/openzfs/module/zcommon/zpool_prop.c279
-rw-r--r--sys/contrib/openzfs/module/zcommon/zprop_common.c480
-rw-r--r--sys/contrib/openzfs/module/zfs/Makefile.in157
-rw-r--r--sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash19
-rw-r--r--sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash.descrip1
-rw-r--r--sys/contrib/openzfs/module/zfs/abd.c1212
-rw-r--r--sys/contrib/openzfs/module/zfs/aggsum.c240
-rw-r--r--sys/contrib/openzfs/module/zfs/arc.c10768
-rw-r--r--sys/contrib/openzfs/module/zfs/blkptr.c153
-rw-r--r--sys/contrib/openzfs/module/zfs/bplist.c91
-rw-r--r--sys/contrib/openzfs/module/zfs/bpobj.c943
-rw-r--r--sys/contrib/openzfs/module/zfs/bptree.c303
-rw-r--r--sys/contrib/openzfs/module/zfs/bqueue.c155
-rw-r--r--sys/contrib/openzfs/module/zfs/btree.c2124
-rw-r--r--sys/contrib/openzfs/module/zfs/dataset_kstats.c215
-rw-r--r--sys/contrib/openzfs/module/zfs/dbuf.c4958
-rw-r--r--sys/contrib/openzfs/module/zfs/dbuf_stats.c232
-rw-r--r--sys/contrib/openzfs/module/zfs/ddt.c1187
-rw-r--r--sys/contrib/openzfs/module/zfs/ddt_zap.c168
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu.c2333
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_diff.c240
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_object.c523
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_objset.c3044
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_recv.c3390
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_redact.c1199
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_send.c3094
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_traverse.c788
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_tx.c1417
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_zfetch.c471
-rw-r--r--sys/contrib/openzfs/module/zfs/dnode.c2583
-rw-r--r--sys/contrib/openzfs/module/zfs/dnode_sync.c858
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_bookmark.c1734
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_crypt.c2863
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_dataset.c5014
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_deadlist.c1012
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_deleg.c774
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_destroy.c1281
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_dir.c2403
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_pool.c1417
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_prop.c1287
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_scan.c4422
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_synctask.c257
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_userhold.c691
-rw-r--r--sys/contrib/openzfs/module/zfs/edonr_zfs.c115
-rw-r--r--sys/contrib/openzfs/module/zfs/fm.c1686
-rw-r--r--sys/contrib/openzfs/module/zfs/gzip.c106
-rw-r--r--sys/contrib/openzfs/module/zfs/hkdf.c171
-rw-r--r--sys/contrib/openzfs/module/zfs/lz4.c1084
-rw-r--r--sys/contrib/openzfs/module/zfs/lzjb.c132
-rw-r--r--sys/contrib/openzfs/module/zfs/metaslab.c6287
-rw-r--r--sys/contrib/openzfs/module/zfs/mmp.c741
-rw-r--r--sys/contrib/openzfs/module/zfs/multilist.c434
-rw-r--r--sys/contrib/openzfs/module/zfs/objlist.c84
-rw-r--r--sys/contrib/openzfs/module/zfs/pathname.c88
-rw-r--r--sys/contrib/openzfs/module/zfs/range_tree.c922
-rw-r--r--sys/contrib/openzfs/module/zfs/refcount.c327
-rw-r--r--sys/contrib/openzfs/module/zfs/rrwlock.c396
-rw-r--r--sys/contrib/openzfs/module/zfs/sa.c2257
-rw-r--r--sys/contrib/openzfs/module/zfs/sha256.c105
-rw-r--r--sys/contrib/openzfs/module/zfs/skein_zfs.c102
-rw-r--r--sys/contrib/openzfs/module/zfs/spa.c9885
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_boot.c50
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_checkpoint.c636
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_config.c623
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_errlog.c416
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_history.c634
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_log_spacemap.c1322
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_misc.c2953
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_stats.c1029
-rw-r--r--sys/contrib/openzfs/module/zfs/space_map.c1105
-rw-r--r--sys/contrib/openzfs/module/zfs/space_reftree.c152
-rw-r--r--sys/contrib/openzfs/module/zfs/txg.c1076
-rw-r--r--sys/contrib/openzfs/module/zfs/uberblock.c74
-rw-r--r--sys/contrib/openzfs/module/zfs/unique.c112
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev.c5420
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_cache.c437
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_draid.c2976
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_draid_rand.c40
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_indirect.c1911
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_indirect_births.c226
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c616
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_initialize.c766
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_label.c1992
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_mirror.c972
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_missing.c131
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_queue.c1164
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz.c2747
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math.c666
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c2279
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h684
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c232
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c413
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c413
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c494
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h1502
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c4337
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h690
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c337
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c631
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c2477
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_rebuild.c1147
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_removal.c2390
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_root.c167
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_trim.c1719
-rw-r--r--sys/contrib/openzfs/module/zfs/zap.c1384
-rw-r--r--sys/contrib/openzfs/module/zfs/zap_leaf.c849
-rw-r--r--sys/contrib/openzfs/module/zfs/zap_micro.c1697
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp.c1451
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp_get.c813
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp_global.c89
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp_iter.c751
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp_set.c100
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp_synctask.c544
-rw-r--r--sys/contrib/openzfs/module/zfs/zfeature.c526
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_byteswap.c211
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_fm.c1416
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_fuid.c815
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_ioctl.c7688
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_log.c781
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_onexit.c173
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_quota.c476
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_ratelimit.c99
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_replay.c997
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_rlock.c691
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_sa.c446
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_vnops.c897
-rw-r--r--sys/contrib/openzfs/module/zfs/zil.c3695
-rw-r--r--sys/contrib/openzfs/module/zfs/zio.c5039
-rw-r--r--sys/contrib/openzfs/module/zfs/zio_checksum.c570
-rw-r--r--sys/contrib/openzfs/module/zfs/zio_compress.c220
-rw-r--r--sys/contrib/openzfs/module/zfs/zio_inject.c972
-rw-r--r--sys/contrib/openzfs/module/zfs/zle.c91
-rw-r--r--sys/contrib/openzfs/module/zfs/zrlock.c188
-rw-r--r--sys/contrib/openzfs/module/zfs/zthr.c536
-rw-r--r--sys/contrib/openzfs/module/zfs/zvol.c1739
-rw-r--r--sys/contrib/openzfs/module/zstd/Makefile.in38
-rw-r--r--sys/contrib/openzfs/module/zstd/README.md65
-rw-r--r--sys/contrib/openzfs/module/zstd/include/aarch64_compat.h37
-rw-r--r--sys/contrib/openzfs/module/zstd/include/limits.h63
-rw-r--r--sys/contrib/openzfs/module/zstd/include/stddef.h62
-rw-r--r--sys/contrib/openzfs/module/zstd/include/stdint.h62
-rw-r--r--sys/contrib/openzfs/module/zstd/include/stdio.h54
-rw-r--r--sys/contrib/openzfs/module/zstd/include/stdlib.h58
-rw-r--r--sys/contrib/openzfs/module/zstd/include/string.h62
-rw-r--r--sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h460
-rw-r--r--sys/contrib/openzfs/module/zstd/lib/zstd.c27826
-rw-r--r--sys/contrib/openzfs/module/zstd/lib/zstd.h2115
-rw-r--r--sys/contrib/openzfs/module/zstd/lib/zstd_errors.h94
-rw-r--r--sys/contrib/openzfs/module/zstd/zfs_zstd.c780
-rw-r--r--sys/contrib/openzfs/module/zstd/zstd-in.c68
426 files changed, 354219 insertions, 0 deletions
diff --git a/sys/contrib/openzfs/module/.gitignore b/sys/contrib/openzfs/module/.gitignore
new file mode 100644
index 000000000000..7a4bd3673e77
--- /dev/null
+++ b/sys/contrib/openzfs/module/.gitignore
@@ -0,0 +1,26 @@
+*.ko
+*.ko.unsigned
+*.ko.out
+*.ko.out.sig
+*.ko.debug
+*.ko.full
+*.dwo
+.*.cmd
+.*.d
+*.mod
+
+/Kbuild
+/.cache.mk
+/.tmp_versions
+/Module.markers
+/Module.symvers
+/vnode_if*
+/bus_if.h
+/device_if.h
+/opt_global.h
+
+/export_syms
+/machine
+/x86
+
+!Makefile.in
diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in
new file mode 100644
index 000000000000..1507965c5750
--- /dev/null
+++ b/sys/contrib/openzfs/module/Kbuild.in
@@ -0,0 +1,47 @@
+# When integrated in to a monolithic kernel the spl module must appear
+# first. This ensures its module initialization function is run before
+# any of the other module initialization functions which depend on it.
+ZFS_MODULES += spl/
+ZFS_MODULES += avl/
+ZFS_MODULES += icp/
+ZFS_MODULES += lua/
+ZFS_MODULES += nvpair/
+ZFS_MODULES += unicode/
+ZFS_MODULES += zcommon/
+ZFS_MODULES += zfs/
+ZFS_MODULES += zstd/
+
+# The rest is only relevant when run by kbuild
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_ZFS) := $(ZFS_MODULES)
+
+ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement
+ZFS_MODULE_CFLAGS += -Wmissing-prototypes
+ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@
+
+ifneq ($(KBUILD_EXTMOD),)
+zfs_include = @abs_top_srcdir@/include
+ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h
+ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include
+else
+zfs_include = $(srctree)/include/zfs
+ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h
+endif
+
+ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/kernel
+ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl
+ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs
+ZFS_MODULE_CFLAGS += -I$(zfs_include)
+ZFS_MODULE_CPPFLAGS += -D_KERNEL
+ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@
+
+ifneq ($(KBUILD_EXTMOD),)
+@CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include
+@CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@
+endif
+
+subdir-asflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS)
+subdir-ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS)
+
+endif
diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd
new file mode 100644
index 000000000000..e7cddcc5bb5e
--- /dev/null
+++ b/sys/contrib/openzfs/module/Makefile.bsd
@@ -0,0 +1,368 @@
+.if !defined(WITH_CTF)
+WITH_CTF=1
+.endif
+
+.include <bsd.sys.mk>
+
+SRCDIR=${.CURDIR}
+INCDIR=${.CURDIR:H}/include
+
+KMOD= openzfs
+
+.PATH: ${SRCDIR}/avl \
+ ${SRCDIR}/lua \
+ ${SRCDIR}/nvpair \
+ ${SRCDIR}/os/freebsd/spl \
+ ${SRCDIR}/os/freebsd/zfs \
+ ${SRCDIR}/unicode \
+ ${SRCDIR}/zcommon \
+ ${SRCDIR}/zfs \
+ ${SRCDIR}/zstd \
+ ${SRCDIR}/zstd/lib
+
+
+
+CFLAGS+= -I${.OBJDIR:H}/include
+CFLAGS+= -I${INCDIR}
+CFLAGS+= -I${INCDIR}/os/freebsd
+CFLAGS+= -I${INCDIR}/os/freebsd/spl
+CFLAGS+= -I${INCDIR}/os/freebsd/zfs
+CFLAGS+= -I${SRCDIR}/zstd/include
+CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h
+
+CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \
+ -DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ \
+ -D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP -DHAVE_KSID -DCOMPAT_FREEBSD11
+
+.if ${MACHINE_ARCH} == "amd64"
+CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSSE3
+.endif
+
+.if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true"
+CFLAGS+= -DZFS_DEBUG -g
+.if defined(WITH_INVARIANTS) && ${WITH_INVARIANTS} == "true"
+ CFLAGS+= -DINVARIANTS -DWITNESS -DOPENSOLARIS_WITNESS
+.endif
+.if defined(WITH_O0) && ${WITH_O0} == "true"
+ CFLAGS+= -O0
+.endif
+.else
+CFLAGS += -DNDEBUG
+.endif
+
+.if defined(WITH_VFS_DEBUG) && ${WITH_VFS_DEBUG} == "true"
+# kernel must also be built with this option for this to work
+CFLAGS+= -DDEBUG_VFS_LOCKS
+.endif
+
+.if defined(WITH_GCOV) && ${WITH_GCOV} == "true"
+CFLAGS+= -fprofile-arcs -ftest-coverage
+.endif
+
+DEBUG_FLAGS=-g
+
+.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \
+ ${MACHINE_ARCH} == "arm"
+CFLAGS+= -DBITS_PER_LONG=32
+.else
+CFLAGS+= -DBITS_PER_LONG=64
+.endif
+
+SRCS= vnode_if.h device_if.h bus_if.h
+
+# avl
+SRCS+= avl.c
+
+#lua
+SRCS+= lapi.c \
+ lauxlib.c \
+ lbaselib.c \
+ lcode.c \
+ lcompat.c \
+ lcorolib.c \
+ lctype.c \
+ ldebug.c \
+ ldo.c \
+ lfunc.c \
+ lgc.c \
+ llex.c \
+ lmem.c \
+ lobject.c \
+ lopcodes.c \
+ lparser.c \
+ lstate.c \
+ lstring.c \
+ lstrlib.c \
+ ltable.c \
+ ltablib.c \
+ ltm.c \
+ lvm.c \
+ lzio.c
+
+#nvpair
+SRCS+= nvpair.c \
+ fnvpair.c \
+ nvpair_alloc_spl.c \
+ nvpair_alloc_fixed.c
+
+#os/freebsd/spl
+SRCS+= acl_common.c \
+ callb.c \
+ list.c \
+ sha256c.c \
+ sha512c.c \
+ spl_acl.c \
+ spl_cmn_err.c \
+ spl_dtrace.c \
+ spl_kmem.c \
+ spl_kstat.c \
+ spl_misc.c \
+ spl_policy.c \
+ spl_procfs_list.c \
+ spl_string.c \
+ spl_sunddi.c \
+ spl_sysevent.c \
+ spl_taskq.c \
+ spl_uio.c \
+ spl_vfs.c \
+ spl_vm.c \
+ spl_zlib.c \
+ spl_zone.c
+
+
+.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \
+ ${MACHINE_ARCH} == "arm"
+SRCS+= spl_atomic.c
+.endif
+
+#os/freebsd/zfs
+SRCS+= abd_os.c \
+ arc_os.c \
+ crypto_os.c \
+ dmu_os.c \
+ hkdf.c \
+ kmod_core.c \
+ spa_os.c \
+ sysctl_os.c \
+ vdev_file.c \
+ vdev_geom.c \
+ vdev_label_os.c \
+ zfs_acl.c \
+ zfs_ctldir.c \
+ zfs_debug.c \
+ zfs_dir.c \
+ zfs_ioctl_compat.c \
+ zfs_ioctl_os.c \
+ zfs_vfsops.c \
+ zfs_vnops_os.c \
+ zfs_znode.c \
+ zio_crypt.c \
+ zvol_os.c
+
+#unicode
+SRCS+= uconv.c \
+ u8_textprep.c
+
+#zcommon
+SRCS+= zfeature_common.c \
+ zfs_comutil.c \
+ zfs_deleg.c \
+ zfs_fletcher.c \
+ zfs_fletcher_avx512.c \
+ zfs_fletcher_intel.c \
+ zfs_fletcher_sse.c \
+ zfs_fletcher_superscalar.c \
+ zfs_fletcher_superscalar4.c \
+ zfs_namecheck.c \
+ zfs_prop.c \
+ zpool_prop.c \
+ zprop_common.c
+
+#zfs
+SRCS+= abd.c \
+ aggsum.c \
+ arc.c \
+ blkptr.c \
+ bplist.c \
+ bpobj.c \
+ btree.c \
+ cityhash.c \
+ dbuf.c \
+ dbuf_stats.c \
+ bptree.c \
+ bqueue.c \
+ dataset_kstats.c \
+ ddt.c \
+ ddt_zap.c \
+ dmu.c \
+ dmu_diff.c \
+ dmu_object.c \
+ dmu_objset.c \
+ dmu_recv.c \
+ dmu_redact.c \
+ dmu_send.c \
+ dmu_traverse.c \
+ dmu_tx.c \
+ dmu_zfetch.c \
+ dnode.c \
+ dnode_sync.c \
+ dsl_dataset.c \
+ dsl_deadlist.c \
+ dsl_deleg.c \
+ dsl_bookmark.c \
+ dsl_dir.c \
+ dsl_crypt.c \
+ dsl_destroy.c \
+ dsl_pool.c \
+ dsl_prop.c \
+ dsl_scan.c \
+ dsl_synctask.c \
+ dsl_userhold.c \
+ fm.c \
+ gzip.c \
+ lzjb.c \
+ lz4.c \
+ metaslab.c \
+ mmp.c \
+ multilist.c \
+ objlist.c \
+ pathname.c \
+ range_tree.c \
+ refcount.c \
+ rrwlock.c \
+ sa.c \
+ sha256.c \
+ skein_zfs.c \
+ spa.c \
+ spa_boot.c \
+ spa_checkpoint.c \
+ spa_config.c \
+ spa_errlog.c \
+ spa_history.c \
+ spa_log_spacemap.c \
+ spa_misc.c \
+ spa_stats.c \
+ space_map.c \
+ space_reftree.c \
+ txg.c \
+ uberblock.c \
+ unique.c \
+ vdev.c \
+ vdev_cache.c \
+ vdev_draid.c \
+ vdev_draid_rand.c \
+ vdev_indirect.c \
+ vdev_indirect_births.c \
+ vdev_indirect_mapping.c \
+ vdev_initialize.c \
+ vdev_label.c \
+ vdev_mirror.c \
+ vdev_missing.c \
+ vdev_queue.c \
+ vdev_raidz.c \
+ vdev_raidz_math.c \
+ vdev_raidz_math_scalar.c \
+ vdev_rebuild.c \
+ vdev_raidz_math_avx2.c \
+ vdev_raidz_math_avx512bw.c \
+ vdev_raidz_math_avx512f.c \
+ vdev_raidz_math_sse2.c \
+ vdev_raidz_math_ssse3.c \
+ vdev_removal.c \
+ vdev_root.c \
+ vdev_trim.c \
+ zap.c \
+ zap_leaf.c \
+ zap_micro.c \
+ zcp.c \
+ zcp_get.c \
+ zcp_global.c \
+ zcp_iter.c \
+ zcp_set.c \
+ zcp_synctask.c \
+ zfeature.c \
+ zfs_byteswap.c \
+ zfs_file_os.c \
+ zfs_fm.c \
+ zfs_fuid.c \
+ zfs_ioctl.c \
+ zfs_log.c \
+ zfs_onexit.c \
+ zfs_quota.c \
+ zfs_ratelimit.c \
+ zfs_replay.c \
+ zfs_rlock.c \
+ zfs_sa.c \
+ zfs_vnops.c \
+ zil.c \
+ zio.c \
+ zio_checksum.c \
+ zio_compress.c \
+ zio_inject.c \
+ zle.c \
+ zrlock.c \
+ zthr.c \
+ zvol.c
+
+#zstd
+SRCS+= zfs_zstd.c \
+ zstd.c
+
+beforeinstall:
+.if ${MK_DEBUG_FILES} != "no"
+ mtree -eu \
+ -f /etc/mtree/BSD.debug.dist \
+ -p ${DESTDIR}/usr/lib
+.endif
+
+.include <bsd.kmod.mk>
+
+
+CFLAGS.gcc+= -Wno-pointer-to-int-cast
+
+CFLAGS.lapi.c= -Wno-cast-qual
+CFLAGS.lcompat.c= -Wno-cast-qual
+CFLAGS.lobject.c= -Wno-cast-qual
+CFLAGS.ltable.c= -Wno-cast-qual
+CFLAGS.lvm.c= -Wno-cast-qual
+CFLAGS.nvpair.c= -DHAVE_RPC_TYPES -Wno-cast-qual
+CFLAGS.spl_string.c= -Wno-cast-qual
+CFLAGS.spl_vm.c= -Wno-cast-qual
+CFLAGS.spl_zlib.c= -Wno-cast-qual
+CFLAGS.abd.c= -Wno-cast-qual
+CFLAGS.zfs_log.c= -Wno-cast-qual
+CFLAGS.zfs_vnops_os.c= -Wno-pointer-arith
+CFLAGS.u8_textprep.c= -Wno-cast-qual
+CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zfs_fletcher_sse.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zprop_common.c= -Wno-cast-qual
+CFLAGS.ddt.c= -Wno-cast-qual
+CFLAGS.dmu.c= -Wno-cast-qual
+CFLAGS.dmu_traverse.c= -Wno-cast-qual
+CFLAGS.dsl_dir.c= -Wno-cast-qual
+CFLAGS.dsl_deadlist.c= -Wno-cast-qual
+CFLAGS.dsl_prop.c= -Wno-cast-qual
+CFLAGS.fm.c= -Wno-cast-qual
+CFLAGS.lz4.c= -Wno-cast-qual
+CFLAGS.spa.c= -Wno-cast-qual
+CFLAGS.spa_misc.c= -Wno-cast-qual
+CFLAGS.sysctl_os.c= -include ../zfs_config.h
+CFLAGS.vdev_draid.c= -Wno-cast-qual
+CFLAGS.vdev_raidz.c= -Wno-cast-qual
+CFLAGS.vdev_raidz_math.c= -Wno-cast-qual
+CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual
+CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
+CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
+CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
+CFLAGS.zap_leaf.c= -Wno-cast-qual
+CFLAGS.zap_micro.c= -Wno-cast-qual
+CFLAGS.zcp.c= -Wno-cast-qual
+CFLAGS.zfs_fm.c= -Wno-cast-qual
+CFLAGS.zfs_ioctl.c= -Wno-cast-qual
+CFLAGS.zil.c= -Wno-cast-qual
+CFLAGS.zio.c= -Wno-cast-qual
+CFLAGS.zrlock.c= -Wno-cast-qual
+CFLAGS.zfs_zstd.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zstd.c= -fno-tree-vectorize -U__BMI__
diff --git a/sys/contrib/openzfs/module/Makefile.in b/sys/contrib/openzfs/module/Makefile.in
new file mode 100644
index 000000000000..69caf48570e9
--- /dev/null
+++ b/sys/contrib/openzfs/module/Makefile.in
@@ -0,0 +1,135 @@
+include Kbuild
+
+INSTALL_MOD_DIR ?= extra
+
+SUBDIR_TARGETS = icp lua zstd
+
+all: modules
+distclean maintainer-clean: clean
+install: modules_install
+uninstall: modules_uninstall
+check:
+
+.PHONY: all distclean maintainer-clean install uninstall check distdir \
+ modules modules-Linux modules-FreeBSD modules-unknown \
+ clean clean-Linux clean-FreeBSD \
+ modules_install modules_install-Linux modules_install-FreeBSD \
+ modules_uninstall modules_uninstall-Linux modules_uninstall-FreeBSD \
+ cppcheck cppcheck-Linux cppcheck-FreeBSD
+
+# Filter out options that FreeBSD make doesn't understand
+getflags = ( \
+set -- \
+ $(filter-out --%,$(firstword $(MFLAGS))) \
+ $(filter -I%,$(MFLAGS)) \
+ $(filter -j%,$(MFLAGS)); \
+fmakeflags=""; \
+while getopts :deiI:j:knqrstw flag; do \
+ case $$flag in \
+ \?) :;; \
+ :) if [ $$OPTARG = "j" ]; then \
+ ncpus=$$(sysctl -n kern.smp.cpus 2>/dev/null || :); \
+ if [ -n "$$ncpus" ]; then fmakeflags="$$fmakeflags -j$$ncpus"; fi; \
+ fi;; \
+ d) fmakeflags="$$fmakeflags -dA";; \
+ *) fmakeflags="$$fmakeflags -$$flag$$OPTARG";; \
+ esac; \
+done; \
+echo $$fmakeflags \
+)
+FMAKEFLAGS = -C @abs_srcdir@ -f Makefile.bsd $(shell $(getflags))
+
+ifneq (@abs_srcdir@,@abs_builddir@)
+FMAKEFLAGS += MAKEOBJDIR=@abs_builddir@
+endif
+FMAKE = env -u MAKEFLAGS make $(FMAKEFLAGS)
+
+modules-Linux:
+ list='$(SUBDIR_TARGETS)'; for targetdir in $$list; do \
+ $(MAKE) -C $$targetdir; \
+ done
+ $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ CONFIG_ZFS=m modules
+
+modules-FreeBSD:
+ +$(FMAKE)
+
+modules-unknown:
+ @true
+
+modules: modules-@ac_system@
+
+clean-Linux:
+ @# Only cleanup the kernel build directories when CONFIG_KERNEL
+ @# is defined. This indicates that kernel modules should be built.
+@CONFIG_KERNEL_TRUE@ $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ clean
+
+ if [ -f @LINUX_SYMBOLS@ ]; then $(RM) @LINUX_SYMBOLS@; fi
+ if [ -f Module.markers ]; then $(RM) Module.markers; fi
+
+ find . -name '*.ur-safe' -type f -print | xargs $(RM)
+
+clean-FreeBSD:
+ +$(FMAKE) clean
+
+clean: clean-@ac_system@
+
+modules_install-Linux:
+ @# Install the kernel modules
+ $(MAKE) -C @LINUX_OBJ@ M=`pwd` modules_install \
+ INSTALL_MOD_PATH=$(DESTDIR)$(INSTALL_MOD_PATH) \
+ INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) \
+ KERNELRELEASE=@LINUX_VERSION@
+ @# Remove extraneous build products when packaging
+ kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \
+ if [ -n "$(DESTDIR)" ]; then \
+ find $$kmoddir -name 'modules.*' | xargs $(RM); \
+ fi
+ sysmap=$(DESTDIR)$(INSTALL_MOD_PATH)/boot/System.map-@LINUX_VERSION@; \
+ if [ -f $$sysmap ]; then \
+ depmod -ae -F $$sysmap @LINUX_VERSION@; \
+ fi
+
+modules_install-FreeBSD:
+ @# Install the kernel modules
+ +$(FMAKE) install
+
+modules_install: modules_install-@ac_system@
+
+modules_uninstall-Linux:
+ @# Uninstall the kernel modules
+ kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \
+ for objdir in $(ZFS_MODULES); do \
+ $(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$objdir; \
+ done
+
+modules_uninstall-FreeBSD:
+ @false
+
+modules_uninstall: modules_uninstall-@ac_system@
+
+cppcheck-Linux:
+ @CPPCHECK@ -j@CPU_COUNT@ --std=c99 --quiet --force --error-exitcode=2 \
+ --inline-suppr --suppress=noValidConfiguration \
+ --enable=warning,information -D_KERNEL \
+ --include=@LINUX_OBJ@/include/generated/autoconf.h \
+ --include=@top_srcdir@/zfs_config.h \
+ --config-exclude=@LINUX_OBJ@/include \
+ -I @LINUX_OBJ@/include \
+ -I @top_srcdir@/include/os/linux/kernel \
+ -I @top_srcdir@/include/os/linux/spl \
+ -I @top_srcdir@/include/os/linux/zfs \
+ -I @top_srcdir@/include \
+ avl icp lua nvpair spl unicode zcommon zfs zstd os/linux
+
+cppcheck-FreeBSD:
+ @true
+
+cppcheck: cppcheck-@ac_system@
+
+distdir:
+ (cd @srcdir@ && find $(ZFS_MODULES) os -name '*.[chS]') | \
+ while read path; do \
+ mkdir -p $$distdir/$${path%/*}; \
+ cp @srcdir@/$$path $$distdir/$$path; \
+ done; \
+ cp @srcdir@/Makefile.bsd $$distdir/Makefile.bsd
diff --git a/sys/contrib/openzfs/module/avl/Makefile.in b/sys/contrib/openzfs/module/avl/Makefile.in
new file mode 100644
index 000000000000..991d5f95b8c0
--- /dev/null
+++ b/sys/contrib/openzfs/module/avl/Makefile.in
@@ -0,0 +1,10 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+endif
+
+MODULE := zavl
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+$(MODULE)-objs += avl.o
diff --git a/sys/contrib/openzfs/module/avl/avl.c b/sys/contrib/openzfs/module/avl/avl.c
new file mode 100644
index 000000000000..d0473d883b3d
--- /dev/null
+++ b/sys/contrib/openzfs/module/avl/avl.c
@@ -0,0 +1,1093 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+/*
+ * AVL - generic AVL tree implementation for kernel use
+ *
+ * A complete description of AVL trees can be found in many CS textbooks.
+ *
+ * Here is a very brief overview. An AVL tree is a binary search tree that is
+ * almost perfectly balanced. By "almost" perfectly balanced, we mean that at
+ * any given node, the left and right subtrees are allowed to differ in height
+ * by at most 1 level.
+ *
+ * This relaxation from a perfectly balanced binary tree allows doing
+ * insertion and deletion relatively efficiently. Searching the tree is
+ * still a fast operation, roughly O(log(N)).
+ *
+ * The key to insertion and deletion is a set of tree manipulations called
+ * rotations, which bring unbalanced subtrees back into the semi-balanced state.
+ *
+ * This implementation of AVL trees has the following peculiarities:
+ *
+ * - The AVL specific data structures are physically embedded as fields
+ * in the "using" data structures. To maintain generality the code
+ * must constantly translate between "avl_node_t *" and containing
+ * data structure "void *"s by adding/subtracting the avl_offset.
+ *
+ * - Since the AVL data is always embedded in other structures, there is
+ * no locking or memory allocation in the AVL routines. This must be
+ * provided for by the enclosing data structure's semantics. Typically,
+ * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of
+ * exclusive write lock. Other operations require a read lock.
+ *
+ * - The implementation uses iteration instead of explicit recursion,
+ * since it is intended to run on limited size kernel stacks. Since
+ * there is no recursion stack present to move "up" in the tree,
+ * there is an explicit "parent" link in the avl_node_t.
+ *
+ * - The left/right children pointers of a node are in an array.
+ * In the code, variables (instead of constants) are used to represent
+ * left and right indices. The implementation is written as if it only
+ * dealt with left handed manipulations. By changing the value assigned
+ * to "left", the code also works for right handed trees. The
+ * following variables/terms are frequently used:
+ *
+ * int left; // 0 when dealing with left children,
+ * // 1 for dealing with right children
+ *
+ * int left_heavy; // -1 when left subtree is taller at some node,
+ * // +1 when right subtree is taller
+ *
+ * int right; // will be the opposite of left (0 or 1)
+ * int right_heavy;// will be the opposite of left_heavy (-1 or 1)
+ *
+ * int direction; // 0 for "<" (ie. left child); 1 for ">" (right)
+ *
+ * Though it is a little more confusing to read the code, the approach
+ * allows using half as much code (and hence cache footprint) for tree
+ * manipulations and eliminates many conditional branches.
+ *
+ * - The avl_index_t is an opaque "cookie" used to find nodes at or
+ * adjacent to where a new value would be inserted in the tree. The value
+ * is a modified "avl_node_t *". The bottom bit (normally 0 for a
+ * pointer) is set to indicate whether the new node has a value greater
+ * than the value of the indicated "avl_node_t *".
+ *
+ * Note - in addition to userland (e.g. libavl and libutil) and the kernel
+ * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
+ * which each have their own compilation environments and subsequent
+ * requirements. Each of these environments must be considered when adding
+ * dependencies from avl.c.
+ *
+ * Link to Illumos.org for more information on avl function:
+ * [1] https://illumos.org/man/9f/avl
+ */
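[Editorial note, not part of the patch: the embedded-node design described in the comment above is easiest to see from the consumer side. The sketch below is illustration only; the my_entry_t type and its names are hypothetical, and it assumes the usual avl_create() declaration from sys/avl.h and an available offsetof(), in addition to the functions defined in this file.]

/*
 * A consumer embeds avl_node_t in its own structure and passes its byte
 * offset to avl_create(), so the routines in this file can translate
 * between "avl_node_t *" and the containing "void *".
 */
typedef struct my_entry {
	uint64_t	me_key;
	avl_node_t	me_avl;		/* embedded AVL linkage */
} my_entry_t;

/*
 * The comparator must return only -1, 0, or +1 (see the ASSERTs in
 * avl_find() below).
 */
static int
my_entry_compare(const void *a, const void *b)
{
	const my_entry_t *l = a;
	const my_entry_t *r = b;

	if (l->me_key < r->me_key)
		return (-1);
	if (l->me_key > r->me_key)
		return (1);
	return (0);
}

static void
my_tree_init(avl_tree_t *tree)
{
	avl_create(tree, my_entry_compare, sizeof (my_entry_t),
	    offsetof(my_entry_t, me_avl));
}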
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/debug.h>
+#include <sys/avl.h>
+#include <sys/cmn_err.h>
+#include <sys/mod.h>
+
+/*
+ * Small arrays to translate between balance (or diff) values and child indices.
+ *
+ * Code that deals with binary tree data structures will randomly use
+ * left and right children when examining a tree. C "if()" statements
+ * which evaluate randomly suffer from very poor hardware branch prediction.
+ * In this code we avoid some of the branch mispredictions by using the
+ * following translation arrays. They replace random branches with an
+ * additional memory reference. Since the translation arrays are both very
+ * small the data should remain efficiently in cache.
+ */
+static const int avl_child2balance[2] = {-1, 1};
+static const int avl_balance2child[] = {0, 0, 1};
+
+
+/*
+ * Walk from one node to the previous valued node (ie. an infix walk
+ * towards the left). At any given node we do one of 2 things:
+ *
+ * - If there is a left child, go to it, then to its rightmost descendant.
+ *
+ * - otherwise we return through parent nodes until we've come from a right
+ * child.
+ *
+ * Return Value:
+ * NULL - if at the end of the nodes
+ * otherwise next node
+ */
+void *
+avl_walk(avl_tree_t *tree, void *oldnode, int left)
+{
+ size_t off = tree->avl_offset;
+ avl_node_t *node = AVL_DATA2NODE(oldnode, off);
+ int right = 1 - left;
+ int was_child;
+
+
+ /*
+ * nowhere to walk to if tree is empty
+ */
+ if (node == NULL)
+ return (NULL);
+
+ /*
+ * Visit the previous valued node. There are two possibilities:
+ *
+ * If this node has a left child, go down one left, then all
+ * the way right.
+ */
+ if (node->avl_child[left] != NULL) {
+ for (node = node->avl_child[left];
+ node->avl_child[right] != NULL;
+ node = node->avl_child[right])
+ ;
+ /*
+ * Otherwise, return through left children as far as we can.
+ */
+ } else {
+ for (;;) {
+ was_child = AVL_XCHILD(node);
+ node = AVL_XPARENT(node);
+ if (node == NULL)
+ return (NULL);
+ if (was_child == right)
+ break;
+ }
+ }
+
+ return (AVL_NODE2DATA(node, off));
+}
+
+/*
+ * Return the lowest valued node in a tree or NULL.
+ * (leftmost child from root of tree)
+ */
+void *
+avl_first(avl_tree_t *tree)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL; node = node->avl_child[0])
+ prev = node;
+
+ if (prev != NULL)
+ return (AVL_NODE2DATA(prev, off));
+ return (NULL);
+}
+
+/*
+ * Return the highest valued node in a tree or NULL.
+ * (rightmost child from root of tree)
+ */
+void *
+avl_last(avl_tree_t *tree)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL; node = node->avl_child[1])
+ prev = node;
+
+ if (prev != NULL)
+ return (AVL_NODE2DATA(prev, off));
+ return (NULL);
+}
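[Editorial sketch, not part of the patch, reusing the hypothetical my_entry_t from the earlier sketch: avl_first() combined with avl_walk(..., AVL_AFTER) gives the ascending in-order traversal; the AVL_NEXT() convenience macro in sys/avl.h is conventionally a thin wrapper around exactly this avl_walk() call.]

/* Visit every element in ascending key order and count them. */
static uint64_t
my_tree_count(avl_tree_t *tree)
{
	uint64_t n = 0;
	my_entry_t *e;

	for (e = avl_first(tree); e != NULL;
	    e = avl_walk(tree, e, AVL_AFTER))
		n++;
	return (n);
}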
+
+/*
+ * Access the node immediately before or after an insertion point.
+ *
+ * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child
+ *
+ * Return value:
+ * NULL: no node in the given direction
+ * "void *" of the found tree node
+ */
+void *
+avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
+{
+ int child = AVL_INDEX2CHILD(where);
+ avl_node_t *node = AVL_INDEX2NODE(where);
+ void *data;
+ size_t off = tree->avl_offset;
+
+ if (node == NULL) {
+ ASSERT(tree->avl_root == NULL);
+ return (NULL);
+ }
+ data = AVL_NODE2DATA(node, off);
+ if (child != direction)
+ return (data);
+
+ return (avl_walk(tree, data, direction));
+}
+
+
+/*
+ * Search for the node which contains "value". The algorithm is a
+ * simple binary tree search.
+ *
+ * return value:
+ * NULL: the value is not in the AVL tree
+ * *where (if not NULL) is set to indicate the insertion point
+ * "void *" of the found tree node
+ */
+void *
+avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ int child = 0;
+ int diff;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL;
+ node = node->avl_child[child]) {
+
+ prev = node;
+
+ diff = tree->avl_compar(value, AVL_NODE2DATA(node, off));
+ ASSERT(-1 <= diff && diff <= 1);
+ if (diff == 0) {
+#ifdef ZFS_DEBUG
+ if (where != NULL)
+ *where = 0;
+#endif
+ return (AVL_NODE2DATA(node, off));
+ }
+ child = avl_balance2child[1 + diff];
+
+ }
+
+ if (where != NULL)
+ *where = AVL_MKINDEX(prev, child);
+
+ return (NULL);
+}
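[Editorial sketch, not part of the patch: the "where" cookie that avl_find() fills in on a miss is what makes the common lookup-or-insert pattern a single descent of the tree. Names reuse the hypothetical my_entry_t from the earlier sketches.]

/*
 * Return the existing entry with the same key, or insert new_ent at the
 * position remembered by avl_find() and return it.
 */
static my_entry_t *
my_tree_find_or_add(avl_tree_t *tree, my_entry_t *new_ent)
{
	avl_index_t where;
	my_entry_t *found;

	found = avl_find(tree, new_ent, &where);
	if (found != NULL)
		return (found);
	avl_insert(tree, new_ent, where);
	return (new_ent);
}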
+
+
+/*
+ * Perform a rotation to restore balance at the subtree given by depth.
+ *
+ * This routine is used by both insertion and deletion. The return value
+ * indicates:
+ * 0 : subtree did not change height
+ * !0 : subtree was reduced in height
+ *
+ * The code is written as if handling left rotations, right rotations are
+ * symmetric and handled by swapping values of variables right/left[_heavy]
+ *
+ * On input balance is the "new" balance at "node". This value is either
+ * -2 or +2.
+ */
+static int
+avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance)
+{
+ int left = !(balance < 0); /* when balance = -2, left will be 0 */
+ int right = 1 - left;
+ int left_heavy = balance >> 1;
+ int right_heavy = -left_heavy;
+ avl_node_t *parent = AVL_XPARENT(node);
+ avl_node_t *child = node->avl_child[left];
+ avl_node_t *cright;
+ avl_node_t *gchild;
+ avl_node_t *gright;
+ avl_node_t *gleft;
+ int which_child = AVL_XCHILD(node);
+ int child_bal = AVL_XBALANCE(child);
+
+ /* BEGIN CSTYLED */
+ /*
+ * case 1 : node is overly left heavy, the left child is balanced or
+ * also left heavy. This requires the following rotation.
+ *
+ * (node bal:-2)
+ * / \
+ * / \
+ * (child bal:0 or -1)
+ * / \
+ * / \
+ * cright
+ *
+ * becomes:
+ *
+ * (child bal:1 or 0)
+ * / \
+ * / \
+ * (node bal:-1 or 0)
+ * / \
+ * / \
+ * cright
+ *
+ * we detect this situation by noting that child's balance is not
+ * right_heavy.
+ */
+ /* END CSTYLED */
+ if (child_bal != right_heavy) {
+
+ /*
+ * compute new balance of nodes
+ *
+ * If child used to be left heavy (now balanced) we reduced
+ * the height of this sub-tree -- used in "return...;" below
+ */
+ child_bal += right_heavy; /* adjust towards right */
+
+ /*
+ * move "cright" to be node's left child
+ */
+ cright = child->avl_child[right];
+ node->avl_child[left] = cright;
+ if (cright != NULL) {
+ AVL_SETPARENT(cright, node);
+ AVL_SETCHILD(cright, left);
+ }
+
+ /*
+ * move node to be child's right child
+ */
+ child->avl_child[right] = node;
+ AVL_SETBALANCE(node, -child_bal);
+ AVL_SETCHILD(node, right);
+ AVL_SETPARENT(node, child);
+
+ /*
+ * update the pointer into this subtree
+ */
+ AVL_SETBALANCE(child, child_bal);
+ AVL_SETCHILD(child, which_child);
+ AVL_SETPARENT(child, parent);
+ if (parent != NULL)
+ parent->avl_child[which_child] = child;
+ else
+ tree->avl_root = child;
+
+ return (child_bal == 0);
+ }
+
+ /* BEGIN CSTYLED */
+ /*
+ * case 2 : When node is left heavy, but child is right heavy we use
+ * a different rotation.
+ *
+ * (node b:-2)
+ * / \
+ * / \
+ * / \
+ * (child b:+1)
+ * / \
+ * / \
+ * (gchild b: != 0)
+ * / \
+ * / \
+ * gleft gright
+ *
+ * becomes:
+ *
+ * (gchild b:0)
+ * / \
+ * / \
+ * / \
+ * (child b:?) (node b:?)
+ * / \ / \
+ * / \ / \
+ * gleft gright
+ *
+ * computing the new balances is more complicated. As an example:
+ * if gchild was right_heavy, then child is now left heavy
+ * else it is balanced
+ */
+ /* END CSTYLED */
+ gchild = child->avl_child[right];
+ gleft = gchild->avl_child[left];
+ gright = gchild->avl_child[right];
+
+ /*
+ * move gright to left child of node and
+ *
+ * move gleft to right child of node
+ */
+ node->avl_child[left] = gright;
+ if (gright != NULL) {
+ AVL_SETPARENT(gright, node);
+ AVL_SETCHILD(gright, left);
+ }
+
+ child->avl_child[right] = gleft;
+ if (gleft != NULL) {
+ AVL_SETPARENT(gleft, child);
+ AVL_SETCHILD(gleft, right);
+ }
+
+ /*
+ * move child to left child of gchild and
+ *
+ * move node to right child of gchild and
+ *
+ * fixup parent of all this to point to gchild
+ */
+ balance = AVL_XBALANCE(gchild);
+ gchild->avl_child[left] = child;
+ AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0));
+ AVL_SETPARENT(child, gchild);
+ AVL_SETCHILD(child, left);
+
+ gchild->avl_child[right] = node;
+ AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0));
+ AVL_SETPARENT(node, gchild);
+ AVL_SETCHILD(node, right);
+
+ AVL_SETBALANCE(gchild, 0);
+ AVL_SETPARENT(gchild, parent);
+ AVL_SETCHILD(gchild, which_child);
+ if (parent != NULL)
+ parent->avl_child[which_child] = gchild;
+ else
+ tree->avl_root = gchild;
+
+ return (1); /* the new tree is always shorter */
+}
+
+
+/*
+ * Insert a new node into an AVL tree at the specified (from avl_find()) place.
+ *
+ * Newly inserted nodes are always leaf nodes in the tree, since avl_find()
+ * searches out to the leaf positions. The avl_index_t indicates the node
+ * which will be the parent of the new node.
+ *
+ * After the node is inserted, a single rotation further up the tree may
+ * be necessary to maintain an acceptable AVL balance.
+ */
+void
+avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where)
+{
+ avl_node_t *node;
+ avl_node_t *parent = AVL_INDEX2NODE(where);
+ int old_balance;
+ int new_balance;
+ int which_child = AVL_INDEX2CHILD(where);
+ size_t off = tree->avl_offset;
+
+#ifdef _LP64
+ ASSERT(((uintptr_t)new_data & 0x7) == 0);
+#endif
+
+ node = AVL_DATA2NODE(new_data, off);
+
+ /*
+ * First, add the node to the tree at the indicated position.
+ */
+ ++tree->avl_numnodes;
+
+ node->avl_child[0] = NULL;
+ node->avl_child[1] = NULL;
+
+ AVL_SETCHILD(node, which_child);
+ AVL_SETBALANCE(node, 0);
+ AVL_SETPARENT(node, parent);
+ if (parent != NULL) {
+ ASSERT(parent->avl_child[which_child] == NULL);
+ parent->avl_child[which_child] = node;
+ } else {
+ ASSERT(tree->avl_root == NULL);
+ tree->avl_root = node;
+ }
+ /*
+ * Now, back up the tree modifying the balance of all nodes above the
+ * insertion point. If we get to a highly unbalanced ancestor, we
+ * need to do a rotation. If we back out of the tree we are done.
+ * If we brought any subtree into perfect balance (0), we are also done.
+ */
+ for (;;) {
+ node = parent;
+ if (node == NULL)
+ return;
+
+ /*
+ * Compute the new balance
+ */
+ old_balance = AVL_XBALANCE(node);
+ new_balance = old_balance + avl_child2balance[which_child];
+
+ /*
+ * If we introduced equal balance, then we are done immediately
+ */
+ if (new_balance == 0) {
+ AVL_SETBALANCE(node, 0);
+ return;
+ }
+
+ /*
+ * If both old and new are not zero we went
+ * from -1 to -2 balance, do a rotation.
+ */
+ if (old_balance != 0)
+ break;
+
+ AVL_SETBALANCE(node, new_balance);
+ parent = AVL_XPARENT(node);
+ which_child = AVL_XCHILD(node);
+ }
+
+ /*
+ * perform a rotation to fix the tree and return
+ */
+ (void) avl_rotation(tree, node, new_balance);
+}
+
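+/*
+ * Illustrative sketch (editorial example, not part of the original code):
+ * the avl_find()/avl_insert() pairing described above. "my_data_t" and
+ * "tree" are hypothetical names used only for this example.
+ *
+ *	my_data_t *new = ...;
+ *	avl_index_t where;
+ *
+ *	if (avl_find(tree, new, &where) == NULL)
+ *		avl_insert(tree, new, where);
+ *
+ * avl_find() fills in "where" with the leaf position at which an equal
+ * element would belong, which is exactly what avl_insert() expects.
+ */
+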
+/*
+ * Insert "new_data" in "tree" in the given "direction" either after or
+ * before (AVL_AFTER, AVL_BEFORE) the data "here".
+ *
+ * Insertions can only be done at empty leaf points in the tree, therefore
+ * if the given child of the node is already present we move to either
+ * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since
+ * every other node in the tree is a leaf, this always works.
+ *
+ * To help developers using this interface, we assert that the new node
+ * is correctly ordered at every step of the way in DEBUG kernels.
+ */
+void
+avl_insert_here(
+ avl_tree_t *tree,
+ void *new_data,
+ void *here,
+ int direction)
+{
+ avl_node_t *node;
+ int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */
+#ifdef ZFS_DEBUG
+ int diff;
+#endif
+
+ ASSERT(tree != NULL);
+ ASSERT(new_data != NULL);
+ ASSERT(here != NULL);
+ ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER);
+
+ /*
+ * If corresponding child of node is not NULL, go to the neighboring
+ * node and reverse the insertion direction.
+ */
+ node = AVL_DATA2NODE(here, tree->avl_offset);
+
+#ifdef ZFS_DEBUG
+ diff = tree->avl_compar(new_data, here);
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+
+ if (node->avl_child[child] != NULL) {
+ node = node->avl_child[child];
+ child = 1 - child;
+ while (node->avl_child[child] != NULL) {
+#ifdef ZFS_DEBUG
+ diff = tree->avl_compar(new_data,
+ AVL_NODE2DATA(node, tree->avl_offset));
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+ node = node->avl_child[child];
+ }
+#ifdef ZFS_DEBUG
+ diff = tree->avl_compar(new_data,
+ AVL_NODE2DATA(node, tree->avl_offset));
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+ }
+ ASSERT(node->avl_child[child] == NULL);
+
+ avl_insert(tree, new_data, AVL_MKINDEX(node, child));
+}
+
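+/*
+ * Illustrative sketch (editorial example): inserting next to an element
+ * whose position is already known, avoiding a second avl_find() lookup.
+ * "prev" and "new" are hypothetical elements already known to sort
+ * adjacently, with "new" immediately after "prev":
+ *
+ *	avl_insert_here(tree, new, prev, AVL_AFTER);
+ */
+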
+/*
+ * Add a new node to an AVL tree. Strictly enforce that no duplicates can
+ * be added to the tree, using a VERIFY() that remains enabled even in
+ * non-DEBUG builds.
+ */
+void
+avl_add(avl_tree_t *tree, void *new_node)
+{
+ avl_index_t where = 0;
+
+ VERIFY(avl_find(tree, new_node, &where) == NULL);
+
+ avl_insert(tree, new_node, where);
+}
+
+/*
+ * Delete a node from the AVL tree. Deletion is similar to insertion, but
+ * with 2 complications.
+ *
+ * First, we may be deleting an interior node. Consider the following subtree:
+ *
+ * d c c
+ * / \ / \ / \
+ * b e b e b e
+ * / \ / \ /
+ * a c a a
+ *
+ * When we are deleting node (d), we find and bring up an adjacent valued leaf
+ * node, say (c), to take the interior node's place. In the code this is
+ * handled by temporarily swapping (d) and (c) in the tree and then using
+ * common code to delete (d) from the leaf position.
+ *
+ * Secondly, an interior deletion from a deep tree may require more than one
+ * rotation to fix the balance. This is handled by moving up the tree through
+ * parents and applying rotations as needed. The return value from
+ * avl_rotation() is used to detect when a subtree did not change overall
+ * height due to a rotation.
+ */
+void
+avl_remove(avl_tree_t *tree, void *data)
+{
+ avl_node_t *delete;
+ avl_node_t *parent;
+ avl_node_t *node;
+ avl_node_t tmp;
+ int old_balance;
+ int new_balance;
+ int left;
+ int right;
+ int which_child;
+ size_t off = tree->avl_offset;
+
+ delete = AVL_DATA2NODE(data, off);
+
+ /*
+ * Deletion is easiest with a node that has at most 1 child.
+ * We swap a node with 2 children with a sequentially valued
+ * neighbor node. That node will have at most 1 child. Note this
+ * has no effect on the ordering of the remaining nodes.
+ *
+ * As an optimization, we choose the greater neighbor if the tree
+ * is right heavy, otherwise the left neighbor. This reduces the
+ * number of rotations needed.
+ */
+ if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) {
+
+ /*
+ * choose node to swap from whichever side is taller
+ */
+ old_balance = AVL_XBALANCE(delete);
+ left = avl_balance2child[old_balance + 1];
+ right = 1 - left;
+
+ /*
+ * get to the previous value'd node
+ * (down 1 left, as far as possible right)
+ */
+ for (node = delete->avl_child[left];
+ node->avl_child[right] != NULL;
+ node = node->avl_child[right])
+ ;
+
+ /*
+ * create a temp placeholder for 'node'
+ * move 'node' to delete's spot in the tree
+ */
+ tmp = *node;
+
+ *node = *delete;
+ if (node->avl_child[left] == node)
+ node->avl_child[left] = &tmp;
+
+ parent = AVL_XPARENT(node);
+ if (parent != NULL)
+ parent->avl_child[AVL_XCHILD(node)] = node;
+ else
+ tree->avl_root = node;
+ AVL_SETPARENT(node->avl_child[left], node);
+ AVL_SETPARENT(node->avl_child[right], node);
+
+ /*
+ * Put tmp where node used to be (just temporary).
+ * It always has a parent and at most 1 child.
+ */
+ delete = &tmp;
+ parent = AVL_XPARENT(delete);
+ parent->avl_child[AVL_XCHILD(delete)] = delete;
+ which_child = (delete->avl_child[1] != 0);
+ if (delete->avl_child[which_child] != NULL)
+ AVL_SETPARENT(delete->avl_child[which_child], delete);
+ }
+
+
+ /*
+ * Here we know "delete" is at least partially a leaf node. It can
+ * be easily removed from the tree.
+ */
+ ASSERT(tree->avl_numnodes > 0);
+ --tree->avl_numnodes;
+ parent = AVL_XPARENT(delete);
+ which_child = AVL_XCHILD(delete);
+ if (delete->avl_child[0] != NULL)
+ node = delete->avl_child[0];
+ else
+ node = delete->avl_child[1];
+
+ /*
+ * Connect parent directly to node (leaving out delete).
+ */
+ if (node != NULL) {
+ AVL_SETPARENT(node, parent);
+ AVL_SETCHILD(node, which_child);
+ }
+ if (parent == NULL) {
+ tree->avl_root = node;
+ return;
+ }
+ parent->avl_child[which_child] = node;
+
+
+ /*
+ * Since the subtree is now shorter, begin adjusting parent balances
+ * and performing any needed rotations.
+ */
+ do {
+
+ /*
+ * Move up the tree and adjust the balance
+ *
+ * Capture the parent and which_child values for the next
+ * iteration before any rotations occur.
+ */
+ node = parent;
+ old_balance = AVL_XBALANCE(node);
+ new_balance = old_balance - avl_child2balance[which_child];
+ parent = AVL_XPARENT(node);
+ which_child = AVL_XCHILD(node);
+
+ /*
+ * If a node was in perfect balance but isn't anymore then
+ * we can stop, since the height didn't change above this point
+ * due to a deletion.
+ */
+ if (old_balance == 0) {
+ AVL_SETBALANCE(node, new_balance);
+ break;
+ }
+
+ /*
+ * If the new balance is zero, we don't need to rotate.
+ * Otherwise we need a rotation to fix the balance; if the
+ * rotation doesn't change the height of the sub-tree, we
+ * have finished adjusting.
+ */
+ if (new_balance == 0)
+ AVL_SETBALANCE(node, new_balance);
+ else if (!avl_rotation(tree, node, new_balance))
+ break;
+ } while (parent != NULL);
+}
+
+#define AVL_REINSERT(tree, obj) \
+ avl_remove((tree), (obj)); \
+ avl_add((tree), (obj))
+
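+/*
+ * avl_update_lt(), avl_update_gt() and avl_update() are helpers for
+ * elements whose sort key may have changed in place. Each compares the
+ * element with its in-order neighbor(s); if it is now out of order it is
+ * removed and re-added (AVL_REINSERT) and B_TRUE is returned, otherwise
+ * B_FALSE. avl_update_lt() assumes the key can only have become smaller,
+ * avl_update_gt() only larger, and avl_update() handles both directions.
+ */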
+boolean_t
+avl_update_lt(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) ||
+ (t->avl_compar(obj, neighbor) <= 0));
+
+ neighbor = AVL_PREV(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+boolean_t
+avl_update_gt(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) ||
+ (t->avl_compar(obj, neighbor) >= 0));
+
+ neighbor = AVL_NEXT(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+boolean_t
+avl_update(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ neighbor = AVL_PREV(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ neighbor = AVL_NEXT(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
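+/*
+ * Swap the contents (root pointer and node count) of two trees. The
+ * trees must be compatible: same comparator, node offset, and size.
+ */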
+void
+avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
+{
+ avl_node_t *temp_node;
+ ulong_t temp_numnodes;
+
+ ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
+ ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
+ ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
+
+ temp_node = tree1->avl_root;
+ temp_numnodes = tree1->avl_numnodes;
+ tree1->avl_root = tree2->avl_root;
+ tree1->avl_numnodes = tree2->avl_numnodes;
+ tree2->avl_root = temp_node;
+ tree2->avl_numnodes = temp_numnodes;
+}
+
+/*
+ * initialize a new AVL tree
+ */
+void
+avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *),
+ size_t size, size_t offset)
+{
+ ASSERT(tree);
+ ASSERT(compar);
+ ASSERT(size > 0);
+ ASSERT(size >= offset + sizeof (avl_node_t));
+#ifdef _LP64
+ ASSERT((offset & 0x7) == 0);
+#endif
+
+ tree->avl_compar = compar;
+ tree->avl_root = NULL;
+ tree->avl_numnodes = 0;
+ tree->avl_size = size;
+ tree->avl_offset = offset;
+}
+
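+/*
+ * Illustrative sketch (editorial example): creating a tree of hypothetical
+ * "my_data_t" elements, each embedding an avl_node_t, keyed by "md_key".
+ * The offset of the embedded node is what the "offset" argument records.
+ *
+ *	typedef struct my_data {
+ *		uint64_t	md_key;
+ *		avl_node_t	md_node;
+ *	} my_data_t;
+ *
+ *	static int
+ *	my_compar(const void *a, const void *b)
+ *	{
+ *		const my_data_t *l = a, *r = b;
+ *		if (l->md_key < r->md_key)
+ *			return (-1);
+ *		return (l->md_key > r->md_key);
+ *	}
+ *
+ *	avl_tree_t tree;
+ *	avl_create(&tree, my_compar, sizeof (my_data_t),
+ *	    offsetof(my_data_t, md_node));
+ */
+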
+/*
+ * Delete a tree.
+ */
+/* ARGSUSED */
+void
+avl_destroy(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ ASSERT(tree->avl_numnodes == 0);
+ ASSERT(tree->avl_root == NULL);
+}
+
+
+/*
+ * Return the number of nodes in an AVL tree.
+ */
+ulong_t
+avl_numnodes(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ return (tree->avl_numnodes);
+}
+
+boolean_t
+avl_is_empty(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ return (tree->avl_numnodes == 0);
+}
+
+#define CHILDBIT (1L)
+
+/*
+ * Post-order tree walk used to visit all tree nodes and destroy the tree
+ * in post order. This is used for removing all the nodes from a tree without
+ * paying any cost for rebalancing it.
+ *
+ * example:
+ *
+ * void *cookie = NULL;
+ * my_data_t *node;
+ *
+ * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
+ * free(node);
+ * avl_destroy(tree);
+ *
+ * The cookie is really a pointer to the avl_node_t of the current node's
+ * parent, with its low bit recording which child was visited last.
+ *
+ * On input, a cookie value of CHILDBIT indicates the tree is done.
+ */
+void *
+avl_destroy_nodes(avl_tree_t *tree, void **cookie)
+{
+ avl_node_t *node;
+ avl_node_t *parent;
+ int child;
+ void *first;
+ size_t off = tree->avl_offset;
+
+ /*
+ * Initial calls go to the first node or its right descendant.
+ */
+ if (*cookie == NULL) {
+ first = avl_first(tree);
+
+ /*
+ * deal with an empty tree
+ */
+ if (first == NULL) {
+ *cookie = (void *)CHILDBIT;
+ return (NULL);
+ }
+
+ node = AVL_DATA2NODE(first, off);
+ parent = AVL_XPARENT(node);
+ goto check_right_side;
+ }
+
+ /*
+ * If there is no parent to return to we are done.
+ */
+ parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT);
+ if (parent == NULL) {
+ if (tree->avl_root != NULL) {
+ ASSERT(tree->avl_numnodes == 1);
+ tree->avl_root = NULL;
+ tree->avl_numnodes = 0;
+ }
+ return (NULL);
+ }
+
+ /*
+ * Remove the child pointer we just visited from the parent and tree.
+ */
+ child = (uintptr_t)(*cookie) & CHILDBIT;
+ parent->avl_child[child] = NULL;
+ ASSERT(tree->avl_numnodes > 1);
+ --tree->avl_numnodes;
+
+ /*
+ * If we just did a right child or there isn't one, go up to parent.
+ */
+ if (child == 1 || parent->avl_child[1] == NULL) {
+ node = parent;
+ parent = AVL_XPARENT(parent);
+ goto done;
+ }
+
+ /*
+ * Do the parent's right child, then its leftmost descendant.
+ */
+ node = parent->avl_child[1];
+ while (node->avl_child[0] != NULL) {
+ parent = node;
+ node = node->avl_child[0];
+ }
+
+ /*
+ * If here, we moved to a left child. It may have one
+ * child on the right (when balance == +1).
+ */
+check_right_side:
+ if (node->avl_child[1] != NULL) {
+ ASSERT(AVL_XBALANCE(node) == 1);
+ parent = node;
+ node = node->avl_child[1];
+ ASSERT(node->avl_child[0] == NULL &&
+ node->avl_child[1] == NULL);
+ } else {
+ ASSERT(AVL_XBALANCE(node) <= 0);
+ }
+
+done:
+ if (parent == NULL) {
+ *cookie = (void *)CHILDBIT;
+ ASSERT(node == tree->avl_root);
+ } else {
+ *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node));
+ }
+
+ return (AVL_NODE2DATA(node, off));
+}
+
+#if defined(_KERNEL)
+
+static int __init
+avl_init(void)
+{
+ return (0);
+}
+
+static void __exit
+avl_fini(void)
+{
+}
+
+module_init(avl_init);
+module_exit(avl_fini);
+#endif
+
+ZFS_MODULE_DESCRIPTION("Generic AVL tree implementation");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+
+EXPORT_SYMBOL(avl_create);
+EXPORT_SYMBOL(avl_find);
+EXPORT_SYMBOL(avl_insert);
+EXPORT_SYMBOL(avl_insert_here);
+EXPORT_SYMBOL(avl_walk);
+EXPORT_SYMBOL(avl_first);
+EXPORT_SYMBOL(avl_last);
+EXPORT_SYMBOL(avl_nearest);
+EXPORT_SYMBOL(avl_add);
+EXPORT_SYMBOL(avl_swap);
+EXPORT_SYMBOL(avl_is_empty);
+EXPORT_SYMBOL(avl_remove);
+EXPORT_SYMBOL(avl_numnodes);
+EXPORT_SYMBOL(avl_destroy_nodes);
+EXPORT_SYMBOL(avl_destroy);
+EXPORT_SYMBOL(avl_update_lt);
+EXPORT_SYMBOL(avl_update_gt);
+EXPORT_SYMBOL(avl_update);
diff --git a/sys/contrib/openzfs/module/icp/Makefile.in b/sys/contrib/openzfs/module/icp/Makefile.in
new file mode 100644
index 000000000000..7a01b2f08b8e
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/Makefile.in
@@ -0,0 +1,96 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+icp_include = $(src)/include
+else
+icp_include = $(srctree)/$(src)/include
+endif
+
+MODULE := icp
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+asflags-y := -I$(icp_include)
+ccflags-y := -I$(icp_include)
+
+$(MODULE)-objs += illumos-crypto.o
+$(MODULE)-objs += api/kcf_cipher.o
+$(MODULE)-objs += api/kcf_digest.o
+$(MODULE)-objs += api/kcf_mac.o
+$(MODULE)-objs += api/kcf_miscapi.o
+$(MODULE)-objs += api/kcf_ctxops.o
+$(MODULE)-objs += core/kcf_callprov.o
+$(MODULE)-objs += core/kcf_prov_tabs.o
+$(MODULE)-objs += core/kcf_sched.o
+$(MODULE)-objs += core/kcf_mech_tabs.o
+$(MODULE)-objs += core/kcf_prov_lib.o
+$(MODULE)-objs += spi/kcf_spi.o
+$(MODULE)-objs += io/aes.o
+$(MODULE)-objs += io/edonr_mod.o
+$(MODULE)-objs += io/sha1_mod.o
+$(MODULE)-objs += io/sha2_mod.o
+$(MODULE)-objs += io/skein_mod.o
+$(MODULE)-objs += os/modhash.o
+$(MODULE)-objs += os/modconf.o
+$(MODULE)-objs += algs/modes/cbc.o
+$(MODULE)-objs += algs/modes/ccm.o
+$(MODULE)-objs += algs/modes/ctr.o
+$(MODULE)-objs += algs/modes/ecb.o
+$(MODULE)-objs += algs/modes/gcm_generic.o
+$(MODULE)-objs += algs/modes/gcm.o
+$(MODULE)-objs += algs/modes/modes.o
+$(MODULE)-objs += algs/aes/aes_impl_generic.o
+$(MODULE)-objs += algs/aes/aes_impl.o
+$(MODULE)-objs += algs/aes/aes_modes.o
+$(MODULE)-objs += algs/edonr/edonr.o
+$(MODULE)-objs += algs/sha1/sha1.o
+$(MODULE)-objs += algs/sha2/sha2.o
+$(MODULE)-objs += algs/skein/skein.o
+$(MODULE)-objs += algs/skein/skein_block.o
+$(MODULE)-objs += algs/skein/skein_iv.o
+
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aeskey.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aes_amd64.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aes_aesni.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/gcm_pclmulqdq.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/aesni-gcm-x86_64.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/ghash-x86_64.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha1/sha1-x86_64.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha2/sha256_impl.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha2/sha512_impl.o
+
+$(MODULE)-$(CONFIG_X86) += algs/modes/gcm_pclmulqdq.o
+$(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_aesni.o
+$(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_x86-64.o
+
+# Suppress objtool "can't find jump dest instruction at" warnings. They
+# are caused by the constants which are defined in the text section of the
+# assembly file using .byte instructions (e.g. bswap_mask). The objtool
+# utility tries to interpret them as opcodes and obviously fails doing so.
+OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y
+OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y
+
+ICP_DIRS = \
+ api \
+ core \
+ spi \
+ io \
+ os \
+ algs \
+ algs/aes \
+ algs/edonr \
+ algs/modes \
+ algs/sha1 \
+ algs/sha2 \
+ algs/skein \
+ asm-x86_64 \
+ asm-x86_64/aes \
+ asm-x86_64/modes \
+ asm-x86_64/sha1 \
+ asm-x86_64/sha2 \
+ asm-i386 \
+ asm-generic
+
+all:
+ mkdir -p $(ICP_DIRS)
diff --git a/sys/contrib/openzfs/module/icp/algs/aes/aes_impl.c b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl.c
new file mode 100644
index 000000000000..037be0db60d7
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl.c
@@ -0,0 +1,443 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/icp.h>
+#include <sys/crypto/spi.h>
+#include <sys/simd.h>
+#include <modes/modes.h>
+#include <aes/aes_impl.h>
+
+/*
+ * Initialize AES encryption and decryption key schedules.
+ *
+ * Parameters:
+ * cipherKey User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ * keysched AES key schedule to be initialized, of type aes_key_t.
+ * Allocated by aes_alloc_keysched().
+ */
+void
+aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
+{
+ const aes_impl_ops_t *ops = aes_impl_get_ops();
+ aes_key_t *newbie = keysched;
+ uint_t keysize, i, j;
+ union {
+ uint64_t ka64[4];
+ uint32_t ka32[8];
+ } keyarr;
+
+ switch (keyBits) {
+ case 128:
+ newbie->nr = 10;
+ break;
+
+ case 192:
+ newbie->nr = 12;
+ break;
+
+ case 256:
+ newbie->nr = 14;
+ break;
+
+ default:
+ /* should never get here */
+ return;
+ }
+ keysize = CRYPTO_BITS2BYTES(keyBits);
+
+ /*
+ * The generic C implementation requires a byteswap on little endian
+ * machines; the accelerated implementations for various architectures
+ * may not.
+ */
+ if (!ops->needs_byteswap) {
+ /* no byteswap needed */
+ if (IS_P2ALIGNED(cipherKey, sizeof (uint64_t))) {
+ for (i = 0, j = 0; j < keysize; i++, j += 8) {
+ /* LINTED: pointer alignment */
+ keyarr.ka64[i] = *((uint64_t *)&cipherKey[j]);
+ }
+ } else {
+ bcopy(cipherKey, keyarr.ka32, keysize);
+ }
+ } else {
+ /* byte swap */
+ for (i = 0, j = 0; j < keysize; i++, j += 4) {
+ keyarr.ka32[i] =
+ htonl(*(uint32_t *)(void *)&cipherKey[j]);
+ }
+ }
+
+ ops->generate(newbie, keyarr.ka32, keyBits);
+ newbie->ops = ops;
+
+ /*
+ * Note: if there are systems that need the AES_64BIT_KS type in the
+ * future, move setting key schedule type to individual implementations
+ */
+ newbie->type = AES_32BIT_KS;
+}
+
+
+/*
+ * Encrypt one block using AES.
+ * Align if needed and (for x86 32-bit only) byte-swap.
+ *
+ * Parameters:
+ * ks Key schedule, of type aes_key_t
+ * pt Input block (plain text)
+ * ct Output block (crypto text). Can overlap with pt
+ */
+int
+aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct)
+{
+ aes_key_t *ksch = (aes_key_t *)ks;
+ const aes_impl_ops_t *ops = ksch->ops;
+
+ if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t)) && !ops->needs_byteswap) {
+ /* LINTED: pointer alignment */
+ ops->encrypt(&ksch->encr_ks.ks32[0], ksch->nr,
+ /* LINTED: pointer alignment */
+ (uint32_t *)pt, (uint32_t *)ct);
+ } else {
+ uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
+
+ /* Copy input block into buffer */
+ if (ops->needs_byteswap) {
+ buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]);
+ buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]);
+ buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]);
+ buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]);
+ } else
+ bcopy(pt, &buffer, AES_BLOCK_LEN);
+
+ ops->encrypt(&ksch->encr_ks.ks32[0], ksch->nr, buffer, buffer);
+
+ /* Copy result from buffer to output block */
+ if (ops->needs_byteswap) {
+ *(uint32_t *)(void *)&ct[0] = htonl(buffer[0]);
+ *(uint32_t *)(void *)&ct[4] = htonl(buffer[1]);
+ *(uint32_t *)(void *)&ct[8] = htonl(buffer[2]);
+ *(uint32_t *)(void *)&ct[12] = htonl(buffer[3]);
+ } else
+ bcopy(&buffer, ct, AES_BLOCK_LEN);
+ }
+ return (CRYPTO_SUCCESS);
+}
+
+
+/*
+ * Decrypt one block using AES.
+ * Align and byte-swap if needed.
+ *
+ * Parameters:
+ * ks Key schedule, of type aes_key_t
+ * ct Input block (crypto text)
+ * pt Output block (plain text). Can overlap with ct
+ */
+int
+aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt)
+{
+ aes_key_t *ksch = (aes_key_t *)ks;
+ const aes_impl_ops_t *ops = ksch->ops;
+
+ if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t)) && !ops->needs_byteswap) {
+ /* LINTED: pointer alignment */
+ ops->decrypt(&ksch->decr_ks.ks32[0], ksch->nr,
+ /* LINTED: pointer alignment */
+ (uint32_t *)ct, (uint32_t *)pt);
+ } else {
+ uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
+
+ /* Copy input block into buffer */
+ if (ops->needs_byteswap) {
+ buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]);
+ buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]);
+ buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]);
+ buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]);
+ } else
+ bcopy(ct, &buffer, AES_BLOCK_LEN);
+
+ ops->decrypt(&ksch->decr_ks.ks32[0], ksch->nr, buffer, buffer);
+
+ /* Copy result from buffer to output block */
+ if (ops->needs_byteswap) {
+ *(uint32_t *)(void *)&pt[0] = htonl(buffer[0]);
+ *(uint32_t *)(void *)&pt[4] = htonl(buffer[1]);
+ *(uint32_t *)(void *)&pt[8] = htonl(buffer[2]);
+ *(uint32_t *)(void *)&pt[12] = htonl(buffer[3]);
+ } else
+ bcopy(&buffer, pt, AES_BLOCK_LEN);
+ }
+ return (CRYPTO_SUCCESS);
+}
+
+
+/*
+ * Allocate key schedule for AES.
+ *
+ * Return the pointer and set size to the number of bytes allocated.
+ * Memory allocated must be freed by the caller when done.
+ *
+ * Parameters:
+ * size Size of key schedule allocated, in bytes
+ * kmflag Flag passed to kmem_alloc(9F); ignored in userland.
+ */
+/* ARGSUSED */
+void *
+aes_alloc_keysched(size_t *size, int kmflag)
+{
+ aes_key_t *keysched;
+
+ keysched = (aes_key_t *)kmem_alloc(sizeof (aes_key_t), kmflag);
+ if (keysched != NULL) {
+ *size = sizeof (aes_key_t);
+ return (keysched);
+ }
+ return (NULL);
+}
+
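+/*
+ * Illustrative sketch (editorial example) of the single-block interface
+ * above; error handling and key zeroing are omitted. KM_SLEEP is the
+ * usual kernel allocation flag and is ignored in userland builds.
+ *
+ *	size_t ks_size;
+ *	void *ks = aes_alloc_keysched(&ks_size, KM_SLEEP);
+ *	uint8_t key[16] = { 0 };
+ *	uint8_t pt[AES_BLOCK_LEN] = { 0 }, ct[AES_BLOCK_LEN];
+ *
+ *	aes_init_keysched(key, 128, ks);
+ *	(void) aes_encrypt_block(ks, pt, ct);
+ *	(void) aes_decrypt_block(ks, ct, pt);
+ *	kmem_free(ks, ks_size);
+ */
+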
+/* AES implementation that contains the fastest methods */
+static aes_impl_ops_t aes_fastest_impl = {
+ .name = "fastest"
+};
+
+/* All compiled in implementations */
+const aes_impl_ops_t *aes_all_impl[] = {
+ &aes_generic_impl,
+#if defined(__x86_64)
+ &aes_x86_64_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AES)
+ &aes_aesni_impl,
+#endif
+};
+
+/* Indicate that aes_impl_init() has completed */
+static boolean_t aes_impl_initialized = B_FALSE;
+
+/* Select aes implementation */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX-1)
+
+#define AES_IMPL_READ(i) (*(volatile uint32_t *) &(i))
+
+static uint32_t icp_aes_impl = IMPL_FASTEST;
+static uint32_t user_sel_impl = IMPL_FASTEST;
+
+/* Hold all supported implementations */
+static size_t aes_supp_impl_cnt = 0;
+static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];
+
+/*
+ * Returns the AES operations for encrypt/decrypt/key setup. When a
+ * SIMD implementation is not allowed in the current context, fall
+ * back to the generic implementation.
+ */
+const aes_impl_ops_t *
+aes_impl_get_ops(void)
+{
+ if (!kfpu_allowed())
+ return (&aes_generic_impl);
+
+ const aes_impl_ops_t *ops = NULL;
+ const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
+
+ switch (impl) {
+ case IMPL_FASTEST:
+ ASSERT(aes_impl_initialized);
+ ops = &aes_fastest_impl;
+ break;
+ case IMPL_CYCLE:
+ /* Cycle through supported implementations */
+ ASSERT(aes_impl_initialized);
+ ASSERT3U(aes_supp_impl_cnt, >, 0);
+ static size_t cycle_impl_idx = 0;
+ size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
+ ops = aes_supp_impl[idx];
+ break;
+ default:
+ ASSERT3U(impl, <, aes_supp_impl_cnt);
+ ASSERT3U(aes_supp_impl_cnt, >, 0);
+ if (impl < ARRAY_SIZE(aes_all_impl))
+ ops = aes_supp_impl[impl];
+ break;
+ }
+
+ ASSERT3P(ops, !=, NULL);
+
+ return (ops);
+}
+
+/*
+ * Initialize all supported implementations.
+ */
+void
+aes_impl_init(void)
+{
+ aes_impl_ops_t *curr_impl;
+ int i, c;
+
+ /* Move supported implementations into aes_supp_impl */
+ for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
+ curr_impl = (aes_impl_ops_t *)aes_all_impl[i];
+
+ if (curr_impl->is_supported())
+ aes_supp_impl[c++] = (aes_impl_ops_t *)curr_impl;
+ }
+ aes_supp_impl_cnt = c;
+
+ /*
+ * Set the fastest implementation given the assumption that the
+ * hardware accelerated version is the fastest.
+ */
+#if defined(__x86_64)
+#if defined(HAVE_AES)
+ if (aes_aesni_impl.is_supported()) {
+ memcpy(&aes_fastest_impl, &aes_aesni_impl,
+ sizeof (aes_fastest_impl));
+ } else
+#endif
+ {
+ memcpy(&aes_fastest_impl, &aes_x86_64_impl,
+ sizeof (aes_fastest_impl));
+ }
+#else
+ memcpy(&aes_fastest_impl, &aes_generic_impl,
+ sizeof (aes_fastest_impl));
+#endif
+
+ strlcpy(aes_fastest_impl.name, "fastest", AES_IMPL_NAME_MAX);
+
+ /* Finish initialization */
+ atomic_swap_32(&icp_aes_impl, user_sel_impl);
+ aes_impl_initialized = B_TRUE;
+}
+
+static const struct {
+ char *name;
+ uint32_t sel;
+} aes_impl_opts[] = {
+ { "cycle", IMPL_CYCLE },
+ { "fastest", IMPL_FASTEST },
+};
+
+/*
+ * Set the desired aes implementation.
+ *
+ * If we are called before init(), the user preference is saved in
+ * user_sel_impl and applied in the later init() call. This happens when
+ * the module parameter is specified on module load. Otherwise, update
+ * icp_aes_impl directly.
+ *
+ * @val Name of the aes implementation to use
+ */
+int
+aes_impl_set(const char *val)
+{
+ int err = -EINVAL;
+ char req_name[AES_IMPL_NAME_MAX];
+ uint32_t impl = AES_IMPL_READ(user_sel_impl);
+ size_t i;
+
+ /* sanitize input */
+ i = strnlen(val, AES_IMPL_NAME_MAX);
+ if (i == 0 || i >= AES_IMPL_NAME_MAX)
+ return (err);
+
+ strlcpy(req_name, val, AES_IMPL_NAME_MAX);
+ while (i > 0 && isspace(req_name[i-1]))
+ i--;
+ req_name[i] = '\0';
+
+ /* Check mandatory options */
+ for (i = 0; i < ARRAY_SIZE(aes_impl_opts); i++) {
+ if (strcmp(req_name, aes_impl_opts[i].name) == 0) {
+ impl = aes_impl_opts[i].sel;
+ err = 0;
+ break;
+ }
+ }
+
+ /* check all supported impl if init() was already called */
+ if (err != 0 && aes_impl_initialized) {
+ /* check all supported implementations */
+ for (i = 0; i < aes_supp_impl_cnt; i++) {
+ if (strcmp(req_name, aes_supp_impl[i]->name) == 0) {
+ impl = i;
+ err = 0;
+ break;
+ }
+ }
+ }
+
+ if (err == 0) {
+ if (aes_impl_initialized)
+ atomic_swap_32(&icp_aes_impl, impl);
+ else
+ atomic_swap_32(&user_sel_impl, impl);
+ }
+
+ return (err);
+}
+
+#if defined(_KERNEL) && defined(__linux__)
+
+static int
+icp_aes_impl_set(const char *val, zfs_kernel_param_t *kp)
+{
+ return (aes_impl_set(val));
+}
+
+static int
+icp_aes_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+ int i, cnt = 0;
+ char *fmt;
+ const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
+
+ ASSERT(aes_impl_initialized);
+
+ /* list mandatory options */
+ for (i = 0; i < ARRAY_SIZE(aes_impl_opts); i++) {
+ fmt = (impl == aes_impl_opts[i].sel) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, aes_impl_opts[i].name);
+ }
+
+ /* list all supported implementations */
+ for (i = 0; i < aes_supp_impl_cnt; i++) {
+ fmt = (i == impl) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, aes_supp_impl[i]->name);
+ }
+
+ return (cnt);
+}
+
+module_param_call(icp_aes_impl, icp_aes_impl_set, icp_aes_impl_get,
+ NULL, 0644);
+MODULE_PARM_DESC(icp_aes_impl, "Select aes implementation.");
+#endif
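+
+/*
+ * Editorial note: on Linux the parameter registered above is normally
+ * exposed as /sys/module/icp/parameters/icp_aes_impl, so the active
+ * implementation can be inspected and changed at runtime, e.g.:
+ *
+ *	cat /sys/module/icp/parameters/icp_aes_impl
+ *	echo fastest > /sys/module/icp/parameters/icp_aes_impl
+ */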
diff --git a/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_aesni.c b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_aesni.c
new file mode 100644
index 000000000000..4b5eefd71b17
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_aesni.c
@@ -0,0 +1,124 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64) && defined(HAVE_AES)
+
+#include <sys/simd.h>
+#include <sys/types.h>
+
+/* These functions are used to execute AES-NI instructions: */
+extern int rijndael_key_setup_enc_intel(uint32_t rk[],
+ const uint32_t cipherKey[], uint64_t keyBits);
+extern int rijndael_key_setup_dec_intel(uint32_t rk[],
+ const uint32_t cipherKey[], uint64_t keyBits);
+extern void aes_encrypt_intel(const uint32_t rk[], int Nr,
+ const uint32_t pt[4], uint32_t ct[4]);
+extern void aes_decrypt_intel(const uint32_t rk[], int Nr,
+ const uint32_t ct[4], uint32_t pt[4]);
+
+
+#include <aes/aes_impl.h>
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules.
+ *
+ * Parameters:
+ * key AES key schedule to be initialized
+ * keyarr32 User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_aesni_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+ kfpu_begin();
+ key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
+ keyarr32, keybits);
+ key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
+ keyarr32, keybits);
+ kfpu_end();
+}
+
+/*
+ * Encrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so copying for alignment (and byte-order
+ * reversal on little endian systems) might be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr Number of rounds
+ * pt Input block (plain text)
+ * ct Output block (crypto text). Can overlap with pt
+ */
+static void
+aes_aesni_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4])
+{
+ kfpu_begin();
+ aes_encrypt_intel(rk, Nr, pt, ct);
+ kfpu_end();
+}
+
+/*
+ * Decrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so copying for alignment (and byte-order
+ * reversal on little endian systems) might be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr Number of rounds
+ * ct Input block (crypto text)
+ * pt Output block (plain text). Can overlap with ct
+ */
+static void
+aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ uint32_t pt[4])
+{
+ kfpu_begin();
+ aes_decrypt_intel(rk, Nr, ct, pt);
+ kfpu_end();
+}
+
+static boolean_t
+aes_aesni_will_work(void)
+{
+ return (kfpu_allowed() && zfs_aes_available());
+}
+
+const aes_impl_ops_t aes_aesni_impl = {
+ .generate = &aes_aesni_generate,
+ .encrypt = &aes_aesni_encrypt,
+ .decrypt = &aes_aesni_decrypt,
+ .is_supported = &aes_aesni_will_work,
+ .needs_byteswap = B_FALSE,
+ .name = "aesni"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_AES) */
diff --git a/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_generic.c b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_generic.c
new file mode 100644
index 000000000000..427c096c6ab3
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_generic.c
@@ -0,0 +1,1242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <aes/aes_impl.h>
+
+/*
+ * This file is derived from the file rijndael-alg-fst.c taken from the
+ * "optimized C code v3.0" on the "rijndael home page"
+ * http://www.iaik.tu-graz.ac.at/research/krypto/AES/old/~rijmen/rijndael/
+ * pointed by the NIST web-site http://csrc.nist.gov/archive/aes/
+ *
+ * The following note is from the original file:
+ */
+
+/*
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Constant tables
+ */
+
+/*
+ * Te0[x] = S [x].[02, 01, 01, 03];
+ * Te1[x] = S [x].[03, 02, 01, 01];
+ * Te2[x] = S [x].[01, 03, 02, 01];
+ * Te3[x] = S [x].[01, 01, 03, 02];
+ * Te4[x] = S [x].[01, 01, 01, 01];
+ *
+ * Td0[x] = Si[x].[0e, 09, 0d, 0b];
+ * Td1[x] = Si[x].[0b, 0e, 09, 0d];
+ * Td2[x] = Si[x].[0d, 0b, 0e, 09];
+ * Td3[x] = Si[x].[09, 0d, 0b, 0e];
+ * Td4[x] = Si[x].[01, 01, 01, 01];
+ */
+
+/* Encrypt Sbox constants (for the substitute bytes operation) */
+
+static const uint32_t Te0[256] =
+{
+ 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
+ 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
+ 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
+ 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
+ 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
+ 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
+ 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
+ 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
+ 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
+ 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
+ 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
+ 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
+ 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
+ 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
+ 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
+ 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
+ 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
+ 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
+ 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
+ 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
+ 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
+ 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
+ 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
+ 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
+ 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
+ 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
+ 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
+ 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
+ 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
+ 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
+ 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
+ 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
+ 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
+ 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
+ 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
+ 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
+ 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
+ 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
+ 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
+ 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
+ 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
+ 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
+ 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
+ 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
+ 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
+ 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
+ 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
+ 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
+ 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
+ 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
+ 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
+ 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
+ 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
+ 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
+ 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
+ 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
+ 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
+ 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
+ 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
+ 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
+ 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
+ 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
+ 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
+ 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU
+};
+
+
+static const uint32_t Te1[256] =
+{
+ 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
+ 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
+ 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
+ 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
+ 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
+ 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
+ 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
+ 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
+ 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
+ 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
+ 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
+ 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
+ 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
+ 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
+ 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
+ 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
+ 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
+ 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
+ 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
+ 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
+ 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
+ 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
+ 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
+ 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
+ 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
+ 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
+ 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
+ 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
+ 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
+ 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
+ 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
+ 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
+ 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
+ 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
+ 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
+ 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
+ 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
+ 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
+ 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
+ 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
+ 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
+ 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
+ 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
+ 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
+ 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
+ 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
+ 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
+ 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
+ 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
+ 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
+ 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
+ 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
+ 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
+ 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
+ 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
+ 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
+ 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
+ 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
+ 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
+ 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
+ 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
+ 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
+ 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
+ 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U
+};
+
+
+static const uint32_t Te2[256] =
+{
+ 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
+ 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
+ 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
+ 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
+ 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
+ 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
+ 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
+ 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
+ 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
+ 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
+ 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
+ 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
+ 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
+ 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
+ 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
+ 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
+ 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
+ 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
+ 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
+ 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
+ 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
+ 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
+ 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
+ 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
+ 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
+ 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
+ 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
+ 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
+ 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
+ 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
+ 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
+ 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
+ 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
+ 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
+ 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
+ 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
+ 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
+ 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
+ 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
+ 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
+ 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
+ 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
+ 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
+ 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
+ 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
+ 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
+ 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
+ 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
+ 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
+ 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
+ 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
+ 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
+ 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
+ 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
+ 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
+ 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
+ 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
+ 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
+ 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
+ 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
+ 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
+ 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
+ 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
+ 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U
+};
+
+
+static const uint32_t Te3[256] =
+{
+ 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
+ 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
+ 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
+ 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
+ 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
+ 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
+ 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
+ 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
+ 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
+ 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
+ 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
+ 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
+ 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
+ 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
+ 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
+ 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
+ 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
+ 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
+ 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
+ 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
+ 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
+ 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
+ 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
+ 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
+ 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
+ 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
+ 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
+ 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
+ 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
+ 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
+ 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
+ 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
+ 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
+ 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
+ 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
+ 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
+ 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
+ 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
+ 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
+ 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
+ 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
+ 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
+ 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
+ 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
+ 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
+ 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
+ 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
+ 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
+ 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
+ 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
+ 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
+ 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
+ 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
+ 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
+ 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
+ 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
+ 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
+ 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
+ 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
+ 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
+ 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
+ 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
+ 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
+ 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU
+};
+
+static const uint32_t Te4[256] =
+{
+ 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
+ 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
+ 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
+ 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
+ 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
+ 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
+ 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
+ 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
+ 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
+ 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
+ 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
+ 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
+ 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
+ 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
+ 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
+ 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
+ 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
+ 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
+ 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
+ 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
+ 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
+ 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
+ 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
+ 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
+ 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
+ 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
+ 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
+ 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
+ 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
+ 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
+ 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
+ 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
+ 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
+ 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
+ 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
+ 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
+ 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
+ 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
+ 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
+ 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
+ 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
+ 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
+ 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
+ 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
+ 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
+ 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
+ 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
+ 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
+ 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
+ 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
+ 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
+ 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
+ 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
+ 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
+ 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
+ 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
+ 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
+ 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
+ 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
+ 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
+ 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
+ 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
+ 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
+ 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U
+};
+
+/* Decrypt Sbox constants (for the substitute bytes operation) */
+
+static const uint32_t Td0[256] =
+{
+ 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
+ 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
+ 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
+ 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
+ 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
+ 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
+ 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
+ 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
+ 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
+ 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
+ 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
+ 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
+ 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
+ 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
+ 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
+ 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
+ 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
+ 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
+ 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
+ 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
+ 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
+ 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
+ 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
+ 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
+ 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
+ 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
+ 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
+ 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
+ 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
+ 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
+ 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
+ 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
+ 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
+ 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
+ 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
+ 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
+ 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
+ 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
+ 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
+ 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
+ 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
+ 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
+ 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
+ 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
+ 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
+ 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
+ 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
+ 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
+ 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
+ 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
+ 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
+ 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
+ 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
+ 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
+ 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
+ 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
+ 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
+ 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
+ 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
+ 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
+ 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
+ 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
+ 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
+ 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U
+};
+
+static const uint32_t Td1[256] =
+{
+ 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
+ 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
+ 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
+ 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
+ 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
+ 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
+ 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
+ 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
+ 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
+ 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
+ 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
+ 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
+ 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
+ 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
+ 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
+ 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
+ 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
+ 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
+ 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
+ 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
+ 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
+ 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
+ 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
+ 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
+ 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
+ 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
+ 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
+ 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
+ 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
+ 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
+ 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
+ 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
+ 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
+ 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
+ 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
+ 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
+ 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
+ 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
+ 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
+ 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
+ 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
+ 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
+ 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
+ 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
+ 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
+ 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
+ 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
+ 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
+ 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
+ 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
+ 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
+ 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
+ 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
+ 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
+ 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
+ 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
+ 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
+ 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
+ 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
+ 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
+ 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
+ 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
+ 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
+ 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U
+};
+
+static const uint32_t Td2[256] =
+{
+ 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
+ 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
+ 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
+ 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
+ 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
+ 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
+ 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
+ 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
+ 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
+ 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
+ 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
+ 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
+ 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
+ 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
+ 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
+ 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
+ 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
+ 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
+ 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
+ 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
+ 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
+ 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
+ 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
+ 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
+ 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
+ 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
+ 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
+ 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
+ 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
+ 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
+ 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
+ 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
+ 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
+ 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
+ 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
+ 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
+ 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
+ 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
+ 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
+ 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
+ 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
+ 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
+ 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
+ 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
+ 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
+ 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
+ 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
+ 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
+ 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
+ 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
+ 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
+ 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
+ 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
+ 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
+ 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
+ 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
+ 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
+ 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
+ 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
+ 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
+ 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
+ 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
+ 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
+ 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U
+};
+
+static const uint32_t Td3[256] =
+{
+ 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
+ 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
+ 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
+ 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
+ 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
+ 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
+ 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
+ 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
+ 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
+ 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
+ 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
+ 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
+ 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
+ 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
+ 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
+ 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
+ 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
+ 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
+ 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
+ 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
+ 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
+ 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
+ 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
+ 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
+ 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
+ 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
+ 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
+ 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
+ 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
+ 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
+ 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
+ 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
+ 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
+ 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
+ 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
+ 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
+ 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
+ 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
+ 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
+ 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
+ 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
+ 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
+ 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
+ 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
+ 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
+ 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
+ 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
+ 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
+ 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
+ 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
+ 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
+ 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
+ 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
+ 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
+ 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
+ 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
+ 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
+ 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
+ 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
+ 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
+ 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
+ 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
+ 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
+ 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U
+};
+
+static const uint32_t Td4[256] =
+{
+ 0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,
+ 0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,
+ 0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,
+ 0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,
+ 0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,
+ 0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,
+ 0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,
+ 0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,
+ 0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,
+ 0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,
+ 0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,
+ 0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,
+ 0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,
+ 0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,
+ 0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,
+ 0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,
+ 0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,
+ 0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,
+ 0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,
+ 0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,
+ 0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,
+ 0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,
+ 0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,
+ 0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,
+ 0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,
+ 0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,
+ 0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,
+ 0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,
+ 0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,
+ 0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,
+ 0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,
+ 0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,
+ 0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,
+ 0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,
+ 0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,
+ 0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,
+ 0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,
+ 0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,
+ 0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,
+ 0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,
+ 0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,
+ 0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,
+ 0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,
+ 0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,
+ 0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,
+ 0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,
+ 0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,
+ 0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,
+ 0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,
+ 0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,
+ 0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,
+ 0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,
+ 0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,
+ 0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,
+ 0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,
+ 0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,
+ 0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,
+ 0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,
+ 0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,
+ 0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,
+ 0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,
+ 0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,
+ 0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,
+ 0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU
+};
+
+/* Rcon is Round Constant; used for encryption key expansion */
+static const uint32_t rcon[RC_LENGTH] =
+{
+ /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+ 0x01000000, 0x02000000, 0x04000000, 0x08000000,
+ 0x10000000, 0x20000000, 0x40000000, 0x80000000,
+ 0x1B000000, 0x36000000
+};
+
+
+/*
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * Return the number of rounds for the given cipher key size.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk AES key schedule 32-bit array to be initialized
+ * cipherKey User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+static int
+rijndael_key_setup_enc(uint32_t rk[], const uint32_t cipherKey[],
+ int keyBits)
+{
+ int i = 0;
+ uint32_t temp;
+
+ rk[0] = cipherKey[0];
+ rk[1] = cipherKey[1];
+ rk[2] = cipherKey[2];
+ rk[3] = cipherKey[3];
+
+ if (keyBits == 128) {
+ for (;;) {
+ temp = rk[3];
+ rk[4] = rk[0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[temp & 0xff] & 0x0000ff00) ^
+ (Te4[temp >> 24] & 0x000000ff) ^
+ rcon[i];
+ rk[5] = rk[1] ^ rk[4];
+ rk[6] = rk[2] ^ rk[5];
+ rk[7] = rk[3] ^ rk[6];
+
+ if (++i == 10) {
+ return (10);
+ }
+ rk += 4;
+ }
+ }
+
+ rk[4] = cipherKey[4];
+ rk[5] = cipherKey[5];
+
+ if (keyBits == 192) {
+ for (;;) {
+ temp = rk[5];
+ rk[6] = rk[0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[temp & 0xff] & 0x0000ff00) ^
+ (Te4[temp >> 24] & 0x000000ff) ^
+ rcon[i];
+ rk[7] = rk[1] ^ rk[6];
+ rk[8] = rk[2] ^ rk[7];
+ rk[9] = rk[3] ^ rk[8];
+
+ if (++i == 8) {
+ return (12);
+ }
+
+ rk[10] = rk[4] ^ rk[9];
+ rk[11] = rk[5] ^ rk[10];
+ rk += 6;
+ }
+ }
+
+ rk[6] = cipherKey[6];
+ rk[7] = cipherKey[7];
+
+ if (keyBits == 256) {
+ for (;;) {
+ temp = rk[7];
+ rk[8] = rk[0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[temp & 0xff] & 0x0000ff00) ^
+ (Te4[temp >> 24] & 0x000000ff) ^
+ rcon[i];
+ rk[9] = rk[1] ^ rk[8];
+ rk[10] = rk[2] ^ rk[9];
+ rk[11] = rk[3] ^ rk[10];
+
+ if (++i == 7) {
+ return (14);
+ }
+ temp = rk[11];
+ rk[12] = rk[4] ^
+ (Te4[temp >> 24] & 0xff000000) ^
+ (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[temp & 0xff] & 0x000000ff);
+ rk[13] = rk[5] ^ rk[12];
+ rk[14] = rk[6] ^ rk[13];
+ rk[15] = rk[7] ^ rk[14];
+
+ rk += 8;
+ }
+ }
+
+ return (0);
+}
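+
+/*
+ * Sizing note (illustrative): the rk[] array must provide room for
+ * 4 * (Nr + 1) words, i.e. 44 words for a 128-bit key (Nr = 10),
+ * 52 for a 192-bit key (Nr = 12) and 60 for a 256-bit key (Nr = 14).
+ * A hypothetical caller holding a 256-bit key as eight 32-bit words
+ * (key32) would therefore use:
+ *
+ * uint32_t ks[60];
+ * int nr = rijndael_key_setup_enc(ks, key32, 256);
+ *
+ * which leaves nr set to 14.
+ */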
+
+/*
+ * Expand the cipher key into the decryption key schedule.
+ * Return the number of rounds for the given cipher key size.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk AES key schedule 32-bit array to be initialized
+ * cipherKey User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+static int
+rijndael_key_setup_dec(uint32_t rk[], const uint32_t cipherKey[], int keyBits)
+{
+ int Nr, i, j;
+ uint32_t temp;
+
+ /* expand the cipher key: */
+ Nr = rijndael_key_setup_enc(rk, cipherKey, keyBits);
+
+ /* invert the order of the round keys: */
+ for (i = 0, j = 4 * Nr; i < j; i += 4, j -= 4) {
+ temp = rk[i];
+ rk[i] = rk[j];
+ rk[j] = temp;
+ temp = rk[i + 1];
+ rk[i + 1] = rk[j + 1];
+ rk[j + 1] = temp;
+ temp = rk[i + 2];
+ rk[i + 2] = rk[j + 2];
+ rk[j + 2] = temp;
+ temp = rk[i + 3];
+ rk[i + 3] = rk[j + 3];
+ rk[j + 3] = temp;
+ }
+
+ /*
+ * apply the inverse MixColumn transform to all
+ * round keys but the first and the last:
+ */
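+ /*
+ * Te4[] holds the forward S-box replicated in every byte, so
+ * (Te4[x] & 0xff) is simply S-box(x). Feeding that into Td0..Td3,
+ * which combine the inverse S-box with InvMixColumns, cancels the
+ * two S-box lookups, leaving just InvMixColumns applied to each
+ * byte of the round-key word.
+ */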
+ for (i = 1; i < Nr; i++) {
+ rk += 4;
+ rk[0] = Td0[Te4[rk[0] >> 24] & 0xff] ^
+ Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[0] & 0xff] & 0xff];
+ rk[1] = Td0[Te4[rk[1] >> 24] & 0xff] ^
+ Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[1] & 0xff] & 0xff];
+ rk[2] = Td0[Te4[rk[2] >> 24] & 0xff] ^
+ Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[2] & 0xff] & 0xff];
+ rk[3] = Td0[Te4[rk[3] >> 24] & 0xff] ^
+ Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[3] & 0xff] & 0xff];
+ }
+
+ return (Nr);
+}
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules.
+ *
+ * Parameters:
+ * key AES key schedule to be initialized
+ * keyarr32 User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_generic_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+ key->nr = rijndael_key_setup_enc(&(key->encr_ks.ks32[0]), keyarr32,
+ keybits);
+ key->nr = rijndael_key_setup_dec(&(key->decr_ks.ks32[0]), keyarr32,
+ keybits);
+}
+
+/*
+ * Encrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so a copy for alignment (and byte-order
+ * reversal on little-endian systems) may be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr Number of rounds
+ * pt Input block (plain text)
+ * ct Output block (crypto text). Can overlap with pt
+ */
+static void
+aes_generic_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4])
+{
+ uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
+ int r;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+
+ s0 = pt[0] ^ rk[0];
+ s1 = pt[1] ^ rk[1];
+ s2 = pt[2] ^ rk[2];
+ s3 = pt[3] ^ rk[3];
+
+ /*
+ * Nr - 1 full rounds:
+ */
+
+ r = Nr >> 1;
+
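+ /*
+ * Each pass through the loop below performs two rounds (s -> t,
+ * then t -> s), which is why the counter starts at Nr / 2 and the
+ * loop breaks out after the first half of its final iteration;
+ * the last round is applied separately below.
+ */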
+ for (;;) {
+ t0 = Te0[s0 >> 24] ^
+ Te1[(s1 >> 16) & 0xff] ^
+ Te2[(s2 >> 8) & 0xff] ^
+ Te3[s3 & 0xff] ^
+ rk[4];
+
+ t1 = Te0[s1 >> 24] ^
+ Te1[(s2 >> 16) & 0xff] ^
+ Te2[(s3 >> 8) & 0xff] ^
+ Te3[s0 & 0xff] ^
+ rk[5];
+
+ t2 = Te0[s2 >> 24] ^
+ Te1[(s3 >> 16) & 0xff] ^
+ Te2[(s0 >> 8) & 0xff] ^
+ Te3[s1 & 0xff] ^
+ rk[6];
+
+ t3 = Te0[s3 >> 24] ^
+ Te1[(s0 >> 16) & 0xff] ^
+ Te2[(s1 >> 8) & 0xff] ^
+ Te3[s2 & 0xff] ^
+ rk[7];
+
+ rk += 8;
+
+ if (--r == 0) {
+ break;
+ }
+
+ s0 = Te0[t0 >> 24] ^
+ Te1[(t1 >> 16) & 0xff] ^
+ Te2[(t2 >> 8) & 0xff] ^
+ Te3[t3 & 0xff] ^
+ rk[0];
+
+ s1 = Te0[t1 >> 24] ^
+ Te1[(t2 >> 16) & 0xff] ^
+ Te2[(t3 >> 8) & 0xff] ^
+ Te3[t0 & 0xff] ^
+ rk[1];
+
+ s2 = Te0[t2 >> 24] ^
+ Te1[(t3 >> 16) & 0xff] ^
+ Te2[(t0 >> 8) & 0xff] ^
+ Te3[t1 & 0xff] ^
+ rk[2];
+
+ s3 = Te0[t3 >> 24] ^
+ Te1[(t0 >> 16) & 0xff] ^
+ Te2[(t1 >> 8) & 0xff] ^
+ Te3[t2 & 0xff] ^
+ rk[3];
+ }
+
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+
+ s0 = (Te4[(t0 >> 24)] & 0xff000000) ^
+ (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t3 & 0xff] & 0x000000ff) ^
+ rk[0];
+ ct[0] = s0;
+
+ s1 = (Te4[(t1 >> 24)] & 0xff000000) ^
+ (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t0 & 0xff] & 0x000000ff) ^
+ rk[1];
+ ct[1] = s1;
+
+ s2 = (Te4[(t2 >> 24)] & 0xff000000) ^
+ (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t1 & 0xff] & 0x000000ff) ^
+ rk[2];
+ ct[2] = s2;
+
+ s3 = (Te4[(t3 >> 24)] & 0xff000000) ^
+ (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t2 & 0xff] & 0x000000ff) ^
+ rk[3];
+ ct[3] = s3;
+}
+
+
+/*
+ * Decrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so a copy for alignment (and byte-order
+ * reversal on little-endian systems) may be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr Number of rounds
+ * ct Input block (crypto text)
+ * pt Output block (plain text). Can overlap with ct
+ */
+static void
+aes_generic_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ uint32_t pt[4])
+{
+ uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
+ int r;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = ct[0] ^ rk[0];
+ s1 = ct[1] ^ rk[1];
+ s2 = ct[2] ^ rk[2];
+ s3 = ct[3] ^ rk[3];
+
+ /*
+ * Nr - 1 full rounds:
+ */
+
+ r = Nr >> 1;
+
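+ /*
+ * As in aes_generic_encrypt(), each pass through the loop performs
+ * two rounds (s -> t, then t -> s), so the counter starts at
+ * Nr / 2 and the final round is applied separately below.
+ */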
+ for (;;) {
+ t0 = Td0[s0 >> 24] ^
+ Td1[(s3 >> 16) & 0xff] ^
+ Td2[(s2 >> 8) & 0xff] ^
+ Td3[s1 & 0xff] ^
+ rk[4];
+
+ t1 = Td0[s1 >> 24] ^
+ Td1[(s0 >> 16) & 0xff] ^
+ Td2[(s3 >> 8) & 0xff] ^
+ Td3[s2 & 0xff] ^
+ rk[5];
+
+ t2 = Td0[s2 >> 24] ^
+ Td1[(s1 >> 16) & 0xff] ^
+ Td2[(s0 >> 8) & 0xff] ^
+ Td3[s3 & 0xff] ^
+ rk[6];
+
+ t3 = Td0[s3 >> 24] ^
+ Td1[(s2 >> 16) & 0xff] ^
+ Td2[(s1 >> 8) & 0xff] ^
+ Td3[s0 & 0xff] ^
+ rk[7];
+
+ rk += 8;
+
+ if (--r == 0) {
+ break;
+ }
+
+ s0 = Td0[t0 >> 24] ^
+ Td1[(t3 >> 16) & 0xff] ^
+ Td2[(t2 >> 8) & 0xff] ^
+ Td3[t1 & 0xff] ^
+ rk[0];
+
+ s1 = Td0[t1 >> 24] ^
+ Td1[(t0 >> 16) & 0xff] ^
+ Td2[(t3 >> 8) & 0xff] ^
+ Td3[t2 & 0xff] ^
+ rk[1];
+
+ s2 = Td0[t2 >> 24] ^
+ Td1[(t1 >> 16) & 0xff] ^
+ Td2[(t0 >> 8) & 0xff] ^
+ Td3[t3 & 0xff] ^
+ rk[2];
+
+ s3 = Td0[t3 >> 24] ^
+ Td1[(t2 >> 16) & 0xff] ^
+ Td2[(t1 >> 8) & 0xff] ^
+ Td3[t0 & 0xff] ^
+ rk[3];
+ }
+
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+
+ s0 = (Td4[t0 >> 24] & 0xff000000) ^
+ (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t1 & 0xff] & 0x000000ff) ^
+ rk[0];
+ pt[0] = s0;
+
+ s1 = (Td4[t1 >> 24] & 0xff000000) ^
+ (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t2 & 0xff] & 0x000000ff) ^
+ rk[1];
+ pt[1] = s1;
+
+ s2 = (Td4[t2 >> 24] & 0xff000000) ^
+ (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t3 & 0xff] & 0x000000ff) ^
+ rk[2];
+ pt[2] = s2;
+
+ s3 = (Td4[t3 >> 24] & 0xff000000) ^
+ (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t0 & 0xff] & 0x000000ff) ^
+ rk[3];
+ pt[3] = s3;
+}
+
+static boolean_t
+aes_generic_will_work(void)
+{
+ return (B_TRUE);
+}
+
+/*
+ * On _ZFS_LITTLE_ENDIAN machines, every 4 bytes of the key must be
+ * reversed before use (hence needs_byteswap below); on _ZFS_BIG_ENDIAN
+ * the key is copied without reversing bytes.
+ *
+ * SPARCv8/v9 uses a key schedule array with 64-bit elements.
+ * X86/AMD64 uses a key schedule array with 32-bit elements.
+ */
+const aes_impl_ops_t aes_generic_impl = {
+ .generate = &aes_generic_generate,
+ .encrypt = &aes_generic_encrypt,
+ .decrypt = &aes_generic_decrypt,
+ .is_supported = &aes_generic_will_work,
+#if defined(_ZFS_LITTLE_ENDIAN)
+ .needs_byteswap = B_TRUE,
+#else
+ .needs_byteswap = B_FALSE,
+#endif
+ .name = "generic"
+};
diff --git a/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_x86-64.c b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_x86-64.c
new file mode 100644
index 000000000000..19f8fd5012cf
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_x86-64.c
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64)
+
+#include <sys/simd.h>
+#include <aes/aes_impl.h>
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules.
+ *
+ * Parameters:
+ * key AES key schedule to be initialized
+ * keyarr32 User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_x86_64_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+ key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
+ keyarr32, keybits);
+ key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
+ keyarr32, keybits);
+}
+
+static boolean_t
+aes_x86_64_will_work(void)
+{
+ return (B_TRUE);
+}
+
+const aes_impl_ops_t aes_x86_64_impl = {
+ .generate = &aes_x86_64_generate,
+ .encrypt = &aes_encrypt_amd64,
+ .decrypt = &aes_decrypt_amd64,
+ .is_supported = &aes_x86_64_will_work,
+ .needs_byteswap = B_FALSE,
+ .name = "x86_64"
+};
+
+#endif /* defined(__x86_64) */
diff --git a/sys/contrib/openzfs/module/icp/algs/aes/aes_modes.c b/sys/contrib/openzfs/module/icp/algs/aes/aes_modes.c
new file mode 100644
index 000000000000..9e4b498fffcb
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/aes/aes_modes.c
@@ -0,0 +1,135 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <aes/aes_impl.h>
+
+/* Copy a 16-byte AES block from "in" to "out" */
+void
+aes_copy_block(uint8_t *in, uint8_t *out)
+{
+ if (IS_P2ALIGNED2(in, out, sizeof (uint32_t))) {
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&out[0] = *(uint32_t *)&in[0];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&out[4] = *(uint32_t *)&in[4];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&out[8] = *(uint32_t *)&in[8];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&out[12] = *(uint32_t *)&in[12];
+ } else {
+ AES_COPY_BLOCK(in, out);
+ }
+}
+
+
+/* XOR a 16-byte AES block of data into dst */
+void
+aes_xor_block(uint8_t *data, uint8_t *dst)
+{
+ if (IS_P2ALIGNED2(dst, data, sizeof (uint32_t))) {
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&dst[0] ^= *(uint32_t *)&data[0];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&dst[4] ^= *(uint32_t *)&data[4];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&dst[8] ^= *(uint32_t *)&data[8];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&dst[12] ^= *(uint32_t *)&data[12];
+ } else {
+ AES_XOR_BLOCK(data, dst);
+ }
+}
+
+
+/*
+ * Encrypt multiple blocks of data according to mode.
+ */
+int
+aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
+ crypto_data_t *out)
+{
+ aes_ctx_t *aes_ctx = ctx;
+ int rv;
+
+ if (aes_ctx->ac_flags & CTR_MODE) {
+ rv = ctr_mode_contiguous_blocks(ctx, data, length, out,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ } else if (aes_ctx->ac_flags & CCM_MODE) {
+ rv = ccm_mode_encrypt_contiguous_blocks(ctx, data, length,
+ out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ rv = gcm_mode_encrypt_contiguous_blocks(ctx, data, length,
+ out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ } else if (aes_ctx->ac_flags & CBC_MODE) {
+ rv = cbc_encrypt_contiguous_blocks(ctx,
+ data, length, out, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_copy_block, aes_xor_block);
+ } else {
+ rv = ecb_cipher_contiguous_blocks(ctx, data, length, out,
+ AES_BLOCK_LEN, aes_encrypt_block);
+ }
+ return (rv);
+}
+
+
+/*
+ * Decrypt multiple blocks of data according to mode.
+ */
+int
+aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
+ crypto_data_t *out)
+{
+ aes_ctx_t *aes_ctx = ctx;
+ int rv;
+
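+ /*
+ * Note that the counter-based modes (CTR, CCM, GCM/GMAC) pass
+ * aes_encrypt_block even on the decrypt path: those modes only
+ * ever run the forward cipher to generate the keystream.
+ */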
+ if (aes_ctx->ac_flags & CTR_MODE) {
+ rv = ctr_mode_contiguous_blocks(ctx, data, length, out,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ if (rv == CRYPTO_DATA_LEN_RANGE)
+ rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
+ } else if (aes_ctx->ac_flags & CCM_MODE) {
+ rv = ccm_mode_decrypt_contiguous_blocks(ctx, data, length,
+ out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ rv = gcm_mode_decrypt_contiguous_blocks(ctx, data, length,
+ out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ } else if (aes_ctx->ac_flags & CBC_MODE) {
+ rv = cbc_decrypt_contiguous_blocks(ctx, data, length, out,
+ AES_BLOCK_LEN, aes_decrypt_block, aes_copy_block,
+ aes_xor_block);
+ } else {
+ rv = ecb_cipher_contiguous_blocks(ctx, data, length, out,
+ AES_BLOCK_LEN, aes_decrypt_block);
+ if (rv == CRYPTO_DATA_LEN_RANGE)
+ rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
+ }
+ return (rv);
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/edonr/edonr.c b/sys/contrib/openzfs/module/icp/algs/edonr/edonr.c
new file mode 100644
index 000000000000..7c677095f1ef
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/edonr/edonr.c
@@ -0,0 +1,746 @@
+/*
+ * IDI,NTNU
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright (C) 2009, 2010, Jorn Amundsen <jorn.amundsen@ntnu.no>
+ * Tweaked Edon-R implementation for SUPERCOP, based on NIST API.
+ *
+ * $Id: edonr.c 517 2013-02-17 20:34:39Z joern $
+ */
+/*
+ * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved
+ */
+
+#include <sys/strings.h>
+#include <sys/edonr.h>
+#include <sys/debug.h>
+
+/* big endian support, provides no-ops if run on little endian hosts */
+#include "edonr_byteorder.h"
+
+#define hashState224(x) ((x)->pipe->p256)
+#define hashState256(x) ((x)->pipe->p256)
+#define hashState384(x) ((x)->pipe->p512)
+#define hashState512(x) ((x)->pipe->p512)
+
+/* shift and rotate shortcuts */
+#define shl(x, n) ((x) << n)
+#define shr(x, n) ((x) >> n)
+
+#define rotl32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+#define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+
+#define rotl64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
+#define rotr64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
+
+#if !defined(__C99_RESTRICT)
+#define restrict /* restrict */
+#endif
+
+#define EDONR_VALID_HASHBITLEN(x) \
+ ((x) == 512 || (x) == 384 || (x) == 256 || (x) == 224)
+
+/* EdonR224 initial double chaining pipe */
+static const uint32_t i224p2[16] = {
+ 0x00010203ul, 0x04050607ul, 0x08090a0bul, 0x0c0d0e0ful,
+ 0x10111213ul, 0x14151617ul, 0x18191a1bul, 0x1c1d1e1ful,
+ 0x20212223ul, 0x24252627ul, 0x28292a2bul, 0x2c2d2e2ful,
+ 0x30313233ul, 0x34353637ul, 0x38393a3bul, 0x3c3d3e3ful,
+};
+
+/* EdonR256 initial double chaining pipe */
+static const uint32_t i256p2[16] = {
+ 0x40414243ul, 0x44454647ul, 0x48494a4bul, 0x4c4d4e4ful,
+ 0x50515253ul, 0x54555657ul, 0x58595a5bul, 0x5c5d5e5ful,
+ 0x60616263ul, 0x64656667ul, 0x68696a6bul, 0x6c6d6e6ful,
+ 0x70717273ul, 0x74757677ul, 0x78797a7bul, 0x7c7d7e7ful,
+};
+
+/* EdonR384 initial double chaining pipe */
+static const uint64_t i384p2[16] = {
+ 0x0001020304050607ull, 0x08090a0b0c0d0e0full,
+ 0x1011121314151617ull, 0x18191a1b1c1d1e1full,
+ 0x2021222324252627ull, 0x28292a2b2c2d2e2full,
+ 0x3031323334353637ull, 0x38393a3b3c3d3e3full,
+ 0x4041424344454647ull, 0x48494a4b4c4d4e4full,
+ 0x5051525354555657ull, 0x58595a5b5c5d5e5full,
+ 0x6061626364656667ull, 0x68696a6b6c6d6e6full,
+ 0x7071727374757677ull, 0x78797a7b7c7d7e7full
+};
+
+/* EdonR512 initial double chaining pipe */
+static const uint64_t i512p2[16] = {
+ 0x8081828384858687ull, 0x88898a8b8c8d8e8full,
+ 0x9091929394959697ull, 0x98999a9b9c9d9e9full,
+ 0xa0a1a2a3a4a5a6a7ull, 0xa8a9aaabacadaeafull,
+ 0xb0b1b2b3b4b5b6b7ull, 0xb8b9babbbcbdbebfull,
+ 0xc0c1c2c3c4c5c6c7ull, 0xc8c9cacbcccdcecfull,
+ 0xd0d1d2d3d4d5d6d7ull, 0xd8d9dadbdcdddedfull,
+ 0xe0e1e2e3e4e5e6e7ull, 0xe8e9eaebecedeeefull,
+ 0xf0f1f2f3f4f5f6f7ull, 0xf8f9fafbfcfdfeffull
+};
+
+/*
+ * First Latin Square
+ * 0 7 1 3 2 4 6 5
+ * 4 1 7 6 3 0 5 2
+ * 7 0 4 2 5 3 1 6
+ * 1 4 0 5 6 2 7 3
+ * 2 3 6 7 1 5 0 4
+ * 5 2 3 1 7 6 4 0
+ * 3 6 5 0 4 7 2 1
+ * 6 5 2 4 0 1 3 7
+ */
+#define LS1_256(c, x0, x1, x2, x3, x4, x5, x6, x7) \
+{ \
+ uint32_t x04, x17, x23, x56, x07, x26; \
+ x04 = x0+x4, x17 = x1+x7, x07 = x04+x17; \
+ s0 = c + x07 + x2; \
+ s1 = rotl32(x07 + x3, 4); \
+ s2 = rotl32(x07 + x6, 8); \
+ x23 = x2 + x3; \
+ s5 = rotl32(x04 + x23 + x5, 22); \
+ x56 = x5 + x6; \
+ s6 = rotl32(x17 + x56 + x0, 24); \
+ x26 = x23+x56; \
+ s3 = rotl32(x26 + x7, 13); \
+ s4 = rotl32(x26 + x1, 17); \
+ s7 = rotl32(x26 + x4, 29); \
+}
+
+#define LS1_512(c, x0, x1, x2, x3, x4, x5, x6, x7) \
+{ \
+ uint64_t x04, x17, x23, x56, x07, x26; \
+ x04 = x0+x4, x17 = x1+x7, x07 = x04+x17; \
+ s0 = c + x07 + x2; \
+ s1 = rotl64(x07 + x3, 5); \
+ s2 = rotl64(x07 + x6, 15); \
+ x23 = x2 + x3; \
+ s5 = rotl64(x04 + x23 + x5, 40); \
+ x56 = x5 + x6; \
+ s6 = rotl64(x17 + x56 + x0, 50); \
+ x26 = x23+x56; \
+ s3 = rotl64(x26 + x7, 22); \
+ s4 = rotl64(x26 + x1, 31); \
+ s7 = rotl64(x26 + x4, 59); \
+}
+
+/*
+ * Second Orthogonal Latin Square
+ * 0 4 2 3 1 6 5 7
+ * 7 6 3 2 5 4 1 0
+ * 5 3 1 6 0 2 7 4
+ * 1 0 5 4 3 7 2 6
+ * 2 1 0 7 4 5 6 3
+ * 3 5 7 0 6 1 4 2
+ * 4 7 6 1 2 0 3 5
+ * 6 2 4 5 7 3 0 1
+ */
+#define LS2_256(c, y0, y1, y2, y3, y4, y5, y6, y7) \
+{ \
+ uint32_t y01, y25, y34, y67, y04, y05, y27, y37; \
+ y01 = y0+y1, y25 = y2+y5, y05 = y01+y25; \
+ t0 = ~c + y05 + y7; \
+ t2 = rotl32(y05 + y3, 9); \
+ y34 = y3+y4, y04 = y01+y34; \
+ t1 = rotl32(y04 + y6, 5); \
+ t4 = rotl32(y04 + y5, 15); \
+ y67 = y6+y7, y37 = y34+y67; \
+ t3 = rotl32(y37 + y2, 11); \
+ t7 = rotl32(y37 + y0, 27); \
+ y27 = y25+y67; \
+ t5 = rotl32(y27 + y4, 20); \
+ t6 = rotl32(y27 + y1, 25); \
+}
+
+#define LS2_512(c, y0, y1, y2, y3, y4, y5, y6, y7) \
+{ \
+ uint64_t y01, y25, y34, y67, y04, y05, y27, y37; \
+ y01 = y0+y1, y25 = y2+y5, y05 = y01+y25; \
+ t0 = ~c + y05 + y7; \
+ t2 = rotl64(y05 + y3, 19); \
+ y34 = y3+y4, y04 = y01+y34; \
+ t1 = rotl64(y04 + y6, 10); \
+ t4 = rotl64(y04 + y5, 36); \
+ y67 = y6+y7, y37 = y34+y67; \
+ t3 = rotl64(y37 + y2, 29); \
+ t7 = rotl64(y37 + y0, 55); \
+ y27 = y25+y67; \
+ t5 = rotl64(y27 + y4, 44); \
+ t6 = rotl64(y27 + y1, 48); \
+}
+
+#define quasi_exform256(r0, r1, r2, r3, r4, r5, r6, r7) \
+{ \
+ uint32_t s04, s17, s23, s56, t01, t25, t34, t67; \
+ s04 = s0 ^ s4, t01 = t0 ^ t1; \
+ r0 = (s04 ^ s1) + (t01 ^ t5); \
+ t67 = t6 ^ t7; \
+ r1 = (s04 ^ s7) + (t2 ^ t67); \
+ s23 = s2 ^ s3; \
+ r7 = (s23 ^ s5) + (t4 ^ t67); \
+ t34 = t3 ^ t4; \
+ r3 = (s23 ^ s4) + (t0 ^ t34); \
+ s56 = s5 ^ s6; \
+ r5 = (s3 ^ s56) + (t34 ^ t6); \
+ t25 = t2 ^ t5; \
+ r6 = (s2 ^ s56) + (t25 ^ t7); \
+ s17 = s1 ^ s7; \
+ r4 = (s0 ^ s17) + (t1 ^ t25); \
+ r2 = (s17 ^ s6) + (t01 ^ t3); \
+}
+
+#define quasi_exform512(r0, r1, r2, r3, r4, r5, r6, r7) \
+{ \
+ uint64_t s04, s17, s23, s56, t01, t25, t34, t67; \
+ s04 = s0 ^ s4, t01 = t0 ^ t1; \
+ r0 = (s04 ^ s1) + (t01 ^ t5); \
+ t67 = t6 ^ t7; \
+ r1 = (s04 ^ s7) + (t2 ^ t67); \
+ s23 = s2 ^ s3; \
+ r7 = (s23 ^ s5) + (t4 ^ t67); \
+ t34 = t3 ^ t4; \
+ r3 = (s23 ^ s4) + (t0 ^ t34); \
+ s56 = s5 ^ s6; \
+ r5 = (s3 ^ s56) + (t34 ^ t6); \
+ t25 = t2 ^ t5; \
+ r6 = (s2 ^ s56) + (t25 ^ t7); \
+ s17 = s1 ^ s7; \
+ r4 = (s0 ^ s17) + (t1 ^ t25); \
+ r2 = (s17 ^ s6) + (t01 ^ t3); \
+}
+
+static size_t
+Q256(size_t bitlen, const uint32_t *data, uint32_t *restrict p)
+{
+ size_t bl;
+
+ for (bl = bitlen; bl >= EdonR256_BLOCK_BITSIZE;
+ bl -= EdonR256_BLOCK_BITSIZE, data += 16) {
+ uint32_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4,
+ t5, t6, t7;
+ uint32_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4,
+ q5, q6, q7;
+ const uint32_t defix = 0xaaaaaaaa;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint32_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8,
+ swp9, swp10, swp11, swp12, swp13, swp14, swp15;
+#define d(j) swp ## j
+#define s32(j) ld_swap32((uint32_t *)data + j, swp ## j)
+#else
+#define d(j) data[j]
+#endif
+
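+ /*
+ * In the rows below, d(j) refers to message word j of the
+ * current block, byte-swapped into native order first on
+ * big-endian hosts.
+ */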
+ /* First row of quasigroup e-transformations */
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ s32(8);
+ s32(9);
+ s32(10);
+ s32(11);
+ s32(12);
+ s32(13);
+ s32(14);
+ s32(15);
+#endif
+ LS1_256(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9),
+ d(8));
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ s32(0);
+ s32(1);
+ s32(2);
+ s32(3);
+ s32(4);
+ s32(5);
+ s32(6);
+ s32(7);
+#undef s32
+#endif
+ LS2_256(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7));
+ quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_256(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14),
+ d(15));
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Second row of quasigroup e-transformations */
+ LS1_256(defix, p[8], p[9], p[10], p[11], p[12], p[13], p[14],
+ p[15]);
+ LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Third row of quasigroup e-transformations */
+ LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_256(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
+ quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Fourth row of quasigroup e-transformations */
+ LS1_256(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0));
+ LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Edon-R tweak on the original SHA-3 Edon-R submission. */
+ p[0] ^= d(8) ^ p0;
+ p[1] ^= d(9) ^ p1;
+ p[2] ^= d(10) ^ p2;
+ p[3] ^= d(11) ^ p3;
+ p[4] ^= d(12) ^ p4;
+ p[5] ^= d(13) ^ p5;
+ p[6] ^= d(14) ^ p6;
+ p[7] ^= d(15) ^ p7;
+ p[8] ^= d(0) ^ q0;
+ p[9] ^= d(1) ^ q1;
+ p[10] ^= d(2) ^ q2;
+ p[11] ^= d(3) ^ q3;
+ p[12] ^= d(4) ^ q4;
+ p[13] ^= d(5) ^ q5;
+ p[14] ^= d(6) ^ q6;
+ p[15] ^= d(7) ^ q7;
+ }
+
+#undef d
+ return (bitlen - bl);
+}
+
+/*
+ * Why is this #pragma here?
+ *
+ * Checksum functions like this one can exceed the stack frame size check
+ * Linux imposes on 32-bit platforms (-Wframe-larger-than=1024). We can
+ * safely ignore that diagnostic because in ZoL the function is only
+ * called from a worker thread that won't be using much stack. The only
+ * function that goes over the 1k limit is Q512(), and it only exceeds
+ * that limit by a hair (1248 bytes on ARM32).
+ */
+#include <sys/isa_defs.h> /* for _ILP32 */
+#ifdef _ILP32 /* We're 32-bit, assume small stack frames */
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#endif
+
+#if defined(__IBMC__) && defined(_AIX) && defined(__64BIT__)
+static inline size_t
+#else
+static size_t
+#endif
+Q512(size_t bitlen, const uint64_t *data, uint64_t *restrict p)
+{
+ size_t bl;
+
+ for (bl = bitlen; bl >= EdonR512_BLOCK_BITSIZE;
+ bl -= EdonR512_BLOCK_BITSIZE, data += 16) {
+ uint64_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4,
+ t5, t6, t7;
+ uint64_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4,
+ q5, q6, q7;
+ const uint64_t defix = 0xaaaaaaaaaaaaaaaaull;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint64_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8,
+ swp9, swp10, swp11, swp12, swp13, swp14, swp15;
+#define d(j) swp##j
+#define s64(j) ld_swap64((uint64_t *)data+j, swp##j)
+#else
+#define d(j) data[j]
+#endif
+
+ /* First row of quasigroup e-transformations */
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ s64(8);
+ s64(9);
+ s64(10);
+ s64(11);
+ s64(12);
+ s64(13);
+ s64(14);
+ s64(15);
+#endif
+ LS1_512(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9),
+ d(8));
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ s64(0);
+ s64(1);
+ s64(2);
+ s64(3);
+ s64(4);
+ s64(5);
+ s64(6);
+ s64(7);
+#undef s64
+#endif
+ LS2_512(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7));
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14),
+ d(15));
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Second row of quasigroup e-transformations */
+ LS1_512(defix, p[8], p[9], p[10], p[11], p[12], p[13], p[14],
+ p[15]);
+ LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Third row of quasigroup e-transformations */
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Fourth row of quasigroup e-transformations */
+ LS1_512(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0));
+ LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Edon-R tweak on the original SHA-3 Edon-R submission. */
+ p[0] ^= d(8) ^ p0;
+ p[1] ^= d(9) ^ p1;
+ p[2] ^= d(10) ^ p2;
+ p[3] ^= d(11) ^ p3;
+ p[4] ^= d(12) ^ p4;
+ p[5] ^= d(13) ^ p5;
+ p[6] ^= d(14) ^ p6;
+ p[7] ^= d(15) ^ p7;
+ p[8] ^= d(0) ^ q0;
+ p[9] ^= d(1) ^ q1;
+ p[10] ^= d(2) ^ q2;
+ p[11] ^= d(3) ^ q3;
+ p[12] ^= d(4) ^ q4;
+ p[13] ^= d(5) ^ q5;
+ p[14] ^= d(6) ^ q6;
+ p[15] ^= d(7) ^ q7;
+ }
+
+#undef d
+ return (bitlen - bl);
+}
+
+void
+EdonRInit(EdonRState *state, size_t hashbitlen)
+{
+ ASSERT(EDONR_VALID_HASHBITLEN(hashbitlen));
+ switch (hashbitlen) {
+ case 224:
+ state->hashbitlen = 224;
+ state->bits_processed = 0;
+ state->unprocessed_bits = 0;
+ bcopy(i224p2, hashState224(state)->DoublePipe,
+ 16 * sizeof (uint32_t));
+ break;
+
+ case 256:
+ state->hashbitlen = 256;
+ state->bits_processed = 0;
+ state->unprocessed_bits = 0;
+ bcopy(i256p2, hashState256(state)->DoublePipe,
+ 16 * sizeof (uint32_t));
+ break;
+
+ case 384:
+ state->hashbitlen = 384;
+ state->bits_processed = 0;
+ state->unprocessed_bits = 0;
+ bcopy(i384p2, hashState384(state)->DoublePipe,
+ 16 * sizeof (uint64_t));
+ break;
+
+ case 512:
+ state->hashbitlen = 512;
+ state->bits_processed = 0;
+ state->unprocessed_bits = 0;
+ bcopy(i512p2, hashState224(state)->DoublePipe,
+ 16 * sizeof (uint64_t));
+ break;
+ }
+}
+
+
+void
+EdonRUpdate(EdonRState *state, const uint8_t *data, size_t databitlen)
+{
+ uint32_t *data32;
+ uint64_t *data64;
+
+ size_t bits_processed;
+
+ ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen));
+ switch (state->hashbitlen) {
+ case 224:
+ case 256:
+ if (state->unprocessed_bits > 0) {
+ /* LastBytes = databitlen / 8 */
+ int LastBytes = (int)databitlen >> 3;
+
+ ASSERT(state->unprocessed_bits + databitlen <=
+ EdonR256_BLOCK_SIZE * 8);
+
+ bcopy(data, hashState256(state)->LastPart
+ + (state->unprocessed_bits >> 3), LastBytes);
+ state->unprocessed_bits += (int)databitlen;
+ databitlen = state->unprocessed_bits;
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data32 = (uint32_t *)hashState256(state)->LastPart;
+ } else
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data32 = (uint32_t *)data;
+
+ bits_processed = Q256(databitlen, data32,
+ hashState256(state)->DoublePipe);
+ state->bits_processed += bits_processed;
+ databitlen -= bits_processed;
+ state->unprocessed_bits = (int)databitlen;
+ if (databitlen > 0) {
+ /* LastBytes = Ceil(databitlen / 8) */
+ int LastBytes =
+ ((~(((-(int)databitlen) >> 3) & 0x01ff)) +
+ 1) & 0x01ff;
+
+ data32 += bits_processed >> 5; /* advance in 32-bit words */
+ bcopy(data32, hashState256(state)->LastPart, LastBytes);
+ }
+ break;
+
+ case 384:
+ case 512:
+ if (state->unprocessed_bits > 0) {
+ /* LastBytes = databitlen / 8 */
+ int LastBytes = (int)databitlen >> 3;
+
+ ASSERT(state->unprocessed_bits + databitlen <=
+ EdonR512_BLOCK_SIZE * 8);
+
+ bcopy(data, hashState512(state)->LastPart
+ + (state->unprocessed_bits >> 3), LastBytes);
+ state->unprocessed_bits += (int)databitlen;
+ databitlen = state->unprocessed_bits;
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data64 = (uint64_t *)hashState512(state)->LastPart;
+ } else
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data64 = (uint64_t *)data;
+
+ bits_processed = Q512(databitlen, data64,
+ hashState512(state)->DoublePipe);
+ state->bits_processed += bits_processed;
+ databitlen -= bits_processed;
+ state->unprocessed_bits = (int)databitlen;
+ if (databitlen > 0) {
+ /* LastBytes = Ceil(databitlen / 8) */
+ int LastBytes =
+ ((~(((-(int)databitlen) >> 3) & 0x03ff)) +
+ 1) & 0x03ff;
+
+ data64 += bits_processed >> 6; /* advance in 64-bit words */
+ bcopy(data64, hashState512(state)->LastPart, LastBytes);
+ }
+ break;
+ }
+}
+
+void
+EdonRFinal(EdonRState *state, uint8_t *hashval)
+{
+ uint32_t *data32;
+ uint64_t *data64, num_bits;
+
+ size_t databitlen;
+ int LastByte, PadOnePosition;
+
+ num_bits = state->bits_processed + state->unprocessed_bits;
+ ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen));
+ switch (state->hashbitlen) {
+ case 224:
+ case 256:
+ LastByte = (int)state->unprocessed_bits >> 3;
+ PadOnePosition = 7 - (state->unprocessed_bits & 0x07);
+ hashState256(state)->LastPart[LastByte] =
+ (hashState256(state)->LastPart[LastByte]
+ & (0xff << (PadOnePosition + 1))) ^
+ (0x01 << PadOnePosition);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data64 = (uint64_t *)hashState256(state)->LastPart;
+
+ if (state->unprocessed_bits < 448) {
+ (void) memset((hashState256(state)->LastPart) +
+ LastByte + 1, 0x00,
+ EdonR256_BLOCK_SIZE - LastByte - 9);
+ databitlen = EdonR256_BLOCK_SIZE * 8;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ st_swap64(num_bits, data64 + 7);
+#else
+ data64[7] = num_bits;
+#endif
+ } else {
+ (void) memset((hashState256(state)->LastPart) +
+ LastByte + 1, 0x00,
+ EdonR256_BLOCK_SIZE * 2 - LastByte - 9);
+ databitlen = EdonR256_BLOCK_SIZE * 16;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ st_swap64(num_bits, data64 + 15);
+#else
+ data64[15] = num_bits;
+#endif
+ }
+
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data32 = (uint32_t *)hashState256(state)->LastPart;
+ state->bits_processed += Q256(databitlen, data32,
+ hashState256(state)->DoublePipe);
+ break;
+
+ case 384:
+ case 512:
+ LastByte = (int)state->unprocessed_bits >> 3;
+ PadOnePosition = 7 - (state->unprocessed_bits & 0x07);
+ hashState512(state)->LastPart[LastByte] =
+ (hashState512(state)->LastPart[LastByte]
+ & (0xff << (PadOnePosition + 1))) ^
+ (0x01 << PadOnePosition);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data64 = (uint64_t *)hashState512(state)->LastPart;
+
+ if (state->unprocessed_bits < 960) {
+ (void) memset((hashState512(state)->LastPart) +
+ LastByte + 1, 0x00,
+ EdonR512_BLOCK_SIZE - LastByte - 9);
+ databitlen = EdonR512_BLOCK_SIZE * 8;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ st_swap64(num_bits, data64 + 15);
+#else
+ data64[15] = num_bits;
+#endif
+ } else {
+ (void) memset((hashState512(state)->LastPart) +
+ LastByte + 1, 0x00,
+ EdonR512_BLOCK_SIZE * 2 - LastByte - 9);
+ databitlen = EdonR512_BLOCK_SIZE * 16;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ st_swap64(num_bits, data64 + 31);
+#else
+ data64[31] = num_bits;
+#endif
+ }
+
+ state->bits_processed += Q512(databitlen, data64,
+ hashState512(state)->DoublePipe);
+ break;
+ }
+
+ switch (state->hashbitlen) {
+ case 224: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint32_t *d32 = (uint32_t *)hashval;
+ uint32_t *s32 = hashState224(state)->DoublePipe + 9;
+ int j;
+
+ for (j = 0; j < EdonR224_DIGEST_SIZE >> 2; j++)
+ st_swap32(s32[j], d32 + j);
+#else
+ bcopy(hashState256(state)->DoublePipe + 9, hashval,
+ EdonR224_DIGEST_SIZE);
+#endif
+ break;
+ }
+ case 256: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint32_t *d32 = (uint32_t *)hashval;
+ uint32_t *s32 = hashState224(state)->DoublePipe + 8;
+ int j;
+
+ for (j = 0; j < EdonR256_DIGEST_SIZE >> 2; j++)
+ st_swap32(s32[j], d32 + j);
+#else
+ bcopy(hashState256(state)->DoublePipe + 8, hashval,
+ EdonR256_DIGEST_SIZE);
+#endif
+ break;
+ }
+ case 384: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint64_t *d64 = (uint64_t *)hashval;
+ uint64_t *s64 = hashState384(state)->DoublePipe + 10;
+ int j;
+
+ for (j = 0; j < EdonR384_DIGEST_SIZE >> 3; j++)
+ st_swap64(s64[j], d64 + j);
+#else
+ bcopy(hashState384(state)->DoublePipe + 10, hashval,
+ EdonR384_DIGEST_SIZE);
+#endif
+ break;
+ }
+ case 512: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint64_t *d64 = (uint64_t *)hashval;
+ uint64_t *s64 = hashState512(state)->DoublePipe + 8;
+ int j;
+
+ for (j = 0; j < EdonR512_DIGEST_SIZE >> 3; j++)
+ st_swap64(s64[j], d64 + j);
+#else
+ bcopy(hashState512(state)->DoublePipe + 8, hashval,
+ EdonR512_DIGEST_SIZE);
+#endif
+ break;
+ }
+ }
+}
+
+
+void
+EdonRHash(size_t hashbitlen, const uint8_t *data, size_t databitlen,
+ uint8_t *hashval)
+{
+ EdonRState state;
+
+ EdonRInit(&state, hashbitlen);
+ EdonRUpdate(&state, data, databitlen);
+ EdonRFinal(&state, hashval);
+}
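+
+/*
+ * Illustrative one-shot use (note that databitlen is a length in bits,
+ * not bytes); buf and buflen stand for the caller's message buffer and
+ * its byte length:
+ *
+ * uint8_t digest[EdonR512_DIGEST_SIZE];
+ * EdonRHash(512, buf, buflen * 8, digest);
+ */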
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(EdonRInit);
+EXPORT_SYMBOL(EdonRUpdate);
+EXPORT_SYMBOL(EdonRHash);
+EXPORT_SYMBOL(EdonRFinal);
+#endif
diff --git a/sys/contrib/openzfs/module/icp/algs/edonr/edonr_byteorder.h b/sys/contrib/openzfs/module/icp/algs/edonr/edonr_byteorder.h
new file mode 100644
index 000000000000..2b5d48287f26
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/edonr/edonr_byteorder.h
@@ -0,0 +1,216 @@
+/*
+ * IDI,NTNU
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright (C) 2009, 2010, Jorn Amundsen <jorn.amundsen@ntnu.no>
+ *
+ * C header file to determine the compile machine's byte order. Take care
+ * when cross compiling.
+ *
+ * $Id: byteorder.h 517 2013-02-17 20:34:39Z joern $
+ */
+/*
+ * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved
+ */
+
+#ifndef _CRYPTO_EDONR_BYTEORDER_H
+#define _CRYPTO_EDONR_BYTEORDER_H
+
+#include <sys/sysmacros.h>
+#include <sys/param.h>
+
+#if defined(__BYTE_ORDER)
+#if (__BYTE_ORDER == __BIG_ENDIAN)
+#define MACHINE_IS_BIG_ENDIAN
+#elif (__BYTE_ORDER == __LITTLE_ENDIAN)
+#define MACHINE_IS_LITTLE_ENDIAN
+#endif
+#elif defined(BYTE_ORDER)
+#if (BYTE_ORDER == BIG_ENDIAN)
+#define MACHINE_IS_BIG_ENDIAN
+#elif (BYTE_ORDER == LITTLE_ENDIAN)
+#define MACHINE_IS_LITTLE_ENDIAN
+#endif
+#endif /* __BYTE_ORDER || BYTE_ORDER */
+
+#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN)
+#if defined(_ZFS_BIG_ENDIAN) || defined(_MIPSEB)
+#define MACHINE_IS_BIG_ENDIAN
+#endif
+#if defined(_ZFS_LITTLE_ENDIAN) || defined(_MIPSEL)
+#define MACHINE_IS_LITTLE_ENDIAN
+#endif
+#endif /* !MACHINE_IS_BIG_ENDIAN && !MACHINE_IS_LITTLE_ENDIAN */
+
+#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN)
+#error unknown machine byte sex
+#endif
+
+#define BYTEORDER_INCLUDED
+
+#if defined(MACHINE_IS_BIG_ENDIAN)
+/*
+ * Byte swapping macros for big endian architectures and compilers,
+ * add as appropriate for other architectures and/or compilers.
+ *
+ * ld_swap64(src, dst) : uint64_t dst = byte-reversed *(src)
+ * st_swap64(src, dst) : *(dst) = byte-reversed uint64_t src
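+ *
+ * Q256(), for example, loads message word j on big-endian hosts with
+ * ld_swap32((uint32_t *)data + j, swp), leaving swp holding the
+ * byte-reversed (native-order) value.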
+ */
+
+#if defined(__PPC__) || defined(_ARCH_PPC)
+
+#if defined(__64BIT__)
+#if defined(_ARCH_PWR7)
+#define aix_ld_swap64(s64, d64)\
+ __asm__("ldbrx %0,0,%1" : "=r"(d64) : "r"(s64))
+#define aix_st_swap64(s64, d64)\
+ __asm__ volatile("stdbrx %1,0,%0" : : "r"(d64), "r"(s64))
+#else
+#define aix_ld_swap64(s64, d64) \
+{ \
+ uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */ \
+ \
+ __asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0;rldimi %1,%2,32,0"\
+ : "+r"(s4), "=r"(d64), "=r"(h) : "b"(s64)); \
+}
+
+#define aix_st_swap64(s64, d64) \
+{ \
+ uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */ \
+ h = (s64) >> 32; \
+ __asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0" \
+ : "+r"(s4) : "r"(s64), "r"(h), "b"(d64)); \
+}
+#endif /* 64BIT && PWR7 */
+#else
+#define aix_ld_swap64(s64, d64) \
+{ \
+ uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\
+ __asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0" \
+ : "+r"(s4), "=r"(l), "=r"(h) : "b"(s64)); \
+ d64 = ((uint64_t)h<<32) | l; \
+}
+
+#define aix_st_swap64(s64, d64) \
+{ \
+ uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\
+ l = (s64) & 0xfffffffful, h = (s64) >> 32; \
+ __asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0" \
+ : "+r"(s4) : "r"(l), "r"(h), "b"(d64)); \
+}
+#endif /* __64BIT__ */
+#define aix_ld_swap32(s32, d32)\
+ __asm__("lwbrx %0,0,%1" : "=r"(d32) : "r"(s32))
+#define aix_st_swap32(s32, d32)\
+ __asm__ volatile("stwbrx %1,0,%0" : : "r"(d32), "r"(s32))
+#define ld_swap32(s, d) aix_ld_swap32(s, d)
+#define st_swap32(s, d) aix_st_swap32(s, d)
+#define ld_swap64(s, d) aix_ld_swap64(s, d)
+#define st_swap64(s, d) aix_st_swap64(s, d)
+#endif /* __PPC__ || _ARCH_PPC */
+
+#if defined(__sparc)
+#if !defined(__arch64__) && !defined(__sparcv8) && defined(__sparcv9)
+#define __arch64__
+#endif
+#if defined(__GNUC__) || (defined(__SUNPRO_C) && __SUNPRO_C > 0x590)
+/* need Sun Studio C 5.10 and above for GNU inline assembly */
+#if defined(__arch64__)
+#define sparc_ld_swap64(s64, d64) \
+ __asm__("ldxa [%1]0x88,%0" : "=r"(d64) : "r"(s64))
+#define sparc_st_swap64(s64, d64) \
+ __asm__ volatile("stxa %0,[%1]0x88" : : "r"(s64), "r"(d64))
+#define st_swap64(s, d) sparc_st_swap64(s, d)
+#else
+#define sparc_ld_swap64(s64, d64) \
+{ \
+ uint32_t *s4, h, l; \
+ __asm__("add %3,4,%0\n\tlda [%3]0x88,%1\n\tlda [%0]0x88,%2" \
+ : "+r"(s4), "=r"(l), "=r"(h) : "r"(s64)); \
+ d64 = ((uint64_t)h<<32) | l; \
+}
+#define sparc_st_swap64(s64, d64) \
+{ \
+ uint32_t *s4, h, l; \
+ l = (s64) & 0xfffffffful, h = (s64) >> 32; \
+ __asm__ volatile("add %3,4,%0\n\tsta %1,[%3]0x88\n\tsta %2,[%0]0x88"\
+ : "+r"(s4) : "r"(l), "r"(h), "r"(d64)); \
+}
+#endif /* sparc64 */
+#define sparc_ld_swap32(s32, d32)\
+ __asm__("lda [%1]0x88,%0" : "=r"(d32) : "r"(s32))
+#define sparc_st_swap32(s32, d32)\
+ __asm__ volatile("sta %0,[%1]0x88" : : "r"(s32), "r"(d32))
+#define ld_swap32(s, d) sparc_ld_swap32(s, d)
+#define st_swap32(s, d) sparc_st_swap32(s, d)
+#define ld_swap64(s, d) sparc_ld_swap64(s, d)
+#define st_swap64(s, d) sparc_st_swap64(s, d)
+#endif /* GCC || Sun Studio C > 5.9 */
+#endif /* sparc */
+
+/* GCC fallback */
+#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap32)
+#define ld_swap32(s, d) (d = __builtin_bswap32(*(s)))
+#define st_swap32(s, d) (*(d) = __builtin_bswap32(s))
+#endif /* GCC4/PGIC && !swap32 */
+#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap64)
+#define ld_swap64(s, d) (d = __builtin_bswap64(*(s)))
+#define st_swap64(s, d) (*(d) = __builtin_bswap64(s))
+#endif /* GCC4/PGIC && !swap64 */
+
+/* generic fallback */
+#if !defined(ld_swap32)
+#define ld_swap32(s, d) \
+ (d = (*(s) >> 24) | (*(s) >> 8 & 0xff00) | \
+ (*(s) << 8 & 0xff0000) | (*(s) << 24))
+#define st_swap32(s, d) \
+ (*(d) = ((s) >> 24) | ((s) >> 8 & 0xff00) | \
+ ((s) << 8 & 0xff0000) | ((s) << 24))
+#endif
+#if !defined(ld_swap64)
+#define ld_swap64(s, d) \
+ (d = (*(s) >> 56) | (*(s) >> 40 & 0xff00) | \
+ (*(s) >> 24 & 0xff0000) | (*(s) >> 8 & 0xff000000) | \
+ (*(s) & 0xff000000) << 8 | (*(s) & 0xff0000) << 24 | \
+ (*(s) & 0xff00) << 40 | *(s) << 56)
+#define st_swap64(s, d) \
+ (*(d) = ((s) >> 56) | ((s) >> 40 & 0xff00) | \
+ ((s) >> 24 & 0xff0000) | ((s) >> 8 & 0xff000000) | \
+ ((s) & 0xff000000) << 8 | ((s) & 0xff0000) << 24 | \
+ ((s) & 0xff00) << 40 | (s) << 56)
+#endif
+
+#endif /* MACHINE_IS_BIG_ENDIAN */
+
+
+#if defined(MACHINE_IS_LITTLE_ENDIAN)
+/* replace swaps with simple assignments on little endian systems */
+#undef ld_swap32
+#undef st_swap32
+#define ld_swap32(s, d) (d = *(s))
+#define st_swap32(s, d) (*(d) = s)
+#undef ld_swap64
+#undef st_swap64
+#define ld_swap64(s, d) (d = *(s))
+#define st_swap64(s, d) (*(d) = s)
+#endif /* MACHINE_IS_LITTLE_ENDIAN */
+
+#endif /* _CRYPTO_EDONR_BYTEORDER_H */
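
The header above reduces to two primitives per word size: a byte-swapping load
(ld_swap32/ld_swap64) and a byte-swapping store (st_swap32/st_swap64), which
degenerate to plain assignments on little-endian machines. The following sketch
is editorial and not part of the patch; it illustrates what the 64-bit load
path does via the GCC/Clang __builtin_bswap64 fallback, and the printed value
assumes a little-endian host.

/*
 * Editorial sketch (not part of the patch): the effect of ld_swap64() on a
 * little-endian host, using the __builtin_bswap64 fallback from the header.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Load a 64-bit word and byte-swap it, as the generic fallback does. */
static uint64_t
ld_swap64_sketch(const uint64_t *src)
{
	return (__builtin_bswap64(*src));
}

int
main(void)
{
	/* The bytes 01 02 ... 08 as they sit in memory. */
	const uint8_t raw[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	uint64_t word, swapped;

	memcpy(&word, raw, sizeof (word));
	swapped = ld_swap64_sketch(&word);

	/* On a little-endian host this prints 0102030405060708. */
	printf("%016llx\n", (unsigned long long)swapped);
	return (0);
}
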
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/cbc.c b/sys/contrib/openzfs/module/icp/algs/modes/cbc.c
new file mode 100644
index 000000000000..85864f56dead
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/cbc.c
@@ -0,0 +1,273 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Algorithm independent CBC functions.
+ */
+int
+cbc_encrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*encrypt)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+
+ if (length + ctx->cbc_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->cbc_remainder + ctx->cbc_remainder_len,
+ length);
+ ctx->cbc_remainder_len += length;
+ ctx->cbc_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->cbc_iv;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->cbc_remainder_len > 0) {
+ need = block_size - ctx->cbc_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->cbc_remainder)
+ [ctx->cbc_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->cbc_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /*
+ * XOR the previous cipher block or IV with the
+ * current clear block.
+ */
+ xor_block(blockp, lastp);
+ encrypt(ctx->cbc_keysched, lastp, lastp);
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ /* copy block to where it belongs */
+ if (out_data_1_len == block_size) {
+ copy_block(lastp, out_data_1);
+ } else {
+ bcopy(lastp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(lastp + out_data_1_len,
+ out_data_2,
+ block_size - out_data_1_len);
+ }
+ }
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->cbc_remainder_len != 0) {
+ datap += need;
+ ctx->cbc_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->cbc_remainder, remainder);
+ ctx->cbc_remainder_len = remainder;
+ ctx->cbc_copy_to = datap;
+ goto out;
+ }
+ ctx->cbc_copy_to = NULL;
+
+ } while (remainder > 0);
+
+out:
+ /*
+ * Save the last encrypted block in the context.
+ */
+ if (ctx->cbc_lastp != NULL) {
+ copy_block((uint8_t *)ctx->cbc_lastp, (uint8_t *)ctx->cbc_iv);
+ ctx->cbc_lastp = (uint8_t *)ctx->cbc_iv;
+ }
+
+ return (CRYPTO_SUCCESS);
+}
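
cbc_encrypt_contiguous_blocks() above carries two pieces of state between
calls: a partial block in cbc_remainder and the previous ciphertext block in
cbc_iv, which gets XORed into the next plaintext block. Below is a minimal
editorial sketch of that chaining rule, C[i] = E(K, P[i] xor C[i-1]) with
C[-1] = IV; it is not part of the patch, and the block cipher is replaced by a
throwaway byte rotation, so only the chaining structure is meaningful.

/*
 * Editorial sketch (not part of the patch): the CBC chaining rule
 * C[i] = E(K, P[i] xor C[i-1]), with C[-1] = IV.  The "cipher" here is a
 * toy byte rotation purely so the chaining is visible; the real code is
 * handed encrypt()/xor_block()/copy_block() callbacks for the actual cipher.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	BLK	16

/* Toy stand-in for encrypt(ctx->cbc_keysched, in, out). */
static void
toy_encrypt(const uint8_t in[BLK], uint8_t out[BLK])
{
	for (int i = 0; i < BLK; i++)
		out[i] = (uint8_t)((in[i] << 1) | (in[i] >> 7));
}

static void
cbc_encrypt_sketch(const uint8_t *pt, size_t nblk, const uint8_t iv[BLK],
    uint8_t *ct)
{
	uint8_t prev[BLK], tmp[BLK];

	memcpy(prev, iv, BLK);			/* lastp starts at the IV */
	for (size_t b = 0; b < nblk; b++) {
		for (int i = 0; i < BLK; i++)	/* xor_block() */
			tmp[i] = pt[b * BLK + i] ^ prev[i];
		toy_encrypt(tmp, &ct[b * BLK]);	/* encrypt() */
		memcpy(prev, &ct[b * BLK], BLK); /* carried in cbc_iv */
	}
}

int
main(void)
{
	uint8_t iv[BLK] = { 0 };
	uint8_t pt[2 * BLK] = "thirty-two bytes of plaintext!!";
	uint8_t ct[2 * BLK];

	cbc_encrypt_sketch(pt, 2, iv, ct);
	for (size_t i = 0; i < sizeof (ct); i++)
		printf("%02x", ct[i]);
	printf("\n");
	return (0);
}
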
+
+#define OTHER(a, ctx) \
+ (((a) == (ctx)->cbc_lastblock) ? (ctx)->cbc_iv : (ctx)->cbc_lastblock)
+
+/* ARGSUSED */
+int
+cbc_decrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*decrypt)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+
+ if (length + ctx->cbc_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->cbc_remainder + ctx->cbc_remainder_len,
+ length);
+ ctx->cbc_remainder_len += length;
+ ctx->cbc_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = ctx->cbc_lastp;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->cbc_remainder_len > 0) {
+ need = block_size - ctx->cbc_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->cbc_remainder)
+ [ctx->cbc_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->cbc_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /* LINTED: pointer alignment */
+ copy_block(blockp, (uint8_t *)OTHER((uint64_t *)lastp, ctx));
+
+ decrypt(ctx->cbc_keysched, blockp,
+ (uint8_t *)ctx->cbc_remainder);
+ blockp = (uint8_t *)ctx->cbc_remainder;
+
+ /*
+ * XOR the previous cipher block or IV with the
+ * currently decrypted block.
+ */
+ xor_block(lastp, blockp);
+
+ /* LINTED: pointer alignment */
+ lastp = (uint8_t *)OTHER((uint64_t *)lastp, ctx);
+
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ bcopy(blockp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(blockp + out_data_1_len, out_data_2,
+ block_size - out_data_1_len);
+ }
+
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->cbc_remainder_len != 0) {
+ datap += need;
+ ctx->cbc_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->cbc_remainder, remainder);
+ ctx->cbc_remainder_len = remainder;
+ ctx->cbc_lastp = lastp;
+ ctx->cbc_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+ ctx->cbc_copy_to = NULL;
+
+ } while (remainder > 0);
+
+ ctx->cbc_lastp = lastp;
+ return (CRYPTO_SUCCESS);
+}
+
+int
+cbc_init_ctx(cbc_ctx_t *cbc_ctx, char *param, size_t param_len,
+ size_t block_size, void (*copy_block)(uint8_t *, uint64_t *))
+{
+ /*
+ * Copy IV into context.
+ *
+ * If cm_param == NULL then the IV comes from the
+ * cd_miscdata field in the crypto_data structure.
+ */
+ if (param != NULL) {
+ ASSERT(param_len == block_size);
+ copy_block((uchar_t *)param, cbc_ctx->cbc_iv);
+ }
+
+ cbc_ctx->cbc_lastp = (uint8_t *)&cbc_ctx->cbc_iv[0];
+ cbc_ctx->cbc_flags |= CBC_MODE;
+ return (CRYPTO_SUCCESS);
+}
+
+/* ARGSUSED */
+void *
+cbc_alloc_ctx(int kmflag)
+{
+ cbc_ctx_t *cbc_ctx;
+
+ if ((cbc_ctx = kmem_zalloc(sizeof (cbc_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ cbc_ctx->cbc_flags = CBC_MODE;
+ return (cbc_ctx);
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/ccm.c b/sys/contrib/openzfs/module/icp/algs/modes/ccm.c
new file mode 100644
index 000000000000..5d6507c49db1
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/ccm.c
@@ -0,0 +1,907 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS
+#include <sys/byteorder.h>
+#define UNALIGNED_POINTERS_PERMITTED
+#endif
+
+/*
+ * Encrypt multiple blocks of data in CCM mode. Decryption for CCM mode
+ * is done in another function.
+ */
+int
+ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+ uint64_t counter;
+ uint8_t *mac_buf;
+
+ if (length + ctx->ccm_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->ccm_remainder + ctx->ccm_remainder_len,
+ length);
+ ctx->ccm_remainder_len += length;
+ ctx->ccm_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->ccm_cb;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ mac_buf = (uint8_t *)ctx->ccm_mac_buf;
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->ccm_remainder_len > 0) {
+ need = block_size - ctx->ccm_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->ccm_remainder)
+ [ctx->ccm_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->ccm_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /*
+ * do CBC MAC
+ *
+		 * XOR the previous cipher block with the current clear block.
+		 * mac_buf always contains the previous cipher block.
+ */
+ xor_block(blockp, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+
+ /* ccm_cb is the counter block */
+ encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb,
+ (uint8_t *)ctx->ccm_tmp);
+
+ lastp = (uint8_t *)ctx->ccm_tmp;
+
+ /*
+ * Increment counter. Counter bits are confined
+ * to the bottom 64 bits of the counter block.
+ */
+#ifdef _ZFS_LITTLE_ENDIAN
+ counter = ntohll(ctx->ccm_cb[1] & ctx->ccm_counter_mask);
+ counter = htonll(counter + 1);
+#else
+ counter = ctx->ccm_cb[1] & ctx->ccm_counter_mask;
+ counter++;
+#endif /* _ZFS_LITTLE_ENDIAN */
+ counter &= ctx->ccm_counter_mask;
+ ctx->ccm_cb[1] =
+ (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter;
+
+ /*
+ * XOR encrypted counter block with the current clear block.
+ */
+ xor_block(blockp, lastp);
+
+ ctx->ccm_processed_data_len += block_size;
+
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ /* copy block to where it belongs */
+ if (out_data_1_len == block_size) {
+ copy_block(lastp, out_data_1);
+ } else {
+ bcopy(lastp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(lastp + out_data_1_len,
+ out_data_2,
+ block_size - out_data_1_len);
+ }
+ }
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->ccm_remainder_len != 0) {
+ datap += need;
+ ctx->ccm_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->ccm_remainder, remainder);
+ ctx->ccm_remainder_len = remainder;
+ ctx->ccm_copy_to = datap;
+ goto out;
+ }
+ ctx->ccm_copy_to = NULL;
+
+ } while (remainder > 0);
+
+out:
+ return (CRYPTO_SUCCESS);
+}
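
The counter handling above keeps the counter block (ccm_cb) in big-endian wire
order and restricts the increment to the bytes selected by ccm_counter_mask:
the masked field is swapped to host order, incremented, swapped back, and
re-masked before being spliced into the block. The sketch below is editorial
and not part of the patch; it assumes a little-endian host and uses
__builtin_bswap64 in place of ntohll()/htonll().

/*
 * Editorial sketch (not part of the patch): the masked, big-endian counter
 * increment used above.  cb1 stands for the second 64-bit word of the
 * counter block and mask covers only the bytes that hold the counter.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
ccm_increment_sketch(uint64_t cb1, uint64_t mask)
{
	/* Pull the counter out of the block, into host byte order. */
	uint64_t counter = __builtin_bswap64(cb1 & mask);

	/* Increment, convert back, and confine it to the counter bytes. */
	counter = __builtin_bswap64(counter + 1) & mask;

	/* Splice the new counter back into the rest of the block. */
	return ((cb1 & ~mask) | counter);
}

int
main(void)
{
	/* A 2-byte counter field (q = 2): mask covers the last two bytes. */
	uint64_t mask = __builtin_bswap64(0xffffULL);
	uint64_t cb1 = __builtin_bswap64(0xffULL);	/* counter value 255 */

	cb1 = ccm_increment_sketch(cb1, mask);
	/* Prints 0100: the counter stepped from 0x00ff to 0x0100. */
	printf("%04llx\n",
	    (unsigned long long)(__builtin_bswap64(cb1 & mask) & 0xffff));
	return (0);
}
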
+
+void
+calculate_ccm_mac(ccm_ctx_t *ctx, uint8_t *ccm_mac,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *))
+{
+ uint64_t counter;
+ uint8_t *counterp, *mac_buf;
+ int i;
+
+ mac_buf = (uint8_t *)ctx->ccm_mac_buf;
+
+ /* first counter block start with index 0 */
+ counter = 0;
+ ctx->ccm_cb[1] = (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter;
+
+ counterp = (uint8_t *)ctx->ccm_tmp;
+ encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, counterp);
+
+ /* calculate XOR of MAC with first counter block */
+ for (i = 0; i < ctx->ccm_mac_len; i++) {
+ ccm_mac[i] = mac_buf[i] ^ counterp[i];
+ }
+}
+
+/* ARGSUSED */
+int
+ccm_encrypt_final(ccm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ uint8_t *lastp, *mac_buf, *ccm_mac_p, *macp = NULL;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+ int i;
+
+ if (out->cd_length < (ctx->ccm_remainder_len + ctx->ccm_mac_len)) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ /*
+ * When we get here, the number of bytes of payload processed
+ * plus whatever data remains, if any,
+ * should be the same as the number of bytes that's being
+ * passed in the argument during init time.
+ */
+ if ((ctx->ccm_processed_data_len + ctx->ccm_remainder_len)
+ != (ctx->ccm_data_len)) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ mac_buf = (uint8_t *)ctx->ccm_mac_buf;
+
+ if (ctx->ccm_remainder_len > 0) {
+
+ /* ccm_mac_input_buf is not used for encryption */
+ macp = (uint8_t *)ctx->ccm_mac_input_buf;
+ bzero(macp, block_size);
+
+ /* copy remainder to temporary buffer */
+ bcopy(ctx->ccm_remainder, macp, ctx->ccm_remainder_len);
+
+ /* calculate the CBC MAC */
+ xor_block(macp, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+
+ /* calculate the counter mode */
+ lastp = (uint8_t *)ctx->ccm_tmp;
+ encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, lastp);
+
+ /* XOR with counter block */
+ for (i = 0; i < ctx->ccm_remainder_len; i++) {
+ macp[i] ^= lastp[i];
+ }
+ ctx->ccm_processed_data_len += ctx->ccm_remainder_len;
+ }
+
+ /* Calculate the CCM MAC */
+ ccm_mac_p = (uint8_t *)ctx->ccm_tmp;
+ calculate_ccm_mac(ctx, ccm_mac_p, encrypt_block);
+
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2,
+ ctx->ccm_remainder_len + ctx->ccm_mac_len);
+
+ if (ctx->ccm_remainder_len > 0) {
+
+ /* copy temporary block to where it belongs */
+ if (out_data_2 == NULL) {
+ /* everything will fit in out_data_1 */
+ bcopy(macp, out_data_1, ctx->ccm_remainder_len);
+ bcopy(ccm_mac_p, out_data_1 + ctx->ccm_remainder_len,
+ ctx->ccm_mac_len);
+ } else {
+
+ if (out_data_1_len < ctx->ccm_remainder_len) {
+
+ size_t data_2_len_used;
+
+ bcopy(macp, out_data_1, out_data_1_len);
+
+ data_2_len_used = ctx->ccm_remainder_len
+ - out_data_1_len;
+
+ bcopy((uint8_t *)macp + out_data_1_len,
+ out_data_2, data_2_len_used);
+ bcopy(ccm_mac_p, out_data_2 + data_2_len_used,
+ ctx->ccm_mac_len);
+ } else {
+ bcopy(macp, out_data_1, out_data_1_len);
+ if (out_data_1_len == ctx->ccm_remainder_len) {
+ /* mac will be in out_data_2 */
+ bcopy(ccm_mac_p, out_data_2,
+ ctx->ccm_mac_len);
+ } else {
+ size_t len_not_used = out_data_1_len -
+ ctx->ccm_remainder_len;
+					/*
+					 * part of the mac will be in
+					 * out_data_1, the rest of the mac
+					 * will be in out_data_2
+					 */
+ bcopy(ccm_mac_p,
+ out_data_1 + ctx->ccm_remainder_len,
+ len_not_used);
+ bcopy(ccm_mac_p + len_not_used,
+ out_data_2,
+ ctx->ccm_mac_len - len_not_used);
+
+ }
+ }
+ }
+ } else {
+ /* copy block to where it belongs */
+ bcopy(ccm_mac_p, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(ccm_mac_p + out_data_1_len, out_data_2,
+ block_size - out_data_1_len);
+ }
+ }
+ out->cd_offset += ctx->ccm_remainder_len + ctx->ccm_mac_len;
+ ctx->ccm_remainder_len = 0;
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * This will only deal with decrypting the last block of the input that
+ * might not be a multiple of block length.
+ */
+static void
+ccm_decrypt_incomplete_block(ccm_ctx_t *ctx,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *))
+{
+ uint8_t *datap, *outp, *counterp;
+ int i;
+
+ datap = (uint8_t *)ctx->ccm_remainder;
+ outp = &((ctx->ccm_pt_buf)[ctx->ccm_processed_data_len]);
+
+ counterp = (uint8_t *)ctx->ccm_tmp;
+ encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, counterp);
+
+ /* XOR with counter block */
+ for (i = 0; i < ctx->ccm_remainder_len; i++) {
+ outp[i] = datap[i] ^ counterp[i];
+ }
+}
+
+/*
+ * This will decrypt the ciphertext. However, the plaintext won't be
+ * returned to the caller. It will be returned when decrypt_final() is
+ * called, provided the MAC matches.
+ */
+/* ARGSUSED */
+int
+ccm_mode_decrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *cbp;
+ uint64_t counter;
+ size_t pt_len, total_decrypted_len, mac_len, pm_len, pd_len;
+ uint8_t *resultp;
+
+
+ pm_len = ctx->ccm_processed_mac_len;
+
+ if (pm_len > 0) {
+ uint8_t *tmp;
+ /*
+		 * all ciphertext has been processed, we are just waiting
+		 * for the rest of the MAC value
+ */
+ if ((pm_len + length) > ctx->ccm_mac_len) {
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+ }
+ tmp = (uint8_t *)ctx->ccm_mac_input_buf;
+
+ bcopy(datap, tmp + pm_len, length);
+
+ ctx->ccm_processed_mac_len += length;
+ return (CRYPTO_SUCCESS);
+ }
+
+ /*
+ * If we decrypt the given data, what total amount of data would
+ * have been decrypted?
+ */
+ pd_len = ctx->ccm_processed_data_len;
+ total_decrypted_len = pd_len + length + ctx->ccm_remainder_len;
+
+ if (total_decrypted_len >
+ (ctx->ccm_data_len + ctx->ccm_mac_len)) {
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+ }
+
+ pt_len = ctx->ccm_data_len;
+
+ if (total_decrypted_len > pt_len) {
+ /*
+		 * Part of the input is the MAC; isolate it so it can be
+		 * dealt with later.  The left-over data counted by
+		 * ccm_remainder_len from the previous call cannot be part
+		 * of the MAC; otherwise it would already have been
+		 * consumed by that call.
+ */
+ size_t pt_part = pt_len - pd_len - ctx->ccm_remainder_len;
+
+ mac_len = length - pt_part;
+
+ ctx->ccm_processed_mac_len = mac_len;
+ bcopy(data + pt_part, ctx->ccm_mac_input_buf, mac_len);
+
+ if (pt_part + ctx->ccm_remainder_len < block_size) {
+ /*
+			 * since this is the last of the ciphertext,
+			 * just decrypt it here
+ */
+ bcopy(datap, &((uint8_t *)ctx->ccm_remainder)
+ [ctx->ccm_remainder_len], pt_part);
+ ctx->ccm_remainder_len += pt_part;
+ ccm_decrypt_incomplete_block(ctx, encrypt_block);
+ ctx->ccm_processed_data_len += ctx->ccm_remainder_len;
+ ctx->ccm_remainder_len = 0;
+ return (CRYPTO_SUCCESS);
+ } else {
+ /* let rest of the code handle this */
+ length = pt_part;
+ }
+ } else if (length + ctx->ccm_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->ccm_remainder + ctx->ccm_remainder_len,
+ length);
+ ctx->ccm_remainder_len += length;
+ ctx->ccm_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->ccm_remainder_len > 0) {
+ need = block_size - ctx->ccm_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->ccm_remainder)
+ [ctx->ccm_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->ccm_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /* Calculate the counter mode, ccm_cb is the counter block */
+ cbp = (uint8_t *)ctx->ccm_tmp;
+ encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, cbp);
+
+ /*
+ * Increment counter.
+ * Counter bits are confined to the bottom 64 bits
+ */
+#ifdef _ZFS_LITTLE_ENDIAN
+ counter = ntohll(ctx->ccm_cb[1] & ctx->ccm_counter_mask);
+ counter = htonll(counter + 1);
+#else
+ counter = ctx->ccm_cb[1] & ctx->ccm_counter_mask;
+ counter++;
+#endif /* _ZFS_LITTLE_ENDIAN */
+ counter &= ctx->ccm_counter_mask;
+ ctx->ccm_cb[1] =
+ (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter;
+
+ /* XOR with the ciphertext */
+ xor_block(blockp, cbp);
+
+ /* Copy the plaintext to the "holding buffer" */
+ resultp = (uint8_t *)ctx->ccm_pt_buf +
+ ctx->ccm_processed_data_len;
+ copy_block(cbp, resultp);
+
+ ctx->ccm_processed_data_len += block_size;
+
+ ctx->ccm_lastp = blockp;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->ccm_remainder_len != 0) {
+ datap += need;
+ ctx->ccm_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->ccm_remainder, remainder);
+ ctx->ccm_remainder_len = remainder;
+ ctx->ccm_copy_to = datap;
+ if (ctx->ccm_processed_mac_len > 0) {
+ /*
+				 * not expecting any more ciphertext, just
+				 * compute plaintext for the remaining input
+ */
+ ccm_decrypt_incomplete_block(ctx,
+ encrypt_block);
+ ctx->ccm_processed_data_len += remainder;
+ ctx->ccm_remainder_len = 0;
+ }
+ goto out;
+ }
+ ctx->ccm_copy_to = NULL;
+
+ } while (remainder > 0);
+
+out:
+ return (CRYPTO_SUCCESS);
+}
+
+int
+ccm_decrypt_final(ccm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t mac_remain, pt_len;
+ uint8_t *pt, *mac_buf, *macp, *ccm_mac_p;
+ int rv;
+
+ pt_len = ctx->ccm_data_len;
+
+ /* Make sure output buffer can fit all of the plaintext */
+ if (out->cd_length < pt_len) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ pt = ctx->ccm_pt_buf;
+ mac_remain = ctx->ccm_processed_data_len;
+ mac_buf = (uint8_t *)ctx->ccm_mac_buf;
+
+ macp = (uint8_t *)ctx->ccm_tmp;
+
+ while (mac_remain > 0) {
+
+ if (mac_remain < block_size) {
+ bzero(macp, block_size);
+ bcopy(pt, macp, mac_remain);
+ mac_remain = 0;
+ } else {
+ copy_block(pt, macp);
+ mac_remain -= block_size;
+ pt += block_size;
+ }
+
+ /* calculate the CBC MAC */
+ xor_block(macp, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+ }
+
+ /* Calculate the CCM MAC */
+ ccm_mac_p = (uint8_t *)ctx->ccm_tmp;
+ calculate_ccm_mac((ccm_ctx_t *)ctx, ccm_mac_p, encrypt_block);
+
+ /* compare the input CCM MAC value with what we calculated */
+ if (bcmp(ctx->ccm_mac_input_buf, ccm_mac_p, ctx->ccm_mac_len)) {
+ /* They don't match */
+ return (CRYPTO_INVALID_MAC);
+ } else {
+ rv = crypto_put_output_data(ctx->ccm_pt_buf, out, pt_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ out->cd_offset += pt_len;
+ }
+ return (CRYPTO_SUCCESS);
+}
+
+static int
+ccm_validate_args(CK_AES_CCM_PARAMS *ccm_param, boolean_t is_encrypt_init)
+{
+ size_t macSize, nonceSize;
+ uint8_t q;
+ uint64_t maxValue;
+
+ /*
+ * Check the length of the MAC. The only valid
+ * lengths for the MAC are: 4, 6, 8, 10, 12, 14, 16
+ */
+ macSize = ccm_param->ulMACSize;
+ if ((macSize < 4) || (macSize > 16) || ((macSize % 2) != 0)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+ /* Check the nonce length. Valid values are 7, 8, 9, 10, 11, 12, 13 */
+ nonceSize = ccm_param->ulNonceSize;
+ if ((nonceSize < 7) || (nonceSize > 13)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+ /* q is the length of the field storing the length, in bytes */
+ q = (uint8_t)((15 - nonceSize) & 0xFF);
+
+
+ /*
+	 * For decryption, make sure the size of the ciphertext is at least
+	 * as large as the MAC length
+ */
+ if ((!is_encrypt_init) && (ccm_param->ulDataSize < macSize)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+ /*
+ * Check to make sure the length of the payload is within the
+ * range of values allowed by q
+ */
+ if (q < 8) {
+ maxValue = (1ULL << (q * 8)) - 1;
+ } else {
+ maxValue = ULONG_MAX;
+ }
+
+ if (ccm_param->ulDataSize > maxValue) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ return (CRYPTO_SUCCESS);
+}
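
ccm_validate_args() couples the nonce size to q, the width of the
payload-length field: together they must fill the 15 non-flag bytes of the
first block, so a longer nonce leaves less room for the length and caps the
payload at 2^(8q) - 1 bytes. The editorial sketch below (not part of the
patch) just tabulates that trade-off for the valid nonce sizes; it uses
UINT64_MAX where the code above uses ULONG_MAX, which is the same value on
LP64.

/*
 * Editorial sketch (not part of the patch): with a 16-byte block, nonce and
 * length field share 15 bytes, so q = 15 - nonce_len and the payload must
 * fit in q bytes.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	for (unsigned nonce_len = 7; nonce_len <= 13; nonce_len++) {
		unsigned q = 15 - nonce_len;
		uint64_t max;

		if (q < 8)
			max = (1ULL << (q * 8)) - 1;
		else
			max = UINT64_MAX;	/* ULONG_MAX on LP64 */

		printf("nonce %2u bytes -> q = %u, max payload %llu bytes\n",
		    nonce_len, q, (unsigned long long)max);
	}
	return (0);
}
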
+
+/*
+ * Format the first block used in CBC-MAC (B0) and the initial counter
+ * block based on formatting functions and counter generation functions
+ * specified in RFC 3610 and NIST publication 800-38C, appendix A
+ *
+ * b0 is the first block used in CBC-MAC
+ * cb0 is the first counter block
+ *
+ * It's assumed that the arguments b0 and cb0 are preallocated AES blocks
+ *
+ */
+static void
+ccm_format_initial_blocks(uchar_t *nonce, ulong_t nonceSize,
+ ulong_t authDataSize, uint8_t *b0, ccm_ctx_t *aes_ctx)
+{
+ uint64_t payloadSize;
+ uint8_t t, q, have_adata = 0;
+ size_t limit;
+ int i, j, k;
+ uint64_t mask = 0;
+ uint8_t *cb;
+
+ q = (uint8_t)((15 - nonceSize) & 0xFF);
+ t = (uint8_t)((aes_ctx->ccm_mac_len) & 0xFF);
+
+ /* Construct the first octet of b0 */
+ if (authDataSize > 0) {
+ have_adata = 1;
+ }
+ b0[0] = (have_adata << 6) | (((t - 2) / 2) << 3) | (q - 1);
+
+ /* copy the nonce value into b0 */
+ bcopy(nonce, &(b0[1]), nonceSize);
+
+ /* store the length of the payload into b0 */
+ bzero(&(b0[1+nonceSize]), q);
+
+ payloadSize = aes_ctx->ccm_data_len;
+ limit = 8 < q ? 8 : q;
+
+ for (i = 0, j = 0, k = 15; i < limit; i++, j += 8, k--) {
+ b0[k] = (uint8_t)((payloadSize >> j) & 0xFF);
+ }
+
+ /* format the counter block */
+
+ cb = (uint8_t *)aes_ctx->ccm_cb;
+
+ cb[0] = 0x07 & (q-1); /* first byte */
+
+ /* copy the nonce value into the counter block */
+ bcopy(nonce, &(cb[1]), nonceSize);
+
+ bzero(&(cb[1+nonceSize]), q);
+
+ /* Create the mask for the counter field based on the size of nonce */
+ q <<= 3;
+ while (q-- > 0) {
+ mask |= (1ULL << q);
+ }
+
+#ifdef _ZFS_LITTLE_ENDIAN
+ mask = htonll(mask);
+#endif
+ aes_ctx->ccm_counter_mask = mask;
+
+ /*
+	 * During the calculation we start with counter block 1, so set
+	 * it up right here.
+	 * Setting just the last byte to 1 is enough, because even with
+	 * the biggest nonce of 13 bytes the last byte of the counter
+	 * block is still part of the counter value.
+ */
+ cb[15] = 0x01;
+}
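
The first octet of b0 packs three fields, exactly as the expression above
shows: bit 6 flags the presence of associated data, bits 3-5 encode the MAC
length as (t - 2) / 2, and bits 0-2 hold q - 1, while the counter block's
first octet carries only q - 1. A small editorial sketch (not part of the
patch):

/*
 * Editorial sketch (not part of the patch): the flags octets built above,
 * following NIST SP 800-38C appendix A.
 */
#include <stdint.h>
#include <stdio.h>

static uint8_t
ccm_b0_flags(int have_adata, unsigned t, unsigned q)
{
	return ((uint8_t)((have_adata ? 1 : 0) << 6 |
	    ((t - 2) / 2) << 3 | (q - 1)));
}

int
main(void)
{
	/* 13-byte nonce (q = 2), 16-byte MAC, with associated data. */
	unsigned q = 15 - 13, t = 16;

	/*
	 * Prints "b0 flags = 79, counter flags = 01":
	 * 0x40 (Adata) | 0x38 ((16 - 2) / 2 << 3) | 0x01 (q - 1).
	 */
	printf("b0 flags = %02x, counter flags = %02x\n",
	    (unsigned)ccm_b0_flags(1, t, q), 0x07 & (q - 1));
	return (0);
}
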
+
+/*
+ * Encode the length of the associated data as
+ * specified in RFC 3610 and NIST publication 800-38C, appendix A
+ */
+static void
+encode_adata_len(ulong_t auth_data_len, uint8_t *encoded, size_t *encoded_len)
+{
+#ifdef UNALIGNED_POINTERS_PERMITTED
+ uint32_t *lencoded_ptr;
+#ifdef _LP64
+ uint64_t *llencoded_ptr;
+#endif
+#endif /* UNALIGNED_POINTERS_PERMITTED */
+
+ if (auth_data_len < ((1ULL<<16) - (1ULL<<8))) {
+ /* 0 < a < (2^16-2^8) */
+ *encoded_len = 2;
+ encoded[0] = (auth_data_len & 0xff00) >> 8;
+ encoded[1] = auth_data_len & 0xff;
+
+ } else if ((auth_data_len >= ((1ULL<<16) - (1ULL<<8))) &&
+ (auth_data_len < (1ULL << 31))) {
+ /* (2^16-2^8) <= a < 2^32 */
+ *encoded_len = 6;
+ encoded[0] = 0xff;
+ encoded[1] = 0xfe;
+#ifdef UNALIGNED_POINTERS_PERMITTED
+ lencoded_ptr = (uint32_t *)&encoded[2];
+ *lencoded_ptr = htonl(auth_data_len);
+#else
+ encoded[2] = (auth_data_len & 0xff000000) >> 24;
+ encoded[3] = (auth_data_len & 0xff0000) >> 16;
+ encoded[4] = (auth_data_len & 0xff00) >> 8;
+ encoded[5] = auth_data_len & 0xff;
+#endif /* UNALIGNED_POINTERS_PERMITTED */
+
+#ifdef _LP64
+ } else {
+ /* 2^32 <= a < 2^64 */
+ *encoded_len = 10;
+ encoded[0] = 0xff;
+ encoded[1] = 0xff;
+#ifdef UNALIGNED_POINTERS_PERMITTED
+ llencoded_ptr = (uint64_t *)&encoded[2];
+ *llencoded_ptr = htonl(auth_data_len);
+#else
+ encoded[2] = (auth_data_len & 0xff00000000000000) >> 56;
+ encoded[3] = (auth_data_len & 0xff000000000000) >> 48;
+ encoded[4] = (auth_data_len & 0xff0000000000) >> 40;
+ encoded[5] = (auth_data_len & 0xff00000000) >> 32;
+ encoded[6] = (auth_data_len & 0xff000000) >> 24;
+ encoded[7] = (auth_data_len & 0xff0000) >> 16;
+ encoded[8] = (auth_data_len & 0xff00) >> 8;
+ encoded[9] = auth_data_len & 0xff;
+#endif /* UNALIGNED_POINTERS_PERMITTED */
+#endif /* _LP64 */
+ }
+}
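
The three branches above implement the associated-data length encoding of
RFC 3610 section 2.2: lengths below 2^16 - 2^8 take two bytes, larger 32-bit
lengths are prefixed with 0xff 0xfe, and (on LP64) 64-bit lengths are prefixed
with 0xff 0xff. The editorial sketch below (not part of the patch) writes the
same encoding out byte by byte, so it needs no unaligned stores; note that it
uses the RFC bound of 2^32 for the middle branch, whereas the function above
gates that branch at (1ULL << 31) even though its comment cites 2^32.

/*
 * Editorial sketch (not part of the patch): RFC 3610 associated-data length
 * encoding, emitted byte by byte.  Returns the number of bytes produced.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static size_t
encode_adata_len_sketch(uint64_t a, uint8_t out[10])
{
	if (a < 0xff00ULL) {			/* 0 < a < 2^16 - 2^8 */
		out[0] = (uint8_t)(a >> 8);
		out[1] = (uint8_t)a;
		return (2);
	}
	if (a < (1ULL << 32)) {			/* up to 2^32 - 1 */
		out[0] = 0xff;
		out[1] = 0xfe;
		for (int i = 0; i < 4; i++)
			out[2 + i] = (uint8_t)(a >> (8 * (3 - i)));
		return (6);
	}
	out[0] = 0xff;				/* 2^32 and above */
	out[1] = 0xff;
	for (int i = 0; i < 8; i++)
		out[2 + i] = (uint8_t)(a >> (8 * (7 - i)));
	return (10);
}

int
main(void)
{
	uint8_t buf[10];
	size_t n = encode_adata_len_sketch(70000, buf);

	/* 70000 = 0x11170: prints "ff fe 00 01 11 70". */
	for (size_t i = 0; i < n; i++)
		printf("%02x ", buf[i]);
	printf("\n");
	return (0);
}
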
+
+static int
+ccm_init(ccm_ctx_t *ctx, unsigned char *nonce, size_t nonce_len,
+ unsigned char *auth_data, size_t auth_data_len, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ uint8_t *mac_buf, *datap, *ivp, *authp;
+ size_t remainder, processed;
+ uint8_t encoded_a[10]; /* max encoded auth data length is 10 octets */
+ size_t encoded_a_len = 0;
+
+ mac_buf = (uint8_t *)&(ctx->ccm_mac_buf);
+
+ /*
+ * Format the 1st block for CBC-MAC and construct the
+ * 1st counter block.
+ *
+ * aes_ctx->ccm_iv is used for storing the counter block
+ * mac_buf will store b0 at this time.
+ */
+ ccm_format_initial_blocks(nonce, nonce_len,
+ auth_data_len, mac_buf, ctx);
+
+ /* The IV for CBC MAC for AES CCM mode is always zero */
+ ivp = (uint8_t *)ctx->ccm_tmp;
+ bzero(ivp, block_size);
+
+ xor_block(ivp, mac_buf);
+
+	/* CBC-MAC step over b0 (which embeds the nonce) */
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+
+ /* take care of the associated data, if any */
+ if (auth_data_len == 0) {
+ return (CRYPTO_SUCCESS);
+ }
+
+ encode_adata_len(auth_data_len, encoded_a, &encoded_a_len);
+
+ remainder = auth_data_len;
+
+ /* 1st block: it contains encoded associated data, and some data */
+ authp = (uint8_t *)ctx->ccm_tmp;
+ bzero(authp, block_size);
+ bcopy(encoded_a, authp, encoded_a_len);
+ processed = block_size - encoded_a_len;
+ if (processed > auth_data_len) {
+ /* in case auth_data is very small */
+ processed = auth_data_len;
+ }
+ bcopy(auth_data, authp+encoded_a_len, processed);
+ /* xor with previous buffer */
+ xor_block(authp, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+ remainder -= processed;
+ if (remainder == 0) {
+ /* a small amount of associated data, it's all done now */
+ return (CRYPTO_SUCCESS);
+ }
+
+ do {
+ if (remainder < block_size) {
+ /*
+			 * There isn't a full block of data, pad the rest
+			 * of the buffer with zeros
+ */
+ bzero(authp, block_size);
+ bcopy(&(auth_data[processed]), authp, remainder);
+ datap = (uint8_t *)authp;
+ remainder = 0;
+ } else {
+ datap = (uint8_t *)(&(auth_data[processed]));
+ processed += block_size;
+ remainder -= block_size;
+ }
+
+ xor_block(datap, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+
+ } while (remainder > 0);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * The following function should be called at encrypt or decrypt init time
+ * for AES CCM mode.
+ */
+int
+ccm_init_ctx(ccm_ctx_t *ccm_ctx, char *param, int kmflag,
+ boolean_t is_encrypt_init, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ int rv;
+ CK_AES_CCM_PARAMS *ccm_param;
+
+ if (param != NULL) {
+ ccm_param = (CK_AES_CCM_PARAMS *)param;
+
+ if ((rv = ccm_validate_args(ccm_param,
+ is_encrypt_init)) != 0) {
+ return (rv);
+ }
+
+ ccm_ctx->ccm_mac_len = ccm_param->ulMACSize;
+ if (is_encrypt_init) {
+ ccm_ctx->ccm_data_len = ccm_param->ulDataSize;
+ } else {
+ ccm_ctx->ccm_data_len =
+ ccm_param->ulDataSize - ccm_ctx->ccm_mac_len;
+ ccm_ctx->ccm_processed_mac_len = 0;
+ }
+ ccm_ctx->ccm_processed_data_len = 0;
+
+ ccm_ctx->ccm_flags |= CCM_MODE;
+ } else {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+ if (ccm_init(ccm_ctx, ccm_param->nonce, ccm_param->ulNonceSize,
+ ccm_param->authData, ccm_param->ulAuthDataSize, block_size,
+ encrypt_block, xor_block) != 0) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ if (!is_encrypt_init) {
+ /* allocate buffer for storing decrypted plaintext */
+ ccm_ctx->ccm_pt_buf = vmem_alloc(ccm_ctx->ccm_data_len,
+ kmflag);
+ if (ccm_ctx->ccm_pt_buf == NULL) {
+ rv = CRYPTO_HOST_MEMORY;
+ }
+ }
+ return (rv);
+}
+
+void *
+ccm_alloc_ctx(int kmflag)
+{
+ ccm_ctx_t *ccm_ctx;
+
+ if ((ccm_ctx = kmem_zalloc(sizeof (ccm_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ ccm_ctx->ccm_flags = CCM_MODE;
+ return (ccm_ctx);
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/ctr.c b/sys/contrib/openzfs/module/icp/algs/modes/ctr.c
new file mode 100644
index 000000000000..0188bdd395ff
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/ctr.c
@@ -0,0 +1,228 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/byteorder.h>
+
+/*
+ * Encrypt and decrypt multiple blocks of data in counter mode.
+ */
+int
+ctr_mode_contiguous_blocks(ctr_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+ uint64_t lower_counter, upper_counter;
+
+ if (length + ctx->ctr_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->ctr_remainder + ctx->ctr_remainder_len,
+ length);
+ ctx->ctr_remainder_len += length;
+ ctx->ctr_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->ctr_cb;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->ctr_remainder_len > 0) {
+ need = block_size - ctx->ctr_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->ctr_remainder)
+ [ctx->ctr_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->ctr_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /* ctr_cb is the counter block */
+ cipher(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb,
+ (uint8_t *)ctx->ctr_tmp);
+
+ lastp = (uint8_t *)ctx->ctr_tmp;
+
+ /*
+ * Increment Counter.
+ */
+ lower_counter = ntohll(ctx->ctr_cb[1] & ctx->ctr_lower_mask);
+ lower_counter = htonll(lower_counter + 1);
+ lower_counter &= ctx->ctr_lower_mask;
+ ctx->ctr_cb[1] = (ctx->ctr_cb[1] & ~(ctx->ctr_lower_mask)) |
+ lower_counter;
+
+ /* wrap around */
+ if (lower_counter == 0) {
+ upper_counter =
+ ntohll(ctx->ctr_cb[0] & ctx->ctr_upper_mask);
+ upper_counter = htonll(upper_counter + 1);
+ upper_counter &= ctx->ctr_upper_mask;
+ ctx->ctr_cb[0] =
+ (ctx->ctr_cb[0] & ~(ctx->ctr_upper_mask)) |
+ upper_counter;
+ }
+
+ /*
+ * XOR encrypted counter block with the current clear block.
+ */
+ xor_block(blockp, lastp);
+
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ /* copy block to where it belongs */
+ bcopy(lastp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(lastp + out_data_1_len, out_data_2,
+ block_size - out_data_1_len);
+ }
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->ctr_remainder_len != 0) {
+ datap += need;
+ ctx->ctr_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->ctr_remainder, remainder);
+ ctx->ctr_remainder_len = remainder;
+ ctx->ctr_copy_to = datap;
+ goto out;
+ }
+ ctx->ctr_copy_to = NULL;
+
+ } while (remainder > 0);
+
+out:
+ return (CRYPTO_SUCCESS);
+}
+
+int
+ctr_mode_final(ctr_ctx_t *ctx, crypto_data_t *out,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *))
+{
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+ uint8_t *p;
+ int i;
+
+ if (out->cd_length < ctx->ctr_remainder_len)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ encrypt_block(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb,
+ (uint8_t *)ctx->ctr_tmp);
+
+ lastp = (uint8_t *)ctx->ctr_tmp;
+ p = (uint8_t *)ctx->ctr_remainder;
+ for (i = 0; i < ctx->ctr_remainder_len; i++) {
+ p[i] ^= lastp[i];
+ }
+
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, ctx->ctr_remainder_len);
+
+ bcopy(p, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy((uint8_t *)p + out_data_1_len,
+ out_data_2, ctx->ctr_remainder_len - out_data_1_len);
+ }
+ out->cd_offset += ctx->ctr_remainder_len;
+ ctx->ctr_remainder_len = 0;
+ return (CRYPTO_SUCCESS);
+}
+
+int
+ctr_init_ctx(ctr_ctx_t *ctr_ctx, ulong_t count, uint8_t *cb,
+ void (*copy_block)(uint8_t *, uint8_t *))
+{
+ uint64_t upper_mask = 0;
+ uint64_t lower_mask = 0;
+
+ if (count == 0 || count > 128) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ /* upper 64 bits of the mask */
+ if (count >= 64) {
+ count -= 64;
+ upper_mask = (count == 64) ? UINT64_MAX : (1ULL << count) - 1;
+ lower_mask = UINT64_MAX;
+ } else {
+ /* now the lower 63 bits */
+ lower_mask = (1ULL << count) - 1;
+ }
+ ctr_ctx->ctr_lower_mask = htonll(lower_mask);
+ ctr_ctx->ctr_upper_mask = htonll(upper_mask);
+
+ copy_block(cb, (uchar_t *)ctr_ctx->ctr_cb);
+ ctr_ctx->ctr_lastp = (uint8_t *)&ctr_ctx->ctr_cb[0];
+ ctr_ctx->ctr_flags |= CTR_MODE;
+ return (CRYPTO_SUCCESS);
+}
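
ctr_init_ctx() splits the requested number of counter bits across the two
64-bit halves of the counter block and stores the resulting masks
byte-swapped; ctr_mode_contiguous_blocks() then increments the low half under
ctr_lower_mask and, when it wraps to zero, carries into the high half under
ctr_upper_mask. The editorial sketch below (not part of the patch) shows the
mask construction in host byte order.

/*
 * Editorial sketch (not part of the patch): how ctr_init_ctx() derives the
 * two 64-bit masks from the number of counter bits; the real code stores
 * them byte-swapped with htonll().
 */
#include <stdint.h>
#include <stdio.h>

static void
ctr_masks_sketch(unsigned count, uint64_t *upper, uint64_t *lower)
{
	*upper = 0;
	*lower = 0;
	if (count >= 64) {
		unsigned upper_bits = count - 64;

		/* Counter spills into the upper word of the block. */
		*upper = (upper_bits == 64) ?
		    UINT64_MAX : (1ULL << upper_bits) - 1;
		*lower = UINT64_MAX;
	} else {
		/* Counter fits entirely in the lower word. */
		*lower = (1ULL << count) - 1;
	}
}

int
main(void)
{
	unsigned counts[] = { 32, 64, 96, 128 };

	for (unsigned i = 0; i < 4; i++) {
		uint64_t up, lo;

		ctr_masks_sketch(counts[i], &up, &lo);
		printf("count %3u: upper %016llx lower %016llx\n",
		    counts[i], (unsigned long long)up, (unsigned long long)lo);
	}
	return (0);
}
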
+
+/* ARGSUSED */
+void *
+ctr_alloc_ctx(int kmflag)
+{
+ ctr_ctx_t *ctr_ctx;
+
+ if ((ctr_ctx = kmem_zalloc(sizeof (ctr_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ ctr_ctx->ctr_flags = CTR_MODE;
+ return (ctr_ctx);
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/ecb.c b/sys/contrib/openzfs/module/icp/algs/modes/ecb.c
new file mode 100644
index 000000000000..025f5825cf04
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/ecb.c
@@ -0,0 +1,128 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Algorithm independent ECB functions.
+ */
+int
+ecb_cipher_contiguous_blocks(ecb_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+
+ if (length + ctx->ecb_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->ecb_remainder + ctx->ecb_remainder_len,
+ length);
+ ctx->ecb_remainder_len += length;
+ ctx->ecb_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->ecb_iv;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->ecb_remainder_len > 0) {
+ need = block_size - ctx->ecb_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->ecb_remainder)
+ [ctx->ecb_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->ecb_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ cipher(ctx->ecb_keysched, blockp, lastp);
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ /* copy block to where it belongs */
+ bcopy(lastp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(lastp + out_data_1_len, out_data_2,
+ block_size - out_data_1_len);
+ }
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->ecb_remainder_len != 0) {
+ datap += need;
+ ctx->ecb_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->ecb_remainder, remainder);
+ ctx->ecb_remainder_len = remainder;
+ ctx->ecb_copy_to = datap;
+ goto out;
+ }
+ ctx->ecb_copy_to = NULL;
+
+ } while (remainder > 0);
+
+out:
+ return (CRYPTO_SUCCESS);
+}
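
ECB is the degenerate case of the loop structure shared by these files: there
is no chaining at all, so each block is handed straight to the cipher and the
only state kept across calls is the partial-block remainder. The editorial
sketch below (not part of the patch) uses a toy stand-in cipher to show the
resulting property that identical plaintext blocks produce identical
ciphertext blocks.

/*
 * Editorial sketch (not part of the patch): ECB encrypts every block
 * independently.  The cipher is a toy placeholder, not a real cipher.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	BLK	16

static void
toy_cipher(const uint8_t in[BLK], uint8_t out[BLK])
{
	for (int i = 0; i < BLK; i++)
		out[i] = in[i] ^ 0xa5;	/* placeholder transformation */
}

static void
ecb_sketch(const uint8_t *pt, size_t nblk, uint8_t *ct)
{
	for (size_t b = 0; b < nblk; b++)
		toy_cipher(&pt[b * BLK], &ct[b * BLK]);
}

int
main(void)
{
	uint8_t pt[2 * BLK], ct[2 * BLK];

	memset(pt, 0x11, sizeof (pt));
	ecb_sketch(pt, 2, ct);
	/* Identical plaintext blocks give identical ciphertext blocks. */
	printf("blocks equal: %d\n", memcmp(ct, ct + BLK, BLK) == 0);
	return (0);
}
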
+
+/* ARGSUSED */
+void *
+ecb_alloc_ctx(int kmflag)
+{
+ ecb_ctx_t *ecb_ctx;
+
+ if ((ecb_ctx = kmem_zalloc(sizeof (ecb_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ ecb_ctx->ecb_flags = ECB_MODE;
+ return (ecb_ctx);
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
new file mode 100644
index 000000000000..23686c59e8ce
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
@@ -0,0 +1,1587 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/icp.h>
+#include <sys/crypto/impl.h>
+#include <sys/byteorder.h>
+#include <sys/simd.h>
+#include <modes/gcm_impl.h>
+#ifdef CAN_USE_GCM_ASM
+#include <aes/aes_impl.h>
+#endif
+
+#define GHASH(c, d, t, o) \
+ xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
+ (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
+ (uint64_t *)(void *)(t));
+
+/* Select GCM implementation */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX-1)
+#ifdef CAN_USE_GCM_ASM
+#define IMPL_AVX (UINT32_MAX-2)
+#endif
+#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
+static uint32_t icp_gcm_impl = IMPL_FASTEST;
+static uint32_t user_sel_impl = IMPL_FASTEST;
+
+#ifdef CAN_USE_GCM_ASM
+/* Does the architecture we run on support the MOVBE instruction? */
+boolean_t gcm_avx_can_use_movbe = B_FALSE;
+/*
+ * Whether to use the optimized openssl gcm and ghash implementations.
+ * Set to true if module parameter icp_gcm_impl == "avx".
+ */
+static boolean_t gcm_use_avx = B_FALSE;
+#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
+
+extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+
+static inline boolean_t gcm_avx_will_work(void);
+static inline void gcm_set_avx(boolean_t);
+static inline boolean_t gcm_toggle_avx(void);
+static inline size_t gcm_simd_get_htab_size(boolean_t);
+
+static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t);
+
+static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
+static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
+static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *,
+ size_t, size_t);
+#endif /* ifdef CAN_USE_GCM_ASM */
+
+/*
+ * Encrypt multiple blocks of data in GCM mode. Decryption for GCM mode
+ * is done in another function.
+ */
+int
+gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+#ifdef CAN_USE_GCM_ASM
+ if (ctx->gcm_use_avx == B_TRUE)
+ return (gcm_mode_encrypt_contiguous_blocks_avx(
+ ctx, data, length, out, block_size));
+#endif
+
+ const gcm_impl_ops_t *gops;
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+ uint64_t counter;
+ uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+
+ if (length + ctx->gcm_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
+ length);
+ ctx->gcm_remainder_len += length;
+ if (ctx->gcm_copy_to == NULL) {
+ ctx->gcm_copy_to = datap;
+ }
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->gcm_cb;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ gops = gcm_impl_get_ops();
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->gcm_remainder_len > 0) {
+ need = block_size - ctx->gcm_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->gcm_remainder)
+ [ctx->gcm_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->gcm_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /*
+ * Increment counter. Counter bits are confined
+ * to the bottom 32 bits of the counter block.
+ */
+ counter = ntohll(ctx->gcm_cb[1] & counter_mask);
+ counter = htonll(counter + 1);
+ counter &= counter_mask;
+ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
+
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
+ (uint8_t *)ctx->gcm_tmp);
+ xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
+
+ lastp = (uint8_t *)ctx->gcm_tmp;
+
+ ctx->gcm_processed_data_len += block_size;
+
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ /* copy block to where it belongs */
+ if (out_data_1_len == block_size) {
+ copy_block(lastp, out_data_1);
+ } else {
+ bcopy(lastp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(lastp + out_data_1_len,
+ out_data_2,
+ block_size - out_data_1_len);
+ }
+ }
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* add ciphertext to the hash */
+ GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->gcm_remainder_len != 0) {
+ datap += need;
+ ctx->gcm_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->gcm_remainder, remainder);
+ ctx->gcm_remainder_len = remainder;
+ ctx->gcm_copy_to = datap;
+ goto out;
+ }
+ ctx->gcm_copy_to = NULL;
+
+ } while (remainder > 0);
+out:
+ return (CRYPTO_SUCCESS);
+}
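
Unlike CCM's variable-width counter, GCM's counter is always a 32-bit field:
counter_mask selects only the last four bytes of the counter block, so the
increment wraps modulo 2^32 without touching the rest of the block (the inc32
function of NIST SP 800-38D). The editorial sketch below (not part of the
patch) assumes a little-endian host and uses __builtin_bswap64 in place of
ntohll()/htonll().

/*
 * Editorial sketch (not part of the patch): the inc32() step the loop above
 * performs on gcm_cb[1].  Only the low 32 bits of the big-endian counter
 * block are incremented; the rest of the block is left untouched.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
gcm_inc32_sketch(uint64_t cb1)
{
	const uint64_t mask = __builtin_bswap64(0x00000000ffffffffULL);
	uint64_t counter = __builtin_bswap64(cb1 & mask);

	counter = __builtin_bswap64(counter + 1) & mask;
	return ((cb1 & ~mask) | counter);
}

int
main(void)
{
	/* Big-endian counter value 0xffffffff in the low 32 bits. */
	uint64_t cb1 = __builtin_bswap64(0x00000000ffffffffULL);

	cb1 = gcm_inc32_sketch(cb1);
	/* Prints 0000000000000000: the 32-bit counter wrapped to zero. */
	printf("%016llx\n", (unsigned long long)__builtin_bswap64(cb1));
	return (0);
}
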
+
+/* ARGSUSED */
+int
+gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+#ifdef CAN_USE_GCM_ASM
+ if (ctx->gcm_use_avx == B_TRUE)
+ return (gcm_encrypt_final_avx(ctx, out, block_size));
+#endif
+
+ const gcm_impl_ops_t *gops;
+ uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+ uint8_t *ghash, *macp = NULL;
+ int i, rv;
+
+ if (out->cd_length <
+ (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ gops = gcm_impl_get_ops();
+ ghash = (uint8_t *)ctx->gcm_ghash;
+
+ if (ctx->gcm_remainder_len > 0) {
+ uint64_t counter;
+ uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
+
+ /*
+ * Here is where we deal with data that is not a
+ * multiple of the block size.
+ */
+
+ /*
+ * Increment counter.
+ */
+ counter = ntohll(ctx->gcm_cb[1] & counter_mask);
+ counter = htonll(counter + 1);
+ counter &= counter_mask;
+ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
+
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
+ (uint8_t *)ctx->gcm_tmp);
+
+ macp = (uint8_t *)ctx->gcm_remainder;
+ bzero(macp + ctx->gcm_remainder_len,
+ block_size - ctx->gcm_remainder_len);
+
+ /* XOR with counter block */
+ for (i = 0; i < ctx->gcm_remainder_len; i++) {
+ macp[i] ^= tmpp[i];
+ }
+
+ /* add ciphertext to the hash */
+ GHASH(ctx, macp, ghash, gops);
+
+ ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
+ }
+
+ ctx->gcm_len_a_len_c[1] =
+ htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
+ GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
+ (uint8_t *)ctx->gcm_J0);
+ xor_block((uint8_t *)ctx->gcm_J0, ghash);
+
+ if (ctx->gcm_remainder_len > 0) {
+ rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+ out->cd_offset += ctx->gcm_remainder_len;
+ ctx->gcm_remainder_len = 0;
+ rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ out->cd_offset += ctx->gcm_tag_len;
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * This will only deal with decrypting the last block of the input that
+ * might not be a multiple of block length.
+ */
+static void
+gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ uint8_t *datap, *outp, *counterp;
+ uint64_t counter;
+ uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+ int i;
+
+ /*
+ * Increment counter.
+ * Counter bits are confined to the bottom 32 bits
+ */
+ counter = ntohll(ctx->gcm_cb[1] & counter_mask);
+ counter = htonll(counter + 1);
+ counter &= counter_mask;
+ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
+
+ datap = (uint8_t *)ctx->gcm_remainder;
+ outp = &((ctx->gcm_pt_buf)[index]);
+ counterp = (uint8_t *)ctx->gcm_tmp;
+
+ /* authentication tag */
+ bzero((uint8_t *)ctx->gcm_tmp, block_size);
+ bcopy(datap, (uint8_t *)ctx->gcm_tmp, ctx->gcm_remainder_len);
+
+ /* add ciphertext to the hash */
+ GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
+
+ /* decrypt remaining ciphertext */
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
+
+ /* XOR with counter block */
+ for (i = 0; i < ctx->gcm_remainder_len; i++) {
+ outp[i] = datap[i] ^ counterp[i];
+ }
+}
+
+/* ARGSUSED */
+int
+gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t new_len;
+ uint8_t *new;
+
+ /*
+ * Copy contiguous ciphertext input blocks to plaintext buffer.
+	 * Ciphertext will be decrypted in gcm_decrypt_final().
+ */
+ if (length > 0) {
+ new_len = ctx->gcm_pt_buf_len + length;
+ new = vmem_alloc(new_len, ctx->gcm_kmflag);
+ if (new == NULL) {
+ vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
+ ctx->gcm_pt_buf = NULL;
+ return (CRYPTO_HOST_MEMORY);
+ }
+ bcopy(ctx->gcm_pt_buf, new, ctx->gcm_pt_buf_len);
+ vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
+ ctx->gcm_pt_buf = new;
+ ctx->gcm_pt_buf_len = new_len;
+ bcopy(data, &ctx->gcm_pt_buf[ctx->gcm_processed_data_len],
+ length);
+ ctx->gcm_processed_data_len += length;
+ }
+
+ ctx->gcm_remainder_len = 0;
+ return (CRYPTO_SUCCESS);
+}
+
+int
+gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+#ifdef CAN_USE_GCM_ASM
+ if (ctx->gcm_use_avx == B_TRUE)
+ return (gcm_decrypt_final_avx(ctx, out, block_size));
+#endif
+
+ const gcm_impl_ops_t *gops;
+ size_t pt_len;
+ size_t remainder;
+ uint8_t *ghash;
+ uint8_t *blockp;
+ uint8_t *cbp;
+ uint64_t counter;
+ uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+ int processed = 0, rv;
+
+ ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
+
+ gops = gcm_impl_get_ops();
+ pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
+ ghash = (uint8_t *)ctx->gcm_ghash;
+ blockp = ctx->gcm_pt_buf;
+ remainder = pt_len;
+ while (remainder > 0) {
+ /* Incomplete last block */
+ if (remainder < block_size) {
+ bcopy(blockp, ctx->gcm_remainder, remainder);
+ ctx->gcm_remainder_len = remainder;
+ /*
+			 * Not expecting any more ciphertext; just
+			 * compute the plaintext for the remaining input.
+ */
+ gcm_decrypt_incomplete_block(ctx, block_size,
+ processed, encrypt_block, xor_block);
+ ctx->gcm_remainder_len = 0;
+ goto out;
+ }
+ /* add ciphertext to the hash */
+ GHASH(ctx, blockp, ghash, gops);
+
+ /*
+ * Increment counter.
+ * Counter bits are confined to the bottom 32 bits
+ */
+ counter = ntohll(ctx->gcm_cb[1] & counter_mask);
+ counter = htonll(counter + 1);
+ counter &= counter_mask;
+ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
+
+ cbp = (uint8_t *)ctx->gcm_tmp;
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
+
+ /* XOR with ciphertext */
+ xor_block(cbp, blockp);
+
+ processed += block_size;
+ blockp += block_size;
+ remainder -= block_size;
+ }
+out:
+ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
+ GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
+ (uint8_t *)ctx->gcm_J0);
+ xor_block((uint8_t *)ctx->gcm_J0, ghash);
+
+ /* compare the input authentication tag with what we calculated */
+ if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
+ /* They don't match */
+ return (CRYPTO_INVALID_MAC);
+ } else {
+ rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ out->cd_offset += pt_len;
+ }
+ return (CRYPTO_SUCCESS);
+}
+
+static int
+gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
+{
+ size_t tag_len;
+
+ /*
+ * Check the length of the authentication tag (in bits).
+ */
+ tag_len = gcm_param->ulTagBits;
+ switch (tag_len) {
+ case 32:
+ case 64:
+ case 96:
+ case 104:
+ case 112:
+ case 120:
+ case 128:
+ break;
+ default:
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+ if (gcm_param->ulIvLen == 0)
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+
+ return (CRYPTO_SUCCESS);
+}
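+
+/*
+ * A minimal sketch (compiled out, for illustration only) of how a caller
+ * might populate CK_AES_GCM_PARAMS before the validation above is reached.
+ * It assumes the PKCS#11-style field types match those referenced in this
+ * file.
+ */
+#if 0
+static int
+example_gcm_params(CK_AES_GCM_PARAMS *p, uchar_t *iv, ulong_t iv_len,
+    uchar_t *aad, ulong_t aad_len)
+{
+	p->pIv = iv;
+	p->ulIvLen = iv_len;		/* must be non-zero */
+	p->pAAD = aad;
+	p->ulAADLen = aad_len;
+	p->ulTagBits = 128;		/* one of 32, 64, 96, 104, ..., 128 */
+	return (gcm_validate_args(p));
+}
+#endif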
+
+static void
+gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
+ gcm_ctx_t *ctx, size_t block_size,
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ const gcm_impl_ops_t *gops;
+ uint8_t *cb;
+ ulong_t remainder = iv_len;
+ ulong_t processed = 0;
+ uint8_t *datap, *ghash;
+ uint64_t len_a_len_c[2];
+
+ gops = gcm_impl_get_ops();
+ ghash = (uint8_t *)ctx->gcm_ghash;
+ cb = (uint8_t *)ctx->gcm_cb;
+ if (iv_len == 12) {
+ bcopy(iv, cb, 12);
+ cb[12] = 0;
+ cb[13] = 0;
+ cb[14] = 0;
+ cb[15] = 1;
+ /* J0 will be used again in the final */
+ copy_block(cb, (uint8_t *)ctx->gcm_J0);
+ } else {
+ /* GHASH the IV */
+ do {
+ if (remainder < block_size) {
+ bzero(cb, block_size);
+ bcopy(&(iv[processed]), cb, remainder);
+ datap = (uint8_t *)cb;
+ remainder = 0;
+ } else {
+ datap = (uint8_t *)(&(iv[processed]));
+ processed += block_size;
+ remainder -= block_size;
+ }
+ GHASH(ctx, datap, ghash, gops);
+ } while (remainder > 0);
+
+ len_a_len_c[0] = 0;
+ len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
+ GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
+
+ /* J0 will be used again in the final */
+ copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
+ }
+}
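+
+/*
+ * For reference, the construction above follows NIST SP 800-38D: with a
+ * 96-bit IV the pre-counter block is simply J0 = IV || 0^31 || 1; otherwise
+ * J0 = GHASH(IV zero padded to a block boundary || 0^64 || [len(IV)]64).
+ * A standalone sketch of the 96-bit case (compiled out, illustrative only):
+ */
+#if 0
+static void
+example_j0_from_96bit_iv(const uint8_t iv[12], uint8_t j0[16])
+{
+	bcopy(iv, j0, 12);	/* the first 96 bits are the IV itself */
+	j0[12] = 0;
+	j0[13] = 0;
+	j0[14] = 0;
+	j0[15] = 1;		/* the 32-bit block counter starts at one */
+}
+#endif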
+
+static int
+gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
+ unsigned char *auth_data, size_t auth_data_len, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ const gcm_impl_ops_t *gops;
+ uint8_t *ghash, *datap, *authp;
+ size_t remainder, processed;
+
+ /* encrypt zero block to get subkey H */
+ bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
+ (uint8_t *)ctx->gcm_H);
+
+ gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
+ copy_block, xor_block);
+
+ gops = gcm_impl_get_ops();
+ authp = (uint8_t *)ctx->gcm_tmp;
+ ghash = (uint8_t *)ctx->gcm_ghash;
+ bzero(authp, block_size);
+ bzero(ghash, block_size);
+
+ processed = 0;
+ remainder = auth_data_len;
+ do {
+ if (remainder < block_size) {
+ /*
+ * There's not a block full of data, pad rest of
+ * buffer with zero
+ */
+ bzero(authp, block_size);
+ bcopy(&(auth_data[processed]), authp, remainder);
+ datap = (uint8_t *)authp;
+ remainder = 0;
+ } else {
+ datap = (uint8_t *)(&(auth_data[processed]));
+ processed += block_size;
+ remainder -= block_size;
+ }
+
+ /* add auth data to the hash */
+ GHASH(ctx, datap, ghash, gops);
+
+ } while (remainder > 0);
+
+ return (CRYPTO_SUCCESS);
+}
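+
+/*
+ * For reference: with the hash subkey H = E_K(0^128) computed above, each
+ * GHASH() invocation performs one step of
+ *
+ *	Y_0 = 0,	Y_i = (Y_{i-1} xor X_i) * H	in GF(2^128),
+ *
+ * accumulating the result in ctx->gcm_ghash -- here over the AAD blocks,
+ * later over the ciphertext blocks.
+ */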
+
+/*
+ * The following function is called at encrypt or decrypt init time
+ * for AES GCM mode.
+ *
+ * Init the GCM context struct. Handle the cycle and avx implementations here.
+ */
+int
+gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ int rv;
+ CK_AES_GCM_PARAMS *gcm_param;
+
+ if (param != NULL) {
+ gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
+
+ if ((rv = gcm_validate_args(gcm_param)) != 0) {
+ return (rv);
+ }
+
+ gcm_ctx->gcm_tag_len = gcm_param->ulTagBits;
+ gcm_ctx->gcm_tag_len >>= 3;
+ gcm_ctx->gcm_processed_data_len = 0;
+
+ /* these values are in bits */
+ gcm_ctx->gcm_len_a_len_c[0]
+ = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
+
+ rv = CRYPTO_SUCCESS;
+ gcm_ctx->gcm_flags |= GCM_MODE;
+ } else {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+#ifdef CAN_USE_GCM_ASM
+ if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
+ gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
+ } else {
+ /*
+ * Handle the "cycle" implementation by creating avx and
+ * non-avx contexts alternately.
+ */
+ gcm_ctx->gcm_use_avx = gcm_toggle_avx();
+ /*
+ * We don't handle byte swapped key schedules in the avx
+ * code path.
+ */
+ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
+ if (ks->ops->needs_byteswap == B_TRUE) {
+ gcm_ctx->gcm_use_avx = B_FALSE;
+ }
+ /* Use the MOVBE and the BSWAP variants alternately. */
+ if (gcm_ctx->gcm_use_avx == B_TRUE &&
+ zfs_movbe_available() == B_TRUE) {
+ (void) atomic_toggle_boolean_nv(
+ (volatile boolean_t *)&gcm_avx_can_use_movbe);
+ }
+ }
+ /* Allocate Htab memory as needed. */
+ if (gcm_ctx->gcm_use_avx == B_TRUE) {
+ size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
+
+ if (htab_len == 0) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ gcm_ctx->gcm_htab_len = htab_len;
+ gcm_ctx->gcm_Htable =
+ (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
+
+ if (gcm_ctx->gcm_Htable == NULL) {
+ return (CRYPTO_HOST_MEMORY);
+ }
+ }
+	/* Avx and non-avx context initialization differ from here on. */
+ if (gcm_ctx->gcm_use_avx == B_FALSE) {
+#endif /* ifdef CAN_USE_GCM_ASM */
+ if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
+ gcm_param->pAAD, gcm_param->ulAADLen, block_size,
+ encrypt_block, copy_block, xor_block) != 0) {
+ rv = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+#ifdef CAN_USE_GCM_ASM
+ } else {
+ if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
+ gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) {
+ rv = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+ }
+#endif /* ifdef CAN_USE_GCM_ASM */
+
+ return (rv);
+}
+
+int
+gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ int rv;
+ CK_AES_GMAC_PARAMS *gmac_param;
+
+ if (param != NULL) {
+ gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param;
+
+ gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
+ gcm_ctx->gcm_processed_data_len = 0;
+
+ /* these values are in bits */
+ gcm_ctx->gcm_len_a_len_c[0]
+ = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen));
+
+ rv = CRYPTO_SUCCESS;
+ gcm_ctx->gcm_flags |= GMAC_MODE;
+ } else {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+#ifdef CAN_USE_GCM_ASM
+ /*
+ * Handle the "cycle" implementation by creating avx and non avx
+ * contexts alternately.
+ */
+ if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
+ gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
+ } else {
+ gcm_ctx->gcm_use_avx = gcm_toggle_avx();
+ }
+ /* We don't handle byte swapped key schedules in the avx code path. */
+ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
+ if (ks->ops->needs_byteswap == B_TRUE) {
+ gcm_ctx->gcm_use_avx = B_FALSE;
+ }
+ /* Allocate Htab memory as needed. */
+ if (gcm_ctx->gcm_use_avx == B_TRUE) {
+ size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
+
+ if (htab_len == 0) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ gcm_ctx->gcm_htab_len = htab_len;
+ gcm_ctx->gcm_Htable =
+ (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
+
+ if (gcm_ctx->gcm_Htable == NULL) {
+ return (CRYPTO_HOST_MEMORY);
+ }
+ }
+
+	/* Avx and non-avx context initialization differ from here on. */
+ if (gcm_ctx->gcm_use_avx == B_FALSE) {
+#endif /* ifdef CAN_USE_GCM_ASM */
+ if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
+ gmac_param->pAAD, gmac_param->ulAADLen, block_size,
+ encrypt_block, copy_block, xor_block) != 0) {
+ rv = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+#ifdef CAN_USE_GCM_ASM
+ } else {
+ if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
+ gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) {
+ rv = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+ }
+#endif /* ifdef CAN_USE_GCM_ASM */
+
+ return (rv);
+}
+
+void *
+gcm_alloc_ctx(int kmflag)
+{
+ gcm_ctx_t *gcm_ctx;
+
+ if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ gcm_ctx->gcm_flags = GCM_MODE;
+ return (gcm_ctx);
+}
+
+void *
+gmac_alloc_ctx(int kmflag)
+{
+ gcm_ctx_t *gcm_ctx;
+
+ if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ gcm_ctx->gcm_flags = GMAC_MODE;
+ return (gcm_ctx);
+}
+
+void
+gcm_set_kmflag(gcm_ctx_t *ctx, int kmflag)
+{
+ ctx->gcm_kmflag = kmflag;
+}
+
+/* GCM implementation that contains the fastest methods */
+static gcm_impl_ops_t gcm_fastest_impl = {
+ .name = "fastest"
+};
+
+/* All compiled in implementations */
+const gcm_impl_ops_t *gcm_all_impl[] = {
+ &gcm_generic_impl,
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+ &gcm_pclmulqdq_impl,
+#endif
+};
+
+/* Indicates that gcm_impl_init() has completed */
+static boolean_t gcm_impl_initialized = B_FALSE;
+
+/* Hold all supported implementations */
+static size_t gcm_supp_impl_cnt = 0;
+static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
+
+/*
+ * Returns the GCM operations for encrypt/decrypt/key setup.  When a
+ * SIMD implementation is not allowed in the current context, fall back
+ * to the generic implementation.
+ */
+const gcm_impl_ops_t *
+gcm_impl_get_ops()
+{
+ if (!kfpu_allowed())
+ return (&gcm_generic_impl);
+
+ const gcm_impl_ops_t *ops = NULL;
+ const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
+
+ switch (impl) {
+ case IMPL_FASTEST:
+ ASSERT(gcm_impl_initialized);
+ ops = &gcm_fastest_impl;
+ break;
+ case IMPL_CYCLE:
+ /* Cycle through supported implementations */
+ ASSERT(gcm_impl_initialized);
+ ASSERT3U(gcm_supp_impl_cnt, >, 0);
+ static size_t cycle_impl_idx = 0;
+ size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
+ ops = gcm_supp_impl[idx];
+ break;
+#ifdef CAN_USE_GCM_ASM
+ case IMPL_AVX:
+ /*
+ * Make sure that we return a valid implementation while
+		 * switching to the avx implementation, since there may
+		 * still be unfinished non-avx contexts around.
+ */
+ ops = &gcm_generic_impl;
+ break;
+#endif
+ default:
+ ASSERT3U(impl, <, gcm_supp_impl_cnt);
+ ASSERT3U(gcm_supp_impl_cnt, >, 0);
+ if (impl < ARRAY_SIZE(gcm_all_impl))
+ ops = gcm_supp_impl[impl];
+ break;
+ }
+
+ ASSERT3P(ops, !=, NULL);
+
+ return (ops);
+}
+
+/*
+ * Initialize all supported implementations.
+ */
+void
+gcm_impl_init(void)
+{
+ gcm_impl_ops_t *curr_impl;
+ int i, c;
+
+	/* Move supported implementations into gcm_supp_impl */
+ for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
+ curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
+
+ if (curr_impl->is_supported())
+ gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
+ }
+ gcm_supp_impl_cnt = c;
+
+ /*
+ * Set the fastest implementation given the assumption that the
+ * hardware accelerated version is the fastest.
+ */
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+ if (gcm_pclmulqdq_impl.is_supported()) {
+ memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
+ sizeof (gcm_fastest_impl));
+ } else
+#endif
+ {
+ memcpy(&gcm_fastest_impl, &gcm_generic_impl,
+ sizeof (gcm_fastest_impl));
+ }
+
+ strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
+
+#ifdef CAN_USE_GCM_ASM
+ /*
+ * Use the avx implementation if it's available and the implementation
+ * hasn't changed from its default value of fastest on module load.
+ */
+ if (gcm_avx_will_work()) {
+#ifdef HAVE_MOVBE
+ if (zfs_movbe_available() == B_TRUE) {
+ atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
+ }
+#endif
+ if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
+ gcm_set_avx(B_TRUE);
+ }
+ }
+#endif
+ /* Finish initialization */
+ atomic_swap_32(&icp_gcm_impl, user_sel_impl);
+ gcm_impl_initialized = B_TRUE;
+}
+
+static const struct {
+ char *name;
+ uint32_t sel;
+} gcm_impl_opts[] = {
+ { "cycle", IMPL_CYCLE },
+ { "fastest", IMPL_FASTEST },
+#ifdef CAN_USE_GCM_ASM
+ { "avx", IMPL_AVX },
+#endif
+};
+
+/*
+ * Set the desired gcm implementation.
+ *
+ * If we are called before init(), the user preference is saved in
+ * user_sel_impl and applied in a later init() call.  This happens when the
+ * module parameter is specified on module load.  Otherwise, icp_gcm_impl is
+ * updated directly.
+ *
+ * @val Name of the gcm implementation to use
+ */
+int
+gcm_impl_set(const char *val)
+{
+ int err = -EINVAL;
+ char req_name[GCM_IMPL_NAME_MAX];
+ uint32_t impl = GCM_IMPL_READ(user_sel_impl);
+ size_t i;
+
+ /* sanitize input */
+ i = strnlen(val, GCM_IMPL_NAME_MAX);
+ if (i == 0 || i >= GCM_IMPL_NAME_MAX)
+ return (err);
+
+ strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
+ while (i > 0 && isspace(req_name[i-1]))
+ i--;
+ req_name[i] = '\0';
+
+ /* Check mandatory options */
+ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
+#ifdef CAN_USE_GCM_ASM
+ /* Ignore avx implementation if it won't work. */
+ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
+ continue;
+ }
+#endif
+ if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
+ impl = gcm_impl_opts[i].sel;
+ err = 0;
+ break;
+ }
+ }
+
+ /* check all supported impl if init() was already called */
+ if (err != 0 && gcm_impl_initialized) {
+ /* check all supported implementations */
+ for (i = 0; i < gcm_supp_impl_cnt; i++) {
+ if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
+ impl = i;
+ err = 0;
+ break;
+ }
+ }
+ }
+#ifdef CAN_USE_GCM_ASM
+ /*
+ * Use the avx implementation if available and the requested one is
+ * avx or fastest.
+ */
+ if (gcm_avx_will_work() == B_TRUE &&
+ (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
+ gcm_set_avx(B_TRUE);
+ } else {
+ gcm_set_avx(B_FALSE);
+ }
+#endif
+
+ if (err == 0) {
+ if (gcm_impl_initialized)
+ atomic_swap_32(&icp_gcm_impl, impl);
+ else
+ atomic_swap_32(&user_sel_impl, impl);
+ }
+
+ return (err);
+}
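+
+/*
+ * Usage sketch: on Linux this setter is reached through the icp_gcm_impl
+ * module parameter registered below, e.g. "modprobe icp icp_gcm_impl=avx"
+ * at load time or, assuming the module is loaded as icp.ko and the usual
+ * sysfs layout for module parameters, by writing one of the accepted names
+ * ("cycle", "fastest", "avx", or a compiled-in implementation such as
+ * "generic") to /sys/module/icp/parameters/icp_gcm_impl at runtime.
+ */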
+
+#if defined(_KERNEL) && defined(__linux__)
+
+static int
+icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
+{
+ return (gcm_impl_set(val));
+}
+
+static int
+icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+ int i, cnt = 0;
+ char *fmt;
+ const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
+
+ ASSERT(gcm_impl_initialized);
+
+ /* list mandatory options */
+ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
+#ifdef CAN_USE_GCM_ASM
+ /* Ignore avx implementation if it won't work. */
+ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
+ continue;
+ }
+#endif
+ fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name);
+ }
+
+ /* list all supported implementations */
+ for (i = 0; i < gcm_supp_impl_cnt; i++) {
+ fmt = (i == impl) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, gcm_supp_impl[i]->name);
+ }
+
+ return (cnt);
+}
+
+module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
+ NULL, 0644);
+MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
+#endif /* defined(_KERNEL) && defined(__linux__) */
+
+#ifdef CAN_USE_GCM_ASM
+#define GCM_BLOCK_LEN 16
+/*
+ * The openssl asm routines are 6x aggregated and need that many bytes
+ * at minimum.
+ */
+#define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
+#define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
+/*
+ * Ensure the chunk size is reasonable, since we are allocating a buffer of
+ * up to GCM_AVX_MAX_CHUNK_SIZE bytes and disabling preemption and interrupts.
+ */
+#define GCM_AVX_MAX_CHUNK_SIZE \
+ (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
+
+/* Get the chunk size module parameter. */
+#define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
+
+/* Clear the FPU registers since they hold sensitive internal state. */
+#define clear_fpu_regs() clear_fpu_regs_avx()
+#define GHASH_AVX(ctx, in, len) \
+ gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
+ in, len)
+
+#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
+
+/*
+ * Module parameter: number of bytes to process at once while owning the FPU.
+ * It is rounded down to the nearest GCM_AVX_MIN_DECRYPT_BYTES boundary and
+ * guaranteed to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
+ */
+static uint32_t gcm_avx_chunk_size =
+ ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
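+
+/*
+ * Worked example of the rounding above: GCM_AVX_MIN_DECRYPT_BYTES is
+ * 16 * 6 = 96, so the default chunk size is (32768 / 96) * 96 = 32736
+ * bytes and GCM_AVX_MAX_CHUNK_SIZE is (131072 / 96) * 96 = 131040 bytes.
+ */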
+
+extern void clear_fpu_regs_avx(void);
+extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
+extern void aes_encrypt_intel(const uint32_t rk[], int nr,
+ const uint32_t pt[4], uint32_t ct[4]);
+
+extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
+extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
+ const uint8_t *in, size_t len);
+
+extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
+ const void *, uint64_t *, uint64_t *);
+
+extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
+ const void *, uint64_t *, uint64_t *);
+
+static inline boolean_t
+gcm_avx_will_work(void)
+{
+ /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
+ return (kfpu_allowed() &&
+ zfs_avx_available() && zfs_aes_available() &&
+ zfs_pclmulqdq_available());
+}
+
+static inline void
+gcm_set_avx(boolean_t val)
+{
+ if (gcm_avx_will_work() == B_TRUE) {
+ atomic_swap_32(&gcm_use_avx, val);
+ }
+}
+
+static inline boolean_t
+gcm_toggle_avx(void)
+{
+ if (gcm_avx_will_work() == B_TRUE) {
+ return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
+ } else {
+ return (B_FALSE);
+ }
+}
+
+static inline size_t
+gcm_simd_get_htab_size(boolean_t simd_mode)
+{
+ switch (simd_mode) {
+ case B_TRUE:
+ return (2 * 6 * 2 * sizeof (uint64_t));
+
+ default:
+ return (0);
+ }
+}
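+
+/*
+ * For reference: 2 * 6 * 2 * sizeof (uint64_t) works out to 192 bytes,
+ * i.e. twelve 16-byte GHASH table entries, presumably sized for the 6x
+ * aggregated routines mentioned above.
+ */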
+
+/*
+ * Clear sensitive data in the context.
+ *
+ * ctx->gcm_remainder may contain a plaintext remainder.  ctx->gcm_H and
+ * ctx->gcm_Htable contain the hash subkey, which protects authentication.
+ *
+ * Although it is extremely unlikely that ctx->gcm_J0 and ctx->gcm_tmp could
+ * be used for a known plaintext attack, they consist of the IV and the first
+ * and last counter block respectively.  Whether they should be cleared is
+ * debatable.
+ */
+static inline void
+gcm_clear_ctx(gcm_ctx_t *ctx)
+{
+ bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder));
+ bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
+ bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0));
+ bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp));
+}
+
+/* Increment the GCM counter block by n. */
+static inline void
+gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
+{
+ uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+ uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
+
+ counter = htonll(counter + n);
+ counter &= counter_mask;
+ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
+}
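+
+/*
+ * Worked example: with the counter block stored big-endian and n = 1,
+ *
+ *	... 00 00 00 01  ->  ... 00 00 00 02
+ *	... ff ff ff ff  ->  ... 00 00 00 00
+ *
+ * i.e. only the low-order 32 bits change; the carry out of bit 32 is
+ * discarded by the mask and the upper 96 bits of the block are preserved.
+ */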
+
+/*
+ * Encrypt multiple blocks of data in GCM mode.
+ * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
+ * if possible. While processing a chunk the FPU is "locked".
+ */
+static int
+gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
+ size_t length, crypto_data_t *out, size_t block_size)
+{
+ size_t bleft = length;
+ size_t need = 0;
+ size_t done = 0;
+ uint8_t *datap = (uint8_t *)data;
+ size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
+ const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
+ uint64_t *ghash = ctx->gcm_ghash;
+ uint64_t *cb = ctx->gcm_cb;
+ uint8_t *ct_buf = NULL;
+ uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
+ int rv = CRYPTO_SUCCESS;
+
+ ASSERT(block_size == GCM_BLOCK_LEN);
+ /*
+ * If the last call left an incomplete block, try to fill
+ * it first.
+ */
+ if (ctx->gcm_remainder_len > 0) {
+ need = block_size - ctx->gcm_remainder_len;
+ if (length < need) {
+ /* Accumulate bytes here and return. */
+ bcopy(datap, (uint8_t *)ctx->gcm_remainder +
+ ctx->gcm_remainder_len, length);
+
+ ctx->gcm_remainder_len += length;
+ if (ctx->gcm_copy_to == NULL) {
+ ctx->gcm_copy_to = datap;
+ }
+ return (CRYPTO_SUCCESS);
+ } else {
+ /* Complete incomplete block. */
+ bcopy(datap, (uint8_t *)ctx->gcm_remainder +
+ ctx->gcm_remainder_len, need);
+
+ ctx->gcm_copy_to = NULL;
+ }
+ }
+
+ /* Allocate a buffer to encrypt to if there is enough input. */
+ if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
+ ct_buf = vmem_alloc(chunk_size, ctx->gcm_kmflag);
+ if (ct_buf == NULL) {
+ return (CRYPTO_HOST_MEMORY);
+ }
+ }
+
+ /* If we completed an incomplete block, encrypt and write it out. */
+ if (ctx->gcm_remainder_len > 0) {
+ kfpu_begin();
+ aes_encrypt_intel(key->encr_ks.ks32, key->nr,
+ (const uint32_t *)cb, (uint32_t *)tmp);
+
+ gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
+ GHASH_AVX(ctx, tmp, block_size);
+ clear_fpu_regs();
+ kfpu_end();
+ rv = crypto_put_output_data(tmp, out, block_size);
+ out->cd_offset += block_size;
+ gcm_incr_counter_block(ctx);
+ ctx->gcm_processed_data_len += block_size;
+ bleft -= need;
+ datap += need;
+ ctx->gcm_remainder_len = 0;
+ }
+
+ /* Do the bulk encryption in chunk_size blocks. */
+ for (; bleft >= chunk_size; bleft -= chunk_size) {
+ kfpu_begin();
+ done = aesni_gcm_encrypt(
+ datap, ct_buf, chunk_size, key, cb, ghash);
+
+ clear_fpu_regs();
+ kfpu_end();
+ if (done != chunk_size) {
+ rv = CRYPTO_FAILED;
+ goto out_nofpu;
+ }
+ rv = crypto_put_output_data(ct_buf, out, chunk_size);
+ if (rv != CRYPTO_SUCCESS) {
+ goto out_nofpu;
+ }
+ out->cd_offset += chunk_size;
+ datap += chunk_size;
+ ctx->gcm_processed_data_len += chunk_size;
+ }
+ /* Check if we are already done. */
+ if (bleft == 0) {
+ goto out_nofpu;
+ }
+ /* Bulk encrypt the remaining data. */
+ kfpu_begin();
+ if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
+ done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
+ if (done == 0) {
+ rv = CRYPTO_FAILED;
+ goto out;
+ }
+ rv = crypto_put_output_data(ct_buf, out, done);
+ if (rv != CRYPTO_SUCCESS) {
+ goto out;
+ }
+ out->cd_offset += done;
+ ctx->gcm_processed_data_len += done;
+ datap += done;
+ bleft -= done;
+
+ }
+ /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
+ while (bleft > 0) {
+ if (bleft < block_size) {
+ bcopy(datap, ctx->gcm_remainder, bleft);
+ ctx->gcm_remainder_len = bleft;
+ ctx->gcm_copy_to = datap;
+ goto out;
+ }
+ /* Encrypt, hash and write out. */
+ aes_encrypt_intel(key->encr_ks.ks32, key->nr,
+ (const uint32_t *)cb, (uint32_t *)tmp);
+
+ gcm_xor_avx(datap, tmp);
+ GHASH_AVX(ctx, tmp, block_size);
+ rv = crypto_put_output_data(tmp, out, block_size);
+ if (rv != CRYPTO_SUCCESS) {
+ goto out;
+ }
+ out->cd_offset += block_size;
+ gcm_incr_counter_block(ctx);
+ ctx->gcm_processed_data_len += block_size;
+ datap += block_size;
+ bleft -= block_size;
+ }
+out:
+ clear_fpu_regs();
+ kfpu_end();
+out_nofpu:
+ if (ct_buf != NULL) {
+ vmem_free(ct_buf, chunk_size);
+ }
+ return (rv);
+}
+
+/*
+ * Finalize the encryption: zero fill, encrypt, hash and write out any
+ * incomplete last block, encrypt the ICB, then calculate the tag and write
+ * it out.
+ */
+static int
+gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
+{
+ uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
+ uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
+ uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
+ size_t rem_len = ctx->gcm_remainder_len;
+ const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
+ int aes_rounds = ((aes_key_t *)keysched)->nr;
+ int rv;
+
+ ASSERT(block_size == GCM_BLOCK_LEN);
+
+ if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ kfpu_begin();
+ /* Pad last incomplete block with zeros, encrypt and hash. */
+ if (rem_len > 0) {
+ uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
+ const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
+
+ aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
+ bzero(remainder + rem_len, block_size - rem_len);
+ for (int i = 0; i < rem_len; i++) {
+ remainder[i] ^= tmp[i];
+ }
+ GHASH_AVX(ctx, remainder, block_size);
+ ctx->gcm_processed_data_len += rem_len;
+ /* No need to increment counter_block, it's the last block. */
+ }
+ /* Finish tag. */
+ ctx->gcm_len_a_len_c[1] =
+ htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
+ GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
+ aes_encrypt_intel(keysched, aes_rounds, J0, J0);
+
+ gcm_xor_avx((uint8_t *)J0, ghash);
+ clear_fpu_regs();
+ kfpu_end();
+
+ /* Output remainder. */
+ if (rem_len > 0) {
+ rv = crypto_put_output_data(remainder, out, rem_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+ out->cd_offset += rem_len;
+ ctx->gcm_remainder_len = 0;
+ rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+
+ out->cd_offset += ctx->gcm_tag_len;
+ /* Clear sensitive data in the context before returning. */
+ gcm_clear_ctx(ctx);
+ return (CRYPTO_SUCCESS);
+}
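+
+/*
+ * For reference, the tag written out above is, in the notation of NIST
+ * SP 800-38D,
+ *
+ *	S = GHASH_H(A || 0^v || C || 0^u || [len(A)]64 || [len(C)]64)
+ *	T = MSB_t(E_K(J0) xor S)
+ *
+ * where gcm_ghash accumulates S, gcm_J0 holds the pre-counter block and
+ * t is the requested tag length.
+ */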
+
+/*
+ * Finalize decryption: so far we have only accumulated the ciphertext, so
+ * now we decrypt it here in place.
+ */
+static int
+gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
+{
+ ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
+ ASSERT3U(block_size, ==, 16);
+
+ size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
+ size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
+ uint8_t *datap = ctx->gcm_pt_buf;
+ const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
+ uint32_t *cb = (uint32_t *)ctx->gcm_cb;
+ uint64_t *ghash = ctx->gcm_ghash;
+ uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
+ int rv = CRYPTO_SUCCESS;
+ size_t bleft, done;
+
+ /*
+ * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
+	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
+ * GCM_AVX_MIN_DECRYPT_BYTES.
+ */
+ for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
+ kfpu_begin();
+ done = aesni_gcm_decrypt(datap, datap, chunk_size,
+ (const void *)key, ctx->gcm_cb, ghash);
+ clear_fpu_regs();
+ kfpu_end();
+ if (done != chunk_size) {
+ return (CRYPTO_FAILED);
+ }
+ datap += done;
+ }
+	/* Decrypt the remainder, less than the chunk size, in one go. */
+ kfpu_begin();
+ if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
+ done = aesni_gcm_decrypt(datap, datap, bleft,
+ (const void *)key, ctx->gcm_cb, ghash);
+ if (done == 0) {
+ clear_fpu_regs();
+ kfpu_end();
+ return (CRYPTO_FAILED);
+ }
+ datap += done;
+ bleft -= done;
+ }
+ ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
+
+ /*
+	 * Now fewer than GCM_AVX_MIN_DECRYPT_BYTES bytes remain;
+ * decrypt them block by block.
+ */
+ while (bleft > 0) {
+ /* Incomplete last block. */
+ if (bleft < block_size) {
+ uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
+
+ bzero(lastb, block_size);
+ bcopy(datap, lastb, bleft);
+ /* The GCM processing. */
+ GHASH_AVX(ctx, lastb, block_size);
+ aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
+ for (size_t i = 0; i < bleft; i++) {
+ datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
+ }
+ break;
+ }
+ /* The GCM processing. */
+ GHASH_AVX(ctx, datap, block_size);
+ aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
+ gcm_xor_avx((uint8_t *)tmp, datap);
+ gcm_incr_counter_block(ctx);
+
+ datap += block_size;
+ bleft -= block_size;
+ }
+ if (rv != CRYPTO_SUCCESS) {
+ clear_fpu_regs();
+ kfpu_end();
+ return (rv);
+ }
+ /* Decryption done, finish the tag. */
+ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
+ GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
+ aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
+ (uint32_t *)ctx->gcm_J0);
+
+ gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
+
+ /* We are done with the FPU, restore its state. */
+ clear_fpu_regs();
+ kfpu_end();
+
+ /* Compare the input authentication tag with what we calculated. */
+ if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
+ /* They don't match. */
+ return (CRYPTO_INVALID_MAC);
+ }
+ rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
+ if (rv != CRYPTO_SUCCESS) {
+ return (rv);
+ }
+ out->cd_offset += pt_len;
+ gcm_clear_ctx(ctx);
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Initialize the GCM params H, Htable and the counter block.  Save the
+ * initial counter block.
+ */
+static int
+gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
+ unsigned char *auth_data, size_t auth_data_len, size_t block_size)
+{
+ uint8_t *cb = (uint8_t *)ctx->gcm_cb;
+ uint64_t *H = ctx->gcm_H;
+ const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
+ int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
+ uint8_t *datap = auth_data;
+ size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
+ size_t bleft;
+
+ ASSERT(block_size == GCM_BLOCK_LEN);
+
+ /* Init H (encrypt zero block) and create the initial counter block. */
+ bzero(ctx->gcm_ghash, sizeof (ctx->gcm_ghash));
+ bzero(H, sizeof (ctx->gcm_H));
+ kfpu_begin();
+ aes_encrypt_intel(keysched, aes_rounds,
+ (const uint32_t *)H, (uint32_t *)H);
+
+ gcm_init_htab_avx(ctx->gcm_Htable, H);
+
+ if (iv_len == 12) {
+ bcopy(iv, cb, 12);
+ cb[12] = 0;
+ cb[13] = 0;
+ cb[14] = 0;
+ cb[15] = 1;
+ /* We need the ICB later. */
+ bcopy(cb, ctx->gcm_J0, sizeof (ctx->gcm_J0));
+ } else {
+ /*
+		 * Most consumers use 12-byte IVs, so it's OK to use the
+		 * original routines for other IV sizes; just avoid nesting
+		 * kfpu_begin() calls.
+ */
+ clear_fpu_regs();
+ kfpu_end();
+ gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
+ aes_copy_block, aes_xor_block);
+ kfpu_begin();
+ }
+
+	/* OpenSSL post-increments the counter; adjust for that. */
+ gcm_incr_counter_block(ctx);
+
+ /* Ghash AAD in chunk_size blocks. */
+ for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
+ GHASH_AVX(ctx, datap, chunk_size);
+ datap += chunk_size;
+ clear_fpu_regs();
+ kfpu_end();
+ kfpu_begin();
+ }
+ /* Ghash the remainder and handle possible incomplete GCM block. */
+ if (bleft > 0) {
+ size_t incomp = bleft % block_size;
+
+ bleft -= incomp;
+ if (bleft > 0) {
+ GHASH_AVX(ctx, datap, bleft);
+ datap += bleft;
+ }
+ if (incomp > 0) {
+ /* Zero pad and hash incomplete last block. */
+ uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
+
+ bzero(authp, block_size);
+ bcopy(datap, authp, incomp);
+ GHASH_AVX(ctx, authp, block_size);
+ }
+ }
+ clear_fpu_regs();
+ kfpu_end();
+ return (CRYPTO_SUCCESS);
+}
+
+#if defined(_KERNEL)
+static int
+icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
+{
+ unsigned long val;
+ char val_rounded[16];
+ int error = 0;
+
+ error = kstrtoul(buf, 0, &val);
+ if (error)
+ return (error);
+
+ val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
+
+ if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
+ return (-EINVAL);
+
+ snprintf(val_rounded, 16, "%u", (uint32_t)val);
+ error = param_set_uint(val_rounded, kp);
+ return (error);
+}
+
+module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
+ param_get_uint, &gcm_avx_chunk_size, 0644);
+
+MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
+ "How many bytes to process while owning the FPU");
+
+#endif /* defined(_KERNEL) */
+#endif /* ifdef CAN_USE_GCM_ASM */
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm_generic.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm_generic.c
new file mode 100644
index 000000000000..16b57998a92f
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm_generic.c
@@ -0,0 +1,83 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <modes/gcm_impl.h>
+
+struct aes_block {
+ uint64_t a;
+ uint64_t b;
+};
+
+/*
+ * Perform a carry-less multiplication (that is, use XOR instead of the
+ * multiply operator) on *x_in and *y and place the result in *res.
+ *
+ * Byte swap the input (*x_in and *y) and the output (*res).
+ *
+ * Note: x_in, y, and res all point to 16-byte numbers (an array of two
+ * 64-bit integers).
+ */
+static void
+gcm_generic_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
+{
+ static const uint64_t R = 0xe100000000000000ULL;
+ struct aes_block z = {0, 0};
+ struct aes_block v;
+ uint64_t x;
+ int i, j;
+
+ v.a = ntohll(y[0]);
+ v.b = ntohll(y[1]);
+
+ for (j = 0; j < 2; j++) {
+ x = ntohll(x_in[j]);
+ for (i = 0; i < 64; i++, x <<= 1) {
+ if (x & 0x8000000000000000ULL) {
+ z.a ^= v.a;
+ z.b ^= v.b;
+ }
+ if (v.b & 1ULL) {
+ v.b = (v.a << 63)|(v.b >> 1);
+ v.a = (v.a >> 1) ^ R;
+ } else {
+ v.b = (v.a << 63)|(v.b >> 1);
+ v.a = v.a >> 1;
+ }
+ }
+ }
+ res[0] = htonll(z.a);
+ res[1] = htonll(z.b);
+}
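+
+/*
+ * Note: the constant R above encodes the GCM reduction polynomial
+ * x^128 + x^7 + x^2 + x + 1 in the reflected bit order GHASH uses, so each
+ * loop iteration is one shift-and-conditionally-XOR step of a bitwise
+ * multiplication in GF(2^128) followed by reduction.
+ */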
+
+static boolean_t
+gcm_generic_will_work(void)
+{
+ return (B_TRUE);
+}
+
+const gcm_impl_ops_t gcm_generic_impl = {
+ .mul = &gcm_generic_mul,
+ .is_supported = &gcm_generic_will_work,
+ .name = "generic"
+};
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm_pclmulqdq.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm_pclmulqdq.c
new file mode 100644
index 000000000000..05920115ce86
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm_pclmulqdq.c
@@ -0,0 +1,64 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+/* These functions are used to execute pclmulqdq based assembly methods */
+extern void gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *);
+
+#include <modes/gcm_impl.h>
+
+/*
+ * Perform a carry-less multiplication (that is, use XOR instead of the
+ * multiply operator) on *x_in and *y and place the result in *res.
+ *
+ * Byte swap the input (*x_in and *y) and the output (*res).
+ *
+ * Note: x_in, y, and res all point to 16-byte numbers (an array of two
+ * 64-bit integers).
+ */
+static void
+gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
+{
+ kfpu_begin();
+ gcm_mul_pclmulqdq(x_in, y, res);
+ kfpu_end();
+}
+
+static boolean_t
+gcm_pclmulqdq_will_work(void)
+{
+ return (kfpu_allowed() && zfs_pclmulqdq_available());
+}
+
+const gcm_impl_ops_t gcm_pclmulqdq_impl = {
+ .mul = &gcm_pclmulqdq_mul,
+ .is_supported = &gcm_pclmulqdq_will_work,
+ .name = "pclmulqdq"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_PCLMULQDQ) */
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/modes.c b/sys/contrib/openzfs/module/icp/algs/modes/modes.c
new file mode 100644
index 000000000000..59743c7d6829
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/modes.c
@@ -0,0 +1,165 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Initialize by setting iov_or_mp to point to the current iovec or mp,
+ * and by setting current_offset to an offset within the current iovec or mp.
+ */
+void
+crypto_init_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset)
+{
+ offset_t offset;
+
+ switch (out->cd_format) {
+ case CRYPTO_DATA_RAW:
+ *current_offset = out->cd_offset;
+ break;
+
+ case CRYPTO_DATA_UIO: {
+ zfs_uio_t *uiop = out->cd_uio;
+ uint_t vec_idx;
+
+ offset = out->cd_offset;
+ offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx);
+
+ *current_offset = offset;
+ *iov_or_mp = (void *)(uintptr_t)vec_idx;
+ break;
+ }
+ } /* end switch */
+}
+
+/*
+ * Get pointers for where in the output to copy a block of encrypted or
+ * decrypted data. The iov_or_mp argument stores a pointer to the current
+ * iovec or mp, and offset stores an offset into the current iovec or mp.
+ */
+void
+crypto_get_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset,
+ uint8_t **out_data_1, size_t *out_data_1_len, uint8_t **out_data_2,
+ size_t amt)
+{
+ offset_t offset;
+
+ switch (out->cd_format) {
+ case CRYPTO_DATA_RAW: {
+ iovec_t *iov;
+
+ offset = *current_offset;
+ iov = &out->cd_raw;
+ if ((offset + amt) <= iov->iov_len) {
+ /* one block fits */
+ *out_data_1 = (uint8_t *)iov->iov_base + offset;
+ *out_data_1_len = amt;
+ *out_data_2 = NULL;
+ *current_offset = offset + amt;
+ }
+ break;
+ }
+
+ case CRYPTO_DATA_UIO: {
+ zfs_uio_t *uio = out->cd_uio;
+ offset_t offset;
+ uint_t vec_idx;
+ uint8_t *p;
+ uint64_t iov_len;
+ void *iov_base;
+
+ offset = *current_offset;
+ vec_idx = (uintptr_t)(*iov_or_mp);
+ zfs_uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len);
+ p = (uint8_t *)iov_base + offset;
+ *out_data_1 = p;
+
+ if (offset + amt <= iov_len) {
+ /* can fit one block into this iov */
+ *out_data_1_len = amt;
+ *out_data_2 = NULL;
+ *current_offset = offset + amt;
+ } else {
+ /* one block spans two iovecs */
+ *out_data_1_len = iov_len - offset;
+ if (vec_idx == zfs_uio_iovcnt(uio))
+ return;
+ vec_idx++;
+ zfs_uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len);
+ *out_data_2 = (uint8_t *)iov_base;
+ *current_offset = amt - *out_data_1_len;
+ }
+ *iov_or_mp = (void *)(uintptr_t)vec_idx;
+ break;
+ }
+ } /* end switch */
+}
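+
+/*
+ * Worked example: with amt = 16 and only 10 bytes left in the current
+ * iovec, *out_data_1 gets those 10 bytes, *out_data_2 points at the start
+ * of the next iovec for the remaining 6, and *current_offset becomes 6 so
+ * the next call continues from there.
+ */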
+
+void
+crypto_free_mode_ctx(void *ctx)
+{
+ common_ctx_t *common_ctx = (common_ctx_t *)ctx;
+
+ switch (common_ctx->cc_flags &
+ (ECB_MODE|CBC_MODE|CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) {
+ case ECB_MODE:
+ kmem_free(common_ctx, sizeof (ecb_ctx_t));
+ break;
+
+ case CBC_MODE:
+ kmem_free(common_ctx, sizeof (cbc_ctx_t));
+ break;
+
+ case CTR_MODE:
+ kmem_free(common_ctx, sizeof (ctr_ctx_t));
+ break;
+
+ case CCM_MODE:
+ if (((ccm_ctx_t *)ctx)->ccm_pt_buf != NULL)
+ vmem_free(((ccm_ctx_t *)ctx)->ccm_pt_buf,
+ ((ccm_ctx_t *)ctx)->ccm_data_len);
+
+ kmem_free(ctx, sizeof (ccm_ctx_t));
+ break;
+
+ case GCM_MODE:
+ case GMAC_MODE:
+ if (((gcm_ctx_t *)ctx)->gcm_pt_buf != NULL)
+ vmem_free(((gcm_ctx_t *)ctx)->gcm_pt_buf,
+ ((gcm_ctx_t *)ctx)->gcm_pt_buf_len);
+
+#ifdef CAN_USE_GCM_ASM
+ if (((gcm_ctx_t *)ctx)->gcm_Htable != NULL) {
+ gcm_ctx_t *gcm_ctx = (gcm_ctx_t *)ctx;
+ bzero(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len);
+ kmem_free(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len);
+ }
+#endif
+
+ kmem_free(ctx, sizeof (gcm_ctx_t));
+ }
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/sha1/sha1.c b/sys/contrib/openzfs/module/icp/algs/sha1/sha1.c
new file mode 100644
index 000000000000..da34222c8fc3
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/sha1/sha1.c
@@ -0,0 +1,835 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * The basic framework for this code came from the reference
+ * implementation for MD5. That implementation is Copyright (C)
+ * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved.
+ *
+ * License to copy and use this software is granted provided that it
+ * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ * Algorithm" in all material mentioning or referencing this software
+ * or this function.
+ *
+ * License is also granted to make and use derivative works provided
+ * that such works are identified as "derived from the RSA Data
+ * Security, Inc. MD5 Message-Digest Algorithm" in all material
+ * mentioning or referencing the derived work.
+ *
+ * RSA Data Security, Inc. makes no representations concerning either
+ * the merchantability of this software or the suitability of this
+ * software for any particular purpose. It is provided "as is"
+ * without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this
+ * documentation and/or software.
+ *
+ * NOTE: Cleaned-up and optimized version of SHA1, based on the FIPS 180-1
+ * standard, available at http://www.itl.nist.gov/fipspubs/fip180-1.htm
+ * Not as fast as one would like -- further optimizations are encouraged
+ * and appreciated.
+ */
+
+#include <sys/zfs_context.h>
+#include <sha1/sha1.h>
+#include <sha1/sha1_consts.h>
+
+#ifdef _LITTLE_ENDIAN
+#include <sys/byteorder.h>
+#define HAVE_HTONL
+#endif
+
+#define _RESTRICT_KYWD
+
+static void Encode(uint8_t *, const uint32_t *, size_t);
+
+#if defined(__sparc)
+
+#define SHA1_TRANSFORM(ctx, in) \
+ SHA1Transform((ctx)->state[0], (ctx)->state[1], (ctx)->state[2], \
+ (ctx)->state[3], (ctx)->state[4], (ctx), (in))
+
+static void SHA1Transform(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t,
+ SHA1_CTX *, const uint8_t *);
+
+#elif defined(__amd64)
+
+#define SHA1_TRANSFORM(ctx, in) sha1_block_data_order((ctx), (in), 1)
+#define SHA1_TRANSFORM_BLOCKS(ctx, in, num) sha1_block_data_order((ctx), \
+ (in), (num))
+
+void sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t num_blocks);
+
+#else
+
+#define SHA1_TRANSFORM(ctx, in) SHA1Transform((ctx), (in))
+
+static void SHA1Transform(SHA1_CTX *, const uint8_t *);
+
+#endif
+
+
+static uint8_t PADDING[64] = { 0x80, /* all zeros */ };
+
+/*
+ * F, G, and H are the basic SHA1 functions.
+ */
+#define F(b, c, d) (((b) & (c)) | ((~b) & (d)))
+#define G(b, c, d) ((b) ^ (c) ^ (d))
+#define H(b, c, d) (((b) & (c)) | (((b)|(c)) & (d)))
+
+/*
+ * SHA1Init()
+ *
+ * purpose: initializes the sha1 context and begins an sha1 digest operation
+ * input: SHA1_CTX *	: the context to initialize.
+ * output: void
+ */
+
+void
+SHA1Init(SHA1_CTX *ctx)
+{
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /*
+ * load magic initialization constants. Tell lint
+ * that these constants are unsigned by using U.
+ */
+
+ ctx->state[0] = 0x67452301U;
+ ctx->state[1] = 0xefcdab89U;
+ ctx->state[2] = 0x98badcfeU;
+ ctx->state[3] = 0x10325476U;
+ ctx->state[4] = 0xc3d2e1f0U;
+}
+
+void
+SHA1Update(SHA1_CTX *ctx, const void *inptr, size_t input_len)
+{
+ uint32_t i, buf_index, buf_len;
+ const uint8_t *input = inptr;
+#if defined(__amd64)
+ uint32_t block_count;
+#endif /* __amd64 */
+
+ /* check for noop */
+ if (input_len == 0)
+ return;
+
+ /* compute number of bytes mod 64 */
+ buf_index = (ctx->count[1] >> 3) & 0x3F;
+
+ /* update number of bits */
+ if ((ctx->count[1] += (input_len << 3)) < (input_len << 3))
+ ctx->count[0]++;
+
+ ctx->count[0] += (input_len >> 29);
+
+ buf_len = 64 - buf_index;
+
+ /* transform as many times as possible */
+ i = 0;
+ if (input_len >= buf_len) {
+
+ /*
+ * general optimization:
+ *
+ * only do initial bcopy() and SHA1Transform() if
+ * buf_index != 0. if buf_index == 0, we're just
+ * wasting our time doing the bcopy() since there
+ * wasn't any data left over from a previous call to
+ * SHA1Update().
+ */
+
+ if (buf_index) {
+ bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
+ SHA1_TRANSFORM(ctx, ctx->buf_un.buf8);
+ i = buf_len;
+ }
+
+#if !defined(__amd64)
+ for (; i + 63 < input_len; i += 64)
+ SHA1_TRANSFORM(ctx, &input[i]);
+#else
+ block_count = (input_len - i) >> 6;
+ if (block_count > 0) {
+ SHA1_TRANSFORM_BLOCKS(ctx, &input[i], block_count);
+ i += block_count << 6;
+ }
+#endif /* !__amd64 */
+
+ /*
+ * general optimization:
+ *
+ * if i and input_len are the same, return now instead
+ * of calling bcopy(), since the bcopy() in this case
+ * will be an expensive nop.
+ */
+
+ if (input_len == i)
+ return;
+
+ buf_index = 0;
+ }
+
+ /* buffer remaining input */
+ bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
+}
+
+/*
+ * SHA1Final()
+ *
+ * purpose: ends an sha1 digest operation, finalizing the message digest and
+ * zeroing the context.
+ * input: uchar_t * : A buffer to store the digest.
+ * : The function actually uses void* because many
+ * : callers pass things other than uchar_t here.
+ * SHA1_CTX * : the context to finalize, save, and zero
+ * output: void
+ */
+
+void
+SHA1Final(void *digest, SHA1_CTX *ctx)
+{
+ uint8_t bitcount_be[sizeof (ctx->count)];
+ uint32_t index = (ctx->count[1] >> 3) & 0x3f;
+
+ /* store bit count, big endian */
+ Encode(bitcount_be, ctx->count, sizeof (bitcount_be));
+
+ /* pad out to 56 mod 64 */
+ SHA1Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
+
+ /* append length (before padding) */
+ SHA1Update(ctx, bitcount_be, sizeof (bitcount_be));
+
+ /* store state in digest */
+ Encode(digest, ctx->state, sizeof (ctx->state));
+
+ /* zeroize sensitive information */
+ bzero(ctx, sizeof (*ctx));
+}
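+
+/*
+ * Worked example of the padding above: the 0x80 byte, zero fill and 8-byte
+ * bit count always round the message up to a multiple of 64 bytes.  With 0
+ * buffered bytes, 56 - 0 = 56 pad bytes plus the length fill exactly one
+ * block; with 60 buffered bytes, 120 - 60 = 60 pad bytes push the length
+ * into a second block.
+ */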
+
+
+#if !defined(__amd64)
+
+typedef uint32_t sha1word;
+
+/*
+ * sparc optimization:
+ *
+ * on the sparc, we can load big endian 32-bit data easily. note that
+ * special care must be taken to ensure the address is 32-bit aligned.
+ * in the interest of speed, we don't check to make sure, since
+ * careful programming can guarantee this for us.
+ */
+
+#if defined(_ZFS_BIG_ENDIAN)
+#define LOAD_BIG_32(addr) (*(uint32_t *)(addr))
+
+#elif defined(HAVE_HTONL)
+#define LOAD_BIG_32(addr) htonl(*((uint32_t *)(addr)))
+
+#else
+#define LOAD_BIG_32(addr) BE_32(*((uint32_t *)(addr)))
+#endif /* _BIG_ENDIAN */
+
+/*
+ * SHA1Transform()
+ */
+#if defined(W_ARRAY)
+#define W(n) w[n]
+#else /* !defined(W_ARRAY) */
+#define W(n) w_ ## n
+#endif /* !defined(W_ARRAY) */
+
+/*
+ * ROTATE_LEFT rotates x left n bits.
+ */
+
+#if defined(__GNUC__) && defined(_LP64)
+static __inline__ uint64_t
+ROTATE_LEFT(uint64_t value, uint32_t n)
+{
+ uint32_t t32;
+
+ t32 = (uint32_t)value;
+ return ((t32 << n) | (t32 >> (32 - n)));
+}
+
+#else
+
+#define ROTATE_LEFT(x, n) \
+ (((x) << (n)) | ((x) >> ((sizeof (x) * NBBY)-(n))))
+
+#endif
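+
+/*
+ * Worked example: ROTATE_LEFT(0x80000001, 1) == 0x00000003 -- the high bit
+ * shifted out on the left reappears as the low bit, giving the 32-bit
+ * circular rotation SHA-1 requires.
+ */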
+
+#if defined(__sparc)
+
+
+/*
+ * sparc register window optimization:
+ *
+ * `a', `b', `c', `d', and `e' are passed into SHA1Transform
+ * explicitly since it increases the number of registers available to
+ * the compiler. under this scheme, these variables can be held in
+ * %i0 - %i4, which leaves more local and out registers available.
+ *
+ * purpose: sha1 transformation -- updates the digest based on `block'
+ * input: uint32_t : bytes 1 - 4 of the digest
+ * uint32_t : bytes 5 - 8 of the digest
+ * uint32_t : bytes 9 - 12 of the digest
+ *		uint32_t : bytes 13 - 16 of the digest
+ *		uint32_t : bytes 17 - 20 of the digest
+ * SHA1_CTX * : the context to update
+ * uint8_t [64]: the block to use to update the digest
+ * output: void
+ */
+
+
+void
+SHA1Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e,
+ SHA1_CTX *ctx, const uint8_t blk[64])
+{
+ /*
+ * sparc optimization:
+ *
+ * while it is somewhat counter-intuitive, on sparc, it is
+ * more efficient to place all the constants used in this
+ * function in an array and load the values out of the array
+ * than to manually load the constants. this is because
+ * setting a register to a 32-bit value takes two ops in most
+ * cases: a `sethi' and an `or', but loading a 32-bit value
+ * from memory only takes one `ld' (or `lduw' on v9). while
+ * this increases memory usage, the compiler can find enough
+	 * other things to do while waiting, so that the pipeline does
+ * not stall. additionally, it is likely that many of these
+ * constants are cached so that later accesses do not even go
+ * out to the bus.
+ *
+ * this array is declared `static' to keep the compiler from
+ * having to bcopy() this array onto the stack frame of
+ * SHA1Transform() each time it is called -- which is
+ * unacceptably expensive.
+ *
+ * the `const' is to ensure that callers are good citizens and
+ * do not try to munge the array. since these routines are
+ * going to be called from inside multithreaded kernelland,
+ * this is a good safety check. -- `sha1_consts' will end up in
+ * .rodata.
+ *
+ * unfortunately, loading from an array in this manner hurts
+ * performance under Intel. So, there is a macro,
+ * SHA1_CONST(), used in SHA1Transform(), that either expands to
+ * a reference to this array, or to the actual constant,
+ * depending on what platform this code is compiled for.
+ */
+
+
+ static const uint32_t sha1_consts[] = {
+ SHA1_CONST_0, SHA1_CONST_1, SHA1_CONST_2, SHA1_CONST_3
+ };
+
+
+ /*
+ * general optimization:
+ *
+ * use individual integers instead of using an array. this is a
+ * win, although the amount it wins by seems to vary quite a bit.
+ */
+
+
+ uint32_t w_0, w_1, w_2, w_3, w_4, w_5, w_6, w_7;
+ uint32_t w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;
+
+
+ /*
+ * sparc optimization:
+ *
+ * if `block' is already aligned on a 4-byte boundary, use
+ * LOAD_BIG_32() directly. otherwise, bcopy() into a
+ * buffer that *is* aligned on a 4-byte boundary and then do
+ * the LOAD_BIG_32() on that buffer. benchmarks have shown
+ * that using the bcopy() is better than loading the bytes
+ * individually and doing the endian-swap by hand.
+ *
+	 * even though it's quite tempting to just do:
+ *
+ * blk = bcopy(ctx->buf_un.buf32, blk, sizeof (ctx->buf_un.buf32));
+ *
+ * and only have one set of LOAD_BIG_32()'s, the compiler
+ * *does not* like that, so please resist the urge.
+ */
+
+
+ if ((uintptr_t)blk & 0x3) { /* not 4-byte aligned? */
+ bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
+ w_15 = LOAD_BIG_32(ctx->buf_un.buf32 + 15);
+ w_14 = LOAD_BIG_32(ctx->buf_un.buf32 + 14);
+ w_13 = LOAD_BIG_32(ctx->buf_un.buf32 + 13);
+ w_12 = LOAD_BIG_32(ctx->buf_un.buf32 + 12);
+ w_11 = LOAD_BIG_32(ctx->buf_un.buf32 + 11);
+ w_10 = LOAD_BIG_32(ctx->buf_un.buf32 + 10);
+ w_9 = LOAD_BIG_32(ctx->buf_un.buf32 + 9);
+ w_8 = LOAD_BIG_32(ctx->buf_un.buf32 + 8);
+ w_7 = LOAD_BIG_32(ctx->buf_un.buf32 + 7);
+ w_6 = LOAD_BIG_32(ctx->buf_un.buf32 + 6);
+ w_5 = LOAD_BIG_32(ctx->buf_un.buf32 + 5);
+ w_4 = LOAD_BIG_32(ctx->buf_un.buf32 + 4);
+ w_3 = LOAD_BIG_32(ctx->buf_un.buf32 + 3);
+ w_2 = LOAD_BIG_32(ctx->buf_un.buf32 + 2);
+ w_1 = LOAD_BIG_32(ctx->buf_un.buf32 + 1);
+ w_0 = LOAD_BIG_32(ctx->buf_un.buf32 + 0);
+ } else {
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_15 = LOAD_BIG_32(blk + 60);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_14 = LOAD_BIG_32(blk + 56);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_13 = LOAD_BIG_32(blk + 52);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_12 = LOAD_BIG_32(blk + 48);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_11 = LOAD_BIG_32(blk + 44);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_10 = LOAD_BIG_32(blk + 40);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_9 = LOAD_BIG_32(blk + 36);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_8 = LOAD_BIG_32(blk + 32);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_7 = LOAD_BIG_32(blk + 28);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_6 = LOAD_BIG_32(blk + 24);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_5 = LOAD_BIG_32(blk + 20);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_4 = LOAD_BIG_32(blk + 16);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_3 = LOAD_BIG_32(blk + 12);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_2 = LOAD_BIG_32(blk + 8);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_1 = LOAD_BIG_32(blk + 4);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_0 = LOAD_BIG_32(blk + 0);
+ }
+#else /* !defined(__sparc) */
+
+void /* CSTYLED */
+SHA1Transform(SHA1_CTX *ctx, const uint8_t blk[64])
+{
+ /* CSTYLED */
+ sha1word a = ctx->state[0];
+ sha1word b = ctx->state[1];
+ sha1word c = ctx->state[2];
+ sha1word d = ctx->state[3];
+ sha1word e = ctx->state[4];
+
+#if defined(W_ARRAY)
+ sha1word w[16];
+#else /* !defined(W_ARRAY) */
+ sha1word w_0, w_1, w_2, w_3, w_4, w_5, w_6, w_7;
+ sha1word w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;
+#endif /* !defined(W_ARRAY) */
+
+ W(0) = LOAD_BIG_32((void *)(blk + 0));
+ W(1) = LOAD_BIG_32((void *)(blk + 4));
+ W(2) = LOAD_BIG_32((void *)(blk + 8));
+ W(3) = LOAD_BIG_32((void *)(blk + 12));
+ W(4) = LOAD_BIG_32((void *)(blk + 16));
+ W(5) = LOAD_BIG_32((void *)(blk + 20));
+ W(6) = LOAD_BIG_32((void *)(blk + 24));
+ W(7) = LOAD_BIG_32((void *)(blk + 28));
+ W(8) = LOAD_BIG_32((void *)(blk + 32));
+ W(9) = LOAD_BIG_32((void *)(blk + 36));
+ W(10) = LOAD_BIG_32((void *)(blk + 40));
+ W(11) = LOAD_BIG_32((void *)(blk + 44));
+ W(12) = LOAD_BIG_32((void *)(blk + 48));
+ W(13) = LOAD_BIG_32((void *)(blk + 52));
+ W(14) = LOAD_BIG_32((void *)(blk + 56));
+ W(15) = LOAD_BIG_32((void *)(blk + 60));
+
+#endif /* !defined(__sparc) */
+
+ /*
+ * general optimization:
+ *
+ * even though this approach is described in the standard as
+ * being slower algorithmically, it is 30-40% faster than the
+ * "faster" version under SPARC, because this version has more
+ * of the constraints specified at compile-time and uses fewer
+ * variables (and therefore has better register utilization)
+ * than its "speedier" brother. (i've tried both, trust me)
+ *
+ * for either method given in the spec, there is an "assignment"
+ * phase where the following takes place:
+ *
+ * tmp = (main_computation);
+ * e = d; d = c; c = rotate_left(b, 30); b = a; a = tmp;
+ *
+ * we can make the algorithm go faster by not doing this work,
+ * but just pretending that `d' is now `e', etc. this works
+ * really well and obviates the need for a temporary variable.
+ * however, we still explicitly perform the rotate action,
+ * since it is cheaper on SPARC to do it once than to have to
+ * do it over and over again.
+ */
+
+ /* round 1 */
+ e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(0) + SHA1_CONST(0); /* 0 */
+ b = ROTATE_LEFT(b, 30);
+
+ d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(1) + SHA1_CONST(0); /* 1 */
+ a = ROTATE_LEFT(a, 30);
+
+ c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(2) + SHA1_CONST(0); /* 2 */
+ e = ROTATE_LEFT(e, 30);
+
+ b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(3) + SHA1_CONST(0); /* 3 */
+ d = ROTATE_LEFT(d, 30);
+
+ a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(4) + SHA1_CONST(0); /* 4 */
+ c = ROTATE_LEFT(c, 30);
+
+ e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(5) + SHA1_CONST(0); /* 5 */
+ b = ROTATE_LEFT(b, 30);
+
+ d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(6) + SHA1_CONST(0); /* 6 */
+ a = ROTATE_LEFT(a, 30);
+
+ c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(7) + SHA1_CONST(0); /* 7 */
+ e = ROTATE_LEFT(e, 30);
+
+ b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(8) + SHA1_CONST(0); /* 8 */
+ d = ROTATE_LEFT(d, 30);
+
+ a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(9) + SHA1_CONST(0); /* 9 */
+ c = ROTATE_LEFT(c, 30);
+
+ e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(10) + SHA1_CONST(0); /* 10 */
+ b = ROTATE_LEFT(b, 30);
+
+ d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(11) + SHA1_CONST(0); /* 11 */
+ a = ROTATE_LEFT(a, 30);
+
+ c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(12) + SHA1_CONST(0); /* 12 */
+ e = ROTATE_LEFT(e, 30);
+
+ b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(13) + SHA1_CONST(0); /* 13 */
+ d = ROTATE_LEFT(d, 30);
+
+ a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(14) + SHA1_CONST(0); /* 14 */
+ c = ROTATE_LEFT(c, 30);
+
+ e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(15) + SHA1_CONST(0); /* 15 */
+ b = ROTATE_LEFT(b, 30);
+
+ W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 16 */
+ d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(0) + SHA1_CONST(0);
+ a = ROTATE_LEFT(a, 30);
+
+ W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 17 */
+ c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(1) + SHA1_CONST(0);
+ e = ROTATE_LEFT(e, 30);
+
+ W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 18 */
+ b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(2) + SHA1_CONST(0);
+ d = ROTATE_LEFT(d, 30);
+
+ W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 19 */
+ a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(3) + SHA1_CONST(0);
+ c = ROTATE_LEFT(c, 30);
+
+ /* round 2 */
+ W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 20 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(4) + SHA1_CONST(1);
+ b = ROTATE_LEFT(b, 30);
+
+ W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 21 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(5) + SHA1_CONST(1);
+ a = ROTATE_LEFT(a, 30);
+
+ W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 22 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(6) + SHA1_CONST(1);
+ e = ROTATE_LEFT(e, 30);
+
+ W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 23 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(7) + SHA1_CONST(1);
+ d = ROTATE_LEFT(d, 30);
+
+ W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 24 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(8) + SHA1_CONST(1);
+ c = ROTATE_LEFT(c, 30);
+
+ W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 25 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(9) + SHA1_CONST(1);
+ b = ROTATE_LEFT(b, 30);
+
+ W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 26 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(10) + SHA1_CONST(1);
+ a = ROTATE_LEFT(a, 30);
+
+ W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 27 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(11) + SHA1_CONST(1);
+ e = ROTATE_LEFT(e, 30);
+
+ W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 28 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(12) + SHA1_CONST(1);
+ d = ROTATE_LEFT(d, 30);
+
+ W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 29 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(13) + SHA1_CONST(1);
+ c = ROTATE_LEFT(c, 30);
+
+ W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 30 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(14) + SHA1_CONST(1);
+ b = ROTATE_LEFT(b, 30);
+
+ W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 31 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(15) + SHA1_CONST(1);
+ a = ROTATE_LEFT(a, 30);
+
+ W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 32 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(0) + SHA1_CONST(1);
+ e = ROTATE_LEFT(e, 30);
+
+ W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 33 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(1) + SHA1_CONST(1);
+ d = ROTATE_LEFT(d, 30);
+
+ W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 34 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(2) + SHA1_CONST(1);
+ c = ROTATE_LEFT(c, 30);
+
+ W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 35 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(3) + SHA1_CONST(1);
+ b = ROTATE_LEFT(b, 30);
+
+ W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 36 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(4) + SHA1_CONST(1);
+ a = ROTATE_LEFT(a, 30);
+
+ W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 37 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(5) + SHA1_CONST(1);
+ e = ROTATE_LEFT(e, 30);
+
+ W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 38 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(6) + SHA1_CONST(1);
+ d = ROTATE_LEFT(d, 30);
+
+ W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 39 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(7) + SHA1_CONST(1);
+ c = ROTATE_LEFT(c, 30);
+
+ /* round 3 */
+ W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 40 */
+ e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(8) + SHA1_CONST(2);
+ b = ROTATE_LEFT(b, 30);
+
+ W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 41 */
+ d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(9) + SHA1_CONST(2);
+ a = ROTATE_LEFT(a, 30);
+
+ W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 42 */
+ c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(10) + SHA1_CONST(2);
+ e = ROTATE_LEFT(e, 30);
+
+ W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 43 */
+ b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(11) + SHA1_CONST(2);
+ d = ROTATE_LEFT(d, 30);
+
+ W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 44 */
+ a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(12) + SHA1_CONST(2);
+ c = ROTATE_LEFT(c, 30);
+
+ W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 45 */
+ e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(13) + SHA1_CONST(2);
+ b = ROTATE_LEFT(b, 30);
+
+ W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 46 */
+ d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(14) + SHA1_CONST(2);
+ a = ROTATE_LEFT(a, 30);
+
+ W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 47 */
+ c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(15) + SHA1_CONST(2);
+ e = ROTATE_LEFT(e, 30);
+
+ W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 48 */
+ b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(0) + SHA1_CONST(2);
+ d = ROTATE_LEFT(d, 30);
+
+ W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 49 */
+ a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(1) + SHA1_CONST(2);
+ c = ROTATE_LEFT(c, 30);
+
+ W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 50 */
+ e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(2) + SHA1_CONST(2);
+ b = ROTATE_LEFT(b, 30);
+
+ W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 51 */
+ d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(3) + SHA1_CONST(2);
+ a = ROTATE_LEFT(a, 30);
+
+ W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 52 */
+ c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(4) + SHA1_CONST(2);
+ e = ROTATE_LEFT(e, 30);
+
+ W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 53 */
+ b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(5) + SHA1_CONST(2);
+ d = ROTATE_LEFT(d, 30);
+
+ W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 54 */
+ a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(6) + SHA1_CONST(2);
+ c = ROTATE_LEFT(c, 30);
+
+ W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 55 */
+ e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(7) + SHA1_CONST(2);
+ b = ROTATE_LEFT(b, 30);
+
+ W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 56 */
+ d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(8) + SHA1_CONST(2);
+ a = ROTATE_LEFT(a, 30);
+
+ W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 57 */
+ c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(9) + SHA1_CONST(2);
+ e = ROTATE_LEFT(e, 30);
+
+ W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 58 */
+ b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(10) + SHA1_CONST(2);
+ d = ROTATE_LEFT(d, 30);
+
+ W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 59 */
+ a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(11) + SHA1_CONST(2);
+ c = ROTATE_LEFT(c, 30);
+
+ /* round 4 */
+ W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 60 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(12) + SHA1_CONST(3);
+ b = ROTATE_LEFT(b, 30);
+
+ W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 61 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(13) + SHA1_CONST(3);
+ a = ROTATE_LEFT(a, 30);
+
+ W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 62 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(14) + SHA1_CONST(3);
+ e = ROTATE_LEFT(e, 30);
+
+ W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 63 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(15) + SHA1_CONST(3);
+ d = ROTATE_LEFT(d, 30);
+
+ W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 64 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(0) + SHA1_CONST(3);
+ c = ROTATE_LEFT(c, 30);
+
+ W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 65 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(1) + SHA1_CONST(3);
+ b = ROTATE_LEFT(b, 30);
+
+ W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 66 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(2) + SHA1_CONST(3);
+ a = ROTATE_LEFT(a, 30);
+
+ W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 67 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(3) + SHA1_CONST(3);
+ e = ROTATE_LEFT(e, 30);
+
+ W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 68 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(4) + SHA1_CONST(3);
+ d = ROTATE_LEFT(d, 30);
+
+ W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 69 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(5) + SHA1_CONST(3);
+ c = ROTATE_LEFT(c, 30);
+
+ W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 70 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(6) + SHA1_CONST(3);
+ b = ROTATE_LEFT(b, 30);
+
+ W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 71 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(7) + SHA1_CONST(3);
+ a = ROTATE_LEFT(a, 30);
+
+ W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 72 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(8) + SHA1_CONST(3);
+ e = ROTATE_LEFT(e, 30);
+
+ W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 73 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(9) + SHA1_CONST(3);
+ d = ROTATE_LEFT(d, 30);
+
+ W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 74 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(10) + SHA1_CONST(3);
+ c = ROTATE_LEFT(c, 30);
+
+ W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 75 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(11) + SHA1_CONST(3);
+ b = ROTATE_LEFT(b, 30);
+
+ W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 76 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(12) + SHA1_CONST(3);
+ a = ROTATE_LEFT(a, 30);
+
+ W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 77 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(13) + SHA1_CONST(3);
+ e = ROTATE_LEFT(e, 30);
+
+ W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 78 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(14) + SHA1_CONST(3);
+ d = ROTATE_LEFT(d, 30);
+
+ W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 79 */
+
+ ctx->state[0] += ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(15) +
+ SHA1_CONST(3);
+ ctx->state[1] += b;
+ ctx->state[2] += ROTATE_LEFT(c, 30);
+ ctx->state[3] += d;
+ ctx->state[4] += e;
+
+ /* zeroize sensitive information */
+ W(0) = W(1) = W(2) = W(3) = W(4) = W(5) = W(6) = W(7) = W(8) = 0;
+ W(9) = W(10) = W(11) = W(12) = W(13) = W(14) = W(15) = 0;
+}
+#endif /* !__amd64 */
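
The renaming trick described in the block comment inside SHA1Transform() above is easy to check in isolation. Below is a minimal standalone sketch (not part of this module) with a stand-in round function: after two rounds, the textbook register shuffle and the unrolled, renamed form hold the same values, merely in permuted variables.

/*
 * Illustration only: MIX() is an arbitrary round-shaped function, not SHA-1.
 */
#include <assert.h>
#include <stdint.h>

#define ROTL32(x, n)    (((x) << (n)) | ((x) >> (32 - (n))))
#define MIX(a, b, c, d, e, w)   (ROTL32(a, 5) + ((b) ^ (c) ^ (d)) + (e) + (w))

int
main(void)
{
        uint32_t a = 1, b = 2, c = 3, d = 4, e = 5;     /* textbook copy */
        uint32_t A = 1, B = 2, C = 3, D = 4, E = 5;     /* unrolled copy */
        uint32_t w[2] = { 0x1111, 0x2222 };
        int i;

        /* textbook form: compute tmp, then shift every register down */
        for (i = 0; i < 2; i++) {
                uint32_t tmp = MIX(a, b, c, d, e, w[i]);
                e = d; d = c; c = ROTL32(b, 30); b = a; a = tmp;
        }

        /* unrolled form: no shuffling, the variables simply change roles */
        E = MIX(A, B, C, D, E, w[0]); B = ROTL32(B, 30);        /* round 0 */
        D = MIX(E, A, B, C, D, w[1]); A = ROTL32(A, 30);        /* round 1 */

        /* same state, held in permuted variables */
        assert(a == D && b == E && c == A && d == B && e == C);
        return (0);
}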
+
+
+/*
+ * Encode()
+ *
+ * purpose: to convert a list of numbers from host byte order to big endian
+ * input: uint8_t * : place to store the converted big endian numbers
+ * uint32_t * : place to get numbers to convert from
+ * size_t : the length of the input in bytes
+ * output: void
+ */
+
+static void
+Encode(uint8_t *_RESTRICT_KYWD output, const uint32_t *_RESTRICT_KYWD input,
+ size_t len)
+{
+ size_t i, j;
+
+#if defined(__sparc)
+ if (IS_P2ALIGNED(output, sizeof (uint32_t))) {
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ *((uint32_t *)(output + j)) = input[i];
+ }
+ } else {
+#endif /* little endian -- will work on big endian, but slowly */
+
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ output[j] = (input[i] >> 24) & 0xff;
+ output[j + 1] = (input[i] >> 16) & 0xff;
+ output[j + 2] = (input[i] >> 8) & 0xff;
+ output[j + 3] = input[i] & 0xff;
+ }
+#if defined(__sparc)
+ }
+#endif
+}
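
As a reference point, the byte layout Encode() emits can be pinned down with a tiny standalone check (re-implemented here so it compiles on its own): the most significant byte of each word always lands first in the output, whatever the host byte order.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void
encode_be32(uint8_t *out, const uint32_t *in, size_t len)
{
        size_t i, j;

        for (i = 0, j = 0; j < len; i++, j += 4) {
                out[j] = (in[i] >> 24) & 0xff;
                out[j + 1] = (in[i] >> 16) & 0xff;
                out[j + 2] = (in[i] >> 8) & 0xff;
                out[j + 3] = in[i] & 0xff;
        }
}

int
main(void)
{
        uint32_t words[2] = { 0x01020304, 0xa0b0c0d0 };
        uint8_t out[8];

        encode_be32(out, words, sizeof (out));
        assert(out[0] == 0x01 && out[3] == 0x04);       /* MSB first */
        assert(out[4] == 0xa0 && out[7] == 0xd0);
        return (0);
}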
diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha2.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha2.c
new file mode 100644
index 000000000000..75f6a3c1af4b
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha2.c
@@ -0,0 +1,956 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+/*
+ * The basic framework for this code came from the reference
+ * implementation for MD5. That implementation is Copyright (C)
+ * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved.
+ *
+ * License to copy and use this software is granted provided that it
+ * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ * Algorithm" in all material mentioning or referencing this software
+ * or this function.
+ *
+ * License is also granted to make and use derivative works provided
+ * that such works are identified as "derived from the RSA Data
+ * Security, Inc. MD5 Message-Digest Algorithm" in all material
+ * mentioning or referencing the derived work.
+ *
+ * RSA Data Security, Inc. makes no representations concerning either
+ * the merchantability of this software or the suitability of this
+ * software for any particular purpose. It is provided "as is"
+ * without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this
+ * documentation and/or software.
+ *
+ * NOTE: Cleaned-up and optimized, version of SHA2, based on the FIPS 180-2
+ * standard, available at
+ * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2.pdf
+ * Not as fast as one would like -- further optimizations are encouraged
+ * and appreciated.
+ */
+
+#include <sys/zfs_context.h>
+#define _SHA2_IMPL
+#include <sys/sha2.h>
+#include <sha2/sha2_consts.h>
+
+#define _RESTRICT_KYWD
+
+#ifdef _ZFS_LITTLE_ENDIAN
+#include <sys/byteorder.h>
+#define HAVE_HTONL
+#endif
+#include <sys/isa_defs.h> /* for _ILP32 */
+
+static void Encode(uint8_t *, uint32_t *, size_t);
+static void Encode64(uint8_t *, uint64_t *, size_t);
+
+/* userspace only supports the generic version */
+#if defined(__amd64) && defined(_KERNEL)
+#define SHA512Transform(ctx, in) SHA512TransformBlocks((ctx), (in), 1)
+#define SHA256Transform(ctx, in) SHA256TransformBlocks((ctx), (in), 1)
+
+void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
+void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
+
+#else
+static void SHA256Transform(SHA2_CTX *, const uint8_t *);
+static void SHA512Transform(SHA2_CTX *, const uint8_t *);
+#endif /* __amd64 && _KERNEL */
+
+static uint8_t PADDING[128] = { 0x80, /* all zeros */ };
+
+/*
+ * The low-level checksum routines use a lot of stack space. On systems where
+ * small stacks are enforced (like 32-bit kernel builds), insert compiler memory
+ * barriers to reduce stack frame size. This can reduce the SHA512Transform()
+ * stack frame usage from 3k to <1k on ARM32, for example.
+ */
+#if defined(_ILP32) || defined(__powerpc) /* small stack */
+#define SMALL_STACK_MEMORY_BARRIER asm volatile("": : :"memory");
+#else
+#define SMALL_STACK_MEMORY_BARRIER
+#endif
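
For readers unfamiliar with the construct, the barrier above is a compiler-only fence written in gcc/clang extended-asm form; a minimal standalone sketch of the same idea follows. It emits no instruction, but the "memory" clobber stops the optimizer from caching memory-backed values across it, which (per the comment above) is what keeps the live state, and hence the stack frame, of the unrolled transform small on 32-bit builds.

/* Sketch only; assumes a gcc/clang-compatible compiler. */
#include <stddef.h>
#include <stdint.h>

#define COMPILER_BARRIER()      asm volatile("" : : : "memory")

uint64_t
sum64(const uint64_t *w, size_t n)
{
        uint64_t acc = 0;
        size_t i;

        for (i = 0; i < n; i++) {
                acc += w[i];
                /* no code emitted; memory may not be cached across this point */
                COMPILER_BARRIER();
        }
        return (acc);
}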
+
+/* Ch and Maj are the basic SHA2 functions. */
+#define Ch(b, c, d) (((b) & (c)) ^ ((~b) & (d)))
+#define Maj(b, c, d) (((b) & (c)) ^ ((b) & (d)) ^ ((c) & (d)))
+
+/* Rotates x right n bits. */
+#define ROTR(x, n) \
+ (((x) >> (n)) | ((x) << ((sizeof (x) * NBBY)-(n))))
+
+/* Shift x right n bits */
+#define SHR(x, n) ((x) >> (n))
+
+/* SHA256 Functions */
+#define BIGSIGMA0_256(x) (ROTR((x), 2) ^ ROTR((x), 13) ^ ROTR((x), 22))
+#define BIGSIGMA1_256(x) (ROTR((x), 6) ^ ROTR((x), 11) ^ ROTR((x), 25))
+#define SIGMA0_256(x) (ROTR((x), 7) ^ ROTR((x), 18) ^ SHR((x), 3))
+#define SIGMA1_256(x) (ROTR((x), 17) ^ ROTR((x), 19) ^ SHR((x), 10))
+
+#define SHA256ROUND(a, b, c, d, e, f, g, h, i, w) \
+ T1 = h + BIGSIGMA1_256(e) + Ch(e, f, g) + SHA256_CONST(i) + w; \
+ d += T1; \
+ T2 = BIGSIGMA0_256(a) + Maj(a, b, c); \
+ h = T1 + T2
+
+/* SHA384/512 Functions */
+#define BIGSIGMA0(x) (ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39))
+#define BIGSIGMA1(x) (ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41))
+#define SIGMA0(x) (ROTR((x), 1) ^ ROTR((x), 8) ^ SHR((x), 7))
+#define SIGMA1(x) (ROTR((x), 19) ^ ROTR((x), 61) ^ SHR((x), 6))
+#define SHA512ROUND(a, b, c, d, e, f, g, h, i, w) \
+ T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + SHA512_CONST(i) + w; \
+ d += T1; \
+ T2 = BIGSIGMA0(a) + Maj(a, b, c); \
+ h = T1 + T2; \
+ SMALL_STACK_MEMORY_BARRIER;
+
+/*
+ * sparc optimization:
+ *
+ * on the sparc, we can load big endian 32-bit data easily. note that
+ * special care must be taken to ensure the address is 32-bit aligned.
+ * in the interest of speed, we don't check to make sure, since
+ * careful programming can guarantee this for us.
+ */
+
+#if defined(_ZFS_BIG_ENDIAN)
+#define LOAD_BIG_32(addr) (*(uint32_t *)(addr))
+#define LOAD_BIG_64(addr) (*(uint64_t *)(addr))
+
+#elif defined(HAVE_HTONL)
+#define LOAD_BIG_32(addr) htonl(*((uint32_t *)(addr)))
+#define LOAD_BIG_64(addr) htonll(*((uint64_t *)(addr)))
+
+#else
+/* little endian -- will work on big endian, but slowly */
+#define LOAD_BIG_32(addr) \
+ (((addr)[0] << 24) | ((addr)[1] << 16) | ((addr)[2] << 8) | (addr)[3])
+#define LOAD_BIG_64(addr) \
+ (((uint64_t)(addr)[0] << 56) | ((uint64_t)(addr)[1] << 48) | \
+ ((uint64_t)(addr)[2] << 40) | ((uint64_t)(addr)[3] << 32) | \
+ ((uint64_t)(addr)[4] << 24) | ((uint64_t)(addr)[5] << 16) | \
+ ((uint64_t)(addr)[6] << 8) | (uint64_t)(addr)[7])
+#endif /* _ZFS_BIG_ENDIAN */
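
A standalone spot-check of the fallback byte-assembly form (explicit casts added here, which the sketch needs for strict portability): it always reads the most significant byte first, i.e. it is a true big-endian load on any host.

#include <assert.h>
#include <stdint.h>

static uint32_t
load_be32(const uint8_t *p)
{
        return (((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
            ((uint32_t)p[2] << 8) | (uint32_t)p[3]);
}

int
main(void)
{
        const uint8_t buf[4] = { 0xde, 0xad, 0xbe, 0xef };

        assert(load_be32(buf) == 0xdeadbeefU);
        return (0);
}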
+
+
+#if !defined(__amd64) || !defined(_KERNEL)
+/* SHA256 Transform */
+
+static void
+SHA256Transform(SHA2_CTX *ctx, const uint8_t *blk)
+{
+ uint32_t a = ctx->state.s32[0];
+ uint32_t b = ctx->state.s32[1];
+ uint32_t c = ctx->state.s32[2];
+ uint32_t d = ctx->state.s32[3];
+ uint32_t e = ctx->state.s32[4];
+ uint32_t f = ctx->state.s32[5];
+ uint32_t g = ctx->state.s32[6];
+ uint32_t h = ctx->state.s32[7];
+
+ uint32_t w0, w1, w2, w3, w4, w5, w6, w7;
+ uint32_t w8, w9, w10, w11, w12, w13, w14, w15;
+ uint32_t T1, T2;
+
+#if defined(__sparc)
+ static const uint32_t sha256_consts[] = {
+ SHA256_CONST_0, SHA256_CONST_1, SHA256_CONST_2,
+ SHA256_CONST_3, SHA256_CONST_4, SHA256_CONST_5,
+ SHA256_CONST_6, SHA256_CONST_7, SHA256_CONST_8,
+ SHA256_CONST_9, SHA256_CONST_10, SHA256_CONST_11,
+ SHA256_CONST_12, SHA256_CONST_13, SHA256_CONST_14,
+ SHA256_CONST_15, SHA256_CONST_16, SHA256_CONST_17,
+ SHA256_CONST_18, SHA256_CONST_19, SHA256_CONST_20,
+ SHA256_CONST_21, SHA256_CONST_22, SHA256_CONST_23,
+ SHA256_CONST_24, SHA256_CONST_25, SHA256_CONST_26,
+ SHA256_CONST_27, SHA256_CONST_28, SHA256_CONST_29,
+ SHA256_CONST_30, SHA256_CONST_31, SHA256_CONST_32,
+ SHA256_CONST_33, SHA256_CONST_34, SHA256_CONST_35,
+ SHA256_CONST_36, SHA256_CONST_37, SHA256_CONST_38,
+ SHA256_CONST_39, SHA256_CONST_40, SHA256_CONST_41,
+ SHA256_CONST_42, SHA256_CONST_43, SHA256_CONST_44,
+ SHA256_CONST_45, SHA256_CONST_46, SHA256_CONST_47,
+ SHA256_CONST_48, SHA256_CONST_49, SHA256_CONST_50,
+ SHA256_CONST_51, SHA256_CONST_52, SHA256_CONST_53,
+ SHA256_CONST_54, SHA256_CONST_55, SHA256_CONST_56,
+ SHA256_CONST_57, SHA256_CONST_58, SHA256_CONST_59,
+ SHA256_CONST_60, SHA256_CONST_61, SHA256_CONST_62,
+ SHA256_CONST_63
+ };
+#endif /* __sparc */
+
+ if ((uintptr_t)blk & 0x3) { /* not 4-byte aligned? */
+ bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
+ blk = (uint8_t *)ctx->buf_un.buf32;
+ }
+
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w0 = LOAD_BIG_32(blk + 4 * 0);
+ SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w1 = LOAD_BIG_32(blk + 4 * 1);
+ SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w2 = LOAD_BIG_32(blk + 4 * 2);
+ SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w3 = LOAD_BIG_32(blk + 4 * 3);
+ SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w4 = LOAD_BIG_32(blk + 4 * 4);
+ SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w5 = LOAD_BIG_32(blk + 4 * 5);
+ SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w6 = LOAD_BIG_32(blk + 4 * 6);
+ SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w7 = LOAD_BIG_32(blk + 4 * 7);
+ SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w8 = LOAD_BIG_32(blk + 4 * 8);
+ SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w9 = LOAD_BIG_32(blk + 4 * 9);
+ SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w10 = LOAD_BIG_32(blk + 4 * 10);
+ SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w11 = LOAD_BIG_32(blk + 4 * 11);
+ SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w12 = LOAD_BIG_32(blk + 4 * 12);
+ SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w13 = LOAD_BIG_32(blk + 4 * 13);
+ SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w14 = LOAD_BIG_32(blk + 4 * 14);
+ SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w15 = LOAD_BIG_32(blk + 4 * 15);
+ SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15);
+
+ w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0);
+ w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1);
+ w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2);
+ w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3);
+ w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4);
+ w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5);
+ w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6);
+ w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7);
+ w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8);
+ w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9);
+ w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10);
+ w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11);
+ w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12);
+ w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13);
+ w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14);
+ w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15);
+
+ w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0);
+ w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1);
+ w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2);
+ w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3);
+ w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4);
+ w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5);
+ w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6);
+ w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7);
+ w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8);
+ w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9);
+ w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10);
+ w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11);
+ w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12);
+ w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13);
+ w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14);
+ w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15);
+
+ w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0);
+ w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1);
+ w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2);
+ w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3);
+ w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4);
+ w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5);
+ w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6);
+ w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7);
+ w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8);
+ w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9);
+ w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10);
+ w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11);
+ w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12);
+ w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13);
+ w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14);
+ w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15);
+
+ ctx->state.s32[0] += a;
+ ctx->state.s32[1] += b;
+ ctx->state.s32[2] += c;
+ ctx->state.s32[3] += d;
+ ctx->state.s32[4] += e;
+ ctx->state.s32[5] += f;
+ ctx->state.s32[6] += g;
+ ctx->state.s32[7] += h;
+}
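
A quick standalone spot-check of the rotate/shift primitives driving the rounds above, with the macros re-declared locally so the sketch is self-contained; both values are easy to verify by hand.

#include <assert.h>
#include <stdint.h>

#define ROTR32(x, n)    (((x) >> (n)) | ((x) << (32 - (n))))
#define SHR32(x, n)     ((x) >> (n))
#define SIG0_256(x)     (ROTR32((x), 7) ^ ROTR32((x), 18) ^ SHR32((x), 3))

int
main(void)
{
        /* rotating the top bit right by one lands it in bit 30 */
        assert(ROTR32((uint32_t)0x80000000U, 1) == 0x40000000U);
        /* sigma0(1): bit 0 rotates to bits 25 and 14, and is shifted away */
        assert(SIG0_256((uint32_t)1U) == 0x02004000U);
        return (0);
}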
+
+
+/* SHA384 and SHA512 Transform */
+
+static void
+SHA512Transform(SHA2_CTX *ctx, const uint8_t *blk)
+{
+
+ uint64_t a = ctx->state.s64[0];
+ uint64_t b = ctx->state.s64[1];
+ uint64_t c = ctx->state.s64[2];
+ uint64_t d = ctx->state.s64[3];
+ uint64_t e = ctx->state.s64[4];
+ uint64_t f = ctx->state.s64[5];
+ uint64_t g = ctx->state.s64[6];
+ uint64_t h = ctx->state.s64[7];
+
+ uint64_t w0, w1, w2, w3, w4, w5, w6, w7;
+ uint64_t w8, w9, w10, w11, w12, w13, w14, w15;
+ uint64_t T1, T2;
+
+#if defined(__sparc)
+ static const uint64_t sha512_consts[] = {
+ SHA512_CONST_0, SHA512_CONST_1, SHA512_CONST_2,
+ SHA512_CONST_3, SHA512_CONST_4, SHA512_CONST_5,
+ SHA512_CONST_6, SHA512_CONST_7, SHA512_CONST_8,
+ SHA512_CONST_9, SHA512_CONST_10, SHA512_CONST_11,
+ SHA512_CONST_12, SHA512_CONST_13, SHA512_CONST_14,
+ SHA512_CONST_15, SHA512_CONST_16, SHA512_CONST_17,
+ SHA512_CONST_18, SHA512_CONST_19, SHA512_CONST_20,
+ SHA512_CONST_21, SHA512_CONST_22, SHA512_CONST_23,
+ SHA512_CONST_24, SHA512_CONST_25, SHA512_CONST_26,
+ SHA512_CONST_27, SHA512_CONST_28, SHA512_CONST_29,
+ SHA512_CONST_30, SHA512_CONST_31, SHA512_CONST_32,
+ SHA512_CONST_33, SHA512_CONST_34, SHA512_CONST_35,
+ SHA512_CONST_36, SHA512_CONST_37, SHA512_CONST_38,
+ SHA512_CONST_39, SHA512_CONST_40, SHA512_CONST_41,
+ SHA512_CONST_42, SHA512_CONST_43, SHA512_CONST_44,
+ SHA512_CONST_45, SHA512_CONST_46, SHA512_CONST_47,
+ SHA512_CONST_48, SHA512_CONST_49, SHA512_CONST_50,
+ SHA512_CONST_51, SHA512_CONST_52, SHA512_CONST_53,
+ SHA512_CONST_54, SHA512_CONST_55, SHA512_CONST_56,
+ SHA512_CONST_57, SHA512_CONST_58, SHA512_CONST_59,
+ SHA512_CONST_60, SHA512_CONST_61, SHA512_CONST_62,
+ SHA512_CONST_63, SHA512_CONST_64, SHA512_CONST_65,
+ SHA512_CONST_66, SHA512_CONST_67, SHA512_CONST_68,
+ SHA512_CONST_69, SHA512_CONST_70, SHA512_CONST_71,
+ SHA512_CONST_72, SHA512_CONST_73, SHA512_CONST_74,
+ SHA512_CONST_75, SHA512_CONST_76, SHA512_CONST_77,
+ SHA512_CONST_78, SHA512_CONST_79
+ };
+#endif /* __sparc */
+
+
+ if ((uintptr_t)blk & 0x7) { /* not 8-byte aligned? */
+ bcopy(blk, ctx->buf_un.buf64, sizeof (ctx->buf_un.buf64));
+ blk = (uint8_t *)ctx->buf_un.buf64;
+ }
+
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w0 = LOAD_BIG_64(blk + 8 * 0);
+ SHA512ROUND(a, b, c, d, e, f, g, h, 0, w0);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w1 = LOAD_BIG_64(blk + 8 * 1);
+ SHA512ROUND(h, a, b, c, d, e, f, g, 1, w1);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w2 = LOAD_BIG_64(blk + 8 * 2);
+ SHA512ROUND(g, h, a, b, c, d, e, f, 2, w2);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w3 = LOAD_BIG_64(blk + 8 * 3);
+ SHA512ROUND(f, g, h, a, b, c, d, e, 3, w3);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w4 = LOAD_BIG_64(blk + 8 * 4);
+ SHA512ROUND(e, f, g, h, a, b, c, d, 4, w4);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w5 = LOAD_BIG_64(blk + 8 * 5);
+ SHA512ROUND(d, e, f, g, h, a, b, c, 5, w5);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w6 = LOAD_BIG_64(blk + 8 * 6);
+ SHA512ROUND(c, d, e, f, g, h, a, b, 6, w6);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w7 = LOAD_BIG_64(blk + 8 * 7);
+ SHA512ROUND(b, c, d, e, f, g, h, a, 7, w7);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w8 = LOAD_BIG_64(blk + 8 * 8);
+ SHA512ROUND(a, b, c, d, e, f, g, h, 8, w8);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w9 = LOAD_BIG_64(blk + 8 * 9);
+ SHA512ROUND(h, a, b, c, d, e, f, g, 9, w9);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w10 = LOAD_BIG_64(blk + 8 * 10);
+ SHA512ROUND(g, h, a, b, c, d, e, f, 10, w10);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w11 = LOAD_BIG_64(blk + 8 * 11);
+ SHA512ROUND(f, g, h, a, b, c, d, e, 11, w11);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w12 = LOAD_BIG_64(blk + 8 * 12);
+ SHA512ROUND(e, f, g, h, a, b, c, d, 12, w12);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w13 = LOAD_BIG_64(blk + 8 * 13);
+ SHA512ROUND(d, e, f, g, h, a, b, c, 13, w13);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w14 = LOAD_BIG_64(blk + 8 * 14);
+ SHA512ROUND(c, d, e, f, g, h, a, b, 14, w14);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w15 = LOAD_BIG_64(blk + 8 * 15);
+ SHA512ROUND(b, c, d, e, f, g, h, a, 15, w15);
+
+ w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 16, w0);
+ w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 17, w1);
+ w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 18, w2);
+ w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 19, w3);
+ w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 20, w4);
+ w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 21, w5);
+ w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 22, w6);
+ w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 23, w7);
+ w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 24, w8);
+ w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 25, w9);
+ w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 26, w10);
+ w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 27, w11);
+ w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 28, w12);
+ w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 29, w13);
+ w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 30, w14);
+ w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 31, w15);
+
+ w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 32, w0);
+ w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 33, w1);
+ w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 34, w2);
+ w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 35, w3);
+ w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 36, w4);
+ w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 37, w5);
+ w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 38, w6);
+ w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 39, w7);
+ w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 40, w8);
+ w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 41, w9);
+ w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 42, w10);
+ w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 43, w11);
+ w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 44, w12);
+ w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 45, w13);
+ w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 46, w14);
+ w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 47, w15);
+
+ w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 48, w0);
+ w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 49, w1);
+ w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 50, w2);
+ w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 51, w3);
+ w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 52, w4);
+ w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 53, w5);
+ w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 54, w6);
+ w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 55, w7);
+ w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 56, w8);
+ w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 57, w9);
+ w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 58, w10);
+ w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 59, w11);
+ w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 60, w12);
+ w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 61, w13);
+ w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 62, w14);
+ w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 63, w15);
+
+ w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 64, w0);
+ w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 65, w1);
+ w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 66, w2);
+ w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 67, w3);
+ w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 68, w4);
+ w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 69, w5);
+ w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 70, w6);
+ w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 71, w7);
+ w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 72, w8);
+ w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 73, w9);
+ w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 74, w10);
+ w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 75, w11);
+ w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 76, w12);
+ w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 77, w13);
+ w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 78, w14);
+ w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 79, w15);
+
+ ctx->state.s64[0] += a;
+ ctx->state.s64[1] += b;
+ ctx->state.s64[2] += c;
+ ctx->state.s64[3] += d;
+ ctx->state.s64[4] += e;
+ ctx->state.s64[5] += f;
+ ctx->state.s64[6] += g;
+ ctx->state.s64[7] += h;
+
+}
+#endif /* !__amd64 || !_KERNEL */
+
+
+/*
+ * Encode()
+ *
+ * purpose: to convert a list of numbers from host byte order to big endian
+ * input: uint8_t * : place to store the converted big endian numbers
+ * uint32_t * : place to get numbers to convert from
+ * size_t : the length of the input in bytes
+ * output: void
+ */
+
+static void
+Encode(uint8_t *_RESTRICT_KYWD output, uint32_t *_RESTRICT_KYWD input,
+ size_t len)
+{
+ size_t i, j;
+
+#if defined(__sparc)
+ if (IS_P2ALIGNED(output, sizeof (uint32_t))) {
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ *((uint32_t *)(output + j)) = input[i];
+ }
+ } else {
+#endif /* little endian -- will work on big endian, but slowly */
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ output[j] = (input[i] >> 24) & 0xff;
+ output[j + 1] = (input[i] >> 16) & 0xff;
+ output[j + 2] = (input[i] >> 8) & 0xff;
+ output[j + 3] = input[i] & 0xff;
+ }
+#if defined(__sparc)
+ }
+#endif
+}
+
+static void
+Encode64(uint8_t *_RESTRICT_KYWD output, uint64_t *_RESTRICT_KYWD input,
+ size_t len)
+{
+ size_t i, j;
+
+#if defined(__sparc)
+ if (IS_P2ALIGNED(output, sizeof (uint64_t))) {
+ for (i = 0, j = 0; j < len; i++, j += 8) {
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ *((uint64_t *)(output + j)) = input[i];
+ }
+ } else {
+#endif /* little endian -- will work on big endian, but slowly */
+ for (i = 0, j = 0; j < len; i++, j += 8) {
+
+ output[j] = (input[i] >> 56) & 0xff;
+ output[j + 1] = (input[i] >> 48) & 0xff;
+ output[j + 2] = (input[i] >> 40) & 0xff;
+ output[j + 3] = (input[i] >> 32) & 0xff;
+ output[j + 4] = (input[i] >> 24) & 0xff;
+ output[j + 5] = (input[i] >> 16) & 0xff;
+ output[j + 6] = (input[i] >> 8) & 0xff;
+ output[j + 7] = input[i] & 0xff;
+ }
+#if defined(__sparc)
+ }
+#endif
+}
+
+
+void
+SHA2Init(uint64_t mech, SHA2_CTX *ctx)
+{
+
+ switch (mech) {
+ case SHA256_MECH_INFO_TYPE:
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ ctx->state.s32[0] = 0x6a09e667U;
+ ctx->state.s32[1] = 0xbb67ae85U;
+ ctx->state.s32[2] = 0x3c6ef372U;
+ ctx->state.s32[3] = 0xa54ff53aU;
+ ctx->state.s32[4] = 0x510e527fU;
+ ctx->state.s32[5] = 0x9b05688cU;
+ ctx->state.s32[6] = 0x1f83d9abU;
+ ctx->state.s32[7] = 0x5be0cd19U;
+ break;
+ case SHA384_MECH_INFO_TYPE:
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ ctx->state.s64[0] = 0xcbbb9d5dc1059ed8ULL;
+ ctx->state.s64[1] = 0x629a292a367cd507ULL;
+ ctx->state.s64[2] = 0x9159015a3070dd17ULL;
+ ctx->state.s64[3] = 0x152fecd8f70e5939ULL;
+ ctx->state.s64[4] = 0x67332667ffc00b31ULL;
+ ctx->state.s64[5] = 0x8eb44a8768581511ULL;
+ ctx->state.s64[6] = 0xdb0c2e0d64f98fa7ULL;
+ ctx->state.s64[7] = 0x47b5481dbefa4fa4ULL;
+ break;
+ case SHA512_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ ctx->state.s64[0] = 0x6a09e667f3bcc908ULL;
+ ctx->state.s64[1] = 0xbb67ae8584caa73bULL;
+ ctx->state.s64[2] = 0x3c6ef372fe94f82bULL;
+ ctx->state.s64[3] = 0xa54ff53a5f1d36f1ULL;
+ ctx->state.s64[4] = 0x510e527fade682d1ULL;
+ ctx->state.s64[5] = 0x9b05688c2b3e6c1fULL;
+ ctx->state.s64[6] = 0x1f83d9abfb41bd6bULL;
+ ctx->state.s64[7] = 0x5be0cd19137e2179ULL;
+ break;
+ case SHA512_224_MECH_INFO_TYPE:
+ ctx->state.s64[0] = 0x8C3D37C819544DA2ULL;
+ ctx->state.s64[1] = 0x73E1996689DCD4D6ULL;
+ ctx->state.s64[2] = 0x1DFAB7AE32FF9C82ULL;
+ ctx->state.s64[3] = 0x679DD514582F9FCFULL;
+ ctx->state.s64[4] = 0x0F6D2B697BD44DA8ULL;
+ ctx->state.s64[5] = 0x77E36F7304C48942ULL;
+ ctx->state.s64[6] = 0x3F9D85A86A1D36C8ULL;
+ ctx->state.s64[7] = 0x1112E6AD91D692A1ULL;
+ break;
+ case SHA512_256_MECH_INFO_TYPE:
+ ctx->state.s64[0] = 0x22312194FC2BF72CULL;
+ ctx->state.s64[1] = 0x9F555FA3C84C64C2ULL;
+ ctx->state.s64[2] = 0x2393B86B6F53B151ULL;
+ ctx->state.s64[3] = 0x963877195940EABDULL;
+ ctx->state.s64[4] = 0x96283EE2A88EFFE3ULL;
+ ctx->state.s64[5] = 0xBE5E1E2553863992ULL;
+ ctx->state.s64[6] = 0x2B0199FC2C85B8AAULL;
+ ctx->state.s64[7] = 0x0EB72DDC81C52CA2ULL;
+ break;
+#ifdef _KERNEL
+ default:
+ cmn_err(CE_PANIC,
+ "sha2_init: failed to find a supported algorithm: 0x%x",
+ (uint32_t)mech);
+
+#endif /* _KERNEL */
+ }
+
+ ctx->algotype = (uint32_t)mech;
+ ctx->count.c64[0] = ctx->count.c64[1] = 0;
+}
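
Together with SHA2Update() and SHA2Final() further down, this gives the usual three-call digest interface. A minimal userland-style sketch follows; it assumes the SHA256 mechanism constant and SHA2_CTX type from sys/sha2.h (the same constant the SHA256Init() wrapper below passes), and the expected bytes are the leading bytes of the FIPS 180-4 "abc" test vector.

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <sys/sha2.h>

int
main(void)
{
        SHA2_CTX ctx;
        uint8_t digest[32];     /* SHA-256 produces 32 bytes */
        const uint8_t expected[4] = { 0xba, 0x78, 0x16, 0xbf };

        SHA2Init(SHA256, &ctx);
        SHA2Update(&ctx, "abc", 3);
        SHA2Final(digest, &ctx);
        assert(memcmp(digest, expected, sizeof (expected)) == 0);
        return (0);
}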
+
+#ifndef _KERNEL
+
+// #pragma inline(SHA256Init, SHA384Init, SHA512Init)
+void
+SHA256Init(SHA256_CTX *ctx)
+{
+ SHA2Init(SHA256, ctx);
+}
+
+void
+SHA384Init(SHA384_CTX *ctx)
+{
+ SHA2Init(SHA384, ctx);
+}
+
+void
+SHA512Init(SHA512_CTX *ctx)
+{
+ SHA2Init(SHA512, ctx);
+}
+
+#endif /* !_KERNEL */
+
+/*
+ * SHA2Update()
+ *
+ * purpose: continues an sha2 digest operation, using the message block
+ * to update the context.
+ * input: SHA2_CTX * : the context to update
+ * void * : the message block
+ * size_t : the length of the message block, in bytes
+ * output: void
+ */
+
+void
+SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
+{
+ uint32_t i, buf_index, buf_len, buf_limit;
+ const uint8_t *input = inptr;
+ uint32_t algotype = ctx->algotype;
+
+ /* check for noop */
+ if (input_len == 0)
+ return;
+
+ if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
+ buf_limit = 64;
+
+ /* compute number of bytes mod 64 */
+ buf_index = (ctx->count.c32[1] >> 3) & 0x3F;
+
+ /* update number of bits */
+ if ((ctx->count.c32[1] += (input_len << 3)) < (input_len << 3))
+ ctx->count.c32[0]++;
+
+ ctx->count.c32[0] += (input_len >> 29);
+
+ } else {
+ buf_limit = 128;
+
+ /* compute number of bytes mod 128 */
+ buf_index = (ctx->count.c64[1] >> 3) & 0x7F;
+
+ /* update number of bits */
+ if ((ctx->count.c64[1] += (input_len << 3)) < (input_len << 3))
+ ctx->count.c64[0]++;
+
+ ctx->count.c64[0] += (input_len >> 29);
+ }
+
+ buf_len = buf_limit - buf_index;
+
+ /* transform as many times as possible */
+ i = 0;
+ if (input_len >= buf_len) {
+
+ /*
+ * general optimization:
+ *
+ * only do initial bcopy() and SHA2Transform() if
+ * buf_index != 0. if buf_index == 0, we're just
+ * wasting our time doing the bcopy() since there
+ * wasn't any data left over from a previous call to
+ * SHA2Update().
+ */
+ if (buf_index) {
+ bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
+ if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE)
+ SHA256Transform(ctx, ctx->buf_un.buf8);
+ else
+ SHA512Transform(ctx, ctx->buf_un.buf8);
+
+ i = buf_len;
+ }
+
+#if !defined(__amd64) || !defined(_KERNEL)
+ if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
+ for (; i + buf_limit - 1 < input_len; i += buf_limit) {
+ SHA256Transform(ctx, &input[i]);
+ }
+ } else {
+ for (; i + buf_limit - 1 < input_len; i += buf_limit) {
+ SHA512Transform(ctx, &input[i]);
+ }
+ }
+
+#else
+ uint32_t block_count;
+ if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
+ block_count = (input_len - i) >> 6;
+ if (block_count > 0) {
+ SHA256TransformBlocks(ctx, &input[i],
+ block_count);
+ i += block_count << 6;
+ }
+ } else {
+ block_count = (input_len - i) >> 7;
+ if (block_count > 0) {
+ SHA512TransformBlocks(ctx, &input[i],
+ block_count);
+ i += block_count << 7;
+ }
+ }
+#endif /* !__amd64 || !_KERNEL */
+
+ /*
+ * general optimization:
+ *
+ * if i and input_len are the same, return now instead
+ * of calling bcopy(), since the bcopy() in this case
+ * will be an expensive noop.
+ */
+
+ if (input_len == i)
+ return;
+
+ buf_index = 0;
+ }
+
+ /* buffer remaining input */
+ bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
+}
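
The buffering above is what makes chunked updates equivalent to a single call over the whole message; a small sketch (same header and constant assumptions as the earlier example) that exercises the property:

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <sys/sha2.h>

int
main(void)
{
        const char msg[] = "The quick brown fox jumps over the lazy dog";
        SHA2_CTX one, split;
        uint8_t d1[32], d2[32];

        SHA2Init(SHA256, &one);
        SHA2Update(&one, msg, strlen(msg));
        SHA2Final(d1, &one);

        SHA2Init(SHA256, &split);
        SHA2Update(&split, msg, 10);                    /* first chunk */
        SHA2Update(&split, msg + 10, strlen(msg) - 10); /* remainder */
        SHA2Final(d2, &split);

        assert(memcmp(d1, d2, sizeof (d1)) == 0);
        return (0);
}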
+
+
+/*
+ * SHA2Final()
+ *
+ * purpose: ends an sha2 digest operation, finalizing the message digest and
+ * zeroing the context.
+ * input: uchar_t * : a buffer to store the digest
+ * : The function actually uses void* because many
+ * : callers pass things other than uchar_t here.
+ * SHA2_CTX * : the context to finalize, save, and zero
+ * output: void
+ */
+
+void
+SHA2Final(void *digest, SHA2_CTX *ctx)
+{
+ uint8_t bitcount_be[sizeof (ctx->count.c32)];
+ uint8_t bitcount_be64[sizeof (ctx->count.c64)];
+ uint32_t index;
+ uint32_t algotype = ctx->algotype;
+
+ if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
+ index = (ctx->count.c32[1] >> 3) & 0x3f;
+ Encode(bitcount_be, ctx->count.c32, sizeof (bitcount_be));
+ SHA2Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
+ SHA2Update(ctx, bitcount_be, sizeof (bitcount_be));
+ Encode(digest, ctx->state.s32, sizeof (ctx->state.s32));
+ } else {
+ index = (ctx->count.c64[1] >> 3) & 0x7f;
+ Encode64(bitcount_be64, ctx->count.c64,
+ sizeof (bitcount_be64));
+ SHA2Update(ctx, PADDING, ((index < 112) ? 112 : 240) - index);
+ SHA2Update(ctx, bitcount_be64, sizeof (bitcount_be64));
+ if (algotype <= SHA384_HMAC_GEN_MECH_INFO_TYPE) {
+ ctx->state.s64[6] = ctx->state.s64[7] = 0;
+ Encode64(digest, ctx->state.s64,
+ sizeof (uint64_t) * 6);
+ } else if (algotype == SHA512_224_MECH_INFO_TYPE) {
+ uint8_t last[sizeof (uint64_t)];
+ /*
+ * Since SHA-512/224 doesn't align well to 64-bit
+ * boundaries, we must do the encoding in three steps:
+ * 1) encode the three 64-bit words that fit neatly
+ * 2) encode the last 64-bit word to a temp buffer
+ * 3) chop out the lower 32-bits from the temp buffer
+ * and append them to the digest
+ */
+ Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 3);
+ Encode64(last, &ctx->state.s64[3], sizeof (uint64_t));
+ bcopy(last, (uint8_t *)digest + 24, 4);
+ } else if (algotype == SHA512_256_MECH_INFO_TYPE) {
+ Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 4);
+ } else {
+ Encode64(digest, ctx->state.s64,
+ sizeof (ctx->state.s64));
+ }
+ }
+
+ /* zeroize sensitive information */
+ bzero(ctx, sizeof (*ctx));
+}
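
The padding lengths chosen above always leave exactly enough room for the encoded bit count and finish on a block boundary; a short standalone check of that arithmetic for both block sizes:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        uint32_t index, pad;

        for (index = 0; index < 64; index++) {
                pad = ((index < 56) ? 56 : 120) - index;
                assert((index + pad + 8) % 64 == 0);    /* SHA-256 block */
        }
        for (index = 0; index < 128; index++) {
                pad = ((index < 112) ? 112 : 240) - index;
                assert((index + pad + 16) % 128 == 0);  /* SHA-512 block */
        }
        return (0);
}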
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(SHA2Init);
+EXPORT_SYMBOL(SHA2Update);
+EXPORT_SYMBOL(SHA2Final);
+#endif
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE b/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE
new file mode 100644
index 000000000000..b7434fd17872
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE
@@ -0,0 +1,3 @@
+Implementation of the Skein hash function.
+Source code author: Doug Whiting, 2008.
+This algorithm and source code is released to the public domain.
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE.descrip b/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE.descrip
new file mode 100644
index 000000000000..0ae89cfdf5ce
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE.descrip
@@ -0,0 +1 @@
+LICENSE TERMS OF SKEIN HASH ALGORITHM IMPLEMENTATION
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/skein.c b/sys/contrib/openzfs/module/icp/algs/skein/skein.c
new file mode 100644
index 000000000000..83fe84260307
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/skein.c
@@ -0,0 +1,911 @@
+/*
+ * Implementation of the Skein hash function.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/skein.h> /* get the Skein API definitions */
+#include "skein_impl.h" /* get internal definitions */
+
+/* 256-bit Skein */
+/* init the context for a straight hashing operation */
+int
+Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen)
+{
+ union {
+ uint8_t b[SKEIN_256_STATE_BYTES];
+ uint64_t w[SKEIN_256_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+
+ switch (hashBitLen) { /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+ case 256:
+ bcopy(SKEIN_256_IV_256, ctx->X, sizeof (ctx->X));
+ break;
+ case 224:
+ bcopy(SKEIN_256_IV_224, ctx->X, sizeof (ctx->X));
+ break;
+ case 160:
+ bcopy(SKEIN_256_IV_160, ctx->X, sizeof (ctx->X));
+ break;
+ case 128:
+ bcopy(SKEIN_256_IV_128, ctx->X, sizeof (ctx->X));
+ break;
+#endif
+ default:
+ /* here if there is no precomputed IV value available */
+ /*
+ * build/process the config block, type == CONFIG (could be
+ * precomputed)
+ */
+ /* set tweaks: T0=0; T1=CFG | FINAL */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ /* set the schema, version */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ /* hash result length in bits */
+ cfg.w[1] = Skein_Swap64(hashBitLen);
+ cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+ /* zero pad config block */
+ bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+
+ /* compute the initial chaining values from config block */
+ /* zero the chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+ break;
+ }
+ /*
+ * The chaining vars ctx->X are now initialized for the given
+ * hashBitLen.
+ * Set up to process the data message portion of the hash (default)
+ */
+ Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */
+
+ return (SKEIN_SUCCESS);
+}
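
A minimal usage sketch for the straight-hash path, assuming the Skein_256_* entry points and types are visible via sys/skein.h (as included at the top of this file); the output length in bytes is hashBitLen / 8, here 32.

#include <stdint.h>
#include <sys/skein.h>

int
main(void)
{
        Skein_256_Ctxt_t ctx;
        uint8_t digest[32];     /* 256-bit output */

        (void) Skein_256_Init(&ctx, 256);
        (void) Skein_256_Update(&ctx, (const uint8_t *)"abc", 3);
        (void) Skein_256_Final(&ctx, digest);
        return (0);
}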
+
+/* init the context for a MAC and/or tree hash operation */
+/*
+ * [identical to Skein_256_Init() when keyBytes == 0 &&
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL]
+ */
+int
+Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+ const uint8_t *key, size_t keyBytes)
+{
+ union {
+ uint8_t b[SKEIN_256_STATE_BYTES];
+ uint64_t w[SKEIN_256_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+ /* compute the initial chaining values ctx->X[], based on key */
+ if (keyBytes == 0) { /* is there a key? */
+ /* no key: use all zeroes as key for config block */
+ bzero(ctx->X, sizeof (ctx->X));
+ } else { /* here to pre-process a key */
+
+ Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+ /* do a mini-Init right here */
+ /* set output hash bit count = state size */
+ ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+ /* set tweaks: T0 = 0; T1 = KEY type */
+ Skein_Start_New_Type(ctx, KEY);
+ /* zero the initial chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ /* hash the key */
+ (void) Skein_256_Update(ctx, key, keyBytes);
+ /* put result into cfg.b[] */
+ (void) Skein_256_Final_Pad(ctx, cfg.b);
+ /* copy over into ctx->X[] */
+ bcopy(cfg.b, ctx->X, sizeof (cfg.b));
+#if SKEIN_NEED_SWAP
+ {
+ uint_t i;
+ /* convert key bytes to context words */
+ for (i = 0; i < SKEIN_256_STATE_WORDS; i++)
+ ctx->X[i] = Skein_Swap64(ctx->X[i]);
+ }
+#endif
+ }
+ /*
+ * build/process the config block, type == CONFIG (could be
+ * precomputed for each key)
+ */
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */
+ /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+ cfg.w[2] = Skein_Swap64(treeInfo);
+
+ Skein_Show_Key(256, &ctx->h, key, keyBytes);
+
+ /* compute the initial chaining values from config block */
+ Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+ /* The chaining vars ctx->X are now initialized */
+ /* Set up to process the data message portion of the hash (default) */
+ ctx->h.bCnt = 0; /* buffer b[] starts out empty */
+ Skein_Start_New_Type(ctx, MSG);
+
+ return (SKEIN_SUCCESS);
+}
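
The keyed path is driven the same way, with the key folded into the chaining state by the pre-processing step above. A sketch under the assumption that SKEIN_CFG_TREE_INFO_SEQUENTIAL is exported by the Skein headers (it is referenced in the comments above); the all-zero key is a placeholder for illustration only.

#include <stdint.h>
#include <sys/skein.h>

int
main(void)
{
        const uint8_t key[16] = { 0 };  /* placeholder key, illustration only */
        Skein_256_Ctxt_t ctx;
        uint8_t mac[32];

        /* keyed (MAC-style) init: sequential tree info, 256-bit output */
        (void) Skein_256_InitExt(&ctx, 256, SKEIN_CFG_TREE_INFO_SEQUENTIAL,
            key, sizeof (key));
        (void) Skein_256_Update(&ctx, (const uint8_t *)"msg", 3);
        (void) Skein_256_Final(&ctx, mac);
        return (0);
}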
+
+/* process the input bytes */
+int
+Skein_256_Update(Skein_256_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+ size_t n;
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* process full blocks, if any */
+ if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) {
+ /* finish up any buffered message data */
+ if (ctx->h.bCnt) {
+ /* # bytes free in buffer b[] */
+ n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt;
+ if (n) {
+ /* check on our logic here */
+ Skein_assert(n < msgByteCnt);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], n);
+ msgByteCnt -= n;
+ msg += n;
+ ctx->h.bCnt += n;
+ }
+ Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES);
+ Skein_256_Process_Block(ctx, ctx->b, 1,
+ SKEIN_256_BLOCK_BYTES);
+ ctx->h.bCnt = 0;
+ }
+ /*
+ * now process any remaining full blocks, directly from input
+ * message data
+ */
+ if (msgByteCnt > SKEIN_256_BLOCK_BYTES) {
+ /* number of full blocks to process */
+ n = (msgByteCnt - 1) / SKEIN_256_BLOCK_BYTES;
+ Skein_256_Process_Block(ctx, msg, n,
+ SKEIN_256_BLOCK_BYTES);
+ msgByteCnt -= n * SKEIN_256_BLOCK_BYTES;
+ msg += n * SKEIN_256_BLOCK_BYTES;
+ }
+ Skein_assert(ctx->h.bCnt == 0);
+ }
+
+ /* copy any remaining source message data bytes into b[] */
+ if (msgByteCnt) {
+ Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt);
+ ctx->h.bCnt += msgByteCnt;
+ }
+
+ return (SKEIN_SUCCESS);
+}
+
+/* finalize the hash computation and output the result */
+int
+Skein_256_Final(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN_256_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+
+ /* process the final block */
+ Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN_256_BLOCK_BYTES;
+ if (n >= SKEIN_256_BLOCK_BYTES)
+ n = SKEIN_256_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(256, &ctx->h, n,
+ hashVal + i * SKEIN_256_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
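
The output loop above runs Threefish in counter mode, producing one 32-byte state per counter block and truncating the last block. A standalone check of the block/byte accounting for an odd output size (1000 bits), mirroring the loop bounds used above:

#include <assert.h>
#include <stddef.h>

int
main(void)
{
        const size_t block = 32;                /* SKEIN_256_BLOCK_BYTES */
        size_t hashBitLen = 1000;
        size_t byteCnt = (hashBitLen + 7) >> 3; /* 125 */
        size_t i, n, blocks = 0, produced = 0;

        for (i = 0; i * block < byteCnt; i++) {
                n = byteCnt - i * block;
                if (n >= block)
                        n = block;
                produced += n;
                blocks++;
        }
        assert(blocks == 4 && produced == byteCnt);
        return (0);
}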
+
+/* 512-bit Skein */
+
+/* init the context for a straight hashing operation */
+int
+Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
+{
+ union {
+ uint8_t b[SKEIN_512_STATE_BYTES];
+ uint64_t w[SKEIN_512_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+
+ switch (hashBitLen) { /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+ case 512:
+ bcopy(SKEIN_512_IV_512, ctx->X, sizeof (ctx->X));
+ break;
+ case 384:
+ bcopy(SKEIN_512_IV_384, ctx->X, sizeof (ctx->X));
+ break;
+ case 256:
+ bcopy(SKEIN_512_IV_256, ctx->X, sizeof (ctx->X));
+ break;
+ case 224:
+ bcopy(SKEIN_512_IV_224, ctx->X, sizeof (ctx->X));
+ break;
+#endif
+ default:
+ /*
+ * here if there is no precomputed IV value available
+ * build/process the config block, type == CONFIG (could be
+ * precomputed)
+ */
+ /* set tweaks: T0=0; T1=CFG | FINAL */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ /* set the schema, version */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ /* hash result length in bits */
+ cfg.w[1] = Skein_Swap64(hashBitLen);
+ cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+ /* zero pad config block */
+ bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+
+ /* compute the initial chaining values from config block */
+ /* zero the chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+ break;
+ }
+
+ /*
+ * The chaining vars ctx->X are now initialized for the given
+ * hashBitLen. Set up to process the data message portion of the
+ * hash (default)
+ */
+ Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */
+
+ return (SKEIN_SUCCESS);
+}
+
+/* init the context for a MAC and/or tree hash operation */
+/*
+ * [identical to Skein_512_Init() when keyBytes == 0 &&
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL]
+ */
+int
+Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+ const uint8_t *key, size_t keyBytes)
+{
+ union {
+ uint8_t b[SKEIN_512_STATE_BYTES];
+ uint64_t w[SKEIN_512_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+ /* compute the initial chaining values ctx->X[], based on key */
+ if (keyBytes == 0) { /* is there a key? */
+ /* no key: use all zeroes as key for config block */
+ bzero(ctx->X, sizeof (ctx->X));
+ } else { /* here to pre-process a key */
+
+ Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+ /* do a mini-Init right here */
+ /* set output hash bit count = state size */
+ ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+ /* set tweaks: T0 = 0; T1 = KEY type */
+ Skein_Start_New_Type(ctx, KEY);
+ /* zero the initial chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ (void) Skein_512_Update(ctx, key, keyBytes); /* hash the key */
+ /* put result into cfg.b[] */
+ (void) Skein_512_Final_Pad(ctx, cfg.b);
+ /* copy over into ctx->X[] */
+ bcopy(cfg.b, ctx->X, sizeof (cfg.b));
+#if SKEIN_NEED_SWAP
+ {
+ uint_t i;
+ /* convert key bytes to context words */
+ for (i = 0; i < SKEIN_512_STATE_WORDS; i++)
+ ctx->X[i] = Skein_Swap64(ctx->X[i]);
+ }
+#endif
+ }
+ /*
+ * build/process the config block, type == CONFIG (could be
+ * precomputed for each key)
+ */
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */
+ /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+ cfg.w[2] = Skein_Swap64(treeInfo);
+
+ Skein_Show_Key(512, &ctx->h, key, keyBytes);
+
+ /* compute the initial chaining values from config block */
+ Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+ /* The chaining vars ctx->X are now initialized */
+ /* Set up to process the data message portion of the hash (default) */
+ ctx->h.bCnt = 0; /* buffer b[] starts out empty */
+ Skein_Start_New_Type(ctx, MSG);
+
+ return (SKEIN_SUCCESS);
+}
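+
+/*
+ * Editor's sketch (not part of the upstream source): keyed (MAC) use of
+ * the extended init above.  "key", "keyLen", "msg" and "msgLen" are
+ * hypothetical caller variables; error handling is omitted.
+ *
+ *	Skein_512_Ctxt_t ctx;
+ *	uint8_t mac[64];	(64 bytes = 512-bit output)
+ *
+ *	(void) Skein_512_InitExt(&ctx, 512,
+ *	    SKEIN_CFG_TREE_INFO_SEQUENTIAL, key, keyLen);
+ *	(void) Skein_512_Update(&ctx, msg, msgLen);
+ *	(void) Skein_512_Final(&ctx, mac);
+ */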
+
+/* process the input bytes */
+int
+Skein_512_Update(Skein_512_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+ size_t n;
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* process full blocks, if any */
+ if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) {
+ /* finish up any buffered message data */
+ if (ctx->h.bCnt) {
+ /* # bytes free in buffer b[] */
+ n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
+ if (n) {
+ /* check on our logic here */
+ Skein_assert(n < msgByteCnt);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], n);
+ msgByteCnt -= n;
+ msg += n;
+ ctx->h.bCnt += n;
+ }
+ Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+ Skein_512_Process_Block(ctx, ctx->b, 1,
+ SKEIN_512_BLOCK_BYTES);
+ ctx->h.bCnt = 0;
+ }
+ /*
+ * now process any remaining full blocks, directly from input
+ * message data
+ */
+ if (msgByteCnt > SKEIN_512_BLOCK_BYTES) {
+ /* number of full blocks to process */
+ n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES;
+ Skein_512_Process_Block(ctx, msg, n,
+ SKEIN_512_BLOCK_BYTES);
+ msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
+ msg += n * SKEIN_512_BLOCK_BYTES;
+ }
+ Skein_assert(ctx->h.bCnt == 0);
+ }
+
+ /* copy any remaining source message data bytes into b[] */
+ if (msgByteCnt) {
+ Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt);
+ ctx->h.bCnt += msgByteCnt;
+ }
+
+ return (SKEIN_SUCCESS);
+}
+
+/* finalize the hash computation and output the result */
+int
+Skein_512_Final(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN_512_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+
+ /* process the final block */
+ Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN_512_BLOCK_BYTES;
+ if (n >= SKEIN_512_BLOCK_BYTES)
+ n = SKEIN_512_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(512, &ctx->h, n,
+ hashVal + i * SKEIN_512_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
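+
+/*
+ * Editor's sketch (illustrative only, not part of the upstream source):
+ * plain, unkeyed use of the Skein-512 Init/Update/Final API implemented
+ * above.  "buf" and "len" are hypothetical caller variables; error
+ * handling is omitted.
+ *
+ *	Skein_512_Ctxt_t ctx;
+ *	uint8_t digest[64];
+ *
+ *	(void) Skein_512_Init(&ctx, 512);
+ *	(void) Skein_512_Update(&ctx, buf, len);
+ *	(void) Skein_512_Final(&ctx, digest);
+ */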
+
+/* 1024-bit Skein */
+
+/* init the context for a straight hashing operation */
+int
+Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
+{
+ union {
+ uint8_t b[SKEIN1024_STATE_BYTES];
+ uint64_t w[SKEIN1024_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+
+ switch (hashBitLen) { /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+ case 512:
+ bcopy(SKEIN1024_IV_512, ctx->X, sizeof (ctx->X));
+ break;
+ case 384:
+ bcopy(SKEIN1024_IV_384, ctx->X, sizeof (ctx->X));
+ break;
+ case 1024:
+ bcopy(SKEIN1024_IV_1024, ctx->X, sizeof (ctx->X));
+ break;
+#endif
+ default:
+ /* here if there is no precomputed IV value available */
+ /*
+ * build/process the config block, type == CONFIG (could be
+ * precomputed)
+ */
+ /* set tweaks: T0=0; T1=CFG | FINAL */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ /* set the schema, version */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ /* hash result length in bits */
+ cfg.w[1] = Skein_Swap64(hashBitLen);
+ cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+ /* zero pad config block */
+ bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+
+ /* compute the initial chaining values from config block */
+ /* zero the chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+ break;
+ }
+
+ /*
+ * The chaining vars ctx->X are now initialized for the given
+ * hashBitLen. Set up to process the data message portion of the hash
+ * (default)
+ */
+ Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */
+
+ return (SKEIN_SUCCESS);
+}
+
+/* init the context for a MAC and/or tree hash operation */
+/*
+ * [identical to Skein1024_Init() when keyBytes == 0 &&
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL]
+ */
+int
+Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+ const uint8_t *key, size_t keyBytes)
+{
+ union {
+ uint8_t b[SKEIN1024_STATE_BYTES];
+ uint64_t w[SKEIN1024_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+ /* compute the initial chaining values ctx->X[], based on key */
+ if (keyBytes == 0) { /* is there a key? */
+ /* no key: use all zeroes as key for config block */
+ bzero(ctx->X, sizeof (ctx->X));
+ } else { /* here to pre-process a key */
+ Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+ /* do a mini-Init right here */
+ /* set output hash bit count = state size */
+ ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+ /* set tweaks: T0 = 0; T1 = KEY type */
+ Skein_Start_New_Type(ctx, KEY);
+ /* zero the initial chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ (void) Skein1024_Update(ctx, key, keyBytes); /* hash the key */
+ /* put result into cfg.b[] */
+ (void) Skein1024_Final_Pad(ctx, cfg.b);
+ /* copy over into ctx->X[] */
+ bcopy(cfg.b, ctx->X, sizeof (cfg.b));
+#if SKEIN_NEED_SWAP
+ {
+ uint_t i;
+ /* convert key bytes to context words */
+ for (i = 0; i < SKEIN1024_STATE_WORDS; i++)
+ ctx->X[i] = Skein_Swap64(ctx->X[i]);
+ }
+#endif
+ }
+ /*
+ * build/process the config block, type == CONFIG (could be
+ * precomputed for each key)
+ */
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ /* hash result length in bits */
+ cfg.w[1] = Skein_Swap64(hashBitLen);
+ /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+ cfg.w[2] = Skein_Swap64(treeInfo);
+
+ Skein_Show_Key(1024, &ctx->h, key, keyBytes);
+
+ /* compute the initial chaining values from config block */
+ Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+ /* The chaining vars ctx->X are now initialized */
+ /* Set up to process the data message portion of the hash (default) */
+ ctx->h.bCnt = 0; /* buffer b[] starts out empty */
+ Skein_Start_New_Type(ctx, MSG);
+
+ return (SKEIN_SUCCESS);
+}
+
+/* process the input bytes */
+int
+Skein1024_Update(Skein1024_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+ size_t n;
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* process full blocks, if any */
+ if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) {
+ /* finish up any buffered message data */
+ if (ctx->h.bCnt) {
+ /* # bytes free in buffer b[] */
+ n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt;
+ if (n) {
+ /* check on our logic here */
+ Skein_assert(n < msgByteCnt);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], n);
+ msgByteCnt -= n;
+ msg += n;
+ ctx->h.bCnt += n;
+ }
+ Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES);
+ Skein1024_Process_Block(ctx, ctx->b, 1,
+ SKEIN1024_BLOCK_BYTES);
+ ctx->h.bCnt = 0;
+ }
+ /*
+ * now process any remaining full blocks, directly from
+ * input message data
+ */
+ if (msgByteCnt > SKEIN1024_BLOCK_BYTES) {
+ /* number of full blocks to process */
+ n = (msgByteCnt - 1) / SKEIN1024_BLOCK_BYTES;
+ Skein1024_Process_Block(ctx, msg, n,
+ SKEIN1024_BLOCK_BYTES);
+ msgByteCnt -= n * SKEIN1024_BLOCK_BYTES;
+ msg += n * SKEIN1024_BLOCK_BYTES;
+ }
+ Skein_assert(ctx->h.bCnt == 0);
+ }
+
+ /* copy any remaining source message data bytes into b[] */
+ if (msgByteCnt) {
+ Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt);
+ ctx->h.bCnt += msgByteCnt;
+ }
+
+ return (SKEIN_SUCCESS);
+}
+
+/* finalize the hash computation and output the result */
+int
+Skein1024_Final(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN1024_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+
+ /* process the final block */
+ Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN1024_BLOCK_BYTES;
+ if (n >= SKEIN1024_BLOCK_BYTES)
+ n = SKEIN1024_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(1024, &ctx->h, n,
+ hashVal + i * SKEIN1024_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
+
+/* Functions to support MAC/tree hashing */
+/* (this code is identical for Optimized and Reference versions) */
+
+/* finalize the hash computation and output the block, no OUTPUT stage */
+int
+Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+ /* process the final block */
+ Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* "output" the state bytes */
+ Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_256_BLOCK_BYTES);
+
+ return (SKEIN_SUCCESS);
+}
+
+/* finalize the hash computation and output the block, no OUTPUT stage */
+int
+Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+ /* process the final block */
+ Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* "output" the state bytes */
+ Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_512_BLOCK_BYTES);
+
+ return (SKEIN_SUCCESS);
+}
+
+/* finalize the hash computation and output the block, no OUTPUT stage */
+int
+Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* tag as the final block */
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+ /* process the final block */
+ Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* "output" the state bytes */
+ Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN1024_BLOCK_BYTES);
+
+ return (SKEIN_SUCCESS);
+}
+
+#if SKEIN_TREE_HASH
+/* just do the OUTPUT stage */
+int
+Skein_256_Output(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN_256_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN_256_BLOCK_BYTES;
+ if (n >= SKEIN_256_BLOCK_BYTES)
+ n = SKEIN_256_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(256, &ctx->h, n,
+ hashVal + i * SKEIN_256_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
+
+/* just do the OUTPUT stage */
+int
+Skein_512_Output(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN_512_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN_512_BLOCK_BYTES;
+ if (n >= SKEIN_512_BLOCK_BYTES)
+ n = SKEIN_512_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(512, &ctx->h, n,
+ hashVal + i * SKEIN_512_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
+
+/* just do the OUTPUT stage */
+int
+Skein1024_Output(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN1024_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN1024_BLOCK_BYTES;
+ if (n >= SKEIN1024_BLOCK_BYTES)
+ n = SKEIN1024_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(1024, &ctx->h, n,
+ hashVal + i * SKEIN1024_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
+#endif
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(Skein_512_Init);
+EXPORT_SYMBOL(Skein_512_InitExt);
+EXPORT_SYMBOL(Skein_512_Update);
+EXPORT_SYMBOL(Skein_512_Final);
+#endif
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/skein_block.c b/sys/contrib/openzfs/module/icp/algs/skein/skein_block.c
new file mode 100644
index 000000000000..7ba165a48511
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/skein_block.c
@@ -0,0 +1,790 @@
+/*
+ * Implementation of the Skein block functions.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ * Compile-time switches:
+ * SKEIN_USE_ASM -- set bits (256/512/1024) to select which
+ * versions use ASM code for block processing
+ * [default: use C for all block sizes]
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#include <sys/skein.h>
+#include "skein_impl.h"
+#include <sys/isa_defs.h> /* for _ILP32 */
+
+#ifndef SKEIN_USE_ASM
+#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */
+#endif
+
+#ifndef SKEIN_LOOP
+/*
+ * The low-level checksum routines use a lot of stack space. On systems where
+ * small stack frames are enforced (like 32-bit kernel builds), do not unroll
+ * checksum calculations to save stack space.
+ *
+ * Even with no loops unrolled, we still can exceed the 1k stack frame limit
+ * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32). We can
+ * safely ignore it though, since the checksum functions will be called
+ * from a worker thread that won't be using much stack. That's why we have
+ * the #pragma here to ignore the warning.
+ */
+#if defined(_ILP32) || defined(__powerpc) /* Assume small stack */
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+/*
+ * We're running on 32-bit; don't unroll loops, to save stack frame space.
+ *
+ * Due to the way the SKEIN_LOOP calculations are done in
+ * Skein_*_Process_Block(), a value of 111 disables loop unrolling
+ * in any of those functions.
+ */
+#define SKEIN_LOOP 111
+#else
+/* We're compiling with large stacks */
+#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */
+#endif
+#endif
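+
+/*
+ * Editor's note (illustrative, not from the original source): SKEIN_LOOP
+ * is read one decimal digit per block size -- hundreds for Skein-256,
+ * tens for Skein-512, ones for Skein-1024.  A digit of 0 means "fully
+ * unroll"; any other digit N means "loop, doing N 8-round groups per
+ * pass".  The two values above therefore decode as:
+ *
+ *	SKEIN_LOOP 111:  256 -> 1, 512 -> 1, 1024 -> 1  (all looped)
+ *	SKEIN_LOOP 001:  256 -> 0, 512 -> 0, 1024 -> 1  (256/512 unrolled)
+ *
+ * (The leading zeros make 001 an octal literal, but its value, 1,
+ * decodes identically.)
+ */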
+
+/* some useful definitions for code here */
+#define BLK_BITS (WCNT*64)
+#define KW_TWK_BASE (0)
+#define KW_KEY_BASE (3)
+#define ks (kw + KW_KEY_BASE)
+#define ts (kw + KW_TWK_BASE)
+
+/* no debugging in Illumos version */
+#define DebugSaveTweak(ctx)
+
+/* Skein_256 */
+#if !(SKEIN_USE_ASM & 256)
+void
+Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd)
+{
+ enum {
+ WCNT = SKEIN_256_STATE_WORDS
+ };
+#undef RCNT
+#define RCNT (SKEIN_256_ROUNDS_TOTAL / 8)
+
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
+#else
+#define SKEIN_UNROLL_256 (0)
+#endif
+
+#if SKEIN_UNROLL_256
+#if (RCNT % SKEIN_UNROLL_256)
+#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */
+#endif
+ size_t r;
+ /* key schedule words : chaining vars + tweak + "rotation" */
+ uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+ uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
+#endif
+ /* local copy of context vars, for speed */
+ uint64_t X0, X1, X2, X3;
+ uint64_t w[WCNT]; /* local copy of input block */
+#ifdef SKEIN_DEBUG
+ /* use for debugging (help compiler put Xn in registers) */
+ const uint64_t *Xptr[4];
+ Xptr[0] = &X0;
+ Xptr[1] = &X1;
+ Xptr[2] = &X2;
+ Xptr[3] = &X3;
+#endif
+ Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
+ ts[0] = ctx->h.T[0];
+ ts[1] = ctx->h.T[1];
+ do {
+ /*
+ * this implementation only supports 2**64 input bytes
+ * (no carry out here)
+ */
+ ts[0] += byteCntAdd; /* update processed length */
+
+ /* precompute the key schedule for this block */
+ ks[0] = ctx->X[0];
+ ks[1] = ctx->X[1];
+ ks[2] = ctx->X[2];
+ ks[3] = ctx->X[3];
+ ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
+
+ ts[2] = ts[0] ^ ts[1];
+
+ /* get input block in little-endian format */
+ Skein_Get64_LSB_First(w, blkPtr, WCNT);
+ DebugSaveTweak(ctx);
+ Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+ X0 = w[0] + ks[0]; /* do the first full key injection */
+ X1 = w[1] + ks[1] + ts[0];
+ X2 = w[2] + ks[2] + ts[1];
+ X3 = w[3] + ks[3];
+
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+ Xptr); /* show starting state values */
+
+ blkPtr += SKEIN_256_BLOCK_BYTES;
+
+ /* run the rounds */
+
+#define Round256(p0, p1, p2, p3, ROT, rNum) \
+ X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
+ X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;
+
+#if SKEIN_UNROLL_256 == 0
+#define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \
+ Round256(p0, p1, p2, p3, ROT, rNum) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define I256(R) \
+ X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \
+ X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \
+ X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \
+ X3 += ks[((R) + 4) % 5] + (R) + 1; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R256(p0, p1, p2, p3, ROT, rNum) \
+ Round256(p0, p1, p2, p3, ROT, rNum) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define I256(R) \
+ X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+ X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \
+ X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \
+ X3 += ks[r + (R) + 3] + r + (R); \
+ ks[r + (R) + 4] = ks[r + (R) - 1]; /* rotate key schedule */ \
+ ts[r + (R) + 2] = ts[r + (R) - 1]; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+ /* loop through it */
+ for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)
+#endif
+ {
+#define R256_8_rounds(R) \
+ R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \
+ R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \
+ R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \
+ R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \
+ I256(2 * (R)); \
+ R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \
+ R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \
+ R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \
+ R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \
+ I256(2 * (R) + 1);
+
+ R256_8_rounds(0);
+
+#define R256_Unroll_R(NN) \
+ ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \
+ (SKEIN_UNROLL_256 > (NN)))
+
+#if R256_Unroll_R(1)
+ R256_8_rounds(1);
+#endif
+#if R256_Unroll_R(2)
+ R256_8_rounds(2);
+#endif
+#if R256_Unroll_R(3)
+ R256_8_rounds(3);
+#endif
+#if R256_Unroll_R(4)
+ R256_8_rounds(4);
+#endif
+#if R256_Unroll_R(5)
+ R256_8_rounds(5);
+#endif
+#if R256_Unroll_R(6)
+ R256_8_rounds(6);
+#endif
+#if R256_Unroll_R(7)
+ R256_8_rounds(7);
+#endif
+#if R256_Unroll_R(8)
+ R256_8_rounds(8);
+#endif
+#if R256_Unroll_R(9)
+ R256_8_rounds(9);
+#endif
+#if R256_Unroll_R(10)
+ R256_8_rounds(10);
+#endif
+#if R256_Unroll_R(11)
+ R256_8_rounds(11);
+#endif
+#if R256_Unroll_R(12)
+ R256_8_rounds(12);
+#endif
+#if R256_Unroll_R(13)
+ R256_8_rounds(13);
+#endif
+#if R256_Unroll_R(14)
+ R256_8_rounds(14);
+#endif
+#if (SKEIN_UNROLL_256 > 14)
+#error "need more unrolling in Skein_256_Process_Block"
+#endif
+ }
+ /*
+ * do the final "feedforward" xor, update context chaining vars
+ */
+ ctx->X[0] = X0 ^ w[0];
+ ctx->X[1] = X1 ^ w[1];
+ ctx->X[2] = X2 ^ w[2];
+ ctx->X[3] = X3 ^ w[3];
+
+ Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+ ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+ } while (--blkCnt);
+ ctx->h.T[0] = ts[0];
+ ctx->h.T[1] = ts[1];
+}
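+
+/*
+ * Editor's note (illustrative, not part of the upstream file): each
+ * Round256() above performs one Threefish-256 round, i.e. two MIX
+ * operations; the alternating argument orders in R256_8_rounds()
+ * supply the word permutation.  A single MIX on a word pair (a, b)
+ * with rotation constant R is simply
+ *
+ *	a = a + b;  b = RotL_64(b, R) ^ a;
+ *
+ * which is what each X##p0/X##p1 statement pair expands to.
+ */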
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t
+Skein_256_Process_Block_CodeSize(void)
+{
+ return ((uint8_t *)Skein_256_Process_Block_CodeSize) -
+ ((uint8_t *)Skein_256_Process_Block);
+}
+
+uint_t
+Skein_256_Unroll_Cnt(void)
+{
+ return (SKEIN_UNROLL_256);
+}
+#endif
+#endif
+
+/* Skein_512 */
+#if !(SKEIN_USE_ASM & 512)
+void
+Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd)
+{
+ enum {
+ WCNT = SKEIN_512_STATE_WORDS
+ };
+#undef RCNT
+#define RCNT (SKEIN_512_ROUNDS_TOTAL / 8)
+
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
+#else
+#define SKEIN_UNROLL_512 (0)
+#endif
+
+#if SKEIN_UNROLL_512
+#if (RCNT % SKEIN_UNROLL_512)
+#error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */
+#endif
+ size_t r;
+ /* key schedule words : chaining vars + tweak + "rotation" */
+ uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+ uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
+#endif
+ /* local copy of vars, for speed */
+ uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
+ uint64_t w[WCNT]; /* local copy of input block */
+#ifdef SKEIN_DEBUG
+ /* use for debugging (help compiler put Xn in registers) */
+ const uint64_t *Xptr[8];
+ Xptr[0] = &X0;
+ Xptr[1] = &X1;
+ Xptr[2] = &X2;
+ Xptr[3] = &X3;
+ Xptr[4] = &X4;
+ Xptr[5] = &X5;
+ Xptr[6] = &X6;
+ Xptr[7] = &X7;
+#endif
+
+ Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
+ ts[0] = ctx->h.T[0];
+ ts[1] = ctx->h.T[1];
+ do {
+ /*
+ * this implementation only supports 2**64 input bytes
+ * (no carry out here)
+ */
+ ts[0] += byteCntAdd; /* update processed length */
+
+ /* precompute the key schedule for this block */
+ ks[0] = ctx->X[0];
+ ks[1] = ctx->X[1];
+ ks[2] = ctx->X[2];
+ ks[3] = ctx->X[3];
+ ks[4] = ctx->X[4];
+ ks[5] = ctx->X[5];
+ ks[6] = ctx->X[6];
+ ks[7] = ctx->X[7];
+ ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+ ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
+
+ ts[2] = ts[0] ^ ts[1];
+
+ /* get input block in little-endian format */
+ Skein_Get64_LSB_First(w, blkPtr, WCNT);
+ DebugSaveTweak(ctx);
+ Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+ X0 = w[0] + ks[0]; /* do the first full key injection */
+ X1 = w[1] + ks[1];
+ X2 = w[2] + ks[2];
+ X3 = w[3] + ks[3];
+ X4 = w[4] + ks[4];
+ X5 = w[5] + ks[5] + ts[0];
+ X6 = w[6] + ks[6] + ts[1];
+ X7 = w[7] + ks[7];
+
+ blkPtr += SKEIN_512_BLOCK_BYTES;
+
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+ Xptr);
+ /* run the rounds */
+#define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+ X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
+ X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
+ X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
+ X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;
+
+#if SKEIN_UNROLL_512 == 0
+#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \
+ Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define I512(R) \
+ X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */\
+ X1 += ks[((R) + 2) % 9]; \
+ X2 += ks[((R) + 3) % 9]; \
+ X3 += ks[((R) + 4) % 9]; \
+ X4 += ks[((R) + 5) % 9]; \
+ X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \
+ X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \
+ X7 += ks[((R) + 8) % 9] + (R) + 1; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+ Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define I512(R) \
+ X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+ X1 += ks[r + (R) + 1]; \
+ X2 += ks[r + (R) + 2]; \
+ X3 += ks[r + (R) + 3]; \
+ X4 += ks[r + (R) + 4]; \
+ X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \
+ X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \
+ X7 += ks[r + (R) + 7] + r + (R); \
+ ks[r + (R) + 8] = ks[r + (R) - 1]; /* rotate key schedule */ \
+ ts[r + (R) + 2] = ts[r + (R) - 1]; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+ /* loop through it */
+ for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)
+#endif /* end of looped code definitions */
+ {
+#define R512_8_rounds(R) /* do 8 full rounds */ \
+ R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \
+ R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \
+ R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \
+ R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \
+ I512(2 * (R)); \
+ R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \
+ R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \
+ R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \
+ R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \
+ I512(2*(R) + 1); /* and key injection */
+
+ R512_8_rounds(0);
+
+#define R512_Unroll_R(NN) \
+ ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
+ (SKEIN_UNROLL_512 > (NN)))
+
+#if R512_Unroll_R(1)
+ R512_8_rounds(1);
+#endif
+#if R512_Unroll_R(2)
+ R512_8_rounds(2);
+#endif
+#if R512_Unroll_R(3)
+ R512_8_rounds(3);
+#endif
+#if R512_Unroll_R(4)
+ R512_8_rounds(4);
+#endif
+#if R512_Unroll_R(5)
+ R512_8_rounds(5);
+#endif
+#if R512_Unroll_R(6)
+ R512_8_rounds(6);
+#endif
+#if R512_Unroll_R(7)
+ R512_8_rounds(7);
+#endif
+#if R512_Unroll_R(8)
+ R512_8_rounds(8);
+#endif
+#if R512_Unroll_R(9)
+ R512_8_rounds(9);
+#endif
+#if R512_Unroll_R(10)
+ R512_8_rounds(10);
+#endif
+#if R512_Unroll_R(11)
+ R512_8_rounds(11);
+#endif
+#if R512_Unroll_R(12)
+ R512_8_rounds(12);
+#endif
+#if R512_Unroll_R(13)
+ R512_8_rounds(13);
+#endif
+#if R512_Unroll_R(14)
+ R512_8_rounds(14);
+#endif
+#if (SKEIN_UNROLL_512 > 14)
+#error "need more unrolling in Skein_512_Process_Block"
+#endif
+ }
+
+ /*
+ * do the final "feedforward" xor, update context chaining vars
+ */
+ ctx->X[0] = X0 ^ w[0];
+ ctx->X[1] = X1 ^ w[1];
+ ctx->X[2] = X2 ^ w[2];
+ ctx->X[3] = X3 ^ w[3];
+ ctx->X[4] = X4 ^ w[4];
+ ctx->X[5] = X5 ^ w[5];
+ ctx->X[6] = X6 ^ w[6];
+ ctx->X[7] = X7 ^ w[7];
+ Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+ ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+ } while (--blkCnt);
+ ctx->h.T[0] = ts[0];
+ ctx->h.T[1] = ts[1];
+}
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t
+Skein_512_Process_Block_CodeSize(void)
+{
+ return ((uint8_t *)Skein_512_Process_Block_CodeSize) -
+ ((uint8_t *)Skein_512_Process_Block);
+}
+
+uint_t
+Skein_512_Unroll_Cnt(void)
+{
+ return (SKEIN_UNROLL_512);
+}
+#endif
+#endif
+
+/* Skein1024 */
+#if !(SKEIN_USE_ASM & 1024)
+void
+Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd)
+{
+ /* do it in C, always looping (unrolled is bigger AND slower!) */
+ enum {
+ WCNT = SKEIN1024_STATE_WORDS
+ };
+#undef RCNT
+#define RCNT (SKEIN1024_ROUNDS_TOTAL/8)
+
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
+#else
+#define SKEIN_UNROLL_1024 (0)
+#endif
+
+#if (SKEIN_UNROLL_1024 != 0)
+#if (RCNT % SKEIN_UNROLL_1024)
+#error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */
+#endif
+ size_t r;
+ /* key schedule words : chaining vars + tweak + "rotation" */
+ uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+ uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
+#endif
+
+ /* local copy of vars, for speed */
+ uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,
+ X12, X13, X14, X15;
+ uint64_t w[WCNT]; /* local copy of input block */
+#ifdef SKEIN_DEBUG
+ /* use for debugging (help compiler put Xn in registers) */
+ const uint64_t *Xptr[16];
+ Xptr[0] = &X00;
+ Xptr[1] = &X01;
+ Xptr[2] = &X02;
+ Xptr[3] = &X03;
+ Xptr[4] = &X04;
+ Xptr[5] = &X05;
+ Xptr[6] = &X06;
+ Xptr[7] = &X07;
+ Xptr[8] = &X08;
+ Xptr[9] = &X09;
+ Xptr[10] = &X10;
+ Xptr[11] = &X11;
+ Xptr[12] = &X12;
+ Xptr[13] = &X13;
+ Xptr[14] = &X14;
+ Xptr[15] = &X15;
+#endif
+
+ Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
+ ts[0] = ctx->h.T[0];
+ ts[1] = ctx->h.T[1];
+ do {
+ /*
+ * this implementation only supports 2**64 input bytes
+ * (no carry out here)
+ */
+ ts[0] += byteCntAdd; /* update processed length */
+
+ /* precompute the key schedule for this block */
+ ks[0] = ctx->X[0];
+ ks[1] = ctx->X[1];
+ ks[2] = ctx->X[2];
+ ks[3] = ctx->X[3];
+ ks[4] = ctx->X[4];
+ ks[5] = ctx->X[5];
+ ks[6] = ctx->X[6];
+ ks[7] = ctx->X[7];
+ ks[8] = ctx->X[8];
+ ks[9] = ctx->X[9];
+ ks[10] = ctx->X[10];
+ ks[11] = ctx->X[11];
+ ks[12] = ctx->X[12];
+ ks[13] = ctx->X[13];
+ ks[14] = ctx->X[14];
+ ks[15] = ctx->X[15];
+ ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+ ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
+ ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
+ ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
+
+ ts[2] = ts[0] ^ ts[1];
+
+ /* get input block in little-endian format */
+ Skein_Get64_LSB_First(w, blkPtr, WCNT);
+ DebugSaveTweak(ctx);
+ Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+ X00 = w[0] + ks[0]; /* do the first full key injection */
+ X01 = w[1] + ks[1];
+ X02 = w[2] + ks[2];
+ X03 = w[3] + ks[3];
+ X04 = w[4] + ks[4];
+ X05 = w[5] + ks[5];
+ X06 = w[6] + ks[6];
+ X07 = w[7] + ks[7];
+ X08 = w[8] + ks[8];
+ X09 = w[9] + ks[9];
+ X10 = w[10] + ks[10];
+ X11 = w[11] + ks[11];
+ X12 = w[12] + ks[12];
+ X13 = w[13] + ks[13] + ts[0];
+ X14 = w[14] + ks[14] + ts[1];
+ X15 = w[15] + ks[15];
+
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+ Xptr);
+
+#define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
+ pD, pE, pF, ROT, rNum) \
+ X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
+ X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
+ X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
+ X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\
+ X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\
+ X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\
+ X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\
+ X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;
+
+#if SKEIN_UNROLL_1024 == 0
+#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \
+ pE, pF, ROT, rn) \
+ Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
+ pD, pE, pF, ROT, rn) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
+
+#define I1024(R) \
+ X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */\
+ X01 += ks[((R) + 2) % 17]; \
+ X02 += ks[((R) + 3) % 17]; \
+ X03 += ks[((R) + 4) % 17]; \
+ X04 += ks[((R) + 5) % 17]; \
+ X05 += ks[((R) + 6) % 17]; \
+ X06 += ks[((R) + 7) % 17]; \
+ X07 += ks[((R) + 8) % 17]; \
+ X08 += ks[((R) + 9) % 17]; \
+ X09 += ks[((R) + 10) % 17]; \
+ X10 += ks[((R) + 11) % 17]; \
+ X11 += ks[((R) + 12) % 17]; \
+ X12 += ks[((R) + 13) % 17]; \
+ X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \
+ X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \
+ X15 += ks[((R) + 16) % 17] + (R) + 1; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \
+ pE, pF, ROT, rn) \
+ Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
+ pD, pE, pF, ROT, rn) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
+
+#define I1024(R) \
+ X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+ X01 += ks[r + (R) + 1]; \
+ X02 += ks[r + (R) + 2]; \
+ X03 += ks[r + (R) + 3]; \
+ X04 += ks[r + (R) + 4]; \
+ X05 += ks[r + (R) + 5]; \
+ X06 += ks[r + (R) + 6]; \
+ X07 += ks[r + (R) + 7]; \
+ X08 += ks[r + (R) + 8]; \
+ X09 += ks[r + (R) + 9]; \
+ X10 += ks[r + (R) + 10]; \
+ X11 += ks[r + (R) + 11]; \
+ X12 += ks[r + (R) + 12]; \
+ X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \
+ X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \
+ X15 += ks[r + (R) + 15] + r + (R); \
+ ks[r + (R) + 16] = ks[r + (R) - 1]; /* rotate key schedule */\
+ ts[r + (R) + 2] = ts[r + (R) - 1]; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+ /* loop through it */
+ for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)
+#endif
+ {
+#define R1024_8_rounds(R) /* do 8 full rounds */ \
+ R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \
+ 14, 15, R1024_0, 8 * (R) + 1); \
+ R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \
+ 08, 01, R1024_1, 8 * (R) + 2); \
+ R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \
+ 10, 09, R1024_2, 8 * (R) + 3); \
+ R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \
+ 12, 07, R1024_3, 8 * (R) + 4); \
+ I1024(2 * (R)); \
+ R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \
+ 14, 15, R1024_4, 8 * (R) + 5); \
+ R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \
+ 08, 01, R1024_5, 8 * (R) + 6); \
+ R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \
+ 10, 09, R1024_6, 8 * (R) + 7); \
+ R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \
+ 12, 07, R1024_7, 8 * (R) + 8); \
+ I1024(2 * (R) + 1);
+
+ R1024_8_rounds(0);
+
+#define R1024_Unroll_R(NN) \
+ ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \
+ (SKEIN_UNROLL_1024 > (NN)))
+
+#if R1024_Unroll_R(1)
+ R1024_8_rounds(1);
+#endif
+#if R1024_Unroll_R(2)
+ R1024_8_rounds(2);
+#endif
+#if R1024_Unroll_R(3)
+ R1024_8_rounds(3);
+#endif
+#if R1024_Unroll_R(4)
+ R1024_8_rounds(4);
+#endif
+#if R1024_Unroll_R(5)
+ R1024_8_rounds(5);
+#endif
+#if R1024_Unroll_R(6)
+ R1024_8_rounds(6);
+#endif
+#if R1024_Unroll_R(7)
+ R1024_8_rounds(7);
+#endif
+#if R1024_Unroll_R(8)
+ R1024_8_rounds(8);
+#endif
+#if R1024_Unroll_R(9)
+ R1024_8_rounds(9);
+#endif
+#if R1024_Unroll_R(10)
+ R1024_8_rounds(10);
+#endif
+#if R1024_Unroll_R(11)
+ R1024_8_rounds(11);
+#endif
+#if R1024_Unroll_R(12)
+ R1024_8_rounds(12);
+#endif
+#if R1024_Unroll_R(13)
+ R1024_8_rounds(13);
+#endif
+#if R1024_Unroll_R(14)
+ R1024_8_rounds(14);
+#endif
+#if (SKEIN_UNROLL_1024 > 14)
+#error "need more unrolling in Skein_1024_Process_Block"
+#endif
+ }
+ /*
+ * do the final "feedforward" xor, update context chaining vars
+ */
+
+ ctx->X[0] = X00 ^ w[0];
+ ctx->X[1] = X01 ^ w[1];
+ ctx->X[2] = X02 ^ w[2];
+ ctx->X[3] = X03 ^ w[3];
+ ctx->X[4] = X04 ^ w[4];
+ ctx->X[5] = X05 ^ w[5];
+ ctx->X[6] = X06 ^ w[6];
+ ctx->X[7] = X07 ^ w[7];
+ ctx->X[8] = X08 ^ w[8];
+ ctx->X[9] = X09 ^ w[9];
+ ctx->X[10] = X10 ^ w[10];
+ ctx->X[11] = X11 ^ w[11];
+ ctx->X[12] = X12 ^ w[12];
+ ctx->X[13] = X13 ^ w[13];
+ ctx->X[14] = X14 ^ w[14];
+ ctx->X[15] = X15 ^ w[15];
+
+ Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+ ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+ blkPtr += SKEIN1024_BLOCK_BYTES;
+ } while (--blkCnt);
+ ctx->h.T[0] = ts[0];
+ ctx->h.T[1] = ts[1];
+}
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t
+Skein1024_Process_Block_CodeSize(void)
+{
+ return ((uint8_t *)Skein1024_Process_Block_CodeSize) -
+ ((uint8_t *)Skein1024_Process_Block);
+}
+
+uint_t
+Skein1024_Unroll_Cnt(void)
+{
+ return (SKEIN_UNROLL_1024);
+}
+#endif
+#endif
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/skein_impl.h b/sys/contrib/openzfs/module/icp/algs/skein/skein_impl.h
new file mode 100644
index 000000000000..205a517d69db
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/skein_impl.h
@@ -0,0 +1,292 @@
+/*
+ * Internal definitions for Skein hashing.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ *
+ * The following compile-time switches may be defined to control some
+ * tradeoffs between speed, code size, error checking, and security.
+ *
+ * The "default" note explains what happens when the switch is not defined.
+ *
+ * SKEIN_DEBUG -- make callouts from inside Skein code
+ * to examine/display intermediate values.
+ * [default: no callouts (no overhead)]
+ *
+ * SKEIN_ERR_CHECK -- how error checking is handled inside Skein
+ * code. If not defined, most error checking
+ * is disabled (for performance). Otherwise,
+ * the switch value is interpreted as:
+ * 0: use assert() to flag errors
+ * 1: return SKEIN_FAIL to flag errors
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#ifndef _SKEIN_IMPL_H_
+#define _SKEIN_IMPL_H_
+
+#include <sys/skein.h>
+#include <sys/strings.h>
+#include <sys/note.h>
+#include "skein_impl.h"
+#include "skein_port.h"
+
+/*
+ * "Internal" Skein definitions
+ * -- not needed for sequential hashing API, but will be
+ * helpful for other uses of Skein (e.g., tree hash mode).
+ * -- included here so that they can be shared between
+ * reference and optimized code.
+ */
+
+/* tweak word T[1]: bit field starting positions */
+/* offset 64 because it's the second word */
+#define SKEIN_T1_BIT(BIT) ((BIT) - 64)
+
+/* bits 112..118: level in hash tree */
+#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112)
+/* bit 119: partial final input byte */
+#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119)
+/* bits 120..125: type field */
+#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120)
+/* bit 126: first block flag */
+#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126)
+/* bit 127: final block flag */
+#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127)
+
+/* tweak word T[1]: flag bit definition(s) */
+#define SKEIN_T1_FLAG_FIRST (((uint64_t)1) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_FINAL (((uint64_t)1) << SKEIN_T1_POS_FINAL)
+#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t)1) << SKEIN_T1_POS_BIT_PAD)
+
+/* tweak word T[1]: tree level bit field mask */
+#define SKEIN_T1_TREE_LVL_MASK (((uint64_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
+#define SKEIN_T1_TREE_LEVEL(n) (((uint64_t)(n)) << SKEIN_T1_POS_TREE_LVL)
+
+/* tweak word T[1]: block type field */
+#define SKEIN_BLK_TYPE_KEY (0) /* key, for MAC and KDF */
+#define SKEIN_BLK_TYPE_CFG (4) /* configuration block */
+#define SKEIN_BLK_TYPE_PERS (8) /* personalization string */
+#define SKEIN_BLK_TYPE_PK (12) /* public key (for signature hashing) */
+#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */
+#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */
+#define SKEIN_BLK_TYPE_MSG (48) /* message processing */
+#define SKEIN_BLK_TYPE_OUT (63) /* output stage */
+#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */
+
+#define SKEIN_T1_BLK_TYPE(T) \
+ (((uint64_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+/* key, for MAC and KDF */
+#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY)
+/* configuration block */
+#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG)
+/* personalization string */
+#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS)
+/* public key (for digital signature hashing) */
+#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK)
+/* key identifier for KDF */
+#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF)
+/* nonce for PRNG */
+#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)
+/* message processing */
+#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG)
+/* output stage */
+#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT)
+/* field bit mask */
+#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK)
+
+#define SKEIN_T1_BLK_TYPE_CFG_FINAL \
+ (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL \
+ (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
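+
+/*
+ * Worked example (editor's note, not in the original header): for the
+ * final CONFIG block the tweak word T[1] is assembled as
+ *
+ *	SKEIN_T1_BLK_TYPE_CFG_FINAL
+ *	    = ((uint64_t)SKEIN_BLK_TYPE_CFG << SKEIN_T1_POS_BLK_TYPE)
+ *	    | SKEIN_T1_FLAG_FINAL
+ *	    = (4ULL << 56) | (1ULL << 63)
+ *
+ * i.e. block type CFG in bits 120..125 plus the FINAL flag in bit 127
+ * (bit positions are relative to the 128-bit tweak, hence the -64 in
+ * SKEIN_T1_BIT() above).
+ */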
+
+#define SKEIN_VERSION (1)
+
+#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */
+#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian) */
+#endif
+
+#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((uint64_t)(hi32)) << 32))
+#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION, SKEIN_ID_STRING_LE)
+#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22)
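+
+/*
+ * Editor's note (illustrative): SKEIN_SCHEMA_VER evaluates to
+ * 0x0000000133414853 -- version 1 in the upper half and the ASCII bytes
+ * "SHA3" (0x53 0x48 0x41 0x33, little-endian) in the lower half; this is
+ * the schema identifier written into cfg.w[0] by the Init routines.
+ */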
+
+#define SKEIN_CFG_STR_LEN (4*8)
+
+/* bit field definitions in config block treeInfo word */
+#define SKEIN_CFG_TREE_LEAF_SIZE_POS (0)
+#define SKEIN_CFG_TREE_NODE_SIZE_POS (8)
+#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16)
+
+#define SKEIN_CFG_TREE_LEAF_SIZE_MSK \
+ (((uint64_t)0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
+#define SKEIN_CFG_TREE_NODE_SIZE_MSK \
+ (((uint64_t)0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
+#define SKEIN_CFG_TREE_MAX_LEVEL_MSK \
+ (((uint64_t)0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
+
+#define SKEIN_CFG_TREE_INFO(leaf, node, maxLvl) \
+ ((((uint64_t)(leaf)) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \
+ (((uint64_t)(node)) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \
+ (((uint64_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS))
+
+/* use as treeInfo in InitExt() call for sequential processing */
+#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0, 0, 0)
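+
+/*
+ * Worked example (editor's note): SKEIN_CFG_TREE_INFO() just packs three
+ * byte-wide fields, so e.g.
+ *
+ *	SKEIN_CFG_TREE_INFO(1, 2, 3) == 0x0000000000030201ULL
+ *
+ * while the sequential (non-tree) value above is simply 0.
+ */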
+
+/*
+ * Skein macros for getting/setting tweak words, etc.
+ * These are useful for partial input bytes, hash tree init/update, etc.
+ */
+#define Skein_Get_Tweak(ctxPtr, TWK_NUM) ((ctxPtr)->h.T[TWK_NUM])
+#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \
+ do { \
+ (ctxPtr)->h.T[TWK_NUM] = (tVal); \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
+#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr, 0)
+#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr, 1)
+#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0)
+#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1)
+
+/* set both tweak words at once */
+#define Skein_Set_T0_T1(ctxPtr, T0, T1) \
+ do { \
+ Skein_Set_T0(ctxPtr, (T0)); \
+ Skein_Set_T1(ctxPtr, (T1)); \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
+#define Skein_Set_Type(ctxPtr, BLK_TYPE) \
+ Skein_Set_T1(ctxPtr, SKEIN_T1_BLK_TYPE_##BLK_TYPE)
+
+/*
+ * set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0;
+ */
+#define Skein_Start_New_Type(ctxPtr, BLK_TYPE) \
+ do { \
+ Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | \
+ SKEIN_T1_BLK_TYPE_ ## BLK_TYPE); \
+ (ctxPtr)->h.bCnt = 0; \
+ _NOTE(CONSTCOND) \
+ } while (0)
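+
+/*
+ * Expansion example (editor's note, not in the original header):
+ * Skein_Start_New_Type(ctx, MSG) boils down to
+ *
+ *	ctx->h.T[0] = 0;
+ *	ctx->h.T[1] = SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_MSG;
+ *	ctx->h.bCnt = 0;
+ *
+ * which is how skein.c switches the tweak over to message processing
+ * once the CONFIG (or KEY) block has been absorbed.
+ */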
+
+#define Skein_Clear_First_Flag(hdr) \
+ do { \
+ (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; \
+ _NOTE(CONSTCOND) \
+ } while (0)
+#define Skein_Set_Bit_Pad_Flag(hdr) \
+ do { \
+ (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
+#define Skein_Set_Tree_Level(hdr, height) \
+ do { \
+ (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height); \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
+/*
+ * "Internal" Skein definitions for debugging and error checking
+ * Note: in Illumos we always disable debugging features.
+ */
+#define Skein_Show_Block(bits, ctx, X, blkPtr, wPtr, ksEvenPtr, ksOddPtr)
+#define Skein_Show_Round(bits, ctx, r, X)
+#define Skein_Show_R_Ptr(bits, ctx, r, X_ptr)
+#define Skein_Show_Final(bits, ctx, cnt, outPtr)
+#define Skein_Show_Key(bits, ctx, key, keyBytes)
+
+/* run-time checks (e.g., bad params, uninitialized context)? */
+#ifndef SKEIN_ERR_CHECK
+/* default: ignore all Asserts, for performance */
+#define Skein_Assert(x, retCode)
+#define Skein_assert(x)
+#elif defined(SKEIN_ASSERT)
+#include <sys/debug.h>
+#define Skein_Assert(x, retCode) ASSERT(x)
+#define Skein_assert(x) ASSERT(x)
+#else
+#include <sys/debug.h>
+/* caller error */
+#define Skein_Assert(x, retCode) \
+ do { \
+ if (!(x)) \
+ return (retCode); \
+ _NOTE(CONSTCOND) \
+ } while (0)
+/* internal error */
+#define Skein_assert(x) ASSERT(x)
+#endif
+
+/*
+ * Skein block function constants (shared across Ref and Opt code)
+ */
+enum {
+ /* Skein_256 round rotation constants */
+ R_256_0_0 = 14, R_256_0_1 = 16,
+ R_256_1_0 = 52, R_256_1_1 = 57,
+ R_256_2_0 = 23, R_256_2_1 = 40,
+ R_256_3_0 = 5, R_256_3_1 = 37,
+ R_256_4_0 = 25, R_256_4_1 = 33,
+ R_256_5_0 = 46, R_256_5_1 = 12,
+ R_256_6_0 = 58, R_256_6_1 = 22,
+ R_256_7_0 = 32, R_256_7_1 = 32,
+
+ /* Skein_512 round rotation constants */
+ R_512_0_0 = 46, R_512_0_1 = 36, R_512_0_2 = 19, R_512_0_3 = 37,
+ R_512_1_0 = 33, R_512_1_1 = 27, R_512_1_2 = 14, R_512_1_3 = 42,
+ R_512_2_0 = 17, R_512_2_1 = 49, R_512_2_2 = 36, R_512_2_3 = 39,
+ R_512_3_0 = 44, R_512_3_1 = 9, R_512_3_2 = 54, R_512_3_3 = 56,
+ R_512_4_0 = 39, R_512_4_1 = 30, R_512_4_2 = 34, R_512_4_3 = 24,
+ R_512_5_0 = 13, R_512_5_1 = 50, R_512_5_2 = 10, R_512_5_3 = 17,
+ R_512_6_0 = 25, R_512_6_1 = 29, R_512_6_2 = 39, R_512_6_3 = 43,
+ R_512_7_0 = 8, R_512_7_1 = 35, R_512_7_2 = 56, R_512_7_3 = 22,
+
+ /* Skein1024 round rotation constants */
+ R1024_0_0 = 24, R1024_0_1 = 13, R1024_0_2 = 8, R1024_0_3 = 47,
+ R1024_0_4 = 8, R1024_0_5 = 17, R1024_0_6 = 22, R1024_0_7 = 37,
+ R1024_1_0 = 38, R1024_1_1 = 19, R1024_1_2 = 10, R1024_1_3 = 55,
+ R1024_1_4 = 49, R1024_1_5 = 18, R1024_1_6 = 23, R1024_1_7 = 52,
+ R1024_2_0 = 33, R1024_2_1 = 4, R1024_2_2 = 51, R1024_2_3 = 13,
+ R1024_2_4 = 34, R1024_2_5 = 41, R1024_2_6 = 59, R1024_2_7 = 17,
+ R1024_3_0 = 5, R1024_3_1 = 20, R1024_3_2 = 48, R1024_3_3 = 41,
+ R1024_3_4 = 47, R1024_3_5 = 28, R1024_3_6 = 16, R1024_3_7 = 25,
+ R1024_4_0 = 41, R1024_4_1 = 9, R1024_4_2 = 37, R1024_4_3 = 31,
+ R1024_4_4 = 12, R1024_4_5 = 47, R1024_4_6 = 44, R1024_4_7 = 30,
+ R1024_5_0 = 16, R1024_5_1 = 34, R1024_5_2 = 56, R1024_5_3 = 51,
+ R1024_5_4 = 4, R1024_5_5 = 53, R1024_5_6 = 42, R1024_5_7 = 41,
+ R1024_6_0 = 31, R1024_6_1 = 44, R1024_6_2 = 47, R1024_6_3 = 46,
+ R1024_6_4 = 19, R1024_6_5 = 42, R1024_6_6 = 44, R1024_6_7 = 25,
+ R1024_7_0 = 9, R1024_7_1 = 48, R1024_7_2 = 35, R1024_7_3 = 52,
+ R1024_7_4 = 23, R1024_7_5 = 31, R1024_7_6 = 37, R1024_7_7 = 20
+};
+
+/* number of rounds for the different block sizes */
+#define SKEIN_256_ROUNDS_TOTAL (72)
+#define SKEIN_512_ROUNDS_TOTAL (72)
+#define SKEIN1024_ROUNDS_TOTAL (80)
+
+extern const uint64_t SKEIN_256_IV_128[];
+extern const uint64_t SKEIN_256_IV_160[];
+extern const uint64_t SKEIN_256_IV_224[];
+extern const uint64_t SKEIN_256_IV_256[];
+extern const uint64_t SKEIN_512_IV_128[];
+extern const uint64_t SKEIN_512_IV_160[];
+extern const uint64_t SKEIN_512_IV_224[];
+extern const uint64_t SKEIN_512_IV_256[];
+extern const uint64_t SKEIN_512_IV_384[];
+extern const uint64_t SKEIN_512_IV_512[];
+extern const uint64_t SKEIN1024_IV_384[];
+extern const uint64_t SKEIN1024_IV_512[];
+extern const uint64_t SKEIN1024_IV_1024[];
+
+/* Functions to process blkCnt (nonzero) full block(s) of data. */
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd);
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd);
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd);
+
+#endif /* _SKEIN_IMPL_H_ */
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/skein_iv.c b/sys/contrib/openzfs/module/icp/algs/skein/skein_iv.c
new file mode 100644
index 000000000000..140d38f76547
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/skein_iv.c
@@ -0,0 +1,185 @@
+/*
+ * Pre-computed Skein IVs
+ *
+ * NOTE: these values are not "magic" constants, but
+ * are generated using the Threefish block function.
+ * They are pre-computed here only for speed; i.e., to
+ * avoid the need for a Threefish call during Init().
+ *
+ * The IV for any fixed hash length may be pre-computed.
+ * Only the most common values are included here.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+/*
+ * Illumos implementation note: these constants are for Skein v1.3 as per:
+ * http://www.skein-hash.info/sites/default/files/skein1.3.pdf
+ */
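+
+/*
+ * Editor's sketch (not part of the source): any entry below can be
+ * regenerated the way the "default" branch of the *_Init() routines in
+ * skein.c does it -- start from an all-zero state and push the CONFIG
+ * block through the block function.  For SKEIN_256_IV_256, with "ctx"
+ * and "cfg" declared as in Skein_256_Init():
+ *
+ *	ctx->h.hashBitLen = 256;
+ *	Skein_Start_New_Type(ctx, CFG_FINAL);
+ *	cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ *	cfg.w[1] = Skein_Swap64(256);
+ *	cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+ *	bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+ *	bzero(ctx->X, sizeof (ctx->X));
+ *	Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+ *
+ * after which ctx->X[] holds the four words of SKEIN_256_IV_256.
+ */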
+
+#include <sys/skein.h> /* get Skein macros and types */
+#include "skein_impl.h" /* get internal definitions */
+
+#define MK_64 SKEIN_MK_64
+
+/* blkSize = 256 bits. hashSize = 128 bits */
+const uint64_t SKEIN_256_IV_128[] = {
+ MK_64(0xE1111906, 0x964D7260),
+ MK_64(0x883DAAA7, 0x7C8D811C),
+ MK_64(0x10080DF4, 0x91960F7A),
+ MK_64(0xCCF7DDE5, 0xB45BC1C2)
+};
+
+/* blkSize = 256 bits. hashSize = 160 bits */
+const uint64_t SKEIN_256_IV_160[] = {
+ MK_64(0x14202314, 0x72825E98),
+ MK_64(0x2AC4E9A2, 0x5A77E590),
+ MK_64(0xD47A5856, 0x8838D63E),
+ MK_64(0x2DD2E496, 0x8586AB7D)
+};
+
+/* blkSize = 256 bits. hashSize = 224 bits */
+const uint64_t SKEIN_256_IV_224[] = {
+ MK_64(0xC6098A8C, 0x9AE5EA0B),
+ MK_64(0x876D5686, 0x08C5191C),
+ MK_64(0x99CB88D7, 0xD7F53884),
+ MK_64(0x384BDDB1, 0xAEDDB5DE)
+};
+
+/* blkSize = 256 bits. hashSize = 256 bits */
+const uint64_t SKEIN_256_IV_256[] = {
+ MK_64(0xFC9DA860, 0xD048B449),
+ MK_64(0x2FCA6647, 0x9FA7D833),
+ MK_64(0xB33BC389, 0x6656840F),
+ MK_64(0x6A54E920, 0xFDE8DA69)
+};
+
+/* blkSize = 512 bits. hashSize = 128 bits */
+const uint64_t SKEIN_512_IV_128[] = {
+ MK_64(0xA8BC7BF3, 0x6FBF9F52),
+ MK_64(0x1E9872CE, 0xBD1AF0AA),
+ MK_64(0x309B1790, 0xB32190D3),
+ MK_64(0xBCFBB854, 0x3F94805C),
+ MK_64(0x0DA61BCD, 0x6E31B11B),
+ MK_64(0x1A18EBEA, 0xD46A32E3),
+ MK_64(0xA2CC5B18, 0xCE84AA82),
+ MK_64(0x6982AB28, 0x9D46982D)
+};
+
+/* blkSize = 512 bits. hashSize = 160 bits */
+const uint64_t SKEIN_512_IV_160[] = {
+ MK_64(0x28B81A2A, 0xE013BD91),
+ MK_64(0xC2F11668, 0xB5BDF78F),
+ MK_64(0x1760D8F3, 0xF6A56F12),
+ MK_64(0x4FB74758, 0x8239904F),
+ MK_64(0x21EDE07F, 0x7EAF5056),
+ MK_64(0xD908922E, 0x63ED70B8),
+ MK_64(0xB8EC76FF, 0xECCB52FA),
+ MK_64(0x01A47BB8, 0xA3F27A6E)
+};
+
+/* blkSize = 512 bits. hashSize = 224 bits */
+const uint64_t SKEIN_512_IV_224[] = {
+ MK_64(0xCCD06162, 0x48677224),
+ MK_64(0xCBA65CF3, 0xA92339EF),
+ MK_64(0x8CCD69D6, 0x52FF4B64),
+ MK_64(0x398AED7B, 0x3AB890B4),
+ MK_64(0x0F59D1B1, 0x457D2BD0),
+ MK_64(0x6776FE65, 0x75D4EB3D),
+ MK_64(0x99FBC70E, 0x997413E9),
+ MK_64(0x9E2CFCCF, 0xE1C41EF7)
+};
+
+/* blkSize = 512 bits. hashSize = 256 bits */
+const uint64_t SKEIN_512_IV_256[] = {
+ MK_64(0xCCD044A1, 0x2FDB3E13),
+ MK_64(0xE8359030, 0x1A79A9EB),
+ MK_64(0x55AEA061, 0x4F816E6F),
+ MK_64(0x2A2767A4, 0xAE9B94DB),
+ MK_64(0xEC06025E, 0x74DD7683),
+ MK_64(0xE7A436CD, 0xC4746251),
+ MK_64(0xC36FBAF9, 0x393AD185),
+ MK_64(0x3EEDBA18, 0x33EDFC13)
+};
+
+/* blkSize = 512 bits. hashSize = 384 bits */
+const uint64_t SKEIN_512_IV_384[] = {
+ MK_64(0xA3F6C6BF, 0x3A75EF5F),
+ MK_64(0xB0FEF9CC, 0xFD84FAA4),
+ MK_64(0x9D77DD66, 0x3D770CFE),
+ MK_64(0xD798CBF3, 0xB468FDDA),
+ MK_64(0x1BC4A666, 0x8A0E4465),
+ MK_64(0x7ED7D434, 0xE5807407),
+ MK_64(0x548FC1AC, 0xD4EC44D6),
+ MK_64(0x266E1754, 0x6AA18FF8)
+};
+
+/* blkSize = 512 bits. hashSize = 512 bits */
+const uint64_t SKEIN_512_IV_512[] = {
+ MK_64(0x4903ADFF, 0x749C51CE),
+ MK_64(0x0D95DE39, 0x9746DF03),
+ MK_64(0x8FD19341, 0x27C79BCE),
+ MK_64(0x9A255629, 0xFF352CB1),
+ MK_64(0x5DB62599, 0xDF6CA7B0),
+ MK_64(0xEABE394C, 0xA9D5C3F4),
+ MK_64(0x991112C7, 0x1A75B523),
+ MK_64(0xAE18A40B, 0x660FCC33)
+};
+
+/* blkSize = 1024 bits. hashSize = 384 bits */
+const uint64_t SKEIN1024_IV_384[] = {
+ MK_64(0x5102B6B8, 0xC1894A35),
+ MK_64(0xFEEBC9E3, 0xFE8AF11A),
+ MK_64(0x0C807F06, 0xE32BED71),
+ MK_64(0x60C13A52, 0xB41A91F6),
+ MK_64(0x9716D35D, 0xD4917C38),
+ MK_64(0xE780DF12, 0x6FD31D3A),
+ MK_64(0x797846B6, 0xC898303A),
+ MK_64(0xB172C2A8, 0xB3572A3B),
+ MK_64(0xC9BC8203, 0xA6104A6C),
+ MK_64(0x65909338, 0xD75624F4),
+ MK_64(0x94BCC568, 0x4B3F81A0),
+ MK_64(0x3EBBF51E, 0x10ECFD46),
+ MK_64(0x2DF50F0B, 0xEEB08542),
+ MK_64(0x3B5A6530, 0x0DBC6516),
+ MK_64(0x484B9CD2, 0x167BBCE1),
+ MK_64(0x2D136947, 0xD4CBAFEA)
+};
+
+/* blkSize = 1024 bits. hashSize = 512 bits */
+const uint64_t SKEIN1024_IV_512[] = {
+ MK_64(0xCAEC0E5D, 0x7C1B1B18),
+ MK_64(0xA01B0E04, 0x5F03E802),
+ MK_64(0x33840451, 0xED912885),
+ MK_64(0x374AFB04, 0xEAEC2E1C),
+ MK_64(0xDF25A0E2, 0x813581F7),
+ MK_64(0xE4004093, 0x8B12F9D2),
+ MK_64(0xA662D539, 0xC2ED39B6),
+ MK_64(0xFA8B85CF, 0x45D8C75A),
+ MK_64(0x8316ED8E, 0x29EDE796),
+ MK_64(0x053289C0, 0x2E9F91B8),
+ MK_64(0xC3F8EF1D, 0x6D518B73),
+ MK_64(0xBDCEC3C4, 0xD5EF332E),
+ MK_64(0x549A7E52, 0x22974487),
+ MK_64(0x67070872, 0x5B749816),
+ MK_64(0xB9CD28FB, 0xF0581BD1),
+ MK_64(0x0E2940B8, 0x15804974)
+};
+
+/* blkSize = 1024 bits. hashSize = 1024 bits */
+const uint64_t SKEIN1024_IV_1024[] = {
+ MK_64(0xD593DA07, 0x41E72355),
+ MK_64(0x15B5E511, 0xAC73E00C),
+ MK_64(0x5180E5AE, 0xBAF2C4F0),
+ MK_64(0x03BD41D3, 0xFCBCAFAF),
+ MK_64(0x1CAEC6FD, 0x1983A898),
+ MK_64(0x6E510B8B, 0xCDD0589F),
+ MK_64(0x77E2BDFD, 0xC6394ADA),
+ MK_64(0xC11E1DB5, 0x24DCB0A3),
+ MK_64(0xD6D14AF9, 0xC6329AB5),
+ MK_64(0x6A9B0BFC, 0x6EB67E0D),
+ MK_64(0x9243C60D, 0xCCFF1332),
+ MK_64(0x1A1F1DDE, 0x743F02D4),
+ MK_64(0x0996753C, 0x10ED0BB8),
+ MK_64(0x6572DD22, 0xF2B4969A),
+ MK_64(0x61FD3062, 0xD00A579A),
+ MK_64(0x1DE0536E, 0x8682E539)
+};
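The tables above are consumed by the Skein Init() routines: for a common output size the code can simply copy the matching precomputed IV into the chaining state instead of running Threefish over the configuration block. A minimal sketch of that dispatch for the 512-bit block size follows (illustrative only; the helper name pick_skein512_iv() is hypothetical, and the real selection logic lives in skein.c, outside this hunk):

/*
 * Hypothetical helper: return the precomputed 512-bit-block IV for a
 * supported output size, or NULL when the IV must be computed instead.
 */
static const uint64_t *
pick_skein512_iv(size_t hash_bit_len)
{
	switch (hash_bit_len) {
	case 128:
		return (SKEIN_512_IV_128);
	case 160:
		return (SKEIN_512_IV_160);
	case 224:
		return (SKEIN_512_IV_224);
	case 256:
		return (SKEIN_512_IV_256);
	case 384:
		return (SKEIN_512_IV_384);
	case 512:
		return (SKEIN_512_IV_512);
	default:
		return (NULL);	/* uncommon size: compute via Threefish */
	}
}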
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/skein_port.h b/sys/contrib/openzfs/module/icp/algs/skein/skein_port.h
new file mode 100644
index 000000000000..ce4353082552
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/skein_port.h
@@ -0,0 +1,116 @@
+/*
+ * Platform-specific definitions for Skein hash function.
+ *
+ * Source code author: Doug Whiting, 2008.
+ *
+ * This algorithm and source code is released to the public domain.
+ *
+ * Many thanks to Brian Gladman for his portable header files.
+ *
+ * To port Skein to an "unsupported" platform, change the definitions
+ * in this file appropriately.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#ifndef _SKEIN_PORT_H_
+#define _SKEIN_PORT_H_
+
+#include <sys/types.h> /* get integer type definitions */
+
+#ifndef RotL_64
+#define RotL_64(x, N) (((x) << (N)) | ((x) >> (64 - (N))))
+#endif
+
+/*
+ * Skein is "natively" little-endian (unlike SHA-xxx), for optimal
+ * performance on x86 CPUs. The Skein code requires the following
+ * definitions for dealing with endianness:
+ *
+ * SKEIN_NEED_SWAP: 0 for little-endian, 1 for big-endian
+ * Skein_Put64_LSB_First
+ * Skein_Get64_LSB_First
+ * Skein_Swap64
+ *
+ * If SKEIN_NEED_SWAP is defined at compile time, it is used here
+ * along with the portable versions of Put64/Get64/Swap64, which
+ * are slow in general.
+ *
+ * Otherwise, an "auto-detect" of endianness is attempted below.
+ * If the default handling doesn't work well, the user may insert
+ * platform-specific code instead (e.g., for big-endian CPUs).
+ *
+ */
+#ifndef SKEIN_NEED_SWAP /* compile-time "override" for endianness? */
+
+#include <sys/isa_defs.h> /* get endianness selection */
+
+#if defined(_ZFS_BIG_ENDIAN)
+/* here for big-endian CPUs */
+#define SKEIN_NEED_SWAP (1)
+#else
+/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
+#define SKEIN_NEED_SWAP (0)
+#define Skein_Put64_LSB_First(dst08, src64, bCnt) bcopy(src64, dst08, bCnt)
+#define Skein_Get64_LSB_First(dst64, src08, wCnt) \
+ bcopy(src08, dst64, 8 * (wCnt))
+#endif
+
+#endif /* ifndef SKEIN_NEED_SWAP */
+
+/*
+ * Provide any definitions still needed.
+ */
+#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */
+#if SKEIN_NEED_SWAP
+#define Skein_Swap64(w64) \
+ (((((uint64_t)(w64)) & 0xFF) << 56) | \
+ (((((uint64_t)(w64)) >> 8) & 0xFF) << 48) | \
+ (((((uint64_t)(w64)) >> 16) & 0xFF) << 40) | \
+ (((((uint64_t)(w64)) >> 24) & 0xFF) << 32) | \
+ (((((uint64_t)(w64)) >> 32) & 0xFF) << 24) | \
+ (((((uint64_t)(w64)) >> 40) & 0xFF) << 16) | \
+ (((((uint64_t)(w64)) >> 48) & 0xFF) << 8) | \
+ (((((uint64_t)(w64)) >> 56) & 0xFF)))
+#else
+#define Skein_Swap64(w64) (w64)
+#endif
+#endif /* ifndef Skein_Swap64 */
+
+#ifndef Skein_Put64_LSB_First
+static inline void
+Skein_Put64_LSB_First(uint8_t *dst, const uint64_t *src, size_t bCnt)
+{
+ /*
+ * this version is fully portable (big-endian or little-endian),
+ * but slow
+ */
+ size_t n;
+
+ for (n = 0; n < bCnt; n++)
+ dst[n] = (uint8_t)(src[n >> 3] >> (8 * (n & 7)));
+}
+#endif /* ifndef Skein_Put64_LSB_First */
+
+#ifndef Skein_Get64_LSB_First
+static inline void
+Skein_Get64_LSB_First(uint64_t *dst, const uint8_t *src, size_t wCnt)
+{
+ /*
+ * this version is fully portable (big-endian or little-endian),
+ * but slow
+ */
+ size_t n;
+
+ for (n = 0; n < 8 * wCnt; n += 8)
+ dst[n / 8] = (((uint64_t)src[n])) +
+ (((uint64_t)src[n + 1]) << 8) +
+ (((uint64_t)src[n + 2]) << 16) +
+ (((uint64_t)src[n + 3]) << 24) +
+ (((uint64_t)src[n + 4]) << 32) +
+ (((uint64_t)src[n + 5]) << 40) +
+ (((uint64_t)src[n + 6]) << 48) +
+ (((uint64_t)src[n + 7]) << 56);
+}
+#endif /* ifndef Skein_Get64_LSB_First */
+
+#endif /* _SKEIN_PORT_H_ */
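To see what the portable fallbacks above do, the standalone userland sketch below (not part of the patch) duplicates their byte-ordering logic — the get side is written as a compact, equivalent loop — and verifies that a put/get round trip is lossless. The helper names are local to this example.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Local copy of Skein_Put64_LSB_First(): emit words as little-endian bytes. */
static void
put64_lsb_first(uint8_t *dst, const uint64_t *src, size_t bCnt)
{
	size_t n;

	for (n = 0; n < bCnt; n++)
		dst[n] = (uint8_t)(src[n >> 3] >> (8 * (n & 7)));
}

/* Compact equivalent of Skein_Get64_LSB_First(): rebuild words from bytes. */
static void
get64_lsb_first(uint64_t *dst, const uint8_t *src, size_t wCnt)
{
	size_t w, i;

	for (w = 0; w < wCnt; w++) {
		uint64_t v = 0;

		for (i = 0; i < 8; i++)
			v |= (uint64_t)src[8 * w + i] << (8 * i);
		dst[w] = v;
	}
}

int
main(void)
{
	uint64_t in[2] = { 0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL };
	uint8_t bytes[16];
	uint64_t out[2];

	put64_lsb_first(bytes, in, sizeof (bytes));
	get64_lsb_first(out, bytes, 2);
	(void) printf("round trip %s\n",
	    memcmp(in, out, sizeof (in)) == 0 ? "ok" : "FAILED");
	return (0);
}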
diff --git a/sys/contrib/openzfs/module/icp/api/kcf_cipher.c b/sys/contrib/openzfs/module/icp/api/kcf_cipher.c
new file mode 100644
index 000000000000..d6aa48147edb
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/api/kcf_cipher.c
@@ -0,0 +1,930 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * Encryption and decryption routines.
+ */
+
+/*
+ * The following are the possible returned values common to all the routines
+ * below. The applicability of some of these return values depends on the
+ * presence of the arguments.
+ *
+ * CRYPTO_SUCCESS: The operation completed successfully.
+ * CRYPTO_QUEUED: A request was submitted successfully. The callback
+ * routine will be called when the operation is done.
+ * CRYPTO_INVALID_MECH_NUMBER, CRYPTO_INVALID_MECH_PARAM, or
+ * CRYPTO_INVALID_MECH for problems with the 'mech'.
+ * CRYPTO_INVALID_DATA for bogus 'data'
+ * CRYPTO_HOST_MEMORY for failure to allocate memory to handle this work.
+ * CRYPTO_INVALID_CONTEXT: Not a valid context.
+ * CRYPTO_BUSY: Cannot process the request now. Schedule a
+ * crypto_bufcall(), or try later.
+ * CRYPTO_NOT_SUPPORTED and CRYPTO_MECH_NOT_SUPPORTED: No provider is
+ * capable of a function or a mechanism.
+ * CRYPTO_INVALID_KEY: bogus 'key' argument.
+ * CRYPTO_INVALID_PLAINTEXT: bogus 'plaintext' argument.
+ * CRYPTO_INVALID_CIPHERTEXT: bogus 'ciphertext' argument.
+ */
+
+/*
+ * crypto_cipher_init_prov()
+ *
+ * Arguments:
+ *
+ * pd: provider descriptor
+ * sid: session id
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ * tmpl: a crypto_ctx_template_t, opaque template of a context of an
+ * encryption or decryption with the 'mech' using 'key'.
+ * 'tmpl' is created by a previous call to
+ * crypto_create_ctx_template().
+ * ctxp: Pointer to a crypto_context_t.
+ * func: CRYPTO_FG_ENCRYPT or CRYPTO_FG_DECRYPT.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * This is a common function invoked internally by both
+ * crypto_encrypt_init() and crypto_decrypt_init().
+ * Asynchronously submits a request for, or synchronously performs the
+ * initialization of an encryption or a decryption operation.
+ * When possible and applicable, will internally use the pre-expanded key
+ * schedule from the context template, tmpl.
+ * When complete and successful, 'ctxp' will contain a crypto_context_t
+ * valid for later calls to encrypt_update() and encrypt_final(), or
+ * decrypt_update() and decrypt_final().
+ * The caller should hold a reference on the specified provider
+ * descriptor before calling this function.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+static int
+crypto_cipher_init_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_spi_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq, crypto_func_group_t func)
+{
+ int error;
+ crypto_ctx_t *ctx;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ if (func == CRYPTO_FG_ENCRYPT) {
+ error = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_ENCRYPT);
+ } else {
+ error = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_DECRYPT);
+ }
+
+ if (error != CRYPTO_SUCCESS)
+ return (error);
+ }
+
+ /* Allocate and initialize the canonical context */
+ if ((ctx = kcf_new_ctx(crq, real_provider, sid)) == NULL) {
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, real_provider, &lmech);
+
+ if (func == CRYPTO_FG_ENCRYPT)
+ error = KCF_PROV_ENCRYPT_INIT(real_provider, ctx,
+ &lmech, key, tmpl, KCF_SWFP_RHNDL(crq));
+ else {
+ ASSERT(func == CRYPTO_FG_DECRYPT);
+
+ error = KCF_PROV_DECRYPT_INIT(real_provider, ctx,
+ &lmech, key, tmpl, KCF_SWFP_RHNDL(crq));
+ }
+ KCF_PROV_INCRSTATS(pd, error);
+
+ goto done;
+ }
+
+ /* Check if context sharing is possible */
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ key->ck_format == CRYPTO_KEY_RAW &&
+ KCF_CAN_SHARE_OPSTATE(pd, mech->cm_type)) {
+ kcf_context_t *tctxp = (kcf_context_t *)ctx;
+ kcf_provider_desc_t *tpd = NULL;
+ crypto_mech_info_t *sinfo;
+
+ if ((kcf_get_sw_prov(mech->cm_type, &tpd, &tctxp->kc_mech,
+ B_FALSE) == CRYPTO_SUCCESS)) {
+ int tlen;
+
+ sinfo = &(KCF_TO_PROV_MECHINFO(tpd, mech->cm_type));
+ /*
+ * key->ck_length from the consumer is always in bits.
+ * We convert it to be in the same unit registered by
+ * the provider in order to do a comparison.
+ */
+ if (sinfo->cm_mech_flags & CRYPTO_KEYSIZE_UNIT_IN_BYTES)
+ tlen = key->ck_length >> 3;
+ else
+ tlen = key->ck_length;
+ /*
+ * Check if the software provider can support context
+ * sharing and support this key length.
+ */
+ if ((sinfo->cm_mech_flags & CRYPTO_CAN_SHARE_OPSTATE) &&
+ (tlen >= sinfo->cm_min_key_length) &&
+ (tlen <= sinfo->cm_max_key_length)) {
+ ctx->cc_flags = CRYPTO_INIT_OPSTATE;
+ tctxp->kc_sw_prov_desc = tpd;
+ } else
+ KCF_PROV_REFRELE(tpd);
+ }
+ }
+
+ if (func == CRYPTO_FG_ENCRYPT) {
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_INIT, sid,
+ mech, key, NULL, NULL, tmpl);
+ } else {
+ ASSERT(func == CRYPTO_FG_DECRYPT);
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_INIT, sid,
+ mech, key, NULL, NULL, tmpl);
+ }
+
+ error = kcf_submit_request(real_provider, ctx, crq, &params,
+ B_FALSE);
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+done:
+ if ((error == CRYPTO_SUCCESS) || (error == CRYPTO_QUEUED))
+ *ctxp = (crypto_context_t)ctx;
+ else {
+ /* Release the hold done in kcf_new_ctx(). */
+ KCF_CONTEXT_REFRELE((kcf_context_t *)ctx->cc_framework_private);
+ }
+
+ return (error);
+}
+
+/*
+ * Same as crypto_cipher_init_prov(), but relies on the scheduler to pick
+ * an appropriate provider. See crypto_cipher_init_prov() comments for more
+ * details.
+ */
+static int
+crypto_cipher_init(crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq, crypto_func_group_t func)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, func, CHECK_RESTRICT(crq), 0)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+ /*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+ */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ error = crypto_cipher_init_prov(pd, pd->pd_sid, mech, key,
+ spi_ctx_tmpl, ctxp, crq, func);
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_encrypt_prov()
+ *
+ * Arguments:
+ * pd: provider descriptor
+ * sid: session id
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ * plaintext: The message to be encrypted
+ * ciphertext: Storage for the encrypted message. The length needed
+ * depends on the mechanism, and the plaintext's size.
+ * tmpl: a crypto_ctx_template_t, opaque template of a context of an
+ * encryption with the 'mech' using 'key'. 'tmpl' is created by
+ * a previous call to crypto_create_ctx_template().
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * single-part encryption of 'plaintext' with the mechanism 'mech', using
+ * the key 'key'.
+ * When complete and successful, 'ciphertext' will contain the encrypted
+ * message.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_encrypt_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_data_t *plaintext, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_data_t *ciphertext,
+ crypto_call_req_t *crq)
+{
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+ int error;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ error = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_ENCRYPT_ATOMIC);
+
+ if (error != CRYPTO_SUCCESS)
+ return (error);
+ }
+
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech, key,
+ plaintext, ciphertext, tmpl);
+
+ error = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE);
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ return (error);
+}
+
+/*
+ * Same as crypto_encrypt_prov(), but relies on the scheduler to pick
+ * a provider. See crypto_encrypt_prov() for more details.
+ */
+int
+crypto_encrypt(crypto_mechanism_t *mech, crypto_data_t *plaintext,
+ crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *ciphertext,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, CRYPTO_FG_ENCRYPT_ATOMIC, CHECK_RESTRICT(crq),
+ plaintext->cd_length)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+ /*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+ */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech);
+
+ error = KCF_PROV_ENCRYPT_ATOMIC(pd, pd->pd_sid, &lmech, key,
+ plaintext, ciphertext, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, pd->pd_sid,
+ mech, key, plaintext, ciphertext, spi_ctx_tmpl);
+ error = kcf_submit_request(pd, NULL, crq, &params, B_FALSE);
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_encrypt_init_prov()
+ *
+ * Calls crypto_cipher_init_prov() to initialize an encryption operation.
+ */
+int
+crypto_encrypt_init_prov(crypto_provider_t pd, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ return (crypto_cipher_init_prov(pd, sid, mech, key, tmpl, ctxp, crq,
+ CRYPTO_FG_ENCRYPT));
+}
+
+/*
+ * crypto_encrypt_init()
+ *
+ * Calls crypto_cipher_init() to initialize an encryption operation
+ */
+int
+crypto_encrypt_init(crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ return (crypto_cipher_init(mech, key, tmpl, ctxp, crq,
+ CRYPTO_FG_ENCRYPT));
+}
+
+/*
+ * crypto_encrypt_update()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by encrypt_init().
+ * plaintext: The message part to be encrypted
+ * ciphertext: Storage for the encrypted message part.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * part of an encryption operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_encrypt_update(crypto_context_t context, crypto_data_t *plaintext,
+ crypto_data_t *ciphertext, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_ENCRYPT_UPDATE(pd, ctx, plaintext,
+ ciphertext, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ return (error);
+ }
+
+ /* Check if we should use a software provider for small jobs */
+ if ((ctx->cc_flags & CRYPTO_USE_OPSTATE) && cr == NULL) {
+ if (plaintext->cd_length < kcf_ctx->kc_mech->me_threshold &&
+ kcf_ctx->kc_sw_prov_desc != NULL &&
+ KCF_IS_PROV_USABLE(kcf_ctx->kc_sw_prov_desc)) {
+ pd = kcf_ctx->kc_sw_prov_desc;
+ }
+ }
+
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_UPDATE,
+ ctx->cc_session, NULL, NULL, plaintext, ciphertext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+
+ return (error);
+}
+
+/*
+ * crypto_encrypt_final()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by encrypt_init().
+ * ciphertext: Storage for the last part of encrypted message
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * final part of an encryption operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_encrypt_final(crypto_context_t context, crypto_data_t *ciphertext,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_ENCRYPT_FINAL(pd, ctx, ciphertext, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_FINAL,
+ ctx->cc_session, NULL, NULL, NULL, ciphertext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+/*
+ * crypto_decrypt_prov()
+ *
+ * Arguments:
+ * pd: provider descriptor
+ * sid: session id
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ *	ciphertext: The message to be decrypted
+ *	plaintext: Storage for the decrypted message. The length needed
+ *		depends on the mechanism, and the ciphertext's size.
+ *	tmpl: a crypto_ctx_template_t, opaque template of a context of a
+ *		decryption with the 'mech' using 'key'. 'tmpl' is created by
+ * a previous call to crypto_create_ctx_template().
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * single-part decryption of 'ciphertext' with the mechanism 'mech', using
+ * the key 'key'.
+ * When complete and successful, 'plaintext' will contain the decrypted
+ * message.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_decrypt_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_data_t *ciphertext, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_data_t *plaintext,
+ crypto_call_req_t *crq)
+{
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+ int rv;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ rv = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_DECRYPT_ATOMIC);
+
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech, key,
+ ciphertext, plaintext, tmpl);
+
+ rv = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE);
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ return (rv);
+}
+
+/*
+ * Same as crypto_decrypt_prov(), but relies on the KCF scheduler to
+ * choose a provider. See crypto_decrypt_prov() comments for more
+ * information.
+ */
+int
+crypto_decrypt(crypto_mechanism_t *mech, crypto_data_t *ciphertext,
+ crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *plaintext,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, CRYPTO_FG_DECRYPT_ATOMIC, CHECK_RESTRICT(crq),
+ ciphertext->cd_length)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+ /*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+ */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech);
+
+ error = KCF_PROV_DECRYPT_ATOMIC(pd, pd->pd_sid, &lmech, key,
+ ciphertext, plaintext, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, pd->pd_sid,
+ mech, key, ciphertext, plaintext, spi_ctx_tmpl);
+ error = kcf_submit_request(pd, NULL, crq, &params, B_FALSE);
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_decrypt_init_prov()
+ *
+ * Calls crypto_cipher_init_prov() to initialize a decryption operation
+ */
+int
+crypto_decrypt_init_prov(crypto_provider_t pd, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ return (crypto_cipher_init_prov(pd, sid, mech, key, tmpl, ctxp, crq,
+ CRYPTO_FG_DECRYPT));
+}
+
+/*
+ * crypto_decrypt_init()
+ *
+ * Calls crypto_cipher_init() to initialize a decryption operation
+ */
+int
+crypto_decrypt_init(crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ return (crypto_cipher_init(mech, key, tmpl, ctxp, crq,
+ CRYPTO_FG_DECRYPT));
+}
+
+/*
+ * crypto_decrypt_update()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by decrypt_init().
+ * ciphertext: The message part to be decrypted
+ * plaintext: Storage for the decrypted message part.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * part of a decryption operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_decrypt_update(crypto_context_t context, crypto_data_t *ciphertext,
+ crypto_data_t *plaintext, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DECRYPT_UPDATE(pd, ctx, ciphertext,
+ plaintext, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ return (error);
+ }
+
+ /* Check if we should use a software provider for small jobs */
+ if ((ctx->cc_flags & CRYPTO_USE_OPSTATE) && cr == NULL) {
+ if (ciphertext->cd_length < kcf_ctx->kc_mech->me_threshold &&
+ kcf_ctx->kc_sw_prov_desc != NULL &&
+ KCF_IS_PROV_USABLE(kcf_ctx->kc_sw_prov_desc)) {
+ pd = kcf_ctx->kc_sw_prov_desc;
+ }
+ }
+
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_UPDATE,
+ ctx->cc_session, NULL, NULL, ciphertext, plaintext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+
+ return (error);
+}
+
+/*
+ * crypto_decrypt_final()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by decrypt_init().
+ * plaintext: Storage for the last part of the decrypted message
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * final part of a decryption operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_decrypt_final(crypto_context_t context, crypto_data_t *plaintext,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DECRYPT_FINAL(pd, ctx, plaintext,
+ NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_FINAL,
+ ctx->cc_session, NULL, NULL, NULL, plaintext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+/*
+ * See comments for crypto_encrypt_update().
+ */
+int
+crypto_encrypt_single(crypto_context_t context, crypto_data_t *plaintext,
+ crypto_data_t *ciphertext, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_ENCRYPT(pd, ctx, plaintext,
+ ciphertext, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid,
+ NULL, NULL, plaintext, ciphertext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+/*
+ * See comments for crypto_decrypt_update().
+ */
+int
+crypto_decrypt_single(crypto_context_t context, crypto_data_t *ciphertext,
+ crypto_data_t *plaintext, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DECRYPT(pd, ctx, ciphertext,
+ plaintext, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid,
+ NULL, NULL, ciphertext, plaintext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_encrypt_prov);
+EXPORT_SYMBOL(crypto_encrypt);
+EXPORT_SYMBOL(crypto_encrypt_init_prov);
+EXPORT_SYMBOL(crypto_encrypt_init);
+EXPORT_SYMBOL(crypto_encrypt_update);
+EXPORT_SYMBOL(crypto_encrypt_final);
+EXPORT_SYMBOL(crypto_decrypt_prov);
+EXPORT_SYMBOL(crypto_decrypt);
+EXPORT_SYMBOL(crypto_decrypt_init_prov);
+EXPORT_SYMBOL(crypto_decrypt_init);
+EXPORT_SYMBOL(crypto_decrypt_update);
+EXPORT_SYMBOL(crypto_decrypt_final);
+EXPORT_SYMBOL(crypto_encrypt_single);
+EXPORT_SYMBOL(crypto_decrypt_single);
+#endif
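To tie the multi-part entry points above together, here is a minimal caller-side sketch (illustrative, not part of the patch). It assumes the crypto_mechanism_t, crypto_key_t, and crypto_data_t descriptors have already been set up, passes a NULL crypto_call_req_t on the assumption that this selects the synchronous path, and omits error-path context teardown.

static int
encrypt_two_parts(crypto_mechanism_t *mech, crypto_key_t *key,
    crypto_ctx_template_t tmpl, crypto_data_t *pt1, crypto_data_t *ct1,
    crypto_data_t *pt2, crypto_data_t *ct2, crypto_data_t *ct_final)
{
	crypto_context_t ctx;
	int rv;

	/* NULL call req: assumed synchronous operation. */
	rv = crypto_encrypt_init(mech, key, tmpl, &ctx, NULL);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	rv = crypto_encrypt_update(ctx, pt1, ct1, NULL);
	if (rv == CRYPTO_SUCCESS)
		rv = crypto_encrypt_update(ctx, pt2, ct2, NULL);
	if (rv != CRYPTO_SUCCESS) {
		/* Error-path context teardown omitted in this sketch. */
		return (rv);
	}

	/* The final call also drops the context hold taken at init. */
	return (crypto_encrypt_final(ctx, ct_final, NULL));
}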
diff --git a/sys/contrib/openzfs/module/icp/api/kcf_ctxops.c b/sys/contrib/openzfs/module/icp/api/kcf_ctxops.c
new file mode 100644
index 000000000000..21b0977d3634
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/api/kcf_ctxops.c
@@ -0,0 +1,151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * Crypto contexts manipulation routines
+ */
+
+/*
+ * crypto_create_ctx_template()
+ *
+ * Arguments:
+ *
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ *	ptmpl: storage for the opaque crypto_ctx_template_t, allocated and
+ * initialized by the software provider this routine is
+ * dispatched to.
+ * kmflag: KM_SLEEP/KM_NOSLEEP mem. alloc. flag.
+ *
+ * Description:
+ * Redirects the call to the software provider of the specified
+ * mechanism. That provider will allocate and pre-compute/pre-expand
+ * the context template, reusable by later calls to crypto_xxx_init().
+ * The size and address of that provider context template are stored
+ * in an internal structure, kcf_ctx_template_t. The address of that
+ * structure is given back to the caller in *ptmpl.
+ *
+ * Context:
+ * Process or interrupt.
+ *
+ * Returns:
+ * CRYPTO_SUCCESS when the context template is successfully created.
+ * CRYPTO_HOST_MEMORY: mem alloc failure
+ * CRYPTO_ARGUMENTS_BAD: NULL storage for the ctx template.
+ *	CRYPTO_MECHANISM_INVALID: invalid mechanism 'mech'.
+ */
+int
+crypto_create_ctx_template(crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t *ptmpl, int kmflag)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_mechanism_t prov_mech;
+
+	/* Basic argument validation */
+
+ if (ptmpl == NULL)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (mech == NULL)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ error = kcf_get_sw_prov(mech->cm_type, &pd, &me, B_TRUE);
+ if (error != CRYPTO_SUCCESS)
+ return (error);
+
+ if ((ctx_tmpl = (kcf_ctx_template_t *)kmem_alloc(
+ sizeof (kcf_ctx_template_t), kmflag)) == NULL) {
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /* Pass a mechtype that the provider understands */
+ prov_mech.cm_type = KCF_TO_PROV_MECHNUM(pd, mech->cm_type);
+ prov_mech.cm_param = mech->cm_param;
+ prov_mech.cm_param_len = mech->cm_param_len;
+
+ error = KCF_PROV_CREATE_CTX_TEMPLATE(pd, &prov_mech, key,
+ &(ctx_tmpl->ct_prov_tmpl), &(ctx_tmpl->ct_size), KCF_RHNDL(kmflag));
+
+ if (error == CRYPTO_SUCCESS) {
+ ctx_tmpl->ct_generation = me->me_gen_swprov;
+ *ptmpl = ctx_tmpl;
+ } else {
+ kmem_free(ctx_tmpl, sizeof (kcf_ctx_template_t));
+ }
+ KCF_PROV_REFRELE(pd);
+
+ return (error);
+}
+
+/*
+ * crypto_destroy_ctx_template()
+ *
+ * Arguments:
+ *
+ * tmpl: an opaque crypto_ctx_template_t previously created by
+ * crypto_create_ctx_template()
+ *
+ * Description:
+ * Frees the embedded crypto_spi_ctx_template_t, then the
+ * kcf_ctx_template_t.
+ *
+ * Context:
+ * Process or interrupt.
+ *
+ */
+void
+crypto_destroy_ctx_template(crypto_ctx_template_t tmpl)
+{
+ kcf_ctx_template_t *ctx_tmpl = (kcf_ctx_template_t *)tmpl;
+
+ if (ctx_tmpl == NULL)
+ return;
+
+ ASSERT(ctx_tmpl->ct_prov_tmpl != NULL);
+
+ bzero(ctx_tmpl->ct_prov_tmpl, ctx_tmpl->ct_size);
+ kmem_free(ctx_tmpl->ct_prov_tmpl, ctx_tmpl->ct_size);
+ kmem_free(ctx_tmpl, sizeof (kcf_ctx_template_t));
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_create_ctx_template);
+EXPORT_SYMBOL(crypto_destroy_ctx_template);
+#endif
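A minimal sketch of the template lifecycle (illustrative, not part of the patch): pre-expand the key schedule once, reuse the template across several atomic crypto_encrypt() calls from kcf_cipher.c above, then free it. Synchronous (NULL call req) use is assumed, and the CRYPTO_OLD_CTX_TEMPLATE recovery described in kcf_cipher.c is noted but not implemented.

static int
encrypt_many(crypto_mechanism_t *mech, crypto_key_t *key,
    crypto_data_t **pt, crypto_data_t **ct, int n)
{
	crypto_ctx_template_t tmpl = NULL;
	int i, rv;

	/* Pre-expand the key schedule once; KM_SLEEP may block for memory. */
	rv = crypto_create_ctx_template(mech, key, &tmpl, KM_SLEEP);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	for (i = 0; i < n && rv == CRYPTO_SUCCESS; i++) {
		/*
		 * A production consumer would also handle
		 * CRYPTO_OLD_CTX_TEMPLATE by destroying and
		 * recreating the template.
		 */
		rv = crypto_encrypt(mech, pt[i], key, tmpl, ct[i], NULL);
	}

	crypto_destroy_ctx_template(tmpl);
	return (rv);
}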
diff --git a/sys/contrib/openzfs/module/icp/api/kcf_digest.c b/sys/contrib/openzfs/module/icp/api/kcf_digest.c
new file mode 100644
index 000000000000..aa68d69bc162
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/api/kcf_digest.c
@@ -0,0 +1,491 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * Message digest routines
+ */
+
+/*
+ * The following are the possible returned values common to all the routines
+ * below. The applicability of some of these return values depends on the
+ * presence of the arguments.
+ *
+ * CRYPTO_SUCCESS: The operation completed successfully.
+ * CRYPTO_QUEUED: A request was submitted successfully. The callback
+ * routine will be called when the operation is done.
+ * CRYPTO_MECHANISM_INVALID or CRYPTO_INVALID_MECH_PARAM
+ * for problems with the 'mech'.
+ * CRYPTO_INVALID_DATA for bogus 'data'
+ * CRYPTO_HOST_MEMORY for failure to allocate memory to handle this work.
+ * CRYPTO_INVALID_CONTEXT: Not a valid context.
+ * CRYPTO_BUSY: Cannot process the request now. Schedule a
+ * crypto_bufcall(), or try later.
+ * CRYPTO_NOT_SUPPORTED and CRYPTO_MECH_NOT_SUPPORTED:
+ * No provider is capable of a function or a mechanism.
+ */
+
+
+/*
+ * crypto_digest_prov()
+ *
+ * Arguments:
+ * pd: pointer to the descriptor of the provider to use for this
+ * operation.
+ * sid: provider session id.
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * data: The message to be digested.
+ * digest: Storage for the digest. The length needed depends on the
+ * mechanism.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * digesting operation of 'data' on the specified
+ * provider with the specified session.
+ * When complete and successful, 'digest' will contain the digest value.
+ * The caller should hold a reference on the specified provider
+ * descriptor before calling this function.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_digest_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_data_t *data, crypto_data_t *digest,
+ crypto_call_req_t *crq)
+{
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+ int rv;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ rv = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq),
+ pd, &real_provider, CRYPTO_FG_DIGEST_ATOMIC);
+
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech, NULL,
+ data, digest);
+
+ /* no crypto context to carry between multiple parts. */
+ rv = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE);
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ return (rv);
+}
+
+
+/*
+ * Same as crypto_digest_prov(), but relies on the KCF scheduler to
+ * choose a provider. See crypto_digest_prov() comments for more information.
+ */
+int
+crypto_digest(crypto_mechanism_t *mech, crypto_data_t *data,
+ crypto_data_t *digest, crypto_call_req_t *crq)
+{
+ int error;
+ kcf_provider_desc_t *pd;
+ kcf_req_params_t params;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* The pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, NULL, &error, list,
+ CRYPTO_FG_DIGEST_ATOMIC, CHECK_RESTRICT(crq),
+ data->cd_length)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech);
+ error = KCF_PROV_DIGEST_ATOMIC(pd, pd->pd_sid, &lmech, data,
+ digest, KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (pd->pd_flags & CRYPTO_HASH_NO_UPDATE) &&
+ (data->cd_length > pd->pd_hash_limit)) {
+ error = CRYPTO_BUFFER_TOO_BIG;
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_ATOMIC,
+ pd->pd_sid, mech, NULL, data, digest);
+
+ /* no crypto context to carry between multiple parts. */
+ error = kcf_submit_request(pd, NULL, crq, &params,
+ B_FALSE);
+ }
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_digest_init_prov()
+ *
+ * pd: pointer to the descriptor of the provider to use for this
+ * operation.
+ * sid: provider session id.
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * ctxp: Pointer to a crypto_context_t.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * initialization of a message digest operation on the specified
+ * provider with the specified session.
+ * When complete and successful, 'ctxp' will contain a crypto_context_t
+ * valid for later calls to digest_update() and digest_final().
+ * The caller should hold a reference on the specified provider
+ * descriptor before calling this function.
+ */
+int
+crypto_digest_init_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_context_t *ctxp, crypto_call_req_t *crq)
+{
+ int error;
+ crypto_ctx_t *ctx;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ error = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_DIGEST);
+
+ if (error != CRYPTO_SUCCESS)
+ return (error);
+ }
+
+ /* Allocate and initialize the canonical context */
+ if ((ctx = kcf_new_ctx(crq, real_provider, sid)) == NULL) {
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, real_provider, &lmech);
+ error = KCF_PROV_DIGEST_INIT(real_provider, ctx, &lmech,
+ KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_INIT, sid,
+ mech, NULL, NULL, NULL);
+ error = kcf_submit_request(real_provider, ctx, crq, &params,
+ B_FALSE);
+ }
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ if ((error == CRYPTO_SUCCESS) || (error == CRYPTO_QUEUED))
+ *ctxp = (crypto_context_t)ctx;
+ else {
+ /* Release the hold done in kcf_new_ctx(). */
+ KCF_CONTEXT_REFRELE((kcf_context_t *)ctx->cc_framework_private);
+ }
+
+ return (error);
+}
+
+/*
+ * Same as crypto_digest_init_prov(), but relies on the KCF scheduler
+ * to choose a provider. See crypto_digest_init_prov() comments for
+ * more information.
+ */
+int
+crypto_digest_init(crypto_mechanism_t *mech, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_provider_desc_t *pd;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* The pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, NULL, &error,
+ list, CRYPTO_FG_DIGEST, CHECK_RESTRICT(crq), 0)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (pd->pd_flags & CRYPTO_HASH_NO_UPDATE)) {
+ /*
+ * The hardware provider has limited digest support.
+		 * So, we fall back early here to using a software provider.
+ *
+ * XXX - need to enhance to do the fallback later in
+ * crypto_digest_update() if the size of accumulated input data
+ * exceeds the maximum size digestable by hardware provider.
+ */
+ error = CRYPTO_BUFFER_TOO_BIG;
+ } else {
+ error = crypto_digest_init_prov(pd, pd->pd_sid,
+ mech, ctxp, crq);
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_digest_update()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by digest_init().
+ * data: The part of message to be digested.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * part of a message digest operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_digest_update(crypto_context_t context, crypto_data_t *data,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DIGEST_UPDATE(pd, ctx, data, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_UPDATE,
+ ctx->cc_session, NULL, NULL, data, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ return (error);
+}
+
+/*
+ * crypto_digest_final()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by digest_init().
+ * digest: The storage for the digest.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * final part of a message digest operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_digest_final(crypto_context_t context, crypto_data_t *digest,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DIGEST_FINAL(pd, ctx, digest, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_FINAL,
+ ctx->cc_session, NULL, NULL, NULL, digest);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+/*
+ * Performs a digest update on the specified key. Note that there is
+ * no k-API crypto_digest_key() equivalent of this function.
+ */
+int
+crypto_digest_key_prov(crypto_context_t context, crypto_key_t *key,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DIGEST_KEY(pd, ctx, key, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_DIGEST_KEY,
+ ctx->cc_session, NULL, key, NULL, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ return (error);
+}
+
+/*
+ * See comments for crypto_digest_update() and crypto_digest_final().
+ */
+int
+crypto_digest_single(crypto_context_t context, crypto_data_t *data,
+ crypto_data_t *digest, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DIGEST(pd, ctx, data, digest, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid,
+ NULL, NULL, data, digest);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_digest_prov);
+EXPORT_SYMBOL(crypto_digest);
+EXPORT_SYMBOL(crypto_digest_init_prov);
+EXPORT_SYMBOL(crypto_digest_init);
+EXPORT_SYMBOL(crypto_digest_update);
+EXPORT_SYMBOL(crypto_digest_final);
+EXPORT_SYMBOL(crypto_digest_key_prov);
+EXPORT_SYMBOL(crypto_digest_single);
+#endif
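A matching multi-part digest sketch (illustrative, not part of the patch), again assuming pre-filled crypto_data_t descriptors, synchronous (NULL call req) operation, and no error-path context teardown:

static int
digest_two_parts(crypto_mechanism_t *mech, crypto_data_t *part1,
    crypto_data_t *part2, crypto_data_t *digest)
{
	crypto_context_t ctx;
	int rv;

	rv = crypto_digest_init(mech, &ctx, NULL);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	rv = crypto_digest_update(ctx, part1, NULL);
	if (rv == CRYPTO_SUCCESS)
		rv = crypto_digest_update(ctx, part2, NULL);
	if (rv != CRYPTO_SUCCESS) {
		/* Error-path context teardown omitted in this sketch. */
		return (rv);
	}

	/* The final call also drops the context hold taken at init. */
	return (crypto_digest_final(ctx, digest, NULL));
}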
diff --git a/sys/contrib/openzfs/module/icp/api/kcf_mac.c b/sys/contrib/openzfs/module/icp/api/kcf_mac.c
new file mode 100644
index 000000000000..a7722d8f914c
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/api/kcf_mac.c
@@ -0,0 +1,645 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * Message authentication codes routines.
+ */
+
+/*
+ * The following are the possible returned values common to all the routines
+ * below. The applicability of some of these return values depends on the
+ * presence of the arguments.
+ *
+ * CRYPTO_SUCCESS: The operation completed successfully.
+ * CRYPTO_QUEUED: A request was submitted successfully. The callback
+ * routine will be called when the operation is done.
+ * CRYPTO_INVALID_MECH_NUMBER, CRYPTO_INVALID_MECH_PARAM, or
+ * CRYPTO_INVALID_MECH for problems with the 'mech'.
+ * CRYPTO_INVALID_DATA for bogus 'data'
+ * CRYPTO_HOST_MEMORY for failure to allocate memory to handle this work.
+ * CRYPTO_INVALID_CONTEXT: Not a valid context.
+ * CRYPTO_BUSY: Cannot process the request now. Schedule a
+ * crypto_bufcall(), or try later.
+ * CRYPTO_NOT_SUPPORTED and CRYPTO_MECH_NOT_SUPPORTED: No provider is
+ * capable of a function or a mechanism.
+ * CRYPTO_INVALID_KEY: bogus 'key' argument.
+ * CRYPTO_INVALID_MAC: bogus 'mac' argument.
+ */
+
+/*
+ * crypto_mac_prov()
+ *
+ * Arguments:
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ * data: The message to compute the MAC for.
+ * mac: Storage for the MAC. The length needed depends on the mechanism.
+ * tmpl: a crypto_ctx_template_t, opaque template of a context of a
+ * MAC with the 'mech' using 'key'. 'tmpl' is created by
+ * a previous call to crypto_create_ctx_template().
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * single-part message authentication of 'data' with the mechanism
+ * 'mech', using the key 'key', on the specified provider with
+ * the specified session id.
+ * When complete and successful, 'mac' will contain the message
+ * authentication code.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'crq'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_mac_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_data_t *data, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_data_t *mac, crypto_call_req_t *crq)
+{
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+ int rv;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ rv = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_MAC_ATOMIC);
+
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech, key,
+ data, mac, tmpl);
+ rv = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE);
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ return (rv);
+}
+
+/*
+ * Same as crypto_mac_prov(), but relies on the KCF scheduler to choose
+ * a provider. See crypto_mac_prov() comments for more information.
+ */
+int
+crypto_mac(crypto_mechanism_t *mech, crypto_data_t *data,
+ crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *mac,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* The pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, CRYPTO_FG_MAC_ATOMIC, CHECK_RESTRICT(crq),
+ data->cd_length)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+	/*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+	 */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech);
+
+ error = KCF_PROV_MAC_ATOMIC(pd, pd->pd_sid, &lmech, key, data,
+ mac, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (pd->pd_flags & CRYPTO_HASH_NO_UPDATE) &&
+ (data->cd_length > pd->pd_hash_limit)) {
+ /*
+ * XXX - We need a check to see if this is indeed
+ * a HMAC. So far, all kernel clients use
+ * this interface only for HMAC. So, this is fine
+ * for now.
+ */
+ error = CRYPTO_BUFFER_TOO_BIG;
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_ATOMIC,
+ pd->pd_sid, mech, key, data, mac, spi_ctx_tmpl);
+
+ error = kcf_submit_request(pd, NULL, crq, &params,
+ KCF_ISDUALREQ(crq));
+ }
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
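+
+/*
+ * Minimal usage sketch, for illustration only: a synchronous, single-part
+ * HMAC-SHA256 computation over a raw buffer via crypto_mac(). The function
+ * name and the KCF_USAGE_EXAMPLE guard are hypothetical and nothing below
+ * is built by default; the crypto_mechanism_t, crypto_key_t and
+ * crypto_data_t field usage follows the conventions of this framework, and
+ * the headers already included by this file provide the declarations.
+ */
+#ifdef KCF_USAGE_EXAMPLE
+static int
+example_hmac_sha256(const uint8_t *key_buf, size_t key_len,
+    const uint8_t *msg, size_t msg_len, uint8_t *mac_buf, size_t mac_len)
+{
+	crypto_mechanism_t mech;
+	crypto_key_t key;
+	crypto_data_t in, out;
+
+	/* Resolve the mechanism name to a mechanism type. */
+	mech.cm_type = crypto_mech2id(SUN_CKM_SHA256_HMAC);
+	if (mech.cm_type == CRYPTO_MECH_INVALID)
+		return (CRYPTO_MECHANISM_INVALID);
+	mech.cm_param = NULL;
+	mech.cm_param_len = 0;
+
+	/* Raw key; ck_length is expressed in bits. */
+	key.ck_format = CRYPTO_KEY_RAW;
+	key.ck_data = (void *)key_buf;
+	key.ck_length = CRYPTO_BYTES2BITS(key_len);
+
+	/* Input message. */
+	in.cd_format = CRYPTO_DATA_RAW;
+	in.cd_offset = 0;
+	in.cd_length = msg_len;
+	in.cd_raw.iov_base = (char *)msg;
+	in.cd_raw.iov_len = msg_len;
+
+	/* Output MAC; must be large enough for the full digest length. */
+	out.cd_format = CRYPTO_DATA_RAW;
+	out.cd_offset = 0;
+	out.cd_length = mac_len;
+	out.cd_raw.iov_base = (char *)mac_buf;
+	out.cd_raw.iov_len = mac_len;
+
+	/* NULL template and NULL call_req select the synchronous path. */
+	return (crypto_mac(&mech, &in, &key, NULL, &out, NULL));
+}
+#endif /* KCF_USAGE_EXAMPLE */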
+
+/*
+ * Single part operation to compute the MAC corresponding to the specified
+ * 'data' and to verify that it matches the MAC specified by 'mac'.
+ * The other arguments are the same as the function crypto_mac_prov().
+ */
+int
+crypto_mac_verify_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_data_t *data, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_data_t *mac, crypto_call_req_t *crq)
+{
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+ int rv;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ rv = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_MAC_ATOMIC);
+
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_MAC_VERIFY_ATOMIC, sid, mech,
+ key, data, mac, tmpl);
+ rv = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE);
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ return (rv);
+}
+
+/*
+ * Same as crypto_mac_verify_prov(), but relies on the KCF scheduler to choose
+ * a provider. See crypto_mac_verify_prov() comments for more information.
+ */
+int
+crypto_mac_verify(crypto_mechanism_t *mech, crypto_data_t *data,
+ crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *mac,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* The pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, CRYPTO_FG_MAC_ATOMIC, CHECK_RESTRICT(crq),
+ data->cd_length)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+	/*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+	 */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech);
+
+ error = KCF_PROV_MAC_VERIFY_ATOMIC(pd, pd->pd_sid, &lmech, key,
+ data, mac, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (pd->pd_flags & CRYPTO_HASH_NO_UPDATE) &&
+ (data->cd_length > pd->pd_hash_limit)) {
+ /* see comments in crypto_mac() */
+ error = CRYPTO_BUFFER_TOO_BIG;
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params,
+ KCF_OP_MAC_VERIFY_ATOMIC, pd->pd_sid, mech,
+ key, data, mac, spi_ctx_tmpl);
+
+ error = kcf_submit_request(pd, NULL, crq, &params,
+ KCF_ISDUALREQ(crq));
+ }
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_mac_init_prov()
+ *
+ * Arguments:
+ * pd: pointer to the descriptor of the provider to use for this
+ * operation.
+ * sid: provider session id.
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ * tmpl: a crypto_ctx_template_t, opaque template of a context of a
+ * MAC with the 'mech' using 'key'. 'tmpl' is created by
+ * a previous call to crypto_create_ctx_template().
+ * ctxp: Pointer to a crypto_context_t.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * initialization of a MAC operation on the specified provider with
+ * the specified session.
+ * When possible and applicable, will internally use the pre-computed MAC
+ * context from the context template, tmpl.
+ * When complete and successful, 'ctxp' will contain a crypto_context_t
+ * valid for later calls to mac_update() and mac_final().
+ * The caller should hold a reference on the specified provider
+ * descriptor before calling this function.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_mac_init_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_key_t *key, crypto_spi_ctx_template_t tmpl,
+ crypto_context_t *ctxp, crypto_call_req_t *crq)
+{
+ int rv;
+ crypto_ctx_t *ctx;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ rv = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_MAC);
+
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+
+ /* Allocate and initialize the canonical context */
+ if ((ctx = kcf_new_ctx(crq, real_provider, sid)) == NULL) {
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, real_provider, &lmech);
+ rv = KCF_PROV_MAC_INIT(real_provider, ctx, &lmech, key, tmpl,
+ KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, rv);
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_INIT, sid, mech, key,
+ NULL, NULL, tmpl);
+ rv = kcf_submit_request(real_provider, ctx, crq, &params,
+ B_FALSE);
+ }
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ if ((rv == CRYPTO_SUCCESS) || (rv == CRYPTO_QUEUED))
+ *ctxp = (crypto_context_t)ctx;
+ else {
+ /* Release the hold done in kcf_new_ctx(). */
+ KCF_CONTEXT_REFRELE((kcf_context_t *)ctx->cc_framework_private);
+ }
+
+ return (rv);
+}
+
+/*
+ * Same as crypto_mac_init_prov(), but relies on the KCF scheduler to
+ * choose a provider. See crypto_mac_init_prov() comments for more
+ * information.
+ */
+int
+crypto_mac_init(crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* The pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, CRYPTO_FG_MAC, CHECK_RESTRICT(crq), 0)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+	/*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+	 */
+
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (pd->pd_flags & CRYPTO_HASH_NO_UPDATE)) {
+ /*
+ * The hardware provider has limited HMAC support.
+ * So, we fallback early here to using a software provider.
+ *
+ * XXX - need to enhance to do the fallback later in
+ * crypto_mac_update() if the size of accumulated input data
+ * exceeds the maximum size digestable by hardware provider.
+ */
+ error = CRYPTO_BUFFER_TOO_BIG;
+ } else {
+ error = crypto_mac_init_prov(pd, pd->pd_sid, mech, key,
+ spi_ctx_tmpl, ctxp, crq);
+ }
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_mac_update()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by mac_init().
+ * data: The message part to be MAC'ed
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * part of a MAC operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_mac_update(crypto_context_t context, crypto_data_t *data,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ kcf_req_params_t params;
+ int rv;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ rv = KCF_PROV_MAC_UPDATE(pd, ctx, data, NULL);
+ KCF_PROV_INCRSTATS(pd, rv);
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_UPDATE,
+ ctx->cc_session, NULL, NULL, data, NULL, NULL);
+ rv = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ return (rv);
+}
+
+/*
+ * crypto_mac_final()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by mac_init().
+ * mac: Storage for the message authentication code.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * part of a message authentication operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_mac_final(crypto_context_t context, crypto_data_t *mac,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ kcf_req_params_t params;
+ int rv;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ rv = KCF_PROV_MAC_FINAL(pd, ctx, mac, NULL);
+ KCF_PROV_INCRSTATS(pd, rv);
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_FINAL,
+ ctx->cc_session, NULL, NULL, NULL, mac, NULL);
+ rv = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(rv, kcf_ctx);
+ return (rv);
+}
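+
+/*
+ * Minimal usage sketch, for illustration only: the multi-part flow
+ * crypto_mac_init() / crypto_mac_update() / crypto_mac_final() for a
+ * message that arrives in two pieces. The function name and the
+ * KCF_USAGE_EXAMPLE guard are hypothetical (not built by default); 'mech',
+ * 'key' and the crypto_data_t wrappers are assumed to be set up as in the
+ * single-part sketch after crypto_mac(). A NULL crypto_call_req_t makes
+ * each step synchronous.
+ */
+#ifdef KCF_USAGE_EXAMPLE
+static int
+example_mac_multipart(crypto_mechanism_t *mech, crypto_key_t *key,
+    crypto_data_t *part1, crypto_data_t *part2, crypto_data_t *mac)
+{
+	crypto_context_t ctx;
+	int rv;
+
+	rv = crypto_mac_init(mech, key, NULL, &ctx, NULL);
+	if (rv != CRYPTO_SUCCESS)
+		return (rv);
+
+	rv = crypto_mac_update(ctx, part1, NULL);
+	if (rv == CRYPTO_SUCCESS)
+		rv = crypto_mac_update(ctx, part2, NULL);
+	if (rv != CRYPTO_SUCCESS)
+		return (rv);	/* context cleanup on error is omitted here */
+
+	/* crypto_mac_final() releases the context hold taken at init. */
+	return (crypto_mac_final(ctx, mac, NULL));
+}
+#endif /* KCF_USAGE_EXAMPLE */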
+
+/*
+ * See comments for crypto_mac_update() and crypto_mac_final().
+ */
+int
+crypto_mac_single(crypto_context_t context, crypto_data_t *data,
+ crypto_data_t *mac, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_MAC(pd, ctx, data, mac, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid,
+ NULL, NULL, data, mac, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_mac_prov);
+EXPORT_SYMBOL(crypto_mac);
+EXPORT_SYMBOL(crypto_mac_verify_prov);
+EXPORT_SYMBOL(crypto_mac_verify);
+EXPORT_SYMBOL(crypto_mac_init_prov);
+EXPORT_SYMBOL(crypto_mac_init);
+EXPORT_SYMBOL(crypto_mac_update);
+EXPORT_SYMBOL(crypto_mac_final);
+EXPORT_SYMBOL(crypto_mac_single);
+#endif
diff --git a/sys/contrib/openzfs/module/icp/api/kcf_miscapi.c b/sys/contrib/openzfs/module/icp/api/kcf_miscapi.c
new file mode 100644
index 000000000000..c0f415b264a7
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/api/kcf_miscapi.c
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * All event subscribers are put on a list. ntfy_list_lock
+ * protects changes to this list.
+ *
+ * The following locking order is maintained in the code: the
+ * global ntfy_list_lock is taken first, followed by the individual
+ * lock in a kcf_ntfy_elem structure (kn_lock).
+ */
+kmutex_t ntfy_list_lock;
+kcondvar_t ntfy_list_cv; /* cv the service thread waits on */
+static kcf_ntfy_elem_t *ntfy_list_head;
+
+/*
+ * crypto_mech2id()
+ *
+ * Arguments:
+ * . mechname: A null-terminated string identifying the mechanism name.
+ *
+ * Description:
+ * Walks the mechanism tables, looking for an entry that matches the
+ * mechname. Once it finds it, it builds the 64-bit mech_type and returns
+ * it. If there are no hardware or software providers for the mechanism,
+ * but there is an unloaded software provider, this routine will attempt
+ * to load it.
+ *
+ * Context:
+ * Process or interrupt context.
+ *
+ * Returns:
+ * The unique mechanism identified by 'mechname', if found.
+ * CRYPTO_MECH_INVALID otherwise.
+ */
+crypto_mech_type_t
+crypto_mech2id(char *mechname)
+{
+ return (crypto_mech2id_common(mechname, B_TRUE));
+}
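+
+/*
+ * Minimal usage sketch, for illustration only (the function name and the
+ * KCF_USAGE_EXAMPLE guard are hypothetical and not built by default):
+ * resolving a mechanism name string, typically done once and cached by the
+ * consumer. The SUN_CKM_* name strings come from sys/crypto/common.h.
+ */
+#ifdef KCF_USAGE_EXAMPLE
+static crypto_mech_type_t
+example_lookup_sha512_hmac(void)
+{
+	crypto_mech_type_t t = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+
+	/* CRYPTO_MECH_INVALID means no provider supports this mechanism. */
+	return (t);
+}
+#endif /* KCF_USAGE_EXAMPLE */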
+
+/*
+ * We walk the notification list and do the callbacks.
+ */
+void
+kcf_walk_ntfylist(uint32_t event, void *event_arg)
+{
+ kcf_ntfy_elem_t *nep;
+ int nelem = 0;
+
+ mutex_enter(&ntfy_list_lock);
+
+ /*
+ * Count how many clients are on the notification list. We need
+	 * this count to ensure that clients which joined the list after we
+	 * started this walk are not wrongly notified.
+ */
+ for (nep = ntfy_list_head; nep != NULL; nep = nep->kn_next)
+ nelem++;
+
+ for (nep = ntfy_list_head; (nep != NULL && nelem); nep = nep->kn_next) {
+ nelem--;
+
+ /*
+ * Check if this client is interested in the
+ * event.
+ */
+ if (!(nep->kn_event_mask & event))
+ continue;
+
+ mutex_enter(&nep->kn_lock);
+ nep->kn_state = NTFY_RUNNING;
+ mutex_exit(&nep->kn_lock);
+ mutex_exit(&ntfy_list_lock);
+
+ /*
+ * We invoke the callback routine with no locks held. Another
+ * client could have joined the list meanwhile. This is fine
+ * as we maintain nelem as stated above. The NULL check in the
+ * for loop guards against shrinkage. Also, any callers of
+ * crypto_unnotify_events() at this point cv_wait till kn_state
+ * changes to NTFY_WAITING. Hence, nep is assured to be valid.
+ */
+ (*nep->kn_func)(event, event_arg);
+
+ mutex_enter(&nep->kn_lock);
+ nep->kn_state = NTFY_WAITING;
+ cv_broadcast(&nep->kn_cv);
+ mutex_exit(&nep->kn_lock);
+
+ mutex_enter(&ntfy_list_lock);
+ }
+
+ mutex_exit(&ntfy_list_lock);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_mech2id);
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman
new file mode 100644
index 000000000000..48fea7bb333e
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman
@@ -0,0 +1,23 @@
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+ 1. source code distributions include the above copyright notice, this
+ list of conditions and the following disclaimer;
+
+ 2. binary distributions include the above copyright notice, this list
+ of conditions and the following disclaimer in their documentation;
+
+ 3. the name of the copyright holder is not used to endorse products
+ built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip
new file mode 100644
index 000000000000..5f822cf27586
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip
@@ -0,0 +1 @@
+PORTIONS OF AES FUNCTIONALITY
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl
new file mode 100644
index 000000000000..92c9e196a318
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl
@@ -0,0 +1,127 @@
+
+ LICENSE ISSUES
+ ==============
+
+ The OpenSSL toolkit stays under a dual license, i.e. both the conditions of
+ the OpenSSL License and the original SSLeay license apply to the toolkit.
+ See below for the actual license texts. Actually both licenses are BSD-style
+ Open Source licenses. In case of any license issues related to OpenSSL
+ please contact openssl-core@openssl.org.
+
+ OpenSSL License
+ ---------------
+
+/* ====================================================================
+ * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+ Original SSLeay License
+ -----------------------
+
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the routines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip
new file mode 100644
index 000000000000..5f822cf27586
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip
@@ -0,0 +1 @@
+PORTIONS OF AES FUNCTIONALITY
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_aesni.S b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_aesni.S
new file mode 100644
index 000000000000..4a80c62097ae
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_aesni.S
@@ -0,0 +1,748 @@
+/*
+ * ====================================================================
+ * Written by Intel Corporation for the OpenSSL project to add support
+ * for Intel AES-NI instructions. Rights for redistribution and usage
+ * in source and binary forms are granted according to the OpenSSL
+ * license.
+ *
+ * Author: Huang Ying <ying.huang at intel dot com>
+ * Vinodh Gopal <vinodh.gopal at intel dot com>
+ * Kahraman Akdemir
+ *
+ * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
+ * instructions that are going to be introduced in the next generation
+ * of Intel processor, as of 2009. These instructions enable fast and
+ * secure data encryption and decryption, using the Advanced Encryption
+ * Standard (AES), defined by FIPS Publication number 197. The
+ * architecture introduces six instructions that offer full hardware
+ * support for AES. Four of them support high performance data
+ * encryption and decryption, and the other two instructions support
+ * the AES key expansion procedure.
+ * ====================================================================
+ */
+
+/*
+ * ====================================================================
+ * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+/*
+ * ====================================================================
+ * OpenSolaris OS modifications
+ *
+ * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
+ * patches sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
+ * Huang Ying of Intel to the openssl-dev mailing list under the subject
+ * of "Add support to Intel AES-NI instruction set for x86_64 platform".
+ *
+ * This OpenSolaris version has these major changes from the original source:
+ *
+ * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
+ * definitions for lint.
+ *
+ * 2. Formatted code, added comments, and added #includes and #defines.
+ *
+ * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
+ * calling kpreempt_disable() and kpreempt_enable().
+ * If the TS bit is not set, save and restore %xmm registers at the beginning
+ * and end of function calls (%xmm* registers are not saved and restored
+ * during kernel thread preemption).
+ *
+ * 4. Renamed functions, reordered parameters, and changed return value
+ * to match OpenSolaris:
+ *
+ * OpenSSL interface:
+ * int intel_AES_set_encrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * int intel_AES_set_decrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return values for above are non-zero on error, 0 on success.
+ *
+ * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ * typedef struct aes_key_st {
+ * unsigned int rd_key[4 *(AES_MAXNR + 1)];
+ * int rounds;
+ * unsigned int pad[3];
+ * } AES_KEY;
+ * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules
+ * (ks32) instead of 64-bit (ks64)).
+ * Number of rounds (aka round count) is at offset 240 of AES_KEY.
+ *
+ * OpenSolaris OS interface (#ifdefs removed for readability):
+ * int rijndael_key_setup_dec_intel(uint32_t rk[],
+ * const uint32_t cipherKey[], uint64_t keyBits);
+ * int rijndael_key_setup_enc_intel(uint32_t rk[],
+ * const uint32_t cipherKey[], uint64_t keyBits);
+ * Return values for above are 0 on error, number of rounds on success.
+ *
+ * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4]);
+ * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4]);
+ * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
+ * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
+ *
+ * typedef union {
+ * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
+ * } aes_ks_t;
+ * typedef struct aes_key {
+ * aes_ks_t encr_ks, decr_ks;
+ * long double align128;
+ * int flags, nr, type;
+ * } aes_key_t;
+ *
+ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
+ * ct is crypto text, and MAX_AES_NR is 14.
+ * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
+ *
+ * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
+ *
+ * ====================================================================
+ */
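+
+/*
+ * Minimal call sequence, for illustration only, of the OpenSolaris OS
+ * interface described above; variable names are hypothetical. Per the
+ * notes above, kernel callers must disable preemption around these calls,
+ * and the key schedule storage must be 128-byte aligned (alignment is not
+ * shown in this sketch):
+ *
+ *	const uint32_t cipherKey[8] = { 0 };	// 256-bit key material
+ *	uint32_t rk[4 * (14 + 1)];		// encryption key schedule
+ *	uint32_t pt[4] = { 0 }, ct[4];		// one 16-byte block
+ *	int nr;					// number of rounds
+ *
+ *	kpreempt_disable();
+ *	nr = rijndael_key_setup_enc_intel(rk, cipherKey, 256);
+ *	if (nr != 0)				// 0 indicates an error
+ *		aes_encrypt_intel(rk, nr, pt, ct);
+ *	kpreempt_enable();
+ */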
+
+
+#if defined(lint) || defined(__lint)
+
+#include <sys/types.h>
+
+/* ARGSUSED */
+void
+aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4]) {
+}
+/* ARGSUSED */
+void
+aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ uint32_t pt[4]) {
+}
+/* ARGSUSED */
+int
+rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
+ uint64_t keyBits) {
+ return (0);
+}
+/* ARGSUSED */
+int
+rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
+ uint64_t keyBits) {
+ return (0);
+}
+
+
+#elif defined(HAVE_AES) /* guard by instruction set */
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+/*
+ * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
+ * _key_expansion_256a(), _key_expansion_256b()
+ *
+ * Helper functions called by rijndael_key_setup_enc_intel().
+ * Also used indirectly by rijndael_key_setup_dec_intel().
+ *
+ * Input:
+ * %xmm0 User-provided cipher key
+ * %xmm1 Round constant
+ * Output:
+ * (%rcx) AES key
+ */
+
+ENTRY_NP2(_key_expansion_128, _key_expansion_256a)
+_key_expansion_128_local:
+_key_expansion_256a_local:
+ pshufd $0b11111111, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+ movups %xmm0, (%rcx)
+ add $0x10, %rcx
+ ret
+ nop
+SET_SIZE(_key_expansion_128)
+SET_SIZE(_key_expansion_256a)
+
+
+ENTRY_NP(_key_expansion_192a)
+_key_expansion_192a_local:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movups %xmm2, %xmm5
+ movups %xmm2, %xmm6
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movups %xmm0, %xmm1
+ shufps $0b01000100, %xmm0, %xmm6
+ movups %xmm6, (%rcx)
+ shufps $0b01001110, %xmm2, %xmm1
+ movups %xmm1, 0x10(%rcx)
+ add $0x20, %rcx
+ ret
+SET_SIZE(_key_expansion_192a)
+
+
+ENTRY_NP(_key_expansion_192b)
+_key_expansion_192b_local:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movups %xmm2, %xmm5
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movups %xmm0, (%rcx)
+ add $0x10, %rcx
+ ret
+SET_SIZE(_key_expansion_192b)
+
+
+ENTRY_NP(_key_expansion_256b)
+_key_expansion_256b_local:
+ pshufd $0b10101010, %xmm1, %xmm1
+ shufps $0b00010000, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ shufps $0b10001100, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ pxor %xmm1, %xmm2
+ movups %xmm2, (%rcx)
+ add $0x10, %rcx
+ ret
+SET_SIZE(_key_expansion_256b)
+
+
+/*
+ * rijndael_key_setup_enc_intel()
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * OpenSolaris interface:
+ * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
+ * uint64_t keyBits);
+ * Return value is 0 on error, number of rounds on success.
+ *
+ * Original Intel OpenSSL interface:
+ * int intel_AES_set_encrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return value is non-zero on error, 0 on success.
+ */
+
+#ifdef OPENSSL_INTERFACE
+#define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key
+#define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key
+
+#define USERCIPHERKEY rdi /* P1, 64 bits */
+#define KEYSIZE32 esi /* P2, 32 bits */
+#define KEYSIZE64 rsi /* P2, 64 bits */
+#define AESKEY rdx /* P3, 64 bits */
+
+#else /* OpenSolaris Interface */
+#define AESKEY rdi /* P1, 64 bits */
+#define USERCIPHERKEY rsi /* P2, 64 bits */
+#define KEYSIZE32 edx /* P3, 32 bits */
+#define KEYSIZE64 rdx /* P3, 64 bits */
+#endif /* OPENSSL_INTERFACE */
+
+#define ROUNDS32 KEYSIZE32 /* temp */
+#define ROUNDS64 KEYSIZE64 /* temp */
+#define ENDAESKEY USERCIPHERKEY /* temp */
+
+ENTRY_NP(rijndael_key_setup_enc_intel)
+rijndael_key_setup_enc_intel_local:
+ FRAME_BEGIN
+ // NULL pointer sanity check
+ test %USERCIPHERKEY, %USERCIPHERKEY
+ jz .Lenc_key_invalid_param
+ test %AESKEY, %AESKEY
+ jz .Lenc_key_invalid_param
+
+ movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes)
+ movups %xmm0, (%AESKEY)
+ lea 0x10(%AESKEY), %rcx // key addr
+ pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x
+
+ cmp $256, %KEYSIZE32
+ jnz .Lenc_key192
+
+ // AES 256: 14 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $14, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14
+#endif /* OPENSSL_INTERFACE */
+
+ movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes)
+ movups %xmm2, (%rcx)
+ add $0x10, %rcx
+
+ aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x1, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x2, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x4, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x8, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x10, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x20, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* OpenSolaris Interface */
+ mov $14, %rax // return # rounds = 14
+#endif
+ FRAME_END
+ ret
+
+.align 4
+.Lenc_key192:
+ cmp $192, %KEYSIZE32
+ jnz .Lenc_key128
+
+ // AES 192: 12 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $12, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 12
+#endif /* OPENSSL_INTERFACE */
+
+ movq 0x10(%USERCIPHERKEY), %xmm2 // other user key
+ aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local
+ aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local
+ aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local
+ aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local
+ aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local
+ aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local
+ aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local
+ aeskeygenassist $0x80, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local
+
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* OpenSolaris Interface */
+ mov $12, %rax // return # rounds = 12
+#endif
+ FRAME_END
+ ret
+
+.align 4
+.Lenc_key128:
+ cmp $128, %KEYSIZE32
+ jnz .Lenc_key_invalid_key_bits
+
+ // AES 128: 10 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $10, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10
+#endif /* OPENSSL_INTERFACE */
+
+ aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* OpenSolaris Interface */
+ mov $10, %rax // return # rounds = 10
+#endif
+ FRAME_END
+ ret
+
+.Lenc_key_invalid_param:
+#ifdef OPENSSL_INTERFACE
+ mov $-1, %rax // user key or AES key pointer is NULL
+ FRAME_END
+ ret
+#else
+ /* FALLTHROUGH */
+#endif /* OPENSSL_INTERFACE */
+
+.Lenc_key_invalid_key_bits:
+#ifdef OPENSSL_INTERFACE
+ mov $-2, %rax // keysize is invalid
+#else /* OpenSolaris Interface */
+ xor %rax, %rax // a key pointer is NULL or invalid keysize
+#endif /* OPENSSL_INTERFACE */
+ FRAME_END
+ ret
+ SET_SIZE(rijndael_key_setup_enc_intel)
+
+
+/*
+ * rijndael_key_setup_dec_intel()
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * OpenSolaris interface:
+ * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
+ * uint64_t keyBits);
+ * Return value is 0 on error, number of rounds on success.
+ * P1->P2, P2->P3, P3->P1
+ *
+ * Original Intel OpenSSL interface:
+ * int intel_AES_set_decrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return value is non-zero on error, 0 on success.
+ */
+
+ENTRY_NP(rijndael_key_setup_dec_intel)
+FRAME_BEGIN
+ // Generate round keys used for encryption
+ call rijndael_key_setup_enc_intel_local
+ test %rax, %rax
+#ifdef OPENSSL_INTERFACE
+ jnz .Ldec_key_exit // Failed if returned non-0
+#else /* OpenSolaris Interface */
+ jz .Ldec_key_exit // Failed if returned 0
+#endif /* OPENSSL_INTERFACE */
+
+ /*
+ * Convert round keys used for encryption
+ * to a form usable for decryption
+ */
+#ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */
+ mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14)
+ // (already set for OpenSSL)
+#endif
+
+ lea 0x10(%AESKEY), %rcx // key addr
+ shl $4, %ROUNDS32
+ add %AESKEY, %ROUNDS64
+ mov %ROUNDS64, %ENDAESKEY
+
+.align 4
+.Ldec_key_reorder_loop:
+ movups (%AESKEY), %xmm0
+ movups (%ROUNDS64), %xmm1
+ movups %xmm0, (%ROUNDS64)
+ movups %xmm1, (%AESKEY)
+ lea 0x10(%AESKEY), %AESKEY
+ lea -0x10(%ROUNDS64), %ROUNDS64
+ cmp %AESKEY, %ROUNDS64
+ ja .Ldec_key_reorder_loop
+
+.align 4
+.Ldec_key_inv_loop:
+ movups (%rcx), %xmm0
+ // Convert an encryption round key to a form usable for decryption
+ // with the "AES Inverse Mix Columns" instruction
+ aesimc %xmm0, %xmm1
+ movups %xmm1, (%rcx)
+ lea 0x10(%rcx), %rcx
+ cmp %ENDAESKEY, %rcx
+ jnz .Ldec_key_inv_loop
+
+.Ldec_key_exit:
+ // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
+ // OpenSSL: rax = 0 for OK, or non-zero for error
+ FRAME_END
+ ret
+ SET_SIZE(rijndael_key_setup_dec_intel)
+
+
+/*
+ * aes_encrypt_intel()
+ * Encrypt a single block (in and out can overlap).
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * Temporary register usage:
+ * %xmm0 State
+ * %xmm1 Key
+ *
+ * Original OpenSolaris Interface:
+ * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key)
+ */
+
+#ifdef OPENSSL_INTERFACE
+#define aes_encrypt_intel intel_AES_encrypt
+#define aes_decrypt_intel intel_AES_decrypt
+
+#define INP rdi /* P1, 64 bits */
+#define OUTP rsi /* P2, 64 bits */
+#define KEYP rdx /* P3, 64 bits */
+
+/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */
+#define NROUNDS32 ecx /* temporary, 32 bits */
+#define NROUNDS cl /* temporary, 8 bits */
+
+#else /* OpenSolaris Interface */
+#define KEYP rdi /* P1, 64 bits */
+#define NROUNDS esi /* P2, 32 bits */
+#define INP rdx /* P3, 64 bits */
+#define OUTP rcx /* P4, 64 bits */
+#endif /* OPENSSL_INTERFACE */
+
+#define STATE xmm0 /* temporary, 128 bits */
+#define KEY xmm1 /* temporary, 128 bits */
+
+
+ENTRY_NP(aes_encrypt_intel)
+
+ movups (%INP), %STATE // input
+ movups (%KEYP), %KEY // key
+#ifdef OPENSSL_INTERFACE
+ mov 240(%KEYP), %NROUNDS32 // round count
+#else /* OpenSolaris Interface */
+ /* Round count is already present as P2 in %rsi/%esi */
+#endif /* OPENSSL_INTERFACE */
+
+ pxor %KEY, %STATE // round 0
+ lea 0x30(%KEYP), %KEYP
+ cmp $12, %NROUNDS
+ jb .Lenc128
+ lea 0x20(%KEYP), %KEYP
+ je .Lenc192
+
+ // AES 256
+ lea 0x20(%KEYP), %KEYP
+ movups -0x60(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups -0x50(%KEYP), %KEY
+ aesenc %KEY, %STATE
+
+.align 4
+.Lenc192:
+ // AES 192 and 256
+ movups -0x40(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups -0x30(%KEYP), %KEY
+ aesenc %KEY, %STATE
+
+.align 4
+.Lenc128:
+ // AES 128, 192, and 256
+ movups -0x20(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups -0x10(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups (%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x10(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x20(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x30(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x40(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x50(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x60(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x70(%KEYP), %KEY
+ aesenclast %KEY, %STATE // last round
+ movups %STATE, (%OUTP) // output
+
+ ret
+ SET_SIZE(aes_encrypt_intel)
+
+
+/*
+ * aes_decrypt_intel()
+ * Decrypt a single block (in and out can overlap).
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * Temporary register usage:
+ * %xmm0 State
+ * %xmm1 Key
+ *
+ * Original OpenSolaris Interface:
+ * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ */
+ENTRY_NP(aes_decrypt_intel)
+
+ movups (%INP), %STATE // input
+ movups (%KEYP), %KEY // key
+#ifdef OPENSSL_INTERFACE
+ mov 240(%KEYP), %NROUNDS32 // round count
+#else /* OpenSolaris Interface */
+ /* Round count is already present as P2 in %rsi/%esi */
+#endif /* OPENSSL_INTERFACE */
+
+ pxor %KEY, %STATE // round 0
+ lea 0x30(%KEYP), %KEYP
+ cmp $12, %NROUNDS
+ jb .Ldec128
+ lea 0x20(%KEYP), %KEYP
+ je .Ldec192
+
+ // AES 256
+ lea 0x20(%KEYP), %KEYP
+ movups -0x60(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups -0x50(%KEYP), %KEY
+ aesdec %KEY, %STATE
+
+.align 4
+.Ldec192:
+ // AES 192 and 256
+ movups -0x40(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups -0x30(%KEYP), %KEY
+ aesdec %KEY, %STATE
+
+.align 4
+.Ldec128:
+ // AES 128, 192, and 256
+ movups -0x20(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups -0x10(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups (%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x10(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x20(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x30(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x40(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x50(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x60(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x70(%KEYP), %KEY
+ aesdeclast %KEY, %STATE // last round
+ movups %STATE, (%OUTP) // output
+
+ ret
+ SET_SIZE(aes_decrypt_intel)
+
+#endif /* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S
new file mode 100644
index 000000000000..9db3a3179230
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S
@@ -0,0 +1,906 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue 20/12/2007
+ *
+ * I am grateful to Dag Arne Osvik for many discussions of the techniques that
+ * can be used to optimise AES assembler code on AMD64/EM64T architectures.
+ * Some of the techniques used in this implementation are the result of
+ * suggestions made by him for which I am most grateful.
+ *
+ * An AES implementation for AMD64 processors using the YASM assembler. This
+ * implementation provides only encryption, decryption and hence requires key
+ * scheduling support in C. It uses 8k bytes of tables but its encryption and
+ * decryption performance is very close to that obtained using large tables.
+ * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
+ * which are as follows:
+ * ms windows gnu/linux/opensolaris os
+ *
+ * in_blk rcx rdi
+ * out_blk rdx rsi
+ * context (cx) r8 rdx
+ *
+ * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15
+ * registers rdi - on both
+ *
+ * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11
+ * registers - rdi on both
+ *
+ * The convention used here is that for gnu/linux/opensolaris os.
+ *
+ * This code provides the standard AES block size (128 bits, 16 bytes) and the
+ * three standard AES key sizes (128, 192 and 256 bits). It has the same call
+ * interface as my C implementation. It uses the Microsoft C AMD64 calling
+ * conventions in which the three parameters are placed in rcx, rdx and r8
+ * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
+ *
+ * OpenSolaris Note:
+ * Modified to use GNU/Linux/Solaris calling conventions.
+ * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
+ *
+ * AES_RETURN aes_encrypt(const unsigned char in_blk[],
+ * unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
+ *
+ * AES_RETURN aes_decrypt(const unsigned char in_blk[],
+ * unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
+ *
+ * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
+ * const aes_encrypt_ctx cx[1]);
+ *
+ * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
+ * const aes_decrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_encrypt_key(const unsigned char key[],
+ * unsigned int len, const aes_decrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_decrypt_key(const unsigned char key[],
+ * unsigned int len, const aes_decrypt_ctx cx[1])/
+ *
+ * where <NNN> is 128, 192 or 256. In the last two calls the length can be in
+ * either bits or bytes.
+ *
+ * Comment in/out the following lines to obtain the desired subroutines. These
+ * selections MUST match those in the C header file aesopt.h
+ */
+#define AES_REV_DKS /* define if key decryption schedule is reversed */
+
+#define LAST_ROUND_TABLES /* define for the faster version using extra tables */
+
+/*
+ * The encryption key schedule has the following in memory layout where N is the
+ * number of rounds (10, 12 or 14):
+ *
+ * lo: | input key (round 0) | / each round is four 32-bit words
+ * | encryption round 1 |
+ * | encryption round 2 |
+ * ....
+ * | encryption round N-1 |
+ * hi: | encryption round N |
+ *
+ * The decryption key schedule is normally set up so that it has the same
+ * layout as above by actually reversing the order of the encryption key
+ * schedule in memory (this happens when AES_REV_DKS is set):
+ *
+ * lo: | decryption round 0 | = | encryption round N |
+ * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]
+ * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]
+ * .... ....
+ * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ]
+ * hi: | decryption round N | = | input key (round 0) |
+ *
+ * with rounds except the first and last modified using inv_mix_column().
+ * But if AES_REV_DKS is NOT set the order of keys is left as it is for
+ * encryption so that it has to be accessed in reverse when used for
+ * decryption (although the inverse mix column modifications are done):
+ *
+ * lo: | decryption round 0 | = | input key (round 0) |
+ * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ]
+ * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ]
+ * .... ....
+ * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
+ * hi: | decryption round N | = | encryption round N |
+ *
+ * This layout is faster when the assembler key scheduling provided here
+ * is used.
+ *
+ * End of user defines
+ */
+
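As an illustration of the ordering part of the AES_REV_DKS layout described above (the inverse mix column step aside), reversing a forward-order schedule amounts to swapping whole 16-byte round keys end for end. A minimal C sketch with a hypothetical helper, not part of this file; the real reversal is folded into the key expansion in aeskey.c via its v() macro:

    #include <stdint.h>
    #include <string.h>

    /* Swap whole 4-word round keys: round 0 <-> round nr, 1 <-> nr-1, ... */
    static void
    reverse_round_keys(uint32_t rk[], int nr)
    {
            for (int lo = 0, hi = nr; lo < hi; lo++, hi--) {
                    uint32_t tmp[4];

                    memcpy(tmp, &rk[4 * lo], sizeof (tmp));
                    memcpy(&rk[4 * lo], &rk[4 * hi], sizeof (tmp));
                    memcpy(&rk[4 * hi], tmp, sizeof (tmp));
            }
    }
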
+/*
+ * ---------------------------------------------------------------------------
+ * OpenSolaris OS modifications
+ *
+ * This source originates from Brian Gladman file aes_amd64.asm
+ * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
+ * with these changes:
+ *
+ * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
+ * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION,
+ * AES_128, AES_192, AES_256, AES_VAR ifdefs.
+ *
+ * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
+ *
+ * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
+ *
+ * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
+ * (operands reversed, literals prefixed with "$", registers prefixed with "%",
+ * "[register+offset]" addressing changed to "offset(register)",
+ * parentheses in constant expressions "()" changed to square brackets "[]",
+ * "." removed from local (numeric) labels, and other changes).
+ * Examples:
+ * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax
+ * mov rax,(4*20h) mov $[4*0x20],%rax
+ * mov rax,[ebx+20h] mov 0x20(%ebx),%rax
+ * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax
+ * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax
+ *
+ * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
+ * definitions for lint.
+ *
+ * 6. Renamed functions and reordered parameters to match OpenSolaris:
+ * Original Gladman interface:
+ * int aes_encrypt(const unsigned char *in,
+ * unsigned char *out, const aes_encrypt_ctx cx[1])/
+ * int aes_decrypt(const unsigned char *in,
+ *	unsigned char *out, const aes_decrypt_ctx cx[1])/
+ * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
+ * and a union type, inf., containing inf.l, a uint32_t and
+ *	inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is
+ *	used and contains the number of rounds multiplied by 16, where the number
+ *	of rounds is 10, 12, or 14.
+ *
+ * OpenSolaris OS interface:
+ * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/
+ * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/
+ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
+ * ct is crypto text, and MAX_AES_NR is 14.
+ * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
+ */
+
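For reference, a minimal caller of the OpenSolaris-style interface documented above could look like the following C sketch (assumptions: the prototypes match the lint stubs below, MAX_AES_NR is 14, and the schedule comes from rijndael_key_setup_enc_amd64() in aeskey.c):

    #include <stdint.h>

    #define MAX_AES_NR      14

    typedef union {
            uint64_t ks64[(MAX_AES_NR + 1) * 4];
            uint32_t ks32[(MAX_AES_NR + 1) * 4];
    } aes_ks_t;

    void aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
        uint32_t ct[4]);
    int rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[],
        int keyBits);

    static void
    encrypt_block_128(const uint32_t key[4], const uint32_t pt[4], uint32_t ct[4])
    {
            aes_ks_t ks;
            int nr;

            /* Expand the 128-bit key; the return value is the round count. */
            nr = rijndael_key_setup_enc_amd64(ks.ks32, key, 128);
            aes_encrypt_amd64(ks.ks32, nr, pt, ct);
    }
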
+#if defined(lint) || defined(__lint)
+
+#include <sys/types.h>
+/* ARGSUSED */
+void
+aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4]) {
+}
+/* ARGSUSED */
+void
+aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ uint32_t pt[4]) {
+}
+
+
+#else
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#define KS_LENGTH 60
+
+#define raxd eax
+#define rdxd edx
+#define rcxd ecx
+#define rbxd ebx
+#define rsid esi
+#define rdid edi
+
+#define raxb al
+#define rdxb dl
+#define rcxb cl
+#define rbxb bl
+#define rsib sil
+#define rdib dil
+
+// finite field multiplies by {02}, {04} and {08}
+
+#define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]]
+#define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]
+#define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]
+
+// finite field multiplies required in table generation
+
+#define f3(x) [[f2(x)] ^ [x]]
+#define f9(x) [[f8(x)] ^ [x]]
+#define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]]
+#define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]]
+#define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]]
+
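The bracketed constant expressions above are GF(2^8) products reduced modulo the AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11b). An equivalent C sketch of the {02} and {03} multiples, for clarity only (the tables below are generated from the macro forms at assembly time):

    #include <stdint.h>

    static uint8_t
    gf_xtime(uint8_t x)             /* f2(x): multiply by {02} */
    {
            return ((uint8_t)((x << 1) ^ (((x >> 7) & 1) * 0x1b)));
    }

    static uint8_t
    gf_mul3(uint8_t x)              /* f3(x) = f2(x) ^ x: multiply by {03} */
    {
            return ((uint8_t)(gf_xtime(x) ^ x));
    }
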
+// macros for expanding S-box data
+
+#define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]
+#define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]
+#define w8(x) [x], 0, 0, 0, [x], 0, 0, 0
+
+#define enc_vals(x) \
+ .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
+ .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
+ .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
+ .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
+ .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
+ .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
+ .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
+ .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
+ .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
+ .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
+ .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
+ .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
+ .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
+ .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
+ .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
+ .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
+ .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
+ .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
+ .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
+ .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
+ .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
+ .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
+ .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
+ .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
+ .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
+ .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
+ .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
+ .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
+ .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
+ .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
+ .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
+ .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
+
+#define dec_vals(x) \
+ .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
+ .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
+ .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
+ .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
+ .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
+ .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
+ .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
+ .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
+ .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
+ .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
+ .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
+ .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
+ .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
+ .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
+ .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
+ .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
+ .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
+ .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
+ .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
+ .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
+ .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
+ .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
+ .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
+ .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
+ .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
+ .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
+ .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
+ .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
+ .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
+ .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
+ .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
+ .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
+
+#define tptr %rbp /* table pointer */
+#define kptr %r8 /* key schedule pointer */
+#define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */
+#define fk_ref(x, y) -16*x+fofs+4*y(kptr)
+
+#ifdef AES_REV_DKS
+#define rofs 128
+#define ik_ref(x, y) -16*x+rofs+4*y(kptr)
+
+#else
+#define rofs -128
+#define ik_ref(x, y) 16*x+rofs+4*y(kptr)
+#endif /* AES_REV_DKS */
+
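The fofs/rofs bias is easier to see with numbers. aes_encrypt_amd64() below does "sub $fofs, kptr" and then advances kptr by 16 * Nr, so fk_ref(round, col) resolves to byte offset 16 * (Nr - round) + 4 * col from the start of the schedule, and the descending round arguments walk forward through the round keys. A small self-check sketch (hypothetical helper, Nr = 10 case):

    #include <assert.h>

    /* Byte offset of fk_ref(round, col), relative to the key schedule base. */
    static long
    fk_byte_offset(int nr, int round, int col)
    {
            long kptr = 16L * nr - 128;     /* after "sub $fofs" and the lea */

            return (kptr + (-16L * round + 128 + 4 * col));
    }

    int
    main(void)
    {
            /* Nr = 10: rounds 9..1, then 0, address round keys 1..9, then 10. */
            assert(fk_byte_offset(10, 9, 0) == 16);
            assert(fk_byte_offset(10, 1, 3) == 156);
            assert(fk_byte_offset(10, 0, 0) == 160);
            return (0);
    }
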
+#define tab_0(x) (tptr,x,8)
+#define tab_1(x) 3(tptr,x,8)
+#define tab_2(x) 2(tptr,x,8)
+#define tab_3(x) 1(tptr,x,8)
+#define tab_f(x) 1(tptr,x,8)
+#define tab_i(x) 7(tptr,x,8)
+
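Each table entry is 8 bytes wide, so the scaled-index forms above pick different views of the same entry: tab_0..tab_3 are 32-bit loads at offsets 0, 3, 2 and 1 (byte-rotated copies of one packed column), while tab_f and tab_i are used with byte loads to fetch the plain S-box byte (offset 1 in u8) or inverse S-box byte (offset 7 in v8). A little-endian C sketch of the 32-bit case, for illustration only:

    #include <stdint.h>
    #include <string.h>

    /* Load the column for S-box index i, rotated the way tab_0..tab_3 read it. */
    static uint32_t
    tab_load(const uint8_t *tab, uint32_t i, int which)     /* which = 0..3 */
    {
            static const int off[4] = { 0, 3, 2, 1 };       /* tab_0..tab_3 */
            uint32_t w;

            memcpy(&w, tab + 8 * i + off[which], sizeof (w));
            return (w);
    }
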
+#define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \
+ mov fk_ref(round,0), p1; \
+ mov fk_ref(round,1), p2; \
+ mov fk_ref(round,2), p3; \
+ mov fk_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p4; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p1; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p3; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p2; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p3; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p1; \
+ \
+ mov p1, %eax; \
+ mov p2, %ebx; \
+ mov p3, %ecx; \
+ mov p4, %edx
+
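A rough C rendering of one ff_rnd() expansion may help when following the register shuffle below (sketch only: s[] is the state in eax/ebx/ecx/edx order, rk[] the four round key words, and T0..T3 stand for the byte-rotated table views that tab_0..tab_3 read):

    #include <stdint.h>

    #define BYTE(w, n)      (((w) >> (8 * (n))) & 0xff)

    static void
    forward_round(uint32_t s[4], const uint32_t rk[4], const uint32_t T0[256],
        const uint32_t T1[256], const uint32_t T2[256], const uint32_t T3[256])
    {
            uint32_t t[4];
            int i;

            for (i = 0; i < 4; i++) {
                    /* Each output column takes one byte from every state word. */
                    t[i] = rk[i] ^
                        T0[BYTE(s[i], 0)] ^
                        T1[BYTE(s[(i + 1) % 4], 1)] ^
                        T2[BYTE(s[(i + 2) % 4], 2)] ^
                        T3[BYTE(s[(i + 3) % 4], 3)];
            }
            for (i = 0; i < 4; i++)
                    s[i] = t[i];
    }
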
+#ifdef LAST_ROUND_TABLES
+
+#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \
+ add $2048, tptr; \
+ mov fk_ref(round,0), p1; \
+ mov fk_ref(round,1), p2; \
+ mov fk_ref(round,2), p3; \
+ mov fk_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p4; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p1; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p3; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p2; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p3; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p1
+
+#else
+
+#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \
+ mov fk_ref(round,0), p1; \
+ mov fk_ref(round,1), p2; \
+ mov fk_ref(round,2), p3; \
+ mov fk_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ xor %esi, p1; \
+ rol $8, %edi; \
+ xor %edi, p4; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p3; \
+ xor %edi, p2; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ xor %esi, p2; \
+ rol $8, %edi; \
+ xor %edi, p1; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p4; \
+ xor %edi, p3; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ shr $16, %ecx; \
+ xor %esi, p3; \
+ rol $8, %edi; \
+ xor %edi, p2; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p1; \
+ xor %edi, p4; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ shr $16, %edx; \
+ xor %esi, p4; \
+ rol $8, %edi; \
+ xor %edi, p3; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p2; \
+ xor %edi, p1
+
+#endif /* LAST_ROUND_TABLES */
+
+#define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \
+ mov ik_ref(round,0), p1; \
+ mov ik_ref(round,1), p2; \
+ mov ik_ref(round,2), p3; \
+ mov ik_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p2; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p3; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p1; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p4; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p1; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p3; \
+ \
+ mov p1, %eax; \
+ mov p2, %ebx; \
+ mov p3, %ecx; \
+ mov p4, %edx
+
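ii_rnd() below is the mirror image of the forward round sketched earlier; the construction is the same, but the byte diagonal runs in the opposite direction because InvShiftRows rotates the other way. In the same C terms (sketch only, with T0i..T3i the inverse-table views):

    #include <stdint.h>

    #define BYTE(w, n)      (((w) >> (8 * (n))) & 0xff)

    static void
    inverse_round(uint32_t s[4], const uint32_t rk[4], const uint32_t T0i[256],
        const uint32_t T1i[256], const uint32_t T2i[256], const uint32_t T3i[256])
    {
            uint32_t t[4];
            int i;

            for (i = 0; i < 4; i++) {
                    /* Same T-table idea; the source-word indices are reversed. */
                    t[i] = rk[i] ^
                        T0i[BYTE(s[i], 0)] ^
                        T1i[BYTE(s[(i + 3) % 4], 1)] ^
                        T2i[BYTE(s[(i + 2) % 4], 2)] ^
                        T3i[BYTE(s[(i + 1) % 4], 3)];
            }
            for (i = 0; i < 4; i++)
                    s[i] = t[i];
    }
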
+#ifdef LAST_ROUND_TABLES
+
+#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \
+ add $2048, tptr; \
+ mov ik_ref(round,0), p1; \
+ mov ik_ref(round,1), p2; \
+ mov ik_ref(round,2), p3; \
+ mov ik_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p2; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p3; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p1; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p4; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p1; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p3
+
+#else
+
+#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \
+ mov ik_ref(round,0), p1; \
+ mov ik_ref(round,1), p2; \
+ mov ik_ref(round,2), p3; \
+ mov ik_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ shr $16, %eax; \
+ xor %esi, p1; \
+ rol $8, %edi; \
+ xor %edi, p2; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p3; \
+ xor %edi, p4; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ shr $16, %ebx; \
+ xor %esi, p2; \
+ rol $8, %edi; \
+ xor %edi, p3; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p4; \
+ xor %edi, p1; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ shr $16, %ecx; \
+ xor %esi, p3; \
+ rol $8, %edi; \
+ xor %edi, p4; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p1; \
+ xor %edi, p2; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ shr $16, %edx; \
+ xor %esi, p4; \
+ rol $8, %edi; \
+ xor %edi, p1; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p2; \
+ xor %edi, p3
+
+#endif /* LAST_ROUND_TABLES */
+
+/*
+ * OpenSolaris OS:
+ * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ *
+ * Original interface:
+ * int aes_encrypt(const unsigned char *in,
+ * unsigned char *out, const aes_encrypt_ctx cx[1])/
+ */
+.data
+.align 64
+enc_tab:
+ enc_vals(u8)
+#ifdef LAST_ROUND_TABLES
+ // Last Round Tables:
+ enc_vals(w8)
+#endif
+
+
+ENTRY_NP(aes_encrypt_amd64)
+#ifdef GLADMAN_INTERFACE
+ // Original interface
+ sub $[4*8], %rsp // gnu/linux/opensolaris binary interface
+ mov %rsi, (%rsp) // output pointer (P2)
+ mov %rdx, %r8 // context (P3)
+
+ mov %rbx, 1*8(%rsp) // P1: input pointer in rdi
+ mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp)
+ mov %r12, 3*8(%rsp) // P3: context in r8
+ movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16
+
+#else
+ // OpenSolaris OS interface
+ sub $[4*8], %rsp // Make room on stack to save registers
+ mov %rcx, (%rsp) // Save output pointer (P4) on stack
+ mov %rdi, %r8 // context (P1)
+ mov %rdx, %rdi // P3: save input pointer
+ shl $4, %esi // P2: esi byte key length * 16
+
+ mov %rbx, 1*8(%rsp) // Save registers
+ mov %rbp, 2*8(%rsp)
+ mov %r12, 3*8(%rsp)
+ // P1: context in r8
+ // P2: byte key length * 16 in esi
+ // P3: input pointer in rdi
+ // P4: output pointer in (rsp)
+#endif /* GLADMAN_INTERFACE */
+
+ lea enc_tab(%rip), tptr
+ sub $fofs, kptr
+
+ // Load input block into registers
+ mov (%rdi), %eax
+ mov 1*4(%rdi), %ebx
+ mov 2*4(%rdi), %ecx
+ mov 3*4(%rdi), %edx
+
+ xor fofs(kptr), %eax
+ xor fofs+4(kptr), %ebx
+ xor fofs+8(kptr), %ecx
+ xor fofs+12(kptr), %edx
+
+ lea (kptr,%rsi), kptr
+ // Jump based on byte key length * 16:
+ cmp $[10*16], %esi
+ je 3f
+ cmp $[12*16], %esi
+ je 2f
+ cmp $[14*16], %esi
+ je 1f
+ mov $-1, %rax // error
+ jmp 4f
+
+ // Perform normal forward rounds
+1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
+2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
+3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 8)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 7)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 6)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 5)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 4)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 3)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 2)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 1)
+ fl_rnd(%r9d, %r10d, %r11d, %r12d, 0)
+
+ // Copy results
+ mov (%rsp), %rbx
+ mov %r9d, (%rbx)
+ mov %r10d, 4(%rbx)
+ mov %r11d, 8(%rbx)
+ mov %r12d, 12(%rbx)
+ xor %rax, %rax
+4: // Restore registers
+ mov 1*8(%rsp), %rbx
+ mov 2*8(%rsp), %rbp
+ mov 3*8(%rsp), %r12
+ add $[4*8], %rsp
+ ret
+
+ SET_SIZE(aes_encrypt_amd64)
+
+/*
+ * OpenSolaris OS:
+ * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ *
+ * Original interface:
+ * int aes_decrypt(const unsigned char *in,
+ *	unsigned char *out, const aes_decrypt_ctx cx[1])/
+ */
+.data
+.align 64
+dec_tab:
+ dec_vals(v8)
+#ifdef LAST_ROUND_TABLES
+ // Last Round Tables:
+ dec_vals(w8)
+#endif
+
+
+ENTRY_NP(aes_decrypt_amd64)
+#ifdef GLADMAN_INTERFACE
+ // Original interface
+ sub $[4*8], %rsp // gnu/linux/opensolaris binary interface
+ mov %rsi, (%rsp) // output pointer (P2)
+ mov %rdx, %r8 // context (P3)
+
+ mov %rbx, 1*8(%rsp) // P1: input pointer in rdi
+ mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp)
+ mov %r12, 3*8(%rsp) // P3: context in r8
+ movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16
+
+#else
+ // OpenSolaris OS interface
+ sub $[4*8], %rsp // Make room on stack to save registers
+ mov %rcx, (%rsp) // Save output pointer (P4) on stack
+ mov %rdi, %r8 // context (P1)
+ mov %rdx, %rdi // P3: save input pointer
+ shl $4, %esi // P2: esi byte key length * 16
+
+ mov %rbx, 1*8(%rsp) // Save registers
+ mov %rbp, 2*8(%rsp)
+ mov %r12, 3*8(%rsp)
+ // P1: context in r8
+ // P2: byte key length * 16 in esi
+ // P3: input pointer in rdi
+ // P4: output pointer in (rsp)
+#endif /* GLADMAN_INTERFACE */
+
+ lea dec_tab(%rip), tptr
+ sub $rofs, kptr
+
+ // Load input block into registers
+ mov (%rdi), %eax
+ mov 1*4(%rdi), %ebx
+ mov 2*4(%rdi), %ecx
+ mov 3*4(%rdi), %edx
+
+#ifdef AES_REV_DKS
+ mov kptr, %rdi
+ lea (kptr,%rsi), kptr
+#else
+ lea (kptr,%rsi), %rdi
+#endif
+
+ xor rofs(%rdi), %eax
+ xor rofs+4(%rdi), %ebx
+ xor rofs+8(%rdi), %ecx
+ xor rofs+12(%rdi), %edx
+
+ // Jump based on byte key length * 16:
+ cmp $[10*16], %esi
+ je 3f
+ cmp $[12*16], %esi
+ je 2f
+ cmp $[14*16], %esi
+ je 1f
+ mov $-1, %rax // error
+ jmp 4f
+
+ // Perform normal inverse rounds
+1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
+2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
+3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 8)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 7)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 6)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 5)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 4)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 3)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 2)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 1)
+ il_rnd(%r9d, %r10d, %r11d, %r12d, 0)
+
+ // Copy results
+ mov (%rsp), %rbx
+ mov %r9d, (%rbx)
+ mov %r10d, 4(%rbx)
+ mov %r11d, 8(%rbx)
+ mov %r12d, 12(%rbx)
+ xor %rax, %rax
+4: // Restore registers
+ mov 1*8(%rsp), %rbx
+ mov 2*8(%rsp), %rbp
+ mov 3*8(%rsp), %r12
+ add $[4*8], %rsp
+ ret
+
+ SET_SIZE(aes_decrypt_amd64)
+#endif /* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aeskey.c b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aeskey.c
new file mode 100644
index 000000000000..c3d1f2990874
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aeskey.c
@@ -0,0 +1,580 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ */
+
+#include <aes/aes_impl.h>
+#include "aesopt.h"
+#include "aestab.h"
+#include "aestab2.h"
+
+/*
+ * Initialise the key schedule from the user supplied key. The key
+ * length can be specified in bytes, with legal values of 16, 24
+ * and 32, or in bits, with legal values of 128, 192 and 256. These
+ * values correspond with Nk values of 4, 6 and 8 respectively.
+ *
+ * The following macros implement a single cycle in the key
+ * schedule generation process. The number of cycles needed
+ * for each cx->n_col and nk value is:
+ *
+ * nk = 4 5 6 7 8
+ * ------------------------------
+ * cx->n_col = 4 10 9 8 7 7
+ * cx->n_col = 5 14 11 10 9 9
+ * cx->n_col = 6 19 15 12 11 11
+ * cx->n_col = 7 21 19 16 13 14
+ * cx->n_col = 8 29 23 19 17 14
+ */
+
+/*
+ * OpenSolaris changes
+ * 1. Added header files aes_impl.h and aestab2.h
+ * 2. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 3. Remove code under ifdef USE_VIA_ACE_IF_PRESENT (always undefined)
+ * 4. Removed always-defined ifdefs FUNCS_IN_C, ENC_KEYING_IN_C,
+ * AES_128, AES_192, AES_256, AES_VAR defines
+ * 5. Changed aes_encrypt_key* aes_decrypt_key* functions to "static void"
+ * 6. Changed N_COLS to MAX_AES_NB
+ * 7. Replaced functions aes_encrypt_key and aes_decrypt_key with
+ * OpenSolaris-compatible functions rijndael_key_setup_enc_amd64 and
+ * rijndael_key_setup_dec_amd64
+ * 8. cstyled code and removed lint warnings
+ */
+
+#if defined(REDUCE_CODE_SIZE)
+#define ls_box ls_sub
+ uint32_t ls_sub(const uint32_t t, const uint32_t n);
+#define inv_mcol im_sub
+ uint32_t im_sub(const uint32_t x);
+#ifdef ENC_KS_UNROLL
+#undef ENC_KS_UNROLL
+#endif
+#ifdef DEC_KS_UNROLL
+#undef DEC_KS_UNROLL
+#endif
+#endif /* REDUCE_CODE_SIZE */
+
+
+#define ke4(k, i) \
+{ k[4 * (i) + 4] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[4 * (i) + 5] = ss[1] ^= ss[0]; \
+ k[4 * (i) + 6] = ss[2] ^= ss[1]; \
+ k[4 * (i) + 7] = ss[3] ^= ss[2]; \
+}
+
+static void
+aes_encrypt_key128(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[4];
+
+ rk[0] = ss[0] = word_in(key, 0);
+ rk[1] = ss[1] = word_in(key, 1);
+ rk[2] = ss[2] = word_in(key, 2);
+ rk[3] = ss[3] = word_in(key, 3);
+
+#ifdef ENC_KS_UNROLL
+ ke4(rk, 0); ke4(rk, 1);
+ ke4(rk, 2); ke4(rk, 3);
+ ke4(rk, 4); ke4(rk, 5);
+ ke4(rk, 6); ke4(rk, 7);
+ ke4(rk, 8);
+#else
+ {
+ uint32_t i;
+ for (i = 0; i < 9; ++i)
+ ke4(rk, i);
+ }
+#endif /* ENC_KS_UNROLL */
+ ke4(rk, 9);
+}
+
+
+#define kef6(k, i) \
+{ k[6 * (i) + 6] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[6 * (i) + 7] = ss[1] ^= ss[0]; \
+ k[6 * (i) + 8] = ss[2] ^= ss[1]; \
+ k[6 * (i) + 9] = ss[3] ^= ss[2]; \
+}
+
+#define ke6(k, i) \
+{ kef6(k, i); \
+ k[6 * (i) + 10] = ss[4] ^= ss[3]; \
+ k[6 * (i) + 11] = ss[5] ^= ss[4]; \
+}
+
+static void
+aes_encrypt_key192(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[6];
+
+ rk[0] = ss[0] = word_in(key, 0);
+ rk[1] = ss[1] = word_in(key, 1);
+ rk[2] = ss[2] = word_in(key, 2);
+ rk[3] = ss[3] = word_in(key, 3);
+ rk[4] = ss[4] = word_in(key, 4);
+ rk[5] = ss[5] = word_in(key, 5);
+
+#ifdef ENC_KS_UNROLL
+ ke6(rk, 0); ke6(rk, 1);
+ ke6(rk, 2); ke6(rk, 3);
+ ke6(rk, 4); ke6(rk, 5);
+ ke6(rk, 6);
+#else
+ {
+ uint32_t i;
+ for (i = 0; i < 7; ++i)
+ ke6(rk, i);
+ }
+#endif /* ENC_KS_UNROLL */
+ kef6(rk, 7);
+}
+
+
+
+#define kef8(k, i) \
+{ k[8 * (i) + 8] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[8 * (i) + 9] = ss[1] ^= ss[0]; \
+ k[8 * (i) + 10] = ss[2] ^= ss[1]; \
+ k[8 * (i) + 11] = ss[3] ^= ss[2]; \
+}
+
+#define ke8(k, i) \
+{ kef8(k, i); \
+ k[8 * (i) + 12] = ss[4] ^= ls_box(ss[3], 0); \
+ k[8 * (i) + 13] = ss[5] ^= ss[4]; \
+ k[8 * (i) + 14] = ss[6] ^= ss[5]; \
+ k[8 * (i) + 15] = ss[7] ^= ss[6]; \
+}
+
+static void
+aes_encrypt_key256(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[8];
+
+ rk[0] = ss[0] = word_in(key, 0);
+ rk[1] = ss[1] = word_in(key, 1);
+ rk[2] = ss[2] = word_in(key, 2);
+ rk[3] = ss[3] = word_in(key, 3);
+ rk[4] = ss[4] = word_in(key, 4);
+ rk[5] = ss[5] = word_in(key, 5);
+ rk[6] = ss[6] = word_in(key, 6);
+ rk[7] = ss[7] = word_in(key, 7);
+
+#ifdef ENC_KS_UNROLL
+ ke8(rk, 0); ke8(rk, 1);
+ ke8(rk, 2); ke8(rk, 3);
+ ke8(rk, 4); ke8(rk, 5);
+#else
+ {
+ uint32_t i;
+ for (i = 0; i < 6; ++i)
+ ke8(rk, i);
+ }
+#endif /* ENC_KS_UNROLL */
+ kef8(rk, 6);
+}
+
+
+/*
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * Return the number of rounds for the given cipher key size.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4 * (Nr + 1).
+ *
+ * Parameters:
+ * rk AES key schedule 32-bit array to be initialized
+ * cipherKey User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+int
+rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[],
+ int keyBits)
+{
+ switch (keyBits) {
+ case 128:
+ aes_encrypt_key128((unsigned char *)&cipherKey[0], rk);
+ return (10);
+ case 192:
+ aes_encrypt_key192((unsigned char *)&cipherKey[0], rk);
+ return (12);
+ case 256:
+ aes_encrypt_key256((unsigned char *)&cipherKey[0], rk);
+ return (14);
+ default: /* should never get here */
+ break;
+ }
+
+ return (0);
+}
+
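A minimal usage sketch for the routine above (assumptions: rk[] is sized for the worst case of 4 * (14 + 1) = 60 words, matching KS_LENGTH in aes_amd64.S, and the key bytes are passed through in memory order, which is what word_in() consumes on this little-endian build):

    #include <stdint.h>
    #include <string.h>

    int rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[],
        int keyBits);

    static int
    expand_enc_key(uint32_t rk[60], const uint8_t *key, int keyBits)
    {
            uint32_t kw[8];         /* up to 8 words for a 256-bit key */

            memcpy(kw, key, keyBits / 8);
            return (rijndael_key_setup_enc_amd64(rk, kw, keyBits)); /* Nr, or 0 */
    }
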
+
+/* this is used to store the decryption round keys */
+/* in forward or reverse order */
+
+#ifdef AES_REV_DKS
+#define v(n, i) ((n) - (i) + 2 * ((i) & 3))
+#else
+#define v(n, i) (i)
+#endif
+
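With AES_REV_DKS defined, v(n, i) mirrors whole 4-word round keys while preserving the word order inside each one. A tiny self-check using a local copy of the macro (illustration only):

    #include <assert.h>

    #define v_rev(n, i)     ((n) - (i) + 2 * ((i) & 3))

    int
    main(void)
    {
            /* 128-bit key, n = 40: round 0 words are stored last ... */
            assert(v_rev(40, 0) == 40 && v_rev(40, 3) == 43);
            /* ... and the final round's words are stored first. */
            assert(v_rev(40, 40) == 0 && v_rev(40, 43) == 3);
            return (0);
    }
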
+#if DEC_ROUND == NO_TABLES
+#define ff(x) (x)
+#else
+#define ff(x) inv_mcol(x)
+#if defined(dec_imvars)
+#define d_vars dec_imvars
+#endif
+#endif /* DEC_ROUND == NO_TABLES */
+
+
+#define k4e(k, i) \
+{ k[v(40, (4 * (i)) + 4)] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[v(40, (4 * (i)) + 5)] = ss[1] ^= ss[0]; \
+ k[v(40, (4 * (i)) + 6)] = ss[2] ^= ss[1]; \
+ k[v(40, (4 * (i)) + 7)] = ss[3] ^= ss[2]; \
+}
+
+#if 1
+
+#define kdf4(k, i) \
+{ ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \
+ ss[1] = ss[1] ^ ss[3]; \
+ ss[2] = ss[2] ^ ss[3]; \
+ ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \
+ ss[i % 4] ^= ss[4]; \
+ ss[4] ^= k[v(40, (4 * (i)))]; k[v(40, (4 * (i)) + 4)] = ff(ss[4]); \
+ ss[4] ^= k[v(40, (4 * (i)) + 1)]; k[v(40, (4 * (i)) + 5)] = ff(ss[4]); \
+ ss[4] ^= k[v(40, (4 * (i)) + 2)]; k[v(40, (4 * (i)) + 6)] = ff(ss[4]); \
+ ss[4] ^= k[v(40, (4 * (i)) + 3)]; k[v(40, (4 * (i)) + 7)] = ff(ss[4]); \
+}
+
+#define kd4(k, i) \
+{ ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \
+ ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \
+ k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \
+ k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \
+ k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \
+ k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \
+}
+
+#define kdl4(k, i) \
+{ ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \
+ ss[i % 4] ^= ss[4]; \
+ k[v(40, (4 * (i)) + 4)] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \
+ k[v(40, (4 * (i)) + 5)] = ss[1] ^ ss[3]; \
+ k[v(40, (4 * (i)) + 6)] = ss[0]; \
+ k[v(40, (4 * (i)) + 7)] = ss[1]; \
+}
+
+#else
+
+#define kdf4(k, i) \
+{ ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[v(40, (4 * (i)) + 4)] = ff(ss[0]); \
+ ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ff(ss[1]); \
+ ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ff(ss[2]); \
+ ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ff(ss[3]); \
+}
+
+#define kd4(k, i) \
+{ ss[4] = ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ ss[0] ^= ss[4]; \
+ ss[4] = ff(ss[4]); \
+ k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \
+ ss[1] ^= ss[0]; \
+ k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \
+ ss[2] ^= ss[1]; \
+ k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \
+ ss[3] ^= ss[2]; \
+ k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \
+}
+
+#define kdl4(k, i) \
+{ ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[v(40, (4 * (i)) + 4)] = ss[0]; \
+ ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ss[1]; \
+ ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ss[2]; \
+ ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ss[3]; \
+}
+
+#endif
+
+static void
+aes_decrypt_key128(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[5];
+#if defined(d_vars)
+ d_vars;
+#endif
+ rk[v(40, (0))] = ss[0] = word_in(key, 0);
+ rk[v(40, (1))] = ss[1] = word_in(key, 1);
+ rk[v(40, (2))] = ss[2] = word_in(key, 2);
+ rk[v(40, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+ kdf4(rk, 0); kd4(rk, 1);
+ kd4(rk, 2); kd4(rk, 3);
+ kd4(rk, 4); kd4(rk, 5);
+ kd4(rk, 6); kd4(rk, 7);
+ kd4(rk, 8); kdl4(rk, 9);
+#else
+ {
+ uint32_t i;
+ for (i = 0; i < 10; ++i)
+ k4e(rk, i);
+#if !(DEC_ROUND == NO_TABLES)
+ for (i = MAX_AES_NB; i < 10 * MAX_AES_NB; ++i)
+ rk[i] = inv_mcol(rk[i]);
+#endif
+ }
+#endif /* DEC_KS_UNROLL */
+}
+
+
+
+#define k6ef(k, i) \
+{ k[v(48, (6 * (i)) + 6)] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[v(48, (6 * (i)) + 7)] = ss[1] ^= ss[0]; \
+ k[v(48, (6 * (i)) + 8)] = ss[2] ^= ss[1]; \
+ k[v(48, (6 * (i)) + 9)] = ss[3] ^= ss[2]; \
+}
+
+#define k6e(k, i) \
+{ k6ef(k, i); \
+ k[v(48, (6 * (i)) + 10)] = ss[4] ^= ss[3]; \
+ k[v(48, (6 * (i)) + 11)] = ss[5] ^= ss[4]; \
+}
+
+#define kdf6(k, i) \
+{ ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[v(48, (6 * (i)) + 6)] = ff(ss[0]); \
+ ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ff(ss[1]); \
+ ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ff(ss[2]); \
+ ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ff(ss[3]); \
+ ss[4] ^= ss[3]; k[v(48, (6 * (i)) + 10)] = ff(ss[4]); \
+ ss[5] ^= ss[4]; k[v(48, (6 * (i)) + 11)] = ff(ss[5]); \
+}
+
+#define kd6(k, i) \
+{ ss[6] = ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \
+ k[v(48, (6 * (i)) + 6)] = ss[6] ^= k[v(48, (6 * (i)))]; \
+ ss[1] ^= ss[0]; \
+ k[v(48, (6 * (i)) + 7)] = ss[6] ^= k[v(48, (6 * (i)) + 1)]; \
+ ss[2] ^= ss[1]; \
+ k[v(48, (6 * (i)) + 8)] = ss[6] ^= k[v(48, (6 * (i)) + 2)]; \
+ ss[3] ^= ss[2]; \
+ k[v(48, (6 * (i)) + 9)] = ss[6] ^= k[v(48, (6 * (i)) + 3)]; \
+ ss[4] ^= ss[3]; \
+ k[v(48, (6 * (i)) + 10)] = ss[6] ^= k[v(48, (6 * (i)) + 4)]; \
+ ss[5] ^= ss[4]; \
+ k[v(48, (6 * (i)) + 11)] = ss[6] ^= k[v(48, (6 * (i)) + 5)]; \
+}
+
+#define kdl6(k, i) \
+{ ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[v(48, (6 * (i)) + 6)] = ss[0]; \
+ ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ss[1]; \
+ ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ss[2]; \
+ ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ss[3]; \
+}
+
+static void
+aes_decrypt_key192(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[7];
+#if defined(d_vars)
+ d_vars;
+#endif
+ rk[v(48, (0))] = ss[0] = word_in(key, 0);
+ rk[v(48, (1))] = ss[1] = word_in(key, 1);
+ rk[v(48, (2))] = ss[2] = word_in(key, 2);
+ rk[v(48, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+ ss[4] = word_in(key, 4);
+ rk[v(48, (4))] = ff(ss[4]);
+ ss[5] = word_in(key, 5);
+ rk[v(48, (5))] = ff(ss[5]);
+ kdf6(rk, 0); kd6(rk, 1);
+ kd6(rk, 2); kd6(rk, 3);
+ kd6(rk, 4); kd6(rk, 5);
+ kd6(rk, 6); kdl6(rk, 7);
+#else
+ rk[v(48, (4))] = ss[4] = word_in(key, 4);
+ rk[v(48, (5))] = ss[5] = word_in(key, 5);
+ {
+ uint32_t i;
+
+ for (i = 0; i < 7; ++i)
+ k6e(rk, i);
+ k6ef(rk, 7);
+#if !(DEC_ROUND == NO_TABLES)
+ for (i = MAX_AES_NB; i < 12 * MAX_AES_NB; ++i)
+ rk[i] = inv_mcol(rk[i]);
+#endif
+ }
+#endif
+}
+
+
+
+#define k8ef(k, i) \
+{ k[v(56, (8 * (i)) + 8)] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[v(56, (8 * (i)) + 9)] = ss[1] ^= ss[0]; \
+ k[v(56, (8 * (i)) + 10)] = ss[2] ^= ss[1]; \
+ k[v(56, (8 * (i)) + 11)] = ss[3] ^= ss[2]; \
+}
+
+#define k8e(k, i) \
+{ k8ef(k, i); \
+ k[v(56, (8 * (i)) + 12)] = ss[4] ^= ls_box(ss[3], 0); \
+ k[v(56, (8 * (i)) + 13)] = ss[5] ^= ss[4]; \
+ k[v(56, (8 * (i)) + 14)] = ss[6] ^= ss[5]; \
+ k[v(56, (8 * (i)) + 15)] = ss[7] ^= ss[6]; \
+}
+
+#define kdf8(k, i) \
+{ ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[v(56, (8 * (i)) + 8)] = ff(ss[0]); \
+ ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ff(ss[1]); \
+ ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ff(ss[2]); \
+ ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ff(ss[3]); \
+ ss[4] ^= ls_box(ss[3], 0); k[v(56, (8 * (i)) + 12)] = ff(ss[4]); \
+ ss[5] ^= ss[4]; k[v(56, (8 * (i)) + 13)] = ff(ss[5]); \
+ ss[6] ^= ss[5]; k[v(56, (8 * (i)) + 14)] = ff(ss[6]); \
+ ss[7] ^= ss[6]; k[v(56, (8 * (i)) + 15)] = ff(ss[7]); \
+}
+
+#define kd8(k, i) \
+{ ss[8] = ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ ss[0] ^= ss[8]; \
+ ss[8] = ff(ss[8]); \
+ k[v(56, (8 * (i)) + 8)] = ss[8] ^= k[v(56, (8 * (i)))]; \
+ ss[1] ^= ss[0]; \
+ k[v(56, (8 * (i)) + 9)] = ss[8] ^= k[v(56, (8 * (i)) + 1)]; \
+ ss[2] ^= ss[1]; \
+ k[v(56, (8 * (i)) + 10)] = ss[8] ^= k[v(56, (8 * (i)) + 2)]; \
+ ss[3] ^= ss[2]; \
+ k[v(56, (8 * (i)) + 11)] = ss[8] ^= k[v(56, (8 * (i)) + 3)]; \
+ ss[8] = ls_box(ss[3], 0); \
+ ss[4] ^= ss[8]; \
+ ss[8] = ff(ss[8]); \
+ k[v(56, (8 * (i)) + 12)] = ss[8] ^= k[v(56, (8 * (i)) + 4)]; \
+ ss[5] ^= ss[4]; \
+ k[v(56, (8 * (i)) + 13)] = ss[8] ^= k[v(56, (8 * (i)) + 5)]; \
+ ss[6] ^= ss[5]; \
+ k[v(56, (8 * (i)) + 14)] = ss[8] ^= k[v(56, (8 * (i)) + 6)]; \
+ ss[7] ^= ss[6]; \
+ k[v(56, (8 * (i)) + 15)] = ss[8] ^= k[v(56, (8 * (i)) + 7)]; \
+}
+
+#define kdl8(k, i) \
+{ ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[v(56, (8 * (i)) + 8)] = ss[0]; \
+ ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ss[1]; \
+ ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ss[2]; \
+ ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ss[3]; \
+}
+
+static void
+aes_decrypt_key256(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[9];
+#if defined(d_vars)
+ d_vars;
+#endif
+ rk[v(56, (0))] = ss[0] = word_in(key, 0);
+ rk[v(56, (1))] = ss[1] = word_in(key, 1);
+ rk[v(56, (2))] = ss[2] = word_in(key, 2);
+ rk[v(56, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+ ss[4] = word_in(key, 4);
+ rk[v(56, (4))] = ff(ss[4]);
+ ss[5] = word_in(key, 5);
+ rk[v(56, (5))] = ff(ss[5]);
+ ss[6] = word_in(key, 6);
+ rk[v(56, (6))] = ff(ss[6]);
+ ss[7] = word_in(key, 7);
+ rk[v(56, (7))] = ff(ss[7]);
+ kdf8(rk, 0); kd8(rk, 1);
+ kd8(rk, 2); kd8(rk, 3);
+ kd8(rk, 4); kd8(rk, 5);
+ kdl8(rk, 6);
+#else
+ rk[v(56, (4))] = ss[4] = word_in(key, 4);
+ rk[v(56, (5))] = ss[5] = word_in(key, 5);
+ rk[v(56, (6))] = ss[6] = word_in(key, 6);
+ rk[v(56, (7))] = ss[7] = word_in(key, 7);
+ {
+ uint32_t i;
+
+ for (i = 0; i < 6; ++i)
+ k8e(rk, i);
+ k8ef(rk, 6);
+#if !(DEC_ROUND == NO_TABLES)
+ for (i = MAX_AES_NB; i < 14 * MAX_AES_NB; ++i)
+ rk[i] = inv_mcol(rk[i]);
+#endif
+ }
+#endif /* DEC_KS_UNROLL */
+}
+
+
+/*
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * Return the number of rounds for the given cipher key size.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4 * (Nr + 1).
+ *
+ * Parameters:
+ * rk AES key schedule 32-bit array to be initialized
+ * cipherKey User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+int
+rijndael_key_setup_dec_amd64(uint32_t rk[], const uint32_t cipherKey[],
+ int keyBits)
+{
+ switch (keyBits) {
+ case 128:
+ aes_decrypt_key128((unsigned char *)&cipherKey[0], rk);
+ return (10);
+ case 192:
+ aes_decrypt_key192((unsigned char *)&cipherKey[0], rk);
+ return (12);
+ case 256:
+ aes_decrypt_key256((unsigned char *)&cipherKey[0], rk);
+ return (14);
+ default: /* should never get here */
+ break;
+ }
+
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aesopt.h b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aesopt.h
new file mode 100644
index 000000000000..472111f96e59
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aesopt.h
@@ -0,0 +1,770 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ *
+ * This file contains the compilation options for AES (Rijndael) and code
+ * that is common across encryption, key scheduling and table generation.
+ *
+ * OPERATION
+ *
+ * These source code files implement the AES algorithm Rijndael designed by
+ * Joan Daemen and Vincent Rijmen. This version is designed for the standard
+ * block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24
+ * and 32 bytes).
+ *
+ * This version is designed for flexibility and speed using operations on
+ * 32-bit words rather than operations on bytes. It can be compiled with
+ * either big or little endian internal byte order but is faster when the
+ * native byte order for the processor is used.
+ *
+ * THE CIPHER INTERFACE
+ *
+ * The cipher interface is implemented as an array of bytes in which lower
+ * AES bit sequence indexes map to higher numeric significance within bytes.
+ */
+
+/*
+ * OpenSolaris changes
+ * 1. Added __cplusplus and _AESTAB_H header guards
+ * 2. Added header files sys/types.h and aes_impl.h
+ * 3. Added defines for AES_ENCRYPT, AES_DECRYPT, AES_REV_DKS, and ASM_AMD64_C
+ * 4. Moved defines for IS_BIG_ENDIAN, IS_LITTLE_ENDIAN, PLATFORM_BYTE_ORDER
+ * from brg_endian.h
+ * 5. Undefined VIA_ACE_POSSIBLE and ASSUME_VIA_ACE_PRESENT
+ * 6. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 7. Defined aes_sw32 as htonl() for byte swapping
+ * 8. Cstyled and hdrchk code
+ *
+ */
+
+#ifndef _AESOPT_H
+#define _AESOPT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <aes/aes_impl.h>
+
+/* SUPPORT FEATURES */
+#define AES_ENCRYPT /* if support for encryption is needed */
+#define AES_DECRYPT /* if support for decryption is needed */
+
+/* PLATFORM-SPECIFIC FEATURES */
+#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
+#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#define AES_REV_DKS /* define to reverse decryption key schedule */
+
+
+/*
+ * CONFIGURATION - THE USE OF DEFINES
+ * Later in this section there are a number of defines that control the
+ * operation of the code. In each section, the purpose of each define is
+ * explained so that the relevant form can be included or excluded by
+ * setting either 1's or 0's respectively on the branches of the related
+ * #if clauses. The following local defines should not be changed.
+ */
+
+#define ENCRYPTION_IN_C 1
+#define DECRYPTION_IN_C 2
+#define ENC_KEYING_IN_C 4
+#define DEC_KEYING_IN_C 8
+
+#define NO_TABLES 0
+#define ONE_TABLE 1
+#define FOUR_TABLES 4
+#define NONE 0
+#define PARTIAL 1
+#define FULL 2
+
+/* --- START OF USER CONFIGURED OPTIONS --- */
+
+/*
+ * 1. BYTE ORDER WITHIN 32 BIT WORDS
+ *
+ * The fundamental data processing units in Rijndael are 8-bit bytes. The
+ * input, output and key input are all enumerated arrays of bytes in which
+ * bytes are numbered starting at zero and increasing to one less than the
+ * number of bytes in the array in question. This enumeration is only used
+ * for naming bytes and does not imply any adjacency or order relationship
+ * from one byte to another. When these inputs and outputs are considered
+ * as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to
+ * byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte.
+ * In this implementation bits are numbered from 0 to 7 starting at the
+ * numerically least significant end of each byte. Bit n represents 2^n.
+ *
+ * However, Rijndael can be implemented more efficiently using 32-bit
+ * words by packing bytes into words so that bytes 4*n to 4*n+3 are placed
+ * into word[n]. While in principle these bytes can be assembled into words
+ * in any positions, this implementation only supports the two formats in
+ * which bytes in adjacent positions within words also have adjacent byte
+ * numbers. This order is called big-endian if the lowest numbered bytes
+ * in words have the highest numeric significance and little-endian if the
+ * opposite applies.
+ *
+ * This code can work in either order irrespective of the order used by the
+ * machine on which it runs. Normally the internal byte order will be set
+ * to the order of the processor on which the code is to be run but this
+ * define can be used to reverse this in special situations
+ *
+ * WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set.
+ * This define will hence be redefined later (in section 4) if necessary
+ */
+
+#if 1
+#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#elif 0
+#define ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0
+#define ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#error The algorithm byte order is not defined
+#endif
+
+/* 2. VIA ACE SUPPORT */
+
+#if defined(__GNUC__) && defined(__i386__) || \
+ defined(_WIN32) && defined(_M_IX86) && \
+ !(defined(_WIN64) || defined(_WIN32_WCE) || \
+ defined(_MSC_VER) && (_MSC_VER <= 800))
+#define VIA_ACE_POSSIBLE
+#endif
+
+/*
+ * Define this option if support for the VIA ACE is required. This uses
+ * inline assembler instructions and is only implemented for the Microsoft,
+ * Intel and GCC compilers. If VIA ACE is known to be present, then defining
+ * ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption
+ * code. If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if
+ * it is detected (both present and enabled) but the normal AES code will
+ * also be present.
+ *
+ * When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte
+ * aligned; other input/output buffers do not need to be 16 byte aligned
+ * but there are very large performance gains if this can be arranged.
+ * VIA ACE also requires the decryption key schedule to be in reverse
+ * order (which later checks below ensure).
+ */
+
+/* VIA ACE is not used here for OpenSolaris: */
+#undef VIA_ACE_POSSIBLE
+#undef ASSUME_VIA_ACE_PRESENT
+
+#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(USE_VIA_ACE_IF_PRESENT)
+#define USE_VIA_ACE_IF_PRESENT
+#endif
+
+#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(ASSUME_VIA_ACE_PRESENT)
+#define ASSUME_VIA_ACE_PRESENT
+#endif
+
+
+/*
+ * 3. ASSEMBLER SUPPORT
+ *
+ * This define (which can be on the command line) enables the use of the
+ * assembler code routines for encryption, decryption and key scheduling
+ * as follows:
+ *
+ * ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for
+ *     encryption and decryption but with key scheduling in C
+ * ASM_X86_V2 uses assembler (aes_x86_v2.asm) with compressed tables for
+ * encryption, decryption and key scheduling
+ * ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for
+ *     encryption and decryption but with key scheduling in C
+ * ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for
+ *     encryption and decryption but with key scheduling in C
+ *
+ * Change one 'if 0' below to 'if 1' to select the version or define
+ * as a compilation option.
+ */
+
+#if 0 && !defined(ASM_X86_V1C)
+#define ASM_X86_V1C
+#elif 0 && !defined(ASM_X86_V2)
+#define ASM_X86_V2
+#elif 0 && !defined(ASM_X86_V2C)
+#define ASM_X86_V2C
+#elif 1 && !defined(ASM_AMD64_C)
+#define ASM_AMD64_C
+#endif
+
+#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2) || defined(ASM_X86_V2C)) && \
+ !defined(_M_IX86) || defined(ASM_AMD64_C) && !defined(_M_X64) && \
+ !defined(__amd64)
+#error Assembler code is only available for x86 and AMD64 systems
+#endif
+
+/*
+ * 4. FAST INPUT/OUTPUT OPERATIONS.
+ *
+ * On some machines it is possible to improve speed by transferring the
+ * bytes in the input and output arrays to and from the internal 32-bit
+ * variables by addressing these arrays as if they are arrays of 32-bit
+ * words. On some machines this will always be possible but there may
+ * be a large performance penalty if the byte arrays are not aligned on
+ * the normal word boundaries. On other machines this technique will
+ * lead to memory access errors when such 32-bit word accesses are not
+ * properly aligned. The option SAFE_IO avoids such problems but will
+ * often be slower on those machines that support misaligned access
+ * (especially so if care is taken to align the input and output byte
+ * arrays on 32-bit word boundaries). If SAFE_IO is not defined it is
+ * assumed that access to byte arrays as if they are arrays of 32-bit
+ * words will not cause problems when such accesses are misaligned.
+ */
+#if 1 && !defined(_MSC_VER)
+#define SAFE_IO
+#endif
+
+/*
+ * 5. LOOP UNROLLING
+ *
+ * The code for encryption and decryption cycles through a number of rounds
+ * that can be implemented either in a loop or by expanding the code into a
+ * long sequence of instructions, the latter producing a larger program but
+ * one that will often be much faster. The latter is called loop unrolling.
+ * There are also potential speed advantages in expanding two iterations in
+ * a loop with half the number of iterations, which is called partial loop
+ * unrolling. The following options allow partial or full loop unrolling
+ * to be set independently for encryption and decryption
+ */
+#if 1
+#define ENC_UNROLL FULL
+#elif 0
+#define ENC_UNROLL PARTIAL
+#else
+#define ENC_UNROLL NONE
+#endif
+
+#if 1
+#define DEC_UNROLL FULL
+#elif 0
+#define DEC_UNROLL PARTIAL
+#else
+#define DEC_UNROLL NONE
+#endif
+
+#if 1
+#define ENC_KS_UNROLL
+#endif
+
+#if 1
+#define DEC_KS_UNROLL
+#endif
+
+/*
+ * 6. FAST FINITE FIELD OPERATIONS
+ *
+ * If this section is included, tables are used to provide faster finite
+ * field arithmetic. This has no effect if FIXED_TABLES is defined.
+ */
+#if 1
+#define FF_TABLES
+#endif
+
+/*
+ * 7. INTERNAL STATE VARIABLE FORMAT
+ *
+ * The internal state of Rijndael is stored in a number of local 32-bit
+ * word variables which can be defined either as an array or as individual
+ * named variables. Include this section if you want to store these local
+ * variables in arrays. Otherwise individual local variables will be used.
+ */
+#if 1
+#define ARRAYS
+#endif
+
+/*
+ * 8. FIXED OR DYNAMIC TABLES
+ *
+ * When this section is included the tables used by the code are compiled
+ * statically into the binary file. Otherwise the subroutine aes_init()
+ * must be called to compute them before the code is first used.
+ */
+#if 1 && !(defined(_MSC_VER) && (_MSC_VER <= 800))
+#define FIXED_TABLES
+#endif
+
+/*
+ * 9. MASKING OR CASTING FROM LONGER VALUES TO BYTES
+ *
+ * In some systems it is better to mask longer values to extract bytes
+ * rather than using a cast. This option allows this choice.
+ */
+#if 0
+#define to_byte(x) ((uint8_t)(x))
+#else
+#define to_byte(x) ((x) & 0xff)
+#endif
+
+/*
+ * 10. TABLE ALIGNMENT
+ *
+ * On some systems speed will be improved by aligning the AES large lookup
+ * tables on particular boundaries. This define should be set to a power of
+ * two giving the desired alignment. It can be left undefined if alignment
+ * is not needed. This option is specific to the Microsoft VC++ compiler -
+ * it seems to sometimes cause trouble for the VC++ version 6 compiler.
+ */
+
+#if 1 && defined(_MSC_VER) && (_MSC_VER >= 1300)
+#define TABLE_ALIGN 32
+#endif
+
+/*
+ * 11. REDUCE CODE AND TABLE SIZE
+ *
+ * This replaces some expanded macros with function calls if ASM_X86_V2 or
+ * ASM_X86_V2C is defined
+ */
+
+#if 1 && (defined(ASM_X86_V2) || defined(ASM_X86_V2C))
+#define REDUCE_CODE_SIZE
+#endif
+
+/*
+ * 12. TABLE OPTIONS
+ *
+ * This cipher proceeds by repeating a number of cycles known as rounds,
+ * which are implemented by a round function that can optionally be sped
+ * up using tables.  The basic tables are 256 32-bit words, with either
+ * one or four tables being required for each round function depending on
+ * how much speed is required.  The encryption and decryption round functions
+ * are different, and the last encryption and decryption round functions are
+ * different again, making four different round functions in all.
+ *
+ * This means that:
+ * 1. Normal encryption and decryption rounds can each use either 0, 1
+ * or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
+ * 2. The last encryption and decryption rounds can also use either 0, 1
+ * or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
+ *
+ * Include or exclude the appropriate definitions below to set the number
+ * of tables used by this implementation.
+ */
+
+#if 1 /* set tables for the normal encryption round */
+#define ENC_ROUND FOUR_TABLES
+#elif 0
+#define ENC_ROUND ONE_TABLE
+#else
+#define ENC_ROUND NO_TABLES
+#endif
+
+#if 1 /* set tables for the last encryption round */
+#define LAST_ENC_ROUND FOUR_TABLES
+#elif 0
+#define LAST_ENC_ROUND ONE_TABLE
+#else
+#define LAST_ENC_ROUND NO_TABLES
+#endif
+
+#if 1 /* set tables for the normal decryption round */
+#define DEC_ROUND FOUR_TABLES
+#elif 0
+#define DEC_ROUND ONE_TABLE
+#else
+#define DEC_ROUND NO_TABLES
+#endif
+
+#if 1 /* set tables for the last decryption round */
+#define LAST_DEC_ROUND FOUR_TABLES
+#elif 0
+#define LAST_DEC_ROUND ONE_TABLE
+#else
+#define LAST_DEC_ROUND NO_TABLES
+#endif
+
+/*
+ * The decryption key schedule can be speeded up with tables in the same
+ * way that the round functions can. Include or exclude the following
+ * defines to set this requirement.
+ */
+#if 1
+#define KEY_SCHED FOUR_TABLES
+#elif 0
+#define KEY_SCHED ONE_TABLE
+#else
+#define KEY_SCHED NO_TABLES
+#endif
+
+/* ---- END OF USER CONFIGURED OPTIONS ---- */
+
+/* VIA ACE support is only available for VC++ and GCC */
+
+#if !defined(_MSC_VER) && !defined(__GNUC__)
+#if defined(ASSUME_VIA_ACE_PRESENT)
+#undef ASSUME_VIA_ACE_PRESENT
+#endif
+#if defined(USE_VIA_ACE_IF_PRESENT)
+#undef USE_VIA_ACE_IF_PRESENT
+#endif
+#endif
+
+#if defined(ASSUME_VIA_ACE_PRESENT) && !defined(USE_VIA_ACE_IF_PRESENT)
+#define USE_VIA_ACE_IF_PRESENT
+#endif
+
+#if defined(USE_VIA_ACE_IF_PRESENT) && !defined(AES_REV_DKS)
+#define AES_REV_DKS
+#endif
+
+/* Assembler support requires the use of platform byte order */
+
+#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2C) || defined(ASM_AMD64_C)) && \
+ (ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER)
+#undef ALGORITHM_BYTE_ORDER
+#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#endif
+
+/*
+ * In this implementation the columns of the state array are each held in
+ * 32-bit words. The state array can be held in various ways: in an array
+ * of words, in a number of individual word variables or in a number of
+ * processor registers. The following define maps a variable name x and
+ * a column number c to the way the state array variable is to be held.
+ * The first define below maps the state into an array x[c] whereas the
+ * second form maps the state into a number of individual variables x0,
+ * x1, etc. Another form could map individual state columns to machine
+ * register names.
+ */
+
+#if defined(ARRAYS)
+#define s(x, c) x[c]
+#else
+#define s(x, c) x##c
+#endif
+
+/*
+ * This implementation provides subroutines for encryption, decryption
+ * and for setting the three key lengths (separately) for encryption
+ * and decryption. Since not all functions are needed, masks are set
+ * up here to determine which will be implemented in C.
+ */
+
+#if !defined(AES_ENCRYPT)
+#define EFUNCS_IN_C 0
+#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \
+ defined(ASM_X86_V2C) || defined(ASM_AMD64_C)
+#define EFUNCS_IN_C ENC_KEYING_IN_C
+#elif !defined(ASM_X86_V2)
+#define EFUNCS_IN_C (ENCRYPTION_IN_C | ENC_KEYING_IN_C)
+#else
+#define EFUNCS_IN_C 0
+#endif
+
+#if !defined(AES_DECRYPT)
+#define DFUNCS_IN_C 0
+#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \
+ defined(ASM_X86_V2C) || defined(ASM_AMD64_C)
+#define DFUNCS_IN_C DEC_KEYING_IN_C
+#elif !defined(ASM_X86_V2)
+#define DFUNCS_IN_C (DECRYPTION_IN_C | DEC_KEYING_IN_C)
+#else
+#define DFUNCS_IN_C 0
+#endif
+
+#define FUNCS_IN_C (EFUNCS_IN_C | DFUNCS_IN_C)
+
+/* END OF CONFIGURATION OPTIONS */
+
+/* Disable or report errors on some combinations of options */
+
+#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES
+#undef LAST_ENC_ROUND
+#define LAST_ENC_ROUND NO_TABLES
+#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES
+#undef LAST_ENC_ROUND
+#define LAST_ENC_ROUND ONE_TABLE
+#endif
+
+#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE
+#undef ENC_UNROLL
+#define ENC_UNROLL NONE
+#endif
+
+#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES
+#undef LAST_DEC_ROUND
+#define LAST_DEC_ROUND NO_TABLES
+#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES
+#undef LAST_DEC_ROUND
+#define LAST_DEC_ROUND ONE_TABLE
+#endif
+
+#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE
+#undef DEC_UNROLL
+#define DEC_UNROLL NONE
+#endif
+
+#if (ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define aes_sw32 htonl
+#elif defined(bswap32)
+#define aes_sw32 bswap32
+#elif defined(bswap_32)
+#define aes_sw32 bswap_32
+#else
+#define brot(x, n) (((uint32_t)(x) << (n)) | ((uint32_t)(x) >> (32 - (n))))
+#define aes_sw32(x) ((brot((x), 8) & 0x00ff00ff) | (brot((x), 24) & 0xff00ff00))
+#endif
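
For reference, the rotate-based fallback above is simply a 32-bit byte swap built from two rotations and two masks. The following minimal, standalone C sketch (illustrative only, not part of the patch) copies those two macros and checks the behaviour:

#include <assert.h>
#include <stdint.h>

/* Same fallback definitions as above: brot() rotates left by n bits. */
#define brot(x, n) (((uint32_t)(x) << (n)) | ((uint32_t)(x) >> (32 - (n))))
#define aes_sw32(x) ((brot((x), 8) & 0x00ff00ff) | (brot((x), 24) & 0xff00ff00))

int
main(void)
{
        uint32_t v = 0x11223344;

        assert(aes_sw32(v) == 0x44332211);      /* bytes are reversed */
        assert(aes_sw32(aes_sw32(v)) == v);     /* swapping twice is a no-op */
        return (0);
}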
+
+
+/*
+ * upr(x, n): rotates bytes within words by n positions, moving bytes to
+ * higher index positions with wrap around into low positions
+ * ups(x, n): moves bytes by n positions to higher index positions in
+ * words but without wrap around
+ * bval(x, n): extracts a byte from a word
+ *
+ * WARNING: The definitions given here are intended only for use with
+ * unsigned variables and with shift counts that are compile
+ * time constants
+ */
+
+#if (ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define upr(x, n) (((uint32_t)(x) << (8 * (n))) | \
+ ((uint32_t)(x) >> (32 - 8 * (n))))
+#define ups(x, n) ((uint32_t)(x) << (8 * (n)))
+#define bval(x, n) to_byte((x) >> (8 * (n)))
+#define bytes2word(b0, b1, b2, b3) \
+ (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | \
+ ((uint32_t)(b1) << 8) | (b0))
+#endif
+
+#if (ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN)
+#define upr(x, n) (((uint32_t)(x) >> (8 * (n))) | \
+ ((uint32_t)(x) << (32 - 8 * (n))))
+#define ups(x, n) ((uint32_t)(x) >> (8 * (n)))
+#define bval(x, n) to_byte((x) >> (24 - 8 * (n)))
+#define bytes2word(b0, b1, b2, b3) \
+ (((uint32_t)(b0) << 24) | ((uint32_t)(b1) << 16) | \
+ ((uint32_t)(b2) << 8) | (b3))
+#endif
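
In the little-endian variants above, upr() is a rotation of whole bytes towards higher positions and bval() extracts byte n. A small standalone sketch (illustrative only, not part of the patch; to_byte() is assumed to be the usual cast to uint8_t defined earlier in aesopt.h) shows the intended behaviour:

#include <assert.h>
#include <stdint.h>

#define to_byte(x) ((uint8_t)(x))               /* assumed helper */
#define upr(x, n) (((uint32_t)(x) << (8 * (n))) | \
        ((uint32_t)(x) >> (32 - 8 * (n))))
#define bval(x, n) to_byte((x) >> (8 * (n)))
#define bytes2word(b0, b1, b2, b3) \
        (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | \
        ((uint32_t)(b1) << 8) | (b0))

int
main(void)
{
        uint32_t w = bytes2word(0x44, 0x33, 0x22, 0x11);  /* 0x11223344 */

        assert(bval(w, 0) == 0x44);             /* byte 0 is the low byte */
        assert(bval(w, 3) == 0x11);             /* byte 3 is the high byte */
        assert(upr(w, 1) == 0x22334411);        /* bytes rotated up by one */
        return (0);
}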
+
+#if defined(SAFE_IO)
+#define word_in(x, c) bytes2word(((const uint8_t *)(x) + 4 * c)[0], \
+ ((const uint8_t *)(x) + 4 * c)[1], \
+ ((const uint8_t *)(x) + 4 * c)[2], \
+ ((const uint8_t *)(x) + 4 * c)[3])
+#define word_out(x, c, v) { ((uint8_t *)(x) + 4 * c)[0] = bval(v, 0); \
+ ((uint8_t *)(x) + 4 * c)[1] = bval(v, 1); \
+ ((uint8_t *)(x) + 4 * c)[2] = bval(v, 2); \
+ ((uint8_t *)(x) + 4 * c)[3] = bval(v, 3); }
+#elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER)
+#define word_in(x, c) (*((uint32_t *)(x) + (c)))
+#define word_out(x, c, v) (*((uint32_t *)(x) + (c)) = (v))
+#else
+#define word_in(x, c) aes_sw32(*((uint32_t *)(x) + (c)))
+#define word_out(x, c, v) (*((uint32_t *)(x) + (c)) = aes_sw32(v))
+#endif
+
+/* the finite field modular polynomial and elements */
+
+#define WPOLY 0x011b
+#define BPOLY 0x1b
+
+/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */
+
+#define m1 0x80808080
+#define m2 0x7f7f7f7f
+#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY))
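
gf_mulx() doubles each of the four bytes packed in a 32-bit word in GF(2^8) modulo the AES polynomial (the per-byte "xtime" operation); the m1/m2 masks keep the carry of one byte lane from spilling into the next. A standalone sketch (illustrative only, not part of the patch; the xtime() reference helper is introduced here just for comparison) checks one word against a per-byte reference:

#include <assert.h>
#include <stdint.h>

#define BPOLY 0x1b
#define m1 0x80808080
#define m2 0x7f7f7f7f
#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY))

/* Reference: multiply one byte by {02} in GF(2^8) mod x^8+x^4+x^3+x+1. */
static uint8_t
xtime(uint8_t b)
{
        return ((uint8_t)((b << 1) ^ ((b & 0x80) ? BPOLY : 0)));
}

int
main(void)
{
        uint32_t x = 0x80402001;        /* byte lanes 0x01, 0x20, 0x40, 0x80 */
        uint32_t y = gf_mulx(x);

        /* Each lane is doubled independently: 0x02, 0x40, 0x80, 0x1b. */
        assert(y == 0x1b804002);
        assert((uint8_t)y == xtime(0x01));
        assert((uint8_t)(y >> 24) == xtime(0x80));
        return (0);
}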
+
+/*
+ * The following defines provide alternative definitions of gf_mulx that might
+ * give improved performance if a fast 32-bit multiply is not available. Note
+ * that a temporary variable u needs to be defined where gf_mulx is used.
+ *
+ * #define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ \
+ * ((u >> 3) | (u >> 6))
+ * #define m4 (0x01010101 * BPOLY)
+ * #define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) \
+ * & m4)
+ */
+
+/* Work out which tables are needed for the different options */
+
+#if defined(ASM_X86_V1C)
+#if defined(ENC_ROUND)
+#undef ENC_ROUND
+#endif
+#define ENC_ROUND FOUR_TABLES
+#if defined(LAST_ENC_ROUND)
+#undef LAST_ENC_ROUND
+#endif
+#define LAST_ENC_ROUND FOUR_TABLES
+#if defined(DEC_ROUND)
+#undef DEC_ROUND
+#endif
+#define DEC_ROUND FOUR_TABLES
+#if defined(LAST_DEC_ROUND)
+#undef LAST_DEC_ROUND
+#endif
+#define LAST_DEC_ROUND FOUR_TABLES
+#if defined(KEY_SCHED)
+#undef KEY_SCHED
+#define KEY_SCHED FOUR_TABLES
+#endif
+#endif
+
+#if (FUNCS_IN_C & ENCRYPTION_IN_C) || defined(ASM_X86_V1C)
+#if ENC_ROUND == ONE_TABLE
+#define FT1_SET
+#elif ENC_ROUND == FOUR_TABLES
+#define FT4_SET
+#else
+#define SBX_SET
+#endif
+#if LAST_ENC_ROUND == ONE_TABLE
+#define FL1_SET
+#elif LAST_ENC_ROUND == FOUR_TABLES
+#define FL4_SET
+#elif !defined(SBX_SET)
+#define SBX_SET
+#endif
+#endif
+
+#if (FUNCS_IN_C & DECRYPTION_IN_C) || defined(ASM_X86_V1C)
+#if DEC_ROUND == ONE_TABLE
+#define IT1_SET
+#elif DEC_ROUND == FOUR_TABLES
+#define IT4_SET
+#else
+#define ISB_SET
+#endif
+#if LAST_DEC_ROUND == ONE_TABLE
+#define IL1_SET
+#elif LAST_DEC_ROUND == FOUR_TABLES
+#define IL4_SET
+#elif !defined(ISB_SET)
+#define ISB_SET
+#endif
+#endif
+
+
+#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \
+ defined(ASM_X86_V2C)))
+#if ((FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C))
+#if KEY_SCHED == ONE_TABLE
+#if !defined(FL1_SET) && !defined(FL4_SET)
+#define LS1_SET
+#endif
+#elif KEY_SCHED == FOUR_TABLES
+#if !defined(FL4_SET)
+#define LS4_SET
+#endif
+#elif !defined(SBX_SET)
+#define SBX_SET
+#endif
+#endif
+#if (FUNCS_IN_C & DEC_KEYING_IN_C)
+#if KEY_SCHED == ONE_TABLE
+#define IM1_SET
+#elif KEY_SCHED == FOUR_TABLES
+#define IM4_SET
+#elif !defined(SBX_SET)
+#define SBX_SET
+#endif
+#endif
+#endif
+
+/* generic definitions of Rijndael macros that use tables */
+
+#define no_table(x, box, vf, rf, c) bytes2word(\
+ box[bval(vf(x, 0, c), rf(0, c))], \
+ box[bval(vf(x, 1, c), rf(1, c))], \
+ box[bval(vf(x, 2, c), rf(2, c))], \
+ box[bval(vf(x, 3, c), rf(3, c))])
+
+#define one_table(x, op, tab, vf, rf, c) \
+ (tab[bval(vf(x, 0, c), rf(0, c))] \
+ ^ op(tab[bval(vf(x, 1, c), rf(1, c))], 1) \
+ ^ op(tab[bval(vf(x, 2, c), rf(2, c))], 2) \
+ ^ op(tab[bval(vf(x, 3, c), rf(3, c))], 3))
+
+#define four_tables(x, tab, vf, rf, c) \
+ (tab[0][bval(vf(x, 0, c), rf(0, c))] \
+ ^ tab[1][bval(vf(x, 1, c), rf(1, c))] \
+ ^ tab[2][bval(vf(x, 2, c), rf(2, c))] \
+ ^ tab[3][bval(vf(x, 3, c), rf(3, c))])
+
+#define vf1(x, r, c) (x)
+#define rf1(r, c) (r)
+#define rf2(r, c) ((8+r-c)&3)
+
+/*
+ * Perform forward and inverse column mix operation on four bytes in long word
+ * x in parallel. NOTE: x must be a simple variable, NOT an expression in
+ * these macros.
+ */
+
+#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \
+ defined(ASM_X86_V2C)))
+
+#if defined(FM4_SET) /* not currently used */
+#define fwd_mcol(x) four_tables(x, t_use(f, m), vf1, rf1, 0)
+#elif defined(FM1_SET) /* not currently used */
+#define fwd_mcol(x) one_table(x, upr, t_use(f, m), vf1, rf1, 0)
+#else
+#define dec_fmvars uint32_t g2
+#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ \
+ upr((x), 2) ^ upr((x), 1))
+#endif
+
+#if defined(IM4_SET)
+#define inv_mcol(x) four_tables(x, t_use(i, m), vf1, rf1, 0)
+#elif defined(IM1_SET)
+#define inv_mcol(x) one_table(x, upr, t_use(i, m), vf1, rf1, 0)
+#else
+#define dec_imvars uint32_t g2, g4, g9
+#define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = \
+ (x) ^ gf_mulx(g4), g4 ^= g9, \
+ (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ \
+ upr(g4, 2) ^ upr(g9, 1))
+#endif
+
+#if defined(FL4_SET)
+#define ls_box(x, c) four_tables(x, t_use(f, l), vf1, rf2, c)
+#elif defined(LS4_SET)
+#define ls_box(x, c) four_tables(x, t_use(l, s), vf1, rf2, c)
+#elif defined(FL1_SET)
+#define ls_box(x, c) one_table(x, upr, t_use(f, l), vf1, rf2, c)
+#elif defined(LS1_SET)
+#define ls_box(x, c) one_table(x, upr, t_use(l, s), vf1, rf2, c)
+#else
+#define ls_box(x, c) no_table(x, t_use(s, box), vf1, rf2, c)
+#endif
+
+#endif
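
When no mixing tables are configured, the fwd_mcol()/inv_mcol() fallbacks above compute the AES MixColumns/InvMixColumns transform on one packed column using gf_mulx() and byte rotations. The following standalone sketch (illustrative only, not part of the patch) assumes the little-endian upr() variant and checks the widely used MixColumns test column db 13 53 45 -> 8e 4d a1 bc:

#include <assert.h>
#include <stdint.h>

#define BPOLY 0x1b
#define m1 0x80808080
#define m2 0x7f7f7f7f
#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY))
#define upr(x, n) (((uint32_t)(x) << (8 * (n))) | \
        ((uint32_t)(x) >> (32 - 8 * (n))))
#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ \
        upr((x), 2) ^ upr((x), 1))

int
main(void)
{
        uint32_t g2;                    /* temporary required by fwd_mcol() */
        uint32_t col = 0x455313db;      /* bytes db 13 53 45, low byte first */

        assert(fwd_mcol(col) == 0xbca14d8e);    /* bytes 8e 4d a1 bc */
        return (0);
}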
+
+#if defined(ASM_X86_V1C) && defined(AES_DECRYPT) && !defined(ISB_SET)
+#define ISB_SET
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AESOPT_H */
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab.h b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab.h
new file mode 100644
index 000000000000..33cdb6c6f9fe
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab.h
@@ -0,0 +1,165 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ *
+ * This file contains the code for declaring the tables needed to implement
+ * AES. The file aesopt.h is assumed to be included before this header file.
+ * If there are no global variables, the definitions here can be used to put
+ * the AES tables in a structure so that a pointer can then be added to the
+ * AES context to pass them to the AES routines that need them. If this
+ * facility is used, the calling program has to ensure that this pointer is
+ * managed appropriately. In particular, the value of the t_dec(in, it) item
+ * in the table structure must be set to zero in order to ensure that the
+ * tables are initialised. In practice the three code sequences in aeskey.c
+ * that control the calls to aes_init() and the aes_init() routine itself will
+ * have to be changed for a specific implementation. If global variables are
+ * available it will generally be preferable to use them with the precomputed
+ * FIXED_TABLES option that uses static global tables.
+ *
+ * The following defines can be used to control the way the tables
+ * are defined, initialised and used in embedded environments that
+ * require special features for these purposes:
+ *
+ * the 't_dec' construction is used to declare fixed table arrays
+ * the 't_set' construction is used to set fixed table values
+ * the 't_use' construction is used to access fixed table values
+ *
+ * 256 byte tables:
+ *
+ * t_xxx(s, box) => forward S box
+ * t_xxx(i, box) => inverse S box
+ *
+ * 256 32-bit word OR 4 x 256 32-bit word tables:
+ *
+ * t_xxx(f, n) => forward normal round
+ * t_xxx(f, l) => forward last round
+ * t_xxx(i, n) => inverse normal round
+ * t_xxx(i, l) => inverse last round
+ * t_xxx(l, s) => key schedule table
+ * t_xxx(i, m) => key schedule table
+ *
+ * Other variables and tables:
+ *
+ * t_xxx(r, c) => the rcon table
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * 1. Added __cplusplus and _AESTAB_H header guards
+ * 2. Added header file sys/types.h
+ * 3. Removed code defined for _MSC_VER
+ * 4. Changed all variables to "static const"
+ * 5. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 6. Cstyled and hdrchk code
+ */
+
+#ifndef _AESTAB_H
+#define _AESTAB_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+
+#define t_dec(m, n) t_##m##n
+#define t_set(m, n) t_##m##n
+#define t_use(m, n) t_##m##n
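
The three constructions are plain token-pasting macros, so the same generated identifier is used whether a table is being declared, initialised or accessed (for example, t_dec(f, n) and t_use(f, n) both expand to t_fn). A tiny standalone illustration (not part of the patch; the four-entry table here is a toy stand-in for the real 256-entry S box):

#include <assert.h>
#include <stdint.h>

#define t_dec(m, n) t_##m##n
#define t_use(m, n) t_##m##n

/* t_dec(s, box) expands to the identifier t_sbox. */
static const uint8_t t_dec(s, box)[4] = { 0x63, 0x7c, 0x77, 0x7b };

int
main(void)
{
        /* t_use(s, box) names the same array declared above. */
        assert(t_use(s, box)[0] == 0x63);
        return (0);
}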
+
+#if defined(DO_TABLES) && defined(FIXED_TABLES)
+#define d_1(t, n, b, e) static const t n[256] = b(e)
+#define d_4(t, n, b, e, f, g, h) static const t n[4][256] = \
+ {b(e), b(f), b(g), b(h)}
+static const uint32_t t_dec(r, c)[RC_LENGTH] = rc_data(w0);
+#else
+#define d_1(t, n, b, e) static const t n[256]
+#define d_4(t, n, b, e, f, g, h) static const t n[4][256]
+static const uint32_t t_dec(r, c)[RC_LENGTH];
+#endif
+
+#if defined(SBX_SET)
+ d_1(uint8_t, t_dec(s, box), sb_data, h0);
+#endif
+#if defined(ISB_SET)
+ d_1(uint8_t, t_dec(i, box), isb_data, h0);
+#endif
+
+#if defined(FT1_SET)
+ d_1(uint32_t, t_dec(f, n), sb_data, u0);
+#endif
+#if defined(FT4_SET)
+ d_4(uint32_t, t_dec(f, n), sb_data, u0, u1, u2, u3);
+#endif
+
+#if defined(FL1_SET)
+ d_1(uint32_t, t_dec(f, l), sb_data, w0);
+#endif
+#if defined(FL4_SET)
+ d_4(uint32_t, t_dec(f, l), sb_data, w0, w1, w2, w3);
+#endif
+
+#if defined(IT1_SET)
+ d_1(uint32_t, t_dec(i, n), isb_data, v0);
+#endif
+#if defined(IT4_SET)
+ d_4(uint32_t, t_dec(i, n), isb_data, v0, v1, v2, v3);
+#endif
+
+#if defined(IL1_SET)
+ d_1(uint32_t, t_dec(i, l), isb_data, w0);
+#endif
+#if defined(IL4_SET)
+ d_4(uint32_t, t_dec(i, l), isb_data, w0, w1, w2, w3);
+#endif
+
+#if defined(LS1_SET)
+#if defined(FL1_SET)
+#undef LS1_SET
+#else
+ d_1(uint32_t, t_dec(l, s), sb_data, w0);
+#endif
+#endif
+
+#if defined(LS4_SET)
+#if defined(FL4_SET)
+#undef LS4_SET
+#else
+ d_4(uint32_t, t_dec(l, s), sb_data, w0, w1, w2, w3);
+#endif
+#endif
+
+#if defined(IM1_SET)
+ d_1(uint32_t, t_dec(i, m), mm_data, v0);
+#endif
+#if defined(IM4_SET)
+ d_4(uint32_t, t_dec(i, m), mm_data, v0, v1, v2, v3);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AESTAB_H */
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab2.h b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab2.h
new file mode 100644
index 000000000000..eb13f72b10d8
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab2.h
@@ -0,0 +1,594 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _AESTAB2_H
+#define _AESTAB2_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * To create this file for OpenSolaris:
+ * 1. Compile and run tablegen.c, from aes-src-04-03-08.zip,
+ * after defining ASM_AMD64_C
+ * 2. mv aestab2.c aestab2.h
+ * 3. Add __cplusplus and _AESTAB2_H header guards
+ * 4. Add #include <aes_impl.h>
+ * 5. Change "uint_32t" to "uint32_t"
+ * 6. Change all variables to "static const"
+ * 7. Cstyle and hdrchk this file
+ */
+
+#include <aes/aes_impl.h>
+
+static const uint32_t t_rc[RC_LENGTH] =
+{
+ 0x00000001, 0x00000002, 0x00000004, 0x00000008,
+ 0x00000010, 0x00000020, 0x00000040, 0x00000080,
+ 0x0000001b, 0x00000036
+};
+
+static const uint32_t t_ls[4][256] =
+{
+ {
+ 0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
+ 0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
+ 0x00000030, 0x00000001, 0x00000067, 0x0000002b,
+ 0x000000fe, 0x000000d7, 0x000000ab, 0x00000076,
+ 0x000000ca, 0x00000082, 0x000000c9, 0x0000007d,
+ 0x000000fa, 0x00000059, 0x00000047, 0x000000f0,
+ 0x000000ad, 0x000000d4, 0x000000a2, 0x000000af,
+ 0x0000009c, 0x000000a4, 0x00000072, 0x000000c0,
+ 0x000000b7, 0x000000fd, 0x00000093, 0x00000026,
+ 0x00000036, 0x0000003f, 0x000000f7, 0x000000cc,
+ 0x00000034, 0x000000a5, 0x000000e5, 0x000000f1,
+ 0x00000071, 0x000000d8, 0x00000031, 0x00000015,
+ 0x00000004, 0x000000c7, 0x00000023, 0x000000c3,
+ 0x00000018, 0x00000096, 0x00000005, 0x0000009a,
+ 0x00000007, 0x00000012, 0x00000080, 0x000000e2,
+ 0x000000eb, 0x00000027, 0x000000b2, 0x00000075,
+ 0x00000009, 0x00000083, 0x0000002c, 0x0000001a,
+ 0x0000001b, 0x0000006e, 0x0000005a, 0x000000a0,
+ 0x00000052, 0x0000003b, 0x000000d6, 0x000000b3,
+ 0x00000029, 0x000000e3, 0x0000002f, 0x00000084,
+ 0x00000053, 0x000000d1, 0x00000000, 0x000000ed,
+ 0x00000020, 0x000000fc, 0x000000b1, 0x0000005b,
+ 0x0000006a, 0x000000cb, 0x000000be, 0x00000039,
+ 0x0000004a, 0x0000004c, 0x00000058, 0x000000cf,
+ 0x000000d0, 0x000000ef, 0x000000aa, 0x000000fb,
+ 0x00000043, 0x0000004d, 0x00000033, 0x00000085,
+ 0x00000045, 0x000000f9, 0x00000002, 0x0000007f,
+ 0x00000050, 0x0000003c, 0x0000009f, 0x000000a8,
+ 0x00000051, 0x000000a3, 0x00000040, 0x0000008f,
+ 0x00000092, 0x0000009d, 0x00000038, 0x000000f5,
+ 0x000000bc, 0x000000b6, 0x000000da, 0x00000021,
+ 0x00000010, 0x000000ff, 0x000000f3, 0x000000d2,
+ 0x000000cd, 0x0000000c, 0x00000013, 0x000000ec,
+ 0x0000005f, 0x00000097, 0x00000044, 0x00000017,
+ 0x000000c4, 0x000000a7, 0x0000007e, 0x0000003d,
+ 0x00000064, 0x0000005d, 0x00000019, 0x00000073,
+ 0x00000060, 0x00000081, 0x0000004f, 0x000000dc,
+ 0x00000022, 0x0000002a, 0x00000090, 0x00000088,
+ 0x00000046, 0x000000ee, 0x000000b8, 0x00000014,
+ 0x000000de, 0x0000005e, 0x0000000b, 0x000000db,
+ 0x000000e0, 0x00000032, 0x0000003a, 0x0000000a,
+ 0x00000049, 0x00000006, 0x00000024, 0x0000005c,
+ 0x000000c2, 0x000000d3, 0x000000ac, 0x00000062,
+ 0x00000091, 0x00000095, 0x000000e4, 0x00000079,
+ 0x000000e7, 0x000000c8, 0x00000037, 0x0000006d,
+ 0x0000008d, 0x000000d5, 0x0000004e, 0x000000a9,
+ 0x0000006c, 0x00000056, 0x000000f4, 0x000000ea,
+ 0x00000065, 0x0000007a, 0x000000ae, 0x00000008,
+ 0x000000ba, 0x00000078, 0x00000025, 0x0000002e,
+ 0x0000001c, 0x000000a6, 0x000000b4, 0x000000c6,
+ 0x000000e8, 0x000000dd, 0x00000074, 0x0000001f,
+ 0x0000004b, 0x000000bd, 0x0000008b, 0x0000008a,
+ 0x00000070, 0x0000003e, 0x000000b5, 0x00000066,
+ 0x00000048, 0x00000003, 0x000000f6, 0x0000000e,
+ 0x00000061, 0x00000035, 0x00000057, 0x000000b9,
+ 0x00000086, 0x000000c1, 0x0000001d, 0x0000009e,
+ 0x000000e1, 0x000000f8, 0x00000098, 0x00000011,
+ 0x00000069, 0x000000d9, 0x0000008e, 0x00000094,
+ 0x0000009b, 0x0000001e, 0x00000087, 0x000000e9,
+ 0x000000ce, 0x00000055, 0x00000028, 0x000000df,
+ 0x0000008c, 0x000000a1, 0x00000089, 0x0000000d,
+ 0x000000bf, 0x000000e6, 0x00000042, 0x00000068,
+ 0x00000041, 0x00000099, 0x0000002d, 0x0000000f,
+ 0x000000b0, 0x00000054, 0x000000bb, 0x00000016
+ },
+ {
+ 0x00006300, 0x00007c00, 0x00007700, 0x00007b00,
+ 0x0000f200, 0x00006b00, 0x00006f00, 0x0000c500,
+ 0x00003000, 0x00000100, 0x00006700, 0x00002b00,
+ 0x0000fe00, 0x0000d700, 0x0000ab00, 0x00007600,
+ 0x0000ca00, 0x00008200, 0x0000c900, 0x00007d00,
+ 0x0000fa00, 0x00005900, 0x00004700, 0x0000f000,
+ 0x0000ad00, 0x0000d400, 0x0000a200, 0x0000af00,
+ 0x00009c00, 0x0000a400, 0x00007200, 0x0000c000,
+ 0x0000b700, 0x0000fd00, 0x00009300, 0x00002600,
+ 0x00003600, 0x00003f00, 0x0000f700, 0x0000cc00,
+ 0x00003400, 0x0000a500, 0x0000e500, 0x0000f100,
+ 0x00007100, 0x0000d800, 0x00003100, 0x00001500,
+ 0x00000400, 0x0000c700, 0x00002300, 0x0000c300,
+ 0x00001800, 0x00009600, 0x00000500, 0x00009a00,
+ 0x00000700, 0x00001200, 0x00008000, 0x0000e200,
+ 0x0000eb00, 0x00002700, 0x0000b200, 0x00007500,
+ 0x00000900, 0x00008300, 0x00002c00, 0x00001a00,
+ 0x00001b00, 0x00006e00, 0x00005a00, 0x0000a000,
+ 0x00005200, 0x00003b00, 0x0000d600, 0x0000b300,
+ 0x00002900, 0x0000e300, 0x00002f00, 0x00008400,
+ 0x00005300, 0x0000d100, 0x00000000, 0x0000ed00,
+ 0x00002000, 0x0000fc00, 0x0000b100, 0x00005b00,
+ 0x00006a00, 0x0000cb00, 0x0000be00, 0x00003900,
+ 0x00004a00, 0x00004c00, 0x00005800, 0x0000cf00,
+ 0x0000d000, 0x0000ef00, 0x0000aa00, 0x0000fb00,
+ 0x00004300, 0x00004d00, 0x00003300, 0x00008500,
+ 0x00004500, 0x0000f900, 0x00000200, 0x00007f00,
+ 0x00005000, 0x00003c00, 0x00009f00, 0x0000a800,
+ 0x00005100, 0x0000a300, 0x00004000, 0x00008f00,
+ 0x00009200, 0x00009d00, 0x00003800, 0x0000f500,
+ 0x0000bc00, 0x0000b600, 0x0000da00, 0x00002100,
+ 0x00001000, 0x0000ff00, 0x0000f300, 0x0000d200,
+ 0x0000cd00, 0x00000c00, 0x00001300, 0x0000ec00,
+ 0x00005f00, 0x00009700, 0x00004400, 0x00001700,
+ 0x0000c400, 0x0000a700, 0x00007e00, 0x00003d00,
+ 0x00006400, 0x00005d00, 0x00001900, 0x00007300,
+ 0x00006000, 0x00008100, 0x00004f00, 0x0000dc00,
+ 0x00002200, 0x00002a00, 0x00009000, 0x00008800,
+ 0x00004600, 0x0000ee00, 0x0000b800, 0x00001400,
+ 0x0000de00, 0x00005e00, 0x00000b00, 0x0000db00,
+ 0x0000e000, 0x00003200, 0x00003a00, 0x00000a00,
+ 0x00004900, 0x00000600, 0x00002400, 0x00005c00,
+ 0x0000c200, 0x0000d300, 0x0000ac00, 0x00006200,
+ 0x00009100, 0x00009500, 0x0000e400, 0x00007900,
+ 0x0000e700, 0x0000c800, 0x00003700, 0x00006d00,
+ 0x00008d00, 0x0000d500, 0x00004e00, 0x0000a900,
+ 0x00006c00, 0x00005600, 0x0000f400, 0x0000ea00,
+ 0x00006500, 0x00007a00, 0x0000ae00, 0x00000800,
+ 0x0000ba00, 0x00007800, 0x00002500, 0x00002e00,
+ 0x00001c00, 0x0000a600, 0x0000b400, 0x0000c600,
+ 0x0000e800, 0x0000dd00, 0x00007400, 0x00001f00,
+ 0x00004b00, 0x0000bd00, 0x00008b00, 0x00008a00,
+ 0x00007000, 0x00003e00, 0x0000b500, 0x00006600,
+ 0x00004800, 0x00000300, 0x0000f600, 0x00000e00,
+ 0x00006100, 0x00003500, 0x00005700, 0x0000b900,
+ 0x00008600, 0x0000c100, 0x00001d00, 0x00009e00,
+ 0x0000e100, 0x0000f800, 0x00009800, 0x00001100,
+ 0x00006900, 0x0000d900, 0x00008e00, 0x00009400,
+ 0x00009b00, 0x00001e00, 0x00008700, 0x0000e900,
+ 0x0000ce00, 0x00005500, 0x00002800, 0x0000df00,
+ 0x00008c00, 0x0000a100, 0x00008900, 0x00000d00,
+ 0x0000bf00, 0x0000e600, 0x00004200, 0x00006800,
+ 0x00004100, 0x00009900, 0x00002d00, 0x00000f00,
+ 0x0000b000, 0x00005400, 0x0000bb00, 0x00001600
+ },
+ {
+ 0x00630000, 0x007c0000, 0x00770000, 0x007b0000,
+ 0x00f20000, 0x006b0000, 0x006f0000, 0x00c50000,
+ 0x00300000, 0x00010000, 0x00670000, 0x002b0000,
+ 0x00fe0000, 0x00d70000, 0x00ab0000, 0x00760000,
+ 0x00ca0000, 0x00820000, 0x00c90000, 0x007d0000,
+ 0x00fa0000, 0x00590000, 0x00470000, 0x00f00000,
+ 0x00ad0000, 0x00d40000, 0x00a20000, 0x00af0000,
+ 0x009c0000, 0x00a40000, 0x00720000, 0x00c00000,
+ 0x00b70000, 0x00fd0000, 0x00930000, 0x00260000,
+ 0x00360000, 0x003f0000, 0x00f70000, 0x00cc0000,
+ 0x00340000, 0x00a50000, 0x00e50000, 0x00f10000,
+ 0x00710000, 0x00d80000, 0x00310000, 0x00150000,
+ 0x00040000, 0x00c70000, 0x00230000, 0x00c30000,
+ 0x00180000, 0x00960000, 0x00050000, 0x009a0000,
+ 0x00070000, 0x00120000, 0x00800000, 0x00e20000,
+ 0x00eb0000, 0x00270000, 0x00b20000, 0x00750000,
+ 0x00090000, 0x00830000, 0x002c0000, 0x001a0000,
+ 0x001b0000, 0x006e0000, 0x005a0000, 0x00a00000,
+ 0x00520000, 0x003b0000, 0x00d60000, 0x00b30000,
+ 0x00290000, 0x00e30000, 0x002f0000, 0x00840000,
+ 0x00530000, 0x00d10000, 0x00000000, 0x00ed0000,
+ 0x00200000, 0x00fc0000, 0x00b10000, 0x005b0000,
+ 0x006a0000, 0x00cb0000, 0x00be0000, 0x00390000,
+ 0x004a0000, 0x004c0000, 0x00580000, 0x00cf0000,
+ 0x00d00000, 0x00ef0000, 0x00aa0000, 0x00fb0000,
+ 0x00430000, 0x004d0000, 0x00330000, 0x00850000,
+ 0x00450000, 0x00f90000, 0x00020000, 0x007f0000,
+ 0x00500000, 0x003c0000, 0x009f0000, 0x00a80000,
+ 0x00510000, 0x00a30000, 0x00400000, 0x008f0000,
+ 0x00920000, 0x009d0000, 0x00380000, 0x00f50000,
+ 0x00bc0000, 0x00b60000, 0x00da0000, 0x00210000,
+ 0x00100000, 0x00ff0000, 0x00f30000, 0x00d20000,
+ 0x00cd0000, 0x000c0000, 0x00130000, 0x00ec0000,
+ 0x005f0000, 0x00970000, 0x00440000, 0x00170000,
+ 0x00c40000, 0x00a70000, 0x007e0000, 0x003d0000,
+ 0x00640000, 0x005d0000, 0x00190000, 0x00730000,
+ 0x00600000, 0x00810000, 0x004f0000, 0x00dc0000,
+ 0x00220000, 0x002a0000, 0x00900000, 0x00880000,
+ 0x00460000, 0x00ee0000, 0x00b80000, 0x00140000,
+ 0x00de0000, 0x005e0000, 0x000b0000, 0x00db0000,
+ 0x00e00000, 0x00320000, 0x003a0000, 0x000a0000,
+ 0x00490000, 0x00060000, 0x00240000, 0x005c0000,
+ 0x00c20000, 0x00d30000, 0x00ac0000, 0x00620000,
+ 0x00910000, 0x00950000, 0x00e40000, 0x00790000,
+ 0x00e70000, 0x00c80000, 0x00370000, 0x006d0000,
+ 0x008d0000, 0x00d50000, 0x004e0000, 0x00a90000,
+ 0x006c0000, 0x00560000, 0x00f40000, 0x00ea0000,
+ 0x00650000, 0x007a0000, 0x00ae0000, 0x00080000,
+ 0x00ba0000, 0x00780000, 0x00250000, 0x002e0000,
+ 0x001c0000, 0x00a60000, 0x00b40000, 0x00c60000,
+ 0x00e80000, 0x00dd0000, 0x00740000, 0x001f0000,
+ 0x004b0000, 0x00bd0000, 0x008b0000, 0x008a0000,
+ 0x00700000, 0x003e0000, 0x00b50000, 0x00660000,
+ 0x00480000, 0x00030000, 0x00f60000, 0x000e0000,
+ 0x00610000, 0x00350000, 0x00570000, 0x00b90000,
+ 0x00860000, 0x00c10000, 0x001d0000, 0x009e0000,
+ 0x00e10000, 0x00f80000, 0x00980000, 0x00110000,
+ 0x00690000, 0x00d90000, 0x008e0000, 0x00940000,
+ 0x009b0000, 0x001e0000, 0x00870000, 0x00e90000,
+ 0x00ce0000, 0x00550000, 0x00280000, 0x00df0000,
+ 0x008c0000, 0x00a10000, 0x00890000, 0x000d0000,
+ 0x00bf0000, 0x00e60000, 0x00420000, 0x00680000,
+ 0x00410000, 0x00990000, 0x002d0000, 0x000f0000,
+ 0x00b00000, 0x00540000, 0x00bb0000, 0x00160000
+ },
+ {
+ 0x63000000, 0x7c000000, 0x77000000, 0x7b000000,
+ 0xf2000000, 0x6b000000, 0x6f000000, 0xc5000000,
+ 0x30000000, 0x01000000, 0x67000000, 0x2b000000,
+ 0xfe000000, 0xd7000000, 0xab000000, 0x76000000,
+ 0xca000000, 0x82000000, 0xc9000000, 0x7d000000,
+ 0xfa000000, 0x59000000, 0x47000000, 0xf0000000,
+ 0xad000000, 0xd4000000, 0xa2000000, 0xaf000000,
+ 0x9c000000, 0xa4000000, 0x72000000, 0xc0000000,
+ 0xb7000000, 0xfd000000, 0x93000000, 0x26000000,
+ 0x36000000, 0x3f000000, 0xf7000000, 0xcc000000,
+ 0x34000000, 0xa5000000, 0xe5000000, 0xf1000000,
+ 0x71000000, 0xd8000000, 0x31000000, 0x15000000,
+ 0x04000000, 0xc7000000, 0x23000000, 0xc3000000,
+ 0x18000000, 0x96000000, 0x05000000, 0x9a000000,
+ 0x07000000, 0x12000000, 0x80000000, 0xe2000000,
+ 0xeb000000, 0x27000000, 0xb2000000, 0x75000000,
+ 0x09000000, 0x83000000, 0x2c000000, 0x1a000000,
+ 0x1b000000, 0x6e000000, 0x5a000000, 0xa0000000,
+ 0x52000000, 0x3b000000, 0xd6000000, 0xb3000000,
+ 0x29000000, 0xe3000000, 0x2f000000, 0x84000000,
+ 0x53000000, 0xd1000000, 0x00000000, 0xed000000,
+ 0x20000000, 0xfc000000, 0xb1000000, 0x5b000000,
+ 0x6a000000, 0xcb000000, 0xbe000000, 0x39000000,
+ 0x4a000000, 0x4c000000, 0x58000000, 0xcf000000,
+ 0xd0000000, 0xef000000, 0xaa000000, 0xfb000000,
+ 0x43000000, 0x4d000000, 0x33000000, 0x85000000,
+ 0x45000000, 0xf9000000, 0x02000000, 0x7f000000,
+ 0x50000000, 0x3c000000, 0x9f000000, 0xa8000000,
+ 0x51000000, 0xa3000000, 0x40000000, 0x8f000000,
+ 0x92000000, 0x9d000000, 0x38000000, 0xf5000000,
+ 0xbc000000, 0xb6000000, 0xda000000, 0x21000000,
+ 0x10000000, 0xff000000, 0xf3000000, 0xd2000000,
+ 0xcd000000, 0x0c000000, 0x13000000, 0xec000000,
+ 0x5f000000, 0x97000000, 0x44000000, 0x17000000,
+ 0xc4000000, 0xa7000000, 0x7e000000, 0x3d000000,
+ 0x64000000, 0x5d000000, 0x19000000, 0x73000000,
+ 0x60000000, 0x81000000, 0x4f000000, 0xdc000000,
+ 0x22000000, 0x2a000000, 0x90000000, 0x88000000,
+ 0x46000000, 0xee000000, 0xb8000000, 0x14000000,
+ 0xde000000, 0x5e000000, 0x0b000000, 0xdb000000,
+ 0xe0000000, 0x32000000, 0x3a000000, 0x0a000000,
+ 0x49000000, 0x06000000, 0x24000000, 0x5c000000,
+ 0xc2000000, 0xd3000000, 0xac000000, 0x62000000,
+ 0x91000000, 0x95000000, 0xe4000000, 0x79000000,
+ 0xe7000000, 0xc8000000, 0x37000000, 0x6d000000,
+ 0x8d000000, 0xd5000000, 0x4e000000, 0xa9000000,
+ 0x6c000000, 0x56000000, 0xf4000000, 0xea000000,
+ 0x65000000, 0x7a000000, 0xae000000, 0x08000000,
+ 0xba000000, 0x78000000, 0x25000000, 0x2e000000,
+ 0x1c000000, 0xa6000000, 0xb4000000, 0xc6000000,
+ 0xe8000000, 0xdd000000, 0x74000000, 0x1f000000,
+ 0x4b000000, 0xbd000000, 0x8b000000, 0x8a000000,
+ 0x70000000, 0x3e000000, 0xb5000000, 0x66000000,
+ 0x48000000, 0x03000000, 0xf6000000, 0x0e000000,
+ 0x61000000, 0x35000000, 0x57000000, 0xb9000000,
+ 0x86000000, 0xc1000000, 0x1d000000, 0x9e000000,
+ 0xe1000000, 0xf8000000, 0x98000000, 0x11000000,
+ 0x69000000, 0xd9000000, 0x8e000000, 0x94000000,
+ 0x9b000000, 0x1e000000, 0x87000000, 0xe9000000,
+ 0xce000000, 0x55000000, 0x28000000, 0xdf000000,
+ 0x8c000000, 0xa1000000, 0x89000000, 0x0d000000,
+ 0xbf000000, 0xe6000000, 0x42000000, 0x68000000,
+ 0x41000000, 0x99000000, 0x2d000000, 0x0f000000,
+ 0xb0000000, 0x54000000, 0xbb000000, 0x16000000
+ }
+};
+
+static const uint32_t t_im[4][256] =
+{
+ {
+ 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12,
+ 0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a,
+ 0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362,
+ 0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a,
+ 0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2,
+ 0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca,
+ 0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382,
+ 0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba,
+ 0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9,
+ 0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1,
+ 0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9,
+ 0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81,
+ 0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029,
+ 0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411,
+ 0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859,
+ 0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61,
+ 0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf,
+ 0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987,
+ 0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf,
+ 0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7,
+ 0x46bde64d, 0x4db0ef43, 0x50a7f451, 0x5baafd5f,
+ 0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967,
+ 0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f,
+ 0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117,
+ 0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664,
+ 0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c,
+ 0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14,
+ 0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c,
+ 0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684,
+ 0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc,
+ 0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4,
+ 0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc,
+ 0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753,
+ 0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b,
+ 0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23,
+ 0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b,
+ 0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3,
+ 0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b,
+ 0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3,
+ 0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb,
+ 0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88,
+ 0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0,
+ 0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8,
+ 0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0,
+ 0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68,
+ 0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850,
+ 0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418,
+ 0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020,
+ 0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe,
+ 0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6,
+ 0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e,
+ 0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6,
+ 0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e,
+ 0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526,
+ 0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e,
+ 0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56,
+ 0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25,
+ 0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d,
+ 0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255,
+ 0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d,
+ 0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5,
+ 0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd,
+ 0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5,
+ 0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d
+ },
+ {
+ 0x00000000, 0x0d090e0b, 0x1a121c16, 0x171b121d,
+ 0x3424382c, 0x392d3627, 0x2e36243a, 0x233f2a31,
+ 0x68487058, 0x65417e53, 0x725a6c4e, 0x7f536245,
+ 0x5c6c4874, 0x5165467f, 0x467e5462, 0x4b775a69,
+ 0xd090e0b0, 0xdd99eebb, 0xca82fca6, 0xc78bf2ad,
+ 0xe4b4d89c, 0xe9bdd697, 0xfea6c48a, 0xf3afca81,
+ 0xb8d890e8, 0xb5d19ee3, 0xa2ca8cfe, 0xafc382f5,
+ 0x8cfca8c4, 0x81f5a6cf, 0x96eeb4d2, 0x9be7bad9,
+ 0xbb3bdb7b, 0xb632d570, 0xa129c76d, 0xac20c966,
+ 0x8f1fe357, 0x8216ed5c, 0x950dff41, 0x9804f14a,
+ 0xd373ab23, 0xde7aa528, 0xc961b735, 0xc468b93e,
+ 0xe757930f, 0xea5e9d04, 0xfd458f19, 0xf04c8112,
+ 0x6bab3bcb, 0x66a235c0, 0x71b927dd, 0x7cb029d6,
+ 0x5f8f03e7, 0x52860dec, 0x459d1ff1, 0x489411fa,
+ 0x03e34b93, 0x0eea4598, 0x19f15785, 0x14f8598e,
+ 0x37c773bf, 0x3ace7db4, 0x2dd56fa9, 0x20dc61a2,
+ 0x6d76adf6, 0x607fa3fd, 0x7764b1e0, 0x7a6dbfeb,
+ 0x595295da, 0x545b9bd1, 0x434089cc, 0x4e4987c7,
+ 0x053eddae, 0x0837d3a5, 0x1f2cc1b8, 0x1225cfb3,
+ 0x311ae582, 0x3c13eb89, 0x2b08f994, 0x2601f79f,
+ 0xbde64d46, 0xb0ef434d, 0xa7f45150, 0xaafd5f5b,
+ 0x89c2756a, 0x84cb7b61, 0x93d0697c, 0x9ed96777,
+ 0xd5ae3d1e, 0xd8a73315, 0xcfbc2108, 0xc2b52f03,
+ 0xe18a0532, 0xec830b39, 0xfb981924, 0xf691172f,
+ 0xd64d768d, 0xdb447886, 0xcc5f6a9b, 0xc1566490,
+ 0xe2694ea1, 0xef6040aa, 0xf87b52b7, 0xf5725cbc,
+ 0xbe0506d5, 0xb30c08de, 0xa4171ac3, 0xa91e14c8,
+ 0x8a213ef9, 0x872830f2, 0x903322ef, 0x9d3a2ce4,
+ 0x06dd963d, 0x0bd49836, 0x1ccf8a2b, 0x11c68420,
+ 0x32f9ae11, 0x3ff0a01a, 0x28ebb207, 0x25e2bc0c,
+ 0x6e95e665, 0x639ce86e, 0x7487fa73, 0x798ef478,
+ 0x5ab1de49, 0x57b8d042, 0x40a3c25f, 0x4daacc54,
+ 0xdaec41f7, 0xd7e54ffc, 0xc0fe5de1, 0xcdf753ea,
+ 0xeec879db, 0xe3c177d0, 0xf4da65cd, 0xf9d36bc6,
+ 0xb2a431af, 0xbfad3fa4, 0xa8b62db9, 0xa5bf23b2,
+ 0x86800983, 0x8b890788, 0x9c921595, 0x919b1b9e,
+ 0x0a7ca147, 0x0775af4c, 0x106ebd51, 0x1d67b35a,
+ 0x3e58996b, 0x33519760, 0x244a857d, 0x29438b76,
+ 0x6234d11f, 0x6f3ddf14, 0x7826cd09, 0x752fc302,
+ 0x5610e933, 0x5b19e738, 0x4c02f525, 0x410bfb2e,
+ 0x61d79a8c, 0x6cde9487, 0x7bc5869a, 0x76cc8891,
+ 0x55f3a2a0, 0x58faacab, 0x4fe1beb6, 0x42e8b0bd,
+ 0x099fead4, 0x0496e4df, 0x138df6c2, 0x1e84f8c9,
+ 0x3dbbd2f8, 0x30b2dcf3, 0x27a9ceee, 0x2aa0c0e5,
+ 0xb1477a3c, 0xbc4e7437, 0xab55662a, 0xa65c6821,
+ 0x85634210, 0x886a4c1b, 0x9f715e06, 0x9278500d,
+ 0xd90f0a64, 0xd406046f, 0xc31d1672, 0xce141879,
+ 0xed2b3248, 0xe0223c43, 0xf7392e5e, 0xfa302055,
+ 0xb79aec01, 0xba93e20a, 0xad88f017, 0xa081fe1c,
+ 0x83bed42d, 0x8eb7da26, 0x99acc83b, 0x94a5c630,
+ 0xdfd29c59, 0xd2db9252, 0xc5c0804f, 0xc8c98e44,
+ 0xebf6a475, 0xe6ffaa7e, 0xf1e4b863, 0xfcedb668,
+ 0x670a0cb1, 0x6a0302ba, 0x7d1810a7, 0x70111eac,
+ 0x532e349d, 0x5e273a96, 0x493c288b, 0x44352680,
+ 0x0f427ce9, 0x024b72e2, 0x155060ff, 0x18596ef4,
+ 0x3b6644c5, 0x366f4ace, 0x217458d3, 0x2c7d56d8,
+ 0x0ca1377a, 0x01a83971, 0x16b32b6c, 0x1bba2567,
+ 0x38850f56, 0x358c015d, 0x22971340, 0x2f9e1d4b,
+ 0x64e94722, 0x69e04929, 0x7efb5b34, 0x73f2553f,
+ 0x50cd7f0e, 0x5dc47105, 0x4adf6318, 0x47d66d13,
+ 0xdc31d7ca, 0xd138d9c1, 0xc623cbdc, 0xcb2ac5d7,
+ 0xe815efe6, 0xe51ce1ed, 0xf207f3f0, 0xff0efdfb,
+ 0xb479a792, 0xb970a999, 0xae6bbb84, 0xa362b58f,
+ 0x805d9fbe, 0x8d5491b5, 0x9a4f83a8, 0x97468da3
+ },
+ {
+ 0x00000000, 0x090e0b0d, 0x121c161a, 0x1b121d17,
+ 0x24382c34, 0x2d362739, 0x36243a2e, 0x3f2a3123,
+ 0x48705868, 0x417e5365, 0x5a6c4e72, 0x5362457f,
+ 0x6c48745c, 0x65467f51, 0x7e546246, 0x775a694b,
+ 0x90e0b0d0, 0x99eebbdd, 0x82fca6ca, 0x8bf2adc7,
+ 0xb4d89ce4, 0xbdd697e9, 0xa6c48afe, 0xafca81f3,
+ 0xd890e8b8, 0xd19ee3b5, 0xca8cfea2, 0xc382f5af,
+ 0xfca8c48c, 0xf5a6cf81, 0xeeb4d296, 0xe7bad99b,
+ 0x3bdb7bbb, 0x32d570b6, 0x29c76da1, 0x20c966ac,
+ 0x1fe3578f, 0x16ed5c82, 0x0dff4195, 0x04f14a98,
+ 0x73ab23d3, 0x7aa528de, 0x61b735c9, 0x68b93ec4,
+ 0x57930fe7, 0x5e9d04ea, 0x458f19fd, 0x4c8112f0,
+ 0xab3bcb6b, 0xa235c066, 0xb927dd71, 0xb029d67c,
+ 0x8f03e75f, 0x860dec52, 0x9d1ff145, 0x9411fa48,
+ 0xe34b9303, 0xea45980e, 0xf1578519, 0xf8598e14,
+ 0xc773bf37, 0xce7db43a, 0xd56fa92d, 0xdc61a220,
+ 0x76adf66d, 0x7fa3fd60, 0x64b1e077, 0x6dbfeb7a,
+ 0x5295da59, 0x5b9bd154, 0x4089cc43, 0x4987c74e,
+ 0x3eddae05, 0x37d3a508, 0x2cc1b81f, 0x25cfb312,
+ 0x1ae58231, 0x13eb893c, 0x08f9942b, 0x01f79f26,
+ 0xe64d46bd, 0xef434db0, 0xf45150a7, 0xfd5f5baa,
+ 0xc2756a89, 0xcb7b6184, 0xd0697c93, 0xd967779e,
+ 0xae3d1ed5, 0xa73315d8, 0xbc2108cf, 0xb52f03c2,
+ 0x8a0532e1, 0x830b39ec, 0x981924fb, 0x91172ff6,
+ 0x4d768dd6, 0x447886db, 0x5f6a9bcc, 0x566490c1,
+ 0x694ea1e2, 0x6040aaef, 0x7b52b7f8, 0x725cbcf5,
+ 0x0506d5be, 0x0c08deb3, 0x171ac3a4, 0x1e14c8a9,
+ 0x213ef98a, 0x2830f287, 0x3322ef90, 0x3a2ce49d,
+ 0xdd963d06, 0xd498360b, 0xcf8a2b1c, 0xc6842011,
+ 0xf9ae1132, 0xf0a01a3f, 0xebb20728, 0xe2bc0c25,
+ 0x95e6656e, 0x9ce86e63, 0x87fa7374, 0x8ef47879,
+ 0xb1de495a, 0xb8d04257, 0xa3c25f40, 0xaacc544d,
+ 0xec41f7da, 0xe54ffcd7, 0xfe5de1c0, 0xf753eacd,
+ 0xc879dbee, 0xc177d0e3, 0xda65cdf4, 0xd36bc6f9,
+ 0xa431afb2, 0xad3fa4bf, 0xb62db9a8, 0xbf23b2a5,
+ 0x80098386, 0x8907888b, 0x9215959c, 0x9b1b9e91,
+ 0x7ca1470a, 0x75af4c07, 0x6ebd5110, 0x67b35a1d,
+ 0x58996b3e, 0x51976033, 0x4a857d24, 0x438b7629,
+ 0x34d11f62, 0x3ddf146f, 0x26cd0978, 0x2fc30275,
+ 0x10e93356, 0x19e7385b, 0x02f5254c, 0x0bfb2e41,
+ 0xd79a8c61, 0xde94876c, 0xc5869a7b, 0xcc889176,
+ 0xf3a2a055, 0xfaacab58, 0xe1beb64f, 0xe8b0bd42,
+ 0x9fead409, 0x96e4df04, 0x8df6c213, 0x84f8c91e,
+ 0xbbd2f83d, 0xb2dcf330, 0xa9ceee27, 0xa0c0e52a,
+ 0x477a3cb1, 0x4e7437bc, 0x55662aab, 0x5c6821a6,
+ 0x63421085, 0x6a4c1b88, 0x715e069f, 0x78500d92,
+ 0x0f0a64d9, 0x06046fd4, 0x1d1672c3, 0x141879ce,
+ 0x2b3248ed, 0x223c43e0, 0x392e5ef7, 0x302055fa,
+ 0x9aec01b7, 0x93e20aba, 0x88f017ad, 0x81fe1ca0,
+ 0xbed42d83, 0xb7da268e, 0xacc83b99, 0xa5c63094,
+ 0xd29c59df, 0xdb9252d2, 0xc0804fc5, 0xc98e44c8,
+ 0xf6a475eb, 0xffaa7ee6, 0xe4b863f1, 0xedb668fc,
+ 0x0a0cb167, 0x0302ba6a, 0x1810a77d, 0x111eac70,
+ 0x2e349d53, 0x273a965e, 0x3c288b49, 0x35268044,
+ 0x427ce90f, 0x4b72e202, 0x5060ff15, 0x596ef418,
+ 0x6644c53b, 0x6f4ace36, 0x7458d321, 0x7d56d82c,
+ 0xa1377a0c, 0xa8397101, 0xb32b6c16, 0xba25671b,
+ 0x850f5638, 0x8c015d35, 0x97134022, 0x9e1d4b2f,
+ 0xe9472264, 0xe0492969, 0xfb5b347e, 0xf2553f73,
+ 0xcd7f0e50, 0xc471055d, 0xdf63184a, 0xd66d1347,
+ 0x31d7cadc, 0x38d9c1d1, 0x23cbdcc6, 0x2ac5d7cb,
+ 0x15efe6e8, 0x1ce1ede5, 0x07f3f0f2, 0x0efdfbff,
+ 0x79a792b4, 0x70a999b9, 0x6bbb84ae, 0x62b58fa3,
+ 0x5d9fbe80, 0x5491b58d, 0x4f83a89a, 0x468da397
+ },
+ {
+ 0x00000000, 0x0e0b0d09, 0x1c161a12, 0x121d171b,
+ 0x382c3424, 0x3627392d, 0x243a2e36, 0x2a31233f,
+ 0x70586848, 0x7e536541, 0x6c4e725a, 0x62457f53,
+ 0x48745c6c, 0x467f5165, 0x5462467e, 0x5a694b77,
+ 0xe0b0d090, 0xeebbdd99, 0xfca6ca82, 0xf2adc78b,
+ 0xd89ce4b4, 0xd697e9bd, 0xc48afea6, 0xca81f3af,
+ 0x90e8b8d8, 0x9ee3b5d1, 0x8cfea2ca, 0x82f5afc3,
+ 0xa8c48cfc, 0xa6cf81f5, 0xb4d296ee, 0xbad99be7,
+ 0xdb7bbb3b, 0xd570b632, 0xc76da129, 0xc966ac20,
+ 0xe3578f1f, 0xed5c8216, 0xff41950d, 0xf14a9804,
+ 0xab23d373, 0xa528de7a, 0xb735c961, 0xb93ec468,
+ 0x930fe757, 0x9d04ea5e, 0x8f19fd45, 0x8112f04c,
+ 0x3bcb6bab, 0x35c066a2, 0x27dd71b9, 0x29d67cb0,
+ 0x03e75f8f, 0x0dec5286, 0x1ff1459d, 0x11fa4894,
+ 0x4b9303e3, 0x45980eea, 0x578519f1, 0x598e14f8,
+ 0x73bf37c7, 0x7db43ace, 0x6fa92dd5, 0x61a220dc,
+ 0xadf66d76, 0xa3fd607f, 0xb1e07764, 0xbfeb7a6d,
+ 0x95da5952, 0x9bd1545b, 0x89cc4340, 0x87c74e49,
+ 0xddae053e, 0xd3a50837, 0xc1b81f2c, 0xcfb31225,
+ 0xe582311a, 0xeb893c13, 0xf9942b08, 0xf79f2601,
+ 0x4d46bde6, 0x434db0ef, 0x5150a7f4, 0x5f5baafd,
+ 0x756a89c2, 0x7b6184cb, 0x697c93d0, 0x67779ed9,
+ 0x3d1ed5ae, 0x3315d8a7, 0x2108cfbc, 0x2f03c2b5,
+ 0x0532e18a, 0x0b39ec83, 0x1924fb98, 0x172ff691,
+ 0x768dd64d, 0x7886db44, 0x6a9bcc5f, 0x6490c156,
+ 0x4ea1e269, 0x40aaef60, 0x52b7f87b, 0x5cbcf572,
+ 0x06d5be05, 0x08deb30c, 0x1ac3a417, 0x14c8a91e,
+ 0x3ef98a21, 0x30f28728, 0x22ef9033, 0x2ce49d3a,
+ 0x963d06dd, 0x98360bd4, 0x8a2b1ccf, 0x842011c6,
+ 0xae1132f9, 0xa01a3ff0, 0xb20728eb, 0xbc0c25e2,
+ 0xe6656e95, 0xe86e639c, 0xfa737487, 0xf478798e,
+ 0xde495ab1, 0xd04257b8, 0xc25f40a3, 0xcc544daa,
+ 0x41f7daec, 0x4ffcd7e5, 0x5de1c0fe, 0x53eacdf7,
+ 0x79dbeec8, 0x77d0e3c1, 0x65cdf4da, 0x6bc6f9d3,
+ 0x31afb2a4, 0x3fa4bfad, 0x2db9a8b6, 0x23b2a5bf,
+ 0x09838680, 0x07888b89, 0x15959c92, 0x1b9e919b,
+ 0xa1470a7c, 0xaf4c0775, 0xbd51106e, 0xb35a1d67,
+ 0x996b3e58, 0x97603351, 0x857d244a, 0x8b762943,
+ 0xd11f6234, 0xdf146f3d, 0xcd097826, 0xc302752f,
+ 0xe9335610, 0xe7385b19, 0xf5254c02, 0xfb2e410b,
+ 0x9a8c61d7, 0x94876cde, 0x869a7bc5, 0x889176cc,
+ 0xa2a055f3, 0xacab58fa, 0xbeb64fe1, 0xb0bd42e8,
+ 0xead4099f, 0xe4df0496, 0xf6c2138d, 0xf8c91e84,
+ 0xd2f83dbb, 0xdcf330b2, 0xceee27a9, 0xc0e52aa0,
+ 0x7a3cb147, 0x7437bc4e, 0x662aab55, 0x6821a65c,
+ 0x42108563, 0x4c1b886a, 0x5e069f71, 0x500d9278,
+ 0x0a64d90f, 0x046fd406, 0x1672c31d, 0x1879ce14,
+ 0x3248ed2b, 0x3c43e022, 0x2e5ef739, 0x2055fa30,
+ 0xec01b79a, 0xe20aba93, 0xf017ad88, 0xfe1ca081,
+ 0xd42d83be, 0xda268eb7, 0xc83b99ac, 0xc63094a5,
+ 0x9c59dfd2, 0x9252d2db, 0x804fc5c0, 0x8e44c8c9,
+ 0xa475ebf6, 0xaa7ee6ff, 0xb863f1e4, 0xb668fced,
+ 0x0cb1670a, 0x02ba6a03, 0x10a77d18, 0x1eac7011,
+ 0x349d532e, 0x3a965e27, 0x288b493c, 0x26804435,
+ 0x7ce90f42, 0x72e2024b, 0x60ff1550, 0x6ef41859,
+ 0x44c53b66, 0x4ace366f, 0x58d32174, 0x56d82c7d,
+ 0x377a0ca1, 0x397101a8, 0x2b6c16b3, 0x25671bba,
+ 0x0f563885, 0x015d358c, 0x13402297, 0x1d4b2f9e,
+ 0x472264e9, 0x492969e0, 0x5b347efb, 0x553f73f2,
+ 0x7f0e50cd, 0x71055dc4, 0x63184adf, 0x6d1347d6,
+ 0xd7cadc31, 0xd9c1d138, 0xcbdcc623, 0xc5d7cb2a,
+ 0xefe6e815, 0xe1ede51c, 0xf3f0f207, 0xfdfbff0e,
+ 0xa792b479, 0xa999b970, 0xbb84ae6b, 0xb58fa362,
+ 0x9fbe805d, 0x91b58d54, 0x83a89a4f, 0x8da39746
+ }
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AESTAB2_H */
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams
new file mode 100644
index 000000000000..0de1883dc81b
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams
@@ -0,0 +1,36 @@
+Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ * Redistributions of source code must retain copyright notices,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+
+ * Neither the name of the CRYPTOGAMS nor the names of its
+ copyright holder and contributors may be used to endorse or
+ promote products derived from this software without specific
+ prior written permission.
+
+ALTERNATIVELY, provided that this notice is retained in full, this
+product may be distributed under the terms of the GNU General Public
+License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+those given above.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip
new file mode 100644
index 000000000000..6184759c8b74
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip
@@ -0,0 +1 @@
+PORTIONS OF GCM and GHASH FUNCTIONALITY
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl
new file mode 100644
index 000000000000..49cc83d2ee29
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl
@@ -0,0 +1,177 @@
+
+ Apache License
+ Version 2.0, January 2004
+ https://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip
new file mode 100644
index 000000000000..6184759c8b74
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip
@@ -0,0 +1 @@
+PORTIONS OF GCM and GHASH FUNCTIONALITY
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
new file mode 100644
index 000000000000..dc71ae2c1c89
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
@@ -0,0 +1,1261 @@
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# The OpenSSL GCM implementation is organized in such a way that its
+# performance is rather close to the sum of its streamed components, in
+# this context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible to
+# justify. This module is based on a combination of Intel submissions,
+# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
+# Locktyukhin of Intel Corp., who verified that it reduces shuffle
+# pressure with a notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor, 0.74 - on
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
+# measurements for favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe a few percent
+# worse performance.]
+#
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
+# Generated once from
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
+# and modified for ICP. Modifications are kept to a bare minimum to ease later
+# upstream merges.
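The header above describes the stitched AES-NI CTR + PCLMULQDQ GHASH loop only in
performance terms. As a rough map of what one iteration computes, here is a
sequential C sketch; aes_encrypt_block() and gf128_mul() are assumed helper
primitives, not functions provided by this file, and the real code below
interleaves the vaesenc and vpclmulqdq instruction streams, uses precomputed
powers of H so each 96-byte group needs a single reduction, and handles the
32-bit big-endian counter wrap that the sketch ignores.

    #include <stdint.h>

    /*
     * Assumed primitives (not ICP APIs): one AES block encryption and one
     * GF(2^128) multiply in GCM's bit-reflected convention.
     */
    void aes_encrypt_block(const void *key_sched, const uint8_t in[16],
        uint8_t out[16]);
    void gf128_mul(const uint8_t x[16], const uint8_t h[16], uint8_t out[16]);

    static void
    xor16(uint8_t *dst, const uint8_t *a, const uint8_t *b)
    {
            int i;

            for (i = 0; i < 16; i++)
                    dst[i] = a[i] ^ b[i];
    }

    /*
     * One 96-byte group of GCM encryption, written sequentially: six CTR
     * keystream blocks, six XORs, six GHASH folds over the ciphertext.
     */
    static void
    gcm_stitch_6x_sketch(const void *key_sched, uint8_t ctr[16],
        const uint8_t h[16], uint8_t ghash[16], const uint8_t *in, uint8_t *out)
    {
            uint8_t ks[16], t[16];
            int i;

            for (i = 0; i < 6; i++) {
                    aes_encrypt_block(key_sched, ctr, ks); /* E_K(counter) */
                    ctr[15]++;                             /* toy counter bump */
                    xor16(out + 16 * i, in + 16 * i, ks);  /* CTR encryption */
                    xor16(t, ghash, out + 16 * i);         /* absorb ciphertext */
                    gf128_mul(t, h, ghash);                /* multiply by H */
            }
    }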
+
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+
+.extern gcm_avx_can_use_movbe
+
+.text
+
+#ifdef HAVE_MOVBE
+.type _aesni_ctr32_ghash_6x,@function
+.align 32
+_aesni_ctr32_ghash_6x:
+.cfi_startproc
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
+ jb .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ cmpl $14,%ebp // ICP does not zero key schedule.
+ jb .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail
+
+.align 32
+.Lhandle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc .L6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+#endif /* ifdef HAVE_MOVBE */
+
+.type _aesni_ctr32_ghash_no_movbe_6x,@function
+.align 32
+_aesni_ctr32_ghash_no_movbe_6x:
+.cfi_startproc
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x_nmb
+
+.align 32
+.Loop6x_nmb:
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_nmb
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32_nmb:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movq 88(%r14),%r13
+ bswapq %r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 80(%r14),%r12
+ bswapq %r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movq 72(%r14),%r13
+ bswapq %r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 64(%r14),%r12
+ bswapq %r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movq 56(%r14),%r13
+ bswapq %r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 48(%r14),%r12
+ bswapq %r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movq 40(%r14),%r13
+ bswapq %r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 32(%r14),%r12
+ bswapq %r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movq 24(%r14),%r13
+ bswapq %r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 16(%r14),%r12
+ bswapq %r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movq 8(%r14),%r13
+ bswapq %r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movq 0(%r14),%r12
+ bswapq %r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
+ jb .Lenc_tail_nmb
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ cmpl $14,%ebp // ICP does not zero key schedule.
+ jb .Lenc_tail_nmb
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail_nmb
+
+.align 32
+.Lhandle_ctr32_nmb:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32_nmb
+
+.align 32
+.Lenc_tail_nmb:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc .L6x_done_nmb
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x_nmb
+
+.L6x_done_nmb:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x
+
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,@function
+.align 32
+aesni_gcm_decrypt:
+.cfi_startproc
+ xorq %r10,%r10
+ cmpq $0x60,%rdx
+ jb .Lgcm_dec_abort
+
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ pushq %r9
+.cfi_offset %r9,-64
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ movq 32(%r9),%r9
+ leaq 32(%r9),%r9
+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Ldec_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Ldec_no_key_aliasing
+ subq %r15,%rsp
+.Ldec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ leaq (%rdi),%r14
+ vmovdqu 64(%rdi),%xmm4
+ leaq -192(%rdi,%rdx,1),%r15
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+ testl $1,gcm_avx_can_use_movbe(%rip)
+#else
+ testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
+#endif
+ jz 1f
+ call _aesni_ctr32_ghash_6x
+ jmp 2f
+1:
+#endif
+ call _aesni_ctr32_ghash_no_movbe_6x
+2:
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ movq -56(%rax),%r9
+.cfi_restore %r9
+ vmovdqu %xmm8,(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lgcm_dec_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type _aesni_ctr32_6x,@function
+.align 32
+_aesni_ctr32_6x:
+.cfi_startproc
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ .byte 0xf3,0xc3
+.align 32
+.Lhandle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+.cfi_endproc
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,@function
+.align 32
+aesni_gcm_encrypt:
+.cfi_startproc
+ xorq %r10,%r10
+ cmpq $288,%rdx
+ jb .Lgcm_enc_abort
+
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ pushq %r9
+.cfi_offset %r9,-64
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Lenc_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Lenc_no_key_aliasing
+ subq %r15,%rsp
+.Lenc_no_key_aliasing:
+
+ leaq (%rsi),%r14
+ leaq -192(%rsi,%rdx,1),%r15
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu (%r9),%xmm8
+ movq 32(%r9),%r9
+ leaq 32(%r9),%r9
+ subq $12,%rdx
+ movq $192,%r10
+ vpshufb %xmm0,%xmm8,%xmm8
+
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+ testl $1,gcm_avx_can_use_movbe(%rip)
+#else
+ testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
+#endif
+ jz 1f
+ call _aesni_ctr32_ghash_6x
+ jmp 2f
+1:
+#endif
+ call _aesni_ctr32_ghash_no_movbe_6x
+2:
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ vpshufb (%r11),%xmm8,%xmm8
+ movq -56(%rax),%r9
+.cfi_restore %r9
+ vmovdqu %xmm8,(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lgcm_enc_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+/* Some utility routines */
+
+/*
+ * clear all fpu registers
+ * void clear_fpu_regs_avx(void);
+ */
+.globl clear_fpu_regs_avx
+.type clear_fpu_regs_avx,@function
+.align 32
+clear_fpu_regs_avx:
+ vzeroall
+ ret
+.size clear_fpu_regs_avx,.-clear_fpu_regs_avx
+
+/*
+ * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
+ *
+ * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
+ * stores the result at `dst'. The XOR is performed using FPU registers,
+ * so make sure FPU state is saved when running this in the kernel.
+ */
+.globl gcm_xor_avx
+.type gcm_xor_avx,@function
+.align 32
+gcm_xor_avx:
+ movdqu (%rdi), %xmm0
+ movdqu (%rsi), %xmm1
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, (%rsi)
+ ret
+.size gcm_xor_avx,.-gcm_xor_avx
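For reference, the operation gcm_xor_avx() performs is just a 16-byte XOR; a
plain C sketch of the same computation follows. The assembly version exists so
the XOR happens through %xmm registers alongside the rest of the AVX GCM path,
which is why the comment above insists on saved FPU state in the kernel.

    #include <stdint.h>

    /* C equivalent of gcm_xor_avx(): dst[0..15] ^= src[0..15], unaligned-safe. */
    static void
    gcm_xor_sketch(const uint8_t *src, uint8_t *dst)
    {
            int i;

            for (i = 0; i < 16; i++)
                    dst[i] ^= src[i];
    }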
+
+/*
+ * Toggle a boolean_t value atomically and return the new value.
+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+ */
+.globl atomic_toggle_boolean_nv
+.type atomic_toggle_boolean_nv,@function
+.align 32
+atomic_toggle_boolean_nv:
+ xorl %eax, %eax
+ lock
+ xorl $1, (%rdi)
+ jz 1f
+ movl $1, %eax
+1:
+ ret
+.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
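The semantics of atomic_toggle_boolean_nv() above (flip the value between 0 and
1 under a lock prefix and return the new value, derived from the zero flag) can
be sketched with the GCC/Clang atomic builtins; the boolean stand-in type below
is an assumption for illustration, not how ICP declares boolean_t.

    #include <stdint.h>

    typedef uint32_t boolean_sketch_t;      /* stand-in for ICP's boolean_t */

    /* Atomically flip *p between 0 and 1 and return the new value. */
    static boolean_sketch_t
    atomic_toggle_boolean_nv_sketch(volatile boolean_sketch_t *p)
    {
            return (__atomic_xor_fetch(p, 1, __ATOMIC_SEQ_CST));
    }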
+
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+
+/* Mark the stack non-executable. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S
new file mode 100644
index 000000000000..59edc4c8d56c
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S
@@ -0,0 +1,254 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009 Intel Corporation
+ * All Rights Reserved.
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains an accelerated
+ * Galois Field Multiplication implementation.
+ *
+ * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
+ * carry-less multiplication. More information about PCLMULQDQ can be
+ * found at:
+ * http://software.intel.com/en-us/articles/
+ * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ */
+
+/*
+ * ====================================================================
+ * OpenSolaris OS modifications
+ *
+ * This source originates as file galois_hash_asm.c from
+ * Intel Corporation dated September 21, 2009.
+ *
+ * This OpenSolaris version has these major changes from the original source:
+ *
+ * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
+ * definition for lint.
+ *
+ * 2. Formatted code, added comments, and added #includes and #defines.
+ *
+ * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
+ * calling kpreempt_disable() and kpreempt_enable().
+ * If the TS bit is not set, save and restore %xmm registers at the beginning
+ * and end of function calls (%xmm* registers are not saved and restored
+ * during kernel thread preemption).
+ *
+ * 4. Removed code to perform hashing. This is already done with C macro
+ * GHASH in gcm.c. For better performance, this removed code should be
+ * reintegrated in the future to replace the C GHASH macro.
+ *
+ * 5. Added code to byte swap 16-byte input and output.
+ *
+ * 6. Folded in comments from the original C source with embedded assembly
+ * (SB_w_shift_xor.c)
+ *
+ * 7. Renamed function and reordered parameters to match OpenSolaris:
+ * Intel interface:
+ * void galois_hash_asm(unsigned char *hk, unsigned char *s,
+ * unsigned char *d, int length)
+ * OpenSolaris OS interface:
+ * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
+ * ====================================================================
+ */
+
+
+#if defined(lint) || defined(__lint) /* lint */
+
+#include <sys/types.h>
+
+/* ARGSUSED */
+void
+gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
+}
+
+#elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+/*
+ * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
+ */
+
+// static uint8_t byte_swap16_mask[] = {
+// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+.data
+.align XMM_ALIGN
+.Lbyte_swap16_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+
+/*
+ * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
+ *
+ * Perform a carry-less multiplication (that is, combine the partial
+ * products with XOR instead of addition) of P1 and P2 and place the
+ * result in P3.
+ *
+ * Byte swap the input and the output.
+ *
+ * Note: x_in, y, and res all point to a block of 16-byte numbers
+ * (an array of two 64-bit integers).
+ *
+ * Note2: For kernel code, caller is responsible for ensuring
+ * kpreempt_disable() has been called. This is because %xmm registers are
+ * not saved/restored. Clear and set the CR0.TS bit on entry and exit,
+ * respectively, if TS is set on entry. Otherwise, if TS is not set,
+ * save and restore %xmm registers on the stack.
+ *
+ * Note3: Original Intel definition:
+ * void galois_hash_asm(unsigned char *hk, unsigned char *s,
+ * unsigned char *d, int length)
+ *
+ * Note4: Register/parameter mapping:
+ * Intel:
+ * Parameter 1: %rcx (copied to %xmm0) hk or x_in
+ * Parameter 2: %rdx (copied to %xmm1) s or y
+ * Parameter 3: %rdi (result) d or res
+ * OpenSolaris:
+ * Parameter 1: %rdi (copied to %xmm0) x_in
+ * Parameter 2: %rsi (copied to %xmm1) y
+ * Parameter 3: %rdx (result) res
+ */
+
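A hypothetical caller, illustrating Note2 and Note4 above: the caller XORs the
data block into the state itself (the C GHASH macro in gcm.c, per change 4 in
the OpenSolaris notes) and guards the %xmm usage. kfpu_begin()/kfpu_end() are
stand-in names for whatever FPU-save/preemption-disable mechanism the platform
provides, not identifiers taken from this patch.

    #include <stdint.h>

    extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
    /* Stand-ins for the platform's FPU guard, named only for illustration. */
    extern void kfpu_begin(void);
    extern void kfpu_end(void);

    /* One GHASH step: x = (x ^ block) * H, with x, block, h as 2x uint64_t. */
    static void
    ghash_step_sketch(uint64_t x[2], const uint64_t block[2], uint64_t h[2])
    {
            x[0] ^= block[0];
            x[1] ^= block[1];
            kfpu_begin();                   /* %xmm registers clobbered below */
            gcm_mul_pclmulqdq(x, h, x);     /* result may alias x_in */
            kfpu_end();
    }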
+ENTRY_NP(gcm_mul_pclmulqdq)
+ //
+ // Copy Parameters
+ //
+ movdqu (%rdi), %xmm0 // P1
+ movdqu (%rsi), %xmm1 // P2
+
+ //
+ // Byte swap 16-byte input
+ //
+ lea .Lbyte_swap16_mask(%rip), %rax
+ movups (%rax), %xmm10
+ pshufb %xmm10, %xmm0
+ pshufb %xmm10, %xmm1
+
+
+ //
+ // Multiply with the hash key
+ //
+ movdqu %xmm0, %xmm3
+ pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0
+
+ movdqu %xmm0, %xmm4
+ pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1
+
+ movdqu %xmm0, %xmm5
+ pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0
+ movdqu %xmm0, %xmm6
+ pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1
+
+ pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0
+
+ movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5
+ psrldq $8, %xmm4 // shift xmm4 by 64 bits to the right
+ pslldq $8, %xmm5 // shift xmm5 by 64 bits to the left
+ pxor %xmm5, %xmm3
+ pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result
+ // of the carry-less multiplication of
+ // xmm0 by xmm1.
+
+ // We shift the result of the multiplication by one bit position
+ // to the left to cope for the fact that the bits are reversed.
+ movdqu %xmm3, %xmm7
+ movdqu %xmm6, %xmm8
+ pslld $1, %xmm3
+ pslld $1, %xmm6
+ psrld $31, %xmm7
+ psrld $31, %xmm8
+ movdqu %xmm7, %xmm9
+ pslldq $4, %xmm8
+ pslldq $4, %xmm7
+ psrldq $12, %xmm9
+ por %xmm7, %xmm3
+ por %xmm8, %xmm6
+ por %xmm9, %xmm6
+
+ //
+ // First phase of the reduction
+ //
+ // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+ // independently.
+ movdqu %xmm3, %xmm7
+ movdqu %xmm3, %xmm8
+ movdqu %xmm3, %xmm9
+ pslld $31, %xmm7 // packed left shift << 31
+ pslld $30, %xmm8 // packed left shift << 30
+ pslld $25, %xmm9 // packed left shift << 25
+ pxor %xmm8, %xmm7 // xor the shifted versions
+ pxor %xmm9, %xmm7
+ movdqu %xmm7, %xmm8
+ pslldq $12, %xmm7
+ psrldq $4, %xmm8
+ pxor %xmm7, %xmm3 // first phase of the reduction complete
+
+ //
+ // Second phase of the reduction
+ //
+ // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+ // shift operations.
+ movdqu %xmm3, %xmm2
+ movdqu %xmm3, %xmm4
+ movdqu %xmm3, %xmm5
+ psrld $1, %xmm2 // packed right shift >> 1
+ psrld $2, %xmm4 // packed right shift >> 2
+ psrld $7, %xmm5 // packed right shift >> 7
+ pxor %xmm4, %xmm2 // xor the shifted versions
+ pxor %xmm5, %xmm2
+ pxor %xmm8, %xmm2
+ pxor %xmm2, %xmm3
+ pxor %xmm3, %xmm6 // the result is in xmm6
+
+ //
+ // Byte swap 16-byte result
+ //
+ pshufb %xmm10, %xmm6 // %xmm10 has the swap mask
+
+ //
+ // Store the result
+ //
+ movdqu %xmm6, (%rdx) // P3
+
+
+ //
+ // Return
+ //
+ ret
+ SET_SIZE(gcm_mul_pclmulqdq)
+
+#endif /* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S
new file mode 100644
index 000000000000..90cc36b43a78
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S
@@ -0,0 +1,714 @@
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that
+# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
+# function features so called "528B" variant utilizing additional
+# 256+16 bytes of per-key storage [+512 bytes shared table].
+# Performance results are for this streamed GHASH subroutine and are
+# expressed in cycles per processed byte, less is better:
+#
+# gcc 3.4.x(*) assembler
+#
+# P4 28.6 14.0 +100%
+# Opteron 19.3 7.7 +150%
+# Core2 17.8 8.1(**) +120%
+# Atom 31.6 16.8 +88%
+# VIA Nano 21.8 10.1 +115%
+#
+# (*) comparison is not completely fair, because C results are
+# for vanilla "256B" implementation, while assembler results
+# are for "528B";-)
+# (**) it's mystery [to me] why Core2 result is not same as for
+# Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.
+
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter, ghash-x86.pl discusses that it makes lesser sense to
+# increase aggregate factor. Then why increase here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes lesser sense to
+# aggregate more multiplications than it takes to perform remaining
+# non-multiplication operations. 2x is near-optimal coefficient for
+# contemporary Intel CPUs (therefore modest improvement coefficient),
+# but not for Bulldozer. Latter is because logical SIMD operations
+# are twice as slow in comparison to Intel, so that critical path is
+# longer. A CPU with higher pclmulqdq issue rate would also benefit
+# from higher aggregate factor...
+#
+# Westmere 1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge 1.80(+7%)
+# Haswell 0.55(+93%) (if system doesn't support AVX)
+# Broadwell 0.45(+110%)(if system doesn't support AVX)
+# Skylake 0.44(+110%)(if system doesn't support AVX)
+# Bulldozer 1.49(+27%)
+# Silvermont 2.88(+13%)
+# Knights L 2.12(-) (if system doesn't support AVX)
+# Goldmont 1.08(+24%)
+
+# March 2013
+#
+# ... 8x aggregate factor AVX code path is using reduction algorithm
+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+# sub-optimally in comparison to above mentioned version. But thanks
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
+#
+# Knights Landing achieves 1.09 cpb.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
+# Generated once from
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl
+# and modified for ICP. Modifications are kept to a bare minimum to ease later
+# upstream merges.
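The CLMUL and AVX routines in this file all compute the same primitive:
multiplication in GF(2^128) with GCM's bit-reflected convention, folded as
Y_i = (Y_{i-1} xor X_i) * H. A bit-at-a-time reference for that multiply,
useful only as a correctness oracle against the accelerated paths, is sketched
below; it follows NIST SP 800-38D and shares nothing with how the assembly
actually works.

    #include <stdint.h>
    #include <string.h>

    /*
     * Bit-at-a-time reference for the GF(2^128) multiply used by GHASH
     * (NIST SP 800-38D).  GHASH itself is Y_i = (Y_{i-1} ^ X_i) * H, Y_0 = 0.
     */
    static void
    gf128_mul_ref(const uint8_t x[16], const uint8_t h[16], uint8_t out[16])
    {
            uint8_t z[16] = { 0 };          /* accumulator Z */
            uint8_t v[16];                  /* V starts as H */
            int i, j, k, lsb;

            (void) memcpy(v, h, 16);
            for (i = 0; i < 16; i++) {
                    for (j = 7; j >= 0; j--) {
                            if ((x[i] >> j) & 1) {          /* bit x_(8i+7-j) */
                                    for (k = 0; k < 16; k++)
                                            z[k] ^= v[k];
                            }
                            lsb = v[15] & 1;                /* bit V_127 */
                            for (k = 15; k > 0; k--)        /* V >>= 1 */
                                    v[k] = (v[k] >> 1) | (v[k - 1] << 7);
                            v[0] >>= 1;
                            if (lsb)        /* reduce by x^128+x^7+x^2+x+1 */
                                    v[0] ^= 0xE1;
                    }
            }
            (void) memcpy(out, z, 16);
    }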
+
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+
+.text
+
+.globl gcm_gmult_clmul
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+.cfi_startproc
+.L_gmult_clmul:
+ movdqu (%rdi),%xmm0
+ movdqa .Lbswap_mask(%rip),%xmm5
+ movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
+.byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size gcm_gmult_clmul,.-gcm_gmult_clmul
+
+.globl gcm_init_htab_avx
+.type gcm_init_htab_avx,@function
+.align 32
+gcm_init_htab_avx:
+.cfi_startproc
+ vzeroupper
+
+ vmovdqu (%rsi),%xmm2
+ // KCF/ICP stores H in network byte order with the hi qword first
+ // so we need to swap all bytes, not the 2 qwords.
+ vmovdqu .Lbswap_mask(%rip),%xmm4
+ vpshufb %xmm4,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp .Linit_start_avx
+.align 32
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rdi)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rdi)
+ leaq 48(%rdi),%rdi
+ subq $1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size gcm_init_htab_avx,.-gcm_init_htab_avx
+
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,@function
+.align 32
+gcm_gmult_avx:
+.cfi_startproc
+ jmp .L_gmult_clmul
+.cfi_endproc
+.size gcm_gmult_avx,.-gcm_gmult_avx
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,@function
+.align 32
+gcm_ghash_avx:
+.cfi_startproc
+ vzeroupper
+
+ vmovdqu (%rdi),%xmm10
+ leaq .L0x1c2_polynomial(%rip),%r10
+ leaq 64(%rsi),%rsi
+ vmovdqu .Lbswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%rcx
+ jb .Lshort_avx
+ subq $0x80,%rcx
+
+ vmovdqu 112(%rdx),%xmm14
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx
+ cmpq $0x80,%rcx
+ jb .Ltail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%rcx
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc .Loop8x_avx
+
+ addq $0x80,%rcx
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -16(%rdx,%rcx,1),%xmm14
+ leaq (%rdx,%rcx,1),%rdx
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne .Lshort_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size gcm_ghash_avx,.-gcm_ghash_avx
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long 7,0,7,0
+.L7_mask_poly:
+.long 7,0,450,0
+.align 64
+.type .Lrem_4bit,@object
+.Lrem_4bit:
+.long 0,0,0,471859200,0,943718400,0,610271232
+.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
+.type .Lrem_8bit,@object
+.Lrem_8bit:
+.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+
+/* Mark the stack non-executable. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/sha1/sha1-x86_64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/sha1/sha1-x86_64.S
new file mode 100644
index 000000000000..cb923784a730
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/sha1/sha1-x86_64.S
@@ -0,0 +1,1353 @@
+/*
+ * !/usr/bin/env perl
+ *
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project. The module is, however, dual licensed under OpenSSL and
+ * CRYPTOGAMS licenses depending on where you obtain it. For further
+ * details see http://www.openssl.org/~appro/cryptogams/.
+ * ====================================================================
+ *
+ * sha1_block procedure for x86_64.
+ *
+ * It was brought to my attention that on EM64T, compiler-generated code
+ * was far behind the 32-bit assembler implementation. This is unlike
+ * Opteron, where compiler-generated code was only 15% behind the 32-bit
+ * assembler, which originally made it hard to motivate the effort.
+ * There was a suggestion to mechanically translate the 32-bit code, but
+ * I dismissed it, reasoning that x86_64 offers enough register bank
+ * capacity to fully utilize SHA-1 parallelism. Therefore this fresh
+ * implementation:-) However! While the 64-bit code does perform better
+ * on Opteron, I failed to beat the 32-bit assembler on the EM64T core.
+ * Well, x86_64 does offer a larger *addressable* register bank, but the
+ * out-of-order core reaches for even more registers through dynamic
+ * aliasing, and the EM64T core must have managed to run-time optimize
+ * even 32-bit code just as well as the 64-bit code. Performance
+ * improvement is summarized in the
+ * following table:
+ *
+ *              gcc 3.4     32-bit asm    cycles/byte
+ *   Opteron     +45%         +20%           6.8
+ *   Xeon P4     +65%          +0%           9.9
+ *   Core2       +60%         +10%           7.0
+ *
+ *
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha1-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers).
+ *
+ */
+
+/*
+ * This file was generated by a perl script (sha1-x86_64.pl). The comments from
+ * the original file have been pasted above.
+ */
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sys/sha1.h>
+
+
+/* ARGSUSED */
+void
+sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks)
+{
+}
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+ENTRY_NP(sha1_block_data_order)
+ push %rbx
+ push %rbp
+ push %r12
+ mov %rsp,%rax
+ mov %rdi,%r8 # reassigned argument
+ sub $72,%rsp
+ mov %rsi,%r9 # reassigned argument
+ and $-64,%rsp
+ mov %rdx,%r10 # reassigned argument
+ mov %rax,64(%rsp)
+
+ mov 0(%r8),%edx
+ mov 4(%r8),%esi
+ mov 8(%r8),%edi
+ mov 12(%r8),%ebp
+ mov 16(%r8),%r11d
+.align 4
+.Lloop:
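+	# Each iteration of .Lloop compresses one 64-byte block: 80 fully
+	# unrolled rounds, using the round constants 0x5a827999, 0x6ed9eba1,
+	# 0x8f1bbcdc (-0x70e44324) and 0xca62c1d6 (-0x359d3e2a) for rounds
+	# 0-19, 20-39, 40-59 and 60-79 respectively.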
+ mov 0(%r9),%eax
+ bswap %eax
+ mov %eax,0(%rsp)
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov %edi,%ebx
+ mov 4(%r9),%eax
+ mov %edx,%r11d
+ xor %ebp,%ebx
+ bswap %eax
+ rol $5,%r11d
+ and %esi,%ebx
+ mov %eax,4(%rsp)
+ add %r11d,%r12d
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov 8(%r9),%eax
+ mov %r12d,%ebp
+ xor %edi,%ebx
+ bswap %eax
+ rol $5,%ebp
+ and %edx,%ebx
+ mov %eax,8(%rsp)
+ add %ebp,%r11d
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ lea 0x5a827999(%eax,%edi),%ebp
+ mov %edx,%ebx
+ mov 12(%r9),%eax
+ mov %r11d,%edi
+ xor %esi,%ebx
+ bswap %eax
+ rol $5,%edi
+ and %r12d,%ebx
+ mov %eax,12(%rsp)
+ add %edi,%ebp
+ xor %esi,%ebx
+ rol $30,%r12d
+ add %ebx,%ebp
+ lea 0x5a827999(%eax,%esi),%edi
+ mov %r12d,%ebx
+ mov 16(%r9),%eax
+ mov %ebp,%esi
+ xor %edx,%ebx
+ bswap %eax
+ rol $5,%esi
+ and %r11d,%ebx
+ mov %eax,16(%rsp)
+ add %esi,%edi
+ xor %edx,%ebx
+ rol $30,%r11d
+ add %ebx,%edi
+ lea 0x5a827999(%eax,%edx),%esi
+ mov %r11d,%ebx
+ mov 20(%r9),%eax
+ mov %edi,%edx
+ xor %r12d,%ebx
+ bswap %eax
+ rol $5,%edx
+ and %ebp,%ebx
+ mov %eax,20(%rsp)
+ add %edx,%esi
+ xor %r12d,%ebx
+ rol $30,%ebp
+ add %ebx,%esi
+ lea 0x5a827999(%eax,%r12d),%edx
+ mov %ebp,%ebx
+ mov 24(%r9),%eax
+ mov %esi,%r12d
+ xor %r11d,%ebx
+ bswap %eax
+ rol $5,%r12d
+ and %edi,%ebx
+ mov %eax,24(%rsp)
+ add %r12d,%edx
+ xor %r11d,%ebx
+ rol $30,%edi
+ add %ebx,%edx
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov %edi,%ebx
+ mov 28(%r9),%eax
+ mov %edx,%r11d
+ xor %ebp,%ebx
+ bswap %eax
+ rol $5,%r11d
+ and %esi,%ebx
+ mov %eax,28(%rsp)
+ add %r11d,%r12d
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov 32(%r9),%eax
+ mov %r12d,%ebp
+ xor %edi,%ebx
+ bswap %eax
+ rol $5,%ebp
+ and %edx,%ebx
+ mov %eax,32(%rsp)
+ add %ebp,%r11d
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ lea 0x5a827999(%eax,%edi),%ebp
+ mov %edx,%ebx
+ mov 36(%r9),%eax
+ mov %r11d,%edi
+ xor %esi,%ebx
+ bswap %eax
+ rol $5,%edi
+ and %r12d,%ebx
+ mov %eax,36(%rsp)
+ add %edi,%ebp
+ xor %esi,%ebx
+ rol $30,%r12d
+ add %ebx,%ebp
+ lea 0x5a827999(%eax,%esi),%edi
+ mov %r12d,%ebx
+ mov 40(%r9),%eax
+ mov %ebp,%esi
+ xor %edx,%ebx
+ bswap %eax
+ rol $5,%esi
+ and %r11d,%ebx
+ mov %eax,40(%rsp)
+ add %esi,%edi
+ xor %edx,%ebx
+ rol $30,%r11d
+ add %ebx,%edi
+ lea 0x5a827999(%eax,%edx),%esi
+ mov %r11d,%ebx
+ mov 44(%r9),%eax
+ mov %edi,%edx
+ xor %r12d,%ebx
+ bswap %eax
+ rol $5,%edx
+ and %ebp,%ebx
+ mov %eax,44(%rsp)
+ add %edx,%esi
+ xor %r12d,%ebx
+ rol $30,%ebp
+ add %ebx,%esi
+ lea 0x5a827999(%eax,%r12d),%edx
+ mov %ebp,%ebx
+ mov 48(%r9),%eax
+ mov %esi,%r12d
+ xor %r11d,%ebx
+ bswap %eax
+ rol $5,%r12d
+ and %edi,%ebx
+ mov %eax,48(%rsp)
+ add %r12d,%edx
+ xor %r11d,%ebx
+ rol $30,%edi
+ add %ebx,%edx
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov %edi,%ebx
+ mov 52(%r9),%eax
+ mov %edx,%r11d
+ xor %ebp,%ebx
+ bswap %eax
+ rol $5,%r11d
+ and %esi,%ebx
+ mov %eax,52(%rsp)
+ add %r11d,%r12d
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov 56(%r9),%eax
+ mov %r12d,%ebp
+ xor %edi,%ebx
+ bswap %eax
+ rol $5,%ebp
+ and %edx,%ebx
+ mov %eax,56(%rsp)
+ add %ebp,%r11d
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ lea 0x5a827999(%eax,%edi),%ebp
+ mov %edx,%ebx
+ mov 60(%r9),%eax
+ mov %r11d,%edi
+ xor %esi,%ebx
+ bswap %eax
+ rol $5,%edi
+ and %r12d,%ebx
+ mov %eax,60(%rsp)
+ add %edi,%ebp
+ xor %esi,%ebx
+ rol $30,%r12d
+ add %ebx,%ebp
+ lea 0x5a827999(%eax,%esi),%edi
+ mov 0(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 8(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%esi
+ xor 32(%rsp),%eax
+ and %r11d,%ebx
+ add %esi,%edi
+ xor 52(%rsp),%eax
+ xor %edx,%ebx
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,0(%rsp)
+ lea 0x5a827999(%eax,%edx),%esi
+ mov 4(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 12(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edx
+ xor 36(%rsp),%eax
+ and %ebp,%ebx
+ add %edx,%esi
+ xor 56(%rsp),%eax
+ xor %r12d,%ebx
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,4(%rsp)
+ lea 0x5a827999(%eax,%r12d),%edx
+ mov 8(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 16(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%r12d
+ xor 40(%rsp),%eax
+ and %edi,%ebx
+ add %r12d,%edx
+ xor 60(%rsp),%eax
+ xor %r11d,%ebx
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,8(%rsp)
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov 12(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 20(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%r11d
+ xor 44(%rsp),%eax
+ and %esi,%ebx
+ add %r11d,%r12d
+ xor 0(%rsp),%eax
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,12(%rsp)
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov 16(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 24(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%ebp
+ xor 48(%rsp),%eax
+ and %edx,%ebx
+ add %ebp,%r11d
+ xor 4(%rsp),%eax
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,16(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 20(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 28(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 52(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 8(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,20(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 24(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 32(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 56(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 12(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,24(%rsp)
+ lea 0x6ed9eba1(%eax,%edx),%esi
+ mov 28(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 36(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 60(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 16(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,28(%rsp)
+ lea 0x6ed9eba1(%eax,%r12d),%edx
+ mov 32(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 40(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 0(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 20(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,32(%rsp)
+ lea 0x6ed9eba1(%eax,%r11d),%r12d
+ mov 36(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 44(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 4(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 24(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,36(%rsp)
+ lea 0x6ed9eba1(%eax,%ebp),%r11d
+ mov 40(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 48(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 8(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 28(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,40(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 44(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 52(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 12(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 32(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,44(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 48(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 56(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 16(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 36(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,48(%rsp)
+ lea 0x6ed9eba1(%eax,%edx),%esi
+ mov 52(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 60(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 20(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 40(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,52(%rsp)
+ lea 0x6ed9eba1(%eax,%r12d),%edx
+ mov 56(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 0(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 24(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 44(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,56(%rsp)
+ lea 0x6ed9eba1(%eax,%r11d),%r12d
+ mov 60(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 4(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 28(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 48(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,60(%rsp)
+ lea 0x6ed9eba1(%eax,%ebp),%r11d
+ mov 0(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 8(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 32(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 52(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,0(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 4(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 12(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 36(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 56(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,4(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 8(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 16(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 40(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 60(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,8(%rsp)
+ lea 0x6ed9eba1(%eax,%edx),%esi
+ mov 12(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 20(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 44(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 0(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,12(%rsp)
+ lea 0x6ed9eba1(%eax,%r12d),%edx
+ mov 16(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 24(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 48(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 4(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,16(%rsp)
+ lea 0x6ed9eba1(%eax,%r11d),%r12d
+ mov 20(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 28(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 52(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 8(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,20(%rsp)
+ lea 0x6ed9eba1(%eax,%ebp),%r11d
+ mov 24(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 32(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 56(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 12(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,24(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 28(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 36(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 60(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 16(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,28(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 32(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 40(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 0(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 20(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,32(%rsp)
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 36(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 44(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 4(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 24(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,36(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 40(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 48(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 8(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 28(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,40(%rsp)
+ add %ebx,%edx
+ lea -0x70e44324(%eax,%r11d),%r12d
+ mov 44(%rsp),%eax
+ mov %esi,%ebx
+ mov %esi,%ecx
+ xor 52(%rsp),%eax
+ mov %edx,%r11d
+ and %edi,%ebx
+ xor 12(%rsp),%eax
+ or %edi,%ecx
+ rol $5,%r11d
+ xor 32(%rsp),%eax
+ and %ebp,%ecx
+ add %r11d,%r12d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%esi
+ mov %eax,44(%rsp)
+ add %ebx,%r12d
+ lea -0x70e44324(%eax,%ebp),%r11d
+ mov 48(%rsp),%eax
+ mov %edx,%ebx
+ mov %edx,%ecx
+ xor 56(%rsp),%eax
+ mov %r12d,%ebp
+ and %esi,%ebx
+ xor 16(%rsp),%eax
+ or %esi,%ecx
+ rol $5,%ebp
+ xor 36(%rsp),%eax
+ and %edi,%ecx
+ add %ebp,%r11d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edx
+ mov %eax,48(%rsp)
+ add %ebx,%r11d
+ lea -0x70e44324(%eax,%edi),%ebp
+ mov 52(%rsp),%eax
+ mov %r12d,%ebx
+ mov %r12d,%ecx
+ xor 60(%rsp),%eax
+ mov %r11d,%edi
+ and %edx,%ebx
+ xor 20(%rsp),%eax
+ or %edx,%ecx
+ rol $5,%edi
+ xor 40(%rsp),%eax
+ and %esi,%ecx
+ add %edi,%ebp
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r12d
+ mov %eax,52(%rsp)
+ add %ebx,%ebp
+ lea -0x70e44324(%eax,%esi),%edi
+ mov 56(%rsp),%eax
+ mov %r11d,%ebx
+ mov %r11d,%ecx
+ xor 0(%rsp),%eax
+ mov %ebp,%esi
+ and %r12d,%ebx
+ xor 24(%rsp),%eax
+ or %r12d,%ecx
+ rol $5,%esi
+ xor 44(%rsp),%eax
+ and %edx,%ecx
+ add %esi,%edi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r11d
+ mov %eax,56(%rsp)
+ add %ebx,%edi
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 60(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 4(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 28(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 48(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,60(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 0(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 8(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 32(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 52(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,0(%rsp)
+ add %ebx,%edx
+ lea -0x70e44324(%eax,%r11d),%r12d
+ mov 4(%rsp),%eax
+ mov %esi,%ebx
+ mov %esi,%ecx
+ xor 12(%rsp),%eax
+ mov %edx,%r11d
+ and %edi,%ebx
+ xor 36(%rsp),%eax
+ or %edi,%ecx
+ rol $5,%r11d
+ xor 56(%rsp),%eax
+ and %ebp,%ecx
+ add %r11d,%r12d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%esi
+ mov %eax,4(%rsp)
+ add %ebx,%r12d
+ lea -0x70e44324(%eax,%ebp),%r11d
+ mov 8(%rsp),%eax
+ mov %edx,%ebx
+ mov %edx,%ecx
+ xor 16(%rsp),%eax
+ mov %r12d,%ebp
+ and %esi,%ebx
+ xor 40(%rsp),%eax
+ or %esi,%ecx
+ rol $5,%ebp
+ xor 60(%rsp),%eax
+ and %edi,%ecx
+ add %ebp,%r11d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edx
+ mov %eax,8(%rsp)
+ add %ebx,%r11d
+ lea -0x70e44324(%eax,%edi),%ebp
+ mov 12(%rsp),%eax
+ mov %r12d,%ebx
+ mov %r12d,%ecx
+ xor 20(%rsp),%eax
+ mov %r11d,%edi
+ and %edx,%ebx
+ xor 44(%rsp),%eax
+ or %edx,%ecx
+ rol $5,%edi
+ xor 0(%rsp),%eax
+ and %esi,%ecx
+ add %edi,%ebp
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r12d
+ mov %eax,12(%rsp)
+ add %ebx,%ebp
+ lea -0x70e44324(%eax,%esi),%edi
+ mov 16(%rsp),%eax
+ mov %r11d,%ebx
+ mov %r11d,%ecx
+ xor 24(%rsp),%eax
+ mov %ebp,%esi
+ and %r12d,%ebx
+ xor 48(%rsp),%eax
+ or %r12d,%ecx
+ rol $5,%esi
+ xor 4(%rsp),%eax
+ and %edx,%ecx
+ add %esi,%edi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r11d
+ mov %eax,16(%rsp)
+ add %ebx,%edi
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 20(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 28(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 52(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 8(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,20(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 24(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 32(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 56(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 12(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,24(%rsp)
+ add %ebx,%edx
+ lea -0x70e44324(%eax,%r11d),%r12d
+ mov 28(%rsp),%eax
+ mov %esi,%ebx
+ mov %esi,%ecx
+ xor 36(%rsp),%eax
+ mov %edx,%r11d
+ and %edi,%ebx
+ xor 60(%rsp),%eax
+ or %edi,%ecx
+ rol $5,%r11d
+ xor 16(%rsp),%eax
+ and %ebp,%ecx
+ add %r11d,%r12d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%esi
+ mov %eax,28(%rsp)
+ add %ebx,%r12d
+ lea -0x70e44324(%eax,%ebp),%r11d
+ mov 32(%rsp),%eax
+ mov %edx,%ebx
+ mov %edx,%ecx
+ xor 40(%rsp),%eax
+ mov %r12d,%ebp
+ and %esi,%ebx
+ xor 0(%rsp),%eax
+ or %esi,%ecx
+ rol $5,%ebp
+ xor 20(%rsp),%eax
+ and %edi,%ecx
+ add %ebp,%r11d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edx
+ mov %eax,32(%rsp)
+ add %ebx,%r11d
+ lea -0x70e44324(%eax,%edi),%ebp
+ mov 36(%rsp),%eax
+ mov %r12d,%ebx
+ mov %r12d,%ecx
+ xor 44(%rsp),%eax
+ mov %r11d,%edi
+ and %edx,%ebx
+ xor 4(%rsp),%eax
+ or %edx,%ecx
+ rol $5,%edi
+ xor 24(%rsp),%eax
+ and %esi,%ecx
+ add %edi,%ebp
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r12d
+ mov %eax,36(%rsp)
+ add %ebx,%ebp
+ lea -0x70e44324(%eax,%esi),%edi
+ mov 40(%rsp),%eax
+ mov %r11d,%ebx
+ mov %r11d,%ecx
+ xor 48(%rsp),%eax
+ mov %ebp,%esi
+ and %r12d,%ebx
+ xor 8(%rsp),%eax
+ or %r12d,%ecx
+ rol $5,%esi
+ xor 28(%rsp),%eax
+ and %edx,%ecx
+ add %esi,%edi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r11d
+ mov %eax,40(%rsp)
+ add %ebx,%edi
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 44(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 52(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 12(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 32(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,44(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 48(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 56(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 16(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 36(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,48(%rsp)
+ add %ebx,%edx
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 52(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 60(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 20(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 40(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,52(%rsp)
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov 56(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 0(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 24(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 44(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,56(%rsp)
+ lea -0x359d3e2a(%eax,%edi),%ebp
+ mov 60(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 4(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 28(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 48(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,60(%rsp)
+ lea -0x359d3e2a(%eax,%esi),%edi
+ mov 0(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 8(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 32(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 52(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,0(%rsp)
+ lea -0x359d3e2a(%eax,%edx),%esi
+ mov 4(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 12(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 36(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 56(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,4(%rsp)
+ lea -0x359d3e2a(%eax,%r12d),%edx
+ mov 8(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 16(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 40(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 60(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,8(%rsp)
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 12(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 20(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 44(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 0(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,12(%rsp)
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov 16(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 24(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 48(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 4(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,16(%rsp)
+ lea -0x359d3e2a(%eax,%edi),%ebp
+ mov 20(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 28(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 52(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 8(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,20(%rsp)
+ lea -0x359d3e2a(%eax,%esi),%edi
+ mov 24(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 32(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 56(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 12(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,24(%rsp)
+ lea -0x359d3e2a(%eax,%edx),%esi
+ mov 28(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 36(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 60(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 16(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,28(%rsp)
+ lea -0x359d3e2a(%eax,%r12d),%edx
+ mov 32(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 40(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 0(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 20(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,32(%rsp)
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 36(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 44(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 4(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 24(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,36(%rsp)
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov 40(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 48(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 8(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 28(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,40(%rsp)
+ lea -0x359d3e2a(%eax,%edi),%ebp
+ mov 44(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 52(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 12(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 32(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,44(%rsp)
+ lea -0x359d3e2a(%eax,%esi),%edi
+ mov 48(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 56(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 16(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 36(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,48(%rsp)
+ lea -0x359d3e2a(%eax,%edx),%esi
+ mov 52(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 60(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 20(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 40(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ lea -0x359d3e2a(%eax,%r12d),%edx
+ mov 56(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 0(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 24(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 44(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 60(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 4(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 28(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 48(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor %edi,%ebx
+ add %ebp,%r11d
+ rol $30,%edx
+ add %ebx,%r11d
+ // Update and save state information in SHA-1 context
+ add 0(%r8),%r11d
+ add 4(%r8),%r12d
+ add 8(%r8),%edx
+ add 12(%r8),%esi
+ add 16(%r8),%edi
+ mov %r11d,0(%r8)
+ mov %r12d,4(%r8)
+ mov %edx,8(%r8)
+ mov %esi,12(%r8)
+ mov %edi,16(%r8)
+
+ xchg %r11d,%edx # mov %r11d,%edx
+ xchg %r12d,%esi # mov %r12d,%esi
+ xchg %r11d,%edi # mov %edx,%edi
+ xchg %r12d,%ebp # mov %esi,%ebp
+ # mov %edi,%r11d
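+	# The xchg sequence above moves the updated state a..e back into
+	# %edx, %esi, %edi, %ebp, %r11d, the register assignment expected at
+	# the top of .Lloop.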
+ lea 64(%r9),%r9
+ sub $1,%r10
+ jnz .Lloop
+ mov 64(%rsp),%rsp
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+SET_SIZE(sha1_block_data_order)
+
+.data
+.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+
+#endif /* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S
new file mode 100644
index 000000000000..766b75355f0b
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S
@@ -0,0 +1,2063 @@
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project. Rights for redistribution and usage in source and binary
+ * forms are granted according to the OpenSSL license.
+ * ====================================================================
+ *
+ * sha256/512_block procedure for x86_64.
+ *
+ * 40% improvement over compiler-generated code on Opteron. On EM64T
+ * sha256 was observed to run >80% faster and sha512 >40% faster. No magical
+ * tricks, just straight implementation... I really wonder why gcc
+ * [being armed with inline assembler] fails to generate as fast code.
+ * The only thing which is cool about this module is that it's the very
+ * same instruction sequence used for both SHA-256 and SHA-512. In the
+ * former case the instructions operate on 32-bit operands, while in the
+ * latter on 64-bit ones. All I had to do was get one flavor right;
+ * the other one passed the test right away:-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to the IA-64 implementation, which maintains
+ * X[16] in the register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is a very good result for the
+ * 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
+ * there is a way to improve it, *then* the only way would be to try to
+ * offload X[16] updates to SSE unit, but that would require "deeper"
+ * loop unroll, which in turn would naturally cause size blow-up, not
+ * to mention increased complexity! And once again, only *if* it's
+ * actually possible to noticeably improve overall ILP, instruction
+ * level parallelism, on a given CPU implementation in this case.
+ *
+ * Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
+ * performance ratio of 1.5 between the 64- and 32-bit flavors [see above],
+ * [currently available] EM64T CPUs apparently are far from it. On the
+ * contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
+ * 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
+ * apparently are not atomic instructions, but are implemented in microcode.
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers). Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
+ * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
+ */
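+
+/*
+ * Illustrative sketch (not taken from the original sources): item 4 above
+ * implies a SHA2_CTX layout along these lines; the authoritative definition
+ * lives in the OpenSolaris sha2.h and may differ in detail:
+ *
+ *	typedef struct {
+ *		uint32_t algotype;	// 4-byte field at offset 0
+ *		union {
+ *			uint32_t s32[8];	// SHA-256 state
+ *			uint64_t s64[8];	// SHA-512 state
+ *		} state;			// 8-byte aligned, i.e. at offset 8
+ *		// bit count and input buffer fields follow
+ *	} SHA2_CTX;
+ *
+ * which is why the code below does "add $8,%rdi": it steps over algotype
+ * (plus alignment padding) so that %rdi points at the hash state words.
+ */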
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
+ */
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sha2/sha2.h>
+
+/* ARGSUSED */
+void
+SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
+{
+}
+
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_NP(SHA256TransformBlocks)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%rbp # copy %rsp
+ shl $4,%rdx # num*16
+ sub $16*4+4*8,%rsp
+ lea (%rsi,%rdx,4),%rdx # inp+num*16*4
+ and $-64,%rsp # align stack frame
+ add $8,%rdi # Skip OpenSolaris field, "algotype"
+ mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg
+ mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg
+ mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg
+ mov %rbp,16*4+3*8(%rsp) # save copy of %rsp
+
+ #.picmeup %rbp
+ # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts
+ # the address of the "next" instruction into the target register
+ # (%rbp). This generates these 2 instructions:
+ lea .Llea(%rip),%rbp
+ #nop # .picmeup generates a nop for mod 8 alignment--not needed here
+
+.Llea:
+ lea K256-.(%rbp),%rbp
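+	# %rbp now holds the address of the K256 round-constant table; the
+	# rounds below fetch K[round] via (%rbp,%rdi,4).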
+
+ mov 4*0(%rdi),%eax
+ mov 4*1(%rdi),%ebx
+ mov 4*2(%rdi),%ecx
+ mov 4*3(%rdi),%edx
+ mov 4*4(%rdi),%r8d
+ mov 4*5(%rdi),%r9d
+ mov 4*6(%rdi),%r10d
+ mov 4*7(%rdi),%r11d
+ jmp .Lloop
+
+.align 16
+.Lloop:
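+	# Each iteration of .Lloop compresses one 64-byte block. %rdi counts
+	# the round number and indexes K256 via (%rbp,%rdi,4); rounds 0-15
+	# below load the big-endian message words directly from (%rsi).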
+ xor %rdi,%rdi
+ mov 4*0(%rsi),%r12d
+ bswap %r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,0(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 4*1(%rsi),%r12d
+ bswap %r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,4(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 4*2(%rsi),%r12d
+ bswap %r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,8(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 4*3(%rsi),%r12d
+ bswap %r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,12(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 4*4(%rsi),%r12d
+ bswap %r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,16(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 4*5(%rsi),%r12d
+ bswap %r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,20(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 4*6(%rsi),%r12d
+ bswap %r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,24(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 4*7(%rsi),%r12d
+ bswap %r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,28(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ mov 4*8(%rsi),%r12d
+ bswap %r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,32(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 4*9(%rsi),%r12d
+ bswap %r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,36(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 4*10(%rsi),%r12d
+ bswap %r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,40(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 4*11(%rsi),%r12d
+ bswap %r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,44(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 4*12(%rsi),%r12d
+ bswap %r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,48(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 4*13(%rsi),%r12d
+ bswap %r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,52(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 4*14(%rsi),%r12d
+ bswap %r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,56(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 4*15(%rsi),%r12d
+ bswap %r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,60(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
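+	# Rounds 16-63: each message word W[i] is computed on the fly as
+	# sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16] before the same
+	# round function as in rounds 0-15 is applied.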
+ mov 4(%rsp),%r13d
+ mov 56(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 36(%rsp),%r12d
+
+ add 0(%rsp),%r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,0(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 8(%rsp),%r13d
+ mov 60(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 40(%rsp),%r12d
+
+ add 4(%rsp),%r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,4(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 12(%rsp),%r13d
+ mov 0(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 44(%rsp),%r12d
+
+ add 8(%rsp),%r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,8(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 16(%rsp),%r13d
+ mov 4(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 48(%rsp),%r12d
+
+ add 12(%rsp),%r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,12(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 20(%rsp),%r13d
+ mov 8(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 52(%rsp),%r12d
+
+ add 16(%rsp),%r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,16(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 24(%rsp),%r13d
+ mov 12(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 56(%rsp),%r12d
+
+ add 20(%rsp),%r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,20(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 28(%rsp),%r13d
+ mov 16(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 60(%rsp),%r12d
+
+ add 24(%rsp),%r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,24(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 32(%rsp),%r13d
+ mov 20(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 0(%rsp),%r12d
+
+ add 28(%rsp),%r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,28(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ mov 36(%rsp),%r13d
+ mov 24(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 4(%rsp),%r12d
+
+ add 32(%rsp),%r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,32(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 40(%rsp),%r13d
+ mov 28(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 8(%rsp),%r12d
+
+ add 36(%rsp),%r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,36(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 44(%rsp),%r13d
+ mov 32(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 12(%rsp),%r12d
+
+ add 40(%rsp),%r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,40(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 48(%rsp),%r13d
+ mov 36(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 16(%rsp),%r12d
+
+ add 44(%rsp),%r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,44(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 52(%rsp),%r13d
+ mov 40(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 20(%rsp),%r12d
+
+ add 48(%rsp),%r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,48(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 56(%rsp),%r13d
+ mov 44(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 24(%rsp),%r12d
+
+ add 52(%rsp),%r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,52(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 60(%rsp),%r13d
+ mov 48(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 28(%rsp),%r12d
+
+ add 56(%rsp),%r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,56(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 0(%rsp),%r13d
+ mov 52(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 32(%rsp),%r12d
+
+ add 60(%rsp),%r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,60(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ cmp $64,%rdi
+ jb .Lrounds_16_xx
+
+ mov 16*4+0*8(%rsp),%rdi
+ lea 16*4(%rsi),%rsi
+
+ add 4*0(%rdi),%eax
+ add 4*1(%rdi),%ebx
+ add 4*2(%rdi),%ecx
+ add 4*3(%rdi),%edx
+ add 4*4(%rdi),%r8d
+ add 4*5(%rdi),%r9d
+ add 4*6(%rdi),%r10d
+ add 4*7(%rdi),%r11d
+
+ cmp 16*4+2*8(%rsp),%rsi
+
+ mov %eax,4*0(%rdi)
+ mov %ebx,4*1(%rdi)
+ mov %ecx,4*2(%rdi)
+ mov %edx,4*3(%rdi)
+ mov %r8d,4*4(%rdi)
+ mov %r9d,4*5(%rdi)
+ mov %r10d,4*6(%rdi)
+ mov %r11d,4*7(%rdi)
+ jb .Lloop
+
+ mov 16*4+3*8(%rsp),%rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+
+ ret
+SET_SIZE(SHA256TransformBlocks)
+
+.data
+.align 64
+.type K256,@object
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+#endif /* !lint && !__lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S
new file mode 100644
index 000000000000..6e37618761b2
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S
@@ -0,0 +1,2088 @@
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project. Rights for redistribution and usage in source and binary
+ * forms are granted according to the OpenSSL license.
+ * ====================================================================
+ *
+ * sha256/512_block procedure for x86_64.
+ *
+ * 40% improvement over compiler-generated code on Opteron. On EM64T
+ * sha256 was observed to run >80% faster and sha512 >40% faster. No
+ * magical tricks, just a straight implementation... I really wonder why
+ * gcc [even armed with inline assembler] fails to generate code this fast.
+ * The notable thing about this module is that the very same instruction
+ * sequence is used for both SHA-256 and SHA-512. In the former case the
+ * instructions operate on 32-bit operands, while in the latter they
+ * operate on 64-bit ones. All I had to do was get one flavor right; the
+ * other one passed the test right away :-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to IA-64 implementation, which maintains
+ * X[16] in register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is very good result for 3-way
+ * issue Opteron pipeline and X[16] maintained in memory. So that *if*
+ * there is a way to improve it, *then* the only way would be to try to
+ * offload X[16] updates to SSE unit, but that would require "deeper"
+ * loop unroll, which in turn would naturally cause size blow-up, not
+ * to mention increased complexity! And once again, only *if* it's
+ * actually possible to noticeably improve overall ILP, instruction
+ * level parallelism, on a given CPU implementation in this case.
+ *
+ * Special note on Intel EM64T. While Opteron CPU exhibits perfect
+ * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
+ * [currently available] EM64T CPUs apparently are far from it. On the
+ * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
+ * sha256_block:-( This is presumably because 64-bit shifts/rotates
+ * apparently are not atomic instructions, but implemented in microcode.
+ */
+
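+/*
+ * For readers of the unrolled rounds below, a minimal illustrative C sketch
+ * of the helper functions that the ror/shr/xor/and sequences compute (the
+ * helper names and the ROTR64 macro are ad hoc and not part of the
+ * generated assembly; the rotation counts match the instructions used in
+ * this file):
+ *
+ *	#include <stdint.h>
+ *	#define ROTR64(x, n)	(((x) >> (n)) | ((x) << (64 - (n))))
+ *
+ *	static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)
+ *	{ return (((f ^ g) & e) ^ g); }
+ *	static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c)
+ *	{ return (((a | c) & b) | (a & c)); }
+ *	static inline uint64_t BigSigma0(uint64_t a)	// ror 28,34,39
+ *	{ return (ROTR64(a, 28) ^ ROTR64(a, 34) ^ ROTR64(a, 39)); }
+ *	static inline uint64_t BigSigma1(uint64_t e)	// ror 14,18,41
+ *	{ return (ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41)); }
+ *	static inline uint64_t SmallSigma0(uint64_t x)	// ror 1,8; shr 7
+ *	{ return (ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7)); }
+ *	static inline uint64_t SmallSigma1(uint64_t x)	// ror 19,61; shr 6
+ *	{ return (ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6)); }
+ *
+ * Each round then computes T1 = h + BigSigma1(e) + Ch(e,f,g) + K[i] + W[i],
+ * adds T1 into d, and forms the new working value BigSigma0(a) +
+ * Maj(a,b,c) + T1, exactly as the per-instruction comments ("T1+=h",
+ * "h=Sigma0(a)", "d+=T1", ...) annotate below. The sha256 flavor generated
+ * from the same script is identical in structure, just with 32-bit
+ * operands and rotation counts (2,13,22) for Sigma0, (6,11,25) for Sigma1,
+ * and (7,18,>>3) and (17,19,>>10) for the small sigmas.
+ */
+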
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers). Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
+ *    at the beginning of SHA2_CTX (the next field is 8-byte aligned); see
+ *    the layout sketch below.
+ */
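+
+/*
+ * Illustrative sketch of the layout assumption behind change 4 above. The
+ * member names other than "algotype" are placeholders; the real definition
+ * lives in sha2/sha2.h:
+ *
+ *	typedef struct {
+ *		uint32_t algotype;	// offset 0: OpenSolaris-only field
+ *					// offset 4: implicit alignment padding
+ *		uint64_t state[8];	// offset 8: working state a..h
+ *		// ... remaining context fields ...
+ *	} SHA2_CTX;
+ *
+ * Hence the "add $8,%rdi" in the prologue below, which advances the ctx
+ * pointer past "algotype" so that the hash state words start at offset 0
+ * from the adjusted pointer.
+ */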
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate the sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
+ */
+
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sha2/sha2.h>
+
+/* ARGSUSED */
+void
+SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
+{
+}
+
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_NP(SHA512TransformBlocks)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%rbp # copy %rsp
+ shl $4,%rdx # num*16
+ sub $16*8+4*8,%rsp
+ lea (%rsi,%rdx,8),%rdx # inp+num*16*8
+ and $-64,%rsp # align stack frame
+ add $8,%rdi # Skip OpenSolaris field, "algotype"
+ mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg
+ mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg
+ mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg
+ mov %rbp,16*8+3*8(%rsp) # save copy of %rsp
+
+ #.picmeup %rbp
+ # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts
+ # the address of the "next" instruction into the target register
+ # (%rbp). This generates these 2 instructions:
+ lea .Llea(%rip),%rbp
+ #nop # .picmeup generates a nop for mod 8 alignment--not needed here
+
+.Llea:
+ lea K512-.(%rbp),%rbp
+
+ mov 8*0(%rdi),%rax
+ mov 8*1(%rdi),%rbx
+ mov 8*2(%rdi),%rcx
+ mov 8*3(%rdi),%rdx
+ mov 8*4(%rdi),%r8
+ mov 8*5(%rdi),%r9
+ mov 8*6(%rdi),%r10
+ mov 8*7(%rdi),%r11
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ xor %rdi,%rdi
+ mov 8*0(%rsi),%r12
+ bswap %r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,0(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 8*1(%rsi),%r12
+ bswap %r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,8(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 8*2(%rsi),%r12
+ bswap %r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,16(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 8*3(%rsi),%r12
+ bswap %r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,24(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 8*4(%rsi),%r12
+ bswap %r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,32(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 8*5(%rsi),%r12
+ bswap %r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,40(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 8*6(%rsi),%r12
+ bswap %r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,48(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 8*7(%rsi),%r12
+ bswap %r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,56(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ mov 8*8(%rsi),%r12
+ bswap %r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,64(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 8*9(%rsi),%r12
+ bswap %r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,72(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 8*10(%rsi),%r12
+ bswap %r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,80(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 8*11(%rsi),%r12
+ bswap %r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,88(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 8*12(%rsi),%r12
+ bswap %r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,96(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 8*13(%rsi),%r12
+ bswap %r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,104(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 8*14(%rsi),%r12
+ bswap %r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,112(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 8*15(%rsi),%r12
+ bswap %r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,120(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ mov 8(%rsp),%r13
+ mov 112(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 72(%rsp),%r12
+
+ add 0(%rsp),%r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,0(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 16(%rsp),%r13
+ mov 120(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 80(%rsp),%r12
+
+ add 8(%rsp),%r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,8(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 24(%rsp),%r13
+ mov 0(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 88(%rsp),%r12
+
+ add 16(%rsp),%r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,16(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 32(%rsp),%r13
+ mov 8(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 96(%rsp),%r12
+
+ add 24(%rsp),%r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,24(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 40(%rsp),%r13
+ mov 16(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 104(%rsp),%r12
+
+ add 32(%rsp),%r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,32(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 48(%rsp),%r13
+ mov 24(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 112(%rsp),%r12
+
+ add 40(%rsp),%r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,40(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 56(%rsp),%r13
+ mov 32(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 120(%rsp),%r12
+
+ add 48(%rsp),%r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,48(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 64(%rsp),%r13
+ mov 40(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 0(%rsp),%r12
+
+ add 56(%rsp),%r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,56(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ mov 72(%rsp),%r13
+ mov 48(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 8(%rsp),%r12
+
+ add 64(%rsp),%r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,64(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 80(%rsp),%r13
+ mov 56(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 16(%rsp),%r12
+
+ add 72(%rsp),%r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,72(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 88(%rsp),%r13
+ mov 64(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 24(%rsp),%r12
+
+ add 80(%rsp),%r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,80(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 96(%rsp),%r13
+ mov 72(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 32(%rsp),%r12
+
+ add 88(%rsp),%r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,88(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 104(%rsp),%r13
+ mov 80(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 40(%rsp),%r12
+
+ add 96(%rsp),%r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,96(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 112(%rsp),%r13
+ mov 88(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 48(%rsp),%r12
+
+ add 104(%rsp),%r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,104(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 120(%rsp),%r13
+ mov 96(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 56(%rsp),%r12
+
+ add 112(%rsp),%r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,112(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 0(%rsp),%r13
+ mov 104(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 64(%rsp),%r12
+
+ add 120(%rsp),%r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,120(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ cmp $80,%rdi
+ jb .Lrounds_16_xx
+
+ mov 16*8+0*8(%rsp),%rdi
+ lea 16*8(%rsi),%rsi
+
+ add 8*0(%rdi),%rax
+ add 8*1(%rdi),%rbx
+ add 8*2(%rdi),%rcx
+ add 8*3(%rdi),%rdx
+ add 8*4(%rdi),%r8
+ add 8*5(%rdi),%r9
+ add 8*6(%rdi),%r10
+ add 8*7(%rdi),%r11
+
+ cmp 16*8+2*8(%rsp),%rsi
+
+ mov %rax,8*0(%rdi)
+ mov %rbx,8*1(%rdi)
+ mov %rcx,8*2(%rdi)
+ mov %rdx,8*3(%rdi)
+ mov %r8,8*4(%rdi)
+ mov %r9,8*5(%rdi)
+ mov %r10,8*6(%rdi)
+ mov %r11,8*7(%rdi)
+ jb .Lloop
+
+ mov 16*8+3*8(%rsp),%rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+
+ ret
+SET_SIZE(SHA512TransformBlocks)
+
+.data
+.align 64
+.type K512,@object
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+#endif /* !lint && !__lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/core/kcf_callprov.c b/sys/contrib/openzfs/module/icp/core/kcf_callprov.c
new file mode 100644
index 000000000000..fd2f7e1aac3d
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/core/kcf_callprov.c
@@ -0,0 +1,1567 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+
+static int kcf_emulate_dual(kcf_provider_desc_t *, crypto_ctx_t *,
+ kcf_req_params_t *);
+
+void
+kcf_free_triedlist(kcf_prov_tried_t *list)
+{
+ kcf_prov_tried_t *l;
+
+ while ((l = list) != NULL) {
+ list = list->pt_next;
+ KCF_PROV_REFRELE(l->pt_pd);
+ kmem_free(l, sizeof (kcf_prov_tried_t));
+ }
+}
+
+kcf_prov_tried_t *
+kcf_insert_triedlist(kcf_prov_tried_t **list, kcf_provider_desc_t *pd,
+ int kmflag)
+{
+ kcf_prov_tried_t *l;
+
+ l = kmem_alloc(sizeof (kcf_prov_tried_t), kmflag);
+ if (l == NULL)
+ return (NULL);
+
+ l->pt_pd = pd;
+ l->pt_next = *list;
+ *list = l;
+
+ return (l);
+}
+
+static boolean_t
+is_in_triedlist(kcf_provider_desc_t *pd, kcf_prov_tried_t *triedl)
+{
+ while (triedl != NULL) {
+ if (triedl->pt_pd == pd)
+ return (B_TRUE);
+ triedl = triedl->pt_next;
+	}
+
+ return (B_FALSE);
+}
+
+/*
+ * Search a mech entry's hardware provider list for the specified
+ * provider. Return true if found.
+ */
+static boolean_t
+is_valid_provider_for_mech(kcf_provider_desc_t *pd, kcf_mech_entry_t *me,
+ crypto_func_group_t fg)
+{
+ kcf_prov_mech_desc_t *prov_chain;
+
+ prov_chain = me->me_hw_prov_chain;
+ if (prov_chain != NULL) {
+ ASSERT(me->me_num_hwprov > 0);
+ for (; prov_chain != NULL; prov_chain = prov_chain->pm_next) {
+ if (prov_chain->pm_prov_desc == pd &&
+ IS_FG_SUPPORTED(prov_chain, fg)) {
+ return (B_TRUE);
+ }
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * This routine, given a logical provider, returns the least loaded real
+ * provider belonging to that logical provider. The provider must be able
+ * to do the specified mechanism, i.e. the mechanism must not have been
+ * disabled for it. In addition, just in case providers are not entirely
+ * equivalent, the caller passes the required function group (fg) so that
+ * the provider's support for it can be verified. On success a held
+ * provider is returned through *new; if no usable provider can be found,
+ * *new is set to NULL and an error code is returned.
+ */
+int
+kcf_get_hardware_provider(crypto_mech_type_t mech_type_1,
+ crypto_mech_type_t mech_type_2, boolean_t call_restrict,
+ kcf_provider_desc_t *old, kcf_provider_desc_t **new, crypto_func_group_t fg)
+{
+ kcf_provider_desc_t *provider, *real_pd = old;
+ kcf_provider_desc_t *gpd = NULL; /* good provider */
+ kcf_provider_desc_t *bpd = NULL; /* busy provider */
+ kcf_provider_list_t *p;
+ kcf_ops_class_t class;
+ kcf_mech_entry_t *me;
+ kcf_mech_entry_tab_t *me_tab;
+ int index, len, gqlen = INT_MAX, rv = CRYPTO_SUCCESS;
+
+ /* get the mech entry for the specified mechanism */
+ class = KCF_MECH2CLASS(mech_type_1);
+ if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS)) {
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ me_tab = &kcf_mech_tabs_tab[class];
+ index = KCF_MECH2INDEX(mech_type_1);
+ if ((index < 0) || (index >= me_tab->met_size)) {
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ me = &((me_tab->met_tab)[index]);
+ mutex_enter(&me->me_mutex);
+
+ /*
+ * We assume the provider descriptor will not go away because
+ * it is being held somewhere, i.e. its reference count has been
+ * incremented. In the case of the crypto module, the provider
+ * descriptor is held by the session structure.
+ */
+ if (old->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ if (old->pd_provider_list == NULL) {
+ real_pd = NULL;
+ rv = CRYPTO_DEVICE_ERROR;
+ goto out;
+ }
+ /*
+ * Find the least loaded real provider. KCF_PROV_LOAD gives
+ * the load (number of pending requests) of the provider.
+ */
+ mutex_enter(&old->pd_lock);
+ p = old->pd_provider_list;
+ while (p != NULL) {
+ provider = p->pl_provider;
+
+ ASSERT(provider->pd_prov_type !=
+ CRYPTO_LOGICAL_PROVIDER);
+
+ if (call_restrict &&
+ (provider->pd_flags & KCF_PROV_RESTRICTED)) {
+ p = p->pl_next;
+ continue;
+ }
+
+ if (!is_valid_provider_for_mech(provider, me, fg)) {
+ p = p->pl_next;
+ continue;
+ }
+
+ /* provider does second mech */
+ if (mech_type_2 != CRYPTO_MECH_INVALID) {
+ int i;
+
+ i = KCF_TO_PROV_MECH_INDX(provider,
+ mech_type_2);
+ if (i == KCF_INVALID_INDX) {
+ p = p->pl_next;
+ continue;
+ }
+ }
+
+ if (provider->pd_state != KCF_PROV_READY) {
+ /* choose BUSY if no READY providers */
+ if (provider->pd_state == KCF_PROV_BUSY)
+ bpd = provider;
+ p = p->pl_next;
+ continue;
+ }
+
+ len = KCF_PROV_LOAD(provider);
+ if (len < gqlen) {
+ gqlen = len;
+ gpd = provider;
+ }
+
+ p = p->pl_next;
+ }
+
+ if (gpd != NULL) {
+ real_pd = gpd;
+ KCF_PROV_REFHOLD(real_pd);
+ } else if (bpd != NULL) {
+ real_pd = bpd;
+ KCF_PROV_REFHOLD(real_pd);
+ } else {
+ /* can't find provider */
+ real_pd = NULL;
+ rv = CRYPTO_MECHANISM_INVALID;
+ }
+ mutex_exit(&old->pd_lock);
+
+ } else {
+ if (!KCF_IS_PROV_USABLE(old) ||
+ (call_restrict && (old->pd_flags & KCF_PROV_RESTRICTED))) {
+ real_pd = NULL;
+ rv = CRYPTO_DEVICE_ERROR;
+ goto out;
+ }
+
+ if (!is_valid_provider_for_mech(old, me, fg)) {
+ real_pd = NULL;
+ rv = CRYPTO_MECHANISM_INVALID;
+ goto out;
+ }
+
+ KCF_PROV_REFHOLD(real_pd);
+ }
+out:
+ mutex_exit(&me->me_mutex);
+ *new = real_pd;
+ return (rv);
+}
+
+/*
+ * Return the best provider for the specified mechanism. The provider
+ * is held and it is the caller's responsibility to release it when done.
+ * The fg input argument is used as a search criterion to pick a provider.
+ * A provider has to support this function group to be picked.
+ *
+ * Find the least loaded provider in the list of providers. We do a linear
+ * search to find one. This is fine as we assume there are only a small
+ * number of providers in this list. If this assumption ever changes,
+ * we should revisit this.
+ *
+ * call_restrict indicates whether the caller must not be allowed to
+ * use restricted providers.
+ */
+kcf_provider_desc_t *
+kcf_get_mech_provider(crypto_mech_type_t mech_type, kcf_mech_entry_t **mepp,
+ int *error, kcf_prov_tried_t *triedl, crypto_func_group_t fg,
+ boolean_t call_restrict, size_t data_size)
+{
+ kcf_provider_desc_t *pd = NULL, *gpd = NULL;
+ kcf_prov_mech_desc_t *prov_chain, *mdesc;
+ int len, gqlen = INT_MAX;
+ kcf_ops_class_t class;
+ int index;
+ kcf_mech_entry_t *me;
+ kcf_mech_entry_tab_t *me_tab;
+
+ class = KCF_MECH2CLASS(mech_type);
+ if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS)) {
+ *error = CRYPTO_MECHANISM_INVALID;
+ return (NULL);
+ }
+
+ me_tab = &kcf_mech_tabs_tab[class];
+ index = KCF_MECH2INDEX(mech_type);
+ if ((index < 0) || (index >= me_tab->met_size)) {
+ *error = CRYPTO_MECHANISM_INVALID;
+ return (NULL);
+ }
+
+ me = &((me_tab->met_tab)[index]);
+ if (mepp != NULL)
+ *mepp = me;
+
+ mutex_enter(&me->me_mutex);
+
+ prov_chain = me->me_hw_prov_chain;
+
+ /*
+	 * We check the threshold for using a hardware provider for
+ * this amount of data. If there is no software provider available
+ * for the mechanism, then the threshold is ignored.
+ */
+ if ((prov_chain != NULL) &&
+ ((data_size == 0) || (me->me_threshold == 0) ||
+ (data_size >= me->me_threshold) ||
+ ((mdesc = me->me_sw_prov) == NULL) ||
+ (!IS_FG_SUPPORTED(mdesc, fg)) ||
+ (!KCF_IS_PROV_USABLE(mdesc->pm_prov_desc)))) {
+ ASSERT(me->me_num_hwprov > 0);
+ /* there is at least one provider */
+
+ /*
+ * Find the least loaded real provider. KCF_PROV_LOAD gives
+ * the load (number of pending requests) of the provider.
+ */
+ while (prov_chain != NULL) {
+ pd = prov_chain->pm_prov_desc;
+
+ if (!IS_FG_SUPPORTED(prov_chain, fg) ||
+ !KCF_IS_PROV_USABLE(pd) ||
+ IS_PROVIDER_TRIED(pd, triedl) ||
+ (call_restrict &&
+ (pd->pd_flags & KCF_PROV_RESTRICTED))) {
+ prov_chain = prov_chain->pm_next;
+ continue;
+ }
+
+ if ((len = KCF_PROV_LOAD(pd)) < gqlen) {
+ gqlen = len;
+ gpd = pd;
+ }
+
+ prov_chain = prov_chain->pm_next;
+ }
+
+ pd = gpd;
+ }
+
+ /* No HW provider for this mech, is there a SW provider? */
+ if (pd == NULL && (mdesc = me->me_sw_prov) != NULL) {
+ pd = mdesc->pm_prov_desc;
+ if (!IS_FG_SUPPORTED(mdesc, fg) ||
+ !KCF_IS_PROV_USABLE(pd) ||
+ IS_PROVIDER_TRIED(pd, triedl) ||
+ (call_restrict && (pd->pd_flags & KCF_PROV_RESTRICTED)))
+ pd = NULL;
+ }
+
+ if (pd == NULL) {
+ /*
+ * We do not want to report CRYPTO_MECH_NOT_SUPPORTED, when
+ * we are in the "fallback to the next provider" case. Rather
+ * we preserve the error, so that the client gets the right
+ * error code.
+ */
+ if (triedl == NULL)
+ *error = CRYPTO_MECH_NOT_SUPPORTED;
+ } else
+ KCF_PROV_REFHOLD(pd);
+
+ mutex_exit(&me->me_mutex);
+ return (pd);
+}
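+
+/*
+ * Illustrative sketch only (not part of the imported source): a typical
+ * consumer lookup and the matching release. The surrounding context is
+ * hypothetical; only the calls shown appear in this framework.
+ *
+ *	int error;
+ *	kcf_provider_desc_t *pd;
+ *
+ *	pd = kcf_get_mech_provider(mech->cm_type, NULL, &error, NULL,
+ *	    CRYPTO_FG_DIGEST_ATOMIC, B_FALSE, data->cd_length);
+ *	if (pd == NULL)
+ *		return (error);
+ *
+ *	(submit the request to pd, e.g. via kcf_submit_request())
+ *
+ *	KCF_PROV_REFRELE(pd);
+ */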
+
+/*
+ * Very similar to kcf_get_mech_provider(). Finds the best provider capable of
+ * a dual operation with both me1 and me2.
+ * When no dual-ops capable provider is available, returns the best provider
+ * for me1 only and sets *prov_mt2 to CRYPTO_MECH_INVALID.
+ * We assume that even a slower HW provider capable of the dual operation is
+ * still faster than using the two fastest providers for the individual ops
+ * separately.
+ */
+kcf_provider_desc_t *
+kcf_get_dual_provider(crypto_mechanism_t *mech1, crypto_mechanism_t *mech2,
+ kcf_mech_entry_t **mepp, crypto_mech_type_t *prov_mt1,
+ crypto_mech_type_t *prov_mt2, int *error, kcf_prov_tried_t *triedl,
+ crypto_func_group_t fg1, crypto_func_group_t fg2, boolean_t call_restrict,
+ size_t data_size)
+{
+ kcf_provider_desc_t *pd = NULL, *pdm1 = NULL, *pdm1m2 = NULL;
+ kcf_prov_mech_desc_t *prov_chain, *mdesc;
+ int len, gqlen = INT_MAX, dgqlen = INT_MAX;
+ crypto_mech_info_list_t *mil;
+ crypto_mech_type_t m2id = mech2->cm_type;
+ kcf_mech_entry_t *me;
+
+	/* when mech1 is a valid mechanism, me will be its mech_entry */
+ if (kcf_get_mech_entry(mech1->cm_type, &me) != KCF_SUCCESS) {
+ *error = CRYPTO_MECHANISM_INVALID;
+ return (NULL);
+ }
+
+ *prov_mt2 = CRYPTO_MECH_INVALID;
+
+ if (mepp != NULL)
+ *mepp = me;
+ mutex_enter(&me->me_mutex);
+
+ prov_chain = me->me_hw_prov_chain;
+ /*
+ * We check the threshold for using a hardware provider for
+ * this amount of data. If there is no software provider available
+ * for the first mechanism, then the threshold is ignored.
+ */
+ if ((prov_chain != NULL) &&
+ ((data_size == 0) || (me->me_threshold == 0) ||
+ (data_size >= me->me_threshold) ||
+ ((mdesc = me->me_sw_prov) == NULL) ||
+ (!IS_FG_SUPPORTED(mdesc, fg1)) ||
+ (!KCF_IS_PROV_USABLE(mdesc->pm_prov_desc)))) {
+ /* there is at least one provider */
+ ASSERT(me->me_num_hwprov > 0);
+
+ /*
+ * Find the least loaded provider capable of the combo
+ * me1 + me2, and save a pointer to the least loaded
+ * provider capable of me1 only.
+ */
+ while (prov_chain != NULL) {
+ pd = prov_chain->pm_prov_desc;
+ len = KCF_PROV_LOAD(pd);
+
+ if (!IS_FG_SUPPORTED(prov_chain, fg1) ||
+ !KCF_IS_PROV_USABLE(pd) ||
+ IS_PROVIDER_TRIED(pd, triedl) ||
+ (call_restrict &&
+ (pd->pd_flags & KCF_PROV_RESTRICTED))) {
+ prov_chain = prov_chain->pm_next;
+ continue;
+ }
+
+ /* Save the best provider capable of m1 */
+ if (len < gqlen) {
+ *prov_mt1 =
+ prov_chain->pm_mech_info.cm_mech_number;
+ gqlen = len;
+ pdm1 = pd;
+ }
+
+ /* See if pd can do me2 too */
+ for (mil = prov_chain->pm_mi_list;
+ mil != NULL; mil = mil->ml_next) {
+ if ((mil->ml_mech_info.cm_func_group_mask &
+ fg2) == 0)
+ continue;
+
+ if ((mil->ml_kcf_mechid == m2id) &&
+ (len < dgqlen)) {
+ /* Bingo! */
+ dgqlen = len;
+ pdm1m2 = pd;
+ *prov_mt2 =
+ mil->ml_mech_info.cm_mech_number;
+ *prov_mt1 = prov_chain->
+ pm_mech_info.cm_mech_number;
+ break;
+ }
+ }
+
+ prov_chain = prov_chain->pm_next;
+ }
+
+ pd = (pdm1m2 != NULL) ? pdm1m2 : pdm1;
+ }
+
+ /* no HW provider for this mech, is there a SW provider? */
+ if (pd == NULL && (mdesc = me->me_sw_prov) != NULL) {
+ pd = mdesc->pm_prov_desc;
+ if (!IS_FG_SUPPORTED(mdesc, fg1) ||
+ !KCF_IS_PROV_USABLE(pd) ||
+ IS_PROVIDER_TRIED(pd, triedl) ||
+ (call_restrict && (pd->pd_flags & KCF_PROV_RESTRICTED)))
+ pd = NULL;
+ else {
+ /* See if pd can do me2 too */
+ for (mil = me->me_sw_prov->pm_mi_list;
+ mil != NULL; mil = mil->ml_next) {
+ if ((mil->ml_mech_info.cm_func_group_mask &
+ fg2) == 0)
+ continue;
+
+ if (mil->ml_kcf_mechid == m2id) {
+ /* Bingo! */
+ *prov_mt2 =
+ mil->ml_mech_info.cm_mech_number;
+ break;
+ }
+ }
+ *prov_mt1 = me->me_sw_prov->pm_mech_info.cm_mech_number;
+ }
+ }
+
+ if (pd == NULL)
+ *error = CRYPTO_MECH_NOT_SUPPORTED;
+ else
+ KCF_PROV_REFHOLD(pd);
+
+ mutex_exit(&me->me_mutex);
+ return (pd);
+}
+
+/*
+ * Do the actual work of calling the provider routines.
+ *
+ * pd - Provider structure
+ * ctx - Context for this operation
+ * params - Parameters for this operation
+ * rhndl - Request handle to use for notification
+ *
+ * The return values are the same as those of the respective SPI.
+ */
+int
+common_submit_request(kcf_provider_desc_t *pd, crypto_ctx_t *ctx,
+ kcf_req_params_t *params, crypto_req_handle_t rhndl)
+{
+ int err = CRYPTO_ARGUMENTS_BAD;
+ kcf_op_type_t optype;
+
+ optype = params->rp_optype;
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_DIGEST: {
+ kcf_digest_ops_params_t *dops = &params->rp_u.digest_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ /*
+ * We should do this only here and not in KCF_WRAP_*
+ * macros. This is because we may want to try other
+			 * providers in order to recover from a failure.
+ */
+ KCF_SET_PROVIDER_MECHNUM(dops->do_framework_mechtype,
+ pd, &dops->do_mech);
+
+ err = KCF_PROV_DIGEST_INIT(pd, ctx, &dops->do_mech,
+ rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_DIGEST(pd, ctx, dops->do_data,
+ dops->do_digest, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_DIGEST_UPDATE(pd, ctx,
+ dops->do_data, rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_DIGEST_FINAL(pd, ctx,
+ dops->do_digest, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(dops->do_framework_mechtype,
+ pd, &dops->do_mech);
+ err = KCF_PROV_DIGEST_ATOMIC(pd, dops->do_sid,
+ &dops->do_mech, dops->do_data, dops->do_digest,
+ rhndl);
+ break;
+
+ case KCF_OP_DIGEST_KEY:
+ err = KCF_PROV_DIGEST_KEY(pd, ctx, dops->do_digest_key,
+ rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_MAC: {
+ kcf_mac_ops_params_t *mops = &params->rp_u.mac_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ KCF_SET_PROVIDER_MECHNUM(mops->mo_framework_mechtype,
+ pd, &mops->mo_mech);
+
+ err = KCF_PROV_MAC_INIT(pd, ctx, &mops->mo_mech,
+ mops->mo_key, mops->mo_templ, rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_MAC(pd, ctx, mops->mo_data,
+ mops->mo_mac, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_MAC_UPDATE(pd, ctx, mops->mo_data,
+ rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_MAC_FINAL(pd, ctx, mops->mo_mac, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(mops->mo_framework_mechtype,
+ pd, &mops->mo_mech);
+
+ err = KCF_PROV_MAC_ATOMIC(pd, mops->mo_sid,
+ &mops->mo_mech, mops->mo_key, mops->mo_data,
+ mops->mo_mac, mops->mo_templ, rhndl);
+ break;
+
+ case KCF_OP_MAC_VERIFY_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(mops->mo_framework_mechtype,
+ pd, &mops->mo_mech);
+
+ err = KCF_PROV_MAC_VERIFY_ATOMIC(pd, mops->mo_sid,
+ &mops->mo_mech, mops->mo_key, mops->mo_data,
+ mops->mo_mac, mops->mo_templ, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_ENCRYPT: {
+ kcf_encrypt_ops_params_t *eops = &params->rp_u.encrypt_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ KCF_SET_PROVIDER_MECHNUM(eops->eo_framework_mechtype,
+ pd, &eops->eo_mech);
+
+ err = KCF_PROV_ENCRYPT_INIT(pd, ctx, &eops->eo_mech,
+ eops->eo_key, eops->eo_templ, rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_ENCRYPT(pd, ctx, eops->eo_plaintext,
+ eops->eo_ciphertext, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_ENCRYPT_UPDATE(pd, ctx,
+ eops->eo_plaintext, eops->eo_ciphertext, rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_ENCRYPT_FINAL(pd, ctx,
+ eops->eo_ciphertext, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(eops->eo_framework_mechtype,
+ pd, &eops->eo_mech);
+
+ err = KCF_PROV_ENCRYPT_ATOMIC(pd, eops->eo_sid,
+ &eops->eo_mech, eops->eo_key, eops->eo_plaintext,
+ eops->eo_ciphertext, eops->eo_templ, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_DECRYPT: {
+ kcf_decrypt_ops_params_t *dcrops = &params->rp_u.decrypt_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ KCF_SET_PROVIDER_MECHNUM(dcrops->dop_framework_mechtype,
+ pd, &dcrops->dop_mech);
+
+ err = KCF_PROV_DECRYPT_INIT(pd, ctx, &dcrops->dop_mech,
+ dcrops->dop_key, dcrops->dop_templ, rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_DECRYPT(pd, ctx, dcrops->dop_ciphertext,
+ dcrops->dop_plaintext, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_DECRYPT_UPDATE(pd, ctx,
+ dcrops->dop_ciphertext, dcrops->dop_plaintext,
+ rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_DECRYPT_FINAL(pd, ctx,
+ dcrops->dop_plaintext, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(dcrops->dop_framework_mechtype,
+ pd, &dcrops->dop_mech);
+
+ err = KCF_PROV_DECRYPT_ATOMIC(pd, dcrops->dop_sid,
+ &dcrops->dop_mech, dcrops->dop_key,
+ dcrops->dop_ciphertext, dcrops->dop_plaintext,
+ dcrops->dop_templ, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_SIGN: {
+ kcf_sign_ops_params_t *sops = &params->rp_u.sign_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype,
+ pd, &sops->so_mech);
+
+ err = KCF_PROV_SIGN_INIT(pd, ctx, &sops->so_mech,
+ sops->so_key, sops->so_templ, rhndl);
+ break;
+
+ case KCF_OP_SIGN_RECOVER_INIT:
+ KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype,
+ pd, &sops->so_mech);
+
+ err = KCF_PROV_SIGN_RECOVER_INIT(pd, ctx,
+ &sops->so_mech, sops->so_key, sops->so_templ,
+ rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_SIGN(pd, ctx, sops->so_data,
+ sops->so_signature, rhndl);
+ break;
+
+ case KCF_OP_SIGN_RECOVER:
+ err = KCF_PROV_SIGN_RECOVER(pd, ctx,
+ sops->so_data, sops->so_signature, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_SIGN_UPDATE(pd, ctx, sops->so_data,
+ rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_SIGN_FINAL(pd, ctx, sops->so_signature,
+ rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype,
+ pd, &sops->so_mech);
+
+ err = KCF_PROV_SIGN_ATOMIC(pd, sops->so_sid,
+ &sops->so_mech, sops->so_key, sops->so_data,
+ sops->so_templ, sops->so_signature, rhndl);
+ break;
+
+ case KCF_OP_SIGN_RECOVER_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype,
+ pd, &sops->so_mech);
+
+ err = KCF_PROV_SIGN_RECOVER_ATOMIC(pd, sops->so_sid,
+ &sops->so_mech, sops->so_key, sops->so_data,
+ sops->so_templ, sops->so_signature, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_VERIFY: {
+ kcf_verify_ops_params_t *vops = &params->rp_u.verify_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype,
+ pd, &vops->vo_mech);
+
+ err = KCF_PROV_VERIFY_INIT(pd, ctx, &vops->vo_mech,
+ vops->vo_key, vops->vo_templ, rhndl);
+ break;
+
+ case KCF_OP_VERIFY_RECOVER_INIT:
+ KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype,
+ pd, &vops->vo_mech);
+
+ err = KCF_PROV_VERIFY_RECOVER_INIT(pd, ctx,
+ &vops->vo_mech, vops->vo_key, vops->vo_templ,
+ rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_VERIFY(pd, ctx, vops->vo_data,
+ vops->vo_signature, rhndl);
+ break;
+
+ case KCF_OP_VERIFY_RECOVER:
+ err = KCF_PROV_VERIFY_RECOVER(pd, ctx,
+ vops->vo_signature, vops->vo_data, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_VERIFY_UPDATE(pd, ctx, vops->vo_data,
+ rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_VERIFY_FINAL(pd, ctx, vops->vo_signature,
+ rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype,
+ pd, &vops->vo_mech);
+
+ err = KCF_PROV_VERIFY_ATOMIC(pd, vops->vo_sid,
+ &vops->vo_mech, vops->vo_key, vops->vo_data,
+ vops->vo_templ, vops->vo_signature, rhndl);
+ break;
+
+ case KCF_OP_VERIFY_RECOVER_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype,
+ pd, &vops->vo_mech);
+
+ err = KCF_PROV_VERIFY_RECOVER_ATOMIC(pd, vops->vo_sid,
+ &vops->vo_mech, vops->vo_key, vops->vo_signature,
+ vops->vo_templ, vops->vo_data, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_ENCRYPT_MAC: {
+ kcf_encrypt_mac_ops_params_t *eops =
+ &params->rp_u.encrypt_mac_params;
+ kcf_context_t *kcf_secondctx;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ KCF_SET_PROVIDER_MECHNUM(
+ eops->em_framework_encr_mechtype,
+ pd, &eops->em_encr_mech);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ eops->em_framework_mac_mechtype,
+ pd, &eops->em_mac_mech);
+
+ err = KCF_PROV_ENCRYPT_MAC_INIT(pd, ctx,
+ &eops->em_encr_mech, eops->em_encr_key,
+ &eops->em_mac_mech, eops->em_mac_key,
+ eops->em_encr_templ, eops->em_mac_templ,
+ rhndl);
+
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_ENCRYPT_MAC(pd, ctx,
+ eops->em_plaintext, eops->em_ciphertext,
+ eops->em_mac, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ err = KCF_PROV_ENCRYPT_MAC_UPDATE(pd, ctx,
+ eops->em_plaintext, eops->em_ciphertext, rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ err = KCF_PROV_ENCRYPT_MAC_FINAL(pd, ctx,
+ eops->em_ciphertext, eops->em_mac, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ eops->em_framework_encr_mechtype,
+ pd, &eops->em_encr_mech);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ eops->em_framework_mac_mechtype,
+ pd, &eops->em_mac_mech);
+
+ err = KCF_PROV_ENCRYPT_MAC_ATOMIC(pd, eops->em_sid,
+ &eops->em_encr_mech, eops->em_encr_key,
+ &eops->em_mac_mech, eops->em_mac_key,
+ eops->em_plaintext, eops->em_ciphertext,
+ eops->em_mac,
+ eops->em_encr_templ, eops->em_mac_templ,
+ rhndl);
+
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_MAC_DECRYPT: {
+ kcf_mac_decrypt_ops_params_t *dops =
+ &params->rp_u.mac_decrypt_params;
+ kcf_context_t *kcf_secondctx;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_mac_mechtype,
+ pd, &dops->md_mac_mech);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_decr_mechtype,
+ pd, &dops->md_decr_mech);
+
+ err = KCF_PROV_MAC_DECRYPT_INIT(pd, ctx,
+ &dops->md_mac_mech, dops->md_mac_key,
+ &dops->md_decr_mech, dops->md_decr_key,
+ dops->md_mac_templ, dops->md_decr_templ,
+ rhndl);
+
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_MAC_DECRYPT(pd, ctx,
+ dops->md_ciphertext, dops->md_mac,
+ dops->md_plaintext, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ err = KCF_PROV_MAC_DECRYPT_UPDATE(pd, ctx,
+ dops->md_ciphertext, dops->md_plaintext, rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ err = KCF_PROV_MAC_DECRYPT_FINAL(pd, ctx,
+ dops->md_mac, dops->md_plaintext, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_mac_mechtype,
+ pd, &dops->md_mac_mech);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_decr_mechtype,
+ pd, &dops->md_decr_mech);
+
+ err = KCF_PROV_MAC_DECRYPT_ATOMIC(pd, dops->md_sid,
+ &dops->md_mac_mech, dops->md_mac_key,
+ &dops->md_decr_mech, dops->md_decr_key,
+ dops->md_ciphertext, dops->md_mac,
+ dops->md_plaintext,
+ dops->md_mac_templ, dops->md_decr_templ,
+ rhndl);
+
+ break;
+
+ case KCF_OP_MAC_VERIFY_DECRYPT_ATOMIC:
+ ASSERT(ctx == NULL);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_mac_mechtype,
+ pd, &dops->md_mac_mech);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_decr_mechtype,
+ pd, &dops->md_decr_mech);
+
+ err = KCF_PROV_MAC_VERIFY_DECRYPT_ATOMIC(pd,
+ dops->md_sid, &dops->md_mac_mech, dops->md_mac_key,
+ &dops->md_decr_mech, dops->md_decr_key,
+ dops->md_ciphertext, dops->md_mac,
+ dops->md_plaintext,
+ dops->md_mac_templ, dops->md_decr_templ,
+ rhndl);
+
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_KEY: {
+ kcf_key_ops_params_t *kops = &params->rp_u.key_params;
+
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(kops->ko_framework_mechtype, pd,
+ &kops->ko_mech);
+
+ switch (optype) {
+ case KCF_OP_KEY_GENERATE:
+ err = KCF_PROV_KEY_GENERATE(pd, kops->ko_sid,
+ &kops->ko_mech,
+ kops->ko_key_template, kops->ko_key_attribute_count,
+ kops->ko_key_object_id_ptr, rhndl);
+ break;
+
+ case KCF_OP_KEY_GENERATE_PAIR:
+ err = KCF_PROV_KEY_GENERATE_PAIR(pd, kops->ko_sid,
+ &kops->ko_mech,
+ kops->ko_key_template, kops->ko_key_attribute_count,
+ kops->ko_private_key_template,
+ kops->ko_private_key_attribute_count,
+ kops->ko_key_object_id_ptr,
+ kops->ko_private_key_object_id_ptr, rhndl);
+ break;
+
+ case KCF_OP_KEY_WRAP:
+ err = KCF_PROV_KEY_WRAP(pd, kops->ko_sid,
+ &kops->ko_mech,
+ kops->ko_key, kops->ko_key_object_id_ptr,
+ kops->ko_wrapped_key, kops->ko_wrapped_key_len_ptr,
+ rhndl);
+ break;
+
+ case KCF_OP_KEY_UNWRAP:
+ err = KCF_PROV_KEY_UNWRAP(pd, kops->ko_sid,
+ &kops->ko_mech,
+ kops->ko_key, kops->ko_wrapped_key,
+ kops->ko_wrapped_key_len_ptr,
+ kops->ko_key_template, kops->ko_key_attribute_count,
+ kops->ko_key_object_id_ptr, rhndl);
+ break;
+
+ case KCF_OP_KEY_DERIVE:
+ err = KCF_PROV_KEY_DERIVE(pd, kops->ko_sid,
+ &kops->ko_mech,
+ kops->ko_key, kops->ko_key_template,
+ kops->ko_key_attribute_count,
+ kops->ko_key_object_id_ptr, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_RANDOM: {
+ kcf_random_number_ops_params_t *rops =
+ &params->rp_u.random_number_params;
+
+ ASSERT(ctx == NULL);
+
+ switch (optype) {
+ case KCF_OP_RANDOM_SEED:
+ err = KCF_PROV_SEED_RANDOM(pd, rops->rn_sid,
+ rops->rn_buf, rops->rn_buflen, rops->rn_entropy_est,
+ rops->rn_flags, rhndl);
+ break;
+
+ case KCF_OP_RANDOM_GENERATE:
+ err = KCF_PROV_GENERATE_RANDOM(pd, rops->rn_sid,
+ rops->rn_buf, rops->rn_buflen, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_SESSION: {
+ kcf_session_ops_params_t *sops = &params->rp_u.session_params;
+
+ ASSERT(ctx == NULL);
+ switch (optype) {
+ case KCF_OP_SESSION_OPEN:
+ /*
+ * so_pd may be a logical provider, in which case
+ * we need to check whether it has been removed.
+ */
+ if (KCF_IS_PROV_REMOVED(sops->so_pd)) {
+ err = CRYPTO_DEVICE_ERROR;
+ break;
+ }
+ err = KCF_PROV_SESSION_OPEN(pd, sops->so_sid_ptr,
+ rhndl, sops->so_pd);
+ break;
+
+ case KCF_OP_SESSION_CLOSE:
+ /*
+ * so_pd may be a logical provider, in which case
+ * we need to check whether it has been removed.
+ */
+ if (KCF_IS_PROV_REMOVED(sops->so_pd)) {
+ err = CRYPTO_DEVICE_ERROR;
+ break;
+ }
+ err = KCF_PROV_SESSION_CLOSE(pd, sops->so_sid,
+ rhndl, sops->so_pd);
+ break;
+
+ case KCF_OP_SESSION_LOGIN:
+ err = KCF_PROV_SESSION_LOGIN(pd, sops->so_sid,
+ sops->so_user_type, sops->so_pin,
+ sops->so_pin_len, rhndl);
+ break;
+
+ case KCF_OP_SESSION_LOGOUT:
+ err = KCF_PROV_SESSION_LOGOUT(pd, sops->so_sid, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_OBJECT: {
+ kcf_object_ops_params_t *jops = &params->rp_u.object_params;
+
+ ASSERT(ctx == NULL);
+ switch (optype) {
+ case KCF_OP_OBJECT_CREATE:
+ err = KCF_PROV_OBJECT_CREATE(pd, jops->oo_sid,
+ jops->oo_template, jops->oo_attribute_count,
+ jops->oo_object_id_ptr, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_COPY:
+ err = KCF_PROV_OBJECT_COPY(pd, jops->oo_sid,
+ jops->oo_object_id,
+ jops->oo_template, jops->oo_attribute_count,
+ jops->oo_object_id_ptr, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_DESTROY:
+ err = KCF_PROV_OBJECT_DESTROY(pd, jops->oo_sid,
+ jops->oo_object_id, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_GET_SIZE:
+ err = KCF_PROV_OBJECT_GET_SIZE(pd, jops->oo_sid,
+ jops->oo_object_id, jops->oo_object_size, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_GET_ATTRIBUTE_VALUE:
+ err = KCF_PROV_OBJECT_GET_ATTRIBUTE_VALUE(pd,
+ jops->oo_sid, jops->oo_object_id,
+ jops->oo_template, jops->oo_attribute_count, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_SET_ATTRIBUTE_VALUE:
+ err = KCF_PROV_OBJECT_SET_ATTRIBUTE_VALUE(pd,
+ jops->oo_sid, jops->oo_object_id,
+ jops->oo_template, jops->oo_attribute_count, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_FIND_INIT:
+ err = KCF_PROV_OBJECT_FIND_INIT(pd, jops->oo_sid,
+ jops->oo_template, jops->oo_attribute_count,
+ jops->oo_find_init_pp_ptr, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_FIND:
+ err = KCF_PROV_OBJECT_FIND(pd, jops->oo_find_pp,
+ jops->oo_object_id_ptr, jops->oo_max_object_count,
+ jops->oo_object_count_ptr, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_FIND_FINAL:
+ err = KCF_PROV_OBJECT_FIND_FINAL(pd, jops->oo_find_pp,
+ rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_PROVMGMT: {
+ kcf_provmgmt_ops_params_t *pops = &params->rp_u.provmgmt_params;
+
+ ASSERT(ctx == NULL);
+ switch (optype) {
+ case KCF_OP_MGMT_EXTINFO:
+ /*
+ * po_pd may be a logical provider, in which case
+ * we need to check whether it has been removed.
+ */
+ if (KCF_IS_PROV_REMOVED(pops->po_pd)) {
+ err = CRYPTO_DEVICE_ERROR;
+ break;
+ }
+ err = KCF_PROV_EXT_INFO(pd, pops->po_ext_info, rhndl,
+ pops->po_pd);
+ break;
+
+ case KCF_OP_MGMT_INITTOKEN:
+ err = KCF_PROV_INIT_TOKEN(pd, pops->po_pin,
+ pops->po_pin_len, pops->po_label, rhndl);
+ break;
+
+ case KCF_OP_MGMT_INITPIN:
+ err = KCF_PROV_INIT_PIN(pd, pops->po_sid, pops->po_pin,
+ pops->po_pin_len, rhndl);
+ break;
+
+ case KCF_OP_MGMT_SETPIN:
+ err = KCF_PROV_SET_PIN(pd, pops->po_sid,
+ pops->po_old_pin, pops->po_old_pin_len,
+ pops->po_pin, pops->po_pin_len, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_NOSTORE_KEY: {
+ kcf_key_ops_params_t *kops = &params->rp_u.key_params;
+
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(kops->ko_framework_mechtype, pd,
+ &kops->ko_mech);
+
+ switch (optype) {
+ case KCF_OP_KEY_GENERATE:
+ err = KCF_PROV_NOSTORE_KEY_GENERATE(pd, kops->ko_sid,
+ &kops->ko_mech, kops->ko_key_template,
+ kops->ko_key_attribute_count,
+ kops->ko_out_template1,
+ kops->ko_out_attribute_count1, rhndl);
+ break;
+
+ case KCF_OP_KEY_GENERATE_PAIR:
+ err = KCF_PROV_NOSTORE_KEY_GENERATE_PAIR(pd,
+ kops->ko_sid, &kops->ko_mech,
+ kops->ko_key_template, kops->ko_key_attribute_count,
+ kops->ko_private_key_template,
+ kops->ko_private_key_attribute_count,
+ kops->ko_out_template1,
+ kops->ko_out_attribute_count1,
+ kops->ko_out_template2,
+ kops->ko_out_attribute_count2,
+ rhndl);
+ break;
+
+ case KCF_OP_KEY_DERIVE:
+ err = KCF_PROV_NOSTORE_KEY_DERIVE(pd, kops->ko_sid,
+ &kops->ko_mech, kops->ko_key,
+ kops->ko_key_template,
+ kops->ko_key_attribute_count,
+ kops->ko_out_template1,
+ kops->ko_out_attribute_count1, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ } /* end of switch(params->rp_opgrp) */
+
+ KCF_PROV_INCRSTATS(pd, err);
+ return (err);
+}
+
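+/*
+ * Illustrative sketch only (not part of the imported source): how a request
+ * reaches the dispatcher above. A caller wraps its arguments with one of the
+ * KCF_WRAP_*_OPS_PARAMS macros and hands them to kcf_submit_request(); for a
+ * KCF_OP_ATOMIC encrypt, the KCF_OG_ENCRYPT arm above then invokes
+ * KCF_PROV_ENCRYPT_ATOMIC(). The variables below are placeholders.
+ *
+ *	kcf_req_params_t params;
+ *
+ *	KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, pd->pd_sid,
+ *	    &mech, key, plaintext, ciphertext, templ);
+ *	error = kcf_submit_request(pd, NULL, NULL, &params, B_FALSE);
+ */
+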
+
+/*
+ * Emulate a multipart dual operation with two single-operation steps.
+ * This routine is always called in the context of a worker thread
+ * running kcf_svc_do_run().
+ * The single steps are submitted synchronously (blocking).
+ * When this routine returns, kcf_svc_do_run() will call kcf_aop_done()
+ * so the originating consumer's callback gets invoked. kcf_aop_done()
+ * takes care of freeing the operation context. So, this routine does
+ * not free the operation context.
+ *
+ * The provider descriptor is assumed to be held by the caller.
+ */
+static int
+kcf_emulate_dual(kcf_provider_desc_t *pd, crypto_ctx_t *ctx,
+ kcf_req_params_t *params)
+{
+ int err = CRYPTO_ARGUMENTS_BAD;
+ kcf_op_type_t optype;
+ size_t save_len;
+ off_t save_offset;
+
+ optype = params->rp_optype;
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_ENCRYPT_MAC: {
+ kcf_encrypt_mac_ops_params_t *cmops =
+ &params->rp_u.encrypt_mac_params;
+ kcf_context_t *encr_kcf_ctx;
+ crypto_ctx_t *mac_ctx;
+ kcf_req_params_t encr_params;
+
+ encr_kcf_ctx = (kcf_context_t *)(ctx->cc_framework_private);
+
+ switch (optype) {
+ case KCF_OP_INIT: {
+ encr_kcf_ctx->kc_secondctx = NULL;
+
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&encr_params, KCF_OP_INIT,
+ pd->pd_sid, &cmops->em_encr_mech,
+ cmops->em_encr_key, NULL, NULL,
+ cmops->em_encr_templ);
+
+ err = kcf_submit_request(pd, ctx, NULL, &encr_params,
+ B_FALSE);
+
+ /* It can't be CRYPTO_QUEUED */
+ if (err != CRYPTO_SUCCESS) {
+ break;
+ }
+
+ err = crypto_mac_init(&cmops->em_mac_mech,
+ cmops->em_mac_key, cmops->em_mac_templ,
+ (crypto_context_t *)&mac_ctx, NULL);
+
+ if (err == CRYPTO_SUCCESS) {
+ encr_kcf_ctx->kc_secondctx = (kcf_context_t *)
+ mac_ctx->cc_framework_private;
+ KCF_CONTEXT_REFHOLD((kcf_context_t *)
+ mac_ctx->cc_framework_private);
+ }
+
+ break;
+
+ }
+ case KCF_OP_UPDATE: {
+ crypto_dual_data_t *ct = cmops->em_ciphertext;
+ crypto_data_t *pt = cmops->em_plaintext;
+ kcf_context_t *mac_kcf_ctx = encr_kcf_ctx->kc_secondctx;
+ crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx;
+
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&encr_params, KCF_OP_UPDATE,
+ pd->pd_sid, NULL, NULL, pt, (crypto_data_t *)ct,
+ NULL);
+
+ err = kcf_submit_request(pd, ctx, NULL, &encr_params,
+ B_FALSE);
+
+ /* It can't be CRYPTO_QUEUED */
+ if (err != CRYPTO_SUCCESS) {
+ break;
+ }
+
+ save_offset = ct->dd_offset1;
+ save_len = ct->dd_len1;
+ if (ct->dd_len2 == 0) {
+ /*
+ * The previous encrypt step was an
+ * accumulation only and didn't produce any
+ * partial output
+ */
+ if (ct->dd_len1 == 0)
+ break;
+
+ } else {
+ ct->dd_offset1 = ct->dd_offset2;
+ ct->dd_len1 = ct->dd_len2;
+ }
+ err = crypto_mac_update((crypto_context_t)mac_ctx,
+ (crypto_data_t *)ct, NULL);
+
+ ct->dd_offset1 = save_offset;
+ ct->dd_len1 = save_len;
+
+ break;
+ }
+ case KCF_OP_FINAL: {
+ crypto_dual_data_t *ct = cmops->em_ciphertext;
+ crypto_data_t *mac = cmops->em_mac;
+ kcf_context_t *mac_kcf_ctx = encr_kcf_ctx->kc_secondctx;
+ crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx;
+ crypto_context_t mac_context = mac_ctx;
+
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&encr_params, KCF_OP_FINAL,
+ pd->pd_sid, NULL, NULL, NULL, (crypto_data_t *)ct,
+ NULL);
+
+ err = kcf_submit_request(pd, ctx, NULL, &encr_params,
+ B_FALSE);
+
+ /* It can't be CRYPTO_QUEUED */
+ if (err != CRYPTO_SUCCESS) {
+ crypto_cancel_ctx(mac_context);
+ break;
+ }
+
+ if (ct->dd_len2 > 0) {
+ save_offset = ct->dd_offset1;
+ save_len = ct->dd_len1;
+ ct->dd_offset1 = ct->dd_offset2;
+ ct->dd_len1 = ct->dd_len2;
+
+ err = crypto_mac_update(mac_context,
+ (crypto_data_t *)ct, NULL);
+
+ ct->dd_offset1 = save_offset;
+ ct->dd_len1 = save_len;
+
+ if (err != CRYPTO_SUCCESS) {
+ crypto_cancel_ctx(mac_context);
+ return (err);
+ }
+ }
+
+ /* and finally, collect the MAC */
+ err = crypto_mac_final(mac_context, mac, NULL);
+ break;
+ }
+
+ default:
+ break;
+ }
+ KCF_PROV_INCRSTATS(pd, err);
+ break;
+ }
+ case KCF_OG_MAC_DECRYPT: {
+ kcf_mac_decrypt_ops_params_t *mdops =
+ &params->rp_u.mac_decrypt_params;
+ kcf_context_t *decr_kcf_ctx;
+ crypto_ctx_t *mac_ctx;
+ kcf_req_params_t decr_params;
+
+ decr_kcf_ctx = (kcf_context_t *)(ctx->cc_framework_private);
+
+ switch (optype) {
+ case KCF_OP_INIT: {
+ decr_kcf_ctx->kc_secondctx = NULL;
+
+ err = crypto_mac_init(&mdops->md_mac_mech,
+ mdops->md_mac_key, mdops->md_mac_templ,
+ (crypto_context_t *)&mac_ctx, NULL);
+
+ /* It can't be CRYPTO_QUEUED */
+ if (err != CRYPTO_SUCCESS) {
+ break;
+ }
+
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&decr_params, KCF_OP_INIT,
+ pd->pd_sid, &mdops->md_decr_mech,
+ mdops->md_decr_key, NULL, NULL,
+ mdops->md_decr_templ);
+
+ err = kcf_submit_request(pd, ctx, NULL, &decr_params,
+ B_FALSE);
+
+ /* It can't be CRYPTO_QUEUED */
+ if (err != CRYPTO_SUCCESS) {
+ crypto_cancel_ctx((crypto_context_t)mac_ctx);
+ break;
+ }
+
+ decr_kcf_ctx->kc_secondctx = (kcf_context_t *)
+ mac_ctx->cc_framework_private;
+ KCF_CONTEXT_REFHOLD((kcf_context_t *)
+ mac_ctx->cc_framework_private);
+
+			break;
+		}
+ case KCF_OP_UPDATE: {
+ crypto_dual_data_t *ct = mdops->md_ciphertext;
+ crypto_data_t *pt = mdops->md_plaintext;
+ kcf_context_t *mac_kcf_ctx = decr_kcf_ctx->kc_secondctx;
+ crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx;
+
+ err = crypto_mac_update((crypto_context_t)mac_ctx,
+ (crypto_data_t *)ct, NULL);
+
+ if (err != CRYPTO_SUCCESS)
+ break;
+
+ save_offset = ct->dd_offset1;
+ save_len = ct->dd_len1;
+
+ /* zero ct->dd_len2 means decrypt everything */
+ if (ct->dd_len2 > 0) {
+ ct->dd_offset1 = ct->dd_offset2;
+ ct->dd_len1 = ct->dd_len2;
+ }
+
+ err = crypto_decrypt_update((crypto_context_t)ctx,
+ (crypto_data_t *)ct, pt, NULL);
+
+ ct->dd_offset1 = save_offset;
+ ct->dd_len1 = save_len;
+
+ break;
+ }
+ case KCF_OP_FINAL: {
+ crypto_data_t *pt = mdops->md_plaintext;
+ crypto_data_t *mac = mdops->md_mac;
+ kcf_context_t *mac_kcf_ctx = decr_kcf_ctx->kc_secondctx;
+ crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx;
+
+ err = crypto_mac_final((crypto_context_t)mac_ctx,
+ mac, NULL);
+
+ if (err != CRYPTO_SUCCESS) {
+ crypto_cancel_ctx(ctx);
+ break;
+ }
+
+ /* Get the last chunk of plaintext */
+ KCF_CONTEXT_REFHOLD(decr_kcf_ctx);
+ err = crypto_decrypt_final((crypto_context_t)ctx, pt,
+ NULL);
+
+ break;
+ }
+		default:
+			break;
+		}
+ break;
+ }
+ default:
+
+ break;
+ } /* end of switch(params->rp_opgrp) */
+
+ return (err);
+}
diff --git a/sys/contrib/openzfs/module/icp/core/kcf_mech_tabs.c b/sys/contrib/openzfs/module/icp/core/kcf_mech_tabs.c
new file mode 100644
index 000000000000..2642b317d698
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/core/kcf_mech_tabs.c
@@ -0,0 +1,791 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/impl.h>
+#include <sys/modhash.h>
+
+/* Cryptographic mechanisms tables and their access functions */
+
+/*
+ * Internal numbers assigned to mechanisms are coded as follows:
+ *
+ * +----------------+----------------+
+ * | mech. class | mech. index |
+ * <--- 32-bits --->+<--- 32-bits --->
+ *
+ * the mech_class identifies the table the mechanism belongs to.
+ * mech_index is the index for that mechanism in the table.
+ * A mechanism belongs to exactly 1 table.
+ * The tables are:
+ * . digest_mechs_tab[] for the msg digest mechs.
+ * . cipher_mechs_tab[] for encrypt/decrypt and wrap/unwrap mechs.
+ * . mac_mechs_tab[] for MAC mechs.
+ * . sign_mechs_tab[] for sign & verify mechs.
+ * . keyops_mechs_tab[] for key/key pair generation, and key derivation.
+ * . misc_mechs_tab[] for mechs that don't belong to any of the above.
+ *
+ * There are no holes in the tables.
+ */
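+
+/*
+ * Illustrative sketch only (not part of the imported source): encoding and
+ * decoding macros consistent with the layout described above. The actual
+ * definitions live in sys/crypto/impl.h and may differ in detail.
+ *
+ *	#define	KCF_MECHID(class, index)				\
+ *		(((crypto_mech_type_t)(class) << 32) | (uint32_t)(index))
+ *	#define	KCF_MECH2CLASS(mech_type)				\
+ *		((kcf_ops_class_t)((mech_type) >> 32))
+ *	#define	KCF_MECH2INDEX(mech_type)				\
+ *		((int)((mech_type) & 0xFFFFFFFF))
+ *
+ * For example, KCF_MECHID(KCF_CIPHER_CLASS, 6) names the entry
+ * kcf_cipher_mechs_tab[6], which kcf_init_mech_tabs() below initializes
+ * to SUN_CKM_AES_CBC.
+ */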
+
+/*
+ * Locking conventions:
+ * --------------------
+ * A global mutex, kcf_mech_tabs_lock, serializes writes to the
+ * mechanism table via kcf_create_mech_entry().
+ *
+ * A mutex is associated with every entry of the tables.
+ * The mutex is acquired whenever the entry is accessed for
+ * 1) retrieving the mech_id (comparing the mech name)
+ * 2) finding a provider for an xxx_init() or atomic operation.
+ * 3) altering the mech entry to add or remove a provider.
+ *
+ * In 2), after a provider is chosen, its prov_desc is held and the
+ * entry's mutex must be dropped. The provider's working function (SPI) is
+ * called outside the mech_entry's mutex.
+ *
+ * The number of providers for a particular mechanism is not expected to be
+ * large enough to justify the cost of using rwlocks, so the per-mechanism
+ * entry mutex won't be very *hot*.
+ *
+ * When both kcf_mech_tabs_lock and a mech_entry mutex need to be held,
+ * kcf_mech_tabs_lock must always be acquired first.
+ *
+ */
+
+ /* Mechanisms tables */
+
+
+/* RFE 4687834 Will deal with the extensibility of these tables later */
+
+kcf_mech_entry_t kcf_digest_mechs_tab[KCF_MAXDIGEST];
+kcf_mech_entry_t kcf_cipher_mechs_tab[KCF_MAXCIPHER];
+kcf_mech_entry_t kcf_mac_mechs_tab[KCF_MAXMAC];
+kcf_mech_entry_t kcf_sign_mechs_tab[KCF_MAXSIGN];
+kcf_mech_entry_t kcf_keyops_mechs_tab[KCF_MAXKEYOPS];
+kcf_mech_entry_t kcf_misc_mechs_tab[KCF_MAXMISC];
+
+kcf_mech_entry_tab_t kcf_mech_tabs_tab[KCF_LAST_OPSCLASS + 1] = {
+ {0, NULL}, /* No class zero */
+ {KCF_MAXDIGEST, kcf_digest_mechs_tab},
+ {KCF_MAXCIPHER, kcf_cipher_mechs_tab},
+ {KCF_MAXMAC, kcf_mac_mechs_tab},
+ {KCF_MAXSIGN, kcf_sign_mechs_tab},
+ {KCF_MAXKEYOPS, kcf_keyops_mechs_tab},
+ {KCF_MAXMISC, kcf_misc_mechs_tab}
+};
+
+/*
+ * Per-algorithm internal thresholds for the minimum input size before
+ * offloading to a hardware provider.
+ * Dispatching a crypto operation to a hardware provider entails paying the
+ * cost of an additional context switch. Measurements with the Sun Accelerator
+ * 4000 show that jobs of 512 bytes or smaller are better handled in software.
+ * There is room for refinement here.
+ *
+ */
+int kcf_md5_threshold = 512;
+int kcf_sha1_threshold = 512;
+int kcf_des_threshold = 512;
+int kcf_des3_threshold = 512;
+int kcf_aes_threshold = 512;
+int kcf_bf_threshold = 512;
+int kcf_rc4_threshold = 512;
+
+kmutex_t kcf_mech_tabs_lock;
+static uint32_t kcf_gen_swprov = 0;
+
+int kcf_mech_hash_size = 256;
+mod_hash_t *kcf_mech_hash; /* mech name to id hash */
+
+static crypto_mech_type_t
+kcf_mech_hash_find(char *mechname)
+{
+ mod_hash_val_t hv;
+ crypto_mech_type_t mt;
+
+ mt = CRYPTO_MECH_INVALID;
+ if (mod_hash_find(kcf_mech_hash, (mod_hash_key_t)mechname, &hv) == 0) {
+ mt = *(crypto_mech_type_t *)hv;
+ ASSERT(mt != CRYPTO_MECH_INVALID);
+ }
+
+ return (mt);
+}
+
+void
+kcf_destroy_mech_tabs(void)
+{
+ int i, max;
+ kcf_ops_class_t class;
+ kcf_mech_entry_t *me_tab;
+
+ if (kcf_mech_hash)
+ mod_hash_destroy_hash(kcf_mech_hash);
+
+ mutex_destroy(&kcf_mech_tabs_lock);
+
+ for (class = KCF_FIRST_OPSCLASS; class <= KCF_LAST_OPSCLASS; class++) {
+ max = kcf_mech_tabs_tab[class].met_size;
+ me_tab = kcf_mech_tabs_tab[class].met_tab;
+ for (i = 0; i < max; i++)
+ mutex_destroy(&(me_tab[i].me_mutex));
+ }
+}
+
+/*
+ * kcf_init_mech_tabs()
+ *
+ * Called by the misc/kcf's _init() routine to initialize the tables
+ * of mech_entry's.
+ */
+void
+kcf_init_mech_tabs(void)
+{
+ int i, max;
+ kcf_ops_class_t class;
+ kcf_mech_entry_t *me_tab;
+
+ /* Initializes the mutex locks. */
+
+ mutex_init(&kcf_mech_tabs_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /* Then the pre-defined mechanism entries */
+
+ /* Two digests */
+ (void) strncpy(kcf_digest_mechs_tab[0].me_name, SUN_CKM_MD5,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_digest_mechs_tab[0].me_threshold = kcf_md5_threshold;
+
+ (void) strncpy(kcf_digest_mechs_tab[1].me_name, SUN_CKM_SHA1,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_digest_mechs_tab[1].me_threshold = kcf_sha1_threshold;
+
+ /* The symmetric ciphers in various modes */
+ (void) strncpy(kcf_cipher_mechs_tab[0].me_name, SUN_CKM_DES_CBC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[0].me_threshold = kcf_des_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[1].me_name, SUN_CKM_DES3_CBC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[1].me_threshold = kcf_des3_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[2].me_name, SUN_CKM_DES_ECB,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[2].me_threshold = kcf_des_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[3].me_name, SUN_CKM_DES3_ECB,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[3].me_threshold = kcf_des3_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[4].me_name, SUN_CKM_BLOWFISH_CBC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[4].me_threshold = kcf_bf_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[5].me_name, SUN_CKM_BLOWFISH_ECB,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[5].me_threshold = kcf_bf_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[6].me_name, SUN_CKM_AES_CBC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[6].me_threshold = kcf_aes_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[7].me_name, SUN_CKM_AES_ECB,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[7].me_threshold = kcf_aes_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[8].me_name, SUN_CKM_RC4,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[8].me_threshold = kcf_rc4_threshold;
+
+
+ /* 4 HMACs */
+ (void) strncpy(kcf_mac_mechs_tab[0].me_name, SUN_CKM_MD5_HMAC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_mac_mechs_tab[0].me_threshold = kcf_md5_threshold;
+
+ (void) strncpy(kcf_mac_mechs_tab[1].me_name, SUN_CKM_MD5_HMAC_GENERAL,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_mac_mechs_tab[1].me_threshold = kcf_md5_threshold;
+
+ (void) strncpy(kcf_mac_mechs_tab[2].me_name, SUN_CKM_SHA1_HMAC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_mac_mechs_tab[2].me_threshold = kcf_sha1_threshold;
+
+ (void) strncpy(kcf_mac_mechs_tab[3].me_name, SUN_CKM_SHA1_HMAC_GENERAL,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_mac_mechs_tab[3].me_threshold = kcf_sha1_threshold;
+
+
+ /* 1 random number generation pseudo mechanism */
+ (void) strncpy(kcf_misc_mechs_tab[0].me_name, SUN_RANDOM,
+ CRYPTO_MAX_MECH_NAME);
+
+ kcf_mech_hash = mod_hash_create_strhash_nodtr("kcf mech2id hash",
+ kcf_mech_hash_size, mod_hash_null_valdtor);
+
+ for (class = KCF_FIRST_OPSCLASS; class <= KCF_LAST_OPSCLASS; class++) {
+ max = kcf_mech_tabs_tab[class].met_size;
+ me_tab = kcf_mech_tabs_tab[class].met_tab;
+ for (i = 0; i < max; i++) {
+ mutex_init(&(me_tab[i].me_mutex), NULL,
+ MUTEX_DEFAULT, NULL);
+ if (me_tab[i].me_name[0] != 0) {
+ me_tab[i].me_mechid = KCF_MECHID(class, i);
+ (void) mod_hash_insert(kcf_mech_hash,
+ (mod_hash_key_t)me_tab[i].me_name,
+ (mod_hash_val_t)&(me_tab[i].me_mechid));
+ }
+ }
+ }
+}
+
+/*
+ * kcf_create_mech_entry()
+ *
+ * Arguments:
+ * . The class of mechanism.
+ * . the name of the new mechanism.
+ *
+ * Description:
+ * Creates a new mech_entry for a mechanism not yet known to the
+ * framework.
+ * This routine is called by kcf_add_mech_provider, which is
+ * in turn invoked for each mechanism supported by a provider.
+ * The 'class' argument depends on the crypto_func_group_t bitmask
+ * in the registering provider's mech_info struct for this mechanism.
+ * When there is ambiguity in the mapping between the crypto_func_group_t
+ * and a class (dual ops, ...), KCF_MISC_CLASS should be used.
+ *
+ * Context:
+ * User context only.
+ *
+ * Returns:
+ * KCF_INVALID_MECH_CLASS or KCF_INVALID_MECH_NAME if the class or
+ * the mechname is bogus.
+ * KCF_MECH_TAB_FULL when there is no room left in the mech. tabs.
+ * KCF_SUCCESS otherwise.
+ */
+static int
+kcf_create_mech_entry(kcf_ops_class_t class, char *mechname)
+{
+ crypto_mech_type_t mt;
+ kcf_mech_entry_t *me_tab;
+ int i = 0, size;
+
+ if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS))
+ return (KCF_INVALID_MECH_CLASS);
+
+ if ((mechname == NULL) || (mechname[0] == 0))
+ return (KCF_INVALID_MECH_NAME);
+ /*
+ * First check if the mechanism is already in one of the tables.
+ * The mech_entry could be in another class.
+ */
+ mutex_enter(&kcf_mech_tabs_lock);
+ mt = kcf_mech_hash_find(mechname);
+ if (mt != CRYPTO_MECH_INVALID) {
+		/* Nothing to do, regardless of the suggested class. */
+ mutex_exit(&kcf_mech_tabs_lock);
+ return (KCF_SUCCESS);
+ }
+ /* Now take the next unused mech entry in the class's tab */
+ me_tab = kcf_mech_tabs_tab[class].met_tab;
+ size = kcf_mech_tabs_tab[class].met_size;
+
+ while (i < size) {
+ mutex_enter(&(me_tab[i].me_mutex));
+ if (me_tab[i].me_name[0] == 0) {
+ /* Found an empty spot */
+ (void) strlcpy(me_tab[i].me_name, mechname,
+ CRYPTO_MAX_MECH_NAME);
+ me_tab[i].me_name[CRYPTO_MAX_MECH_NAME-1] = '\0';
+ me_tab[i].me_mechid = KCF_MECHID(class, i);
+ /*
+ * No a-priori information about the new mechanism, so
+ * the threshold is set to zero.
+ */
+ me_tab[i].me_threshold = 0;
+
+ mutex_exit(&(me_tab[i].me_mutex));
+ /* Add the new mechanism to the hash table */
+ (void) mod_hash_insert(kcf_mech_hash,
+ (mod_hash_key_t)me_tab[i].me_name,
+ (mod_hash_val_t)&(me_tab[i].me_mechid));
+ break;
+ }
+ mutex_exit(&(me_tab[i].me_mutex));
+ i++;
+ }
+
+ mutex_exit(&kcf_mech_tabs_lock);
+
+ if (i == size) {
+ return (KCF_MECH_TAB_FULL);
+ }
+
+ return (KCF_SUCCESS);
+}
+
+/*
+ * kcf_add_mech_provider()
+ *
+ * Arguments:
+ * . An index into the provider's mechanism array
+ * . A pointer to the provider descriptor
+ * . Storage for a pointer to the kcf_prov_mech_desc_t that was added.
+ *
+ * Description:
+ * Adds a new provider of a mechanism to the mechanism's mech_entry
+ * chain.
+ *
+ * Context:
+ * User context only.
+ *
+ * Returns:
+ * KCF_SUCCESS on success.
+ * KCF_MECH_TAB_FULL or another KCF_* error code otherwise.
+ */
+int
+kcf_add_mech_provider(short mech_indx,
+ kcf_provider_desc_t *prov_desc, kcf_prov_mech_desc_t **pmdpp)
+{
+ int error;
+ kcf_mech_entry_t *mech_entry = NULL;
+ crypto_mech_info_t *mech_info;
+ crypto_mech_type_t kcf_mech_type, mt;
+ kcf_prov_mech_desc_t *prov_mech, *prov_mech2;
+ crypto_func_group_t simple_fg_mask, dual_fg_mask;
+ crypto_mech_info_t *dmi;
+ crypto_mech_info_list_t *mil, *mil2;
+ kcf_mech_entry_t *me;
+ int i;
+
+ ASSERT(prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ mech_info = &prov_desc->pd_mechanisms[mech_indx];
+
+ /*
+ * A mechanism belongs to exactly one mechanism table.
+ * Find the class corresponding to the function group flag of
+ * the mechanism.
+ */
+ kcf_mech_type = kcf_mech_hash_find(mech_info->cm_mech_name);
+ if (kcf_mech_type == CRYPTO_MECH_INVALID) {
+ crypto_func_group_t fg = mech_info->cm_func_group_mask;
+ kcf_ops_class_t class;
+
+ if (fg & CRYPTO_FG_DIGEST || fg & CRYPTO_FG_DIGEST_ATOMIC)
+ class = KCF_DIGEST_CLASS;
+ else if (fg & CRYPTO_FG_ENCRYPT || fg & CRYPTO_FG_DECRYPT ||
+ fg & CRYPTO_FG_ENCRYPT_ATOMIC ||
+ fg & CRYPTO_FG_DECRYPT_ATOMIC)
+ class = KCF_CIPHER_CLASS;
+ else if (fg & CRYPTO_FG_MAC || fg & CRYPTO_FG_MAC_ATOMIC)
+ class = KCF_MAC_CLASS;
+ else if (fg & CRYPTO_FG_SIGN || fg & CRYPTO_FG_VERIFY ||
+ fg & CRYPTO_FG_SIGN_ATOMIC ||
+ fg & CRYPTO_FG_VERIFY_ATOMIC ||
+ fg & CRYPTO_FG_SIGN_RECOVER ||
+ fg & CRYPTO_FG_VERIFY_RECOVER)
+ class = KCF_SIGN_CLASS;
+ else if (fg & CRYPTO_FG_GENERATE ||
+ fg & CRYPTO_FG_GENERATE_KEY_PAIR ||
+ fg & CRYPTO_FG_WRAP || fg & CRYPTO_FG_UNWRAP ||
+ fg & CRYPTO_FG_DERIVE)
+ class = KCF_KEYOPS_CLASS;
+ else
+ class = KCF_MISC_CLASS;
+
+ /*
+ * Attempt to create a new mech_entry for the specified
+ * mechanism. kcf_create_mech_entry() can handle the case
+ * where such an entry already exists.
+ */
+ if ((error = kcf_create_mech_entry(class,
+ mech_info->cm_mech_name)) != KCF_SUCCESS) {
+ return (error);
+ }
+ /* get the KCF mech type that was assigned to the mechanism */
+ kcf_mech_type = kcf_mech_hash_find(mech_info->cm_mech_name);
+ ASSERT(kcf_mech_type != CRYPTO_MECH_INVALID);
+ }
+
+ error = kcf_get_mech_entry(kcf_mech_type, &mech_entry);
+ ASSERT(error == KCF_SUCCESS);
+
+ /* allocate and initialize new kcf_prov_mech_desc */
+ prov_mech = kmem_zalloc(sizeof (kcf_prov_mech_desc_t), KM_SLEEP);
+ bcopy(mech_info, &prov_mech->pm_mech_info, sizeof (crypto_mech_info_t));
+ prov_mech->pm_prov_desc = prov_desc;
+ prov_desc->pd_mech_indx[KCF_MECH2CLASS(kcf_mech_type)]
+ [KCF_MECH2INDEX(kcf_mech_type)] = mech_indx;
+
+ KCF_PROV_REFHOLD(prov_desc);
+ KCF_PROV_IREFHOLD(prov_desc);
+
+ dual_fg_mask = mech_info->cm_func_group_mask & CRYPTO_FG_DUAL_MASK;
+
+ if (dual_fg_mask == ((crypto_func_group_t)0))
+ goto add_entry;
+
+ simple_fg_mask = (mech_info->cm_func_group_mask &
+ CRYPTO_FG_SIMPLEOP_MASK) | CRYPTO_FG_RANDOM;
+
+ for (i = 0; i < prov_desc->pd_mech_list_count; i++) {
+ dmi = &prov_desc->pd_mechanisms[i];
+
+ /* skip self */
+ if (dmi->cm_mech_number == mech_info->cm_mech_number)
+ continue;
+
+ /* skip if not a dual operation mechanism */
+ if (!(dmi->cm_func_group_mask & dual_fg_mask) ||
+ (dmi->cm_func_group_mask & simple_fg_mask))
+ continue;
+
+ mt = kcf_mech_hash_find(dmi->cm_mech_name);
+ if (mt == CRYPTO_MECH_INVALID)
+ continue;
+
+ if (kcf_get_mech_entry(mt, &me) != KCF_SUCCESS)
+ continue;
+
+ mil = kmem_zalloc(sizeof (*mil), KM_SLEEP);
+ mil2 = kmem_zalloc(sizeof (*mil2), KM_SLEEP);
+
+ /*
+ * Ignore hard-coded entries in the mech table
+ * if the provider hasn't registered.
+		 * if no provider has registered for them.
+ mutex_enter(&me->me_mutex);
+ if (me->me_hw_prov_chain == NULL && me->me_sw_prov == NULL) {
+ mutex_exit(&me->me_mutex);
+ kmem_free(mil, sizeof (*mil));
+ kmem_free(mil2, sizeof (*mil2));
+ continue;
+ }
+
+ /*
+ * Add other dual mechanisms that have registered
+ * with the framework to this mechanism's
+ * cross-reference list.
+ */
+ mil->ml_mech_info = *dmi; /* struct assignment */
+ mil->ml_kcf_mechid = mt;
+
+ /* add to head of list */
+ mil->ml_next = prov_mech->pm_mi_list;
+ prov_mech->pm_mi_list = mil;
+
+ if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER)
+ prov_mech2 = me->me_hw_prov_chain;
+ else
+ prov_mech2 = me->me_sw_prov;
+
+ if (prov_mech2 == NULL) {
+ kmem_free(mil2, sizeof (*mil2));
+ mutex_exit(&me->me_mutex);
+ continue;
+ }
+
+ /*
+ * Update all other cross-reference lists by
+ * adding this new mechanism.
+ */
+ while (prov_mech2 != NULL) {
+ if (prov_mech2->pm_prov_desc == prov_desc) {
+ /* struct assignment */
+ mil2->ml_mech_info = *mech_info;
+ mil2->ml_kcf_mechid = kcf_mech_type;
+
+ /* add to head of list */
+ mil2->ml_next = prov_mech2->pm_mi_list;
+ prov_mech2->pm_mi_list = mil2;
+ break;
+ }
+ prov_mech2 = prov_mech2->pm_next;
+ }
+ if (prov_mech2 == NULL)
+ kmem_free(mil2, sizeof (*mil2));
+
+ mutex_exit(&me->me_mutex);
+ }
+
+add_entry:
+ /*
+ * Add new kcf_prov_mech_desc at the front of HW providers
+ * chain.
+ */
+ switch (prov_desc->pd_prov_type) {
+
+ case CRYPTO_HW_PROVIDER:
+ mutex_enter(&mech_entry->me_mutex);
+ prov_mech->pm_me = mech_entry;
+ prov_mech->pm_next = mech_entry->me_hw_prov_chain;
+ mech_entry->me_hw_prov_chain = prov_mech;
+ mech_entry->me_num_hwprov++;
+ mutex_exit(&mech_entry->me_mutex);
+ break;
+
+ case CRYPTO_SW_PROVIDER:
+ mutex_enter(&mech_entry->me_mutex);
+ if (mech_entry->me_sw_prov != NULL) {
+ /*
+ * There is already a SW provider for this mechanism.
+ * Since we allow only one SW provider per mechanism,
+ * report this condition.
+ */
+ cmn_err(CE_WARN, "The cryptographic software provider "
+ "\"%s\" will not be used for %s. The provider "
+ "\"%s\" will be used for this mechanism "
+ "instead.", prov_desc->pd_description,
+ mech_info->cm_mech_name,
+ mech_entry->me_sw_prov->pm_prov_desc->
+ pd_description);
+ KCF_PROV_REFRELE(prov_desc);
+ kmem_free(prov_mech, sizeof (kcf_prov_mech_desc_t));
+ prov_mech = NULL;
+ } else {
+ /*
+ * Set the provider as the software provider for
+ * this mechanism.
+ */
+ mech_entry->me_sw_prov = prov_mech;
+
+ /* We'll wrap around after 4 billion registrations! */
+ mech_entry->me_gen_swprov = kcf_gen_swprov++;
+ }
+ mutex_exit(&mech_entry->me_mutex);
+ break;
+ default:
+ break;
+ }
+
+ *pmdpp = prov_mech;
+
+ return (KCF_SUCCESS);
+}
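+
+/*
+ * Illustrative sketch only (not part of the imported source): the
+ * registration path that drives kcf_add_mech_provider(). As noted above
+ * kcf_create_mech_entry(), it is invoked once per mechanism supported by a
+ * registering provider; the exact loop below is an assumption.
+ *
+ *	kcf_prov_mech_desc_t *pmd;
+ *	int i;
+ *
+ *	for (i = 0; i < prov_desc->pd_mech_list_count; i++) {
+ *		if (kcf_add_mech_provider(i, prov_desc, &pmd) != KCF_SUCCESS)
+ *			break;
+ *	}
+ *	(on provider removal, each mechanism is torn down again with
+ *	kcf_remove_mech_provider())
+ */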
+
+/*
+ * kcf_remove_mech_provider()
+ *
+ * Arguments:
+ * . mech_name: the name of the mechanism.
+ * . prov_desc: The provider descriptor
+ *
+ * Description:
+ * Removes a provider from chain of provider descriptors.
+ * The provider is made unavailable to kernel consumers for the specified
+ * mechanism.
+ *
+ * Context:
+ * User context only.
+ */
+void
+kcf_remove_mech_provider(char *mech_name, kcf_provider_desc_t *prov_desc)
+{
+ crypto_mech_type_t mech_type;
+ kcf_prov_mech_desc_t *prov_mech = NULL, *prov_chain;
+ kcf_prov_mech_desc_t **prev_entry_next;
+ kcf_mech_entry_t *mech_entry;
+ crypto_mech_info_list_t *mil, *mil2, *next, **prev_next;
+
+ ASSERT(prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* get the KCF mech type that was assigned to the mechanism */
+ if ((mech_type = kcf_mech_hash_find(mech_name)) ==
+ CRYPTO_MECH_INVALID) {
+ /*
+ * Provider was not allowed for this mech due to policy or
+ * configuration.
+ */
+ return;
+ }
+
+ /* get a ptr to the mech_entry that was created */
+ if (kcf_get_mech_entry(mech_type, &mech_entry) != KCF_SUCCESS) {
+ /*
+ * Provider was not allowed for this mech due to policy or
+ * configuration.
+ */
+ return;
+ }
+
+ mutex_enter(&mech_entry->me_mutex);
+
+ switch (prov_desc->pd_prov_type) {
+
+ case CRYPTO_HW_PROVIDER:
+ /* find the provider in the mech_entry chain */
+ prev_entry_next = &mech_entry->me_hw_prov_chain;
+ prov_mech = mech_entry->me_hw_prov_chain;
+ while (prov_mech != NULL &&
+ prov_mech->pm_prov_desc != prov_desc) {
+ prev_entry_next = &prov_mech->pm_next;
+ prov_mech = prov_mech->pm_next;
+ }
+
+ if (prov_mech == NULL) {
+ /* entry not found, simply return */
+ mutex_exit(&mech_entry->me_mutex);
+ return;
+ }
+
+ /* remove provider entry from mech_entry chain */
+ *prev_entry_next = prov_mech->pm_next;
+ ASSERT(mech_entry->me_num_hwprov > 0);
+ mech_entry->me_num_hwprov--;
+ break;
+
+ case CRYPTO_SW_PROVIDER:
+ if (mech_entry->me_sw_prov == NULL ||
+ mech_entry->me_sw_prov->pm_prov_desc != prov_desc) {
+ /* not the software provider for this mechanism */
+ mutex_exit(&mech_entry->me_mutex);
+ return;
+ }
+ prov_mech = mech_entry->me_sw_prov;
+ mech_entry->me_sw_prov = NULL;
+ break;
+ default:
+ /* unexpected crypto_provider_type_t */
+ mutex_exit(&mech_entry->me_mutex);
+ return;
+ }
+
+ mutex_exit(&mech_entry->me_mutex);
+
+ /* Free the dual ops cross-reference lists */
+ mil = prov_mech->pm_mi_list;
+ while (mil != NULL) {
+ next = mil->ml_next;
+ if (kcf_get_mech_entry(mil->ml_kcf_mechid,
+ &mech_entry) != KCF_SUCCESS) {
+ mil = next;
+ continue;
+ }
+
+ mutex_enter(&mech_entry->me_mutex);
+ if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER)
+ prov_chain = mech_entry->me_hw_prov_chain;
+ else
+ prov_chain = mech_entry->me_sw_prov;
+
+ while (prov_chain != NULL) {
+ if (prov_chain->pm_prov_desc == prov_desc) {
+ prev_next = &prov_chain->pm_mi_list;
+ mil2 = prov_chain->pm_mi_list;
+ while (mil2 != NULL &&
+ mil2->ml_kcf_mechid != mech_type) {
+ prev_next = &mil2->ml_next;
+ mil2 = mil2->ml_next;
+ }
+ if (mil2 != NULL) {
+ *prev_next = mil2->ml_next;
+ kmem_free(mil2, sizeof (*mil2));
+ }
+ break;
+ }
+ prov_chain = prov_chain->pm_next;
+ }
+
+ mutex_exit(&mech_entry->me_mutex);
+ kmem_free(mil, sizeof (crypto_mech_info_list_t));
+ mil = next;
+ }
+
+ /* free entry */
+ KCF_PROV_REFRELE(prov_mech->pm_prov_desc);
+ KCF_PROV_IREFRELE(prov_mech->pm_prov_desc);
+ kmem_free(prov_mech, sizeof (kcf_prov_mech_desc_t));
+}
+
+/*
+ * kcf_get_mech_entry()
+ *
+ * Arguments:
+ * . The framework mechanism type
+ * . Storage for the mechanism entry
+ *
+ * Description:
+ * Retrieves the mechanism entry for the mech.
+ *
+ * Context:
+ * User and interrupt contexts.
+ *
+ * Returns:
+ * KCF_MECHANISM_XXX appropriate error code.
+ * KCF_SUCCESS otherwise.
+ */
+int
+kcf_get_mech_entry(crypto_mech_type_t mech_type, kcf_mech_entry_t **mep)
+{
+ kcf_ops_class_t class;
+ int index;
+ kcf_mech_entry_tab_t *me_tab;
+
+ ASSERT(mep != NULL);
+
+ class = KCF_MECH2CLASS(mech_type);
+
+ if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS)) {
+ /* the caller won't need to know it's an invalid class */
+ return (KCF_INVALID_MECH_NUMBER);
+ }
+
+ me_tab = &kcf_mech_tabs_tab[class];
+ index = KCF_MECH2INDEX(mech_type);
+
+ if ((index < 0) || (index >= me_tab->met_size)) {
+ return (KCF_INVALID_MECH_NUMBER);
+ }
+
+ *mep = &((me_tab->met_tab)[index]);
+
+ return (KCF_SUCCESS);
+}
+
+/* CURRENTLY UNSUPPORTED: attempting to load the module if it isn't found */
+/*
+ * Lookup the hash table for an entry that matches the mechname.
+ * If there are no hardware or software providers for the mechanism,
+ * but there is an unloaded software provider, this routine will attempt
+ * to load it.
+ *
+ * If the MOD_NOAUTOUNLOAD flag is not set, a software provider is
+ * in constant danger of being unloaded. For consumers that call
+ * crypto_mech2id() only once, the provider will not be reloaded
+ * if it becomes unloaded. If a provider gets loaded elsewhere
+ * without the MOD_NOAUTOUNLOAD flag being set, we set it now.
+ */
+crypto_mech_type_t
+crypto_mech2id_common(char *mechname, boolean_t load_module)
+{
+ crypto_mech_type_t mt = kcf_mech_hash_find(mechname);
+ return (mt);
+}
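+
+/*
+ * Illustrative sketch only (not part of the imported source): resolving a
+ * mechanism name to its framework id. In this port the load_module argument
+ * is ignored and the call reduces to the hash lookup above.
+ *
+ *	crypto_mech_type_t mt;
+ *
+ *	mt = crypto_mech2id_common(SUN_CKM_AES_CBC, B_TRUE);
+ *	if (mt == CRYPTO_MECH_INVALID)
+ *		(the mechanism is not known to the framework)
+ */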
diff --git a/sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c b/sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c
new file mode 100644
index 000000000000..1b115d976232
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c
@@ -0,0 +1,227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Utility routine to apply the command, 'cmd', to the data in the
+ * uio structure, copying to or from the flat buffer 'buf'. Only the
+ * copy and compare commands are handled here; the digest commands
+ * return CRYPTO_ARGUMENTS_BAD, and the digest_ctx and update
+ * arguments are unused.
+ */
+int
+crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd,
+ void *digest_ctx, void (*update)(void))
+{
+ zfs_uio_t *uiop = data->cd_uio;
+ off_t offset = data->cd_offset;
+ size_t length = len;
+ uint_t vec_idx;
+ size_t cur_len;
+ uchar_t *datap;
+
+ ASSERT(data->cd_format == CRYPTO_DATA_UIO);
+ if (zfs_uio_segflg(uiop) != UIO_SYSSPACE) {
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ /*
+ * Jump to the first iovec containing data to be
+ * processed.
+ */
+ offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx);
+
+ if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) {
+ /*
+ * The caller specified an offset that is larger than
+ * the total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ while (vec_idx < zfs_uio_iovcnt(uiop) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(uiop, vec_idx) -
+ offset, length);
+
+ datap = (uchar_t *)(zfs_uio_iovbase(uiop, vec_idx) + offset);
+ switch (cmd) {
+ case COPY_FROM_DATA:
+ bcopy(datap, buf, cur_len);
+ buf += cur_len;
+ break;
+ case COPY_TO_DATA:
+ bcopy(buf, datap, cur_len);
+ buf += cur_len;
+ break;
+ case COMPARE_TO_DATA:
+ if (bcmp(datap, buf, cur_len))
+ return (CRYPTO_SIGNATURE_INVALID);
+ buf += cur_len;
+ break;
+ case MD5_DIGEST_DATA:
+ case SHA1_DIGEST_DATA:
+ case SHA2_DIGEST_DATA:
+ case GHASH_DATA:
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ length -= cur_len;
+ vec_idx++;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but
+ * the requested length could not be processed.
+ */
+ switch (cmd) {
+ case COPY_TO_DATA:
+ data->cd_length = len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ default:
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
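+/*
+ * Copy 'len' bytes from 'buf' into the output crypto_data, honoring its
+ * format (raw iovec or uio). When the destination is too short, cd_length
+ * is set to the required length and CRYPTO_BUFFER_TOO_SMALL is returned.
+ */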
+int
+crypto_put_output_data(uchar_t *buf, crypto_data_t *output, int len)
+{
+ switch (output->cd_format) {
+ case CRYPTO_DATA_RAW:
+ if (output->cd_raw.iov_len < len) {
+ output->cd_length = len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+ bcopy(buf, (uchar_t *)(output->cd_raw.iov_base +
+ output->cd_offset), len);
+ break;
+
+ case CRYPTO_DATA_UIO:
+ return (crypto_uio_data(output, buf, len,
+ COPY_TO_DATA, NULL, NULL));
+ default:
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
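+/*
+ * Apply 'cipher' to a raw (single iovec) crypto_data input. When
+ * cd_miscdata is present it is first copied into the context IV via
+ * 'copy_block'. Inputs whose iovec is shorter than cd_length are
+ * rejected.
+ */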
+int
+crypto_update_iov(void *ctx, crypto_data_t *input, crypto_data_t *output,
+ int (*cipher)(void *, caddr_t, size_t, crypto_data_t *),
+ void (*copy_block)(uint8_t *, uint64_t *))
+{
+ common_ctx_t *common_ctx = ctx;
+ int rv;
+
+ ASSERT(input != output);
+ if (input->cd_miscdata != NULL) {
+ copy_block((uint8_t *)input->cd_miscdata,
+ &common_ctx->cc_iv[0]);
+ }
+
+ if (input->cd_raw.iov_len < input->cd_length)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ rv = (cipher)(ctx, input->cd_raw.iov_base + input->cd_offset,
+ input->cd_length, output);
+
+ return (rv);
+}
+
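+/*
+ * Same as crypto_update_iov(), but for a uio input: walk the iovecs and
+ * feed each segment to 'cipher' in turn. Only UIO_SYSSPACE uios are
+ * accepted.
+ */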
+int
+crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output,
+ int (*cipher)(void *, caddr_t, size_t, crypto_data_t *),
+ void (*copy_block)(uint8_t *, uint64_t *))
+{
+ common_ctx_t *common_ctx = ctx;
+ zfs_uio_t *uiop = input->cd_uio;
+ off_t offset = input->cd_offset;
+ size_t length = input->cd_length;
+ uint_t vec_idx;
+ size_t cur_len;
+
+ ASSERT(input != output);
+ if (input->cd_miscdata != NULL) {
+ copy_block((uint8_t *)input->cd_miscdata,
+ &common_ctx->cc_iv[0]);
+ }
+
+ if (zfs_uio_segflg(input->cd_uio) != UIO_SYSSPACE) {
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ /*
+ * Jump to the first iovec containing data to be
+ * processed.
+ */
+ offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) {
+ /*
+ * The caller specified an offset that is larger than the
+ * total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ /*
+ * Now process the iovecs.
+ */
+ while (vec_idx < zfs_uio_iovcnt(uiop) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(uiop, vec_idx) -
+ offset, length);
+
+ int rv = (cipher)(ctx, zfs_uio_iovbase(uiop, vec_idx) + offset,
+ cur_len, output);
+
+ if (rv != CRYPTO_SUCCESS) {
+ return (rv);
+ }
+ length -= cur_len;
+ vec_idx++;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed, i.e. the caller
+ * asked to process more data than it provided.
+ */
+
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/module/icp/core/kcf_prov_tabs.c b/sys/contrib/openzfs/module/icp/core/kcf_prov_tabs.c
new file mode 100644
index 000000000000..94e6937bcd76
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/core/kcf_prov_tabs.c
@@ -0,0 +1,645 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This file is part of the core Kernel Cryptographic Framework.
+ * It implements the management of tables of Providers. Entries are
+ * added and removed when cryptographic providers register with
+ * and unregister from the framework, respectively. The KCF scheduler
+ * and ioctl pseudo driver use these tables to obtain the list
+ * of available providers.
+ *
+ * The provider table is indexed by crypto_provider_id_t. Each
+ * element of the table contains a pointer to a provider descriptor,
+ * or NULL if the entry is free.
+ *
+ * This file also implements helper functions to allocate and free
+ * provider descriptors.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+#include <sys/crypto/spi.h>
+
+#define KCF_MAX_PROVIDERS 512 /* max number of providers */
+
+/*
+ * Prov_tab is an array of providers which is updated when
+ * a crypto provider registers with kcf. The provider calls the
+ * SPI routine, crypto_register_provider(), which in turn calls
+ * kcf_prov_tab_add_provider().
+ *
+ * A provider unregisters by calling crypto_unregister_provider()
+ * which triggers the removal of the prov_tab entry.
+ * It also calls kcf_remove_mech_provider().
+ *
+ * prov_tab entries are not updated from kcf.conf or by cryptoadm(1M).
+ */
+static kcf_provider_desc_t **prov_tab = NULL;
+static kmutex_t prov_tab_mutex; /* ensure exclusive access to the table */
+static uint_t prov_tab_num = 0; /* number of providers in table */
+static uint_t prov_tab_max = KCF_MAX_PROVIDERS;
+
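+/*
+ * Counterpart to kcf_prov_tab_init(): destroy the table mutex and free
+ * the providers table if it was allocated.
+ */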
+void
+kcf_prov_tab_destroy(void)
+{
+ mutex_destroy(&prov_tab_mutex);
+
+ if (prov_tab)
+ kmem_free(prov_tab, prov_tab_max *
+ sizeof (kcf_provider_desc_t *));
+}
+
+/*
+ * Initialize a mutex and the KCF providers table, prov_tab.
+ * The providers table is dynamically allocated with prov_tab_max entries.
+ * Called from kcf module _init().
+ */
+void
+kcf_prov_tab_init(void)
+{
+ mutex_init(&prov_tab_mutex, NULL, MUTEX_DEFAULT, NULL);
+
+ prov_tab = kmem_zalloc(prov_tab_max * sizeof (kcf_provider_desc_t *),
+ KM_SLEEP);
+}
+
+/*
+ * Add a provider to the provider table. If no free entry can be found
+ * for the new provider, returns CRYPTO_HOST_MEMORY. Otherwise, add
+ * the provider to the table, initialize the pd_prov_id field
+ * of the specified provider descriptor to the index in that table,
+ * and return CRYPTO_SUCCESS. Note that a REFHOLD is done on the
+ * provider while it is pointed to by a table entry.
+ */
+int
+kcf_prov_tab_add_provider(kcf_provider_desc_t *prov_desc)
+{
+ uint_t i;
+
+ ASSERT(prov_tab != NULL);
+
+ mutex_enter(&prov_tab_mutex);
+
+ /* find free slot in providers table */
+ for (i = 1; i < KCF_MAX_PROVIDERS && prov_tab[i] != NULL; i++)
+ ;
+ if (i == KCF_MAX_PROVIDERS) {
+ /* ran out of provider entries */
+ mutex_exit(&prov_tab_mutex);
+ cmn_err(CE_WARN, "out of providers entries");
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /* initialize entry */
+ prov_tab[i] = prov_desc;
+ KCF_PROV_REFHOLD(prov_desc);
+ KCF_PROV_IREFHOLD(prov_desc);
+ prov_tab_num++;
+
+ mutex_exit(&prov_tab_mutex);
+
+ /* update provider descriptor */
+ prov_desc->pd_prov_id = i;
+
+ /*
+ * The KCF-private provider handle is defined as the internal
+ * provider id.
+ */
+ prov_desc->pd_kcf_prov_handle =
+ (crypto_kcf_provider_handle_t)prov_desc->pd_prov_id;
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Remove the provider specified by its id. A REFRELE is done on the
+ * corresponding provider descriptor before this function returns.
+ * Returns CRYPTO_INVALID_PROVIDER_ID if the provider id is not valid.
+ */
+int
+kcf_prov_tab_rem_provider(crypto_provider_id_t prov_id)
+{
+ kcf_provider_desc_t *prov_desc;
+
+ ASSERT(prov_tab != NULL);
+ ASSERT(prov_tab_num >= 0);
+
+ /*
+ * Validate provider id, since it can be specified by a 3rd-party
+ * provider.
+ */
+
+ mutex_enter(&prov_tab_mutex);
+ if (prov_id >= KCF_MAX_PROVIDERS ||
+ ((prov_desc = prov_tab[prov_id]) == NULL)) {
+ mutex_exit(&prov_tab_mutex);
+ return (CRYPTO_INVALID_PROVIDER_ID);
+ }
+ mutex_exit(&prov_tab_mutex);
+
+ /*
+ * The provider id must remain valid until the associated provider
+ * descriptor is freed. For this reason, we simply release our
+ * reference to the descriptor here. When the reference count
+ * reaches zero, kcf_free_provider_desc() will be invoked and
+ * the associated entry in the providers table will be released
+ * at that time.
+ */
+
+ KCF_PROV_REFRELE(prov_desc);
+ KCF_PROV_IREFRELE(prov_desc);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Returns the provider descriptor corresponding to the specified
+ * provider id. A REFHOLD is done on the descriptor before it is
+ * returned to the caller. It is the responsibility of the caller
+ * to do a REFRELE once it is done with the provider descriptor.
+ */
+kcf_provider_desc_t *
+kcf_prov_tab_lookup(crypto_provider_id_t prov_id)
+{
+ kcf_provider_desc_t *prov_desc;
+
+ mutex_enter(&prov_tab_mutex);
+
+ prov_desc = prov_tab[prov_id];
+
+ if (prov_desc == NULL) {
+ mutex_exit(&prov_tab_mutex);
+ return (NULL);
+ }
+
+ KCF_PROV_REFHOLD(prov_desc);
+
+ mutex_exit(&prov_tab_mutex);
+
+ return (prov_desc);
+}
+
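+/*
+ * Allocate storage for the SPI v1 ops groups supplied by the provider.
+ * Only the groups that are non-NULL in 'src' get an allocation in 'dst'.
+ * mech_list_count is bumped when random_ops is present (see the
+ * SUN_RANDOM comment below).
+ */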
+static void
+allocate_ops_v1(crypto_ops_t *src, crypto_ops_t *dst, uint_t *mech_list_count)
+{
+ if (src->co_control_ops != NULL)
+ dst->co_control_ops = kmem_alloc(sizeof (crypto_control_ops_t),
+ KM_SLEEP);
+
+ if (src->co_digest_ops != NULL)
+ dst->co_digest_ops = kmem_alloc(sizeof (crypto_digest_ops_t),
+ KM_SLEEP);
+
+ if (src->co_cipher_ops != NULL)
+ dst->co_cipher_ops = kmem_alloc(sizeof (crypto_cipher_ops_t),
+ KM_SLEEP);
+
+ if (src->co_mac_ops != NULL)
+ dst->co_mac_ops = kmem_alloc(sizeof (crypto_mac_ops_t),
+ KM_SLEEP);
+
+ if (src->co_sign_ops != NULL)
+ dst->co_sign_ops = kmem_alloc(sizeof (crypto_sign_ops_t),
+ KM_SLEEP);
+
+ if (src->co_verify_ops != NULL)
+ dst->co_verify_ops = kmem_alloc(sizeof (crypto_verify_ops_t),
+ KM_SLEEP);
+
+ if (src->co_dual_ops != NULL)
+ dst->co_dual_ops = kmem_alloc(sizeof (crypto_dual_ops_t),
+ KM_SLEEP);
+
+ if (src->co_dual_cipher_mac_ops != NULL)
+ dst->co_dual_cipher_mac_ops = kmem_alloc(
+ sizeof (crypto_dual_cipher_mac_ops_t), KM_SLEEP);
+
+ if (src->co_random_ops != NULL) {
+ dst->co_random_ops = kmem_alloc(
+ sizeof (crypto_random_number_ops_t), KM_SLEEP);
+
+ /*
+ * Allocate storage to store the array of supported mechanisms
+ * specified by provider. We allocate extra mechanism storage
+ * if the provider has random_ops since we keep an internal
+ * mechanism, SUN_RANDOM, in this case.
+ */
+ (*mech_list_count)++;
+ }
+
+ if (src->co_session_ops != NULL)
+ dst->co_session_ops = kmem_alloc(sizeof (crypto_session_ops_t),
+ KM_SLEEP);
+
+ if (src->co_object_ops != NULL)
+ dst->co_object_ops = kmem_alloc(sizeof (crypto_object_ops_t),
+ KM_SLEEP);
+
+ if (src->co_key_ops != NULL)
+ dst->co_key_ops = kmem_alloc(sizeof (crypto_key_ops_t),
+ KM_SLEEP);
+
+ if (src->co_provider_ops != NULL)
+ dst->co_provider_ops = kmem_alloc(
+ sizeof (crypto_provider_management_ops_t), KM_SLEEP);
+
+ if (src->co_ctx_ops != NULL)
+ dst->co_ctx_ops = kmem_alloc(sizeof (crypto_ctx_ops_t),
+ KM_SLEEP);
+}
+
+static void
+allocate_ops_v2(crypto_ops_t *src, crypto_ops_t *dst)
+{
+ if (src->co_mech_ops != NULL)
+ dst->co_mech_ops = kmem_alloc(sizeof (crypto_mech_ops_t),
+ KM_SLEEP);
+}
+
+static void
+allocate_ops_v3(crypto_ops_t *src, crypto_ops_t *dst)
+{
+ if (src->co_nostore_key_ops != NULL)
+ dst->co_nostore_key_ops =
+ kmem_alloc(sizeof (crypto_nostore_key_ops_t), KM_SLEEP);
+}
+
+/*
+ * Allocate a provider descriptor. mech_list_count specifies the
+ * number of mechanisms supported by the provider, and is used
+ * to allocate storage for the mechanism table.
+ * This function may sleep while allocating memory, which is OK
+ * since it is invoked from user context during provider registration.
+ */
+kcf_provider_desc_t *
+kcf_alloc_provider_desc(crypto_provider_info_t *info)
+{
+ int i, j;
+ kcf_provider_desc_t *desc;
+ uint_t mech_list_count = info->pi_mech_list_count;
+ crypto_ops_t *src_ops = info->pi_ops_vector;
+
+ desc = kmem_zalloc(sizeof (kcf_provider_desc_t), KM_SLEEP);
+
+ /*
+ * pd_description serves two purposes:
+ * - Appears as a blank-padded PKCS#11 style string that will be
+ * returned to applications in CK_SLOT_INFO.slotDescription.
+ * This means that we should not have a null character in the
+ * first CRYPTO_PROVIDER_DESCR_MAX_LEN bytes.
+ * - Appears as a null-terminated string that can be used by
+ * other kcf routines.
+ *
+ * So, we allocate enough room for one extra null terminator
+ * which keeps everyone happy.
+ */
+ desc->pd_description = kmem_alloc(CRYPTO_PROVIDER_DESCR_MAX_LEN + 1,
+ KM_SLEEP);
+ (void) memset(desc->pd_description, ' ',
+ CRYPTO_PROVIDER_DESCR_MAX_LEN);
+ desc->pd_description[CRYPTO_PROVIDER_DESCR_MAX_LEN] = '\0';
+
+ /*
+ * Since the framework does not require the ops vector specified
+ * by a provider during registration to be persistent, KCF
+ * allocates its own storage and copies the ops vectors into it.
+ */
+ desc->pd_ops_vector = kmem_zalloc(sizeof (crypto_ops_t), KM_SLEEP);
+
+ if (info->pi_provider_type != CRYPTO_LOGICAL_PROVIDER) {
+ allocate_ops_v1(src_ops, desc->pd_ops_vector, &mech_list_count);
+ if (info->pi_interface_version >= CRYPTO_SPI_VERSION_2)
+ allocate_ops_v2(src_ops, desc->pd_ops_vector);
+ if (info->pi_interface_version == CRYPTO_SPI_VERSION_3)
+ allocate_ops_v3(src_ops, desc->pd_ops_vector);
+ }
+
+ desc->pd_mech_list_count = mech_list_count;
+ desc->pd_mechanisms = kmem_zalloc(sizeof (crypto_mech_info_t) *
+ mech_list_count, KM_SLEEP);
+ for (i = 0; i < KCF_OPS_CLASSSIZE; i++)
+ for (j = 0; j < KCF_MAXMECHTAB; j++)
+ desc->pd_mech_indx[i][j] = KCF_INVALID_INDX;
+
+ desc->pd_prov_id = KCF_PROVID_INVALID;
+ desc->pd_state = KCF_PROV_ALLOCATED;
+
+ mutex_init(&desc->pd_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&desc->pd_resume_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&desc->pd_remove_cv, NULL, CV_DEFAULT, NULL);
+
+ return (desc);
+}
+
+/*
+ * Called by KCF_PROV_REFRELE when a provider's reference count drops
+ * to zero. We free the descriptor when the last reference is released.
+ * However, for software providers, we do not free it when there is an
+ * unregister thread waiting. We signal that thread in this case and
+ * that thread is responsible for freeing the descriptor.
+ */
+void
+kcf_provider_zero_refcnt(kcf_provider_desc_t *desc)
+{
+ mutex_enter(&desc->pd_lock);
+ switch (desc->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ if (desc->pd_state == KCF_PROV_REMOVED ||
+ desc->pd_state == KCF_PROV_DISABLED) {
+ desc->pd_state = KCF_PROV_FREED;
+ cv_broadcast(&desc->pd_remove_cv);
+ mutex_exit(&desc->pd_lock);
+ break;
+ }
+ /* FALLTHRU */
+
+ case CRYPTO_HW_PROVIDER:
+ case CRYPTO_LOGICAL_PROVIDER:
+ mutex_exit(&desc->pd_lock);
+ kcf_free_provider_desc(desc);
+ }
+}
+
+/*
+ * Free a provider descriptor.
+ */
+void
+kcf_free_provider_desc(kcf_provider_desc_t *desc)
+{
+ if (desc == NULL)
+ return;
+
+ mutex_enter(&prov_tab_mutex);
+ if (desc->pd_prov_id != KCF_PROVID_INVALID) {
+ /* release the associated providers table entry */
+ ASSERT(prov_tab[desc->pd_prov_id] != NULL);
+ prov_tab[desc->pd_prov_id] = NULL;
+ prov_tab_num--;
+ }
+ mutex_exit(&prov_tab_mutex);
+
+ /* free the kernel memory associated with the provider descriptor */
+
+ if (desc->pd_description != NULL)
+ kmem_free(desc->pd_description,
+ CRYPTO_PROVIDER_DESCR_MAX_LEN + 1);
+
+ if (desc->pd_ops_vector != NULL) {
+
+ if (desc->pd_ops_vector->co_control_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_control_ops,
+ sizeof (crypto_control_ops_t));
+
+ if (desc->pd_ops_vector->co_digest_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_digest_ops,
+ sizeof (crypto_digest_ops_t));
+
+ if (desc->pd_ops_vector->co_cipher_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_cipher_ops,
+ sizeof (crypto_cipher_ops_t));
+
+ if (desc->pd_ops_vector->co_mac_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_mac_ops,
+ sizeof (crypto_mac_ops_t));
+
+ if (desc->pd_ops_vector->co_sign_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_sign_ops,
+ sizeof (crypto_sign_ops_t));
+
+ if (desc->pd_ops_vector->co_verify_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_verify_ops,
+ sizeof (crypto_verify_ops_t));
+
+ if (desc->pd_ops_vector->co_dual_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_dual_ops,
+ sizeof (crypto_dual_ops_t));
+
+ if (desc->pd_ops_vector->co_dual_cipher_mac_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_dual_cipher_mac_ops,
+ sizeof (crypto_dual_cipher_mac_ops_t));
+
+ if (desc->pd_ops_vector->co_random_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_random_ops,
+ sizeof (crypto_random_number_ops_t));
+
+ if (desc->pd_ops_vector->co_session_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_session_ops,
+ sizeof (crypto_session_ops_t));
+
+ if (desc->pd_ops_vector->co_object_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_object_ops,
+ sizeof (crypto_object_ops_t));
+
+ if (desc->pd_ops_vector->co_key_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_key_ops,
+ sizeof (crypto_key_ops_t));
+
+ if (desc->pd_ops_vector->co_provider_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_provider_ops,
+ sizeof (crypto_provider_management_ops_t));
+
+ if (desc->pd_ops_vector->co_ctx_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_ctx_ops,
+ sizeof (crypto_ctx_ops_t));
+
+ if (desc->pd_ops_vector->co_mech_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_mech_ops,
+ sizeof (crypto_mech_ops_t));
+
+ if (desc->pd_ops_vector->co_nostore_key_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_nostore_key_ops,
+ sizeof (crypto_nostore_key_ops_t));
+
+ kmem_free(desc->pd_ops_vector, sizeof (crypto_ops_t));
+ }
+
+ if (desc->pd_mechanisms != NULL)
+ /* free the memory associated with the mechanism info's */
+ kmem_free(desc->pd_mechanisms, sizeof (crypto_mech_info_t) *
+ desc->pd_mech_list_count);
+
+ if (desc->pd_sched_info.ks_taskq != NULL)
+ taskq_destroy(desc->pd_sched_info.ks_taskq);
+
+ mutex_destroy(&desc->pd_lock);
+ cv_destroy(&desc->pd_resume_cv);
+ cv_destroy(&desc->pd_remove_cv);
+
+ kmem_free(desc, sizeof (kcf_provider_desc_t));
+}
+
+/*
+ * Returns an array of hardware and logical provider descriptors,
+ * a.k.a. the PKCS#11 slot list. A REFHOLD is done on each descriptor
+ * before the array is returned. The entire table can be freed by
+ * calling kcf_free_provider_tab().
+ */
+int
+kcf_get_slot_list(uint_t *count, kcf_provider_desc_t ***array,
+ boolean_t unverified)
+{
+ kcf_provider_desc_t *prov_desc;
+ kcf_provider_desc_t **p = NULL;
+ char *last;
+ uint_t cnt = 0;
+ uint_t i, j;
+ int rval = CRYPTO_SUCCESS;
+ size_t n, final_size;
+
+ /* count the providers */
+ mutex_enter(&prov_tab_mutex);
+ for (i = 0; i < KCF_MAX_PROVIDERS; i++) {
+ if ((prov_desc = prov_tab[i]) != NULL &&
+ ((prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (prov_desc->pd_flags & CRYPTO_HIDE_PROVIDER) == 0) ||
+ prov_desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)) {
+ if (KCF_IS_PROV_USABLE(prov_desc) ||
+ (unverified && KCF_IS_PROV_UNVERIFIED(prov_desc))) {
+ cnt++;
+ }
+ }
+ }
+ mutex_exit(&prov_tab_mutex);
+
+ if (cnt == 0)
+ goto out;
+
+ n = cnt * sizeof (kcf_provider_desc_t *);
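+/*
+ * The table is re-scanned below without prov_tab_mutex held across the
+ * allocation, so more providers may have registered since the count was
+ * taken. If the array fills up, double it and retry; any excess is
+ * trimmed to final_size afterwards.
+ */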
+again:
+ p = kmem_zalloc(n, KM_SLEEP);
+
+ /* pointer to last entry in the array */
+ last = (char *)&p[cnt-1];
+
+ mutex_enter(&prov_tab_mutex);
+ /* fill the slot list */
+ for (i = 0, j = 0; i < KCF_MAX_PROVIDERS; i++) {
+ if ((prov_desc = prov_tab[i]) != NULL &&
+ ((prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (prov_desc->pd_flags & CRYPTO_HIDE_PROVIDER) == 0) ||
+ prov_desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)) {
+ if (KCF_IS_PROV_USABLE(prov_desc) ||
+ (unverified && KCF_IS_PROV_UNVERIFIED(prov_desc))) {
+ if ((char *)&p[j] > last) {
+ mutex_exit(&prov_tab_mutex);
+ kcf_free_provider_tab(cnt, p);
+ n = n << 1;
+ cnt = cnt << 1;
+ goto again;
+ }
+ p[j++] = prov_desc;
+ KCF_PROV_REFHOLD(prov_desc);
+ }
+ }
+ }
+ mutex_exit(&prov_tab_mutex);
+
+ final_size = j * sizeof (kcf_provider_desc_t *);
+ cnt = j;
+ ASSERT(final_size <= n);
+
+ /* check if buffer we allocated is too large */
+ if (final_size < n) {
+ char *final_buffer = NULL;
+
+ if (final_size > 0) {
+ final_buffer = kmem_alloc(final_size, KM_SLEEP);
+ bcopy(p, final_buffer, final_size);
+ }
+ kmem_free(p, n);
+ p = (kcf_provider_desc_t **)final_buffer;
+ }
+out:
+ *count = cnt;
+ *array = p;
+ return (rval);
+}
+
+/*
+ * Free an array of provider descriptors. A REFRELE
+ * is done on each descriptor before the table is freed.
+ */
+void
+kcf_free_provider_tab(uint_t count, kcf_provider_desc_t **array)
+{
+ kcf_provider_desc_t *prov_desc;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ if ((prov_desc = array[i]) != NULL) {
+ KCF_PROV_REFRELE(prov_desc);
+ }
+ }
+ kmem_free(array, count * sizeof (kcf_provider_desc_t *));
+}
+
+/*
+ * Returns in the location pointed to by pd a pointer to the descriptor
+ * for the software provider for the specified mechanism.
+ * The provider descriptor is returned held and it is the caller's
+ * responsibility to release it when done. The mechanism entry
+ * is returned if the optional argument mep is non-NULL.
+ *
+ * Returns one of the CRYPTO_* error codes on failure, and
+ * CRYPTO_SUCCESS on success.
+ */
+int
+kcf_get_sw_prov(crypto_mech_type_t mech_type, kcf_provider_desc_t **pd,
+ kcf_mech_entry_t **mep, boolean_t log_warn)
+{
+ kcf_mech_entry_t *me;
+
+ /* get the mechanism entry for this mechanism */
+ if (kcf_get_mech_entry(mech_type, &me) != KCF_SUCCESS)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /*
+ * Get the software provider for this mechanism.
+ * Lock the mech_entry until we grab the 'pd'.
+ */
+ mutex_enter(&me->me_mutex);
+
+ if (me->me_sw_prov == NULL ||
+ (*pd = me->me_sw_prov->pm_prov_desc) == NULL) {
+ /* no SW provider for this mechanism */
+ if (log_warn)
+ cmn_err(CE_WARN, "no SW provider for \"%s\"\n",
+ me->me_name);
+ mutex_exit(&me->me_mutex);
+ return (CRYPTO_MECH_NOT_SUPPORTED);
+ }
+
+ KCF_PROV_REFHOLD(*pd);
+ mutex_exit(&me->me_mutex);
+
+ if (mep != NULL)
+ *mep = me;
+
+ return (CRYPTO_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/module/icp/core/kcf_sched.c b/sys/contrib/openzfs/module/icp/core/kcf_sched.c
new file mode 100644
index 000000000000..81fd15f8ea26
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/core/kcf_sched.c
@@ -0,0 +1,1780 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This file contains the core framework routines for the
+ * kernel cryptographic framework. These routines sit at the
+ * middle layer, between the kernel API/ioctls and the SPI.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+#include <sys/crypto/api.h>
+
+kcf_global_swq_t *gswq; /* Global software queue */
+
+/* Thread pool related variables */
+static kcf_pool_t *kcfpool; /* Thread pool of kcfd LWPs */
+int kcf_maxthreads = 2;
+int kcf_minthreads = 1;
+int kcf_thr_multiple = 2; /* Boot-time tunable for experimentation */
+static ulong_t kcf_idlethr_timeout;
+#define KCF_DEFAULT_THRTIMEOUT 60000000 /* 60 seconds */
+
+/* kmem caches used by the scheduler */
+static kmem_cache_t *kcf_sreq_cache;
+static kmem_cache_t *kcf_areq_cache;
+static kmem_cache_t *kcf_context_cache;
+
+/* Global request ID table */
+static kcf_reqid_table_t *kcf_reqid_table[REQID_TABLES];
+
+/* KCF stats. Not protected. */
+static kcf_stats_t kcf_ksdata = {
+ { "total threads in pool", KSTAT_DATA_UINT32},
+ { "idle threads in pool", KSTAT_DATA_UINT32},
+ { "min threads in pool", KSTAT_DATA_UINT32},
+ { "max threads in pool", KSTAT_DATA_UINT32},
+ { "requests in gswq", KSTAT_DATA_UINT32},
+ { "max requests in gswq", KSTAT_DATA_UINT32},
+ { "threads for HW taskq", KSTAT_DATA_UINT32},
+ { "minalloc for HW taskq", KSTAT_DATA_UINT32},
+ { "maxalloc for HW taskq", KSTAT_DATA_UINT32}
+};
+
+static kstat_t *kcf_misc_kstat = NULL;
+ulong_t kcf_swprov_hndl = 0;
+
+static kcf_areq_node_t *kcf_areqnode_alloc(kcf_provider_desc_t *,
+ kcf_context_t *, crypto_call_req_t *, kcf_req_params_t *, boolean_t);
+static int kcf_disp_sw_request(kcf_areq_node_t *);
+static void process_req_hwp(void *);
+static int kcf_enqueue(kcf_areq_node_t *);
+static void kcfpool_alloc(void);
+static void kcf_reqid_delete(kcf_areq_node_t *areq);
+static crypto_req_id_t kcf_reqid_insert(kcf_areq_node_t *areq);
+static int kcf_misc_kstat_update(kstat_t *ksp, int rw);
+
+/*
+ * Create a new context. The allocation may sleep only in the
+ * synchronous case (crq == NULL); otherwise KM_NOSLEEP is used
+ * and NULL may be returned.
+ */
+crypto_ctx_t *
+kcf_new_ctx(crypto_call_req_t *crq, kcf_provider_desc_t *pd,
+ crypto_session_id_t sid)
+{
+ crypto_ctx_t *ctx;
+ kcf_context_t *kcf_ctx;
+
+ kcf_ctx = kmem_cache_alloc(kcf_context_cache,
+ (crq == NULL) ? KM_SLEEP : KM_NOSLEEP);
+ if (kcf_ctx == NULL)
+ return (NULL);
+
+ /* initialize the context for the consumer */
+ kcf_ctx->kc_refcnt = 1;
+ kcf_ctx->kc_req_chain_first = NULL;
+ kcf_ctx->kc_req_chain_last = NULL;
+ kcf_ctx->kc_secondctx = NULL;
+ KCF_PROV_REFHOLD(pd);
+ kcf_ctx->kc_prov_desc = pd;
+ kcf_ctx->kc_sw_prov_desc = NULL;
+ kcf_ctx->kc_mech = NULL;
+
+ ctx = &kcf_ctx->kc_glbl_ctx;
+ ctx->cc_provider = pd->pd_prov_handle;
+ ctx->cc_session = sid;
+ ctx->cc_provider_private = NULL;
+ ctx->cc_framework_private = (void *)kcf_ctx;
+ ctx->cc_flags = 0;
+ ctx->cc_opstate = NULL;
+
+ return (ctx);
+}
+
+/*
+ * Allocate a new async request node.
+ *
+ * ictx - Framework private context pointer
+ * crq - Has the callback function and argument. Must be non-NULL.
+ * req - The parameters to pass to the SPI
+ */
+static kcf_areq_node_t *
+kcf_areqnode_alloc(kcf_provider_desc_t *pd, kcf_context_t *ictx,
+ crypto_call_req_t *crq, kcf_req_params_t *req, boolean_t isdual)
+{
+ kcf_areq_node_t *arptr, *areq;
+
+ ASSERT(crq != NULL);
+ arptr = kmem_cache_alloc(kcf_areq_cache, KM_NOSLEEP);
+ if (arptr == NULL)
+ return (NULL);
+
+ arptr->an_state = REQ_ALLOCATED;
+ arptr->an_reqarg = *crq;
+ arptr->an_params = *req;
+ arptr->an_context = ictx;
+ arptr->an_isdual = isdual;
+
+ arptr->an_next = arptr->an_prev = NULL;
+ KCF_PROV_REFHOLD(pd);
+ arptr->an_provider = pd;
+ arptr->an_tried_plist = NULL;
+ arptr->an_refcnt = 1;
+ arptr->an_idnext = arptr->an_idprev = NULL;
+
+ /*
+ * Requests for context-less operations do not use the
+ * an_is_my_turn and an_ctxchain_next fields.
+ */
+ if (ictx == NULL)
+ return (arptr);
+
+ KCF_CONTEXT_REFHOLD(ictx);
+ /*
+ * Chain this request to the context.
+ */
+ mutex_enter(&ictx->kc_in_use_lock);
+ arptr->an_ctxchain_next = NULL;
+ if ((areq = ictx->kc_req_chain_last) == NULL) {
+ arptr->an_is_my_turn = B_TRUE;
+ ictx->kc_req_chain_last =
+ ictx->kc_req_chain_first = arptr;
+ } else {
+ ASSERT(ictx->kc_req_chain_first != NULL);
+ arptr->an_is_my_turn = B_FALSE;
+ /* Insert the new request to the end of the chain. */
+ areq->an_ctxchain_next = arptr;
+ ictx->kc_req_chain_last = arptr;
+ }
+ mutex_exit(&ictx->kc_in_use_lock);
+
+ return (arptr);
+}
+
+/*
+ * Queue the request node and do one of the following:
+ * - If there is an idle thread signal it to run.
+ * - If there is no idle thread and max running threads is not
+ * reached, signal the creator thread for more threads.
+ *
+ * If the two conditions above are not met, we don't need to do
+ * anything. The request will be picked up by one of the
+ * worker threads when it becomes available.
+ */
+static int
+kcf_disp_sw_request(kcf_areq_node_t *areq)
+{
+ int err;
+ int cnt = 0;
+
+ if ((err = kcf_enqueue(areq)) != 0)
+ return (err);
+
+ if (kcfpool->kp_idlethreads > 0) {
+ /* Signal an idle thread to run */
+ mutex_enter(&gswq->gs_lock);
+ cv_signal(&gswq->gs_cv);
+ mutex_exit(&gswq->gs_lock);
+
+ return (CRYPTO_QUEUED);
+ }
+
+ /*
+ * We keep the number of running threads at kcf_minthreads
+ * to reduce gs_lock contention.
+ */
+ cnt = kcf_minthreads -
+ (kcfpool->kp_threads - kcfpool->kp_blockedthreads);
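+ /*
+ * i.e. the number of extra threads needed to bring the count of
+ * runnable (non-blocked) threads up to kcf_minthreads.
+ */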
+ if (cnt > 0) {
+ /*
+ * The following ensures the number of threads in pool
+ * does not exceed kcf_maxthreads.
+ */
+ cnt = MIN(cnt, kcf_maxthreads - (int)kcfpool->kp_threads);
+ if (cnt > 0) {
+ /* Signal the creator thread for more threads */
+ mutex_enter(&kcfpool->kp_user_lock);
+ if (!kcfpool->kp_signal_create_thread) {
+ kcfpool->kp_signal_create_thread = B_TRUE;
+ kcfpool->kp_nthrs = cnt;
+ cv_signal(&kcfpool->kp_user_cv);
+ }
+ mutex_exit(&kcfpool->kp_user_lock);
+ }
+ }
+
+ return (CRYPTO_QUEUED);
+}
+
+/*
+ * This routine is called by the taskq associated with
+ * each hardware provider. We notify the kernel consumer
+ * via the callback routine in case of CRYPTO_SUCCESS or
+ * a failure.
+ *
+ * A request can be of type kcf_areq_node_t or of type
+ * kcf_sreq_node_t.
+ */
+static void
+process_req_hwp(void *ireq)
+{
+ int error = 0;
+ crypto_ctx_t *ctx;
+ kcf_call_type_t ctype;
+ kcf_provider_desc_t *pd;
+ kcf_areq_node_t *areq = (kcf_areq_node_t *)ireq;
+ kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)ireq;
+
+ pd = ((ctype = GET_REQ_TYPE(ireq)) == CRYPTO_SYNCH) ?
+ sreq->sn_provider : areq->an_provider;
+
+ /*
+ * Wait if flow control is in effect for the provider. A
+ * CRYPTO_PROVIDER_READY or CRYPTO_PROVIDER_FAILED
+ * notification will signal us. We also get signaled if
+ * the provider is unregistering.
+ */
+ if (pd->pd_state == KCF_PROV_BUSY) {
+ mutex_enter(&pd->pd_lock);
+ while (pd->pd_state == KCF_PROV_BUSY)
+ cv_wait(&pd->pd_resume_cv, &pd->pd_lock);
+ mutex_exit(&pd->pd_lock);
+ }
+
+ /*
+ * Bump the internal reference count while the request is being
+ * processed. This is how we know when it's safe to unregister
+ * a provider. This step must precede the pd_state check below.
+ */
+ KCF_PROV_IREFHOLD(pd);
+
+ /*
+ * Fail the request if the provider has failed. We return a
+ * recoverable error and the notified clients attempt any
+ * recovery. For async clients this is done in kcf_aop_done()
+ * and for sync clients it is done in the k-api routines.
+ */
+ if (pd->pd_state >= KCF_PROV_FAILED) {
+ error = CRYPTO_DEVICE_ERROR;
+ goto bail;
+ }
+
+ if (ctype == CRYPTO_SYNCH) {
+ mutex_enter(&sreq->sn_lock);
+ sreq->sn_state = REQ_INPROGRESS;
+ mutex_exit(&sreq->sn_lock);
+
+ ctx = sreq->sn_context ? &sreq->sn_context->kc_glbl_ctx : NULL;
+ error = common_submit_request(sreq->sn_provider, ctx,
+ sreq->sn_params, sreq);
+ } else {
+ kcf_context_t *ictx;
+ ASSERT(ctype == CRYPTO_ASYNCH);
+
+ /*
+ * We are in the per-hardware provider thread context and
+ * hence can sleep. Note that the caller would have done
+ * a taskq_dispatch(..., TQ_NOSLEEP) and would have returned.
+ */
+ ctx = (ictx = areq->an_context) ? &ictx->kc_glbl_ctx : NULL;
+
+ mutex_enter(&areq->an_lock);
+ /*
+ * We need to maintain ordering for multi-part requests.
+ * an_is_my_turn is set to B_TRUE initially for a request
+ * when it is enqueued and there are no other requests
+ * for that context. It is set later from kcf_aop_done() when
+ * the request before us in the chain of requests for the
+ * context completes. We get signaled at that point.
+ */
+ if (ictx != NULL) {
+ ASSERT(ictx->kc_prov_desc == areq->an_provider);
+
+ while (areq->an_is_my_turn == B_FALSE) {
+ cv_wait(&areq->an_turn_cv, &areq->an_lock);
+ }
+ }
+ areq->an_state = REQ_INPROGRESS;
+ mutex_exit(&areq->an_lock);
+
+ error = common_submit_request(areq->an_provider, ctx,
+ &areq->an_params, areq);
+ }
+
+bail:
+ if (error == CRYPTO_QUEUED) {
+ /*
+ * The request is queued by the provider and we should
+ * get a crypto_op_notification() from the provider later.
+ * We notify the consumer at that time.
+ */
+ return;
+ } else { /* CRYPTO_SUCCESS or other failure */
+ KCF_PROV_IREFRELE(pd);
+ if (ctype == CRYPTO_SYNCH)
+ kcf_sop_done(sreq, error);
+ else
+ kcf_aop_done(areq, error);
+ }
+}
+
+/*
+ * This routine checks if a request can be retried on another
+ * provider. If true, mech1 is initialized to point to the mechanism
+ * structure. mech2 is also initialized in case of a dual operation. fg
+ * is initialized to the correct crypto_func_group_t bit flag. They are
+ * initialized by this routine, so that the caller can pass them to a
+ * kcf_get_mech_provider() or kcf_get_dual_provider() with no further change.
+ *
+ * We check that the request is for an init or atomic routine and that
+ * it is for one of the operation groups used from the k-api.
+ */
+static boolean_t
+can_resubmit(kcf_areq_node_t *areq, crypto_mechanism_t **mech1,
+ crypto_mechanism_t **mech2, crypto_func_group_t *fg)
+{
+ kcf_req_params_t *params;
+ kcf_op_type_t optype;
+
+ params = &areq->an_params;
+ optype = params->rp_optype;
+
+ if (!(IS_INIT_OP(optype) || IS_ATOMIC_OP(optype)))
+ return (B_FALSE);
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_DIGEST: {
+ kcf_digest_ops_params_t *dops = &params->rp_u.digest_params;
+
+ dops->do_mech.cm_type = dops->do_framework_mechtype;
+ *mech1 = &dops->do_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_DIGEST :
+ CRYPTO_FG_DIGEST_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_MAC: {
+ kcf_mac_ops_params_t *mops = &params->rp_u.mac_params;
+
+ mops->mo_mech.cm_type = mops->mo_framework_mechtype;
+ *mech1 = &mops->mo_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_MAC :
+ CRYPTO_FG_MAC_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_SIGN: {
+ kcf_sign_ops_params_t *sops = &params->rp_u.sign_params;
+
+ sops->so_mech.cm_type = sops->so_framework_mechtype;
+ *mech1 = &sops->so_mech;
+ switch (optype) {
+ case KCF_OP_INIT:
+ *fg = CRYPTO_FG_SIGN;
+ break;
+ case KCF_OP_ATOMIC:
+ *fg = CRYPTO_FG_SIGN_ATOMIC;
+ break;
+ default:
+ ASSERT(optype == KCF_OP_SIGN_RECOVER_ATOMIC);
+ *fg = CRYPTO_FG_SIGN_RECOVER_ATOMIC;
+ }
+ break;
+ }
+
+ case KCF_OG_VERIFY: {
+ kcf_verify_ops_params_t *vops = &params->rp_u.verify_params;
+
+ vops->vo_mech.cm_type = vops->vo_framework_mechtype;
+ *mech1 = &vops->vo_mech;
+ switch (optype) {
+ case KCF_OP_INIT:
+ *fg = CRYPTO_FG_VERIFY;
+ break;
+ case KCF_OP_ATOMIC:
+ *fg = CRYPTO_FG_VERIFY_ATOMIC;
+ break;
+ default:
+ ASSERT(optype == KCF_OP_VERIFY_RECOVER_ATOMIC);
+ *fg = CRYPTO_FG_VERIFY_RECOVER_ATOMIC;
+ }
+ break;
+ }
+
+ case KCF_OG_ENCRYPT: {
+ kcf_encrypt_ops_params_t *eops = &params->rp_u.encrypt_params;
+
+ eops->eo_mech.cm_type = eops->eo_framework_mechtype;
+ *mech1 = &eops->eo_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_ENCRYPT :
+ CRYPTO_FG_ENCRYPT_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_DECRYPT: {
+ kcf_decrypt_ops_params_t *dcrops = &params->rp_u.decrypt_params;
+
+ dcrops->dop_mech.cm_type = dcrops->dop_framework_mechtype;
+ *mech1 = &dcrops->dop_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_DECRYPT :
+ CRYPTO_FG_DECRYPT_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_ENCRYPT_MAC: {
+ kcf_encrypt_mac_ops_params_t *eops =
+ &params->rp_u.encrypt_mac_params;
+
+ eops->em_encr_mech.cm_type = eops->em_framework_encr_mechtype;
+ *mech1 = &eops->em_encr_mech;
+ eops->em_mac_mech.cm_type = eops->em_framework_mac_mechtype;
+ *mech2 = &eops->em_mac_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_ENCRYPT_MAC :
+ CRYPTO_FG_ENCRYPT_MAC_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_MAC_DECRYPT: {
+ kcf_mac_decrypt_ops_params_t *dops =
+ &params->rp_u.mac_decrypt_params;
+
+ dops->md_mac_mech.cm_type = dops->md_framework_mac_mechtype;
+ *mech1 = &dops->md_mac_mech;
+ dops->md_decr_mech.cm_type = dops->md_framework_decr_mechtype;
+ *mech2 = &dops->md_decr_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_MAC_DECRYPT :
+ CRYPTO_FG_MAC_DECRYPT_ATOMIC;
+ break;
+ }
+
+ default:
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * This routine is called when a request to a provider has failed
+ * with a recoverable error. This routine tries to find another provider
+ * and dispatches the request to the new provider, if one is available.
+ * We reuse the request structure.
+ *
+ * A return value of NULL from kcf_get_mech_provider() indicates
+ * we have tried the last provider.
+ */
+static int
+kcf_resubmit_request(kcf_areq_node_t *areq)
+{
+ int error = CRYPTO_FAILED;
+ kcf_context_t *ictx;
+ kcf_provider_desc_t *old_pd;
+ kcf_provider_desc_t *new_pd;
+ crypto_mechanism_t *mech1 = NULL, *mech2 = NULL;
+ crypto_mech_type_t prov_mt1, prov_mt2;
+ crypto_func_group_t fg = 0;
+
+ if (!can_resubmit(areq, &mech1, &mech2, &fg))
+ return (error);
+
+ old_pd = areq->an_provider;
+ /*
+ * Add old_pd to the list of providers already tried. We release
+ * the hold on old_pd (from the earlier kcf_get_mech_provider()) in
+ * kcf_free_triedlist().
+ */
+ if (kcf_insert_triedlist(&areq->an_tried_plist, old_pd,
+ KM_NOSLEEP) == NULL)
+ return (error);
+
+ if (mech1 && !mech2) {
+ new_pd = kcf_get_mech_provider(mech1->cm_type, NULL, &error,
+ areq->an_tried_plist, fg,
+ (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), 0);
+ } else {
+ ASSERT(mech1 != NULL && mech2 != NULL);
+
+ new_pd = kcf_get_dual_provider(mech1, mech2, NULL, &prov_mt1,
+ &prov_mt2, &error, areq->an_tried_plist, fg, fg,
+ (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), 0);
+ }
+
+ if (new_pd == NULL)
+ return (error);
+
+ /*
+ * We reuse the old context by resetting provider specific
+ * fields in it.
+ */
+ if ((ictx = areq->an_context) != NULL) {
+ crypto_ctx_t *ctx;
+
+ ASSERT(old_pd == ictx->kc_prov_desc);
+ KCF_PROV_REFRELE(ictx->kc_prov_desc);
+ KCF_PROV_REFHOLD(new_pd);
+ ictx->kc_prov_desc = new_pd;
+
+ ctx = &ictx->kc_glbl_ctx;
+ ctx->cc_provider = new_pd->pd_prov_handle;
+ ctx->cc_session = new_pd->pd_sid;
+ ctx->cc_provider_private = NULL;
+ }
+
+ /* We reuse areq by resetting the provider and context fields. */
+ KCF_PROV_REFRELE(old_pd);
+ KCF_PROV_REFHOLD(new_pd);
+ areq->an_provider = new_pd;
+ mutex_enter(&areq->an_lock);
+ areq->an_state = REQ_WAITING;
+ mutex_exit(&areq->an_lock);
+
+ switch (new_pd->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ error = kcf_disp_sw_request(areq);
+ break;
+
+ case CRYPTO_HW_PROVIDER: {
+ taskq_t *taskq = new_pd->pd_sched_info.ks_taskq;
+
+ if (taskq_dispatch(taskq, process_req_hwp, areq, TQ_NOSLEEP) ==
+ TASKQID_INVALID) {
+ error = CRYPTO_HOST_MEMORY;
+ } else {
+ error = CRYPTO_QUEUED;
+ }
+
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ return (error);
+}
+
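+/*
+ * Returns non-zero when the taskq has no pending entries. The in-kernel
+ * check compares dispatch ids; the userspace variant checks the task
+ * list and active count instead.
+ */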
+static inline int
+EMPTY_TASKQ(taskq_t *tq)
+{
+#ifdef _KERNEL
+ return (tq->tq_lowest_id == tq->tq_next_id);
+#else
+ return (tq->tq_task.tqent_next == &tq->tq_task || tq->tq_active == 0);
+#endif
+}
+
+/*
+ * Routine called by both ioctl and k-api. The consumer should
+ * bundle the parameters into a kcf_req_params_t structure. A bunch
+ * of macros are available in ops_impl.h for this bundling. They are:
+ *
+ * KCF_WRAP_DIGEST_OPS_PARAMS()
+ * KCF_WRAP_MAC_OPS_PARAMS()
+ * KCF_WRAP_ENCRYPT_OPS_PARAMS()
+ * KCF_WRAP_DECRYPT_OPS_PARAMS() ... etc.
+ *
+ * It is the caller's responsibility to free the ctx argument when
+ * appropriate. See the KCF_CONTEXT_COND_RELEASE macro for details.
+ */
+int
+kcf_submit_request(kcf_provider_desc_t *pd, crypto_ctx_t *ctx,
+ crypto_call_req_t *crq, kcf_req_params_t *params, boolean_t cont)
+{
+ int error = CRYPTO_SUCCESS;
+ kcf_areq_node_t *areq;
+ kcf_sreq_node_t *sreq;
+ kcf_context_t *kcf_ctx;
+ taskq_t *taskq = pd->pd_sched_info.ks_taskq;
+
+ kcf_ctx = ctx ? (kcf_context_t *)ctx->cc_framework_private : NULL;
+
+ /* Synchronous cases */
+ if (crq == NULL) {
+ switch (pd->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ error = common_submit_request(pd, ctx, params,
+ KCF_RHNDL(KM_SLEEP));
+ break;
+
+ case CRYPTO_HW_PROVIDER:
+ /*
+ * Special case for CRYPTO_SYNCHRONOUS providers that
+ * never return a CRYPTO_QUEUED error. We skip any
+ * request allocation and call the SPI directly.
+ */
+ if ((pd->pd_flags & CRYPTO_SYNCHRONOUS) &&
+ EMPTY_TASKQ(taskq)) {
+ KCF_PROV_IREFHOLD(pd);
+ if (pd->pd_state == KCF_PROV_READY) {
+ error = common_submit_request(pd, ctx,
+ params, KCF_RHNDL(KM_SLEEP));
+ KCF_PROV_IREFRELE(pd);
+ ASSERT(error != CRYPTO_QUEUED);
+ break;
+ }
+ KCF_PROV_IREFRELE(pd);
+ }
+
+ sreq = kmem_cache_alloc(kcf_sreq_cache, KM_SLEEP);
+ sreq->sn_state = REQ_ALLOCATED;
+ sreq->sn_rv = CRYPTO_FAILED;
+ sreq->sn_params = params;
+
+ /*
+ * Note that we do not need to hold the context
+ * for synchronous case as the context will never
+ * become invalid underneath us. We do not need to hold
+ * the provider here either as the caller has a hold.
+ */
+ sreq->sn_context = kcf_ctx;
+ ASSERT(KCF_PROV_REFHELD(pd));
+ sreq->sn_provider = pd;
+
+ ASSERT(taskq != NULL);
+ /*
+ * Call the SPI directly if the taskq is empty and the
+ * provider is not busy, else dispatch to the taskq.
+ * Calling directly is fine as this is the synchronous
+ * case. This is unlike the asynchronous case where we
+ * must always dispatch to the taskq.
+ */
+ if (EMPTY_TASKQ(taskq) &&
+ pd->pd_state == KCF_PROV_READY) {
+ process_req_hwp(sreq);
+ } else {
+ /*
+ * We cannot tell from taskq_dispatch() return
+ * value if we exceeded maxalloc. Hence the
+ * check here. Since we are allowed to wait in
+ * the synchronous case, we wait for the taskq
+ * to become empty.
+ */
+ if (taskq->tq_nalloc >= crypto_taskq_maxalloc) {
+ taskq_wait(taskq);
+ }
+
+ (void) taskq_dispatch(taskq, process_req_hwp,
+ sreq, TQ_SLEEP);
+ }
+
+ /*
+ * Wait for the notification to arrive,
+ * if the operation is not done yet.
+ * Bug# 4722589 will make the wait a cv_wait_sig().
+ */
+ mutex_enter(&sreq->sn_lock);
+ while (sreq->sn_state < REQ_DONE)
+ cv_wait(&sreq->sn_cv, &sreq->sn_lock);
+ mutex_exit(&sreq->sn_lock);
+
+ error = sreq->sn_rv;
+ kmem_cache_free(kcf_sreq_cache, sreq);
+
+ break;
+
+ default:
+ error = CRYPTO_FAILED;
+ break;
+ }
+
+ } else { /* Asynchronous cases */
+ switch (pd->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ if (!(crq->cr_flag & CRYPTO_ALWAYS_QUEUE)) {
+ /*
+ * This case has less overhead since there is
+ * no switching of context.
+ */
+ error = common_submit_request(pd, ctx, params,
+ KCF_RHNDL(KM_NOSLEEP));
+ } else {
+ /*
+ * CRYPTO_ALWAYS_QUEUE is set. We need to
+ * queue the request and return.
+ */
+ areq = kcf_areqnode_alloc(pd, kcf_ctx, crq,
+ params, cont);
+ if (areq == NULL)
+ error = CRYPTO_HOST_MEMORY;
+ else {
+ if (!(crq->cr_flag
+ & CRYPTO_SKIP_REQID)) {
+ /*
+ * Set the request handle. This handle
+ * is used for any crypto_cancel_req(9f)
+ * calls from the consumer. We have to
+ * do this before dispatching the
+ * request.
+ */
+ crq->cr_reqid = kcf_reqid_insert(areq);
+ }
+
+ error = kcf_disp_sw_request(areq);
+ /*
+ * There is an error processing this
+ * request. Remove the handle and
+ * release the request structure.
+ */
+ if (error != CRYPTO_QUEUED) {
+ if (!(crq->cr_flag
+ & CRYPTO_SKIP_REQID))
+ kcf_reqid_delete(areq);
+ KCF_AREQ_REFRELE(areq);
+ }
+ }
+ }
+ break;
+
+ case CRYPTO_HW_PROVIDER:
+ /*
+ * We need to queue the request and return.
+ */
+ areq = kcf_areqnode_alloc(pd, kcf_ctx, crq, params,
+ cont);
+ if (areq == NULL) {
+ error = CRYPTO_HOST_MEMORY;
+ goto done;
+ }
+
+ ASSERT(taskq != NULL);
+ /*
+ * We cannot tell from taskq_dispatch() return
+ * value if we exceeded maxalloc. Hence the check
+ * here.
+ */
+ if (taskq->tq_nalloc >= crypto_taskq_maxalloc) {
+ error = CRYPTO_BUSY;
+ KCF_AREQ_REFRELE(areq);
+ goto done;
+ }
+
+ if (!(crq->cr_flag & CRYPTO_SKIP_REQID)) {
+ /*
+ * Set the request handle. This handle is used
+ * for any crypto_cancel_req(9f) calls from the
+ * consumer. We have to do this before dispatching
+ * the request.
+ */
+ crq->cr_reqid = kcf_reqid_insert(areq);
+ }
+
+ if (taskq_dispatch(taskq,
+ process_req_hwp, areq, TQ_NOSLEEP) ==
+ TASKQID_INVALID) {
+ error = CRYPTO_HOST_MEMORY;
+ if (!(crq->cr_flag & CRYPTO_SKIP_REQID))
+ kcf_reqid_delete(areq);
+ KCF_AREQ_REFRELE(areq);
+ } else {
+ error = CRYPTO_QUEUED;
+ }
+ break;
+
+ default:
+ error = CRYPTO_FAILED;
+ break;
+ }
+ }
+
+done:
+ return (error);
+}
+
+/*
+ * We're done with this framework context, so free it. Note that freeing
+ * framework context (kcf_context) frees the global context (crypto_ctx).
+ *
+ * The provider is responsible for freeing provider private context after a
+ * final or single operation and resetting the cc_provider_private field
+ * to NULL. It should do this before it notifies the framework of the
+ * completion. We still need to call KCF_PROV_FREE_CONTEXT to handle cases
+ * like crypto_cancel_ctx(9f).
+ */
+void
+kcf_free_context(kcf_context_t *kcf_ctx)
+{
+ kcf_provider_desc_t *pd = kcf_ctx->kc_prov_desc;
+ crypto_ctx_t *gctx = &kcf_ctx->kc_glbl_ctx;
+ kcf_context_t *kcf_secondctx = kcf_ctx->kc_secondctx;
+
+ /* Release the second context, if any */
+
+ if (kcf_secondctx != NULL)
+ KCF_CONTEXT_REFRELE(kcf_secondctx);
+
+ if (gctx->cc_provider_private != NULL) {
+ mutex_enter(&pd->pd_lock);
+ if (!KCF_IS_PROV_REMOVED(pd)) {
+ /*
+ * Increment the provider's internal refcnt so it
+ * doesn't unregister from the framework while
+ * we're calling the entry point.
+ */
+ KCF_PROV_IREFHOLD(pd);
+ mutex_exit(&pd->pd_lock);
+ (void) KCF_PROV_FREE_CONTEXT(pd, gctx);
+ KCF_PROV_IREFRELE(pd);
+ } else {
+ mutex_exit(&pd->pd_lock);
+ }
+ }
+
+ /* kcf_ctx->kc_prov_desc has a hold on pd */
+ KCF_PROV_REFRELE(kcf_ctx->kc_prov_desc);
+
+ /* check if this context is shared with a software provider */
+ if ((gctx->cc_flags & CRYPTO_INIT_OPSTATE) &&
+ kcf_ctx->kc_sw_prov_desc != NULL) {
+ KCF_PROV_REFRELE(kcf_ctx->kc_sw_prov_desc);
+ }
+
+ kmem_cache_free(kcf_context_cache, kcf_ctx);
+}
+
+/*
+ * Free the request after releasing all the holds.
+ */
+void
+kcf_free_req(kcf_areq_node_t *areq)
+{
+ KCF_PROV_REFRELE(areq->an_provider);
+ if (areq->an_context != NULL)
+ KCF_CONTEXT_REFRELE(areq->an_context);
+
+ if (areq->an_tried_plist != NULL)
+ kcf_free_triedlist(areq->an_tried_plist);
+ kmem_cache_free(kcf_areq_cache, areq);
+}
+
+/*
+ * Utility routine to remove a request from the chain of requests
+ * hanging off a context.
+ */
+static void
+kcf_removereq_in_ctxchain(kcf_context_t *ictx, kcf_areq_node_t *areq)
+{
+ kcf_areq_node_t *cur, *prev;
+
+ /*
+ * Get context lock, search for areq in the chain and remove it.
+ */
+ ASSERT(ictx != NULL);
+ mutex_enter(&ictx->kc_in_use_lock);
+ prev = cur = ictx->kc_req_chain_first;
+
+ while (cur != NULL) {
+ if (cur == areq) {
+ if (prev == cur) {
+ if ((ictx->kc_req_chain_first =
+ cur->an_ctxchain_next) == NULL)
+ ictx->kc_req_chain_last = NULL;
+ } else {
+ if (cur == ictx->kc_req_chain_last)
+ ictx->kc_req_chain_last = prev;
+ prev->an_ctxchain_next = cur->an_ctxchain_next;
+ }
+
+ break;
+ }
+ prev = cur;
+ cur = cur->an_ctxchain_next;
+ }
+ mutex_exit(&ictx->kc_in_use_lock);
+}
+
+/*
+ * Remove the specified node from the global software queue.
+ *
+ * The caller must hold the queue lock and request lock (an_lock).
+ */
+static void
+kcf_remove_node(kcf_areq_node_t *node)
+{
+ kcf_areq_node_t *nextp = node->an_next;
+ kcf_areq_node_t *prevp = node->an_prev;
+
+ if (nextp != NULL)
+ nextp->an_prev = prevp;
+ else
+ gswq->gs_last = prevp;
+
+ if (prevp != NULL)
+ prevp->an_next = nextp;
+ else
+ gswq->gs_first = nextp;
+
+ node->an_state = REQ_CANCELED;
+}
+
+/*
+ * Add the request node to the end of the global software queue.
+ *
+ * The caller should not hold the queue lock. Returns 0 if the
+ * request is successfully queued. Returns CRYPTO_BUSY if the limit
+ * on the number of jobs is exceeded.
+ */
+static int
+kcf_enqueue(kcf_areq_node_t *node)
+{
+ kcf_areq_node_t *tnode;
+
+ mutex_enter(&gswq->gs_lock);
+
+ if (gswq->gs_njobs >= gswq->gs_maxjobs) {
+ mutex_exit(&gswq->gs_lock);
+ return (CRYPTO_BUSY);
+ }
+
+ if (gswq->gs_last == NULL) {
+ gswq->gs_first = gswq->gs_last = node;
+ } else {
+ ASSERT(gswq->gs_last->an_next == NULL);
+ tnode = gswq->gs_last;
+ tnode->an_next = node;
+ gswq->gs_last = node;
+ node->an_prev = tnode;
+ }
+
+ gswq->gs_njobs++;
+
+ /* an_lock not needed here as we hold gs_lock */
+ node->an_state = REQ_WAITING;
+
+ mutex_exit(&gswq->gs_lock);
+
+ return (0);
+}
+
+/*
+ * kmem_cache_alloc constructor for sync request structure.
+ */
+/* ARGSUSED */
+static int
+kcf_sreq_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)buf;
+
+ sreq->sn_type = CRYPTO_SYNCH;
+ cv_init(&sreq->sn_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&sreq->sn_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+kcf_sreq_cache_destructor(void *buf, void *cdrarg)
+{
+ kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)buf;
+
+ mutex_destroy(&sreq->sn_lock);
+ cv_destroy(&sreq->sn_cv);
+}
+
+/*
+ * kmem_cache_alloc constructor for async request structure.
+ */
+/* ARGSUSED */
+static int
+kcf_areq_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ kcf_areq_node_t *areq = (kcf_areq_node_t *)buf;
+
+ areq->an_type = CRYPTO_ASYNCH;
+ areq->an_refcnt = 0;
+ mutex_init(&areq->an_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&areq->an_done, NULL, CV_DEFAULT, NULL);
+ cv_init(&areq->an_turn_cv, NULL, CV_DEFAULT, NULL);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+kcf_areq_cache_destructor(void *buf, void *cdrarg)
+{
+ kcf_areq_node_t *areq = (kcf_areq_node_t *)buf;
+
+ ASSERT(areq->an_refcnt == 0);
+ mutex_destroy(&areq->an_lock);
+ cv_destroy(&areq->an_done);
+ cv_destroy(&areq->an_turn_cv);
+}
+
+/*
+ * kmem_cache_alloc constructor for kcf_context structure.
+ */
+/* ARGSUSED */
+static int
+kcf_context_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ kcf_context_t *kctx = (kcf_context_t *)buf;
+
+ kctx->kc_refcnt = 0;
+ mutex_init(&kctx->kc_in_use_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+kcf_context_cache_destructor(void *buf, void *cdrarg)
+{
+ kcf_context_t *kctx = (kcf_context_t *)buf;
+
+ ASSERT(kctx->kc_refcnt == 0);
+ mutex_destroy(&kctx->kc_in_use_lock);
+}
+
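+/*
+ * Tear down everything kcf_sched_init() sets up: the kstat, the thread
+ * pool, the request-id tables, the global software queue, the kmem
+ * caches, and the notification list lock and CV.
+ */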
+void
+kcf_sched_destroy(void)
+{
+ int i;
+
+ if (kcf_misc_kstat)
+ kstat_delete(kcf_misc_kstat);
+
+ if (kcfpool) {
+ mutex_destroy(&kcfpool->kp_thread_lock);
+ cv_destroy(&kcfpool->kp_nothr_cv);
+ mutex_destroy(&kcfpool->kp_user_lock);
+ cv_destroy(&kcfpool->kp_user_cv);
+
+ kmem_free(kcfpool, sizeof (kcf_pool_t));
+ }
+
+ for (i = 0; i < REQID_TABLES; i++) {
+ if (kcf_reqid_table[i]) {
+ mutex_destroy(&(kcf_reqid_table[i]->rt_lock));
+ kmem_free(kcf_reqid_table[i],
+ sizeof (kcf_reqid_table_t));
+ }
+ }
+
+ if (gswq) {
+ mutex_destroy(&gswq->gs_lock);
+ cv_destroy(&gswq->gs_cv);
+ kmem_free(gswq, sizeof (kcf_global_swq_t));
+ }
+
+ if (kcf_context_cache)
+ kmem_cache_destroy(kcf_context_cache);
+ if (kcf_areq_cache)
+ kmem_cache_destroy(kcf_areq_cache);
+ if (kcf_sreq_cache)
+ kmem_cache_destroy(kcf_sreq_cache);
+
+ mutex_destroy(&ntfy_list_lock);
+ cv_destroy(&ntfy_list_cv);
+}
+
+/*
+ * Creates and initializes all the structures needed by the framework.
+ */
+void
+kcf_sched_init(void)
+{
+ int i;
+ kcf_reqid_table_t *rt;
+
+ /*
+ * Create all the kmem caches needed by the framework. We set the
+ * align argument to 64 to get a slab aligned to a 64-byte boundary
+ * and to make the objects (cache_chunksize) a 64-byte multiple.
+ * This helps avoid false sharing, as this is the size of a
+ * CPU cache line.
+ */
+ kcf_sreq_cache = kmem_cache_create("kcf_sreq_cache",
+ sizeof (struct kcf_sreq_node), 64, kcf_sreq_cache_constructor,
+ kcf_sreq_cache_destructor, NULL, NULL, NULL, 0);
+
+ kcf_areq_cache = kmem_cache_create("kcf_areq_cache",
+ sizeof (struct kcf_areq_node), 64, kcf_areq_cache_constructor,
+ kcf_areq_cache_destructor, NULL, NULL, NULL, 0);
+
+ kcf_context_cache = kmem_cache_create("kcf_context_cache",
+ sizeof (struct kcf_context), 64, kcf_context_cache_constructor,
+ kcf_context_cache_destructor, NULL, NULL, NULL, 0);
+
+ gswq = kmem_alloc(sizeof (kcf_global_swq_t), KM_SLEEP);
+
+ mutex_init(&gswq->gs_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&gswq->gs_cv, NULL, CV_DEFAULT, NULL);
+ gswq->gs_njobs = 0;
+ gswq->gs_maxjobs = kcf_maxthreads * crypto_taskq_maxalloc;
+ gswq->gs_first = gswq->gs_last = NULL;
+
+ /* Initialize the global reqid table */
+ for (i = 0; i < REQID_TABLES; i++) {
+ rt = kmem_zalloc(sizeof (kcf_reqid_table_t), KM_SLEEP);
+ kcf_reqid_table[i] = rt;
+ mutex_init(&rt->rt_lock, NULL, MUTEX_DEFAULT, NULL);
+ rt->rt_curid = i;
+ }
+
+ /* Allocate and initialize the thread pool */
+ kcfpool_alloc();
+
+ /* Initialize the event notification list variables */
+ mutex_init(&ntfy_list_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ntfy_list_cv, NULL, CV_DEFAULT, NULL);
+
+ /* Create the kcf kstat */
+ kcf_misc_kstat = kstat_create("kcf", 0, "framework_stats", "crypto",
+ KSTAT_TYPE_NAMED, sizeof (kcf_stats_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (kcf_misc_kstat != NULL) {
+ kcf_misc_kstat->ks_data = &kcf_ksdata;
+ kcf_misc_kstat->ks_update = kcf_misc_kstat_update;
+ kstat_install(kcf_misc_kstat);
+ }
+}
+
+/*
+ * Signal the waiting sync client.
+ */
+void
+kcf_sop_done(kcf_sreq_node_t *sreq, int error)
+{
+ mutex_enter(&sreq->sn_lock);
+ sreq->sn_state = REQ_DONE;
+ sreq->sn_rv = error;
+ cv_signal(&sreq->sn_cv);
+ mutex_exit(&sreq->sn_lock);
+}
+
+/*
+ * Callback the async client with the operation status.
+ * We free the async request node and possibly the context.
+ * We also handle any chain of requests hanging off of
+ * the context.
+ */
+void
+kcf_aop_done(kcf_areq_node_t *areq, int error)
+{
+ kcf_op_type_t optype;
+ boolean_t skip_notify = B_FALSE;
+ kcf_context_t *ictx;
+ kcf_areq_node_t *nextreq;
+
+ /*
+ * Handle recoverable errors. This has to be done first
+ * before doing anything else in this routine so that
+ * we do not change the state of the request.
+ */
+ if (error != CRYPTO_SUCCESS && IS_RECOVERABLE(error)) {
+ /*
+ * We try another provider, if one is available. Else
+ * we continue with the failure notification to the
+ * client.
+ */
+ if (kcf_resubmit_request(areq) == CRYPTO_QUEUED)
+ return;
+ }
+
+ mutex_enter(&areq->an_lock);
+ areq->an_state = REQ_DONE;
+ mutex_exit(&areq->an_lock);
+
+ optype = (&areq->an_params)->rp_optype;
+ if ((ictx = areq->an_context) != NULL) {
+ /*
+ * A request, after it is removed from the request
+ * queue, still stays on a chain of requests hanging
+ * off its context structure. It needs to be removed
+ * from this chain at this point.
+ */
+ mutex_enter(&ictx->kc_in_use_lock);
+ nextreq = areq->an_ctxchain_next;
+ if (nextreq != NULL) {
+ mutex_enter(&nextreq->an_lock);
+ nextreq->an_is_my_turn = B_TRUE;
+ cv_signal(&nextreq->an_turn_cv);
+ mutex_exit(&nextreq->an_lock);
+ }
+
+ ictx->kc_req_chain_first = nextreq;
+ if (nextreq == NULL)
+ ictx->kc_req_chain_last = NULL;
+ mutex_exit(&ictx->kc_in_use_lock);
+
+ if (IS_SINGLE_OP(optype) || IS_FINAL_OP(optype)) {
+ ASSERT(nextreq == NULL);
+ KCF_CONTEXT_REFRELE(ictx);
+ } else if (error != CRYPTO_SUCCESS && IS_INIT_OP(optype)) {
+ /*
+ * NOTE - We do not release the context in case of update
+ * operations. We require the consumer to free it explicitly,
+ * in case it wants to abandon an update operation. This is done
+ * as there may be mechanisms in ECB mode that can continue
+ * even if an operation on a block fails.
+ */
+ KCF_CONTEXT_REFRELE(ictx);
+ }
+ }
+
+ /* Deal with the internal continuation to this request first */
+
+ if (areq->an_isdual) {
+ kcf_dual_req_t *next_arg;
+ next_arg = (kcf_dual_req_t *)areq->an_reqarg.cr_callback_arg;
+ next_arg->kr_areq = areq;
+ KCF_AREQ_REFHOLD(areq);
+ areq->an_isdual = B_FALSE;
+
+ NOTIFY_CLIENT(areq, error);
+ return;
+ }
+
+	/*
+	 * If the CRYPTO_NOTIFY_OPDONE flag is set, we always notify
+	 * the client. If this flag is clear, we skip the notification
+	 * provided there are no errors. We check this flag only for
+	 * init or update operations. It is ignored for single, final or
+	 * atomic operations.
+	 */
+ skip_notify = (IS_UPDATE_OP(optype) || IS_INIT_OP(optype)) &&
+ (!(areq->an_reqarg.cr_flag & CRYPTO_NOTIFY_OPDONE)) &&
+ (error == CRYPTO_SUCCESS);
+
+ if (!skip_notify) {
+ NOTIFY_CLIENT(areq, error);
+ }
+
+ if (!(areq->an_reqarg.cr_flag & CRYPTO_SKIP_REQID))
+ kcf_reqid_delete(areq);
+
+ KCF_AREQ_REFRELE(areq);
+}
+
+/*
+ * Allocate the thread pool and initialize all the fields.
+ */
+static void
+kcfpool_alloc()
+{
+ kcfpool = kmem_alloc(sizeof (kcf_pool_t), KM_SLEEP);
+
+ kcfpool->kp_threads = kcfpool->kp_idlethreads = 0;
+ kcfpool->kp_blockedthreads = 0;
+ kcfpool->kp_signal_create_thread = B_FALSE;
+ kcfpool->kp_nthrs = 0;
+ kcfpool->kp_user_waiting = B_FALSE;
+
+ mutex_init(&kcfpool->kp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&kcfpool->kp_nothr_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_init(&kcfpool->kp_user_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&kcfpool->kp_user_cv, NULL, CV_DEFAULT, NULL);
+
+ kcf_idlethr_timeout = KCF_DEFAULT_THRTIMEOUT;
+}
+
+/*
+ * Insert the async request in the hash table after assigning it
+ * an ID. Returns the ID.
+ *
+ * The caller can later pass the ID as an argument to
+ * crypto_cancel_req() to cancel the request.
+ */
+static crypto_req_id_t
+kcf_reqid_insert(kcf_areq_node_t *areq)
+{
+ int indx;
+ crypto_req_id_t id;
+ kcf_areq_node_t *headp;
+ kcf_reqid_table_t *rt;
+
+ rt = kcf_reqid_table[CPU_SEQID_UNSTABLE & REQID_TABLE_MASK];
+
+ mutex_enter(&rt->rt_lock);
+
+ rt->rt_curid = id =
+ (rt->rt_curid - REQID_COUNTER_LOW) | REQID_COUNTER_HIGH;
+ SET_REQID(areq, id);
+ indx = REQID_HASH(id);
+ headp = areq->an_idnext = rt->rt_idhash[indx];
+ areq->an_idprev = NULL;
+ if (headp != NULL)
+ headp->an_idprev = areq;
+
+ rt->rt_idhash[indx] = areq;
+ mutex_exit(&rt->rt_lock);
+
+ return (id);
+}
+
+/*
+ * Delete the async request from the hash table.
+ */
+static void
+kcf_reqid_delete(kcf_areq_node_t *areq)
+{
+ int indx;
+ kcf_areq_node_t *nextp, *prevp;
+ crypto_req_id_t id = GET_REQID(areq);
+ kcf_reqid_table_t *rt;
+
+ rt = kcf_reqid_table[id & REQID_TABLE_MASK];
+ indx = REQID_HASH(id);
+
+ mutex_enter(&rt->rt_lock);
+
+ nextp = areq->an_idnext;
+ prevp = areq->an_idprev;
+ if (nextp != NULL)
+ nextp->an_idprev = prevp;
+ if (prevp != NULL)
+ prevp->an_idnext = nextp;
+ else
+ rt->rt_idhash[indx] = nextp;
+
+ SET_REQID(areq, 0);
+ cv_broadcast(&areq->an_done);
+
+ mutex_exit(&rt->rt_lock);
+}
+
+/*
+ * Cancel a single asynchronous request.
+ *
+ * We guarantee that no problems will result from calling
+ * crypto_cancel_req() for a request which is either running or
+ * has already completed. We remove the request from any queues
+ * if possible, and we wait for request completion if the
+ * request has already been dispatched to a provider.
+ *
+ * Calling context:
+ * Can be called from user context only.
+ *
+ * NOTE: We acquire the following locks in this routine (in order):
+ * - rt_lock (kcf_reqid_table_t)
+ * - gswq->gs_lock
+ * - areq->an_lock
+ * - ictx->kc_in_use_lock (from kcf_removereq_in_ctxchain())
+ *
+ * This locking order MUST be maintained everywhere else in the code.
+ */
+void
+crypto_cancel_req(crypto_req_id_t id)
+{
+ int indx;
+ kcf_areq_node_t *areq;
+ kcf_provider_desc_t *pd;
+ kcf_context_t *ictx;
+ kcf_reqid_table_t *rt;
+
+ rt = kcf_reqid_table[id & REQID_TABLE_MASK];
+ indx = REQID_HASH(id);
+
+ mutex_enter(&rt->rt_lock);
+ for (areq = rt->rt_idhash[indx]; areq; areq = areq->an_idnext) {
+ if (GET_REQID(areq) == id) {
+ /*
+ * We found the request. It is either still waiting
+ * in the framework queues or running at the provider.
+ */
+ pd = areq->an_provider;
+ ASSERT(pd != NULL);
+
+ switch (pd->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ mutex_enter(&gswq->gs_lock);
+ mutex_enter(&areq->an_lock);
+
+ /* This request can be safely canceled. */
+ if (areq->an_state <= REQ_WAITING) {
+ /* Remove from gswq, global software queue. */
+ kcf_remove_node(areq);
+ if ((ictx = areq->an_context) != NULL)
+ kcf_removereq_in_ctxchain(ictx, areq);
+
+ mutex_exit(&areq->an_lock);
+ mutex_exit(&gswq->gs_lock);
+ mutex_exit(&rt->rt_lock);
+
+ /* Remove areq from hash table and free it. */
+ kcf_reqid_delete(areq);
+ KCF_AREQ_REFRELE(areq);
+ return;
+ }
+
+ mutex_exit(&areq->an_lock);
+ mutex_exit(&gswq->gs_lock);
+ break;
+
+ case CRYPTO_HW_PROVIDER:
+ /*
+ * There is no interface to remove an entry
+ * once it is on the taskq. So, we do not do
+ * anything for a hardware provider.
+ */
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * The request is running. Wait for the request completion
+ * to notify us.
+ */
+ KCF_AREQ_REFHOLD(areq);
+ while (GET_REQID(areq) == id)
+ cv_wait(&areq->an_done, &rt->rt_lock);
+ KCF_AREQ_REFRELE(areq);
+ break;
+ }
+ }
+
+ mutex_exit(&rt->rt_lock);
+}
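+
+/*
+ * For illustration (a sketch; how the ID was obtained is not shown here):
+ * a client that kept the request ID assigned to an asynchronous
+ * submission can cancel it later:
+ *
+ *	crypto_req_id_t reqid;
+ *
+ *	... submit the request and save the ID assigned to it ...
+ *	crypto_cancel_req(reqid);
+ */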
+
+/*
+ * Cancel all asynchronous requests associated with the
+ * passed in crypto context and free it.
+ *
+ * A client SHOULD NOT call this routine after calling a crypto_*_final
+ * routine. This routine is called only during intermediate operations.
+ * The client should not use the crypto context after this function returns
+ * since we destroy it.
+ *
+ * Calling context:
+ * Can be called from user context only.
+ */
+void
+crypto_cancel_ctx(crypto_context_t ctx)
+{
+ kcf_context_t *ictx;
+ kcf_areq_node_t *areq;
+
+ if (ctx == NULL)
+ return;
+
+ ictx = (kcf_context_t *)((crypto_ctx_t *)ctx)->cc_framework_private;
+
+ mutex_enter(&ictx->kc_in_use_lock);
+
+ /* Walk the chain and cancel each request */
+ while ((areq = ictx->kc_req_chain_first) != NULL) {
+ /*
+ * We have to drop the lock here as we may have
+ * to wait for request completion. We hold the
+ * request before dropping the lock though, so that it
+ * won't be freed underneath us.
+ */
+ KCF_AREQ_REFHOLD(areq);
+ mutex_exit(&ictx->kc_in_use_lock);
+
+ crypto_cancel_req(GET_REQID(areq));
+ KCF_AREQ_REFRELE(areq);
+
+ mutex_enter(&ictx->kc_in_use_lock);
+ }
+
+ mutex_exit(&ictx->kc_in_use_lock);
+ KCF_CONTEXT_REFRELE(ictx);
+}
+
+/*
+ * Update kstats.
+ */
+static int
+kcf_misc_kstat_update(kstat_t *ksp, int rw)
+{
+ uint_t tcnt;
+ kcf_stats_t *ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ ks_data = ksp->ks_data;
+
+ ks_data->ks_thrs_in_pool.value.ui32 = kcfpool->kp_threads;
+ /*
+ * The failover thread is counted in kp_idlethreads in
+ * some corner cases. This is done to avoid doing more checks
+ * when submitting a request. We account for those cases below.
+ */
+ if ((tcnt = kcfpool->kp_idlethreads) == (kcfpool->kp_threads + 1))
+ tcnt--;
+ ks_data->ks_idle_thrs.value.ui32 = tcnt;
+ ks_data->ks_minthrs.value.ui32 = kcf_minthreads;
+ ks_data->ks_maxthrs.value.ui32 = kcf_maxthreads;
+ ks_data->ks_swq_njobs.value.ui32 = gswq->gs_njobs;
+ ks_data->ks_swq_maxjobs.value.ui32 = gswq->gs_maxjobs;
+ ks_data->ks_taskq_threads.value.ui32 = crypto_taskq_threads;
+ ks_data->ks_taskq_minalloc.value.ui32 = crypto_taskq_minalloc;
+ ks_data->ks_taskq_maxalloc.value.ui32 = crypto_taskq_maxalloc;
+
+ return (0);
+}
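+
+/*
+ * For illustration (the path below is an assumption about how named
+ * kstats are exposed on Linux, not something defined in this file):
+ *
+ *	cat /proc/spl/kstat/kcf/framework_stats
+ */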
+
+/*
+ * Allocate and initialize a kcf_dual_req, used for saving the arguments of
+ * a dual operation or an atomic operation that has to be internally
+ * simulated with multiple single steps.
+ * crq determines the memory allocation flags.
+ */
+
+kcf_dual_req_t *
+kcf_alloc_req(crypto_call_req_t *crq)
+{
+ kcf_dual_req_t *kcr;
+
+ kcr = kmem_alloc(sizeof (kcf_dual_req_t), KCF_KMFLAG(crq));
+
+ if (kcr == NULL)
+ return (NULL);
+
+ /* Copy the whole crypto_call_req struct, as it isn't persistent */
+ if (crq != NULL)
+ kcr->kr_callreq = *crq;
+ else
+ bzero(&(kcr->kr_callreq), sizeof (crypto_call_req_t));
+ kcr->kr_areq = NULL;
+ kcr->kr_saveoffset = 0;
+ kcr->kr_savelen = 0;
+
+ return (kcr);
+}
+
+/*
+ * Callback routine for the second part of a simulated dual operation.
+ * Schedules the next step.
+ *
+ * This routine can be called from interrupt context.
+ */
+void
+kcf_next_req(void *next_req_arg, int status)
+{
+ kcf_dual_req_t *next_req = (kcf_dual_req_t *)next_req_arg;
+ kcf_req_params_t *params = &(next_req->kr_params);
+ kcf_areq_node_t *areq = next_req->kr_areq;
+ int error = status;
+ kcf_provider_desc_t *pd = NULL;
+ crypto_dual_data_t *ct = NULL;
+
+ /* Stop the processing if an error occurred at this step */
+ if (error != CRYPTO_SUCCESS) {
+out:
+ areq->an_reqarg = next_req->kr_callreq;
+ KCF_AREQ_REFRELE(areq);
+ kmem_free(next_req, sizeof (kcf_dual_req_t));
+ areq->an_isdual = B_FALSE;
+ kcf_aop_done(areq, error);
+ return;
+ }
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_MAC: {
+
+ /*
+ * The next req is submitted with the same reqid as the
+ * first part. The consumer only got back that reqid, and
+ * should still be able to cancel the operation during its
+ * second step.
+ */
+ kcf_mac_ops_params_t *mops = &(params->rp_u.mac_params);
+ crypto_ctx_template_t mac_tmpl;
+ kcf_mech_entry_t *me;
+
+ ct = (crypto_dual_data_t *)mops->mo_data;
+ mac_tmpl = (crypto_ctx_template_t)mops->mo_templ;
+
+ /* No expected recoverable failures, so no retry list */
+ pd = kcf_get_mech_provider(mops->mo_framework_mechtype,
+ &me, &error, NULL, CRYPTO_FG_MAC_ATOMIC,
+ (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), ct->dd_len2);
+
+ if (pd == NULL) {
+ error = CRYPTO_MECH_NOT_SUPPORTED;
+ goto out;
+ }
+ /* Validate the MAC context template here */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ (mac_tmpl != NULL)) {
+ kcf_ctx_template_t *ctx_mac_tmpl;
+
+ ctx_mac_tmpl = (kcf_ctx_template_t *)mac_tmpl;
+
+ if (ctx_mac_tmpl->ct_generation != me->me_gen_swprov) {
+ KCF_PROV_REFRELE(pd);
+ error = CRYPTO_OLD_CTX_TEMPLATE;
+ goto out;
+ }
+ mops->mo_templ = ctx_mac_tmpl->ct_prov_tmpl;
+ }
+
+ break;
+ }
+ case KCF_OG_DECRYPT: {
+ kcf_decrypt_ops_params_t *dcrops =
+ &(params->rp_u.decrypt_params);
+
+ ct = (crypto_dual_data_t *)dcrops->dop_ciphertext;
+ /* No expected recoverable failures, so no retry list */
+ pd = kcf_get_mech_provider(dcrops->dop_framework_mechtype,
+ NULL, &error, NULL, CRYPTO_FG_DECRYPT_ATOMIC,
+ (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), ct->dd_len1);
+
+ if (pd == NULL) {
+ error = CRYPTO_MECH_NOT_SUPPORTED;
+ goto out;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ /* The second step uses len2 and offset2 of the dual_data */
+ next_req->kr_saveoffset = ct->dd_offset1;
+ next_req->kr_savelen = ct->dd_len1;
+ ct->dd_offset1 = ct->dd_offset2;
+ ct->dd_len1 = ct->dd_len2;
+
+ /* preserve if the caller is restricted */
+ if (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED) {
+ areq->an_reqarg.cr_flag = CRYPTO_RESTRICTED;
+ } else {
+ areq->an_reqarg.cr_flag = 0;
+ }
+
+ areq->an_reqarg.cr_callback_func = kcf_last_req;
+ areq->an_reqarg.cr_callback_arg = next_req;
+ areq->an_isdual = B_TRUE;
+
+	/*
+	 * We would like to call kcf_submit_request() here, but
+	 * that is not possible because that routine allocates a new
+	 * kcf_areq_node_t request structure, while we need to
+	 * reuse the existing request structure.
+	 */
+ switch (pd->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ error = common_submit_request(pd, NULL, params,
+ KCF_RHNDL(KM_NOSLEEP));
+ break;
+
+ case CRYPTO_HW_PROVIDER: {
+ kcf_provider_desc_t *old_pd;
+ taskq_t *taskq = pd->pd_sched_info.ks_taskq;
+
+ /*
+ * Set the params for the second step in the
+ * dual-ops.
+ */
+ areq->an_params = *params;
+ old_pd = areq->an_provider;
+ KCF_PROV_REFRELE(old_pd);
+ KCF_PROV_REFHOLD(pd);
+ areq->an_provider = pd;
+
+ /*
+ * Note that we have to do a taskq_dispatch()
+ * here as we may be in interrupt context.
+ */
+ if (taskq_dispatch(taskq, process_req_hwp, areq,
+ TQ_NOSLEEP) == (taskqid_t)0) {
+ error = CRYPTO_HOST_MEMORY;
+ } else {
+ error = CRYPTO_QUEUED;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ /*
+ * We have to release the holds on the request and the provider
+ * in all cases.
+ */
+ KCF_AREQ_REFRELE(areq);
+ KCF_PROV_REFRELE(pd);
+
+ if (error != CRYPTO_QUEUED) {
+ /* restore, clean up, and invoke the client's callback */
+
+ ct->dd_offset1 = next_req->kr_saveoffset;
+ ct->dd_len1 = next_req->kr_savelen;
+ areq->an_reqarg = next_req->kr_callreq;
+ kmem_free(next_req, sizeof (kcf_dual_req_t));
+ areq->an_isdual = B_FALSE;
+ kcf_aop_done(areq, error);
+ }
+}
+
+/*
+ * Last part of an emulated dual operation.
+ * Restores the saved dual_data offset and length, cleans up, and
+ * invokes the client's callback.
+ */
+void
+kcf_last_req(void *last_req_arg, int status)
+{
+ kcf_dual_req_t *last_req = (kcf_dual_req_t *)last_req_arg;
+
+ kcf_req_params_t *params = &(last_req->kr_params);
+ kcf_areq_node_t *areq = last_req->kr_areq;
+ crypto_dual_data_t *ct = NULL;
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_MAC: {
+ kcf_mac_ops_params_t *mops = &(params->rp_u.mac_params);
+
+ ct = (crypto_dual_data_t *)mops->mo_data;
+ break;
+ }
+ case KCF_OG_DECRYPT: {
+ kcf_decrypt_ops_params_t *dcrops =
+ &(params->rp_u.decrypt_params);
+
+ ct = (crypto_dual_data_t *)dcrops->dop_ciphertext;
+ break;
+ }
+ default: {
+ panic("invalid kcf_op_group_t %d", (int)params->rp_opgrp);
+ return;
+ }
+ }
+ ct->dd_offset1 = last_req->kr_saveoffset;
+ ct->dd_len1 = last_req->kr_savelen;
+
+ /* The submitter used kcf_last_req as its callback */
+
+ if (areq == NULL) {
+ crypto_call_req_t *cr = &last_req->kr_callreq;
+
+ (*(cr->cr_callback_func))(cr->cr_callback_arg, status);
+ kmem_free(last_req, sizeof (kcf_dual_req_t));
+ return;
+ }
+ areq->an_reqarg = last_req->kr_callreq;
+ KCF_AREQ_REFRELE(areq);
+ kmem_free(last_req, sizeof (kcf_dual_req_t));
+ areq->an_isdual = B_FALSE;
+ kcf_aop_done(areq, status);
+}
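+
+/*
+ * For illustration (a summary sketch of the dual-operation code above):
+ *
+ *	kcf_alloc_req()	saves the caller's crypto_call_req_t in a
+ *			kcf_dual_req_t before the first step is submitted
+ *	kcf_next_req()	runs as the first step's callback, picks a provider
+ *			and dispatches the second step with kcf_last_req()
+ *			as its callback
+ *	kcf_last_req()	restores dd_offset1/dd_len1 and completes the
+ *			request via kcf_aop_done() or the client's callback
+ */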
diff --git a/sys/contrib/openzfs/module/icp/illumos-crypto.c b/sys/contrib/openzfs/module/icp/illumos-crypto.c
new file mode 100644
index 000000000000..3c5ef4393940
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/illumos-crypto.c
@@ -0,0 +1,158 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#ifdef _KERNEL
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#else
+#define __exit
+#define __init
+#endif
+
+#include <sys/crypto/common.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+#include <sys/modhash_impl.h>
+#include <sys/crypto/icp.h>
+
+/*
+ * Changes made to the original Illumos Crypto Layer for the ICP:
+ *
+ * Several changes were needed to allow the Illumos Crypto Layer
+ * to work in the Linux kernel. Almost all of the changes fall into
+ * one of the following categories:
+ *
+ * 1) Moving the syntax to C90: This was mostly a matter of
+ * changing func() definitions to func(void). In a few cases,
+ * initializations of structs with unions needed to have brackets
+ * added.
+ *
+ * 2) Changes to allow userspace compilation: The ICP is meant to be
+ * compiled and used in both userspace and kernel space (for ztest and
+ * libzfs), so the _KERNEL macros did not make sense anymore. For the
+ * same reason, many header includes were also changed to use
+ * sys/zfs_context.h
+ *
+ * 3) Moving to a statically compiled architecture: At some point in
+ * the future it may make sense to have encryption algorithms that are
+ * loadable into the ICP at runtime via separate kernel modules.
+ * However, considering that this code will probably not see much use
+ * outside of zfs, and that zfs encryption only requires the aes and
+ * sha256 algorithms, it seemed like more trouble than it was worth to
+ * port Illumos's kernel module structure over to a Linux kernel module.
+ * In addition, the Illumos code related to keeping track of kernel modules
+ * is very much tied to the Illumos OS and proved difficult to port to
+ * Linux. Therefore, the structure of the ICP was simplified to work
+ * statically and several pieces of code responsible for keeping track
+ * of Illumos kernel modules were removed and simplified. All module
+ * initialization and destruction is now called in this file during
+ * Linux kernel module loading and unloading.
+ *
+ * 4) Adding destructors: The Illumos Crypto Layer is built into
+ * the Illumos kernel and is not meant to be unloaded. Some destructors
+ * were added to allow the ICP to be unloaded without leaking
+ * structures.
+ *
+ * 5) Removing CRYPTO_DATA_MBLK related structures and code:
+ * crypto_data_t can have 3 formats, CRYPTO_DATA_RAW, CRYPTO_DATA_UIO,
+ * and CRYPTO_DATA_MBLK. ZFS only requires the first 2 formats, as the
+ * last one is related to streamed data. To simplify the port, code
+ * related to this format was removed.
+ *
+ * 6) Changes for architecture specific code: Some changes were needed
+ * to make architecture specific assembly compile. The biggest change
+ * here was to functions related to detecting CPU capabilities for amd64.
+ * The Illumos Crypto Layer called into the Illumos kernel's API
+ * to discover these. They have been converted to instead use the
+ * 'cpuid' instruction as per the Intel spec. In addition, references to
+ * the sun4u and sparc architectures have been removed so that these
+ * will use the generic implementation.
+ *
+ * 7) Removing sha384 and sha512 code: The sha code was actually very
+ * easy to port. However, the generic sha384 and sha512 code exceeds
+ * the stack size on arm and powerpc architectures. In an effort
+ * to remove warnings, this code was removed.
+ *
+ * 8) Change large allocations from kmem_alloc() to vmem_alloc(): In
+ * testing the ICP with the ZFS encryption code, a few allocations were
+ * found that could potentially be very large. These caused the SPL to
+ * throw warnings and so they were changed to use vmem_alloc().
+ *
+ * 9) Makefiles: Makefiles were added that would work with the existing
+ * ZFS Makefiles.
+ */
+
+void __exit
+icp_fini(void)
+{
+ skein_mod_fini();
+ sha2_mod_fini();
+ sha1_mod_fini();
+ edonr_mod_fini();
+ aes_mod_fini();
+ kcf_sched_destroy();
+ kcf_prov_tab_destroy();
+ kcf_destroy_mech_tabs();
+ mod_hash_fini();
+}
+
+/* roughly equivalent to kcf.c: _init() */
+int __init
+icp_init(void)
+{
+ /* initialize the mod hash module */
+ mod_hash_init();
+
+ /* initialize the mechanisms tables supported out-of-the-box */
+ kcf_init_mech_tabs();
+
+ /* initialize the providers tables */
+ kcf_prov_tab_init();
+
+ /*
+ * Initialize scheduling structures. Note that this does NOT
+ * start any threads since it might not be safe to do so.
+ */
+ kcf_sched_init();
+
+ /* initialize algorithms */
+ aes_mod_init();
+ edonr_mod_init();
+ sha1_mod_init();
+ sha2_mod_init();
+ skein_mod_init();
+
+ return (0);
+}
+
+#if defined(_KERNEL)
+module_exit(icp_fini);
+module_init(icp_init);
+MODULE_AUTHOR(ZFS_META_AUTHOR);
+MODULE_LICENSE(ZFS_META_LICENSE);
+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+#endif
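+
+/*
+ * For illustration (a sketch): outside the kernel the module_init()/
+ * module_exit() hooks above are compiled out, so a userspace consumer
+ * such as ztest is assumed to bracket its use of the ICP explicitly:
+ *
+ *	icp_init();
+ *	... use the crypto_* framework APIs ...
+ *	icp_fini();
+ */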
diff --git a/sys/contrib/openzfs/module/icp/include/aes/aes_impl.h b/sys/contrib/openzfs/module/icp/include/aes/aes_impl.h
new file mode 100644
index 000000000000..41dccaa3848a
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/aes/aes_impl.h
@@ -0,0 +1,227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _AES_IMPL_H
+#define _AES_IMPL_H
+
+/*
+ * Common definitions used by AES.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+
+/* Similar to sysmacros.h IS_P2ALIGNED, but checks two pointers: */
+#define IS_P2ALIGNED2(v, w, a) \
+ ((((uintptr_t)(v) | (uintptr_t)(w)) & ((uintptr_t)(a) - 1)) == 0)
+
+#define AES_BLOCK_LEN 16 /* bytes */
+/* Round constant length, in number of 32-bit elements: */
+#define RC_LENGTH (5 * ((AES_BLOCK_LEN) / 4 - 2))
+
+#define AES_COPY_BLOCK(src, dst) \
+ (dst)[0] = (src)[0]; \
+ (dst)[1] = (src)[1]; \
+ (dst)[2] = (src)[2]; \
+ (dst)[3] = (src)[3]; \
+ (dst)[4] = (src)[4]; \
+ (dst)[5] = (src)[5]; \
+ (dst)[6] = (src)[6]; \
+ (dst)[7] = (src)[7]; \
+ (dst)[8] = (src)[8]; \
+ (dst)[9] = (src)[9]; \
+ (dst)[10] = (src)[10]; \
+ (dst)[11] = (src)[11]; \
+ (dst)[12] = (src)[12]; \
+ (dst)[13] = (src)[13]; \
+ (dst)[14] = (src)[14]; \
+ (dst)[15] = (src)[15]
+
+#define AES_XOR_BLOCK(src, dst) \
+ (dst)[0] ^= (src)[0]; \
+ (dst)[1] ^= (src)[1]; \
+ (dst)[2] ^= (src)[2]; \
+ (dst)[3] ^= (src)[3]; \
+ (dst)[4] ^= (src)[4]; \
+ (dst)[5] ^= (src)[5]; \
+ (dst)[6] ^= (src)[6]; \
+ (dst)[7] ^= (src)[7]; \
+ (dst)[8] ^= (src)[8]; \
+ (dst)[9] ^= (src)[9]; \
+ (dst)[10] ^= (src)[10]; \
+ (dst)[11] ^= (src)[11]; \
+ (dst)[12] ^= (src)[12]; \
+ (dst)[13] ^= (src)[13]; \
+ (dst)[14] ^= (src)[14]; \
+ (dst)[15] ^= (src)[15]
+
+/* AES key size definitions */
+#define AES_MINBITS 128
+#define AES_MINBYTES ((AES_MINBITS) >> 3)
+#define AES_MAXBITS 256
+#define AES_MAXBYTES ((AES_MAXBITS) >> 3)
+
+#define AES_MIN_KEY_BYTES ((AES_MINBITS) >> 3)
+#define AES_MAX_KEY_BYTES ((AES_MAXBITS) >> 3)
+#define AES_192_KEY_BYTES 24
+#define AES_IV_LEN 16
+
+/* AES key schedule may be implemented with 32- or 64-bit elements: */
+#define AES_32BIT_KS 32
+#define AES_64BIT_KS 64
+
+#define MAX_AES_NR 14 /* Maximum number of rounds */
+#define MAX_AES_NB 4 /* Number of columns comprising a state */
+
+typedef union {
+#ifdef sun4u
+ uint64_t ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
+#endif
+ uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
+} aes_ks_t;
+
+typedef struct aes_impl_ops aes_impl_ops_t;
+
+/*
+ * The absolute offsets of the encr_ks (0) and nr (504) fields are hard
+ * coded in aesni-gcm-x86_64.S, so please don't change them (or adjust
+ * that file accordingly).
+ */
+typedef struct aes_key aes_key_t;
+struct aes_key {
+ aes_ks_t encr_ks; /* encryption key schedule */
+ aes_ks_t decr_ks; /* decryption key schedule */
+#ifdef __amd64
+ long double align128; /* Align fields above for Intel AES-NI */
+#endif /* __amd64 */
+ const aes_impl_ops_t *ops; /* ops associated with this schedule */
+ int nr; /* number of rounds (10, 12, or 14) */
+ int type; /* key schedule size (32 or 64 bits) */
+};
+
+/*
+ * Core AES functions.
+ * ks and keysched are pointers to aes_key_t.
+ * They are declared void* as they are intended to be opaque types.
+ * Use function aes_alloc_keysched() to allocate memory for ks and keysched.
+ */
+extern void *aes_alloc_keysched(size_t *size, int kmflag);
+extern void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits,
+ void *keysched);
+extern int aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct);
+extern int aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt);
+
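+/*
+ * For illustration (a sketch; key, plaintext and ciphertext are
+ * hypothetical caller-supplied buffers): encrypting a single block with
+ * the core functions above, assuming a 256-bit raw key:
+ *
+ *	size_t size;
+ *	void *ks = aes_alloc_keysched(&size, KM_SLEEP);
+ *
+ *	if (ks != NULL) {
+ *		aes_init_keysched(key, 256, ks);
+ *		(void) aes_encrypt_block(ks, plaintext, ciphertext);
+ *		kmem_free(ks, size);
+ *	}
+ */
+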
+/*
+ * AES mode functions.
+ * The first 2 functions operate on 16-byte AES blocks.
+ */
+extern void aes_copy_block(uint8_t *in, uint8_t *out);
+extern void aes_xor_block(uint8_t *data, uint8_t *dst);
+
+/* Note: ctx is a pointer to aes_ctx_t defined in modes.h */
+extern int aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
+ crypto_data_t *out);
+extern int aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
+ crypto_data_t *out);
+
+/*
+ * The following definitions and declarations are only used by AES FIPS POST
+ */
+#ifdef _AES_IMPL
+
+typedef enum aes_mech_type {
+ AES_ECB_MECH_INFO_TYPE, /* SUN_CKM_AES_ECB */
+ AES_CBC_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC */
+ AES_CBC_PAD_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC_PAD */
+ AES_CTR_MECH_INFO_TYPE, /* SUN_CKM_AES_CTR */
+ AES_CCM_MECH_INFO_TYPE, /* SUN_CKM_AES_CCM */
+ AES_GCM_MECH_INFO_TYPE, /* SUN_CKM_AES_GCM */
+ AES_GMAC_MECH_INFO_TYPE /* SUN_CKM_AES_GMAC */
+} aes_mech_type_t;
+
+#endif /* _AES_IMPL */
+
+/*
+ * Methods used to define AES implementation
+ *
+ * @aes_generate_f	Key generation
+ * @aes_encrypt_f	Function encrypts one block
+ * @aes_decrypt_f	Function decrypts one block
+ * @aes_will_work_f	Function tests whether method will function
+ */
+typedef void (*aes_generate_f)(aes_key_t *, const uint32_t *, int);
+typedef void (*aes_encrypt_f)(const uint32_t[], int,
+ const uint32_t[4], uint32_t[4]);
+typedef void (*aes_decrypt_f)(const uint32_t[], int,
+ const uint32_t[4], uint32_t[4]);
+typedef boolean_t (*aes_will_work_f)(void);
+
+#define AES_IMPL_NAME_MAX (16)
+
+struct aes_impl_ops {
+ aes_generate_f generate;
+ aes_encrypt_f encrypt;
+ aes_decrypt_f decrypt;
+ aes_will_work_f is_supported;
+ boolean_t needs_byteswap;
+ char name[AES_IMPL_NAME_MAX];
+};
+
+extern const aes_impl_ops_t aes_generic_impl;
+#if defined(__x86_64)
+extern const aes_impl_ops_t aes_x86_64_impl;
+
+/* These functions are used to execute amd64 instructions for AMD or Intel: */
+extern int rijndael_key_setup_enc_amd64(uint32_t rk[],
+ const uint32_t cipherKey[], int keyBits);
+extern int rijndael_key_setup_dec_amd64(uint32_t rk[],
+ const uint32_t cipherKey[], int keyBits);
+extern void aes_encrypt_amd64(const uint32_t rk[], int Nr,
+ const uint32_t pt[4], uint32_t ct[4]);
+extern void aes_decrypt_amd64(const uint32_t rk[], int Nr,
+ const uint32_t ct[4], uint32_t pt[4]);
+#endif
+#if defined(__x86_64) && defined(HAVE_AES)
+extern const aes_impl_ops_t aes_aesni_impl;
+#endif
+
+/*
+ * Initializes fastest implementation
+ */
+void aes_impl_init(void);
+
+/*
+ * Returns optimal allowed AES implementation
+ */
+const struct aes_impl_ops *aes_impl_get_ops(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AES_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/modes/gcm_impl.h b/sys/contrib/openzfs/module/icp/include/modes/gcm_impl.h
new file mode 100644
index 000000000000..28c8f63a7d46
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/modes/gcm_impl.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _GCM_IMPL_H
+#define _GCM_IMPL_H
+
+/*
+ * GCM function dispatcher.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+
+/*
+ * Methods used to define GCM implementation
+ *
+ * @gcm_mul_f Perform carry-less multiplication
+ * @gcm_will_work_f Function tests whether implementation will function
+ */
+typedef void (*gcm_mul_f)(uint64_t *, uint64_t *, uint64_t *);
+typedef boolean_t (*gcm_will_work_f)(void);
+
+#define GCM_IMPL_NAME_MAX (16)
+
+typedef struct gcm_impl_ops {
+ gcm_mul_f mul;
+ gcm_will_work_f is_supported;
+ char name[GCM_IMPL_NAME_MAX];
+} gcm_impl_ops_t;
+
+extern const gcm_impl_ops_t gcm_generic_impl;
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
+#endif
+
+/*
+ * Initializes fastest implementation
+ */
+void gcm_impl_init(void);
+
+/*
+ * Returns optimal allowed GCM implementation
+ */
+const struct gcm_impl_ops *gcm_impl_get_ops(void);
+
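+/*
+ * For illustration (a sketch; variable names are hypothetical): the
+ * dispatcher above is used by selecting an implementation once and then
+ * calling through its ops vector:
+ *
+ *	const gcm_impl_ops_t *ops = gcm_impl_get_ops();
+ *	uint64_t ghash[2], H[2];
+ *
+ *	ops->mul(ghash, H, ghash);
+ */
+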
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _GCM_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/modes/modes.h b/sys/contrib/openzfs/module/icp/include/modes/modes.h
new file mode 100644
index 000000000000..ab71197542eb
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/modes/modes.h
@@ -0,0 +1,411 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _COMMON_CRYPTO_MODES_H
+#define _COMMON_CRYPTO_MODES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Does the build chain support all instructions needed for the GCM assembler
+ * routines? AVX support should imply AES-NI and PCLMULQDQ, but make sure
+ * anyhow.
+ */
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+#define CAN_USE_GCM_ASM
+extern boolean_t gcm_avx_can_use_movbe;
+#endif
+
+#define ECB_MODE 0x00000002
+#define CBC_MODE 0x00000004
+#define CTR_MODE 0x00000008
+#define CCM_MODE 0x00000010
+#define GCM_MODE 0x00000020
+#define GMAC_MODE 0x00000040
+
+/*
+ * cc_keysched: Pointer to key schedule.
+ *
+ * cc_keysched_len: Length of the key schedule.
+ *
+ * cc_remainder: This is for residual data, i.e. data that can't
+ * be processed because there are too few bytes.
+ * Must wait until more data arrives.
+ *
+ * cc_remainder_len: Number of bytes in cc_remainder.
+ *
+ * cc_iv: Scratch buffer that sometimes contains the IV.
+ *
+ * cc_lastp: Pointer to previous block of ciphertext.
+ *
+ * cc_copy_to: Pointer to where encrypted residual data needs
+ * to be copied.
+ *
+ * cc_flags: PROVIDER_OWNS_KEY_SCHEDULE
+ * When a context is freed, it is necessary
+ * to know whether the key schedule was allocated
+ * by the caller, or internally, e.g. an init routine.
+ * If allocated by the latter, then it needs to be freed.
+ *
+ *			ECB_MODE, CBC_MODE, CTR_MODE, CCM_MODE,
+ *			GCM_MODE, or GMAC_MODE
+ */
+struct common_ctx {
+ void *cc_keysched;
+ size_t cc_keysched_len;
+ uint64_t cc_iv[2];
+ uint64_t cc_remainder[2];
+ size_t cc_remainder_len;
+ uint8_t *cc_lastp;
+ uint8_t *cc_copy_to;
+ uint32_t cc_flags;
+};
+
+typedef struct common_ctx common_ctx_t;
+
+typedef struct ecb_ctx {
+ struct common_ctx ecb_common;
+ uint64_t ecb_lastblock[2];
+} ecb_ctx_t;
+
+#define ecb_keysched ecb_common.cc_keysched
+#define ecb_keysched_len ecb_common.cc_keysched_len
+#define ecb_iv ecb_common.cc_iv
+#define ecb_remainder ecb_common.cc_remainder
+#define ecb_remainder_len ecb_common.cc_remainder_len
+#define ecb_lastp ecb_common.cc_lastp
+#define ecb_copy_to ecb_common.cc_copy_to
+#define ecb_flags ecb_common.cc_flags
+
+typedef struct cbc_ctx {
+ struct common_ctx cbc_common;
+ uint64_t cbc_lastblock[2];
+} cbc_ctx_t;
+
+#define cbc_keysched cbc_common.cc_keysched
+#define cbc_keysched_len cbc_common.cc_keysched_len
+#define cbc_iv cbc_common.cc_iv
+#define cbc_remainder cbc_common.cc_remainder
+#define cbc_remainder_len cbc_common.cc_remainder_len
+#define cbc_lastp cbc_common.cc_lastp
+#define cbc_copy_to cbc_common.cc_copy_to
+#define cbc_flags cbc_common.cc_flags
+
+/*
+ * ctr_lower_mask Bit-mask for lower 8 bytes of counter block.
+ * ctr_upper_mask Bit-mask for upper 8 bytes of counter block.
+ */
+typedef struct ctr_ctx {
+ struct common_ctx ctr_common;
+ uint64_t ctr_lower_mask;
+ uint64_t ctr_upper_mask;
+ uint32_t ctr_tmp[4];
+} ctr_ctx_t;
+
+/*
+ * ctr_cb Counter block.
+ */
+#define ctr_keysched ctr_common.cc_keysched
+#define ctr_keysched_len ctr_common.cc_keysched_len
+#define ctr_cb ctr_common.cc_iv
+#define ctr_remainder ctr_common.cc_remainder
+#define ctr_remainder_len ctr_common.cc_remainder_len
+#define ctr_lastp ctr_common.cc_lastp
+#define ctr_copy_to ctr_common.cc_copy_to
+#define ctr_flags ctr_common.cc_flags
+
+/*
+ *
+ * ccm_mac_len: Stores length of the MAC in CCM mode.
+ * ccm_mac_buf: Stores the intermediate value for MAC in CCM encrypt.
+ * In CCM decrypt, stores the input MAC value.
+ * ccm_data_len: Length of the plaintext for CCM mode encrypt, or
+ * length of the ciphertext for CCM mode decrypt.
+ * ccm_processed_data_len:
+ * Length of processed plaintext in CCM mode encrypt,
+ * or length of processed ciphertext for CCM mode decrypt.
+ * ccm_processed_mac_len:
+ * Length of MAC data accumulated in CCM mode decrypt.
+ *
+ * ccm_pt_buf: Only used in CCM mode decrypt. It stores the
+ * decrypted plaintext to be returned when
+ * MAC verification succeeds in decrypt_final.
+ * Memory for this should be allocated in the AES module.
+ *
+ */
+typedef struct ccm_ctx {
+ struct common_ctx ccm_common;
+ uint32_t ccm_tmp[4];
+ size_t ccm_mac_len;
+ uint64_t ccm_mac_buf[2];
+ size_t ccm_data_len;
+ size_t ccm_processed_data_len;
+ size_t ccm_processed_mac_len;
+ uint8_t *ccm_pt_buf;
+ uint64_t ccm_mac_input_buf[2];
+ uint64_t ccm_counter_mask;
+} ccm_ctx_t;
+
+#define ccm_keysched ccm_common.cc_keysched
+#define ccm_keysched_len ccm_common.cc_keysched_len
+#define ccm_cb ccm_common.cc_iv
+#define ccm_remainder ccm_common.cc_remainder
+#define ccm_remainder_len ccm_common.cc_remainder_len
+#define ccm_lastp ccm_common.cc_lastp
+#define ccm_copy_to ccm_common.cc_copy_to
+#define ccm_flags ccm_common.cc_flags
+
+/*
+ * gcm_tag_len: Length of authentication tag.
+ *
+ * gcm_ghash: Stores output from the GHASH function.
+ *
+ * gcm_processed_data_len:
+ * Length of processed plaintext (encrypt) or
+ * length of processed ciphertext (decrypt).
+ *
+ * gcm_pt_buf: Stores the decrypted plaintext returned by
+ * decrypt_final when the computed authentication
+ * tag matches the user supplied tag.
+ *
+ * gcm_pt_buf_len: Length of the plaintext buffer.
+ *
+ * gcm_H: Subkey.
+ *
+ * gcm_Htable: Pre-computed and pre-shifted H, H^2, ... H^6 for the
+ * Karatsuba Algorithm in host byte order.
+ *
+ * gcm_J0: Pre-counter block generated from the IV.
+ *
+ * gcm_len_a_len_c: 64-bit representations of the bit lengths of
+ * AAD and ciphertext.
+ *
+ * gcm_kmflag: Current value of kmflag. Used for allocating
+ * the plaintext buffer during decryption and a
+ * gcm_avx_chunk_size'd buffer for avx enabled encryption.
+ */
+typedef struct gcm_ctx {
+ struct common_ctx gcm_common;
+ size_t gcm_tag_len;
+ size_t gcm_processed_data_len;
+ size_t gcm_pt_buf_len;
+ uint32_t gcm_tmp[4];
+ /*
+ * The offset of gcm_Htable relative to gcm_ghash, (32), is hard coded
+ * in aesni-gcm-x86_64.S, so please don't change (or adjust there).
+ */
+ uint64_t gcm_ghash[2];
+ uint64_t gcm_H[2];
+#ifdef CAN_USE_GCM_ASM
+ uint64_t *gcm_Htable;
+ size_t gcm_htab_len;
+#endif
+ uint64_t gcm_J0[2];
+ uint64_t gcm_len_a_len_c[2];
+ uint8_t *gcm_pt_buf;
+ int gcm_kmflag;
+#ifdef CAN_USE_GCM_ASM
+ boolean_t gcm_use_avx;
+#endif
+} gcm_ctx_t;
+
+#define gcm_keysched gcm_common.cc_keysched
+#define gcm_keysched_len gcm_common.cc_keysched_len
+#define gcm_cb gcm_common.cc_iv
+#define gcm_remainder gcm_common.cc_remainder
+#define gcm_remainder_len gcm_common.cc_remainder_len
+#define gcm_lastp gcm_common.cc_lastp
+#define gcm_copy_to gcm_common.cc_copy_to
+#define gcm_flags gcm_common.cc_flags
+
+#define AES_GMAC_IV_LEN 12
+#define AES_GMAC_TAG_BITS 128
+
+typedef struct aes_ctx {
+ union {
+ ecb_ctx_t acu_ecb;
+ cbc_ctx_t acu_cbc;
+ ctr_ctx_t acu_ctr;
+ ccm_ctx_t acu_ccm;
+ gcm_ctx_t acu_gcm;
+ } acu;
+} aes_ctx_t;
+
+#define ac_flags acu.acu_ecb.ecb_common.cc_flags
+#define ac_remainder_len acu.acu_ecb.ecb_common.cc_remainder_len
+#define ac_keysched acu.acu_ecb.ecb_common.cc_keysched
+#define ac_keysched_len acu.acu_ecb.ecb_common.cc_keysched_len
+#define ac_iv acu.acu_ecb.ecb_common.cc_iv
+#define ac_lastp acu.acu_ecb.ecb_common.cc_lastp
+#define ac_pt_buf acu.acu_ccm.ccm_pt_buf
+#define ac_mac_len acu.acu_ccm.ccm_mac_len
+#define ac_data_len acu.acu_ccm.ccm_data_len
+#define ac_processed_mac_len acu.acu_ccm.ccm_processed_mac_len
+#define ac_processed_data_len acu.acu_ccm.ccm_processed_data_len
+#define ac_tag_len acu.acu_gcm.gcm_tag_len
+
+typedef struct blowfish_ctx {
+ union {
+ ecb_ctx_t bcu_ecb;
+ cbc_ctx_t bcu_cbc;
+ } bcu;
+} blowfish_ctx_t;
+
+#define bc_flags bcu.bcu_ecb.ecb_common.cc_flags
+#define bc_remainder_len bcu.bcu_ecb.ecb_common.cc_remainder_len
+#define bc_keysched bcu.bcu_ecb.ecb_common.cc_keysched
+#define bc_keysched_len bcu.bcu_ecb.ecb_common.cc_keysched_len
+#define bc_iv bcu.bcu_ecb.ecb_common.cc_iv
+#define bc_lastp bcu.bcu_ecb.ecb_common.cc_lastp
+
+typedef struct des_ctx {
+ union {
+ ecb_ctx_t dcu_ecb;
+ cbc_ctx_t dcu_cbc;
+ } dcu;
+} des_ctx_t;
+
+#define dc_flags dcu.dcu_ecb.ecb_common.cc_flags
+#define dc_remainder_len dcu.dcu_ecb.ecb_common.cc_remainder_len
+#define dc_keysched dcu.dcu_ecb.ecb_common.cc_keysched
+#define dc_keysched_len dcu.dcu_ecb.ecb_common.cc_keysched_len
+#define dc_iv dcu.dcu_ecb.ecb_common.cc_iv
+#define dc_lastp dcu.dcu_ecb.ecb_common.cc_lastp
+
+extern int ecb_cipher_contiguous_blocks(ecb_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t, int (*cipher)(const void *, const uint8_t *,
+ uint8_t *));
+
+extern int cbc_encrypt_contiguous_blocks(cbc_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*encrypt)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int cbc_decrypt_contiguous_blocks(cbc_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*decrypt)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int ctr_mode_contiguous_blocks(ctr_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*cipher)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int ccm_mode_decrypt_contiguous_blocks(ccm_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+int ccm_encrypt_final(ccm_ctx_t *, crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+int gcm_encrypt_final(gcm_ctx_t *, crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int ccm_decrypt_final(ccm_ctx_t *, crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int gcm_decrypt_final(gcm_ctx_t *, crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int ctr_mode_final(ctr_ctx_t *, crypto_data_t *,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *));
+
+extern int cbc_init_ctx(cbc_ctx_t *, char *, size_t, size_t,
+ void (*copy_block)(uint8_t *, uint64_t *));
+
+extern int ctr_init_ctx(ctr_ctx_t *, ulong_t, uint8_t *,
+ void (*copy_block)(uint8_t *, uint8_t *));
+
+extern int ccm_init_ctx(ccm_ctx_t *, char *, int, boolean_t, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int gcm_init_ctx(gcm_ctx_t *, char *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int gmac_init_ctx(gcm_ctx_t *, char *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern void calculate_ccm_mac(ccm_ctx_t *, uint8_t *,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *));
+
+extern void gcm_mul(uint64_t *, uint64_t *, uint64_t *);
+
+extern void crypto_init_ptrs(crypto_data_t *, void **, offset_t *);
+extern void crypto_get_ptrs(crypto_data_t *, void **, offset_t *,
+ uint8_t **, size_t *, uint8_t **, size_t);
+
+extern void *ecb_alloc_ctx(int);
+extern void *cbc_alloc_ctx(int);
+extern void *ctr_alloc_ctx(int);
+extern void *ccm_alloc_ctx(int);
+extern void *gcm_alloc_ctx(int);
+extern void *gmac_alloc_ctx(int);
+extern void crypto_free_mode_ctx(void *);
+extern void gcm_set_kmflag(gcm_ctx_t *, int);
+
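+/*
+ * For illustration (a sketch): the *_alloc_ctx() routines above pair with
+ * crypto_free_mode_ctx(), e.g. for ECB:
+ *
+ *	ecb_ctx_t *ctx = ecb_alloc_ctx(KM_SLEEP);
+ *
+ *	if (ctx != NULL) {
+ *		... initialize and use the context ...
+ *		crypto_free_mode_ctx(ctx);
+ *	}
+ */
+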
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _COMMON_CRYPTO_MODES_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sha1/sha1.h b/sys/contrib/openzfs/module/icp/include/sha1/sha1.h
new file mode 100644
index 000000000000..251b64fcaeee
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sha1/sha1.h
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SHA1_H
+#define _SYS_SHA1_H
+
+#include <sys/types.h> /* for uint_* */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * NOTE: n2rng (Niagara2 RNG driver) accesses the state field of
+ * SHA1_CTX directly. NEVER change this structure without verifying
+ * compatibility with n2rng. The important thing is that the state
+ * must be in a field declared as uint32_t state[5].
+ */
+/* SHA-1 context. */
+typedef struct {
+ uint32_t state[5]; /* state (ABCDE) */
+ uint32_t count[2]; /* number of bits, modulo 2^64 (msb first) */
+ union {
+ uint8_t buf8[64]; /* undigested input */
+ uint32_t buf32[16]; /* realigned input */
+ } buf_un;
+} SHA1_CTX;
+
+#define SHA1_DIGEST_LENGTH 20
+
+void SHA1Init(SHA1_CTX *);
+void SHA1Update(SHA1_CTX *, const void *, size_t);
+void SHA1Final(void *, SHA1_CTX *);
+
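+/*
+ * For illustration (a sketch; buf and buflen are hypothetical): one-shot
+ * digest computation with the functions declared above:
+ *
+ *	SHA1_CTX ctx;
+ *	uint8_t digest[SHA1_DIGEST_LENGTH];
+ *
+ *	SHA1Init(&ctx);
+ *	SHA1Update(&ctx, buf, buflen);
+ *	SHA1Final(digest, &ctx);
+ */
+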
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SHA1_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sha1/sha1_consts.h b/sys/contrib/openzfs/module/icp/include/sha1/sha1_consts.h
new file mode 100644
index 000000000000..848d25ef050f
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sha1/sha1_consts.h
@@ -0,0 +1,65 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1998, by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_SHA1_CONSTS_H
+#define _SYS_SHA1_CONSTS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * as explained in sha1.c, loading 32-bit constants on a sparc is expensive
+ * since it involves both a `sethi' and an `or'. thus, we instead use `ld'
+ * to load the constants from an array called `sha1_consts'. however, on
+ * intel (and perhaps other processors), it is cheaper to load the constant
+ * directly. thus, the c code in SHA1Transform() uses the macro SHA1_CONST()
+ * which either expands to a constant or an array reference, depending on
+ * the architecture the code is being compiled for.
+ */
+
+#include <sys/types.h> /* uint32_t */
+
+extern const uint32_t sha1_consts[];
+
+#if defined(__sparc)
+#define SHA1_CONST(x) (sha1_consts[x])
+#else
+#define SHA1_CONST(x) (SHA1_CONST_ ## x)
+#endif
+
+/* constants, as provided in FIPS 180-1 */
+
+#define SHA1_CONST_0 0x5a827999U
+#define SHA1_CONST_1 0x6ed9eba1U
+#define SHA1_CONST_2 0x8f1bbcdcU
+#define SHA1_CONST_3 0xca62c1d6U
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SHA1_CONSTS_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sha1/sha1_impl.h b/sys/contrib/openzfs/module/icp/include/sha1/sha1_impl.h
new file mode 100644
index 000000000000..1c1f8728f9b5
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sha1/sha1_impl.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SHA1_IMPL_H
+#define _SHA1_IMPL_H
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SHA1_HASH_SIZE 20 /* SHA_1 digest length in bytes */
+#define SHA1_DIGEST_LENGTH 20 /* SHA1 digest length in bytes */
+#define SHA1_HMAC_BLOCK_SIZE 64 /* SHA1-HMAC block size */
+#define SHA1_HMAC_MIN_KEY_LEN 1 /* SHA1-HMAC min key length in bytes */
+#define SHA1_HMAC_MAX_KEY_LEN INT_MAX /* SHA1-HMAC max key length in bytes */
+#define SHA1_HMAC_INTS_PER_BLOCK (SHA1_HMAC_BLOCK_SIZE/sizeof (uint32_t))
+
+/*
+ * CSPI information (entry points, provider info, etc.)
+ */
+typedef enum sha1_mech_type {
+ SHA1_MECH_INFO_TYPE, /* SUN_CKM_SHA1 */
+ SHA1_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA1_HMAC */
+ SHA1_HMAC_GEN_MECH_INFO_TYPE /* SUN_CKM_SHA1_HMAC_GENERAL */
+} sha1_mech_type_t;
+
+/*
+ * Context for SHA1 mechanism.
+ */
+typedef struct sha1_ctx {
+ sha1_mech_type_t sc_mech_type; /* type of context */
+ SHA1_CTX sc_sha1_ctx; /* SHA1 context */
+} sha1_ctx_t;
+
+/*
+ * Context for SHA1-HMAC and SHA1-HMAC-GENERAL mechanisms.
+ */
+typedef struct sha1_hmac_ctx {
+ sha1_mech_type_t hc_mech_type; /* type of context */
+ uint32_t hc_digest_len; /* digest len in bytes */
+ SHA1_CTX hc_icontext; /* inner SHA1 context */
+ SHA1_CTX hc_ocontext; /* outer SHA1 context */
+} sha1_hmac_ctx_t;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SHA1_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sha2/sha2_consts.h b/sys/contrib/openzfs/module/icp/include/sha2/sha2_consts.h
new file mode 100644
index 000000000000..3a6645508fe9
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sha2/sha2_consts.h
@@ -0,0 +1,219 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SHA2_CONSTS_H
+#define _SYS_SHA2_CONSTS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Loading 32-bit constants on a sparc is expensive since it involves both
+ * a `sethi' and an `or'. thus, we instead use `ld' to load the constants
+ * from an array called `sha2_consts'. however, on intel (and perhaps other
+ * processors), it is cheaper to load the constant directly. thus, the c
+ * code in SHA transform functions uses the macro SHA2_CONST() which either
+ * expands to a constant or an array reference, depending on
+ * the architecture the code is being compiled for.
+ *
+ * SHA512 constants are used for SHA384
+ */
+
+#include <sys/types.h> /* uint32_t */
+
+extern const uint32_t sha256_consts[];
+extern const uint64_t sha512_consts[];
+
+#if defined(__sparc)
+#define SHA256_CONST(x) (sha256_consts[x])
+#define SHA512_CONST(x) (sha512_consts[x])
+#else
+#define SHA256_CONST(x) (SHA256_CONST_ ## x)
+#define SHA512_CONST(x) (SHA512_CONST_ ## x)
+#endif
+
+/* constants, as provided in FIPS 180-2 */
+
+#define SHA256_CONST_0 0x428a2f98U
+#define SHA256_CONST_1 0x71374491U
+#define SHA256_CONST_2 0xb5c0fbcfU
+#define SHA256_CONST_3 0xe9b5dba5U
+#define SHA256_CONST_4 0x3956c25bU
+#define SHA256_CONST_5 0x59f111f1U
+#define SHA256_CONST_6 0x923f82a4U
+#define SHA256_CONST_7 0xab1c5ed5U
+
+#define SHA256_CONST_8 0xd807aa98U
+#define SHA256_CONST_9 0x12835b01U
+#define SHA256_CONST_10 0x243185beU
+#define SHA256_CONST_11 0x550c7dc3U
+#define SHA256_CONST_12 0x72be5d74U
+#define SHA256_CONST_13 0x80deb1feU
+#define SHA256_CONST_14 0x9bdc06a7U
+#define SHA256_CONST_15 0xc19bf174U
+
+#define SHA256_CONST_16 0xe49b69c1U
+#define SHA256_CONST_17 0xefbe4786U
+#define SHA256_CONST_18 0x0fc19dc6U
+#define SHA256_CONST_19 0x240ca1ccU
+#define SHA256_CONST_20 0x2de92c6fU
+#define SHA256_CONST_21 0x4a7484aaU
+#define SHA256_CONST_22 0x5cb0a9dcU
+#define SHA256_CONST_23 0x76f988daU
+
+#define SHA256_CONST_24 0x983e5152U
+#define SHA256_CONST_25 0xa831c66dU
+#define SHA256_CONST_26 0xb00327c8U
+#define SHA256_CONST_27 0xbf597fc7U
+#define SHA256_CONST_28 0xc6e00bf3U
+#define SHA256_CONST_29 0xd5a79147U
+#define SHA256_CONST_30 0x06ca6351U
+#define SHA256_CONST_31 0x14292967U
+
+#define SHA256_CONST_32 0x27b70a85U
+#define SHA256_CONST_33 0x2e1b2138U
+#define SHA256_CONST_34 0x4d2c6dfcU
+#define SHA256_CONST_35 0x53380d13U
+#define SHA256_CONST_36 0x650a7354U
+#define SHA256_CONST_37 0x766a0abbU
+#define SHA256_CONST_38 0x81c2c92eU
+#define SHA256_CONST_39 0x92722c85U
+
+#define SHA256_CONST_40 0xa2bfe8a1U
+#define SHA256_CONST_41 0xa81a664bU
+#define SHA256_CONST_42 0xc24b8b70U
+#define SHA256_CONST_43 0xc76c51a3U
+#define SHA256_CONST_44 0xd192e819U
+#define SHA256_CONST_45 0xd6990624U
+#define SHA256_CONST_46 0xf40e3585U
+#define SHA256_CONST_47 0x106aa070U
+
+#define SHA256_CONST_48 0x19a4c116U
+#define SHA256_CONST_49 0x1e376c08U
+#define SHA256_CONST_50 0x2748774cU
+#define SHA256_CONST_51 0x34b0bcb5U
+#define SHA256_CONST_52 0x391c0cb3U
+#define SHA256_CONST_53 0x4ed8aa4aU
+#define SHA256_CONST_54 0x5b9cca4fU
+#define SHA256_CONST_55 0x682e6ff3U
+
+#define SHA256_CONST_56 0x748f82eeU
+#define SHA256_CONST_57 0x78a5636fU
+#define SHA256_CONST_58 0x84c87814U
+#define SHA256_CONST_59 0x8cc70208U
+#define SHA256_CONST_60 0x90befffaU
+#define SHA256_CONST_61 0xa4506cebU
+#define SHA256_CONST_62 0xbef9a3f7U
+#define SHA256_CONST_63 0xc67178f2U
+
+#define SHA512_CONST_0 0x428a2f98d728ae22ULL
+#define SHA512_CONST_1 0x7137449123ef65cdULL
+#define SHA512_CONST_2 0xb5c0fbcfec4d3b2fULL
+#define SHA512_CONST_3 0xe9b5dba58189dbbcULL
+#define SHA512_CONST_4 0x3956c25bf348b538ULL
+#define SHA512_CONST_5 0x59f111f1b605d019ULL
+#define SHA512_CONST_6 0x923f82a4af194f9bULL
+#define SHA512_CONST_7 0xab1c5ed5da6d8118ULL
+#define SHA512_CONST_8 0xd807aa98a3030242ULL
+#define SHA512_CONST_9 0x12835b0145706fbeULL
+#define SHA512_CONST_10 0x243185be4ee4b28cULL
+#define SHA512_CONST_11 0x550c7dc3d5ffb4e2ULL
+#define SHA512_CONST_12 0x72be5d74f27b896fULL
+#define SHA512_CONST_13 0x80deb1fe3b1696b1ULL
+#define SHA512_CONST_14 0x9bdc06a725c71235ULL
+#define SHA512_CONST_15 0xc19bf174cf692694ULL
+#define SHA512_CONST_16 0xe49b69c19ef14ad2ULL
+#define SHA512_CONST_17 0xefbe4786384f25e3ULL
+#define SHA512_CONST_18 0x0fc19dc68b8cd5b5ULL
+#define SHA512_CONST_19 0x240ca1cc77ac9c65ULL
+#define SHA512_CONST_20 0x2de92c6f592b0275ULL
+#define SHA512_CONST_21 0x4a7484aa6ea6e483ULL
+#define SHA512_CONST_22 0x5cb0a9dcbd41fbd4ULL
+#define SHA512_CONST_23 0x76f988da831153b5ULL
+#define SHA512_CONST_24 0x983e5152ee66dfabULL
+#define SHA512_CONST_25 0xa831c66d2db43210ULL
+#define SHA512_CONST_26 0xb00327c898fb213fULL
+#define SHA512_CONST_27 0xbf597fc7beef0ee4ULL
+#define SHA512_CONST_28 0xc6e00bf33da88fc2ULL
+#define SHA512_CONST_29 0xd5a79147930aa725ULL
+#define SHA512_CONST_30 0x06ca6351e003826fULL
+#define SHA512_CONST_31 0x142929670a0e6e70ULL
+#define SHA512_CONST_32 0x27b70a8546d22ffcULL
+#define SHA512_CONST_33 0x2e1b21385c26c926ULL
+#define SHA512_CONST_34 0x4d2c6dfc5ac42aedULL
+#define SHA512_CONST_35 0x53380d139d95b3dfULL
+#define SHA512_CONST_36 0x650a73548baf63deULL
+#define SHA512_CONST_37 0x766a0abb3c77b2a8ULL
+#define SHA512_CONST_38 0x81c2c92e47edaee6ULL
+#define SHA512_CONST_39 0x92722c851482353bULL
+#define SHA512_CONST_40 0xa2bfe8a14cf10364ULL
+#define SHA512_CONST_41 0xa81a664bbc423001ULL
+#define SHA512_CONST_42 0xc24b8b70d0f89791ULL
+#define SHA512_CONST_43 0xc76c51a30654be30ULL
+#define SHA512_CONST_44 0xd192e819d6ef5218ULL
+#define SHA512_CONST_45 0xd69906245565a910ULL
+#define SHA512_CONST_46 0xf40e35855771202aULL
+#define SHA512_CONST_47 0x106aa07032bbd1b8ULL
+#define SHA512_CONST_48 0x19a4c116b8d2d0c8ULL
+#define SHA512_CONST_49 0x1e376c085141ab53ULL
+#define SHA512_CONST_50 0x2748774cdf8eeb99ULL
+#define SHA512_CONST_51 0x34b0bcb5e19b48a8ULL
+#define SHA512_CONST_52 0x391c0cb3c5c95a63ULL
+#define SHA512_CONST_53 0x4ed8aa4ae3418acbULL
+#define SHA512_CONST_54 0x5b9cca4f7763e373ULL
+#define SHA512_CONST_55 0x682e6ff3d6b2b8a3ULL
+#define SHA512_CONST_56 0x748f82ee5defb2fcULL
+#define SHA512_CONST_57 0x78a5636f43172f60ULL
+#define SHA512_CONST_58 0x84c87814a1f0ab72ULL
+#define SHA512_CONST_59 0x8cc702081a6439ecULL
+#define SHA512_CONST_60 0x90befffa23631e28ULL
+#define SHA512_CONST_61 0xa4506cebde82bde9ULL
+#define SHA512_CONST_62 0xbef9a3f7b2c67915ULL
+#define SHA512_CONST_63 0xc67178f2e372532bULL
+#define SHA512_CONST_64 0xca273eceea26619cULL
+#define SHA512_CONST_65 0xd186b8c721c0c207ULL
+#define SHA512_CONST_66 0xeada7dd6cde0eb1eULL
+#define SHA512_CONST_67 0xf57d4f7fee6ed178ULL
+#define SHA512_CONST_68 0x06f067aa72176fbaULL
+#define SHA512_CONST_69 0x0a637dc5a2c898a6ULL
+#define SHA512_CONST_70 0x113f9804bef90daeULL
+#define SHA512_CONST_71 0x1b710b35131c471bULL
+#define SHA512_CONST_72 0x28db77f523047d84ULL
+#define SHA512_CONST_73 0x32caab7b40c72493ULL
+#define SHA512_CONST_74 0x3c9ebe0a15c9bebcULL
+#define SHA512_CONST_75 0x431d67c49c100d4cULL
+#define SHA512_CONST_76 0x4cc5d4becb3e42b6ULL
+#define SHA512_CONST_77 0x597f299cfc657e2aULL
+#define SHA512_CONST_78 0x5fcb6fab3ad6faecULL
+#define SHA512_CONST_79 0x6c44198c4a475817ULL
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SHA2_CONSTS_H */
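
A minimal sketch of how the SHA256_CONST() indirection above is typically consumed by an unrolled round; ROTR, SIGMA1, CH and SHA256_T1 are illustrative names, not part of this header. Because the non-sparc variant token-pastes its argument, the round number must be a literal at expansion time.

#define ROTR(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
#define SIGMA1(x)	(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define CH(x, y, z)	(((x) & (y)) ^ (~(x) & (z)))

/* T1 term of FIPS 180-2 round t: h + Sigma1(e) + Ch(e, f, g) + K[t] + W[t] */
#define SHA256_T1(e, f, g, h, t, W) \
	((h) + SIGMA1(e) + CH(e, f, g) + SHA256_CONST(t) + (W)[t])

/* e.g. SHA256_T1(e, f, g, h, 0, w) picks up SHA256_CONST_0 == 0x428a2f98U */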
diff --git a/sys/contrib/openzfs/module/icp/include/sha2/sha2_impl.h b/sys/contrib/openzfs/module/icp/include/sha2/sha2_impl.h
new file mode 100644
index 000000000000..b9768d344e95
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sha2/sha2_impl.h
@@ -0,0 +1,64 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SHA2_IMPL_H
+#define _SHA2_IMPL_H
+
+#include <sys/sha2.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ SHA1_TYPE,
+ SHA256_TYPE,
+ SHA384_TYPE,
+ SHA512_TYPE
+} sha2_mech_t;
+
+/*
+ * Context for SHA2 mechanism.
+ */
+typedef struct sha2_ctx {
+ sha2_mech_type_t sc_mech_type; /* type of context */
+ SHA2_CTX sc_sha2_ctx; /* SHA2 context */
+} sha2_ctx_t;
+
+/*
+ * Context for SHA2 HMAC and HMAC GENERAL mechanisms.
+ */
+typedef struct sha2_hmac_ctx {
+ sha2_mech_type_t hc_mech_type; /* type of context */
+ uint32_t hc_digest_len; /* digest len in bytes */
+ SHA2_CTX hc_icontext; /* inner SHA2 context */
+ SHA2_CTX hc_ocontext; /* outer SHA2 context */
+} sha2_hmac_ctx_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SHA2_IMPL_H */
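
A minimal sketch of why sha2_hmac_ctx_t carries two contexts, assuming the SHA2Init()/SHA2Update() entry points declared in sys/sha2.h and a key already reduced to the 64-byte SHA-256 block size (longer keys are digested first); the function name is illustrative, not part of this header.

static void
sha2_hmac_init_sketch(sha2_hmac_ctx_t *ctx, const uint8_t key[64])
{
	uint8_t ipad[64], opad[64];
	int i;

	for (i = 0; i < 64; i++) {
		ipad[i] = key[i] ^ 0x36;	/* inner pad */
		opad[i] = key[i] ^ 0x5c;	/* outer pad */
	}

	/* hc_icontext absorbs ipad || message; hc_ocontext absorbs opad */
	SHA2Init(SHA256_MECH_INFO_TYPE, &ctx->hc_icontext);
	SHA2Update(&ctx->hc_icontext, ipad, sizeof (ipad));

	SHA2Init(SHA256_MECH_INFO_TYPE, &ctx->hc_ocontext);
	SHA2Update(&ctx->hc_ocontext, opad, sizeof (opad));

	/* later: the inner digest is finalized, then fed to hc_ocontext */
}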
diff --git a/sys/contrib/openzfs/module/icp/include/sys/asm_linkage.h b/sys/contrib/openzfs/module/icp/include/sys/asm_linkage.h
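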
new file mode 100644
index 000000000000..49a494b46e0b
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/asm_linkage.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ASM_LINKAGE_H
+#define _SYS_ASM_LINKAGE_H
+
+#if defined(__i386) || defined(__amd64)
+
+#include <sys/ia32/asm_linkage.h> /* XX64 x86/sys/asm_linkage.h */
+
+#endif
+
+#if defined(_KERNEL) && defined(HAVE_KERNEL_OBJTOOL)
+
+#include <asm/frame.h>
+
+#else /* userspace */
+#define FRAME_BEGIN
+#define FRAME_END
+#endif
+
+
+#endif /* _SYS_ASM_LINKAGE_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/bitmap.h b/sys/contrib/openzfs/module/icp/include/sys/bitmap.h
new file mode 100644
index 000000000000..4e86ee70ed9e
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/bitmap.h
@@ -0,0 +1,183 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#ifndef _SYS_BITMAP_H
+#define _SYS_BITMAP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(_ASM_INLINES) && \
+ (defined(__i386) || defined(__amd64))
+#include <asm/bitmap.h>
+#endif
+
+/*
+ * Operations on bitmaps of arbitrary size
+ * A bitmap is a vector of 1 or more ulong_t's.
+ * The user of the package is responsible for range checks and keeping
+ * track of sizes.
+ */
+
+#ifdef _LP64
+#define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */
+#define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL32, to extract word index */
+#else
+#define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */
+#endif
+
+#define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */
+#define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */
+
+#ifdef _LP64
+#define BT_NBIPUL32 (1 << BT_ULSHIFT32) /* n bits per ulong_t */
+#define BT_ULMASK32 (BT_NBIPUL32 - 1) /* to extract bit index */
+#define BT_ULMAXMASK 0xffffffffffffffff /* used by bt_getlowbit */
+#else
+#define BT_ULMAXMASK 0xffffffff
+#endif
+
+/*
+ * bitmap is a ulong_t *, bitindex an index_t
+ *
+ * The macros BT_WIM and BT_BIW are internal; there is no need
+ * for users of this package to use them.
+ */
+
+/*
+ * word in map
+ */
+#define BT_WIM(bitmap, bitindex) \
+ ((bitmap)[(bitindex) >> BT_ULSHIFT])
+/*
+ * bit in word
+ */
+#define BT_BIW(bitindex) \
+ (1UL << ((bitindex) & BT_ULMASK))
+
+#ifdef _LP64
+#define BT_WIM32(bitmap, bitindex) \
+ ((bitmap)[(bitindex) >> BT_ULSHIFT32])
+
+#define BT_BIW32(bitindex) \
+ (1UL << ((bitindex) & BT_ULMASK32))
+#endif
+
+/*
+ * These are public macros
+ *
+ * BT_BITOUL == n bits to n ulong_t's
+ */
+#define BT_BITOUL(nbits) \
+ (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL)
+#define BT_SIZEOFMAP(nbits) \
+ (BT_BITOUL(nbits) * sizeof (ulong_t))
+#define BT_TEST(bitmap, bitindex) \
+ ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0)
+#define BT_SET(bitmap, bitindex) \
+ { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); }
+#define BT_CLEAR(bitmap, bitindex) \
+ { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); }
+
+#ifdef _LP64
+#define BT_BITOUL32(nbits) \
+ (((nbits) + BT_NBIPUL32 - 1l) / BT_NBIPUL32)
+#define BT_SIZEOFMAP32(nbits) \
+ (BT_BITOUL32(nbits) * sizeof (uint_t))
+#define BT_TEST32(bitmap, bitindex) \
+ ((BT_WIM32((bitmap), (bitindex)) & BT_BIW32(bitindex)) ? 1 : 0)
+#define BT_SET32(bitmap, bitindex) \
+ { BT_WIM32((bitmap), (bitindex)) |= BT_BIW32(bitindex); }
+#define BT_CLEAR32(bitmap, bitindex) \
+ { BT_WIM32((bitmap), (bitindex)) &= ~BT_BIW32(bitindex); }
+#endif /* _LP64 */
+
+
+/*
+ * BIT_ONLYONESET is a private macro not designed for bitmaps of
+ * arbitrary size. u must be an unsigned integer/long. It returns
+ * true if one and only one bit is set in u.
+ */
+#define BIT_ONLYONESET(u) \
+ ((((u) == 0) ? 0 : ((u) & ((u) - 1)) == 0))
+
+#ifndef _ASM
+
+/*
+ * return next available bit index from map with specified number of bits
+ */
+extern index_t bt_availbit(ulong_t *bitmap, size_t nbits);
+/*
+ * find the highest order bit that is on, and is within or below
+ * the word specified by wx
+ */
+extern int bt_gethighbit(ulong_t *mapp, int wx);
+extern int bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2,
+ size_t end_pos);
+extern int bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop);
+extern void bt_copy(ulong_t *, ulong_t *, ulong_t);
+
+/*
+ * find the parity
+ */
+extern int odd_parity(ulong_t);
+
+/*
+ * Atomically set/clear bits
+ * Atomic exclusive operations will set "result" to "-1"
+ * if the bit is already set/cleared. "result" will be set
+ * to 0 otherwise.
+ */
+#define BT_ATOMIC_SET(bitmap, bitindex) \
+ { atomic_or_ulong(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); }
+#define BT_ATOMIC_CLEAR(bitmap, bitindex) \
+ { atomic_and_ulong(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); }
+
+#define BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \
+ { result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)), \
+ (bitindex) % BT_NBIPUL); }
+#define BT_ATOMIC_CLEAR_EXCL(bitmap, bitindex, result) \
+ { result = atomic_clear_long_excl(&(BT_WIM(bitmap, bitindex)), \
+ (bitindex) % BT_NBIPUL); }
+
+/*
+ * Extracts bits between index h (high, inclusive) and l (low, inclusive) from
+ * u, which must be an unsigned integer.
+ */
+#define BITX(u, h, l) (((u) >> (l)) & ((1LU << ((h) - (l) + 1LU)) - 1LU))
+
+#endif /* _ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BITMAP_H */
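
A small usage sketch of the public macros above, assuming only this header and sys/types.h; the function name is illustrative.

static int
bitmap_usage_sketch(void)
{
	ulong_t map[BT_BITOUL(128)] = { 0 };	/* a 128-bit map */
	int ok = 1;

	BT_SET(map, 70);			/* word 1, bit 6 on LP64 */
	ok &= (BT_TEST(map, 70) == 1);
	BT_CLEAR(map, 70);
	ok &= (BT_TEST(map, 70) == 0);

	ok &= (BITX(0xa5U, 7, 4) == 0xa);	/* bits 7..4 of 1010 0101 */
	ok &= BIT_ONLYONESET(0x40U);		/* exactly one bit set */
	ok &= !BIT_ONLYONESET(0x41U);		/* two bits set */

	return (ok);
}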
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/elfsign.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/elfsign.h
new file mode 100644
index 000000000000..5432f0c8d607
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/elfsign.h
@@ -0,0 +1,137 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_ELFSIGN_H
+#define _SYS_CRYPTO_ELFSIGN_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Consolidation Private Interface for elfsign/libpkcs11/kcfd
+ */
+
+#include <sys/zfs_context.h>
+
+/*
+ * Project Private structures and types used for communication between kcfd
+ * and KCF over the door.
+ */
+
+typedef enum ELFsign_status_e {
+ ELFSIGN_UNKNOWN,
+ ELFSIGN_SUCCESS,
+ ELFSIGN_FAILED,
+ ELFSIGN_NOTSIGNED,
+ ELFSIGN_INVALID_CERTPATH,
+ ELFSIGN_INVALID_ELFOBJ,
+ ELFSIGN_RESTRICTED
+} ELFsign_status_t;
+
+#define KCF_KCFD_VERSION1 1
+#define SIG_MAX_LENGTH 1024
+
+#define ELF_SIGNATURE_SECTION ".SUNW_signature"
+
+typedef struct kcf_door_arg_s {
+ short da_version;
+ boolean_t da_iskernel;
+
+ union {
+ char filename[MAXPATHLEN]; /* For request */
+
+ struct kcf_door_result_s { /* For response */
+ ELFsign_status_t status;
+ uint32_t siglen;
+ uchar_t signature[1];
+ } result;
+ } da_u;
+} kcf_door_arg_t;
+
+typedef uint32_t filesig_vers_t;
+
+/*
+ * File Signature Structure
+ * Applicable to ELF and other file formats
+ */
+struct filesignatures {
+ uint32_t filesig_cnt; /* count of signatures */
+ uint32_t filesig_pad; /* unused */
+ union {
+ char filesig_data[1];
+ struct filesig { /* one of these for each signature */
+ uint32_t filesig_size;
+ filesig_vers_t filesig_version;
+ union {
+ struct filesig_version1 {
+ uint32_t filesig_v1_dnsize;
+ uint32_t filesig_v1_sigsize;
+ uint32_t filesig_v1_oidsize;
+ char filesig_v1_data[1];
+ } filesig_v1;
+ struct filesig_version3 {
+ uint64_t filesig_v3_time;
+ uint32_t filesig_v3_dnsize;
+ uint32_t filesig_v3_sigsize;
+ uint32_t filesig_v3_oidsize;
+ char filesig_v3_data[1];
+ } filesig_v3;
+ } _u2;
+ } filesig_sig;
+ uint64_t filesig_align;
+ } _u1;
+};
+#define filesig_sig _u1.filesig_sig
+
+#define filesig_v1_dnsize _u2.filesig_v1.filesig_v1_dnsize
+#define filesig_v1_sigsize _u2.filesig_v1.filesig_v1_sigsize
+#define filesig_v1_oidsize _u2.filesig_v1.filesig_v1_oidsize
+#define filesig_v1_data _u2.filesig_v1.filesig_v1_data
+
+#define filesig_v3_time _u2.filesig_v3.filesig_v3_time
+#define filesig_v3_dnsize _u2.filesig_v3.filesig_v3_dnsize
+#define filesig_v3_sigsize _u2.filesig_v3.filesig_v3_sigsize
+#define filesig_v3_oidsize _u2.filesig_v3.filesig_v3_oidsize
+#define filesig_v3_data _u2.filesig_v3.filesig_v3_data
+
+#define filesig_ALIGN(s) (((s) + sizeof (uint64_t) - 1) & \
+ (-sizeof (uint64_t)))
+#define filesig_next(ptr) (struct filesig *)((void *)((char *)(ptr) + \
+ filesig_ALIGN((ptr)->filesig_size)))
+
+#define FILESIG_UNKNOWN 0 /* unrecognized version */
+#define FILESIG_VERSION1 1 /* version1, all but sig section */
+#define FILESIG_VERSION2 2 /* version1 format, SHF_ALLOC only */
+#define FILESIG_VERSION3 3 /* version3, all but sig section */
+#define FILESIG_VERSION4 4 /* version3 format, SHF_ALLOC only */
+
+#define _PATH_KCFD_DOOR "/etc/svc/volatile/kcfd_door"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_ELFSIGN_H */
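
A sketch of how the variable-length records in struct filesignatures are typically walked with filesig_next(), assuming a buffer already validated by the caller; the function name is illustrative.

static void
filesig_walk_sketch(struct filesignatures *fssp)
{
	struct filesig *fsp = &fssp->filesig_sig;	/* first record */
	uint32_t i;

	for (i = 0; i < fssp->filesig_cnt; i++) {
		switch (fsp->filesig_version) {
		case FILESIG_VERSION1:
		case FILESIG_VERSION2:
			/* filesig_v1_data holds DN, signature and OID */
			break;
		case FILESIG_VERSION3:
		case FILESIG_VERSION4:
			/* filesig_v3 additionally carries filesig_v3_time */
			break;
		default:
			return;			/* FILESIG_UNKNOWN */
		}
		fsp = filesig_next(fsp);	/* skip the aligned record */
	}
}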
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/impl.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/impl.h
new file mode 100644
index 000000000000..0f37f3f63532
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/impl.h
@@ -0,0 +1,1363 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_IMPL_H
+#define _SYS_CRYPTO_IMPL_H
+
+/*
+ * Kernel Cryptographic Framework private implementation definitions.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/ioctl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define KCF_MODULE "kcf"
+
+/*
+ * Prefix convention: structures internal to the kernel cryptographic
+ * framework start with 'kcf_'. Exposed structures start with 'crypto_'.
+ */
+
+/* Provider stats. Not protected. */
+typedef struct kcf_prov_stats {
+ kstat_named_t ps_ops_total;
+ kstat_named_t ps_ops_passed;
+ kstat_named_t ps_ops_failed;
+ kstat_named_t ps_ops_busy_rval;
+} kcf_prov_stats_t;
+
+/* Various kcf stats. Not protected. */
+typedef struct kcf_stats {
+ kstat_named_t ks_thrs_in_pool;
+ kstat_named_t ks_idle_thrs;
+ kstat_named_t ks_minthrs;
+ kstat_named_t ks_maxthrs;
+ kstat_named_t ks_swq_njobs;
+ kstat_named_t ks_swq_maxjobs;
+ kstat_named_t ks_taskq_threads;
+ kstat_named_t ks_taskq_minalloc;
+ kstat_named_t ks_taskq_maxalloc;
+} kcf_stats_t;
+
+/*
+ * Keep all the information needed by the scheduler from
+ * this provider.
+ */
+typedef struct kcf_sched_info {
+ /* The number of operations dispatched. */
+ uint64_t ks_ndispatches;
+
+ /* The number of operations that failed. */
+ uint64_t ks_nfails;
+
+ /* The number of operations that returned CRYPTO_BUSY. */
+ uint64_t ks_nbusy_rval;
+
+ /* taskq used to dispatch crypto requests */
+ taskq_t *ks_taskq;
+} kcf_sched_info_t;
+
+/*
+ * pd_irefcnt approximates the number of inflight requests to the
+ * provider. Though we increment this counter during registration for
+ * other purposes, that base value is mostly the same across all providers.
+ * So, it is a good measure of the load on a provider when it is not
+ * in a busy state. Once a provider notifies it is busy, requests
+ * back up in the taskq. So, we use tq_nalloc in that case, which gives
+ * the number of task entries in the task queue. Note that we do not
+ * acquire any locks here as it is not critical to get the exact number
+ * and the lock contention may be too costly for this code path.
+ */
+#define KCF_PROV_LOAD(pd) ((pd)->pd_state != KCF_PROV_BUSY ? \
+ (pd)->pd_irefcnt : (pd)->pd_sched_info.ks_taskq->tq_nalloc)
+
+#define KCF_PROV_INCRSTATS(pd, error) { \
+ (pd)->pd_sched_info.ks_ndispatches++; \
+ if (error == CRYPTO_BUSY) \
+ (pd)->pd_sched_info.ks_nbusy_rval++; \
+ else if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED) \
+ (pd)->pd_sched_info.ks_nfails++; \
+}
+
+
+/*
+ * The following two macros should be
+ * #define KCF_OPS_CLASSSIZE (KCF_LAST_OPSCLASS - KCF_FIRST_OPSCLASS + 2)
+ * #define KCF_MAXMECHTAB KCF_MAXCIPHER
+ *
+ * However, doing that would involve reorganizing the header file a bit.
+ * When impl.h is broken up (bug# 4703218), this will be done. For now,
+ * we hardcode these values.
+ */
+#define KCF_OPS_CLASSSIZE 8
+#define KCF_MAXMECHTAB 32
+
+/*
+ * Valid values for the state of a provider. The order of
+ * the elements is important.
+ *
+ * Routines which get a provider or the list of providers
+ * should pick only those that are either in KCF_PROV_READY state
+ * or in KCF_PROV_BUSY state.
+ */
+typedef enum {
+ KCF_PROV_ALLOCATED = 1,
+ KCF_PROV_UNVERIFIED,
+ KCF_PROV_VERIFICATION_FAILED,
+ /*
+ * state < KCF_PROV_READY means the provider can not
+ * be used at all.
+ */
+ KCF_PROV_READY,
+ KCF_PROV_BUSY,
+ /*
+ * state > KCF_PROV_BUSY means the provider can not
+ * be used for new requests.
+ */
+ KCF_PROV_FAILED,
+ /*
+ * Threads setting the following two states should do so only
+ * if the current state < KCF_PROV_DISABLED.
+ */
+ KCF_PROV_DISABLED,
+ KCF_PROV_REMOVED,
+ KCF_PROV_FREED
+} kcf_prov_state_t;
+
+#define KCF_IS_PROV_UNVERIFIED(pd) ((pd)->pd_state == KCF_PROV_UNVERIFIED)
+#define KCF_IS_PROV_USABLE(pd) ((pd)->pd_state == KCF_PROV_READY || \
+ (pd)->pd_state == KCF_PROV_BUSY)
+#define KCF_IS_PROV_REMOVED(pd) ((pd)->pd_state >= KCF_PROV_REMOVED)
+
+/* Internal flags valid for pd_flags field */
+#define KCF_PROV_RESTRICTED 0x40000000
+#define KCF_LPROV_MEMBER 0x80000000 /* is member of a logical provider */
+
+/*
+ * A provider descriptor structure. There is one such structure per
+ * provider. It is allocated and initialized at registration time and
+ * freed when the provider unregisters.
+ *
+ * pd_prov_type: Provider type, hardware or software
+ * pd_sid: Session ID of the provider used by kernel clients.
+ * This is valid only for session-oriented providers.
+ * pd_refcnt: Reference counter to this provider descriptor
+ * pd_irefcnt: References held by the framework internal structs
+ * pd_lock: lock protects pd_state and pd_provider_list
+ * pd_state: State value of the provider
+ * pd_provider_list: Used to cross-reference logical providers and their
+ * members. Not used for software providers.
+ * pd_resume_cv: cv to wait for state to change from KCF_PROV_BUSY
+ * pd_prov_handle: Provider handle specified by provider
+ * pd_ops_vector: The ops vector specified by Provider
+ * pd_mech_indx: Lookup table which maps a core framework mechanism
+ * number to an index in pd_mechanisms array
+ * pd_mechanisms: Array of mechanisms supported by the provider, specified
+ * by the provider during registration
+ * pd_sched_info: Scheduling information associated with the provider
+ * pd_mech_list_count: The number of entries in pi_mechanisms, specified
+ * by the provider during registration
+ * pd_name: Device name or module name
+ * pd_instance: Device instance
+ * pd_module_id: Module ID returned by modload
+ * pd_mctlp: Pointer to modctl structure for this provider
+ * pd_remove_cv: cv to wait on while the provider queue drains
+ * pd_description: Provider description string
+ * pd_flags: bitwise OR of pi_flags from crypto_provider_info_t
+ * and other internal flags defined above.
+ * pd_hash_limit: Maximum data size that hash mechanisms of this provider
+ * can support.
+ * pd_kcf_prov_handle: KCF-private handle assigned by KCF
+ * pd_prov_id: Identification # assigned by KCF to provider
+ * pd_kstat: kstat associated with the provider
+ * pd_ks_data: kstat data
+ */
+typedef struct kcf_provider_desc {
+ crypto_provider_type_t pd_prov_type;
+ crypto_session_id_t pd_sid;
+ uint_t pd_refcnt;
+ uint_t pd_irefcnt;
+ kmutex_t pd_lock;
+ kcf_prov_state_t pd_state;
+ struct kcf_provider_list *pd_provider_list;
+ kcondvar_t pd_resume_cv;
+ crypto_provider_handle_t pd_prov_handle;
+ crypto_ops_t *pd_ops_vector;
+ ushort_t pd_mech_indx[KCF_OPS_CLASSSIZE]\
+ [KCF_MAXMECHTAB];
+ crypto_mech_info_t *pd_mechanisms;
+ kcf_sched_info_t pd_sched_info;
+ uint_t pd_mech_list_count;
+ // char *pd_name;
+ // uint_t pd_instance;
+ // int pd_module_id;
+ // struct modctl *pd_mctlp;
+ kcondvar_t pd_remove_cv;
+ char *pd_description;
+ uint_t pd_flags;
+ uint_t pd_hash_limit;
+ crypto_kcf_provider_handle_t pd_kcf_prov_handle;
+ crypto_provider_id_t pd_prov_id;
+ kstat_t *pd_kstat;
+ kcf_prov_stats_t pd_ks_data;
+} kcf_provider_desc_t;
+
+/* useful for making a list of providers */
+typedef struct kcf_provider_list {
+ struct kcf_provider_list *pl_next;
+ struct kcf_provider_desc *pl_provider;
+} kcf_provider_list_t;
+
+/* atomic operations in linux implicitly form a memory barrier */
+#define membar_exit()
+
+/*
+ * If a component has a reference to a kcf_provider_desc_t,
+ * it REFHOLD()s. A new provider descriptor which is referenced only
+ * by the providers table has a reference counter of one.
+ */
+#define KCF_PROV_REFHOLD(desc) { \
+ atomic_add_32(&(desc)->pd_refcnt, 1); \
+ ASSERT((desc)->pd_refcnt != 0); \
+}
+
+#define KCF_PROV_IREFHOLD(desc) { \
+ atomic_add_32(&(desc)->pd_irefcnt, 1); \
+ ASSERT((desc)->pd_irefcnt != 0); \
+}
+
+#define KCF_PROV_IREFRELE(desc) { \
+ ASSERT((desc)->pd_irefcnt != 0); \
+ membar_exit(); \
+ if (atomic_add_32_nv(&(desc)->pd_irefcnt, -1) == 0) { \
+ cv_broadcast(&(desc)->pd_remove_cv); \
+ } \
+}
+
+#define KCF_PROV_REFHELD(desc) ((desc)->pd_refcnt >= 1)
+
+#define KCF_PROV_REFRELE(desc) { \
+ ASSERT((desc)->pd_refcnt != 0); \
+ membar_exit(); \
+ if (atomic_add_32_nv(&(desc)->pd_refcnt, -1) == 0) { \
+ kcf_provider_zero_refcnt((desc)); \
+ } \
+}
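
A sketch of the hold/check/release discipline the comment above describes, assuming a descriptor obtained from the providers table; the function name is illustrative.

static boolean_t
provider_hold_sketch(kcf_provider_desc_t *pd)
{
	KCF_PROV_REFHOLD(pd);			/* pin the descriptor */

	if (!KCF_IS_PROV_USABLE(pd)) {		/* READY or BUSY only */
		KCF_PROV_REFRELE(pd);		/* last release may call */
		return (B_FALSE);		/* kcf_provider_zero_refcnt() */
	}

	/* ... dispatch a request to pd here ... */

	KCF_PROV_REFRELE(pd);
	return (B_TRUE);
}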
+
+
+/* list of crypto_mech_info_t valid as the second mech in a dual operation */
+
+typedef struct crypto_mech_info_list {
+ struct crypto_mech_info_list *ml_next;
+ crypto_mech_type_t ml_kcf_mechid; /* KCF's id */
+ crypto_mech_info_t ml_mech_info;
+} crypto_mech_info_list_t;
+
+/*
+ * An element in a mechanism provider descriptors chain.
+ * The kcf_prov_mech_desc_t is duplicated in every chain the provider belongs
+ * to. This trades a small amount of memory for not having to spin on a
+ * mutex to reach the common provider field.
+ */
+
+typedef struct kcf_prov_mech_desc {
+ struct kcf_mech_entry *pm_me; /* Back to the head */
+ struct kcf_prov_mech_desc *pm_next; /* Next in the chain */
+ crypto_mech_info_t pm_mech_info; /* Provider mech info */
+ crypto_mech_info_list_t *pm_mi_list; /* list for duals */
+ kcf_provider_desc_t *pm_prov_desc; /* Common desc. */
+} kcf_prov_mech_desc_t;
+
+/* and the notation shortcuts ... */
+#define pm_provider_type pm_prov_desc.pd_provider_type
+#define pm_provider_handle pm_prov_desc.pd_provider_handle
+#define pm_ops_vector pm_prov_desc.pd_ops_vector
+
+/*
+ * A mechanism entry in an xxx_mech_tab[]. me_pad was deemed
+ * to be unnecessary and removed.
+ */
+typedef struct kcf_mech_entry {
+ crypto_mech_name_t me_name; /* mechanism name */
+ crypto_mech_type_t me_mechid; /* Internal id for mechanism */
+ kmutex_t me_mutex; /* access protection */
+ kcf_prov_mech_desc_t *me_hw_prov_chain; /* list of HW providers */
+ kcf_prov_mech_desc_t *me_sw_prov; /* SW provider */
+ /*
+ * Number of HW providers in the chain. There is only one
+ * SW provider. So, we need only a count of HW providers.
+ */
+ int me_num_hwprov;
+ /*
+ * When a SW provider is present, this is the generation number that
+ * ensures no objects from old SW providers are used in the new one
+ */
+ uint32_t me_gen_swprov;
+ /*
+ * threshold for using hardware providers for this mech
+ */
+ size_t me_threshold;
+} kcf_mech_entry_t;
+
+/*
+ * A policy descriptor structure. It is allocated and initialized
+ * when administrative ioctls load disabled mechanisms.
+ *
+ * pd_prov_type: Provider type, hardware or software
+ * pd_name: Device name or module name.
+ * pd_instance: Device instance.
+ * pd_refcnt: Reference counter for this policy descriptor
+ * pd_mutex: Protects array and count of disabled mechanisms.
+ * pd_disabled_count: Count of disabled mechanisms.
+ * pd_disabled_mechs: Array of disabled mechanisms.
+ */
+typedef struct kcf_policy_desc {
+ crypto_provider_type_t pd_prov_type;
+ char *pd_name;
+ uint_t pd_instance;
+ uint_t pd_refcnt;
+ kmutex_t pd_mutex;
+ uint_t pd_disabled_count;
+ crypto_mech_name_t *pd_disabled_mechs;
+} kcf_policy_desc_t;
+
+/*
+ * If a component has a reference to a kcf_policy_desc_t,
+ * it REFHOLD()s. A new policy descriptor which is referenced only
+ * by the policy table has a reference count of one.
+ */
+#define KCF_POLICY_REFHOLD(desc) { \
+ atomic_add_32(&(desc)->pd_refcnt, 1); \
+ ASSERT((desc)->pd_refcnt != 0); \
+}
+
+/*
+ * Releases a reference to a policy descriptor. When the last
+ * reference is released, the descriptor is freed.
+ */
+#define KCF_POLICY_REFRELE(desc) { \
+ ASSERT((desc)->pd_refcnt != 0); \
+ membar_exit(); \
+ if (atomic_add_32_nv(&(desc)->pd_refcnt, -1) == 0) \
+ kcf_policy_free_desc(desc); \
+}
+
+/*
+ * This entry stores the name of a software module and its
+ * mechanisms. The mechanisms are 'hints' that are used to
+ * trigger loading of the module.
+ */
+typedef struct kcf_soft_conf_entry {
+ struct kcf_soft_conf_entry *ce_next;
+ char *ce_name;
+ crypto_mech_name_t *ce_mechs;
+ uint_t ce_count;
+} kcf_soft_conf_entry_t;
+
+extern kmutex_t soft_config_mutex;
+extern kcf_soft_conf_entry_t *soft_config_list;
+
+/*
+ * Global tables. The sizes are from the predefined PKCS#11 v2.20 mechanisms,
+ * with a margin of a few extra empty entries
+ */
+
+#define KCF_MAXDIGEST 16 /* Digests */
+#define KCF_MAXCIPHER 64 /* Ciphers */
+#define KCF_MAXMAC 40 /* Message authentication codes */
+#define KCF_MAXSIGN 24 /* Sign/Verify */
+#define KCF_MAXKEYOPS 116 /* Key generation and derivation */
+#define KCF_MAXMISC 16 /* Others ... */
+
+#define KCF_MAXMECHS KCF_MAXDIGEST + KCF_MAXCIPHER + KCF_MAXMAC + \
+ KCF_MAXSIGN + KCF_MAXKEYOPS + \
+ KCF_MAXMISC
+
+extern kcf_mech_entry_t kcf_digest_mechs_tab[];
+extern kcf_mech_entry_t kcf_cipher_mechs_tab[];
+extern kcf_mech_entry_t kcf_mac_mechs_tab[];
+extern kcf_mech_entry_t kcf_sign_mechs_tab[];
+extern kcf_mech_entry_t kcf_keyops_mechs_tab[];
+extern kcf_mech_entry_t kcf_misc_mechs_tab[];
+
+extern kmutex_t kcf_mech_tabs_lock;
+
+typedef enum {
+ KCF_DIGEST_CLASS = 1,
+ KCF_CIPHER_CLASS,
+ KCF_MAC_CLASS,
+ KCF_SIGN_CLASS,
+ KCF_KEYOPS_CLASS,
+ KCF_MISC_CLASS
+} kcf_ops_class_t;
+
+#define KCF_FIRST_OPSCLASS KCF_DIGEST_CLASS
+#define KCF_LAST_OPSCLASS KCF_MISC_CLASS
+
+/* The table of all the kcf_xxx_mech_tab[]s, indexed by kcf_ops_class */
+
+typedef struct kcf_mech_entry_tab {
+ int met_size; /* Size of the met_tab[] */
+ kcf_mech_entry_t *met_tab; /* the table */
+} kcf_mech_entry_tab_t;
+
+extern kcf_mech_entry_tab_t kcf_mech_tabs_tab[];
+
+#define KCF_MECHID(class, index) \
+ (((crypto_mech_type_t)(class) << 32) | (crypto_mech_type_t)(index))
+
+#define KCF_MECH2CLASS(mech_type) ((kcf_ops_class_t)((mech_type) >> 32))
+
+#define KCF_MECH2INDEX(mech_type) ((int)(mech_type))
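
A mechanism type therefore packs the ops class into the upper 32 bits and the table index into the lower 32 bits, so the three macros above round-trip; a minimal sketch (the function name is illustrative):

static void
mechid_roundtrip_sketch(void)
{
	crypto_mech_type_t mt = KCF_MECHID(KCF_CIPHER_CLASS, 5);

	ASSERT(KCF_MECH2CLASS(mt) == KCF_CIPHER_CLASS);
	ASSERT(KCF_MECH2INDEX(mt) == 5);
}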
+
+#define KCF_TO_PROV_MECH_INDX(pd, mech_type) \
+ ((pd)->pd_mech_indx[KCF_MECH2CLASS(mech_type)] \
+ [KCF_MECH2INDEX(mech_type)])
+
+#define KCF_TO_PROV_MECHINFO(pd, mech_type) \
+ ((pd)->pd_mechanisms[KCF_TO_PROV_MECH_INDX(pd, mech_type)])
+
+#define KCF_TO_PROV_MECHNUM(pd, mech_type) \
+ (KCF_TO_PROV_MECHINFO(pd, mech_type).cm_mech_number)
+
+#define KCF_CAN_SHARE_OPSTATE(pd, mech_type) \
+ ((KCF_TO_PROV_MECHINFO(pd, mech_type).cm_mech_flags) & \
+ CRYPTO_CAN_SHARE_OPSTATE)
+
+/* ps_refcnt is protected by cm_lock in the crypto_minor structure */
+typedef struct crypto_provider_session {
+ struct crypto_provider_session *ps_next;
+ crypto_session_id_t ps_session;
+ kcf_provider_desc_t *ps_provider;
+ kcf_provider_desc_t *ps_real_provider;
+ uint_t ps_refcnt;
+} crypto_provider_session_t;
+
+typedef struct crypto_session_data {
+ kmutex_t sd_lock;
+ kcondvar_t sd_cv;
+ uint32_t sd_flags;
+ int sd_pre_approved_amount;
+ crypto_ctx_t *sd_digest_ctx;
+ crypto_ctx_t *sd_encr_ctx;
+ crypto_ctx_t *sd_decr_ctx;
+ crypto_ctx_t *sd_sign_ctx;
+ crypto_ctx_t *sd_verify_ctx;
+ crypto_ctx_t *sd_sign_recover_ctx;
+ crypto_ctx_t *sd_verify_recover_ctx;
+ kcf_provider_desc_t *sd_provider;
+ void *sd_find_init_cookie;
+ crypto_provider_session_t *sd_provider_session;
+} crypto_session_data_t;
+
+#define CRYPTO_SESSION_IN_USE 0x00000001
+#define CRYPTO_SESSION_IS_BUSY 0x00000002
+#define CRYPTO_SESSION_IS_CLOSED 0x00000004
+
+#define KCF_MAX_PIN_LEN 1024
+
+/*
+ * Per-minor info.
+ *
+ * cm_lock protects everything in this structure except for cm_refcnt.
+ */
+typedef struct crypto_minor {
+ uint_t cm_refcnt;
+ kmutex_t cm_lock;
+ kcondvar_t cm_cv;
+ crypto_session_data_t **cm_session_table;
+ uint_t cm_session_table_count;
+ kcf_provider_desc_t **cm_provider_array;
+ uint_t cm_provider_count;
+ crypto_provider_session_t *cm_provider_session;
+} crypto_minor_t;
+
+/*
+ * Return codes for internal functions
+ */
+#define KCF_SUCCESS 0x0 /* Successful call */
+#define KCF_INVALID_MECH_NUMBER 0x1 /* invalid mechanism number */
+#define KCF_INVALID_MECH_NAME 0x2 /* invalid mechanism name */
+#define KCF_INVALID_MECH_CLASS 0x3 /* invalid mechanism class */
+#define KCF_MECH_TAB_FULL 0x4 /* Need more room in the mech tabs. */
+#define KCF_INVALID_INDX ((ushort_t)-1)
+
+/*
+ * kCF internal mechanism and function group for tracking RNG providers.
+ */
+#define SUN_RANDOM "random"
+#define CRYPTO_FG_RANDOM 0x80000000 /* generate_random() */
+
+/*
+ * Wrappers for ops vectors. In the wrapper definitions below, the pd
+ * argument always corresponds to a pointer to a provider descriptor
+ * of type kcf_prov_desc_t.
+ */
+
+#define KCF_PROV_CONTROL_OPS(pd) ((pd)->pd_ops_vector->co_control_ops)
+#define KCF_PROV_CTX_OPS(pd) ((pd)->pd_ops_vector->co_ctx_ops)
+#define KCF_PROV_DIGEST_OPS(pd) ((pd)->pd_ops_vector->co_digest_ops)
+#define KCF_PROV_CIPHER_OPS(pd) ((pd)->pd_ops_vector->co_cipher_ops)
+#define KCF_PROV_MAC_OPS(pd) ((pd)->pd_ops_vector->co_mac_ops)
+#define KCF_PROV_SIGN_OPS(pd) ((pd)->pd_ops_vector->co_sign_ops)
+#define KCF_PROV_VERIFY_OPS(pd) ((pd)->pd_ops_vector->co_verify_ops)
+#define KCF_PROV_DUAL_OPS(pd) ((pd)->pd_ops_vector->co_dual_ops)
+#define KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) \
+ ((pd)->pd_ops_vector->co_dual_cipher_mac_ops)
+#define KCF_PROV_RANDOM_OPS(pd) ((pd)->pd_ops_vector->co_random_ops)
+#define KCF_PROV_SESSION_OPS(pd) ((pd)->pd_ops_vector->co_session_ops)
+#define KCF_PROV_OBJECT_OPS(pd) ((pd)->pd_ops_vector->co_object_ops)
+#define KCF_PROV_KEY_OPS(pd) ((pd)->pd_ops_vector->co_key_ops)
+#define KCF_PROV_PROVIDER_OPS(pd) ((pd)->pd_ops_vector->co_provider_ops)
+#define KCF_PROV_MECH_OPS(pd) ((pd)->pd_ops_vector->co_mech_ops)
+#define KCF_PROV_NOSTORE_KEY_OPS(pd) \
+ ((pd)->pd_ops_vector->co_nostore_key_ops)
+
+/*
+ * Wrappers for crypto_control_ops(9S) entry points.
+ */
+
+#define KCF_PROV_STATUS(pd, status) ( \
+ (KCF_PROV_CONTROL_OPS(pd) && \
+ KCF_PROV_CONTROL_OPS(pd)->provider_status) ? \
+ KCF_PROV_CONTROL_OPS(pd)->provider_status( \
+ (pd)->pd_prov_handle, status) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_ctx_ops(9S) entry points.
+ */
+
+#define KCF_PROV_CREATE_CTX_TEMPLATE(pd, mech, key, template, size, req) ( \
+ (KCF_PROV_CTX_OPS(pd) && KCF_PROV_CTX_OPS(pd)->create_ctx_template) ? \
+ KCF_PROV_CTX_OPS(pd)->create_ctx_template( \
+ (pd)->pd_prov_handle, mech, key, template, size, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_FREE_CONTEXT(pd, ctx) ( \
+ (KCF_PROV_CTX_OPS(pd) && KCF_PROV_CTX_OPS(pd)->free_context) ? \
+ KCF_PROV_CTX_OPS(pd)->free_context(ctx) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_COPYIN_MECH(pd, umech, kmech, errorp, mode) ( \
+ (KCF_PROV_MECH_OPS(pd) && KCF_PROV_MECH_OPS(pd)->copyin_mechanism) ? \
+ KCF_PROV_MECH_OPS(pd)->copyin_mechanism( \
+ (pd)->pd_prov_handle, umech, kmech, errorp, mode) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_COPYOUT_MECH(pd, kmech, umech, errorp, mode) ( \
+ (KCF_PROV_MECH_OPS(pd) && KCF_PROV_MECH_OPS(pd)->copyout_mechanism) ? \
+ KCF_PROV_MECH_OPS(pd)->copyout_mechanism( \
+ (pd)->pd_prov_handle, kmech, umech, errorp, mode) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_FREE_MECH(pd, prov_mech) ( \
+ (KCF_PROV_MECH_OPS(pd) && KCF_PROV_MECH_OPS(pd)->free_mechanism) ? \
+ KCF_PROV_MECH_OPS(pd)->free_mechanism( \
+ (pd)->pd_prov_handle, prov_mech) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_digest_ops(9S) entry points.
+ */
+
+#define KCF_PROV_DIGEST_INIT(pd, ctx, mech, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_init) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest_init(ctx, mech, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * The _ (underscore) in _digest is needed to avoid replacing the
+ * function digest().
+ */
+#define KCF_PROV_DIGEST(pd, ctx, data, _digest, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest(ctx, data, _digest, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DIGEST_UPDATE(pd, ctx, data, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_update) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest_update(ctx, data, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DIGEST_KEY(pd, ctx, key, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_key) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest_key(ctx, key, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DIGEST_FINAL(pd, ctx, digest, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_final) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest_final(ctx, digest, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DIGEST_ATOMIC(pd, session, mech, data, digest, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_atomic) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest_atomic( \
+ (pd)->pd_prov_handle, session, mech, data, digest, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
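
A sketch of how framework code typically invokes one of the digest wrappers above, assuming the crypto_mechanism_t and crypto_req_handle_t types from sys/crypto/common.h; a provider without the ops vector or entry point simply yields CRYPTO_NOT_SUPPORTED. The function name is illustrative, not an existing framework routine.

static int
digest_init_dispatch_sketch(kcf_provider_desc_t *pd, crypto_ctx_t *ctx,
    crypto_mechanism_t *mech, crypto_req_handle_t req)
{
	int rv = KCF_PROV_DIGEST_INIT(pd, ctx, mech, req);

	KCF_PROV_INCRSTATS(pd, rv);	/* bump ks_ndispatches and friends */
	return (rv);			/* CRYPTO_NOT_SUPPORTED if the */
					/* provider lacks digest_init */
}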
+/*
+ * Wrappers for crypto_cipher_ops(9S) entry points.
+ */
+
+#define KCF_PROV_ENCRYPT_INIT(pd, ctx, mech, key, template, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_init) ? \
+ KCF_PROV_CIPHER_OPS(pd)->encrypt_init(ctx, mech, key, template, \
+ req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT(pd, ctx, plaintext, ciphertext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt) ? \
+ KCF_PROV_CIPHER_OPS(pd)->encrypt(ctx, plaintext, ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_UPDATE(pd, ctx, plaintext, ciphertext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_update) ? \
+ KCF_PROV_CIPHER_OPS(pd)->encrypt_update(ctx, plaintext, \
+ ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_FINAL(pd, ctx, ciphertext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_final) ? \
+ KCF_PROV_CIPHER_OPS(pd)->encrypt_final(ctx, ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_ATOMIC(pd, session, mech, key, plaintext, ciphertext, \
+ template, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_atomic) ? \
+ KCF_PROV_CIPHER_OPS(pd)->encrypt_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, plaintext, ciphertext, \
+ template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_INIT(pd, ctx, mech, key, template, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_init) ? \
+ KCF_PROV_CIPHER_OPS(pd)->decrypt_init(ctx, mech, key, template, \
+ req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT(pd, ctx, ciphertext, plaintext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt) ? \
+ KCF_PROV_CIPHER_OPS(pd)->decrypt(ctx, ciphertext, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_UPDATE(pd, ctx, ciphertext, plaintext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_update) ? \
+ KCF_PROV_CIPHER_OPS(pd)->decrypt_update(ctx, ciphertext, \
+ plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_FINAL(pd, ctx, plaintext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_final) ? \
+ KCF_PROV_CIPHER_OPS(pd)->decrypt_final(ctx, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_ATOMIC(pd, session, mech, key, ciphertext, plaintext, \
+ template, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_atomic) ? \
+ KCF_PROV_CIPHER_OPS(pd)->decrypt_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, ciphertext, plaintext, \
+ template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_mac_ops(9S) entry points.
+ */
+
+#define KCF_PROV_MAC_INIT(pd, ctx, mech, key, template, req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_init) ? \
+ KCF_PROV_MAC_OPS(pd)->mac_init(ctx, mech, key, template, req) \
+ : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * The _ (underscore) in _mac is needed to avoid replacing the
+ * function mac().
+ */
+#define KCF_PROV_MAC(pd, ctx, data, _mac, req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac) ? \
+ KCF_PROV_MAC_OPS(pd)->mac(ctx, data, _mac, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_UPDATE(pd, ctx, data, req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_update) ? \
+ KCF_PROV_MAC_OPS(pd)->mac_update(ctx, data, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_FINAL(pd, ctx, mac, req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_final) ? \
+ KCF_PROV_MAC_OPS(pd)->mac_final(ctx, mac, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_ATOMIC(pd, session, mech, key, data, mac, template, \
+ req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_atomic) ? \
+ KCF_PROV_MAC_OPS(pd)->mac_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, data, mac, template, \
+ req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_VERIFY_ATOMIC(pd, session, mech, key, data, mac, \
+ template, req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_verify_atomic) ? \
+ KCF_PROV_MAC_OPS(pd)->mac_verify_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, data, mac, template, \
+ req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_sign_ops(9S) entry points.
+ */
+
+#define KCF_PROV_SIGN_INIT(pd, ctx, mech, key, template, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_init) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_init( \
+ ctx, mech, key, template, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN(pd, ctx, data, sig, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign(ctx, data, sig, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_UPDATE(pd, ctx, data, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_update) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_update(ctx, data, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_FINAL(pd, ctx, sig, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_final) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_final(ctx, sig, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_ATOMIC(pd, session, mech, key, data, template, \
+ sig, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_atomic) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, data, sig, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_RECOVER_INIT(pd, ctx, mech, key, template, \
+ req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_recover_init) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_recover_init(ctx, mech, key, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_RECOVER(pd, ctx, data, sig, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_recover) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_recover(ctx, data, sig, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_RECOVER_ATOMIC(pd, session, mech, key, data, template, \
+ sig, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && \
+ KCF_PROV_SIGN_OPS(pd)->sign_recover_atomic) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_recover_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, data, sig, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_verify_ops(9S) entry points.
+ */
+
+#define KCF_PROV_VERIFY_INIT(pd, ctx, mech, key, template, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_init) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_init(ctx, mech, key, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_VERIFY(pd, ctx, data, sig, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->do_verify) ? \
+ KCF_PROV_VERIFY_OPS(pd)->do_verify(ctx, data, sig, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_VERIFY_UPDATE(pd, ctx, data, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_update) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_update(ctx, data, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_VERIFY_FINAL(pd, ctx, sig, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_final) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_final(ctx, sig, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_VERIFY_ATOMIC(pd, session, mech, key, data, template, sig, \
+ req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_atomic) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, data, sig, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_VERIFY_RECOVER_INIT(pd, ctx, mech, key, template, \
+ req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && \
+ KCF_PROV_VERIFY_OPS(pd)->verify_recover_init) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_recover_init(ctx, mech, key, \
+ template, req) : CRYPTO_NOT_SUPPORTED)
+
+/* verify_recover() CSPI routine has different argument order than verify() */
+#define KCF_PROV_VERIFY_RECOVER(pd, ctx, sig, data, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_recover) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_recover(ctx, sig, data, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * verify_recover_atomic() CSPI routine has different argument order
+ * than verify_atomic().
+ */
+#define KCF_PROV_VERIFY_RECOVER_ATOMIC(pd, session, mech, key, sig, \
+ template, data, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && \
+ KCF_PROV_VERIFY_OPS(pd)->verify_recover_atomic) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_recover_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, sig, data, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_dual_ops(9S) entry points.
+ */
+
+#define KCF_PROV_DIGEST_ENCRYPT_UPDATE(digest_ctx, encrypt_ctx, plaintext, \
+ ciphertext, req) ( \
+ (KCF_PROV_DUAL_OPS(pd) && \
+ KCF_PROV_DUAL_OPS(pd)->digest_encrypt_update) ? \
+ KCF_PROV_DUAL_OPS(pd)->digest_encrypt_update( \
+ digest_ctx, encrypt_ctx, plaintext, ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_DIGEST_UPDATE(decrypt_ctx, digest_ctx, ciphertext, \
+ plaintext, req) ( \
+ (KCF_PROV_DUAL_OPS(pd) && \
+ KCF_PROV_DUAL_OPS(pd)->decrypt_digest_update) ? \
+ KCF_PROV_DUAL_OPS(pd)->decrypt_digest_update( \
+ decrypt_ctx, digest_ctx, ciphertext, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_ENCRYPT_UPDATE(sign_ctx, encrypt_ctx, plaintext, \
+ ciphertext, req) ( \
+ (KCF_PROV_DUAL_OPS(pd) && \
+ KCF_PROV_DUAL_OPS(pd)->sign_encrypt_update) ? \
+ KCF_PROV_DUAL_OPS(pd)->sign_encrypt_update( \
+ sign_ctx, encrypt_ctx, plaintext, ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_VERIFY_UPDATE(decrypt_ctx, verify_ctx, ciphertext, \
+ plaintext, req) ( \
+ (KCF_PROV_DUAL_OPS(pd) && \
+ KCF_PROV_DUAL_OPS(pd)->decrypt_verify_update) ? \
+ KCF_PROV_DUAL_OPS(pd)->decrypt_verify_update( \
+ decrypt_ctx, verify_ctx, ciphertext, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_dual_cipher_mac_ops(9S) entry points.
+ */
+
+#define KCF_PROV_ENCRYPT_MAC_INIT(pd, ctx, encr_mech, encr_key, mac_mech, \
+ mac_key, encr_ctx_template, mac_ctx_template, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_init) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_init( \
+ ctx, encr_mech, encr_key, mac_mech, mac_key, encr_ctx_template, \
+ mac_ctx_template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_MAC(pd, ctx, plaintext, ciphertext, mac, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac( \
+ ctx, plaintext, ciphertext, mac, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_MAC_UPDATE(pd, ctx, plaintext, ciphertext, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_update) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_update( \
+ ctx, plaintext, ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_MAC_FINAL(pd, ctx, ciphertext, mac, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_final) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_final( \
+ ctx, ciphertext, mac, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_MAC_ATOMIC(pd, session, encr_mech, encr_key, \
+ mac_mech, mac_key, plaintext, ciphertext, mac, \
+ encr_ctx_template, mac_ctx_template, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_atomic) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_atomic( \
+ (pd)->pd_prov_handle, session, encr_mech, encr_key, \
+ mac_mech, mac_key, plaintext, ciphertext, mac, \
+ encr_ctx_template, mac_ctx_template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_DECRYPT_INIT(pd, ctx, mac_mech, mac_key, decr_mech, \
+ decr_key, mac_ctx_template, decr_ctx_template, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_init) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_init( \
+ ctx, mac_mech, mac_key, decr_mech, decr_key, mac_ctx_template, \
+ decr_ctx_template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_DECRYPT(pd, ctx, ciphertext, mac, plaintext, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt( \
+ ctx, ciphertext, mac, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_DECRYPT_UPDATE(pd, ctx, ciphertext, plaintext, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_update) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_update( \
+ ctx, ciphertext, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_DECRYPT_FINAL(pd, ctx, mac, plaintext, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_final) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_final( \
+ ctx, mac, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_DECRYPT_ATOMIC(pd, session, mac_mech, mac_key, \
+ decr_mech, decr_key, ciphertext, mac, plaintext, \
+ mac_ctx_template, decr_ctx_template, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_atomic) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_atomic( \
+ (pd)->pd_prov_handle, session, mac_mech, mac_key, \
+ decr_mech, decr_key, ciphertext, mac, plaintext, \
+ mac_ctx_template, decr_ctx_template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_VERIFY_DECRYPT_ATOMIC(pd, session, mac_mech, mac_key, \
+ decr_mech, decr_key, ciphertext, mac, plaintext, \
+ mac_ctx_template, decr_ctx_template, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_verify_decrypt_atomic \
+ != NULL) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_verify_decrypt_atomic( \
+ (pd)->pd_prov_handle, session, mac_mech, mac_key, \
+ decr_mech, decr_key, ciphertext, mac, plaintext, \
+ mac_ctx_template, decr_ctx_template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_random_number_ops(9S) entry points.
+ */
+
+#define KCF_PROV_SEED_RANDOM(pd, session, buf, len, est, flags, req) ( \
+ (KCF_PROV_RANDOM_OPS(pd) && KCF_PROV_RANDOM_OPS(pd)->seed_random) ? \
+ KCF_PROV_RANDOM_OPS(pd)->seed_random((pd)->pd_prov_handle, \
+ session, buf, len, est, flags, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_GENERATE_RANDOM(pd, session, buf, len, req) ( \
+ (KCF_PROV_RANDOM_OPS(pd) && \
+ KCF_PROV_RANDOM_OPS(pd)->generate_random) ? \
+ KCF_PROV_RANDOM_OPS(pd)->generate_random((pd)->pd_prov_handle, \
+ session, buf, len, req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_session_ops(9S) entry points.
+ *
+ * ops_pd is the provider descriptor that supplies the ops_vector.
+ * pd is the descriptor that supplies the provider handle.
+ * Only session open/close needs two handles.
+ */
+
+#define KCF_PROV_SESSION_OPEN(ops_pd, session, req, pd) ( \
+ (KCF_PROV_SESSION_OPS(ops_pd) && \
+ KCF_PROV_SESSION_OPS(ops_pd)->session_open) ? \
+ KCF_PROV_SESSION_OPS(ops_pd)->session_open((pd)->pd_prov_handle, \
+ session, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SESSION_CLOSE(ops_pd, session, req, pd) ( \
+ (KCF_PROV_SESSION_OPS(ops_pd) && \
+ KCF_PROV_SESSION_OPS(ops_pd)->session_close) ? \
+ KCF_PROV_SESSION_OPS(ops_pd)->session_close((pd)->pd_prov_handle, \
+ session, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SESSION_LOGIN(pd, session, user_type, pin, len, req) ( \
+ (KCF_PROV_SESSION_OPS(pd) && \
+ KCF_PROV_SESSION_OPS(pd)->session_login) ? \
+ KCF_PROV_SESSION_OPS(pd)->session_login((pd)->pd_prov_handle, \
+ session, user_type, pin, len, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SESSION_LOGOUT(pd, session, req) ( \
+ (KCF_PROV_SESSION_OPS(pd) && \
+ KCF_PROV_SESSION_OPS(pd)->session_logout) ? \
+ KCF_PROV_SESSION_OPS(pd)->session_logout((pd)->pd_prov_handle, \
+ session, req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_object_ops(9S) entry points.
+ */
+
+#define KCF_PROV_OBJECT_CREATE(pd, session, template, count, object, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_create) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_create((pd)->pd_prov_handle, \
+ session, template, count, object, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_COPY(pd, session, object, template, count, \
+ new_object, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_copy) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_copy((pd)->pd_prov_handle, \
+ session, object, template, count, new_object, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_DESTROY(pd, session, object, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_destroy) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_destroy((pd)->pd_prov_handle, \
+ session, object, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_GET_SIZE(pd, session, object, size, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && \
+ KCF_PROV_OBJECT_OPS(pd)->object_get_size) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_get_size((pd)->pd_prov_handle, \
+ session, object, size, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_GET_ATTRIBUTE_VALUE(pd, session, object, template, \
+ count, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && \
+ KCF_PROV_OBJECT_OPS(pd)->object_get_attribute_value) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_get_attribute_value( \
+ (pd)->pd_prov_handle, session, object, template, count, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_SET_ATTRIBUTE_VALUE(pd, session, object, template, \
+ count, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && \
+ KCF_PROV_OBJECT_OPS(pd)->object_set_attribute_value) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_set_attribute_value( \
+ (pd)->pd_prov_handle, session, object, template, count, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_FIND_INIT(pd, session, template, count, ppriv, \
+ req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && \
+ KCF_PROV_OBJECT_OPS(pd)->object_find_init) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_find_init((pd)->pd_prov_handle, \
+ session, template, count, ppriv, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_FIND(pd, ppriv, objects, max_objects, object_count, \
+ req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_find) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_find( \
+ (pd)->pd_prov_handle, ppriv, objects, max_objects, object_count, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_FIND_FINAL(pd, ppriv, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && \
+ KCF_PROV_OBJECT_OPS(pd)->object_find_final) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_find_final( \
+ (pd)->pd_prov_handle, ppriv, req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_key_ops(9S) entry points.
+ */
+
+#define KCF_PROV_KEY_GENERATE(pd, session, mech, template, count, object, \
+ req) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_generate) ? \
+ KCF_PROV_KEY_OPS(pd)->key_generate((pd)->pd_prov_handle, \
+ session, mech, template, count, object, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_KEY_GENERATE_PAIR(pd, session, mech, pub_template, \
+ pub_count, priv_template, priv_count, pub_key, priv_key, req) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_generate_pair) ? \
+ KCF_PROV_KEY_OPS(pd)->key_generate_pair((pd)->pd_prov_handle, \
+ session, mech, pub_template, pub_count, priv_template, \
+ priv_count, pub_key, priv_key, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_KEY_WRAP(pd, session, mech, wrapping_key, key, wrapped_key, \
+ wrapped_key_len, req) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_wrap) ? \
+ KCF_PROV_KEY_OPS(pd)->key_wrap((pd)->pd_prov_handle, \
+ session, mech, wrapping_key, key, wrapped_key, wrapped_key_len, \
+ req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_KEY_UNWRAP(pd, session, mech, unwrapping_key, wrapped_key, \
+ wrapped_key_len, template, count, key, req) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_unwrap) ? \
+ KCF_PROV_KEY_OPS(pd)->key_unwrap((pd)->pd_prov_handle, \
+ session, mech, unwrapping_key, wrapped_key, wrapped_key_len, \
+ template, count, key, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_KEY_DERIVE(pd, session, mech, base_key, template, count, \
+ key, req) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_derive) ? \
+ KCF_PROV_KEY_OPS(pd)->key_derive((pd)->pd_prov_handle, \
+ session, mech, base_key, template, count, key, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_KEY_CHECK(pd, mech, key) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_check) ? \
+ KCF_PROV_KEY_OPS(pd)->key_check((pd)->pd_prov_handle, mech, key) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_provider_management_ops(9S) entry points.
+ *
+ * ops_pd is the provider descriptor that supplies the ops_vector.
+ * pd is the descriptor that supplies the provider handle.
+ * Only ext_info needs two handles.
+ */
+
+#define KCF_PROV_EXT_INFO(ops_pd, provext_info, req, pd) ( \
+ (KCF_PROV_PROVIDER_OPS(ops_pd) && \
+ KCF_PROV_PROVIDER_OPS(ops_pd)->ext_info) ? \
+ KCF_PROV_PROVIDER_OPS(ops_pd)->ext_info((pd)->pd_prov_handle, \
+ provext_info, req) : CRYPTO_NOT_SUPPORTED)
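A minimal illustrative sketch of the two-descriptor pattern described in the comment above, assuming hypothetical provider descriptors ops_pd (supplying the ops vector) and real_pd (supplying the provider handle); crypto_provider_ext_info_t is assumed to be the SPI structure that ext_info fills in.

        crypto_provider_ext_info_t ext;  /* SPI ext-info structure (assumed, spi.h) */
        int rv;

        rv = KCF_PROV_EXT_INFO(ops_pd, &ext, NULL, real_pd);
        if (rv == CRYPTO_NOT_SUPPORTED) {
                /* ops_pd exposes no provider_ops or no ext_info entry point */
        }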
+
+#define KCF_PROV_INIT_TOKEN(pd, pin, pin_len, label, req) ( \
+ (KCF_PROV_PROVIDER_OPS(pd) && KCF_PROV_PROVIDER_OPS(pd)->init_token) ? \
+ KCF_PROV_PROVIDER_OPS(pd)->init_token((pd)->pd_prov_handle, \
+ pin, pin_len, label, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_INIT_PIN(pd, session, pin, pin_len, req) ( \
+ (KCF_PROV_PROVIDER_OPS(pd) && KCF_PROV_PROVIDER_OPS(pd)->init_pin) ? \
+ KCF_PROV_PROVIDER_OPS(pd)->init_pin((pd)->pd_prov_handle, \
+ session, pin, pin_len, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SET_PIN(pd, session, old_pin, old_len, new_pin, new_len, \
+ req) ( \
+ (KCF_PROV_PROVIDER_OPS(pd) && KCF_PROV_PROVIDER_OPS(pd)->set_pin) ? \
+ KCF_PROV_PROVIDER_OPS(pd)->set_pin((pd)->pd_prov_handle, \
+ session, old_pin, old_len, new_pin, new_len, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_nostore_key_ops(9S) entry points.
+ */
+
+#define KCF_PROV_NOSTORE_KEY_GENERATE(pd, session, mech, template, count, \
+ out_template, out_count, req) ( \
+ (KCF_PROV_NOSTORE_KEY_OPS(pd) && \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate) ? \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate( \
+ (pd)->pd_prov_handle, session, mech, template, count, \
+ out_template, out_count, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_NOSTORE_KEY_GENERATE_PAIR(pd, session, mech, pub_template, \
+ pub_count, priv_template, priv_count, out_pub_template, \
+ out_pub_count, out_priv_template, out_priv_count, req) ( \
+ (KCF_PROV_NOSTORE_KEY_OPS(pd) && \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate_pair) ? \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate_pair( \
+ (pd)->pd_prov_handle, session, mech, pub_template, pub_count, \
+ priv_template, priv_count, out_pub_template, out_pub_count, \
+ out_priv_template, out_priv_count, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_NOSTORE_KEY_DERIVE(pd, session, mech, base_key, template, \
+ count, out_template, out_count, req) ( \
+ (KCF_PROV_NOSTORE_KEY_OPS(pd) && \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_derive) ? \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_derive( \
+ (pd)->pd_prov_handle, session, mech, base_key, template, count, \
+ out_template, out_count, req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * The following routines are exported by the kcf module (/kernel/misc/kcf)
+ * to the crypto and cryptoadmin modules.
+ */
+
+/* Single-part digest/mac/cipher entry points that take a crypto context */
+extern int crypto_digest_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+extern int crypto_mac_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+extern int crypto_encrypt_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+extern int crypto_decrypt_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
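A hedged sketch of driving one of the single-part entry points declared above: the context is assumed to come from crypto_digest_init() (declared in sys/crypto/api.h, not here), and the raw cd_raw/iovec layout of crypto_data_t is assumed from sys/crypto/common.h.

        static int
        digest_one_buffer(crypto_context_t ctx, void *buf, size_t len,
            uchar_t *md, size_t mdlen)
        {
                crypto_data_t in, out;

                in.cd_format = CRYPTO_DATA_RAW;
                in.cd_offset = 0;
                in.cd_length = len;
                in.cd_raw.iov_base = (char *)buf;
                in.cd_raw.iov_len = len;

                out.cd_format = CRYPTO_DATA_RAW;
                out.cd_offset = 0;
                out.cd_length = mdlen;
                out.cd_raw.iov_base = (char *)md;
                out.cd_raw.iov_len = mdlen;

                /*
                 * NULL call_req requests synchronous completion; the
                 * single-part call is assumed to finish the context.
                 */
                return (crypto_digest_single(ctx, &in, &out, NULL));
        }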
+
+
+/* Other private digest/mac/cipher entry points not exported through k-API */
+extern int crypto_digest_key_prov(crypto_context_t, crypto_key_t *,
+ crypto_call_req_t *);
+
+/* Private sign entry points exported by KCF */
+extern int crypto_sign_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+extern int crypto_sign_recover_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+/* Private verify entry points exported by KCF */
+extern int crypto_verify_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+extern int crypto_verify_recover_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+/* Private dual operations entry points exported by KCF */
+extern int crypto_digest_encrypt_update(crypto_context_t, crypto_context_t,
+ crypto_data_t *, crypto_data_t *, crypto_call_req_t *);
+extern int crypto_decrypt_digest_update(crypto_context_t, crypto_context_t,
+ crypto_data_t *, crypto_data_t *, crypto_call_req_t *);
+extern int crypto_sign_encrypt_update(crypto_context_t, crypto_context_t,
+ crypto_data_t *, crypto_data_t *, crypto_call_req_t *);
+extern int crypto_decrypt_verify_update(crypto_context_t, crypto_context_t,
+ crypto_data_t *, crypto_data_t *, crypto_call_req_t *);
+
+/* Random Number Generation */
+int crypto_seed_random(crypto_provider_handle_t provider, uchar_t *buf,
+ size_t len, crypto_call_req_t *req);
+int crypto_generate_random(crypto_provider_handle_t provider, uchar_t *buf,
+ size_t len, crypto_call_req_t *req);
+
+/* Provider Management */
+int crypto_get_provider_info(crypto_provider_id_t id,
+ crypto_provider_info_t **info, crypto_call_req_t *req);
+int crypto_get_provider_mechanisms(crypto_minor_t *, crypto_provider_id_t id,
+ uint_t *count, crypto_mech_name_t **list);
+int crypto_init_token(crypto_provider_handle_t provider, char *pin,
+ size_t pin_len, char *label, crypto_call_req_t *);
+int crypto_init_pin(crypto_provider_handle_t provider, char *pin,
+ size_t pin_len, crypto_call_req_t *req);
+int crypto_set_pin(crypto_provider_handle_t provider, char *old_pin,
+ size_t old_len, char *new_pin, size_t new_len, crypto_call_req_t *req);
+void crypto_free_provider_list(crypto_provider_entry_t *list, uint_t count);
+void crypto_free_provider_info(crypto_provider_info_t *info);
+
+/* Administrative */
+int crypto_get_dev_list(uint_t *count, crypto_dev_list_entry_t **list);
+int crypto_get_soft_list(uint_t *count, char **list, size_t *len);
+int crypto_get_dev_info(char *name, uint_t instance, uint_t *count,
+ crypto_mech_name_t **list);
+int crypto_get_soft_info(caddr_t name, uint_t *count,
+ crypto_mech_name_t **list);
+int crypto_load_dev_disabled(char *name, uint_t instance, uint_t count,
+ crypto_mech_name_t *list);
+int crypto_load_soft_disabled(caddr_t name, uint_t count,
+ crypto_mech_name_t *list);
+int crypto_unload_soft_module(caddr_t path);
+int crypto_load_soft_config(caddr_t name, uint_t count,
+ crypto_mech_name_t *list);
+int crypto_load_door(uint_t did);
+void crypto_free_mech_list(crypto_mech_name_t *list, uint_t count);
+void crypto_free_dev_list(crypto_dev_list_entry_t *list, uint_t count);
+
+/* Miscellaneous */
+int crypto_get_mechanism_number(caddr_t name, crypto_mech_type_t *number);
+int crypto_get_function_list(crypto_provider_id_t id,
+ crypto_function_list_t **list, int kmflag);
+void crypto_free_function_list(crypto_function_list_t *list);
+int crypto_build_permitted_mech_names(kcf_provider_desc_t *,
+ crypto_mech_name_t **, uint_t *, int);
+extern void kcf_destroy_mech_tabs(void);
+extern void kcf_init_mech_tabs(void);
+extern int kcf_add_mech_provider(short, kcf_provider_desc_t *,
+ kcf_prov_mech_desc_t **);
+extern void kcf_remove_mech_provider(char *, kcf_provider_desc_t *);
+extern int kcf_get_mech_entry(crypto_mech_type_t, kcf_mech_entry_t **);
+extern kcf_provider_desc_t *kcf_alloc_provider_desc(crypto_provider_info_t *);
+extern void kcf_provider_zero_refcnt(kcf_provider_desc_t *);
+extern void kcf_free_provider_desc(kcf_provider_desc_t *);
+extern void kcf_soft_config_init(void);
+extern int get_sw_provider_for_mech(crypto_mech_name_t, char **);
+extern crypto_mech_type_t crypto_mech2id_common(char *, boolean_t);
+extern void undo_register_provider(kcf_provider_desc_t *, boolean_t);
+extern void redo_register_provider(kcf_provider_desc_t *);
+extern void kcf_rnd_init(void);
+extern boolean_t kcf_rngprov_check(void);
+extern int kcf_rnd_get_pseudo_bytes(uint8_t *, size_t);
+extern int kcf_rnd_get_bytes(uint8_t *, size_t, boolean_t, boolean_t);
+extern int random_add_pseudo_entropy(uint8_t *, size_t, uint_t);
+extern void kcf_rnd_schedule_timeout(boolean_t);
+extern int crypto_uio_data(crypto_data_t *, uchar_t *, int, cmd_type_t,
+ void *, void (*update)(void));
+extern int crypto_mblk_data(crypto_data_t *, uchar_t *, int, cmd_type_t,
+ void *, void (*update)(void));
+extern int crypto_put_output_data(uchar_t *, crypto_data_t *, int);
+extern int crypto_get_input_data(crypto_data_t *, uchar_t **, uchar_t *);
+extern int crypto_copy_key_to_ctx(crypto_key_t *, crypto_key_t **, size_t *,
+ int kmflag);
+extern int crypto_digest_data(crypto_data_t *, void *, uchar_t *,
+ void (*update)(void), void (*final)(void), uchar_t);
+extern int crypto_update_iov(void *, crypto_data_t *, crypto_data_t *,
+ int (*cipher)(void *, caddr_t, size_t, crypto_data_t *),
+ void (*copy_block)(uint8_t *, uint64_t *));
+extern int crypto_update_uio(void *, crypto_data_t *, crypto_data_t *,
+ int (*cipher)(void *, caddr_t, size_t, crypto_data_t *),
+ void (*copy_block)(uint8_t *, uint64_t *));
+extern int crypto_update_mp(void *, crypto_data_t *, crypto_data_t *,
+ int (*cipher)(void *, caddr_t, size_t, crypto_data_t *),
+ void (*copy_block)(uint8_t *, uint64_t *));
+extern int crypto_get_key_attr(crypto_key_t *, crypto_attr_type_t, uchar_t **,
+ ssize_t *);
+
+/* Access to the provider's table */
+extern void kcf_prov_tab_destroy(void);
+extern void kcf_prov_tab_init(void);
+extern int kcf_prov_tab_add_provider(kcf_provider_desc_t *);
+extern int kcf_prov_tab_rem_provider(crypto_provider_id_t);
+extern kcf_provider_desc_t *kcf_prov_tab_lookup_by_name(char *);
+extern kcf_provider_desc_t *kcf_prov_tab_lookup_by_dev(char *, uint_t);
+extern int kcf_get_hw_prov_tab(uint_t *, kcf_provider_desc_t ***, int,
+ char *, uint_t, boolean_t);
+extern int kcf_get_slot_list(uint_t *, kcf_provider_desc_t ***, boolean_t);
+extern void kcf_free_provider_tab(uint_t, kcf_provider_desc_t **);
+extern kcf_provider_desc_t *kcf_prov_tab_lookup(crypto_provider_id_t);
+extern int kcf_get_sw_prov(crypto_mech_type_t, kcf_provider_desc_t **,
+ kcf_mech_entry_t **, boolean_t);
+
+/* Access to the policy table */
+extern boolean_t is_mech_disabled(kcf_provider_desc_t *, crypto_mech_name_t);
+extern boolean_t is_mech_disabled_byname(crypto_provider_type_t, char *,
+ uint_t, crypto_mech_name_t);
+extern void kcf_policy_tab_init(void);
+extern void kcf_policy_free_desc(kcf_policy_desc_t *);
+extern void kcf_policy_remove_by_name(char *, uint_t *, crypto_mech_name_t **);
+extern void kcf_policy_remove_by_dev(char *, uint_t, uint_t *,
+ crypto_mech_name_t **);
+extern kcf_policy_desc_t *kcf_policy_lookup_by_name(char *);
+extern kcf_policy_desc_t *kcf_policy_lookup_by_dev(char *, uint_t);
+extern int kcf_policy_load_soft_disabled(char *, uint_t, crypto_mech_name_t *,
+ uint_t *, crypto_mech_name_t **);
+extern int kcf_policy_load_dev_disabled(char *, uint_t, uint_t,
+ crypto_mech_name_t *, uint_t *, crypto_mech_name_t **);
+extern boolean_t in_soft_config_list(char *);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctl.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctl.h
new file mode 100644
index 000000000000..6e371e343945
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctl.h
@@ -0,0 +1,1480 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_IOCTL_H
+#define _SYS_CRYPTO_IOCTL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/common.h>
+
+#define CRYPTO_MAX_ATTRIBUTE_COUNT 128
+
+#define CRYPTO_IOFLAGS_RW_SESSION 0x00000001
+
+#define CRYPTO(x) (('y' << 8) | (x))
+
+#define MAX_NUM_THRESHOLD 7
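A quick worked example of the command encoding above, using a value defined later in this header: since 'y' is 0x79,

        CRYPTO(20) == ('y' << 8) | 20 == (0x79 << 8) | 0x14 == 0x7914

so CRYPTO_GET_FUNCTION_LIST, defined below as CRYPTO(20), is ioctl command number 0x7914.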
+
+/* the PKCS11 Mechanisms */
+#define CKM_RC4 0x00000111
+#define CKM_DES3_ECB 0x00000132
+#define CKM_DES3_CBC 0x00000133
+#define CKM_MD5 0x00000210
+#define CKM_SHA_1 0x00000220
+#define CKM_AES_ECB 0x00001081
+#define CKM_AES_CBC 0x00001082
+
+/*
+ * General Purpose Ioctls
+ */
+
+typedef struct fl_mechs_threshold {
+ int mech_type;
+ uint32_t mech_threshold;
+} fl_mechs_threshold_t;
+
+typedef struct crypto_function_list {
+ boolean_t fl_digest_init;
+ boolean_t fl_digest;
+ boolean_t fl_digest_update;
+ boolean_t fl_digest_key;
+ boolean_t fl_digest_final;
+
+ boolean_t fl_encrypt_init;
+ boolean_t fl_encrypt;
+ boolean_t fl_encrypt_update;
+ boolean_t fl_encrypt_final;
+
+ boolean_t fl_decrypt_init;
+ boolean_t fl_decrypt;
+ boolean_t fl_decrypt_update;
+ boolean_t fl_decrypt_final;
+
+ boolean_t fl_mac_init;
+ boolean_t fl_mac;
+ boolean_t fl_mac_update;
+ boolean_t fl_mac_final;
+
+ boolean_t fl_sign_init;
+ boolean_t fl_sign;
+ boolean_t fl_sign_update;
+ boolean_t fl_sign_final;
+ boolean_t fl_sign_recover_init;
+ boolean_t fl_sign_recover;
+
+ boolean_t fl_verify_init;
+ boolean_t fl_verify;
+ boolean_t fl_verify_update;
+ boolean_t fl_verify_final;
+ boolean_t fl_verify_recover_init;
+ boolean_t fl_verify_recover;
+
+ boolean_t fl_digest_encrypt_update;
+ boolean_t fl_decrypt_digest_update;
+ boolean_t fl_sign_encrypt_update;
+ boolean_t fl_decrypt_verify_update;
+
+ boolean_t fl_seed_random;
+ boolean_t fl_generate_random;
+
+ boolean_t fl_session_open;
+ boolean_t fl_session_close;
+ boolean_t fl_session_login;
+ boolean_t fl_session_logout;
+
+ boolean_t fl_object_create;
+ boolean_t fl_object_copy;
+ boolean_t fl_object_destroy;
+ boolean_t fl_object_get_size;
+ boolean_t fl_object_get_attribute_value;
+ boolean_t fl_object_set_attribute_value;
+ boolean_t fl_object_find_init;
+ boolean_t fl_object_find;
+ boolean_t fl_object_find_final;
+
+ boolean_t fl_key_generate;
+ boolean_t fl_key_generate_pair;
+ boolean_t fl_key_wrap;
+ boolean_t fl_key_unwrap;
+ boolean_t fl_key_derive;
+
+ boolean_t fl_init_token;
+ boolean_t fl_init_pin;
+ boolean_t fl_set_pin;
+
+ boolean_t prov_is_limited;
+ uint32_t prov_hash_threshold;
+ uint32_t prov_hash_limit;
+
+ int total_threshold_count;
+ fl_mechs_threshold_t fl_threshold[MAX_NUM_THRESHOLD];
+} crypto_function_list_t;
+
+typedef struct crypto_get_function_list {
+ uint_t fl_return_value;
+ crypto_provider_id_t fl_provider_id;
+ crypto_function_list_t fl_list;
+} crypto_get_function_list_t;
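A hedged user-space sketch of driving this request structure through the CRYPTO_GET_FUNCTION_LIST command defined below; the "/dev/crypto" device path is an assumption for illustration, and CRYPTO_SUCCESS comes from sys/crypto/common.h.

        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <sys/crypto/ioctl.h>

        static int
        provider_has_digest(crypto_provider_id_t id)
        {
                crypto_get_function_list_t fl = { 0 };
                int fd, ok = 0;

                fd = open("/dev/crypto", O_RDWR);  /* device path assumed */
                if (fd < 0)
                        return (0);
                fl.fl_provider_id = id;
                if (ioctl(fd, CRYPTO_GET_FUNCTION_LIST, &fl) == 0 &&
                    fl.fl_return_value == CRYPTO_SUCCESS)
                        ok = (fl.fl_list.fl_digest == B_TRUE);
                (void) close(fd);
                return (ok);
        }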
+
+typedef struct crypto_get_mechanism_number {
+ uint_t pn_return_value;
+ caddr_t pn_mechanism_string;
+ size_t pn_mechanism_len;
+ crypto_mech_type_t pn_internal_number;
+} crypto_get_mechanism_number_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_get_mechanism_number32 {
+ uint32_t pn_return_value;
+ caddr32_t pn_mechanism_string;
+ size32_t pn_mechanism_len;
+ crypto_mech_type_t pn_internal_number;
+} crypto_get_mechanism_number32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_GET_FUNCTION_LIST CRYPTO(20)
+#define CRYPTO_GET_MECHANISM_NUMBER CRYPTO(21)
+
+/*
+ * Session Ioctls
+ */
+
+typedef uint32_t crypto_flags_t;
+
+typedef struct crypto_open_session {
+ uint_t os_return_value;
+ crypto_session_id_t os_session;
+ crypto_flags_t os_flags;
+ crypto_provider_id_t os_provider_id;
+} crypto_open_session_t;
+
+typedef struct crypto_close_session {
+ uint_t cs_return_value;
+ crypto_session_id_t cs_session;
+} crypto_close_session_t;
+
+typedef struct crypto_close_all_sessions {
+ uint_t as_return_value;
+ crypto_provider_id_t as_provider_id;
+} crypto_close_all_sessions_t;
+
+#define CRYPTO_OPEN_SESSION CRYPTO(30)
+#define CRYPTO_CLOSE_SESSION CRYPTO(31)
+#define CRYPTO_CLOSE_ALL_SESSIONS CRYPTO(32)
+
+/*
+ * Login Ioctls
+ */
+typedef struct crypto_login {
+ uint_t co_return_value;
+ crypto_session_id_t co_session;
+ uint_t co_user_type;
+ uint_t co_pin_len;
+ caddr_t co_pin;
+} crypto_login_t;
+
+typedef struct crypto_logout {
+ uint_t cl_return_value;
+ crypto_session_id_t cl_session;
+} crypto_logout_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_login32 {
+ uint32_t co_return_value;
+ crypto_session_id_t co_session;
+ uint32_t co_user_type;
+ uint32_t co_pin_len;
+ caddr32_t co_pin;
+} crypto_login32_t;
+
+typedef struct crypto_logout32 {
+ uint32_t cl_return_value;
+ crypto_session_id_t cl_session;
+} crypto_logout32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_LOGIN CRYPTO(40)
+#define CRYPTO_LOGOUT CRYPTO(41)
+
+/*
+ * Cryptographic Ioctls
+ */
+typedef struct crypto_encrypt {
+ uint_t ce_return_value;
+ crypto_session_id_t ce_session;
+ size_t ce_datalen;
+ caddr_t ce_databuf;
+ size_t ce_encrlen;
+ caddr_t ce_encrbuf;
+ uint_t ce_flags;
+} crypto_encrypt_t;
+
+typedef struct crypto_encrypt_init {
+ uint_t ei_return_value;
+ crypto_session_id_t ei_session;
+ crypto_mechanism_t ei_mech;
+ crypto_key_t ei_key;
+} crypto_encrypt_init_t;
+
+typedef struct crypto_encrypt_update {
+ uint_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size_t eu_datalen;
+ caddr_t eu_databuf;
+ size_t eu_encrlen;
+ caddr_t eu_encrbuf;
+} crypto_encrypt_update_t;
+
+typedef struct crypto_encrypt_final {
+ uint_t ef_return_value;
+ crypto_session_id_t ef_session;
+ size_t ef_encrlen;
+ caddr_t ef_encrbuf;
+} crypto_encrypt_final_t;
+
+typedef struct crypto_decrypt {
+ uint_t cd_return_value;
+ crypto_session_id_t cd_session;
+ size_t cd_encrlen;
+ caddr_t cd_encrbuf;
+ size_t cd_datalen;
+ caddr_t cd_databuf;
+ uint_t cd_flags;
+} crypto_decrypt_t;
+
+typedef struct crypto_decrypt_init {
+ uint_t di_return_value;
+ crypto_session_id_t di_session;
+ crypto_mechanism_t di_mech;
+ crypto_key_t di_key;
+} crypto_decrypt_init_t;
+
+typedef struct crypto_decrypt_update {
+ uint_t du_return_value;
+ crypto_session_id_t du_session;
+ size_t du_encrlen;
+ caddr_t du_encrbuf;
+ size_t du_datalen;
+ caddr_t du_databuf;
+} crypto_decrypt_update_t;
+
+typedef struct crypto_decrypt_final {
+ uint_t df_return_value;
+ crypto_session_id_t df_session;
+ size_t df_datalen;
+ caddr_t df_databuf;
+} crypto_decrypt_final_t;
+
+typedef struct crypto_digest {
+ uint_t cd_return_value;
+ crypto_session_id_t cd_session;
+ size_t cd_datalen;
+ caddr_t cd_databuf;
+ size_t cd_digestlen;
+ caddr_t cd_digestbuf;
+} crypto_digest_t;
+
+typedef struct crypto_digest_init {
+ uint_t di_return_value;
+ crypto_session_id_t di_session;
+ crypto_mechanism_t di_mech;
+} crypto_digest_init_t;
+
+typedef struct crypto_digest_update {
+ uint_t du_return_value;
+ crypto_session_id_t du_session;
+ size_t du_datalen;
+ caddr_t du_databuf;
+} crypto_digest_update_t;
+
+typedef struct crypto_digest_key {
+ uint_t dk_return_value;
+ crypto_session_id_t dk_session;
+ crypto_key_t dk_key;
+} crypto_digest_key_t;
+
+typedef struct crypto_digest_final {
+ uint_t df_return_value;
+ crypto_session_id_t df_session;
+ size_t df_digestlen;
+ caddr_t df_digestbuf;
+} crypto_digest_final_t;
+
+typedef struct crypto_mac {
+ uint_t cm_return_value;
+ crypto_session_id_t cm_session;
+ size_t cm_datalen;
+ caddr_t cm_databuf;
+ size_t cm_maclen;
+ caddr_t cm_macbuf;
+} crypto_mac_t;
+
+typedef struct crypto_mac_init {
+ uint_t mi_return_value;
+ crypto_session_id_t mi_session;
+ crypto_mechanism_t mi_mech;
+ crypto_key_t mi_key;
+} crypto_mac_init_t;
+
+typedef struct crypto_mac_update {
+ uint_t mu_return_value;
+ crypto_session_id_t mu_session;
+ size_t mu_datalen;
+ caddr_t mu_databuf;
+} crypto_mac_update_t;
+
+typedef struct crypto_mac_final {
+ uint_t mf_return_value;
+ crypto_session_id_t mf_session;
+ size_t mf_maclen;
+ caddr_t mf_macbuf;
+} crypto_mac_final_t;
+
+typedef struct crypto_sign {
+ uint_t cs_return_value;
+ crypto_session_id_t cs_session;
+ size_t cs_datalen;
+ caddr_t cs_databuf;
+ size_t cs_signlen;
+ caddr_t cs_signbuf;
+} crypto_sign_t;
+
+typedef struct crypto_sign_init {
+ uint_t si_return_value;
+ crypto_session_id_t si_session;
+ crypto_mechanism_t si_mech;
+ crypto_key_t si_key;
+} crypto_sign_init_t;
+
+typedef struct crypto_sign_update {
+ uint_t su_return_value;
+ crypto_session_id_t su_session;
+ size_t su_datalen;
+ caddr_t su_databuf;
+} crypto_sign_update_t;
+
+typedef struct crypto_sign_final {
+ uint_t sf_return_value;
+ crypto_session_id_t sf_session;
+ size_t sf_signlen;
+ caddr_t sf_signbuf;
+} crypto_sign_final_t;
+
+typedef struct crypto_sign_recover_init {
+ uint_t ri_return_value;
+ crypto_session_id_t ri_session;
+ crypto_mechanism_t ri_mech;
+ crypto_key_t ri_key;
+} crypto_sign_recover_init_t;
+
+typedef struct crypto_sign_recover {
+ uint_t sr_return_value;
+ crypto_session_id_t sr_session;
+ size_t sr_datalen;
+ caddr_t sr_databuf;
+ size_t sr_signlen;
+ caddr_t sr_signbuf;
+} crypto_sign_recover_t;
+
+typedef struct crypto_verify {
+ uint_t cv_return_value;
+ crypto_session_id_t cv_session;
+ size_t cv_datalen;
+ caddr_t cv_databuf;
+ size_t cv_signlen;
+ caddr_t cv_signbuf;
+} crypto_verify_t;
+
+typedef struct crypto_verify_init {
+ uint_t vi_return_value;
+ crypto_session_id_t vi_session;
+ crypto_mechanism_t vi_mech;
+ crypto_key_t vi_key;
+} crypto_verify_init_t;
+
+typedef struct crypto_verify_update {
+ uint_t vu_return_value;
+ crypto_session_id_t vu_session;
+ size_t vu_datalen;
+ caddr_t vu_databuf;
+} crypto_verify_update_t;
+
+typedef struct crypto_verify_final {
+ uint_t vf_return_value;
+ crypto_session_id_t vf_session;
+ size_t vf_signlen;
+ caddr_t vf_signbuf;
+} crypto_verify_final_t;
+
+typedef struct crypto_verify_recover_init {
+ uint_t ri_return_value;
+ crypto_session_id_t ri_session;
+ crypto_mechanism_t ri_mech;
+ crypto_key_t ri_key;
+} crypto_verify_recover_init_t;
+
+typedef struct crypto_verify_recover {
+ uint_t vr_return_value;
+ crypto_session_id_t vr_session;
+ size_t vr_signlen;
+ caddr_t vr_signbuf;
+ size_t vr_datalen;
+ caddr_t vr_databuf;
+} crypto_verify_recover_t;
+
+typedef struct crypto_digest_encrypt_update {
+ uint_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size_t eu_datalen;
+ caddr_t eu_databuf;
+ size_t eu_encrlen;
+ caddr_t eu_encrbuf;
+} crypto_digest_encrypt_update_t;
+
+typedef struct crypto_decrypt_digest_update {
+ uint_t du_return_value;
+ crypto_session_id_t du_session;
+ size_t du_encrlen;
+ caddr_t du_encrbuf;
+ size_t du_datalen;
+ caddr_t du_databuf;
+} crypto_decrypt_digest_update_t;
+
+typedef struct crypto_sign_encrypt_update {
+ uint_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size_t eu_datalen;
+ caddr_t eu_databuf;
+ size_t eu_encrlen;
+ caddr_t eu_encrbuf;
+} crypto_sign_encrypt_update_t;
+
+typedef struct crypto_decrypt_verify_update {
+ uint_t vu_return_value;
+ crypto_session_id_t vu_session;
+ size_t vu_encrlen;
+ caddr_t vu_encrbuf;
+ size_t vu_datalen;
+ caddr_t vu_databuf;
+} crypto_decrypt_verify_update_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_encrypt32 {
+ uint32_t ce_return_value;
+ crypto_session_id_t ce_session;
+ size32_t ce_datalen;
+ caddr32_t ce_databuf;
+ size32_t ce_encrlen;
+ caddr32_t ce_encrbuf;
+ uint32_t ce_flags;
+} crypto_encrypt32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_encrypt_init32 {
+ uint32_t ei_return_value;
+ crypto_session_id_t ei_session;
+ crypto_mechanism32_t ei_mech;
+ crypto_key32_t ei_key;
+} crypto_encrypt_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_encrypt_update32 {
+ uint32_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size32_t eu_datalen;
+ caddr32_t eu_databuf;
+ size32_t eu_encrlen;
+ caddr32_t eu_encrbuf;
+} crypto_encrypt_update32_t;
+
+typedef struct crypto_encrypt_final32 {
+ uint32_t ef_return_value;
+ crypto_session_id_t ef_session;
+ size32_t ef_encrlen;
+ caddr32_t ef_encrbuf;
+} crypto_encrypt_final32_t;
+
+typedef struct crypto_decrypt32 {
+ uint32_t cd_return_value;
+ crypto_session_id_t cd_session;
+ size32_t cd_encrlen;
+ caddr32_t cd_encrbuf;
+ size32_t cd_datalen;
+ caddr32_t cd_databuf;
+ uint32_t cd_flags;
+} crypto_decrypt32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_decrypt_init32 {
+ uint32_t di_return_value;
+ crypto_session_id_t di_session;
+ crypto_mechanism32_t di_mech;
+ crypto_key32_t di_key;
+} crypto_decrypt_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_decrypt_update32 {
+ uint32_t du_return_value;
+ crypto_session_id_t du_session;
+ size32_t du_encrlen;
+ caddr32_t du_encrbuf;
+ size32_t du_datalen;
+ caddr32_t du_databuf;
+} crypto_decrypt_update32_t;
+
+typedef struct crypto_decrypt_final32 {
+ uint32_t df_return_value;
+ crypto_session_id_t df_session;
+ size32_t df_datalen;
+ caddr32_t df_databuf;
+} crypto_decrypt_final32_t;
+
+typedef struct crypto_digest32 {
+ uint32_t cd_return_value;
+ crypto_session_id_t cd_session;
+ size32_t cd_datalen;
+ caddr32_t cd_databuf;
+ size32_t cd_digestlen;
+ caddr32_t cd_digestbuf;
+} crypto_digest32_t;
+
+typedef struct crypto_digest_init32 {
+ uint32_t di_return_value;
+ crypto_session_id_t di_session;
+ crypto_mechanism32_t di_mech;
+} crypto_digest_init32_t;
+
+typedef struct crypto_digest_update32 {
+ uint32_t du_return_value;
+ crypto_session_id_t du_session;
+ size32_t du_datalen;
+ caddr32_t du_databuf;
+} crypto_digest_update32_t;
+
+typedef struct crypto_digest_key32 {
+ uint32_t dk_return_value;
+ crypto_session_id_t dk_session;
+ crypto_key32_t dk_key;
+} crypto_digest_key32_t;
+
+typedef struct crypto_digest_final32 {
+ uint32_t df_return_value;
+ crypto_session_id_t df_session;
+ size32_t df_digestlen;
+ caddr32_t df_digestbuf;
+} crypto_digest_final32_t;
+
+typedef struct crypto_mac32 {
+ uint32_t cm_return_value;
+ crypto_session_id_t cm_session;
+ size32_t cm_datalen;
+ caddr32_t cm_databuf;
+ size32_t cm_maclen;
+ caddr32_t cm_macbuf;
+} crypto_mac32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_mac_init32 {
+ uint32_t mi_return_value;
+ crypto_session_id_t mi_session;
+ crypto_mechanism32_t mi_mech;
+ crypto_key32_t mi_key;
+} crypto_mac_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_mac_update32 {
+ uint32_t mu_return_value;
+ crypto_session_id_t mu_session;
+ size32_t mu_datalen;
+ caddr32_t mu_databuf;
+} crypto_mac_update32_t;
+
+typedef struct crypto_mac_final32 {
+ uint32_t mf_return_value;
+ crypto_session_id_t mf_session;
+ size32_t mf_maclen;
+ caddr32_t mf_macbuf;
+} crypto_mac_final32_t;
+
+typedef struct crypto_sign32 {
+ uint32_t cs_return_value;
+ crypto_session_id_t cs_session;
+ size32_t cs_datalen;
+ caddr32_t cs_databuf;
+ size32_t cs_signlen;
+ caddr32_t cs_signbuf;
+} crypto_sign32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_sign_init32 {
+ uint32_t si_return_value;
+ crypto_session_id_t si_session;
+ crypto_mechanism32_t si_mech;
+ crypto_key32_t si_key;
+} crypto_sign_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_sign_update32 {
+ uint32_t su_return_value;
+ crypto_session_id_t su_session;
+ size32_t su_datalen;
+ caddr32_t su_databuf;
+} crypto_sign_update32_t;
+
+typedef struct crypto_sign_final32 {
+ uint32_t sf_return_value;
+ crypto_session_id_t sf_session;
+ size32_t sf_signlen;
+ caddr32_t sf_signbuf;
+} crypto_sign_final32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_sign_recover_init32 {
+ uint32_t ri_return_value;
+ crypto_session_id_t ri_session;
+ crypto_mechanism32_t ri_mech;
+ crypto_key32_t ri_key;
+} crypto_sign_recover_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_sign_recover32 {
+ uint32_t sr_return_value;
+ crypto_session_id_t sr_session;
+ size32_t sr_datalen;
+ caddr32_t sr_databuf;
+ size32_t sr_signlen;
+ caddr32_t sr_signbuf;
+} crypto_sign_recover32_t;
+
+typedef struct crypto_verify32 {
+ uint32_t cv_return_value;
+ crypto_session_id_t cv_session;
+ size32_t cv_datalen;
+ caddr32_t cv_databuf;
+ size32_t cv_signlen;
+ caddr32_t cv_signbuf;
+} crypto_verify32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_verify_init32 {
+ uint32_t vi_return_value;
+ crypto_session_id_t vi_session;
+ crypto_mechanism32_t vi_mech;
+ crypto_key32_t vi_key;
+} crypto_verify_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_verify_update32 {
+ uint32_t vu_return_value;
+ crypto_session_id_t vu_session;
+ size32_t vu_datalen;
+ caddr32_t vu_databuf;
+} crypto_verify_update32_t;
+
+typedef struct crypto_verify_final32 {
+ uint32_t vf_return_value;
+ crypto_session_id_t vf_session;
+ size32_t vf_signlen;
+ caddr32_t vf_signbuf;
+} crypto_verify_final32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_verify_recover_init32 {
+ uint32_t ri_return_value;
+ crypto_session_id_t ri_session;
+ crypto_mechanism32_t ri_mech;
+ crypto_key32_t ri_key;
+} crypto_verify_recover_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_verify_recover32 {
+ uint32_t vr_return_value;
+ crypto_session_id_t vr_session;
+ size32_t vr_signlen;
+ caddr32_t vr_signbuf;
+ size32_t vr_datalen;
+ caddr32_t vr_databuf;
+} crypto_verify_recover32_t;
+
+typedef struct crypto_digest_encrypt_update32 {
+ uint32_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size32_t eu_datalen;
+ caddr32_t eu_databuf;
+ size32_t eu_encrlen;
+ caddr32_t eu_encrbuf;
+} crypto_digest_encrypt_update32_t;
+
+typedef struct crypto_decrypt_digest_update32 {
+ uint32_t du_return_value;
+ crypto_session_id_t du_session;
+ size32_t du_encrlen;
+ caddr32_t du_encrbuf;
+ size32_t du_datalen;
+ caddr32_t du_databuf;
+} crypto_decrypt_digest_update32_t;
+
+typedef struct crypto_sign_encrypt_update32 {
+ uint32_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size32_t eu_datalen;
+ caddr32_t eu_databuf;
+ size32_t eu_encrlen;
+ caddr32_t eu_encrbuf;
+} crypto_sign_encrypt_update32_t;
+
+typedef struct crypto_decrypt_verify_update32 {
+ uint32_t vu_return_value;
+ crypto_session_id_t vu_session;
+ size32_t vu_encrlen;
+ caddr32_t vu_encrbuf;
+ size32_t vu_datalen;
+ caddr32_t vu_databuf;
+} crypto_decrypt_verify_update32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_ENCRYPT CRYPTO(50)
+#define CRYPTO_ENCRYPT_INIT CRYPTO(51)
+#define CRYPTO_ENCRYPT_UPDATE CRYPTO(52)
+#define CRYPTO_ENCRYPT_FINAL CRYPTO(53)
+#define CRYPTO_DECRYPT CRYPTO(54)
+#define CRYPTO_DECRYPT_INIT CRYPTO(55)
+#define CRYPTO_DECRYPT_UPDATE CRYPTO(56)
+#define CRYPTO_DECRYPT_FINAL CRYPTO(57)
+
+#define CRYPTO_DIGEST CRYPTO(58)
+#define CRYPTO_DIGEST_INIT CRYPTO(59)
+#define CRYPTO_DIGEST_UPDATE CRYPTO(60)
+#define CRYPTO_DIGEST_KEY CRYPTO(61)
+#define CRYPTO_DIGEST_FINAL CRYPTO(62)
+#define CRYPTO_MAC CRYPTO(63)
+#define CRYPTO_MAC_INIT CRYPTO(64)
+#define CRYPTO_MAC_UPDATE CRYPTO(65)
+#define CRYPTO_MAC_FINAL CRYPTO(66)
+
+#define CRYPTO_SIGN CRYPTO(67)
+#define CRYPTO_SIGN_INIT CRYPTO(68)
+#define CRYPTO_SIGN_UPDATE CRYPTO(69)
+#define CRYPTO_SIGN_FINAL CRYPTO(70)
+#define CRYPTO_SIGN_RECOVER_INIT CRYPTO(71)
+#define CRYPTO_SIGN_RECOVER CRYPTO(72)
+#define CRYPTO_VERIFY CRYPTO(73)
+#define CRYPTO_VERIFY_INIT CRYPTO(74)
+#define CRYPTO_VERIFY_UPDATE CRYPTO(75)
+#define CRYPTO_VERIFY_FINAL CRYPTO(76)
+#define CRYPTO_VERIFY_RECOVER_INIT CRYPTO(77)
+#define CRYPTO_VERIFY_RECOVER CRYPTO(78)
+
+#define CRYPTO_DIGEST_ENCRYPT_UPDATE CRYPTO(79)
+#define CRYPTO_DECRYPT_DIGEST_UPDATE CRYPTO(80)
+#define CRYPTO_SIGN_ENCRYPT_UPDATE CRYPTO(81)
+#define CRYPTO_DECRYPT_VERIFY_UPDATE CRYPTO(82)
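A hedged sketch of the multi-part encrypt flow over the commands above. The descriptor fd, session id, mechanism, key, and the pt/ct buffers with their lengths are assumed to come from earlier setup (CRYPTO_OPEN_SESSION and friends), error handling is elided, and the convention that the kernel writes status into *_return_value and updates the output length fields is assumed.

        crypto_encrypt_init_t ei = { 0 };
        crypto_encrypt_update_t eu = { 0 };
        crypto_encrypt_final_t ef = { 0 };

        ei.ei_session = session;        /* from CRYPTO_OPEN_SESSION (assumed) */
        ei.ei_mech = mech;              /* crypto_mechanism_t, assumed set up */
        ei.ei_key = key;                /* crypto_key_t, assumed set up */
        (void) ioctl(fd, CRYPTO_ENCRYPT_INIT, &ei);

        eu.eu_session = session;        /* one chunk shown; repeat per chunk */
        eu.eu_datalen = ptlen;
        eu.eu_databuf = pt;
        eu.eu_encrlen = ctlen;
        eu.eu_encrbuf = ct;
        (void) ioctl(fd, CRYPTO_ENCRYPT_UPDATE, &eu);

        ef.ef_session = session;
        ef.ef_encrlen = ctlen;          /* remaining space, illustrative */
        ef.ef_encrbuf = ct;
        (void) ioctl(fd, CRYPTO_ENCRYPT_FINAL, &ef);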
+
+/*
+ * Random Number Ioctls
+ */
+typedef struct crypto_seed_random {
+ uint_t sr_return_value;
+ crypto_session_id_t sr_session;
+ size_t sr_seedlen;
+ caddr_t sr_seedbuf;
+} crypto_seed_random_t;
+
+typedef struct crypto_generate_random {
+ uint_t gr_return_value;
+ crypto_session_id_t gr_session;
+ caddr_t gr_buf;
+ size_t gr_buflen;
+} crypto_generate_random_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_seed_random32 {
+ uint32_t sr_return_value;
+ crypto_session_id_t sr_session;
+ size32_t sr_seedlen;
+ caddr32_t sr_seedbuf;
+} crypto_seed_random32_t;
+
+typedef struct crypto_generate_random32 {
+ uint32_t gr_return_value;
+ crypto_session_id_t gr_session;
+ caddr32_t gr_buf;
+ size32_t gr_buflen;
+} crypto_generate_random32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_SEED_RANDOM CRYPTO(90)
+#define CRYPTO_GENERATE_RANDOM CRYPTO(91)
+
+/*
+ * Object Management Ioctls
+ */
+typedef struct crypto_object_create {
+ uint_t oc_return_value;
+ crypto_session_id_t oc_session;
+ crypto_object_id_t oc_handle;
+ uint_t oc_count;
+ caddr_t oc_attributes;
+} crypto_object_create_t;
+
+typedef struct crypto_object_copy {
+ uint_t oc_return_value;
+ crypto_session_id_t oc_session;
+ crypto_object_id_t oc_handle;
+ crypto_object_id_t oc_new_handle;
+ uint_t oc_count;
+ caddr_t oc_new_attributes;
+} crypto_object_copy_t;
+
+typedef struct crypto_object_destroy {
+ uint_t od_return_value;
+ crypto_session_id_t od_session;
+ crypto_object_id_t od_handle;
+} crypto_object_destroy_t;
+
+typedef struct crypto_object_get_attribute_value {
+ uint_t og_return_value;
+ crypto_session_id_t og_session;
+ crypto_object_id_t og_handle;
+ uint_t og_count;
+ caddr_t og_attributes;
+} crypto_object_get_attribute_value_t;
+
+typedef struct crypto_object_get_size {
+ uint_t gs_return_value;
+ crypto_session_id_t gs_session;
+ crypto_object_id_t gs_handle;
+ size_t gs_size;
+} crypto_object_get_size_t;
+
+typedef struct crypto_object_set_attribute_value {
+ uint_t sa_return_value;
+ crypto_session_id_t sa_session;
+ crypto_object_id_t sa_handle;
+ uint_t sa_count;
+ caddr_t sa_attributes;
+} crypto_object_set_attribute_value_t;
+
+typedef struct crypto_object_find_init {
+ uint_t fi_return_value;
+ crypto_session_id_t fi_session;
+ uint_t fi_count;
+ caddr_t fi_attributes;
+} crypto_object_find_init_t;
+
+typedef struct crypto_object_find_update {
+ uint_t fu_return_value;
+ crypto_session_id_t fu_session;
+ uint_t fu_max_count;
+ uint_t fu_count;
+ caddr_t fu_handles;
+} crypto_object_find_update_t;
+
+typedef struct crypto_object_find_final {
+ uint_t ff_return_value;
+ crypto_session_id_t ff_session;
+} crypto_object_find_final_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_object_create32 {
+ uint32_t oc_return_value;
+ crypto_session_id_t oc_session;
+ crypto_object_id_t oc_handle;
+ uint32_t oc_count;
+ caddr32_t oc_attributes;
+} crypto_object_create32_t;
+
+typedef struct crypto_object_copy32 {
+ uint32_t oc_return_value;
+ crypto_session_id_t oc_session;
+ crypto_object_id_t oc_handle;
+ crypto_object_id_t oc_new_handle;
+ uint32_t oc_count;
+ caddr32_t oc_new_attributes;
+} crypto_object_copy32_t;
+
+typedef struct crypto_object_destroy32 {
+ uint32_t od_return_value;
+ crypto_session_id_t od_session;
+ crypto_object_id_t od_handle;
+} crypto_object_destroy32_t;
+
+typedef struct crypto_object_get_attribute_value32 {
+ uint32_t og_return_value;
+ crypto_session_id_t og_session;
+ crypto_object_id_t og_handle;
+ uint32_t og_count;
+ caddr32_t og_attributes;
+} crypto_object_get_attribute_value32_t;
+
+typedef struct crypto_object_get_size32 {
+ uint32_t gs_return_value;
+ crypto_session_id_t gs_session;
+ crypto_object_id_t gs_handle;
+ size32_t gs_size;
+} crypto_object_get_size32_t;
+
+typedef struct crypto_object_set_attribute_value32 {
+ uint32_t sa_return_value;
+ crypto_session_id_t sa_session;
+ crypto_object_id_t sa_handle;
+ uint32_t sa_count;
+ caddr32_t sa_attributes;
+} crypto_object_set_attribute_value32_t;
+
+typedef struct crypto_object_find_init32 {
+ uint32_t fi_return_value;
+ crypto_session_id_t fi_session;
+ uint32_t fi_count;
+ caddr32_t fi_attributes;
+} crypto_object_find_init32_t;
+
+typedef struct crypto_object_find_update32 {
+ uint32_t fu_return_value;
+ crypto_session_id_t fu_session;
+ uint32_t fu_max_count;
+ uint32_t fu_count;
+ caddr32_t fu_handles;
+} crypto_object_find_update32_t;
+
+typedef struct crypto_object_find_final32 {
+ uint32_t ff_return_value;
+ crypto_session_id_t ff_session;
+} crypto_object_find_final32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_OBJECT_CREATE CRYPTO(100)
+#define CRYPTO_OBJECT_COPY CRYPTO(101)
+#define CRYPTO_OBJECT_DESTROY CRYPTO(102)
+#define CRYPTO_OBJECT_GET_ATTRIBUTE_VALUE CRYPTO(103)
+#define CRYPTO_OBJECT_GET_SIZE CRYPTO(104)
+#define CRYPTO_OBJECT_SET_ATTRIBUTE_VALUE CRYPTO(105)
+#define CRYPTO_OBJECT_FIND_INIT CRYPTO(106)
+#define CRYPTO_OBJECT_FIND_UPDATE CRYPTO(107)
+#define CRYPTO_OBJECT_FIND_FINAL CRYPTO(108)
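A minimal sketch of the object search flow over the three FIND commands above, assuming an open /dev/crypto descriptor fd and a session id from earlier steps; the empty template semantics and buffer sizing are illustrative assumptions.

        crypto_object_find_init_t fi = { 0 };
        crypto_object_find_update_t fu = { 0 };
        crypto_object_find_final_t ff = { 0 };
        crypto_object_id_t handles[16];         /* illustrative capacity */

        fi.fi_session = session;
        fi.fi_count = 0;                /* empty template: match all (assumed) */
        fi.fi_attributes = NULL;
        (void) ioctl(fd, CRYPTO_OBJECT_FIND_INIT, &fi);

        fu.fu_session = session;
        fu.fu_max_count = 16;
        fu.fu_handles = (caddr_t)handles;
        (void) ioctl(fd, CRYPTO_OBJECT_FIND_UPDATE, &fu);
        /* fu.fu_count now holds the number of handles returned (assumed) */

        ff.ff_session = session;
        (void) ioctl(fd, CRYPTO_OBJECT_FIND_FINAL, &ff);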
+
+/*
+ * Key Generation Ioctls
+ */
+typedef struct crypto_object_generate_key {
+ uint_t gk_return_value;
+ crypto_session_id_t gk_session;
+ crypto_object_id_t gk_handle;
+ crypto_mechanism_t gk_mechanism;
+ uint_t gk_count;
+ caddr_t gk_attributes;
+} crypto_object_generate_key_t;
+
+typedef struct crypto_object_generate_key_pair {
+ uint_t kp_return_value;
+ crypto_session_id_t kp_session;
+ crypto_object_id_t kp_public_handle;
+ crypto_object_id_t kp_private_handle;
+ uint_t kp_public_count;
+ uint_t kp_private_count;
+ caddr_t kp_public_attributes;
+ caddr_t kp_private_attributes;
+ crypto_mechanism_t kp_mechanism;
+} crypto_object_generate_key_pair_t;
+
+typedef struct crypto_object_wrap_key {
+ uint_t wk_return_value;
+ crypto_session_id_t wk_session;
+ crypto_mechanism_t wk_mechanism;
+ crypto_key_t wk_wrapping_key;
+ crypto_object_id_t wk_object_handle;
+ size_t wk_wrapped_key_len;
+ caddr_t wk_wrapped_key;
+} crypto_object_wrap_key_t;
+
+typedef struct crypto_object_unwrap_key {
+ uint_t uk_return_value;
+ crypto_session_id_t uk_session;
+ crypto_mechanism_t uk_mechanism;
+ crypto_key_t uk_unwrapping_key;
+ crypto_object_id_t uk_object_handle;
+ size_t uk_wrapped_key_len;
+ caddr_t uk_wrapped_key;
+ uint_t uk_count;
+ caddr_t uk_attributes;
+} crypto_object_unwrap_key_t;
+
+typedef struct crypto_derive_key {
+ uint_t dk_return_value;
+ crypto_session_id_t dk_session;
+ crypto_mechanism_t dk_mechanism;
+ crypto_key_t dk_base_key;
+ crypto_object_id_t dk_object_handle;
+ uint_t dk_count;
+ caddr_t dk_attributes;
+} crypto_derive_key_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_object_generate_key32 {
+ uint32_t gk_return_value;
+ crypto_session_id_t gk_session;
+ crypto_object_id_t gk_handle;
+ crypto_mechanism32_t gk_mechanism;
+ uint32_t gk_count;
+ caddr32_t gk_attributes;
+} crypto_object_generate_key32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_object_generate_key_pair32 {
+ uint32_t kp_return_value;
+ crypto_session_id_t kp_session;
+ crypto_object_id_t kp_public_handle;
+ crypto_object_id_t kp_private_handle;
+ uint32_t kp_public_count;
+ uint32_t kp_private_count;
+ caddr32_t kp_public_attributes;
+ caddr32_t kp_private_attributes;
+ crypto_mechanism32_t kp_mechanism;
+} crypto_object_generate_key_pair32_t;
+
+typedef struct crypto_object_wrap_key32 {
+ uint32_t wk_return_value;
+ crypto_session_id_t wk_session;
+ crypto_mechanism32_t wk_mechanism;
+ crypto_key32_t wk_wrapping_key;
+ crypto_object_id_t wk_object_handle;
+ size32_t wk_wrapped_key_len;
+ caddr32_t wk_wrapped_key;
+} crypto_object_wrap_key32_t;
+
+typedef struct crypto_object_unwrap_key32 {
+ uint32_t uk_return_value;
+ crypto_session_id_t uk_session;
+ crypto_mechanism32_t uk_mechanism;
+ crypto_key32_t uk_unwrapping_key;
+ crypto_object_id_t uk_object_handle;
+ size32_t uk_wrapped_key_len;
+ caddr32_t uk_wrapped_key;
+ uint32_t uk_count;
+ caddr32_t uk_attributes;
+} crypto_object_unwrap_key32_t;
+
+typedef struct crypto_derive_key32 {
+ uint32_t dk_return_value;
+ crypto_session_id_t dk_session;
+ crypto_mechanism32_t dk_mechanism;
+ crypto_key32_t dk_base_key;
+ crypto_object_id_t dk_object_handle;
+ uint32_t dk_count;
+ caddr32_t dk_attributes;
+} crypto_derive_key32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_GENERATE_KEY CRYPTO(110)
+#define CRYPTO_GENERATE_KEY_PAIR CRYPTO(111)
+#define CRYPTO_WRAP_KEY CRYPTO(112)
+#define CRYPTO_UNWRAP_KEY CRYPTO(113)
+#define CRYPTO_DERIVE_KEY CRYPTO(114)
+
+/*
+ * Provider Management Ioctls
+ */
+
+typedef struct crypto_get_provider_list {
+ uint_t pl_return_value;
+ uint_t pl_count;
+ crypto_provider_entry_t pl_list[1];
+} crypto_get_provider_list_t;
+
+typedef struct crypto_provider_data {
+ uchar_t pd_prov_desc[CRYPTO_PROVIDER_DESCR_MAX_LEN];
+ uchar_t pd_label[CRYPTO_EXT_SIZE_LABEL];
+ uchar_t pd_manufacturerID[CRYPTO_EXT_SIZE_MANUF];
+ uchar_t pd_model[CRYPTO_EXT_SIZE_MODEL];
+ uchar_t pd_serial_number[CRYPTO_EXT_SIZE_SERIAL];
+ ulong_t pd_flags;
+ ulong_t pd_max_session_count;
+ ulong_t pd_session_count;
+ ulong_t pd_max_rw_session_count;
+ ulong_t pd_rw_session_count;
+ ulong_t pd_max_pin_len;
+ ulong_t pd_min_pin_len;
+ ulong_t pd_total_public_memory;
+ ulong_t pd_free_public_memory;
+ ulong_t pd_total_private_memory;
+ ulong_t pd_free_private_memory;
+ crypto_version_t pd_hardware_version;
+ crypto_version_t pd_firmware_version;
+ uchar_t pd_time[CRYPTO_EXT_SIZE_TIME];
+} crypto_provider_data_t;
+
+typedef struct crypto_get_provider_info {
+ uint_t gi_return_value;
+ crypto_provider_id_t gi_provider_id;
+ crypto_provider_data_t gi_provider_data;
+} crypto_get_provider_info_t;
+
+typedef struct crypto_get_provider_mechanisms {
+ uint_t pm_return_value;
+ crypto_provider_id_t pm_provider_id;
+ uint_t pm_count;
+ crypto_mech_name_t pm_list[1];
+} crypto_get_provider_mechanisms_t;
+
+typedef struct crypto_get_provider_mechanism_info {
+ uint_t mi_return_value;
+ crypto_provider_id_t mi_provider_id;
+ crypto_mech_name_t mi_mechanism_name;
+ uint32_t mi_min_key_size;
+ uint32_t mi_max_key_size;
+ uint32_t mi_flags;
+} crypto_get_provider_mechanism_info_t;
+
+typedef struct crypto_init_token {
+ uint_t it_return_value;
+ crypto_provider_id_t it_provider_id;
+ caddr_t it_pin;
+ size_t it_pin_len;
+ caddr_t it_label;
+} crypto_init_token_t;
+
+typedef struct crypto_init_pin {
+ uint_t ip_return_value;
+ crypto_session_id_t ip_session;
+ caddr_t ip_pin;
+ size_t ip_pin_len;
+} crypto_init_pin_t;
+
+typedef struct crypto_set_pin {
+ uint_t sp_return_value;
+ crypto_session_id_t sp_session;
+ caddr_t sp_old_pin;
+ size_t sp_old_len;
+ caddr_t sp_new_pin;
+ size_t sp_new_len;
+} crypto_set_pin_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_get_provider_list32 {
+ uint32_t pl_return_value;
+ uint32_t pl_count;
+ crypto_provider_entry_t pl_list[1];
+} crypto_get_provider_list32_t;
+
+typedef struct crypto_version32 {
+ uchar_t cv_major;
+ uchar_t cv_minor;
+} crypto_version32_t;
+
+typedef struct crypto_provider_data32 {
+ uchar_t pd_prov_desc[CRYPTO_PROVIDER_DESCR_MAX_LEN];
+ uchar_t pd_label[CRYPTO_EXT_SIZE_LABEL];
+ uchar_t pd_manufacturerID[CRYPTO_EXT_SIZE_MANUF];
+ uchar_t pd_model[CRYPTO_EXT_SIZE_MODEL];
+ uchar_t pd_serial_number[CRYPTO_EXT_SIZE_SERIAL];
+ uint32_t pd_flags;
+ uint32_t pd_max_session_count;
+ uint32_t pd_session_count;
+ uint32_t pd_max_rw_session_count;
+ uint32_t pd_rw_session_count;
+ uint32_t pd_max_pin_len;
+ uint32_t pd_min_pin_len;
+ uint32_t pd_total_public_memory;
+ uint32_t pd_free_public_memory;
+ uint32_t pd_total_private_memory;
+ uint32_t pd_free_private_memory;
+ crypto_version32_t pd_hardware_version;
+ crypto_version32_t pd_firmware_version;
+ uchar_t pd_time[CRYPTO_EXT_SIZE_TIME];
+} crypto_provider_data32_t;
+
+typedef struct crypto_get_provider_info32 {
+ uint32_t gi_return_value;
+ crypto_provider_id_t gi_provider_id;
+ crypto_provider_data32_t gi_provider_data;
+} crypto_get_provider_info32_t;
+
+typedef struct crypto_get_provider_mechanisms32 {
+ uint32_t pm_return_value;
+ crypto_provider_id_t pm_provider_id;
+ uint32_t pm_count;
+ crypto_mech_name_t pm_list[1];
+} crypto_get_provider_mechanisms32_t;
+
+typedef struct crypto_init_token32 {
+ uint32_t it_return_value;
+ crypto_provider_id_t it_provider_id;
+ caddr32_t it_pin;
+ size32_t it_pin_len;
+ caddr32_t it_label;
+} crypto_init_token32_t;
+
+typedef struct crypto_init_pin32 {
+ uint32_t ip_return_value;
+ crypto_session_id_t ip_session;
+ caddr32_t ip_pin;
+ size32_t ip_pin_len;
+} crypto_init_pin32_t;
+
+typedef struct crypto_set_pin32 {
+ uint32_t sp_return_value;
+ crypto_session_id_t sp_session;
+ caddr32_t sp_old_pin;
+ size32_t sp_old_len;
+ caddr32_t sp_new_pin;
+ size32_t sp_new_len;
+} crypto_set_pin32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_GET_PROVIDER_LIST CRYPTO(120)
+#define CRYPTO_GET_PROVIDER_INFO CRYPTO(121)
+#define CRYPTO_GET_PROVIDER_MECHANISMS CRYPTO(122)
+#define CRYPTO_GET_PROVIDER_MECHANISM_INFO CRYPTO(123)
+#define CRYPTO_INIT_TOKEN CRYPTO(124)
+#define CRYPTO_INIT_PIN CRYPTO(125)
+#define CRYPTO_SET_PIN CRYPTO(126)
+
+/*
+ * No (Key) Store Key Generation Ioctls
+ */
+typedef struct crypto_nostore_generate_key {
+ uint_t ngk_return_value;
+ crypto_session_id_t ngk_session;
+ crypto_mechanism_t ngk_mechanism;
+ uint_t ngk_in_count;
+ uint_t ngk_out_count;
+ caddr_t ngk_in_attributes;
+ caddr_t ngk_out_attributes;
+} crypto_nostore_generate_key_t;
+
+typedef struct crypto_nostore_generate_key_pair {
+ uint_t nkp_return_value;
+ crypto_session_id_t nkp_session;
+ uint_t nkp_in_public_count;
+ uint_t nkp_in_private_count;
+ uint_t nkp_out_public_count;
+ uint_t nkp_out_private_count;
+ caddr_t nkp_in_public_attributes;
+ caddr_t nkp_in_private_attributes;
+ caddr_t nkp_out_public_attributes;
+ caddr_t nkp_out_private_attributes;
+ crypto_mechanism_t nkp_mechanism;
+} crypto_nostore_generate_key_pair_t;
+
+typedef struct crypto_nostore_derive_key {
+ uint_t ndk_return_value;
+ crypto_session_id_t ndk_session;
+ crypto_mechanism_t ndk_mechanism;
+ crypto_key_t ndk_base_key;
+ uint_t ndk_in_count;
+ uint_t ndk_out_count;
+ caddr_t ndk_in_attributes;
+ caddr_t ndk_out_attributes;
+} crypto_nostore_derive_key_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_nostore_generate_key32 {
+ uint32_t ngk_return_value;
+ crypto_session_id_t ngk_session;
+ crypto_mechanism32_t ngk_mechanism;
+ uint32_t ngk_in_count;
+ uint32_t ngk_out_count;
+ caddr32_t ngk_in_attributes;
+ caddr32_t ngk_out_attributes;
+} crypto_nostore_generate_key32_t;
+
+typedef struct crypto_nostore_generate_key_pair32 {
+ uint32_t nkp_return_value;
+ crypto_session_id_t nkp_session;
+ uint32_t nkp_in_public_count;
+ uint32_t nkp_in_private_count;
+ uint32_t nkp_out_public_count;
+ uint32_t nkp_out_private_count;
+ caddr32_t nkp_in_public_attributes;
+ caddr32_t nkp_in_private_attributes;
+ caddr32_t nkp_out_public_attributes;
+ caddr32_t nkp_out_private_attributes;
+ crypto_mechanism32_t nkp_mechanism;
+} crypto_nostore_generate_key_pair32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_nostore_derive_key32 {
+ uint32_t ndk_return_value;
+ crypto_session_id_t ndk_session;
+ crypto_mechanism32_t ndk_mechanism;
+ crypto_key32_t ndk_base_key;
+ uint32_t ndk_in_count;
+ uint32_t ndk_out_count;
+ caddr32_t ndk_in_attributes;
+ caddr32_t ndk_out_attributes;
+} crypto_nostore_derive_key32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_NOSTORE_GENERATE_KEY CRYPTO(127)
+#define CRYPTO_NOSTORE_GENERATE_KEY_PAIR CRYPTO(128)
+#define CRYPTO_NOSTORE_DERIVE_KEY CRYPTO(129)
+
+/*
+ * Mechanism Ioctls
+ */
+
+typedef struct crypto_get_mechanism_list {
+ uint_t ml_return_value;
+ uint_t ml_count;
+ crypto_mech_name_t ml_list[1];
+} crypto_get_mechanism_list_t;
+
+typedef struct crypto_get_all_mechanism_info {
+ uint_t mi_return_value;
+ crypto_mech_name_t mi_mechanism_name;
+ uint_t mi_count;
+ crypto_mechanism_info_t mi_list[1];
+} crypto_get_all_mechanism_info_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_get_mechanism_list32 {
+ uint32_t ml_return_value;
+ uint32_t ml_count;
+ crypto_mech_name_t ml_list[1];
+} crypto_get_mechanism_list32_t;
+
+typedef struct crypto_get_all_mechanism_info32 {
+ uint32_t mi_return_value;
+ crypto_mech_name_t mi_mechanism_name;
+ uint32_t mi_count;
+ crypto_mechanism_info32_t mi_list[1];
+} crypto_get_all_mechanism_info32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_GET_MECHANISM_LIST CRYPTO(140)
+#define CRYPTO_GET_ALL_MECHANISM_INFO CRYPTO(141)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_IOCTL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctladmin.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctladmin.h
new file mode 100644
index 000000000000..24babd7755cc
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctladmin.h
@@ -0,0 +1,136 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_IOCTLADMIN_H
+#define _SYS_CRYPTO_IOCTLADMIN_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+
+#define ADMIN_IOCTL_DEVICE "/dev/cryptoadm"
+
+#define CRYPTOADMIN(x) (('y' << 8) | (x))
+
+/*
+ * Administrative IOCTLs
+ */
+
+typedef struct crypto_get_dev_list {
+ uint_t dl_return_value;
+ uint_t dl_dev_count;
+ crypto_dev_list_entry_t dl_devs[1];
+} crypto_get_dev_list_t;
+
+typedef struct crypto_get_soft_list {
+ uint_t sl_return_value;
+ uint_t sl_soft_count;
+ size_t sl_soft_len;
+ caddr_t sl_soft_names;
+} crypto_get_soft_list_t;
+
+typedef struct crypto_get_dev_info {
+ uint_t di_return_value;
+ char di_dev_name[MAXNAMELEN];
+ uint_t di_dev_instance;
+ uint_t di_count;
+ crypto_mech_name_t di_list[1];
+} crypto_get_dev_info_t;
+
+typedef struct crypto_get_soft_info {
+ uint_t si_return_value;
+ char si_name[MAXNAMELEN];
+ uint_t si_count;
+ crypto_mech_name_t si_list[1];
+} crypto_get_soft_info_t;
+
+typedef struct crypto_load_dev_disabled {
+ uint_t dd_return_value;
+ char dd_dev_name[MAXNAMELEN];
+ uint_t dd_dev_instance;
+ uint_t dd_count;
+ crypto_mech_name_t dd_list[1];
+} crypto_load_dev_disabled_t;
+
+typedef struct crypto_load_soft_disabled {
+ uint_t sd_return_value;
+ char sd_name[MAXNAMELEN];
+ uint_t sd_count;
+ crypto_mech_name_t sd_list[1];
+} crypto_load_soft_disabled_t;
+
+typedef struct crypto_unload_soft_module {
+ uint_t sm_return_value;
+ char sm_name[MAXNAMELEN];
+} crypto_unload_soft_module_t;
+
+typedef struct crypto_load_soft_config {
+ uint_t sc_return_value;
+ char sc_name[MAXNAMELEN];
+ uint_t sc_count;
+ crypto_mech_name_t sc_list[1];
+} crypto_load_soft_config_t;
+
+typedef struct crypto_load_door {
+ uint_t ld_return_value;
+ uint_t ld_did;
+} crypto_load_door_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_get_soft_list32 {
+ uint32_t sl_return_value;
+ uint32_t sl_soft_count;
+ size32_t sl_soft_len;
+ caddr32_t sl_soft_names;
+} crypto_get_soft_list32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_GET_VERSION CRYPTOADMIN(1)
+#define CRYPTO_GET_DEV_LIST CRYPTOADMIN(2)
+#define CRYPTO_GET_SOFT_LIST CRYPTOADMIN(3)
+#define CRYPTO_GET_DEV_INFO CRYPTOADMIN(4)
+#define CRYPTO_GET_SOFT_INFO CRYPTOADMIN(5)
+#define CRYPTO_LOAD_DEV_DISABLED CRYPTOADMIN(8)
+#define CRYPTO_LOAD_SOFT_DISABLED CRYPTOADMIN(9)
+#define CRYPTO_UNLOAD_SOFT_MODULE CRYPTOADMIN(10)
+#define CRYPTO_LOAD_SOFT_CONFIG CRYPTOADMIN(11)
+#define CRYPTO_POOL_CREATE CRYPTOADMIN(12)
+#define CRYPTO_POOL_WAIT CRYPTOADMIN(13)
+#define CRYPTO_POOL_RUN CRYPTOADMIN(14)
+#define CRYPTO_LOAD_DOOR CRYPTOADMIN(15)
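A hedged sketch of querying the software provider list through ADMIN_IOCTL_DEVICE with CRYPTO_GET_SOFT_LIST from above; the buffer size is illustrative, and the packing of sl_soft_names as consecutive NUL-terminated names is an assumption.

        #include <stdio.h>
        #include <string.h>
        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <sys/crypto/ioctladmin.h>

        static void
        print_soft_providers(void)
        {
                crypto_get_soft_list_t sl = { 0 };
                char names[4096];               /* illustrative buffer size */
                char *p;
                uint_t i;
                int fd;

                fd = open(ADMIN_IOCTL_DEVICE, O_RDWR);
                if (fd < 0)
                        return;
                sl.sl_soft_names = names;
                sl.sl_soft_len = sizeof (names);
                if (ioctl(fd, CRYPTO_GET_SOFT_LIST, &sl) == 0 &&
                    sl.sl_return_value == CRYPTO_SUCCESS) {
                        for (p = names, i = 0; i < sl.sl_soft_count; i++) {
                                (void) printf("%s\n", p);
                                p += strlen(p) + 1;  /* NUL-separated (assumed) */
                        }
                }
                (void) close(fd);
        }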
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_IOCTLADMIN_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/ops_impl.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/ops_impl.h
new file mode 100644
index 000000000000..230d74b063fc
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/ops_impl.h
@@ -0,0 +1,630 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_OPS_IMPL_H
+#define _SYS_CRYPTO_OPS_IMPL_H
+
+/*
+ * Scheduler internal structures.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/common.h>
+
+/*
+ * The parameters needed for each function group are batched
+ * in one structure. This is much simpler than having a
+ * separate structure for each function.
+ *
+ * In some cases, a field is generically named to keep the
+ * structure small. The comments indicate these cases.
+ */
+typedef struct kcf_digest_ops_params {
+ crypto_session_id_t do_sid;
+ crypto_mech_type_t do_framework_mechtype;
+ crypto_mechanism_t do_mech;
+ crypto_data_t *do_data;
+ crypto_data_t *do_digest;
+ crypto_key_t *do_digest_key; /* Argument for digest_key() */
+} kcf_digest_ops_params_t;
+
+typedef struct kcf_mac_ops_params {
+ crypto_session_id_t mo_sid;
+ crypto_mech_type_t mo_framework_mechtype;
+ crypto_mechanism_t mo_mech;
+ crypto_key_t *mo_key;
+ crypto_data_t *mo_data;
+ crypto_data_t *mo_mac;
+ crypto_spi_ctx_template_t mo_templ;
+} kcf_mac_ops_params_t;
+
+typedef struct kcf_encrypt_ops_params {
+ crypto_session_id_t eo_sid;
+ crypto_mech_type_t eo_framework_mechtype;
+ crypto_mechanism_t eo_mech;
+ crypto_key_t *eo_key;
+ crypto_data_t *eo_plaintext;
+ crypto_data_t *eo_ciphertext;
+ crypto_spi_ctx_template_t eo_templ;
+} kcf_encrypt_ops_params_t;
+
+typedef struct kcf_decrypt_ops_params {
+ crypto_session_id_t dop_sid;
+ crypto_mech_type_t dop_framework_mechtype;
+ crypto_mechanism_t dop_mech;
+ crypto_key_t *dop_key;
+ crypto_data_t *dop_ciphertext;
+ crypto_data_t *dop_plaintext;
+ crypto_spi_ctx_template_t dop_templ;
+} kcf_decrypt_ops_params_t;
+
+typedef struct kcf_sign_ops_params {
+ crypto_session_id_t so_sid;
+ crypto_mech_type_t so_framework_mechtype;
+ crypto_mechanism_t so_mech;
+ crypto_key_t *so_key;
+ crypto_data_t *so_data;
+ crypto_data_t *so_signature;
+ crypto_spi_ctx_template_t so_templ;
+} kcf_sign_ops_params_t;
+
+typedef struct kcf_verify_ops_params {
+ crypto_session_id_t vo_sid;
+ crypto_mech_type_t vo_framework_mechtype;
+ crypto_mechanism_t vo_mech;
+ crypto_key_t *vo_key;
+ crypto_data_t *vo_data;
+ crypto_data_t *vo_signature;
+ crypto_spi_ctx_template_t vo_templ;
+} kcf_verify_ops_params_t;
+
+typedef struct kcf_encrypt_mac_ops_params {
+ crypto_session_id_t em_sid;
+ crypto_mech_type_t em_framework_encr_mechtype;
+ crypto_mechanism_t em_encr_mech;
+ crypto_key_t *em_encr_key;
+ crypto_mech_type_t em_framework_mac_mechtype;
+ crypto_mechanism_t em_mac_mech;
+ crypto_key_t *em_mac_key;
+ crypto_data_t *em_plaintext;
+ crypto_dual_data_t *em_ciphertext;
+ crypto_data_t *em_mac;
+ crypto_spi_ctx_template_t em_encr_templ;
+ crypto_spi_ctx_template_t em_mac_templ;
+} kcf_encrypt_mac_ops_params_t;
+
+typedef struct kcf_mac_decrypt_ops_params {
+ crypto_session_id_t md_sid;
+ crypto_mech_type_t md_framework_mac_mechtype;
+ crypto_mechanism_t md_mac_mech;
+ crypto_key_t *md_mac_key;
+ crypto_mech_type_t md_framework_decr_mechtype;
+ crypto_mechanism_t md_decr_mech;
+ crypto_key_t *md_decr_key;
+ crypto_dual_data_t *md_ciphertext;
+ crypto_data_t *md_mac;
+ crypto_data_t *md_plaintext;
+ crypto_spi_ctx_template_t md_mac_templ;
+ crypto_spi_ctx_template_t md_decr_templ;
+} kcf_mac_decrypt_ops_params_t;
+
+typedef struct kcf_random_number_ops_params {
+ crypto_session_id_t rn_sid;
+ uchar_t *rn_buf;
+ size_t rn_buflen;
+ uint_t rn_entropy_est;
+ uint32_t rn_flags;
+} kcf_random_number_ops_params_t;
+
+/*
+ * so_pd is useful when the provider descriptor (pd) supplying the
+ * provider handle is different from the pd supplying the ops vector.
+ * This is the case for session open/close where so_pd can be the pd
+ * of a logical provider. The pd supplying the ops vector is passed
+ * as an argument to kcf_submit_request().
+ */
+typedef struct kcf_session_ops_params {
+ crypto_session_id_t *so_sid_ptr;
+ crypto_session_id_t so_sid;
+ crypto_user_type_t so_user_type;
+ char *so_pin;
+ size_t so_pin_len;
+ kcf_provider_desc_t *so_pd;
+} kcf_session_ops_params_t;
+
+typedef struct kcf_object_ops_params {
+ crypto_session_id_t oo_sid;
+ crypto_object_id_t oo_object_id;
+ crypto_object_attribute_t *oo_template;
+ uint_t oo_attribute_count;
+ crypto_object_id_t *oo_object_id_ptr;
+ size_t *oo_object_size;
+ void **oo_find_init_pp_ptr;
+ void *oo_find_pp;
+ uint_t oo_max_object_count;
+ uint_t *oo_object_count_ptr;
+} kcf_object_ops_params_t;
+
+/*
+ * ko_key is used to encode the wrapping key in key_wrap() and the
+ * unwrapping key in key_unwrap(). ko_key_template and
+ * ko_key_attribute_count are used to encode the public template
+ * and the public template attribute count in key_generate_pair().
+ * kops->ko_key_object_id_ptr is used to encode the public key
+ * in key_generate_pair().
+ */
+typedef struct kcf_key_ops_params {
+ crypto_session_id_t ko_sid;
+ crypto_mech_type_t ko_framework_mechtype;
+ crypto_mechanism_t ko_mech;
+ crypto_object_attribute_t *ko_key_template;
+ uint_t ko_key_attribute_count;
+ crypto_object_id_t *ko_key_object_id_ptr;
+ crypto_object_attribute_t *ko_private_key_template;
+ uint_t ko_private_key_attribute_count;
+ crypto_object_id_t *ko_private_key_object_id_ptr;
+ crypto_key_t *ko_key;
+ uchar_t *ko_wrapped_key;
+ size_t *ko_wrapped_key_len_ptr;
+ crypto_object_attribute_t *ko_out_template1;
+ crypto_object_attribute_t *ko_out_template2;
+ uint_t ko_out_attribute_count1;
+ uint_t ko_out_attribute_count2;
+} kcf_key_ops_params_t;
+
+/*
+ * po_pin and po_pin_len are used to encode new_pin and new_pin_len
+ * when wrapping set_pin() function parameters.
+ *
+ * po_pd is useful when the provider descriptor (pd) supplying the
+ * provider handle is different from the pd supplying the ops vector.
+ * This is true for the ext_info provider entry point where po_pd
+ * can be the pd of a logical provider. The pd supplying the ops vector
+ * is passed as an argument to kcf_submit_request().
+ */
+typedef struct kcf_provmgmt_ops_params {
+ crypto_session_id_t po_sid;
+ char *po_pin;
+ size_t po_pin_len;
+ char *po_old_pin;
+ size_t po_old_pin_len;
+ char *po_label;
+ crypto_provider_ext_info_t *po_ext_info;
+ kcf_provider_desc_t *po_pd;
+} kcf_provmgmt_ops_params_t;
+
+/*
+ * The operation type within a function group.
+ */
+typedef enum kcf_op_type {
+ /* common ops for all mechanisms */
+ KCF_OP_INIT = 1,
+	KCF_OP_SINGLE,	/* in the PKCS#11 sense; INIT is already done */
+ KCF_OP_UPDATE,
+ KCF_OP_FINAL,
+ KCF_OP_ATOMIC,
+
+ /* digest_key op */
+ KCF_OP_DIGEST_KEY,
+
+ /* mac specific op */
+ KCF_OP_MAC_VERIFY_ATOMIC,
+
+ /* mac/cipher specific op */
+ KCF_OP_MAC_VERIFY_DECRYPT_ATOMIC,
+
+ /* sign_recover ops */
+ KCF_OP_SIGN_RECOVER_INIT,
+ KCF_OP_SIGN_RECOVER,
+ KCF_OP_SIGN_RECOVER_ATOMIC,
+
+ /* verify_recover ops */
+ KCF_OP_VERIFY_RECOVER_INIT,
+ KCF_OP_VERIFY_RECOVER,
+ KCF_OP_VERIFY_RECOVER_ATOMIC,
+
+ /* random number ops */
+ KCF_OP_RANDOM_SEED,
+ KCF_OP_RANDOM_GENERATE,
+
+ /* session management ops */
+ KCF_OP_SESSION_OPEN,
+ KCF_OP_SESSION_CLOSE,
+ KCF_OP_SESSION_LOGIN,
+ KCF_OP_SESSION_LOGOUT,
+
+ /* object management ops */
+ KCF_OP_OBJECT_CREATE,
+ KCF_OP_OBJECT_COPY,
+ KCF_OP_OBJECT_DESTROY,
+ KCF_OP_OBJECT_GET_SIZE,
+ KCF_OP_OBJECT_GET_ATTRIBUTE_VALUE,
+ KCF_OP_OBJECT_SET_ATTRIBUTE_VALUE,
+ KCF_OP_OBJECT_FIND_INIT,
+ KCF_OP_OBJECT_FIND,
+ KCF_OP_OBJECT_FIND_FINAL,
+
+ /* key management ops */
+ KCF_OP_KEY_GENERATE,
+ KCF_OP_KEY_GENERATE_PAIR,
+ KCF_OP_KEY_WRAP,
+ KCF_OP_KEY_UNWRAP,
+ KCF_OP_KEY_DERIVE,
+ KCF_OP_KEY_CHECK,
+
+ /* provider management ops */
+ KCF_OP_MGMT_EXTINFO,
+ KCF_OP_MGMT_INITTOKEN,
+ KCF_OP_MGMT_INITPIN,
+ KCF_OP_MGMT_SETPIN
+} kcf_op_type_t;
+
+/*
+ * The operation groups that need wrapping of parameters. This is somewhat
+ * similar to the function group type in spi.h except that this also includes
+ * all the functions that don't have a mechanism.
+ *
+ * The wrapper macros should never take these enum values as an argument.
+ * Rather, they are assigned in the macro itself since they are known
+ * from the macro name.
+ */
+typedef enum kcf_op_group {
+ KCF_OG_DIGEST = 1,
+ KCF_OG_MAC,
+ KCF_OG_ENCRYPT,
+ KCF_OG_DECRYPT,
+ KCF_OG_SIGN,
+ KCF_OG_VERIFY,
+ KCF_OG_ENCRYPT_MAC,
+ KCF_OG_MAC_DECRYPT,
+ KCF_OG_RANDOM,
+ KCF_OG_SESSION,
+ KCF_OG_OBJECT,
+ KCF_OG_KEY,
+ KCF_OG_PROVMGMT,
+ KCF_OG_NOSTORE_KEY
+} kcf_op_group_t;
+
+/*
+ * The kcf_op_type_t enum values used here should be only for those
+ * operations for which there is a k-api routine in sys/crypto/api.h.
+ */
+#define IS_INIT_OP(ftype) ((ftype) == KCF_OP_INIT)
+#define IS_SINGLE_OP(ftype) ((ftype) == KCF_OP_SINGLE)
+#define IS_UPDATE_OP(ftype) ((ftype) == KCF_OP_UPDATE)
+#define IS_FINAL_OP(ftype) ((ftype) == KCF_OP_FINAL)
+#define IS_ATOMIC_OP(ftype) ( \
+ (ftype) == KCF_OP_ATOMIC || (ftype) == KCF_OP_MAC_VERIFY_ATOMIC || \
+ (ftype) == KCF_OP_MAC_VERIFY_DECRYPT_ATOMIC || \
+ (ftype) == KCF_OP_SIGN_RECOVER_ATOMIC || \
+ (ftype) == KCF_OP_VERIFY_RECOVER_ATOMIC)
+
+/*
+ * Keep the parameters associated with a request around.
+ * We need to pass them to the SPI.
+ */
+typedef struct kcf_req_params {
+ kcf_op_group_t rp_opgrp;
+ kcf_op_type_t rp_optype;
+
+ union {
+ kcf_digest_ops_params_t digest_params;
+ kcf_mac_ops_params_t mac_params;
+ kcf_encrypt_ops_params_t encrypt_params;
+ kcf_decrypt_ops_params_t decrypt_params;
+ kcf_sign_ops_params_t sign_params;
+ kcf_verify_ops_params_t verify_params;
+ kcf_encrypt_mac_ops_params_t encrypt_mac_params;
+ kcf_mac_decrypt_ops_params_t mac_decrypt_params;
+ kcf_random_number_ops_params_t random_number_params;
+ kcf_session_ops_params_t session_params;
+ kcf_object_ops_params_t object_params;
+ kcf_key_ops_params_t key_params;
+ kcf_provmgmt_ops_params_t provmgmt_params;
+ } rp_u;
+} kcf_req_params_t;
+
+
+/*
+ * The ioctl/k-api code should bundle the parameters into a kcf_req_params_t
+ * structure before calling a scheduler routine. The following macros are
+ * available for that purpose.
+ *
+ * For the most part, the macro arguments closely correspond to the
+ * function parameters. In some cases, we use generic names. The comments
+ * for the structure should indicate these cases.
+ */
+#define KCF_WRAP_DIGEST_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _data, _digest) { \
+ kcf_digest_ops_params_t *dops = &(req)->rp_u.digest_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_DIGEST; \
+ (req)->rp_optype = ftype; \
+ dops->do_sid = _sid; \
+ if (mechp != NULL) { \
+ dops->do_mech = *mechp; \
+ dops->do_framework_mechtype = mechp->cm_type; \
+ } \
+ dops->do_digest_key = _key; \
+ dops->do_data = _data; \
+ dops->do_digest = _digest; \
+}
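+
+/*
+ * Illustrative sketch (not part of the original interface): a k-api
+ * routine such as a one-shot digest would typically bundle its arguments
+ * with the macro above and hand them to the scheduler; the provider
+ * descriptor (pd), session id (sid) and call request (crq) are assumed
+ * to come from the usual lookup helpers.
+ *
+ *	kcf_req_params_t params;
+ *
+ *	KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, &mech,
+ *	    NULL, data, digest);
+ *	error = kcf_submit_request(pd, NULL, crq, &params, B_FALSE);
+ */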
+
+#define KCF_WRAP_MAC_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _data, _mac, _templ) { \
+ kcf_mac_ops_params_t *mops = &(req)->rp_u.mac_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_MAC; \
+ (req)->rp_optype = ftype; \
+ mops->mo_sid = _sid; \
+ if (mechp != NULL) { \
+ mops->mo_mech = *mechp; \
+ mops->mo_framework_mechtype = mechp->cm_type; \
+ } \
+ mops->mo_key = _key; \
+ mops->mo_data = _data; \
+ mops->mo_mac = _mac; \
+ mops->mo_templ = _templ; \
+}
+
+#define KCF_WRAP_ENCRYPT_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _plaintext, _ciphertext, _templ) { \
+ kcf_encrypt_ops_params_t *cops = &(req)->rp_u.encrypt_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_ENCRYPT; \
+ (req)->rp_optype = ftype; \
+ cops->eo_sid = _sid; \
+ if (mechp != NULL) { \
+ cops->eo_mech = *mechp; \
+ cops->eo_framework_mechtype = mechp->cm_type; \
+ } \
+ cops->eo_key = _key; \
+ cops->eo_plaintext = _plaintext; \
+ cops->eo_ciphertext = _ciphertext; \
+ cops->eo_templ = _templ; \
+}
+
+#define KCF_WRAP_DECRYPT_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _ciphertext, _plaintext, _templ) { \
+ kcf_decrypt_ops_params_t *cops = &(req)->rp_u.decrypt_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_DECRYPT; \
+ (req)->rp_optype = ftype; \
+ cops->dop_sid = _sid; \
+ if (mechp != NULL) { \
+ cops->dop_mech = *mechp; \
+ cops->dop_framework_mechtype = mechp->cm_type; \
+ } \
+ cops->dop_key = _key; \
+ cops->dop_ciphertext = _ciphertext; \
+ cops->dop_plaintext = _plaintext; \
+ cops->dop_templ = _templ; \
+}
+
+#define KCF_WRAP_SIGN_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _data, _signature, _templ) { \
+ kcf_sign_ops_params_t *sops = &(req)->rp_u.sign_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_SIGN; \
+ (req)->rp_optype = ftype; \
+ sops->so_sid = _sid; \
+ if (mechp != NULL) { \
+ sops->so_mech = *mechp; \
+ sops->so_framework_mechtype = mechp->cm_type; \
+ } \
+ sops->so_key = _key; \
+ sops->so_data = _data; \
+ sops->so_signature = _signature; \
+ sops->so_templ = _templ; \
+}
+
+#define KCF_WRAP_VERIFY_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _data, _signature, _templ) { \
+ kcf_verify_ops_params_t *vops = &(req)->rp_u.verify_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_VERIFY; \
+ (req)->rp_optype = ftype; \
+ vops->vo_sid = _sid; \
+ if (mechp != NULL) { \
+ vops->vo_mech = *mechp; \
+ vops->vo_framework_mechtype = mechp->cm_type; \
+ } \
+ vops->vo_key = _key; \
+ vops->vo_data = _data; \
+ vops->vo_signature = _signature; \
+ vops->vo_templ = _templ; \
+}
+
+#define KCF_WRAP_ENCRYPT_MAC_OPS_PARAMS(req, ftype, _sid, _encr_key, \
+ _mac_key, _plaintext, _ciphertext, _mac, _encr_templ, _mac_templ) { \
+ kcf_encrypt_mac_ops_params_t *cmops = &(req)->rp_u.encrypt_mac_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_ENCRYPT_MAC; \
+ (req)->rp_optype = ftype; \
+ cmops->em_sid = _sid; \
+ cmops->em_encr_key = _encr_key; \
+ cmops->em_mac_key = _mac_key; \
+ cmops->em_plaintext = _plaintext; \
+ cmops->em_ciphertext = _ciphertext; \
+ cmops->em_mac = _mac; \
+ cmops->em_encr_templ = _encr_templ; \
+ cmops->em_mac_templ = _mac_templ; \
+}
+
+#define KCF_WRAP_MAC_DECRYPT_OPS_PARAMS(req, ftype, _sid, _mac_key, \
+ _decr_key, _ciphertext, _mac, _plaintext, _mac_templ, _decr_templ) { \
+ kcf_mac_decrypt_ops_params_t *cmops = &(req)->rp_u.mac_decrypt_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_MAC_DECRYPT; \
+ (req)->rp_optype = ftype; \
+ cmops->md_sid = _sid; \
+ cmops->md_mac_key = _mac_key; \
+ cmops->md_decr_key = _decr_key; \
+ cmops->md_ciphertext = _ciphertext; \
+ cmops->md_mac = _mac; \
+ cmops->md_plaintext = _plaintext; \
+ cmops->md_mac_templ = _mac_templ; \
+ cmops->md_decr_templ = _decr_templ; \
+}
+
+#define KCF_WRAP_RANDOM_OPS_PARAMS(req, ftype, _sid, _buf, _buflen, \
+ _est, _flags) { \
+ kcf_random_number_ops_params_t *rops = \
+ &(req)->rp_u.random_number_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_RANDOM; \
+ (req)->rp_optype = ftype; \
+ rops->rn_sid = _sid; \
+ rops->rn_buf = _buf; \
+ rops->rn_buflen = _buflen; \
+ rops->rn_entropy_est = _est; \
+ rops->rn_flags = _flags; \
+}
+
+#define KCF_WRAP_SESSION_OPS_PARAMS(req, ftype, _sid_ptr, _sid, \
+ _user_type, _pin, _pin_len, _pd) { \
+ kcf_session_ops_params_t *sops = &(req)->rp_u.session_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_SESSION; \
+ (req)->rp_optype = ftype; \
+ sops->so_sid_ptr = _sid_ptr; \
+ sops->so_sid = _sid; \
+ sops->so_user_type = _user_type; \
+ sops->so_pin = _pin; \
+ sops->so_pin_len = _pin_len; \
+ sops->so_pd = _pd; \
+}
+
+#define KCF_WRAP_OBJECT_OPS_PARAMS(req, ftype, _sid, _object_id, \
+ _template, _attribute_count, _object_id_ptr, _object_size, \
+ _find_init_pp_ptr, _find_pp, _max_object_count, _object_count_ptr) { \
+ kcf_object_ops_params_t *jops = &(req)->rp_u.object_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_OBJECT; \
+ (req)->rp_optype = ftype; \
+ jops->oo_sid = _sid; \
+ jops->oo_object_id = _object_id; \
+ jops->oo_template = _template; \
+ jops->oo_attribute_count = _attribute_count; \
+ jops->oo_object_id_ptr = _object_id_ptr; \
+ jops->oo_object_size = _object_size; \
+ jops->oo_find_init_pp_ptr = _find_init_pp_ptr; \
+ jops->oo_find_pp = _find_pp; \
+ jops->oo_max_object_count = _max_object_count; \
+ jops->oo_object_count_ptr = _object_count_ptr; \
+}
+
+#define KCF_WRAP_KEY_OPS_PARAMS(req, ftype, _sid, _mech, _key_template, \
+ _key_attribute_count, _key_object_id_ptr, _private_key_template, \
+ _private_key_attribute_count, _private_key_object_id_ptr, \
+ _key, _wrapped_key, _wrapped_key_len_ptr) { \
+ kcf_key_ops_params_t *kops = &(req)->rp_u.key_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_KEY; \
+ (req)->rp_optype = ftype; \
+ kops->ko_sid = _sid; \
+ if (mechp != NULL) { \
+ kops->ko_mech = *mechp; \
+ kops->ko_framework_mechtype = mechp->cm_type; \
+ } \
+ kops->ko_key_template = _key_template; \
+ kops->ko_key_attribute_count = _key_attribute_count; \
+ kops->ko_key_object_id_ptr = _key_object_id_ptr; \
+ kops->ko_private_key_template = _private_key_template; \
+ kops->ko_private_key_attribute_count = _private_key_attribute_count; \
+ kops->ko_private_key_object_id_ptr = _private_key_object_id_ptr; \
+ kops->ko_key = _key; \
+ kops->ko_wrapped_key = _wrapped_key; \
+ kops->ko_wrapped_key_len_ptr = _wrapped_key_len_ptr; \
+}
+
+#define KCF_WRAP_PROVMGMT_OPS_PARAMS(req, ftype, _sid, _old_pin, \
+ _old_pin_len, _pin, _pin_len, _label, _ext_info, _pd) { \
+ kcf_provmgmt_ops_params_t *pops = &(req)->rp_u.provmgmt_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_PROVMGMT; \
+ (req)->rp_optype = ftype; \
+ pops->po_sid = _sid; \
+ pops->po_pin = _pin; \
+ pops->po_pin_len = _pin_len; \
+ pops->po_old_pin = _old_pin; \
+ pops->po_old_pin_len = _old_pin_len; \
+ pops->po_label = _label; \
+ pops->po_ext_info = _ext_info; \
+ pops->po_pd = _pd; \
+}
+
+#define KCF_WRAP_NOSTORE_KEY_OPS_PARAMS(req, ftype, _sid, _mech, \
+ _key_template, _key_attribute_count, _private_key_template, \
+ _private_key_attribute_count, _key, _out_template1, \
+ _out_attribute_count1, _out_template2, _out_attribute_count2) { \
+ kcf_key_ops_params_t *kops = &(req)->rp_u.key_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_NOSTORE_KEY; \
+ (req)->rp_optype = ftype; \
+ kops->ko_sid = _sid; \
+ if (mechp != NULL) { \
+ kops->ko_mech = *mechp; \
+ kops->ko_framework_mechtype = mechp->cm_type; \
+ } \
+ kops->ko_key_template = _key_template; \
+ kops->ko_key_attribute_count = _key_attribute_count; \
+ kops->ko_key_object_id_ptr = NULL; \
+ kops->ko_private_key_template = _private_key_template; \
+ kops->ko_private_key_attribute_count = _private_key_attribute_count; \
+ kops->ko_private_key_object_id_ptr = NULL; \
+ kops->ko_key = _key; \
+ kops->ko_wrapped_key = NULL; \
+ kops->ko_wrapped_key_len_ptr = 0; \
+ kops->ko_out_template1 = _out_template1; \
+ kops->ko_out_template2 = _out_template2; \
+ kops->ko_out_attribute_count1 = _out_attribute_count1; \
+ kops->ko_out_attribute_count2 = _out_attribute_count2; \
+}
+
+#define KCF_SET_PROVIDER_MECHNUM(fmtype, pd, mechp) \
+ (mechp)->cm_type = \
+ KCF_TO_PROV_MECHNUM(pd, fmtype);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_OPS_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/sched_impl.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/sched_impl.h
new file mode 100644
index 000000000000..85ea0ba1d092
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/sched_impl.h
@@ -0,0 +1,531 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_SCHED_IMPL_H
+#define _SYS_CRYPTO_SCHED_IMPL_H
+
+/*
+ * Scheduler internal structures.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/ops_impl.h>
+
+typedef void (kcf_func_t)(void *, int);
+
+typedef enum kcf_req_status {
+ REQ_ALLOCATED = 1,
+ REQ_WAITING, /* At the framework level */
+ REQ_INPROGRESS, /* At the provider level */
+ REQ_DONE,
+ REQ_CANCELED
+} kcf_req_status_t;
+
+typedef enum kcf_call_type {
+ CRYPTO_SYNCH = 1,
+ CRYPTO_ASYNCH
+} kcf_call_type_t;
+
+#define CHECK_RESTRICT(crq) (crq != NULL && \
+ ((crq)->cr_flag & CRYPTO_RESTRICTED))
+
+#define CHECK_RESTRICT_FALSE B_FALSE
+
+#define	CHECK_FASTPATH(crq, pd)	(((crq) == NULL || \
+	!((crq)->cr_flag & CRYPTO_ALWAYS_QUEUE)) && \
+	(pd)->pd_prov_type == CRYPTO_SW_PROVIDER)
+
+#define KCF_KMFLAG(crq) (((crq) == NULL) ? KM_SLEEP : KM_NOSLEEP)
+
+/*
+ * The framework keeps an internal handle to use in the adaptive
+ * asynchronous case. This is the case when a client has the
+ * CRYPTO_ALWAYS_QUEUE bit clear and a software provider is used for
+ * the request. The request is completed in the context of the calling
+ * thread and kernel memory must be allocated with KM_NOSLEEP.
+ *
+ * The framework passes a pointer to the handle in crypto_req_handle_t
+ * argument when it calls the SPI of the software provider. The macros
+ * KCF_RHNDL() and KCF_SWFP_RHNDL() are used to do this.
+ *
+ * When a provider asks the framework for the kmflag value via
+ * crypto_kmflag(9S), we use the REQHNDL2_KMFLAG() macro.
+ */
+extern ulong_t kcf_swprov_hndl;
+#define KCF_RHNDL(kmflag) (((kmflag) == KM_SLEEP) ? NULL : &kcf_swprov_hndl)
+#define KCF_SWFP_RHNDL(crq) (((crq) == NULL) ? NULL : &kcf_swprov_hndl)
+#define REQHNDL2_KMFLAG(rhndl) \
+ ((rhndl == &kcf_swprov_hndl) ? KM_NOSLEEP : KM_SLEEP)
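+
+/*
+ * Illustrative sketch (assumption, not upstream code): a software
+ * provider entry point that allocates memory would recover the kmem
+ * flag from the request handle it was given; statesize is a
+ * hypothetical size.
+ *
+ *	int kmflag = crypto_kmflag(req);
+ *	void *state = kmem_alloc(statesize, kmflag);
+ *
+ *	if (state == NULL)
+ *		return (CRYPTO_HOST_MEMORY);
+ */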
+
+/* Internal call_req flags. They start after the public ones in api.h */
+
+#define CRYPTO_SETDUAL 0x00001000 /* Set the 'cont' boolean before */
+ /* submitting the request */
+#define KCF_ISDUALREQ(crq) \
+ (((crq) == NULL) ? B_FALSE : (crq->cr_flag & CRYPTO_SETDUAL))
+
+typedef struct kcf_prov_tried {
+ kcf_provider_desc_t *pt_pd;
+ struct kcf_prov_tried *pt_next;
+} kcf_prov_tried_t;
+
+#define IS_FG_SUPPORTED(mdesc, fg) \
+ (((mdesc)->pm_mech_info.cm_func_group_mask & (fg)) != 0)
+
+#define IS_PROVIDER_TRIED(pd, tlist) \
+ (tlist != NULL && is_in_triedlist(pd, tlist))
+
+#define IS_RECOVERABLE(error) \
+ (error == CRYPTO_BUFFER_TOO_BIG || \
+ error == CRYPTO_BUSY || \
+ error == CRYPTO_DEVICE_ERROR || \
+ error == CRYPTO_DEVICE_MEMORY || \
+ error == CRYPTO_KEY_SIZE_RANGE || \
+ error == CRYPTO_NO_PERMISSION)
+
+#define KCF_ATOMIC_INCR(x) atomic_add_32(&(x), 1)
+#define KCF_ATOMIC_DECR(x) atomic_add_32(&(x), -1)
+
+/*
+ * Node structure for synchronous requests.
+ */
+typedef struct kcf_sreq_node {
+ /* Should always be the first field in this structure */
+ kcf_call_type_t sn_type;
+ /*
+	 * sn_cv and sn_lock are used to wait for the
+ * operation to complete. sn_lock also protects
+ * the sn_state field.
+ */
+ kcondvar_t sn_cv;
+ kmutex_t sn_lock;
+ kcf_req_status_t sn_state;
+
+ /*
+ * Return value from the operation. This will be
+ * one of the CRYPTO_* errors defined in common.h.
+ */
+ int sn_rv;
+
+ /*
+ * parameters to call the SPI with. This can be
+ * a pointer as we know the caller context/stack stays.
+ */
+ struct kcf_req_params *sn_params;
+
+ /* Internal context for this request */
+ struct kcf_context *sn_context;
+
+ /* Provider handling this request */
+ kcf_provider_desc_t *sn_provider;
+} kcf_sreq_node_t;
+
+/*
+ * Node structure for asynchronous requests. A node can be on
+ * a chain of requests hanging off the internal context
+ * structure and can be in the global software provider queue.
+ */
+typedef struct kcf_areq_node {
+ /* Should always be the first field in this structure */
+ kcf_call_type_t an_type;
+
+ /* an_lock protects the field an_state */
+ kmutex_t an_lock;
+ kcf_req_status_t an_state;
+ crypto_call_req_t an_reqarg;
+
+ /*
+ * parameters to call the SPI with. We need to
+ * save the params since the caller stack can go away.
+ */
+ struct kcf_req_params an_params;
+
+ /*
+ * The next two fields should be NULL for operations that
+ * don't need a context.
+ */
+ /* Internal context for this request */
+ struct kcf_context *an_context;
+
+ /* next in chain of requests for context */
+ struct kcf_areq_node *an_ctxchain_next;
+
+ kcondvar_t an_turn_cv;
+ boolean_t an_is_my_turn;
+ boolean_t an_isdual; /* for internal reuse */
+
+ /*
+ * Next and previous nodes in the global software
+ * queue. These fields are NULL for a hardware
+ * provider since we use a taskq there.
+ */
+ struct kcf_areq_node *an_next;
+ struct kcf_areq_node *an_prev;
+
+ /* Provider handling this request */
+ kcf_provider_desc_t *an_provider;
+ kcf_prov_tried_t *an_tried_plist;
+
+ struct kcf_areq_node *an_idnext; /* Next in ID hash */
+ struct kcf_areq_node *an_idprev; /* Prev in ID hash */
+ kcondvar_t an_done; /* Signal request completion */
+ uint_t an_refcnt;
+} kcf_areq_node_t;
+
+#define KCF_AREQ_REFHOLD(areq) { \
+ atomic_add_32(&(areq)->an_refcnt, 1); \
+ ASSERT((areq)->an_refcnt != 0); \
+}
+
+#define KCF_AREQ_REFRELE(areq) { \
+ ASSERT((areq)->an_refcnt != 0); \
+ membar_exit(); \
+ if (atomic_add_32_nv(&(areq)->an_refcnt, -1) == 0) \
+ kcf_free_req(areq); \
+}
+
+#define GET_REQ_TYPE(arg) *((kcf_call_type_t *)(arg))
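+
+/*
+ * Illustrative note (assumption): because sn_type and an_type are the
+ * first members of their respective nodes, a completion path that only
+ * has an opaque request pointer can dispatch on the call type, e.g.:
+ *
+ *	if (GET_REQ_TYPE(arg) == CRYPTO_SYNCH)
+ *		kcf_sop_done((kcf_sreq_node_t *)arg, error);
+ *	else
+ *		kcf_aop_done((kcf_areq_node_t *)arg, error);
+ */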
+
+#define NOTIFY_CLIENT(areq, err) (*(areq)->an_reqarg.cr_callback_func)(\
+ (areq)->an_reqarg.cr_callback_arg, err);
+
+/* For internally generated call requests for dual operations */
+typedef struct kcf_call_req {
+ crypto_call_req_t kr_callreq; /* external client call req */
+ kcf_req_params_t kr_params; /* Params saved for next call */
+ kcf_areq_node_t *kr_areq; /* Use this areq */
+ off_t kr_saveoffset;
+ size_t kr_savelen;
+} kcf_dual_req_t;
+
+/*
+ * The following are somewhat similar to the macros in callo.h, which
+ * implement callout tables.
+ *
+ * The lower four bits of the ID encode the table to index into. The
+ * REQID_COUNTER_HIGH bit is used to avoid any check for wrap-around when
+ * generating an ID. We assume that no request stays outstanding longer
+ * than it takes to submit 2^(8 * sizeof (long) - 5) requests after it.
+ * This ensures there won't be any ID collision.
+ */
+#define REQID_COUNTER_HIGH (1UL << (8 * sizeof (long) - 1))
+#define REQID_COUNTER_SHIFT 4
+#define REQID_COUNTER_LOW (1 << REQID_COUNTER_SHIFT)
+#define REQID_TABLES 16
+#define REQID_TABLE_MASK (REQID_TABLES - 1)
+
+#define REQID_BUCKETS 512
+#define REQID_BUCKET_MASK (REQID_BUCKETS - 1)
+#define REQID_HASH(id) (((id) >> REQID_COUNTER_SHIFT) & REQID_BUCKET_MASK)
+
+#define GET_REQID(areq) (areq)->an_reqarg.cr_reqid
+#define SET_REQID(areq, val) GET_REQID(areq) = val
+
+/*
+ * Hash table for async requests.
+ */
+typedef struct kcf_reqid_table {
+ kmutex_t rt_lock;
+ crypto_req_id_t rt_curid;
+ kcf_areq_node_t *rt_idhash[REQID_BUCKETS];
+} kcf_reqid_table_t;
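+
+/*
+ * Illustrative sketch (assumption): given an ID generated with the
+ * scheme above, the owning table and hash bucket are recovered as
+ * follows; kcf_reqid_table[] stands in for the scheduler's array of
+ * per-table pointers.
+ *
+ *	kcf_reqid_table_t *rt = kcf_reqid_table[id & REQID_TABLE_MASK];
+ *	kcf_areq_node_t *areq = rt->rt_idhash[REQID_HASH(id)];
+ */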
+
+/*
+ * Global software provider queue structure. Requests to be
+ * handled by a SW provider and have the ALWAYS_QUEUE flag set
+ * get queued here.
+ */
+typedef struct kcf_global_swq {
+ /*
+ * gs_cv and gs_lock are used to wait for new requests.
+ * gs_lock protects the changes to the queue.
+ */
+ kcondvar_t gs_cv;
+ kmutex_t gs_lock;
+ uint_t gs_njobs;
+ uint_t gs_maxjobs;
+ kcf_areq_node_t *gs_first;
+ kcf_areq_node_t *gs_last;
+} kcf_global_swq_t;
+
+
+/*
+ * Internal representation of a canonical context. The crypto_ctx_t
+ * structure is embedded so that only one memory allocation is needed.
+ * The SPI's ((crypto_ctx_t *)ctx)->cc_framework_private maps to this
+ * structure.
+ */
+typedef struct kcf_context {
+ crypto_ctx_t kc_glbl_ctx;
+ uint_t kc_refcnt;
+ kmutex_t kc_in_use_lock;
+ /*
+ * kc_req_chain_first and kc_req_chain_last are used to chain
+ * multiple async requests using the same context. They should be
+ * NULL for sync requests.
+ */
+ kcf_areq_node_t *kc_req_chain_first;
+ kcf_areq_node_t *kc_req_chain_last;
+ kcf_provider_desc_t *kc_prov_desc; /* Prov. descriptor */
+ kcf_provider_desc_t *kc_sw_prov_desc; /* Prov. descriptor */
+ kcf_mech_entry_t *kc_mech;
+ struct kcf_context *kc_secondctx; /* for dual contexts */
+} kcf_context_t;
+
+/*
+ * Bump up the reference count on the framework private context. A
+ * global context or a request that references this structure should
+ * do a hold.
+ */
+#define KCF_CONTEXT_REFHOLD(ictx) { \
+ atomic_add_32(&(ictx)->kc_refcnt, 1); \
+ ASSERT((ictx)->kc_refcnt != 0); \
+}
+
+/*
+ * Decrement the reference count on the framework private context.
+ * When the last reference is released, the framework private
+ * context structure is freed along with the global context.
+ */
+#define KCF_CONTEXT_REFRELE(ictx) { \
+ ASSERT((ictx)->kc_refcnt != 0); \
+ membar_exit(); \
+ if (atomic_add_32_nv(&(ictx)->kc_refcnt, -1) == 0) \
+ kcf_free_context(ictx); \
+}
+
+/*
+ * Check if we can release the context now. In the case of CRYPTO_QUEUED,
+ * we do not release it, since we can do so only after the provider has
+ * notified us. In the case of CRYPTO_BUSY, the client can retry the
+ * request using the context, so we do not release the context.
+ *
+ * This macro should be called only from the final routine in
+ * an init/update/final sequence. We do not release the context in case
+ * of update operations. We require the consumer to free it
+ * explicitly, in case it wants to abandon the operation. This is done
+ * as there may be mechanisms in ECB mode that can continue even if
+ * an operation on a block fails.
+ */
+#define KCF_CONTEXT_COND_RELEASE(rv, kcf_ctx) { \
+ if (KCF_CONTEXT_DONE(rv)) \
+ KCF_CONTEXT_REFRELE(kcf_ctx); \
+}
+
+/*
+ * This macro determines whether we're done with a context.
+ */
+#define KCF_CONTEXT_DONE(rv) \
+ ((rv) != CRYPTO_QUEUED && (rv) != CRYPTO_BUSY && \
+ (rv) != CRYPTO_BUFFER_TOO_SMALL)
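+
+/*
+ * Illustrative sketch (assumption): a k-api *_final() routine would
+ * typically apply the conditional release right after submitting the
+ * request, so the context survives CRYPTO_QUEUED, CRYPTO_BUSY and
+ * CRYPTO_BUFFER_TOO_SMALL returns:
+ *
+ *	error = kcf_submit_request(pd, ctx, crq, &params, B_FALSE);
+ *	KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ *	return (error);
+ */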
+
+/*
+ * A crypto_ctx_template_t is internally a pointer to this struct
+ */
+typedef struct kcf_ctx_template {
+ crypto_kcf_provider_handle_t ct_prov_handle; /* provider handle */
+ uint_t ct_generation; /* generation # */
+ size_t ct_size; /* for freeing */
+ crypto_spi_ctx_template_t ct_prov_tmpl; /* context template */
+ /* from the SW prov */
+} kcf_ctx_template_t;
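+
+/*
+ * Illustrative sketch (assumption): when a caller hands the opaque
+ * crypto_ctx_template_t back to the framework, it is simply cast to
+ * this structure and the provider's private portion is extracted:
+ *
+ *	kcf_ctx_template_t *ctx_tmpl = (kcf_ctx_template_t *)tmpl;
+ *	crypto_spi_ctx_template_t spi_tmpl =
+ *	    (ctx_tmpl == NULL) ? NULL : ctx_tmpl->ct_prov_tmpl;
+ */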
+
+/*
+ * Structure for pool of threads working on global software queue.
+ */
+typedef struct kcf_pool {
+ uint32_t kp_threads; /* Number of threads in pool */
+ uint32_t kp_idlethreads; /* Idle threads in pool */
+ uint32_t kp_blockedthreads; /* Blocked threads in pool */
+
+ /*
+ * cv & lock to monitor the condition when no threads
+ * are around. In this case the failover thread kicks in.
+ */
+ kcondvar_t kp_nothr_cv;
+ kmutex_t kp_thread_lock;
+
+ /* Userspace thread creator variables. */
+ boolean_t kp_signal_create_thread; /* Create requested flag */
+ int kp_nthrs; /* # of threads to create */
+ boolean_t kp_user_waiting; /* Thread waiting for work */
+
+ /*
+ * cv & lock for the condition where more threads need to be
+ * created. kp_user_lock also protects the three fields above.
+ */
+ kcondvar_t kp_user_cv; /* Creator cond. variable */
+ kmutex_t kp_user_lock; /* Creator lock */
+} kcf_pool_t;
+
+
+/*
+ * State of a crypto bufcall element.
+ */
+typedef enum cbuf_state {
+ CBUF_FREE = 1,
+ CBUF_WAITING,
+ CBUF_RUNNING
+} cbuf_state_t;
+
+/*
+ * Structure of a crypto bufcall element.
+ */
+typedef struct kcf_cbuf_elem {
+ /*
+ * lock and cv to wait for CBUF_RUNNING to be done
+ * kc_lock also protects kc_state.
+ */
+ kmutex_t kc_lock;
+ kcondvar_t kc_cv;
+ cbuf_state_t kc_state;
+
+ struct kcf_cbuf_elem *kc_next;
+ struct kcf_cbuf_elem *kc_prev;
+
+ void (*kc_func)(void *arg);
+ void *kc_arg;
+} kcf_cbuf_elem_t;
+
+/*
+ * State of a notify element.
+ */
+typedef enum ntfy_elem_state {
+ NTFY_WAITING = 1,
+ NTFY_RUNNING
+} ntfy_elem_state_t;
+
+/*
+ * Structure of a notify list element.
+ */
+typedef struct kcf_ntfy_elem {
+ /*
+ * lock and cv to wait for NTFY_RUNNING to be done.
+ * kn_lock also protects kn_state.
+ */
+ kmutex_t kn_lock;
+ kcondvar_t kn_cv;
+ ntfy_elem_state_t kn_state;
+
+ struct kcf_ntfy_elem *kn_next;
+ struct kcf_ntfy_elem *kn_prev;
+
+ crypto_notify_callback_t kn_func;
+ uint32_t kn_event_mask;
+} kcf_ntfy_elem_t;
+
+
+/*
+ * The following values are based on the assumption that it would
+ * take around eight CPUs to load a hardware provider (this is true for
+ * at least one product) and that a kernel client may come from different
+ * low-priority interrupt levels. We keep CRYPTO_TASKQ_MIN cached taskq
+ * entries. The CRYPTO_TASKQ_MAX number is based on a throughput of
+ * 1 GB/s using 512-byte buffers. These are just reasonable estimates and
+ * might need to change in the future.
+ */
+#define CRYPTO_TASKQ_THREADS 8
+#define CRYPTO_TASKQ_MIN 64
+#define	CRYPTO_TASKQ_MAX	(2 * 1024 * 1024)
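+
+/*
+ * For reference (assumption about the arithmetic behind the figure):
+ * 1 GB/s of 512-byte buffers is 2^30 / 2^9 = 2 * 1024 * 1024 requests
+ * per second, which is where the CRYPTO_TASKQ_MAX value above comes from.
+ */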
+
+extern int crypto_taskq_threads;
+extern int crypto_taskq_minalloc;
+extern int crypto_taskq_maxalloc;
+extern kcf_global_swq_t *gswq;
+extern int kcf_maxthreads;
+extern int kcf_minthreads;
+
+/*
+ * All pending crypto bufcalls are put on a list. cbuf_list_lock
+ * protects changes to this list.
+ */
+extern kmutex_t cbuf_list_lock;
+extern kcondvar_t cbuf_list_cv;
+
+/*
+ * All event subscribers are put on a list. kcf_notify_list_lock
+ * protects changes to this list.
+ */
+extern kmutex_t ntfy_list_lock;
+extern kcondvar_t ntfy_list_cv;
+
+boolean_t kcf_get_next_logical_provider_member(kcf_provider_desc_t *,
+ kcf_provider_desc_t *, kcf_provider_desc_t **);
+extern int kcf_get_hardware_provider(crypto_mech_type_t, crypto_mech_type_t,
+ boolean_t, kcf_provider_desc_t *, kcf_provider_desc_t **,
+ crypto_func_group_t);
+extern int kcf_get_hardware_provider_nomech(offset_t, offset_t,
+ boolean_t, kcf_provider_desc_t *, kcf_provider_desc_t **);
+extern void kcf_free_triedlist(kcf_prov_tried_t *);
+extern kcf_prov_tried_t *kcf_insert_triedlist(kcf_prov_tried_t **,
+ kcf_provider_desc_t *, int);
+extern kcf_provider_desc_t *kcf_get_mech_provider(crypto_mech_type_t,
+ kcf_mech_entry_t **, int *, kcf_prov_tried_t *, crypto_func_group_t,
+ boolean_t, size_t);
+extern kcf_provider_desc_t *kcf_get_dual_provider(crypto_mechanism_t *,
+ crypto_mechanism_t *, kcf_mech_entry_t **, crypto_mech_type_t *,
+ crypto_mech_type_t *, int *, kcf_prov_tried_t *,
+ crypto_func_group_t, crypto_func_group_t, boolean_t, size_t);
+extern crypto_ctx_t *kcf_new_ctx(crypto_call_req_t *, kcf_provider_desc_t *,
+ crypto_session_id_t);
+extern int kcf_submit_request(kcf_provider_desc_t *, crypto_ctx_t *,
+ crypto_call_req_t *, kcf_req_params_t *, boolean_t);
+extern void kcf_sched_destroy(void);
+extern void kcf_sched_init(void);
+extern void kcf_sched_start(void);
+extern void kcf_sop_done(kcf_sreq_node_t *, int);
+extern void kcf_aop_done(kcf_areq_node_t *, int);
+extern int common_submit_request(kcf_provider_desc_t *,
+ crypto_ctx_t *, kcf_req_params_t *, crypto_req_handle_t);
+extern void kcf_free_context(kcf_context_t *);
+
+extern int kcf_svc_wait(int *);
+extern int kcf_svc_do_run(void);
+extern int kcf_need_signature_verification(kcf_provider_desc_t *);
+extern void kcf_verify_signature(void *);
+extern struct modctl *kcf_get_modctl(crypto_provider_info_t *);
+extern void verify_unverified_providers(void);
+extern void kcf_free_req(kcf_areq_node_t *areq);
+extern void crypto_bufcall_service(void);
+
+extern void kcf_walk_ntfylist(uint32_t, void *);
+extern void kcf_do_notify(kcf_provider_desc_t *, boolean_t);
+
+extern kcf_dual_req_t *kcf_alloc_req(crypto_call_req_t *);
+extern void kcf_next_req(void *, int);
+extern void kcf_last_req(void *, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_SCHED_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/spi.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/spi.h
new file mode 100644
index 000000000000..2c62b5706651
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/spi.h
@@ -0,0 +1,726 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_SPI_H
+#define _SYS_CRYPTO_SPI_H
+
+/*
+ * CSPI: Cryptographic Service Provider Interface.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef CONSTIFY_PLUGIN
+#define __no_const __attribute__((no_const))
+#else
+#define __no_const
+#endif /* CONSTIFY_PLUGIN */
+
+#define CRYPTO_SPI_VERSION_1 1
+#define CRYPTO_SPI_VERSION_2 2
+#define CRYPTO_SPI_VERSION_3 3
+
+/*
+ * Provider-private handle. This handle is specified by a provider
+ * when it registers by means of the pi_provider_handle field of
+ * the crypto_provider_info structure, and passed to the provider
+ * when its entry points are invoked.
+ */
+typedef void *crypto_provider_handle_t;
+
+/*
+ * Context templates can be used by software providers to pre-process
+ * keying material, such as key schedules. They are allocated by
+ * a software provider's create_ctx_template(9E) entry point, and passed
+ * as an argument to initialization and atomic provider entry points.
+ */
+typedef void *crypto_spi_ctx_template_t;
+
+/*
+ * Request handles are used by the kernel to identify an asynchronous
+ * request being processed by a provider. A handle is passed by the kernel
+ * to a hardware provider when submitting a request, and must be
+ * specified by the provider when calling crypto_op_notification(9F).
+ */
+typedef void *crypto_req_handle_t;
+
+/* Values for cc_flags field */
+#define CRYPTO_INIT_OPSTATE 0x00000001 /* allocate and init cc_opstate */
+#define CRYPTO_USE_OPSTATE 0x00000002 /* .. start using it as context */
+
+/*
+ * The context structure is passed from the kernel to a provider.
+ * It contains the information needed to process a multi-part or
+ * single part operation. The context structure is not used
+ * by atomic operations.
+ *
+ * Parameters needed to perform a cryptographic operation, such
+ * as keys, mechanisms, input and output buffers, are passed
+ * as separate arguments to Provider routines.
+ */
+typedef struct crypto_ctx {
+ crypto_provider_handle_t cc_provider;
+ crypto_session_id_t cc_session;
+ void *cc_provider_private; /* owned by provider */
+ void *cc_framework_private; /* owned by framework */
+ uint32_t cc_flags; /* flags */
+ void *cc_opstate; /* state */
+} crypto_ctx_t;
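+
+/*
+ * Illustrative sketch (assumption): a software provider usually stores
+ * its per-operation state behind cc_provider_private in its _init()
+ * entry point and retrieves it in _update()/_final(), e.g.:
+ *
+ *	ctx->cc_provider_private = state;	(in encrypt_init())
+ *	state = ctx->cc_provider_private;	(in encrypt_update())
+ */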
+
+/*
+ * Extended provider information.
+ */
+
+/*
+ * Valid values for the ei_flags field of the extended info structure.
+ * They match the RSA Security, Inc. PKCS#11 tokenInfo flags.
+ */
+#define CRYPTO_EXTF_RNG 0x00000001
+#define CRYPTO_EXTF_WRITE_PROTECTED 0x00000002
+#define CRYPTO_EXTF_LOGIN_REQUIRED 0x00000004
+#define CRYPTO_EXTF_USER_PIN_INITIALIZED 0x00000008
+#define CRYPTO_EXTF_CLOCK_ON_TOKEN 0x00000040
+#define CRYPTO_EXTF_PROTECTED_AUTHENTICATION_PATH 0x00000100
+#define CRYPTO_EXTF_DUAL_CRYPTO_OPERATIONS 0x00000200
+#define CRYPTO_EXTF_TOKEN_INITIALIZED 0x00000400
+#define CRYPTO_EXTF_USER_PIN_COUNT_LOW 0x00010000
+#define CRYPTO_EXTF_USER_PIN_FINAL_TRY 0x00020000
+#define CRYPTO_EXTF_USER_PIN_LOCKED 0x00040000
+#define CRYPTO_EXTF_USER_PIN_TO_BE_CHANGED 0x00080000
+#define CRYPTO_EXTF_SO_PIN_COUNT_LOW 0x00100000
+#define CRYPTO_EXTF_SO_PIN_FINAL_TRY 0x00200000
+#define CRYPTO_EXTF_SO_PIN_LOCKED 0x00400000
+#define CRYPTO_EXTF_SO_PIN_TO_BE_CHANGED 0x00800000
+
+/*
+ * The crypto_control_ops structure contains pointers to control
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_control_ops {
+ void (*provider_status)(crypto_provider_handle_t, uint_t *);
+} __no_const crypto_control_ops_t;
+
+/*
+ * The crypto_ctx_ops structure contains pointers to context and
+ * context-template management operations for cryptographic providers. It is
+ * passed through the crypto_ops(9S) structure when providers register
+ * with the kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_ctx_ops {
+ int (*create_ctx_template)(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t *, size_t *, crypto_req_handle_t);
+ int (*free_context)(crypto_ctx_t *);
+} __no_const crypto_ctx_ops_t;
+
+/*
+ * The crypto_digest_ops structure contains pointers to digest
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_digest_ops {
+ int (*digest_init)(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_req_handle_t);
+ int (*digest)(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+ int (*digest_update)(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+ int (*digest_key)(crypto_ctx_t *, crypto_key_t *, crypto_req_handle_t);
+ int (*digest_final)(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+ int (*digest_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+} __no_const crypto_digest_ops_t;
+
+/*
+ * The crypto_cipher_ops structure contains pointers to encryption
+ * and decryption operations for cryptographic providers. It is
+ * passed through the crypto_ops(9S) structure when providers register
+ * with the kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_cipher_ops {
+ int (*encrypt_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*encrypt)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*encrypt_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*encrypt_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*encrypt_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+ int (*decrypt_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*decrypt)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*decrypt_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*decrypt_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*decrypt_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+} __no_const crypto_cipher_ops_t;
+
+/*
+ * The crypto_mac_ops structure contains pointers to MAC
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_mac_ops {
+ int (*mac_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*mac)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*mac_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*mac_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*mac_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*mac_verify_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+} __no_const crypto_mac_ops_t;
+
+/*
+ * The crypto_sign_ops structure contains pointers to signing
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_sign_ops {
+ int (*sign_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*sign)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*sign_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*sign_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*sign_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*sign_recover_init)(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*sign_recover)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*sign_recover_atomic)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *,
+ crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+} __no_const crypto_sign_ops_t;
+
+/*
+ * The crypto_verify_ops structure contains pointers to verify
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_verify_ops {
+ int (*verify_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*do_verify)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*verify_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*verify_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*verify_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*verify_recover_init)(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*verify_recover)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*verify_recover_atomic)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *,
+ crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+} __no_const crypto_verify_ops_t;
+
+/*
+ * The crypto_dual_ops structure contains pointers to dual
+ * cipher and sign/verify operations for cryptographic providers.
+ * It is passed through the crypto_ops(9S) structure when
+ * providers register with the kernel using
+ * crypto_register_provider(9F).
+ */
+typedef struct crypto_dual_ops {
+ int (*digest_encrypt_update)(
+ crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*decrypt_digest_update)(
+ crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*sign_encrypt_update)(
+ crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*decrypt_verify_update)(
+ crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+} __no_const crypto_dual_ops_t;
+
+/*
+ * The crypto_dual_cipher_mac_ops structure contains pointers to dual
+ * cipher and MAC operations for cryptographic providers.
+ * It is passed through the crypto_ops(9S) structure when
+ * providers register with the kernel using
+ * crypto_register_provider(9F).
+ */
+typedef struct crypto_dual_cipher_mac_ops {
+ int (*encrypt_mac_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*encrypt_mac)(crypto_ctx_t *,
+ crypto_data_t *, crypto_dual_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+ int (*encrypt_mac_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_dual_data_t *, crypto_req_handle_t);
+ int (*encrypt_mac_final)(crypto_ctx_t *,
+ crypto_dual_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*encrypt_mac_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_data_t *, crypto_dual_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+ int (*mac_decrypt_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*mac_decrypt)(crypto_ctx_t *,
+ crypto_dual_data_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+ int (*mac_decrypt_update)(crypto_ctx_t *,
+ crypto_dual_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*mac_decrypt_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*mac_decrypt_atomic)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_dual_data_t *,
+ crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*mac_verify_decrypt_atomic)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_dual_data_t *,
+ crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+} __no_const crypto_dual_cipher_mac_ops_t;
+
+/*
+ * The crypto_random_number_ops structure contains pointers to random
+ * number operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_random_number_ops {
+ int (*seed_random)(crypto_provider_handle_t, crypto_session_id_t,
+ uchar_t *, size_t, uint_t, uint32_t, crypto_req_handle_t);
+ int (*generate_random)(crypto_provider_handle_t, crypto_session_id_t,
+ uchar_t *, size_t, crypto_req_handle_t);
+} __no_const crypto_random_number_ops_t;
+
+/*
+ * Flag values for seed_random.
+ */
+#define CRYPTO_SEED_NOW 0x00000001
+
+/*
+ * The crypto_session_ops structure contains pointers to session
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_session_ops {
+ int (*session_open)(crypto_provider_handle_t, crypto_session_id_t *,
+ crypto_req_handle_t);
+ int (*session_close)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_req_handle_t);
+ int (*session_login)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_user_type_t, char *, size_t, crypto_req_handle_t);
+ int (*session_logout)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_req_handle_t);
+} __no_const crypto_session_ops_t;
+
+/*
+ * The crypto_object_ops structure contains pointers to object
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_object_ops {
+ int (*object_create)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_object_attribute_t *, uint_t, crypto_object_id_t *,
+ crypto_req_handle_t);
+ int (*object_copy)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_object_id_t, crypto_object_attribute_t *, uint_t,
+ crypto_object_id_t *, crypto_req_handle_t);
+ int (*object_destroy)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_object_id_t, crypto_req_handle_t);
+ int (*object_get_size)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_object_id_t, size_t *, crypto_req_handle_t);
+ int (*object_get_attribute_value)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_object_id_t,
+ crypto_object_attribute_t *, uint_t, crypto_req_handle_t);
+ int (*object_set_attribute_value)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_object_id_t,
+ crypto_object_attribute_t *, uint_t, crypto_req_handle_t);
+ int (*object_find_init)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_object_attribute_t *, uint_t, void **,
+ crypto_req_handle_t);
+ int (*object_find)(crypto_provider_handle_t, void *,
+ crypto_object_id_t *, uint_t, uint_t *, crypto_req_handle_t);
+ int (*object_find_final)(crypto_provider_handle_t, void *,
+ crypto_req_handle_t);
+} __no_const crypto_object_ops_t;
+
+/*
+ * The crypto_key_ops structure contains pointers to key
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_key_ops {
+ int (*key_generate)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_object_attribute_t *, uint_t,
+ crypto_object_id_t *, crypto_req_handle_t);
+ int (*key_generate_pair)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_object_attribute_t *, uint_t,
+ crypto_object_attribute_t *, uint_t, crypto_object_id_t *,
+ crypto_object_id_t *, crypto_req_handle_t);
+ int (*key_wrap)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_object_id_t *,
+ uchar_t *, size_t *, crypto_req_handle_t);
+ int (*key_unwrap)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, uchar_t *, size_t *,
+ crypto_object_attribute_t *, uint_t,
+ crypto_object_id_t *, crypto_req_handle_t);
+ int (*key_derive)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_object_attribute_t *,
+ uint_t, crypto_object_id_t *, crypto_req_handle_t);
+ int (*key_check)(crypto_provider_handle_t, crypto_mechanism_t *,
+ crypto_key_t *);
+} __no_const crypto_key_ops_t;
+
+/*
+ * The crypto_provider_management_ops structure contains pointers
+ * to management operations for cryptographic providers. It is passed
+ * through the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_provider_management_ops {
+ int (*ext_info)(crypto_provider_handle_t,
+ crypto_provider_ext_info_t *, crypto_req_handle_t);
+ int (*init_token)(crypto_provider_handle_t, char *, size_t,
+ char *, crypto_req_handle_t);
+ int (*init_pin)(crypto_provider_handle_t, crypto_session_id_t,
+ char *, size_t, crypto_req_handle_t);
+ int (*set_pin)(crypto_provider_handle_t, crypto_session_id_t,
+ char *, size_t, char *, size_t, crypto_req_handle_t);
+} __no_const crypto_provider_management_ops_t;
+
+typedef struct crypto_mech_ops {
+ int (*copyin_mechanism)(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_mechanism_t *, int *, int);
+ int (*copyout_mechanism)(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_mechanism_t *, int *, int);
+ int (*free_mechanism)(crypto_provider_handle_t, crypto_mechanism_t *);
+} __no_const crypto_mech_ops_t;
+
+typedef struct crypto_nostore_key_ops {
+ int (*nostore_key_generate)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *,
+ crypto_object_attribute_t *, uint_t, crypto_object_attribute_t *,
+ uint_t, crypto_req_handle_t);
+ int (*nostore_key_generate_pair)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *,
+ crypto_object_attribute_t *, uint_t, crypto_object_attribute_t *,
+ uint_t, crypto_object_attribute_t *, uint_t,
+ crypto_object_attribute_t *, uint_t, crypto_req_handle_t);
+ int (*nostore_key_derive)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_object_attribute_t *,
+ uint_t, crypto_object_attribute_t *, uint_t, crypto_req_handle_t);
+} __no_const crypto_nostore_key_ops_t;
+
+/*
+ * The crypto_ops(9S) structure contains pointers to the operations
+ * structures that hold the entry points implemented by cryptographic
+ * providers.
+ * It is specified as part of the crypto_provider_info(9S)
+ * supplied by a provider when it registers with the kernel
+ * by calling crypto_register_provider(9F).
+ */
+typedef struct crypto_ops_v1 {
+ crypto_control_ops_t *co_control_ops;
+ crypto_digest_ops_t *co_digest_ops;
+ crypto_cipher_ops_t *co_cipher_ops;
+ crypto_mac_ops_t *co_mac_ops;
+ crypto_sign_ops_t *co_sign_ops;
+ crypto_verify_ops_t *co_verify_ops;
+ crypto_dual_ops_t *co_dual_ops;
+ crypto_dual_cipher_mac_ops_t *co_dual_cipher_mac_ops;
+ crypto_random_number_ops_t *co_random_ops;
+ crypto_session_ops_t *co_session_ops;
+ crypto_object_ops_t *co_object_ops;
+ crypto_key_ops_t *co_key_ops;
+ crypto_provider_management_ops_t *co_provider_ops;
+ crypto_ctx_ops_t *co_ctx_ops;
+} crypto_ops_v1_t;
+
+typedef struct crypto_ops_v2 {
+ crypto_ops_v1_t v1_ops;
+ crypto_mech_ops_t *co_mech_ops;
+} crypto_ops_v2_t;
+
+typedef struct crypto_ops_v3 {
+ crypto_ops_v2_t v2_ops;
+ crypto_nostore_key_ops_t *co_nostore_key_ops;
+} crypto_ops_v3_t;
+
+typedef struct crypto_ops {
+ union {
+ crypto_ops_v3_t cou_v3;
+ crypto_ops_v2_t cou_v2;
+ crypto_ops_v1_t cou_v1;
+ } cou;
+} crypto_ops_t;
+
+#define co_control_ops cou.cou_v1.co_control_ops
+#define co_digest_ops cou.cou_v1.co_digest_ops
+#define co_cipher_ops cou.cou_v1.co_cipher_ops
+#define co_mac_ops cou.cou_v1.co_mac_ops
+#define co_sign_ops cou.cou_v1.co_sign_ops
+#define co_verify_ops cou.cou_v1.co_verify_ops
+#define co_dual_ops cou.cou_v1.co_dual_ops
+#define co_dual_cipher_mac_ops cou.cou_v1.co_dual_cipher_mac_ops
+#define co_random_ops cou.cou_v1.co_random_ops
+#define co_session_ops cou.cou_v1.co_session_ops
+#define co_object_ops cou.cou_v1.co_object_ops
+#define co_key_ops cou.cou_v1.co_key_ops
+#define co_provider_ops cou.cou_v1.co_provider_ops
+#define co_ctx_ops cou.cou_v1.co_ctx_ops
+#define co_mech_ops cou.cou_v2.co_mech_ops
+#define co_nostore_key_ops cou.cou_v3.co_nostore_key_ops
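+
+/*
+ * Illustrative sketch (editorial addition, not part of the original header):
+ * because every crypto_ops_vN begins with the previous version, the co_*
+ * accessor macros above let callers reach any ops table through a plain
+ * crypto_ops_t pointer, regardless of which version the provider filled in.
+ * The helper name below is hypothetical.
+ */
+static inline int
+example_ops_has_cipher(const crypto_ops_t *ops)
+{
+ /* co_cipher_ops expands to cou.cou_v1.co_cipher_ops */
+ return (ops != NULL && ops->co_cipher_ops != NULL);
+}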
+
+/*
+ * The mechanism info structure crypto_mech_info_t contains a function group
+ * bit mask cm_func_group_mask. This field, of type crypto_func_group_t,
+ * specifies the provider entry points that can be used with a particular
+ * mechanism. The function group mask is a combination of the following values.
+ */
+
+typedef uint32_t crypto_func_group_t;
+
+
+#define CRYPTO_FG_ENCRYPT 0x00000001 /* encrypt_init() */
+#define CRYPTO_FG_DECRYPT 0x00000002 /* decrypt_init() */
+#define CRYPTO_FG_DIGEST 0x00000004 /* digest_init() */
+#define CRYPTO_FG_SIGN 0x00000008 /* sign_init() */
+#define CRYPTO_FG_SIGN_RECOVER 0x00000010 /* sign_recover_init() */
+#define CRYPTO_FG_VERIFY 0x00000020 /* verify_init() */
+#define CRYPTO_FG_VERIFY_RECOVER 0x00000040 /* verify_recover_init() */
+#define CRYPTO_FG_GENERATE 0x00000080 /* key_generate() */
+#define CRYPTO_FG_GENERATE_KEY_PAIR 0x00000100 /* key_generate_pair() */
+#define CRYPTO_FG_WRAP 0x00000200 /* key_wrap() */
+#define CRYPTO_FG_UNWRAP 0x00000400 /* key_unwrap() */
+#define CRYPTO_FG_DERIVE 0x00000800 /* key_derive() */
+#define CRYPTO_FG_MAC 0x00001000 /* mac_init() */
+#define CRYPTO_FG_ENCRYPT_MAC 0x00002000 /* encrypt_mac_init() */
+#define CRYPTO_FG_MAC_DECRYPT 0x00004000 /* decrypt_mac_init() */
+#define CRYPTO_FG_ENCRYPT_ATOMIC 0x00008000 /* encrypt_atomic() */
+#define CRYPTO_FG_DECRYPT_ATOMIC 0x00010000 /* decrypt_atomic() */
+#define CRYPTO_FG_MAC_ATOMIC 0x00020000 /* mac_atomic() */
+#define CRYPTO_FG_DIGEST_ATOMIC 0x00040000 /* digest_atomic() */
+#define CRYPTO_FG_SIGN_ATOMIC 0x00080000 /* sign_atomic() */
+#define CRYPTO_FG_SIGN_RECOVER_ATOMIC 0x00100000 /* sign_recover_atomic() */
+#define CRYPTO_FG_VERIFY_ATOMIC 0x00200000 /* verify_atomic() */
+#define CRYPTO_FG_VERIFY_RECOVER_ATOMIC 0x00400000 /* verify_recover_atomic() */
+#define CRYPTO_FG_ENCRYPT_MAC_ATOMIC 0x00800000 /* encrypt_mac_atomic() */
+#define CRYPTO_FG_MAC_DECRYPT_ATOMIC 0x01000000 /* mac_decrypt_atomic() */
+#define CRYPTO_FG_RESERVED 0x80000000
+
+/*
+ * Maximum length of the pi_provider_description field of the
+ * crypto_provider_info structure.
+ */
+#define CRYPTO_PROVIDER_DESCR_MAX_LEN 64
+
+
+/* Bit mask for all the simple operations */
+#define CRYPTO_FG_SIMPLEOP_MASK (CRYPTO_FG_ENCRYPT | CRYPTO_FG_DECRYPT | \
+ CRYPTO_FG_DIGEST | CRYPTO_FG_SIGN | CRYPTO_FG_VERIFY | CRYPTO_FG_MAC | \
+ CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT_ATOMIC | \
+ CRYPTO_FG_MAC_ATOMIC | CRYPTO_FG_DIGEST_ATOMIC | CRYPTO_FG_SIGN_ATOMIC | \
+ CRYPTO_FG_VERIFY_ATOMIC)
+
+/* Bit mask for all the dual operations */
+#define CRYPTO_FG_MAC_CIPHER_MASK (CRYPTO_FG_ENCRYPT_MAC | \
+ CRYPTO_FG_MAC_DECRYPT | CRYPTO_FG_ENCRYPT_MAC_ATOMIC | \
+ CRYPTO_FG_MAC_DECRYPT_ATOMIC)
+
+/* Add other combos to CRYPTO_FG_DUAL_MASK */
+#define CRYPTO_FG_DUAL_MASK CRYPTO_FG_MAC_CIPHER_MASK
+
+/*
+ * The crypto_mech_info structure specifies one of the mechanisms
+ * supported by a cryptographic provider. The pi_mechanisms field of
+ * the crypto_provider_info structure contains a pointer to an array
+ * of crypto_mech_info's.
+ */
+typedef struct crypto_mech_info {
+ crypto_mech_name_t cm_mech_name;
+ crypto_mech_type_t cm_mech_number;
+ crypto_func_group_t cm_func_group_mask;
+ ssize_t cm_min_key_length;
+ ssize_t cm_max_key_length;
+ uint32_t cm_mech_flags;
+} crypto_mech_info_t;
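+
+/*
+ * Illustrative sketch (editorial addition): a provider describes each
+ * mechanism with one crypto_mech_info_t entry whose cm_func_group_mask ORs
+ * together the CRYPTO_FG_* groups it implements. The mechanism name, number
+ * and key limits below are hypothetical placeholders.
+ */
+static const crypto_mech_info_t example_mech_info = {
+ "CKM_EXAMPLE_CBC", /* cm_mech_name */
+ 0, /* cm_mech_number, chosen by the provider */
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_DECRYPT |
+ CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT_ATOMIC,
+ 16, 32, /* cm_min/cm_max_key_length (bytes in this example) */
+ 0 /* cm_mech_flags */
+};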
+
+/* Alias the old name to the new name for compatibility. */
+#define cm_keysize_unit cm_mech_flags
+
+/*
+ * The following is used by a provider that sets
+ * CRYPTO_HASH_NO_UPDATE. It needs to specify the maximum
+ * input data size it can digest in this field.
+ */
+#define cm_max_input_length cm_max_key_length
+
+/*
+ * crypto_kcf_provider_handle_t is a handle allocated by the kernel.
+ * It is returned after the provider registers with
+ * crypto_register_provider(), and must be specified by the provider
+ * when calling crypto_unregister_provider() and
+ * crypto_provider_notification().
+ */
+typedef uint_t crypto_kcf_provider_handle_t;
+
+/*
+ * Provider information. Passed as argument to crypto_register_provider(9F).
+ * Describes the provider and its capabilities. Multiple providers can
+ * register for the same device instance. In this case, the same
+ * pi_provider_dev must be specified with a different pi_provider_handle.
+ */
+typedef struct crypto_provider_info_v1 {
+ uint_t pi_interface_version;
+ char *pi_provider_description;
+ crypto_provider_type_t pi_provider_type;
+ crypto_provider_handle_t pi_provider_handle;
+ crypto_ops_t *pi_ops_vector;
+ uint_t pi_mech_list_count;
+ crypto_mech_info_t *pi_mechanisms;
+ uint_t pi_logical_provider_count;
+ crypto_kcf_provider_handle_t *pi_logical_providers;
+} crypto_provider_info_v1_t;
+
+typedef struct crypto_provider_info_v2 {
+ crypto_provider_info_v1_t v1_info;
+ uint_t pi_flags;
+} crypto_provider_info_v2_t;
+
+typedef struct crypto_provider_info {
+ union {
+ crypto_provider_info_v2_t piu_v2;
+ crypto_provider_info_v1_t piu_v1;
+ } piu;
+} crypto_provider_info_t;
+
+#define pi_interface_version piu.piu_v1.pi_interface_version
+#define pi_provider_description piu.piu_v1.pi_provider_description
+#define pi_provider_type piu.piu_v1.pi_provider_type
+#define pi_provider_handle piu.piu_v1.pi_provider_handle
+#define pi_ops_vector piu.piu_v1.pi_ops_vector
+#define pi_mech_list_count piu.piu_v1.pi_mech_list_count
+#define pi_mechanisms piu.piu_v1.pi_mechanisms
+#define pi_logical_provider_count piu.piu_v1.pi_logical_provider_count
+#define pi_logical_providers piu.piu_v1.pi_logical_providers
+#define pi_flags piu.piu_v2.pi_flags
+
+/* hidden providers can only be accessed via a logical provider */
+#define CRYPTO_HIDE_PROVIDER 0x00000001
+/*
+ * provider cannot do multi-part digests (updates) and has a limit
+ * on the maximum input data size that it can digest.
+ */
+#define CRYPTO_HASH_NO_UPDATE 0x00000002
+
+/* provider can handle the request without returning a CRYPTO_QUEUED */
+#define CRYPTO_SYNCHRONOUS 0x00000004
+
+#define CRYPTO_PIFLAGS_RESERVED2 0x40000000
+#define CRYPTO_PIFLAGS_RESERVED1 0x80000000
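+
+/*
+ * Illustrative sketch (editorial addition): a provider that cannot do
+ * multi-part digests would advertise CRYPTO_HASH_NO_UPDATE through the v2
+ * pi_flags field and report its input limit through the cm_max_input_length
+ * alias defined earlier. The function name and 64K limit are hypothetical.
+ */
+static inline void
+example_limit_digest_input(crypto_provider_info_t *pi, crypto_mech_info_t *mi)
+{
+ pi->pi_flags |= CRYPTO_HASH_NO_UPDATE;
+ mi->cm_max_input_length = 64 * 1024; /* arbitrary example limit */
+}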
+
+/*
+ * Provider status passed by a provider to crypto_provider_notification(9F)
+ * and returned by the provider_status(9E) entry point.
+ */
+#define CRYPTO_PROVIDER_READY 0
+#define CRYPTO_PROVIDER_BUSY 1
+#define CRYPTO_PROVIDER_FAILED 2
+
+/*
+ * Functions exported by Solaris to cryptographic providers. Providers
+ * call these functions to register and unregister, notify the kernel
+ * of state changes, and notify the kernel when an asynchronous request
+ * has completed.
+ */
+extern int crypto_register_provider(crypto_provider_info_t *,
+ crypto_kcf_provider_handle_t *);
+extern int crypto_unregister_provider(crypto_kcf_provider_handle_t);
+extern void crypto_provider_notification(crypto_kcf_provider_handle_t, uint_t);
+extern void crypto_op_notification(crypto_req_handle_t, int);
+extern int crypto_kmflag(crypto_req_handle_t);
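+
+/*
+ * Illustrative sketch (editorial addition): after a successful registration
+ * a provider keeps the handle returned by crypto_register_provider() and
+ * uses it for later notifications, for example to tell the framework that it
+ * is temporarily unable to accept requests. The helper names are hypothetical.
+ */
+static inline void
+example_provider_pause(crypto_kcf_provider_handle_t h)
+{
+ crypto_provider_notification(h, CRYPTO_PROVIDER_BUSY);
+}
+
+static inline void
+example_provider_resume(crypto_kcf_provider_handle_t h)
+{
+ crypto_provider_notification(h, CRYPTO_PROVIDER_READY);
+}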
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_SPI_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h b/sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h
new file mode 100644
index 000000000000..f2dae7093b94
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h
@@ -0,0 +1,307 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _IA32_SYS_ASM_LINKAGE_H
+#define _IA32_SYS_ASM_LINKAGE_H
+
+#include <sys/stack.h>
+#include <sys/trap.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _ASM /* The remainder of this file is only for assembly files */
+
+/*
+ * make annoying differences in assembler syntax go away
+ */
+
+/*
+ * D16 and A16 are used to insert instructions prefixes; the
+ * macros help the assembler code be slightly more portable.
+ */
+#if !defined(__GNUC_AS__)
+/*
+ * /usr/ccs/bin/as prefixes are parsed as separate instructions
+ */
+#define D16 data16;
+#define A16 addr16;
+
+/*
+ * (There are some weird constructs in constant expressions)
+ */
+#define _CONST(const) [const]
+#define _BITNOT(const) -1!_CONST(const)
+#define _MUL(a, b) _CONST(a \* b)
+
+#else
+/*
+ * Why not use the 'data16' and 'addr16' prefixes .. well, the
+ * assembler doesn't quite believe in real mode, and thus argues with
+ * us about what we're trying to do.
+ */
+#define D16 .byte 0x66;
+#define A16 .byte 0x67;
+
+#define _CONST(const) (const)
+#define _BITNOT(const) ~_CONST(const)
+#define _MUL(a, b) _CONST(a * b)
+
+#endif
+
+/*
+ * C pointers are different sizes between i386 and amd64.
+ * These constants can be used to compute offsets into pointer arrays.
+ */
+#if defined(__amd64)
+#define CLONGSHIFT 3
+#define CLONGSIZE 8
+#define CLONGMASK 7
+#elif defined(__i386)
+#define CLONGSHIFT 2
+#define CLONGSIZE 4
+#define CLONGMASK 3
+#endif
+
+/*
+ * Since we know we're either ILP32 or LP64 ..
+ */
+#define CPTRSHIFT CLONGSHIFT
+#define CPTRSIZE CLONGSIZE
+#define CPTRMASK CLONGMASK
+
+#if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT)
+#error "inconsistent shift constants"
+#endif
+
+#if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1)
+#error "inconsistent mask constants"
+#endif
+
+#define ASM_ENTRY_ALIGN 16
+
+/*
+ * SSE register alignment and save areas
+ */
+
+#define XMM_SIZE 16
+#define XMM_ALIGN 16
+
+#if defined(__amd64)
+
+#define SAVE_XMM_PROLOG(sreg, nreg) \
+ subq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp; \
+ movq %rsp, sreg
+
+#define RSTOR_XMM_EPILOG(sreg, nreg) \
+ addq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp
+
+#elif defined(__i386)
+
+#define SAVE_XMM_PROLOG(sreg, nreg) \
+ subl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; \
+ movl %esp, sreg; \
+ addl $XMM_ALIGN, sreg; \
+ andl $_BITNOT(XMM_ALIGN-1), sreg
+
+#define RSTOR_XMM_EPILOG(sreg, nreg) \
+ addl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp;
+
+#endif /* __i386 */
+
+/*
+ * Profiling causes MCOUNT and RTMCOUNT to be defined according to the type
+ * of profiling in use.
+ */
+#ifdef GPROF
+
+#define MCOUNT(x) \
+ pushl %ebp; \
+ movl %esp, %ebp; \
+ call _mcount; \
+ popl %ebp
+
+#endif /* GPROF */
+
+#ifdef PROF
+
+#define MCOUNT(x) \
+/* CSTYLED */ \
+ .lcomm .L_/**/x/**/1, 4, 4; \
+ pushl %ebp; \
+ movl %esp, %ebp; \
+/* CSTYLED */ \
+ movl $.L_/**/x/**/1, %edx; \
+ call _mcount; \
+ popl %ebp
+
+#endif /* PROF */
+
+/*
+ * if we are not profiling, MCOUNT should be defined to nothing
+ */
+#if !defined(PROF) && !defined(GPROF)
+#define MCOUNT(x)
+#endif /* !defined(PROF) && !defined(GPROF) */
+
+#define RTMCOUNT(x) MCOUNT(x)
+
+/*
+ * Macro to define weak symbol aliases. These are similar to the ANSI-C
+ * #pragma weak _name = name
+ * except that a compiler can determine the type, while the assembler must be
+ * told. Hence, the second parameter must be the type of the symbol
+ * (i.e.: function, ...)
+ */
+#define ANSI_PRAGMA_WEAK(sym, stype) \
+/* CSTYLED */ \
+ .weak _/**/sym; \
+/* CSTYLED */ \
+ .type _/**/sym, @stype; \
+/* CSTYLED */ \
+_/**/sym = sym
+
+/*
+ * Like ANSI_PRAGMA_WEAK(), but for unrelated names, as in:
+ * #pragma weak sym1 = sym2
+ */
+#define ANSI_PRAGMA_WEAK2(sym1, sym2, stype) \
+ .weak sym1; \
+ .type sym1, @stype; \
+sym1 = sym2
+
+/*
+ * ENTRY provides the standard procedure entry code and an easy way to
+ * insert the calls to mcount for profiling. ENTRY_NP is identical, but
+ * never calls mcount.
+ */
+#define ENTRY(x) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x; \
+ .type x, @function; \
+x: MCOUNT(x)
+
+#define ENTRY_NP(x) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x; \
+ .type x, @function; \
+x:
+
+#define RTENTRY(x) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x; \
+ .type x, @function; \
+x: RTMCOUNT(x)
+
+/*
+ * ENTRY2 is identical to ENTRY but provides two labels for the entry point.
+ */
+#define ENTRY2(x, y) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x, y; \
+ .type x, @function; \
+ .type y, @function; \
+/* CSTYLED */ \
+x: ; \
+y: MCOUNT(x)
+
+#define ENTRY_NP2(x, y) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x, y; \
+ .type x, @function; \
+ .type y, @function; \
+/* CSTYLED */ \
+x: ; \
+y:
+
+
+/*
+ * ALTENTRY provides for additional entry points.
+ */
+#define ALTENTRY(x) \
+ .globl x; \
+ .type x, @function; \
+x:
+
+/*
+ * DGDEF and DGDEF2 provide global data declarations.
+ *
+ * DGDEF provides a word aligned word of storage.
+ *
+ * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This
+ * implies this macro is best used for byte arrays.
+ *
+ * DGDEF3 allocates "sz" bytes of storage with "algn" alignment.
+ */
+#define DGDEF2(name, sz) \
+ .data; \
+ .globl name; \
+ .type name, @object; \
+ .size name, sz; \
+name:
+
+#define DGDEF3(name, sz, algn) \
+ .data; \
+ .align algn; \
+ .globl name; \
+ .type name, @object; \
+ .size name, sz; \
+name:
+
+#define DGDEF(name) DGDEF3(name, 4, 4)
+
+/*
+ * SET_SIZE trails a function and sets its size in the ELF symbol table.
+ */
+#define SET_SIZE(x) \
+ .size x, [.-x]
+
+/*
+ * NWORD provides native word value.
+ */
+#if defined(__amd64)
+
+/*CSTYLED*/
+#define NWORD quad
+
+#elif defined(__i386)
+
+#define NWORD long
+
+#endif /* __i386 */
+
+#endif /* _ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _IA32_SYS_ASM_LINKAGE_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/ia32/stack.h b/sys/contrib/openzfs/module/icp/include/sys/ia32/stack.h
new file mode 100644
index 000000000000..9e7c089e1182
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/ia32/stack.h
@@ -0,0 +1,160 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _IA32_SYS_STACK_H
+#define _IA32_SYS_STACK_H
+
+#if !defined(_ASM)
+
+#include <sys/types.h>
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * In the x86 world, a stack frame looks like this:
+ *
+ * |--------------------------|
+ * 4n+8(%ebp) ->| argument word n |
+ * | ... | (Previous frame)
+ * 8(%ebp) ->| argument word 0 |
+ * |--------------------------|--------------------
+ * 4(%ebp) ->| return address |
+ * |--------------------------|
+ * 0(%ebp) ->| previous %ebp (optional) |
+ * |--------------------------|
+ * -4(%ebp) ->| unspecified | (Current frame)
+ * | ... |
+ * 0(%esp) ->| variable size |
+ * |--------------------------|
+ */
+
+/*
+ * Stack alignment macros.
+ */
+
+#define STACK_ALIGN32 4
+#define STACK_ENTRY_ALIGN32 4
+#define STACK_BIAS32 0
+#define SA32(x) (((x)+(STACK_ALIGN32-1)) & ~(STACK_ALIGN32-1))
+#define STACK_RESERVE32 0
+#define MINFRAME32 0
+
+#if defined(__amd64)
+
+/*
+ * In the amd64 world, a stack frame looks like this:
+ *
+ * |--------------------------|
+ * 8n+16(%rbp)->| argument word n |
+ * | ... | (Previous frame)
+ * 16(%rbp) ->| argument word 0 |
+ * |--------------------------|--------------------
+ * 8(%rbp) ->| return address |
+ * |--------------------------|
+ * 0(%rbp) ->| previous %rbp |
+ * |--------------------------|
+ * -8(%rbp) ->| unspecified | (Current frame)
+ * | ... |
+ * 0(%rsp) ->| variable size |
+ * |--------------------------|
+ * -128(%rsp) ->| reserved for function |
+ * |--------------------------|
+ *
+ * The end of the input argument area must be aligned on a 16-byte
+ * boundary; i.e. (%rsp - 8) % 16 == 0 at function entry.
+ *
+ * The 128-byte area beyond %rsp is considered to be reserved for
+ * functions and is NOT modified by signal handlers. It can be used
+ * to store temporary data that is not needed across function calls.
+ */
+
+/*
+ * Stack alignment macros.
+ */
+
+#define STACK_ALIGN64 16
+#define STACK_ENTRY_ALIGN64 8
+#define STACK_BIAS64 0
+#define SA64(x) (((x)+(STACK_ALIGN64-1)) & ~(STACK_ALIGN64-1))
+#define STACK_RESERVE64 128
+#define MINFRAME64 0
+
+#define STACK_ALIGN STACK_ALIGN64
+#define STACK_ENTRY_ALIGN STACK_ENTRY_ALIGN64
+#define STACK_BIAS STACK_BIAS64
+#define SA(x) SA64(x)
+#define STACK_RESERVE STACK_RESERVE64
+#define MINFRAME MINFRAME64
+
+#elif defined(__i386)
+
+#define STACK_ALIGN STACK_ALIGN32
+#define STACK_ENTRY_ALIGN STACK_ENTRY_ALIGN32
+#define STACK_BIAS STACK_BIAS32
+#define SA(x) SA32(x)
+#define STACK_RESERVE STACK_RESERVE32
+#define MINFRAME MINFRAME32
+
+#endif /* __i386 */
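+
+#if !defined(_ASM)
+/*
+ * Illustrative sketch (editorial addition): SA() rounds a byte count up to
+ * the platform stack alignment, e.g. when sizing a local save area. The
+ * helper name is hypothetical.
+ */
+static inline size_t
+example_stack_round(size_t nbytes)
+{
+ return (SA(nbytes)); /* multiple of 16 on amd64, of 4 on i386 */
+}
+#endif /* !_ASM */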
+
+#if defined(_KERNEL) && !defined(_ASM)
+
+#if defined(ZFS_DEBUG)
+#if STACK_ALIGN == 4
+#define ASSERT_STACK_ALIGNED() \
+ { \
+ uint32_t __tmp; \
+ ASSERT((((uintptr_t)&__tmp) & (STACK_ALIGN - 1)) == 0); \
+ }
+#elif (STACK_ALIGN == 16) && (_LONG_DOUBLE_ALIGNMENT == 16)
+#define ASSERT_STACK_ALIGNED() \
+ { \
+ long double __tmp; \
+ ASSERT((((uintptr_t)&__tmp) & (STACK_ALIGN - 1)) == 0); \
+ }
+#endif
+#else /* DEBUG */
+#define ASSERT_STACK_ALIGNED()
+#endif /* DEBUG */
+
+struct regs;
+
+void traceregs(struct regs *);
+void traceback(caddr_t);
+
+#endif /* defined(_KERNEL) && !defined(_ASM) */
+
+#define STACK_GROWTH_DOWN /* stacks grow from high to low addresses */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _IA32_SYS_STACK_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/ia32/trap.h b/sys/contrib/openzfs/module/icp/include/sys/ia32/trap.h
new file mode 100644
index 000000000000..55b94969b80b
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/ia32/trap.h
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _IA32_SYS_TRAP_H
+#define _IA32_SYS_TRAP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Trap type values
+ */
+
+#define T_ZERODIV 0x0 /* #de divide by 0 error */
+#define T_SGLSTP 0x1 /* #db single step */
+#define T_NMIFLT 0x2 /* NMI */
+#define T_BPTFLT 0x3 /* #bp breakpoint fault, INT3 insn */
+#define T_OVFLW 0x4 /* #of INTO overflow fault */
+#define T_BOUNDFLT 0x5 /* #br BOUND insn fault */
+#define T_ILLINST 0x6 /* #ud invalid opcode fault */
+#define T_NOEXTFLT 0x7 /* #nm device not available: x87 */
+#define T_DBLFLT 0x8 /* #df double fault */
+#define T_EXTOVRFLT 0x9 /* [not generated: 386 only] */
+#define T_TSSFLT 0xa /* #ts invalid TSS fault */
+#define T_SEGFLT 0xb /* #np segment not present fault */
+#define T_STKFLT 0xc /* #ss stack fault */
+#define T_GPFLT 0xd /* #gp general protection fault */
+#define T_PGFLT 0xe /* #pf page fault */
+#define T_EXTERRFLT 0x10 /* #mf x87 FPU error fault */
+#define T_ALIGNMENT 0x11 /* #ac alignment check error */
+#define T_MCE 0x12 /* #mc machine check exception */
+#define T_SIMDFPE 0x13 /* #xm SSE/SSE2 exception */
+#define T_DBGENTR 0x14 /* debugger entry */
+#define T_ENDPERR 0x21 /* emulated extension error flt */
+#define T_ENOEXTFLT 0x20 /* emulated ext not present */
+#define T_FASTTRAP 0xd2 /* fast system call */
+#define T_SYSCALLINT 0x91 /* general system call */
+#define T_DTRACE_RET 0x7f /* DTrace pid return */
+#define T_INT80 0x80 /* int80 handler for linux emulation */
+#define T_SOFTINT 0x50fd /* pseudo softint trap type */
+
+/*
+ * Pseudo traps.
+ */
+#define T_INTERRUPT 0x100
+#define T_FAULT 0x200
+#define T_AST 0x400
+#define T_SYSCALL 0x180
+
+
+/*
+ * Values of the error code pushed on the stack by a page fault
+ */
+
+#define PF_ERR_MASK 0x01 /* Mask for error bit */
+#define PF_ERR_PAGE 0x00 /* page not present */
+#define PF_ERR_PROT 0x01 /* protection error */
+#define PF_ERR_WRITE 0x02 /* fault caused by write (else read) */
+#define PF_ERR_USER 0x04 /* processor was in user mode */
+ /* (else supervisor) */
+#define PF_ERR_EXEC 0x10 /* attempt to execute a No eXec page (AMD) */
+
+/*
+ * Definitions for fast system call subfunctions
+ */
+#define T_FNULL 0 /* Null trap for testing */
+#define T_FGETFP 1 /* Get emulated FP context */
+#define T_FSETFP 2 /* Set emulated FP context */
+#define T_GETHRTIME 3 /* Get high resolution time */
+#define T_GETHRVTIME 4 /* Get high resolution virtual time */
+#define T_GETHRESTIME 5 /* Get high resolution time */
+#define T_GETLGRP 6 /* Get home lgrpid */
+
+#define T_LASTFAST 6 /* Last valid subfunction */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _IA32_SYS_TRAP_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/modctl.h b/sys/contrib/openzfs/module/icp/include/sys/modctl.h
new file mode 100644
index 000000000000..6c26ad618c93
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/modctl.h
@@ -0,0 +1,477 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MODCTL_H
+#define _SYS_MODCTL_H
+
+/*
+ * loadable module support.
+ */
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct modlmisc;
+struct modlinkage;
+
+/*
+ * The following structure defines the operations used by modctl
+ * to load and unload modules. Each supported loadable module type
+ * requires a set of mod_ops.
+ */
+struct mod_ops {
+ int (*modm_install)(struct modlmisc *, struct modlinkage *);
+ int (*modm_remove)(struct modlmisc *, struct modlinkage *);
+ int (*modm_info)(void *, struct modlinkage *, int *);
+};
+
+/*
+ * The defined set of mod_ops structures for each loadable module type.
+ * Defined in modctl.c.
+ */
+extern struct mod_ops mod_brandops;
+#if defined(__i386) || defined(__amd64)
+extern struct mod_ops mod_cpuops;
+#endif
+extern struct mod_ops mod_cryptoops;
+extern struct mod_ops mod_driverops;
+extern struct mod_ops mod_execops;
+extern struct mod_ops mod_fsops;
+extern struct mod_ops mod_miscops;
+extern struct mod_ops mod_schedops;
+extern struct mod_ops mod_strmodops;
+extern struct mod_ops mod_syscallops;
+extern struct mod_ops mod_sockmodops;
+#ifdef _SYSCALL32_IMPL
+extern struct mod_ops mod_syscallops32;
+#endif
+extern struct mod_ops mod_dacfops;
+extern struct mod_ops mod_ippops;
+extern struct mod_ops mod_pcbeops;
+extern struct mod_ops mod_devfsops;
+extern struct mod_ops mod_kiconvops;
+
+/*
+ * Definitions for the module specific linkage structures.
+ * The first two fields are the same in all of the structures.
+ * The linkinfo is for informational purposes only and is returned by
+ * modctl with the MODINFO cmd.
+ */
+
+/* For cryptographic providers */
+struct modlcrypto {
+ struct mod_ops *crypto_modops;
+ char *crypto_linkinfo;
+};
+
+/* For misc */
+struct modlmisc {
+ struct mod_ops *misc_modops;
+ char *misc_linkinfo;
+};
+
+/*
+ * Revision number of loadable modules support. This is the value
+ * that must be used in the modlinkage structure.
+ */
+#define MODREV_1 1
+
+/*
+ * The modlinkage structure is the structure that the module writer
+ * provides to the routines to install, remove, and stat a module.
+ * The ml_linkage element is an array of pointers to linkage structures.
+ * For most modules there is only one linkage structure. We allocate
+ * enough space for 3 linkage structures which happens to be the most
+ * we have in any sun supplied module. For those modules with more
+ * than 3 linkage structures (which is very unlikely), a modlinkage
+ * structure must be kmem_alloc'd in the module wrapper to be big enough
+ * for all of the linkage structures.
+ */
+struct modlinkage {
+ int ml_rev; /* rev of loadable modules system */
+#ifdef _LP64
+ void *ml_linkage[7]; /* more space in 64-bit OS */
+#else
+ void *ml_linkage[4]; /* NULL terminated list of */
+ /* linkage structures */
+#endif
+};
+
+/*
+ * Commands supported by the modctl system call.
+ */
+#define MODLOAD 0
+#define MODUNLOAD 1
+#define MODINFO 2
+#define MODRESERVED 3
+#define MODSETMINIROOT 4
+#define MODADDMAJBIND 5
+#define MODGETPATH 6
+#define MODREADSYSBIND 7
+#define MODGETMAJBIND 8
+#define MODGETNAME 9
+#define MODSIZEOF_DEVID 10
+#define MODGETDEVID 11
+#define MODSIZEOF_MINORNAME 12
+#define MODGETMINORNAME 13
+#define MODGETPATHLEN 14
+#define MODEVENTS 15
+#define MODGETFBNAME 16
+#define MODREREADDACF 17
+#define MODLOADDRVCONF 18
+#define MODUNLOADDRVCONF 19
+#define MODREMMAJBIND 20
+#define MODDEVT2INSTANCE 21
+#define MODGETDEVFSPATH_LEN 22
+#define MODGETDEVFSPATH 23
+#define MODDEVID2PATHS 24
+#define MODSETDEVPOLICY 26
+#define MODGETDEVPOLICY 27
+#define MODALLOCPRIV 28
+#define MODGETDEVPOLICYBYNAME 29
+#define MODLOADMINORPERM 31
+#define MODADDMINORPERM 32
+#define MODREMMINORPERM 33
+#define MODREMDRVCLEANUP 34
+#define MODDEVEXISTS 35
+#define MODDEVREADDIR 36
+#define MODDEVNAME 37
+#define MODGETDEVFSPATH_MI_LEN 38
+#define MODGETDEVFSPATH_MI 39
+#define MODRETIRE 40
+#define MODUNRETIRE 41
+#define MODISRETIRED 42
+#define MODDEVEMPTYDIR 43
+#define MODREMDRVALIAS 44
+
+/*
+ * sub cmds for MODEVENTS
+ */
+#define MODEVENTS_FLUSH 0
+#define MODEVENTS_FLUSH_DUMP 1
+#define MODEVENTS_SET_DOOR_UPCALL_FILENAME 2
+#define MODEVENTS_GETDATA 3
+#define MODEVENTS_FREEDATA 4
+#define MODEVENTS_POST_EVENT 5
+#define MODEVENTS_REGISTER_EVENT 6
+
+/*
+ * devname subcmds for MODDEVNAME
+ */
+#define MODDEVNAME_LOOKUPDOOR 0
+#define MODDEVNAME_DEVFSADMNODE 1
+#define MODDEVNAME_NSMAPS 2
+#define MODDEVNAME_PROFILE 3
+#define MODDEVNAME_RECONFIG 4
+#define MODDEVNAME_SYSAVAIL 5
+
+
+/*
+ * Data structure passed to modconfig command in kernel to build devfs tree
+ */
+
+struct aliases {
+ struct aliases *a_next;
+ char *a_name;
+ int a_len;
+};
+
+#define MAXMODCONFNAME 256
+
+struct modconfig {
+ char drvname[MAXMODCONFNAME];
+ char drvclass[MAXMODCONFNAME];
+ int major;
+ int flags;
+ int num_aliases;
+ struct aliases *ap;
+};
+
+#if defined(_SYSCALL32)
+
+struct aliases32 {
+ caddr32_t a_next;
+ caddr32_t a_name;
+ int32_t a_len;
+};
+
+struct modconfig32 {
+ char drvname[MAXMODCONFNAME];
+ char drvclass[MAXMODCONFNAME];
+ int32_t major;
+ int32_t flags;
+ int32_t num_aliases;
+ caddr32_t ap;
+};
+
+#endif /* _SYSCALL32 */
+
+/* flags for modconfig */
+#define MOD_UNBIND_OVERRIDE 0x01 /* fail unbind if in use */
+
+/*
+ * Max module path length
+ */
+#define MOD_MAXPATH 256
+
+/*
+ * Default search path for modules, in ADDITION to the directory
+ * holding the kernel components we booted from.
+ *
+ * Most often, this will be "/platform/{platform}/kernel /kernel /usr/kernel",
+ * but we don't wire it down here.
+ */
+#define MOD_DEFPATH "/kernel /usr/kernel"
+
+/*
+ * Default file name extension for autoloading modules.
+ */
+#define MOD_DEFEXT ""
+
+/*
+ * Parameters for modinfo
+ */
+#define MODMAXNAMELEN 32 /* max module name length */
+#define MODMAXLINKINFOLEN 32 /* max link info length */
+
+/*
+ * Module specific information.
+ */
+struct modspecific_info {
+ char msi_linkinfo[MODMAXLINKINFOLEN]; /* name in linkage struct */
+ int msi_p0; /* module specific information */
+};
+
+/*
+ * Structure returned by modctl with MODINFO command.
+ */
+#define MODMAXLINK 10 /* max linkages modinfo can handle */
+
+struct modinfo {
+ int mi_info; /* Flags for info wanted */
+ int mi_state; /* Flags for module state */
+ int mi_id; /* id of this loaded module */
+ int mi_nextid; /* id of next module or -1 */
+ caddr_t mi_base; /* virtual addr of text */
+ size_t mi_size; /* size of module in bytes */
+ int mi_rev; /* loadable modules rev */
+ int mi_loadcnt; /* # of times loaded */
+ char mi_name[MODMAXNAMELEN]; /* name of module */
+ struct modspecific_info mi_msinfo[MODMAXLINK];
+ /* mod specific info */
+};
+
+
+#if defined(_SYSCALL32)
+
+#define MODMAXNAMELEN32 32 /* max module name length */
+#define MODMAXLINKINFOLEN32 32 /* max link info length */
+#define MODMAXLINK32 10 /* max linkages modinfo can handle */
+
+struct modspecific_info32 {
+ char msi_linkinfo[MODMAXLINKINFOLEN32]; /* name in linkage struct */
+ int32_t msi_p0; /* module specific information */
+};
+
+struct modinfo32 {
+ int32_t mi_info; /* Flags for info wanted */
+ int32_t mi_state; /* Flags for module state */
+ int32_t mi_id; /* id of this loaded module */
+ int32_t mi_nextid; /* id of next module or -1 */
+ caddr32_t mi_base; /* virtual addr of text */
+ uint32_t mi_size; /* size of module in bytes */
+ int32_t mi_rev; /* loadable modules rev */
+ int32_t mi_loadcnt; /* # of times loaded */
+ char mi_name[MODMAXNAMELEN32]; /* name of module */
+ struct modspecific_info32 mi_msinfo[MODMAXLINK32];
+ /* mod specific info */
+};
+
+#endif /* _SYSCALL32 */
+
+/* Values for mi_info flags */
+#define MI_INFO_ONE 1
+#define MI_INFO_ALL 2
+#define MI_INFO_CNT 4
+#define MI_INFO_LINKAGE 8 /* used internally to extract modlinkage */
+/*
+ * MI_INFO_NOBASE indicates caller does not need mi_base. Failure to use this
+ * flag may lead 32-bit apps to receive an EOVERFLOW error from modctl(MODINFO)
+ * when used with a 64-bit kernel.
+ */
+#define MI_INFO_NOBASE 16
+
+/* Values for mi_state */
+#define MI_LOADED 1
+#define MI_INSTALLED 2
+
+/*
+ * Macros to vector to the appropriate module specific routine.
+ */
+#define MODL_INSTALL(MODL, MODLP) \
+ (*(MODL)->misc_modops->modm_install)(MODL, MODLP)
+#define MODL_REMOVE(MODL, MODLP) \
+ (*(MODL)->misc_modops->modm_remove)(MODL, MODLP)
+#define MODL_INFO(MODL, MODLP, P0) \
+ (*(MODL)->misc_modops->modm_info)(MODL, MODLP, P0)
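+
+/*
+ * Illustrative sketch (editorial addition): the framework walks ml_linkage[]
+ * and dispatches through the macros above; installing every linkage
+ * structure of a module looks roughly like the loop below (locking and
+ * rollback omitted, function name hypothetical).
+ */
+static inline int
+example_install_linkages(struct modlinkage *mlp)
+{
+ int i, err = 0;
+
+ for (i = 0; err == 0 && mlp->ml_linkage[i] != NULL; i++)
+ err = MODL_INSTALL((struct modlmisc *)mlp->ml_linkage[i], mlp);
+ return (err);
+}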
+
+/*
+ * Definitions for stubs
+ */
+struct mod_stub_info {
+ uintptr_t mods_func_adr;
+ struct mod_modinfo *mods_modinfo;
+ uintptr_t mods_stub_adr;
+ int (*mods_errfcn)(void);
+ int mods_flag; /* flags defined below */
+};
+
+/*
+ * Definitions for mods_flag.
+ */
+#define MODS_WEAK 0x01 /* weak stub (not loaded if called) */
+#define MODS_NOUNLOAD 0x02 /* module not unloadable (no _fini()) */
+#define MODS_INSTALLED 0x10 /* module installed */
+
+struct mod_modinfo {
+ char *modm_module_name;
+ struct modctl *mp;
+ struct mod_stub_info modm_stubs[1];
+};
+
+struct modctl_list {
+ struct modctl_list *modl_next;
+ struct modctl *modl_modp;
+};
+
+/*
+ * Structure to manage a loadable module.
+ * Note: the module (mod_mp) structure's "text" and "text_size" information
+ * are replicated in the modctl structure so that mod_containing_pc()
+ * doesn't have to grab any locks (modctls are persistent; modules are not.)
+ */
+typedef struct modctl {
+ struct modctl *mod_next; /* &modules based list */
+ struct modctl *mod_prev;
+ int mod_id;
+ void *mod_mp;
+ kthread_t *mod_inprogress_thread;
+ struct mod_modinfo *mod_modinfo;
+ struct modlinkage *mod_linkage;
+ char *mod_filename;
+ char *mod_modname;
+
+ char mod_busy; /* inprogress_thread has locked */
+ char mod_want; /* someone waiting for unlock */
+ char mod_prim; /* primary module */
+
+ int mod_ref; /* ref count - from dependent or stub */
+
+ char mod_loaded; /* module in memory */
+ char mod_installed; /* post _init pre _fini */
+ char mod_loadflags;
+ char mod_delay_unload; /* deferred unload */
+
+ struct modctl_list *mod_requisites; /* mods this one depends on. */
+ void *____unused; /* NOTE: reuse (same size) is OK, */
+ /* deletion causes mdb.vs.core issues */
+ int mod_loadcnt; /* number of times mod was loaded */
+ int mod_nenabled; /* # of enabled DTrace probes in mod */
+ char *mod_text;
+ size_t mod_text_size;
+
+ int mod_gencount; /* # times loaded/unloaded */
+ struct modctl *mod_requisite_loading; /* mod circular dependency */
+} modctl_t;
+
+/*
+ * mod_loadflags
+ */
+
+#define MOD_NOAUTOUNLOAD 0x1 /* Auto mod-unloader skips this mod */
+#define MOD_NONOTIFY 0x2 /* No krtld notifications on (un)load */
+#define MOD_NOUNLOAD 0x4 /* Assume EBUSY for all _fini's */
+
+#define MOD_BIND_HASHSIZE 64
+#define MOD_BIND_HASHMASK (MOD_BIND_HASHSIZE-1)
+
+typedef int modid_t;
+
+/*
+ * global function and data declarations
+ */
+extern kmutex_t mod_lock;
+
+extern char *systemfile;
+extern char **syscallnames;
+extern int moddebug;
+
+/*
+ * This is the head of a doubly linked list; only the next and prev
+ * pointers are used.
+ */
+extern modctl_t modules;
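+
+/*
+ * Illustrative sketch (editorial addition): the list is circular, so a walk
+ * starts at &modules and follows mod_next until it wraps around; the caller
+ * is assumed to hold mod_lock. The function name is hypothetical.
+ */
+static inline void
+example_walk_modules(void (*cb)(modctl_t *))
+{
+ modctl_t *mp = &modules;
+
+ do {
+ cb(mp);
+ mp = mp->mod_next;
+ } while (mp != &modules);
+}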
+
+/*
+ * Only the following are part of the DDI/DKI
+ */
+extern int mod_install(struct modlinkage *);
+extern int mod_remove(struct modlinkage *);
+extern int mod_info(struct modlinkage *, struct modinfo *);
+
+/*
+ * bit definitions for moddebug.
+ */
+#define MODDEBUG_LOADMSG 0x80000000 /* print "[un]loading..." msg */
+#define MODDEBUG_ERRMSG 0x40000000 /* print detailed error msgs */
+#define MODDEBUG_LOADMSG2 0x20000000 /* print 2nd level msgs */
+#define MODDEBUG_RETIRE 0x10000000 /* print retire msgs */
+#define MODDEBUG_BINDING 0x00040000 /* driver/alias binding */
+#define MODDEBUG_FINI_EBUSY 0x00020000 /* pretend fini returns EBUSY */
+#define MODDEBUG_NOAUL_IPP 0x00010000 /* no Autounloading ipp mods */
+#define MODDEBUG_NOAUL_DACF 0x00008000 /* no Autounloading dacf mods */
+#define MODDEBUG_KEEPTEXT 0x00004000 /* keep text after unloading */
+#define MODDEBUG_NOAUL_DRV 0x00001000 /* no Autounloading Drivers */
+#define MODDEBUG_NOAUL_EXEC 0x00000800 /* no Autounloading Execs */
+#define MODDEBUG_NOAUL_FS 0x00000400 /* no Autounloading File sys */
+#define MODDEBUG_NOAUL_MISC 0x00000200 /* no Autounloading misc */
+#define MODDEBUG_NOAUL_SCHED 0x00000100 /* no Autounloading scheds */
+#define MODDEBUG_NOAUL_STR 0x00000080 /* no Autounloading streams */
+#define MODDEBUG_NOAUL_SYS 0x00000040 /* no Autounloading syscalls */
+#define MODDEBUG_NOCTF 0x00000020 /* do not load CTF debug data */
+#define MODDEBUG_NOAUTOUNLOAD 0x00000010 /* no autounloading at all */
+#define MODDEBUG_DDI_MOD 0x00000008 /* ddi_mod{open,sym,close} */
+#define MODDEBUG_MP_MATCH 0x00000004 /* dev_minorperm */
+#define MODDEBUG_MINORPERM 0x00000002 /* minor perm modctls */
+#define MODDEBUG_USERDEBUG 0x00000001 /* bpt after init_module() */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MODCTL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/modhash.h b/sys/contrib/openzfs/module/icp/include/sys/modhash.h
new file mode 100644
index 000000000000..06b52ff02604
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/modhash.h
@@ -0,0 +1,147 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MODHASH_H
+#define _SYS_MODHASH_H
+
+/*
+ * Generic hash implementation for the kernel.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+
+/*
+ * Opaque data types for storing keys and values
+ */
+typedef void *mod_hash_val_t;
+typedef void *mod_hash_key_t;
+
+/*
+ * Opaque data type for reservation
+ */
+typedef void *mod_hash_hndl_t;
+
+/*
+ * Opaque type for hash itself.
+ */
+struct mod_hash;
+typedef struct mod_hash mod_hash_t;
+
+/*
+ * String hash table
+ */
+mod_hash_t *mod_hash_create_strhash_nodtr(char *, size_t,
+ void (*)(mod_hash_val_t));
+mod_hash_t *mod_hash_create_strhash(char *, size_t, void (*)(mod_hash_val_t));
+void mod_hash_destroy_strhash(mod_hash_t *);
+int mod_hash_strkey_cmp(mod_hash_key_t, mod_hash_key_t);
+void mod_hash_strkey_dtor(mod_hash_key_t);
+void mod_hash_strval_dtor(mod_hash_val_t);
+uint_t mod_hash_bystr(void *, mod_hash_key_t);
+
+/*
+ * Pointer hash table
+ */
+mod_hash_t *mod_hash_create_ptrhash(char *, size_t, void (*)(mod_hash_val_t),
+ size_t);
+void mod_hash_destroy_ptrhash(mod_hash_t *);
+int mod_hash_ptrkey_cmp(mod_hash_key_t, mod_hash_key_t);
+uint_t mod_hash_byptr(void *, mod_hash_key_t);
+
+/*
+ * ID hash table
+ */
+mod_hash_t *mod_hash_create_idhash(char *, size_t, void (*)(mod_hash_val_t));
+void mod_hash_destroy_idhash(mod_hash_t *);
+int mod_hash_idkey_cmp(mod_hash_key_t, mod_hash_key_t);
+uint_t mod_hash_byid(void *, mod_hash_key_t);
+uint_t mod_hash_iddata_gen(size_t);
+
+/*
+ * Hash management functions
+ */
+mod_hash_t *mod_hash_create_extended(char *, size_t, void (*)(mod_hash_key_t),
+ void (*)(mod_hash_val_t), uint_t (*)(void *, mod_hash_key_t), void *,
+ int (*)(mod_hash_key_t, mod_hash_key_t), int);
+
+void mod_hash_destroy_hash(mod_hash_t *);
+void mod_hash_clear(mod_hash_t *);
+
+/*
+ * Null key and value destructors
+ */
+void mod_hash_null_keydtor(mod_hash_key_t);
+void mod_hash_null_valdtor(mod_hash_val_t);
+
+/*
+ * Basic hash operations
+ */
+
+/*
+ * Error codes for insert, remove, find, destroy.
+ */
+#define MH_ERR_NOMEM -1
+#define MH_ERR_DUPLICATE -2
+#define MH_ERR_NOTFOUND -3
+
+/*
+ * Return codes for hash walkers
+ */
+#define MH_WALK_CONTINUE 0
+#define MH_WALK_TERMINATE 1
+
+/*
+ * Basic hash operations
+ */
+int mod_hash_insert(mod_hash_t *, mod_hash_key_t, mod_hash_val_t);
+int mod_hash_replace(mod_hash_t *, mod_hash_key_t, mod_hash_val_t);
+int mod_hash_remove(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+int mod_hash_destroy(mod_hash_t *, mod_hash_key_t);
+int mod_hash_find(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+int mod_hash_find_cb(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *,
+ void (*)(mod_hash_key_t, mod_hash_val_t));
+int mod_hash_find_cb_rval(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *,
+ int (*)(mod_hash_key_t, mod_hash_val_t), int *);
+void mod_hash_walk(mod_hash_t *,
+ uint_t (*)(mod_hash_key_t, mod_hash_val_t *, void *), void *);
+
+/*
+ * Reserving hash operations
+ */
+int mod_hash_reserve(mod_hash_t *, mod_hash_hndl_t *);
+int mod_hash_reserve_nosleep(mod_hash_t *, mod_hash_hndl_t *);
+void mod_hash_cancel(mod_hash_t *, mod_hash_hndl_t *);
+int mod_hash_insert_reserve(mod_hash_t *, mod_hash_key_t, mod_hash_val_t,
+ mod_hash_hndl_t);
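+
+/*
+ * Illustrative sketch (editorial addition): typical use of the string-hash
+ * flavour, with the no-key-destructor variant so statically allocated keys
+ * are safe. The table name, size, key and value are arbitrary examples and
+ * error handling is omitted.
+ */
+static inline int
+example_modhash_use(void)
+{
+ static char name[] = "example hash";
+ static char key[] = "example-key";
+ static char val[] = "example-value";
+ mod_hash_t *h;
+ mod_hash_val_t found;
+ int err;
+
+ h = mod_hash_create_strhash_nodtr(name, 64, mod_hash_null_valdtor);
+ err = mod_hash_insert(h, (mod_hash_key_t)key, (mod_hash_val_t)val);
+ if (err == 0)
+ err = mod_hash_find(h, (mod_hash_key_t)key, &found);
+ mod_hash_destroy_strhash(h);
+ return (err);
+}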
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MODHASH_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/modhash_impl.h b/sys/contrib/openzfs/module/icp/include/sys/modhash_impl.h
new file mode 100644
index 000000000000..3130773aa196
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/modhash_impl.h
@@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MODHASH_IMPL_H
+#define _SYS_MODHASH_IMPL_H
+
+/*
+ * Internal details for the kernel's generic hash implementation.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/modhash.h>
+
+struct mod_hash_entry {
+ mod_hash_key_t mhe_key; /* stored hash key */
+ mod_hash_val_t mhe_val; /* stored hash value */
+ struct mod_hash_entry *mhe_next; /* next item in chain */
+};
+
+struct mod_hash_stat {
+ ulong_t mhs_hit; /* tried a 'find' and it succeeded */
+ ulong_t mhs_miss; /* tried a 'find' but it failed */
+ ulong_t mhs_coll; /* times an insert failed because of a duplicate key */
+ ulong_t mhs_nelems; /* total number of stored key/value pairs */
+ ulong_t mhs_nomem; /* number of times kmem_alloc failed */
+};
+
+struct mod_hash {
+ krwlock_t mh_contents; /* lock protecting contents */
+ char *mh_name; /* hash name */
+ int mh_sleep; /* kmem_alloc flag */
+ size_t mh_nchains; /* # of elements in mh_entries */
+
+ /* key and val destructor */
+ void (*mh_kdtor)(mod_hash_key_t);
+ void (*mh_vdtor)(mod_hash_val_t);
+
+ /* key comparator */
+ int (*mh_keycmp)(mod_hash_key_t, mod_hash_key_t);
+
+ /* hash algorithm, and algorithm-private data */
+ uint_t (*mh_hashalg)(void *, mod_hash_key_t);
+ void *mh_hashalg_data;
+
+ struct mod_hash *mh_next; /* next hash in list */
+
+ struct mod_hash_stat mh_stat;
+
+ struct mod_hash_entry *mh_entries[1];
+};
+
+/*
+ * MH_SIZE()
+ * Compute the size of a mod_hash_t, in bytes, given the number of
+ * elements it contains.
+ */
+#define MH_SIZE(n) \
+ (sizeof (mod_hash_t) + ((n) - 1) * (sizeof (struct mod_hash_entry *)))
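+
+/*
+ * Illustrative sketch (editorial addition): MH_SIZE() gives the allocation
+ * size for a table with n chains; e.g. the implementation would allocate a
+ * 64-chain table with roughly kmem_zalloc(MH_SIZE(64), KM_SLEEP). The helper
+ * below is hypothetical.
+ */
+static inline size_t
+example_modhash_alloc_size(size_t nchains)
+{
+ /* header (which embeds one chain pointer) plus nchains - 1 more */
+ return (MH_SIZE(nchains));
+}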
+
+/*
+ * Module initialization and teardown; each is called once.
+ */
+void mod_hash_fini(void);
+void mod_hash_init(void);
+
+/*
+ * Internal routines. Use directly with care.
+ */
+uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t);
+int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t,
+ mod_hash_hndl_t);
+int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+void i_mod_hash_walk_nosync(mod_hash_t *, uint_t (*)(mod_hash_key_t,
+ mod_hash_val_t *, void *), void *);
+void i_mod_hash_clear_nosync(mod_hash_t *hash);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MODHASH_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/stack.h b/sys/contrib/openzfs/module/icp/include/sys/stack.h
new file mode 100644
index 000000000000..64fecf409b5c
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/stack.h
@@ -0,0 +1,36 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_STACK_H
+#define _SYS_STACK_H
+
+#if defined(__i386) || defined(__amd64)
+
+#include <sys/ia32/stack.h> /* XX64 x86/sys/stack.h */
+
+#endif
+
+#endif /* _SYS_STACK_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/trap.h b/sys/contrib/openzfs/module/icp/include/sys/trap.h
new file mode 100644
index 000000000000..7f9fd375805f
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/trap.h
@@ -0,0 +1,36 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TRAP_H
+#define _SYS_TRAP_H
+
+#if defined(__i386) || defined(__amd64)
+
+#include <sys/ia32/trap.h> /* XX64 x86/sys/trap.h */
+
+#endif
+
+#endif /* _SYS_TRAP_H */
diff --git a/sys/contrib/openzfs/module/icp/io/aes.c b/sys/contrib/openzfs/module/icp/io/aes.c
new file mode 100644
index 000000000000..e540af4473f7
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/io/aes.c
@@ -0,0 +1,1457 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * AES provider for the Kernel Cryptographic Framework (KCF)
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/icp.h>
+#include <modes/modes.h>
+#include <sys/modctl.h>
+#define _AES_IMPL
+#include <aes/aes_impl.h>
+#include <modes/gcm_impl.h>
+
+#define CRYPTO_PROVIDER_NAME "aes"
+
+extern struct mod_ops mod_cryptoops;
+
+/*
+ * Module linkage information for the kernel.
+ */
+static struct modlcrypto modlcrypto = {
+ &mod_cryptoops,
+ "AES Kernel SW Provider"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, { (void *)&modlcrypto, NULL }
+};
+
+/*
+ * Mechanism info structure passed to KCF during registration.
+ */
+static crypto_mech_info_t aes_mech_info_tab[] = {
+ /* AES_ECB */
+ {SUN_CKM_AES_ECB, AES_ECB_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* AES_CBC */
+ {SUN_CKM_AES_CBC, AES_CBC_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* AES_CTR */
+ {SUN_CKM_AES_CTR, AES_CTR_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* AES_CCM */
+ {SUN_CKM_AES_CCM, AES_CCM_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* AES_GCM */
+ {SUN_CKM_AES_GCM, AES_GCM_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* AES_GMAC */
+ {SUN_CKM_AES_GMAC, AES_GMAC_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC |
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC |
+ CRYPTO_FG_SIGN | CRYPTO_FG_SIGN_ATOMIC |
+ CRYPTO_FG_VERIFY | CRYPTO_FG_VERIFY_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}
+};
+
+static void aes_provider_status(crypto_provider_handle_t, uint_t *);
+
+static crypto_control_ops_t aes_control_ops = {
+ aes_provider_status
+};
+
+static int aes_encrypt_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int aes_decrypt_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int aes_common_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t, boolean_t);
+static int aes_common_init_ctx(aes_ctx_t *, crypto_spi_ctx_template_t *,
+ crypto_mechanism_t *, crypto_key_t *, int, boolean_t);
+static int aes_encrypt_final(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int aes_decrypt_final(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+
+static int aes_encrypt(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int aes_encrypt_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+static int aes_encrypt_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static int aes_decrypt(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int aes_decrypt_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+static int aes_decrypt_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static crypto_cipher_ops_t aes_cipher_ops = {
+ .encrypt_init = aes_encrypt_init,
+ .encrypt = aes_encrypt,
+ .encrypt_update = aes_encrypt_update,
+ .encrypt_final = aes_encrypt_final,
+ .encrypt_atomic = aes_encrypt_atomic,
+ .decrypt_init = aes_decrypt_init,
+ .decrypt = aes_decrypt,
+ .decrypt_update = aes_decrypt_update,
+ .decrypt_final = aes_decrypt_final,
+ .decrypt_atomic = aes_decrypt_atomic
+};
+
+static int aes_mac_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int aes_mac_verify_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static crypto_mac_ops_t aes_mac_ops = {
+ .mac_init = NULL,
+ .mac = NULL,
+ .mac_update = NULL,
+ .mac_final = NULL,
+ .mac_atomic = aes_mac_atomic,
+ .mac_verify_atomic = aes_mac_verify_atomic
+};
+
+static int aes_create_ctx_template(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *,
+ size_t *, crypto_req_handle_t);
+static int aes_free_context(crypto_ctx_t *);
+
+static crypto_ctx_ops_t aes_ctx_ops = {
+ .create_ctx_template = aes_create_ctx_template,
+ .free_context = aes_free_context
+};
+
+static crypto_ops_t aes_crypto_ops = {{{{{
+ &aes_control_ops,
+ NULL,
+ &aes_cipher_ops,
+ &aes_mac_ops,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &aes_ctx_ops
+}}}}};
+
+static crypto_provider_info_t aes_prov_info = {{{{
+ CRYPTO_SPI_VERSION_1,
+ "AES Software Provider",
+ CRYPTO_SW_PROVIDER,
+ NULL,
+ &aes_crypto_ops,
+ sizeof (aes_mech_info_tab)/sizeof (crypto_mech_info_t),
+ aes_mech_info_tab
+}}}};
+
+static crypto_kcf_provider_handle_t aes_prov_handle = 0;
+static crypto_data_t null_crypto_data = { CRYPTO_DATA_RAW };
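+
+/*
+ * A minimal sketch of how a kernel consumer might drive one of the AES
+ * mechanisms registered above through KCF, assuming the consumer API
+ * declarations (crypto_mech2id() and crypto_encrypt()) from
+ * <sys/crypto/api.h>.  The function and variable names are illustrative
+ * only; the key must be passed by value (ck_format == CRYPTO_KEY_RAW)
+ * and gcmp points at a filled-in CK_AES_GCM_PARAMS (IV, AAD, tag bits).
+ *
+ *	static int
+ *	example_aes_gcm_encrypt(crypto_key_t *key, CK_AES_GCM_PARAMS *gcmp,
+ *	    crypto_data_t *plain, crypto_data_t *cipher)
+ *	{
+ *		crypto_mechanism_t mech;
+ *
+ *		mech.cm_type = crypto_mech2id(SUN_CKM_AES_GCM);
+ *		mech.cm_param = (char *)gcmp;
+ *		mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
+ *
+ *		return (crypto_encrypt(&mech, plain, key, NULL, cipher, NULL));
+ *	}
+ */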
+
+int
+aes_mod_init(void)
+{
+ int ret;
+
+ /* Determine the fastest available implementation. */
+ aes_impl_init();
+ gcm_impl_init();
+
+ if ((ret = mod_install(&modlinkage)) != 0)
+ return (ret);
+
+ /* Register with KCF. If the registration fails, remove the module. */
+ if (crypto_register_provider(&aes_prov_info, &aes_prov_handle)) {
+ (void) mod_remove(&modlinkage);
+ return (EACCES);
+ }
+
+ return (0);
+}
+
+int
+aes_mod_fini(void)
+{
+ /* Unregister from KCF if module is registered */
+ if (aes_prov_handle != 0) {
+ if (crypto_unregister_provider(aes_prov_handle))
+ return (EBUSY);
+
+ aes_prov_handle = 0;
+ }
+
+ return (mod_remove(&modlinkage));
+}
+
+static int
+aes_check_mech_param(crypto_mechanism_t *mechanism, aes_ctx_t **ctx, int kmflag)
+{
+ void *p = NULL;
+ boolean_t param_required = B_TRUE;
+ size_t param_len;
+ void *(*alloc_fun)(int);
+ int rv = CRYPTO_SUCCESS;
+
+ switch (mechanism->cm_type) {
+ case AES_ECB_MECH_INFO_TYPE:
+ param_required = B_FALSE;
+ alloc_fun = ecb_alloc_ctx;
+ break;
+ case AES_CBC_MECH_INFO_TYPE:
+ param_len = AES_BLOCK_LEN;
+ alloc_fun = cbc_alloc_ctx;
+ break;
+ case AES_CTR_MECH_INFO_TYPE:
+ param_len = sizeof (CK_AES_CTR_PARAMS);
+ alloc_fun = ctr_alloc_ctx;
+ break;
+ case AES_CCM_MECH_INFO_TYPE:
+ param_len = sizeof (CK_AES_CCM_PARAMS);
+ alloc_fun = ccm_alloc_ctx;
+ break;
+ case AES_GCM_MECH_INFO_TYPE:
+ param_len = sizeof (CK_AES_GCM_PARAMS);
+ alloc_fun = gcm_alloc_ctx;
+ break;
+ case AES_GMAC_MECH_INFO_TYPE:
+ param_len = sizeof (CK_AES_GMAC_PARAMS);
+ alloc_fun = gmac_alloc_ctx;
+ break;
+ default:
+ rv = CRYPTO_MECHANISM_INVALID;
+ return (rv);
+ }
+ if (param_required && mechanism->cm_param != NULL &&
+ mechanism->cm_param_len != param_len) {
+ rv = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+ if (ctx != NULL) {
+ p = (alloc_fun)(kmflag);
+ *ctx = p;
+ }
+ return (rv);
+}
+
+/*
+ * Initialize key schedules for AES
+ */
+static int
+init_keysched(crypto_key_t *key, void *newbie)
+{
+ /*
+ * Only keys by value are supported by this module.
+ */
+ switch (key->ck_format) {
+ case CRYPTO_KEY_RAW:
+ if (key->ck_length < AES_MINBITS ||
+ key->ck_length > AES_MAXBITS) {
+ return (CRYPTO_KEY_SIZE_RANGE);
+ }
+
+ /* key length must be either 128, 192, or 256 */
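+ /* (all are multiples of 64, so a valid length has its low 6 bits clear) */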
+ if ((key->ck_length & 63) != 0)
+ return (CRYPTO_KEY_SIZE_RANGE);
+ break;
+ default:
+ return (CRYPTO_KEY_TYPE_INCONSISTENT);
+ }
+
+ aes_init_keysched(key->ck_data, key->ck_length, newbie);
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * KCF software provider control entry points.
+ */
+/* ARGSUSED */
+static void
+aes_provider_status(crypto_provider_handle_t provider, uint_t *status)
+{
+ *status = CRYPTO_PROVIDER_READY;
+}
+
+static int
+aes_encrypt_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t template,
+ crypto_req_handle_t req)
+{
+ return (aes_common_init(ctx, mechanism, key, template, req, B_TRUE));
+}
+
+static int
+aes_decrypt_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t template,
+ crypto_req_handle_t req)
+{
+ return (aes_common_init(ctx, mechanism, key, template, req, B_FALSE));
+}
+
+
+
+/*
+ * KCF software provider encrypt entry points.
+ */
+static int
+aes_common_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t template,
+ crypto_req_handle_t req, boolean_t is_encrypt_init)
+{
+ aes_ctx_t *aes_ctx;
+ int rv;
+ int kmflag;
+
+ /*
+ * Only keys by value are supported by this module.
+ */
+ if (key->ck_format != CRYPTO_KEY_RAW) {
+ return (CRYPTO_KEY_TYPE_INCONSISTENT);
+ }
+
+ kmflag = crypto_kmflag(req);
+ if ((rv = aes_check_mech_param(mechanism, &aes_ctx, kmflag))
+ != CRYPTO_SUCCESS)
+ return (rv);
+
+ rv = aes_common_init_ctx(aes_ctx, template, mechanism, key, kmflag,
+ is_encrypt_init);
+ if (rv != CRYPTO_SUCCESS) {
+ crypto_free_mode_ctx(aes_ctx);
+ return (rv);
+ }
+
+ ctx->cc_provider_private = aes_ctx;
+
+ return (CRYPTO_SUCCESS);
+}
+
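+/*
+ * Copy a 16-byte AES block into a 64-bit destination, using two 64-bit
+ * loads when the source is suitably aligned and a byte-wise copy
+ * otherwise.  This is the block-copy callback handed to the common mode
+ * code (crypto_update_iov()/crypto_update_uio() and cbc_init_ctx()).
+ */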
+static void
+aes_copy_block64(uint8_t *in, uint64_t *out)
+{
+ if (IS_P2ALIGNED(in, sizeof (uint64_t))) {
+ /* LINTED: pointer alignment */
+ out[0] = *(uint64_t *)&in[0];
+ /* LINTED: pointer alignment */
+ out[1] = *(uint64_t *)&in[8];
+ } else {
+ uint8_t *iv8 = (uint8_t *)&out[0];
+
+ AES_COPY_BLOCK(in, iv8);
+ }
+}
+
+
+static int
+aes_encrypt(crypto_ctx_t *ctx, crypto_data_t *plaintext,
+ crypto_data_t *ciphertext, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_FAILED;
+
+ aes_ctx_t *aes_ctx;
+ size_t saved_length, saved_offset, length_needed;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ /*
+ * For block ciphers, plaintext must be a multiple of AES block size.
+ * This test is only valid for ciphers whose blocksize is a power of 2.
+ */
+ if (((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE))
+ == 0) && (plaintext->cd_length & (AES_BLOCK_LEN - 1)) != 0)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ ASSERT(ciphertext != NULL);
+
+ /*
+ * If the output buffer is too small, just return the length needed
+ * to store the output; do not destroy the context in that case.
+ */
+ switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) {
+ case CCM_MODE:
+ length_needed = plaintext->cd_length + aes_ctx->ac_mac_len;
+ break;
+ case GCM_MODE:
+ length_needed = plaintext->cd_length + aes_ctx->ac_tag_len;
+ break;
+ case GMAC_MODE:
+ if (plaintext->cd_length != 0)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ length_needed = aes_ctx->ac_tag_len;
+ break;
+ default:
+ length_needed = plaintext->cd_length;
+ }
+
+ if (ciphertext->cd_length < length_needed) {
+ ciphertext->cd_length = length_needed;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ saved_length = ciphertext->cd_length;
+ saved_offset = ciphertext->cd_offset;
+
+ /*
+ * Do an update on the specified input data.
+ */
+ ret = aes_encrypt_update(ctx, plaintext, ciphertext, req);
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+
+ /*
+ * For CCM mode, ccm_encrypt_final() will take care of any leftover
+ * unprocessed data and compute the MAC.
+ */
+ if (aes_ctx->ac_flags & CCM_MODE) {
+ /*
+ * ccm_encrypt_final() computes the MAC and appends it to the
+ * existing ciphertext, so the leftover length value must be
+ * adjusted accordingly.
+ */
+
+ /* the order of the following two lines must not be reversed */
+ ciphertext->cd_offset = ciphertext->cd_length;
+ ciphertext->cd_length = saved_length - ciphertext->cd_length;
+ ret = ccm_encrypt_final((ccm_ctx_t *)aes_ctx, ciphertext,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+
+ if (plaintext != ciphertext) {
+ ciphertext->cd_length =
+ ciphertext->cd_offset - saved_offset;
+ }
+ ciphertext->cd_offset = saved_offset;
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ /*
+ * gcm_encrypt_final() computes the MAC and appends it to the
+ * existing ciphertext, so the leftover length value must be
+ * adjusted accordingly.
+ */
+
+ /* the order of the following two lines must not be reversed */
+ ciphertext->cd_offset = ciphertext->cd_length;
+ ciphertext->cd_length = saved_length - ciphertext->cd_length;
+ ret = gcm_encrypt_final((gcm_ctx_t *)aes_ctx, ciphertext,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+
+ if (plaintext != ciphertext) {
+ ciphertext->cd_length =
+ ciphertext->cd_offset - saved_offset;
+ }
+ ciphertext->cd_offset = saved_offset;
+ }
+
+ ASSERT(aes_ctx->ac_remainder_len == 0);
+ (void) aes_free_context(ctx);
+
+ return (ret);
+}
+
+
+static int
+aes_decrypt(crypto_ctx_t *ctx, crypto_data_t *ciphertext,
+ crypto_data_t *plaintext, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_FAILED;
+
+ aes_ctx_t *aes_ctx;
+ off_t saved_offset;
+ size_t saved_length, length_needed;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ /*
+ * For block ciphers, the ciphertext length must be a multiple of the
+ * AES block size. This test is only valid for ciphers whose block
+ * size is a power of 2.
+ */
+ if (((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE))
+ == 0) && (ciphertext->cd_length & (AES_BLOCK_LEN - 1)) != 0) {
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+ }
+
+ ASSERT(plaintext != NULL);
+
+ /*
+ * Return the length needed to store the output, and do not destroy
+ * the context when the plaintext buffer is too small.
+ *
+ * CCM:  plaintext is MAC length shorter than the ciphertext
+ * GCM:  plaintext is tag length shorter than the ciphertext
+ * GMAC: plaintext length must be zero
+ */
+ switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) {
+ case CCM_MODE:
+ length_needed = aes_ctx->ac_processed_data_len;
+ break;
+ case GCM_MODE:
+ length_needed = ciphertext->cd_length - aes_ctx->ac_tag_len;
+ break;
+ case GMAC_MODE:
+ if (plaintext->cd_length != 0)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ length_needed = 0;
+ break;
+ default:
+ length_needed = ciphertext->cd_length;
+ }
+
+ if (plaintext->cd_length < length_needed) {
+ plaintext->cd_length = length_needed;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ saved_offset = plaintext->cd_offset;
+ saved_length = plaintext->cd_length;
+
+ /*
+ * Do an update on the specified input data.
+ */
+ ret = aes_decrypt_update(ctx, ciphertext, plaintext, req);
+ if (ret != CRYPTO_SUCCESS) {
+ goto cleanup;
+ }
+
+ if (aes_ctx->ac_flags & CCM_MODE) {
+ ASSERT(aes_ctx->ac_processed_data_len == aes_ctx->ac_data_len);
+ ASSERT(aes_ctx->ac_processed_mac_len == aes_ctx->ac_mac_len);
+
+ /* the order of the following two lines must not be reversed */
+ plaintext->cd_offset = plaintext->cd_length;
+ plaintext->cd_length = saved_length - plaintext->cd_length;
+
+ ret = ccm_decrypt_final((ccm_ctx_t *)aes_ctx, plaintext,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ if (ret == CRYPTO_SUCCESS) {
+ if (plaintext != ciphertext) {
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ }
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+
+ plaintext->cd_offset = saved_offset;
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ /* the order of the following two lines must not be reversed */
+ plaintext->cd_offset = plaintext->cd_length;
+ plaintext->cd_length = saved_length - plaintext->cd_length;
+
+ ret = gcm_decrypt_final((gcm_ctx_t *)aes_ctx, plaintext,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ if (ret == CRYPTO_SUCCESS) {
+ if (plaintext != ciphertext) {
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ }
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+
+ plaintext->cd_offset = saved_offset;
+ }
+
+ ASSERT(aes_ctx->ac_remainder_len == 0);
+
+cleanup:
+ (void) aes_free_context(ctx);
+
+ return (ret);
+}
+
+
+/* ARGSUSED */
+static int
+aes_encrypt_update(crypto_ctx_t *ctx, crypto_data_t *plaintext,
+ crypto_data_t *ciphertext, crypto_req_handle_t req)
+{
+ off_t saved_offset;
+ size_t saved_length, out_len;
+ int ret = CRYPTO_SUCCESS;
+ aes_ctx_t *aes_ctx;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ ASSERT(ciphertext != NULL);
+
+ /* compute number of bytes that will hold the ciphertext */
+ out_len = aes_ctx->ac_remainder_len;
+ out_len += plaintext->cd_length;
+ out_len &= ~(AES_BLOCK_LEN - 1);
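+ /*
+ * out_len is rounded down to whole blocks; any partial block is
+ * carried in the context as remainder until a later update or final.
+ */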
+
+ /* return length needed to store the output */
+ if (ciphertext->cd_length < out_len) {
+ ciphertext->cd_length = out_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ saved_offset = ciphertext->cd_offset;
+ saved_length = ciphertext->cd_length;
+
+ /*
+ * Do the AES update on the specified input data.
+ */
+ switch (plaintext->cd_format) {
+ case CRYPTO_DATA_RAW:
+ ret = crypto_update_iov(ctx->cc_provider_private,
+ plaintext, ciphertext, aes_encrypt_contiguous_blocks,
+ aes_copy_block64);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = crypto_update_uio(ctx->cc_provider_private,
+ plaintext, ciphertext, aes_encrypt_contiguous_blocks,
+ aes_copy_block64);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /*
+ * Since AES counter mode is a stream cipher, we call
+ * ctr_mode_final() to pick up any remaining bytes.
+ * It is an internal function that does not destroy
+ * the context like *normal* final routines.
+ */
+ if ((aes_ctx->ac_flags & CTR_MODE) && (aes_ctx->ac_remainder_len > 0)) {
+ ret = ctr_mode_final((ctr_ctx_t *)aes_ctx,
+ ciphertext, aes_encrypt_block);
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ if (plaintext != ciphertext)
+ ciphertext->cd_length =
+ ciphertext->cd_offset - saved_offset;
+ } else {
+ ciphertext->cd_length = saved_length;
+ }
+ ciphertext->cd_offset = saved_offset;
+
+ return (ret);
+}
+
+
+static int
+aes_decrypt_update(crypto_ctx_t *ctx, crypto_data_t *ciphertext,
+ crypto_data_t *plaintext, crypto_req_handle_t req)
+{
+ off_t saved_offset;
+ size_t saved_length, out_len;
+ int ret = CRYPTO_SUCCESS;
+ aes_ctx_t *aes_ctx;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ ASSERT(plaintext != NULL);
+
+ /*
+ * Compute number of bytes that will hold the plaintext.
+ * This is not necessary for CCM, GCM, and GMAC since these
+ * mechanisms never return plaintext for update operations.
+ */
+ if ((aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) {
+ out_len = aes_ctx->ac_remainder_len;
+ out_len += ciphertext->cd_length;
+ out_len &= ~(AES_BLOCK_LEN - 1);
+
+ /* return length needed to store the output */
+ if (plaintext->cd_length < out_len) {
+ plaintext->cd_length = out_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+ }
+
+ saved_offset = plaintext->cd_offset;
+ saved_length = plaintext->cd_length;
+
+ if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE))
+ gcm_set_kmflag((gcm_ctx_t *)aes_ctx, crypto_kmflag(req));
+
+ /*
+ * Do the AES update on the specified input data.
+ */
+ switch (ciphertext->cd_format) {
+ case CRYPTO_DATA_RAW:
+ ret = crypto_update_iov(ctx->cc_provider_private,
+ ciphertext, plaintext, aes_decrypt_contiguous_blocks,
+ aes_copy_block64);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = crypto_update_uio(ctx->cc_provider_private,
+ ciphertext, plaintext, aes_decrypt_contiguous_blocks,
+ aes_copy_block64);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /*
+ * Since AES counter mode is a stream cipher, we call
+ * ctr_mode_final() to pick up any remaining bytes.
+ * It is an internal function that does not destroy
+ * the context like *normal* final routines.
+ */
+ if ((aes_ctx->ac_flags & CTR_MODE) && (aes_ctx->ac_remainder_len > 0)) {
+ ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, plaintext,
+ aes_encrypt_block);
+ if (ret == CRYPTO_DATA_LEN_RANGE)
+ ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ if (ciphertext != plaintext)
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+ plaintext->cd_offset = saved_offset;
+
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+aes_encrypt_final(crypto_ctx_t *ctx, crypto_data_t *data,
+ crypto_req_handle_t req)
+{
+ aes_ctx_t *aes_ctx;
+ int ret;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ if (data->cd_format != CRYPTO_DATA_RAW &&
+ data->cd_format != CRYPTO_DATA_UIO) {
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ if (aes_ctx->ac_flags & CTR_MODE) {
+ if (aes_ctx->ac_remainder_len > 0) {
+ ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, data,
+ aes_encrypt_block);
+ if (ret != CRYPTO_SUCCESS)
+ return (ret);
+ }
+ } else if (aes_ctx->ac_flags & CCM_MODE) {
+ ret = ccm_encrypt_final((ccm_ctx_t *)aes_ctx, data,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ size_t saved_offset = data->cd_offset;
+
+ ret = gcm_encrypt_final((gcm_ctx_t *)aes_ctx, data,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+ data->cd_length = data->cd_offset - saved_offset;
+ data->cd_offset = saved_offset;
+ } else {
+ /*
+ * There must be no unprocessed plaintext; plaintext is left
+ * unprocessed when the length of the last data is not a
+ * multiple of the AES block length.
+ */
+ if (aes_ctx->ac_remainder_len > 0) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ data->cd_length = 0;
+ }
+
+ (void) aes_free_context(ctx);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+aes_decrypt_final(crypto_ctx_t *ctx, crypto_data_t *data,
+ crypto_req_handle_t req)
+{
+ aes_ctx_t *aes_ctx;
+ int ret;
+ off_t saved_offset;
+ size_t saved_length;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ if (data->cd_format != CRYPTO_DATA_RAW &&
+ data->cd_format != CRYPTO_DATA_UIO) {
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ /*
+ * There must be no unprocessed ciphertext; ciphertext is left
+ * unprocessed when the length of the last ciphertext is not a
+ * multiple of the AES block length.
+ */
+ if (aes_ctx->ac_remainder_len > 0) {
+ if ((aes_ctx->ac_flags & CTR_MODE) == 0)
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+ else {
+ ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, data,
+ aes_encrypt_block);
+ if (ret == CRYPTO_DATA_LEN_RANGE)
+ ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
+ if (ret != CRYPTO_SUCCESS)
+ return (ret);
+ }
+ }
+
+ if (aes_ctx->ac_flags & CCM_MODE) {
+ /*
+ * This is where all the plaintext is returned; make sure
+ * the plaintext buffer is big enough.
+ */
+ size_t pt_len = aes_ctx->ac_data_len;
+ if (data->cd_length < pt_len) {
+ data->cd_length = pt_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ ASSERT(aes_ctx->ac_processed_data_len == pt_len);
+ ASSERT(aes_ctx->ac_processed_mac_len == aes_ctx->ac_mac_len);
+ saved_offset = data->cd_offset;
+ saved_length = data->cd_length;
+ ret = ccm_decrypt_final((ccm_ctx_t *)aes_ctx, data,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ if (ret == CRYPTO_SUCCESS) {
+ data->cd_length = data->cd_offset - saved_offset;
+ } else {
+ data->cd_length = saved_length;
+ }
+
+ data->cd_offset = saved_offset;
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ /*
+ * This is where all the plaintext is returned; make sure
+ * the plaintext buffer is big enough.
+ */
+ gcm_ctx_t *ctx = (gcm_ctx_t *)aes_ctx;
+ size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
+
+ if (data->cd_length < pt_len) {
+ data->cd_length = pt_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ saved_offset = data->cd_offset;
+ saved_length = data->cd_length;
+ ret = gcm_decrypt_final((gcm_ctx_t *)aes_ctx, data,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ if (ret == CRYPTO_SUCCESS) {
+ data->cd_length = data->cd_offset - saved_offset;
+ } else {
+ data->cd_length = saved_length;
+ }
+
+ data->cd_offset = saved_offset;
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+ }
+
+
+ if ((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) {
+ data->cd_length = 0;
+ }
+
+ (void) aes_free_context(ctx);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+aes_encrypt_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *plaintext, crypto_data_t *ciphertext,
+ crypto_spi_ctx_template_t template, crypto_req_handle_t req)
+{
+ aes_ctx_t aes_ctx; /* on the stack */
+ off_t saved_offset;
+ size_t saved_length;
+ size_t length_needed;
+ int ret;
+
+ ASSERT(ciphertext != NULL);
+
+ /*
+ * CTR, CCM, GCM, and GMAC modes do not require that plaintext
+ * be a multiple of AES block size.
+ */
+ switch (mechanism->cm_type) {
+ case AES_CTR_MECH_INFO_TYPE:
+ case AES_CCM_MECH_INFO_TYPE:
+ case AES_GCM_MECH_INFO_TYPE:
+ case AES_GMAC_MECH_INFO_TYPE:
+ break;
+ default:
+ if ((plaintext->cd_length & (AES_BLOCK_LEN - 1)) != 0)
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ if ((ret = aes_check_mech_param(mechanism, NULL, 0)) != CRYPTO_SUCCESS)
+ return (ret);
+
+ bzero(&aes_ctx, sizeof (aes_ctx_t));
+
+ ret = aes_common_init_ctx(&aes_ctx, template, mechanism, key,
+ crypto_kmflag(req), B_TRUE);
+ if (ret != CRYPTO_SUCCESS)
+ return (ret);
+
+ switch (mechanism->cm_type) {
+ case AES_CCM_MECH_INFO_TYPE:
+ length_needed = plaintext->cd_length + aes_ctx.ac_mac_len;
+ break;
+ case AES_GMAC_MECH_INFO_TYPE:
+ if (plaintext->cd_length != 0)
+ return (CRYPTO_ARGUMENTS_BAD);
+ /* FALLTHRU */
+ case AES_GCM_MECH_INFO_TYPE:
+ length_needed = plaintext->cd_length + aes_ctx.ac_tag_len;
+ break;
+ default:
+ length_needed = plaintext->cd_length;
+ }
+
+ /* return size of buffer needed to store output */
+ if (ciphertext->cd_length < length_needed) {
+ ciphertext->cd_length = length_needed;
+ ret = CRYPTO_BUFFER_TOO_SMALL;
+ goto out;
+ }
+
+ saved_offset = ciphertext->cd_offset;
+ saved_length = ciphertext->cd_length;
+
+ /*
+ * Do an update on the specified input data.
+ */
+ switch (plaintext->cd_format) {
+ case CRYPTO_DATA_RAW:
+ ret = crypto_update_iov(&aes_ctx, plaintext, ciphertext,
+ aes_encrypt_contiguous_blocks, aes_copy_block64);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = crypto_update_uio(&aes_ctx, plaintext, ciphertext,
+ aes_encrypt_contiguous_blocks, aes_copy_block64);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ if (mechanism->cm_type == AES_CCM_MECH_INFO_TYPE) {
+ ret = ccm_encrypt_final((ccm_ctx_t *)&aes_ctx,
+ ciphertext, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_xor_block);
+ if (ret != CRYPTO_SUCCESS)
+ goto out;
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE ||
+ mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) {
+ ret = gcm_encrypt_final((gcm_ctx_t *)&aes_ctx,
+ ciphertext, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_copy_block, aes_xor_block);
+ if (ret != CRYPTO_SUCCESS)
+ goto out;
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ } else if (mechanism->cm_type == AES_CTR_MECH_INFO_TYPE) {
+ if (aes_ctx.ac_remainder_len > 0) {
+ ret = ctr_mode_final((ctr_ctx_t *)&aes_ctx,
+ ciphertext, aes_encrypt_block);
+ if (ret != CRYPTO_SUCCESS)
+ goto out;
+ }
+ } else {
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ }
+
+ if (plaintext != ciphertext) {
+ ciphertext->cd_length =
+ ciphertext->cd_offset - saved_offset;
+ }
+ } else {
+ ciphertext->cd_length = saved_length;
+ }
+ ciphertext->cd_offset = saved_offset;
+
+out:
+ if (aes_ctx.ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) {
+ bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
+ kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
+ }
+#ifdef CAN_USE_GCM_ASM
+ if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE) &&
+ ((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) {
+
+ gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx;
+
+ bzero(ctx->gcm_Htable, ctx->gcm_htab_len);
+ kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
+ }
+#endif
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+aes_decrypt_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *ciphertext, crypto_data_t *plaintext,
+ crypto_spi_ctx_template_t template, crypto_req_handle_t req)
+{
+ aes_ctx_t aes_ctx; /* on the stack */
+ off_t saved_offset;
+ size_t saved_length;
+ size_t length_needed;
+ int ret;
+
+ ASSERT(plaintext != NULL);
+
+ /*
+ * CCM, GCM, CTR, and GMAC modes do not require that ciphertext
+ * be a multiple of AES block size.
+ */
+ switch (mechanism->cm_type) {
+ case AES_CTR_MECH_INFO_TYPE:
+ case AES_CCM_MECH_INFO_TYPE:
+ case AES_GCM_MECH_INFO_TYPE:
+ case AES_GMAC_MECH_INFO_TYPE:
+ break;
+ default:
+ if ((ciphertext->cd_length & (AES_BLOCK_LEN - 1)) != 0)
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+ }
+
+ if ((ret = aes_check_mech_param(mechanism, NULL, 0)) != CRYPTO_SUCCESS)
+ return (ret);
+
+ bzero(&aes_ctx, sizeof (aes_ctx_t));
+
+ ret = aes_common_init_ctx(&aes_ctx, template, mechanism, key,
+ crypto_kmflag(req), B_FALSE);
+ if (ret != CRYPTO_SUCCESS)
+ return (ret);
+
+ switch (mechanism->cm_type) {
+ case AES_CCM_MECH_INFO_TYPE:
+ length_needed = aes_ctx.ac_data_len;
+ break;
+ case AES_GCM_MECH_INFO_TYPE:
+ length_needed = ciphertext->cd_length - aes_ctx.ac_tag_len;
+ break;
+ case AES_GMAC_MECH_INFO_TYPE:
+ if (plaintext->cd_length != 0)
+ return (CRYPTO_ARGUMENTS_BAD);
+ length_needed = 0;
+ break;
+ default:
+ length_needed = ciphertext->cd_length;
+ }
+
+ /* return size of buffer needed to store output */
+ if (plaintext->cd_length < length_needed) {
+ plaintext->cd_length = length_needed;
+ ret = CRYPTO_BUFFER_TOO_SMALL;
+ goto out;
+ }
+
+ saved_offset = plaintext->cd_offset;
+ saved_length = plaintext->cd_length;
+
+ if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE ||
+ mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE)
+ gcm_set_kmflag((gcm_ctx_t *)&aes_ctx, crypto_kmflag(req));
+
+ /*
+ * Do an update on the specified input data.
+ */
+ switch (ciphertext->cd_format) {
+ case CRYPTO_DATA_RAW:
+ ret = crypto_update_iov(&aes_ctx, ciphertext, plaintext,
+ aes_decrypt_contiguous_blocks, aes_copy_block64);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = crypto_update_uio(&aes_ctx, ciphertext, plaintext,
+ aes_decrypt_contiguous_blocks, aes_copy_block64);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ if (mechanism->cm_type == AES_CCM_MECH_INFO_TYPE) {
+ ASSERT(aes_ctx.ac_processed_data_len
+ == aes_ctx.ac_data_len);
+ ASSERT(aes_ctx.ac_processed_mac_len
+ == aes_ctx.ac_mac_len);
+ ret = ccm_decrypt_final((ccm_ctx_t *)&aes_ctx,
+ plaintext, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_copy_block, aes_xor_block);
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ if ((ret == CRYPTO_SUCCESS) &&
+ (ciphertext != plaintext)) {
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+ } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE ||
+ mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) {
+ ret = gcm_decrypt_final((gcm_ctx_t *)&aes_ctx,
+ plaintext, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_xor_block);
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ if ((ret == CRYPTO_SUCCESS) &&
+ (ciphertext != plaintext)) {
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+ } else if (mechanism->cm_type != AES_CTR_MECH_INFO_TYPE) {
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ if (ciphertext != plaintext)
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ } else {
+ if (aes_ctx.ac_remainder_len > 0) {
+ ret = ctr_mode_final((ctr_ctx_t *)&aes_ctx,
+ plaintext, aes_encrypt_block);
+ if (ret == CRYPTO_DATA_LEN_RANGE)
+ ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
+ if (ret != CRYPTO_SUCCESS)
+ goto out;
+ }
+ if (ciphertext != plaintext)
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ }
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+ plaintext->cd_offset = saved_offset;
+
+out:
+ if (aes_ctx.ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) {
+ bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
+ kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
+ }
+
+ if (aes_ctx.ac_flags & CCM_MODE) {
+ if (aes_ctx.ac_pt_buf != NULL) {
+ vmem_free(aes_ctx.ac_pt_buf, aes_ctx.ac_data_len);
+ }
+ } else if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE)) {
+ if (((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf != NULL) {
+ vmem_free(((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf,
+ ((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf_len);
+ }
+#ifdef CAN_USE_GCM_ASM
+ if (((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) {
+ gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx;
+
+ bzero(ctx->gcm_Htable, ctx->gcm_htab_len);
+ kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
+ }
+#endif
+ }
+
+ return (ret);
+}
+
+/*
+ * KCF software provider context template entry points.
+ */
+/* ARGSUSED */
+static int
+aes_create_ctx_template(crypto_provider_handle_t provider,
+ crypto_mechanism_t *mechanism, crypto_key_t *key,
+ crypto_spi_ctx_template_t *tmpl, size_t *tmpl_size, crypto_req_handle_t req)
+{
+ void *keysched;
+ size_t size;
+ int rv;
+
+ if (mechanism->cm_type != AES_ECB_MECH_INFO_TYPE &&
+ mechanism->cm_type != AES_CBC_MECH_INFO_TYPE &&
+ mechanism->cm_type != AES_CTR_MECH_INFO_TYPE &&
+ mechanism->cm_type != AES_CCM_MECH_INFO_TYPE &&
+ mechanism->cm_type != AES_GCM_MECH_INFO_TYPE &&
+ mechanism->cm_type != AES_GMAC_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ if ((keysched = aes_alloc_keysched(&size,
+ crypto_kmflag(req))) == NULL) {
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /*
+ * Initialize key schedule. Key length information is stored
+ * in the key.
+ */
+ if ((rv = init_keysched(key, keysched)) != CRYPTO_SUCCESS) {
+ bzero(keysched, size);
+ kmem_free(keysched, size);
+ return (rv);
+ }
+
+ *tmpl = keysched;
+ *tmpl_size = size;
+
+ return (CRYPTO_SUCCESS);
+}
+
+
+static int
+aes_free_context(crypto_ctx_t *ctx)
+{
+ aes_ctx_t *aes_ctx = ctx->cc_provider_private;
+
+ if (aes_ctx != NULL) {
+ if (aes_ctx->ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) {
+ ASSERT(aes_ctx->ac_keysched_len != 0);
+ bzero(aes_ctx->ac_keysched, aes_ctx->ac_keysched_len);
+ kmem_free(aes_ctx->ac_keysched,
+ aes_ctx->ac_keysched_len);
+ }
+ crypto_free_mode_ctx(aes_ctx);
+ ctx->cc_provider_private = NULL;
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+
+static int
+aes_common_init_ctx(aes_ctx_t *aes_ctx, crypto_spi_ctx_template_t *template,
+ crypto_mechanism_t *mechanism, crypto_key_t *key, int kmflag,
+ boolean_t is_encrypt_init)
+{
+ int rv = CRYPTO_SUCCESS;
+ void *keysched;
+ size_t size = 0;
+
+ if (template == NULL) {
+ if ((keysched = aes_alloc_keysched(&size, kmflag)) == NULL)
+ return (CRYPTO_HOST_MEMORY);
+ /*
+ * Initialize key schedule.
+ * Key length is stored in the key.
+ */
+ if ((rv = init_keysched(key, keysched)) != CRYPTO_SUCCESS) {
+ kmem_free(keysched, size);
+ return (rv);
+ }
+
+ aes_ctx->ac_flags |= PROVIDER_OWNS_KEY_SCHEDULE;
+ aes_ctx->ac_keysched_len = size;
+ } else {
+ keysched = template;
+ }
+ aes_ctx->ac_keysched = keysched;
+
+ switch (mechanism->cm_type) {
+ case AES_CBC_MECH_INFO_TYPE:
+ rv = cbc_init_ctx((cbc_ctx_t *)aes_ctx, mechanism->cm_param,
+ mechanism->cm_param_len, AES_BLOCK_LEN, aes_copy_block64);
+ break;
+ case AES_CTR_MECH_INFO_TYPE: {
+ CK_AES_CTR_PARAMS *pp;
+
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (CK_AES_CTR_PARAMS)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ pp = (CK_AES_CTR_PARAMS *)(void *)mechanism->cm_param;
+ rv = ctr_init_ctx((ctr_ctx_t *)aes_ctx, pp->ulCounterBits,
+ pp->cb, aes_copy_block);
+ break;
+ }
+ case AES_CCM_MECH_INFO_TYPE:
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (CK_AES_CCM_PARAMS)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ rv = ccm_init_ctx((ccm_ctx_t *)aes_ctx, mechanism->cm_param,
+ kmflag, is_encrypt_init, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_xor_block);
+ break;
+ case AES_GCM_MECH_INFO_TYPE:
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (CK_AES_GCM_PARAMS)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ rv = gcm_init_ctx((gcm_ctx_t *)aes_ctx, mechanism->cm_param,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ break;
+ case AES_GMAC_MECH_INFO_TYPE:
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (CK_AES_GMAC_PARAMS)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ rv = gmac_init_ctx((gcm_ctx_t *)aes_ctx, mechanism->cm_param,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ break;
+ case AES_ECB_MECH_INFO_TYPE:
+ aes_ctx->ac_flags |= ECB_MODE;
+ }
+
+ if (rv != CRYPTO_SUCCESS) {
+ if (aes_ctx->ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) {
+ bzero(keysched, size);
+ kmem_free(keysched, size);
+ }
+ }
+
+ return (rv);
+}
+
+static int
+process_gmac_mech(crypto_mechanism_t *mech, crypto_data_t *data,
+ CK_AES_GCM_PARAMS *gcm_params)
+{
+ /* LINTED: pointer alignment */
+ CK_AES_GMAC_PARAMS *params = (CK_AES_GMAC_PARAMS *)mech->cm_param;
+
+ if (mech->cm_type != AES_GMAC_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ if (mech->cm_param_len != sizeof (CK_AES_GMAC_PARAMS))
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+
+ if (params->pIv == NULL)
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+
+ gcm_params->pIv = params->pIv;
+ gcm_params->ulIvLen = AES_GMAC_IV_LEN;
+ gcm_params->ulTagBits = AES_GMAC_TAG_BITS;
+
+ if (data == NULL)
+ return (CRYPTO_SUCCESS);
+
+ if (data->cd_format != CRYPTO_DATA_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ gcm_params->pAAD = (uchar_t *)data->cd_raw.iov_base;
+ gcm_params->ulAADLen = data->cd_length;
+ return (CRYPTO_SUCCESS);
+}
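+
+/*
+ * GMAC is implemented as a thin wrapper around GCM: the data to be
+ * authenticated is passed to GCM as AAD, the plaintext is empty
+ * (null_crypto_data), and the IV length and tag size are fixed by
+ * AES_GMAC_IV_LEN and AES_GMAC_TAG_BITS.  The atomic MAC entry points
+ * below rebuild a CK_AES_GCM_PARAMS from the GMAC parameters and call
+ * the GCM atomic encrypt/decrypt routines.
+ */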
+
+static int
+aes_mac_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t template, crypto_req_handle_t req)
+{
+ CK_AES_GCM_PARAMS gcm_params;
+ crypto_mechanism_t gcm_mech;
+ int rv;
+
+ if ((rv = process_gmac_mech(mechanism, data, &gcm_params))
+ != CRYPTO_SUCCESS)
+ return (rv);
+
+ gcm_mech.cm_type = AES_GCM_MECH_INFO_TYPE;
+ gcm_mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
+ gcm_mech.cm_param = (char *)&gcm_params;
+
+ return (aes_encrypt_atomic(provider, session_id, &gcm_mech,
+ key, &null_crypto_data, mac, template, req));
+}
+
+static int
+aes_mac_verify_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t template, crypto_req_handle_t req)
+{
+ CK_AES_GCM_PARAMS gcm_params;
+ crypto_mechanism_t gcm_mech;
+ int rv;
+
+ if ((rv = process_gmac_mech(mechanism, data, &gcm_params))
+ != CRYPTO_SUCCESS)
+ return (rv);
+
+ gcm_mech.cm_type = AES_GCM_MECH_INFO_TYPE;
+ gcm_mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
+ gcm_mech.cm_param = (char *)&gcm_params;
+
+ return (aes_decrypt_atomic(provider, session_id, &gcm_mech,
+ key, mac, &null_crypto_data, template, req));
+}
diff --git a/sys/contrib/openzfs/module/icp/io/edonr_mod.c b/sys/contrib/openzfs/module/icp/io/edonr_mod.c
new file mode 100644
index 000000000000..a806af610629
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/io/edonr_mod.c
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/icp.h>
+#include <sys/crypto/spi.h>
+#include <sys/sysmacros.h>
+#include <sys/edonr.h>
+
+/*
+ * Unlike sha2 or skein, we won't expose edonr via the Kernel Cryptographic
+ * Framework (KCF), because Edon-R is *NOT* suitable for general-purpose
+ * cryptographic use. Users of Edon-R must interface directly to this module.
+ */
+
+static struct modlmisc modlmisc = {
+ &mod_cryptoops,
+ "Edon-R Message-Digest Algorithm"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, {&modlmisc, NULL}
+};
+
+int
+edonr_mod_init(void)
+{
+ int error;
+
+ if ((error = mod_install(&modlinkage)) != 0)
+ return (error);
+
+ return (0);
+}
+
+int
+edonr_mod_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
diff --git a/sys/contrib/openzfs/module/icp/io/sha1_mod.c b/sys/contrib/openzfs/module/icp/io/sha1_mod.c
new file mode 100644
index 000000000000..6dcee6b2ecf2
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/io/sha1_mod.c
@@ -0,0 +1,1230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/icp.h>
+#include <sys/crypto/spi.h>
+
+#include <sha1/sha1.h>
+#include <sha1/sha1_impl.h>
+
+/*
+ * The sha1 module is created with two modlinkages:
+ * - a modlmisc that allows consumers to directly call the entry points
+ * SHA1Init, SHA1Update, and SHA1Final.
+ * - a modlcrypto that allows the module to register with the Kernel
+ * Cryptographic Framework (KCF) as a software provider for the SHA1
+ * mechanisms.
+ */
+
+static struct modlcrypto modlcrypto = {
+ &mod_cryptoops,
+ "SHA1 Kernel SW Provider 1.1"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, { &modlcrypto, NULL }
+};
+
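+/*
+ * A minimal sketch of the direct (non-KCF) use described above, hashing a
+ * flat kernel buffer through the SHA1Init/SHA1Update/SHA1Final entry
+ * points.  The buf, buflen and digest names are illustrative only.
+ *
+ *	SHA1_CTX ctx;
+ *	uint8_t digest[SHA1_DIGEST_LENGTH];
+ *
+ *	SHA1Init(&ctx);
+ *	SHA1Update(&ctx, buf, buflen);
+ *	SHA1Final(digest, &ctx);
+ */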
+
+/*
+ * Macros to access the SHA1 or SHA1-HMAC contexts from a context passed
+ * by KCF to one of the entry points.
+ */
+
+#define PROV_SHA1_CTX(ctx) ((sha1_ctx_t *)(ctx)->cc_provider_private)
+#define PROV_SHA1_HMAC_CTX(ctx) ((sha1_hmac_ctx_t *)(ctx)->cc_provider_private)
+
+/*
+ * Extract the digest length passed as a mechanism parameter; cm_param may
+ * not be aligned for a direct ulong_t load, hence the bcopy() fallback.
+ */
+#define PROV_SHA1_GET_DIGEST_LEN(m, len) { \
+ if (IS_P2ALIGNED((m)->cm_param, sizeof (ulong_t))) \
+ (len) = (uint32_t)*((ulong_t *)(void *)mechanism->cm_param); \
+ else { \
+ ulong_t tmp_ulong; \
+ bcopy((m)->cm_param, &tmp_ulong, sizeof (ulong_t)); \
+ (len) = (uint32_t)tmp_ulong; \
+ } \
+}
+
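+/*
+ * Used when an HMAC key is longer than the 64-byte SHA1 block size:
+ * RFC 2104 replaces such a key with its digest before the ipad/opad step.
+ */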
+#define PROV_SHA1_DIGEST_KEY(ctx, key, len, digest) { \
+ SHA1Init(ctx); \
+ SHA1Update(ctx, key, len); \
+ SHA1Final(digest, ctx); \
+}
+
+/*
+ * Mechanism info structure passed to KCF during registration.
+ */
+static crypto_mech_info_t sha1_mech_info_tab[] = {
+ /* SHA1 */
+ {SUN_CKM_SHA1, SHA1_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ /* SHA1-HMAC */
+ {SUN_CKM_SHA1_HMAC, SHA1_HMAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA1_HMAC_MIN_KEY_LEN, SHA1_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA1-HMAC GENERAL */
+ {SUN_CKM_SHA1_HMAC_GENERAL, SHA1_HMAC_GEN_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA1_HMAC_MIN_KEY_LEN, SHA1_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES}
+};
+
+static void sha1_provider_status(crypto_provider_handle_t, uint_t *);
+
+static crypto_control_ops_t sha1_control_ops = {
+ sha1_provider_status
+};
+
+static int sha1_digest_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_req_handle_t);
+static int sha1_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha1_digest_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha1_digest_final(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha1_digest_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+
+static crypto_digest_ops_t sha1_digest_ops = {
+ .digest_init = sha1_digest_init,
+ .digest = sha1_digest,
+ .digest_update = sha1_digest_update,
+ .digest_key = NULL,
+ .digest_final = sha1_digest_final,
+ .digest_atomic = sha1_digest_atomic
+};
+
+static int sha1_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int sha1_mac_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha1_mac_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int sha1_mac_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int sha1_mac_verify_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static crypto_mac_ops_t sha1_mac_ops = {
+ .mac_init = sha1_mac_init,
+ .mac = NULL,
+ .mac_update = sha1_mac_update,
+ .mac_final = sha1_mac_final,
+ .mac_atomic = sha1_mac_atomic,
+ .mac_verify_atomic = sha1_mac_verify_atomic
+};
+
+static int sha1_create_ctx_template(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *,
+ size_t *, crypto_req_handle_t);
+static int sha1_free_context(crypto_ctx_t *);
+
+static crypto_ctx_ops_t sha1_ctx_ops = {
+ .create_ctx_template = sha1_create_ctx_template,
+ .free_context = sha1_free_context
+};
+
+static crypto_ops_t sha1_crypto_ops = {{{{{
+ &sha1_control_ops,
+ &sha1_digest_ops,
+ NULL,
+ &sha1_mac_ops,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &sha1_ctx_ops,
+}}}}};
+
+static crypto_provider_info_t sha1_prov_info = {{{{
+ CRYPTO_SPI_VERSION_1,
+ "SHA1 Software Provider",
+ CRYPTO_SW_PROVIDER,
+ NULL,
+ &sha1_crypto_ops,
+ sizeof (sha1_mech_info_tab)/sizeof (crypto_mech_info_t),
+ sha1_mech_info_tab
+}}}};
+
+static crypto_kcf_provider_handle_t sha1_prov_handle = 0;
+
+int
+sha1_mod_init(void)
+{
+ int ret;
+
+ if ((ret = mod_install(&modlinkage)) != 0)
+ return (ret);
+
+ /*
+ * Register with KCF. If the registration fails, log an
+ * error but do not uninstall the module, since the functionality
+ * provided by misc/sha1 should still be available.
+ */
+ if ((ret = crypto_register_provider(&sha1_prov_info,
+ &sha1_prov_handle)) != CRYPTO_SUCCESS)
+ cmn_err(CE_WARN, "sha1 _init: "
+ "crypto_register_provider() failed (0x%x)", ret);
+
+ return (0);
+}
+
+int
+sha1_mod_fini(void)
+{
+ int ret;
+
+ if (sha1_prov_handle != 0) {
+ if ((ret = crypto_unregister_provider(sha1_prov_handle)) !=
+ CRYPTO_SUCCESS) {
+ cmn_err(CE_WARN,
+ "sha1 _fini: crypto_unregister_provider() "
+ "failed (0x%x)", ret);
+ return (EBUSY);
+ }
+ sha1_prov_handle = 0;
+ }
+
+ return (mod_remove(&modlinkage));
+}
+
+/*
+ * KCF software provider control entry points.
+ */
+/* ARGSUSED */
+static void
+sha1_provider_status(crypto_provider_handle_t provider, uint_t *status)
+{
+ *status = CRYPTO_PROVIDER_READY;
+}
+
+/*
+ * KCF software provider digest entry points.
+ */
+
+static int
+sha1_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_req_handle_t req)
+{
+ if (mechanism->cm_type != SHA1_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /*
+ * Allocate and initialize SHA1 context.
+ */
+ ctx->cc_provider_private = kmem_alloc(sizeof (sha1_ctx_t),
+ crypto_kmflag(req));
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ PROV_SHA1_CTX(ctx)->sc_mech_type = SHA1_MECH_INFO_TYPE;
+ SHA1Init(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Helper SHA1 digest update function for uio data.
+ */
+static int
+sha1_digest_update_uio(SHA1_CTX *sha1_ctx, crypto_data_t *data)
+{
+ off_t offset = data->cd_offset;
+ size_t length = data->cd_length;
+ uint_t vec_idx = 0;
+ size_t cur_len;
+
+ /* only kernel buffers are supported */
+ if (zfs_uio_segflg(data->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing data to be
+ * digested.
+ */
+ offset = zfs_uio_index_at_offset(data->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(data->cd_uio)) {
+ /*
+ * The caller specified an offset that is larger than the
+ * total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ /*
+ * Now do the digesting on the iovecs.
+ */
+ while (vec_idx < zfs_uio_iovcnt(data->cd_uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(data->cd_uio, vec_idx) -
+ offset, length);
+
+ SHA1Update(sha1_ctx,
+ (uint8_t *)zfs_uio_iovbase(data->cd_uio, vec_idx) + offset,
+ cur_len);
+
+ length -= cur_len;
+ vec_idx++;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(data->cd_uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e., the caller
+ * requested to digest more data than it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Helper SHA1 digest final function for uio data.
+ * digest_len is the length of the desired digest. If digest_len
+ * is smaller than the default SHA1 digest length, the caller
+ * must pass a scratch buffer, digest_scratch, which must
+ * be at least SHA1_DIGEST_LENGTH bytes.
+ */
+static int
+sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest,
+ ulong_t digest_len, uchar_t *digest_scratch)
+{
+ off_t offset = digest->cd_offset;
+ uint_t vec_idx = 0;
+
+ /* only kernel buffers are supported */
+ if (zfs_uio_segflg(digest->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing ptr to the digest to
+ * be returned.
+ */
+ offset = zfs_uio_index_at_offset(digest->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(digest->cd_uio)) {
+ /*
+ * The caller specified an offset that is
+ * larger than the total size of the buffers
+ * it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ if (offset + digest_len <=
+ zfs_uio_iovlen(digest->cd_uio, vec_idx)) {
+ /*
+ * The computed SHA1 digest will fit in the current
+ * iovec.
+ */
+ if (digest_len != SHA1_DIGEST_LENGTH) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA1Final(digest_scratch, sha1_ctx);
+ bcopy(digest_scratch, (uchar_t *)
+ zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset,
+ digest_len);
+ } else {
+ SHA1Final((uchar_t *)zfs_uio_iovbase(digest->
+ cd_uio, vec_idx) + offset,
+ sha1_ctx);
+ }
+ } else {
+ /*
+ * The computed digest will cross one or more iovecs.
+ * This is bad for performance, but we need to support it.
+ * Allocate a small scratch buffer on the stack and
+ * copy the digest piecemeal into the specified digest iovecs.
+ */
+ uchar_t digest_tmp[SHA1_DIGEST_LENGTH];
+ off_t scratch_offset = 0;
+ size_t length = digest_len;
+ size_t cur_len;
+
+ SHA1Final(digest_tmp, sha1_ctx);
+
+ while (vec_idx < zfs_uio_iovcnt(digest->cd_uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(digest->cd_uio, vec_idx) -
+ offset, length);
+ bcopy(digest_tmp + scratch_offset,
+ zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset,
+ cur_len);
+
+ length -= cur_len;
+ vec_idx++;
+ scratch_offset += cur_len;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(digest->cd_uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e., the
+ * caller requested to digest more data than it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+sha1_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * If the output buffer is too small, just return the length needed
+ * to store the output; do not destroy the context in that case.
+ */
+ if ((digest->cd_length == 0) ||
+ (digest->cd_length < SHA1_DIGEST_LENGTH)) {
+ digest->cd_length = SHA1_DIGEST_LENGTH;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do the SHA1 update on the specified input data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Update(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_update_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ /* the update failed, free context and bail */
+ kmem_free(ctx->cc_provider_private, sizeof (sha1_ctx_t));
+ ctx->cc_provider_private = NULL;
+ digest->cd_length = 0;
+ return (ret);
+ }
+
+ /*
+ * Do a SHA1 final; this must be done separately since the digest
+ * type can be different from the input data type.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &PROV_SHA1_CTX(ctx)->sc_sha1_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_final_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ digest, SHA1_DIGEST_LENGTH, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /* all done, free context and return */
+
+ if (ret == CRYPTO_SUCCESS) {
+ digest->cd_length = SHA1_DIGEST_LENGTH;
+ } else {
+ digest->cd_length = 0;
+ }
+
+ kmem_free(ctx->cc_provider_private, sizeof (sha1_ctx_t));
+ ctx->cc_provider_private = NULL;
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_digest_update(crypto_ctx_t *ctx, crypto_data_t *data,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * Do the SHA1 update on the specified input data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Update(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_update_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_digest_final(crypto_ctx_t *ctx, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * If the output buffer is too small, just return the length needed
+ * to store the output; do not destroy the context in that case.
+ */
+ if ((digest->cd_length == 0) ||
+ (digest->cd_length < SHA1_DIGEST_LENGTH)) {
+ digest->cd_length = SHA1_DIGEST_LENGTH;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do a SHA1 final.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &PROV_SHA1_CTX(ctx)->sc_sha1_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_final_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ digest, SHA1_DIGEST_LENGTH, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /* all done, free context and return */
+
+ if (ret == CRYPTO_SUCCESS) {
+ digest->cd_length = SHA1_DIGEST_LENGTH;
+ } else {
+ digest->cd_length = 0;
+ }
+
+ kmem_free(ctx->cc_provider_private, sizeof (sha1_ctx_t));
+ ctx->cc_provider_private = NULL;
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_digest_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_data_t *data, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ SHA1_CTX sha1_ctx;
+
+ if (mechanism->cm_type != SHA1_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /*
+ * Do the SHA1 init.
+ */
+ SHA1Init(&sha1_ctx);
+
+ /*
+ * Do the SHA1 update on the specified input data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Update(&sha1_ctx,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_update_uio(&sha1_ctx, data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ /* the update failed, bail */
+ digest->cd_length = 0;
+ return (ret);
+ }
+
+ /*
+ * Do a SHA1 final; this must be done separately since the digest
+ * type can be different from the input data type.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &sha1_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_final_uio(&sha1_ctx, digest,
+ SHA1_DIGEST_LENGTH, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ digest->cd_length = SHA1_DIGEST_LENGTH;
+ } else {
+ digest->cd_length = 0;
+ }
+
+ return (ret);
+}
+
+/*
+ * KCF software provider mac entry points.
+ *
+ * SHA1 HMAC is: SHA1(key XOR opad, SHA1(key XOR ipad, text))
+ *
+ * Init:
+ * The initialization routine initializes what we denote
+ * as the inner and outer contexts by doing
+ * - for inner context: SHA1(key XOR ipad)
+ * - for outer context: SHA1(key XOR opad)
+ *
+ * Update:
+ * Each subsequent SHA1 HMAC update will result in an
+ * update of the inner context with the specified data.
+ *
+ * Final:
+ * The SHA1 HMAC final will do a SHA1 final operation on the
+ * inner context, and the resulting digest will be used
+ * as the data for an update on the outer context. Last
+ * but not least, a SHA1 final on the outer context will
+ * be performed to obtain the SHA1 HMAC digest to return
+ * to the user.
+ */
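+
+/*
+ * In formula form (RFC 2104), with K0 the key zero-padded (or first
+ * digested, if longer than 64 bytes) to the 64-byte SHA1 block size:
+ *
+ *	HMAC(K, text) = SHA1((K0 ^ opad) || SHA1((K0 ^ ipad) || text))
+ *
+ * where ipad is the byte 0x36 repeated and opad is the byte 0x5c repeated.
+ */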
+
+/*
+ * Initialize a SHA1-HMAC context.
+ */
+static void
+sha1_mac_init_ctx(sha1_hmac_ctx_t *ctx, void *keyval, uint_t length_in_bytes)
+{
+ uint32_t ipad[SHA1_HMAC_INTS_PER_BLOCK];
+ uint32_t opad[SHA1_HMAC_INTS_PER_BLOCK];
+ uint_t i;
+
+ bzero(ipad, SHA1_HMAC_BLOCK_SIZE);
+ bzero(opad, SHA1_HMAC_BLOCK_SIZE);
+
+ bcopy(keyval, ipad, length_in_bytes);
+ bcopy(keyval, opad, length_in_bytes);
+
+ /* XOR key with ipad (0x36) and opad (0x5c) */
+ for (i = 0; i < SHA1_HMAC_INTS_PER_BLOCK; i++) {
+ ipad[i] ^= 0x36363636;
+ opad[i] ^= 0x5c5c5c5c;
+ }
+
+ /* perform SHA1 on ipad */
+ SHA1Init(&ctx->hc_icontext);
+ SHA1Update(&ctx->hc_icontext, (uint8_t *)ipad, SHA1_HMAC_BLOCK_SIZE);
+
+ /* perform SHA1 on opad */
+ SHA1Init(&ctx->hc_ocontext);
+ SHA1Update(&ctx->hc_ocontext, (uint8_t *)opad, SHA1_HMAC_BLOCK_SIZE);
+}
+
+/*
+ * Initialize a SHA1-HMAC operation, setting up the inner and outer
+ * contexts from the key (or from a precomputed context template).
+ */
+static int
+sha1_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t ctx_template,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ if (mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE &&
+ mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ ctx->cc_provider_private = kmem_alloc(sizeof (sha1_hmac_ctx_t),
+ crypto_kmflag(req));
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, PROV_SHA1_HMAC_CTX(ctx),
+ sizeof (sha1_hmac_ctx_t));
+ } else {
+ /* no context template, compute context */
+ if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) {
+ uchar_t digested_key[SHA1_DIGEST_LENGTH];
+ sha1_hmac_ctx_t *hmac_ctx = ctx->cc_provider_private;
+
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA1_DIGEST_KEY(&hmac_ctx->hc_icontext,
+ key->ck_data, keylen_in_bytes, digested_key);
+ sha1_mac_init_ctx(PROV_SHA1_HMAC_CTX(ctx),
+ digested_key, SHA1_DIGEST_LENGTH);
+ } else {
+ sha1_mac_init_ctx(PROV_SHA1_HMAC_CTX(ctx),
+ key->ck_data, keylen_in_bytes);
+ }
+ }
+
+ /*
+ * Get the mechanism parameters, if applicable.
+ */
+ PROV_SHA1_HMAC_CTX(ctx)->hc_mech_type = mechanism->cm_type;
+ if (mechanism->cm_type == SHA1_HMAC_GEN_MECH_INFO_TYPE) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t))
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ PROV_SHA1_GET_DIGEST_LEN(mechanism,
+ PROV_SHA1_HMAC_CTX(ctx)->hc_digest_len);
+ if (PROV_SHA1_HMAC_CTX(ctx)->hc_digest_len >
+ SHA1_DIGEST_LENGTH)
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ bzero(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t));
+ kmem_free(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t));
+ ctx->cc_provider_private = NULL;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_mac_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * Do a SHA1 update of the inner context using the specified
+ * data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Update(&PROV_SHA1_HMAC_CTX(ctx)->hc_icontext,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_update_uio(
+ &PROV_SHA1_HMAC_CTX(ctx)->hc_icontext, data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_mac_final(crypto_ctx_t *ctx, crypto_data_t *mac, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA1_DIGEST_LENGTH];
+ uint32_t digest_len = SHA1_DIGEST_LENGTH;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ if (PROV_SHA1_HMAC_CTX(ctx)->hc_mech_type ==
+ SHA1_HMAC_GEN_MECH_INFO_TYPE)
+ digest_len = PROV_SHA1_HMAC_CTX(ctx)->hc_digest_len;
+
+ /*
+ * If the output buffer is too small (or its length is zero, i.e.
+ * the caller is only asking for the required size), just return
+ * the length needed and do not destroy the context.
+ */
+ if ((mac->cd_length == 0) || (mac->cd_length < digest_len)) {
+ mac->cd_length = digest_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do a SHA1 final on the inner context.
+ */
+ SHA1Final(digest, &PROV_SHA1_HMAC_CTX(ctx)->hc_icontext);
+
+ /*
+ * Do a SHA1 update on the outer context, feeding the inner
+ * digest as data.
+ */
+ SHA1Update(&PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext, digest,
+ SHA1_DIGEST_LENGTH);
+
+ /*
+ * Do a SHA1 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ switch (mac->cd_format) {
+ case CRYPTO_DATA_RAW:
+ if (digest_len != SHA1_DIGEST_LENGTH) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA1Final(digest,
+ &PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext);
+ bcopy(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len);
+ } else {
+ SHA1Final((unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset,
+ &PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext);
+ }
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_final_uio(
+ &PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext, mac,
+ digest_len, digest);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ mac->cd_length = digest_len;
+ } else {
+ mac->cd_length = 0;
+ }
+
+ bzero(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t));
+ kmem_free(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t));
+ ctx->cc_provider_private = NULL;
+
+ return (ret);
+}
+
+#define SHA1_MAC_UPDATE(data, ctx, ret) { \
+ switch (data->cd_format) { \
+ case CRYPTO_DATA_RAW: \
+ SHA1Update(&(ctx).hc_icontext, \
+ (uint8_t *)data->cd_raw.iov_base + \
+ data->cd_offset, data->cd_length); \
+ break; \
+ case CRYPTO_DATA_UIO: \
+ ret = sha1_digest_update_uio(&(ctx).hc_icontext, data); \
+ break; \
+ default: \
+ ret = CRYPTO_ARGUMENTS_BAD; \
+ } \
+}
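+
+/*
+ * SHA1_MAC_UPDATE mirrors sha1_mac_update() above, but operates on a
+ * caller-supplied sha1_hmac_ctx_t (on the stack in the atomic entry
+ * points below) rather than on a kmem-allocated provider context.
+ */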
+
+/* ARGSUSED */
+static int
+sha1_mac_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA1_DIGEST_LENGTH];
+ sha1_hmac_ctx_t sha1_hmac_ctx;
+ uint32_t digest_len = SHA1_DIGEST_LENGTH;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ if (mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE &&
+ mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, &sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+ } else {
+ /* no context template, initialize context */
+ if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) {
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA1_DIGEST_KEY(&sha1_hmac_ctx.hc_icontext,
+ key->ck_data, keylen_in_bytes, digest);
+ sha1_mac_init_ctx(&sha1_hmac_ctx, digest,
+ SHA1_DIGEST_LENGTH);
+ } else {
+ sha1_mac_init_ctx(&sha1_hmac_ctx, key->ck_data,
+ keylen_in_bytes);
+ }
+ }
+
+ /* get the mechanism parameters, if applicable */
+ if (mechanism->cm_type == SHA1_HMAC_GEN_MECH_INFO_TYPE) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t)) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ PROV_SHA1_GET_DIGEST_LEN(mechanism, digest_len);
+ if (digest_len > SHA1_DIGEST_LENGTH) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ }
+
+ /* do a SHA1 update of the inner context using the specified data */
+ SHA1_MAC_UPDATE(data, sha1_hmac_ctx, ret);
+ if (ret != CRYPTO_SUCCESS)
+ /* the update failed, zero the context and bail */
+ goto bail;
+
+ /*
+ * Do a SHA1 final on the inner context.
+ */
+ SHA1Final(digest, &sha1_hmac_ctx.hc_icontext);
+
+ /*
+ * Do a SHA1 update on the outer context, feeding the inner
+ * digest as data.
+ */
+ SHA1Update(&sha1_hmac_ctx.hc_ocontext, digest, SHA1_DIGEST_LENGTH);
+
+ /*
+ * Do a SHA1 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ switch (mac->cd_format) {
+ case CRYPTO_DATA_RAW:
+ if (digest_len != SHA1_DIGEST_LENGTH) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA1Final(digest, &sha1_hmac_ctx.hc_ocontext);
+ bcopy(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len);
+ } else {
+ SHA1Final((unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, &sha1_hmac_ctx.hc_ocontext);
+ }
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_final_uio(&sha1_hmac_ctx.hc_ocontext, mac,
+ digest_len, digest);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ mac->cd_length = digest_len;
+ } else {
+ mac->cd_length = 0;
+ }
+ /* Extra paranoia: zeroize the context on the stack */
+ bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+
+ return (ret);
+bail:
+ bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+ mac->cd_length = 0;
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_mac_verify_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA1_DIGEST_LENGTH];
+ sha1_hmac_ctx_t sha1_hmac_ctx;
+ uint32_t digest_len = SHA1_DIGEST_LENGTH;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ if (mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE &&
+ mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, &sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+ } else {
+ /* no context template, initialize context */
+ if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) {
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA1_DIGEST_KEY(&sha1_hmac_ctx.hc_icontext,
+ key->ck_data, keylen_in_bytes, digest);
+ sha1_mac_init_ctx(&sha1_hmac_ctx, digest,
+ SHA1_DIGEST_LENGTH);
+ } else {
+ sha1_mac_init_ctx(&sha1_hmac_ctx, key->ck_data,
+ keylen_in_bytes);
+ }
+ }
+
+ /* get the mechanism parameters, if applicable */
+ if (mechanism->cm_type == SHA1_HMAC_GEN_MECH_INFO_TYPE) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t)) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ PROV_SHA1_GET_DIGEST_LEN(mechanism, digest_len);
+ if (digest_len > SHA1_DIGEST_LENGTH) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ }
+
+ if (mac->cd_length != digest_len) {
+ ret = CRYPTO_INVALID_MAC;
+ goto bail;
+ }
+
+ /* do a SHA1 update of the inner context using the specified data */
+ SHA1_MAC_UPDATE(data, sha1_hmac_ctx, ret);
+ if (ret != CRYPTO_SUCCESS)
+ /* the update failed, zero the context and bail */
+ goto bail;
+
+ /* do a SHA1 final on the inner context */
+ SHA1Final(digest, &sha1_hmac_ctx.hc_icontext);
+
+ /*
+ * Do a SHA1 update on the outer context, feeding the inner
+ * digest as data.
+ */
+ SHA1Update(&sha1_hmac_ctx.hc_ocontext, digest, SHA1_DIGEST_LENGTH);
+
+ /*
+ * Do a SHA1 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ SHA1Final(digest, &sha1_hmac_ctx.hc_ocontext);
+
+ /*
+ * Compare the computed digest against the expected digest passed
+ * as argument.
+ */
+
+ switch (mac->cd_format) {
+
+ case CRYPTO_DATA_RAW:
+ if (bcmp(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len) != 0)
+ ret = CRYPTO_INVALID_MAC;
+ break;
+
+ case CRYPTO_DATA_UIO: {
+ off_t offset = mac->cd_offset;
+ uint_t vec_idx = 0;
+ off_t scratch_offset = 0;
+ size_t length = digest_len;
+ size_t cur_len;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(mac->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /* jump to the first iovec containing the expected digest */
+ offset = zfs_uio_index_at_offset(mac->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(mac->cd_uio)) {
+ /*
+ * The caller specified an offset that is
+ * larger than the total size of the buffers
+ * it provided.
+ */
+ ret = CRYPTO_DATA_LEN_RANGE;
+ break;
+ }
+
+ /* do the comparison of computed digest vs specified one */
+ while (vec_idx < zfs_uio_iovcnt(mac->cd_uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(mac->cd_uio, vec_idx) -
+ offset, length);
+
+ if (bcmp(digest + scratch_offset,
+ zfs_uio_iovbase(mac->cd_uio, vec_idx) + offset,
+ cur_len) != 0) {
+ ret = CRYPTO_INVALID_MAC;
+ break;
+ }
+
+ length -= cur_len;
+ vec_idx++;
+ scratch_offset += cur_len;
+ offset = 0;
+ }
+ break;
+ }
+
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+ return (ret);
+bail:
+ bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+ mac->cd_length = 0;
+ return (ret);
+}
+
+/*
+ * KCF software provider context management entry points.
+ */
+
+/* ARGSUSED */
+static int
+sha1_create_ctx_template(crypto_provider_handle_t provider,
+ crypto_mechanism_t *mechanism, crypto_key_t *key,
+ crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size,
+ crypto_req_handle_t req)
+{
+ sha1_hmac_ctx_t *sha1_hmac_ctx_tmpl;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ if ((mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE) &&
+ (mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE)) {
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Allocate and initialize SHA1 context.
+ */
+ sha1_hmac_ctx_tmpl = kmem_alloc(sizeof (sha1_hmac_ctx_t),
+ crypto_kmflag(req));
+ if (sha1_hmac_ctx_tmpl == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) {
+ uchar_t digested_key[SHA1_DIGEST_LENGTH];
+
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA1_DIGEST_KEY(&sha1_hmac_ctx_tmpl->hc_icontext,
+ key->ck_data, keylen_in_bytes, digested_key);
+ sha1_mac_init_ctx(sha1_hmac_ctx_tmpl, digested_key,
+ SHA1_DIGEST_LENGTH);
+ } else {
+ sha1_mac_init_ctx(sha1_hmac_ctx_tmpl, key->ck_data,
+ keylen_in_bytes);
+ }
+
+ sha1_hmac_ctx_tmpl->hc_mech_type = mechanism->cm_type;
+ *ctx_template = (crypto_spi_ctx_template_t)sha1_hmac_ctx_tmpl;
+ *ctx_template_size = sizeof (sha1_hmac_ctx_t);
+
+ return (CRYPTO_SUCCESS);
+}
+
+static int
+sha1_free_context(crypto_ctx_t *ctx)
+{
+ uint_t ctx_len;
+ sha1_mech_type_t mech_type;
+
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_SUCCESS);
+
+ /*
+ * We have to free either SHA1 or SHA1-HMAC contexts, which
+ * have different lengths.
+ */
+
+ mech_type = PROV_SHA1_CTX(ctx)->sc_mech_type;
+ if (mech_type == SHA1_MECH_INFO_TYPE)
+ ctx_len = sizeof (sha1_ctx_t);
+ else {
+ ASSERT(mech_type == SHA1_HMAC_MECH_INFO_TYPE ||
+ mech_type == SHA1_HMAC_GEN_MECH_INFO_TYPE);
+ ctx_len = sizeof (sha1_hmac_ctx_t);
+ }
+
+ bzero(ctx->cc_provider_private, ctx_len);
+ kmem_free(ctx->cc_provider_private, ctx_len);
+ ctx->cc_provider_private = NULL;
+
+ return (CRYPTO_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/module/icp/io/sha2_mod.c b/sys/contrib/openzfs/module/icp/io/sha2_mod.c
new file mode 100644
index 000000000000..d690cd0bcb05
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/io/sha2_mod.c
@@ -0,0 +1,1399 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/icp.h>
+#define _SHA2_IMPL
+#include <sys/sha2.h>
+#include <sha2/sha2_impl.h>
+
+/*
+ * The sha2 module is created with two modlinkages:
+ * - a modlmisc that allows consumers to directly call the entry points
+ * SHA2Init, SHA2Update, and SHA2Final.
+ * - a modlcrypto that allows the module to register with the Kernel
+ * Cryptographic Framework (KCF) as a software provider for the SHA2
+ * mechanisms.
+ */
+
+static struct modlcrypto modlcrypto = {
+ &mod_cryptoops,
+ "SHA2 Kernel SW Provider"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, {&modlcrypto, NULL}
+};
+
+/*
+ * Macros to access the SHA2 or SHA2-HMAC contexts from a context passed
+ * by KCF to one of the entry points.
+ */
+
+#define PROV_SHA2_CTX(ctx) ((sha2_ctx_t *)(ctx)->cc_provider_private)
+#define PROV_SHA2_HMAC_CTX(ctx) ((sha2_hmac_ctx_t *)(ctx)->cc_provider_private)
+
+/* to extract the digest length passed as mechanism parameter */
+#define PROV_SHA2_GET_DIGEST_LEN(m, len) { \
+ if (IS_P2ALIGNED((m)->cm_param, sizeof (ulong_t))) \
+ (len) = (uint32_t)*((ulong_t *)(m)->cm_param); \
+ else { \
+ ulong_t tmp_ulong; \
+ bcopy((m)->cm_param, &tmp_ulong, sizeof (ulong_t)); \
+ (len) = (uint32_t)tmp_ulong; \
+ } \
+}
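+
+/*
+ * The alignment check above matters on platforms with strict alignment
+ * requirements: cm_param is a caller-supplied byte buffer, so it is only
+ * dereferenced directly as a ulong_t when it happens to be suitably
+ * aligned; otherwise the value is extracted with bcopy().
+ */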
+
+#define PROV_SHA2_DIGEST_KEY(mech, ctx, key, len, digest) { \
+ SHA2Init(mech, ctx); \
+ SHA2Update(ctx, key, len); \
+ SHA2Final(digest, ctx); \
+}
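+
+/*
+ * PROV_SHA2_DIGEST_KEY condenses an HMAC key that is longer than the
+ * algorithm's block size down to a single digest, as prescribed by the
+ * HMAC specification (RFC 2104), before the ipad/opad expansion is done.
+ */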
+
+/*
+ * Mechanism info structure passed to KCF during registration.
+ */
+static crypto_mech_info_t sha2_mech_info_tab[] = {
+ /* SHA256 */
+ {SUN_CKM_SHA256, SHA256_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ /* SHA256-HMAC */
+ {SUN_CKM_SHA256_HMAC, SHA256_HMAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA256-HMAC GENERAL */
+ {SUN_CKM_SHA256_HMAC_GENERAL, SHA256_HMAC_GEN_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA384 */
+ {SUN_CKM_SHA384, SHA384_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ /* SHA384-HMAC */
+ {SUN_CKM_SHA384_HMAC, SHA384_HMAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA384-HMAC GENERAL */
+ {SUN_CKM_SHA384_HMAC_GENERAL, SHA384_HMAC_GEN_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA512 */
+ {SUN_CKM_SHA512, SHA512_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ /* SHA512-HMAC */
+ {SUN_CKM_SHA512_HMAC, SHA512_HMAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA512-HMAC GENERAL */
+ {SUN_CKM_SHA512_HMAC_GENERAL, SHA512_HMAC_GEN_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES}
+};
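+
+/*
+ * Note that the table above follows the sha2_mech_type_t ordering (plain
+ * digest, HMAC, then HMAC_GENERAL for each of SHA256, SHA384 and SHA512).
+ * Some of the entry points below rely on that ordering; for example,
+ * cm_type % 3 == 2 identifies the *_HMAC_GENERAL mechanisms and
+ * cm_type % 3 == 0 the plain digest mechanisms.
+ */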
+
+static void sha2_provider_status(crypto_provider_handle_t, uint_t *);
+
+static crypto_control_ops_t sha2_control_ops = {
+ sha2_provider_status
+};
+
+static int sha2_digest_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_req_handle_t);
+static int sha2_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha2_digest_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha2_digest_final(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha2_digest_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+
+static crypto_digest_ops_t sha2_digest_ops = {
+ .digest_init = sha2_digest_init,
+ .digest = sha2_digest,
+ .digest_update = sha2_digest_update,
+ .digest_key = NULL,
+ .digest_final = sha2_digest_final,
+ .digest_atomic = sha2_digest_atomic
+};
+
+static int sha2_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int sha2_mac_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha2_mac_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int sha2_mac_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int sha2_mac_verify_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static crypto_mac_ops_t sha2_mac_ops = {
+ .mac_init = sha2_mac_init,
+ .mac = NULL,
+ .mac_update = sha2_mac_update,
+ .mac_final = sha2_mac_final,
+ .mac_atomic = sha2_mac_atomic,
+ .mac_verify_atomic = sha2_mac_verify_atomic
+};
+
+static int sha2_create_ctx_template(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *,
+ size_t *, crypto_req_handle_t);
+static int sha2_free_context(crypto_ctx_t *);
+
+static crypto_ctx_ops_t sha2_ctx_ops = {
+ .create_ctx_template = sha2_create_ctx_template,
+ .free_context = sha2_free_context
+};
+
+static crypto_ops_t sha2_crypto_ops = {{{{{
+ &sha2_control_ops,
+ &sha2_digest_ops,
+ NULL,
+ &sha2_mac_ops,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &sha2_ctx_ops
+}}}}};
+
+static crypto_provider_info_t sha2_prov_info = {{{{
+ CRYPTO_SPI_VERSION_1,
+ "SHA2 Software Provider",
+ CRYPTO_SW_PROVIDER,
+ NULL,
+ &sha2_crypto_ops,
+ sizeof (sha2_mech_info_tab)/sizeof (crypto_mech_info_t),
+ sha2_mech_info_tab
+}}}};
+
+static crypto_kcf_provider_handle_t sha2_prov_handle = 0;
+
+int
+sha2_mod_init(void)
+{
+ int ret;
+
+ if ((ret = mod_install(&modlinkage)) != 0)
+ return (ret);
+
+ /*
+ * Register with KCF. If the registration fails, log an
+ * error but do not uninstall the module, since the functionality
+ * provided by misc/sha2 should still be available.
+ */
+ if ((ret = crypto_register_provider(&sha2_prov_info,
+ &sha2_prov_handle)) != CRYPTO_SUCCESS)
+ cmn_err(CE_WARN, "sha2 _init: "
+ "crypto_register_provider() failed (0x%x)", ret);
+
+ return (0);
+}
+
+int
+sha2_mod_fini(void)
+{
+ int ret;
+
+ if (sha2_prov_handle != 0) {
+ if ((ret = crypto_unregister_provider(sha2_prov_handle)) !=
+ CRYPTO_SUCCESS) {
+ cmn_err(CE_WARN,
+ "sha2 _fini: crypto_unregister_provider() "
+ "failed (0x%x)", ret);
+ return (EBUSY);
+ }
+ sha2_prov_handle = 0;
+ }
+
+ return (mod_remove(&modlinkage));
+}
+
+/*
+ * KCF software provider control entry points.
+ */
+/* ARGSUSED */
+static void
+sha2_provider_status(crypto_provider_handle_t provider, uint_t *status)
+{
+ *status = CRYPTO_PROVIDER_READY;
+}
+
+/*
+ * KCF software provider digest entry points.
+ */
+
+static int
+sha2_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_req_handle_t req)
+{
+
+ /*
+ * Allocate and initialize SHA2 context.
+ */
+ ctx->cc_provider_private = kmem_alloc(sizeof (sha2_ctx_t),
+ crypto_kmflag(req));
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ PROV_SHA2_CTX(ctx)->sc_mech_type = mechanism->cm_type;
+ SHA2Init(mechanism->cm_type, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Helper SHA2 digest update function for uio data.
+ */
+static int
+sha2_digest_update_uio(SHA2_CTX *sha2_ctx, crypto_data_t *data)
+{
+ off_t offset = data->cd_offset;
+ size_t length = data->cd_length;
+ uint_t vec_idx = 0;
+ size_t cur_len;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(data->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing data to be
+ * digested.
+ */
+ offset = zfs_uio_index_at_offset(data->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(data->cd_uio)) {
+ /*
+ * The caller specified an offset that is larger than the
+ * total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ /*
+ * Now do the digesting on the iovecs.
+ */
+ while (vec_idx < zfs_uio_iovcnt(data->cd_uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(data->cd_uio, vec_idx) -
+ offset, length);
+
+ SHA2Update(sha2_ctx, (uint8_t *)zfs_uio_iovbase(data->cd_uio,
+ vec_idx) + offset, cur_len);
+ length -= cur_len;
+ vec_idx++;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(data->cd_uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e. the caller
+ * requested to digest more data than it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Helper SHA2 digest final function for uio data.
+ * digest_len is the length of the desired digest. If digest_len
+ * is smaller than the default SHA2 digest length, the caller
+ * must pass a scratch buffer, digest_scratch, which must be at
+ * least as large as the algorithm's full digest length.
+ */
+static int
+sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest,
+ ulong_t digest_len, uchar_t *digest_scratch)
+{
+ off_t offset = digest->cd_offset;
+ uint_t vec_idx = 0;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(digest->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing ptr to the digest to
+ * be returned.
+ */
+ offset = zfs_uio_index_at_offset(digest->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(digest->cd_uio)) {
+ /*
+ * The caller specified an offset that is
+ * larger than the total size of the buffers
+ * it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ if (offset + digest_len <=
+ zfs_uio_iovlen(digest->cd_uio, vec_idx)) {
+ /*
+ * The computed SHA2 digest will fit in the current
+ * iovec.
+ */
+ if (((sha2_ctx->algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) &&
+ (digest_len != SHA256_DIGEST_LENGTH)) ||
+ ((sha2_ctx->algotype > SHA256_HMAC_GEN_MECH_INFO_TYPE) &&
+ (digest_len != SHA512_DIGEST_LENGTH))) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA2Final(digest_scratch, sha2_ctx);
+
+ bcopy(digest_scratch, (uchar_t *)
+ zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset,
+ digest_len);
+ } else {
+ SHA2Final((uchar_t *)zfs_uio_iovbase(digest->
+ cd_uio, vec_idx) + offset,
+ sha2_ctx);
+
+ }
+ } else {
+ /*
+ * The computed digest will cross one or more iovecs.
+ * This is bad performance-wise but we need to support it.
+ * Allocate a small scratch buffer on the stack and copy
+ * the digest piecemeal to the specified digest iovecs.
+ */
+ uchar_t digest_tmp[SHA512_DIGEST_LENGTH];
+ off_t scratch_offset = 0;
+ size_t length = digest_len;
+ size_t cur_len;
+
+ SHA2Final(digest_tmp, sha2_ctx);
+
+ while (vec_idx < zfs_uio_iovcnt(digest->cd_uio) && length > 0) {
+ cur_len =
+ MIN(zfs_uio_iovlen(digest->cd_uio, vec_idx) -
+ offset, length);
+ bcopy(digest_tmp + scratch_offset,
+ zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset,
+ cur_len);
+
+ length -= cur_len;
+ vec_idx++;
+ scratch_offset += cur_len;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(digest->cd_uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e. the
+ * caller requested to digest more data than it
+ * provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+sha2_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uint_t sha_digest_len;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ switch (PROV_SHA2_CTX(ctx)->sc_mech_type) {
+ case SHA256_MECH_INFO_TYPE:
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ break;
+ case SHA384_MECH_INFO_TYPE:
+ sha_digest_len = SHA384_DIGEST_LENGTH;
+ break;
+ case SHA512_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /*
+ * If the output buffer is too small (or its length is zero, i.e.
+ * the caller is only asking for the required size), just return
+ * the length needed and do not destroy the context.
+ */
+ if ((digest->cd_length == 0) ||
+ (digest->cd_length < sha_digest_len)) {
+ digest->cd_length = sha_digest_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do the SHA2 update on the specified input data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Update(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_update_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ /* the update failed, free context and bail */
+ kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t));
+ ctx->cc_provider_private = NULL;
+ digest->cd_length = 0;
+ return (ret);
+ }
+
+ /*
+ * Do a SHA2 final, must be done separately since the digest
+ * type can be different than the input data type.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_final_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ digest, sha_digest_len, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /* all done, free context and return */
+
+ if (ret == CRYPTO_SUCCESS)
+ digest->cd_length = sha_digest_len;
+ else
+ digest->cd_length = 0;
+
+ kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t));
+ ctx->cc_provider_private = NULL;
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_digest_update(crypto_ctx_t *ctx, crypto_data_t *data,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * Do the SHA2 update on the specified input data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Update(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_update_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_digest_final(crypto_ctx_t *ctx, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uint_t sha_digest_len;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ switch (PROV_SHA2_CTX(ctx)->sc_mech_type) {
+ case SHA256_MECH_INFO_TYPE:
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ break;
+ case SHA384_MECH_INFO_TYPE:
+ sha_digest_len = SHA384_DIGEST_LENGTH;
+ break;
+ case SHA512_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /*
+ * If the output buffer is too small (or its length is zero, i.e.
+ * the caller is only asking for the required size), just return
+ * the length needed and do not destroy the context.
+ */
+ if ((digest->cd_length == 0) ||
+ (digest->cd_length < sha_digest_len)) {
+ digest->cd_length = sha_digest_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do a SHA2 final.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_final_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ digest, sha_digest_len, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /* all done, free context and return */
+
+ if (ret == CRYPTO_SUCCESS)
+ digest->cd_length = sha_digest_len;
+ else
+ digest->cd_length = 0;
+
+ kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t));
+ ctx->cc_provider_private = NULL;
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_digest_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_data_t *data, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ SHA2_CTX sha2_ctx;
+ uint32_t sha_digest_len;
+
+ /*
+ * Do the SHA2 init, then a SHA2 update on the specified input data.
+ */
+
+ SHA2Init(mechanism->cm_type, &sha2_ctx);
+
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Update(&sha2_ctx, (uint8_t *)data->
+ cd_raw.iov_base + data->cd_offset, data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_update_uio(&sha2_ctx, data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ /* the update failed, bail */
+ digest->cd_length = 0;
+ return (ret);
+ }
+
+ if (mechanism->cm_type <= SHA256_HMAC_GEN_MECH_INFO_TYPE)
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ else
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+
+ /*
+ * Do a SHA2 final, must be done separately since the digest
+ * type can be different than the input data type.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &sha2_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_final_uio(&sha2_ctx, digest,
+ sha_digest_len, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS)
+ digest->cd_length = sha_digest_len;
+ else
+ digest->cd_length = 0;
+
+ return (ret);
+}
+
+/*
+ * KCF software provider mac entry points.
+ *
+ * SHA2 HMAC is: SHA2(key XOR opad, SHA2(key XOR ipad, text))
+ *
+ * Init:
+ * The initialization routine initializes what we denote
+ * as the inner and outer contexts by doing
+ * - for inner context: SHA2(key XOR ipad)
+ * - for outer context: SHA2(key XOR opad)
+ *
+ * Update:
+ * Each subsequent SHA2 HMAC update will result in an
+ * update of the inner context with the specified data.
+ *
+ * Final:
+ * The SHA2 HMAC final will do a SHA2 final operation on the
+ * inner context, and the resulting digest will be used
+ * as the data for an update on the outer context. Last
+ * but not least, a SHA2 final on the outer context will
+ * be performed to obtain the SHA2 HMAC digest to return
+ * to the user.
+ */
+
+/*
+ * Initialize a SHA2-HMAC context.
+ */
+static void
+sha2_mac_init_ctx(sha2_hmac_ctx_t *ctx, void *keyval, uint_t length_in_bytes)
+{
+ uint64_t ipad[SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t)];
+ uint64_t opad[SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t)];
+ int i, block_size, blocks_per_int64;
+
+ /* Determine the block size */
+ if (ctx->hc_mech_type <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
+ block_size = SHA256_HMAC_BLOCK_SIZE;
+ blocks_per_int64 = SHA256_HMAC_BLOCK_SIZE / sizeof (uint64_t);
+ } else {
+ block_size = SHA512_HMAC_BLOCK_SIZE;
+ blocks_per_int64 = SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t);
+ }
+
+ bzero(ipad, block_size);
+ bzero(opad, block_size);
+ bcopy(keyval, ipad, length_in_bytes);
+ bcopy(keyval, opad, length_in_bytes);
+
+ /* XOR key with ipad (0x36) and opad (0x5c) */
+ for (i = 0; i < blocks_per_int64; i++) {
+ ipad[i] ^= 0x3636363636363636;
+ opad[i] ^= 0x5c5c5c5c5c5c5c5c;
+ }
+
+ /* perform SHA2 on ipad */
+ SHA2Init(ctx->hc_mech_type, &ctx->hc_icontext);
+ SHA2Update(&ctx->hc_icontext, (uint8_t *)ipad, block_size);
+
+ /* perform SHA2 on opad */
+ SHA2Init(ctx->hc_mech_type, &ctx->hc_ocontext);
+ SHA2Update(&ctx->hc_ocontext, (uint8_t *)opad, block_size);
+}
+
+/*
+ * Initialize a new SHA2-HMAC operation; this is the KCF mac_init entry
+ * point for the SHA2-HMAC mechanisms.
+ */
+static int
+sha2_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t ctx_template,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+ uint_t sha_digest_len, sha_hmac_block_size;
+
+ /*
+ * Set the digest length and block size to values appropriate to the
+ * mechanism
+ */
+ switch (mechanism->cm_type) {
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE;
+ break;
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ ctx->cc_provider_private = kmem_alloc(sizeof (sha2_hmac_ctx_t),
+ crypto_kmflag(req));
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ PROV_SHA2_HMAC_CTX(ctx)->hc_mech_type = mechanism->cm_type;
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, PROV_SHA2_HMAC_CTX(ctx),
+ sizeof (sha2_hmac_ctx_t));
+ } else {
+ /* no context template, compute context */
+ if (keylen_in_bytes > sha_hmac_block_size) {
+ uchar_t digested_key[SHA512_DIGEST_LENGTH];
+ sha2_hmac_ctx_t *hmac_ctx = ctx->cc_provider_private;
+
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3,
+ &hmac_ctx->hc_icontext,
+ key->ck_data, keylen_in_bytes, digested_key);
+ sha2_mac_init_ctx(PROV_SHA2_HMAC_CTX(ctx),
+ digested_key, sha_digest_len);
+ } else {
+ sha2_mac_init_ctx(PROV_SHA2_HMAC_CTX(ctx),
+ key->ck_data, keylen_in_bytes);
+ }
+ }
+
+ /*
+ * Get the mechanism parameters, if applicable. Given the mechanism
+ * ordering, cm_type % 3 == 2 selects the *_HMAC_GENERAL mechanisms,
+ * which carry the desired digest length as a parameter.
+ */
+ if (mechanism->cm_type % 3 == 2) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t))
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ PROV_SHA2_GET_DIGEST_LEN(mechanism,
+ PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len);
+ if (PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len > sha_digest_len)
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ bzero(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t));
+ kmem_free(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t));
+ ctx->cc_provider_private = NULL;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_mac_update(crypto_ctx_t *ctx, crypto_data_t *data,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * Do a SHA2 update of the inner context using the specified
+ * data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Update(&PROV_SHA2_HMAC_CTX(ctx)->hc_icontext,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_update_uio(
+ &PROV_SHA2_HMAC_CTX(ctx)->hc_icontext, data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_mac_final(crypto_ctx_t *ctx, crypto_data_t *mac, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA512_DIGEST_LENGTH];
+ uint32_t digest_len, sha_digest_len;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /* Set the digest lengths to values appropriate to the mechanism */
+ switch (PROV_SHA2_HMAC_CTX(ctx)->hc_mech_type) {
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA256_DIGEST_LENGTH;
+ break;
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA384_DIGEST_LENGTH;
+ break;
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA512_DIGEST_LENGTH;
+ break;
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ digest_len = PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len;
+ break;
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+ digest_len = PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len;
+ break;
+ default:
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ /*
+ * If the output buffer is too small (or its length is zero, i.e.
+ * the caller is only asking for the required size), just return
+ * the length needed and do not destroy the context.
+ */
+ if ((mac->cd_length == 0) || (mac->cd_length < digest_len)) {
+ mac->cd_length = digest_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do a SHA2 final on the inner context.
+ */
+ SHA2Final(digest, &PROV_SHA2_HMAC_CTX(ctx)->hc_icontext);
+
+ /*
+ * Do a SHA2 update on the outer context, feeding the inner
+ * digest as data.
+ */
+ SHA2Update(&PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext, digest,
+ sha_digest_len);
+
+ /*
+ * Do a SHA2 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ switch (mac->cd_format) {
+ case CRYPTO_DATA_RAW:
+ if (digest_len != sha_digest_len) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA2Final(digest,
+ &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext);
+ bcopy(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len);
+ } else {
+ SHA2Final((unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset,
+ &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext);
+ }
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_final_uio(
+ &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext, mac,
+ digest_len, digest);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS)
+ mac->cd_length = digest_len;
+ else
+ mac->cd_length = 0;
+
+ bzero(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t));
+ kmem_free(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t));
+ ctx->cc_provider_private = NULL;
+
+ return (ret);
+}
+
+#define SHA2_MAC_UPDATE(data, ctx, ret) { \
+ switch (data->cd_format) { \
+ case CRYPTO_DATA_RAW: \
+ SHA2Update(&(ctx).hc_icontext, \
+ (uint8_t *)data->cd_raw.iov_base + \
+ data->cd_offset, data->cd_length); \
+ break; \
+ case CRYPTO_DATA_UIO: \
+ ret = sha2_digest_update_uio(&(ctx).hc_icontext, data); \
+ break; \
+ default: \
+ ret = CRYPTO_ARGUMENTS_BAD; \
+ } \
+}
+
+/* ARGSUSED */
+static int
+sha2_mac_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA512_DIGEST_LENGTH];
+ sha2_hmac_ctx_t sha2_hmac_ctx;
+ uint32_t sha_digest_len, digest_len, sha_hmac_block_size;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ /*
+ * Set the digest length and block size to values appropriate to the
+ * mechanism
+ */
+ switch (mechanism->cm_type) {
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA256_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE;
+ break;
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA512_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, &sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t));
+ } else {
+ sha2_hmac_ctx.hc_mech_type = mechanism->cm_type;
+ /* no context template, initialize context */
+ if (keylen_in_bytes > sha_hmac_block_size) {
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3,
+ &sha2_hmac_ctx.hc_icontext,
+ key->ck_data, keylen_in_bytes, digest);
+ sha2_mac_init_ctx(&sha2_hmac_ctx, digest,
+ sha_digest_len);
+ } else {
+ sha2_mac_init_ctx(&sha2_hmac_ctx, key->ck_data,
+ keylen_in_bytes);
+ }
+ }
+
+ /* get the mechanism parameters, if applicable */
+ if ((mechanism->cm_type % 3) == 2) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t)) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ PROV_SHA2_GET_DIGEST_LEN(mechanism, digest_len);
+ if (digest_len > sha_digest_len) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ }
+
+ /* do a SHA2 update of the inner context using the specified data */
+ SHA2_MAC_UPDATE(data, sha2_hmac_ctx, ret);
+ if (ret != CRYPTO_SUCCESS)
+ /* the update failed, zero the context and bail */
+ goto bail;
+
+ /*
+ * Do a SHA2 final on the inner context.
+ */
+ SHA2Final(digest, &sha2_hmac_ctx.hc_icontext);
+
+ /*
+ * Do a SHA2 update on the outer context, feeding the inner
+ * digest as data.
+ *
+ * HMAC-SHA384 needs special handling as the outer hash needs only 48
+ * bytes of the inner hash value.
+ */
+ if (mechanism->cm_type == SHA384_HMAC_MECH_INFO_TYPE ||
+ mechanism->cm_type == SHA384_HMAC_GEN_MECH_INFO_TYPE)
+ SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest,
+ SHA384_DIGEST_LENGTH);
+ else
+ SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len);
+
+ /*
+ * Do a SHA2 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ switch (mac->cd_format) {
+ case CRYPTO_DATA_RAW:
+ if (digest_len != sha_digest_len) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA2Final(digest, &sha2_hmac_ctx.hc_ocontext);
+ bcopy(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len);
+ } else {
+ SHA2Final((unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, &sha2_hmac_ctx.hc_ocontext);
+ }
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_final_uio(&sha2_hmac_ctx.hc_ocontext, mac,
+ digest_len, digest);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ mac->cd_length = digest_len;
+ return (CRYPTO_SUCCESS);
+ }
+bail:
+ bzero(&sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t));
+ mac->cd_length = 0;
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_mac_verify_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA512_DIGEST_LENGTH];
+ sha2_hmac_ctx_t sha2_hmac_ctx;
+ uint32_t sha_digest_len, digest_len, sha_hmac_block_size;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ /*
+ * Set the digest length and block size to values appropriate to the
+ * mechanism
+ */
+ switch (mechanism->cm_type) {
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA256_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE;
+ break;
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA512_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, &sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t));
+ } else {
+ sha2_hmac_ctx.hc_mech_type = mechanism->cm_type;
+ /* no context template, initialize context */
+ if (keylen_in_bytes > sha_hmac_block_size) {
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3,
+ &sha2_hmac_ctx.hc_icontext,
+ key->ck_data, keylen_in_bytes, digest);
+ sha2_mac_init_ctx(&sha2_hmac_ctx, digest,
+ sha_digest_len);
+ } else {
+ sha2_mac_init_ctx(&sha2_hmac_ctx, key->ck_data,
+ keylen_in_bytes);
+ }
+ }
+
+ /* get the mechanism parameters, if applicable */
+ if (mechanism->cm_type % 3 == 2) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t)) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ PROV_SHA2_GET_DIGEST_LEN(mechanism, digest_len);
+ if (digest_len > sha_digest_len) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ }
+
+ if (mac->cd_length != digest_len) {
+ ret = CRYPTO_INVALID_MAC;
+ goto bail;
+ }
+
+ /* do a SHA2 update of the inner context using the specified data */
+ SHA2_MAC_UPDATE(data, sha2_hmac_ctx, ret);
+ if (ret != CRYPTO_SUCCESS)
+ /* the update failed, zero the context and bail */
+ goto bail;
+
+ /* do a SHA2 final on the inner context */
+ SHA2Final(digest, &sha2_hmac_ctx.hc_icontext);
+
+ /*
+ * Do a SHA2 update on the outer context, feeding the inner
+ * digest as data.
+ *
+ * HMAC-SHA384 needs special handling as the outer hash needs only 48
+ * bytes of the inner hash value.
+ */
+ if (mechanism->cm_type == SHA384_HMAC_MECH_INFO_TYPE ||
+ mechanism->cm_type == SHA384_HMAC_GEN_MECH_INFO_TYPE)
+ SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest,
+ SHA384_DIGEST_LENGTH);
+ else
+ SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len);
+
+ /*
+ * Do a SHA2 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ SHA2Final(digest, &sha2_hmac_ctx.hc_ocontext);
+
+ /*
+ * Compare the computed digest against the expected digest passed
+ * as argument.
+ */
+
+ switch (mac->cd_format) {
+
+ case CRYPTO_DATA_RAW:
+ if (bcmp(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len) != 0)
+ ret = CRYPTO_INVALID_MAC;
+ break;
+
+ case CRYPTO_DATA_UIO: {
+ off_t offset = mac->cd_offset;
+ uint_t vec_idx = 0;
+ off_t scratch_offset = 0;
+ size_t length = digest_len;
+ size_t cur_len;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(mac->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /* jump to the first iovec containing the expected digest */
+ offset = zfs_uio_index_at_offset(mac->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(mac->cd_uio)) {
+ /*
+ * The caller specified an offset that is
+ * larger than the total size of the buffers
+ * it provided.
+ */
+ ret = CRYPTO_DATA_LEN_RANGE;
+ break;
+ }
+
+ /* do the comparison of computed digest vs specified one */
+ while (vec_idx < zfs_uio_iovcnt(mac->cd_uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(mac->cd_uio, vec_idx) -
+ offset, length);
+
+ if (bcmp(digest + scratch_offset,
+ zfs_uio_iovbase(mac->cd_uio, vec_idx) + offset,
+ cur_len) != 0) {
+ ret = CRYPTO_INVALID_MAC;
+ break;
+ }
+
+ length -= cur_len;
+ vec_idx++;
+ scratch_offset += cur_len;
+ offset = 0;
+ }
+ break;
+ }
+
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (ret);
+bail:
+ bzero(&sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t));
+ mac->cd_length = 0;
+ return (ret);
+}
+
+/*
+ * KCF software provider context management entry points.
+ */
+
+/* ARGSUSED */
+static int
+sha2_create_ctx_template(crypto_provider_handle_t provider,
+ crypto_mechanism_t *mechanism, crypto_key_t *key,
+ crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size,
+ crypto_req_handle_t req)
+{
+ sha2_hmac_ctx_t *sha2_hmac_ctx_tmpl;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+ uint32_t sha_digest_len, sha_hmac_block_size;
+
+ /*
+ * Set the digest length and block size to values appropriate to the
+ * mechanism
+ */
+ switch (mechanism->cm_type) {
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE;
+ break;
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Allocate and initialize SHA2 context.
+ */
+ sha2_hmac_ctx_tmpl = kmem_alloc(sizeof (sha2_hmac_ctx_t),
+ crypto_kmflag(req));
+ if (sha2_hmac_ctx_tmpl == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ sha2_hmac_ctx_tmpl->hc_mech_type = mechanism->cm_type;
+
+ if (keylen_in_bytes > sha_hmac_block_size) {
+ uchar_t digested_key[SHA512_DIGEST_LENGTH];
+
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3,
+ &sha2_hmac_ctx_tmpl->hc_icontext,
+ key->ck_data, keylen_in_bytes, digested_key);
+ sha2_mac_init_ctx(sha2_hmac_ctx_tmpl, digested_key,
+ sha_digest_len);
+ } else {
+ sha2_mac_init_ctx(sha2_hmac_ctx_tmpl, key->ck_data,
+ keylen_in_bytes);
+ }
+
+ *ctx_template = (crypto_spi_ctx_template_t)sha2_hmac_ctx_tmpl;
+ *ctx_template_size = sizeof (sha2_hmac_ctx_t);
+
+ return (CRYPTO_SUCCESS);
+}
+
+static int
+sha2_free_context(crypto_ctx_t *ctx)
+{
+ uint_t ctx_len;
+
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_SUCCESS);
+
+ /*
+ * We have to free either SHA2 or SHA2-HMAC contexts, which
+ * have different lengths.
+ *
+ * Note: the test below depends on the mechanism ordering; the plain
+ * digest mechanisms (SHA256, SHA384, SHA512) have cm_type % 3 == 0,
+ * while all of the HMAC variants use the larger HMAC context.
+ */
+
+ if (PROV_SHA2_CTX(ctx)->sc_mech_type % 3 == 0)
+ ctx_len = sizeof (sha2_ctx_t);
+ else
+ ctx_len = sizeof (sha2_hmac_ctx_t);
+
+ bzero(ctx->cc_provider_private, ctx_len);
+ kmem_free(ctx->cc_provider_private, ctx_len);
+ ctx->cc_provider_private = NULL;
+
+ return (CRYPTO_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/module/icp/io/skein_mod.c b/sys/contrib/openzfs/module/icp/io/skein_mod.c
new file mode 100644
index 000000000000..5ee36af12bcb
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/io/skein_mod.c
@@ -0,0 +1,729 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/icp.h>
+#include <sys/crypto/spi.h>
+#include <sys/sysmacros.h>
+#define SKEIN_MODULE_IMPL
+#include <sys/skein.h>
+
+/*
+ * Like the sha2 module, we create the skein module with two modlinkages:
+ * - modlmisc to allow direct calls to Skein_* API functions.
+ * - modlcrypto to integrate well into the Kernel Crypto Framework (KCF).
+ */
+static struct modlmisc modlmisc = {
+ &mod_cryptoops,
+ "Skein Message-Digest Algorithm"
+};
+
+static struct modlcrypto modlcrypto = {
+ &mod_cryptoops,
+ "Skein Kernel SW Provider"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, {&modlmisc, &modlcrypto, NULL}
+};
+
+static crypto_mech_info_t skein_mech_info_tab[] = {
+ {CKM_SKEIN_256, SKEIN_256_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ {CKM_SKEIN_256_MAC, SKEIN_256_MAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ {CKM_SKEIN_512, SKEIN_512_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ {CKM_SKEIN_512_MAC, SKEIN_512_MAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ {CKM_SKEIN1024, SKEIN1024_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ {CKM_SKEIN1024_MAC, SKEIN1024_MAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES}
+};
+
+static void skein_provider_status(crypto_provider_handle_t, uint_t *);
+
+static crypto_control_ops_t skein_control_ops = {
+ skein_provider_status
+};
+
+static int skein_digest_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_req_handle_t);
+static int skein_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int skein_update(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int skein_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int skein_digest_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+
+static crypto_digest_ops_t skein_digest_ops = {
+ .digest_init = skein_digest_init,
+ .digest = skein_digest,
+ .digest_update = skein_update,
+ .digest_key = NULL,
+ .digest_final = skein_final,
+ .digest_atomic = skein_digest_atomic
+};
+
+static int skein_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int skein_mac_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static crypto_mac_ops_t skein_mac_ops = {
+ .mac_init = skein_mac_init,
+ .mac = NULL,
+ .mac_update = skein_update, /* using regular digest update is OK here */
+ .mac_final = skein_final, /* using regular digest final is OK here */
+ .mac_atomic = skein_mac_atomic,
+ .mac_verify_atomic = NULL
+};
+
+static int skein_create_ctx_template(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *,
+ size_t *, crypto_req_handle_t);
+static int skein_free_context(crypto_ctx_t *);
+
+static crypto_ctx_ops_t skein_ctx_ops = {
+ .create_ctx_template = skein_create_ctx_template,
+ .free_context = skein_free_context
+};
+
+static crypto_ops_t skein_crypto_ops = {{{{{
+ &skein_control_ops,
+ &skein_digest_ops,
+ NULL,
+ &skein_mac_ops,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &skein_ctx_ops,
+}}}}};
+
+static crypto_provider_info_t skein_prov_info = {{{{
+ CRYPTO_SPI_VERSION_1,
+ "Skein Software Provider",
+ CRYPTO_SW_PROVIDER,
+ NULL,
+ &skein_crypto_ops,
+ sizeof (skein_mech_info_tab) / sizeof (crypto_mech_info_t),
+ skein_mech_info_tab
+}}}};
+
+static crypto_kcf_provider_handle_t skein_prov_handle = 0;
+
+typedef struct skein_ctx {
+ skein_mech_type_t sc_mech_type;
+ size_t sc_digest_bitlen;
+ /*LINTED(E_ANONYMOUS_UNION_DECL)*/
+ union {
+ Skein_256_Ctxt_t sc_256;
+ Skein_512_Ctxt_t sc_512;
+ Skein1024_Ctxt_t sc_1024;
+ };
+} skein_ctx_t;
+#define SKEIN_CTX(_ctx_) ((skein_ctx_t *)((_ctx_)->cc_provider_private))
+#define SKEIN_CTX_LVALUE(_ctx_) (_ctx_)->cc_provider_private
+#define SKEIN_OP(_skein_ctx, _op, ...) \
+ do { \
+ skein_ctx_t *sc = (_skein_ctx); \
+ switch (sc->sc_mech_type) { \
+ case SKEIN_256_MECH_INFO_TYPE: \
+ case SKEIN_256_MAC_MECH_INFO_TYPE: \
+ (void) Skein_256_ ## _op(&sc->sc_256, __VA_ARGS__);\
+ break; \
+ case SKEIN_512_MECH_INFO_TYPE: \
+ case SKEIN_512_MAC_MECH_INFO_TYPE: \
+ (void) Skein_512_ ## _op(&sc->sc_512, __VA_ARGS__);\
+ break; \
+ case SKEIN1024_MECH_INFO_TYPE: \
+ case SKEIN1024_MAC_MECH_INFO_TYPE: \
+ (void) Skein1024_ ## _op(&sc->sc_1024, __VA_ARGS__);\
+ break; \
+ } \
+ _NOTE(CONSTCOND) \
+ } while (0)
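+
+/*
+ * For illustration (descriptive only, derived from the macro above): a call
+ * such as
+ *
+ *     SKEIN_OP(sc, Update, buf, len);
+ *
+ * expands to (void) Skein_512_Update(&sc->sc_512, buf, len) when
+ * sc->sc_mech_type is SKEIN_512_MECH_INFO_TYPE (or its MAC variant), and
+ * analogously to the Skein_256/Skein1024 routines for the other mechanism
+ * types.
+ */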
+
+static int
+skein_get_digest_bitlen(const crypto_mechanism_t *mechanism, size_t *result)
+{
+ if (mechanism->cm_param != NULL) {
+ /*LINTED(E_BAD_PTR_CAST_ALIGN)*/
+ skein_param_t *param = (skein_param_t *)mechanism->cm_param;
+
+ if (mechanism->cm_param_len != sizeof (*param) ||
+ param->sp_digest_bitlen == 0) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ *result = param->sp_digest_bitlen;
+ } else {
+ switch (mechanism->cm_type) {
+ case SKEIN_256_MECH_INFO_TYPE:
+ *result = 256;
+ break;
+ case SKEIN_512_MECH_INFO_TYPE:
+ *result = 512;
+ break;
+ case SKEIN1024_MECH_INFO_TYPE:
+ *result = 1024;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+ }
+ return (CRYPTO_SUCCESS);
+}
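+
+/*
+ * Illustrative sketch (not part of the original change; it assumes
+ * skein_param_t carries only the sp_digest_bitlen member used above): a
+ * caller requesting a non-default output length passes a skein_param_t
+ * through the mechanism's cm_param field, e.g. for a 384-bit Skein-512
+ * digest:
+ *
+ *     skein_param_t param = { .sp_digest_bitlen = 384 };
+ *     crypto_mechanism_t mech;
+ *
+ *     mech.cm_type = SKEIN_512_MECH_INFO_TYPE;
+ *     mech.cm_param = (caddr_t)&param;
+ *     mech.cm_param_len = sizeof (param);
+ *
+ * With cm_param left NULL, the defaults above apply (256, 512 or 1024 bits).
+ */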
+
+int
+skein_mod_init(void)
+{
+ int error;
+
+ if ((error = mod_install(&modlinkage)) != 0)
+ return (error);
+
+ /*
+ * Try to register with KCF - failure shouldn't unload us, since we
+ * still may want to continue providing misc/skein functionality.
+ */
+ (void) crypto_register_provider(&skein_prov_info, &skein_prov_handle);
+
+ return (0);
+}
+
+int
+skein_mod_fini(void)
+{
+ int ret;
+
+ if (skein_prov_handle != 0) {
+ if ((ret = crypto_unregister_provider(skein_prov_handle)) !=
+ CRYPTO_SUCCESS) {
+ cmn_err(CE_WARN,
+ "skein _fini: crypto_unregister_provider() "
+ "failed (0x%x)", ret);
+ return (EBUSY);
+ }
+ skein_prov_handle = 0;
+ }
+
+ return (mod_remove(&modlinkage));
+}
+
+/*
+ * KCF software provider control entry points.
+ */
+/* ARGSUSED */
+static void
+skein_provider_status(crypto_provider_handle_t provider, uint_t *status)
+{
+ *status = CRYPTO_PROVIDER_READY;
+}
+
+/*
+ * General Skein hashing helper functions.
+ */
+
+/*
+ * Performs an Update on a context with uio input data.
+ */
+static int
+skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data)
+{
+ off_t offset = data->cd_offset;
+ size_t length = data->cd_length;
+ uint_t vec_idx = 0;
+ size_t cur_len;
+ zfs_uio_t *uio = data->cd_uio;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing data to be
+ * digested.
+ */
+ offset = zfs_uio_index_at_offset(uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(uio)) {
+ /*
+ * The caller specified an offset that is larger than the
+ * total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ /*
+ * Now do the digesting on the iovecs.
+ */
+ while (vec_idx < zfs_uio_iovcnt(uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(uio, vec_idx) - offset, length);
+ SKEIN_OP(ctx, Update, (uint8_t *)zfs_uio_iovbase(uio, vec_idx)
+ + offset, cur_len);
+ length -= cur_len;
+ vec_idx++;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e., the
+ * caller requested to digest more data than it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Performs a Final on a context and writes to a uio digest output.
+ */
+static int
+skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ off_t offset = digest->cd_offset;
+ uint_t vec_idx = 0;
+ zfs_uio_t *uio = digest->cd_uio;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing ptr to the digest to be returned.
+ */
+ offset = zfs_uio_index_at_offset(uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(uio)) {
+ /*
+ * The caller specified an offset that is larger than the
+ * total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ if (offset + CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen) <=
+ zfs_uio_iovlen(uio, vec_idx)) {
+ /* The computed digest will fit in the current iovec. */
+ SKEIN_OP(ctx, Final,
+ (uchar_t *)zfs_uio_iovbase(uio, vec_idx) + offset);
+ } else {
+ uint8_t *digest_tmp;
+ off_t scratch_offset = 0;
+ size_t length = CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen);
+ size_t cur_len;
+
+ digest_tmp = kmem_alloc(CRYPTO_BITS2BYTES(
+ ctx->sc_digest_bitlen), crypto_kmflag(req));
+ if (digest_tmp == NULL)
+ return (CRYPTO_HOST_MEMORY);
+ SKEIN_OP(ctx, Final, digest_tmp);
+ while (vec_idx < zfs_uio_iovcnt(uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(uio, vec_idx) - offset,
+ length);
+ bcopy(digest_tmp + scratch_offset,
+ zfs_uio_iovbase(uio, vec_idx) + offset, cur_len);
+
+ length -= cur_len;
+ vec_idx++;
+ scratch_offset += cur_len;
+ offset = 0;
+ }
+ kmem_free(digest_tmp, CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen));
+
+ if (vec_idx == zfs_uio_iovcnt(uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e., the
+ * caller requested to digest more data than it
+ * provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * KCF software provider digest entry points.
+ */
+
+/*
+ * Initializes a skein digest context to the configuration in `mechanism'.
+ * The mechanism cm_type must be one of SKEIN_*_MECH_INFO_TYPE. The cm_param
+ * field may contain a skein_param_t structure indicating the length of the
+ * digest the algorithm should produce. Otherwise the default output lengths
+ * are applied (32 bytes for Skein-256, 64 bytes for Skein-512 and 128 bytes
+ * for Skein-1024).
+ */
+static int
+skein_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_req_handle_t req)
+{
+ int error = CRYPTO_SUCCESS;
+
+ if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type))
+ return (CRYPTO_MECHANISM_INVALID);
+
+ SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)),
+ crypto_kmflag(req));
+ if (SKEIN_CTX(ctx) == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ SKEIN_CTX(ctx)->sc_mech_type = mechanism->cm_type;
+ error = skein_get_digest_bitlen(mechanism,
+ &SKEIN_CTX(ctx)->sc_digest_bitlen);
+ if (error != CRYPTO_SUCCESS)
+ goto errout;
+ SKEIN_OP(SKEIN_CTX(ctx), Init, SKEIN_CTX(ctx)->sc_digest_bitlen);
+
+ return (CRYPTO_SUCCESS);
+errout:
+ bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ SKEIN_CTX_LVALUE(ctx) = NULL;
+ return (error);
+}
+
+/*
+ * Executes a skein_update and skein_digest on a pre-initialized crypto
+ * context in a single step. See the documentation to these functions to
+ * see what to pass here.
+ */
+static int
+skein_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int error = CRYPTO_SUCCESS;
+
+ ASSERT(SKEIN_CTX(ctx) != NULL);
+
+ if (digest->cd_length <
+ CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) {
+ digest->cd_length =
+ CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen);
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ error = skein_update(ctx, data, req);
+ if (error != CRYPTO_SUCCESS) {
+ bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ SKEIN_CTX_LVALUE(ctx) = NULL;
+ digest->cd_length = 0;
+ return (error);
+ }
+ error = skein_final(ctx, digest, req);
+
+ return (error);
+}
+
+/*
+ * Performs a skein Update with the input message in `data' (successive calls
+ * can push more data). This is used both for digest and MAC operation.
+ * Supported input data formats are raw and uio.
+ */
+/*ARGSUSED*/
+static int
+skein_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req)
+{
+ int error = CRYPTO_SUCCESS;
+
+ ASSERT(SKEIN_CTX(ctx) != NULL);
+
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SKEIN_OP(SKEIN_CTX(ctx), Update,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ error = skein_digest_update_uio(SKEIN_CTX(ctx), data);
+ break;
+ default:
+ error = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (error);
+}
+
+/*
+ * Performs a skein Final, writing the output to `digest'. This is used both
+ * for digest and MAC operation.
+ * Supported output digest formats are raw and uio.
+ */
+/*ARGSUSED*/
+static int
+skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req)
+{
+ int error = CRYPTO_SUCCESS;
+
+ ASSERT(SKEIN_CTX(ctx) != NULL);
+
+ if (digest->cd_length <
+ CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) {
+ digest->cd_length =
+ CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen);
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SKEIN_OP(SKEIN_CTX(ctx), Final,
+ (uint8_t *)digest->cd_raw.iov_base + digest->cd_offset);
+ break;
+ case CRYPTO_DATA_UIO:
+ error = skein_digest_final_uio(SKEIN_CTX(ctx), digest, req);
+ break;
+ default:
+ error = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (error == CRYPTO_SUCCESS)
+ digest->cd_length =
+ CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen);
+ else
+ digest->cd_length = 0;
+
+ bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ kmem_free(SKEIN_CTX(ctx), sizeof (*(SKEIN_CTX(ctx))));
+ SKEIN_CTX_LVALUE(ctx) = NULL;
+
+ return (error);
+}
+
+/*
+ * Performs a full skein digest computation in a single call, configuring the
+ * algorithm according to `mechanism', reading the input to be digested from
+ * `data' and writing the output to `digest'.
+ * Supported input/output formats are raw and uio.
+ */
+/*ARGSUSED*/
+static int
+skein_digest_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_data_t *data, crypto_data_t *digest, crypto_req_handle_t req)
+{
+ int error;
+ skein_ctx_t skein_ctx;
+ crypto_ctx_t ctx;
+ SKEIN_CTX_LVALUE(&ctx) = &skein_ctx;
+
+ /* Init */
+ if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type))
+ return (CRYPTO_MECHANISM_INVALID);
+ skein_ctx.sc_mech_type = mechanism->cm_type;
+ error = skein_get_digest_bitlen(mechanism, &skein_ctx.sc_digest_bitlen);
+ if (error != CRYPTO_SUCCESS)
+ goto out;
+ SKEIN_OP(&skein_ctx, Init, skein_ctx.sc_digest_bitlen);
+
+ if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS)
+ goto out;
+ if ((error = skein_final(&ctx, digest, req)) != CRYPTO_SUCCESS)
+ goto out;
+
+out:
+ if (error == CRYPTO_SUCCESS)
+ digest->cd_length =
+ CRYPTO_BITS2BYTES(skein_ctx.sc_digest_bitlen);
+ else
+ digest->cd_length = 0;
+ bzero(&skein_ctx, sizeof (skein_ctx));
+
+ return (error);
+}
+
+/*
+ * Helper function that builds a Skein MAC context from the provided
+ * mechanism and key.
+ */
+static int
+skein_mac_ctx_build(skein_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key)
+{
+ int error;
+
+ if (!VALID_SKEIN_MAC_MECH(mechanism->cm_type))
+ return (CRYPTO_MECHANISM_INVALID);
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+ ctx->sc_mech_type = mechanism->cm_type;
+ error = skein_get_digest_bitlen(mechanism, &ctx->sc_digest_bitlen);
+ if (error != CRYPTO_SUCCESS)
+ return (error);
+ SKEIN_OP(ctx, InitExt, ctx->sc_digest_bitlen, 0, key->ck_data,
+ CRYPTO_BITS2BYTES(key->ck_length));
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * KCF software provider mac entry points.
+ */
+/*
+ * Initializes a skein MAC context. You may pass a ctx_template, in which
+ * case the template will be reused to make initialization more efficient.
+ * Otherwise a new context will be constructed. The mechanism cm_type must
+ * be one of SKEIN_*_MAC_MECH_INFO_TYPE. Same as in skein_digest_init, you
+ * may pass a skein_param_t in cm_param to configure the length of the
+ * digest. The key must be in raw format.
+ */
+static int
+skein_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t ctx_template,
+ crypto_req_handle_t req)
+{
+ int error;
+
+ SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)),
+ crypto_kmflag(req));
+ if (SKEIN_CTX(ctx) == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ if (ctx_template != NULL) {
+ bcopy(ctx_template, SKEIN_CTX(ctx),
+ sizeof (*SKEIN_CTX(ctx)));
+ } else {
+ error = skein_mac_ctx_build(SKEIN_CTX(ctx), mechanism, key);
+ if (error != CRYPTO_SUCCESS)
+ goto errout;
+ }
+
+ return (CRYPTO_SUCCESS);
+errout:
+ bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ return (error);
+}
+
+/*
+ * The MAC update and final calls are reused from the regular digest code.
+ */
+
+/*ARGSUSED*/
+/*
+ * Same as skein_digest_atomic, performs an atomic Skein MAC operation in
+ * one step. All the same properties apply to the arguments of this
+ * function as to those of the partial operations above.
+ */
+static int
+skein_mac_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+ /* faux crypto context just for skein_digest_{update,final} */
+ int error;
+ crypto_ctx_t ctx;
+ skein_ctx_t skein_ctx;
+ SKEIN_CTX_LVALUE(&ctx) = &skein_ctx;
+
+ if (ctx_template != NULL) {
+ bcopy(ctx_template, &skein_ctx, sizeof (skein_ctx));
+ } else {
+ error = skein_mac_ctx_build(&skein_ctx, mechanism, key);
+ if (error != CRYPTO_SUCCESS)
+ goto errout;
+ }
+
+ if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS)
+ goto errout;
+ if ((error = skein_final(&ctx, mac, req)) != CRYPTO_SUCCESS)
+ goto errout;
+
+ return (CRYPTO_SUCCESS);
+errout:
+ bzero(&skein_ctx, sizeof (skein_ctx));
+ return (error);
+}
+
+/*
+ * KCF software provider context management entry points.
+ */
+
+/*
+ * Constructs a context template for the Skein MAC algorithm. The same
+ * properties apply to the arguments of this function as to those of
+ * skein_mac_init.
+ */
+/*ARGSUSED*/
+static int
+skein_create_ctx_template(crypto_provider_handle_t provider,
+ crypto_mechanism_t *mechanism, crypto_key_t *key,
+ crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size,
+ crypto_req_handle_t req)
+{
+ int error;
+ skein_ctx_t *ctx_tmpl;
+
+ ctx_tmpl = kmem_alloc(sizeof (*ctx_tmpl), crypto_kmflag(req));
+ if (ctx_tmpl == NULL)
+ return (CRYPTO_HOST_MEMORY);
+ error = skein_mac_ctx_build(ctx_tmpl, mechanism, key);
+ if (error != CRYPTO_SUCCESS)
+ goto errout;
+ *ctx_template = ctx_tmpl;
+ *ctx_template_size = sizeof (*ctx_tmpl);
+
+ return (CRYPTO_SUCCESS);
+errout:
+ bzero(ctx_tmpl, sizeof (*ctx_tmpl));
+ kmem_free(ctx_tmpl, sizeof (*ctx_tmpl));
+ return (error);
+}
+
+/*
+ * Frees a skein context in a parent crypto context.
+ */
+static int
+skein_free_context(crypto_ctx_t *ctx)
+{
+ if (SKEIN_CTX(ctx) != NULL) {
+ bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ SKEIN_CTX_LVALUE(ctx) = NULL;
+ }
+
+ return (CRYPTO_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/module/icp/os/modconf.c b/sys/contrib/openzfs/module/icp/os/modconf.c
new file mode 100644
index 000000000000..3743416ed951
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/os/modconf.c
@@ -0,0 +1,173 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/modctl.h>
+
+/*
+ * Null operations; used for uninitialized and "misc" modules.
+ */
+static int mod_null(struct modlmisc *, struct modlinkage *);
+static int mod_infonull(void *, struct modlinkage *, int *);
+
+/*
+ * Cryptographic Modules
+ */
+struct mod_ops mod_cryptoops = {
+ .modm_install = mod_null,
+ .modm_remove = mod_null,
+ .modm_info = mod_infonull
+};
+
+/*
+ * Null operation; return 0.
+ */
+static int
+mod_null(struct modlmisc *modl, struct modlinkage *modlp)
+{
+ return (0);
+}
+
+/*
+ * Status for User modules.
+ */
+static int
+mod_infonull(void *modl, struct modlinkage *modlp, int *p0)
+{
+ *p0 = -1; /* for modinfo display */
+ return (0);
+}
+
+/*
+ * Install a module.
+ * (This routine is in the Solaris SPARC DDI/DKI)
+ */
+int
+mod_install(struct modlinkage *modlp)
+{
+ int retval = -1; /* No linkage structures */
+ struct modlmisc **linkpp;
+ struct modlmisc **linkpp1;
+
+ if (modlp->ml_rev != MODREV_1) {
+ cmn_err(CE_WARN, "mod_install: "
+ "modlinkage structure is not MODREV_1\n");
+ return (EINVAL);
+ }
+ linkpp = (struct modlmisc **)&modlp->ml_linkage[0];
+
+ while (*linkpp != NULL) {
+ if ((retval = MODL_INSTALL(*linkpp, modlp)) != 0) {
+ linkpp1 = (struct modlmisc **)&modlp->ml_linkage[0];
+
+ while (linkpp1 != linkpp) {
+ MODL_REMOVE(*linkpp1, modlp); /* clean up */
+ linkpp1++;
+ }
+ break;
+ }
+ linkpp++;
+ }
+ return (retval);
+}
+
+static char *reins_err =
+ "Could not reinstall %s\nReboot to correct the problem";
+
+/*
+ * Remove a module. This is called by the module wrapper routine.
+ * (This routine is in the Solaris SPARC DDI/DKI)
+ */
+int
+mod_remove(struct modlinkage *modlp)
+{
+ int retval = 0;
+ struct modlmisc **linkpp, *last_linkp;
+
+ linkpp = (struct modlmisc **)&modlp->ml_linkage[0];
+
+ while (*linkpp != NULL) {
+ if ((retval = MODL_REMOVE(*linkpp, modlp)) != 0) {
+ last_linkp = *linkpp;
+ linkpp = (struct modlmisc **)&modlp->ml_linkage[0];
+ while (*linkpp != last_linkp) {
+ if (MODL_INSTALL(*linkpp, modlp) != 0) {
+ cmn_err(CE_WARN, reins_err,
+ (*linkpp)->misc_linkinfo);
+ break;
+ }
+ linkpp++;
+ }
+ break;
+ }
+ linkpp++;
+ }
+ return (retval);
+}
+
+/*
+ * Get module status.
+ * (This routine is in the Solaris SPARC DDI/DKI)
+ */
+int
+mod_info(struct modlinkage *modlp, struct modinfo *modinfop)
+{
+ int i;
+ int retval = 0;
+ struct modspecific_info *msip;
+ struct modlmisc **linkpp;
+
+ modinfop->mi_rev = modlp->ml_rev;
+
+ linkpp = (struct modlmisc **)modlp->ml_linkage;
+ msip = &modinfop->mi_msinfo[0];
+
+ for (i = 0; i < MODMAXLINK; i++) {
+ if (*linkpp == NULL) {
+ msip->msi_linkinfo[0] = '\0';
+ } else {
+ (void) strlcpy(msip->msi_linkinfo,
+ (*linkpp)->misc_linkinfo, MODMAXLINKINFOLEN);
+ retval = MODL_INFO(*linkpp, modlp, &msip->msi_p0);
+ if (retval != 0)
+ break;
+ linkpp++;
+ }
+ msip++;
+ }
+
+ if (modinfop->mi_info == MI_INFO_LINKAGE) {
+ /*
+ * Slight kludge used to extract the address of the
+ * modlinkage structure from the module (just after
+ * loading a module for the very first time)
+ */
+ modinfop->mi_base = (void *)modlp;
+ }
+
+ if (retval == 0)
+ return (1);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/icp/os/modhash.c b/sys/contrib/openzfs/module/icp/os/modhash.c
new file mode 100644
index 000000000000..a897871001ce
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/os/modhash.c
@@ -0,0 +1,927 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * mod_hash: flexible hash table implementation.
+ *
+ * This is a reasonably fast, reasonably flexible hash table implementation
+ * which features pluggable hash algorithms to support storing arbitrary keys
+ * and values. It is designed to handle small (< 100,000 items) amounts of
+ * data. The hash uses chaining to resolve collisions, and does not feature a
+ * mechanism to grow the hash. Care must be taken to pick nchains to be large
+ * enough for the application at hand, or lots of time will be wasted searching
+ * hash chains.
+ *
+ * The client of the hash is required to supply a number of items to support
+ * the various hash functions:
+ *
+ * - Destructor functions for the key and value being hashed.
+ * A destructor is responsible for freeing an object when the hash
+ * table is no longer storing it. Since keys and values can be of
+ * arbitrary type, separate destructors for keys & values are used.
+ * These may be mod_hash_null_keydtor and mod_hash_null_valdtor if no
+ * destructor is needed for either a key or value.
+ *
+ * - A hashing algorithm which returns a uint_t representing a hash index
+ * The number returned need _not_ be between 0 and nchains. The mod_hash
+ * code will take care of doing that. The second argument (after the
+ * key) to the hashing function is a void * that represents
+ * hash_alg_data-- this is provided so that the hashing algorithm can
+ * maintain some state across calls, or keep algorithm-specific
+ * constants associated with the hash table.
+ *
+ * A pointer-hashing and a string-hashing algorithm are supplied in
+ * this file.
+ *
+ * - A key comparator (a la qsort).
+ * This is used when searching the hash chain. The key comparator
+ * determines if two keys match. It should follow the return value
+ * semantics of strcmp.
+ *
+ * string and pointer comparators are supplied in this file.
+ *
+ * mod_hash_create_strhash() and mod_hash_create_ptrhash() provide good
+ * examples of how to create a customized hash table.
+ *
+ * Basic hash operations:
+ *
+ * mod_hash_create_strhash(name, nchains, dtor),
+ * create a hash using strings as keys.
+ * NOTE: This creates a hash which automatically cleans up the string
+ * values it is given for keys.
+ *
+ * mod_hash_create_ptrhash(name, nchains, dtor, key_elem_size):
+ * create a hash using pointers as keys.
+ *
+ * mod_hash_create_extended(name, nchains, kdtor, vdtor,
+ * hash_alg, hash_alg_data,
+ * keycmp, sleep)
+ * create a customized hash table.
+ *
+ * mod_hash_destroy_hash(hash):
+ * destroy the given hash table, calling the key and value destructors
+ * on each key-value pair stored in the hash.
+ *
+ * mod_hash_insert(hash, key, val):
+ * place a key, value pair into the given hash.
+ * duplicate keys are rejected.
+ *
+ * mod_hash_insert_reserve(hash, key, val, handle):
+ * place a key, value pair into the given hash, using handle to indicate
+ * the reserved storage for the pair. (no memory allocation is needed
+ * during a mod_hash_insert_reserve.) duplicate keys are rejected.
+ *
+ * mod_hash_reserve(hash, *handle):
+ * reserve storage for a key-value pair using the memory allocation
+ * policy of 'hash', returning the storage handle in 'handle'.
+ *
+ * mod_hash_reserve_nosleep(hash, *handle): reserve storage for a key-value
+ * pair ignoring the memory allocation policy of 'hash' and always without
+ * sleep, returning the storage handle in 'handle'.
+ *
+ * mod_hash_remove(hash, key, *val):
+ * remove a key-value pair with key 'key' from 'hash', destroying the
+ * stored key, and returning the value in val.
+ *
+ * mod_hash_replace(hash, key, val)
+ * atomically remove an existing key-value pair from a hash, and replace
+ * the key and value with the ones supplied. The removed key and value
+ * (if any) are destroyed.
+ *
+ * mod_hash_destroy(hash, key):
+ * remove a key-value pair with key 'key' from 'hash', destroying both
+ * stored key and stored value.
+ *
+ * mod_hash_find(hash, key, val):
+ * find a value in the hash table corresponding to the given key.
+ *
+ * mod_hash_find_cb(hash, key, val, found_callback)
+ * find a value in the hash table corresponding to the given key.
+ * If a value is found, call specified callback passing key and val to it.
+ * The callback is called with the hash lock held.
+ * It is intended to be used in situations where the act of locating the
+ * data must also modify it - such as in reference counting schemes.
+ *
+ * mod_hash_walk(hash, callback(key, elem, arg), arg)
+ * walks all the elements in the hashtable and invokes the callback
+ * function with the key/value pair for each element. the hashtable
+ * is locked for readers so the callback function should not attempt
+ * to do any updates to the hashtable. the callback function should
+ * return MH_WALK_CONTINUE to continue walking the hashtable or
+ * MH_WALK_TERMINATE to abort the walk of the hashtable.
+ *
+ * mod_hash_clear(hash):
+ * clears the given hash table of entries, calling the key and value
+ * destructors for every element in the hash.
+ */
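+
+/*
+ * Minimal usage sketch (illustrative only; "bar" stands for any
+ * caller-owned pointer value and is not part of this interface):
+ *
+ *     mod_hash_t *h;
+ *     mod_hash_val_t val;
+ *
+ *     h = mod_hash_create_strhash_nodtr("example hash", 128,
+ *         mod_hash_null_valdtor);
+ *     (void) mod_hash_insert(h, (mod_hash_key_t)"foo", (mod_hash_val_t)bar);
+ *     if (mod_hash_find(h, (mod_hash_key_t)"foo", &val) == 0)
+ *         ... val now holds bar ...
+ *     mod_hash_destroy_strhash(h);
+ */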
+
+#include <sys/zfs_context.h>
+#include <sys/bitmap.h>
+#include <sys/modhash_impl.h>
+#include <sys/sysmacros.h>
+
+/*
+ * MH_KEY_DESTROY()
+ * Invoke the key destructor.
+ */
+#define MH_KEY_DESTROY(hash, key) ((hash->mh_kdtor)(key))
+
+/*
+ * MH_VAL_DESTROY()
+ * Invoke the value destructor.
+ */
+#define MH_VAL_DESTROY(hash, val) ((hash->mh_vdtor)(val))
+
+/*
+ * MH_KEYCMP()
+ * Call the key comparator for the given hash keys.
+ */
+#define MH_KEYCMP(hash, key1, key2) ((hash->mh_keycmp)(key1, key2))
+
+/*
+ * Cache for struct mod_hash_entry
+ */
+kmem_cache_t *mh_e_cache = NULL;
+mod_hash_t *mh_head = NULL;
+kmutex_t mh_head_lock;
+
+/*
+ * mod_hash_null_keydtor()
+ * mod_hash_null_valdtor()
+ * no-op key and value destructors.
+ */
+/*ARGSUSED*/
+void
+mod_hash_null_keydtor(mod_hash_key_t key)
+{
+}
+
+/*ARGSUSED*/
+void
+mod_hash_null_valdtor(mod_hash_val_t val)
+{
+}
+
+/*
+ * mod_hash_bystr()
+ * mod_hash_strkey_cmp()
+ * mod_hash_strkey_dtor()
+ * mod_hash_strval_dtor()
+ * Hash and key comparison routines for hashes with string keys.
+ *
+ * mod_hash_create_strhash()
+ * Create a hash using strings as keys
+ *
+ * The string hashing algorithm is from the "Dragon Book" --
+ * "Compilers: Principles, Tools & Techniques", by Aho, Sethi, Ullman
+ */
+
+/*ARGSUSED*/
+uint_t
+mod_hash_bystr(void *hash_data, mod_hash_key_t key)
+{
+ uint_t hash = 0;
+ uint_t g;
+ char *p, *k = (char *)key;
+
+ ASSERT(k);
+ for (p = k; *p != '\0'; p++) {
+ hash = (hash << 4) + *p;
+ if ((g = (hash & 0xf0000000)) != 0) {
+ hash ^= (g >> 24);
+ hash ^= g;
+ }
+ }
+ return (hash);
+}
+
+int
+mod_hash_strkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
+{
+ return (strcmp((char *)key1, (char *)key2));
+}
+
+void
+mod_hash_strkey_dtor(mod_hash_key_t key)
+{
+ char *c = (char *)key;
+ kmem_free(c, strlen(c) + 1);
+}
+
+void
+mod_hash_strval_dtor(mod_hash_val_t val)
+{
+ char *c = (char *)val;
+ kmem_free(c, strlen(c) + 1);
+}
+
+mod_hash_t *
+mod_hash_create_strhash_nodtr(char *name, size_t nchains,
+ void (*val_dtor)(mod_hash_val_t))
+{
+ return mod_hash_create_extended(name, nchains, mod_hash_null_keydtor,
+ val_dtor, mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
+}
+
+mod_hash_t *
+mod_hash_create_strhash(char *name, size_t nchains,
+ void (*val_dtor)(mod_hash_val_t))
+{
+ return mod_hash_create_extended(name, nchains, mod_hash_strkey_dtor,
+ val_dtor, mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
+}
+
+void
+mod_hash_destroy_strhash(mod_hash_t *strhash)
+{
+ ASSERT(strhash);
+ mod_hash_destroy_hash(strhash);
+}
+
+
+/*
+ * mod_hash_byptr()
+ * mod_hash_ptrkey_cmp()
+ * Hash and key comparison routines for hashes with pointer keys.
+ *
+ * mod_hash_create_ptrhash()
+ * mod_hash_destroy_ptrhash()
+ * Create a hash that uses pointers as keys. This hash algorithm
+ * picks an appropriate set of middle bits in the address to hash on
+ * based on the size of the hash table and a hint about the size of
+ * the items pointed at.
+ */
+uint_t
+mod_hash_byptr(void *hash_data, mod_hash_key_t key)
+{
+ uintptr_t k = (uintptr_t)key;
+ k >>= (int)(uintptr_t)hash_data;
+
+ return ((uint_t)k);
+}
+
+int
+mod_hash_ptrkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
+{
+ uintptr_t k1 = (uintptr_t)key1;
+ uintptr_t k2 = (uintptr_t)key2;
+ if (k1 > k2)
+ return (-1);
+ else if (k1 < k2)
+ return (1);
+ else
+ return (0);
+}
+
+mod_hash_t *
+mod_hash_create_ptrhash(char *name, size_t nchains,
+ void (*val_dtor)(mod_hash_val_t), size_t key_elem_size)
+{
+ size_t rshift;
+
+ /*
+ * We want to hash on the bits in the middle of the address word
+ * Bits far to the right in the word have little significance, and
+ * are likely to all look the same (for example, an array of
+ * 256-byte structures will have the bottom 8 bits of address
+ * words the same). So we want to right-shift each address to
+ * ignore the bottom bits.
+ *
+ * The high bits, which are also unused, will get taken out when
+ * mod_hash takes hashkey % nchains.
+ */
+ rshift = highbit64(key_elem_size);
+
+ return mod_hash_create_extended(name, nchains, mod_hash_null_keydtor,
+ val_dtor, mod_hash_byptr, (void *)rshift, mod_hash_ptrkey_cmp,
+ KM_SLEEP);
+}
+
+void
+mod_hash_destroy_ptrhash(mod_hash_t *hash)
+{
+ ASSERT(hash);
+ mod_hash_destroy_hash(hash);
+}
+
+/*
+ * mod_hash_byid()
+ * mod_hash_idkey_cmp()
+ * Hash and key comparison routines for hashes with 32-bit unsigned keys.
+ *
+ * mod_hash_create_idhash()
+ * mod_hash_destroy_idhash()
+ * mod_hash_iddata_gen()
+ * Create a hash that uses numeric keys.
+ *
+ * The hash algorithm is documented in "Introduction to Algorithms"
+ * (Cormen, Leiserson, Rivest); when the hash table is created, it
+ * attempts to find the next largest prime above the number of hash
+ * slots. The hash index is then this number times the key modulo
+ * the hash size, or (key * prime) % nchains.
+ */
+uint_t
+mod_hash_byid(void *hash_data, mod_hash_key_t key)
+{
+ uint_t kval = (uint_t)(uintptr_t)hash_data;
+ return ((uint_t)(uintptr_t)key * (uint_t)kval);
+}
+
+int
+mod_hash_idkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
+{
+ return ((uint_t)(uintptr_t)key1 - (uint_t)(uintptr_t)key2);
+}
+
+/*
+ * Generate the next prime number greater than nchains; this value
+ * is intended to be later passed in to mod_hash_create_extended() as the
+ * hash_data.
+ */
+uint_t
+mod_hash_iddata_gen(size_t nchains)
+{
+ uint_t kval, i, prime;
+
+ /*
+ * Pick the first (odd) prime greater than nchains. Make sure kval is
+ * odd (so start with nchains +1 or +2 as appropriate).
+ */
+ kval = (nchains % 2 == 0) ? nchains + 1 : nchains + 2;
+
+ for (;;) {
+ prime = 1;
+ for (i = 3; i * i <= kval; i += 2) {
+ if (kval % i == 0)
+ prime = 0;
+ }
+ if (prime == 1)
+ break;
+ kval += 2;
+ }
+ return (kval);
+}
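+
+/*
+ * Worked example: for nchains = 100 the search starts at 101, which is
+ * prime, so 101 is returned; mod_hash_byid() then hashes a key as
+ * key * 101, and i_mod_hash() folds that product into a chain index.
+ */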
+
+mod_hash_t *
+mod_hash_create_idhash(char *name, size_t nchains,
+ void (*val_dtor)(mod_hash_val_t))
+{
+ uint_t kval = mod_hash_iddata_gen(nchains);
+
+ return (mod_hash_create_extended(name, nchains, mod_hash_null_keydtor,
+ val_dtor, mod_hash_byid, (void *)(uintptr_t)kval,
+ mod_hash_idkey_cmp, KM_SLEEP));
+}
+
+void
+mod_hash_destroy_idhash(mod_hash_t *hash)
+{
+ ASSERT(hash);
+ mod_hash_destroy_hash(hash);
+}
+
+void
+mod_hash_fini(void)
+{
+ mutex_destroy(&mh_head_lock);
+
+ if (mh_e_cache) {
+ kmem_cache_destroy(mh_e_cache);
+ mh_e_cache = NULL;
+ }
+}
+
+/*
+ * mod_hash_init()
+ * sets up globals, etc for mod_hash_*
+ */
+void
+mod_hash_init(void)
+{
+ ASSERT(mh_e_cache == NULL);
+ mh_e_cache = kmem_cache_create("mod_hash_entries",
+ sizeof (struct mod_hash_entry), 0, NULL, NULL, NULL, NULL,
+ NULL, 0);
+
+ mutex_init(&mh_head_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * mod_hash_create_extended()
+ * The full-blown hash creation function.
+ *
+ * notes:
+ * nchains - how many hash slots to create. More hash slots will
+ * result in shorter hash chains, but will consume
+ * slightly more memory up front.
+ * sleep - should be KM_SLEEP or KM_NOSLEEP, to indicate whether
+ * to sleep for memory, or fail in low-memory conditions.
+ *
+ * Fails only if KM_NOSLEEP was specified, and no memory was available.
+ */
+mod_hash_t *
+mod_hash_create_extended(
+ char *hname, /* descriptive name for hash */
+ size_t nchains, /* number of hash slots */
+ void (*kdtor)(mod_hash_key_t), /* key destructor */
+ void (*vdtor)(mod_hash_val_t), /* value destructor */
+ uint_t (*hash_alg)(void *, mod_hash_key_t), /* hash algorithm */
+ void *hash_alg_data, /* pass-thru arg for hash_alg */
+ int (*keycmp)(mod_hash_key_t, mod_hash_key_t), /* key comparator */
+ int sleep) /* whether to sleep for mem */
+{
+ mod_hash_t *mod_hash;
+ size_t size;
+ ASSERT(hname && keycmp && hash_alg && vdtor && kdtor);
+
+ if ((mod_hash = kmem_zalloc(MH_SIZE(nchains), sleep)) == NULL)
+ return (NULL);
+
+ size = strlen(hname) + 1;
+ mod_hash->mh_name = kmem_alloc(size, sleep);
+ if (mod_hash->mh_name == NULL) {
+ kmem_free(mod_hash, MH_SIZE(nchains));
+ return (NULL);
+ }
+ (void) strlcpy(mod_hash->mh_name, hname, size);
+
+ rw_init(&mod_hash->mh_contents, NULL, RW_DEFAULT, NULL);
+ mod_hash->mh_sleep = sleep;
+ mod_hash->mh_nchains = nchains;
+ mod_hash->mh_kdtor = kdtor;
+ mod_hash->mh_vdtor = vdtor;
+ mod_hash->mh_hashalg = hash_alg;
+ mod_hash->mh_hashalg_data = hash_alg_data;
+ mod_hash->mh_keycmp = keycmp;
+
+ /*
+ * Link the hash up on the list of hashes
+ */
+ mutex_enter(&mh_head_lock);
+ mod_hash->mh_next = mh_head;
+ mh_head = mod_hash;
+ mutex_exit(&mh_head_lock);
+
+ return (mod_hash);
+}
+
+/*
+ * mod_hash_destroy_hash()
+ * destroy a hash table, destroying all of its stored keys and values
+ * as well.
+ */
+void
+mod_hash_destroy_hash(mod_hash_t *hash)
+{
+ mod_hash_t *mhp, *mhpp;
+
+ mutex_enter(&mh_head_lock);
+ /*
+ * Remove the hash from the hash list
+ */
+ if (hash == mh_head) { /* removing 1st list elem */
+ mh_head = mh_head->mh_next;
+ } else {
+ /*
+ * mhpp can start out NULL since we know the 1st elem isn't the
+ * droid we're looking for.
+ */
+ mhpp = NULL;
+ for (mhp = mh_head; mhp != NULL; mhp = mhp->mh_next) {
+ if (mhp == hash) {
+ mhpp->mh_next = mhp->mh_next;
+ break;
+ }
+ mhpp = mhp;
+ }
+ }
+ mutex_exit(&mh_head_lock);
+
+ /*
+ * Clean out keys and values.
+ */
+ mod_hash_clear(hash);
+
+ rw_destroy(&hash->mh_contents);
+ kmem_free(hash->mh_name, strlen(hash->mh_name) + 1);
+ kmem_free(hash, MH_SIZE(hash->mh_nchains));
+}
+
+/*
+ * i_mod_hash()
+ * Call the hashing algorithm for this hash table, with the given key.
+ */
+uint_t
+i_mod_hash(mod_hash_t *hash, mod_hash_key_t key)
+{
+ uint_t h;
+ /*
+ * Prevent div by 0 problems;
+ * Also a nice shortcut when using a hash as a list
+ */
+ if (hash->mh_nchains == 1)
+ return (0);
+
+ h = (hash->mh_hashalg)(hash->mh_hashalg_data, key);
+ return (h % (hash->mh_nchains - 1));
+}
+
+/*
+ * i_mod_hash_insert_nosync()
+ * mod_hash_insert()
+ * mod_hash_insert_reserve()
+ * insert 'val' into the hash table, using 'key' as its key. If 'key' is
+ * already a key in the hash, an error will be returned, and the key-val
+ * pair will not be inserted. i_mod_hash_insert_nosync() supports a simple
+ * handle abstraction, allowing hash entry allocation to be separated from
+ * the hash insertion. this abstraction allows simple use of the mod_hash
+ * structure in situations where mod_hash_insert() with a KM_SLEEP
+ * allocation policy would otherwise be unsafe.
+ */
+int
+i_mod_hash_insert_nosync(mod_hash_t *hash, mod_hash_key_t key,
+ mod_hash_val_t val, mod_hash_hndl_t handle)
+{
+ uint_t hashidx;
+ struct mod_hash_entry *entry;
+
+ ASSERT(hash);
+
+ /*
+ * If we've not been given reserved storage, allocate storage directly,
+ * using the hash's allocation policy.
+ */
+ if (handle == (mod_hash_hndl_t)0) {
+ entry = kmem_cache_alloc(mh_e_cache, hash->mh_sleep);
+ if (entry == NULL) {
+ hash->mh_stat.mhs_nomem++;
+ return (MH_ERR_NOMEM);
+ }
+ } else {
+ entry = (struct mod_hash_entry *)handle;
+ }
+
+ hashidx = i_mod_hash(hash, key);
+ entry->mhe_key = key;
+ entry->mhe_val = val;
+ entry->mhe_next = hash->mh_entries[hashidx];
+
+ hash->mh_entries[hashidx] = entry;
+ hash->mh_stat.mhs_nelems++;
+
+ return (0);
+}
+
+int
+mod_hash_insert(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t val)
+{
+ int res;
+ mod_hash_val_t v;
+
+ rw_enter(&hash->mh_contents, RW_WRITER);
+
+ /*
+ * Disallow duplicate keys in the hash
+ */
+ if (i_mod_hash_find_nosync(hash, key, &v) == 0) {
+ rw_exit(&hash->mh_contents);
+ hash->mh_stat.mhs_coll++;
+ return (MH_ERR_DUPLICATE);
+ }
+
+ res = i_mod_hash_insert_nosync(hash, key, val, (mod_hash_hndl_t)0);
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+int
+mod_hash_insert_reserve(mod_hash_t *hash, mod_hash_key_t key,
+ mod_hash_val_t val, mod_hash_hndl_t handle)
+{
+ int res;
+ mod_hash_val_t v;
+
+ rw_enter(&hash->mh_contents, RW_WRITER);
+
+ /*
+ * Disallow duplicate keys in the hash
+ */
+ if (i_mod_hash_find_nosync(hash, key, &v) == 0) {
+ rw_exit(&hash->mh_contents);
+ hash->mh_stat.mhs_coll++;
+ return (MH_ERR_DUPLICATE);
+ }
+ res = i_mod_hash_insert_nosync(hash, key, val, handle);
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+/*
+ * mod_hash_reserve()
+ * mod_hash_reserve_nosleep()
+ * mod_hash_cancel()
+ * Make or cancel a mod_hash_entry_t reservation. Reservations are used in
+ * mod_hash_insert_reserve() above.
+ */
+int
+mod_hash_reserve(mod_hash_t *hash, mod_hash_hndl_t *handlep)
+{
+ *handlep = kmem_cache_alloc(mh_e_cache, hash->mh_sleep);
+ if (*handlep == NULL) {
+ hash->mh_stat.mhs_nomem++;
+ return (MH_ERR_NOMEM);
+ }
+
+ return (0);
+}
+
+int
+mod_hash_reserve_nosleep(mod_hash_t *hash, mod_hash_hndl_t *handlep)
+{
+ *handlep = kmem_cache_alloc(mh_e_cache, KM_NOSLEEP);
+ if (*handlep == NULL) {
+ hash->mh_stat.mhs_nomem++;
+ return (MH_ERR_NOMEM);
+ }
+
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+mod_hash_cancel(mod_hash_t *hash, mod_hash_hndl_t *handlep)
+{
+ kmem_cache_free(mh_e_cache, *handlep);
+ *handlep = (mod_hash_hndl_t)0;
+}
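+
+/*
+ * Reservation usage sketch (illustrative): pre-allocate an entry while it is
+ * still safe to allocate, then insert later from a context where allocation
+ * would be unsafe, cancelling the reservation if the insert is not performed
+ * or is rejected as a duplicate:
+ *
+ *     mod_hash_hndl_t hndl;
+ *
+ *     if (mod_hash_reserve_nosleep(h, &hndl) != 0)
+ *         ... no memory, bail out ...
+ *     ...
+ *     if (mod_hash_insert_reserve(h, key, val, hndl) != 0)
+ *         mod_hash_cancel(h, &hndl);
+ */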
+
+/*
+ * i_mod_hash_remove_nosync()
+ * mod_hash_remove()
+ * Remove an element from the hash table.
+ */
+int
+i_mod_hash_remove_nosync(mod_hash_t *hash, mod_hash_key_t key,
+ mod_hash_val_t *val)
+{
+ int hashidx;
+ struct mod_hash_entry *e, *ep;
+
+ hashidx = i_mod_hash(hash, key);
+ ep = NULL; /* e's parent */
+
+ for (e = hash->mh_entries[hashidx]; e != NULL; e = e->mhe_next) {
+ if (MH_KEYCMP(hash, e->mhe_key, key) == 0)
+ break;
+ ep = e;
+ }
+
+ if (e == NULL) { /* not found */
+ return (MH_ERR_NOTFOUND);
+ }
+
+ if (ep == NULL) /* special case 1st element in bucket */
+ hash->mh_entries[hashidx] = e->mhe_next;
+ else
+ ep->mhe_next = e->mhe_next;
+
+ /*
+ * Clean up resources used by the node's key.
+ */
+ MH_KEY_DESTROY(hash, e->mhe_key);
+
+ *val = e->mhe_val;
+ kmem_cache_free(mh_e_cache, e);
+ hash->mh_stat.mhs_nelems--;
+
+ return (0);
+}
+
+int
+mod_hash_remove(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val)
+{
+ int res;
+
+ rw_enter(&hash->mh_contents, RW_WRITER);
+ res = i_mod_hash_remove_nosync(hash, key, val);
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+/*
+ * mod_hash_replace()
+ * atomically remove an existing key-value pair from a hash, and replace
+ * the key and value with the ones supplied. The removed key and value
+ * (if any) are destroyed.
+ */
+int
+mod_hash_replace(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t val)
+{
+ int res;
+ mod_hash_val_t v;
+
+ rw_enter(&hash->mh_contents, RW_WRITER);
+
+ if (i_mod_hash_remove_nosync(hash, key, &v) == 0) {
+ /*
+ * mod_hash_remove() takes care of freeing up the key resources.
+ */
+ MH_VAL_DESTROY(hash, v);
+ }
+ res = i_mod_hash_insert_nosync(hash, key, val, (mod_hash_hndl_t)0);
+
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+/*
+ * mod_hash_destroy()
+ * Remove an element from the hash table matching 'key', and destroy it.
+ */
+int
+mod_hash_destroy(mod_hash_t *hash, mod_hash_key_t key)
+{
+ mod_hash_val_t val;
+ int rv;
+
+ rw_enter(&hash->mh_contents, RW_WRITER);
+
+ if ((rv = i_mod_hash_remove_nosync(hash, key, &val)) == 0) {
+ /*
+ * mod_hash_remove() takes care of freeing up the key resources.
+ */
+ MH_VAL_DESTROY(hash, val);
+ }
+
+ rw_exit(&hash->mh_contents);
+ return (rv);
+}
+
+/*
+ * i_mod_hash_find_nosync()
+ * mod_hash_find()
+ * Find a value in the hash table corresponding to the given key.
+ */
+int
+i_mod_hash_find_nosync(mod_hash_t *hash, mod_hash_key_t key,
+ mod_hash_val_t *val)
+{
+ uint_t hashidx;
+ struct mod_hash_entry *e;
+
+ hashidx = i_mod_hash(hash, key);
+
+ for (e = hash->mh_entries[hashidx]; e != NULL; e = e->mhe_next) {
+ if (MH_KEYCMP(hash, e->mhe_key, key) == 0) {
+ *val = e->mhe_val;
+ hash->mh_stat.mhs_hit++;
+ return (0);
+ }
+ }
+ hash->mh_stat.mhs_miss++;
+ return (MH_ERR_NOTFOUND);
+}
+
+int
+mod_hash_find(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val)
+{
+ int res;
+
+ rw_enter(&hash->mh_contents, RW_READER);
+ res = i_mod_hash_find_nosync(hash, key, val);
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+int
+mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val,
+ void (*find_cb)(mod_hash_key_t, mod_hash_val_t))
+{
+ int res;
+
+ rw_enter(&hash->mh_contents, RW_READER);
+ res = i_mod_hash_find_nosync(hash, key, val);
+ if (res == 0) {
+ find_cb(key, *val);
+ }
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
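+
+/*
+ * Illustrative find_cb callback (a sketch; "my_obj_t" and its rc_refcnt
+ * member are hypothetical). The hold is taken while the hash lock is still
+ * held, so the object cannot be removed between the lookup and the hold:
+ *
+ *     static void
+ *     my_obj_hold(mod_hash_key_t key, mod_hash_val_t val)
+ *     {
+ *         ((my_obj_t *)val)->rc_refcnt++;
+ *     }
+ *
+ *     if (mod_hash_find_cb(h, key, &val, my_obj_hold) == 0)
+ *         ... a reference is now held on val ...
+ */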
+
+int
+mod_hash_find_cb_rval(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val,
+ int (*find_cb)(mod_hash_key_t, mod_hash_val_t), int *cb_rval)
+{
+ int res;
+
+ rw_enter(&hash->mh_contents, RW_READER);
+ res = i_mod_hash_find_nosync(hash, key, val);
+ if (res == 0) {
+ *cb_rval = find_cb(key, *val);
+ }
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+void
+i_mod_hash_walk_nosync(mod_hash_t *hash,
+ uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg)
+{
+ struct mod_hash_entry *e;
+ uint_t hashidx;
+ int res = MH_WALK_CONTINUE;
+
+ for (hashidx = 0;
+ (hashidx < (hash->mh_nchains - 1)) && (res == MH_WALK_CONTINUE);
+ hashidx++) {
+ e = hash->mh_entries[hashidx];
+ while ((e != NULL) && (res == MH_WALK_CONTINUE)) {
+ res = callback(e->mhe_key, e->mhe_val, arg);
+ e = e->mhe_next;
+ }
+ }
+}
+
+/*
+ * mod_hash_walk()
+ * Walks all the elements in the hashtable and invokes the callback
+ * function with the key/value pair for each element. The hashtable
+ * is locked for readers so the callback function should not attempt
+ * to do any updates to the hashtable. The callback function should
+ * return MH_WALK_CONTINUE to continue walking the hashtable or
+ * MH_WALK_TERMINATE to abort the walk of the hashtable.
+ */
+void
+mod_hash_walk(mod_hash_t *hash,
+ uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg)
+{
+ rw_enter(&hash->mh_contents, RW_READER);
+ i_mod_hash_walk_nosync(hash, callback, arg);
+ rw_exit(&hash->mh_contents);
+}
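+
+/*
+ * Illustrative walker (a sketch; the element counting is made up):
+ *
+ *     static uint_t
+ *     count_cb(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
+ *     {
+ *         (*(size_t *)arg)++;
+ *         return (MH_WALK_CONTINUE);
+ *     }
+ *
+ *     size_t nelems = 0;
+ *     mod_hash_walk(h, count_cb, &nelems);
+ */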
+
+
+/*
+ * i_mod_hash_clear_nosync()
+ * mod_hash_clear()
+ * Clears the given hash table by calling the destructor of every hash
+ * element and freeing up all mod_hash_entry's.
+ */
+void
+i_mod_hash_clear_nosync(mod_hash_t *hash)
+{
+ int i;
+ struct mod_hash_entry *e, *old_e;
+
+ for (i = 0; i < hash->mh_nchains; i++) {
+ e = hash->mh_entries[i];
+ while (e != NULL) {
+ MH_KEY_DESTROY(hash, e->mhe_key);
+ MH_VAL_DESTROY(hash, e->mhe_val);
+ old_e = e;
+ e = e->mhe_next;
+ kmem_cache_free(mh_e_cache, old_e);
+ }
+ hash->mh_entries[i] = NULL;
+ }
+ hash->mh_stat.mhs_nelems = 0;
+}
+
+void
+mod_hash_clear(mod_hash_t *hash)
+{
+ ASSERT(hash);
+ rw_enter(&hash->mh_contents, RW_WRITER);
+ i_mod_hash_clear_nosync(hash);
+ rw_exit(&hash->mh_contents);
+}
diff --git a/sys/contrib/openzfs/module/icp/spi/kcf_spi.c b/sys/contrib/openzfs/module/icp/spi/kcf_spi.c
new file mode 100644
index 000000000000..34b36b81c0ab
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/spi/kcf_spi.c
@@ -0,0 +1,925 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This file is part of the core Kernel Cryptographic Framework.
+ * It implements the SPI functions exported to cryptographic
+ * providers.
+ */
+
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+#include <sys/crypto/spi.h>
+
+/*
+ * minalloc and maxalloc values to be used for taskq_create().
+ */
+int crypto_taskq_threads = CRYPTO_TASKQ_THREADS;
+int crypto_taskq_minalloc = CRYPTO_TASKQ_MIN;
+int crypto_taskq_maxalloc = CRYPTO_TASKQ_MAX;
+
+static void remove_provider(kcf_provider_desc_t *);
+static void process_logical_providers(crypto_provider_info_t *,
+ kcf_provider_desc_t *);
+static int init_prov_mechs(crypto_provider_info_t *, kcf_provider_desc_t *);
+static int kcf_prov_kstat_update(kstat_t *, int);
+static void delete_kstat(kcf_provider_desc_t *);
+
+static kcf_prov_stats_t kcf_stats_ks_data_template = {
+ { "kcf_ops_total", KSTAT_DATA_UINT64 },
+ { "kcf_ops_passed", KSTAT_DATA_UINT64 },
+ { "kcf_ops_failed", KSTAT_DATA_UINT64 },
+ { "kcf_ops_returned_busy", KSTAT_DATA_UINT64 }
+};
+
+#define KCF_SPI_COPY_OPS(src, dst, ops) if ((src)->ops != NULL) \
+ *((dst)->ops) = *((src)->ops);
+
+/*
+ * Copy an ops vector from src to dst. Used during provider registration
+ * to copy the ops vector from the provider info structure to the
+ * provider descriptor maintained by KCF.
+ * Copying the ops vector specified by the provider is needed since the
+ * framework does not require the provider info structure to be
+ * persistent.
+ */
+static void
+copy_ops_vector_v1(crypto_ops_t *src_ops, crypto_ops_t *dst_ops)
+{
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_control_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_digest_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_cipher_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_mac_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_sign_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_verify_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_dual_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_dual_cipher_mac_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_random_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_session_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_object_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_key_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_provider_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_ctx_ops);
+}
+
+static void
+copy_ops_vector_v2(crypto_ops_t *src_ops, crypto_ops_t *dst_ops)
+{
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_mech_ops);
+}
+
+static void
+copy_ops_vector_v3(crypto_ops_t *src_ops, crypto_ops_t *dst_ops)
+{
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_nostore_key_ops);
+}
+
+/*
+ * This routine is used to add cryptographic providers to the KCF framework.
+ * Providers pass a crypto_provider_info structure to crypto_register_provider()
+ * and get back a handle. The crypto_provider_info structure contains a
+ * list of mechanisms supported by the provider and an ops vector containing
+ * provider entry points. Hardware providers call this routine in their attach
+ * routines. Software providers call this routine in their _init() routine.
+ */
+int
+crypto_register_provider(crypto_provider_info_t *info,
+ crypto_kcf_provider_handle_t *handle)
+{
+ char *ks_name;
+
+ kcf_provider_desc_t *prov_desc = NULL;
+ int ret = CRYPTO_ARGUMENTS_BAD;
+
+ if (info->pi_interface_version > CRYPTO_SPI_VERSION_3)
+ return (CRYPTO_VERSION_MISMATCH);
+
+ /*
+ * Check provider type, must be software, hardware, or logical.
+ */
+ if (info->pi_provider_type != CRYPTO_HW_PROVIDER &&
+ info->pi_provider_type != CRYPTO_SW_PROVIDER &&
+ info->pi_provider_type != CRYPTO_LOGICAL_PROVIDER)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Allocate and initialize a new provider descriptor. We also
+ * hold it and release it when done.
+ */
+ prov_desc = kcf_alloc_provider_desc(info);
+ KCF_PROV_REFHOLD(prov_desc);
+
+ prov_desc->pd_prov_type = info->pi_provider_type;
+
+ /* provider-private handle, opaque to KCF */
+ prov_desc->pd_prov_handle = info->pi_provider_handle;
+
+ /* copy provider description string */
+ if (info->pi_provider_description != NULL) {
+ /*
+ * pi_provider_description is a string that can contain
+ * up to CRYPTO_PROVIDER_DESCR_MAX_LEN + 1 characters
+ * INCLUDING the terminating null character. A bcopy()
+ * is necessary here as pd_description should not have
+ * a null character. See comments in kcf_alloc_provider_desc()
+ * for details on pd_description field.
+ */
+ bcopy(info->pi_provider_description, prov_desc->pd_description,
+ MIN(strlen(info->pi_provider_description),
+ (size_t)CRYPTO_PROVIDER_DESCR_MAX_LEN));
+ }
+
+ if (info->pi_provider_type != CRYPTO_LOGICAL_PROVIDER) {
+ if (info->pi_ops_vector == NULL) {
+ goto bail;
+ }
+ copy_ops_vector_v1(info->pi_ops_vector,
+ prov_desc->pd_ops_vector);
+ if (info->pi_interface_version >= CRYPTO_SPI_VERSION_2) {
+ copy_ops_vector_v2(info->pi_ops_vector,
+ prov_desc->pd_ops_vector);
+ prov_desc->pd_flags = info->pi_flags;
+ }
+ if (info->pi_interface_version == CRYPTO_SPI_VERSION_3) {
+ copy_ops_vector_v3(info->pi_ops_vector,
+ prov_desc->pd_ops_vector);
+ }
+ }
+
+ /* object_ops and nostore_key_ops are mutually exclusive */
+ if (prov_desc->pd_ops_vector->co_object_ops &&
+ prov_desc->pd_ops_vector->co_nostore_key_ops) {
+ goto bail;
+ }
+
+ /* process the mechanisms supported by the provider */
+ if ((ret = init_prov_mechs(info, prov_desc)) != CRYPTO_SUCCESS)
+ goto bail;
+
+ /*
+ * Add provider to providers tables, also sets the descriptor
+ * pd_prov_id field.
+ */
+ if ((ret = kcf_prov_tab_add_provider(prov_desc)) != CRYPTO_SUCCESS) {
+ undo_register_provider(prov_desc, B_FALSE);
+ goto bail;
+ }
+
+ /*
+ * We create a taskq only for a hardware provider. The global
+ * software queue is used for software providers. We handle ordering
+ * of multi-part requests in the taskq routine. So, it is safe to
+ * have multiple threads for the taskq. We pass TASKQ_PREPOPULATE flag
+ * to keep some entries cached to improve performance.
+ */
+ if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER)
+ prov_desc->pd_sched_info.ks_taskq = taskq_create("kcf_taskq",
+ crypto_taskq_threads, minclsyspri,
+ crypto_taskq_minalloc, crypto_taskq_maxalloc,
+ TASKQ_PREPOPULATE);
+ else
+ prov_desc->pd_sched_info.ks_taskq = NULL;
+
+ /* no kernel session to logical providers */
+ if (prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) {
+ /*
+ * Open a session for session-oriented providers. This session
+ * is used for all kernel consumers. This is fine as a provider
+ * is required to support multiple thread access to a session.
+ * We can do this only after the taskq has been created as we
+ * do a kcf_submit_request() to open the session.
+ */
+ if (KCF_PROV_SESSION_OPS(prov_desc) != NULL) {
+ kcf_req_params_t params;
+
+ KCF_WRAP_SESSION_OPS_PARAMS(&params,
+ KCF_OP_SESSION_OPEN, &prov_desc->pd_sid, 0,
+ CRYPTO_USER, NULL, 0, prov_desc);
+ ret = kcf_submit_request(prov_desc, NULL, NULL, &params,
+ B_FALSE);
+
+ if (ret != CRYPTO_SUCCESS) {
+ undo_register_provider(prov_desc, B_TRUE);
+ ret = CRYPTO_FAILED;
+ goto bail;
+ }
+ }
+ }
+
+ if (prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) {
+ /*
+ * Create the kstat for this provider. There is a kstat
+ * installed for each successfully registered provider.
+ * This kstat is deleted when the provider unregisters.
+ */
+ if (prov_desc->pd_prov_type == CRYPTO_SW_PROVIDER) {
+ ks_name = kmem_asprintf("%s_%s",
+ "NONAME", "provider_stats");
+ } else {
+ ks_name = kmem_asprintf("%s_%d_%u_%s",
+ "NONAME", 0, prov_desc->pd_prov_id,
+ "provider_stats");
+ }
+
+ prov_desc->pd_kstat = kstat_create("kcf", 0, ks_name, "crypto",
+ KSTAT_TYPE_NAMED, sizeof (kcf_prov_stats_t) /
+ sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+
+ if (prov_desc->pd_kstat != NULL) {
+ bcopy(&kcf_stats_ks_data_template,
+ &prov_desc->pd_ks_data,
+ sizeof (kcf_stats_ks_data_template));
+ prov_desc->pd_kstat->ks_data = &prov_desc->pd_ks_data;
+ KCF_PROV_REFHOLD(prov_desc);
+ KCF_PROV_IREFHOLD(prov_desc);
+ prov_desc->pd_kstat->ks_private = prov_desc;
+ prov_desc->pd_kstat->ks_update = kcf_prov_kstat_update;
+ kstat_install(prov_desc->pd_kstat);
+ }
+ kmem_strfree(ks_name);
+ }
+
+ if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER)
+ process_logical_providers(info, prov_desc);
+
+ mutex_enter(&prov_desc->pd_lock);
+ prov_desc->pd_state = KCF_PROV_READY;
+ mutex_exit(&prov_desc->pd_lock);
+ kcf_do_notify(prov_desc, B_TRUE);
+
+ *handle = prov_desc->pd_kcf_prov_handle;
+ ret = CRYPTO_SUCCESS;
+
+bail:
+ KCF_PROV_REFRELE(prov_desc);
+ return (ret);
+}
+
+/*
+ * This routine is used to notify the framework when a provider is being
+ * removed. Hardware providers call this routine in their detach routines.
+ * Software providers call this routine in their _fini() routine.
+ */
+int
+crypto_unregister_provider(crypto_kcf_provider_handle_t handle)
+{
+ uint_t mech_idx;
+ kcf_provider_desc_t *desc;
+ kcf_prov_state_t saved_state;
+
+ /* lookup provider descriptor */
+ if ((desc = kcf_prov_tab_lookup((crypto_provider_id_t)handle)) == NULL)
+ return (CRYPTO_UNKNOWN_PROVIDER);
+
+ mutex_enter(&desc->pd_lock);
+ /*
+ * Check if any other thread is disabling or removing
+ * this provider. We return if this is the case.
+ */
+ if (desc->pd_state >= KCF_PROV_DISABLED) {
+ mutex_exit(&desc->pd_lock);
+ /* Release reference held by kcf_prov_tab_lookup(). */
+ KCF_PROV_REFRELE(desc);
+ return (CRYPTO_BUSY);
+ }
+
+ saved_state = desc->pd_state;
+ desc->pd_state = KCF_PROV_REMOVED;
+
+ if (saved_state == KCF_PROV_BUSY) {
+ /*
+ * The per-provider taskq threads may be waiting. We
+ * signal them so that they can start failing requests.
+ */
+ cv_broadcast(&desc->pd_resume_cv);
+ }
+
+ if (desc->pd_prov_type == CRYPTO_SW_PROVIDER) {
+ /*
+ * Check if this provider is currently being used.
+ * pd_irefcnt is the number of holds from the internal
+ * structures. We add one to account for the above lookup.
+ */
+ if (desc->pd_refcnt > desc->pd_irefcnt + 1) {
+ desc->pd_state = saved_state;
+ mutex_exit(&desc->pd_lock);
+ /* Release reference held by kcf_prov_tab_lookup(). */
+ KCF_PROV_REFRELE(desc);
+ /*
+ * The administrator presumably will stop the clients
+ * thus removing the holds, when they get the busy
+ * return value. Any retry will succeed then.
+ */
+ return (CRYPTO_BUSY);
+ }
+ }
+ mutex_exit(&desc->pd_lock);
+
+ if (desc->pd_prov_type != CRYPTO_SW_PROVIDER) {
+ remove_provider(desc);
+ }
+
+ if (desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) {
+ /* remove the provider from the mechanisms tables */
+ for (mech_idx = 0; mech_idx < desc->pd_mech_list_count;
+ mech_idx++) {
+ kcf_remove_mech_provider(
+ desc->pd_mechanisms[mech_idx].cm_mech_name, desc);
+ }
+ }
+
+ /* remove provider from providers table */
+ if (kcf_prov_tab_rem_provider((crypto_provider_id_t)handle) !=
+ CRYPTO_SUCCESS) {
+ /* Release reference held by kcf_prov_tab_lookup(). */
+ KCF_PROV_REFRELE(desc);
+ return (CRYPTO_UNKNOWN_PROVIDER);
+ }
+
+ delete_kstat(desc);
+
+ if (desc->pd_prov_type == CRYPTO_SW_PROVIDER) {
+ /* Release reference held by kcf_prov_tab_lookup(). */
+ KCF_PROV_REFRELE(desc);
+
+ /*
+ * Wait until the existing requests complete.
+ */
+ mutex_enter(&desc->pd_lock);
+ while (desc->pd_state != KCF_PROV_FREED)
+ cv_wait(&desc->pd_remove_cv, &desc->pd_lock);
+ mutex_exit(&desc->pd_lock);
+ } else {
+ /*
+ * Wait until requests that have been sent to the provider
+ * complete.
+ */
+ mutex_enter(&desc->pd_lock);
+ while (desc->pd_irefcnt > 0)
+ cv_wait(&desc->pd_remove_cv, &desc->pd_lock);
+ mutex_exit(&desc->pd_lock);
+ }
+
+ kcf_do_notify(desc, B_FALSE);
+
+ if (desc->pd_prov_type == CRYPTO_SW_PROVIDER) {
+ /*
+ * This is the only place where kcf_free_provider_desc()
+ * is called directly. KCF_PROV_REFRELE() should free the
+ * structure in all other places.
+ */
+ ASSERT(desc->pd_state == KCF_PROV_FREED &&
+ desc->pd_refcnt == 0);
+ kcf_free_provider_desc(desc);
+ } else {
+ KCF_PROV_REFRELE(desc);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
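For context, a minimal sketch of the caller side of this routine, i.e. what a software provider's _fini() path might look like. This is not part of the diff; the variable example_prov_handle, the function name, and the errno mapping are illustrative assumptions.

/*
 * Sketch only: software-provider _fini() using the handle saved at
 * registration time (example_prov_handle is a hypothetical name).
 */
static crypto_kcf_provider_handle_t example_prov_handle;

static int
example_provider_fini(void)
{
	int ret;

	ret = crypto_unregister_provider(example_prov_handle);
	if (ret == CRYPTO_BUSY)
		return (EBUSY);		/* clients still hold references */
	if (ret != CRYPTO_SUCCESS)
		return (EINVAL);	/* e.g. CRYPTO_UNKNOWN_PROVIDER */
	return (0);
}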
+
+/*
+ * This routine is used to notify the framework that the state of
+ * a cryptographic provider has changed. Valid state codes are:
+ *
+ * CRYPTO_PROVIDER_READY
+ * The provider indicates that it can process more requests. A provider
+ * sends this event if it has previously notified the framework with
+ * CRYPTO_PROVIDER_BUSY.
+ *
+ * CRYPTO_PROVIDER_BUSY
+ * The provider cannot accept more requests.
+ *
+ * CRYPTO_PROVIDER_FAILED
+ * The provider encountered an internal error. The framework will not
+ * send any more requests to the provider. The provider may send
+ * CRYPTO_PROVIDER_READY if it recovers from the error.
+ *
+ * This routine can be called from user or interrupt context.
+ */
+void
+crypto_provider_notification(crypto_kcf_provider_handle_t handle, uint_t state)
+{
+ kcf_provider_desc_t *pd;
+
+ /* lookup the provider from the given handle */
+ if ((pd = kcf_prov_tab_lookup((crypto_provider_id_t)handle)) == NULL)
+ return;
+
+ mutex_enter(&pd->pd_lock);
+
+ if (pd->pd_state <= KCF_PROV_VERIFICATION_FAILED)
+ goto out;
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ cmn_err(CE_WARN, "crypto_provider_notification: "
+ "logical provider (%x) ignored\n", handle);
+ goto out;
+ }
+ switch (state) {
+ case CRYPTO_PROVIDER_READY:
+ switch (pd->pd_state) {
+ case KCF_PROV_BUSY:
+ pd->pd_state = KCF_PROV_READY;
+ /*
+ * Signal the per-provider taskq threads that they
+ * can start submitting requests.
+ */
+ cv_broadcast(&pd->pd_resume_cv);
+ break;
+
+ case KCF_PROV_FAILED:
+ /*
+ * The provider recovered from the error. Let us
+ * use it now.
+ */
+ pd->pd_state = KCF_PROV_READY;
+ break;
+ default:
+ break;
+ }
+ break;
+
+ case CRYPTO_PROVIDER_BUSY:
+ switch (pd->pd_state) {
+ case KCF_PROV_READY:
+ pd->pd_state = KCF_PROV_BUSY;
+ break;
+ default:
+ break;
+ }
+ break;
+
+ case CRYPTO_PROVIDER_FAILED:
+ /*
+ * We note the failure and return. The per-provider taskq
+ * threads check this flag and start failing the
+ * requests, if it is set. See process_req_hwp() for details.
+ */
+ switch (pd->pd_state) {
+ case KCF_PROV_READY:
+ pd->pd_state = KCF_PROV_FAILED;
+ break;
+
+ case KCF_PROV_BUSY:
+ pd->pd_state = KCF_PROV_FAILED;
+ /*
+ * The per-provider taskq threads may be waiting. We
+ * signal them so that they can start failing requests.
+ */
+ cv_broadcast(&pd->pd_resume_cv);
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+out:
+ mutex_exit(&pd->pd_lock);
+ KCF_PROV_REFRELE(pd);
+}
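A short sketch of how a hardware provider might use this notification pair around a transient queue-full condition. It is not part of this commit; example_prov_handle and the function names are hypothetical.

/*
 * Sketch only: report a transient queue-full condition and its recovery.
 */
static void
example_queue_full(void)
{
	/* Stop the framework from dispatching new requests to us. */
	crypto_provider_notification(example_prov_handle, CRYPTO_PROVIDER_BUSY);
}

static void
example_queue_drained(void)
{
	/* Resume; the per-provider taskq threads are woken via pd_resume_cv. */
	crypto_provider_notification(example_prov_handle, CRYPTO_PROVIDER_READY);
}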
+
+/*
+ * This routine is used to notify the framework of the result of
+ * an asynchronous request handled by a provider. Valid error
+ * codes are the same as the CRYPTO_* errors defined in common.h.
+ *
+ * This routine can be called from user or interrupt context.
+ */
+void
+crypto_op_notification(crypto_req_handle_t handle, int error)
+{
+ kcf_call_type_t ctype;
+
+ if (handle == NULL)
+ return;
+
+ if ((ctype = GET_REQ_TYPE(handle)) == CRYPTO_SYNCH) {
+ kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)handle;
+
+ if (error != CRYPTO_SUCCESS)
+ sreq->sn_provider->pd_sched_info.ks_nfails++;
+ KCF_PROV_IREFRELE(sreq->sn_provider);
+ kcf_sop_done(sreq, error);
+ } else {
+ kcf_areq_node_t *areq = (kcf_areq_node_t *)handle;
+
+ ASSERT(ctype == CRYPTO_ASYNCH);
+ if (error != CRYPTO_SUCCESS)
+ areq->an_provider->pd_sched_info.ks_nfails++;
+ KCF_PROV_IREFRELE(areq->an_provider);
+ kcf_aop_done(areq, error);
+ }
+}
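The call pattern on the provider side is simply to hand the request handle back with a CRYPTO_* status once the work finishes, e.g. from an interrupt or taskq completion path. A sketch with hypothetical names, not taken from this commit:

/*
 * Sketch only: a provider's completion path reporting an asynchronous
 * result back to the framework.
 */
static void
example_complete_request(crypto_req_handle_t req, int crypto_status)
{
	/* crypto_status is one of the CRYPTO_* codes from common.h. */
	crypto_op_notification(req, crypto_status);
}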
+
+/*
+ * This routine is used by software providers to determine
+ * whether to use KM_SLEEP or KM_NOSLEEP during memory allocation.
+ * Note that hardware providers can always use KM_SLEEP. So,
+ * they do not need to call this routine.
+ *
+ * This routine can be called from user or interrupt context.
+ */
+int
+crypto_kmflag(crypto_req_handle_t handle)
+{
+ return (REQHNDL2_KMFLAG(handle));
+}
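As a usage sketch (not part of the diff), a software provider would typically thread the request handle through to its allocations and let crypto_kmflag() choose between KM_SLEEP and KM_NOSLEEP; the helper name below is hypothetical.

/*
 * Sketch only: allocate a per-request context with the sleep behaviour
 * appropriate to the calling context of the request.
 */
static void *
example_alloc_ctx(crypto_req_handle_t req, size_t len)
{
	return (kmem_alloc(len, crypto_kmflag(req)));
}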
+
+/*
+ * Process the mechanism info structures specified by the provider
+ * during registration. A NULL crypto_provider_info_t indicates
+ * an already initialized provider descriptor.
+ *
+ * Mechanisms are not added to the kernel's mechanism table if the
+ * provider is a logical provider.
+ *
+ * Returns CRYPTO_SUCCESS on success, CRYPTO_ARGUMENTS_BAD if one
+ * of the specified mechanisms was malformed, or CRYPTO_HOST_MEMORY
+ * if the table of mechanisms is full.
+ */
+static int
+init_prov_mechs(crypto_provider_info_t *info, kcf_provider_desc_t *desc)
+{
+ uint_t mech_idx;
+ uint_t cleanup_idx;
+ int err = CRYPTO_SUCCESS;
+ kcf_prov_mech_desc_t *pmd;
+ int desc_use_count = 0;
+ int mcount = desc->pd_mech_list_count;
+
+ if (desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ if (info != NULL) {
+ ASSERT(info->pi_mechanisms != NULL);
+ bcopy(info->pi_mechanisms, desc->pd_mechanisms,
+ sizeof (crypto_mech_info_t) * mcount);
+ }
+ return (CRYPTO_SUCCESS);
+ }
+
+ /*
+ * Copy the mechanism list from the provider info to the provider
+ * descriptor. desc->pd_mechanisms has an extra crypto_mech_info_t
+ * element if the provider has random_ops since we keep an internal
+ * mechanism, SUN_RANDOM, in this case.
+ */
+ if (info != NULL) {
+ if (info->pi_ops_vector->co_random_ops != NULL) {
+ crypto_mech_info_t *rand_mi;
+
+ /*
+ * Need the following check as it is possible to have
+ * a provider that implements just random_ops and has
+ * pi_mechanisms == NULL.
+ */
+ if (info->pi_mechanisms != NULL) {
+ bcopy(info->pi_mechanisms, desc->pd_mechanisms,
+ sizeof (crypto_mech_info_t) * (mcount - 1));
+ }
+ rand_mi = &desc->pd_mechanisms[mcount - 1];
+
+ bzero(rand_mi, sizeof (crypto_mech_info_t));
+ (void) strncpy(rand_mi->cm_mech_name, SUN_RANDOM,
+ CRYPTO_MAX_MECH_NAME);
+ rand_mi->cm_func_group_mask = CRYPTO_FG_RANDOM;
+ } else {
+ ASSERT(info->pi_mechanisms != NULL);
+ bcopy(info->pi_mechanisms, desc->pd_mechanisms,
+ sizeof (crypto_mech_info_t) * mcount);
+ }
+ }
+
+ /*
+ * For each mechanism supported by the provider, add the provider
+ * to the corresponding KCF mechanism mech_entry chain.
+ */
+ for (mech_idx = 0; mech_idx < desc->pd_mech_list_count; mech_idx++) {
+ crypto_mech_info_t *mi = &desc->pd_mechanisms[mech_idx];
+
+ if ((mi->cm_mech_flags & CRYPTO_KEYSIZE_UNIT_IN_BITS) &&
+ (mi->cm_mech_flags & CRYPTO_KEYSIZE_UNIT_IN_BYTES)) {
+ err = CRYPTO_ARGUMENTS_BAD;
+ break;
+ }
+
+ if (desc->pd_flags & CRYPTO_HASH_NO_UPDATE &&
+ mi->cm_func_group_mask & CRYPTO_FG_DIGEST) {
+ /*
+ * We ask the provider to specify the limit
+ * per hash mechanism. But, in practice, a
+ * hardware limitation means all hash mechanisms
+ * will have the same maximum size allowed for
+ * input data. So, we make it a per provider
+ * limit to keep it simple.
+ */
+ if (mi->cm_max_input_length == 0) {
+ err = CRYPTO_ARGUMENTS_BAD;
+ break;
+ } else {
+ desc->pd_hash_limit = mi->cm_max_input_length;
+ }
+ }
+
+ if ((err = kcf_add_mech_provider(mech_idx, desc, &pmd)) !=
+ KCF_SUCCESS)
+ break;
+
+ if (pmd == NULL)
+ continue;
+
+ /* The provider will be used for this mechanism */
+ desc_use_count++;
+ }
+
+ /*
+ * Don't allow multiple software providers with disabled mechanisms
+ * to register. Subsequent enabling of mechanisms will result in
+ * an unsupported configuration, i.e. multiple software providers
+ * per mechanism.
+ */
+ if (desc_use_count == 0 && desc->pd_prov_type == CRYPTO_SW_PROVIDER)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (err == KCF_SUCCESS)
+ return (CRYPTO_SUCCESS);
+
+ /*
+ * An error occurred while adding a mechanism; clean up
+ * and bail.
+ */
+ for (cleanup_idx = 0; cleanup_idx < mech_idx; cleanup_idx++) {
+ kcf_remove_mech_provider(
+ desc->pd_mechanisms[cleanup_idx].cm_mech_name, desc);
+ }
+
+ if (err == KCF_MECH_TAB_FULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ return (CRYPTO_ARGUMENTS_BAD);
+}
+
+/*
+ * Update routine for the provider kstat. Only privileged users are
+ * allowed to access this information, since it is sensitive: some
+ * cryptographic attacks (e.g. traffic analysis) can make use of it.
+ */
+static int
+kcf_prov_kstat_update(kstat_t *ksp, int rw)
+{
+ kcf_prov_stats_t *ks_data;
+ kcf_provider_desc_t *pd = (kcf_provider_desc_t *)ksp->ks_private;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ ks_data = ksp->ks_data;
+
+ ks_data->ps_ops_total.value.ui64 = pd->pd_sched_info.ks_ndispatches;
+ ks_data->ps_ops_failed.value.ui64 = pd->pd_sched_info.ks_nfails;
+ ks_data->ps_ops_busy_rval.value.ui64 = pd->pd_sched_info.ks_nbusy_rval;
+ ks_data->ps_ops_passed.value.ui64 =
+ pd->pd_sched_info.ks_ndispatches -
+ pd->pd_sched_info.ks_nfails -
+ pd->pd_sched_info.ks_nbusy_rval;
+
+ return (0);
+}
+
+
+/*
+ * Utility routine called from failure paths in crypto_register_provider()
+ * and from crypto_load_soft_disabled().
+ */
+void
+undo_register_provider(kcf_provider_desc_t *desc, boolean_t remove_prov)
+{
+ uint_t mech_idx;
+
+ /* remove the provider from the mechanisms tables */
+ for (mech_idx = 0; mech_idx < desc->pd_mech_list_count;
+ mech_idx++) {
+ kcf_remove_mech_provider(
+ desc->pd_mechanisms[mech_idx].cm_mech_name, desc);
+ }
+
+ /* remove provider from providers table */
+ if (remove_prov)
+ (void) kcf_prov_tab_rem_provider(desc->pd_prov_id);
+}
+
+/*
+ * Utility routine called from crypto_load_soft_disabled(). Callers
+ * should have done a prior undo_register_provider().
+ */
+void
+redo_register_provider(kcf_provider_desc_t *pd)
+{
+ /* process the mechanisms supported by the provider */
+ (void) init_prov_mechs(NULL, pd);
+
+ /*
+ * Hold provider in providers table. We should not call
+ * kcf_prov_tab_add_provider() here as the provider descriptor
+ * is still valid which means it has an entry in the provider
+ * table.
+ */
+ KCF_PROV_REFHOLD(pd);
+ KCF_PROV_IREFHOLD(pd);
+}
+
+/*
+ * Add provider (p1) to another provider's array of providers (p2).
+ * Hardware and logical providers use this array to cross-reference
+ * each other.
+ */
+static void
+add_provider_to_array(kcf_provider_desc_t *p1, kcf_provider_desc_t *p2)
+{
+ kcf_provider_list_t *new;
+
+ new = kmem_alloc(sizeof (kcf_provider_list_t), KM_SLEEP);
+ mutex_enter(&p2->pd_lock);
+ new->pl_next = p2->pd_provider_list;
+ p2->pd_provider_list = new;
+ KCF_PROV_IREFHOLD(p1);
+ new->pl_provider = p1;
+ mutex_exit(&p2->pd_lock);
+}
+
+/*
+ * Remove provider (p1) from another provider's array of providers (p2).
+ * Hardware and logical providers use this array to cross-reference
+ * each other.
+ */
+static void
+remove_provider_from_array(kcf_provider_desc_t *p1, kcf_provider_desc_t *p2)
+{
+ kcf_provider_list_t *pl = NULL, **prev;
+
+ mutex_enter(&p2->pd_lock);
+ for (pl = p2->pd_provider_list, prev = &p2->pd_provider_list;
+ pl != NULL; prev = &pl->pl_next, pl = pl->pl_next) {
+ if (pl->pl_provider == p1) {
+ break;
+ }
+ }
+
+ /* Nothing to detach if p1 was not found on p2's list. */
+ if (pl == NULL) {
+ mutex_exit(&p2->pd_lock);
+ return;
+ }
+
+ /* detach and free kcf_provider_list structure */
+ KCF_PROV_IREFRELE(p1);
+ *prev = pl->pl_next;
+ kmem_free(pl, sizeof (*pl));
+ mutex_exit(&p2->pd_lock);
+}
+
+/*
+ * Convert an array of logical provider handles (crypto_provider_id)
+ * stored in a crypto_provider_info structure into an array of provider
+ * descriptors (kcf_provider_desc_t) attached to a logical provider.
+ */
+static void
+process_logical_providers(crypto_provider_info_t *info, kcf_provider_desc_t *hp)
+{
+ kcf_provider_desc_t *lp;
+ crypto_provider_id_t handle;
+ int count = info->pi_logical_provider_count;
+ int i;
+
+ /* add hardware provider to each logical provider */
+ for (i = 0; i < count; i++) {
+ handle = info->pi_logical_providers[i];
+ lp = kcf_prov_tab_lookup((crypto_provider_id_t)handle);
+ if (lp == NULL) {
+ continue;
+ }
+ add_provider_to_array(hp, lp);
+ hp->pd_flags |= KCF_LPROV_MEMBER;
+
+ /*
+ * A hardware provider has to have the provider descriptor of
+ * every logical provider it belongs to, so it can be removed
+ * from the logical provider if the hardware provider
+ * unregisters from the framework.
+ */
+ add_provider_to_array(lp, hp);
+ KCF_PROV_REFRELE(lp);
+ }
+}
+
+/*
+ * This routine removes a provider from all of the logical or
+ * hardware providers it belongs to, and frees the provider's
+ * array of pointers to providers.
+ */
+static void
+remove_provider(kcf_provider_desc_t *pp)
+{
+ kcf_provider_desc_t *p;
+ kcf_provider_list_t *e, *next;
+
+ mutex_enter(&pp->pd_lock);
+ for (e = pp->pd_provider_list; e != NULL; e = next) {
+ p = e->pl_provider;
+ remove_provider_from_array(pp, p);
+ if (p->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ p->pd_provider_list == NULL)
+ p->pd_flags &= ~KCF_LPROV_MEMBER;
+ KCF_PROV_IREFRELE(p);
+ next = e->pl_next;
+ kmem_free(e, sizeof (*e));
+ }
+ pp->pd_provider_list = NULL;
+ mutex_exit(&pp->pd_lock);
+}
+
+/*
+ * Dispatch events as needed for a provider. is_added flag tells
+ * whether the provider is registering or unregistering.
+ */
+void
+kcf_do_notify(kcf_provider_desc_t *prov_desc, boolean_t is_added)
+{
+ int i;
+ crypto_notify_event_change_t ec;
+
+ ASSERT(prov_desc->pd_state > KCF_PROV_VERIFICATION_FAILED);
+
+ /*
+ * Inform interested clients of the mechanisms becoming
+ * available/unavailable. We skip this for logical providers
+ * as they do not affect mechanisms.
+ */
+ if (prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) {
+ ec.ec_provider_type = prov_desc->pd_prov_type;
+ ec.ec_change = is_added ? CRYPTO_MECH_ADDED :
+ CRYPTO_MECH_REMOVED;
+ for (i = 0; i < prov_desc->pd_mech_list_count; i++) {
+ (void) strlcpy(ec.ec_mech_name,
+ prov_desc->pd_mechanisms[i].cm_mech_name,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_walk_ntfylist(CRYPTO_EVENT_MECHS_CHANGED, &ec);
+ }
+
+ }
+
+ /*
+ * Inform interested clients about the new or departing provider.
+ * In case of a logical provider, we need to notify the event only
+ * for the logical provider and not for the underlying
+ * providers which are known by the KCF_LPROV_MEMBER bit.
+ */
+ if (prov_desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER ||
+ (prov_desc->pd_flags & KCF_LPROV_MEMBER) == 0) {
+ kcf_walk_ntfylist(is_added ? CRYPTO_EVENT_PROVIDER_REGISTERED :
+ CRYPTO_EVENT_PROVIDER_UNREGISTERED, prov_desc);
+ }
+}
+
+static void
+delete_kstat(kcf_provider_desc_t *desc)
+{
+ /* destroy the kstat created for this provider */
+ if (desc->pd_kstat != NULL) {
+ kcf_provider_desc_t *kspd = desc->pd_kstat->ks_private;
+
+ /* release reference held by desc->pd_kstat->ks_private */
+ ASSERT(desc == kspd);
+ kstat_delete(kspd->pd_kstat);
+ desc->pd_kstat = NULL;
+ KCF_PROV_REFRELE(kspd);
+ KCF_PROV_IREFRELE(kspd);
+ }
+}
diff --git a/sys/contrib/openzfs/module/lua/Makefile.in b/sys/contrib/openzfs/module/lua/Makefile.in
new file mode 100644
index 000000000000..0a74c17e64e8
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/Makefile.in
@@ -0,0 +1,39 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+endif
+
+MODULE := zlua
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+ccflags-y := -DLUA_USE_LONGLONG
+
+$(MODULE)-objs += lapi.o
+$(MODULE)-objs += lauxlib.o
+$(MODULE)-objs += lbaselib.o
+$(MODULE)-objs += lcode.o
+$(MODULE)-objs += lcompat.o
+$(MODULE)-objs += lcorolib.o
+$(MODULE)-objs += lctype.o
+$(MODULE)-objs += ldebug.o
+$(MODULE)-objs += ldo.o
+$(MODULE)-objs += lfunc.o
+$(MODULE)-objs += lgc.o
+$(MODULE)-objs += llex.o
+$(MODULE)-objs += lmem.o
+$(MODULE)-objs += lobject.o
+$(MODULE)-objs += lopcodes.o
+$(MODULE)-objs += lparser.o
+$(MODULE)-objs += lstate.o
+$(MODULE)-objs += lstring.o
+$(MODULE)-objs += lstrlib.o
+$(MODULE)-objs += ltable.o
+$(MODULE)-objs += ltablib.o
+$(MODULE)-objs += ltm.o
+$(MODULE)-objs += lvm.o
+$(MODULE)-objs += lzio.o
+$(MODULE)-objs += setjmp/setjmp.o
+
+all:
+ mkdir -p setjmp
diff --git a/sys/contrib/openzfs/module/lua/README.zfs b/sys/contrib/openzfs/module/lua/README.zfs
new file mode 100644
index 000000000000..0e22de7a4a18
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/README.zfs
@@ -0,0 +1,80 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+Introduction
+------------
+
+This README describes the Lua interpreter source code that lives in the ZFS
+source tree to enable execution of ZFS channel programs, including its
+maintenance policy, the modifications that have been made to it, and how it
+should (and should not) be used.
+
+For a description of the Lua language and features exposed by ZFS channel
+programs, please refer to the zfs-program(1m) man page instead.
+
+
+Maintenance policy
+------------------
+
+The Lua runtime is considered stable software. Channel programs don't need much
+complicated logic, so updates to the Lua runtime from upstream are viewed as
+nice-to-have, but not required for channel programs to be well-supported. As
+such, the Lua runtime in ZFS should be updated on an as-needed basis for
+security vulnerabilities, but not much else.
+
+
+Modifications to Lua
+--------------------
+
+The version of the Lua runtime we're using in ZFS has been modified in a variety
+of ways to make it more useful for the specific purpose of running channel
+programs. These changes include:
+
+1. "Normal" Lua uses floating point for all numbers it stores, but those aren't
+ useful inside ZFS / the kernel. We have changed the runtime to use int64_t
+ throughout for all numbers.
+2. Some of the Lua standard libraries do file I/O or spawn processes, but
+ neither of these make sense from inside channel programs. We have removed
+ those libraries rather than reimplementing them using kernel APIs.
+3. The "normal" Lua runtime handles errors by failing fatally, but since this
+ version of Lua runs inside the kernel we must handle these failures and
+ return meaningful error codes to userland. We have customized the Lua
+ failure paths so that they aren't fatal.
+4. Running poorly-vetted code inside the kernel is always a risk; even if the
+ ability to do so is restricted to the root user, it's still possible to write
+ an incorrect program that results in an infinite loop or massive memory use.
+ We've added new protections into the Lua interpreter to limit the runtime
+ (measured in number of Lua instructions run) and memory overhead of running
+ a channel program.
+5. The Lua bytecode is not designed to be secure / safe, so it would be easy to
+ pass invalid bytecode which can panic the kernel. By comparison, the parser
+ is hardened and fails gracefully on invalid input. Therefore, we only accept
+ Lua source code at the ioctl level and then interpret it inside the kernel.
+
+Each of these modifications has been tested in the zfs-test suite. If / when
+new modifications are made, new tests should be added to the suite located in
+zfs-tests/tests/functional/channel_program/lua_core.
+
+
+How to use this Lua interpreter
+-------------------------------
+
+From the above, it should be clear that this is not a general-purpose Lua
+interpreter. Additional work would be required to extricate this custom version
+of Lua from ZFS and make it usable by other areas of the kernel.
diff --git a/sys/contrib/openzfs/module/lua/lapi.c b/sys/contrib/openzfs/module/lua/lapi.c
new file mode 100644
index 000000000000..6a845c461052
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lapi.c
@@ -0,0 +1,1345 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lapi.c,v 2.171.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua API
+** See Copyright Notice in lua.h
+*/
+
+
+#define lapi_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+
+
+const char lua_ident[] =
+ "$LuaVersion: " LUA_COPYRIGHT " $"
+ "$LuaAuthors: " LUA_AUTHORS " $";
+
+
+/* value at a non-valid index */
+#define NONVALIDVALUE cast(TValue *, luaO_nilobject)
+
+/* corresponding test */
+#define isvalid(o) ((o) != luaO_nilobject)
+
+/* test for pseudo index */
+#define ispseudo(i) ((i) <= LUA_REGISTRYINDEX)
+
+/* test for valid but not pseudo index */
+#define isstackindex(i, o) (isvalid(o) && !ispseudo(i))
+
+#define api_checkvalidindex(L, o) api_check(L, isvalid(o), "invalid index")
+
+#define api_checkstackindex(L, i, o) \
+ api_check(L, isstackindex(i, o), "index not in the stack")
+
+
+static TValue *index2addr (lua_State *L, int idx) {
+ CallInfo *ci = L->ci;
+ if (idx > 0) {
+ TValue *o = ci->func + idx;
+ api_check(L, idx <= ci->top - (ci->func + 1), "unacceptable index");
+ if (o >= L->top) return NONVALIDVALUE;
+ else return o;
+ }
+ else if (!ispseudo(idx)) { /* negative index */
+ api_check(L, idx != 0 && -idx <= L->top - (ci->func + 1), "invalid index");
+ return L->top + idx;
+ }
+ else if (idx == LUA_REGISTRYINDEX)
+ return &G(L)->l_registry;
+ else { /* upvalues */
+ idx = LUA_REGISTRYINDEX - idx;
+ api_check(L, idx <= MAXUPVAL + 1, "upvalue index too large");
+ if (ttislcf(ci->func)) /* light C function? */
+ return NONVALIDVALUE; /* it has no upvalues */
+ else {
+ CClosure *func = clCvalue(ci->func);
+ return (idx <= func->nupvalues) ? &func->upvalue[idx-1] : NONVALIDVALUE;
+ }
+ }
+}
+
+
+/*
+** to be called by 'lua_checkstack' in protected mode, to grow stack
+** capturing memory errors
+*/
+static void growstack (lua_State *L, void *ud) {
+ int size = *(int *)ud;
+ luaD_growstack(L, size);
+}
+
+
+LUA_API int lua_checkstack (lua_State *L, int size) {
+ int res;
+ CallInfo *ci = L->ci;
+ lua_lock(L);
+ if (L->stack_last - L->top > size) /* stack large enough? */
+ res = 1; /* yes; check is OK */
+ else { /* no; need to grow stack */
+ int inuse = cast_int(L->top - L->stack) + EXTRA_STACK;
+ if (inuse > LUAI_MAXSTACK - size) /* can grow without overflow? */
+ res = 0; /* no */
+ else /* try to grow stack */
+ res = (luaD_rawrunprotected(L, &growstack, &size) == LUA_OK);
+ }
+ if (res && ci->top < L->top + size)
+ ci->top = L->top + size; /* adjust frame top */
+ lua_unlock(L);
+ return res;
+}
+
+
+LUA_API void lua_xmove (lua_State *from, lua_State *to, int n) {
+ int i;
+ if (from == to) return;
+ lua_lock(to);
+ api_checknelems(from, n);
+ api_check(from, G(from) == G(to), "moving among independent states");
+ api_check(from, to->ci->top - to->top >= n, "not enough elements to move");
+ from->top -= n;
+ for (i = 0; i < n; i++) {
+ setobj2s(to, to->top++, from->top + i);
+ }
+ lua_unlock(to);
+}
+
+
+LUA_API lua_CFunction lua_atpanic (lua_State *L, lua_CFunction panicf) {
+ lua_CFunction old;
+ lua_lock(L);
+ old = G(L)->panic;
+ G(L)->panic = panicf;
+ lua_unlock(L);
+ return old;
+}
+
+
+LUA_API const lua_Number *lua_version (lua_State *L) {
+ static const lua_Number version = LUA_VERSION_NUM;
+ if (L == NULL) return &version;
+ else return G(L)->version;
+}
+
+
+
+/*
+** basic stack manipulation
+*/
+
+
+/*
+** convert an acceptable stack index into an absolute index
+*/
+LUA_API int lua_absindex (lua_State *L, int idx) {
+ return (idx > 0 || ispseudo(idx))
+ ? idx
+ : cast_int(L->top - L->ci->func + idx);
+}
+
+
+LUA_API int lua_gettop (lua_State *L) {
+ return cast_int(L->top - (L->ci->func + 1));
+}
+
+
+LUA_API void lua_settop (lua_State *L, int idx) {
+ StkId func = L->ci->func;
+ lua_lock(L);
+ if (idx >= 0) {
+ api_check(L, idx <= L->stack_last - (func + 1), "new top too large");
+ while (L->top < (func + 1) + idx)
+ setnilvalue(L->top++);
+ L->top = (func + 1) + idx;
+ }
+ else {
+ api_check(L, -(idx+1) <= (L->top - (func + 1)), "invalid new top");
+ L->top += idx+1; /* `subtract' index (index is negative) */
+ }
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_remove (lua_State *L, int idx) {
+ StkId p;
+ lua_lock(L);
+ p = index2addr(L, idx);
+ api_checkstackindex(L, idx, p);
+ while (++p < L->top) setobjs2s(L, p-1, p);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_insert (lua_State *L, int idx) {
+ StkId p;
+ StkId q;
+ lua_lock(L);
+ p = index2addr(L, idx);
+ api_checkstackindex(L, idx, p);
+ for (q = L->top; q > p; q--) /* use L->top as a temporary */
+ setobjs2s(L, q, q - 1);
+ setobjs2s(L, p, L->top);
+ lua_unlock(L);
+}
+
+
+static void moveto (lua_State *L, TValue *fr, int idx) {
+ TValue *to = index2addr(L, idx);
+ api_checkvalidindex(L, to);
+ setobj(L, to, fr);
+ if (idx < LUA_REGISTRYINDEX) /* function upvalue? */
+ luaC_barrier(L, clCvalue(L->ci->func), fr);
+ /* LUA_REGISTRYINDEX does not need gc barrier
+ (collector revisits it before finishing collection) */
+}
+
+
+LUA_API void lua_replace (lua_State *L, int idx) {
+ lua_lock(L);
+ api_checknelems(L, 1);
+ moveto(L, L->top - 1, idx);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_copy (lua_State *L, int fromidx, int toidx) {
+ TValue *fr;
+ lua_lock(L);
+ fr = index2addr(L, fromidx);
+ moveto(L, fr, toidx);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushvalue (lua_State *L, int idx) {
+ lua_lock(L);
+ setobj2s(L, L->top, index2addr(L, idx));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+
+/*
+** access functions (stack -> C)
+*/
+
+
+LUA_API int lua_type (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ return (isvalid(o) ? ttypenv(o) : LUA_TNONE);
+}
+
+
+LUA_API const char *lua_typename (lua_State *L, int t) {
+ UNUSED(L);
+ return ttypename(t);
+}
+
+
+LUA_API int lua_iscfunction (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ return (ttislcf(o) || (ttisCclosure(o)));
+}
+
+
+LUA_API int lua_isnumber (lua_State *L, int idx) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ return tonumber(o, &n);
+}
+
+
+LUA_API int lua_isstring (lua_State *L, int idx) {
+ int t = lua_type(L, idx);
+ return (t == LUA_TSTRING || t == LUA_TNUMBER);
+}
+
+
+LUA_API int lua_isuserdata (lua_State *L, int idx) {
+ const TValue *o = index2addr(L, idx);
+ return (ttisuserdata(o) || ttislightuserdata(o));
+}
+
+
+LUA_API int lua_rawequal (lua_State *L, int index1, int index2) {
+ StkId o1 = index2addr(L, index1);
+ StkId o2 = index2addr(L, index2);
+ return (isvalid(o1) && isvalid(o2)) ? luaV_rawequalobj(o1, o2) : 0;
+}
+
+
+LUA_API void lua_arith (lua_State *L, int op) {
+ StkId o1; /* 1st operand */
+ StkId o2; /* 2nd operand */
+ lua_lock(L);
+ if (op != LUA_OPUNM) /* all other operations expect two operands */
+ api_checknelems(L, 2);
+ else { /* for unary minus, add fake 2nd operand */
+ api_checknelems(L, 1);
+ setobjs2s(L, L->top, L->top - 1);
+ L->top++;
+ }
+ o1 = L->top - 2;
+ o2 = L->top - 1;
+ if (ttisnumber(o1) && ttisnumber(o2)) {
+ setnvalue(o1, luaO_arith(op, nvalue(o1), nvalue(o2)));
+ }
+ else
+ luaV_arith(L, o1, o1, o2, cast(TMS, op - LUA_OPADD + TM_ADD));
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_compare (lua_State *L, int index1, int index2, int op) {
+ StkId o1, o2;
+ int i = 0;
+ lua_lock(L); /* may call tag method */
+ o1 = index2addr(L, index1);
+ o2 = index2addr(L, index2);
+ if (isvalid(o1) && isvalid(o2)) {
+ switch (op) {
+ case LUA_OPEQ: i = equalobj(L, o1, o2); break;
+ case LUA_OPLT: i = luaV_lessthan(L, o1, o2); break;
+ case LUA_OPLE: i = luaV_lessequal(L, o1, o2); break;
+ default: api_check(L, 0, "invalid option");
+ }
+ }
+ lua_unlock(L);
+ return i;
+}
+
+
+LUA_API lua_Number lua_tonumberx (lua_State *L, int idx, int *isnum) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ if (tonumber(o, &n)) {
+ if (isnum) *isnum = 1;
+ return nvalue(o);
+ }
+ else {
+ if (isnum) *isnum = 0;
+ return 0;
+ }
+}
+
+
+LUA_API lua_Integer lua_tointegerx (lua_State *L, int idx, int *isnum) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ if (tonumber(o, &n)) {
+ lua_Integer res;
+ lua_Number num = nvalue(o);
+ lua_number2integer(res, num);
+ if (isnum) *isnum = 1;
+ return res;
+ }
+ else {
+ if (isnum) *isnum = 0;
+ return 0;
+ }
+}
+
+
+LUA_API lua_Unsigned lua_tounsignedx (lua_State *L, int idx, int *isnum) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ if (tonumber(o, &n)) {
+ lua_Unsigned res;
+ lua_Number num = nvalue(o);
+ lua_number2unsigned(res, num);
+ if (isnum) *isnum = 1;
+ return res;
+ }
+ else {
+ if (isnum) *isnum = 0;
+ return 0;
+ }
+}
+
+
+LUA_API int lua_toboolean (lua_State *L, int idx) {
+ const TValue *o = index2addr(L, idx);
+ return !l_isfalse(o);
+}
+
+
+LUA_API const char *lua_tolstring (lua_State *L, int idx, size_t *len) {
+ StkId o = index2addr(L, idx);
+ if (!ttisstring(o)) {
+ lua_lock(L); /* `luaV_tostring' may create a new string */
+ if (!luaV_tostring(L, o)) { /* conversion failed? */
+ if (len != NULL) *len = 0;
+ lua_unlock(L);
+ return NULL;
+ }
+ luaC_checkGC(L);
+ o = index2addr(L, idx); /* previous call may reallocate the stack */
+ lua_unlock(L);
+ }
+ if (len != NULL) *len = tsvalue(o)->len;
+ return svalue(o);
+}
+
+
+LUA_API size_t lua_rawlen (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ switch (ttypenv(o)) {
+ case LUA_TSTRING: return tsvalue(o)->len;
+ case LUA_TUSERDATA: return uvalue(o)->len;
+ case LUA_TTABLE: return luaH_getn(hvalue(o));
+ default: return 0;
+ }
+}
+
+
+LUA_API lua_CFunction lua_tocfunction (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ if (ttislcf(o)) return fvalue(o);
+ else if (ttisCclosure(o))
+ return clCvalue(o)->f;
+ else return NULL; /* not a C function */
+}
+
+
+LUA_API void *lua_touserdata (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ switch (ttypenv(o)) {
+ case LUA_TUSERDATA: return ((void *)(rawuvalue(o) + 1));
+ case LUA_TLIGHTUSERDATA: return pvalue(o);
+ default: return NULL;
+ }
+}
+
+
+LUA_API lua_State *lua_tothread (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ return (!ttisthread(o)) ? NULL : thvalue(o);
+}
+
+
+LUA_API const void *lua_topointer (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ switch (ttype(o)) {
+ case LUA_TTABLE: return hvalue(o);
+ case LUA_TLCL: return clLvalue(o);
+ case LUA_TCCL: return clCvalue(o);
+ case LUA_TLCF: return cast(void *, cast(size_t, fvalue(o)));
+ case LUA_TTHREAD: return thvalue(o);
+ case LUA_TUSERDATA:
+ case LUA_TLIGHTUSERDATA:
+ return lua_touserdata(L, idx);
+ default: return NULL;
+ }
+}
+
+
+
+/*
+** push functions (C -> stack)
+*/
+
+
+LUA_API void lua_pushnil (lua_State *L) {
+ lua_lock(L);
+ setnilvalue(L->top);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushnumber (lua_State *L, lua_Number n) {
+ lua_lock(L);
+ setnvalue(L->top, n);
+ luai_checknum(L, L->top,
+ luaG_runerror(L, "C API - attempt to push a signaling NaN"));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushinteger (lua_State *L, lua_Integer n) {
+ lua_lock(L);
+ setnvalue(L->top, cast_num(n));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushunsigned (lua_State *L, lua_Unsigned u) {
+ lua_Number n;
+ lua_lock(L);
+ n = lua_unsigned2number(u);
+ setnvalue(L->top, n);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API const char *lua_pushlstring (lua_State *L, const char *s, size_t len) {
+ TString *ts;
+ lua_lock(L);
+ luaC_checkGC(L);
+ ts = luaS_newlstr(L, s, len);
+ setsvalue2s(L, L->top, ts);
+ api_incr_top(L);
+ lua_unlock(L);
+ return getstr(ts);
+}
+
+
+LUA_API const char *lua_pushstring (lua_State *L, const char *s) {
+ if (s == NULL) {
+ lua_pushnil(L);
+ return NULL;
+ }
+ else {
+ TString *ts;
+ lua_lock(L);
+ luaC_checkGC(L);
+ ts = luaS_new(L, s);
+ setsvalue2s(L, L->top, ts);
+ api_incr_top(L);
+ lua_unlock(L);
+ return getstr(ts);
+ }
+}
+
+
+LUA_API const char *lua_pushvfstring (lua_State *L, const char *fmt,
+ va_list argp) {
+ const char *ret;
+ lua_lock(L);
+ luaC_checkGC(L);
+ ret = luaO_pushvfstring(L, fmt, argp);
+ lua_unlock(L);
+ return ret;
+}
+
+
+LUA_API const char *lua_pushfstring (lua_State *L, const char *fmt, ...) {
+ const char *ret;
+ va_list argp;
+ lua_lock(L);
+ luaC_checkGC(L);
+ va_start(argp, fmt);
+ ret = luaO_pushvfstring(L, fmt, argp);
+ va_end(argp);
+ lua_unlock(L);
+ return ret;
+}
+
+
+LUA_API void lua_pushcclosure (lua_State *L, lua_CFunction fn, int n) {
+ lua_lock(L);
+ if (n == 0) {
+ setfvalue(L->top, fn);
+ }
+ else {
+ Closure *cl;
+ api_checknelems(L, n);
+ api_check(L, n <= MAXUPVAL, "upvalue index too large");
+ luaC_checkGC(L);
+ cl = luaF_newCclosure(L, n);
+ cl->c.f = fn;
+ L->top -= n;
+ while (n--)
+ setobj2n(L, &cl->c.upvalue[n], L->top + n);
+ setclCvalue(L, L->top, cl);
+ }
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushboolean (lua_State *L, int b) {
+ lua_lock(L);
+ setbvalue(L->top, (b != 0)); /* ensure that true is 1 */
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushlightuserdata (lua_State *L, void *p) {
+ lua_lock(L);
+ setpvalue(L->top, p);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_pushthread (lua_State *L) {
+ lua_lock(L);
+ setthvalue(L, L->top, L);
+ api_incr_top(L);
+ lua_unlock(L);
+ return (G(L)->mainthread == L);
+}
+
+
+
+/*
+** get functions (Lua -> stack)
+*/
+
+
+LUA_API void lua_getglobal (lua_State *L, const char *var) {
+ Table *reg = hvalue(&G(L)->l_registry);
+ const TValue *gt; /* global table */
+ lua_lock(L);
+ gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
+ setsvalue2s(L, L->top++, luaS_new(L, var));
+ luaV_gettable(L, gt, L->top - 1, L->top - 1);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_gettable (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ luaV_gettable(L, t, L->top - 1, L->top - 1);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_getfield (lua_State *L, int idx, const char *k) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ setsvalue2s(L, L->top, luaS_new(L, k));
+ api_incr_top(L);
+ luaV_gettable(L, t, L->top - 1, L->top - 1);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawget (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setobj2s(L, L->top - 1, luaH_get(hvalue(t), L->top - 1));
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawgeti (lua_State *L, int idx, int n) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setobj2s(L, L->top, luaH_getint(hvalue(t), n));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawgetp (lua_State *L, int idx, const void *p) {
+ StkId t;
+ TValue k;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setpvalue(&k, cast(void *, p));
+ setobj2s(L, L->top, luaH_get(hvalue(t), &k));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_createtable (lua_State *L, int narray, int nrec) {
+ Table *t;
+ lua_lock(L);
+ luaC_checkGC(L);
+ t = luaH_new(L);
+ sethvalue(L, L->top, t);
+ api_incr_top(L);
+ if (narray > 0 || nrec > 0)
+ luaH_resize(L, t, narray, nrec);
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_getmetatable (lua_State *L, int objindex) {
+ const TValue *obj;
+ Table *mt = NULL;
+ int res;
+ lua_lock(L);
+ obj = index2addr(L, objindex);
+ switch (ttypenv(obj)) {
+ case LUA_TTABLE:
+ mt = hvalue(obj)->metatable;
+ break;
+ case LUA_TUSERDATA:
+ mt = uvalue(obj)->metatable;
+ break;
+ default:
+ mt = G(L)->mt[ttypenv(obj)];
+ break;
+ }
+ if (mt == NULL)
+ res = 0;
+ else {
+ sethvalue(L, L->top, mt);
+ api_incr_top(L);
+ res = 1;
+ }
+ lua_unlock(L);
+ return res;
+}
+
+
+LUA_API void lua_getuservalue (lua_State *L, int idx) {
+ StkId o;
+ lua_lock(L);
+ o = index2addr(L, idx);
+ api_check(L, ttisuserdata(o), "userdata expected");
+ if (uvalue(o)->env) {
+ sethvalue(L, L->top, uvalue(o)->env);
+ } else
+ setnilvalue(L->top);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+/*
+** set functions (stack -> Lua)
+*/
+
+
+LUA_API void lua_setglobal (lua_State *L, const char *var) {
+ Table *reg = hvalue(&G(L)->l_registry);
+ const TValue *gt; /* global table */
+ lua_lock(L);
+ api_checknelems(L, 1);
+ gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
+ setsvalue2s(L, L->top++, luaS_new(L, var));
+ luaV_settable(L, gt, L->top - 1, L->top - 2);
+ L->top -= 2; /* pop value and key */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_settable (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 2);
+ t = index2addr(L, idx);
+ luaV_settable(L, t, L->top - 2, L->top - 1);
+ L->top -= 2; /* pop index and value */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_setfield (lua_State *L, int idx, const char *k) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ t = index2addr(L, idx);
+ setsvalue2s(L, L->top++, luaS_new(L, k));
+ luaV_settable(L, t, L->top - 1, L->top - 2);
+ L->top -= 2; /* pop value and key */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawset (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 2);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setobj2t(L, luaH_set(L, hvalue(t), L->top-2), L->top-1);
+ invalidateTMcache(hvalue(t));
+ luaC_barrierback(L, gcvalue(t), L->top-1);
+ L->top -= 2;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawseti (lua_State *L, int idx, int n) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ luaH_setint(L, hvalue(t), n, L->top - 1);
+ luaC_barrierback(L, gcvalue(t), L->top-1);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawsetp (lua_State *L, int idx, const void *p) {
+ StkId t;
+ TValue k;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setpvalue(&k, cast(void *, p));
+ setobj2t(L, luaH_set(L, hvalue(t), &k), L->top - 1);
+ luaC_barrierback(L, gcvalue(t), L->top - 1);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_setmetatable (lua_State *L, int objindex) {
+ TValue *obj;
+ Table *mt;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ obj = index2addr(L, objindex);
+ if (ttisnil(L->top - 1))
+ mt = NULL;
+ else {
+ api_check(L, ttistable(L->top - 1), "table expected");
+ mt = hvalue(L->top - 1);
+ }
+ switch (ttypenv(obj)) {
+ case LUA_TTABLE: {
+ hvalue(obj)->metatable = mt;
+ if (mt) {
+ luaC_objbarrierback(L, gcvalue(obj), mt);
+ luaC_checkfinalizer(L, gcvalue(obj), mt);
+ }
+ break;
+ }
+ case LUA_TUSERDATA: {
+ uvalue(obj)->metatable = mt;
+ if (mt) {
+ luaC_objbarrier(L, rawuvalue(obj), mt);
+ luaC_checkfinalizer(L, gcvalue(obj), mt);
+ }
+ break;
+ }
+ default: {
+ G(L)->mt[ttypenv(obj)] = mt;
+ break;
+ }
+ }
+ L->top--;
+ lua_unlock(L);
+ return 1;
+}
+
+
+LUA_API void lua_setuservalue (lua_State *L, int idx) {
+ StkId o;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ o = index2addr(L, idx);
+ api_check(L, ttisuserdata(o), "userdata expected");
+ if (ttisnil(L->top - 1))
+ uvalue(o)->env = NULL;
+ else {
+ api_check(L, ttistable(L->top - 1), "table expected");
+ uvalue(o)->env = hvalue(L->top - 1);
+ luaC_objbarrier(L, gcvalue(o), hvalue(L->top - 1));
+ }
+ L->top--;
+ lua_unlock(L);
+}
+
+
+/*
+** `load' and `call' functions (run Lua code)
+*/
+
+
+#define checkresults(L,na,nr) \
+ api_check(L, (nr) == LUA_MULTRET || (L->ci->top - L->top >= (nr) - (na)), \
+ "results from function overflow current stack size")
+
+
+LUA_API int lua_getctx (lua_State *L, int *ctx) {
+ if (L->ci->callstatus & CIST_YIELDED) {
+ if (ctx) *ctx = L->ci->u.c.ctx;
+ return L->ci->u.c.status;
+ }
+ else return LUA_OK;
+}
+
+
+LUA_API void lua_callk (lua_State *L, int nargs, int nresults, int ctx,
+ lua_CFunction k) {
+ StkId func;
+ lua_lock(L);
+ api_check(L, k == NULL || !isLua(L->ci),
+ "cannot use continuations inside hooks");
+ api_checknelems(L, nargs+1);
+ api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread");
+ checkresults(L, nargs, nresults);
+ func = L->top - (nargs+1);
+ if (k != NULL && L->nny == 0) { /* need to prepare continuation? */
+ L->ci->u.c.k = k; /* save continuation */
+ L->ci->u.c.ctx = ctx; /* save context */
+ luaD_call(L, func, nresults, 1); /* do the call */
+ }
+ else /* no continuation or no yieldable */
+ luaD_call(L, func, nresults, 0); /* just do the call */
+ adjustresults(L, nresults);
+ lua_unlock(L);
+}
+
+
+
+/*
+** Execute a protected call.
+*/
+struct CallS { /* data to `f_call' */
+ StkId func;
+ int nresults;
+};
+
+
+static void f_call (lua_State *L, void *ud) {
+ struct CallS *c = cast(struct CallS *, ud);
+ luaD_call(L, c->func, c->nresults, 0);
+}
+
+
+
+LUA_API int lua_pcallk (lua_State *L, int nargs, int nresults, int errfunc,
+ int ctx, lua_CFunction k) {
+ struct CallS c;
+ int status;
+ ptrdiff_t func;
+ lua_lock(L);
+ api_check(L, k == NULL || !isLua(L->ci),
+ "cannot use continuations inside hooks");
+ api_checknelems(L, nargs+1);
+ api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread");
+ checkresults(L, nargs, nresults);
+ if (errfunc == 0)
+ func = 0;
+ else {
+ StkId o = index2addr(L, errfunc);
+ api_checkstackindex(L, errfunc, o);
+ func = savestack(L, o);
+ }
+ c.func = L->top - (nargs+1); /* function to be called */
+ if (k == NULL || L->nny > 0) { /* no continuation or no yieldable? */
+ c.nresults = nresults; /* do a 'conventional' protected call */
+ status = luaD_pcall(L, f_call, &c, savestack(L, c.func), func);
+ }
+ else { /* prepare continuation (call is already protected by 'resume') */
+ CallInfo *ci = L->ci;
+ ci->u.c.k = k; /* save continuation */
+ ci->u.c.ctx = ctx; /* save context */
+ /* save information for error recovery */
+ ci->extra = savestack(L, c.func);
+ ci->u.c.old_allowhook = L->allowhook;
+ ci->u.c.old_errfunc = L->errfunc;
+ L->errfunc = func;
+ /* mark that function may do error recovery */
+ ci->callstatus |= CIST_YPCALL;
+ luaD_call(L, c.func, nresults, 1); /* do the call */
+ ci->callstatus &= ~CIST_YPCALL;
+ L->errfunc = ci->u.c.old_errfunc;
+ status = LUA_OK; /* if it is here, there were no errors */
+ }
+ adjustresults(L, nresults);
+ lua_unlock(L);
+ return status;
+}
+
+
+LUA_API int lua_load (lua_State *L, lua_Reader reader, void *data,
+ const char *chunkname, const char *mode) {
+ ZIO z;
+ int status;
+ lua_lock(L);
+ if (!chunkname) chunkname = "?";
+ luaZ_init(L, &z, reader, data);
+ status = luaD_protectedparser(L, &z, chunkname, mode);
+ if (status == LUA_OK) { /* no errors? */
+ LClosure *f = clLvalue(L->top - 1); /* get newly created function */
+ if (f->nupvalues == 1) { /* does it have one upvalue? */
+ /* get global table from registry */
+ Table *reg = hvalue(&G(L)->l_registry);
+ const TValue *gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
+ /* set global table as 1st upvalue of 'f' (may be LUA_ENV) */
+ setobj(L, f->upvals[0]->v, gt);
+ luaC_barrier(L, f->upvals[0], gt);
+ }
+ }
+ lua_unlock(L);
+ return status;
+}
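Taken together, lua_load() and lua_pcallk() (via the lua_pcall macro from lua.h) are the path a caller uses to compile and run a source chunk, which matches the source-only policy described in README.zfs. A hedged sketch with a hypothetical one-shot reader follows; it is not how the channel-program code is literally structured.

/*
 * Sketch only: load a Lua source chunk from a buffer and run it in a
 * protected call. The reader, its state struct, and the chunk name are
 * illustrative assumptions.
 */
typedef struct example_reader_state {
	const char	*ers_buf;
	size_t		ers_len;
	int		ers_done;
} example_reader_state_t;

static const char *
example_reader(lua_State *L, void *ud, size_t *size)
{
	example_reader_state_t *st = ud;

	(void) L;
	if (st->ers_done) {
		*size = 0;
		return (NULL);		/* no more input */
	}
	st->ers_done = 1;
	*size = st->ers_len;
	return (st->ers_buf);
}

static int
example_run_chunk(lua_State *L, const char *src, size_t len)
{
	example_reader_state_t st = { src, len, 0 };
	int err;

	/* "t" restricts loading to text (source) chunks, never bytecode. */
	err = lua_load(L, example_reader, &st, "example chunk", "t");
	if (err != LUA_OK)
		return (err);		/* syntax or memory error */

	/* Protected call: no arguments, no results, no message handler. */
	return (lua_pcall(L, 0, 0, 0));
}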
+
+#if defined(LUA_USE_DUMP)
+LUA_API int lua_dump (lua_State *L, lua_Writer writer, void *data) {
+ int status;
+ TValue *o;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ o = L->top - 1;
+ if (isLfunction(o))
+ status = luaU_dump(L, getproto(o), writer, data, 0);
+ else
+ status = 1;
+ lua_unlock(L);
+ return status;
+}
+#endif
+
+LUA_API int lua_status (lua_State *L) {
+ return L->status;
+}
+
+
+/*
+** Garbage-collection function
+*/
+
+LUA_API int lua_gc (lua_State *L, int what, int data) {
+ int res = 0;
+ global_State *g;
+ lua_lock(L);
+ g = G(L);
+ switch (what) {
+ case LUA_GCSTOP: {
+ g->gcrunning = 0;
+ break;
+ }
+ case LUA_GCRESTART: {
+ luaE_setdebt(g, 0);
+ g->gcrunning = 1;
+ break;
+ }
+ case LUA_GCCOLLECT: {
+ luaC_fullgc(L, 0);
+ break;
+ }
+ case LUA_GCCOUNT: {
+ /* GC values are expressed in Kbytes: #bytes/2^10 */
+ res = cast_int(gettotalbytes(g) >> 10);
+ break;
+ }
+ case LUA_GCCOUNTB: {
+ res = cast_int(gettotalbytes(g) & 0x3ff);
+ break;
+ }
+ case LUA_GCSTEP: {
+ if (g->gckind == KGC_GEN) { /* generational mode? */
+ res = (g->GCestimate == 0); /* true if it will do major collection */
+ luaC_forcestep(L); /* do a single step */
+ }
+ else {
+ lu_mem debt = cast(lu_mem, data) * 1024 - GCSTEPSIZE;
+ if (g->gcrunning)
+ debt += g->GCdebt; /* include current debt */
+ luaE_setdebt(g, debt);
+ luaC_forcestep(L);
+ if (g->gcstate == GCSpause) /* end of cycle? */
+ res = 1; /* signal it */
+ }
+ break;
+ }
+ case LUA_GCSETPAUSE: {
+ res = g->gcpause;
+ g->gcpause = data;
+ break;
+ }
+ case LUA_GCSETMAJORINC: {
+ res = g->gcmajorinc;
+ g->gcmajorinc = data;
+ break;
+ }
+ case LUA_GCSETSTEPMUL: {
+ res = g->gcstepmul;
+ g->gcstepmul = data;
+ break;
+ }
+ case LUA_GCISRUNNING: {
+ res = g->gcrunning;
+ break;
+ }
+ case LUA_GCGEN: { /* change collector to generational mode */
+ luaC_changemode(L, KGC_GEN);
+ break;
+ }
+ case LUA_GCINC: { /* change collector to incremental mode */
+ luaC_changemode(L, KGC_NORMAL);
+ break;
+ }
+ default: res = -1; /* invalid option */
+ }
+ lua_unlock(L);
+ return res;
+}
+
+
+
+/*
+** miscellaneous functions
+*/
+
+
+LUA_API int lua_error (lua_State *L) {
+ lua_lock(L);
+ api_checknelems(L, 1);
+ luaG_errormsg(L);
+ /* code unreachable; will unlock when control actually leaves the kernel */
+ return 0; /* to avoid warnings */
+}
+
+
+LUA_API int lua_next (lua_State *L, int idx) {
+ StkId t;
+ int more;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ more = luaH_next(L, hvalue(t), L->top - 1);
+ if (more) {
+ api_incr_top(L);
+ }
+ else /* no more elements */
+ L->top -= 1; /* remove key */
+ lua_unlock(L);
+ return more;
+}
+
+
+LUA_API void lua_concat (lua_State *L, int n) {
+ lua_lock(L);
+ api_checknelems(L, n);
+ if (n >= 2) {
+ luaC_checkGC(L);
+ luaV_concat(L, n);
+ }
+ else if (n == 0) { /* push empty string */
+ setsvalue2s(L, L->top, luaS_newlstr(L, "", 0));
+ api_incr_top(L);
+ }
+ /* else n == 1; nothing to do */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_len (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ luaV_objlen(L, L->top, t);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API lua_Alloc lua_getallocf (lua_State *L, void **ud) {
+ lua_Alloc f;
+ lua_lock(L);
+ if (ud) *ud = G(L)->ud;
+ f = G(L)->frealloc;
+ lua_unlock(L);
+ return f;
+}
+
+
+LUA_API void lua_setallocf (lua_State *L, lua_Alloc f, void *ud) {
+ lua_lock(L);
+ G(L)->ud = ud;
+ G(L)->frealloc = f;
+ lua_unlock(L);
+}
+
+
+LUA_API void *lua_newuserdata (lua_State *L, size_t size) {
+ Udata *u;
+ lua_lock(L);
+ luaC_checkGC(L);
+ u = luaS_newudata(L, size, NULL);
+ setuvalue(L, L->top, u);
+ api_incr_top(L);
+ lua_unlock(L);
+ return u + 1;
+}
+
+
+
+static const char *aux_upvalue (StkId fi, int n, TValue **val,
+ GCObject **owner) {
+ switch (ttype(fi)) {
+ case LUA_TCCL: { /* C closure */
+ CClosure *f = clCvalue(fi);
+ if (!(1 <= n && n <= f->nupvalues)) return NULL;
+ *val = &f->upvalue[n-1];
+ if (owner) *owner = obj2gco(f);
+ return "";
+ }
+ case LUA_TLCL: { /* Lua closure */
+ LClosure *f = clLvalue(fi);
+ TString *name;
+ Proto *p = f->p;
+ if (!(1 <= n && n <= p->sizeupvalues)) return NULL;
+ *val = f->upvals[n-1]->v;
+ if (owner) *owner = obj2gco(f->upvals[n - 1]);
+ name = p->upvalues[n-1].name;
+ return (name == NULL) ? "" : getstr(name);
+ }
+ default: return NULL; /* not a closure */
+ }
+}
+
+
+LUA_API const char *lua_getupvalue (lua_State *L, int funcindex, int n) {
+ const char *name;
+ TValue *val = NULL; /* to avoid warnings */
+ lua_lock(L);
+ name = aux_upvalue(index2addr(L, funcindex), n, &val, NULL);
+ if (name) {
+ setobj2s(L, L->top, val);
+ api_incr_top(L);
+ }
+ lua_unlock(L);
+ return name;
+}
+
+
+LUA_API const char *lua_setupvalue (lua_State *L, int funcindex, int n) {
+ const char *name;
+ TValue *val = NULL; /* to avoid warnings */
+ GCObject *owner = NULL; /* to avoid warnings */
+ StkId fi;
+ lua_lock(L);
+ fi = index2addr(L, funcindex);
+ api_checknelems(L, 1);
+ name = aux_upvalue(fi, n, &val, &owner);
+ if (name) {
+ L->top--;
+ setobj(L, val, L->top);
+ luaC_barrier(L, owner, L->top);
+ }
+ lua_unlock(L);
+ return name;
+}
+
+
+static UpVal **getupvalref (lua_State *L, int fidx, int n, LClosure **pf) {
+ LClosure *f;
+ StkId fi = index2addr(L, fidx);
+ api_check(L, ttisLclosure(fi), "Lua function expected");
+ f = clLvalue(fi);
+ api_check(L, (1 <= n && n <= f->p->sizeupvalues), "invalid upvalue index");
+ if (pf) *pf = f;
+ return &f->upvals[n - 1]; /* get its upvalue pointer */
+}
+
+
+LUA_API void *lua_upvalueid (lua_State *L, int fidx, int n) {
+ StkId fi = index2addr(L, fidx);
+ switch (ttype(fi)) {
+ case LUA_TLCL: { /* lua closure */
+ return *getupvalref(L, fidx, n, NULL);
+ }
+ case LUA_TCCL: { /* C closure */
+ CClosure *f = clCvalue(fi);
+ api_check(L, 1 <= n && n <= f->nupvalues, "invalid upvalue index");
+ return &f->upvalue[n - 1];
+ }
+ default: {
+ api_check(L, 0, "closure expected");
+ return NULL;
+ }
+ }
+}
+
+
+LUA_API void lua_upvaluejoin (lua_State *L, int fidx1, int n1,
+ int fidx2, int n2) {
+ LClosure *f1;
+ UpVal **up1 = getupvalref(L, fidx1, n1, &f1);
+ UpVal **up2 = getupvalref(L, fidx2, n2, NULL);
+ *up1 = *up2;
+ luaC_objbarrier(L, f1, *up2);
+}
+
+#if defined(_KERNEL)
+
+static int __init
+lua_init(void)
+{
+ return (0);
+}
+
+static void __exit
+lua_fini(void)
+{
+}
+
+module_init(lua_init);
+module_exit(lua_fini);
+
+#endif
+/* END CSTYLED */
+
+ZFS_MODULE_DESCRIPTION("Lua Interpreter for ZFS");
+ZFS_MODULE_AUTHOR("Lua.org");
+ZFS_MODULE_LICENSE("Dual MIT/GPL");
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+
+EXPORT_SYMBOL(lua_absindex);
+EXPORT_SYMBOL(lua_atpanic);
+EXPORT_SYMBOL(lua_checkstack);
+EXPORT_SYMBOL(lua_close);
+EXPORT_SYMBOL(lua_createtable);
+EXPORT_SYMBOL(lua_error);
+EXPORT_SYMBOL(lua_getfield);
+EXPORT_SYMBOL(lua_gettable);
+EXPORT_SYMBOL(lua_gettop);
+EXPORT_SYMBOL(lua_isnumber);
+EXPORT_SYMBOL(lua_isstring);
+EXPORT_SYMBOL(lua_newstate);
+EXPORT_SYMBOL(lua_newuserdata);
+EXPORT_SYMBOL(lua_next);
+EXPORT_SYMBOL(lua_pcallk);
+EXPORT_SYMBOL(lua_pushboolean);
+EXPORT_SYMBOL(lua_pushcclosure);
+EXPORT_SYMBOL(lua_pushfstring);
+EXPORT_SYMBOL(lua_pushinteger);
+EXPORT_SYMBOL(lua_pushlightuserdata);
+EXPORT_SYMBOL(lua_pushnil);
+EXPORT_SYMBOL(lua_pushnumber);
+EXPORT_SYMBOL(lua_pushstring);
+EXPORT_SYMBOL(lua_pushvalue);
+EXPORT_SYMBOL(lua_pushvfstring);
+EXPORT_SYMBOL(lua_remove);
+EXPORT_SYMBOL(lua_replace);
+EXPORT_SYMBOL(lua_setfield);
+EXPORT_SYMBOL(lua_setglobal);
+EXPORT_SYMBOL(lua_sethook);
+EXPORT_SYMBOL(lua_setmetatable);
+EXPORT_SYMBOL(lua_settable);
+EXPORT_SYMBOL(lua_settop);
+EXPORT_SYMBOL(lua_toboolean);
+EXPORT_SYMBOL(lua_tointegerx);
+EXPORT_SYMBOL(lua_tolstring);
+EXPORT_SYMBOL(lua_tonumberx);
+EXPORT_SYMBOL(lua_touserdata);
+EXPORT_SYMBOL(lua_type);
+EXPORT_SYMBOL(lua_typename);
diff --git a/sys/contrib/openzfs/module/lua/lapi.h b/sys/contrib/openzfs/module/lua/lapi.h
new file mode 100644
index 000000000000..509f46f692a7
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lapi.h
@@ -0,0 +1,26 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lapi.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions from Lua API
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lapi_h
+#define lapi_h
+
+
+#include "llimits.h"
+#include "lstate.h"
+
+#define api_incr_top(L) {L->top++; api_check(L, L->top <= L->ci->top, \
+ "stack overflow");}
+
+#define adjustresults(L,nres) \
+ { if ((nres) == LUA_MULTRET && L->ci->top < L->top) L->ci->top = L->top; }
+
+#define api_checknelems(L,n) api_check(L, (n) < (L->top - L->ci->func), \
+ "not enough elements in the stack")
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lauxlib.c b/sys/contrib/openzfs/module/lua/lauxlib.c
new file mode 100644
index 000000000000..1e0356e7c00e
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lauxlib.c
@@ -0,0 +1,800 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lauxlib.c,v 1.248.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions for building Lua libraries
+** See Copyright Notice in lua.h
+*/
+
+
+/* This file uses only the official API of Lua.
+** Any function declared here could be written as an application function.
+*/
+
+#define lauxlib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+
+
+/*
+** {======================================================
+** Traceback
+** =======================================================
+*/
+
+
+#define LEVELS1 12 /* size of the first part of the stack */
+#define LEVELS2 10 /* size of the second part of the stack */
+
+
+
+/*
+** search for 'objidx' in table at index -1.
+** return 1 + string at top if find a good name.
+*/
+static int findfield (lua_State *L, int objidx, int level) {
+ if (level == 0 || !lua_istable(L, -1))
+ return 0; /* not found */
+ lua_pushnil(L); /* start 'next' loop */
+ while (lua_next(L, -2)) { /* for each pair in table */
+ if (lua_type(L, -2) == LUA_TSTRING) { /* ignore non-string keys */
+ if (lua_rawequal(L, objidx, -1)) { /* found object? */
+ lua_pop(L, 1); /* remove value (but keep name) */
+ return 1;
+ }
+ else if (findfield(L, objidx, level - 1)) { /* try recursively */
+ lua_remove(L, -2); /* remove table (but keep name) */
+ lua_pushliteral(L, ".");
+ lua_insert(L, -2); /* place '.' between the two names */
+ lua_concat(L, 3);
+ return 1;
+ }
+ }
+ lua_pop(L, 1); /* remove value */
+ }
+ return 0; /* not found */
+}
+
+
+static int pushglobalfuncname (lua_State *L, lua_Debug *ar) {
+ int top = lua_gettop(L);
+ lua_getinfo(L, "f", ar); /* push function */
+ lua_pushglobaltable(L);
+ if (findfield(L, top + 1, 2)) {
+ lua_copy(L, -1, top + 1); /* move name to proper place */
+ lua_pop(L, 2); /* remove pushed values */
+ return 1;
+ }
+ else {
+ lua_settop(L, top); /* remove function and global table */
+ return 0;
+ }
+}
+
+
+static void pushfuncname (lua_State *L, lua_Debug *ar) {
+ if (*ar->namewhat != '\0') /* is there a name? */
+ lua_pushfstring(L, "function " LUA_QS, ar->name);
+ else if (*ar->what == 'm') /* main? */
+ lua_pushliteral(L, "main chunk");
+ else if (*ar->what == 'C') {
+ if (pushglobalfuncname(L, ar)) {
+ lua_pushfstring(L, "function " LUA_QS, lua_tostring(L, -1));
+ lua_remove(L, -2); /* remove name */
+ }
+ else
+ lua_pushliteral(L, "?");
+ }
+ else
+ lua_pushfstring(L, "function <%s:%d>", ar->short_src, ar->linedefined);
+}
+
+
+static int countlevels (lua_State *L) {
+ lua_Debug ar;
+ int li = 1, le = 1;
+ /* find an upper bound */
+ while (lua_getstack(L, le, &ar)) { li = le; le *= 2; }
+ /* do a binary search */
+ while (li < le) {
+ int m = (li + le)/2;
+ if (lua_getstack(L, m, &ar)) li = m + 1;
+ else le = m;
+ }
+ return le - 1;
+}
+
+
+LUALIB_API void luaL_traceback (lua_State *L, lua_State *L1,
+ const char *msg, int level) {
+ lua_Debug ar;
+ int top = lua_gettop(L);
+ int numlevels = countlevels(L1);
+ int mark = (numlevels > LEVELS1 + LEVELS2) ? LEVELS1 : 0;
+ if (msg) lua_pushfstring(L, "%s\n", msg);
+ lua_pushliteral(L, "stack traceback:");
+ while (lua_getstack(L1, level++, &ar)) {
+ if (level == mark) { /* too many levels? */
+ lua_pushliteral(L, "\n\t..."); /* add a '...' */
+ level = numlevels - LEVELS2; /* and skip to last ones */
+ }
+ else {
+ lua_getinfo(L1, "Slnt", &ar);
+ lua_pushfstring(L, "\n\t%s:", ar.short_src);
+ if (ar.currentline > 0)
+ lua_pushfstring(L, "%d:", ar.currentline);
+ lua_pushliteral(L, " in ");
+ pushfuncname(L, &ar);
+ if (ar.istailcall)
+ lua_pushliteral(L, "\n\t(...tail calls...)");
+ lua_concat(L, lua_gettop(L) - top);
+ }
+ }
+ lua_concat(L, lua_gettop(L) - top);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Error-report functions
+** =======================================================
+*/
+
+LUALIB_API int luaL_argerror (lua_State *L, int narg, const char *extramsg) {
+ lua_Debug ar;
+ if (!lua_getstack(L, 0, &ar)) /* no stack frame? */
+ return luaL_error(L, "bad argument #%d (%s)", narg, extramsg);
+ lua_getinfo(L, "n", &ar);
+ if (strcmp(ar.namewhat, "method") == 0) {
+ narg--; /* do not count `self' */
+ if (narg == 0) /* error is in the self argument itself? */
+ return luaL_error(L, "calling " LUA_QS " on bad self (%s)",
+ ar.name, extramsg);
+ }
+ if (ar.name == NULL)
+ ar.name = (pushglobalfuncname(L, &ar)) ? lua_tostring(L, -1) : "?";
+ return luaL_error(L, "bad argument #%d to " LUA_QS " (%s)",
+ narg, ar.name, extramsg);
+}
+
+
+static int typeerror (lua_State *L, int narg, const char *tname) {
+ const char *msg = lua_pushfstring(L, "%s expected, got %s",
+ tname, luaL_typename(L, narg));
+ return luaL_argerror(L, narg, msg);
+}
+
+
+static void tag_error (lua_State *L, int narg, int tag) {
+ typeerror(L, narg, lua_typename(L, tag));
+}
+
+
+LUALIB_API void luaL_where (lua_State *L, int level) {
+ lua_Debug ar;
+ if (lua_getstack(L, level, &ar)) { /* check function at level */
+ lua_getinfo(L, "Sl", &ar); /* get info about it */
+ if (ar.currentline > 0) { /* is there info? */
+ lua_pushfstring(L, "%s:%d: ", ar.short_src, ar.currentline);
+ return;
+ }
+ }
+ lua_pushliteral(L, ""); /* else, no information available... */
+}
+
+
+LUALIB_API int luaL_error (lua_State *L, const char *fmt, ...) {
+ va_list argp;
+ va_start(argp, fmt);
+ luaL_where(L, 1);
+ lua_pushvfstring(L, fmt, argp);
+ va_end(argp);
+ lua_concat(L, 2);
+ return lua_error(L);
+}
+
+
+#if !defined(inspectstat) /* { */
+
+#if defined(LUA_USE_POSIX)
+
+#include <sys/wait.h>
+
+/*
+** use appropriate macros to interpret 'pclose' return status
+*/
+#define inspectstat(stat,what) \
+ if (WIFEXITED(stat)) { stat = WEXITSTATUS(stat); } \
+ else if (WIFSIGNALED(stat)) { stat = WTERMSIG(stat); what = "signal"; }
+
+#else
+
+#define inspectstat(stat,what) /* no op */
+
+#endif
+
+#endif /* } */
+
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Userdata's metatable manipulation
+** =======================================================
+*/
+
+LUALIB_API int luaL_newmetatable (lua_State *L, const char *tname) {
+ luaL_getmetatable(L, tname); /* try to get metatable */
+ if (!lua_isnil(L, -1)) /* name already in use? */
+ return 0; /* leave previous value on top, but return 0 */
+ lua_pop(L, 1);
+ lua_newtable(L); /* create metatable */
+ lua_pushvalue(L, -1);
+ lua_setfield(L, LUA_REGISTRYINDEX, tname); /* registry.name = metatable */
+ return 1;
+}
+
+
+LUALIB_API void luaL_setmetatable (lua_State *L, const char *tname) {
+ luaL_getmetatable(L, tname);
+ lua_setmetatable(L, -2);
+}
+
+
+LUALIB_API void *luaL_testudata (lua_State *L, int ud, const char *tname) {
+ void *p = lua_touserdata(L, ud);
+ if (p != NULL) { /* value is a userdata? */
+ if (lua_getmetatable(L, ud)) { /* does it have a metatable? */
+ luaL_getmetatable(L, tname); /* get correct metatable */
+ if (!lua_rawequal(L, -1, -2)) /* not the same? */
+ p = NULL; /* value is a userdata with wrong metatable */
+ lua_pop(L, 2); /* remove both metatables */
+ return p;
+ }
+ }
+ return NULL; /* value is not a userdata with a metatable */
+}
+
+
+LUALIB_API void *luaL_checkudata (lua_State *L, int ud, const char *tname) {
+ void *p = luaL_testudata(L, ud, tname);
+ if (p == NULL) typeerror(L, ud, tname);
+ return p;
+}
+
+/* }====================================================== */
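Editor's note: the three functions above implement the standard pattern for typed userdata: register a metatable under a name, attach it to freshly created userdata, and verify it on every access. A short embedder-side sketch of that pattern; the type name "MyLib.Point" and the struct are invented for illustration, and the include paths follow the in-tree layout used in this diff:

  #include <sys/lua/lua.h>
  #include <sys/lua/lauxlib.h>

  typedef struct { double x, y; } Point;     /* hypothetical userdata payload */

  static int point_new (lua_State *L) {
    Point *p = (Point *)lua_newuserdata(L, sizeof(Point));
    p->x = luaL_checknumber(L, 1);
    p->y = luaL_checknumber(L, 2);
    luaL_setmetatable(L, "MyLib.Point");     /* attach the registered metatable */
    return 1;
  }

  static int point_getx (lua_State *L) {
    Point *p = (Point *)luaL_checkudata(L, 1, "MyLib.Point");  /* type check */
    lua_pushnumber(L, p->x);
    return 1;
  }

  static void point_register (lua_State *L) {
    luaL_newmetatable(L, "MyLib.Point");     /* registry["MyLib.Point"] = mt */
    lua_pop(L, 1);                           /* metatable not needed on stack */
  }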
+
+
+/*
+** {======================================================
+** Argument check functions
+** =======================================================
+*/
+
+LUALIB_API int luaL_checkoption (lua_State *L, int narg, const char *def,
+ const char *const lst[]) {
+ const char *name = (def) ? luaL_optstring(L, narg, def) :
+ luaL_checkstring(L, narg);
+ int i;
+ for (i=0; lst[i]; i++)
+ if (strcmp(lst[i], name) == 0)
+ return i;
+ return luaL_argerror(L, narg,
+ lua_pushfstring(L, "invalid option " LUA_QS, name));
+}
+
+
+LUALIB_API void luaL_checkstack (lua_State *L, int space, const char *msg) {
+ /* keep some extra space to run error routines, if needed */
+ const int extra = LUA_MINSTACK;
+ if (!lua_checkstack(L, space + extra)) {
+ if (msg)
+ luaL_error(L, "stack overflow (%s)", msg);
+ else
+ luaL_error(L, "stack overflow");
+ }
+}
+
+
+LUALIB_API void luaL_checktype (lua_State *L, int narg, int t) {
+ if (lua_type(L, narg) != t)
+ tag_error(L, narg, t);
+}
+
+
+LUALIB_API void luaL_checkany (lua_State *L, int narg) {
+ if (lua_type(L, narg) == LUA_TNONE)
+ luaL_argerror(L, narg, "value expected");
+}
+
+
+LUALIB_API const char *luaL_checklstring (lua_State *L, int narg, size_t *len) {
+ const char *s = lua_tolstring(L, narg, len);
+ if (!s) tag_error(L, narg, LUA_TSTRING);
+ return s;
+}
+
+
+LUALIB_API const char *luaL_optlstring (lua_State *L, int narg,
+ const char *def, size_t *len) {
+ if (lua_isnoneornil(L, narg)) {
+ if (len)
+ *len = (def ? strlen(def) : 0);
+ return def;
+ }
+ else return luaL_checklstring(L, narg, len);
+}
+
+
+LUALIB_API lua_Number luaL_checknumber (lua_State *L, int narg) {
+ int isnum;
+ lua_Number d = lua_tonumberx(L, narg, &isnum);
+ if (!isnum)
+ tag_error(L, narg, LUA_TNUMBER);
+ return d;
+}
+
+
+LUALIB_API lua_Number luaL_optnumber (lua_State *L, int narg, lua_Number def) {
+ return luaL_opt(L, luaL_checknumber, narg, def);
+}
+
+
+LUALIB_API lua_Integer luaL_checkinteger (lua_State *L, int narg) {
+ int isnum;
+ lua_Integer d = lua_tointegerx(L, narg, &isnum);
+ if (!isnum)
+ tag_error(L, narg, LUA_TNUMBER);
+ return d;
+}
+
+
+LUALIB_API lua_Unsigned luaL_checkunsigned (lua_State *L, int narg) {
+ int isnum;
+ lua_Unsigned d = lua_tounsignedx(L, narg, &isnum);
+ if (!isnum)
+ tag_error(L, narg, LUA_TNUMBER);
+ return d;
+}
+
+
+LUALIB_API lua_Integer luaL_optinteger (lua_State *L, int narg,
+ lua_Integer def) {
+ return luaL_opt(L, luaL_checkinteger, narg, def);
+}
+
+
+LUALIB_API lua_Unsigned luaL_optunsigned (lua_State *L, int narg,
+ lua_Unsigned def) {
+ return luaL_opt(L, luaL_checkunsigned, narg, def);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Generic Buffer manipulation
+** =======================================================
+*/
+
+/*
+** check whether buffer is using a userdata on the stack as a temporary
+** buffer
+*/
+#define buffonstack(B) ((B)->b != (B)->initb)
+
+
+/*
+** returns a pointer to a free area with at least 'sz' bytes
+*/
+LUALIB_API char *luaL_prepbuffsize (luaL_Buffer *B, size_t sz) {
+ lua_State *L = B->L;
+ if (B->size - B->n < sz) { /* not enough space? */
+ char *newbuff;
+ size_t newsize = B->size * 2; /* double buffer size */
+ if (newsize - B->n < sz) /* not big enough? */
+ newsize = B->n + sz;
+ if (newsize < B->n || newsize - B->n < sz)
+ luaL_error(L, "buffer too large");
+ /* create larger buffer */
+ newbuff = (char *)lua_newuserdata(L, newsize * sizeof(char));
+ /* move content to new buffer */
+ memcpy(newbuff, B->b, B->n * sizeof(char));
+ if (buffonstack(B))
+ lua_remove(L, -2); /* remove old buffer */
+ B->b = newbuff;
+ B->size = newsize;
+ }
+ return &B->b[B->n];
+}
+
+
+LUALIB_API void luaL_addlstring (luaL_Buffer *B, const char *s, size_t l) {
+ char *b = luaL_prepbuffsize(B, l);
+ memcpy(b, s, l * sizeof(char));
+ luaL_addsize(B, l);
+}
+
+
+LUALIB_API void luaL_addstring (luaL_Buffer *B, const char *s) {
+ luaL_addlstring(B, s, strlen(s));
+}
+
+
+LUALIB_API void luaL_pushresult (luaL_Buffer *B) {
+ lua_State *L = B->L;
+ lua_pushlstring(L, B->b, B->n);
+ if (buffonstack(B))
+ lua_remove(L, -2); /* remove old buffer */
+}
+
+
+LUALIB_API void luaL_pushresultsize (luaL_Buffer *B, size_t sz) {
+ luaL_addsize(B, sz);
+ luaL_pushresult(B);
+}
+
+
+LUALIB_API void luaL_addvalue (luaL_Buffer *B) {
+ lua_State *L = B->L;
+ size_t l;
+ const char *s = lua_tolstring(L, -1, &l);
+ if (buffonstack(B))
+ lua_insert(L, -2); /* put value below buffer */
+ luaL_addlstring(B, s, l);
+ lua_remove(L, (buffonstack(B)) ? -2 : -1); /* remove value */
+}
+
+
+LUALIB_API void luaL_buffinit (lua_State *L, luaL_Buffer *B) {
+ B->L = L;
+ B->b = B->initb;
+ B->n = 0;
+ B->size = LUAL_BUFFERSIZE;
+}
+
+
+LUALIB_API char *luaL_buffinitsize (lua_State *L, luaL_Buffer *B, size_t sz) {
+ luaL_buffinit(L, B);
+ return luaL_prepbuffsize(B, sz);
+}
+
+/* }====================================================== */
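Editor's note: the buffer API above accumulates pieces in a private LUAL_BUFFERSIZE array and spills to a userdata on the stack only when that area overflows, so short results never allocate. A hypothetical C function using it to join two string arguments with a separator (all names invented):

  #include <sys/lua/lua.h>
  #include <sys/lua/lauxlib.h>

  /* join(a, b) -> "a,b": illustrative use of luaL_Buffer. */
  static int l_join (lua_State *L) {
    luaL_Buffer b;
    luaL_buffinit(L, &b);
    luaL_addstring(&b, luaL_checkstring(L, 1));
    luaL_addchar(&b, ',');                   /* single-byte append (macro) */
    luaL_addstring(&b, luaL_checkstring(L, 2));
    luaL_pushresult(&b);                     /* collapse into one Lua string */
    return 1;
  }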
+
+
+/*
+** {======================================================
+** Reference system
+** =======================================================
+*/
+
+/* index of free-list header */
+#define freelist 0
+
+
+LUALIB_API int luaL_ref (lua_State *L, int t) {
+ int ref;
+ if (lua_isnil(L, -1)) {
+ lua_pop(L, 1); /* remove from stack */
+ return LUA_REFNIL; /* `nil' has a unique fixed reference */
+ }
+ t = lua_absindex(L, t);
+ lua_rawgeti(L, t, freelist); /* get first free element */
+ ref = (int)lua_tointeger(L, -1); /* ref = t[freelist] */
+ lua_pop(L, 1); /* remove it from stack */
+ if (ref != 0) { /* any free element? */
+ lua_rawgeti(L, t, ref); /* remove it from list */
+ lua_rawseti(L, t, freelist); /* (t[freelist] = t[ref]) */
+ }
+ else /* no free elements */
+ ref = (int)lua_rawlen(L, t) + 1; /* get a new reference */
+ lua_rawseti(L, t, ref);
+ return ref;
+}
+
+
+LUALIB_API void luaL_unref (lua_State *L, int t, int ref) {
+ if (ref >= 0) {
+ t = lua_absindex(L, t);
+ lua_rawgeti(L, t, freelist);
+ lua_rawseti(L, t, ref); /* t[ref] = t[freelist] */
+ lua_pushinteger(L, ref);
+ lua_rawseti(L, t, freelist); /* t[freelist] = ref */
+ }
+}
+
+/* }====================================================== */
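Editor's note: luaL_ref/luaL_unref recycle integer keys through a free list threaded through the same table, with slot 0 (the 'freelist' index) holding the head, so released references are reused before the table grows. A standalone sketch of that bookkeeping over a plain C array, purely illustrative and without bounds checking:

  #include <stdio.h>

  #define NSLOTS 16
  static int slots[NSLOTS];  /* slots[0] is the free-list head (0 = empty) */
  static int used = 0;       /* highest slot handed out so far */

  static int ref_alloc(void) {
    int r = slots[0];
    if (r != 0)
      slots[0] = slots[r];   /* pop a recycled slot off the free list */
    else
      r = ++used;            /* nothing to recycle: extend the array */
    return r;
  }

  static void ref_free(int r) {
    slots[r] = slots[0];     /* push the slot onto the free list */
    slots[0] = r;
  }

  int main(void) {
    int a = ref_alloc(), b = ref_alloc();
    ref_free(a);
    printf("%d %d %d\n", a, b, ref_alloc());  /* prints "1 2 1": slot reused */
    return 0;
  }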
+
+
+/*
+** {======================================================
+** Load functions
+** =======================================================
+*/
+
+typedef struct LoadS {
+ const char *s;
+ size_t size;
+} LoadS;
+
+
+static const char *getS (lua_State *L, void *ud, size_t *size) {
+ LoadS *ls = (LoadS *)ud;
+ (void)L; /* not used */
+ if (ls->size == 0) return NULL;
+ *size = ls->size;
+ ls->size = 0;
+ return ls->s;
+}
+
+
+LUALIB_API int luaL_loadbufferx (lua_State *L, const char *buff, size_t size,
+ const char *name, const char *mode) {
+ LoadS ls;
+ ls.s = buff;
+ ls.size = size;
+ return lua_load(L, getS, &ls, name, mode);
+}
+
+
+LUALIB_API int luaL_loadstring (lua_State *L, const char *s) {
+ return luaL_loadbuffer(L, s, strlen(s), s);
+}
+
+/* }====================================================== */
+
+
+
+LUALIB_API int luaL_getmetafield (lua_State *L, int obj, const char *event) {
+ if (!lua_getmetatable(L, obj)) /* no metatable? */
+ return 0;
+ lua_pushstring(L, event);
+ lua_rawget(L, -2);
+ if (lua_isnil(L, -1)) {
+ lua_pop(L, 2); /* remove metatable and metafield */
+ return 0;
+ }
+ else {
+ lua_remove(L, -2); /* remove only metatable */
+ return 1;
+ }
+}
+
+
+LUALIB_API int luaL_callmeta (lua_State *L, int obj, const char *event) {
+ obj = lua_absindex(L, obj);
+ if (!luaL_getmetafield(L, obj, event)) /* no metafield? */
+ return 0;
+ lua_pushvalue(L, obj);
+ lua_call(L, 1, 1);
+ return 1;
+}
+
+
+LUALIB_API int luaL_len (lua_State *L, int idx) {
+ int l;
+ int isnum;
+ lua_len(L, idx);
+ l = (int)lua_tointegerx(L, -1, &isnum);
+ if (!isnum)
+ luaL_error(L, "object length is not a number");
+ lua_pop(L, 1); /* remove length value */
+ return l;
+}
+
+
+LUALIB_API const char *luaL_tolstring (lua_State *L, int idx, size_t *len) {
+ if (!luaL_callmeta(L, idx, "__tostring")) { /* no metafield? */
+ switch (lua_type(L, idx)) {
+ case LUA_TNUMBER:
+ case LUA_TSTRING:
+ lua_pushvalue(L, idx);
+ break;
+ case LUA_TBOOLEAN:
+ lua_pushstring(L, (lua_toboolean(L, idx) ? "true" : "false"));
+ break;
+ case LUA_TNIL:
+ lua_pushliteral(L, "nil");
+ break;
+ default:
+ lua_pushfstring(L, "%s: %p", luaL_typename(L, idx),
+ lua_topointer(L, idx));
+ break;
+ }
+ }
+ return lua_tolstring(L, -1, len);
+}
+
+
+/*
+** {======================================================
+** Compatibility with 5.1 module functions
+** =======================================================
+*/
+#if defined(LUA_COMPAT_MODULE)
+
+static const char *luaL_findtable (lua_State *L, int idx,
+ const char *fname, int szhint) {
+ const char *e;
+ if (idx) lua_pushvalue(L, idx);
+ do {
+ e = strchr(fname, '.');
+ if (e == NULL) e = fname + strlen(fname);
+ lua_pushlstring(L, fname, e - fname);
+ lua_rawget(L, -2);
+ if (lua_isnil(L, -1)) { /* no such field? */
+ lua_pop(L, 1); /* remove this nil */
+ lua_createtable(L, 0, (*e == '.' ? 1 : szhint)); /* new table for field */
+ lua_pushlstring(L, fname, e - fname);
+ lua_pushvalue(L, -2);
+ lua_settable(L, -4); /* set new table into field */
+ }
+ else if (!lua_istable(L, -1)) { /* field has a non-table value? */
+ lua_pop(L, 2); /* remove table and value */
+ return fname; /* return problematic part of the name */
+ }
+ lua_remove(L, -2); /* remove previous table */
+ fname = e + 1;
+ } while (*e == '.');
+ return NULL;
+}
+
+
+/*
+** Count number of elements in a luaL_Reg list.
+*/
+static int libsize (const luaL_Reg *l) {
+ int size = 0;
+ for (; l && l->name; l++) size++;
+ return size;
+}
+
+
+/*
+** Find or create a module table with a given name. The function
+** first looks in the _LOADED table and, if that fails, tries a
+** global variable with that name. In either case, it leaves the
+** module table on the stack.
+*/
+LUALIB_API void luaL_pushmodule (lua_State *L, const char *modname,
+ int sizehint) {
+ luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 1); /* get _LOADED table */
+ lua_getfield(L, -1, modname); /* get _LOADED[modname] */
+ if (!lua_istable(L, -1)) { /* not found? */
+ lua_pop(L, 1); /* remove previous result */
+ /* try global variable (and create one if it does not exist) */
+ lua_pushglobaltable(L);
+ if (luaL_findtable(L, 0, modname, sizehint) != NULL)
+ luaL_error(L, "name conflict for module " LUA_QS, modname);
+ lua_pushvalue(L, -1);
+ lua_setfield(L, -3, modname); /* _LOADED[modname] = new table */
+ }
+ lua_remove(L, -2); /* remove _LOADED table */
+}
+
+
+LUALIB_API void luaL_openlib (lua_State *L, const char *libname,
+ const luaL_Reg *l, int nup) {
+ luaL_checkversion(L);
+ if (libname) {
+ luaL_pushmodule(L, libname, libsize(l)); /* get/create library table */
+ lua_insert(L, -(nup + 1)); /* move library table to below upvalues */
+ }
+ if (l)
+ luaL_setfuncs(L, l, nup);
+ else
+ lua_pop(L, nup); /* remove upvalues */
+}
+
+#endif
+/* }====================================================== */
+
+/*
+** set functions from list 'l' into table at top - 'nup'; each
+** function gets the 'nup' elements at the top as upvalues.
+** Returns with only the table on the stack.
+*/
+LUALIB_API void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) {
+ luaL_checkversion(L);
+ luaL_checkstack(L, nup, "too many upvalues");
+ for (; l->name != NULL; l++) { /* fill the table with given functions */
+ int i;
+ for (i = 0; i < nup; i++) /* copy upvalues to the top */
+ lua_pushvalue(L, -nup);
+ lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */
+ lua_setfield(L, -(nup + 2), l->name);
+ }
+ lua_pop(L, nup); /* remove upvalues */
+}
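Editor's note: luaL_setfuncs is what lets several library functions share state through upvalues: the caller pushes the shared values once, and every registered closure receives its own copies of them. A hypothetical module opener registering two functions that share one upvalue (a table holding a counter); all names are invented:

  #include <sys/lua/lua.h>
  #include <sys/lua/lauxlib.h>

  static int l_bump (lua_State *L) {
    lua_getfield(L, lua_upvalueindex(1), "n");  /* shared state table */
    lua_pushinteger(L, lua_tointeger(L, -1) + 1);
    lua_setfield(L, lua_upvalueindex(1), "n");
    return 0;
  }

  static int l_value (lua_State *L) {
    lua_getfield(L, lua_upvalueindex(1), "n");
    return 1;
  }

  static const luaL_Reg counter_funcs[] = {
    {"bump", l_bump},
    {"value", l_value},
    {NULL, NULL}
  };

  static int luaopen_counter (lua_State *L) {   /* hypothetical opener */
    lua_newtable(L);                    /* module table */
    lua_newtable(L);                    /* shared upvalue */
    lua_pushinteger(L, 0);
    lua_setfield(L, -2, "n");           /* upvalue.n = 0 */
    luaL_setfuncs(L, counter_funcs, 1); /* each closure gets the upvalue */
    return 1;                           /* only the module table remains */
  }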
+
+
+/*
+** ensure that stack[idx][fname] has a table and push that table
+** into the stack
+*/
+LUALIB_API int luaL_getsubtable (lua_State *L, int idx, const char *fname) {
+ lua_getfield(L, idx, fname);
+ if (lua_istable(L, -1)) return 1; /* table already there */
+ else {
+ lua_pop(L, 1); /* remove previous result */
+ idx = lua_absindex(L, idx);
+ lua_newtable(L);
+ lua_pushvalue(L, -1); /* copy to be left at top */
+ lua_setfield(L, idx, fname); /* assign new table to field */
+ return 0; /* false, because did not find table there */
+ }
+}
+
+
+/*
+** stripped-down 'require'. Calls 'openf' to open a module,
+** registers the result in the 'package.loaded' table and, if 'glb'
+** is true, also registers the result in the global table.
+** Leaves the resulting module on top of the stack.
+*/
+LUALIB_API void luaL_requiref (lua_State *L, const char *modname,
+ lua_CFunction openf, int glb) {
+ lua_pushcfunction(L, openf);
+ lua_pushstring(L, modname); /* argument to open function */
+ lua_call(L, 1, 1); /* open module */
+ luaL_getsubtable(L, LUA_REGISTRYINDEX, "_LOADED");
+ lua_pushvalue(L, -2); /* make copy of module (call result) */
+ lua_setfield(L, -2, modname); /* _LOADED[modname] = module */
+ lua_pop(L, 1); /* remove _LOADED table */
+ if (glb) {
+ lua_pushvalue(L, -1); /* copy of 'mod' */
+ lua_setglobal(L, modname); /* _G[modname] = module */
+ }
+}
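Editor's note: luaL_requiref gives embedders the stripped-down 'require' described above. A hypothetical call sequence loading the coroutine library (added later in this diff, lcorolib.c) into a fresh state and exposing it as a global; the wrapper name is invented:

  #include <sys/lua/lua.h>
  #include <sys/lua/lauxlib.h>
  #include <sys/lua/lualib.h>

  static void open_coroutine_lib (lua_State *L) {
    luaL_requiref(L, "coroutine", luaopen_coroutine, 1 /* glb */);
    lua_pop(L, 1);  /* luaL_requiref leaves the module table on the stack */
  }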
+
+
+LUALIB_API const char *luaL_gsub (lua_State *L, const char *s, const char *p,
+ const char *r) {
+ const char *wild;
+ size_t l = strlen(p);
+ luaL_Buffer b;
+ luaL_buffinit(L, &b);
+ while ((wild = strstr(s, p)) != NULL) {
+ luaL_addlstring(&b, s, wild - s); /* push prefix */
+ luaL_addstring(&b, r); /* push replacement in place of pattern */
+ s = wild + l; /* continue after `p' */
+ }
+ luaL_addstring(&b, s); /* push last suffix */
+ luaL_pushresult(&b);
+ return lua_tostring(L, -1);
+}
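Editor's note: luaL_gsub performs plain (non-pattern) substring replacement, interns the result as a Lua string, and returns a pointer that stays valid while that string remains on the stack. A hypothetical use converting a dotted module name into a path fragment:

  #include <sys/lua/lua.h>
  #include <sys/lua/lauxlib.h>

  /* "a.b.c" -> "a/b/c"; the resulting string is left on the stack. */
  static const char *dots_to_slashes (lua_State *L, const char *modname) {
    return luaL_gsub(L, modname, ".", "/");
  }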
+
+
+LUALIB_API void luaL_checkversion_ (lua_State *L, lua_Number ver) {
+ const lua_Number *v = lua_version(L);
+ if (v != lua_version(NULL))
+ luaL_error(L, "multiple Lua VMs detected");
+ else if (*v != ver)
+ luaL_error(L, "version mismatch: app. needs %f, Lua core provides %f",
+ ver, *v);
+ /* check conversions number -> integer types */
+ lua_pushnumber(L, -(lua_Number)0x1234);
+ if (lua_tointeger(L, -1) != -0x1234 ||
+ lua_tounsigned(L, -1) != (lua_Unsigned)-0x1234)
+ luaL_error(L, "bad conversion number->int;"
+ " must recompile Lua with proper settings");
+ lua_pop(L, 1);
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaL_argerror);
+EXPORT_SYMBOL(luaL_error);
+EXPORT_SYMBOL(luaL_loadbufferx);
+EXPORT_SYMBOL(luaL_newmetatable);
+EXPORT_SYMBOL(luaL_traceback);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lbaselib.c b/sys/contrib/openzfs/module/lua/lbaselib.c
new file mode 100644
index 000000000000..854649a0fb4d
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lbaselib.c
@@ -0,0 +1,296 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lbaselib.c,v 1.276.1.1 2013/04/12 18:48:47 roberto Exp $
+** Basic library
+** See Copyright Notice in lua.h
+*/
+
+/* The following built-in lua functions have been removed and are not available
+ * for use in ZFS channel programs:
+ *
+ * dofile
+ * loadfile
+ * load
+ * pcall
+ * print
+ * xpcall
+ */
+
+
+#define lbaselib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+#include <sys/lua/lualib.h>
+
+#define SPACECHARS " \f\n\r\t\v"
+
+static int luaB_tonumber (lua_State *L) {
+ if (lua_isnoneornil(L, 2)) { /* standard conversion */
+ int isnum;
+ lua_Number n = lua_tonumberx(L, 1, &isnum);
+ if (isnum) {
+ lua_pushnumber(L, n);
+ return 1;
+ } /* else not a number; must be something */
+ luaL_checkany(L, 1);
+ }
+ else {
+ size_t l;
+ const char *s = luaL_checklstring(L, 1, &l);
+ const char *e = s + l; /* end point for 's' */
+ int base = luaL_checkint(L, 2);
+ int neg = 0;
+ luaL_argcheck(L, 2 <= base && base <= 36, 2, "base out of range");
+ s += strspn(s, SPACECHARS); /* skip initial spaces */
+ if (*s == '-') { s++; neg = 1; } /* handle sign */
+ else if (*s == '+') s++;
+ if (isalnum((unsigned char)*s)) {
+ lua_Number n = 0;
+ do {
+ int digit = (isdigit((unsigned char)*s)) ? *s - '0'
+ : toupper((unsigned char)*s) - 'A' + 10;
+ if (digit >= base) break; /* invalid numeral; force a fail */
+ n = n * (lua_Number)base + (lua_Number)digit;
+ s++;
+ } while (isalnum((unsigned char)*s));
+ s += strspn(s, SPACECHARS); /* skip trailing spaces */
+ if (s == e) { /* no invalid trailing characters? */
+ lua_pushnumber(L, (neg) ? -n : n);
+ return 1;
+ } /* else not a number */
+ } /* else not a number */
+ }
+ lua_pushnil(L); /* not a number */
+ return 1;
+}
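Editor's note: when an explicit base is given, luaB_tonumber parses digits by hand, mapping '0'-'9' and letters to values and rejecting any digit >= base. A standalone sketch of that digit loop for a plain C string, with simplified sign/whitespace handling and invented names:

  #include <ctype.h>
  #include <stdio.h>

  /* Parse 's' as an unsigned integer in 'base' (2..36); returns -1 if there
   * is no digit, an invalid digit, or trailing junk. */
  static long parse_in_base(const char *s, int base) {
    long n = 0;
    int any = 0;
    for (; isalnum((unsigned char)*s); s++, any = 1) {
      int digit = isdigit((unsigned char)*s) ? *s - '0'
          : toupper((unsigned char)*s) - 'A' + 10;
      if (digit >= base)
        return -1;                /* e.g. '9' is not an octal digit */
      n = n * base + digit;
    }
    return (any && *s == '\0') ? n : -1;
  }

  int main(void) {
    printf("%ld %ld\n", parse_in_base("ff", 16), parse_in_base("19", 8));
    /* prints "255 -1" */
    return 0;
  }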
+
+
+static int luaB_error (lua_State *L) {
+ int level = luaL_optint(L, 2, 1);
+ lua_settop(L, 1);
+ if (lua_isstring(L, 1) && level > 0) { /* add extra information? */
+ luaL_where(L, level);
+ lua_pushvalue(L, 1);
+ lua_concat(L, 2);
+ }
+ return lua_error(L);
+}
+
+
+static int luaB_getmetatable (lua_State *L) {
+ luaL_checkany(L, 1);
+ if (!lua_getmetatable(L, 1)) {
+ lua_pushnil(L);
+ return 1; /* no metatable */
+ }
+ luaL_getmetafield(L, 1, "__metatable");
+ return 1; /* returns either __metatable field (if present) or metatable */
+}
+
+
+static int luaB_setmetatable (lua_State *L) {
+ int t = lua_type(L, 2);
+ luaL_checktype(L, 1, LUA_TTABLE);
+ luaL_argcheck(L, t == LUA_TNIL || t == LUA_TTABLE, 2,
+ "nil or table expected");
+ if (luaL_getmetafield(L, 1, "__metatable"))
+ return luaL_error(L, "cannot change a protected metatable");
+ lua_settop(L, 2);
+ lua_setmetatable(L, 1);
+ return 1;
+}
+
+
+static int luaB_rawequal (lua_State *L) {
+ luaL_checkany(L, 1);
+ luaL_checkany(L, 2);
+ lua_pushboolean(L, lua_rawequal(L, 1, 2));
+ return 1;
+}
+
+
+static int luaB_rawlen (lua_State *L) {
+ int t = lua_type(L, 1);
+ luaL_argcheck(L, t == LUA_TTABLE || t == LUA_TSTRING, 1,
+ "table or string expected");
+ lua_pushinteger(L, lua_rawlen(L, 1));
+ return 1;
+}
+
+
+static int luaB_rawget (lua_State *L) {
+ luaL_checktype(L, 1, LUA_TTABLE);
+ luaL_checkany(L, 2);
+ lua_settop(L, 2);
+ lua_rawget(L, 1);
+ return 1;
+}
+
+static int luaB_rawset (lua_State *L) {
+ luaL_checktype(L, 1, LUA_TTABLE);
+ luaL_checkany(L, 2);
+ luaL_checkany(L, 3);
+ lua_settop(L, 3);
+ lua_rawset(L, 1);
+ return 1;
+}
+
+
+static int luaB_collectgarbage (lua_State *L) {
+ static const char *const opts[] = {"stop", "restart", "collect",
+ "count", "step", "setpause", "setstepmul",
+ "setmajorinc", "isrunning", "generational", "incremental", NULL};
+ static const int optsnum[] = {LUA_GCSTOP, LUA_GCRESTART, LUA_GCCOLLECT,
+ LUA_GCCOUNT, LUA_GCSTEP, LUA_GCSETPAUSE, LUA_GCSETSTEPMUL,
+ LUA_GCSETMAJORINC, LUA_GCISRUNNING, LUA_GCGEN, LUA_GCINC};
+ int o = optsnum[luaL_checkoption(L, 1, "collect", opts)];
+ int ex = luaL_optint(L, 2, 0);
+ int res = lua_gc(L, o, ex);
+ switch (o) {
+ case LUA_GCCOUNT: {
+ int b = lua_gc(L, LUA_GCCOUNTB, 0);
+ lua_pushnumber(L, res + ((lua_Number)b/1024));
+ lua_pushinteger(L, b);
+ return 2;
+ }
+ case LUA_GCSTEP: case LUA_GCISRUNNING: {
+ lua_pushboolean(L, res);
+ return 1;
+ }
+ default: {
+ lua_pushinteger(L, res);
+ return 1;
+ }
+ }
+}
+
+
+static int luaB_type (lua_State *L) {
+ luaL_checkany(L, 1);
+ lua_pushstring(L, luaL_typename(L, 1));
+ return 1;
+}
+
+
+static int pairsmeta (lua_State *L, const char *method, int iszero,
+ lua_CFunction iter) {
+ if (!luaL_getmetafield(L, 1, method)) { /* no metamethod? */
+ luaL_checktype(L, 1, LUA_TTABLE); /* argument must be a table */
+ lua_pushcfunction(L, iter); /* will return generator, */
+ lua_pushvalue(L, 1); /* state, */
+ if (iszero) lua_pushinteger(L, 0); /* and initial value */
+ else lua_pushnil(L);
+ }
+ else {
+ lua_pushvalue(L, 1); /* argument 'self' to metamethod */
+ lua_call(L, 1, 3); /* get 3 values from metamethod */
+ }
+ return 3;
+}
+
+
+static int luaB_next (lua_State *L) {
+ luaL_checktype(L, 1, LUA_TTABLE);
+ lua_settop(L, 2); /* create a 2nd argument if there isn't one */
+ if (lua_next(L, 1))
+ return 2;
+ else {
+ lua_pushnil(L);
+ return 1;
+ }
+}
+
+
+static int luaB_pairs (lua_State *L) {
+ return pairsmeta(L, "__pairs", 0, luaB_next);
+}
+
+
+static int ipairsaux (lua_State *L) {
+ int i = luaL_checkint(L, 2);
+ luaL_checktype(L, 1, LUA_TTABLE);
+ i++; /* next value */
+ lua_pushinteger(L, i);
+ lua_rawgeti(L, 1, i);
+ return (lua_isnil(L, -1)) ? 1 : 2;
+}
+
+
+static int luaB_ipairs (lua_State *L) {
+ return pairsmeta(L, "__ipairs", 1, ipairsaux);
+}
+
+
+static int luaB_assert (lua_State *L) {
+ if (!lua_toboolean(L, 1))
+ return luaL_error(L, "%s", luaL_optstring(L, 2, "assertion failed!"));
+ return lua_gettop(L);
+}
+
+
+static int luaB_select (lua_State *L) {
+ int n = lua_gettop(L);
+ if (lua_type(L, 1) == LUA_TSTRING && *lua_tostring(L, 1) == '#') {
+ lua_pushinteger(L, n-1);
+ return 1;
+ }
+ else {
+ int i = luaL_checkint(L, 1);
+ if (i < 0) i = n + i;
+ else if (i > n) i = n;
+ luaL_argcheck(L, 1 <= i, 1, "index out of range");
+ return n - i;
+ }
+}
+
+static int luaB_tostring (lua_State *L) {
+ luaL_checkany(L, 1);
+ luaL_tolstring(L, 1, NULL);
+ return 1;
+}
+
+static const luaL_Reg base_funcs[] = {
+ {"assert", luaB_assert},
+ {"collectgarbage", luaB_collectgarbage},
+ {"error", luaB_error},
+ {"getmetatable", luaB_getmetatable},
+ {"ipairs", luaB_ipairs},
+#if defined(LUA_COMPAT_LOADSTRING)
+ {"loadstring", luaB_load},
+#endif
+ {"next", luaB_next},
+ {"pairs", luaB_pairs},
+ {"rawequal", luaB_rawequal},
+ {"rawlen", luaB_rawlen},
+ {"rawget", luaB_rawget},
+ {"rawset", luaB_rawset},
+ {"select", luaB_select},
+ {"setmetatable", luaB_setmetatable},
+ {"tonumber", luaB_tonumber},
+ {"tostring", luaB_tostring},
+ {"type", luaB_type},
+ {NULL, NULL}
+};
+
+
+LUAMOD_API int luaopen_base (lua_State *L) {
+ /* set global _G */
+ lua_pushglobaltable(L);
+ lua_pushglobaltable(L);
+ lua_setfield(L, -2, "_G");
+ /* open lib into global table */
+ luaL_setfuncs(L, base_funcs, 0);
+ lua_pushliteral(L, LUA_VERSION);
+ lua_setfield(L, -2, "_VERSION"); /* set global _VERSION */
+ return 1;
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaopen_base);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lcode.c b/sys/contrib/openzfs/module/lua/lcode.c
new file mode 100644
index 000000000000..ae9a3d91d810
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lcode.c
@@ -0,0 +1,884 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lcode.c,v 2.62.1.1 2013/04/12 18:48:47 roberto Exp $
+** Code generator for Lua
+** See Copyright Notice in lua.h
+*/
+
+#define lcode_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lvm.h"
+
+
+#define hasjumps(e) ((e)->t != (e)->f)
+
+
+static int isnumeral(expdesc *e) {
+ return (e->k == VKNUM && e->t == NO_JUMP && e->f == NO_JUMP);
+}
+
+
+void luaK_nil (FuncState *fs, int from, int n) {
+ Instruction *previous;
+ int l = from + n - 1; /* last register to set nil */
+ if (fs->pc > fs->lasttarget) { /* no jumps to current position? */
+ previous = &fs->f->code[fs->pc-1];
+ if (GET_OPCODE(*previous) == OP_LOADNIL) {
+ int pfrom = GETARG_A(*previous);
+ int pl = pfrom + GETARG_B(*previous);
+ if ((pfrom <= from && from <= pl + 1) ||
+ (from <= pfrom && pfrom <= l + 1)) { /* can connect both? */
+ if (pfrom < from) from = pfrom; /* from = min(from, pfrom) */
+ if (pl > l) l = pl; /* l = max(l, pl) */
+ SETARG_A(*previous, from);
+ SETARG_B(*previous, l - from);
+ return;
+ }
+ } /* else go through */
+ }
+ luaK_codeABC(fs, OP_LOADNIL, from, n - 1, 0); /* else no optimization */
+}
+
+
+int luaK_jump (FuncState *fs) {
+ int jpc = fs->jpc; /* save list of jumps to here */
+ int j;
+ fs->jpc = NO_JUMP;
+ j = luaK_codeAsBx(fs, OP_JMP, 0, NO_JUMP);
+ luaK_concat(fs, &j, jpc); /* keep them on hold */
+ return j;
+}
+
+
+void luaK_ret (FuncState *fs, int first, int nret) {
+ luaK_codeABC(fs, OP_RETURN, first, nret+1, 0);
+}
+
+
+static int condjump (FuncState *fs, OpCode op, int A, int B, int C) {
+ luaK_codeABC(fs, op, A, B, C);
+ return luaK_jump(fs);
+}
+
+
+static void fixjump (FuncState *fs, int pc, int dest) {
+ Instruction *jmp = &fs->f->code[pc];
+ int offset = dest-(pc+1);
+ lua_assert(dest != NO_JUMP);
+ if (abs(offset) > MAXARG_sBx)
+ luaX_syntaxerror(fs->ls, "control structure too long");
+ SETARG_sBx(*jmp, offset);
+}
+
+
+/*
+** returns current `pc' and marks it as a jump target (to avoid wrong
+** optimizations with consecutive instructions not in the same basic block).
+*/
+int luaK_getlabel (FuncState *fs) {
+ fs->lasttarget = fs->pc;
+ return fs->pc;
+}
+
+
+static int getjump (FuncState *fs, int pc) {
+ int offset = GETARG_sBx(fs->f->code[pc]);
+ if (offset == NO_JUMP) /* point to itself represents end of list */
+ return NO_JUMP; /* end of list */
+ else
+ return (pc+1)+offset; /* turn offset into absolute position */
+}
+
+
+static Instruction *getjumpcontrol (FuncState *fs, int pc) {
+ Instruction *pi = &fs->f->code[pc];
+ if (pc >= 1 && testTMode(GET_OPCODE(*(pi-1))))
+ return pi-1;
+ else
+ return pi;
+}
+
+
+/*
+** check whether list has any jump that does not produce a value
+** (or produces an inverted value)
+*/
+static int need_value (FuncState *fs, int list) {
+ for (; list != NO_JUMP; list = getjump(fs, list)) {
+ Instruction i = *getjumpcontrol(fs, list);
+ if (GET_OPCODE(i) != OP_TESTSET) return 1;
+ }
+ return 0; /* not found */
+}
+
+
+static int patchtestreg (FuncState *fs, int node, int reg) {
+ Instruction *i = getjumpcontrol(fs, node);
+ if (GET_OPCODE(*i) != OP_TESTSET)
+ return 0; /* cannot patch other instructions */
+ if (reg != NO_REG && reg != GETARG_B(*i))
+ SETARG_A(*i, reg);
+ else /* no register to put value or register already has the value */
+ *i = CREATE_ABC(OP_TEST, GETARG_B(*i), 0, GETARG_C(*i));
+
+ return 1;
+}
+
+
+static void removevalues (FuncState *fs, int list) {
+ for (; list != NO_JUMP; list = getjump(fs, list))
+ patchtestreg(fs, list, NO_REG);
+}
+
+
+static void patchlistaux (FuncState *fs, int list, int vtarget, int reg,
+ int dtarget) {
+ while (list != NO_JUMP) {
+ int next = getjump(fs, list);
+ if (patchtestreg(fs, list, reg))
+ fixjump(fs, list, vtarget);
+ else
+ fixjump(fs, list, dtarget); /* jump to default target */
+ list = next;
+ }
+}
+
+
+static void dischargejpc (FuncState *fs) {
+ patchlistaux(fs, fs->jpc, fs->pc, NO_REG, fs->pc);
+ fs->jpc = NO_JUMP;
+}
+
+
+void luaK_patchlist (FuncState *fs, int list, int target) {
+ if (target == fs->pc)
+ luaK_patchtohere(fs, list);
+ else {
+ lua_assert(target < fs->pc);
+ patchlistaux(fs, list, target, NO_REG, target);
+ }
+}
+
+
+LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level) {
+ level++; /* argument is +1 to reserve 0 as non-op */
+ while (list != NO_JUMP) {
+ int next = getjump(fs, list);
+ lua_assert(GET_OPCODE(fs->f->code[list]) == OP_JMP &&
+ (GETARG_A(fs->f->code[list]) == 0 ||
+ GETARG_A(fs->f->code[list]) >= level));
+ SETARG_A(fs->f->code[list], level);
+ list = next;
+ }
+}
+
+
+void luaK_patchtohere (FuncState *fs, int list) {
+ luaK_getlabel(fs);
+ luaK_concat(fs, &fs->jpc, list);
+}
+
+
+void luaK_concat (FuncState *fs, int *l1, int l2) {
+ if (l2 == NO_JUMP) return;
+ else if (*l1 == NO_JUMP)
+ *l1 = l2;
+ else {
+ int list = *l1;
+ int next;
+ while ((next = getjump(fs, list)) != NO_JUMP) /* find last element */
+ list = next;
+ fixjump(fs, list, l2);
+ }
+}
+
+
+static int luaK_code (FuncState *fs, Instruction i) {
+ Proto *f = fs->f;
+ dischargejpc(fs); /* `pc' will change */
+ /* put new instruction in code array */
+ luaM_growvector(fs->ls->L, f->code, fs->pc, f->sizecode, Instruction,
+ MAX_INT, "opcodes");
+ f->code[fs->pc] = i;
+ /* save corresponding line information */
+ luaM_growvector(fs->ls->L, f->lineinfo, fs->pc, f->sizelineinfo, int,
+ MAX_INT, "opcodes");
+ f->lineinfo[fs->pc] = fs->ls->lastline;
+ return fs->pc++;
+}
+
+
+int luaK_codeABC (FuncState *fs, OpCode o, int a, int b, int c) {
+ lua_assert(getOpMode(o) == iABC);
+ lua_assert(getBMode(o) != OpArgN || b == 0);
+ lua_assert(getCMode(o) != OpArgN || c == 0);
+ lua_assert(a <= MAXARG_A && b <= MAXARG_B && c <= MAXARG_C);
+ return luaK_code(fs, CREATE_ABC(o, a, b, c));
+}
+
+
+int luaK_codeABx (FuncState *fs, OpCode o, int a, unsigned int bc) {
+ lua_assert(getOpMode(o) == iABx || getOpMode(o) == iAsBx);
+ lua_assert(getCMode(o) == OpArgN);
+ lua_assert(a <= MAXARG_A && bc <= MAXARG_Bx);
+ return luaK_code(fs, CREATE_ABx(o, a, bc));
+}
+
+
+static int codeextraarg (FuncState *fs, int a) {
+ lua_assert(a <= MAXARG_Ax);
+ return luaK_code(fs, CREATE_Ax(OP_EXTRAARG, a));
+}
+
+
+int luaK_codek (FuncState *fs, int reg, int k) {
+ if (k <= MAXARG_Bx)
+ return luaK_codeABx(fs, OP_LOADK, reg, k);
+ else {
+ int p = luaK_codeABx(fs, OP_LOADKX, reg, 0);
+ codeextraarg(fs, k);
+ return p;
+ }
+}
+
+
+void luaK_checkstack (FuncState *fs, int n) {
+ int newstack = fs->freereg + n;
+ if (newstack > fs->f->maxstacksize) {
+ if (newstack >= MAXSTACK)
+ luaX_syntaxerror(fs->ls, "function or expression too complex");
+ fs->f->maxstacksize = cast_byte(newstack);
+ }
+}
+
+
+void luaK_reserveregs (FuncState *fs, int n) {
+ luaK_checkstack(fs, n);
+ fs->freereg += n;
+}
+
+
+static void freereg (FuncState *fs, int reg) {
+ if (!ISK(reg) && reg >= fs->nactvar) {
+ fs->freereg--;
+ lua_assert(reg == fs->freereg);
+ }
+}
+
+
+static void freeexp (FuncState *fs, expdesc *e) {
+ if (e->k == VNONRELOC)
+ freereg(fs, e->u.info);
+}
+
+
+static int addk (FuncState *fs, TValue *key, TValue *v) {
+ lua_State *L = fs->ls->L;
+ TValue *idx = luaH_set(L, fs->h, key);
+ Proto *f = fs->f;
+ int k, oldsize;
+ if (ttisnumber(idx)) {
+ lua_Number n = nvalue(idx);
+ lua_number2int(k, n);
+ if (luaV_rawequalobj(&f->k[k], v))
+ return k;
+ /* else may be a collision (e.g., between 0.0 and "\0\0\0\0\0\0\0\0");
+ go through and create a new entry for this value */
+ }
+ /* constant not found; create a new entry */
+ oldsize = f->sizek;
+ k = fs->nk;
+ /* numerical value does not need GC barrier;
+ table has no metatable, so it does not need to invalidate cache */
+ setnvalue(idx, cast_num(k));
+ luaM_growvector(L, f->k, k, f->sizek, TValue, MAXARG_Ax, "constants");
+ while (oldsize < f->sizek) setnilvalue(&f->k[oldsize++]);
+ setobj(L, &f->k[k], v);
+ fs->nk++;
+ luaC_barrier(L, f, v);
+ return k;
+}
+
+
+int luaK_stringK (FuncState *fs, TString *s) {
+ TValue o;
+ setsvalue(fs->ls->L, &o, s);
+ return addk(fs, &o, &o);
+}
+
+
+int luaK_numberK (FuncState *fs, lua_Number r) {
+ int n;
+ lua_State *L = fs->ls->L;
+ TValue o;
+ setnvalue(&o, r);
+ if (r == 0 || luai_numisnan(NULL, r)) { /* handle -0 and NaN */
+ /* use raw representation as key to avoid numeric problems */
+ setsvalue(L, L->top++, luaS_newlstr(L, (char *)&r, sizeof(r)));
+ n = addk(fs, L->top - 1, &o);
+ L->top--;
+ }
+ else
+ n = addk(fs, &o, &o); /* regular case */
+ return n;
+}
+
+
+static int boolK (FuncState *fs, int b) {
+ TValue o;
+ setbvalue(&o, b);
+ return addk(fs, &o, &o);
+}
+
+
+static int nilK (FuncState *fs) {
+ TValue k, v;
+ setnilvalue(&v);
+ /* cannot use nil as key; instead use table itself to represent nil */
+ sethvalue(fs->ls->L, &k, fs->h);
+ return addk(fs, &k, &v);
+}
+
+
+void luaK_setreturns (FuncState *fs, expdesc *e, int nresults) {
+ if (e->k == VCALL) { /* expression is an open function call? */
+ SETARG_C(getcode(fs, e), nresults+1);
+ }
+ else if (e->k == VVARARG) {
+ SETARG_B(getcode(fs, e), nresults+1);
+ SETARG_A(getcode(fs, e), fs->freereg);
+ luaK_reserveregs(fs, 1);
+ }
+}
+
+
+void luaK_setoneret (FuncState *fs, expdesc *e) {
+ if (e->k == VCALL) { /* expression is an open function call? */
+ e->k = VNONRELOC;
+ e->u.info = GETARG_A(getcode(fs, e));
+ }
+ else if (e->k == VVARARG) {
+ SETARG_B(getcode(fs, e), 2);
+ e->k = VRELOCABLE; /* can relocate its simple result */
+ }
+}
+
+
+void luaK_dischargevars (FuncState *fs, expdesc *e) {
+ switch (e->k) {
+ case VLOCAL: {
+ e->k = VNONRELOC;
+ break;
+ }
+ case VUPVAL: {
+ e->u.info = luaK_codeABC(fs, OP_GETUPVAL, 0, e->u.info, 0);
+ e->k = VRELOCABLE;
+ break;
+ }
+ case VINDEXED: {
+ OpCode op = OP_GETTABUP; /* assume 't' is in an upvalue */
+ freereg(fs, e->u.ind.idx);
+ if (e->u.ind.vt == VLOCAL) { /* 't' is in a register? */
+ freereg(fs, e->u.ind.t);
+ op = OP_GETTABLE;
+ }
+ e->u.info = luaK_codeABC(fs, op, 0, e->u.ind.t, e->u.ind.idx);
+ e->k = VRELOCABLE;
+ break;
+ }
+ case VVARARG:
+ case VCALL: {
+ luaK_setoneret(fs, e);
+ break;
+ }
+ default: break; /* there is one value available (somewhere) */
+ }
+}
+
+
+static int code_label (FuncState *fs, int A, int b, int jump) {
+ luaK_getlabel(fs); /* those instructions may be jump targets */
+ return luaK_codeABC(fs, OP_LOADBOOL, A, b, jump);
+}
+
+
+static void discharge2reg (FuncState *fs, expdesc *e, int reg) {
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VNIL: {
+ luaK_nil(fs, reg, 1);
+ break;
+ }
+ case VFALSE: case VTRUE: {
+ luaK_codeABC(fs, OP_LOADBOOL, reg, e->k == VTRUE, 0);
+ break;
+ }
+ case VK: {
+ luaK_codek(fs, reg, e->u.info);
+ break;
+ }
+ case VKNUM: {
+ luaK_codek(fs, reg, luaK_numberK(fs, e->u.nval));
+ break;
+ }
+ case VRELOCABLE: {
+ Instruction *pc = &getcode(fs, e);
+ SETARG_A(*pc, reg);
+ break;
+ }
+ case VNONRELOC: {
+ if (reg != e->u.info)
+ luaK_codeABC(fs, OP_MOVE, reg, e->u.info, 0);
+ break;
+ }
+ default: {
+ lua_assert(e->k == VVOID || e->k == VJMP);
+ return; /* nothing to do... */
+ }
+ }
+ e->u.info = reg;
+ e->k = VNONRELOC;
+}
+
+
+static void discharge2anyreg (FuncState *fs, expdesc *e) {
+ if (e->k != VNONRELOC) {
+ luaK_reserveregs(fs, 1);
+ discharge2reg(fs, e, fs->freereg-1);
+ }
+}
+
+
+static void exp2reg (FuncState *fs, expdesc *e, int reg) {
+ discharge2reg(fs, e, reg);
+ if (e->k == VJMP)
+ luaK_concat(fs, &e->t, e->u.info); /* put this jump in `t' list */
+ if (hasjumps(e)) {
+ int final; /* position after whole expression */
+ int p_f = NO_JUMP; /* position of an eventual LOAD false */
+ int p_t = NO_JUMP; /* position of an eventual LOAD true */
+ if (need_value(fs, e->t) || need_value(fs, e->f)) {
+ int fj = (e->k == VJMP) ? NO_JUMP : luaK_jump(fs);
+ p_f = code_label(fs, reg, 0, 1);
+ p_t = code_label(fs, reg, 1, 0);
+ luaK_patchtohere(fs, fj);
+ }
+ final = luaK_getlabel(fs);
+ patchlistaux(fs, e->f, final, reg, p_f);
+ patchlistaux(fs, e->t, final, reg, p_t);
+ }
+ e->f = e->t = NO_JUMP;
+ e->u.info = reg;
+ e->k = VNONRELOC;
+}
+
+
+void luaK_exp2nextreg (FuncState *fs, expdesc *e) {
+ luaK_dischargevars(fs, e);
+ freeexp(fs, e);
+ luaK_reserveregs(fs, 1);
+ exp2reg(fs, e, fs->freereg - 1);
+}
+
+
+int luaK_exp2anyreg (FuncState *fs, expdesc *e) {
+ luaK_dischargevars(fs, e);
+ if (e->k == VNONRELOC) {
+ if (!hasjumps(e)) return e->u.info; /* exp is already in a register */
+ if (e->u.info >= fs->nactvar) { /* reg. is not a local? */
+ exp2reg(fs, e, e->u.info); /* put value on it */
+ return e->u.info;
+ }
+ }
+ luaK_exp2nextreg(fs, e); /* default */
+ return e->u.info;
+}
+
+
+void luaK_exp2anyregup (FuncState *fs, expdesc *e) {
+ if (e->k != VUPVAL || hasjumps(e))
+ luaK_exp2anyreg(fs, e);
+}
+
+
+void luaK_exp2val (FuncState *fs, expdesc *e) {
+ if (hasjumps(e))
+ luaK_exp2anyreg(fs, e);
+ else
+ luaK_dischargevars(fs, e);
+}
+
+
+int luaK_exp2RK (FuncState *fs, expdesc *e) {
+ luaK_exp2val(fs, e);
+ switch (e->k) {
+ case VTRUE:
+ case VFALSE:
+ case VNIL: {
+ if (fs->nk <= MAXINDEXRK) { /* constant fits in RK operand? */
+ e->u.info = (e->k == VNIL) ? nilK(fs) : boolK(fs, (e->k == VTRUE));
+ e->k = VK;
+ return RKASK(e->u.info);
+ }
+ else break;
+ }
+ case VKNUM: {
+ e->u.info = luaK_numberK(fs, e->u.nval);
+ e->k = VK;
+ /* go through */
+ }
+ case VK: {
+ if (e->u.info <= MAXINDEXRK) /* constant fits in argC? */
+ return RKASK(e->u.info);
+ else break;
+ }
+ default: break;
+ }
+ /* not a constant in the right range: put it in a register */
+ return luaK_exp2anyreg(fs, e);
+}
+
+
+void luaK_storevar (FuncState *fs, expdesc *var, expdesc *ex) {
+ switch (var->k) {
+ case VLOCAL: {
+ freeexp(fs, ex);
+ exp2reg(fs, ex, var->u.info);
+ return;
+ }
+ case VUPVAL: {
+ int e = luaK_exp2anyreg(fs, ex);
+ luaK_codeABC(fs, OP_SETUPVAL, e, var->u.info, 0);
+ break;
+ }
+ case VINDEXED: {
+ OpCode op = (var->u.ind.vt == VLOCAL) ? OP_SETTABLE : OP_SETTABUP;
+ int e = luaK_exp2RK(fs, ex);
+ luaK_codeABC(fs, op, var->u.ind.t, var->u.ind.idx, e);
+ break;
+ }
+ default: {
+ lua_assert(0); /* invalid var kind to store */
+ break;
+ }
+ }
+ freeexp(fs, ex);
+}
+
+
+void luaK_self (FuncState *fs, expdesc *e, expdesc *key) {
+ int ereg;
+ luaK_exp2anyreg(fs, e);
+ ereg = e->u.info; /* register where 'e' was placed */
+ freeexp(fs, e);
+ e->u.info = fs->freereg; /* base register for op_self */
+ e->k = VNONRELOC;
+ luaK_reserveregs(fs, 2); /* function and 'self' produced by op_self */
+ luaK_codeABC(fs, OP_SELF, e->u.info, ereg, luaK_exp2RK(fs, key));
+ freeexp(fs, key);
+}
+
+
+static void invertjump (FuncState *fs, expdesc *e) {
+ Instruction *pc = getjumpcontrol(fs, e->u.info);
+ lua_assert(testTMode(GET_OPCODE(*pc)) && GET_OPCODE(*pc) != OP_TESTSET &&
+ GET_OPCODE(*pc) != OP_TEST);
+ SETARG_A(*pc, !(GETARG_A(*pc)));
+}
+
+
+static int jumponcond (FuncState *fs, expdesc *e, int cond) {
+ if (e->k == VRELOCABLE) {
+ Instruction ie = getcode(fs, e);
+ if (GET_OPCODE(ie) == OP_NOT) {
+ fs->pc--; /* remove previous OP_NOT */
+ return condjump(fs, OP_TEST, GETARG_B(ie), 0, !cond);
+ }
+ /* else go through */
+ }
+ discharge2anyreg(fs, e);
+ freeexp(fs, e);
+ return condjump(fs, OP_TESTSET, NO_REG, e->u.info, cond);
+}
+
+
+void luaK_goiftrue (FuncState *fs, expdesc *e) {
+ int pc; /* pc of last jump */
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VJMP: {
+ invertjump(fs, e);
+ pc = e->u.info;
+ break;
+ }
+ case VK: case VKNUM: case VTRUE: {
+ pc = NO_JUMP; /* always true; do nothing */
+ break;
+ }
+ default: {
+ pc = jumponcond(fs, e, 0);
+ break;
+ }
+ }
+ luaK_concat(fs, &e->f, pc); /* insert last jump in `f' list */
+ luaK_patchtohere(fs, e->t);
+ e->t = NO_JUMP;
+}
+
+
+void luaK_goiffalse (FuncState *fs, expdesc *e) {
+ int pc; /* pc of last jump */
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VJMP: {
+ pc = e->u.info;
+ break;
+ }
+ case VNIL: case VFALSE: {
+ pc = NO_JUMP; /* always false; do nothing */
+ break;
+ }
+ default: {
+ pc = jumponcond(fs, e, 1);
+ break;
+ }
+ }
+ luaK_concat(fs, &e->t, pc); /* insert last jump in `t' list */
+ luaK_patchtohere(fs, e->f);
+ e->f = NO_JUMP;
+}
+
+
+static void codenot (FuncState *fs, expdesc *e) {
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VNIL: case VFALSE: {
+ e->k = VTRUE;
+ break;
+ }
+ case VK: case VKNUM: case VTRUE: {
+ e->k = VFALSE;
+ break;
+ }
+ case VJMP: {
+ invertjump(fs, e);
+ break;
+ }
+ case VRELOCABLE:
+ case VNONRELOC: {
+ discharge2anyreg(fs, e);
+ freeexp(fs, e);
+ e->u.info = luaK_codeABC(fs, OP_NOT, 0, e->u.info, 0);
+ e->k = VRELOCABLE;
+ break;
+ }
+ default: {
+ lua_assert(0); /* cannot happen */
+ break;
+ }
+ }
+ /* interchange true and false lists */
+ { int temp = e->f; e->f = e->t; e->t = temp; }
+ removevalues(fs, e->f);
+ removevalues(fs, e->t);
+}
+
+
+void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k) {
+ lua_assert(!hasjumps(t));
+ t->u.ind.t = t->u.info;
+ t->u.ind.idx = luaK_exp2RK(fs, k);
+ t->u.ind.vt = (t->k == VUPVAL) ? VUPVAL
+ : check_exp(vkisinreg(t->k), VLOCAL);
+ t->k = VINDEXED;
+}
+
+
+static int constfolding (OpCode op, expdesc *e1, expdesc *e2) {
+ lua_Number r;
+ if (!isnumeral(e1) || !isnumeral(e2)) return 0;
+ if ((op == OP_DIV || op == OP_MOD) && e2->u.nval == 0)
+ return 0; /* do not attempt to divide by 0 */
+ /*
+ * Patched: check for MIN_INT / -1
+ */
+ if (op == OP_DIV && e1->u.nval == INT64_MIN && e2->u.nval == -1)
+ return 0;
+ r = luaO_arith(op - OP_ADD + LUA_OPADD, e1->u.nval, e2->u.nval);
+ e1->u.nval = r;
+ return 1;
+}
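Editor's note: the "Patched" guard above exists because folding a constant MIN_INT / -1 at compile time would perform the one signed 64-bit division that overflows: the mathematical result, 2^63, is not representable in int64_t, which is undefined behaviour in C and typically traps on x86 (SIGFPE in user space). A minimal sketch of the same guard an embedder might apply before any signed 64-bit division (names invented):

  #include <stdint.h>

  /* Divide a by b, refusing the two cases that are undefined for int64_t:
   * division by zero and INT64_MIN / -1. Returns 0 on rejection. */
  static int safe_div64(int64_t a, int64_t b, int64_t *out) {
    if (b == 0 || (a == INT64_MIN && b == -1))
      return 0;                 /* caller must handle the error */
    *out = a / b;
    return 1;
  }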
+
+
+static void codearith (FuncState *fs, OpCode op,
+ expdesc *e1, expdesc *e2, int line) {
+ if (constfolding(op, e1, e2))
+ return;
+ else {
+ int o2 = (op != OP_UNM && op != OP_LEN) ? luaK_exp2RK(fs, e2) : 0;
+ int o1 = luaK_exp2RK(fs, e1);
+ if (o1 > o2) {
+ freeexp(fs, e1);
+ freeexp(fs, e2);
+ }
+ else {
+ freeexp(fs, e2);
+ freeexp(fs, e1);
+ }
+ e1->u.info = luaK_codeABC(fs, op, 0, o1, o2);
+ e1->k = VRELOCABLE;
+ luaK_fixline(fs, line);
+ }
+}
+
+
+static void codecomp (FuncState *fs, OpCode op, int cond, expdesc *e1,
+ expdesc *e2) {
+ int o1 = luaK_exp2RK(fs, e1);
+ int o2 = luaK_exp2RK(fs, e2);
+ freeexp(fs, e2);
+ freeexp(fs, e1);
+ if (cond == 0 && op != OP_EQ) {
+ int temp; /* exchange args to replace by `<' or `<=' */
+ temp = o1; o1 = o2; o2 = temp; /* o1 <==> o2 */
+ cond = 1;
+ }
+ e1->u.info = condjump(fs, op, cond, o1, o2);
+ e1->k = VJMP;
+}
+
+
+void luaK_prefix (FuncState *fs, UnOpr op, expdesc *e, int line) {
+ expdesc e2;
+ e2.t = e2.f = NO_JUMP; e2.k = VKNUM; e2.u.nval = 0;
+ switch (op) {
+ case OPR_MINUS: {
+ if (isnumeral(e)) /* minus constant? */
+ e->u.nval = luai_numunm(NULL, e->u.nval); /* fold it */
+ else {
+ luaK_exp2anyreg(fs, e);
+ codearith(fs, OP_UNM, e, &e2, line);
+ }
+ break;
+ }
+ case OPR_NOT: codenot(fs, e); break;
+ case OPR_LEN: {
+ luaK_exp2anyreg(fs, e); /* cannot operate on constants */
+ codearith(fs, OP_LEN, e, &e2, line);
+ break;
+ }
+ default: lua_assert(0);
+ }
+}
+
+
+void luaK_infix (FuncState *fs, BinOpr op, expdesc *v) {
+ switch (op) {
+ case OPR_AND: {
+ luaK_goiftrue(fs, v);
+ break;
+ }
+ case OPR_OR: {
+ luaK_goiffalse(fs, v);
+ break;
+ }
+ case OPR_CONCAT: {
+ luaK_exp2nextreg(fs, v); /* operand must be on the `stack' */
+ break;
+ }
+ case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV:
+ case OPR_MOD: case OPR_POW: {
+ if (!isnumeral(v)) luaK_exp2RK(fs, v);
+ break;
+ }
+ default: {
+ luaK_exp2RK(fs, v);
+ break;
+ }
+ }
+}
+
+
+void luaK_posfix (FuncState *fs, BinOpr op,
+ expdesc *e1, expdesc *e2, int line) {
+ switch (op) {
+ case OPR_AND: {
+ lua_assert(e1->t == NO_JUMP); /* list must be closed */
+ luaK_dischargevars(fs, e2);
+ luaK_concat(fs, &e2->f, e1->f);
+ *e1 = *e2;
+ break;
+ }
+ case OPR_OR: {
+ lua_assert(e1->f == NO_JUMP); /* list must be closed */
+ luaK_dischargevars(fs, e2);
+ luaK_concat(fs, &e2->t, e1->t);
+ *e1 = *e2;
+ break;
+ }
+ case OPR_CONCAT: {
+ luaK_exp2val(fs, e2);
+ if (e2->k == VRELOCABLE && GET_OPCODE(getcode(fs, e2)) == OP_CONCAT) {
+ lua_assert(e1->u.info == GETARG_B(getcode(fs, e2))-1);
+ freeexp(fs, e1);
+ SETARG_B(getcode(fs, e2), e1->u.info);
+ e1->k = VRELOCABLE; e1->u.info = e2->u.info;
+ }
+ else {
+ luaK_exp2nextreg(fs, e2); /* operand must be on the 'stack' */
+ codearith(fs, OP_CONCAT, e1, e2, line);
+ }
+ break;
+ }
+ case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV:
+ case OPR_MOD: case OPR_POW: {
+ codearith(fs, cast(OpCode, op - OPR_ADD + OP_ADD), e1, e2, line);
+ break;
+ }
+ case OPR_EQ: case OPR_LT: case OPR_LE: {
+ codecomp(fs, cast(OpCode, op - OPR_EQ + OP_EQ), 1, e1, e2);
+ break;
+ }
+ case OPR_NE: case OPR_GT: case OPR_GE: {
+ codecomp(fs, cast(OpCode, op - OPR_NE + OP_EQ), 0, e1, e2);
+ break;
+ }
+ default: lua_assert(0);
+ }
+}
+
+
+void luaK_fixline (FuncState *fs, int line) {
+ fs->f->lineinfo[fs->pc - 1] = line;
+}
+
+
+void luaK_setlist (FuncState *fs, int base, int nelems, int tostore) {
+ int c = (nelems - 1)/LFIELDS_PER_FLUSH + 1;
+ int b = (tostore == LUA_MULTRET) ? 0 : tostore;
+ lua_assert(tostore != 0);
+ if (c <= MAXARG_C)
+ luaK_codeABC(fs, OP_SETLIST, base, b, c);
+ else if (c <= MAXARG_Ax) {
+ luaK_codeABC(fs, OP_SETLIST, base, b, 0);
+ codeextraarg(fs, c);
+ }
+ else
+ luaX_syntaxerror(fs->ls, "constructor too long");
+ fs->freereg = base + 1; /* free registers with list values */
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lcode.h b/sys/contrib/openzfs/module/lua/lcode.h
new file mode 100644
index 000000000000..fd5fad00df3d
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lcode.h
@@ -0,0 +1,85 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lcode.h,v 1.58.1.1 2013/04/12 18:48:47 roberto Exp $
+** Code generator for Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lcode_h
+#define lcode_h
+
+#include "llex.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+
+
+/*
+** Marks the end of a patch list. It is an invalid value both as an absolute
+** address, and as a list link (would link an element to itself).
+*/
+#define NO_JUMP (-1)
+
+
+/*
+** grep "ORDER OPR" if you change these enums (ORDER OP)
+*/
+typedef enum BinOpr {
+ OPR_ADD, OPR_SUB, OPR_MUL, OPR_DIV, OPR_MOD, OPR_POW,
+ OPR_CONCAT,
+ OPR_EQ, OPR_LT, OPR_LE,
+ OPR_NE, OPR_GT, OPR_GE,
+ OPR_AND, OPR_OR,
+ OPR_NOBINOPR
+} BinOpr;
+
+
+typedef enum UnOpr { OPR_MINUS, OPR_NOT, OPR_LEN, OPR_NOUNOPR } UnOpr;
+
+
+#define getcode(fs,e) ((fs)->f->code[(e)->u.info])
+
+#define luaK_codeAsBx(fs,o,A,sBx) luaK_codeABx(fs,o,A,(sBx)+MAXARG_sBx)
+
+#define luaK_setmultret(fs,e) luaK_setreturns(fs, e, LUA_MULTRET)
+
+#define luaK_jumpto(fs,t) luaK_patchlist(fs, luaK_jump(fs), t)
+
+LUAI_FUNC int luaK_codeABx (FuncState *fs, OpCode o, int A, unsigned int Bx);
+LUAI_FUNC int luaK_codeABC (FuncState *fs, OpCode o, int A, int B, int C);
+LUAI_FUNC int luaK_codek (FuncState *fs, int reg, int k);
+LUAI_FUNC void luaK_fixline (FuncState *fs, int line);
+LUAI_FUNC void luaK_nil (FuncState *fs, int from, int n);
+LUAI_FUNC void luaK_reserveregs (FuncState *fs, int n);
+LUAI_FUNC void luaK_checkstack (FuncState *fs, int n);
+LUAI_FUNC int luaK_stringK (FuncState *fs, TString *s);
+LUAI_FUNC int luaK_numberK (FuncState *fs, lua_Number r);
+LUAI_FUNC void luaK_dischargevars (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_exp2anyreg (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2anyregup (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2nextreg (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2val (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_exp2RK (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_self (FuncState *fs, expdesc *e, expdesc *key);
+LUAI_FUNC void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k);
+LUAI_FUNC void luaK_goiftrue (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_goiffalse (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_storevar (FuncState *fs, expdesc *var, expdesc *e);
+LUAI_FUNC void luaK_setreturns (FuncState *fs, expdesc *e, int nresults);
+LUAI_FUNC void luaK_setoneret (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_jump (FuncState *fs);
+LUAI_FUNC void luaK_ret (FuncState *fs, int first, int nret);
+LUAI_FUNC void luaK_patchlist (FuncState *fs, int list, int target);
+LUAI_FUNC void luaK_patchtohere (FuncState *fs, int list);
+LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level);
+LUAI_FUNC void luaK_concat (FuncState *fs, int *l1, int l2);
+LUAI_FUNC int luaK_getlabel (FuncState *fs);
+LUAI_FUNC void luaK_prefix (FuncState *fs, UnOpr op, expdesc *v, int line);
+LUAI_FUNC void luaK_infix (FuncState *fs, BinOpr op, expdesc *v);
+LUAI_FUNC void luaK_posfix (FuncState *fs, BinOpr op, expdesc *v1,
+ expdesc *v2, int line);
+LUAI_FUNC void luaK_setlist (FuncState *fs, int base, int nelems, int tostore);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lcompat.c b/sys/contrib/openzfs/module/lua/lcompat.c
new file mode 100644
index 000000000000..c0a27182c7d8
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lcompat.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/lua/lua.h>
+
+
+ssize_t
+lcompat_sprintf(char *buf, size_t size, const char *fmt, ...)
+{
+ ssize_t res;
+ va_list args;
+
+ va_start(args, fmt);
+ res = vsnprintf(buf, size, fmt, args);
+ va_end(args);
+
+ return (res);
+}
+
+int64_t
+lcompat_strtoll(const char *str, char **ptr)
+{
+ int base;
+ const char *cp;
+ int digits;
+ int64_t value;
+ boolean_t is_negative;
+
+ cp = str;
+ while (*cp == ' ' || *cp == '\t' || *cp == '\n') {
+ cp++;
+ }
+ is_negative = (*cp == '-');
+ if (is_negative) {
+ cp++;
+ }
+ base = 10;
+
+ if (*cp == '0') {
+ base = 8;
+ cp++;
+ if (*cp == 'x' || *cp == 'X') {
+ base = 16;
+ cp++;
+ }
+ }
+
+ value = 0;
+ for (; *cp != '\0'; cp++) {
+ if (*cp >= '0' && *cp <= '9') {
+ digits = *cp - '0';
+ } else if (*cp >= 'a' && *cp <= 'f') {
+ digits = *cp - 'a' + 10;
+ } else if (*cp >= 'A' && *cp <= 'F') {
+ digits = *cp - 'A' + 10;
+ } else {
+ break;
+ }
+ if (digits >= base) {
+ break;
+ }
+ value = (value * base) + digits;
+ }
+
+ if (ptr != NULL) {
+ *ptr = (char *)cp;
+ }
+ if (is_negative) {
+ value = -value;
+ }
+ return (value);
+}
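Editor's note: lcompat_strtoll is a freestanding strtoll replacement with C-style prefix detection: a leading "0" selects octal and "0x"/"0X" selects hexadecimal, otherwise decimal; parsing stops at the first character that is not a valid digit for the chosen base. A few hypothetical sanity checks illustrating that behaviour (the extern declaration simply mirrors the definition above):

  #include <assert.h>
  #include <stdint.h>

  extern int64_t lcompat_strtoll(const char *str, char **ptr);

  static void strtoll_examples(void) {
    assert(lcompat_strtoll("42", NULL) == 42);      /* decimal */
    assert(lcompat_strtoll("052", NULL) == 42);     /* leading 0: octal */
    assert(lcompat_strtoll("0x2a", NULL) == 42);    /* 0x prefix: hexadecimal */
    assert(lcompat_strtoll("-0x10", NULL) == -16);  /* leading '-' handled */
    assert(lcompat_strtoll("42abc", NULL) == 42);   /* stops at first bad digit */
  }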
+
+int64_t
+lcompat_pow(int64_t x, int64_t y)
+{
+ int64_t result = 1;
+ if (y < 0)
+ return (0);
+
+ while (y) {
+ if (y & 1)
+ result *= x;
+ y >>= 1;
+ x *= x;
+ }
+ return (result);
+}
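Editor's note: lcompat_pow implements exponentiation by squaring over int64_t: the exponent is consumed bit by bit, the base is squared every round, and the running result is multiplied in only when the current bit is set (a negative exponent yields 0, and overflow silently wraps). Worked trace for lcompat_pow(3, 5), exponent 5 = binary 101:

  y = 5 (bit set):   result = 1 * 3  = 3;    x = 3 * 3 = 9
  y = 2 (bit clear): result          = 3;    x = 9 * 9 = 81
  y = 1 (bit set):   result = 3 * 81 = 243;  x squared once more, unused
  y = 0: loop ends;  lcompat_pow(3, 5) == 243 == 3^5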
+
+int
+lcompat_hashnum(int64_t x)
+{
+ x = (~x) + (x << 18);
+ x = x ^ (x >> 31);
+ x = x * 21;
+ x = x ^ (x >> 11);
+ x = x + (x << 6);
+ x = x ^ (x >> 22);
+ return ((int)x);
+}
diff --git a/sys/contrib/openzfs/module/lua/lcorolib.c b/sys/contrib/openzfs/module/lua/lcorolib.c
new file mode 100644
index 000000000000..0300e7ee17d5
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lcorolib.c
@@ -0,0 +1,159 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lcorolib.c,v 1.5.1.1 2013/04/12 18:48:47 roberto Exp $
+** Coroutine Library
+** See Copyright Notice in lua.h
+*/
+
+
+#define lcorolib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+#include <sys/lua/lualib.h>
+
+
+static int auxresume (lua_State *L, lua_State *co, int narg) {
+ int status;
+ if (!lua_checkstack(co, narg)) {
+ lua_pushliteral(L, "too many arguments to resume");
+ return -1; /* error flag */
+ }
+ if (lua_status(co) == LUA_OK && lua_gettop(co) == 0) {
+ lua_pushliteral(L, "cannot resume dead coroutine");
+ return -1; /* error flag */
+ }
+ lua_xmove(L, co, narg);
+ status = lua_resume(co, L, narg);
+ if (status == LUA_OK || status == LUA_YIELD) {
+ int nres = lua_gettop(co);
+ if (!lua_checkstack(L, nres + 1)) {
+ lua_pop(co, nres); /* remove results anyway */
+ lua_pushliteral(L, "too many results to resume");
+ return -1; /* error flag */
+ }
+ lua_xmove(co, L, nres); /* move yielded values */
+ return nres;
+ }
+ else {
+ lua_xmove(co, L, 1); /* move error message */
+ return -1; /* error flag */
+ }
+}
+
+
+static int luaB_coresume (lua_State *L) {
+ lua_State *co = lua_tothread(L, 1);
+ int r;
+ luaL_argcheck(L, co, 1, "coroutine expected");
+ r = auxresume(L, co, lua_gettop(L) - 1);
+ if (r < 0) {
+ lua_pushboolean(L, 0);
+ lua_insert(L, -2);
+ return 2; /* return false + error message */
+ }
+ else {
+ lua_pushboolean(L, 1);
+ lua_insert(L, -(r + 1));
+ return r + 1; /* return true + 'resume' returns */
+ }
+}
+
+
+static int luaB_auxwrap (lua_State *L) {
+ lua_State *co = lua_tothread(L, lua_upvalueindex(1));
+ int r = auxresume(L, co, lua_gettop(L));
+ if (r < 0) {
+ if (lua_isstring(L, -1)) { /* error object is a string? */
+ luaL_where(L, 1); /* add extra info */
+ lua_insert(L, -2);
+ lua_concat(L, 2);
+ }
+ return lua_error(L); /* propagate error */
+ }
+ return r;
+}
+
+
+static int luaB_cocreate (lua_State *L) {
+ lua_State *NL;
+ luaL_checktype(L, 1, LUA_TFUNCTION);
+ NL = lua_newthread(L);
+ lua_pushvalue(L, 1); /* move function to top */
+ lua_xmove(L, NL, 1); /* move function from L to NL */
+ return 1;
+}
+
+
+static int luaB_cowrap (lua_State *L) {
+ luaB_cocreate(L);
+ lua_pushcclosure(L, luaB_auxwrap, 1);
+ return 1;
+}
+
+
+static int luaB_yield (lua_State *L) {
+ return lua_yield(L, lua_gettop(L));
+}
+
+
+static int luaB_costatus (lua_State *L) {
+ lua_State *co = lua_tothread(L, 1);
+ luaL_argcheck(L, co, 1, "coroutine expected");
+ if (L == co) lua_pushliteral(L, "running");
+ else {
+ switch (lua_status(co)) {
+ case LUA_YIELD:
+ lua_pushliteral(L, "suspended");
+ break;
+ case LUA_OK: {
+ lua_Debug ar;
+ if (lua_getstack(co, 0, &ar) > 0) /* does it have frames? */
+ lua_pushliteral(L, "normal"); /* it is running */
+ else if (lua_gettop(co) == 0)
+ lua_pushliteral(L, "dead");
+ else
+ lua_pushliteral(L, "suspended"); /* initial state */
+ break;
+ }
+ default: /* some error occurred */
+ lua_pushliteral(L, "dead");
+ break;
+ }
+ }
+ return 1;
+}
+
+
+static int luaB_corunning (lua_State *L) {
+ int ismain = lua_pushthread(L);
+ lua_pushboolean(L, ismain);
+ return 2;
+}
+
+
+static const luaL_Reg co_funcs[] = {
+ {"create", luaB_cocreate},
+ {"resume", luaB_coresume},
+ {"running", luaB_corunning},
+ {"status", luaB_costatus},
+ {"wrap", luaB_cowrap},
+ {"yield", luaB_yield},
+ {NULL, NULL}
+};
+
+
+
+LUAMOD_API int luaopen_coroutine (lua_State *L) {
+ luaL_newlib(L, co_funcs);
+ return 1;
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaopen_coroutine);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lctype.c b/sys/contrib/openzfs/module/lua/lctype.c
new file mode 100644
index 000000000000..028d278ae4da
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lctype.c
@@ -0,0 +1,52 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lctype.c,v 1.11.1.1 2013/04/12 18:48:47 roberto Exp $
+** 'ctype' functions for Lua
+** See Copyright Notice in lua.h
+*/
+
+#define lctype_c
+#define LUA_CORE
+
+#include "lctype.h"
+
+#if !LUA_USE_CTYPE /* { */
+
+LUAI_DDEF const lu_byte luai_ctype_[UCHAR_MAX + 2] = {
+ 0x00, /* EOZ */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0. */
+ 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x0c, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, /* 2. */
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, /* 3. */
+ 0x16, 0x16, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 4. */
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 5. */
+ 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x05,
+ 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 6. */
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 7. */
+ 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 9. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* a. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* b. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* c. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* d. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* e. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* f. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+#endif /* } */
+/* END CSTYLED */
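
Each entry in luai_ctype_ above is a bitmask over the property bits declared in lctype.h: 0x04 is printable, 0x05 printable+alphabetic, 0x15 additionally sets the hex-digit bit (A-F, a-f), 0x16 is printable+digit+hex-digit, 0x08 marks the whitespace control characters, 0x0c is printable+space, and index 0 classifies EOZ (-1), which is why every lookup adds 1. A minimal standalone sketch of the same scheme follows, with made-up contents rather than the real table:

#include <stdio.h>

#define ALPHABIT 0
#define DIGITBIT 1
#define MASK(B)  (1 << (B))

/* index 0 is reserved for EOZ (-1), hence the "+ 1" on every lookup */
static unsigned char ctype_demo[257];

int main(void) {
  int c;
  for (c = '0'; c <= '9'; c++) ctype_demo[c + 1] |= MASK(DIGITBIT);
  for (c = 'A'; c <= 'Z'; c++) ctype_demo[c + 1] |= MASK(ALPHABIT);
  for (c = 'a'; c <= 'z'; c++) ctype_demo[c + 1] |= MASK(ALPHABIT);
  ctype_demo['_' + 1] |= MASK(ALPHABIT);   /* '_' is alphabetic for Lua */
  printf("'7' is a digit: %d\n", (ctype_demo['7' + 1] & MASK(DIGITBIT)) != 0);
  printf("'_' is alpha:   %d\n", (ctype_demo['_' + 1] & MASK(ALPHABIT)) != 0);
  return 0;
}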
diff --git a/sys/contrib/openzfs/module/lua/lctype.h b/sys/contrib/openzfs/module/lua/lctype.h
new file mode 100644
index 000000000000..b16b6bc7dab3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lctype.h
@@ -0,0 +1,94 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lctype.h,v 1.12.1.1 2013/04/12 18:48:47 roberto Exp $
+** 'ctype' functions for Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lctype_h
+#define lctype_h
+
+#include <sys/lua/lua.h>
+
+
+/*
+** WARNING: the functions defined here do not necessarily correspond
+** to the similar functions in the standard C ctype.h. They are
+** optimized for the specific needs of Lua
+*/
+
+#if !defined(LUA_USE_CTYPE)
+
+#if 'A' == 65 && '0' == 48
+/* ASCII case: can use its own tables; faster and fixed */
+#define LUA_USE_CTYPE 0
+#else
+/* must use standard C ctype */
+#define LUA_USE_CTYPE 1
+#endif
+
+#endif
+
+
+#if !LUA_USE_CTYPE /* { */
+
+#include "llimits.h"
+
+
+#define ALPHABIT 0
+#define DIGITBIT 1
+#define PRINTBIT 2
+#define SPACEBIT 3
+#define XDIGITBIT 4
+
+
+#define MASK(B) (1 << (B))
+
+
+/*
+** add 1 to char to allow index -1 (EOZ)
+*/
+#define testprop(c,p) (luai_ctype_[(lu_byte)(c)+1] & (p))
+
+/*
+** 'lalpha' (Lua alphabetic) and 'lalnum' (Lua alphanumeric) both include '_'
+*/
+#define lislalpha(c) testprop(c, MASK(ALPHABIT))
+#define lislalnum(c) testprop(c, (MASK(ALPHABIT) | MASK(DIGITBIT)))
+#define lisdigit(c) testprop(c, MASK(DIGITBIT))
+#define lisspace(c) testprop(c, MASK(SPACEBIT))
+#define lisprint(c) testprop(c, MASK(PRINTBIT))
+#define lisxdigit(c) testprop(c, MASK(XDIGITBIT))
+
+/*
+** this 'ltolower' only works for alphabetic characters
+*/
+#define ltolower(c) ((c) | ('A' ^ 'a'))
+
+
+/* two more entries for 0 and -1 (EOZ) */
+LUAI_DDEC const lu_byte luai_ctype_[UCHAR_MAX + 2];
+
+
+#else /* }{ */
+
+/*
+** use standard C ctypes
+*/
+
+#include <ctype.h>
+
+
+#define lislalpha(c) (isalpha(c) || (c) == '_')
+#define lislalnum(c) (isalnum(c) || (c) == '_')
+#define lisdigit(c) (isdigit(c))
+#define lisspace(c) (isspace(c))
+#define lisprint(c) (isprint(c))
+#define lisxdigit(c) (isxdigit(c))
+
+#define ltolower(c) (tolower(c))
+
+#endif /* } */
+
+#endif
+/* END CSTYLED */
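
The ltolower() macro above works because ASCII upper and lower case differ only in one bit ('A' ^ 'a' == 0x20), so OR-ing that bit in lower-cases a letter; as the comment warns, the result is only meaningful for characters already known to be alphabetic. A tiny standalone demo of the trick, assuming a hosted C compiler outside the kernel tree:

#include <stdio.h>

#define ltolower_demo(c)  ((c) | ('A' ^ 'a'))   /* same trick as ltolower() */

int main(void) {
  printf("%c %c\n", ltolower_demo('G'), ltolower_demo('g'));  /* g g */
  printf("case bit: 0x%x\n", 'A' ^ 'a');                      /* 0x20 */
  return 0;
}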
diff --git a/sys/contrib/openzfs/module/lua/ldebug.c b/sys/contrib/openzfs/module/lua/ldebug.c
new file mode 100644
index 000000000000..da005c44376e
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ldebug.c
@@ -0,0 +1,608 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ldebug.c,v 2.90.1.4 2015/02/19 17:05:13 roberto Exp $
+** Debug Interface
+** See Copyright Notice in lua.h
+*/
+
+
+#define ldebug_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lapi.h"
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+
+
+#define noLuaClosure(f) ((f) == NULL || (f)->c.tt == LUA_TCCL)
+
+
+static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name);
+
+
+static int currentpc (CallInfo *ci) {
+ lua_assert(isLua(ci));
+ return pcRel(ci->u.l.savedpc, ci_func(ci)->p);
+}
+
+
+static int currentline (CallInfo *ci) {
+ return getfuncline(ci_func(ci)->p, currentpc(ci));
+}
+
+
+static void swapextra (lua_State *L) {
+ if (L->status == LUA_YIELD) {
+ CallInfo *ci = L->ci; /* get function that yielded */
+ StkId temp = ci->func; /* exchange its 'func' and 'extra' values */
+ ci->func = restorestack(L, ci->extra);
+ ci->extra = savestack(L, temp);
+ }
+}
+
+
+/*
+** this function can be called asynchronously (e.g. during a signal)
+*/
+LUA_API int lua_sethook (lua_State *L, lua_Hook func, int mask, int count) {
+ if (func == NULL || mask == 0) { /* turn off hooks? */
+ mask = 0;
+ func = NULL;
+ }
+ if (isLua(L->ci))
+ L->oldpc = L->ci->u.l.savedpc;
+ L->hook = func;
+ L->basehookcount = count;
+ resethookcount(L);
+ L->hookmask = cast_byte(mask);
+ return 1;
+}
+
+
+LUA_API lua_Hook lua_gethook (lua_State *L) {
+ return L->hook;
+}
+
+
+LUA_API int lua_gethookmask (lua_State *L) {
+ return L->hookmask;
+}
+
+
+LUA_API int lua_gethookcount (lua_State *L) {
+ return L->basehookcount;
+}
+
+
+LUA_API int lua_getstack (lua_State *L, int level, lua_Debug *ar) {
+ int status;
+ CallInfo *ci;
+ if (level < 0) return 0; /* invalid (negative) level */
+ lua_lock(L);
+ for (ci = L->ci; level > 0 && ci != &L->base_ci; ci = ci->previous)
+ level--;
+ if (level == 0 && ci != &L->base_ci) { /* level found? */
+ status = 1;
+ ar->i_ci = ci;
+ }
+ else status = 0; /* no such level */
+ lua_unlock(L);
+ return status;
+}
+
+
+static const char *upvalname (Proto *p, int uv) {
+ TString *s = check_exp(uv < p->sizeupvalues, p->upvalues[uv].name);
+ if (s == NULL) return "?";
+ else return getstr(s);
+}
+
+
+static const char *findvararg (CallInfo *ci, int n, StkId *pos) {
+ int nparams = clLvalue(ci->func)->p->numparams;
+ if (n >= ci->u.l.base - ci->func - nparams)
+ return NULL; /* no such vararg */
+ else {
+ *pos = ci->func + nparams + n;
+ return "(*vararg)"; /* generic name for any vararg */
+ }
+}
+
+
+static const char *findlocal (lua_State *L, CallInfo *ci, int n,
+ StkId *pos) {
+ const char *name = NULL;
+ StkId base;
+ if (isLua(ci)) {
+ if (n < 0) /* access to vararg values? */
+ return findvararg(ci, -n, pos);
+ else {
+ base = ci->u.l.base;
+ name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));
+ }
+ }
+ else
+ base = ci->func + 1;
+ if (name == NULL) { /* no 'standard' name? */
+ StkId limit = (ci == L->ci) ? L->top : ci->next->func;
+ if (limit - base >= n && n > 0) /* is 'n' inside 'ci' stack? */
+ name = "(*temporary)"; /* generic name for any valid slot */
+ else
+ return NULL; /* no name */
+ }
+ *pos = base + (n - 1);
+ return name;
+}
+
+
+LUA_API const char *lua_getlocal (lua_State *L, const lua_Debug *ar, int n) {
+ const char *name;
+ lua_lock(L);
+ swapextra(L);
+ if (ar == NULL) { /* information about non-active function? */
+ if (!isLfunction(L->top - 1)) /* not a Lua function? */
+ name = NULL;
+ else /* consider live variables at function start (parameters) */
+ name = luaF_getlocalname(clLvalue(L->top - 1)->p, n, 0);
+ }
+ else { /* active function; get information through 'ar' */
+ StkId pos = 0; /* to avoid warnings */
+ name = findlocal(L, ar->i_ci, n, &pos);
+ if (name) {
+ setobj2s(L, L->top, pos);
+ api_incr_top(L);
+ }
+ }
+ swapextra(L);
+ lua_unlock(L);
+ return name;
+}
+
+
+LUA_API const char *lua_setlocal (lua_State *L, const lua_Debug *ar, int n) {
+ StkId pos = 0; /* to avoid warnings */
+ const char *name;
+ lua_lock(L);
+ swapextra(L);
+ name = findlocal(L, ar->i_ci, n, &pos);
+ if (name)
+ setobjs2s(L, pos, L->top - 1);
+ L->top--; /* pop value */
+ swapextra(L);
+ lua_unlock(L);
+ return name;
+}
+
+
+static void funcinfo (lua_Debug *ar, Closure *cl) {
+ if (noLuaClosure(cl)) {
+ ar->source = "=[C]";
+ ar->linedefined = -1;
+ ar->lastlinedefined = -1;
+ ar->what = "C";
+ }
+ else {
+ Proto *p = cl->l.p;
+ ar->source = p->source ? getstr(p->source) : "=?";
+ ar->linedefined = p->linedefined;
+ ar->lastlinedefined = p->lastlinedefined;
+ ar->what = (ar->linedefined == 0) ? "main" : "Lua";
+ }
+ luaO_chunkid(ar->short_src, ar->source, LUA_IDSIZE);
+}
+
+
+static void collectvalidlines (lua_State *L, Closure *f) {
+ if (noLuaClosure(f)) {
+ setnilvalue(L->top);
+ api_incr_top(L);
+ }
+ else {
+ int i;
+ TValue v;
+ int *lineinfo = f->l.p->lineinfo;
+ Table *t = luaH_new(L); /* new table to store active lines */
+ sethvalue(L, L->top, t); /* push it on stack */
+ api_incr_top(L);
+ setbvalue(&v, 1); /* boolean 'true' to be the value of all indices */
+ for (i = 0; i < f->l.p->sizelineinfo; i++) /* for all lines with code */
+ luaH_setint(L, t, lineinfo[i], &v); /* table[line] = true */
+ }
+}
+
+
+static int auxgetinfo (lua_State *L, const char *what, lua_Debug *ar,
+ Closure *f, CallInfo *ci) {
+ int status = 1;
+ for (; *what; what++) {
+ switch (*what) {
+ case 'S': {
+ funcinfo(ar, f);
+ break;
+ }
+ case 'l': {
+ ar->currentline = (ci && isLua(ci)) ? currentline(ci) : -1;
+ break;
+ }
+ case 'u': {
+ ar->nups = (f == NULL) ? 0 : f->c.nupvalues;
+ if (noLuaClosure(f)) {
+ ar->isvararg = 1;
+ ar->nparams = 0;
+ }
+ else {
+ ar->isvararg = f->l.p->is_vararg;
+ ar->nparams = f->l.p->numparams;
+ }
+ break;
+ }
+ case 't': {
+ ar->istailcall = (ci) ? ci->callstatus & CIST_TAIL : 0;
+ break;
+ }
+ case 'n': {
+ /* calling function is a known Lua function? */
+ if (ci && !(ci->callstatus & CIST_TAIL) && isLua(ci->previous))
+ ar->namewhat = getfuncname(L, ci->previous, &ar->name);
+ else
+ ar->namewhat = NULL;
+ if (ar->namewhat == NULL) {
+ ar->namewhat = ""; /* not found */
+ ar->name = NULL;
+ }
+ break;
+ }
+ case 'L':
+ case 'f': /* handled by lua_getinfo */
+ break;
+ default: status = 0; /* invalid option */
+ }
+ }
+ return status;
+}
+
+
+LUA_API int lua_getinfo (lua_State *L, const char *what, lua_Debug *ar) {
+ int status;
+ Closure *cl;
+ CallInfo *ci;
+ StkId func;
+ lua_lock(L);
+ swapextra(L);
+ if (*what == '>') {
+ ci = NULL;
+ func = L->top - 1;
+ api_check(L, ttisfunction(func), "function expected");
+ what++; /* skip the '>' */
+ L->top--; /* pop function */
+ }
+ else {
+ ci = ar->i_ci;
+ func = ci->func;
+ lua_assert(ttisfunction(ci->func));
+ }
+ cl = ttisclosure(func) ? clvalue(func) : NULL;
+ status = auxgetinfo(L, what, ar, cl, ci);
+ if (strchr(what, 'f')) {
+ setobjs2s(L, L->top, func);
+ api_incr_top(L);
+ }
+ swapextra(L);
+ if (strchr(what, 'L'))
+ collectvalidlines(L, cl);
+ lua_unlock(L);
+ return status;
+}
+
+
+/*
+** {======================================================
+** Symbolic Execution
+** =======================================================
+*/
+
+static const char *getobjname (Proto *p, int lastpc, int reg,
+ const char **name);
+
+
+/*
+** find a "name" for the RK value 'c'
+*/
+static void kname (Proto *p, int pc, int c, const char **name) {
+ if (ISK(c)) { /* is 'c' a constant? */
+ TValue *kvalue = &p->k[INDEXK(c)];
+ if (ttisstring(kvalue)) { /* literal constant? */
+ *name = svalue(kvalue); /* it is its own name */
+ return;
+ }
+ /* else no reasonable name found */
+ }
+ else { /* 'c' is a register */
+ const char *what = getobjname(p, pc, c, name); /* search for 'c' */
+ if (what && *what == 'c') { /* found a constant name? */
+ return; /* 'name' already filled */
+ }
+ /* else no reasonable name found */
+ }
+ *name = "?"; /* no reasonable name found */
+}
+
+
+static int filterpc (int pc, int jmptarget) {
+ if (pc < jmptarget) /* is code conditional (inside a jump)? */
+ return -1; /* cannot know who sets that register */
+ else return pc; /* current position sets that register */
+}
+
+
+/*
+** try to find last instruction before 'lastpc' that modified register 'reg'
+*/
+static int findsetreg (Proto *p, int lastpc, int reg) {
+ int pc;
+ int setreg = -1; /* keep last instruction that changed 'reg' */
+ int jmptarget = 0; /* any code before this address is conditional */
+ for (pc = 0; pc < lastpc; pc++) {
+ Instruction i = p->code[pc];
+ OpCode op = GET_OPCODE(i);
+ int a = GETARG_A(i);
+ switch (op) {
+ case OP_LOADNIL: {
+ int b = GETARG_B(i);
+ if (a <= reg && reg <= a + b) /* set registers from 'a' to 'a+b' */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ case OP_TFORCALL: {
+ if (reg >= a + 2) /* affect all regs above its base */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ case OP_CALL:
+ case OP_TAILCALL: {
+ if (reg >= a) /* affect all registers above base */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ case OP_JMP: {
+ int b = GETARG_sBx(i);
+ int dest = pc + 1 + b;
+        /* jump is forward and does not skip `lastpc'? */
+ if (pc < dest && dest <= lastpc) {
+ if (dest > jmptarget)
+ jmptarget = dest; /* update 'jmptarget' */
+ }
+ break;
+ }
+ case OP_TEST: {
+ if (reg == a) /* jumped code can change 'a' */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ default:
+ if (testAMode(op) && reg == a) /* any instruction that set A */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ }
+ return setreg;
+}
+
+
+static const char *getobjname (Proto *p, int lastpc, int reg,
+ const char **name) {
+ int pc;
+ *name = luaF_getlocalname(p, reg + 1, lastpc);
+ if (*name) /* is a local? */
+ return "local";
+ /* else try symbolic execution */
+ pc = findsetreg(p, lastpc, reg);
+ if (pc != -1) { /* could find instruction? */
+ Instruction i = p->code[pc];
+ OpCode op = GET_OPCODE(i);
+ switch (op) {
+ case OP_MOVE: {
+ int b = GETARG_B(i); /* move from 'b' to 'a' */
+ if (b < GETARG_A(i))
+ return getobjname(p, pc, b, name); /* get name for 'b' */
+ break;
+ }
+ case OP_GETTABUP:
+ case OP_GETTABLE: {
+ int k = GETARG_C(i); /* key index */
+ int t = GETARG_B(i); /* table index */
+ const char *vn = (op == OP_GETTABLE) /* name of indexed variable */
+ ? luaF_getlocalname(p, t + 1, pc)
+ : upvalname(p, t);
+ kname(p, pc, k, name);
+ return (vn && strcmp(vn, LUA_ENV) == 0) ? "global" : "field";
+ }
+ case OP_GETUPVAL: {
+ *name = upvalname(p, GETARG_B(i));
+ return "upvalue";
+ }
+ case OP_LOADK:
+ case OP_LOADKX: {
+ int b = (op == OP_LOADK) ? GETARG_Bx(i)
+ : GETARG_Ax(p->code[pc + 1]);
+ if (ttisstring(&p->k[b])) {
+ *name = svalue(&p->k[b]);
+ return "constant";
+ }
+ break;
+ }
+ case OP_SELF: {
+ int k = GETARG_C(i); /* key index */
+ kname(p, pc, k, name);
+ return "method";
+ }
+ default: break; /* go through to return NULL */
+ }
+ }
+ return NULL; /* could not find reasonable name */
+}
+
+
+static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name) {
+ TMS tm;
+ Proto *p = ci_func(ci)->p; /* calling function */
+ int pc = currentpc(ci); /* calling instruction index */
+ Instruction i = p->code[pc]; /* calling instruction */
+ switch (GET_OPCODE(i)) {
+ case OP_CALL:
+ case OP_TAILCALL: /* get function name */
+ return getobjname(p, pc, GETARG_A(i), name);
+ case OP_TFORCALL: { /* for iterator */
+ *name = "for iterator";
+ return "for iterator";
+ }
+ /* all other instructions can call only through metamethods */
+ case OP_SELF:
+ case OP_GETTABUP:
+ case OP_GETTABLE: tm = TM_INDEX; break;
+ case OP_SETTABUP:
+ case OP_SETTABLE: tm = TM_NEWINDEX; break;
+ case OP_EQ: tm = TM_EQ; break;
+ case OP_ADD: tm = TM_ADD; break;
+ case OP_SUB: tm = TM_SUB; break;
+ case OP_MUL: tm = TM_MUL; break;
+ case OP_DIV: tm = TM_DIV; break;
+ case OP_MOD: tm = TM_MOD; break;
+ case OP_POW: tm = TM_POW; break;
+ case OP_UNM: tm = TM_UNM; break;
+ case OP_LEN: tm = TM_LEN; break;
+ case OP_LT: tm = TM_LT; break;
+ case OP_LE: tm = TM_LE; break;
+ case OP_CONCAT: tm = TM_CONCAT; break;
+ default:
+ return NULL; /* else no useful name can be found */
+ }
+ *name = getstr(G(L)->tmname[tm]);
+ return "metamethod";
+}
+
+/* }====================================================== */
+
+
+
+/*
+** only ANSI way to check whether a pointer points to an array
+** (used only for error messages, so efficiency is not a big concern)
+*/
+static int isinstack (CallInfo *ci, const TValue *o) {
+ StkId p;
+ for (p = ci->u.l.base; p < ci->top; p++)
+ if (o == p) return 1;
+ return 0;
+}
+
+
+static const char *getupvalname (CallInfo *ci, const TValue *o,
+ const char **name) {
+ LClosure *c = ci_func(ci);
+ int i;
+ for (i = 0; i < c->nupvalues; i++) {
+ if (c->upvals[i]->v == o) {
+ *name = upvalname(c->p, i);
+ return "upvalue";
+ }
+ }
+ return NULL;
+}
+
+
+l_noret luaG_typeerror (lua_State *L, const TValue *o, const char *op) {
+ CallInfo *ci = L->ci;
+ const char *name = NULL;
+ const char *t = objtypename(o);
+ const char *kind = NULL;
+ if (isLua(ci)) {
+ kind = getupvalname(ci, o, &name); /* check whether 'o' is an upvalue */
+ if (!kind && isinstack(ci, o)) /* no? try a register */
+ kind = getobjname(ci_func(ci)->p, currentpc(ci),
+ cast_int(o - ci->u.l.base), &name);
+ }
+ if (kind)
+ luaG_runerror(L, "attempt to %s %s " LUA_QS " (a %s value)",
+ op, kind, name, t);
+ else
+ luaG_runerror(L, "attempt to %s a %s value", op, t);
+}
+
+
+l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2) {
+ if (ttisstring(p1) || ttisnumber(p1)) p1 = p2;
+ lua_assert(!ttisstring(p1) && !ttisnumber(p1));
+ luaG_typeerror(L, p1, "concatenate");
+}
+
+
+l_noret luaG_aritherror (lua_State *L, const TValue *p1, const TValue *p2) {
+ TValue temp;
+ if (luaV_tonumber(p1, &temp) == NULL)
+ p2 = p1; /* first operand is wrong */
+ luaG_typeerror(L, p2, "perform arithmetic on");
+}
+
+
+l_noret luaG_ordererror (lua_State *L, const TValue *p1, const TValue *p2) {
+ const char *t1 = objtypename(p1);
+ const char *t2 = objtypename(p2);
+ if (t1 == t2)
+ luaG_runerror(L, "attempt to compare two %s values", t1);
+ else
+ luaG_runerror(L, "attempt to compare %s with %s", t1, t2);
+}
+
+
+static void addinfo (lua_State *L, const char *msg) {
+ CallInfo *ci = L->ci;
+ if (isLua(ci)) { /* is Lua code? */
+ char buff[LUA_IDSIZE]; /* add file:line information */
+ int line = currentline(ci);
+ TString *src = ci_func(ci)->p->source;
+ if (src)
+ luaO_chunkid(buff, getstr(src), LUA_IDSIZE);
+ else { /* no source available; use "?" instead */
+ buff[0] = '?'; buff[1] = '\0';
+ }
+ luaO_pushfstring(L, "%s:%d: %s", buff, line, msg);
+ }
+}
+
+
+l_noret luaG_errormsg (lua_State *L) {
+ if (L->errfunc != 0) { /* is there an error handling function? */
+ StkId errfunc = restorestack(L, L->errfunc);
+ if (!ttisfunction(errfunc)) luaD_throw(L, LUA_ERRERR);
+ setobjs2s(L, L->top, L->top - 1); /* move argument */
+ setobjs2s(L, L->top - 1, errfunc); /* push function */
+ L->top++;
+ luaD_call(L, L->top - 2, 1, 0); /* call it */
+ }
+ luaD_throw(L, LUA_ERRRUN);
+}
+
+
+l_noret luaG_runerror (lua_State *L, const char *fmt, ...) {
+ L->runerror++;
+ va_list argp;
+ va_start(argp, fmt);
+ addinfo(L, luaO_pushvfstring(L, fmt, argp));
+ va_end(argp);
+ luaG_errormsg(L);
+ L->runerror--;
+}
+/* END CSTYLED */
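
The debug interface above has two halves: lua_getstack() walks the CallInfo chain to select an activation record, and lua_getinfo() fills a lua_Debug according to its option string ('S' source info, 'l' current line, 'n' name, 'u' upvalue and parameter counts, 'f'/'L' push values). A small host-side sketch against the stock Lua 5.2 public API, assuming a userland embedder rather than the kernel module: a line hook that prints the 'S' and 'l' fields auxgetinfo() fills.

#include <stdio.h>
#include <lua.h>
#include <lauxlib.h>

static void linehook(lua_State *L, lua_Debug *ar) {
  /* 'ar' already identifies the running function; ask for 'S' and 'l' */
  if (lua_getinfo(L, "Sl", ar))
    printf("%s:%d\n", ar->short_src, ar->currentline);
}

int main(void) {
  lua_State *L = luaL_newstate();
  lua_sethook(L, linehook, LUA_MASKLINE, 0);
  luaL_dostring(L, "local x = 1\nx = x + 1");   /* hook fires on each line */
  lua_close(L);
  return 0;
}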
diff --git a/sys/contrib/openzfs/module/lua/ldebug.h b/sys/contrib/openzfs/module/lua/ldebug.h
new file mode 100644
index 000000000000..36ed396f26c9
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ldebug.h
@@ -0,0 +1,36 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ldebug.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions from Debug Interface module
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ldebug_h
+#define ldebug_h
+
+
+#include "lstate.h"
+
+
+#define pcRel(pc, p) (cast(int, (pc) - (p)->code) - 1)
+
+#define getfuncline(f,pc) (((f)->lineinfo) ? (f)->lineinfo[pc] : 0)
+
+#define resethookcount(L) (L->hookcount = L->basehookcount)
+
+/* Active Lua function (given call info) */
+#define ci_func(ci) (clLvalue((ci)->func))
+
+
+LUAI_FUNC l_noret luaG_typeerror (lua_State *L, const TValue *o,
+ const char *opname);
+LUAI_FUNC l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2);
+LUAI_FUNC l_noret luaG_aritherror (lua_State *L, const TValue *p1,
+ const TValue *p2);
+LUAI_FUNC l_noret luaG_ordererror (lua_State *L, const TValue *p1,
+ const TValue *p2);
+LUAI_FUNC l_noret luaG_runerror (lua_State *L, const char *fmt, ...);
+LUAI_FUNC l_noret luaG_errormsg (lua_State *L);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/ldo.c b/sys/contrib/openzfs/module/lua/ldo.c
new file mode 100644
index 000000000000..f3c3dcb4d81a
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ldo.c
@@ -0,0 +1,749 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ldo.c,v 2.108.1.3 2013/11/08 18:22:50 roberto Exp $
+** Stack and Call structure of Lua
+** See Copyright Notice in lua.h
+*/
+
+
+#define ldo_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+#include "lzio.h"
+
+
+
+/* Return the number of bytes available on the stack. */
+#if defined (_KERNEL) && defined(__linux__)
+#include <asm/current.h>
+static intptr_t stack_remaining(void) {
+ intptr_t local;
+ local = (intptr_t)&local - (intptr_t)current->stack;
+ return local;
+}
+#elif defined (_KERNEL) && defined(__FreeBSD__)
+#include <sys/pcpu.h>
+static intptr_t stack_remaining(void) {
+ intptr_t local;
+ local = (intptr_t)&local - (intptr_t)curthread->td_kstack;
+ return local;
+}
+#else
+static intptr_t stack_remaining(void) {
+ return INTPTR_MAX;
+}
+#endif
+
+/*
+** {======================================================
+** Error-recovery functions
+** =======================================================
+*/
+
+/*
+** LUAI_THROW/LUAI_TRY define how Lua does exception handling. By
+** default, Lua handles errors with exceptions when compiling as
+** C++ code, with _longjmp/_setjmp when asked to use them, and with
+** longjmp/setjmp otherwise.
+*/
+#if !defined(LUAI_THROW)
+
+#ifdef _KERNEL
+
+#ifdef __linux__
+#if defined(__i386__)
+#define JMP_BUF_CNT 6
+#elif defined(__x86_64__)
+#define JMP_BUF_CNT 8
+#elif defined(__sparc__) && defined(__arch64__)
+#define JMP_BUF_CNT 6
+#elif defined(__powerpc__)
+#define JMP_BUF_CNT 26
+#elif defined(__aarch64__)
+#define JMP_BUF_CNT 64
+#elif defined(__arm__)
+#define JMP_BUF_CNT 65
+#elif defined(__mips__)
+#define JMP_BUF_CNT 12
+#elif defined(__s390x__)
+#define JMP_BUF_CNT 18
+#elif defined(__riscv)
+#define JMP_BUF_CNT 64
+#else
+#define JMP_BUF_CNT 1
+#endif
+
+typedef struct _label_t { long long unsigned val[JMP_BUF_CNT]; } label_t;
+
+int setjmp(label_t *) __attribute__ ((__nothrow__));
+extern void longjmp(label_t *) __attribute__((__noreturn__));
+
+#define LUAI_THROW(L,c) longjmp(&(c)->b)
+#define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a }
+#define luai_jmpbuf label_t
+
+/* unsupported arches will build but not be able to run lua programs */
+#if JMP_BUF_CNT == 1
+int setjmp (label_t *buf) {
+ return 1;
+}
+
+void longjmp (label_t * buf) {
+ for (;;);
+}
+#endif
+#else
+#define LUAI_THROW(L,c) longjmp((c)->b, 1)
+#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a }
+#define luai_jmpbuf jmp_buf
+#endif
+
+#else /* _KERNEL */
+
+#if defined(__cplusplus) && !defined(LUA_USE_LONGJMP)
+/* C++ exceptions */
+#define LUAI_THROW(L,c) throw(c)
+#define LUAI_TRY(L,c,a) \
+ try { a } catch(...) { if ((c)->status == 0) (c)->status = -1; }
+#define luai_jmpbuf int /* dummy variable */
+
+#elif defined(LUA_USE_ULONGJMP)
+/* in Unix, try _longjmp/_setjmp (more efficient) */
+#define LUAI_THROW(L,c) _longjmp((c)->b, 1)
+#define LUAI_TRY(L,c,a) if (_setjmp((c)->b) == 0) { a }
+#define luai_jmpbuf jmp_buf
+
+#else
+/* default handling with long jumps */
+#define LUAI_THROW(L,c) longjmp((c)->b, 1)
+#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a }
+#define luai_jmpbuf jmp_buf
+
+#endif
+
+#endif /* _KERNEL */
+
+#endif /* LUAI_THROW */
+
+
+/* chain list of long jump buffers */
+struct lua_longjmp {
+ struct lua_longjmp *previous;
+ luai_jmpbuf b;
+ volatile int status; /* error code */
+};
+
+
+static void seterrorobj (lua_State *L, int errcode, StkId oldtop) {
+ switch (errcode) {
+ case LUA_ERRMEM: { /* memory error? */
+ setsvalue2s(L, oldtop, G(L)->memerrmsg); /* reuse preregistered msg. */
+ break;
+ }
+ case LUA_ERRERR: {
+ setsvalue2s(L, oldtop, luaS_newliteral(L, "error in error handling"));
+ break;
+ }
+ default: {
+ setobjs2s(L, oldtop, L->top - 1); /* error message on current top */
+ break;
+ }
+ }
+ L->top = oldtop + 1;
+}
+
+
+l_noret luaD_throw (lua_State *L, int errcode) {
+ if (L->errorJmp) { /* thread has an error handler? */
+ L->errorJmp->status = errcode; /* set status */
+ LUAI_THROW(L, L->errorJmp); /* jump to it */
+ }
+ else { /* thread has no error handler */
+ L->status = cast_byte(errcode); /* mark it as dead */
+ if (G(L)->mainthread->errorJmp) { /* main thread has a handler? */
+ setobjs2s(L, G(L)->mainthread->top++, L->top - 1); /* copy error obj. */
+ luaD_throw(G(L)->mainthread, errcode); /* re-throw in main thread */
+ }
+ else { /* no handler at all; abort */
+ if (G(L)->panic) { /* panic function? */
+ lua_unlock(L);
+ G(L)->panic(L); /* call it (last chance to jump out) */
+ }
+ panic("no error handler");
+ }
+ }
+}
+
+
+int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) {
+ unsigned short oldnCcalls = L->nCcalls;
+ struct lua_longjmp lj;
+ lj.status = LUA_OK;
+ lj.previous = L->errorJmp; /* chain new error handler */
+ L->errorJmp = &lj;
+ LUAI_TRY(L, &lj,
+ (*f)(L, ud);
+ );
+ L->errorJmp = lj.previous; /* restore old error handler */
+ L->nCcalls = oldnCcalls;
+ return lj.status;
+}
+
+/* }====================================================== */
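
/*
** A minimal standalone sketch of the pattern above, assuming a hosted C
** environment with <setjmp.h> (the kernel build supplies its own setjmp):
** each protected call pushes a jump buffer onto a chain and an error
** longjmp()s back to the innermost one carrying a status code, which is
** what luaD_rawrunprotected()/luaD_throw() do with 'struct lua_longjmp'.
*/
#include <setjmp.h>
#include <stdio.h>

struct protected_frame {
  struct protected_frame *previous;   /* chain, like lua_longjmp.previous */
  jmp_buf b;
  volatile int status;                /* error code; 0 == OK */
};

static struct protected_frame *errorJmp_demo = NULL;  /* innermost handler */

static void throw_demo (int errcode) {             /* cf. luaD_throw() */
  errorJmp_demo->status = errcode;
  longjmp(errorJmp_demo->b, 1);
}

static int run_protected_demo (void (*f)(void)) {  /* cf. luaD_rawrunprotected() */
  struct protected_frame frame;
  frame.status = 0;
  frame.previous = errorJmp_demo;                  /* chain new handler */
  errorJmp_demo = &frame;
  if (setjmp(frame.b) == 0)
    f();
  errorJmp_demo = frame.previous;                  /* restore old handler */
  return frame.status;
}

static void failing_body (void) {
  throw_demo(2);                                   /* e.g. LUA_ERRRUN */
}

int main (void) {
  printf("status = %d\n", run_protected_demo(failing_body));  /* prints 2 */
  return 0;
}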
+
+
+static void correctstack (lua_State *L, TValue *oldstack) {
+ CallInfo *ci;
+ GCObject *up;
+ L->top = (L->top - oldstack) + L->stack;
+ for (up = L->openupval; up != NULL; up = up->gch.next)
+ gco2uv(up)->v = (gco2uv(up)->v - oldstack) + L->stack;
+ for (ci = L->ci; ci != NULL; ci = ci->previous) {
+ ci->top = (ci->top - oldstack) + L->stack;
+ ci->func = (ci->func - oldstack) + L->stack;
+ if (isLua(ci))
+ ci->u.l.base = (ci->u.l.base - oldstack) + L->stack;
+ }
+}
+
+
+/* some space for error handling */
+#define ERRORSTACKSIZE (LUAI_MAXSTACK + 200)
+
+
+void luaD_reallocstack (lua_State *L, int newsize) {
+ TValue *oldstack = L->stack;
+ int lim = L->stacksize;
+ lua_assert(newsize <= LUAI_MAXSTACK || newsize == ERRORSTACKSIZE);
+ lua_assert(L->stack_last - L->stack == L->stacksize - EXTRA_STACK);
+ luaM_reallocvector(L, L->stack, L->stacksize, newsize, TValue);
+ for (; lim < newsize; lim++)
+ setnilvalue(L->stack + lim); /* erase new segment */
+ L->stacksize = newsize;
+ L->stack_last = L->stack + newsize - EXTRA_STACK;
+ correctstack(L, oldstack);
+}
+
+
+void luaD_growstack (lua_State *L, int n) {
+ int size = L->stacksize;
+ if (size > LUAI_MAXSTACK) /* error after extra size? */
+ luaD_throw(L, LUA_ERRERR);
+ else {
+ int needed = cast_int(L->top - L->stack) + n + EXTRA_STACK;
+ int newsize = 2 * size;
+ if (newsize > LUAI_MAXSTACK) newsize = LUAI_MAXSTACK;
+ if (newsize < needed) newsize = needed;
+ if (newsize > LUAI_MAXSTACK) { /* stack overflow? */
+ luaD_reallocstack(L, ERRORSTACKSIZE);
+ luaG_runerror(L, "stack overflow");
+ }
+ else
+ luaD_reallocstack(L, newsize);
+ }
+}
+
+
+static int stackinuse (lua_State *L) {
+ CallInfo *ci;
+ StkId lim = L->top;
+ for (ci = L->ci; ci != NULL; ci = ci->previous) {
+ lua_assert(ci->top <= L->stack_last);
+ if (lim < ci->top) lim = ci->top;
+ }
+ return cast_int(lim - L->stack) + 1; /* part of stack in use */
+}
+
+
+void luaD_shrinkstack (lua_State *L) {
+ int inuse = stackinuse(L);
+ int goodsize = inuse + (inuse / 8) + 2*EXTRA_STACK;
+ if (goodsize > LUAI_MAXSTACK) goodsize = LUAI_MAXSTACK;
+ if (inuse > LUAI_MAXSTACK || /* handling stack overflow? */
+ goodsize >= L->stacksize) /* would grow instead of shrink? */
+ condmovestack(L); /* don't change stack (change only for debugging) */
+ else
+ luaD_reallocstack(L, goodsize); /* shrink it */
+}
+
+
+void luaD_hook (lua_State *L, int event, int line) {
+ lua_Hook hook = L->hook;
+ if (hook && L->allowhook) {
+ CallInfo *ci = L->ci;
+ ptrdiff_t top = savestack(L, L->top);
+ ptrdiff_t ci_top = savestack(L, ci->top);
+ lua_Debug ar;
+ ar.event = event;
+ ar.currentline = line;
+ ar.i_ci = ci;
+ luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */
+ ci->top = L->top + LUA_MINSTACK;
+ lua_assert(ci->top <= L->stack_last);
+ L->allowhook = 0; /* cannot call hooks inside a hook */
+ ci->callstatus |= CIST_HOOKED;
+ lua_unlock(L);
+ (*hook)(L, &ar);
+ lua_lock(L);
+ lua_assert(!L->allowhook);
+ L->allowhook = 1;
+ ci->top = restorestack(L, ci_top);
+ L->top = restorestack(L, top);
+ ci->callstatus &= ~CIST_HOOKED;
+ }
+}
+
+
+static void callhook (lua_State *L, CallInfo *ci) {
+ int hook = LUA_HOOKCALL;
+ ci->u.l.savedpc++; /* hooks assume 'pc' is already incremented */
+ if (isLua(ci->previous) &&
+ GET_OPCODE(*(ci->previous->u.l.savedpc - 1)) == OP_TAILCALL) {
+ ci->callstatus |= CIST_TAIL;
+ hook = LUA_HOOKTAILCALL;
+ }
+ luaD_hook(L, hook, -1);
+ ci->u.l.savedpc--; /* correct 'pc' */
+}
+
+
+static StkId adjust_varargs (lua_State *L, Proto *p, int actual) {
+ int i;
+ int nfixargs = p->numparams;
+ StkId base, fixed;
+ lua_assert(actual >= nfixargs);
+ /* move fixed parameters to final position */
+ luaD_checkstack(L, p->maxstacksize); /* check again for new 'base' */
+ fixed = L->top - actual; /* first fixed argument */
+ base = L->top; /* final position of first argument */
+ for (i=0; i<nfixargs; i++) {
+ setobjs2s(L, L->top++, fixed + i);
+ setnilvalue(fixed + i);
+ }
+ return base;
+}
+
+
+static StkId tryfuncTM (lua_State *L, StkId func) {
+ const TValue *tm = luaT_gettmbyobj(L, func, TM_CALL);
+ StkId p;
+ ptrdiff_t funcr = savestack(L, func);
+ if (!ttisfunction(tm))
+ luaG_typeerror(L, func, "call");
+ /* Open a hole inside the stack at `func' */
+ for (p = L->top; p > func; p--) setobjs2s(L, p, p-1);
+ incr_top(L);
+ func = restorestack(L, funcr); /* previous call may change stack */
+ setobj2s(L, func, tm); /* tag method is the new function to be called */
+ return func;
+}
+
+
+
+#define next_ci(L) (L->ci = (L->ci->next ? L->ci->next : luaE_extendCI(L)))
+
+
+/*
+** returns true if function has been executed (C function)
+*/
+int luaD_precall (lua_State *L, StkId func, int nresults) {
+ lua_CFunction f;
+ CallInfo *ci;
+ int n; /* number of arguments (Lua) or returns (C) */
+ ptrdiff_t funcr = savestack(L, func);
+ switch (ttype(func)) {
+ case LUA_TLCF: /* light C function */
+ f = fvalue(func);
+ goto Cfunc;
+ case LUA_TCCL: { /* C closure */
+ f = clCvalue(func)->f;
+ Cfunc:
+ luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */
+ ci = next_ci(L); /* now 'enter' new function */
+ ci->nresults = nresults;
+ ci->func = restorestack(L, funcr);
+ ci->top = L->top + LUA_MINSTACK;
+ lua_assert(ci->top <= L->stack_last);
+ ci->callstatus = 0;
+ luaC_checkGC(L); /* stack grow uses memory */
+ if (L->hookmask & LUA_MASKCALL)
+ luaD_hook(L, LUA_HOOKCALL, -1);
+ lua_unlock(L);
+ n = (*f)(L); /* do the actual call */
+ lua_lock(L);
+ api_checknelems(L, n);
+ luaD_poscall(L, L->top - n);
+ return 1;
+ }
+ case LUA_TLCL: { /* Lua function: prepare its call */
+ StkId base;
+ Proto *p = clLvalue(func)->p;
+ n = cast_int(L->top - func) - 1; /* number of real arguments */
+ luaD_checkstack(L, p->maxstacksize);
+ for (; n < p->numparams; n++)
+ setnilvalue(L->top++); /* complete missing arguments */
+ if (!p->is_vararg) {
+ func = restorestack(L, funcr);
+ base = func + 1;
+ }
+ else {
+ base = adjust_varargs(L, p, n);
+ func = restorestack(L, funcr); /* previous call can change stack */
+ }
+ ci = next_ci(L); /* now 'enter' new function */
+ ci->nresults = nresults;
+ ci->func = func;
+ ci->u.l.base = base;
+ ci->top = base + p->maxstacksize;
+ lua_assert(ci->top <= L->stack_last);
+ ci->u.l.savedpc = p->code; /* starting point */
+ ci->callstatus = CIST_LUA;
+ L->top = ci->top;
+ luaC_checkGC(L); /* stack grow uses memory */
+ if (L->hookmask & LUA_MASKCALL)
+ callhook(L, ci);
+ return 0;
+ }
+ default: { /* not a function */
+ func = tryfuncTM(L, func); /* retry with 'function' tag method */
+ return luaD_precall(L, func, nresults); /* now it must be a function */
+ }
+ }
+}
+
+
+int luaD_poscall (lua_State *L, StkId firstResult) {
+ StkId res;
+ int wanted, i;
+ CallInfo *ci = L->ci;
+ if (L->hookmask & (LUA_MASKRET | LUA_MASKLINE)) {
+ if (L->hookmask & LUA_MASKRET) {
+ ptrdiff_t fr = savestack(L, firstResult); /* hook may change stack */
+ luaD_hook(L, LUA_HOOKRET, -1);
+ firstResult = restorestack(L, fr);
+ }
+ L->oldpc = ci->previous->u.l.savedpc; /* 'oldpc' for caller function */
+ }
+ res = ci->func; /* res == final position of 1st result */
+ wanted = ci->nresults;
+ L->ci = ci = ci->previous; /* back to caller */
+ /* move results to correct place */
+ for (i = wanted; i != 0 && firstResult < L->top; i--)
+ setobjs2s(L, res++, firstResult++);
+ while (i-- > 0)
+ setnilvalue(res++);
+ L->top = res;
+ return (wanted - LUA_MULTRET); /* 0 iff wanted == LUA_MULTRET */
+}
+
+
+/*
+** Call a function (C or Lua). The function to be called is at *func.
+** The arguments are on the stack, right after the function.
+** When returns, all the results are on the stack, starting at the original
+** function position.
+*/
+void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) {
+ if (++L->nCcalls >= LUAI_MAXCCALLS) {
+ if (L->nCcalls == LUAI_MAXCCALLS)
+ luaG_runerror(L, "C stack overflow");
+ else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3)))
+ luaD_throw(L, LUA_ERRERR); /* error while handling stack error */
+ }
+ intptr_t remaining = stack_remaining();
+ if (L->runerror == 0 && remaining < LUAI_MINCSTACK)
+ luaG_runerror(L, "C stack overflow");
+ if (L->runerror != 0 && remaining < LUAI_MINCSTACK / 2)
+ luaD_throw(L, LUA_ERRERR); /* error while handling stack error */
+ if (!allowyield) L->nny++;
+ if (!luaD_precall(L, func, nResults)) /* is a Lua function? */
+ luaV_execute(L); /* call it */
+ if (!allowyield) L->nny--;
+ L->nCcalls--;
+}
+
+
+static void finishCcall (lua_State *L) {
+ CallInfo *ci = L->ci;
+ int n;
+ lua_assert(ci->u.c.k != NULL); /* must have a continuation */
+ lua_assert(L->nny == 0);
+ if (ci->callstatus & CIST_YPCALL) { /* was inside a pcall? */
+ ci->callstatus &= ~CIST_YPCALL; /* finish 'lua_pcall' */
+ L->errfunc = ci->u.c.old_errfunc;
+ }
+ /* finish 'lua_callk'/'lua_pcall' */
+ adjustresults(L, ci->nresults);
+ /* call continuation function */
+ if (!(ci->callstatus & CIST_STAT)) /* no call status? */
+ ci->u.c.status = LUA_YIELD; /* 'default' status */
+ lua_assert(ci->u.c.status != LUA_OK);
+ ci->callstatus = (ci->callstatus & ~(CIST_YPCALL | CIST_STAT)) | CIST_YIELDED;
+ lua_unlock(L);
+ n = (*ci->u.c.k)(L);
+ lua_lock(L);
+ api_checknelems(L, n);
+ /* finish 'luaD_precall' */
+ luaD_poscall(L, L->top - n);
+}
+
+
+static void unroll (lua_State *L, void *ud) {
+ UNUSED(ud);
+ for (;;) {
+ if (L->ci == &L->base_ci) /* stack is empty? */
+ return; /* coroutine finished normally */
+ if (!isLua(L->ci)) /* C function? */
+ finishCcall(L);
+ else { /* Lua function */
+ luaV_finishOp(L); /* finish interrupted instruction */
+ luaV_execute(L); /* execute down to higher C 'boundary' */
+ }
+ }
+}
+
+
+/*
+** check whether thread has a suspended protected call
+*/
+static CallInfo *findpcall (lua_State *L) {
+ CallInfo *ci;
+ for (ci = L->ci; ci != NULL; ci = ci->previous) { /* search for a pcall */
+ if (ci->callstatus & CIST_YPCALL)
+ return ci;
+ }
+ return NULL; /* no pending pcall */
+}
+
+
+static int recover (lua_State *L, int status) {
+ StkId oldtop;
+ CallInfo *ci = findpcall(L);
+ if (ci == NULL) return 0; /* no recovery point */
+ /* "finish" luaD_pcall */
+ oldtop = restorestack(L, ci->extra);
+ luaF_close(L, oldtop);
+ seterrorobj(L, status, oldtop);
+ L->ci = ci;
+ L->allowhook = ci->u.c.old_allowhook;
+ L->nny = 0; /* should be zero to be yieldable */
+ luaD_shrinkstack(L);
+ L->errfunc = ci->u.c.old_errfunc;
+ ci->callstatus |= CIST_STAT; /* call has error status */
+ ci->u.c.status = status; /* (here it is) */
+ return 1; /* continue running the coroutine */
+}
+
+
+/*
+** signal an error in the call to 'resume', not in the execution of the
+** coroutine itself. (Such errors should not be handled by any coroutine
+** error handler and should not kill the coroutine.)
+*/
+static l_noret resume_error (lua_State *L, const char *msg, StkId firstArg) {
+ L->top = firstArg; /* remove args from the stack */
+ setsvalue2s(L, L->top, luaS_new(L, msg)); /* push error message */
+ api_incr_top(L);
+ luaD_throw(L, -1); /* jump back to 'lua_resume' */
+}
+
+
+/*
+** do the work for 'lua_resume' in protected mode
+*/
+static void resume_cb (lua_State *L, void *ud) {
+ int nCcalls = L->nCcalls;
+ StkId firstArg = cast(StkId, ud);
+ CallInfo *ci = L->ci;
+ if (nCcalls >= LUAI_MAXCCALLS)
+ resume_error(L, "C stack overflow", firstArg);
+ if (L->status == LUA_OK) { /* may be starting a coroutine */
+ if (ci != &L->base_ci) /* not in base level? */
+ resume_error(L, "cannot resume non-suspended coroutine", firstArg);
+ /* coroutine is in base level; start running it */
+ if (!luaD_precall(L, firstArg - 1, LUA_MULTRET)) /* Lua function? */
+ luaV_execute(L); /* call it */
+ }
+ else if (L->status != LUA_YIELD)
+ resume_error(L, "cannot resume dead coroutine", firstArg);
+ else { /* resuming from previous yield */
+ L->status = LUA_OK;
+ ci->func = restorestack(L, ci->extra);
+ if (isLua(ci)) /* yielded inside a hook? */
+ luaV_execute(L); /* just continue running Lua code */
+ else { /* 'common' yield */
+ if (ci->u.c.k != NULL) { /* does it have a continuation? */
+ int n;
+ ci->u.c.status = LUA_YIELD; /* 'default' status */
+ ci->callstatus |= CIST_YIELDED;
+ lua_unlock(L);
+ n = (*ci->u.c.k)(L); /* call continuation */
+ lua_lock(L);
+ api_checknelems(L, n);
+ firstArg = L->top - n; /* yield results come from continuation */
+ }
+ luaD_poscall(L, firstArg); /* finish 'luaD_precall' */
+ }
+ unroll(L, NULL);
+ }
+ lua_assert(nCcalls == L->nCcalls);
+}
+
+
+LUA_API int lua_resume (lua_State *L, lua_State *from, int nargs) {
+ int status;
+ int oldnny = L->nny; /* save 'nny' */
+ lua_lock(L);
+ luai_userstateresume(L, nargs);
+ L->nCcalls = (from) ? from->nCcalls + 1 : 1;
+ L->nny = 0; /* allow yields */
+ api_checknelems(L, (L->status == LUA_OK) ? nargs + 1 : nargs);
+ status = luaD_rawrunprotected(L, resume_cb, L->top - nargs);
+ if (status == -1) /* error calling 'lua_resume'? */
+ status = LUA_ERRRUN;
+ else { /* yield or regular error */
+ while (status != LUA_OK && status != LUA_YIELD) { /* error? */
+ if (recover(L, status)) /* recover point? */
+ status = luaD_rawrunprotected(L, unroll, NULL); /* run continuation */
+ else { /* unrecoverable error */
+ L->status = cast_byte(status); /* mark thread as `dead' */
+ seterrorobj(L, status, L->top);
+ L->ci->top = L->top;
+ break;
+ }
+ }
+ lua_assert(status == L->status);
+ }
+ L->nny = oldnny; /* restore 'nny' */
+ L->nCcalls--;
+ lua_assert(L->nCcalls == ((from) ? from->nCcalls : 0));
+ lua_unlock(L);
+ return status;
+}
+
+
+LUA_API int lua_yieldk (lua_State *L, int nresults, int ctx, lua_CFunction k) {
+ CallInfo *ci = L->ci;
+ luai_userstateyield(L, nresults);
+ lua_lock(L);
+ api_checknelems(L, nresults);
+ if (L->nny > 0) {
+ if (L != G(L)->mainthread)
+ luaG_runerror(L, "attempt to yield across a C-call boundary");
+ else
+ luaG_runerror(L, "attempt to yield from outside a coroutine");
+ }
+ L->status = LUA_YIELD;
+ ci->extra = savestack(L, ci->func); /* save current 'func' */
+ if (isLua(ci)) { /* inside a hook? */
+ api_check(L, k == NULL, "hooks cannot continue after yielding");
+ }
+ else {
+ if ((ci->u.c.k = k) != NULL) /* is there a continuation? */
+ ci->u.c.ctx = ctx; /* save context */
+ ci->func = L->top - nresults - 1; /* protect stack below results */
+ luaD_throw(L, LUA_YIELD);
+ }
+ lua_assert(ci->callstatus & CIST_HOOKED); /* must be inside a hook */
+ lua_unlock(L);
+ return 0; /* return to 'luaD_hook' */
+}
+
+
+int luaD_pcall (lua_State *L, Pfunc func, void *u,
+ ptrdiff_t old_top, ptrdiff_t ef) {
+ int status;
+ CallInfo *old_ci = L->ci;
+ lu_byte old_allowhooks = L->allowhook;
+ unsigned short old_nny = L->nny;
+ ptrdiff_t old_errfunc = L->errfunc;
+ L->errfunc = ef;
+ status = luaD_rawrunprotected(L, func, u);
+ if (status != LUA_OK) { /* an error occurred? */
+ StkId oldtop = restorestack(L, old_top);
+ luaF_close(L, oldtop); /* close possible pending closures */
+ seterrorobj(L, status, oldtop);
+ L->ci = old_ci;
+ L->allowhook = old_allowhooks;
+ L->nny = old_nny;
+ luaD_shrinkstack(L);
+ }
+ L->errfunc = old_errfunc;
+ return status;
+}
+
+
+
+/*
+** Execute a protected parser.
+*/
+struct SParser { /* data to `f_parser' */
+ ZIO *z;
+ Mbuffer buff; /* dynamic structure used by the scanner */
+ Dyndata dyd; /* dynamic structures used by the parser */
+ const char *mode;
+ const char *name;
+};
+
+
+static void checkmode (lua_State *L, const char *mode, const char *x) {
+ if (mode && strchr(mode, x[0]) == NULL) {
+ luaO_pushfstring(L,
+ "attempt to load a %s chunk (mode is " LUA_QS ")", x, mode);
+ luaD_throw(L, LUA_ERRSYNTAX);
+ }
+}
+
+
+static void f_parser (lua_State *L, void *ud) {
+ int i;
+ Closure *cl;
+ struct SParser *p = cast(struct SParser *, ud);
+ int c = zgetc(p->z); /* read first character */
+ lua_assert(c != LUA_SIGNATURE[0]); /* binary not supported */
+ checkmode(L, p->mode, "text");
+ cl = luaY_parser(L, p->z, &p->buff, &p->dyd, p->name, c);
+ lua_assert(cl->l.nupvalues == cl->l.p->sizeupvalues);
+ for (i = 0; i < cl->l.nupvalues; i++) { /* initialize upvalues */
+ UpVal *up = luaF_newupval(L);
+ cl->l.upvals[i] = up;
+ luaC_objbarrier(L, cl, up);
+ }
+}
+
+
+int luaD_protectedparser (lua_State *L, ZIO *z, const char *name,
+ const char *mode) {
+ struct SParser p;
+ int status;
+ L->nny++; /* cannot yield during parsing */
+ p.z = z; p.name = name; p.mode = mode;
+ p.dyd.actvar.arr = NULL; p.dyd.actvar.size = 0;
+ p.dyd.gt.arr = NULL; p.dyd.gt.size = 0;
+ p.dyd.label.arr = NULL; p.dyd.label.size = 0;
+ luaZ_initbuffer(L, &p.buff);
+ status = luaD_pcall(L, f_parser, &p, savestack(L, L->top), L->errfunc);
+ luaZ_freebuffer(L, &p.buff);
+ luaM_freearray(L, p.dyd.actvar.arr, p.dyd.actvar.size);
+ luaM_freearray(L, p.dyd.gt.arr, p.dyd.gt.size);
+ luaM_freearray(L, p.dyd.label.arr, p.dyd.label.size);
+ L->nny--;
+ return status;
+}
+/* END CSTYLED */
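
luaD_growstack() above sizes the new stack by doubling, clamping at LUAI_MAXSTACK, and never going below what the caller needs; only when even the clamped size is still too small does it fall back to ERRORSTACKSIZE, leaving room for the "stack overflow" error itself to run. A minimal standalone sketch of just that sizing rule, with illustrative constants standing in for the real LUAI_MAXSTACK:

#include <stdio.h>

#define MAXSTACK_DEMO       1000000              /* stand-in for LUAI_MAXSTACK */
#define ERRORSTACKSIZE_DEMO (MAXSTACK_DEMO + 200)

/* same policy as luaD_growstack(): double, clamp, never below 'needed' */
static int grow_size (int size, int needed) {
  int newsize = 2 * size;
  if (newsize > MAXSTACK_DEMO) newsize = MAXSTACK_DEMO;
  if (newsize < needed) newsize = needed;
  if (newsize > MAXSTACK_DEMO)        /* still not enough: stack overflow */
    return ERRORSTACKSIZE_DEMO;       /* extra room to run the error handler */
  return newsize;
}

int main (void) {
  printf("%d\n", grow_size(40, 45));                           /* 80: doubling */
  printf("%d\n", grow_size(800000, 900000));                   /* 1000000: clamped */
  printf("%d\n", grow_size(MAXSTACK_DEMO, MAXSTACK_DEMO + 1)); /* 1000200: overflow */
  return 0;
}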
diff --git a/sys/contrib/openzfs/module/lua/ldo.h b/sys/contrib/openzfs/module/lua/ldo.h
new file mode 100644
index 000000000000..2c0e1704d072
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ldo.h
@@ -0,0 +1,47 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ldo.h,v 2.20.1.1 2013/04/12 18:48:47 roberto Exp $
+** Stack and Call structure of Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ldo_h
+#define ldo_h
+
+
+#include "lobject.h"
+#include "lstate.h"
+#include "lzio.h"
+
+
+#define luaD_checkstack(L,n) if (L->stack_last - L->top <= (n)) \
+ luaD_growstack(L, n); else condmovestack(L);
+
+
+#define incr_top(L) {L->top++; luaD_checkstack(L,0);}
+
+#define savestack(L,p) ((char *)(p) - (char *)L->stack)
+#define restorestack(L,n) ((TValue *)((char *)L->stack + (n)))
+
+
+/* type of protected functions, to be run by `runprotected' */
+typedef void (*Pfunc) (lua_State *L, void *ud);
+
+LUAI_FUNC int luaD_protectedparser (lua_State *L, ZIO *z, const char *name,
+ const char *mode);
+LUAI_FUNC void luaD_hook (lua_State *L, int event, int line);
+LUAI_FUNC int luaD_precall (lua_State *L, StkId func, int nresults);
+LUAI_FUNC void luaD_call (lua_State *L, StkId func, int nResults,
+ int allowyield);
+LUAI_FUNC int luaD_pcall (lua_State *L, Pfunc func, void *u,
+ ptrdiff_t oldtop, ptrdiff_t ef);
+LUAI_FUNC int luaD_poscall (lua_State *L, StkId firstResult);
+LUAI_FUNC void luaD_reallocstack (lua_State *L, int newsize);
+LUAI_FUNC void luaD_growstack (lua_State *L, int n);
+LUAI_FUNC void luaD_shrinkstack (lua_State *L);
+
+LUAI_FUNC l_noret luaD_throw (lua_State *L, int errcode);
+LUAI_FUNC int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud);
+
+#endif
+/* END CSTYLED */
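
savestack()/restorestack() above record stack positions as byte offsets rather than pointers because luaD_reallocstack() may move the whole TValue array; an offset survives the move, whereas a saved StkId would dangle, which is why ldo.c re-derives 'func' through restorestack() after any call that can grow the stack. A minimal standalone sketch, with a plain int array standing in for the TValue stack (error checks omitted):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* offsets survive a realloc(); raw pointers into the old block may not */
#define savestack_demo(base, p)    ((char *)(p) - (char *)(base))
#define restorestack_demo(base, n) ((int *)((char *)(base) + (n)))

int main (void) {
  int *stack = malloc(4 * sizeof(int));
  int *slot = &stack[2];                          /* a "StkId" into the stack */
  ptrdiff_t saved = savestack_demo(stack, slot);  /* remember it as an offset */
  stack[2] = 7;
  stack = realloc(stack, 1024 * sizeof(int));     /* the stack may move */
  slot = restorestack_demo(stack, saved);         /* re-derive a valid pointer */
  printf("%d\n", *slot);                          /* prints 7 */
  free(stack);
  return 0;
}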
diff --git a/sys/contrib/openzfs/module/lua/lfunc.c b/sys/contrib/openzfs/module/lua/lfunc.c
new file mode 100644
index 000000000000..1a510831259c
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lfunc.c
@@ -0,0 +1,160 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lfunc.c,v 2.30.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions to manipulate prototypes and closures
+** See Copyright Notice in lua.h
+*/
+
+
+#define lfunc_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+
+Closure *luaF_newCclosure (lua_State *L, int n) {
+ Closure *c = &luaC_newobj(L, LUA_TCCL, sizeCclosure(n), NULL, 0)->cl;
+ c->c.nupvalues = cast_byte(n);
+ return c;
+}
+
+
+Closure *luaF_newLclosure (lua_State *L, int n) {
+ Closure *c = &luaC_newobj(L, LUA_TLCL, sizeLclosure(n), NULL, 0)->cl;
+ c->l.p = NULL;
+ c->l.nupvalues = cast_byte(n);
+ while (n--) c->l.upvals[n] = NULL;
+ return c;
+}
+
+
+UpVal *luaF_newupval (lua_State *L) {
+ UpVal *uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), NULL, 0)->uv;
+ uv->v = &uv->u.value;
+ setnilvalue(uv->v);
+ return uv;
+}
+
+
+UpVal *luaF_findupval (lua_State *L, StkId level) {
+ global_State *g = G(L);
+ GCObject **pp = &L->openupval;
+ UpVal *p;
+ UpVal *uv;
+ while (*pp != NULL && (p = gco2uv(*pp))->v >= level) {
+ GCObject *o = obj2gco(p);
+ lua_assert(p->v != &p->u.value);
+ lua_assert(!isold(o) || isold(obj2gco(L)));
+ if (p->v == level) { /* found a corresponding upvalue? */
+ if (isdead(g, o)) /* is it dead? */
+ changewhite(o); /* resurrect it */
+ return p;
+ }
+ pp = &p->next;
+ }
+ /* not found: create a new one */
+ uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), pp, 0)->uv;
+ uv->v = level; /* current value lives in the stack */
+ uv->u.l.prev = &g->uvhead; /* double link it in `uvhead' list */
+ uv->u.l.next = g->uvhead.u.l.next;
+ uv->u.l.next->u.l.prev = uv;
+ g->uvhead.u.l.next = uv;
+ lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv);
+ return uv;
+}
+
+
+static void unlinkupval (UpVal *uv) {
+ lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv);
+ uv->u.l.next->u.l.prev = uv->u.l.prev; /* remove from `uvhead' list */
+ uv->u.l.prev->u.l.next = uv->u.l.next;
+}
+
+
+void luaF_freeupval (lua_State *L, UpVal *uv) {
+ if (uv->v != &uv->u.value) /* is it open? */
+ unlinkupval(uv); /* remove from open list */
+ luaM_free(L, uv); /* free upvalue */
+}
+
+
+void luaF_close (lua_State *L, StkId level) {
+ UpVal *uv;
+ global_State *g = G(L);
+ while (L->openupval != NULL && (uv = gco2uv(L->openupval))->v >= level) {
+ GCObject *o = obj2gco(uv);
+ lua_assert(!isblack(o) && uv->v != &uv->u.value);
+ L->openupval = uv->next; /* remove from `open' list */
+ if (isdead(g, o))
+ luaF_freeupval(L, uv); /* free upvalue */
+ else {
+ unlinkupval(uv); /* remove upvalue from 'uvhead' list */
+ setobj(L, &uv->u.value, uv->v); /* move value to upvalue slot */
+ uv->v = &uv->u.value; /* now current value lives here */
+ gch(o)->next = g->allgc; /* link upvalue into 'allgc' list */
+ g->allgc = o;
+ luaC_checkupvalcolor(g, uv);
+ }
+ }
+}
+
+
+Proto *luaF_newproto (lua_State *L) {
+ Proto *f = &luaC_newobj(L, LUA_TPROTO, sizeof(Proto), NULL, 0)->p;
+ f->k = NULL;
+ f->sizek = 0;
+ f->p = NULL;
+ f->sizep = 0;
+ f->code = NULL;
+ f->cache = NULL;
+ f->sizecode = 0;
+ f->lineinfo = NULL;
+ f->sizelineinfo = 0;
+ f->upvalues = NULL;
+ f->sizeupvalues = 0;
+ f->numparams = 0;
+ f->is_vararg = 0;
+ f->maxstacksize = 0;
+ f->locvars = NULL;
+ f->sizelocvars = 0;
+ f->linedefined = 0;
+ f->lastlinedefined = 0;
+ f->source = NULL;
+ return f;
+}
+
+
+void luaF_freeproto (lua_State *L, Proto *f) {
+ luaM_freearray(L, f->code, f->sizecode);
+ luaM_freearray(L, f->p, f->sizep);
+ luaM_freearray(L, f->k, f->sizek);
+ luaM_freearray(L, f->lineinfo, f->sizelineinfo);
+ luaM_freearray(L, f->locvars, f->sizelocvars);
+ luaM_freearray(L, f->upvalues, f->sizeupvalues);
+ luaM_free(L, f);
+}
+
+
+/*
+** Look for the n-th local variable active at instruction `pc' in function `func'.
+** Returns NULL if not found.
+*/
+const char *luaF_getlocalname (const Proto *f, int local_number, int pc) {
+ int i;
+ for (i = 0; i<f->sizelocvars && f->locvars[i].startpc <= pc; i++) {
+ if (pc < f->locvars[i].endpc) { /* is variable active? */
+ local_number--;
+ if (local_number == 0)
+ return getstr(f->locvars[i].varname);
+ }
+ }
+ return NULL; /* not found */
+}
+/* END CSTYLED */
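
luaF_close() above handles the moment a captured stack slot goes out of scope: while an upvalue is open, uv->v points into the stack; closing copies the value into the upvalue's own storage and repoints uv->v at it, so closures keep a valid reference after the frame unwinds. A minimal standalone sketch of just that open-to-closed switch, using a hypothetical struct rather than Lua's UpVal:

#include <stdio.h>

/* hypothetical stand-in for UpVal: 'v' points either into the stack (open)
   or at the private copy 'value' (closed) */
struct upval_demo {
  int *v;
  int value;
};

static void close_upval (struct upval_demo *uv) {  /* cf. luaF_close() */
  uv->value = *uv->v;    /* move value out of the dying stack slot */
  uv->v = &uv->value;    /* now the current value lives in the upvalue */
}

int main (void) {
  int stack[4] = {0, 0, 11, 0};
  struct upval_demo uv = { &stack[2], 0 };  /* open: points into the stack */
  close_upval(&uv);
  stack[2] = -1;                            /* the frame slot gets reused... */
  printf("%d\n", *uv.v);                    /* ...but this still prints 11 */
  return 0;
}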
diff --git a/sys/contrib/openzfs/module/lua/lfunc.h b/sys/contrib/openzfs/module/lua/lfunc.h
new file mode 100644
index 000000000000..59a4fa75c46e
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lfunc.h
@@ -0,0 +1,35 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lfunc.h,v 2.8.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions to manipulate prototypes and closures
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lfunc_h
+#define lfunc_h
+
+
+#include "lobject.h"
+
+
+#define sizeCclosure(n) (cast(int, sizeof(CClosure)) + \
+ cast(int, sizeof(TValue)*((n)-1)))
+
+#define sizeLclosure(n) (cast(int, sizeof(LClosure)) + \
+ cast(int, sizeof(TValue *)*((n)-1)))
+
+
+LUAI_FUNC Proto *luaF_newproto (lua_State *L);
+LUAI_FUNC Closure *luaF_newCclosure (lua_State *L, int nelems);
+LUAI_FUNC Closure *luaF_newLclosure (lua_State *L, int nelems);
+LUAI_FUNC UpVal *luaF_newupval (lua_State *L);
+LUAI_FUNC UpVal *luaF_findupval (lua_State *L, StkId level);
+LUAI_FUNC void luaF_close (lua_State *L, StkId level);
+LUAI_FUNC void luaF_freeproto (lua_State *L, Proto *f);
+LUAI_FUNC void luaF_freeupval (lua_State *L, UpVal *uv);
+LUAI_FUNC const char *luaF_getlocalname (const Proto *func, int local_number,
+ int pc);
+
+
+#endif
+/* END CSTYLED */
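
sizeCclosure()/sizeLclosure() above size a closure whose upvalue array is declared with one element but allocated with n, hence the (n)-1 in both formulas. A tiny standalone sketch of the same sizing idiom, using a hypothetical struct rather than Lua's CClosure:

#include <stdio.h>
#include <stdlib.h>

struct closure_demo {
  int nupvalues;
  double upvalue[1];   /* one declared element, n allocated (as in CClosure) */
};

/* same formula as sizeCclosure(): base struct + (n-1) extra tail elements */
#define size_closure_demo(n) \
  (sizeof(struct closure_demo) + sizeof(double) * ((n) - 1))

int main (void) {
  int i, n = 3;
  struct closure_demo *c = malloc(size_closure_demo(n));
  c->nupvalues = n;
  for (i = 0; i < n; i++) c->upvalue[i] = i * 1.5;
  printf("%u bytes, last upvalue %.1f\n",
         (unsigned)size_closure_demo(n), c->upvalue[n - 1]);
  free(c);
  return 0;
}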
diff --git a/sys/contrib/openzfs/module/lua/lgc.c b/sys/contrib/openzfs/module/lua/lgc.c
new file mode 100644
index 000000000000..55feb24119d3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lgc.c
@@ -0,0 +1,1218 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lgc.c,v 2.140.1.3 2014/09/01 16:55:08 roberto Exp $
+** Garbage Collector
+** See Copyright Notice in lua.h
+*/
+
+#define lgc_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+
+/*
+** cost of sweeping one element (the size of a small object divided
+** by some adjust for the sweep speed)
+*/
+#define GCSWEEPCOST ((sizeof(TString) + 4) / 4)
+
+/* maximum number of elements to sweep in each single step */
+#define GCSWEEPMAX (cast_int((GCSTEPSIZE / GCSWEEPCOST) / 4))
+
+/* maximum number of finalizers to call in each GC step */
+#define GCFINALIZENUM 4
+
+
+/*
+** macro to adjust 'stepmul': 'stepmul' is actually used like
+** 'stepmul / STEPMULADJ' (value chosen by tests)
+*/
+#define STEPMULADJ 200
+
+
+/*
+** macro to adjust 'pause': 'pause' is actually used like
+** 'pause / PAUSEADJ' (value chosen by tests)
+*/
+#define PAUSEADJ 100
+
+
+/*
+** 'makewhite' erases all color bits plus the old bit and then
+** sets only the current white bit
+*/
+#define maskcolors (~(bit2mask(BLACKBIT, OLDBIT) | WHITEBITS))
+#define makewhite(g,x) \
+ (gch(x)->marked = cast_byte((gch(x)->marked & maskcolors) | luaC_white(g)))
+
+#define white2gray(x) resetbits(gch(x)->marked, WHITEBITS)
+#define black2gray(x) resetbit(gch(x)->marked, BLACKBIT)
+
+
+#define isfinalized(x) testbit(gch(x)->marked, FINALIZEDBIT)
+
+#define checkdeadkey(n) lua_assert(!ttisdeadkey(gkey(n)) || ttisnil(gval(n)))
+
+
+#define checkconsistency(obj) \
+ lua_longassert(!iscollectable(obj) || righttt(obj))
+
+
+#define markvalue(g,o) { checkconsistency(o); \
+ if (valiswhite(o)) reallymarkobject(g,gcvalue(o)); }
+
+#define markobject(g,t) { if ((t) && iswhite(obj2gco(t))) \
+ reallymarkobject(g, obj2gco(t)); }
+
+static void reallymarkobject (global_State *g, GCObject *o);
+
+
+/*
+** {======================================================
+** Generic functions
+** =======================================================
+*/
+
+
+/*
+** one after last element in a hash array
+*/
+#define gnodelast(h) gnode(h, cast(size_t, sizenode(h)))
+
+
+/*
+** link table 'h' into list pointed by 'p'
+*/
+#define linktable(h,p) ((h)->gclist = *(p), *(p) = obj2gco(h))
+
+
+/*
+** if key is not marked, mark its entry as dead (therefore removing it
+** from the table)
+*/
+static void removeentry (Node *n) {
+ lua_assert(ttisnil(gval(n)));
+ if (valiswhite(gkey(n)))
+ setdeadvalue(gkey(n)); /* unused and unmarked key; remove it */
+}
+
+
+/*
+** tells whether a key or value can be cleared from a weak
+** table. Non-collectable objects are never removed from weak
+** tables. Strings behave as `values', so they are never removed either. For
+** other objects: if really collected, cannot keep them; for objects
+** being finalized, keep them in keys, but not in values
+*/
+static int iscleared (global_State *g, const TValue *o) {
+ if (!iscollectable(o)) return 0;
+ else if (ttisstring(o)) {
+ markobject(g, rawtsvalue(o)); /* strings are `values', so are never weak */
+ return 0;
+ }
+ else return iswhite(gcvalue(o));
+}
+
+
+/*
+** barrier that moves collector forward, that is, mark the white object
+** being pointed by a black object.
+*/
+void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v) {
+ global_State *g = G(L);
+ lua_assert(isblack(o) && iswhite(v) && !isdead(g, v) && !isdead(g, o));
+ lua_assert(g->gcstate != GCSpause);
+ lua_assert(gch(o)->tt != LUA_TTABLE);
+ if (keepinvariantout(g)) /* must keep invariant? */
+ reallymarkobject(g, v); /* restore invariant */
+ else { /* sweep phase */
+ lua_assert(issweepphase(g));
+ makewhite(g, o); /* mark main obj. as white to avoid other barriers */
+ }
+}
+
+
+/*
+** barrier that moves collector backward, that is, mark the black object
+** pointing to a white object as gray again. (Current implementation
+** only works for tables; access to 'gclist' is not uniform across
+** different types.)
+*/
+void luaC_barrierback_ (lua_State *L, GCObject *o) {
+ global_State *g = G(L);
+ lua_assert(isblack(o) && !isdead(g, o) && gch(o)->tt == LUA_TTABLE);
+ black2gray(o); /* make object gray (again) */
+ gco2t(o)->gclist = g->grayagain;
+ g->grayagain = o;
+}
+
+
+/*
+** barrier for prototypes. When creating first closure (cache is
+** NULL), use a forward barrier; this may be the only closure of the
+** prototype (if it is a "regular" function, with a single instance)
+** and the prototype may be big, so it is better to avoid traversing
+** it again. Otherwise, use a backward barrier, to avoid marking all
+** possible instances.
+*/
+LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c) {
+ global_State *g = G(L);
+ lua_assert(isblack(obj2gco(p)));
+ if (p->cache == NULL) { /* first time? */
+ luaC_objbarrier(L, p, c);
+ }
+ else { /* use a backward barrier */
+ black2gray(obj2gco(p)); /* make prototype gray (again) */
+ p->gclist = g->grayagain;
+ g->grayagain = obj2gco(p);
+ }
+}
+
+
+/*
+** check color (and invariants) for an upvalue that was closed,
+** i.e., moved into the 'allgc' list
+*/
+void luaC_checkupvalcolor (global_State *g, UpVal *uv) {
+ GCObject *o = obj2gco(uv);
+ lua_assert(!isblack(o)); /* open upvalues are never black */
+ if (isgray(o)) {
+ if (keepinvariant(g)) {
+ resetoldbit(o); /* see MOVE OLD rule */
+ gray2black(o); /* it is being visited now */
+ markvalue(g, uv->v);
+ }
+ else {
+ lua_assert(issweepphase(g));
+ makewhite(g, o);
+ }
+ }
+}
+
+
+/*
+** create a new collectable object (with given type and size) and link
+** it to '*list'. 'offset' tells how many bytes to allocate before the
+** object itself (used only by states).
+*/
+GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, GCObject **list,
+ int offset) {
+ global_State *g = G(L);
+ char *raw = cast(char *, luaM_newobject(L, novariant(tt), sz));
+ GCObject *o = obj2gco(raw + offset);
+ if (list == NULL)
+ list = &g->allgc; /* standard list for collectable objects */
+ gch(o)->marked = luaC_white(g);
+ gch(o)->tt = tt;
+ gch(o)->next = *list;
+ *list = o;
+ return o;
+}
+
+/* }====================================================== */
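
/*
** A minimal standalone sketch, assuming a hosted C environment and a
** made-up object type rather than Lua's GCObject, of the choice the two
** barriers above implement: when a black parent is made to reference a
** white child, either mark the child right away (forward barrier) or turn
** the parent gray again so it will be rescanned (backward barrier, what
** Lua does for tables).
*/
#include <stdio.h>

enum color_demo { WHITE, GRAY, BLACK };

struct obj_demo {
  enum color_demo color;
  const char *name;
};

/* forward barrier: keep the invariant by marking the white child now */
static void barrier_forward_demo (struct obj_demo *child) {
  if (child->color == WHITE) child->color = GRAY;
}

/* backward barrier: give the parent back to the collector for rescanning */
static void barrier_back_demo (struct obj_demo *parent) {
  if (parent->color == BLACK) parent->color = GRAY;
}

int main (void) {
  struct obj_demo table = { BLACK, "table" };
  struct obj_demo value = { WHITE, "value" };
  barrier_back_demo(&table);      /* what Lua does for a table write */
  barrier_forward_demo(&value);   /* what other object types do instead */
  printf("%s color=%d, %s color=%d\n",
         table.name, (int)table.color, value.name, (int)value.color);
  return 0;
}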
+
+
+
+/*
+** {======================================================
+** Mark functions
+** =======================================================
+*/
+
+
+/*
+** mark an object. Userdata, strings, and closed upvalues are visited
+** and turned black here. Other objects are marked gray and added
+** to appropriate list to be visited (and turned black) later. (Open
+** upvalues are already linked in 'headuv' list.)
+*/
+static void reallymarkobject (global_State *g, GCObject *o) {
+ lu_mem size;
+ white2gray(o);
+ switch (gch(o)->tt) {
+ case LUA_TSHRSTR:
+ case LUA_TLNGSTR: {
+ size = sizestring(gco2ts(o));
+ break; /* nothing else to mark; make it black */
+ }
+ case LUA_TUSERDATA: {
+ Table *mt = gco2u(o)->metatable;
+ markobject(g, mt);
+ markobject(g, gco2u(o)->env);
+ size = sizeudata(gco2u(o));
+ break;
+ }
+ case LUA_TUPVAL: {
+ UpVal *uv = gco2uv(o);
+ markvalue(g, uv->v);
+ if (uv->v != &uv->u.value) /* open? */
+ return; /* open upvalues remain gray */
+ size = sizeof(UpVal);
+ break;
+ }
+ case LUA_TLCL: {
+ gco2lcl(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ case LUA_TCCL: {
+ gco2ccl(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ case LUA_TTABLE: {
+ linktable(gco2t(o), &g->gray);
+ return;
+ }
+ case LUA_TTHREAD: {
+ gco2th(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ case LUA_TPROTO: {
+ gco2p(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ default: lua_assert(0); return;
+ }
+ gray2black(o);
+ g->GCmemtrav += size;
+}
+
+
+/*
+** mark metamethods for basic types
+*/
+static void markmt (global_State *g) {
+ int i;
+ for (i=0; i < LUA_NUMTAGS; i++)
+ markobject(g, g->mt[i]);
+}
+
+
+/*
+** mark all objects in the list of objects being finalized
+*/
+static void markbeingfnz (global_State *g) {
+ GCObject *o;
+ for (o = g->tobefnz; o != NULL; o = gch(o)->next) {
+ makewhite(g, o);
+ reallymarkobject(g, o);
+ }
+}
+
+
+/*
+** mark all values stored in marked open upvalues. (See comment in
+** 'lstate.h'.)
+*/
+static void remarkupvals (global_State *g) {
+ UpVal *uv;
+ for (uv = g->uvhead.u.l.next; uv != &g->uvhead; uv = uv->u.l.next) {
+ if (isgray(obj2gco(uv)))
+ markvalue(g, uv->v);
+ }
+}
+
+
+/*
+** mark root set and reset all gray lists, to start a new
+** incremental (or full) collection
+*/
+static void restartcollection (global_State *g) {
+ g->gray = g->grayagain = NULL;
+ g->weak = g->allweak = g->ephemeron = NULL;
+ markobject(g, g->mainthread);
+ markvalue(g, &g->l_registry);
+ markmt(g);
+ markbeingfnz(g); /* mark any finalizing object left from previous cycle */
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Traverse functions
+** =======================================================
+*/
+
+static void traverseweakvalue (global_State *g, Table *h) {
+ Node *n, *limit = gnodelast(h);
+ /* if there is array part, assume it may have white values (do not
+ traverse it just to check) */
+ int hasclears = (h->sizearray > 0);
+ for (n = gnode(h, 0); n < limit; n++) {
+ checkdeadkey(n);
+ if (ttisnil(gval(n))) /* entry is empty? */
+ removeentry(n); /* remove it */
+ else {
+ lua_assert(!ttisnil(gkey(n)));
+ markvalue(g, gkey(n)); /* mark key */
+ if (!hasclears && iscleared(g, gval(n))) /* is there a white value? */
+ hasclears = 1; /* table will have to be cleared */
+ }
+ }
+ if (hasclears)
+ linktable(h, &g->weak); /* has to be cleared later */
+ else /* no white values */
+ linktable(h, &g->grayagain); /* no need to clean */
+}
+
+
+static int traverseephemeron (global_State *g, Table *h) {
+ int marked = 0; /* true if an object is marked in this traversal */
+ int hasclears = 0; /* true if table has white keys */
+ int prop = 0; /* true if table has entry "white-key -> white-value" */
+ Node *n, *limit = gnodelast(h);
+ int i;
+ /* traverse array part (numeric keys are 'strong') */
+ for (i = 0; i < h->sizearray; i++) {
+ if (valiswhite(&h->array[i])) {
+ marked = 1;
+ reallymarkobject(g, gcvalue(&h->array[i]));
+ }
+ }
+ /* traverse hash part */
+ for (n = gnode(h, 0); n < limit; n++) {
+ checkdeadkey(n);
+ if (ttisnil(gval(n))) /* entry is empty? */
+ removeentry(n); /* remove it */
+ else if (iscleared(g, gkey(n))) { /* key is not marked (yet)? */
+ hasclears = 1; /* table must be cleared */
+ if (valiswhite(gval(n))) /* value not marked yet? */
+ prop = 1; /* must propagate again */
+ }
+ else if (valiswhite(gval(n))) { /* value not marked yet? */
+ marked = 1;
+ reallymarkobject(g, gcvalue(gval(n))); /* mark it now */
+ }
+ }
+ if (g->gcstate != GCSatomic || prop)
+ linktable(h, &g->ephemeron); /* have to propagate again */
+ else if (hasclears) /* does table have white keys? */
+ linktable(h, &g->allweak); /* may have to clean white keys */
+ else /* no white keys */
+ linktable(h, &g->grayagain); /* no need to clean */
+ return marked;
+}
+
+
+static void traversestrongtable (global_State *g, Table *h) {
+ Node *n, *limit = gnodelast(h);
+ int i;
+ for (i = 0; i < h->sizearray; i++) /* traverse array part */
+ markvalue(g, &h->array[i]);
+ for (n = gnode(h, 0); n < limit; n++) { /* traverse hash part */
+ checkdeadkey(n);
+ if (ttisnil(gval(n))) /* entry is empty? */
+ removeentry(n); /* remove it */
+ else {
+ lua_assert(!ttisnil(gkey(n)));
+ markvalue(g, gkey(n)); /* mark key */
+ markvalue(g, gval(n)); /* mark value */
+ }
+ }
+}
+
+
+static lu_mem traversetable (global_State *g, Table *h) {
+ const char *weakkey, *weakvalue;
+ const TValue *mode = gfasttm(g, h->metatable, TM_MODE);
+ markobject(g, h->metatable);
+ if (mode && ttisstring(mode) && /* is there a weak mode? */
+ ((weakkey = strchr(svalue(mode), 'k')),
+ (weakvalue = strchr(svalue(mode), 'v')),
+ (weakkey || weakvalue))) { /* is really weak? */
+ black2gray(obj2gco(h)); /* keep table gray */
+ if (!weakkey) /* strong keys? */
+ traverseweakvalue(g, h);
+ else if (!weakvalue) /* strong values? */
+ traverseephemeron(g, h);
+ else /* all weak */
+ linktable(h, &g->allweak); /* nothing to traverse now */
+ }
+ else /* not weak */
+ traversestrongtable(g, h);
+ return sizeof(Table) + sizeof(TValue) * h->sizearray +
+ sizeof(Node) * cast(size_t, sizenode(h));
+}
+
+
+static int traverseproto (global_State *g, Proto *f) {
+ int i;
+ if (f->cache && iswhite(obj2gco(f->cache)))
+ f->cache = NULL; /* allow cache to be collected */
+ markobject(g, f->source);
+ for (i = 0; i < f->sizek; i++) /* mark literals */
+ markvalue(g, &f->k[i]);
+ for (i = 0; i < f->sizeupvalues; i++) /* mark upvalue names */
+ markobject(g, f->upvalues[i].name);
+ for (i = 0; i < f->sizep; i++) /* mark nested protos */
+ markobject(g, f->p[i]);
+ for (i = 0; i < f->sizelocvars; i++) /* mark local-variable names */
+ markobject(g, f->locvars[i].varname);
+ return sizeof(Proto) + sizeof(Instruction) * f->sizecode +
+ sizeof(Proto *) * f->sizep +
+ sizeof(TValue) * f->sizek +
+ sizeof(int) * f->sizelineinfo +
+ sizeof(LocVar) * f->sizelocvars +
+ sizeof(Upvaldesc) * f->sizeupvalues;
+}
+
+
+static lu_mem traverseCclosure (global_State *g, CClosure *cl) {
+ int i;
+ for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */
+ markvalue(g, &cl->upvalue[i]);
+ return sizeCclosure(cl->nupvalues);
+}
+
+static lu_mem traverseLclosure (global_State *g, LClosure *cl) {
+ int i;
+ markobject(g, cl->p); /* mark its prototype */
+ for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */
+ markobject(g, cl->upvals[i]);
+ return sizeLclosure(cl->nupvalues);
+}
+
+
+static lu_mem traversestack (global_State *g, lua_State *th) {
+ int n = 0;
+ StkId o = th->stack;
+ if (o == NULL)
+ return 1; /* stack not completely built yet */
+ for (; o < th->top; o++) /* mark live elements in the stack */
+ markvalue(g, o);
+ if (g->gcstate == GCSatomic) { /* final traversal? */
+ StkId lim = th->stack + th->stacksize; /* real end of stack */
+ for (; o < lim; o++) /* clear not-marked stack slice */
+ setnilvalue(o);
+ }
+ else { /* count call infos to compute size */
+ CallInfo *ci;
+ for (ci = &th->base_ci; ci != th->ci; ci = ci->next)
+ n++;
+ }
+ return sizeof(lua_State) + sizeof(TValue) * th->stacksize +
+ sizeof(CallInfo) * n;
+}
+
+
+/*
+** traverse one gray object, turning it to black (except for threads,
+** which are always gray).
+*/
+static void propagatemark (global_State *g) {
+ lu_mem size;
+ GCObject *o = g->gray;
+ lua_assert(isgray(o));
+ gray2black(o);
+ switch (gch(o)->tt) {
+ case LUA_TTABLE: {
+ Table *h = gco2t(o);
+ g->gray = h->gclist; /* remove from 'gray' list */
+ size = traversetable(g, h);
+ break;
+ }
+ case LUA_TLCL: {
+ LClosure *cl = gco2lcl(o);
+ g->gray = cl->gclist; /* remove from 'gray' list */
+ size = traverseLclosure(g, cl);
+ break;
+ }
+ case LUA_TCCL: {
+ CClosure *cl = gco2ccl(o);
+ g->gray = cl->gclist; /* remove from 'gray' list */
+ size = traverseCclosure(g, cl);
+ break;
+ }
+ case LUA_TTHREAD: {
+ lua_State *th = gco2th(o);
+ g->gray = th->gclist; /* remove from 'gray' list */
+ th->gclist = g->grayagain;
+ g->grayagain = o; /* insert into 'grayagain' list */
+ black2gray(o);
+ size = traversestack(g, th);
+ break;
+ }
+ case LUA_TPROTO: {
+ Proto *p = gco2p(o);
+ g->gray = p->gclist; /* remove from 'gray' list */
+ size = traverseproto(g, p);
+ break;
+ }
+ default: lua_assert(0); return;
+ }
+ g->GCmemtrav += size;
+}
+
+
+static void propagateall (global_State *g) {
+ while (g->gray) propagatemark(g);
+}
+
+
+static void propagatelist (global_State *g, GCObject *l) {
+ lua_assert(g->gray == NULL); /* no grays left */
+ g->gray = l;
+ propagateall(g); /* traverse all elements from 'l' */
+}
+
+/*
+** retraverse all gray lists. Because tables may be reinserted in other
+** lists when traversed, traverse the original lists to avoid traversing
+** the same table twice (which is not wrong, but inefficient)
+*/
+static void retraversegrays (global_State *g) {
+ GCObject *weak = g->weak; /* save original lists */
+ GCObject *grayagain = g->grayagain;
+ GCObject *ephemeron = g->ephemeron;
+ g->weak = g->grayagain = g->ephemeron = NULL;
+ propagateall(g); /* traverse main gray list */
+ propagatelist(g, grayagain);
+ propagatelist(g, weak);
+ propagatelist(g, ephemeron);
+}
+
+
+static void convergeephemerons (global_State *g) {
+ int changed;
+ do {
+ GCObject *w;
+ GCObject *next = g->ephemeron; /* get ephemeron list */
+ g->ephemeron = NULL; /* tables will return to this list when traversed */
+ changed = 0;
+ while ((w = next) != NULL) {
+ next = gco2t(w)->gclist;
+ if (traverseephemeron(g, gco2t(w))) { /* traverse marked some value? */
+ propagateall(g); /* propagate changes */
+ changed = 1; /* will have to revisit all ephemeron tables */
+ }
+ }
+ } while (changed);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Sweep Functions
+** =======================================================
+*/
+
+
+/*
+** clear entries with unmarked keys from all weaktables in list 'l' up
+** to element 'f'
+*/
+static void clearkeys (global_State *g, GCObject *l, GCObject *f) {
+ for (; l != f; l = gco2t(l)->gclist) {
+ Table *h = gco2t(l);
+ Node *n, *limit = gnodelast(h);
+ for (n = gnode(h, 0); n < limit; n++) {
+ if (!ttisnil(gval(n)) && (iscleared(g, gkey(n)))) {
+ setnilvalue(gval(n)); /* remove value ... */
+ removeentry(n); /* and remove entry from table */
+ }
+ }
+ }
+}
+
+
+/*
+** clear entries with unmarked values from all weaktables in list 'l' up
+** to element 'f'
+*/
+static void clearvalues (global_State *g, GCObject *l, GCObject *f) {
+ for (; l != f; l = gco2t(l)->gclist) {
+ Table *h = gco2t(l);
+ Node *n, *limit = gnodelast(h);
+ int i;
+ for (i = 0; i < h->sizearray; i++) {
+ TValue *o = &h->array[i];
+ if (iscleared(g, o)) /* value was collected? */
+ setnilvalue(o); /* remove value */
+ }
+ for (n = gnode(h, 0); n < limit; n++) {
+ if (!ttisnil(gval(n)) && iscleared(g, gval(n))) {
+ setnilvalue(gval(n)); /* remove value ... */
+ removeentry(n); /* and remove entry from table */
+ }
+ }
+ }
+}
+
+
+static void freeobj (lua_State *L, GCObject *o) {
+ switch (gch(o)->tt) {
+ case LUA_TPROTO: luaF_freeproto(L, gco2p(o)); break;
+ case LUA_TLCL: {
+ luaM_freemem(L, o, sizeLclosure(gco2lcl(o)->nupvalues));
+ break;
+ }
+ case LUA_TCCL: {
+ luaM_freemem(L, o, sizeCclosure(gco2ccl(o)->nupvalues));
+ break;
+ }
+ case LUA_TUPVAL: luaF_freeupval(L, gco2uv(o)); break;
+ case LUA_TTABLE: luaH_free(L, gco2t(o)); break;
+ case LUA_TTHREAD: luaE_freethread(L, gco2th(o)); break;
+ case LUA_TUSERDATA: luaM_freemem(L, o, sizeudata(gco2u(o))); break;
+ case LUA_TSHRSTR:
+ G(L)->strt.nuse--;
+ /* FALLTHROUGH */
+ case LUA_TLNGSTR: {
+ luaM_freemem(L, o, sizestring(gco2ts(o)));
+ break;
+ }
+ default: lua_assert(0);
+ }
+}
+
+
+#define sweepwholelist(L,p) sweeplist(L,p,MAX_LUMEM)
+static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count);
+
+
+/*
+** sweep the (open) upvalues of a thread and resize its stack and
+** list of call-info structures.
+*/
+static void sweepthread (lua_State *L, lua_State *L1) {
+ if (L1->stack == NULL) return; /* stack not completely built yet */
+ sweepwholelist(L, &L1->openupval); /* sweep open upvalues */
+ luaE_freeCI(L1); /* free extra CallInfo slots */
+ /* should not change the stack during an emergency gc cycle */
+ if (G(L)->gckind != KGC_EMERGENCY)
+ luaD_shrinkstack(L1);
+}
+
+
+/*
+** sweep at most 'count' elements from a list of GCObjects erasing dead
+** objects, where a dead (not alive) object is one marked with the "old"
+** (non current) white and not fixed.
+** In non-generational mode, change all non-dead objects back to white,
+** preparing for next collection cycle.
+** In generational mode, keep black objects black, and also mark them as
+** old; stop when hitting an old object, as all objects after that
+** one will be old too.
+** When object is a thread, sweep its list of open upvalues too.
+*/
+static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count) {
+ global_State *g = G(L);
+ int ow = otherwhite(g);
+ int toclear, toset; /* bits to clear and to set in all live objects */
+ int tostop; /* stop sweep when this is true */
+ if (isgenerational(g)) { /* generational mode? */
+ toclear = ~0; /* clear nothing */
+ toset = bitmask(OLDBIT); /* set the old bit of all surviving objects */
+ tostop = bitmask(OLDBIT); /* do not sweep old generation */
+ }
+ else { /* normal mode */
+ toclear = maskcolors; /* clear all color bits + old bit */
+ toset = luaC_white(g); /* make object white */
+ tostop = 0; /* do not stop */
+ }
+ while (*p != NULL && count-- > 0) {
+ GCObject *curr = *p;
+ int marked = gch(curr)->marked;
+ if (isdeadm(ow, marked)) { /* is 'curr' dead? */
+ *p = gch(curr)->next; /* remove 'curr' from list */
+ freeobj(L, curr); /* erase 'curr' */
+ }
+ else {
+ if (testbits(marked, tostop))
+ return NULL; /* stop sweeping this list */
+ if (gch(curr)->tt == LUA_TTHREAD)
+ sweepthread(L, gco2th(curr)); /* sweep thread's upvalues */
+ /* update marks */
+ gch(curr)->marked = cast_byte((marked & toclear) | toset);
+ p = &gch(curr)->next; /* go to next element */
+ }
+ }
+ return (*p == NULL) ? NULL : p;
+}
+
+
+/*
+** sweep a list until a live object (or end of list)
+*/
+static GCObject **sweeptolive (lua_State *L, GCObject **p, int *n) {
+ GCObject ** old = p;
+ int i = 0;
+ do {
+ i++;
+ p = sweeplist(L, p, 1);
+ } while (p == old);
+ if (n) *n += i;
+ return p;
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Finalization
+** =======================================================
+*/
+
+static void checkSizes (lua_State *L) {
+ global_State *g = G(L);
+ if (g->gckind != KGC_EMERGENCY) { /* do not change sizes in emergency */
+ int hs = g->strt.size / 2; /* half the size of the string table */
+ if (g->strt.nuse < cast(lu_int32, hs)) /* using less than that half? */
+ luaS_resize(L, hs); /* halve its size */
+ luaZ_freebuffer(L, &g->buff); /* free concatenation buffer */
+ }
+}
+
+
+static GCObject *udata2finalize (global_State *g) {
+ GCObject *o = g->tobefnz; /* get first element */
+ lua_assert(isfinalized(o));
+ g->tobefnz = gch(o)->next; /* remove it from 'tobefnz' list */
+ gch(o)->next = g->allgc; /* return it to 'allgc' list */
+ g->allgc = o;
+ resetbit(gch(o)->marked, SEPARATED); /* mark that it is not in 'tobefnz' */
+ lua_assert(!isold(o)); /* see MOVE OLD rule */
+ if (!keepinvariantout(g)) /* not keeping invariant? */
+ makewhite(g, o); /* "sweep" object */
+ return o;
+}
+
+
+static void dothecall (lua_State *L, void *ud) {
+ UNUSED(ud);
+ luaD_call(L, L->top - 2, 0, 0);
+}
+
+
+static void GCTM (lua_State *L, int propagateerrors) {
+ global_State *g = G(L);
+ const TValue *tm;
+ TValue v;
+ setgcovalue(L, &v, udata2finalize(g));
+ tm = luaT_gettmbyobj(L, &v, TM_GC);
+ if (tm != NULL && ttisfunction(tm)) { /* is there a finalizer? */
+ int status;
+ lu_byte oldah = L->allowhook;
+ int running = g->gcrunning;
+ L->allowhook = 0; /* stop debug hooks during GC metamethod */
+ g->gcrunning = 0; /* avoid GC steps */
+ setobj2s(L, L->top, tm); /* push finalizer... */
+ setobj2s(L, L->top + 1, &v); /* ... and its argument */
+ L->top += 2; /* and (next line) call the finalizer */
+ status = luaD_pcall(L, dothecall, NULL, savestack(L, L->top - 2), 0);
+ L->allowhook = oldah; /* restore hooks */
+ g->gcrunning = running; /* restore state */
+ if (status != LUA_OK && propagateerrors) { /* error while running __gc? */
+ if (status == LUA_ERRRUN) { /* is there an error object? */
+ const char *msg = (ttisstring(L->top - 1))
+ ? svalue(L->top - 1)
+ : "no message";
+ luaO_pushfstring(L, "error in __gc metamethod (%s)", msg);
+ status = LUA_ERRGCMM; /* error in __gc metamethod */
+ }
+ luaD_throw(L, status); /* re-throw error */
+ }
+ }
+}
+
+
+/*
+** move all unreachable objects (or 'all' objects) that need
+** finalization from list 'finobj' to list 'tobefnz' (to be finalized)
+*/
+static void separatetobefnz (lua_State *L, int all) {
+ global_State *g = G(L);
+ GCObject **p = &g->finobj;
+ GCObject *curr;
+ GCObject **lastnext = &g->tobefnz;
+ /* find last 'next' field in 'tobefnz' list (to add elements in its end) */
+ while (*lastnext != NULL)
+ lastnext = &gch(*lastnext)->next;
+ while ((curr = *p) != NULL) { /* traverse all finalizable objects */
+ lua_assert(!isfinalized(curr));
+ lua_assert(testbit(gch(curr)->marked, SEPARATED));
+ if (!(iswhite(curr) || all)) /* not being collected? */
+ p = &gch(curr)->next; /* don't bother with it */
+ else {
+ l_setbit(gch(curr)->marked, FINALIZEDBIT); /* won't be finalized again */
+ *p = gch(curr)->next; /* remove 'curr' from 'finobj' list */
+ gch(curr)->next = *lastnext; /* link at the end of 'tobefnz' list */
+ *lastnext = curr;
+ lastnext = &gch(curr)->next;
+ }
+ }
+}
+
+
+/*
+** if object 'o' has a finalizer, remove it from 'allgc' list (must
+** search the list to find it) and link it in 'finobj' list.
+*/
+void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt) {
+ global_State *g = G(L);
+ if (testbit(gch(o)->marked, SEPARATED) || /* obj. is already separated... */
+ isfinalized(o) || /* ... or is finalized... */
+ gfasttm(g, mt, TM_GC) == NULL) /* or has no finalizer? */
+ return; /* nothing to be done */
+ else { /* move 'o' to 'finobj' list */
+ GCObject **p;
+ GCheader *ho = gch(o);
+ if (g->sweepgc == &ho->next) { /* avoid removing current sweep object */
+ lua_assert(issweepphase(g));
+ g->sweepgc = sweeptolive(L, g->sweepgc, NULL);
+ }
+ /* search for pointer pointing to 'o' */
+ for (p = &g->allgc; *p != o; p = &gch(*p)->next) { /* empty */ }
+ *p = ho->next; /* remove 'o' from root list */
+ ho->next = g->finobj; /* link it in list 'finobj' */
+ g->finobj = o;
+ l_setbit(ho->marked, SEPARATED); /* mark it as such */
+ if (!keepinvariantout(g)) /* not keeping invariant? */
+ makewhite(g, o); /* "sweep" object */
+ else
+ resetoldbit(o); /* see MOVE OLD rule */
+ }
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** GC control
+** =======================================================
+*/
+
+
+/*
+** set a reasonable "time" to wait before starting a new GC cycle;
+** cycle will start when memory use hits threshold
+*/
+static void setpause (global_State *g, l_mem estimate) {
+ l_mem debt, threshold;
+ estimate = estimate / PAUSEADJ; /* adjust 'estimate' */
+ threshold = (g->gcpause < MAX_LMEM / estimate) /* overflow? */
+ ? estimate * g->gcpause /* no overflow */
+ : MAX_LMEM; /* overflow; truncate to maximum */
+ debt = -cast(l_mem, threshold - gettotalbytes(g));
+ luaE_setdebt(g, debt);
+}
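For a concrete sense of the arithmetic above: setpause() turns the post-collection estimate into a threshold of roughly (gcpause/100) times the memory still in use, then records the (negative) debt so the next cycle starts once that much new memory has been allocated. Below is a minimal standalone sketch of the same computation, assuming PAUSEADJ == 100 and the default gcpause of 200 (both values are defined outside this excerpt); next_threshold is an invented helper for illustration.

/*
 * Illustrative sketch only: mirrors the pause arithmetic with the
 * assumed defaults PAUSEADJ == 100 and gcpause == 200 (i.e. 200%).
 */
#include <stdio.h>

static long next_threshold(long estimate, int gcpause) {
    long adjusted = estimate / 100;     /* estimate / PAUSEADJ */
    return adjusted * gcpause;          /* overflow clamp omitted */
}

int main(void) {
    long inuse = 1000000;               /* bytes still in use after a cycle */
    long threshold = next_threshold(inuse, 200);
    long debt = inuse - threshold;      /* what luaE_setdebt() would receive */
    printf("threshold=%ld debt=%ld\n", threshold, debt);  /* 2000000, -1000000 */
    return 0;
}

So with the defaults, a cycle that ends with about 1 MB in use leaves the collector paused until roughly another 1 MB has been allocated.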
+
+
+#define sweepphases \
+ (bitmask(GCSsweepstring) | bitmask(GCSsweepudata) | bitmask(GCSsweep))
+
+
+/*
+** enter first sweep phase (strings) and prepare pointers for other
+** sweep phases. The calls to 'sweeptolive' make pointers point to an
+** object inside the list (instead of to the header), so that the real
+** sweep does not need to skip objects created between "now" and the start
+** of the real sweep.
+** Returns how many objects it swept.
+*/
+static int entersweep (lua_State *L) {
+ global_State *g = G(L);
+ int n = 0;
+ g->gcstate = GCSsweepstring;
+ lua_assert(g->sweepgc == NULL && g->sweepfin == NULL);
+ /* prepare to sweep strings, finalizable objects, and regular objects */
+ g->sweepstrgc = 0;
+ g->sweepfin = sweeptolive(L, &g->finobj, &n);
+ g->sweepgc = sweeptolive(L, &g->allgc, &n);
+ return n;
+}
+
+
+/*
+** change GC mode
+*/
+void luaC_changemode (lua_State *L, int mode) {
+ global_State *g = G(L);
+ if (mode == g->gckind) return; /* nothing to change */
+ if (mode == KGC_GEN) { /* change to generational mode */
+ /* make sure gray lists are consistent */
+ luaC_runtilstate(L, bitmask(GCSpropagate));
+ g->GCestimate = gettotalbytes(g);
+ g->gckind = KGC_GEN;
+ }
+ else { /* change to incremental mode */
+ /* sweep all objects to turn them back to white
+ (as white has not changed, nothing extra will be collected) */
+ g->gckind = KGC_NORMAL;
+ entersweep(L);
+ luaC_runtilstate(L, ~sweepphases);
+ }
+}
+
+
+/*
+** call all pending finalizers
+*/
+static void callallpendingfinalizers (lua_State *L, int propagateerrors) {
+ global_State *g = G(L);
+ while (g->tobefnz) {
+ resetoldbit(g->tobefnz);
+ GCTM(L, propagateerrors);
+ }
+}
+
+
+void luaC_freeallobjects (lua_State *L) {
+ global_State *g = G(L);
+ int i;
+ separatetobefnz(L, 1); /* separate all objects with finalizers */
+ lua_assert(g->finobj == NULL);
+ callallpendingfinalizers(L, 0);
+ g->currentwhite = WHITEBITS; /* this "white" makes all objects look dead */
+ g->gckind = KGC_NORMAL;
+ sweepwholelist(L, &g->finobj); /* finalizers can create objs. in 'finobj' */
+ sweepwholelist(L, &g->allgc);
+ for (i = 0; i < g->strt.size; i++) /* free all string lists */
+ sweepwholelist(L, &g->strt.hash[i]);
+ lua_assert(g->strt.nuse == 0);
+}
+
+
+static l_mem atomic (lua_State *L) {
+ global_State *g = G(L);
+ l_mem work = -cast(l_mem, g->GCmemtrav); /* start counting work */
+ GCObject *origweak, *origall;
+ lua_assert(!iswhite(obj2gco(g->mainthread)));
+ markobject(g, L); /* mark running thread */
+ /* registry and global metatables may be changed by API */
+ markvalue(g, &g->l_registry);
+ markmt(g); /* mark basic metatables */
+ /* remark occasional upvalues of (maybe) dead threads */
+ remarkupvals(g);
+ propagateall(g); /* propagate changes */
+ work += g->GCmemtrav; /* stop counting (do not (re)count grays) */
+ /* traverse objects caught by write barrier and by 'remarkupvals' */
+ retraversegrays(g);
+ work -= g->GCmemtrav; /* restart counting */
+ convergeephemerons(g);
+ /* at this point, all strongly accessible objects are marked. */
+ /* clear values from weak tables, before checking finalizers */
+ clearvalues(g, g->weak, NULL);
+ clearvalues(g, g->allweak, NULL);
+ origweak = g->weak; origall = g->allweak;
+ work += g->GCmemtrav; /* stop counting (objects being finalized) */
+ separatetobefnz(L, 0); /* separate objects to be finalized */
+ markbeingfnz(g); /* mark objects that will be finalized */
+ propagateall(g); /* remark, to propagate `preserveness' */
+ work -= g->GCmemtrav; /* restart counting */
+ convergeephemerons(g);
+ /* at this point, all resurrected objects are marked. */
+ /* remove dead objects from weak tables */
+ clearkeys(g, g->ephemeron, NULL); /* clear keys from all ephemeron tables */
+ clearkeys(g, g->allweak, NULL); /* clear keys from all allweak tables */
+ /* clear values from resurrected weak tables */
+ clearvalues(g, g->weak, origweak);
+ clearvalues(g, g->allweak, origall);
+ g->currentwhite = cast_byte(otherwhite(g)); /* flip current white */
+ work += g->GCmemtrav; /* complete counting */
+ return work; /* estimate of memory marked by 'atomic' */
+}
+
+
+static lu_mem singlestep (lua_State *L) {
+ global_State *g = G(L);
+ switch (g->gcstate) {
+ case GCSpause: {
+ /* start to count memory traversed */
+ g->GCmemtrav = g->strt.size * sizeof(GCObject*);
+ lua_assert(!isgenerational(g));
+ restartcollection(g);
+ g->gcstate = GCSpropagate;
+ return g->GCmemtrav;
+ }
+ case GCSpropagate: {
+ if (g->gray) {
+ lu_mem oldtrav = g->GCmemtrav;
+ propagatemark(g);
+ return g->GCmemtrav - oldtrav; /* memory traversed in this step */
+ }
+ else { /* no more `gray' objects */
+ lu_mem work;
+ int sw;
+ g->gcstate = GCSatomic; /* finish mark phase */
+ g->GCestimate = g->GCmemtrav; /* save what was counted */;
+ work = atomic(L); /* add what was traversed by 'atomic' */
+ g->GCestimate += work; /* estimate of total memory traversed */
+ sw = entersweep(L);
+ return work + sw * GCSWEEPCOST;
+ }
+ }
+ case GCSsweepstring: {
+ int i;
+ for (i = 0; i < GCSWEEPMAX && g->sweepstrgc + i < g->strt.size; i++)
+ sweepwholelist(L, &g->strt.hash[g->sweepstrgc + i]);
+ g->sweepstrgc += i;
+ if (g->sweepstrgc >= g->strt.size) /* no more strings to sweep? */
+ g->gcstate = GCSsweepudata;
+ return i * GCSWEEPCOST;
+ }
+ case GCSsweepudata: {
+ if (g->sweepfin) {
+ g->sweepfin = sweeplist(L, g->sweepfin, GCSWEEPMAX);
+ return GCSWEEPMAX*GCSWEEPCOST;
+ }
+ else {
+ g->gcstate = GCSsweep;
+ return 0;
+ }
+ }
+ case GCSsweep: {
+ if (g->sweepgc) {
+ g->sweepgc = sweeplist(L, g->sweepgc, GCSWEEPMAX);
+ return GCSWEEPMAX*GCSWEEPCOST;
+ }
+ else {
+ /* sweep main thread */
+ GCObject *mt = obj2gco(g->mainthread);
+ sweeplist(L, &mt, 1);
+ checkSizes(L);
+ g->gcstate = GCSpause; /* finish collection */
+ return GCSWEEPCOST;
+ }
+ }
+ default: lua_assert(0); return 0;
+ }
+}
+
+
+/*
+** advances the garbage collector until it reaches a state allowed
+** by 'statemask'
+*/
+void luaC_runtilstate (lua_State *L, int statesmask) {
+ global_State *g = G(L);
+ while (!testbit(statesmask, g->gcstate))
+ singlestep(L);
+}
+
+
+static void generationalcollection (lua_State *L) {
+ global_State *g = G(L);
+ lua_assert(g->gcstate == GCSpropagate);
+ if (g->GCestimate == 0) { /* signal for another major collection? */
+ luaC_fullgc(L, 0); /* perform a full regular collection */
+ g->GCestimate = gettotalbytes(g); /* update control */
+ }
+ else {
+ lu_mem estimate = g->GCestimate;
+ luaC_runtilstate(L, bitmask(GCSpause)); /* run complete (minor) cycle */
+ g->gcstate = GCSpropagate; /* skip restart */
+ if (gettotalbytes(g) > (estimate / 100) * g->gcmajorinc)
+ g->GCestimate = 0; /* signal for a major collection */
+ else
+ g->GCestimate = estimate; /* keep estimate from last major coll. */
+
+ }
+ setpause(g, gettotalbytes(g));
+ lua_assert(g->gcstate == GCSpropagate);
+}
+
+
+static void incstep (lua_State *L) {
+ global_State *g = G(L);
+ l_mem debt = g->GCdebt;
+ int stepmul = g->gcstepmul;
+ if (stepmul < 40) stepmul = 40; /* avoid ridiculous low values (and 0) */
+ /* convert debt from Kb to 'work units' (avoid zero debt and overflows) */
+ debt = (debt / STEPMULADJ) + 1;
+ debt = (debt < MAX_LMEM / stepmul) ? debt * stepmul : MAX_LMEM;
+ do { /* always perform at least one single step */
+ lu_mem work = singlestep(L); /* do some work */
+ debt -= work;
+ } while (debt > -GCSTEPSIZE && g->gcstate != GCSpause);
+ if (g->gcstate == GCSpause)
+ setpause(g, g->GCestimate); /* pause until next cycle */
+ else {
+ debt = (debt / stepmul) * STEPMULADJ; /* convert 'work units' to Kb */
+ luaE_setdebt(g, debt);
+ }
+}
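For scale: with the default gcstepmul of 200, and assuming STEPMULADJ == 200 (defined earlier in this file, outside this excerpt), the debt converts to roughly one work unit per byte of debt, so each step performs about that much traversal/sweep work plus GCSTEPSIZE of slack before returning. A small sketch of that conversion, with invented local names:

/*
 * Illustrative sketch of the debt -> work-unit conversion in incstep(),
 * assuming STEPMULADJ == 200 and the default gcstepmul of 200.
 */
#include <stdio.h>

int main(void) {
    long debt = 8000;                   /* bytes of GC debt (g->GCdebt) */
    int stepmul = 200;                  /* g->gcstepmul */
    long units = (debt / 200) + 1;      /* debt / STEPMULADJ, never zero */
    units = units * stepmul;            /* 8200 "work units" for this step */
    printf("%ld\n", units);
    return 0;
}

Raising gcstepmul therefore makes each step do proportionally more work for the same amount of allocation.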
+
+
+/*
+** performs a basic GC step
+*/
+void luaC_forcestep (lua_State *L) {
+ global_State *g = G(L);
+ int i;
+ if (isgenerational(g)) generationalcollection(L);
+ else incstep(L);
+ /* run a few finalizers (or all of them at the end of a collect cycle) */
+ for (i = 0; g->tobefnz && (i < GCFINALIZENUM || g->gcstate == GCSpause); i++)
+ GCTM(L, 1); /* call one finalizer */
+}
+
+
+/*
+** performs a basic GC step only if collector is running
+*/
+void luaC_step (lua_State *L) {
+ global_State *g = G(L);
+ if (g->gcrunning) luaC_forcestep(L);
+ else luaE_setdebt(g, -GCSTEPSIZE); /* avoid being called too often */
+}
+
+
+
+/*
+** performs a full GC cycle; if "isemergency", does not call
+** finalizers (which could change stack positions)
+*/
+void luaC_fullgc (lua_State *L, int isemergency) {
+ global_State *g = G(L);
+ int origkind = g->gckind;
+ lua_assert(origkind != KGC_EMERGENCY);
+ if (isemergency) /* do not run finalizers during emergency GC */
+ g->gckind = KGC_EMERGENCY;
+ else {
+ g->gckind = KGC_NORMAL;
+ callallpendingfinalizers(L, 1);
+ }
+ if (keepinvariant(g)) { /* may there be some black objects? */
+ /* must sweep all objects to turn them back to white
+ (as white has not changed, nothing will be collected) */
+ entersweep(L);
+ }
+ /* finish any pending sweep phase to start a new cycle */
+ luaC_runtilstate(L, bitmask(GCSpause));
+ luaC_runtilstate(L, ~bitmask(GCSpause)); /* start new collection */
+ luaC_runtilstate(L, bitmask(GCSpause)); /* run entire collection */
+ if (origkind == KGC_GEN) { /* generational mode? */
+ /* generational mode must be kept in propagate phase */
+ luaC_runtilstate(L, bitmask(GCSpropagate));
+ }
+ g->gckind = origkind;
+ setpause(g, gettotalbytes(g));
+ if (!isemergency) /* do not run finalizers during emergency GC */
+ callallpendingfinalizers(L, 1);
+}
+
+/* }====================================================== */
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lgc.h b/sys/contrib/openzfs/module/lua/lgc.h
new file mode 100644
index 000000000000..34097a45edfc
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lgc.h
@@ -0,0 +1,159 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lgc.h,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $
+** Garbage Collector
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lgc_h
+#define lgc_h
+
+
+#include "lobject.h"
+#include "lstate.h"
+
+/*
+** Collectable objects may have one of three colors: white, which
+** means the object is not marked; gray, which means the
+** object is marked, but its references may be not marked; and
+** black, which means that the object and all its references are marked.
+** The main invariant of the garbage collector, while marking objects,
+** is that a black object can never point to a white one. Moreover,
+** any gray object must be in a "gray list" (gray, grayagain, weak,
+** allweak, ephemeron) so that it can be visited again before finishing
+** the collection cycle. These lists have no meaning when the invariant
+** is not being enforced (e.g., sweep phase).
+*/
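The invariant above is what the write barriers in lgc.c enforce: whenever a black object acquires a reference to a white one, either the white object is marked at once (the forward barrier, luaC_barrier_) or the black object is turned gray again and revisited (the backward barrier, luaC_barrierback_). The following is a deliberately simplified, self-contained model of the forward case; the types and names (Obj, barrier, store) are invented for illustration and are not the real Lua structures.

#include <stdio.h>

typedef enum { WHITE, GRAY, BLACK } color_t;              /* toy colors */

typedef struct Obj { color_t color; struct Obj *ref; } Obj;

/* forward barrier, in the spirit of luaC_barrier_: mark the new child */
static void barrier(Obj *parent, Obj *child) {
    if (parent->color == BLACK && child->color == WHITE)
        child->color = GRAY;        /* will be traversed before the cycle ends */
}

static void store(Obj *parent, Obj *child) {
    parent->ref = child;
    barrier(parent, child);         /* a "black -> white" edge never survives a store */
}

int main(void) {
    Obj parent = { BLACK, NULL };
    Obj child  = { WHITE, NULL };
    store(&parent, &child);
    printf("%d\n", child.color == GRAY);    /* prints 1 */
    return 0;
}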
+
+
+
+/* how much to allocate before next GC step */
+#if !defined(GCSTEPSIZE)
+/* ~100 small strings */
+#define GCSTEPSIZE (cast_int(100 * sizeof(TString)))
+#endif
+
+
+/*
+** Possible states of the Garbage Collector
+*/
+#define GCSpropagate 0
+#define GCSatomic 1
+#define GCSsweepstring 2
+#define GCSsweepudata 3
+#define GCSsweep 4
+#define GCSpause 5
+
+
+#define issweepphase(g) \
+ (GCSsweepstring <= (g)->gcstate && (g)->gcstate <= GCSsweep)
+
+#define isgenerational(g) ((g)->gckind == KGC_GEN)
+
+/*
+** macros to tell when main invariant (white objects cannot point to black
+** ones) must be kept. During a non-generational collection, the sweep
+** phase may break the invariant, as objects turned white may point to
+** still-black objects. The invariant is restored when sweep ends and
+** all objects are white again. During a generational collection, the
+** invariant must be kept all times.
+*/
+
+#define keepinvariant(g) (isgenerational(g) || g->gcstate <= GCSatomic)
+
+
+/*
+** Outside the collector, the state in generational mode is kept in
+** 'propagate', so 'keepinvariant' is always true.
+*/
+#define keepinvariantout(g) \
+ check_exp(g->gcstate == GCSpropagate || !isgenerational(g), \
+ g->gcstate <= GCSatomic)
+
+
+/*
+** some useful bit tricks
+*/
+#define resetbits(x,m) ((x) &= cast(lu_byte, ~(m)))
+#define setbits(x,m) ((x) |= (m))
+#define testbits(x,m) ((x) & (m))
+#define bitmask(b) (1<<(b))
+#define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2))
+#define l_setbit(x,b) setbits(x, bitmask(b))
+#define resetbit(x,b) resetbits(x, bitmask(b))
+#define testbit(x,b) testbits(x, bitmask(b))
+
+
+/* Layout for bit use in `marked' field: */
+#define WHITE0BIT 0 /* object is white (type 0) */
+#define WHITE1BIT 1 /* object is white (type 1) */
+#define BLACKBIT 2 /* object is black */
+#define FINALIZEDBIT 3 /* object has been separated for finalization */
+#define SEPARATED 4 /* object is in 'finobj' list or in 'tobefnz' */
+#define FIXEDBIT 5 /* object is fixed (should not be collected) */
+#define OLDBIT 6 /* object is old (only in generational mode) */
+/* bit 7 is currently used by tests (luaL_checkmemory) */
+
+#define WHITEBITS bit2mask(WHITE0BIT, WHITE1BIT)
+
+
+#define iswhite(x) testbits((x)->gch.marked, WHITEBITS)
+#define isblack(x) testbit((x)->gch.marked, BLACKBIT)
+#define isgray(x) /* neither white nor black */ \
+ (!testbits((x)->gch.marked, WHITEBITS | bitmask(BLACKBIT)))
+
+#define isold(x) testbit((x)->gch.marked, OLDBIT)
+
+/* MOVE OLD rule: whenever an object is moved to the beginning of
+ a GC list, its old bit must be cleared */
+#define resetoldbit(o) resetbit((o)->gch.marked, OLDBIT)
+
+#define otherwhite(g) (g->currentwhite ^ WHITEBITS)
+#define isdeadm(ow,m) (!(((m) ^ WHITEBITS) & (ow)))
+#define isdead(g,v) isdeadm(otherwhite(g), (v)->gch.marked)
+
+#define changewhite(x) ((x)->gch.marked ^= WHITEBITS)
+#define gray2black(x) l_setbit((x)->gch.marked, BLACKBIT)
+
+#define valiswhite(x) (iscollectable(x) && iswhite(gcvalue(x)))
+
+#define luaC_white(g) cast(lu_byte, (g)->currentwhite & WHITEBITS)
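With the bit layout above, the liveness test is easy to follow by hand: an object is dead when its mark still carries the "other" white, that is, the white of the previous cycle. A standalone check of the isdeadm() logic with concrete values follows; the mark values are chosen for illustration (real marks may also carry bits such as FINALIZEDBIT), and the macros are copied from the definitions above.

#include <stdio.h>

#define WHITEBITS        0x03                 /* WHITE0BIT | WHITE1BIT */
#define otherwhite(cw)   ((cw) ^ WHITEBITS)
#define isdeadm(ow, m)   (!(((m) ^ WHITEBITS) & (ow)))

int main(void) {
    int cw = 0x01;                    /* say the current white is "type 0" */
    int ow = otherwhite(cw);          /* 0x02: the white of the previous cycle */
    printf("%d\n", isdeadm(ow, 0x02));  /* 1: old white -> dead */
    printf("%d\n", isdeadm(ow, 0x01));  /* 0: current white -> survives */
    printf("%d\n", isdeadm(ow, 0x04));  /* 0: black (BLACKBIT) -> survives */
    /* luaC_freeallobjects sets currentwhite = WHITEBITS, so 'ow' becomes 0
       and every mark tests as dead: */
    printf("%d\n", isdeadm(otherwhite(WHITEBITS), 0x04));  /* 1 */
    return 0;
}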
+
+
+#define luaC_condGC(L,c) \
+ {if (G(L)->GCdebt > 0) {c;}; condchangemem(L);}
+#define luaC_checkGC(L) luaC_condGC(L, luaC_step(L);)
+
+
+#define luaC_barrier(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \
+ luaC_barrier_(L,obj2gco(p),gcvalue(v)); }
+
+#define luaC_barrierback(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \
+ luaC_barrierback_(L,p); }
+
+#define luaC_objbarrier(L,p,o) \
+ { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) \
+ luaC_barrier_(L,obj2gco(p),obj2gco(o)); }
+
+#define luaC_objbarrierback(L,p,o) \
+ { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) luaC_barrierback_(L,p); }
+
+#define luaC_barrierproto(L,p,c) \
+ { if (isblack(obj2gco(p))) luaC_barrierproto_(L,p,c); }
+
+LUAI_FUNC void luaC_freeallobjects (lua_State *L);
+LUAI_FUNC void luaC_step (lua_State *L);
+LUAI_FUNC void luaC_forcestep (lua_State *L);
+LUAI_FUNC void luaC_runtilstate (lua_State *L, int statesmask);
+LUAI_FUNC void luaC_fullgc (lua_State *L, int isemergency);
+LUAI_FUNC GCObject *luaC_newobj (lua_State *L, int tt, size_t sz,
+ GCObject **list, int offset);
+LUAI_FUNC void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v);
+LUAI_FUNC void luaC_barrierback_ (lua_State *L, GCObject *o);
+LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c);
+LUAI_FUNC void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt);
+LUAI_FUNC void luaC_checkupvalcolor (global_State *g, UpVal *uv);
+LUAI_FUNC void luaC_changemode (lua_State *L, int mode);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/llex.c b/sys/contrib/openzfs/module/lua/llex.c
new file mode 100644
index 000000000000..50c301f599f1
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/llex.c
@@ -0,0 +1,531 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: llex.c,v 2.63.1.3 2015/02/09 17:56:34 roberto Exp $
+** Lexical Analyzer
+** See Copyright Notice in lua.h
+*/
+
+#define llex_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lctype.h"
+#include "ldo.h"
+#include "llex.h"
+#include "lobject.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lzio.h"
+
+
+
+#define next(ls) (ls->current = zgetc(ls->z))
+
+
+
+#define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r')
+
+
+/* ORDER RESERVED */
+static const char *const luaX_tokens [] = {
+ "and", "break", "do", "else", "elseif",
+ "end", "false", "for", "function", "goto", "if",
+ "in", "local", "nil", "not", "or", "repeat",
+ "return", "then", "true", "until", "while",
+ "..", "...", "==", ">=", "<=", "~=", "::", "<eof>",
+ "<number>", "<name>", "<string>"
+};
+
+
+#define save_and_next(ls) (save(ls, ls->current), next(ls))
+
+
+static l_noret lexerror (LexState *ls, const char *msg, int token);
+
+
+static void save (LexState *ls, int c) {
+ Mbuffer *b = ls->buff;
+ if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
+ size_t newsize;
+ if (luaZ_sizebuffer(b) >= MAX_SIZET/2)
+ lexerror(ls, "lexical element too long", 0);
+ newsize = luaZ_sizebuffer(b) * 2;
+ luaZ_resizebuffer(ls->L, b, newsize);
+ }
+ b->buffer[luaZ_bufflen(b)++] = cast(char, c);
+}
+
+
+void luaX_init (lua_State *L) {
+ int i;
+ for (i=0; i<NUM_RESERVED; i++) {
+ TString *ts = luaS_new(L, luaX_tokens[i]);
+ luaS_fix(ts); /* reserved words are never collected */
+ ts->tsv.extra = cast_byte(i+1); /* reserved word */
+ }
+}
+
+
+const char *luaX_token2str (LexState *ls, int token) {
+ if (token < FIRST_RESERVED) { /* single-byte symbols? */
+ lua_assert(token == cast(unsigned char, token));
+ return (lisprint(token)) ? luaO_pushfstring(ls->L, LUA_QL("%c"), token) :
+ luaO_pushfstring(ls->L, "char(%d)", token);
+ }
+ else {
+ const char *s = luaX_tokens[token - FIRST_RESERVED];
+ if (token < TK_EOS) /* fixed format (symbols and reserved words)? */
+ return luaO_pushfstring(ls->L, LUA_QS, s);
+ else /* names, strings, and numerals */
+ return s;
+ }
+}
+
+
+static const char *txtToken (LexState *ls, int token) {
+ switch (token) {
+ case TK_NAME:
+ case TK_STRING:
+ case TK_NUMBER:
+ save(ls, '\0');
+ return luaO_pushfstring(ls->L, LUA_QS, luaZ_buffer(ls->buff));
+ default:
+ return luaX_token2str(ls, token);
+ }
+}
+
+
+static l_noret lexerror (LexState *ls, const char *msg, int token) {
+ char buff[LUA_IDSIZE];
+ luaO_chunkid(buff, getstr(ls->source), LUA_IDSIZE);
+ msg = luaO_pushfstring(ls->L, "%s:%d: %s", buff, ls->linenumber, msg);
+ if (token)
+ luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
+ luaD_throw(ls->L, LUA_ERRSYNTAX);
+}
+
+
+l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
+ lexerror(ls, msg, ls->t.token);
+}
+
+
+/*
+** creates a new string and anchors it in function's table so that
+** it will not be collected until the end of the function's compilation
+** (by that time it should be anchored in function's prototype)
+*/
+TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
+ lua_State *L = ls->L;
+ TValue *o; /* entry for `str' */
+ TString *ts = luaS_newlstr(L, str, l); /* create new string */
+ setsvalue2s(L, L->top++, ts); /* temporarily anchor it in stack */
+ o = luaH_set(L, ls->fs->h, L->top - 1);
+ if (ttisnil(o)) { /* not in use yet? (see 'addK') */
+ /* boolean value does not need GC barrier;
+ table has no metatable, so it does not need to invalidate cache */
+ setbvalue(o, 1); /* t[string] = true */
+ luaC_checkGC(L);
+ }
+ else { /* string already present */
+ ts = rawtsvalue(keyfromval(o)); /* re-use value previously stored */
+ }
+ L->top--; /* remove string from stack */
+ return ts;
+}
+
+
+/*
+** increments the line number and skips the newline sequence (any of
+** \n, \r, \n\r, or \r\n)
+*/
+static void inclinenumber (LexState *ls) {
+ int old = ls->current;
+ lua_assert(currIsNewline(ls));
+ next(ls); /* skip `\n' or `\r' */
+ if (currIsNewline(ls) && ls->current != old)
+ next(ls); /* skip `\n\r' or `\r\n' */
+ if (++ls->linenumber >= MAX_INT)
+ lexerror(ls, "chunk has too many lines", 0);
+}
+
+
+void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
+ int firstchar) {
+ ls->decpoint = '.';
+ ls->L = L;
+ ls->current = firstchar;
+ ls->lookahead.token = TK_EOS; /* no look-ahead token */
+ ls->z = z;
+ ls->fs = NULL;
+ ls->linenumber = 1;
+ ls->lastline = 1;
+ ls->source = source;
+ ls->envn = luaS_new(L, LUA_ENV); /* create env name */
+ luaS_fix(ls->envn); /* never collect this name */
+ luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */
+}
+
+
+
+/*
+** =======================================================
+** LEXICAL ANALYZER
+** =======================================================
+*/
+
+
+
+static int check_next (LexState *ls, const char *set) {
+ if (ls->current == '\0' || !strchr(set, ls->current))
+ return 0;
+ save_and_next(ls);
+ return 1;
+}
+
+
+/*
+** change all characters 'from' in buffer to 'to'
+*/
+static void buffreplace (LexState *ls, char from, char to) {
+ size_t n = luaZ_bufflen(ls->buff);
+ char *p = luaZ_buffer(ls->buff);
+ while (n--)
+ if (p[n] == from) p[n] = to;
+}
+
+
+#if !defined(getlocaledecpoint)
+#define getlocaledecpoint() (localeconv()->decimal_point[0])
+#endif
+
+
+#define buff2d(b,e) luaO_str2d(luaZ_buffer(b), luaZ_bufflen(b) - 1, e)
+
+/*
+** in case of format error, try to change decimal point separator to
+** the one defined in the current locale and check again
+*/
+static void trydecpoint (LexState *ls, SemInfo *seminfo) {
+ char old = ls->decpoint;
+ ls->decpoint = getlocaledecpoint();
+ buffreplace(ls, old, ls->decpoint); /* try new decimal separator */
+ if (!buff2d(ls->buff, &seminfo->r)) {
+ /* format error with correct decimal point: no more options */
+ buffreplace(ls, ls->decpoint, '.'); /* undo change (for error message) */
+ lexerror(ls, "malformed number", TK_NUMBER);
+ }
+}
+
+
+/* LUA_NUMBER */
+/*
+** this function is quite liberal in what it accepts, as 'luaO_str2d'
+** will reject ill-formed numerals.
+*/
+static void read_numeral (LexState *ls, SemInfo *seminfo) {
+ const char *expo = "Ee";
+ int first = ls->current;
+ lua_assert(lisdigit(ls->current));
+ save_and_next(ls);
+ if (first == '0' && check_next(ls, "Xx")) /* hexadecimal? */
+ expo = "Pp";
+ for (;;) {
+ if (check_next(ls, expo)) /* exponent part? */
+ (void) check_next(ls, "+-"); /* optional exponent sign */
+ if (lisxdigit(ls->current) || ls->current == '.')
+ save_and_next(ls);
+ else break;
+ }
+ save(ls, '\0');
+ buffreplace(ls, '.', ls->decpoint); /* follow locale for decimal point */
+ if (!buff2d(ls->buff, &seminfo->r)) /* format error? */
+ trydecpoint(ls, seminfo); /* try to update decimal point separator */
+}
+
+
+/*
+** skip a sequence '[=*[' or ']=*]' and return its number of '='s or
+** -1 if sequence is malformed
+*/
+static int skip_sep (LexState *ls) {
+ int count = 0;
+ int s = ls->current;
+ lua_assert(s == '[' || s == ']');
+ save_and_next(ls);
+ while (ls->current == '=') {
+ save_and_next(ls);
+ count++;
+ }
+ return (ls->current == s) ? count : (-count) - 1;
+}
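The return convention of skip_sep() is easiest to see on a few inputs: a well-formed opener returns its level (the number of '='s), while a malformed sequence returns a negative value, with -1 reserved for a lone bracket, which llex() then hands back as a plain '[' token. A standalone sketch of the same convention over a C string; sep_level is an invented helper, not part of the lexer.

#include <stdio.h>

/* same convention as skip_sep(), applied to a plain C string */
static int sep_level(const char *s) {       /* s starts at '[' or ']' */
    char open = s[0];
    const char *p = s + 1;
    int count = 0;
    while (*p == '=') { p++; count++; }
    return (*p == open) ? count : (-count) - 1;
}

int main(void) {
    printf("%d\n", sep_level("[==["));      /*  2: level-2 long bracket */
    printf("%d\n", sep_level("[["));        /*  0: plain long bracket */
    printf("%d\n", sep_level("[x"));        /* -1: just a '[' token */
    printf("%d\n", sep_level("[=x"));       /* -2: invalid long-string delimiter */
    return 0;
}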
+
+
+static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
+ save_and_next(ls); /* skip 2nd `[' */
+ if (currIsNewline(ls)) /* string starts with a newline? */
+ inclinenumber(ls); /* skip it */
+ for (;;) {
+ switch (ls->current) {
+ case EOZ:
+ lexerror(ls, (seminfo) ? "unfinished long string" :
+ "unfinished long comment", TK_EOS);
+ break; /* to avoid warnings */
+ case ']': {
+ if (skip_sep(ls) == sep) {
+ save_and_next(ls); /* skip 2nd `]' */
+ goto endloop;
+ }
+ break;
+ }
+ case '\n': case '\r': {
+ save(ls, '\n');
+ inclinenumber(ls);
+ if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */
+ break;
+ }
+ default: {
+ if (seminfo) save_and_next(ls);
+ else next(ls);
+ }
+ }
+ } endloop:
+ if (seminfo)
+ seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep),
+ luaZ_bufflen(ls->buff) - 2*(2 + sep));
+}
+
+
+static void escerror (LexState *ls, int *c, int n, const char *msg) {
+ int i;
+ luaZ_resetbuffer(ls->buff); /* prepare error message */
+ save(ls, '\\');
+ for (i = 0; i < n && c[i] != EOZ; i++)
+ save(ls, c[i]);
+ lexerror(ls, msg, TK_STRING);
+}
+
+
+static int readhexaesc (LexState *ls) {
+ int c[3], i; /* keep input for error message */
+ int r = 0; /* result accumulator */
+ c[0] = 'x'; /* for error message */
+ for (i = 1; i < 3; i++) { /* read two hexadecimal digits */
+ c[i] = next(ls);
+ if (!lisxdigit(c[i]))
+ escerror(ls, c, i + 1, "hexadecimal digit expected");
+ r = (r << 4) + luaO_hexavalue(c[i]);
+ }
+ return r;
+}
+
+
+static int readdecesc (LexState *ls) {
+ int c[3], i;
+ int r = 0; /* result accumulator */
+ for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */
+ c[i] = ls->current;
+ r = 10*r + c[i] - '0';
+ next(ls);
+ }
+ if (r > UCHAR_MAX)
+ escerror(ls, c, i, "decimal escape too large");
+ return r;
+}
+
+
+static void read_string (LexState *ls, int del, SemInfo *seminfo) {
+ save_and_next(ls); /* keep delimiter (for error messages) */
+ while (ls->current != del) {
+ switch (ls->current) {
+ case EOZ:
+ lexerror(ls, "unfinished string", TK_EOS);
+ break; /* to avoid warnings */
+ case '\n':
+ case '\r':
+ lexerror(ls, "unfinished string", TK_STRING);
+ break; /* to avoid warnings */
+ case '\\': { /* escape sequences */
+ int c; /* final character to be saved */
+ next(ls); /* do not save the `\' */
+ switch (ls->current) {
+ case 'a': c = '\a'; goto read_save;
+ case 'b': c = '\b'; goto read_save;
+ case 'f': c = '\f'; goto read_save;
+ case 'n': c = '\n'; goto read_save;
+ case 'r': c = '\r'; goto read_save;
+ case 't': c = '\t'; goto read_save;
+ case 'v': c = '\v'; goto read_save;
+ case 'x': c = readhexaesc(ls); goto read_save;
+ case '\n': case '\r':
+ inclinenumber(ls); c = '\n'; goto only_save;
+ case '\\': case '\"': case '\'':
+ c = ls->current; goto read_save;
+ case EOZ: goto no_save; /* will raise an error next loop */
+ case 'z': { /* zap following span of spaces */
+ next(ls); /* skip the 'z' */
+ while (lisspace(ls->current)) {
+ if (currIsNewline(ls)) inclinenumber(ls);
+ else next(ls);
+ }
+ goto no_save;
+ }
+ default: {
+ if (!lisdigit(ls->current))
+ escerror(ls, &ls->current, 1, "invalid escape sequence");
+ /* digital escape \ddd */
+ c = readdecesc(ls);
+ goto only_save;
+ }
+ }
+ read_save: next(ls); /* read next character */
+ only_save: save(ls, c); /* save 'c' */
+ no_save: break;
+ }
+ default:
+ save_and_next(ls);
+ }
+ }
+ save_and_next(ls); /* skip delimiter */
+ seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
+ luaZ_bufflen(ls->buff) - 2);
+}
+
+
+static int llex (LexState *ls, SemInfo *seminfo) {
+ luaZ_resetbuffer(ls->buff);
+ for (;;) {
+ switch (ls->current) {
+ case '\n': case '\r': { /* line breaks */
+ inclinenumber(ls);
+ break;
+ }
+ case ' ': case '\f': case '\t': case '\v': { /* spaces */
+ next(ls);
+ break;
+ }
+ case '-': { /* '-' or '--' (comment) */
+ next(ls);
+ if (ls->current != '-') return '-';
+ /* else is a comment */
+ next(ls);
+ if (ls->current == '[') { /* long comment? */
+ int sep = skip_sep(ls);
+ luaZ_resetbuffer(ls->buff); /* `skip_sep' may dirty the buffer */
+ if (sep >= 0) {
+ read_long_string(ls, NULL, sep); /* skip long comment */
+ luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */
+ break;
+ }
+ }
+ /* else short comment */
+ while (!currIsNewline(ls) && ls->current != EOZ)
+ next(ls); /* skip until end of line (or end of file) */
+ break;
+ }
+ case '[': { /* long string or simply '[' */
+ int sep = skip_sep(ls);
+ if (sep >= 0) {
+ read_long_string(ls, seminfo, sep);
+ return TK_STRING;
+ } else if (sep == -1) {
+ return '[';
+ } else {
+ lexerror(ls, "invalid long string delimiter", TK_STRING);
+ break;
+ }
+ }
+ case '=': {
+ next(ls);
+ if (ls->current != '=') return '=';
+ else { next(ls); return TK_EQ; }
+ }
+ case '<': {
+ next(ls);
+ if (ls->current != '=') return '<';
+ else { next(ls); return TK_LE; }
+ }
+ case '>': {
+ next(ls);
+ if (ls->current != '=') return '>';
+ else { next(ls); return TK_GE; }
+ }
+ case '~': {
+ next(ls);
+ if (ls->current != '=') return '~';
+ else { next(ls); return TK_NE; }
+ }
+ case ':': {
+ next(ls);
+ if (ls->current != ':') return ':';
+ else { next(ls); return TK_DBCOLON; }
+ }
+ case '"': case '\'': { /* short literal strings */
+ read_string(ls, ls->current, seminfo);
+ return TK_STRING;
+ }
+ case '.': { /* '.', '..', '...', or number */
+ save_and_next(ls);
+ if (check_next(ls, ".")) {
+ if (check_next(ls, "."))
+ return TK_DOTS; /* '...' */
+ else return TK_CONCAT; /* '..' */
+ }
+ else if (!lisdigit(ls->current)) return '.';
+ /* else go through */
+ }
+ /* FALLTHROUGH */
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9': {
+ read_numeral(ls, seminfo);
+ return TK_NUMBER;
+ }
+ case EOZ: {
+ return TK_EOS;
+ }
+ default: {
+ if (lislalpha(ls->current)) { /* identifier or reserved word? */
+ TString *ts;
+ do {
+ save_and_next(ls);
+ } while (lislalnum(ls->current));
+ ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
+ luaZ_bufflen(ls->buff));
+ seminfo->ts = ts;
+ if (isreserved(ts)) /* reserved word? */
+ return ts->tsv.extra - 1 + FIRST_RESERVED;
+ else {
+ return TK_NAME;
+ }
+ }
+ else { /* single-char tokens (+ - / ...) */
+ int c = ls->current;
+ next(ls);
+ return c;
+ }
+ }
+ }
+ }
+}
+
+
+void luaX_next (LexState *ls) {
+ ls->lastline = ls->linenumber;
+ if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? */
+ ls->t = ls->lookahead; /* use this one */
+ ls->lookahead.token = TK_EOS; /* and discharge it */
+ }
+ else
+ ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */
+}
+
+
+int luaX_lookahead (LexState *ls) {
+ lua_assert(ls->lookahead.token == TK_EOS);
+ ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
+ return ls->lookahead.token;
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/llex.h b/sys/contrib/openzfs/module/lua/llex.h
new file mode 100644
index 000000000000..da58203e8dc8
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/llex.h
@@ -0,0 +1,83 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: llex.h,v 1.72.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lexical Analyzer
+** See Copyright Notice in lua.h
+*/
+
+#ifndef llex_h
+#define llex_h
+
+#include "lobject.h"
+#include "lzio.h"
+
+
+#define FIRST_RESERVED 257
+
+
+
+/*
+* WARNING: if you change the order of this enumeration,
+* grep "ORDER RESERVED"
+*/
+enum RESERVED {
+ /* terminal symbols denoted by reserved words */
+ TK_AND = FIRST_RESERVED, TK_BREAK,
+ TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
+ TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
+ TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
+ /* other terminal symbols */
+ TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_DBCOLON, TK_EOS,
+ TK_NUMBER, TK_NAME, TK_STRING
+};
+
+/* number of reserved words */
+#define NUM_RESERVED (cast(int, TK_WHILE-FIRST_RESERVED+1))
+
+
+typedef union {
+ lua_Number r;
+ TString *ts;
+} SemInfo; /* semantics information */
+
+
+typedef struct Token {
+ int token;
+ SemInfo seminfo;
+} Token;
+
+#ifdef current
+#undef current
+#endif
+
+/* state of the lexer plus state of the parser when shared by all
+ functions */
+typedef struct LexState {
+ int current; /* current character (charint) */
+ int linenumber; /* input line counter */
+ int lastline; /* line of last token `consumed' */
+ Token t; /* current token */
+ Token lookahead; /* look ahead token */
+ struct FuncState *fs; /* current function (parser) */
+ struct lua_State *L;
+ ZIO *z; /* input stream */
+ Mbuffer *buff; /* buffer for tokens */
+ struct Dyndata *dyd; /* dynamic structures used by the parser */
+ TString *source; /* current source name */
+ TString *envn; /* environment variable name */
+ char decpoint; /* locale decimal point */
+} LexState;
+
+
+LUAI_FUNC void luaX_init (lua_State *L);
+LUAI_FUNC void luaX_setinput (lua_State *L, LexState *ls, ZIO *z,
+ TString *source, int firstchar);
+LUAI_FUNC TString *luaX_newstring (LexState *ls, const char *str, size_t l);
+LUAI_FUNC void luaX_next (LexState *ls);
+LUAI_FUNC int luaX_lookahead (LexState *ls);
+LUAI_FUNC l_noret luaX_syntaxerror (LexState *ls, const char *s);
+LUAI_FUNC const char *luaX_token2str (LexState *ls, int token);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/llimits.h b/sys/contrib/openzfs/module/lua/llimits.h
new file mode 100644
index 000000000000..177092fbc228
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/llimits.h
@@ -0,0 +1,314 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: llimits.h,v 1.103.1.1 2013/04/12 18:48:47 roberto Exp $
+** Limits, basic types, and some other `installation-dependent' definitions
+** See Copyright Notice in lua.h
+*/
+
+#ifndef llimits_h
+#define llimits_h
+
+
+#include <sys/lua/lua.h>
+
+
+typedef unsigned LUA_INT32 lu_int32;
+
+typedef LUAI_UMEM lu_mem;
+
+typedef LUAI_MEM l_mem;
+
+
+
+/* chars used as small naturals (so that `char' is reserved for characters) */
+typedef unsigned char lu_byte;
+
+
+#define MAX_SIZET ((size_t)(~(size_t)0)-2)
+
+#define MAX_LUMEM ((lu_mem)(~(lu_mem)0)-2)
+
+#define MAX_LMEM ((l_mem) ((MAX_LUMEM >> 1) - 2))
+
+
+#define MAX_INT (INT_MAX-2) /* maximum value of an int (-2 for safety) */
+
+/*
+** conversion of pointer to integer
+** this is for hashing only; there is no problem if the integer
+** cannot hold the whole pointer value
+*/
+#define IntPoint(p) ((unsigned int)(lu_mem)(p))
+
+
+
+/* type to ensure maximum alignment */
+#if !defined(LUAI_USER_ALIGNMENT_T)
+#define LUAI_USER_ALIGNMENT_T union { double u; void *s; long l; }
+#endif
+
+typedef LUAI_USER_ALIGNMENT_T L_Umaxalign;
+
+
+/* result of a `usual argument conversion' over lua_Number */
+typedef LUAI_UACNUMBER l_uacNumber;
+
+
+/* internal assertions for in-house debugging */
+#if defined(lua_assert)
+#define check_exp(c,e) (lua_assert(c), (e))
+/* to avoid problems with conditions too long */
+#define lua_longassert(c) { if (!(c)) lua_assert(0); }
+#else
+#define lua_assert(c) ((void)0)
+#define check_exp(c,e) (e)
+#define lua_longassert(c) ((void)0)
+#endif
+
+/*
+** assertion for checking API calls
+*/
+#if !defined(luai_apicheck)
+
+#if defined(LUA_USE_APICHECK)
+#include <assert.h>
+#define luai_apicheck(L,e) assert(e)
+#else
+#define luai_apicheck(L,e) lua_assert(e)
+#endif
+
+#endif
+
+#define api_check(l,e,msg) luai_apicheck(l,(e) && msg)
+
+
+#if !defined(UNUSED)
+#define UNUSED(x) ((void)(x)) /* to avoid warnings */
+#endif
+
+
+#define cast(t, exp) ((t)(exp))
+
+#define cast_byte(i) cast(lu_byte, (i))
+#define cast_num(i) cast(lua_Number, (i))
+#define cast_int(i) cast(int, (i))
+#define cast_uchar(i) cast(unsigned char, (i))
+
+
+/*
+** non-return type
+**
+** Suppress noreturn attribute in kernel builds to avoid objtool check warnings
+*/
+#if defined(__GNUC__) && !defined(_KERNEL)
+#define l_noret void __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define l_noret void __declspec(noreturn)
+#else
+#define l_noret void
+#endif
+
+
+
+/*
+** maximum depth for nested C calls and syntactical nested non-terminals
+** in a program. (Value must fit in an unsigned short int.)
+**
+** Note: On amd64 platform, the limit has been measured to be 45. We set
+** the maximum lower to give a margin for changing the amount of stack
+** used by various functions involved in parsing and executing code.
+*/
+#if !defined(LUAI_MAXCCALLS)
+#define LUAI_MAXCCALLS 20
+#endif
+
+/*
+ * Minimum amount of available stack space (in bytes) to make a C call. With
+ * gsub() recursion, the stack space between each luaD_call() is 1256 bytes.
+ */
+#define LUAI_MINCSTACK 4096
+
+/*
+** maximum number of upvalues in a closure (both C and Lua). (Value
+** must fit in an unsigned char.)
+*/
+#define MAXUPVAL UCHAR_MAX
+
+
+/*
+** type for virtual-machine instructions
+** must be an unsigned with (at least) 4 bytes (see details in lopcodes.h)
+*/
+typedef lu_int32 Instruction;
+
+
+
+/* maximum stack for a Lua function */
+#define MAXSTACK 250
+
+
+
+/* minimum size for the string table (must be power of 2) */
+#if !defined(MINSTRTABSIZE)
+#define MINSTRTABSIZE 32
+#endif
+
+
+/* minimum size for string buffer */
+#if !defined(LUA_MINBUFFER)
+#define LUA_MINBUFFER 32
+#endif
+
+
+#if !defined(lua_lock)
+#define lua_lock(L) ((void) 0)
+#define lua_unlock(L) ((void) 0)
+#endif
+
+#if !defined(luai_threadyield)
+#define luai_threadyield(L) {lua_unlock(L); lua_lock(L);}
+#endif
+
+
+/*
+** these macros allow user-specific actions on threads when you define
+** LUAI_EXTRASPACE and need to do something extra when a thread is
+** created/deleted/resumed/yielded.
+*/
+#if !defined(luai_userstateopen)
+#define luai_userstateopen(L) ((void)L)
+#endif
+
+#if !defined(luai_userstateclose)
+#define luai_userstateclose(L) ((void)L)
+#endif
+
+#if !defined(luai_userstatethread)
+#define luai_userstatethread(L,L1) ((void)L)
+#endif
+
+#if !defined(luai_userstatefree)
+#define luai_userstatefree(L,L1) ((void)L)
+#endif
+
+#if !defined(luai_userstateresume)
+#define luai_userstateresume(L,n) ((void)L)
+#endif
+
+#if !defined(luai_userstateyield)
+#define luai_userstateyield(L,n) ((void)L)
+#endif
+
+/*
+** lua_number2int is a macro to convert lua_Number to int.
+** lua_number2integer is a macro to convert lua_Number to lua_Integer.
+** lua_number2unsigned is a macro to convert a lua_Number to a lua_Unsigned.
+** lua_unsigned2number is a macro to convert a lua_Unsigned to a lua_Number.
+** luai_hashnum is a macro to hash a lua_Number value into an integer.
+** The hash must be deterministic and give reasonable values for
+** both small and large values (outside the range of integers).
+*/
+
+#if defined(MS_ASMTRICK) || defined(LUA_MSASMTRICK) /* { */
+/* trick with Microsoft assembler for X86 */
+
+#define lua_number2int(i,n) __asm {__asm fld n __asm fistp i}
+#define lua_number2integer(i,n) lua_number2int(i, n)
+#define lua_number2unsigned(i,n) \
+ {__int64 l; __asm {__asm fld n __asm fistp l} i = (unsigned int)l;}
+
+
+#elif defined(LUA_IEEE754TRICK) /* }{ */
+/* the next trick should work on any machine using IEEE754 with
+ a 32-bit int type */
+
+union luai_Cast { double l_d; LUA_INT32 l_p[2]; };
+
+#if !defined(LUA_IEEEENDIAN) /* { */
+#define LUAI_EXTRAIEEE \
+ static const union luai_Cast ieeeendian = {-(33.0 + 6755399441055744.0)};
+#define LUA_IEEEENDIANLOC (ieeeendian.l_p[1] == 33)
+#else
+#define LUA_IEEEENDIANLOC LUA_IEEEENDIAN
+#define LUAI_EXTRAIEEE /* empty */
+#endif /* } */
+
+#define lua_number2int32(i,n,t) \
+ { LUAI_EXTRAIEEE \
+ volatile union luai_Cast u; u.l_d = (n) + 6755399441055744.0; \
+ (i) = (t)u.l_p[LUA_IEEEENDIANLOC]; }
+
+#define luai_hashnum(i,n) \
+ { volatile union luai_Cast u; u.l_d = (n) + 1.0; /* avoid -0 */ \
+  (i) = u.l_p[0]; (i) += u.l_p[1]; } /* add double bits for its hash */
+
+#define lua_number2int(i,n) lua_number2int32(i, n, int)
+#define lua_number2unsigned(i,n) lua_number2int32(i, n, lua_Unsigned)
+
+/* the trick can be expanded to lua_Integer when it is a 32-bit value */
+#if defined(LUA_IEEELL)
+#define lua_number2integer(i,n) lua_number2int32(i, n, lua_Integer)
+#endif
+
+#endif /* } */
+
+
+/* the following definitions always work, but may be slow */
+
+#if !defined(lua_number2int)
+#define lua_number2int(i,n) ((i)=(int)(n))
+#endif
+
+#if !defined(lua_number2integer)
+#define lua_number2integer(i,n) ((i)=(lua_Integer)(n))
+#endif
+
+#if !defined(lua_number2unsigned) /* { */
+/* the following definition ensures proper modulo behavior */
+#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_NUMBER_FLOAT)
+#include <math.h>
+#define SUPUNSIGNED ((lua_Number)(~(lua_Unsigned)0) + 1)
+#define lua_number2unsigned(i,n) \
+ ((i)=(lua_Unsigned)((n) - floor((n)/SUPUNSIGNED)*SUPUNSIGNED))
+#else
+#define lua_number2unsigned(i,n) ((i)=(lua_Unsigned)(n))
+#endif
+#endif /* } */
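+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** the fallback above reduces the number modulo 2^N before converting, so
+** negative and out-of-range doubles wrap the same way unsigned arithmetic
+** does.  Assumes a double lua_Number and a 32-bit lua_Unsigned.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_number2unsigned (void) {
+  lua_Unsigned u;
+  lua_number2unsigned(u, -1.0);           /* -1 wraps modulo 2^32 ...     */
+  lua_assert(u == 4294967295u);           /* ... to 2^32 - 1              */
+  lua_number2unsigned(u, 4294967296.0);   /* 2^32 itself wraps to 0       */
+  lua_assert(u == 0);
+}
+#endif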
+
+
+#if !defined(lua_unsigned2number)
+/* on several machines, coercion from unsigned to double is slow,
+   so it may be worth avoiding */
+#define lua_unsigned2number(u) \
+ (((u) <= (lua_Unsigned)INT_MAX) ? (lua_Number)(int)(u) : (lua_Number)(u))
+#endif
+
+
+
+#if defined(ltable_c) && !defined(luai_hashnum)
+
+#define luai_hashnum(i,n) (i = lcompat_hashnum(n))
+
+#endif
+
+
+
+/*
+** macro to control inclusion of some hard tests on stack reallocation
+*/
+#if !defined(HARDSTACKTESTS)
+#define condmovestack(L) ((void)0)
+#else
+/* realloc stack keeping its size */
+#define condmovestack(L) luaD_reallocstack((L), (L)->stacksize)
+#endif
+
+#if !defined(HARDMEMTESTS)
+#define condchangemem(L) condmovestack(L)
+#else
+#define condchangemem(L) \
+ ((void)(!(G(L)->gcrunning) || (luaC_fullgc(L, 0), 1)))
+#endif
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lmem.c b/sys/contrib/openzfs/module/lua/lmem.c
new file mode 100644
index 000000000000..18bb2514cb01
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lmem.c
@@ -0,0 +1,98 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lmem.c,v 1.84.1.1 2013/04/12 18:48:47 roberto Exp $
+** Interface to Memory Manager
+** See Copyright Notice in lua.h
+*/
+
+
+#define lmem_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+
+/*
+** About the realloc function:
+** void * frealloc (void *ud, void *ptr, size_t osize, size_t nsize);
+** (`osize' is the old size, `nsize' is the new size)
+**
+** * frealloc(ud, NULL, x, s) creates a new block of size `s' (no
+** matter 'x').
+**
+** * frealloc(ud, p, x, 0) frees the block `p'
+** (in this specific case, frealloc must return NULL);
+** in particular, frealloc(ud, NULL, 0, 0) does nothing
+** (which is equivalent to free(NULL) in ANSI C)
+**
+** frealloc returns NULL if it cannot create or reallocate the area
+** (any reallocation to an equal or smaller size cannot fail!)
+*/
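+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** one hosted-C allocator that honors the frealloc contract described above.
+** The real allocator is supplied by the caller of lua_newstate(); this is
+** only a reference for what the contract requires.
+*/
+#if 0	/* illustrative only; never compiled */
+#include <stdlib.h>
+
+static void *example_frealloc (void *ud, void *ptr, size_t osize, size_t nsize) {
+  (void)ud; (void)osize;        /* this trivial allocator needs neither */
+  if (nsize == 0) {             /* request to free the block */
+    free(ptr);                  /* free(NULL) is a no-op, as required */
+    return NULL;                /* contract: must return NULL here */
+  }
+  return realloc(ptr, nsize);   /* create (ptr == NULL), grow, or shrink */
+}
+#endif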
+
+
+
+#define MINSIZEARRAY 4
+
+
+void *luaM_growaux_ (lua_State *L, void *block, int *size, size_t size_elems,
+ int limit, const char *what) {
+ void *newblock;
+ int newsize;
+ if (*size >= limit/2) { /* cannot double it? */
+ if (*size >= limit) /* cannot grow even a little? */
+ luaG_runerror(L, "too many %s (limit is %d)", what, limit);
+ newsize = limit; /* still have at least one free place */
+ }
+ else {
+ newsize = (*size)*2;
+ if (newsize < MINSIZEARRAY)
+ newsize = MINSIZEARRAY; /* minimum size */
+ }
+ newblock = luaM_reallocv(L, block, *size, newsize, size_elems);
+ *size = newsize; /* update only when everything else is OK */
+ return newblock;
+}
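+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** callers normally reach luaM_growaux_ through the luaM_growvector macro;
+** starting from an empty vector the size grows 0 -> 4 -> 8 -> 16 -> ... and
+** is clamped at 'limit'.  Names below are hypothetical.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_grow (lua_State *L) {
+  int size = 0, n = 0;
+  TValue *v = NULL;
+  /* make sure slot 'n' exists; grows the array when n+1 > size */
+  luaM_growvector(L, v, n, size, TValue, MAX_INT, "example elements");
+  setnilvalue(&v[n++]);         /* hypothetical use of the new slot */
+  luaM_freearray(L, v, size);   /* release with matching bookkeeping */
+}
+#endif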
+
+
+l_noret luaM_toobig (lua_State *L) {
+ luaG_runerror(L, "memory allocation error: block too big");
+}
+
+
+
+/*
+** generic allocation routine.
+*/
+void *luaM_realloc_ (lua_State *L, void *block, size_t osize, size_t nsize) {
+ void *newblock;
+ global_State *g = G(L);
+ size_t realosize = (block) ? osize : 0;
+ lua_assert((realosize == 0) == (block == NULL));
+#if defined(HARDMEMTESTS)
+ if (nsize > realosize && g->gcrunning)
+ luaC_fullgc(L, 1); /* force a GC whenever possible */
+#endif
+ newblock = (*g->frealloc)(g->ud, block, osize, nsize);
+ if (newblock == NULL && nsize > 0) {
+ api_check(L, nsize > realosize,
+ "realloc cannot fail when shrinking a block");
+ if (g->gcrunning) {
+ luaC_fullgc(L, 1); /* try to free some memory... */
+ newblock = (*g->frealloc)(g->ud, block, osize, nsize); /* try again */
+ }
+ if (newblock == NULL)
+ luaD_throw(L, LUA_ERRMEM);
+ }
+ lua_assert((nsize == 0) == (newblock == NULL));
+ g->GCdebt = (g->GCdebt + nsize) - realosize;
+ return newblock;
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lmem.h b/sys/contrib/openzfs/module/lua/lmem.h
new file mode 100644
index 000000000000..22c04c98c863
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lmem.h
@@ -0,0 +1,56 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lmem.h,v 1.40.1.1 2013/04/12 18:48:47 roberto Exp $
+** Interface to Memory Manager
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lmem_h
+#define lmem_h
+
+
+#include "llimits.h"
+#include <sys/lua/lua.h>
+
+
+/*
+** This macro avoids the runtime division MAX_SIZET/(e), as 'e' is
+** always constant.
+** The macro is somewhat complex to avoid warnings:
+** +1 avoids warnings of "comparison has constant result";
+** cast to 'void' avoids warnings of "value unused".
+*/
+#define luaM_reallocv(L,b,on,n,e) \
+ (cast(void, \
+ (cast(size_t, (n)+1) > MAX_SIZET/(e)) ? (luaM_toobig(L), 0) : 0), \
+ luaM_realloc_(L, (b), (on)*(e), (n)*(e)))
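+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** the guard above rejects any (n, e) pair whose product could overflow
+** size_t before the multiplication is ever performed.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_overflow_guard (lua_State *L) {
+  /* fits: (64+1) <= MAX_SIZET/sizeof(TValue), so the request goes through */
+  TValue *ok = cast(TValue *, luaM_reallocv(L, NULL, 0, 64, sizeof(TValue)));
+  luaM_freearray(L, ok, 64);
+  /* would overflow: n+1 > MAX_SIZET/e, so luaM_toobig(L) raises an error
+     instead of computing a wrapped-around allocation size */
+  (void)luaM_reallocv(L, NULL, 0, MAX_SIZET/sizeof(TValue), sizeof(TValue));
+}
+#endif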
+
+#define luaM_freemem(L, b, s) luaM_realloc_(L, (b), (s), 0)
+#define luaM_free(L, b) luaM_realloc_(L, (b), sizeof(*(b)), 0)
+#define luaM_freearray(L, b, n) luaM_reallocv(L, (b), n, 0, sizeof((b)[0]))
+
+#define luaM_malloc(L,s) luaM_realloc_(L, NULL, 0, (s))
+#define luaM_new(L,t) cast(t *, luaM_malloc(L, sizeof(t)))
+#define luaM_newvector(L,n,t) \
+ cast(t *, luaM_reallocv(L, NULL, 0, n, sizeof(t)))
+
+#define luaM_newobject(L,tag,s) luaM_realloc_(L, NULL, tag, (s))
+
+#define luaM_growvector(L,v,nelems,size,t,limit,e) \
+ if ((nelems)+1 > (size)) \
+ ((v)=cast(t *, luaM_growaux_(L,v,&(size),sizeof(t),limit,e)))
+
+#define luaM_reallocvector(L, v,oldn,n,t) \
+ ((v)=cast(t *, luaM_reallocv(L, v, oldn, n, sizeof(t))))
+
+LUAI_FUNC l_noret luaM_toobig (lua_State *L);
+
+/* not to be called directly */
+LUAI_FUNC void *luaM_realloc_ (lua_State *L, void *block, size_t oldsize,
+ size_t size);
+LUAI_FUNC void *luaM_growaux_ (lua_State *L, void *block, int *size,
+ size_t size_elem, int limit,
+ const char *what);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lobject.c b/sys/contrib/openzfs/module/lua/lobject.c
new file mode 100644
index 000000000000..024d3199fe24
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lobject.c
@@ -0,0 +1,282 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lobject.c,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $
+** Some generic functions over Lua objects
+** See Copyright Notice in lua.h
+*/
+
+#define lobject_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lctype.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "lvm.h"
+
+
+
+LUAI_DDEF const TValue luaO_nilobject_ = {NILCONSTANT};
+
+
+/*
+** converts an integer to a "floating point byte", represented as
+** (eeeeexxx), where the real value is (1xxx) * 2^(eeeee - 1) if
+** eeeee != 0 and (xxx) otherwise.
+*/
+int luaO_int2fb (unsigned int x) {
+ int e = 0; /* exponent */
+ if (x < 8) return x;
+ while (x >= 0x10) {
+ x = (x+1) >> 1;
+ e++;
+ }
+ return ((e+1) << 3) | (cast_int(x) - 8);
+}
+
+
+/* converts back */
+int luaO_fb2int (int x) {
+ int e = (x >> 3) & 0x1f;
+ if (e == 0) return x;
+ else return ((x & 7) + 8) << (e - 1);
+}
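+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** a worked example of the "floating point byte" encoding above.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_fb (void) {
+  /* small values (here anything below 16) encode as themselves */
+  lua_assert(luaO_int2fb(7) == 7 && luaO_fb2int(7) == 7);
+  /* larger values round up to the nearest representable (1xxx) * 2^(eeeee-1):
+     1000 becomes e=8, xxx=0, i.e. the byte 64, which decodes to 1024 */
+  lua_assert(luaO_int2fb(1000) == 64);
+  lua_assert(luaO_fb2int(64) == 1024);
+}
+#endif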
+
+
+int luaO_ceillog2 (unsigned int x) {
+ static const lu_byte log_2[256] = {
+ 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+ 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+ };
+ int l = 0;
+ x--;
+ while (x >= 256) { l += 8; x >>= 8; }
+ return l + log_2[x];
+}
+
+
+lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2) {
+ switch (op) {
+ case LUA_OPADD: return luai_numadd(NULL, v1, v2);
+ case LUA_OPSUB: return luai_numsub(NULL, v1, v2);
+ case LUA_OPMUL: return luai_nummul(NULL, v1, v2);
+ case LUA_OPDIV: return luai_numdiv(NULL, v1, v2);
+ case LUA_OPMOD: return luai_nummod(NULL, v1, v2);
+ case LUA_OPPOW: return luai_numpow(NULL, v1, v2);
+ case LUA_OPUNM: return luai_numunm(NULL, v1);
+ default: lua_assert(0); return 0;
+ }
+}
+
+
+int luaO_hexavalue (int c) {
+ if (lisdigit(c)) return c - '0';
+ else return ltolower(c) - 'a' + 10;
+}
+
+
+#if !defined(lua_strx2number)
+
+
+
+static int isneg (const char **s) {
+ if (**s == '-') { (*s)++; return 1; }
+ else if (**s == '+') (*s)++;
+ return 0;
+}
+
+
+static lua_Number readhexa (const char **s, lua_Number r, int *count) {
+ for (; lisxdigit(cast_uchar(**s)); (*s)++) { /* read integer part */
+ r = (r * cast_num(16.0)) + cast_num(luaO_hexavalue(cast_uchar(**s)));
+ (*count)++;
+ }
+ return r;
+}
+
+
+/*
+** convert a hexadecimal numeric string to a number, following the
+** C99 specification for 'strtod'
+*/
+static lua_Number lua_strx2number (const char *s, char **endptr) {
+ lua_Number r = 0.0;
+ int e = 0, i = 0;
+ int neg = 0; /* 1 if number is negative */
+ *endptr = cast(char *, s); /* nothing is valid yet */
+ while (lisspace(cast_uchar(*s))) s++; /* skip initial spaces */
+  neg = isneg(&s); /* check sign */
+ if (!(*s == '0' && (*(s + 1) == 'x' || *(s + 1) == 'X'))) /* check '0x' */
+ return 0.0; /* invalid format (no '0x') */
+ s += 2; /* skip '0x' */
+ r = readhexa(&s, r, &i); /* read integer part */
+ if (*s == '.') {
+ s++; /* skip dot */
+ r = readhexa(&s, r, &e); /* read fractional part */
+ }
+ if (i == 0 && e == 0)
+ return 0.0; /* invalid format (no digit) */
+  e *= -4; /* each fractional digit divides the value by 2^4 */
+ *endptr = cast(char *, s); /* valid up to here */
+ if (*s == 'p' || *s == 'P') { /* exponent part? */
+ int exp1 = 0;
+ int neg1;
+ s++; /* skip 'p' */
+    neg1 = isneg(&s); /* sign */
+ if (!lisdigit(cast_uchar(*s)))
+ goto ret; /* must have at least one digit */
+ while (lisdigit(cast_uchar(*s))) /* read exponent */
+ exp1 = exp1 * 10 + *(s++) - '0';
+ if (neg1) exp1 = -exp1;
+ e += exp1;
+ }
+ *endptr = cast(char *, s); /* valid up to here */
+ ret:
+ if (neg) r = -r;
+ return (r * (1 << e));
+}
+
+#endif
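+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** expected results of the hexadecimal conversion above.  Note that this
+** port scales the result with an integer shift, (1 << e), so the sketch
+** sticks to inputs whose net binary exponent is non-negative.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_strx2number (void) {
+  const char *bad = "zzz";
+  char *end;
+  lua_assert(lua_strx2number("0x10", &end) == 16.0);   /* plain hex integer */
+  lua_assert(lua_strx2number("0x1p4", &end) == 16.0);  /* 1 * 2^4 */
+  lua_assert(lua_strx2number(bad, &end) == 0.0 && end == bad);  /* no '0x' */
+}
+#endif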
+
+
+int luaO_str2d (const char *s, size_t len, lua_Number *result) {
+ char *endptr;
+ if (strpbrk(s, "nN")) /* reject 'inf' and 'nan' */
+ return 0;
+ else if (strpbrk(s, "xX")) /* hexa? */
+ *result = lua_strx2number(s, &endptr);
+ else
+ *result = lua_str2number(s, &endptr);
+ if (endptr == s) return 0; /* nothing recognized */
+ while (lisspace(cast_uchar(*endptr))) endptr++;
+ return (endptr == s + len); /* OK if no trailing characters */
+}
+
+
+
+static void pushstr (lua_State *L, const char *str, size_t l) {
+ setsvalue2s(L, L->top++, luaS_newlstr(L, str, l));
+}
+
+
+/* this function handles only the `%d', `%c', `%f', `%p', and `%s' formats */
+const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) {
+ int n = 0;
+ for (;;) {
+ const char *e = strchr(fmt, '%');
+ if (e == NULL) break;
+ luaD_checkstack(L, 2); /* fmt + item */
+ pushstr(L, fmt, e - fmt);
+ switch (*(e+1)) {
+ case 's': {
+ const char *s = va_arg(argp, char *);
+ if (s == NULL) s = "(null)";
+ pushstr(L, s, strlen(s));
+ break;
+ }
+ case 'c': {
+ char buff;
+ buff = cast(char, va_arg(argp, int));
+ pushstr(L, &buff, 1);
+ break;
+ }
+ case 'd': {
+ setnvalue(L->top++, cast_num(va_arg(argp, int)));
+ break;
+ }
+ case 'f': {
+ setnvalue(L->top++, cast_num(va_arg(argp, l_uacNumber)));
+ break;
+ }
+ case 'p': {
+ char buff[4*sizeof(void *) + 8]; /* should be enough space for a `%p' */
+ int l = lcompat_sprintf(buff, sizeof(buff), "%p", va_arg(argp, void *));
+ pushstr(L, buff, l);
+ break;
+ }
+ case '%': {
+ pushstr(L, "%", 1);
+ break;
+ }
+ default: {
+ luaG_runerror(L,
+ "invalid option " LUA_QL("%%%c") " to " LUA_QL("lua_pushfstring"),
+ *(e + 1));
+ }
+ }
+ n += 2;
+ fmt = e+2;
+ }
+ luaD_checkstack(L, 1);
+ pushstr(L, fmt, strlen(fmt));
+ if (n > 0) luaV_concat(L, n + 1);
+ return svalue(L->top - 1);
+}
+
+
+const char *luaO_pushfstring (lua_State *L, const char *fmt, ...) {
+ const char *msg;
+ va_list argp;
+ va_start(argp, fmt);
+ msg = luaO_pushvfstring(L, fmt, argp);
+ va_end(argp);
+ return msg;
+}
+
+
+/* number of chars of a literal string without the ending \0 */
+#define LL(x) (sizeof(x)/sizeof(char) - 1)
+
+#define RETS "..."
+#define PRE "[string \""
+#define POS "\"]"
+
+#define addstr(a,b,l) ( memcpy(a,b,(l) * sizeof(char)), a += (l) )
+
+void luaO_chunkid (char *out, const char *source, size_t bufflen) {
+ size_t l = strlen(source);
+ if (*source == '=') { /* 'literal' source */
+ if (l <= bufflen) /* small enough? */
+ memcpy(out, source + 1, l * sizeof(char));
+ else { /* truncate it */
+ addstr(out, source + 1, bufflen - 1);
+ *out = '\0';
+ }
+ }
+ else if (*source == '@') { /* file name */
+ if (l <= bufflen) /* small enough? */
+ memcpy(out, source + 1, l * sizeof(char));
+ else { /* add '...' before rest of name */
+ addstr(out, RETS, LL(RETS));
+ bufflen -= LL(RETS);
+ memcpy(out, source + 1 + l - bufflen, bufflen * sizeof(char));
+ }
+ }
+ else { /* string; format as [string "source"] */
+ const char *nl = strchr(source, '\n'); /* find first new line (if any) */
+ addstr(out, PRE, LL(PRE)); /* add prefix */
+ bufflen -= LL(PRE RETS POS) + 1; /* save space for prefix+suffix+'\0' */
+ if (l < bufflen && nl == NULL) { /* small one-line source? */
+ addstr(out, source, l); /* keep it */
+ }
+ else {
+ if (nl != NULL) l = nl - source; /* stop at first newline */
+ if (l > bufflen) l = bufflen;
+ addstr(out, source, l);
+ addstr(out, RETS, LL(RETS));
+ }
+ memcpy(out, POS, (LL(POS) + 1) * sizeof(char));
+ }
+}
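+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** the three source-name forms handled above and the chunk ids they produce.
+** LUA_IDSIZE is the buffer size normally used by callers (see lua_Debug).
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_chunkid (void) {
+  char buff[LUA_IDSIZE];
+  luaO_chunkid(buff, "=stdin", sizeof(buff));      /* literal: "stdin" */
+  luaO_chunkid(buff, "@test.lua", sizeof(buff));   /* file name: "test.lua" */
+  luaO_chunkid(buff, "print('x')", sizeof(buff));  /* [string "print('x')"] */
+}
+#endif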
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lobject.h b/sys/contrib/openzfs/module/lua/lobject.h
new file mode 100644
index 000000000000..a16b8d62eb4b
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lobject.h
@@ -0,0 +1,605 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lobject.h,v 2.71.1.2 2014/05/07 14:14:58 roberto Exp $
+** Type definitions for Lua objects
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lobject_h
+#define lobject_h
+
+
+#include "llimits.h"
+#include <sys/lua/lua.h>
+
+
+/*
+** Extra tags for non-values
+*/
+#define LUA_TPROTO LUA_NUMTAGS
+#define LUA_TUPVAL (LUA_NUMTAGS+1)
+#define LUA_TDEADKEY (LUA_NUMTAGS+2)
+
+/*
+** number of all possible tags (including LUA_TNONE but excluding DEADKEY)
+*/
+#define LUA_TOTALTAGS (LUA_TUPVAL+2)
+
+
+/*
+** tags for Tagged Values have the following use of bits:
+** bits 0-3: actual tag (a LUA_T* value)
+** bits 4-5: variant bits
+** bit 6: whether value is collectable
+*/
+
+#define VARBITS (3 << 4)
+
+
+/*
+** LUA_TFUNCTION variants:
+** 0 - Lua function
+** 1 - light C function
+** 2 - regular C function (closure)
+*/
+
+/* Variant tags for functions */
+#define LUA_TLCL (LUA_TFUNCTION | (0 << 4)) /* Lua closure */
+#define LUA_TLCF (LUA_TFUNCTION | (1 << 4)) /* light C function */
+#define LUA_TCCL (LUA_TFUNCTION | (2 << 4)) /* C closure */
+
+
+/* Variant tags for strings */
+#define LUA_TSHRSTR (LUA_TSTRING | (0 << 4)) /* short strings */
+#define LUA_TLNGSTR (LUA_TSTRING | (1 << 4)) /* long strings */
+
+
+/* Bit mark for collectable types */
+#define BIT_ISCOLLECTABLE (1 << 6)
+
+/* mark a tag as collectable */
+#define ctb(t) ((t) | BIT_ISCOLLECTABLE)
+
+
+/*
+** Union of all collectable objects
+*/
+typedef union GCObject GCObject;
+
+
+/*
+** Common Header for all collectable objects (in macro form, to be
+** included in other objects)
+*/
+#define CommonHeader GCObject *next; lu_byte tt; lu_byte marked
+
+
+/*
+** Common header in struct form
+*/
+typedef struct GCheader {
+ CommonHeader;
+} GCheader;
+
+
+
+/*
+** Union of all Lua values
+*/
+typedef union Value Value;
+
+
+#define numfield lua_Number n; /* numbers */
+
+
+
+/*
+** Tagged Values. This is the basic representation of values in Lua,
+** an actual value plus a tag with its type.
+*/
+
+#define TValuefields Value value_; int tt_
+
+typedef struct lua_TValue TValue;
+
+
+/* macro defining a nil value */
+#define NILCONSTANT {NULL}, LUA_TNIL
+
+
+#define val_(o) ((o)->value_)
+#define num_(o) (val_(o).n)
+
+
+/* raw type tag of a TValue */
+#define rttype(o) ((o)->tt_)
+
+/* tag with no variants (bits 0-3) */
+#define novariant(x) ((x) & 0x0F)
+
+/* type tag of a TValue (bits 0-3 for tags + variant bits 4-5) */
+#define ttype(o) (rttype(o) & 0x3F)
+
+/* type tag of a TValue with no variants (bits 0-3) */
+#define ttypenv(o) (novariant(rttype(o)))
+
+
+/* Macros to test type */
+#define checktag(o,t) (rttype(o) == (t))
+#define checktype(o,t) (ttypenv(o) == (t))
+#define ttisnumber(o) checktag((o), LUA_TNUMBER)
+#define ttisnil(o) checktag((o), LUA_TNIL)
+#define ttisboolean(o) checktag((o), LUA_TBOOLEAN)
+#define ttislightuserdata(o) checktag((o), LUA_TLIGHTUSERDATA)
+#define ttisstring(o) checktype((o), LUA_TSTRING)
+#define ttisshrstring(o) checktag((o), ctb(LUA_TSHRSTR))
+#define ttislngstring(o) checktag((o), ctb(LUA_TLNGSTR))
+#define ttistable(o) checktag((o), ctb(LUA_TTABLE))
+#define ttisfunction(o) checktype(o, LUA_TFUNCTION)
+#define ttisclosure(o) ((rttype(o) & 0x1F) == LUA_TFUNCTION)
+#define ttisCclosure(o) checktag((o), ctb(LUA_TCCL))
+#define ttisLclosure(o) checktag((o), ctb(LUA_TLCL))
+#define ttislcf(o) checktag((o), LUA_TLCF)
+#define ttisuserdata(o) checktag((o), ctb(LUA_TUSERDATA))
+#define ttisthread(o) checktag((o), ctb(LUA_TTHREAD))
+#define ttisdeadkey(o) checktag((o), LUA_TDEADKEY)
+
+#define ttisequal(o1,o2) (rttype(o1) == rttype(o2))
+
+/* Macros to access values */
+#define nvalue(o) check_exp(ttisnumber(o), num_(o))
+#define gcvalue(o) check_exp(iscollectable(o), val_(o).gc)
+#define pvalue(o) check_exp(ttislightuserdata(o), val_(o).p)
+#define rawtsvalue(o) check_exp(ttisstring(o), &val_(o).gc->ts)
+#define tsvalue(o) (&rawtsvalue(o)->tsv)
+#define rawuvalue(o) check_exp(ttisuserdata(o), &val_(o).gc->u)
+#define uvalue(o) (&rawuvalue(o)->uv)
+#define clvalue(o) check_exp(ttisclosure(o), &val_(o).gc->cl)
+#define clLvalue(o) check_exp(ttisLclosure(o), &val_(o).gc->cl.l)
+#define clCvalue(o) check_exp(ttisCclosure(o), &val_(o).gc->cl.c)
+#define fvalue(o) check_exp(ttislcf(o), val_(o).f)
+#define hvalue(o) check_exp(ttistable(o), &val_(o).gc->h)
+#define bvalue(o) check_exp(ttisboolean(o), val_(o).b)
+#define thvalue(o) check_exp(ttisthread(o), &val_(o).gc->th)
+/* a dead value may get the 'gc' field, but cannot access its contents */
+#define deadvalue(o) check_exp(ttisdeadkey(o), cast(void *, val_(o).gc))
+
+#define l_isfalse(o) (ttisnil(o) || (ttisboolean(o) && bvalue(o) == 0))
+
+
+#define iscollectable(o) (rttype(o) & BIT_ISCOLLECTABLE)
+
+
+/* Macros for internal tests */
+#define righttt(obj) (ttype(obj) == gcvalue(obj)->gch.tt)
+
+#define checkliveness(g,obj) \
+ lua_longassert(!iscollectable(obj) || \
+ (righttt(obj) && !isdead(g,gcvalue(obj))))
+
+
+/* Macros to set values */
+#define settt_(o,t) ((o)->tt_=(t))
+
+#define setnvalue(obj,x) \
+ { TValue *io=(obj); num_(io)=(x); settt_(io, LUA_TNUMBER); }
+
+#define setnilvalue(obj) settt_(obj, LUA_TNIL)
+
+#define setfvalue(obj,x) \
+ { TValue *io=(obj); val_(io).f=(x); settt_(io, LUA_TLCF); }
+
+#define setpvalue(obj,x) \
+ { TValue *io=(obj); val_(io).p=(x); settt_(io, LUA_TLIGHTUSERDATA); }
+
+#define setbvalue(obj,x) \
+ { TValue *io=(obj); val_(io).b=(x); settt_(io, LUA_TBOOLEAN); }
+
+#define setgcovalue(L,obj,x) \
+ { TValue *io=(obj); GCObject *i_g=(x); \
+ val_(io).gc=i_g; settt_(io, ctb(gch(i_g)->tt)); }
+
+#define setsvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ TString *x_ = (x); \
+ val_(io).gc=cast(GCObject *, x_); settt_(io, ctb(x_->tsv.tt)); \
+ checkliveness(G(L),io); }
+
+#define setuvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TUSERDATA)); \
+ checkliveness(G(L),io); }
+
+#define setthvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTHREAD)); \
+ checkliveness(G(L),io); }
+
+#define setclLvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TLCL)); \
+ checkliveness(G(L),io); }
+
+#define setclCvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TCCL)); \
+ checkliveness(G(L),io); }
+
+#define sethvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTABLE)); \
+ checkliveness(G(L),io); }
+
+#define setdeadvalue(obj) settt_(obj, LUA_TDEADKEY)
+
+
+
+#define setobj(L,obj1,obj2) \
+ { const TValue *io2=(obj2); TValue *io1=(obj1); \
+ io1->value_ = io2->value_; io1->tt_ = io2->tt_; \
+ checkliveness(G(L),io1); }
+
+
+/*
+** different types of assignments, according to destination
+*/
+
+/* from stack to (same) stack */
+#define setobjs2s setobj
+/* to stack (not from same stack) */
+#define setobj2s setobj
+#define setsvalue2s setsvalue
+#define sethvalue2s sethvalue
+#define setptvalue2s setptvalue
+/* from table to same table */
+#define setobjt2t setobj
+/* to table */
+#define setobj2t setobj
+/* to new object */
+#define setobj2n setobj
+#define setsvalue2n setsvalue
+
+
+/* check whether a number is valid (useful only for NaN trick) */
+#define luai_checknum(L,o,c) { /* empty */ }
+
+
+/*
+** {======================================================
+** NaN Trick
+** =======================================================
+*/
+#if defined(LUA_NANTRICK)
+
+/*
+** numbers are represented in the 'd_' field. All other values have the
+** value (NNMARK | tag) in 'tt__'. A number with such a pattern would be
+** a "signaling NaN", which is never generated by regular CPU operations
+** (nor by 'strtod')
+*/
+
+/* allows for external implementation for part of the trick */
+#if !defined(NNMARK) /* { */
+
+
+#if !defined(LUA_IEEEENDIAN)
+#error option 'LUA_NANTRICK' needs 'LUA_IEEEENDIAN'
+#endif
+
+
+#define NNMARK 0x7FF7A500
+#define NNMASK 0x7FFFFF00
+
+#undef TValuefields
+#undef NILCONSTANT
+
+#if (LUA_IEEEENDIAN == 0) /* { */
+
+/* little endian */
+#define TValuefields \
+ union { struct { Value v__; int tt__; } i; double d__; } u
+#define NILCONSTANT {{{NULL}, tag2tt(LUA_TNIL)}}
+/* field-access macros */
+#define v_(o) ((o)->u.i.v__)
+#define d_(o) ((o)->u.d__)
+#define tt_(o) ((o)->u.i.tt__)
+
+#else /* }{ */
+
+/* big endian */
+#define TValuefields \
+ union { struct { int tt__; Value v__; } i; double d__; } u
+#define NILCONSTANT {{tag2tt(LUA_TNIL), {NULL}}}
+/* field-access macros */
+#define v_(o) ((o)->u.i.v__)
+#define d_(o) ((o)->u.d__)
+#define tt_(o) ((o)->u.i.tt__)
+
+#endif /* } */
+
+#endif /* } */
+
+
+/* correspondence with standard representation */
+#undef val_
+#define val_(o) v_(o)
+#undef num_
+#define num_(o) d_(o)
+
+
+#undef numfield
+#define numfield /* no such field; numbers are the entire struct */
+
+/* basic check to distinguish numbers from non-numbers */
+#undef ttisnumber
+#define ttisnumber(o) ((tt_(o) & NNMASK) != NNMARK)
+
+#define tag2tt(t) (NNMARK | (t))
+
+#undef rttype
+#define rttype(o) (ttisnumber(o) ? LUA_TNUMBER : tt_(o) & 0xff)
+
+#undef settt_
+#define settt_(o,t) (tt_(o) = tag2tt(t))
+
+#undef setnvalue
+#define setnvalue(obj,x) \
+ { TValue *io_=(obj); num_(io_)=(x); lua_assert(ttisnumber(io_)); }
+
+#undef setobj
+#define setobj(L,obj1,obj2) \
+ { const TValue *o2_=(obj2); TValue *o1_=(obj1); \
+ o1_->u = o2_->u; \
+ checkliveness(G(L),o1_); }
+
+
+/*
+** these redefinitions are not mandatory, but these forms are more efficient
+*/
+
+#undef checktag
+#undef checktype
+#define checktag(o,t) (tt_(o) == tag2tt(t))
+#define checktype(o,t) (ctb(tt_(o) | VARBITS) == ctb(tag2tt(t) | VARBITS))
+
+#undef ttisequal
+#define ttisequal(o1,o2) \
+ (ttisnumber(o1) ? ttisnumber(o2) : (tt_(o1) == tt_(o2)))
+
+
+#undef luai_checknum
+#define luai_checknum(L,o,c) { if (!ttisnumber(o)) c; }
+
+#endif
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** types and prototypes
+** =======================================================
+*/
+
+
+union Value {
+ GCObject *gc; /* collectable objects */
+ void *p; /* light userdata */
+ int b; /* booleans */
+ lua_CFunction f; /* light C functions */
+ numfield /* numbers */
+};
+
+
+struct lua_TValue {
+ TValuefields;
+};
+
+
+typedef TValue *StkId; /* index to stack elements */
+
+
+
+
+/*
+** Header for string value; string bytes follow the end of this structure
+*/
+typedef union TString {
+ L_Umaxalign dummy; /* ensures maximum alignment for strings */
+ struct {
+ CommonHeader;
+ lu_byte extra; /* reserved words for short strings; "has hash" for longs */
+ unsigned int hash;
+ size_t len; /* number of characters in string */
+ } tsv;
+} TString;
+
+
+/* get the actual string (array of bytes) from a TString */
+#define getstr(ts) cast(const char *, (ts) + 1)
+
+/* get the actual string (array of bytes) from a Lua value */
+#define svalue(o) getstr(rawtsvalue(o))
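+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** the bytes of a string are stored immediately after its TString header,
+** which is what getstr()/svalue() rely on.  Lua strings may contain
+** embedded '\0', so the 'len' field is authoritative, not strlen().
+*/
+#if 0	/* illustrative only; never compiled */
+static const char *example_getstr (const TValue *o) {
+  TString *ts = rawtsvalue(o);      /* header of the string object */
+  size_t len = ts->tsv.len;         /* byte count kept in the header */
+  const char *bytes = getstr(ts);   /* data starts right after the header */
+  return (len > 0) ? bytes : "";
+}
+#endif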
+
+
+/*
+** Header for userdata; memory area follows the end of this structure
+*/
+typedef union Udata {
+ L_Umaxalign dummy; /* ensures maximum alignment for `local' udata */
+ struct {
+ CommonHeader;
+ struct Table *metatable;
+ struct Table *env;
+ size_t len; /* number of bytes */
+ } uv;
+} Udata;
+
+
+
+/*
+** Description of an upvalue for function prototypes
+*/
+typedef struct Upvaldesc {
+ TString *name; /* upvalue name (for debug information) */
+ lu_byte instack; /* whether it is in stack */
+ lu_byte idx; /* index of upvalue (in stack or in outer function's list) */
+} Upvaldesc;
+
+
+/*
+** Description of a local variable for function prototypes
+** (used for debug information)
+*/
+typedef struct LocVar {
+ TString *varname;
+ int startpc; /* first point where variable is active */
+ int endpc; /* first point where variable is dead */
+} LocVar;
+
+
+/*
+** Function Prototypes
+*/
+typedef struct Proto {
+ CommonHeader;
+ TValue *k; /* constants used by the function */
+ Instruction *code;
+ struct Proto **p; /* functions defined inside the function */
+ int *lineinfo; /* map from opcodes to source lines (debug information) */
+ LocVar *locvars; /* information about local variables (debug information) */
+ Upvaldesc *upvalues; /* upvalue information */
+ union Closure *cache; /* last created closure with this prototype */
+ TString *source; /* used for debug information */
+ int sizeupvalues; /* size of 'upvalues' */
+ int sizek; /* size of `k' */
+ int sizecode;
+ int sizelineinfo;
+ int sizep; /* size of `p' */
+ int sizelocvars;
+ int linedefined;
+ int lastlinedefined;
+ GCObject *gclist;
+ lu_byte numparams; /* number of fixed parameters */
+ lu_byte is_vararg;
+ lu_byte maxstacksize; /* maximum stack used by this function */
+} Proto;
+
+
+
+/*
+** Lua Upvalues
+*/
+typedef struct UpVal {
+ CommonHeader;
+ TValue *v; /* points to stack or to its own value */
+ union {
+ TValue value; /* the value (when closed) */
+ struct { /* double linked list (when open) */
+ struct UpVal *prev;
+ struct UpVal *next;
+ } l;
+ } u;
+} UpVal;
+
+
+/*
+** Closures
+*/
+
+#define ClosureHeader \
+ CommonHeader; lu_byte nupvalues; GCObject *gclist
+
+typedef struct CClosure {
+ ClosureHeader;
+ lua_CFunction f;
+ TValue upvalue[1]; /* list of upvalues */
+} CClosure;
+
+
+typedef struct LClosure {
+ ClosureHeader;
+ struct Proto *p;
+ UpVal *upvals[1]; /* list of upvalues */
+} LClosure;
+
+
+typedef union Closure {
+ CClosure c;
+ LClosure l;
+} Closure;
+
+
+#define isLfunction(o) ttisLclosure(o)
+
+#define getproto(o) (clLvalue(o)->p)
+
+
+/*
+** Tables
+*/
+
+typedef union TKey {
+ struct {
+ TValuefields;
+ struct Node *next; /* for chaining */
+ } nk;
+ TValue tvk;
+} TKey;
+
+
+typedef struct Node {
+ TValue i_val;
+ TKey i_key;
+} Node;
+
+
+typedef struct Table {
+ CommonHeader;
+ lu_byte flags; /* 1<<p means tagmethod(p) is not present */
+ lu_byte lsizenode; /* log2 of size of `node' array */
+ int sizearray; /* size of `array' array */
+ TValue *array; /* array part */
+ Node *node;
+ Node *lastfree; /* any free position is before this position */
+ struct Table *metatable;
+ GCObject *gclist;
+} Table;
+
+
+
+/*
+** `modulo' operation for hashing (size is always a power of 2)
+*/
+#define lmod(s,size) \
+ (check_exp((size&(size-1))==0, (cast(int, (s) & ((size)-1)))))
+
+
+#define twoto(x) (1<<(x))
+#define sizenode(t) (twoto((t)->lsizenode))
+
+
+/*
+** (address of) a fixed nil value
+*/
+#define luaO_nilobject (&luaO_nilobject_)
+
+
+LUAI_DDEC const TValue luaO_nilobject_;
+
+
+LUAI_FUNC int luaO_int2fb (unsigned int x);
+LUAI_FUNC int luaO_fb2int (int x);
+LUAI_FUNC int luaO_ceillog2 (unsigned int x);
+LUAI_FUNC lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2);
+LUAI_FUNC int luaO_str2d (const char *s, size_t len, lua_Number *result);
+LUAI_FUNC int luaO_hexavalue (int c);
+LUAI_FUNC const char *luaO_pushvfstring (lua_State *L, const char *fmt,
+ va_list argp);
+LUAI_FUNC const char *luaO_pushfstring (lua_State *L, const char *fmt, ...);
+LUAI_FUNC void luaO_chunkid (char *out, const char *source, size_t len);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lopcodes.c b/sys/contrib/openzfs/module/lua/lopcodes.c
new file mode 100644
index 000000000000..5f34e6d90515
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lopcodes.c
@@ -0,0 +1,108 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lopcodes.c,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $
+** Opcodes for Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+
+#define lopcodes_c
+#define LUA_CORE
+
+
+#include "lopcodes.h"
+
+
+/* ORDER OP */
+
+LUAI_DDEF const char *const luaP_opnames[NUM_OPCODES+1] = {
+ "MOVE",
+ "LOADK",
+ "LOADKX",
+ "LOADBOOL",
+ "LOADNIL",
+ "GETUPVAL",
+ "GETTABUP",
+ "GETTABLE",
+ "SETTABUP",
+ "SETUPVAL",
+ "SETTABLE",
+ "NEWTABLE",
+ "SELF",
+ "ADD",
+ "SUB",
+ "MUL",
+ "DIV",
+ "MOD",
+ "POW",
+ "UNM",
+ "NOT",
+ "LEN",
+ "CONCAT",
+ "JMP",
+ "EQ",
+ "LT",
+ "LE",
+ "TEST",
+ "TESTSET",
+ "CALL",
+ "TAILCALL",
+ "RETURN",
+ "FORLOOP",
+ "FORPREP",
+ "TFORCALL",
+ "TFORLOOP",
+ "SETLIST",
+ "CLOSURE",
+ "VARARG",
+ "EXTRAARG",
+ NULL
+};
+
+
+#define opmode(t,a,b,c,m) (((t)<<7) | ((a)<<6) | ((b)<<4) | ((c)<<2) | (m))
+
+LUAI_DDEF const lu_byte luaP_opmodes[NUM_OPCODES] = {
+/* T A B C mode opcode */
+ opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_MOVE */
+ ,opmode(0, 1, OpArgK, OpArgN, iABx) /* OP_LOADK */
+ ,opmode(0, 1, OpArgN, OpArgN, iABx) /* OP_LOADKX */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_LOADBOOL */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_LOADNIL */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_GETUPVAL */
+ ,opmode(0, 1, OpArgU, OpArgK, iABC) /* OP_GETTABUP */
+ ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_GETTABLE */
+ ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABUP */
+ ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_SETUPVAL */
+ ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABLE */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_NEWTABLE */
+ ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_SELF */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_ADD */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_SUB */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MUL */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_DIV */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MOD */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_POW */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_UNM */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_NOT */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_LEN */
+ ,opmode(0, 1, OpArgR, OpArgR, iABC) /* OP_CONCAT */
+ ,opmode(0, 0, OpArgR, OpArgN, iAsBx) /* OP_JMP */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_EQ */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LT */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LE */
+ ,opmode(1, 0, OpArgN, OpArgU, iABC) /* OP_TEST */
+ ,opmode(1, 1, OpArgR, OpArgU, iABC) /* OP_TESTSET */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_CALL */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_TAILCALL */
+ ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_RETURN */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORLOOP */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORPREP */
+ ,opmode(0, 0, OpArgN, OpArgU, iABC) /* OP_TFORCALL */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_TFORLOOP */
+ ,opmode(0, 0, OpArgU, OpArgU, iABC) /* OP_SETLIST */
+ ,opmode(0, 1, OpArgU, OpArgN, iABx) /* OP_CLOSURE */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_VARARG */
+ ,opmode(0, 0, OpArgU, OpArgU, iAx) /* OP_EXTRAARG */
+};
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lopcodes.h b/sys/contrib/openzfs/module/lua/lopcodes.h
new file mode 100644
index 000000000000..02eeec1ecd06
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lopcodes.h
@@ -0,0 +1,290 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lopcodes.h,v 1.142.1.2 2014/10/20 18:32:09 roberto Exp $
+** Opcodes for Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lopcodes_h
+#define lopcodes_h
+
+#include "llimits.h"
+
+
+/*===========================================================================
+ We assume that instructions are unsigned numbers.
+ All instructions have an opcode in the first 6 bits.
+ Instructions can have the following fields:
+ `A' : 8 bits
+ `B' : 9 bits
+ `C' : 9 bits
+ 'Ax' : 26 bits ('A', 'B', and 'C' together)
+ `Bx' : 18 bits (`B' and `C' together)
+ `sBx' : signed Bx
+
+ A signed argument is represented in excess K; that is, the number
+ value is the unsigned value minus K. K is exactly the maximum value
+ for that argument (so that -max is represented by 0, and +max is
+ represented by 2*max), which is half the maximum for the corresponding
+ unsigned argument.
+===========================================================================*/
+
+
+enum OpMode {iABC, iABx, iAsBx, iAx}; /* basic instruction format */
+
+
+/*
+** size and position of opcode arguments.
+*/
+#define SIZE_C 9
+#define SIZE_B 9
+#define SIZE_Bx (SIZE_C + SIZE_B)
+#define SIZE_A 8
+#define SIZE_Ax (SIZE_C + SIZE_B + SIZE_A)
+
+#define SIZE_OP 6
+
+#define POS_OP 0
+#define POS_A (POS_OP + SIZE_OP)
+#define POS_C (POS_A + SIZE_A)
+#define POS_B (POS_C + SIZE_C)
+#define POS_Bx POS_C
+#define POS_Ax POS_A
+
+
+/*
+** limits for opcode arguments.
+** we use (signed) int to manipulate most arguments,
+** so they must fit in LUAI_BITSINT-1 bits (-1 for sign)
+*/
+#if SIZE_Bx < LUAI_BITSINT-1
+#define MAXARG_Bx ((1<<SIZE_Bx)-1)
+#define MAXARG_sBx (MAXARG_Bx>>1) /* `sBx' is signed */
+#else
+#define MAXARG_Bx MAX_INT
+#define MAXARG_sBx MAX_INT
+#endif
+
+#if SIZE_Ax < LUAI_BITSINT-1
+#define MAXARG_Ax ((1<<SIZE_Ax)-1)
+#else
+#define MAXARG_Ax MAX_INT
+#endif
+
+
+#define MAXARG_A ((1<<SIZE_A)-1)
+#define MAXARG_B ((1<<SIZE_B)-1)
+#define MAXARG_C ((1<<SIZE_C)-1)
+
+
+/* creates a mask with `n' 1 bits at position `p' */
+#define MASK1(n,p) ((~((~(Instruction)0)<<(n)))<<(p))
+
+/* creates a mask with `n' 0 bits at position `p' */
+#define MASK0(n,p) (~MASK1(n,p))
+
+/*
+** the following macros help to manipulate instructions
+*/
+
+#define GET_OPCODE(i) (cast(OpCode, ((i)>>POS_OP) & MASK1(SIZE_OP,0)))
+#define SET_OPCODE(i,o) ((i) = (((i)&MASK0(SIZE_OP,POS_OP)) | \
+ ((cast(Instruction, o)<<POS_OP)&MASK1(SIZE_OP,POS_OP))))
+
+#define getarg(i,pos,size) (cast(int, ((i)>>pos) & MASK1(size,0)))
+#define setarg(i,v,pos,size) ((i) = (((i)&MASK0(size,pos)) | \
+ ((cast(Instruction, v)<<pos)&MASK1(size,pos))))
+
+#define GETARG_A(i) getarg(i, POS_A, SIZE_A)
+#define SETARG_A(i,v) setarg(i, v, POS_A, SIZE_A)
+
+#define GETARG_B(i) getarg(i, POS_B, SIZE_B)
+#define SETARG_B(i,v) setarg(i, v, POS_B, SIZE_B)
+
+#define GETARG_C(i) getarg(i, POS_C, SIZE_C)
+#define SETARG_C(i,v) setarg(i, v, POS_C, SIZE_C)
+
+#define GETARG_Bx(i) getarg(i, POS_Bx, SIZE_Bx)
+#define SETARG_Bx(i,v) setarg(i, v, POS_Bx, SIZE_Bx)
+
+#define GETARG_Ax(i) getarg(i, POS_Ax, SIZE_Ax)
+#define SETARG_Ax(i,v) setarg(i, v, POS_Ax, SIZE_Ax)
+
+#define GETARG_sBx(i) (GETARG_Bx(i)-MAXARG_sBx)
+#define SETARG_sBx(i,b) SETARG_Bx((i),cast(unsigned int, (b)+MAXARG_sBx))
+
+
+#define CREATE_ABC(o,a,b,c) ((cast(Instruction, o)<<POS_OP) \
+ | (cast(Instruction, a)<<POS_A) \
+ | (cast(Instruction, b)<<POS_B) \
+ | (cast(Instruction, c)<<POS_C))
+
+#define CREATE_ABx(o,a,bc) ((cast(Instruction, o)<<POS_OP) \
+ | (cast(Instruction, a)<<POS_A) \
+ | (cast(Instruction, bc)<<POS_Bx))
+
+#define CREATE_Ax(o,a) ((cast(Instruction, o)<<POS_OP) \
+ | (cast(Instruction, a)<<POS_Ax))
+
+
+/*
+** Macros to operate RK indices
+*/
+
+/* if this bit is 1, the value is a constant index (0 means a register) */
+#define BITRK (1 << (SIZE_B - 1))
+
+/* test whether value is a constant */
+#define ISK(x) ((x) & BITRK)
+
+/* gets the index of the constant */
+#define INDEXK(r) ((int)(r) & ~BITRK)
+
+#define MAXINDEXRK (BITRK - 1)
+
+/* code a constant index as a RK value */
+#define RKASK(x) ((x) | BITRK)
+
+
+/*
+** invalid register that fits in 8 bits
+*/
+#define NO_REG MAXARG_A
+
+
+/*
+** R(x) - register
+** Kst(x) - constant (in constant table)
+** RK(x) == if ISK(x) then Kst(INDEXK(x)) else R(x)
+*/
+
+
+/*
+** grep "ORDER OP" if you change these enums
+*/
+
+typedef enum {
+/*----------------------------------------------------------------------
+name args description
+------------------------------------------------------------------------*/
+OP_MOVE,/* A B R(A) := R(B) */
+OP_LOADK,/* A Bx R(A) := Kst(Bx) */
+OP_LOADKX,/* A R(A) := Kst(extra arg) */
+OP_LOADBOOL,/* A B C R(A) := (Bool)B; if (C) pc++ */
+OP_LOADNIL,/* A B R(A), R(A+1), ..., R(A+B) := nil */
+OP_GETUPVAL,/* A B R(A) := UpValue[B] */
+
+OP_GETTABUP,/* A B C R(A) := UpValue[B][RK(C)] */
+OP_GETTABLE,/* A B C R(A) := R(B)[RK(C)] */
+
+OP_SETTABUP,/* A B C UpValue[A][RK(B)] := RK(C) */
+OP_SETUPVAL,/* A B UpValue[B] := R(A) */
+OP_SETTABLE,/* A B C R(A)[RK(B)] := RK(C) */
+
+OP_NEWTABLE,/* A B C R(A) := {} (size = B,C) */
+
+OP_SELF,/* A B C R(A+1) := R(B); R(A) := R(B)[RK(C)] */
+
+OP_ADD,/* A B C R(A) := RK(B) + RK(C) */
+OP_SUB,/* A B C R(A) := RK(B) - RK(C) */
+OP_MUL,/* A B C R(A) := RK(B) * RK(C) */
+OP_DIV,/* A B C R(A) := RK(B) / RK(C) */
+OP_MOD,/* A B C R(A) := RK(B) % RK(C) */
+OP_POW,/* A B C R(A) := RK(B) ^ RK(C) */
+OP_UNM,/* A B R(A) := -R(B) */
+OP_NOT,/* A B R(A) := not R(B) */
+OP_LEN,/* A B R(A) := length of R(B) */
+
+OP_CONCAT,/* A B C R(A) := R(B).. ... ..R(C) */
+
+OP_JMP,/* A sBx pc+=sBx; if (A) close all upvalues >= R(A - 1) */
+OP_EQ,/* A B C if ((RK(B) == RK(C)) ~= A) then pc++ */
+OP_LT,/* A B C if ((RK(B) < RK(C)) ~= A) then pc++ */
+OP_LE,/* A B C if ((RK(B) <= RK(C)) ~= A) then pc++ */
+
+OP_TEST,/* A C if not (R(A) <=> C) then pc++ */
+OP_TESTSET,/* A B C if (R(B) <=> C) then R(A) := R(B) else pc++ */
+
+OP_CALL,/* A B C R(A), ... ,R(A+C-2) := R(A)(R(A+1), ... ,R(A+B-1)) */
+OP_TAILCALL,/* A B C return R(A)(R(A+1), ... ,R(A+B-1)) */
+OP_RETURN,/* A B return R(A), ... ,R(A+B-2) (see note) */
+
+OP_FORLOOP,/* A sBx R(A)+=R(A+2);
+ if R(A) <?= R(A+1) then { pc+=sBx; R(A+3)=R(A) }*/
+OP_FORPREP,/* A sBx R(A)-=R(A+2); pc+=sBx */
+
+OP_TFORCALL,/* A C R(A+3), ... ,R(A+2+C) := R(A)(R(A+1), R(A+2)); */
+OP_TFORLOOP,/* A sBx if R(A+1) ~= nil then { R(A)=R(A+1); pc += sBx }*/
+
+OP_SETLIST,/* A B C R(A)[(C-1)*FPF+i] := R(A+i), 1 <= i <= B */
+
+OP_CLOSURE,/* A Bx R(A) := closure(KPROTO[Bx]) */
+
+OP_VARARG,/* A B R(A), R(A+1), ..., R(A+B-2) = vararg */
+
+OP_EXTRAARG/* Ax extra (larger) argument for previous opcode */
+} OpCode;
+
+
+#define NUM_OPCODES (cast(int, OP_EXTRAARG) + 1)
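+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** packing and unpacking an instruction with the field layout described at
+** the top of this header.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_encode (void) {
+  /* OP_ADD is iABC: R(A) := RK(B) + RK(C) */
+  Instruction i = CREATE_ABC(OP_ADD, 0, 1, RKASK(2));  /* C is constant #2 */
+  lua_assert(GET_OPCODE(i) == OP_ADD && GETARG_A(i) == 0);
+  lua_assert(!ISK(GETARG_B(i)) && ISK(GETARG_C(i)));
+  lua_assert(INDEXK(GETARG_C(i)) == 2);
+  /* sBx arguments are stored in excess-MAXARG_sBx form */
+  i = CREATE_ABx(OP_JMP, 0, 0);
+  SETARG_sBx(i, -3);                /* e.g. a backward jump of 3 */
+  lua_assert(GETARG_sBx(i) == -3);
+}
+#endif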
+
+
+
+/*===========================================================================
+ Notes:
+ (*) In OP_CALL, if (B == 0) then B = top. If (C == 0), then `top' is
+ set to last_result+1, so next open instruction (OP_CALL, OP_RETURN,
+ OP_SETLIST) may use `top'.
+
+ (*) In OP_VARARG, if (B == 0) then use actual number of varargs and
+ set top (like in OP_CALL with C == 0).
+
+ (*) In OP_RETURN, if (B == 0) then return up to `top'.
+
+ (*) In OP_SETLIST, if (B == 0) then B = `top'; if (C == 0) then next
+ 'instruction' is EXTRAARG(real C).
+
+ (*) In OP_LOADKX, the next 'instruction' is always EXTRAARG.
+
+ (*) For comparisons, A specifies what condition the test should accept
+ (true or false).
+
+ (*) All `skips' (pc++) assume that next instruction is a jump.
+
+===========================================================================*/
+
+
+/*
+** masks for instruction properties. The format is:
+** bits 0-1: op mode
+** bits 2-3: C arg mode
+** bits 4-5: B arg mode
+** bit 6: instruction set register A
+** bit 7: operator is a test (next instruction must be a jump)
+*/
+
+enum OpArgMask {
+ OpArgN, /* argument is not used */
+ OpArgU, /* argument is used */
+ OpArgR, /* argument is a register or a jump offset */
+ OpArgK /* argument is a constant or register/constant */
+};
+
+LUAI_DDEC const lu_byte luaP_opmodes[NUM_OPCODES];
+
+#define getOpMode(m) (cast(enum OpMode, luaP_opmodes[m] & 3))
+#define getBMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 4) & 3))
+#define getCMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 2) & 3))
+#define testAMode(m) (luaP_opmodes[m] & (1 << 6))
+#define testTMode(m) (luaP_opmodes[m] & (1 << 7))
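+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** decoding the packed mode byte for two opcodes (see the table that
+** initializes luaP_opmodes in lopcodes.c).
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_opmodes (void) {
+  /* OP_ADD writes register A, takes two RK operands, and is not a test */
+  lua_assert(getOpMode(OP_ADD) == iABC);
+  lua_assert(getBMode(OP_ADD) == OpArgK && getCMode(OP_ADD) == OpArgK);
+  lua_assert(testAMode(OP_ADD) && !testTMode(OP_ADD));
+  /* OP_EQ is a test: the following instruction must be a jump */
+  lua_assert(testTMode(OP_EQ) && !testAMode(OP_EQ));
+}
+#endif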
+
+
+LUAI_DDEC const char *const luaP_opnames[NUM_OPCODES+1]; /* opcode names */
+
+
+/* number of list items to accumulate before a SETLIST instruction */
+#define LFIELDS_PER_FLUSH 50
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lparser.c b/sys/contrib/openzfs/module/lua/lparser.c
new file mode 100644
index 000000000000..e1dd88f2f654
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lparser.c
@@ -0,0 +1,1643 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lparser.c,v 2.130.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua Parser
+** See Copyright Notice in lua.h
+*/
+
+#define lparser_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+
+
+
+/* maximum number of local variables per function (must be smaller
+ than 250, due to the bytecode format) */
+#define MAXVARS 200
+
+
+#define hasmultret(k) ((k) == VCALL || (k) == VVARARG)
+
+
+
+/*
+** nodes for block list (list of active blocks)
+*/
+typedef struct BlockCnt {
+ struct BlockCnt *previous; /* chain */
+ short firstlabel; /* index of first label in this block */
+ short firstgoto; /* index of first pending goto in this block */
+ lu_byte nactvar; /* # active locals outside the block */
+ lu_byte upval; /* true if some variable in the block is an upvalue */
+ lu_byte isloop; /* true if `block' is a loop */
+} BlockCnt;
+
+
+
+/*
+** prototypes for recursive non-terminal functions
+*/
+static void statement (LexState *ls);
+static void expr (LexState *ls, expdesc *v);
+
+
+static void anchor_token (LexState *ls) {
+ /* last token from outer function must be EOS */
+ lua_assert(ls->fs != NULL || ls->t.token == TK_EOS);
+ if (ls->t.token == TK_NAME || ls->t.token == TK_STRING) {
+ TString *ts = ls->t.seminfo.ts;
+ luaX_newstring(ls, getstr(ts), ts->tsv.len);
+ }
+}
+
+
+/* semantic error */
+static l_noret semerror (LexState *ls, const char *msg) {
+ ls->t.token = 0; /* remove 'near to' from final message */
+ luaX_syntaxerror(ls, msg);
+}
+
+
+static l_noret error_expected (LexState *ls, int token) {
+ luaX_syntaxerror(ls,
+ luaO_pushfstring(ls->L, "%s expected", luaX_token2str(ls, token)));
+}
+
+
+static l_noret errorlimit (FuncState *fs, int limit, const char *what) {
+ lua_State *L = fs->ls->L;
+ const char *msg;
+ int line = fs->f->linedefined;
+ const char *where = (line == 0)
+ ? "main function"
+ : luaO_pushfstring(L, "function at line %d", line);
+ msg = luaO_pushfstring(L, "too many %s (limit is %d) in %s",
+ what, limit, where);
+ luaX_syntaxerror(fs->ls, msg);
+}
+
+
+static void checklimit (FuncState *fs, int v, int l, const char *what) {
+ if (v > l) errorlimit(fs, l, what);
+}
+
+
+static int testnext (LexState *ls, int c) {
+ if (ls->t.token == c) {
+ luaX_next(ls);
+ return 1;
+ }
+ else return 0;
+}
+
+
+static void check (LexState *ls, int c) {
+ if (ls->t.token != c)
+ error_expected(ls, c);
+}
+
+
+static void checknext (LexState *ls, int c) {
+ check(ls, c);
+ luaX_next(ls);
+}
+
+
+#define check_condition(ls,c,msg) { if (!(c)) luaX_syntaxerror(ls, msg); }
+
+
+
+static void check_match (LexState *ls, int what, int who, int where) {
+ if (!testnext(ls, what)) {
+ if (where == ls->linenumber)
+ error_expected(ls, what);
+ else {
+ luaX_syntaxerror(ls, luaO_pushfstring(ls->L,
+ "%s expected (to close %s at line %d)",
+ luaX_token2str(ls, what), luaX_token2str(ls, who), where));
+ }
+ }
+}
+
+
+static TString *str_checkname (LexState *ls) {
+ TString *ts;
+ check(ls, TK_NAME);
+ ts = ls->t.seminfo.ts;
+ luaX_next(ls);
+ return ts;
+}
+
+
+static void init_exp (expdesc *e, expkind k, int i) {
+ e->f = e->t = NO_JUMP;
+ e->k = k;
+ e->u.info = i;
+}
+
+
+static void codestring (LexState *ls, expdesc *e, TString *s) {
+ init_exp(e, VK, luaK_stringK(ls->fs, s));
+}
+
+
+static void checkname (LexState *ls, expdesc *e) {
+ codestring(ls, e, str_checkname(ls));
+}
+
+
+static int registerlocalvar (LexState *ls, TString *varname) {
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f;
+ int oldsize = f->sizelocvars;
+ luaM_growvector(ls->L, f->locvars, fs->nlocvars, f->sizelocvars,
+ LocVar, SHRT_MAX, "local variables");
+ while (oldsize < f->sizelocvars) f->locvars[oldsize++].varname = NULL;
+ f->locvars[fs->nlocvars].varname = varname;
+ luaC_objbarrier(ls->L, f, varname);
+ return fs->nlocvars++;
+}
+
+
+static void new_localvar (LexState *ls, TString *name) {
+ FuncState *fs = ls->fs;
+ Dyndata *dyd = ls->dyd;
+ int reg = registerlocalvar(ls, name);
+ checklimit(fs, dyd->actvar.n + 1 - fs->firstlocal,
+ MAXVARS, "local variables");
+ luaM_growvector(ls->L, dyd->actvar.arr, dyd->actvar.n + 1,
+ dyd->actvar.size, Vardesc, MAX_INT, "local variables");
+ dyd->actvar.arr[dyd->actvar.n++].idx = cast(short, reg);
+}
+
+
+static void new_localvarliteral_ (LexState *ls, const char *name, size_t sz) {
+ new_localvar(ls, luaX_newstring(ls, name, sz));
+}
+
+#define new_localvarliteral(ls,v) \
+ new_localvarliteral_(ls, "" v, (sizeof(v)/sizeof(char))-1)
+
+
+static LocVar *getlocvar (FuncState *fs, int i) {
+ int idx = fs->ls->dyd->actvar.arr[fs->firstlocal + i].idx;
+ lua_assert(idx < fs->nlocvars);
+ return &fs->f->locvars[idx];
+}
+
+
+static void adjustlocalvars (LexState *ls, int nvars) {
+ FuncState *fs = ls->fs;
+ fs->nactvar = cast_byte(fs->nactvar + nvars);
+ for (; nvars; nvars--) {
+ getlocvar(fs, fs->nactvar - nvars)->startpc = fs->pc;
+ }
+}
+
+
+static void removevars (FuncState *fs, int tolevel) {
+ fs->ls->dyd->actvar.n -= (fs->nactvar - tolevel);
+ while (fs->nactvar > tolevel)
+ getlocvar(fs, --fs->nactvar)->endpc = fs->pc;
+}
+
+
+static int searchupvalue (FuncState *fs, TString *name) {
+ int i;
+ Upvaldesc *up = fs->f->upvalues;
+ for (i = 0; i < fs->nups; i++) {
+ if (luaS_eqstr(up[i].name, name)) return i;
+ }
+ return -1; /* not found */
+}
+
+
+static int newupvalue (FuncState *fs, TString *name, expdesc *v) {
+ Proto *f = fs->f;
+ int oldsize = f->sizeupvalues;
+ checklimit(fs, fs->nups + 1, MAXUPVAL, "upvalues");
+ luaM_growvector(fs->ls->L, f->upvalues, fs->nups, f->sizeupvalues,
+ Upvaldesc, MAXUPVAL, "upvalues");
+ while (oldsize < f->sizeupvalues) f->upvalues[oldsize++].name = NULL;
+ f->upvalues[fs->nups].instack = (v->k == VLOCAL);
+ f->upvalues[fs->nups].idx = cast_byte(v->u.info);
+ f->upvalues[fs->nups].name = name;
+ luaC_objbarrier(fs->ls->L, f, name);
+ return fs->nups++;
+}
+
+
+static int searchvar (FuncState *fs, TString *n) {
+ int i;
+ for (i = cast_int(fs->nactvar) - 1; i >= 0; i--) {
+ if (luaS_eqstr(n, getlocvar(fs, i)->varname))
+ return i;
+ }
+ return -1; /* not found */
+}
+
+
+/*
+ Mark block where variable at given level was defined
+ (to emit close instructions later).
+*/
+static void markupval (FuncState *fs, int level) {
+ BlockCnt *bl = fs->bl;
+ while (bl->nactvar > level) bl = bl->previous;
+ bl->upval = 1;
+}
+
+
+/*
+ Find variable with given name 'n'. If it is an upvalue, add this
+ upvalue into all intermediate functions.
+*/
+static int singlevaraux (FuncState *fs, TString *n, expdesc *var, int base) {
+ if (fs == NULL) /* no more levels? */
+ return VVOID; /* default is global */
+ else {
+ int v = searchvar(fs, n); /* look up locals at current level */
+ if (v >= 0) { /* found? */
+ init_exp(var, VLOCAL, v); /* variable is local */
+ if (!base)
+ markupval(fs, v); /* local will be used as an upval */
+ return VLOCAL;
+ }
+ else { /* not found as local at current level; try upvalues */
+ int idx = searchupvalue(fs, n); /* try existing upvalues */
+ if (idx < 0) { /* not found? */
+ if (singlevaraux(fs->prev, n, var, 0) == VVOID) /* try upper levels */
+ return VVOID; /* not found; is a global */
+ /* else was LOCAL or UPVAL */
+ idx = newupvalue(fs, n, var); /* will be a new upvalue */
+ }
+ init_exp(var, VUPVAL, idx);
+ return VUPVAL;
+ }
+ }
+}
+
+
+static void singlevar (LexState *ls, expdesc *var) {
+ TString *varname = str_checkname(ls);
+ FuncState *fs = ls->fs;
+ if (singlevaraux(fs, varname, var, 1) == VVOID) { /* global name? */
+ expdesc key;
+ singlevaraux(fs, ls->envn, var, 1); /* get environment variable */
+ lua_assert(var->k == VLOCAL || var->k == VUPVAL);
+ codestring(ls, &key, varname); /* key is variable name */
+ luaK_indexed(fs, var, &key); /* env[varname] */
+ }
+}
+
+
+static void adjust_assign (LexState *ls, int nvars, int nexps, expdesc *e) {
+ FuncState *fs = ls->fs;
+ int extra = nvars - nexps;
+ if (hasmultret(e->k)) {
+ extra++; /* includes call itself */
+ if (extra < 0) extra = 0;
+ luaK_setreturns(fs, e, extra); /* last exp. provides the difference */
+ if (extra > 1) luaK_reserveregs(fs, extra-1);
+ }
+ else {
+ if (e->k != VVOID) luaK_exp2nextreg(fs, e); /* close last expression */
+ if (extra > 0) {
+ int reg = fs->freereg;
+ luaK_reserveregs(fs, extra);
+ luaK_nil(fs, reg, extra);
+ }
+ }
+}
+
+
+static void enterlevel (LexState *ls) {
+ lua_State *L = ls->L;
+ ++L->nCcalls;
+ checklimit(ls->fs, L->nCcalls, LUAI_MAXCCALLS, "C levels");
+}
+
+
+#define leavelevel(ls) ((ls)->L->nCcalls--)
+
+
+static void closegoto (LexState *ls, int g, Labeldesc *label) {
+ int i;
+ FuncState *fs = ls->fs;
+ Labellist *gl = &ls->dyd->gt;
+ Labeldesc *gt = &gl->arr[g];
+ lua_assert(luaS_eqstr(gt->name, label->name));
+ if (gt->nactvar < label->nactvar) {
+ TString *vname = getlocvar(fs, gt->nactvar)->varname;
+ const char *msg = luaO_pushfstring(ls->L,
+ "<goto %s> at line %d jumps into the scope of local " LUA_QS,
+ getstr(gt->name), gt->line, getstr(vname));
+ semerror(ls, msg);
+ }
+ luaK_patchlist(fs, gt->pc, label->pc);
+ /* remove goto from pending list */
+ for (i = g; i < gl->n - 1; i++)
+ gl->arr[i] = gl->arr[i + 1];
+ gl->n--;
+}
+
+
+/*
+** try to close a goto with existing labels; this solves backward jumps
+*/
+static int findlabel (LexState *ls, int g) {
+ int i;
+ BlockCnt *bl = ls->fs->bl;
+ Dyndata *dyd = ls->dyd;
+ Labeldesc *gt = &dyd->gt.arr[g];
+ /* check labels in current block for a match */
+ for (i = bl->firstlabel; i < dyd->label.n; i++) {
+ Labeldesc *lb = &dyd->label.arr[i];
+ if (luaS_eqstr(lb->name, gt->name)) { /* correct label? */
+ if (gt->nactvar > lb->nactvar &&
+ (bl->upval || dyd->label.n > bl->firstlabel))
+ luaK_patchclose(ls->fs, gt->pc, lb->nactvar);
+ closegoto(ls, g, lb); /* close it */
+ return 1;
+ }
+ }
+ return 0; /* label not found; cannot close goto */
+}
+
+
+static int newlabelentry (LexState *ls, Labellist *l, TString *name,
+ int line, int pc) {
+ int n = l->n;
+ luaM_growvector(ls->L, l->arr, n, l->size,
+ Labeldesc, SHRT_MAX, "labels/gotos");
+ l->arr[n].name = name;
+ l->arr[n].line = line;
+ l->arr[n].nactvar = ls->fs->nactvar;
+ l->arr[n].pc = pc;
+ l->n++;
+ return n;
+}
+
+
+/*
+** check whether new label 'lb' matches any pending gotos in current
+** block; solves forward jumps
+*/
+static void findgotos (LexState *ls, Labeldesc *lb) {
+ Labellist *gl = &ls->dyd->gt;
+ int i = ls->fs->bl->firstgoto;
+ while (i < gl->n) {
+ if (luaS_eqstr(gl->arr[i].name, lb->name))
+ closegoto(ls, i, lb);
+ else
+ i++;
+ }
+}
+
+
+/*
+** "export" pending gotos to outer level, to check them against
+** outer labels; if the block being exited has upvalues, and
+** the goto exits the scope of any variable (which can be the
+** upvalue), close those variables being exited.
+*/
+static void movegotosout (FuncState *fs, BlockCnt *bl) {
+ int i = bl->firstgoto;
+ Labellist *gl = &fs->ls->dyd->gt;
+  /* correct pending gotos to current block and try to close them
+     with visible labels */
+ while (i < gl->n) {
+ Labeldesc *gt = &gl->arr[i];
+ if (gt->nactvar > bl->nactvar) {
+ if (bl->upval)
+ luaK_patchclose(fs, gt->pc, bl->nactvar);
+ gt->nactvar = bl->nactvar;
+ }
+ if (!findlabel(fs->ls, i))
+ i++; /* move to next one */
+ }
+}
+
+
+static void enterblock (FuncState *fs, BlockCnt *bl, lu_byte isloop) {
+ bl->isloop = isloop;
+ bl->nactvar = fs->nactvar;
+ bl->firstlabel = fs->ls->dyd->label.n;
+ bl->firstgoto = fs->ls->dyd->gt.n;
+ bl->upval = 0;
+ bl->previous = fs->bl;
+ fs->bl = bl;
+ lua_assert(fs->freereg == fs->nactvar);
+}
+
+
+/*
+** create a label named "break" to resolve break statements
+*/
+static void breaklabel (LexState *ls) {
+ TString *n = luaS_new(ls->L, "break");
+ int l = newlabelentry(ls, &ls->dyd->label, n, 0, ls->fs->pc);
+ findgotos(ls, &ls->dyd->label.arr[l]);
+}
+
+/*
+** generates an error for an undefined 'goto'; choose appropriate
+** message when label name is a reserved word (which can only be 'break')
+*/
+static l_noret undefgoto (LexState *ls, Labeldesc *gt) {
+ const char *msg = isreserved(gt->name)
+ ? "<%s> at line %d not inside a loop"
+ : "no visible label " LUA_QS " for <goto> at line %d";
+ msg = luaO_pushfstring(ls->L, msg, getstr(gt->name), gt->line);
+ semerror(ls, msg);
+}
+
+
+static void leaveblock (FuncState *fs) {
+ BlockCnt *bl = fs->bl;
+ LexState *ls = fs->ls;
+ if (bl->previous && bl->upval) {
+ /* create a 'jump to here' to close upvalues */
+ int j = luaK_jump(fs);
+ luaK_patchclose(fs, j, bl->nactvar);
+ luaK_patchtohere(fs, j);
+ }
+ if (bl->isloop)
+ breaklabel(ls); /* close pending breaks */
+ fs->bl = bl->previous;
+ removevars(fs, bl->nactvar);
+ lua_assert(bl->nactvar == fs->nactvar);
+ fs->freereg = fs->nactvar; /* free registers */
+ ls->dyd->label.n = bl->firstlabel; /* remove local labels */
+ if (bl->previous) /* inner block? */
+ movegotosout(fs, bl); /* update pending gotos to outer block */
+ else if (bl->firstgoto < ls->dyd->gt.n) /* pending gotos in outer block? */
+ undefgoto(ls, &ls->dyd->gt.arr[bl->firstgoto]); /* error */
+}
+
+
+/*
+** adds a new prototype into list of prototypes
+*/
+static Proto *addprototype (LexState *ls) {
+ Proto *clp;
+ lua_State *L = ls->L;
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f; /* prototype of current function */
+ if (fs->np >= f->sizep) {
+ int oldsize = f->sizep;
+ luaM_growvector(L, f->p, fs->np, f->sizep, Proto *, MAXARG_Bx, "functions");
+ while (oldsize < f->sizep) f->p[oldsize++] = NULL;
+ }
+ f->p[fs->np++] = clp = luaF_newproto(L);
+ luaC_objbarrier(L, f, clp);
+ return clp;
+}
+
+
+/*
+** codes instruction to create new closure in parent function.
+** The OP_CLOSURE instruction must use the last available register,
+** so that, if it invokes the GC, the GC knows which registers
+** are in use at that time.
+*/
+static void codeclosure (LexState *ls, expdesc *v) {
+ FuncState *fs = ls->fs->prev;
+ init_exp(v, VRELOCABLE, luaK_codeABx(fs, OP_CLOSURE, 0, fs->np - 1));
+ luaK_exp2nextreg(fs, v); /* fix it at the last register */
+}
+
+
+static void open_func (LexState *ls, FuncState *fs, BlockCnt *bl) {
+ lua_State *L = ls->L;
+ Proto *f;
+ fs->prev = ls->fs; /* linked list of funcstates */
+ fs->ls = ls;
+ ls->fs = fs;
+ fs->pc = 0;
+ fs->lasttarget = 0;
+ fs->jpc = NO_JUMP;
+ fs->freereg = 0;
+ fs->nk = 0;
+ fs->np = 0;
+ fs->nups = 0;
+ fs->nlocvars = 0;
+ fs->nactvar = 0;
+ fs->firstlocal = ls->dyd->actvar.n;
+ fs->bl = NULL;
+ f = fs->f;
+ f->source = ls->source;
+ f->maxstacksize = 2; /* registers 0/1 are always valid */
+ fs->h = luaH_new(L);
+ /* anchor table of constants (to avoid being collected) */
+ sethvalue2s(L, L->top, fs->h);
+ incr_top(L);
+ enterblock(fs, bl, 0);
+}
+
+
+static void close_func (LexState *ls) {
+ lua_State *L = ls->L;
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f;
+ luaK_ret(fs, 0, 0); /* final return */
+ leaveblock(fs);
+ luaM_reallocvector(L, f->code, f->sizecode, fs->pc, Instruction);
+ f->sizecode = fs->pc;
+ luaM_reallocvector(L, f->lineinfo, f->sizelineinfo, fs->pc, int);
+ f->sizelineinfo = fs->pc;
+ luaM_reallocvector(L, f->k, f->sizek, fs->nk, TValue);
+ f->sizek = fs->nk;
+ luaM_reallocvector(L, f->p, f->sizep, fs->np, Proto *);
+ f->sizep = fs->np;
+ luaM_reallocvector(L, f->locvars, f->sizelocvars, fs->nlocvars, LocVar);
+ f->sizelocvars = fs->nlocvars;
+ luaM_reallocvector(L, f->upvalues, f->sizeupvalues, fs->nups, Upvaldesc);
+ f->sizeupvalues = fs->nups;
+ lua_assert(fs->bl == NULL);
+ ls->fs = fs->prev;
+ /* last token read was anchored in defunct function; must re-anchor it */
+ anchor_token(ls);
+ L->top--; /* pop table of constants */
+ luaC_checkGC(L);
+}
+
+
+
+/*============================================================*/
+/* GRAMMAR RULES */
+/*============================================================*/
+
+
+/*
+** check whether current token is in the follow set of a block.
+** 'until' closes syntactical blocks, but does not close scope,
+** so it is handled separately.
+*/
+static int block_follow (LexState *ls, int withuntil) {
+ switch (ls->t.token) {
+ case TK_ELSE: case TK_ELSEIF:
+ case TK_END: case TK_EOS:
+ return 1;
+ case TK_UNTIL: return withuntil;
+ default: return 0;
+ }
+}
+
+
+/*
+ * By inlining statlist() and test_then_block() we cut the native
+ * stack usage per nested C call from 272 bytes to 152, which keeps
+ * us within budget for 8K kernel stacks.
+ */
+__attribute__((always_inline)) inline
+static void statlist (LexState *ls) {
+ /* statlist -> { stat [`;'] } */
+ while (!block_follow(ls, 1)) {
+ if (ls->t.token == TK_RETURN) {
+ statement(ls);
+ return; /* 'return' must be last statement */
+ }
+ statement(ls);
+ }
+}
+
+
+static void fieldsel (LexState *ls, expdesc *v) {
+ /* fieldsel -> ['.' | ':'] NAME */
+ FuncState *fs = ls->fs;
+ expdesc key;
+ luaK_exp2anyregup(fs, v);
+ luaX_next(ls); /* skip the dot or colon */
+ checkname(ls, &key);
+ luaK_indexed(fs, v, &key);
+}
+
+
+static void yindex (LexState *ls, expdesc *v) {
+ /* index -> '[' expr ']' */
+ luaX_next(ls); /* skip the '[' */
+ expr(ls, v);
+ luaK_exp2val(ls->fs, v);
+ checknext(ls, ']');
+}
+
+
+/*
+** {======================================================================
+** Rules for Constructors
+** =======================================================================
+*/
+
+
+struct ConsControl {
+ expdesc v; /* last list item read */
+ expdesc *t; /* table descriptor */
+ int nh; /* total number of `record' elements */
+ int na; /* total number of array elements */
+ int tostore; /* number of array elements pending to be stored */
+};
+
+
+static void recfield (LexState *ls, struct ConsControl *cc) {
+ /* recfield -> (NAME | `['exp1`]') = exp1 */
+ FuncState *fs = ls->fs;
+ int reg = ls->fs->freereg;
+ expdesc key, val;
+ int rkkey;
+ if (ls->t.token == TK_NAME) {
+ checklimit(fs, cc->nh, MAX_INT, "items in a constructor");
+ checkname(ls, &key);
+ }
+ else /* ls->t.token == '[' */
+ yindex(ls, &key);
+ cc->nh++;
+ checknext(ls, '=');
+ rkkey = luaK_exp2RK(fs, &key);
+ expr(ls, &val);
+ luaK_codeABC(fs, OP_SETTABLE, cc->t->u.info, rkkey, luaK_exp2RK(fs, &val));
+ fs->freereg = reg; /* free registers */
+}
+
+
+static void closelistfield (FuncState *fs, struct ConsControl *cc) {
+ if (cc->v.k == VVOID) return; /* there is no list item */
+ luaK_exp2nextreg(fs, &cc->v);
+ cc->v.k = VVOID;
+ if (cc->tostore == LFIELDS_PER_FLUSH) {
+ luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); /* flush */
+ cc->tostore = 0; /* no more items pending */
+ }
+}
+
+
+static void lastlistfield (FuncState *fs, struct ConsControl *cc) {
+ if (cc->tostore == 0) return;
+ if (hasmultret(cc->v.k)) {
+ luaK_setmultret(fs, &cc->v);
+ luaK_setlist(fs, cc->t->u.info, cc->na, LUA_MULTRET);
+ cc->na--; /* do not count last expression (unknown number of elements) */
+ }
+ else {
+ if (cc->v.k != VVOID)
+ luaK_exp2nextreg(fs, &cc->v);
+ luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore);
+ }
+}
+
+
+static void listfield (LexState *ls, struct ConsControl *cc) {
+ /* listfield -> exp */
+ expr(ls, &cc->v);
+ checklimit(ls->fs, cc->na, MAX_INT, "items in a constructor");
+ cc->na++;
+ cc->tostore++;
+}
+
+
+static void field (LexState *ls, struct ConsControl *cc) {
+ /* field -> listfield | recfield */
+ switch(ls->t.token) {
+ case TK_NAME: { /* may be 'listfield' or 'recfield' */
+ if (luaX_lookahead(ls) != '=') /* expression? */
+ listfield(ls, cc);
+ else
+ recfield(ls, cc);
+ break;
+ }
+ case '[': {
+ recfield(ls, cc);
+ break;
+ }
+ default: {
+ listfield(ls, cc);
+ break;
+ }
+ }
+}
+
+
+static void constructor (LexState *ls, expdesc *t) {
+ /* constructor -> '{' [ field { sep field } [sep] ] '}'
+ sep -> ',' | ';' */
+ FuncState *fs = ls->fs;
+ int line = ls->linenumber;
+ int pc = luaK_codeABC(fs, OP_NEWTABLE, 0, 0, 0);
+ struct ConsControl cc;
+ cc.na = cc.nh = cc.tostore = 0;
+ cc.t = t;
+ init_exp(t, VRELOCABLE, pc);
+ init_exp(&cc.v, VVOID, 0); /* no value (yet) */
+ luaK_exp2nextreg(ls->fs, t); /* fix it at stack top */
+ checknext(ls, '{');
+ do {
+ lua_assert(cc.v.k == VVOID || cc.tostore > 0);
+ if (ls->t.token == '}') break;
+ closelistfield(fs, &cc);
+ field(ls, &cc);
+ } while (testnext(ls, ',') || testnext(ls, ';'));
+ check_match(ls, '}', '{', line);
+ lastlistfield(fs, &cc);
+ SETARG_B(fs->f->code[pc], luaO_int2fb(cc.na)); /* set initial array size */
+ SETARG_C(fs->f->code[pc], luaO_int2fb(cc.nh)); /* set initial table size */
+}
+
+/* }====================================================================== */
+
+
+
+static void parlist (LexState *ls) {
+ /* parlist -> [ param { `,' param } ] */
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f;
+ int nparams = 0;
+ f->is_vararg = 0;
+ if (ls->t.token != ')') { /* is `parlist' not empty? */
+ do {
+ switch (ls->t.token) {
+ case TK_NAME: { /* param -> NAME */
+ new_localvar(ls, str_checkname(ls));
+ nparams++;
+ break;
+ }
+ case TK_DOTS: { /* param -> `...' */
+ luaX_next(ls);
+ f->is_vararg = 1;
+ break;
+ }
+ default: luaX_syntaxerror(ls, "<name> or " LUA_QL("...") " expected");
+ }
+ } while (!f->is_vararg && testnext(ls, ','));
+ }
+ adjustlocalvars(ls, nparams);
+ f->numparams = cast_byte(fs->nactvar);
+ luaK_reserveregs(fs, fs->nactvar); /* reserve register for parameters */
+}
+
+
+static void body (LexState *ls, expdesc *e, int ismethod, int line) {
+ /* body -> `(' parlist `)' block END */
+ FuncState new_fs;
+ BlockCnt bl;
+ new_fs.f = addprototype(ls);
+ new_fs.f->linedefined = line;
+ open_func(ls, &new_fs, &bl);
+ checknext(ls, '(');
+ if (ismethod) {
+ new_localvarliteral(ls, "self"); /* create 'self' parameter */
+ adjustlocalvars(ls, 1);
+ }
+ parlist(ls);
+ checknext(ls, ')');
+ statlist(ls);
+ new_fs.f->lastlinedefined = ls->linenumber;
+ check_match(ls, TK_END, TK_FUNCTION, line);
+ codeclosure(ls, e);
+ close_func(ls);
+}
+
+
+static int explist (LexState *ls, expdesc *v) {
+ /* explist -> expr { `,' expr } */
+ int n = 1; /* at least one expression */
+ expr(ls, v);
+ while (testnext(ls, ',')) {
+ luaK_exp2nextreg(ls->fs, v);
+ expr(ls, v);
+ n++;
+ }
+ return n;
+}
+
+
+static void funcargs (LexState *ls, expdesc *f, int line) {
+ FuncState *fs = ls->fs;
+ expdesc args;
+ int base, nparams;
+ switch (ls->t.token) {
+ case '(': { /* funcargs -> `(' [ explist ] `)' */
+ luaX_next(ls);
+ if (ls->t.token == ')') /* arg list is empty? */
+ args.k = VVOID;
+ else {
+ explist(ls, &args);
+ luaK_setmultret(fs, &args);
+ }
+ check_match(ls, ')', '(', line);
+ break;
+ }
+ case '{': { /* funcargs -> constructor */
+ constructor(ls, &args);
+ break;
+ }
+ case TK_STRING: { /* funcargs -> STRING */
+ codestring(ls, &args, ls->t.seminfo.ts);
+ luaX_next(ls); /* must use `seminfo' before `next' */
+ break;
+ }
+ default: {
+ luaX_syntaxerror(ls, "function arguments expected");
+ }
+ }
+ lua_assert(f->k == VNONRELOC);
+ base = f->u.info; /* base register for call */
+ if (hasmultret(args.k))
+ nparams = LUA_MULTRET; /* open call */
+ else {
+ if (args.k != VVOID)
+ luaK_exp2nextreg(fs, &args); /* close last argument */
+ nparams = fs->freereg - (base+1);
+ }
+ init_exp(f, VCALL, luaK_codeABC(fs, OP_CALL, base, nparams+1, 2));
+ luaK_fixline(fs, line);
+  fs->freereg = base+1;  /* call removes function and arguments and leaves
+                            (unless changed) one result */
+}
+
+
+
+
+/*
+** {======================================================================
+** Expression parsing
+** =======================================================================
+*/
+
+
+static void primaryexp (LexState *ls, expdesc *v) {
+ /* primaryexp -> NAME | '(' expr ')' */
+ switch (ls->t.token) {
+ case '(': {
+ int line = ls->linenumber;
+ luaX_next(ls);
+ expr(ls, v);
+ check_match(ls, ')', '(', line);
+ luaK_dischargevars(ls->fs, v);
+ return;
+ }
+ case TK_NAME: {
+ singlevar(ls, v);
+ return;
+ }
+ default: {
+ luaX_syntaxerror(ls, "unexpected symbol");
+ }
+ }
+}
+
+
+static void suffixedexp (LexState *ls, expdesc *v) {
+ /* suffixedexp ->
+ primaryexp { '.' NAME | '[' exp ']' | ':' NAME funcargs | funcargs } */
+ FuncState *fs = ls->fs;
+ int line = ls->linenumber;
+ primaryexp(ls, v);
+ for (;;) {
+ switch (ls->t.token) {
+ case '.': { /* fieldsel */
+ fieldsel(ls, v);
+ break;
+ }
+ case '[': { /* `[' exp1 `]' */
+ expdesc key;
+ luaK_exp2anyregup(fs, v);
+ yindex(ls, &key);
+ luaK_indexed(fs, v, &key);
+ break;
+ }
+ case ':': { /* `:' NAME funcargs */
+ expdesc key;
+ luaX_next(ls);
+ checkname(ls, &key);
+ luaK_self(fs, v, &key);
+ funcargs(ls, v, line);
+ break;
+ }
+ case '(': case TK_STRING: case '{': { /* funcargs */
+ luaK_exp2nextreg(fs, v);
+ funcargs(ls, v, line);
+ break;
+ }
+ default: return;
+ }
+ }
+}
+
+
+static void simpleexp (LexState *ls, expdesc *v) {
+ /* simpleexp -> NUMBER | STRING | NIL | TRUE | FALSE | ... |
+ constructor | FUNCTION body | suffixedexp */
+ switch (ls->t.token) {
+ case TK_NUMBER: {
+ init_exp(v, VKNUM, 0);
+ v->u.nval = ls->t.seminfo.r;
+ break;
+ }
+ case TK_STRING: {
+ codestring(ls, v, ls->t.seminfo.ts);
+ break;
+ }
+ case TK_NIL: {
+ init_exp(v, VNIL, 0);
+ break;
+ }
+ case TK_TRUE: {
+ init_exp(v, VTRUE, 0);
+ break;
+ }
+ case TK_FALSE: {
+ init_exp(v, VFALSE, 0);
+ break;
+ }
+ case TK_DOTS: { /* vararg */
+ FuncState *fs = ls->fs;
+ check_condition(ls, fs->f->is_vararg,
+ "cannot use " LUA_QL("...") " outside a vararg function");
+ init_exp(v, VVARARG, luaK_codeABC(fs, OP_VARARG, 0, 1, 0));
+ break;
+ }
+ case '{': { /* constructor */
+ constructor(ls, v);
+ return;
+ }
+ case TK_FUNCTION: {
+ luaX_next(ls);
+ body(ls, v, 0, ls->linenumber);
+ return;
+ }
+ default: {
+ suffixedexp(ls, v);
+ return;
+ }
+ }
+ luaX_next(ls);
+}
+
+
+static UnOpr getunopr (int op) {
+ switch (op) {
+ case TK_NOT: return OPR_NOT;
+ case '-': return OPR_MINUS;
+ case '#': return OPR_LEN;
+ default: return OPR_NOUNOPR;
+ }
+}
+
+
+static BinOpr getbinopr (int op) {
+ switch (op) {
+ case '+': return OPR_ADD;
+ case '-': return OPR_SUB;
+ case '*': return OPR_MUL;
+ case '/': return OPR_DIV;
+ case '%': return OPR_MOD;
+ case '^': return OPR_POW;
+ case TK_CONCAT: return OPR_CONCAT;
+ case TK_NE: return OPR_NE;
+ case TK_EQ: return OPR_EQ;
+ case '<': return OPR_LT;
+ case TK_LE: return OPR_LE;
+ case '>': return OPR_GT;
+ case TK_GE: return OPR_GE;
+ case TK_AND: return OPR_AND;
+ case TK_OR: return OPR_OR;
+ default: return OPR_NOBINOPR;
+ }
+}
+
+
+static const struct {
+ lu_byte left; /* left priority for each binary operator */
+ lu_byte right; /* right priority */
+} priority[] = { /* ORDER OPR */
+ {6, 6}, {6, 6}, {7, 7}, {7, 7}, {7, 7}, /* `+' `-' `*' `/' `%' */
+ {10, 9}, {5, 4}, /* ^, .. (right associative) */
+ {3, 3}, {3, 3}, {3, 3}, /* ==, <, <= */
+ {3, 3}, {3, 3}, {3, 3}, /* ~=, >, >= */
+ {2, 2}, {1, 1} /* and, or */
+};
+
+#define UNARY_PRIORITY 8 /* priority for unary operators */
+
+
+/*
+** subexpr -> (simpleexp | unop subexpr) { binop subexpr }
+** where `binop' is any binary operator with a priority higher than `limit'
+*/
+static BinOpr subexpr (LexState *ls, expdesc *v, int limit) {
+ BinOpr op;
+ UnOpr uop;
+ enterlevel(ls);
+ uop = getunopr(ls->t.token);
+ if (uop != OPR_NOUNOPR) {
+ int line = ls->linenumber;
+ luaX_next(ls);
+ subexpr(ls, v, UNARY_PRIORITY);
+ luaK_prefix(ls->fs, uop, v, line);
+ }
+ else simpleexp(ls, v);
+ /* expand while operators have priorities higher than `limit' */
+ op = getbinopr(ls->t.token);
+ while (op != OPR_NOBINOPR && priority[op].left > limit) {
+ expdesc v2;
+ BinOpr nextop;
+ int line = ls->linenumber;
+ luaX_next(ls);
+ luaK_infix(ls->fs, op, v);
+ /* read sub-expression with higher priority */
+ nextop = subexpr(ls, &v2, priority[op].right);
+ luaK_posfix(ls->fs, op, v, &v2, line);
+ op = nextop;
+ }
+ leavelevel(ls);
+ return op; /* return first untreated operator */
+}
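+
+
+/*
+** Illustrative walk-through (not part of upstream Lua): for the input
+** `1 + 2 ^ 3 ^ 2 .. s', the outer subexpr() call (limit 0) reads `1',
+** sees `+' (left priority 6 > 0) and recurses with limit 6, the right
+** priority of `+'. Inside, `^' (left 10 > 6) recurses with limit 9;
+** the second `^' still satisfies 10 > 9, so exponentiation groups to
+** the right as 2 ^ (3 ^ 2). `..' (left 5) is not > 6, so it is returned
+** untreated and handled by the outer call, giving (1 + (2 ^ (3 ^ 2))) .. s.
+*/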
+
+
+static void expr (LexState *ls, expdesc *v) {
+ subexpr(ls, v, 0);
+}
+
+/* }==================================================================== */
+
+
+
+/*
+** {======================================================================
+** Rules for Statements
+** =======================================================================
+*/
+
+
+static void block (LexState *ls) {
+ /* block -> statlist */
+ FuncState *fs = ls->fs;
+ BlockCnt bl;
+ enterblock(fs, &bl, 0);
+ statlist(ls);
+ leaveblock(fs);
+}
+
+
+/*
+** structure to chain all variables in the left-hand side of an
+** assignment
+*/
+struct LHS_assign {
+ struct LHS_assign *prev;
+ expdesc v; /* variable (global, local, upvalue, or indexed) */
+};
+
+
+/*
+** check whether, in an assignment to an upvalue/local variable, the
+** upvalue/local variable is being used in a previous assignment to a
+** table. If so, save original upvalue/local value in a safe place and
+** use this safe copy in the previous assignment.
+*/
+static void check_conflict (LexState *ls, struct LHS_assign *lh, expdesc *v) {
+ FuncState *fs = ls->fs;
+ int extra = fs->freereg; /* eventual position to save local variable */
+ int conflict = 0;
+ for (; lh; lh = lh->prev) { /* check all previous assignments */
+ if (lh->v.k == VINDEXED) { /* assigning to a table? */
+ /* table is the upvalue/local being assigned now? */
+ if (lh->v.u.ind.vt == v->k && lh->v.u.ind.t == v->u.info) {
+ conflict = 1;
+ lh->v.u.ind.vt = VLOCAL;
+ lh->v.u.ind.t = extra; /* previous assignment will use safe copy */
+ }
+ /* index is the local being assigned? (index cannot be upvalue) */
+ if (v->k == VLOCAL && lh->v.u.ind.idx == v->u.info) {
+ conflict = 1;
+ lh->v.u.ind.idx = extra; /* previous assignment will use safe copy */
+ }
+ }
+ }
+ if (conflict) {
+ /* copy upvalue/local value to a temporary (in position 'extra') */
+ OpCode op = (v->k == VLOCAL) ? OP_MOVE : OP_GETUPVAL;
+ luaK_codeABC(fs, op, extra, v->u.info, 0);
+ luaK_reserveregs(fs, 1);
+ }
+}
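+
+
+/*
+** Illustrative note (not part of upstream Lua): in the multiple
+** assignment `a[i], i = 10, 20' the first target indexes the local `i'
+** that the second target overwrites. While parsing the second target,
+** check_conflict() sees that an earlier VINDEXED entry uses `i' as its
+** index, emits OP_MOVE to copy the current value of `i' into a fresh
+** register, and redirects the earlier assignment to that copy, so
+** a[<old i>] receives 10 even though `i' ends up as 20.
+*/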
+
+
+static void assignment (LexState *ls, struct LHS_assign *lh, int nvars) {
+ expdesc e;
+ check_condition(ls, vkisvar(lh->v.k), "syntax error");
+ if (testnext(ls, ',')) { /* assignment -> ',' suffixedexp assignment */
+ struct LHS_assign nv;
+ nv.prev = lh;
+ suffixedexp(ls, &nv.v);
+ if (nv.v.k != VINDEXED)
+ check_conflict(ls, lh, &nv.v);
+ checklimit(ls->fs, nvars + ls->L->nCcalls, LUAI_MAXCCALLS,
+ "C levels");
+ assignment(ls, &nv, nvars+1);
+ }
+ else { /* assignment -> `=' explist */
+ int nexps;
+ checknext(ls, '=');
+ nexps = explist(ls, &e);
+ if (nexps != nvars) {
+ adjust_assign(ls, nvars, nexps, &e);
+ if (nexps > nvars)
+ ls->fs->freereg -= nexps - nvars; /* remove extra values */
+ }
+ else {
+ luaK_setoneret(ls->fs, &e); /* close last expression */
+ luaK_storevar(ls->fs, &lh->v, &e);
+ return; /* avoid default */
+ }
+ }
+ init_exp(&e, VNONRELOC, ls->fs->freereg-1); /* default assignment */
+ luaK_storevar(ls->fs, &lh->v, &e);
+}
+
+
+static int cond (LexState *ls) {
+ /* cond -> exp */
+ expdesc v;
+ expr(ls, &v); /* read condition */
+ if (v.k == VNIL) v.k = VFALSE; /* `falses' are all equal here */
+ luaK_goiftrue(ls->fs, &v);
+ return v.f;
+}
+
+
+static void gotostat (LexState *ls, int pc) {
+ int line = ls->linenumber;
+ TString *label;
+ int g;
+ if (testnext(ls, TK_GOTO))
+ label = str_checkname(ls);
+ else {
+ luaX_next(ls); /* skip break */
+ label = luaS_new(ls->L, "break");
+ }
+ g = newlabelentry(ls, &ls->dyd->gt, label, line, pc);
+ findlabel(ls, g); /* close it if label already defined */
+}
+
+
+/* check for repeated labels on the same block */
+static void checkrepeated (FuncState *fs, Labellist *ll, TString *label) {
+ int i;
+ for (i = fs->bl->firstlabel; i < ll->n; i++) {
+ if (luaS_eqstr(label, ll->arr[i].name)) {
+ const char *msg = luaO_pushfstring(fs->ls->L,
+ "label " LUA_QS " already defined on line %d",
+ getstr(label), ll->arr[i].line);
+ semerror(fs->ls, msg);
+ }
+ }
+}
+
+
+/* skip no-op statements */
+static void skipnoopstat (LexState *ls) {
+ while (ls->t.token == ';' || ls->t.token == TK_DBCOLON)
+ statement(ls);
+}
+
+
+static void labelstat (LexState *ls, TString *label, int line) {
+ /* label -> '::' NAME '::' */
+ FuncState *fs = ls->fs;
+ Labellist *ll = &ls->dyd->label;
+ int l; /* index of new label being created */
+ checkrepeated(fs, ll, label); /* check for repeated labels */
+ checknext(ls, TK_DBCOLON); /* skip double colon */
+ /* create new entry for this label */
+ l = newlabelentry(ls, ll, label, line, fs->pc);
+ skipnoopstat(ls); /* skip other no-op statements */
+ if (block_follow(ls, 0)) { /* label is last no-op statement in the block? */
+ /* assume that locals are already out of scope */
+ ll->arr[l].nactvar = fs->bl->nactvar;
+ }
+ findgotos(ls, &ll->arr[l]);
+}
+
+
+static void whilestat (LexState *ls, int line) {
+ /* whilestat -> WHILE cond DO block END */
+ FuncState *fs = ls->fs;
+ int whileinit;
+ int condexit;
+ BlockCnt bl;
+ luaX_next(ls); /* skip WHILE */
+ whileinit = luaK_getlabel(fs);
+ condexit = cond(ls);
+ enterblock(fs, &bl, 1);
+ checknext(ls, TK_DO);
+ block(ls);
+ luaK_jumpto(fs, whileinit);
+ check_match(ls, TK_END, TK_WHILE, line);
+ leaveblock(fs);
+ luaK_patchtohere(fs, condexit); /* false conditions finish the loop */
+}
+
+
+static void repeatstat (LexState *ls, int line) {
+ /* repeatstat -> REPEAT block UNTIL cond */
+ int condexit;
+ FuncState *fs = ls->fs;
+ int repeat_init = luaK_getlabel(fs);
+ BlockCnt bl1, bl2;
+ enterblock(fs, &bl1, 1); /* loop block */
+ enterblock(fs, &bl2, 0); /* scope block */
+ luaX_next(ls); /* skip REPEAT */
+ statlist(ls);
+ check_match(ls, TK_UNTIL, TK_REPEAT, line);
+ condexit = cond(ls); /* read condition (inside scope block) */
+ if (bl2.upval) /* upvalues? */
+ luaK_patchclose(fs, condexit, bl2.nactvar);
+ leaveblock(fs); /* finish scope */
+ luaK_patchlist(fs, condexit, repeat_init); /* close the loop */
+ leaveblock(fs); /* finish loop */
+}
+
+
+static int exp1 (LexState *ls) {
+ expdesc e;
+ int reg;
+ expr(ls, &e);
+ luaK_exp2nextreg(ls->fs, &e);
+ lua_assert(e.k == VNONRELOC);
+ reg = e.u.info;
+ return reg;
+}
+
+
+static void forbody (LexState *ls, int base, int line, int nvars, int isnum) {
+ /* forbody -> DO block */
+ BlockCnt bl;
+ FuncState *fs = ls->fs;
+ int prep, endfor;
+ adjustlocalvars(ls, 3); /* control variables */
+ checknext(ls, TK_DO);
+ prep = isnum ? luaK_codeAsBx(fs, OP_FORPREP, base, NO_JUMP) : luaK_jump(fs);
+ enterblock(fs, &bl, 0); /* scope for declared variables */
+ adjustlocalvars(ls, nvars);
+ luaK_reserveregs(fs, nvars);
+ block(ls);
+ leaveblock(fs); /* end of scope for declared variables */
+ luaK_patchtohere(fs, prep);
+ if (isnum) /* numeric for? */
+ endfor = luaK_codeAsBx(fs, OP_FORLOOP, base, NO_JUMP);
+ else { /* generic for */
+ luaK_codeABC(fs, OP_TFORCALL, base, 0, nvars);
+ luaK_fixline(fs, line);
+ endfor = luaK_codeAsBx(fs, OP_TFORLOOP, base + 2, NO_JUMP);
+ }
+ luaK_patchlist(fs, endfor, prep + 1);
+ luaK_fixline(fs, line);
+}
+
+
+static void fornum (LexState *ls, TString *varname, int line) {
+ /* fornum -> NAME = exp1,exp1[,exp1] forbody */
+ FuncState *fs = ls->fs;
+ int base = fs->freereg;
+ new_localvarliteral(ls, "(for index)");
+ new_localvarliteral(ls, "(for limit)");
+ new_localvarliteral(ls, "(for step)");
+ new_localvar(ls, varname);
+ checknext(ls, '=');
+ exp1(ls); /* initial value */
+ checknext(ls, ',');
+ exp1(ls); /* limit */
+ if (testnext(ls, ','))
+ exp1(ls); /* optional step */
+ else { /* default step = 1 */
+ luaK_codek(fs, fs->freereg, luaK_numberK(fs, 1));
+ luaK_reserveregs(fs, 1);
+ }
+ forbody(ls, base, line, 1, 1);
+}
+
+
+static void forlist (LexState *ls, TString *indexname) {
+ /* forlist -> NAME {,NAME} IN explist forbody */
+ FuncState *fs = ls->fs;
+ expdesc e;
+ int nvars = 4; /* gen, state, control, plus at least one declared var */
+ int line;
+ int base = fs->freereg;
+ /* create control variables */
+ new_localvarliteral(ls, "(for generator)");
+ new_localvarliteral(ls, "(for state)");
+ new_localvarliteral(ls, "(for control)");
+ /* create declared variables */
+ new_localvar(ls, indexname);
+ while (testnext(ls, ',')) {
+ new_localvar(ls, str_checkname(ls));
+ nvars++;
+ }
+ checknext(ls, TK_IN);
+ line = ls->linenumber;
+ adjust_assign(ls, 3, explist(ls, &e), &e);
+ luaK_checkstack(fs, 3); /* extra space to call generator */
+ forbody(ls, base, line, nvars - 3, 0);
+}
+
+
+static void forstat (LexState *ls, int line) {
+ /* forstat -> FOR (fornum | forlist) END */
+ FuncState *fs = ls->fs;
+ TString *varname;
+ BlockCnt bl;
+ enterblock(fs, &bl, 1); /* scope for loop and control variables */
+ luaX_next(ls); /* skip `for' */
+ varname = str_checkname(ls); /* first variable name */
+ switch (ls->t.token) {
+ case '=': fornum(ls, varname, line); break;
+ case ',': case TK_IN: forlist(ls, varname); break;
+ default: luaX_syntaxerror(ls, LUA_QL("=") " or " LUA_QL("in") " expected");
+ }
+ check_match(ls, TK_END, TK_FOR, line);
+ leaveblock(fs); /* loop scope (`break' jumps to this point) */
+}
+
+
+__attribute__((always_inline)) inline
+static void test_then_block (LexState *ls, int *escapelist) {
+ /* test_then_block -> [IF | ELSEIF] cond THEN block */
+ BlockCnt bl;
+ FuncState *fs = ls->fs;
+ expdesc v;
+ int jf; /* instruction to skip 'then' code (if condition is false) */
+ luaX_next(ls); /* skip IF or ELSEIF */
+ expr(ls, &v); /* read condition */
+ checknext(ls, TK_THEN);
+ if (ls->t.token == TK_GOTO || ls->t.token == TK_BREAK) {
+ luaK_goiffalse(ls->fs, &v); /* will jump to label if condition is true */
+ enterblock(fs, &bl, 0); /* must enter block before 'goto' */
+ gotostat(ls, v.t); /* handle goto/break */
+ skipnoopstat(ls); /* skip other no-op statements */
+ if (block_follow(ls, 0)) { /* 'goto' is the entire block? */
+ leaveblock(fs);
+ return; /* and that is it */
+ }
+ else /* must skip over 'then' part if condition is false */
+ jf = luaK_jump(fs);
+ }
+ else { /* regular case (not goto/break) */
+ luaK_goiftrue(ls->fs, &v); /* skip over block if condition is false */
+ enterblock(fs, &bl, 0);
+ jf = v.f;
+ }
+ statlist(ls); /* `then' part */
+ leaveblock(fs);
+ if (ls->t.token == TK_ELSE ||
+ ls->t.token == TK_ELSEIF) /* followed by 'else'/'elseif'? */
+ luaK_concat(fs, escapelist, luaK_jump(fs)); /* must jump over it */
+ luaK_patchtohere(fs, jf);
+}
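+
+
+/*
+** Illustrative note (not part of upstream Lua): for the common idiom
+** `if a > b then break end' inside a loop, the branch above reuses the
+** jump taken when the condition is true as the break jump itself
+** (gotostat() receives v.t), instead of emitting a skip-over-block test
+** followed by a separate unconditional jump.
+*/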
+
+
+static void ifstat (LexState *ls, int line) {
+ /* ifstat -> IF cond THEN block {ELSEIF cond THEN block} [ELSE block] END */
+ FuncState *fs = ls->fs;
+ int escapelist = NO_JUMP; /* exit list for finished parts */
+ test_then_block(ls, &escapelist); /* IF cond THEN block */
+ while (ls->t.token == TK_ELSEIF)
+ test_then_block(ls, &escapelist); /* ELSEIF cond THEN block */
+ if (testnext(ls, TK_ELSE))
+ block(ls); /* `else' part */
+ check_match(ls, TK_END, TK_IF, line);
+ luaK_patchtohere(fs, escapelist); /* patch escape list to 'if' end */
+}
+
+
+static void localfunc (LexState *ls) {
+ expdesc b;
+ FuncState *fs = ls->fs;
+ new_localvar(ls, str_checkname(ls)); /* new local variable */
+ adjustlocalvars(ls, 1); /* enter its scope */
+ body(ls, &b, 0, ls->linenumber); /* function created in next register */
+ /* debug information will only see the variable after this point! */
+ getlocvar(fs, b.u.info)->startpc = fs->pc;
+}
+
+
+static void localstat (LexState *ls) {
+ /* stat -> LOCAL NAME {`,' NAME} [`=' explist] */
+ int nvars = 0;
+ int nexps;
+ expdesc e;
+ do {
+ new_localvar(ls, str_checkname(ls));
+ nvars++;
+ } while (testnext(ls, ','));
+ if (testnext(ls, '='))
+ nexps = explist(ls, &e);
+ else {
+ e.k = VVOID;
+ nexps = 0;
+ }
+ adjust_assign(ls, nvars, nexps, &e);
+ adjustlocalvars(ls, nvars);
+}
+
+
+static int funcname (LexState *ls, expdesc *v) {
+ /* funcname -> NAME {fieldsel} [`:' NAME] */
+ int ismethod = 0;
+ singlevar(ls, v);
+ while (ls->t.token == '.')
+ fieldsel(ls, v);
+ if (ls->t.token == ':') {
+ ismethod = 1;
+ fieldsel(ls, v);
+ }
+ return ismethod;
+}
+
+
+static void funcstat (LexState *ls, int line) {
+ /* funcstat -> FUNCTION funcname body */
+ int ismethod;
+ expdesc v, b;
+ luaX_next(ls); /* skip FUNCTION */
+ ismethod = funcname(ls, &v);
+ body(ls, &b, ismethod, line);
+ luaK_storevar(ls->fs, &v, &b);
+ luaK_fixline(ls->fs, line); /* definition `happens' in the first line */
+}
+
+
+static void exprstat (LexState *ls) {
+ /* stat -> func | assignment */
+ FuncState *fs = ls->fs;
+ struct LHS_assign v;
+ suffixedexp(ls, &v.v);
+ if (ls->t.token == '=' || ls->t.token == ',') { /* stat -> assignment ? */
+ v.prev = NULL;
+ assignment(ls, &v, 1);
+ }
+ else { /* stat -> func */
+ check_condition(ls, v.v.k == VCALL, "syntax error");
+ SETARG_C(getcode(fs, &v.v), 1); /* call statement uses no results */
+ }
+}
+
+
+static void retstat (LexState *ls) {
+ /* stat -> RETURN [explist] [';'] */
+ FuncState *fs = ls->fs;
+ expdesc e;
+ int first, nret; /* registers with returned values */
+ if (block_follow(ls, 1) || ls->t.token == ';')
+ first = nret = 0; /* return no values */
+ else {
+ nret = explist(ls, &e); /* optional return values */
+ if (hasmultret(e.k)) {
+ luaK_setmultret(fs, &e);
+ if (e.k == VCALL && nret == 1) { /* tail call? */
+ SET_OPCODE(getcode(fs,&e), OP_TAILCALL);
+ lua_assert(GETARG_A(getcode(fs,&e)) == fs->nactvar);
+ }
+ first = fs->nactvar;
+ nret = LUA_MULTRET; /* return all values */
+ }
+ else {
+ if (nret == 1) /* only one single value? */
+ first = luaK_exp2anyreg(fs, &e);
+ else {
+ luaK_exp2nextreg(fs, &e); /* values must go to the `stack' */
+ first = fs->nactvar; /* return all `active' values */
+ lua_assert(nret == fs->freereg - first);
+ }
+ }
+ }
+ luaK_ret(fs, first, nret);
+ (void) testnext(ls, ';'); /* skip optional semicolon */
+}
+
+
+static void statement (LexState *ls) {
+ int line = ls->linenumber; /* may be needed for error messages */
+ enterlevel(ls);
+ switch (ls->t.token) {
+ case ';': { /* stat -> ';' (empty statement) */
+ luaX_next(ls); /* skip ';' */
+ break;
+ }
+ case TK_IF: { /* stat -> ifstat */
+ ifstat(ls, line);
+ break;
+ }
+ case TK_WHILE: { /* stat -> whilestat */
+ whilestat(ls, line);
+ break;
+ }
+ case TK_DO: { /* stat -> DO block END */
+ luaX_next(ls); /* skip DO */
+ block(ls);
+ check_match(ls, TK_END, TK_DO, line);
+ break;
+ }
+ case TK_FOR: { /* stat -> forstat */
+ forstat(ls, line);
+ break;
+ }
+ case TK_REPEAT: { /* stat -> repeatstat */
+ repeatstat(ls, line);
+ break;
+ }
+ case TK_FUNCTION: { /* stat -> funcstat */
+ funcstat(ls, line);
+ break;
+ }
+ case TK_LOCAL: { /* stat -> localstat */
+ luaX_next(ls); /* skip LOCAL */
+ if (testnext(ls, TK_FUNCTION)) /* local function? */
+ localfunc(ls);
+ else
+ localstat(ls);
+ break;
+ }
+ case TK_DBCOLON: { /* stat -> label */
+ luaX_next(ls); /* skip double colon */
+ labelstat(ls, str_checkname(ls), line);
+ break;
+ }
+ case TK_RETURN: { /* stat -> retstat */
+ luaX_next(ls); /* skip RETURN */
+ retstat(ls);
+ break;
+ }
+ case TK_BREAK: /* stat -> breakstat */
+ case TK_GOTO: { /* stat -> 'goto' NAME */
+ gotostat(ls, luaK_jump(ls->fs));
+ break;
+ }
+ default: { /* stat -> func | assignment */
+ exprstat(ls);
+ break;
+ }
+ }
+ lua_assert(ls->fs->f->maxstacksize >= ls->fs->freereg &&
+ ls->fs->freereg >= ls->fs->nactvar);
+ ls->fs->freereg = ls->fs->nactvar; /* free registers */
+ leavelevel(ls);
+}
+
+/* }====================================================================== */
+
+
+/*
+** compiles the main function, which is a regular vararg function with an
+** upvalue named LUA_ENV
+*/
+static void mainfunc (LexState *ls, FuncState *fs) {
+ BlockCnt bl;
+ expdesc v;
+ open_func(ls, fs, &bl);
+ fs->f->is_vararg = 1; /* main function is always vararg */
+ init_exp(&v, VLOCAL, 0); /* create and... */
+ newupvalue(fs, ls->envn, &v); /* ...set environment upvalue */
+ luaX_next(ls); /* read first token */
+ statlist(ls); /* parse main body */
+ check(ls, TK_EOS);
+ close_func(ls);
+}
+
+
+Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
+ Dyndata *dyd, const char *name, int firstchar) {
+ LexState lexstate;
+ FuncState funcstate;
+ Closure *cl = luaF_newLclosure(L, 1); /* create main closure */
+ /* anchor closure (to avoid being collected) */
+ setclLvalue(L, L->top, cl);
+ incr_top(L);
+ funcstate.f = cl->l.p = luaF_newproto(L);
+ funcstate.f->source = luaS_new(L, name); /* create and anchor TString */
+ lexstate.buff = buff;
+ lexstate.dyd = dyd;
+ dyd->actvar.n = dyd->gt.n = dyd->label.n = 0;
+ luaX_setinput(L, &lexstate, z, funcstate.f->source, firstchar);
+ mainfunc(&lexstate, &funcstate);
+ lua_assert(!funcstate.prev && funcstate.nups == 1 && !lexstate.fs);
+ /* all scopes should be correctly finished */
+ lua_assert(dyd->actvar.n == 0 && dyd->gt.n == 0 && dyd->label.n == 0);
+ return cl; /* it's on the stack too */
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lparser.h b/sys/contrib/openzfs/module/lua/lparser.h
new file mode 100644
index 000000000000..8aea0523f3e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lparser.h
@@ -0,0 +1,121 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lparser.h,v 1.70.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua Parser
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lparser_h
+#define lparser_h
+
+#include "llimits.h"
+#include "lobject.h"
+#include "lzio.h"
+
+
+/*
+** Expression descriptor
+*/
+
+typedef enum {
+ VVOID, /* no value */
+ VNIL,
+ VTRUE,
+ VFALSE,
+ VK, /* info = index of constant in `k' */
+ VKNUM, /* nval = numerical value */
+ VNONRELOC, /* info = result register */
+ VLOCAL, /* info = local register */
+ VUPVAL, /* info = index of upvalue in 'upvalues' */
+ VINDEXED, /* t = table register/upvalue; idx = index R/K */
+ VJMP, /* info = instruction pc */
+ VRELOCABLE, /* info = instruction pc */
+ VCALL, /* info = instruction pc */
+ VVARARG /* info = instruction pc */
+} expkind;
+
+
+#define vkisvar(k) (VLOCAL <= (k) && (k) <= VINDEXED)
+#define vkisinreg(k) ((k) == VNONRELOC || (k) == VLOCAL)
+
+typedef struct expdesc {
+ expkind k;
+ union {
+ struct { /* for indexed variables (VINDEXED) */
+ short idx; /* index (R/K) */
+ lu_byte t; /* table (register or upvalue) */
+ lu_byte vt; /* whether 't' is register (VLOCAL) or upvalue (VUPVAL) */
+ } ind;
+ int info; /* for generic use */
+ lua_Number nval; /* for VKNUM */
+ } u;
+ int t; /* patch list of `exit when true' */
+ int f; /* patch list of `exit when false' */
+} expdesc;
+
+
+/* description of active local variable */
+typedef struct Vardesc {
+ short idx; /* variable index in stack */
+} Vardesc;
+
+
+/* description of pending goto statements and label statements */
+typedef struct Labeldesc {
+ TString *name; /* label identifier */
+ int pc; /* position in code */
+ int line; /* line where it appeared */
+ lu_byte nactvar; /* local level where it appears in current block */
+} Labeldesc;
+
+
+/* list of labels or gotos */
+typedef struct Labellist {
+ Labeldesc *arr; /* array */
+ int n; /* number of entries in use */
+ int size; /* array size */
+} Labellist;
+
+
+/* dynamic structures used by the parser */
+typedef struct Dyndata {
+ struct { /* list of active local variables */
+ Vardesc *arr;
+ int n;
+ int size;
+ } actvar;
+ Labellist gt; /* list of pending gotos */
+ Labellist label; /* list of active labels */
+} Dyndata;
+
+
+/* control of blocks */
+struct BlockCnt; /* defined in lparser.c */
+
+
+/* state needed to generate code for a given function */
+typedef struct FuncState {
+ Proto *f; /* current function header */
+ Table *h; /* table to find (and reuse) elements in `k' */
+ struct FuncState *prev; /* enclosing function */
+ struct LexState *ls; /* lexical state */
+ struct BlockCnt *bl; /* chain of current blocks */
+ int pc; /* next position to code (equivalent to `ncode') */
+ int lasttarget; /* 'label' of last 'jump label' */
+ int jpc; /* list of pending jumps to `pc' */
+ int nk; /* number of elements in `k' */
+ int np; /* number of elements in `p' */
+ int firstlocal; /* index of first local var (in Dyndata array) */
+ short nlocvars; /* number of elements in 'f->locvars' */
+ lu_byte nactvar; /* number of active local variables */
+ lu_byte nups; /* number of upvalues */
+ lu_byte freereg; /* first free register */
+} FuncState;
+
+
+LUAI_FUNC Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
+ Dyndata *dyd, const char *name, int firstchar);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lstate.c b/sys/contrib/openzfs/module/lua/lstate.c
new file mode 100644
index 000000000000..4d196eced6a3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lstate.c
@@ -0,0 +1,320 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstate.c,v 2.99.1.2 2013/11/08 17:45:31 roberto Exp $
+** Global State
+** See Copyright Notice in lua.h
+*/
+
+
+#define lstate_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+#if !defined(LUAI_GCPAUSE)
+#define LUAI_GCPAUSE 200 /* 200% */
+#endif
+
+#if !defined(LUAI_GCMAJOR)
+#define LUAI_GCMAJOR 200 /* 200% */
+#endif
+
+#if !defined(LUAI_GCMUL)
+#define LUAI_GCMUL 200 /* GC runs 'twice the speed' of memory allocation */
+#endif
+
+
+#define MEMERRMSG "not enough memory"
+
+
+/*
+** a macro to help the creation of a unique random seed when a state is
+** created; the seed is used to randomize hashes.
+*/
+#if !defined(luai_makeseed)
+#define luai_makeseed() cast(unsigned int, gethrtime())
+#endif
+
+
+
+/*
+** thread state + extra space
+*/
+typedef struct LX {
+#if defined(LUAI_EXTRASPACE)
+ char buff[LUAI_EXTRASPACE];
+#endif
+ lua_State l;
+} LX;
+
+
+/*
+** Main thread combines a thread state and the global state
+*/
+typedef struct LG {
+ LX l;
+ global_State g;
+} LG;
+
+
+
+#define fromstate(L) (cast(LX *, cast(lu_byte *, (L)) - offsetof(LX, l)))
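+
+
+/*
+** Illustrative note (not part of upstream Lua): fromstate() recovers the
+** enclosing LX (and, for the main thread, the LG block) from a lua_State
+** pointer, so each thread lives in a single allocation: lua_newthread()
+** allocates sizeof(LX) with the lua_State at offsetof(LX, l), and
+** close_state() frees the whole sizeof(LG) main block in one call.
+*/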
+
+
+/*
+** Compute an initial seed as random as possible. In ANSI, rely on
+** Address Space Layout Randomization (if present) to increase
+** randomness.
+*/
+#define addbuff(b,p,e) \
+ { size_t t = cast(size_t, e); \
+ memcpy(buff + p, &t, sizeof(t)); p += sizeof(t); }
+
+static unsigned int makeseed (lua_State *L) {
+ char buff[4 * sizeof(size_t)];
+ unsigned int h = luai_makeseed();
+ int p = 0;
+ addbuff(buff, p, L); /* heap variable */
+ addbuff(buff, p, &h); /* local variable */
+ addbuff(buff, p, luaO_nilobject); /* global variable */
+ addbuff(buff, p, &lua_newstate); /* public function */
+ lua_assert(p == sizeof(buff));
+ return luaS_hash(buff, p, h);
+}
+
+
+/*
+** set GCdebt to a new value keeping the value (totalbytes + GCdebt)
+** invariant
+*/
+void luaE_setdebt (global_State *g, l_mem debt) {
+ g->totalbytes -= (debt - g->GCdebt);
+ g->GCdebt = debt;
+}
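+
+
+/*
+** Illustrative note (not part of upstream Lua): gettotalbytes(g) is
+** defined as totalbytes + GCdebt, and luaE_setdebt() keeps that sum
+** constant. With totalbytes == 1000 and GCdebt == -200 (800 bytes
+** really allocated), luaE_setdebt(g, 0) leaves totalbytes == 800 and
+** GCdebt == 0; the real allocation count is unchanged.
+*/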
+
+
+CallInfo *luaE_extendCI (lua_State *L) {
+ CallInfo *ci = luaM_new(L, CallInfo);
+ lua_assert(L->ci->next == NULL);
+ L->ci->next = ci;
+ ci->previous = L->ci;
+ ci->next = NULL;
+ return ci;
+}
+
+
+void luaE_freeCI (lua_State *L) {
+ CallInfo *ci = L->ci;
+ CallInfo *next = ci->next;
+ ci->next = NULL;
+ while ((ci = next) != NULL) {
+ next = ci->next;
+ luaM_free(L, ci);
+ }
+}
+
+
+static void stack_init (lua_State *L1, lua_State *L) {
+ int i; CallInfo *ci;
+ /* initialize stack array */
+ L1->stack = luaM_newvector(L, BASIC_STACK_SIZE, TValue);
+ L1->stacksize = BASIC_STACK_SIZE;
+ for (i = 0; i < BASIC_STACK_SIZE; i++)
+ setnilvalue(L1->stack + i); /* erase new stack */
+ L1->top = L1->stack;
+ L1->stack_last = L1->stack + L1->stacksize - EXTRA_STACK;
+ /* initialize first ci */
+ ci = &L1->base_ci;
+ ci->next = ci->previous = NULL;
+ ci->callstatus = 0;
+ ci->func = L1->top;
+ setnilvalue(L1->top++); /* 'function' entry for this 'ci' */
+ ci->top = L1->top + LUA_MINSTACK;
+ L1->ci = ci;
+}
+
+
+static void freestack (lua_State *L) {
+ if (L->stack == NULL)
+ return; /* stack not completely built yet */
+ L->ci = &L->base_ci; /* free the entire 'ci' list */
+ luaE_freeCI(L);
+ luaM_freearray(L, L->stack, L->stacksize); /* free stack array */
+}
+
+
+/*
+** Create registry table and its predefined values
+*/
+static void init_registry (lua_State *L, global_State *g) {
+ TValue mt;
+ /* create registry */
+ Table *registry = luaH_new(L);
+ sethvalue(L, &g->l_registry, registry);
+ luaH_resize(L, registry, LUA_RIDX_LAST, 0);
+ /* registry[LUA_RIDX_MAINTHREAD] = L */
+ setthvalue(L, &mt, L);
+ luaH_setint(L, registry, LUA_RIDX_MAINTHREAD, &mt);
+ /* registry[LUA_RIDX_GLOBALS] = table of globals */
+ sethvalue(L, &mt, luaH_new(L));
+ luaH_setint(L, registry, LUA_RIDX_GLOBALS, &mt);
+}
+
+
+/*
+** open parts of the state that may cause memory-allocation errors
+*/
+static void f_luaopen (lua_State *L, void *ud) {
+ global_State *g = G(L);
+ UNUSED(ud);
+ stack_init(L, L); /* init stack */
+ init_registry(L, g);
+ luaS_resize(L, MINSTRTABSIZE); /* initial size of string table */
+ luaT_init(L);
+ luaX_init(L);
+ /* pre-create memory-error message */
+ g->memerrmsg = luaS_newliteral(L, MEMERRMSG);
+ luaS_fix(g->memerrmsg); /* it should never be collected */
+ g->gcrunning = 1; /* allow gc */
+ g->version = lua_version(NULL);
+ luai_userstateopen(L);
+}
+
+
+/*
+** preinitialize a state with consistent values without allocating
+** any memory (to avoid errors)
+*/
+static void preinit_state (lua_State *L, global_State *g) {
+ G(L) = g;
+ L->stack = NULL;
+ L->ci = NULL;
+ L->stacksize = 0;
+ L->errorJmp = NULL;
+ L->nCcalls = 0;
+ L->hook = NULL;
+ L->hookmask = 0;
+ L->basehookcount = 0;
+ L->allowhook = 1;
+ resethookcount(L);
+ L->openupval = NULL;
+ L->nny = 1;
+ L->status = LUA_OK;
+ L->errfunc = 0;
+ L->runerror = 0;
+}
+
+
+static void close_state (lua_State *L) {
+ global_State *g = G(L);
+ luaF_close(L, L->stack); /* close all upvalues for this thread */
+ luaC_freeallobjects(L); /* collect all objects */
+ if (g->version) /* closing a fully built state? */
+ luai_userstateclose(L);
+ luaM_freearray(L, G(L)->strt.hash, G(L)->strt.size);
+ luaZ_freebuffer(L, &g->buff);
+ freestack(L);
+ lua_assert(gettotalbytes(g) == sizeof(LG));
+ (*g->frealloc)(g->ud, fromstate(L), sizeof(LG), 0); /* free main block */
+}
+
+
+LUA_API lua_State *lua_newthread (lua_State *L) {
+ lua_State *L1;
+ lua_lock(L);
+ luaC_checkGC(L);
+ L1 = &luaC_newobj(L, LUA_TTHREAD, sizeof(LX), NULL, offsetof(LX, l))->th;
+ setthvalue(L, L->top, L1);
+ api_incr_top(L);
+ preinit_state(L1, G(L));
+ L1->hookmask = L->hookmask;
+ L1->basehookcount = L->basehookcount;
+ L1->hook = L->hook;
+ resethookcount(L1);
+ luai_userstatethread(L, L1);
+ stack_init(L1, L); /* init stack */
+ lua_unlock(L);
+ return L1;
+}
+
+
+void luaE_freethread (lua_State *L, lua_State *L1) {
+ LX *l = fromstate(L1);
+ luaF_close(L1, L1->stack); /* close all upvalues for this thread */
+ lua_assert(L1->openupval == NULL);
+ luai_userstatefree(L, L1);
+ freestack(L1);
+ luaM_free(L, l);
+}
+
+
+LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) {
+ int i;
+ lua_State *L;
+ global_State *g;
+ LG *l = cast(LG *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG)));
+ if (l == NULL) return NULL;
+ L = &l->l.l;
+ g = &l->g;
+ L->next = NULL;
+ L->tt = LUA_TTHREAD;
+ g->currentwhite = bit2mask(WHITE0BIT, FIXEDBIT);
+ L->marked = luaC_white(g);
+ g->gckind = KGC_NORMAL;
+ preinit_state(L, g);
+ g->frealloc = f;
+ g->ud = ud;
+ g->mainthread = L;
+ g->seed = makeseed(L);
+ g->uvhead.u.l.prev = &g->uvhead;
+ g->uvhead.u.l.next = &g->uvhead;
+ g->gcrunning = 0; /* no GC while building state */
+ g->GCestimate = 0;
+ g->strt.size = 0;
+ g->strt.nuse = 0;
+ g->strt.hash = NULL;
+ setnilvalue(&g->l_registry);
+ luaZ_initbuffer(L, &g->buff);
+ g->panic = NULL;
+ g->version = NULL;
+ g->gcstate = GCSpause;
+ g->allgc = NULL;
+ g->finobj = NULL;
+ g->tobefnz = NULL;
+ g->sweepgc = g->sweepfin = NULL;
+ g->gray = g->grayagain = NULL;
+ g->weak = g->ephemeron = g->allweak = NULL;
+ g->totalbytes = sizeof(LG);
+ g->GCdebt = 0;
+ g->gcpause = LUAI_GCPAUSE;
+ g->gcmajorinc = LUAI_GCMAJOR;
+ g->gcstepmul = LUAI_GCMUL;
+ for (i=0; i < LUA_NUMTAGS; i++) g->mt[i] = NULL;
+ if (luaD_rawrunprotected(L, f_luaopen, NULL) != LUA_OK) {
+ /* memory allocation error: free partial state */
+ close_state(L);
+ L = NULL;
+ }
+ return L;
+}
+
+
+LUA_API void lua_close (lua_State *L) {
+ L = G(L)->mainthread; /* only the main thread can be closed */
+ lua_lock(L);
+ close_state(L);
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lstate.h b/sys/contrib/openzfs/module/lua/lstate.h
new file mode 100644
index 000000000000..b636396a6015
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lstate.h
@@ -0,0 +1,230 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstate.h,v 2.82.1.1 2013/04/12 18:48:47 roberto Exp $
+** Global State
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lstate_h
+#define lstate_h
+
+#include <sys/lua/lua.h>
+
+#include "lobject.h"
+#include "ltm.h"
+#include "lzio.h"
+
+
+/*
+
+** Some notes about garbage-collected objects: All objects in Lua must
+** be kept somehow accessible until being freed.
+**
+** Lua keeps most objects linked in list g->allgc. The link uses field
+** 'next' of the CommonHeader.
+**
+** Strings are kept in several lists headed by the array g->strt.hash.
+**
+** Open upvalues are not subject to independent garbage collection. They
+** are collected together with their respective threads. Lua keeps a
+** double-linked list with all open upvalues (g->uvhead) so that it can
+** mark objects referred by them. (They are always gray, so they must
+** be remarked in the atomic step. Usually their contents would be marked
+** when traversing the respective threads, but the thread may already be
+** dead, while the upvalue is still accessible through closures.)
+**
+** Objects with finalizers are kept in the list g->finobj.
+**
+** The list g->tobefnz links all objects being finalized.
+
+*/
+
+
+struct lua_longjmp; /* defined in ldo.c */
+
+
+
+/* extra stack space to handle TM calls and some other extras */
+#define EXTRA_STACK 5
+
+
+#define BASIC_STACK_SIZE (2*LUA_MINSTACK)
+
+
+/* kinds of Garbage Collection */
+#define KGC_NORMAL 0
+#define KGC_EMERGENCY 1 /* gc was forced by an allocation failure */
+#define KGC_GEN 2 /* generational collection */
+
+
+typedef struct stringtable {
+ GCObject **hash;
+ lu_int32 nuse; /* number of elements */
+ int size;
+} stringtable;
+
+
+/*
+** information about a call
+*/
+typedef struct CallInfo {
+ StkId func; /* function index in the stack */
+ StkId top; /* top for this function */
+ struct CallInfo *previous, *next; /* dynamic call link */
+ short nresults; /* expected number of results from this function */
+ lu_byte callstatus;
+ ptrdiff_t extra;
+ union {
+ struct { /* only for Lua functions */
+ StkId base; /* base for this function */
+ const Instruction *savedpc;
+ } l;
+ struct { /* only for C functions */
+ int ctx; /* context info. in case of yields */
+ lua_CFunction k; /* continuation in case of yields */
+ ptrdiff_t old_errfunc;
+ lu_byte old_allowhook;
+ lu_byte status;
+ } c;
+ } u;
+} CallInfo;
+
+
+/*
+** Bits in CallInfo status
+*/
+#define CIST_LUA (1<<0) /* call is running a Lua function */
+#define CIST_HOOKED (1<<1) /* call is running a debug hook */
+#define CIST_REENTRY (1<<2) /* call is running on same invocation of
+ luaV_execute of previous call */
+#define CIST_YIELDED (1<<3) /* call reentered after suspension */
+#define CIST_YPCALL (1<<4) /* call is a yieldable protected call */
+#define CIST_STAT (1<<5) /* call has an error status (pcall) */
+#define CIST_TAIL (1<<6) /* call was tail called */
+#define CIST_HOOKYIELD (1<<7) /* last hook called yielded */
+
+
+#define isLua(ci) ((ci)->callstatus & CIST_LUA)
+
+
+/*
+** `global state', shared by all threads of this state
+*/
+typedef struct global_State {
+ lua_Alloc frealloc; /* function to reallocate memory */
+ void *ud; /* auxiliary data to `frealloc' */
+ lu_mem totalbytes; /* number of bytes currently allocated - GCdebt */
+ l_mem GCdebt; /* bytes allocated not yet compensated by the collector */
+ lu_mem GCmemtrav; /* memory traversed by the GC */
+ lu_mem GCestimate; /* an estimate of the non-garbage memory in use */
+ stringtable strt; /* hash table for strings */
+ TValue l_registry;
+ unsigned int seed; /* randomized seed for hashes */
+ lu_byte currentwhite;
+ lu_byte gcstate; /* state of garbage collector */
+ lu_byte gckind; /* kind of GC running */
+ lu_byte gcrunning; /* true if GC is running */
+ int sweepstrgc; /* position of sweep in `strt' */
+ GCObject *allgc; /* list of all collectable objects */
+ GCObject *finobj; /* list of collectable objects with finalizers */
+ GCObject **sweepgc; /* current position of sweep in list 'allgc' */
+ GCObject **sweepfin; /* current position of sweep in list 'finobj' */
+ GCObject *gray; /* list of gray objects */
+ GCObject *grayagain; /* list of objects to be traversed atomically */
+ GCObject *weak; /* list of tables with weak values */
+ GCObject *ephemeron; /* list of ephemeron tables (weak keys) */
+ GCObject *allweak; /* list of all-weak tables */
+ GCObject *tobefnz; /* list of userdata to be GC */
+ UpVal uvhead; /* head of double-linked list of all open upvalues */
+ Mbuffer buff; /* temporary buffer for string concatenation */
+ int gcpause; /* size of pause between successive GCs */
+ int gcmajorinc; /* pause between major collections (only in gen. mode) */
+ int gcstepmul; /* GC `granularity' */
+ lua_CFunction panic; /* to be called in unprotected errors */
+ struct lua_State *mainthread;
+ const lua_Number *version; /* pointer to version number */
+ TString *memerrmsg; /* memory-error message */
+ TString *tmname[TM_N]; /* array with tag-method names */
+ struct Table *mt[LUA_NUMTAGS]; /* metatables for basic types */
+} global_State;
+
+
+/*
+** `per thread' state
+*/
+struct lua_State {
+ CommonHeader;
+ lu_byte status;
+ StkId top; /* first free slot in the stack */
+ global_State *l_G;
+ CallInfo *ci; /* call info for current function */
+ const Instruction *oldpc; /* last pc traced */
+ StkId stack_last; /* last free slot in the stack */
+ StkId stack; /* stack base */
+ int stacksize;
+ unsigned short nny; /* number of non-yieldable calls in stack */
+ unsigned short nCcalls; /* number of nested C calls */
+ lu_byte hookmask;
+ lu_byte allowhook;
+ lu_byte runerror; /* handling a runtime error */
+ int basehookcount;
+ int hookcount;
+ lua_Hook hook;
+ GCObject *openupval; /* list of open upvalues in this stack */
+ GCObject *gclist;
+ struct lua_longjmp *errorJmp; /* current error recover point */
+ ptrdiff_t errfunc; /* current error handling function (stack index) */
+ CallInfo base_ci; /* CallInfo for first level (C calling Lua) */
+};
+
+
+#define G(L) (L->l_G)
+
+
+/*
+** Union of all collectable objects
+*/
+union GCObject {
+ GCheader gch; /* common header */
+ union TString ts;
+ union Udata u;
+ union Closure cl;
+ struct Table h;
+ struct Proto p;
+ struct UpVal uv;
+ struct lua_State th; /* thread */
+};
+
+
+#define gch(o) (&(o)->gch)
+
+/* macros to convert a GCObject into a specific value */
+#define rawgco2ts(o) \
+ check_exp(novariant((o)->gch.tt) == LUA_TSTRING, &((o)->ts))
+#define gco2ts(o) (&rawgco2ts(o)->tsv)
+#define rawgco2u(o) check_exp((o)->gch.tt == LUA_TUSERDATA, &((o)->u))
+#define gco2u(o) (&rawgco2u(o)->uv)
+#define gco2lcl(o) check_exp((o)->gch.tt == LUA_TLCL, &((o)->cl.l))
+#define gco2ccl(o) check_exp((o)->gch.tt == LUA_TCCL, &((o)->cl.c))
+#define gco2cl(o) \
+ check_exp(novariant((o)->gch.tt) == LUA_TFUNCTION, &((o)->cl))
+#define gco2t(o) check_exp((o)->gch.tt == LUA_TTABLE, &((o)->h))
+#define gco2p(o) check_exp((o)->gch.tt == LUA_TPROTO, &((o)->p))
+#define gco2uv(o) check_exp((o)->gch.tt == LUA_TUPVAL, &((o)->uv))
+#define gco2th(o) check_exp((o)->gch.tt == LUA_TTHREAD, &((o)->th))
+
+/* macro to convert any Lua object into a GCObject */
+#define obj2gco(v) (cast(GCObject *, (v)))
+
+
+/* actual number of total bytes allocated */
+#define gettotalbytes(g) ((g)->totalbytes + (g)->GCdebt)
+
+LUAI_FUNC void luaE_setdebt (global_State *g, l_mem debt);
+LUAI_FUNC void luaE_freethread (lua_State *L, lua_State *L1);
+LUAI_FUNC CallInfo *luaE_extendCI (lua_State *L);
+LUAI_FUNC void luaE_freeCI (lua_State *L);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lstring.c b/sys/contrib/openzfs/module/lua/lstring.c
new file mode 100644
index 000000000000..7fcef3d88aa3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lstring.c
@@ -0,0 +1,186 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstring.c,v 2.26.1.1 2013/04/12 18:48:47 roberto Exp $
+** String table (keeps all strings handled by Lua)
+** See Copyright Notice in lua.h
+*/
+
+
+#define lstring_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+
+
+/*
+** Lua will use at most ~(2^LUAI_HASHLIMIT) bytes from a string to
+** compute its hash
+*/
+#if !defined(LUAI_HASHLIMIT)
+#define LUAI_HASHLIMIT 5
+#endif
+
+
+/*
+** equality for long strings
+*/
+int luaS_eqlngstr (TString *a, TString *b) {
+ size_t len = a->tsv.len;
+ lua_assert(a->tsv.tt == LUA_TLNGSTR && b->tsv.tt == LUA_TLNGSTR);
+ return (a == b) || /* same instance or... */
+ ((len == b->tsv.len) && /* equal length and ... */
+ (memcmp(getstr(a), getstr(b), len) == 0)); /* equal contents */
+}
+
+
+/*
+** equality for strings
+*/
+int luaS_eqstr (TString *a, TString *b) {
+ return (a->tsv.tt == b->tsv.tt) &&
+ (a->tsv.tt == LUA_TSHRSTR ? eqshrstr(a, b) : luaS_eqlngstr(a, b));
+}
+
+
+unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
+ unsigned int h = seed ^ cast(unsigned int, l);
+ size_t l1;
+ size_t step = (l >> LUAI_HASHLIMIT) + 1;
+ for (l1 = l; l1 >= step; l1 -= step)
+ h = h ^ ((h<<5) + (h>>2) + cast_byte(str[l1 - 1]));
+ return h;
+}
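+
+
+/*
+** Illustrative note (not part of upstream Lua): with LUAI_HASHLIMIT == 5,
+** step == (l >> 5) + 1, so every byte of a string shorter than 32 bytes
+** is mixed into the hash, while a 1 MiB string is sampled at only about
+** 32 positions (roughly one every 32 KiB), keeping hashing of very long
+** keys cheap.
+*/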
+
+
+/*
+** resizes the string table
+*/
+void luaS_resize (lua_State *L, int newsize) {
+ int i;
+ stringtable *tb = &G(L)->strt;
+ /* cannot resize while GC is traversing strings */
+ luaC_runtilstate(L, ~bitmask(GCSsweepstring));
+ if (newsize > tb->size) {
+ luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
+ for (i = tb->size; i < newsize; i++) tb->hash[i] = NULL;
+ }
+ /* rehash */
+ for (i=0; i<tb->size; i++) {
+ GCObject *p = tb->hash[i];
+ tb->hash[i] = NULL;
+ while (p) { /* for each node in the list */
+ GCObject *next = gch(p)->next; /* save next */
+ unsigned int h = lmod(gco2ts(p)->hash, newsize); /* new position */
+ gch(p)->next = tb->hash[h]; /* chain it */
+ tb->hash[h] = p;
+ resetoldbit(p); /* see MOVE OLD rule */
+ p = next;
+ }
+ }
+ if (newsize < tb->size) {
+ /* shrinking slice must be empty */
+ lua_assert(tb->hash[newsize] == NULL && tb->hash[tb->size - 1] == NULL);
+ luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
+ }
+ tb->size = newsize;
+}
+
+
+/*
+** creates a new string object
+*/
+static TString *createstrobj (lua_State *L, const char *str, size_t l,
+ int tag, unsigned int h, GCObject **list) {
+ TString *ts;
+ char *sbuf;
+ size_t totalsize; /* total size of TString object */
+ totalsize = sizeof(TString) + ((l + 1) * sizeof(char));
+ ts = &luaC_newobj(L, tag, totalsize, list, 0)->ts;
+ ts->tsv.len = l;
+ ts->tsv.hash = h;
+ ts->tsv.extra = 0;
+ sbuf = (char *)(TString *)(ts + 1);
+ memcpy(sbuf, str, l*sizeof(char));
+ sbuf[l] = '\0'; /* ending 0 */
+ return ts;
+}
+
+
+/*
+** creates a new short string, inserting it into string table
+*/
+static TString *newshrstr (lua_State *L, const char *str, size_t l,
+ unsigned int h) {
+ GCObject **list; /* (pointer to) list where it will be inserted */
+ stringtable *tb = &G(L)->strt;
+ TString *s;
+ if (tb->nuse >= cast(lu_int32, tb->size) && tb->size <= MAX_INT/2)
+ luaS_resize(L, tb->size*2); /* too crowded */
+ list = &tb->hash[lmod(h, tb->size)];
+ s = createstrobj(L, str, l, LUA_TSHRSTR, h, list);
+ tb->nuse++;
+ return s;
+}
+
+
+/*
+** checks whether short string exists and reuses it or creates a new one
+*/
+static TString *internshrstr (lua_State *L, const char *str, size_t l) {
+ GCObject *o;
+ global_State *g = G(L);
+ unsigned int h = luaS_hash(str, l, g->seed);
+ for (o = g->strt.hash[lmod(h, g->strt.size)];
+ o != NULL;
+ o = gch(o)->next) {
+ TString *ts = rawgco2ts(o);
+ if (h == ts->tsv.hash &&
+ l == ts->tsv.len &&
+ (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) {
+ if (isdead(G(L), o)) /* string is dead (but was not collected yet)? */
+ changewhite(o); /* resurrect it */
+ return ts;
+ }
+ }
+ return newshrstr(L, str, l, h); /* not found; create a new string */
+}
+
+
+/*
+** new string (with explicit length)
+*/
+TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
+ if (l <= LUAI_MAXSHORTLEN) /* short string? */
+ return internshrstr(L, str, l);
+ else {
+ if (l + 1 > (MAX_SIZET - sizeof(TString))/sizeof(char))
+ luaM_toobig(L);
+ return createstrobj(L, str, l, LUA_TLNGSTR, G(L)->seed, NULL);
+ }
+}
+
+
+/*
+** new zero-terminated string
+*/
+TString *luaS_new (lua_State *L, const char *str) {
+ return luaS_newlstr(L, str, strlen(str));
+}
+
+
+Udata *luaS_newudata (lua_State *L, size_t s, Table *e) {
+ Udata *u;
+ if (s > MAX_SIZET - sizeof(Udata))
+ luaM_toobig(L);
+ u = &luaC_newobj(L, LUA_TUSERDATA, sizeof(Udata) + s, NULL, 0)->u;
+ u->uv.len = s;
+ u->uv.metatable = NULL;
+ u->uv.env = e;
+ return u;
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lstring.h b/sys/contrib/openzfs/module/lua/lstring.h
new file mode 100644
index 000000000000..66e65379b8e7
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lstring.h
@@ -0,0 +1,48 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstring.h,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $
+** String table (keep all strings handled by Lua)
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lstring_h
+#define lstring_h
+
+#include "lgc.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+#define sizestring(s) (sizeof(union TString)+((s)->len+1)*sizeof(char))
+
+#define sizeudata(u) (sizeof(union Udata)+(u)->len)
+
+#define luaS_newliteral(L, s) (luaS_newlstr(L, "" s, \
+ (sizeof(s)/sizeof(char))-1))
+
+#define luaS_fix(s) l_setbit((s)->tsv.marked, FIXEDBIT)
+
+
+/*
+** test whether a string is a reserved word
+*/
+#define isreserved(s) ((s)->tsv.tt == LUA_TSHRSTR && (s)->tsv.extra > 0)
+
+
+/*
+** equality for short strings, which are always internalized
+*/
+#define eqshrstr(a,b) check_exp((a)->tsv.tt == LUA_TSHRSTR, (a) == (b))
+
+
+LUAI_FUNC unsigned int luaS_hash (const char *str, size_t l, unsigned int seed);
+LUAI_FUNC int luaS_eqlngstr (TString *a, TString *b);
+LUAI_FUNC int luaS_eqstr (TString *a, TString *b);
+LUAI_FUNC void luaS_resize (lua_State *L, int newsize);
+LUAI_FUNC Udata *luaS_newudata (lua_State *L, size_t s, Table *e);
+LUAI_FUNC TString *luaS_newlstr (lua_State *L, const char *str, size_t l);
+LUAI_FUNC TString *luaS_new (lua_State *L, const char *str);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lstrlib.c b/sys/contrib/openzfs/module/lua/lstrlib.c
new file mode 100644
index 000000000000..12027757bf53
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lstrlib.c
@@ -0,0 +1,1040 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstrlib.c,v 1.178.1.1 2013/04/12 18:48:47 roberto Exp $
+** Standard library for string operations and pattern-matching
+** See Copyright Notice in lua.h
+*/
+
+
+#define lstrlib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+#include <sys/lua/lualib.h>
+
+
+/*
+** maximum number of captures that a pattern can do during
+** pattern-matching. This limit is arbitrary.
+*/
+#if !defined(LUA_MAXCAPTURES)
+#define LUA_MAXCAPTURES 16
+#endif
+
+
+/* macro to `unsign' a character */
+#define uchar(c) ((unsigned char)(c))
+
+/*
+ * The provided version of sprintf returns a char *, but str_format expects
+ * it to return the number of characters printed. This version has the expected
+ * behavior.
+ */
+static size_t str_sprintf(char *buf, const char *fmt, ...) {
+ va_list args;
+ size_t len;
+
+ va_start(args, fmt);
+ len = vsnprintf(buf, INT_MAX, fmt, args);
+ va_end(args);
+
+ return len;
+}
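This wrapper matters because str_format below advances the output buffer by the return value (via luaL_addsize). A tiny userland check of the property being relied on, namely that vsnprintf returns the number of characters it produced; the demo is illustrative only and not part of the module.

#include <stdarg.h>
#include <stdio.h>

static int count_sprintf(char *buf, size_t cap, const char *fmt, ...) {
    va_list ap;
    int len;
    va_start(ap, fmt);
    len = vsnprintf(buf, cap, fmt, ap);  /* returns characters printed */
    va_end(ap);
    return len;
}

int main(void) {
    char buf[64];
    int n = count_sprintf(buf, sizeof(buf), "%5d|%s", 42, "zfs");
    printf("wrote %d chars: \"%s\"\n", n, buf);  /* wrote 9 chars: "   42|zfs" */
    return 0;
}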
+
+
+static int str_len (lua_State *L) {
+ size_t l;
+ luaL_checklstring(L, 1, &l);
+ lua_pushinteger(L, (lua_Integer)l);
+ return 1;
+}
+
+
+/* translate a relative string position: negative means back from end */
+static size_t posrelat (ptrdiff_t pos, size_t len) {
+ if (pos >= 0) return (size_t)pos;
+ else if (0u - (size_t)pos > len) return 0;
+ else return len - ((size_t)-pos) + 1;
+}
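For a string of length 5 this maps positions as follows: 3 stays 3, -1 becomes 5 (the last byte), -5 becomes 1, and -9 clamps to 0 (callers such as str_sub then clamp 0 up to 1). A standalone copy that prints exactly those cases:

#include <stdio.h>
#include <stddef.h>

static size_t posrelat(ptrdiff_t pos, size_t len) {
    if (pos >= 0) return (size_t)pos;
    else if (0u - (size_t)pos > len) return 0;
    else return len - ((size_t)-pos) + 1;
}

int main(void) {
    ptrdiff_t cases[] = { 3, -1, -5, -9 };
    size_t i;
    for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
        printf("posrelat(%td, 5) = %zu\n", cases[i], posrelat(cases[i], 5));
    return 0;  /* prints 3, 5, 1, 0 */
}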
+
+
+static int str_sub (lua_State *L) {
+ size_t l;
+ const char *s = luaL_checklstring(L, 1, &l);
+ size_t start = posrelat(luaL_checkinteger(L, 2), l);
+ size_t end = posrelat(luaL_optinteger(L, 3, -1), l);
+ if (start < 1) start = 1;
+ if (end > l) end = l;
+ if (start <= end)
+ lua_pushlstring(L, s + start - 1, end - start + 1);
+ else lua_pushliteral(L, "");
+ return 1;
+}
+
+
+static int str_reverse (lua_State *L) {
+ size_t l, i;
+ luaL_Buffer b;
+ const char *s = luaL_checklstring(L, 1, &l);
+ char *p = luaL_buffinitsize(L, &b, l);
+ for (i = 0; i < l; i++)
+ p[i] = s[l - i - 1];
+ luaL_pushresultsize(&b, l);
+ return 1;
+}
+
+
+static int str_lower (lua_State *L) {
+ size_t l;
+ size_t i;
+ luaL_Buffer b;
+ const char *s = luaL_checklstring(L, 1, &l);
+ char *p = luaL_buffinitsize(L, &b, l);
+ for (i=0; i<l; i++)
+ p[i] = tolower(uchar(s[i]));
+ luaL_pushresultsize(&b, l);
+ return 1;
+}
+
+
+static int str_upper (lua_State *L) {
+ size_t l;
+ size_t i;
+ luaL_Buffer b;
+ const char *s = luaL_checklstring(L, 1, &l);
+ char *p = luaL_buffinitsize(L, &b, l);
+ for (i=0; i<l; i++)
+ p[i] = toupper(uchar(s[i]));
+ luaL_pushresultsize(&b, l);
+ return 1;
+}
+
+
+/* reasonable limit to avoid arithmetic overflow */
+#define MAXSIZE ((~(size_t)0) >> 1)
+
+static int str_rep (lua_State *L) {
+ size_t l, lsep;
+ const char *s = luaL_checklstring(L, 1, &l);
+ int n = luaL_checkint(L, 2);
+ const char *sep = luaL_optlstring(L, 3, "", &lsep);
+ if (n <= 0) lua_pushliteral(L, "");
+ else if (l + lsep < l || l + lsep >= MAXSIZE / n) /* may overflow? */
+ return luaL_error(L, "resulting string too large");
+ else {
+ size_t totallen = n * l + (n - 1) * lsep;
+ luaL_Buffer b;
+ char *p = luaL_buffinitsize(L, &b, totallen);
+ while (n-- > 1) { /* first n-1 copies (followed by separator) */
+ memcpy(p, s, l * sizeof(char)); p += l;
+ if (lsep > 0) { /* avoid empty 'memcpy' (may be expensive) */
+ memcpy(p, sep, lsep * sizeof(char)); p += lsep;
+ }
+ }
+ memcpy(p, s, l * sizeof(char)); /* last copy (not followed by separator) */
+ luaL_pushresultsize(&b, totallen);
+ }
+ return 1;
+}
+
+
+static int str_byte (lua_State *L) {
+ size_t l;
+ const char *s = luaL_checklstring(L, 1, &l);
+ size_t posi = posrelat(luaL_optinteger(L, 2, 1), l);
+ size_t pose = posrelat(luaL_optinteger(L, 3, posi), l);
+ int n, i;
+ if (posi < 1) posi = 1;
+ if (pose > l) pose = l;
+ if (posi > pose) return 0; /* empty interval; return no values */
+ n = (int)(pose - posi + 1);
+ if (posi + n <= pose) /* (size_t -> int) overflow? */
+ return luaL_error(L, "string slice too long");
+ luaL_checkstack(L, n, "string slice too long");
+ for (i=0; i<n; i++)
+ lua_pushinteger(L, uchar(s[posi+i-1]));
+ return n;
+}
+
+
+static int str_char (lua_State *L) {
+ int n = lua_gettop(L); /* number of arguments */
+ int i;
+ luaL_Buffer b;
+ char *p = luaL_buffinitsize(L, &b, n);
+ for (i=1; i<=n; i++) {
+ int c = luaL_checkint(L, i);
+ luaL_argcheck(L, uchar(c) == c, i, "value out of range");
+ p[i - 1] = uchar(c);
+ }
+ luaL_pushresultsize(&b, n);
+ return 1;
+}
+
+
+#if defined(LUA_USE_DUMP)
+static int writer (lua_State *L, const void* b, size_t size, void* B) {
+ (void)L;
+ luaL_addlstring((luaL_Buffer*) B, (const char *)b, size);
+ return 0;
+}
+
+
+static int str_dump (lua_State *L) {
+ luaL_Buffer b;
+ luaL_checktype(L, 1, LUA_TFUNCTION);
+ lua_settop(L, 1);
+ luaL_buffinit(L,&b);
+ if (lua_dump(L, writer, &b) != 0)
+ return luaL_error(L, "unable to dump given function");
+ luaL_pushresult(&b);
+ return 1;
+}
+#endif
+
+
+/*
+** {======================================================
+** PATTERN MATCHING
+** =======================================================
+*/
+
+
+#define CAP_UNFINISHED (-1)
+#define CAP_POSITION (-2)
+
+
+typedef struct MatchState {
+ int matchdepth; /* control for recursive depth (to avoid C stack overflow) */
+ const char *src_init; /* init of source string */
+ const char *src_end; /* end ('\0') of source string */
+ const char *p_end; /* end ('\0') of pattern */
+ lua_State *L;
+ int level; /* total number of captures (finished or unfinished) */
+ struct {
+ const char *init;
+ ptrdiff_t len;
+ } capture[LUA_MAXCAPTURES];
+} MatchState;
+
+
+/* recursive function */
+static const char *match (MatchState *ms, const char *s, const char *p);
+
+
+/* maximum recursion depth for 'match' */
+#if !defined(MAXCCALLS)
+#define MAXCCALLS 200
+#endif
+
+
+#define L_ESC '%'
+#define SPECIALS "^$*+?.([%-"
+
+
+static int check_capture (MatchState *ms, int l) {
+ l -= '1';
+ if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
+ return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
+ return l;
+}
+
+
+static int capture_to_close (MatchState *ms) {
+ int level = ms->level;
+ for (level--; level>=0; level--)
+ if (ms->capture[level].len == CAP_UNFINISHED) return level;
+ return luaL_error(ms->L, "invalid pattern capture");
+}
+
+
+static const char *classend (MatchState *ms, const char *p) {
+ switch (*p++) {
+ case L_ESC: {
+ if (p == ms->p_end)
+ luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
+ return p+1;
+ }
+ case '[': {
+ if (*p == '^') p++;
+ do { /* look for a `]' */
+ if (p == ms->p_end)
+ luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
+ if (*(p++) == L_ESC && p < ms->p_end)
+ p++; /* skip escapes (e.g. `%]') */
+ } while (*p != ']');
+ return p+1;
+ }
+ default: {
+ return p;
+ }
+ }
+}
+
+
+static int match_class (int c, int cl) {
+ int res;
+ switch (tolower(cl)) {
+ case 'a' : res = isalpha(c); break;
+ case 'c' : res = iscntrl(c); break;
+ case 'd' : res = isdigit(c); break;
+ case 'g' : res = isgraph(c); break;
+ case 'l' : res = islower(c); break;
+ case 'p' : res = ispunct(c); break;
+ case 's' : res = isspace(c); break;
+ case 'u' : res = isupper(c); break;
+ case 'w' : res = isalnum(c); break;
+ case 'x' : res = isxdigit(c); break;
+ case 'z' : res = (c == 0); break; /* deprecated option */
+ default: return (cl == c);
+ }
+ return (islower(cl) ? res : !res);
+}
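The lowercase/uppercase trick is what makes '%a' mean "letter" and '%A' mean "not a letter". A standalone replica exercising a few of the classes above (a hypothetical test program, mirroring the switch but not the full set of classes):

#include <ctype.h>
#include <stdio.h>

/* same dispatch as match_class: a lowercase letter selects the class,
   the uppercase letter selects its complement */
static int match_class(int c, int cl) {
    int res;
    switch (tolower(cl)) {
    case 'a': res = isalpha(c); break;
    case 'd': res = isdigit(c); break;
    case 's': res = isspace(c); break;
    default: return (cl == c);  /* anything else matches literally */
    }
    return (islower(cl) ? res : !res);
}

int main(void) {
    printf("%%a vs 'k': %d\n", match_class('k', 'a') != 0);  /* 1 */
    printf("%%A vs 'k': %d\n", match_class('k', 'A') != 0);  /* 0 */
    printf("%%d vs '7': %d\n", match_class('7', 'd') != 0);  /* 1 */
    printf("%%D vs '7': %d\n", match_class('7', 'D') != 0);  /* 0 */
    return 0;
}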
+
+
+static int matchbracketclass (int c, const char *p, const char *ec) {
+ int sig = 1;
+ if (*(p+1) == '^') {
+ sig = 0;
+ p++; /* skip the `^' */
+ }
+ while (++p < ec) {
+ if (*p == L_ESC) {
+ p++;
+ if (match_class(c, uchar(*p)))
+ return sig;
+ }
+ else if ((*(p+1) == '-') && (p+2 < ec)) {
+ p+=2;
+ if (uchar(*(p-2)) <= c && c <= uchar(*p))
+ return sig;
+ }
+ else if (uchar(*p) == c) return sig;
+ }
+ return !sig;
+}
+
+
+static int singlematch (MatchState *ms, const char *s, const char *p,
+ const char *ep) {
+ if (s >= ms->src_end)
+ return 0;
+ else {
+ int c = uchar(*s);
+ switch (*p) {
+ case '.': return 1; /* matches any char */
+ case L_ESC: return match_class(c, uchar(*(p+1)));
+ case '[': return matchbracketclass(c, p, ep-1);
+ default: return (uchar(*p) == c);
+ }
+ }
+}
+
+
+static const char *matchbalance (MatchState *ms, const char *s,
+ const char *p) {
+ if (p >= ms->p_end - 1)
+ luaL_error(ms->L, "malformed pattern "
+ "(missing arguments to " LUA_QL("%%b") ")");
+ if (*s != *p) return NULL;
+ else {
+ int b = *p;
+ int e = *(p+1);
+ int cont = 1;
+ while (++s < ms->src_end) {
+ if (*s == e) {
+ if (--cont == 0) return s+1;
+ }
+ else if (*s == b) cont++;
+ }
+ }
+ return NULL; /* string ends out of balance */
+}
+
+
+static const char *max_expand (MatchState *ms, const char *s,
+ const char *p, const char *ep) {
+ ptrdiff_t i = 0; /* counts maximum expand for item */
+ while (singlematch(ms, s + i, p, ep))
+ i++;
+ /* keeps trying to match with the maximum repetitions */
+ while (i>=0) {
+ const char *res = match(ms, (s+i), ep+1);
+ if (res) return res;
+ i--; /* else didn't match; reduce 1 repetition to try again */
+ }
+ return NULL;
+}
+
+
+static const char *min_expand (MatchState *ms, const char *s,
+ const char *p, const char *ep) {
+ for (;;) {
+ const char *res = match(ms, s, ep+1);
+ if (res != NULL)
+ return res;
+ else if (singlematch(ms, s, p, ep))
+ s++; /* try with one more repetition */
+ else return NULL;
+ }
+}
+
+
+static const char *start_capture (MatchState *ms, const char *s,
+ const char *p, int what) {
+ const char *res;
+ int level = ms->level;
+ if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
+ ms->capture[level].init = s;
+ ms->capture[level].len = what;
+ ms->level = level+1;
+ if ((res=match(ms, s, p)) == NULL) /* match failed? */
+ ms->level--; /* undo capture */
+ return res;
+}
+
+
+static const char *end_capture (MatchState *ms, const char *s,
+ const char *p) {
+ int l = capture_to_close(ms);
+ const char *res;
+ ms->capture[l].len = s - ms->capture[l].init; /* close capture */
+ if ((res = match(ms, s, p)) == NULL) /* match failed? */
+ ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
+ return res;
+}
+
+
+static const char *match_capture (MatchState *ms, const char *s, int l) {
+ size_t len;
+ l = check_capture(ms, l);
+ len = ms->capture[l].len;
+ if ((size_t)(ms->src_end-s) >= len &&
+ memcmp(ms->capture[l].init, s, len) == 0)
+ return s+len;
+ else return NULL;
+}
+
+
+static const char *match (MatchState *ms, const char *s, const char *p) {
+ if (ms->matchdepth-- == 0)
+ luaL_error(ms->L, "pattern too complex");
+ init: /* using goto's to optimize tail recursion */
+ if (p != ms->p_end) { /* end of pattern? */
+ switch (*p) {
+ case '(': { /* start capture */
+ if (*(p + 1) == ')') /* position capture? */
+ s = start_capture(ms, s, p + 2, CAP_POSITION);
+ else
+ s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
+ break;
+ }
+ case ')': { /* end capture */
+ s = end_capture(ms, s, p + 1);
+ break;
+ }
+ case '$': {
+ if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */
+ goto dflt; /* no; go to default */
+ s = (s == ms->src_end) ? s : NULL; /* check end of string */
+ break;
+ }
+ case L_ESC: { /* escaped sequences not in the format class[*+?-]? */
+ switch (*(p + 1)) {
+ case 'b': { /* balanced string? */
+ s = matchbalance(ms, s, p + 2);
+ if (s != NULL) {
+ p += 4; goto init; /* return match(ms, s, p + 4); */
+ } /* else fail (s == NULL) */
+ break;
+ }
+ case 'f': { /* frontier? */
+ const char *ep; char previous;
+ p += 2;
+ if (*p != '[')
+ luaL_error(ms->L, "missing " LUA_QL("[") " after "
+ LUA_QL("%%f") " in pattern");
+ ep = classend(ms, p); /* points to what is next */
+ previous = (s == ms->src_init) ? '\0' : *(s - 1);
+ if (!matchbracketclass(uchar(previous), p, ep - 1) &&
+ matchbracketclass(uchar(*s), p, ep - 1)) {
+ p = ep; goto init; /* return match(ms, s, ep); */
+ }
+ s = NULL; /* match failed */
+ break;
+ }
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ case '8': case '9': { /* capture results (%0-%9)? */
+ s = match_capture(ms, s, uchar(*(p + 1)));
+ if (s != NULL) {
+ p += 2; goto init; /* return match(ms, s, p + 2) */
+ }
+ break;
+ }
+ default: goto dflt;
+ }
+ break;
+ }
+ default: dflt: { /* pattern class plus optional suffix */
+ const char *ep = classend(ms, p); /* points to optional suffix */
+ /* does not match at least once? */
+ if (!singlematch(ms, s, p, ep)) {
+ if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */
+ p = ep + 1; goto init; /* return match(ms, s, ep + 1); */
+ }
+ else /* '+' or no suffix */
+ s = NULL; /* fail */
+ }
+ else { /* matched once */
+ switch (*ep) { /* handle optional suffix */
+ case '?': { /* optional */
+ const char *res;
+ if ((res = match(ms, s + 1, ep + 1)) != NULL)
+ s = res;
+ else {
+ p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */
+ }
+ break;
+ }
+ case '+': /* 1 or more repetitions */
+ s++; /* 1 match already done */
+ /* FALLTHROUGH */
+ case '*': /* 0 or more repetitions */
+ s = max_expand(ms, s, p, ep);
+ break;
+ case '-': /* 0 or more repetitions (minimum) */
+ s = min_expand(ms, s, p, ep);
+ break;
+ default: /* no suffix */
+ s++; p = ep; goto init; /* return match(ms, s + 1, ep); */
+ }
+ }
+ break;
+ }
+ }
+ }
+ ms->matchdepth++;
+ return s;
+}
+
+
+
+static const char *lmemfind (const char *s1, size_t l1,
+ const char *s2, size_t l2) {
+ if (l2 == 0) return s1; /* empty strings are everywhere */
+ else if (l2 > l1) return NULL; /* avoids a negative `l1' */
+ else {
+ const char *init; /* to search for a `*s2' inside `s1' */
+ l2--; /* 1st char will be checked by `memchr' */
+ l1 = l1-l2; /* `s2' cannot be found after that */
+ while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) {
+ init++; /* 1st char is already checked */
+ if (memcmp(init, s2+1, l2) == 0)
+ return init-1;
+ else { /* correct `l1' and `s1' to try again */
+ l1 -= init-s1;
+ s1 = init;
+ }
+ }
+ return NULL; /* not found */
+ }
+}
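lmemfind is the plain (non-pattern) search used by str_find_aux when the caller passes a true fourth argument or the pattern contains no specials. A quick userland check of its result against strstr, using a local copy of the helper (illustrative only):

#include <stdio.h>
#include <string.h>

/* local copy of the helper above, for a userland comparison */
static const char *lmemfind(const char *s1, size_t l1,
                            const char *s2, size_t l2) {
    if (l2 == 0) return s1;
    else if (l2 > l1) return NULL;
    else {
        const char *init;
        l2--;
        l1 = l1 - l2;
        while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) {
            init++;
            if (memcmp(init, s2 + 1, l2) == 0)
                return init - 1;
            else {
                l1 -= init - s1;
                s1 = init;
            }
        }
        return NULL;
    }
}

int main(void) {
    const char *hay = "zfs send | zfs receive";
    const char *hit = lmemfind(hay, strlen(hay), "receive", 7);
    printf("offset %td (strstr says %td)\n",
        hit - hay, strstr(hay, "receive") - hay);
    return 0;  /* both print offset 15 */
}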
+
+
+static void push_onecapture (MatchState *ms, int i, const char *s,
+ const char *e) {
+ if (i >= ms->level) {
+ if (i == 0) /* ms->level == 0, too */
+ lua_pushlstring(ms->L, s, e - s); /* add whole match */
+ else
+ luaL_error(ms->L, "invalid capture index");
+ }
+ else {
+ ptrdiff_t l = ms->capture[i].len;
+ if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
+ if (l == CAP_POSITION)
+ lua_pushinteger(ms->L, ms->capture[i].init - ms->src_init + 1);
+ else
+ lua_pushlstring(ms->L, ms->capture[i].init, l);
+ }
+}
+
+
+static int push_captures (MatchState *ms, const char *s, const char *e) {
+ int i;
+ int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
+ luaL_checkstack(ms->L, nlevels, "too many captures");
+ for (i = 0; i < nlevels; i++)
+ push_onecapture(ms, i, s, e);
+ return nlevels; /* number of strings pushed */
+}
+
+
+/* check whether pattern has no special characters */
+static int nospecials (const char *p, size_t l) {
+ size_t upto = 0;
+ do {
+ if (strpbrk(p + upto, SPECIALS))
+ return 0; /* pattern has a special character */
+ upto += strlen(p + upto) + 1; /* may have more after \0 */
+ } while (upto <= l);
+ return 1; /* no special chars found */
+}
+
+
+static int str_find_aux (lua_State *L, int find) {
+ size_t ls, lp;
+ const char *s = luaL_checklstring(L, 1, &ls);
+ const char *p = luaL_checklstring(L, 2, &lp);
+ size_t init = posrelat(luaL_optinteger(L, 3, 1), ls);
+ if (init < 1) init = 1;
+ else if (init > ls + 1) { /* start after string's end? */
+ lua_pushnil(L); /* cannot find anything */
+ return 1;
+ }
+ /* explicit request or no special characters? */
+ if (find && (lua_toboolean(L, 4) || nospecials(p, lp))) {
+ /* do a plain search */
+ const char *s2 = lmemfind(s + init - 1, ls - init + 1, p, lp);
+ if (s2) {
+ lua_pushinteger(L, s2 - s + 1);
+ lua_pushinteger(L, s2 - s + lp);
+ return 2;
+ }
+ }
+ else {
+ MatchState ms;
+ const char *s1 = s + init - 1;
+ int anchor = (*p == '^');
+ if (anchor) {
+ p++; lp--; /* skip anchor character */
+ }
+ ms.L = L;
+ ms.matchdepth = MAXCCALLS;
+ ms.src_init = s;
+ ms.src_end = s + ls;
+ ms.p_end = p + lp;
+ do {
+ const char *res;
+ ms.level = 0;
+ lua_assert(ms.matchdepth == MAXCCALLS);
+ if ((res=match(&ms, s1, p)) != NULL) {
+ if (find) {
+ lua_pushinteger(L, s1 - s + 1); /* start */
+ lua_pushinteger(L, res - s); /* end */
+ return push_captures(&ms, NULL, 0) + 2;
+ }
+ else
+ return push_captures(&ms, s1, res);
+ }
+ } while (s1++ < ms.src_end && !anchor);
+ }
+ lua_pushnil(L); /* not found */
+ return 1;
+}
+
+
+static int str_find (lua_State *L) {
+ return str_find_aux(L, 1);
+}
+
+
+static int str_match (lua_State *L) {
+ return str_find_aux(L, 0);
+}
+
+
+static int gmatch_aux (lua_State *L) {
+ MatchState ms;
+ size_t ls, lp;
+ const char *s = lua_tolstring(L, lua_upvalueindex(1), &ls);
+ const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp);
+ const char *src;
+ ms.L = L;
+ ms.matchdepth = MAXCCALLS;
+ ms.src_init = s;
+ ms.src_end = s+ls;
+ ms.p_end = p + lp;
+ for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
+ src <= ms.src_end;
+ src++) {
+ const char *e;
+ ms.level = 0;
+ lua_assert(ms.matchdepth == MAXCCALLS);
+ if ((e = match(&ms, src, p)) != NULL) {
+ lua_Integer newstart = e-s;
+ if (e == src) newstart++; /* empty match? go at least one position */
+ lua_pushinteger(L, newstart);
+ lua_replace(L, lua_upvalueindex(3));
+ return push_captures(&ms, src, e);
+ }
+ }
+ return 0; /* not found */
+}
+
+
+static int str_gmatch (lua_State *L) {
+ luaL_checkstring(L, 1);
+ luaL_checkstring(L, 2);
+ lua_settop(L, 2);
+ lua_pushinteger(L, 0);
+ lua_pushcclosure(L, gmatch_aux, 3);
+ return 1;
+}
+
+
+static void add_s (MatchState *ms, luaL_Buffer *b, const char *s,
+ const char *e) {
+ size_t l, i;
+ const char *news = lua_tolstring(ms->L, 3, &l);
+ for (i = 0; i < l; i++) {
+ if (news[i] != L_ESC)
+ luaL_addchar(b, news[i]);
+ else {
+ i++; /* skip ESC */
+ if (!isdigit(uchar(news[i]))) {
+ if (news[i] != L_ESC)
+ luaL_error(ms->L, "invalid use of " LUA_QL("%c")
+ " in replacement string", L_ESC);
+ luaL_addchar(b, news[i]);
+ }
+ else if (news[i] == '0')
+ luaL_addlstring(b, s, e - s);
+ else {
+ push_onecapture(ms, news[i] - '1', s, e);
+ luaL_addvalue(b); /* add capture to accumulated result */
+ }
+ }
+ }
+}
+
+
+static void add_value (MatchState *ms, luaL_Buffer *b, const char *s,
+ const char *e, int tr) {
+ lua_State *L = ms->L;
+ switch (tr) {
+ case LUA_TFUNCTION: {
+ int n;
+ lua_pushvalue(L, 3);
+ n = push_captures(ms, s, e);
+ lua_call(L, n, 1);
+ break;
+ }
+ case LUA_TTABLE: {
+ push_onecapture(ms, 0, s, e);
+ lua_gettable(L, 3);
+ break;
+ }
+ default: { /* LUA_TNUMBER or LUA_TSTRING */
+ add_s(ms, b, s, e);
+ return;
+ }
+ }
+ if (!lua_toboolean(L, -1)) { /* nil or false? */
+ lua_pop(L, 1);
+ lua_pushlstring(L, s, e - s); /* keep original text */
+ }
+ else if (!lua_isstring(L, -1))
+ luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
+ luaL_addvalue(b); /* add result to accumulator */
+}
+
+
+static int str_gsub (lua_State *L) {
+ size_t srcl, lp;
+ const char *src = luaL_checklstring(L, 1, &srcl);
+ const char *p = luaL_checklstring(L, 2, &lp);
+ int tr = lua_type(L, 3);
+ size_t max_s = luaL_optinteger(L, 4, srcl+1);
+ int anchor = (*p == '^');
+ size_t n = 0;
+ MatchState ms;
+ luaL_Buffer b;
+ luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
+ tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
+ "string/function/table expected");
+ luaL_buffinit(L, &b);
+ if (anchor) {
+ p++; lp--; /* skip anchor character */
+ }
+ ms.L = L;
+ ms.matchdepth = MAXCCALLS;
+ ms.src_init = src;
+ ms.src_end = src+srcl;
+ ms.p_end = p + lp;
+ while (n < max_s) {
+ const char *e;
+ ms.level = 0;
+ lua_assert(ms.matchdepth == MAXCCALLS);
+ e = match(&ms, src, p);
+ if (e) {
+ n++;
+ add_value(&ms, &b, src, e, tr);
+ }
+ if (e && e>src) /* non empty match? */
+ src = e; /* skip it */
+ else if (src < ms.src_end)
+ luaL_addchar(&b, *src++);
+ else break;
+ if (anchor) break;
+ }
+ luaL_addlstring(&b, src, ms.src_end-src);
+ luaL_pushresult(&b);
+ lua_pushinteger(L, n); /* number of substitutions */
+ return 2;
+}
+
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** STRING FORMAT
+** =======================================================
+*/
+
+/*
+** LUA_INTFRMLEN is the length modifier for integer conversions in
+** 'string.format'; LUA_INTFRM_T is the integer type corresponding to
+** the previous length
+*/
+#if !defined(LUA_INTFRMLEN) /* { */
+#if defined(LUA_USE_LONGLONG)
+
+#define LUA_INTFRMLEN "ll"
+#define LUA_INTFRM_T long long
+
+#else
+
+#define LUA_INTFRMLEN "l"
+#define LUA_INTFRM_T long
+
+#endif
+#endif /* } */
+
+
+/*
+** LUA_FLTFRMLEN is the length modifier for float conversions in
+** 'string.format'; LUA_FLTFRM_T is the float type corresponding to
+** the previous length
+*/
+#if !defined(LUA_FLTFRMLEN)
+
+#define LUA_FLTFRMLEN ""
+#define LUA_FLTFRM_T double
+
+#endif
+
+
+/* maximum size of each formatted item (> len(format('%99.99f', -1e308))) */
+#define MAX_ITEM 512
+/* valid flags in a format specification */
+#define FLAGS "-+ #0"
+/*
+** maximum size of each format specification (such as '%-099.99d')
+** (+10 accounts for %99.99x plus margin of error)
+*/
+#define MAX_FORMAT (sizeof(FLAGS) + sizeof(LUA_INTFRMLEN) + 10)
+
+
+static void addquoted (lua_State *L, luaL_Buffer *b, int arg) {
+ size_t l;
+ const char *s = luaL_checklstring(L, arg, &l);
+ luaL_addchar(b, '"');
+ while (l--) {
+ if (*s == '"' || *s == '\\' || *s == '\n') {
+ luaL_addchar(b, '\\');
+ luaL_addchar(b, *s);
+ }
+ else if (*s == '\0' || iscntrl(uchar(*s))) {
+ char buff[10];
+ if (!isdigit(uchar(*(s+1))))
+ snprintf(buff, sizeof(buff), "\\%d", (int)uchar(*s));
+ else
+ snprintf(buff, sizeof(buff), "\\%03d", (int)uchar(*s));
+ luaL_addstring(b, buff);
+ }
+ else
+ luaL_addchar(b, *s);
+ s++;
+ }
+ luaL_addchar(b, '"');
+}
+
+static const char *scanformat (lua_State *L, const char *strfrmt, char *form) {
+ const char *p = strfrmt;
+ while (*p != '\0' && strchr(FLAGS, *p) != NULL) p++; /* skip flags */
+ if ((size_t)(p - strfrmt) >= sizeof(FLAGS)/sizeof(char))
+ luaL_error(L, "invalid format (repeated flags)");
+ if (isdigit(uchar(*p))) p++; /* skip width */
+ if (isdigit(uchar(*p))) p++; /* (2 digits at most) */
+ if (*p == '.') {
+ p++;
+ if (isdigit(uchar(*p))) p++; /* skip precision */
+ if (isdigit(uchar(*p))) p++; /* (2 digits at most) */
+ }
+ if (isdigit(uchar(*p)))
+ luaL_error(L, "invalid format (width or precision too long)");
+ *(form++) = '%';
+ memcpy(form, strfrmt, (p - strfrmt + 1) * sizeof(char));
+ form += p - strfrmt + 1;
+ *form = '\0';
+ return p;
+}
+
+
+/*
+** add length modifier into formats
+*/
+static void addlenmod (char *form, const char *lenmod, size_t size) {
+ size_t l = strlen(form);
+ size_t lm = strlen(lenmod);
+ char spec = form[l - 1];
+ strlcpy(form + l - 1, lenmod, size - (l - 1));
+ form[l + lm - 1] = spec;
+ form[l + lm] = '\0';
+}
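addlenmod splices the length modifier in front of the conversion character, so "%5d" combined with LUA_INTFRMLEN "ll" becomes "%5lld". A userland sketch of the same splice, using memcpy instead of the kernel strlcpy and dropping the size parameter for brevity (names here are illustrative):

#include <stdio.h>
#include <string.h>

/* splice a length modifier just before the conversion character:
   "%5d" + "ll" -> "%5lld" */
static void add_len_mod(char *form, const char *lenmod) {
    size_t l = strlen(form);
    size_t lm = strlen(lenmod);
    char spec = form[l - 1];            /* remember the conversion char */
    memcpy(form + l - 1, lenmod, lm);   /* overwrite it with the modifier */
    form[l + lm - 1] = spec;            /* put the conversion char back */
    form[l + lm] = '\0';
}

int main(void) {
    char form[32] = "%5d";
    char out[64];
    add_len_mod(form, "ll");
    snprintf(out, sizeof(out), form, 1234567890123LL);
    printf("spliced format: %s, output: %s\n", form, out);
    return 0;  /* spliced format: %5lld, output: 1234567890123 */
}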
+
+
+static int str_format (lua_State *L) {
+ int top = lua_gettop(L);
+ int arg = 1;
+ size_t sfl;
+ const char *strfrmt = luaL_checklstring(L, arg, &sfl);
+ const char *strfrmt_end = strfrmt+sfl;
+ luaL_Buffer b;
+ luaL_buffinit(L, &b);
+ while (strfrmt < strfrmt_end) {
+ if (*strfrmt != L_ESC)
+ luaL_addchar(&b, *strfrmt++);
+ else if (*++strfrmt == L_ESC)
+ luaL_addchar(&b, *strfrmt++); /* %% */
+ else { /* format item */
+ char form[MAX_FORMAT]; /* to store the format (`%...') */
+ char *buff = luaL_prepbuffsize(&b, MAX_ITEM); /* to put formatted item */
+ int nb = 0; /* number of bytes in added item */
+ if (++arg > top)
+ luaL_argerror(L, arg, "no value");
+ strfrmt = scanformat(L, strfrmt, form);
+ switch (*strfrmt++) {
+ case 'c': {
+ nb = str_sprintf(buff, form, luaL_checkint(L, arg));
+ break;
+ }
+ case 'd': case 'i': {
+ lua_Number n = luaL_checknumber(L, arg);
+ LUA_INTFRM_T ni = (LUA_INTFRM_T)n;
+ lua_Number diff = n - (lua_Number)ni;
+ luaL_argcheck(L, -1 < diff && diff < 1, arg,
+ "not a number in proper range");
+ addlenmod(form, LUA_INTFRMLEN, MAX_FORMAT);
+ nb = str_sprintf(buff, form, ni);
+ break;
+ }
+ case 'o': case 'u': case 'x': case 'X': {
+ lua_Number n = luaL_checknumber(L, arg);
+ unsigned LUA_INTFRM_T ni = (unsigned LUA_INTFRM_T)n;
+ lua_Number diff = n - (lua_Number)ni;
+ luaL_argcheck(L, -1 < diff && diff < 1, arg,
+ "not a non-negative number in proper range");
+ addlenmod(form, LUA_INTFRMLEN, MAX_FORMAT);
+ nb = str_sprintf(buff, form, ni);
+ break;
+ }
+#if defined(LUA_USE_FLOAT_FORMATS)
+ case 'e': case 'E': case 'f':
+#if defined(LUA_USE_AFORMAT)
+ case 'a': case 'A':
+#endif
+ case 'g': case 'G': {
+ addlenmod(form, LUA_FLTFRMLEN, MAX_FORMAT);
+ nb = str_sprintf(buff, form, (LUA_FLTFRM_T)luaL_checknumber(L, arg));
+ break;
+ }
+#endif
+ case 'q': {
+ addquoted(L, &b, arg);
+ break;
+ }
+ case 's': {
+ size_t l;
+ const char *s = luaL_tolstring(L, arg, &l);
+ if (!strchr(form, '.') && l >= 100) {
+ /* no precision and string is too long to be formatted;
+ keep original string */
+ luaL_addvalue(&b);
+ break;
+ }
+ else {
+ nb = str_sprintf(buff, form, s);
+ lua_pop(L, 1); /* remove result from 'luaL_tolstring' */
+ break;
+ }
+ }
+ default: { /* also treat cases `pnLlh' */
+ return luaL_error(L, "invalid option " LUA_QL("%%%c") " to "
+ LUA_QL("format"), *(strfrmt - 1));
+ }
+ }
+ luaL_addsize(&b, nb);
+ }
+ }
+ luaL_pushresult(&b);
+ return 1;
+}
+
+/* }====================================================== */
+
+
+static const luaL_Reg strlib[] = {
+ {"byte", str_byte},
+ {"char", str_char},
+#if defined(LUA_USE_DUMP)
+ {"dump", str_dump},
+#endif
+ {"find", str_find},
+ {"format", str_format},
+ {"gmatch", str_gmatch},
+ {"gsub", str_gsub},
+ {"len", str_len},
+ {"lower", str_lower},
+ {"match", str_match},
+ {"rep", str_rep},
+ {"reverse", str_reverse},
+ {"sub", str_sub},
+ {"upper", str_upper},
+ {NULL, NULL}
+};
+
+
+static void createmetatable (lua_State *L) {
+ lua_createtable(L, 0, 1); /* table to be metatable for strings */
+ lua_pushliteral(L, ""); /* dummy string */
+ lua_pushvalue(L, -2); /* copy table */
+ lua_setmetatable(L, -2); /* set table as metatable for strings */
+ lua_pop(L, 1); /* pop dummy string */
+ lua_pushvalue(L, -2); /* get string library */
+ lua_setfield(L, -2, "__index"); /* metatable.__index = string */
+ lua_pop(L, 1); /* pop metatable */
+}
+
+
+/*
+** Open string library
+*/
+LUAMOD_API int luaopen_string (lua_State *L) {
+ luaL_newlib(L, strlib);
+ createmetatable(L);
+ return 1;
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaopen_string);
+
+#endif
+/* END CSTYLED */
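For reference, the library registered by luaopen_string above behaves like the string library of a stock userland Lua 5.2, except that string.dump and the floating-point formats are compiled in only when the corresponding defines are set. A hypothetical host-side check, linking against a regular liblua 5.2 rather than this kernel build (which uses the <sys/lua/...> headers instead):

#include <stdio.h>
#include <lua.h>
#include <lauxlib.h>
#include <lualib.h>

int main(void) {
    lua_State *L = luaL_newstate();
    luaL_openlibs(L);
    /* exercise format, rep-with-separator and gsub, all provided above */
    if (luaL_dostring(L,
        "return string.format('%3d %s', 7, string.rep('ab', 3, '-')),"
        "       (string.gsub('zfs zfs', 'zfs', 'openzfs'))") != 0) {
        fprintf(stderr, "%s\n", lua_tostring(L, -1));
        return 1;
    }
    printf("%s | %s\n", lua_tostring(L, -2), lua_tostring(L, -1));
    lua_close(L);
    return 0;  /* prints: "  7 ab-ab-ab | openzfs openzfs" */
}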
diff --git a/sys/contrib/openzfs/module/lua/ltable.c b/sys/contrib/openzfs/module/lua/ltable.c
new file mode 100644
index 000000000000..f60418721bef
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ltable.c
@@ -0,0 +1,592 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltable.c,v 2.72.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua tables (hash)
+** See Copyright Notice in lua.h
+*/
+
+
+/*
+** Implementation of tables (aka arrays, objects, or hash tables).
+** Tables keep its elements in two parts: an array part and a hash part.
+** Non-negative integer keys are all candidates to be kept in the array
+** part. The actual size of the array is the largest `n' such that at
+** least half the slots between 0 and n are in use.
+** The hash part uses a chained scatter table with Brent's variation.
+** A main invariant of these tables is that, if an element is not
+** in its main position (i.e. the `original' position that its hash gives
+** to it), then the colliding element is in its own main position.
+** Hence even when the load factor reaches 100%, performance remains good.
+*/
+
+
+#define ltable_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lvm.h"
+
+
+/*
+** max size of array part is 2^MAXBITS
+*/
+#if LUAI_BITSINT >= 32
+#define MAXBITS 30
+#else
+#define MAXBITS (LUAI_BITSINT-2)
+#endif
+
+#define MAXASIZE (1 << MAXBITS)
+
+
+#define hashpow2(t,n) (gnode(t, lmod((n), sizenode(t))))
+
+#define hashstr(t,str) hashpow2(t, (str)->tsv.hash)
+#define hashboolean(t,p) hashpow2(t, p)
+
+
+/*
+** for some types, it is better to avoid modulus by power of 2, as
+** they tend to have many 2 factors.
+*/
+#define hashmod(t,n) (gnode(t, ((n) % ((sizenode(t)-1)|1))))
+
+
+#define hashpointer(t,p) hashmod(t, IntPoint(p))
+
+
+#define dummynode (&dummynode_)
+
+#define isdummy(n) ((n) == dummynode)
+
+static const Node dummynode_ = {
+ {NILCONSTANT}, /* value */
+ {{NILCONSTANT, NULL}} /* key */
+};
+
+
+/*
+** hash for lua_Numbers
+*/
+static Node *hashnum (const Table *t, lua_Number n) {
+ int i;
+ luai_hashnum(i, n);
+ if (i < 0) {
+ if (cast(unsigned int, i) == 0u - i) /* use unsigned to avoid overflows */
+ i = 0; /* handle INT_MIN */
+ i = -i; /* must be a positive value */
+ }
+ return hashmod(t, i);
+}
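The hashmod macro exists because pointers and converted numbers tend to be multiples of large powers of two, which would collapse a power-of-two mask into very few buckets. A quick userland illustration, hashing 64-byte-aligned addresses into 64 node slots both ways (illustrative only, not the module's code):

#include <stdio.h>

#define NNODES 64
#define NKEYS  256

int main(void) {
    int used_mask = 0, used_mod = 0;
    int seen_mask[NNODES] = {0}, seen_mod[NNODES] = {0};
    unsigned long i;
    for (i = 0; i < NKEYS; i++) {
        /* addresses of 64-byte-aligned allocations */
        unsigned long p = 0x10000UL + i * 64;
        unsigned long mask = p & (NNODES - 1);       /* hashpow2 style */
        unsigned long mod = p % ((NNODES - 1) | 1);  /* hashmod style */
        if (!seen_mask[mask]) { seen_mask[mask] = 1; used_mask++; }
        if (!seen_mod[mod]) { seen_mod[mod] = 1; used_mod++; }
    }
    printf("buckets touched: mask=%d, odd modulus=%d (of %d)\n",
        used_mask, used_mod, NNODES);
    return 0;  /* mask collapses to 1 bucket, odd modulus uses all 63 */
}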
+
+
+
+/*
+** returns the `main' position of an element in a table (that is, the index
+** of its hash value)
+*/
+static Node *mainposition (const Table *t, const TValue *key) {
+ switch (ttype(key)) {
+ case LUA_TNUMBER:
+ return hashnum(t, nvalue(key));
+ case LUA_TLNGSTR: {
+ TString *s = rawtsvalue(key);
+ if (s->tsv.extra == 0) { /* no hash? */
+ s->tsv.hash = luaS_hash(getstr(s), s->tsv.len, s->tsv.hash);
+ s->tsv.extra = 1; /* now it has its hash */
+ }
+ return hashstr(t, rawtsvalue(key));
+ }
+ case LUA_TSHRSTR:
+ return hashstr(t, rawtsvalue(key));
+ case LUA_TBOOLEAN:
+ return hashboolean(t, bvalue(key));
+ case LUA_TLIGHTUSERDATA:
+ return hashpointer(t, pvalue(key));
+ case LUA_TLCF:
+ return hashpointer(t, fvalue(key));
+ default:
+ return hashpointer(t, gcvalue(key));
+ }
+}
+
+
+/*
+** returns the index for `key' if `key' is an appropriate key to live in
+** the array part of the table, -1 otherwise.
+*/
+static int arrayindex (const TValue *key) {
+ if (ttisnumber(key)) {
+ lua_Number n = nvalue(key);
+ int k;
+ lua_number2int(k, n);
+ if (luai_numeq(cast_num(k), n))
+ return k;
+ }
+ return -1; /* `key' did not match some condition */
+}
+
+
+/*
+** returns the index of a `key' for table traversals. First goes all
+** elements in the array part, then elements in the hash part. The
+** beginning of a traversal is signaled by -1.
+*/
+static int findindex (lua_State *L, Table *t, StkId key) {
+ int i;
+ if (ttisnil(key)) return -1; /* first iteration */
+ i = arrayindex(key);
+ if (0 < i && i <= t->sizearray) /* is `key' inside array part? */
+ return i-1; /* yes; that's the index (corrected to C) */
+ else {
+ Node *n = mainposition(t, key);
+ for (;;) { /* check whether `key' is somewhere in the chain */
+ /* key may be dead already, but it is ok to use it in `next' */
+ if (luaV_rawequalobj(gkey(n), key) ||
+ (ttisdeadkey(gkey(n)) && iscollectable(key) &&
+ deadvalue(gkey(n)) == gcvalue(key))) {
+ i = cast_int(n - gnode(t, 0)); /* key index in hash table */
+ /* hash elements are numbered after array ones */
+ return i + t->sizearray;
+ }
+ else n = gnext(n);
+ if (n == NULL)
+ luaG_runerror(L, "invalid key to " LUA_QL("next")); /* key not found */
+ }
+ }
+}
+
+
+int luaH_next (lua_State *L, Table *t, StkId key) {
+ int i = findindex(L, t, key); /* find original element */
+ for (i++; i < t->sizearray; i++) { /* try first array part */
+ if (!ttisnil(&t->array[i])) { /* a non-nil value? */
+ setnvalue(key, cast_num(i+1));
+ setobj2s(L, key+1, &t->array[i]);
+ return 1;
+ }
+ }
+ for (i -= t->sizearray; i < sizenode(t); i++) { /* then hash part */
+ if (!ttisnil(gval(gnode(t, i)))) { /* a non-nil value? */
+ setobj2s(L, key, gkey(gnode(t, i)));
+ setobj2s(L, key+1, gval(gnode(t, i)));
+ return 1;
+ }
+ }
+ return 0; /* no more elements */
+}
+
+
+/*
+** {=============================================================
+** Rehash
+** ==============================================================
+*/
+
+
+static int computesizes (int nums[], int *narray) {
+ int i;
+ int twotoi; /* 2^i */
+ int a = 0; /* number of elements smaller than 2^i */
+ int na = 0; /* number of elements to go to array part */
+ int n = 0; /* optimal size for array part */
+ for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) {
+ if (nums[i] > 0) {
+ a += nums[i];
+ if (a > twotoi/2) { /* more than half elements present? */
+ n = twotoi; /* optimal size (till now) */
+ na = a; /* all elements smaller than n will go to array part */
+ }
+ }
+ if (a == *narray) break; /* all elements already counted */
+ }
+ *narray = n;
+ lua_assert(*narray/2 <= na && na <= *narray);
+ return na;
+}
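computesizes implements the sizing rule described at the top of the file: pick the largest power of two n such that more than half of the first n slots would actually be used. For integer keys {1, 2, 3, 100} the histogram is nums[0] = nums[1] = nums[2] = 1 and nums[7] = 1, and the function settles on an array part of size 4 holding three keys, leaving 100 for the hash part. A standalone replica of that computation; the helper names are made up and luaO_ceillog2 is approximated with a loop.

#include <stdio.h>

#define MAXBITS 30

/* smallest i with 2^i >= k (k >= 1), i.e. what luaO_ceillog2 computes */
static int ceil_log2(unsigned int k) {
    int i = 0;
    unsigned int x = 1;
    while (x < k) { x *= 2; i++; }
    return i;
}

/* same logic as computesizes above */
static int compute_sizes(int nums[], int *narray) {
    int i, twotoi, a = 0, na = 0, n = 0;
    for (i = 0, twotoi = 1; twotoi / 2 < *narray; i++, twotoi *= 2) {
        if (nums[i] > 0) {
            a += nums[i];
            if (a > twotoi / 2) { n = twotoi; na = a; }
        }
        if (a == *narray) break;
    }
    *narray = n;
    return na;
}

int main(void) {
    int nums[MAXBITS + 1] = {0};
    int keys[] = { 1, 2, 3, 100 };
    int narray = 4;  /* all four keys are array-part candidates */
    int i, na;
    for (i = 0; i < 4; i++)
        nums[ceil_log2((unsigned int)keys[i])]++;  /* what countint does */
    na = compute_sizes(nums, &narray);
    printf("array size %d, %d keys in array, %d in hash\n",
        narray, na, 4 - na);
    return 0;  /* array size 4, 3 keys in array, 1 in hash */
}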
+
+
+static int countint (const TValue *key, int *nums) {
+ int k = arrayindex(key);
+ if (0 < k && k <= MAXASIZE) { /* is `key' an appropriate array index? */
+ nums[luaO_ceillog2(k)]++; /* count as such */
+ return 1;
+ }
+ else
+ return 0;
+}
+
+
+static int numusearray (const Table *t, int *nums) {
+ int lg;
+ int ttlg; /* 2^lg */
+ int ause = 0; /* summation of `nums' */
+ int i = 1; /* count to traverse all array keys */
+ for (lg=0, ttlg=1; lg<=MAXBITS; lg++, ttlg*=2) { /* for each slice */
+ int lc = 0; /* counter */
+ int lim = ttlg;
+ if (lim > t->sizearray) {
+ lim = t->sizearray; /* adjust upper limit */
+ if (i > lim)
+ break; /* no more elements to count */
+ }
+ /* count elements in range (2^(lg-1), 2^lg] */
+ for (; i <= lim; i++) {
+ if (!ttisnil(&t->array[i-1]))
+ lc++;
+ }
+ nums[lg] += lc;
+ ause += lc;
+ }
+ return ause;
+}
+
+
+static int numusehash (const Table *t, int *nums, int *pnasize) {
+ int totaluse = 0; /* total number of elements */
+ int ause = 0; /* summation of `nums' */
+ int i = sizenode(t);
+ while (i--) {
+ Node *n = &t->node[i];
+ if (!ttisnil(gval(n))) {
+ ause += countint(gkey(n), nums);
+ totaluse++;
+ }
+ }
+ *pnasize += ause;
+ return totaluse;
+}
+
+
+static void setarrayvector (lua_State *L, Table *t, int size) {
+ int i;
+ luaM_reallocvector(L, t->array, t->sizearray, size, TValue);
+ for (i=t->sizearray; i<size; i++)
+ setnilvalue(&t->array[i]);
+ t->sizearray = size;
+}
+
+
+static void setnodevector (lua_State *L, Table *t, int size) {
+ int lsize;
+ if (size == 0) { /* no elements to hash part? */
+ t->node = cast(Node *, dummynode); /* use common `dummynode' */
+ lsize = 0;
+ }
+ else {
+ int i;
+ lsize = luaO_ceillog2(size);
+ if (lsize > MAXBITS)
+ luaG_runerror(L, "table overflow");
+ size = twoto(lsize);
+ t->node = luaM_newvector(L, size, Node);
+ for (i=0; i<size; i++) {
+ Node *n = gnode(t, i);
+ gnext(n) = NULL;
+ setnilvalue(gkey(n));
+ setnilvalue(gval(n));
+ }
+ }
+ t->lsizenode = cast_byte(lsize);
+ t->lastfree = gnode(t, size); /* all positions are free */
+}
+
+
+void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize) {
+ int i;
+ int oldasize = t->sizearray;
+ int oldhsize = t->lsizenode;
+ Node *nold = t->node; /* save old hash ... */
+ if (nasize > oldasize) /* array part must grow? */
+ setarrayvector(L, t, nasize);
+ /* create new hash part with appropriate size */
+ setnodevector(L, t, nhsize);
+ if (nasize < oldasize) { /* array part must shrink? */
+ t->sizearray = nasize;
+ /* re-insert elements from vanishing slice */
+ for (i=nasize; i<oldasize; i++) {
+ if (!ttisnil(&t->array[i]))
+ luaH_setint(L, t, i + 1, &t->array[i]);
+ }
+ /* shrink array */
+ luaM_reallocvector(L, t->array, oldasize, nasize, TValue);
+ }
+ /* re-insert elements from hash part */
+ for (i = twoto(oldhsize) - 1; i >= 0; i--) {
+ Node *old = nold+i;
+ if (!ttisnil(gval(old))) {
+ /* doesn't need barrier/invalidate cache, as entry was
+ already present in the table */
+ setobjt2t(L, luaH_set(L, t, gkey(old)), gval(old));
+ }
+ }
+ if (!isdummy(nold))
+ luaM_freearray(L, nold, cast(size_t, twoto(oldhsize))); /* free old array */
+}
+
+
+void luaH_resizearray (lua_State *L, Table *t, int nasize) {
+ int nsize = isdummy(t->node) ? 0 : sizenode(t);
+ luaH_resize(L, t, nasize, nsize);
+}
+
+
+static void rehash (lua_State *L, Table *t, const TValue *ek) {
+ int nasize, na;
+ int nums[MAXBITS+1]; /* nums[i] = number of keys with 2^(i-1) < k <= 2^i */
+ int i;
+ int totaluse;
+ for (i=0; i<=MAXBITS; i++) nums[i] = 0; /* reset counts */
+ nasize = numusearray(t, nums); /* count keys in array part */
+ totaluse = nasize; /* all those keys are integer keys */
+ totaluse += numusehash(t, nums, &nasize); /* count keys in hash part */
+ /* count extra key */
+ nasize += countint(ek, nums);
+ totaluse++;
+ /* compute new size for array part */
+ na = computesizes(nums, &nasize);
+ /* resize the table to new computed sizes */
+ luaH_resize(L, t, nasize, totaluse - na);
+}
+
+
+
+/*
+** }=============================================================
+*/
+
+
+Table *luaH_new (lua_State *L) {
+ Table *t = &luaC_newobj(L, LUA_TTABLE, sizeof(Table), NULL, 0)->h;
+ t->metatable = NULL;
+ t->flags = cast_byte(~0);
+ t->array = NULL;
+ t->sizearray = 0;
+ setnodevector(L, t, 0);
+ return t;
+}
+
+
+void luaH_free (lua_State *L, Table *t) {
+ if (!isdummy(t->node))
+ luaM_freearray(L, t->node, cast(size_t, sizenode(t)));
+ luaM_freearray(L, t->array, t->sizearray);
+ luaM_free(L, t);
+}
+
+
+static Node *getfreepos (Table *t) {
+ while (t->lastfree > t->node) {
+ t->lastfree--;
+ if (ttisnil(gkey(t->lastfree)))
+ return t->lastfree;
+ }
+ return NULL; /* could not find a free place */
+}
+
+
+
+/*
+** inserts a new key into a hash table; first, check whether key's main
+** position is free. If not, check whether colliding node is in its main
+** position or not: if it is not, move colliding node to an empty place and
+** put new key in its main position; otherwise (colliding node is in its main
+** position), new key goes to an empty position.
+*/
+TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key) {
+ Node *mp;
+ if (ttisnil(key)) luaG_runerror(L, "table index is nil");
+#if defined LUA_HAS_FLOAT_NUMBERS
+ else if (ttisnumber(key) && luai_numisnan(L, nvalue(key)))
+ luaG_runerror(L, "table index is NaN");
+#endif
+ mp = mainposition(t, key);
+ if (!ttisnil(gval(mp)) || isdummy(mp)) { /* main position is taken? */
+ Node *othern;
+ Node *n = getfreepos(t); /* get a free place */
+ if (n == NULL) { /* cannot find a free place? */
+ rehash(L, t, key); /* grow table */
+ /* whatever called 'newkey' takes care of the TM cache and GC barrier */
+ return luaH_set(L, t, key); /* insert key into grown table */
+ }
+ lua_assert(!isdummy(n));
+ othern = mainposition(t, gkey(mp));
+ if (othern != mp) { /* is colliding node out of its main position? */
+ /* yes; move colliding node into free position */
+ while (gnext(othern) != mp) othern = gnext(othern); /* find previous */
+ gnext(othern) = n; /* redo the chain with `n' in place of `mp' */
+ *n = *mp; /* copy colliding node into free pos. (mp->next also goes) */
+ gnext(mp) = NULL; /* now `mp' is free */
+ setnilvalue(gval(mp));
+ }
+ else { /* colliding node is in its own main position */
+ /* new node will go into free position */
+ gnext(n) = gnext(mp); /* chain new position */
+ gnext(mp) = n;
+ mp = n;
+ }
+ }
+ setobj2t(L, gkey(mp), key);
+ luaC_barrierback(L, obj2gco(t), key);
+ lua_assert(ttisnil(gval(mp)));
+ return gval(mp);
+}
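The collision policy above preserves the invariant stated at the top of the file: a node that is not in its main position always sits in the chain of the element that displaced it. A much-simplified userland model with integer keys shows the two branches, relocating a squatter when it does not live in its own main position; every name below is hypothetical and deletion, GC and rehashing are left out.

#include <stdio.h>

#define NSLOT 8  /* hash part size */

typedef struct MiniNode {
    int key;   /* 0 means empty */
    int val;
    int next;  /* chain link, -1 means none */
} MiniNode;

static MiniNode node[NSLOT];
static int lastfree = NSLOT;

static int mainpos(int key) { return key % NSLOT; }

static int getfreepos(void) {
    while (lastfree > 0) {
        lastfree--;
        if (node[lastfree].key == 0)
            return lastfree;
    }
    return -1;  /* full; a real table would rehash and retry */
}

static void insert(int key, int val) {
    int mp = mainpos(key);
    if (node[mp].key != 0) {                /* main position taken? */
        int n = getfreepos();
        int othern = mainpos(node[mp].key);
        if (n < 0)
            return;                         /* table full in this toy model */
        if (othern != mp) {                 /* squatter is out of place */
            while (node[othern].next != mp)
                othern = node[othern].next; /* find its predecessor */
            node[othern].next = n;          /* relink the chain through n */
            node[n] = node[mp];             /* move squatter (keeps .next) */
            node[mp].next = -1;
            node[mp].key = 0;               /* mp is now free for the new key */
        } else {                            /* squatter owns this position */
            node[n].next = node[mp].next;   /* new key goes to the free slot */
            node[mp].next = n;
            mp = n;
        }
    }
    node[mp].key = key;
    node[mp].val = val;
}

static int lookup(int key) {
    int i = mainpos(key);
    while (i != -1 && node[i].key != 0) {
        if (node[i].key == key)
            return node[i].val;
        i = node[i].next;
    }
    return -1;
}

int main(void) {
    int i;
    for (i = 0; i < NSLOT; i++) { node[i].key = 0; node[i].next = -1; }
    insert(3, 30);    /* lands in its main position 3 */
    insert(11, 110);  /* collides with 3, chained into a free slot */
    insert(7, 70);    /* slot 7 holds the squatter 11, which gets relocated */
    printf("3 -> %d, 11 -> %d, 7 -> %d\n", lookup(3), lookup(11), lookup(7));
    return 0;  /* 3 -> 30, 11 -> 110, 7 -> 70 */
}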
+
+
+/*
+** search function for integers
+*/
+const TValue *luaH_getint (Table *t, int key) {
+ /* (1 <= key && key <= t->sizearray) */
+ if (cast(unsigned int, key-1) < cast(unsigned int, t->sizearray))
+ return &t->array[key-1];
+ else {
+ lua_Number nk = cast_num(key);
+ Node *n = hashnum(t, nk);
+ do { /* check whether `key' is somewhere in the chain */
+ if (ttisnumber(gkey(n)) && luai_numeq(nvalue(gkey(n)), nk))
+ return gval(n); /* that's it */
+ else n = gnext(n);
+ } while (n);
+ return luaO_nilobject;
+ }
+}
+
+
+/*
+** search function for short strings
+*/
+const TValue *luaH_getstr (Table *t, TString *key) {
+ Node *n = hashstr(t, key);
+ lua_assert(key->tsv.tt == LUA_TSHRSTR);
+ do { /* check whether `key' is somewhere in the chain */
+ if (ttisshrstring(gkey(n)) && eqshrstr(rawtsvalue(gkey(n)), key))
+ return gval(n); /* that's it */
+ else n = gnext(n);
+ } while (n);
+ return luaO_nilobject;
+}
+
+
+/*
+** main search function
+*/
+const TValue *luaH_get (Table *t, const TValue *key) {
+ switch (ttype(key)) {
+ case LUA_TSHRSTR: return luaH_getstr(t, rawtsvalue(key));
+ case LUA_TNIL: return luaO_nilobject;
+ case LUA_TNUMBER: {
+ int k;
+ lua_Number n = nvalue(key);
+ lua_number2int(k, n);
+ if (luai_numeq(cast_num(k), n)) /* index is int? */
+ return luaH_getint(t, k); /* use specialized version */
+ /* else go through */
+ }
+ /* FALLTHROUGH */
+ default: {
+ Node *n = mainposition(t, key);
+ do { /* check whether `key' is somewhere in the chain */
+ if (luaV_rawequalobj(gkey(n), key))
+ return gval(n); /* that's it */
+ else n = gnext(n);
+ } while (n);
+ return luaO_nilobject;
+ }
+ }
+}
+
+
+/*
+** beware: when using this function you probably need to check a GC
+** barrier and invalidate the TM cache.
+*/
+TValue *luaH_set (lua_State *L, Table *t, const TValue *key) {
+ const TValue *p = luaH_get(t, key);
+ if (p != luaO_nilobject)
+ return cast(TValue *, p);
+ else return luaH_newkey(L, t, key);
+}
+
+
+void luaH_setint (lua_State *L, Table *t, int key, TValue *value) {
+ const TValue *p = luaH_getint(t, key);
+ TValue *cell;
+ if (p != luaO_nilobject)
+ cell = cast(TValue *, p);
+ else {
+ TValue k;
+ setnvalue(&k, cast_num(key));
+ cell = luaH_newkey(L, t, &k);
+ }
+ setobj2t(L, cell, value);
+}
+
+
+static int unbound_search (Table *t, unsigned int j) {
+ unsigned int i = j; /* i is zero or a present index */
+ j++;
+ /* find `i' and `j' such that i is present and j is not */
+ while (!ttisnil(luaH_getint(t, j))) {
+ i = j;
+ j *= 2;
+ if (j > cast(unsigned int, MAX_INT)) { /* overflow? */
+ /* table was built with bad purposes: resort to linear search */
+ i = 1;
+ while (!ttisnil(luaH_getint(t, i))) i++;
+ return i - 1;
+ }
+ }
+ /* now do a binary search between them */
+ while (j - i > 1) {
+ unsigned int m = (i+j)/2;
+ if (ttisnil(luaH_getint(t, m))) j = m;
+ else i = m;
+ }
+ return i;
+}
+
+
+/*
+** Try to find a boundary in table `t'. A `boundary' is an integer index
+** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil).
+*/
+int luaH_getn (Table *t) {
+ unsigned int j = t->sizearray;
+ if (j > 0 && ttisnil(&t->array[j - 1])) {
+ /* there is a boundary in the array part: (binary) search for it */
+ unsigned int i = 0;
+ while (j - i > 1) {
+ unsigned int m = (i+j)/2;
+ if (ttisnil(&t->array[m - 1])) j = m;
+ else i = m;
+ }
+ return i;
+ }
+ /* else must find a boundary in hash part */
+ else if (isdummy(t->node)) /* hash part is empty? */
+ return j; /* that is easy... */
+ else return unbound_search(t, j);
+}
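The length operator therefore only needs some index i with t[i] non-nil and t[i+1] nil, not the count of elements. A standalone version of the array-part binary search, modelling nil as 0; when the last slot is non-nil it simply returns the array size, whereas the real function then consults the hash part (illustrative only):

#include <stdio.h>

/* binary search for a boundary: values[i-1] non-nil (non-zero here)
   and values[i] nil, mirroring the array-part branch of luaH_getn */
static unsigned int boundary(const int *values, unsigned int size) {
    unsigned int i = 0, j = size;
    if (size == 0 || values[size - 1] != 0)
        return size;  /* no trailing nil inside the array part */
    while (j - i > 1) {
        unsigned int m = (i + j) / 2;
        if (values[m - 1] == 0) j = m;
        else i = m;
    }
    return i;
}

int main(void) {
    int values[] = { 10, 20, 30, 0, 0, 0, 0, 0 };  /* t[4..8] are nil */
    printf("#t = %u\n", boundary(values, 8));
    return 0;  /* #t = 3 */
}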
+
+
+
+#if defined(LUA_DEBUG)
+
+Node *luaH_mainposition (const Table *t, const TValue *key) {
+ return mainposition(t, key);
+}
+
+int luaH_isdummy (Node *n) { return isdummy(n); }
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/ltable.h b/sys/contrib/openzfs/module/lua/ltable.h
new file mode 100644
index 000000000000..ea877ebf4eb0
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ltable.h
@@ -0,0 +1,47 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltable.h,v 2.16.1.2 2013/08/30 15:49:41 roberto Exp $
+** Lua tables (hash)
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ltable_h
+#define ltable_h
+
+#include "lobject.h"
+
+
+#define gnode(t,i) ((Node *)&(t)->node[i])
+#define gkey(n) (&(n)->i_key.tvk)
+#define gval(n) (&(n)->i_val)
+#define gnext(n) ((n)->i_key.nk.next)
+
+#define invalidateTMcache(t) ((t)->flags = 0)
+
+/* returns the key, given the value of a table entry */
+#define keyfromval(v) \
+ (gkey(cast(Node *, cast(char *, (v)) - offsetof(Node, i_val))))
+
+
+LUAI_FUNC const TValue *luaH_getint (Table *t, int key);
+LUAI_FUNC void luaH_setint (lua_State *L, Table *t, int key, TValue *value);
+LUAI_FUNC const TValue *luaH_getstr (Table *t, TString *key);
+LUAI_FUNC const TValue *luaH_get (Table *t, const TValue *key);
+LUAI_FUNC TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key);
+LUAI_FUNC TValue *luaH_set (lua_State *L, Table *t, const TValue *key);
+LUAI_FUNC Table *luaH_new (lua_State *L);
+LUAI_FUNC void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize);
+LUAI_FUNC void luaH_resizearray (lua_State *L, Table *t, int nasize);
+LUAI_FUNC void luaH_free (lua_State *L, Table *t);
+LUAI_FUNC int luaH_next (lua_State *L, Table *t, StkId key);
+LUAI_FUNC int luaH_getn (Table *t);
+
+
+#if defined(LUA_DEBUG)
+LUAI_FUNC Node *luaH_mainposition (const Table *t, const TValue *key);
+LUAI_FUNC int luaH_isdummy (Node *n);
+#endif
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/ltablib.c b/sys/contrib/openzfs/module/lua/ltablib.c
new file mode 100644
index 000000000000..51cafffaafcd
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ltablib.c
@@ -0,0 +1,289 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltablib.c,v 1.65.1.2 2014/05/07 16:32:55 roberto Exp $
+** Library for Table Manipulation
+** See Copyright Notice in lua.h
+*/
+
+
+#define ltablib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+#include <sys/lua/lualib.h>
+
+
+#define aux_getn(L,n) (luaL_checktype(L, n, LUA_TTABLE), luaL_len(L, n))
+
+
+
+#if defined(LUA_COMPAT_MAXN)
+static int maxn (lua_State *L) {
+ lua_Number max = 0;
+ luaL_checktype(L, 1, LUA_TTABLE);
+ lua_pushnil(L); /* first key */
+ while (lua_next(L, 1)) {
+ lua_pop(L, 1); /* remove value */
+ if (lua_type(L, -1) == LUA_TNUMBER) {
+ lua_Number v = lua_tonumber(L, -1);
+ if (v > max) max = v;
+ }
+ }
+ lua_pushnumber(L, max);
+ return 1;
+}
+#endif
+
+
+static int tinsert (lua_State *L) {
+ int e = aux_getn(L, 1) + 1; /* first empty element */
+ int pos; /* where to insert new element */
+ switch (lua_gettop(L)) {
+ case 2: { /* called with only 2 arguments */
+ pos = e; /* insert new element at the end */
+ break;
+ }
+ case 3: {
+ int i;
+ pos = luaL_checkint(L, 2); /* 2nd argument is the position */
+ luaL_argcheck(L, 1 <= pos && pos <= e, 2, "position out of bounds");
+ for (i = e; i > pos; i--) { /* move up elements */
+ lua_rawgeti(L, 1, i-1);
+ lua_rawseti(L, 1, i); /* t[i] = t[i-1] */
+ }
+ break;
+ }
+ default: {
+ return luaL_error(L, "wrong number of arguments to " LUA_QL("insert"));
+ }
+ }
+ lua_rawseti(L, 1, pos); /* t[pos] = v */
+ return 0;
+}
+
+
+static int tremove (lua_State *L) {
+ int size = aux_getn(L, 1);
+ int pos = luaL_optint(L, 2, size);
+ if (pos != size) /* validate 'pos' if given */
+ luaL_argcheck(L, 1 <= pos && pos <= size + 1, 1, "position out of bounds");
+ lua_rawgeti(L, 1, pos); /* result = t[pos] */
+ for ( ; pos < size; pos++) {
+ lua_rawgeti(L, 1, pos+1);
+ lua_rawseti(L, 1, pos); /* t[pos] = t[pos+1] */
+ }
+ lua_pushnil(L);
+ lua_rawseti(L, 1, pos); /* t[pos] = nil */
+ return 1;
+}
+
+
+static void addfield (lua_State *L, luaL_Buffer *b, int i) {
+ lua_rawgeti(L, 1, i);
+ if (!lua_isstring(L, -1))
+ luaL_error(L, "invalid value (%s) at index %d in table for "
+ LUA_QL("concat"), luaL_typename(L, -1), i);
+ luaL_addvalue(b);
+}
+
+
+static int tconcat (lua_State *L) {
+ luaL_Buffer b;
+ size_t lsep;
+ int i, last;
+ const char *sep = luaL_optlstring(L, 2, "", &lsep);
+ luaL_checktype(L, 1, LUA_TTABLE);
+ i = luaL_optint(L, 3, 1);
+ last = luaL_opt(L, luaL_checkint, 4, luaL_len(L, 1));
+ luaL_buffinit(L, &b);
+ for (; i < last; i++) {
+ addfield(L, &b, i);
+ luaL_addlstring(&b, sep, lsep);
+ }
+ if (i == last) /* add last value (if interval was not empty) */
+ addfield(L, &b, i);
+ luaL_pushresult(&b);
+ return 1;
+}
+
+
+/*
+** {======================================================
+** Pack/unpack
+** =======================================================
+*/
+
+static int pack (lua_State *L) {
+ int n = lua_gettop(L); /* number of elements to pack */
+ lua_createtable(L, n, 1); /* create result table */
+ lua_pushinteger(L, n);
+ lua_setfield(L, -2, "n"); /* t.n = number of elements */
+ if (n > 0) { /* at least one element? */
+ int i;
+ lua_pushvalue(L, 1);
+ lua_rawseti(L, -2, 1); /* insert first element */
+ lua_replace(L, 1); /* move table into index 1 */
+ for (i = n; i >= 2; i--) /* assign other elements */
+ lua_rawseti(L, 1, i);
+ }
+ return 1; /* return table */
+}
+
+
+static int unpack (lua_State *L) {
+ int i, e;
+ unsigned int n;
+ luaL_checktype(L, 1, LUA_TTABLE);
+ i = luaL_optint(L, 2, 1);
+ e = luaL_opt(L, luaL_checkint, 3, luaL_len(L, 1));
+ if (i > e) return 0; /* empty range */
+ n = (unsigned int)e - (unsigned int)i; /* number of elements minus 1 */
+ if (n > (INT_MAX - 10) || !lua_checkstack(L, ++n))
+ return luaL_error(L, "too many results to unpack");
+ lua_rawgeti(L, 1, i); /* push arg[i] (avoiding overflow problems) */
+ while (i++ < e) /* push arg[i + 1...e] */
+ lua_rawgeti(L, 1, i);
+ return n;
+}
+
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** Quicksort
+** (based on `Algorithms in MODULA-3', Robert Sedgewick;
+** Addison-Wesley, 1993.)
+** =======================================================
+*/
+
+
+static void set2 (lua_State *L, int i, int j) {
+ lua_rawseti(L, 1, i);
+ lua_rawseti(L, 1, j);
+}
+
+static int sort_comp (lua_State *L, int a, int b) {
+ if (!lua_isnil(L, 2)) { /* function? */
+ int res;
+ lua_pushvalue(L, 2);
+ lua_pushvalue(L, a-1); /* -1 to compensate function */
+ lua_pushvalue(L, b-2); /* -2 to compensate function and `a' */
+ lua_call(L, 2, 1);
+ res = lua_toboolean(L, -1);
+ lua_pop(L, 1);
+ return res;
+ }
+ else /* a < b? */
+ return lua_compare(L, a, b, LUA_OPLT);
+}
+
+static void auxsort (lua_State *L, int l, int u) {
+ while (l < u) { /* for tail recursion */
+ int i, j;
+ /* sort elements a[l], a[(l+u)/2] and a[u] */
+ lua_rawgeti(L, 1, l);
+ lua_rawgeti(L, 1, u);
+ if (sort_comp(L, -1, -2)) /* a[u] < a[l]? */
+ set2(L, l, u); /* swap a[l] - a[u] */
+ else
+ lua_pop(L, 2);
+ if (u-l == 1) break; /* only 2 elements */
+ i = (l+u)/2;
+ lua_rawgeti(L, 1, i);
+ lua_rawgeti(L, 1, l);
+ if (sort_comp(L, -2, -1)) /* a[i]<a[l]? */
+ set2(L, i, l);
+ else {
+ lua_pop(L, 1); /* remove a[l] */
+ lua_rawgeti(L, 1, u);
+ if (sort_comp(L, -1, -2)) /* a[u]<a[i]? */
+ set2(L, i, u);
+ else
+ lua_pop(L, 2);
+ }
+ if (u-l == 2) break; /* only 3 elements */
+ lua_rawgeti(L, 1, i); /* Pivot */
+ lua_pushvalue(L, -1);
+ lua_rawgeti(L, 1, u-1);
+ set2(L, i, u-1);
+ /* a[l] <= P == a[u-1] <= a[u], only need to sort from l+1 to u-2 */
+ i = l; j = u-1;
+ for (;;) { /* invariant: a[l..i] <= P <= a[j..u] */
+ /* repeat ++i until a[i] >= P */
+ while (lua_rawgeti(L, 1, ++i), sort_comp(L, -1, -2)) {
+ if (i>=u) luaL_error(L, "invalid order function for sorting");
+ lua_pop(L, 1); /* remove a[i] */
+ }
+ /* repeat --j until a[j] <= P */
+ while (lua_rawgeti(L, 1, --j), sort_comp(L, -3, -1)) {
+ if (j<=l) luaL_error(L, "invalid order function for sorting");
+ lua_pop(L, 1); /* remove a[j] */
+ }
+ if (j<i) {
+ lua_pop(L, 3); /* pop pivot, a[i], a[j] */
+ break;
+ }
+ set2(L, i, j);
+ }
+ lua_rawgeti(L, 1, u-1);
+ lua_rawgeti(L, 1, i);
+ set2(L, u-1, i); /* swap pivot (a[u-1]) with a[i] */
+ /* a[l..i-1] <= a[i] == P <= a[i+1..u] */
+ /* adjust so that smaller half is in [j..i] and larger one in [l..u] */
+ if (i-l < u-i) {
+ j=l; i=i-1; l=i+2;
+ }
+ else {
+ j=i+1; i=u; u=j-2;
+ }
+ auxsort(L, j, i); /* call recursively the smaller one */
+ } /* repeat the routine for the larger one */
+}
+
+static int tsort (lua_State *L) {
+ int n = aux_getn(L, 1);
+ luaL_checkstack(L, 40, ""); /* assume array is smaller than 2^40 */
+ if (!lua_isnoneornil(L, 2)) /* is there a 2nd argument? */
+ luaL_checktype(L, 2, LUA_TFUNCTION);
+ lua_settop(L, 2); /* make sure there are two arguments */
+ auxsort(L, 1, n);
+ return 0;
+}
+
+/* }====================================================== */
+
+
+static const luaL_Reg tab_funcs[] = {
+ {"concat", tconcat},
+#if defined(LUA_COMPAT_MAXN)
+ {"maxn", maxn},
+#endif
+ {"insert", tinsert},
+ {"pack", pack},
+ {"unpack", unpack},
+ {"remove", tremove},
+ {"sort", tsort},
+ {NULL, NULL}
+};
+
+
+LUAMOD_API int luaopen_table (lua_State *L) {
+ luaL_newlib(L, tab_funcs);
+#if defined(LUA_COMPAT_UNPACK)
+ /* _G.unpack = table.unpack */
+ lua_getfield(L, -1, "unpack");
+ lua_setglobal(L, "unpack");
+#endif
+ return 1;
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaopen_table);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/ltm.c b/sys/contrib/openzfs/module/lua/ltm.c
new file mode 100644
index 000000000000..94f29f7d96d5
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ltm.c
@@ -0,0 +1,76 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltm.c,v 2.14.1.1 2013/04/12 18:48:47 roberto Exp $
+** Tag methods
+** See Copyright Notice in lua.h
+*/
+
+
+#define ltm_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+static const char udatatypename[] = "userdata";
+
+LUAI_DDEF const char *const luaT_typenames_[LUA_TOTALTAGS] = {
+ "no value",
+ "nil", "boolean", udatatypename, "number",
+ "string", "table", "function", udatatypename, "thread",
+ "proto", "upval" /* these last two cases are used for tests only */
+};
+
+
+void luaT_init (lua_State *L) {
+ static const char *const luaT_eventname[] = { /* ORDER TM */
+ "__index", "__newindex",
+ "__gc", "__mode", "__len", "__eq",
+ "__add", "__sub", "__mul", "__div", "__mod",
+ "__pow", "__unm", "__lt", "__le",
+ "__concat", "__call"
+ };
+ int i;
+ for (i=0; i<TM_N; i++) {
+ G(L)->tmname[i] = luaS_new(L, luaT_eventname[i]);
+ luaS_fix(G(L)->tmname[i]); /* never collect these names */
+ }
+}
+
+
+/*
+** function to be used with macro "fasttm": optimized for absence of
+** tag methods
+*/
+const TValue *luaT_gettm (Table *events, TMS event, TString *ename) {
+ const TValue *tm = luaH_getstr(events, ename);
+ lua_assert(event <= TM_EQ);
+ if (ttisnil(tm)) { /* no tag method? */
+ events->flags |= cast_byte(1u<<event); /* cache this fact */
+ return NULL;
+ }
+ else return tm;
+}
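The flags byte consulted by gfasttm is a per-table negative cache, "this metamethod is known to be absent", set here and cleared by invalidateTMcache (ltable.h) whenever the table may have changed. A compact sketch of that caching pattern with hypothetical types, not the module's:

#include <stdio.h>

enum { EV_INDEX, EV_NEWINDEX, EV_GC, EV_LEN, EV_N };

typedef struct MiniTable {
    unsigned char flags;         /* bit set => event known to be absent */
    const char *handlers[EV_N];  /* stand-in for metamethod lookups */
    int lookups;                 /* counts slow-path lookups */
} MiniTable;

static const char *get_event(MiniTable *t, int ev) {
    if (t->flags & (1u << ev))   /* fasttm: cached "no handler" */
        return NULL;
    t->lookups++;                /* slow path: real table lookup */
    if (t->handlers[ev] == NULL) {
        t->flags |= (unsigned char)(1u << ev);  /* cache the absence */
        return NULL;
    }
    return t->handlers[ev];
}

int main(void) {
    MiniTable t = { 0, { NULL, NULL, "collect", NULL }, 0 };
    const char *gc;
    int i;
    for (i = 0; i < 3; i++)      /* repeated misses hit the cache after once */
        get_event(&t, EV_LEN);
    gc = get_event(&t, EV_GC);
    printf("slow lookups: %d, __gc handler: %s\n", t.lookups, gc);
    return 0;  /* slow lookups: 2, __gc handler: collect */
}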
+
+
+const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o, TMS event) {
+ Table *mt;
+ switch (ttypenv(o)) {
+ case LUA_TTABLE:
+ mt = hvalue(o)->metatable;
+ break;
+ case LUA_TUSERDATA:
+ mt = uvalue(o)->metatable;
+ break;
+ default:
+ mt = G(L)->mt[ttypenv(o)];
+ }
+ return (mt ? luaH_getstr(mt, G(L)->tmname[event]) : luaO_nilobject);
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/ltm.h b/sys/contrib/openzfs/module/lua/ltm.h
new file mode 100644
index 000000000000..c056f4637353
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ltm.h
@@ -0,0 +1,59 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltm.h,v 2.11.1.1 2013/04/12 18:48:47 roberto Exp $
+** Tag methods
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ltm_h
+#define ltm_h
+
+
+#include "lobject.h"
+
+
+/*
+* WARNING: if you change the order of this enumeration,
+* grep "ORDER TM"
+*/
+typedef enum {
+ TM_INDEX,
+ TM_NEWINDEX,
+ TM_GC,
+ TM_MODE,
+ TM_LEN,
+ TM_EQ, /* last tag method with `fast' access */
+ TM_ADD,
+ TM_SUB,
+ TM_MUL,
+ TM_DIV,
+ TM_MOD,
+ TM_POW,
+ TM_UNM,
+ TM_LT,
+ TM_LE,
+ TM_CONCAT,
+ TM_CALL,
+ TM_N /* number of elements in the enum */
+} TMS;
+
+
+
+#define gfasttm(g,et,e) ((et) == NULL ? NULL : \
+ ((et)->flags & (1u<<(e))) ? NULL : luaT_gettm(et, e, (g)->tmname[e]))
+
+#define fasttm(l,et,e) gfasttm(G(l), et, e)
+
+#define ttypename(x) luaT_typenames_[(x) + 1]
+#define objtypename(x) ttypename(ttypenv(x))
+
+LUAI_DDEC const char *const luaT_typenames_[LUA_TOTALTAGS];
+
+
+LUAI_FUNC const TValue *luaT_gettm (Table *events, TMS event, TString *ename);
+LUAI_FUNC const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o,
+ TMS event);
+LUAI_FUNC void luaT_init (lua_State *L);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lvm.c b/sys/contrib/openzfs/module/lua/lvm.c
new file mode 100644
index 000000000000..4685be52b449
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lvm.c
@@ -0,0 +1,932 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lvm.c,v 2.155.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+
+#define lvm_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+#ifdef _KERNEL
+#define strcoll(l,r) (strcmp((l),(r)))
+#endif
+
+/* limit for table tag-method chains (to avoid loops) */
+#define MAXTAGLOOP 100
+
+
+const TValue *luaV_tonumber (const TValue *obj, TValue *n) {
+ lua_Number num;
+ if (ttisnumber(obj)) return obj;
+ if (ttisstring(obj) && luaO_str2d(svalue(obj), tsvalue(obj)->len, &num)) {
+ setnvalue(n, num);
+ return n;
+ }
+ else
+ return NULL;
+}
+
+
+int luaV_tostring (lua_State *L, StkId obj) {
+ if (!ttisnumber(obj))
+ return 0;
+ else {
+ char s[LUAI_MAXNUMBER2STR];
+ lua_Number n = nvalue(obj);
+ int l = lua_number2str(s, n);
+ setsvalue2s(L, obj, luaS_newlstr(L, s, l));
+ return 1;
+ }
+}
+
+
+static void traceexec (lua_State *L) {
+ CallInfo *ci = L->ci;
+ lu_byte mask = L->hookmask;
+ int counthook = ((mask & LUA_MASKCOUNT) && L->hookcount == 0);
+ if (counthook)
+ resethookcount(L); /* reset count */
+ if (ci->callstatus & CIST_HOOKYIELD) { /* called hook last time? */
+ ci->callstatus &= ~CIST_HOOKYIELD; /* erase mark */
+ return; /* do not call hook again (VM yielded, so it did not move) */
+ }
+ if (counthook)
+ luaD_hook(L, LUA_HOOKCOUNT, -1); /* call count hook */
+ if (mask & LUA_MASKLINE) {
+ Proto *p = ci_func(ci)->p;
+ int npc = pcRel(ci->u.l.savedpc, p);
+ int newline = getfuncline(p, npc);
+    if (npc == 0 ||  /* call line hook when entering a new function, */
+        ci->u.l.savedpc <= L->oldpc ||  /* when jumping back (loop), or when */
+        newline != getfuncline(p, pcRel(L->oldpc, p)))  /* entering a new line */
+ luaD_hook(L, LUA_HOOKLINE, newline); /* call line hook */
+ }
+ L->oldpc = ci->u.l.savedpc;
+ if (L->status == LUA_YIELD) { /* did hook yield? */
+ if (counthook)
+ L->hookcount = 1; /* undo decrement to zero */
+ ci->u.l.savedpc--; /* undo increment (resume will increment it again) */
+ ci->callstatus |= CIST_HOOKYIELD; /* mark that it yielded */
+ ci->func = L->top - 1; /* protect stack below results */
+ luaD_throw(L, LUA_YIELD);
+ }
+}
+
+
+static void callTM (lua_State *L, const TValue *f, const TValue *p1,
+ const TValue *p2, TValue *p3, int hasres) {
+ if (L == NULL) return;
+
+ ptrdiff_t result = savestack(L, p3);
+ setobj2s(L, L->top++, f); /* push function */
+ setobj2s(L, L->top++, p1); /* 1st argument */
+ setobj2s(L, L->top++, p2); /* 2nd argument */
+ if (!hasres) /* no result? 'p3' is third argument */
+ setobj2s(L, L->top++, p3); /* 3rd argument */
+ /* metamethod may yield only when called from Lua code */
+ luaD_call(L, L->top - (4 - hasres), hasres, isLua(L->ci));
+ if (hasres) { /* if has result, move it to its place */
+ p3 = restorestack(L, result);
+ setobjs2s(L, p3, --L->top);
+ }
+}
+
+
+void luaV_gettable (lua_State *L, const TValue *t, TValue *key, StkId val) {
+ int loop;
+ for (loop = 0; loop < MAXTAGLOOP; loop++) {
+ const TValue *tm;
+ if (ttistable(t)) { /* `t' is a table? */
+ Table *h = hvalue(t);
+ const TValue *res = luaH_get(h, key); /* do a primitive get */
+ if (!ttisnil(res) || /* result is not nil? */
+ (tm = fasttm(L, h->metatable, TM_INDEX)) == NULL) { /* or no TM? */
+ setobj2s(L, val, res);
+ return;
+ }
+ /* else will try the tag method */
+ }
+ else if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_INDEX)))
+ luaG_typeerror(L, t, "index");
+ if (ttisfunction(tm)) {
+ callTM(L, tm, t, key, val, 1);
+ return;
+ }
+ t = tm; /* else repeat with 'tm' */
+ }
+ luaG_runerror(L, "loop in gettable");
+}
+
+
+void luaV_settable (lua_State *L, const TValue *t, TValue *key, StkId val) {
+ int loop;
+ for (loop = 0; loop < MAXTAGLOOP; loop++) {
+ const TValue *tm;
+ if (ttistable(t)) { /* `t' is a table? */
+ Table *h = hvalue(t);
+ TValue *oldval = cast(TValue *, luaH_get(h, key));
+ /* if previous value is not nil, there must be a previous entry
+ in the table; moreover, a metamethod has no relevance */
+ if (!ttisnil(oldval) ||
+ /* previous value is nil; must check the metamethod */
+ ((tm = fasttm(L, h->metatable, TM_NEWINDEX)) == NULL &&
+ /* no metamethod; is there a previous entry in the table? */
+ (oldval != luaO_nilobject ||
+ /* no previous entry; must create one. (The next test is
+ always true; we only need the assignment.) */
+ (oldval = luaH_newkey(L, h, key), 1)))) {
+ /* no metamethod and (now) there is an entry with given key */
+ setobj2t(L, oldval, val); /* assign new value to that entry */
+ invalidateTMcache(h);
+ luaC_barrierback(L, obj2gco(h), val);
+ return;
+ }
+ /* else will try the metamethod */
+ }
+ else /* not a table; check metamethod */
+ if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_NEWINDEX)))
+ luaG_typeerror(L, t, "index");
+ /* there is a metamethod */
+ if (ttisfunction(tm)) {
+ callTM(L, tm, t, key, val, 0);
+ return;
+ }
+ t = tm; /* else repeat with 'tm' */
+ }
+ luaG_runerror(L, "loop in settable");
+}
+
+
+static int call_binTM (lua_State *L, const TValue *p1, const TValue *p2,
+ StkId res, TMS event) {
+ const TValue *tm = luaT_gettmbyobj(L, p1, event); /* try first operand */
+ if (ttisnil(tm))
+ tm = luaT_gettmbyobj(L, p2, event); /* try second operand */
+ if (ttisnil(tm)) return 0;
+ callTM(L, tm, p1, p2, res, 1);
+ return 1;
+}
+
+
+static const TValue *get_equalTM (lua_State *L, Table *mt1, Table *mt2,
+ TMS event) {
+ const TValue *tm1 = fasttm(L, mt1, event);
+ const TValue *tm2;
+ if (tm1 == NULL) return NULL; /* no metamethod */
+ if (mt1 == mt2) return tm1; /* same metatables => same metamethods */
+ tm2 = fasttm(L, mt2, event);
+ if (tm2 == NULL) return NULL; /* no metamethod */
+ if (luaV_rawequalobj(tm1, tm2)) /* same metamethods? */
+ return tm1;
+ return NULL;
+}
+
+
+static int call_orderTM (lua_State *L, const TValue *p1, const TValue *p2,
+ TMS event) {
+ if (!call_binTM(L, p1, p2, L->top, event))
+ return -1; /* no metamethod */
+ else
+ return !l_isfalse(L->top);
+}
+
+
+static int l_strcmp (const TString *ls, const TString *rs) {
+ const char *l = getstr(ls);
+ size_t ll = ls->tsv.len;
+ const char *r = getstr(rs);
+ size_t lr = rs->tsv.len;
+ for (;;) {
+ int temp = strcoll(l, r);
+ if (temp != 0) return temp;
+ else { /* strings are equal up to a `\0' */
+ size_t len = strlen(l); /* index of first `\0' in both strings */
+ if (len == lr) /* r is finished? */
+ return (len == ll) ? 0 : 1;
+ else if (len == ll) /* l is finished? */
+ return -1; /* l is smaller than r (because r is not finished) */
+ /* both strings longer than `len'; go on comparing (after the `\0') */
+ len++;
+ l += len; ll -= len; r += len; lr -= len;
+ }
+ }
+}
+
+
+int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r) {
+ int res;
+ if (ttisnumber(l) && ttisnumber(r))
+ return luai_numlt(L, nvalue(l), nvalue(r));
+ else if (ttisstring(l) && ttisstring(r))
+ return l_strcmp(rawtsvalue(l), rawtsvalue(r)) < 0;
+ else if ((res = call_orderTM(L, l, r, TM_LT)) < 0)
+ luaG_ordererror(L, l, r);
+ return res;
+}
+
+
+int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r) {
+ int res;
+ if (ttisnumber(l) && ttisnumber(r))
+ return luai_numle(L, nvalue(l), nvalue(r));
+ else if (ttisstring(l) && ttisstring(r))
+ return l_strcmp(rawtsvalue(l), rawtsvalue(r)) <= 0;
+ else if ((res = call_orderTM(L, l, r, TM_LE)) >= 0) /* first try `le' */
+ return res;
+ else if ((res = call_orderTM(L, r, l, TM_LT)) < 0) /* else try `lt' */
+ luaG_ordererror(L, l, r);
+ return !res;
+}
+
+
+/*
+** equality of Lua values. L == NULL means raw equality (no metamethods)
+*/
+int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2) {
+ const TValue *tm;
+ lua_assert(ttisequal(t1, t2));
+ switch (ttype(t1)) {
+ case LUA_TNIL: return 1;
+ case LUA_TNUMBER: return luai_numeq(nvalue(t1), nvalue(t2));
+ case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2); /* true must be 1 !! */
+ case LUA_TLIGHTUSERDATA: return pvalue(t1) == pvalue(t2);
+ case LUA_TLCF: return fvalue(t1) == fvalue(t2);
+ case LUA_TSHRSTR: return eqshrstr(rawtsvalue(t1), rawtsvalue(t2));
+ case LUA_TLNGSTR: return luaS_eqlngstr(rawtsvalue(t1), rawtsvalue(t2));
+ case LUA_TUSERDATA: {
+ if (uvalue(t1) == uvalue(t2)) return 1;
+ else if (L == NULL) return 0;
+ tm = get_equalTM(L, uvalue(t1)->metatable, uvalue(t2)->metatable, TM_EQ);
+ break; /* will try TM */
+ }
+ case LUA_TTABLE: {
+ if (hvalue(t1) == hvalue(t2)) return 1;
+ else if (L == NULL) return 0;
+ tm = get_equalTM(L, hvalue(t1)->metatable, hvalue(t2)->metatable, TM_EQ);
+ break; /* will try TM */
+ }
+ default:
+ lua_assert(iscollectable(t1));
+ return gcvalue(t1) == gcvalue(t2);
+ }
+ if (tm == NULL || L == NULL) return 0; /* no TM? */
+ callTM(L, tm, t1, t2, L->top, 1); /* call TM */
+ return !l_isfalse(L->top);
+}
+
+
+void luaV_concat (lua_State *L, int total) {
+ lua_assert(total >= 2);
+ do {
+ StkId top = L->top;
+ int n = 2; /* number of elements handled in this pass (at least 2) */
+ if (!(ttisstring(top-2) || ttisnumber(top-2)) || !tostring(L, top-1)) {
+ if (!call_binTM(L, top-2, top-1, top-2, TM_CONCAT))
+ luaG_concaterror(L, top-2, top-1);
+ }
+ else if (tsvalue(top-1)->len == 0) /* second operand is empty? */
+ (void)tostring(L, top - 2); /* result is first operand */
+ else if (ttisstring(top-2) && tsvalue(top-2)->len == 0) {
+ setobjs2s(L, top - 2, top - 1); /* result is second op. */
+ }
+ else {
+ /* at least two non-empty string values; get as many as possible */
+ size_t tl = tsvalue(top-1)->len;
+ char *buffer;
+ int i;
+ /* collect total length */
+ for (i = 1; i < total && tostring(L, top-i-1); i++) {
+ size_t l = tsvalue(top-i-1)->len;
+ if (l >= (MAX_SIZET/sizeof(char)) - tl)
+ luaG_runerror(L, "string length overflow");
+ tl += l;
+ }
+ buffer = luaZ_openspace(L, &G(L)->buff, tl);
+ tl = 0;
+ n = i;
+ do { /* concat all strings */
+ size_t l = tsvalue(top-i)->len;
+ memcpy(buffer+tl, svalue(top-i), l * sizeof(char));
+ tl += l;
+ } while (--i > 0);
+ setsvalue2s(L, top-n, luaS_newlstr(L, buffer, tl));
+ }
+ total -= n-1; /* got 'n' strings to create 1 new */
+ L->top -= n-1; /* popped 'n' strings and pushed one */
+ } while (total > 1); /* repeat until only 1 result left */
+}
+
+
+void luaV_objlen (lua_State *L, StkId ra, const TValue *rb) {
+ const TValue *tm;
+ switch (ttypenv(rb)) {
+ case LUA_TTABLE: {
+ Table *h = hvalue(rb);
+ tm = fasttm(L, h->metatable, TM_LEN);
+ if (tm) break; /* metamethod? break switch to call it */
+ setnvalue(ra, cast_num(luaH_getn(h))); /* else primitive len */
+ return;
+ }
+ case LUA_TSTRING: {
+ setnvalue(ra, cast_num(tsvalue(rb)->len));
+ return;
+ }
+ default: { /* try metamethod */
+ tm = luaT_gettmbyobj(L, rb, TM_LEN);
+ if (ttisnil(tm)) /* no metamethod? */
+ luaG_typeerror(L, rb, "get length of");
+ break;
+ }
+ }
+ callTM(L, tm, rb, rb, ra, 1);
+}
+
+/*
+ * luaV_div and luaV_mod patched in from Lua 5.3.2 in order to properly handle
+ * div/mod by zero (instead of crashing, which is the default behavior in
+ * Lua 5.2)
+ */
+
+/*
+** Integer division; return 'm // n', that is, floor(m/n).
+** C division truncates its result (rounds towards zero).
+** 'floor(q) == trunc(q)' when 'q >= 0' or when 'q' is integer,
+** otherwise 'floor(q) == trunc(q) - 1'.
+*/
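+/*
+** Worked example (illustrative values, not from the original source):
+** 7/2 gives 3 == floor(3.5) directly, while -7/2 truncates to -3; since the
+** operands differ in sign and the remainder is nonzero, 1 is subtracted to
+** obtain floor(-3.5) == -4.
+*/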
+static lua_Number luaV_div (lua_State *L, lua_Number m, lua_Number n) {
+ if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */
+ if (n == 0)
+ luaG_runerror(L, "attempt to divide by zero");
+ return (0 - m); /* n==-1; avoid overflow with 0x80000...//-1 */
+ }
+ else {
+ lua_Number q = m / n; /* perform C division */
+ if ((m ^ n) < 0 && m % n != 0) /* 'm/n' would be negative non-integer? */
+ q -= 1; /* correct result for different rounding */
+ return q;
+ }
+}
+
+
+/*
+** Integer modulus; return 'm % n'. (Assume that C '%' with
+** negative operands follows C99 behavior. See previous comment
+** about luaV_div.)
+*/
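+/*
+** Worked example (illustrative values, not from the original source):
+** in C99, -7 % 2 == -1; the remainder is nonzero and the operands differ in
+** sign, so 'n' is added to give -1 + 2 == 1, matching Lua's floor-style
+** modulus.
+*/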
+static lua_Number luaV_mod (lua_State *L, lua_Number m, lua_Number n) {
+ if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */
+ if (n == 0)
+ luaG_runerror(L, "attempt to perform 'n%%0'");
+ return 0; /* m % -1 == 0; avoid overflow with 0x80000...%-1 */
+ }
+ else {
+ lua_Number r = m % n;
+ if (r != 0 && (m ^ n) < 0) /* 'm/n' would be non-integer negative? */
+ r += n; /* correct result for different rounding */
+ return r;
+ }
+}
+
+/*
+ * End patch from 5.3.2
+ */
+
+void luaV_arith (lua_State *L, StkId ra, const TValue *rb,
+ const TValue *rc, TMS op) {
+ TValue tempb, tempc;
+ const TValue *b, *c;
+ if ((b = luaV_tonumber(rb, &tempb)) != NULL &&
+ (c = luaV_tonumber(rc, &tempc)) != NULL) {
+ /*
+ * Patched: if dividing or modding, use patched functions from 5.3
+ */
+ lua_Number res;
+ int lop = op - TM_ADD + LUA_OPADD;
+ if (lop == LUA_OPDIV) {
+ res = luaV_div(L, nvalue(b), nvalue(c));
+ } else if (lop == LUA_OPMOD) {
+ res = luaV_mod(L, nvalue(b), nvalue(c));
+ } else {
+ res = luaO_arith(op - TM_ADD + LUA_OPADD, nvalue(b), nvalue(c));
+ }
+ setnvalue(ra, res);
+ }
+ else if (!call_binTM(L, rb, rc, ra, op))
+ luaG_aritherror(L, rb, rc);
+}
+
+
+/*
+** check whether cached closure in prototype 'p' may be reused, that is,
+** whether there is a cached closure with the same upvalues needed by
+** new closure to be created.
+*/
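+/*
+** Illustrative note (not part of the original source): the cache hits only
+** when every upvalue of the cached closure still points at the same TValue
+** address (the same open stack slot or the same enclosing UpVal); otherwise
+** a fresh closure must be built by pushclosure() below.
+*/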
+static Closure *getcached (Proto *p, UpVal **encup, StkId base) {
+ Closure *c = p->cache;
+ if (c != NULL) { /* is there a cached closure? */
+ int nup = p->sizeupvalues;
+ Upvaldesc *uv = p->upvalues;
+ int i;
+ for (i = 0; i < nup; i++) { /* check whether it has right upvalues */
+ TValue *v = uv[i].instack ? base + uv[i].idx : encup[uv[i].idx]->v;
+ if (c->l.upvals[i]->v != v)
+ return NULL; /* wrong upvalue; cannot reuse closure */
+ }
+ }
+ return c; /* return cached closure (or NULL if no cached closure) */
+}
+
+
+/*
+** create a new Lua closure, push it in the stack, and initialize
+** its upvalues. Note that the call to 'luaC_barrierproto' must come
+** before the assignment to 'p->cache', as the function needs the
+** original value of that field.
+*/
+static void pushclosure (lua_State *L, Proto *p, UpVal **encup, StkId base,
+ StkId ra) {
+ int nup = p->sizeupvalues;
+ Upvaldesc *uv = p->upvalues;
+ int i;
+ Closure *ncl = luaF_newLclosure(L, nup);
+ ncl->l.p = p;
+ setclLvalue(L, ra, ncl); /* anchor new closure in stack */
+ for (i = 0; i < nup; i++) { /* fill in its upvalues */
+ if (uv[i].instack) /* upvalue refers to local variable? */
+ ncl->l.upvals[i] = luaF_findupval(L, base + uv[i].idx);
+ else /* get upvalue from enclosing function */
+ ncl->l.upvals[i] = encup[uv[i].idx];
+ }
+ luaC_barrierproto(L, p, ncl);
+ p->cache = ncl; /* save it on cache for reuse */
+}
+
+
+/*
+** finish execution of an opcode interrupted by a yield
+*/
+void luaV_finishOp (lua_State *L) {
+ CallInfo *ci = L->ci;
+ StkId base = ci->u.l.base;
+ Instruction inst = *(ci->u.l.savedpc - 1); /* interrupted instruction */
+ OpCode op = GET_OPCODE(inst);
+ switch (op) { /* finish its execution */
+ case OP_ADD: case OP_SUB: case OP_MUL: case OP_DIV:
+ case OP_MOD: case OP_POW: case OP_UNM: case OP_LEN:
+ case OP_GETTABUP: case OP_GETTABLE: case OP_SELF: {
+ setobjs2s(L, base + GETARG_A(inst), --L->top);
+ break;
+ }
+ case OP_LE: case OP_LT: case OP_EQ: {
+ int res = !l_isfalse(L->top - 1);
+ L->top--;
+ /* metamethod should not be called when operand is K */
+ lua_assert(!ISK(GETARG_B(inst)));
+ if (op == OP_LE && /* "<=" using "<" instead? */
+ ttisnil(luaT_gettmbyobj(L, base + GETARG_B(inst), TM_LE)))
+ res = !res; /* invert result */
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_JMP);
+ if (res != GETARG_A(inst)) /* condition failed? */
+ ci->u.l.savedpc++; /* skip jump instruction */
+ break;
+ }
+ case OP_CONCAT: {
+ StkId top = L->top - 1; /* top when 'call_binTM' was called */
+ int b = GETARG_B(inst); /* first element to concatenate */
+ int total = cast_int(top - 1 - (base + b)); /* yet to concatenate */
+ setobj2s(L, top - 2, top); /* put TM result in proper position */
+ if (total > 1) { /* are there elements to concat? */
+ L->top = top - 1; /* top is one after last element (at top-2) */
+ luaV_concat(L, total); /* concat them (may yield again) */
+ }
+ /* move final result to final position */
+ setobj2s(L, ci->u.l.base + GETARG_A(inst), L->top - 1);
+ L->top = ci->top; /* restore top */
+ break;
+ }
+ case OP_TFORCALL: {
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_TFORLOOP);
+ L->top = ci->top; /* correct top */
+ break;
+ }
+ case OP_CALL: {
+ if (GETARG_C(inst) - 1 >= 0) /* nresults >= 0? */
+ L->top = ci->top; /* adjust results */
+ break;
+ }
+ case OP_TAILCALL: case OP_SETTABUP: case OP_SETTABLE:
+ break;
+ default: lua_assert(0);
+ }
+}
+
+
+
+/*
+** some macros for common tasks in `luaV_execute'
+*/
+
+#if !defined luai_runtimecheck
+#define luai_runtimecheck(L, c) /* void */
+#endif
+
+
+#define RA(i) (base+GETARG_A(i))
+/* to be used after possible stack reallocation */
+#define RB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgR, base+GETARG_B(i))
+#define RC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgR, base+GETARG_C(i))
+#define RKB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgK, \
+ ISK(GETARG_B(i)) ? k+INDEXK(GETARG_B(i)) : base+GETARG_B(i))
+#define RKC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgK, \
+ ISK(GETARG_C(i)) ? k+INDEXK(GETARG_C(i)) : base+GETARG_C(i))
+#define KBx(i) \
+ (k + (GETARG_Bx(i) != 0 ? GETARG_Bx(i) - 1 : GETARG_Ax(*ci->u.l.savedpc++)))
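+/*
+** Illustrative note (not from the original source): for an instruction such
+** as OP_ADD A B C, RKB/RKC resolve each operand either to a stack register
+** (base + index) or, when its ISK bit is set, to an entry of the constant
+** table 'k'; e.g. `local x = y + 1` typically reads 'y' from a register and
+** '1' from a constant.
+*/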
+
+
+/* execute a jump instruction */
+#define dojump(ci,i,e) \
+ { int a = GETARG_A(i); \
+ if (a > 0) luaF_close(L, ci->u.l.base + a - 1); \
+ ci->u.l.savedpc += GETARG_sBx(i) + e; }
+
+/* for test instructions, execute the jump instruction that follows it */
+#define donextjump(ci) { i = *ci->u.l.savedpc; dojump(ci, i, 1); }
+
+
+#define Protect(x) { {x;}; base = ci->u.l.base; }
+
+#define checkGC(L,c) \
+ Protect( luaC_condGC(L,{L->top = (c); /* limit of live values */ \
+ luaC_step(L); \
+ L->top = ci->top;}) /* restore top */ \
+ luai_threadyield(L); )
+
+
+#define arith_op(op,tm) { \
+ TValue *rb = RKB(i); \
+ TValue *rc = RKC(i); \
+ if (ttisnumber(rb) && ttisnumber(rc)) { \
+ lua_Number nb = nvalue(rb), nc = nvalue(rc); \
+ setnvalue(ra, op(L, nb, nc)); \
+ } \
+ else { Protect(luaV_arith(L, ra, rb, rc, tm)); } }
+
+
+#define vmdispatch(o) switch(o)
+#define vmcase(l,b) case l: {b} break;
+#define vmcasenb(l,b) case l: {b} /* nb = no break */
+
+void luaV_execute (lua_State *L) {
+ CallInfo *ci = L->ci;
+ LClosure *cl;
+ TValue *k;
+ StkId base;
+ newframe: /* reentry point when frame changes (call/return) */
+ lua_assert(ci == L->ci);
+ cl = clLvalue(ci->func);
+ k = cl->p->k;
+ base = ci->u.l.base;
+ /* main loop of interpreter */
+ for (;;) {
+ Instruction i = *(ci->u.l.savedpc++);
+ StkId ra;
+ if ((L->hookmask & (LUA_MASKLINE | LUA_MASKCOUNT)) &&
+ (--L->hookcount == 0 || L->hookmask & LUA_MASKLINE)) {
+ Protect(traceexec(L));
+ }
+ /* WARNING: several calls may realloc the stack and invalidate `ra' */
+ ra = RA(i);
+ lua_assert(base == ci->u.l.base);
+ lua_assert(base <= L->top && L->top < L->stack + L->stacksize);
+ vmdispatch (GET_OPCODE(i)) {
+ vmcase(OP_MOVE,
+ setobjs2s(L, ra, RB(i));
+ )
+ vmcase(OP_LOADK,
+ TValue *rb = k + GETARG_Bx(i);
+ setobj2s(L, ra, rb);
+ )
+ vmcase(OP_LOADKX,
+ TValue *rb;
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG);
+ rb = k + GETARG_Ax(*ci->u.l.savedpc++);
+ setobj2s(L, ra, rb);
+ )
+ vmcase(OP_LOADBOOL,
+ setbvalue(ra, GETARG_B(i));
+ if (GETARG_C(i)) ci->u.l.savedpc++; /* skip next instruction (if C) */
+ )
+ vmcase(OP_LOADNIL,
+ int b = GETARG_B(i);
+ do {
+ setnilvalue(ra++);
+ } while (b--);
+ )
+ vmcase(OP_GETUPVAL,
+ int b = GETARG_B(i);
+ setobj2s(L, ra, cl->upvals[b]->v);
+ )
+ vmcase(OP_GETTABUP,
+ int b = GETARG_B(i);
+ Protect(luaV_gettable(L, cl->upvals[b]->v, RKC(i), ra));
+ )
+ vmcase(OP_GETTABLE,
+ Protect(luaV_gettable(L, RB(i), RKC(i), ra));
+ )
+ vmcase(OP_SETTABUP,
+ int a = GETARG_A(i);
+ Protect(luaV_settable(L, cl->upvals[a]->v, RKB(i), RKC(i)));
+ )
+ vmcase(OP_SETUPVAL,
+ UpVal *uv = cl->upvals[GETARG_B(i)];
+ setobj(L, uv->v, ra);
+ luaC_barrier(L, uv, ra);
+ )
+ vmcase(OP_SETTABLE,
+ Protect(luaV_settable(L, ra, RKB(i), RKC(i)));
+ )
+ vmcase(OP_NEWTABLE,
+ int b = GETARG_B(i);
+ int c = GETARG_C(i);
+ Table *t = luaH_new(L);
+ sethvalue(L, ra, t);
+ if (b != 0 || c != 0)
+ luaH_resize(L, t, luaO_fb2int(b), luaO_fb2int(c));
+ checkGC(L, ra + 1);
+ )
+ vmcase(OP_SELF,
+ StkId rb = RB(i);
+ setobjs2s(L, ra+1, rb);
+ Protect(luaV_gettable(L, rb, RKC(i), ra));
+ )
+ vmcase(OP_ADD,
+ arith_op(luai_numadd, TM_ADD);
+ )
+ vmcase(OP_SUB,
+ arith_op(luai_numsub, TM_SUB);
+ )
+ vmcase(OP_MUL,
+ arith_op(luai_nummul, TM_MUL);
+ )
+ /*
+ * Patched: use luaV_* instead of luai_* to handle div/mod by 0
+ */
+ vmcase(OP_DIV,
+ arith_op(luaV_div, TM_DIV);
+ )
+ vmcase(OP_MOD,
+ arith_op(luaV_mod, TM_MOD);
+ )
+ vmcase(OP_POW,
+ arith_op(luai_numpow, TM_POW);
+ )
+ vmcase(OP_UNM,
+ TValue *rb = RB(i);
+ if (ttisnumber(rb)) {
+ lua_Number nb = nvalue(rb);
+ setnvalue(ra, luai_numunm(L, nb));
+ }
+ else {
+ Protect(luaV_arith(L, ra, rb, rb, TM_UNM));
+ }
+ )
+ vmcase(OP_NOT,
+ TValue *rb = RB(i);
+ int res = l_isfalse(rb); /* next assignment may change this value */
+ setbvalue(ra, res);
+ )
+ vmcase(OP_LEN,
+ Protect(luaV_objlen(L, ra, RB(i)));
+ )
+ vmcase(OP_CONCAT,
+ int b = GETARG_B(i);
+ int c = GETARG_C(i);
+ StkId rb;
+ L->top = base + c + 1; /* mark the end of concat operands */
+ Protect(luaV_concat(L, c - b + 1));
+        ra = RA(i);  /* 'luaV_concat' may invoke TMs and move the stack */
+ rb = b + base;
+ setobjs2s(L, ra, rb);
+ checkGC(L, (ra >= rb ? ra + 1 : rb));
+ L->top = ci->top; /* restore top */
+ )
+ vmcase(OP_JMP,
+ dojump(ci, i, 0);
+ )
+ vmcase(OP_EQ,
+ TValue *rb = RKB(i);
+ TValue *rc = RKC(i);
+ Protect(
+ if (cast_int(equalobj(L, rb, rc)) != GETARG_A(i))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ )
+ vmcase(OP_LT,
+ Protect(
+ if (luaV_lessthan(L, RKB(i), RKC(i)) != GETARG_A(i))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ )
+ vmcase(OP_LE,
+ Protect(
+ if (luaV_lessequal(L, RKB(i), RKC(i)) != GETARG_A(i))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ )
+ vmcase(OP_TEST,
+ if (GETARG_C(i) ? l_isfalse(ra) : !l_isfalse(ra))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ vmcase(OP_TESTSET,
+ TValue *rb = RB(i);
+ if (GETARG_C(i) ? l_isfalse(rb) : !l_isfalse(rb))
+ ci->u.l.savedpc++;
+ else {
+ setobjs2s(L, ra, rb);
+ donextjump(ci);
+ }
+ )
+ vmcase(OP_CALL,
+ int b = GETARG_B(i);
+ int nresults = GETARG_C(i) - 1;
+ if (b != 0) L->top = ra+b; /* else previous instruction set top */
+ if (luaD_precall(L, ra, nresults)) { /* C function? */
+ if (nresults >= 0) L->top = ci->top; /* adjust results */
+ base = ci->u.l.base;
+ }
+ else { /* Lua function */
+ ci = L->ci;
+ ci->callstatus |= CIST_REENTRY;
+ goto newframe; /* restart luaV_execute over new Lua function */
+ }
+ )
+ vmcase(OP_TAILCALL,
+ int b = GETARG_B(i);
+ if (b != 0) L->top = ra+b; /* else previous instruction set top */
+ lua_assert(GETARG_C(i) - 1 == LUA_MULTRET);
+ if (luaD_precall(L, ra, LUA_MULTRET)) /* C function? */
+ base = ci->u.l.base;
+ else {
+ /* tail call: put called frame (n) in place of caller one (o) */
+ CallInfo *nci = L->ci; /* called frame */
+ CallInfo *oci = nci->previous; /* caller frame */
+ StkId nfunc = nci->func; /* called function */
+ StkId ofunc = oci->func; /* caller function */
+ /* last stack slot filled by 'precall' */
+ StkId lim = nci->u.l.base + getproto(nfunc)->numparams;
+ int aux;
+ /* close all upvalues from previous call */
+ if (cl->p->sizep > 0) luaF_close(L, oci->u.l.base);
+ /* move new frame into old one */
+ for (aux = 0; nfunc + aux < lim; aux++)
+ setobjs2s(L, ofunc + aux, nfunc + aux);
+ oci->u.l.base = ofunc + (nci->u.l.base - nfunc); /* correct base */
+ oci->top = L->top = ofunc + (L->top - nfunc); /* correct top */
+ oci->u.l.savedpc = nci->u.l.savedpc;
+ oci->callstatus |= CIST_TAIL; /* function was tail called */
+ ci = L->ci = oci; /* remove new frame */
+ lua_assert(L->top == oci->u.l.base + getproto(ofunc)->maxstacksize);
+ goto newframe; /* restart luaV_execute over new Lua function */
+ }
+ )
+ vmcasenb(OP_RETURN,
+ int b = GETARG_B(i);
+ if (b != 0) L->top = ra+b-1;
+ if (cl->p->sizep > 0) luaF_close(L, base);
+ b = luaD_poscall(L, ra);
+ if (!(ci->callstatus & CIST_REENTRY)) /* 'ci' still the called one */
+ return; /* external invocation: return */
+ else { /* invocation via reentry: continue execution */
+ ci = L->ci;
+ if (b) L->top = ci->top;
+ lua_assert(isLua(ci));
+ lua_assert(GET_OPCODE(*((ci)->u.l.savedpc - 1)) == OP_CALL);
+ goto newframe; /* restart luaV_execute over new Lua function */
+ }
+ )
+ vmcase(OP_FORLOOP,
+ lua_Number step = nvalue(ra+2);
+ lua_Number idx = luai_numadd(L, nvalue(ra), step); /* increment index */
+ lua_Number limit = nvalue(ra+1);
+ if (luai_numlt(L, 0, step) ? luai_numle(L, idx, limit)
+ : luai_numle(L, limit, idx)) {
+ ci->u.l.savedpc += GETARG_sBx(i); /* jump back */
+ setnvalue(ra, idx); /* update internal index... */
+ setnvalue(ra+3, idx); /* ...and external index */
+ }
+ )
+ vmcase(OP_FORPREP,
+ const TValue *init = ra;
+ const TValue *plimit = ra+1;
+ const TValue *pstep = ra+2;
+ if (!tonumber(init, ra))
+ luaG_runerror(L, LUA_QL("for") " initial value must be a number");
+ else if (!tonumber(plimit, ra+1))
+ luaG_runerror(L, LUA_QL("for") " limit must be a number");
+ else if (!tonumber(pstep, ra+2))
+ luaG_runerror(L, LUA_QL("for") " step must be a number");
+ setnvalue(ra, luai_numsub(L, nvalue(ra), nvalue(pstep)));
+ ci->u.l.savedpc += GETARG_sBx(i);
+ )
+ vmcasenb(OP_TFORCALL,
+ StkId cb = ra + 3; /* call base */
+ setobjs2s(L, cb+2, ra+2);
+ setobjs2s(L, cb+1, ra+1);
+ setobjs2s(L, cb, ra);
+ L->top = cb + 3; /* func. + 2 args (state and index) */
+ Protect(luaD_call(L, cb, GETARG_C(i), 1));
+ L->top = ci->top;
+ i = *(ci->u.l.savedpc++); /* go to next instruction */
+ ra = RA(i);
+ lua_assert(GET_OPCODE(i) == OP_TFORLOOP);
+ goto l_tforloop;
+ )
+ vmcase(OP_TFORLOOP,
+ l_tforloop:
+ if (!ttisnil(ra + 1)) { /* continue loop? */
+ setobjs2s(L, ra, ra + 1); /* save control variable */
+ ci->u.l.savedpc += GETARG_sBx(i); /* jump back */
+ }
+ )
+ vmcase(OP_SETLIST,
+ int n = GETARG_B(i);
+ int c = GETARG_C(i);
+ int last;
+ Table *h;
+ if (n == 0) n = cast_int(L->top - ra) - 1;
+ if (c == 0) {
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG);
+ c = GETARG_Ax(*ci->u.l.savedpc++);
+ }
+ luai_runtimecheck(L, ttistable(ra));
+ h = hvalue(ra);
+ last = ((c-1)*LFIELDS_PER_FLUSH) + n;
+ if (last > h->sizearray) /* needs more space? */
+ luaH_resizearray(L, h, last); /* pre-allocate it at once */
+ for (; n > 0; n--) {
+ TValue *val = ra+n;
+ luaH_setint(L, h, last--, val);
+ luaC_barrierback(L, obj2gco(h), val);
+ }
+ L->top = ci->top; /* correct top (in case of previous open call) */
+ )
+ vmcase(OP_CLOSURE,
+ Proto *p = cl->p->p[GETARG_Bx(i)];
+ Closure *ncl = getcached(p, cl->upvals, base); /* cached closure */
+ if (ncl == NULL) /* no match? */
+ pushclosure(L, p, cl->upvals, base, ra); /* create a new one */
+ else
+          setclLvalue(L, ra, ncl);  /* push cached closure */
+ checkGC(L, ra + 1);
+ )
+ vmcase(OP_VARARG,
+ int b = GETARG_B(i) - 1;
+ int j;
+ int n = cast_int(base - ci->func) - cl->p->numparams - 1;
+ if (b < 0) { /* B == 0? */
+ b = n; /* get all var. arguments */
+ Protect(luaD_checkstack(L, n));
+ ra = RA(i); /* previous call may change the stack */
+ L->top = ra + n;
+ }
+ for (j = 0; j < b; j++) {
+ if (j < n) {
+ setobjs2s(L, ra + j, base - n + j);
+ }
+ else {
+ setnilvalue(ra + j);
+ }
+ }
+ )
+ vmcase(OP_EXTRAARG,
+ lua_assert(0);
+ )
+ }
+ }
+}
+
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lvm.h b/sys/contrib/openzfs/module/lua/lvm.h
new file mode 100644
index 000000000000..2d2be9836f69
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lvm.h
@@ -0,0 +1,46 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lvm.h,v 2.18.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lvm_h
+#define lvm_h
+
+
+#include "ldo.h"
+#include "lobject.h"
+#include "ltm.h"
+
+
+#define tostring(L,o) (ttisstring(o) || (luaV_tostring(L, o)))
+
+#define tonumber(o,n) (ttisnumber(o) || (((o) = luaV_tonumber(o,n)) != NULL))
+
+#define equalobj(L,o1,o2) (ttisequal(o1, o2) && luaV_equalobj_(L, o1, o2))
+
+#define luaV_rawequalobj(o1,o2) equalobj(NULL,o1,o2)
+
+
+/* not to be called directly */
+LUAI_FUNC int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2);
+
+
+LUAI_FUNC int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r);
+LUAI_FUNC int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r);
+LUAI_FUNC const TValue *luaV_tonumber (const TValue *obj, TValue *n);
+LUAI_FUNC int luaV_tostring (lua_State *L, StkId obj);
+LUAI_FUNC void luaV_gettable (lua_State *L, const TValue *t, TValue *key,
+ StkId val);
+LUAI_FUNC void luaV_settable (lua_State *L, const TValue *t, TValue *key,
+ StkId val);
+LUAI_FUNC void luaV_finishOp (lua_State *L);
+LUAI_FUNC void luaV_execute (lua_State *L);
+LUAI_FUNC void luaV_concat (lua_State *L, int total);
+LUAI_FUNC void luaV_arith (lua_State *L, StkId ra, const TValue *rb,
+ const TValue *rc, TMS op);
+LUAI_FUNC void luaV_objlen (lua_State *L, StkId ra, const TValue *rb);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lzio.c b/sys/contrib/openzfs/module/lua/lzio.c
new file mode 100644
index 000000000000..bfbb41cf8ed3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lzio.c
@@ -0,0 +1,74 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lzio.c,v 1.35.1.1 2013/04/12 18:48:47 roberto Exp $
+** Buffered streams
+** See Copyright Notice in lua.h
+*/
+
+
+#define lzio_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "llimits.h"
+#include "lmem.h"
+#include "lstate.h"
+#include "lzio.h"
+
+
+int luaZ_fill (ZIO *z) {
+ size_t size;
+ lua_State *L = z->L;
+ const char *buff;
+ lua_unlock(L);
+ buff = z->reader(L, z->data, &size);
+ lua_lock(L);
+ if (buff == NULL || size == 0)
+ return EOZ;
+ z->n = size - 1; /* discount char being returned */
+ z->p = buff;
+ return cast_uchar(*(z->p++));
+}
+
+
+void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, void *data) {
+ z->L = L;
+ z->reader = reader;
+ z->data = data;
+ z->n = 0;
+ z->p = NULL;
+}
+
+
+/* --------------------------------------------------------------- read --- */
+size_t luaZ_read (ZIO *z, void *b, size_t n) {
+ while (n) {
+ size_t m;
+ if (z->n == 0) { /* no bytes in buffer? */
+ if (luaZ_fill(z) == EOZ) /* try to read more */
+ return n; /* no more input; return number of missing bytes */
+ else {
+ z->n++; /* luaZ_fill consumed first byte; put it back */
+ z->p--;
+ }
+ }
+ m = (n <= z->n) ? n : z->n; /* min. between n and z->n */
+ memcpy(b, z->p, m);
+ z->n -= m;
+ z->p += m;
+ b = (char *)b + m;
+ n -= m;
+ }
+ return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n) {
+ if (n > buff->buffsize) {
+ if (n < LUA_MINBUFFER) n = LUA_MINBUFFER;
+ luaZ_resizebuffer(L, buff, n);
+ }
+ return buff->buffer;
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lzio.h b/sys/contrib/openzfs/module/lua/lzio.h
new file mode 100644
index 000000000000..27908759d509
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lzio.h
@@ -0,0 +1,67 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lzio.h,v 1.26.1.1 2013/04/12 18:48:47 roberto Exp $
+** Buffered streams
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lzio_h
+#define lzio_h
+
+#include <sys/lua/lua.h>
+
+#include "lmem.h"
+
+
+#define EOZ (-1) /* end of stream */
+
+typedef struct Zio ZIO;
+
+#define zgetc(z) (((z)->n--)>0 ? cast_uchar(*(z)->p++) : luaZ_fill(z))
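+/*
+** zgetc consumes one buffered byte when available; once the buffer runs
+** empty it falls back to luaZ_fill(), which asks the reader callback for the
+** next block of input (or returns EOZ at end of stream).
+*/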
+
+
+typedef struct Mbuffer {
+ char *buffer;
+ size_t n;
+ size_t buffsize;
+} Mbuffer;
+
+#define luaZ_initbuffer(L, buff) ((buff)->buffer = NULL, (buff)->buffsize = 0)
+
+#define luaZ_buffer(buff) ((buff)->buffer)
+#define luaZ_sizebuffer(buff) ((buff)->buffsize)
+#define luaZ_bufflen(buff) ((buff)->n)
+
+#define luaZ_resetbuffer(buff) ((buff)->n = 0)
+
+
+#define luaZ_resizebuffer(L, buff, size) \
+ (luaM_reallocvector(L, (buff)->buffer, (buff)->buffsize, size, char), \
+ (buff)->buffsize = size)
+
+#define luaZ_freebuffer(L, buff) luaZ_resizebuffer(L, buff, 0)
+
+
+LUAI_FUNC char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n);
+LUAI_FUNC void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader,
+ void *data);
+LUAI_FUNC size_t luaZ_read (ZIO* z, void* b, size_t n); /* read next n bytes */
+
+
+
+/* --------- Private Part ------------------ */
+
+struct Zio {
+ size_t n; /* bytes still unread */
+ const char *p; /* current position in buffer */
+ lua_Reader reader; /* reader function */
+ void* data; /* additional data */
+ lua_State *L; /* Lua state (for reader) */
+};
+
+
+LUAI_FUNC int luaZ_fill (ZIO *z);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp.S
new file mode 100644
index 000000000000..1f461a0a4ef3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp.S
@@ -0,0 +1,19 @@
+#if defined(__x86_64__)
+#include "setjmp_x86_64.S"
+#elif defined(__i386__)
+#include "setjmp_i386.S"
+#elif defined(__aarch64__)
+#include "setjmp_aarch64.S"
+#elif defined(__arm__)
+#include "setjmp_arm.S"
+#elif defined(__sparc__) && defined(__arch64__)
+#include "setjmp_sparc64.S"
+#elif defined(__powerpc__)
+#include "setjmp_ppc.S"
+#elif defined(__mips__)
+#include "setjmp_mips.S"
+#elif defined(__s390x__)
+#include "setjmp_s390x.S"
+#elif defined(__riscv)
+#include "setjmp_rv64g.S"
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_aarch64.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_aarch64.S
new file mode 100644
index 000000000000..a5a9a85fd57e
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_aarch64.S
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2014 Andrew Turner
+ * Copyright (c) 2014-2015 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Andrew Turner
+ * under sponsorship from the FreeBSD Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+
+#ifdef __aarch64__
+
+#define ENTRY(sym) \
+ .text; \
+ .globl sym; \
+ .align 2; \
+ .type sym,#function; \
+sym:
+
+#define END(sym) \
+ .size sym, . - sym
+
+
+ENTRY(setjmp)
+ /* Store the stack pointer */
+ mov x8, sp
+ str x8, [x0], #8
+
+ /* Store the general purpose registers and lr */
+ stp x19, x20, [x0], #16
+ stp x21, x22, [x0], #16
+ stp x23, x24, [x0], #16
+ stp x25, x26, [x0], #16
+ stp x27, x28, [x0], #16
+ stp x29, x30, [x0], #16
+
+ /* Return value */
+ mov x0, #0
+ ret
+END(setjmp)
+
+ENTRY(longjmp)
+ /* Restore the stack pointer */
+ ldr x8, [x0], #8
+ mov sp, x8
+
+ /* Restore the general purpose registers and lr */
+ ldp x19, x20, [x0], #16
+ ldp x21, x22, [x0], #16
+ ldp x23, x24, [x0], #16
+ ldp x25, x26, [x0], #16
+ ldp x27, x28, [x0], #16
+ ldp x29, x30, [x0], #16
+
+ /* Load the return value */
+ mov x0, x1
+ ret
+END(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* __aarch64__ */
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_arm.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_arm.S
new file mode 100644
index 000000000000..78bc3e0b347d
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_arm.S
@@ -0,0 +1,84 @@
+/*-
+ * Copyright 2004-2014 Olivier Houchard <cognet@FreeBSD.org>
+ * Copyright 2012-2014 Ian Lepore <ian@FreeBSD.org>
+ * Copyright 2013-2014 Andrew Turner <andrew@FreeBSD.org>
+ * Copyright 2014 Svatopluk Kraus <onwahe@gmail.com>
+ * Copyright 2014 Michal Meloun <meloun@miracle.cz>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#if defined(__arm__) && !defined(__aarch64__)
+
+#if defined(__thumb2__)
+#define _FUNC_MODE .code 16; .thumb_func
+#else
+#define _FUNC_MODE .code 32
+#endif
+
+#define ENTRY(x) \
+ .text; \
+ .syntax unified; \
+ .align 2; \
+ .global x; \
+ .type x,#function; \
+ _FUNC_MODE; \
+x:
+
+#define END(x) \
+ .size x, . - x;
+
+#define RET bx lr
+
+
+/*
+ * setjmp + longjmp
+ */
+ENTRY(setjmp)
+#if defined(__thumb2__)
+ mov ip, sp
+ stmia r0, {r4-r12,r14}
+#else
+ stmia r0, {r4-r14}
+#endif
+ mov r0, #0x00000000
+ RET
+END(setjmp)
+
+ENTRY(longjmp)
+#if defined(__thumb2__)
+ ldmia r0, {r4-r12,r14}
+ mov sp, ip
+#else
+ ldmia r0, {r4-r14}
+#endif
+ mov r0, #0x00000001
+ RET
+END(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_i386.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_i386.S
new file mode 100644
index 000000000000..6d6a5f332688
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_i386.S
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#define ENTRY(x) \
+ .text; \
+ .align 8; \
+ .globl x; \
+ .type x, @function; \
+x:
+
+#define SET_SIZE(x) \
+ .size x, [.-x]
+
+/*
+ * Setjmp and longjmp implement non-local gotos using state vectors
+ * type label_t.
+ */
+#ifdef __i386__
+
+ ENTRY(setjmp) /* save area is passed in eax */
+ movl %ebp, 0(%eax) /* save ebp */
+ movl %ebx, 4(%eax) /* save ebx */
+ movl %esi, 8(%eax) /* save esi */
+ movl %edi, 12(%eax) /* save edi */
+ movl %esp, 16(%eax) /* save esp */
+ movl (%esp), %ecx /* %eip (return address) */
+ movl %ecx, 20(%eax) /* save eip */
+ subl %eax, %eax /* return 0 */
+ ret
+ SET_SIZE(setjmp)
+
+ ENTRY(longjmp) /* save area is passed in eax */
+ movl 0(%eax), %ebp /* restore ebp */
+ movl 4(%eax), %ebx /* restore ebx */
+ movl 8(%eax), %esi /* restore esi */
+ movl 12(%eax), %edi /* restore edi */
+ movl 16(%eax), %esp /* restore esp */
+ movl 20(%eax), %ecx /* %eip (return address) */
+ addl $4, %esp /* pop ret adr */
+ jmp *%ecx /* indirect jump */
+ SET_SIZE(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* __i386__ */
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_mips.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_mips.S
new file mode 100644
index 000000000000..0084fbfa4bec
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_mips.S
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009
+ * The President and Fellows of Harvard College.
+ * Copyright (c) 2017 MIPS Technologies, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <asm/asm.h>
+#include <asm/regdef.h>
+
+/*
+ * setjmp and longjmp for MIPS.
+ */
+
+ .text
+ .set noreorder
+
+ /*
+ * int setjmp(jmp_buf jb);
+ *
+ * Save the current state so we can return again from the call later
+ * if/when longjmp is called. (If the function that called setjmp
+ * returns before longjmp is called, the results are undefined. We
+ * only need to save registers, not the whole contents of the stack.)
+ */
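+ /*
+ * Illustrative usage (hypothetical caller, not part of this file): the
+ * caller conceptually does
+ *   if (setjmp(jb) == 0) { run_protected_code(); } else { handle_error(); }
+ * and a later longjmp(jb, code) resumes at the setjmp return with 'code'
+ * (forced to 1 if 0 was passed).
+ */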
+LEAF(setjmp)
+ /*
+ * jmp_buf is in a0. We need to save s0-s8, sp, gp, and ra in it.
+ * Don't store more registers without adjusting machine/setjmp.h.
+ */
+
+ REG_S sp, 0(a0) /* save registers */
+ REG_S ra, 1*SZREG(a0)
+ REG_S gp, 2*SZREG(a0)
+ REG_S s0, 3*SZREG(a0)
+ REG_S s1, 4*SZREG(a0)
+ REG_S s2, 5*SZREG(a0)
+ REG_S s3, 6*SZREG(a0)
+ REG_S s4, 7*SZREG(a0)
+ REG_S s5, 8*SZREG(a0)
+ REG_S s6, 9*SZREG(a0)
+ REG_S s7, 10*SZREG(a0)
+ REG_S s8, 11*SZREG(a0)
+
+ jr ra /* done */
+ move v0, zero /* return 0 (in delay slot) */
+END(setjmp)
+
+
+ /*
+ * void longjmp(jmp_buf jb, int code);
+ */
+LEAF(longjmp)
+ /*
+ * jmp_buf is in a0. Return code is in a1.
+ * We need to restore s0-s8, sp, gp, and ra from the jmp_buf.
+ * The return code is forced to 1 if 0 is passed in.
+ */
+
+ sltiu t0, a1, 1 /* set t0 to 1 if return code is 0... otherwise 0 */
+ addu a1, a1, t0 /* update the return code */
+
+ REG_L sp, 0(a0) /* restore registers */
+ REG_L ra, 1*SZREG(a0)
+ REG_L gp, 2*SZREG(a0)
+ REG_L s0, 3*SZREG(a0)
+ REG_L s1, 4*SZREG(a0)
+ REG_L s2, 5*SZREG(a0)
+ REG_L s3, 6*SZREG(a0)
+ REG_L s4, 7*SZREG(a0)
+ REG_L s5, 8*SZREG(a0)
+ REG_L s6, 9*SZREG(a0)
+ REG_L s7, 10*SZREG(a0)
+ REG_L s8, 11*SZREG(a0)
+
+ jr ra /* return, to where setjmp was called from */
+ move v0, a1 /* set return value */
+END(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_ppc.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_ppc.S
new file mode 100644
index 000000000000..72aa5d5ab5b0
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_ppc.S
@@ -0,0 +1,165 @@
+/* $FreeBSD$ */
+/* from: NetBSD: setjmp.S,v 1.1 1998/01/27 15:13:12 sakamoto Exp $ */
+/* from: OpenBSD: setjmp.S,v 1.2 1996/12/28 06:22:18 rahnds Exp */
+/* kernel version of this file, does not have signal goop */
+/* int setjmp(jmp_buf env) */
+
+#define _ASM
+#include <asm/types.h>
+
+#ifdef __powerpc64__
+#if !defined(PPC64_ELF_ABI_v2) && !defined(PPC64_ELF_ABI_v1)
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+#define PPC64_ELF_ABI_v2
+#endif /* _CALL_ELF */
+#endif /* PPC64_ELF_ABI_ */
+#endif /* __powerpc64__ */
+
+#ifdef __powerpc64__
+#define LD_REG ld
+#define ST_REG std
+#define REGWIDTH 8
+#else
+#define LD_REG lwz
+#define ST_REG stw
+#define REGWIDTH 4
+#endif /* __powerpc64__ */
+
+#define JMP_r1 1*REGWIDTH
+#define JMP_r2 2*REGWIDTH
+#define JMP_r14 3*REGWIDTH
+#define JMP_r15 4*REGWIDTH
+#define JMP_r16 5*REGWIDTH
+#define JMP_r17 6*REGWIDTH
+#define JMP_r18 7*REGWIDTH
+#define JMP_r19 8*REGWIDTH
+#define JMP_r20 9*REGWIDTH
+#define JMP_r21 10*REGWIDTH
+#define JMP_r22 11*REGWIDTH
+#define JMP_r23 12*REGWIDTH
+#define JMP_r24 13*REGWIDTH
+#define JMP_r25 14*REGWIDTH
+#define JMP_r26 15*REGWIDTH
+#define JMP_r27 16*REGWIDTH
+#define JMP_r28 17*REGWIDTH
+#define JMP_r29 18*REGWIDTH
+#define JMP_r30 19*REGWIDTH
+#define JMP_r31 20*REGWIDTH
+#define JMP_lr 21*REGWIDTH
+#define JMP_cr 22*REGWIDTH
+#define JMP_ctr 23*REGWIDTH
+#define JMP_xer 24*REGWIDTH
+
+#ifdef __powerpc64__
+#ifdef PPC64_ELF_ABI_v2
+
+#define ENTRY(name) \
+ .align 2 ; \
+ .type name,@function; \
+ .weak name; \
+name:
+
+#else /* PPC64_ELF_ABI_v1 */
+
+#define XGLUE(a,b) a##b
+#define GLUE(a,b) XGLUE(a,b)
+#define ENTRY(name) \
+ .align 2 ; \
+ .weak name; \
+ .weak GLUE(.,name); \
+ .pushsection ".opd","aw"; \
+name: \
+ .quad GLUE(.,name); \
+ .quad .TOC.@tocbase; \
+ .quad 0; \
+ .popsection; \
+ .type GLUE(.,name),@function; \
+GLUE(.,name):
+
+#endif /* PPC64_ELF_ABI_v2 */
+
+#else /* 32-bit */
+
+#define ENTRY(name) \
+ .text; \
+ .p2align 4; \
+ .weak name; \
+ .type name,@function; \
+name:
+
+#endif /* __powerpc64__ */
+
+
+ENTRY(setjmp)
+ ST_REG 31, JMP_r31(3)
+ /* r1, r2, r14-r30 */
+ ST_REG 1, JMP_r1 (3)
+ ST_REG 2, JMP_r2 (3)
+ ST_REG 14, JMP_r14(3)
+ ST_REG 15, JMP_r15(3)
+ ST_REG 16, JMP_r16(3)
+ ST_REG 17, JMP_r17(3)
+ ST_REG 18, JMP_r18(3)
+ ST_REG 19, JMP_r19(3)
+ ST_REG 20, JMP_r20(3)
+ ST_REG 21, JMP_r21(3)
+ ST_REG 22, JMP_r22(3)
+ ST_REG 23, JMP_r23(3)
+ ST_REG 24, JMP_r24(3)
+ ST_REG 25, JMP_r25(3)
+ ST_REG 26, JMP_r26(3)
+ ST_REG 27, JMP_r27(3)
+ ST_REG 28, JMP_r28(3)
+ ST_REG 29, JMP_r29(3)
+ ST_REG 30, JMP_r30(3)
+ /* cr, lr, ctr, xer */
+ mfcr 0
+ ST_REG 0, JMP_cr(3)
+ mflr 0
+ ST_REG 0, JMP_lr(3)
+ mfctr 0
+ ST_REG 0, JMP_ctr(3)
+ mfxer 0
+ ST_REG 0, JMP_xer(3)
+ /* f14-f31, fpscr */
+ li 3, 0
+ blr
+
+ENTRY(longjmp)
+ LD_REG 31, JMP_r31(3)
+ /* r1, r2, r14-r30 */
+ LD_REG 1, JMP_r1 (3)
+ LD_REG 2, JMP_r2 (3)
+ LD_REG 14, JMP_r14(3)
+ LD_REG 15, JMP_r15(3)
+ LD_REG 16, JMP_r16(3)
+ LD_REG 17, JMP_r17(3)
+ LD_REG 18, JMP_r18(3)
+ LD_REG 19, JMP_r19(3)
+ LD_REG 20, JMP_r20(3)
+ LD_REG 21, JMP_r21(3)
+ LD_REG 22, JMP_r22(3)
+ LD_REG 23, JMP_r23(3)
+ LD_REG 24, JMP_r24(3)
+ LD_REG 25, JMP_r25(3)
+ LD_REG 26, JMP_r26(3)
+ LD_REG 27, JMP_r27(3)
+ LD_REG 28, JMP_r28(3)
+ LD_REG 29, JMP_r29(3)
+ LD_REG 30, JMP_r30(3)
+ /* cr, lr, ctr, xer */
+ LD_REG 0, JMP_cr(3)
+ mtcr 0
+ LD_REG 0, JMP_lr(3)
+ mtlr 0
+ LD_REG 0, JMP_ctr(3)
+ mtctr 0
+ LD_REG 0, JMP_xer(3)
+ mtxer 0
+ /* f14-f31, fpscr */
+ mr 3, 4
+ blr
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_rv64g.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_rv64g.S
new file mode 100644
index 000000000000..7f6c50d25a4c
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_rv64g.S
@@ -0,0 +1,91 @@
+/*-
+ * Copyright (c) 2015-2016 Ruslan Bukin <br@bsdpad.com>
+ * All rights reserved.
+ *
+ * Portions of this software were developed by SRI International and the
+ * University of Cambridge Computer Laboratory under DARPA/AFRL contract
+ * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
+ *
+ * Portions of this software were developed by the University of Cambridge
+ * Computer Laboratory as part of the CTSRD Project, with support from the
+ * UK Higher Education Innovation Fund (HEIF).
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#define ENTRY(sym) \
+ .text; .globl sym; .type sym,@function; sym:
+#define END(sym) .size sym, . - sym
+
+
+ENTRY(setjmp)
+ /* Store the stack pointer */
+ sd sp, (0 * 8)(a0)
+ addi a0, a0, (1 * 8)
+
+ /* Store the general purpose registers and ra */
+ sd s0, (0 * 8)(a0)
+ sd s1, (1 * 8)(a0)
+ sd s2, (2 * 8)(a0)
+ sd s3, (3 * 8)(a0)
+ sd s4, (4 * 8)(a0)
+ sd s5, (5 * 8)(a0)
+ sd s6, (6 * 8)(a0)
+ sd s7, (7 * 8)(a0)
+ sd s8, (8 * 8)(a0)
+ sd s9, (9 * 8)(a0)
+ sd s10, (10 * 8)(a0)
+ sd s11, (11 * 8)(a0)
+ sd ra, (12 * 8)(a0)
+ addi a0, a0, (13 * 8)
+
+ /* Return value */
+ li a0, 0
+ ret
+END(setjmp)
+
+ENTRY(longjmp)
+ /* Restore the stack pointer */
+ ld t0, 0(a0)
+ mv sp, t0
+ addi a0, a0, (1 * 8)
+
+ /* Restore the general purpose registers and ra */
+ ld s0, (0 * 8)(a0)
+ ld s1, (1 * 8)(a0)
+ ld s2, (2 * 8)(a0)
+ ld s3, (3 * 8)(a0)
+ ld s4, (4 * 8)(a0)
+ ld s5, (5 * 8)(a0)
+ ld s6, (6 * 8)(a0)
+ ld s7, (7 * 8)(a0)
+ ld s8, (8 * 8)(a0)
+ ld s9, (9 * 8)(a0)
+ ld s10, (10 * 8)(a0)
+ ld s11, (11 * 8)(a0)
+ ld ra, (12 * 8)(a0)
+ addi a0, a0, (13 * 8)
+
+ /* Load the return value */
+ mv a0, a1
+ ret
+END(longjmp)
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_s390x.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_s390x.S
new file mode 100644
index 000000000000..336c66c08b51
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_s390x.S
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2005-2014 Rich Felker, et al.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ .global setjmp
+ .type setjmp,@function
+setjmp:
+ stmg %r6, %r15, 0(%r2)
+
+ std %f8, 10*8(%r2)
+ std %f9, 11*8(%r2)
+ std %f10, 12*8(%r2)
+ std %f11, 13*8(%r2)
+ std %f12, 14*8(%r2)
+ std %f13, 15*8(%r2)
+ std %f14, 16*8(%r2)
+ std %f15, 17*8(%r2)
+
+ lghi %r2, 0
+ br %r14
+
+ .global longjmp
+ .type longjmp,@function
+longjmp:
+
+1:
+ lmg %r6, %r15, 0(%r2)
+
+ ld %f8, 10*8(%r2)
+ ld %f9, 11*8(%r2)
+ ld %f10, 12*8(%r2)
+ ld %f11, 13*8(%r2)
+ ld %f12, 14*8(%r2)
+ ld %f13, 15*8(%r2)
+ ld %f14, 16*8(%r2)
+ ld %f15, 17*8(%r2)
+
+ ltgr %r2, %r3
+ bnzr %r14
+ lhi %r2, 1
+ br %r14
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_sparc64.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_sparc64.S
new file mode 100644
index 000000000000..a37a71cbce33
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_sparc64.S
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Header: _setjmp.s,v 1.1 91/07/06 16:45:53 torek Exp
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+#if 0
+ .asciz "@(#)_setjmp.s 8.1 (Berkeley) 6/4/93"
+#else
+ RCSID("$NetBSD: _setjmp.S,v 1.4 1998/10/08 02:27:59 eeh Exp $")
+#endif
+#endif /* LIBC_SCCS and not lint */
+
+#define _JB_FP 0x0
+#define _JB_PC 0x8
+#define _JB_SP 0x10
+
+ .register %g2,#ignore
+ .register %g3,#ignore
+
+#define ENTRY(x) \
+ .text ; \
+ .align 32 ; \
+ .globl x ; \
+ .type x,@function ; \
+x:
+
+#define END(x) \
+ .size x, . - x
+
+/*
+ * C library -- setjmp, longjmp
+ *
+ * longjmp(a,v)
+ * will generate a "return(v?v:1)" from
+ * the last call to
+ * setjmp(a)
+ * by restoring the previous context.
+ */
+
+ENTRY(setjmp)
+ stx %sp, [%o0 + _JB_SP]
+ stx %o7, [%o0 + _JB_PC]
+ stx %fp, [%o0 + _JB_FP]
+ retl
+ clr %o0
+END(setjmp)
+
+ENTRY(longjmp)
+ mov 1, %g1
+ movrnz %o1, %o1, %g1
+ mov %o0, %g2
+ ldx [%g2 + _JB_FP], %g3
+1: cmp %fp, %g3
+ bl,a 1b
+ restore
+ be,a 2f
+ ldx [%g2 + _JB_SP], %o0
+
+.Lbotch:
+ illtrap
+
+2: cmp %o0, %sp
+ bge,a 3f
+ mov %o0, %sp
+ b,a .Lbotch
+ nop
+3: ldx [%g2 + _JB_PC], %o7
+ retl
+ mov %g1, %o0
+END(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S
new file mode 100644
index 000000000000..a469cbad780e
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+
+#define ENTRY(x) \
+ .text; \
+ .align 8; \
+ .globl x; \
+ .type x, @function; \
+x:
+
+#define SET_SIZE(x) \
+ .size x, [.-x]
+
+
+/*
+ * Setjmp and longjmp implement non-local gotos using state vectors
+ * of type label_t.
+ */
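+
+/*
+ * Illustrative sketch of the C-level contract (prototypes assumed from
+ * the conventional kernel setjmp interface, not taken from this file):
+ *
+ *	extern int setjmp(label_t *);
+ *	extern void longjmp(label_t *);
+ *
+ *	label_t env;
+ *	if (setjmp(&env) == 0) {
+ *		... normal path, which may later call longjmp(&env) ...
+ *	} else {
+ *		... control resumes here, setjmp() returning nonzero ...
+ *	}
+ */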
+#ifdef __x86_64__
+
+ ENTRY(setjmp)
+ movq %rsp, 0(%rdi)
+ movq %rbp, 8(%rdi)
+ movq %rbx, 16(%rdi)
+ movq %r12, 24(%rdi)
+ movq %r13, 32(%rdi)
+ movq %r14, 40(%rdi)
+ movq %r15, 48(%rdi)
+ movq 0(%rsp), %rdx /* return address */
+ movq %rdx, 56(%rdi) /* rip */
+ xorl %eax, %eax /* return 0 */
+ ret
+ SET_SIZE(setjmp)
+
+ ENTRY(longjmp)
+ movq 0(%rdi), %rsp
+ movq 8(%rdi), %rbp
+ movq 16(%rdi), %rbx
+ movq 24(%rdi), %r12
+ movq 32(%rdi), %r13
+ movq 40(%rdi), %r14
+ movq 48(%rdi), %r15
+ movq 56(%rdi), %rdx /* return address */
+ movq %rdx, 0(%rsp)
+ xorl %eax, %eax
+ incl %eax /* return 1 */
+ ret
+ SET_SIZE(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* __x86_64__ */
diff --git a/sys/contrib/openzfs/module/nvpair/Makefile.in b/sys/contrib/openzfs/module/nvpair/Makefile.in
new file mode 100644
index 000000000000..d8145236674b
--- /dev/null
+++ b/sys/contrib/openzfs/module/nvpair/Makefile.in
@@ -0,0 +1,13 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+endif
+
+MODULE := znvpair
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+$(MODULE)-objs += nvpair.o
+$(MODULE)-objs += fnvpair.o
+$(MODULE)-objs += nvpair_alloc_spl.o
+$(MODULE)-objs += nvpair_alloc_fixed.o
diff --git a/sys/contrib/openzfs/module/nvpair/fnvpair.c b/sys/contrib/openzfs/module/nvpair/fnvpair.c
new file mode 100644
index 000000000000..dc8257e48594
--- /dev/null
+++ b/sys/contrib/openzfs/module/nvpair/fnvpair.c
@@ -0,0 +1,660 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/nvpair.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/param.h>
+#ifndef _KERNEL
+#include <stdlib.h>
+#endif
+
+/*
+ * "Force" nvlist wrapper.
+ *
+ * These functions wrap the nvlist_* functions with assertions that assume
+ * the operation is successful. This allows the caller's code to be much
+ * more readable, especially for the fnvlist_lookup_* and fnvpair_value_*
+ * functions, which can return the requested value (rather than filling in
+ * a pointer).
+ *
+ * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate
+ * with KM_SLEEP.
+ *
+ * More wrappers should be added as needed -- for example
+ * nvlist_lookup_*_array and nvpair_value_*_array.
+ */
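+
+/*
+ * A minimal usage sketch (the key name "refcount" is arbitrary and only
+ * for illustration); the caller checks no return values because each
+ * wrapper VERIFYs success internally:
+ *
+ *	nvlist_t *nvl = fnvlist_alloc();
+ *	fnvlist_add_uint64(nvl, "refcount", 1);
+ *	uint64_t v = fnvlist_lookup_uint64(nvl, "refcount");
+ *	fnvlist_free(nvl);
+ */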
+
+nvlist_t *
+fnvlist_alloc(void)
+{
+ nvlist_t *nvl;
+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP));
+ return (nvl);
+}
+
+void
+fnvlist_free(nvlist_t *nvl)
+{
+ nvlist_free(nvl);
+}
+
+size_t
+fnvlist_size(nvlist_t *nvl)
+{
+ size_t size;
+ VERIFY0(nvlist_size(nvl, &size, NV_ENCODE_NATIVE));
+ return (size);
+}
+
+/*
+ * Returns allocated buffer of size *sizep. Caller must free the buffer with
+ * fnvlist_pack_free().
+ */
+char *
+fnvlist_pack(nvlist_t *nvl, size_t *sizep)
+{
+ char *packed = 0;
+ VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE,
+ KM_SLEEP), ==, 0);
+ return (packed);
+}
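+
+/*
+ * Usage sketch of the pack/free pairing described above ("nvl" stands
+ * for any caller-owned nvlist; the transport step is elided):
+ *
+ *	size_t sz;
+ *	char *buf = fnvlist_pack(nvl, &sz);
+ *	... copy or transmit buf[0 .. sz - 1] ...
+ *	fnvlist_pack_free(buf, sz);
+ */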
+
+/*ARGSUSED*/
+void
+fnvlist_pack_free(char *pack, size_t size)
+{
+#ifdef _KERNEL
+ kmem_free(pack, size);
+#else
+ free(pack);
+#endif
+}
+
+nvlist_t *
+fnvlist_unpack(char *buf, size_t buflen)
+{
+ nvlist_t *rv;
+ VERIFY0(nvlist_unpack(buf, buflen, &rv, KM_SLEEP));
+ return (rv);
+}
+
+nvlist_t *
+fnvlist_dup(nvlist_t *nvl)
+{
+ nvlist_t *rv;
+ VERIFY0(nvlist_dup(nvl, &rv, KM_SLEEP));
+ return (rv);
+}
+
+void
+fnvlist_merge(nvlist_t *dst, nvlist_t *src)
+{
+ VERIFY0(nvlist_merge(dst, src, KM_SLEEP));
+}
+
+size_t
+fnvlist_num_pairs(nvlist_t *nvl)
+{
+ size_t count = 0;
+ nvpair_t *pair;
+
+ for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL;
+ pair = nvlist_next_nvpair(nvl, pair))
+ count++;
+ return (count);
+}
+
+void
+fnvlist_add_boolean(nvlist_t *nvl, const char *name)
+{
+ VERIFY0(nvlist_add_boolean(nvl, name));
+}
+
+void
+fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
+{
+ VERIFY0(nvlist_add_boolean_value(nvl, name, val));
+}
+
+void
+fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
+{
+ VERIFY0(nvlist_add_byte(nvl, name, val));
+}
+
+void
+fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
+{
+ VERIFY0(nvlist_add_int8(nvl, name, val));
+}
+
+void
+fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
+{
+ VERIFY0(nvlist_add_uint8(nvl, name, val));
+}
+
+void
+fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
+{
+ VERIFY0(nvlist_add_int16(nvl, name, val));
+}
+
+void
+fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
+{
+ VERIFY0(nvlist_add_uint16(nvl, name, val));
+}
+
+void
+fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
+{
+ VERIFY0(nvlist_add_int32(nvl, name, val));
+}
+
+void
+fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
+{
+ VERIFY0(nvlist_add_uint32(nvl, name, val));
+}
+
+void
+fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
+{
+ VERIFY0(nvlist_add_int64(nvl, name, val));
+}
+
+void
+fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
+{
+ VERIFY0(nvlist_add_uint64(nvl, name, val));
+}
+
+void
+fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
+{
+ VERIFY0(nvlist_add_string(nvl, name, val));
+}
+
+void
+fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
+{
+ VERIFY0(nvlist_add_nvlist(nvl, name, val));
+}
+
+void
+fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair)
+{
+ VERIFY0(nvlist_add_nvpair(nvl, pair));
+}
+
+void
+fnvlist_add_boolean_array(nvlist_t *nvl, const char *name,
+ boolean_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_boolean_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_byte_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int8_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint8_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int16_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint16_array(nvlist_t *nvl, const char *name,
+ uint16_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint16_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int32_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint32_array(nvlist_t *nvl, const char *name,
+ uint32_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint32_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int64_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint64_array(nvlist_t *nvl, const char *name,
+ uint64_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint64_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_string_array(nvlist_t *nvl, const char *name,
+ char * const *val, uint_t n)
+{
+ VERIFY0(nvlist_add_string_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name,
+ nvlist_t **val, uint_t n)
+{
+ VERIFY0(nvlist_add_nvlist_array(nvl, name, val, n));
+}
+
+void
+fnvlist_remove(nvlist_t *nvl, const char *name)
+{
+ VERIFY0(nvlist_remove_all(nvl, name));
+}
+
+void
+fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair)
+{
+ VERIFY0(nvlist_remove_nvpair(nvl, pair));
+}
+
+nvpair_t *
+fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name)
+{
+ nvpair_t *rv;
+ VERIFY0(nvlist_lookup_nvpair(nvl, name, &rv));
+ return (rv);
+}
+
+/* returns B_TRUE if the entry exists */
+boolean_t
+fnvlist_lookup_boolean(nvlist_t *nvl, const char *name)
+{
+ return (nvlist_lookup_boolean(nvl, name) == 0);
+}
+
+boolean_t
+fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name)
+{
+ boolean_t rv;
+ VERIFY0(nvlist_lookup_boolean_value(nvl, name, &rv));
+ return (rv);
+}
+
+uchar_t
+fnvlist_lookup_byte(nvlist_t *nvl, const char *name)
+{
+ uchar_t rv;
+ VERIFY0(nvlist_lookup_byte(nvl, name, &rv));
+ return (rv);
+}
+
+int8_t
+fnvlist_lookup_int8(nvlist_t *nvl, const char *name)
+{
+ int8_t rv;
+ VERIFY0(nvlist_lookup_int8(nvl, name, &rv));
+ return (rv);
+}
+
+int16_t
+fnvlist_lookup_int16(nvlist_t *nvl, const char *name)
+{
+ int16_t rv;
+ VERIFY0(nvlist_lookup_int16(nvl, name, &rv));
+ return (rv);
+}
+
+int32_t
+fnvlist_lookup_int32(nvlist_t *nvl, const char *name)
+{
+ int32_t rv;
+ VERIFY0(nvlist_lookup_int32(nvl, name, &rv));
+ return (rv);
+}
+
+int64_t
+fnvlist_lookup_int64(nvlist_t *nvl, const char *name)
+{
+ int64_t rv;
+ VERIFY0(nvlist_lookup_int64(nvl, name, &rv));
+ return (rv);
+}
+
+uint8_t
+fnvlist_lookup_uint8(nvlist_t *nvl, const char *name)
+{
+ uint8_t rv;
+ VERIFY0(nvlist_lookup_uint8(nvl, name, &rv));
+ return (rv);
+}
+
+uint16_t
+fnvlist_lookup_uint16(nvlist_t *nvl, const char *name)
+{
+ uint16_t rv;
+ VERIFY0(nvlist_lookup_uint16(nvl, name, &rv));
+ return (rv);
+}
+
+uint32_t
+fnvlist_lookup_uint32(nvlist_t *nvl, const char *name)
+{
+ uint32_t rv;
+ VERIFY0(nvlist_lookup_uint32(nvl, name, &rv));
+ return (rv);
+}
+
+uint64_t
+fnvlist_lookup_uint64(nvlist_t *nvl, const char *name)
+{
+ uint64_t rv;
+ VERIFY0(nvlist_lookup_uint64(nvl, name, &rv));
+ return (rv);
+}
+
+char *
+fnvlist_lookup_string(nvlist_t *nvl, const char *name)
+{
+ char *rv;
+ VERIFY0(nvlist_lookup_string(nvl, name, &rv));
+ return (rv);
+}
+
+nvlist_t *
+fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name)
+{
+ nvlist_t *rv;
+ VERIFY0(nvlist_lookup_nvlist(nvl, name, &rv));
+ return (rv);
+}
+
+boolean_t *
+fnvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ boolean_t *rv;
+ VERIFY0(nvlist_lookup_boolean_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+uchar_t *
+fnvlist_lookup_byte_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ uchar_t *rv;
+ VERIFY0(nvlist_lookup_byte_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+int8_t *
+fnvlist_lookup_int8_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ int8_t *rv;
+ VERIFY0(nvlist_lookup_int8_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+uint8_t *
+fnvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ uint8_t *rv;
+ VERIFY0(nvlist_lookup_uint8_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+int16_t *
+fnvlist_lookup_int16_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ int16_t *rv;
+ VERIFY0(nvlist_lookup_int16_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+uint16_t *
+fnvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ uint16_t *rv;
+ VERIFY0(nvlist_lookup_uint16_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+int32_t *
+fnvlist_lookup_int32_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ int32_t *rv;
+ VERIFY0(nvlist_lookup_int32_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+uint32_t *
+fnvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ uint32_t *rv;
+ VERIFY0(nvlist_lookup_uint32_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+int64_t *
+fnvlist_lookup_int64_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ int64_t *rv;
+ VERIFY0(nvlist_lookup_int64_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+uint64_t *
+fnvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ uint64_t *rv;
+ VERIFY0(nvlist_lookup_uint64_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+boolean_t
+fnvpair_value_boolean_value(nvpair_t *nvp)
+{
+ boolean_t rv;
+ VERIFY0(nvpair_value_boolean_value(nvp, &rv));
+ return (rv);
+}
+
+uchar_t
+fnvpair_value_byte(nvpair_t *nvp)
+{
+ uchar_t rv;
+ VERIFY0(nvpair_value_byte(nvp, &rv));
+ return (rv);
+}
+
+int8_t
+fnvpair_value_int8(nvpair_t *nvp)
+{
+ int8_t rv;
+ VERIFY0(nvpair_value_int8(nvp, &rv));
+ return (rv);
+}
+
+int16_t
+fnvpair_value_int16(nvpair_t *nvp)
+{
+ int16_t rv;
+ VERIFY0(nvpair_value_int16(nvp, &rv));
+ return (rv);
+}
+
+int32_t
+fnvpair_value_int32(nvpair_t *nvp)
+{
+ int32_t rv;
+ VERIFY0(nvpair_value_int32(nvp, &rv));
+ return (rv);
+}
+
+int64_t
+fnvpair_value_int64(nvpair_t *nvp)
+{
+ int64_t rv;
+ VERIFY0(nvpair_value_int64(nvp, &rv));
+ return (rv);
+}
+
+uint8_t
+fnvpair_value_uint8(nvpair_t *nvp)
+{
+ uint8_t rv;
+ VERIFY0(nvpair_value_uint8(nvp, &rv));
+ return (rv);
+}
+
+uint16_t
+fnvpair_value_uint16(nvpair_t *nvp)
+{
+ uint16_t rv;
+ VERIFY0(nvpair_value_uint16(nvp, &rv));
+ return (rv);
+}
+
+uint32_t
+fnvpair_value_uint32(nvpair_t *nvp)
+{
+ uint32_t rv;
+ VERIFY0(nvpair_value_uint32(nvp, &rv));
+ return (rv);
+}
+
+uint64_t
+fnvpair_value_uint64(nvpair_t *nvp)
+{
+ uint64_t rv;
+ VERIFY0(nvpair_value_uint64(nvp, &rv));
+ return (rv);
+}
+
+char *
+fnvpair_value_string(nvpair_t *nvp)
+{
+ char *rv;
+ VERIFY0(nvpair_value_string(nvp, &rv));
+ return (rv);
+}
+
+nvlist_t *
+fnvpair_value_nvlist(nvpair_t *nvp)
+{
+ nvlist_t *rv;
+ VERIFY0(nvpair_value_nvlist(nvp, &rv));
+ return (rv);
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(fnvlist_alloc);
+EXPORT_SYMBOL(fnvlist_free);
+EXPORT_SYMBOL(fnvlist_size);
+EXPORT_SYMBOL(fnvlist_pack);
+EXPORT_SYMBOL(fnvlist_pack_free);
+EXPORT_SYMBOL(fnvlist_unpack);
+EXPORT_SYMBOL(fnvlist_dup);
+EXPORT_SYMBOL(fnvlist_merge);
+
+EXPORT_SYMBOL(fnvlist_add_nvpair);
+EXPORT_SYMBOL(fnvlist_add_boolean);
+EXPORT_SYMBOL(fnvlist_add_boolean_value);
+EXPORT_SYMBOL(fnvlist_add_byte);
+EXPORT_SYMBOL(fnvlist_add_int8);
+EXPORT_SYMBOL(fnvlist_add_uint8);
+EXPORT_SYMBOL(fnvlist_add_int16);
+EXPORT_SYMBOL(fnvlist_add_uint16);
+EXPORT_SYMBOL(fnvlist_add_int32);
+EXPORT_SYMBOL(fnvlist_add_uint32);
+EXPORT_SYMBOL(fnvlist_add_int64);
+EXPORT_SYMBOL(fnvlist_add_uint64);
+EXPORT_SYMBOL(fnvlist_add_string);
+EXPORT_SYMBOL(fnvlist_add_nvlist);
+EXPORT_SYMBOL(fnvlist_add_boolean_array);
+EXPORT_SYMBOL(fnvlist_add_byte_array);
+EXPORT_SYMBOL(fnvlist_add_int8_array);
+EXPORT_SYMBOL(fnvlist_add_uint8_array);
+EXPORT_SYMBOL(fnvlist_add_int16_array);
+EXPORT_SYMBOL(fnvlist_add_uint16_array);
+EXPORT_SYMBOL(fnvlist_add_int32_array);
+EXPORT_SYMBOL(fnvlist_add_uint32_array);
+EXPORT_SYMBOL(fnvlist_add_int64_array);
+EXPORT_SYMBOL(fnvlist_add_uint64_array);
+EXPORT_SYMBOL(fnvlist_add_string_array);
+EXPORT_SYMBOL(fnvlist_add_nvlist_array);
+
+EXPORT_SYMBOL(fnvlist_remove);
+EXPORT_SYMBOL(fnvlist_remove_nvpair);
+
+EXPORT_SYMBOL(fnvlist_lookup_nvpair);
+EXPORT_SYMBOL(fnvlist_lookup_boolean);
+EXPORT_SYMBOL(fnvlist_lookup_boolean_value);
+EXPORT_SYMBOL(fnvlist_lookup_byte);
+EXPORT_SYMBOL(fnvlist_lookup_int8);
+EXPORT_SYMBOL(fnvlist_lookup_uint8);
+EXPORT_SYMBOL(fnvlist_lookup_int16);
+EXPORT_SYMBOL(fnvlist_lookup_uint16);
+EXPORT_SYMBOL(fnvlist_lookup_int32);
+EXPORT_SYMBOL(fnvlist_lookup_uint32);
+EXPORT_SYMBOL(fnvlist_lookup_int64);
+EXPORT_SYMBOL(fnvlist_lookup_uint64);
+EXPORT_SYMBOL(fnvlist_lookup_string);
+EXPORT_SYMBOL(fnvlist_lookup_nvlist);
+
+EXPORT_SYMBOL(fnvpair_value_boolean_value);
+EXPORT_SYMBOL(fnvpair_value_byte);
+EXPORT_SYMBOL(fnvpair_value_int8);
+EXPORT_SYMBOL(fnvpair_value_uint8);
+EXPORT_SYMBOL(fnvpair_value_int16);
+EXPORT_SYMBOL(fnvpair_value_uint16);
+EXPORT_SYMBOL(fnvpair_value_int32);
+EXPORT_SYMBOL(fnvpair_value_uint32);
+EXPORT_SYMBOL(fnvpair_value_int64);
+EXPORT_SYMBOL(fnvpair_value_uint64);
+EXPORT_SYMBOL(fnvpair_value_string);
+EXPORT_SYMBOL(fnvpair_value_nvlist);
+EXPORT_SYMBOL(fnvlist_num_pairs);
+
+#endif
diff --git a/sys/contrib/openzfs/module/nvpair/nvpair.c b/sys/contrib/openzfs/module/nvpair/nvpair.c
new file mode 100644
index 000000000000..990a4482c993
--- /dev/null
+++ b/sys/contrib/openzfs/module/nvpair/nvpair.c
@@ -0,0 +1,3738 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
+ * Copyright 2018 RackTop Systems.
+ */
+
+/*
+ * Links to Illumos.org for more information on Interface Libraries:
+ * [1] https://illumos.org/man/3lib/libnvpair
+ * [2] https://illumos.org/man/3nvpair/nvlist_alloc
+ * [3] https://illumos.org/man/9f/nvlist_alloc
+ * [4] https://illumos.org/man/9f/nvlist_next_nvpair
+ * [5] https://illumos.org/man/9f/nvpair_value_byte
+ */
+
+#include <sys/debug.h>
+#include <sys/isa_defs.h>
+#include <sys/nvpair.h>
+#include <sys/nvpair_impl.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/strings.h>
+#include <rpc/xdr.h>
+#include <sys/mod.h>
+
+#if defined(_KERNEL)
+#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
+#else
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stddef.h>
+#endif
+
+#define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++
+
+/*
+ * nvpair.c - Provides kernel & userland interfaces for manipulating
+ * name-value pairs.
+ *
+ * Overview Diagram
+ *
+ * +--------------+
+ * | nvlist_t |
+ * |--------------|
+ * | nvl_version |
+ * | nvl_nvflag |
+ * | nvl_priv -+-+
+ * | nvl_flag | |
+ * | nvl_pad | |
+ * +--------------+ |
+ * V
+ * +--------------+ last i_nvp in list
+ * | nvpriv_t | +--------------------->
+ * |--------------| |
+ * +--+- nvp_list | | +------------+
+ * | | nvp_last -+--+ + nv_alloc_t |
+ * | | nvp_curr | |------------|
+ * | | nvp_nva -+----> | nva_ops |
+ * | | nvp_stat | | nva_arg |
+ * | +--------------+ +------------+
+ * |
+ * +-------+
+ * V
+ * +---------------------+ +-------------------+
+ * | i_nvp_t | +-->| i_nvp_t | +-->
+ * |---------------------| | |-------------------| |
+ * | nvi_next -+--+ | nvi_next -+--+
+ * | nvi_prev (NULL) | <----+ nvi_prev |
+ * | . . . . . . . . . . | | . . . . . . . . . |
+ * | nvp (nvpair_t) | | nvp (nvpair_t) |
+ * | - nvp_size | | - nvp_size |
+ * | - nvp_name_sz | | - nvp_name_sz |
+ * | - nvp_value_elem | | - nvp_value_elem |
+ * | - nvp_type | | - nvp_type |
+ * | - data ... | | - data ... |
+ * +---------------------+ +-------------------+
+ *
+ *
+ *
+ * +---------------------+ +---------------------+
+ * | i_nvp_t | +--> +-->| i_nvp_t (last) |
+ * |---------------------| | | |---------------------|
+ * | nvi_next -+--+ ... --+ | nvi_next (NULL) |
+ * <-+- nvi_prev |<-- ... <----+ nvi_prev |
+ * | . . . . . . . . . | | . . . . . . . . . |
+ * | nvp (nvpair_t) | | nvp (nvpair_t) |
+ * | - nvp_size | | - nvp_size |
+ * | - nvp_name_sz | | - nvp_name_sz |
+ * | - nvp_value_elem | | - nvp_value_elem |
+ * | - DATA_TYPE_NVLIST | | - nvp_type |
+ * | - data (embedded) | | - data ... |
+ * | nvlist name | +---------------------+
+ * | +--------------+ |
+ * | | nvlist_t | |
+ * | |--------------| |
+ * | | nvl_version | |
+ * | | nvl_nvflag | |
+ * | | nvl_priv --+---+---->
+ * | | nvl_flag | |
+ * | | nvl_pad | |
+ * | +--------------+ |
+ * +---------------------+
+ *
+ *
+ * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will
+ * allow value to be aligned on 8 byte boundary
+ *
+ * name_len is the length of the name string including the null terminator
+ * so it must be >= 1
+ */
+#define NVP_SIZE_CALC(name_len, data_len) \
+ (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len))
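+
+/*
+ * Worked example of NVP_SIZE_CALC(), assuming 8-byte NV_ALIGN and a
+ * 16-byte nvpair_t header: a pair named "pool" (name_len == 5 with the
+ * terminator) holding one uint64_t (data_len == 8) needs
+ * NV_ALIGN(16 + 5) + NV_ALIGN(8) == 24 + 8 == 32 bytes.
+ */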
+
+static int i_get_value_size(data_type_t type, const void *data, uint_t nelem);
+static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
+ uint_t nelem, const void *data);
+
+#define NV_STAT_EMBEDDED 0x1
+#define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp))
+#define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp))
+
+#define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz))
+#define NVPAIR2I_NVP(nvp) \
+ ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp)))
+
+#ifdef _KERNEL
+int nvpair_max_recursion = 20;
+#else
+int nvpair_max_recursion = 100;
+#endif
+
+uint64_t nvlist_hashtable_init_size = (1 << 4);
+
+int
+nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...)
+{
+ va_list valist;
+ int err = 0;
+
+ nva->nva_ops = nvo;
+ nva->nva_arg = NULL;
+
+ va_start(valist, nvo);
+ if (nva->nva_ops->nv_ao_init != NULL)
+ err = nva->nva_ops->nv_ao_init(nva, valist);
+ va_end(valist);
+
+ return (err);
+}
+
+void
+nv_alloc_reset(nv_alloc_t *nva)
+{
+ if (nva->nva_ops->nv_ao_reset != NULL)
+ nva->nva_ops->nv_ao_reset(nva);
+}
+
+void
+nv_alloc_fini(nv_alloc_t *nva)
+{
+ if (nva->nva_ops->nv_ao_fini != NULL)
+ nva->nva_ops->nv_ao_fini(nva);
+}
+
+nv_alloc_t *
+nvlist_lookup_nv_alloc(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ return (priv->nvp_nva);
+}
+
+static void *
+nv_mem_zalloc(nvpriv_t *nvp, size_t size)
+{
+ nv_alloc_t *nva = nvp->nvp_nva;
+ void *buf;
+
+ if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL)
+ bzero(buf, size);
+
+ return (buf);
+}
+
+static void
+nv_mem_free(nvpriv_t *nvp, void *buf, size_t size)
+{
+ nv_alloc_t *nva = nvp->nvp_nva;
+
+ nva->nva_ops->nv_ao_free(nva, buf, size);
+}
+
+static void
+nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat)
+{
+ bzero(priv, sizeof (nvpriv_t));
+
+ priv->nvp_nva = nva;
+ priv->nvp_stat = stat;
+}
+
+static nvpriv_t *
+nv_priv_alloc(nv_alloc_t *nva)
+{
+ nvpriv_t *priv;
+
+ /*
+	 * nv_mem_alloc() cannot be called here because it needs the priv
+ * argument.
+ */
+ if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL)
+ return (NULL);
+
+ nv_priv_init(priv, nva, 0);
+
+ return (priv);
+}
+
+/*
+ * Embedded lists need their own nvpriv_t's. We create a new
+ * nvpriv_t using the parameters and allocator from the parent
+ * list's nvpriv_t.
+ */
+static nvpriv_t *
+nv_priv_alloc_embedded(nvpriv_t *priv)
+{
+ nvpriv_t *emb_priv;
+
+ if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL)
+ return (NULL);
+
+ nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED);
+
+ return (emb_priv);
+}
+
+static int
+nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets)
+{
+ ASSERT3P(priv->nvp_hashtable, ==, NULL);
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+
+ i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *));
+ if (tab == NULL)
+ return (ENOMEM);
+
+ priv->nvp_hashtable = tab;
+ priv->nvp_nbuckets = buckets;
+ return (0);
+}
+
+static void
+nvt_tab_free(nvpriv_t *priv)
+{
+ i_nvp_t **tab = priv->nvp_hashtable;
+ if (tab == NULL) {
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+ return;
+ }
+
+ nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *));
+
+ priv->nvp_hashtable = NULL;
+ priv->nvp_nbuckets = 0;
+ priv->nvp_nentries = 0;
+}
+
+static uint32_t
+nvt_hash(const char *p)
+{
+ uint32_t g, hval = 0;
+
+ while (*p) {
+ hval = (hval << 4) + *p++;
+ if ((g = (hval & 0xf0000000)) != 0)
+ hval ^= g >> 24;
+ hval &= ~g;
+ }
+ return (hval);
+}
+
+static boolean_t
+nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag)
+{
+ boolean_t match = B_FALSE;
+ if (nvflag & NV_UNIQUE_NAME_TYPE) {
+ if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 &&
+ NVP_TYPE(nvp1) == NVP_TYPE(nvp2))
+ match = B_TRUE;
+ } else {
+ ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME);
+ if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0)
+ match = B_TRUE;
+ }
+ return (match);
+}
+
+static nvpair_t *
+nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ ASSERT(priv != NULL);
+
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ if (tab == NULL) {
+ ASSERT3P(priv->nvp_list, ==, NULL);
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+ return (NULL);
+ } else {
+ ASSERT(priv->nvp_nbuckets != 0);
+ }
+
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *entry = tab[index];
+
+ for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) {
+ if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 &&
+ (type == DATA_TYPE_DONTCARE ||
+ NVP_TYPE(&e->nvi_nvp) == type))
+ return (&e->nvi_nvp);
+ }
+ return (NULL);
+}
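+
+/*
+ * Note on the lookup above: because the bucket count is always a power
+ * of two, masking with (nvp_nbuckets - 1) is equivalent to taking the
+ * hash modulo the bucket count.  For example, with 16 buckets a name
+ * hashing to 0x2b is placed in bucket 0x2b & 0xf == 0xb.
+ */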
+
+static nvpair_t *
+nvt_lookup_name(nvlist_t *nvl, const char *name)
+{
+ return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE));
+}
+
+static int
+nvt_resize(nvpriv_t *priv, uint32_t new_size)
+{
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ /*
+ * Migrate all the entries from the current table
+ * to a newly-allocated table with the new size by
+ * re-adjusting the pointers of their entries.
+ */
+ uint32_t size = priv->nvp_nbuckets;
+ uint32_t new_mask = new_size - 1;
+ ASSERT(ISP2(new_size));
+
+ i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *));
+ if (new_tab == NULL)
+ return (ENOMEM);
+
+ uint32_t nentries = 0;
+ for (uint32_t i = 0; i < size; i++) {
+ i_nvp_t *next, *e = tab[i];
+
+ while (e != NULL) {
+ next = e->nvi_hashtable_next;
+
+ uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp));
+ uint32_t index = hash & new_mask;
+
+ e->nvi_hashtable_next = new_tab[index];
+ new_tab[index] = e;
+ nentries++;
+
+ e = next;
+ }
+ tab[i] = NULL;
+ }
+ ASSERT3U(nentries, ==, priv->nvp_nentries);
+
+ nvt_tab_free(priv);
+
+ priv->nvp_hashtable = new_tab;
+ priv->nvp_nbuckets = new_size;
+ priv->nvp_nentries = nentries;
+
+ return (0);
+}
+
+static boolean_t
+nvt_needs_togrow(nvpriv_t *priv)
+{
+ /*
+ * Grow only when we have more elements than buckets
+ * and the # of buckets doesn't overflow.
+ */
+ return (priv->nvp_nentries > priv->nvp_nbuckets &&
+ (UINT32_MAX >> 1) >= priv->nvp_nbuckets);
+}
+
+/*
+ * Allocate a new table that's twice the size of the old one,
+ * and migrate all the entries from the old one to the new
+ * one by re-adjusting their pointers.
+ */
+static int
+nvt_grow(nvpriv_t *priv)
+{
+ uint32_t current_size = priv->nvp_nbuckets;
+ /* ensure we won't overflow */
+ ASSERT3U(UINT32_MAX >> 1, >=, current_size);
+ return (nvt_resize(priv, current_size << 1));
+}
+
+static boolean_t
+nvt_needs_toshrink(nvpriv_t *priv)
+{
+ /*
+ * Shrink only when the # of elements is less than or
+ * equal to 1/4 the # of buckets. Never shrink less than
+ * nvlist_hashtable_init_size.
+ */
+ ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size);
+ if (priv->nvp_nbuckets == nvlist_hashtable_init_size)
+ return (B_FALSE);
+ return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2));
+}
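+
+/*
+ * Worked example of the resize policy: starting from
+ * nvlist_hashtable_init_size (16 buckets), the table is doubled to 32
+ * buckets once it holds more entries than buckets, and a 32-bucket
+ * table is halved back to 16 on a removal that finds 8 or fewer
+ * entries.  It never shrinks below the initial 16 buckets.
+ */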
+
+/*
+ * Allocate a new table that's half the size of the old one,
+ * and migrate all the entries from the old one to the new
+ * one by re-adjusting their pointers.
+ */
+static int
+nvt_shrink(nvpriv_t *priv)
+{
+ uint32_t current_size = priv->nvp_nbuckets;
+ /* ensure we won't overflow */
+ ASSERT3U(current_size, >=, nvlist_hashtable_init_size);
+ return (nvt_resize(priv, current_size >> 1));
+}
+
+static int
+nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+
+ if (nvt_needs_toshrink(priv)) {
+ int err = nvt_shrink(priv);
+ if (err != 0)
+ return (err);
+ }
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ char *name = NVP_NAME(nvp);
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *bucket = tab[index];
+
+ for (i_nvp_t *prev = NULL, *e = bucket;
+ e != NULL; prev = e, e = e->nvi_hashtable_next) {
+ if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_nvflag)) {
+ if (prev != NULL) {
+ prev->nvi_hashtable_next =
+ e->nvi_hashtable_next;
+ } else {
+ ASSERT3P(e, ==, bucket);
+ tab[index] = e->nvi_hashtable_next;
+ }
+ e->nvi_hashtable_next = NULL;
+ priv->nvp_nentries--;
+ break;
+ }
+ }
+
+ return (0);
+}
+
+static int
+nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+
+ /* initialize nvpair table now if it doesn't exist. */
+ if (priv->nvp_hashtable == NULL) {
+ int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size);
+ if (err != 0)
+ return (err);
+ }
+
+ /*
+ * if we don't allow duplicate entries, make sure to
+ * unlink any existing entries from the table.
+ */
+ if (nvl->nvl_nvflag != 0) {
+ int err = nvt_remove_nvpair(nvl, nvp);
+ if (err != 0)
+ return (err);
+ }
+
+ if (nvt_needs_togrow(priv)) {
+ int err = nvt_grow(priv);
+ if (err != 0)
+ return (err);
+ }
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ char *name = NVP_NAME(nvp);
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *bucket = tab[index];
+
+ /* insert link at the beginning of the bucket */
+ i_nvp_t *new_entry = NVPAIR2I_NVP(nvp);
+ ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL);
+ new_entry->nvi_hashtable_next = bucket;
+ tab[index] = new_entry;
+
+ priv->nvp_nentries++;
+ return (0);
+}
+
+static void
+nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
+{
+ nvl->nvl_version = NV_VERSION;
+ nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE);
+ nvl->nvl_priv = (uint64_t)(uintptr_t)priv;
+ nvl->nvl_flag = 0;
+ nvl->nvl_pad = 0;
+}
+
+uint_t
+nvlist_nvflag(nvlist_t *nvl)
+{
+ return (nvl->nvl_nvflag);
+}
+
+static nv_alloc_t *
+nvlist_nv_alloc(int kmflag)
+{
+#if defined(_KERNEL)
+ switch (kmflag) {
+ case KM_SLEEP:
+ return (nv_alloc_sleep);
+ case KM_NOSLEEP:
+ return (nv_alloc_nosleep);
+ default:
+ return (nv_alloc_pushpage);
+ }
+#else
+ return (nv_alloc_nosleep);
+#endif /* _KERNEL */
+}
+
+/*
+ * nvlist_alloc - Allocate nvlist.
+ */
+int
+nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag)
+{
+ return (nvlist_xalloc(nvlp, nvflag, nvlist_nv_alloc(kmflag)));
+}
+
+int
+nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva)
+{
+ nvpriv_t *priv;
+
+ if (nvlp == NULL || nva == NULL)
+ return (EINVAL);
+
+ if ((priv = nv_priv_alloc(nva)) == NULL)
+ return (ENOMEM);
+
+ if ((*nvlp = nv_mem_zalloc(priv,
+ NV_ALIGN(sizeof (nvlist_t)))) == NULL) {
+ nv_mem_free(priv, priv, sizeof (nvpriv_t));
+ return (ENOMEM);
+ }
+
+ nvlist_init(*nvlp, nvflag, priv);
+
+ return (0);
+}
+
+/*
+ * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair.
+ */
+static nvpair_t *
+nvp_buf_alloc(nvlist_t *nvl, size_t len)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *buf;
+ nvpair_t *nvp;
+ size_t nvsize;
+
+ /*
+ * Allocate the buffer
+ */
+ nvsize = len + offsetof(i_nvp_t, nvi_nvp);
+
+ if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL)
+ return (NULL);
+
+ nvp = &buf->nvi_nvp;
+ nvp->nvp_size = len;
+
+ return (nvp);
+}
+
+/*
+ * nvp_buf_free - de-allocate an i_nvp_t.
+ */
+static void
+nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp);
+
+ nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize);
+}
+
+/*
+ * nvp_buf_link - link a new nv pair into the nvlist.
+ */
+static void
+nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr = NVPAIR2I_NVP(nvp);
+
+ /* Put element at end of nvlist */
+ if (priv->nvp_list == NULL) {
+ priv->nvp_list = priv->nvp_last = curr;
+ } else {
+ curr->nvi_prev = priv->nvp_last;
+ priv->nvp_last->nvi_next = curr;
+ priv->nvp_last = curr;
+ }
+}
+
+/*
+ * nvp_buf_unlink - unlink a removed nvpair from the nvlist.
+ */
+static void
+nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr = NVPAIR2I_NVP(nvp);
+
+ /*
+ * protect nvlist_next_nvpair() against walking on freed memory.
+ */
+ if (priv->nvp_curr == curr)
+ priv->nvp_curr = curr->nvi_next;
+
+ if (curr == priv->nvp_list)
+ priv->nvp_list = curr->nvi_next;
+ else
+ curr->nvi_prev->nvi_next = curr->nvi_next;
+
+ if (curr == priv->nvp_last)
+ priv->nvp_last = curr->nvi_prev;
+ else
+ curr->nvi_next->nvi_prev = curr->nvi_prev;
+}
+
+/*
+ * Take an nvpair type and a number of elements and make sure they are valid.
+ */
+static int
+i_validate_type_nelem(data_type_t type, uint_t nelem)
+{
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ if (nelem != 0)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_STRING:
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_NVLIST:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ if (nelem != 1)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ /* we allow arrays with 0 elements */
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * Verify nvp_name_sz and check the name string length.
+ */
+static int
+i_validate_nvpair_name(nvpair_t *nvp)
+{
+ if ((nvp->nvp_name_sz <= 0) ||
+ (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0)))
+ return (EFAULT);
+
+	/* verify the name string, make sure it's terminated */
+ if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0')
+ return (EFAULT);
+
+ return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT);
+}
+
+static int
+i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data)
+{
+ switch (type) {
+ case DATA_TYPE_BOOLEAN_VALUE:
+ if (*(boolean_t *)data != B_TRUE &&
+ *(boolean_t *)data != B_FALSE)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY: {
+ int i;
+
+ for (i = 0; i < nelem; i++)
+ if (((boolean_t *)data)[i] != B_TRUE &&
+ ((boolean_t *)data)[i] != B_FALSE)
+ return (EINVAL);
+ break;
+ }
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * This function takes a pointer to what should be an nvpair and its size
+ * and then verifies that all the nvpair fields make sense and can be
+ * trusted. This function is used when decoding packed nvpairs.
+ */
+static int
+i_validate_nvpair(nvpair_t *nvp)
+{
+ data_type_t type = NVP_TYPE(nvp);
+ int size1, size2;
+
+ /* verify nvp_name_sz, check the name string length */
+ if (i_validate_nvpair_name(nvp) != 0)
+ return (EFAULT);
+
+ if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0)
+ return (EFAULT);
+
+ /*
+ * verify nvp_type, nvp_value_elem, and also possibly
+ * verify string values and get the value size.
+ */
+ size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
+ size1 = nvp->nvp_size - NVP_VALOFF(nvp);
+ if (size2 < 0 || size1 != NV_ALIGN(size2))
+ return (EFAULT);
+
+ return (0);
+}
+
+static int
+nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL)
+ return (EINVAL);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ nvpair_t *nvp = &curr->nvi_nvp;
+ int err;
+
+ if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp),
+ NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0)
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Frees all memory allocated for an nvpair (like embedded lists) with
+ * the exception of the nvpair buffer itself.
+ */
+static void
+nvpair_free(nvpair_t *nvp)
+{
+ switch (NVP_TYPE(nvp)) {
+ case DATA_TYPE_NVLIST:
+ nvlist_free(EMBEDDED_NVL(nvp));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ int i;
+
+ for (i = 0; i < NVP_NELEM(nvp); i++)
+ if (nvlp[i] != NULL)
+ nvlist_free(nvlp[i]);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+/*
+ * nvlist_free - free an unpacked nvlist
+ */
+void
+nvlist_free(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return;
+
+ /*
+	 * Unpacked nvlists are linked through i_nvp_t
+ */
+ curr = priv->nvp_list;
+ while (curr != NULL) {
+ nvpair_t *nvp = &curr->nvi_nvp;
+ curr = curr->nvi_next;
+
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ }
+
+ if (!(priv->nvp_stat & NV_STAT_EMBEDDED))
+ nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t)));
+ else
+ nvl->nvl_priv = 0;
+
+ nvt_tab_free(priv);
+ nv_mem_free(priv, priv, sizeof (nvpriv_t));
+}
+
+static int
+nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+
+ if (nvp == NULL)
+ return (0);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
+ if (&curr->nvi_nvp == nvp)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Make a copy of nvlist
+ */
+int
+nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag)
+{
+ return (nvlist_xdup(nvl, nvlp, nvlist_nv_alloc(kmflag)));
+}
+
+int
+nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
+{
+ int err;
+ nvlist_t *ret;
+
+ if (nvl == NULL || nvlp == NULL)
+ return (EINVAL);
+
+ if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0)
+ return (err);
+
+ if ((err = nvlist_copy_pairs(nvl, ret)) != 0)
+ nvlist_free(ret);
+ else
+ *nvlp = ret;
+
+ return (err);
+}
+
+/*
+ * Remove all with matching name
+ */
+int
+nvlist_remove_all(nvlist_t *nvl, const char *name)
+{
+ int error = ENOENT;
+
+ if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ nvpair_t *nvp;
+ while ((nvp = nvt_lookup_name(nvl, name)) != NULL) {
+ VERIFY0(nvlist_remove_nvpair(nvl, nvp));
+ error = 0;
+ }
+
+ return (error);
+}
+
+/*
+ * Remove first one with matching name and type
+ */
+int
+nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
+{
+ if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
+ if (nvp == NULL)
+ return (ENOENT);
+
+ return (nvlist_remove_nvpair(nvl, nvp));
+}
+
+int
+nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ if (nvl == NULL || nvp == NULL)
+ return (EINVAL);
+
+ int err = nvt_remove_nvpair(nvl, nvp);
+ if (err != 0)
+ return (err);
+
+ nvp_buf_unlink(nvl, nvp);
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (0);
+}
+
+/*
+ * This function calculates the size of an nvpair value.
+ *
+ * The data argument controls the behavior in case of the data types
+ * DATA_TYPE_STRING and
+ * DATA_TYPE_STRING_ARRAY
+ * If data == NULL then the size of the string(s) is excluded.
+ */
+static int
+i_get_value_size(data_type_t type, const void *data, uint_t nelem)
+{
+ uint64_t value_sz;
+
+ if (i_validate_type_nelem(type, nelem) != 0)
+ return (-1);
+
+ /* Calculate required size for holding value */
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ value_sz = 0;
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ value_sz = sizeof (boolean_t);
+ break;
+ case DATA_TYPE_BYTE:
+ value_sz = sizeof (uchar_t);
+ break;
+ case DATA_TYPE_INT8:
+ value_sz = sizeof (int8_t);
+ break;
+ case DATA_TYPE_UINT8:
+ value_sz = sizeof (uint8_t);
+ break;
+ case DATA_TYPE_INT16:
+ value_sz = sizeof (int16_t);
+ break;
+ case DATA_TYPE_UINT16:
+ value_sz = sizeof (uint16_t);
+ break;
+ case DATA_TYPE_INT32:
+ value_sz = sizeof (int32_t);
+ break;
+ case DATA_TYPE_UINT32:
+ value_sz = sizeof (uint32_t);
+ break;
+ case DATA_TYPE_INT64:
+ value_sz = sizeof (int64_t);
+ break;
+ case DATA_TYPE_UINT64:
+ value_sz = sizeof (uint64_t);
+ break;
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+ value_sz = sizeof (double);
+ break;
+#endif
+ case DATA_TYPE_STRING:
+ if (data == NULL)
+ value_sz = 0;
+ else
+ value_sz = strlen(data) + 1;
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (boolean_t);
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uchar_t);
+ break;
+ case DATA_TYPE_INT8_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int8_t);
+ break;
+ case DATA_TYPE_UINT8_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint8_t);
+ break;
+ case DATA_TYPE_INT16_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int16_t);
+ break;
+ case DATA_TYPE_UINT16_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint16_t);
+ break;
+ case DATA_TYPE_INT32_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int32_t);
+ break;
+ case DATA_TYPE_UINT32_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint32_t);
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int64_t);
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t);
+
+ if (data != NULL) {
+ char *const *strs = data;
+ uint_t i;
+
+ /* no alignment requirement for strings */
+ for (i = 0; i < nelem; i++) {
+ if (strs[i] == NULL)
+ return (-1);
+ value_sz += strlen(strs[i]) + 1;
+ }
+ }
+ break;
+ case DATA_TYPE_HRTIME:
+ value_sz = sizeof (hrtime_t);
+ break;
+ case DATA_TYPE_NVLIST:
+ value_sz = NV_ALIGN(sizeof (nvlist_t));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t) +
+ (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t));
+ break;
+ default:
+ return (-1);
+ }
+
+ return (value_sz > INT32_MAX ? -1 : (int)value_sz);
+}
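+
+/*
+ * Worked examples of the sizing rules above:
+ * i_get_value_size(DATA_TYPE_STRING, "on", 1) returns 3 (strlen + 1),
+ * while passing data == NULL for the same type returns 0 because the
+ * string bytes are excluded.  For a DATA_TYPE_STRING_ARRAY of the two
+ * strings "a" and "bc", the result is
+ * 2 * sizeof (uint64_t) + 2 + 3 == 21 (pointer slots plus string bytes).
+ */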
+
+static int
+nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl)
+{
+ nvpriv_t *priv;
+ int err;
+
+ if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t)
+ nvl->nvl_priv)) == NULL)
+ return (ENOMEM);
+
+ nvlist_init(emb_nvl, onvl->nvl_nvflag, priv);
+
+ if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) {
+ nvlist_free(emb_nvl);
+ emb_nvl->nvl_priv = 0;
+ }
+
+ return (err);
+}
+
+/*
+ * nvlist_add_common - Add new <name,value> pair to nvlist
+ */
+static int
+nvlist_add_common(nvlist_t *nvl, const char *name,
+ data_type_t type, uint_t nelem, const void *data)
+{
+ nvpair_t *nvp;
+ uint_t i;
+
+ int nvp_sz, name_sz, value_sz;
+ int err = 0;
+
+ if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ if (nelem != 0 && data == NULL)
+ return (EINVAL);
+
+ /*
+ * Verify type and nelem and get the value size.
+	 * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY,
+	 * the size of the string(s) is included.
+ */
+ if ((value_sz = i_get_value_size(type, data, nelem)) < 0)
+ return (EINVAL);
+
+ if (i_validate_nvpair_value(type, nelem, data) != 0)
+ return (EINVAL);
+
+ /*
+ * If we're adding an nvlist or nvlist array, ensure that we are not
+ * adding the input nvlist to itself, which would cause recursion,
+ * and ensure that no NULL nvlist pointers are present.
+ */
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ if (data == nvl || data == NULL)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **onvlp = (nvlist_t **)data;
+ for (i = 0; i < nelem; i++) {
+ if (onvlp[i] == nvl || onvlp[i] == NULL)
+ return (EINVAL);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ /* calculate sizes of the nvpair elements and the nvpair itself */
+ name_sz = strlen(name) + 1;
+ if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * NBBY - 1))
+ return (EINVAL);
+
+ nvp_sz = NVP_SIZE_CALC(name_sz, value_sz);
+
+ if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL)
+ return (ENOMEM);
+
+ ASSERT(nvp->nvp_size == nvp_sz);
+ nvp->nvp_name_sz = name_sz;
+ nvp->nvp_value_elem = nelem;
+ nvp->nvp_type = type;
+ bcopy(name, NVP_NAME(nvp), name_sz);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ break;
+ case DATA_TYPE_STRING_ARRAY: {
+ char *const *strs = data;
+ char *buf = NVP_VALUE(nvp);
+ char **cstrs = (void *)buf;
+
+ /* skip pre-allocated space for pointer array */
+ buf += nelem * sizeof (uint64_t);
+ for (i = 0; i < nelem; i++) {
+ int slen = strlen(strs[i]) + 1;
+ bcopy(strs[i], buf, slen);
+ cstrs[i] = buf;
+ buf += slen;
+ }
+ break;
+ }
+ case DATA_TYPE_NVLIST: {
+ nvlist_t *nnvl = EMBEDDED_NVL(nvp);
+ nvlist_t *onvl = (nvlist_t *)data;
+
+ if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) {
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+ break;
+ }
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **onvlp = (nvlist_t **)data;
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ nvlist_t *embedded = (nvlist_t *)
+ ((uintptr_t)nvlp + nelem * sizeof (uint64_t));
+
+ for (i = 0; i < nelem; i++) {
+ if ((err = nvlist_copy_embedded(nvl,
+ onvlp[i], embedded)) != 0) {
+ /*
+ * Free any successfully created lists
+ */
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+
+ nvlp[i] = embedded++;
+ }
+ break;
+ }
+ default:
+ bcopy(data, NVP_VALUE(nvp), value_sz);
+ }
+
+ /* if unique name, remove before add */
+ if (nvl->nvl_nvflag & NV_UNIQUE_NAME)
+ (void) nvlist_remove_all(nvl, name);
+ else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE)
+ (void) nvlist_remove(nvl, name, type);
+
+ err = nvt_add_nvpair(nvl, nvp);
+ if (err != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+ nvp_buf_link(nvl, nvp);
+
+ return (0);
+}
+
+int
+nvlist_add_boolean(nvlist_t *nvl, const char *name)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL));
+}
+
+int
+nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val));
+}
+
+int
+nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val));
+}
+
+int
+nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val));
+}
+
+int
+nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val));
+}
+
+int
+nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val));
+}
+
+int
+nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val));
+}
+
+int
+nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val));
+}
+
+int
+nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val));
+}
+
+int
+nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val));
+}
+
+int
+nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val));
+}
+
+#if !defined(_KERNEL)
+int
+nvlist_add_double(nvlist_t *nvl, const char *name, double val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val));
+}
+#endif
+
+int
+nvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val));
+}
+
+int
+nvlist_add_boolean_array(nvlist_t *nvl, const char *name,
+ boolean_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a));
+}
+
+int
+nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
+}
+
+int
+nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_string_array(nvlist_t *nvl, const char *name,
+ char *const *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
+}
+
+int
+nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val));
+}
+
+int
+nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val));
+}
+
+int
+nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
+}
+
+/* reading name-value pairs */
+nvpair_t *
+nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ curr = NVPAIR2I_NVP(nvp);
+
+ /*
+ * Ensure that nvp is a valid nvpair on this nvlist.
+ * NB: nvp_curr is used only as a hint so that we don't always
+ * have to walk the list to determine if nvp is still on the list.
+ */
+ if (nvp == NULL)
+ curr = priv->nvp_list;
+ else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
+ curr = curr->nvi_next;
+ else
+ curr = NULL;
+
+ priv->nvp_curr = curr;
+
+ return (curr != NULL ? &curr->nvi_nvp : NULL);
+}
+
+nvpair_t *
+nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ curr = NVPAIR2I_NVP(nvp);
+
+ if (nvp == NULL)
+ curr = priv->nvp_last;
+ else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
+ curr = curr->nvi_prev;
+ else
+ curr = NULL;
+
+ priv->nvp_curr = curr;
+
+ return (curr != NULL ? &curr->nvi_nvp : NULL);
+}
+
+boolean_t
+nvlist_empty(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (B_TRUE);
+
+ return (priv->nvp_list == NULL);
+}
+
+char *
+nvpair_name(nvpair_t *nvp)
+{
+ return (NVP_NAME(nvp));
+}
+
+data_type_t
+nvpair_type(nvpair_t *nvp)
+{
+ return (NVP_TYPE(nvp));
+}
+
+int
+nvpair_type_is_array(nvpair_t *nvp)
+{
+ data_type_t type = NVP_TYPE(nvp);
+
+ if ((type == DATA_TYPE_BYTE_ARRAY) ||
+ (type == DATA_TYPE_INT8_ARRAY) ||
+ (type == DATA_TYPE_UINT8_ARRAY) ||
+ (type == DATA_TYPE_INT16_ARRAY) ||
+ (type == DATA_TYPE_UINT16_ARRAY) ||
+ (type == DATA_TYPE_INT32_ARRAY) ||
+ (type == DATA_TYPE_UINT32_ARRAY) ||
+ (type == DATA_TYPE_INT64_ARRAY) ||
+ (type == DATA_TYPE_UINT64_ARRAY) ||
+ (type == DATA_TYPE_BOOLEAN_ARRAY) ||
+ (type == DATA_TYPE_STRING_ARRAY) ||
+ (type == DATA_TYPE_NVLIST_ARRAY))
+ return (1);
+ return (0);
+}
+
+static int
+nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data)
+{
+ int value_sz;
+
+ if (nvp == NULL || nvpair_type(nvp) != type)
+ return (EINVAL);
+
+ /*
+ * For non-array types, we copy the data.
+ * For array types (including string), we set a pointer.
+ */
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ if (nelem != NULL)
+ *nelem = 0;
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ if (data == NULL)
+ return (EINVAL);
+ if ((value_sz = i_get_value_size(type, NULL, 1)) < 0)
+ return (EINVAL);
+ bcopy(NVP_VALUE(nvp), data, (size_t)value_sz);
+ if (nelem != NULL)
+ *nelem = 1;
+ break;
+
+ case DATA_TYPE_NVLIST:
+ case DATA_TYPE_STRING:
+ if (data == NULL)
+ return (EINVAL);
+ *(void **)data = (void *)NVP_VALUE(nvp);
+ if (nelem != NULL)
+ *nelem = 1;
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ if (nelem == NULL || data == NULL)
+ return (EINVAL);
+ if ((*nelem = NVP_NELEM(nvp)) != 0)
+ *(void **)data = (void *)NVP_VALUE(nvp);
+ else
+ *(void **)data = NULL;
+ break;
+
+ default:
+ return (ENOTSUP);
+ }
+
+ return (0);
+}
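+
+/*
+ * Sketch of the copy-vs-pointer contract above, using the public
+ * nvpair_value_*() wrappers (declared in sys/nvpair.h) that funnel
+ * into nvpair_value_common():
+ *
+ *	uint64_t v;
+ *	char *s;
+ *
+ *	VERIFY0(nvpair_value_uint64(nvp, &v));
+ *	VERIFY0(nvpair_value_string(nvp2, &s));
+ *
+ * After these calls v holds a private copy of the scalar, while s points
+ * into nvp2's own buffer and is released together with the enclosing
+ * nvlist rather than freed by the caller.
+ */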
+
+static int
+nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type,
+ uint_t *nelem, void *data)
+{
+ if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE)))
+ return (ENOTSUP);
+
+ nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
+ if (nvp == NULL)
+ return (ENOENT);
+
+ return (nvpair_value_common(nvp, type, nelem, data));
+}
+
+int
+nvlist_lookup_boolean(nvlist_t *nvl, const char *name)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL));
+}
+
+int
+nvlist_lookup_boolean_value(nvlist_t *nvl, const char *name, boolean_t *val)
+{
+ return (nvlist_lookup_common(nvl, name,
+ DATA_TYPE_BOOLEAN_VALUE, NULL, val));
+}
+
+int
+nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val));
+}
+
+int
+nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val));
+}
+
+int
+nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val));
+}
+
+int
+nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val));
+}
+
+int
+nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val));
+}
+
+int
+nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val));
+}
+
+int
+nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val));
+}
+
+int
+nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val));
+}
+
+int
+nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val));
+}
+
+#if !defined(_KERNEL)
+int
+nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val));
+}
+#endif
+
+int
+nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val));
+}
+
+int
+nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val));
+}
+
+int
+nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name,
+ boolean_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name,
+ DATA_TYPE_BOOLEAN_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_byte_array(nvlist_t *nvl, const char *name,
+ uchar_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name,
+ uint8_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int16_array(nvlist_t *nvl, const char *name,
+ int16_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name,
+ uint16_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int32_array(nvlist_t *nvl, const char *name,
+ int32_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name,
+ uint32_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int64_array(nvlist_t *nvl, const char *name,
+ int64_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name,
+ uint64_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_string_array(nvlist_t *nvl, const char *name,
+ char ***a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name,
+ nvlist_t ***a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val));
+}
+
+int
+nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
+{
+ va_list ap;
+ char *name;
+ int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0);
+ int ret = 0;
+
+ va_start(ap, flag);
+ while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
+ data_type_t type;
+ void *val;
+ uint_t *nelem;
+
+ switch (type = va_arg(ap, data_type_t)) {
+ case DATA_TYPE_BOOLEAN:
+ ret = nvlist_lookup_common(nvl, name, type, NULL, NULL);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_STRING:
+ case DATA_TYPE_NVLIST:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ val = va_arg(ap, void *);
+ ret = nvlist_lookup_common(nvl, name, type, NULL, val);
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ val = va_arg(ap, void *);
+ nelem = va_arg(ap, uint_t *);
+ ret = nvlist_lookup_common(nvl, name, type, nelem, val);
+ break;
+
+ default:
+ ret = EINVAL;
+ }
+
+ if (ret == ENOENT && noentok)
+ ret = 0;
+ }
+ va_end(ap);
+
+ return (ret);
+}
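+
+/*
+ * Usage sketch (illustrative, not part of the original source; "guid"
+ * and "comment" are hypothetical pair names): scalar types pass a single
+ * value pointer, array types pass a value pointer followed by a uint_t
+ * pointer for the element count, and the argument list is terminated by
+ * a NULL name.
+ *
+ *	uint64_t guid;
+ *	char *comment;
+ *
+ *	error = nvlist_lookup_pairs(nvl, NV_FLAG_NOENTOK,
+ *	    "guid", DATA_TYPE_UINT64, &guid,
+ *	    "comment", DATA_TYPE_STRING, &comment,
+ *	    NULL);
+ */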
+
+/*
+ * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function
+ * returns zero and a pointer to the matching nvpair is returned in '*ret'
+ * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penetrate
+ * multiple levels of embedded nvlists, with 'sep' as the separator. As an
+ * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or
+ * "a.d[3].e[1]". This matches the C syntax for array embed (for convenience,
+ * code also supports "a.d[3]e[1]" syntax).
+ *
+ * If 'ip' is non-NULL and the last name component is an array, return the
+ * value of the "...[index]" array index in *ip. For an array reference that
+ * is not indexed, *ip will be returned as -1. If there is a syntax error in
+ * 'name', and 'ep' is non-NULL then *ep will be set to point to the location
+ * inside the 'name' string where the syntax error was detected.
+ */
+static int
+nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep,
+ nvpair_t **ret, int *ip, char **ep)
+{
+ nvpair_t *nvp;
+ const char *np;
+ char *sepp = NULL;
+ char *idxp, *idxep;
+ nvlist_t **nva;
+ long idx = 0;
+ int n;
+
+ if (ip)
+ *ip = -1; /* not indexed */
+ if (ep)
+ *ep = NULL;
+
+ if ((nvl == NULL) || (name == NULL))
+ return (EINVAL);
+
+ sepp = NULL;
+ idx = 0;
+ /* step through components of name */
+ for (np = name; np && *np; np = sepp) {
+ /* ensure unique names */
+ if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME))
+ return (ENOTSUP);
+
+ /* skip white space */
+ skip_whitespace(np);
+ if (*np == 0)
+ break;
+
+ /* set 'sepp' to end of current component 'np' */
+ if (sep)
+ sepp = strchr(np, sep);
+ else
+ sepp = NULL;
+
+ /* find start of next "[ index ]..." */
+ idxp = strchr(np, '[');
+
+ /* if sepp comes first, set idxp to NULL */
+ if (sepp && idxp && (sepp < idxp))
+ idxp = NULL;
+
+ /*
+ * At this point 'idxp' is set if there is an index
+ * expected for the current component.
+ */
+ if (idxp) {
+ /* set 'n' to length of current 'np' name component */
+ n = idxp++ - np;
+
+ /* keep sepp up to date for *ep use as we advance */
+ skip_whitespace(idxp);
+ sepp = idxp;
+
+ /* determine the index value */
+#if defined(_KERNEL)
+ if (ddi_strtol(idxp, &idxep, 0, &idx))
+ goto fail;
+#else
+ idx = strtol(idxp, &idxep, 0);
+#endif
+ if (idxep == idxp)
+ goto fail;
+
+ /* keep sepp up to date for *ep use as we advance */
+ sepp = idxep;
+
+ /* skip whitespace after the index value and check for ']' */
+ skip_whitespace(sepp);
+ if (*sepp++ != ']')
+ goto fail;
+
+ /* for embedded arrays, support C syntax: "a[1].b" */
+ skip_whitespace(sepp);
+ if (sep && (*sepp == sep))
+ sepp++;
+ } else if (sepp) {
+ n = sepp++ - np;
+ } else {
+ n = strlen(np);
+ }
+
+ /* trim trailing whitespace by reducing length of 'np' */
+ if (n == 0)
+ goto fail;
+ for (n--; (np[n] == ' ') || (np[n] == '\t'); n--)
+ ;
+ n++;
+
+ /* skip whitespace, and set sepp to NULL if complete */
+ if (sepp) {
+ skip_whitespace(sepp);
+ if (*sepp == 0)
+ sepp = NULL;
+ }
+
+ /*
+ * At this point:
+ * o 'n' is the length of current 'np' component.
+ * o 'idxp' is set if there was an index, and its value is in 'idx'.
+ * o 'sepp' is set to the beginning of the next component,
+ * and set to NULL if we have no more components.
+ *
+ * Search for nvpair with matching component name.
+ */
+ for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(nvl, nvp)) {
+
+ /* continue if no match on name */
+ if (strncmp(np, nvpair_name(nvp), n) ||
+ (strlen(nvpair_name(nvp)) != n))
+ continue;
+
+ /* if indexed, verify type is array oriented */
+ if (idxp && !nvpair_type_is_array(nvp))
+ goto fail;
+
+ /*
+ * Full match found, return nvp and idx if this
+ * was the last component.
+ */
+ if (sepp == NULL) {
+ if (ret)
+ *ret = nvp;
+ if (ip && idxp)
+ *ip = (int)idx; /* return index */
+ return (0); /* found */
+ }
+
+ /*
+ * More components: current match must be
+ * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY
+ * to support going deeper.
+ */
+ if (nvpair_type(nvp) == DATA_TYPE_NVLIST) {
+ nvl = EMBEDDED_NVL(nvp);
+ break;
+ } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) {
+ (void) nvpair_value_nvlist_array(nvp,
+ &nva, (uint_t *)&n);
+ if ((n < 0) || (idx >= n))
+ goto fail;
+ nvl = nva[idx];
+ break;
+ }
+
+ /* type does not support more levels */
+ goto fail;
+ }
+ if (nvp == NULL)
+ goto fail; /* 'name' not found */
+
+ /* search for match of next component in embedded 'nvl' list */
+ }
+
+fail: if (ep && sepp)
+ *ep = sepp;
+ return (EINVAL);
+}
+
+/*
+ * Return pointer to nvpair with specified 'name'.
+ */
+int
+nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret)
+{
+ return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL));
+}
+
+/*
+ * Determine if named nvpair exists in nvlist (use embedded separator of '.'
+ * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed
+ * description.
+ */
+int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl,
+ const char *name, nvpair_t **ret, int *ip, char **ep)
+{
+ return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep));
+}
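+
+/*
+ * Usage sketch (illustrative, not part of the original source; the pair
+ * names are hypothetical and the list must use NV_UNIQUE_NAME): with the
+ * '.' separator a single call descends through embedded nvlists and
+ * nvlist arrays.
+ *
+ *	nvpair_t *nvp;
+ *	int idx;
+ *
+ *	if (nvlist_lookup_nvpair_embedded_index(nvl, "top.children[2]",
+ *	    &nvp, &idx, NULL) == 0) {
+ *		... nvp is the "children" nvlist-array pair, idx == 2 ...
+ *	}
+ */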
+
+boolean_t
+nvlist_exists(nvlist_t *nvl, const char *name)
+{
+ nvpriv_t *priv;
+ nvpair_t *nvp;
+ i_nvp_t *curr;
+
+ if (name == NULL || nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (B_FALSE);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ nvp = &curr->nvi_nvp;
+
+ if (strcmp(name, NVP_NAME(nvp)) == 0)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+int
+nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val));
+}
+
+int
+nvpair_value_byte(nvpair_t *nvp, uchar_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val));
+}
+
+int
+nvpair_value_int8(nvpair_t *nvp, int8_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val));
+}
+
+int
+nvpair_value_uint8(nvpair_t *nvp, uint8_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val));
+}
+
+int
+nvpair_value_int16(nvpair_t *nvp, int16_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val));
+}
+
+int
+nvpair_value_uint16(nvpair_t *nvp, uint16_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val));
+}
+
+int
+nvpair_value_int32(nvpair_t *nvp, int32_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val));
+}
+
+int
+nvpair_value_uint32(nvpair_t *nvp, uint32_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val));
+}
+
+int
+nvpair_value_int64(nvpair_t *nvp, int64_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val));
+}
+
+int
+nvpair_value_uint64(nvpair_t *nvp, uint64_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val));
+}
+
+#if !defined(_KERNEL)
+int
+nvpair_value_double(nvpair_t *nvp, double *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val));
+}
+#endif
+
+int
+nvpair_value_string(nvpair_t *nvp, char **val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val));
+}
+
+int
+nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val));
+}
+
+int
+nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val));
+}
+
+/*
+ * Add specified pair to the list.
+ */
+int
+nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ if (nvl == NULL || nvp == NULL)
+ return (EINVAL);
+
+ return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp),
+ NVP_NELEM(nvp), NVP_VALUE(nvp)));
+}
+
+/*
+ * Merge the supplied nvlists and put the result in dst.
+ * The merged list will contain all names specified in both lists;
+ * in the case of duplicates the values are taken from nvl.
+ * Return 0 on success.
+ */
+/*ARGSUSED*/
+int
+nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag)
+{
+ if (nvl == NULL || dst == NULL)
+ return (EINVAL);
+
+ if (dst != nvl)
+ return (nvlist_copy_pairs(nvl, dst));
+
+ return (0);
+}
+
+/*
+ * Encoding related routines
+ */
+#define NVS_OP_ENCODE 0
+#define NVS_OP_DECODE 1
+#define NVS_OP_GETSIZE 2
+
+typedef struct nvs_ops nvs_ops_t;
+
+typedef struct {
+ int nvs_op;
+ const nvs_ops_t *nvs_ops;
+ void *nvs_private;
+ nvpriv_t *nvs_priv;
+ int nvs_recursion;
+} nvstream_t;
+
+/*
+ * nvs operations are:
+ * - nvs_nvlist
+ * encoding / decoding of an nvlist header (nvlist_t)
+ * calculates the size used for header and end detection
+ *
+ * - nvs_nvpair
+ * responsible for the first part of encoding / decoding of an nvpair
+ * calculates the decoded size of an nvpair
+ *
+ * - nvs_nvp_op
+ * second part of encoding / decoding of an nvpair
+ *
+ * - nvs_nvp_size
+ * calculates the encoding size of an nvpair
+ *
+ * - nvs_nvl_fini
+ * encodes the end detection mark (zeros).
+ */
+struct nvs_ops {
+ int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *);
+ int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *);
+ int (*nvs_nvp_op)(nvstream_t *, nvpair_t *);
+ int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *);
+ int (*nvs_nvl_fini)(nvstream_t *);
+};
+
+typedef struct {
+ char nvh_encoding; /* nvs encoding method */
+ char nvh_endian; /* nvs endian */
+ char nvh_reserved1; /* reserved for future use */
+ char nvh_reserved2; /* reserved for future use */
+} nvs_header_t;
+
+static int
+nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+
+ /*
+ * Walk nvpair in list and encode each nvpair
+ */
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
+ if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0)
+ return (EFAULT);
+
+ return (nvs->nvs_ops->nvs_nvl_fini(nvs));
+}
+
+static int
+nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl)
+{
+ nvpair_t *nvp;
+ size_t nvsize;
+ int err;
+
+ /*
+ * Get decoded size of next pair in stream, alloc
+ * memory for nvpair_t, then decode the nvpair
+ */
+ while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) {
+ if (nvsize == 0) /* end of list */
+ break;
+
+ /* make sure len makes sense */
+ if (nvsize < NVP_SIZE_CALC(1, 0))
+ return (EFAULT);
+
+ if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL)
+ return (ENOMEM);
+
+ if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) {
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+
+ if (i_validate_nvpair(nvp) != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (EFAULT);
+ }
+
+ err = nvt_add_nvpair(nvl, nvp);
+ if (err != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+ nvp_buf_link(nvl, nvp);
+ }
+ return (err);
+}
+
+static int
+nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+ uint64_t nvsize = *buflen;
+ size_t size;
+
+ /*
+ * Get encoded size of nvpairs in nvlist
+ */
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0)
+ return (EINVAL);
+
+ if ((nvsize += size) > INT32_MAX)
+ return (EINVAL);
+ }
+
+ *buflen = nvsize;
+ return (0);
+}
+
+static int
+nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
+{
+ int err;
+
+ if (nvl->nvl_priv == 0)
+ return (EFAULT);
+
+ /*
+ * Perform the operation, starting with header, then each nvpair
+ */
+ if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0)
+ return (err);
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ err = nvs_encode_pairs(nvs, nvl);
+ break;
+
+ case NVS_OP_DECODE:
+ err = nvs_decode_pairs(nvs, nvl);
+ break;
+
+ case NVS_OP_GETSIZE:
+ err = nvs_getsize_pairs(nvs, nvl, buflen);
+ break;
+
+ default:
+ err = EINVAL;
+ }
+
+ return (err);
+}
+
+static int
+nvs_embedded(nvstream_t *nvs, nvlist_t *embedded)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ int err;
+
+ if (nvs->nvs_recursion >= nvpair_max_recursion)
+ return (EINVAL);
+ nvs->nvs_recursion++;
+ err = nvs_operation(nvs, embedded, NULL);
+ nvs->nvs_recursion--;
+ return (err);
+ }
+ case NVS_OP_DECODE: {
+ nvpriv_t *priv;
+ int err;
+
+ if (embedded->nvl_version != NV_VERSION)
+ return (ENOTSUP);
+
+ if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL)
+ return (ENOMEM);
+
+ nvlist_init(embedded, embedded->nvl_nvflag, priv);
+
+ if (nvs->nvs_recursion >= nvpair_max_recursion) {
+ nvlist_free(embedded);
+ return (EINVAL);
+ }
+ nvs->nvs_recursion++;
+ if ((err = nvs_operation(nvs, embedded, NULL)) != 0)
+ nvlist_free(embedded);
+ nvs->nvs_recursion--;
+ return (err);
+ }
+ default:
+ break;
+ }
+
+ return (EINVAL);
+}
+
+static int
+nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ size_t nelem = NVP_NELEM(nvp);
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ int i;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ for (i = 0; i < nelem; i++)
+ if (nvs_embedded(nvs, nvlp[i]) != 0)
+ return (EFAULT);
+ break;
+
+ case NVS_OP_DECODE: {
+ size_t len = nelem * sizeof (uint64_t);
+ nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len);
+
+ bzero(nvlp, len); /* don't trust packed data */
+ for (i = 0; i < nelem; i++) {
+ if (nvs_embedded(nvs, embedded) != 0) {
+ nvpair_free(nvp);
+ return (EFAULT);
+ }
+
+ nvlp[i] = embedded++;
+ }
+ break;
+ }
+ case NVS_OP_GETSIZE: {
+ uint64_t nvsize = 0;
+
+ for (i = 0; i < nelem; i++) {
+ size_t nvp_sz = 0;
+
+ if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0)
+ return (EINVAL);
+
+ if ((nvsize += nvp_sz) > INT32_MAX)
+ return (EINVAL);
+ }
+
+ *size = nvsize;
+ break;
+ }
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *);
+static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *);
+
+/*
+ * Common routine for nvlist operations:
+ * encode, decode, getsize (encoded size).
+ */
+static int
+nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding,
+ int nvs_op)
+{
+ int err = 0;
+ nvstream_t nvs;
+ int nvl_endian;
+#if defined(_ZFS_LITTLE_ENDIAN)
+ int host_endian = 1;
+#elif defined(_ZFS_BIG_ENDIAN)
+ int host_endian = 0;
+#else
+#error "No endian defined!"
+#endif /* _ZFS_LITTLE_ENDIAN */
+ nvs_header_t *nvh;
+
+ if (buflen == NULL || nvl == NULL ||
+ (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (EINVAL);
+
+ nvs.nvs_op = nvs_op;
+ nvs.nvs_recursion = 0;
+
+ /*
+ * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and
+ * a buffer are allocated. The first 4 bytes in the buffer are
+ * used for encoding method and host endian.
+ */
+ switch (nvs_op) {
+ case NVS_OP_ENCODE:
+ if (buf == NULL || *buflen < sizeof (nvs_header_t))
+ return (EINVAL);
+
+ nvh = (void *)buf;
+ nvh->nvh_encoding = encoding;
+ nvh->nvh_endian = nvl_endian = host_endian;
+ nvh->nvh_reserved1 = 0;
+ nvh->nvh_reserved2 = 0;
+ break;
+
+ case NVS_OP_DECODE:
+ if (buf == NULL || *buflen < sizeof (nvs_header_t))
+ return (EINVAL);
+
+ /* get method of encoding from first byte */
+ nvh = (void *)buf;
+ encoding = nvh->nvh_encoding;
+ nvl_endian = nvh->nvh_endian;
+ break;
+
+ case NVS_OP_GETSIZE:
+ nvl_endian = host_endian;
+
+ /*
+ * add the size for encoding
+ */
+ *buflen = sizeof (nvs_header_t);
+ break;
+
+ default:
+ return (ENOTSUP);
+ }
+
+ /*
+ * Create an nvstream with proper encoding method
+ */
+ switch (encoding) {
+ case NV_ENCODE_NATIVE:
+ /*
+ * check endianness, in case we are unpacking
+ * from a file
+ */
+ if (nvl_endian != host_endian)
+ return (ENOTSUP);
+ err = nvs_native(&nvs, nvl, buf, buflen);
+ break;
+ case NV_ENCODE_XDR:
+ err = nvs_xdr(&nvs, nvl, buf, buflen);
+ break;
+ default:
+ err = ENOTSUP;
+ break;
+ }
+
+ return (err);
+}
+
+int
+nvlist_size(nvlist_t *nvl, size_t *size, int encoding)
+{
+ return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE));
+}
+
+/*
+ * Pack nvlist into contiguous memory
+ */
+int
+nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
+ int kmflag)
+{
+ return (nvlist_xpack(nvl, bufp, buflen, encoding,
+ nvlist_nv_alloc(kmflag)));
+}
+
+int
+nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
+ nv_alloc_t *nva)
+{
+ nvpriv_t nvpriv;
+ size_t alloc_size;
+ char *buf;
+ int err;
+
+ if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL)
+ return (EINVAL);
+
+ if (*bufp != NULL)
+ return (nvlist_common(nvl, *bufp, buflen, encoding,
+ NVS_OP_ENCODE));
+
+ /*
+ * Here is a difficult situation:
+ * 1. The nvlist has fixed allocator properties.
+ * All other nvlist routines (like nvlist_add_*, ...) use
+ * these properties.
+ * 2. When using nvlist_pack() the user can specify their own
+ * allocator properties (e.g. by using KM_NOSLEEP).
+ *
+ * We use the user-specified properties (2). A clearer solution
+ * would be to remove the kmflag from nvlist_pack(), but we will
+ * not change the interface.
+ */
+ nv_priv_init(&nvpriv, nva, 0);
+
+ if ((err = nvlist_size(nvl, &alloc_size, encoding)))
+ return (err);
+
+ if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL)
+ return (ENOMEM);
+
+ if ((err = nvlist_common(nvl, buf, &alloc_size, encoding,
+ NVS_OP_ENCODE)) != 0) {
+ nv_mem_free(&nvpriv, buf, alloc_size);
+ } else {
+ *buflen = alloc_size;
+ *bufp = buf;
+ }
+
+ return (err);
+}
+
+/*
+ * Unpack buf into an nvlist_t
+ */
+int
+nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag)
+{
+ return (nvlist_xunpack(buf, buflen, nvlp, nvlist_nv_alloc(kmflag)));
+}
+
+int
+nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva)
+{
+ nvlist_t *nvl;
+ int err;
+
+ if (nvlp == NULL)
+ return (EINVAL);
+
+ if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0)
+ return (err);
+
+ if ((err = nvlist_common(nvl, buf, &buflen, NV_ENCODE_NATIVE,
+ NVS_OP_DECODE)) != 0)
+ nvlist_free(nvl);
+ else
+ *nvlp = nvl;
+
+ return (err);
+}
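+
+/*
+ * Round-trip sketch (illustrative, not part of the original source): with
+ * *bufp == NULL, nvlist_pack() sizes and allocates the buffer itself, and
+ * nvlist_unpack() reads the encoding method back out of the buffer header.
+ *
+ *	char *buf = NULL;
+ *	size_t len = 0;
+ *	nvlist_t *copy = NULL;
+ *
+ *	error = nvlist_pack(nvl, &buf, &len, NV_ENCODE_NATIVE, KM_SLEEP);
+ *	if (error == 0)
+ *		error = nvlist_unpack(buf, len, &copy, KM_SLEEP);
+ */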
+
+/*
+ * Native encoding functions
+ */
+typedef struct {
+ /*
+ * This structure is used when decoding a packed nvpair in
+ * the native format. n_base points to a buffer containing the
+ * packed nvpair. n_end is a pointer to the end of the buffer.
+ * (n_end actually points to the first byte past the end of the
+ * buffer.) n_curr is a pointer that lies between n_base and n_end.
+ * It points to the current data that we are decoding.
+ * The amount of data left in the buffer is equal to n_end - n_curr.
+ * n_flag is used to recognize a packed embedded list.
+ */
+ caddr_t n_base;
+ caddr_t n_end;
+ caddr_t n_curr;
+ uint_t n_flag;
+} nvs_native_t;
+
+static int
+nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf,
+ size_t buflen)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ nvs->nvs_private = native;
+ native->n_curr = native->n_base = buf;
+ native->n_end = buf + buflen;
+ native->n_flag = 0;
+ return (0);
+
+ case NVS_OP_GETSIZE:
+ nvs->nvs_private = native;
+ native->n_curr = native->n_base = native->n_end = NULL;
+ native->n_flag = 0;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+/*ARGSUSED*/
+static void
+nvs_native_destroy(nvstream_t *nvs)
+{
+}
+
+static int
+native_cp(nvstream_t *nvs, void *buf, size_t size)
+{
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+
+ if (native->n_curr + size > native->n_end)
+ return (EFAULT);
+
+ /*
+ * The bcopy() below eliminates the alignment requirement
+ * on the buffer (stream) and is preferred over direct access.
+ */
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ bcopy(buf, native->n_curr, size);
+ break;
+ case NVS_OP_DECODE:
+ bcopy(native->n_curr, buf, size);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ native->n_curr += size;
+ return (0);
+}
+
+/*
+ * operate on nvlist_t header
+ */
+static int
+nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
+{
+ nvs_native_t *native = nvs->nvs_private;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ if (native->n_flag)
+ return (0); /* packed embedded list */
+
+ native->n_flag = 1;
+
+ /* copy version and nvflag of the nvlist_t */
+ if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 ||
+ native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0)
+ return (EFAULT);
+
+ return (0);
+
+ case NVS_OP_GETSIZE:
+ /*
+ * if calculate for packed embedded list
+ * 4 for end of the embedded list
+ * else
+ * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag
+ * and 4 for end of the entire list
+ */
+ if (native->n_flag) {
+ *size += 4;
+ } else {
+ native->n_flag = 1;
+ *size += 2 * sizeof (int32_t) + 4;
+ }
+
+ return (0);
+
+ default:
+ return (EINVAL);
+ }
+}
+
+static int
+nvs_native_nvl_fini(nvstream_t *nvs)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ /*
+ * Add 4 zero bytes at end of nvlist. They are used
+ * for end detection by the decode routine.
+ */
+ if (native->n_curr + sizeof (int) > native->n_end)
+ return (EFAULT);
+
+ bzero(native->n_curr, sizeof (int));
+ native->n_curr += sizeof (int);
+ }
+
+ return (0);
+}
+
+static int
+nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ nvlist_t *packed = (void *)
+ (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
+ /*
+ * Null out the pointer that is meaningless in the packed
+ * structure. The address may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero((char *)packed + offsetof(nvlist_t, nvl_priv),
+ sizeof (uint64_t));
+ }
+
+ return (nvs_embedded(nvs, EMBEDDED_NVL(nvp)));
+}
+
+static int
+nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp);
+ size_t len = NVP_NELEM(nvp) * sizeof (uint64_t);
+ nvlist_t *packed = (nvlist_t *)((uintptr_t)value + len);
+ int i;
+ /*
+ * Null out pointers that are meaningless in the packed
+ * structure. The addresses may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(value, len);
+
+ for (i = 0; i < NVP_NELEM(nvp); i++, packed++)
+ /*
+ * Null out the pointer that is meaningless in the
+ * packed structure. The address may not be aligned,
+ * so we have to use bzero.
+ */
+ bzero((char *)packed + offsetof(nvlist_t, nvl_priv),
+ sizeof (uint64_t));
+ }
+
+ return (nvs_embedded_nvl_array(nvs, nvp, NULL));
+}
+
+static void
+nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ uint64_t *strp = (void *)
+ (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
+ /*
+ * Null out pointers that are meaningless in the packed
+ * structure. The addresses may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t));
+ break;
+ }
+ case NVS_OP_DECODE: {
+ char **strp = (void *)NVP_VALUE(nvp);
+ char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t));
+ int i;
+
+ for (i = 0; i < NVP_NELEM(nvp); i++) {
+ strp[i] = buf;
+ buf += strlen(buf) + 1;
+ }
+ break;
+ }
+ }
+}
+
+static int
+nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
+{
+ data_type_t type;
+ int value_sz;
+ int ret = 0;
+
+ /*
+ * We do the initial bcopy of the data before we look at
+ * the nvpair type, because when we're decoding, we won't
+ * have the correct values for the pair until we do the bcopy.
+ */
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ if (native_cp(nvs, nvp, nvp->nvp_size) != 0)
+ return (EFAULT);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /* verify nvp_name_sz, check the name string length */
+ if (i_validate_nvpair_name(nvp) != 0)
+ return (EFAULT);
+
+ type = NVP_TYPE(nvp);
+
+ /*
+ * Verify type and nelem and get the value size.
+ * For DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY the size of
+ * the string(s) is excluded.
+ */
+ if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0)
+ return (EFAULT);
+
+ if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size)
+ return (EFAULT);
+
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ ret = nvpair_native_embedded(nvs, nvp);
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ ret = nvpair_native_embedded_array(nvs, nvp);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ nvpair_native_string_array(nvs, nvp);
+ break;
+ default:
+ break;
+ }
+
+ return (ret);
+}
+
+static int
+nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ uint64_t nvp_sz = nvp->nvp_size;
+
+ switch (NVP_TYPE(nvp)) {
+ case DATA_TYPE_NVLIST: {
+ size_t nvsize = 0;
+
+ if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+ case DATA_TYPE_NVLIST_ARRAY: {
+ size_t nvsize;
+
+ if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (nvp_sz > INT32_MAX)
+ return (EINVAL);
+
+ *size = nvp_sz;
+
+ return (0);
+}
+
+static int
+nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ return (nvs_native_nvp_op(nvs, nvp));
+
+ case NVS_OP_DECODE: {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ int32_t decode_len;
+
+ /* try to read the size value from the stream */
+ if (native->n_curr + sizeof (int32_t) > native->n_end)
+ return (EFAULT);
+ bcopy(native->n_curr, &decode_len, sizeof (int32_t));
+
+ /* sanity check the size value */
+ if (decode_len < 0 ||
+ decode_len > native->n_end - native->n_curr)
+ return (EFAULT);
+
+ *size = decode_len;
+
+ /*
+ * If at the end of the stream then move the cursor
+ * forward, otherwise nvs_native_nvp_op() will read
+ * the entire nvpair at the same cursor position.
+ */
+ if (*size == 0)
+ native->n_curr += sizeof (int32_t);
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static const nvs_ops_t nvs_native_ops = {
+ .nvs_nvlist = nvs_native_nvlist,
+ .nvs_nvpair = nvs_native_nvpair,
+ .nvs_nvp_op = nvs_native_nvp_op,
+ .nvs_nvp_size = nvs_native_nvp_size,
+ .nvs_nvl_fini = nvs_native_nvl_fini
+};
+
+static int
+nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
+{
+ nvs_native_t native;
+ int err;
+
+ nvs->nvs_ops = &nvs_native_ops;
+
+ if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t),
+ *buflen - sizeof (nvs_header_t))) != 0)
+ return (err);
+
+ err = nvs_operation(nvs, nvl, buflen);
+
+ nvs_native_destroy(nvs);
+
+ return (err);
+}
+
+/*
+ * XDR encoding functions
+ *
+ * An xdr packed nvlist is encoded as:
+ *
+ * - encoding method and host endian (4 bytes)
+ * - nvl_version (4 bytes)
+ * - nvl_nvflag (4 bytes)
+ *
+ * - encoded nvpairs, the format of one xdr encoded nvpair is:
+ * - encoded size of the nvpair (4 bytes)
+ * - decoded size of the nvpair (4 bytes)
+ * - name string (4 + NV_ALIGN4(strlen(string)) bytes;
+ * a string is coded as its length (4 bytes) followed by the padded data)
+ * - data type (4 bytes)
+ * - number of elements in the nvpair (4 bytes)
+ * - data
+ *
+ * - 2 zero's for end of the entire list (8 bytes)
+ */
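+
+/*
+ * Worked example (illustrative, not part of the original source): an
+ * nvlist holding a single uint32 pair named "foo" packs under
+ * NV_ENCODE_XDR as
+ *
+ *	4	header (encoding method, endian, 2 reserved bytes)
+ *	4 + 4	nvl_version and nvl_nvflag
+ *	4 + 4	encoded size and decoded size of the nvpair
+ *	4 + 4	name length and "foo" padded to a 4 byte boundary
+ *	4 + 4	data type and nelem
+ *	4	the uint32 value
+ *	8	two zero words ending the list
+ *
+ * for 48 bytes in total, which is the figure nvlist_size(nvl, &sz,
+ * NV_ENCODE_XDR) reports for such a list.
+ */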
+static int
+nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen)
+{
+ /* xdr data must be 4 byte aligned */
+ if ((ulong_t)buf % 4 != 0)
+ return (EFAULT);
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE);
+ nvs->nvs_private = xdr;
+ return (0);
+ case NVS_OP_DECODE:
+ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE);
+ nvs->nvs_private = xdr;
+ return (0);
+ case NVS_OP_GETSIZE:
+ nvs->nvs_private = NULL;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+static void
+nvs_xdr_destroy(nvstream_t *nvs)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ xdr_destroy((XDR *)nvs->nvs_private);
+ break;
+ default:
+ break;
+ }
+}
+
+static int
+nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE: {
+ XDR *xdr = nvs->nvs_private;
+
+ if (!xdr_int(xdr, &nvl->nvl_version) ||
+ !xdr_u_int(xdr, &nvl->nvl_nvflag))
+ return (EFAULT);
+ break;
+ }
+ case NVS_OP_GETSIZE: {
+ /*
+ * 2 * 4 for nvl_version + nvl_nvflag
+ * and 8 for end of the entire list
+ */
+ *size += 2 * 4 + 8;
+ break;
+ }
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+nvs_xdr_nvl_fini(nvstream_t *nvs)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ XDR *xdr = nvs->nvs_private;
+ int zero = 0;
+
+ if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero))
+ return (EFAULT);
+ }
+
+ return (0);
+}
+
+/*
+ * The format of xdr encoded nvpair is:
+ * encode_size, decode_size, name string, data type, nelem, data
+ */
+static int
+nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
+{
+ data_type_t type;
+ char *buf;
+ char *buf_end = (char *)nvp + nvp->nvp_size;
+ int value_sz;
+ uint_t nelem, buflen;
+ bool_t ret = FALSE;
+ XDR *xdr = nvs->nvs_private;
+
+ ASSERT(xdr != NULL && nvp != NULL);
+
+ /* name string */
+ if ((buf = NVP_NAME(nvp)) >= buf_end)
+ return (EFAULT);
+ buflen = buf_end - buf;
+
+ if (!xdr_string(xdr, &buf, buflen - 1))
+ return (EFAULT);
+ nvp->nvp_name_sz = strlen(buf) + 1;
+
+ /* type and nelem */
+ if (!xdr_int(xdr, (int *)&nvp->nvp_type) ||
+ !xdr_int(xdr, &nvp->nvp_value_elem))
+ return (EFAULT);
+
+ type = NVP_TYPE(nvp);
+ nelem = nvp->nvp_value_elem;
+
+ /*
+ * Verify type and nelem and get the value size.
+ * For DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY the size of
+ * the string(s) is excluded.
+ */
+ if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0)
+ return (EFAULT);
+
+ /* if there is no data to extract then return */
+ if (nelem == 0)
+ return (0);
+
+ /* value */
+ if ((buf = NVP_VALUE(nvp)) >= buf_end)
+ return (EFAULT);
+ buflen = buf_end - buf;
+
+ if (buflen < value_sz)
+ return (EFAULT);
+
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ if (nvs_embedded(nvs, (void *)buf) == 0)
+ return (0);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0)
+ return (0);
+ break;
+
+ case DATA_TYPE_BOOLEAN:
+ ret = TRUE;
+ break;
+
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ ret = xdr_char(xdr, buf);
+ break;
+
+ case DATA_TYPE_INT16:
+ ret = xdr_short(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT16:
+ ret = xdr_u_short(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_INT32:
+ ret = xdr_int(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT32:
+ ret = xdr_u_int(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_INT64:
+ ret = xdr_longlong_t(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT64:
+ ret = xdr_u_longlong_t(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ /*
+ * NOTE: must expose the definition of hrtime_t here
+ */
+ ret = xdr_longlong_t(xdr, (void *)buf);
+ break;
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+ ret = xdr_double(xdr, (void *)buf);
+ break;
+#endif
+ case DATA_TYPE_STRING:
+ ret = xdr_string(xdr, &buf, buflen - 1);
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ ret = xdr_opaque(xdr, buf, nelem);
+ break;
+
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t),
+ (xdrproc_t)xdr_char);
+ break;
+
+ case DATA_TYPE_INT16_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t),
+ sizeof (int16_t), (xdrproc_t)xdr_short);
+ break;
+
+ case DATA_TYPE_UINT16_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t),
+ sizeof (uint16_t), (xdrproc_t)xdr_u_short);
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t),
+ sizeof (int32_t), (xdrproc_t)xdr_int);
+ break;
+
+ case DATA_TYPE_UINT32_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t),
+ sizeof (uint32_t), (xdrproc_t)xdr_u_int);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t),
+ sizeof (int64_t), (xdrproc_t)xdr_longlong_t);
+ break;
+
+ case DATA_TYPE_UINT64_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t),
+ sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY: {
+ size_t len = nelem * sizeof (uint64_t);
+ char **strp = (void *)buf;
+ int i;
+
+ if (nvs->nvs_op == NVS_OP_DECODE)
+ bzero(buf, len); /* don't trust packed data */
+
+ for (i = 0; i < nelem; i++) {
+ if (buflen <= len)
+ return (EFAULT);
+
+ buf += len;
+ buflen -= len;
+
+ if (xdr_string(xdr, &buf, buflen - 1) != TRUE)
+ return (EFAULT);
+
+ if (nvs->nvs_op == NVS_OP_DECODE)
+ strp[i] = buf;
+ len = strlen(buf) + 1;
+ }
+ ret = TRUE;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return (ret == TRUE ? 0 : EFAULT);
+}
+
+static int
+nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ data_type_t type = NVP_TYPE(nvp);
+ /*
+ * encode_size + decode_size + name string size + data type + nelem
+ * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp)))
+ */
+ uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4;
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ nvp_sz += 4; /* 4 is the minimum xdr unit */
+ break;
+
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ nvp_sz += 8;
+ break;
+
+ case DATA_TYPE_STRING:
+ nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp)));
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ nvp_sz += NV_ALIGN4(NVP_NELEM(nvp));
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY: {
+ int i;
+ char **strs = (void *)NVP_VALUE(nvp);
+
+ for (i = 0; i < NVP_NELEM(nvp); i++)
+ nvp_sz += 4 + NV_ALIGN4(strlen(strs[i]));
+
+ break;
+ }
+
+ case DATA_TYPE_NVLIST:
+ case DATA_TYPE_NVLIST_ARRAY: {
+ size_t nvsize = 0;
+ int old_nvs_op = nvs->nvs_op;
+ int err;
+
+ nvs->nvs_op = NVS_OP_GETSIZE;
+ if (type == DATA_TYPE_NVLIST)
+ err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize);
+ else
+ err = nvs_embedded_nvl_array(nvs, nvp, &nvsize);
+ nvs->nvs_op = old_nvs_op;
+
+ if (err != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+
+ if (nvp_sz > INT32_MAX)
+ return (EINVAL);
+
+ *size = nvp_sz;
+
+ return (0);
+}
+
+
+/*
+ * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates
+ * the largest nvpair that could be encoded in the buffer.
+ *
+ * See the comments above nvs_xdr_nvp_op() for the format of xdr encoding.
+ * The size of an xdr packed nvpair without any data is 5 words.
+ *
+ * Using the size of the data directly as an estimate would be ok
+ * in all cases except one. If the data type is DATA_TYPE_STRING_ARRAY
+ * then the actual nvpair has space for an array of pointers to index
+ * the strings. These pointers are not encoded into the packed xdr buffer.
+ *
+ * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are
+ * of length 0, then each string is encoded in xdr format as a single word.
+ * Therefore when expanded to an nvpair there will be 2.25 words used for
+ * each string (an int64_t allocated for pointer usage, and a single char
+ * for the null termination).
+ *
+ * This is the calculation performed by the NVS_XDR_MAX_LEN macro.
+ */
+#define NVS_XDR_HDR_LEN ((size_t)(5 * 4))
+#define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? \
+ 0 : ((size_t)(y) - NVS_XDR_HDR_LEN))
+#define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \
+ (NVS_XDR_DATA_LEN(x) * 2) + \
+ NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4)))
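+
+/*
+ * Worked arithmetic for the macros above (illustrative, not part of the
+ * original source): a zero-length string costs one 4-byte XDR word in the
+ * packed buffer but expands to a pointer slot (8 bytes) plus its NUL
+ * terminator (1 byte) once unpacked, i.e. 9 bytes or 2.25 words per packed
+ * word. NVS_XDR_MAX_LEN() mirrors that worst case: twice the packed data
+ * length for the pointer slots plus a quarter of it, rounded up to a
+ * 4 byte boundary, for the terminators, on top of the size of an empty
+ * nvpair (NVP_SIZE_CALC(1, 0)).
+ */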
+
+static int
+nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ XDR *xdr = nvs->nvs_private;
+ int32_t encode_len, decode_len;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ size_t nvsize;
+
+ if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0)
+ return (EFAULT);
+
+ decode_len = nvp->nvp_size;
+ encode_len = nvsize;
+ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
+ return (EFAULT);
+
+ return (nvs_xdr_nvp_op(nvs, nvp));
+ }
+ case NVS_OP_DECODE: {
+ struct xdr_bytesrec bytesrec;
+
+ /* get the encode and decode size */
+ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
+ return (EFAULT);
+ *size = decode_len;
+
+ /* are we at the end of the stream? */
+ if (*size == 0)
+ return (0);
+
+ /* sanity check the size parameter */
+ if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec))
+ return (EFAULT);
+
+ if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail))
+ return (EFAULT);
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static const struct nvs_ops nvs_xdr_ops = {
+ .nvs_nvlist = nvs_xdr_nvlist,
+ .nvs_nvpair = nvs_xdr_nvpair,
+ .nvs_nvp_op = nvs_xdr_nvp_op,
+ .nvs_nvp_size = nvs_xdr_nvp_size,
+ .nvs_nvl_fini = nvs_xdr_nvl_fini
+};
+
+static int
+nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
+{
+ XDR xdr;
+ int err;
+
+ nvs->nvs_ops = &nvs_xdr_ops;
+
+ if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t),
+ *buflen - sizeof (nvs_header_t))) != 0)
+ return (err);
+
+ err = nvs_operation(nvs, nvl, buflen);
+
+ nvs_xdr_destroy(nvs);
+
+ return (err);
+}
+
+#if defined(_KERNEL)
+static int __init
+nvpair_init(void)
+{
+ return (0);
+}
+
+static void __exit
+nvpair_fini(void)
+{
+}
+
+module_init(nvpair_init);
+module_exit(nvpair_fini);
+#endif
+
+ZFS_MODULE_DESCRIPTION("Generic name/value pair implementation");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+
+EXPORT_SYMBOL(nv_alloc_init);
+EXPORT_SYMBOL(nv_alloc_reset);
+EXPORT_SYMBOL(nv_alloc_fini);
+
+/* list management */
+EXPORT_SYMBOL(nvlist_alloc);
+EXPORT_SYMBOL(nvlist_free);
+EXPORT_SYMBOL(nvlist_size);
+EXPORT_SYMBOL(nvlist_pack);
+EXPORT_SYMBOL(nvlist_unpack);
+EXPORT_SYMBOL(nvlist_dup);
+EXPORT_SYMBOL(nvlist_merge);
+
+EXPORT_SYMBOL(nvlist_xalloc);
+EXPORT_SYMBOL(nvlist_xpack);
+EXPORT_SYMBOL(nvlist_xunpack);
+EXPORT_SYMBOL(nvlist_xdup);
+EXPORT_SYMBOL(nvlist_lookup_nv_alloc);
+
+EXPORT_SYMBOL(nvlist_add_nvpair);
+EXPORT_SYMBOL(nvlist_add_boolean);
+EXPORT_SYMBOL(nvlist_add_boolean_value);
+EXPORT_SYMBOL(nvlist_add_byte);
+EXPORT_SYMBOL(nvlist_add_int8);
+EXPORT_SYMBOL(nvlist_add_uint8);
+EXPORT_SYMBOL(nvlist_add_int16);
+EXPORT_SYMBOL(nvlist_add_uint16);
+EXPORT_SYMBOL(nvlist_add_int32);
+EXPORT_SYMBOL(nvlist_add_uint32);
+EXPORT_SYMBOL(nvlist_add_int64);
+EXPORT_SYMBOL(nvlist_add_uint64);
+EXPORT_SYMBOL(nvlist_add_string);
+EXPORT_SYMBOL(nvlist_add_nvlist);
+EXPORT_SYMBOL(nvlist_add_boolean_array);
+EXPORT_SYMBOL(nvlist_add_byte_array);
+EXPORT_SYMBOL(nvlist_add_int8_array);
+EXPORT_SYMBOL(nvlist_add_uint8_array);
+EXPORT_SYMBOL(nvlist_add_int16_array);
+EXPORT_SYMBOL(nvlist_add_uint16_array);
+EXPORT_SYMBOL(nvlist_add_int32_array);
+EXPORT_SYMBOL(nvlist_add_uint32_array);
+EXPORT_SYMBOL(nvlist_add_int64_array);
+EXPORT_SYMBOL(nvlist_add_uint64_array);
+EXPORT_SYMBOL(nvlist_add_string_array);
+EXPORT_SYMBOL(nvlist_add_nvlist_array);
+EXPORT_SYMBOL(nvlist_next_nvpair);
+EXPORT_SYMBOL(nvlist_prev_nvpair);
+EXPORT_SYMBOL(nvlist_empty);
+EXPORT_SYMBOL(nvlist_add_hrtime);
+
+EXPORT_SYMBOL(nvlist_remove);
+EXPORT_SYMBOL(nvlist_remove_nvpair);
+EXPORT_SYMBOL(nvlist_remove_all);
+
+EXPORT_SYMBOL(nvlist_lookup_boolean);
+EXPORT_SYMBOL(nvlist_lookup_boolean_value);
+EXPORT_SYMBOL(nvlist_lookup_byte);
+EXPORT_SYMBOL(nvlist_lookup_int8);
+EXPORT_SYMBOL(nvlist_lookup_uint8);
+EXPORT_SYMBOL(nvlist_lookup_int16);
+EXPORT_SYMBOL(nvlist_lookup_uint16);
+EXPORT_SYMBOL(nvlist_lookup_int32);
+EXPORT_SYMBOL(nvlist_lookup_uint32);
+EXPORT_SYMBOL(nvlist_lookup_int64);
+EXPORT_SYMBOL(nvlist_lookup_uint64);
+EXPORT_SYMBOL(nvlist_lookup_string);
+EXPORT_SYMBOL(nvlist_lookup_nvlist);
+EXPORT_SYMBOL(nvlist_lookup_boolean_array);
+EXPORT_SYMBOL(nvlist_lookup_byte_array);
+EXPORT_SYMBOL(nvlist_lookup_int8_array);
+EXPORT_SYMBOL(nvlist_lookup_uint8_array);
+EXPORT_SYMBOL(nvlist_lookup_int16_array);
+EXPORT_SYMBOL(nvlist_lookup_uint16_array);
+EXPORT_SYMBOL(nvlist_lookup_int32_array);
+EXPORT_SYMBOL(nvlist_lookup_uint32_array);
+EXPORT_SYMBOL(nvlist_lookup_int64_array);
+EXPORT_SYMBOL(nvlist_lookup_uint64_array);
+EXPORT_SYMBOL(nvlist_lookup_string_array);
+EXPORT_SYMBOL(nvlist_lookup_nvlist_array);
+EXPORT_SYMBOL(nvlist_lookup_hrtime);
+EXPORT_SYMBOL(nvlist_lookup_pairs);
+
+EXPORT_SYMBOL(nvlist_lookup_nvpair);
+EXPORT_SYMBOL(nvlist_exists);
+
+/* processing nvpair */
+EXPORT_SYMBOL(nvpair_name);
+EXPORT_SYMBOL(nvpair_type);
+EXPORT_SYMBOL(nvpair_value_boolean_value);
+EXPORT_SYMBOL(nvpair_value_byte);
+EXPORT_SYMBOL(nvpair_value_int8);
+EXPORT_SYMBOL(nvpair_value_uint8);
+EXPORT_SYMBOL(nvpair_value_int16);
+EXPORT_SYMBOL(nvpair_value_uint16);
+EXPORT_SYMBOL(nvpair_value_int32);
+EXPORT_SYMBOL(nvpair_value_uint32);
+EXPORT_SYMBOL(nvpair_value_int64);
+EXPORT_SYMBOL(nvpair_value_uint64);
+EXPORT_SYMBOL(nvpair_value_string);
+EXPORT_SYMBOL(nvpair_value_nvlist);
+EXPORT_SYMBOL(nvpair_value_boolean_array);
+EXPORT_SYMBOL(nvpair_value_byte_array);
+EXPORT_SYMBOL(nvpair_value_int8_array);
+EXPORT_SYMBOL(nvpair_value_uint8_array);
+EXPORT_SYMBOL(nvpair_value_int16_array);
+EXPORT_SYMBOL(nvpair_value_uint16_array);
+EXPORT_SYMBOL(nvpair_value_int32_array);
+EXPORT_SYMBOL(nvpair_value_uint32_array);
+EXPORT_SYMBOL(nvpair_value_int64_array);
+EXPORT_SYMBOL(nvpair_value_uint64_array);
+EXPORT_SYMBOL(nvpair_value_string_array);
+EXPORT_SYMBOL(nvpair_value_nvlist_array);
+EXPORT_SYMBOL(nvpair_value_hrtime);
diff --git a/sys/contrib/openzfs/module/nvpair/nvpair_alloc_fixed.c b/sys/contrib/openzfs/module/nvpair/nvpair_alloc_fixed.c
new file mode 100644
index 000000000000..c8a604a2bfac
--- /dev/null
+++ b/sys/contrib/openzfs/module/nvpair/nvpair_alloc_fixed.c
@@ -0,0 +1,115 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/isa_defs.h>
+#include <sys/nvpair.h>
+#include <sys/sysmacros.h>
+
+/*
+ * This allocator is very simple.
+ * - it uses a pre-allocated buffer for memory allocations.
+ * - it does _not_ free memory in the pre-allocated buffer.
+ *
+ * The reason for the selected implementation is simplicity.
+ * This allocator is designed for the usage in interrupt context when
+ * the caller may not wait for free memory.
+ */
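+
+/*
+ * Usage sketch (illustrative, not part of the original source): hand the
+ * allocator a caller-owned buffer and build an nvlist inside it without
+ * any further memory allocation.
+ *
+ *	static char buf[1024];
+ *	nv_alloc_t nva;
+ *	nvlist_t *nvl;
+ *
+ *	if (nv_alloc_init(&nva, nv_fixed_ops, buf, sizeof (buf)) == 0 &&
+ *	    nvlist_xalloc(&nvl, NV_UNIQUE_NAME, &nva) == 0) {
+ *		... add pairs, use the list, then nvlist_free(nvl) ...
+ *		nv_alloc_fini(&nva);
+ *	}
+ */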
+
+/* pre-allocated buffer for memory allocations */
+typedef struct nvbuf {
+ uintptr_t nvb_buf; /* address of pre-allocated buffer */
+ uintptr_t nvb_lim; /* limit address in the buffer */
+ uintptr_t nvb_cur; /* current address in the buffer */
+} nvbuf_t;
+
+/*
+ * Initialize the pre-allocated buffer allocator. The caller needs to supply
+ *
+ * buf address of pre-allocated buffer
+ * bufsz size of pre-allocated buffer
+ *
+ * nv_fixed_init() calculates the remaining members of nvbuf_t.
+ */
+static int
+nv_fixed_init(nv_alloc_t *nva, va_list valist)
+{
+ uintptr_t base = va_arg(valist, uintptr_t);
+ uintptr_t lim = base + va_arg(valist, size_t);
+ nvbuf_t *nvb = (nvbuf_t *)P2ROUNDUP(base, sizeof (uintptr_t));
+
+ if (base == 0 || (uintptr_t)&nvb[1] > lim)
+ return (EINVAL);
+
+ nvb->nvb_buf = (uintptr_t)&nvb[0];
+ nvb->nvb_cur = (uintptr_t)&nvb[1];
+ nvb->nvb_lim = lim;
+ nva->nva_arg = nvb;
+
+ return (0);
+}
+
+static void *
+nv_fixed_alloc(nv_alloc_t *nva, size_t size)
+{
+ nvbuf_t *nvb = nva->nva_arg;
+ uintptr_t new = nvb->nvb_cur;
+
+ if (size == 0 || new + size > nvb->nvb_lim)
+ return (NULL);
+
+ nvb->nvb_cur = P2ROUNDUP(new + size, sizeof (uintptr_t));
+
+ return ((void *)new);
+}
+
+/*ARGSUSED*/
+static void
+nv_fixed_free(nv_alloc_t *nva, void *buf, size_t size)
+{
+ /* don't free memory in the pre-allocated buffer */
+}
+
+static void
+nv_fixed_reset(nv_alloc_t *nva)
+{
+ nvbuf_t *nvb = nva->nva_arg;
+
+ nvb->nvb_cur = (uintptr_t)&nvb[1];
+}
+
+const nv_alloc_ops_t nv_fixed_ops_def = {
+ .nv_ao_init = nv_fixed_init,
+ .nv_ao_fini = NULL,
+ .nv_ao_alloc = nv_fixed_alloc,
+ .nv_ao_free = nv_fixed_free,
+ .nv_ao_reset = nv_fixed_reset
+};
+
+const nv_alloc_ops_t *nv_fixed_ops = &nv_fixed_ops_def;
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(nv_fixed_ops);
+#endif
diff --git a/sys/contrib/openzfs/module/nvpair/nvpair_alloc_spl.c b/sys/contrib/openzfs/module/nvpair/nvpair_alloc_spl.c
new file mode 100644
index 000000000000..ed8fa4d09402
--- /dev/null
+++ b/sys/contrib/openzfs/module/nvpair/nvpair_alloc_spl.c
@@ -0,0 +1,96 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at * usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/nvpair.h>
+#include <sys/kmem.h>
+#include <sys/vmem.h>
+
+static void *
+nv_alloc_sleep_spl(nv_alloc_t *nva, size_t size)
+{
+ return (vmem_alloc(size, KM_SLEEP));
+}
+
+static void *
+nv_alloc_pushpage_spl(nv_alloc_t *nva, size_t size)
+{
+ return (vmem_alloc(size, KM_PUSHPAGE));
+}
+
+static void *
+nv_alloc_nosleep_spl(nv_alloc_t *nva, size_t size)
+{
+ return (kmem_alloc(size, KM_NOSLEEP));
+}
+
+static void
+nv_free_spl(nv_alloc_t *nva, void *buf, size_t size)
+{
+ kmem_free(buf, size);
+}
+
+const nv_alloc_ops_t spl_sleep_ops_def = {
+ .nv_ao_init = NULL,
+ .nv_ao_fini = NULL,
+ .nv_ao_alloc = nv_alloc_sleep_spl,
+ .nv_ao_free = nv_free_spl,
+ .nv_ao_reset = NULL
+};
+
+const nv_alloc_ops_t spl_pushpage_ops_def = {
+ .nv_ao_init = NULL,
+ .nv_ao_fini = NULL,
+ .nv_ao_alloc = nv_alloc_pushpage_spl,
+ .nv_ao_free = nv_free_spl,
+ .nv_ao_reset = NULL
+};
+
+const nv_alloc_ops_t spl_nosleep_ops_def = {
+ .nv_ao_init = NULL,
+ .nv_ao_fini = NULL,
+ .nv_ao_alloc = nv_alloc_nosleep_spl,
+ .nv_ao_free = nv_free_spl,
+ .nv_ao_reset = NULL
+};
+
+nv_alloc_t nv_alloc_sleep_def = {
+ &spl_sleep_ops_def,
+ NULL
+};
+
+nv_alloc_t nv_alloc_pushpage_def = {
+ &spl_pushpage_ops_def,
+ NULL
+};
+
+nv_alloc_t nv_alloc_nosleep_def = {
+ &spl_nosleep_ops_def,
+ NULL
+};
+
+nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def;
+nv_alloc_t *nv_alloc_pushpage = &nv_alloc_pushpage_def;
+nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def;
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c b/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c
new file mode 100644
index 000000000000..66e27cefa396
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c
@@ -0,0 +1,1709 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/avl.h>
+#include <sys/misc.h>
+#if defined(_KERNEL)
+#include <sys/kmem.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <acl/acl_common.h>
+#include <sys/debug.h>
+#else
+#include <errno.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <grp.h>
+#include <pwd.h>
+#include <acl_common.h>
+#define ASSERT assert
+#endif
+
+#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \
+ ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \
+ ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL)
+
+
+#define ACL_SYNCHRONIZE_SET_DENY 0x0000001
+#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002
+#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004
+#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008
+
+#define ACL_WRITE_OWNER_SET_DENY 0x0000010
+#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020
+#define ACL_WRITE_OWNER_ERR_DENY 0x0000040
+#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080
+
+#define ACL_DELETE_SET_DENY 0x0000100
+#define ACL_DELETE_SET_ALLOW 0x0000200
+#define ACL_DELETE_ERR_DENY 0x0000400
+#define ACL_DELETE_ERR_ALLOW 0x0000800
+
+#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000
+#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000
+#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000
+#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000
+
+#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000
+#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000
+#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000
+#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000
+
+#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000
+#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000
+#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000
+#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000
+
+#define ACL_READ_NAMED_READER_SET_DENY 0x1000000
+#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000
+#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000
+#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000
+
+
+#define ACE_VALID_MASK_BITS (\
+ ACE_READ_DATA | \
+ ACE_LIST_DIRECTORY | \
+ ACE_WRITE_DATA | \
+ ACE_ADD_FILE | \
+ ACE_APPEND_DATA | \
+ ACE_ADD_SUBDIRECTORY | \
+ ACE_READ_NAMED_ATTRS | \
+ ACE_WRITE_NAMED_ATTRS | \
+ ACE_EXECUTE | \
+ ACE_DELETE_CHILD | \
+ ACE_READ_ATTRIBUTES | \
+ ACE_WRITE_ATTRIBUTES | \
+ ACE_DELETE | \
+ ACE_READ_ACL | \
+ ACE_WRITE_ACL | \
+ ACE_WRITE_OWNER | \
+ ACE_SYNCHRONIZE)
+
+#define ACE_MASK_UNDEFINED 0x80000000
+
+#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \
+ ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \
+ ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \
+ ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE)
+
+/*
+ * ACL conversion helpers
+ */
+
+typedef enum {
+ ace_unused,
+ ace_user_obj,
+ ace_user,
+ ace_group, /* includes GROUP and GROUP_OBJ */
+ ace_other_obj
+} ace_to_aent_state_t;
+
+typedef struct acevals {
+ uid_t key;
+ avl_node_t avl;
+ uint32_t mask;
+ uint32_t allowed;
+ uint32_t denied;
+ int aent_type;
+} acevals_t;
+
+typedef struct ace_list {
+ acevals_t user_obj;
+ avl_tree_t user;
+ int numusers;
+ acevals_t group_obj;
+ avl_tree_t group;
+ int numgroups;
+ acevals_t other_obj;
+ uint32_t acl_mask;
+ int hasmask;
+ int dfacl_flag;
+ ace_to_aent_state_t state;
+ int seen; /* bitmask of all aclent_t a_type values seen */
+} ace_list_t;
+
+/*
+ * Generic shellsort, from K&R (1st ed., p. 58), somewhat modified.
+ * v = ptr to array/vector of objs
+ * n = # of objs in the array
+ * s = size of each obj (must be a multiple of the word size)
+ * f = ptr to function to compare two objs; it returns
+ *     -1 (less than), 0 (equal), or 1 (greater than)
+ */
+void
+ksort(caddr_t v, int n, int s, int (*f)(void *, void *))
+{
+ int g, i, j, ii;
+ unsigned int *p1, *p2;
+ unsigned int tmp;
+
+ /* No work to do */
+ if (v == NULL || n <= 1)
+ return;
+
+ /* Sanity check on arguments */
+ ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0);
+ ASSERT(s > 0);
+ for (g = n / 2; g > 0; g /= 2) {
+ for (i = g; i < n; i++) {
+ for (j = i - g; j >= 0 &&
+ (*f)(v + j * s, v + (j + g) * s) == 1;
+ j -= g) {
+ p1 = (void *)(v + j * s);
+ p2 = (void *)(v + (j + g) * s);
+ for (ii = 0; ii < s / 4; ii++) {
+ tmp = *p1;
+ *p1++ = *p2;
+ *p2++ = tmp;
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Compare two acls, all fields. Returns:
+ * -1 (less than)
+ * 0 (equal)
+ * +1 (greater than)
+ */
+int
+cmp2acls(void *a, void *b)
+{
+ aclent_t *x = (aclent_t *)a;
+ aclent_t *y = (aclent_t *)b;
+
+ /* Compare types */
+ if (x->a_type < y->a_type)
+ return (-1);
+ if (x->a_type > y->a_type)
+ return (1);
+ /* Equal types; compare id's */
+ if (x->a_id < y->a_id)
+ return (-1);
+ if (x->a_id > y->a_id)
+ return (1);
+ /* Equal ids; compare perms */
+ if (x->a_perm < y->a_perm)
+ return (-1);
+ if (x->a_perm > y->a_perm)
+ return (1);
+ /* Totally equal */
+ return (0);
+}
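As a quick illustration of the two helpers above, the sketch below sorts a deliberately out-of-order aclent_t array with ksort()/cmp2acls(); the entries and permissions are made up, and <sys/acl.h> is assumed to supply aclent_t and the *_OBJ type constants:

static void
ksort_example(void)
{
	/* Deliberately out of order; USER_OBJ < GROUP_OBJ < OTHER_OBJ numerically. */
	aclent_t ents[3] = {
		{ .a_type = OTHER_OBJ, .a_id = 0, .a_perm = 4 },
		{ .a_type = USER_OBJ,  .a_id = 0, .a_perm = 7 },
		{ .a_type = GROUP_OBJ, .a_id = 0, .a_perm = 5 },
	};

	/* Shellsort the array using the full-field comparator. */
	ksort((caddr_t)ents, 3, sizeof (aclent_t), cmp2acls);

	/* ents[] is now ordered USER_OBJ, GROUP_OBJ, OTHER_OBJ. */
}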
+
+static int
+cacl_malloc(void **ptr, size_t size)
+{
+ *ptr = kmem_zalloc(size, KM_SLEEP);
+ return (0);
+}
+
+
+#if !defined(_KERNEL)
+acl_t *
+acl_alloc(enum acl_type type)
+{
+ acl_t *aclp;
+
+ if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0)
+ return (NULL);
+
+ aclp->acl_aclp = NULL;
+ aclp->acl_cnt = 0;
+
+ switch (type) {
+ case ACE_T:
+ aclp->acl_type = ACE_T;
+ aclp->acl_entry_size = sizeof (ace_t);
+ break;
+ case ACLENT_T:
+ aclp->acl_type = ACLENT_T;
+ aclp->acl_entry_size = sizeof (aclent_t);
+ break;
+ default:
+ acl_free(aclp);
+ aclp = NULL;
+ }
+ return (aclp);
+}
+
+/*
+ * Free acl_t structure
+ */
+void
+acl_free(acl_t *aclp)
+{
+ int acl_size;
+
+ if (aclp == NULL)
+ return;
+
+ if (aclp->acl_aclp) {
+ acl_size = aclp->acl_cnt * aclp->acl_entry_size;
+ cacl_free(aclp->acl_aclp, acl_size);
+ }
+
+ cacl_free(aclp, sizeof (acl_t));
+}
+
+static uint32_t
+access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow)
+{
+ uint32_t access_mask = 0;
+ int acl_produce;
+ int synchronize_set = 0, write_owner_set = 0;
+ int delete_set = 0, write_attrs_set = 0;
+ int read_named_set = 0, write_named_set = 0;
+
+ acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+ ACL_WRITE_ATTRS_WRITER_SET_DENY);
+
+ if (isallow) {
+ synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW;
+ write_owner_set = ACL_WRITE_OWNER_SET_ALLOW;
+ delete_set = ACL_DELETE_SET_ALLOW;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_ALLOW;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+ } else {
+
+ synchronize_set = ACL_SYNCHRONIZE_SET_DENY;
+ write_owner_set = ACL_WRITE_OWNER_SET_DENY;
+ delete_set = ACL_DELETE_SET_DENY;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_DENY;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+ else
+ /*
+ * If the entity is not the owner and does not
+ * have write permissions ACE_WRITE_ATTRIBUTES will
+ * always go in the DENY ACE.
+ */
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ }
+
+ if (acl_produce & synchronize_set)
+ access_mask |= ACE_SYNCHRONIZE;
+ if (acl_produce & write_owner_set)
+ access_mask |= ACE_WRITE_OWNER;
+ if (acl_produce & delete_set)
+ access_mask |= ACE_DELETE;
+ if (acl_produce & write_attrs_set)
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ if (acl_produce & read_named_set)
+ access_mask |= ACE_READ_NAMED_ATTRS;
+ if (acl_produce & write_named_set)
+ access_mask |= ACE_WRITE_NAMED_ATTRS;
+
+ return (access_mask);
+}
+
+/*
+ * Given a mode_t, convert it into an access_mask as used
+ * by nfsace, assuming aclent_t -> nfsace semantics.
+ */
+static uint32_t
+mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow)
+{
+ uint32_t access = 0;
+ int haswriteperm = 0;
+ int hasreadperm = 0;
+
+ if (isallow) {
+ haswriteperm = (mode & S_IWOTH);
+ hasreadperm = (mode & S_IROTH);
+ } else {
+ haswriteperm = !(mode & S_IWOTH);
+ hasreadperm = !(mode & S_IROTH);
+ }
+
+ /*
+ * The following call takes care of correctly setting the following
+ * mask bits in the access_mask:
+ * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE,
+ * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS
+ */
+ access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow);
+
+ if (isallow) {
+ access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES;
+ if (isowner)
+ access |= ACE_WRITE_ACL;
+ } else {
+ if (! isowner)
+ access |= ACE_WRITE_ACL;
+ }
+
+ /* read */
+ if (mode & S_IROTH) {
+ access |= ACE_READ_DATA;
+ }
+ /* write */
+ if (mode & S_IWOTH) {
+ access |= ACE_WRITE_DATA |
+ ACE_APPEND_DATA;
+ if (isdir)
+ access |= ACE_DELETE_CHILD;
+ }
+ /* exec */
+ if (mode & S_IXOTH) {
+ access |= ACE_EXECUTE;
+ }
+
+ return (access);
+}
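For a concrete sense of the mapping, tracing the routine above together with access_mask_set() for a non-directory owner entry with full rwx permissions (a_perm = 07, isowner = 1, isallow = 1) yields the mask below; this is a worked illustration of the code paths, not a normative table:

/*
 * mode_to_ace_access(07, B_FALSE, 1, 1) ==
 *	ACE_READ_DATA | ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE |
 *	ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL |
 *	ACE_WRITE_ATTRIBUTES | ACE_SYNCHRONIZE
 */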
+
+/*
+ * Given an nfsace (presumably an ALLOW entry), make a
+ * corresponding DENY entry at the address given.
+ */
+static void
+ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner)
+{
+ (void) memcpy(deny, allow, sizeof (ace_t));
+
+ deny->a_who = allow->a_who;
+
+ deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS;
+ if (isdir)
+ deny->a_access_mask ^= ACE_DELETE_CHILD;
+
+ deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER |
+ ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS |
+ ACE_WRITE_NAMED_ATTRS);
+ deny->a_access_mask |= access_mask_set((allow->a_access_mask &
+ ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner,
+ B_FALSE);
+}
+/*
+ * Make an initial pass over an array of aclent_t's. Gather
+ * information such as an ACL_MASK (if any), number of users,
+ * number of groups, and whether the array needs to be sorted.
+ */
+static int
+ln_aent_preprocess(aclent_t *aclent, int n,
+ int *hasmask, mode_t *mask,
+ int *numuser, int *numgroup, int *needsort)
+{
+ int error = 0;
+ int i;
+ int curtype = 0;
+
+ *hasmask = 0;
+ *mask = 07;
+ *needsort = 0;
+ *numuser = 0;
+ *numgroup = 0;
+
+ for (i = 0; i < n; i++) {
+ if (aclent[i].a_type < curtype)
+ *needsort = 1;
+ else if (aclent[i].a_type > curtype)
+ curtype = aclent[i].a_type;
+ if (aclent[i].a_type & USER)
+ (*numuser)++;
+ if (aclent[i].a_type & (GROUP | GROUP_OBJ))
+ (*numgroup)++;
+ if (aclent[i].a_type & CLASS_OBJ) {
+ if (*hasmask) {
+ error = EINVAL;
+ goto out;
+ } else {
+ *hasmask = 1;
+ *mask = aclent[i].a_perm;
+ }
+ }
+ }
+
+ if ((! *hasmask) && (*numuser + *numgroup > 1)) {
+ error = EINVAL;
+ goto out;
+ }
+
+out:
+ return (error);
+}
+
+/*
+ * Convert an array of aclent_t into an array of nfsace entries,
+ * following POSIX draft -> nfsv4 conversion semantics as outlined in
+ * the IETF draft.
+ */
+static int
+ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir)
+{
+ int error = 0;
+ mode_t mask;
+ int numuser, numgroup, needsort;
+ int resultsize = 0;
+ int i, groupi = 0, skip;
+ ace_t *acep, *result = NULL;
+ int hasmask;
+
+ error = ln_aent_preprocess(aclent, n, &hasmask, &mask,
+ &numuser, &numgroup, &needsort);
+ if (error != 0)
+ goto out;
+
+ /* allow + deny for each aclent */
+ resultsize = n * 2;
+ if (hasmask) {
+ /*
+ * stick extra deny on the group_obj and on each
+ * user|group for the mask (the group_obj was added
+ * into the count for numgroup)
+ */
+ resultsize += numuser + numgroup;
+ /* ... and don't count the mask itself */
+ resultsize -= 2;
+ }
+
+ /* sort the source if necessary */
+ if (needsort)
+ ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls);
+
+ if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0)
+ goto out;
+
+ acep = result;
+
+ for (i = 0; i < n; i++) {
+ /*
+ * don't process CLASS_OBJ (mask); mask was grabbed in
+ * ln_aent_preprocess()
+ */
+ if (aclent[i].a_type & CLASS_OBJ)
+ continue;
+
+ /* If we need an ACL_MASK emulator, prepend it now */
+ if ((hasmask) &&
+ (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) {
+ acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ acep->a_flags = 0;
+ if (aclent[i].a_type & GROUP_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |=
+ (ACE_IDENTIFIER_GROUP|ACE_GROUP);
+ } else if (aclent[i].a_type & USER) {
+ acep->a_who = aclent[i].a_id;
+ } else {
+ acep->a_who = aclent[i].a_id;
+ acep->a_flags |= ACE_IDENTIFIER_GROUP;
+ }
+ if (aclent[i].a_type & ACL_DEFAULT) {
+ acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE;
+ }
+ /*
+ * Set the access mask for the prepended deny
+ * ace. To do this, we invert the mask (found
+			 * in ln_aent_preprocess()), then convert it to a
+			 * DENY ace access_mask.
+ */
+ acep->a_access_mask = mode_to_ace_access((mask ^ 07),
+ isdir, 0, 0);
+ acep += 1;
+ }
+
+ /* handle a_perm -> access_mask */
+ acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm,
+ isdir, aclent[i].a_type & USER_OBJ, 1);
+
+ /* emulate a default aclent */
+ if (aclent[i].a_type & ACL_DEFAULT) {
+ acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE;
+ }
+
+ /*
+ * handle a_perm and a_id
+ *
+ * this must be done last, since it involves the
+ * corresponding deny aces, which are handled
+ * differently for each different a_type.
+ */
+ if (aclent[i].a_type & USER_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_OWNER;
+ ace_make_deny(acep, acep + 1, isdir, B_TRUE);
+ acep += 2;
+ } else if (aclent[i].a_type & USER) {
+ acep->a_who = aclent[i].a_id;
+ ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+ acep += 2;
+ } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) {
+ if (aclent[i].a_type & GROUP_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_GROUP;
+ } else {
+ acep->a_who = aclent[i].a_id;
+ }
+ acep->a_flags |= ACE_IDENTIFIER_GROUP;
+ /*
+ * Set the corresponding deny for the group ace.
+ *
+ * The deny aces go after all of the groups, unlike
+ * everything else, where they immediately follow
+ * the allow ace.
+ *
+ * We calculate "skip", the number of slots to
+ * skip ahead for the deny ace, here.
+ *
+ * The pattern is:
+ * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3
+ * thus, skip is
+ * (2 * numgroup) - 1 - groupi
+ * (2 * numgroup) to account for MD + A
+ * - 1 to account for the fact that we're on the
+ * access (A), not the mask (MD)
+ * - groupi to account for the fact that we have
+ * passed up groupi number of MD's.
+ */
+ skip = (2 * numgroup) - 1 - groupi;
+ ace_make_deny(acep, acep + skip, isdir, B_FALSE);
+ /*
+ * If we just did the last group, skip acep past
+ * all of the denies; else, just move ahead one.
+ */
+ if (++groupi >= numgroup)
+ acep += numgroup + 1;
+ else
+ acep += 1;
+ } else if (aclent[i].a_type & OTHER_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_EVERYONE;
+ ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+ acep += 2;
+ } else {
+ error = EINVAL;
+ goto out;
+ }
+ }
+
+ *acepp = result;
+ *rescount = resultsize;
+
+out:
+ if (error != 0) {
+ if ((result != NULL) && (resultsize > 0)) {
+ cacl_free(result, resultsize * sizeof (ace_t));
+ }
+ }
+
+ return (error);
+}
+
+static int
+convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir,
+ ace_t **retacep, int *retacecnt)
+{
+ ace_t *acep;
+ ace_t *dfacep;
+ int acecnt = 0;
+ int dfacecnt = 0;
+ int dfaclstart = 0;
+ int dfaclcnt = 0;
+ aclent_t *aclp;
+ int i;
+ int error;
+ int acesz, dfacesz;
+
+ ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls);
+
+ for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) {
+ if (aclp->a_type & ACL_DEFAULT)
+ break;
+ }
+
+ if (i < aclcnt) {
+ dfaclstart = i;
+ dfaclcnt = aclcnt - i;
+ }
+
+ if (dfaclcnt && !isdir) {
+ return (EINVAL);
+ }
+
+ error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir);
+ if (error)
+ return (error);
+
+ if (dfaclcnt) {
+ error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt,
+ &dfacep, &dfacecnt, isdir);
+ if (error) {
+ if (acep) {
+ cacl_free(acep, acecnt * sizeof (ace_t));
+ }
+ return (error);
+ }
+ }
+
+ if (dfacecnt != 0) {
+ acesz = sizeof (ace_t) * acecnt;
+ dfacesz = sizeof (ace_t) * dfacecnt;
+ acep = cacl_realloc(acep, acesz, acesz + dfacesz);
+ if (acep == NULL)
+ return (ENOMEM);
+ if (dfaclcnt) {
+ (void) memcpy(acep + acecnt, dfacep, dfacesz);
+ }
+ }
+ if (dfaclcnt)
+ cacl_free(dfacep, dfacecnt * sizeof (ace_t));
+
+ *retacecnt = acecnt + dfacecnt;
+ *retacep = acep;
+ return (0);
+}
+
+static int
+ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
+{
+ int error = 0;
+ o_mode_t mode = 0;
+ uint32_t bits, wantbits;
+
+ /* read */
+ if (mask & ACE_READ_DATA)
+ mode |= S_IROTH;
+
+ /* write */
+ wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA);
+ if (isdir)
+ wantbits |= ACE_DELETE_CHILD;
+ bits = mask & wantbits;
+ if (bits != 0) {
+ if (bits != wantbits) {
+ error = ENOTSUP;
+ goto out;
+ }
+ mode |= S_IWOTH;
+ }
+
+ /* exec */
+ if (mask & ACE_EXECUTE) {
+ mode |= S_IXOTH;
+ }
+
+ *modep = mode;
+
+out:
+ return (error);
+}
+
+static void
+acevals_init(acevals_t *vals, uid_t key)
+{
+ bzero(vals, sizeof (*vals));
+ vals->allowed = ACE_MASK_UNDEFINED;
+ vals->denied = ACE_MASK_UNDEFINED;
+ vals->mask = ACE_MASK_UNDEFINED;
+ vals->key = key;
+}
+
+static void
+ace_list_init(ace_list_t *al, int dfacl_flag)
+{
+ acevals_init(&al->user_obj, 0);
+ acevals_init(&al->group_obj, 0);
+ acevals_init(&al->other_obj, 0);
+ al->numusers = 0;
+ al->numgroups = 0;
+ al->acl_mask = 0;
+ al->hasmask = 0;
+ al->state = ace_unused;
+ al->seen = 0;
+ al->dfacl_flag = dfacl_flag;
+}
+
+/*
+ * Find or create an acevals holder for a given id and avl tree.
+ *
+ * Note that only one thread will ever touch these avl trees, so
+ * there is no need for locking.
+ */
+static acevals_t *
+acevals_find(ace_t *ace, avl_tree_t *avl, int *num)
+{
+ acevals_t key, *rc;
+ avl_index_t where;
+
+ key.key = ace->a_who;
+ rc = avl_find(avl, &key, &where);
+ if (rc != NULL)
+ return (rc);
+
+ /* this memory is freed by ln_ace_to_aent()->ace_list_free() */
+ if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0)
+ return (NULL);
+
+ acevals_init(rc, ace->a_who);
+ avl_insert(avl, rc, where);
+ (*num)++;
+
+ return (rc);
+}
+
+static int
+access_mask_check(ace_t *acep, int mask_bit, int isowner)
+{
+ int set_deny, err_deny;
+ int set_allow, err_allow;
+ int acl_consume;
+ int haswriteperm, hasreadperm;
+
+ if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+ haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1;
+ hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1;
+ } else {
+ haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0;
+ hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0;
+ }
+
+ acl_consume = (ACL_SYNCHRONIZE_ERR_DENY |
+ ACL_DELETE_ERR_DENY |
+ ACL_WRITE_OWNER_ERR_DENY |
+ ACL_WRITE_OWNER_ERR_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_ERR_DENY |
+ ACL_WRITE_ATTRS_WRITER_SET_DENY |
+ ACL_WRITE_ATTRS_WRITER_ERR_ALLOW |
+ ACL_WRITE_NAMED_WRITER_ERR_DENY |
+ ACL_READ_NAMED_READER_ERR_DENY);
+
+ if (mask_bit == ACE_SYNCHRONIZE) {
+ set_deny = ACL_SYNCHRONIZE_SET_DENY;
+ err_deny = ACL_SYNCHRONIZE_ERR_DENY;
+ set_allow = ACL_SYNCHRONIZE_SET_ALLOW;
+ err_allow = ACL_SYNCHRONIZE_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_OWNER) {
+ set_deny = ACL_WRITE_OWNER_SET_DENY;
+ err_deny = ACL_WRITE_OWNER_ERR_DENY;
+ set_allow = ACL_WRITE_OWNER_SET_ALLOW;
+ err_allow = ACL_WRITE_OWNER_ERR_ALLOW;
+ } else if (mask_bit == ACE_DELETE) {
+ set_deny = ACL_DELETE_SET_DENY;
+ err_deny = ACL_DELETE_ERR_DENY;
+ set_allow = ACL_DELETE_SET_ALLOW;
+ err_allow = ACL_DELETE_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_ATTRIBUTES) {
+ if (isowner) {
+ set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+ err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY;
+ set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+ err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW;
+ } else if (haswriteperm) {
+ set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+ err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY;
+ set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+ err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW;
+ } else {
+ if ((acep->a_access_mask & mask_bit) &&
+ (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+ return (ENOTSUP);
+ }
+ return (0);
+ }
+ } else if (mask_bit == ACE_READ_NAMED_ATTRS) {
+ if (!hasreadperm)
+ return (0);
+
+ set_deny = ACL_READ_NAMED_READER_SET_DENY;
+ err_deny = ACL_READ_NAMED_READER_ERR_DENY;
+ set_allow = ACL_READ_NAMED_READER_SET_ALLOW;
+ err_allow = ACL_READ_NAMED_READER_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) {
+ if (!haswriteperm)
+ return (0);
+
+ set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY;
+ err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY;
+ set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+ err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW;
+ } else {
+ return (EINVAL);
+ }
+
+ if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+ if (acl_consume & set_deny) {
+ if (!(acep->a_access_mask & mask_bit)) {
+ return (ENOTSUP);
+ }
+ } else if (acl_consume & err_deny) {
+ if (acep->a_access_mask & mask_bit) {
+ return (ENOTSUP);
+ }
+ }
+ } else {
+ /* ACE_ACCESS_ALLOWED_ACE_TYPE */
+ if (acl_consume & set_allow) {
+ if (!(acep->a_access_mask & mask_bit)) {
+ return (ENOTSUP);
+ }
+ } else if (acl_consume & err_allow) {
+ if (acep->a_access_mask & mask_bit) {
+ return (ENOTSUP);
+ }
+ }
+ }
+ return (0);
+}
+
+static int
+ace_to_aent_legal(ace_t *acep)
+{
+ int error = 0;
+ int isowner;
+
+ /* only ALLOW or DENY */
+ if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+ (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /* check for invalid flags */
+ if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /* some flags are illegal */
+ if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG |
+ ACE_FAILED_ACCESS_ACE_FLAG |
+ ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /* check for invalid masks */
+ if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if ((acep->a_flags & ACE_OWNER)) {
+ isowner = 1;
+ } else {
+ isowner = 0;
+ }
+
+ error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_OWNER, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_DELETE, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_NAMED_ATTRS, isowner);
+ if (error)
+ goto out;
+
+ /* more detailed checking of masks */
+ if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+ (! (acep->a_access_mask & ACE_APPEND_DATA))) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((! (acep->a_access_mask & ACE_WRITE_DATA)) &&
+ (acep->a_access_mask & ACE_APPEND_DATA)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ }
+
+ /* ACL enforcement */
+ if ((acep->a_access_mask & ACE_READ_ACL) &&
+ (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if (acep->a_access_mask & ACE_WRITE_ACL) {
+ if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) &&
+ (isowner)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+ (! isowner)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ }
+
+out:
+ return (error);
+}
+
+static int
+ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
+{
+ /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */
+ if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) !=
+ (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) {
+ return (ENOTSUP);
+ }
+
+ return (ace_mask_to_mode(mask, modep, isdir));
+}
+
+static int
+acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list,
+ uid_t owner, gid_t group, boolean_t isdir)
+{
+ int error;
+ uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
+
+ if (isdir)
+ flips |= ACE_DELETE_CHILD;
+ if (vals->allowed != (vals->denied ^ flips)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((list->hasmask) && (list->acl_mask != vals->mask) &&
+ (vals->aent_type & (USER | GROUP | GROUP_OBJ))) {
+ error = ENOTSUP;
+ goto out;
+ }
+ error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ dest->a_type = vals->aent_type;
+ if (dest->a_type & (USER | GROUP)) {
+ dest->a_id = vals->key;
+ } else if (dest->a_type & USER_OBJ) {
+ dest->a_id = owner;
+ } else if (dest->a_type & GROUP_OBJ) {
+ dest->a_id = group;
+ } else if (dest->a_type & OTHER_OBJ) {
+ dest->a_id = 0;
+ } else {
+ error = EINVAL;
+ goto out;
+ }
+
+out:
+ return (error);
+}
+
+
+static int
+ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt,
+ uid_t owner, gid_t group, boolean_t isdir)
+{
+ int error = 0;
+ aclent_t *aent, *result = NULL;
+ acevals_t *vals;
+ int resultcount;
+
+ if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) !=
+ (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ resultcount = 3 + list->numusers + list->numgroups;
+ /*
+ * This must be the same condition as below, when we add the CLASS_OBJ
+ * (aka ACL mask)
+ */
+ if ((list->hasmask) || (! list->dfacl_flag))
+ resultcount += 1;
+
+ if (cacl_malloc((void **)&result,
+ resultcount * sizeof (aclent_t)) != 0) {
+ error = ENOMEM;
+ goto out;
+ }
+ aent = result;
+
+ /* USER_OBJ */
+ if (!(list->user_obj.aent_type & USER_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = acevals_to_aent(&list->user_obj, aent, list, owner, group,
+ isdir);
+
+ if (error != 0)
+ goto out;
+ ++aent;
+ /* USER */
+ vals = NULL;
+ for (vals = avl_first(&list->user); vals != NULL;
+ vals = AVL_NEXT(&list->user, vals)) {
+ if (!(vals->aent_type & USER)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(vals, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ }
+ /* GROUP_OBJ */
+ if (!(list->group_obj.aent_type & GROUP_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(&list->group_obj, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ /* GROUP */
+ vals = NULL;
+ for (vals = avl_first(&list->group); vals != NULL;
+ vals = AVL_NEXT(&list->group, vals)) {
+ if (!(vals->aent_type & GROUP)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(vals, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ }
+ /*
+ * CLASS_OBJ (aka ACL_MASK)
+ *
+ * An ACL_MASK is not fabricated if the ACL is a default ACL.
+ * This is to follow UFS's behavior.
+ */
+ if ((list->hasmask) || (! list->dfacl_flag)) {
+ if (list->hasmask) {
+ uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
+ if (isdir)
+ flips |= ACE_DELETE_CHILD;
+ error = ace_mask_to_mode(list->acl_mask ^ flips,
+ &aent->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ } else {
+ /* fabricate the ACL_MASK from the group permissions */
+ error = ace_mask_to_mode(list->group_obj.allowed,
+ &aent->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ }
+ aent->a_id = 0;
+ aent->a_type = CLASS_OBJ | list->dfacl_flag;
+ ++aent;
+ }
+ /* OTHER_OBJ */
+ if (!(list->other_obj.aent_type & OTHER_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(&list->other_obj, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+
+ *aclentp = result;
+ *aclcnt = resultcount;
+
+out:
+ if (error != 0) {
+ if (result != NULL)
+ cacl_free(result, resultcount * sizeof (aclent_t));
+ }
+
+ return (error);
+}
+
+
+/*
+ * free all data associated with an ace_list
+ */
+static void
+ace_list_free(ace_list_t *al)
+{
+ acevals_t *node;
+ void *cookie;
+
+ if (al == NULL)
+ return;
+
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL)
+ cacl_free(node, sizeof (acevals_t));
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL)
+ cacl_free(node, sizeof (acevals_t));
+
+ avl_destroy(&al->user);
+ avl_destroy(&al->group);
+
+ /* free the container itself */
+ cacl_free(al, sizeof (ace_list_t));
+}
+
+static int
+acevals_compare(const void *va, const void *vb)
+{
+ const acevals_t *a = va, *b = vb;
+
+ if (a->key == b->key)
+ return (0);
+
+ if (a->key > b->key)
+ return (1);
+
+ else
+ return (-1);
+}
+
+/*
+ * Convert a list of ace_t entries to equivalent regular and default
+ * aclent_t lists. Return error (ENOTSUP) when conversion is not possible.
+ */
+static int
+ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group,
+ aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt,
+ boolean_t isdir)
+{
+ int error = 0;
+ ace_t *acep;
+ uint32_t bits;
+ int i;
+ ace_list_t *normacl = NULL, *dfacl = NULL, *acl;
+ acevals_t *vals;
+
+ *aclentp = NULL;
+ *aclcnt = 0;
+ *dfaclentp = NULL;
+ *dfaclcnt = 0;
+
+ /* we need at least user_obj, group_obj, and other_obj */
+ if (n < 6) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if (ace == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = cacl_malloc((void **)&normacl, sizeof (ace_list_t));
+ if (error != 0)
+ goto out;
+
+ avl_create(&normacl->user, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ avl_create(&normacl->group, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+
+ ace_list_init(normacl, 0);
+
+ error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t));
+ if (error != 0)
+ goto out;
+
+ avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ ace_list_init(dfacl, ACL_DEFAULT);
+
+ /* process every ace_t... */
+ for (i = 0; i < n; i++) {
+ acep = &ace[i];
+
+ /* rule out certain cases quickly */
+ error = ace_to_aent_legal(acep);
+ if (error != 0)
+ goto out;
+
+ /*
+		 * Turn off these bits so that we do not have to worry about
+		 * them when checking for complements.
+ */
+ acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE |
+ ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES |
+ ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS);
+
+ /* see if this should be a regular or default acl */
+ bits = acep->a_flags &
+ (ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE);
+ if (bits != 0) {
+ /* all or nothing on these inherit bits */
+ if (bits != (ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl = dfacl;
+ } else {
+ acl = normacl;
+ }
+
+ if ((acep->a_flags & ACE_OWNER)) {
+ if (acl->state > ace_user_obj) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl->state = ace_user_obj;
+ acl->seen |= USER_OBJ;
+ vals = &acl->user_obj;
+ vals->aent_type = USER_OBJ | acl->dfacl_flag;
+ } else if ((acep->a_flags & ACE_EVERYONE)) {
+ acl->state = ace_other_obj;
+ acl->seen |= OTHER_OBJ;
+ vals = &acl->other_obj;
+ vals->aent_type = OTHER_OBJ | acl->dfacl_flag;
+ } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) {
+ if (acl->state > ace_group) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_flags & ACE_GROUP)) {
+ acl->seen |= GROUP_OBJ;
+ vals = &acl->group_obj;
+ vals->aent_type = GROUP_OBJ | acl->dfacl_flag;
+ } else {
+ acl->seen |= GROUP;
+ vals = acevals_find(acep, &acl->group,
+ &acl->numgroups);
+ if (vals == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ vals->aent_type = GROUP | acl->dfacl_flag;
+ }
+ acl->state = ace_group;
+ } else {
+ if (acl->state > ace_user) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl->state = ace_user;
+ acl->seen |= USER;
+ vals = acevals_find(acep, &acl->user,
+ &acl->numusers);
+ if (vals == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ vals->aent_type = USER | acl->dfacl_flag;
+ }
+
+ if (!(acl->state > ace_unused)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ /* no more than one allowed per aclent_t */
+ if (vals->allowed != ACE_MASK_UNDEFINED) {
+ error = ENOTSUP;
+ goto out;
+ }
+ vals->allowed = acep->a_access_mask;
+ } else {
+ /*
+ * it's a DENY; if there was a previous DENY, it
+ * must have been an ACL_MASK.
+ */
+ if (vals->denied != ACE_MASK_UNDEFINED) {
+ /* ACL_MASK is for USER and GROUP only */
+ if ((acl->state != ace_user) &&
+ (acl->state != ace_group)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ if (! acl->hasmask) {
+ acl->hasmask = 1;
+ acl->acl_mask = vals->denied;
+ /* check for mismatched ACL_MASK emulations */
+ } else if (acl->acl_mask != vals->denied) {
+ error = ENOTSUP;
+ goto out;
+ }
+ vals->mask = vals->denied;
+ }
+ vals->denied = acep->a_access_mask;
+ }
+ }
+
+ /* done collating; produce the aclent_t lists */
+ if (normacl->state != ace_unused) {
+ error = ace_list_to_aent(normacl, aclentp, aclcnt,
+ owner, group, isdir);
+ if (error != 0) {
+ goto out;
+ }
+ }
+ if (dfacl->state != ace_unused) {
+ error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt,
+ owner, group, isdir);
+ if (error != 0) {
+ goto out;
+ }
+ }
+
+out:
+ if (normacl != NULL)
+ ace_list_free(normacl);
+ if (dfacl != NULL)
+ ace_list_free(dfacl);
+
+ return (error);
+}
+
+static int
+convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir,
+ uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt)
+{
+ int error = 0;
+ aclent_t *aclentp, *dfaclentp;
+ int aclcnt, dfaclcnt;
+ int aclsz, dfaclsz;
+
+ error = ln_ace_to_aent(acebufp, acecnt, owner, group,
+ &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir);
+
+ if (error)
+ return (error);
+
+
+ if (dfaclcnt != 0) {
+ /*
+ * Slap aclentp and dfaclentp into a single array.
+ */
+ aclsz = sizeof (aclent_t) * aclcnt;
+ dfaclsz = sizeof (aclent_t) * dfaclcnt;
+ aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz);
+ if (aclentp != NULL) {
+ (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz);
+ } else {
+ error = ENOMEM;
+ }
+ }
+
+ if (aclentp) {
+ *retaclentp = aclentp;
+ *retaclcnt = aclcnt + dfaclcnt;
+ }
+
+ if (dfaclentp)
+ cacl_free(dfaclentp, dfaclsz);
+
+ return (error);
+}
+
+
+int
+acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner,
+ gid_t group)
+{
+ int aclcnt;
+ void *acldata;
+ int error;
+
+ /*
+ * See if we need to translate
+ */
+ if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) ||
+ (target_flavor == _ACL_ACLENT_ENABLED &&
+ aclp->acl_type == ACLENT_T))
+ return (0);
+
+ if (target_flavor == -1) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (target_flavor == _ACL_ACE_ENABLED &&
+ aclp->acl_type == ACLENT_T) {
+ error = convert_aent_to_ace(aclp->acl_aclp,
+ aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt);
+ if (error)
+ goto out;
+
+ } else if (target_flavor == _ACL_ACLENT_ENABLED &&
+ aclp->acl_type == ACE_T) {
+ error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt,
+ isdir, owner, group, (aclent_t **)&acldata, &aclcnt);
+ if (error)
+ goto out;
+ } else {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /*
+ * replace old acl with newly translated acl
+ */
+ cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size);
+ aclp->acl_aclp = acldata;
+ aclp->acl_cnt = aclcnt;
+ if (target_flavor == _ACL_ACE_ENABLED) {
+ aclp->acl_type = ACE_T;
+ aclp->acl_entry_size = sizeof (ace_t);
+ } else {
+ aclp->acl_type = ACLENT_T;
+ aclp->acl_entry_size = sizeof (aclent_t);
+ }
+ return (0);
+
+out:
+
+#if !defined(_KERNEL)
+ errno = error;
+ return (-1);
+#else
+ return (error);
+#endif
+}
+#endif /* !_KERNEL */
+
+#define SET_ACE(acl, index, who, mask, type, flags) { \
+ acl[0][index].a_who = (uint32_t)who; \
+ acl[0][index].a_type = type; \
+ acl[0][index].a_flags = flags; \
+ acl[0][index++].a_access_mask = mask; \
+}
+
+void
+acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
+{
+ uint32_t read_mask = ACE_READ_DATA;
+ uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA;
+ uint32_t execute_mask = ACE_EXECUTE;
+
+ (void) isdir; /* will need this later */
+
+ masks->deny1 = 0;
+ if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+ masks->deny1 |= read_mask;
+ if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+ masks->deny1 |= write_mask;
+ if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+ masks->deny1 |= execute_mask;
+
+ masks->deny2 = 0;
+ if (!(mode & S_IRGRP) && (mode & S_IROTH))
+ masks->deny2 |= read_mask;
+ if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+ masks->deny2 |= write_mask;
+ if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+ masks->deny2 |= execute_mask;
+
+ masks->allow0 = 0;
+ if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+ masks->allow0 |= read_mask;
+ if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+ masks->allow0 |= write_mask;
+ if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+ masks->allow0 |= execute_mask;
+
+ masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+ ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+ if (mode & S_IRUSR)
+ masks->owner |= read_mask;
+ if (mode & S_IWUSR)
+ masks->owner |= write_mask;
+ if (mode & S_IXUSR)
+ masks->owner |= execute_mask;
+
+ masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IRGRP)
+ masks->group |= read_mask;
+ if (mode & S_IWGRP)
+ masks->group |= write_mask;
+ if (mode & S_IXGRP)
+ masks->group |= execute_mask;
+
+ masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IROTH)
+ masks->everyone |= read_mask;
+ if (mode & S_IWOTH)
+ masks->everyone |= write_mask;
+ if (mode & S_IXOTH)
+ masks->everyone |= execute_mask;
+}
+
+int
+acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count)
+{
+ int index = 0;
+ int error;
+ trivial_acl_t masks;
+
+ *count = 3;
+ acl_trivial_access_masks(mode, isdir, &masks);
+
+ if (masks.allow0)
+ (*count)++;
+ if (masks.deny1)
+ (*count)++;
+ if (masks.deny2)
+ (*count)++;
+
+ if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0)
+ return (error);
+
+ if (masks.allow0) {
+ SET_ACE(acl, index, -1, masks.allow0,
+ ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER);
+ }
+ if (masks.deny1) {
+ SET_ACE(acl, index, -1, masks.deny1,
+ ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER);
+ }
+ if (masks.deny2) {
+ SET_ACE(acl, index, -1, masks.deny2,
+ ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP);
+ }
+
+ SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_OWNER);
+ SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_IDENTIFIER_GROUP|ACE_GROUP);
+ SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_EVERYONE);
+
+ return (0);
+}
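A short sketch of how a caller might use acl_trivial_create() for a plain 0644 file; for that mode none of the allow0/deny1/deny2 masks are set, so exactly the three owner@/group@/everyone@ entries come back. The helper name is invented for illustration, and cacl_free() is assumed to be the matching release routine from acl_common.h:

static int
trivial_acl_example(void)
{
	ace_t *acl;
	int count, error;

	error = acl_trivial_create(0644, B_FALSE, &acl, &count);
	if (error != 0)
		return (error);

	ASSERT(count == 3);	/* owner@, group@, everyone@ */
	/* ... consume acl[0 .. count - 1] ... */

	cacl_free(acl, count * sizeof (ace_t));
	return (0);
}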
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t acl is trivial
+ *
+ * An ACL is trivial when it is composed of only owner@, group@, and
+ * everyone@ entries, it does not deny read_acl anywhere, and
+ * write_owner/write_acl/write_attributes appear only on the owner@
+ * entry.
+ */
+int
+ace_trivial_common(void *acep, int aclcnt,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt,
+ uint16_t *, uint16_t *, uint32_t *))
+{
+ uint16_t flags;
+ uint32_t mask;
+ uint16_t type;
+ uint64_t cookie = 0;
+
+ while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) {
+ switch (flags & ACE_TYPE_FLAGS) {
+ case ACE_OWNER:
+ case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+ case ACE_EVERYONE:
+ break;
+ default:
+ return (1);
+
+ }
+
+ if (flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+ ACE_INHERIT_ONLY_ACE))
+ return (1);
+
+ /*
+ * Special check for some special bits
+ *
+		 * Don't allow anybody to deny reading basic
+		 * attributes or a file's ACL.
+ */
+ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ (type == ACE_ACCESS_DENIED_ACE_TYPE))
+ return (1);
+
+ /*
+ * Delete permissions are never set by default
+ */
+ if (mask & (ACE_DELETE|ACE_DELETE_CHILD))
+ return (1);
+ /*
+ * only allow owner@ to have
+ * write_acl/write_owner/write_attributes/write_xattr/
+ */
+ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ (!(flags & ACE_OWNER) && (mask &
+ (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
+ ACE_WRITE_NAMED_ATTRS))))
+ return (1);
+
+ }
+ return (0);
+}
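The walk callback reports one ACE per call and returns a non-zero cookie until the list is exhausted, at which point it returns 0. The real callers pass ZFS-specific walkers; the sketch below is a purely illustrative walker over a flat ace_t array:

static uint64_t
ace_array_walk(void *datap, uint64_t cookie, int aclcnt,
    uint16_t *flags, uint16_t *type, uint32_t *mask)
{
	ace_t *acep = datap;

	/* The incoming cookie is the index of the entry to report. */
	if (cookie >= (uint64_t)aclcnt)
		return (0);

	*flags = acep[cookie].a_flags;
	*type = acep[cookie].a_type;
	*mask = acep[cookie].a_access_mask;

	/* A non-zero return keeps ace_trivial_common() walking; 0 stops it. */
	return (cookie + 1);
}

/* Usage: trivial = (ace_trivial_common(acep, aclcnt, ace_array_walk) == 0); */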
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/callb.c b/sys/contrib/openzfs/module/os/freebsd/spl/callb.c
new file mode 100644
index 000000000000..fffa85b6b91b
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/callb.c
@@ -0,0 +1,373 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+#include <sys/callb.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/kobj.h>
+#include <sys/systm.h> /* for delay() */
+#include <sys/taskq.h> /* For TASKQ_NAMELEN */
+#include <sys/kernel.h>
+
+#define CB_MAXNAME TASKQ_NAMELEN
+
+/*
+ * The callb mechanism provides generic event scheduling/echoing.
+ * A callb function is registered and called on behalf of the event.
+ */
+typedef struct callb {
+ struct callb *c_next; /* next in class or on freelist */
+ kthread_id_t c_thread; /* ptr to caller's thread struct */
+ char c_flag; /* info about the callb state */
+ uchar_t c_class; /* this callb's class */
+ kcondvar_t c_done_cv; /* signal callb completion */
+ boolean_t (*c_func)(void *, int);
+ /* cb function: returns true if ok */
+ void *c_arg; /* arg to c_func */
+ char c_name[CB_MAXNAME+1]; /* debug:max func name length */
+} callb_t;
+
+/*
+ * callb c_flag bitmap definitions
+ */
+#define CALLB_FREE 0x0
+#define CALLB_TAKEN 0x1
+#define CALLB_EXECUTING 0x2
+
+/*
+ * Basic structure for a callb table.
+ * All callbs are organized into class groups described by the
+ * ct_first_cb array.
+ * The callbs within a class are singly linked and normally run
+ * serially.
+ */
+typedef struct callb_table {
+ kmutex_t ct_lock; /* protect all callb states */
+ callb_t *ct_freelist; /* free callb structures */
+ int ct_busy; /* != 0 prevents additions */
+ kcondvar_t ct_busy_cv; /* to wait for not busy */
+ int ct_ncallb; /* num of callbs allocated */
+ callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */
+} callb_table_t;
+
+int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC;
+
+static callb_id_t callb_add_common(boolean_t (*)(void *, int),
+ void *, int, char *, kthread_id_t);
+
+static callb_table_t callb_table; /* system level callback table */
+static callb_table_t *ct = &callb_table;
+static kmutex_t callb_safe_mutex;
+callb_cpr_t callb_cprinfo_safe = {
+ &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, {0, 0} };
+
+/*
+ * Init all callb tables in the system.
+ */
+static void
+callb_init(void *dummy __unused)
+{
+ callb_table.ct_busy = 0; /* mark table open for additions */
+ mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+callb_fini(void *dummy __unused)
+{
+ callb_t *cp;
+ int i;
+
+ mutex_enter(&ct->ct_lock);
+ for (i = 0; i < 16; i++) {
+ while ((cp = ct->ct_freelist) != NULL) {
+ ct->ct_freelist = cp->c_next;
+ ct->ct_ncallb--;
+ kmem_free(cp, sizeof (callb_t));
+ }
+ if (ct->ct_ncallb == 0)
+ break;
+ /* Not all callbacks finished, waiting for the rest. */
+ mutex_exit(&ct->ct_lock);
+ tsleep(ct, 0, "callb", hz / 4);
+ mutex_enter(&ct->ct_lock);
+ }
+ if (ct->ct_ncallb > 0)
+ printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb);
+ mutex_exit(&ct->ct_lock);
+ mutex_destroy(&callb_safe_mutex);
+ mutex_destroy(&callb_table.ct_lock);
+}
+
+/*
+ * callb_add_common() is called to register func() to be called later.
+ */
+static callb_id_t
+callb_add_common(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ callb_t *cp;
+
+ ASSERT(class < NCBCLASS);
+
+ mutex_enter(&ct->ct_lock);
+ while (ct->ct_busy)
+ cv_wait(&ct->ct_busy_cv, &ct->ct_lock);
+ if ((cp = ct->ct_freelist) == NULL) {
+ ct->ct_ncallb++;
+ cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP);
+ }
+ ct->ct_freelist = cp->c_next;
+ cp->c_thread = t;
+ cp->c_func = func;
+ cp->c_arg = arg;
+ cp->c_class = (uchar_t)class;
+ cp->c_flag |= CALLB_TAKEN;
+#ifdef ZFS_DEBUG
+ if (strlen(name) > CB_MAXNAME)
+ cmn_err(CE_WARN, "callb_add: name of callback function '%s' "
+ "too long -- truncated to %d chars",
+ name, CB_MAXNAME);
+#endif
+ (void) strncpy(cp->c_name, name, CB_MAXNAME);
+ cp->c_name[CB_MAXNAME] = '\0';
+
+ /*
+ * Insert the new callb at the head of its class list.
+ */
+ cp->c_next = ct->ct_first_cb[class];
+ ct->ct_first_cb[class] = cp;
+
+ mutex_exit(&ct->ct_lock);
+ return ((callb_id_t)cp);
+}
+
+/*
+ * The default function to add an entry to the callback table. Since
+ * it uses curthread as the thread identifier to store in the table,
+ * it should be used for the normal case of a thread which is calling
+ * to add ITSELF to the table.
+ */
+callb_id_t
+callb_add(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name)
+{
+ return (callb_add_common(func, arg, class, name, curthread));
+}
+
+/*
+ * A special version of callb_add() above for use by threads which
+ * might be adding an entry to the table on behalf of some other
+ * thread (for example, one which is constructed but not yet running).
+ * In this version the thread id is an argument.
+ */
+callb_id_t
+callb_add_thread(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ return (callb_add_common(func, arg, class, name, t));
+}
+
+/*
+ * callb_delete() is called to remove an entry identified by id
+ * that was originally placed there by a call to callb_add().
+ * Returns -1 if it fails to delete the callb entry, otherwise 0.
+ */
+int
+callb_delete(callb_id_t id)
+{
+ callb_t **pp;
+ callb_t *me = (callb_t *)id;
+
+ mutex_enter(&ct->ct_lock);
+
+ for (;;) {
+ pp = &ct->ct_first_cb[me->c_class];
+ while (*pp != NULL && *pp != me)
+ pp = &(*pp)->c_next;
+
+#ifdef ZFS_DEBUG
+ if (*pp != me) {
+ cmn_err(CE_WARN, "callb delete bogus entry 0x%p",
+ (void *)me);
+ mutex_exit(&ct->ct_lock);
+ return (-1);
+ }
+#endif /* ZFS_DEBUG */
+
+ /*
+		 * A callb must not be deleted while it is executing;
+		 * otherwise callb_execute_class() would get confused.
+ */
+ if (!(me->c_flag & CALLB_EXECUTING))
+ break;
+
+ cv_wait(&me->c_done_cv, &ct->ct_lock);
+ }
+ /* relink the class list */
+ *pp = me->c_next;
+
+ /* clean up myself and return the free callb to the head of freelist */
+ me->c_flag = CALLB_FREE;
+ me->c_next = ct->ct_freelist;
+ ct->ct_freelist = me;
+
+ mutex_exit(&ct->ct_lock);
+ return (0);
+}
+
+/*
+ * class:	execute all callbs in this class;
+ * code: optional argument for the callb functions.
+ * return: = 0: success
+ * != 0: ptr to string supplied when callback was registered
+ */
+void *
+callb_execute_class(int class, int code)
+{
+ callb_t *cp;
+ void *ret = NULL;
+
+ ASSERT(class < NCBCLASS);
+
+ mutex_enter(&ct->ct_lock);
+
+ for (cp = ct->ct_first_cb[class];
+ cp != NULL && ret == 0; cp = cp->c_next) {
+ while (cp->c_flag & CALLB_EXECUTING)
+ cv_wait(&cp->c_done_cv, &ct->ct_lock);
+ /*
+		 * Continue if the callb was deleted while we were sleeping.
+ */
+ if (cp->c_flag == CALLB_FREE)
+ continue;
+ cp->c_flag |= CALLB_EXECUTING;
+
+#ifdef CALLB_DEBUG
+ printf("callb_execute: name=%s func=%p arg=%p\n",
+ cp->c_name, (void *)cp->c_func, (void *)cp->c_arg);
+#endif /* CALLB_DEBUG */
+
+ mutex_exit(&ct->ct_lock);
+ /* If callback function fails, pass back client's name */
+ if (!(*cp->c_func)(cp->c_arg, code))
+ ret = cp->c_name;
+ mutex_enter(&ct->ct_lock);
+
+ cp->c_flag &= ~CALLB_EXECUTING;
+ cv_broadcast(&cp->c_done_cv);
+ }
+ mutex_exit(&ct->ct_lock);
+ return (ret);
+}
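Putting the registration, execution, and removal entry points together: a subsystem registers a handler, some other context later fires every callb in that class, and the handler is removed on teardown. This is a hedged sketch; the handler, its argument, and the choice of the CB_CL_CPR_DAEMON class and CB_CODE_CPR_CHKPT code (both from sys/callb.h) are illustrative:

static boolean_t
my_handler(void *arg, int code)
{
	/* Return B_TRUE on success; B_FALSE makes the class walk report c_name. */
	return (B_TRUE);
}

static void
callb_roundtrip(void *arg)
{
	callb_id_t id;
	void *failed;

	id = callb_add(my_handler, arg, CB_CL_CPR_DAEMON, "my_handler");

	/* Typically run from a different context, e.g. suspend/checkpoint. */
	failed = callb_execute_class(CB_CL_CPR_DAEMON, CB_CODE_CPR_CHKPT);
	if (failed != NULL)
		cmn_err(CE_WARN, "callback %s failed", (char *)failed);

	(void) callb_delete(id);
}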
+
+/*
+ * Callers must make sure there are no recursive entries to this function.
+ * cp->cc_lockp is registered by callb_add to protect the callb_cpr_t structure.
+ *
+ * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we
+ * use a cv_timedwait() in case the kernel thread is blocked.
+ *
+ * Note that this is a generic callback handler for daemon CPR and
+ * should NOT be changed to accommodate any specific requirement in a daemon.
+ * Individual daemons that require changes to the handler shall write
+ * callback routines in their own daemon modules.
+ */
+boolean_t
+callb_generic_cpr(void *arg, int code)
+{
+ callb_cpr_t *cp = (callb_cpr_t *)arg;
+ clock_t ret = 0; /* assume success */
+
+ mutex_enter(cp->cc_lockp);
+
+ switch (code) {
+ case CB_CODE_CPR_CHKPT:
+ cp->cc_events |= CALLB_CPR_START;
+#ifdef CPR_NOT_THREAD_SAFE
+ while (!(cp->cc_events & CALLB_CPR_SAFE))
+ /* cv_timedwait() returns -1 if it times out. */
+ if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
+ cp->cc_lockp, (callb_timeout_sec * hz),
+ TR_CLOCK_TICK)) == -1)
+ break;
+#endif
+ break;
+
+ case CB_CODE_CPR_RESUME:
+ cp->cc_events &= ~CALLB_CPR_START;
+ cv_signal(&cp->cc_stop_cv);
+ break;
+ }
+ mutex_exit(cp->cc_lockp);
+ return (ret != -1);
+}
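callb_generic_cpr() is rarely called directly; long-running kernel threads reach it through the CALLB_CPR_* macros in sys/callb.h, which register it via callb_add() and toggle the CALLB_CPR_SAFE state around sleep points. A condensed sketch of that pattern (the thread body, lock, and names are illustrative):

static void
my_daemon_thread(void *arg)
{
	callb_cpr_t cpr;
	kmutex_t lock;
	kcondvar_t cv;

	mutex_init(&lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&cv, NULL, CV_DEFAULT, NULL);

	/* Registers callb_generic_cpr for this thread. */
	CALLB_CPR_INIT(&cpr, &lock, callb_generic_cpr, "my_daemon");

	mutex_enter(&lock);
	for (;;) {
		/* Safe to checkpoint while waiting for work. */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		cv_wait(&cv, &lock);
		CALLB_CPR_SAFE_END(&cpr, &lock);

		/* ... handle the wakeup with the lock held ... */
		break;
	}
	/* Unregisters the callback and drops the lock. */
	CALLB_CPR_EXIT(&cpr);

	cv_destroy(&cv);
	mutex_destroy(&lock);
}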
+
+/*
+ * The generic callback function associated with kernel threads which
+ * are always considered safe.
+ */
+/* ARGSUSED */
+boolean_t
+callb_generic_cpr_safe(void *arg, int code)
+{
+ return (B_TRUE);
+}
+/*
+ * Prevent additions to callback table.
+ */
+void
+callb_lock_table(void)
+{
+ mutex_enter(&ct->ct_lock);
+ ASSERT(ct->ct_busy == 0);
+ ct->ct_busy = 1;
+ mutex_exit(&ct->ct_lock);
+}
+
+/*
+ * Allow additions to callback table.
+ */
+void
+callb_unlock_table(void)
+{
+ mutex_enter(&ct->ct_lock);
+ ASSERT(ct->ct_busy != 0);
+ ct->ct_busy = 0;
+ cv_broadcast(&ct->ct_busy_cv);
+ mutex_exit(&ct->ct_lock);
+}
+
+SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
+SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/list.c b/sys/contrib/openzfs/module/os/freebsd/spl/list.c
new file mode 100644
index 000000000000..0f5ae629126c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/list.c
@@ -0,0 +1,244 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Generic doubly-linked list implementation
+ */
+
+#include <sys/param.h>
+#include <sys/list.h>
+#include <sys/list_impl.h>
+#include <sys/types.h>
+#include <sys/debug.h>
+
+#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset))
+#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset))
+#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head)
+
+#define list_insert_after_node(list, node, object) { \
+ list_node_t *lnew = list_d2l(list, object); \
+ lnew->list_prev = (node); \
+ lnew->list_next = (node)->list_next; \
+ (node)->list_next->list_prev = lnew; \
+ (node)->list_next = lnew; \
+}
+
+#define list_insert_before_node(list, node, object) { \
+ list_node_t *lnew = list_d2l(list, object); \
+ lnew->list_next = (node); \
+ lnew->list_prev = (node)->list_prev; \
+ (node)->list_prev->list_next = lnew; \
+ (node)->list_prev = lnew; \
+}
+
+#define list_remove_node(node) \
+ (node)->list_prev->list_next = (node)->list_next; \
+ (node)->list_next->list_prev = (node)->list_prev; \
+ (node)->list_next = (node)->list_prev = NULL
+
+void
+list_create(list_t *list, size_t size, size_t offset)
+{
+ ASSERT(list);
+ ASSERT(size > 0);
+ ASSERT(size >= offset + sizeof (list_node_t));
+
+ list->list_size = size;
+ list->list_offset = offset;
+ list->list_head.list_next = list->list_head.list_prev =
+ &list->list_head;
+}
+
+void
+list_destroy(list_t *list)
+{
+ list_node_t *node = &list->list_head;
+
+ ASSERT(list);
+ ASSERT(list->list_head.list_next == node);
+ ASSERT(list->list_head.list_prev == node);
+
+ node->list_next = node->list_prev = NULL;
+}
+
+void
+list_insert_after(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL) {
+ list_insert_head(list, nobject);
+ } else {
+ list_node_t *lold = list_d2l(list, object);
+ list_insert_after_node(list, lold, nobject);
+ }
+}
+
+void
+list_insert_before(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL) {
+ list_insert_tail(list, nobject);
+ } else {
+ list_node_t *lold = list_d2l(list, object);
+ list_insert_before_node(list, lold, nobject);
+ }
+}
+
+void
+list_insert_head(list_t *list, void *object)
+{
+ list_node_t *lold = &list->list_head;
+ list_insert_after_node(list, lold, object);
+}
+
+void
+list_insert_tail(list_t *list, void *object)
+{
+ list_node_t *lold = &list->list_head;
+ list_insert_before_node(list, lold, object);
+}
+
+void
+list_remove(list_t *list, void *object)
+{
+ list_node_t *lold = list_d2l(list, object);
+ ASSERT(!list_empty(list));
+ ASSERT(lold->list_next != NULL);
+ list_remove_node(lold);
+}
+
+void *
+list_remove_head(list_t *list)
+{
+ list_node_t *head = list->list_head.list_next;
+ if (head == &list->list_head)
+ return (NULL);
+ list_remove_node(head);
+ return (list_object(list, head));
+}
+
+void *
+list_remove_tail(list_t *list)
+{
+ list_node_t *tail = list->list_head.list_prev;
+ if (tail == &list->list_head)
+ return (NULL);
+ list_remove_node(tail);
+ return (list_object(list, tail));
+}
+
+void *
+list_head(list_t *list)
+{
+ if (list_empty(list))
+ return (NULL);
+ return (list_object(list, list->list_head.list_next));
+}
+
+void *
+list_tail(list_t *list)
+{
+ if (list_empty(list))
+ return (NULL);
+ return (list_object(list, list->list_head.list_prev));
+}
+
+void *
+list_next(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->list_next != &list->list_head)
+ return (list_object(list, node->list_next));
+
+ return (NULL);
+}
+
+void *
+list_prev(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->list_prev != &list->list_head)
+ return (list_object(list, node->list_prev));
+
+ return (NULL);
+}
+
+/*
+ * Append the src list to the tail of the dst list, leaving src empty.
+ */
+void
+list_move_tail(list_t *dst, list_t *src)
+{
+ list_node_t *dstnode = &dst->list_head;
+ list_node_t *srcnode = &src->list_head;
+
+ ASSERT(dst->list_size == src->list_size);
+ ASSERT(dst->list_offset == src->list_offset);
+
+ if (list_empty(src))
+ return;
+
+ dstnode->list_prev->list_next = srcnode->list_next;
+ srcnode->list_next->list_prev = dstnode->list_prev;
+ dstnode->list_prev = srcnode->list_prev;
+ srcnode->list_prev->list_next = dstnode;
+
+ /* empty src list */
+ srcnode->list_next = srcnode->list_prev = srcnode;
+}
+
+void
+list_link_replace(list_node_t *lold, list_node_t *lnew)
+{
+ ASSERT(list_link_active(lold));
+ ASSERT(!list_link_active(lnew));
+
+ lnew->list_next = lold->list_next;
+ lnew->list_prev = lold->list_prev;
+ lold->list_prev->list_next = lnew;
+ lold->list_next->list_prev = lnew;
+ lold->list_next = lold->list_prev = NULL;
+}
+
+void
+list_link_init(list_node_t *link)
+{
+ link->list_next = NULL;
+ link->list_prev = NULL;
+}
+
+int
+list_link_active(list_node_t *link)
+{
+ EQUIV(link->list_next == NULL, link->list_prev == NULL);
+ return (link->list_next != NULL);
+}
+
+int
+list_is_empty(list_t *list)
+{
+ return (list_empty(list));
+}
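Editorial note, not part of the patch: the list functions above implement the usual illumos-style intrusive list, where each element embeds a list_node_t and list_create() records its offset. A minimal user-space sketch of how callers typically use this API; the element type my_elem_t and its fields are hypothetical, and the list.h declarations from this change are assumed to be in scope.

#include <stddef.h>	/* offsetof */

typedef struct my_elem {
	int		me_value;	/* hypothetical payload */
	list_node_t	me_node;	/* embedded linkage */
} my_elem_t;

static void
example(void)
{
	list_t l;
	my_elem_t a, b, *e;

	/* Size of the element plus the offset of its embedded node. */
	list_create(&l, sizeof (my_elem_t), offsetof(my_elem_t, me_node));
	list_link_init(&a.me_node);
	list_link_init(&b.me_node);

	list_insert_tail(&l, &a);
	list_insert_tail(&l, &b);

	/* Walk the list head to tail. */
	for (e = list_head(&l); e != NULL; e = list_next(&l, e))
		(void) e->me_value;

	/* The list must be emptied before list_destroy(). */
	while ((e = list_remove_head(&l)) != NULL)
		;
	list_destroy(&l);
}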
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha224.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha224.h
new file mode 100644
index 000000000000..0abd43068708
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha224.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA224_H_
+#define _SHA224_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA224_BLOCK_LENGTH 64
+#define SHA224_DIGEST_LENGTH 28
+#define SHA224_DIGEST_STRING_LENGTH (SHA224_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA224Context {
+ uint32_t state[8];
+ uint64_t count;
+ uint8_t buf[SHA224_BLOCK_LENGTH];
+} SHA224_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+
+#ifndef SHA224_Init
+#define SHA224_Init _libmd_SHA224_Init
+#endif
+#ifndef SHA224_Update
+#define SHA224_Update _libmd_SHA224_Update
+#endif
+#ifndef SHA224_Final
+#define SHA224_Final _libmd_SHA224_Final
+#endif
+#ifndef SHA224_End
+#define SHA224_End _libmd_SHA224_End
+#endif
+#ifndef SHA224_Fd
+#define SHA224_Fd _libmd_SHA224_Fd
+#endif
+#ifndef SHA224_FdChunk
+#define SHA224_FdChunk _libmd_SHA224_FdChunk
+#endif
+#ifndef SHA224_File
+#define SHA224_File _libmd_SHA224_File
+#endif
+#ifndef SHA224_FileChunk
+#define SHA224_FileChunk _libmd_SHA224_FileChunk
+#endif
+#ifndef SHA224_Data
+#define SHA224_Data _libmd_SHA224_Data
+#endif
+
+#ifndef SHA224_version
+#define SHA224_version _libmd_SHA224_version
+#endif
+
+void SHA224_Init(SHA224_CTX *);
+void SHA224_Update(SHA224_CTX *, const void *, size_t);
+void SHA224_Final(unsigned char [__min_size(SHA224_DIGEST_LENGTH)],
+ SHA224_CTX *);
+#ifndef _KERNEL
+char *SHA224_End(SHA224_CTX *, char *);
+char *SHA224_Data(const void *, unsigned int, char *);
+char *SHA224_Fd(int, char *);
+char *SHA224_FdChunk(int, char *, off_t, off_t);
+char *SHA224_File(const char *, char *);
+char *SHA224_FileChunk(const char *, char *, off_t, off_t);
+#endif
+__END_DECLS
+
+#endif /* !_SHA224_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha256.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha256.h
new file mode 100644
index 000000000000..193c0c025120
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha256.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA256_H_
+#define _SHA256_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA256_BLOCK_LENGTH 64
+#define SHA256_DIGEST_LENGTH 32
+#define SHA256_DIGEST_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA256Context {
+ uint32_t state[8];
+ uint64_t count;
+ uint8_t buf[SHA256_BLOCK_LENGTH];
+} SHA256_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+
+#ifndef SHA256_Init
+#define SHA256_Init _libmd_SHA256_Init
+#endif
+#ifndef SHA256_Update
+#define SHA256_Update _libmd_SHA256_Update
+#endif
+#ifndef SHA256_Final
+#define SHA256_Final _libmd_SHA256_Final
+#endif
+#ifndef SHA256_End
+#define SHA256_End _libmd_SHA256_End
+#endif
+#ifndef SHA256_Fd
+#define SHA256_Fd _libmd_SHA256_Fd
+#endif
+#ifndef SHA256_FdChunk
+#define SHA256_FdChunk _libmd_SHA256_FdChunk
+#endif
+#ifndef SHA256_File
+#define SHA256_File _libmd_SHA256_File
+#endif
+#ifndef SHA256_FileChunk
+#define SHA256_FileChunk _libmd_SHA256_FileChunk
+#endif
+#ifndef SHA256_Data
+#define SHA256_Data _libmd_SHA256_Data
+#endif
+
+#ifndef SHA256_Transform
+#define SHA256_Transform _libmd_SHA256_Transform
+#endif
+#ifndef SHA256_version
+#define SHA256_version _libmd_SHA256_version
+#endif
+
+void SHA256_Init(SHA256_CTX *);
+void SHA256_Update(SHA256_CTX *, const void *, size_t);
+void SHA256_Final(unsigned char [__min_size(SHA256_DIGEST_LENGTH)],
+ SHA256_CTX *);
+#ifndef _KERNEL
+char *SHA256_End(SHA256_CTX *, char *);
+char *SHA256_Data(const void *, unsigned int, char *);
+char *SHA256_Fd(int, char *);
+char *SHA256_FdChunk(int, char *, off_t, off_t);
+char *SHA256_File(const char *, char *);
+char *SHA256_FileChunk(const char *, char *, off_t, off_t);
+#endif
+__END_DECLS
+
+#endif /* !_SHA256_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha256c.c b/sys/contrib/openzfs/module/os/freebsd/spl/sha256c.c
new file mode 100644
index 000000000000..241cf8c9ae76
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha256c.c
@@ -0,0 +1,378 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+
+#include <sys/byteorder.h>
+#include <sys/endian.h>
+#include "sha224.h"
+#include "sha256.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+/* Copy a vector of big-endian uint32_t into a vector of bytes */
+#define be32enc_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+/* Copy a vector of bytes into a vector of big-endian uint32_t */
+#define be32dec_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+#else /* BYTE_ORDER != BIG_ENDIAN */
+
+/*
+ * Encode a length len/4 vector of (uint32_t) into a length len vector of
+ * (unsigned char) in big-endian form. Assumes len is a multiple of 4.
+ */
+static void
+be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 4; i++)
+ be32enc(dst + i * 4, src[i]);
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/4 vector of (uint32_t). Assumes len is a multiple of 4.
+ */
+static void
+be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 4; i++)
+ dst[i] = be32dec(src + i * 4);
+}
+
+#endif /* BYTE_ORDER != BIG_ENDIAN */
+
+/* SHA256 round constants. */
+static const uint32_t K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+/* Elementary functions used by SHA256 */
+#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z) ((x & (y | z)) | (y & z))
+#define SHR(x, n) (x >> n)
+#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
+#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/* SHA256 round function */
+#define RND(a, b, c, d, e, f, g, h, k) \
+ h += S1(e) + Ch(e, f, g) + k; \
+ d += h; \
+ h += S0(a) + Maj(a, b, c);
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i, ii) \
+ RND(S[(64 - i) % 8], S[(65 - i) % 8], \
+ S[(66 - i) % 8], S[(67 - i) % 8], \
+ S[(68 - i) % 8], S[(69 - i) % 8], \
+ S[(70 - i) % 8], S[(71 - i) % 8], \
+ W[i + ii] + K[i + ii])
+
+/* Message schedule computation */
+#define MSCH(W, ii, i) \
+ W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \
+ s0(W[i + ii + 1]) + W[i + ii]
+
+/*
+ * SHA256 block compression function. The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA256_Transform(uint32_t *state, const unsigned char block[64])
+{
+ uint32_t W[64];
+ uint32_t S[8];
+ int i;
+
+ /* 1. Prepare the first part of the message schedule W. */
+ be32dec_vect(W, block, 64);
+
+ /* 2. Initialize working variables. */
+ memcpy(S, state, 32);
+
+ /* 3. Mix. */
+ for (i = 0; i < 64; i += 16) {
+ RNDr(S, W, 0, i);
+ RNDr(S, W, 1, i);
+ RNDr(S, W, 2, i);
+ RNDr(S, W, 3, i);
+ RNDr(S, W, 4, i);
+ RNDr(S, W, 5, i);
+ RNDr(S, W, 6, i);
+ RNDr(S, W, 7, i);
+ RNDr(S, W, 8, i);
+ RNDr(S, W, 9, i);
+ RNDr(S, W, 10, i);
+ RNDr(S, W, 11, i);
+ RNDr(S, W, 12, i);
+ RNDr(S, W, 13, i);
+ RNDr(S, W, 14, i);
+ RNDr(S, W, 15, i);
+
+ if (i == 48)
+ break;
+ MSCH(W, 0, i);
+ MSCH(W, 1, i);
+ MSCH(W, 2, i);
+ MSCH(W, 3, i);
+ MSCH(W, 4, i);
+ MSCH(W, 5, i);
+ MSCH(W, 6, i);
+ MSCH(W, 7, i);
+ MSCH(W, 8, i);
+ MSCH(W, 9, i);
+ MSCH(W, 10, i);
+ MSCH(W, 11, i);
+ MSCH(W, 12, i);
+ MSCH(W, 13, i);
+ MSCH(W, 14, i);
+ MSCH(W, 15, i);
+ }
+
+ /* 4. Mix local working variables into global state */
+ for (i = 0; i < 8; i++)
+ state[i] += S[i];
+}
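Editorial note, not part of the patch: the RNDr macro above avoids shuffling eight working variables every round by rotating which slot of S[] plays the role of a..h. A small stand-alone C sketch that prints this mapping for the first few rounds, assuming nothing beyond standard C:

#include <stdio.h>

int
main(void)
{
	/*
	 * For round i, RNDr assigns role v (0 = 'a' ... 7 = 'h') to
	 * S[(64 - i + v) % 8]; each round the whole assignment rotates
	 * by one slot instead of copying h = g, g = f, and so on.
	 */
	for (int i = 0; i < 4; i++) {
		printf("round %d:", i);
		for (int v = 0; v < 8; v++)
			printf(" %c=S[%d]", 'a' + v, (64 - i + v) % 8);
		printf("\n");
	}
	return (0);
}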
+
+static unsigned char PAD[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA256_Pad(SHA256_CTX * ctx)
+{
+ size_t r;
+
+ /* Figure out how many bytes we have buffered. */
+ r = (ctx->count >> 3) & 0x3f;
+
+ /* Pad to 56 mod 64, transforming if we finish a block en route. */
+ if (r < 56) {
+ /* Pad to 56 mod 64. */
+ memcpy(&ctx->buf[r], PAD, 56 - r);
+ } else {
+ /* Finish the current block and mix. */
+ memcpy(&ctx->buf[r], PAD, 64 - r);
+ SHA256_Transform(ctx->state, ctx->buf);
+
+ /* The start of the final block is all zeroes. */
+ memset(&ctx->buf[0], 0, 56);
+ }
+
+ /* Add the terminating bit-count. */
+ be64enc(&ctx->buf[56], ctx->count);
+
+ /* Mix in the final block. */
+ SHA256_Transform(ctx->state, ctx->buf);
+}
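Editorial note, not part of the patch: SHA256_Pad() above appends the 0x80 marker, zero bytes up to 56 mod 64, and the original bit-count as a 64-bit big-endian value, so the padded length is always a multiple of 64 bytes. A stand-alone sketch of that length arithmetic only, using plain C:

#include <stdio.h>
#include <stdint.h>

/* Total length after padding a message of 'len' bytes, per SHA256_Pad(). */
static uint64_t
padded_len(uint64_t len)
{
	uint64_t r = len % 64;

	/* 0x80 plus zeros to 56 mod 64, then the 8-byte bit count. */
	return (len + (r < 56 ? 56 - r : 120 - r) + 8);
}

int
main(void)
{
	printf("%llu %llu %llu\n",
	    (unsigned long long)padded_len(0),		/* 64 */
	    (unsigned long long)padded_len(55),		/* 64 */
	    (unsigned long long)padded_len(56));	/* 128 */
	return (0);
}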
+
+/* SHA-256 initialization. Begins a SHA-256 operation. */
+void
+SHA256_Init(SHA256_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x6A09E667;
+ ctx->state[1] = 0xBB67AE85;
+ ctx->state[2] = 0x3C6EF372;
+ ctx->state[3] = 0xA54FF53A;
+ ctx->state[4] = 0x510E527F;
+ ctx->state[5] = 0x9B05688C;
+ ctx->state[6] = 0x1F83D9AB;
+ ctx->state[7] = 0x5BE0CD19;
+}
+
+/* Add bytes into the hash */
+void
+SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len)
+{
+ uint64_t bitlen;
+ uint32_t r;
+ const unsigned char *src = in;
+
+ /* Number of bytes left in the buffer from previous updates */
+ r = (ctx->count >> 3) & 0x3f;
+
+ /* Convert the length into a number of bits */
+ bitlen = len << 3;
+
+ /* Update number of bits */
+ ctx->count += bitlen;
+
+ /* Handle the case where we don't need to perform any transforms */
+ if (len < 64 - r) {
+ memcpy(&ctx->buf[r], src, len);
+ return;
+ }
+
+ /* Finish the current block */
+ memcpy(&ctx->buf[r], src, 64 - r);
+ SHA256_Transform(ctx->state, ctx->buf);
+ src += 64 - r;
+ len -= 64 - r;
+
+ /* Perform complete blocks */
+ while (len >= 64) {
+ SHA256_Transform(ctx->state, src);
+ src += 64;
+ len -= 64;
+ }
+
+ /* Copy left over data into buffer */
+ memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-256 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA256_Final(unsigned char digest[static SHA256_DIGEST_LENGTH], SHA256_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA256_Pad(ctx);
+
+ /* Write the hash */
+ be32enc_vect(digest, ctx->state, SHA256_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-224: ******************************************************* */
+/*
+ * the SHA224 and SHA256 transforms are identical
+ */
+
+/* SHA-224 initialization. Begins a SHA-224 operation. */
+void
+SHA224_Init(SHA224_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0xC1059ED8;
+ ctx->state[1] = 0x367CD507;
+ ctx->state[2] = 0x3070DD17;
+ ctx->state[3] = 0xF70E5939;
+ ctx->state[4] = 0xFFC00B31;
+ ctx->state[5] = 0x68581511;
+ ctx->state[6] = 0x64F98FA7;
+ ctx->state[7] = 0xBEFA4FA4;
+}
+
+/* Add bytes into the SHA-224 hash */
+void
+SHA224_Update(SHA224_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA256_Update((SHA256_CTX *)ctx, in, len);
+}
+
+/*
+ * SHA-224 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA224_Final(unsigned char digest[static SHA224_DIGEST_LENGTH], SHA224_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA256_Pad((SHA256_CTX *)ctx);
+
+ /* Write the hash */
+ be32enc_vect(digest, ctx->state, SHA224_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+#ifdef WEAK_REFS
+/*
+ * When building libmd, provide weak references. Note: this is not
+ * activated in the context of compiling these sources for internal
+ * use in libcrypt.
+ */
+#undef SHA256_Init
+__weak_reference(_libmd_SHA256_Init, SHA256_Init);
+#undef SHA256_Update
+__weak_reference(_libmd_SHA256_Update, SHA256_Update);
+#undef SHA256_Final
+__weak_reference(_libmd_SHA256_Final, SHA256_Final);
+#undef SHA256_Transform
+__weak_reference(_libmd_SHA256_Transform, SHA256_Transform);
+
+#undef SHA224_Init
+__weak_reference(_libmd_SHA224_Init, SHA224_Init);
+#undef SHA224_Update
+__weak_reference(_libmd_SHA224_Update, SHA224_Update);
+#undef SHA224_Final
+__weak_reference(_libmd_SHA224_Final, SHA224_Final);
+#endif
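Editorial note, not part of the patch: the headers and implementation above expose the usual Init/Update/Final streaming interface. A minimal user-space sketch of hashing a message in two chunks, assuming sha256.h and sha256c.c from this change are compiled into the program:

#include <stdio.h>
#include <string.h>

#include "sha256.h"

int
main(void)
{
	SHA256_CTX ctx;
	unsigned char digest[SHA256_DIGEST_LENGTH];
	const char *msg = "hello, world";

	SHA256_Init(&ctx);
	/* Update() may be called any number of times with partial input. */
	SHA256_Update(&ctx, msg, 5);
	SHA256_Update(&ctx, msg + 5, strlen(msg) - 5);
	SHA256_Final(digest, &ctx);	/* also zeroes the context */

	for (int i = 0; i < SHA256_DIGEST_LENGTH; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return (0);
}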
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha384.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha384.h
new file mode 100644
index 000000000000..67250cee0313
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha384.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA384_H_
+#define _SHA384_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA384_BLOCK_LENGTH 128
+#define SHA384_DIGEST_LENGTH 48
+#define SHA384_DIGEST_STRING_LENGTH (SHA384_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA384Context {
+ uint64_t state[8];
+ uint64_t count[2];
+ uint8_t buf[SHA384_BLOCK_LENGTH];
+} SHA384_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#ifndef SHA384_Init
+#define SHA384_Init _libmd_SHA384_Init
+#endif
+#ifndef SHA384_Update
+#define SHA384_Update _libmd_SHA384_Update
+#endif
+#ifndef SHA384_Final
+#define SHA384_Final _libmd_SHA384_Final
+#endif
+#ifndef SHA384_End
+#define SHA384_End _libmd_SHA384_End
+#endif
+#ifndef SHA384_Fd
+#define SHA384_Fd _libmd_SHA384_Fd
+#endif
+#ifndef SHA384_FdChunk
+#define SHA384_FdChunk _libmd_SHA384_FdChunk
+#endif
+#ifndef SHA384_File
+#define SHA384_File _libmd_SHA384_File
+#endif
+#ifndef SHA384_FileChunk
+#define SHA384_FileChunk _libmd_SHA384_FileChunk
+#endif
+#ifndef SHA384_Data
+#define SHA384_Data _libmd_SHA384_Data
+#endif
+
+#ifndef SHA384_version
+#define SHA384_version _libmd_SHA384_version
+#endif
+
+void SHA384_Init(SHA384_CTX *);
+void SHA384_Update(SHA384_CTX *, const void *, size_t);
+void SHA384_Final(unsigned char [__min_size(SHA384_DIGEST_LENGTH)],
+ SHA384_CTX *);
+#ifndef _KERNEL
+char *SHA384_End(SHA384_CTX *, char *);
+char *SHA384_Data(const void *, unsigned int, char *);
+char *SHA384_Fd(int, char *);
+char *SHA384_FdChunk(int, char *, off_t, off_t);
+char *SHA384_File(const char *, char *);
+char *SHA384_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA384_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha512.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha512.h
new file mode 100644
index 000000000000..b6fb733ca54e
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha512.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA512_H_
+#define _SHA512_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA512_BLOCK_LENGTH 128
+#define SHA512_DIGEST_LENGTH 64
+#define SHA512_DIGEST_STRING_LENGTH (SHA512_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA512Context {
+ uint64_t state[8];
+ uint64_t count[2];
+ uint8_t buf[SHA512_BLOCK_LENGTH];
+} SHA512_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#if 0
+#ifndef SHA512_Init
+#define SHA512_Init _libmd_SHA512_Init
+#endif
+#ifndef SHA512_Update
+#define SHA512_Update _libmd_SHA512_Update
+#endif
+#ifndef SHA512_Final
+#define SHA512_Final _libmd_SHA512_Final
+#endif
+#endif
+#ifndef SHA512_End
+#define SHA512_End _libmd_SHA512_End
+#endif
+#ifndef SHA512_Fd
+#define SHA512_Fd _libmd_SHA512_Fd
+#endif
+#ifndef SHA512_FdChunk
+#define SHA512_FdChunk _libmd_SHA512_FdChunk
+#endif
+#ifndef SHA512_File
+#define SHA512_File _libmd_SHA512_File
+#endif
+#ifndef SHA512_FileChunk
+#define SHA512_FileChunk _libmd_SHA512_FileChunk
+#endif
+#ifndef SHA512_Data
+#define SHA512_Data _libmd_SHA512_Data
+#endif
+
+#ifndef SHA512_Transform
+#define SHA512_Transform _libmd_SHA512_Transform
+#endif
+#ifndef SHA512_version
+#define SHA512_version _libmd_SHA512_version
+#endif
+
+void SHA512_Init(SHA512_CTX *);
+void SHA512_Update(SHA512_CTX *, const void *, size_t);
+void SHA512_Final(unsigned char [__min_size(SHA512_DIGEST_LENGTH)],
+ SHA512_CTX *);
+#ifndef _KERNEL
+char *SHA512_End(SHA512_CTX *, char *);
+char *SHA512_Data(const void *, unsigned int, char *);
+char *SHA512_Fd(int, char *);
+char *SHA512_FdChunk(int, char *, off_t, off_t);
+char *SHA512_File(const char *, char *);
+char *SHA512_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA512_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha512c.c b/sys/contrib/openzfs/module/os/freebsd/spl/sha512c.c
new file mode 100644
index 000000000000..146f338f0ed4
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha512c.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright 2005 Colin Percival
+ * Copyright (c) 2015 Allan Jude <allanjude@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/endian.h>
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include "sha512.h"
+#include "sha512t.h"
+#include "sha384.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+/* Copy a vector of big-endian uint64_t into a vector of bytes */
+#define be64enc_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+/* Copy a vector of bytes into a vector of big-endian uint64_t */
+#define be64dec_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+#else /* BYTE_ORDER != BIG_ENDIAN */
+
+/*
+ * Encode a length len/8 vector of (uint64_t) into a length len vector of
+ * (unsigned char) in big-endian form. Assumes len is a multiple of 8.
+ */
+static void
+be64enc_vect(unsigned char *dst, const uint64_t *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 8; i++)
+ be64enc(dst + i * 8, src[i]);
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/8 vector of (uint64_t). Assumes len is a multiple of 8.
+ */
+static void
+be64dec_vect(uint64_t *dst, const unsigned char *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 8; i++)
+ dst[i] = be64dec(src + i * 8);
+}
+
+#endif /* BYTE_ORDER != BIG_ENDIAN */
+
+/* SHA512 round constants. */
+static const uint64_t K[80] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
+ 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+ 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+ 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
+ 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
+ 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+ 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+ 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
+ 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
+ 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+ 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+ 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
+ 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
+ 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+ 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+ 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
+ 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
+ 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+ 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+ 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
+ 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
+ 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+ 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+ 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
+ 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
+ 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+ 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+/* Elementary functions used by SHA512 */
+#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z) ((x & (y | z)) | (y & z))
+#define SHR(x, n) (x >> n)
+#define ROTR(x, n) ((x >> n) | (x << (64 - n)))
+#define S0(x) (ROTR(x, 28) ^ ROTR(x, 34) ^ ROTR(x, 39))
+#define S1(x) (ROTR(x, 14) ^ ROTR(x, 18) ^ ROTR(x, 41))
+#define s0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7))
+#define s1(x) (ROTR(x, 19) ^ ROTR(x, 61) ^ SHR(x, 6))
+
+/* SHA512 round function */
+#define RND(a, b, c, d, e, f, g, h, k) \
+ h += S1(e) + Ch(e, f, g) + k; \
+ d += h; \
+ h += S0(a) + Maj(a, b, c);
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i, ii) \
+ RND(S[(80 - i) % 8], S[(81 - i) % 8], \
+ S[(82 - i) % 8], S[(83 - i) % 8], \
+ S[(84 - i) % 8], S[(85 - i) % 8], \
+ S[(86 - i) % 8], S[(87 - i) % 8], \
+ W[i + ii] + K[i + ii])
+
+/* Message schedule computation */
+#define MSCH(W, ii, i) \
+ W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \
+ s0(W[i + ii + 1]) + W[i + ii]
+
+/*
+ * SHA512 block compression function. The 512-bit state is transformed via
+ * the 1024-bit input block to produce a new state.
+ */
+static void
+SHA512_Transform(uint64_t *state,
+ const unsigned char block[SHA512_BLOCK_LENGTH])
+{
+ uint64_t W[80];
+ uint64_t S[8];
+ int i;
+
+ /* 1. Prepare the first part of the message schedule W. */
+ be64dec_vect(W, block, SHA512_BLOCK_LENGTH);
+
+ /* 2. Initialize working variables. */
+ memcpy(S, state, SHA512_DIGEST_LENGTH);
+
+ /* 3. Mix. */
+ for (i = 0; i < 80; i += 16) {
+ RNDr(S, W, 0, i);
+ RNDr(S, W, 1, i);
+ RNDr(S, W, 2, i);
+ RNDr(S, W, 3, i);
+ RNDr(S, W, 4, i);
+ RNDr(S, W, 5, i);
+ RNDr(S, W, 6, i);
+ RNDr(S, W, 7, i);
+ RNDr(S, W, 8, i);
+ RNDr(S, W, 9, i);
+ RNDr(S, W, 10, i);
+ RNDr(S, W, 11, i);
+ RNDr(S, W, 12, i);
+ RNDr(S, W, 13, i);
+ RNDr(S, W, 14, i);
+ RNDr(S, W, 15, i);
+
+ if (i == 64)
+ break;
+ MSCH(W, 0, i);
+ MSCH(W, 1, i);
+ MSCH(W, 2, i);
+ MSCH(W, 3, i);
+ MSCH(W, 4, i);
+ MSCH(W, 5, i);
+ MSCH(W, 6, i);
+ MSCH(W, 7, i);
+ MSCH(W, 8, i);
+ MSCH(W, 9, i);
+ MSCH(W, 10, i);
+ MSCH(W, 11, i);
+ MSCH(W, 12, i);
+ MSCH(W, 13, i);
+ MSCH(W, 14, i);
+ MSCH(W, 15, i);
+ }
+
+ /* 4. Mix local working variables into global state */
+ for (i = 0; i < 8; i++)
+ state[i] += S[i];
+}
+
+static unsigned char PAD[SHA512_BLOCK_LENGTH] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA512_Pad(SHA512_CTX * ctx)
+{
+ size_t r;
+
+ /* Figure out how many bytes we have buffered. */
+ r = (ctx->count[1] >> 3) & 0x7f;
+
+ /* Pad to 112 mod 128, transforming if we finish a block en route. */
+ if (r < 112) {
+ /* Pad to 112 mod 128. */
+ memcpy(&ctx->buf[r], PAD, 112 - r);
+ } else {
+ /* Finish the current block and mix. */
+ memcpy(&ctx->buf[r], PAD, 128 - r);
+ SHA512_Transform(ctx->state, ctx->buf);
+
+ /* The start of the final block is all zeroes. */
+ memset(&ctx->buf[0], 0, 112);
+ }
+
+ /* Add the terminating bit-count. */
+ be64enc_vect(&ctx->buf[112], ctx->count, 16);
+
+ /* Mix in the final block. */
+ SHA512_Transform(ctx->state, ctx->buf);
+}
+
+/* SHA-512 initialization. Begins a SHA-512 operation. */
+void
+SHA512_Init(SHA512_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x6a09e667f3bcc908ULL;
+ ctx->state[1] = 0xbb67ae8584caa73bULL;
+ ctx->state[2] = 0x3c6ef372fe94f82bULL;
+ ctx->state[3] = 0xa54ff53a5f1d36f1ULL;
+ ctx->state[4] = 0x510e527fade682d1ULL;
+ ctx->state[5] = 0x9b05688c2b3e6c1fULL;
+ ctx->state[6] = 0x1f83d9abfb41bd6bULL;
+ ctx->state[7] = 0x5be0cd19137e2179ULL;
+}
+
+/* Add bytes into the hash */
+void
+SHA512_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+ uint64_t bitlen[2];
+ uint64_t r;
+ const unsigned char *src = in;
+
+ /* Number of bytes left in the buffer from previous updates */
+ r = (ctx->count[1] >> 3) & 0x7f;
+
+ /* Convert the length into a number of bits */
+ bitlen[1] = ((uint64_t)len) << 3;
+ bitlen[0] = ((uint64_t)len) >> 61;
+
+ /* Update number of bits */
+ if ((ctx->count[1] += bitlen[1]) < bitlen[1])
+ ctx->count[0]++;
+ ctx->count[0] += bitlen[0];
+
+ /* Handle the case where we don't need to perform any transforms */
+ if (len < SHA512_BLOCK_LENGTH - r) {
+ memcpy(&ctx->buf[r], src, len);
+ return;
+ }
+
+ /* Finish the current block */
+ memcpy(&ctx->buf[r], src, SHA512_BLOCK_LENGTH - r);
+ SHA512_Transform(ctx->state, ctx->buf);
+ src += SHA512_BLOCK_LENGTH - r;
+ len -= SHA512_BLOCK_LENGTH - r;
+
+ /* Perform complete blocks */
+ while (len >= SHA512_BLOCK_LENGTH) {
+ SHA512_Transform(ctx->state, src);
+ src += SHA512_BLOCK_LENGTH;
+ len -= SHA512_BLOCK_LENGTH;
+ }
+
+ /* Copy left over data into buffer */
+ memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-512 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA512_Final(unsigned char digest[static SHA512_DIGEST_LENGTH], SHA512_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad(ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA512_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-512t: ******************************************************** */
+/*
+ * the SHA512t transforms are identical to SHA512 so reuse the existing function
+ */
+void
+SHA512_224_Init(SHA512_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x8c3d37c819544da2ULL;
+ ctx->state[1] = 0x73e1996689dcd4d6ULL;
+ ctx->state[2] = 0x1dfab7ae32ff9c82ULL;
+ ctx->state[3] = 0x679dd514582f9fcfULL;
+ ctx->state[4] = 0x0f6d2b697bd44da8ULL;
+ ctx->state[5] = 0x77e36f7304c48942ULL;
+ ctx->state[6] = 0x3f9d85a86a1d36c8ULL;
+ ctx->state[7] = 0x1112e6ad91d692a1ULL;
+}
+
+void
+SHA512_224_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA512_Update(ctx, in, len);
+}
+
+void
+SHA512_224_Final(unsigned char digest[static SHA512_224_DIGEST_LENGTH],
+ SHA512_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad(ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA512_224_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+void
+SHA512_256_Init(SHA512_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x22312194fc2bf72cULL;
+ ctx->state[1] = 0x9f555fa3c84c64c2ULL;
+ ctx->state[2] = 0x2393b86b6f53b151ULL;
+ ctx->state[3] = 0x963877195940eabdULL;
+ ctx->state[4] = 0x96283ee2a88effe3ULL;
+ ctx->state[5] = 0xbe5e1e2553863992ULL;
+ ctx->state[6] = 0x2b0199fc2c85b8aaULL;
+ ctx->state[7] = 0x0eb72ddc81c52ca2ULL;
+}
+
+void
+SHA512_256_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA512_Update(ctx, in, len);
+}
+
+void
+SHA512_256_Final(unsigned char digest[static SHA512_256_DIGEST_LENGTH],
+ SHA512_CTX * ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad(ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA512_256_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-384: ********************************************************** */
+/*
+ * the SHA384 and SHA512 transforms are identical, so SHA384 is skipped
+ */
+
+/* SHA-384 initialization. Begins a SHA-384 operation. */
+void
+SHA384_Init(SHA384_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0xcbbb9d5dc1059ed8ULL;
+ ctx->state[1] = 0x629a292a367cd507ULL;
+ ctx->state[2] = 0x9159015a3070dd17ULL;
+ ctx->state[3] = 0x152fecd8f70e5939ULL;
+ ctx->state[4] = 0x67332667ffc00b31ULL;
+ ctx->state[5] = 0x8eb44a8768581511ULL;
+ ctx->state[6] = 0xdb0c2e0d64f98fa7ULL;
+ ctx->state[7] = 0x47b5481dbefa4fa4ULL;
+}
+
+/* Add bytes into the SHA-384 hash */
+void
+SHA384_Update(SHA384_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA512_Update((SHA512_CTX *)ctx, in, len);
+}
+
+/*
+ * SHA-384 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA384_Final(unsigned char digest[static SHA384_DIGEST_LENGTH], SHA384_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad((SHA512_CTX *)ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA384_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+#if 0
+/*
+ * When building libmd, provide weak references. Note: this is not
+ * activated in the context of compiling these sources for internal
+ * use in libcrypt.
+ */
+#undef SHA512_Init
+__weak_reference(_libmd_SHA512_Init, SHA512_Init);
+#undef SHA512_Update
+__weak_reference(_libmd_SHA512_Update, SHA512_Update);
+#undef SHA512_Final
+__weak_reference(_libmd_SHA512_Final, SHA512_Final);
+#undef SHA512_Transform
+__weak_reference(_libmd_SHA512_Transform, SHA512_Transform);
+
+#undef SHA512_224_Init
+__weak_reference(_libmd_SHA512_224_Init, SHA512_224_Init);
+#undef SHA512_224_Update
+__weak_reference(_libmd_SHA512_224_Update, SHA512_224_Update);
+#undef SHA512_224_Final
+__weak_reference(_libmd_SHA512_224_Final, SHA512_224_Final);
+
+#undef SHA512_256_Init
+__weak_reference(_libmd_SHA512_256_Init, SHA512_256_Init);
+#undef SHA512_256_Update
+__weak_reference(_libmd_SHA512_256_Update, SHA512_256_Update);
+#undef SHA512_256_Final
+__weak_reference(_libmd_SHA512_256_Final, SHA512_256_Final);
+
+#undef SHA384_Init
+__weak_reference(_libmd_SHA384_Init, SHA384_Init);
+#undef SHA384_Update
+__weak_reference(_libmd_SHA384_Update, SHA384_Update);
+#undef SHA384_Final
+__weak_reference(_libmd_SHA384_Final, SHA384_Final);
+#endif
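Editorial note, not part of the patch: SHA-512/224, SHA-512/256 and SHA-384 above reuse the SHA-512 compression function; they differ only in their initialization constants and in how many bytes of the final state are written out. A user-space sketch of computing a SHA-512/256 digest, assuming sha512t.h and sha512c.c from this change are available to the build:

#include <stdio.h>
#include <string.h>

#include "sha512t.h"

int
main(void)
{
	SHA512_CTX ctx;		/* the truncated variants share SHA512_CTX */
	unsigned char digest[SHA512_256_DIGEST_LENGTH];
	const char *msg = "hello, world";

	SHA512_256_Init(&ctx);
	SHA512_256_Update(&ctx, msg, strlen(msg));
	SHA512_256_Final(digest, &ctx);	/* emits only the first 32 bytes */

	for (int i = 0; i < SHA512_256_DIGEST_LENGTH; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return (0);
}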
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha512t.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha512t.h
new file mode 100644
index 000000000000..703867fc0288
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha512t.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 Allan Jude <allanjude@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA512T_H_
+#define _SHA512T_H_
+
+#include "sha512.h"
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA512_224_DIGEST_LENGTH 28
+#define SHA512_224_DIGEST_STRING_LENGTH (SHA512_224_DIGEST_LENGTH * 2 + 1)
+#define SHA512_256_DIGEST_LENGTH 32
+#define SHA512_256_DIGEST_STRING_LENGTH (SHA512_256_DIGEST_LENGTH * 2 + 1)
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#ifndef SHA512_224_Init
+#define SHA512_224_Init _libmd_SHA512_224_Init
+#endif
+#ifndef SHA512_224_Update
+#define SHA512_224_Update _libmd_SHA512_224_Update
+#endif
+#ifndef SHA512_224_Final
+#define SHA512_224_Final _libmd_SHA512_224_Final
+#endif
+#ifndef SHA512_224_End
+#define SHA512_224_End _libmd_SHA512_224_End
+#endif
+#ifndef SHA512_224_Fd
+#define SHA512_224_Fd _libmd_SHA512_224_Fd
+#endif
+#ifndef SHA512_224_FdChunk
+#define SHA512_224_FdChunk _libmd_SHA512_224_FdChunk
+#endif
+#ifndef SHA512_224_File
+#define SHA512_224_File _libmd_SHA512_224_File
+#endif
+#ifndef SHA512_224_FileChunk
+#define SHA512_224_FileChunk _libmd_SHA512_224_FileChunk
+#endif
+#ifndef SHA512_224_Data
+#define SHA512_224_Data _libmd_SHA512_224_Data
+#endif
+
+#ifndef SHA512_224_Transform
+#define SHA512_224_Transform _libmd_SHA512_224_Transform
+#endif
+#ifndef SHA512_224_version
+#define SHA512_224_version _libmd_SHA512_224_version
+#endif
+
+#ifndef SHA512_256_Init
+#define SHA512_256_Init _libmd_SHA512_256_Init
+#endif
+#ifndef SHA512_256_Update
+#define SHA512_256_Update _libmd_SHA512_256_Update
+#endif
+#ifndef SHA512_256_Final
+#define SHA512_256_Final _libmd_SHA512_256_Final
+#endif
+#ifndef SHA512_256_End
+#define SHA512_256_End _libmd_SHA512_256_End
+#endif
+#ifndef SHA512_256_Fd
+#define SHA512_256_Fd _libmd_SHA512_256_Fd
+#endif
+#ifndef SHA512_256_FdChunk
+#define SHA512_256_FdChunk _libmd_SHA512_256_FdChunk
+#endif
+#ifndef SHA512_256_File
+#define SHA512_256_File _libmd_SHA512_256_File
+#endif
+#ifndef SHA512_256_FileChunk
+#define SHA512_256_FileChunk _libmd_SHA512_256_FileChunk
+#endif
+#ifndef SHA512_256_Data
+#define SHA512_256_Data _libmd_SHA512_256_Data
+#endif
+
+#ifndef SHA512_256_Transform
+#define SHA512_256_Transform _libmd_SHA512_256_Transform
+#endif
+#ifndef SHA512_256_version
+#define SHA512_256_version _libmd_SHA512_256_version
+#endif
+
+void SHA512_224_Init(SHA512_CTX *);
+void SHA512_224_Update(SHA512_CTX *, const void *, size_t);
+void SHA512_224_Final(unsigned char [__min_size(SHA512_224_DIGEST_LENGTH)],
+ SHA512_CTX *);
+#ifndef _KERNEL
+char *SHA512_224_End(SHA512_CTX *, char *);
+char *SHA512_224_Data(const void *, unsigned int, char *);
+char *SHA512_224_Fd(int, char *);
+char *SHA512_224_FdChunk(int, char *, off_t, off_t);
+char *SHA512_224_File(const char *, char *);
+char *SHA512_224_FileChunk(const char *, char *, off_t, off_t);
+#endif
+void SHA512_256_Init(SHA512_CTX *);
+void SHA512_256_Update(SHA512_CTX *, const void *, size_t);
+void SHA512_256_Final(unsigned char [__min_size(SHA512_256_DIGEST_LENGTH)],
+ SHA512_CTX *);
+#ifndef _KERNEL
+char *SHA512_256_End(SHA512_CTX *, char *);
+char *SHA512_256_Data(const void *, unsigned int, char *);
+char *SHA512_256_Fd(int, char *);
+char *SHA512_256_FdChunk(int, char *, off_t, off_t);
+char *SHA512_256_File(const char *, char *);
+char *SHA512_256_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA512T_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c
new file mode 100644
index 000000000000..74c26d03f87f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2008, 2009 Edward Tomasz Napierała <trasz@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/errno.h>
+#include <sys/zfs_acl.h>
+#include <sys/acl.h>
+
+struct zfs2bsd {
+ uint32_t zb_zfs;
+ int zb_bsd;
+};
+
+struct zfs2bsd perms[] = {{ACE_READ_DATA, ACL_READ_DATA},
+ {ACE_WRITE_DATA, ACL_WRITE_DATA},
+ {ACE_EXECUTE, ACL_EXECUTE},
+ {ACE_APPEND_DATA, ACL_APPEND_DATA},
+ {ACE_DELETE_CHILD, ACL_DELETE_CHILD},
+ {ACE_DELETE, ACL_DELETE},
+ {ACE_READ_ATTRIBUTES, ACL_READ_ATTRIBUTES},
+ {ACE_WRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES},
+ {ACE_READ_NAMED_ATTRS, ACL_READ_NAMED_ATTRS},
+ {ACE_WRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS},
+ {ACE_READ_ACL, ACL_READ_ACL},
+ {ACE_WRITE_ACL, ACL_WRITE_ACL},
+ {ACE_WRITE_OWNER, ACL_WRITE_OWNER},
+ {ACE_SYNCHRONIZE, ACL_SYNCHRONIZE},
+ {0, 0}};
+
+struct zfs2bsd flags[] = {{ACE_FILE_INHERIT_ACE,
+ ACL_ENTRY_FILE_INHERIT},
+ {ACE_DIRECTORY_INHERIT_ACE,
+ ACL_ENTRY_DIRECTORY_INHERIT},
+ {ACE_NO_PROPAGATE_INHERIT_ACE,
+ ACL_ENTRY_NO_PROPAGATE_INHERIT},
+ {ACE_INHERIT_ONLY_ACE,
+ ACL_ENTRY_INHERIT_ONLY},
+ {ACE_INHERITED_ACE,
+ ACL_ENTRY_INHERITED},
+ {ACE_SUCCESSFUL_ACCESS_ACE_FLAG,
+ ACL_ENTRY_SUCCESSFUL_ACCESS},
+ {ACE_FAILED_ACCESS_ACE_FLAG,
+ ACL_ENTRY_FAILED_ACCESS},
+ {0, 0}};
+
+static int
+_bsd_from_zfs(uint32_t zfs, const struct zfs2bsd *table)
+{
+ const struct zfs2bsd *tmp;
+ int bsd = 0;
+
+ for (tmp = table; tmp->zb_zfs != 0; tmp++) {
+ if (zfs & tmp->zb_zfs)
+ bsd |= tmp->zb_bsd;
+ }
+
+ return (bsd);
+}
+
+static uint32_t
+_zfs_from_bsd(int bsd, const struct zfs2bsd *table)
+{
+ const struct zfs2bsd *tmp;
+ uint32_t zfs = 0;
+
+ for (tmp = table; tmp->zb_bsd != 0; tmp++) {
+ if (bsd & tmp->zb_bsd)
+ zfs |= tmp->zb_zfs;
+ }
+
+ return (zfs);
+}
+
+int
+acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries)
+{
+ int i;
+ struct acl_entry *entry;
+ const ace_t *ace;
+
+ if (nentries < 1) {
+ printf("acl_from_aces: empty ZFS ACL; returning EINVAL.\n");
+ return (EINVAL);
+ }
+
+ if (nentries > ACL_MAX_ENTRIES) {
+ /*
+ * I believe it may happen only when moving a pool
+ * from SunOS to FreeBSD.
+ */
+ printf("acl_from_aces: ZFS ACL too big to fit "
+ "into 'struct acl'; returning EINVAL.\n");
+ return (EINVAL);
+ }
+
+ bzero(aclp, sizeof (*aclp));
+ aclp->acl_maxcnt = ACL_MAX_ENTRIES;
+ aclp->acl_cnt = nentries;
+
+ for (i = 0; i < nentries; i++) {
+ entry = &(aclp->acl_entry[i]);
+ ace = &(aces[i]);
+
+ if (ace->a_flags & ACE_OWNER)
+ entry->ae_tag = ACL_USER_OBJ;
+ else if (ace->a_flags & ACE_GROUP)
+ entry->ae_tag = ACL_GROUP_OBJ;
+ else if (ace->a_flags & ACE_EVERYONE)
+ entry->ae_tag = ACL_EVERYONE;
+ else if (ace->a_flags & ACE_IDENTIFIER_GROUP)
+ entry->ae_tag = ACL_GROUP;
+ else
+ entry->ae_tag = ACL_USER;
+
+ if (entry->ae_tag == ACL_USER || entry->ae_tag == ACL_GROUP)
+ entry->ae_id = ace->a_who;
+ else
+ entry->ae_id = ACL_UNDEFINED_ID;
+
+ entry->ae_perm = _bsd_from_zfs(ace->a_access_mask, perms);
+ entry->ae_flags = _bsd_from_zfs(ace->a_flags, flags);
+
+ switch (ace->a_type) {
+ case ACE_ACCESS_ALLOWED_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_ALLOW;
+ break;
+ case ACE_ACCESS_DENIED_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_DENY;
+ break;
+ case ACE_SYSTEM_AUDIT_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_AUDIT;
+ break;
+ case ACE_SYSTEM_ALARM_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_ALARM;
+ break;
+ default:
+ panic("acl_from_aces: a_type is 0x%x", ace->a_type);
+ }
+ }
+
+ return (0);
+}
+
+void
+aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp)
+{
+ int i;
+ const struct acl_entry *entry;
+ ace_t *ace;
+
+ bzero(aces, sizeof (*aces) * aclp->acl_cnt);
+
+ *nentries = aclp->acl_cnt;
+
+ for (i = 0; i < aclp->acl_cnt; i++) {
+ entry = &(aclp->acl_entry[i]);
+ ace = &(aces[i]);
+
+ ace->a_who = entry->ae_id;
+
+ if (entry->ae_tag == ACL_USER_OBJ)
+ ace->a_flags = ACE_OWNER;
+ else if (entry->ae_tag == ACL_GROUP_OBJ)
+ ace->a_flags = (ACE_GROUP | ACE_IDENTIFIER_GROUP);
+ else if (entry->ae_tag == ACL_GROUP)
+ ace->a_flags = ACE_IDENTIFIER_GROUP;
+ else if (entry->ae_tag == ACL_EVERYONE)
+ ace->a_flags = ACE_EVERYONE;
+ else /* ACL_USER */
+ ace->a_flags = 0;
+
+ ace->a_access_mask = _zfs_from_bsd(entry->ae_perm, perms);
+ ace->a_flags |= _zfs_from_bsd(entry->ae_flags, flags);
+
+ switch (entry->ae_entry_type) {
+ case ACL_ENTRY_TYPE_ALLOW:
+ ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+ break;
+ case ACL_ENTRY_TYPE_DENY:
+ ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ break;
+ case ACL_ENTRY_TYPE_ALARM:
+ ace->a_type = ACE_SYSTEM_ALARM_ACE_TYPE;
+ break;
+ case ACL_ENTRY_TYPE_AUDIT:
+ ace->a_type = ACE_SYSTEM_AUDIT_ACE_TYPE;
+ break;
+ default:
+ panic("aces_from_acl: ae_entry_type is 0x%x",
+ entry->ae_entry_type);
+ }
+ }
+}
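Editorial note, not part of the patch: _bsd_from_zfs() and _zfs_from_bsd() above are table-driven bitmask translators over the zero-terminated perms[] and flags[] tables. A self-contained sketch of the same pattern; the SRC_*/DST_* flag values here are made up purely for illustration:

#include <stdio.h>
#include <stdint.h>

struct map { uint32_t from; int to; };

/* Hypothetical flag values, for illustration only. */
#define SRC_READ	0x01
#define SRC_WRITE	0x02
#define DST_READ	0x10
#define DST_WRITE	0x20

static const struct map table[] = {
	{ SRC_READ,	DST_READ },
	{ SRC_WRITE,	DST_WRITE },
	{ 0,		0 }		/* terminator, as in perms[]/flags[] */
};

static int
translate(uint32_t src)
{
	int dst = 0;

	/* OR in the destination bit for every source bit that is set. */
	for (const struct map *m = table; m->from != 0; m++)
		if (src & m->from)
			dst |= m->to;
	return (dst);
}

int
main(void)
{
	printf("0x%x\n", translate(SRC_READ | SRC_WRITE));	/* 0x30 */
	return (0);
}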
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c
new file mode 100644
index 000000000000..80040fc6a3e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/atomic.h>
+
+#if !defined(__LP64__) && !defined(__mips_n32) && \
+ !defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64) && \
+ !defined(HAS_EMULATED_ATOMIC64)
+
+#ifdef _KERNEL
+#include <sys/kernel.h>
+
+struct mtx atomic_mtx;
+MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF);
+#else
+#include <pthread.h>
+
+#define mtx_lock(lock) pthread_mutex_lock(lock)
+#define mtx_unlock(lock) pthread_mutex_unlock(lock)
+
+static pthread_mutex_t atomic_mtx;
+
+static __attribute__((constructor)) void
+atomic_init(void)
+{
+ pthread_mutex_init(&atomic_mtx, NULL);
+}
+#endif
+
+void
+atomic_add_64(volatile uint64_t *target, int64_t delta)
+{
+
+ mtx_lock(&atomic_mtx);
+ *target += delta;
+ mtx_unlock(&atomic_mtx);
+}
+
+void
+atomic_dec_64(volatile uint64_t *target)
+{
+
+ mtx_lock(&atomic_mtx);
+ *target -= 1;
+ mtx_unlock(&atomic_mtx);
+}
+
+uint64_t
+atomic_swap_64(volatile uint64_t *a, uint64_t value)
+{
+ uint64_t ret;
+
+ mtx_lock(&atomic_mtx);
+ ret = *a;
+ *a = value;
+ mtx_unlock(&atomic_mtx);
+ return (ret);
+}
+
+uint64_t
+atomic_load_64(volatile uint64_t *a)
+{
+ uint64_t ret;
+
+ mtx_lock(&atomic_mtx);
+ ret = *a;
+ mtx_unlock(&atomic_mtx);
+ return (ret);
+}
+
+uint64_t
+atomic_add_64_nv(volatile uint64_t *target, int64_t delta)
+{
+ uint64_t newval;
+
+ mtx_lock(&atomic_mtx);
+ newval = (*target += delta);
+ mtx_unlock(&atomic_mtx);
+ return (newval);
+}
+
+uint64_t
+atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval)
+{
+ uint64_t oldval;
+
+ mtx_lock(&atomic_mtx);
+ oldval = *target;
+ if (oldval == cmp)
+ *target = newval;
+ mtx_unlock(&atomic_mtx);
+ return (oldval);
+}
+#endif
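Editorial note, not part of the patch: on platforms without native 64-bit atomics the operations above are emulated with a single global mutex. The usual caller-side pattern for atomic_cas_64() is a compare-and-swap retry loop. A stand-alone sketch that mirrors the mutex-based fallback with pthreads and then uses it in such a loop; the local atomic_cas_64() here is a stand-in, not the SPL symbol:

#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

static pthread_mutex_t atomic_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Minimal stand-in for the mutex-emulated atomic_cas_64() above. */
static uint64_t
atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval)
{
	uint64_t oldval;

	pthread_mutex_lock(&atomic_mtx);
	oldval = *target;
	if (oldval == cmp)
		*target = newval;
	pthread_mutex_unlock(&atomic_mtx);
	return (oldval);
}

int
main(void)
{
	volatile uint64_t counter = 5;
	uint64_t oldv, newv;

	/* Classic CAS retry loop: recompute and retry until unchanged. */
	do {
		oldv = counter;
		newv = oldv + 3;
	} while (atomic_cas_64(&counter, oldv, newv) != oldv);

	printf("%llu\n", (unsigned long long)counter);	/* prints 8 */
	return (0);
}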
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c
new file mode 100644
index 000000000000..22c7338b7399
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+/*
+ * Copyright 2007 John Birrell <jb@FreeBSD.org>. All rights reserved.
+ * Copyright 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cmn_err.h>
+
+void
+vcmn_err(int ce, const char *fmt, va_list adx)
+{
+ char buf[256];
+ const char *prefix;
+
+ prefix = NULL; /* silence unwitty compilers */
+ switch (ce) {
+ case CE_CONT:
+ prefix = "Solaris(cont): ";
+ break;
+ case CE_NOTE:
+ prefix = "Solaris: NOTICE: ";
+ break;
+ case CE_WARN:
+ prefix = "Solaris: WARNING: ";
+ break;
+ case CE_PANIC:
+ prefix = "Solaris(panic): ";
+ break;
+ case CE_IGNORE:
+ break;
+ default:
+ panic("Solaris: unknown severity level");
+ }
+ if (ce == CE_PANIC) {
+ vsnprintf(buf, sizeof (buf), fmt, adx);
+ panic("%s%s", prefix, buf);
+ }
+ if (ce != CE_IGNORE) {
+ printf("%s", prefix);
+ vprintf(fmt, adx);
+ printf("\n");
+ }
+}
+
+void
+cmn_err(int type, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vcmn_err(type, fmt, ap);
+ va_end(ap);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c
new file mode 100644
index 000000000000..6b2872bcc066
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2014 The FreeBSD Project.
+ * All rights reserved.
+ *
+ * This software was developed by Steven Hartland.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/sdt.h>
+
+/* CSTYLED */
+SDT_PROBE_DEFINE1(sdt, , , set__error, "int");
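+
+/*
+ * Illustrative use (command hypothetical): the probe above is visible to
+ * dtrace(1) as sdt:::set-error and can be traced with, e.g.,
+ *
+ *	dtrace -n 'sdt:::set-error { printf("%d", arg0); }'
+ */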
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
new file mode 100644
index 000000000000..cfc61dd7fc2a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2006-2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/byteorder.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/debug.h>
+#include <sys/mutex.h>
+#include <sys/vmmeter.h>
+
+
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+
+#ifdef KMEM_DEBUG
+#include <sys/queue.h>
+#include <sys/stack.h>
+#endif
+
+#ifdef _KERNEL
+MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris");
+#else
+#define malloc(size, type, flags) malloc(size)
+#define free(addr, type) free(addr)
+#endif
+
+#ifdef KMEM_DEBUG
+struct kmem_item {
+ struct stack stack;
+ LIST_ENTRY(kmem_item) next;
+};
+static LIST_HEAD(, kmem_item) kmem_items;
+static struct mtx kmem_items_mtx;
+MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF);
+#endif /* KMEM_DEBUG */
+
+#include <sys/vmem.h>
+
+void *
+zfs_kmem_alloc(size_t size, int kmflags)
+{
+ void *p;
+#ifdef KMEM_DEBUG
+ struct kmem_item *i;
+
+ size += sizeof (struct kmem_item);
+#endif
+ p = malloc(MAX(size, 16), M_SOLARIS, kmflags);
+#ifndef _KERNEL
+ if (kmflags & KM_SLEEP)
+ assert(p != NULL);
+#endif
+#ifdef KMEM_DEBUG
+ if (p != NULL) {
+ i = p;
+ p = (uint8_t *)p + sizeof (struct kmem_item);
+ stack_save(&i->stack);
+ mtx_lock(&kmem_items_mtx);
+ LIST_INSERT_HEAD(&kmem_items, i, next);
+ mtx_unlock(&kmem_items_mtx);
+ }
+#endif
+ return (p);
+}
+
+void
+zfs_kmem_free(void *buf, size_t size __unused)
+{
+#ifdef KMEM_DEBUG
+ if (buf == NULL) {
+ printf("%s: attempt to free NULL\n", __func__);
+ return;
+ }
+ struct kmem_item *i;
+
+ buf = (uint8_t *)buf - sizeof (struct kmem_item);
+ mtx_lock(&kmem_items_mtx);
+ LIST_FOREACH(i, &kmem_items, next) {
+ if (i == buf)
+ break;
+ }
+ ASSERT(i != NULL);
+ LIST_REMOVE(i, next);
+ mtx_unlock(&kmem_items_mtx);
+ memset(buf, 0xDC, MAX(size, 16));
+#endif
+ free(buf, M_SOLARIS);
+}
+
+static uint64_t kmem_size_val;
+
+static void
+kmem_size_init(void *unused __unused)
+{
+
+ kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE;
+ if (kmem_size_val > vm_kmem_size)
+ kmem_size_val = vm_kmem_size;
+}
+SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL);
+
+uint64_t
+kmem_size(void)
+{
+
+ return (kmem_size_val);
+}
+
+static int
+kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
+{
+ struct kmem_cache *cache = private;
+
+ return (cache->kc_constructor(mem, cache->kc_private, flags));
+}
+
+static void
+kmem_std_destructor(void *mem, int size __unused, void *private)
+{
+ struct kmem_cache *cache = private;
+
+ cache->kc_destructor(mem, cache->kc_private);
+}
+
+kmem_cache_t *
+kmem_cache_create(char *name, size_t bufsize, size_t align,
+ int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
+ void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags)
+{
+ kmem_cache_t *cache;
+
+ ASSERT(vmp == NULL);
+
+ cache = kmem_alloc(sizeof (*cache), KM_SLEEP);
+ strlcpy(cache->kc_name, name, sizeof (cache->kc_name));
+ cache->kc_constructor = constructor;
+ cache->kc_destructor = destructor;
+ cache->kc_private = private;
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ cache->kc_zone = uma_zcreate(cache->kc_name, bufsize,
+ constructor != NULL ? kmem_std_constructor : NULL,
+ destructor != NULL ? kmem_std_destructor : NULL,
+ NULL, NULL, align > 0 ? align - 1 : 0, cflags);
+#else
+ cache->kc_size = bufsize;
+#endif
+
+ return (cache);
+}
+
+void
+kmem_cache_destroy(kmem_cache_t *cache)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ uma_zdestroy(cache->kc_zone);
+#endif
+ kmem_free(cache, sizeof (*cache));
+}
+
+void *
+kmem_cache_alloc(kmem_cache_t *cache, int flags)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ return (uma_zalloc_arg(cache->kc_zone, cache, flags));
+#else
+ void *p;
+
+ p = kmem_alloc(cache->kc_size, flags);
+ if (p != NULL && cache->kc_constructor != NULL)
+ kmem_std_constructor(p, cache->kc_size, cache, flags);
+ return (p);
+#endif
+}
+
+void
+kmem_cache_free(kmem_cache_t *cache, void *buf)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ uma_zfree_arg(cache->kc_zone, buf, cache);
+#else
+ if (cache->kc_destructor != NULL)
+ kmem_std_destructor(buf, cache->kc_size, cache);
+ kmem_free(buf, cache->kc_size);
+#endif
+}
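+
+/*
+ * Illustrative lifecycle sketch (names hypothetical): callers pair
+ * kmem_cache_create()/kmem_cache_destroy() and allocate objects from the
+ * cache in between:
+ *
+ *	cache = kmem_cache_create("foo_cache", sizeof (foo_t), 0,
+ *	    foo_ctor, foo_dtor, NULL, NULL, NULL, 0);
+ *	foo = kmem_cache_alloc(cache, KM_SLEEP);
+ *	...
+ *	kmem_cache_free(cache, foo);
+ *	kmem_cache_destroy(cache);
+ */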
+
+/*
+ * Allow our caller to determine if there are running reaps.
+ *
+ * This call is very conservative and may return B_TRUE even when
+ * reaping activity isn't active. If it returns B_FALSE, then reaping
+ * activity is definitely inactive.
+ */
+boolean_t
+kmem_cache_reap_active(void)
+{
+
+ return (B_FALSE);
+}
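+
+/*
+ * Illustrative sketch (hypothetical caller): combine the two calls below to
+ * avoid piling up reap tasks:
+ *
+ *	if (!kmem_cache_reap_active())
+ *		kmem_cache_reap_soon(cache);
+ */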
+
+/*
+ * Reap (almost) everything soon.
+ *
+ * Note: this does not wait for the reap-tasks to complete. Caller
+ * should use kmem_cache_reap_active() (above) and/or moderation to
+ * avoid scheduling too many reap-tasks.
+ */
+#ifdef _KERNEL
+void
+kmem_cache_reap_soon(kmem_cache_t *cache)
+{
+#ifndef KMEM_DEBUG
+#if __FreeBSD_version >= 1300043
+ uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN);
+#else
+ zone_drain(cache->kc_zone);
+#endif
+#endif
+}
+
+void
+kmem_reap(void)
+{
+#if __FreeBSD_version >= 1300043
+ uma_reclaim(UMA_RECLAIM_TRIM);
+#else
+ uma_reclaim();
+#endif
+}
+#else
+void
+kmem_cache_reap_soon(kmem_cache_t *cache __unused)
+{
+}
+
+void
+kmem_reap(void)
+{
+}
+#endif
+
+int
+kmem_debugging(void)
+{
+ return (0);
+}
+
+void *
+calloc(size_t n, size_t s)
+{
+ return (kmem_zalloc(n * s, KM_NOSLEEP));
+}
+
+char *
+kmem_vasprintf(const char *fmt, va_list adx)
+{
+ char *msg;
+ va_list adx2;
+
+ va_copy(adx2, adx);
+ msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP);
+ (void) vsprintf(msg, fmt, adx2);
+ va_end(adx2);
+
+ return (msg);
+}
+
+#include <vm/uma.h>
+#include <vm/uma_int.h>
+#ifdef KMEM_DEBUG
+#error "KMEM_DEBUG not currently supported"
+#endif
+
+uint64_t
+spl_kmem_cache_inuse(kmem_cache_t *cache)
+{
+ return (uma_zone_get_cur(cache->kc_zone));
+}
+
+uint64_t
+spl_kmem_cache_entry_size(kmem_cache_t *cache)
+{
+ return (cache->kc_zone->uz_size);
+}
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(kmem_cache_t *skc,
+ kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+ ASSERT(move != NULL);
+}
+
+#ifdef KMEM_DEBUG
+void kmem_show(void *);
+void
+kmem_show(void *dummy __unused)
+{
+ struct kmem_item *i;
+
+ mtx_lock(&kmem_items_mtx);
+ if (LIST_EMPTY(&kmem_items))
+ printf("KMEM_DEBUG: No leaked elements.\n");
+ else {
+ printf("KMEM_DEBUG: Leaked elements:\n\n");
+ LIST_FOREACH(i, &kmem_items, next) {
+ printf("address=%p\n", i);
+ stack_print_ddb(&i->stack);
+ printf("\n");
+ }
+ }
+ mtx_unlock(&kmem_items_mtx);
+}
+
+SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL);
+#endif /* KMEM_DEBUG */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c
new file mode 100644
index 000000000000..6bdef466c253
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Links to Illumos.org for more information on kstat function:
+ * [1] https://illumos.org/man/1M/kstat
+ * [2] https://illumos.org/man/9f/kstat_create
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/sysctl.h>
+#include <sys/kstat.h>
+#include <sys/sbuf.h>
+
+static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics");
+
+SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics");
+
+void
+__kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index))
+{
+ ksp->ks_raw_ops.headers = headers;
+ ksp->ks_raw_ops.data = data;
+ ksp->ks_raw_ops.addr = addr;
+}
+
+void
+__kstat_set_seq_raw_ops(kstat_t *ksp,
+ int (*headers)(struct seq_file *f),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index))
+{
+ ksp->ks_raw_ops.seq_headers = headers;
+ ksp->ks_raw_ops.data = data;
+ ksp->ks_raw_ops.addr = addr;
+}
+
+static int
+kstat_default_update(kstat_t *ksp, int rw)
+{
+ ASSERT(ksp != NULL);
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+static int
+kstat_resize_raw(kstat_t *ksp)
+{
+ if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX)
+ return (ENOMEM);
+
+ free(ksp->ks_raw_buf, M_TEMP);
+ ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX);
+ ksp->ks_raw_buf = malloc(ksp->ks_raw_bufsize, M_TEMP, M_WAITOK);
+
+ return (0);
+}
+
+static void *
+kstat_raw_default_addr(kstat_t *ksp, loff_t n)
+{
+ if (n == 0)
+ return (ksp->ks_data);
+ return (NULL);
+}
+
+static int
+kstat_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ kstat_t *ksp = arg1;
+ kstat_named_t *ksent;
+ uint64_t val;
+
+ ksent = ksp->ks_data;
+ /* Select the correct element */
+ ksent += arg2;
+ /* Update the aggsums before reading */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+ val = ksent->value.ui64;
+
+ return (sysctl_handle_64(oidp, &val, 0, req));
+}
+
+static int
+kstat_sysctl_string(SYSCTL_HANDLER_ARGS)
+{
+ kstat_t *ksp = arg1;
+ kstat_named_t *ksent = ksp->ks_data;
+ char *val;
+ uint32_t len = 0;
+
+ /* Select the correct element */
+ ksent += arg2;
+ /* Update the aggsums before reading */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+ val = KSTAT_NAMED_STR_PTR(ksent);
+ len = KSTAT_NAMED_STR_BUFLEN(ksent);
+ val[len-1] = '\0';
+
+ return (sysctl_handle_string(oidp, val, len, req));
+}
+
+static int
+kstat_sysctl_io(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *sb;
+ kstat_t *ksp = arg1;
+ kstat_io_t *kip = ksp->ks_data;
+ int rc;
+
+ sb = sbuf_new_auto();
+ if (sb == NULL)
+ return (ENOMEM);
+ /* Update the aggsums before reading */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+
+ /* though wlentime & friends are signed, they will never be negative */
+ sbuf_printf(sb,
+ "%-8llu %-8llu %-8u %-8u %-8llu %-8llu "
+ "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n",
+ kip->nread, kip->nwritten,
+ kip->reads, kip->writes,
+ kip->wtime, kip->wlentime, kip->wlastupdate,
+ kip->rtime, kip->rlentime, kip->rlastupdate,
+ kip->wcnt, kip->rcnt);
+ rc = sbuf_finish(sb);
+ if (rc == 0)
+ rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
+ sbuf_delete(sb);
+ return (rc);
+}
+
+static int
+kstat_sysctl_raw(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *sb;
+ void *data;
+ kstat_t *ksp = arg1;
+ void *(*addr_op)(kstat_t *ksp, loff_t index);
+ int n, has_header, rc = 0;
+
+ sb = sbuf_new_auto();
+ if (sb == NULL)
+ return (ENOMEM);
+
+ if (ksp->ks_raw_ops.addr)
+ addr_op = ksp->ks_raw_ops.addr;
+ else
+ addr_op = kstat_raw_default_addr;
+
+ mutex_enter(ksp->ks_lock);
+
+ /* Update the aggsums before reading */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+
+ ksp->ks_raw_bufsize = PAGE_SIZE;
+ ksp->ks_raw_buf = malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
+
+ n = 0;
+ has_header = (ksp->ks_raw_ops.headers ||
+ ksp->ks_raw_ops.seq_headers);
+
+restart_headers:
+ if (ksp->ks_raw_ops.headers) {
+ rc = ksp->ks_raw_ops.headers(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ } else if (ksp->ks_raw_ops.seq_headers) {
+ struct seq_file f;
+
+ f.sf_buf = ksp->ks_raw_buf;
+ f.sf_size = ksp->ks_raw_bufsize;
+ rc = ksp->ks_raw_ops.seq_headers(&f);
+ }
+ if (has_header) {
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart_headers;
+ if (rc == 0)
+ sbuf_printf(sb, "\n%s", ksp->ks_raw_buf);
+ }
+
+ while ((data = addr_op(ksp, n)) != NULL) {
+restart:
+ if (ksp->ks_raw_ops.data) {
+ rc = ksp->ks_raw_ops.data(ksp->ks_raw_buf,
+ ksp->ks_raw_bufsize, data);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (rc == 0)
+ sbuf_printf(sb, "%s", ksp->ks_raw_buf);
+
+ } else {
+ ASSERT(ksp->ks_ndata == 1);
+ sbuf_hexdump(sb, ksp->ks_data,
+ ksp->ks_data_size, NULL, 0);
+ }
+ n++;
+ }
+ free(ksp->ks_raw_buf, M_TEMP);
+ mutex_exit(ksp->ks_lock);
+ sbuf_trim(sb);
+ rc = sbuf_finish(sb);
+ if (rc == 0)
+ rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
+ sbuf_delete(sb);
+ return (rc);
+}
+
+kstat_t *
+__kstat_create(const char *module, int instance, const char *name,
+ const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags)
+{
+ char buf[KSTAT_STRLEN];
+ struct sysctl_oid *root;
+ kstat_t *ksp;
+ char *pool;
+
+ KASSERT(instance == 0, ("instance=%d", instance));
+ if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
+ ASSERT(ks_ndata == 1);
+
+ if (class == NULL)
+ class = "misc";
+
+ /*
+ * Allocate the main structure. We don't need to keep a copy of
+ * module in here, because it is only used for sysctl node creation
+ * done in this function.
+ */
+ ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO);
+
+ ksp->ks_crtime = gethrtime();
+ ksp->ks_snaptime = ksp->ks_crtime;
+ ksp->ks_instance = instance;
+ (void) strlcpy(ksp->ks_name, name, KSTAT_STRLEN);
+ (void) strlcpy(ksp->ks_class, class, KSTAT_STRLEN);
+ ksp->ks_type = ks_type;
+ ksp->ks_flags = flags;
+ ksp->ks_update = kstat_default_update;
+
+ mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL);
+ ksp->ks_lock = &ksp->ks_private_lock;
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ ksp->ks_ndata = 1;
+ ksp->ks_data_size = ks_ndata;
+ break;
+ case KSTAT_TYPE_NAMED:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
+ break;
+ default:
+ panic("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+ ksp->ks_data = NULL;
+ } else {
+ ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
+ if (ksp->ks_data == NULL) {
+ kmem_free(ksp, sizeof (*ksp));
+ ksp = NULL;
+ }
+ }
+
+ /*
+ * Some kstats use a module name like "zfs/poolname" to distinguish a
+ * set of kstats belonging to a specific pool. Split on '/' to add an
+ * extra node for the pool name if needed.
+ */
+ (void) strlcpy(buf, module, KSTAT_STRLEN);
+ module = buf;
+ pool = strchr(module, '/');
+ if (pool != NULL)
+ *pool++ = '\0';
+
+ /*
+ * Create sysctl tree for those statistics:
+ *
+ * kstat.<module>[.<pool>].<class>.<name>
+ */
+ sysctl_ctx_init(&ksp->ks_sysctl_ctx);
+ root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
+ SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0,
+ "");
+ if (root == NULL) {
+ printf("%s: Cannot create kstat.%s tree!\n", __func__, module);
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+ return (NULL);
+ }
+ if (pool != NULL) {
+ root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(root), OID_AUTO, pool, CTLFLAG_RW, 0, "");
+ if (root == NULL) {
+ printf("%s: Cannot create kstat.%s.%s tree!\n",
+ __func__, module, pool);
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+ return (NULL);
+ }
+ }
+ root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root),
+ OID_AUTO, class, CTLFLAG_RW, 0, "");
+ if (root == NULL) {
+ if (pool != NULL)
+ printf("%s: Cannot create kstat.%s.%s.%s tree!\n",
+ __func__, module, pool, class);
+ else
+ printf("%s: Cannot create kstat.%s.%s tree!\n",
+ __func__, module, class);
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+ return (NULL);
+ }
+ if (ksp->ks_type == KSTAT_TYPE_NAMED) {
+ root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(root),
+ OID_AUTO, name, CTLFLAG_RW, 0, "");
+ if (root == NULL) {
+ if (pool != NULL)
+ printf("%s: Cannot create kstat.%s.%s.%s.%s "
+ "tree!\n", __func__, module, pool, class,
+ name);
+ else
+ printf("%s: Cannot create kstat.%s.%s.%s "
+ "tree!\n", __func__, module, class, name);
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+ return (NULL);
+ }
+
+ }
+ ksp->ks_sysctl_root = root;
+
+ return (ksp);
+}
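+
+/*
+ * Illustrative mapping (names hypothetical): a named kstat created as
+ *
+ *	kstat_create("zfs/mypool", 0, "iostats", "misc", KSTAT_TYPE_NAMED,
+ *	    n, 0);
+ *
+ * appears in the sysctl tree as kstat.zfs.mypool.misc.iostats.<stat>,
+ * following the kstat.<module>[.<pool>].<class>.<name> scheme above.
+ */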
+
+static void
+kstat_install_named(kstat_t *ksp)
+{
+ kstat_named_t *ksent;
+ char *namelast;
+ int typelast;
+
+ ksent = ksp->ks_data;
+
+ VERIFY((ksp->ks_flags & KSTAT_FLAG_VIRTUAL) || ksent != NULL);
+
+ typelast = 0;
+ namelast = NULL;
+
+ for (int i = 0; i < ksp->ks_ndata; i++, ksent++) {
+ if (ksent->data_type != 0) {
+ typelast = ksent->data_type;
+ namelast = ksent->name;
+ }
+ switch (typelast) {
+ case KSTAT_DATA_CHAR:
+ /* Not Implemented */
+ break;
+ case KSTAT_DATA_INT32:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_S32 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "I", namelast);
+ break;
+ case KSTAT_DATA_UINT32:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_U32 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "IU", namelast);
+ break;
+ case KSTAT_DATA_INT64:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "Q", namelast);
+ break;
+ case KSTAT_DATA_UINT64:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "QU", namelast);
+ break;
+ case KSTAT_DATA_LONG:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "L", namelast);
+ break;
+ case KSTAT_DATA_ULONG:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "LU", namelast);
+ break;
+ case KSTAT_DATA_STRING:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl_string, "A", namelast);
+ break;
+ default:
+ panic("unsupported type: %d", typelast);
+ }
+ }
+}
+
+void
+kstat_install(kstat_t *ksp)
+{
+ struct sysctl_oid *root;
+
+ if (ksp->ks_ndata == UINT32_MAX)
+ VERIFY(ksp->ks_type == KSTAT_TYPE_RAW);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_NAMED:
+ return (kstat_install_named(ksp));
+ case KSTAT_TYPE_RAW:
+ if (ksp->ks_raw_ops.data) {
+ root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, ksp->ks_name, CTLTYPE_STRING | CTLFLAG_RD
+ | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
+ ksp, 0, kstat_sysctl_raw, "A", ksp->ks_name);
+ } else {
+ root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, ksp->ks_name, CTLTYPE_OPAQUE | CTLFLAG_RD
+ | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
+ ksp, 0, kstat_sysctl_raw, "", ksp->ks_name);
+ }
+ break;
+ case KSTAT_TYPE_IO:
+ root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, ksp->ks_name,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, 0, kstat_sysctl_io, "A", ksp->ks_name);
+ break;
+ case KSTAT_TYPE_TIMER:
+ case KSTAT_TYPE_INTR:
+ default:
+ panic("unsupported kstat type %d\n", ksp->ks_type);
+ }
+ VERIFY(root != NULL);
+ ksp->ks_sysctl_root = root;
+}
+
+void
+kstat_delete(kstat_t *ksp)
+{
+
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ ksp->ks_lock = NULL;
+ mutex_destroy(&ksp->ks_private_lock);
+ free(ksp, M_KSTAT);
+}
+
+void
+kstat_waitq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt++;
+ if (wcnt != 0) {
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+ }
+}
+
+void
+kstat_waitq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt--;
+ ASSERT((int)wcnt > 0);
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+}
+
+void
+kstat_runq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt++;
+ if (rcnt != 0) {
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+ }
+}
+
+void
+kstat_runq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt--;
+ ASSERT((int)rcnt > 0);
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+}
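+
+/*
+ * Note on the accounting above (standard kstat I/O semantics, stated here as
+ * a reading aid): wlentime/rlentime accumulate queue-length * time products
+ * and wtime/rtime accumulate non-empty-queue time, so consumers can derive
+ * average queue length and busy percentage from deltas of these counters.
+ */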
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
new file mode 100644
index 000000000000..0354b986cd5f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/misc.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <sys/zfs_context.h>
+
+static struct opensolaris_utsname hw_utsname = {
+ .machine = MACHINE
+};
+
+#ifndef KERNEL_STATIC
+char hw_serial[11] = "0";
+
+utsname_t *
+utsname(void)
+{
+ return (&hw_utsname);
+}
+#endif
+
+static void
+opensolaris_utsname_init(void *arg)
+{
+
+ hw_utsname.sysname = ostype;
+ hw_utsname.nodename = prison0.pr_hostname;
+ hw_utsname.release = osrelease;
+ snprintf(hw_utsname.version, sizeof (hw_utsname.version),
+ "%d", osreldate);
+}
+
+char *
+kmem_strdup(const char *s)
+{
+ char *buf;
+
+ buf = kmem_alloc(strlen(s) + 1, KM_SLEEP);
+ strcpy(buf, s);
+ return (buf);
+}
+
+int
+ddi_copyin(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'from' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyin(from, to, len));
+}
+
+int
+ddi_copyout(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'to' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyout(from, to, len));
+}
+
+int
+spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
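+ /* vpanic(9) does not return. */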
+ vpanic(fmt, ap);
+ va_end(ap);
+}
+
+
+SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
+ opensolaris_utsname_init, NULL);
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
new file mode 100644
index 000000000000..5ecd3d310361
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/jail.h>
+#include <sys/policy.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+
+int
+secpolicy_nfs(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_NFS_DAEMON));
+}
+
+int
+secpolicy_zfs(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+int
+secpolicy_zfs_proc(cred_t *cr, proc_t *proc)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+int
+secpolicy_sys_config(cred_t *cr, int checkonly __unused)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG));
+}
+
+int
+secpolicy_zinject(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_ZFS_INJECT));
+}
+
+int
+secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_VFS_UNMOUNT));
+}
+
+int
+secpolicy_fs_owner(struct mount *mp, cred_t *cr)
+{
+
+ if (zfs_super_owner) {
+ if (cr->cr_uid == mp->mnt_cred->cr_uid &&
+ cr->cr_prison == mp->mnt_cred->cr_prison) {
+ return (0);
+ }
+ }
+ return (EPERM);
+}
+
+/*
+ * This check is done in kern_link(), so we could just return 0 here.
+ */
+extern int hardlink_check_uid;
+int
+secpolicy_basic_link(vnode_t *vp, cred_t *cr)
+{
+
+ if (!hardlink_check_uid)
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_LINK));
+}
+
+int
+secpolicy_vnode_stky_modify(cred_t *cr)
+{
+
+ return (EPERM);
+}
+
+int
+secpolicy_vnode_remove(vnode_t *vp, cred_t *cr)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN));
+}
+
+int
+secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ if ((accmode & VREAD) && spl_priv_check_cred(cr, PRIV_VFS_READ) != 0)
+ return (EACCES);
+ if ((accmode & VWRITE) &&
+ spl_priv_check_cred(cr, PRIV_VFS_WRITE) != 0) {
+ return (EACCES);
+ }
+ if (accmode & VEXEC) {
+ if (vp->v_type == VDIR) {
+ if (spl_priv_check_cred(cr, PRIV_VFS_LOOKUP) != 0)
+ return (EACCES);
+ } else {
+ if (spl_priv_check_cred(cr, PRIV_VFS_EXEC) != 0)
+ return (EACCES);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ */
+int
+secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner,
+ accmode_t curmode, accmode_t wantmode)
+{
+ accmode_t mode;
+
+ mode = ~curmode & wantmode;
+
+ if (mode == 0)
+ return (0);
+
+ return (secpolicy_vnode_access(cr, vp, owner, mode));
+}
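+
+/*
+ * Worked example (illustrative): with curmode = VREAD and
+ * wantmode = VREAD | VWRITE, mode = ~curmode & wantmode = VWRITE, so only
+ * the missing VWRITE permission is checked.
+ */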
+
+int
+secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner)
+{
+ static int privs[] = {
+ PRIV_VFS_ADMIN,
+ PRIV_VFS_READ,
+ PRIV_VFS_WRITE,
+ PRIV_VFS_EXEC,
+ PRIV_VFS_LOOKUP
+ };
+ int i;
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ /* Same as secpolicy_vnode_setdac */
+ if (owner == cr->cr_uid)
+ return (0);
+
+ for (i = 0; i < sizeof (privs)/sizeof (int); i++) {
+ int priv;
+
+ switch (priv = privs[i]) {
+ case PRIV_VFS_EXEC:
+ if (vp->v_type == VDIR)
+ continue;
+ break;
+ case PRIV_VFS_LOOKUP:
+ if (vp->v_type != VDIR)
+ continue;
+ break;
+ }
+ if (spl_priv_check_cred(cr, priv) == 0)
+ return (0);
+ }
+ return (EPERM);
+}
+
+int
+secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+ if (owner == cr->cr_uid)
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN));
+}
+
+int
+secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap,
+ const struct vattr *ovap, int flags,
+ int unlocked_access(void *, int, cred_t *), void *node)
+{
+ int mask = vap->va_mask;
+ int error;
+
+ if (mask & AT_SIZE) {
+ if (vp->v_type == VDIR)
+ return (EISDIR);
+ error = unlocked_access(node, VWRITE, cr);
+ if (error)
+ return (error);
+ }
+ if (mask & AT_MODE) {
+ /*
+ * If not the owner of the file, check privilege for two
+ * things: the privilege to set the mode at all and, when
+ * setting the set-uid bit, the additional permission to add
+ * it. Creating a set-uid root file requires still more
+ * privilege.
+ */
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+ if (error)
+ return (error);
+ error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr);
+ if (error)
+ return (error);
+ } else {
+ vap->va_mode = ovap->va_mode;
+ }
+ if (mask & (AT_UID | AT_GID)) {
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+ if (error)
+ return (error);
+
+ /*
+ * To change the owner of a file, or change the group of
+ * a file to a group of which we are not a member, the
+ * caller must have privilege.
+ */
+ if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
+ ((mask & AT_GID) && vap->va_gid != ovap->va_gid &&
+ !groupmember(vap->va_gid, cr))) {
+ if (secpolicy_fs_owner(vp->v_mount, cr) != 0) {
+ error = spl_priv_check_cred(cr, PRIV_VFS_CHOWN);
+ if (error)
+ return (error);
+ }
+ }
+
+ if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
+ ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) {
+ secpolicy_setid_clear(vap, vp, cr);
+ }
+ }
+ if (mask & (AT_ATIME | AT_MTIME)) {
+ /*
+ * From utimes(2):
+ * If times is NULL, ... The caller must be the owner of
+ * the file, have permission to write the file, or be the
+ * super-user.
+ * If times is non-NULL, ... The caller must be the owner of
+ * the file or be the super-user.
+ */
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+ if (error && (vap->va_vaflags & VA_UTIMES_NULL))
+ error = unlocked_access(node, VWRITE, cr);
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+int
+secpolicy_vnode_create_gid(cred_t *cr)
+{
+
+ return (EPERM);
+}
+
+int
+secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid)
+{
+
+ if (groupmember(gid, cr))
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_SETGID));
+}
+
+int
+secpolicy_vnode_setid_retain(znode_t *zp, cred_t *cr,
+ boolean_t issuidroot __unused)
+{
+
+ if (secpolicy_fs_owner(ZTOV(zp)->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID));
+}
+
+void
+secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return;
+
+ if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) {
+ if (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)) {
+ vap->va_mask |= AT_MODE;
+ vap->va_mode &= ~(S_ISUID|S_ISGID);
+ }
+ }
+}
+
+int
+secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap,
+ const struct vattr *ovap, cred_t *cr)
+{
+ int error;
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ /*
+ * Privileged processes may set the sticky bit on non-directories,
+ * as well as set the setgid bit on a file with a group that the process
+ * is not a member of. Both of these are allowed in jail(8).
+ */
+ if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) {
+ if (spl_priv_check_cred(cr, PRIV_VFS_STICKYFILE))
+ return (EFTYPE);
+ }
+ /*
+ * Check for privilege if attempting to set the
+ * group-id bit.
+ */
+ if ((vap->va_mode & S_ISGID) != 0) {
+ error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid);
+ if (error)
+ return (error);
+ }
+ /*
+ * Deny setting setuid if we are not the file owner.
+ */
+ if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) {
+ error = spl_priv_check_cred(cr, PRIV_VFS_ADMIN);
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+int
+secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+int
+secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+ if (owner == cr->cr_uid)
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ /* XXX: vfs_suser()? */
+ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER));
+}
+
+int
+secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_CHOWN));
+}
+
+void
+secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp)
+{
+
+ if (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER) != 0) {
+ MNT_ILOCK(vfsp);
+ vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER;
+ vfs_clearmntopt(vfsp, MNTOPT_SETUID);
+ vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0);
+ MNT_IUNLOCK(vfsp);
+ }
+}
+
+/*
+ * Check privileges for setting xvattr attributes
+ */
+int
+secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr,
+ vtype_t vtype)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_SYSFLAGS));
+}
+
+int
+secpolicy_smb(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_NETSMB));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c
new file mode 100644
index 000000000000..e8448ce00686
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/procfs_list.h>
+
+typedef struct procfs_list_iter {
+ procfs_list_t *pli_pl;
+ void *pli_elt;
+} pli_t;
+
+void
+seq_printf(struct seq_file *f, const char *fmt, ...)
+{
+ va_list adx;
+
+ va_start(adx, fmt);
+ (void) vsnprintf(f->sf_buf, f->sf_size, fmt, adx);
+ va_end(adx);
+}
+
+static int
+procfs_list_update(kstat_t *ksp, int rw)
+{
+ procfs_list_t *pl = ksp->ks_private;
+
+ if (rw == KSTAT_WRITE)
+ pl->pl_clear(pl);
+
+ return (0);
+}
+
+static int
+procfs_list_data(char *buf, size_t size, void *data)
+{
+ pli_t *p;
+ void *elt;
+ procfs_list_t *pl;
+ struct seq_file f;
+
+ p = data;
+ pl = p->pli_pl;
+ elt = p->pli_elt;
+ free(p, M_TEMP);
+ f.sf_buf = buf;
+ f.sf_size = size;
+ return (pl->pl_show(&f, elt));
+}
+
+static void *
+procfs_list_addr(kstat_t *ksp, loff_t n)
+{
+ procfs_list_t *pl = ksp->ks_private;
+ void *elt = ksp->ks_private1;
+ pli_t *p = NULL;
+
+
+ if (n == 0)
+ ksp->ks_private1 = list_head(&pl->pl_list);
+ else if (elt)
+ ksp->ks_private1 = list_next(&pl->pl_list, elt);
+
+ if (ksp->ks_private1) {
+ p = malloc(sizeof (*p), M_TEMP, M_WAITOK);
+ p->pli_pl = pl;
+ p->pli_elt = ksp->ks_private1;
+ }
+
+ return (p);
+}
+
+void
+procfs_list_install(const char *module,
+ const char *submodule,
+ const char *name,
+ mode_t mode,
+ procfs_list_t *procfs_list,
+ int (*show)(struct seq_file *f, void *p),
+ int (*show_header)(struct seq_file *f),
+ int (*clear)(procfs_list_t *procfs_list),
+ size_t procfs_list_node_off)
+{
+ kstat_t *procfs_kstat;
+
+ mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&procfs_list->pl_list,
+ procfs_list_node_off + sizeof (procfs_list_node_t),
+ procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+ procfs_list->pl_show = show;
+ procfs_list->pl_show_header = show_header;
+ procfs_list->pl_clear = clear;
+ procfs_list->pl_next_id = 1;
+ procfs_list->pl_node_offset = procfs_list_node_off;
+
+ procfs_kstat = kstat_create(module, 0, name, submodule,
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+ if (procfs_kstat) {
+ procfs_kstat->ks_lock = &procfs_list->pl_lock;
+ procfs_kstat->ks_ndata = UINT32_MAX;
+ procfs_kstat->ks_private = procfs_list;
+ procfs_kstat->ks_update = procfs_list_update;
+ kstat_set_seq_raw_ops(procfs_kstat, show_header,
+ procfs_list_data, procfs_list_addr);
+ kstat_install(procfs_kstat);
+ procfs_list->pl_private = procfs_kstat;
+ }
+}
+
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{}
+
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+ ASSERT(list_is_empty(&procfs_list->pl_list));
+ kstat_delete(procfs_list->pl_private);
+ list_destroy(&procfs_list->pl_list);
+ mutex_destroy(&procfs_list->pl_lock);
+}
+
+#define NODE_ID(procfs_list, obj) \
+ (((procfs_list_node_t *)(((char *)obj) + \
+ (procfs_list)->pl_node_offset))->pln_id)
+
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+ ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+ NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+ list_insert_tail(&procfs_list->pl_list, p);
+}
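+
+/*
+ * Illustrative sketch (names hypothetical): after procfs_list_install(),
+ * entries are added under the list lock:
+ *
+ *	mutex_enter(&pl->pl_lock);
+ *	procfs_list_add(pl, node);
+ *	mutex_exit(&pl->pl_lock);
+ */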
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
new file mode 100644
index 000000000000..d13b64b4cd26
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/string.h>
+#include <sys/kmem.h>
+#include <machine/stdarg.h>
+
+#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9')
+
+#define IS_ALPHA(c) \
+ (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
+
+char *
+strpbrk(const char *s, const char *b)
+{
+ const char *p;
+
+ do {
+ for (p = b; *p != '\0' && *p != *s; ++p)
+ ;
+ if (*p != '\0')
+ return ((char *)s);
+ } while (*s++);
+
+ return (NULL);
+}
+
+/*
+ * Convert a string into a valid C identifier by replacing invalid
+ * characters with '_'. Also makes sure the string is nul-terminated
+ * and takes up at most n bytes.
+ */
+void
+strident_canon(char *s, size_t n)
+{
+ char c;
+ char *end = s + n - 1;
+
+ if ((c = *s) == 0)
+ return;
+
+ if (!IS_ALPHA(c) && c != '_')
+ *s = '_';
+
+ while (s < end && ((c = *(++s)) != 0)) {
+ if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_')
+ *s = '_';
+ }
+ *s = 0;
+}
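+
+/*
+ * Worked example (illustrative): strident_canon() turns "2tank/data" into
+ * "_tank_data", since a leading digit and the '/' are not valid identifier
+ * characters.
+ */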
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with kmem_strfree(), which relies on strlen() to recover the
+ * allocation size.
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+ int size;
+ va_list adx;
+ char *buf;
+
+ va_start(adx, fmt);
+ size = vsnprintf(NULL, 0, fmt, adx) + 1;
+ va_end(adx);
+
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, size, fmt, adx);
+ va_end(adx);
+
+ return (buf);
+}
+
+void
+kmem_strfree(char *str)
+{
+ ASSERT(str != NULL);
+ kmem_free(str, strlen(str) + 1);
+}
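+
+/*
+ * Illustrative pairing (format and arguments hypothetical):
+ *
+ *	char *msg = kmem_asprintf("vdev %llu failed", (u_longlong_t)guid);
+ *	...
+ *	kmem_strfree(msg);
+ */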
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c
new file mode 100644
index 000000000000..ebec77bdb37f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/misc.h>
+#include <sys/sunddi.h>
+#include <sys/sysctl.h>
+
+int
+ddi_strtol(const char *str, char **nptr, int base, long *result)
+{
+
+ *result = strtol(str, nptr, base);
+ return (0);
+}
+
+int
+ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result)
+{
+
+ if (str == hw_serial) {
+ *result = prison0.pr_hostid;
+ return (0);
+ }
+
+ *result = strtoul(str, nptr, base);
+ return (0);
+}
+
+int
+ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result)
+{
+
+ *result = (unsigned long long)strtouq(str, nptr, base);
+ return (0);
+}
+
+int
+ddi_strtoll(const char *str, char **nptr, int base, long long *result)
+{
+
+ *result = (long long)strtoq(str, nptr, base);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
new file mode 100644
index 000000000000..8c0e495681e9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kmem.h>
+#include <sys/list.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/nvpair.h>
+#include <sys/sunddi.h>
+#include <sys/sysevent.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/bus.h>
+
+static int
+log_sysevent(nvlist_t *event)
+{
+ struct sbuf *sb;
+ const char *type;
+ char typestr[128];
+ nvpair_t *elem = NULL;
+
+ sb = sbuf_new_auto();
+ if (sb == NULL)
+ return (ENOMEM);
+ type = NULL;
+
+ while ((elem = nvlist_next_nvpair(event, elem)) != NULL) {
+ switch (nvpair_type(elem)) {
+ case DATA_TYPE_BOOLEAN:
+ {
+ boolean_t value;
+
+ (void) nvpair_value_boolean_value(elem, &value);
+ sbuf_printf(sb, " %s=%s", nvpair_name(elem),
+ value ? "true" : "false");
+ break;
+ }
+ case DATA_TYPE_UINT8:
+ {
+ uint8_t value;
+
+ (void) nvpair_value_uint8(elem, &value);
+ sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value);
+ break;
+ }
+ case DATA_TYPE_INT32:
+ {
+ int32_t value;
+
+ (void) nvpair_value_int32(elem, &value);
+ sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+ (intmax_t)value);
+ break;
+ }
+ case DATA_TYPE_UINT32:
+ {
+ uint32_t value;
+
+ (void) nvpair_value_uint32(elem, &value);
+ sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+ (uintmax_t)value);
+ break;
+ }
+ case DATA_TYPE_INT64:
+ {
+ int64_t value;
+
+ (void) nvpair_value_int64(elem, &value);
+ sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+ (intmax_t)value);
+ break;
+ }
+ case DATA_TYPE_UINT64:
+ {
+ uint64_t value;
+
+ (void) nvpair_value_uint64(elem, &value);
+ sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+ (uintmax_t)value);
+ break;
+ }
+ case DATA_TYPE_STRING:
+ {
+ char *value;
+
+ (void) nvpair_value_string(elem, &value);
+ sbuf_printf(sb, " %s=%s", nvpair_name(elem), value);
+ if (strcmp(FM_CLASS, nvpair_name(elem)) == 0)
+ type = value;
+ break;
+ }
+ case DATA_TYPE_UINT8_ARRAY:
+ {
+ uint8_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint8_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%02hhx", value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT16_ARRAY:
+ {
+ uint16_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint16_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%04hx", value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT32_ARRAY:
+ {
+ uint32_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint32_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]);
+ break;
+ }
+ case DATA_TYPE_INT64_ARRAY:
+ {
+ int64_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_int64_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%016lld",
+ (long long)value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT64_ARRAY:
+ {
+ uint64_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint64_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]);
+ break;
+ }
+ case DATA_TYPE_STRING_ARRAY:
+ {
+ char **strarr;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_string_array(elem, &strarr, &nelem);
+
+ for (ii = 0; ii < nelem; ii++) {
+ if (strarr[ii] == NULL) {
+ sbuf_printf(sb, " <NULL>");
+ continue;
+ }
+
+ sbuf_printf(sb, " %s", strarr[ii]);
+ if (strcmp(FM_CLASS, strarr[ii]) == 0)
+ type = strarr[ii];
+ }
+ break;
+ }
+ case DATA_TYPE_NVLIST:
+ /* XXX - requires recursing in log_sysevent */
+ break;
+ default:
+ printf("%s: type %d is not implemented\n", __func__,
+ nvpair_type(elem));
+ break;
+ }
+ }
+
+ if (sbuf_finish(sb) != 0) {
+ sbuf_delete(sb);
+ return (ENOMEM);
+ }
+
+ if (type == NULL)
+ type = "";
+ if (strncmp(type, "ESC_ZFS_", 8) == 0) {
+ snprintf(typestr, sizeof (typestr), "misc.fs.zfs.%s", type + 8);
+ type = typestr;
+ }
+ devctl_notify("ZFS", "ZFS", type, sbuf_data(sb));
+ sbuf_delete(sb);
+
+ return (0);
+}
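+
+/*
+ * The notification above is delivered through devctl with both the system
+ * and subsystem set to "ZFS"; devd(8) rules can match on those fields and
+ * on the translated type (e.g. misc.fs.zfs.*). The exact wire format is
+ * devctl's concern and is not assumed here.
+ */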
+
+static void
+sysevent_worker(void *arg __unused)
+{
+ zfs_zevent_t *ze;
+ nvlist_t *event;
+ uint64_t dropped = 0;
+ uint64_t dst_size;
+ int error;
+
+ zfs_zevent_init(&ze);
+ for (;;) {
+ dst_size = 131072;
+ dropped = 0;
+ event = NULL;
+ error = zfs_zevent_next(ze, &event,
+ &dst_size, &dropped);
+ if (error) {
+ error = zfs_zevent_wait(ze);
+ if (error == ESHUTDOWN)
+ break;
+ } else {
+ VERIFY(event != NULL);
+ log_sysevent(event);
+ nvlist_free(event);
+ }
+ }
+ zfs_zevent_destroy(ze);
+ kthread_exit();
+}
+
+void
+ddi_sysevent_init(void)
+{
+ kproc_kthread_add(sysevent_worker, NULL, &system_proc, NULL, 0, 0,
+ "zfskern", "sysevent");
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c
new file mode 100644
index 000000000000..8ad6de9b5e9f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2009 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/ck.h>
+#include <sys/epoch.h>
+#include <sys/kernel.h>
+#include <sys/kmem.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/taskq.h>
+#include <sys/taskqueue.h>
+#include <sys/zfs_context.h>
+
+#if defined(__i386__) || defined(__amd64__) || defined(__aarch64__)
+#include <machine/pcb.h>
+#endif
+
+#include <vm/uma.h>
+
+#if __FreeBSD_version < 1201522
+#define taskqueue_start_threads_in_proc(tqp, count, pri, proc, name, ...) \
+ taskqueue_start_threads(tqp, count, pri, name, __VA_ARGS__)
+#endif
+
+static uint_t taskq_tsd;
+static uma_zone_t taskq_zone;
+
+taskq_t *system_taskq = NULL;
+taskq_t *system_delay_taskq = NULL;
+taskq_t *dynamic_taskq = NULL;
+
+proc_t *system_proc;
+
+extern int uma_align_cache;
+
+static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures");
+
+static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl;
+static unsigned long tqenthash;
+static unsigned long tqenthashlock;
+static struct sx *tqenthashtbl_lock;
+
+static taskqid_t tqidnext;
+
+#define TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash])
+#define TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)])
+
+#define TIMEOUT_TASK 1
+#define NORMAL_TASK 2
+
+static void
+system_taskq_init(void *arg)
+{
+ int i;
+
+ tsd_create(&taskq_tsd, NULL);
+ tqenthashtbl = hashinit(mp_ncpus * 8, M_TASKQ, &tqenthash);
+ tqenthashlock = (tqenthash + 1) / 8;
+ if (tqenthashlock > 0)
+ tqenthashlock--;
+ tqenthashtbl_lock =
+ malloc(sizeof (*tqenthashtbl_lock) * (tqenthashlock + 1),
+ M_TASKQ, M_WAITOK | M_ZERO);
+ for (i = 0; i < tqenthashlock + 1; i++)
+ sx_init_flags(&tqenthashtbl_lock[i], "tqenthash", SX_DUPOK);
+ taskq_zone = uma_zcreate("taskq_zone", sizeof (taskq_ent_t),
+ NULL, NULL, NULL, NULL,
+ UMA_ALIGN_CACHE, 0);
+ system_taskq = taskq_create("system_taskq", mp_ncpus, minclsyspri,
+ 0, 0, 0);
+ system_delay_taskq = taskq_create("system_delay_taskq", mp_ncpus,
+ minclsyspri, 0, 0, 0);
+}
+SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init,
+ NULL);
+
+static void
+system_taskq_fini(void *arg)
+{
+ int i;
+
+ taskq_destroy(system_delay_taskq);
+ taskq_destroy(system_taskq);
+ uma_zdestroy(taskq_zone);
+ tsd_destroy(&taskq_tsd);
+ for (i = 0; i < tqenthashlock + 1; i++)
+ sx_destroy(&tqenthashtbl_lock[i]);
+ for (i = 0; i < tqenthash + 1; i++)
+ VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i]));
+ free(tqenthashtbl_lock, M_TASKQ);
+ free(tqenthashtbl, M_TASKQ);
+}
+SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini,
+ NULL);
+
+#ifdef __LP64__
+static taskqid_t
+__taskq_genid(void)
+{
+ taskqid_t tqid;
+
+ /*
+ * Assume a 64-bit counter will not wrap in practice.
+ */
+ tqid = atomic_add_64_nv(&tqidnext, 1);
+ VERIFY(tqid);
+ return (tqid);
+}
+#else
+static taskqid_t
+__taskq_genid(void)
+{
+ taskqid_t tqid;
+
+ for (;;) {
+ tqid = atomic_add_32_nv(&tqidnext, 1);
+ if (__predict_true(tqid != 0))
+ break;
+ }
+ VERIFY(tqid);
+ return (tqid);
+}
+#endif
+
+static taskq_ent_t *
+taskq_lookup(taskqid_t tqid)
+{
+ taskq_ent_t *ent = NULL;
+
+ sx_xlock(TQIDHASHLOCK(tqid));
+ CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) {
+ if (ent->tqent_id == tqid)
+ break;
+ }
+ if (ent != NULL)
+ refcount_acquire(&ent->tqent_rc);
+ sx_xunlock(TQIDHASHLOCK(tqid));
+ return (ent);
+}
+
+static taskqid_t
+taskq_insert(taskq_ent_t *ent)
+{
+ taskqid_t tqid;
+
+ tqid = __taskq_genid();
+ ent->tqent_id = tqid;
+ ent->tqent_registered = B_TRUE;
+ sx_xlock(TQIDHASHLOCK(tqid));
+ CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash);
+ sx_xunlock(TQIDHASHLOCK(tqid));
+ return (tqid);
+}
+
+static void
+taskq_remove(taskq_ent_t *ent)
+{
+ taskqid_t tqid = ent->tqent_id;
+
+ if (!ent->tqent_registered)
+ return;
+
+ sx_xlock(TQIDHASHLOCK(tqid));
+ CK_LIST_REMOVE(ent, tqent_hash);
+ sx_xunlock(TQIDHASHLOCK(tqid));
+ ent->tqent_registered = B_FALSE;
+}
+
+static void
+taskq_tsd_set(void *context)
+{
+ taskq_t *tq = context;
+
+#if defined(__amd64__) || defined(__aarch64__)
+ if (context != NULL && tsd_get(taskq_tsd) == NULL)
+ fpu_kern_thread(FPU_KERN_NORMAL);
+#endif
+ tsd_set(taskq_tsd, tq);
+}
+
+static taskq_t *
+taskq_create_impl(const char *name, int nthreads, pri_t pri,
+ proc_t *proc __maybe_unused, uint_t flags)
+{
+ taskq_t *tq;
+
+ if ((flags & TASKQ_THREADS_CPU_PCT) != 0)
+ nthreads = MAX((mp_ncpus * nthreads) / 100, 1);
+
+ tq = kmem_alloc(sizeof (*tq), KM_SLEEP);
+ tq->tq_queue = taskqueue_create(name, M_WAITOK,
+ taskqueue_thread_enqueue, &tq->tq_queue);
+ taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT,
+ taskq_tsd_set, tq);
+ taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN,
+ taskq_tsd_set, NULL);
+ (void) taskqueue_start_threads_in_proc(&tq->tq_queue, nthreads, pri,
+ proc, "%s", name);
+
+ return ((taskq_t *)tq);
+}
+
+taskq_t *
+taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused,
+ int maxalloc __unused, uint_t flags)
+{
+ return (taskq_create_impl(name, nthreads, pri, system_proc, flags));
+}
+
+taskq_t *
+taskq_create_proc(const char *name, int nthreads, pri_t pri,
+ int minalloc __unused, int maxalloc __unused, proc_t *proc, uint_t flags)
+{
+ return (taskq_create_impl(name, nthreads, pri, proc, flags));
+}
+
+void
+taskq_destroy(taskq_t *tq)
+{
+
+ taskqueue_free(tq->tq_queue);
+ kmem_free(tq, sizeof (*tq));
+}
+
+int
+taskq_member(taskq_t *tq, kthread_t *thread)
+{
+
+ return (taskqueue_member(tq->tq_queue, thread));
+}
+
+taskq_t *
+taskq_of_curthread(void)
+{
+ return (tsd_get(taskq_tsd));
+}
+
+static void
+taskq_free(taskq_ent_t *task)
+{
+ taskq_remove(task);
+ if (refcount_release(&task->tqent_rc))
+ uma_zfree(taskq_zone, task);
+}
+
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t tid)
+{
+ uint32_t pend;
+ int rc;
+ taskq_ent_t *ent;
+
+ if (tid == 0)
+ return (0);
+
+ if ((ent = taskq_lookup(tid)) == NULL)
+ return (0);
+
+ ent->tqent_cancelled = B_TRUE;
+ if (ent->tqent_type == TIMEOUT_TASK) {
+ rc = taskqueue_cancel_timeout(tq->tq_queue,
+ &ent->tqent_timeout_task, &pend);
+ } else
+ rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
+ if (rc == EBUSY) {
+ taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+ } else if (pend) {
+ /*
+ * Tasks normally free themselves when run, but here the task
+ * was cancelled so it did not free itself.
+ */
+ taskq_free(ent);
+ }
+ /* Free the extra reference we added with taskq_lookup. */
+ taskq_free(ent);
+ return (rc);
+}
+
+static void
+taskq_run(void *arg, int pending __unused)
+{
+ taskq_ent_t *task = arg;
+
+ if (!task->tqent_cancelled)
+ task->tqent_func(task->tqent_arg);
+ taskq_free(task);
+}
+
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
+ uint_t flags, clock_t expire_time)
+{
+ taskq_ent_t *task;
+ taskqid_t tqid;
+ clock_t timo;
+ int mflag;
+
+ timo = expire_time - ddi_get_lbolt();
+ if (timo <= 0)
+ return (taskq_dispatch(tq, func, arg, flags));
+
+ if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP)
+ mflag = M_WAITOK;
+ else
+ mflag = M_NOWAIT;
+
+ task = uma_zalloc(taskq_zone, mflag);
+ if (task == NULL)
+ return (0);
+ task->tqent_func = func;
+ task->tqent_arg = arg;
+ task->tqent_type = TIMEOUT_TASK;
+ task->tqent_cancelled = B_FALSE;
+ refcount_init(&task->tqent_rc, 1);
+ tqid = taskq_insert(task);
+ TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0,
+ taskq_run, task);
+
+ taskqueue_enqueue_timeout(tq->tq_queue, &task->tqent_timeout_task,
+ timo);
+ return (tqid);
+}
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+ taskq_ent_t *task;
+ int mflag, prio;
+ taskqid_t tqid;
+
+ if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP)
+ mflag = M_WAITOK;
+ else
+ mflag = M_NOWAIT;
+ /*
+ * If TQ_FRONT is given, we want higher priority for this task, so it
+ * can go at the front of the queue.
+ */
+ prio = !!(flags & TQ_FRONT);
+
+ task = uma_zalloc(taskq_zone, mflag);
+ if (task == NULL)
+ return (0);
+ refcount_init(&task->tqent_rc, 1);
+ task->tqent_func = func;
+ task->tqent_arg = arg;
+ task->tqent_cancelled = B_FALSE;
+ task->tqent_type = NORMAL_TASK;
+ tqid = taskq_insert(task);
+ TASK_INIT(&task->tqent_task, prio, taskq_run, task);
+ taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
+ return (tqid);
+}
+
+static void
+taskq_run_ent(void *arg, int pending __unused)
+{
+ taskq_ent_t *task = arg;
+
+ task->tqent_func(task->tqent_arg);
+}
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
+ taskq_ent_t *task)
+{
+ int prio;
+
+ /*
+ * If TQ_FRONT is given, we want higher priority for this task, so it
+ * can go at the front of the queue.
+ */
+ prio = !!(flags & TQ_FRONT);
+ task->tqent_cancelled = B_FALSE;
+ task->tqent_registered = B_FALSE;
+ task->tqent_id = 0;
+ task->tqent_func = func;
+ task->tqent_arg = arg;
+
+ TASK_INIT(&task->tqent_task, prio, taskq_run_ent, task);
+ taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
+}
+
+void
+taskq_wait(taskq_t *tq)
+{
+ taskqueue_quiesce(tq->tq_queue);
+}
+
+void
+taskq_wait_id(taskq_t *tq, taskqid_t tid)
+{
+ taskq_ent_t *ent;
+
+ if (tid == 0)
+ return;
+ if ((ent = taskq_lookup(tid)) == NULL)
+ return;
+
+ taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+ taskq_free(ent);
+}
+
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused)
+{
+ taskqueue_drain_all(tq->tq_queue);
+}
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+ return (t->tqent_task.ta_pending == 0);
+}
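+
+/*
+ * Usage sketch for the wrappers above (illustrative only; my_func and
+ * my_arg are hypothetical):
+ *
+ *     taskq_t *tq = taskq_create("example_taskq", 4, minclsyspri, 0, 0, 0);
+ *     taskqid_t id;
+ *
+ *     id = taskq_dispatch(tq, my_func, my_arg, TQ_SLEEP);
+ *     (a TQ_NOSLEEP dispatch instead returns 0 if the entry could not
+ *     be allocated)
+ *     (void) taskq_dispatch_delay(tq, my_func, my_arg, TQ_SLEEP,
+ *         ddi_get_lbolt() + 5 * hz);    runs roughly five seconds later
+ *     taskq_wait(tq);                   quiesce everything queued so far
+ *     taskq_destroy(tq);
+ */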
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c
new file mode 100644
index 000000000000..f5f3524f7b9d
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+#include <sys/zfs_znode.h>
+
+/*
+ * Same as zfs_uiomove(), but doesn't modify the uio structure.
+ * Returns in cbytes how many bytes were copied.
+ */
+int
+zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
+{
+ struct iovec small_iovec[1];
+ struct uio small_uio_clone;
+ struct uio *uio_clone;
+ int error;
+
+ ASSERT3U(zfs_uio_rw(uio), ==, rw);
+ if (zfs_uio_iovcnt(uio) == 1) {
+ small_uio_clone = *(GET_UIO_STRUCT(uio));
+ small_iovec[0] = *(GET_UIO_STRUCT(uio)->uio_iov);
+ small_uio_clone.uio_iov = small_iovec;
+ uio_clone = &small_uio_clone;
+ } else {
+ uio_clone = cloneuio(GET_UIO_STRUCT(uio));
+ }
+
+ error = vn_io_fault_uiomove(p, n, uio_clone);
+ *cbytes = zfs_uio_resid(uio) - uio_clone->uio_resid;
+ if (uio_clone != &small_uio_clone)
+ free(uio_clone, M_IOV);
+ return (error);
+}
+
+/*
+ * Drop the next n chars out of *uiop.
+ */
+void
+zfs_uioskip(zfs_uio_t *uio, size_t n)
+{
+ zfs_uio_seg_t segflg;
+
+ /* For full compatibility with illumos. */
+ if (n > zfs_uio_resid(uio))
+ return;
+
+ segflg = zfs_uio_segflg(uio);
+ zfs_uio_segflg(uio) = UIO_NOCOPY;
+ zfs_uiomove(NULL, n, zfs_uio_rw(uio), uio);
+ zfs_uio_segflg(uio) = segflg;
+}
+
+int
+zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio)
+{
+ ASSERT(zfs_uio_rw(uio) == dir);
+ return (vn_io_fault_uiomove(p, n, GET_UIO_STRUCT(uio)));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c
new file mode 100644
index 000000000000..09c8401267df
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2006-2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/cred.h>
+#include <sys/vfs.h>
+#include <sys/priv.h>
+#include <sys/libkern.h>
+
+#include <sys/mutex.h>
+#include <sys/vnode.h>
+#include <sys/taskq.h>
+
+#include <sys/ccompat.h>
+
+MALLOC_DECLARE(M_MOUNT);
+
+void
+vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
+ int flags __unused)
+{
+ struct vfsopt *opt;
+ size_t namesize;
+ int locked;
+
+ if (!(locked = mtx_owned(MNT_MTX(vfsp))))
+ MNT_ILOCK(vfsp);
+
+ if (vfsp->mnt_opt == NULL) {
+ void *opts;
+
+ MNT_IUNLOCK(vfsp);
+ opts = malloc(sizeof (*vfsp->mnt_opt), M_MOUNT, M_WAITOK);
+ MNT_ILOCK(vfsp);
+ if (vfsp->mnt_opt == NULL) {
+ vfsp->mnt_opt = opts;
+ TAILQ_INIT(vfsp->mnt_opt);
+ } else {
+ free(opts, M_MOUNT);
+ }
+ }
+
+ MNT_IUNLOCK(vfsp);
+
+ opt = malloc(sizeof (*opt), M_MOUNT, M_WAITOK);
+ namesize = strlen(name) + 1;
+ opt->name = malloc(namesize, M_MOUNT, M_WAITOK);
+ strlcpy(opt->name, name, namesize);
+ opt->pos = -1;
+ opt->seen = 1;
+ if (arg == NULL) {
+ opt->value = NULL;
+ opt->len = 0;
+ } else {
+ opt->len = strlen(arg) + 1;
+ opt->value = malloc(opt->len, M_MOUNT, M_WAITOK);
+ bcopy(arg, opt->value, opt->len);
+ }
+
+ MNT_ILOCK(vfsp);
+ TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link);
+ if (!locked)
+ MNT_IUNLOCK(vfsp);
+}
+
+void
+vfs_clearmntopt(vfs_t *vfsp, const char *name)
+{
+ int locked;
+
+ if (!(locked = mtx_owned(MNT_MTX(vfsp))))
+ MNT_ILOCK(vfsp);
+ vfs_deleteopt(vfsp->mnt_opt, name);
+ if (!locked)
+ MNT_IUNLOCK(vfsp);
+}
+
+int
+vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
+{
+ struct vfsoptlist *opts = vfsp->mnt_optnew;
+ int error;
+
+ if (opts == NULL)
+ return (0);
+ error = vfs_getopt(opts, opt, (void **)argp, NULL);
+ return (error != 0 ? 0 : 1);
+}
+
+int
+mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
+ char *fspec, int fsflags)
+{
+ struct vfsconf *vfsp;
+ struct mount *mp;
+ vnode_t *vp, *mvp;
+ struct ucred *cr;
+ int error;
+
+ ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot");
+
+ vp = *vpp;
+ *vpp = NULL;
+ error = 0;
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
+ error = ENAMETOOLONG;
+ if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL)
+ error = ENODEV;
+ if (error == 0 && vp->v_type != VDIR)
+ error = ENOTDIR;
+ /*
+ * We need vnode lock to protect v_mountedhere and vnode interlock
+ * to protect v_iflag.
+ */
+ if (error == 0) {
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
+ vp->v_iflag |= VI_MOUNT;
+ else
+ error = EBUSY;
+ VI_UNLOCK(vp);
+ }
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ vn_seqc_write_begin(vp);
+ VOP_UNLOCK1(vp);
+
+ /*
+ * Allocate and initialize the filesystem.
+ * We don't want the regular user that triggered the snapshot mount to be
+ * able to unmount it, so pass the credentials of the parent mount.
+ */
+ mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred);
+
+ mp->mnt_optnew = NULL;
+ vfs_setmntopt(mp, "from", fspec, 0);
+ mp->mnt_optnew = mp->mnt_opt;
+ mp->mnt_opt = NULL;
+
+ /*
+ * Set the mount level flags.
+ */
+ mp->mnt_flag = fsflags & MNT_UPDATEMASK;
+ /*
+ * Snapshots are always read-only.
+ */
+ mp->mnt_flag |= MNT_RDONLY;
+ /*
+ * We don't want snapshots to allow access to vulnerable setuid
+ * programs, so we turn off setuid when mounting snapshots.
+ */
+ mp->mnt_flag |= MNT_NOSUID;
+ /*
+ * We don't want snapshots to be visible in regular
+ * mount(8) and df(1) output.
+ */
+ mp->mnt_flag |= MNT_IGNORE;
+ /*
+ * XXX: This is evil, but we can't mount a snapshot as a regular user.
+ * XXX: Is it safe when the snapshot is mounted from within a jail?
+ */
+ cr = td->td_ucred;
+ td->td_ucred = kcred;
+ error = VFS_MOUNT(mp);
+ td->td_ucred = cr;
+
+ if (error != 0) {
+ /*
+ * Clear VI_MOUNT and decrement the use count "atomically",
+ * under the vnode lock. This is not strictly required,
+ * but makes it easier to reason about the life-cycle and
+ * ownership of the covered vnode.
+ */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vn_seqc_write_end(vp);
+ vput(vp);
+ vfs_unbusy(mp);
+ vfs_freeopts(mp->mnt_optnew);
+ mp->mnt_vnodecovered = NULL;
+ vfs_mount_destroy(mp);
+ return (error);
+ }
+
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ (void) VFS_STATFS(mp, &mp->mnt_stat);
+
+ /*
+ * Prevent external consumers of mount options from reading
+ * mnt_optnew.
+ */
+ mp->mnt_optnew = NULL;
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef FREEBSD_NAMECACHE
+ cache_purge(vp);
+#endif
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+#ifdef VIRF_MOUNTPOINT
+ vn_irflag_set_locked(vp, VIRF_MOUNTPOINT);
+#endif
+ vp->v_mountedhere = mp;
+ VI_UNLOCK(vp);
+ /* Put the new filesystem on the mount list. */
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ vfs_event_signal(NULL, VQ_MOUNT, 0);
+ if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
+ panic("mount: lost mount");
+ vn_seqc_write_end(vp);
+ VOP_UNLOCK1(vp);
+#if __FreeBSD_version >= 1300048
+ vfs_op_exit(mp);
+#endif
+ vfs_unbusy(mp);
+ *vpp = mvp;
+ return (0);
+}
+
+/*
+ * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
+ * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
+ * the file system as a result of releasing the vnode. Note, file systems
+ * already have to handle the race where the vnode is incremented before the
+ * inactive routine is called and does its locking.
+ *
+ * Warning: Excessive use of this routine can lead to performance problems.
+ * This is because taskqs throttle back allocation if too many are created.
+ */
+void
+vn_rele_async(vnode_t *vp, taskq_t *taskq)
+{
+ VERIFY(vp->v_count > 0);
+ if (refcount_release_if_not_last(&vp->v_usecount)) {
+#if __FreeBSD_version < 1300045
+ vdrop(vp);
+#endif
+ return;
+ }
+ VERIFY(taskq_dispatch((taskq_t *)taskq,
+ (task_func_t *)vrele, vp, TQ_SLEEP) != 0);
+}
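+
+/*
+ * For example (illustrative, hypothetical caller): a thread that must not
+ * re-enter the filesystem while holding its own locks can defer the final
+ * release to a taskq, e.g. the system taskq created in spl_taskq.c:
+ *
+ *     vn_rele_async(vp, system_taskq);
+ */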
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
new file mode 100644
index 000000000000..739ddb05e895
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/counter.h>
+
+#include <sys/byteorder.h>
+#include <sys/lock.h>
+#include <sys/freebsd_rwlock.h>
+#include <sys/vm.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
+const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
+const int zfs_vm_pagerret_ok = VM_PAGER_OK;
+const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
+const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
+
+void
+zfs_vmobject_assert_wlocked(vm_object_t object)
+{
+
+ /*
+ * This is not ideal because the FILE/LINE reported by assertions will
+ * not be very helpful, but it must be a hard (non-inline) function
+ * for compatibility reasons.
+ */
+ VM_OBJECT_ASSERT_WLOCKED(object);
+}
+
+void
+zfs_vmobject_wlock(vm_object_t object)
+{
+
+ VM_OBJECT_WLOCK(object);
+}
+
+void
+zfs_vmobject_wunlock(vm_object_t object)
+{
+
+ VM_OBJECT_WUNLOCK(object);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c
new file mode 100644
index 000000000000..3644eba77ca1
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/zmod.h>
+#if __FreeBSD_version >= 1300041
+#include <contrib/zlib/zlib.h>
+#else
+#include <sys/zlib.h>
+#endif
+#include <sys/kobj.h>
+
+
+/*ARGSUSED*/
+static void *
+zcalloc(void *opaque, uint_t items, uint_t size)
+{
+
+ return (malloc((size_t)items*size, M_SOLARIS, M_NOWAIT));
+}
+
+/*ARGSUSED*/
+static void
+zcfree(void *opaque, void *ptr)
+{
+
+ free(ptr, M_SOLARIS);
+}
+
+static int
+zlib_deflateInit(z_stream *stream, int level)
+{
+
+ stream->zalloc = zcalloc;
+ stream->opaque = NULL;
+ stream->zfree = zcfree;
+
+ return (deflateInit(stream, level));
+}
+
+static int
+zlib_deflate(z_stream *stream, int flush)
+{
+ return (deflate(stream, flush));
+}
+
+static int
+zlib_deflateEnd(z_stream *stream)
+{
+ return (deflateEnd(stream));
+}
+
+static int
+zlib_inflateInit(z_stream *stream)
+{
+ stream->zalloc = zcalloc;
+ stream->opaque = NULL;
+ stream->zfree = zcfree;
+
+ return (inflateInit(stream));
+}
+
+static int
+zlib_inflate(z_stream *stream, int finish)
+{
+#if __FreeBSD_version >= 1300024
+ return (inflate(stream, finish));
+#else
+ return (_zlib104_inflate(stream, finish));
+#endif
+}
+
+
+static int
+zlib_inflateEnd(z_stream *stream)
+{
+ return (inflateEnd(stream));
+}
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call. Using a kmem_cache also has the advantage
+ * that it improves the odds that the memory used will be local to this cpu.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use. This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+ // return (kmem_cache_alloc(zlib_workspace_cache, flags));
+ return (NULL);
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+ // kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit. sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ * Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen, int level)
+{
+ z_stream stream;
+ int err;
+
+ bzero(&stream, sizeof (stream));
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+ stream.opaque = NULL;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.opaque = zlib_workspace_alloc(KM_SLEEP);
+#if 0
+ if (!stream.opaque)
+ return (Z_MEM_ERROR);
+#endif
+ err = zlib_deflateInit(&stream, level);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.opaque);
+ return (err);
+ }
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+ return (err);
+}
+
+/*
+ * Decompresses the source buffer into the destination buffer. sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the uncompressed data.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen)
+{
+ z_stream stream;
+ int err;
+
+ bzero(&stream, sizeof (stream));
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.opaque = zlib_workspace_alloc(KM_SLEEP);
+#if 0
+ if (!stream.opaque)
+ return (Z_MEM_ERROR);
+#endif
+ err = zlib_inflateInit(&stream);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.opaque);
+ return (err);
+ }
+
+ err = zlib_inflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+
+ if (err == Z_NEED_DICT ||
+ (err == Z_BUF_ERROR && stream.avail_in == 0))
+ return (Z_DATA_ERROR);
+
+ return (err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+
+ return (err);
+}
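+
+/*
+ * Round-trip sketch (illustrative only; src and srclen are hypothetical):
+ *
+ *     size_t cbuflen = srclen + srclen / 1000 + 12;   sizing rule above
+ *     size_t clen = cbuflen, dlen = srclen;
+ *     void *cbuf = kmem_alloc(cbuflen, KM_SLEEP);
+ *     void *dbuf = kmem_alloc(srclen, KM_SLEEP);
+ *
+ *     VERIFY3S(z_compress_level(cbuf, &clen, src, srclen, Z_BEST_SPEED),
+ *         ==, Z_OK);
+ *     VERIFY3S(z_uncompress(dbuf, &dlen, cbuf, clen), ==, Z_OK);
+ *     VERIFY3U(dlen, ==, srclen);
+ *
+ *     kmem_free(dbuf, srclen);
+ *     kmem_free(cbuf, cbuflen);
+ */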
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c
new file mode 100644
index 000000000000..bd3f019b2fa6
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/jail.h>
+#include <sys/osd.h>
+#include <sys/priv.h>
+#include <sys/zone.h>
+
+#include <sys/policy.h>
+
+static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data");
+
+/*
+ * Structure to record list of ZFS datasets exported to a zone.
+ */
+typedef struct zone_dataset {
+ LIST_ENTRY(zone_dataset) zd_next;
+ char zd_dataset[0];
+} zone_dataset_t;
+
+LIST_HEAD(zone_dataset_head, zone_dataset);
+
+static int zone_slot;
+
+int
+zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd, *zd2;
+ struct prison *pr;
+ int dofree, error;
+
+ if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
+ return (error);
+
+ /* Allocate memory before we grab prison's mutex. */
+ zd = malloc(sizeof (*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK);
+
+ sx_slock(&allprison_lock);
+ pr = prison_find(jailid); /* Locks &pr->pr_mtx. */
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL) {
+ free(zd, M_ZONES);
+ return (ENOENT);
+ }
+
+ head = osd_jail_get(pr, zone_slot);
+ if (head != NULL) {
+ dofree = 0;
+ LIST_FOREACH(zd2, head, zd_next) {
+ if (strcmp(dataset, zd2->zd_dataset) == 0) {
+ free(zd, M_ZONES);
+ error = EEXIST;
+ goto end;
+ }
+ }
+ } else {
+ dofree = 1;
+ prison_hold_locked(pr);
+ mtx_unlock(&pr->pr_mtx);
+ head = malloc(sizeof (*head), M_ZONES, M_WAITOK);
+ LIST_INIT(head);
+ mtx_lock(&pr->pr_mtx);
+ error = osd_jail_set(pr, zone_slot, head);
+ KASSERT(error == 0, ("osd_jail_set() failed (error=%d)",
+ error));
+ }
+ strcpy(zd->zd_dataset, dataset);
+ LIST_INSERT_HEAD(head, zd, zd_next);
+end:
+ if (dofree)
+ prison_free_locked(pr);
+ else
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+
+int
+zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd;
+ struct prison *pr;
+ int error;
+
+ if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
+ return (error);
+
+ sx_slock(&allprison_lock);
+ pr = prison_find(jailid);
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL)
+ return (ENOENT);
+ head = osd_jail_get(pr, zone_slot);
+ if (head == NULL) {
+ error = ENOENT;
+ goto end;
+ }
+ LIST_FOREACH(zd, head, zd_next) {
+ if (strcmp(dataset, zd->zd_dataset) == 0)
+ break;
+ }
+ if (zd == NULL)
+ error = ENOENT;
+ else {
+ LIST_REMOVE(zd, zd_next);
+ free(zd, M_ZONES);
+ if (LIST_EMPTY(head))
+ osd_jail_del(pr, zone_slot);
+ error = 0;
+ }
+end:
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+
+/*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd;
+ struct prison *pr;
+ size_t len;
+ int ret = 0;
+
+ if (dataset[0] == '\0')
+ return (0);
+ if (INGLOBALZONE(curproc)) {
+ if (write != NULL)
+ *write = 1;
+ return (1);
+ }
+ pr = curthread->td_ucred->cr_prison;
+ mtx_lock(&pr->pr_mtx);
+ head = osd_jail_get(pr, zone_slot);
+ if (head == NULL)
+ goto end;
+
+ /*
+ * Walk the list once, looking for datasets which match exactly, or
+ * specify a dataset underneath an exported dataset. If found, return
+ * true and note that it is writable.
+ */
+ LIST_FOREACH(zd, head, zd_next) {
+ len = strlen(zd->zd_dataset);
+ if (strlen(dataset) >= len &&
+ bcmp(dataset, zd->zd_dataset, len) == 0 &&
+ (dataset[len] == '\0' || dataset[len] == '/' ||
+ dataset[len] == '@')) {
+ if (write)
+ *write = 1;
+ ret = 1;
+ goto end;
+ }
+ }
+
+ /*
+ * Walk the list a second time, searching for datasets which are parents
+ * of exported datasets. These should be visible, but read-only.
+ *
+ * Note that we also have to support forms such as 'pool/dataset/', with
+ * a trailing slash.
+ */
+ LIST_FOREACH(zd, head, zd_next) {
+ len = strlen(dataset);
+ if (dataset[len - 1] == '/')
+ len--; /* Ignore trailing slash */
+ if (len < strlen(zd->zd_dataset) &&
+ bcmp(dataset, zd->zd_dataset, len) == 0 &&
+ zd->zd_dataset[len] == '/') {
+ if (write)
+ *write = 0;
+ ret = 1;
+ goto end;
+ }
+ }
+end:
+ mtx_unlock(&pr->pr_mtx);
+ return (ret);
+}
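+
+/*
+ * Worked example (illustrative): if the dataset "tank/jailed" has been
+ * attached to the current jail, then from within that jail
+ *
+ *     int writable;
+ *
+ *     zone_dataset_visible("tank/jailed/data", &writable)
+ *
+ * returns 1 with writable set to 1 (it lies underneath the exported
+ * dataset), while
+ *
+ *     zone_dataset_visible("tank", &writable)
+ *
+ * returns 1 with writable set to 0 (a parent is visible but read-only),
+ * and an unrelated dataset such as "dozer" returns 0.
+ */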
+
+static void
+zone_destroy(void *arg)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd;
+
+ head = arg;
+ while ((zd = LIST_FIRST(head)) != NULL) {
+ LIST_REMOVE(zd, zd_next);
+ free(zd, M_ZONES);
+ }
+ free(head, M_ZONES);
+}
+
+uint32_t
+zone_get_hostid(void *ptr)
+{
+
+ KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__));
+
+ return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid);
+}
+
+static void
+zone_sysinit(void *arg __unused)
+{
+
+ zone_slot = osd_jail_register(zone_destroy, NULL);
+}
+
+static void
+zone_sysuninit(void *arg __unused)
+{
+
+ osd_jail_deregister(zone_slot);
+}
+
+SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL);
+SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
new file mode 100644
index 000000000000..ff4d80ef1dfd
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
@@ -0,0 +1,487 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * See abd.c for a general overview of the ARC buffered data (ABD).
+ *
+ * Using a large proportion of scattered ABDs decreases ARC fragmentation:
+ * when we are at the limit of allocatable space, using equal-size chunks
+ * allows us to quickly reclaim enough space for a new large allocation
+ * (assuming it is also scattered).
+ *
+ * ABDs are allocated scattered by default unless the caller uses
+ * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled.
+ */
+
+#include <sys/abd_impl.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+typedef struct abd_stats {
+ kstat_named_t abdstat_struct_size;
+ kstat_named_t abdstat_scatter_cnt;
+ kstat_named_t abdstat_scatter_data_size;
+ kstat_named_t abdstat_scatter_chunk_waste;
+ kstat_named_t abdstat_linear_cnt;
+ kstat_named_t abdstat_linear_data_size;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+ /* Amount of memory occupied by all of the abd_t struct allocations */
+ { "struct_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset()).
+ */
+ { "scatter_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+ { "scatter_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The amount of space wasted at the end of the last chunk across all
+ * scatter ABDs tracked by scatter_cnt.
+ */
+ { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
+ /*
+ * The number of linear ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset() and abd_get_from_buf()). If an
+ * ABD takes ownership of its buf then it will become tracked.
+ */
+ { "linear_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all linear ABDs tracked by linear_cnt */
+ { "linear_data_size", KSTAT_DATA_UINT64 },
+};
+
+/*
+ * The size of the chunks ABD allocates. Because the sizes allocated from the
+ * kmem_cache can't change, this tunable can only be modified at boot. Changing
+ * it at runtime would cause ABD iteration to work incorrectly for ABDs which
+ * were allocated with the old size, so a safeguard has been put in place which
+ * will cause the machine to panic if you change it and try to access the data
+ * within a scattered ABD.
+ */
+size_t zfs_abd_chunk_size = 4096;
+
+#if defined(_KERNEL)
+SYSCTL_DECL(_vfs_zfs);
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
+ &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
+SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
+ &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
+#endif
+
+kmem_cache_t *abd_chunk_cache;
+static kstat_t *abd_ksp;
+
+/*
+ * We use a scattered SPA_MAXBLOCKSIZE-sized ABD whose chunks all point to
+ * a single zeroed buffer of zfs_abd_chunk_size bytes. This
+ * allows us to conserve memory by only using a single zero buffer
+ * for the scatter chunks.
+ */
+abd_t *abd_zero_scatter = NULL;
+static char *abd_zero_buf = NULL;
+
+static void
+abd_free_chunk(void *c)
+{
+ kmem_cache_free(abd_chunk_cache, c);
+}
+
+static uint_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+ return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
+}
+
+static inline uint_t
+abd_scatter_chunkcnt(abd_t *abd)
+{
+ ASSERT(!abd_is_linear(abd));
+ return (abd_chunkcnt_for_bytes(
+ ABD_SCATTER(abd).abd_offset + abd->abd_size));
+}
+
+boolean_t
+abd_size_alloc_linear(size_t size)
+{
+ return (size <= zfs_abd_chunk_size ? B_TRUE : B_FALSE);
+}
+
+void
+abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
+{
+ uint_t n = abd_scatter_chunkcnt(abd);
+ ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+ int waste = n * zfs_abd_chunk_size - abd->abd_size;
+ if (op == ABDSTAT_INCR) {
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
+ arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+ } else {
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
+ arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+ }
+}
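+
+/*
+ * Worked example (illustrative): with the default zfs_abd_chunk_size of
+ * 4096, a 10240-byte scatter ABD uses abd_chunkcnt_for_bytes(10240) = 3
+ * chunks, so the waste accounted above is 3 * 4096 - 10240 = 2048 bytes
+ * at the end of the last chunk.
+ */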
+
+void
+abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
+{
+ ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+ if (op == ABDSTAT_INCR) {
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+ } else {
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+ }
+}
+
+void
+abd_verify_scatter(abd_t *abd)
+{
+ uint_t i, n;
+
+ /*
+ * There are no scatter linear pages in FreeBSD, so it is an
+ * error if the ABD has been marked as a linear page.
+ */
+ ASSERT(!abd_is_linear_page(abd));
+ ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
+ zfs_abd_chunk_size);
+ n = abd_scatter_chunkcnt(abd);
+ for (i = 0; i < n; i++) {
+ ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
+ }
+}
+
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+ uint_t i, n;
+
+ n = abd_chunkcnt_for_bytes(size);
+ for (i = 0; i < n; i++) {
+ void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
+ ASSERT3P(c, !=, NULL);
+ ABD_SCATTER(abd).abd_chunks[i] = c;
+ }
+ ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
+}
+
+void
+abd_free_chunks(abd_t *abd)
+{
+ uint_t i, n;
+
+ n = abd_scatter_chunkcnt(abd);
+ for (i = 0; i < n; i++) {
+ abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]);
+ }
+}
+
+abd_t *
+abd_alloc_struct_impl(size_t size)
+{
+ uint_t chunkcnt = abd_chunkcnt_for_bytes(size);
+ /*
+ * In the event we are allocating a gang ABD, the size passed in
+ * will be 0. We must make sure to set abd_size to the size of an
+ * ABD struct as opposed to an ABD scatter with 0 chunks. The gang
+ * ABD struct allocation accounts for an additional 24 bytes over
+ * a scatter ABD with 0 chunks.
+ */
+ size_t abd_size = MAX(sizeof (abd_t),
+ offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
+ abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
+ ASSERT3P(abd, !=, NULL);
+ ABDSTAT_INCR(abdstat_struct_size, abd_size);
+
+ return (abd);
+}
+
+void
+abd_free_struct_impl(abd_t *abd)
+{
+ uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 :
+ abd_scatter_chunkcnt(abd);
+ ssize_t size = MAX(sizeof (abd_t),
+ offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
+ kmem_free(abd, size);
+ ABDSTAT_INCR(abdstat_struct_size, -size);
+}
+
+/*
+ * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where
+ * each chunk in the scatterlist will be set to abd_zero_buf.
+ */
+static void
+abd_alloc_zero_scatter(void)
+{
+ uint_t i, n;
+
+ n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+ abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP);
+ abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+
+ abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
+ abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+
+ ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+ ABD_SCATTER(abd_zero_scatter).abd_chunk_size =
+ zfs_abd_chunk_size;
+
+ for (i = 0; i < n; i++) {
+ ABD_SCATTER(abd_zero_scatter).abd_chunks[i] =
+ abd_zero_buf;
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, zfs_abd_chunk_size);
+}
+
+static void
+abd_free_zero_scatter(void)
+{
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size);
+
+ abd_free_struct(abd_zero_scatter);
+ abd_zero_scatter = NULL;
+ kmem_free(abd_zero_buf, zfs_abd_chunk_size);
+}
+
+void
+abd_init(void)
+{
+ abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+ NULL, NULL, NULL, NULL, 0, KMC_NODEBUG);
+
+ abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (abd_ksp != NULL) {
+ abd_ksp->ks_data = &abd_stats;
+ kstat_install(abd_ksp);
+ }
+
+ abd_alloc_zero_scatter();
+}
+
+void
+abd_fini(void)
+{
+ abd_free_zero_scatter();
+
+ if (abd_ksp != NULL) {
+ kstat_delete(abd_ksp);
+ abd_ksp = NULL;
+ }
+
+ kmem_cache_destroy(abd_chunk_cache);
+ abd_chunk_cache = NULL;
+}
+
+void
+abd_free_linear_page(abd_t *abd)
+{
+ /*
+ * FreeBSD does not have scatter linear pages,
+ * so reaching this is an error.
+ */
+ VERIFY(0);
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+ return (abd_alloc_linear(size, is_metadata));
+}
+
+abd_t *
+abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
+{
+ abd_verify(sabd);
+ ASSERT3U(off, <=, sabd->abd_size);
+
+ size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
+ uint_t chunkcnt = abd_scatter_chunkcnt(sabd) -
+ (new_offset / zfs_abd_chunk_size);
+
+ /*
+ * If an abd struct is provided, it is only the minimum size. If we
+ * need additional chunks, we need to allocate a new struct.
+ */
+ if (abd != NULL &&
+ offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) >
+ sizeof (abd_t)) {
+ abd = NULL;
+ }
+
+ if (abd == NULL)
+ abd = abd_alloc_struct(chunkcnt * zfs_abd_chunk_size);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+
+ ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size;
+ ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
+
+ /* Copy the scatterlist starting at the correct offset */
+ (void) memcpy(&ABD_SCATTER(abd).abd_chunks,
+ &ABD_SCATTER(sabd).abd_chunks[new_offset /
+ zfs_abd_chunk_size],
+ chunkcnt * sizeof (void *));
+
+ return (abd);
+}
+
+static inline size_t
+abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
+{
+ ASSERT(!abd_is_linear(aiter->iter_abd));
+ return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
+ aiter->iter_pos) % zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_iter_scatter_chunk_index(struct abd_iter *aiter)
+{
+ ASSERT(!abd_is_linear(aiter->iter_abd));
+ return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
+ aiter->iter_pos) / zfs_abd_chunk_size);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+ ASSERT(!abd_is_gang(abd));
+ abd_verify(abd);
+ aiter->iter_abd = abd;
+ aiter->iter_pos = 0;
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
+
+/*
+ * This is just a helper function to see if we have exhausted the
+ * abd_iter and reached the end.
+ */
+boolean_t
+abd_iter_at_end(struct abd_iter *aiter)
+{
+ return (aiter->iter_pos == aiter->iter_abd->abd_size);
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already been
+ * exhausted, in which case this does nothing.
+ */
+void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to advance to, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ aiter->iter_pos += amount;
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+void
+abd_iter_map(struct abd_iter *aiter)
+{
+ void *paddr;
+ size_t offset = 0;
+
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* Panic if someone has changed zfs_abd_chunk_size */
+ IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
+ ABD_SCATTER(aiter->iter_abd).abd_chunk_size);
+
+ /* There's nothing left to iterate over, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ if (abd_is_linear(aiter->iter_abd)) {
+ offset = aiter->iter_pos;
+ aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+ paddr = ABD_LINEAR_BUF(aiter->iter_abd);
+ } else {
+ size_t index = abd_iter_scatter_chunk_index(aiter);
+ offset = abd_iter_scatter_chunk_offset(aiter);
+ aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
+ aiter->iter_abd->abd_size - aiter->iter_pos);
+ paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index];
+ }
+ aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+ /* There's nothing left to unmap, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+ ASSERT3U(aiter->iter_mapsize, >, 0);
+
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
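+
+/*
+ * Iteration sketch for a (non-gang) ABD using the helpers above
+ * (illustrative only; process() is hypothetical):
+ *
+ *     struct abd_iter aiter;
+ *
+ *     abd_iter_init(&aiter, abd);
+ *     while (!abd_iter_at_end(&aiter)) {
+ *         size_t len;
+ *
+ *         abd_iter_map(&aiter);
+ *         len = aiter.iter_mapsize;
+ *         process(aiter.iter_mapaddr, len);
+ *         abd_iter_unmap(&aiter);
+ *         abd_iter_advance(&aiter, len);
+ *     }
+ */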
+
+void
+abd_cache_reap_now(void)
+{
+ kmem_cache_reap_soon(abd_chunk_cache);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
new file mode 100644
index 000000000000..4fc7468bfa47
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
@@ -0,0 +1,255 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/counter.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/zfs_refcount.h>
+#include <sys/vdev.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/multilist.h>
+#include <sys/abd.h>
+#include <sys/zil.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/eventhandler.h>
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/zthr.h>
+#include <zfs_fletcher.h>
+#include <sys/arc_impl.h>
+#include <sys/sdt.h>
+#include <sys/aggsum.h>
+#include <sys/vnode.h>
+#include <cityhash.h>
+#include <machine/vmparam.h>
+#include <sys/vm.h>
+#include <sys/vmmeter.h>
+
+extern struct vfsops zfs_vfsops;
+
+uint_t zfs_arc_free_target = 0;
+
+static void
+arc_free_target_init(void *unused __unused)
+{
+ zfs_arc_free_target = vm_cnt.v_free_target;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+ arc_free_target_init, NULL);
+
+/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+ uint_t val;
+ int err;
+
+ val = zfs_arc_free_target;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < minfree)
+ return (EINVAL);
+ if (val > vm_cnt.v_page_count)
+ return (EINVAL);
+
+ zfs_arc_free_target = val;
+
+ return (0);
+}
+SYSCTL_DECL(_vfs_zfs);
+/* BEGIN CSTYLED */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint_t),
+ sysctl_vfs_zfs_arc_free_target, "IU",
+ "Desired number of free pages below which ARC triggers reclaim");
+/* END CSTYLED */
+
+int64_t
+arc_available_memory(void)
+{
+ int64_t lowest = INT64_MAX;
+ int64_t n __unused;
+
+ /*
+ * Cooperate with pagedaemon when it's time for it to scan
+ * and reclaim some pages.
+ */
+ n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
+ if (n < lowest) {
+ lowest = n;
+ }
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+ /*
+ * If we're on an i386 platform, it's possible that we'll exhaust the
+ * kernel heap space before we ever run out of available physical
+ * memory. Most checks of the size of the heap_area compare against
+ * tune.t_minarmem, which is the minimum available real memory that we
+ * can have in the system. However, this is generally fixed at 25 pages
+ * which is so low that it's useless. In this comparison, we seek to
+ * calculate the total heap-size, and reclaim if more than 3/4ths of the
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
+ * free)
+ */
+ n = uma_avail() - (long)(uma_limit() / 4);
+ if (n < lowest) {
+ lowest = n;
+ }
+#endif
+
+ DTRACE_PROBE1(arc__available_memory, int64_t, lowest);
+ return (lowest);
+}
+
+/*
+ * Return a default max arc size based on the amount of physical memory.
+ */
+uint64_t
+arc_default_max(uint64_t min, uint64_t allmem)
+{
+ uint64_t size;
+
+ if (allmem >= 1 << 30)
+ size = allmem - (1 << 30);
+ else
+ size = min;
+ return (MAX(allmem * 5 / 8, size));
+}
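+
+/*
+ * Worked example (illustrative): with allmem = 16 GiB, size is
+ * 16 GiB - 1 GiB = 15 GiB and allmem * 5 / 8 is 10 GiB, so the default
+ * maximum ARC size is 15 GiB (all memory minus 1 GiB); with less than
+ * 1 GiB of memory the result falls back to MAX(allmem * 5 / 8, min).
+ */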
+
+/*
+ * Helper function for arc_prune_async(); it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *arg)
+{
+ int64_t nr_scan = *(int64_t *)arg;
+
+ arc_reduce_target_size(ptob(nr_scan));
+ free(arg, M_TEMP);
+ vnlru_free(nr_scan, &zfs_vfsops);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffers they reference. This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+void
+arc_prune_async(int64_t adjust)
+{
+
+ int64_t *adjustptr;
+
+ if ((adjustptr = malloc(sizeof (int64_t), M_TEMP, M_NOWAIT)) == NULL)
+ return;
+
+ *adjustptr = adjust;
+ taskq_dispatch(arc_prune_taskq, arc_prune_task, adjustptr, TQ_SLEEP);
+ ARCSTAT_BUMP(arcstat_prune);
+}
+
+uint64_t
+arc_all_memory(void)
+{
+ return (ptob(physmem));
+}
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+ return (0);
+}
+
+uint64_t
+arc_free_memory(void)
+{
+ return (ptob(freemem));
+}
+
+static eventhandler_tag arc_event_lowmem = NULL;
+
+static void
+arc_lowmem(void *arg __unused, int howto __unused)
+{
+ int64_t free_memory, to_free;
+
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ free_memory = arc_available_memory();
+ to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
+ DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
+ arc_reduce_target_size(to_free);
+
+ /*
+ * It is unsafe to block here in arbitrary threads, because we can come
+ * here from ARC itself and may hold ARC locks and thus risk a deadlock
+ * with ARC reclaim thread.
+ */
+ if (curproc == pageproc)
+ arc_wait_for_eviction(to_free);
+ else
+ arc_wait_for_eviction(0);
+}
+
+void
+arc_lowmem_init(void)
+{
+ arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
+ EVENTHANDLER_PRI_FIRST);
+
+}
+
+void
+arc_lowmem_fini(void)
+{
+ if (arc_event_lowmem != NULL)
+ EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+}
+
+void
+arc_register_hotplug(void)
+{
+}
+
+void
+arc_unregister_hotplug(void)
+{
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
new file mode 100644
index 000000000000..fbf998416234
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (c) 2005-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2018 Sean Eric Fagan <sef@ixsystems.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Portions of this file are derived from sys/geom/eli/g_eli_hmac.c
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#ifdef _KERNEL
+#include <sys/libkern.h>
+#include <sys/malloc.h>
+#include <sys/sysctl.h>
+#include <opencrypto/cryptodev.h>
+#include <opencrypto/xform.h>
+#else
+#include <strings.h>
+#endif
+
+#include <sys/zio_crypt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+#include <sys/freebsd_crypto.h>
+
+#define SHA512_HMAC_BLOCK_SIZE 128
+
+static int crypt_sessions = 0;
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, crypt_sessions, CTLFLAG_RD,
+ &crypt_sessions, 0, "Number of cryptographic sessions created");
+
+void
+crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *c_key)
+{
+ uint8_t k_ipad[SHA512_HMAC_BLOCK_SIZE],
+ k_opad[SHA512_HMAC_BLOCK_SIZE],
+ key[SHA512_HMAC_BLOCK_SIZE];
+ SHA512_CTX lctx;
+ int i;
+ size_t cl_bytes = CRYPTO_BITS2BYTES(c_key->ck_length);
+
+ /*
+ * This code is based on the similar code in geom/eli/g_eli_hmac.c
+ */
+ explicit_bzero(key, sizeof (key));
+ if (c_key->ck_length == 0)
+ /* do nothing */;
+ else if (cl_bytes <= SHA512_HMAC_BLOCK_SIZE)
+ bcopy(c_key->ck_data, key, cl_bytes);
+ else {
+ /*
+ * If the key is longer than 128 bytes, reset it to
+ * key = SHA512(key).
+ */
+ SHA512_Init(&lctx);
+ SHA512_Update(&lctx, c_key->ck_data, cl_bytes);
+ SHA512_Final(key, &lctx);
+ }
+
+ /* XOR key with ipad and opad values. */
+ for (i = 0; i < sizeof (key); i++) {
+ k_ipad[i] = key[i] ^ 0x36;
+ k_opad[i] = key[i] ^ 0x5c;
+ }
+ explicit_bzero(key, sizeof (key));
+
+ /* Start inner SHA512. */
+ SHA512_Init(&ctx->innerctx);
+ SHA512_Update(&ctx->innerctx, k_ipad, sizeof (k_ipad));
+ explicit_bzero(k_ipad, sizeof (k_ipad));
+ /* Start outer SHA512. */
+ SHA512_Init(&ctx->outerctx);
+ SHA512_Update(&ctx->outerctx, k_opad, sizeof (k_opad));
+ explicit_bzero(k_opad, sizeof (k_opad));
+}
+
+void
+crypto_mac_update(struct hmac_ctx *ctx, const void *data, size_t datasize)
+{
+ SHA512_Update(&ctx->innerctx, data, datasize);
+}
+
+void
+crypto_mac_final(struct hmac_ctx *ctx, void *md, size_t mdsize)
+{
+ uint8_t digest[SHA512_DIGEST_LENGTH];
+
+ /* Complete inner hash */
+ SHA512_Final(digest, &ctx->innerctx);
+
+ /* Complete outer hash */
+ SHA512_Update(&ctx->outerctx, digest, sizeof (digest));
+ SHA512_Final(digest, &ctx->outerctx);
+
+ explicit_bzero(ctx, sizeof (*ctx));
+ /* mdsize == 0 means "Give me the whole hash!" */
+ if (mdsize == 0)
+ mdsize = SHA512_DIGEST_LENGTH;
+ bcopy(digest, md, mdsize);
+ explicit_bzero(digest, sizeof (digest));
+}
+
+void
+crypto_mac(const crypto_key_t *key, const void *in_data, size_t in_data_size,
+ void *out_data, size_t out_data_size)
+{
+ struct hmac_ctx ctx;
+
+ crypto_mac_init(&ctx, key);
+ crypto_mac_update(&ctx, in_data, in_data_size);
+ crypto_mac_final(&ctx, out_data, out_data_size);
+}
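
For illustration, a caller-side sketch of a one-shot HMAC-SHA512 built on the
helpers above; the wrapper name and buffer parameters are hypothetical, but the
same pattern is used by hkdf_sha512_extract() in hkdf.c later in this change:

	static void
	example_hmac_sha512(const void *msg, size_t msglen,
	    uint8_t *keybuf, size_t keylen,
	    uint8_t mac[SHA512_DIGEST_LENGTH])
	{
		crypto_key_t key;

		/* Wrap the raw key bytes; ck_length is expressed in bits. */
		key.ck_format = CRYPTO_KEY_RAW;
		key.ck_data = keybuf;
		key.ck_length = CRYPTO_BYTES2BITS(keylen);

		/* Passing the full digest size copies out the whole MAC. */
		crypto_mac(&key, msg, msglen, mac, SHA512_DIGEST_LENGTH);
	}
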
+
+static int
+freebsd_zfs_crypt_done(struct cryptop *crp)
+{
+ freebsd_crypt_session_t *ses;
+
+ ses = crp->crp_opaque;
+ mtx_lock(&ses->fs_lock);
+ ses->fs_done = true;
+ mtx_unlock(&ses->fs_lock);
+ wakeup(crp);
+ return (0);
+}
+
+void
+freebsd_crypt_freesession(freebsd_crypt_session_t *sess)
+{
+ mtx_destroy(&sess->fs_lock);
+ crypto_freesession(sess->fs_sid);
+ explicit_bzero(sess, sizeof (*sess));
+}
+
+static int
+zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp)
+{
+ int error;
+
+ crp->crp_opaque = session;
+ crp->crp_callback = freebsd_zfs_crypt_done;
+ for (;;) {
+ error = crypto_dispatch(crp);
+ if (error)
+ break;
+ mtx_lock(&session->fs_lock);
+ while (session->fs_done == false)
+ msleep(crp, &session->fs_lock, PRIBIO,
+ "zfs_crypto", hz/5);
+ mtx_unlock(&session->fs_lock);
+
+ if (crp->crp_etype != EAGAIN) {
+ error = crp->crp_etype;
+ break;
+ }
+ crp->crp_etype = 0;
+ crp->crp_flags &= ~CRYPTO_F_DONE;
+ session->fs_done = false;
+#if __FreeBSD_version < 1300087
+ /*
+ * Session ID changed, so we should record that,
+ * and try again
+ */
+ session->fs_sid = crp->crp_session;
+#endif
+ }
+ return (error);
+}
+static void
+freebsd_crypt_uio_debug_log(boolean_t encrypt,
+ freebsd_crypt_session_t *input_sessionp,
+ struct zio_crypt_info *c_info,
+ zfs_uio_t *data_uio,
+ crypto_key_t *key,
+ uint8_t *ivbuf,
+ size_t datalen,
+ size_t auth_len)
+{
+#ifdef FCRYPTO_DEBUG
+ struct cryptodesc *crd;
+ uint8_t *p = NULL;
+ size_t total = 0;
+
+ printf("%s(%s, %p, { %s, %d, %d, %s }, %p, { %d, %p, %u }, "
+ "%p, %u, %u)\n",
+ __FUNCTION__, encrypt ? "encrypt" : "decrypt", input_sessionp,
+ c_info->ci_algname, c_info->ci_crypt_type,
+ (unsigned int)c_info->ci_keylen, c_info->ci_name,
+ data_uio, key->ck_format, key->ck_data,
+ (unsigned int)key->ck_length,
+ ivbuf, (unsigned int)datalen, (unsigned int)auth_len);
+ printf("\tkey = { ");
+ for (int i = 0; i < key->ck_length / 8; i++) {
+ uint8_t *b = (uint8_t *)key->ck_data;
+ printf("%02x ", b[i]);
+ }
+ printf("}\n");
+ for (int i = 0; i < zfs_uio_iovcnt(data_uio); i++) {
+ printf("\tiovec #%d: <%p, %u>\n", i,
+ zfs_uio_iovbase(data_uio, i),
+ (unsigned int)zfs_uio_iovlen(data_uio, i));
+ total += zfs_uio_iovlen(data_uio, i);
+ }
+ zfs_uio_resid(data_uio) = total;
+#endif
+}
+/*
+ * Create a new cryptographic session. This should
+ * happen every time the key changes (including when
+ * it's first loaded).
+ */
+#if __FreeBSD_version >= 1300087
+int
+freebsd_crypt_newsession(freebsd_crypt_session_t *sessp,
+ struct zio_crypt_info *c_info, crypto_key_t *key)
+{
+ struct crypto_session_params csp;
+ int error = 0;
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n",
+ __FUNCTION__, sessp,
+ c_info->ci_algname, c_info->ci_crypt_type,
+ (unsigned int)c_info->ci_keylen, c_info->ci_name,
+ key->ck_format, key->ck_data, (unsigned int)key->ck_length);
+ printf("\tkey = { ");
+ for (int i = 0; i < key->ck_length / 8; i++) {
+ uint8_t *b = (uint8_t *)key->ck_data;
+ printf("%02x ", b[i]);
+ }
+ printf("}\n");
+#endif
+ bzero(&csp, sizeof (csp));
+ csp.csp_mode = CSP_MODE_AEAD;
+ csp.csp_cipher_key = key->ck_data;
+ csp.csp_cipher_klen = key->ck_length / 8;
+ switch (c_info->ci_crypt_type) {
+ case ZC_TYPE_GCM:
+ csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16;
+ csp.csp_ivlen = AES_GCM_IV_LEN;
+ switch (key->ck_length/8) {
+ case AES_128_GMAC_KEY_LEN:
+ case AES_192_GMAC_KEY_LEN:
+ case AES_256_GMAC_KEY_LEN:
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ }
+ break;
+ case ZC_TYPE_CCM:
+ csp.csp_cipher_alg = CRYPTO_AES_CCM_16;
+ csp.csp_ivlen = AES_CCM_IV_LEN;
+ switch (key->ck_length/8) {
+ case AES_128_CBC_MAC_KEY_LEN:
+ case AES_192_CBC_MAC_KEY_LEN:
+ case AES_256_CBC_MAC_KEY_LEN:
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ break;
+ }
+ break;
+ default:
+ error = ENOTSUP;
+ goto bad;
+ }
+ error = crypto_newsession(&sessp->fs_sid, &csp,
+ CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
+ mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock",
+ NULL, MTX_DEF);
+ crypt_sessions++;
+bad:
+#ifdef FCRYPTO_DEBUG
+ if (error)
+ printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+ return (error);
+}
+
+int
+freebsd_crypt_uio(boolean_t encrypt,
+ freebsd_crypt_session_t *input_sessionp,
+ struct zio_crypt_info *c_info,
+ zfs_uio_t *data_uio,
+ crypto_key_t *key,
+ uint8_t *ivbuf,
+ size_t datalen,
+ size_t auth_len)
+{
+ struct cryptop *crp;
+ freebsd_crypt_session_t *session = NULL;
+ int error = 0;
+ size_t total = 0;
+
+ freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio,
+ key, ivbuf, datalen, auth_len);
+ for (int i = 0; i < zfs_uio_iovcnt(data_uio); i++)
+ total += zfs_uio_iovlen(data_uio, i);
+ zfs_uio_resid(data_uio) = total;
+ if (input_sessionp == NULL) {
+ session = kmem_zalloc(sizeof (*session), KM_SLEEP);
+ error = freebsd_crypt_newsession(session, c_info, key);
+ if (error)
+ goto out;
+ } else
+ session = input_sessionp;
+
+ crp = crypto_getreq(session->fs_sid, M_WAITOK);
+ if (encrypt) {
+ crp->crp_op = CRYPTO_OP_ENCRYPT |
+ CRYPTO_OP_COMPUTE_DIGEST;
+ } else {
+ crp->crp_op = CRYPTO_OP_DECRYPT |
+ CRYPTO_OP_VERIFY_DIGEST;
+ }
+ crp->crp_flags = CRYPTO_F_CBIFSYNC | CRYPTO_F_IV_SEPARATE;
+ crypto_use_uio(crp, GET_UIO_STRUCT(data_uio));
+
+ crp->crp_aad_start = 0;
+ crp->crp_aad_length = auth_len;
+ crp->crp_payload_start = auth_len;
+ crp->crp_payload_length = datalen;
+ crp->crp_digest_start = auth_len + datalen;
+
+ bcopy(ivbuf, crp->crp_iv, ZIO_DATA_IV_LEN);
+ error = zfs_crypto_dispatch(session, crp);
+ crypto_freereq(crp);
+out:
+#ifdef FCRYPTO_DEBUG
+ if (error)
+ printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+ if (input_sessionp == NULL) {
+ freebsd_crypt_freesession(session);
+ kmem_free(session, sizeof (*session));
+ }
+ return (error);
+}
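
A concrete reading of the request layout set up above: the supplied uio is
expected to hold the AAD, the payload and the authentication tag back to back
(possibly split across several iovecs), since crp_aad_start is 0,
crp_payload_start is auth_len and crp_digest_start is auth_len + datalen.
For example, with auth_len = 96 and datalen = 4096 the tag is verified
(decrypt) or written (encrypt) starting at byte offset 4192 of the buffer
described by data_uio.
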
+
+#else
+int
+freebsd_crypt_newsession(freebsd_crypt_session_t *sessp,
+ struct zio_crypt_info *c_info, crypto_key_t *key)
+{
+ struct cryptoini cria, crie, *crip;
+ struct enc_xform *xform;
+ struct auth_hash *xauth;
+ int error = 0;
+ crypto_session_t sid;
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n",
+ __FUNCTION__, sessp,
+ c_info->ci_algname, c_info->ci_crypt_type,
+ (unsigned int)c_info->ci_keylen, c_info->ci_name,
+ key->ck_format, key->ck_data, (unsigned int)key->ck_length);
+ printf("\tkey = { ");
+ for (int i = 0; i < key->ck_length / 8; i++) {
+ uint8_t *b = (uint8_t *)key->ck_data;
+ printf("%02x ", b[i]);
+ }
+ printf("}\n");
+#endif
+ switch (c_info->ci_crypt_type) {
+ case ZC_TYPE_GCM:
+ xform = &enc_xform_aes_nist_gcm;
+ switch (key->ck_length/8) {
+ case AES_128_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_128;
+ break;
+ case AES_192_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_192;
+ break;
+ case AES_256_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ }
+ break;
+ case ZC_TYPE_CCM:
+ xform = &enc_xform_ccm;
+ switch (key->ck_length/8) {
+ case AES_128_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_128;
+ break;
+ case AES_192_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_192;
+ break;
+ case AES_256_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ break;
+ }
+ break;
+ default:
+ error = ENOTSUP;
+ goto bad;
+ }
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Using crypt %s (key length %u [%u bytes]), "
+ "auth %s (key length %d)\n",
+ __FUNCTION__, __LINE__,
+ xform->name, (unsigned int)key->ck_length,
+ (unsigned int)key->ck_length/8,
+ xauth->name, xauth->keysize);
+#endif
+
+ bzero(&crie, sizeof (crie));
+ bzero(&cria, sizeof (cria));
+
+ crie.cri_alg = xform->type;
+ crie.cri_key = key->ck_data;
+ crie.cri_klen = key->ck_length;
+
+ cria.cri_alg = xauth->type;
+ cria.cri_key = key->ck_data;
+ cria.cri_klen = key->ck_length;
+
+ cria.cri_next = &crie;
+ crie.cri_next = NULL;
+ crip = &cria;
+ // Everything else is bzero'd
+
+ error = crypto_newsession(&sid, crip,
+ CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
+ if (error != 0) {
+ printf("%s(%d): crypto_newsession failed with %d\n",
+ __FUNCTION__, __LINE__, error);
+ goto bad;
+ }
+ sessp->fs_sid = sid;
+ mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock",
+ NULL, MTX_DEF);
+ crypt_sessions++;
+bad:
+ return (error);
+}
+
+/*
+ * The meat of encryption/decryption.
+ * If input_sessionp is NULL, a temporary
+ * cryptographic session is created and
+ * released when done.
+ */
+int
+freebsd_crypt_uio(boolean_t encrypt,
+ freebsd_crypt_session_t *input_sessionp,
+ struct zio_crypt_info *c_info,
+ zfs_uio_t *data_uio,
+ crypto_key_t *key,
+ uint8_t *ivbuf,
+ size_t datalen,
+ size_t auth_len)
+{
+ struct cryptop *crp;
+ struct cryptodesc *enc_desc, *auth_desc;
+ struct enc_xform *xform;
+ struct auth_hash *xauth;
+ freebsd_crypt_session_t *session = NULL;
+ int error;
+
+ freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio,
+ key, ivbuf, datalen, auth_len);
+ switch (c_info->ci_crypt_type) {
+ case ZC_TYPE_GCM:
+ xform = &enc_xform_aes_nist_gcm;
+ switch (key->ck_length/8) {
+ case AES_128_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_128;
+ break;
+ case AES_192_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_192;
+ break;
+ case AES_256_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ }
+ break;
+ case ZC_TYPE_CCM:
+ xform = &enc_xform_ccm;
+ switch (key->ck_length/8) {
+ case AES_128_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_128;
+ break;
+ case AES_192_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_192;
+ break;
+ case AES_256_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ break;
+ }
+ break;
+ default:
+ error = ENOTSUP;
+ goto bad;
+ }
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Using crypt %s (key length %u [%u bytes]), "
+ "auth %s (key length %d)\n",
+ __FUNCTION__, __LINE__,
+ xform->name, (unsigned int)key->ck_length,
+ (unsigned int)key->ck_length/8,
+ xauth->name, xauth->keysize);
+#endif
+
+ if (input_sessionp == NULL) {
+ session = kmem_zalloc(sizeof (*session), KM_SLEEP);
+ error = freebsd_crypt_newsession(session, c_info, key);
+ if (error)
+ goto out;
+ } else
+ session = input_sessionp;
+
+ crp = crypto_getreq(2);
+ if (crp == NULL) {
+ error = ENOMEM;
+ goto bad;
+ }
+
+ auth_desc = crp->crp_desc;
+ enc_desc = auth_desc->crd_next;
+
+ crp->crp_session = session->fs_sid;
+ crp->crp_ilen = auth_len + datalen;
+ crp->crp_buf = (void*)GET_UIO_STRUCT(data_uio);
+ crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIFSYNC;
+
+ auth_desc->crd_skip = 0;
+ auth_desc->crd_len = auth_len;
+ auth_desc->crd_inject = auth_len + datalen;
+ auth_desc->crd_alg = xauth->type;
+#ifdef FCRYPTO_DEBUG
+ printf("%s: auth: skip = %u, len = %u, inject = %u\n",
+ __FUNCTION__, auth_desc->crd_skip, auth_desc->crd_len,
+ auth_desc->crd_inject);
+#endif
+
+ enc_desc->crd_skip = auth_len;
+ enc_desc->crd_len = datalen;
+ enc_desc->crd_inject = auth_len;
+ enc_desc->crd_alg = xform->type;
+ enc_desc->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT;
+ bcopy(ivbuf, enc_desc->crd_iv, ZIO_DATA_IV_LEN);
+ enc_desc->crd_next = NULL;
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s: enc: skip = %u, len = %u, inject = %u\n",
+ __FUNCTION__, enc_desc->crd_skip, enc_desc->crd_len,
+ enc_desc->crd_inject);
+#endif
+
+ if (encrypt)
+ enc_desc->crd_flags |= CRD_F_ENCRYPT;
+
+ error = zfs_crypto_dispatch(session, crp);
+ crypto_freereq(crp);
+out:
+ if (input_sessionp == NULL) {
+ freebsd_crypt_freesession(session);
+ kmem_free(session, sizeof (*session));
+ }
+bad:
+#ifdef FCRYPTO_DEBUG
+ if (error)
+ printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+ return (error);
+}
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
new file mode 100644
index 000000000000..8e412d9c1359
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/sa.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/zfs_rlock.h>
+#include <sys/racct.h>
+#include <sys/vm.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+
+#include <sys/ccompat.h>
+
+#ifndef IDX_TO_OFF
+#define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
+#endif
+
+#if __FreeBSD_version < 1300051
+#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_NOBUSY
+#else
+#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY
+#endif
+
+
+#if __FreeBSD_version < 1300072
+#define dmu_page_lock(m) vm_page_lock(m)
+#define dmu_page_unlock(m) vm_page_unlock(m)
+#else
+#define dmu_page_lock(m)
+#define dmu_page_unlock(m)
+#endif
+
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp, DMU_READ_PREFETCH);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ vm_page_t *ma, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ struct sf_buf *sf;
+ int numbufs, i;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy, copied, thiscpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+ caddr_t va;
+
+ ASSERT(size > 0);
+ ASSERT3U(db->db_size, >=, PAGESIZE);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+ ASSERT3U(ptoa((*ma)->pindex), ==,
+ db->db_offset + bufoff);
+ thiscpy = MIN(PAGESIZE, tocpy - copied);
+ va = zfs_map_page(*ma, &sf);
+ bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+ zfs_unmap_page(sf);
+ ma += 1;
+ bufoff += PAGESIZE;
+ }
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+
+int
+dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
+ int *rbehind, int *rahead, int last_size)
+{
+ struct sf_buf *sf;
+ vm_object_t vmobj;
+ vm_page_t m;
+ dmu_buf_t **dbp;
+ dmu_buf_t *db;
+ caddr_t va;
+ int numbufs, i;
+ int bufoff, pgoff, tocpy;
+ int mi, di;
+ int err;
+
+ ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
+ ASSERT(last_size <= PAGE_SIZE);
+
+ err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
+ IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
+ if (err != 0)
+ return (err);
+
+#ifdef ZFS_DEBUG
+ IMPLY(last_size < PAGE_SIZE, *rahead == 0);
+ if (dbp[0]->db_offset != 0 || numbufs > 1) {
+ for (i = 0; i < numbufs; i++) {
+ ASSERT(ISP2(dbp[i]->db_size));
+ ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
+ ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
+ }
+ }
+#endif
+
+ vmobj = ma[0]->object;
+ zfs_vmobject_wlock_12(vmobj);
+
+ db = dbp[0];
+ for (i = 0; i < *rbehind; i++) {
+ m = vm_page_grab_unlocked(vmobj, ma[0]->pindex - 1 - i,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS);
+ if (m == NULL)
+ break;
+ if (!vm_page_none_valid(m)) {
+ ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_do_sunbusy(m);
+ break;
+ }
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_write_mapped(m));
+
+ ASSERT(db->db_size > PAGE_SIZE);
+ bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+ va = zfs_map_page(m, &sf);
+ bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ dmu_page_lock(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ dmu_page_unlock(m);
+ vm_page_do_sunbusy(m);
+ }
+ *rbehind = i;
+
+ bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
+ pgoff = 0;
+ for (mi = 0, di = 0; mi < count && di < numbufs; ) {
+ if (pgoff == 0) {
+ m = ma[mi];
+ if (m != bogus_page) {
+ vm_page_assert_xbusied(m);
+ ASSERT(vm_page_none_valid(m));
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_write_mapped(m));
+ va = zfs_map_page(m, &sf);
+ }
+ }
+ if (bufoff == 0)
+ db = dbp[di];
+
+ if (m != bogus_page) {
+ ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
+ db->db_offset + bufoff);
+ }
+
+ /*
+ * We do not need to clamp the copy size by the file
+ * size as the last block is zero-filled beyond the
+ * end of file anyway.
+ */
+ tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
+ if (m != bogus_page)
+ bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
+
+ pgoff += tocpy;
+ ASSERT(pgoff <= PAGESIZE);
+ if (pgoff == PAGESIZE) {
+ if (m != bogus_page) {
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ }
+ ASSERT(mi < count);
+ mi++;
+ pgoff = 0;
+ }
+
+ bufoff += tocpy;
+ ASSERT(bufoff <= db->db_size);
+ if (bufoff == db->db_size) {
+ ASSERT(di < numbufs);
+ di++;
+ bufoff = 0;
+ }
+ }
+
+#ifdef ZFS_DEBUG
+ /*
+ * Three possibilities:
+ * - last requested page ends at a buffer boundary and, thus,
+ * all pages and buffers have been iterated;
+ * - all requested pages are filled, but the last buffer
+ * has not been exhausted;
+ * the read-ahead is possible only in this case;
+ * - all buffers have been read, but the last page has not been
+ * fully filled;
+ * this is only possible if the file has only a single buffer
+ * with a size that is not a multiple of the page size.
+ */
+ if (mi == count) {
+ ASSERT(di >= numbufs - 1);
+ IMPLY(*rahead != 0, di == numbufs - 1);
+ IMPLY(*rahead != 0, bufoff != 0);
+ ASSERT(pgoff == 0);
+ }
+ if (di == numbufs) {
+ ASSERT(mi >= count - 1);
+ ASSERT(*rahead == 0);
+ IMPLY(pgoff == 0, mi == count);
+ if (pgoff != 0) {
+ ASSERT(mi == count - 1);
+ ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
+ }
+ }
+#endif
+ if (pgoff != 0) {
+ ASSERT(m != bogus_page);
+ bzero(va + pgoff, PAGESIZE - pgoff);
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ }
+
+ for (i = 0; i < *rahead; i++) {
+ m = vm_page_grab_unlocked(vmobj, ma[count - 1]->pindex + 1 + i,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS);
+ if (m == NULL)
+ break;
+ if (!vm_page_none_valid(m)) {
+ ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_do_sunbusy(m);
+ break;
+ }
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_mapped(m));
+
+ ASSERT(db->db_size > PAGE_SIZE);
+ bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+ tocpy = MIN(db->db_size - bufoff, PAGESIZE);
+ va = zfs_map_page(m, &sf);
+ bcopy((char *)db->db_data + bufoff, va, tocpy);
+ if (tocpy < PAGESIZE) {
+ ASSERT(i == *rahead - 1);
+ ASSERT((db->db_size & PAGE_MASK) != 0);
+ bzero(va + tocpy, PAGESIZE - tocpy);
+ }
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ dmu_page_lock(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ dmu_page_unlock(m);
+ vm_page_do_sunbusy(m);
+ }
+ *rahead = i;
+ zfs_vmobject_wunlock_12(vmobj);
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/hkdf.c b/sys/contrib/openzfs/module/os/freebsd/zfs/hkdf.c
new file mode 100644
index 000000000000..8324ff2319b6
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/hkdf.c
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/hkdf.h>
+#include <sys/freebsd_crypto.h>
+#include <sys/hkdf.h>
+
+static int
+hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material,
+ uint_t km_len, uint8_t *out_buf)
+{
+ crypto_key_t key;
+
+ /* initialize the salt as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = CRYPTO_BYTES2BITS(salt_len);
+ key.ck_data = salt;
+
+ crypto_mac(&key, key_material, km_len, out_buf, SHA512_DIGEST_LENGTH);
+
+ return (0);
+}
+
+static int
+hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len,
+ uint8_t *out_buf, uint_t out_len)
+{
+ struct hmac_ctx ctx;
+ crypto_key_t key;
+ uint_t i, T_len = 0, pos = 0;
+ uint8_t c;
+ uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH;
+ uint8_t T[SHA512_DIGEST_LENGTH];
+
+ if (N > 255)
+ return (SET_ERROR(EINVAL));
+
+ /* initialize the salt as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH);
+ key.ck_data = extract_key;
+
+ for (i = 1; i <= N; i++) {
+ c = i;
+
+ crypto_mac_init(&ctx, &key);
+ crypto_mac_update(&ctx, T, T_len);
+ crypto_mac_update(&ctx, info, info_len);
+ crypto_mac_update(&ctx, &c, 1);
+ crypto_mac_final(&ctx, T, SHA512_DIGEST_LENGTH);
+ T_len = SHA512_DIGEST_LENGTH;
+ bcopy(T, out_buf + pos,
+ (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos));
+ pos += SHA512_DIGEST_LENGTH;
+ }
+
+ return (0);
+}
+
+/*
+ * HKDF is designed to be a relatively fast function for deriving keys from a
+ * master key + a salt. We use this function to generate new encryption keys
+ * so as to avoid hitting the cryptographic limits of the underlying
+ * encryption modes. Note that, for the sake of deriving encryption keys, the
+ * info parameter is called the "salt" everywhere else in the code.
+ */
+int
+hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt,
+ uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key,
+ uint_t out_len)
+{
+ int ret;
+ uint8_t extract_key[SHA512_DIGEST_LENGTH];
+
+ ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len,
+ extract_key);
+ if (ret != 0)
+ return (ret);
+
+ ret = hkdf_sha512_expand(extract_key, info, info_len, output_key,
+ out_len);
+ if (ret != 0)
+ return (ret);
+
+ return (0);
+}
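
A minimal sketch of deriving a 256-bit key with hkdf_sha512(); the buffer names
and sizes here are illustrative only and assume the caller already holds the
master key material and salt:

	uint8_t master[32], salt[8], out_key[32];
	uint8_t info[] = "illustrative context";	/* the "salt" elsewhere in the code */
	int err;

	/* master and salt are caller-supplied secrets/nonces in real use. */
	err = hkdf_sha512(master, sizeof (master), salt, sizeof (salt),
	    info, sizeof (info) - 1, out_key, sizeof (out_key));
	if (err != 0)
		return (err);

With out_len = 32 the expand step runs a single HMAC round
(N = (32 + 64) / 64 = 1), well under the 255-round limit enforced above.
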
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
new file mode 100644
index 000000000000..c11d4dbcf660
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/buf.h>
+#include <sys/cmn_err.h>
+#include <sys/conf.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_crypt.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_scan.h>
+#include <sys/dsl_userhold.h>
+#include <sys/errno.h>
+#include <sys/eventhandler.h>
+#include <sys/file.h>
+#include <sys/fm/util.h>
+#include <sys/fs/zfs.h>
+#include <sys/kernel.h>
+#include <sys/kmem.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/nvpair.h>
+#include <sys/policy.h>
+#include <sys/proc.h>
+#include <sys/sdt.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+#include <sys/uio.h>
+#include <sys/vdev.h>
+#include <sys/vdev_removal.h>
+#include <sys/zap.h>
+#include <sys/zcp.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ioctl_compat.h>
+#include <sys/zfs_ioctl_impl.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zio_checksum.h>
+#include <sys/zone.h>
+#include <sys/zvol.h>
+
+#include "zfs_comutil.h"
+#include "zfs_deleg.h"
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_DECL(_vfs_zfs_vdev);
+
+extern uint_t rrw_tsd_key;
+static int zfs_version_ioctl = ZFS_IOCVER_OZFS;
+SYSCTL_DECL(_vfs_zfs_version);
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl,
+ 0, "ZFS_IOCTL_VERSION");
+
+static struct cdev *zfsdev;
+
+static struct root_hold_token *zfs_root_token;
+
+extern uint_t rrw_tsd_key;
+extern uint_t zfs_allow_log_key;
+extern uint_t zfs_geom_probe_vdev_key;
+
+static int zfs__init(void);
+static int zfs__fini(void);
+static void zfs_shutdown(void *, int);
+
+static eventhandler_tag zfs_shutdown_event_tag;
+extern zfsdev_state_t *zfsdev_state_list;
+
+#define ZFS_MIN_KSTACK_PAGES 4
+
+static int
+zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ uint_t len;
+ int vecnum;
+ zfs_iocparm_t *zp;
+ zfs_cmd_t *zc;
+ zfs_cmd_legacy_t *zcl;
+ int rc, error;
+ void *uaddr;
+
+ len = IOCPARM_LEN(zcmd);
+ vecnum = zcmd & 0xff;
+ zp = (void *)arg;
+ uaddr = (void *)zp->zfs_cmd;
+ error = 0;
+ zcl = NULL;
+
+ if (len != sizeof (zfs_iocparm_t)) {
+ printf("len %d vecnum: %d sizeof (zfs_cmd_t) %ju\n",
+ len, vecnum, (uintmax_t)sizeof (zfs_cmd_t));
+ return (EINVAL);
+ }
+
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ /*
+ * Remap ioctl code for legacy user binaries
+ */
+ if (zp->zfs_ioctl_version == ZFS_IOCVER_LEGACY) {
+ vecnum = zfs_ioctl_legacy_to_ozfs(vecnum);
+ if (vecnum < 0) {
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ return (ENOTSUP);
+ }
+ zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP);
+ if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ zfs_cmd_legacy_to_ozfs(zcl, zc);
+ } else if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ error = zfsdev_ioctl_common(vecnum, zc, 0);
+ if (zcl) {
+ zfs_cmd_ozfs_to_legacy(zc, zcl);
+ rc = copyout(zcl, uaddr, sizeof (*zcl));
+ } else {
+ rc = copyout(zc, uaddr, sizeof (*zc));
+ }
+ if (error == 0 && rc != 0)
+ error = SET_ERROR(EFAULT);
+out:
+ if (zcl)
+ kmem_free(zcl, sizeof (zfs_cmd_legacy_t));
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ MPASS(tsd_get(rrw_tsd_key) == NULL);
+ return (error);
+}
+
+static void
+zfsdev_close(void *data)
+{
+ zfsdev_state_t *zs, *zsp = data;
+
+ mutex_enter(&zfsdev_state_lock);
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ if (zs == zsp)
+ break;
+ }
+ if (zs == NULL || zs->zs_minor <= 0) {
+ mutex_exit(&zfsdev_state_lock);
+ return;
+ }
+ zs->zs_minor = -1;
+ zfs_onexit_destroy(zs->zs_onexit);
+ zfs_zevent_destroy(zs->zs_zevent);
+ mutex_exit(&zfsdev_state_lock);
+ zs->zs_onexit = NULL;
+ zs->zs_zevent = NULL;
+}
+
+static int
+zfs_ctldev_init(struct cdev *devp)
+{
+ boolean_t newzs = B_FALSE;
+ minor_t minor;
+ zfsdev_state_t *zs, *zsprev = NULL;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ minor = zfsdev_minor_alloc();
+ if (minor == 0)
+ return (SET_ERROR(ENXIO));
+
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ if (zs->zs_minor == -1)
+ break;
+ zsprev = zs;
+ }
+
+ if (!zs) {
+ zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
+ newzs = B_TRUE;
+ }
+
+ devfs_set_cdevpriv(zs, zfsdev_close);
+
+ zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit);
+ zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent);
+
+ if (newzs) {
+ zs->zs_minor = minor;
+ wmb();
+ zsprev->zs_next = zs;
+ } else {
+ wmb();
+ zs->zs_minor = minor;
+ }
+ return (0);
+}
+
+static int
+zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
+{
+ int error;
+
+ mutex_enter(&zfsdev_state_lock);
+ error = zfs_ctldev_init(devp);
+ mutex_exit(&zfsdev_state_lock);
+
+ return (error);
+}
+
+static struct cdevsw zfs_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = zfsdev_open,
+ .d_ioctl = zfsdev_ioctl,
+ .d_name = ZFS_DRIVER
+};
+
+int
+zfsdev_attach(void)
+{
+ zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666,
+ ZFS_DRIVER);
+ return (0);
+}
+
+void
+zfsdev_detach(void)
+{
+ if (zfsdev != NULL)
+ destroy_dev(zfsdev);
+}
+
+int
+zfs__init(void)
+{
+ int error;
+
+#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
+ printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
+ "overflow panic!\nPlease consider adding "
+ "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
+ ZFS_MIN_KSTACK_PAGES);
+#endif
+ zfs_root_token = root_mount_hold("ZFS");
+ if ((error = zfs_kmod_init()) != 0) {
+ printf("ZFS: Failed to Load ZFS Filesystem"
+ ", rc = %d\n", error);
+ root_mount_rel(zfs_root_token);
+ return (error);
+ }
+
+
+ tsd_create(&zfs_geom_probe_vdev_key, NULL);
+
+ printf("ZFS storage pool version: features support ("
+ SPA_VERSION_STRING ")\n");
+ root_mount_rel(zfs_root_token);
+ ddi_sysevent_init();
+ return (0);
+}
+
+int
+zfs__fini(void)
+{
+ if (zfs_busy() || zvol_busy() ||
+ zio_injection_enabled) {
+ return (EBUSY);
+ }
+ zfs_kmod_fini();
+ tsd_destroy(&zfs_geom_probe_vdev_key);
+ return (0);
+}
+
+static void
+zfs_shutdown(void *arg __unused, int howto __unused)
+{
+
+ /*
+ * ZFS fini routines cannot work properly in a panicked system.
+ */
+ if (panicstr == NULL)
+ zfs__fini();
+}
+
+static int
+zfs_modevent(module_t mod, int type, void *unused __unused)
+{
+ int err;
+
+ switch (type) {
+ case MOD_LOAD:
+ err = zfs__init();
+ if (err == 0)
+ zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
+ shutdown_post_sync, zfs_shutdown, NULL,
+ SHUTDOWN_PRI_FIRST);
+ return (err);
+ case MOD_UNLOAD:
+ err = zfs__fini();
+ if (err == 0 && zfs_shutdown_event_tag != NULL)
+ EVENTHANDLER_DEREGISTER(shutdown_post_sync,
+ zfs_shutdown_event_tag);
+ return (err);
+ case MOD_SHUTDOWN:
+ return (0);
+ default:
+ break;
+ }
+ return (EOPNOTSUPP);
+}
+
+static moduledata_t zfs_mod = {
+ "zfsctrl",
+ zfs_modevent,
+ 0
+};
+
+#ifdef _KERNEL
+EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
+#endif
+
+DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY);
+MODULE_VERSION(zfsctrl, 1);
+#if __FreeBSD_version > 1300092
+MODULE_DEPEND(zfsctrl, xdr, 1, 1, 1);
+#else
+MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
+#endif
+MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, crypto, 1, 1, 1);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c
new file mode 100644
index 000000000000..2bc78cb451e8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c
@@ -0,0 +1,281 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ */
+
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/ddt.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_os.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/callb.h>
+#include <sys/spa_boot.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zvol.h>
+#include <sys/abd.h>
+#include <sys/callb.h>
+#include <sys/zone.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+static nvlist_t *
+spa_generate_rootconf(const char *name)
+{
+ nvlist_t **configs, **tops;
+ nvlist_t *config;
+ nvlist_t *best_cfg, *nvtop, *nvroot;
+ uint64_t *holes;
+ uint64_t best_txg;
+ uint64_t nchildren;
+ uint64_t pgid;
+ uint64_t count;
+ uint64_t i;
+ uint_t nholes;
+
+ if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
+ return (NULL);
+
+ ASSERT3U(count, !=, 0);
+ best_txg = 0;
+ for (i = 0; i < count; i++) {
+ uint64_t txg;
+
+ VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
+ &txg) == 0);
+ if (txg > best_txg) {
+ best_txg = txg;
+ best_cfg = configs[i];
+ }
+ }
+
+ nchildren = 1;
+ nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
+ holes = NULL;
+ nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
+ &holes, &nholes);
+
+ tops = kmem_zalloc(nchildren * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < nchildren; i++) {
+ if (i >= count)
+ break;
+ if (configs[i] == NULL)
+ continue;
+ VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ nvlist_dup(nvtop, &tops[i], KM_SLEEP);
+ }
+ for (i = 0; holes != NULL && i < nholes; i++) {
+ if (i >= nchildren)
+ continue;
+ if (tops[holes[i]] != NULL)
+ continue;
+ nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
+ VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_HOLE) == 0);
+ VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
+ holes[i]) == 0);
+ VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
+ 0) == 0);
+ }
+ for (i = 0; i < nchildren; i++) {
+ if (tops[i] != NULL)
+ continue;
+ nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
+ VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_MISSING) == 0);
+ VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
+ i) == 0);
+ VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
+ 0) == 0);
+ }
+
+ /*
+ * Create pool config based on the best vdev config.
+ */
+ nvlist_dup(best_cfg, &config, KM_SLEEP);
+
+ /*
+ * Put this pool's top-level vdevs into a root vdev.
+ */
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pgid) == 0);
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ tops, nchildren) == 0);
+
+ /*
+ * Replace the existing vdev_tree with the new root vdev in
+ * this pool's configuration (remove the old, add the new).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+
+ /*
+ * Drop vdev config elements that should not be present at pool level.
+ */
+ nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
+ nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
+
+ for (i = 0; i < count; i++)
+ nvlist_free(configs[i]);
+ kmem_free(configs, count * sizeof (void *));
+ for (i = 0; i < nchildren; i++)
+ nvlist_free(tops[i]);
+ kmem_free(tops, nchildren * sizeof (void *));
+ nvlist_free(nvroot);
+ return (config);
+}
+
+int
+spa_import_rootpool(const char *name, bool checkpointrewind)
+{
+ spa_t *spa;
+ vdev_t *rvd;
+ nvlist_t *config, *nvtop;
+ uint64_t txg;
+ char *pname;
+ int error;
+
+ /*
+ * Read the label from the boot device and generate a configuration.
+ */
+ config = spa_generate_rootconf(name);
+
+ mutex_enter(&spa_namespace_lock);
+ if (config != NULL) {
+ VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &pname) == 0 && strcmp(name, pname) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
+ == 0);
+
+ if ((spa = spa_lookup(pname)) != NULL) {
+ /*
+ * The pool could already be imported,
+ * e.g., after reboot -r.
+ */
+ if (spa->spa_state == POOL_STATE_ACTIVE) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ return (0);
+ }
+
+ /*
+ * Remove the existing root pool from the namespace so
+ * that we can replace it with the correct config
+ * we just read in.
+ */
+ spa_remove(spa);
+ }
+ spa = spa_add(pname, config, NULL);
+
+ /*
+ * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
+ * via spa_version().
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+ } else if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
+ name);
+ return (EIO);
+ } else {
+ VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
+ }
+ spa->spa_is_root = B_TRUE;
+ spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
+ if (checkpointrewind) {
+ spa->spa_import_flags |= ZFS_IMPORT_CHECKPOINT;
+ }
+
+ /*
+ * Build up a vdev tree based on the boot device's label config.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
+ VDEV_ALLOC_ROOTPOOL);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
+ pname);
+ return (error);
+ }
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_free(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(config);
+ return (0);
+}
+
+const char *
+spa_history_zone(void)
+{
+ return ("freebsd");
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
new file mode 100644
index 000000000000..647c1463ba14
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/mount.h>
+#include <sys/taskqueue.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zcp.h>
+#include <sys/zio_checksum.h>
+#include <sys/vdev_removal.h>
+#include <sys/dsl_crypt.h>
+
+#include <sys/zfs_ioctl_compat.h>
+#include <sys/zfs_context.h>
+
+#include <sys/arc_impl.h>
+#include <sys/dsl_pool.h>
+
+
+/* BEGIN CSTYLED */
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS dedup");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "ZFS l2arc");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "ZFS livelist");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "ZFS lua");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "ZFS metaslab group");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "ZFS multihost protection");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS prefetch");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "ZFS reconstruct");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "ZFS receive");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "ZFS send");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vnops, CTLFLAG_RW, 0, "ZFS VNOPS");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+
+SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0,
+ "ZFS livelist condense");
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file");
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
+ "ZFS VDEV mirror");
+
+SYSCTL_DECL(_vfs_zfs_version);
+SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD,
+ (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version");
+
+extern arc_state_t ARC_anon;
+extern arc_state_t ARC_mru;
+extern arc_state_t ARC_mru_ghost;
+extern arc_state_t ARC_mfu;
+extern arc_state_t ARC_mfu_ghost;
+extern arc_state_t ARC_l2c_only;
+
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+
+/* arc.c */
+
+/* legacy compat */
+extern uint64_t l2arc_write_max; /* def max write size */
+extern uint64_t l2arc_write_boost; /* extra warmup write */
+extern uint64_t l2arc_headroom; /* # of dev writes */
+extern uint64_t l2arc_headroom_boost;
+extern uint64_t l2arc_feed_secs; /* interval seconds */
+extern uint64_t l2arc_feed_min_ms; /* min interval msecs */
+extern int l2arc_noprefetch; /* don't cache prefetch bufs */
+extern int l2arc_feed_again; /* turbo warmup */
+extern int l2arc_norw; /* no reads during writes */
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
+ &l2arc_write_max, 0, "max write size (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
+ &l2arc_write_boost, 0, "extra write during warmup (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
+ &l2arc_headroom, 0, "number of dev writes (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
+ &l2arc_feed_secs, 0, "interval seconds (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
+ &l2arc_feed_min_ms, 0, "min interval milliseconds (LEGACY)");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
+ &l2arc_noprefetch, 0, "don't cache prefetch bufs (LEGACY)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
+ &l2arc_feed_again, 0, "turbo warmup (LEGACY)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
+ &l2arc_norw, 0, "no reads during writes (LEGACY)");
+#if 0
+extern int zfs_compressed_arc_enabled;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RW,
+ &zfs_compressed_arc_enabled, 1, "compressed arc buffers (LEGACY)");
+#endif
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
+ &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of anonymous state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
+ &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mru state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mru ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
+ &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mfu state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mfu ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
+ &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");
+
+static int
+sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
+{
+ int err, val;
+
+ val = arc_no_grow_shift;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < 0 || val >= arc_shrink_shift)
+ return (EINVAL);
+
+ arc_no_grow_shift = val;
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
+ CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, sizeof (int),
+ sysctl_vfs_zfs_arc_no_grow_shift, "I",
+ "log2(fraction of ARC which must be free to allow growing)");
+
+int
+param_set_arc_long(SYSCTL_HANDLER_ARGS)
+{
+ int err;
+
+ err = sysctl_handle_long(oidp, arg1, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ arc_tuning_update(B_TRUE);
+
+ return (0);
+}
+
+int
+param_set_arc_int(SYSCTL_HANDLER_ARGS)
+{
+ int err;
+
+ err = sysctl_handle_int(oidp, arg1, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ arc_tuning_update(B_TRUE);
+
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
+ CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+ &zfs_arc_min, sizeof (zfs_arc_min), param_set_arc_long, "LU",
+ "min arc size (LEGACY)");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
+ CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+ &zfs_arc_max, sizeof (zfs_arc_max), param_set_arc_long, "LU",
+ "max arc size (LEGACY)");
+
+/* dbuf.c */
+
+
+/* dmu.c */
+
+/* dmu_zfetch.c */
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)");
+
+/* max bytes to prefetch per stream (default 8MB) */
+extern uint32_t zfetch_max_distance;
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
+ &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)");
+
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+extern uint32_t zfetch_max_idistance;
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
+ &zfetch_max_idistance, 0,
+ "Max bytes to prefetch indirects for per stream (LEGACY)");
+
+/* dsl_pool.c */
+
+/* dnode.c */
+extern int zfs_default_bs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN,
+ &zfs_default_bs, 0, "Default dnode block shift");
+
+extern int zfs_default_ibs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN,
+ &zfs_default_ibs, 0, "Default dnode indirect block shift");
+
+
+/* dsl_scan.c */
+
+/* metaslab.c */
+
+/*
+ * In pools where the log space map feature is not enabled we touch
+ * multiple metaslabs (and their respective space maps) with each
+ * transaction group. Thus, we benefit from having a small space map
+ * block size since it allows us to issue more I/O operations scattered
+ * around the disk. So a sane default for the space map block size
+ * is 8~16K.
+ */
+extern int zfs_metaslab_sm_blksz_no_log;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN,
+ &zfs_metaslab_sm_blksz_no_log, 0,
+ "Block size for space map in pools with log space map disabled. "
+ "Power of 2 and greater than 4096.");
+
+/*
+ * When the log space map feature is enabled, we accumulate a lot of
+ * changes per metaslab that are flushed once in a while so we benefit
+ * from a bigger block size like 128K for the metaslab space maps.
+ */
+extern int zfs_metaslab_sm_blksz_with_log;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN,
+ &zfs_metaslab_sm_blksz_with_log, 0,
+ "Block size for space map in pools with log space map enabled. "
+ "Power of 2 and greater than 4096.");
+
+/*
+ * The in-core space map representation is more compact than its on-disk form.
+ * The zfs_condense_pct determines how much more compact the in-core
+ * space map representation must be before we compact it on-disk.
+ * Values should be greater than or equal to 100.
+ */
+extern int zfs_condense_pct;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
+ &zfs_condense_pct, 0,
+ "Condense on-disk spacemap when it is more than this many percents"
+ " of in-memory counterpart");
+
+extern int zfs_remove_max_segment;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN,
+ &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to"
+ " allocate when removing a device");
+
+extern int zfs_removal_suspend_progress;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN,
+ &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while"
+ " in the middle of a removal");
+
+
+/*
+ * Minimum size which forces the dynamic allocator to change
+ * its allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using a more
+ * aggressive strategy (i.e. search by size rather than offset).
+ */
+extern uint64_t metaslab_df_alloc_threshold;
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
+ &metaslab_df_alloc_threshold, 0,
+ "Minimum size which forces the dynamic allocator to change it's allocation strategy");
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+extern int metaslab_df_free_pct;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
+ &metaslab_df_free_pct, 0,
+ "The minimum free space, in percent, which must be available in a "
+ "space map to continue allocations in a first-fit fashion");
+
+/*
+ * Percentage of all cpus that can be used by the metaslab taskq.
+ */
+extern int metaslab_load_pct;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
+ &metaslab_load_pct, 0,
+ "Percentage of cpus that can be used by the metaslab taskq");
+
+/*
+ * Max number of metaslabs per group to preload.
+ */
+extern int metaslab_preload_limit;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
+ &metaslab_preload_limit, 0,
+ "Max number of metaslabs per group to preload");
+
+/* refcount.c */
+extern int reference_tracking_enable;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN,
+ &reference_tracking_enable, 0,
+ "Track reference holders to refcount_t objects, used mostly by ZFS");
+
+/* spa.c */
+extern int zfs_ccw_retry_interval;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN,
+ &zfs_ccw_retry_interval, 0,
+ "Configuration cache file write, retry after failure, interval (seconds)");
+
+extern uint64_t zfs_max_missing_tvds_cachefile;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN,
+ &zfs_max_missing_tvds_cachefile, 0,
+ "allow importing pools with missing top-level vdevs in cache file");
+
+extern uint64_t zfs_max_missing_tvds_scan;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN,
+ &zfs_max_missing_tvds_scan, 0,
+ "allow importing pools with missing top-level vdevs during scan");
+
+/* spa_misc.c */
+extern int zfs_flags;
+static int
+sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
+{
+ int err, val;
+
+ val = zfs_flags;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ /*
+ * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
+ * arc buffers in the system have the necessary additional
+ * checksum data. However, it is safe to disable at any
+ * time.
+ */
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ val &= ~ZFS_DEBUG_MODIFY;
+ zfs_flags = val;
+
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0,
+ sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
+
+int
+param_set_deadman_synctime(SYSCTL_HANDLER_ARGS)
+{
+ unsigned long val;
+ int err;
+
+ val = zfs_deadman_synctime_ms;
+ err = sysctl_handle_long(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+ zfs_deadman_synctime_ms = val;
+
+ spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms));
+
+ return (0);
+}
+
+int
+param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS)
+{
+ unsigned long val;
+ int err;
+
+ val = zfs_deadman_ziotime_ms;
+ err = sysctl_handle_long(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+ zfs_deadman_ziotime_ms = val;
+
+ spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_ziotime_ms));
+
+ return (0);
+}
+
+int
+param_set_deadman_failmode(SYSCTL_HANDLER_ARGS)
+{
+ char buf[16];
+ int rc;
+
+ if (req->newptr == NULL)
+ strlcpy(buf, zfs_deadman_failmode, sizeof (buf));
+
+ rc = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (rc || req->newptr == NULL)
+ return (rc);
+ if (strcmp(buf, zfs_deadman_failmode) == 0)
+ return (0);
+ if (!strcmp(buf, "wait"))
+ zfs_deadman_failmode = "wait";
+ if (!strcmp(buf, "continue"))
+ zfs_deadman_failmode = "continue";
+ if (!strcmp(buf, "panic"))
+ zfs_deadman_failmode = "panic";
+
+ return (-param_set_deadman_failmode_common(buf));
+}
+
+
+/* spacemap.c */
+extern int space_map_ibs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN,
+ &space_map_ibs, 0, "Space map indirect block shift");
+
+
+/* vdev.c */
+int
+param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_vdev_min_auto_ashift;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (SET_ERROR(err));
+
+ if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
+ return (SET_ERROR(EINVAL));
+
+ zfs_vdev_min_auto_ashift = val;
+
+ return (0);
+}
+
+int
+param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_vdev_max_auto_ashift;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (SET_ERROR(err));
+
+ if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
+ return (SET_ERROR(EINVAL));
+
+ zfs_vdev_max_auto_ashift = val;
+
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
+ CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+ &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift),
+ param_set_min_auto_ashift, "QU",
+ "Min ashift used when creating new top-level vdev. (LEGACY)");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
+ CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+ &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift),
+ param_set_max_auto_ashift, "QU",
+ "Max ashift used when optimizing for logical -> physical sector size on "
+ "new top-level vdevs. (LEGACY)");
+
+/*
+ * Since the DTL space map of a vdev is not expected to have a lot of
+ * entries, we default its block size to 4K.
+ */
+extern int zfs_vdev_dtl_sm_blksz;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN,
+ &zfs_vdev_dtl_sm_blksz, 0,
+ "Block size for DTL space map. Power of 2 and greater than 4096.");
+
+/*
+ * vdev-wide space maps that have lots of entries written to them at
+ * the end of each transaction can benefit from a higher I/O bandwidth
+ * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
+ */
+extern int zfs_vdev_standard_sm_blksz;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN,
+ &zfs_vdev_standard_sm_blksz, 0,
+ "Block size for standard space map. Power of 2 and greater than 4096.");
+
+extern int vdev_validate_skip;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN,
+ &vdev_validate_skip, 0,
+ "Enable to bypass vdev_validate().");
+
+
+/* vdev_cache.c */
+
+/* vdev_mirror.c */
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * non_rotating_seek_inc to 0 may well provide better results as it
+ * will direct more reads to the non-rotating vdevs which are more
+ * likely to have higher performance.
+ */
+
+
+/* vdev_queue.c */
+#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \
+extern uint32_t zfs_vdev_ ## name ## _min_active; \
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
+ &zfs_vdev_ ## name ## _min_active, 0, \
+ "Initial number of I/O requests of type " #name \
+ " active for each device");
+
+#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \
+extern uint32_t zfs_vdev_ ## name ## _max_active; \
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN, \
+ &zfs_vdev_ ## name ## _max_active, 0, \
+ "Maximum number of I/O requests of type " #name \
+ " active for each device");
+
+
+#undef ZFS_VDEV_QUEUE_KNOB
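The ZFS_VDEV_QUEUE_KNOB_MIN/MAX helpers above are only defined here; no instantiation appears in this hunk. As a sketch of what a hypothetical instantiation would expand to, ZFS_VDEV_QUEUE_KNOB_MIN(sync_read) (the name sync_read is chosen purely for illustration) produces:

    extern uint32_t zfs_vdev_sync_read_min_active;
    SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, sync_read_min_active, CTLFLAG_RWTUN,
        &zfs_vdev_sync_read_min_active, 0,
        "Initial number of I/O requests of type sync_read active for each device");

i.e. a vfs.zfs.vdev.sync_read_min_active tunable backed by the corresponding zfs_vdev_*_min_active variable.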
+
+extern uint32_t zfs_vdev_max_active;
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN,
+ &zfs_vdev_max_active, 0,
+ "The maximum number of I/Os of all types active for each device. (LEGACY)");
+
+extern int zfs_vdev_def_queue_depth;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN,
+ &zfs_vdev_def_queue_depth, 0,
+ "Default queue depth for each allocator");
+
+/*extern uint64_t zfs_multihost_history;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, multihost_history, CTLFLAG_RWTUN,
+ &zfs_multihost_history, 0,
+ "Historical staticists for the last N multihost updates");*/
+
+#ifdef notyet
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW,
+ &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation");
+#endif
+
+
+/* zio.c */
+#if defined(__LP64__)
+int zio_use_uma = 1;
+#else
+int zio_use_uma = 0;
+#endif
+
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
+ "Use uma(9) for ZIO allocations");
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
+ "Exclude metadata buffers from dumps as well");
+
+int
+param_set_slop_shift(SYSCTL_HANDLER_ARGS)
+{
+ int val;
+ int err;
+
+ val = *(int *)arg1;
+
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < 1 || val > 31)
+ return (EINVAL);
+
+ *(int *)arg1 = val;
+
+ return (0);
+}
+
+int
+param_set_multihost_interval(SYSCTL_HANDLER_ARGS)
+{
+ int err;
+
+ err = sysctl_handle_long(oidp, arg1, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (spa_mode_global != SPA_MODE_UNINIT)
+ mmp_signal_all_threads();
+
+ return (0);
+}
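param_set_slop_shift() and param_set_multihost_interval() are generic handlers; their registration is not part of this hunk. A hypothetical wiring, following the same SYSCTL_PROC pattern used for arc_min/arc_max earlier in this file (the backing variable and description string here are assumptions for illustration only):

    /* Hypothetical registration -- the real one lives outside this hunk. */
    extern int spa_slop_shift;
    SYSCTL_PROC(_vfs_zfs, OID_AUTO, spa_slop_shift,
        CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
        &spa_slop_shift, sizeof (spa_slop_shift), param_set_slop_shift, "I",
        "Reserved free space in pool");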
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c
new file mode 100644
index 000000000000..825bd706e0c0
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c
@@ -0,0 +1,354 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/file.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
+#include <sys/stat.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+static taskq_t *vdev_file_taskq;
+
+unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
+unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
+
+void
+vdev_file_init(void)
+{
+ vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16),
+ minclsyspri, max_ncpus, INT_MAX, 0);
+}
+
+void
+vdev_file_fini(void)
+{
+ taskq_destroy(vdev_file_taskq);
+}
+
+static void
+vdev_file_hold(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static mode_t
+vdev_file_open_mode(spa_mode_t spa_mode)
+{
+ mode_t mode = 0;
+
+ if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) {
+ mode = O_RDWR;
+ } else if (spa_mode & SPA_MODE_READ) {
+ mode = O_RDONLY;
+ } else if (spa_mode & SPA_MODE_WRITE) {
+ mode = O_WRONLY;
+ }
+
+ return (mode | O_LARGEFILE);
+}
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_file_t *vf;
+ zfs_file_t *fp;
+ zfs_file_attr_t zfa;
+ int error;
+
+ /*
+ * Rotational optimizations only make sense on block devices.
+ */
+ vd->vdev_nonrot = B_TRUE;
+
+ /*
+ * Allow TRIM on file based vdevs. This may not always be supported,
+ * since it depends on your kernel version and underlying filesystem
+ * type but it is always safe to attempt.
+ */
+ vd->vdev_has_trim = B_TRUE;
+
+ /*
+ * Disable secure TRIM on file based vdevs. There is no way to
+ * request this behavior from the underlying filesystem.
+ */
+ vd->vdev_has_securetrim = B_FALSE;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (vd->vdev_tsd != NULL) {
+ ASSERT(vd->vdev_reopening);
+ vf = vd->vdev_tsd;
+ goto skip_open;
+ }
+
+ vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+ /*
+ * We always open the files from the root of the global zone, even if
+ * we're in a local zone. If the user has gotten to this point, the
+ * administrator has already decided that the pool should be available
+ * to local zone users, so the underlying devices should be as well.
+ */
+ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+
+ error = zfs_file_open(vd->vdev_path,
+ vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ vf->vf_file = fp;
+
+#ifdef _KERNEL
+ /*
+ * Make sure it's a regular file.
+ */
+ if (zfs_file_getattr(fp, &zfa)) {
+ return (SET_ERROR(ENODEV));
+ }
+ if (!S_ISREG(zfa.zfa_mode)) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (SET_ERROR(ENODEV));
+ }
+#endif
+
+skip_open:
+
+ error = zfs_file_getattr(vf->vf_file, &zfa);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ *max_psize = *psize = zfa.zfa_size;
+ *logical_ashift = vdev_file_logical_ashift;
+ *physical_ashift = vdev_file_physical_ashift;
+
+ return (0);
+}
+
+static void
+vdev_file_close(vdev_t *vd)
+{
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (vd->vdev_reopening || vf == NULL)
+ return;
+
+ if (vf->vf_file != NULL) {
+ zfs_file_close(vf->vf_file);
+ }
+
+ vd->vdev_delayed_close = B_FALSE;
+ kmem_free(vf, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+}
+
+/*
+ * Implements the interrupt side for file vdev types. This routine will be
+ * called when the I/O completes allowing us to transfer the I/O to the
+ * interrupt taskqs. For consistency, the code structure mimics disk vdev
+ * types.
+ */
+static void
+vdev_file_io_intr(zio_t *zio)
+{
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_file_io_strategy(void *arg)
+{
+ zio_t *zio = arg;
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf;
+ void *buf;
+ ssize_t resid;
+ loff_t off;
+ ssize_t size;
+ int err;
+
+ off = zio->io_offset;
+ size = zio->io_size;
+ resid = 0;
+
+ vf = vd->vdev_tsd;
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ if (zio->io_type == ZIO_TYPE_READ) {
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+ err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
+ abd_return_buf_copy(zio->io_abd, buf, size);
+ } else {
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
+ abd_return_buf(zio->io_abd, buf, size);
+ }
+ zio->io_error = err;
+ if (resid != 0 && zio->io_error == 0)
+ zio->io_error = SET_ERROR(ENOSPC);
+
+ vdev_file_io_intr(zio);
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+ zio->io_error = zfs_file_fsync(vf->vf_file,
+ O_SYNC|O_DSYNC);
+ break;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+
+ zio_execute(zio);
+ return;
+ } else if (zio->io_type == ZIO_TYPE_TRIM) {
+#ifdef notyet
+ int mode = 0;
+
+ ASSERT3U(zio->io_size, !=, 0);
+
+ /* XXX FreeBSD has no fallocate routine in file ops */
+ zio->io_error = zfs_file_fallocate(vf->vf_file,
+ mode, zio->io_offset, zio->io_size);
+#endif
+ zio->io_error = SET_ERROR(ENOTSUP);
+ zio_execute(zio);
+ return;
+ }
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+
+ VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+ TQ_SLEEP), !=, 0);
+}
+
+/* ARGSUSED */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_file_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+#endif
+
+ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW,
+ "Logical ashift for file-based devices");
+ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW,
+ "Physical ashift for file-based devices");
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
new file mode 100644
index 000000000000..c9e8e21982cf
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
@@ -0,0 +1,1214 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/file.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_os.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <geom/geom.h>
+#include <geom/geom_disk.h>
+#include <geom/geom_int.h>
+
+#ifndef g_topology_locked
+#define g_topology_locked() sx_xlocked(&topology_lock)
+#endif
+
+/*
+ * Virtual device vector for GEOM.
+ */
+
+static g_attrchanged_t vdev_geom_attrchanged;
+struct g_class zfs_vdev_class = {
+ .name = "ZFS::VDEV",
+ .version = G_VERSION,
+ .attrchanged = vdev_geom_attrchanged,
+};
+
+struct consumer_vdev_elem {
+ SLIST_ENTRY(consumer_vdev_elem) elems;
+ vdev_t *vd;
+};
+
+SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
+/* BEGIN CSTYLED */
+_Static_assert(sizeof (((struct g_consumer *)NULL)->private)
+ == sizeof (struct consumer_priv_t*),
+ "consumer_priv_t* can't be stored in g_consumer.private");
+
+DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+/* Don't send BIO_FLUSH. */
+static int vdev_geom_bio_flush_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
+ &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
+/* Don't send BIO_DELETE. */
+static int vdev_geom_bio_delete_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
+ &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
+/* END CSTYLED */
+
+/* Declare local functions */
+static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
+
+/*
+ * Thread local storage used to indicate when a thread is probing geoms
+ * for their guids. If NULL, this thread is not tasting geoms. If non-NULL,
+ * it is looking for a replacement for the vdev_t* that is its value.
+ */
+uint_t zfs_geom_probe_vdev_key;
+
+static void
+vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
+ boolean_t do_null_update)
+{
+ boolean_t needs_update = B_FALSE;
+ char *physpath;
+ int error, physpath_len;
+
+ physpath_len = MAXPATHLEN;
+ physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
+ error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
+ if (error == 0) {
+ char *old_physpath;
+
+ /* g_topology lock ensures that vdev has not been closed */
+ g_topology_assert();
+ old_physpath = vd->vdev_physpath;
+ vd->vdev_physpath = spa_strdup(physpath);
+
+ if (old_physpath != NULL) {
+ needs_update = (strcmp(old_physpath,
+ vd->vdev_physpath) != 0);
+ spa_strfree(old_physpath);
+ } else
+ needs_update = do_null_update;
+ }
+ g_free(physpath);
+
+ /*
+ * If the physical path changed, update the config.
+ * Only request an update for previously unset physpaths if
+ * requested by the caller.
+ */
+ if (needs_update)
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
+
+}
+
+static void
+vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
+{
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (SLIST_EMPTY(priv))
+ return;
+
+ SLIST_FOREACH(elem, priv, elems) {
+ vdev_t *vd = elem->vd;
+ if (strcmp(attr, "GEOM::physpath") == 0) {
+ vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE);
+ return;
+ }
+ }
+}
+
+static void
+vdev_geom_resize(struct g_consumer *cp)
+{
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+ spa_t *spa;
+ vdev_t *vd;
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (SLIST_EMPTY(priv))
+ return;
+
+ SLIST_FOREACH(elem, priv, elems) {
+ vd = elem->vd;
+ if (vd->vdev_state != VDEV_STATE_HEALTHY)
+ continue;
+ spa = vd->vdev_spa;
+ if (!spa->spa_autoexpand)
+ continue;
+ vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL);
+ }
+}
+
+static void
+vdev_geom_orphan(struct g_consumer *cp)
+{
+ struct consumer_priv_t *priv;
+ // cppcheck-suppress uninitvar
+ struct consumer_vdev_elem *elem;
+
+ g_topology_assert();
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (SLIST_EMPTY(priv))
+ /* Vdev close in progress. Ignore the event. */
+ return;
+
+ /*
+ * Orphan callbacks occur from the GEOM event thread.
+ * Concurrent with this call, new I/O requests may be
+ * working their way through GEOM about to find out
+ * (only once executed by the g_down thread) that we've
+ * been orphaned from our disk provider. These I/Os
+ * must be retired before we can detach our consumer.
+ * This is most easily achieved by acquiring the
+ * SPA ZIO configuration lock as a writer, but doing
+ * so with the GEOM topology lock held would cause
+ * a lock order reversal. Instead, rely on the SPA's
+ * async removal support to invoke a close on this
+ * vdev once it is safe to do so.
+ */
+ // cppcheck-suppress All
+ SLIST_FOREACH(elem, priv, elems) {
+ // cppcheck-suppress uninitvar
+ vdev_t *vd = elem->vd;
+
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+ }
+}
+
+static struct g_consumer *
+vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
+{
+ struct g_geom *gp;
+ struct g_consumer *cp;
+ int error;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Attaching to %s.", pp->name);
+
+ if (sanity) {
+ if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
+ ZFS_LOG(1, "Failing attach of %s. "
+ "Incompatible sectorsize %d\n",
+ pp->name, pp->sectorsize);
+ return (NULL);
+ } else if (pp->mediasize < SPA_MINDEVSIZE) {
+ ZFS_LOG(1, "Failing attach of %s. "
+ "Incompatible mediasize %ju\n",
+ pp->name, pp->mediasize);
+ return (NULL);
+ }
+ }
+
+ /* Do we have geom already? No? Create one. */
+ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ if (strcmp(gp->name, "zfs::vdev") != 0)
+ continue;
+ break;
+ }
+ if (gp == NULL) {
+ gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
+ gp->orphan = vdev_geom_orphan;
+ gp->attrchanged = vdev_geom_attrchanged;
+ gp->resize = vdev_geom_resize;
+ cp = g_new_consumer(gp);
+ error = g_attach(cp, pp);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
+ __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
+ __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
+ } else {
+ /* Check if we are already connected to this provider. */
+ LIST_FOREACH(cp, &gp->consumer, consumer) {
+ if (cp->provider == pp) {
+ ZFS_LOG(1, "Found consumer for %s.", pp->name);
+ break;
+ }
+ }
+ if (cp == NULL) {
+ cp = g_new_consumer(gp);
+ error = g_attach(cp, pp);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
+ __func__, __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+ __func__, __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created consumer for %s.", pp->name);
+ } else {
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+ __func__, __LINE__, error);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
+ }
+ }
+
+ if (vd != NULL)
+ vd->vdev_tsd = cp;
+
+ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
+ return (cp);
+}
+
+static void
+vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
+{
+ struct g_geom *gp;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Detaching from %s.",
+ cp->provider && cp->provider->name ? cp->provider->name : "NULL");
+
+ gp = cp->geom;
+ if (open_for_read)
+ g_access(cp, -1, 0, -1);
+ /* Destroy consumer on last close. */
+ if (cp->acr == 0 && cp->ace == 0) {
+ if (cp->acw > 0)
+ g_access(cp, 0, -cp->acw, 0);
+ if (cp->provider != NULL) {
+ ZFS_LOG(1, "Destroying consumer for %s.",
+ cp->provider->name ? cp->provider->name : "NULL");
+ g_detach(cp);
+ }
+ g_destroy_consumer(cp);
+ }
+ /* Destroy geom if there are no consumers left. */
+ if (LIST_EMPTY(&gp->consumer)) {
+ ZFS_LOG(1, "Destroyed geom %s.", gp->name);
+ g_wither_geom(gp, ENXIO);
+ }
+}
+
+static void
+vdev_geom_close_locked(vdev_t *vd)
+{
+ struct g_consumer *cp;
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem, *elem_temp;
+
+ g_topology_assert();
+
+ cp = vd->vdev_tsd;
+ vd->vdev_delayed_close = B_FALSE;
+ if (cp == NULL)
+ return;
+
+ ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+ KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
+ priv = (struct consumer_priv_t *)&cp->private;
+ vd->vdev_tsd = NULL;
+ SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
+ if (elem->vd == vd) {
+ SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
+ g_free(elem);
+ }
+ }
+
+ vdev_geom_detach(cp, B_TRUE);
+}
+
+/*
+ * Issue one or more bios to the vdev in parallel.
+ * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each I/O
+ * operation is described by parallel entries from each array. There may be
+ * more bios actually issued than entries in the arrays.
+ */
+static void
+vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
+ off_t *sizes, int *errors, int ncmds)
+{
+ struct bio **bios;
+ uint8_t *p;
+ off_t off, maxio, s, end;
+ int i, n_bios, j;
+ size_t bios_size;
+
+ maxio = maxphys - (maxphys % cp->provider->sectorsize);
+ n_bios = 0;
+
+ /* How many bios are required for all commands ? */
+ for (i = 0; i < ncmds; i++)
+ n_bios += (sizes[i] + maxio - 1) / maxio;
+
+ /* Allocate memory for the bios */
+ bios_size = n_bios * sizeof (struct bio *);
+ bios = kmem_zalloc(bios_size, KM_SLEEP);
+
+ /* Prepare and issue all of the bios */
+ for (i = j = 0; i < ncmds; i++) {
+ off = offsets[i];
+ p = datas[i];
+ s = sizes[i];
+ end = off + s;
+ ASSERT((off % cp->provider->sectorsize) == 0);
+ ASSERT((s % cp->provider->sectorsize) == 0);
+
+ for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
+ bios[j] = g_alloc_bio();
+ bios[j]->bio_cmd = cmds[i];
+ bios[j]->bio_done = NULL;
+ bios[j]->bio_offset = off;
+ bios[j]->bio_length = MIN(s, maxio);
+ bios[j]->bio_data = (caddr_t)p;
+ g_io_request(bios[j], cp);
+ }
+ }
+ ASSERT(j == n_bios);
+
+ /* Wait for all of the bios to complete, and clean them up */
+ for (i = j = 0; i < ncmds; i++) {
+ off = offsets[i];
+ s = sizes[i];
+ end = off + s;
+
+ for (; off < end; off += maxio, s -= maxio, j++) {
+ errors[i] = biowait(bios[j], "vdev_geom_io") ||
+ errors[i];
+ g_destroy_bio(bios[j]);
+ }
+ }
+ kmem_free(bios, bios_size);
+}
+
+/*
+ * Read the vdev config from a device. Return the number of valid labels that
+ * were found. The vdev config will be returned in config if and only if at
+ * least one valid label was found.
+ */
+static int
+vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
+{
+ struct g_provider *pp;
+ nvlist_t *config;
+ vdev_phys_t *vdev_lists[VDEV_LABELS];
+ char *buf;
+ size_t buflen;
+ uint64_t psize, state, txg;
+ off_t offsets[VDEV_LABELS];
+ off_t size;
+ off_t sizes[VDEV_LABELS];
+ int cmds[VDEV_LABELS];
+ int errors[VDEV_LABELS];
+ int l, nlabels;
+
+ g_topology_assert_not();
+
+ pp = cp->provider;
+ ZFS_LOG(1, "Reading config from %s...", pp->name);
+
+ psize = pp->mediasize;
+ psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+
+ size = sizeof (*vdev_lists[0]) + pp->sectorsize -
+ ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
+
+ buflen = sizeof (vdev_lists[0]->vp_nvlist);
+
+ /* Create all of the IO requests */
+ for (l = 0; l < VDEV_LABELS; l++) {
+ cmds[l] = BIO_READ;
+ vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
+ offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
+ sizes[l] = size;
+ errors[l] = 0;
+ ASSERT(offsets[l] % pp->sectorsize == 0);
+ }
+
+ /* Issue the IO requests */
+ vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
+ VDEV_LABELS);
+
+ /* Parse the labels */
+ config = *configp = NULL;
+ nlabels = 0;
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if (errors[l] != 0)
+ continue;
+
+ buf = vdev_lists[l]->vp_nvlist;
+
+ if (nvlist_unpack(buf, buflen, &config, 0) != 0)
+ continue;
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0 || state > POOL_STATE_L2CACHE) {
+ nvlist_free(config);
+ continue;
+ }
+
+ if (state != POOL_STATE_SPARE &&
+ state != POOL_STATE_L2CACHE &&
+ (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &txg) != 0 || txg == 0)) {
+ nvlist_free(config);
+ continue;
+ }
+
+ if (*configp != NULL)
+ nvlist_free(*configp);
+ *configp = config;
+ nlabels++;
+ }
+
+ /* Free the label storage */
+ for (l = 0; l < VDEV_LABELS; l++)
+ kmem_free(vdev_lists[l], size);
+
+ return (nlabels);
+}
+
+static void
+resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
+{
+ nvlist_t **new_configs;
+ uint64_t i;
+
+ if (id < *count)
+ return;
+ new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *),
+ KM_SLEEP);
+ for (i = 0; i < *count; i++)
+ new_configs[i] = (*configs)[i];
+ if (*configs != NULL)
+ kmem_free(*configs, *count * sizeof (void *));
+ *configs = new_configs;
+ *count = id + 1;
+}
+
+static void
+process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
+ const char *name, uint64_t *known_pool_guid)
+{
+ nvlist_t *vdev_tree;
+ uint64_t pool_guid;
+ uint64_t vdev_guid;
+ uint64_t id, txg, known_txg;
+ char *pname;
+
+ if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
+ strcmp(pname, name) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
+ goto ignore;
+
+ VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
+
+ if (*known_pool_guid != 0) {
+ if (pool_guid != *known_pool_guid)
+ goto ignore;
+ } else
+ *known_pool_guid = pool_guid;
+
+ resize_configs(configs, count, id);
+
+ if ((*configs)[id] != NULL) {
+ VERIFY(nvlist_lookup_uint64((*configs)[id],
+ ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
+ if (txg <= known_txg)
+ goto ignore;
+ nvlist_free((*configs)[id]);
+ }
+
+ (*configs)[id] = cfg;
+ return;
+
+ignore:
+ nvlist_free(cfg);
+}
+
+int
+vdev_geom_read_pool_label(const char *name,
+ nvlist_t ***configs, uint64_t *count)
+{
+ struct g_class *mp;
+ struct g_geom *gp;
+ struct g_provider *pp;
+ struct g_consumer *zcp;
+ nvlist_t *vdev_cfg;
+ uint64_t pool_guid;
+ int nlabels;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ *configs = NULL;
+ *count = 0;
+ pool_guid = 0;
+ LIST_FOREACH(mp, &g_classes, class) {
+ if (mp == &zfs_vdev_class)
+ continue;
+ LIST_FOREACH(gp, &mp->geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ LIST_FOREACH(pp, &gp->provider, provider) {
+ if (pp->flags & G_PF_WITHER)
+ continue;
+ zcp = vdev_geom_attach(pp, NULL, B_TRUE);
+ if (zcp == NULL)
+ continue;
+ g_topology_unlock();
+ nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
+ g_topology_lock();
+ vdev_geom_detach(zcp, B_TRUE);
+ if (nlabels == 0)
+ continue;
+ ZFS_LOG(1, "successfully read vdev config");
+
+ process_vdev_config(configs, count,
+ vdev_cfg, name, &pool_guid);
+ }
+ }
+ }
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (*count > 0 ? 0 : ENOENT);
+}
+
+enum match {
+ NO_MATCH = 0, /* No matching labels found */
+ TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */
+ ZERO_MATCH = 1, /* Should never be returned */
+ ONE_MATCH = 2, /* 1 label matching the vdev_guid */
+ TWO_MATCH = 3, /* 2 labels matching the vdev_guid */
+ THREE_MATCH = 4, /* 3 labels matching the vdev_guid */
+ FULL_MATCH = 5 /* all labels match the vdev_guid */
+};
+
+static enum match
+vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
+{
+ nvlist_t *config;
+ uint64_t pool_guid, top_guid, vdev_guid;
+ struct g_consumer *cp;
+ int nlabels;
+
+ cp = vdev_geom_attach(pp, NULL, B_TRUE);
+ if (cp == NULL) {
+ ZFS_LOG(1, "Unable to attach tasting instance to %s.",
+ pp->name);
+ return (NO_MATCH);
+ }
+ g_topology_unlock();
+ nlabels = vdev_geom_read_config(cp, &config);
+ g_topology_lock();
+ vdev_geom_detach(cp, B_TRUE);
+ if (nlabels == 0) {
+ ZFS_LOG(1, "Unable to read config from %s.", pp->name);
+ return (NO_MATCH);
+ }
+
+ pool_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
+ top_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
+ vdev_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+ nvlist_free(config);
+
+ /*
+ * Check that the label's pool guid matches the desired guid.
+ * Inactive spares and L2ARCs do not have any pool guid in the label.
+ */
+ if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
+ ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
+ pp->name,
+ (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
+ return (NO_MATCH);
+ }
+
+ /*
+ * Check that the label's vdev guid matches the desired guid.
+ * The second condition handles possible race on vdev detach, when
+ * remaining vdev receives GUID of destroyed top level mirror vdev.
+ */
+ if (vdev_guid == vd->vdev_guid) {
+ ZFS_LOG(1, "guids match for provider %s.", pp->name);
+ return (ZERO_MATCH + nlabels);
+ } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
+ ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
+ return (TOPGUID_MATCH);
+ }
+ ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
+ pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
+ return (NO_MATCH);
+}
+
+static struct g_consumer *
+vdev_geom_attach_by_guids(vdev_t *vd)
+{
+ struct g_class *mp;
+ struct g_geom *gp;
+ struct g_provider *pp, *best_pp;
+ struct g_consumer *cp;
+ const char *vdpath;
+ enum match match, best_match;
+
+ g_topology_assert();
+
+ vdpath = vd->vdev_path + sizeof ("/dev/") - 1;
+ cp = NULL;
+ best_pp = NULL;
+ best_match = NO_MATCH;
+ LIST_FOREACH(mp, &g_classes, class) {
+ if (mp == &zfs_vdev_class)
+ continue;
+ LIST_FOREACH(gp, &mp->geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ LIST_FOREACH(pp, &gp->provider, provider) {
+ match = vdev_attach_ok(vd, pp);
+ if (match > best_match) {
+ best_match = match;
+ best_pp = pp;
+ } else if (match == best_match) {
+ if (strcmp(pp->name, vdpath) == 0) {
+ best_pp = pp;
+ }
+ }
+ if (match == FULL_MATCH)
+ goto out;
+ }
+ }
+ }
+
+out:
+ if (best_pp) {
+ cp = vdev_geom_attach(best_pp, vd, B_TRUE);
+ if (cp == NULL) {
+ printf("ZFS WARNING: Unable to attach to %s.\n",
+ best_pp->name);
+ }
+ }
+ return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_guids(vdev_t *vd)
+{
+ struct g_consumer *cp;
+ char *buf;
+ size_t len;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Searching by guids [%ju:%ju].",
+ (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
+ cp = vdev_geom_attach_by_guids(vd);
+ if (cp != NULL) {
+ len = strlen(cp->provider->name) + strlen("/dev/") + 1;
+ buf = kmem_alloc(len, KM_SLEEP);
+
+ snprintf(buf, len, "/dev/%s", cp->provider->name);
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = buf;
+
+ ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
+ (uintmax_t)spa_guid(vd->vdev_spa),
+ (uintmax_t)vd->vdev_guid, cp->provider->name);
+ } else {
+ ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
+ (uintmax_t)spa_guid(vd->vdev_spa),
+ (uintmax_t)vd->vdev_guid);
+ }
+
+ return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_path(vdev_t *vd, int check_guid)
+{
+ struct g_provider *pp;
+ struct g_consumer *cp;
+
+ g_topology_assert();
+
+ cp = NULL;
+ pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1);
+ if (pp != NULL) {
+ ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
+ if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
+ cp = vdev_geom_attach(pp, vd, B_FALSE);
+ }
+
+ return (cp);
+}
+
+static int
+vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ struct g_provider *pp;
+ struct g_consumer *cp;
+ int error, has_trim;
+ uint16_t rate;
+
+ /*
+ * Set the TLS to indicate downstack that we
+ * should not access zvols
+ */
+ VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if ((cp = vd->vdev_tsd) != NULL) {
+ ASSERT(vd->vdev_reopening);
+ goto skip_open;
+ }
+
+ DROP_GIANT();
+ g_topology_lock();
+ error = 0;
+
+ if (vd->vdev_spa->spa_is_splitting ||
+ ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
+ (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
+ vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) {
+ /*
+ * We are dealing with a vdev that hasn't been previously
+ * opened (since boot), and we are not loading an
+ * existing pool configuration. This looks like a
+ * vdev add operation to a new or existing pool.
+ * Assume the user really wants to do this, and find
+ * GEOM provider by its name, ignoring GUID mismatches.
+ *
+ * XXPOLICY: It would be safer to only allow a device
+ * that is unlabeled or labeled but missing
+ * GUID information to be opened in this fashion,
+ * unless we are doing a split, in which case we
+ * should allow any guid.
+ */
+ cp = vdev_geom_open_by_path(vd, 0);
+ } else {
+ /*
+ * Try using the recorded path for this device, but only
+ * accept it if its label data contains the expected GUIDs.
+ */
+ cp = vdev_geom_open_by_path(vd, 1);
+ if (cp == NULL) {
+ /*
+ * The device at vd->vdev_path doesn't have the
+ * expected GUIDs. The disks might have merely
+ * moved around so try all other GEOM providers
+ * to find one with the right GUIDs.
+ */
+ cp = vdev_geom_open_by_guids(vd);
+ }
+ }
+
+ /* Clear the TLS now that tasting is done */
+ VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
+
+ if (cp == NULL) {
+ ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
+ error = ENOENT;
+ } else {
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+ int spamode;
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (cp->private == NULL)
+ SLIST_INIT(priv);
+ elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO);
+ elem->vd = vd;
+ SLIST_INSERT_HEAD(priv, elem, elems);
+
+ spamode = spa_mode(vd->vdev_spa);
+ if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
+ !ISP2(cp->provider->sectorsize)) {
+ ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
+ cp->provider->name);
+
+ vdev_geom_close_locked(vd);
+ error = EINVAL;
+ cp = NULL;
+ } else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
+ int i;
+
+ for (i = 0; i < 5; i++) {
+ error = g_access(cp, 0, 1, 0);
+ if (error == 0)
+ break;
+ g_topology_unlock();
+ tsleep(vd, 0, "vdev", hz / 2);
+ g_topology_lock();
+ }
+ if (error != 0) {
+ printf("ZFS WARNING: Unable to open %s for "
+ "writing (error=%d).\n",
+ cp->provider->name, error);
+ vdev_geom_close_locked(vd);
+ cp = NULL;
+ }
+ }
+ }
+
+ /* Fetch initial physical path information for this device. */
+ if (cp != NULL) {
+ vdev_geom_attrchanged(cp, "GEOM::physpath");
+
+ /* Set other GEOM characteristics */
+ vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE);
+ }
+
+ g_topology_unlock();
+ PICKUP_GIANT();
+ if (cp == NULL) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
+ error);
+ return (error);
+ }
+skip_open:
+ pp = cp->provider;
+
+ /*
+ * Determine the actual size of the device.
+ */
+ *max_psize = *psize = pp->mediasize;
+
+ /*
+ * Determine the device's minimum transfer size and preferred
+ * transfer size.
+ */
+ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
+ *physical_ashift = 0;
+ if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
+ ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) &&
+ pp->stripeoffset == 0)
+ *physical_ashift = highbit(pp->stripesize) - 1;
+
+ /*
+ * Clear the nowritecache settings, so that on a vdev_reopen()
+ * we will try again.
+ */
+ vd->vdev_nowritecache = B_FALSE;
+
+ /* Inform the ZIO pipeline that we are non-rotational. */
+ error = g_getattr("GEOM::rotation_rate", cp, &rate);
+ if (error == 0 && rate == DISK_RR_NON_ROTATING)
+ vd->vdev_nonrot = B_TRUE;
+ else
+ vd->vdev_nonrot = B_FALSE;
+
+ /* Set when device reports it supports TRIM. */
+ error = g_getattr("GEOM::candelete", cp, &has_trim);
+ vd->vdev_has_trim = (error == 0 && has_trim);
+
+ /* Set when device reports it supports secure TRIM. */
+ /* unavailable on FreeBSD */
+ vd->vdev_has_securetrim = B_FALSE;
+
+ return (0);
+}
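As a worked illustration of the ashift computation above (assuming a provider that reports sectorsize 512, stripesize 4096, and stripeoffset 0): highbit(512) - 1 = 9, so *logical_ashift becomes 9; 4096 is a power of two, larger than 1 << 9 and no larger than 1 << ASHIFT_MAX, so highbit(4096) - 1 = 12 and *physical_ashift becomes 12.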
+
+static void
+vdev_geom_close(vdev_t *vd)
+{
+ struct g_consumer *cp;
+ boolean_t locked;
+
+ cp = vd->vdev_tsd;
+
+ DROP_GIANT();
+ locked = g_topology_locked();
+ if (!locked)
+ g_topology_lock();
+
+ if (!vd->vdev_reopening ||
+ (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
+ (cp->provider != NULL && cp->provider->error != 0))))
+ vdev_geom_close_locked(vd);
+
+ if (!locked)
+ g_topology_unlock();
+ PICKUP_GIANT();
+}
+
+static void
+vdev_geom_io_intr(struct bio *bp)
+{
+ vdev_t *vd;
+ zio_t *zio;
+
+ zio = bp->bio_caller1;
+ vd = zio->io_vd;
+ zio->io_error = bp->bio_error;
+ if (zio->io_error == 0 && bp->bio_resid != 0)
+ zio->io_error = SET_ERROR(EIO);
+
+ switch (zio->io_error) {
+ case ENOTSUP:
+ /*
+ * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
+ * that future attempts will never succeed. In this case
+ * we set a persistent flag so that we don't bother with
+ * requests in the future.
+ */
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ vd->vdev_nowritecache = B_TRUE;
+ break;
+ case BIO_DELETE:
+ break;
+ }
+ break;
+ case ENXIO:
+ if (!vd->vdev_remove_wanted) {
+ /*
+ * If provider's error is set we assume it is being
+ * removed.
+ */
+ if (bp->bio_to->error != 0) {
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa,
+ SPA_ASYNC_REMOVE);
+ } else if (!vd->vdev_delayed_close) {
+ vd->vdev_delayed_close = B_TRUE;
+ }
+ }
+ break;
+ }
+
+ /*
+ * We have to split bio freeing into two parts, because the ABD code
+ * cannot be called in this context and vdev_op_io_done is not called
+ * for ZIO_TYPE_IOCTL zio-s.
+ */
+ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
+ g_destroy_bio(bp);
+ zio->io_bio = NULL;
+ }
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_geom_io_start(zio_t *zio)
+{
+ vdev_t *vd;
+ struct g_consumer *cp;
+ struct bio *bp;
+
+ vd = zio->io_vd;
+
+ switch (zio->io_type) {
+ case ZIO_TYPE_IOCTL:
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ } else {
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+ if (zfs_nocacheflush ||
+ vdev_geom_bio_flush_disable)
+ break;
+ if (vd->vdev_nowritecache) {
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+ goto sendreq;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+ }
+
+ zio_execute(zio);
+ return;
+ case ZIO_TYPE_TRIM:
+ if (!vdev_geom_bio_delete_disable) {
+ goto sendreq;
+ }
+ zio_execute(zio);
+ return;
+ default:
+ ;
+ /* PASSTHROUGH --- placate compiler */
+ }
+sendreq:
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_TRIM ||
+ zio->io_type == ZIO_TYPE_IOCTL);
+
+ cp = vd->vdev_tsd;
+ if (cp == NULL) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+ bp = g_alloc_bio();
+ bp->bio_caller1 = zio;
+ switch (zio->io_type) {
+ case ZIO_TYPE_READ:
+ case ZIO_TYPE_WRITE:
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ if (zio->io_type == ZIO_TYPE_READ) {
+ bp->bio_cmd = BIO_READ;
+ bp->bio_data =
+ abd_borrow_buf(zio->io_abd, zio->io_size);
+ } else {
+ bp->bio_cmd = BIO_WRITE;
+ bp->bio_data =
+ abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ }
+ break;
+ case ZIO_TYPE_TRIM:
+ bp->bio_cmd = BIO_DELETE;
+ bp->bio_data = NULL;
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ break;
+ case ZIO_TYPE_IOCTL:
+ bp->bio_cmd = BIO_FLUSH;
+ bp->bio_data = NULL;
+ bp->bio_offset = cp->provider->mediasize;
+ bp->bio_length = 0;
+ break;
+ default:
+ panic("invalid zio->io_type: %d\n", zio->io_type);
+ }
+ bp->bio_done = vdev_geom_io_intr;
+ zio->io_bio = bp;
+
+ g_io_request(bp, cp);
+}
+
+static void
+vdev_geom_io_done(zio_t *zio)
+{
+ struct bio *bp = zio->io_bio;
+
+ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
+ ASSERT(bp == NULL);
+ return;
+ }
+
+ if (bp == NULL) {
+ ASSERT3S(zio->io_error, ==, ENXIO);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
+ else
+ abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
+
+ g_destroy_bio(bp);
+ zio->io_bio = NULL;
+}
+
+static void
+vdev_geom_hold(vdev_t *vd)
+{
+}
+
+static void
+vdev_geom_rele(vdev_t *vd)
+{
+}
+
+vdev_ops_t vdev_disk_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_geom_open,
+ .vdev_op_close = vdev_geom_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_geom_io_start,
+ .vdev_op_io_done = vdev_geom_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_geom_hold,
+ .vdev_op_rele = vdev_geom_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c
new file mode 100644
index 000000000000..97cb201934dc
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_os.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/zio.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+
+int
+vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
+{
+ spa_t *spa = vd->vdev_spa;
+ zio_t *zio;
+ abd_t *pad2;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ int error;
+
+ if (size > VDEV_PAD_SIZE)
+ return (EINVAL);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (ENODEV);
+ if (vdev_is_dead(vd))
+ return (ENXIO);
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(pad2, VDEV_PAD_SIZE);
+ abd_copy_from_buf(pad2, buf, size);
+
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+ vdev_label_write(zio, vd, 0, pad2,
+ offsetof(vdev_label_t, vl_be),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+ error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ abd_free(pad2);
+ return (error);
+}
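
vdev_label_write_pad2() above caps the payload at VDEV_PAD_SIZE, zero-fills a pad-sized ABD, and writes the vl_be area of label 0 once with ZIO_FLAG_CANFAIL; only if that attempt fails does it retry exactly once more with ZIO_FLAG_TRYHARD added. A sketch of that try-soft-then-retry-hard idiom in isolation, where issue_write() and the flag values are hypothetical stand-ins for the ZIO machinery:

/* Sketch of the retry-once-with-a-stronger-flag idiom used above.
 * issue_write() and the flag values are hypothetical stand-ins. */
#include <stdio.h>

#define DEMO_FLAG_CANFAIL	0x1
#define DEMO_FLAG_TRYHARD	0x2

static int
demo_write_with_retry(int (*issue_write)(int flags))
{
	int flags = DEMO_FLAG_CANFAIL;
	int error;

retry:
	error = issue_write(flags);
	if (error != 0 && !(flags & DEMO_FLAG_TRYHARD)) {
		flags |= DEMO_FLAG_TRYHARD;
		goto retry;
	}
	return (error);
}

static int
demo_flaky_write(int flags)
{
	/* Pretend the device only succeeds when pushed hard. */
	return ((flags & DEMO_FLAG_TRYHARD) ? 0 : 5 /* EIO-ish */);
}

int
main(void)
{
	printf("final error: %d\n", demo_write_with_retry(demo_flaky_write));
	return (0);
}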
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
new file mode 100644
index 000000000000..23b87de8bd0d
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
@@ -0,0 +1,2700 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <acl/acl_common.h>
+
+
+#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
+#define DENY ACE_ACCESS_DENIED_ACE_TYPE
+#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
+#define MIN_ACE_TYPE ALLOW
+
+#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+
+#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
+ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
+ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
+ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
+
+#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
+ ACE_DELETE|ACE_DELETE_CHILD)
+#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
+
+#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
+
+#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
+ ZFS_ACL_PROTECTED)
+
+#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
+ ZFS_ACL_OBJ_ACE)
+
+#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
+
+static uint16_t
+zfs_ace_v0_get_type(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_v0_get_flags(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_v0_get_mask(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_v0_get_who(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_v0_set_type(void *acep, uint16_t type)
+{
+ ((zfs_oldace_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_v0_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_oldace_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_v0_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_oldace_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_v0_set_who(void *acep, uint64_t who)
+{
+ ((zfs_oldace_t *)acep)->z_fuid = who;
+}
+
+/*ARGSUSED*/
+static size_t
+zfs_ace_v0_size(void *acep)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static size_t
+zfs_ace_v0_abstract_size(void)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static int
+zfs_ace_v0_mask_off(void)
+{
+ return (offsetof(zfs_oldace_t, z_access_mask));
+}
+
+/*ARGSUSED*/
+static int
+zfs_ace_v0_data(void *acep, void **datap)
+{
+ *datap = NULL;
+ return (0);
+}
+
+static acl_ops_t zfs_acl_v0_ops = {
+ zfs_ace_v0_get_mask,
+ zfs_ace_v0_set_mask,
+ zfs_ace_v0_get_flags,
+ zfs_ace_v0_set_flags,
+ zfs_ace_v0_get_type,
+ zfs_ace_v0_set_type,
+ zfs_ace_v0_get_who,
+ zfs_ace_v0_set_who,
+ zfs_ace_v0_size,
+ zfs_ace_v0_abstract_size,
+ zfs_ace_v0_mask_off,
+ zfs_ace_v0_data
+};
+
+static uint16_t
+zfs_ace_fuid_get_type(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_fuid_get_flags(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_fuid_get_mask(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_fuid_get_who(void *args)
+{
+ uint16_t entry_type;
+ zfs_ace_t *acep = args;
+
+ entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (-1);
+ return (((zfs_ace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_fuid_set_type(void *acep, uint16_t type)
+{
+ ((zfs_ace_hdr_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_ace_hdr_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_fuid_set_who(void *arg, uint64_t who)
+{
+ zfs_ace_t *acep = arg;
+
+ uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return;
+ acep->z_fuid = who;
+}
+
+static size_t
+zfs_ace_fuid_size(void *acep)
+{
+ zfs_ace_hdr_t *zacep = acep;
+ uint16_t entry_type;
+
+ switch (zacep->z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ return (sizeof (zfs_object_ace_t));
+ case ALLOW:
+ case DENY:
+ entry_type =
+ (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
+ if (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (sizeof (zfs_ace_hdr_t));
+ /*FALLTHROUGH*/
+ default:
+ return (sizeof (zfs_ace_t));
+ }
+}
+
+static size_t
+zfs_ace_fuid_abstract_size(void)
+{
+ return (sizeof (zfs_ace_hdr_t));
+}
+
+static int
+zfs_ace_fuid_mask_off(void)
+{
+ return (offsetof(zfs_ace_hdr_t, z_access_mask));
+}
+
+static int
+zfs_ace_fuid_data(void *acep, void **datap)
+{
+ zfs_ace_t *zacep = acep;
+ zfs_object_ace_t *zobjp;
+
+ switch (zacep->z_hdr.z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjp = acep;
+ *datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
+ return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
+ default:
+ *datap = NULL;
+ return (0);
+ }
+}
+
+static acl_ops_t zfs_acl_fuid_ops = {
+ zfs_ace_fuid_get_mask,
+ zfs_ace_fuid_set_mask,
+ zfs_ace_fuid_get_flags,
+ zfs_ace_fuid_set_flags,
+ zfs_ace_fuid_get_type,
+ zfs_ace_fuid_set_type,
+ zfs_ace_fuid_get_who,
+ zfs_ace_fuid_set_who,
+ zfs_ace_fuid_size,
+ zfs_ace_fuid_abstract_size,
+ zfs_ace_fuid_mask_off,
+ zfs_ace_fuid_data
+};
+
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL versions, in order to determine whether the file used to
+ * have an external ACL and which version of ACL previously existed on
+ * the file. Would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+ int error;
+
+ if (zp->z_is_sa)
+ return (0);
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+	 * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_extern_obj);
+ else {
+ /*
+ * after upgrade the SA_ZPL_ZNODE_ACL should have been
+ * removed
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (0);
+ }
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+ zfs_acl_phys_t *aclphys)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t acl_count;
+ int size;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_is_sa) {
+ if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+ &size)) != 0)
+ return (error);
+ *aclsize = size;
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+ &acl_count, sizeof (acl_count))) != 0)
+ return (error);
+ *aclcount = acl_count;
+ } else {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ aclphys, sizeof (*aclphys))) != 0)
+ return (error);
+
+ if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+ *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+ *aclcount = aclphys->z_acl_size;
+ } else {
+ *aclsize = aclphys->z_acl_size;
+ *aclcount = aclphys->z_acl_count;
+ }
+ }
+ return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+
+ if (zp->z_is_sa)
+ return (ZFS_ACL_VERSION_FUID);
+ else {
+ int error;
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+		 * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_version);
+ else {
+ /*
+ * After upgrade SA_ZPL_ZNODE_ACL should have
+ * been removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (ZFS_ACL_VERSION_FUID);
+ }
+ }
+}
+
+static int
+zfs_acl_version(int version)
+{
+ if (version < ZPL_VERSION_FUID)
+ return (ZFS_ACL_VERSION_INITIAL);
+ else
+ return (ZFS_ACL_VERSION_FUID);
+}
+
+static int
+zfs_acl_version_zp(znode_t *zp)
+{
+ return (zfs_acl_version(zp->z_zfsvfs->z_version));
+}
+
+zfs_acl_t *
+zfs_acl_alloc(int vers)
+{
+ zfs_acl_t *aclp;
+
+ aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+ list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
+ offsetof(zfs_acl_node_t, z_next));
+ aclp->z_version = vers;
+ if (vers == ZFS_ACL_VERSION_FUID)
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ else
+ aclp->z_ops = &zfs_acl_v0_ops;
+ return (aclp);
+}
+
+zfs_acl_node_t *
+zfs_acl_node_alloc(size_t bytes)
+{
+ zfs_acl_node_t *aclnode;
+
+ aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
+ if (bytes) {
+ aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
+ aclnode->z_allocdata = aclnode->z_acldata;
+ aclnode->z_allocsize = bytes;
+ aclnode->z_size = bytes;
+ }
+
+ return (aclnode);
+}
+
+static void
+zfs_acl_node_free(zfs_acl_node_t *aclnode)
+{
+ if (aclnode->z_allocsize)
+ kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
+ kmem_free(aclnode, sizeof (zfs_acl_node_t));
+}
+
+static void
+zfs_acl_release_nodes(zfs_acl_t *aclp)
+{
+ zfs_acl_node_t *aclnode;
+
+ while ((aclnode = list_head(&aclp->z_acl))) {
+ list_remove(&aclp->z_acl, aclnode);
+ zfs_acl_node_free(aclnode);
+ }
+ aclp->z_acl_count = 0;
+ aclp->z_acl_bytes = 0;
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+ zfs_acl_release_nodes(aclp);
+ list_destroy(&aclp->z_acl);
+ kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static boolean_t
+zfs_acl_valid_ace_type(uint_t type, uint_t flags)
+{
+ uint16_t entry_type;
+
+ switch (type) {
+ case ALLOW:
+ case DENY:
+ case ACE_SYSTEM_AUDIT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_ACE_TYPE:
+ entry_type = flags & ACE_TYPE_FLAGS;
+ return (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE || entry_type == 0 ||
+ entry_type == ACE_IDENTIFIER_GROUP);
+ default:
+ if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
+{
+ /*
+ * first check type of entry
+ */
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ return (B_FALSE);
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (aclp->z_version < ZFS_ACL_VERSION_FUID)
+ return (B_FALSE);
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ }
+
+ /*
+ * next check inheritance level flags
+ */
+
+ if (obj_type == VDIR &&
+ (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ if ((iflags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void *
+zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
+ uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
+{
+ zfs_acl_node_t *aclnode;
+
+ ASSERT(aclp);
+
+ if (start == NULL) {
+ aclnode = list_head(&aclp->z_acl);
+ if (aclnode == NULL)
+ return (NULL);
+
+ aclp->z_next_ace = aclnode->z_acldata;
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ }
+
+ aclnode = aclp->z_curr_node;
+
+ if (aclnode == NULL)
+ return (NULL);
+
+ if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
+ aclnode = list_next(&aclp->z_acl, aclnode);
+ if (aclnode == NULL)
+ return (NULL);
+ else {
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ aclp->z_next_ace = aclnode->z_acldata;
+ }
+ }
+
+ if (aclnode->z_ace_idx < aclnode->z_ace_count) {
+ void *acep = aclp->z_next_ace;
+ size_t ace_size;
+
+ /*
+ * Make sure we don't overstep our bounds
+ */
+ ace_size = aclp->z_ops->ace_size(acep);
+
+ if (((caddr_t)acep + ace_size) >
+ ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
+ return (NULL);
+ }
+
+ *iflags = aclp->z_ops->ace_flags_get(acep);
+ *type = aclp->z_ops->ace_type_get(acep);
+ *access_mask = aclp->z_ops->ace_mask_get(acep);
+ *who = aclp->z_ops->ace_who_get(acep);
+ aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
+ aclnode->z_ace_idx++;
+
+ return ((void *)acep);
+ }
+ return (NULL);
+}
+
+/*ARGSUSED*/
+static uint64_t
+zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
+ uint16_t *flags, uint16_t *type, uint32_t *mask)
+{
+ zfs_acl_t *aclp = datap;
+ zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
+ uint64_t who;
+
+ acep = zfs_acl_next_ace(aclp, acep, &who, mask,
+ flags, type);
+ return ((uint64_t)(uintptr_t)acep);
+}
+
+/*
+ * Copy ACE to internal ZFS format.
+ * While processing the ACL each ACE will be validated for correctness.
+ * ACE FUIDs will be created later.
+ */
+static int
+zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
+ void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
+ zfs_fuid_info_t **fuidp, cred_t *cr)
+{
+ int i;
+ uint16_t entry_type;
+ zfs_ace_t *aceptr = z_acl;
+ ace_t *acep = datap;
+ zfs_object_ace_t *zobjacep;
+ ace_object_t *aceobjp;
+
+ for (i = 0; i != aclcnt; i++) {
+ aceptr->z_hdr.z_access_mask = acep->a_access_mask;
+ aceptr->z_hdr.z_flags = acep->a_flags;
+ aceptr->z_hdr.z_type = acep->a_type;
+ entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE) {
+ aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
+ cr, (entry_type == 0) ?
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
+ }
+
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type,
+ aceptr->z_hdr.z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+
+ switch (acep->a_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjacep = (zfs_object_ace_t *)aceptr;
+ aceobjp = (ace_object_t *)acep;
+
+ bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
+ sizeof (aceobjp->a_obj_type));
+ bcopy(aceobjp->a_inherit_obj_type,
+ zobjacep->z_inherit_type,
+ sizeof (aceobjp->a_inherit_obj_type));
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
+ break;
+ default:
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
+ }
+
+ aceptr = (zfs_ace_t *)((caddr_t)aceptr +
+ aclp->z_ops->ace_size(aceptr));
+ }
+
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+
+ return (0);
+}
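
The copy loop in zfs_copy_ace_2_fuid() advances both cursors by a per-entry size rather than a fixed stride: object ACE types carry extra object/inherit GUID data and occupy the larger ace_object_t and zfs_object_ace_t records, while everything else uses the smaller fixed layouts. A standalone sketch of that variable-size walk, with hypothetical record layouts in place of the real ACE structures:

/* Sketch of walking a packed array of variable-size entries, as the
 * ACE copy loop above does; the record layouts here are hypothetical. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct demo_hdr { uint16_t type; uint16_t size; };
struct demo_small { struct demo_hdr hdr; uint32_t mask; };
struct demo_large { struct demo_hdr hdr; uint32_t mask; uint8_t guid[16]; };

int
main(void)
{
	uint8_t buf[sizeof (struct demo_small) + sizeof (struct demo_large)];
	struct demo_small s = { { 0, sizeof (s) }, 0x1 };
	struct demo_large l = { { 5, sizeof (l) }, 0x2, { 0 } };
	size_t off = 0;

	memcpy(buf, &s, sizeof (s));
	memcpy(buf + sizeof (s), &l, sizeof (l));

	/* Advance by each record's own size, never by a fixed stride. */
	while (off < sizeof (buf)) {
		struct demo_hdr hdr;

		memcpy(&hdr, buf + off, sizeof (hdr));
		printf("type %u, %u bytes\n",
		    (unsigned)hdr.type, (unsigned)hdr.size);
		off += hdr.size;
	}
	return (0);
}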
+
+/*
+ * Copy ZFS ACEs to fixed size ace_t layout
+ */
+static void
+zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
+ void *datap, int filter)
+{
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, type;
+ zfs_ace_hdr_t *zacep = NULL;
+ ace_t *acep = datap;
+ ace_object_t *objacep;
+ zfs_object_ace_t *zobjacep;
+ size_t ace_size;
+ uint16_t entry_type;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (filter) {
+ continue;
+ }
+ zobjacep = (zfs_object_ace_t *)zacep;
+ objacep = (ace_object_t *)acep;
+ bcopy(zobjacep->z_object_type,
+ objacep->a_obj_type,
+ sizeof (zobjacep->z_object_type));
+ bcopy(zobjacep->z_inherit_type,
+ objacep->a_inherit_obj_type,
+ sizeof (zobjacep->z_inherit_type));
+ ace_size = sizeof (ace_object_t);
+ break;
+ default:
+ ace_size = sizeof (ace_t);
+ break;
+ }
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ if ((entry_type != ACE_OWNER &&
+ entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE)) {
+ acep->a_who = zfs_fuid_map_id(zfsvfs, who,
+ cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
+ ZFS_ACE_GROUP : ZFS_ACE_USER);
+ } else {
+ acep->a_who = (uid_t)(int64_t)who;
+ }
+ acep->a_access_mask = access_mask;
+ acep->a_flags = iflags;
+ acep->a_type = type;
+ acep = (ace_t *)((caddr_t)acep + ace_size);
+ }
+}
+
+static int
+zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
+ zfs_oldace_t *z_acl, int aclcnt, size_t *size)
+{
+ int i;
+ zfs_oldace_t *aceptr = z_acl;
+
+ for (i = 0; i != aclcnt; i++, aceptr++) {
+ aceptr->z_access_mask = acep[i].a_access_mask;
+ aceptr->z_type = acep[i].a_type;
+ aceptr->z_flags = acep[i].a_flags;
+ aceptr->z_fuid = acep[i].a_who;
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_type, aclp, aceptr->z_type,
+ aceptr->z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+ }
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+ return (0);
+}
+
+/*
+ * convert old ACL format to new
+ */
+void
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
+{
+ zfs_oldace_t *oldaclp;
+ int i;
+ uint16_t type, iflags;
+ uint32_t access_mask;
+ uint64_t who;
+ void *cookie = NULL;
+ zfs_acl_node_t *newaclnode;
+
+ ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
+ /*
+ * First create the ACE in a contiguous piece of memory
+ * for zfs_copy_ace_2_fuid().
+ *
+ * We only convert an ACL once, so this won't happen
+	 * every time.
+ */
+ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
+ KM_SLEEP);
+ i = 0;
+ while ((cookie = zfs_acl_next_ace(aclp, cookie, &who,
+ &access_mask, &iflags, &type))) {
+ oldaclp[i].z_flags = iflags;
+ oldaclp[i].z_type = type;
+ oldaclp[i].z_fuid = who;
+ oldaclp[i++].z_access_mask = access_mask;
+ }
+
+ newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
+ sizeof (zfs_object_ace_t));
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp,
+ oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
+ &newaclnode->z_size, NULL, cr) == 0);
+ newaclnode->z_ace_count = aclp->z_acl_count;
+ aclp->z_version = ZFS_ACL_VERSION;
+ kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
+
+ /*
+ * Release all previous ACL nodes
+ */
+
+ zfs_acl_release_nodes(aclp);
+
+ list_insert_head(&aclp->z_acl, newaclnode);
+
+ aclp->z_acl_bytes = newaclnode->z_size;
+ aclp->z_acl_count = newaclnode->z_ace_count;
+
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ if (access_mask & S_IXOTH)
+ new_mask |= ACE_EXECUTE;
+ if (access_mask & S_IWOTH)
+ new_mask |= ACE_WRITE_DATA;
+ if (access_mask & S_IROTH)
+ new_mask |= ACE_READ_DATA;
+ return (new_mask);
+}
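
zfs_unix_to_v4() only inspects the low rwx triplet (the S_I?OTH bits), so a caller converting the owner or group bits would shift them down into that position first. A tiny runnable demo of the mapping, where the DEMO_ACE_* values are stand-ins for the real ACE_* constants:

/* Demo of the rwx -> v4-mask mapping; DEMO_ACE_* values are stand-ins
 * for the real ACE_* constants. */
#include <stdio.h>
#include <stdint.h>

#define DEMO_ACE_READ_DATA	0x00000001u
#define DEMO_ACE_WRITE_DATA	0x00000002u
#define DEMO_ACE_EXECUTE	0x00000004u

static uint32_t
demo_unix_to_v4(uint32_t bits)	/* low three bits: r=4 w=2 x=1 */
{
	uint32_t mask = 0;

	if (bits & 1)
		mask |= DEMO_ACE_EXECUTE;
	if (bits & 2)
		mask |= DEMO_ACE_WRITE_DATA;
	if (bits & 4)
		mask |= DEMO_ACE_READ_DATA;
	return (mask);
}

int
main(void)
{
	/* 0755: owner rwx=7, group r-x=5, other r-x=5 */
	printf("owner 0x%x group 0x%x other 0x%x\n",
	    demo_unix_to_v4((0755 >> 6) & 7),
	    demo_unix_to_v4((0755 >> 3) & 7),
	    demo_unix_to_v4(0755 & 7));
	return (0);
}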
+
+static void
+zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
+ uint16_t access_type, uint64_t fuid, uint16_t entry_type)
+{
+ uint16_t type = entry_type & ACE_TYPE_FLAGS;
+
+ aclp->z_ops->ace_mask_set(acep, access_mask);
+ aclp->z_ops->ace_type_set(acep, access_type);
+ aclp->z_ops->ace_flags_set(acep, entry_type);
+ if ((type != ACE_OWNER && type != OWNING_GROUP &&
+ type != ACE_EVERYONE))
+ aclp->z_ops->ace_who_set(acep, fuid);
+}
+
+/*
+ * Determine mode of file based on ACL.
+ */
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+ uint64_t *pflags, uint64_t fuid, uint64_t fgid)
+{
+ int entry_type;
+ mode_t mode;
+ mode_t seen = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ uint64_t who;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ boolean_t an_exec_denied = B_FALSE;
+
+ mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who,
+ &access_mask, &iflags, &type))) {
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ /*
+ * Skip over any inherit_only ACEs
+ */
+ if (iflags & ACE_INHERIT_ONLY_ACE)
+ continue;
+
+ if (entry_type == ACE_OWNER || (entry_type == 0 &&
+ who == fuid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRUSR))) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWUSR))) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXUSR))) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ } else if (entry_type == OWNING_GROUP ||
+ (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRGRP))) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWGRP))) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXGRP))) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ } else if (entry_type == ACE_EVERYONE) {
+ if ((access_mask & ACE_READ_DATA)) {
+ if (!(seen & S_IRUSR)) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if (!(seen & S_IRGRP)) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if (!(seen & S_IROTH)) {
+ seen |= S_IROTH;
+ if (type == ALLOW) {
+ mode |= S_IROTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA)) {
+ if (!(seen & S_IWUSR)) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if (!(seen & S_IWGRP)) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if (!(seen & S_IWOTH)) {
+ seen |= S_IWOTH;
+ if (type == ALLOW) {
+ mode |= S_IWOTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_EXECUTE)) {
+ if (!(seen & S_IXUSR)) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ if (!(seen & S_IXGRP)) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ if (!(seen & S_IXOTH)) {
+ seen |= S_IXOTH;
+ if (type == ALLOW) {
+ mode |= S_IXOTH;
+ }
+ }
+ }
+ } else {
+ /*
+ * Only care if this IDENTIFIER_GROUP or
+ * USER ACE denies execute access to someone,
+			 * USER ACE denies execute access to someone;
+			 * the mode is not affected.
+ if ((access_mask & ACE_EXECUTE) && type == DENY)
+ an_exec_denied = B_TRUE;
+ }
+ }
+
+ /*
+ * Failure to allow is effectively a deny, so execute permission
+ * is denied if it was never mentioned or if we explicitly
+ * weren't allowed it.
+ */
+ if (!an_exec_denied &&
+ ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
+ (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
+ an_exec_denied = B_TRUE;
+
+ if (an_exec_denied)
+ *pflags &= ~ZFS_NO_EXECS_DENIED;
+ else
+ *pflags |= ZFS_NO_EXECS_DENIED;
+
+ return (mode);
+}
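
The mode computation above is first-match-wins per permission bit: the seen mask records which rwx bits have already been decided, an ALLOW ACE turns the bit on in mode, and a DENY ACE merely marks it seen so a later ALLOW cannot re-grant it. A compact standalone model of that accumulation, with a hypothetical ACE list:

/* Sketch of the first-match-wins accumulation used by zfs_mode_compute();
 * the entry list and bit values here are hypothetical. */
#include <stdio.h>

struct demo_entry { int allow; unsigned bit; };

int
main(void)
{
	/* DENY write first, then ALLOW read and (too late) ALLOW write. */
	struct demo_entry aces[] = {
		{ 0, 0200 },	/* deny  w */
		{ 1, 0400 },	/* allow r */
		{ 1, 0200 },	/* allow w -- ignored, already seen */
	};
	int n = sizeof (aces) / sizeof (aces[0]);
	unsigned mode = 0, seen = 0;

	for (int i = 0; i < n; i++) {
		if (seen & aces[i].bit)
			continue;	/* first ACE for this bit wins */
		seen |= aces[i].bit;
		if (aces[i].allow)
			mode |= aces[i].bit;
	}
	printf("mode 0%o\n", mode);	/* prints 0400 */
	return (0);
}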
+
+/*
+ * Read an external acl object. If the intent is to modify, always
+ * create a new acl and leave any cached acl in place.
+ */
+int
+zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+ boolean_t will_modify)
+{
+ zfs_acl_t *aclp;
+ int aclsize;
+ int acl_count;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_phys_t znode_acl;
+ int version;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+
+ if (zp->z_acl_cached && !will_modify) {
+ *aclpp = zp->z_acl_cached;
+ return (0);
+ }
+
+ version = zfs_znode_acl_version(zp);
+
+ if ((error = zfs_acl_znode_info(zp, &aclsize,
+ &acl_count, &znode_acl)) != 0) {
+ goto done;
+ }
+
+ aclp = zfs_acl_alloc(version);
+
+ aclp->z_acl_count = acl_count;
+ aclp->z_acl_bytes = aclsize;
+
+ aclnode = zfs_acl_node_alloc(aclsize);
+ aclnode->z_ace_count = aclp->z_acl_count;
+ aclnode->z_size = aclsize;
+
+ if (!zp->z_is_sa) {
+ if (znode_acl.z_acl_extern_obj) {
+ error = dmu_read(zp->z_zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+ aclnode->z_acldata, DMU_READ_PREFETCH);
+ } else {
+ bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+ aclnode->z_size);
+ }
+ } else {
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
+ aclnode->z_acldata, aclnode->z_size);
+ }
+
+ if (error != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ goto done;
+ }
+
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ *aclpp = aclp;
+ if (!will_modify)
+ zp->z_acl_cached = aclp;
+done:
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+ boolean_t start, void *userdata)
+{
+ zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+ if (start) {
+ cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+ } else {
+ cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+ cb->cb_acl_node);
+ }
+ *dataptr = cb->cb_acl_node->z_acldata;
+ *length = cb->cb_acl_node->z_size;
+}
+
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+ int error;
+ zfs_acl_t *aclp;
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+ if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
+ zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
+ &zp->z_pflags, zp->z_uid, zp->z_gid);
+ return (error);
+}
+
+/*
+ * Common code for setting ACLs.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
+{
+ int error;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_type_t otype;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t mode;
+ sa_bulk_attr_t bulk[5];
+ uint64_t ctime[2];
+ int count = 0;
+ zfs_acl_phys_t acl_phys;
+
+ ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+ mode = zp->z_mode;
+
+ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+ zp->z_uid, zp->z_gid);
+
+ zp->z_mode = mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ /*
+ * Upgrade needed?
+ */
+ if (!zfsvfs->z_use_fuids) {
+ otype = DMU_OT_OLDACL;
+ } else {
+ if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
+ (zfsvfs->z_version >= ZPL_VERSION_FUID))
+ zfs_acl_xform(zp, aclp, cr);
+ ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
+ otype = DMU_OT_ACL;
+ }
+
+ /*
+	 * Arrgh, we have to handle the old on-disk format
+	 * as well as the newer (preferred) SA format.
+ */
+
+ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+ locate.cb_aclp = aclp;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+ NULL, &aclp->z_acl_count, sizeof (uint64_t));
+ } else { /* Painful legacy way */
+ zfs_acl_node_t *aclnode;
+ uint64_t off = 0;
+ uint64_t aoid;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ &acl_phys, sizeof (acl_phys))) != 0)
+ return (error);
+
+ aoid = acl_phys.z_acl_extern_obj;
+
+ if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ /*
+ * If ACL was previously external and we are now
+ * converting to new ACL format then release old
+ * ACL object and create a new one.
+ */
+ if (aoid &&
+ aclp->z_version != acl_phys.z_acl_version) {
+ error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+ if (error)
+ return (error);
+ aoid = 0;
+ }
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ otype, aclp->z_acl_bytes,
+ otype == DMU_OT_ACL ?
+ DMU_OT_SYSACL : DMU_OT_NONE,
+ otype == DMU_OT_ACL ?
+ DN_OLD_MAX_BONUSLEN : 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os,
+ aoid, aclp->z_acl_bytes, 0, tx);
+ }
+ acl_phys.z_acl_extern_obj = aoid;
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ dmu_write(zfsvfs->z_os, aoid, off,
+ aclnode->z_size, aclnode->z_acldata, tx);
+ off += aclnode->z_size;
+ }
+ } else {
+ void *start = acl_phys.z_ace_data;
+ /*
+ * Migrating back embedded?
+ */
+ if (acl_phys.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ acl_phys.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ acl_phys.z_acl_extern_obj = 0;
+ }
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ }
+ /*
+		 * If old version, then swap count/bytes to match old
+ * layout of znode_acl_phys_t.
+ */
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ acl_phys.z_acl_size = aclp->z_acl_count;
+ acl_phys.z_acl_count = aclp->z_acl_bytes;
+ } else {
+ acl_phys.z_acl_size = aclp->z_acl_bytes;
+ acl_phys.z_acl_count = aclp->z_acl_count;
+ }
+ acl_phys.z_acl_version = aclp->z_version;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (acl_phys));
+ }
+
+ /*
+ * Replace ACL wide bits, but first clear them.
+ */
+ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
+
+ zp->z_pflags |= aclp->z_hints;
+
+ if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
+ zp->z_pflags |= ZFS_ACL_TRIVIAL;
+
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
+ return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
+}
+
+static void
+zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim,
+ zfs_acl_t *aclp)
+{
+ void *acep = NULL;
+ uint64_t who;
+ int new_count, new_bytes;
+ int ace_size;
+ int entry_type;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ zfs_acl_node_t *newnode;
+ size_t abstract_size = aclp->z_ops->ace_abstract_size();
+ void *zacep;
+ boolean_t isdir;
+ trivial_acl_t masks;
+
+ new_count = new_bytes = 0;
+
+ isdir = (vtype == VDIR);
+
+ acl_trivial_access_masks((mode_t)mode, isdir, &masks);
+
+ newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+ zacep = newnode->z_acldata;
+ if (masks.allow0) {
+ zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny1) {
+ zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny2) {
+ zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ /*
+ * ACEs used to represent the file mode may be divided
+ * into an equivalent pair of inherit-only and regular
+ * ACEs, if they are inheritable.
+ * Skip regular ACEs, which are replaced by the new mode.
+ */
+ if (split && (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)) {
+ if (!isdir || !(iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ continue;
+ /*
+			 * We preserve owner@, group@, or everyone@
+ * permissions, if they are inheritable, by
+ * copying them to inherit_only ACEs. This
+ * prevents inheritable permissions from being
+ * altered along with the file mode.
+ */
+ iflags |= ACE_INHERIT_ONLY_ACE;
+ }
+
+ /*
+ * If this ACL has any inheritable ACEs, mark that in
+ * the hints (which are later masked into the pflags)
+ * so create knows to do inheritance.
+ */
+ if (isdir && (iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if ((type != ALLOW && type != DENY) ||
+ (iflags & ACE_INHERIT_ONLY_ACE)) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ break;
+ }
+ } else {
+ /*
+ * Limit permissions granted by ACEs to be no greater
+ * than permissions of the requested group mode.
+ * Applies when the "aclmode" property is set to
+ * "groupmask".
+ */
+ if ((type == ALLOW) && trim)
+ access_mask &= masks.group;
+ }
+ zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+ ace_size = aclp->z_ops->ace_size(acep);
+ zacep = (void *)((uintptr_t)zacep + ace_size);
+ new_count++;
+ new_bytes += ace_size;
+ }
+ zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE);
+
+ new_count += 3;
+ new_bytes += abstract_size * 3;
+ zfs_acl_release_nodes(aclp);
+ aclp->z_acl_count = new_count;
+ aclp->z_acl_bytes = new_bytes;
+ newnode->z_ace_count = new_count;
+ newnode->z_size = new_bytes;
+ list_insert_tail(&aclp->z_acl, newnode);
+}
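
zfs_acl_chmod() rebuilds the ACL into one new node whose worst-case size is six abstract entries plus the existing ACE bytes: up to three leading entries derived from the mode (allow0/deny1/deny2), the preserved explicit ACEs in the middle, and the three closing owner@/group@/everyone@ allow entries. A trivial sketch of that sizing arithmetic, with illustrative numbers in place of the real ace_abstract_size() and z_acl_bytes values:

/* Sketch of the worst-case allocation for the rebuilt ACL node: up to 3
 * leading plus exactly 3 trailing abstract entries around the preserved
 * explicit ACEs. The sizes below are illustrative only. */
#include <stdio.h>
#include <stddef.h>

int
main(void)
{
	size_t abstract_size = 16;	/* stand-in for ace_abstract_size() */
	size_t old_acl_bytes = 200;	/* stand-in for aclp->z_acl_bytes */
	size_t worst_case = abstract_size * 6 + old_acl_bytes;

	printf("allocate %zu bytes for the new node\n", worst_case);
	return (0);
}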
+
+int
+zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
+{
+ int error = 0;
+
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
+ *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+ else
+ error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+
+ if (error == 0) {
+ (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+ zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE,
+ (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Should ACE be inherited?
+ */
+static int
+zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags)
+{
+ int iflags = (acep_flags & 0xf);
+
+ if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+ return (1);
+ else if (iflags & ACE_FILE_INHERIT_ACE)
+ return (!((vtype == VDIR) &&
+ (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
+ return (0);
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
+ uint64_t mode, boolean_t *need_chmod)
+{
+ void *pacep = NULL;
+ void *acep;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_t *aclp = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, newflags, type;
+ size_t ace_size;
+ void *data1, *data2;
+ size_t data1sz, data2sz;
+ uint_t aclinherit;
+ boolean_t isdir = (vtype == VDIR);
+ boolean_t isreg = (vtype == VREG);
+
+ *need_chmod = B_TRUE;
+
+ aclp = zfs_acl_alloc(paclp->z_version);
+ aclinherit = zfsvfs->z_acl_inherit;
+ if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK)
+ return (aclp);
+
+ while ((pacep = zfs_acl_next_ace(paclp, pacep, &who,
+ &access_mask, &iflags, &type))) {
+
+ /*
+ * don't inherit bogus ACEs
+ */
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ /*
+ * Check if ACE is inheritable by this vnode
+ */
+ if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) ||
+ !zfs_ace_can_use(vtype, iflags))
+ continue;
+
+ /*
+		 * If an owner@, group@, or everyone@ ACE is inheritable,
+		 * then zfs_acl_chmod() isn't needed.
+ */
+ if ((aclinherit == ZFS_ACL_PASSTHROUGH ||
+ aclinherit == ZFS_ACL_PASSTHROUGH_X) &&
+ ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
+ ((iflags & OWNING_GROUP) == OWNING_GROUP)) &&
+ (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE))))
+ *need_chmod = B_FALSE;
+
+ /*
+ * Strip inherited execute permission from file if
+ * not in mode
+ */
+ if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW &&
+ !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) {
+ access_mask &= ~ACE_EXECUTE;
+ }
+
+ /*
+ * Strip write_acl and write_owner from permissions
+ * when inheriting an ACE
+ */
+ if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) {
+ access_mask &= ~RESTRICTED_CLEAR;
+ }
+
+ ace_size = aclp->z_ops->ace_size(pacep);
+ aclnode = zfs_acl_node_alloc(ace_size);
+ list_insert_tail(&aclp->z_acl, aclnode);
+ acep = aclnode->z_acldata;
+
+ zfs_set_ace(aclp, acep, access_mask, type,
+ who, iflags|ACE_INHERITED_ACE);
+
+ /*
+ * Copy special opaque data if any
+ */
+ if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) {
+ VERIFY((data2sz = aclp->z_ops->ace_data(acep,
+ &data2)) == data1sz);
+ bcopy(data1, data2, data2sz);
+ }
+
+ aclp->z_acl_count++;
+ aclnode->z_ace_count++;
+ aclp->z_acl_bytes += aclnode->z_size;
+ newflags = aclp->z_ops->ace_flags_get(acep);
+
+ /*
+ * If ACE is not to be inherited further, or if the vnode is
+ * not a directory, remove all inheritance flags
+ */
+ if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ newflags &= ~ALL_INHERIT;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ continue;
+ }
+
+ /*
+ * This directory has an inheritable ACE
+ */
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ /*
+ * If only FILE_INHERIT is set then turn on
+ * inherit_only
+ */
+ if ((iflags & (ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
+ newflags |= ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ } else {
+ newflags &= ~ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ }
+ }
+ if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+ aclp->z_acl_count != 0) {
+ *need_chmod = B_FALSE;
+ }
+
+ return (aclp);
+}
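
The subtle part of zfs_acl_inherit() is how the inheritance flags are rewritten on each inherited copy: if the child is not a directory, or the source ACE says no-propagate, every inheritance bit is stripped; if only FILE_INHERIT is set on a directory child, the copy becomes inherit-only so it affects future files but not the directory itself; and every copy is stamped ACE_INHERITED_ACE. A standalone sketch of that rewrite, with stand-in flag values:

/* Sketch of the inheritance-flag rewrite applied to each inherited ACE;
 * the flag values are stand-ins for the real ACE_* inheritance bits. */
#include <stdio.h>

#define F_FILE_INHERIT	0x01u
#define F_DIR_INHERIT	0x02u
#define F_NO_PROPAGATE	0x04u
#define F_INHERIT_ONLY	0x08u
#define F_INHERITED	0x80u
#define F_ALL_INHERIT	(F_FILE_INHERIT|F_DIR_INHERIT|F_NO_PROPAGATE|F_INHERIT_ONLY)

static unsigned
demo_inherit_flags(unsigned iflags, int child_is_dir)
{
	unsigned newflags = iflags;

	if (!child_is_dir || (iflags & F_NO_PROPAGATE)) {
		/* Terminal copy: strip every inheritance bit. */
		newflags &= ~F_ALL_INHERIT;
	} else if ((iflags & (F_FILE_INHERIT | F_DIR_INHERIT)) ==
	    F_FILE_INHERIT) {
		/* Applies only to files below, not to this directory. */
		newflags |= F_INHERIT_ONLY;
	} else {
		newflags &= ~F_INHERIT_ONLY;
	}
	return (newflags | F_INHERITED);
}

int
main(void)
{
	printf("file child:     0x%x\n", demo_inherit_flags(F_FILE_INHERIT, 0));
	printf("dir, file-only: 0x%x\n", demo_inherit_flags(F_FILE_INHERIT, 1));
	printf("dir, both:      0x%x\n",
	    demo_inherit_flags(F_FILE_INHERIT | F_DIR_INHERIT, 1));
	return (0);
}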
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ * Also, create FUIDs for owner and group.
+ */
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+ vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
+{
+ int error;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_acl_t *paclp;
+ gid_t gid;
+ boolean_t need_chmod = B_TRUE;
+ boolean_t trim = B_FALSE;
+ boolean_t inherited = B_FALSE;
+
+ if ((flag & IS_ROOT_NODE) == 0) {
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ } else
+ ASSERT(dzp->z_vnode == NULL);
+ bzero(acl_ids, sizeof (zfs_acl_ids_t));
+ acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+ if (vsecp)
+ if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
+ &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+ return (error);
+ /*
+ * Determine uid and gid.
+ */
+ if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
+ ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
+ acl_ids->z_fuid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr,
+ ZFS_OWNER, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid, cr,
+ ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ } else {
+ acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+ cr, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = 0;
+ if (vap->va_mask & AT_GID) {
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ if (acl_ids->z_fgid != dzp->z_gid &&
+ !groupmember(vap->va_gid, cr) &&
+ secpolicy_vnode_create_gid(cr) != 0)
+ acl_ids->z_fgid = 0;
+ }
+ if (acl_ids->z_fgid == 0) {
+ char *domain;
+ uint32_t rid;
+
+ acl_ids->z_fgid = dzp->z_gid;
+ gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
+ cr, ZFS_GROUP);
+
+ if (zfsvfs->z_use_fuids &&
+ IS_EPHEMERAL(acl_ids->z_fgid)) {
+ domain =
+ zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx,
+ FUID_INDEX(acl_ids->z_fgid));
+ rid = FUID_RID(acl_ids->z_fgid);
+ zfs_fuid_node_add(&acl_ids->z_fuidp,
+ domain, rid, FUID_INDEX(acl_ids->z_fgid),
+ acl_ids->z_fgid, ZFS_GROUP);
+ }
+ }
+ }
+
+ /*
+ * If we're creating a directory, and the parent directory has the
+	 * set-GID bit set, set it on the new directory.
+ * Otherwise, if the user is neither privileged nor a member of the
+ * file's new group, clear the file's set-GID bit.
+ */
+
+ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
+ (vap->va_type == VDIR)) {
+ acl_ids->z_mode |= S_ISGID;
+ } else {
+ if ((acl_ids->z_mode & S_ISGID) &&
+ secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0)
+ acl_ids->z_mode &= ~S_ISGID;
+ }
+
+ if (acl_ids->z_aclp == NULL) {
+ mutex_enter(&dzp->z_acl_lock);
+ if (!(flag & IS_ROOT_NODE) &&
+ (dzp->z_pflags & ZFS_INHERIT_ACE) &&
+ !(dzp->z_pflags & ZFS_XATTR)) {
+ VERIFY0(zfs_acl_node_read(dzp, B_TRUE,
+ &paclp, B_FALSE));
+ acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
+ vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
+ inherited = B_TRUE;
+ } else {
+ acl_ids->z_aclp =
+ zfs_acl_alloc(zfs_acl_version_zp(dzp));
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+ mutex_exit(&dzp->z_acl_lock);
+
+ if (need_chmod) {
+ if (vap->va_type == VDIR)
+ acl_ids->z_aclp->z_hints |=
+ ZFS_ACL_AUTO_INHERIT;
+
+ if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
+ trim = B_TRUE;
+ zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE,
+ trim, acl_ids->z_aclp);
+ }
+ }
+
+ if (inherited || vsecp) {
+ acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+ acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+
+ return (0);
+}
+
+/*
+ * Free ACL and fuid_infop, but not the acl_ids structure
+ */
+void
+zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
+{
+ if (acl_ids->z_aclp)
+ zfs_acl_free(acl_ids->z_aclp);
+ if (acl_ids->z_fuidp)
+ zfs_fuid_info_free(acl_ids->z_fuidp);
+ acl_ids->z_aclp = NULL;
+ acl_ids->z_fuidp = NULL;
+}
+
+boolean_t
+zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid)
+{
+ return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) ||
+ zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) ||
+ (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid)));
+}
+
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ ulong_t mask;
+ int error;
+ int count = 0;
+ int largeace = 0;
+
+ mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
+ VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
+
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ /*
+ * Scan ACL to determine number of ACEs
+ */
+ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
+ void *zacep = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t type, iflags;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ largeace++;
+ continue;
+ default:
+ count++;
+ }
+ }
+ vsecp->vsa_aclcnt = count;
+ } else
+ count = (int)aclp->z_acl_count;
+
+ if (mask & VSA_ACECNT) {
+ vsecp->vsa_aclcnt = count;
+ }
+
+ if (mask & VSA_ACE) {
+ size_t aclsz;
+
+ aclsz = count * sizeof (ace_t) +
+ sizeof (ace_object_t) * largeace;
+
+ vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
+ vsecp->vsa_aclentsz = aclsz;
+
+ if (aclp->z_version == ZFS_ACL_VERSION_FUID)
+ zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr,
+ vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
+ else {
+ zfs_acl_node_t *aclnode;
+ void *start = vsecp->vsa_aclentp;
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
+ aclp->z_acl_bytes);
+ }
+ }
+ if (mask & VSA_ACE_ACLFLAGS) {
+ vsecp->vsa_aclflags = 0;
+ if (zp->z_pflags & ZFS_ACL_DEFAULTED)
+ vsecp->vsa_aclflags |= ACL_DEFAULTED;
+ if (zp->z_pflags & ZFS_ACL_PROTECTED)
+ vsecp->vsa_aclflags |= ACL_PROTECTED;
+ if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
+ vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ return (0);
+}
+
+int
+zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_type,
+ vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
+{
+ zfs_acl_t *aclp;
+ zfs_acl_node_t *aclnode;
+ int aclcnt = vsecp->vsa_aclcnt;
+ int error;
+
+ if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
+ return (SET_ERROR(EINVAL));
+
+ aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
+
+ aclp->z_hints = 0;
+ aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ if ((error = zfs_copy_ace_2_oldace(obj_type, aclp,
+ (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
+ aclcnt, &aclnode->z_size)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ } else {
+ if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp,
+ vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
+ &aclnode->z_size, fuidp, cr)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ }
+ aclp->z_acl_bytes = aclnode->z_size;
+ aclnode->z_ace_count = aclcnt;
+ aclp->z_acl_count = aclcnt;
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ /*
+ * If flags are being set then add them to z_hints
+ */
+ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
+ if (vsecp->vsa_aclflags & ACL_PROTECTED)
+ aclp->z_hints |= ZFS_ACL_PROTECTED;
+ if (vsecp->vsa_aclflags & ACL_DEFAULTED)
+ aclp->z_hints |= ZFS_ACL_DEFAULTED;
+ if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
+ aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+ }
+
+ *zaclp = aclp;
+
+ return (0);
+}
+
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_t *aclp;
+ zfs_fuid_info_t *fuidp = NULL;
+ boolean_t fuid_dirtied;
+ uint64_t acl_obj;
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ return (SET_ERROR(EPERM));
+
+ if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp,
+ &aclp);
+ if (error)
+ return (error);
+
+ /*
+ * If ACL wide flags aren't being set then preserve any
+ * existing flags.
+ */
+ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
+ aclp->z_hints |=
+ (zp->z_pflags & V4_ACL_WIDE_FLAGS);
+ }
+top:
+ mutex_enter(&zp->z_acl_lock);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ /*
+ * If old version and ACL won't fit in bonus and we aren't
+ * upgrading then take out necessary DMU holds
+ */
+
+ if ((acl_obj = zfs_external_acl(zp)) != 0) {
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+ }
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (error) {
+ mutex_exit(&zp->z_acl_lock);
+
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zfs_acl_free(aclp);
+ return (error);
+ }
+
+ error = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT(error == 0);
+ ASSERT(zp->z_acl_cached == NULL);
+ zp->z_acl_cached = aclp;
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
+
+ if (fuidp)
+ zfs_fuid_info_free(fuidp);
+ dmu_tx_commit(tx);
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Check accesses of interest (AoI) against attributes of the dataset
+ * such as read-only. Returns zero if no AoI conflict with dataset
+ * attributes, otherwise an appropriate errno is returned.
+ */
+static int
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
+{
+ if ((v4_mode & WRITE_MASK) &&
+ (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+ (!IS_DEVVP(ZTOV(zp)) ||
+ (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * Intentionally allow ZFS_READONLY through here.
+ * See zfs_zaccess_common().
+ */
+ if ((v4_mode & WRITE_MASK_DATA) &&
+ (zp->z_pflags & ZFS_IMMUTABLE)) {
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+	 * On FreeBSD we allow modifying a directory's contents if ZFS_NOUNLINK
+	 * (sunlnk) is set. We just don't allow directory removal, which is
+ * handled in zfs_zaccess_delete().
+ */
+ if ((v4_mode & ACE_DELETE) &&
+ (zp->z_pflags & ZFS_NOUNLINK)) {
+ return (EPERM);
+ }
+
+ if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+ return (SET_ERROR(EACCES));
+ }
+
+ return (0);
+}
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied. The AoI are expressed as bits in
+ * the working_mode parameter. As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode. This removal
+ * facilitates two things. The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI. The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE. Removing the
+ * discovered access or denial enforces this rule. At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses. Returns:
+ * 0 if all AoI granted
+ *	EACCES if the denied mask is non-zero
+ * other error if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted. If an ACE grants any access in
+ * the working_mode, we immediately short circuit out of the function.
+ * This mode is chosen by setting anyaccess to B_TRUE. The
+ * working_mode is not a denied access mask upon exit if the function
+ * is used in this manner.
+ */
+static int
+zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
+ boolean_t anyaccess, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zfs_acl_t *aclp;
+ int error;
+ uid_t uid = crgetuid(cr);
+ uint64_t who;
+ uint16_t type, iflags;
+ uint16_t entry_type;
+ uint32_t access_mask;
+ uint32_t deny_mask = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ boolean_t checkit;
+ uid_t gowner;
+ uid_t fowner;
+
+ zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ ASSERT(zp->z_acl_cached);
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ uint32_t mask_matched;
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE))
+ continue;
+
+ /* Skip ACE if it does not affect any AoI */
+ mask_matched = (access_mask & *working_mode);
+ if (!mask_matched)
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ checkit = B_FALSE;
+
+ switch (entry_type) {
+ case ACE_OWNER:
+ if (uid == fowner)
+ checkit = B_TRUE;
+ break;
+ case OWNING_GROUP:
+ who = gowner;
+ /*FALLTHROUGH*/
+ case ACE_IDENTIFIER_GROUP:
+ checkit = zfs_groupmember(zfsvfs, who, cr);
+ break;
+ case ACE_EVERYONE:
+ checkit = B_TRUE;
+ break;
+
+ /* USER Entry */
+ default:
+ if (entry_type == 0) {
+ uid_t newid;
+
+ newid = zfs_fuid_map_id(zfsvfs, who, cr,
+ ZFS_ACE_USER);
+ if (newid != UID_NOBODY &&
+ uid == newid)
+ checkit = B_TRUE;
+ break;
+ } else {
+ mutex_exit(&zp->z_acl_lock);
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ if (checkit) {
+ if (type == DENY) {
+ DTRACE_PROBE3(zfs__ace__denies,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ deny_mask |= mask_matched;
+ } else {
+ DTRACE_PROBE3(zfs__ace__allows,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ if (anyaccess) {
+ mutex_exit(&zp->z_acl_lock);
+ return (0);
+ }
+ }
+ *working_mode &= ~mask_matched;
+ }
+
+ /* Are we done? */
+ if (*working_mode == 0)
+ break;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ /* Put the found 'denies' back on the working mode */
+ if (deny_mask) {
+ *working_mode |= deny_mask;
+ return (SET_ERROR(EACCES));
+ } else if (*working_mode) {
+ return (-1);
+ }
+
+ return (0);
+}
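
To make the mask bookkeeping concrete, here is a minimal userland sketch of the same walk. The struct and helper names are hypothetical, and the per-entry matching (owner, group, everyone) is reduced to a single "applies" flag; the early exit when the working mode reaches zero mirrors the "Are we done?" check in the real loop.

/* Hypothetical, simplified model of the working_mode walk above. */
#include <stdbool.h>
#include <stdint.h>

struct simple_ace {
	uint32_t mask;		/* access bits covered by this entry */
	bool deny;		/* DENY entry if true, ALLOW otherwise */
	bool applies;		/* does the entry match the caller? */
};

/*
 * Strip matched bits from *working_mode as entries are processed.  On
 * return, *working_mode holds the denied bits plus any bits that no
 * entry covered; zero means every requested bit was explicitly allowed.
 */
static uint32_t
check_aces(const struct simple_ace *aces, int n, uint32_t *working_mode)
{
	uint32_t deny_mask = 0;
	int i;

	for (i = 0; i < n && *working_mode != 0; i++) {
		uint32_t matched = aces[i].mask & *working_mode;

		if (matched == 0 || !aces[i].applies)
			continue;
		if (aces[i].deny)
			deny_mask |= matched;
		*working_mode &= ~matched;
	}
	*working_mode |= deny_mask;
	return (*working_mode);
}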
+
+/*
+ * Return true if any access whatsoever is granted; we don't actually
+ * care what access is granted.
+ */
+boolean_t
+zfs_has_access(znode_t *zp, cred_t *cr)
+{
+ uint32_t have = ACE_ALL_PERMS;
+
+ if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
+ uid_t owner;
+
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
+ }
+ return (B_TRUE);
+}
+
+static int
+zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
+ boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int err;
+
+ *working_mode = v4_mode;
+ *check_privs = B_TRUE;
+
+ /*
+ * Short circuit empty requests
+ */
+ if (v4_mode == 0 || zfsvfs->z_replay) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
+ *check_privs = B_FALSE;
+ return (err);
+ }
+
+ /*
+ * The caller requested that the ACL check be skipped. This
+ * would only happen if the caller checked VOP_ACCESS() with a
+ * 32 bit ACE mask and already had the appropriate permissions.
+ */
+ if (skipaclchk) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ /*
+ * Note: ZFS_READONLY represents the "DOS R/O" attribute.
+ * When that flag is set, we should behave as if write access
+ * were not granted by anything in the ACL. In particular:
+ * We _must_ allow writes after opening the file r/w, then
+ * setting the DOS R/O attribute, and writing some more.
+ * (Similar to how you can write after fchmod(fd, 0444).)
+ *
+ * Therefore ZFS_READONLY is ignored in the dataset check
+ * above, and checked here as if part of the ACL check.
+ * Also note: DOS R/O is ignored for directories.
+ */
+ if ((v4_mode & WRITE_MASK_DATA) &&
+ (ZTOV(zp)->v_type != VDIR) &&
+ (zp->z_pflags & ZFS_READONLY)) {
+ return (SET_ERROR(EPERM));
+ }
+
+ return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
+}
+
+static int
+zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
+ cred_t *cr)
+{
+ if (*working_mode != ACE_WRITE_DATA)
+ return (SET_ERROR(EACCES));
+
+ return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
+ check_privs, B_FALSE, cr));
+}
+
+/*
+ * Check if VEXEC is allowed.
+ *
+ * This routine is based on zfs_fastaccesschk_execute, whose slowpath
+ * calls zfs_zaccess. That would be incorrect on FreeBSD (see
+ * zfs_freebsd_access for the difference). Thus this variant lets the
+ * caller handle the slowpath (if necessary).
+ *
+ * On top of that we perform a lockless check for ZFS_NO_EXECS_DENIED.
+ *
+ * Safe access to znode_t is provided by the vnode lock.
+ */
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+ boolean_t is_attr;
+
+ if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+ return (1);
+
+ is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
+ (ZTOV(zdp)->v_type == VDIR));
+ if (is_attr)
+ return (1);
+
+ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED)
+ return (0);
+
+ return (1);
+}
+
+
+/*
+ * Determine whether Access should be granted/denied.
+ *
+ * The least priv subsystem is always consulted as a basic privilege
+ * can define any form of access.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+{
+ uint32_t working_mode;
+ int error;
+ int is_attr;
+ boolean_t check_privs;
+ znode_t *xzp = NULL;
+ znode_t *check_zp = zp;
+ mode_t needed_bits;
+ uid_t owner;
+
+ is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
+
+#ifdef __FreeBSD_kernel__
+ /*
+ * In FreeBSD, we don't care about permissions of individual ADS.
+ * Note that not checking them is not just an optimization - without
+ * this shortcut, EA operations may bogusly fail with EACCES.
+ */
+ if (zp->z_pflags & ZFS_XATTR)
+ return (0);
+#else
+ /*
+ * If attribute then validate against base file
+ */
+ if (is_attr) {
+ uint64_t parent;
+
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
+ sizeof (parent))) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zp->z_zfsvfs,
+ parent, &xzp)) != 0) {
+ return (error);
+ }
+
+ check_zp = xzp;
+
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+#endif
+
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ /*
+ * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
+ * in needed_bits. Map the bits mapped by working_mode (currently
+ * missing) in missing_bits.
+ * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
+ * needed_bits.
+ */
+ needed_bits = 0;
+
+ working_mode = mode;
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ owner == crgetuid(cr))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VREAD;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VWRITE;
+ if (working_mode & ACE_EXECUTE)
+ needed_bits |= VEXEC;
+
+ if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
+ &check_privs, skipaclchk, cr)) == 0) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits));
+ }
+
+ if (error && !check_privs) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (error);
+ }
+
+ if (error && (flags & V_APPEND)) {
+ error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
+ }
+
+ if (error && check_privs) {
+ mode_t checkmode = 0;
+ vnode_t *check_vp = ZTOV(check_zp);
+
+ /*
+ * First check for implicit owner permission on
+ * read_acl/read_attributes
+ */
+
+ error = 0;
+ ASSERT(working_mode != 0);
+
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
+ owner == crgetuid(cr)))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= VREAD;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= VWRITE;
+ if (working_mode & ACE_EXECUTE)
+ checkmode |= VEXEC;
+
+ error = secpolicy_vnode_access2(cr, check_vp, owner,
+ needed_bits & ~checkmode, needed_bits);
+
+ if (error == 0 && (working_mode & ACE_WRITE_OWNER))
+ error = secpolicy_vnode_chown(check_vp, cr, owner);
+ if (error == 0 && (working_mode & ACE_WRITE_ACL))
+ error = secpolicy_vnode_setdac(check_vp, cr, owner);
+
+ if (error == 0 && (working_mode &
+ (ACE_DELETE|ACE_DELETE_CHILD)))
+ error = secpolicy_vnode_remove(check_vp, cr);
+
+ if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
+ error = secpolicy_vnode_chown(check_vp, cr, owner);
+ }
+ if (error == 0) {
+ /*
+ * See if any bits other than those already checked
+ * for are still present. If so then return EACCES
+ */
+ if (working_mode & ~(ZFS_CHECKED_MASKS)) {
+ error = SET_ERROR(EACCES);
+ }
+ }
+ } else if (error == 0) {
+ error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits);
+ }
+
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+
+ return (error);
+}
+
+/*
+ * Translate traditional unix VREAD/VWRITE/VEXEC mode into
+ * NFSv4-style ZFS ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+{
+ return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+}
+
+/*
+ * Access function for secpolicy_vnode_setattr
+ */
+int
+zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+{
+ int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+ return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+}
+
+static int
+zfs_delete_final_check(znode_t *zp, znode_t *dzp,
+ mode_t available_perms, cred_t *cr)
+{
+ int error;
+ uid_t downer;
+
+ downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
+
+ error = secpolicy_vnode_access2(cr, ZTOV(dzp),
+ downer, available_perms, VWRITE|VEXEC);
+
+ if (error == 0)
+ error = zfs_sticky_remove_access(dzp, zp, cr);
+
+ return (error);
+}
+
+/*
+ * Determine whether Access should be granted/denied, without
+ * consulting the least priv subsystem.
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ * -------------------------------------------------------
+ * | Parent Dir | Target Object Permissions |
+ * | permissions | |
+ * -------------------------------------------------------
+ * | | ACL Allows | ACL Denies| Delete |
+ * | | Delete | Delete | unspecified|
+ * -------------------------------------------------------
+ * | ACL Allows | Permit | Permit | Permit |
+ * | DELETE_CHILD | |
+ * -------------------------------------------------------
+ * | ACL Denies | Permit | Deny | Deny |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL specifies | | | |
+ * | only allow | Permit | Permit | Permit |
+ * | write and | | | |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * | ACL denies | | | |
+ * | write and | Permit | Deny | Deny |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * ^
+ * |
+ * No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+ uint32_t dzp_working_mode = 0;
+ uint32_t zp_working_mode = 0;
+ int dzp_error, zp_error;
+ mode_t available_perms;
+ boolean_t dzpcheck_privs = B_TRUE;
+ boolean_t zpcheck_privs = B_TRUE;
+
+ /*
+ * We want specific DELETE permissions to
+ * take precedence over WRITE/EXECUTE. We don't
+ * want an ACL such as this to mess us up.
+ * user:joe:write_data:deny,user:joe:delete:allow
+ *
+ * However, deny permissions may ultimately be overridden
+ * by secpolicy_vnode_access().
+ *
+ * We will ask for all of the necessary permissions and then
+ * look at the working modes from the directory and target object
+ * to determine what was found.
+ */
+
+ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+ return (SET_ERROR(EPERM));
+
+ /*
+ * First row
+ * If the directory permissions allow the delete, we are done.
+ */
+ if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
+ return (0);
+
+ /*
+ * If target object has delete permission then we are done
+ */
+ if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
+ &zpcheck_privs, B_FALSE, cr)) == 0)
+ return (0);
+
+ ASSERT(dzp_error && zp_error);
+
+ if (!dzpcheck_privs)
+ return (dzp_error);
+ if (!zpcheck_privs)
+ return (zp_error);
+
+ /*
+ * Second row
+ *
+ * If directory returns EACCES then delete_child was denied
+ * due to deny delete_child. In this case send the request through
+ * secpolicy_vnode_remove(). We don't use zfs_delete_final_check()
+ * since that *could* allow the delete based on write/execute permission
+ * and we want delete permissions to override write/execute.
+ */
+
+ if (dzp_error == EACCES) {
+ /* XXXPJD: s/dzp/zp/ ? */
+ return (secpolicy_vnode_remove(ZTOV(dzp), cr));
+ }
+ /*
+ * Third Row
+ * only need to see if we have write/execute on directory.
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
+
+ if (dzp_error != 0 && !dzpcheck_privs)
+ return (dzp_error);
+
+ /*
+ * Fourth row
+ */
+
+ available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE;
+ available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC;
+
+ return (zfs_delete_final_check(zp, dzp, available_perms, cr));
+}
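
The chart collapses into a short decision. The sketch below models it with plain booleans; the names are hypothetical, and the later privilege overrides (secpolicy_vnode_remove() and friends) are deliberately ignored.

/* Hypothetical boolean model of the delete chart; privilege checks omitted. */
#include <stdbool.h>

static bool
may_delete(bool dir_allows_delete_child, bool dir_denies_delete_child,
    bool obj_allows_delete, bool dir_allows_write_exec)
{
	/* First row, plus any explicit DELETE grant on the target. */
	if (dir_allows_delete_child || obj_allows_delete)
		return (true);
	/* Second row: an explicit deny of delete_child wins. */
	if (dir_denies_delete_child)
		return (false);
	/* Third and fourth rows: fall back to write+execute on the parent. */
	return (dir_allows_write_exec);
}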
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+ znode_t *tzp, cred_t *cr)
+{
+ int add_perm;
+ int error;
+
+ if (szp->z_pflags & ZFS_AV_QUARANTINED)
+ return (SET_ERROR(EACCES));
+
+ add_perm = (ZTOV(szp)->v_type == VDIR) ?
+ ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+ /*
+ * Rename permissions are a combination of delete permission +
+ * add file/subdir permission.
+ *
+ * BSD operating systems also require write permission
+ * on the directory being moved from one parent directory
+ * to another.
+ */
+ if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) {
+ if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)))
+ return (error);
+ }
+
+ /*
+ * first make sure we do the delete portion.
+ *
+ * If that succeeds then check for add_file/add_subdir permissions
+ */
+
+ if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
+ return (error);
+
+ /*
+ * If we have a tzp, see if we can delete it?
+ */
+ if (tzp && (error = zfs_zaccess_delete(tdzp, tzp, cr)))
+ return (error);
+
+ /*
+ * Now check for add permissions
+ */
+ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+ return (error);
+}
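
Putting the pieces together, a rename is the conjunction of the checks performed above. A hypothetical boolean summary (the extra FreeBSD write check on a directory that changes parents is left out for brevity):

/* Hypothetical composition of the rename checks; names are illustrative. */
#include <stdbool.h>

static bool
may_rename(bool can_delete_source, bool target_exists,
    bool can_delete_target, bool can_add_to_target_dir)
{
	if (!can_delete_source)
		return (false);
	if (target_exists && !can_delete_target)
		return (false);
	return (can_add_to_target_dir);
}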
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
new file mode 100644
index 000000000000..f472aecdbafb
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -0,0 +1,1360 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future. The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ * ctldir ------> snapshotdir -------> snapshot
+ * |
+ * |
+ * V
+ * mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land. The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, we call zfsctl_unmount(), which
+ * unmounts any snapshots within the snapshot directory.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
+ * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
+ * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
+ * However, vnodes within these mounted-on file systems have their v_vfsp
+ * fields set to the head filesystem to make NFS happy (see
+ * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
+ * so that it cannot be freed until all snapshots have been unmounted.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/libkern.h>
+#include <sys/dirent.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/namei.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_deleg.h>
+#include <sys/mount.h>
+#include <sys/zap.h>
+#include <sys/sysproto.h>
+
+#include "zfs_namecheck.h"
+
+#include <sys/kernel.h>
+#include <sys/ccompat.h>
+
+/* Common access mode for all virtual directories under the ctldir */
+const uint16_t zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+ S_IROTH | S_IXOTH;
+
+/*
+ * "Synthetic" filesystem implementation.
+ */
+
+/*
+ * Assert that A implies B.
+ */
+#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg));
+
+static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
+
+typedef struct sfs_node {
+ char sn_name[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t sn_parent_id;
+ uint64_t sn_id;
+} sfs_node_t;
+
+/*
+ * Check the parent's ID as well as the node's to account for a chance
+ * that IDs originating from different domains (snapshot IDs, artificial
+ * IDs, znode IDs) may clash.
+ */
+static int
+sfs_compare_ids(struct vnode *vp, void *arg)
+{
+ sfs_node_t *n1 = vp->v_data;
+ sfs_node_t *n2 = arg;
+ bool equal;
+
+ equal = n1->sn_id == n2->sn_id &&
+ n1->sn_parent_id == n2->sn_parent_id;
+
+ /* Zero means equality. */
+ return (!equal);
+}
+
+static int
+sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ sfs_node_t search;
+ int err;
+
+ search.sn_id = id;
+ search.sn_parent_id = parent_id;
+ err = vfs_hash_get(mp, (uint32_t)id, flags, curthread, vpp,
+ sfs_compare_ids, &search);
+ return (err);
+}
+
+static int
+sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ int err;
+
+ KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
+ err = vfs_hash_insert(vp, (uint32_t)id, flags, curthread, vpp,
+ sfs_compare_ids, vp->v_data);
+ return (err);
+}
+
+static void
+sfs_vnode_remove(struct vnode *vp)
+{
+ vfs_hash_remove(vp);
+}
+
+typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
+
+static int
+sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
+ const char *tag, struct vop_vector *vops,
+ sfs_vnode_setup_fn setup, void *arg,
+ struct vnode **vpp)
+{
+ struct vnode *vp;
+ int error;
+
+ error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ /* Allocate a new vnode/inode. */
+ error = getnewvnode(tag, mp, vops, &vp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ /*
+ * Exclusively lock the vnode while it's being constructed.
+ */
+ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
+ error = insmntque(vp, mp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ setup(vp, arg);
+
+ error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ *vpp = vp;
+ return (0);
+}
+
+static void
+sfs_print_node(sfs_node_t *node)
+{
+ printf("\tname = %s\n", node->sn_name);
+ printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
+ printf("\tid = %ju\n", (uintmax_t)node->sn_id);
+}
+
+static sfs_node_t *
+sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
+{
+ struct sfs_node *node;
+
+ KASSERT(strlen(name) < sizeof (node->sn_name),
+ ("sfs node name is too long"));
+ KASSERT(size >= sizeof (*node), ("sfs node size is too small"));
+ node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
+ strlcpy(node->sn_name, name, sizeof (node->sn_name));
+ node->sn_parent_id = parent_id;
+ node->sn_id = id;
+
+ return (node);
+}
+
+static void
+sfs_destroy_node(sfs_node_t *node)
+{
+ free(node, M_SFSNODES);
+}
+
+static void *
+sfs_reclaim_vnode(vnode_t *vp)
+{
+ void *data;
+
+ sfs_vnode_remove(vp);
+ data = vp->v_data;
+ vp->v_data = NULL;
+ return (data);
+}
+
+static int
+sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
+ zfs_uio_t *uio, off_t *offp)
+{
+ struct dirent entry;
+ int error;
+
+ /* Reset ncookies for subsequent use of vfs_read_dirent. */
+ if (ap->a_ncookies != NULL)
+ *ap->a_ncookies = 0;
+
+ if (zfs_uio_resid(uio) < sizeof (entry))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_uio_offset(uio) < 0)
+ return (SET_ERROR(EINVAL));
+ if (zfs_uio_offset(uio) == 0) {
+ entry.d_fileno = id;
+ entry.d_type = DT_DIR;
+ entry.d_name[0] = '.';
+ entry.d_name[1] = '\0';
+ entry.d_namlen = 1;
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, zfs_uio_offset(uio));
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ if (zfs_uio_offset(uio) < sizeof (entry))
+ return (SET_ERROR(EINVAL));
+ if (zfs_uio_offset(uio) == sizeof (entry)) {
+ entry.d_fileno = parent_id;
+ entry.d_type = DT_DIR;
+ entry.d_name[0] = '.';
+ entry.d_name[1] = '.';
+ entry.d_name[2] = '\0';
+ entry.d_namlen = 2;
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, zfs_uio_offset(uio));
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ if (offp != NULL)
+ *offp = 2 * sizeof (entry);
+ return (0);
+}
+
+
+/*
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem. We use the following scheme:
+ *
+ * ENTRY ZFSCTL_INODE
+ * .zfs 1
+ * .zfs/snapshot 2
+ * .zfs/snapshot/<snap> objectid(snap)
+ */
+#define ZFSCTL_INO_SNAP(id) (id)
+
+static struct vop_vector zfsctl_ops_root;
+static struct vop_vector zfsctl_ops_snapdir;
+static struct vop_vector zfsctl_ops_snapshot;
+
+void
+zfsctl_init(void)
+{
+}
+
+void
+zfsctl_fini(void)
+{
+}
+
+boolean_t
+zfsctl_is_node(vnode_t *vp)
+{
+ return (vn_matchops(vp, zfsctl_ops_root) ||
+ vn_matchops(vp, zfsctl_ops_snapdir) ||
+ vn_matchops(vp, zfsctl_ops_snapshot));
+}
+
+typedef struct zfsctl_root {
+ sfs_node_t node;
+ sfs_node_t *snapdir;
+ timestruc_t cmtime;
+} zfsctl_root_t;
+
+
+/*
+ * Create the '.zfs' directory.
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+ zfsctl_root_t *dot_zfs;
+ sfs_node_t *snapdir;
+ vnode_t *rvp;
+ uint64_t crtime[2];
+
+ ASSERT(zfsvfs->z_ctldir == NULL);
+
+ snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT,
+ ZFSCTL_INO_SNAPDIR);
+ dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof (*dot_zfs), ".zfs", 0,
+ ZFSCTL_INO_ROOT);
+ dot_zfs->snapdir = snapdir;
+
+ VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
+ VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+ &crtime, sizeof (crtime)));
+ ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
+ vput(rvp);
+
+ zfsvfs->z_ctldir = dot_zfs;
+}
+
+/*
+ * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
+ * The nodes must not have any associated vnodes by now as they should be
+ * vflush-ed.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+ sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
+ sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
+ zfsvfs->z_ctldir = NULL;
+}
+
+static int
+zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ return (VFS_ROOT(mp, flags, vpp));
+}
+
+static void
+zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
+{
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ /* We support shared locking. */
+ VN_LOCK_ASHARE(vp);
+ vp->v_type = VDIR;
+ vp->v_data = arg;
+}
+
+static int
+zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ void *node;
+ int err;
+
+ node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir;
+ err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
+ zfsctl_common_vnode_setup, node, vpp);
+ return (err);
+}
+
+static int
+zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ void *node;
+ int err;
+
+ node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir->snapdir;
+ err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
+ &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
+ return (err);
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+int
+zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
+{
+ int error;
+
+ error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
+ return (error);
+}
+
+/*
+ * Common open routine. Disallow any write access.
+ */
+static int
+zfsctl_common_open(struct vop_open_args *ap)
+{
+ int flags = ap->a_mode;
+
+ if (flags & FWRITE)
+ return (SET_ERROR(EACCES));
+
+ return (0);
+}
+
+/*
+ * Common close routine. Nothing to do here.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(struct vop_close_args *ap)
+{
+ return (0);
+}
+
+/*
+ * Common access routine. Disallow writes.
+ */
+static int
+zfsctl_common_access(struct vop_access_args *ap)
+{
+ accmode_t accmode = ap->a_accmode;
+
+ if (accmode & VWRITE)
+ return (SET_ERROR(EACCES));
+ return (0);
+}
+
+/*
+ * Common getattr function. Fill in basic information.
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+ timestruc_t now;
+ sfs_node_t *node;
+
+ node = vp->v_data;
+
+ vap->va_uid = 0;
+ vap->va_gid = 0;
+ vap->va_rdev = 0;
+ /*
+ * We are a purely virtual object, so we have no
+ * blocksize or allocated blocks.
+ */
+ vap->va_blksize = 0;
+ vap->va_nblocks = 0;
+ vap->va_seq = 0;
+ vn_fsid(vp, vap);
+ vap->va_mode = zfsctl_ctldir_mode;
+ vap->va_type = VDIR;
+ /*
+ * We live in the now (for atime).
+ */
+ gethrestime(&now);
+ vap->va_atime = now;
+ /* FreeBSD: Reset chflags(2) flags. */
+ vap->va_flags = 0;
+
+ vap->va_nodeid = node->sn_id;
+
+ /* At least '.' and '..'. */
+ vap->va_nlink = 2;
+}
+
+#ifndef _OPENSOLARIS_SYS_VNODE_H_
+struct vop_fid_args {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+};
+#endif
+
+static int
+zfsctl_common_fid(struct vop_fid_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ fid_t *fidp = (void *)ap->a_fid;
+ sfs_node_t *node = vp->v_data;
+ uint64_t object = node->sn_id;
+ zfid_short_t *zfid;
+ int i;
+
+ zfid = (zfid_short_t *)fidp;
+ zfid->zf_len = SHORT_FID_LEN;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* .zfs nodes always have a generation number of 0 */
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = 0;
+
+ return (0);
+}
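
The object id is written into the fid one byte at a time, least-significant byte first. A standalone, illustrative encoder/decoder pair for that byte layout (hypothetical helpers, parameterized on the field width) might look like this:

/* Illustrative little-endian byte packing, as used for zf_object above. */
#include <stddef.h>
#include <stdint.h>

static void
pack_id(uint64_t object, uint8_t *out, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		out[i] = (uint8_t)(object >> (8 * i));
}

static uint64_t
unpack_id(const uint8_t *in, size_t len)
{
	uint64_t object = 0;
	size_t i;

	for (i = 0; i < len; i++)
		object |= (uint64_t)in[i] << (8 * i);
	return (object);
}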
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_reclaim_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfsctl_common_reclaim(struct vop_reclaim_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+
+ (void) sfs_reclaim_vnode(vp);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_print_args {
+ struct vnode *a_vp;
+};
+#endif
+
+static int
+zfsctl_common_print(struct vop_print_args *ap)
+{
+ sfs_print_node(ap->a_vp->v_data);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getattr_args {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+
+/*
+ * Get root directory attributes.
+ */
+static int
+zfsctl_root_getattr(struct vop_getattr_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct vattr *vap = ap->a_vap;
+ zfsctl_root_t *node = vp->v_data;
+
+ zfsctl_common_getattr(vp, vap);
+ vap->va_ctime = node->cmtime;
+ vap->va_mtime = vap->va_ctime;
+ vap->va_birthtime = vap->va_ctime;
+ vap->va_nlink += 1; /* snapdir */
+ vap->va_size = vap->va_nlink;
+ return (0);
+}
+
+/*
+ * When we lookup "." we still can be asked to lock it
+ * differently, can't we?
+ */
+static int
+zfsctl_relock_dot(vnode_t *dvp, int ltype)
+{
+ vref(dvp);
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+ /* Relocking for the "." case may have left us with a reclaimed vnode. */
+ if (VN_IS_DOOMED(dvp)) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
+ }
+ return (0);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+static int
+zfsctl_root_lookup(struct vop_lookup_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ int flags = ap->a_cnp->cn_flags;
+ int lkflags = ap->a_cnp->cn_lkflags;
+ int nameiop = ap->a_cnp->cn_nameiop;
+ int err;
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+
+ if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+ err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+ if (err == 0)
+ *vpp = dvp;
+ } else if ((flags & ISDOTDOT) != 0) {
+ err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
+ lkflags, vpp);
+ } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
+ err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
+ } else {
+ err = SET_ERROR(ENOENT);
+ }
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+zfsctl_root_readdir(struct vop_readdir_args *ap)
+{
+ struct dirent entry;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfsctl_root_t *node = vp->v_data;
+ zfs_uio_t uio;
+ int *eofp = ap->a_eofflag;
+ off_t dots_offset;
+ int error;
+
+ zfs_uio_init(&uio, ap->a_uio);
+
+ ASSERT(vp->v_type == VDIR);
+
+ error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, &uio,
+ &dots_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
+ return (error);
+ }
+ if (zfs_uio_offset(&uio) != dots_offset)
+ return (SET_ERROR(EINVAL));
+
+ CTASSERT(sizeof (node->snapdir->sn_name) <= sizeof (entry.d_name));
+ entry.d_fileno = node->snapdir->sn_id;
+ entry.d_type = DT_DIR;
+ strcpy(entry.d_name, node->snapdir->sn_name);
+ entry.d_namlen = strlen(entry.d_name);
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio));
+ if (error != 0) {
+ if (error == ENAMETOOLONG)
+ error = 0;
+ return (SET_ERROR(error));
+ }
+ if (eofp != NULL)
+ *eofp = 1;
+ return (0);
+}
+
+static int
+zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
+{
+ static const char dotzfs_name[4] = ".zfs";
+ vnode_t *dvp;
+ int error;
+
+ if (*ap->a_buflen < sizeof (dotzfs_name))
+ return (SET_ERROR(ENOMEM));
+
+ error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
+ LK_SHARED, &dvp);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ VOP_UNLOCK1(dvp);
+ *ap->a_vpp = dvp;
+ *ap->a_buflen -= sizeof (dotzfs_name);
+ bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
+ return (0);
+}
+
+static int
+zfsctl_common_pathconf(struct vop_pathconf_args *ap)
+{
+ /*
+ * We care about ACL variables so that user land utilities like ls
+ * can display them correctly. Since the ctldir's st_dev is set to be
+ * the same as the parent dataset, we must support all variables that
+ * it supports.
+ */
+ switch (ap->a_name) {
+ case _PC_LINK_MAX:
+ *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX);
+ return (0);
+
+ case _PC_FILESIZEBITS:
+ *ap->a_retval = 64;
+ return (0);
+
+ case _PC_MIN_HOLE_SIZE:
+ *ap->a_retval = (int)SPA_MINBLOCKSIZE;
+ return (0);
+
+ case _PC_ACL_EXTENDED:
+ *ap->a_retval = 0;
+ return (0);
+
+ case _PC_ACL_NFS4:
+ *ap->a_retval = 1;
+ return (0);
+
+ case _PC_ACL_PATH_MAX:
+ *ap->a_retval = ACL_MAX_ENTRIES;
+ return (0);
+
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
+
+ default:
+ return (vop_stdpathconf(ap));
+ }
+}
+
+/*
+ * Returns a trivial ACL
+ */
+static int
+zfsctl_common_getacl(struct vop_getacl_args *ap)
+{
+ int i;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0);
+ /*
+ * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
+ * attributes. That is not the case for the ctldir, so we must clear
+ * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs
+ * aren't supported by the ctldir.
+ */
+ for (i = 0; i < ap->a_aclp->acl_cnt; i++) {
+ struct acl_entry *entry;
+ entry = &(ap->a_aclp->acl_entry[i]);
+ entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
+ ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS |
+ ACL_READ_NAMED_ATTRS);
+ }
+
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_root = {
+ .vop_default = &default_vnodeops,
+#if __FreeBSD_version >= 1300121
+ .vop_fplookup_vexec = VOP_EAGAIN,
+#endif
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_ioctl = VOP_EINVAL,
+ .vop_getattr = zfsctl_root_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = zfsctl_root_readdir,
+ .vop_lookup = zfsctl_root_lookup,
+ .vop_inactive = VOP_NULL,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+ .vop_print = zfsctl_common_print,
+ .vop_vptocnp = zfsctl_root_vptocnp,
+ .vop_pathconf = zfsctl_common_pathconf,
+ .vop_getacl = zfsctl_common_getacl,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root);
+
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+
+ dmu_objset_name(os, zname);
+ if (strlen(zname) + 1 + strlen(name) >= len)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strcat(zname, "@");
+ (void) strcat(zname, name);
+ return (0);
+}
+
+static int
+zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
+{
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+ int err;
+
+ err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id);
+ return (err);
+}
+
+/*
+ * Given a vnode, get the root vnode of a filesystem mounted on top of
+ * it, if any. The root vnode is referenced and locked.
+ * If no filesystem is mounted then the original vnode remains referenced
+ * and locked. If any error happens the original vnode is unlocked and
+ * released.
+ */
+static int
+zfsctl_mounted_here(vnode_t **vpp, int flags)
+{
+ struct mount *mp;
+ int err;
+
+ ASSERT_VOP_LOCKED(*vpp, __func__);
+ ASSERT3S((*vpp)->v_type, ==, VDIR);
+
+ if ((mp = (*vpp)->v_mountedhere) != NULL) {
+ err = vfs_busy(mp, 0);
+ KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
+ KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
+ vput(*vpp);
+ err = VFS_ROOT(mp, flags, vpp);
+ vfs_unbusy(mp);
+ return (err);
+ }
+ return (EJUSTRETURN);
+}
+
+typedef struct {
+ const char *snap_name;
+ uint64_t snap_id;
+} snapshot_setup_arg_t;
+
+static void
+zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
+{
+ snapshot_setup_arg_t *ssa = arg;
+ sfs_node_t *node;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ node = sfs_alloc_node(sizeof (sfs_node_t),
+ ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
+ zfsctl_common_vnode_setup(vp, node);
+
+ /* We have to support recursive locking. */
+ VN_LOCK_AREC(vp);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ * There are four possibilities:
+ * - the snapshot node and vnode do not exist
+ * - the snapshot vnode is covered by the mounted snapshot
+ * - the snapshot vnode is not covered yet, the mount operation is in progress
+ * - the snapshot vnode is not covered, because the snapshot has been unmounted
+ * The last two states are transient and should be relatively short-lived.
+ */
+static int
+zfsctl_snapdir_lookup(struct vop_lookup_args *ap)
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ char name[NAME_MAX + 1];
+ char fullname[ZFS_MAX_DATASET_NAME_LEN];
+ char *mountpoint;
+ size_t mountpoint_len;
+ zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ uint64_t snap_id;
+ int nameiop = cnp->cn_nameiop;
+ int lkflags = cnp->cn_lkflags;
+ int flags = cnp->cn_flags;
+ int err;
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+
+ if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+ err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+ if (err == 0)
+ *vpp = dvp;
+ return (err);
+ }
+ if (flags & ISDOTDOT) {
+ err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
+ vpp);
+ return (err);
+ }
+
+ if (cnp->cn_namelen >= sizeof (name))
+ return (SET_ERROR(ENAMETOOLONG));
+
+ strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+ err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
+ if (err != 0)
+ return (SET_ERROR(ENOENT));
+
+ for (;;) {
+ snapshot_setup_arg_t ssa;
+
+ ssa.snap_name = name;
+ ssa.snap_id = snap_id;
+ err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
+ snap_id, "zfs", &zfsctl_ops_snapshot,
+ zfsctl_snapshot_vnode_setup, &ssa, vpp);
+ if (err != 0)
+ return (err);
+
+ /* Check if a new vnode has just been created. */
+ if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
+ break;
+
+ /*
+ * Check if a snapshot is already mounted on top of the vnode.
+ */
+ err = zfsctl_mounted_here(vpp, lkflags);
+ if (err != EJUSTRETURN)
+ return (err);
+
+ /*
+ * If the vnode is not covered, then either the mount operation
+ * is in progress or the snapshot has already been unmounted
+ * but the vnode hasn't been inactivated and reclaimed yet.
+ * We can try to re-use the vnode in the latter case.
+ */
+ VI_LOCK(*vpp);
+ if (((*vpp)->v_iflag & VI_MOUNT) == 0) {
+ /*
+ * Upgrade to exclusive lock in order to:
+ * - avoid race conditions
+ * - satisfy the contract of mount_snapshot()
+ */
+ err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK);
+ if (err == 0)
+ break;
+ } else {
+ VI_UNLOCK(*vpp);
+ }
+
+ /*
+ * In this state we can loop on uncontested locks and starve
+ * the thread doing the lengthy, non-trivial mount operation.
+ * So, yield to prevent that from happening.
+ */
+ vput(*vpp);
+ kern_yield(PRI_USER);
+ }
+
+ VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof (fullname), fullname));
+
+ mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
+ strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
+ mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+ (void) snprintf(mountpoint, mountpoint_len,
+ "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
+ dvp->v_vfsp->mnt_stat.f_mntonname, name);
+
+ err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
+ kmem_free(mountpoint, mountpoint_len);
+ if (err == 0) {
+ /*
+ * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
+ *
+ * This is where we lie about our v_vfsp in order to
+ * make .zfs/snapshot/<snapname> accessible over NFS
+ * without requiring manual mounts of <snapname>.
+ */
+ ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
+ VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
+
+ /* Clear the root flag (set via VFS_ROOT) as well. */
+ (*vpp)->v_vflag &= ~VV_ROOT;
+ }
+
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+zfsctl_snapdir_readdir(struct vop_readdir_args *ap)
+{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ struct dirent entry;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfs_uio_t uio;
+ int *eofp = ap->a_eofflag;
+ off_t dots_offset;
+ int error;
+
+ zfs_uio_init(&uio, ap->a_uio);
+
+ ASSERT(vp->v_type == VDIR);
+
+ error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap,
+ &uio, &dots_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
+ return (error);
+ }
+
+ ZFS_ENTER(zfsvfs);
+ for (;;) {
+ uint64_t cookie;
+ uint64_t id;
+
+ cookie = zfs_uio_offset(&uio) - dots_offset;
+
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname),
+ snapname, &id, &cookie, NULL);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error != 0) {
+ if (error == ENOENT) {
+ if (eofp != NULL)
+ *eofp = 1;
+ error = 0;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ entry.d_fileno = id;
+ entry.d_type = DT_DIR;
+ strcpy(entry.d_name, snapname);
+ entry.d_namlen = strlen(entry.d_name);
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio));
+ if (error != 0) {
+ if (error == ENAMETOOLONG)
+ error = 0;
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(error));
+ }
+ zfs_uio_setoffset(&uio, cookie + dots_offset);
+ }
+ /* NOTREACHED */
+}
+
+static int
+zfsctl_snapdir_getattr(struct vop_getattr_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ dsl_dataset_t *ds;
+ uint64_t snap_count;
+ int err;
+
+ ZFS_ENTER(zfsvfs);
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ zfsctl_common_getattr(vp, vap);
+ vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+ vap->va_mtime = vap->va_ctime;
+ vap->va_birthtime = vap->va_ctime;
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+ err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+ if (err != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ vap->va_nlink += snap_count;
+ }
+ vap->va_size = vap->va_nlink;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_snapdir = {
+ .vop_default = &default_vnodeops,
+#if __FreeBSD_version >= 1300121
+ .vop_fplookup_vexec = VOP_EAGAIN,
+#endif
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_getattr = zfsctl_snapdir_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = zfsctl_snapdir_readdir,
+ .vop_lookup = zfsctl_snapdir_lookup,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+ .vop_print = zfsctl_common_print,
+ .vop_pathconf = zfsctl_common_pathconf,
+ .vop_getacl = zfsctl_common_getacl,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir);
+
+
+static int
+zfsctl_snapshot_inactive(struct vop_inactive_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+
+ VERIFY(vrecycle(vp) == 1);
+ return (0);
+}
+
+static int
+zfsctl_snapshot_reclaim(struct vop_reclaim_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ void *data = vp->v_data;
+
+ sfs_reclaim_vnode(vp);
+ sfs_destroy_node(data);
+ return (0);
+}
+
+static int
+zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
+{
+ struct mount *mp;
+ vnode_t *dvp;
+ vnode_t *vp;
+ sfs_node_t *node;
+ size_t len;
+ int locked;
+ int error;
+
+ vp = ap->a_vp;
+ node = vp->v_data;
+ len = strlen(node->sn_name);
+ if (*ap->a_buflen < len)
+ return (SET_ERROR(ENOMEM));
+
+ /*
+ * Prevent unmounting of the snapshot while the vnode lock
+ * is not held. That is not strictly required, but allows
+ * us to assert that an uncovered snapshot vnode is never
+ * "leaked".
+ */
+ mp = vp->v_mountedhere;
+ if (mp == NULL)
+ return (SET_ERROR(ENOENT));
+ error = vfs_busy(mp, 0);
+ KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
+
+ /*
+ * We can vput the vnode as we can now depend on the reference owned
+ * by the busied mp. But we also need to hold the vnode, because
+ * the reference may go after vfs_unbusy() which has to be called
+ * before we can lock the vnode again.
+ */
+ locked = VOP_ISLOCKED(vp);
+#if __FreeBSD_version >= 1300045
+ enum vgetstate vs = vget_prep(vp);
+#else
+ vhold(vp);
+#endif
+ vput(vp);
+
+ /* Look up .zfs/snapshot, our parent. */
+ error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
+ if (error == 0) {
+ VOP_UNLOCK1(dvp);
+ *ap->a_vpp = dvp;
+ *ap->a_buflen -= len;
+ bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
+ }
+ vfs_unbusy(mp);
+#if __FreeBSD_version >= 1300045
+ vget_finish(vp, locked | LK_RETRY, vs);
+#else
+ vget(vp, locked | LK_VNHELD | LK_RETRY, curthread);
+#endif
+ return (error);
+}
+
+/*
+ * These VP's should never see the light of day. They should always
+ * be covered.
+ */
+static struct vop_vector zfsctl_ops_snapshot = {
+#if __FreeBSD_version >= 1300121
+ .vop_fplookup_vexec = VOP_EAGAIN,
+#endif
+ .vop_inactive = zfsctl_snapshot_inactive,
+#if __FreeBSD_version >= 1300045
+ .vop_need_inactive = vop_stdneed_inactive,
+#endif
+ .vop_reclaim = zfsctl_snapshot_reclaim,
+ .vop_vptocnp = zfsctl_snapshot_vptocnp,
+ .vop_lock1 = vop_stdlock,
+ .vop_unlock = vop_stdunlock,
+ .vop_islocked = vop_stdislocked,
+ .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */
+ .vop_print = zfsctl_common_print,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot);
+
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+ zfsvfs_t *zfsvfs __unused = vfsp->vfs_data;
+ vnode_t *vp;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+ *zfsvfsp = NULL;
+ error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+ ZFSCTL_INO_SNAPDIR, objsetid, &vp);
+ if (error == 0 && vp != NULL) {
+ /*
+ * XXX Probably need to at least reference, if not busy, the mp.
+ */
+ if (vp->v_mountedhere != NULL)
+ *zfsvfsp = vp->v_mountedhere->mnt_data;
+ vput(vp);
+ }
+ if (*zfsvfsp == NULL)
+ return (SET_ERROR(EINVAL));
+ return (0);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem. This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ struct mount *mp;
+ vnode_t *vp;
+ uint64_t cookie;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+
+ cookie = 0;
+ for (;;) {
+ uint64_t id;
+
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname),
+ snapname, &id, &cookie, NULL);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error != 0) {
+ if (error == ENOENT)
+ error = 0;
+ break;
+ }
+
+ for (;;) {
+ error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+ ZFSCTL_INO_SNAPDIR, id, &vp);
+ if (error != 0 || vp == NULL)
+ break;
+
+ mp = vp->v_mountedhere;
+
+ /*
+ * v_mountedhere being NULL means that the
+ * (uncovered) vnode is in a transient state
+ * (mounting or unmounting), so loop until it
+ * settles down.
+ */
+ if (mp != NULL)
+ break;
+ vput(vp);
+ }
+ if (error != 0)
+ break;
+ if (vp == NULL)
+ continue; /* no mountpoint, nothing to do */
+
+ /*
+ * The mount-point vnode is kept locked to avoid spurious EBUSY
+ * from a concurrent umount.
+ * The vnode lock must have recursive locking enabled.
+ */
+ vfs_ref(mp);
+ error = dounmount(mp, fflags, curthread);
+ KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
+ ("extra references after unmount"));
+ vput(vp);
+ if (error != 0)
+ break;
+ }
+ KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
+ ("force unmounting failed"));
+ return (error);
+}
+
+int
+zfsctl_snapshot_unmount(const char *snapname, int flags __unused)
+{
+ vfs_t *vfsp = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+
+ if (strchr(snapname, '@') == NULL)
+ return (0);
+
+ int err = getzfsvfs(snapname, &zfsvfs);
+ if (err != 0) {
+ ASSERT3P(zfsvfs, ==, NULL);
+ return (0);
+ }
+ vfsp = zfsvfs->z_vfs;
+
+ ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
+
+ vfs_ref(vfsp);
+ vfs_unbusy(vfsp);
+ return (dounmount(vfsp, MS_FORCE, curthread));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c
new file mode 100644
index 000000000000..74742ad3669f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c
@@ -0,0 +1,251 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/kstat.h>
+
+typedef struct zfs_dbgmsg {
+ list_node_t zdm_node;
+ time_t zdm_timestamp;
+ int zdm_size;
+ char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size = 0;
+kmutex_t zfs_dbgmsgs_lock;
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+kstat_t *zfs_dbgmsg_kstat;
+
+/*
+ * Internal ZFS debug messages are enabled by default.
+ *
+ * # Print debug messages
+ * dtrace -n 'zfs-dbgmsg { print(stringof(arg0)); }'
+ *
+ * # Disable the kernel debug message log.
+ * sysctl vfs.zfs.dbgmsg_enable=0
+ */
+int zfs_dbgmsg_enable = 1;
+
+static int
+zfs_dbgmsg_headers(char *buf, size_t size)
+{
+ (void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message");
+
+ return (0);
+}
+
+static int
+zfs_dbgmsg_data(char *buf, size_t size, void *data)
+{
+ zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data;
+
+ (void) snprintf(buf, size, "%-12llu %-s\n",
+ (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
+
+ return (0);
+}
+
+static void *
+zfs_dbgmsg_addr(kstat_t *ksp, loff_t n)
+{
+ zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private;
+
+ ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+ if (n == 0)
+ ksp->ks_private = list_head(&zfs_dbgmsgs);
+ else if (zdm)
+ ksp->ks_private = list_next(&zfs_dbgmsgs, zdm);
+
+ return (ksp->ks_private);
+}
+
+static void
+zfs_dbgmsg_purge(int max_size)
+{
+ zfs_dbgmsg_t *zdm;
+ int size;
+
+ ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+ while (zfs_dbgmsg_size > max_size) {
+ zdm = list_remove_head(&zfs_dbgmsgs);
+ if (zdm == NULL)
+ return;
+
+ size = zdm->zdm_size;
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+}
+
+static int
+zfs_dbgmsg_update(kstat_t *ksp, int rw)
+{
+ if (rw == KSTAT_WRITE)
+ zfs_dbgmsg_purge(0);
+
+ return (0);
+}
+
+void
+zfs_dbgmsg_init(void)
+{
+ list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+ offsetof(zfs_dbgmsg_t, zdm_node));
+ mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ if (zfs_dbgmsg_kstat) {
+ zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock;
+ zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX;
+ zfs_dbgmsg_kstat->ks_private = NULL;
+ zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update;
+ kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers,
+ zfs_dbgmsg_data, zfs_dbgmsg_addr);
+ kstat_install(zfs_dbgmsg_kstat);
+ }
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+ if (zfs_dbgmsg_kstat)
+ kstat_delete(zfs_dbgmsg_kstat);
+ /*
+ * TODO - decide how to make this permanent
+ */
+#ifdef _KERNEL
+ mutex_enter(&zfs_dbgmsgs_lock);
+ zfs_dbgmsg_purge(0);
+ mutex_exit(&zfs_dbgmsgs_lock);
+ mutex_destroy(&zfs_dbgmsgs_lock);
+#endif
+}
+
+void
+__zfs_dbgmsg(char *buf)
+{
+ zfs_dbgmsg_t *zdm;
+ int size;
+
+ DTRACE_PROBE1(zfs__dbgmsg, char *, buf);
+
+ size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+ zdm = kmem_zalloc(size, KM_SLEEP);
+ zdm->zdm_size = size;
+ zdm->zdm_timestamp = gethrestime_sec();
+ strcpy(zdm->zdm_msg, buf);
+
+ mutex_enter(&zfs_dbgmsgs_lock);
+ list_insert_tail(&zfs_dbgmsgs, zdm);
+ zfs_dbgmsg_size += size;
+ zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
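
The debug log is a size-bounded FIFO: messages are appended at the tail, and the oldest entries are dropped from the head until the total byte count is back under the cap. A minimal userland sketch of the same structure, with hypothetical names and plain malloc in place of kmem:

#include <stdlib.h>
#include <string.h>
#include <sys/queue.h>

struct msg {
	STAILQ_ENTRY(msg) link;
	size_t size;
	char text[];		/* variable length, NUL-terminated */
};

static STAILQ_HEAD(, msg) msgs = STAILQ_HEAD_INITIALIZER(msgs);
static size_t msgs_total;

static void
msgs_purge(size_t cap)
{
	while (msgs_total > cap) {
		struct msg *m = STAILQ_FIRST(&msgs);

		if (m == NULL)
			return;
		STAILQ_REMOVE_HEAD(&msgs, link);
		msgs_total -= m->size;
		free(m);
	}
}

static void
msgs_append(const char *text, size_t cap)
{
	size_t size = sizeof (struct msg) + strlen(text) + 1;
	struct msg *m = malloc(size);

	if (m == NULL)
		return;		/* drop the message on allocation failure */
	m->size = size;
	strcpy(m->text, text);
	STAILQ_INSERT_TAIL(&msgs, m, link);
	msgs_total += size;
	msgs_purge(cap);
}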
+
+void
+__set_error(const char *file, const char *func, int line, int err)
+{
+ /*
+ * To enable this:
+ *
+ * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
+ */
+ if (zfs_flags & ZFS_DEBUG_SET_ERROR)
+ __dprintf(B_FALSE, file, func, line, "error %d", err);
+}
+
+#ifdef _KERNEL
+void
+__dprintf(boolean_t dprint, const char *file, const char *func,
+ int line, const char *fmt, ...)
+{
+ const char *newfile;
+ va_list adx;
+ size_t size;
+ char *buf;
+ char *nl;
+ int i;
+
+ size = 1024;
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ /*
+ * Get rid of annoying prefix to filename.
+ */
+ newfile = strrchr(file, '/');
+ if (newfile != NULL) {
+ newfile = newfile + 1; /* Get rid of leading / */
+ } else {
+ newfile = file;
+ }
+
+ i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);
+
+ if (i < size) {
+ va_start(adx, fmt);
+ (void) vsnprintf(buf + i, size - i, fmt, adx);
+ va_end(adx);
+ }
+
+ /*
+ * Get rid of trailing newline.
+ */
+ nl = strrchr(buf, '\n');
+ if (nl != NULL)
+ *nl = '\0';
+
+ __zfs_dbgmsg(buf);
+
+ kmem_free(buf, size);
+}
+
+#else
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+ zfs_dbgmsg_t *zdm;
+
+ (void) printf("ZFS_DBGMSG(%s):\n", tag);
+ mutex_enter(&zfs_dbgmsgs_lock);
+ for (zdm = list_head(&zfs_dbgmsgs); zdm;
+ zdm = list_next(&zfs_dbgmsgs, zdm))
+ (void) printf("%s\n", zdm->zdm_msg);
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
+#endif /* _KERNEL */
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, dbgmsg_enable, INT, ZMOD_RW,
+ "Enable ZFS debug message log");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dbgmsg_maxsize, INT, ZMOD_RW,
+ "Maximum ZFS debug log size");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
new file mode 100644
index 000000000000..fb01012dd6e7
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
@@ -0,0 +1,968 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/extdirent.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/sunddi.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/condvar.h>
+#include <sys/callb.h>
+#include <sys/smp.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+
+#include <sys/ccompat.h>
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+ matchtype_t mt, uint64_t *zoid)
+{
+ int error;
+
+ if (zfsvfs->z_norm) {
+
+ /*
+ * In the non-mixed case we only expect there would ever
+ * be one match, but we need to use the normalizing lookup.
+ */
+ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
+ zoid, mt, NULL, 0, NULL);
+ } else {
+ error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
+ }
+ *zoid = ZFS_DIRENT_OBJ(*zoid);
+
+ return (error);
+}
+
+/*
+ * Look up a directory entry under a locked vnode.
+ * dvp being locked gives us a guarantee that there are no concurrent
+ * modifications of the directory and, thus, if a node can be found in
+ * the directory, then it must not be unlinked.
+ *
+ * Input arguments:
+ * dzp - znode for directory
+ * name - name of entry to lock
+ * flag - ZNEW: if the entry already exists, fail with EEXIST.
+ * ZEXISTS: if the entry does not exist, fail with ENOENT.
+ * ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ */
+int
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ matchtype_t mt = 0;
+ uint64_t zoid;
+ int error = 0;
+
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+
+ *zpp = NULL;
+
+ /*
+ * Verify that we are not trying to lock '.', '..', or '.zfs'
+ */
+ if (name[0] == '.' &&
+ (((name[1] == '\0') || (name[1] == '.' && name[2] == '\0')) ||
+ (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)))
+ return (SET_ERROR(EEXIST));
+
+ /*
+ * Case sensitivity and normalization preferences are set when
+ * the file system is created. These are stored in the
+ * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
+ * affect how we perform zap lookups.
+ *
+ * When matching we may need to normalize & change case according to
+ * FS settings.
+ *
+ * Note that a normalized match is necessary for a case-insensitive
+ * filesystem when the lookup request is not exact, because normalization
+ * can fold case independently of normalizing code point sequences.
+ *
+ * See the table above zfs_dropname().
+ */
+ if (zfsvfs->z_norm != 0) {
+ mt = MT_NORMALIZE;
+
+ /*
+ * Determine if the match needs to honor the case specified in
+ * lookup, and if so keep track of that so that during
+ * normalization we don't fold case.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+ mt |= MT_MATCH_CASE;
+ }
+ }
+
+ /*
+ * Only look in or update the DNLC if we are looking for the
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name.
+ *
+ * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+ * because in that case MT_EXACT and MT_FIRST should produce exactly
+ * the same result.
+ */
+
+ if (dzp->z_unlinked && !(flag & ZXATTR))
+ return (ENOENT);
+ if (flag & ZXATTR) {
+ error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+ sizeof (zoid));
+ if (error == 0)
+ error = (zoid == 0 ? ENOENT : 0);
+ } else {
+ error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid);
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ return (SET_ERROR(EEXIST));
+ }
+ error = zfs_zget(zfsvfs, zoid, &zp);
+ if (error)
+ return (error);
+ ASSERT(!zp->z_unlinked);
+ *zpp = zp;
+ }
+
+ return (0);
+}
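
A sketch of the flag contract from the caller's side may help; this is illustrative only (it is not part of the patched file), assumes dzp is a directory znode locked as the comment above requires, and trims all error handling:

static void
example_dirent_lookup_flags(znode_t *dzp)
{
	znode_t *zp;
	int error;

	/* Creation path: an existing entry is an error (EEXIST). */
	error = zfs_dirent_lookup(dzp, "newname", &zp, ZNEW);

	/* Lookup/removal path: a missing entry is an error (ENOENT). */
	error = zfs_dirent_lookup(dzp, "oldname", &zp, ZEXISTS);

	/*
	 * ZXATTR ignores the name: zp is set to dzp's extended attribute
	 * directory, or left NULL if none has been created yet.
	 */
	error = zfs_dirent_lookup(dzp, "", &zp, ZXATTR);
	(void) error;
}
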
+
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ uint64_t parent;
+ int error;
+
+#ifdef ZFS_DEBUG
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(ZFS_TEARDOWN_READ_HELD(zfsvfs));
+#endif
+ if (dzp->z_unlinked)
+ return (ENOENT);
+
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+
+ error = zfs_zget(zfsvfs, parent, &zp);
+ if (error == 0)
+ *zpp = zp;
+ return (error);
+}
+
+int
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
+{
+ zfsvfs_t *zfsvfs __unused = dzp->z_zfsvfs;
+ znode_t *zp = NULL;
+ int error = 0;
+
+#ifdef ZFS_DEBUG
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(ZFS_TEARDOWN_READ_HELD(zfsvfs));
+#endif
+ if (dzp->z_unlinked)
+ return (SET_ERROR(ENOENT));
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *zpp = dzp;
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ error = zfs_dd_lookup(dzp, &zp);
+ if (error == 0)
+ *zpp = zp;
+ } else {
+ error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
+ if (error == 0) {
+ dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ *zpp = zp;
+ }
+ }
+ return (error);
+}
+
+/*
+ * Unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the unlinked set is using a fat zap (i.e. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp->z_unlinked);
+ ASSERT(zp->z_links == 0);
+
+ VERIFY3U(0, ==,
+ zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+ dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ dmu_tx_t *tx;
+ int error;
+
+ /*
+ * Iterate over the contents of the unlinked set.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+ zap_cursor_retrieve(&zc, &zap) == 0;
+ zap_cursor_advance(&zc)) {
+
+ /*
+ * See what kind of object we have in list
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these list entries for deletion,
+ * so we pull them back into core and set zp->z_unlinked.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for deletion.
+ * This could happen during the purge of an extended attribute
+ * directory. All we need to do is skip over them, since they
+ * are already in the system marked z_unlinked.
+ */
+ if (error != 0)
+ continue;
+
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
+
+ /*
+ * Due to changes in zfs_rmnode we need to make sure the
+ * link count is set to zero here.
+ */
+ if (zp->z_links != 0) {
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ vput(ZTOV(zp));
+ continue;
+ }
+ zp->z_links = 0;
+ VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &zp->z_links, sizeof (zp->z_links), tx));
+ dmu_tx_commit(tx);
+ }
+
+ zp->z_unlinked = B_TRUE;
+ vput(ZTOV(zp));
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contents are *only* regular
+ * files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ int skipped = 0;
+ int error;
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ error = zfs_zget(zfsvfs,
+ ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+ if (error) {
+ skipped += 1;
+ continue;
+ }
+
+ vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
+ ASSERT((ZTOV(xzp)->v_type == VREG) ||
+ (ZTOV(xzp)->v_type == VLNK));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ /* Is this really needed ? */
+ zfs_sa_upgrade_txholds(tx, xzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ vput(ZTOV(xzp));
+ skipped += 1;
+ continue;
+ }
+
+ error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
+ if (error)
+ skipped += 1;
+ dmu_tx_commit(tx);
+
+ vput(ZTOV(xzp));
+ }
+ zap_cursor_fini(&zc);
+ if (error != ENOENT)
+ skipped += 1;
+ return (skipped);
+}
+
+extern taskq_t *zfsvfs_taskq;
+
+void
+zfs_rmnode(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ dmu_tx_t *tx;
+ uint64_t acl_obj;
+ uint64_t xattr_obj;
+ uint64_t count;
+ int error;
+
+ ASSERT(zp->z_links == 0);
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+
+ /*
+ * If this is an attribute directory, purge its contents.
+ */
+ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
+ (zp->z_pflags & ZFS_XATTR)) {
+ if (zfs_purgedir(zp) != 0) {
+ /*
+ * Not enough space to delete some xattrs.
+ * Leave it in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+ } else {
+ /*
+ * Free up all the data in the file. We don't do this for
+ * XATTR directories because we need truncate and remove to be
+ * in the same tx, like in zfs_znode_delete(). Otherwise, if
+ * we crash here we'll end up with an inconsistent truncated
+ * zap object in the delete queue. Note a truncated file is
+ * harmless since it only contains user data.
+ */
+ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+ if (error) {
+ /*
+ * Not enough space or we were interrupted by unmount.
+ * Leave the file in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+ }
+
+ /*
+ * If the file has extended attributes, we're going to unlink
+ * the xattr dir.
+ */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error)
+ xattr_obj = 0;
+
+ acl_obj = zfs_external_acl(zp);
+
+ /*
+ * Set up the final transaction.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ if (xattr_obj)
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ if (acl_obj)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ /*
+ * Not enough space to delete the file. Leave it in the
+ * unlinked set, leaking it until the fs is remounted (at
+ * which point we'll call zfs_unlinked_drain() to process it).
+ */
+ dmu_tx_abort(tx);
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+
+ /*
+ * FreeBSD's implementation of zfs_zget requires a vnode to back it.
+ * This means that we could end up calling into getnewvnode while
+ * calling zfs_rmnode as a result of a prior call to getnewvnode
+ * trying to clear vnodes out of the cache. If this repeats we can
+ * recurse enough that we overflow our stack. To avoid this, we
+ * avoid calling zfs_zget on the xattr znode and instead simply add
+ * it to the unlinked set and schedule a call to zfs_unlinked_drain.
+ */
+ if (xattr_obj) {
+ /* Add extended attribute directory to the unlinked set. */
+ VERIFY3U(0, ==,
+ zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx));
+ }
+
+ mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+ /* Remove this znode from the unlinked set */
+ VERIFY3U(0, ==,
+ zap_remove_int(os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+ if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
+ cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
+ }
+
+ mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+ dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
+
+ zfs_znode_delete(zp, tx);
+
+ dmu_tx_commit(tx);
+
+ if (xattr_obj) {
+ /*
+ * We're using the FreeBSD taskqueue API here instead of
+ * the Solaris taskq API since the FreeBSD API allows for a
+ * task to be enqueued multiple times but executed once.
+ */
+ taskqueue_enqueue(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task);
+ }
+}
+
+static uint64_t
+zfs_dirent(znode_t *zp, uint64_t mode)
+{
+ uint64_t de = zp->z_id;
+
+ if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
+ de |= IFTODT(mode) << 60;
+ return (de);
+}
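
When the ZPL version supports typed entries, zfs_dirent() stores the DT_* file type (from IFTODT(mode)) in the top four bits of the directory ZAP value, with the object number in the low bits; zfs_match_find() above strips the type back off with ZFS_DIRENT_OBJ(). A small worked example of the packing, assuming DT_DIR has its conventional value of 4 (values are illustrative, not from the patch):

	uint64_t obj = 0x1234;
	uint64_t de  = obj | ((uint64_t)DT_DIR << 60);	/* 0x4000000000001234 */
	uint64_t typ = de >> 60;			/* 4, i.e. DT_DIR */
	uint64_t num = de & ((1ULL << 60) - 1);		/* 0x1234: the value ZFS_DIRENT_OBJ() recovers */
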
+
+/*
+ * Link zp into dzp. Can only fail if zp has been unlinked.
+ */
+int
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ uint64_t value;
+ int zp_is_dir = (vp->v_type == VDIR);
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
+ int error;
+
+ if (zfsvfs->z_replay == B_FALSE) {
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ }
+ if (zp_is_dir) {
+ if (dzp->z_links >= ZFS_LINK_MAX)
+ return (SET_ERROR(EMLINK));
+ }
+ if (!(flag & ZRENAMING)) {
+ if (zp->z_unlinked) { /* no new links to unlinked zp */
+ ASSERT(!(flag & (ZNEW | ZEXISTS)));
+ return (SET_ERROR(ENOENT));
+ }
+ if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) {
+ return (SET_ERROR(EMLINK));
+ }
+ zp->z_links++;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+
+ } else {
+ ASSERT(zp->z_unlinked == 0);
+ }
+ value = zfs_dirent(zp, zp->z_mode);
+ error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
+ 8, 1, &value, tx);
+
+ /*
+ * zap_add could fail to add the entry if it exceeds the capacity of the
+ * leaf-block and zap_leaf_split() failed to help.
+ * The caller of this routine is responsible for failing the transaction
+ * which will rollback the SA updates done above.
+ */
+ if (error != 0) {
+ if (!(flag & ZRENAMING) && !(flag & ZNEW))
+ zp->z_links--;
+ return (error);
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &dzp->z_id, sizeof (dzp->z_id));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (!(flag & ZNEW)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+
+ dzp->z_size++;
+ dzp->z_links += zp_is_dir;
+ count = 0;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+ return (0);
+}
+
+/*
+ * The match type in the code for this function should conform to:
+ *
+ * ------------------------------------------------------------------------
+ * fs type | z_norm | lookup type | match type
+ * ---------|-------------|-------------|----------------------------------
+ * CS !norm | 0 | 0 | 0 (exact)
+ * CS norm | formX | 0 | MT_NORMALIZE
+ * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE
+ * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE
+ * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | ZCILOOK | MT_NORMALIZE
+ * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE
+ *
+ * Abbreviations:
+ * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
+ * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
+ * formX = unicode normalization form set on fs creation
+ */
+static int
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag)
+{
+ int error;
+
+ if (zp->z_zfsvfs->z_norm) {
+ matchtype_t mt = MT_NORMALIZE;
+
+ if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) {
+ mt |= MT_MATCH_CASE;
+ }
+
+ error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id,
+ name, mt, tx);
+ } else {
+ error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx);
+ }
+
+ return (error);
+}
+
+/*
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
+ * Can fail if zp is a mount point (EBUSY) or a non-empty directory
+ * (ENOTEMPTY).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag, boolean_t *unlinkedp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ int zp_is_dir = (vp->v_type == VDIR);
+ boolean_t unlinked = B_FALSE;
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
+ int error;
+
+ if (zfsvfs->z_replay == B_FALSE) {
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ }
+ if (!(flag & ZRENAMING)) {
+
+ if (zp_is_dir && !zfs_dirempty(zp))
+ return (SET_ERROR(ENOTEMPTY));
+
+ /*
+ * If we get here, we are going to try to remove the object.
+ * First try removing the name from the directory; if that
+ * fails, return the error.
+ */
+ error = zfs_dropname(dzp, name, zp, tx, flag);
+ if (error != 0) {
+ return (error);
+ }
+
+ if (zp->z_links <= zp_is_dir) {
+ zfs_panic_recover("zfs: link count on vnode %p is %u, "
+ "should be at least %u", zp->z_vnode,
+ (int)zp->z_links,
+ zp_is_dir + 1);
+ zp->z_links = zp_is_dir + 1;
+ }
+ if (--zp->z_links == zp_is_dir) {
+ zp->z_unlinked = B_TRUE;
+ zp->z_links = 0;
+ unlinked = B_TRUE;
+ } else {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &zp->z_links, sizeof (zp->z_links));
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ count = 0;
+ ASSERT0(error);
+ } else {
+ ASSERT(zp->z_unlinked == 0);
+ error = zfs_dropname(dzp, name, zp, tx, flag);
+ if (error != 0)
+ return (error);
+ }
+
+ dzp->z_size--; /* one dirent removed */
+ dzp->z_links -= zp_is_dir; /* ".." link from zp */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+
+ if (unlinkedp != NULL)
+ *unlinkedp = unlinked;
+ else if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ return (0);
+}
+
+/*
+ * Indicate whether the directory is empty.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+ return (dzp->z_size == 2);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t parent __unused;
+
+ *xvpp = NULL;
+
+ if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+ &acl_ids)) != 0)
+ return (error);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) {
+ zfs_acl_ids_free(&acl_ids);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ getnewvnode_reserve_();
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ return (error);
+ }
+ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+#ifdef ZFS_DEBUG
+ error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ sizeof (xzp->z_id), tx));
+
+ (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+ xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ *xvpp = xzp;
+
+ return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ * flags - flags from the VOP_LOOKUP call
+ *
+ * OUT: xzpp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ vattr_t va;
+ int error;
+top:
+ error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
+ if (error)
+ return (error);
+
+ if (xzp != NULL) {
+ *xzpp = xzp;
+ return (0);
+ }
+
+
+ if (!(flags & CREATE_XATTR_DIR))
+ return (SET_ERROR(ENOATTR));
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * The ability to 'create' files in an attribute
+ * directory comes from the write_xattr permission on the base file.
+ *
+ * The ability to 'search' an attribute directory requires
+ * read_xattr permission on the base file.
+ *
+ * Once in a directory the ability to read/write attributes
+ * is controlled by the permissions on the attribute file.
+ */
+ va.va_mask = AT_MODE | AT_UID | AT_GID;
+ va.va_type = VDIR;
+ va.va_mode = S_IFDIR | S_ISVTX | 0777;
+ zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
+
+ error = zfs_make_xattrdir(zp, &va, xzpp, cr);
+
+ if (error == ERESTART) {
+ /* NB: we already did dmu_tx_wait() if necessary */
+ goto top;
+ }
+ if (error == 0)
+ VOP_UNLOCK1(ZTOV(*xzpp));
+
+ return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * the entry is a plain file and you have write access,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+ uid_t uid;
+ uid_t downer;
+ uid_t fowner;
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+
+ if (zdp->z_zfsvfs->z_replay)
+ return (0);
+
+ if ((zdp->z_mode & S_ISVTX) == 0)
+ return (0);
+
+ downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+ fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+
+ if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+ (ZTOV(zp)->v_type == VREG &&
+ zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
+ return (0);
+ else
+ return (secpolicy_vnode_remove(ZTOV(zp), cr));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
new file mode 100644
index 000000000000..06546c12e420
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_recv.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_file.h>
+#include <sys/buf.h>
+#include <sys/stat.h>
+
+int
+zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
+{
+ struct thread *td;
+ int rc, fd;
+
+ td = curthread;
+ pwd_ensure_dirs();
+ /* 12.x doesn't take a const char * */
+ rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path),
+ UIO_SYSSPACE, flags, mode);
+ if (rc)
+ return (SET_ERROR(rc));
+ fd = td->td_retval[0];
+ td->td_retval[0] = 0;
+ if (fget(curthread, fd, &cap_no_rights, fpp))
+ kern_close(td, fd);
+ return (0);
+}
+
+void
+zfs_file_close(zfs_file_t *fp)
+{
+ fo_close(fp, curthread);
+}
+
+static int
+zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp,
+ ssize_t *resid)
+{
+ ssize_t rc;
+ struct uio auio;
+ struct thread *td;
+ struct iovec aiov;
+
+ td = curthread;
+ aiov.iov_base = (void *)(uintptr_t)buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = count;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = *offp;
+
+ if ((fp->f_flag & FWRITE) == 0)
+ return (SET_ERROR(EBADF));
+
+ if (fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+
+ rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+ if (rc)
+ return (SET_ERROR(rc));
+ if (resid)
+ *resid = auio.uio_resid;
+ else if (auio.uio_resid)
+ return (SET_ERROR(EIO));
+ *offp += count - auio.uio_resid;
+ return (rc);
+}
+
+int
+zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
+{
+ loff_t off = fp->f_offset;
+ ssize_t rc;
+
+ rc = zfs_file_write_impl(fp, buf, count, &off, resid);
+ if (rc == 0)
+ fp->f_offset = off;
+
+ return (SET_ERROR(rc));
+}
+
+int
+zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
+ ssize_t *resid)
+{
+ return (zfs_file_write_impl(fp, buf, count, &off, resid));
+}
+
+static int
+zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp,
+ ssize_t *resid)
+{
+ ssize_t rc;
+ struct uio auio;
+ struct thread *td;
+ struct iovec aiov;
+
+ td = curthread;
+ aiov.iov_base = (void *)(uintptr_t)buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = count;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ auio.uio_offset = *offp;
+
+ if ((fp->f_flag & FREAD) == 0)
+ return (SET_ERROR(EBADF));
+
+ rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+ if (rc)
+ return (SET_ERROR(rc));
+ if (resid)
+ *resid = auio.uio_resid;
+ *offp += count - auio.uio_resid;
+ return (SET_ERROR(0));
+}
+
+int
+zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
+{
+ loff_t off = fp->f_offset;
+ ssize_t rc;
+
+ rc = zfs_file_read_impl(fp, buf, count, &off, resid);
+ if (rc == 0)
+ fp->f_offset = off;
+ return (rc);
+}
+
+int
+zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
+ ssize_t *resid)
+{
+ return (zfs_file_read_impl(fp, buf, count, &off, resid));
+}
+
+int
+zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
+{
+ int rc;
+ struct thread *td;
+
+ td = curthread;
+ if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0)
+ return (SET_ERROR(ESPIPE));
+ rc = fo_seek(fp, *offp, whence, td);
+ if (rc == 0)
+ *offp = td->td_uretoff.tdu_off;
+ return (SET_ERROR(rc));
+}
+
+int
+zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
+{
+ struct thread *td;
+ struct stat sb;
+ int rc;
+
+ td = curthread;
+
+ rc = fo_stat(fp, &sb, td->td_ucred, td);
+ if (rc)
+ return (SET_ERROR(rc));
+ zfattr->zfa_size = sb.st_size;
+ zfattr->zfa_mode = sb.st_mode;
+
+ return (0);
+}
+
+static __inline int
+zfs_vop_fsync(vnode_t *vp)
+{
+ struct mount *mp;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto drop;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_FSYNC(vp, MNT_WAIT, curthread);
+ VOP_UNLOCK1(vp);
+ vn_finished_write(mp);
+drop:
+ return (SET_ERROR(error));
+}
+
+int
+zfs_file_fsync(zfs_file_t *fp, int flags)
+{
+ if (fp->f_type != DTYPE_VNODE)
+ return (EINVAL);
+
+ return (zfs_vop_fsync(fp->f_vnode));
+}
+
+int
+zfs_file_get(int fd, zfs_file_t **fpp)
+{
+ struct file *fp;
+
+ if (fget(curthread, fd, &cap_no_rights, &fp))
+ return (SET_ERROR(EBADF));
+
+ *fpp = fp;
+ return (0);
+}
+
+void
+zfs_file_put(int fd)
+{
+ struct file *fp;
+
+ /* No CAP_ rights required, as we're only releasing. */
+ if (fget(curthread, fd, &cap_no_rights, &fp) == 0) {
+ fdrop(fp, curthread);
+ fdrop(fp, curthread);
+ }
+}
+
+loff_t
+zfs_file_off(zfs_file_t *fp)
+{
+ return (fp->f_offset);
+}
+
+void *
+zfs_file_private(zfs_file_t *fp)
+{
+ file_t *tmpfp;
+ void *data;
+ int error;
+
+ tmpfp = curthread->td_fpop;
+ curthread->td_fpop = fp;
+ error = devfs_get_cdevpriv(&data);
+ curthread->td_fpop = tmpfp;
+ if (error != 0)
+ return (NULL);
+ return (data);
+}
+
+int
+zfs_file_unlink(const char *fnamep)
+{
+ zfs_uio_seg_t seg = UIO_SYSSPACE;
+ int rc;
+
+#if __FreeBSD_version >= 1300018
+ rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0);
+#else
+#ifdef AT_BENEATH
+ rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep),
+ seg, 0, 0);
+#else
+ rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep),
+ seg, 0);
+#endif
+#endif
+ return (SET_ERROR(rc));
+}
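
Taken together, these wrappers give the rest of the module a small kernel-side file API on top of FreeBSD's fo_*() operations. A minimal usage sketch (not code from the patch; the path, flags and buffer are placeholders and error handling is trimmed):

static int
example_write_file(const char *path, const void *buf, size_t len)
{
	zfs_file_t *fp;
	ssize_t resid;
	int error;

	error = zfs_file_open(path, O_CREAT | O_WRONLY, 0644, &fp);
	if (error != 0)
		return (error);

	/* Write at offset 0 without moving fp's cached file offset. */
	error = zfs_file_pwrite(fp, buf, len, 0, &resid);

	zfs_file_close(fp);
	return (error);
}
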
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c
new file mode 100644
index 000000000000..81967bed73f9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_ioctl_compat.h>
+
+enum zfs_ioc_legacy {
+ ZFS_IOC_LEGACY_NONE = -1,
+ ZFS_IOC_LEGACY_FIRST = 0,
+ ZFS_LEGACY_IOC = ZFS_IOC_LEGACY_FIRST,
+ ZFS_IOC_LEGACY_POOL_CREATE = ZFS_IOC_LEGACY_FIRST,
+ ZFS_IOC_LEGACY_POOL_DESTROY,
+ ZFS_IOC_LEGACY_POOL_IMPORT,
+ ZFS_IOC_LEGACY_POOL_EXPORT,
+ ZFS_IOC_LEGACY_POOL_CONFIGS,
+ ZFS_IOC_LEGACY_POOL_STATS,
+ ZFS_IOC_LEGACY_POOL_TRYIMPORT,
+ ZFS_IOC_LEGACY_POOL_SCAN,
+ ZFS_IOC_LEGACY_POOL_FREEZE,
+ ZFS_IOC_LEGACY_POOL_UPGRADE,
+ ZFS_IOC_LEGACY_POOL_GET_HISTORY,
+ ZFS_IOC_LEGACY_VDEV_ADD,
+ ZFS_IOC_LEGACY_VDEV_REMOVE,
+ ZFS_IOC_LEGACY_VDEV_SET_STATE,
+ ZFS_IOC_LEGACY_VDEV_ATTACH,
+ ZFS_IOC_LEGACY_VDEV_DETACH,
+ ZFS_IOC_LEGACY_VDEV_SETPATH,
+ ZFS_IOC_LEGACY_VDEV_SETFRU,
+ ZFS_IOC_LEGACY_OBJSET_STATS,
+ ZFS_IOC_LEGACY_OBJSET_ZPLPROPS,
+ ZFS_IOC_LEGACY_DATASET_LIST_NEXT,
+ ZFS_IOC_LEGACY_SNAPSHOT_LIST_NEXT,
+ ZFS_IOC_LEGACY_SET_PROP,
+ ZFS_IOC_LEGACY_CREATE,
+ ZFS_IOC_LEGACY_DESTROY,
+ ZFS_IOC_LEGACY_ROLLBACK,
+ ZFS_IOC_LEGACY_RENAME,
+ ZFS_IOC_LEGACY_RECV,
+ ZFS_IOC_LEGACY_SEND,
+ ZFS_IOC_LEGACY_INJECT_FAULT,
+ ZFS_IOC_LEGACY_CLEAR_FAULT,
+ ZFS_IOC_LEGACY_INJECT_LIST_NEXT,
+ ZFS_IOC_LEGACY_ERROR_LOG,
+ ZFS_IOC_LEGACY_CLEAR,
+ ZFS_IOC_LEGACY_PROMOTE,
+ ZFS_IOC_LEGACY_DESTROY_SNAPS,
+ ZFS_IOC_LEGACY_SNAPSHOT,
+ ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME,
+ ZFS_IOC_LEGACY_OBJ_TO_PATH,
+ ZFS_IOC_LEGACY_POOL_SET_PROPS,
+ ZFS_IOC_LEGACY_POOL_GET_PROPS,
+ ZFS_IOC_LEGACY_SET_FSACL,
+ ZFS_IOC_LEGACY_GET_FSACL,
+ ZFS_IOC_LEGACY_SHARE,
+ ZFS_IOC_LEGACY_INHERIT_PROP,
+ ZFS_IOC_LEGACY_SMB_ACL,
+ ZFS_IOC_LEGACY_USERSPACE_ONE,
+ ZFS_IOC_LEGACY_USERSPACE_MANY,
+ ZFS_IOC_LEGACY_USERSPACE_UPGRADE,
+ ZFS_IOC_LEGACY_HOLD,
+ ZFS_IOC_LEGACY_RELEASE,
+ ZFS_IOC_LEGACY_GET_HOLDS,
+ ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS,
+ ZFS_IOC_LEGACY_VDEV_SPLIT,
+ ZFS_IOC_LEGACY_NEXT_OBJ,
+ ZFS_IOC_LEGACY_DIFF,
+ ZFS_IOC_LEGACY_TMP_SNAPSHOT,
+ ZFS_IOC_LEGACY_OBJ_TO_STATS,
+ ZFS_IOC_LEGACY_JAIL,
+ ZFS_IOC_LEGACY_UNJAIL,
+ ZFS_IOC_LEGACY_POOL_REGUID,
+ ZFS_IOC_LEGACY_SPACE_WRITTEN,
+ ZFS_IOC_LEGACY_SPACE_SNAPS,
+ ZFS_IOC_LEGACY_SEND_PROGRESS,
+ ZFS_IOC_LEGACY_POOL_REOPEN,
+ ZFS_IOC_LEGACY_LOG_HISTORY,
+ ZFS_IOC_LEGACY_SEND_NEW,
+ ZFS_IOC_LEGACY_SEND_SPACE,
+ ZFS_IOC_LEGACY_CLONE,
+ ZFS_IOC_LEGACY_BOOKMARK,
+ ZFS_IOC_LEGACY_GET_BOOKMARKS,
+ ZFS_IOC_LEGACY_DESTROY_BOOKMARKS,
+ ZFS_IOC_LEGACY_NEXTBOOT,
+ ZFS_IOC_LEGACY_CHANNEL_PROGRAM,
+ ZFS_IOC_LEGACY_REMAP,
+ ZFS_IOC_LEGACY_POOL_CHECKPOINT,
+ ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT,
+ ZFS_IOC_LEGACY_POOL_INITIALIZE,
+ ZFS_IOC_LEGACY_POOL_SYNC,
+ ZFS_IOC_LEGACY_LAST
+};
+
+unsigned static long zfs_ioctl_legacy_to_ozfs_[] = {
+ ZFS_IOC_POOL_CREATE, /* 0x00 */
+ ZFS_IOC_POOL_DESTROY, /* 0x01 */
+ ZFS_IOC_POOL_IMPORT, /* 0x02 */
+ ZFS_IOC_POOL_EXPORT, /* 0x03 */
+ ZFS_IOC_POOL_CONFIGS, /* 0x04 */
+ ZFS_IOC_POOL_STATS, /* 0x05 */
+ ZFS_IOC_POOL_TRYIMPORT, /* 0x06 */
+ ZFS_IOC_POOL_SCAN, /* 0x07 */
+ ZFS_IOC_POOL_FREEZE, /* 0x08 */
+ ZFS_IOC_POOL_UPGRADE, /* 0x09 */
+ ZFS_IOC_POOL_GET_HISTORY, /* 0x0a */
+ ZFS_IOC_VDEV_ADD, /* 0x0b */
+ ZFS_IOC_VDEV_REMOVE, /* 0x0c */
+ ZFS_IOC_VDEV_SET_STATE, /* 0x0d */
+ ZFS_IOC_VDEV_ATTACH, /* 0x0e */
+ ZFS_IOC_VDEV_DETACH, /* 0x0f */
+ ZFS_IOC_VDEV_SETPATH, /* 0x10 */
+ ZFS_IOC_VDEV_SETFRU, /* 0x11 */
+ ZFS_IOC_OBJSET_STATS, /* 0x12 */
+ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x13 */
+ ZFS_IOC_DATASET_LIST_NEXT, /* 0x14 */
+ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x15 */
+ ZFS_IOC_SET_PROP, /* 0x16 */
+ ZFS_IOC_CREATE, /* 0x17 */
+ ZFS_IOC_DESTROY, /* 0x18 */
+ ZFS_IOC_ROLLBACK, /* 0x19 */
+ ZFS_IOC_RENAME, /* 0x1a */
+ ZFS_IOC_RECV, /* 0x1b */
+ ZFS_IOC_SEND, /* 0x1c */
+ ZFS_IOC_INJECT_FAULT, /* 0x1d */
+ ZFS_IOC_CLEAR_FAULT, /* 0x1e */
+ ZFS_IOC_INJECT_LIST_NEXT, /* 0x1f */
+ ZFS_IOC_ERROR_LOG, /* 0x20 */
+ ZFS_IOC_CLEAR, /* 0x21 */
+ ZFS_IOC_PROMOTE, /* 0x22 */
+ /* start of mismatch */
+
+ ZFS_IOC_DESTROY_SNAPS, /* 0x23:0x3b */
+ ZFS_IOC_SNAPSHOT, /* 0x24:0x23 */
+ ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x25:0x24 */
+ ZFS_IOC_OBJ_TO_PATH, /* 0x26:0x25 */
+ ZFS_IOC_POOL_SET_PROPS, /* 0x27:0x26 */
+ ZFS_IOC_POOL_GET_PROPS, /* 0x28:0x27 */
+ ZFS_IOC_SET_FSACL, /* 0x29:0x28 */
+ ZFS_IOC_GET_FSACL, /* 0x30:0x29 */
+ ZFS_IOC_SHARE, /* 0x2b:0x2a */
+ ZFS_IOC_INHERIT_PROP, /* 0x2c:0x2b */
+ ZFS_IOC_SMB_ACL, /* 0x2d:0x2c */
+ ZFS_IOC_USERSPACE_ONE, /* 0x2e:0x2d */
+ ZFS_IOC_USERSPACE_MANY, /* 0x2f:0x2e */
+ ZFS_IOC_USERSPACE_UPGRADE, /* 0x30:0x2f */
+ ZFS_IOC_HOLD, /* 0x31:0x30 */
+ ZFS_IOC_RELEASE, /* 0x32:0x31 */
+ ZFS_IOC_GET_HOLDS, /* 0x33:0x32 */
+ ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x34:0x33 */
+ ZFS_IOC_VDEV_SPLIT, /* 0x35:0x34 */
+ ZFS_IOC_NEXT_OBJ, /* 0x36:0x35 */
+ ZFS_IOC_DIFF, /* 0x37:0x36 */
+ ZFS_IOC_TMP_SNAPSHOT, /* 0x38:0x37 */
+ ZFS_IOC_OBJ_TO_STATS, /* 0x39:0x38 */
+ ZFS_IOC_JAIL, /* 0x3a:0xc2 */
+ ZFS_IOC_UNJAIL, /* 0x3b:0xc3 */
+ ZFS_IOC_POOL_REGUID, /* 0x3c:0x3c */
+ ZFS_IOC_SPACE_WRITTEN, /* 0x3d:0x39 */
+ ZFS_IOC_SPACE_SNAPS, /* 0x3e:0x3a */
+ ZFS_IOC_SEND_PROGRESS, /* 0x3f:0x3e */
+ ZFS_IOC_POOL_REOPEN, /* 0x40:0x3d */
+ ZFS_IOC_LOG_HISTORY, /* 0x41:0x3f */
+ ZFS_IOC_SEND_NEW, /* 0x42:0x40 */
+ ZFS_IOC_SEND_SPACE, /* 0x43:0x41 */
+ ZFS_IOC_CLONE, /* 0x44:0x42 */
+ ZFS_IOC_BOOKMARK, /* 0x45:0x43 */
+ ZFS_IOC_GET_BOOKMARKS, /* 0x46:0x44 */
+ ZFS_IOC_DESTROY_BOOKMARKS, /* 0x47:0x45 */
+ ZFS_IOC_NEXTBOOT, /* 0x48:0xc1 */
+ ZFS_IOC_CHANNEL_PROGRAM, /* 0x49:0x48 */
+ ZFS_IOC_REMAP, /* 0x4a:0x4c */
+ ZFS_IOC_POOL_CHECKPOINT, /* 0x4b:0x4d */
+ ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x4c:0x4e */
+ ZFS_IOC_POOL_INITIALIZE, /* 0x4d:0x4f */
+};
+
+unsigned static long zfs_ioctl_ozfs_to_legacy_common_[] = {
+ ZFS_IOC_POOL_CREATE, /* 0x00 */
+ ZFS_IOC_POOL_DESTROY, /* 0x01 */
+ ZFS_IOC_POOL_IMPORT, /* 0x02 */
+ ZFS_IOC_POOL_EXPORT, /* 0x03 */
+ ZFS_IOC_POOL_CONFIGS, /* 0x04 */
+ ZFS_IOC_POOL_STATS, /* 0x05 */
+ ZFS_IOC_POOL_TRYIMPORT, /* 0x06 */
+ ZFS_IOC_POOL_SCAN, /* 0x07 */
+ ZFS_IOC_POOL_FREEZE, /* 0x08 */
+ ZFS_IOC_POOL_UPGRADE, /* 0x09 */
+ ZFS_IOC_POOL_GET_HISTORY, /* 0x0a */
+ ZFS_IOC_VDEV_ADD, /* 0x0b */
+ ZFS_IOC_VDEV_REMOVE, /* 0x0c */
+ ZFS_IOC_VDEV_SET_STATE, /* 0x0d */
+ ZFS_IOC_VDEV_ATTACH, /* 0x0e */
+ ZFS_IOC_VDEV_DETACH, /* 0x0f */
+ ZFS_IOC_VDEV_SETPATH, /* 0x10 */
+ ZFS_IOC_VDEV_SETFRU, /* 0x11 */
+ ZFS_IOC_OBJSET_STATS, /* 0x12 */
+ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x13 */
+ ZFS_IOC_DATASET_LIST_NEXT, /* 0x14 */
+ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x15 */
+ ZFS_IOC_SET_PROP, /* 0x16 */
+ ZFS_IOC_CREATE, /* 0x17 */
+ ZFS_IOC_DESTROY, /* 0x18 */
+ ZFS_IOC_ROLLBACK, /* 0x19 */
+ ZFS_IOC_RENAME, /* 0x1a */
+ ZFS_IOC_RECV, /* 0x1b */
+ ZFS_IOC_SEND, /* 0x1c */
+ ZFS_IOC_INJECT_FAULT, /* 0x1d */
+ ZFS_IOC_CLEAR_FAULT, /* 0x1e */
+ ZFS_IOC_INJECT_LIST_NEXT, /* 0x1f */
+ ZFS_IOC_ERROR_LOG, /* 0x20 */
+ ZFS_IOC_CLEAR, /* 0x21 */
+ ZFS_IOC_PROMOTE, /* 0x22 */
+ /* start of mismatch */
+ ZFS_IOC_LEGACY_SNAPSHOT, /* 0x23 */
+ ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME, /* 0x24 */
+ ZFS_IOC_LEGACY_OBJ_TO_PATH, /* 0x25 */
+ ZFS_IOC_LEGACY_POOL_SET_PROPS, /* 0x26 */
+ ZFS_IOC_LEGACY_POOL_GET_PROPS, /* 0x27 */
+ ZFS_IOC_LEGACY_SET_FSACL, /* 0x28 */
+ ZFS_IOC_LEGACY_GET_FSACL, /* 0x29 */
+ ZFS_IOC_LEGACY_SHARE, /* 0x2a */
+ ZFS_IOC_LEGACY_INHERIT_PROP, /* 0x2b */
+ ZFS_IOC_LEGACY_SMB_ACL, /* 0x2c */
+ ZFS_IOC_LEGACY_USERSPACE_ONE, /* 0x2d */
+ ZFS_IOC_LEGACY_USERSPACE_MANY, /* 0x2e */
+ ZFS_IOC_LEGACY_USERSPACE_UPGRADE, /* 0x2f */
+ ZFS_IOC_LEGACY_HOLD, /* 0x30 */
+ ZFS_IOC_LEGACY_RELEASE, /* 0x31 */
+ ZFS_IOC_LEGACY_GET_HOLDS, /* 0x32 */
+ ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS, /* 0x33 */
+ ZFS_IOC_LEGACY_VDEV_SPLIT, /* 0x34 */
+ ZFS_IOC_LEGACY_NEXT_OBJ, /* 0x35 */
+ ZFS_IOC_LEGACY_DIFF, /* 0x36 */
+ ZFS_IOC_LEGACY_TMP_SNAPSHOT, /* 0x37 */
+ ZFS_IOC_LEGACY_OBJ_TO_STATS, /* 0x38 */
+ ZFS_IOC_LEGACY_SPACE_WRITTEN, /* 0x39 */
+ ZFS_IOC_LEGACY_SPACE_SNAPS, /* 0x3a */
+ ZFS_IOC_LEGACY_DESTROY_SNAPS, /* 0x3b */
+ ZFS_IOC_LEGACY_POOL_REGUID, /* 0x3c */
+ ZFS_IOC_LEGACY_POOL_REOPEN, /* 0x3d */
+ ZFS_IOC_LEGACY_SEND_PROGRESS, /* 0x3e */
+ ZFS_IOC_LEGACY_LOG_HISTORY, /* 0x3f */
+ ZFS_IOC_LEGACY_SEND_NEW, /* 0x40 */
+ ZFS_IOC_LEGACY_SEND_SPACE, /* 0x41 */
+ ZFS_IOC_LEGACY_CLONE, /* 0x42 */
+ ZFS_IOC_LEGACY_BOOKMARK, /* 0x43 */
+ ZFS_IOC_LEGACY_GET_BOOKMARKS, /* 0x44 */
+ ZFS_IOC_LEGACY_DESTROY_BOOKMARKS, /* 0x45 */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_RECV_NEW */
+ ZFS_IOC_LEGACY_POOL_SYNC, /* 0x47 */
+ ZFS_IOC_LEGACY_CHANNEL_PROGRAM, /* 0x48 */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_LOAD_KEY */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_UNLOAD_KEY */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_CHANGE_KEY */
+ ZFS_IOC_LEGACY_REMAP, /* 0x4c */
+ ZFS_IOC_LEGACY_POOL_CHECKPOINT, /* 0x4d */
+ ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT, /* 0x4e */
+ ZFS_IOC_LEGACY_POOL_INITIALIZE, /* 0x4f */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_POOL_TRIM */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_REDACT */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOKMARK_PROPS */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT_FS */
+};
+
+unsigned static long zfs_ioctl_ozfs_to_legacy_platform_[] = {
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_NEXT */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_CLEAR */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_SEEK */
+ ZFS_IOC_LEGACY_NEXTBOOT,
+ ZFS_IOC_LEGACY_JAIL,
+ ZFS_IOC_LEGACY_UNJAIL,
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_SET_BOOTENV */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOTENV */
+};
+
+int
+zfs_ioctl_legacy_to_ozfs(int request)
+{
+ if (request >= sizeof (zfs_ioctl_legacy_to_ozfs_)/sizeof (long))
+ return (-1);
+ return (zfs_ioctl_legacy_to_ozfs_[request]);
+}
+
+int
+zfs_ioctl_ozfs_to_legacy(int request)
+{
+ if (request > ZFS_IOC_LAST)
+ return (-1);
+
+ if (request > ZFS_IOC_PLATFORM) {
+ request -= ZFS_IOC_PLATFORM + 1;
+ return (zfs_ioctl_ozfs_to_legacy_platform_[request]);
+ }
+ if (request >= sizeof (zfs_ioctl_ozfs_to_legacy_common_)/sizeof (long))
+ return (-1);
+ return (zfs_ioctl_ozfs_to_legacy_common_[request]);
+}
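
Both directions are table lookups, and anything outside the tables (or mapped to ZFS_IOC_LEGACY_NONE) comes back as -1, which a caller has to treat as "no equivalent ioctl". A hedged sketch of that checking pattern; the helper name and the ENOTSUP choice are illustrative, not lifted from the dispatcher:

static int
example_translate_request(int legacy_request, int *ozfs_request)
{
	*ozfs_request = zfs_ioctl_legacy_to_ozfs(legacy_request);
	if (*ozfs_request == -1) {
		/* Legacy FreeBSD ioctl with no OpenZFS equivalent. */
		return (SET_ERROR(ENOTSUP));
	}
	return (0);
}
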
+
+void
+zfs_cmd_legacy_to_ozfs(zfs_cmd_legacy_t *src, zfs_cmd_t *dst)
+{
+ memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats));
+ *&dst->zc_objset_stats = *&src->zc_objset_stats;
+ memcpy(&dst->zc_begin_record, &src->zc_begin_record,
+ offsetof(zfs_cmd_t, zc_sendobj) -
+ offsetof(zfs_cmd_t, zc_begin_record));
+ memcpy(&dst->zc_sendobj, &src->zc_sendobj,
+ sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj));
+ dst->zc_zoneid = src->zc_jailid;
+}
+
+void
+zfs_cmd_ozfs_to_legacy(zfs_cmd_t *src, zfs_cmd_legacy_t *dst)
+{
+ memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats));
+ *&dst->zc_objset_stats = *&src->zc_objset_stats;
+ *&dst->zc_begin_record.drr_u.drr_begin = *&src->zc_begin_record;
+ dst->zc_begin_record.drr_payloadlen = 0;
+ dst->zc_begin_record.drr_type = 0;
+
+ memcpy(&dst->zc_inject_record, &src->zc_inject_record,
+ offsetof(zfs_cmd_t, zc_sendobj) -
+ offsetof(zfs_cmd_t, zc_inject_record));
+ dst->zc_resumable = B_FALSE;
+ memcpy(&dst->zc_sendobj, &src->zc_sendobj,
+ sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj));
+ dst->zc_jailid = src->zc_zoneid;
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c
new file mode 100644
index 000000000000..0e0c16033b15
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/nvpair.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_os.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zone.h>
+#include <vm/vm_pageout.h>
+
+#include <sys/zfs_ioctl_impl.h>
+
+#if __FreeBSD_version < 1201517
+#define vm_page_max_user_wired vm_page_max_wired
+#endif
+
+int
+zfs_vfs_ref(zfsvfs_t **zfvp)
+{
+ int error = 0;
+
+ if (*zfvp == NULL)
+ return (SET_ERROR(ESRCH));
+
+ error = vfs_busy((*zfvp)->z_vfs, 0);
+ if (error != 0) {
+ *zfvp = NULL;
+ error = SET_ERROR(ESRCH);
+ }
+ return (error);
+}
+
+int
+zfs_vfs_held(zfsvfs_t *zfsvfs)
+{
+ return (zfsvfs->z_vfs != NULL);
+}
+
+void
+zfs_vfs_rele(zfsvfs_t *zfsvfs)
+{
+ vfs_unbusy(zfsvfs->z_vfs);
+}
+
+static const zfs_ioc_key_t zfs_keys_nextboot[] = {
+ {"command", DATA_TYPE_STRING, 0},
+ { ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0},
+ { ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0}
+};
+
+static int
+zfs_ioc_jail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_attach(curthread->td_ucred, zc->zc_name,
+ (int)zc->zc_zoneid));
+}
+
+static int
+zfs_ioc_unjail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_detach(curthread->td_ucred, zc->zc_name,
+ (int)zc->zc_zoneid));
+}
+
+static int
+zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ char name[MAXNAMELEN];
+ spa_t *spa;
+ vdev_t *vd;
+ char *command;
+ uint64_t pool_guid;
+ uint64_t vdev_guid;
+ int error;
+
+ if (nvlist_lookup_uint64(innvl,
+ ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+ return (EINVAL);
+ if (nvlist_lookup_uint64(innvl,
+ ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
+ return (EINVAL);
+ if (nvlist_lookup_string(innvl,
+ "command", &command) != 0)
+ return (EINVAL);
+
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_by_guid(pool_guid, vdev_guid);
+ if (spa != NULL)
+ strcpy(name, spa_name(spa));
+ mutex_exit(&spa_namespace_lock);
+ if (spa == NULL)
+ return (ENOENT);
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
+ if (vd == NULL) {
+ (void) spa_vdev_state_exit(spa, NULL, ENXIO);
+ spa_close(spa, FTAG);
+ return (ENODEV);
+ }
+ error = vdev_label_write_pad2(vd, command, strlen(command));
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+uint64_t
+zfs_max_nvlist_src_size_os(void)
+{
+ if (zfs_max_nvlist_src_size != 0)
+ return (zfs_max_nvlist_src_size);
+
+ return (ptob(vm_page_max_user_wired) / 4);
+}
+
+void
+zfs_ioctl_init_os(void)
+{
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
+ zfs_secpolicy_config, POOL_CHECK_NONE);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
+ zfs_secpolicy_config, POOL_CHECK_NONE);
+ zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
+ zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
+ POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_nextboot, 3);
+
+}
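
zfs_keys_nextboot above is the input contract for the "fbsd_nextboot" ioctl registered here: a "command" string plus the pool and vdev GUIDs that zfs_ioc_nextboot() resolves before writing the command into the vdev's pad2 label area. A minimal sketch of building such an input nvlist with the usual fnvlist_* wrappers; the command string and GUIDs are placeholders:

static nvlist_t *
example_nextboot_args(uint64_t pool_guid, uint64_t vdev_guid)
{
	nvlist_t *innvl = fnvlist_alloc();

	/* Placeholder command; the real string comes from the boot tooling. */
	fnvlist_add_string(innvl, "command", "example-boot-command");
	fnvlist_add_uint64(innvl, ZPOOL_CONFIG_POOL_GUID, pool_guid);
	fnvlist_add_uint64(innvl, ZPOOL_CONFIG_GUID, vdev_guid);

	/* Caller passes this as innvl and fnvlist_free()s it afterwards. */
	return (innvl);
}
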
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
new file mode 100644
index 000000000000..7bc6b83d0272
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -0,0 +1,2301 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/acl.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sunddi.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/spa_boot.h>
+#include <sys/jail.h>
+#include <ufs/ufs/quota.h>
+#include <sys/zfs_quota.h>
+
+#include "zfs_comutil.h"
+
+#ifndef MNTK_VMSETSIZE_BUG
+#define MNTK_VMSETSIZE_BUG 0
+#endif
+#ifndef MNTK_NOMSYNC
+#define MNTK_NOMSYNC 8
+#endif
+
+/* BEGIN CSTYLED */
+struct mtx zfs_debug_mtx;
+MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
+
+SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
+
+int zfs_super_owner;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
+ "File system owner can perform privileged operation on his file systems");
+
+int zfs_debug_level;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
+ "Debug level");
+
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
+static int zfs_version_acl = ZFS_ACL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
+ "ZFS_ACL_VERSION");
+static int zfs_version_spa = SPA_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
+ "SPA_VERSION");
+static int zfs_version_zpl = ZPL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
+ "ZPL_VERSION");
+/* END CSTYLED */
+
+static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
+static int zfs_mount(vfs_t *vfsp);
+static int zfs_umount(vfs_t *vfsp, int fflag);
+static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
+static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
+static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
+static int zfs_sync(vfs_t *vfsp, int waitfor);
+#if __FreeBSD_version >= 1300098
+static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int *secflavors);
+#else
+static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int **secflavors);
+#endif
+static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
+static void zfs_freevfs(vfs_t *vfsp);
+
+struct vfsops zfs_vfsops = {
+ .vfs_mount = zfs_mount,
+ .vfs_unmount = zfs_umount,
+#if __FreeBSD_version >= 1300049
+ .vfs_root = vfs_cache_root,
+ .vfs_cachedroot = zfs_root,
+#else
+ .vfs_root = zfs_root,
+#endif
+ .vfs_statfs = zfs_statfs,
+ .vfs_vget = zfs_vget,
+ .vfs_sync = zfs_sync,
+ .vfs_checkexp = zfs_checkexp,
+ .vfs_fhtovp = zfs_fhtovp,
+ .vfs_quotactl = zfs_quotactl,
+};
+
+VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f
+ */
+static uint32_t zfs_active_fs_count = 0;
+
+int
+zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
+ char *setpoint)
+{
+ int error;
+ zfsvfs_t *zfvp;
+ vfs_t *vfsp;
+ objset_t *os;
+ uint64_t tmp = *val;
+
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+
+ error = getzfsvfs_impl(os, &zfvp);
+ if (error != 0)
+ return (error);
+ if (zfvp == NULL)
+ return (ENOENT);
+ vfsp = zfvp->z_vfs;
+ switch (zfs_prop) {
+ case ZFS_PROP_ATIME:
+ if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_DEVICES:
+ if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_EXEC:
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_SETUID:
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_READONLY:
+ if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_XATTR:
+ if (zfvp->z_flags & ZSB_XATTR)
+ tmp = zfvp->z_xattr;
+ break;
+ case ZFS_PROP_NBMAND:
+ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
+ tmp = 1;
+ break;
+ default:
+ vfs_unbusy(vfsp);
+ return (ENOENT);
+ }
+
+ vfs_unbusy(vfsp);
+ if (tmp != *val) {
+ (void) strcpy(setpoint, "temporary");
+ *val = tmp;
+ }
+ return (0);
+}
+
+static int
+zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
+{
+ int error = 0;
+ char buf[32];
+ uint64_t usedobj, quotaobj;
+ uint64_t quota, used = 0;
+ timespec_t now;
+
+ usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+ quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+ if (quotaobj == 0 || zfsvfs->z_replay) {
+ error = ENOENT;
+ goto done;
+ }
+ (void) sprintf(buf, "%llx", (longlong_t)id);
+ if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
+ buf, sizeof (quota), 1, &quota)) != 0) {
+ dprintf("%s(%d): quotaobj lookup failed\n",
+ __FUNCTION__, __LINE__);
+ goto done;
+ }
+ /*
+ * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit".
+ * So we set them to be the same.
+ */
+ dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
+ error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
+ if (error && error != ENOENT) {
+ dprintf("%s(%d): usedobj failed; %d\n",
+ __FUNCTION__, __LINE__, error);
+ goto done;
+ }
+ dqp->dqb_curblocks = btodb(used);
+ dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
+ vfs_timestamp(&now);
+ /*
+ * Setting this to 0 causes FreeBSD quota(8) to print
+ * the number of days since the epoch, which isn't
+ * particularly useful.
+ */
+ dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
+done:
+ return (error);
+}
+
+static int
+zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ struct thread *td;
+ int cmd, type, error = 0;
+ int bitsize;
+ zfs_userquota_prop_t quota_type;
+ struct dqblk64 dqblk = { 0 };
+
+ td = curthread;
+ cmd = cmds >> SUBCMDSHIFT;
+ type = cmds & SUBCMDMASK;
+
+ ZFS_ENTER(zfsvfs);
+ if (id == -1) {
+ switch (type) {
+ case USRQUOTA:
+ id = td->td_ucred->cr_ruid;
+ break;
+ case GRPQUOTA:
+ id = td->td_ucred->cr_rgid;
+ break;
+ default:
+ error = EINVAL;
+ if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
+ vfs_unbusy(vfsp);
+ goto done;
+ }
+ }
+ /*
+ * Map BSD type to:
+ * ZFS_PROP_USERUSED,
+ * ZFS_PROP_USERQUOTA,
+ * ZFS_PROP_GROUPUSED,
+ * ZFS_PROP_GROUPQUOTA
+ */
+ switch (cmd) {
+ case Q_SETQUOTA:
+ case Q_SETQUOTA32:
+ if (type == USRQUOTA)
+ quota_type = ZFS_PROP_USERQUOTA;
+ else if (type == GRPQUOTA)
+ quota_type = ZFS_PROP_GROUPQUOTA;
+ else
+ error = EINVAL;
+ break;
+ case Q_GETQUOTA:
+ case Q_GETQUOTA32:
+ if (type == USRQUOTA)
+ quota_type = ZFS_PROP_USERUSED;
+ else if (type == GRPQUOTA)
+ quota_type = ZFS_PROP_GROUPUSED;
+ else
+ error = EINVAL;
+ break;
+ }
+
+ /*
+ * Depending on the cmd, we may need to get
+ * the ruid and domain (see fuidstr_to_sid?),
+ * the fuid (how?), or other information.
+ * Create fuid using zfs_fuid_create(zfsvfs, id,
+ * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
+ * I think I can use just the id?
+ *
+ * Look at zfs_id_overquota() to look up a quota.
+ * zap_lookup(something, quotaobj, fuidstring,
+ * sizeof (long long), 1, &quota)
+ *
+ * See zfs_set_userquota() to set a quota.
+ */
+ if ((uint32_t)type >= MAXQUOTAS) {
+ error = EINVAL;
+ goto done;
+ }
+
+ switch (cmd) {
+ case Q_GETQUOTASIZE:
+ bitsize = 64;
+ error = copyout(&bitsize, arg, sizeof (int));
+ break;
+ case Q_QUOTAON:
+ /* As far as I can tell, you can't turn quotas on or off on zfs. */
+ error = 0;
+ vfs_unbusy(vfsp);
+ break;
+ case Q_QUOTAOFF:
+ error = ENOTSUP;
+ vfs_unbusy(vfsp);
+ break;
+ case Q_SETQUOTA:
+ error = copyin(arg, &dqblk, sizeof (dqblk));
+ if (error == 0)
+ error = zfs_set_userquota(zfsvfs, quota_type,
+ "", id, dbtob(dqblk.dqb_bhardlimit));
+ break;
+ case Q_GETQUOTA:
+ error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
+ if (error == 0)
+ error = copyout(&dqblk, arg, sizeof (dqblk));
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+done:
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
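+
+/*
+ * Sketch of a hypothetical userland caller (the path is made up and the
+ * standard quotactl(2)/<ufs/ufs/quota.h> interface is assumed; this is
+ * illustrative only, not part of the original change):
+ *
+ *     struct dqblk64 dq;
+ *     if (quotactl("/tank/home", QCMD(Q_GETQUOTA, USRQUOTA),
+ *         getuid(), &dq) == 0)
+ *             printf("%ju blocks in use\n", (uintmax_t)dq.dqb_curblocks);
+ */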
+
+
+boolean_t
+zfs_is_readonly(zfsvfs_t *zfsvfs)
+{
+ return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
+}
+
+/*ARGSUSED*/
+static int
+zfs_sync(vfs_t *vfsp, int waitfor)
+{
+
+ /*
+ * Data integrity is job one. We don't want a compromised kernel
+ * writing to the storage pool, so we never sync during panic.
+ */
+ if (panicstr)
+ return (0);
+
+ /*
+ * Ignore the system syncher. ZFS already commits async data
+ * at zfs_txg_timeout intervals.
+ */
+ if (waitfor == MNT_LAZY)
+ return (0);
+
+ if (vfsp != NULL) {
+ /*
+ * Sync a specific filesystem.
+ */
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ dsl_pool_t *dp;
+ int error;
+
+ error = vfs_stdsync(vfsp, waitfor);
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+ dp = dmu_objset_pool(zfsvfs->z_os);
+
+ /*
+ * If the system is shutting down, then skip any
+ * filesystems which may exist on a suspended pool.
+ */
+ if (rebooting && spa_suspended(dp->dp_spa)) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, 0);
+
+ ZFS_EXIT(zfsvfs);
+ } else {
+ /*
+ * Sync all ZFS filesystems. This is what happens when you
+ * run sync(8). Unlike other filesystems, ZFS honors the
+ * request by waiting for all pools to commit all dirty data.
+ */
+ spa_sync_allpools();
+ }
+
+ return (0);
+}
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == TRUE) {
+ zfsvfs->z_atime = TRUE;
+ zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
+ } else {
+ zfsvfs->z_atime = FALSE;
+ zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
+ }
+}
+
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == ZFS_XATTR_OFF) {
+ zfsvfs->z_flags &= ~ZSB_XATTR;
+ } else {
+ zfsvfs->z_flags |= ZSB_XATTR;
+
+ if (newval == ZFS_XATTR_SA)
+ zfsvfs->z_xattr_sa = B_TRUE;
+ else
+ zfsvfs->z_xattr_sa = B_FALSE;
+ }
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+ ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ zfsvfs->z_max_blksz = newval;
+ zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
+}
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval) {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
+ } else {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
+ }
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
+ }
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
+ }
+}
+
+/*
+ * The nbmand mount option can only be changed at mount time.
+ * We can't allow it to be toggled on live file systems, or incorrect
+ * behavior may be seen from CIFS clients.
+ *
+ * This property isn't registered via dsl_prop_register(), but this callback
+ * will be called when a file system is first mounted.
+ */
+static void
+nbmand_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ if (newval == FALSE) {
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
+ } else {
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
+ }
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_show_ctldir = newval;
+}
+
+static void
+vscan_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_vscan = newval;
+}
+
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_mode = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_inherit = newval;
+}
+
+static void
+acl_type_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_type = newval;
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+ struct dsl_dataset *ds = NULL;
+ objset_t *os = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+ uint64_t nbmand;
+ boolean_t readonly = B_FALSE;
+ boolean_t do_readonly = B_FALSE;
+ boolean_t setuid = B_FALSE;
+ boolean_t do_setuid = B_FALSE;
+ boolean_t exec = B_FALSE;
+ boolean_t do_exec = B_FALSE;
+ boolean_t xattr = B_FALSE;
+ boolean_t atime = B_FALSE;
+ boolean_t do_atime = B_FALSE;
+ boolean_t do_xattr = B_FALSE;
+ int error = 0;
+
+ ASSERT(vfsp);
+ zfsvfs = vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ os = zfsvfs->z_os;
+
+ /*
+ * This function can be called for a snapshot when we update the
+ * snapshot's mount point, which isn't really supported.
+ */
+ if (dmu_objset_is_snapshot(os))
+ return (EOPNOTSUPP);
+
+ /*
+ * The act of registering our callbacks will destroy any mount
+ * options we may have. In order to enable temporary overrides
+ * of mount options, we stash away the current values and
+ * restore them after we register the callbacks.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
+ !spa_writeable(dmu_objset_spa(os))) {
+ readonly = B_TRUE;
+ do_readonly = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+ readonly = B_FALSE;
+ do_readonly = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+ setuid = B_FALSE;
+ do_setuid = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+ setuid = B_TRUE;
+ do_setuid = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+ exec = B_FALSE;
+ do_exec = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+ exec = B_TRUE;
+ do_exec = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
+ do_xattr = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
+ atime = B_FALSE;
+ do_atime = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
+ atime = B_TRUE;
+ do_atime = B_TRUE;
+ }
+
+ /*
+ * We need to enter pool configuration here, so that we can use
+ * dsl_prop_get_int_ds() to handle the special nbmand property below.
+ * dsl_prop_get_integer() cannot be used, because it has to acquire
+ * spa_namespace_lock and we cannot do that because we already hold
+ * z_teardown_lock. The problem is that spa_write_cachefile() is called
+ * with spa_namespace_lock held and the function calls ZFS vnode
+ * operations to write the cache file and thus z_teardown_lock is
+ * acquired after spa_namespace_lock.
+ */
+ ds = dmu_objset_ds(os);
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+
+ /*
+ * nbmand is a special property. It can only be changed at
+ * mount time, so we determine its value here and apply it via
+ * nbmand_changed_cb() below rather than registering a callback.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
+ nbmand = B_FALSE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
+ nbmand = B_TRUE;
+ } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) {
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ return (error);
+ }
+
+ /*
+ * Register property callbacks.
+ *
+ * It would probably be fine to just check for i/o error from
+ * the first prop_register(), but I guess I like to go
+ * overboard...
+ */
+ error = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
+ zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (error)
+ goto unregister;
+
+ /*
+ * Invoke our callbacks to restore temporary mount options.
+ */
+ if (do_readonly)
+ readonly_changed_cb(zfsvfs, readonly);
+ if (do_setuid)
+ setuid_changed_cb(zfsvfs, setuid);
+ if (do_exec)
+ exec_changed_cb(zfsvfs, exec);
+ if (do_xattr)
+ xattr_changed_cb(zfsvfs, xattr);
+ if (do_atime)
+ atime_changed_cb(zfsvfs, atime);
+
+ nbmand_changed_cb(zfsvfs, nbmand);
+
+ return (0);
+
+unregister:
+ dsl_prop_unregister_all(ds, zfsvfs);
+ return (error);
+}
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+ uint64_t val;
+
+ zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+ zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+ zfsvfs->z_os = os;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+ if (error != 0)
+ return (error);
+ if (zfsvfs->z_version >
+ zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+ (void) printf("Can't mount a version %lld file system "
+ "on a version %lld pool\n. Pool must be upgraded to mount "
+ "this file system.", (u_longlong_t)zfsvfs->z_version,
+ (u_longlong_t)spa_version(dmu_objset_spa(os)));
+ return (SET_ERROR(ENOTSUP));
+ }
+ error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_norm = (int)val;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_utf8 = (val != 0);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_case = (uint_t)val;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_acl_type = (uint_t)val;
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ zfsvfs->z_case == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+ uint64_t sa_obj = 0;
+ if (zfsvfs->z_use_sa) {
+ /* should either have both of these objects or none */
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+ &sa_obj);
+ if (error != 0)
+ return (error);
+ }
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+ if (error != 0)
+ return (error);
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(os, zfs_sa_upgrade);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+ &zfsvfs->z_root);
+ if (error != 0)
+ return (error);
+ ASSERT(zfsvfs->z_root != 0);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &zfsvfs->z_unlinkedobj);
+ if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
+ 8, 1, &zfsvfs->z_userquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
+ 8, 1, &zfsvfs->z_groupquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
+ 8, 1, &zfsvfs->z_projectquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
+ 8, 1, &zfsvfs->z_userobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
+ 8, 1, &zfsvfs->z_groupobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
+ 8, 1, &zfsvfs->z_projectobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+ &zfsvfs->z_fuid_obj);
+ if (error == ENOENT)
+ zfsvfs->z_fuid_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
+ &zfsvfs->z_shares_dir);
+ if (error == ENOENT)
+ zfsvfs->z_shares_dir = 0;
+ else if (error != 0)
+ return (error);
+
+ /*
+ * Only use the name cache if we are looking for a
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name (which is always the case on
+ * FreeBSD).
+ */
+ zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
+ ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+ !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
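+
+ /*
+ * For illustration (hypothetical property settings): with
+ * casesensitivity=sensitive and normalization=none, z_norm is 0 and
+ * the namecache is used; with casesensitivity=mixed and no
+ * normalization beyond the case folding added above, it is also
+ * used; with casesensitivity=insensitive, or with formC/formD
+ * normalization, it is not.
+ */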
+
+ return (0);
+}
+
+taskq_t *zfsvfs_taskq;
+
+static void
+zfsvfs_task_unlinked_drain(void *context, int pending __unused)
+{
+
+ zfs_unlinked_drain((zfsvfs_t *)context);
+}
+
+int
+zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
+{
+ objset_t *os;
+ zfsvfs_t *zfsvfs;
+ int error;
+ boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
+
+ /*
+ * XXX: Fix struct statfs so this isn't necessary!
+ *
+ * The 'osname' is used as the filesystem's special node, which means
+ * it must fit in statfs.f_mntfromname, or else it can't be
+ * enumerated, so libzfs_mnttab_find() returns NULL, which causes
+ * 'zfs unmount' to think it's not mounted when it is.
+ */
+ if (strlen(osname) >= MNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
+ &os);
+ if (error != 0) {
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ error = zfsvfs_create_impl(zfvp, zfsvfs, os);
+
+ return (error);
+}
+
+
+int
+zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+
+ zfsvfs->z_vfs = NULL;
+ zfsvfs->z_parent = zfsvfs;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+ TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
+ zfsvfs_task_unlinked_drain, zfsvfs);
+ ZFS_TEARDOWN_INIT(zfsvfs);
+ ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
+ rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
+ for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ error = zfsvfs_init(zfsvfs, os);
+ if (error != 0) {
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ *zfvp = NULL;
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ *zfvp = zfsvfs;
+ return (0);
+}
+
+static int
+zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
+{
+ int error;
+
+ /*
+ * Check for a bad on-disk format version now since we
+ * lied about owning the dataset readonly before.
+ */
+ if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+ dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
+ return (SET_ERROR(EROFS));
+
+ error = zfs_register_callbacks(zfsvfs->z_vfs);
+ if (error)
+ return (error);
+
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+
+ /*
+ * If we are not mounting (i.e. online recv), then we don't
+ * have to worry about replaying the log as we blocked all
+ * operations out since we closed the ZIL.
+ */
+ if (mounting) {
+ boolean_t readonly;
+
+ ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+
+ /*
+ * During replay we remove the read only flag to
+ * allow replays to succeed.
+ */
+ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
+ if (readonly != 0) {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+ } else {
+ dsl_dir_t *dd;
+ zap_stats_t zs;
+
+ if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+ &zs) == 0) {
+ dataset_kstats_update_nunlinks_kstat(
+ &zfsvfs->z_kstat, zs.zs_num_entries);
+ dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+ "num_entries in unlinked set: %llu",
+ zs.zs_num_entries);
+ }
+
+ zfs_unlinked_drain(zfsvfs);
+ dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
+ dd->dd_activity_cancelled = B_FALSE;
+ }
+
+ /*
+ * Parse and replay the intent log.
+ *
+ * Because of ziltest, this must be done after
+ * zfs_unlinked_drain(). (Further note: ziltest
+ * doesn't use readonly mounts, where
+ * zfs_unlinked_drain() isn't called.) This is because
+ * ziltest causes spa_sync() to think it's committed,
+ * but actually it is not, so the intent log contains
+ * many txg's worth of changes.
+ *
+ * In particular, if object N is in the unlinked set in
+ * the last txg to actually sync, then it could be
+ * actually freed in a later txg and then reallocated
+ * in a yet later txg. This would write a "create
+ * object N" record to the intent log. Normally, this
+ * would be fine because the spa_sync() would have
+ * written out the fact that object N is free, before
+ * we could write the "create object N" intent log
+ * record.
+ *
+ * But when we are in ziltest mode, we advance the "open
+ * txg" without actually spa_sync()-ing the changes to
+ * disk. So we would see that object N is still
+ * allocated and in the unlinked set, and there is an
+ * intent log record saying to allocate it.
+ */
+ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+ if (zil_replay_disable) {
+ zil_destroy(zfsvfs->z_log, B_FALSE);
+ } else {
+ boolean_t use_nc = zfsvfs->z_use_namecache;
+ zfsvfs->z_use_namecache = B_FALSE;
+ zfsvfs->z_replay = B_TRUE;
+ zil_replay(zfsvfs->z_os, zfsvfs,
+ zfs_replay_vector);
+ zfsvfs->z_replay = B_FALSE;
+ zfsvfs->z_use_namecache = use_nc;
+ }
+ }
+
+ /* restore readonly bit */
+ if (readonly != 0)
+ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+ }
+
+ /*
+ * Set the objset user_ptr to track its zfsvfs.
+ */
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
+ return (0);
+}
+
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
+{
+ int i;
+
+ zfs_fuid_destroy(zfsvfs);
+
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+ mutex_destroy(&zfsvfs->z_lock);
+ ASSERT(zfsvfs->z_nr_znodes == 0);
+ list_destroy(&zfsvfs->z_all_znodes);
+ ZFS_TEARDOWN_DESTROY(zfsvfs);
+ ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
+ rw_destroy(&zfsvfs->z_fuid_lock);
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ dataset_kstats_destroy(&zfsvfs->z_kstat);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ if (zfsvfs->z_vfs) {
+ if (zfsvfs->z_use_fuids) {
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ } else {
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ }
+ }
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+}
+
+static int
+zfs_domount(vfs_t *vfsp, char *osname)
+{
+ uint64_t recordsize, fsid_guid;
+ int error = 0;
+ zfsvfs_t *zfsvfs;
+
+ ASSERT(vfsp);
+ ASSERT(osname);
+
+ error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
+ if (error)
+ return (error);
+ zfsvfs->z_vfs = vfsp;
+
+ if ((error = dsl_prop_get_integer(osname,
+ "recordsize", &recordsize, NULL)))
+ goto out;
+ zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
+ zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
+
+ vfsp->vfs_data = zfsvfs;
+ vfsp->mnt_flag |= MNT_LOCAL;
+ vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
+ vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
+ vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
+ /*
+ * This can cause a loss of coherence between ARC and the page cache
+ * on ZoF; it is unclear whether the problem is in FreeBSD or ZoF.
+ */
+ vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */
+ vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
+ vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
+
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
+#endif
+ /*
+ * The fsid is 64 bits, composed of an 8-bit fs type, which
+ * separates our fsid from any other filesystem types, and a
+ * 56-bit objset unique ID. The objset unique ID is unique to
+ * all objsets open on this system, provided by unique_create().
+ * The 8-bit fs type must be put in the low bits of fsid[1]
+ * because that's where other Solaris filesystems put it.
+ */
+ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
+ ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+ vfsp->vfs_fsid.val[0] = fsid_guid;
+ vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
+ (vfsp->mnt_vfc->vfc_typenum & 0xFF);
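+
+ /*
+ * Worked example with made-up values: for an fsid_guid of
+ * 0x00123456789abcde (56 bits) and a vfc_typenum of 0x2e, val[0]
+ * holds the low 32 bits (0x789abcde) and val[1] becomes
+ * (0x00123456 << 8) | 0x2e == 0x1234562e.
+ */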
+
+ /*
+ * Set features for file system.
+ */
+ zfs_set_fuid_feature(zfsvfs);
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+ vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+ vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
+ } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+ vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+ vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+ }
+ vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+ uint64_t pval;
+
+ atime_changed_cb(zfsvfs, B_FALSE);
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ if ((error = dsl_prop_get_integer(osname,
+ "xattr", &pval, NULL)))
+ goto out;
+ xattr_changed_cb(zfsvfs, pval);
+ if ((error = dsl_prop_get_integer(osname,
+ "acltype", &pval, NULL)))
+ goto out;
+ acl_type_changed_cb(zfsvfs, pval);
+ zfsvfs->z_issnap = B_TRUE;
+ zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
+
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+ } else {
+ if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
+ goto out;
+ }
+
+ vfs_mountedfrom(vfsp, osname);
+
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+out:
+ if (error) {
+ dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
+ zfsvfs_free(zfsvfs);
+ } else {
+ atomic_inc_32(&zfs_active_fs_count);
+ }
+
+ return (error);
+}
+
+static void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+
+ if (!dmu_objset_is_snapshot(os))
+ dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
+}
+
+static int
+getpoolname(const char *osname, char *poolname)
+{
+ char *p;
+
+ p = strchr(osname, '/');
+ if (p == NULL) {
+ if (strlen(osname) >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strcpy(poolname, osname);
+ } else {
+ if (p - osname >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(poolname, osname, p - osname);
+ poolname[p - osname] = '\0';
+ }
+ return (0);
+}
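+
+/*
+ * For example (hypothetical names): getpoolname("tank/home/user", buf)
+ * stores "tank" in buf, while getpoolname("tank", buf) copies the whole
+ * name unchanged.
+ */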
+
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp)
+{
+ kthread_t *td = curthread;
+ vnode_t *mvp = vfsp->mnt_vnodecovered;
+ cred_t *cr = td->td_ucred;
+ char *osname;
+ int error = 0;
+ int canwrite;
+
+ if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * If full-owner-access is enabled and delegated administration is
+ * turned on, we must set nosuid.
+ */
+ if (zfs_super_owner &&
+ dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
+ secpolicy_fs_mount_clearopts(cr, vfsp);
+ }
+
+ /*
+ * Check for mount privilege?
+ *
+ * If we don't have privilege, then see if
+ * we have local permission to allow it.
+ */
+ error = secpolicy_fs_mount(cr, mvp, vfsp);
+ if (error) {
+ if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
+ goto out;
+
+ if (!(vfsp->vfs_flag & MS_REMOUNT)) {
+ vattr_t vattr;
+
+ /*
+ * Make sure user is the owner of the mount point
+ * or has sufficient privileges.
+ */
+
+ vattr.va_mask = AT_UID;
+
+ vn_lock(mvp, LK_SHARED | LK_RETRY);
+ if (VOP_GETATTR(mvp, &vattr, cr)) {
+ VOP_UNLOCK1(mvp);
+ goto out;
+ }
+
+ if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
+ VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
+ VOP_UNLOCK1(mvp);
+ goto out;
+ }
+ VOP_UNLOCK1(mvp);
+ }
+
+ secpolicy_fs_mount_clearopts(cr, vfsp);
+ }
+
+ /*
+ * Refuse to mount a filesystem if we are in a local zone and the
+ * dataset is not visible.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+ error = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ vfsp->vfs_flag |= MNT_NFS4ACLS;
+
+ /*
+ * When doing a remount, we simply refresh our temporary properties
+ * according to those options set in the current VFS options.
+ */
+ if (vfsp->vfs_flag & MS_REMOUNT) {
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ /*
+ * Refresh mount options with z_teardown_lock blocking I/O while
+ * the filesystem is in an inconsistent state.
+ * The lock also serializes this code with filesystem
+ * manipulations between entry to zfs_suspend_fs() and return
+ * from zfs_resume_fs().
+ */
+ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
+ zfs_unregister_callbacks(zfsvfs);
+ error = zfs_register_callbacks(vfsp);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ goto out;
+ }
+
+ /* Initial root mount: try hard to import the requested root pool. */
+ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
+ (vfsp->vfs_flag & MNT_UPDATE) == 0) {
+ char pname[MAXNAMELEN];
+
+ error = getpoolname(osname, pname);
+ if (error == 0)
+ error = spa_import_rootpool(pname, false);
+ if (error)
+ goto out;
+ }
+ DROP_GIANT();
+ error = zfs_domount(vfsp, osname);
+ PICKUP_GIANT();
+
+out:
+ return (error);
+}
+
+static int
+zfs_statfs(vfs_t *vfsp, struct statfs *statp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ uint64_t refdbytes, availbytes, usedobjs, availobjs;
+
+ statp->f_version = STATFS_VERSION;
+
+ ZFS_ENTER(zfsvfs);
+
+ dmu_objset_space(zfsvfs->z_os,
+ &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+ /*
+ * The underlying storage pool actually uses multiple block sizes.
+ * We report the fragsize as the smallest block size we support,
+ * and we report our blocksize as the filesystem's maximum blocksize.
+ */
+ statp->f_bsize = SPA_MINBLOCKSIZE;
+ statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
+
+ /*
+ * The following report "total" blocks of various kinds in the
+ * file system, but reported in terms of f_frsize - the
+ * "fragment" size.
+ */
+
+ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
+ statp->f_bfree = availbytes / statp->f_bsize;
+ statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+ /*
+ * statvfs() should really be called statufs(), because it assumes
+ * static metadata. ZFS doesn't preallocate files, so the best
+ * we can do is report the max that could possibly fit in f_files,
+ * and that minus the number actually used in f_ffree.
+ * For f_ffree, report the smaller of the number of objects available
+ * and the number of blocks (each object will take at least a block).
+ */
+ statp->f_ffree = MIN(availobjs, statp->f_bfree);
+ statp->f_files = statp->f_ffree + usedobjs;
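+
+ /*
+ * Worked example with made-up numbers: for refdbytes = 10 GiB,
+ * availbytes = 90 GiB and availobjs = 1000000, f_blocks is
+ * 100 GiB >> 9 == 209715200 512-byte blocks, f_bfree is
+ * 90 GiB / 512 == 188743680, and f_ffree is the smaller of
+ * availobjs and f_bfree, i.e. 1000000.
+ */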
+
+ /*
+ * We're a zfs filesystem.
+ */
+ strlcpy(statp->f_fstypename, "zfs",
+ sizeof (statp->f_fstypename));
+
+ strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
+ sizeof (statp->f_mntfromname));
+ strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
+ sizeof (statp->f_mntonname));
+
+ statp->f_namemax = MAXNAMELEN - 1;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *rootzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0)
+ *vpp = ZTOV(rootzp);
+
+ ZFS_EXIT(zfsvfs);
+
+ if (error == 0) {
+ error = vn_lock(*vpp, flags);
+ if (error != 0) {
+ VN_RELE(*vpp);
+ *vpp = NULL;
+ }
+ }
+ return (error);
+}
+
+/*
+ * Teardown the zfsvfs::z_os.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+ znode_t *zp;
+ dsl_dir_t *dd;
+
+ /*
+ * If someone has not already unmounted this file system,
+ * drain the zrele_taskq to ensure all active references to the
+ * zfsvfs_t have been handled; only then can it be safely destroyed.
+ */
+ if (zfsvfs->z_os) {
+ /*
+ * If we're unmounting we have to wait for the list to
+ * drain completely.
+ *
+ * If we're not unmounting there's no guarantee the list
+ * will drain completely, but zreles run from the taskq
+ * may add the parents of dir-based xattrs to the taskq
+ * so we want to wait for these.
+ *
+ * We can safely read z_nr_znodes without locking because the
+ * VFS has already blocked operations which add to the
+ * z_all_znodes list and thus increment z_nr_znodes.
+ */
+ int round = 0;
+ while (zfsvfs->z_nr_znodes > 0) {
+ taskq_wait_outstanding(dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ if (++round > 1 && !unmounting)
+ break;
+ }
+ }
+ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
+
+ if (!unmounting) {
+ /*
+ * We purge the parent filesystem's vfsp as the parent
+ * filesystem and all of its snapshots have their vnode's
+ * v_vfsp set to the parent filesystem's vfsp. Note,
+ * 'z_parent' is self-referential for non-snapshots.
+ */
+#ifdef FREEBSD_NAMECACHE
+#if __FreeBSD_version >= 1300117
+ cache_purgevfs(zfsvfs->z_parent->z_vfs);
+#else
+ cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
+#endif
+#endif
+ }
+
+ /*
+ * Close the zil. NB: Can't close the zil while zfs_inactive
+ * threads are blocked as zil_close can call zfs_inactive.
+ */
+ if (zfsvfs->z_log) {
+ zil_close(zfsvfs->z_log);
+ zfsvfs->z_log = NULL;
+ }
+
+ ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);
+
+ /*
+ * If we are not unmounting (i.e. online recv) and someone already
+ * unmounted this file system while we were doing the switcheroo,
+ * or a reopen of z_os failed, then just bail out now.
+ */
+ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * At this point there are no vops active, and any new vops will
+ * fail with EIO since we have z_teardown_lock for writer (only
+ * relevant for forced unmount).
+ *
+ * Release all holds on dbufs.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+ zp = list_next(&zfsvfs->z_all_znodes, zp))
+ if (zp->z_sa_hdl) {
+ ASSERT(ZTOV(zp)->v_count >= 0);
+ zfs_znode_dmu_fini(zp);
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * If we are unmounting, set the unmounted flag and let new vops
+ * unblock. zfs_inactive will have the unmounted behavior, and all
+ * other vops will fail with EIO.
+ */
+ if (unmounting) {
+ zfsvfs->z_unmounted = B_TRUE;
+ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ }
+
+ /*
+ * z_os will be NULL if there was an error in attempting to reopen
+ * zfsvfs, so just return as the properties had already been
+ * unregistered and cached data had been evicted before.
+ */
+ if (zfsvfs->z_os == NULL)
+ return (0);
+
+ /*
+ * Unregister properties.
+ */
+ zfs_unregister_callbacks(zfsvfs);
+
+ /*
+ * Evict cached data
+ */
+ if (!zfs_is_readonly(zfsvfs))
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ dmu_objset_evict_dbufs(zfsvfs->z_os);
+ dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
+ dsl_dir_cancel_waiters(dd);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_umount(vfs_t *vfsp, int fflag)
+{
+ kthread_t *td = curthread;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ objset_t *os;
+ cred_t *cr = td->td_ucred;
+ int ret;
+
+ ret = secpolicy_fs_unmount(cr, vfsp);
+ if (ret) {
+ if (dsl_deleg_access((char *)vfsp->vfs_resource,
+ ZFS_DELEG_PERM_MOUNT, cr))
+ return (ret);
+ }
+
+ /*
+ * Unmount any snapshots mounted under .zfs before unmounting the
+ * dataset itself.
+ */
+ if (zfsvfs->z_ctldir != NULL) {
+ if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+ return (ret);
+ }
+
+ if (fflag & MS_FORCE) {
+ /*
+ * Mark file system as unmounted before calling
+ * vflush(FORCECLOSE). This way we ensure no future vnops
+ * will be called and risk operating on DOOMED vnodes.
+ */
+ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
+ zfsvfs->z_unmounted = B_TRUE;
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ }
+
+ /*
+ * Flush all the files.
+ */
+ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
+ if (ret != 0)
+ return (ret);
+ while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task, NULL) != 0)
+ taskqueue_drain(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task);
+
+ VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+ os = zfsvfs->z_os;
+
+ /*
+ * z_os will be NULL if there was an error in
+ * attempting to reopen zfsvfs.
+ */
+ if (os != NULL) {
+ /*
+ * Unset the objset user_ptr.
+ */
+ mutex_enter(&os->os_user_ptr_lock);
+ dmu_objset_set_user(os, NULL);
+ mutex_exit(&os->os_user_ptr_lock);
+
+ /*
+ * Finally release the objset
+ */
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ }
+
+ /*
+ * We can now safely destroy the '.zfs' directory node.
+ */
+ if (zfsvfs->z_ctldir != NULL)
+ zfsctl_destroy(zfsvfs);
+ zfs_freevfs(vfsp);
+
+ return (0);
+}
+
+static int
+zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ int err;
+
+ /*
+ * zfs_zget() can't operate on virtual entries like .zfs/ or
+ * .zfs/snapshot/ directories, which is why we return EOPNOTSUPP.
+ * This makes NFS switch to LOOKUP instead of using VGET.
+ */
+ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
+ (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
+ return (EOPNOTSUPP);
+
+ ZFS_ENTER(zfsvfs);
+ err = zfs_zget(zfsvfs, ino, &zp);
+ if (err == 0 && zp->z_unlinked) {
+ vrele(ZTOV(zp));
+ err = EINVAL;
+ }
+ if (err == 0)
+ *vpp = ZTOV(zp);
+ ZFS_EXIT(zfsvfs);
+ if (err == 0) {
+ err = vn_lock(*vpp, flags);
+ if (err != 0)
+ vrele(*vpp);
+ }
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+#if __FreeBSD_version >= 1300098
+zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int *secflavors)
+#else
+zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int **secflavors)
+#endif
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ /*
+ * If this is a regular file system, vfsp is the same as
+ * zfsvfs->z_parent->z_vfs; but if it is a snapshot,
+ * zfsvfs->z_parent->z_vfs represents the parent file system,
+ * which we have to use here because only that file system
+ * has mnt_export configured.
+ */
+ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
+ credanonp, numsecflavors, secflavors));
+}
+
+CTASSERT(SHORT_FID_LEN <= sizeof (struct fid));
+CTASSERT(LONG_FID_LEN <= sizeof (struct fid));
+
+static int
+zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
+{
+ struct componentname cn;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ vnode_t *dvp;
+ uint64_t object = 0;
+ uint64_t fid_gen = 0;
+ uint64_t gen_mask;
+ uint64_t zp_gen;
+ int i, err;
+
+ *vpp = NULL;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * On FreeBSD we can get a snapshot's mount point or its parent file
+ * system's mount point, depending on whether the snapshot is mounted.
+ */
+ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint64_t objsetid = 0;
+ uint64_t setgen = 0;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+ ZFS_EXIT(zfsvfs);
+
+ err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
+ if (err)
+ return (SET_ERROR(EINVAL));
+ ZFS_ENTER(zfsvfs);
+ }
+
+ if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+ } else {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
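+
+ /*
+ * Decoding sketch with a hypothetical short fid: if the first byte
+ * of zf_object is 0x04 and the rest are zero, and the first byte of
+ * zf_gen is 0x02 and the rest are zero, the little-endian
+ * reassembly above yields object == 4 and fid_gen == 2; gen_mask
+ * below then spans 8 * sizeof (zf_gen) bits.
+ */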
+
+ /*
+ * A zero fid_gen means we are in .zfs or the .zfs/snapshot
+ * directory tree. If the object == zfsvfs->z_shares_dir, then
+ * we are in the .zfs/shares directory tree.
+ */
+ if ((fid_gen == 0 &&
+ (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
+ (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
+ ZFS_EXIT(zfsvfs);
+ VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
+ if (object == ZFSCTL_INO_SNAPDIR) {
+ cn.cn_nameptr = "snapshot";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = ISLASTCN | LOCKLEAF;
+ cn.cn_lkflags = flags;
+ VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+ vput(dvp);
+ } else if (object == zfsvfs->z_shares_dir) {
+ /*
+ * XXX This branch must not be taken;
+ * if it is, then the lookup below will
+ * explode.
+ */
+ cn.cn_nameptr = "shares";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = ISLASTCN;
+ cn.cn_lkflags = flags;
+ VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+ vput(dvp);
+ } else {
+ *vpp = dvp;
+ }
+ return (err);
+ }
+
+ gen_mask = -1ULL >> (64 - 8 * i);
+
+ dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
+ if ((err = zfs_zget(zfsvfs, object, &zp))) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+ sizeof (uint64_t));
+ zp_gen = zp_gen & gen_mask;
+ if (zp_gen == 0)
+ zp_gen = 1;
+ if (zp->z_unlinked || zp_gen != fid_gen) {
+ dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
+ vrele(ZTOV(zp));
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ *vpp = ZTOV(zp);
+ ZFS_EXIT(zfsvfs);
+ err = vn_lock(*vpp, flags);
+ if (err == 0)
+ vnode_create_vobject(*vpp, zp->z_size, curthread);
+ else
+ *vpp = NULL;
+ return (err);
+}
+
+/*
+ * Block out VOPs and close zfsvfs_t::z_os
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
+{
+ int error;
+
+ if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended. Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ int err;
+ znode_t *zp;
+
+ ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
+ ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
+
+ /*
+ * We already own this, so just update the objset_t, as the one we
+ * had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_config_exit(dp, FTAG);
+
+ err = zfsvfs_init(zfsvfs, os);
+ if (err != 0)
+ goto bail;
+
+ ds->ds_dir->dd_activity_cancelled = B_FALSE;
+ VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+ zfs_set_fuid_feature(zfsvfs);
+
+ /*
+ * Attempt to re-establish all the active znodes with
+ * their dbufs. If a zfs_rezget() fails, then we'll let
+ * any potential callers discover that via ZFS_ENTER_VERIFY_VP
+ * when they try to use their znode.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ (void) zfs_rezget(zp);
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+bail:
+ /* release the VOPs */
+ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+
+ if (err) {
+ /*
+ * Since we couldn't setup the sa framework, try to force
+ * unmount this file system.
+ */
+ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
+ vfs_ref(zfsvfs->z_vfs);
+ (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
+ }
+ }
+ return (err);
+}
+
+static void
+zfs_freevfs(vfs_t *vfsp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ zfsvfs_free(zfsvfs);
+
+ atomic_dec_32(&zfs_active_fs_count);
+}
+
+#ifdef __i386__
+static int desiredvnodes_backup;
+#include <sys/vmmeter.h>
+
+
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#endif
+
+static void
+zfs_vnodes_adjust(void)
+{
+#ifdef __i386__
+ int newdesiredvnodes;
+
+ desiredvnodes_backup = desiredvnodes;
+
+ /*
+ * We calculate newdesiredvnodes the same way it is done in
+ * vntblinit(). If it is equal to desiredvnodes, it means that
+ * it wasn't tuned by the administrator and we can tune it down.
+ */
+ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
+ vm_kmem_size / (5 * (sizeof (struct vm_object) +
+ sizeof (struct vnode))));
+ if (newdesiredvnodes == desiredvnodes)
+ desiredvnodes = (3 * newdesiredvnodes) / 4;
+#endif
+}
+
+static void
+zfs_vnodes_adjust_back(void)
+{
+
+#ifdef __i386__
+ desiredvnodes = desiredvnodes_backup;
+#endif
+}
+
+void
+zfs_init(void)
+{
+
+ printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
+
+ /*
+ * Initialize .zfs directory structures
+ */
+ zfsctl_init();
+
+ /*
+ * Initialize znode cache, vnode ops, etc...
+ */
+ zfs_znode_init();
+
+ /*
+ * Reduce the number of vnodes. The default is calculated with UFS
+ * inodes in mind, which is too big for ZFS on i386, so we reduce
+ * it here.
+ */
+ zfs_vnodes_adjust();
+
+ dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
+
+ zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
+}
+
+void
+zfs_fini(void)
+{
+ taskq_destroy(zfsvfs_taskq);
+ zfsctl_fini();
+ zfs_znode_fini();
+ zfs_vnodes_adjust_back();
+}
+
+int
+zfs_busy(void)
+{
+ return (zfs_active_fs_count != 0);
+}
+
+/*
+ * Release VOPs and unmount a suspended filesystem.
+ */
+int
+zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
+ ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
+
+ /*
+ * We already own this, so just hold and rele it to update the
+ * objset_t, as the one we had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_config_exit(dp, FTAG);
+ zfsvfs->z_os = os;
+
+ /* release the VOPs */
+ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+
+ /*
+ * Try to force unmount this file system.
+ */
+ (void) zfs_umount(zfsvfs->z_vfs, 0);
+ zfsvfs->z_unmounted = B_TRUE;
+ return (0);
+}
+
+int
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
+{
+ int error;
+ objset_t *os = zfsvfs->z_os;
+ dmu_tx_t *tx;
+
+ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
+ return (SET_ERROR(EINVAL));
+
+ if (newvers < zfsvfs->z_version)
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_spa_version_map(newvers) >
+ spa_version(dmu_objset_spa(zfsvfs->z_os)))
+ return (SET_ERROR(ENOTSUP));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ ZFS_SA_ATTRS);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &newvers, tx);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ return (error);
+ }
+
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ uint64_t sa_obj;
+
+ ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+ SPA_VERSION_SA);
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT0(error);
+
+ VERIFY(0 == sa_set_sa_object(os, sa_obj));
+ sa_register_update_callback(os, zfs_sa_upgrade);
+ }
+
+ spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
+ "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
+ (uintmax_t)newvers);
+ dmu_tx_commit(tx);
+
+ zfsvfs->z_version = newvers;
+ os->os_version = newvers;
+
+ zfs_set_fuid_feature(zfsvfs);
+
+ return (0);
+}
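+
+/*
+ * Typically this path is driven from userland by the zfs(8) "upgrade"
+ * subcommand, which bumps the on-disk ZPL_VERSION_STR entry through the
+ * transaction assembled above.
+ */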
+
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+ uint64_t *cached_copy = NULL;
+
+ /*
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
+ */
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION) {
+ pname = ZPL_VERSION_STR;
+ } else {
+ pname = zfs_prop_to_name(prop);
+ }
+
+ if (os != NULL) {
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+ }
+
+ if (error == ENOENT) {
+ /* No value set, use the default value */
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ *value = ZPL_VERSION;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ case ZFS_PROP_UTF8ONLY:
+ *value = 0;
+ break;
+ case ZFS_PROP_CASE:
+ *value = ZFS_CASE_SENSITIVE;
+ break;
+ case ZFS_PROP_ACLTYPE:
+ *value = ZFS_ACLTYPE_NFSV4;
+ break;
+ default:
+ return (error);
+ }
+ error = 0;
+ }
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
+ return (error);
+}
+
+/*
+ * Return true if the corresponding vfs's unmounted flag is set;
+ * otherwise return false. If this function returns true, we know
+ * that a VFS unmount has been initiated.
+ */
+boolean_t
+zfs_get_vfs_flag_unmounted(objset_t *os)
+{
+ zfsvfs_t *zfvp;
+ boolean_t unmounted = B_FALSE;
+
+ ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
+
+ mutex_enter(&os->os_user_ptr_lock);
+ zfvp = dmu_objset_get_user(os);
+ if (zfvp != NULL && zfvp->z_vfs != NULL &&
+ (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
+ unmounted = B_TRUE;
+ mutex_exit(&os->os_user_ptr_lock);
+
+ return (unmounted);
+}
+
+#ifdef _KERNEL
+void
+zfsvfs_update_fromname(const char *oldname, const char *newname)
+{
+ char tmpbuf[MAXPATHLEN];
+ struct mount *mp;
+ char *fromname;
+ size_t oldlen;
+
+ oldlen = strlen(oldname);
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ fromname = mp->mnt_stat.f_mntfromname;
+ if (strcmp(fromname, oldname) == 0) {
+ (void) strlcpy(fromname, newname,
+ sizeof (mp->mnt_stat.f_mntfromname));
+ continue;
+ }
+ if (strncmp(fromname, oldname, oldlen) == 0 &&
+ (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
+ (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s",
+ newname, fromname + oldlen);
+ (void) strlcpy(fromname, tmpbuf,
+ sizeof (mp->mnt_stat.f_mntfromname));
+ continue;
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+}
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
new file mode 100644
index 000000000000..d5f0da9ecd4b
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -0,0 +1,5888 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/endian.h>
+#include <sys/vm.h>
+#include <sys/vnode.h>
+#if __FreeBSD_version >= 1300102
+#include <sys/smr.h>
+#endif
+#include <sys/dirent.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/atomic.h>
+#include <sys/namei.h>
+#include <sys/mman.h>
+#include <sys/cmn_err.h>
+#include <sys/kdb.h>
+#include <sys/sysproto.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/filio.h>
+#include <sys/sid.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_rlock.h>
+#include <sys/extdirent.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/sched.h>
+#include <sys/acl.h>
+#include <sys/vmmeter.h>
+#include <vm/vm_param.h>
+#include <sys/zil.h>
+#include <sys/zfs_vnops.h>
+
+#include <vm/vm_object.h>
+
+#include <sys/extattr.h>
+#include <sys/priv.h>
+
+#ifndef VN_OPEN_INVFS
+#define VN_OPEN_INVFS 0x0
+#endif
+
+VFS_SMR_DECLARE;
+
+#if __FreeBSD_version >= 1300047
+#define vm_page_wire_lock(pp)
+#define vm_page_wire_unlock(pp)
+#else
+#define vm_page_wire_lock(pp) vm_page_lock(pp)
+#define vm_page_wire_unlock(pp) vm_page_unlock(pp)
+#endif
+
+#ifdef DEBUG_VFS_LOCKS
+#define VNCHECKREF(vp) \
+ VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \
+ ("%s: wrong ref counts", __func__));
+#else
+#define VNCHECKREF(vp)
+#endif
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work. To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory. The example below illustrates the following Big Rules:
+ *
+ * (1) A check must be made in each zfs thread for a mounted file system.
+ * This is done while avoiding races by using ZFS_ENTER(zfsvfs).
+ * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
+ * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
+ * can return EIO from the calling function.
+ *
+ * (2) VN_RELE() should always be the last thing except for zil_commit()
+ * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
+ * First, if it's the last reference, the vnode/znode
+ * can be freed, so the zp may point to freed memory. Second, the last
+ * reference will call zfs_zinactive(), which may induce a lot of work --
+ * pushing cached pages (which acquires range locks) and syncing out
+ * cached atime changes. Third, zfs_zinactive() may require a new tx,
+ * which could deadlock the system if you were already holding one.
+ * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
+ *
+ * (3) All range locks must be grabbed before calling dmu_tx_assign(),
+ * as they can span dmu_tx_assign() calls.
+ *
+ * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ * dmu_tx_assign(). This is critical because we don't want to block
+ * while holding locks.
+ *
+ * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
+ * reduces lock contention and CPU usage when we must wait (note that if
+ * throughput is constrained by the storage, nearly every transaction
+ * must wait).
+ *
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing
+ * to use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
+ * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ * to indicate that this operation has already called dmu_tx_wait().
+ * This will ensure that we don't retry forever, waiting a short bit
+ * each time.
+ *
+ * (5) If the operation succeeded, generate the intent log entry for it
+ * before dropping locks. This ensures that the ordering of events
+ * in the intent log matches the order in which they actually occurred.
+ * During ZIL replay the zfs_log_* functions will update the sequence
+ * number to indicate the zil transaction has replayed.
+ *
+ * (6) At the end of each vnode op, the DMU tx must always commit,
+ * regardless of whether there were any errors.
+ *
+ * (7) After dropping all locks, invoke zil_commit(zilog, foid)
+ * to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * top:
+ * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
+ * rw_enter(...); // grab any other locks you need
+ * tx = dmu_tx_create(...); // get DMU tx
+ * dmu_tx_hold_*(); // hold each object you might modify
+ * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ * if (error) {
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * if (error == ERESTART) {
+ * waited = B_TRUE;
+ * dmu_tx_wait(tx);
+ * dmu_tx_abort(tx);
+ * goto top;
+ * }
+ * dmu_tx_abort(tx); // abort DMU tx
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // really out of space
+ * }
+ * error = do_real_work(); // do whatever this VOP does
+ * if (error == 0)
+ * zfs_log_*(...); // on success, make ZIL entry
+ * dmu_tx_commit(tx); // commit DMU tx -- error or not
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * zil_commit(zilog, foid); // synchronous when necessary
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // done, report error
+ */
+
+/* ARGSUSED */
+static int
+zfs_open(vnode_t **vpp, int flag, cred_t *cr)
+{
+ znode_t *zp = VTOZ(*vpp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
+ ((flag & FAPPEND) == 0)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
+ ZTOV(zp)->v_type == VREG &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
+ if (fs_vscan(*vpp, cr, 0) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+ }
+
+ /* Keep a count of the synchronous opens in the znode */
+ if (flag & (FSYNC | FDSYNC))
+ atomic_inc_32(&zp->z_sync_cnt);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* Decrement the synchronous opens in the znode */
+ if ((flag & (FSYNC | FDSYNC)) && (count == 1))
+ atomic_dec_32(&zp->z_sync_cnt);
+
+ if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
+ ZTOV(zp)->v_type == VREG &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
+ VERIFY(fs_vscan(vp, cr, 1) == 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
+ int *rvalp)
+{
+ loff_t off;
+ int error;
+
+ switch (com) {
+ case _FIOFFS:
+ {
+ return (0);
+
+ /*
+		 * The following two ioctls are used by bfu.  Faking them out
+		 * is necessary to avoid bfu errors.
+ */
+ }
+ case _FIOGDIO:
+ case _FIOSDIO:
+ {
+ return (0);
+ }
+
+ case F_SEEK_DATA:
+ case F_SEEK_HOLE:
+ {
+ off = *(offset_t *)data;
+ /* offset parameter is in/out */
+ error = zfs_holey(VTOZ(vp), com, &off);
+ if (error)
+ return (error);
+ *(offset_t *)data = off;
+ return (0);
+ }
+ }
+ return (SET_ERROR(ENOTTY));
+}
+
+static vm_page_t
+page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
+{
+ vm_object_t obj;
+ vm_page_t pp;
+ int64_t end;
+
+ /*
+ * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
+ * aligned boundaries, if the range is not aligned. As a result a
+ * DEV_BSIZE subrange with partially dirty data may get marked as clean.
+ * It may happen that all DEV_BSIZE subranges are marked clean and thus
+	 * the whole page would be considered clean despite having some
+ * dirty data.
+ * For this reason we should shrink the range to DEV_BSIZE aligned
+ * boundaries before calling vm_page_clear_dirty.
+ */
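+	/*
+	 * Illustrative example (assuming DEV_BSIZE is 512): for off = 100 and
+	 * nbytes = 1000 the written bytes span [100, 1100); roundup2() and
+	 * rounddown2() shrink that to [512, 1024), so only the fully covered
+	 * 512-byte block is cleared and the partially written blocks at both
+	 * ends remain dirty.
+	 */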
+ end = rounddown2(off + nbytes, DEV_BSIZE);
+ off = roundup2(off, DEV_BSIZE);
+ nbytes = end - off;
+
+ obj = vp->v_object;
+ zfs_vmobject_assert_wlocked_12(obj);
+#if __FreeBSD_version < 1300050
+ for (;;) {
+ if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+ pp->valid) {
+ if (vm_page_xbusied(pp)) {
+ /*
+ * Reference the page before unlocking and
+ * sleeping so that the page daemon is less
+ * likely to reclaim it.
+ */
+ vm_page_reference(pp);
+ vm_page_lock(pp);
+ zfs_vmobject_wunlock(obj);
+ vm_page_busy_sleep(pp, "zfsmwb", true);
+ zfs_vmobject_wlock(obj);
+ continue;
+ }
+ vm_page_sbusy(pp);
+ } else if (pp != NULL) {
+ ASSERT(!pp->valid);
+ pp = NULL;
+ }
+ if (pp != NULL) {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_object_pip_add(obj, 1);
+ pmap_remove_write(pp);
+ if (nbytes != 0)
+ vm_page_clear_dirty(pp, off, nbytes);
+ }
+ break;
+ }
+#else
+ vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start),
+ VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL |
+ VM_ALLOC_IGN_SBUSY);
+ if (pp != NULL) {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_object_pip_add(obj, 1);
+ pmap_remove_write(pp);
+ if (nbytes != 0)
+ vm_page_clear_dirty(pp, off, nbytes);
+ }
+#endif
+ return (pp);
+}
+
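+/*
+ * Undo page_busy(): drop the shared-busy reference on the page and release
+ * the paging-in-progress count taken on its VM object.
+ */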
+static void
+page_unbusy(vm_page_t pp)
+{
+
+ vm_page_sunbusy(pp);
+#if __FreeBSD_version >= 1300041
+ vm_object_pip_wakeup(pp->object);
+#else
+ vm_object_pip_subtract(pp->object, 1);
+#endif
+}
+
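+/*
+ * Take a reference (a wiring on newer FreeBSD, a hold on older versions) on
+ * the resident, valid page backing the given offset so that mappedread() can
+ * copy from it after dropping the object lock.  Returns NULL if no such page
+ * is present.
+ */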
+#if __FreeBSD_version > 1300051
+static vm_page_t
+page_hold(vnode_t *vp, int64_t start)
+{
+ vm_object_t obj;
+ vm_page_t m;
+
+ obj = vp->v_object;
+ vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start),
+ VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
+ VM_ALLOC_NOBUSY);
+ return (m);
+}
+#else
+static vm_page_t
+page_hold(vnode_t *vp, int64_t start)
+{
+ vm_object_t obj;
+ vm_page_t pp;
+
+ obj = vp->v_object;
+ zfs_vmobject_assert_wlocked(obj);
+
+ for (;;) {
+ if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+ pp->valid) {
+ if (vm_page_xbusied(pp)) {
+ /*
+ * Reference the page before unlocking and
+ * sleeping so that the page daemon is less
+ * likely to reclaim it.
+ */
+ vm_page_reference(pp);
+ vm_page_lock(pp);
+ zfs_vmobject_wunlock(obj);
+ vm_page_busy_sleep(pp, "zfsmwb", true);
+ zfs_vmobject_wlock(obj);
+ continue;
+ }
+
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_wire_lock(pp);
+ vm_page_hold(pp);
+ vm_page_wire_unlock(pp);
+
+ } else
+ pp = NULL;
+ break;
+ }
+ return (pp);
+}
+#endif
+
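+/*
+ * Drop the wiring/hold acquired by page_hold().
+ */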
+static void
+page_unhold(vm_page_t pp)
+{
+
+ vm_page_wire_lock(pp);
+#if __FreeBSD_version >= 1300035
+ vm_page_unwire(pp, PQ_ACTIVE);
+#else
+ vm_page_unhold(pp);
+#endif
+ vm_page_wire_unlock(pp);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Write: If we find a memory mapped page, we write to *both*
+ * the page and the dmu buffer.
+ */
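+/*
+ * Implementation note: the loop below walks the written range one page at a
+ * time; every page that is resident in the vnode's VM object is
+ * shared-busied, mapped through an sf_buf, and refreshed from the DMU with
+ * dmu_read() so the mapped copy never goes stale.
+ */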
+void
+update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
+{
+ vm_object_t obj;
+ struct sf_buf *sf;
+ vnode_t *vp = ZTOV(zp);
+ caddr_t va;
+ int off;
+
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ off = start & PAGEOFFSET;
+ zfs_vmobject_wlock_12(obj);
+#if __FreeBSD_version >= 1300041
+ vm_object_pip_add(obj, 1);
+#endif
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ vm_page_t pp;
+ int nbytes = imin(PAGESIZE - off, len);
+
+ if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
+ zfs_vmobject_wunlock_12(obj);
+
+ va = zfs_map_page(pp, &sf);
+ (void) dmu_read(os, zp->z_id, start + off, nbytes,
+ va + off, DMU_READ_PREFETCH);
+ zfs_unmap_page(sf);
+
+ zfs_vmobject_wlock_12(obj);
+ page_unbusy(pp);
+ }
+ len -= nbytes;
+ off = 0;
+ }
+#if __FreeBSD_version >= 1300041
+ vm_object_pip_wakeup(obj);
+#else
+ vm_object_pip_wakeupn(obj, 0);
+#endif
+ zfs_vmobject_wunlock_12(obj);
+}
+
+/*
+ * Read with UIO_NOCOPY flag means that sendfile(2) requests
+ * ZFS to populate a range of page cache pages with data.
+ *
+ * NOTE: this function could be optimized to pre-allocate
+ * all pages in advance, drain exclusive busy on all of them,
+ * map them into contiguous KVA region and populate them
+ * in one single dmu_read() call.
+ */
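+/*
+ * Implementation note: each page in the range is grabbed shared-busy; a page
+ * that is not yet valid is filled from the DMU (with any tail past the
+ * requested bytes zeroed) and, if the read succeeded, marked valid and
+ * activated.  Pages that are already valid are simply unbusied again.
+ */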
+int
+mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio)
+{
+ vnode_t *vp = ZTOV(zp);
+ objset_t *os = zp->z_zfsvfs->z_os;
+ struct sf_buf *sf;
+ vm_object_t obj;
+ vm_page_t pp;
+ int64_t start;
+ caddr_t va;
+ int len = nbytes;
+ int error = 0;
+
+ ASSERT(zfs_uio_segflg(uio) == UIO_NOCOPY);
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+ ASSERT((zfs_uio_offset(uio) & PAGEOFFSET) == 0);
+
+ zfs_vmobject_wlock_12(obj);
+ for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) {
+ int bytes = MIN(PAGESIZE, len);
+
+ pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start),
+ VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
+ if (vm_page_none_valid(pp)) {
+ zfs_vmobject_wunlock_12(obj);
+ va = zfs_map_page(pp, &sf);
+ error = dmu_read(os, zp->z_id, start, bytes, va,
+ DMU_READ_PREFETCH);
+ if (bytes != PAGESIZE && error == 0)
+ bzero(va + bytes, PAGESIZE - bytes);
+ zfs_unmap_page(sf);
+ zfs_vmobject_wlock_12(obj);
+#if __FreeBSD_version >= 1300081
+ if (error == 0) {
+ vm_page_valid(pp);
+ vm_page_activate(pp);
+ vm_page_do_sunbusy(pp);
+ } else {
+ zfs_vmobject_wlock(obj);
+ if (!vm_page_wired(pp) && pp->valid == 0 &&
+ vm_page_busy_tryupgrade(pp))
+ vm_page_free(pp);
+ else
+ vm_page_sunbusy(pp);
+ zfs_vmobject_wunlock(obj);
+ }
+#else
+ vm_page_do_sunbusy(pp);
+ vm_page_lock(pp);
+ if (error) {
+ if (pp->wire_count == 0 && pp->valid == 0 &&
+ !vm_page_busied(pp))
+ vm_page_free(pp);
+ } else {
+ pp->valid = VM_PAGE_BITS_ALL;
+ vm_page_activate(pp);
+ }
+ vm_page_unlock(pp);
+#endif
+ } else {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_do_sunbusy(pp);
+ }
+ if (error)
+ break;
+ zfs_uio_advance(uio, bytes);
+ len -= bytes;
+ }
+ zfs_vmobject_wunlock_12(obj);
+ return (error);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Read: We "read" preferentially from memory mapped pages,
+ *		else we fall back to the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ * the file is memory mapped.
+ */
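+/*
+ * Implementation note: for each page-sized chunk we first try page_hold();
+ * if the page is resident, its contents are copied out with
+ * vn_io_fault_uiomove(), otherwise the chunk is read straight from the DMU
+ * via dmu_read_uio_dbuf().
+ */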
+int
+mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
+{
+ vnode_t *vp = ZTOV(zp);
+ vm_object_t obj;
+ int64_t start;
+ int len = nbytes;
+ int off;
+ int error = 0;
+
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ start = zfs_uio_offset(uio);
+ off = start & PAGEOFFSET;
+ zfs_vmobject_wlock_12(obj);
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ vm_page_t pp;
+ uint64_t bytes = MIN(PAGESIZE - off, len);
+
+ if ((pp = page_hold(vp, start))) {
+ struct sf_buf *sf;
+ caddr_t va;
+
+ zfs_vmobject_wunlock_12(obj);
+ va = zfs_map_page(pp, &sf);
+ error = vn_io_fault_uiomove(va + off, bytes,
+ GET_UIO_STRUCT(uio));
+ zfs_unmap_page(sf);
+ zfs_vmobject_wlock_12(obj);
+ page_unhold(pp);
+ } else {
+ zfs_vmobject_wunlock_12(obj);
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, bytes);
+ zfs_vmobject_wlock_12(obj);
+ }
+ len -= bytes;
+ off = 0;
+ if (error)
+ break;
+ }
+ zfs_vmobject_wunlock_12(obj);
+ return (error);
+}
+
+int
+zfs_write_simple(znode_t *zp, const void *data, size_t len,
+ loff_t pos, size_t *presid)
+{
+ int error = 0;
+ ssize_t resid;
+
+ error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos,
+ UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread);
+
+ if (error) {
+ return (SET_ERROR(error));
+ } else if (presid == NULL) {
+ if (resid != 0) {
+ error = SET_ERROR(EIO);
+ }
+ } else {
+ *presid = resid;
+ }
+ return (error);
+}
+
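+/*
+ * Release a znode's vnode reference, deferring what may be the final release
+ * to the dsl pool's zrele taskq.  This keeps zfs_zinactive() work, which may
+ * need its own transaction, out of the caller's context (see Big Rule (2)
+ * above).
+ */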
+void
+zfs_zrele_async(znode_t *zp)
+{
+ vnode_t *vp = ZTOV(zp);
+ objset_t *os = ITOZSB(vp)->z_os;
+
+ VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
+}
+
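+/*
+ * Callback for vn_vget_ino_gen(), used by zfs_lookup_lock() for ".."
+ * lookups: lock the vnode passed in via 'arg' with the requested lock flags
+ * and drop the reference if locking fails.
+ */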
+static int
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
+{
+ int error;
+
+ *vpp = arg;
+ error = vn_lock(*vpp, lkflags);
+ if (error != 0)
+ vrele(*vpp);
+ return (error);
+}
+
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+ znode_t *zdp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
+ int error;
+ int ltype;
+
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(dvp, __func__);
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ ASSERT3P(dvp, ==, vp);
+ vref(dvp);
+ ltype = lkflags & LK_TYPE_MASK;
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+ /*
+ * Relock for the "." case could leave us with
+ * reclaimed vnode.
+ */
+ if (VN_IS_DOOMED(dvp)) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
+ }
+ return (0);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ /*
+ * Note that in this case, dvp is the child vnode, and we
+ * are looking up the parent vnode - exactly reverse from
+ * normal operation. Unlocking dvp requires some rather
+ * tricky unlock/relock dance to prevent mp from being freed;
+ * use vn_vget_ino_gen() which takes care of all that.
+ *
+ * XXX Note that there is a time window when both vnodes are
+ * unlocked. It is possible, although highly unlikely, that
+ * during that window the parent-child relationship between
+ * the vnodes may change, for example, get reversed.
+ * In that case we would have a wrong lock order for the vnodes.
+ * All other filesystems seem to ignore this problem, so we
+ * do the same here.
+ * A potential solution could be implemented as follows:
+ * - using LK_NOWAIT when locking the second vnode and retrying
+ * if necessary
+ * - checking that the parent-child relationship still holds
+ * after locking both vnodes and retrying if it doesn't
+ */
+ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+ return (error);
+ } else {
+ error = vn_lock(vp, lkflags);
+ if (error != 0)
+ vrele(vp);
+ return (error);
+ }
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ * IN: dvp - vnode of directory to search.
+ * nm - name of entry to lookup.
+ * pnp - full pathname to lookup [UNUSED].
+ * flags - LOOKUP_XATTR set if looking for an attribute.
+ * rdir - root directory vnode [UNUSED].
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: vpp - vnode of located entry, NULL if not found.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
+ struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td,
+ int flags, boolean_t cached)
+{
+ znode_t *zdp = VTOZ(dvp);
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+ int error = 0;
+
+ /*
+	 * Fast path lookup; however, we must skip the DNLC lookup
+ * for case folding or normalizing lookups because the
+ * DNLC code only stores the passed in name. This means
+ * creating 'a' and removing 'A' on a case insensitive
+ * file system would work, but DNLC still thinks 'a'
+ * exists and won't let you create it again on the next
+ * pass through fast path.
+ */
+ if (!(flags & LOOKUP_XATTR)) {
+ if (dvp->v_type != VDIR) {
+ return (SET_ERROR(ENOTDIR));
+ } else if (zdp->z_sa_hdl == NULL) {
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp,
+ const char *, nm);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zdp);
+
+ *vpp = NULL;
+
+ if (flags & LOOKUP_XATTR) {
+ /*
+ * If the xattr property is off, refuse the lookup request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ /*
+ * We don't allow recursive attributes..
+ * Maybe someday we will.
+ */
+ if (zdp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ *vpp = ZTOV(zp);
+
+ /*
+ * Do we have permission to get into attribute directory?
+ */
+ error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr);
+ if (error) {
+ vrele(ZTOV(zp));
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Check accessibility of directory if we're not coming in via
+ * VOP_CACHEDLOOKUP.
+ */
+ if (!cached) {
+#ifdef NOEXECCHECK
+ if ((cnp->cn_flags & NOEXECCHECK) != 0) {
+ cnp->cn_flags &= ~NOEXECCHECK;
+ } else
+#endif
+ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+
+ /*
+ * First handle the special cases.
+ */
+ if ((cnp->cn_flags & ISDOTDOT) != 0) {
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the vp for the snapshot directory.
+ */
+ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+ struct componentname cn;
+ vnode_t *zfsctl_vp;
+ int ltype;
+
+ ZFS_EXIT(zfsvfs);
+ ltype = VOP_ISLOCKED(dvp);
+ VOP_UNLOCK1(dvp);
+ error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
+ &zfsctl_vp);
+ if (error == 0) {
+ cn.cn_nameptr = "snapshot";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = cnp->cn_nameiop;
+ cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
+ cn.cn_lkflags = cnp->cn_lkflags;
+ error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
+ vput(zfsctl_vp);
+ }
+ vn_lock(dvp, ltype | LK_RETRY);
+ return (error);
+ }
+ }
+ if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+ ZFS_EXIT(zfsvfs);
+ if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+ error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
+ return (error);
+ }
+
+ /*
+	 * The loop retries the lookup if the parent-child relationship
+	 * changes while we work through the dot-dot locking complexities.
+ */
+ for (;;) {
+ uint64_t parent;
+
+ error = zfs_dirlook(zdp, nm, &zp);
+ if (error == 0)
+ *vpp = ZTOV(zp);
+
+ ZFS_EXIT(zfsvfs);
+ if (error != 0)
+ break;
+
+ error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+ if (error != 0) {
+ /*
+ * If we've got a locking error, then the vnode
+ * got reclaimed because of a force unmount.
+ * We never enter doomed vnodes into the name cache.
+ */
+ *vpp = NULL;
+ return (error);
+ }
+
+ if ((cnp->cn_flags & ISDOTDOT) == 0)
+ break;
+
+ ZFS_ENTER(zfsvfs);
+ if (zdp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ } else {
+ error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ }
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ vput(ZTOV(zp));
+ break;
+ }
+ if (zp->z_id == parent) {
+ ZFS_EXIT(zfsvfs);
+ break;
+ }
+ vput(ZTOV(zp));
+ }
+
+ if (error != 0)
+ *vpp = NULL;
+
+ /* Translate errors and add SAVENAME when needed. */
+ if (cnp->cn_flags & ISLASTCN) {
+ switch (nameiop) {
+ case CREATE:
+ case RENAME:
+ if (error == ENOENT) {
+ error = EJUSTRETURN;
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DELETE:
+ if (error == 0)
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ }
+
+ /* Insert name into cache (as non-existent) if appropriate. */
+ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
+ error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(dvp, NULL, cnp);
+
+ /* Insert name into cache if appropriate. */
+ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+ if (!(cnp->cn_flags & ISLASTCN) ||
+ (nameiop != DELETE && nameiop != RENAME)) {
+ cache_enter(dvp, *vpp, cnp);
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory. If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error. Return the vp of the created or trunc'd file.
+ *
+ * IN: dvp - vnode of directory to put new file entry in.
+ * name - name of new file entry.
+ * vap - attributes of new file.
+ * excl - flag indicating exclusive or non-exclusive mode.
+ * mode - mode to open file with.
+ * cr - credentials of caller.
+ * flag - large file flag [UNUSED].
+ * ct - caller context
+ * vsecp - ACL to be set
+ *
+ * OUT: vpp - vnode of created or trunc'd entry.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated if new entry created
+ * vp - ctime|mtime always, atime if new
+ */
+
+/* ARGSUSED */
+int
+zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
+ znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ objset_t *os;
+ dmu_tx_t *tx;
+ int error;
+ ksid_t *ksid;
+ uid_t uid;
+ gid_t gid = crgetgid(cr);
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype;
+#ifdef DEBUG_VFS_LOCKS
+ vnode_t *dvp = ZTOV(dzp);
+#endif
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+ * make sure file system is at proper version
+ */
+
+ ksid = crgetsid(cr, KSID_OWNER);
+ if (ksid)
+ uid = ksid_getid(ksid);
+ else
+ uid = crgetuid(cr);
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || (vap->va_mask & AT_XVATTR) ||
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ os = zfsvfs->z_os;
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (vap->va_mask & AT_XVATTR) {
+ if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_type)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ *zpp = NULL;
+
+ if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
+ vap->va_mode &= ~S_ISVTX;
+
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ASSERT3P(zp, ==, NULL);
+
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ goto out;
+ }
+
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
+
+ if ((dzp->z_pflags & ZFS_XATTR) &&
+ (vap->va_type != VREG)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+ projid = zfs_inherit_projid(dzp);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
+
+ getnewvnode_reserve_();
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+ txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+ vsecp, acl_ids.z_fuidp, vap);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+out:
+ VNCHECKREF(dvp);
+ if (error == 0) {
+ *zpp = zp;
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ * IN: dvp - vnode of directory to remove entry from.
+ * name - name of entry to remove.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime
+ * vp - ctime (if nlink > 0)
+ */
+
+/*ARGSUSED*/
+static int
+zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
+{
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp;
+ znode_t *xzp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t xattr_obj;
+ uint64_t obj = 0;
+ dmu_tx_t *tx;
+ boolean_t unlinked;
+ uint64_t txtype;
+ int error;
+
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zp = VTOZ(vp);
+ ZFS_VERIFY_ZP(zp);
+ zilog = zfsvfs->z_log;
+
+ xattr_obj = 0;
+ xzp = NULL;
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ /*
+ * Need to use rmdir for removing directories.
+ */
+ if (vp->v_type == VDIR) {
+ error = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ vnevent_remove(vp, dvp, name, ct);
+
+ obj = zp->z_id;
+
+ /* are there any extended attributes? */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT0(error);
+ }
+
+ /*
+ * We may delete the znode now, or we may put it in the unlinked set;
+ * it depends on whether we're the last link, and on whether there are
+ * other holds on the vnode. So we dmu_tx_hold() the right things to
+ * allow for either case.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+
+ if (xzp) {
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ }
+
+ /* charge as an update -- would be nice not to charge at all */
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ /*
+ * Mark this transaction as typically resulting in a net free of space
+ */
+ dmu_tx_mark_netfree(tx);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Remove the directory entry.
+ */
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (unlinked) {
+ zfs_unlinked_add(zp, tx);
+ vp->v_vflag |= VV_NOSYNC;
+ }
+ /* XXX check changes to linux vnops */
+ txtype = TX_REMOVE;
+ zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
+
+ dmu_tx_commit(tx);
+out:
+
+ if (xzp)
+ vrele(ZTOV(xzp));
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+
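+/*
+ * Resolve 'name' under dzp the way a VOP_LOOKUP would: set up a
+ * componentname for the given nameiop and either go through the name cache
+ * (vfs_cache_lookup()) when it is enabled and we are not replaying the ZIL,
+ * or call zfs_lookup() directly.  Used by zfs_remove() and zfs_rmdir() to
+ * turn name-based operations into a locked vnode.
+ */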
+static int
+zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp,
+ struct componentname *cnp, int nameiop)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ int error;
+
+ cnp->cn_nameptr = __DECONST(char *, name);
+ cnp->cn_namelen = strlen(name);
+ cnp->cn_nameiop = nameiop;
+ cnp->cn_flags = ISLASTCN | SAVENAME;
+ cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+ cnp->cn_cred = kcred;
+ cnp->cn_thread = curthread;
+
+ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) {
+ struct vop_lookup_args a;
+
+ a.a_gen.a_desc = &vop_lookup_desc;
+ a.a_dvp = ZTOV(dzp);
+ a.a_vpp = vpp;
+ a.a_cnp = cnp;
+ error = vfs_cache_lookup(&a);
+ } else {
+ error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred,
+ curthread, 0, B_FALSE);
+ }
+#ifdef ZFS_DEBUG
+ if (error) {
+ printf("got error %d on name %s on op %d\n", error, name,
+ nameiop);
+ kdb_backtrace();
+ }
+#endif
+ return (error);
+}
+
+int
+zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags)
+{
+ vnode_t *vp;
+ int error;
+ struct componentname cn;
+
+ if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
+ return (error);
+
+ error = zfs_remove_(ZTOV(dzp), vp, name, cr);
+ vput(vp);
+ return (error);
+}
+/*
+ * Create a new directory and insert it into dvp using the name
+ * provided. Return a pointer to the inserted directory.
+ *
+ * IN: dvp - vnode of directory to add subdir to.
+ * dirname - name of new directory.
+ * vap - attributes of new directory.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ * vsecp - ACL to be set
+ *
+ * OUT: vpp - vnode of created directory.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ * vp - ctime|mtime|atime updated
+ */
+/*ARGSUSED*/
+int
+zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
+ cred_t *cr, int flags, vsecattr_t *vsecp)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t txtype;
+ dmu_tx_t *tx;
+ int error;
+ ksid_t *ksid;
+ uid_t uid;
+ gid_t gid = crgetgid(cr);
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+
+ ASSERT(vap->va_type == VDIR);
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+ * make sure file system is at proper version
+ */
+
+ ksid = crgetsid(cr, KSID_OWNER);
+ if (ksid)
+ uid = ksid_getid(ksid);
+ else
+ uid = crgetuid(cr);
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ ((vap->va_mask & AT_XVATTR) ||
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (dzp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(dirname,
+ strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (vap->va_mask & AT_XVATTR) {
+ if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_type)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+ NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * First make sure the new directory doesn't exist.
+ *
+ * Existence is checked first to make sure we don't return
+ * EACCES instead of EEXIST which can cause some applications
+ * to fail.
+ */
+ *zpp = NULL;
+
+ if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ASSERT3P(zp, ==, NULL);
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ /*
+ * Add a new entry to the directory.
+ */
+ getnewvnode_reserve_();
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
+
+ *zpp = zp;
+
+ txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
+ acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
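+/*
+ * Compatibility shim: older FreeBSD lacks cache_vop_rmdir(), so emulate it
+ * by purging the name cache entries of both the parent directory and the
+ * directory being removed.
+ */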
+#if __FreeBSD_version < 1300124
+static void
+cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
+{
+
+ cache_purge(dvp);
+ cache_purge(vp);
+}
+#endif
+
+/*
+ * Remove a directory subdir entry. If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dvp - vnode of directory to remove from.
+ * name - name of directory to be removed.
+ * cwd - vnode of current working directory.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
+{
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ ZFS_VERIFY_ZP(zp);
+ zilog = zfsvfs->z_log;
+
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ if (vp->v_type != VDIR) {
+ error = SET_ERROR(ENOTDIR);
+ goto out;
+ }
+
+ vnevent_rmdir(vp, dvp, name, ct);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
+
+ if (error == 0) {
+ uint64_t txtype = TX_RMDIR;
+ zfs_log_remove(zilog, tx, txtype, dzp, name,
+ ZFS_NO_OBJECT, B_FALSE);
+ }
+
+ dmu_tx_commit(tx);
+
+ cache_vop_rmdir(dvp, vp);
+out:
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+int
+zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags)
+{
+ struct componentname cn;
+ vnode_t *vp;
+ int error;
+
+ if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
+ return (error);
+
+ error = zfs_rmdir_(ZTOV(dzp), vp, name, cr);
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position (specified in
+ * the uio structure).
+ *
+ * IN: vp - vnode of directory to read.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ * eofp - set to true if end-of-file detected.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap are always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+static int
+zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
+ int *ncookies, ulong_t **cookies)
+{
+ znode_t *zp = VTOZ(vp);
+ iovec_t *iovp;
+ edirent_t *eodp;
+ dirent64_t *odp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os;
+ caddr_t outbuf;
+ size_t bufsize;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ uint_t bytes_wanted;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+ uint64_t parent;
+ int local_eof;
+ int outcount;
+ int error;
+ uint8_t prefetch;
+ boolean_t check_sysattrs;
+ uint8_t type;
+ int ncooks;
+ ulong_t *cooks = NULL;
+ int flags = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * If we are not given an eof variable,
+ * use a local one.
+ */
+ if (eofp == NULL)
+ eofp = &local_eof;
+
+ /*
+ * Check for valid iov_len.
+ */
+ if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Quit if directory has been removed (posix)
+ */
+ if ((*eofp = zp->z_unlinked) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ error = 0;
+ os = zfsvfs->z_os;
+ offset = zfs_uio_offset(uio);
+ prefetch = zp->z_zn_prefetch;
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+ }
+
+ /*
+ * Get space to change directory entries into fs independent format.
+ */
+ iovp = GET_UIO_STRUCT(uio)->uio_iov;
+ bytes_wanted = iovp->iov_len;
+ if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) {
+ bufsize = bytes_wanted;
+ outbuf = kmem_alloc(bufsize, KM_SLEEP);
+ odp = (struct dirent64 *)outbuf;
+ } else {
+ bufsize = bytes_wanted;
+ outbuf = NULL;
+ odp = (struct dirent64 *)iovp->iov_base;
+ }
+ eodp = (struct edirent *)odp;
+
+ if (ncookies != NULL) {
+ /*
+ * Minimum entry size is dirent size and 1 byte for a file name.
+ */
+ ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) -
+ sizeof (((struct dirent *)NULL)->d_name) + 1);
+ cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK);
+ *cookies = cooks;
+ *ncookies = ncooks;
+ }
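+	/*
+	 * Each cookie handed back to the VFS is the offset of the *next*
+	 * entry: either the serialized ZAP cursor position or one of the
+	 * reserved low offsets used for the '.', '..' and '.zfs' entries.
+	 * Unused slots are subtracted from *ncookies after the loop below.
+	 */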
+ /*
+ * If this VFS supports the system attribute view interface; and
+ * we're looking at an extended attribute directory; and we care
+ * about normalization conflicts on this vfs; then we must check
+ * for normalization conflicts with the sysattr name space.
+ */
+#ifdef TODO
+ check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
+ (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
+ (flags & V_RDDIR_ENTFLAGS);
+#else
+ check_sysattrs = 0;
+#endif
+
+ /*
+ * Transform to file-system independent format
+ */
+ outcount = 0;
+ while (outcount < bytes_wanted) {
+ ino64_t objnum;
+ ushort_t reclen;
+ off64_t *next = NULL;
+
+ /*
+ * Special case `.', `..', and `.zfs'.
+ */
+ if (offset == 0) {
+ (void) strcpy(zap.za_name, ".");
+ zap.za_normalization_conflict = 0;
+ objnum = zp->z_id;
+ type = DT_DIR;
+ } else if (offset == 1) {
+ (void) strcpy(zap.za_name, "..");
+ zap.za_normalization_conflict = 0;
+ objnum = parent;
+ type = DT_DIR;
+ } else if (offset == 2 && zfs_show_ctldir(zp)) {
+ (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+ zap.za_normalization_conflict = 0;
+ objnum = ZFSCTL_INO_ROOT;
+ type = DT_DIR;
+ } else {
+ /*
+ * Grab next entry.
+ */
+ if ((error = zap_cursor_retrieve(&zc, &zap))) {
+ if ((*eofp = (error == ENOENT)) != 0)
+ break;
+ else
+ goto update;
+ }
+
+ if (zap.za_integer_length != 8 ||
+ zap.za_num_integers != 1) {
+ cmn_err(CE_WARN, "zap_readdir: bad directory "
+ "entry, obj = %lld, offset = %lld\n",
+ (u_longlong_t)zp->z_id,
+ (u_longlong_t)offset);
+ error = SET_ERROR(ENXIO);
+ goto update;
+ }
+
+ objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+ /*
+ * MacOS X can extract the object type here such as:
+ * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+ */
+ type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+
+ if (check_sysattrs && !zap.za_normalization_conflict) {
+#ifdef TODO
+ zap.za_normalization_conflict =
+ xattr_sysattr_casechk(zap.za_name);
+#else
+ panic("%s:%u: TODO", __func__, __LINE__);
+#endif
+ }
+ }
+
+ if (flags & V_RDDIR_ACCFILTER) {
+ /*
+ * If we have no access at all, don't include
+ * this entry in the returned information
+ */
+ znode_t *ezp;
+ if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
+ goto skip_entry;
+ if (!zfs_has_access(ezp, cr)) {
+ vrele(ZTOV(ezp));
+ goto skip_entry;
+ }
+ vrele(ZTOV(ezp));
+ }
+
+ if (flags & V_RDDIR_ENTFLAGS)
+ reclen = EDIRENT_RECLEN(strlen(zap.za_name));
+ else
+ reclen = DIRENT64_RECLEN(strlen(zap.za_name));
+
+ /*
+ * Will this entry fit in the buffer?
+ */
+ if (outcount + reclen > bufsize) {
+ /*
+ * Did we manage to fit anything in the buffer?
+ */
+ if (!outcount) {
+ error = SET_ERROR(EINVAL);
+ goto update;
+ }
+ break;
+ }
+ if (flags & V_RDDIR_ENTFLAGS) {
+ /*
+ * Add extended flag entry:
+ */
+ eodp->ed_ino = objnum;
+ eodp->ed_reclen = reclen;
+ /* NOTE: ed_off is the offset for the *next* entry */
+ next = &(eodp->ed_off);
+ eodp->ed_eflags = zap.za_normalization_conflict ?
+ ED_CASE_CONFLICT : 0;
+ (void) strncpy(eodp->ed_name, zap.za_name,
+ EDIRENT_NAMELEN(reclen));
+ eodp = (edirent_t *)((intptr_t)eodp + reclen);
+ } else {
+ /*
+ * Add normal entry:
+ */
+ odp->d_ino = objnum;
+ odp->d_reclen = reclen;
+ odp->d_namlen = strlen(zap.za_name);
+ /* NOTE: d_off is the offset for the *next* entry. */
+ next = &odp->d_off;
+ strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
+ odp->d_type = type;
+ dirent_terminate(odp);
+ odp = (dirent64_t *)((intptr_t)odp + reclen);
+ }
+ outcount += reclen;
+
+ ASSERT(outcount <= bufsize);
+
+ /* Prefetch znode */
+ if (prefetch)
+ dmu_prefetch(os, objnum, 0, 0, 0,
+ ZIO_PRIORITY_SYNC_READ);
+
+ skip_entry:
+ /*
+ * Move to the next entry, fill in the previous offset.
+ */
+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+ zap_cursor_advance(&zc);
+ offset = zap_cursor_serialize(&zc);
+ } else {
+ offset += 1;
+ }
+
+ /* Fill the offset right after advancing the cursor. */
+ if (next != NULL)
+ *next = offset;
+ if (cooks != NULL) {
+ *cooks++ = offset;
+ ncooks--;
+ KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
+ }
+ }
+ zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+ /* Subtract unused cookies */
+ if (ncookies != NULL)
+ *ncookies -= ncooks;
+
+ if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) {
+ iovp->iov_base += outcount;
+ iovp->iov_len -= outcount;
+ zfs_uio_resid(uio) -= outcount;
+ } else if ((error =
+ zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) {
+ /*
+ * Reset the pointer.
+ */
+ offset = zfs_uio_offset(uio);
+ }
+
+update:
+ zap_cursor_fini(&zc);
+ if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1)
+ kmem_free(outbuf, bufsize);
+
+ if (error == ENOENT)
+ error = 0;
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ zfs_uio_setoffset(uio, offset);
+ ZFS_EXIT(zfsvfs);
+ if (error != 0 && cookies != NULL) {
+ free(*cookies, M_TEMP);
+ *cookies = NULL;
+ *ncookies = 0;
+ }
+ return (error);
+}
+
+/*
+ * Get the requested file attributes and place them in the provided
+ * vattr structure.
+ *
+ * IN: vp - vnode of file.
+ * vap - va_mask identifies requested attributes.
+ * If AT_XVATTR set, then optional attrs are requested
+ * flags - ATTR_NOACLCHECK (CIFS server context)
+ * cr - credentials of caller.
+ *
+ * OUT: vap - attribute values.
+ *
+ * RETURN:	0 on success, error code on failure.
+ */
+/* ARGSUSED */
+static int
+zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error = 0;
+ uint32_t blksize;
+ u_longlong_t nblocks;
+ uint64_t mtime[2], ctime[2], crtime[2], rdev;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap = NULL;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+
+ if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
+ * Also, if we are the owner don't bother, since owner should
+ * always be allowed to read basic attributes of file.
+ */
+ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+ (vap->va_uid != crgetuid(cr))) {
+ if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
+ skipaclchk, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ /*
+ * Return all attributes. It's cheaper to provide the answer
+ * than to determine whether we were asked the question.
+ */
+
+ vap->va_type = IFTOVT(zp->z_mode);
+ vap->va_mode = zp->z_mode & ~S_IFMT;
+ vn_fsid(vp, vap);
+ vap->va_nodeid = zp->z_id;
+ vap->va_nlink = zp->z_links;
+ if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
+ zp->z_links < ZFS_LINK_MAX)
+ vap->va_nlink++;
+ vap->va_size = zp->z_size;
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ vap->va_rdev = zfs_cmpldev(rdev);
+ vap->va_seq = zp->z_seq;
+ vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
+ vap->va_filerev = zp->z_seq;
+
+ /*
+ * Add in any requested optional attributes and the create time.
+ * Also set the corresponding bits in the returned attribute bitmap.
+ */
+ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ xoap->xoa_archive =
+ ((zp->z_pflags & ZFS_ARCHIVE) != 0);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ xoap->xoa_readonly =
+ ((zp->z_pflags & ZFS_READONLY) != 0);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ xoap->xoa_system =
+ ((zp->z_pflags & ZFS_SYSTEM) != 0);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ xoap->xoa_hidden =
+ ((zp->z_pflags & ZFS_HIDDEN) != 0);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ xoap->xoa_nounlink =
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ xoap->xoa_immutable =
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ xoap->xoa_appendonly =
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ xoap->xoa_nodump =
+ ((zp->z_pflags & ZFS_NODUMP) != 0);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ xoap->xoa_opaque =
+ ((zp->z_pflags & ZFS_OPAQUE) != 0);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ xoap->xoa_av_quarantined =
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ xoap->xoa_av_modified =
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
+ vp->v_type == VREG) {
+ zfs_sa_get_scanstamp(zp, xvap);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+ xoap->xoa_generation = zp->z_gen;
+ XVA_SET_RTN(xvap, XAT_GEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ xoap->xoa_offline =
+ ((zp->z_pflags & ZFS_OFFLINE) != 0);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ xoap->xoa_sparse =
+ ((zp->z_pflags & ZFS_SPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ xoap->xoa_projinherit =
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
+ XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ xoap->xoa_projid = zp->z_projid;
+ XVA_SET_RTN(xvap, XAT_PROJID);
+ }
+ }
+
+ ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, ctime);
+ ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
+
+
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+ vap->va_blksize = blksize;
+ vap->va_bytes = nblocks << 9; /* nblocks * 512 */
+
+ if (zp->z_blksz == 0) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ vap->va_blksize = zfsvfs->z_max_blksz;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ * IN: zp - znode of file to be modified.
+ * vap - new attribute values.
+ * If AT_XVATTR set, then optional attrs are being set
+ * flags - ATTR_UTIME set if non-default time values provided.
+ * - ATTR_NOACLCHECK (CIFS context only).
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+int
+zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
+{
+ vnode_t *vp = ZTOV(zp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ vattr_t oldva;
+ xvattr_t tmpxvattr;
+ uint_t mask = vap->va_mask;
+ uint_t saved_mask = 0;
+ uint64_t saved_mode;
+ int trim_mask = 0;
+ uint64_t new_mode;
+ uint64_t new_uid, new_gid;
+ uint64_t xattr_obj;
+ uint64_t mtime[2], ctime[2];
+ uint64_t projid = ZFS_INVALID_PROJID;
+ znode_t *attrzp;
+ int need_policy = FALSE;
+ int err, err2;
+ zfs_fuid_info_t *fuidp = NULL;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap;
+ zfs_acl_t *aclp;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
+ sa_bulk_attr_t bulk[7], xattr_bulk[7];
+ int count = 0, xattr_count = 0;
+
+ if (mask == 0)
+ return (0);
+
+ if (mask & AT_NOSET)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ zilog = zfsvfs->z_log;
+
+ /*
+ * Make sure that if we have ephemeral uid/gid or xvattr specified
+ * that file system is at proper version level
+ */
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+ ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+ (mask & AT_XVATTR))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (mask & AT_SIZE && vp->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If this is an xvattr_t, then get a pointer to the structure of
+ * optional attributes. If this is NULL, then we have a vattr_t.
+ */
+ xoap = xva_getxoptattr(xvap);
+
+ xva_init(&tmpxvattr);
+
+ /*
+ * Immutable files can only alter immutable bit and atime
+ */
+ if ((zp->z_pflags & ZFS_IMMUTABLE) &&
+ ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
+ ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * Note: ZFS_READONLY is handled in zfs_zaccess_common.
+ */
+
+ /*
+	 * Verify that the timestamps don't overflow 32 bits.
+	 * ZFS can handle large timestamps, but 32-bit syscalls can't
+ * handle times greater than 2039. This check should be removed
+ * once large timestamps are fully supported.
+ */
+ if (mask & (AT_ATIME | AT_MTIME)) {
+ if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
+ ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOVERFLOW));
+ }
+ }
+ if (xoap != NULL && (mask & AT_XVATTR)) {
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
+ TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOVERFLOW));
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ if (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ projid = xoap->xoa_projid;
+ if (unlikely(projid == ZFS_INVALID_PROJID)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
+ projid = ZFS_INVALID_PROJID;
+ else
+ need_policy = TRUE;
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
+ (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
+ (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+ }
+
+ attrzp = NULL;
+ aclp = NULL;
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & AT_SIZE) {
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ /* XXX - would it be OK to generate a log record here? */
+ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ }
+
+ if (mask & (AT_ATIME|AT_MTIME) ||
+ ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
+ XVA_ISSET_REQ(xvap, XAT_READONLY) ||
+ XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+ XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+ XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
+ XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
+ XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
+ need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
+ skipaclchk, cr);
+ }
+
+ if (mask & (AT_UID|AT_GID)) {
+ int idmask = (mask & (AT_UID|AT_GID));
+ int take_owner;
+ int take_group;
+
+ /*
+ * NOTE: even if a new mode is being set,
+ * we may clear S_ISUID/S_ISGID bits.
+ */
+
+ if (!(mask & AT_MODE))
+ vap->va_mode = zp->z_mode;
+
+ /*
+ * Take ownership or chgrp to group we are a member of
+ */
+
+ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
+ take_group = (mask & AT_GID) &&
+ zfs_groupmember(zfsvfs, vap->va_gid, cr);
+
+ /*
+ * If both AT_UID and AT_GID are set then take_owner and
+ * take_group must both be set in order to allow taking
+ * ownership.
+ *
+ * Otherwise, send the check through secpolicy_vnode_setattr()
+ *
+ */
+
+ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
+ ((idmask == AT_UID) && take_owner) ||
+ ((idmask == AT_GID) && take_group)) {
+ if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
+ skipaclchk, cr) == 0) {
+ /*
+ * Remove setuid/setgid for non-privileged users
+ */
+ secpolicy_setid_clear(vap, vp, cr);
+ trim_mask = (mask & (AT_UID|AT_GID));
+ } else {
+ need_policy = TRUE;
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ oldva.va_mode = zp->z_mode;
+ zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
+ if (mask & AT_XVATTR) {
+ /*
+ * Update xvattr mask to include only those attributes
+ * that are actually changing.
+ *
+		 * The bits will be restored prior to actually setting
+ * the attributes so the caller thinks they were set.
+ */
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ if (xoap->xoa_appendonly !=
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_APPENDONLY);
+ XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ if (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
+ XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ if (xoap->xoa_nounlink !=
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NOUNLINK);
+ XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ if (xoap->xoa_immutable !=
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
+ XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ if (xoap->xoa_nodump !=
+ ((zp->z_pflags & ZFS_NODUMP) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NODUMP);
+ XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ if (xoap->xoa_av_modified !=
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
+ XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ if ((vp->v_type != VREG &&
+ xoap->xoa_av_quarantined) ||
+ xoap->xoa_av_quarantined !=
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
+ XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (need_policy == FALSE &&
+ (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
+ XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
+ need_policy = TRUE;
+ }
+ }
+
+ if (mask & AT_MODE) {
+ if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
+ err = secpolicy_setid_setsticky_clear(vp, vap,
+ &oldva, cr);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ trim_mask |= AT_MODE;
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ if (need_policy) {
+ /*
+ * If trim_mask is set then take ownership
+ * has been granted or write_acl is present and user
+ * has the ability to modify mode. In that case remove
+		 * UID|GID and/or MODE from mask so that
+ * secpolicy_vnode_setattr() doesn't revoke it.
+ */
+
+ if (trim_mask) {
+ saved_mask = vap->va_mask;
+ vap->va_mask &= ~trim_mask;
+ if (trim_mask & AT_MODE) {
+ /*
+ * Save the mode, as secpolicy_vnode_setattr()
+				 * will overwrite it with oldva.va_mode.
+ */
+ saved_mode = vap->va_mode;
+ }
+ }
+ err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ if (trim_mask) {
+ vap->va_mask |= saved_mask;
+ if (trim_mask & AT_MODE) {
+ /*
+ * Recover the mode after
+ * secpolicy_vnode_setattr().
+ */
+ vap->va_mode = saved_mode;
+ }
+ }
+ }
+
+ /*
+	 * secpolicy_vnode_setattr() or the take-ownership path above
+	 * may have changed va_mask.
+ */
+ mask = vap->va_mask;
+
+ if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+
+ if (err == 0 && xattr_obj) {
+ err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
+ if (err == 0) {
+ err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
+ if (err != 0)
+ vrele(ZTOV(attrzp));
+ }
+ if (err)
+ goto out2;
+ }
+ if (mask & AT_UID) {
+ new_uid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+ if (new_uid != zp->z_uid &&
+ zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
+ new_uid)) {
+ if (attrzp)
+ vput(ZTOV(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (mask & AT_GID) {
+ new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &fuidp);
+ if (new_gid != zp->z_gid &&
+ zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ new_gid)) {
+ if (attrzp)
+ vput(ZTOV(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
+ if (attrzp)
+ vput(ZTOV(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+ tx = dmu_tx_create(os);
+
+ if (mask & AT_MODE) {
+ uint64_t pmode = zp->z_mode;
+ uint64_t acl_obj;
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+ !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
+ err = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
+ goto out;
+
+ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+ /*
+ * Are we upgrading ACL from old V0 format
+ * to V1 format?
+ */
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) ==
+ ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0,
+ aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ }
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ } else {
+ if (((mask & AT_XVATTR) &&
+ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
+ (projid != ZFS_INVALID_PROJID &&
+ !(zp->z_pflags & ZFS_PROJID)))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ }
+
+ if (attrzp) {
+ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+ }
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
+ goto out;
+
+ count = 0;
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
+
+ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
+ /*
+		 * An existing object created on an older system has no slot
+		 * in its on-disk layout for the project ID attribute. The
+		 * quota accounting logic needs to access that slot directly
+		 * by offset, so adjust the old object's layout to place the
+		 * project ID at a unified, fixed offset.
+ */
+ if (attrzp)
+ err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
+ if (err == 0)
+ err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+
+ if (unlikely(err == EEXIST))
+ err = 0;
+ else if (err != 0)
+ goto out;
+ else
+ projid = ZFS_INVALID_PROJID;
+ }
+
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&zp->z_acl_lock);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&attrzp->z_acl_lock);
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+ sizeof (attrzp->z_pflags));
+ if (projid != ZFS_INVALID_PROJID) {
+ attrzp->z_projid = projid;
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
+ sizeof (attrzp->z_projid));
+ }
+ }
+
+ if (mask & (AT_UID|AT_GID)) {
+
+ if (mask & AT_UID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &new_uid, sizeof (new_uid));
+ zp->z_uid = new_uid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+ sizeof (new_uid));
+ attrzp->z_uid = new_uid;
+ }
+ }
+
+ if (mask & AT_GID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+ NULL, &new_gid, sizeof (new_gid));
+ zp->z_gid = new_gid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+ sizeof (new_gid));
+ attrzp->z_gid = new_gid;
+ }
+ }
+ if (!(mask & AT_MODE)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+ NULL, &new_mode, sizeof (new_mode));
+ new_mode = zp->z_mode;
+ }
+ err = zfs_acl_chown_setattr(zp);
+ ASSERT(err == 0);
+ if (attrzp) {
+ err = zfs_acl_chown_setattr(attrzp);
+ ASSERT(err == 0);
+ }
+ }
+
+ if (mask & AT_MODE) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &new_mode, sizeof (new_mode));
+ zp->z_mode = new_mode;
+ ASSERT3U((uintptr_t)aclp, !=, 0);
+ err = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT0(err);
+ if (zp->z_acl_cached)
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = aclp;
+ aclp = NULL;
+ }
+
+ if (mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
+ }
+
+ if (mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ }
+
+ if (projid != ZFS_INVALID_PROJID) {
+ zp->z_projid = projid;
+ SA_ADD_BULK_ATTR(bulk, count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+ sizeof (zp->z_projid));
+ }
+
+ /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
+ if (mask & AT_SIZE && !(mask & AT_MTIME)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ } else if (mask != 0) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime);
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
+ mtime, ctime);
+ }
+ }
+
+ /*
+ * Do this after setting timestamps to prevent timestamp
+	 * update from toggling the bit.
+ */
+
+ if (xoap && (mask & AT_XVATTR)) {
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+ xoap->xoa_createtime = vap->va_birthtime;
+ /*
+		 * Restore the trimmed-off masks so that the return
+		 * masks can be set for the caller.
+ */
+
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
+ XVA_SET_REQ(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
+ XVA_SET_REQ(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
+ XVA_SET_REQ(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
+ XVA_SET_REQ(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
+ XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
+ XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) {
+ XVA_SET_REQ(xvap, XAT_PROJINHERIT);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ ASSERT(vp->v_type == VREG);
+
+ zfs_xvattr_set(zp, xvap, tx);
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (mask != 0)
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
+
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&zp->z_acl_lock);
+
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&attrzp->z_acl_lock);
+ }
+out:
+ if (err == 0 && attrzp) {
+ err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+ xattr_count, tx);
+ ASSERT(err2 == 0);
+ }
+
+ if (attrzp)
+ vput(ZTOV(attrzp));
+
+ if (aclp)
+ zfs_acl_free(aclp);
+
+ if (fuidp) {
+ zfs_fuid_info_free(fuidp);
+ fuidp = NULL;
+ }
+
+ if (err) {
+ dmu_tx_abort(tx);
+ } else {
+ err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ dmu_tx_commit(tx);
+ }
+
+out2:
+ if (os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+/*
+ * We acquire all locks other than the lock on sdvp using non-blocking
+ * acquisitions. If we fail to acquire any lock in the path we will drop
+ * all held locks, acquire the new lock in a blocking fashion, and then
+ * release it and restart the rename. This acquire/release step ensures
+ * that we do not spin on a lock waiting for its release. On error we
+ * release all vnode locks and decrement references the way
+ * tmpfs_rename() would do.
+ */
+static int
+zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
+ struct vnode *tdvp, struct vnode **tvpp,
+ const struct componentname *scnp, const struct componentname *tcnp)
+{
+ zfsvfs_t *zfsvfs;
+ struct vnode *nvp, *svp, *tvp;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ const char *snm = scnp->cn_nameptr;
+ const char *tnm = tcnp->cn_nameptr;
+ int error;
+
+ VOP_UNLOCK1(tdvp);
+ if (*tvpp != NULL && *tvpp != tdvp)
+ VOP_UNLOCK1(*tvpp);
+
+relock:
+ error = vn_lock(sdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ sdzp = VTOZ(sdvp);
+
+ error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK1(sdvp);
+ if (error != EBUSY)
+ goto out;
+ error = vn_lock(tdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ VOP_UNLOCK1(tdvp);
+ goto relock;
+ }
+ tdzp = VTOZ(tdvp);
+
+ /*
+ * Before using sdzp and tdzp we must ensure that they are live.
+ * As a porting legacy from illumos we have two things to worry
+ * about. One is typical for FreeBSD and it is that the vnode is
+ * not reclaimed (doomed). The other is that the znode is live.
+ * The current code can invalidate the znode without acquiring the
+ * corresponding vnode lock if the object represented by the znode
+ * and vnode is no longer valid after a rollback or receive operation.
+ * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
+ * that protects the znodes from the invalidation.
+ */
+ zfsvfs = sdzp->z_zfsvfs;
+ ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
+ ZFS_ENTER(zfsvfs);
+
+ /*
+	 * We cannot use ZFS_VERIFY_ZP() here because it could return directly,
+ * bypassing the cleanup code in the case of an error.
+ */
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ error = SET_ERROR(EIO);
+ goto out;
+ }
+
+ /*
+ * Re-resolve svp to be certain it still exists and fetch the
+ * correct vnode.
+ */
+ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
+ if (error != 0) {
+ /* Source entry invalid or not there. */
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ if ((scnp->cn_flags & ISDOTDOT) != 0 ||
+ (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ svp = ZTOV(szp);
+
+ /*
+	 * Re-resolve tvp; if it disappeared we just carry on.
+ */
+ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ vrele(svp);
+ if ((tcnp->cn_flags & ISDOTDOT) != 0)
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ if (tzp != NULL)
+ tvp = ZTOV(tzp);
+ else
+ tvp = NULL;
+
+ /*
+ * At present the vnode locks must be acquired before z_teardown_lock,
+ * although it would be more logical to use the opposite order.
+ */
+ ZFS_EXIT(zfsvfs);
+
+ /*
+	 * Now try to acquire locks on svp and tvp.
+ */
+ nvp = svp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ if (tvp != NULL)
+ vrele(tvp);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ VOP_UNLOCK1(nvp);
+ /*
+ * Concurrent rename race.
+ * XXX ?
+ */
+ if (nvp == tdvp) {
+ vrele(nvp);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+ goto relock;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+
+ if (*tvpp != NULL)
+ vrele(*tvpp);
+ *tvpp = NULL;
+ if (tvp != NULL) {
+ nvp = tvp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ VOP_UNLOCK1(*svpp);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ vput(nvp);
+ goto relock;
+ }
+ *tvpp = nvp;
+ }
+
+ return (0);
+
+out:
+ return (error);
+}
+
+/*
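+ * Check that the target directory (tdzp) is not a descendant of the source
+ * object (szp), i.e. that the rename would not move a directory underneath
+ * itself.
+ *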
+ * Note that we must use VRELE_ASYNC in this function as it walks
+ * up the directory tree and vrele may need to acquire an exclusive
+ * lock if the last reference to a vnode is dropped.
+ */
+static int
+zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *zp, *zp1;
+ uint64_t parent;
+ int error;
+
+ zfsvfs = tdzp->z_zfsvfs;
+ if (tdzp == szp)
+ return (SET_ERROR(EINVAL));
+ if (tdzp == sdzp)
+ return (0);
+ if (tdzp->z_id == zfsvfs->z_root)
+ return (0);
+ zp = tdzp;
+ for (;;) {
+ ASSERT(!zp->z_unlinked);
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ break;
+
+ if (parent == szp->z_id) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ if (parent == zfsvfs->z_root)
+ break;
+ if (parent == sdzp->z_id)
+ break;
+
+ error = zfs_zget(zfsvfs, parent, &zp1);
+ if (error != 0)
+ break;
+
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)));
+ zp = zp1;
+ }
+
+ if (error == ENOTDIR)
+ panic("checkpath: .. not a directory\n");
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+ return (error);
+}
+
+#if __FreeBSD_version < 1300124
+static void
+cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
+ struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
+{
+
+ cache_purge(fvp);
+ if (tvp != NULL)
+ cache_purge(tvp);
+ cache_purge_negative(tdvp);
+}
+#endif
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdvp - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdvp - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * sdvp,tdvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rename_(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
+ vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
+ cred_t *cr, int log)
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ zilog_t *zilog = NULL;
+ dmu_tx_t *tx;
+ const char *snm = scnp->cn_nameptr;
+ const char *tnm = tcnp->cn_nameptr;
+ int error = 0;
+ bool want_seqc_end __maybe_unused = false;
+
+ /* Reject renames across filesystems. */
+ if ((*svpp)->v_mount != tdvp->v_mount ||
+ ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ if (zfsctl_is_node(tdvp)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ /*
+ * Lock all four vnodes to ensure safety and semantics of renaming.
+ */
+ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
+ if (error != 0) {
+ /* no vnodes are locked in the case of error here */
+ return (error);
+ }
+
+ tdzp = VTOZ(tdvp);
+ sdzp = VTOZ(sdvp);
+ zfsvfs = tdzp->z_zfsvfs;
+ zilog = zfsvfs->z_log;
+
+ /*
+	 * After we call ZFS_ENTER() again we will have to revalidate all
+ * znodes involved.
+ */
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsvfs->z_utf8 && u8_validate(tnm,
+ strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ error = SET_ERROR(EILSEQ);
+ goto unlockout;
+ }
+
+ /* If source and target are the same file, there is nothing to do. */
+ if ((*svpp) == (*tvpp)) {
+ error = 0;
+ goto unlockout;
+ }
+
+ if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
+ ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
+ (*tvpp)->v_mountedhere != NULL)) {
+ error = SET_ERROR(EXDEV);
+ goto unlockout;
+ }
+
+ /*
+	 * We cannot use ZFS_VERIFY_ZP() here because it could return directly,
+ * bypassing the cleanup code in the case of an error.
+ */
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
+ }
+
+ szp = VTOZ(*svpp);
+ tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
+ if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
+ }
+
+ /*
+ * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/out of an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+ error = SET_ERROR(EINVAL);
+ goto unlockout;
+ }
+
+ /*
+	 * If project inheritance is in use, i.e. the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories inherit
+	 * not only the project ID but also the ZFS_PROJINHERIT flag.
+	 * In that case we only allow renames into our tree when the
+	 * project IDs are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ error = SET_ERROR(EXDEV);
+ goto unlockout;
+ }
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+ goto unlockout;
+
+ if ((*svpp)->v_type == VDIR) {
+ /*
+ * Avoid ".", "..", and aliases of "." for obvious reasons.
+ */
+ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
+ sdzp == szp ||
+ (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
+ error = EINVAL;
+ goto unlockout;
+ }
+
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ */
+ if ((error = zfs_rename_check(szp, sdzp, tdzp)))
+ goto unlockout;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if ((*svpp)->v_type == VDIR) {
+ if ((*tvpp)->v_type != VDIR) {
+ error = SET_ERROR(ENOTDIR);
+ goto unlockout;
+ } else {
+ cache_purge(tdvp);
+ if (sdvp != tdvp)
+ cache_purge(sdvp);
+ }
+ } else {
+ if ((*tvpp)->v_type == VDIR) {
+ error = SET_ERROR(EISDIR);
+ goto unlockout;
+ }
+ }
+ }
+
+ vn_seqc_write_begin(*svpp);
+ vn_seqc_write_begin(sdvp);
+ if (*tvpp != NULL)
+ vn_seqc_write_begin(*tvpp);
+ if (tdvp != *tvpp)
+ vn_seqc_write_begin(tdvp);
+#if __FreeBSD_version >= 1300102
+ want_seqc_end = true;
+#endif
+ vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
+ if (tzp)
+ vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
+
+ /*
+	 * Notify the target directory if it is not the same
+	 * as the source directory.
+ */
+ if (tdvp != sdvp) {
+ vnevent_rename_dest_dir(tdvp, ct);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp) {
+ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ }
+ if (tzp) {
+ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tzp);
+ }
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto unlockout;
+ }
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
+ if (error == 0) {
+ szp->z_pflags |= ZFS_AV_MODIFIED;
+
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ ASSERT0(error);
+
+ error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
+ NULL);
+ if (error == 0) {
+ zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+ snm, tdzp, tnm, szp);
+
+ /*
+ * Update path information for the target vnode
+ */
+ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
+ } else {
+ /*
+ * At this point, we have successfully created
+ * the target name, but have failed to remove
+ * the source name. Since the create was done
+ * with the ZRENAMING flag, there are
+ * complications; for one, the link count is
+ * wrong. The easiest way to deal with this
+ * is to remove the newly created target, and
+ * return the original error. This must
+ * succeed; fortunately, it is very unlikely to
+ * fail, since we just created it.
+ */
+ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
+ ZRENAMING, NULL), ==, 0);
+ }
+ }
+ if (error == 0) {
+ cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp);
+ }
+ }
+
+ dmu_tx_commit(tx);
+
+unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
+ ZFS_EXIT(zfsvfs);
+ if (want_seqc_end) {
+ vn_seqc_write_end(*svpp);
+ vn_seqc_write_end(sdvp);
+ if (*tvpp != NULL)
+ vn_seqc_write_end(*tvpp);
+ if (tdvp != *tvpp)
+ vn_seqc_write_end(tdvp);
+ want_seqc_end = false;
+ }
+ VOP_UNLOCK1(*svpp);
+ VOP_UNLOCK1(sdvp);
+
+out: /* original two vnodes are locked */
+ MPASS(!want_seqc_end);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ if (*tvpp != NULL)
+ VOP_UNLOCK1(*tvpp);
+ if (tdvp != *tvpp)
+ VOP_UNLOCK1(tdvp);
+ return (error);
+}
+
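+/*
+ * Name-based wrapper around zfs_rename_() used by callers that operate on
+ * znodes and names (such as ZIL replay): resolve the source and target
+ * entries, arrange the vnode references and locks that zfs_rename_()
+ * expects, and drop the references when done.
+ */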
+int
+zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
+ cred_t *cr, int flags)
+{
+ struct componentname scn, tcn;
+ vnode_t *sdvp, *tdvp;
+ vnode_t *svp, *tvp;
+ int error;
+ svp = tvp = NULL;
+
+ sdvp = ZTOV(sdzp);
+ tdvp = ZTOV(tdzp);
+ error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
+ if (sdzp->z_zfsvfs->z_replay == B_FALSE)
+ VOP_UNLOCK1(sdvp);
+ if (error != 0)
+ goto fail;
+ VOP_UNLOCK1(svp);
+
+ vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
+ error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME);
+ if (error == EJUSTRETURN)
+ tvp = NULL;
+ else if (error != 0) {
+ VOP_UNLOCK1(tdvp);
+ goto fail;
+ }
+
+ error = zfs_rename_(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr, 0);
+fail:
+ if (svp != NULL)
+ vrele(svp);
+ if (tvp != NULL)
+ vrele(tvp);
+
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN:	dzp	- Directory to contain new symbolic link.
+ *	name	- Name of directory entry in dzp.
+ *	vap	- Attributes of new entry.
+ *	link	- Target path of the symlink.
+ *	cr	- credentials of caller.
+ *	flags	- case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
+ const char *link, znode_t **zpp, cred_t *cr, int flags)
+{
+ znode_t *zp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t len = strlen(link);
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
+
+ ASSERT(vap->va_type == VLNK);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0,
+ vap, cr, NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
+ 0 /* projid */)) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ getnewvnode_reserve_();
+ tx = dmu_tx_create(zfsvfs->z_os);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create a new object for the symlink.
+	 * For version 4 ZPL datasets the symlink will be an SA attribute.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ __DECONST(void *, link), len, tx);
+ else
+ zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
+
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
+ /*
+ * Insert the new object into the directory.
+ */
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+ *zpp = zp;
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * IN: vp - vnode of symbolic link.
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: uio - structure containing the link path.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (zp->z_is_sa)
+ error = sa_lookup_uio(zp->z_sa_hdl,
+ SA_ZPL_SYMLINK(zfsvfs), uio);
+ else
+ error = zfs_sa_readlink(zp, uio);
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ * IN: tdvp - Directory to contain new entry.
+ * svp - vnode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * tdvp - ctime|mtime updated
+ * svp - ctime updated
+ */
+/* ARGSUSED */
+int
+zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
+ int flags)
+{
+ znode_t *tzp;
+ zfsvfs_t *zfsvfs = tdzp->z_zfsvfs;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ int error;
+ uint64_t parent;
+ uid_t owner;
+
+ ASSERT(ZTOV(tdzp)->v_type == VDIR);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(tdzp);
+ zilog = zfsvfs->z_log;
+
+ /*
+ * POSIX dictates that we return EPERM here.
+ * Better choices include ENOTSUP or EISDIR.
+ */
+ if (ZTOV(szp)->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ ZFS_VERIFY_ZP(szp);
+
+ /*
+	 * If project inheritance is in use, i.e. the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories inherit
+	 * not only the project ID but also the ZFS_PROJINHERIT flag.
+	 * In that case we only allow hard link creation in our tree
+	 * when the project IDs are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ if (szp->z_pflags & (ZFS_APPENDONLY |
+ ZFS_IMMUTABLE | ZFS_READONLY)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /* Prevent links to .zfs/shares files */
+
+ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (parent == zfsvfs->z_shares_dir) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(name,
+ strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ /*
+ * We do not support links between attributes and non-attributes
+ * because of the potential security risk of creating links
+ * into "normal" file space in order to circumvent restrictions
+ * imposed in attribute space.
+ */
+ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+ if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
+ zfs_sa_upgrade_txholds(tx, szp);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_create(tdzp, name, szp, tx, 0);
+
+ if (error == 0) {
+ uint64_t txtype = TX_LINK;
+ zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
+ }
+
+ dmu_tx_commit(tx);
+
+ if (error == 0) {
+ vnevent_link(ZTOV(szp), ct);
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Free or allocate space in a file. Currently, this function only
+ * supports the `F_FREESP' command. However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ * IN:	zp	- znode of file to free data in.
+ * cmd - action to take (only F_FREESP supported).
+ * bfp - section of file to free/alloc.
+ * flag - current file open mode flags.
+ * offset - current file offset.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	zp - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
+ offset_t offset, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t off, len;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (cmd != F_FREESP) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(zfsvfs)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ if (bfp->l_len < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Permissions aren't checked on Solaris because on this OS
+ * zfs_space() can only be called with an opened file handle.
+ * On Linux we can get here through truncate_range() which
+ * operates directly on inodes, so we need to check access rights.
+ */
+ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ off = bfp->l_start;
+ len = bfp->l_len; /* 0 means from off to end of file */
+
+ error = zfs_freesp(zp, off, len, flag, TRUE);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
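+/*
+ * Called when a vnode's use count drops to zero: push out a dirty atime if
+ * needed and recycle the vnode if the file was unlinked or the filesystem
+ * has been torn down.
+ */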
+/*ARGSUSED*/
+static void
+zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
+ if (zp->z_sa_hdl == NULL) {
+ /*
+ * The fs has been unmounted, or we did a
+ * suspend/resume and this file no longer exists.
+ */
+ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+ vrecycle(vp);
+ return;
+ }
+
+ if (zp->z_unlinked) {
+ /*
+ * Fast path to recycle a vnode of a removed file.
+ */
+ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+ vrecycle(vp);
+ return;
+ }
+
+ if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+ (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
+ zp->z_atime_dirty = 0;
+ dmu_tx_commit(tx);
+ }
+ }
+ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+}
+
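+/* Both ZFS file handle layouts must fit within the generic struct fid. */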
+CTASSERT(sizeof (struct zfid_short) <= sizeof (struct fid));
+CTASSERT(sizeof (struct zfid_long) <= sizeof (struct fid));
+
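+/*
+ * Construct an NFS-style file handle for the vnode: encode the object number
+ * and generation number, and for filesystems that are not their own parent
+ * (e.g. snapshots under .zfs) use the long form that also carries the
+ * objset id.
+ */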
+/*ARGSUSED*/
+static int
+zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint32_t gen;
+ uint64_t gen64;
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int size, i, error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+ &gen64, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ gen = (uint32_t)gen64;
+
+ size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+ fidp->fid_len = size;
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = size;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* Must have a non-zero generation number to distinguish from .zfs */
+ if (gen == 0)
+ gen = 1;
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ if (size == LONG_FID_LEN) {
+ uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+ zfid_long_t *zlfid;
+
+ zlfid = (zfid_long_t *)fidp;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+ /* XXX - this should be the generation number for the objset */
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ zlfid->zf_setgen[i] = 0;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
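+/*
+ * Report filesystem limits and ACL capabilities for pathconf(2)/fpathconf(2).
+ */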
+static int
+zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs;
+
+ switch (cmd) {
+ case _PC_LINK_MAX:
+ *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
+ return (0);
+
+ case _PC_FILESIZEBITS:
+ *valp = 64;
+ return (0);
+ case _PC_MIN_HOLE_SIZE:
+ *valp = (int)SPA_MINBLOCKSIZE;
+ return (0);
+ case _PC_ACL_EXTENDED:
+#if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+		*valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_POSIX ? 1 : 0;
+ ZFS_EXIT(zfsvfs);
+#else
+ *valp = 0;
+#endif
+ return (0);
+
+ case _PC_ACL_NFS4:
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 1 : 0;
+ ZFS_EXIT(zfsvfs);
+ return (0);
+
+ case _PC_ACL_PATH_MAX:
+ *valp = ACL_MAX_ENTRIES;
+ return (0);
+
+ default:
+ return (EOPNOTSUPP);
+ }
+}
+
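+/*
+ * VOP_GETPAGES worker: read the requested pages, plus any read-behind and
+ * read-ahead pages that fit, from the DMU while holding a range lock that
+ * covers the affected blocks.
+ */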
+static int
+zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
+ int *rahead)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zp->z_zfsvfs->z_os;
+ zfs_locked_range_t *lr;
+ vm_object_t object;
+ off_t start, end, obj_size;
+ uint_t blksz;
+ int pgsin_b, pgsin_a;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ start = IDX_TO_OFF(ma[0]->pindex);
+ end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
+
+ /*
+ * Lock a range covering all required and optional pages.
+ * Note that we need to handle the case of the block size growing.
+ */
+ for (;;) {
+ blksz = zp->z_blksz;
+ lr = zfs_rangelock_tryenter(&zp->z_rangelock,
+ rounddown(start, blksz),
+ roundup(end, blksz) - rounddown(start, blksz), RL_READER);
+ if (lr == NULL) {
+ if (rahead != NULL) {
+ *rahead = 0;
+ rahead = NULL;
+ }
+ if (rbehind != NULL) {
+ *rbehind = 0;
+ rbehind = NULL;
+ }
+ break;
+ }
+ if (blksz == zp->z_blksz)
+ break;
+ zfs_rangelock_exit(lr);
+ }
+
+ object = ma[0]->object;
+ zfs_vmobject_wlock(object);
+ obj_size = object->un_pager.vnp.vnp_size;
+ zfs_vmobject_wunlock(object);
+ if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
+ if (lr != NULL)
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (zfs_vm_pagerret_bad);
+ }
+
+ pgsin_b = 0;
+ if (rbehind != NULL) {
+ pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
+ pgsin_b = MIN(*rbehind, pgsin_b);
+ }
+
+ pgsin_a = 0;
+ if (rahead != NULL) {
+ pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
+ if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
+ pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
+ pgsin_a = MIN(*rahead, pgsin_a);
+ }
+
+ /*
+ * NB: we need to pass the exact byte size of the data that we expect
+ * to read after accounting for the file size. This is required because
+ * ZFS will panic if we request DMU to read beyond the end of the last
+ * allocated block.
+ */
+ error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
+ MIN(end, obj_size) - (end - PAGE_SIZE));
+
+ if (lr != NULL)
+ zfs_rangelock_exit(lr);
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+
+ if (error != 0)
+ return (zfs_vm_pagerret_error);
+
+ VM_CNT_INC(v_vnodein);
+ VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
+ if (rbehind != NULL)
+ *rbehind = pgsin_b;
+ if (rahead != NULL)
+ *rahead = pgsin_a;
+ return (zfs_vm_pagerret_ok);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getpages_args {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int *a_rbehind;
+ int *a_rahead;
+};
+#endif
+
+static int
+zfs_freebsd_getpages(struct vop_getpages_args *ap)
+{
+
+ return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
+ ap->a_rahead));
+}
+
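+/*
+ * VOP_PUTPAGES worker: write the dirty pages back to the DMU under a range
+ * lock, update the timestamps, log the write, and mark the pages clean.
+ */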
+static int
+zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
+ int *rtvals)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zfs_locked_range_t *lr;
+ dmu_tx_t *tx;
+ struct sf_buf *sf;
+ vm_object_t object;
+ vm_page_t m;
+ caddr_t va;
+ size_t tocopy;
+ size_t lo_len;
+ vm_ooffset_t lo_off;
+ vm_ooffset_t off;
+ uint_t blksz;
+ int ncount;
+ int pcount;
+ int err;
+ int i;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ object = vp->v_object;
+ pcount = btoc(len);
+ ncount = pcount;
+
+ KASSERT(ma[0]->object == object, ("mismatching object"));
+ KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
+
+ for (i = 0; i < pcount; i++)
+ rtvals[i] = zfs_vm_pagerret_error;
+
+ off = IDX_TO_OFF(ma[0]->pindex);
+ blksz = zp->z_blksz;
+ lo_off = rounddown(off, blksz);
+ lo_len = roundup(len + (off - lo_off), blksz);
+ lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
+
+ zfs_vmobject_wlock(object);
+ if (len + off > object->un_pager.vnp.vnp_size) {
+ if (object->un_pager.vnp.vnp_size > off) {
+ int pgoff;
+
+ len = object->un_pager.vnp.vnp_size - off;
+ ncount = btoc(len);
+ if ((pgoff = (int)len & PAGE_MASK) != 0) {
+ /*
+ * If the object is locked and the following
+ * conditions hold, then the page's dirty
+ * field cannot be concurrently changed by a
+ * pmap operation.
+ */
+ m = ma[ncount - 1];
+ vm_page_assert_sbusied(m);
+ KASSERT(!pmap_page_is_write_mapped(m),
+ ("zfs_putpages: page %p is not read-only",
+ m));
+ vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
+ pgoff);
+ }
+ } else {
+ len = 0;
+ ncount = 0;
+ }
+ if (ncount < pcount) {
+ for (i = ncount; i < pcount; i++) {
+ rtvals[i] = zfs_vm_pagerret_bad;
+ }
+ }
+ }
+ zfs_vmobject_wunlock(object);
+
+ if (ncount == 0)
+ goto out;
+
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) ||
+ (zp->z_projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ zp->z_projid))) {
+ goto out;
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, off, len);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ if (zp->z_blksz < PAGE_SIZE) {
+ for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
+ tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+ va = zfs_map_page(ma[i], &sf);
+ dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+ zfs_unmap_page(sf);
+ }
+ } else {
+ err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
+ }
+
+ if (err == 0) {
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(err);
+ /*
+ * XXX we should be passing a callback to undirty
+ * but that would make the locking messier
+ */
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
+ len, 0, NULL, NULL);
+
+ zfs_vmobject_wlock(object);
+ for (i = 0; i < ncount; i++) {
+ rtvals[i] = zfs_vm_pagerret_ok;
+ vm_page_undirty(ma[i]);
+ }
+ zfs_vmobject_wunlock(object);
+ VM_CNT_INC(v_vnodeout);
+ VM_CNT_ADD(v_vnodepgsout, ncount);
+ }
+ dmu_tx_commit(tx);
+
+out:
+ zfs_rangelock_exit(lr);
+ if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ return (rtvals[0]);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_putpages_args {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_sync;
+ int *a_rtvals;
+};
+#endif
+
+static int
+zfs_freebsd_putpages(struct vop_putpages_args *ap)
+{
+
+ return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
+ ap->a_rtvals));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_bmap_args {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct bufobj **a_bop;
+ daddr_t *a_bnp;
+ int *a_runp;
+ int *a_runb;
+};
+#endif
+
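+/*
+ * ZFS caches file data in the ARC rather than the buffer cache, so VOP_BMAP
+ * simply reports an identity block mapping with no read-ahead/read-behind
+ * runs.
+ */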
+static int
+zfs_freebsd_bmap(struct vop_bmap_args *ap)
+{
+
+ if (ap->a_bop != NULL)
+ *ap->a_bop = &ap->a_vp->v_bufobj;
+ if (ap->a_bnp != NULL)
+ *ap->a_bnp = ap->a_bn;
+ if (ap->a_runp != NULL)
+ *ap->a_runp = 0;
+ if (ap->a_runb != NULL)
+ *ap->a_runb = 0;
+
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_open_args {
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_open(struct vop_open_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ int error;
+
+ error = zfs_open(&vp, ap->a_mode, ap->a_cred);
+ if (error == 0)
+ vnode_create_vobject(vp, zp->z_size, ap->a_td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_close_args {
+ struct vnode *a_vp;
+ int a_fflag;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_close(struct vop_close_args *ap)
+{
+
+ return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_ioctl_args {
+ struct vnode *a_vp;
+ ulong_t a_command;
+ caddr_t a_data;
+ int a_fflag;
+ struct ucred *cred;
+ struct thread *td;
+};
+#endif
+
+static int
+zfs_freebsd_ioctl(struct vop_ioctl_args *ap)
+{
+
+ return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
+ ap->a_fflag, ap->a_cred, NULL));
+}
+
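+/*
+ * Translate FreeBSD ioflag bits (IO_APPEND, IO_NDELAY, IO_SYNC) into the
+ * FAPPEND/FNONBLOCK/FSYNC-style flags used by the common ZFS read/write
+ * code.
+ */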
+static int
+ioflags(int ioflags)
+{
+ int flags = 0;
+
+ if (ioflags & IO_APPEND)
+ flags |= FAPPEND;
+ if (ioflags & IO_NDELAY)
+ flags |= FNONBLOCK;
+ if (ioflags & IO_SYNC)
+ flags |= (FSYNC | FDSYNC | FRSYNC);
+
+ return (flags);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_read_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_read(struct vop_read_args *ap)
+{
+ zfs_uio_t uio;
+ zfs_uio_init(&uio, ap->a_uio);
+ return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
+ ap->a_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_write_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_write(struct vop_write_args *ap)
+{
+ zfs_uio_t uio;
+ zfs_uio_init(&uio, ap->a_uio);
+ return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
+ ap->a_cred));
+}
+
+#if __FreeBSD_version >= 1300102
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ */
+static int
+zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
+{
+ vnode_t *vp;
+ znode_t *zp;
+ uint64_t pflags;
+
+ vp = v->a_vp;
+ zp = VTOZ_SMR(vp);
+ if (__predict_false(zp == NULL))
+ return (EAGAIN);
+ pflags = atomic_load_64(&zp->z_pflags);
+ if (pflags & ZFS_AV_QUARANTINED)
+ return (EAGAIN);
+ if (pflags & ZFS_XATTR)
+ return (EAGAIN);
+ if ((pflags & ZFS_NO_EXECS_DENIED) == 0)
+ return (EAGAIN);
+ return (0);
+}
+#endif
+
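+/*
+ * Lockless (SMR) symlink resolution for the fast-path lookup: only succeed
+ * when a cached copy of the link target is available; otherwise return
+ * EAGAIN to fall back to the slow path.
+ */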
+static int
+zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v)
+{
+ vnode_t *vp;
+ znode_t *zp;
+ char *target;
+
+ vp = v->a_vp;
+ zp = VTOZ_SMR(vp);
+ if (__predict_false(zp == NULL)) {
+ return (EAGAIN);
+ }
+
+ target = atomic_load_consume_ptr(&zp->z_cached_symlink);
+ if (target == NULL) {
+ return (EAGAIN);
+ }
+ return (cache_symlink_resolve(v->a_fpl, target, strlen(target)));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_access_args {
+ struct vnode *a_vp;
+ accmode_t a_accmode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_access(struct vop_access_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ accmode_t accmode;
+ int error = 0;
+
+ if (ap->a_accmode == VEXEC) {
+ if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0)
+ return (0);
+ }
+
+ /*
+	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND.
+ */
+ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
+ if (accmode != 0)
+ error = zfs_access(zp, accmode, 0, ap->a_cred);
+
+ /*
+ * VADMIN has to be handled by vaccess().
+ */
+ if (error == 0) {
+ accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
+ if (accmode != 0) {
+#if __FreeBSD_version >= 1300105
+ error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+ zp->z_gid, accmode, ap->a_cred);
+#else
+ error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+ zp->z_gid, accmode, ap->a_cred, NULL);
+#endif
+ }
+ }
+
+ /*
+ * For VEXEC, ensure that at least one execute bit is set for
+ * non-directories.
+ */
+ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
+ (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
+ error = EACCES;
+ }
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lookup_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+ struct componentname *cnp = ap->a_cnp;
+ char nm[NAME_MAX + 1];
+
+ ASSERT(cnp->cn_namelen < sizeof (nm));
+ strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm)));
+
+ return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
+ cnp->cn_cred, cnp->cn_thread, 0, cached));
+}
+
+static int
+zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
+{
+
+ return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lookup_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+};
+#endif
+
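+/*
+ * Dispatch lookups through the name cache when it is enabled for this
+ * dataset (z_use_namecache); otherwise call zfs_lookup() directly.
+ */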
+static int
+zfs_cache_lookup(struct vop_lookup_args *ap)
+{
+ zfsvfs_t *zfsvfs;
+
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+ if (zfsvfs->z_use_namecache)
+ return (vfs_cache_lookup(ap));
+ else
+ return (zfs_freebsd_lookup(ap, B_FALSE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_create_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+static int
+zfs_freebsd_create(struct vop_create_args *ap)
+{
+ zfsvfs_t *zfsvfs;
+ struct componentname *cnp = ap->a_cnp;
+ vattr_t *vap = ap->a_vap;
+ znode_t *zp = NULL;
+ int rc, mode;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+ mode = vap->va_mode & ALLPERMS;
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+ *ap->a_vpp = NULL;
+
+ rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, !EXCL, mode,
+ &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */);
+ if (rc == 0)
+ *ap->a_vpp = ZTOV(zp);
+ if (zfsvfs->z_use_namecache &&
+ rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
+
+ return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_remove_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_remove(struct vop_remove_args *ap)
+{
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+ ap->a_cnp->cn_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_mkdir_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+static int
+zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
+{
+ vattr_t *vap = ap->a_vap;
+ znode_t *zp = NULL;
+ int rc;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+ *ap->a_vpp = NULL;
+
+ rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp,
+ ap->a_cnp->cn_cred, 0, NULL);
+
+ if (rc == 0)
+ *ap->a_vpp = ZTOV(zp);
+ return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_rmdir_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_readdir_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *a_ncookies;
+ ulong_t **a_cookies;
+};
+#endif
+
+static int
+zfs_freebsd_readdir(struct vop_readdir_args *ap)
+{
+ zfs_uio_t uio;
+ zfs_uio_init(&uio, ap->a_uio);
+ return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag,
+ ap->a_ncookies, ap->a_cookies));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_fsync_args {
+ struct vnode *a_vp;
+ int a_waitfor;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_fsync(struct vop_fsync_args *ap)
+{
+
+ vop_stdfsync(ap);
+ return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getattr_args {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_getattr(struct vop_getattr_args *ap)
+{
+ vattr_t *vap = ap->a_vap;
+ xvattr_t xvap;
+ ulong_t fflags = 0;
+ int error;
+
+ xva_init(&xvap);
+ xvap.xva_vattr = *vap;
+ xvap.xva_vattr.va_mask |= AT_XVATTR;
+
+ /* Convert chflags into ZFS-type flags. */
+	/* XXX: what about SF_SETTABLE? */
+ XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
+ XVA_SET_REQ(&xvap, XAT_APPENDONLY);
+ XVA_SET_REQ(&xvap, XAT_NOUNLINK);
+ XVA_SET_REQ(&xvap, XAT_NODUMP);
+ XVA_SET_REQ(&xvap, XAT_READONLY);
+ XVA_SET_REQ(&xvap, XAT_ARCHIVE);
+ XVA_SET_REQ(&xvap, XAT_SYSTEM);
+ XVA_SET_REQ(&xvap, XAT_HIDDEN);
+ XVA_SET_REQ(&xvap, XAT_REPARSE);
+ XVA_SET_REQ(&xvap, XAT_OFFLINE);
+ XVA_SET_REQ(&xvap, XAT_SPARSE);
+
+ error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred);
+ if (error != 0)
+ return (error);
+
+ /* Convert ZFS xattr into chflags. */
+#define FLAG_CHECK(fflag, xflag, xfield) do { \
+ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
+ fflags |= (fflag); \
+} while (0)
+ FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
+ xvap.xva_xoptattrs.xoa_immutable);
+ FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
+ xvap.xva_xoptattrs.xoa_appendonly);
+ FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
+ xvap.xva_xoptattrs.xoa_nounlink);
+ FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
+ xvap.xva_xoptattrs.xoa_archive);
+ FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
+ xvap.xva_xoptattrs.xoa_nodump);
+ FLAG_CHECK(UF_READONLY, XAT_READONLY,
+ xvap.xva_xoptattrs.xoa_readonly);
+ FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
+ xvap.xva_xoptattrs.xoa_system);
+ FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
+ xvap.xva_xoptattrs.xoa_hidden);
+ FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
+ xvap.xva_xoptattrs.xoa_reparse);
+ FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
+ xvap.xva_xoptattrs.xoa_offline);
+ FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
+ xvap.xva_xoptattrs.xoa_sparse);
+
+#undef FLAG_CHECK
+ *vap = xvap.xva_vattr;
+ vap->va_flags = fflags;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setattr_args {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_setattr(struct vop_setattr_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cred = ap->a_cred;
+ xvattr_t xvap;
+ ulong_t fflags;
+ uint64_t zflags;
+
+ vattr_init_mask(vap);
+ vap->va_mask &= ~AT_NOSET;
+
+ xva_init(&xvap);
+ xvap.xva_vattr = *vap;
+
+ zflags = VTOZ(vp)->z_pflags;
+
+ if (vap->va_flags != VNOVAL) {
+ zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
+ int error;
+
+ if (zfsvfs->z_use_fuids == B_FALSE)
+ return (EOPNOTSUPP);
+
+ fflags = vap->va_flags;
+ /*
+ * XXX KDM
+ * We need to figure out whether it makes sense to allow
+ * UF_REPARSE through, since we don't really have other
+ * facilities to handle reparse points and zfs_setattr()
+ * doesn't currently allow setting that attribute anyway.
+ */
+ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
+ UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
+ UF_OFFLINE|UF_SPARSE)) != 0)
+ return (EOPNOTSUPP);
+ /*
+ * Unprivileged processes are not permitted to unset system
+ * flags, or modify flags if any system flags are set.
+ * Privileged non-jail processes may not modify system flags
+ * if securelevel > 0 and any existing system flags are set.
+ * Privileged jail processes behave like privileged non-jail
+ * processes if the PR_ALLOW_CHFLAGS permission bit is set;
+ * otherwise, they behave like unprivileged processes.
+ */
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
+ spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
+ if (zflags &
+ (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
+ error = securelevel_gt(cred, 0);
+ if (error != 0)
+ return (error);
+ }
+ } else {
+ /*
+ * Callers may only modify the file flags on
+ * objects they have VADMIN rights for.
+ */
+ if ((error = VOP_ACCESS(vp, VADMIN, cred,
+ curthread)) != 0)
+ return (error);
+ if (zflags &
+ (ZFS_IMMUTABLE | ZFS_APPENDONLY |
+ ZFS_NOUNLINK)) {
+ return (EPERM);
+ }
+ if (fflags &
+ (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
+ return (EPERM);
+ }
+ }
+
+#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
+ if (((fflags & (fflag)) && !(zflags & (zflag))) || \
+ ((zflags & (zflag)) && !(fflags & (fflag)))) { \
+ XVA_SET_REQ(&xvap, (xflag)); \
+ (xfield) = ((fflags & (fflag)) != 0); \
+ } \
+} while (0)
+ /* Convert chflags into ZFS-type flags. */
+ /* XXX: what about SF_SETTABLE? */
+ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
+ xvap.xva_xoptattrs.xoa_immutable);
+ FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
+ xvap.xva_xoptattrs.xoa_appendonly);
+ FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
+ xvap.xva_xoptattrs.xoa_nounlink);
+ FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
+ xvap.xva_xoptattrs.xoa_archive);
+ FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
+ xvap.xva_xoptattrs.xoa_nodump);
+ FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
+ xvap.xva_xoptattrs.xoa_readonly);
+ FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
+ xvap.xva_xoptattrs.xoa_system);
+ FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
+ xvap.xva_xoptattrs.xoa_hidden);
+ FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
+ xvap.xva_xoptattrs.xoa_reparse);
+ FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
+ xvap.xva_xoptattrs.xoa_offline);
+ FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
+ xvap.xva_xoptattrs.xoa_sparse);
+#undef FLAG_CHANGE
+ }
+ if (vap->va_birthtime.tv_sec != VNOVAL) {
+ xvap.xva_vattr.va_mask |= AT_XVATTR;
+ XVA_SET_REQ(&xvap, XAT_CREATETIME);
+ }
+ return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_rename_args {
+ struct vnode *a_fdvp;
+ struct vnode *a_fvp;
+ struct componentname *a_fcnp;
+ struct vnode *a_tdvp;
+ struct vnode *a_tvp;
+ struct componentname *a_tcnp;
+};
+#endif
+
+static int
+zfs_freebsd_rename(struct vop_rename_args *ap)
+{
+ vnode_t *fdvp = ap->a_fdvp;
+ vnode_t *fvp = ap->a_fvp;
+ vnode_t *tdvp = ap->a_tdvp;
+ vnode_t *tvp = ap->a_tvp;
+ int error;
+
+ ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
+ ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
+
+ error = zfs_rename_(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+ ap->a_tcnp, ap->a_fcnp->cn_cred, 1);
+
+ vrele(fdvp);
+ vrele(fvp);
+ vrele(tdvp);
+ if (tvp != NULL)
+ vrele(tvp);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_symlink_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ char *a_target;
+};
+#endif
+
+static int
+zfs_freebsd_symlink(struct vop_symlink_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+ vattr_t *vap = ap->a_vap;
+ znode_t *zp = NULL;
+ char *symlink;
+ size_t symlink_len;
+ int rc;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
+ vattr_init_mask(vap);
+ *ap->a_vpp = NULL;
+
+ rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap,
+ ap->a_target, &zp, cnp->cn_cred, 0 /* flags */);
+ if (rc == 0) {
+ *ap->a_vpp = ZTOV(zp);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ MPASS(zp->z_cached_symlink == NULL);
+ symlink_len = strlen(ap->a_target);
+ symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
+ if (symlink != NULL) {
+ memcpy(symlink, ap->a_target, symlink_len);
+ symlink[symlink_len] = '\0';
+ atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
+ (uintptr_t)symlink);
+ }
+ }
+ return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_readlink_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_readlink(struct vop_readlink_args *ap)
+{
+ zfs_uio_t uio;
+ znode_t *zp = VTOZ(ap->a_vp);
+ char *symlink, *base;
+ size_t symlink_len;
+ int error;
+ bool trycache;
+
+ zfs_uio_init(&uio, ap->a_uio);
+ trycache = false;
+ if (zfs_uio_segflg(&uio) == UIO_SYSSPACE &&
+ zfs_uio_iovcnt(&uio) == 1) {
+ base = zfs_uio_iovbase(&uio, 0);
+ symlink_len = zfs_uio_iovlen(&uio, 0);
+ trycache = true;
+ }
+ error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL);
+ if (atomic_load_ptr(&zp->z_cached_symlink) != NULL ||
+ error != 0 || !trycache) {
+ return (error);
+ }
+ symlink_len -= zfs_uio_resid(&uio);
+ symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
+ if (symlink != NULL) {
+ memcpy(symlink, base, symlink_len);
+ symlink[symlink_len] = '\0';
+ if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
+ (uintptr_t)NULL, (uintptr_t)symlink)) {
+ cache_symlink_free(symlink, symlink_len + 1);
+ }
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_link_args {
+ struct vnode *a_tdvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_link(struct vop_link_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+ vnode_t *vp = ap->a_vp;
+ vnode_t *tdvp = ap->a_tdvp;
+
+ if (tdvp->v_mount != vp->v_mount)
+ return (EXDEV);
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ return (zfs_link(VTOZ(tdvp), VTOZ(vp),
+ cnp->cn_nameptr, cnp->cn_cred, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_inactive_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_inactive(struct vop_inactive_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+
+#if __FreeBSD_version >= 1300123
+ zfs_inactive(vp, curthread->td_ucred, NULL);
+#else
+ zfs_inactive(vp, ap->a_td->td_ucred, NULL);
+#endif
+ return (0);
+}
+
+#if __FreeBSD_version >= 1300042
+#ifndef _SYS_SYSPROTO_H_
+struct vop_need_inactive_args {
+ struct vnode *a_vp;
+};
+#endif
+
+static int
+zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int need;
+
+ if (vn_need_pageq_flush(vp))
+ return (1);
+
+ if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs))
+ return (1);
+ need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
+ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+
+ return (need);
+}
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_reclaim_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp != NULL);
+
+#if __FreeBSD_version < 1300042
+ /* Destroy the vm object and flush associated pages. */
+ vnode_destroy_vobject(vp);
+#endif
+ /*
+ * z_teardown_inactive_lock protects from a race with
+ * zfs_znode_dmu_fini in zfsvfs_teardown during
+ * force unmount.
+ */
+ ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
+ if (zp->z_sa_hdl == NULL)
+ zfs_znode_free(zp);
+ else
+ zfs_zinactive(zp);
+ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+
+ vp->v_data = NULL;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_fid_args {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+};
+#endif
+
+static int
+zfs_freebsd_fid(struct vop_fid_args *ap)
+{
+
+ return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_pathconf_args {
+ struct vnode *a_vp;
+ int a_name;
+ register_t *a_retval;
+};
+#endif
+
+static int
+zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
+{
+ ulong_t val;
+ int error;
+
+ error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
+ curthread->td_ucred, NULL);
+ if (error == 0) {
+ *ap->a_retval = val;
+ return (error);
+ }
+ if (error != EOPNOTSUPP)
+ return (error);
+
+ switch (ap->a_name) {
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
+ case _PC_PIPE_BUF:
+ if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
+ *ap->a_retval = PIPE_BUF;
+ return (0);
+ }
+ return (EINVAL);
+ default:
+ return (vop_stdpathconf(ap));
+ }
+}
+
+/*
+ * FreeBSD's extended attribute namespaces are mapped to file name prefixes
+ * for ZFS extended attribute names:
+ *
+ * NAMESPACE PREFIX
+ * system freebsd:system:
+ * user (none, can be used to access ZFS fsattr(5) attributes
+ * created on Solaris)
+ */
+static int
+zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
+ size_t size)
+{
+ const char *namespace, *prefix, *suffix;
+
+ /* We don't allow the '/' character in attribute names. */
+ if (strchr(name, '/') != NULL)
+ return (EINVAL);
+ /* We don't allow attribute names that start with the "freebsd:" prefix. */
+ if (strncmp(name, "freebsd:", 8) == 0)
+ return (EINVAL);
+
+ bzero(attrname, size);
+
+ switch (attrnamespace) {
+ case EXTATTR_NAMESPACE_USER:
+#if 0
+ prefix = "freebsd:";
+ namespace = EXTATTR_NAMESPACE_USER_STRING;
+ suffix = ":";
+#else
+ /*
+ * This is the default namespace by which we can access all
+ * attributes created on Solaris.
+ */
+ prefix = namespace = suffix = "";
+#endif
+ break;
+ case EXTATTR_NAMESPACE_SYSTEM:
+ prefix = "freebsd:";
+ namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
+ suffix = ":";
+ break;
+ case EXTATTR_NAMESPACE_EMPTY:
+ default:
+ return (EINVAL);
+ }
+ if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
+ name) >= size) {
+ return (ENAMETOOLONG);
+ }
+ return (0);
+}
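
An illustrative userland sketch of the namespace-to-prefix mapping implemented above (not part of the patch); attrname_demo() and its sample inputs are hypothetical, and only the prefix scheme and the '/' / "freebsd:" rejections mirror zfs_create_attrname().

        #include <stdio.h>
        #include <string.h>

        /* Hypothetical userland restatement of the prefix rules used above. */
        static int
        attrname_demo(int is_system, const char *name, char *out, size_t size)
        {
                /* "system" namespace gets "freebsd:system:", "user" gets none. */
                const char *prefix = is_system ? "freebsd:system:" : "";

                if (strchr(name, '/') != NULL ||
                    strncmp(name, "freebsd:", 8) == 0)
                        return (-1);
                if ((size_t)snprintf(out, size, "%s%s", prefix, name) >= size)
                        return (-1);
                return (0);
        }

        int
        main(void)
        {
                char buf[64];

                (void) attrname_demo(1, "md5", buf, sizeof (buf));
                printf("%s\n", buf);    /* freebsd:system:md5 */
                (void) attrname_demo(0, "md5", buf, sizeof (buf));
                printf("%s\n", buf);    /* md5 (Solaris-compatible, no prefix) */
                return (0);
        }
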
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ INOUT struct uio *a_uio;
+ OUT size_t *a_size;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to retrieve a named extended attribute.
+ */
+static int
+zfs_getextattr(struct vop_getextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ struct vattr va;
+ vnode_t *xvp = NULL, *vp;
+ int error, flags;
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VREAD);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof (attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ flags = FREAD;
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
+ xvp, td);
+ error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ if (error == ENOENT)
+ error = ENOATTR;
+ return (error);
+ }
+
+ if (ap->a_size != NULL) {
+ error = VOP_GETATTR(vp, &va, ap->a_cred);
+ if (error == 0)
+ *ap->a_size = (size_t)va.va_size;
+ } else if (ap->a_uio != NULL)
+ error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+
+ VOP_UNLOCK1(vp);
+ vn_close(vp, flags, ap->a_cred, td);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_deleteextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to remove a named attribute.
+ */
+static int
+zfs_deleteextattr(struct vop_deleteextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ vnode_t *xvp = NULL, *vp;
+ int error;
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VWRITE);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof (attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
+ UIO_SYSSPACE, attrname, xvp, td);
+ error = namei(&nd);
+ vp = nd.ni_vp;
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error == ENOENT)
+ error = ENOATTR;
+ return (error);
+ }
+
+ error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vput(nd.ni_dvp);
+ if (vp == nd.ni_dvp)
+ vrele(vp);
+ else
+ vput(vp);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ INOUT struct uio *a_uio;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to set a named attribute.
+ */
+static int
+zfs_setextattr(struct vop_setextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ struct vattr va;
+ vnode_t *xvp = NULL, *vp;
+ int error, flags;
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VWRITE);
+ if (error != 0)
+ return (error);
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof (attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ flags = FFLAGS(O_WRONLY | O_CREAT);
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
+ xvp, td);
+ error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
+ NULL);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ VATTR_NULL(&va);
+ va.va_size = 0;
+ error = VOP_SETATTR(vp, &va, ap->a_cred);
+ if (error == 0)
+ error = VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+
+ VOP_UNLOCK1(vp);
+ vn_close(vp, flags, ap->a_cred, td);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_listextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ INOUT struct uio *a_uio;
+ OUT size_t *a_size;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to retrieve extended attributes on a vnode.
+ */
+static int
+zfs_listextattr(struct vop_listextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrprefix[16];
+ uint8_t dirbuf[sizeof (struct dirent)];
+ struct dirent *dp;
+ struct iovec aiov;
+ struct uio auio;
+ size_t *sizep = ap->a_size;
+ size_t plen;
+ vnode_t *xvp = NULL, *vp;
+ int done, error, eof, pos;
+ zfs_uio_t uio;
+
+ zfs_uio_init(&uio, ap->a_uio);
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VREAD);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
+ sizeof (attrprefix));
+ if (error != 0)
+ return (error);
+ plen = strlen(attrprefix);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (sizep != NULL)
+ *sizep = 0;
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ /*
+ * ENOATTR means that the EA directory does not yet exist,
+ * i.e. there are no extended attributes there.
+ */
+ if (error == ENOATTR)
+ error = 0;
+ return (error);
+ }
+
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
+ UIO_SYSSPACE, ".", xvp, td);
+ error = namei(&nd);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_rw = UIO_READ;
+ auio.uio_offset = 0;
+
+ do {
+ uint8_t nlen;
+
+ aiov.iov_base = (void *)dirbuf;
+ aiov.iov_len = sizeof (dirbuf);
+ auio.uio_resid = sizeof (dirbuf);
+ error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
+ done = sizeof (dirbuf) - auio.uio_resid;
+ if (error != 0)
+ break;
+ for (pos = 0; pos < done; ) {
+ dp = (struct dirent *)(dirbuf + pos);
+ pos += dp->d_reclen;
+ /*
+ * XXX: Temporarily we also accept DT_UNKNOWN, as this
+ * is what we get when an attribute was created on Solaris.
+ */
+ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
+ continue;
+ if (plen == 0 &&
+ strncmp(dp->d_name, "freebsd:", 8) == 0)
+ continue;
+ else if (strncmp(dp->d_name, attrprefix, plen) != 0)
+ continue;
+ nlen = dp->d_namlen - plen;
+ if (sizep != NULL)
+ *sizep += 1 + nlen;
+ else if (GET_UIO_STRUCT(&uio) != NULL) {
+ /*
+ * Format of extattr name entry is one byte for
+ * length and the rest for name.
+ */
+ error = zfs_uiomove(&nlen, 1, zfs_uio_rw(&uio),
+ &uio);
+ if (error == 0) {
+ error = zfs_uiomove(dp->d_name + plen,
+ nlen, zfs_uio_rw(&uio), &uio);
+ }
+ if (error != 0)
+ break;
+ }
+ }
+ } while (!eof && error == 0);
+
+ vput(vp);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
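
The one-byte-length-plus-name record format produced by the loop above can be decoded with this standalone sketch; demo_walk_list() and the sample buffer are invented for illustration only.

        #include <stdint.h>
        #include <stdio.h>

        /* Walk a length-prefixed extattr list: each entry is one length byte
         * followed by that many name bytes, with no NUL terminator. */
        static void
        demo_walk_list(const uint8_t *buf, size_t len)
        {
                size_t pos = 0;

                while (pos < len) {
                        uint8_t nlen = buf[pos++];

                        printf("%.*s\n", (int)nlen, (const char *)&buf[pos]);
                        pos += nlen;
                }
        }

        int
        main(void)
        {
                /* Two attributes: "md5" and "comment". */
                const uint8_t buf[] = {
                        3, 'm', 'd', '5',
                        7, 'c', 'o', 'm', 'm', 'e', 'n', 't'
                };

                demo_walk_list(buf, sizeof (buf));
                return (0);
        }
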
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getacl_args {
+ struct vnode *a_vp;
+ acl_type_t a_type;
+ struct acl *a_aclp;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_getacl(struct vop_getacl_args *ap)
+{
+ int error;
+ vsecattr_t vsecattr;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
+ if ((error = zfs_getsecattr(VTOZ(ap->a_vp),
+ &vsecattr, 0, ap->a_cred)))
+ return (error);
+
+ error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt);
+ if (vsecattr.vsa_aclentp != NULL)
+ kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setacl_args {
+ struct vnode *a_vp;
+ acl_type_t a_type;
+ struct acl *a_aclp;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_setacl(struct vop_setacl_args *ap)
+{
+ int error;
+ vsecattr_t vsecattr;
+ int aclbsize; /* size of acl list in bytes */
+ aclent_t *aaclp;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ if (ap->a_aclp == NULL)
+ return (EINVAL);
+
+ if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
+ return (EINVAL);
+
+ /*
+ * With NFSv4 ACLs, chmod(2) may need to add additional entries,
+ * splitting every entry into two and appending "canonical six"
+ * entries at the end. Don't allow for setting an ACL that would
+ * cause chmod(2) to run out of ACL entries.
+ */
+ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
+ return (ENOSPC);
+
+ error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
+ if (error != 0)
+ return (error);
+
+ vsecattr.vsa_mask = VSA_ACE;
+ aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t);
+ vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
+ aaclp = vsecattr.vsa_aclentp;
+ vsecattr.vsa_aclentsz = aclbsize;
+
+ aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
+ error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred);
+ kmem_free(aaclp, aclbsize);
+
+ return (error);
+}
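
A worked example of the entry-count guard in zfs_freebsd_setacl() above, assuming an ACL_MAX_ENTRIES of 254 purely for illustration; demo_fits_after_chmod() is hypothetical and only mirrors the acl_cnt * 2 + 6 bound.

        #include <stdio.h>

        /* Worst case after chmod(2): every entry may split in two and the
         * "canonical six" entries are appended, i.e. acl_cnt * 2 + 6 total. */
        static int
        demo_fits_after_chmod(int acl_cnt, int acl_max_entries)
        {
                return (acl_cnt * 2 + 6 <= acl_max_entries);
        }

        int
        main(void)
        {
                printf("%d\n", demo_fits_after_chmod(124, 254)); /* 1: fits */
                printf("%d\n", demo_fits_after_chmod(125, 254)); /* 0: ENOSPC */
                return (0);
        }
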
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_aclcheck_args {
+ struct vnode *a_vp;
+ acl_type_t a_type;
+ struct acl *a_aclp;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+zfs_vptocnp(struct vop_vptocnp_args *ap)
+{
+ vnode_t *covered_vp;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ znode_t *zp = VTOZ(vp);
+ int ltype;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * If we are a snapshot mounted under .zfs, run the operation
+ * on the covered vnode.
+ */
+ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
+ char name[MAXNAMLEN + 1];
+ znode_t *dzp;
+ size_t len;
+
+ error = zfs_znode_parent_and_name(zp, &dzp, name);
+ if (error == 0) {
+ len = strlen(name);
+ if (*ap->a_buflen < len)
+ error = SET_ERROR(ENOMEM);
+ }
+ if (error == 0) {
+ *ap->a_buflen -= len;
+ bcopy(name, ap->a_buf + *ap->a_buflen, len);
+ *ap->a_vpp = ZTOV(dzp);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ZFS_EXIT(zfsvfs);
+
+ covered_vp = vp->v_mount->mnt_vnodecovered;
+#if __FreeBSD_version >= 1300045
+ enum vgetstate vs = vget_prep(covered_vp);
+#else
+ vhold(covered_vp);
+#endif
+ ltype = VOP_ISLOCKED(vp);
+ VOP_UNLOCK1(vp);
+#if __FreeBSD_version >= 1300045
+ error = vget_finish(covered_vp, LK_SHARED, vs);
+#else
+ error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
+#endif
+ if (error == 0) {
+#if __FreeBSD_version >= 1300123
+ error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf,
+ ap->a_buflen);
+#else
+ error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
+ ap->a_buf, ap->a_buflen);
+#endif
+ vput(covered_vp);
+ }
+ vn_lock(vp, ltype | LK_RETRY);
+ if (VN_IS_DOOMED(vp))
+ error = SET_ERROR(ENOENT);
+ return (error);
+}
+
+struct vop_vector zfs_vnodeops;
+struct vop_vector zfs_fifoops;
+struct vop_vector zfs_shareops;
+
+struct vop_vector zfs_vnodeops = {
+ .vop_default = &default_vnodeops,
+ .vop_inactive = zfs_freebsd_inactive,
+#if __FreeBSD_version >= 1300042
+ .vop_need_inactive = zfs_freebsd_need_inactive,
+#endif
+ .vop_reclaim = zfs_freebsd_reclaim,
+#if __FreeBSD_version >= 1300102
+ .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
+#endif
+ .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
+ .vop_access = zfs_freebsd_access,
+ .vop_allocate = VOP_EINVAL,
+ .vop_lookup = zfs_cache_lookup,
+ .vop_cachedlookup = zfs_freebsd_cachedlookup,
+ .vop_getattr = zfs_freebsd_getattr,
+ .vop_setattr = zfs_freebsd_setattr,
+ .vop_create = zfs_freebsd_create,
+ .vop_mknod = (vop_mknod_t *)zfs_freebsd_create,
+ .vop_mkdir = zfs_freebsd_mkdir,
+ .vop_readdir = zfs_freebsd_readdir,
+ .vop_fsync = zfs_freebsd_fsync,
+ .vop_open = zfs_freebsd_open,
+ .vop_close = zfs_freebsd_close,
+ .vop_rmdir = zfs_freebsd_rmdir,
+ .vop_ioctl = zfs_freebsd_ioctl,
+ .vop_link = zfs_freebsd_link,
+ .vop_symlink = zfs_freebsd_symlink,
+ .vop_readlink = zfs_freebsd_readlink,
+ .vop_read = zfs_freebsd_read,
+ .vop_write = zfs_freebsd_write,
+ .vop_remove = zfs_freebsd_remove,
+ .vop_rename = zfs_freebsd_rename,
+ .vop_pathconf = zfs_freebsd_pathconf,
+ .vop_bmap = zfs_freebsd_bmap,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_getextattr = zfs_getextattr,
+ .vop_deleteextattr = zfs_deleteextattr,
+ .vop_setextattr = zfs_setextattr,
+ .vop_listextattr = zfs_listextattr,
+ .vop_getacl = zfs_freebsd_getacl,
+ .vop_setacl = zfs_freebsd_setacl,
+ .vop_aclcheck = zfs_freebsd_aclcheck,
+ .vop_getpages = zfs_freebsd_getpages,
+ .vop_putpages = zfs_freebsd_putpages,
+ .vop_vptocnp = zfs_vptocnp,
+#if __FreeBSD_version >= 1300064
+ .vop_lock1 = vop_lock,
+ .vop_unlock = vop_unlock,
+ .vop_islocked = vop_islocked,
+#endif
+};
+VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
+
+struct vop_vector zfs_fifoops = {
+ .vop_default = &fifo_specops,
+ .vop_fsync = zfs_freebsd_fsync,
+#if __FreeBSD_version >= 1300102
+ .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
+#endif
+ .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
+ .vop_access = zfs_freebsd_access,
+ .vop_getattr = zfs_freebsd_getattr,
+ .vop_inactive = zfs_freebsd_inactive,
+ .vop_read = VOP_PANIC,
+ .vop_reclaim = zfs_freebsd_reclaim,
+ .vop_setattr = zfs_freebsd_setattr,
+ .vop_write = VOP_PANIC,
+ .vop_pathconf = zfs_freebsd_pathconf,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_getacl = zfs_freebsd_getacl,
+ .vop_setacl = zfs_freebsd_setacl,
+ .vop_aclcheck = zfs_freebsd_aclcheck,
+};
+VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
+
+/*
+ * Special share hidden files vnode operations template.
+ */
+struct vop_vector zfs_shareops = {
+ .vop_default = &default_vnodeops,
+#if __FreeBSD_version >= 1300121
+ .vop_fplookup_vexec = VOP_EAGAIN,
+#endif
+ .vop_fplookup_symlink = VOP_EAGAIN,
+ .vop_access = zfs_freebsd_access,
+ .vop_inactive = zfs_freebsd_inactive,
+ .vop_reclaim = zfs_freebsd_reclaim,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_pathconf = zfs_freebsd_pathconf,
+};
+VFS_VOP_VECTOR_REGISTER(zfs_shareops);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
new file mode 100644
index 000000000000..0491b2ff3e28
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
@@ -0,0 +1,2067 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/mntent.h>
+#include <sys/u8_textprep.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/atomic.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/zfs_fuid.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/* Used by fstat(1). */
+SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
+ SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)");
+
+/*
+ * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when ZFS_DEBUG is also defined.
+ */
+#ifdef ZFS_DEBUG
+#define ZNODE_STATS
+#endif /* ZFS_DEBUG */
+
+#ifdef ZNODE_STATS
+#define ZNODE_STAT_ADD(stat) ((stat)++)
+#else
+#define ZNODE_STAT_ADD(stat) /* nothing */
+#endif /* ZNODE_STATS */
+
+/*
+ * Functions needed for userland (i.e. libzpool) are not put under
+ * #ifdef _KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+#if !defined(KMEM_DEBUG) && __FreeBSD_version >= 1300102
+#define _ZFS_USE_SMR
+static uma_zone_t znode_uma_zone;
+#else
+static kmem_cache_t *znode_cache = NULL;
+#endif
+
+extern struct vop_vector zfs_vnodeops;
+extern struct vop_vector zfs_fifoops;
+extern struct vop_vector zfs_shareops;
+
+
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
+{
+ znode_t *zp = arg;
+
+ /*
+ * If in append mode, convert to writer and lock starting at the
+ * current end of file.
+ */
+ if (new->lr_type == RL_APPEND) {
+ new->lr_offset = zp->z_size;
+ new->lr_type = RL_WRITER;
+ }
+
+ /*
+ * If we need to grow the block size then lock the whole file range.
+ */
+ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+ new->lr_offset = 0;
+ new->lr_length = UINT64_MAX;
+ }
+}
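
A minimal sketch of the RL_APPEND-to-RL_WRITER conversion performed by zfs_rangelock_cb() above, using hypothetical stand-in types (demo_lr, demo_rl_type); the block-size growth case handled at the end of the callback is intentionally omitted.

        #include <stdint.h>
        #include <stdio.h>

        enum demo_rl_type { DEMO_RL_READER, DEMO_RL_WRITER, DEMO_RL_APPEND };

        struct demo_lr {
                uint64_t lr_offset;
                uint64_t lr_length;
                enum demo_rl_type lr_type;
        };

        /* Append locks become writer locks starting at the current EOF. */
        static void
        demo_rangelock_cb(struct demo_lr *lr, uint64_t file_size)
        {
                if (lr->lr_type == DEMO_RL_APPEND) {
                        lr->lr_offset = file_size;
                        lr->lr_type = DEMO_RL_WRITER;
                }
        }

        int
        main(void)
        {
                struct demo_lr lr = { 0, 4096, DEMO_RL_APPEND };

                demo_rangelock_cb(&lr, 1048576);        /* file is 1 MiB long */
                printf("offset=%llu writer=%d\n",
                    (unsigned long long)lr.lr_offset,
                    lr.lr_type == DEMO_RL_WRITER);
                return (0);
        }
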
+
+static int
+zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_t *zp = buf;
+
+ POINTER_INVALIDATE(&zp->z_zfsvfs);
+
+ list_link_init(&zp->z_link_node);
+
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+
+ zp->z_acl_cached = NULL;
+ zp->z_vnode = NULL;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *arg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+ ASSERT3P(zp->z_vnode, ==, NULL);
+ ASSERT(!list_link_active(&zp->z_link_node));
+ mutex_destroy(&zp->z_acl_lock);
+ zfs_rangelock_fini(&zp->z_rangelock);
+
+ ASSERT(zp->z_acl_cached == NULL);
+}
+
+
+#ifdef _ZFS_USE_SMR
+VFS_SMR_DECLARE;
+
+static int
+zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private,
+ int flags)
+{
+
+ return (zfs_znode_cache_constructor(mem, private, flags));
+}
+
+static void
+zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private)
+{
+
+ zfs_znode_cache_destructor(mem, private);
+}
+
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ ASSERT(znode_uma_zone == NULL);
+ znode_uma_zone = uma_zcreate("zfs_znode_cache",
+ sizeof (znode_t), zfs_znode_cache_constructor_smr,
+ zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0);
+ VFS_SMR_ZONE_SET(znode_uma_zone);
+}
+
+static znode_t *
+zfs_znode_alloc_kmem(int flags)
+{
+
+ return (uma_zalloc_smr(znode_uma_zone, flags));
+}
+
+static void
+zfs_znode_free_kmem(znode_t *zp)
+{
+
+ uma_zfree_smr(znode_uma_zone, zp);
+}
+#else
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+static znode_t *
+zfs_znode_alloc_kmem(int flags)
+{
+
+ return (kmem_cache_alloc(znode_cache, flags));
+}
+
+static void
+zfs_znode_free_kmem(znode_t *zp)
+{
+
+ kmem_cache_free(znode_cache, zp);
+}
+#endif
+
+void
+zfs_znode_fini(void)
+{
+ /*
+ * Cleanup zcache
+ */
+#ifdef _ZFS_USE_SMR
+ if (znode_uma_zone) {
+ uma_zdestroy(znode_uma_zone);
+ znode_uma_zone = NULL;
+ }
+#else
+ if (znode_cache) {
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+ }
+#endif
+}
+
+
+static int
+zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ zfs_acl_ids_t acl_ids;
+ vattr_t vattr;
+ znode_t *sharezp;
+ znode_t *zp;
+ int error;
+
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0555;
+ vattr.va_uid = crgetuid(kcred);
+ vattr.va_gid = crgetgid(kcred);
+
+ sharezp = zfs_znode_alloc_kmem(KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+ sharezp->z_unlinked = 0;
+ sharezp->z_atime_dirty = 0;
+ sharezp->z_zfsvfs = zfsvfs;
+ sharezp->z_is_sa = zfsvfs->z_use_sa;
+
+ VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
+ kcred, NULL, &acl_ids));
+ zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
+ ASSERT3P(zp, ==, sharezp);
+ POINTER_INVALIDATE(&sharezp->z_zfsvfs);
+ error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
+ zfsvfs->z_shares_dir = sharezp->z_id;
+
+ zfs_acl_ids_free(&acl_ids);
+ sa_handle_destroy(sharezp->z_sa_hdl);
+ zfs_znode_free_kmem(sharezp);
+
+ return (error);
+}
+
+/*
+ * Define a couple of values we need available
+ * for both 64-bit and 32-bit environments.
+ */
+#ifndef NBITSMINOR64
+#define NBITSMINOR64 32
+#endif
+#ifndef MAXMAJ64
+#define MAXMAJ64 0xffffffffUL
+#endif
+#ifndef MAXMIN64
+#define MAXMIN64 0xffffffffUL
+#endif
+
+/*
+ * Create special expldev for ZFS private use.
+ * Can't use standard expldev since it doesn't do
+ * what we want. The standard expldev() takes a
+ * dev32_t in LP64 and expands it to a long dev_t.
+ * We need an interface that takes a dev32_t in ILP32
+ * and expands it to a long dev_t.
+ */
+static uint64_t
+zfs_expldev(dev_t dev)
+{
+ return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
+}
+/*
+ * Special cmpldev for ZFS private use.
+ * Can't use standard cmpldev since it takes
+ * a long dev_t and compresses it to dev32_t in
+ * LP64. We need to do a compaction of a long dev_t
+ * to a dev32_t in ILP32.
+ */
+dev_t
+zfs_cmpldev(uint64_t dev)
+{
+ return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
+}
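
The 32/32-bit split used by zfs_expldev() and zfs_cmpldev() above can be checked with this standalone sketch; demo_expldev() is a hypothetical restatement, not the kernel routine.

        #include <stdint.h>
        #include <stdio.h>

        #define DEMO_NBITSMINOR64       32      /* same split as NBITSMINOR64 */

        /* Pack a 32-bit major/minor pair into the 64-bit on-disk rdev layout. */
        static uint64_t
        demo_expldev(uint32_t maj, uint32_t min)
        {
                return (((uint64_t)maj << DEMO_NBITSMINOR64) | min);
        }

        int
        main(void)
        {
                uint64_t rdev = demo_expldev(13, 5);

                /* Major lives in the high 32 bits, minor in the low 32 bits. */
                printf("rdev=0x%016llx major=%llu minor=%llu\n",
                    (unsigned long long)rdev,
                    (unsigned long long)(rdev >> DEMO_NBITSMINOR64),
                    (unsigned long long)(rdev & 0xffffffffULL));
                return (0);
        }
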
+
+static void
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+ dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
+{
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
+ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
+ if (sa_hdl == NULL) {
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ SA_HDL_SHARED, &zp->z_sa_hdl));
+ } else {
+ zp->z_sa_hdl = sa_hdl;
+ sa_set_userp(sa_hdl, zp);
+ }
+
+ zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
+
+ /*
+ * Slap on VROOT if we are the root znode unless we are the root
+ * node of a snapshot mounted under .zfs.
+ */
+ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
+ ZTOV(zp)->v_flag |= VROOT;
+
+ vn_exists(ZTOV(zp));
+}
+
+void
+zfs_znode_dmu_fini(znode_t *zp)
+{
+ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
+ zp->z_unlinked ||
+ ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zp->z_zfsvfs));
+
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
+}
+
+static void
+zfs_vnode_forget(vnode_t *vp)
+{
+
+ /* copied from insmntque_stddtr */
+ vp->v_data = NULL;
+ vp->v_op = &dead_vnodeops;
+ vgone(vp);
+ vput(vp);
+}
+
+/*
+ * Construct a new znode/vnode and initialize.
+ *
+ * This does not do a call to dmu_set_user() that is
+ * up to the caller to do, in case you don't want to
+ * return the znode
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+ dmu_object_type_t obj_type, sa_handle_t *hdl)
+{
+ znode_t *zp;
+ vnode_t *vp;
+ uint64_t mode;
+ uint64_t parent;
+#ifdef notyet
+ uint64_t mtime[2], ctime[2];
+#endif
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ sa_bulk_attr_t bulk[9];
+ int count = 0;
+ int error;
+
+ zp = zfs_znode_alloc_kmem(KM_SLEEP);
+
+#ifndef _ZFS_USE_SMR
+ KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0,
+ ("%s: fast path lookup enabled without smr", __func__));
+#endif
+
+#if __FreeBSD_version >= 1300076
+ KASSERT(curthread->td_vp_reserved != NULL,
+ ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
+#else
+ KASSERT(curthread->td_vp_reserv > 0,
+ ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
+#endif
+ error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
+ if (error != 0) {
+ zfs_znode_free_kmem(zp);
+ return (NULL);
+ }
+ zp->z_vnode = vp;
+ vp->v_data = zp;
+
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+
+ zp->z_sa_hdl = NULL;
+ zp->z_unlinked = 0;
+ zp->z_atime_dirty = 0;
+ zp->z_mapcnt = 0;
+ zp->z_id = db->db_object;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+ zp->z_sync_cnt = 0;
+ atomic_store_ptr(&zp->z_cached_symlink, NULL);
+
+ vp = ZTOV(zp);
+
+ zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, 16);
+#ifdef notyet
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+#endif
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, 8);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 ||
+ (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ (zp->z_pflags & ZFS_PROJID) &&
+ sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
+ if (hdl == NULL)
+ sa_handle_destroy(zp->z_sa_hdl);
+ zfs_vnode_forget(vp);
+ zp->z_vnode = NULL;
+ zfs_znode_free_kmem(zp);
+ return (NULL);
+ }
+
+ zp->z_projid = projid;
+ zp->z_mode = mode;
+
+ /* Cache the xattr parent id */
+ if (zp->z_pflags & ZFS_XATTR)
+ zp->z_xattr_parent = parent;
+
+ vp->v_type = IFTOVT((mode_t)mode);
+
+ switch (vp->v_type) {
+ case VDIR:
+ zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
+ break;
+ case VFIFO:
+ vp->v_op = &zfs_fifoops;
+ break;
+ case VREG:
+ if (parent == zfsvfs->z_shares_dir) {
+ ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
+ vp->v_op = &zfs_shareops;
+ }
+ break;
+ default:
+ break;
+ }
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes++;
+ zp->z_zfsvfs = zfsvfs;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * Acquire vnode lock before making it available to the world.
+ */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VN_LOCK_AREC(vp);
+ if (vp->v_type != VFIFO)
+ VN_LOCK_ASHARE(vp);
+
+ return (zp);
+}
+
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ * IN: dzp - parent directory for new znode
+ * vap - file attributes for new znode
+ * tx - dmu transaction id for zap operations
+ * cr - credentials of caller
+ * flag - flags:
+ * IS_ROOT_NODE - new object will be root
+ * IS_XATTR - new object is an attribute
+ * acl_ids - ACL ids / fuid information for the new znode
+ *
+ * OUT: zpp - allocated znode
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+ uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
+{
+ uint64_t crtime[2], atime[2], mtime[2], ctime[2];
+ uint64_t mode, size, links, parent, pflags;
+ uint64_t dzp_pflags = 0;
+ uint64_t rdev = 0;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ dmu_buf_t *db;
+ timestruc_t now;
+ uint64_t gen, obj;
+ int err;
+ int bonuslen;
+ int dnodesize;
+ sa_handle_t *sa_hdl;
+ dmu_object_type_t obj_type;
+ sa_bulk_attr_t *sa_attrs;
+ int cnt = 0;
+ zfs_acl_locator_cb_t locate = { 0 };
+
+ ASSERT(vap && ((vap->va_mask & AT_MODE) == AT_MODE));
+
+ if (zfsvfs->z_replay) {
+ obj = vap->va_nodeid;
+ now = vap->va_ctime; /* see zfs_replay_create() */
+ gen = vap->va_nblocks; /* ditto */
+ dnodesize = vap->va_fsid; /* ditto */
+ } else {
+ obj = 0;
+ vfs_timestamp(&now);
+ gen = dmu_tx_get_txg(tx);
+ dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
+ }
+
+ if (dnodesize == 0)
+ dnodesize = DNODE_MIN_SIZE;
+
+ obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+ bonuslen = (obj_type == DMU_OT_SA) ?
+ DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ /*
+ * Create a new DMU object.
+ */
+ /*
+ * There's currently no mechanism for pre-reading the blocks that will
+ * be needed to allocate a new object, so we accept the small chance
+ * that there will be an i/o error and we will fail one of the
+ * assertions below.
+ */
+ if (vap->va_type == VDIR) {
+ if (zfsvfs->z_replay) {
+ VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = zap_create_norm_dnsize(zfsvfs->z_os,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ } else {
+ if (zfsvfs->z_replay) {
+ VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ }
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+ VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+
+ /*
+ * If this is the root, fix up the half-initialized parent pointer
+ * to reference the just-allocated physical data area.
+ */
+ if (flag & IS_ROOT_NODE) {
+ dzp->z_id = obj;
+ } else {
+ dzp_pflags = dzp->z_pflags;
+ }
+
+ /*
+ * If parent is an xattr, so am I.
+ */
+ if (dzp_pflags & ZFS_XATTR) {
+ flag |= IS_XATTR;
+ }
+
+ if (zfsvfs->z_use_fuids)
+ pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ else
+ pflags = 0;
+
+ if (vap->va_type == VDIR) {
+ size = 2; /* contents ("." and "..") */
+ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ } else {
+ size = links = 0;
+ }
+
+ if (vap->va_type == VBLK || vap->va_type == VCHR) {
+ rdev = zfs_expldev(vap->va_rdev);
+ }
+
+ parent = dzp->z_id;
+ mode = acl_ids->z_mode;
+ if (flag & IS_XATTR)
+ pflags |= ZFS_XATTR;
+
+ /*
+ * No execs denied will be determined when zfs_mode_compute() is called.
+ */
+ pflags |= acl_ids->z_aclp->z_hints &
+ (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+ ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
+
+ ZFS_TIME_ENCODE(&now, crtime);
+ ZFS_TIME_ENCODE(&now, ctime);
+
+ if (vap->va_mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, atime);
+ } else {
+ ZFS_TIME_ENCODE(&now, atime);
+ }
+
+ if (vap->va_mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ } else {
+ ZFS_TIME_ENCODE(&now, mtime);
+ }
+
+ /* Now add in all of the "SA" attributes */
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ &sa_hdl));
+
+ /*
+ * Set up the array of attributes to be replaced/set on the new file.
+ *
+ * The order for DMU_OT_ZNODE is critical since it needs to be constructed
+ * in the old znode_phys_t format. Don't change this ordering.
+ */
+ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ } else {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
+ NULL, &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
+ NULL, &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ }
+
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+ &empty_xattr, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE ||
+ (vap->va_type == VBLK || vap->va_type == VCHR)) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+ NULL, &rdev, 8);
+
+ }
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+ sizeof (uint64_t) * 4);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (zfs_acl_phys_t));
+ } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &acl_ids->z_aclp->z_acl_count, 8);
+ locate.cb_aclp = acl_ids->z_aclp;
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ acl_ids->z_aclp->z_acl_bytes);
+ mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ }
+
+ VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
+ if (!(flag & IS_ROOT_NODE)) {
+ *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+ ASSERT(*zpp != NULL);
+ } else {
+ /*
+ * If we are creating the root node, the "parent" we
+ * passed in is the znode for the root.
+ */
+ *zpp = dzp;
+
+ (*zpp)->z_sa_hdl = sa_hdl;
+ }
+
+ (*zpp)->z_pflags = pflags;
+ (*zpp)->z_mode = mode;
+ (*zpp)->z_dnodesize = dnodesize;
+
+ if (vap->va_mask & AT_XVATTR)
+ zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
+
+ if (obj_type == DMU_OT_ZNODE ||
+ acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+ VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+ }
+ if (!(flag & IS_ROOT_NODE)) {
+ vnode_t *vp;
+
+ vp = ZTOV(*zpp);
+ vp->v_vflag |= VV_FORCEINSMQ;
+ err = insmntque(vp, zfsvfs->z_vfs);
+ vp->v_vflag &= ~VV_FORCEINSMQ;
+ KASSERT(err == 0, ("insmntque() failed: error %d", err));
+ }
+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+}
+
+/*
+ * Update in-core attributes. It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ xoptattr_t *xoap;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+ uint64_t times[2];
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ &times, sizeof (times), tx);
+ XVA_SET_RTN(xvap, XAT_CREATETIME);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+ xoap->xoa_av_quarantined, zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ zfs_sa_set_scanstamp(zp, xvap, tx);
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ znode_t *zp;
+ vnode_t *vp;
+ sa_handle_t *hdl;
+ struct thread *td;
+ int locked;
+ int err;
+
+ td = curthread;
+ getnewvnode_reserve_();
+again:
+ *zpp = NULL;
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ getnewvnode_drop_reserve();
+ return (SET_ERROR(EINVAL));
+ }
+
+ hdl = dmu_buf_get_user(db);
+ if (hdl != NULL) {
+ zp = sa_get_userdata(hdl);
+
+ /*
+ * Since "SA" does immediate eviction we
+ * should never find a sa handle that doesn't
+ * know about the znode.
+ */
+ ASSERT3P(zp, !=, NULL);
+ ASSERT3U(zp->z_id, ==, obj_num);
+ if (zp->z_unlinked) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ vp = ZTOV(zp);
+ /*
+ * Don't let the vnode disappear after
+ * ZFS_OBJ_HOLD_EXIT.
+ */
+ VN_HOLD(vp);
+ *zpp = zp;
+ err = 0;
+ }
+
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+ if (err) {
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ locked = VOP_ISLOCKED(vp);
+ VI_LOCK(vp);
+ if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) {
+ /*
+ * The vnode is doomed and this thread doesn't
+ * hold the exclusive lock on it, so the vnode
+ * must be being reclaimed by another thread.
+ * Otherwise the doomed vnode is being reclaimed
+ * by this thread and zfs_zget is called from
+ * ZIL internals.
+ */
+ VI_UNLOCK(vp);
+
+ /*
+ * XXX vrele() locks the vnode when the last reference
+ * is dropped. Although in this case the vnode is
+ * doomed / dead and so no inactivation is required,
+ * the vnode lock is still acquired. That could result
+ * in a LOR with z_teardown_lock if another thread holds
+ * the vnode's lock and tries to take z_teardown_lock.
+ * But that is only possible if the other thread performs
+ * a ZFS vnode operation on the vnode. That either
+ * should not happen if the vnode is dead or the thread
+ * should also have a reference to the vnode and thus
+ * our reference is not last.
+ */
+ VN_RELE(vp);
+ goto again;
+ }
+ VI_UNLOCK(vp);
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ /*
+ * Not found; create a new znode/vnode,
+ * but only if the file exists.
+ *
+ * There is a small window where zfs_vget() could
+ * find this object while a file create is still in
+ * progress. This is checked for in zfs_znode_alloc().
+ *
+ * If zfs_znode_alloc() fails it will drop the hold on the
+ * bonus buffer.
+ */
+ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+ doi.doi_bonus_type, NULL);
+ if (zp == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ *zpp = zp;
+ }
+ if (err == 0) {
+ vnode_t *vp = ZTOV(zp);
+
+ err = insmntque(vp, zfsvfs->z_vfs);
+ if (err == 0) {
+ vp->v_hash = obj_num;
+ VOP_UNLOCK1(vp);
+ } else {
+ zp->z_vnode = NULL;
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ *zpp = NULL;
+ }
+ }
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ getnewvnode_drop_reserve();
+ return (err);
+}
+
+int
+zfs_rezget(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ vnode_t *vp;
+ uint64_t obj_num = zp->z_id;
+ uint64_t mode, size;
+ sa_bulk_attr_t bulk[8];
+ int err;
+ int count = 0;
+ uint64_t gen;
+
+ /*
+ * Remove cached pages before reloading the znode, so that they are not
+ * lingering after we run into any error. Ideally, we should vgone()
+ * the vnode in case of error, but currently we cannot do that
+ * because of the LOR between the vnode lock and z_teardown_lock.
+ * So, instead, we have to "doom" the znode in the illumos style.
+ */
+ vp = ZTOV(zp);
+ vn_pages_remove(vp, 0, 0);
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT(zp->z_sa_hdl == NULL);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EINVAL));
+ }
+
+ zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+ size = zp->z_size;
+
+ /* reload cached values */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, sizeof (gen));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, sizeof (zp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, sizeof (zp->z_uid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, sizeof (zp->z_gid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ zp->z_mode = mode;
+
+ if (gen != zp->z_gen) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * It is highly improbable but still quite possible that two
+ * objects in different datasets are created with the same
+ * object numbers and in transaction groups with the same
+ * numbers. znodes corresponding to those objects would
+ * have the same z_id and z_gen, but their other attributes
+ * may be different.
+ * zfs recv -F may replace one of such objects with the other.
+ * As a result file properties recorded in the replaced
+ * object's vnode may no longer match the received object's
+ * properties. At present the only cached property is the
+ * file's type recorded in v_type.
+ * So, handle this case by leaving the old vnode and znode
+ * disassociated from the actual object. A new vnode and a
+ * znode will be created if the object is accessed
+ * (e.g. via a look-up). The old vnode and znode will be
+ * recycled when the last vnode reference is dropped.
+ */
+ if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * If the file has zero links, then it has been unlinked on the send
+ * side and it must be in the received unlinked set.
+ * We call zfs_znode_dmu_fini() now to prevent any accesses to the
+ * stale data and to prevent automatic removal of the file in
+ * zfs_zinactive(). The file will be removed either when it is removed
+ * on the send side and the next incremental stream is received or
+ * when the unlinked set gets processed.
+ */
+ zp->z_unlinked = (zp->z_links == 0);
+ if (zp->z_unlinked) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (0);
+ }
+
+ zp->z_blksz = doi.doi_data_block_size;
+ if (zp->z_size != size)
+ vnode_pager_setsize(vp, zp->z_size);
+
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+ return (0);
+}
+
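+/*
+ * Free the on-disk object backing this znode (and its external ACL object,
+ * if any) as part of the given transaction, then tear down the znode's DMU
+ * state and free the in-core znode.
+ */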
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ uint64_t obj = zp->z_id;
+ uint64_t acl_obj = zfs_external_acl(zp);
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+ if (acl_obj) {
+ VERIFY(!zp->z_is_sa);
+ VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ }
+ VERIFY(0 == dmu_object_free(os, obj, tx));
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+ zfs_znode_free(zp);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t z_id = zp->z_id;
+
+ ASSERT(zp->z_sa_hdl);
+
+ /*
+	 * Don't allow a zfs_zget() while we're trying to release this znode.
+ */
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+ /*
+ * If this was the last reference to a file with no links, remove
+ * the file from the file system unless the file system is mounted
+ * read-only. That can happen, for example, if the file system was
+ * originally read-write, the file was opened, then unlinked and
+ * the file system was made read-only before the file was finally
+ * closed. The file will remain in the unlinked set.
+ */
+ if (zp->z_unlinked) {
+ ASSERT(!zfsvfs->z_issnap);
+ if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ zfs_rmnode(zp);
+ return;
+ }
+ }
+
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ zfs_znode_free(zp);
+}
+
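+/*
+ * Final teardown of an in-core znode: detach it from the per-filesystem
+ * znode list, free any cached symlink target and cached ACL, and return
+ * the znode to the kmem cache.  The SA handle must already be gone
+ * (zp->z_sa_hdl == NULL).
+ */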
+void
+zfs_znode_free(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ char *symlink;
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ zp->z_vnode = NULL;
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ POINTER_INVALIDATE(&zp->z_zfsvfs);
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes--;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ symlink = atomic_load_ptr(&zp->z_cached_symlink);
+ if (symlink != NULL) {
+ atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)NULL);
+ cache_symlink_free(symlink, strlen(symlink) + 1);
+ }
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ zfs_znode_free_kmem(zp);
+}
+
+void
+zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2], boolean_t have_tx)
+{
+ timestruc_t now;
+
+ vfs_timestamp(&now);
+
+ if (have_tx) { /* will sa_bulk_update happen really soon? */
+ zp->z_atime_dirty = 0;
+ zp->z_seq++;
+ } else {
+ zp->z_atime_dirty = 1;
+ }
+
+ if (flag & AT_ATIME) {
+ ZFS_TIME_ENCODE(&now, zp->z_atime);
+ }
+
+ if (flag & AT_MTIME) {
+ ZFS_TIME_ENCODE(&now, mtime);
+ if (zp->z_zfsvfs->z_use_fuids) {
+ zp->z_pflags |= (ZFS_ARCHIVE |
+ ZFS_AV_MODIFIED);
+ }
+ }
+
+ if (flag & AT_CTIME) {
+ ZFS_TIME_ENCODE(&now, ctime);
+ if (zp->z_zfsvfs->z_use_fuids)
+ zp->z_pflags |= ZFS_ARCHIVE;
+ }
+}
+
+
+void
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2])
+{
+ zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE);
+}
+/*
+ * Grow the block size for a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * size - requested block size
+ * tx - open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+ int error;
+ u_longlong_t dummy;
+
+ if (size <= zp->z_blksz)
+ return;
+ /*
+ * If the file size is already greater than the current blocksize,
+ * we will not grow. If there is more than one block in a file,
+ * the blocksize cannot change.
+ */
+ if (zp->z_blksz && zp->z_size > zp->z_blksz)
+ return;
+
+ error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+ size, 0, tx);
+
+ if (error == ENOTSUP)
+ return;
+ ASSERT0(error);
+
+ /* What blocksize did we actually get? */
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
+}
+
+/*
+ * Increase the file length
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_extend(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_tx_t *tx;
+ zfs_locked_range_t *lr;
+ uint64_t newblksz;
+ int error;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end <= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ if (end > zp->z_blksz &&
+ (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
+ } else {
+ newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+ }
+ dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+ } else {
+ newblksz = 0;
+ }
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+
+ if (newblksz)
+ zfs_grow_blocksize(zp, newblksz, tx);
+
+ zp->z_size = end;
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx));
+
+ vnode_pager_setsize(ZTOV(zp), end);
+
+ zfs_rangelock_exit(lr);
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Free space in a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of section to free.
+ * len - length of section to free.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zfs_locked_range_t *lr;
+ int error;
+
+ /*
+ * Lock the range being freed.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (off >= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+
+ if (off + len > zp->z_size)
+ len = zp->z_size - off;
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
+
+ if (error == 0) {
+ /*
+		 * In FreeBSD we cannot free a block in the middle of a file,
+		 * only at the end of a file, so this code path should
+		 * never happen.
+ */
+ vnode_pager_setsize(ZTOV(zp), off);
+ }
+
+ zfs_rangelock_exit(lr);
+
+ return (error);
+}
+
+/*
+ * Truncate a file
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_trunc(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ dmu_tx_t *tx;
+ zfs_locked_range_t *lr;
+ int error;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end >= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+ DMU_OBJECT_END);
+ if (error) {
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+
+ zp->z_size = end;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &zp->z_size, sizeof (zp->z_size));
+
+ if (end == 0) {
+ zp->z_pflags &= ~ZFS_SPARSE;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ }
+ VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Clear any mapped pages in the truncated region. This has to
+ * happen outside of the transaction to avoid the possibility of
+ * a deadlock with someone trying to push a page that we are
+ * about to invalidate.
+ */
+ vnode_pager_setsize(vp, end);
+
+ zfs_rangelock_exit(lr);
+
+ return (0);
+}
+
+/*
+ * Free space in a file
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of range
+ * len - end of range (0 => EOF)
+ * flag - current file open mode flags.
+ * log - TRUE if this action should be logged
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t mode;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+ int error;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+ sizeof (mode))) != 0)
+ return (error);
+
+ if (off > zp->z_size) {
+ error = zfs_extend(zp, off+len);
+ if (error == 0 && log)
+ goto log;
+ else
+ return (error);
+ }
+
+ if (len == 0) {
+ error = zfs_trunc(zp, off);
+ } else {
+ if ((error = zfs_free_range(zp, off, len)) == 0 &&
+ off + len > zp->z_size)
+ error = zfs_extend(zp, off+len);
+ }
+ if (error || !log)
+ return (error);
+log:
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
+ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
+{
+ uint64_t moid, obj, sa_obj, version;
+ uint64_t sense = ZFS_CASE_SENSITIVE;
+ uint64_t norm = 0;
+ nvpair_t *elem;
+ int error;
+ int i;
+ znode_t *rootzp = NULL;
+ zfsvfs_t *zfsvfs;
+ vattr_t vattr;
+ znode_t *zp;
+ zfs_acl_ids_t acl_ids;
+
+ /*
+ * First attempt to create master node.
+ */
+ /*
+ * In an empty objset, there are no blocks to read and thus
+ * there can be no i/o errors (which we assert below).
+ */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
+ /* For the moment we expect all zpl props to be uint64_ts */
+ uint64_t val;
+ char *name;
+
+ ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
+ VERIFY(nvpair_value_uint64(elem, &val) == 0);
+ name = nvpair_name(elem);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
+ if (val < version)
+ version = val;
+ } else {
+ error = zap_update(os, moid, name, 8, 1, &val, tx);
+ }
+ ASSERT(error == 0);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
+ norm = val;
+ else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
+ sense = val;
+ }
+ ASSERT(version != 0);
+ error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
+
+ /*
+ * Create zap object used for SA attribute registration
+ */
+
+ if (version >= ZPL_VERSION_SA) {
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT(error == 0);
+ } else {
+ sa_obj = 0;
+ }
+ /*
+ * Create a delete queue.
+ */
+ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/vnode/zfsvfs
+ * to allow zfs_mknode to work.
+ */
+ VATTR_NULL(&vattr);
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = crgetuid(cr);
+ vattr.va_gid = crgetgid(cr);
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ rootzp = zfs_znode_alloc_kmem(KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+ rootzp->z_unlinked = 0;
+ rootzp->z_atime_dirty = 0;
+ rootzp->z_is_sa = USE_SA(version, os);
+
+ zfsvfs->z_os = os;
+ zfsvfs->z_parent = zfsvfs;
+ zfsvfs->z_version = version;
+ zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs->z_use_sa = USE_SA(version, os);
+ zfsvfs->z_norm = norm;
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+
+ ASSERT(error == 0);
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ rootzp->z_zfsvfs = zfsvfs;
+ VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+ cr, NULL, &acl_ids));
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
+ ASSERT3P(zp, ==, rootzp);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
+ ASSERT(error == 0);
+ zfs_acl_ids_free(&acl_ids);
+ POINTER_INVALIDATE(&rootzp->z_zfsvfs);
+
+ sa_handle_destroy(rootzp->z_sa_hdl);
+ zfs_znode_free_kmem(rootzp);
+
+ /*
+ * Create shares directory
+ */
+
+ error = zfs_create_share_dir(zfsvfs, tx);
+
+ ASSERT(error == 0);
+
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+#endif /* _KERNEL */
+
+static int
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+ uint64_t sa_obj = 0;
+ int error;
+
+ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+ if (error != 0 && error != ENOENT)
+ return (error);
+
+ error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+ return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+ dmu_buf_t **db, void *tag)
+{
+ dmu_object_info_t doi;
+ int error;
+
+ if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
+ return (error);
+
+ dmu_object_info_from_db(*db, &doi);
+ if ((doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t))) {
+ sa_buf_rele(*db, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+ if (error != 0) {
+ sa_buf_rele(*db, tag);
+ return (error);
+ }
+
+ return (0);
+}
+
+static void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
+{
+ sa_handle_destroy(hdl);
+ sa_buf_rele(db, tag);
+}
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ uint64_t *pobjp, int *is_xattrdir)
+{
+ uint64_t parent;
+ uint64_t pflags;
+ uint64_t mode;
+ uint64_t parent_mode;
+ sa_bulk_attr_t bulk[3];
+ sa_handle_t *sa_hdl;
+ dmu_buf_t *sa_db;
+ int count = 0;
+ int error;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+ &parent, sizeof (parent));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+ &pflags, sizeof (pflags));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &mode, sizeof (mode));
+
+ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+ return (error);
+
+ /*
+ * When a link is removed its parent pointer is not changed and will
+ * be invalid. There are two cases where a link is removed but the
+	 * file stays around: when it goes to the delete queue and when there
+ * are additional links.
+ */
+ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+ /*
+ * Extended attributes can be applied to files, directories, etc.
+ * Otherwise the parent must be a directory.
+ */
+ if (!*is_xattrdir && !S_ISDIR(parent_mode))
+ return (SET_ERROR(EINVAL));
+
+ *pobjp = parent;
+
+ return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ zfs_stat_t *sb)
+{
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &sb->zs_mode, sizeof (sb->zs_mode));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+ &sb->zs_gen, sizeof (sb->zs_gen));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+ &sb->zs_links, sizeof (sb->zs_links));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+ &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+ return (sa_bulk_lookup(hdl, bulk, count));
+}
+
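+/*
+ * Reconstruct the path of an object by walking its parent pointers up to
+ * the root.  The path is assembled backwards: each "/component" is
+ * prepended at the tail of buf and the finished string is moved to the
+ * front of buf on success.  Objects in the unlinked (delete) set are
+ * reported as ESTALE since they no longer have a valid path.
+ */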
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+ sa_attr_type_t *sa_table, char *buf, int len)
+{
+ sa_handle_t *sa_hdl;
+ sa_handle_t *prevhdl = NULL;
+ dmu_buf_t *prevdb = NULL;
+ dmu_buf_t *sa_db = NULL;
+ char *path = buf + len - 1;
+ int error;
+
+ *path = '\0';
+ sa_hdl = hdl;
+
+ uint64_t deleteq_obj;
+ VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+ ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+ error = zap_lookup_int(osp, deleteq_obj, obj);
+ if (error == 0) {
+ return (ESTALE);
+ } else if (error != ENOENT) {
+ return (error);
+ }
+ error = 0;
+
+ for (;;) {
+ uint64_t pobj;
+ char component[MAXNAMELEN + 2];
+ size_t complen;
+ int is_xattrdir;
+
+ if (prevdb) {
+ ASSERT(prevhdl != NULL);
+ zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+ }
+
+ if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
+ &is_xattrdir)) != 0)
+ break;
+
+ if (pobj == obj) {
+ if (path[0] != '/')
+ *--path = '/';
+ break;
+ }
+
+ component[0] = '/';
+ if (is_xattrdir) {
+ (void) sprintf(component + 1, "<xattrdir>");
+ } else {
+ error = zap_value_search(osp, pobj, obj,
+ ZFS_DIRENT_OBJ(-1ULL), component + 1);
+ if (error != 0)
+ break;
+ }
+
+ complen = strlen(component);
+ path -= complen;
+ ASSERT(path >= buf);
+ bcopy(component, path, complen);
+ obj = pobj;
+
+ if (sa_hdl != hdl) {
+ prevhdl = sa_hdl;
+ prevdb = sa_db;
+ }
+ error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+ if (error != 0) {
+ sa_hdl = prevhdl;
+ sa_db = prevdb;
+ break;
+ }
+ }
+
+ if (sa_hdl != NULL && sa_hdl != hdl) {
+ ASSERT(sa_db != NULL);
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ }
+
+ if (error == 0)
+ (void) memmove(buf, path, buf + len - path);
+
+ return (error);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len)
+{
+ char *path = buf + len - 1;
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ *path = '\0';
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+ if (error != 0) {
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+ }
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+
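+/*
+ * Propagate the current ZFS file size (z_size) to the vnode pager's VM
+ * object when the two disagree, so mapped pages see the correct
+ * end-of-file.
+ */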
+void
+zfs_znode_update_vfs(znode_t *zp)
+{
+ vm_object_t object;
+
+ if ((object = ZTOV(zp)->v_object) == NULL ||
+ zp->z_size == object->un_pager.vnp.vnp_size)
+ return;
+
+ vnode_pager_setsize(ZTOV(zp), zp->z_size);
+}
+
+
+#ifdef _KERNEL
+int
+zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t parent;
+ int is_xattrdir;
+ int err;
+
+ /* Extended attributes should not be visible as regular files. */
+ if ((zp->z_pflags & ZFS_XATTR) != 0)
+ return (SET_ERROR(EINVAL));
+
+ err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table,
+ &parent, &is_xattrdir);
+ if (err != 0)
+ return (err);
+ ASSERT0(is_xattrdir);
+
+ /* No name as this is a root object. */
+ if (parent == zp->z_id)
+ return (SET_ERROR(EINVAL));
+
+ err = zap_value_search(zfsvfs->z_os, parent, zp->z_id,
+ ZFS_DIRENT_OBJ(-1ULL), buf);
+ if (err != 0)
+ return (err);
+ err = zfs_zget(zfsvfs, parent, dzpp);
+ return (err);
+}
+#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
new file mode 100644
index 000000000000..9fe678d2574f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
@@ -0,0 +1,1839 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/zio_crypt.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/sha2.h>
+#include <sys/hkdf.h>
+
+/*
+ * This file is responsible for handling all of the details of generating
+ * encryption parameters and performing encryption and authentication.
+ *
+ * BLOCK ENCRYPTION PARAMETERS:
+ * Encryption /Authentication Algorithm Suite (crypt):
+ * The encryption algorithm, mode, and key length we are going to use. We
+ * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
+ * keys. All authentication is currently done with SHA512-HMAC.
+ *
+ * Plaintext:
+ * The unencrypted data that we want to encrypt.
+ *
+ * Initialization Vector (IV):
+ * An initialization vector for the encryption algorithms. This is used to
+ * "tweak" the encryption algorithms so that two blocks of the same data are
+ * encrypted into different ciphertext outputs, thus obfuscating block patterns.
+ * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
+ * never reused with the same encryption key. This value is stored unencrypted
+ * and must simply be provided to the decryption function. We use a 96 bit IV
+ * (as recommended by NIST) for all block encryption. For non-dedup blocks we
+ * derive the IV randomly. The first 64 bits of the IV are stored in the second
+ * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
+ * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
+ * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
+ * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
+ * level 0 blocks is the number of allocated dnodes in that block. The on-disk
+ * format supports at most 2^15 slots per L0 dnode block, because the maximum
+ * block size is 16MB (2^24). In either case, for level 0 blocks this number
+ * will still be smaller than UINT32_MAX so it is safe to store the IV in the
+ * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
+ * for the dnode code.
+ *
+ * Master key:
+ * This is the most important secret data of an encrypted dataset. It is used
+ * along with the salt to generate the actual encryption keys via HKDF. We
+ * do not use the master key to directly encrypt any data because there are
+ * theoretical limits on how much data can actually be safely encrypted with
+ * any encryption mode. The master key is stored encrypted on disk with the
+ * user's wrapping key. Its length is determined by the encryption algorithm.
+ * For details on how this is stored see the block comment in dsl_crypt.c
+ *
+ * Salt:
+ * Used as an input to the HKDF function, along with the master key. We use a
+ * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
+ * can be used for encrypting many blocks, so we cache the current salt and the
+ * associated derived key in zio_crypt_t so we do not need to derive it again
+ * needlessly.
+ *
+ * Encryption Key:
+ * A secret binary key, generated from an HKDF function used to encrypt and
+ * decrypt data.
+ *
+ * Message Authentication Code (MAC)
+ * The MAC is an output of authenticated encryption modes such as AES-GCM and
+ * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
+ * data on disk and return garbage to the application. Effectively, it is a
+ * checksum that cannot be reproduced by an attacker. We store the MAC in the
+ * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
+ * regular checksum of the ciphertext which can be used for scrubbing.
+ *
+ * OBJECT AUTHENTICATION:
+ * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
+ * they contain some info that always needs to be readable. To prevent this
+ * data from being altered, we authenticate this data using SHA512-HMAC. This
+ * will produce a MAC (similar to the one produced via encryption) which can
+ * be used to verify the object was not modified. HMACs do not require key
+ * rotation or IVs, so we can keep up to the full 3 copies of authenticated
+ * data.
+ *
+ * ZIL ENCRYPTION:
+ * ZIL blocks have their bp written to disk ahead of the associated data, so we
+ * cannot store the MAC there as we normally do. For these blocks the MAC is
+ * stored in the embedded checksum within the zil_chain_t header. The salt and
+ * IV are generated for the block on bp allocation instead of at encryption
+ * time. In addition, ZIL blocks have some pieces that must be left in plaintext
+ * for claiming even though all of the sensitive user data still needs to be
+ * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
+ * pieces of the block need to be encrypted. All data that is not encrypted is
+ * authenticated using the AAD mechanisms that the supported encryption modes
+ * provide for. In order to preserve the semantics of the ZIL for encrypted
+ * datasets, the ZIL is not protected at the objset level as described below.
+ *
+ * DNODE ENCRYPTION:
+ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
+ * in plaintext for scrubbing and claiming, but the bonus buffers might contain
+ * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
+ * which pieces of the block need to be encrypted. For more details about
+ * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
+ *
+ * OBJECT SET AUTHENTICATION:
+ * Up to this point, everything we have encrypted and authenticated has been
+ * at level 0 (or -2 for the ZIL). If we did not do any further work the
+ * on-disk format would be susceptible to attacks that deleted or rearranged
+ * the order of level 0 blocks. Ideally, the cleanest solution would be to
+ * maintain a tree of authentication MACs going up the bp tree. However, this
+ * presents a problem for raw sends. Send files do not send information about
+ * indirect blocks so there would be no convenient way to transfer the MACs and
+ * they cannot be recalculated on the receive side without the master key which
+ * would defeat one of the purposes of raw sends in the first place. Instead,
+ * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
+ * from the level below. We also include some portable fields from blk_prop such
+ * as the lsize and compression algorithm to prevent the data from being
+ * misinterpreted.
+ *
+ * At the objset level, we maintain 2 separate 256 bit MACs in the
+ * objset_phys_t. The first one is "portable" and is the logical root of the
+ * MAC tree maintained in the metadnode's bps. The second is "local" and is
+ * used as the root MAC for the user accounting objects, which are also not
+ * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
+ * of the send file. The useraccounting code ensures that the useraccounting
+ * info is not present upon a receive, so the local MAC can simply be cleared
+ * out at that time. For more info about objset_phys_t authentication, see
+ * zio_crypt_do_objset_hmacs().
+ *
+ * CONSIDERATIONS FOR DEDUP:
+ * In order for dedup to work, blocks that we want to dedup with one another
+ * need to use the same IV and encryption key, so that they will have the same
+ * ciphertext. Normally, one should never reuse an IV with the same encryption
+ * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
+ * blocks. In this case, however, since we are using the same plaintext as
+ * well, all that we end up with is a duplicate of the original ciphertext we
+ * already had. As a result, an attacker with read access to the raw disk will
+ * be able to tell which blocks are the same but this information is given away
+ * by dedup anyway. In order to get the same IVs and encryption keys for
+ * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
+ * here so that a reproducible checksum of the plaintext is never available to
+ * the attacker. The HMAC key is kept alongside the master key, encrypted on
+ * disk. The first 64 bits of the HMAC are used in place of the random salt, and
+ * the next 96 bits are used as the IV. As a result of this mechanism, dedup
+ * will only work within a clone family since encrypted dedup requires use of
+ * the same master and HMAC keys.
+ */
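+
+/*
+ * Quick reference for where the parameters described above are stored on
+ * disk (see zio_crypt_encode_params_bp(), zio_crypt_encode_mac_bp() and
+ * zio_crypt_encode_mac_zil() below):
+ *
+ *	salt (64 bits):		blk_dva[2].dva_word[0]
+ *	IV bits 0..63:		blk_dva[2].dva_word[1]
+ *	IV bits 64..95:		upper 32 bits of blk_fill (IV2)
+ *	block MAC (128 bits):	blk_cksum words 2 and 3
+ *	ZIL block MAC:		embedded checksum in the zil_chain_t header
+ *	objset MACs:		portable and local MACs in the objset_phys_t
+ */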
+
+/*
+ * After encrypting many blocks with the same key we may start to run up
+ * against the theoretical limits of how much data can securely be encrypted
+ * with a single key using the supported encryption modes. The most obvious
+ * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
+ * the more IVs we generate (which both GCM and CCM modes strictly forbid).
+ * This risk actually grows surprisingly quickly over time according to the
+ * Birthday Problem. With a total IV space of 2^96 values, and assuming we
+ * have generated n IVs with a cryptographically secure RNG, the approximate
+ * probability p(n) of a collision is given as:
+ *
+ * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96)))
+ *
+ * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
+ *
+ * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
+ * we must not write more than 398,065,730 blocks with the same encryption key.
+ * Therefore, we rotate our keys after 400,000,000 blocks have been written by
+ * generating a new random 64 bit salt for our HKDF encryption key generation
+ * function. A worked example of this arithmetic follows the definitions
+ * below.
+ */
+#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000
+#define ZFS_CURRENT_MAX_SALT_USES \
+ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
+unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
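+
+/*
+ * Worked example of the bound above (editorial illustration only; nothing
+ * in the code depends on it): with n = 398,065,730,
+ *
+ *	n*(n-1)/2          ~= 7.92e16
+ *	2^96               ~= 7.92e28
+ *	n*(n-1)/(2*(2^96)) ~= 1.0e-12
+ *
+ * so p(n) ~= 1 - e^(-1.0e-12) ~= 1.0e-12, i.e. one chance in a trillion.
+ * The 400,000,000 default above is that figure rounded up.
+ */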
+
+/*
+ * Set to a nonzero value to cause zio_do_crypt_uio() to fail 1/this many
+ * calls, to test decryption error handling code paths.
+ */
+uint64_t zio_decrypt_fail_fraction = 0;
+
+typedef struct blkptr_auth_buf {
+ uint64_t bab_prop; /* blk_prop - portable mask */
+ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */
+ uint64_t bab_pad; /* reserved for future use */
+} blkptr_auth_buf_t;
+
+zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
+ {"", ZC_TYPE_NONE, 0, "inherit"},
+ {"", ZC_TYPE_NONE, 0, "on"},
+ {"", ZC_TYPE_NONE, 0, "off"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"}
+};
+
+static void
+zio_crypt_key_destroy_early(zio_crypt_key_t *key)
+{
+ rw_destroy(&key->zk_salt_lock);
+
+ /* free crypto templates */
+ bzero(&key->zk_session, sizeof (key->zk_session));
+
+ /* zero out sensitive data */
+ bzero(key, sizeof (zio_crypt_key_t));
+
+void
+zio_crypt_key_destroy(zio_crypt_key_t *key)
+{
+
+ freebsd_crypt_freesession(&key->zk_session);
+ zio_crypt_key_destroy_early(key);
+}
+
+int
+zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
+{
+ int ret;
+ crypto_mechanism_t mech __unused;
+ uint_t keydata_len;
+ zio_crypt_info_t *ci = NULL;
+
+ ASSERT(key != NULL);
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+ ci = &zio_crypt_table[crypt];
+ if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+ ci->ci_crypt_type != ZC_TYPE_CCM)
+ return (ENOTSUP);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+ bzero(key, sizeof (zio_crypt_key_t));
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ /* fill keydata buffers and salt with random data */
+ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_master_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for the ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ ci = &zio_crypt_table[crypt];
+ if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+ ci->ci_crypt_type != ZC_TYPE_CCM)
+ return (ENOTSUP);
+
+ ret = freebsd_crypt_newsession(&key->zk_session, ci,
+ &key->zk_current_key);
+ if (ret)
+ goto error;
+
+ key->zk_crypt = crypt;
+ key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+ key->zk_salt_count = 0;
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy_early(key);
+ return (ret);
+}
+
+static int
+zio_crypt_key_change_salt(zio_crypt_key_t *key)
+{
+ int ret = 0;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ crypto_mechanism_t mech __unused;
+
+ uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
+
+ /* generate a new salt */
+ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ rw_enter(&key->zk_salt_lock, RW_WRITER);
+
+ /* someone beat us to the salt rotation, just unlock and return */
+ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
+ goto out_unlock;
+
+ /* derive the current key from the master key and the new salt */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
+ if (ret != 0)
+ goto out_unlock;
+
+ /* assign the salt and reset the usage count */
+ bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
+ key->zk_salt_count = 0;
+
+ freebsd_crypt_freesession(&key->zk_session);
+ ret = freebsd_crypt_newsession(&key->zk_session,
+ &zio_crypt_table[key->zk_crypt], &key->zk_current_key);
+ if (ret != 0)
+ goto out_unlock;
+
+ rw_exit(&key->zk_salt_lock);
+
+ return (0);
+
+out_unlock:
+ rw_exit(&key->zk_salt_lock);
+error:
+ return (ret);
+}
+
+/* See comment above zfs_key_max_salt_uses definition for details */
+int
+zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
+{
+ int ret;
+ boolean_t salt_change;
+
+ rw_enter(&key->zk_salt_lock, RW_READER);
+
+ bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
+ salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >=
+ ZFS_CURRENT_MAX_SALT_USES);
+
+ rw_exit(&key->zk_salt_lock);
+
+ if (salt_change) {
+ ret = zio_crypt_key_change_salt(key);
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+void *failed_decrypt_buf;
+int failed_decrypt_size;
+
+/*
+ * This function handles all encryption and decryption in zfs. When
+ * encrypting it expects puio to reference the plaintext and cuio to
+ * reference the ciphertext. cuio must have enough space for the
+ * ciphertext + room for a MAC. datalen should be the length of the
+ * plaintext / ciphertext alone.
+ */
+/*
+ * The implementation for FreeBSD's OpenCrypto.
+ *
+ * The big difference between ICP and FOC is that FOC uses a single
+ * buffer for input and output. This means that (for AES-GCM, the
+ * only one supported right now) the source must be copied into the
+ * destination, and the destination must have the AAD, and the tag/MAC,
+ * already associated with it. (Both implementations can use a uio.)
+ *
+ * Since the auth data is part of the iovec array, all we need to know
+ * is the length: 0 means there's no AAD.
+ *
+ */
+static int
+zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess,
+ uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen,
+ zfs_uio_t *uio, uint_t auth_len)
+{
+ zio_crypt_info_t *ci;
+ int ret;
+
+ ci = &zio_crypt_table[crypt];
+ if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+ ci->ci_crypt_type != ZC_TYPE_CCM)
+ return (ENOTSUP);
+
+
+ ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf,
+ datalen, auth_len);
+ if (ret != 0) {
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Returning error %s\n",
+ __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM");
+#endif
+ ret = SET_ERROR(encrypt ? EIO : ECKSUM);
+ }
+
+ return (ret);
+}
+
+int
+zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
+ uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
+{
+ int ret;
+ uint64_t aad[3];
+ /*
+ * With OpenCrypto in FreeBSD, the same buffer is used for
+	 * input and output. Also, the AAD (for AES-GCM at least)
+ * needs to logically go in front.
+ */
+ zfs_uio_t cuio;
+ struct uio cuio_s;
+ iovec_t iovecs[4];
+ uint64_t crypt = key->zk_crypt;
+ uint_t enc_len, keydata_len, aad_len;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ zfs_uio_init(&cuio, &cuio_s);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+ /* generate iv for wrapping the master and hmac key */
+ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ /*
+ * Since we only support one buffer, we need to copy
+ * the plain text (source) to the cipher buffer (dest).
+ * We set iovecs[0] -- the authentication data -- below.
+ */
+ bcopy((void*)key->zk_master_keydata, keydata_out, keydata_len);
+ bcopy((void*)key->zk_hmac_keydata, hmac_keydata_out,
+ SHA512_HMAC_KEYLEN);
+ iovecs[1].iov_base = keydata_out;
+ iovecs[1].iov_len = keydata_len;
+ iovecs[2].iov_base = hmac_keydata_out;
+ iovecs[2].iov_len = SHA512_HMAC_KEYLEN;
+ iovecs[3].iov_base = mac;
+ iovecs[3].iov_len = WRAPPING_MAC_LEN;
+
+ /*
+ * Although we don't support writing to the old format, we do
+ * support rewrapping the key so that the user can move and
+ * quarantine datasets on the old format.
+ */
+ if (key->zk_version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(key->zk_guid);
+ } else {
+ ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(key->zk_guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(key->zk_version);
+ }
+
+ iovecs[0].iov_base = aad;
+ iovecs[0].iov_len = aad_len;
+ enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN;
+
+ GET_UIO_STRUCT(&cuio)->uio_iov = iovecs;
+ zfs_uio_iovcnt(&cuio) = 4;
+ zfs_uio_segflg(&cuio) = UIO_SYSSPACE;
+
+ /* encrypt the keys and store the resulting ciphertext and mac */
+ ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey,
+ iv, enc_len, &cuio, aad_len);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+int
+zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
+ uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
+ uint8_t *mac, zio_crypt_key_t *key)
+{
+ int ret;
+ uint64_t aad[3];
+ /*
+ * With OpenCrypto in FreeBSD, the same buffer is used for
+	 * input and output. Also, the AAD (for AES-GCM at least)
+ * needs to logically go in front.
+ */
+ zfs_uio_t cuio;
+ struct uio cuio_s;
+ iovec_t iovecs[4];
+ void *src, *dst;
+ uint_t enc_len, keydata_len, aad_len;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ zfs_uio_init(&cuio, &cuio_s);
+
+ /*
+ * Since we only support one buffer, we need to copy
+ * the encrypted buffer (source) to the plain buffer
+ * (dest). We set iovecs[0] -- the authentication data --
+ * below.
+ */
+ dst = key->zk_master_keydata;
+ src = keydata;
+
+ bcopy(src, dst, keydata_len);
+
+ dst = key->zk_hmac_keydata;
+ src = hmac_keydata;
+ bcopy(src, dst, SHA512_HMAC_KEYLEN);
+
+ iovecs[1].iov_base = key->zk_master_keydata;
+ iovecs[1].iov_len = keydata_len;
+ iovecs[2].iov_base = key->zk_hmac_keydata;
+ iovecs[2].iov_len = SHA512_HMAC_KEYLEN;
+ iovecs[3].iov_base = mac;
+ iovecs[3].iov_len = WRAPPING_MAC_LEN;
+
+ if (version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(guid);
+ } else {
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(version);
+ }
+
+ enc_len = keydata_len + SHA512_HMAC_KEYLEN;
+ iovecs[0].iov_base = aad;
+ iovecs[0].iov_len = aad_len;
+
+ GET_UIO_STRUCT(&cuio)->uio_iov = iovecs;
+ zfs_uio_iovcnt(&cuio) = 4;
+ zfs_uio_segflg(&cuio) = UIO_SYSSPACE;
+
+ /* decrypt the keys and store the result in the output buffers */
+ ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey,
+ iv, enc_len, &cuio, aad_len);
+
+ if (ret != 0)
+ goto error;
+
+ /* generate a fresh salt */
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ ret = freebsd_crypt_newsession(&key->zk_session,
+ &zio_crypt_table[crypt], &key->zk_current_key);
+ if (ret != 0)
+ goto error;
+
+ key->zk_crypt = crypt;
+ key->zk_version = version;
+ key->zk_guid = guid;
+ key->zk_salt_count = 0;
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy_early(key);
+ return (ret);
+}
+
+int
+zio_crypt_generate_iv(uint8_t *ivbuf)
+{
+ int ret;
+
+ /* randomly generate the IV */
+ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ bzero(ivbuf, ZIO_DATA_IV_LEN);
+ return (ret);
+}
+
+int
+zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
+ uint8_t *digestbuf, uint_t digestlen)
+{
+ uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH];
+
+ ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
+
+ crypto_mac(&key->zk_hmac_key, data, datalen,
+ raw_digestbuf, SHA512_DIGEST_LENGTH);
+
+ bcopy(raw_digestbuf, digestbuf, digestlen);
+
+ return (0);
+}
+
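+/*
+ * Derive a deterministic salt and IV for a dedup block from an HMAC of its
+ * plaintext (see "CONSIDERATIONS FOR DEDUP" above): the first
+ * ZIO_DATA_SALT_LEN bytes of the digest become the salt and the next
+ * ZIO_DATA_IV_LEN bytes become the IV, so equal plaintexts within a clone
+ * family produce equal ciphertexts.
+ */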
+int
+zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
+ uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
+{
+ int ret;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ ret = zio_crypt_do_hmac(key, data, datalen,
+ digestbuf, SHA512_DIGEST_LENGTH);
+ if (ret != 0)
+ return (ret);
+
+ bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN);
+ bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN);
+
+ return (0);
+}
+
+/*
+ * The following functions are used to encode and decode encryption parameters
+ * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
+ * byte strings, which normally means that these strings would not need to deal
+ * with byteswapping at all. However, both blkptr_t and zil_header_t may be
+ * byteswapped by lower layers and so we must "undo" that byteswap here upon
+ * decoding and encoding in a non-native byteorder. These functions require
+ * that the byteorder bit is correct before being called.
+ */
+void
+zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_ENCRYPTED(bp));
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t));
+ bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t));
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, val32);
+ } else {
+ bcopy(salt, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[0] = BSWAP_64(val64);
+
+ bcopy(iv, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[1] = BSWAP_64(val64);
+
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, BSWAP_32(val32));
+ }
+}
+
+void
+zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_IS_AUTHENTICATED(bp)) {
+ bzero(salt, ZIO_DATA_SALT_LEN);
+ bzero(iv, ZIO_DATA_IV_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t));
+ bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t));
+
+ val32 = (uint32_t)BP_GET_IV2(bp);
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]);
+ bcopy(&val64, salt, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]);
+ bcopy(&val64, iv, sizeof (uint64_t));
+
+ val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp));
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp));
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3],
+ sizeof (uint64_t));
+ } else {
+ bcopy(mac, &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[2] = BSWAP_64(val64);
+
+ bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[3] = BSWAP_64(val64);
+ }
+}
+
+void
+zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ bzero(mac, ZIO_DATA_MAC_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[2]);
+ bcopy(&val64, mac, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[3]);
+ bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
+{
+ zil_chain_t *zilc = data;
+
+ bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
+ sizeof (uint64_t));
+}
+
+void
+zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
+{
+ /*
+ * The ZIL MAC is embedded in the block it protects, which will
+ * not have been byteswapped by the time this function has been called.
+ * As a result, we don't need to worry about byteswapping the MAC.
+ */
+ const zil_chain_t *zilc = data;
+
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+}
+
+/*
+ * This routine takes a block of dnodes (src_abd) and copies only the bonus
+ * buffers to the same offsets in the dst buffer. datalen should be the size
+ * of both the src_abd and the dst buffer (not just the length of the bonus
+ * buffers).
+ */
+void
+zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
+{
+ uint_t i, max_dnp = datalen >> DNODE_SHIFT;
+ uint8_t *src;
+ dnode_phys_t *dnp, *sdnp, *ddnp;
+
+ src = abd_borrow_buf_copy(src_abd, datalen);
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]),
+ DN_MAX_BONUS_LEN(dnp));
+ }
+ }
+
+ abd_return_buf(src_abd, src, datalen);
+}
+
+/*
+ * This function decides which fields from blk_prop are included in
+ * the MACs computed by the various on-disk MAC algorithms.
+ */
+static void
+zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
+{
+ int avoidlint = SPA_MINBLOCKSIZE;
+ /*
+ * Version 0 did not properly zero out all non-portable fields
+ * as it should have done. We maintain this code so that we can
+ * do read-only imports of pools on this version.
+ */
+ if (version == 0) {
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+ BP_SET_PSIZE(bp, avoidlint);
+ return;
+ }
+
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+
+ /*
+ * The hole_birth feature might set these fields even if this bp
+ * is a hole. We zero them out here to guarantee that raw sends
+ * will function with or without the feature.
+ */
+ if (BP_IS_HOLE(bp)) {
+ bp->blk_prop = 0ULL;
+ return;
+ }
+
+ /*
+ * At L0 we want to verify these fields to ensure that data blocks
+ * can not be reinterpreted. For instance, we do not want an attacker
+ * to trick us into returning raw lz4 compressed data to the user
+ * by modifying the compression bits. At higher levels, we cannot
+ * enforce this policy since raw sends do not convey any information
+ * about indirect blocks, so these values might be different on the
+ * receive side. Fortunately, this does not open any new attack
+ * vectors, since any alterations that can be made to a higher level
+ * bp must still verify the correct order of the layer below it.
+ */
+ if (BP_GET_LEVEL(bp) != 0) {
+ BP_SET_BYTEORDER(bp, 0);
+ BP_SET_COMPRESS(bp, 0);
+
+ /*
+ * psize cannot be set to zero or it will trigger
+ * asserts, but the value doesn't really matter as
+ * long as it is constant.
+ */
+ BP_SET_PSIZE(bp, avoidlint);
+ }
+
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+}
+
+static void
+zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
+ blkptr_auth_buf_t *bab, uint_t *bab_len)
+{
+ blkptr_t tmpbp = *bp;
+
+ if (should_bswap)
+ byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+ ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
+ ASSERT0(BP_IS_EMBEDDED(&tmpbp));
+
+ zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac);
+
+ /*
+ * We always MAC blk_prop in LE to ensure portability. This
+ * must be done after decoding the mac, since the endianness
+ * will get zero'd out here.
+ */
+ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version);
+ bab->bab_prop = LE_64(tmpbp.blk_prop);
+ bab->bab_pad = 0ULL;
+
+ /* version 0 did not include the padding */
+ *bab_len = sizeof (blkptr_auth_buf_t);
+ if (version == 0)
+ *bab_len -= sizeof (uint64_t);
+}
+
+static int
+zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ crypto_mac_update(ctx, &bab, bab_len);
+
+ return (0);
+}
+
+static void
+zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ SHA2Update(ctx, &bab, bab_len);
+}
+
+static void
+zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ bcopy(&bab, *aadp, bab_len);
+ *aadp += bab_len;
+ *aad_len += bab_len;
+}
+
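+/*
+ * Feed the portable fields of a dnode into an in-progress HMAC: the core
+ * dnode (with dn_used cleared, non-portable flags masked out, and
+ * multi-byte fields byteswapped as needed so the MAC is endian-independent),
+ * followed by each block pointer and the spill block pointer, if present.
+ */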
+static int
+zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, dnode_phys_t *dnp)
+{
+ int ret, i;
+ dnode_phys_t *adnp;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)];
+
+ /* authenticate the core dnode (masking out non-portable bits) */
+ bcopy(dnp, tmp_dncore, sizeof (tmp_dncore));
+ adnp = (dnode_phys_t *)tmp_dncore;
+ if (le_bswap) {
+ adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec);
+ adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen);
+ adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid);
+ adnp->dn_used = BSWAP_64(adnp->dn_used);
+ }
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+
+ crypto_mac_update(ctx, adnp, sizeof (tmp_dncore));
+
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, &dnp->dn_blkptr[i]);
+ if (ret != 0)
+ goto error;
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, DN_SPILL_BLKPTR(dnp));
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * objset_phys_t blocks introduce a number of exceptions to the normal
+ * authentication process. objset_phys_t's contain 2 separate HMACS for
+ * protecting the integrity of their data. The portable_mac protects the
+ * metadnode. This MAC can be sent with a raw send and protects against
+ * reordering of data within the metadnode. The local_mac protects the user
+ * accounting objects which are not sent from one system to another.
+ *
+ * In addition, objset blocks are the only blocks that can be modified and
+ * written to disk without the key loaded under certain circumstances. During
+ * zil_claim() we need to be able to update the zil_header_t to complete
+ * claiming log blocks and during raw receives we need to write out the
+ * portable_mac from the send file. Both of these actions are possible
+ * because these fields are not protected by either MAC so neither one will
+ * need to modify the MACs without the key. However, when the modified blocks
+ * are written out they will be byteswapped into the host machine's native
+ * endianness which will modify fields protected by the MAC. As a result, MAC
+ * calculation for objset blocks works slightly differently from other block
+ * types. Where other block types MAC the data in whatever endianness is
+ * written to disk, objset blocks always MAC the little endian version of their
+ * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
+ * and le_bswap indicates whether a byteswap is needed to get this block
+ * into little endian format.
+ */
+/* ARGSUSED */
+int
+zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
+ boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
+{
+ int ret;
+ struct hmac_ctx hash_ctx;
+ struct hmac_ctx *ctx = &hash_ctx;
+ objset_phys_t *osp = data;
+ uint64_t intval;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
+ uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
+
+
+ /* calculate the portable MAC from the portable fields and metadnode */
+ crypto_mac_init(ctx, &key->zk_hmac_key);
+
+ /* add in the os_type */
+ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
+ crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+ /* add in the portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ /* CONSTCOND */
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+ /* add in fields from the metadnode */
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_meta_dnode);
+ if (ret)
+ goto error;
+
+ crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH);
+
+ bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
+
+ /*
+ * This is necessary here as we check next whether
+ * OBJSET_FLAG_USERACCOUNTING_COMPLETE or
+ * OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE are set in order to
+ * decide if the local_mac should be zeroed out.
+ */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+
+ /*
+ * The local MAC protects the user, group and project accounting.
+ * If these objects are not present, the local MAC is zeroed out.
+ */
+ if ((datalen >= OBJSET_PHYS_SIZE_V3 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen >= OBJSET_PHYS_SIZE_V2 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen <= OBJSET_PHYS_SIZE_V1) ||
+ (((intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE) == 0 ||
+ (intval & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) == 0) &&
+ key->zk_version > 0)) {
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (0);
+ }
+
+ /* calculate the local MAC from the userused and groupused dnodes */
+ crypto_mac_init(ctx, &key->zk_hmac_key);
+
+ /* add in the non-portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ /* CONSTCOND */
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+ /* XXX check dnode type ... */
+ /* add in fields from the user accounting dnodes */
+ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_userused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_groupused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
+ datalen >= OBJSET_PHYS_SIZE_V3) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_projectused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH);
+
+ bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
+
+ return (0);
+
+error:
+ bzero(portable_mac, ZIO_OBJSET_MAC_LEN);
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (ret);
+}
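
The comment above zio_crypt_do_objset_hmacs() states the rule that objset blocks always MAC the little endian version of their protected fields, regardless of the byte order written to disk. A rough userland sketch of that canonicalization, not part of the patch itself (mac_update() is a hypothetical stand-in for crypto_mac_update()):

#include <stdint.h>
#include <stdio.h>

static void
mac_update(const void *buf, size_t len)
{
	/* hypothetical stand-in: a real caller would feed an HMAC context */
	(void) buf; (void) len;
}

/* feed a 64-bit field to the MAC in little-endian byte order */
static void
mac_update_le64(uint64_t v)
{
	uint8_t le[8];

	for (int i = 0; i < 8; i++)
		le[i] = (uint8_t)(v >> (8 * i));	/* byte 0 = LSB */
	mac_update(le, sizeof (le));
}

int
main(void)
{
	uint64_t os_flags = 0x2;	/* hypothetical portable flag bits */

	mac_update_le64(os_flags);	/* same MAC input on LE and BE hosts */
	printf("MAC'd canonical little-endian bytes\n");
	return (0);
}

The kernel code reaches the same result by byteswapping the host value only when the host is big endian (the !ZFS_HOST_BYTEORDER checks above).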
+
+static void
+zio_crypt_destroy_uio(zfs_uio_t *uio)
+{
+ if (GET_UIO_STRUCT(uio)->uio_iov)
+ kmem_free(GET_UIO_STRUCT(uio)->uio_iov,
+ zfs_uio_iovcnt(uio) * sizeof (iovec_t));
+}
+
+/*
+ * This function parses an uncompressed indirect block and returns a checksum
+ * of all the portable fields from all of the contained bps. The portable
+ * fields are the MAC and all of the fields from blk_prop except for the dedup,
+ * checksum, and psize bits. For an explanation of the purpose of this, see
+ * the comment block on object set authentication.
+ */
+static int
+zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
+ uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
+{
+ blkptr_t *bp;
+ int i, epb = datalen >> SPA_BLKPTRSHIFT;
+ SHA2_CTX ctx;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ /* checksum all of the MACs from the layer below */
+ SHA2Init(SHA512, &ctx);
+ for (i = 0, bp = buf; i < epb; i++, bp++) {
+ zio_crypt_bp_do_indrect_checksum_updates(&ctx, version,
+ byteswap, bp);
+ }
+ SHA2Final(digestbuf, &ctx);
+
+ if (generate) {
+ bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN);
+ return (0);
+ }
+
+ if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) {
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__);
+#endif
+ return (SET_ERROR(ECKSUM));
+ }
+ return (0);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+
+ /*
+ * Unfortunately, callers of this function will not always have
+ * easy access to the on-disk format version. This info is
+ * normally found in the DSL Crypto Key, but the checksum-of-MACs
+ * is expected to be verifiable even when the key isn't loaded.
+ * Here, instead of doing a ZAP lookup for the version for each
+ * zio, we simply try both existing formats.
+ */
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
+ datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
+ if (ret == ECKSUM) {
+ ASSERT(!generate);
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate,
+ buf, datalen, 0, byteswap, cksum);
+ }
+
+ return (ret);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+ void *buf;
+
+ buf = abd_borrow_buf_copy(abd, datalen);
+ ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
+ byteswap, cksum);
+ abd_return_buf(abd, buf, datalen);
+
+ return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting ZIL blocks.
+ * We do not check for the older ZIL chain because the encryption feature
+ * was not available before the newer ZIL chain was introduced. The goal
+ * here is to encrypt everything except the blkptr_t of a lr_write_t and
+ * the zil_chain_t header. Everything that is not encrypted is authenticated.
+ */
+/*
+ * The OpenCrypto used in FreeBSD does not use separate source and
+ * destination buffers; instead, the same buffer is used. Further, to
+ * accommodate some of the drivers, the authbuf needs to be logically before
+ * the data. This means that we need to copy the source to the destination,
+ * and set up an extra iovec_t at the beginning to handle the authbuf.
+ * It also means we'll only return one zfs_uio_t.
+ */
+
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio,
+ zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+ boolean_t *no_crypt)
+{
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+ uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
+ iovec_t *dst_iovecs;
+ zil_chain_t *zilc;
+ lr_t *lr;
+ uint64_t txtype, lr_len;
+ uint_t crypt_len, nr_iovecs, vec;
+ uint_t aad_len = 0, total_len = 0;
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ }
+ bcopy(src, dst, datalen);
+
+ /* Find the start and end record of the log block. */
+ zilc = (zil_chain_t *)src;
+ slrp = src + sizeof (zil_chain_t);
+ aadp = aadbuf;
+ blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+
+ /*
+ * Calculate the number of encrypted iovecs we will need.
+ */
+
+ /* We need at least two iovecs -- one for the AAD, one for the MAC. */
+ nr_iovecs = 2;
+
+ for (; slrp < blkend; slrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (byteswap) {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ } else {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ }
+
+ nr_iovecs++;
+ if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
+ nr_iovecs++;
+ }
+
+ dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP);
+
+ /*
+ * Copy the plain zil header over and authenticate everything except
+ * the checksum that will store our MAC. If we are writing the data
+ * the embedded checksum will not have been calculated yet, so we don't
+ * authenticate that.
+ */
+ bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
+ aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+ aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+
+ slrp = src + sizeof (zil_chain_t);
+ dlrp = dst + sizeof (zil_chain_t);
+
+ /*
+ * Loop over records again, filling in iovecs.
+ */
+
+ /* The first iovec will contain the authbuf. */
+ vec = 1;
+
+ for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ /* copy the common lr_t */
+ bcopy(slrp, dlrp, sizeof (lr_t));
+ bcopy(slrp, aadp, sizeof (lr_t));
+ aadp += sizeof (lr_t);
+ aad_len += sizeof (lr_t);
+
+ /*
+ * If this is a TX_WRITE record we want to encrypt everything
+ * except the bp, if one exists. If the bp does exist we want to
+ * authenticate it.
+ */
+ if (txtype == TX_WRITE) {
+ crypt_len = sizeof (lr_write_t) -
+ sizeof (lr_t) - sizeof (blkptr_t);
+ dst_iovecs[vec].iov_base = (char *)dlrp +
+ sizeof (lr_t);
+ dst_iovecs[vec].iov_len = crypt_len;
+
+ /* copy the bp now since it will not be encrypted */
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ sizeof (blkptr_t));
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ aadp, sizeof (blkptr_t));
+ aadp += sizeof (blkptr_t);
+ aad_len += sizeof (blkptr_t);
+ vec++;
+ total_len += crypt_len;
+
+ if (lr_len != sizeof (lr_write_t)) {
+ crypt_len = lr_len - sizeof (lr_write_t);
+ dst_iovecs[vec].iov_base = (char *)
+ dlrp + sizeof (lr_write_t);
+ dst_iovecs[vec].iov_len = crypt_len;
+ vec++;
+ total_len += crypt_len;
+ }
+ } else {
+ crypt_len = lr_len - sizeof (lr_t);
+ dst_iovecs[vec].iov_base = (char *)dlrp +
+ sizeof (lr_t);
+ dst_iovecs[vec].iov_len = crypt_len;
+ vec++;
+ total_len += crypt_len;
+ }
+ }
+
+ /* The last iovec will contain the MAC. */
+ ASSERT3U(vec, ==, nr_iovecs - 1);
+
+ /* AAD */
+ dst_iovecs[0].iov_base = aadbuf;
+ dst_iovecs[0].iov_len = aad_len;
+ /* MAC */
+ dst_iovecs[vec].iov_base = 0;
+ dst_iovecs[vec].iov_len = 0;
+
+ *no_crypt = (vec == 1);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+ GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs;
+ zfs_uio_iovcnt(out_uio) = nr_iovecs;
+
+ return (0);
+}
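
As described in the comment before zio_crypt_init_uios_zil(), FreeBSD's OpenCrypto path works on a single uio whose first iovec carries the AAD and whose last iovec carries the MAC, with the pieces to be encrypted in between. A minimal userland sketch of that layout, with made-up sizes and POSIX struct iovec rather than the kernel iovec_t:

#include <sys/uio.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	/* made-up buffers standing in for the AAD, two payloads and the MAC */
	static char aad[64], rec0[32], rec1[48], mac[16];
	size_t nvec = 2 + 2;		/* AAD + MAC + two encrypted pieces */
	struct iovec *iov = calloc(nvec, sizeof (*iov));

	if (iov == NULL)
		return (1);
	iov[0].iov_base = aad;  iov[0].iov_len = sizeof (aad);	/* AAD first */
	iov[1].iov_base = rec0; iov[1].iov_len = sizeof (rec0);	/* encrypted */
	iov[2].iov_base = rec1; iov[2].iov_len = sizeof (rec1);	/* encrypted */
	iov[3].iov_base = mac;  iov[3].iov_len = sizeof (mac);	/* MAC last */

	size_t enc_len = 0;
	for (size_t i = 1; i < nvec - 1; i++)	/* middle vectors only */
		enc_len += iov[i].iov_len;
	printf("enc_len = %zu over %zu iovecs\n", enc_len, nvec);
	free(iov);
	return (0);
}

This is why vec starts at 1 in the function above: slot 0 is reserved for the authbuf and the final slot for the MAC.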
+
+/*
+ * Special case handling routine for encrypting / decrypting dnode blocks.
+ */
+static int
+zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf,
+ uint_t *auth_len, boolean_t *no_crypt)
+{
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+ uint8_t *src, *dst, *aadp;
+ dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
+ iovec_t *dst_iovecs;
+ uint_t nr_iovecs, crypt_len, vec;
+ uint_t aad_len = 0, total_len = 0;
+ uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ }
+ bcopy(src, dst, datalen);
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+ aadp = aadbuf;
+
+ /*
+ * Count the number of iovecs we will need to do the encryption by
+ * counting the number of bonus buffers that need to be encrypted.
+ */
+
+ /* We need at least two iovecs -- one for the AAD, one for the MAC. */
+ nr_iovecs = 2;
+
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ /*
+ * This block may still be byteswapped. However, all of the
+ * values we use are either uint8_t's (for which byteswapping
+ * is a noop) or a * != 0 check, which will work regardless
+ * of whether or not we byteswap.
+ */
+ if (sdnp[i].dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
+ sdnp[i].dn_bonuslen != 0) {
+ nr_iovecs++;
+ }
+ }
+
+ dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP);
+
+ /*
+ * Iterate through the dnodes again, this time filling in the uios
+ * we allocated earlier. We also concatenate any data we want to
+ * authenticate onto aadbuf.
+ */
+
+ /* The first iovec will contain the authbuf. */
+ vec = 1;
+
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+
+ /* copy over the core fields and blkptrs (kept as plaintext) */
+ bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
+ sizeof (blkptr_t));
+ }
+
+ /*
+ * Handle authenticated data. We authenticate everything in
+ * the dnode that can be brought over when we do a raw send.
+ * This includes all of the core fields as well as the MACs
+ * stored in the bp checksums and all of the portable bits
+ * from blk_prop. We include the dnode padding here in case it
+ * ever gets used in the future. Some dn_flags and dn_used are
+ * not portable, so we mask those values out of the
+ * authenticated data.
+ */
+ crypt_len = offsetof(dnode_phys_t, dn_blkptr);
+ bcopy(dnp, aadp, crypt_len);
+ adnp = (dnode_phys_t *)aadp;
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+ aadp += crypt_len;
+ aad_len += crypt_len;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, &dnp->dn_blkptr[j]);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, DN_SPILL_BLKPTR(dnp));
+ }
+
+ /*
+ * If this bonus buffer needs to be encrypted, we prepare an
+ * iovec_t. The encryption / decryption functions will fill
+ * this in for us with the encrypted or decrypted data.
+ * Otherwise we add the bonus buffer to the authenticated
+ * data buffer and copy it over to the destination. The
+ * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
+ * we can guarantee alignment with the AES block size
+ * (128 bits).
+ */
+ crypt_len = DN_MAX_BONUS_LEN(dnp);
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ dst_iovecs[vec].iov_base = DN_BONUS(&ddnp[i]);
+ dst_iovecs[vec].iov_len = crypt_len;
+
+ vec++;
+ total_len += crypt_len;
+ } else {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
+ bcopy(DN_BONUS(dnp), aadp, crypt_len);
+ aadp += crypt_len;
+ aad_len += crypt_len;
+ }
+ }
+
+ /* The last iovec will contain the MAC. */
+ ASSERT3U(vec, ==, nr_iovecs - 1);
+
+ /* AAD */
+ dst_iovecs[0].iov_base = aadbuf;
+ dst_iovecs[0].iov_len = aad_len;
+ /* MAC */
+ dst_iovecs[vec].iov_base = 0;
+ dst_iovecs[vec].iov_len = 0;
+
+ *no_crypt = (vec == 1);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+ GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs;
+ zfs_uio_iovcnt(out_uio) = nr_iovecs;
+
+ return (0);
+}
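
The bonus-buffer comment above notes that the encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so the region stays aligned with the 128-bit AES block. A small sketch of what such block alignment means in practice; the round-up helper below is hypothetical, not an OpenZFS interface:

#include <stdint.h>
#include <stdio.h>

#define AES_BLOCK_LEN	16u	/* 128-bit AES block */

/* round a length up to the next multiple of the AES block size */
static uint32_t
aes_block_roundup(uint32_t len)
{
	return ((len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN * AES_BLOCK_LEN);
}

int
main(void)
{
	uint32_t bonuslen = 200;	/* hypothetical dn_bonuslen */

	printf("bonuslen %u -> block-aligned length %u\n",
	    bonuslen, aes_block_roundup(bonuslen));
	return (0);
}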
+
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *out_uio,
+ uint_t *enc_len)
+{
+ int ret;
+ uint_t nr_plain = 1, nr_cipher = 2;
+ iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
+ void *src, *dst;
+
+ cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
+ KM_SLEEP);
+ if (!cipher_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ bzero(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ }
+ bcopy(src, dst, datalen);
+ cipher_iovecs[0].iov_base = dst;
+ cipher_iovecs[0].iov_len = datalen;
+
+ *enc_len = datalen;
+ GET_UIO_STRUCT(out_uio)->uio_iov = cipher_iovecs;
+ zfs_uio_iovcnt(out_uio) = nr_cipher;
+
+ return (0);
+
+error:
+ if (plain_iovecs != NULL)
+ kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
+ if (cipher_iovecs != NULL)
+ kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+ *enc_len = 0;
+ GET_UIO_STRUCT(out_uio)->uio_iov = NULL;
+ zfs_uio_iovcnt(out_uio) = 0;
+
+ return (ret);
+}
+
+/*
+ * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
+ * that they can be used for encryption and decryption by zio_do_crypt_uio().
+ * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
+ * requiring special handling to parse out pieces that are to be encrypted. The
+ * authbuf is used by these special cases to store additional authenticated
+ * data (AAD) for the encryption modes.
+ */
+static int
+zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len,
+ uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt)
+{
+ int ret;
+ iovec_t *mac_iov;
+
+ ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
+
+ /* route to handler */
+ switch (ot) {
+ case DMU_OT_INTENT_LOG:
+ ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
+ datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+ no_crypt);
+ break;
+ case DMU_OT_DNODE:
+ ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
+ cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
+ auth_len, no_crypt);
+ break;
+ default:
+ ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
+ datalen, puio, cuio, enc_len);
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ break;
+ }
+
+ if (ret != 0)
+ goto error;
+
+ /* populate the uios */
+ zfs_uio_segflg(cuio) = UIO_SYSSPACE;
+
+ mac_iov =
+ ((iovec_t *)&(GET_UIO_STRUCT(cuio)->
+ uio_iov[zfs_uio_iovcnt(cuio) - 1]));
+ mac_iov->iov_base = (void *)mac;
+ mac_iov->iov_len = ZIO_DATA_MAC_LEN;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+void *failed_decrypt_buf;
+int failed_decrypt_size;
+
+/*
+ * Primary encryption / decryption entrypoint for zio data.
+ */
+int
+zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
+ dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+ uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
+ boolean_t *no_crypt)
+{
+ int ret;
+ boolean_t locked = B_FALSE;
+ uint64_t crypt = key->zk_crypt;
+ uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
+ uint_t enc_len, auth_len;
+ zfs_uio_t puio, cuio;
+ struct uio puio_s, cuio_s;
+ uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
+ crypto_key_t tmp_ckey, *ckey = NULL;
+ freebsd_crypt_session_t *tmpl = NULL;
+ uint8_t *authbuf = NULL;
+
+
+ zfs_uio_init(&puio, &puio_s);
+ zfs_uio_init(&cuio, &cuio_s);
+ bzero(GET_UIO_STRUCT(&puio), sizeof (struct uio));
+ bzero(GET_UIO_STRUCT(&cuio), sizeof (struct uio));
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n",
+ __FUNCTION__,
+ encrypt ? "encrypt" : "decrypt",
+ key, salt, ot, iv, mac, datalen,
+ byteswap ? "byteswap" : "native_endian", plainbuf,
+ cipherbuf, no_crypt);
+
+ printf("\tkey = {");
+ for (int i = 0; i < key->zk_current_key.ck_length/8; i++)
+ printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]);
+ printf("}\n");
+#endif
+ /* create uios for encryption */
+ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
+ cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
+ &authbuf, &auth_len, no_crypt);
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * If the needed key is the current one, just use it. Otherwise we
+ * need to generate a temporary one from the given salt + master key.
+ * If we are encrypting, we must return a copy of the current salt
+ * so that it can be stored in the blkptr_t.
+ */
+ rw_enter(&key->zk_salt_lock, RW_READER);
+ locked = B_TRUE;
+
+ if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
+ ckey = &key->zk_current_key;
+ tmpl = &key->zk_session;
+ } else {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+ tmp_ckey.ck_format = CRYPTO_KEY_RAW;
+ tmp_ckey.ck_data = enc_keydata;
+ tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ ckey = &tmp_ckey;
+ tmpl = NULL;
+ }
+
+ /* perform the encryption / decryption */
+ ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt,
+ ckey, iv, enc_len, &cuio, auth_len);
+ if (ret != 0)
+ goto error;
+ if (locked) {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+ }
+
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+
+ return (0);
+
+error:
+ if (!encrypt) {
+ if (failed_decrypt_buf != NULL)
+ kmem_free(failed_decrypt_buf, failed_decrypt_size);
+ failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP);
+ failed_decrypt_size = datalen;
+ bcopy(cipherbuf, failed_decrypt_buf, datalen);
+ }
+ if (locked)
+ rw_exit(&key->zk_salt_lock);
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+ return (SET_ERROR(ret));
+}
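
The salt handling in zio_do_crypt_data() above boils down to: if the block's salt matches the key's current salt, reuse the cached session key; otherwise derive a one-off key from the master key and that salt. A compressed sketch of that decision, where derive_key() is a deliberately fake stand-in for hkdf_sha512() and not a real KDF:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SALT_LEN	8	/* stands in for ZIO_DATA_SALT_LEN */

/* NOT a real KDF: stand-in for hkdf_sha512(), purely for illustration */
static void
derive_key(const uint8_t *master, size_t mlen, const uint8_t *salt,
    uint8_t *out, size_t outlen)
{
	for (size_t i = 0; i < outlen; i++)
		out[i] = master[i % mlen] ^ salt[i % SALT_LEN];
}

int
main(void)
{
	uint8_t cur_salt[SALT_LEN] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	uint8_t blk_salt[SALT_LEN] = { 1, 2, 3, 4, 5, 6, 7, 9 };
	uint8_t master[32] = { 0xaa }, tmp[32];

	if (memcmp(blk_salt, cur_salt, SALT_LEN) == 0) {
		printf("salt matches: reuse the current session key\n");
	} else {
		derive_key(master, sizeof (master), blk_salt,
		    tmp, sizeof (tmp));
		printf("salt differs: derived a temporary key\n");
	}
	return (0);
}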
+
+/*
+ * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
+ * linear buffers.
+ */
+int
+zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
+ boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
+ uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
+{
+ int ret;
+ void *ptmp, *ctmp;
+
+ if (encrypt) {
+ ptmp = abd_borrow_buf_copy(pabd, datalen);
+ ctmp = abd_borrow_buf(cabd, datalen);
+ } else {
+ ptmp = abd_borrow_buf(pabd, datalen);
+ ctmp = abd_borrow_buf_copy(cabd, datalen);
+ }
+
+ ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
+ datalen, ptmp, ctmp, no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (0);
+
+error:
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (SET_ERROR(ret));
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/* BEGIN CSTYLED */
+module_param(zfs_key_max_salt_uses, ulong, 0644);
+MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
+ "can be used for generating encryption keys before it is rotated");
+/* END CSTYLED */
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
new file mode 100644
index 000000000000..2389b1a06355
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -0,0 +1,1525 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions Copyright 2010 Robert Milkowski
+ *
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/zvol/<pool_name>/<dataset_name>
+ *
+ * Volumes are persistent through reboot. No user command needs to be
+ * run before opening and using a device.
+ *
+ * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
+ * in the system. Except when they're simply character devices (volmode=dev).
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/proc.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/disk.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dnode.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/byteorder.h>
+#include <sys/sunddi.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/queue.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zil.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_rlock.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
+#include <sys/zvol.h>
+#include <sys/zil_impl.h>
+#include <sys/dataset_kstats.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
+#include <sys/zil_impl.h>
+#include <sys/filio.h>
+
+#include <geom/geom.h>
+#include <sys/zvol.h>
+#include <sys/zvol_impl.h>
+
+#include "zfs_namecheck.h"
+
+#define ZVOL_DUMPSIZE "dumpsize"
+
+#ifdef ZVOL_LOCK_DEBUG
+#define ZVOL_RW_READER RW_WRITER
+#define ZVOL_RW_READ_HELD RW_WRITE_HELD
+#else
+#define ZVOL_RW_READER RW_READER
+#define ZVOL_RW_READ_HELD RW_READ_HELD
+#endif
+
+enum zvol_geom_state {
+ ZVOL_GEOM_UNINIT,
+ ZVOL_GEOM_STOPPED,
+ ZVOL_GEOM_RUNNING,
+};
+
+struct zvol_state_os {
+#define zso_dev _zso_state._zso_dev
+#define zso_geom _zso_state._zso_geom
+ union {
+ /* volmode=dev */
+ struct zvol_state_dev {
+ struct cdev *zsd_cdev;
+ uint64_t zsd_sync_cnt;
+ } _zso_dev;
+
+ /* volmode=geom */
+ struct zvol_state_geom {
+ struct g_provider *zsg_provider;
+ struct bio_queue_head zsg_queue;
+ struct mtx zsg_queue_mtx;
+ enum zvol_geom_state zsg_state;
+ } _zso_geom;
+ } _zso_state;
+ int zso_dying;
+};
+
+static uint32_t zvol_minors;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
+ "Expose as GEOM providers (1), device files (2) or neither");
+static boolean_t zpool_on_zvol = B_FALSE;
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
+ "Allow zpools to use zvols as vdevs (DANGEROUS)");
+
+/*
+ * Toggle unmap functionality.
+ */
+boolean_t zvol_unmap_enabled = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
+ &zvol_unmap_enabled, 0, "Enable UNMAP functionality");
+
+/*
+ * zvol maximum transfer in one DMU tx.
+ */
+int zvol_maxphys = DMU_MAX_ACCESS / 2;
+
+static void zvol_ensure_zilog(zvol_state_t *zv);
+
+static d_open_t zvol_cdev_open;
+static d_close_t zvol_cdev_close;
+static d_ioctl_t zvol_cdev_ioctl;
+static d_read_t zvol_cdev_read;
+static d_write_t zvol_cdev_write;
+static d_strategy_t zvol_geom_bio_strategy;
+
+static struct cdevsw zvol_cdevsw = {
+ .d_name = "zvol",
+ .d_version = D_VERSION,
+ .d_flags = D_DISK | D_TRACKCLOSE,
+ .d_open = zvol_cdev_open,
+ .d_close = zvol_cdev_close,
+ .d_ioctl = zvol_cdev_ioctl,
+ .d_read = zvol_cdev_read,
+ .d_write = zvol_cdev_write,
+ .d_strategy = zvol_geom_bio_strategy,
+};
+
+extern uint_t zfs_geom_probe_vdev_key;
+
+struct g_class zfs_zvol_class = {
+ .name = "ZFS::ZVOL",
+ .version = G_VERSION,
+};
+
+DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+
+static int zvol_geom_open(struct g_provider *pp, int flag, int count);
+static int zvol_geom_close(struct g_provider *pp, int flag, int count);
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_worker(void *arg);
+static void zvol_geom_bio_start(struct bio *bp);
+static int zvol_geom_bio_getattr(struct bio *bp);
+/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */
+
+/*
+ * GEOM mode implementation
+ */
+
+/*ARGSUSED*/
+static int
+zvol_geom_open(struct g_provider *pp, int flag, int count)
+{
+ zvol_state_t *zv;
+ int err = 0;
+ boolean_t drop_suspend = B_FALSE;
+ boolean_t drop_namespace = B_FALSE;
+
+ if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
+ /*
+ * if zfs_geom_probe_vdev_key is set, that means that zfs is
+ * attempting to probe geom providers while looking for a
+ * replacement for a missing VDEV. In this case, the
+ * spa_namespace_lock will not be held, but it is still illegal
+ * to use a zvol as a vdev. Deadlocks can result if another
+ * thread has spa_namespace_lock.
+ */
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+retry:
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = pp->private;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
+ }
+
+ if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+ /*
+ * We need to guarantee that the namespace lock is held
+ * to avoid spurious failures in zvol_first_open.
+ */
+ drop_namespace = B_TRUE;
+ if (!mutex_tryenter(&spa_namespace_lock)) {
+ rw_exit(&zvol_state_lock);
+ mutex_enter(&spa_namespace_lock);
+ goto retry;
+ }
+ }
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_zso->zso_dying) {
+ rw_exit(&zvol_state_lock);
+ err = SET_ERROR(ENXIO);
+ goto out_zv_locked;
+ }
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ drop_suspend = B_TRUE;
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ err = zvol_first_open(zv, !(flag & FWRITE));
+ if (err)
+ goto out_zv_locked;
+ pp->mediasize = zv->zv_volsize;
+ pp->stripeoffset = 0;
+ pp->stripesize = zv->zv_volblocksize;
+ }
+
+ /*
+ * Check for a bad on-disk format version now since we
+ * lied about owning the dataset readonly before.
+ */
+ if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
+ dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
+ err = SET_ERROR(EROFS);
+ goto out_opened;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
+ }
+#ifdef FEXCL
+ if (flag & FEXCL) {
+ if (zv->zv_open_count != 0) {
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
+ }
+#endif
+
+ zv->zv_open_count += count;
+out_opened:
+ if (zv->zv_open_count == 0) {
+ zvol_last_close(zv);
+ wakeup(zv);
+ }
+out_zv_locked:
+ mutex_exit(&zv->zv_state_lock);
+out_locked:
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (err);
+}
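
zvol_geom_open() and the cdev open/close routines below all follow the same lock-ordering dance spelled out in their comments: zv_suspend_lock must be taken before zv_state_lock, so when the state lock is already held the code tries a non-blocking acquire, and on failure drops the state lock, takes the locks in the documented order, and re-checks zv_open_count. A stripped-down pthreads sketch of that retry pattern; the names and types are made up and are not the kernel primitives:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t suspend_lock = PTHREAD_RWLOCK_INITIALIZER;
static int open_count;

/*
 * Take suspend_lock (reader) for a first open without violating the
 * suspend-before-state lock order, re-checking after any retry gap.
 */
static int
first_open_locks(void)
{
	int need_suspend = 0;

	pthread_mutex_lock(&state_lock);
	if (open_count == 0) {
		need_suspend = 1;
		if (pthread_rwlock_tryrdlock(&suspend_lock) != 0) {
			/* drop, take the locks in order, then re-check */
			pthread_mutex_unlock(&state_lock);
			pthread_rwlock_rdlock(&suspend_lock);
			pthread_mutex_lock(&state_lock);
			if (open_count != 0) {	/* lost a race: not first */
				pthread_rwlock_unlock(&suspend_lock);
				need_suspend = 0;
			}
		}
	}
	open_count++;
	pthread_mutex_unlock(&state_lock);
	return (need_suspend);
}

int
main(void)
{
	int held = first_open_locks();

	printf("suspend lock held for first open: %s\n", held ? "yes" : "no");
	if (held)
		pthread_rwlock_unlock(&suspend_lock);
	return (0);
}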
+
+/*ARGSUSED*/
+static int
+zvol_geom_close(struct g_provider *pp, int flag, int count)
+{
+ zvol_state_t *zv;
+ boolean_t drop_suspend = B_TRUE;
+ int new_open_count;
+
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = pp->private;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT3U(zv->zv_open_count, ==, 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+ ASSERT3U(zv->zv_open_count, >, 0);
+
+ /*
+ * make sure zvol is not suspended during last close
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ new_open_count = zv->zv_open_count - count;
+ if (new_open_count == 0) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ new_open_count = zv->zv_open_count - count;
+ if (new_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+ zv->zv_open_count = new_open_count;
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ zvol_last_close(zv);
+ wakeup(zv);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (0);
+}
+
+static void
+zvol_geom_run(zvol_state_t *zv)
+{
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ g_error_provider(pp, 0);
+
+ kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
+ "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
+}
+
+static void
+zvol_geom_destroy(zvol_state_t *zv)
+{
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ g_topology_assert();
+
+ mutex_enter(&zv->zv_state_lock);
+ VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
+ mutex_exit(&zv->zv_state_lock);
+ zsg->zsg_provider = NULL;
+ g_wither_geom(pp->geom, ENXIO);
+}
+
+void
+zvol_wait_close(zvol_state_t *zv)
+{
+
+ if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
+ return;
+ mutex_enter(&zv->zv_state_lock);
+ zv->zv_zso->zso_dying = B_TRUE;
+
+ if (zv->zv_open_count)
+ msleep(zv, &zv->zv_state_lock,
+ PRIBIO, "zvol:dying", 10*hz);
+ mutex_exit(&zv->zv_state_lock);
+}
+
+
+static int
+zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+ int count, error, flags;
+
+ g_topology_assert();
+
+ /*
+ * To make it easier we expect either open or close, but not both
+ * at the same time.
+ */
+ KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
+ (acr <= 0 && acw <= 0 && ace <= 0),
+ ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
+ pp->name, acr, acw, ace));
+
+ if (pp->private == NULL) {
+ if (acr <= 0 && acw <= 0 && ace <= 0)
+ return (0);
+ return (pp->error);
+ }
+
+ /*
+ * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
+ * ace != 0, because GEOM already handles that and handles it a bit
+ * differently. GEOM allows for multiple read/exclusive consumers and
+ * ZFS allows only one exclusive consumer, no matter if it is reader or
+ * writer. I like the way GEOM works better, so I'll leave it to GEOM
+ * to decide what to do.
+ */
+
+ count = acr + acw + ace;
+ if (count == 0)
+ return (0);
+
+ flags = 0;
+ if (acr != 0 || ace != 0)
+ flags |= FREAD;
+ if (acw != 0)
+ flags |= FWRITE;
+
+ g_topology_unlock();
+ if (count > 0)
+ error = zvol_geom_open(pp, flags, count);
+ else
+ error = zvol_geom_close(pp, flags, -count);
+ g_topology_lock();
+ return (error);
+}
+
+static void
+zvol_geom_worker(void *arg)
+{
+ zvol_state_t *zv = arg;
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct bio *bp;
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ thread_lock(curthread);
+ sched_prio(curthread, PRIBIO);
+ thread_unlock(curthread);
+
+ for (;;) {
+ mtx_lock(&zsg->zsg_queue_mtx);
+ bp = bioq_takefirst(&zsg->zsg_queue);
+ if (bp == NULL) {
+ if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
+ zsg->zsg_state = ZVOL_GEOM_RUNNING;
+ wakeup(&zsg->zsg_state);
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ kthread_exit();
+ }
+ msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
+ PRIBIO | PDROP, "zvol:io", 0);
+ continue;
+ }
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ zvol_geom_bio_strategy(bp);
+ }
+}
+
+static void
+zvol_geom_bio_start(struct bio *bp)
+{
+ zvol_state_t *zv = bp->bio_to->private;
+ struct zvol_state_geom *zsg;
+ boolean_t first;
+
+ if (zv == NULL) {
+ g_io_deliver(bp, ENXIO);
+ return;
+ }
+ if (bp->bio_cmd == BIO_GETATTR) {
+ if (zvol_geom_bio_getattr(bp))
+ g_io_deliver(bp, EOPNOTSUPP);
+ return;
+ }
+
+ if (!THREAD_CAN_SLEEP()) {
+ zsg = &zv->zv_zso->zso_geom;
+ mtx_lock(&zsg->zsg_queue_mtx);
+ first = (bioq_first(&zsg->zsg_queue) == NULL);
+ bioq_insert_tail(&zsg->zsg_queue, bp);
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ if (first)
+ wakeup_one(&zsg->zsg_queue);
+ return;
+ }
+
+ zvol_geom_bio_strategy(bp);
+}
+
+static int
+zvol_geom_bio_getattr(struct bio *bp)
+{
+ zvol_state_t *zv;
+
+ zv = bp->bio_to->private;
+ ASSERT3P(zv, !=, NULL);
+
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+ uint64_t refd, avail, usedobjs, availobjs;
+
+ if (g_handleattr_int(bp, "GEOM::candelete", 1))
+ return (0);
+ if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
+ avail = metaslab_class_get_space(spa_normal_class(spa));
+ avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksavail",
+ avail / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
+ refd = metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
+ return (0);
+ }
+ return (1);
+}
+
+static void
+zvol_geom_bio_strategy(struct bio *bp)
+{
+ zvol_state_t *zv;
+ uint64_t off, volsize;
+ size_t resid;
+ char *addr;
+ objset_t *os;
+ zfs_locked_range_t *lr;
+ int error = 0;
+ boolean_t doread = B_FALSE;
+ boolean_t is_dumpified;
+ boolean_t sync;
+
+ if (bp->bio_to)
+ zv = bp->bio_to->private;
+ else
+ zv = bp->bio_dev->si_drv2;
+
+ if (zv == NULL) {
+ error = SET_ERROR(ENXIO);
+ goto out;
+ }
+
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+
+ switch (bp->bio_cmd) {
+ case BIO_READ:
+ doread = B_TRUE;
+ break;
+ case BIO_WRITE:
+ case BIO_FLUSH:
+ case BIO_DELETE:
+ if (zv->zv_flags & ZVOL_RDONLY) {
+ error = SET_ERROR(EROFS);
+ goto resume;
+ }
+ zvol_ensure_zilog(zv);
+ if (bp->bio_cmd == BIO_FLUSH)
+ goto sync;
+ break;
+ default:
+ error = SET_ERROR(EOPNOTSUPP);
+ goto resume;
+ }
+
+ off = bp->bio_offset;
+ volsize = zv->zv_volsize;
+
+ os = zv->zv_objset;
+ ASSERT3P(os, !=, NULL);
+
+ addr = bp->bio_data;
+ resid = bp->bio_length;
+
+ if (resid > 0 && off >= volsize) {
+ error = SET_ERROR(EIO);
+ goto resume;
+ }
+
+ is_dumpified = B_FALSE;
+ sync = !doread && !is_dumpified &&
+ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+ /*
+ * There must be no buffer changes when doing a dmu_sync() because
+ * we can't change the data whilst calculating the checksum.
+ */
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
+ doread ? RL_READER : RL_WRITER);
+
+ if (bp->bio_cmd == BIO_DELETE) {
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ zvol_log_truncate(zv, tx, off, resid, sync);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+ off, resid);
+ resid = 0;
+ }
+ goto unlock;
+ }
+ while (resid != 0 && off < volsize) {
+ size_t size = MIN(resid, zvol_maxphys);
+ if (doread) {
+ error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+ DMU_READ_PREFETCH);
+ } else {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+ zvol_log_write(zv, tx, off, size, sync);
+ dmu_tx_commit(tx);
+ }
+ }
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ off += size;
+ addr += size;
+ resid -= size;
+ }
+unlock:
+ zfs_rangelock_exit(lr);
+
+ bp->bio_completed = bp->bio_length - resid;
+ if (bp->bio_completed < bp->bio_length && off > volsize)
+ error = SET_ERROR(EINVAL);
+
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ break;
+ case BIO_READ:
+ dataset_kstats_update_read_kstats(&zv->zv_kstat,
+ bp->bio_completed);
+ break;
+ case BIO_WRITE:
+ dataset_kstats_update_write_kstats(&zv->zv_kstat,
+ bp->bio_completed);
+ break;
+ case BIO_DELETE:
+ break;
+ default:
+ break;
+ }
+
+ if (sync) {
+sync:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ }
+resume:
+ rw_exit(&zv->zv_suspend_lock);
+out:
+ if (bp->bio_to)
+ g_io_deliver(bp, error);
+ else
+ biofinish(bp, NULL, error);
+}
+
+/*
+ * Character device mode implementation
+ */
+
+static int
+zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
+{
+ zvol_state_t *zv;
+ uint64_t volsize;
+ zfs_locked_range_t *lr;
+ int error = 0;
+ zfs_uio_t uio;
+
+ zfs_uio_init(&uio, uio_s);
+
+ zv = dev->si_drv2;
+
+ volsize = zv->zv_volsize;
+ /*
+ * uio_loffset == volsize isn't an error as
+ * it's required for EOF processing.
+ */
+ if (zfs_uio_resid(&uio) > 0 &&
+ (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
+ return (SET_ERROR(EIO));
+
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
+ zfs_uio_resid(&uio), RL_READER);
+ while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
+ uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
+
+ /* don't read past the end */
+ if (bytes > volsize - zfs_uio_offset(&uio))
+ bytes = volsize - zfs_uio_offset(&uio);
+
+ error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ }
+ zfs_rangelock_exit(lr);
+
+ return (error);
+}
+
+static int
+zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
+{
+ zvol_state_t *zv;
+ uint64_t volsize;
+ zfs_locked_range_t *lr;
+ int error = 0;
+ boolean_t sync;
+ zfs_uio_t uio;
+
+ zv = dev->si_drv2;
+
+ volsize = zv->zv_volsize;
+
+ zfs_uio_init(&uio, uio_s);
+
+ if (zfs_uio_resid(&uio) > 0 &&
+ (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
+ return (SET_ERROR(EIO));
+
+ sync = (ioflag & IO_SYNC) ||
+ (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ zvol_ensure_zilog(zv);
+
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
+ zfs_uio_resid(&uio), RL_WRITER);
+ while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
+ uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
+ uint64_t off = zfs_uio_offset(&uio);
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+ if (bytes > volsize - off) /* don't write past the end */
+ bytes = volsize - off;
+
+ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ break;
+ }
+ error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+ if (error == 0)
+ zvol_log_write(zv, tx, off, bytes, sync);
+ dmu_tx_commit(tx);
+
+ if (error)
+ break;
+ }
+ zfs_rangelock_exit(lr);
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ rw_exit(&zv->zv_suspend_lock);
+ return (error);
+}
+
+static int
+zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ zvol_state_t *zv;
+ struct zvol_state_dev *zsd;
+ int err = 0;
+ boolean_t drop_suspend = B_FALSE;
+ boolean_t drop_namespace = B_FALSE;
+
+retry:
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = dev->si_drv2;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
+ }
+
+ if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+ /*
+ * We need to guarantee that the namespace lock is held
+ * to avoid spurious failures in zvol_first_open.
+ */
+ drop_namespace = B_TRUE;
+ if (!mutex_tryenter(&spa_namespace_lock)) {
+ rw_exit(&zvol_state_lock);
+ mutex_enter(&spa_namespace_lock);
+ goto retry;
+ }
+ }
+ mutex_enter(&zv->zv_state_lock);
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
+
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ drop_suspend = B_TRUE;
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ err = zvol_first_open(zv, !(flags & FWRITE));
+ if (err)
+ goto out_zv_locked;
+ }
+
+ if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ err = SET_ERROR(EROFS);
+ goto out_opened;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
+ }
+#ifdef FEXCL
+ if (flags & FEXCL) {
+ if (zv->zv_open_count != 0) {
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
+ }
+#endif
+
+ zv->zv_open_count++;
+ if (flags & (FSYNC | FDSYNC)) {
+ zsd = &zv->zv_zso->zso_dev;
+ zsd->zsd_sync_cnt++;
+ if (zsd->zsd_sync_cnt == 1 &&
+ (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
+ zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
+ }
+out_opened:
+ if (zv->zv_open_count == 0) {
+ zvol_last_close(zv);
+ wakeup(zv);
+ }
+out_zv_locked:
+ mutex_exit(&zv->zv_state_lock);
+out_locked:
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (err);
+}
+
+static int
+zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ zvol_state_t *zv;
+ struct zvol_state_dev *zsd;
+ boolean_t drop_suspend = B_TRUE;
+
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = dev->si_drv2;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT3U(zv->zv_open_count, ==, 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+ ASSERT3U(zv->zv_open_count, >, 0);
+ /*
+ * make sure zvol is not suspended during last close
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 1) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 1) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+ zv->zv_open_count--;
+ if (flags & (FSYNC | FDSYNC)) {
+ zsd = &zv->zv_zso->zso_dev;
+ zsd->zsd_sync_cnt--;
+ }
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ zvol_last_close(zv);
+ wakeup(zv);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (0);
+}
+
+static int
+zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
+ int fflag, struct thread *td)
+{
+ zvol_state_t *zv;
+ zfs_locked_range_t *lr;
+ off_t offset, length;
+ int i, error;
+ boolean_t sync;
+
+ zv = dev->si_drv2;
+
+ error = 0;
+ KASSERT(zv->zv_open_count > 0,
+ ("Device with zero access count in %s", __func__));
+
+ i = IOCPARM_LEN(cmd);
+ switch (cmd) {
+ case DIOCGSECTORSIZE:
+ *(uint32_t *)data = DEV_BSIZE;
+ break;
+ case DIOCGMEDIASIZE:
+ *(off_t *)data = zv->zv_volsize;
+ break;
+ case DIOCGFLUSH:
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ if (zv->zv_zilog != NULL)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ rw_exit(&zv->zv_suspend_lock);
+ break;
+ case DIOCGDELETE:
+ if (!zvol_unmap_enabled)
+ break;
+
+ offset = ((off_t *)data)[0];
+ length = ((off_t *)data)[1];
+ if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
+ offset < 0 || offset >= zv->zv_volsize ||
+ length <= 0) {
+ printf("%s: offset=%jd length=%jd\n", __func__, offset,
+ length);
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ zvol_ensure_zilog(zv);
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
+ RL_WRITER);
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ sync = FALSE;
+ dmu_tx_abort(tx);
+ } else {
+ sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+ zvol_log_truncate(zv, tx, offset, length, sync);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+ offset, length);
+ }
+ zfs_rangelock_exit(lr);
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ rw_exit(&zv->zv_suspend_lock);
+ break;
+ case DIOCGSTRIPESIZE:
+ *(off_t *)data = zv->zv_volblocksize;
+ break;
+ case DIOCGSTRIPEOFFSET:
+ *(off_t *)data = 0;
+ break;
+ case DIOCGATTR: {
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+ struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
+ uint64_t refd, avail, usedobjs, availobjs;
+
+ if (strcmp(arg->name, "GEOM::candelete") == 0)
+ arg->value.i = 1;
+ else if (strcmp(arg->name, "blocksavail") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ arg->value.off = avail / DEV_BSIZE;
+ } else if (strcmp(arg->name, "blocksused") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ arg->value.off = refd / DEV_BSIZE;
+ } else if (strcmp(arg->name, "poolblocksavail") == 0) {
+ avail = metaslab_class_get_space(spa_normal_class(spa));
+ avail -= metaslab_class_get_alloc(
+ spa_normal_class(spa));
+ arg->value.off = avail / DEV_BSIZE;
+ } else if (strcmp(arg->name, "poolblocksused") == 0) {
+ refd = metaslab_class_get_alloc(spa_normal_class(spa));
+ arg->value.off = refd / DEV_BSIZE;
+ } else
+ error = SET_ERROR(ENOIOCTL);
+ break;
+ }
+ case FIOSEEKHOLE:
+ case FIOSEEKDATA: {
+ off_t *off = (off_t *)data;
+ uint64_t noff;
+ boolean_t hole;
+
+ hole = (cmd == FIOSEEKHOLE);
+ noff = *off;
+ error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
+ *off = noff;
+ break;
+ }
+ default:
+ error = SET_ERROR(ENOIOCTL);
+ }
+
+ return (error);
+}
+
+/*
+ * Misc. helpers
+ */
+
+static void
+zvol_ensure_zilog(zvol_state_t *zv)
+{
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+
+ /*
+ * Open a ZIL if this is the first time we have written to this
+ * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+ * than zv_state_lock so that we don't need to acquire an
+ * additional lock in this path.
+ */
+ if (zv->zv_zilog == NULL) {
+ if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
+ rw_exit(&zv->zv_suspend_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+ }
+ if (zv->zv_zilog == NULL) {
+ zv->zv_zilog = zil_open(zv->zv_objset,
+ zvol_get_data);
+ zv->zv_flags |= ZVOL_WRITTEN_TO;
+ }
+ rw_downgrade(&zv->zv_suspend_lock);
+ }
+}
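
zvol_ensure_zilog() is a lazy, double-checked initialization guarded by zv_suspend_lock: readers peek at zv_zilog, and only the first writer upgrades to the write lock, re-checks, opens the ZIL, and downgrades. POSIX rwlocks cannot upgrade or downgrade in place, so the sketch below (with made-up names) drops and retakes the lock instead, which is also the fallback the kernel code uses when rw_tryupgrade() fails:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_rwlock_t suspend_lock = PTHREAD_RWLOCK_INITIALIZER;
static void *zilog;	/* stands in for zv->zv_zilog */

static void *
open_zilog(void)
{
	return (malloc(1));	/* placeholder for zil_open() */
}

/* lazily initialize zilog; callers normally hold only the read lock */
static void
ensure_zilog(void)
{
	pthread_rwlock_rdlock(&suspend_lock);
	if (zilog == NULL) {
		/* POSIX rwlocks can't upgrade: drop and retake as writer */
		pthread_rwlock_unlock(&suspend_lock);
		pthread_rwlock_wrlock(&suspend_lock);
		if (zilog == NULL)	/* re-check after the gap */
			zilog = open_zilog();
	}
	/* the kernel keeps the lock (downgraded); this sketch just drops it */
	pthread_rwlock_unlock(&suspend_lock);
}

int
main(void)
{
	ensure_zilog();
	printf("zilog %s\n", zilog != NULL ? "initialized" : "missing");
	free(zilog);
	return (0);
}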
+
+static boolean_t
+zvol_is_zvol_impl(const char *device)
+{
+ return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
+}
+
+static void
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /* move to new hashtable entry */
+ zv->zv_hash = zvol_name_hash(zv->zv_name);
+ hlist_del(&zv->zv_hlink);
+ hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+ struct g_geom *gp;
+
+ g_topology_lock();
+ gp = pp->geom;
+ ASSERT3P(gp, !=, NULL);
+
+ zsg->zsg_provider = NULL;
+ g_wither_provider(pp, ENXIO);
+
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = zv->zv_volsize;
+ pp->private = zv;
+ zsg->zsg_provider = pp;
+ g_error_provider(pp, 0);
+ g_topology_unlock();
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev;
+ struct make_dev_args args;
+
+ dev = zsd->zsd_cdev;
+ if (dev != NULL) {
+ destroy_dev(dev);
+ dev = zsd->zsd_cdev = NULL;
+ if (zv->zv_open_count > 0) {
+ zv->zv_flags &= ~ZVOL_EXCL;
+ zv->zv_open_count = 0;
+ /* XXX need suspend lock but lock order */
+ zvol_last_close(zv);
+ }
+ }
+
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
+ == 0) {
+ dev->si_iosize_max = maxphys;
+ zsd->zsd_cdev = dev;
+ }
+ }
+ strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+}
+
+/*
+ * Remove minor node for the specified volume.
+ */
+static void
+zvol_free(zvol_state_t *zv)
+{
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+
+ ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
+
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp __maybe_unused = zsg->zsg_provider;
+
+ ASSERT3P(pp->private, ==, NULL);
+
+ g_topology_lock();
+ zvol_geom_destroy(zv);
+ g_topology_unlock();
+ mtx_destroy(&zsg->zsg_queue_mtx);
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev = zsd->zsd_cdev;
+
+ ASSERT3P(dev->si_drv2, ==, NULL);
+
+ destroy_dev(dev);
+ }
+
+ mutex_destroy(&zv->zv_state_lock);
+ dataset_kstats_destroy(&zv->zv_kstat);
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (zvol_state_t));
+ zvol_minors--;
+}
+
+/*
+ * Create a minor node (plus a whole lot more) for the specified volume.
+ */
+static int
+zvol_create_minor_impl(const char *name)
+{
+ zvol_state_t *zv;
+ objset_t *os;
+ dmu_object_info_t *doi;
+ uint64_t volsize;
+ uint64_t volmode, hash;
+ int error;
+
+ ZFS_LOG(1, "Creating ZVOL %s...", name);
+ hash = zvol_name_hash(name);
+ if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ mutex_exit(&zv->zv_state_lock);
+ return (SET_ERROR(EEXIST));
+ }
+
+ DROP_GIANT();
+
+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+ /* lie and say we're read-only */
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
+ if (error)
+ goto out_doi;
+
+ error = dmu_object_info(os, ZVOL_OBJ, doi);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ error = dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
+ if (error || volmode == ZFS_VOLMODE_DEFAULT)
+ volmode = zvol_volmode;
+ error = 0;
+
+ /*
+ * zvol_alloc equivalent ...
+ */
+ zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
+ zv->zv_hash = hash;
+ mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+ zv->zv_volmode = volmode;
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp;
+ struct g_geom *gp;
+
+ zsg->zsg_state = ZVOL_GEOM_UNINIT;
+ mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
+
+ g_topology_lock();
+ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+ gp->start = zvol_geom_bio_start;
+ gp->access = zvol_geom_access;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = 0;
+ pp->private = zv;
+
+ zsg->zsg_provider = pp;
+ bioq_init(&zsg->zsg_queue);
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev;
+ struct make_dev_args args;
+
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
+ if (error) {
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ mutex_destroy(&zv->zv_state_lock);
+ kmem_free(zv, sizeof (*zv));
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ goto out_doi;
+ }
+ dev->si_iosize_max = maxphys;
+ zsd->zsd_cdev = dev;
+ }
+ (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
+ rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+ zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+
+ if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
+ zv->zv_flags |= ZVOL_RDONLY;
+
+ zv->zv_volblocksize = doi->doi_data_block_size;
+ zv->zv_volsize = volsize;
+ zv->zv_objset = os;
+
+ if (spa_writeable(dmu_objset_spa(os))) {
+ if (zil_replay_disable)
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+ else
+ zil_replay(os, zv, zvol_replay_vector);
+ }
+ ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
+
+ /* TODO: prefetch for geom tasting */
+
+ zv->zv_objset = NULL;
+out_dmu_objset_disown:
+ dmu_objset_disown(os, B_TRUE, FTAG);
+
+ if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
+ zvol_geom_run(zv);
+ g_topology_unlock();
+ }
+out_doi:
+ kmem_free(doi, sizeof (dmu_object_info_t));
+ if (error == 0) {
+ rw_enter(&zvol_state_lock, RW_WRITER);
+ zvol_insert(zv);
+ zvol_minors++;
+ rw_exit(&zvol_state_lock);
+ ZFS_LOG(1, "ZVOL %s created.", name);
+ }
+ PICKUP_GIANT();
+ return (error);
+}
+
+static void
+zvol_clear_private(zvol_state_t *zv)
+{
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ if (pp->private == NULL) /* already cleared */
+ return;
+
+ mtx_lock(&zsg->zsg_queue_mtx);
+ zsg->zsg_state = ZVOL_GEOM_STOPPED;
+ pp->private = NULL;
+ wakeup_one(&zsg->zsg_queue);
+ while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
+ msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
+ 0, "zvol:w", 0);
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev = zsd->zsd_cdev;
+
+ dev->si_drv2 = NULL;
+ }
+}
+
+static int
+zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
+{
+ zv->zv_volsize = volsize;
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ g_topology_lock();
+
+ if (pp->private == NULL) {
+ g_topology_unlock();
+ return (SET_ERROR(ENXIO));
+ }
+
+ /*
+ * Do not invoke the resize event when the initial size was zero.
+ * ZVOL initializes the size on first open; this is not
+ * real resizing.
+ */
+ if (pp->mediasize == 0)
+ pp->mediasize = zv->zv_volsize;
+ else
+ g_resize_provider(pp, zv->zv_volsize);
+
+ g_topology_unlock();
+ }
+ return (0);
+}
+
+static void
+zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
+{
+ // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
+}
+
+static void
+zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
+{
+ // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
+}
+
+static const zvol_platform_ops_t zvol_freebsd_ops = {
+ .zv_free = zvol_free,
+ .zv_rename_minor = zvol_rename_minor,
+ .zv_create_minor = zvol_create_minor_impl,
+ .zv_update_volsize = zvol_update_volsize,
+ .zv_clear_private = zvol_clear_private,
+ .zv_is_zvol = zvol_is_zvol_impl,
+ .zv_set_disk_ro = zvol_set_disk_ro_impl,
+ .zv_set_capacity = zvol_set_capacity_impl,
+};
+
+/*
+ * Public interfaces
+ */
+
+int
+zvol_busy(void)
+{
+ return (zvol_minors != 0);
+}
+
+int
+zvol_init(void)
+{
+ zvol_init_impl();
+ zvol_register_ops(&zvol_freebsd_ops);
+ return (0);
+}
+
+void
+zvol_fini(void)
+{
+ zvol_fini_impl();
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/Makefile.in b/sys/contrib/openzfs/module/os/linux/spl/Makefile.in
new file mode 100644
index 000000000000..b2325f91b4a7
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/Makefile.in
@@ -0,0 +1,17 @@
+$(MODULE)-objs += ../os/linux/spl/spl-atomic.o
+$(MODULE)-objs += ../os/linux/spl/spl-condvar.o
+$(MODULE)-objs += ../os/linux/spl/spl-cred.o
+$(MODULE)-objs += ../os/linux/spl/spl-err.o
+$(MODULE)-objs += ../os/linux/spl/spl-generic.o
+$(MODULE)-objs += ../os/linux/spl/spl-kmem.o
+$(MODULE)-objs += ../os/linux/spl/spl-kmem-cache.o
+$(MODULE)-objs += ../os/linux/spl/spl-kstat.o
+$(MODULE)-objs += ../os/linux/spl/spl-proc.o
+$(MODULE)-objs += ../os/linux/spl/spl-procfs-list.o
+$(MODULE)-objs += ../os/linux/spl/spl-taskq.o
+$(MODULE)-objs += ../os/linux/spl/spl-thread.o
+$(MODULE)-objs += ../os/linux/spl/spl-trace.o
+$(MODULE)-objs += ../os/linux/spl/spl-tsd.o
+$(MODULE)-objs += ../os/linux/spl/spl-vmem.o
+$(MODULE)-objs += ../os/linux/spl/spl-xdr.o
+$(MODULE)-objs += ../os/linux/spl/spl-zlib.o
diff --git a/sys/contrib/openzfs/module/os/linux/spl/README.md b/sys/contrib/openzfs/module/os/linux/spl/README.md
new file mode 100644
index 000000000000..906530bcf2ad
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/README.md
@@ -0,0 +1,16 @@
+The Solaris Porting Layer, SPL, is a Linux kernel module which provides a
+compatibility layer used by the [OpenZFS](https://github.com/openzfs/zfs) project.
+
+# Installation
+
+The latest version of the SPL is maintained as part of this repository.
+Only when building ZFS version 0.7.x or earlier must an external SPL release
+be used. These releases can be found at:
+
+ * Version 0.7.x: https://github.com/zfsonlinux/spl/tree/spl-0.7-release
+ * Version 0.6.5.x: https://github.com/zfsonlinux/spl/tree/spl-0.6.5-release
+
+# Release
+
+The SPL is released under a GPLv2 license.
+For more details see the NOTICE and THIRDPARTYLICENSE files (release code `UCRL-CODE-235197`).
diff --git a/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2 b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2
new file mode 100644
index 000000000000..d159169d1050
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip
new file mode 100644
index 000000000000..78535a8ee133
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip
@@ -0,0 +1 @@
+COMPATIBILITY LAYER FOR OPENZFS ON LINUX
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c
new file mode 100644
index 000000000000..accf656fbcc6
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Atomic Implementation.
+ */
+
+#include <sys/atomic.h>
+
+#ifdef ATOMIC_SPINLOCK
+/* Global atomic lock declarations */
+DEFINE_SPINLOCK(atomic32_lock);
+DEFINE_SPINLOCK(atomic64_lock);
+
+EXPORT_SYMBOL(atomic32_lock);
+EXPORT_SYMBOL(atomic64_lock);
+#endif /* ATOMIC_SPINLOCK */
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
new file mode 100644
index 000000000000..d0461a9f1298
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Condition Variables Implementation.
+ */
+
+#include <sys/condvar.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <linux/hrtimer.h>
+#include <linux/compiler_compat.h>
+#include <linux/mod_compat.h>
+
+#include <linux/sched.h>
+
+#ifdef HAVE_SCHED_SIGNAL_HEADER
+#include <linux/sched/signal.h>
+#endif
+
+#define MAX_HRTIMEOUT_SLACK_US 1000
+unsigned int spl_schedule_hrtimeout_slack_us = 0;
+
+static int
+param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp)
+{
+ unsigned long val;
+ int error;
+
+ error = kstrtoul(buf, 0, &val);
+ if (error)
+ return (error);
+
+ if (val > MAX_HRTIMEOUT_SLACK_US)
+ return (-EINVAL);
+
+ error = param_set_uint(buf, kp);
+ if (error < 0)
+ return (error);
+
+ return (0);
+}
+
+module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack,
+ param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644);
+MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us,
+ "schedule_hrtimeout_range() delta/slack value in us, default(0)");
+
+void
+__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
+{
+ ASSERT(cvp);
+ ASSERT(name == NULL);
+ ASSERT(type == CV_DEFAULT);
+ ASSERT(arg == NULL);
+
+ cvp->cv_magic = CV_MAGIC;
+ init_waitqueue_head(&cvp->cv_event);
+ init_waitqueue_head(&cvp->cv_destroy);
+ atomic_set(&cvp->cv_waiters, 0);
+ atomic_set(&cvp->cv_refs, 1);
+ cvp->cv_mutex = NULL;
+}
+EXPORT_SYMBOL(__cv_init);
+
+static int
+cv_destroy_wakeup(kcondvar_t *cvp)
+{
+ if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
+ ASSERT(cvp->cv_mutex == NULL);
+ ASSERT(!waitqueue_active(&cvp->cv_event));
+ return (1);
+ }
+
+ return (0);
+}
+
+void
+__cv_destroy(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+
+ cvp->cv_magic = CV_DESTROY;
+ atomic_dec(&cvp->cv_refs);
+
+ /* Block until all waiters are woken and references dropped. */
+ while (cv_destroy_wakeup(cvp) == 0)
+ wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1);
+
+ ASSERT3P(cvp->cv_mutex, ==, NULL);
+ ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0);
+ ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0);
+ ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0);
+}
+EXPORT_SYMBOL(__cv_destroy);
+
+static void
+cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+ atomic_inc(&cvp->cv_refs);
+
+ m = READ_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+ * The mutex should be dropped after prepare_to_wait(); this
+ * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ if (io)
+ io_schedule();
+ else
+ schedule();
+
+ /* No more waiters; a different mutex could be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ /*
+ * Reacquire the mutex only after we release the cvp; otherwise we
+ * could deadlock with a thread holding the mutex and calling cv_destroy().
+ */
+ mutex_enter(mp);
+}
+
+void
+__cv_wait(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL(__cv_wait);
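+
+/*
+ * Minimal usage sketch (illustrative only; struct cv_example and its
+ * members are hypothetical names, assumed initialized elsewhere with
+ * mutex_init()/cv_init()).  cv_wait() drops and re-takes the caller's
+ * mutex, so the predicate must be re-checked in a loop.
+ */
+struct cv_example {
+ kmutex_t ce_lock;
+ kcondvar_t ce_cv;
+ boolean_t ce_ready;
+};
+
+static void
+cv_example_wait_ready(struct cv_example *ce)
+{
+ mutex_enter(&ce->ce_lock);
+ while (!ce->ce_ready)
+ cv_wait(&ce->ce_cv, &ce->ce_lock);
+ mutex_exit(&ce->ce_lock);
+}
+
+static void
+cv_example_mark_ready(struct cv_example *ce)
+{
+ mutex_enter(&ce->ce_lock);
+ ce->ce_ready = B_TRUE;
+ cv_signal(&ce->ce_cv); /* wakes a single exclusive waiter */
+ mutex_exit(&ce->ce_lock);
+}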
+
+void
+__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1);
+}
+EXPORT_SYMBOL(__cv_wait_io);
+
+int
+__cv_wait_io_sig(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 1);
+
+ return (signal_pending(current) ? 0 : 1);
+}
+EXPORT_SYMBOL(__cv_wait_io_sig);
+
+int
+__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);
+
+ return (signal_pending(current) ? 0 : 1);
+}
+EXPORT_SYMBOL(__cv_wait_sig);
+
+void
+__cv_wait_idle(kcondvar_t *cvp, kmutex_t *mp)
+{
+ sigset_t blocked, saved;
+
+ sigfillset(&blocked);
+ (void) sigprocmask(SIG_BLOCK, &blocked, &saved);
+ cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);
+ (void) sigprocmask(SIG_SETMASK, &saved, NULL);
+}
+EXPORT_SYMBOL(__cv_wait_idle);
+
+#if defined(HAVE_IO_SCHEDULE_TIMEOUT)
+#define spl_io_schedule_timeout(t) io_schedule_timeout(t)
+#else
+
+struct spl_task_timer {
+ struct timer_list timer;
+ struct task_struct *task;
+};
+
+static void
+__cv_wakeup(spl_timer_list_t t)
+{
+ struct timer_list *tmr = (struct timer_list *)t;
+ struct spl_task_timer *task_timer = from_timer(task_timer, tmr, timer);
+
+ wake_up_process(task_timer->task);
+}
+
+static long
+spl_io_schedule_timeout(long time_left)
+{
+ long expire_time = jiffies + time_left;
+ struct spl_task_timer task_timer;
+ struct timer_list *timer = &task_timer.timer;
+
+ task_timer.task = current;
+
+ timer_setup(timer, __cv_wakeup, 0);
+
+ timer->expires = expire_time;
+ add_timer(timer);
+
+ io_schedule();
+
+ del_timer_sync(timer);
+
+ time_left = expire_time - jiffies;
+
+ return (time_left < 0 ? 0 : time_left);
+}
+#endif
+
+/*
+ * 'expire_time' argument is an absolute wall clock time in jiffies.
+ * Return value is 1 if woken before the timeout expired, or -1 if the timeout occurred.
+ */
+static clock_t
+__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time,
+ int state, int io)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+ clock_t time_left;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+
+ /* XXX - Does not handle jiffie wrap properly */
+ time_left = expire_time - jiffies;
+ if (time_left <= 0)
+ return (-1);
+
+ atomic_inc(&cvp->cv_refs);
+ m = READ_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+ * The mutex should be dropped after prepare_to_wait(); this
+ * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ if (io)
+ time_left = spl_io_schedule_timeout(time_left);
+ else
+ time_left = schedule_timeout(time_left);
+
+ /* No more waiters; a different mutex could be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ /*
+ * Reacquire the mutex only after we release the cvp; otherwise we
+ * could deadlock with a thread holding the mutex and calling cv_destroy().
+ */
+ mutex_enter(mp);
+ return (time_left > 0 ? 1 : -1);
+}
+
+int
+__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_UNINTERRUPTIBLE, 0));
+}
+EXPORT_SYMBOL(__cv_timedwait);
+
+int
+__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_UNINTERRUPTIBLE, 1));
+}
+EXPORT_SYMBOL(__cv_timedwait_io);
+
+int
+__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ int rc;
+
+ rc = __cv_timedwait_common(cvp, mp, exp_time, TASK_INTERRUPTIBLE, 0);
+ return (signal_pending(current) ? 0 : rc);
+}
+EXPORT_SYMBOL(__cv_timedwait_sig);
+
+int
+__cv_timedwait_idle(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ sigset_t blocked, saved;
+ int rc;
+
+ sigfillset(&blocked);
+ (void) sigprocmask(SIG_BLOCK, &blocked, &saved);
+ rc = __cv_timedwait_common(cvp, mp, exp_time,
+ TASK_INTERRUPTIBLE, 0);
+ (void) sigprocmask(SIG_SETMASK, &saved, NULL);
+
+ return (rc);
+}
+EXPORT_SYMBOL(__cv_timedwait_idle);
+/*
+ * 'expire_time' argument is an absolute clock time in nanoseconds.
+ * Return value is 1 if woken before the deadline, or -1 if the timeout occurred.
+ */
+static clock_t
+__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
+ hrtime_t res, int state)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+ hrtime_t time_left;
+ ktime_t ktime_left;
+ u64 slack = 0;
+ int rc;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+
+ time_left = expire_time - gethrtime();
+ if (time_left <= 0)
+ return (-1);
+
+ atomic_inc(&cvp->cv_refs);
+ m = READ_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+ * The mutex should be dropped after prepare_to_wait(); this
+ * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+
+ ktime_left = ktime_set(0, time_left);
+ slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC),
+ MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC);
+ rc = schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL);
+
+ /* No more waiters; a different mutex could be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ mutex_enter(mp);
+ return (rc == -EINTR ? 1 : -1);
+}
+
+/*
+ * Compatibility wrapper for the cv_timedwait_hires() Illumos interface.
+ */
+static int
+cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag, int state)
+{
+ if (!(flag & CALLOUT_FLAG_ABSOLUTE))
+ tim += gethrtime();
+
+ return (__cv_timedwait_hires(cvp, mp, tim, res, state));
+}
+
+int
+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+ int flag)
+{
+ return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_UNINTERRUPTIBLE));
+}
+EXPORT_SYMBOL(cv_timedwait_hires);
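+
+/*
+ * Usage sketch (illustrative only): a bounded wait built on the hires
+ * variant, reusing the hypothetical struct cv_example from the sketch
+ * above.  CALLOUT_FLAG_ABSOLUTE makes 'tim' an absolute gethrtime()
+ * deadline; passing 0 would make it relative instead.
+ */
+static boolean_t
+cv_example_wait_ready_100ms(struct cv_example *ce)
+{
+ hrtime_t deadline = gethrtime() + MSEC2NSEC(100);
+ boolean_t ready;
+ int rc = 1;
+
+ mutex_enter(&ce->ce_lock);
+ while (!ce->ce_ready && rc > 0) {
+ /* returns -1 on timeout, >= 1 when signalled */
+ rc = cv_timedwait_hires(&ce->ce_cv, &ce->ce_lock,
+ deadline, MSEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE);
+ }
+ ready = ce->ce_ready;
+ mutex_exit(&ce->ce_lock);
+
+ return (ready);
+}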
+
+int
+cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag)
+{
+ int rc;
+
+ rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_INTERRUPTIBLE);
+ return (signal_pending(current) ? 0 : rc);
+}
+EXPORT_SYMBOL(cv_timedwait_sig_hires);
+
+int
+cv_timedwait_idle_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag)
+{
+ sigset_t blocked, saved;
+ int rc;
+
+ sigfillset(&blocked);
+ (void) sigprocmask(SIG_BLOCK, &blocked, &saved);
+ rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_INTERRUPTIBLE);
+ (void) sigprocmask(SIG_SETMASK, &saved, NULL);
+
+ return (rc);
+}
+EXPORT_SYMBOL(cv_timedwait_idle_hires);
+
+void
+__cv_signal(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ atomic_inc(&cvp->cv_refs);
+
+ /*
+ * All waiters are added with WQ_FLAG_EXCLUSIVE so only one
+ * waiter will be set runnable with each call to wake_up().
+ * Additionally wake_up() holds a spin_lock associated with
+ * the wait queue to ensure we don't race waking up processes.
+ */
+ if (atomic_read(&cvp->cv_waiters) > 0)
+ wake_up(&cvp->cv_event);
+
+ atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_signal);
+
+void
+__cv_broadcast(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ atomic_inc(&cvp->cv_refs);
+
+ /*
+ * wake_up_all() will wake up all waiters, even those which
+ * have the WQ_FLAG_EXCLUSIVE flag set.
+ */
+ if (atomic_read(&cvp->cv_waiters) > 0)
+ wake_up_all(&cvp->cv_event);
+
+ atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_broadcast);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c b/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c
new file mode 100644
index 000000000000..8fe1cc30ba99
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Credential Implementation.
+ */
+
+#include <sys/cred.h>
+
+static int
+cr_groups_search(const struct group_info *group_info, kgid_t grp)
+{
+ unsigned int left, right, mid;
+ int cmp;
+
+ if (!group_info)
+ return (0);
+
+ left = 0;
+ right = group_info->ngroups;
+ while (left < right) {
+ mid = (left + right) / 2;
+ cmp = KGID_TO_SGID(grp) -
+ KGID_TO_SGID(GROUP_AT(group_info, mid));
+
+ if (cmp > 0)
+ left = mid + 1;
+ else if (cmp < 0)
+ right = mid;
+ else
+ return (1);
+ }
+ return (0);
+}
+
+/* Hold a reference on the credential */
+void
+crhold(cred_t *cr)
+{
+ (void) get_cred((const cred_t *)cr);
+}
+
+/* Free a reference on the credential */
+void
+crfree(cred_t *cr)
+{
+ put_cred((const cred_t *)cr);
+}
+
+/* Return the number of supplemental groups */
+int
+crgetngroups(const cred_t *cr)
+{
+ struct group_info *gi;
+ int rc;
+
+ gi = cr->group_info;
+ rc = gi->ngroups;
+#ifndef HAVE_GROUP_INFO_GID
+ /*
+ * For Linux <= 4.8, crgetgroups() will only return gi->blocks[0],
+ * which contains only
+ * the first NGROUPS_PER_BLOCK groups.
+ */
+ if (rc > NGROUPS_PER_BLOCK) {
+ WARN_ON_ONCE(1);
+ rc = NGROUPS_PER_BLOCK;
+ }
+#endif
+ return (rc);
+}
+
+/*
+ * Return an array of supplemental gids. The returned address is safe
+ * to use as long as the caller has taken a reference with crhold().
+ *
+ * Linux 4.9 API change, group_info changed from 2d array via ->blocks to 1d
+ * array via ->gid.
+ */
+gid_t *
+crgetgroups(const cred_t *cr)
+{
+ struct group_info *gi;
+ gid_t *gids = NULL;
+
+ gi = cr->group_info;
+#ifdef HAVE_GROUP_INFO_GID
+ gids = KGIDP_TO_SGIDP(gi->gid);
+#else
+ if (gi->nblocks > 0)
+ gids = KGIDP_TO_SGIDP(gi->blocks[0]);
+#endif
+ return (gids);
+}
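+
+/*
+ * Usage sketch (illustrative only, hypothetical helper): walking the
+ * supplemental groups of a credential.  The crhold()/crfree() pair keeps
+ * the array returned by crgetgroups() valid for the duration of the walk;
+ * groupmember() below is the preferred way to test membership, this
+ * simply demonstrates the accessors.
+ */
+static boolean_t
+cred_example_has_gid(cred_t *cr, gid_t gid)
+{
+ gid_t *groups;
+ boolean_t found = B_FALSE;
+ int i, n;
+
+ crhold(cr);
+ n = crgetngroups(cr);
+ groups = crgetgroups(cr);
+ for (i = 0; i < n; i++) {
+ if (groups[i] == gid) {
+ found = B_TRUE;
+ break;
+ }
+ }
+ crfree(cr);
+
+ return (found);
+}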
+
+/* Check if the passed gid is available in supplied credential. */
+int
+groupmember(gid_t gid, const cred_t *cr)
+{
+ struct group_info *gi;
+ int rc;
+
+ gi = cr->group_info;
+ rc = cr_groups_search(gi, SGID_TO_KGID(gid));
+
+ return (rc);
+}
+
+/* Return the effective user id */
+uid_t
+crgetuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->euid));
+}
+
+/* Return the real user id */
+uid_t
+crgetruid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->uid));
+}
+
+/* Return the saved user id */
+uid_t
+crgetsuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->suid));
+}
+
+/* Return the filesystem user id */
+uid_t
+crgetfsuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->fsuid));
+}
+
+/* Return the effective group id */
+gid_t
+crgetgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->egid));
+}
+
+/* Return the real group id */
+gid_t
+crgetrgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->gid));
+}
+
+/* Return the saved group id */
+gid_t
+crgetsgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->sgid));
+}
+
+/* Return the filesystem group id */
+gid_t
+crgetfsgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->fsgid));
+}
+
+EXPORT_SYMBOL(crhold);
+EXPORT_SYMBOL(crfree);
+EXPORT_SYMBOL(crgetuid);
+EXPORT_SYMBOL(crgetruid);
+EXPORT_SYMBOL(crgetsuid);
+EXPORT_SYMBOL(crgetfsuid);
+EXPORT_SYMBOL(crgetgid);
+EXPORT_SYMBOL(crgetrgid);
+EXPORT_SYMBOL(crgetsgid);
+EXPORT_SYMBOL(crgetfsgid);
+EXPORT_SYMBOL(crgetngroups);
+EXPORT_SYMBOL(crgetgroups);
+EXPORT_SYMBOL(groupmember);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-err.c b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c
new file mode 100644
index 000000000000..10b768d57360
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Error Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+
+/*
+ * It is often useful to actually have the panic crash the node so you
+ * can then get notified of the event, get the crashdump for later
+ * analysis and other such goodies.
+ * But we still default to not doing that.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_panic_halt;
+module_param(spl_panic_halt, uint, 0644);
+MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures");
+/* END CSTYLED */
+
+void
+spl_dumpstack(void)
+{
+ printk("Showing stack for process %d\n", current->pid);
+ dump_stack();
+}
+EXPORT_SYMBOL(spl_dumpstack);
+
+int
+spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ const char *newfile;
+ char msg[MAXMSGLEN];
+ va_list ap;
+
+ newfile = strrchr(file, '/');
+ if (newfile != NULL)
+ newfile = newfile + 1;
+ else
+ newfile = file;
+
+ va_start(ap, fmt);
+ (void) vsnprintf(msg, sizeof (msg), fmt, ap);
+ va_end(ap);
+
+ printk(KERN_EMERG "%s", msg);
+ printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func);
+ if (spl_panic_halt)
+ panic("%s", msg);
+
+ spl_dumpstack();
+
+ /* Halt the thread to facilitate further debugging */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (1)
+ schedule();
+
+ /* Unreachable */
+ return (1);
+}
+EXPORT_SYMBOL(spl_panic);
+
+void
+vcmn_err(int ce, const char *fmt, va_list ap)
+{
+ char msg[MAXMSGLEN];
+
+ vsnprintf(msg, MAXMSGLEN, fmt, ap);
+
+ switch (ce) {
+ case CE_IGNORE:
+ break;
+ case CE_CONT:
+ printk("%s", msg);
+ break;
+ case CE_NOTE:
+ printk(KERN_NOTICE "NOTICE: %s\n", msg);
+ break;
+ case CE_WARN:
+ printk(KERN_WARNING "WARNING: %s\n", msg);
+ break;
+ case CE_PANIC:
+ printk(KERN_EMERG "PANIC: %s\n", msg);
+ spl_dumpstack();
+
+ /* Halt the thread to facilitate further debugging */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (1)
+ schedule();
+ }
+} /* vcmn_err() */
+EXPORT_SYMBOL(vcmn_err);
+
+void
+cmn_err(int ce, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vcmn_err(ce, fmt, ap);
+ va_end(ap);
+} /* cmn_err() */
+EXPORT_SYMBOL(cmn_err);
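+
+/*
+ * Usage sketch (illustrative only, hypothetical helper): the severity
+ * levels map onto printk() levels as implemented above, and a trailing
+ * newline is appended automatically for CE_NOTE/CE_WARN/CE_PANIC.
+ */
+static void
+cmn_err_example(int error)
+{
+ if (error != 0)
+ cmn_err(CE_WARN, "operation failed with error %d", error);
+ else
+ cmn_err(CE_NOTE, "operation completed");
+}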
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
new file mode 100644
index 000000000000..36fdff72a133
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
@@ -0,0 +1,841 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Generic Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/systeminfo.h>
+#include <sys/vmsystm.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/taskq.h>
+#include <sys/tsd.h>
+#include <sys/zmod.h>
+#include <sys/debug.h>
+#include <sys/proc.h>
+#include <sys/kstat.h>
+#include <sys/file.h>
+#include <sys/sunddi.h>
+#include <linux/ctype.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/strings.h>
+#include <linux/kmod.h>
+#include "zfs_gitrev.h"
+#include <linux/mod_compat.h>
+#include <sys/cred.h>
+#include <sys/vnode.h>
+
+char spl_gitrev[64] = ZFS_META_GITREV;
+
+/* BEGIN CSTYLED */
+unsigned long spl_hostid = 0;
+EXPORT_SYMBOL(spl_hostid);
+module_param(spl_hostid, ulong, 0644);
+MODULE_PARM_DESC(spl_hostid, "The system hostid.");
+/* END CSTYLED */
+
+proc_t p0;
+EXPORT_SYMBOL(p0);
+
+/*
+ * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna
+ *
+ * "Further scramblings of Marsaglia's xorshift generators"
+ * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
+ *
+ * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose
+ * is to provide bytes containing random numbers. It is mapped to /dev/urandom
+ * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's
+ * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so
+ * we can implement it using a fast PRNG that we seed using Linux' actual
+ * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU
+ * with an independent seed so that all calls to random_get_pseudo_bytes() are
+ * free of atomic instructions.
+ *
+ * A consequence of using a fast PRNG is that using random_get_pseudo_bytes()
+ * to generate words larger than 128 bits will paradoxically be limited to
+ * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1`
+ * 128-bit words and selecting the first will implicitly select the second. If
+ * a caller finds this behavior undesirable, random_get_bytes() should be used
+ * instead.
+ *
+ * XXX: Linux interrupt handlers that trigger within the critical section
+ * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will
+ * see the same numbers. Nothing in the code currently calls this in an
+ * interrupt handler, so this is considered to be okay. If that becomes a
+ * problem, we could create a set of per-cpu variables for interrupt handlers
+ * and use them when in_interrupt() from linux/preempt_mask.h evaluates to
+ * true.
+ */
+void __percpu *spl_pseudo_entropy;
+
+/*
+ * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed
+ * file:
+ *
+ * http://xorshift.di.unimi.it/xorshift128plus.c
+ */
+
+static inline uint64_t
+spl_rand_next(uint64_t *s)
+{
+ uint64_t s1 = s[0];
+ const uint64_t s0 = s[1];
+ s[0] = s0;
+ s1 ^= s1 << 23; // a
+ s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
+ return (s[1] + s0);
+}
+
+static inline void
+spl_rand_jump(uint64_t *s)
+{
+ static const uint64_t JUMP[] =
+ { 0x8a5cd789635d2dff, 0x121fd2155c472f96 };
+
+ uint64_t s0 = 0;
+ uint64_t s1 = 0;
+ int i, b;
+ for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++)
+ for (b = 0; b < 64; b++) {
+ if (JUMP[i] & 1ULL << b) {
+ s0 ^= s[0];
+ s1 ^= s[1];
+ }
+ (void) spl_rand_next(s);
+ }
+
+ s[0] = s0;
+ s[1] = s1;
+}
+
+int
+random_get_pseudo_bytes(uint8_t *ptr, size_t len)
+{
+ uint64_t *xp, s[2];
+
+ ASSERT(ptr);
+
+ xp = get_cpu_ptr(spl_pseudo_entropy);
+
+ s[0] = xp[0];
+ s[1] = xp[1];
+
+ while (len) {
+ union {
+ uint64_t ui64;
+ uint8_t byte[sizeof (uint64_t)];
+ } entropy;
+ int i = MIN(len, sizeof (uint64_t));
+
+ len -= i;
+ entropy.ui64 = spl_rand_next(s);
+
+ while (i--)
+ *ptr++ = entropy.byte[i];
+ }
+
+ xp[0] = s[0];
+ xp[1] = s[1];
+
+ put_cpu_ptr(spl_pseudo_entropy);
+
+ return (0);
+}
+
+EXPORT_SYMBOL(random_get_pseudo_bytes);
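+
+/*
+ * Usage sketch (illustrative only, hypothetical helper): fill a buffer
+ * with fast, non-cryptographic pseudo-random bytes.  Callers that need
+ * cryptographic quality must use random_get_bytes() instead, as noted
+ * above.
+ */
+static void
+random_example_fill(uint8_t *buf, size_t len)
+{
+ VERIFY0(random_get_pseudo_bytes(buf, len));
+}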
+
+#if BITS_PER_LONG == 32
+
+/*
+ * Support 64/64 => 64 division on a 32-bit platform. While the kernel
+ * provides a div64_u64() function for this we do not use it because the
+ * implementation is flawed. There are cases which return incorrect
+ * results as late as linux-2.6.35. Until this is fixed upstream the
+ * spl must provide its own implementation.
+ *
+ * This implementation is a slightly modified version of the algorithm
+ * proposed by the book 'Hacker's Delight'. The original source can be
+ * found here and is available for use without restriction.
+ *
+ * http://www.hackersdelight.org/HDcode/newCode/divDouble.c
+ */
+
+/*
+ * Calculate the number of leading zeros for a 64-bit value.
+ */
+static int
+nlz64(uint64_t x)
+{
+ register int n = 0;
+
+ if (x == 0)
+ return (64);
+
+ if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; }
+ if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; }
+ if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; }
+ if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; }
+ if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; }
+ if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; }
+
+ return (n);
+}
+
+/*
+ * Newer kernels have a div_u64() function but we define our own
+ * to simplify portability between kernel versions.
+ */
+static inline uint64_t
+__div_u64(uint64_t u, uint32_t v)
+{
+ (void) do_div(u, v);
+ return (u);
+}
+
+/*
+ * Turn off missing prototypes warning for these functions. They are
+ * replacements for libgcc-provided functions and will never be called
+ * directly.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmissing-prototypes"
+
+/*
+ * Implementation of 64-bit unsigned division for 32-bit machines.
+ *
+ * First the procedure takes care of the case in which the divisor is a
+ * 32-bit quantity. There are two subcases: (1) If the left half of the
+ * dividend is less than the divisor, one execution of do_div() is all that
+ * is required (overflow is not possible). (2) Otherwise it does two
+ * divisions, using the grade school method.
+ */
+uint64_t
+__udivdi3(uint64_t u, uint64_t v)
+{
+ uint64_t u0, u1, v1, q0, q1, k;
+ int n;
+
+ if (v >> 32 == 0) { // If v < 2**32:
+ if (u >> 32 < v) { // If u/v cannot overflow,
+ return (__div_u64(u, v)); // just do one division.
+ } else { // If u/v would overflow:
+ u1 = u >> 32; // Break u into two halves.
+ u0 = u & 0xFFFFFFFF;
+ q1 = __div_u64(u1, v); // First quotient digit.
+ k = u1 - q1 * v; // First remainder, < v.
+ u0 += (k << 32);
+ q0 = __div_u64(u0, v); // Second quotient digit.
+ return ((q1 << 32) + q0);
+ }
+ } else { // If v >= 2**32:
+ n = nlz64(v); // 0 <= n <= 31.
+ v1 = (v << n) >> 32; // Normalize divisor, MSB is 1.
+ u1 = u >> 1; // To ensure no overflow.
+ q1 = __div_u64(u1, v1); // Get quotient from
+ q0 = (q1 << n) >> 31; // Undo normalization and
+ // division of u by 2.
+ if (q0 != 0) // Make q0 correct or
+ q0 = q0 - 1; // too small by 1.
+ if ((u - q0 * v) >= v)
+ q0 = q0 + 1; // Now q0 is correct.
+
+ return (q0);
+ }
+}
+EXPORT_SYMBOL(__udivdi3);
+
+/* BEGIN CSTYLED */
+#ifndef abs64
+#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; })
+#endif
+/* END CSTYLED */
+
+/*
+ * Implementation of 64-bit signed division for 32-bit machines.
+ */
+int64_t
+__divdi3(int64_t u, int64_t v)
+{
+ int64_t q, t;
+ q = __udivdi3(abs64(u), abs64(v));
+ t = (u ^ v) >> 63; // If u, v have different
+ return ((q ^ t) - t); // signs, negate q.
+}
+EXPORT_SYMBOL(__divdi3);
+
+/*
+ * Implementation of 64-bit unsigned modulo for 32-bit machines.
+ */
+uint64_t
+__umoddi3(uint64_t dividend, uint64_t divisor)
+{
+ return (dividend - (divisor * __udivdi3(dividend, divisor)));
+}
+EXPORT_SYMBOL(__umoddi3);
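+
+/*
+ * Sanity sketch (illustrative only, hypothetical helper): on 32-bit
+ * builds the compiler emits calls to these helpers for 64-bit '/' and
+ * '%'; the quotient and remainder always satisfy n == q * d + r.
+ */
+static void
+udivmod_example_check(uint64_t n, uint64_t d)
+{
+ uint64_t q = __udivdi3(n, d);
+ uint64_t r = __umoddi3(n, d);
+
+ ASSERT3U(n, ==, q * d + r);
+}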
+
+/* 64-bit signed modulo for 32-bit machines. */
+int64_t
+__moddi3(int64_t n, int64_t d)
+{
+ int64_t q;
+ boolean_t nn = B_FALSE;
+
+ if (n < 0) {
+ nn = B_TRUE;
+ n = -n;
+ }
+ if (d < 0)
+ d = -d;
+
+ q = __umoddi3(n, d);
+
+ return (nn ? -q : q);
+}
+EXPORT_SYMBOL(__moddi3);
+
+/*
+ * Implementation of 64-bit unsigned division/modulo for 32-bit machines.
+ */
+uint64_t
+__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r)
+{
+ uint64_t q = __udivdi3(n, d);
+ if (r)
+ *r = n - d * q;
+ return (q);
+}
+EXPORT_SYMBOL(__udivmoddi4);
+
+/*
+ * Implementation of 64-bit signed division/modulo for 32-bit machines.
+ */
+int64_t
+__divmoddi4(int64_t n, int64_t d, int64_t *r)
+{
+ int64_t q, rr;
+ boolean_t nn = B_FALSE;
+ boolean_t nd = B_FALSE;
+ if (n < 0) {
+ nn = B_TRUE;
+ n = -n;
+ }
+ if (d < 0) {
+ nd = B_TRUE;
+ d = -d;
+ }
+
+ q = __udivmoddi4(n, d, (uint64_t *)&rr);
+
+ if (nn != nd)
+ q = -q;
+ if (nn)
+ rr = -rr;
+ if (r)
+ *r = rr;
+ return (q);
+}
+EXPORT_SYMBOL(__divmoddi4);
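+
+/*
+ * Like native C99 division, the signed helpers above truncate toward zero
+ * and give the remainder the sign of the dividend. For example,
+ * __divmoddi4(-7, 3, &r) returns -2 with r == -1, matching -7 / 3 and
+ * -7 % 3 on a 64-bit build.
+ */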
+
+#if defined(__arm) || defined(__arm__)
+/*
+ * Implementation of 64-bit (un)signed division for 32-bit arm machines.
+ *
+ * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned)
+ * long longs is returned in {{r0, r1}, {r2, r3}}, the quotient in {r0, r1},
+ * and the remainder in {r2, r3}. The return type is specifically left
+ * set to 'void' to ensure the compiler does not overwrite these registers
+ * during the return. All results are in registers as per the ABI.
+ */
+void
+__aeabi_uldivmod(uint64_t u, uint64_t v)
+{
+ uint64_t res;
+ uint64_t mod;
+
+ res = __udivdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ /* BEGIN CSTYLED */
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+ /* END CSTYLED */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_uldivmod);
+
+void
+__aeabi_ldivmod(int64_t u, int64_t v)
+{
+ int64_t res;
+ uint64_t mod;
+
+ res = __divdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ /* BEGIN CSTYLED */
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+ /* END CSTYLED */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_ldivmod);
+#endif /* __arm || __arm__ */
+
+#pragma GCC diagnostic pop
+
+#endif /* BITS_PER_LONG */
+
+/*
+ * NOTE: The strtoxx behavior is solely based on my reading of the Solaris
+ * ddi_strtol(9F) man page. I have not verified the behavior of these
+ * functions against their Solaris counterparts. It is possible that I
+ * misinterpreted the man page or that the man page is incorrect.
+ */
+int ddi_strtoul(const char *, char **, int, unsigned long *);
+int ddi_strtol(const char *, char **, int, long *);
+int ddi_strtoull(const char *, char **, int, unsigned long long *);
+int ddi_strtoll(const char *, char **, int, long long *);
+
+#define define_ddi_strtoux(type, valtype) \
+int ddi_strtou##type(const char *str, char **endptr, \
+ int base, valtype *result) \
+{ \
+ valtype last_value, value = 0; \
+ char *ptr = (char *)str; \
+ int flag = 1, digit; \
+ \
+ if (strlen(ptr) == 0) \
+ return (EINVAL); \
+ \
+ /* Auto-detect base based on prefix */ \
+ if (!base) { \
+ if (str[0] == '0') { \
+ if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \
+ base = 16; /* hex */ \
+ ptr += 2; \
+ } else if (str[1] >= '0' && str[1] <= '7') { \
+ base = 8; /* octal */ \
+ ptr += 1; \
+ } else { \
+ return (EINVAL); \
+ } \
+ } else { \
+ base = 10; /* decimal */ \
+ } \
+ } \
+ \
+ while (1) { \
+ if (isdigit(*ptr)) \
+ digit = *ptr - '0'; \
+ else if (isalpha(*ptr)) \
+ digit = tolower(*ptr) - 'a' + 10; \
+ else \
+ break; \
+ \
+ if (digit >= base) \
+ break; \
+ \
+ last_value = value; \
+ value = value * base + digit; \
+ if (last_value > value) /* Overflow */ \
+ return (ERANGE); \
+ \
+ flag = 1; \
+ ptr++; \
+ } \
+ \
+ if (flag) \
+ *result = value; \
+ \
+ if (endptr) \
+ *endptr = (char *)(flag ? ptr : str); \
+ \
+ return (0); \
+} \
+
+#define define_ddi_strtox(type, valtype) \
+int ddi_strto##type(const char *str, char **endptr, \
+ int base, valtype *result) \
+{ \
+ int rc; \
+ \
+ if (*str == '-') { \
+ rc = ddi_strtou##type(str + 1, endptr, base, result); \
+ if (!rc) { \
+ if (*endptr == str + 1) \
+ *endptr = (char *)str; \
+ else \
+ *result = -*result; \
+ } \
+ } else { \
+ rc = ddi_strtou##type(str, endptr, base, result); \
+ } \
+ \
+ return (rc); \
+}
+
+define_ddi_strtoux(l, unsigned long)
+define_ddi_strtox(l, long)
+define_ddi_strtoux(ll, unsigned long long)
+define_ddi_strtox(ll, long long)
+
+EXPORT_SYMBOL(ddi_strtoul);
+EXPORT_SYMBOL(ddi_strtol);
+EXPORT_SYMBOL(ddi_strtoll);
+EXPORT_SYMBOL(ddi_strtoull);
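+
+/*
+ * Hypothetical usage sketch (identifiers are illustrative only):
+ *
+ * unsigned long val;
+ * char *end;
+ * int err = ddi_strtoul("0x1f", &end, 0, &val);
+ *
+ * On success err == 0, val == 31, and end points past the final digit.
+ * A base of 0 auto-detects hexadecimal ("0x"), octal ("0"), or decimal
+ * input as implemented above.
+ */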
+
+int
+ddi_copyin(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'from' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyin(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyin);
+
+int
+ddi_copyout(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'to' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyout(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyout);
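+
+/*
+ * Hypothetical usage note: a caller that fabricates an ioctl entirely in
+ * kernel context passes FKIOCTL so the transfers above degenerate to a
+ * plain memcpy(); a regular ioctl issued from user space passes 0 in
+ * 'flags' and goes through copyin()/copyout() with full access checking.
+ */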
+
+static ssize_t
+spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
+{
+#if defined(HAVE_KERNEL_READ_PPOS)
+ return (kernel_read(file, buf, count, pos));
+#else
+ mm_segment_t saved_fs;
+ ssize_t ret;
+
+ saved_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+ ret = vfs_read(file, (void __user *)buf, count, pos);
+
+ set_fs(saved_fs);
+
+ return (ret);
+#endif
+}
+
+static int
+spl_getattr(struct file *filp, struct kstat *stat)
+{
+ int rc;
+
+ ASSERT(filp);
+ ASSERT(stat);
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&filp->f_path, stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&filp->f_path, stat);
+#else
+ rc = vfs_getattr(filp->f_path.mnt, filp->f_dentry, stat);
+#endif
+ if (rc)
+ return (-rc);
+
+ return (0);
+}
+
+/*
+ * Read the unique system identifier from the /etc/hostid file.
+ *
+ * The behavior of /usr/bin/hostid on Linux systems with the
+ * regular eglibc and coreutils is:
+ *
+ * 1. Generate the value if the /etc/hostid file does not exist
+ * or if the /etc/hostid file is less than four bytes in size.
+ *
+ * 2. If the /etc/hostid file is at least 4 bytes, then return
+ * the first four bytes [0..3] in native endian order.
+ *
+ * 3. Always ignore bytes [4..] if they exist in the file.
+ *
+ * Only the first four bytes are significant, even on systems that
+ * have a 64-bit word size.
+ *
+ * See:
+ *
+ * eglibc: sysdeps/unix/sysv/linux/gethostid.c
+ * coreutils: src/hostid.c
+ *
+ * Notes:
+ *
+ * The /etc/hostid file on Solaris is a text file that often reads:
+ *
+ * # DO NOT EDIT
+ * "0123456789"
+ *
+ * Directly copying this file to Linux results in a constant
+ * hostid of 4f442023 because the default comment constitutes
+ * the first four bytes of the file.
+ *
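+ * For example, a binary /etc/hostid whose first four bytes are
+ * 0x12 0x34 0x56 0x78 yields a hostid of 0x78563412 on a little-endian
+ * system and 0x12345678 on a big-endian system, because the bytes are
+ * interpreted in native endian order.
+ *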
+ */
+
+char *spl_hostid_path = HW_HOSTID_PATH;
+module_param(spl_hostid_path, charp, 0444);
+MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)");
+
+static int
+hostid_read(uint32_t *hostid)
+{
+ uint64_t size;
+ uint32_t value = 0;
+ int error;
+ loff_t off;
+ struct file *filp;
+ struct kstat stat;
+
+ filp = filp_open(spl_hostid_path, 0, 0);
+
+ if (IS_ERR(filp))
+ return (ENOENT);
+
+ error = spl_getattr(filp, &stat);
+ if (error) {
+ filp_close(filp, 0);
+ return (error);
+ }
+ size = stat.size;
+ if (size < sizeof (HW_HOSTID_MASK)) {
+ filp_close(filp, 0);
+ return (EINVAL);
+ }
+
+ off = 0;
+ /*
+ * Read directly into the variable like eglibc does.
+ * Short reads are okay; native behavior is preserved.
+ */
+ error = spl_kernel_read(filp, &value, sizeof (value), &off);
+ if (error < 0) {
+ filp_close(filp, 0);
+ return (EIO);
+ }
+
+ /* Mask down to 32 bits like coreutils does. */
+ *hostid = (value & HW_HOSTID_MASK);
+ filp_close(filp, 0);
+
+ return (0);
+}
+
+/*
+ * Return the system hostid. Preferentially use the spl_hostid module option
+ * when set, otherwise use the value in the /etc/hostid file.
+ */
+uint32_t
+zone_get_hostid(void *zone)
+{
+ uint32_t hostid;
+
+ ASSERT3P(zone, ==, NULL);
+
+ if (spl_hostid != 0)
+ return ((uint32_t)(spl_hostid & HW_HOSTID_MASK));
+
+ if (hostid_read(&hostid) == 0)
+ return (hostid);
+
+ return (0);
+}
+EXPORT_SYMBOL(zone_get_hostid);
+
+static int
+spl_kvmem_init(void)
+{
+ int rc = 0;
+
+ rc = spl_kmem_init();
+ if (rc)
+ return (rc);
+
+ rc = spl_vmem_init();
+ if (rc) {
+ spl_kmem_fini();
+ return (rc);
+ }
+
+ return (rc);
+}
+
+/*
+ * We initialize the random number generator with 128 bits of entropy from the
+ * system random number generator. In the improbable case that the seed is
+ * zero, we fall back to the system jiffies; if that is also zero, we use a
+ * preprogrammed seed. We step forward by 2^64 iterations to initialize each
+ * of the per-cpu seeds so that the sequences generated on each CPU are
+ * guaranteed to never overlap in practice.
+ */
+static void __init
+spl_random_init(void)
+{
+ uint64_t s[2];
+ int i = 0;
+
+ spl_pseudo_entropy = __alloc_percpu(2 * sizeof (uint64_t),
+ sizeof (uint64_t));
+
+ get_random_bytes(s, sizeof (s));
+
+ if (s[0] == 0 && s[1] == 0) {
+ if (jiffies != 0) {
+ s[0] = jiffies;
+ s[1] = ~0 - jiffies;
+ } else {
+ (void) memcpy(s, "improbable seed", sizeof (s));
+ }
+ printk("SPL: get_random_bytes() returned 0 "
+ "when generating random seed. Setting initial seed to "
+ "0x%016llx%016llx.\n", cpu_to_be64(s[0]),
+ cpu_to_be64(s[1]));
+ }
+
+ for_each_possible_cpu(i) {
+ uint64_t *wordp = per_cpu_ptr(spl_pseudo_entropy, i);
+
+ spl_rand_jump(s);
+
+ wordp[0] = s[0];
+ wordp[1] = s[1];
+ }
+}
+
+static void
+spl_random_fini(void)
+{
+ free_percpu(spl_pseudo_entropy);
+}
+
+static void
+spl_kvmem_fini(void)
+{
+ spl_vmem_fini();
+ spl_kmem_fini();
+}
+
+static int __init
+spl_init(void)
+{
+ int rc = 0;
+
+ bzero(&p0, sizeof (proc_t));
+ spl_random_init();
+
+ if ((rc = spl_kvmem_init()))
+ goto out1;
+
+ if ((rc = spl_tsd_init()))
+ goto out2;
+
+ if ((rc = spl_taskq_init()))
+ goto out3;
+
+ if ((rc = spl_kmem_cache_init()))
+ goto out4;
+
+ if ((rc = spl_proc_init()))
+ goto out5;
+
+ if ((rc = spl_kstat_init()))
+ goto out6;
+
+ if ((rc = spl_zlib_init()))
+ goto out7;
+
+ return (rc);
+
+out7:
+ spl_kstat_fini();
+out6:
+ spl_proc_fini();
+out5:
+ spl_kmem_cache_fini();
+out4:
+ spl_taskq_fini();
+out3:
+ spl_tsd_fini();
+out2:
+ spl_kvmem_fini();
+out1:
+ return (rc);
+}
+
+static void __exit
+spl_fini(void)
+{
+ spl_zlib_fini();
+ spl_kstat_fini();
+ spl_proc_fini();
+ spl_kmem_cache_fini();
+ spl_taskq_fini();
+ spl_tsd_fini();
+ spl_kvmem_fini();
+ spl_random_fini();
+}
+
+module_init(spl_init);
+module_exit(spl_fini);
+
+ZFS_MODULE_DESCRIPTION("Solaris Porting Layer");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE("GPL");
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
new file mode 100644
index 000000000000..6b3d559ffc1c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
@@ -0,0 +1,1468 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/percpu_compat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/taskq.h>
+#include <sys/timer.h>
+#include <sys/vmem.h>
+#include <sys/wait.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/prefetch.h>
+
+/*
+ * Within the scope of this file the kmem_cache_* definitions
+ * are removed to allow access to the real Linux slab allocator.
+ */
+#undef kmem_cache_destroy
+#undef kmem_cache_create
+#undef kmem_cache_alloc
+#undef kmem_cache_free
+
+
+/*
+ * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
+ * with smp_mb__{before,after}_atomic() because they were redundant. This is
+ * only used inside our SLAB allocator, so we implement an internal wrapper
+ * here to give us smp_mb__{before,after}_atomic() on older kernels.
+ */
+#ifndef smp_mb__before_atomic
+#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x)
+#endif
+
+#ifndef smp_mb__after_atomic
+#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x)
+#endif
+
+/* BEGIN CSTYLED */
+
+/*
+ * Cache magazines are an optimization designed to minimize the cost of
+ * allocating memory. They do this by keeping a per-cpu cache of recently
+ * freed objects, which can then be reallocated without taking a lock. This
+ * can improve performance on highly contended caches. However, because
+ * objects in magazines will prevent otherwise empty slabs from being
+ * immediately released, this may not be ideal for low memory machines.
+ *
+ * For this reason spl_kmem_cache_magazine_size can be used to set a maximum
+ * magazine size. When this value is set to 0 the magazine size will be
+ * automatically determined based on the object size. Otherwise magazines
+ * will be limited to 2-256 objects per magazine (i.e. per CPU). Magazines
+ * may never be entirely disabled in this implementation.
+ */
+unsigned int spl_kmem_cache_magazine_size = 0;
+module_param(spl_kmem_cache_magazine_size, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
+ "Default magazine size (2-256), set automatically (0)");
+
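+/*
+ * For example (illustrative), a memory constrained system could cap the
+ * per-cpu magazines at 8 objects via /etc/modprobe.d/spl.conf:
+ *
+ *     options spl spl_kmem_cache_magazine_size=8
+ */
+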
+/*
+ * The default behavior is to report the number of objects remaining in the
+ * cache. This allows the Linux VM to repeatedly reclaim objects from the
+ * cache when memory is low to satisfy other memory allocations. Alternatively,
+ * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
+ * is reclaimed. This may increase the likelihood of out of memory events.
+ */
+unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
+module_param(spl_kmem_cache_reclaim, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
+
+unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
+
+unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
+module_param(spl_kmem_cache_max_size, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
+
+/*
+ * For small objects the Linux slab allocator should be used to make the most
+ * efficient use of the memory. However, large objects are not supported by
+ * the Linux slab and therefore the SPL implementation is preferred. A cutoff
+ * of 16K was determined to be optimal for architectures using 4K pages.
+ */
+#if PAGE_SIZE == 4096
+unsigned int spl_kmem_cache_slab_limit = 16384;
+#else
+unsigned int spl_kmem_cache_slab_limit = 0;
+#endif
+module_param(spl_kmem_cache_slab_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
+ "Objects less than N bytes use the Linux slab");
+
+/*
+ * The number of threads available to allocate new slabs for caches. This
+ * should not need to be tuned but it is available for performance analysis.
+ */
+unsigned int spl_kmem_cache_kmem_threads = 4;
+module_param(spl_kmem_cache_kmem_threads, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
+ "Number of spl_kmem_cache threads");
+/* END CSTYLED */
+
+/*
+ * Slab allocation interfaces
+ *
+ * While the Linux slab implementation was inspired by the Solaris
+ * implementation I cannot use it to emulate the Solaris APIs. I
+ * require two features which are not provided by the Linux slab.
+ *
+ * 1) Constructors AND destructors. Recent versions of the Linux
+ * kernel have removed support for destructors. This is a deal
+ * breaker for the SPL which contains particularly expensive
+ * initializers for mutexes, condition variables, etc. We also
+ * require a minimal level of cleanup for these data types, unlike
+ * many Linux data types which do not need to be explicitly destroyed.
+ *
+ * 2) Virtual address space backed slab. Callers of the Solaris slab
+ * expect it to work well for both small and very large allocations.
+ * Because of memory fragmentation the Linux slab which is backed
+ * by kmalloc'ed memory performs very badly when confronted with
+ * large numbers of large allocations. Basing the slab on the
+ * virtual address space removes the need for contiguous pages
+ * and greatly improves performance for large allocations.
+ *
+ * For these reasons, the SPL has its own slab implementation with
+ * the needed features. It is not as highly optimized as either the
+ * Solaris or Linux slabs, but it should get me most of what is
+ * needed until it can be optimized or obsoleted by another approach.
+ *
+ * One serious concern I do have about this method is the relatively
+ * small virtual address space on 32bit arches. This will seriously
+ * constrain the size of the slab caches and their performance.
+ */
+
+struct list_head spl_kmem_cache_list; /* List of caches */
+struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
+taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */
+
+static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
+
+static void *
+kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
+
+ /* Resulting allocated memory will be page aligned */
+ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+ return (ptr);
+}
+
+static void
+kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
+{
+ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+ /*
+ * The Linux direct reclaim path uses this out of band value to
+ * determine if forward progress is being made. Normally this is
+ * incremented by kmem_freepages() which is part of the various
+ * Linux slab implementations. However, since we are using none
+ * of that infrastructure we are responsible for incrementing it.
+ */
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
+
+ vfree(ptr);
+}
+
+/*
+ * Required space for each aligned sks.
+ */
+static inline uint32_t
+spl_sks_size(spl_kmem_cache_t *skc)
+{
+ return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
+ skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * Required space for each aligned object.
+ */
+static inline uint32_t
+spl_obj_size(spl_kmem_cache_t *skc)
+{
+ uint32_t align = skc->skc_obj_align;
+
+ return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
+ P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
+}
+
+uint64_t
+spl_kmem_cache_inuse(kmem_cache_t *cache)
+{
+ return (cache->skc_obj_total);
+}
+EXPORT_SYMBOL(spl_kmem_cache_inuse);
+
+uint64_t
+spl_kmem_cache_entry_size(kmem_cache_t *cache)
+{
+ return (cache->skc_obj_size);
+}
+EXPORT_SYMBOL(spl_kmem_cache_entry_size);
+
+/*
+ * Look up the spl_kmem_obj_t for an object given a pointer to that object.
+ */
+static inline spl_kmem_obj_t *
+spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
+{
+ return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
+ skc->skc_obj_align, uint32_t));
+}
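+
+/*
+ * Worked example (illustrative values): with skc_obj_size = 100 and
+ * skc_obj_align = 32, spl_sko_from_obj() returns obj + 128, i.e. the
+ * spl_kmem_obj_t sits immediately after the object rounded up to its
+ * alignment, and spl_obj_size() adds a further aligned sizeof
+ * (spl_kmem_obj_t) to give the per-object stride within a slab.
+ */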
+
+/*
+ * It's important that we pack the spl_kmem_obj_t structure and the
+ * actual objects into one large address space to minimize the number
+ * of calls to the allocator. It is far better to do a few large
+ * allocations and then subdivide them ourselves. Now which allocator
+ * we use requires balancing a few trade-offs.
+ *
+ * For small objects we use kmem_alloc() because as long as you are
+ * only requesting a small number of pages (ideally just one) it's cheap.
+ * However, when you start requesting multiple pages with kmem_alloc()
+ * it gets increasingly expensive since it requires contiguous pages.
+ * For this reason we shift to vmem_alloc() for slabs of large objects
+ * which removes the need for contiguous pages. We do not use
+ * vmem_alloc() in all cases because there is significant locking
+ * overhead in __get_vm_area_node(). This function takes a single
+ * global lock when acquiring an available virtual address range which
+ * serializes all vmem_alloc()'s for all slab caches. Using slightly
+ * different allocation functions for small and large objects should
+ * give us the best of both worlds.
+ *
+ * +------------------------+
+ * | spl_kmem_slab_t --+-+ |
+ * | skc_obj_size <-+ | |
+ * | spl_kmem_obj_t | |
+ * | skc_obj_size <---+ |
+ * | spl_kmem_obj_t | |
+ * | ... v |
+ * +------------------------+
+ */
+static spl_kmem_slab_t *
+spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_slab_t *sks;
+ void *base;
+ uint32_t obj_size;
+
+ base = kv_alloc(skc, skc->skc_slab_size, flags);
+ if (base == NULL)
+ return (NULL);
+
+ sks = (spl_kmem_slab_t *)base;
+ sks->sks_magic = SKS_MAGIC;
+ sks->sks_objs = skc->skc_slab_objs;
+ sks->sks_age = jiffies;
+ sks->sks_cache = skc;
+ INIT_LIST_HEAD(&sks->sks_list);
+ INIT_LIST_HEAD(&sks->sks_free_list);
+ sks->sks_ref = 0;
+ obj_size = spl_obj_size(skc);
+
+ for (int i = 0; i < sks->sks_objs; i++) {
+ void *obj = base + spl_sks_size(skc) + (i * obj_size);
+
+ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+ spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj);
+ sko->sko_addr = obj;
+ sko->sko_magic = SKO_MAGIC;
+ sko->sko_slab = sks;
+ INIT_LIST_HEAD(&sko->sko_list);
+ list_add_tail(&sko->sko_list, &sks->sks_free_list);
+ }
+
+ return (sks);
+}
+
+/*
+ * Remove a slab from the complete or partial list; it must be called with
+ * the 'skc->skc_lock' held but the actual free must be performed
+ * outside the lock to prevent deadlocking on vmem addresses.
+ */
+static void
+spl_slab_free(spl_kmem_slab_t *sks,
+ struct list_head *sks_list, struct list_head *sko_list)
+{
+ spl_kmem_cache_t *skc;
+
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_ref == 0);
+
+ skc = sks->sks_cache;
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ /*
+ * Update slab/objects counters in the cache, then remove the
+ * slab from the skc->skc_partial_list. Finally add the slab
+ * and all its objects in to the private work lists where the
+ * destructors will be called and the memory freed to the system.
+ */
+ skc->skc_obj_total -= sks->sks_objs;
+ skc->skc_slab_total--;
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, sks_list);
+ list_splice_init(&sks->sks_free_list, sko_list);
+}
+
+/*
+ * Reclaim empty slabs at the end of the partial list.
+ */
+static void
+spl_slab_reclaim(spl_kmem_cache_t *skc)
+{
+ spl_kmem_slab_t *sks = NULL, *m = NULL;
+ spl_kmem_obj_t *sko = NULL, *n = NULL;
+ LIST_HEAD(sks_list);
+ LIST_HEAD(sko_list);
+
+ /*
+ * Empty slabs and objects must be moved to a private list so they
+ * can be safely freed outside the spin lock. All empty slabs are
+ * at the end of skc->skc_partial_list, therefore once a non-empty
+ * slab is found we can stop scanning.
+ */
+ spin_lock(&skc->skc_lock);
+ list_for_each_entry_safe_reverse(sks, m,
+ &skc->skc_partial_list, sks_list) {
+
+ if (sks->sks_ref > 0)
+ break;
+
+ spl_slab_free(sks, &sks_list, &sko_list);
+ }
+ spin_unlock(&skc->skc_lock);
+
+ /*
+ * The following two loops ensure all the object destructors are run,
+ * and the slabs themselves are freed. This is all done outside the
+ * skc->skc_lock since this allows the destructor to sleep, and
+ * allows us to perform a conditional reschedule when freeing a
+ * large number of objects and slabs back to the system.
+ */
+
+ list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ }
+
+ list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ kv_free(skc, sks, skc->skc_slab_size);
+ }
+}
+
+static spl_kmem_emergency_t *
+spl_emergency_search(struct rb_root *root, void *obj)
+{
+ struct rb_node *node = root->rb_node;
+ spl_kmem_emergency_t *ske;
+ unsigned long address = (unsigned long)obj;
+
+ while (node) {
+ ske = container_of(node, spl_kmem_emergency_t, ske_node);
+
+ if (address < ske->ske_obj)
+ node = node->rb_left;
+ else if (address > ske->ske_obj)
+ node = node->rb_right;
+ else
+ return (ske);
+ }
+
+ return (NULL);
+}
+
+static int
+spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ spl_kmem_emergency_t *ske_tmp;
+ unsigned long address = ske->ske_obj;
+
+ while (*new) {
+ ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
+
+ parent = *new;
+ if (address < ske_tmp->ske_obj)
+ new = &((*new)->rb_left);
+ else if (address > ske_tmp->ske_obj)
+ new = &((*new)->rb_right);
+ else
+ return (0);
+ }
+
+ rb_link_node(&ske->ske_node, parent, new);
+ rb_insert_color(&ske->ske_node, root);
+
+ return (1);
+}
+
+/*
+ * Allocate a single emergency object and track it in a red black tree.
+ */
+static int
+spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
+ int empty;
+
+ /* Last chance: use a partial slab if one now exists */
+ spin_lock(&skc->skc_lock);
+ empty = list_empty(&skc->skc_partial_list);
+ spin_unlock(&skc->skc_lock);
+ if (!empty)
+ return (-EEXIST);
+
+ ske = kmalloc(sizeof (*ske), lflags);
+ if (ske == NULL)
+ return (-ENOMEM);
+
+ ske->ske_obj = __get_free_pages(lflags, order);
+ if (ske->ske_obj == 0) {
+ kfree(ske);
+ return (-ENOMEM);
+ }
+
+ spin_lock(&skc->skc_lock);
+ empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
+ if (likely(empty)) {
+ skc->skc_obj_total++;
+ skc->skc_obj_emergency++;
+ if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+ skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+ }
+ spin_unlock(&skc->skc_lock);
+
+ if (unlikely(!empty)) {
+ free_pages(ske->ske_obj, order);
+ kfree(ske);
+ return (-EINVAL);
+ }
+
+ *obj = (void *)ske->ske_obj;
+
+ return (0);
+}
+
+/*
+ * Locate the passed object in the red black tree and free it.
+ */
+static int
+spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
+
+ spin_lock(&skc->skc_lock);
+ ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
+ if (ske) {
+ rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
+ skc->skc_obj_emergency--;
+ skc->skc_obj_total--;
+ }
+ spin_unlock(&skc->skc_lock);
+
+ if (ske == NULL)
+ return (-ENOENT);
+
+ free_pages(ske->ske_obj, order);
+ kfree(ske);
+
+ return (0);
+}
+
+/*
+ * Release objects from the per-cpu magazine back to their slab. The flush
+ * argument contains the max number of entries to remove from the magazine.
+ */
+static void
+spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
+{
+ spin_lock(&skc->skc_lock);
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ int count = MIN(flush, skm->skm_avail);
+ for (int i = 0; i < count; i++)
+ spl_cache_shrink(skc, skm->skm_objs[i]);
+
+ skm->skm_avail -= count;
+ memmove(skm->skm_objs, &(skm->skm_objs[count]),
+ sizeof (void *) * skm->skm_avail);
+
+ spin_unlock(&skc->skc_lock);
+}
+
+/*
+ * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
+ * When on-slab we want to target spl_kmem_cache_obj_per_slab. However,
+ * for very small objects we may end up with more than this so as not
+ * to waste space in the minimal allocation of a single page. Also for
+ * very large objects we may end up with only a few objects per slab;
+ * if not even a single object fits, the sizing fails.
+ */
+static int
+spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
+{
+ uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
+
+ sks_size = spl_sks_size(skc);
+ obj_size = spl_obj_size(skc);
+ max_size = (spl_kmem_cache_max_size * 1024 * 1024);
+ tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
+
+ if (tgt_size <= max_size) {
+ tgt_objs = (tgt_size - sks_size) / obj_size;
+ } else {
+ tgt_objs = (max_size - sks_size) / obj_size;
+ tgt_size = (tgt_objs * obj_size) + sks_size;
+ }
+
+ if (tgt_objs == 0)
+ return (-ENOSPC);
+
+ *objs = tgt_objs;
+ *size = tgt_size;
+
+ return (0);
+}
+
+/*
+ * Make a guess at a reasonable per-cpu magazine size based on the size of
+ * each object and the cost of caching N of them in each magazine. Long
+ * term this should really adapt based on an observed usage heuristic.
+ */
+static int
+spl_magazine_size(spl_kmem_cache_t *skc)
+{
+ uint32_t obj_size = spl_obj_size(skc);
+ int size;
+
+ if (spl_kmem_cache_magazine_size > 0)
+ return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
+
+ /* Per-magazine sizes below assume a 4 KiB page size */
+ if (obj_size > (PAGE_SIZE * 256))
+ size = 4; /* Minimum 4 MiB per-magazine */
+ else if (obj_size > (PAGE_SIZE * 32))
+ size = 16; /* Minimum 2 MiB per-magazine */
+ else if (obj_size > (PAGE_SIZE))
+ size = 64; /* Minimum 256 KiB per-magazine */
+ else if (obj_size > (PAGE_SIZE / 4))
+ size = 128; /* Minimum 128 KiB per-magazine */
+ else
+ size = 256;
+
+ return (size);
+}
+
+/*
+ * Allocate a per-cpu magazine to associate with a specific core.
+ */
+static spl_kmem_magazine_t *
+spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
+{
+ spl_kmem_magazine_t *skm;
+ int size = sizeof (spl_kmem_magazine_t) +
+ sizeof (void *) * skc->skc_mag_size;
+
+ skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+ if (skm) {
+ skm->skm_magic = SKM_MAGIC;
+ skm->skm_avail = 0;
+ skm->skm_size = skc->skc_mag_size;
+ skm->skm_refill = skc->skc_mag_refill;
+ skm->skm_cache = skc;
+ skm->skm_cpu = cpu;
+ }
+
+ return (skm);
+}
+
+/*
+ * Free a per-cpu magazine associated with a specific core.
+ */
+static void
+spl_magazine_free(spl_kmem_magazine_t *skm)
+{
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+ ASSERT(skm->skm_avail == 0);
+ kfree(skm);
+}
+
+/*
+ * Create all per-cpu magazines of reasonable sizes.
+ */
+static int
+spl_magazine_create(spl_kmem_cache_t *skc)
+{
+ int i = 0;
+
+ ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+
+ skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+ num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
+ skc->skc_mag_size = spl_magazine_size(skc);
+ skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
+
+ for_each_possible_cpu(i) {
+ skc->skc_mag[i] = spl_magazine_alloc(skc, i);
+ if (!skc->skc_mag[i]) {
+ for (i--; i >= 0; i--)
+ spl_magazine_free(skc->skc_mag[i]);
+
+ kfree(skc->skc_mag);
+ return (-ENOMEM);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Destroy all per-cpu magazines.
+ */
+static void
+spl_magazine_destroy(spl_kmem_cache_t *skc)
+{
+ spl_kmem_magazine_t *skm;
+ int i = 0;
+
+ ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+
+ for_each_possible_cpu(i) {
+ skm = skc->skc_mag[i];
+ spl_cache_flush(skc, skm, skm->skm_avail);
+ spl_magazine_free(skm);
+ }
+
+ kfree(skc->skc_mag);
+}
+
+/*
+ * Create an object cache based on the following arguments:
+ * name cache name
+ * size cache object size
+ * align cache object alignment
+ * ctor cache object constructor
+ * dtor cache object destructor
+ * reclaim cache object reclaim
+ * priv cache private data for ctor/dtor/reclaim
+ * vmp unused, must be NULL
+ * flags
+ * KMC_KVMEM Force kvmem backed SPL cache
+ * KMC_SLAB Force Linux slab backed cache
+ * KMC_NODEBUG Disable debugging (unsupported)
+ */
+spl_kmem_cache_t *
+spl_kmem_cache_create(char *name, size_t size, size_t align,
+ spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim,
+ void *priv, void *vmp, int flags)
+{
+ gfp_t lflags = kmem_flags_convert(KM_SLEEP);
+ spl_kmem_cache_t *skc;
+ int rc;
+
+ /*
+ * Unsupported flags
+ */
+ ASSERT(vmp == NULL);
+ ASSERT(reclaim == NULL);
+
+ might_sleep();
+
+ skc = kzalloc(sizeof (*skc), lflags);
+ if (skc == NULL)
+ return (NULL);
+
+ skc->skc_magic = SKC_MAGIC;
+ skc->skc_name_size = strlen(name) + 1;
+ skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
+ if (skc->skc_name == NULL) {
+ kfree(skc);
+ return (NULL);
+ }
+ strncpy(skc->skc_name, name, skc->skc_name_size);
+
+ skc->skc_ctor = ctor;
+ skc->skc_dtor = dtor;
+ skc->skc_private = priv;
+ skc->skc_vmp = vmp;
+ skc->skc_linux_cache = NULL;
+ skc->skc_flags = flags;
+ skc->skc_obj_size = size;
+ skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
+ atomic_set(&skc->skc_ref, 0);
+
+ INIT_LIST_HEAD(&skc->skc_list);
+ INIT_LIST_HEAD(&skc->skc_complete_list);
+ INIT_LIST_HEAD(&skc->skc_partial_list);
+ skc->skc_emergency_tree = RB_ROOT;
+ spin_lock_init(&skc->skc_lock);
+ init_waitqueue_head(&skc->skc_waitq);
+ skc->skc_slab_fail = 0;
+ skc->skc_slab_create = 0;
+ skc->skc_slab_destroy = 0;
+ skc->skc_slab_total = 0;
+ skc->skc_slab_alloc = 0;
+ skc->skc_slab_max = 0;
+ skc->skc_obj_total = 0;
+ skc->skc_obj_alloc = 0;
+ skc->skc_obj_max = 0;
+ skc->skc_obj_deadlock = 0;
+ skc->skc_obj_emergency = 0;
+ skc->skc_obj_emergency_max = 0;
+
+ rc = percpu_counter_init_common(&skc->skc_linux_alloc, 0,
+ GFP_KERNEL);
+ if (rc != 0) {
+ kfree(skc->skc_name);
+ kfree(skc);
+ return (NULL);
+ }
+
+ /*
+ * Verify the requested alignment restriction is sane.
+ */
+ if (align) {
+ VERIFY(ISP2(align));
+ VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
+ VERIFY3U(align, <=, PAGE_SIZE);
+ skc->skc_obj_align = align;
+ }
+
+ /*
+ * When no specific type of slab is requested (kmem, vmem, or
+ * linuxslab) then select a cache type based on the object size
+ * and default tunables.
+ */
+ if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) {
+ if (spl_kmem_cache_slab_limit &&
+ size <= (size_t)spl_kmem_cache_slab_limit) {
+ /*
+ * Objects smaller than spl_kmem_cache_slab_limit can
+ * use the Linux slab for better space-efficiency.
+ */
+ skc->skc_flags |= KMC_SLAB;
+ } else {
+ /*
+ * All other objects are considered large and are
+ * placed on kvmem backed slabs.
+ */
+ skc->skc_flags |= KMC_KVMEM;
+ }
+ }
+
+ /*
+ * Given the type of slab allocate the required resources.
+ */
+ if (skc->skc_flags & KMC_KVMEM) {
+ rc = spl_slab_size(skc,
+ &skc->skc_slab_objs, &skc->skc_slab_size);
+ if (rc)
+ goto out;
+
+ rc = spl_magazine_create(skc);
+ if (rc)
+ goto out;
+ } else {
+ unsigned long slabflags = 0;
+
+ if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
+ rc = EINVAL;
+ goto out;
+ }
+
+#if defined(SLAB_USERCOPY)
+ /*
+ * Required for PAX-enabled kernels if the slab is to be
+ * used for copying between user and kernel space.
+ */
+ slabflags |= SLAB_USERCOPY;
+#endif
+
+#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
+ /*
+ * Newer grsec patchset uses kmem_cache_create_usercopy()
+ * instead of SLAB_USERCOPY flag
+ */
+ skc->skc_linux_cache = kmem_cache_create_usercopy(
+ skc->skc_name, size, align, slabflags, 0, size, NULL);
+#else
+ skc->skc_linux_cache = kmem_cache_create(
+ skc->skc_name, size, align, slabflags, NULL);
+#endif
+ if (skc->skc_linux_cache == NULL) {
+ rc = ENOMEM;
+ goto out;
+ }
+ }
+
+ down_write(&spl_kmem_cache_sem);
+ list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
+ up_write(&spl_kmem_cache_sem);
+
+ return (skc);
+out:
+ kfree(skc->skc_name);
+ percpu_counter_destroy(&skc->skc_linux_alloc);
+ kfree(skc);
+ return (NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_create);
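+
+/*
+ * Minimal usage sketch (hypothetical consumer; my_obj_t and the callbacks
+ * are illustrative, with prototypes matching how skc_ctor/skc_dtor are
+ * invoked in this file):
+ *
+ * static int my_ctor(void *obj, void *priv, int kmflags)
+ * { memset(obj, 0, sizeof (my_obj_t)); return (0); }
+ * static void my_dtor(void *obj, void *priv) { }
+ *
+ * spl_kmem_cache_t *skc = spl_kmem_cache_create("my_cache",
+ *     sizeof (my_obj_t), 0, my_ctor, my_dtor, NULL, NULL, NULL, KMC_KVMEM);
+ * my_obj_t *obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
+ * ...
+ * spl_kmem_cache_free(skc, obj);
+ * spl_kmem_cache_destroy(skc);
+ */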
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
+ kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+ ASSERT(move != NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_set_move);
+
+/*
+ * Destroy a cache and all objects associated with the cache.
+ */
+void
+spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
+{
+ DECLARE_WAIT_QUEUE_HEAD(wq);
+ taskqid_t id;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB));
+
+ down_write(&spl_kmem_cache_sem);
+ list_del_init(&skc->skc_list);
+ up_write(&spl_kmem_cache_sem);
+
+ /* Cancel and wait for any pending delayed tasks */
+ VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ spin_lock(&skc->skc_lock);
+ id = skc->skc_taskqid;
+ spin_unlock(&skc->skc_lock);
+
+ taskq_cancel_id(spl_kmem_cache_taskq, id);
+
+ /*
+ * Wait until all current callers complete, this is mainly
+ * to catch the case where a low memory situation triggers a
+ * cache reaping action which races with this destroy.
+ */
+ wait_event(wq, atomic_read(&skc->skc_ref) == 0);
+
+ if (skc->skc_flags & KMC_KVMEM) {
+ spl_magazine_destroy(skc);
+ spl_slab_reclaim(skc);
+ } else {
+ ASSERT(skc->skc_flags & KMC_SLAB);
+ kmem_cache_destroy(skc->skc_linux_cache);
+ }
+
+ spin_lock(&skc->skc_lock);
+
+ /*
+ * Validate there are no objects in use and free all the
+ * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
+ */
+ ASSERT3U(skc->skc_slab_alloc, ==, 0);
+ ASSERT3U(skc->skc_obj_alloc, ==, 0);
+ ASSERT3U(skc->skc_slab_total, ==, 0);
+ ASSERT3U(skc->skc_obj_total, ==, 0);
+ ASSERT3U(skc->skc_obj_emergency, ==, 0);
+ ASSERT(list_empty(&skc->skc_complete_list));
+
+ ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0);
+ percpu_counter_destroy(&skc->skc_linux_alloc);
+
+ spin_unlock(&skc->skc_lock);
+
+ kfree(skc->skc_name);
+ kfree(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_destroy);
+
+/*
+ * Allocate an object from a slab attached to the cache. This is used to
+ * repopulate the per-cpu magazine caches in batches when they run low.
+ */
+static void *
+spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
+ spl_kmem_obj_t *sko;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+
+ sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ ASSERT(sko->sko_addr != NULL);
+
+ /* Remove from sks_free_list */
+ list_del_init(&sko->sko_list);
+
+ sks->sks_age = jiffies;
+ sks->sks_ref++;
+ skc->skc_obj_alloc++;
+
+ /* Track max obj usage statistics */
+ if (skc->skc_obj_alloc > skc->skc_obj_max)
+ skc->skc_obj_max = skc->skc_obj_alloc;
+
+ /* Track max slab usage statistics */
+ if (sks->sks_ref == 1) {
+ skc->skc_slab_alloc++;
+
+ if (skc->skc_slab_alloc > skc->skc_slab_max)
+ skc->skc_slab_max = skc->skc_slab_alloc;
+ }
+
+ return (sko->sko_addr);
+}
+
+/*
+ * Generic slab allocation function to be run by the global work queues.
+ * It is responsible for allocating a new slab, linking it in to the list
+ * of partial slabs, and then waking any waiters.
+ */
+static int
+__spl_cache_grow(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_slab_t *sks;
+
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ sks = spl_slab_alloc(skc, flags);
+ spl_fstrans_unmark(cookie);
+
+ spin_lock(&skc->skc_lock);
+ if (sks) {
+ skc->skc_slab_total++;
+ skc->skc_obj_total += sks->sks_objs;
+ list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ smp_mb__after_atomic();
+ }
+ spin_unlock(&skc->skc_lock);
+
+ return (sks == NULL ? -ENOMEM : 0);
+}
+
+static void
+spl_cache_grow_work(void *data)
+{
+ spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
+ spl_kmem_cache_t *skc = ska->ska_cache;
+
+ int error = __spl_cache_grow(skc, ska->ska_flags);
+
+ atomic_dec(&skc->skc_ref);
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ if (error == 0)
+ wake_up_all(&skc->skc_waitq);
+
+ kfree(ska);
+}
+
+/*
+ * Returns non-zero when a new slab should be available.
+ */
+static int
+spl_cache_grow_wait(spl_kmem_cache_t *skc)
+{
+ return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
+}
+
+/*
+ * No available objects on any slabs, create a new slab. Note that this
+ * functionality is disabled for KMC_SLAB caches which are backed by the
+ * Linux slab.
+ */
+static int
+spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ int remaining, rc = 0;
+
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+ might_sleep();
+ *obj = NULL;
+
+ /*
+ * Before allocating a new slab wait for any reaping to complete and
+ * then return so the local magazine can be rechecked for new objects.
+ */
+ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+ rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
+ TASK_UNINTERRUPTIBLE);
+ return (rc ? rc : -EAGAIN);
+ }
+
+ /*
+ * Note: It would be nice to reduce the overhead of context switch
+ * and improve NUMA locality, by trying to allocate a new slab in the
+ * current process context with KM_NOSLEEP flag.
+ *
+ * However, this can't be applied to vmem/kvmem due to a bug that
+ * spl_vmalloc() doesn't honor gfp flags in page table allocation.
+ */
+
+ /*
+ * This is handled by dispatching a work request to the global work
+ * queue. This allows us to asynchronously allocate a new slab while
+ * retaining the ability to safely fall back to smaller synchronous
+ * allocations to ensure forward progress is always maintained.
+ */
+ if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
+ spl_kmem_alloc_t *ska;
+
+ ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
+ if (ska == NULL) {
+ clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_all(&skc->skc_waitq);
+ return (-ENOMEM);
+ }
+
+ atomic_inc(&skc->skc_ref);
+ ska->ska_cache = skc;
+ ska->ska_flags = flags;
+ taskq_init_ent(&ska->ska_tqe);
+ taskq_dispatch_ent(spl_kmem_cache_taskq,
+ spl_cache_grow_work, ska, 0, &ska->ska_tqe);
+ }
+
+ /*
+ * The goal here is to only detect the rare case where a virtual slab
+ * allocation has deadlocked. We must be careful to minimize the use
+ * of emergency objects which are more expensive to track. Therefore,
+ * we set a very long timeout for the asynchronous allocation and if
+ * the timeout is reached the cache is flagged as deadlocked. From
+ * this point only new emergency objects will be allocated until the
+ * asynchronous allocation completes and clears the deadlocked flag.
+ */
+ if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
+ rc = spl_emergency_alloc(skc, flags, obj);
+ } else {
+ remaining = wait_event_timeout(skc->skc_waitq,
+ spl_cache_grow_wait(skc), HZ / 10);
+
+ if (!remaining) {
+ spin_lock(&skc->skc_lock);
+ if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
+ set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ skc->skc_obj_deadlock++;
+ }
+ spin_unlock(&skc->skc_lock);
+ }
+
+ rc = -ENOMEM;
+ }
+
+ return (rc);
+}
+
+/*
+ * Refill a per-cpu magazine with objects from the slabs for this cache.
+ * Ideally the magazine can be repopulated using existing objects which have
+ * been released, however if we are unable to locate enough free objects new
+ * slabs of objects will be created. On success NULL is returned, otherwise
+ * the address of a single emergency object is returned for use by the caller.
+ */
+static void *
+spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
+{
+ spl_kmem_slab_t *sks;
+ int count = 0, rc, refill;
+ void *obj = NULL;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
+ spin_lock(&skc->skc_lock);
+
+ while (refill > 0) {
+ /* No slabs available we may need to grow the cache */
+ if (list_empty(&skc->skc_partial_list)) {
+ spin_unlock(&skc->skc_lock);
+
+ local_irq_enable();
+ rc = spl_cache_grow(skc, flags, &obj);
+ local_irq_disable();
+
+ /* Emergency object for immediate use by caller */
+ if (rc == 0 && obj != NULL)
+ return (obj);
+
+ if (rc)
+ goto out;
+
+ /* Rescheduled to a different CPU, skm is not local */
+ if (skm != skc->skc_mag[smp_processor_id()])
+ goto out;
+
+ /*
+ * Potentially rescheduled to the same CPU but
+ * allocations may have occurred from this CPU while
+ * we were sleeping so recalculate max refill.
+ */
+ refill = MIN(refill, skm->skm_size - skm->skm_avail);
+
+ spin_lock(&skc->skc_lock);
+ continue;
+ }
+
+ /* Grab the next available slab */
+ sks = list_entry((&skc->skc_partial_list)->next,
+ spl_kmem_slab_t, sks_list);
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_ref < sks->sks_objs);
+ ASSERT(!list_empty(&sks->sks_free_list));
+
+ /*
+ * Consume as many objects as needed to refill the requested
+ * cache. We must also be careful not to overfill it.
+ */
+ while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
+ ++count) {
+ ASSERT(skm->skm_avail < skm->skm_size);
+ ASSERT(count < skm->skm_size);
+ skm->skm_objs[skm->skm_avail++] =
+ spl_cache_obj(skc, sks);
+ }
+
+ /* Move slab to skc_complete_list when full */
+ if (sks->sks_ref == sks->sks_objs) {
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, &skc->skc_complete_list);
+ }
+ }
+
+ spin_unlock(&skc->skc_lock);
+out:
+ return (NULL);
+}
+
+/*
+ * Release an object back to the slab from which it came.
+ */
+static void
+spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_slab_t *sks = NULL;
+ spl_kmem_obj_t *sko = NULL;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ sko = spl_sko_from_obj(skc, obj);
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ sks = sko->sko_slab;
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_cache == skc);
+ list_add(&sko->sko_list, &sks->sks_free_list);
+
+ sks->sks_age = jiffies;
+ sks->sks_ref--;
+ skc->skc_obj_alloc--;
+
+ /*
+ * Move slab to skc_partial_list when no longer full. Slabs
+ * are added to the head to keep the partial list in quasi-full
+ * sorted order. Fuller at the head, emptier at the tail.
+ */
+ if (sks->sks_ref == (sks->sks_objs - 1)) {
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, &skc->skc_partial_list);
+ }
+
+ /*
+ * Move empty slabs to the end of the partial list so
+ * they can be easily found and freed during reclamation.
+ */
+ if (sks->sks_ref == 0) {
+ list_del(&sks->sks_list);
+ list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+ skc->skc_slab_alloc--;
+ }
+}
+
+/*
+ * Allocate an object from the per-cpu magazine, or if the magazine
+ * is empty directly allocate from a slab and repopulate the magazine.
+ */
+void *
+spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_magazine_t *skm;
+ void *obj = NULL;
+
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ /*
+ * Allocate directly from a Linux slab. All optimizations are left
+ * to the underlying cache; we only need to guarantee that KM_SLEEP
+ * callers will never fail.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ struct kmem_cache *slc = skc->skc_linux_cache;
+ do {
+ obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
+ } while ((obj == NULL) && !(flags & KM_NOSLEEP));
+
+ if (obj != NULL) {
+ /*
+ * Even though we leave everything up to the
+ * underlying cache we still keep track of
+ * how many objects we've allocated in it for
+ * better debuggability.
+ */
+ percpu_counter_inc(&skc->skc_linux_alloc);
+ }
+ goto ret;
+ }
+
+ local_irq_disable();
+
+restart:
+ /*
+ * Safe to update per-cpu structure without lock, but
+ * in the restart case we must be careful to reacquire
+ * the local magazine since this may have changed
+ * when we need to grow the cache.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ if (likely(skm->skm_avail)) {
+ /* Object available in CPU cache, use it */
+ obj = skm->skm_objs[--skm->skm_avail];
+ } else {
+ obj = spl_cache_refill(skc, skm, flags);
+ if ((obj == NULL) && !(flags & KM_NOSLEEP))
+ goto restart;
+
+ local_irq_enable();
+ goto ret;
+ }
+
+ local_irq_enable();
+ ASSERT(obj);
+ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+
+ret:
+ /* Pre-emptively migrate object to CPU L1 cache */
+ if (obj) {
+ if (skc->skc_ctor)
+ skc->skc_ctor(obj, skc->skc_private, flags);
+ else
+ prefetchw(obj);
+ }
+
+ return (obj);
+}
+EXPORT_SYMBOL(spl_kmem_cache_alloc);
+
+/*
+ * Free an object back to the local per-cpu magazine; there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from. We may need to flush entries from the magazine
+ * back to the slabs to make space.
+ */
+void
+spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_magazine_t *skm;
+ unsigned long flags;
+ int do_reclaim = 0;
+ int do_emergency = 0;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ /*
+ * Run the destructor
+ */
+ if (skc->skc_dtor)
+ skc->skc_dtor(obj, skc->skc_private);
+
+ /*
+ * Free the object back to the underlying Linux slab.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ kmem_cache_free(skc->skc_linux_cache, obj);
+ percpu_counter_dec(&skc->skc_linux_alloc);
+ return;
+ }
+
+ /*
+ * While a cache has outstanding emergency objects all freed objects
+ * must be checked. However, since emergency objects will never use
+ * a virtual address these objects can be safely excluded as an
+ * optimization.
+ */
+ if (!is_vmalloc_addr(obj)) {
+ spin_lock(&skc->skc_lock);
+ do_emergency = (skc->skc_obj_emergency > 0);
+ spin_unlock(&skc->skc_lock);
+
+ if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+ return;
+ }
+
+ local_irq_save(flags);
+
+ /*
+ * Safe to update the per-cpu structure without a lock, but
+ * since no remote memory allocation tracking is performed
+ * it is entirely possible to allocate an object from one
+ * CPU cache and return it to another.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ /*
+ * Per-CPU cache full, flush it to make space for this object,
+ * this may result in an empty slab which can be reclaimed once
+ * interrupts are re-enabled.
+ */
+ if (unlikely(skm->skm_avail >= skm->skm_size)) {
+ spl_cache_flush(skc, skm, skm->skm_refill);
+ do_reclaim = 1;
+ }
+
+ /* Available space in cache, use it */
+ skm->skm_objs[skm->skm_avail++] = obj;
+
+ local_irq_restore(flags);
+
+ if (do_reclaim)
+ spl_slab_reclaim(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_free);
+
+/*
+ * Depending on how many and which objects are released it may simply
+ * repopulate the local magazine which will then need to age-out. Objects
+ * which cannot fit in the magazine will be released back to their slabs
+ * which will also need to age out before being released. This is all just
+ * best effort and we do not want to thrash creating and destroying slabs.
+ */
+void
+spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
+{
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ if (skc->skc_flags & KMC_SLAB)
+ return;
+
+ atomic_inc(&skc->skc_ref);
+
+ /*
+ * Prevent concurrent cache reaping when contended.
+ */
+ if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
+ goto out;
+
+ /* Reclaim from the magazine and free all now empty slabs. */
+ unsigned long irq_flags;
+ local_irq_save(irq_flags);
+ spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
+ spl_cache_flush(skc, skm, skm->skm_avail);
+ local_irq_restore(irq_flags);
+
+ spl_slab_reclaim(skc);
+ clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
+out:
+ atomic_dec(&skc->skc_ref);
+}
+EXPORT_SYMBOL(spl_kmem_cache_reap_now);
+
+/*
+ * This is stubbed out for code consistency with other platforms. There
+ * is existing logic to prevent concurrent reaping so while this is ugly
+ * it should do no harm.
+ */
+int
+spl_kmem_cache_reap_active()
+{
+ return (0);
+}
+EXPORT_SYMBOL(spl_kmem_cache_reap_active);
+
+/*
+ * Reap all free slabs from all registered caches.
+ */
+void
+spl_kmem_reap(void)
+{
+ spl_kmem_cache_t *skc = NULL;
+
+ down_read(&spl_kmem_cache_sem);
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+ spl_kmem_cache_reap_now(skc);
+ }
+ up_read(&spl_kmem_cache_sem);
+}
+EXPORT_SYMBOL(spl_kmem_reap);
+
+int
+spl_kmem_cache_init(void)
+{
+ init_rwsem(&spl_kmem_cache_sem);
+ INIT_LIST_HEAD(&spl_kmem_cache_list);
+ spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
+ spl_kmem_cache_kmem_threads, maxclsyspri,
+ spl_kmem_cache_kmem_threads * 8, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+ return (0);
+}
+
+void
+spl_kmem_cache_fini(void)
+{
+ taskq_destroy(spl_kmem_cache_taskq);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
new file mode 100644
index 000000000000..943966cbb17a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
@@ -0,0 +1,617 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/vmem.h>
+
+/*
+ * As a general rule kmem_alloc() allocations should be small, preferably
+ * just a few pages since they must be physically contiguous. Therefore, a
+ * rate limited warning will be printed to the console for any kmem_alloc()
+ * which exceeds a reasonable threshold.
+ *
+ * The default warning threshold is set to sixteen pages but capped at 64K to
+ * accommodate systems using large pages. This value was selected to be small
+ * enough to ensure the largest allocations are quickly noticed and fixed,
+ * but large enough to avoid logging any warnings when an allocation size is
+ * larger than optimal but not a serious concern. Since this value is tunable,
+ * developers are encouraged to set it lower when testing so any new largish
+ * allocations are quickly caught. These warnings may be disabled by setting
+ * the threshold to zero.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
+module_param(spl_kmem_alloc_warn, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_warn,
+ "Warning threshold in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_warn);
+
+/*
+ * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+ * Allocations which are marginally smaller than this limit may succeed but
+ * should still be avoided due to the expense of locating a contiguous range
+ * of free pages. Therefore, a maximum kmem size with a reasonable safety
+ * margin of 4x is set. Kmem_alloc() allocations larger than this maximum
+ * will quickly fail. Vmem_alloc() allocations less than or equal to this
+ * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+ */
+unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
+module_param(spl_kmem_alloc_max, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_max,
+ "Maximum size in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_max);
+/* END CSTYLED */
+
+int
+kmem_debugging(void)
+{
+ return (0);
+}
+EXPORT_SYMBOL(kmem_debugging);
+
+char *
+kmem_vasprintf(const char *fmt, va_list ap)
+{
+ va_list aq;
+ char *ptr;
+
+ do {
+ va_copy(aq, ap);
+ ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
+ va_end(aq);
+ } while (ptr == NULL);
+
+ return (ptr);
+}
+EXPORT_SYMBOL(kmem_vasprintf);
+
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+ va_list ap;
+ char *ptr;
+
+ do {
+ va_start(ap, fmt);
+ ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
+ va_end(ap);
+ } while (ptr == NULL);
+
+ return (ptr);
+}
+EXPORT_SYMBOL(kmem_asprintf);
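+
+/*
+ * Illustrative sketch (not part of this change): the usual pairing of
+ * kmem_asprintf() with kmem_strfree(). The function name below is
+ * hypothetical.
+ */
+#if 0
+static void
+example_asprintf_usage(const char *pool, int id)
+{
+ /* Retries internally with KM_SLEEP semantics, so it never returns NULL. */
+ char *name = kmem_asprintf("%s-%d", pool, id);
+ /* ... use name ... */
+ kmem_strfree(name);
+}
+#endif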
+
+static char *
+__strdup(const char *str, int flags)
+{
+ char *ptr;
+ int n;
+
+ n = strlen(str);
+ ptr = kmalloc(n + 1, kmem_flags_convert(flags));
+ if (ptr)
+ memcpy(ptr, str, n + 1);
+
+ return (ptr);
+}
+
+char *
+kmem_strdup(const char *str)
+{
+ return (__strdup(str, KM_SLEEP));
+}
+EXPORT_SYMBOL(kmem_strdup);
+
+void
+kmem_strfree(char *str)
+{
+ kfree(str);
+}
+EXPORT_SYMBOL(kmem_strfree);
+
+void *
+spl_kvmalloc(size_t size, gfp_t lflags)
+{
+#ifdef HAVE_KVMALLOC
+ /*
+ * GFP_KERNEL allocations can safely use kvmalloc which may
+ * improve performance by avoiding a) high latency caused by
+ * vmalloc's on-access allocation, b) performance loss due to
+ * MMU memory address mapping and c) vmalloc locking overhead.
+ * This has the side-effect that the slab statistics will
+ * incorrectly report this as a vmem allocation, but that is
+ * purely cosmetic.
+ */
+ if ((lflags & GFP_KERNEL) == GFP_KERNEL)
+ return (kvmalloc(size, lflags));
+#endif
+
+ gfp_t kmalloc_lflags = lflags;
+
+ if (size > PAGE_SIZE) {
+ /*
+ * We need to set __GFP_NOWARN here since spl_kvmalloc is not
+ * only called by spl_kmem_alloc_impl but can be called
+ * directly with custom lflags, too. In that case
+ * kmem_flags_convert does not get called, which would
+ * implicitly set __GFP_NOWARN.
+ */
+ kmalloc_lflags |= __GFP_NOWARN;
+
+ /*
+ * N.B. __GFP_RETRY_MAYFAIL is supported only for large
+ * (>32kB) allocations.
+ *
+ * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY
+ * for !costly requests because there is no other way to tell
+ * the allocator that we want to fail rather than retry
+ * endlessly.
+ */
+ if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
+ (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ kmalloc_lflags |= __GFP_NORETRY;
+ }
+ }
+
+ /*
+ * We first try kmalloc - even for big sizes - and fall back to
+ * spl_vmalloc if that fails.
+ *
+ * For non-__GFP_RECLAIM allocations we always stick to
+ * kmalloc_node, and fail when kmalloc is not successful (returns
+ * NULL).
+ * We cannot fall back to spl_vmalloc in this case because spl_vmalloc
+ * internally uses GFP_KERNEL allocations.
+ */
+ void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
+ if (ptr || size <= PAGE_SIZE ||
+ (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
+ return (ptr);
+ }
+
+ return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
+}
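+
+/*
+ * Illustrative sketch (not part of this change): calling spl_kvmalloc()
+ * directly with custom GFP flags, as mentioned in the comments above. The
+ * buffer may come from kmalloc() or vmalloc(), so it is released through
+ * spl_kmem_free_impl(), which checks is_vmalloc_addr(). Names below are
+ * hypothetical.
+ */
+#if 0
+static void
+example_kvmalloc_usage(size_t size)
+{
+ void *buf = spl_kvmalloc(size, GFP_KERNEL);
+ if (buf != NULL) {
+ /* ... use buf ... */
+ spl_kmem_free_impl(buf, size);
+ }
+}
+#endif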
+
+/*
+ * General purpose unified implementation of kmem_alloc(). It is an
+ * amalgamation of Linux and Illumos allocator design. It should never be
+ * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
+ * relatively portable. Consumers may only access this function through
+ * wrappers that enforce the common flags to ensure portability.
+ */
+inline void *
+spl_kmem_alloc_impl(size_t size, int flags, int node)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ /*
+ * Log abnormally large allocations and rate limit the console output.
+ * Allocations larger than spl_kmem_alloc_warn should be performed
+ * through the vmem_alloc()/vmem_zalloc() interfaces.
+ */
+ if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
+ !(flags & KM_VMEM)) {
+ printk(KERN_WARNING
+ "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
+ "https://github.com/openzfs/zfs/issues/new\n",
+ (unsigned long)size, flags);
+ dump_stack();
+ }
+
+ /*
+ * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
+ * unlike kmem_alloc() with KM_SLEEP on Illumos.
+ */
+ do {
+ /*
+ * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
+ * is unsafe. This must fail for all kmem_alloc() and
+ * kmem_zalloc() callers.
+ *
+ * For vmem_alloc() and vmem_zalloc() callers it is permissible
+ * to use spl_vmalloc(). However, in general use of
+ * spl_vmalloc() is strongly discouraged because a global lock
+ * must be acquired. Contention on this lock can significantly
+ * impact performance so frequently manipulating the virtual
+ * address space is strongly discouraged.
+ */
+ if (size > spl_kmem_alloc_max) {
+ if (flags & KM_VMEM) {
+ ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
+ } else {
+ return (NULL);
+ }
+ } else {
+ if (flags & KM_VMEM) {
+ ptr = spl_kvmalloc(size, lflags);
+ } else {
+ ptr = kmalloc_node(size, lflags, node);
+ }
+ }
+
+ if (likely(ptr) || (flags & KM_NOSLEEP))
+ return (ptr);
+
+ /*
+ * Try hard to satisfy the allocation. However, when progress
+ * cannot be made, the allocation is allowed to fail.
+ */
+ if ((lflags & GFP_KERNEL) == GFP_KERNEL)
+ lflags |= __GFP_RETRY_MAYFAIL;
+
+ /*
+ * Use cond_resched() instead of congestion_wait() to avoid
+ * deadlocking systems where there are no block devices.
+ */
+ cond_resched();
+ } while (1);
+
+ return (NULL);
+}
+
+inline void
+spl_kmem_free_impl(const void *buf, size_t size)
+{
+ if (is_vmalloc_addr(buf))
+ vfree(buf);
+ else
+ kfree(buf);
+}
+
+/*
+ * Memory allocation and accounting for kmem_* style allocations. When
+ * DEBUG_KMEM is enabled the total memory allocated will be tracked and
+ * any memory leaked will be reported during module unload.
+ *
+ * ./configure --enable-debug-kmem
+ */
+#ifdef DEBUG_KMEM
+
+/* Shim layer memory accounting */
+#ifdef HAVE_ATOMIC64_T
+atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#else /* HAVE_ATOMIC64_T */
+atomic_t kmem_alloc_used = ATOMIC_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#endif /* HAVE_ATOMIC64_T */
+
+EXPORT_SYMBOL(kmem_alloc_used);
+EXPORT_SYMBOL(kmem_alloc_max);
+
+inline void *
+spl_kmem_alloc_debug(size_t size, int flags, int node)
+{
+ void *ptr;
+
+ ptr = spl_kmem_alloc_impl(size, flags, node);
+ if (ptr) {
+ kmem_alloc_used_add(size);
+ if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
+ kmem_alloc_max = kmem_alloc_used_read();
+ }
+
+ return (ptr);
+}
+
+inline void
+spl_kmem_free_debug(const void *ptr, size_t size)
+{
+ kmem_alloc_used_sub(size);
+ spl_kmem_free_impl(ptr, size);
+}
+
+/*
+ * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
+ * but also the location of every alloc and free. When the SPL module is
+ * unloaded a list of all leaked addresses and where they were allocated
+ * will be dumped to the console. Enabling this feature has a significant
+ * impact on performance, but it makes finding memory leaks straightforward.
+ *
+ * Not surprisingly, with debugging enabled the xmem_locks are very highly
+ * contended, particularly on xfree(). If we want to run with this detailed
+ * debugging enabled for anything other than debugging we need to minimize
+ * the contention by moving to a lock per xmem_table entry model.
+ *
+ * ./configure --enable-debug-kmem-tracking
+ */
+#ifdef DEBUG_KMEM_TRACKING
+
+#include <linux/hash.h>
+#include <linux/ctype.h>
+
+#define KMEM_HASH_BITS 10
+#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
+
+typedef struct kmem_debug {
+ struct hlist_node kd_hlist; /* Hash node linkage */
+ struct list_head kd_list; /* List of all allocations */
+ void *kd_addr; /* Allocation pointer */
+ size_t kd_size; /* Allocation size */
+ const char *kd_func; /* Allocation function */
+ int kd_line; /* Allocation line */
+} kmem_debug_t;
+
+static spinlock_t kmem_lock;
+static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
+static struct list_head kmem_list;
+
+static kmem_debug_t *
+kmem_del_init(spinlock_t *lock, struct hlist_head *table,
+ int bits, const void *addr)
+{
+ struct hlist_head *head;
+ struct hlist_node *node = NULL;
+ struct kmem_debug *p;
+ unsigned long flags;
+
+ spin_lock_irqsave(lock, flags);
+
+ head = &table[hash_ptr((void *)addr, bits)];
+ hlist_for_each(node, head) {
+ p = list_entry(node, struct kmem_debug, kd_hlist);
+ if (p->kd_addr == addr) {
+ hlist_del_init(&p->kd_hlist);
+ list_del_init(&p->kd_list);
+ spin_unlock_irqrestore(lock, flags);
+ return (p);
+ }
+ }
+
+ spin_unlock_irqrestore(lock, flags);
+
+ return (NULL);
+}
+
+inline void *
+spl_kmem_alloc_track(size_t size, int flags,
+ const char *func, int line, int node)
+{
+ void *ptr = NULL;
+ kmem_debug_t *dptr;
+ unsigned long irq_flags;
+
+ dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
+ if (dptr == NULL)
+ return (NULL);
+
+ dptr->kd_func = __strdup(func, flags);
+ if (dptr->kd_func == NULL) {
+ kfree(dptr);
+ return (NULL);
+ }
+
+ ptr = spl_kmem_alloc_debug(size, flags, node);
+ if (ptr == NULL) {
+ kfree(dptr->kd_func);
+ kfree(dptr);
+ return (NULL);
+ }
+
+ INIT_HLIST_NODE(&dptr->kd_hlist);
+ INIT_LIST_HEAD(&dptr->kd_list);
+
+ dptr->kd_addr = ptr;
+ dptr->kd_size = size;
+ dptr->kd_line = line;
+
+ spin_lock_irqsave(&kmem_lock, irq_flags);
+ hlist_add_head(&dptr->kd_hlist,
+ &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
+ list_add_tail(&dptr->kd_list, &kmem_list);
+ spin_unlock_irqrestore(&kmem_lock, irq_flags);
+
+ return (ptr);
+}
+
+inline void
+spl_kmem_free_track(const void *ptr, size_t size)
+{
+ kmem_debug_t *dptr;
+
+ /* Ignore NULL pointer since we haven't tracked it at all */
+ if (ptr == NULL)
+ return;
+
+ /* Must exist in hash due to kmem_alloc() */
+ dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
+ ASSERT3P(dptr, !=, NULL);
+ ASSERT3S(dptr->kd_size, ==, size);
+
+ kfree(dptr->kd_func);
+ kfree(dptr);
+
+ spl_kmem_free_debug(ptr, size);
+}
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+/*
+ * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
+ */
+void *
+spl_kmem_alloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_alloc);
+
+void *
+spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= KM_ZERO;
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_zalloc);
+
+void
+spl_kmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_free);
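+
+/*
+ * Illustrative sketch (not part of this change): the exported interfaces
+ * above are normally reached through the kmem_alloc()/kmem_zalloc()/
+ * kmem_free() macros in sys/kmem.h, which pass __func__ and __LINE__ for
+ * the optional leak tracking. The structure and function names below are
+ * hypothetical.
+ */
+#if 0
+typedef struct example_ctx {
+ int ec_id;
+ char ec_name[32];
+} example_ctx_t;
+
+static void
+example_kmem_usage(void)
+{
+ example_ctx_t *ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ ctx->ec_id = 1;
+ /* ... */
+ kmem_free(ctx, sizeof (*ctx));
+}
+#endif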
+
+#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
+static char *
+spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
+{
+ int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
+ int i, flag = 1;
+
+ ASSERT(str != NULL && len >= 17);
+ memset(str, 0, len);
+
+ /*
+ * Check for a fully printable string, and while we are at
+ * it place the printable characters in the passed buffer.
+ */
+ for (i = 0; i < size; i++) {
+ str[i] = ((char *)(kd->kd_addr))[i];
+ if (isprint(str[i])) {
+ continue;
+ } else {
+ /*
+ * Minimum number of printable characters found
+ * to make it worthwhile to print this as ascii.
+ */
+ if (i > min)
+ break;
+
+ flag = 0;
+ break;
+ }
+ }
+
+ if (!flag) {
+ sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
+ *((uint8_t *)kd->kd_addr),
+ *((uint8_t *)kd->kd_addr + 2),
+ *((uint8_t *)kd->kd_addr + 4),
+ *((uint8_t *)kd->kd_addr + 6),
+ *((uint8_t *)kd->kd_addr + 8),
+ *((uint8_t *)kd->kd_addr + 10),
+ *((uint8_t *)kd->kd_addr + 12),
+ *((uint8_t *)kd->kd_addr + 14));
+ }
+
+ return (str);
+}
+
+static int
+spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
+{
+ int i;
+
+ spin_lock_init(lock);
+ INIT_LIST_HEAD(list);
+
+ for (i = 0; i < size; i++)
+ INIT_HLIST_HEAD(&kmem_table[i]);
+
+ return (0);
+}
+
+static void
+spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
+{
+ unsigned long flags;
+ kmem_debug_t *kd = NULL;
+ char str[17];
+
+ spin_lock_irqsave(lock, flags);
+ if (!list_empty(list))
+ printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
+ "size", "data", "func", "line");
+
+ list_for_each_entry(kd, list, kd_list) {
+ printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
+ (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
+ kd->kd_func, kd->kd_line);
+ }
+
+ spin_unlock_irqrestore(lock, flags);
+}
+#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
+
+int
+spl_kmem_init(void)
+{
+#ifdef DEBUG_KMEM
+ kmem_alloc_used_set(0);
+
+#ifdef DEBUG_KMEM_TRACKING
+ spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+ return (0);
+}
+
+void
+spl_kmem_fini(void)
+{
+#ifdef DEBUG_KMEM
+ /*
+ * Display all unreclaimed memory addresses, including the
+ * allocation size and the first few bytes of what's located
+ * at that address to aid in debugging. Performance is not
+ * a serious concern here since it is module unload time.
+ */
+ if (kmem_alloc_used_read() != 0)
+ printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
+ (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
+
+#ifdef DEBUG_KMEM_TRACKING
+ spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
new file mode 100644
index 000000000000..c7f1aadf784e
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
@@ -0,0 +1,781 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Kstat Implementation.
+ *
+ * Links to Illumos.org for more information on kstat function:
+ * [1] https://illumos.org/man/1M/kstat
+ * [2] https://illumos.org/man/9f/kstat_create
+ */
+
+#include <linux/seq_file.h>
+#include <sys/kstat.h>
+#include <sys/vmem.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+
+static kmutex_t kstat_module_lock;
+static struct list_head kstat_module_list;
+static kid_t kstat_id;
+
+static int
+kstat_resize_raw(kstat_t *ksp)
+{
+ if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX)
+ return (ENOMEM);
+
+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX);
+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+
+ return (0);
+}
+
+void
+kstat_waitq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt++;
+ if (wcnt != 0) {
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+ }
+}
+EXPORT_SYMBOL(kstat_waitq_enter);
+
+void
+kstat_waitq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt--;
+ ASSERT((int)wcnt > 0);
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+}
+EXPORT_SYMBOL(kstat_waitq_exit);
+
+void
+kstat_runq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt++;
+ if (rcnt != 0) {
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+ }
+}
+EXPORT_SYMBOL(kstat_runq_enter);
+
+void
+kstat_runq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt--;
+ ASSERT((int)rcnt > 0);
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+}
+EXPORT_SYMBOL(kstat_runq_exit);
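+
+/*
+ * Illustrative sketch (not part of this change): a typical pairing of the
+ * wait/run queue hooks above around a single read. Holding the kstat's
+ * ks_lock around each update is an assumption made for this example; the
+ * names are hypothetical.
+ */
+#if 0
+static void
+example_io_accounting(kstat_t *ksp, kstat_io_t *kiop, size_t nbytes)
+{
+ mutex_enter(ksp->ks_lock);
+ kstat_waitq_enter(kiop); /* request queued */
+ mutex_exit(ksp->ks_lock);
+
+ /* ... request waits to be serviced ... */
+
+ mutex_enter(ksp->ks_lock);
+ kstat_waitq_exit(kiop);
+ kstat_runq_enter(kiop); /* request now being serviced */
+ mutex_exit(ksp->ks_lock);
+
+ /* ... perform the read ... */
+
+ mutex_enter(ksp->ks_lock);
+ kstat_runq_exit(kiop);
+ kiop->reads++;
+ kiop->nread += nbytes;
+ mutex_exit(ksp->ks_lock);
+}
+#endif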
+
+static int
+kstat_seq_show_headers(struct seq_file *f)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ int rc = 0;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n",
+ ksp->ks_kid, ksp->ks_type, ksp->ks_flags,
+ ksp->ks_ndata, (int)ksp->ks_data_size,
+ ksp->ks_crtime, ksp->ks_snaptime);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+restart:
+ if (ksp->ks_raw_ops.headers) {
+ rc = ksp->ks_raw_ops.headers(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (!rc)
+ seq_puts(f, ksp->ks_raw_buf);
+ } else {
+ seq_printf(f, "raw data\n");
+ }
+ break;
+ case KSTAT_TYPE_NAMED:
+ seq_printf(f, "%-31s %-4s %s\n",
+ "name", "type", "data");
+ break;
+ case KSTAT_TYPE_INTR:
+ seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n",
+ "hard", "soft", "watchdog",
+ "spurious", "multsvc");
+ break;
+ case KSTAT_TYPE_IO:
+ seq_printf(f,
+ "%-8s %-8s %-8s %-8s %-8s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s %-8s\n",
+ "nread", "nwritten", "reads", "writes",
+ "wtime", "wlentime", "wupdate",
+ "rtime", "rlentime", "rupdate",
+ "wcnt", "rcnt");
+ break;
+ case KSTAT_TYPE_TIMER:
+ seq_printf(f,
+ "%-31s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s\n",
+ "name", "events", "elapsed",
+ "min", "max", "start", "stop");
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (-rc);
+}
+
+static int
+kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l)
+{
+ int i, j;
+
+ for (i = 0; ; i++) {
+ seq_printf(f, "%03x:", i);
+
+ for (j = 0; j < 16; j++) {
+ if (i * 16 + j >= l) {
+ seq_printf(f, "\n");
+ goto out;
+ }
+
+ seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]);
+ }
+ seq_printf(f, "\n");
+ }
+out:
+ return (0);
+}
+
+static int
+kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp)
+{
+ seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type);
+
+ switch (knp->data_type) {
+ case KSTAT_DATA_CHAR:
+ knp->value.c[15] = '\0'; /* NULL terminate */
+ seq_printf(f, "%-16s", knp->value.c);
+ break;
+ /*
+ * NOTE - We need to be more careful about what tokens are
+ * used for each arch; for now this is correct for x86_64.
+ */
+ case KSTAT_DATA_INT32:
+ seq_printf(f, "%d", knp->value.i32);
+ break;
+ case KSTAT_DATA_UINT32:
+ seq_printf(f, "%u", knp->value.ui32);
+ break;
+ case KSTAT_DATA_INT64:
+ seq_printf(f, "%lld", (signed long long)knp->value.i64);
+ break;
+ case KSTAT_DATA_UINT64:
+ seq_printf(f, "%llu",
+ (unsigned long long)knp->value.ui64);
+ break;
+ case KSTAT_DATA_LONG:
+ seq_printf(f, "%ld", knp->value.l);
+ break;
+ case KSTAT_DATA_ULONG:
+ seq_printf(f, "%lu", knp->value.ul);
+ break;
+ case KSTAT_DATA_STRING:
+ KSTAT_NAMED_STR_PTR(knp)
+ [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0';
+ seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp));
+ break;
+ default:
+ PANIC("Undefined kstat data type %d\n", knp->data_type);
+ }
+
+ seq_printf(f, "\n");
+
+ return (0);
+}
+
+static int
+kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip)
+{
+ seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n",
+ kip->intrs[KSTAT_INTR_HARD],
+ kip->intrs[KSTAT_INTR_SOFT],
+ kip->intrs[KSTAT_INTR_WATCHDOG],
+ kip->intrs[KSTAT_INTR_SPURIOUS],
+ kip->intrs[KSTAT_INTR_MULTSVC]);
+
+ return (0);
+}
+
+static int
+kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip)
+{
+ /* though wlentime & friends are signed, they will never be negative */
+ seq_printf(f,
+ "%-8llu %-8llu %-8u %-8u %-8llu %-8llu "
+ "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n",
+ kip->nread, kip->nwritten,
+ kip->reads, kip->writes,
+ kip->wtime, kip->wlentime, kip->wlastupdate,
+ kip->rtime, kip->rlentime, kip->rlastupdate,
+ kip->wcnt, kip->rcnt);
+
+ return (0);
+}
+
+static int
+kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp)
+{
+ seq_printf(f,
+ "%-31s %-8llu %-8llu %-8llu %-8llu %-8llu %-8llu\n",
+ ktp->name, ktp->num_events, ktp->elapsed_time,
+ ktp->min_time, ktp->max_time,
+ ktp->start_time, ktp->stop_time);
+
+ return (0);
+}
+
+static int
+kstat_seq_show(struct seq_file *f, void *p)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ int rc = 0;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+restart:
+ if (ksp->ks_raw_ops.data) {
+ rc = ksp->ks_raw_ops.data(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize, p);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (!rc)
+ seq_puts(f, ksp->ks_raw_buf);
+ } else {
+ ASSERT(ksp->ks_ndata == 1);
+ rc = kstat_seq_show_raw(f, ksp->ks_data,
+ ksp->ks_data_size);
+ }
+ break;
+ case KSTAT_TYPE_NAMED:
+ rc = kstat_seq_show_named(f, (kstat_named_t *)p);
+ break;
+ case KSTAT_TYPE_INTR:
+ rc = kstat_seq_show_intr(f, (kstat_intr_t *)p);
+ break;
+ case KSTAT_TYPE_IO:
+ rc = kstat_seq_show_io(f, (kstat_io_t *)p);
+ break;
+ case KSTAT_TYPE_TIMER:
+ rc = kstat_seq_show_timer(f, (kstat_timer_t *)p);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (-rc);
+}
+
+static int
+kstat_default_update(kstat_t *ksp, int rw)
+{
+ ASSERT(ksp != NULL);
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+static void *
+kstat_seq_data_addr(kstat_t *ksp, loff_t n)
+{
+ void *rc = NULL;
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ if (ksp->ks_raw_ops.addr)
+ rc = ksp->ks_raw_ops.addr(ksp, n);
+ else
+ rc = ksp->ks_data;
+ break;
+ case KSTAT_TYPE_NAMED:
+ rc = ksp->ks_data + n * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ rc = ksp->ks_data + n * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ rc = ksp->ks_data + n * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ rc = ksp->ks_data + n * sizeof (kstat_timer_t);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (rc);
+}
+
+static void *
+kstat_seq_start(struct seq_file *f, loff_t *pos)
+{
+ loff_t n = *pos;
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ mutex_enter(ksp->ks_lock);
+
+ if (ksp->ks_type == KSTAT_TYPE_RAW) {
+ ksp->ks_raw_bufsize = PAGE_SIZE;
+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+ }
+
+ /* Dynamically update the kstat; on error the existing kstat data is used */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+
+ ksp->ks_snaptime = gethrtime();
+
+ if (!(ksp->ks_flags & KSTAT_FLAG_NO_HEADERS) && !n &&
+ kstat_seq_show_headers(f))
+ return (NULL);
+
+ if (n >= ksp->ks_ndata)
+ return (NULL);
+
+ return (kstat_seq_data_addr(ksp, n));
+}
+
+static void *
+kstat_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ ++*pos;
+ if (*pos >= ksp->ks_ndata)
+ return (NULL);
+
+ return (kstat_seq_data_addr(ksp, *pos));
+}
+
+static void
+kstat_seq_stop(struct seq_file *f, void *v)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ if (ksp->ks_type == KSTAT_TYPE_RAW)
+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+
+ mutex_exit(ksp->ks_lock);
+}
+
+static struct seq_operations kstat_seq_ops = {
+ .show = kstat_seq_show,
+ .start = kstat_seq_start,
+ .next = kstat_seq_next,
+ .stop = kstat_seq_stop,
+};
+
+static kstat_module_t *
+kstat_find_module(char *name)
+{
+ kstat_module_t *module = NULL;
+
+ list_for_each_entry(module, &kstat_module_list, ksm_module_list) {
+ if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0)
+ return (module);
+ }
+
+ return (NULL);
+}
+
+static kstat_module_t *
+kstat_create_module(char *name)
+{
+ kstat_module_t *module;
+ struct proc_dir_entry *pde;
+
+ pde = proc_mkdir(name, proc_spl_kstat);
+ if (pde == NULL)
+ return (NULL);
+
+ module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP);
+ module->ksm_proc = pde;
+ strlcpy(module->ksm_name, name, KSTAT_STRLEN+1);
+ INIT_LIST_HEAD(&module->ksm_kstat_list);
+ list_add_tail(&module->ksm_module_list, &kstat_module_list);
+
+ return (module);
+
+}
+
+static void
+kstat_delete_module(kstat_module_t *module)
+{
+ ASSERT(list_empty(&module->ksm_kstat_list));
+ remove_proc_entry(module->ksm_name, proc_spl_kstat);
+ list_del(&module->ksm_module_list);
+ kmem_free(module, sizeof (kstat_module_t));
+}
+
+static int
+proc_kstat_open(struct inode *inode, struct file *filp)
+{
+ struct seq_file *f;
+ int rc;
+
+ rc = seq_open(filp, &kstat_seq_ops);
+ if (rc)
+ return (rc);
+
+ f = filp->private_data;
+ f->private = PDE_DATA(inode);
+
+ return (0);
+}
+
+static ssize_t
+proc_kstat_write(struct file *filp, const char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct seq_file *f = filp->private_data;
+ kstat_t *ksp = f->private;
+ int rc;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ mutex_enter(ksp->ks_lock);
+ rc = ksp->ks_update(ksp, KSTAT_WRITE);
+ mutex_exit(ksp->ks_lock);
+
+ if (rc)
+ return (-rc);
+
+ *ppos += len;
+ return (len);
+}
+
+static const kstat_proc_op_t proc_kstat_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+ .proc_open = proc_kstat_open,
+ .proc_write = proc_kstat_write,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+#else
+ .open = proc_kstat_open,
+ .write = proc_kstat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+#endif
+};
+
+void
+__kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index))
+{
+ ksp->ks_raw_ops.headers = headers;
+ ksp->ks_raw_ops.data = data;
+ ksp->ks_raw_ops.addr = addr;
+}
+EXPORT_SYMBOL(__kstat_set_raw_ops);
+
+void
+kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module,
+ const char *name)
+{
+ kpep->kpe_owner = NULL;
+ kpep->kpe_proc = NULL;
+ INIT_LIST_HEAD(&kpep->kpe_list);
+ strncpy(kpep->kpe_module, module, KSTAT_STRLEN);
+ strncpy(kpep->kpe_name, name, KSTAT_STRLEN);
+}
+EXPORT_SYMBOL(kstat_proc_entry_init);
+
+kstat_t *
+__kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
+ const char *ks_class, uchar_t ks_type, uint_t ks_ndata,
+ uchar_t ks_flags)
+{
+ kstat_t *ksp;
+
+ ASSERT(ks_module);
+ ASSERT(ks_instance == 0);
+ ASSERT(ks_name);
+
+ if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
+ ASSERT(ks_ndata == 1);
+
+ ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP);
+ if (ksp == NULL)
+ return (ksp);
+
+ mutex_enter(&kstat_module_lock);
+ ksp->ks_kid = kstat_id;
+ kstat_id++;
+ mutex_exit(&kstat_module_lock);
+
+ ksp->ks_magic = KS_MAGIC;
+ mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL);
+ ksp->ks_lock = &ksp->ks_private_lock;
+
+ ksp->ks_crtime = gethrtime();
+ ksp->ks_snaptime = ksp->ks_crtime;
+ ksp->ks_instance = ks_instance;
+ strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN);
+ ksp->ks_type = ks_type;
+ ksp->ks_flags = ks_flags;
+ ksp->ks_update = kstat_default_update;
+ ksp->ks_private = NULL;
+ ksp->ks_raw_ops.headers = NULL;
+ ksp->ks_raw_ops.data = NULL;
+ ksp->ks_raw_ops.addr = NULL;
+ ksp->ks_raw_buf = NULL;
+ ksp->ks_raw_bufsize = 0;
+ kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ ksp->ks_ndata = 1;
+ ksp->ks_data_size = ks_ndata;
+ break;
+ case KSTAT_TYPE_NAMED:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+ ksp->ks_data = NULL;
+ } else {
+ ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
+ if (ksp->ks_data == NULL) {
+ kmem_free(ksp, sizeof (*ksp));
+ ksp = NULL;
+ }
+ }
+
+ return (ksp);
+}
+EXPORT_SYMBOL(__kstat_create);
+
+static int
+kstat_detect_collision(kstat_proc_entry_t *kpep)
+{
+ kstat_module_t *module;
+ kstat_proc_entry_t *tmp = NULL;
+ char *parent;
+ char *cp;
+
+ parent = kmem_asprintf("%s", kpep->kpe_module);
+
+ if ((cp = strrchr(parent, '/')) == NULL) {
+ kmem_strfree(parent);
+ return (0);
+ }
+
+ cp[0] = '\0';
+ if ((module = kstat_find_module(parent)) != NULL) {
+ list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) {
+ if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) {
+ kmem_strfree(parent);
+ return (EEXIST);
+ }
+ }
+ }
+
+ kmem_strfree(parent);
+ return (0);
+}
+
+/*
+ * Add a file to the proc filesystem under the kstat namespace (i.e.
+ * /proc/spl/kstat/). The file need not be implemented as a
+ * kstat.
+ */
+void
+kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode,
+ const kstat_proc_op_t *proc_ops, void *data)
+{
+ kstat_module_t *module;
+ kstat_proc_entry_t *tmp = NULL;
+
+ ASSERT(kpep);
+
+ mutex_enter(&kstat_module_lock);
+
+ module = kstat_find_module(kpep->kpe_module);
+ if (module == NULL) {
+ if (kstat_detect_collision(kpep) != 0) {
+ cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \
+ " collision", kpep->kpe_module, kpep->kpe_name);
+ goto out;
+ }
+ module = kstat_create_module(kpep->kpe_module);
+ if (module == NULL)
+ goto out;
+ }
+
+ /*
+ * Only one entry by this name per-module, on failure the module
+ * shouldn't be deleted because we know it has at least one entry.
+ */
+ list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) {
+ if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0)
+ goto out;
+ }
+
+ list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list);
+
+ kpep->kpe_owner = module;
+ kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode,
+ module->ksm_proc, proc_ops, data);
+ if (kpep->kpe_proc == NULL) {
+ list_del_init(&kpep->kpe_list);
+ if (list_empty(&module->ksm_kstat_list))
+ kstat_delete_module(module);
+ }
+out:
+ mutex_exit(&kstat_module_lock);
+
+}
+EXPORT_SYMBOL(kstat_proc_entry_install);
+
+void
+__kstat_install(kstat_t *ksp)
+{
+ ASSERT(ksp);
+ mode_t mode;
+ /* Specify permission modes for different kstats */
+ if (strncmp(ksp->ks_proc.kpe_name, "dbufs", KSTAT_STRLEN) == 0) {
+ mode = 0600;
+ } else {
+ mode = 0644;
+ }
+ kstat_proc_entry_install(
+ &ksp->ks_proc, mode, &proc_kstat_operations, ksp);
+}
+EXPORT_SYMBOL(__kstat_install);
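+
+/*
+ * Illustrative sketch (not part of this change): creating and installing a
+ * small KSTAT_TYPE_NAMED kstat through the kstat_create()/kstat_install()
+ * wrappers declared in sys/kstat.h. The module and statistic names are
+ * hypothetical.
+ */
+#if 0
+static kstat_named_t example_stats[] = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *example_ksp;
+
+static void
+example_kstat_setup(void)
+{
+ example_ksp = kstat_create("example", 0, "stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (example_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (example_ksp != NULL) {
+ example_ksp->ks_data = example_stats;
+ kstat_install(example_ksp);
+ }
+}
+#endif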
+
+void
+kstat_proc_entry_delete(kstat_proc_entry_t *kpep)
+{
+ kstat_module_t *module = kpep->kpe_owner;
+ if (kpep->kpe_proc)
+ remove_proc_entry(kpep->kpe_name, module->ksm_proc);
+
+ mutex_enter(&kstat_module_lock);
+ list_del_init(&kpep->kpe_list);
+
+ /*
+ * Remove top level module directory if it wasn't empty before, but now
+ * is.
+ */
+ if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list))
+ kstat_delete_module(module);
+ mutex_exit(&kstat_module_lock);
+
+}
+EXPORT_SYMBOL(kstat_proc_entry_delete);
+
+void
+__kstat_delete(kstat_t *ksp)
+{
+ kstat_proc_entry_delete(&ksp->ks_proc);
+
+ if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
+ kmem_free(ksp->ks_data, ksp->ks_data_size);
+
+ ksp->ks_lock = NULL;
+ mutex_destroy(&ksp->ks_private_lock);
+ kmem_free(ksp, sizeof (*ksp));
+}
+EXPORT_SYMBOL(__kstat_delete);
+
+int
+spl_kstat_init(void)
+{
+ mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL);
+ INIT_LIST_HEAD(&kstat_module_list);
+ kstat_id = 0;
+ return (0);
+}
+
+void
+spl_kstat_fini(void)
+{
+ ASSERT(list_empty(&kstat_module_list));
+ mutex_destroy(&kstat_module_lock);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
new file mode 100644
index 000000000000..3e58598d43f8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
@@ -0,0 +1,790 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Proc Implementation.
+ */
+
+#include <sys/systeminfo.h>
+#include <sys/kstat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/taskq.h>
+#include <sys/proc.h>
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/version.h>
+
+#if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+typedef struct ctl_table __no_const spl_ctl_table;
+#else
+typedef struct ctl_table spl_ctl_table;
+#endif
+
+static unsigned long table_min = 0;
+static unsigned long table_max = ~0;
+
+static struct ctl_table_header *spl_header = NULL;
+static struct proc_dir_entry *proc_spl = NULL;
+static struct proc_dir_entry *proc_spl_kmem = NULL;
+static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
+static struct proc_dir_entry *proc_spl_taskq_all = NULL;
+static struct proc_dir_entry *proc_spl_taskq = NULL;
+struct proc_dir_entry *proc_spl_kstat = NULL;
+
+static int
+proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer,
+ int ubuffer_size)
+{
+ int size;
+
+ if (ubuffer_size > kbuffer_size)
+ return (-EOVERFLOW);
+
+ if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size))
+ return (-EFAULT);
+
+ /* strip trailing whitespace */
+ size = strnlen(kbuffer, ubuffer_size);
+ while (size-- >= 0)
+ if (!isspace(kbuffer[size]))
+ break;
+
+ /* empty string */
+ if (size < 0)
+ return (-EINVAL);
+
+ /* no space to terminate */
+ if (size == kbuffer_size)
+ return (-EOVERFLOW);
+
+ kbuffer[size + 1] = 0;
+ return (0);
+}
+
+static int
+proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer,
+ char *append)
+{
+ /*
+ * NB if 'append' != NULL, it's a single character to append to the
+ * copied out string - usually "\n", for /proc entries and ""
+ * (i.e. a terminating zero byte) for sysctl entries
+ */
+ int size = MIN(strlen(kbuffer), ubuffer_size);
+
+ if (copy_to_user(ubuffer, kbuffer, size))
+ return (-EFAULT);
+
+ if (append != NULL && size < ubuffer_size) {
+ if (copy_to_user(ubuffer + size, append, 1))
+ return (-EFAULT);
+
+ size++;
+ }
+
+ return (size);
+}
+
+#ifdef DEBUG_KMEM
+static int
+proc_domemused(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+ unsigned long min = 0, max = ~0, val;
+ spl_ctl_table dummy = *table;
+
+ dummy.data = &val;
+ dummy.proc_handler = &proc_dointvec;
+ dummy.extra1 = &min;
+ dummy.extra2 = &max;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+#ifdef HAVE_ATOMIC64_T
+ val = atomic64_read((atomic64_t *)table->data);
+#else
+ val = atomic_read((atomic_t *)table->data);
+#endif /* HAVE_ATOMIC64_T */
+ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+ }
+
+ return (rc);
+}
+#endif /* DEBUG_KMEM */
+
+static int
+proc_doslab(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+ unsigned long min = 0, max = ~0, val = 0, mask;
+ spl_ctl_table dummy = *table;
+ spl_kmem_cache_t *skc = NULL;
+
+ dummy.data = &val;
+ dummy.proc_handler = &proc_dointvec;
+ dummy.extra1 = &min;
+ dummy.extra2 = &max;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+ down_read(&spl_kmem_cache_sem);
+ mask = (unsigned long)table->data;
+
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+
+ /* Only use slabs of the correct kmem/vmem type */
+ if (!(skc->skc_flags & mask))
+ continue;
+
+ /* Sum the specified field for selected slabs */
+ switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) {
+ case KMC_TOTAL:
+ val += skc->skc_slab_size * skc->skc_slab_total;
+ break;
+ case KMC_ALLOC:
+ val += skc->skc_obj_size * skc->skc_obj_alloc;
+ break;
+ case KMC_MAX:
+ val += skc->skc_obj_size * skc->skc_obj_max;
+ break;
+ }
+ }
+
+ up_read(&spl_kmem_cache_sem);
+ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+ }
+
+ return (rc);
+}
+
+static int
+proc_dohostid(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int len, rc = 0;
+ char *end, str[32];
+
+ if (write) {
+ /*
+ * We can't use proc_doulongvec_minmax() in the write
+ * case here because hostid, while a hex value, has no
+ * leading 0x, which confuses the helper function.
+ */
+ rc = proc_copyin_string(str, sizeof (str), buffer, *lenp);
+ if (rc < 0)
+ return (rc);
+
+ spl_hostid = simple_strtoul(str, &end, 16);
+ if (str == end)
+ return (-EINVAL);
+
+ } else {
+ len = snprintf(str, sizeof (str), "%lx",
+ (unsigned long) zone_get_hostid(NULL));
+ if (*ppos >= len)
+ rc = 0;
+ else
+ rc = proc_copyout_string(buffer,
+ *lenp, str + *ppos, "\n");
+
+ if (rc >= 0) {
+ *lenp = rc;
+ *ppos += rc;
+ }
+ }
+
+ return (rc);
+}
+
+static void
+taskq_seq_show_headers(struct seq_file *f)
+{
+ seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
+ "taskq", "act", "nthr", "spwn", "maxt", "pri",
+ "mina", "maxa", "cura", "flags");
+}
+
+/* indices into the lheads array below */
+#define LHEAD_PEND 0
+#define LHEAD_PRIO 1
+#define LHEAD_DELAY 2
+#define LHEAD_WAIT 3
+#define LHEAD_ACTIVE 4
+#define LHEAD_SIZE 5
+
+/* BEGIN CSTYLED */
+static unsigned int spl_max_show_tasks = 512;
+module_param(spl_max_show_tasks, uint, 0644);
+MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");
+/* END CSTYLED */
+
+static int
+taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
+{
+ taskq_t *tq = p;
+ taskq_thread_t *tqt = NULL;
+ spl_wait_queue_entry_t *wq;
+ struct task_struct *tsk;
+ taskq_ent_t *tqe;
+ char name[100];
+ struct list_head *lheads[LHEAD_SIZE], *lh;
+ static char *list_names[LHEAD_SIZE] =
+ {"pend", "prio", "delay", "wait", "active" };
+ int i, j, have_lheads = 0;
+ unsigned long wflags, flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);
+
+ /* get the various lists and check whether they're empty */
+ lheads[LHEAD_PEND] = &tq->tq_pend_list;
+ lheads[LHEAD_PRIO] = &tq->tq_prio_list;
+ lheads[LHEAD_DELAY] = &tq->tq_delay_list;
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+ lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
+#else
+ lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
+#endif
+ lheads[LHEAD_ACTIVE] = &tq->tq_active_list;
+
+ for (i = 0; i < LHEAD_SIZE; ++i) {
+ if (list_empty(lheads[i]))
+ lheads[i] = NULL;
+ else
+ ++have_lheads;
+ }
+
+ /* early return in non-"all" mode if lists are all empty */
+ if (!allflag && !have_lheads) {
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return (0);
+ }
+
+ /* unlock the waitq quickly */
+ if (!lheads[LHEAD_WAIT])
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+
+ /* show the base taskq contents */
+ snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
+ seq_printf(f, "%-25s ", name);
+ seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
+ tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
+ tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
+ tq->tq_nalloc, tq->tq_flags);
+
+ /* show the active list */
+ if (lheads[LHEAD_ACTIVE]) {
+ j = 0;
+ list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[LHEAD_ACTIVE]);
+ else if (j == 2) {
+ seq_printf(f, "\n\t ");
+ j = 0;
+ }
+ seq_printf(f, " [%d]%pf(%ps)",
+ tqt->tqt_thread->pid,
+ tqt->tqt_task->tqent_func,
+ tqt->tqt_task->tqent_arg);
+ ++j;
+ }
+ seq_printf(f, "\n");
+ }
+
+ for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
+ if (lheads[i]) {
+ j = 0;
+ list_for_each(lh, lheads[i]) {
+ if (spl_max_show_tasks != 0 &&
+ j >= spl_max_show_tasks) {
+ seq_printf(f, "\n\t(truncated)");
+ break;
+ }
+ /* show the wait waitq list */
+ if (i == LHEAD_WAIT) {
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+ wq = list_entry(lh,
+ spl_wait_queue_entry_t, entry);
+#else
+ wq = list_entry(lh,
+ spl_wait_queue_entry_t, task_list);
+#endif
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[i]);
+ else if (j % 8 == 0)
+ seq_printf(f, "\n\t ");
+
+ tsk = wq->private;
+ seq_printf(f, " %d", tsk->pid);
+ /* pend, prio and delay lists */
+ } else {
+ tqe = list_entry(lh, taskq_ent_t,
+ tqent_list);
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[i]);
+ else if (j % 2 == 0)
+ seq_printf(f, "\n\t ");
+
+ seq_printf(f, " %pf(%ps)",
+ tqe->tqent_func,
+ tqe->tqent_arg);
+ }
+ ++j;
+ }
+ seq_printf(f, "\n");
+ }
+ if (lheads[LHEAD_WAIT])
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (0);
+}
+
+static int
+taskq_all_seq_show(struct seq_file *f, void *p)
+{
+ return (taskq_seq_show_impl(f, p, B_TRUE));
+}
+
+static int
+taskq_seq_show(struct seq_file *f, void *p)
+{
+ return (taskq_seq_show_impl(f, p, B_FALSE));
+}
+
+static void *
+taskq_seq_start(struct seq_file *f, loff_t *pos)
+{
+ struct list_head *p;
+ loff_t n = *pos;
+
+ down_read(&tq_list_sem);
+ if (!n)
+ taskq_seq_show_headers(f);
+
+ p = tq_list.next;
+ while (n--) {
+ p = p->next;
+ if (p == &tq_list)
+ return (NULL);
+ }
+
+ return (list_entry(p, taskq_t, tq_taskqs));
+}
+
+static void *
+taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ taskq_t *tq = p;
+
+ ++*pos;
+ return ((tq->tq_taskqs.next == &tq_list) ?
+ NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
+}
+
+static void
+slab_seq_show_headers(struct seq_file *f)
+{
+ seq_printf(f,
+ "--------------------- cache ----------"
+ "--------------------------------------------- "
+ "----- slab ------ "
+ "---- object ----- "
+ "--- emergency ---\n");
+ seq_printf(f,
+ "name "
+ " flags size alloc slabsize objsize "
+ "total alloc max "
+ "total alloc max "
+ "dlock alloc max\n");
+}
+
+static int
+slab_seq_show(struct seq_file *f, void *p)
+{
+ spl_kmem_cache_t *skc = p;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ if (skc->skc_flags & KMC_SLAB) {
+ /*
+ * This cache is backed by a generic Linux kmem cache which
+ * has its own accounting. For these caches we only track
+ * the number of active allocated objects that exist within
+ * the underlying Linux slabs. For the overall statistics of
+ * the underlying Linux cache please refer to /proc/slabinfo.
+ */
+ spin_lock(&skc->skc_lock);
+ uint64_t objs_allocated =
+ percpu_counter_sum(&skc->skc_linux_alloc);
+ seq_printf(f, "%-36s ", skc->skc_name);
+ seq_printf(f, "0x%05lx %9s %9lu %8s %8u "
+ "%5s %5s %5s %5s %5lu %5s %5s %5s %5s\n",
+ (long unsigned)skc->skc_flags,
+ "-",
+ (long unsigned)(skc->skc_obj_size * objs_allocated),
+ "-",
+ (unsigned)skc->skc_obj_size,
+ "-", "-", "-", "-",
+ (long unsigned)objs_allocated,
+ "-", "-", "-", "-");
+ spin_unlock(&skc->skc_lock);
+ return (0);
+ }
+
+ spin_lock(&skc->skc_lock);
+ seq_printf(f, "%-36s ", skc->skc_name);
+ seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
+ "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
+ (long unsigned)skc->skc_flags,
+ (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
+ (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
+ (unsigned)skc->skc_slab_size,
+ (unsigned)skc->skc_obj_size,
+ (long unsigned)skc->skc_slab_total,
+ (long unsigned)skc->skc_slab_alloc,
+ (long unsigned)skc->skc_slab_max,
+ (long unsigned)skc->skc_obj_total,
+ (long unsigned)skc->skc_obj_alloc,
+ (long unsigned)skc->skc_obj_max,
+ (long unsigned)skc->skc_obj_deadlock,
+ (long unsigned)skc->skc_obj_emergency,
+ (long unsigned)skc->skc_obj_emergency_max);
+ spin_unlock(&skc->skc_lock);
+ return (0);
+}
+
+static void *
+slab_seq_start(struct seq_file *f, loff_t *pos)
+{
+ struct list_head *p;
+ loff_t n = *pos;
+
+ down_read(&spl_kmem_cache_sem);
+ if (!n)
+ slab_seq_show_headers(f);
+
+ p = spl_kmem_cache_list.next;
+ while (n--) {
+ p = p->next;
+ if (p == &spl_kmem_cache_list)
+ return (NULL);
+ }
+
+ return (list_entry(p, spl_kmem_cache_t, skc_list));
+}
+
+static void *
+slab_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ spl_kmem_cache_t *skc = p;
+
+ ++*pos;
+ return ((skc->skc_list.next == &spl_kmem_cache_list) ?
+ NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list));
+}
+
+static void
+slab_seq_stop(struct seq_file *f, void *v)
+{
+ up_read(&spl_kmem_cache_sem);
+}
+
+static struct seq_operations slab_seq_ops = {
+ .show = slab_seq_show,
+ .start = slab_seq_start,
+ .next = slab_seq_next,
+ .stop = slab_seq_stop,
+};
+
+static int
+proc_slab_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &slab_seq_ops));
+}
+
+static const kstat_proc_op_t proc_slab_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+ .proc_open = proc_slab_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+#else
+ .open = proc_slab_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+#endif
+};
+
+static void
+taskq_seq_stop(struct seq_file *f, void *v)
+{
+ up_read(&tq_list_sem);
+}
+
+static struct seq_operations taskq_all_seq_ops = {
+ .show = taskq_all_seq_show,
+ .start = taskq_seq_start,
+ .next = taskq_seq_next,
+ .stop = taskq_seq_stop,
+};
+
+static struct seq_operations taskq_seq_ops = {
+ .show = taskq_seq_show,
+ .start = taskq_seq_start,
+ .next = taskq_seq_next,
+ .stop = taskq_seq_stop,
+};
+
+static int
+proc_taskq_all_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &taskq_all_seq_ops));
+}
+
+static int
+proc_taskq_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &taskq_seq_ops));
+}
+
+static const kstat_proc_op_t proc_taskq_all_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+ .proc_open = proc_taskq_all_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+#else
+ .open = proc_taskq_all_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+#endif
+};
+
+static const kstat_proc_op_t proc_taskq_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+ .proc_open = proc_taskq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+#else
+ .open = proc_taskq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+#endif
+};
+
+static struct ctl_table spl_kmem_table[] = {
+#ifdef DEBUG_KMEM
+ {
+ .procname = "kmem_used",
+ .data = &kmem_alloc_used,
+#ifdef HAVE_ATOMIC64_T
+ .maxlen = sizeof (atomic64_t),
+#else
+ .maxlen = sizeof (atomic_t),
+#endif /* HAVE_ATOMIC64_T */
+ .mode = 0444,
+ .proc_handler = &proc_domemused,
+ },
+ {
+ .procname = "kmem_max",
+ .data = &kmem_alloc_max,
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+#endif /* DEBUG_KMEM */
+ {
+ .procname = "slab_kvmem_total",
+ .data = (void *)(KMC_KVMEM | KMC_TOTAL),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_kvmem_alloc",
+ .data = (void *)(KMC_KVMEM | KMC_ALLOC),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_kvmem_max",
+ .data = (void *)(KMC_KVMEM | KMC_MAX),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {},
+};
+
+static struct ctl_table spl_kstat_table[] = {
+ {},
+};
+
+static struct ctl_table spl_table[] = {
+ /*
+ * NB No .strategy entries have been provided since
+ * sysctl(8) prefers to go via /proc for portability.
+ */
+ {
+ .procname = "gitrev",
+ .data = spl_gitrev,
+ .maxlen = sizeof (spl_gitrev),
+ .mode = 0444,
+ .proc_handler = &proc_dostring,
+ },
+ {
+ .procname = "hostid",
+ .data = &spl_hostid,
+ .maxlen = sizeof (unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_dohostid,
+ },
+ {
+ .procname = "kmem",
+ .mode = 0555,
+ .child = spl_kmem_table,
+ },
+ {
+ .procname = "kstat",
+ .mode = 0555,
+ .child = spl_kstat_table,
+ },
+ {},
+};
+
+static struct ctl_table spl_dir[] = {
+ {
+ .procname = "spl",
+ .mode = 0555,
+ .child = spl_table,
+ },
+ {}
+};
+
+static struct ctl_table spl_root[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = spl_dir,
+ },
+ {}
+};
+
+int
+spl_proc_init(void)
+{
+ int rc = 0;
+
+ spl_header = register_sysctl_table(spl_root);
+ if (spl_header == NULL)
+ return (-EUNATCH);
+
+ proc_spl = proc_mkdir("spl", NULL);
+ if (proc_spl == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
+ &proc_taskq_all_operations, NULL);
+ if (proc_spl_taskq_all == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
+ &proc_taskq_operations, NULL);
+ if (proc_spl_taskq == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kmem = proc_mkdir("kmem", proc_spl);
+ if (proc_spl_kmem == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem,
+ &proc_slab_operations, NULL);
+ if (proc_spl_kmem_slab == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kstat = proc_mkdir("kstat", proc_spl);
+ if (proc_spl_kstat == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+out:
+ if (rc) {
+ remove_proc_entry("kstat", proc_spl);
+ remove_proc_entry("slab", proc_spl_kmem);
+ remove_proc_entry("kmem", proc_spl);
+ remove_proc_entry("taskq-all", proc_spl);
+ remove_proc_entry("taskq", proc_spl);
+ remove_proc_entry("spl", NULL);
+ unregister_sysctl_table(spl_header);
+ }
+
+ return (rc);
+}
+
+void
+spl_proc_fini(void)
+{
+ remove_proc_entry("kstat", proc_spl);
+ remove_proc_entry("slab", proc_spl_kmem);
+ remove_proc_entry("kmem", proc_spl);
+ remove_proc_entry("taskq-all", proc_spl);
+ remove_proc_entry("taskq", proc_spl);
+ remove_proc_entry("spl", NULL);
+
+ ASSERT(spl_header != NULL);
+ unregister_sysctl_table(spl_header);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c
new file mode 100644
index 000000000000..cae13228c62c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c
@@ -0,0 +1,284 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/procfs_list.h>
+#include <linux/proc_fs.h>
+
+/*
+ * A procfs_list is a wrapper around a linked list which implements the seq_file
+ * interface, allowing the contents of the list to be exposed through procfs.
+ * The kernel already has some utilities to help implement the seq_file
+ * interface for linked lists (seq_list_*), but they aren't appropriate for use
+ * with lists that have many entries, because seq_list_start walks the list at
+ * the start of each read syscall to find where it left off, so reading a file
+ * ends up being quadratic in the number of entries in the list.
+ *
+ * This implementation avoids this penalty by maintaining a separate cursor into
+ * the list per instance of the file that is open. It also maintains some extra
+ * information in each node of the list to prevent reads of entries that have
+ * been dropped from the list.
+ *
+ * Callers should only add elements to the list using procfs_list_add, which
+ * adds an element to the tail of the list. Other operations can be performed
+ * directly on the wrapped list using the normal list manipulation functions,
+ * but elements should only be removed from the head of the list.
+ */
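+
+/*
+ * Illustrative sketch (not part of this change): how a consumer typically
+ * wires up a procfs_list with procfs_list_install() and appends entries with
+ * procfs_list_add(), as described above. The entry type, show callback and
+ * names are hypothetical.
+ */
+#if 0
+typedef struct example_entry {
+ procfs_list_node_t ee_node; /* must be embedded in each entry */
+ uint64_t ee_value;
+} example_entry_t;
+
+static procfs_list_t example_pl;
+
+static int
+example_show(struct seq_file *f, void *p)
+{
+ example_entry_t *ee = p;
+ seq_printf(f, "%llu\n", (u_longlong_t)ee->ee_value);
+ return (0);
+}
+
+static void
+example_install(void)
+{
+ procfs_list_install("example", NULL, "entries", 0444, &example_pl,
+ example_show, NULL, NULL, offsetof(example_entry_t, ee_node));
+}
+
+static void
+example_append(example_entry_t *ee)
+{
+ mutex_enter(&example_pl.pl_lock);
+ procfs_list_add(&example_pl, ee);
+ mutex_exit(&example_pl.pl_lock);
+}
+#endif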
+
+#define NODE_ID(procfs_list, obj) \
+ (((procfs_list_node_t *)(((char *)obj) + \
+ (procfs_list)->pl_node_offset))->pln_id)
+
+typedef struct procfs_list_cursor {
+ procfs_list_t *procfs_list; /* List into which this cursor points */
+ void *cached_node; /* Most recently accessed node */
+ loff_t cached_pos; /* Position of cached_node */
+} procfs_list_cursor_t;
+
+static int
+procfs_list_seq_show(struct seq_file *f, void *p)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+
+ ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+ if (p == SEQ_START_TOKEN) {
+ if (procfs_list->pl_show_header != NULL)
+ return (procfs_list->pl_show_header(f));
+ else
+ return (0);
+ }
+ return (procfs_list->pl_show(f, p));
+}
+
+static void *
+procfs_list_next_node(procfs_list_cursor_t *cursor, loff_t *pos)
+{
+ void *next_node;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+
+ if (cursor->cached_node == SEQ_START_TOKEN)
+ next_node = list_head(&procfs_list->pl_list);
+ else
+ next_node = list_next(&procfs_list->pl_list,
+ cursor->cached_node);
+
+ if (next_node != NULL) {
+ cursor->cached_node = next_node;
+ cursor->cached_pos = NODE_ID(procfs_list, cursor->cached_node);
+ *pos = cursor->cached_pos;
+ } else {
+ /*
+ * seq_read() expects ->next() to update the position even
+ * when there are no more entries. Advance the position to
+ * prevent a warning from being logged.
+ */
+ cursor->cached_node = NULL;
+ cursor->cached_pos++;
+ *pos = cursor->cached_pos;
+ }
+
+ return (next_node);
+}
+
+static void *
+procfs_list_seq_start(struct seq_file *f, loff_t *pos)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+
+ mutex_enter(&procfs_list->pl_lock);
+
+ if (*pos == 0) {
+ cursor->cached_node = SEQ_START_TOKEN;
+ cursor->cached_pos = 0;
+ return (SEQ_START_TOKEN);
+ } else if (cursor->cached_node == NULL) {
+ return (NULL);
+ }
+
+ /*
+ * Check if our cached pointer has become stale, which happens if the
+ * message where we left off has been dropped from the list since
+ * the last read syscall completed.
+ */
+ void *oldest_node = list_head(&procfs_list->pl_list);
+ if (cursor->cached_node != SEQ_START_TOKEN && (oldest_node == NULL ||
+ NODE_ID(procfs_list, oldest_node) > cursor->cached_pos))
+ return (ERR_PTR(-EIO));
+
+ /*
+ * If it isn't starting from the beginning of the file, the seq_file
+ * code will either pick up at the same position it visited last or the
+ * following one.
+ */
+ if (*pos == cursor->cached_pos) {
+ return (cursor->cached_node);
+ } else {
+ ASSERT3U(*pos, ==, cursor->cached_pos + 1);
+ return (procfs_list_next_node(cursor, pos));
+ }
+}
+
+static void *
+procfs_list_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ ASSERT(MUTEX_HELD(&cursor->procfs_list->pl_lock));
+ return (procfs_list_next_node(cursor, pos));
+}
+
+static void
+procfs_list_seq_stop(struct seq_file *f, void *p)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+ mutex_exit(&procfs_list->pl_lock);
+}
+
+static struct seq_operations procfs_list_seq_ops = {
+ .show = procfs_list_seq_show,
+ .start = procfs_list_seq_start,
+ .next = procfs_list_seq_next,
+ .stop = procfs_list_seq_stop,
+};
+
+static int
+procfs_list_open(struct inode *inode, struct file *filp)
+{
+ int rc = seq_open_private(filp, &procfs_list_seq_ops,
+ sizeof (procfs_list_cursor_t));
+ if (rc != 0)
+ return (rc);
+
+ struct seq_file *f = filp->private_data;
+ procfs_list_cursor_t *cursor = f->private;
+ cursor->procfs_list = PDE_DATA(inode);
+ cursor->cached_node = NULL;
+ cursor->cached_pos = 0;
+
+ return (0);
+}
+
+static ssize_t
+procfs_list_write(struct file *filp, const char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct seq_file *f = filp->private_data;
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+ int rc;
+
+ if (procfs_list->pl_clear != NULL &&
+ (rc = procfs_list->pl_clear(procfs_list)) != 0)
+ return (-rc);
+ return (len);
+}
+
+static const kstat_proc_op_t procfs_list_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+ .proc_open = procfs_list_open,
+ .proc_write = procfs_list_write,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release_private,
+#else
+ .open = procfs_list_open,
+ .write = procfs_list_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+#endif
+};
+
+/*
+ * Initialize a procfs_list and create a file for it in the proc filesystem
+ * under the kstat namespace.
+ */
+void
+procfs_list_install(const char *module,
+ const char *submodule,
+ const char *name,
+ mode_t mode,
+ procfs_list_t *procfs_list,
+ int (*show)(struct seq_file *f, void *p),
+ int (*show_header)(struct seq_file *f),
+ int (*clear)(procfs_list_t *procfs_list),
+ size_t procfs_list_node_off)
+{
+ char *modulestr;
+
+ if (submodule != NULL)
+ modulestr = kmem_asprintf("%s/%s", module, submodule);
+ else
+ modulestr = kmem_asprintf("%s", module);
+ mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&procfs_list->pl_list,
+ procfs_list_node_off + sizeof (procfs_list_node_t),
+ procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+ procfs_list->pl_next_id = 1; /* Save id 0 for SEQ_START_TOKEN */
+ procfs_list->pl_show = show;
+ procfs_list->pl_show_header = show_header;
+ procfs_list->pl_clear = clear;
+ procfs_list->pl_node_offset = procfs_list_node_off;
+
+ kstat_proc_entry_init(&procfs_list->pl_kstat_entry, modulestr, name);
+ kstat_proc_entry_install(&procfs_list->pl_kstat_entry, mode,
+ &procfs_list_operations, procfs_list);
+ kmem_strfree(modulestr);
+}
+EXPORT_SYMBOL(procfs_list_install);
+
+/* Remove the proc filesystem file corresponding to the given list */
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{
+ kstat_proc_entry_delete(&procfs_list->pl_kstat_entry);
+}
+EXPORT_SYMBOL(procfs_list_uninstall);
+
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+ ASSERT(list_is_empty(&procfs_list->pl_list));
+ list_destroy(&procfs_list->pl_list);
+ mutex_destroy(&procfs_list->pl_lock);
+}
+EXPORT_SYMBOL(procfs_list_destroy);
+
+/*
+ * Add a new node to the tail of the list. While the standard list manipulation
+ * functions can be used for all other operations, adding elements to the list
+ * should only be done using this helper so that the id of the new node is set
+ * correctly.
+ */
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+ ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+ NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+ list_insert_tail(&procfs_list->pl_list, p);
+}
+EXPORT_SYMBOL(procfs_list_add);
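Putting the pieces together, a consumer would typically install the list once, append entries with procfs_list_add() under pl_lock, and tear everything down on unload. The sketch below reuses the hypothetical my_entry_t/my_clear() from the earlier sketch; the module name, file name, and my_* helpers are illustrative assumptions, not part of this change.

static procfs_list_t my_list;

static int
my_show(struct seq_file *f, void *p)
{
	my_entry_t *e = p;

	seq_printf(f, "%llu\n", (u_longlong_t)e->me_payload);
	return (0);
}

static void
my_init(void)
{
	/* Typically ends up under /proc/spl/kstat/my_module/my_list. */
	procfs_list_install("my_module", NULL, "my_list", 0644, &my_list,
	    my_show, NULL, my_clear, offsetof(my_entry_t, me_node));
}

static void
my_log(uint64_t payload)
{
	my_entry_t *e = kmem_alloc(sizeof (*e), KM_SLEEP);

	e->me_payload = payload;
	mutex_enter(&my_list.pl_lock);
	procfs_list_add(&my_list, e);
	mutex_exit(&my_list.pl_lock);
}

static void
my_fini(void)
{
	procfs_list_uninstall(&my_list);
	(void) my_clear(&my_list);	/* free any remaining entries */
	procfs_list_destroy(&my_list);
}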
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
new file mode 100644
index 000000000000..61631256c858
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
@@ -0,0 +1,1428 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Task Queue Implementation.
+ */
+
+#include <sys/timer.h>
+#include <sys/taskq.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+#include <sys/trace_spl.h>
+#ifdef HAVE_CPU_HOTPLUG
+#include <linux/cpuhotplug.h>
+#endif
+
+int spl_taskq_thread_bind = 0;
+module_param(spl_taskq_thread_bind, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
+
+
+int spl_taskq_thread_dynamic = 1;
+module_param(spl_taskq_thread_dynamic, int, 0444);
+MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads");
+
+int spl_taskq_thread_priority = 1;
+module_param(spl_taskq_thread_priority, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_priority,
+ "Allow non-default priority for taskq threads");
+
+int spl_taskq_thread_sequential = 4;
+module_param(spl_taskq_thread_sequential, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_sequential,
+ "Create new taskq threads after N sequential tasks");
+
+/* Global system-wide dynamic task queue available for all consumers */
+taskq_t *system_taskq;
+EXPORT_SYMBOL(system_taskq);
+/* Global dynamic task queue for long delay */
+taskq_t *system_delay_taskq;
+EXPORT_SYMBOL(system_delay_taskq);
+
+/* Private dedicated taskq for creating new taskq threads on demand. */
+static taskq_t *dynamic_taskq;
+static taskq_thread_t *taskq_thread_create(taskq_t *);
+
+#ifdef HAVE_CPU_HOTPLUG
+/* Multi-callback id for cpu hotplugging. */
+static int spl_taskq_cpuhp_state;
+#endif
+
+/* List of all taskqs */
+LIST_HEAD(tq_list);
+struct rw_semaphore tq_list_sem;
+static uint_t taskq_tsd;
+
+static int
+task_km_flags(uint_t flags)
+{
+ if (flags & TQ_NOSLEEP)
+ return (KM_NOSLEEP);
+
+ if (flags & TQ_PUSHPAGE)
+ return (KM_PUSHPAGE);
+
+ return (KM_SLEEP);
+}
+
+/*
+ * taskq_find_by_name - Find the largest instance number of a named taskq.
+ */
+static int
+taskq_find_by_name(const char *name)
+{
+ struct list_head *tql = NULL;
+ taskq_t *tq;
+
+ list_for_each_prev(tql, &tq_list) {
+ tq = list_entry(tql, taskq_t, tq_taskqs);
+ if (strcmp(name, tq->tq_name) == 0)
+ return (tq->tq_instance);
+ }
+ return (-1);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, returns a taskq_ent_t which
+ * is not attached to the free, work, or pending taskq lists.
+ */
+static taskq_ent_t *
+task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags)
+{
+ taskq_ent_t *t;
+ int count = 0;
+
+ ASSERT(tq);
+retry:
+ /* Acquire taskq_ent_t's from free list if available */
+ if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) {
+ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL));
+ ASSERT(!timer_pending(&t->tqent_timer));
+
+ list_del_init(&t->tqent_list);
+ return (t);
+ }
+
+ /* Free list is empty and memory allocations are prohibited */
+ if (flags & TQ_NOALLOC)
+ return (NULL);
+
+ /* Hit maximum taskq_ent_t pool size */
+ if (tq->tq_nalloc >= tq->tq_maxalloc) {
+ if (flags & TQ_NOSLEEP)
+ return (NULL);
+
+ /*
+ * Sleep periodically polling the free list for an available
+ * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed
+		 * but we cannot block forever waiting for a taskq_ent_t to
+ * show up in the free list, otherwise a deadlock can happen.
+ *
+ * Therefore, we need to allocate a new task even if the number
+ * of allocated tasks is above tq->tq_maxalloc, but we still
+ * end up delaying the task allocation by one second, thereby
+ * throttling the task dispatch rate.
+ */
+ spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+ schedule_timeout(HZ / 100);
+ spin_lock_irqsave_nested(&tq->tq_lock, *irqflags,
+ tq->tq_lock_class);
+ if (count < 100) {
+ count++;
+ goto retry;
+ }
+ }
+
+ spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+ t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags));
+ spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class);
+
+ if (t) {
+ taskq_init_ent(t);
+ tq->tq_nalloc++;
+ }
+
+ return (t);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t
+ * to already be removed from the free, work, or pending taskq lists.
+ */
+static void
+task_free(taskq_t *tq, taskq_ent_t *t)
+{
+ ASSERT(tq);
+ ASSERT(t);
+ ASSERT(list_empty(&t->tqent_list));
+ ASSERT(!timer_pending(&t->tqent_timer));
+
+ kmem_free(t, sizeof (taskq_ent_t));
+ tq->tq_nalloc--;
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, either destroys the
+ * taskq_ent_t if too many exist or moves it to the free list for later use.
+ */
+static void
+task_done(taskq_t *tq, taskq_ent_t *t)
+{
+ ASSERT(tq);
+ ASSERT(t);
+
+ /* Wake tasks blocked in taskq_wait_id() */
+ wake_up_all(&t->tqent_waitq);
+
+ list_del_init(&t->tqent_list);
+
+ if (tq->tq_nalloc <= tq->tq_minalloc) {
+ t->tqent_id = TASKQID_INVALID;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+
+ list_add_tail(&t->tqent_list, &tq->tq_free_list);
+ } else {
+ task_free(tq, t);
+ }
+}
+
+/*
+ * When a delayed task timer expires remove it from the delay list and
+ * add it to the priority list in order for immediate processing.
+ */
+static void
+task_expire_impl(taskq_ent_t *t)
+{
+ taskq_ent_t *w;
+ taskq_t *tq = t->tqent_taskq;
+ struct list_head *l = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+ if (t->tqent_flags & TQENT_FLAG_CANCEL) {
+ ASSERT(list_empty(&t->tqent_list));
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return;
+ }
+
+ t->tqent_birth = jiffies;
+ DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
+
+ /*
+ * The priority list must be maintained in strict task id order
+ * from lowest to highest for lowest_id to be easily calculable.
+ */
+ list_del(&t->tqent_list);
+ list_for_each_prev(l, &tq->tq_prio_list) {
+ w = list_entry(l, taskq_ent_t, tqent_list);
+ if (w->tqent_id < t->tqent_id) {
+ list_add(&t->tqent_list, l);
+ break;
+ }
+ }
+ if (l == &tq->tq_prio_list)
+ list_add(&t->tqent_list, &tq->tq_prio_list);
+
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ wake_up(&tq->tq_work_waitq);
+}
+
+static void
+task_expire(spl_timer_list_t tl)
+{
+ struct timer_list *tmr = (struct timer_list *)tl;
+ taskq_ent_t *t = from_timer(t, tmr, tqent_timer);
+ task_expire_impl(t);
+}
+
+/*
+ * Returns the lowest incomplete taskqid_t. The taskqid_t may
+ * be queued on the pending list, on the priority list, on the
+ * delay list, or on the work list currently being handled, but
+ * it is not 100% complete yet.
+ */
+static taskqid_t
+taskq_lowest_id(taskq_t *tq)
+{
+ taskqid_t lowest_id = tq->tq_next_id;
+ taskq_ent_t *t;
+ taskq_thread_t *tqt;
+
+ if (!list_empty(&tq->tq_pend_list)) {
+ t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_prio_list)) {
+ t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_delay_list)) {
+ t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_active_list)) {
+ tqt = list_entry(tq->tq_active_list.next, taskq_thread_t,
+ tqt_active_list);
+ ASSERT(tqt->tqt_id != TASKQID_INVALID);
+ lowest_id = MIN(lowest_id, tqt->tqt_id);
+ }
+
+ return (lowest_id);
+}
+
+/*
+ * Insert a task into a list keeping the list sorted by increasing taskqid.
+ */
+static void
+taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt)
+{
+ taskq_thread_t *w;
+ struct list_head *l = NULL;
+
+ ASSERT(tq);
+ ASSERT(tqt);
+
+ list_for_each_prev(l, &tq->tq_active_list) {
+ w = list_entry(l, taskq_thread_t, tqt_active_list);
+ if (w->tqt_id < tqt->tqt_id) {
+ list_add(&tqt->tqt_active_list, l);
+ break;
+ }
+ }
+ if (l == &tq->tq_active_list)
+ list_add(&tqt->tqt_active_list, &tq->tq_active_list);
+}
+
+/*
+ * Find and return a task from the given list if it exists. The list
+ * must be in lowest to highest task id order.
+ */
+static taskq_ent_t *
+taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id)
+{
+ struct list_head *l = NULL;
+ taskq_ent_t *t;
+
+ list_for_each(l, lh) {
+ t = list_entry(l, taskq_ent_t, tqent_list);
+
+ if (t->tqent_id == id)
+ return (t);
+
+ if (t->tqent_id > id)
+ break;
+ }
+
+ return (NULL);
+}
+
+/*
+ * Find an already dispatched task given the task id regardless of what
+ * state it is in. If a task is still pending it will be returned.
+ * If a task is executing, then ERR_PTR(-EBUSY) will be returned instead.
+ * If the task has already been run then NULL is returned.
+ */
+static taskq_ent_t *
+taskq_find(taskq_t *tq, taskqid_t id)
+{
+ taskq_thread_t *tqt;
+ struct list_head *l = NULL;
+ taskq_ent_t *t;
+
+ t = taskq_find_list(tq, &tq->tq_delay_list, id);
+ if (t)
+ return (t);
+
+ t = taskq_find_list(tq, &tq->tq_prio_list, id);
+ if (t)
+ return (t);
+
+ t = taskq_find_list(tq, &tq->tq_pend_list, id);
+ if (t)
+ return (t);
+
+ list_for_each(l, &tq->tq_active_list) {
+ tqt = list_entry(l, taskq_thread_t, tqt_active_list);
+ if (tqt->tqt_id == id) {
+ /*
+ * Instead of returning tqt_task, we just return a non
+ * NULL value to prevent misuse, since tqt_task only
+ * has two valid fields.
+ */
+ return (ERR_PTR(-EBUSY));
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and
+ * taskq_wait() functions below.
+ *
+ * Taskq waiting is accomplished by tracking the lowest outstanding task
+ * id and the next available task id. As tasks are dispatched they are
+ * added to the tail of the pending, priority, or delay lists. As worker
+ * threads become available the tasks are removed from the heads of these
+ * lists and linked to the worker threads. This ensures the lists are
+ * kept sorted by lowest to highest task id.
+ *
+ * Therefore the lowest outstanding task id can be quickly determined by
+ * checking the head item from all of these lists. This value is stored
+ * with the taskq as the lowest id. It only needs to be recalculated when
+ * either the task with the current lowest id completes or is canceled.
+ *
+ * By blocking until the lowest task id exceeds the passed task id the
+ * taskq_wait_outstanding() function can be easily implemented. Similarly,
+ * by blocking until the lowest task id matches the next task id taskq_wait()
+ * can be implemented.
+ *
+ * Callers should be aware that when there are multiple worker threads it
+ * is possible for larger task ids to complete before smaller ones. Also
+ * when the taskq contains delay tasks with small task ids callers may
+ * block for a considerable length of time waiting for them to expire and
+ * execute.
+ */
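For example, under these semantics a caller can dispatch a batch of tasks and then block until everything dispatched so far has completed. This is a hedged sketch; my_work() and its arguments are placeholders, not functions defined in this file.

static void
my_work(void *arg)
{
	/* ... perform one unit of work ... */
}

static void
my_batch(taskq_t *tq, void *args[], int n)
{
	int i;

	for (i = 0; i < n; i++)
		VERIFY3U(taskq_dispatch(tq, my_work, args[i], TQ_SLEEP), !=,
		    TASKQID_INVALID);

	/* Block until every task dispatched above has finished. */
	taskq_wait_outstanding(tq, 0);
}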
+static int
+taskq_wait_id_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (taskq_find(tq, id) == NULL);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_id() function blocks until the passed task id completes.
+ * This does not guarantee that all lower task ids have completed.
+ */
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_id);
+
+static int
+taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (id < tq->tq_lowest_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_outstanding() function will block until all tasks with a
+ * lower taskqid than the passed 'id' have been completed. Note that all
+ * task id's are assigned monotonically at dispatch time. Zero may be
+ * passed for the id to indicate all tasks dispatch up to this point,
+ * but not after, should be waited for.
+ */
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
+{
+ id = id ? id : tq->tq_next_id - 1;
+ wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_outstanding);
+
+static int
+taskq_wait_check(taskq_t *tq)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (tq->tq_lowest_id == tq->tq_next_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait() function will block until the taskq is empty.
+ * This means that if a taskq re-dispatches work to itself taskq_wait()
+ * callers will block indefinitely.
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
+}
+EXPORT_SYMBOL(taskq_wait);
+
+int
+taskq_member(taskq_t *tq, kthread_t *t)
+{
+ return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t));
+}
+EXPORT_SYMBOL(taskq_member);
+
+taskq_t *
+taskq_of_curthread(void)
+{
+ return (tsd_get(taskq_tsd));
+}
+EXPORT_SYMBOL(taskq_of_curthread);
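These two helpers are mostly useful for assertions; for instance, a function that must only ever run from a worker thread of a particular taskq could check that as sketched below (the argument convention is hypothetical).

static void
my_task(void *arg)
{
	taskq_t *tq = arg;	/* assume the dispatching taskq is passed in */

	/* Verify we are running in the context of one of tq's workers. */
	ASSERT(taskq_member(tq, curthread));
	ASSERT3P(taskq_of_curthread(), ==, tq);
}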
+
+/*
+ * Cancel an already dispatched task given the task id. Still pending tasks
+ * will be immediately canceled, and if the task is active the function will
+ * block until it completes. Preallocated tasks which are canceled must be
+ * freed by the caller.
+ */
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t id)
+{
+ taskq_ent_t *t;
+ int rc = ENOENT;
+ unsigned long flags;
+
+ ASSERT(tq);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ t = taskq_find(tq, id);
+ if (t && t != ERR_PTR(-EBUSY)) {
+ list_del_init(&t->tqent_list);
+ t->tqent_flags |= TQENT_FLAG_CANCEL;
+
+ /*
+ * When canceling the lowest outstanding task id we
+ * must recalculate the new lowest outstanding id.
+ */
+ if (tq->tq_lowest_id == t->tqent_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, t->tqent_id);
+ }
+
+ /*
+		 * The task_expire() function takes the tq->tq_lock so drop
+		 * the lock before synchronously cancelling the timer.
+ */
+ if (timer_pending(&t->tqent_timer)) {
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ del_timer_sync(&t->tqent_timer);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ if (!(t->tqent_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ rc = 0;
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ if (t == ERR_PTR(-EBUSY)) {
+ taskq_wait_id(tq, id);
+ rc = EBUSY;
+ }
+
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_cancel_id);
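A caller holding on to the taskqid returned at dispatch time can then cancel it and distinguish the three possible outcomes, roughly as in this sketch (how each case is handled is up to the consumer):

static void
my_cancel(taskq_t *tq, taskqid_t id)
{
	int error = taskq_cancel_id(tq, id);

	if (error == 0) {
		/* Pending task was removed before it ever ran. */
	} else if (error == EBUSY) {
		/* Task was already running; the call waited for it. */
	} else {
		/* ENOENT: task already completed (or was never queued). */
		ASSERT3S(error, ==, ENOENT);
	}
}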
+
+static int taskq_thread_spawn(taskq_t *tq);
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+ taskq_ent_t *t;
+ taskqid_t rc = TASKQID_INVALID;
+ unsigned long irqflags;
+
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+	/* Do not queue the task unless there is an idle thread for it */
+ ASSERT(tq->tq_nactive <= tq->tq_nthreads);
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out;
+ }
+
+ if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+ goto out;
+
+ spin_lock(&t->tqent_lock);
+
+ /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
+ if (flags & TQ_NOQUEUE)
+ list_add(&t->tqent_list, &tq->tq_prio_list);
+ /* Queue to the priority list instead of the pending list */
+ else if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = rc = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+ t->tqent_timer.function = NULL;
+ t->tqent_timer.expires = 0;
+
+ t->tqent_birth = jiffies;
+ DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch);
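Callers dispatching from a context that cannot sleep must be prepared for TASKQID_INVALID, since a TQ_NOSLEEP allocation may fail. A common pattern, sketched here with the placeholder my_work() from the earlier example, is to fall back to running the function synchronously:

static void
my_dispatch_nosleep(taskq_t *tq, void *arg)
{
	if (taskq_dispatch(tq, my_work, arg, TQ_NOSLEEP) == TASKQID_INVALID)
		my_work(arg);	/* fall back to synchronous execution */
}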
+
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
+ uint_t flags, clock_t expire_time)
+{
+ taskqid_t rc = TASKQID_INVALID;
+ taskq_ent_t *t;
+ unsigned long irqflags;
+
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+ goto out;
+
+ spin_lock(&t->tqent_lock);
+
+ /* Queue to the delay list for subsequent execution */
+ list_add_tail(&t->tqent_list, &tq->tq_delay_list);
+
+ t->tqent_id = rc = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+ t->tqent_timer.function = task_expire;
+ t->tqent_timer.expires = (unsigned long)expire_time;
+ add_timer(&t->tqent_timer);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ spin_unlock(&t->tqent_lock);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch_delay);
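The expire_time argument is an absolute tick count; it is written straight into tqent_timer.expires, so callers normally derive it from the current lbolt value. A hedged sketch, with an arbitrary five second delay and the placeholder my_work() again:

static taskqid_t
my_dispatch_later(taskq_t *tq, void *arg)
{
	/* ddi_get_lbolt() returns the current jiffies-based tick count. */
	clock_t when = ddi_get_lbolt() + 5 * HZ;

	return (taskq_dispatch_delay(tq, my_work, arg, TQ_SLEEP, when));
}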
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+ taskq_ent_t *t)
+{
+ unsigned long irqflags;
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+ tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE)) {
+ t->tqent_id = TASKQID_INVALID;
+ goto out;
+ }
+
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out2;
+ flags |= TQ_FRONT;
+ }
+
+ spin_lock(&t->tqent_lock);
+
+ /*
+ * Make sure the entry is not on some other taskq; it is important to
+ * ASSERT() under lock
+ */
+ ASSERT(taskq_empty_ent(t));
+
+ /*
+ * Mark it as a prealloc'd task. This is important
+ * to ensure that we don't free it later.
+ */
+ t->tqent_flags |= TQENT_FLAG_PREALLOC;
+
+ /* Queue to the priority list instead of the pending list */
+ if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+
+ t->tqent_birth = jiffies;
+ DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+out2:
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+}
+EXPORT_SYMBOL(taskq_dispatch_ent);
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+ return (list_empty(&t->tqent_list));
+}
+EXPORT_SYMBOL(taskq_empty_ent);
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+ spin_lock_init(&t->tqent_lock);
+ init_waitqueue_head(&t->tqent_waitq);
+ timer_setup(&t->tqent_timer, NULL, 0);
+ INIT_LIST_HEAD(&t->tqent_list);
+ t->tqent_id = 0;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+ t->tqent_taskq = NULL;
+}
+EXPORT_SYMBOL(taskq_init_ent);
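Preallocated entries skip the task_alloc() path entirely. The usual pattern is to embed a taskq_ent_t in a longer-lived structure, initialize it once, and hand it to taskq_dispatch_ent() whenever work needs to be queued; the my_obj_t structure below is a hypothetical illustration reusing the placeholder my_work().

typedef struct my_obj {
	taskq_ent_t mo_tqent;	/* embedded, preallocated entry */
	int mo_data;
} my_obj_t;

static void
my_obj_init(my_obj_t *obj)
{
	taskq_init_ent(&obj->mo_tqent);
}

static void
my_obj_queue(taskq_t *tq, my_obj_t *obj)
{
	/* The entry must not already be queued on some taskq. */
	ASSERT(taskq_empty_ent(&obj->mo_tqent));
	taskq_dispatch_ent(tq, my_work, obj, TQ_SLEEP, &obj->mo_tqent);
}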
+
+/*
+ * Return the next pending task, preference is given to tasks on the
+ * priority list which were dispatched with TQ_FRONT.
+ */
+static taskq_ent_t *
+taskq_next_ent(taskq_t *tq)
+{
+ struct list_head *list;
+
+ if (!list_empty(&tq->tq_prio_list))
+ list = &tq->tq_prio_list;
+ else if (!list_empty(&tq->tq_pend_list))
+ list = &tq->tq_pend_list;
+ else
+ return (NULL);
+
+ return (list_entry(list->next, taskq_ent_t, tqent_list));
+}
+
+/*
+ * Spawns a new thread for the specified taskq.
+ */
+static void
+taskq_thread_spawn_task(void *arg)
+{
+ taskq_t *tq = (taskq_t *)arg;
+ unsigned long flags;
+
+ if (taskq_thread_create(tq) == NULL) {
+ /* restore spawning count if failed */
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nspawn--;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+}
+
+/*
+ * Spawn additional threads for dynamic taskqs (TASKQ_DYNAMIC) when the
+ * current number of threads is insufficient to handle the pending tasks. These
+ * new threads must be created by the dedicated dynamic_taskq to avoid
+ * deadlocks between thread creation and memory reclaim. The system_taskq
+ * which is also a dynamic taskq cannot be safely used for this.
+ */
+static int
+taskq_thread_spawn(taskq_t *tq)
+{
+ int spawning = 0;
+
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
+ (tq->tq_flags & TASKQ_ACTIVE)) {
+ spawning = (++tq->tq_nspawn);
+ taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
+ tq, TQ_NOSLEEP);
+ }
+
+ return (spawning);
+}
+
+/*
+ * Threads in a dynamic taskq should only exit once it has been completely
+ * drained and no other threads are actively servicing tasks. This prevents
+ * threads from being created and destroyed more than is required.
+ *
+ * The first thread in the thread list is treated as the primary thread.
+ * There is nothing special about the primary thread, but in order to keep
+ * the taskq pids from changing we opt to make it long running.
+ */
+static int
+taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
+{
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
+ tqt_thread_list) == tqt)
+ return (0);
+
+ return
+ ((tq->tq_nspawn == 0) && /* No threads are being spawned */
+ (tq->tq_nactive == 0) && /* No threads are handling tasks */
+ (tq->tq_nthreads > 1) && /* More than 1 thread is running */
+ (!taskq_next_ent(tq)) && /* There are no pending tasks */
+ (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
+}
+
+static int
+taskq_thread(void *args)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ sigset_t blocked;
+ taskq_thread_t *tqt = args;
+ taskq_t *tq;
+ taskq_ent_t *t;
+ int seq_tasks = 0;
+ unsigned long flags;
+ taskq_ent_t dup_task = {};
+
+ ASSERT(tqt);
+ ASSERT(tqt->tqt_tq);
+ tq = tqt->tqt_tq;
+ current->flags |= PF_NOFREEZE;
+
+ (void) spl_fstrans_mark();
+
+ sigfillset(&blocked);
+ sigprocmask(SIG_BLOCK, &blocked, NULL);
+ flush_signals(current);
+
+ tsd_set(taskq_tsd, tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ /*
+ * If we are dynamically spawned, decrease spawning count. Note that
+ * we could be created during taskq_create, in which case we shouldn't
+ * do the decrement. But it's fine because taskq_create will reset
+ * tq_nspawn later.
+ */
+ if (tq->tq_flags & TASKQ_DYNAMIC)
+ tq->tq_nspawn--;
+
+ /* Immediately exit if more threads than allowed were created. */
+ if (tq->tq_nthreads >= tq->tq_maxthreads)
+ goto error;
+
+ tq->tq_nthreads++;
+ list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list);
+ wake_up(&tq->tq_wait_waitq);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+
+ if (list_empty(&tq->tq_pend_list) &&
+ list_empty(&tq->tq_prio_list)) {
+
+ if (taskq_thread_should_stop(tq, tqt)) {
+ wake_up_all(&tq->tq_wait_waitq);
+ break;
+ }
+
+ add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ schedule();
+ seq_tasks = 0;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ remove_wait_queue(&tq->tq_work_waitq, &wait);
+ } else {
+ __set_current_state(TASK_RUNNING);
+ }
+
+ if ((t = taskq_next_ent(tq)) != NULL) {
+ list_del_init(&t->tqent_list);
+
+ /*
+ * A TQENT_FLAG_PREALLOC task may be reused or freed
+ * during the task function call. Store tqent_id and
+ * tqent_flags here.
+ *
+ * Also use an on stack taskq_ent_t for tqt_task
+ * assignment in this case; we want to make sure
+ * to duplicate all fields, so the values are
+ * correct when it's accessed via DTRACE_PROBE*.
+ */
+ tqt->tqt_id = t->tqent_id;
+ tqt->tqt_flags = t->tqent_flags;
+
+ if (t->tqent_flags & TQENT_FLAG_PREALLOC) {
+ dup_task = *t;
+ t = &dup_task;
+ }
+ tqt->tqt_task = t;
+
+ taskq_insert_in_order(tq, tqt);
+ tq->tq_nactive++;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t);
+
+ /* Perform the requested task */
+ t->tqent_func(t->tqent_arg);
+
+ DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nactive--;
+ list_del_init(&tqt->tqt_active_list);
+ tqt->tqt_task = NULL;
+
+ /* For prealloc'd tasks, we don't free anything. */
+ if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ /*
+ * When the current lowest outstanding taskqid is
+			 * done, recalculate the new lowest outstanding id.
+ */
+ if (tq->tq_lowest_id == tqt->tqt_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id);
+ }
+
+ /* Spawn additional taskq threads if required. */
+ if ((++seq_tasks) > spl_taskq_thread_sequential &&
+ taskq_thread_spawn(tq))
+ seq_tasks = 0;
+
+ tqt->tqt_id = TASKQID_INVALID;
+ tqt->tqt_flags = 0;
+ wake_up_all(&tq->tq_wait_waitq);
+ } else {
+ if (taskq_thread_should_stop(tq, tqt))
+ break;
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ }
+
+ __set_current_state(TASK_RUNNING);
+ tq->tq_nthreads--;
+ list_del_init(&tqt->tqt_thread_list);
+error:
+ kmem_free(tqt, sizeof (taskq_thread_t));
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ tsd_set(taskq_tsd, NULL);
+ thread_exit();
+
+ return (0);
+}
+
+static taskq_thread_t *
+taskq_thread_create(taskq_t *tq)
+{
+ static int last_used_cpu = 0;
+ taskq_thread_t *tqt;
+
+ tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE);
+ INIT_LIST_HEAD(&tqt->tqt_thread_list);
+ INIT_LIST_HEAD(&tqt->tqt_active_list);
+ tqt->tqt_tq = tq;
+ tqt->tqt_id = TASKQID_INVALID;
+
+ tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt,
+ "%s", tq->tq_name);
+ if (tqt->tqt_thread == NULL) {
+ kmem_free(tqt, sizeof (taskq_thread_t));
+ return (NULL);
+ }
+
+ if (spl_taskq_thread_bind) {
+ last_used_cpu = (last_used_cpu + 1) % num_online_cpus();
+ kthread_bind(tqt->tqt_thread, last_used_cpu);
+ }
+
+ if (spl_taskq_thread_priority)
+ set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri));
+
+ wake_up_process(tqt->tqt_thread);
+
+ return (tqt);
+}
+
+taskq_t *
+taskq_create(const char *name, int threads_arg, pri_t pri,
+ int minalloc, int maxalloc, uint_t flags)
+{
+ taskq_t *tq;
+ taskq_thread_t *tqt;
+ int count = 0, rc = 0, i;
+ unsigned long irqflags;
+ int nthreads = threads_arg;
+
+ ASSERT(name != NULL);
+ ASSERT(minalloc >= 0);
+ ASSERT(maxalloc <= INT_MAX);
+ ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */
+
+ /* Scale the number of threads using nthreads as a percentage */
+ if (flags & TASKQ_THREADS_CPU_PCT) {
+ ASSERT(nthreads <= 100);
+ ASSERT(nthreads >= 0);
+ nthreads = MIN(threads_arg, 100);
+ nthreads = MAX(nthreads, 0);
+		nthreads = MAX((num_online_cpus() * nthreads) / 100, 1);
+ }
+
+ tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE);
+ if (tq == NULL)
+ return (NULL);
+
+ tq->tq_hp_support = B_FALSE;
+#ifdef HAVE_CPU_HOTPLUG
+ if (flags & TASKQ_THREADS_CPU_PCT) {
+ tq->tq_hp_support = B_TRUE;
+ if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state,
+ &tq->tq_hp_cb_node) != 0) {
+ kmem_free(tq, sizeof (*tq));
+ return (NULL);
+ }
+ }
+#endif
+
+ spin_lock_init(&tq->tq_lock);
+ INIT_LIST_HEAD(&tq->tq_thread_list);
+ INIT_LIST_HEAD(&tq->tq_active_list);
+ tq->tq_name = kmem_strdup(name);
+ tq->tq_nactive = 0;
+ tq->tq_nthreads = 0;
+ tq->tq_nspawn = 0;
+ tq->tq_maxthreads = nthreads;
+ tq->tq_cpu_pct = threads_arg;
+ tq->tq_pri = pri;
+ tq->tq_minalloc = minalloc;
+ tq->tq_maxalloc = maxalloc;
+ tq->tq_nalloc = 0;
+ tq->tq_flags = (flags | TASKQ_ACTIVE);
+ tq->tq_next_id = TASKQID_INITIAL;
+ tq->tq_lowest_id = TASKQID_INITIAL;
+ INIT_LIST_HEAD(&tq->tq_free_list);
+ INIT_LIST_HEAD(&tq->tq_pend_list);
+ INIT_LIST_HEAD(&tq->tq_prio_list);
+ INIT_LIST_HEAD(&tq->tq_delay_list);
+ init_waitqueue_head(&tq->tq_work_waitq);
+ init_waitqueue_head(&tq->tq_wait_waitq);
+ tq->tq_lock_class = TQ_LOCK_GENERAL;
+ INIT_LIST_HEAD(&tq->tq_taskqs);
+
+ if (flags & TASKQ_PREPOPULATE) {
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+ tq->tq_lock_class);
+
+ for (i = 0; i < minalloc; i++)
+ task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW,
+ &irqflags));
+
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ }
+
+ if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic)
+ nthreads = 1;
+
+ for (i = 0; i < nthreads; i++) {
+ tqt = taskq_thread_create(tq);
+ if (tqt == NULL)
+ rc = 1;
+ else
+ count++;
+ }
+
+ /* Wait for all threads to be started before potential destroy */
+ wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count);
+ /*
+	 * taskq_thread might have touched nspawn, but these threads are not
+	 * dynamically spawned so they should not count towards it. Reset it
+	 * to 0.
+ */
+ tq->tq_nspawn = 0;
+
+ if (rc) {
+ taskq_destroy(tq);
+ tq = NULL;
+ } else {
+ down_write(&tq_list_sem);
+ tq->tq_instance = taskq_find_by_name(name) + 1;
+ list_add_tail(&tq->tq_taskqs, &tq_list);
+ up_write(&tq_list_sem);
+ }
+
+ return (tq);
+}
+EXPORT_SYMBOL(taskq_create);
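As an example of the sizing logic above, a consumer that wants roughly half as many workers as online CPUs, created on demand, might set up and later tear down its queue as sketched here (the queue name and percentage are illustrative assumptions):

static taskq_t *my_tq;

static int
my_module_init(void)
{
	/* 50% of online CPUs; worker threads are spawned on demand. */
	my_tq = taskq_create("my_taskq", 50, defclsyspri, boot_ncpus,
	    INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
	if (my_tq == NULL)
		return (ENOMEM);

	return (0);
}

static void
my_module_fini(void)
{
	taskq_wait(my_tq);	/* drain anything still queued */
	taskq_destroy(my_tq);
}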
+
+void
+taskq_destroy(taskq_t *tq)
+{
+ struct task_struct *thread;
+ taskq_thread_t *tqt;
+ taskq_ent_t *t;
+ unsigned long flags;
+
+ ASSERT(tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ tq->tq_flags &= ~TASKQ_ACTIVE;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+#ifdef HAVE_CPU_HOTPLUG
+ if (tq->tq_hp_support) {
+ VERIFY0(cpuhp_state_remove_instance_nocalls(
+ spl_taskq_cpuhp_state, &tq->tq_hp_cb_node));
+ }
+#endif
+ /*
+ * When TASKQ_ACTIVE is clear new tasks may not be added nor may
+	 * new worker threads be spawned for a dynamic taskq.
+ */
+ if (dynamic_taskq != NULL)
+ taskq_wait_outstanding(dynamic_taskq, 0);
+
+ taskq_wait(tq);
+
+ /* remove taskq from global list used by the kstats */
+ down_write(&tq_list_sem);
+ list_del(&tq->tq_taskqs);
+ up_write(&tq_list_sem);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ /* wait for spawning threads to insert themselves to the list */
+ while (tq->tq_nspawn) {
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ schedule_timeout_interruptible(1);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ /*
+ * Signal each thread to exit and block until it does. Each thread
+ * is responsible for removing itself from the list and freeing its
+ * taskq_thread_t. This allows for idle threads to opt to remove
+ * themselves from the taskq. They can be recreated as needed.
+ */
+ while (!list_empty(&tq->tq_thread_list)) {
+ tqt = list_entry(tq->tq_thread_list.next,
+ taskq_thread_t, tqt_thread_list);
+ thread = tqt->tqt_thread;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ kthread_stop(thread);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ while (!list_empty(&tq->tq_free_list)) {
+ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ list_del_init(&t->tqent_list);
+ task_free(tq, t);
+ }
+
+ ASSERT0(tq->tq_nthreads);
+ ASSERT0(tq->tq_nalloc);
+ ASSERT0(tq->tq_nspawn);
+ ASSERT(list_empty(&tq->tq_thread_list));
+ ASSERT(list_empty(&tq->tq_active_list));
+ ASSERT(list_empty(&tq->tq_free_list));
+ ASSERT(list_empty(&tq->tq_pend_list));
+ ASSERT(list_empty(&tq->tq_prio_list));
+ ASSERT(list_empty(&tq->tq_delay_list));
+
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ kmem_strfree(tq->tq_name);
+ kmem_free(tq, sizeof (taskq_t));
+}
+EXPORT_SYMBOL(taskq_destroy);
+
+static unsigned int spl_taskq_kick = 0;
+
+/*
+ * 2.6.36 API Change
+ * module_param_cb is introduced to take kernel_param_ops and
+ * module_param_call is marked as obsolete. Also set and get operations
+ * were changed to take a 'const struct kernel_param *'.
+ */
+static int
+#ifdef module_param_cb
+param_set_taskq_kick(const char *val, const struct kernel_param *kp)
+#else
+param_set_taskq_kick(const char *val, struct kernel_param *kp)
+#endif
+{
+ int ret;
+ taskq_t *tq = NULL;
+ taskq_ent_t *t;
+ unsigned long flags;
+
+ ret = param_set_uint(val, kp);
+ if (ret < 0 || !spl_taskq_kick)
+ return (ret);
+ /* reset value */
+ spl_taskq_kick = 0;
+
+ down_read(&tq_list_sem);
+ list_for_each_entry(tq, &tq_list, tq_taskqs) {
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ /* Check if the first pending is older than 5 seconds */
+ t = taskq_next_ent(tq);
+ if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) {
+ (void) taskq_thread_spawn(tq);
+ printk(KERN_INFO "spl: Kicked taskq %s/%d\n",
+ tq->tq_name, tq->tq_instance);
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+ up_read(&tq_list_sem);
+ return (ret);
+}
+
+#ifdef module_param_cb
+static const struct kernel_param_ops param_ops_taskq_kick = {
+ .set = param_set_taskq_kick,
+ .get = param_get_uint,
+};
+module_param_cb(spl_taskq_kick, &param_ops_taskq_kick, &spl_taskq_kick, 0644);
+#else
+module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
+ &spl_taskq_kick, 0644);
+#endif
+MODULE_PARM_DESC(spl_taskq_kick,
+ "Write nonzero to kick stuck taskqs to spawn more threads");
+
+#ifdef HAVE_CPU_HOTPLUG
+/*
+ * This callback will be called exactly once for each core that comes online,
+ * for each dynamic taskq. We attempt to expand taskqs that have
+ * TASKQ_THREADS_CPU_PCT set. We need to redo the percentage calculation every
+ * time, to correctly determine whether or not to add a thread.
+ */
+static int
+spl_taskq_expand(unsigned int cpu, struct hlist_node *node)
+{
+ taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
+ unsigned long flags;
+ int err = 0;
+
+ ASSERT(tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+ int nthreads = MIN(tq->tq_cpu_pct, 100);
+ nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1);
+ tq->tq_maxthreads = nthreads;
+
+ if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) &&
+ tq->tq_maxthreads > tq->tq_nthreads) {
+ ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads + 1);
+ taskq_thread_t *tqt = taskq_thread_create(tq);
+ if (tqt == NULL)
+ err = -1;
+ }
+
+out:
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return (err);
+}
+
+/*
+ * While we don't support offlining CPUs, it is possible that CPUs will fail
+ * to online successfully. We do need to be able to handle this case
+ * gracefully.
+ */
+static int
+spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node)
+{
+ taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
+ unsigned long flags;
+
+ ASSERT(tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+ int nthreads = MIN(tq->tq_cpu_pct, 100);
+ nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1);
+ tq->tq_maxthreads = nthreads;
+
+ if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) &&
+ tq->tq_maxthreads < tq->tq_nthreads) {
+ ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1);
+ taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next,
+ taskq_thread_t, tqt_thread_list);
+ struct task_struct *thread = tqt->tqt_thread;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ kthread_stop(thread);
+
+ return (0);
+ }
+
+out:
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return (0);
+}
+#endif
+
+int
+spl_taskq_init(void)
+{
+ init_rwsem(&tq_list_sem);
+ tsd_create(&taskq_tsd, NULL);
+
+#ifdef HAVE_CPU_HOTPLUG
+ spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+ "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down);
+#endif
+
+ system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_taskq == NULL)
+ return (1);
+
+ system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_delay_taskq == NULL) {
+#ifdef HAVE_CPU_HOTPLUG
+ cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+#endif
+ taskq_destroy(system_taskq);
+ return (1);
+ }
+
+ dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+ if (dynamic_taskq == NULL) {
+#ifdef HAVE_CPU_HOTPLUG
+ cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+#endif
+ taskq_destroy(system_taskq);
+ taskq_destroy(system_delay_taskq);
+ return (1);
+ }
+
+ /*
+ * This is used to annotate tq_lock, so
+ * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch
+ * does not trigger a lockdep warning re: possible recursive locking
+ */
+ dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
+
+ return (0);
+}
+
+void
+spl_taskq_fini(void)
+{
+ taskq_destroy(dynamic_taskq);
+ dynamic_taskq = NULL;
+
+ taskq_destroy(system_delay_taskq);
+ system_delay_taskq = NULL;
+
+ taskq_destroy(system_taskq);
+ system_taskq = NULL;
+
+ tsd_destroy(&taskq_tsd);
+
+#ifdef HAVE_CPU_HOTPLUG
+ cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+ spl_taskq_cpuhp_state = 0;
+#endif
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
new file mode 100644
index 000000000000..db23fb64a298
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Thread Implementation.
+ */
+
+#include <sys/thread.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+
+/*
+ * Thread interfaces
+ */
+typedef struct thread_priv_s {
+ unsigned long tp_magic; /* Magic */
+ int tp_name_size; /* Name size */
+ char *tp_name; /* Name (without _thread suffix) */
+ void (*tp_func)(void *); /* Registered function */
+ void *tp_args; /* Args to be passed to function */
+ size_t tp_len; /* Len to be passed to function */
+ int tp_state; /* State to start thread at */
+ pri_t tp_pri; /* Priority to start threat at */
+} thread_priv_t;
+
+static int
+thread_generic_wrapper(void *arg)
+{
+ thread_priv_t *tp = (thread_priv_t *)arg;
+ void (*func)(void *);
+ void *args;
+
+ ASSERT(tp->tp_magic == TP_MAGIC);
+ func = tp->tp_func;
+ args = tp->tp_args;
+ set_current_state(tp->tp_state);
+ set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
+ kmem_free(tp->tp_name, tp->tp_name_size);
+ kmem_free(tp, sizeof (thread_priv_t));
+
+ if (func)
+ func(args);
+
+ return (0);
+}
+
+void
+__thread_exit(void)
+{
+ tsd_exit();
+ complete_and_exit(NULL, 0);
+ /* Unreachable */
+}
+EXPORT_SYMBOL(__thread_exit);
+
+/*
+ * thread_create() may block forever if it cannot create a thread or
+ * allocate memory. This is preferable to returning a NULL which Solaris
+ * style callers likely never check for... since it can't fail.
+ */
+kthread_t *
+__thread_create(caddr_t stk, size_t stksize, thread_func_t func,
+ const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri)
+{
+ thread_priv_t *tp;
+ struct task_struct *tsk;
+ char *p;
+
+ /* Option pp is simply ignored */
+ /* Variable stack size unsupported */
+ ASSERT(stk == NULL);
+
+ tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE);
+ if (tp == NULL)
+ return (NULL);
+
+ tp->tp_magic = TP_MAGIC;
+ tp->tp_name_size = strlen(name) + 1;
+
+ tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE);
+ if (tp->tp_name == NULL) {
+ kmem_free(tp, sizeof (thread_priv_t));
+ return (NULL);
+ }
+
+ strncpy(tp->tp_name, name, tp->tp_name_size);
+
+ /*
+ * Strip trailing "_thread" from passed name which will be the func
+ * name since the exposed API has no parameter for passing a name.
+ */
+ p = strstr(tp->tp_name, "_thread");
+ if (p)
+ p[0] = '\0';
+
+ tp->tp_func = func;
+ tp->tp_args = args;
+ tp->tp_len = len;
+ tp->tp_state = state;
+ tp->tp_pri = pri;
+
+ tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp,
+ "%s", tp->tp_name);
+ if (IS_ERR(tsk))
+ return (NULL);
+
+ wake_up_process(tsk);
+ return ((kthread_t *)tsk);
+}
+EXPORT_SYMBOL(__thread_create);
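In practice this is reached through the SPL's thread_create() compatibility wrappers; a direct call, using the signature above, would look roughly like the following sketch (my_worker_thread() is a placeholder and the priority is arbitrary):

static void
my_worker_thread(void *arg)
{
	/* ... perform work ... */
	thread_exit();
}

static kthread_t *
my_start_worker(void *arg)
{
	/*
	 * Creates a runnable kernel thread named "my_worker"; the
	 * "_thread" suffix is stripped from the supplied name.
	 */
	return (__thread_create(NULL, 0, my_worker_thread,
	    "my_worker_thread", arg, 0, NULL, TS_RUN, minclsyspri));
}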
+
+/*
+ * spl_kthread_create - Wrapper providing pre-3.13 semantics for
+ * kthread_create() in which it is not killable and less likely
+ * to return -ENOMEM.
+ */
+struct task_struct *
+spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...)
+{
+ struct task_struct *tsk;
+ va_list args;
+ char name[TASK_COMM_LEN];
+
+ va_start(args, namefmt);
+ vsnprintf(name, sizeof (name), namefmt, args);
+ va_end(args);
+ do {
+ tsk = kthread_create(func, data, "%s", name);
+ if (IS_ERR(tsk)) {
+ if (signal_pending(current)) {
+ clear_thread_flag(TIF_SIGPENDING);
+ continue;
+ }
+ if (PTR_ERR(tsk) == -ENOMEM)
+ continue;
+ return (NULL);
+ } else {
+ return (tsk);
+ }
+ } while (1);
+}
+EXPORT_SYMBOL(spl_kthread_create);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
new file mode 100644
index 000000000000..7912a381294d
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
@@ -0,0 +1,33 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Each DTRACE_PROBE must define its trace point in one (and only one)
+ * source file, so this dummy file exists for that purpose.
+ */
+
+#include <sys/taskq.h>
+
+#ifdef _KERNEL
+#define CREATE_TRACE_POINTS
+#include <sys/trace.h>
+#include <sys/trace_taskq.h>
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
new file mode 100644
index 000000000000..546db9ab8bd7
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
@@ -0,0 +1,719 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Solaris Porting Layer (SPL) Thread Specific Data Implementation.
+ *
+ * Thread specific data is implemented using a hash table, which avoids
+ * the need to add a member to the task structure and allows maximum
+ * portability between kernels. This implementation has been optimized
+ * to keep the tsd_set() and tsd_get() times as small as possible.
+ *
+ * The majority of the entries in the hash table are for specific tsd
+ * entries. These entries are hashed by the product of their key and
+ * pid because by design the key and pid are guaranteed to be unique.
+ * Their product also has the desirable property that it will be uniformly
+ * distributed over the hash bins provided neither the pid nor key is zero.
+ * Under Linux the zero pid is reserved for the idle task and thus won't be
+ * used, and this implementation is careful never to assign a zero key.
+ * By default the hash table is sized to 512 bins which is expected to
+ * be sufficient for light to moderate usage of thread specific data.
+ *
+ * The hash table contains two additional types of entries. The first
+ * type is called a 'key' entry and it is added to the hash during
+ * tsd_create(). It is used to store the address of the destructor function
+ * and it is used as an anchor point. All tsd entries which use the same
+ * key will be linked to this entry. This is used during tsd_destroy() to
+ * quickly call the destructor function for all tsd associated with the key.
+ * The 'key' entry may be looked up with tsd_hash_search() by passing the
+ * key you wish to lookup and DTOR_PID constant as the pid.
+ *
+ * The second type of entry is called a 'pid' entry and it is added to the
+ * hash the first time a process sets a key. The 'pid' entry is also used
+ * as an anchor and all tsd for the process will be linked to it. This
+ * list is used during tsd_exit() to ensure all registered destructors
+ * are run for the process. The 'pid' entry may be looked up with
+ * tsd_hash_search() by passing the PID_KEY constant as the key, and
+ * the process pid. Note that tsd_exit() is called by thread_exit()
+ * so if you're using the Solaris thread API you should not need to call
+ * tsd_exit() directly.
+ *
+ */
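From a consumer's point of view the interesting entry points are just tsd_create(), tsd_set(), tsd_get(), and tsd_destroy(); a hedged usage sketch follows (the key, value, and destructor below are placeholders, not part of this file):

static uint_t my_tsd_key;

static void
my_tsd_dtor(void *value)
{
	kmem_free(value, sizeof (uint64_t));
}

static void
my_tsd_example(void)
{
	uint64_t *v;

	tsd_create(&my_tsd_key, my_tsd_dtor);

	/* Associate a value with the current thread. */
	v = kmem_alloc(sizeof (*v), KM_SLEEP);
	*v = 42;
	VERIFY0(tsd_set(my_tsd_key, v));

	/* Later, from the same thread. */
	ASSERT3P(tsd_get(my_tsd_key), ==, v);

	/* Run any registered destructors and retire the key. */
	tsd_destroy(&my_tsd_key);
}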
+
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/tsd.h>
+#include <linux/hash.h>
+
+typedef struct tsd_hash_bin {
+ spinlock_t hb_lock;
+ struct hlist_head hb_head;
+} tsd_hash_bin_t;
+
+typedef struct tsd_hash_table {
+ spinlock_t ht_lock;
+ uint_t ht_bits;
+ uint_t ht_key;
+ tsd_hash_bin_t *ht_bins;
+} tsd_hash_table_t;
+
+typedef struct tsd_hash_entry {
+ uint_t he_key;
+ pid_t he_pid;
+ dtor_func_t he_dtor;
+ void *he_value;
+ struct hlist_node he_list;
+ struct list_head he_key_list;
+ struct list_head he_pid_list;
+} tsd_hash_entry_t;
+
+static tsd_hash_table_t *tsd_hash_table = NULL;
+
+
+/*
+ * tsd_hash_search - searches hash table for tsd_hash_entry
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static tsd_hash_entry_t *
+tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid)
+{
+ struct hlist_node *node = NULL;
+ tsd_hash_entry_t *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+ hlist_for_each(node, &bin->hb_head) {
+ entry = list_entry(node, tsd_hash_entry_t, he_list);
+ if ((entry->he_key == key) && (entry->he_pid == pid)) {
+ spin_unlock(&bin->hb_lock);
+ return (entry);
+ }
+ }
+
+ spin_unlock(&bin->hb_lock);
+ return (NULL);
+}
+
+/*
+ * tsd_hash_dtor - call the destructor and free all entries on the list
+ * @work: list of hash entries
+ *
+ * For a list of entries which have all already been removed from the
+ * hash call their registered destructor then free the associated memory.
+ */
+static void
+tsd_hash_dtor(struct hlist_head *work)
+{
+ tsd_hash_entry_t *entry;
+
+ while (!hlist_empty(work)) {
+ entry = hlist_entry(work->first, tsd_hash_entry_t, he_list);
+ hlist_del(&entry->he_list);
+
+ if (entry->he_dtor && entry->he_pid != DTOR_PID)
+ entry->he_dtor(entry->he_value);
+
+ kmem_free(entry, sizeof (tsd_hash_entry_t));
+ }
+}
+
+/*
+ * tsd_hash_add - adds an entry to hash table
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ *
+ * The caller is responsible for ensuring the unique key/pid do not
+ * already exist in the hash table. This is possible because all entries
+ * are thread specific, thus a concurrent thread will never attempt to
+ * add this key/pid. Because multiple bins must be checked to add
+ * links to the dtor and pid entries the entire table is locked.
+ */
+static int
+tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
+{
+ tsd_hash_entry_t *entry, *dtor_entry, *pid_entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int rc = 0;
+
+ ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);
+
+ /* New entry allocate structure, set value, and add to hash */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ entry->he_key = key;
+ entry->he_pid = pid;
+ entry->he_value = value;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ spin_lock(&table->ht_lock);
+
+ /* Destructor entry must exist for all valid keys */
+ dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID);
+ ASSERT3P(dtor_entry, !=, NULL);
+ entry->he_dtor = dtor_entry->he_dtor;
+
+ /* Process entry must exist for all valid processes */
+ pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid);
+ ASSERT3P(pid_entry, !=, NULL);
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ /* Add to the hash, key, and pid lists */
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+ list_add(&entry->he_key_list, &dtor_entry->he_key_list);
+ list_add(&entry->he_pid_list, &pid_entry->he_pid_list);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (rc);
+}
+
+/*
+ * tsd_hash_add_key - adds a destructor entry to the hash table
+ * @table: hash table
+ * @keyp: search key
+ * @dtor: key destructor
+ *
+ * For every unique key there is a single entry in the hash which is used
+ * as an anchor. All other thread specific entries for this key are linked
+ * to this anchor via the 'he_key_list' list head. On return *keyp
+ * will be set to the next available key for the hash table.
+ */
+static int
+tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor)
+{
+ tsd_hash_entry_t *tmp_entry, *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int keys_checked = 0;
+
+ ASSERT3P(table, !=, NULL);
+
+ /* Allocate entry to be used as a destructor for this key */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ /* Determine next available key value */
+ spin_lock(&table->ht_lock);
+ do {
+ /* Limited to TSD_KEYS_MAX concurrent unique keys */
+ if (table->ht_key++ > TSD_KEYS_MAX)
+ table->ht_key = 1;
+
+ /* Ensure failure when all TSD_KEYS_MAX keys are in use */
+ if (keys_checked++ >= TSD_KEYS_MAX) {
+ spin_unlock(&table->ht_lock);
+ return (ENOENT);
+ }
+
+ tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID);
+ } while (tmp_entry);
+
+ /* Add destructor entry in to hash table */
+ entry->he_key = *keyp = table->ht_key;
+ entry->he_pid = DTOR_PID;
+ entry->he_dtor = dtor;
+ entry->he_value = NULL;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (0);
+}
+
+/*
+ * tsd_hash_add_pid - adds a process entry to the hash table
+ * @table: hash table
+ * @pid: search pid
+ *
+ * For every process there is a single entry in the hash which is used
+ * as an anchor. All other thread specific entries for this process are
+ * linked to this anchor via the 'he_pid_list' list head.
+ */
+static int
+tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid)
+{
+ tsd_hash_entry_t *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+
+ /* Allocate entry to be used as the process reference */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ spin_lock(&table->ht_lock);
+ entry->he_key = PID_KEY;
+ entry->he_pid = pid;
+ entry->he_dtor = NULL;
+ entry->he_value = NULL;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (0);
+}
+
+/*
+ * tsd_hash_del - delete an entry from hash table, key, and pid lists
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static void
+tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry)
+{
+ hlist_del(&entry->he_list);
+ list_del_init(&entry->he_key_list);
+ list_del_init(&entry->he_pid_list);
+}
+
+/*
+ * tsd_hash_table_init - allocate a hash table
+ * @bits: hash table size
+ *
+ * A hash table with 2^bits bins will be created. It may not be resized
+ * after the fact and must be freed with tsd_hash_table_fini().
+ */
+static tsd_hash_table_t *
+tsd_hash_table_init(uint_t bits)
+{
+ tsd_hash_table_t *table;
+ int hash, size = (1 << bits);
+
+ table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP);
+ if (table == NULL)
+ return (NULL);
+
+ table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP);
+ if (table->ht_bins == NULL) {
+ kmem_free(table, sizeof (tsd_hash_table_t));
+ return (NULL);
+ }
+
+ for (hash = 0; hash < size; hash++) {
+ spin_lock_init(&table->ht_bins[hash].hb_lock);
+ INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head);
+ }
+
+ spin_lock_init(&table->ht_lock);
+ table->ht_bits = bits;
+ table->ht_key = 1;
+
+ return (table);
+}
+
+/*
+ * tsd_hash_table_fini - free a hash table
+ * @table: hash table
+ *
+ * Free a hash table allocated by tsd_hash_table_init(). If the hash
+ * table is not empty this function will call the proper destructor for
+ * all remaining entries before freeing the memory used by those entries.
+ */
+static void
+tsd_hash_table_fini(tsd_hash_table_t *table)
+{
+ HLIST_HEAD(work);
+ tsd_hash_bin_t *bin;
+ tsd_hash_entry_t *entry;
+ int size, i;
+
+ ASSERT3P(table, !=, NULL);
+ spin_lock(&table->ht_lock);
+ for (i = 0, size = (1 << table->ht_bits); i < size; i++) {
+ bin = &table->ht_bins[i];
+ spin_lock(&bin->hb_lock);
+ while (!hlist_empty(&bin->hb_head)) {
+ entry = hlist_entry(bin->hb_head.first,
+ tsd_hash_entry_t, he_list);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ }
+ spin_unlock(&bin->hb_lock);
+ }
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+ kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits));
+ kmem_free(table, sizeof (tsd_hash_table_t));
+}
+
+/*
+ * tsd_remove_entry - remove a tsd entry for this thread
+ * @entry: entry to remove
+ *
+ * Remove the thread specific data @entry for this thread.
+ * If this is the last entry for this thread, also remove the PID entry.
+ */
+static void
+tsd_remove_entry(tsd_hash_entry_t *entry)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *pid_entry;
+ tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+ ASSERT3P(entry, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ /* save the possible pid_entry */
+ pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t,
+ he_pid_list);
+
+ /* remove entry */
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+
+	/* if the saved entry really is the pid anchor, remove it when empty */
+ if (pid_entry->he_key == PID_KEY &&
+ list_empty(&pid_entry->he_pid_list)) {
+ hash = hash_long((ulong_t)pid_entry->he_key *
+ (ulong_t)pid_entry->he_pid, table->ht_bits);
+ pid_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&pid_entry_bin->hb_lock);
+ tsd_hash_del(table, pid_entry);
+ hlist_add_head(&pid_entry->he_list, &work);
+ spin_unlock(&pid_entry_bin->hb_lock);
+ }
+
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+}
+
+/*
+ * tsd_set - set thread specific data
+ * @key: lookup key
+ * @value: value to set
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(); it is protected
+ * from racing tsd_get() or tsd_set() because the data is thread specific.
+ * This function has been optimized to be fast for the update case.
+ * When setting the tsd initially it will be slower due to additional
+ * required locking and potential memory allocations.
+ */
+int
+tsd_set(uint_t key, void *value)
+{
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *entry;
+ pid_t pid;
+ int rc;
+ /* mark remove if value is NULL */
+ boolean_t remove = (value == NULL);
+
+ table = tsd_hash_table;
+ pid = curthread->pid;
+ ASSERT3P(table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (EINVAL);
+
+	/* Entry already exists in the hash table; update its value */
+ entry = tsd_hash_search(table, key, pid);
+ if (entry) {
+ entry->he_value = value;
+ /* remove the entry */
+ if (remove)
+ tsd_remove_entry(entry);
+ return (0);
+ }
+
+ /* don't create entry if value is NULL */
+ if (remove)
+ return (0);
+
+	/* Add a process entry to the hash if one does not yet exist */
+ entry = tsd_hash_search(table, PID_KEY, pid);
+ if (entry == NULL) {
+ rc = tsd_hash_add_pid(table, pid);
+ if (rc)
+ return (rc);
+ }
+
+ rc = tsd_hash_add(table, key, pid, value);
+ return (rc);
+}
+EXPORT_SYMBOL(tsd_set);
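+
+/*
+ * Illustrative sketch of the interface above using a hypothetical consumer:
+ * the my_state_t type, my_tsd_key, and the my_*() functions are examples
+ * only. The key is registered once, each thread lazily attaches its own
+ * state, and the destructor is run by tsd_destroy() or tsd_exit().
+ */
+typedef struct my_state { int ms_busy; } my_state_t;
+
+static uint_t my_tsd_key = 0;
+
+static void
+my_tsd_dtor(void *arg)
+{
+	kmem_free(arg, sizeof (my_state_t));
+}
+
+static inline void
+my_tsd_init(void)
+{
+	/* Register the key once; the dtor runs from tsd_destroy()/tsd_exit() */
+	tsd_create(&my_tsd_key, my_tsd_dtor);
+}
+
+static inline int
+my_thread_work(void)
+{
+	my_state_t *state = tsd_get(my_tsd_key);
+	int rc;
+
+	if (state == NULL) {
+		/* First use on this thread: allocate and attach state */
+		state = kmem_zalloc(sizeof (my_state_t), KM_SLEEP);
+		rc = tsd_set(my_tsd_key, state);
+		if (rc != 0) {
+			kmem_free(state, sizeof (my_state_t));
+			return (rc);
+		}
+	}
+
+	ASSERT3P(tsd_get(my_tsd_key), ==, state);
+	return (0);
+}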
+
+/*
+ * tsd_get - get thread specific data
+ * @key: lookup key
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get(uint_t key)
+{
+ tsd_hash_entry_t *entry;
+
+ ASSERT3P(tsd_hash_table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (NULL);
+
+ entry = tsd_hash_search(tsd_hash_table, key, curthread->pid);
+ if (entry == NULL)
+ return (NULL);
+
+ return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get);
+
+/*
+ * tsd_get_by_thread - get thread specific data for specified thread
+ * @key: lookup key
+ * @thread: thread to lookup
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get_by_thread(uint_t key, kthread_t *thread)
+{
+ tsd_hash_entry_t *entry;
+
+ ASSERT3P(tsd_hash_table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (NULL);
+
+ entry = tsd_hash_search(tsd_hash_table, key, thread->pid);
+ if (entry == NULL)
+ return (NULL);
+
+ return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get_by_thread);
+
+/*
+ * tsd_create - create thread specific data key
+ * @keyp: lookup key address
+ * @dtor: destructor called during tsd_destroy() or tsd_exit()
+ *
+ * The provided key must be set to 0 or it is assumed to be already in use.
+ * The dtor is allowed to be NULL in which case no additional cleanup
+ * for the data is performed during tsd_destroy() or tsd_exit().
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(); this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_create(uint_t *keyp, dtor_func_t dtor)
+{
+ ASSERT3P(keyp, !=, NULL);
+ if (*keyp)
+ return;
+
+ (void) tsd_hash_add_key(tsd_hash_table, keyp, dtor);
+}
+EXPORT_SYMBOL(tsd_create);
+
+/*
+ * tsd_destroy - destroy thread specific data
+ * @keyp: lookup key address
+ *
+ * Destroys the thread specific data on all threads which use this key.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(); this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_destroy(uint_t *keyp)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *dtor_entry, *entry;
+ tsd_hash_bin_t *dtor_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+ dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID);
+ if (dtor_entry == NULL) {
+ spin_unlock(&table->ht_lock);
+ return;
+ }
+
+ /*
+ * All threads which use this key must be linked off of the
+ * DTOR_PID entry. They are removed from the hash table and
+ * linked in to a private working list to be destroyed.
+ */
+ while (!list_empty(&dtor_entry->he_key_list)) {
+ entry = list_entry(dtor_entry->he_key_list.next,
+ tsd_hash_entry_t, he_key_list);
+ ASSERT3U(dtor_entry->he_key, ==, entry->he_key);
+ ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+ }
+
+ hash = hash_long((ulong_t)dtor_entry->he_key *
+ (ulong_t)dtor_entry->he_pid, table->ht_bits);
+ dtor_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&dtor_entry_bin->hb_lock);
+ tsd_hash_del(table, dtor_entry);
+ hlist_add_head(&dtor_entry->he_list, &work);
+ spin_unlock(&dtor_entry_bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+ *keyp = 0;
+}
+EXPORT_SYMBOL(tsd_destroy);
+
+/*
+ * tsd_exit - destroys all thread specific data for this thread
+ *
+ * Destroys all the thread specific data for this thread.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(); this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_exit(void)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *pid_entry, *entry;
+ tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+ pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid);
+ if (pid_entry == NULL) {
+ spin_unlock(&table->ht_lock);
+ return;
+ }
+
+ /*
+ * All keys associated with this pid must be linked off of the
+ * PID_KEY entry. They are removed from the hash table and
+ * linked in to a private working list to be destroyed.
+ */
+
+ while (!list_empty(&pid_entry->he_pid_list)) {
+ entry = list_entry(pid_entry->he_pid_list.next,
+ tsd_hash_entry_t, he_pid_list);
+ ASSERT3U(pid_entry->he_pid, ==, entry->he_pid);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+ }
+
+ hash = hash_long((ulong_t)pid_entry->he_key *
+ (ulong_t)pid_entry->he_pid, table->ht_bits);
+ pid_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&pid_entry_bin->hb_lock);
+ tsd_hash_del(table, pid_entry);
+ hlist_add_head(&pid_entry->he_list, &work);
+ spin_unlock(&pid_entry_bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+}
+EXPORT_SYMBOL(tsd_exit);
+
+int
+spl_tsd_init(void)
+{
+ tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT);
+ if (tsd_hash_table == NULL)
+ return (1);
+
+ return (0);
+}
+
+void
+spl_tsd_fini(void)
+{
+ tsd_hash_table_fini(tsd_hash_table);
+ tsd_hash_table = NULL;
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-vmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-vmem.c
new file mode 100644
index 000000000000..cab3e9549cfe
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-vmem.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/percpu_compat.h>
+#include <sys/debug.h>
+#include <sys/vmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/shrinker.h>
+#include <linux/module.h>
+
+/*
+ * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces.
+ */
+void *
+spl_vmem_alloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= KM_VMEM;
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_alloc);
+
+void *
+spl_vmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= (KM_VMEM | KM_ZERO);
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_zalloc);
+
+void
+spl_vmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_free);
+
+int
+spl_vmem_init(void)
+{
+ return (0);
+}
+
+void
+spl_vmem_fini(void)
+{
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c b/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c
new file mode 100644
index 000000000000..5e763c25606f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2008-2010 Sun Microsystems, Inc.
+ * Written by Ricardo Correia <Ricardo.M.Correia@Sun.COM>
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) XDR Implementation.
+ */
+
+#include <linux/string.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <rpc/xdr.h>
+
+/*
+ * SPL's XDR mem implementation.
+ *
+ * This is used by libnvpair to serialize/deserialize the name-value pair data
+ * structures into byte arrays in a well-defined and portable manner.
+ *
+ * These data structures are used by the DMU/ZFS to flexibly manipulate various
+ * information in memory and later serialize it/deserialize it to disk.
+ * Examples of usages include the pool configuration, lists of pool and dataset
+ * properties, etc.
+ *
+ * Reference documentation for the XDR representation and XDR operations can be
+ * found in RFC 1832 and xdr(3), respectively.
+ *
+ * === Implementation shortcomings ===
+ *
+ * It is assumed that the following C types have the following sizes:
+ *
+ * char/unsigned char: 1 byte
+ * short/unsigned short: 2 bytes
+ * int/unsigned int: 4 bytes
+ * longlong_t/u_longlong_t: 8 bytes
+ *
+ * The C standard allows these types to be larger (and in the case of ints,
+ * shorter), so if that is the case on some compiler/architecture, the build
+ * will fail (on purpose).
+ *
+ * If someone wants to fix the code to work properly on such environments, then:
+ *
+ * 1) Preconditions should be added to xdrmem_enc functions to make sure the
+ * caller doesn't pass arguments which exceed the expected range.
+ * 2) Functions which take signed integers should be changed to properly do
+ * sign extension.
+ * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger
+ * problems than this implementation.
+ *
+ * It is also assumed that:
+ *
+ * 1) Chars have 8 bits.
+ * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned
+ * memcpy, memset and memcmp.
+ * 3) Arrays passed to xdr_array() are packed and the compiler/architecture
+ * supports element-sized-aligned memory accesses.
+ * 4) Negative integers are natively stored in two's complement binary
+ * representation.
+ *
+ * No checks are done for the 4 assumptions above, though.
+ *
+ * === Caller expectations ===
+ *
+ * Existing documentation does not describe the semantics of XDR operations very
+ * well. Therefore, some assumptions about failure semantics will be made and
+ * will be described below:
+ *
+ * 1) If any encoding operation fails (e.g., due to lack of buffer space), the
+ * stream should be considered valid only up to the encoding operation
+ * previous to the one that first failed. However, the stream size as returned
+ * by xdr_control() cannot be considered to be strictly correct (it may be
+ * bigger).
+ *
+ * Putting it another way, if there is an encoding failure it's undefined
+ * whether anything is added to the stream in that operation and therefore
+ * neither xdr_control() nor future encoding operations on the same stream can
+ * be relied upon to produce correct results.
+ *
+ * 2) If a decoding operation fails, it's undefined whether anything will be
+ * decoded into passed buffers/pointers during that operation, or what the
+ * values on those buffers will look like.
+ *
+ * Future decoding operations on the same stream will also have similar
+ * undefined behavior.
+ *
+ * 3) When the first decoding operation fails it is OK to trust the results of
+ * previous decoding operations on the same stream, as long as the caller
+ * expects a failure to be possible (e.g. due to end-of-stream).
+ *
+ * However, this is highly discouraged because the caller should know the
+ * stream size and should be coded to expect any decoding failure to be data
+ * corruption due to hardware, accidental or even malicious causes, which should
+ * be handled gracefully in all cases.
+ *
+ * In very rare situations where there are strong reasons to believe the data
+ * can be trusted to be valid and non-tampered with, then the caller may assume
+ * a decoding failure to be a bug (e.g. due to mismatched data types) and may
+ * fail non-gracefully.
+ *
+ * 4) Non-zero padding bytes will cause the decoding operation to fail.
+ *
+ * 5) Zero bytes on string types will also cause the decoding operation to fail.
+ *
+ * 6) It is assumed that either the pointer to the stream buffer given by the
+ * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int
+ * memory accesses.
+ *
+ * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap.
+ *
+ * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user
+ * space or MMIO space), the computer may explode.
+ */
+
+static struct xdr_ops xdrmem_encode_ops;
+static struct xdr_ops xdrmem_decode_ops;
+
+void
+xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size,
+ const enum xdr_op op)
+{
+ switch (op) {
+ case XDR_ENCODE:
+ xdrs->x_ops = &xdrmem_encode_ops;
+ break;
+ case XDR_DECODE:
+ xdrs->x_ops = &xdrmem_decode_ops;
+ break;
+ default:
+ xdrs->x_ops = NULL; /* Let the caller know we failed */
+ return;
+ }
+
+ xdrs->x_op = op;
+ xdrs->x_addr = addr;
+ xdrs->x_addr_end = addr + size;
+
+ if (xdrs->x_addr_end < xdrs->x_addr) {
+ xdrs->x_ops = NULL;
+ }
+}
+EXPORT_SYMBOL(xdrmem_create);
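+
+/*
+ * Illustrative encode/decode round trip (hypothetical helper): the direct
+ * x_ops calls stand in for the xdr_*() wrappers consumers such as libnvpair
+ * normally use, and the 64-byte buffer and test value are arbitrary.
+ */
+static inline int
+xdrmem_roundtrip_example(void)
+{
+	char buf[64];
+	XDR enc, dec;
+	unsigned int out = 1234, in = 0;
+	struct xdr_bytesrec rec;
+
+	xdrmem_create(&enc, buf, sizeof (buf), XDR_ENCODE);
+	if (enc.x_ops == NULL || !enc.x_ops->xdr_u_int(&enc, &out))
+		return (EINVAL);
+
+	/* Remaining buffer space can be queried through the control op */
+	if (!enc.x_ops->xdr_control(&enc, XDR_GET_BYTES_AVAIL, &rec))
+		return (EINVAL);
+	ASSERT3U(rec.xc_num_avail, ==, sizeof (buf) - sizeof (uint32_t));
+
+	xdrmem_create(&dec, buf, sizeof (buf), XDR_DECODE);
+	if (dec.x_ops == NULL || !dec.x_ops->xdr_u_int(&dec, &in))
+		return (EINVAL);
+
+	return (in == out ? 0 : EINVAL);
+}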
+
+static bool_t
+xdrmem_control(XDR *xdrs, int req, void *info)
+{
+ struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info;
+
+ if (req != XDR_GET_BYTES_AVAIL)
+ return (FALSE);
+
+ rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */
+ rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ uint_t size = roundup(cnt, 4);
+ uint_t pad;
+
+ if (size < cnt)
+ return (FALSE); /* Integer overflow */
+
+ if (xdrs->x_addr > xdrs->x_addr_end)
+ return (FALSE);
+
+ if (xdrs->x_addr_end - xdrs->x_addr < size)
+ return (FALSE);
+
+ memcpy(xdrs->x_addr, cp, cnt);
+
+ xdrs->x_addr += cnt;
+
+ pad = size - cnt;
+ if (pad > 0) {
+ memset(xdrs->x_addr, 0, pad);
+ xdrs->x_addr += pad;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ static uint32_t zero = 0;
+ uint_t size = roundup(cnt, 4);
+ uint_t pad;
+
+ if (size < cnt)
+ return (FALSE); /* Integer overflow */
+
+ if (xdrs->x_addr > xdrs->x_addr_end)
+ return (FALSE);
+
+ if (xdrs->x_addr_end - xdrs->x_addr < size)
+ return (FALSE);
+
+ memcpy(cp, xdrs->x_addr, cnt);
+ xdrs->x_addr += cnt;
+
+ pad = size - cnt;
+ if (pad > 0) {
+ /* An inverted memchr() would be useful here... */
+ if (memcmp(&zero, xdrs->x_addr, pad) != 0)
+ return (FALSE);
+
+ xdrs->x_addr += pad;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint32(XDR *xdrs, uint32_t val)
+{
+ if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+ return (FALSE);
+
+ *((uint32_t *)xdrs->x_addr) = cpu_to_be32(val);
+
+ xdrs->x_addr += sizeof (uint32_t);
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_dec_uint32(XDR *xdrs, uint32_t *val)
+{
+ if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+ return (FALSE);
+
+ *val = be32_to_cpu(*((uint32_t *)xdrs->x_addr));
+
+ xdrs->x_addr += sizeof (uint32_t);
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_char(XDR *xdrs, char *cp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (char) != 1);
+ val = *((unsigned char *) cp);
+
+ return (xdrmem_enc_uint32(xdrs, val));
+}
+
+static bool_t
+xdrmem_dec_char(XDR *xdrs, char *cp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (char) != 1);
+
+ if (!xdrmem_dec_uint32(xdrs, &val))
+ return (FALSE);
+
+ /*
+ * If any of the 3 other bytes are non-zero then val will be greater
+ * than 0xff and we fail because according to the RFC, this block does
+ * not have a char encoded in it.
+ */
+ if (val > 0xff)
+ return (FALSE);
+
+ *((unsigned char *) cp) = val;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp)
+{
+ BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+ return (xdrmem_enc_uint32(xdrs, *usp));
+}
+
+static bool_t
+xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+ if (!xdrmem_dec_uint32(xdrs, &val))
+ return (FALSE);
+
+ /*
+ * Short ints are not in the RFC, but we assume similar logic as in
+ * xdrmem_dec_char().
+ */
+ if (val > 0xffff)
+ return (FALSE);
+
+ *usp = val;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint(XDR *xdrs, unsigned *up)
+{
+ BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+ return (xdrmem_enc_uint32(xdrs, *up));
+}
+
+static bool_t
+xdrmem_dec_uint(XDR *xdrs, unsigned *up)
+{
+ BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+ return (xdrmem_dec_uint32(xdrs, (uint32_t *)up));
+}
+
+static bool_t
+xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+ BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+ if (!xdrmem_enc_uint32(xdrs, *ullp >> 32))
+ return (FALSE);
+
+ return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff));
+}
+
+static bool_t
+xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+ uint32_t low, high;
+
+ BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+ if (!xdrmem_dec_uint32(xdrs, &high))
+ return (FALSE);
+ if (!xdrmem_dec_uint32(xdrs, &low))
+ return (FALSE);
+
+ *ullp = ((u_longlong_t)high << 32) | low;
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+ const uint_t elsize, const xdrproc_t elproc)
+{
+ uint_t i;
+ caddr_t addr = *arrp;
+
+ if (*sizep > maxsize || *sizep > UINT_MAX / elsize)
+ return (FALSE);
+
+ if (!xdrmem_enc_uint(xdrs, sizep))
+ return (FALSE);
+
+ for (i = 0; i < *sizep; i++) {
+ if (!elproc(xdrs, addr))
+ return (FALSE);
+ addr += elsize;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+ const uint_t elsize, const xdrproc_t elproc)
+{
+ uint_t i, size;
+ bool_t alloc = FALSE;
+ caddr_t addr;
+
+ if (!xdrmem_dec_uint(xdrs, sizep))
+ return (FALSE);
+
+ size = *sizep;
+
+ if (size > maxsize || size > UINT_MAX / elsize)
+ return (FALSE);
+
+ /*
+ * The Solaris man page says: "If *arrp is NULL when decoding,
+ * xdr_array() allocates memory and *arrp points to it".
+ */
+ if (*arrp == NULL) {
+ BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+ *arrp = kmem_alloc(size * elsize, KM_NOSLEEP);
+ if (*arrp == NULL)
+ return (FALSE);
+
+ alloc = TRUE;
+ }
+
+ addr = *arrp;
+
+ for (i = 0; i < size; i++) {
+ if (!elproc(xdrs, addr)) {
+ if (alloc)
+ kmem_free(*arrp, size * elsize);
+ return (FALSE);
+ }
+ addr += elsize;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ size_t slen = strlen(*sp);
+ uint_t len;
+
+ if (slen > maxsize)
+ return (FALSE);
+
+ len = slen;
+
+ if (!xdrmem_enc_uint(xdrs, &len))
+ return (FALSE);
+
+ return (xdrmem_enc_bytes(xdrs, *sp, len));
+}
+
+static bool_t
+xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ uint_t size;
+ bool_t alloc = FALSE;
+
+ if (!xdrmem_dec_uint(xdrs, &size))
+ return (FALSE);
+
+ if (size > maxsize || size > UINT_MAX - 1)
+ return (FALSE);
+
+ /*
+ * Solaris man page: "If *sp is NULL when decoding, xdr_string()
+ * allocates memory and *sp points to it".
+ */
+ if (*sp == NULL) {
+ BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+ *sp = kmem_alloc(size + 1, KM_NOSLEEP);
+ if (*sp == NULL)
+ return (FALSE);
+
+ alloc = TRUE;
+ }
+
+ if (!xdrmem_dec_bytes(xdrs, *sp, size))
+ goto fail;
+
+ if (memchr(*sp, 0, size) != NULL)
+ goto fail;
+
+ (*sp)[size] = '\0';
+
+ return (TRUE);
+
+fail:
+ if (alloc)
+ kmem_free(*sp, size + 1);
+
+ return (FALSE);
+}
+
+static struct xdr_ops xdrmem_encode_ops = {
+ .xdr_control = xdrmem_control,
+ .xdr_char = xdrmem_enc_char,
+ .xdr_u_short = xdrmem_enc_ushort,
+ .xdr_u_int = xdrmem_enc_uint,
+ .xdr_u_longlong_t = xdrmem_enc_ulonglong,
+ .xdr_opaque = xdrmem_enc_bytes,
+ .xdr_string = xdr_enc_string,
+ .xdr_array = xdr_enc_array
+};
+
+static struct xdr_ops xdrmem_decode_ops = {
+ .xdr_control = xdrmem_control,
+ .xdr_char = xdrmem_dec_char,
+ .xdr_u_short = xdrmem_dec_ushort,
+ .xdr_u_int = xdrmem_dec_uint,
+ .xdr_u_longlong_t = xdrmem_dec_ulonglong,
+ .xdr_opaque = xdrmem_dec_bytes,
+ .xdr_string = xdr_dec_string,
+ .xdr_array = xdr_dec_array
+};
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c
new file mode 100644
index 000000000000..589496da0c78
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * z_compress_level/z_uncompress are nearly identical copies of the
+ * compress2/uncompress functions provided by the official zlib package
+ * available at http://zlib.net/. The only changes made we to slightly
+ * adapt the functions called to match the linux kernel implementation
+ * of zlib. The full zlib license follows:
+ *
+ * zlib.h -- interface of the 'zlib' general purpose compression library
+ * version 1.2.5, April 19th, 2010
+ *
+ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * Jean-loup Gailly
+ * Mark Adler
+ */
+
+
+#include <linux/percpu_compat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/zmod.h>
+
+static spl_kmem_cache_t *zlib_workspace_cache;
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call. Using a kmem_cache also has the advantage
+ * of improving the odds that the memory used will be local to this cpu.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use. This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+ return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS)));
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+ kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit. sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ * Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen, int level)
+{
+ z_stream stream;
+ int err;
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+ if (!stream.workspace)
+ return (Z_MEM_ERROR);
+
+ err = zlib_deflateInit(&stream, level);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.workspace);
+ return (err);
+ }
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ return (err);
+}
+EXPORT_SYMBOL(z_compress_level);
+
+/*
+ * Decompresses the source buffer into the destination buffer. sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the uncompressed data.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen)
+{
+ z_stream stream;
+ int err;
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+ if (!stream.workspace)
+ return (Z_MEM_ERROR);
+
+ err = zlib_inflateInit(&stream);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.workspace);
+ return (err);
+ }
+
+ err = zlib_inflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ if (err == Z_NEED_DICT ||
+ (err == Z_BUF_ERROR && stream.avail_in == 0))
+ return (Z_DATA_ERROR);
+
+ return (err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ return (err);
+}
+EXPORT_SYMBOL(z_uncompress);
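+
+/*
+ * Illustrative round trip (hypothetical helper): per the sizing rule above
+ * the compressed buffer is at least 0.1% larger than the source plus 12
+ * bytes; kmem_alloc() is used here only for brevity.
+ */
+static inline int
+zlib_roundtrip_example(const void *src, size_t srclen, int level)
+{
+	size_t cbufsz = srclen + srclen / 1000 + 12;
+	size_t clen = cbufsz, dlen = srclen;
+	void *cbuf = kmem_alloc(cbufsz, KM_SLEEP);
+	void *dbuf = kmem_alloc(srclen, KM_SLEEP);
+	int err;
+
+	err = z_compress_level(cbuf, &clen, src, srclen, level);
+	if (err == Z_OK)
+		err = z_uncompress(dbuf, &dlen, cbuf, clen);
+
+	kmem_free(cbuf, cbufsz);
+	kmem_free(dbuf, srclen);
+
+	return (err == Z_OK && dlen == srclen ? 0 : EIO);
+}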
+
+int
+spl_zlib_init(void)
+{
+ int size;
+
+ size = MAX(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ zlib_inflate_workspacesize());
+
+ zlib_workspace_cache = kmem_cache_create(
+ "spl_zlib_workspace_cache",
+ size, 0, NULL, NULL, NULL, NULL, NULL,
+ KMC_KVMEM);
+ if (!zlib_workspace_cache)
+ return (1);
+
+ return (0);
+}
+
+void
+spl_zlib_fini(void)
+{
+ kmem_cache_destroy(zlib_workspace_cache);
+ zlib_workspace_cache = NULL;
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in
new file mode 100644
index 000000000000..75bec52c94e2
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in
@@ -0,0 +1,37 @@
+#
+# Linux specific sources included from module/zfs/Makefile.in
+#
+
+# Suppress unused-value warnings in sparc64 architecture headers
+ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
+
+$(MODULE)-objs += ../os/linux/zfs/abd_os.o
+$(MODULE)-objs += ../os/linux/zfs/arc_os.o
+$(MODULE)-objs += ../os/linux/zfs/mmp_os.o
+$(MODULE)-objs += ../os/linux/zfs/policy.o
+$(MODULE)-objs += ../os/linux/zfs/trace.o
+$(MODULE)-objs += ../os/linux/zfs/qat.o
+$(MODULE)-objs += ../os/linux/zfs/qat_compress.o
+$(MODULE)-objs += ../os/linux/zfs/qat_crypt.o
+$(MODULE)-objs += ../os/linux/zfs/spa_misc_os.o
+$(MODULE)-objs += ../os/linux/zfs/vdev_disk.o
+$(MODULE)-objs += ../os/linux/zfs/vdev_file.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_acl.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_ctldir.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_debug.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_dir.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_file_os.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_ioctl_os.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_uio.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_vnops_os.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_znode.o
+$(MODULE)-objs += ../os/linux/zfs/zio_crypt.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_export.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_file.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_inode.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_super.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_xattr.o
+$(MODULE)-objs += ../os/linux/zfs/zvol_os.o
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
new file mode 100644
index 000000000000..d82e5f4dcf15
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -0,0 +1,1073 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+/*
+ * See abd.c for a general overview of the arc buffered data (ABD).
+ *
+ * Linear buffers act exactly like normal buffers and are always mapped into the
+ * kernel's virtual memory space, while scattered ABD data chunks are allocated
+ * as physical pages and then mapped in only while they are actually being
+ * accessed through one of the abd_* library functions. Using scattered ABDs
+ * provides several benefits:
+ *
+ * (1) They avoid use of kmem_*, preventing performance problems where running
+ * kmem_reap on very large memory systems never finishes and causes
+ * constant TLB shootdowns.
+ *
+ * (2) Fragmentation is less of an issue since when we are at the limit of
+ * allocatable space, we won't have to search around for a long free
+ * hole in the VA space for large ARC allocations. Each chunk is mapped in
+ * individually, so even if we are using HIGHMEM (see next point) we
+ * wouldn't need to worry about finding a contiguous address range.
+ *
+ * (3) If we are not using HIGHMEM, then all physical memory is always
+ * mapped into the kernel's address space, so we also avoid the map /
+ * unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space. See abd_alloc_chunks() for details.
+ */
+
+#include <sys/abd_impl.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#ifdef _KERNEL
+#include <linux/kmap_compat.h>
+#include <linux/scatterlist.h>
+#else
+#define MAX_ORDER 1
+#endif
+
+typedef struct abd_stats {
+ kstat_named_t abdstat_struct_size;
+ kstat_named_t abdstat_linear_cnt;
+ kstat_named_t abdstat_linear_data_size;
+ kstat_named_t abdstat_scatter_cnt;
+ kstat_named_t abdstat_scatter_data_size;
+ kstat_named_t abdstat_scatter_chunk_waste;
+ kstat_named_t abdstat_scatter_orders[MAX_ORDER];
+ kstat_named_t abdstat_scatter_page_multi_chunk;
+ kstat_named_t abdstat_scatter_page_multi_zone;
+ kstat_named_t abdstat_scatter_page_alloc_retry;
+ kstat_named_t abdstat_scatter_sg_table_retry;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+ /* Amount of memory occupied by all of the abd_t struct allocations */
+ { "struct_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of linear ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset() and abd_get_from_buf()). If an
+ * ABD takes ownership of its buf then it will become tracked.
+ */
+ { "linear_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all linear ABDs tracked by linear_cnt */
+ { "linear_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset()).
+ */
+ { "scatter_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+ { "scatter_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The amount of space wasted at the end of the last chunk across all
+ * scatter ABDs tracked by scatter_cnt.
+ */
+ { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
+ /*
+ * The number of compound allocations of a given order. These
+ * allocations are spread over all currently allocated ABDs, and
+ * act as a measure of memory fragmentation.
+ */
+ { { "scatter_order_N", KSTAT_DATA_UINT64 } },
+ /*
+ * The number of scatter ABDs which contain multiple chunks.
+ * ABDs are preferentially allocated from the minimum number of
+ * contiguous multi-page chunks; a single chunk is optimal.
+ */
+ { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are split across memory zones.
+ * ABDs are preferentially allocated using pages from a single zone.
+ */
+ { "scatter_page_multi_zone", KSTAT_DATA_UINT64 },
+ /*
+ * The total number of retries encountered when attempting to
+ * allocate the pages to populate the scatter ABD.
+ */
+ { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 },
+ /*
+ * The total number of retries encountered when attempting to
+ * allocate the sg table for an ABD.
+ */
+ { "scatter_sg_table_retry", KSTAT_DATA_UINT64 },
+};
+
+#define abd_for_each_sg(abd, sg, n, i) \
+ for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
+
+unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
+
+/*
+ * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
+ * ABD's. Smaller allocations will use linear ABD's, which use
+ * zio_[data_]buf_alloc().
+ *
+ * Scatter ABD's use at least one page each, so sub-page allocations waste
+ * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
+ * half of each page). Using linear ABD's for small allocations means that
+ * they will be put on slabs which contain many allocations. This can
+ * improve memory efficiency, but it also makes it much harder for ARC
+ * evictions to actually free pages, because all the buffers on one slab need
+ * to be freed in order for the slab (and underlying pages) to be freed.
+ * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
+ * possible for them to actually waste more memory than scatter (one page per
+ * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
+ *
+ * Spill blocks are typically 512B and are heavily used on systems running
+ * selinux with the default dnode size and the `xattr=sa` property set.
+ *
+ * By default we use linear allocations for 512B and 1KB, and scatter
+ * allocations for larger (1.5KB and up).
+ */
+int zfs_abd_scatter_min_size = 512 * 3;
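+
+/*
+ * Hypothetical helper mirroring the waste computation performed by
+ * abd_update_scatter_stats(): the bytes left unused in the final page of a
+ * scatter allocation. With 4K pages a 2K scatter buffer wastes 2K, while a
+ * 1.5K buffer (the default zfs_abd_scatter_min_size) would waste 2.5K.
+ */
+static inline size_t
+abd_scatter_waste_example(size_t size)
+{
+	return (P2ROUNDUP(size, PAGESIZE) - size);
+}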
+
+/*
+ * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
+ * just a single zero'd page. This allows us to conserve memory by
+ * only using a single zero page for the scatterlist.
+ */
+abd_t *abd_zero_scatter = NULL;
+
+struct page;
+/*
+ * abd_zero_page will be an allocated zero'd PAGESIZE buffer, which is
+ * assigned to each of the pages of abd_zero_scatter.
+ */
+static struct page *abd_zero_page = NULL;
+
+static kmem_cache_t *abd_cache = NULL;
+static kstat_t *abd_ksp;
+
+static uint_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+ return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
+}
+
+abd_t *
+abd_alloc_struct_impl(size_t size)
+{
+ /*
+ * In Linux we do not use the size passed in during ABD
+ * allocation, so we just ignore it.
+ */
+ abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
+ ASSERT3P(abd, !=, NULL);
+ ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
+
+ return (abd);
+}
+
+void
+abd_free_struct_impl(abd_t *abd)
+{
+ kmem_cache_free(abd_cache, abd);
+ ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
+}
+
+#ifdef _KERNEL
+/*
+ * Mark zfs data pages so they can be excluded from kernel crash dumps
+ */
+#ifdef _LP64
+#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E
+
+static inline void
+abd_mark_zfs_page(struct page *page)
+{
+ get_page(page);
+ SetPagePrivate(page);
+ set_page_private(page, ABD_FILE_CACHE_PAGE);
+}
+
+static inline void
+abd_unmark_zfs_page(struct page *page)
+{
+ set_page_private(page, 0UL);
+ ClearPagePrivate(page);
+ put_page(page);
+}
+#else
+#define abd_mark_zfs_page(page)
+#define abd_unmark_zfs_page(page)
+#endif /* _LP64 */
+
+#ifndef CONFIG_HIGHMEM
+
+#ifndef __GFP_RECLAIM
+#define __GFP_RECLAIM __GFP_WAIT
+#endif
+
+/*
+ * The goal is to minimize fragmentation by preferentially populating ABDs
+ * with higher order compound pages from a single zone. Allocation size is
+ * progressively decreased until it can be satisfied without performing
+ * reclaim or compaction. When necessary this function will degenerate to
+ * allocating individual pages and allowing reclaim to satisfy allocations.
+ */
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+ struct list_head pages;
+ struct sg_table table;
+ struct scatterlist *sg;
+ struct page *page, *tmp_page = NULL;
+ gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+ gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
+ int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
+ int nr_pages = abd_chunkcnt_for_bytes(size);
+ int chunks = 0, zones = 0;
+ size_t remaining_size;
+ int nid = NUMA_NO_NODE;
+ int alloc_pages = 0;
+
+ INIT_LIST_HEAD(&pages);
+
+ while (alloc_pages < nr_pages) {
+ unsigned chunk_pages;
+ int order;
+
+ order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
+ chunk_pages = (1U << order);
+
+ page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
+ if (page == NULL) {
+ if (order == 0) {
+ ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+ schedule_timeout_interruptible(1);
+ } else {
+ max_order = MAX(0, order - 1);
+ }
+ continue;
+ }
+
+ list_add_tail(&page->lru, &pages);
+
+ if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
+ zones++;
+
+ nid = page_to_nid(page);
+ ABDSTAT_BUMP(abdstat_scatter_orders[order]);
+ chunks++;
+ alloc_pages += chunk_pages;
+ }
+
+ ASSERT3S(alloc_pages, ==, nr_pages);
+
+ while (sg_alloc_table(&table, chunks, gfp)) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ sg = table.sgl;
+ remaining_size = size;
+ list_for_each_entry_safe(page, tmp_page, &pages, lru) {
+ size_t sg_size = MIN(PAGESIZE << compound_order(page),
+ remaining_size);
+ sg_set_page(sg, page, sg_size, 0);
+ abd_mark_zfs_page(page);
+ remaining_size -= sg_size;
+
+ sg = sg_next(sg);
+ list_del(&page->lru);
+ }
+
+ /*
+ * These conditions ensure that a possible transformation to a linear
+ * ABD would be valid.
+ */
+ ASSERT(!PageHighMem(sg_page(table.sgl)));
+ ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+ if (table.nents == 1) {
+ /*
+ * Since there is only one entry, this ABD can be represented
+ * as a linear buffer. All single-page (4K) ABD's can be
+ * represented this way. Some multi-page ABD's can also be
+ * represented this way, if we were able to allocate a single
+ * "chunk" (higher-order "page" which represents a power-of-2
+ * series of physically-contiguous pages). This is often the
+ * case for 2-page (8K) ABD's.
+ *
+ * Representing a single-entry scatter ABD as a linear ABD
+ * has the performance advantage of avoiding the copy (and
+ * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
+ * A performance increase of around 5% has been observed for
+ * ARC-cached reads (of small blocks which can take advantage
+ * of this).
+ *
+ * Note that this optimization is only possible because the
+ * pages are always mapped into the kernel's address space.
+ * This is not the case for highmem pages, so the
+ * optimization can not be made there.
+ */
+ abd->abd_flags |= ABD_FLAG_LINEAR;
+ abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
+ abd->abd_u.abd_linear.abd_sgl = table.sgl;
+ ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
+ } else if (table.nents > 1) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+ abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+
+ if (zones) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
+ abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
+ }
+
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = table.nents;
+ }
+}
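+
+/*
+ * Hypothetical side computation (success path only) recording which compound
+ * page orders the loop above requests. For a 25 page allocation with a max
+ * order of 5 it yields orders 4, 3 and 0: one 16 page chunk, one 8 page
+ * chunk, and a single page, three chunks in total. In the real loop each
+ * failed compound allocation instead lowers max_order by one and retries.
+ */
+static inline int
+abd_order_plan_example(int nr_pages, int max_order, int *orders, int maxn)
+{
+	int alloc_pages = 0, n = 0;
+
+	while (alloc_pages < nr_pages && n < maxn) {
+		int order = MIN(highbit64(nr_pages - alloc_pages) - 1,
+		    max_order);
+
+		orders[n++] = order;
+		alloc_pages += (1 << order);
+	}
+
+	return (n);
+}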
+#else
+
+/*
+ * Allocate N individual pages to construct a scatter ABD. This function
+ * makes no attempt to request contiguous pages and requires the minimal
+ * number of kernel interfaces. It's designed for maximum compatibility.
+ */
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+ struct scatterlist *sg = NULL;
+ struct sg_table table;
+ struct page *page;
+ gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+ int nr_pages = abd_chunkcnt_for_bytes(size);
+ int i = 0;
+
+ while (sg_alloc_table(&table, nr_pages, gfp)) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ ASSERT3U(table.nents, ==, nr_pages);
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = nr_pages;
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ while ((page = __page_cache_alloc(gfp)) == NULL) {
+ ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_orders[0]);
+ sg_set_page(sg, page, PAGESIZE, 0);
+ abd_mark_zfs_page(page);
+ }
+
+ if (nr_pages > 1) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+ abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+ }
+}
+#endif /* !CONFIG_HIGHMEM */
+
+/*
+ * This must be called if any of the sg_table allocation functions
+ * are called.
+ */
+static void
+abd_free_sg_table(abd_t *abd)
+{
+ struct sg_table table;
+
+ table.sgl = ABD_SCATTER(abd).abd_sgl;
+ table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
+ sg_free_table(&table);
+}
+
+void
+abd_free_chunks(abd_t *abd)
+{
+ struct scatterlist *sg = NULL;
+ struct page *page;
+ int nr_pages = ABD_SCATTER(abd).abd_nents;
+ int order, i = 0;
+
+ if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
+ ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
+
+ if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
+ ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ page = sg_page(sg);
+ abd_unmark_zfs_page(page);
+ order = compound_order(page);
+ __free_pages(page, order);
+ ASSERT3U(sg->length, <=, PAGE_SIZE << order);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
+ }
+ abd_free_sg_table(abd);
+}
+
+/*
+ * Allocate a scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
+ * the scatterlist will be set to the zero'd out buffer abd_zero_page.
+ */
+static void
+abd_alloc_zero_scatter(void)
+{
+ struct scatterlist *sg = NULL;
+ struct sg_table table;
+ gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+ gfp_t gfp_zero_page = gfp | __GFP_ZERO;
+ int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+ int i = 0;
+
+ while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
+ ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+ schedule_timeout_interruptible(1);
+ }
+ abd_mark_zfs_page(abd_zero_page);
+
+ while (sg_alloc_table(&table, nr_pages, gfp)) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+ ASSERT3U(table.nents, ==, nr_pages);
+
+ abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+ abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
+ ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+ ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
+ ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
+ abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+ abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
+
+ abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
+ sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+}
+
+#else /* _KERNEL */
+
+#ifndef PAGE_SHIFT
+#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
+#endif
+
+#define zfs_kmap_atomic(chunk, km) ((void *)chunk)
+#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
+#define local_irq_save(flags) do { (void)(flags); } while (0)
+#define local_irq_restore(flags) do { (void)(flags); } while (0)
+#define nth_page(pg, i) \
+ ((struct page *)((void *)(pg) + (i) * PAGESIZE))
+
+struct scatterlist {
+ struct page *page;
+ int length;
+ int end;
+};
+
+static void
+sg_init_table(struct scatterlist *sg, int nr)
+{
+ memset(sg, 0, nr * sizeof (struct scatterlist));
+ sg[nr - 1].end = 1;
+}
+
+/*
+ * This must be called if any of the sg_table allocation functions
+ * are called.
+ */
+static void
+abd_free_sg_table(abd_t *abd)
+{
+ int nents = ABD_SCATTER(abd).abd_nents;
+ vmem_free(ABD_SCATTER(abd).abd_sgl,
+ nents * sizeof (struct scatterlist));
+}
+
+#define for_each_sg(sgl, sg, nr, i) \
+ for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
+
+static inline void
+sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
+ unsigned int offset)
+{
+ /* currently we don't use offset */
+ ASSERT(offset == 0);
+ sg->page = page;
+ sg->length = len;
+}
+
+static inline struct page *
+sg_page(struct scatterlist *sg)
+{
+ return (sg->page);
+}
+
+static inline struct scatterlist *
+sg_next(struct scatterlist *sg)
+{
+ if (sg->end)
+ return (NULL);
+
+ return (sg + 1);
+}
+
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+ unsigned nr_pages = abd_chunkcnt_for_bytes(size);
+ struct scatterlist *sg;
+ int i;
+
+ ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
+ sizeof (struct scatterlist), KM_SLEEP);
+ sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
+ sg_set_page(sg, p, PAGESIZE, 0);
+ }
+ ABD_SCATTER(abd).abd_nents = nr_pages;
+}
+
+void
+abd_free_chunks(abd_t *abd)
+{
+ int i, n = ABD_SCATTER(abd).abd_nents;
+ struct scatterlist *sg;
+
+ abd_for_each_sg(abd, sg, n, i) {
+ for (int j = 0; j < sg->length; j += PAGESIZE) {
+ struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+ umem_free(p, PAGESIZE);
+ }
+ }
+ abd_free_sg_table(abd);
+}
+
+static void
+abd_alloc_zero_scatter(void)
+{
+ unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+ struct scatterlist *sg;
+ int i;
+
+ abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
+ memset(abd_zero_page, 0, PAGESIZE);
+ abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+ abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
+ abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
+ ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+ ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
+ abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+ zfs_refcount_create(&abd_zero_scatter->abd_children);
+ ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
+ sizeof (struct scatterlist), KM_SLEEP);
+
+ sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);
+
+ abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
+ sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+}
+
+#endif /* _KERNEL */
+
+boolean_t
+abd_size_alloc_linear(size_t size)
+{
+ return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE);
+}
+
+void
+abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
+{
+ ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+ int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
+ if (op == ABDSTAT_INCR) {
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
+ arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+ } else {
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
+ arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+ }
+}
+
+void
+abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
+{
+ ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+ if (op == ABDSTAT_INCR) {
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+ } else {
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+ }
+}
+
+void
+abd_verify_scatter(abd_t *abd)
+{
+ size_t n;
+ int i = 0;
+ struct scatterlist *sg = NULL;
+
+ ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
+ ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
+ ABD_SCATTER(abd).abd_sgl->length);
+ n = ABD_SCATTER(abd).abd_nents;
+ abd_for_each_sg(abd, sg, n, i) {
+ ASSERT3P(sg_page(sg), !=, NULL);
+ }
+}
+
+static void
+abd_free_zero_scatter(void)
+{
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
+
+ abd_free_sg_table(abd_zero_scatter);
+ abd_free_struct(abd_zero_scatter);
+ abd_zero_scatter = NULL;
+ ASSERT3P(abd_zero_page, !=, NULL);
+#if defined(_KERNEL)
+ abd_unmark_zfs_page(abd_zero_page);
+ __free_page(abd_zero_page);
+#else
+ umem_free(abd_zero_page, PAGESIZE);
+#endif /* _KERNEL */
+}
+
+void
+abd_init(void)
+{
+ int i;
+
+ abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (abd_ksp != NULL) {
+ for (i = 0; i < MAX_ORDER; i++) {
+ snprintf(abd_stats.abdstat_scatter_orders[i].name,
+ KSTAT_STRLEN, "scatter_order_%d", i);
+ abd_stats.abdstat_scatter_orders[i].data_type =
+ KSTAT_DATA_UINT64;
+ }
+ abd_ksp->ks_data = &abd_stats;
+ kstat_install(abd_ksp);
+ }
+
+ abd_alloc_zero_scatter();
+}
+
+void
+abd_fini(void)
+{
+ abd_free_zero_scatter();
+
+ if (abd_ksp != NULL) {
+ kstat_delete(abd_ksp);
+ abd_ksp = NULL;
+ }
+
+ if (abd_cache) {
+ kmem_cache_destroy(abd_cache);
+ abd_cache = NULL;
+ }
+}
+
+void
+abd_free_linear_page(abd_t *abd)
+{
+ /* Transform it back into a scatter ABD for freeing */
+ struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+ abd->abd_flags &= ~ABD_FLAG_LINEAR;
+ abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
+ ABD_SCATTER(abd).abd_nents = 1;
+ ABD_SCATTER(abd).abd_offset = 0;
+ ABD_SCATTER(abd).abd_sgl = sg;
+ abd_free_chunks(abd);
+
+ abd_update_scatter_stats(abd, ABDSTAT_DECR);
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * On Linux the optimal thing to do would be to use abd_get_offset() and
+ * construct a new ABD which shares the original pages thereby eliminating
+ * the copy. But for the moment a new linear ABD is allocated until this
+ * performance optimization can be implemented.
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+ return (abd_alloc(size, is_metadata));
+}
+
+abd_t *
+abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
+{
+ int i = 0;
+ struct scatterlist *sg = NULL;
+
+ abd_verify(sabd);
+ ASSERT3U(off, <=, sabd->abd_size);
+
+ size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
+
+ if (abd == NULL)
+ abd = abd_alloc_struct(0);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+
+ abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
+ if (new_offset < sg->length)
+ break;
+ new_offset -= sg->length;
+ }
+
+ ABD_SCATTER(abd).abd_sgl = sg;
+ ABD_SCATTER(abd).abd_offset = new_offset;
+ ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
+
+ return (abd);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+ ASSERT(!abd_is_gang(abd));
+ abd_verify(abd);
+ aiter->iter_abd = abd;
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+ aiter->iter_pos = 0;
+ if (abd_is_linear(abd)) {
+ aiter->iter_offset = 0;
+ aiter->iter_sg = NULL;
+ } else {
+ aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
+ aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
+ }
+}
+
+/*
+ * This is just a helper function to see if we have exhausted the
+ * abd_iter and reached the end.
+ */
+boolean_t
+abd_iter_at_end(struct abd_iter *aiter)
+{
+ return (aiter->iter_pos == aiter->iter_abd->abd_size);
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already been
+ * exhausted, in which case this does nothing.
+ */
+void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to advance to, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ aiter->iter_pos += amount;
+ aiter->iter_offset += amount;
+ if (!abd_is_linear(aiter->iter_abd)) {
+ while (aiter->iter_offset >= aiter->iter_sg->length) {
+ aiter->iter_offset -= aiter->iter_sg->length;
+ aiter->iter_sg = sg_next(aiter->iter_sg);
+ if (aiter->iter_sg == NULL) {
+ ASSERT0(aiter->iter_offset);
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+void
+abd_iter_map(struct abd_iter *aiter)
+{
+ void *paddr;
+ size_t offset = 0;
+
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to iterate over, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ if (abd_is_linear(aiter->iter_abd)) {
+ ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
+ offset = aiter->iter_offset;
+ aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+ paddr = ABD_LINEAR_BUF(aiter->iter_abd);
+ } else {
+ offset = aiter->iter_offset;
+ aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
+ aiter->iter_abd->abd_size - aiter->iter_pos);
+
+ paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
+ km_table[aiter->iter_km]);
+ }
+
+ aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+ /* There's nothing left to unmap, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ if (!abd_is_linear(aiter->iter_abd)) {
+ /* LINTED E_FUNC_SET_NOT_USED */
+ zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
+ km_table[aiter->iter_km]);
+ }
+
+ ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+ ASSERT3U(aiter->iter_mapsize, >, 0);
+
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
+
+void
+abd_cache_reap_now(void)
+{
+}
+
+#if defined(_KERNEL)
+/*
+ * bio_nr_pages for ABD.
+ * @off is the offset in @abd
+ */
+unsigned long
+abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
+{
+ unsigned long pos;
+
+ if (abd_is_gang(abd)) {
+ unsigned long count = 0;
+
+ for (abd_t *cabd = abd_gang_get_offset(abd, &off);
+ cabd != NULL && size != 0;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ ASSERT3U(off, <, cabd->abd_size);
+ int mysize = MIN(size, cabd->abd_size - off);
+ count += abd_nr_pages_off(cabd, mysize, off);
+ size -= mysize;
+ off = 0;
+ }
+ return (count);
+ }
+
+ if (abd_is_linear(abd))
+ pos = (unsigned long)abd_to_buf(abd) + off;
+ else
+ pos = ABD_SCATTER(abd).abd_offset + off;
+
+ return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
+ (pos >> PAGE_SHIFT));
+}
+
+static unsigned int
+bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
+{
+ unsigned int offset, size, i;
+ struct page *page;
+
+ offset = offset_in_page(buf_ptr);
+ for (i = 0; i < bio->bi_max_vecs; i++) {
+ size = PAGE_SIZE - offset;
+
+ if (bio_size <= 0)
+ break;
+
+ if (size > bio_size)
+ size = bio_size;
+
+ if (is_vmalloc_addr(buf_ptr))
+ page = vmalloc_to_page(buf_ptr);
+ else
+ page = virt_to_page(buf_ptr);
+
+ /*
+ * Some network-related block devices use tcp_sendpage, which
+ * doesn't behave well when given a 0-count page; this is a
+ * safety net to catch them.
+ */
+ ASSERT3S(page_count(page), >, 0);
+
+ if (bio_add_page(bio, page, size, offset) != size)
+ break;
+
+ buf_ptr += size;
+ bio_size -= size;
+ offset = 0;
+ }
+
+ return (bio_size);
+}
+
+/*
+ * bio_map for gang ABD.
+ */
+static unsigned int
+abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
+ unsigned int io_size, size_t off)
+{
+ ASSERT(abd_is_gang(abd));
+
+ for (abd_t *cabd = abd_gang_get_offset(abd, &off);
+ cabd != NULL;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ ASSERT3U(off, <, cabd->abd_size);
+ int size = MIN(io_size, cabd->abd_size - off);
+ int remainder = abd_bio_map_off(bio, cabd, size, off);
+ io_size -= (size - remainder);
+ if (io_size == 0 || remainder > 0)
+ return (io_size);
+ off = 0;
+ }
+ ASSERT0(io_size);
+ return (io_size);
+}
+
+/*
+ * bio_map for ABD.
+ * @off is the offset in @abd
+ * Remaining IO size is returned
+ */
+unsigned int
+abd_bio_map_off(struct bio *bio, abd_t *abd,
+ unsigned int io_size, size_t off)
+{
+ struct abd_iter aiter;
+
+ ASSERT3U(io_size, <=, abd->abd_size - off);
+ if (abd_is_linear(abd))
+ return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));
+
+ ASSERT(!abd_is_linear(abd));
+ if (abd_is_gang(abd))
+ return (abd_gang_bio_map_off(bio, abd, io_size, off));
+
+ abd_iter_init(&aiter, abd);
+ abd_iter_advance(&aiter, off);
+
+ for (int i = 0; i < bio->bi_max_vecs; i++) {
+ struct page *pg;
+ size_t len, sgoff, pgoff;
+ struct scatterlist *sg;
+
+ if (io_size <= 0)
+ break;
+
+ sg = aiter.iter_sg;
+ sgoff = aiter.iter_offset;
+ pgoff = sgoff & (PAGESIZE - 1);
+ len = MIN(io_size, PAGESIZE - pgoff);
+ ASSERT(len > 0);
+
+ pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
+ if (bio_add_page(bio, pg, len, pgoff) != len)
+ break;
+
+ io_size -= len;
+ abd_iter_advance(&aiter, len);
+ }
+
+ return (io_size);
+}
+
+/* Tunable Parameters */
+module_param(zfs_abd_scatter_enabled, int, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_enabled,
+ "Toggle whether ABD allocations must be linear.");
+module_param(zfs_abd_scatter_min_size, int, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_min_size,
+ "Minimum size of scatter allocations.");
+/* CSTYLED */
+module_param(zfs_abd_scatter_max_order, uint, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_max_order,
+ "Maximum order allocation used for a scatter ABD.");
+#endif
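
The page-count expression at the end of abd_nr_pages_off() above,
((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT), counts
every page a byte range touches, including partially covered first and last
pages. A minimal user-space sketch of the same arithmetic, assuming a 4 KiB
page size (the helper name is illustrative only and not part of the patch):

    #include <stdio.h>

    #define PAGESIZE   4096UL
    #define PAGE_SHIFT 12

    /* Pages touched by the byte range [pos, pos + size). */
    static unsigned long
    pages_touched(unsigned long pos, unsigned long size)
    {
            return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
                (pos >> PAGE_SHIFT));
    }

    int
    main(void)
    {
            /* 100 bytes starting 50 bytes before a page boundary: 2 pages. */
            printf("%lu\n", pages_touched(PAGESIZE - 50, 100));
            /* A page-aligned 8 KiB range: exactly 2 pages. */
            printf("%lu\n", pages_touched(2 * PAGESIZE, 2 * PAGESIZE));
            return (0);
    }
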
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
new file mode 100644
index 000000000000..83d4a3d8496c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
@@ -0,0 +1,530 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/zfs_refcount.h>
+#include <sys/vdev.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/multilist.h>
+#include <sys/abd.h>
+#include <sys/zil.h>
+#include <sys/fm/fs/zfs.h>
+#ifdef _KERNEL
+#include <sys/shrinker.h>
+#include <sys/vmsystm.h>
+#include <sys/zpl.h>
+#include <linux/page_compat.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#endif
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/zthr.h>
+#include <zfs_fletcher.h>
+#include <sys/arc_impl.h>
+#include <sys/trace_zfs.h>
+#include <sys/aggsum.h>
+
+/*
+ * This is a limit on how many pages the ARC shrinker makes available for
+ * eviction in response to one page allocation attempt. Note that in
+ * practice, the kernel's shrinker can ask us to evict up to about 4x this
+ * for one allocation attempt.
+ *
+ * The default limit of 10,000 (in practice, 160MB per allocation attempt
+ * with 4K pages) limits the amount of time spent attempting to reclaim ARC
+ * memory to less than 100ms per allocation attempt, even with a small
+ * average compressed block size of ~8KB.
+ *
+ * See also the comment in arc_shrinker_count().
+ * Set to 0 to disable limit.
+ */
+int zfs_arc_shrinker_limit = 10000;
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static struct notifier_block arc_hotplug_callback_mem_nb;
+#endif
+
+/*
+ * Return a default max arc size based on the amount of physical memory.
+ */
+uint64_t
+arc_default_max(uint64_t min, uint64_t allmem)
+{
+ /* Default to 1/2 of all memory. */
+ return (MAX(allmem / 2, min));
+}
+
+#ifdef _KERNEL
+/*
+ * Return the maximum amount of memory that we could possibly use. Reduced
+ * to half of all memory in user space, which is primarily used for testing.
+ */
+uint64_t
+arc_all_memory(void)
+{
+#ifdef CONFIG_HIGHMEM
+ return (ptob(zfs_totalram_pages - zfs_totalhigh_pages));
+#else
+ return (ptob(zfs_totalram_pages));
+#endif /* CONFIG_HIGHMEM */
+}
+
+/*
+ * Return the amount of memory that is considered free. In user space,
+ * which is primarily used for testing, we pretend that free memory ranges
+ * from 0-20% of all memory.
+ */
+uint64_t
+arc_free_memory(void)
+{
+#ifdef CONFIG_HIGHMEM
+ struct sysinfo si;
+ si_meminfo(&si);
+ return (ptob(si.freeram - si.freehigh));
+#else
+ return (ptob(nr_free_pages() +
+ nr_inactive_file_pages()));
+#endif /* CONFIG_HIGHMEM */
+}
+
+/*
+ * Return the amount of memory that can be consumed before reclaim will be
+ * needed. Positive if there is sufficient free memory, negative indicates
+ * the amount of memory that needs to be freed up.
+ */
+int64_t
+arc_available_memory(void)
+{
+ return (arc_free_memory() - arc_sys_free);
+}
+
+static uint64_t
+arc_evictable_memory(void)
+{
+ int64_t asize = aggsum_value(&arc_size);
+ uint64_t arc_clean =
+ zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) +
+ zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +
+ zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) +
+ zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+ uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0);
+
+ /*
+ * Scale reported evictable memory in proportion to page cache, cap
+ * at specified min/max.
+ */
+ uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent;
+ min = MAX(arc_c_min, MIN(arc_c_max, min));
+
+ if (arc_dirty >= min)
+ return (arc_clean);
+
+ return (MAX((int64_t)asize - (int64_t)min, 0));
+}
+
+/*
+ * The _count() function returns the number of free-able objects.
+ * The _scan() function returns the number of objects that were freed.
+ */
+static unsigned long
+arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ /*
+ * __GFP_FS won't be set if we are called from ZFS code (see
+ * kmem_flags_convert(), which removes it). To avoid a deadlock, we
+ * don't allow evicting in this case. We return 0 rather than
+ * SHRINK_STOP so that the shrinker logic doesn't accumulate a
+ * deficit against us.
+ */
+ if (!(sc->gfp_mask & __GFP_FS)) {
+ return (0);
+ }
+
+ /*
+ * This code is reached in the "direct reclaim" case, where the
+ * kernel (outside ZFS) is trying to allocate a page, and the system
+ * is low on memory.
+ *
+ * The kernel's shrinker code doesn't understand how many pages the
+ * ARC's callback actually frees, so it may ask the ARC to shrink a
+ * lot for one page allocation. This is problematic because it may
+ * take a long time, thus delaying the page allocation, and because
+ * it may force the ARC to unnecessarily shrink very small.
+ *
+ * Therefore, we limit the amount of data that we say is evictable,
+ * which limits the amount that the shrinker will ask us to evict for
+ * one page allocation attempt.
+ *
+ * In practice, we may be asked to shrink 4x the limit to satisfy one
+ * page allocation, before the kernel's shrinker code gives up on us.
+ * When that happens, we rely on the kernel code to find the pages
+ * that we freed before invoking the OOM killer. This happens in
+ * __alloc_pages_slowpath(), which retries and finds the pages we
+ * freed when it calls get_page_from_freelist().
+ *
+ * See also the comment above zfs_arc_shrinker_limit.
+ */
+ int64_t limit = zfs_arc_shrinker_limit != 0 ?
+ zfs_arc_shrinker_limit : INT64_MAX;
+ return (MIN(limit, btop((int64_t)arc_evictable_memory())));
+}
+
+static unsigned long
+arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ ASSERT((sc->gfp_mask & __GFP_FS) != 0);
+
+ /* The arc is considered warm once reclaim has occurred */
+ if (unlikely(arc_warm == B_FALSE))
+ arc_warm = B_TRUE;
+
+ /*
+ * Evict the requested number of pages by reducing arc_c and waiting
+ * for the requested amount of data to be evicted.
+ */
+ arc_reduce_target_size(ptob(sc->nr_to_scan));
+ arc_wait_for_eviction(ptob(sc->nr_to_scan));
+ if (current->reclaim_state != NULL)
+ current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
+
+ /*
+ * We are experiencing memory pressure which the arc_evict_zthr was
+ * unable to keep up with. Set arc_no_grow to briefly pause arc
+ * growth to avoid compounding the memory pressure.
+ */
+ arc_no_grow = B_TRUE;
+
+ /*
+ * When direct reclaim is observed it usually indicates a rapid
+ * increase in memory pressure. This occurs because the kswapd
+ * threads were unable to asynchronously keep enough free memory
+ * available.
+ */
+ if (current_is_kswapd()) {
+ ARCSTAT_BUMP(arcstat_memory_indirect_count);
+ } else {
+ ARCSTAT_BUMP(arcstat_memory_direct_count);
+ }
+
+ return (sc->nr_to_scan);
+}
+
+SPL_SHRINKER_DECLARE(arc_shrinker,
+ arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+ uint64_t free_memory = arc_free_memory();
+
+ if (free_memory > arc_all_memory() * arc_lotsfree_percent / 100)
+ return (0);
+
+ if (txg > spa->spa_lowmem_last_txg) {
+ spa->spa_lowmem_last_txg = txg;
+ spa->spa_lowmem_page_load = 0;
+ }
+ /*
+ * If we are in pageout, we know that memory is already tight and
+ * the arc is already going to be evicting, so we just want to
+ * continue to let page writes occur as quickly as possible.
+ */
+ if (current_is_kswapd()) {
+ if (spa->spa_lowmem_page_load >
+ MAX(arc_sys_free / 4, free_memory) / 4) {
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
+ return (SET_ERROR(ERESTART));
+ }
+ /* Note: reserve is inflated, so we deflate */
+ atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
+ return (0);
+ } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
+ /* memory is low, delay before restarting */
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
+ return (SET_ERROR(EAGAIN));
+ }
+ spa->spa_lowmem_page_load = 0;
+ return (0);
+}
+
+static void
+arc_set_sys_free(uint64_t allmem)
+{
+ /*
+ * The ARC tries to keep at least this much memory available for the
+ * system. This gives the ARC time to shrink in response to memory
+ * pressure, before running completely out of memory and invoking the
+ * direct-reclaim ARC shrinker.
+ *
+ * This should be more than twice high_wmark_pages(), so that
+ * arc_wait_for_eviction() will wait until at least the
+ * high_wmark_pages() are free (see arc_evict_state_impl()).
+ *
+ * Note: Even when the system is very low on memory, the kernel's
+ * shrinker code may only ask for one "batch" of pages (512KB) to be
+ * evicted. If concurrent allocations consume these pages, there may
+ * still be insufficient free pages, and the OOM killer takes action.
+ *
+ * By setting arc_sys_free large enough, and having
+ * arc_wait_for_eviction() wait until there is at least arc_sys_free/2
+ * free memory, it is much less likely that concurrent allocations can
+ * consume all the memory that was evicted before checking for
+ * OOM.
+ *
+ * It's hard to iterate the zones from a linux kernel module, which
+ * makes it difficult to determine the watermark dynamically. Instead
+ * we compute the maximum high watermark for this system, based
+ * on the amount of memory, assuming default parameters on Linux kernel
+ * 5.3.
+ */
+
+ /*
+ * Base wmark_low is 4 * the square root of Kbytes of RAM.
+ */
+ long wmark = 4 * int_sqrt(allmem/1024) * 1024;
+
+ /*
+ * Clamp to between 128K and 64MB.
+ */
+ wmark = MAX(wmark, 128 * 1024);
+ wmark = MIN(wmark, 64 * 1024 * 1024);
+
+ /*
+ * watermark_boost can increase the wmark by up to 150%.
+ */
+ wmark += wmark * 150 / 100;
+
+ /*
+ * arc_sys_free needs to be more than 2x the watermark, because
+ * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up
+ * to 3x to ensure we're above it.
+ */
+ arc_sys_free = wmark * 3 + allmem / 32;
+}
+
+void
+arc_lowmem_init(void)
+{
+ uint64_t allmem = arc_all_memory();
+
+ /*
+ * Register a shrinker to support synchronous (direct) memory
+ * reclaim from the arc. This is done to prevent kswapd from
+ * swapping out pages when it is preferable to shrink the arc.
+ */
+ spl_register_shrinker(&arc_shrinker);
+ arc_set_sys_free(allmem);
+}
+
+void
+arc_lowmem_fini(void)
+{
+ spl_unregister_shrinker(&arc_shrinker);
+}
+
+int
+param_set_arc_long(const char *buf, zfs_kernel_param_t *kp)
+{
+ int error;
+
+ error = param_set_long(buf, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ arc_tuning_update(B_TRUE);
+
+ return (0);
+}
+
+int
+param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
+{
+ int error;
+
+ error = param_set_int(buf, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ arc_tuning_update(B_TRUE);
+
+ return (0);
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* ARGSUSED */
+static int
+arc_hotplug_callback(struct notifier_block *self, unsigned long action,
+ void *arg)
+{
+ uint64_t allmem = arc_all_memory();
+ if (action != MEM_ONLINE)
+ return (NOTIFY_OK);
+
+ arc_set_limits(allmem);
+
+#ifdef __LP64__
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
+ allmem * zfs_dirty_data_max_max_percent / 100);
+#else
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
+ allmem * zfs_dirty_data_max_max_percent / 100);
+#endif
+
+ arc_set_sys_free(allmem);
+ return (NOTIFY_OK);
+}
+#endif
+
+void
+arc_register_hotplug(void)
+{
+#ifdef CONFIG_MEMORY_HOTPLUG
+ arc_hotplug_callback_mem_nb.notifier_call = arc_hotplug_callback;
+ /* There is no significance to the value 100 */
+ arc_hotplug_callback_mem_nb.priority = 100;
+ register_memory_notifier(&arc_hotplug_callback_mem_nb);
+#endif
+}
+
+void
+arc_unregister_hotplug(void)
+{
+#ifdef CONFIG_MEMORY_HOTPLUG
+ unregister_memory_notifier(&arc_hotplug_callback_mem_nb);
+#endif
+}
+#else /* _KERNEL */
+int64_t
+arc_available_memory(void)
+{
+ int64_t lowest = INT64_MAX;
+
+ /* Every 100 calls, free a small amount */
+ if (spa_get_random(100) == 0)
+ lowest = -1024;
+
+ return (lowest);
+}
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+ return (0);
+}
+
+uint64_t
+arc_all_memory(void)
+{
+ return (ptob(physmem) / 2);
+}
+
+uint64_t
+arc_free_memory(void)
+{
+ return (spa_get_random(arc_all_memory() * 20 / 100));
+}
+
+void
+arc_register_hotplug(void)
+{
+}
+
+void
+arc_unregister_hotplug(void)
+{
+}
+#endif /* _KERNEL */
+
+/*
+ * Helper function for arc_prune_async(); it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *ptr)
+{
+ arc_prune_t *ap = (arc_prune_t *)ptr;
+ arc_prune_func_t *func = ap->p_pfunc;
+
+ if (func != NULL)
+ func(ap->p_adjust, ap->p_private);
+
+ zfs_refcount_remove(&ap->p_refcnt, func);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffers they reference. This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+void
+arc_prune_async(int64_t adjust)
+{
+ arc_prune_t *ap;
+
+ mutex_enter(&arc_prune_mtx);
+ for (ap = list_head(&arc_prune_list); ap != NULL;
+ ap = list_next(&arc_prune_list, ap)) {
+
+ if (zfs_refcount_count(&ap->p_refcnt) >= 2)
+ continue;
+
+ zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
+ ap->p_adjust = adjust;
+ if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
+ ap, TQ_SLEEP) == TASKQID_INVALID) {
+ zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
+ continue;
+ }
+ ARCSTAT_BUMP(arcstat_prune);
+ }
+ mutex_exit(&arc_prune_mtx);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
+ "Limit on number of pages that ARC shrinker can reclaim at once");
+/* END CSTYLED */
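
As a rough illustration of the arc_set_sys_free() arithmetic above, a
user-space sketch of the same computation follows (int_sqrt() is modeled
here with floor(sqrt()); the 16 GiB figure is only an example and not part
of the patch). For a 16 GiB machine the floor works out to roughly 632 MiB:

    #include <stdio.h>
    #include <stdint.h>
    #include <math.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* Mirror of arc_set_sys_free(): the free-memory floor the ARC targets. */
    static uint64_t
    estimate_arc_sys_free(uint64_t allmem)
    {
            /* Base wmark_low: 4 * sqrt(Kbytes of RAM), converted to bytes. */
            int64_t wmark = 4 * (int64_t)sqrt((double)(allmem / 1024)) * 1024;

            /* Clamp to between 128K and 64MB. */
            wmark = MAX(wmark, 128 * 1024);
            wmark = MIN(wmark, 64 * 1024 * 1024);

            /* watermark_boost can raise the wmark by up to 150%. */
            wmark += wmark * 150 / 100;

            /* 3x the boosted wmark plus 1/32 of all memory. */
            return (wmark * 3 + allmem / 32);
    }

    int
    main(void)
    {
            uint64_t allmem = 16ULL << 30;  /* 16 GiB */

            /* Prints "632 MiB" for a 16 GiB machine. */
            printf("%llu MiB\n",
                (unsigned long long)(estimate_arc_sys_free(allmem) >> 20));
            return (0);
    }
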
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c
new file mode 100644
index 000000000000..ff3ef1bf6ad9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/mmp.h>
+
+int
+param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp)
+{
+ int ret;
+
+ ret = param_set_ulong(val, kp);
+ if (ret < 0)
+ return (ret);
+
+ if (spa_mode_global != SPA_MODE_UNINIT)
+ mmp_signal_all_threads();
+
+ return (ret);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
new file mode 100644
index 000000000000..8780d7f6c70a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
@@ -0,0 +1,375 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright (C) 2016 Lawrence Livermore National Security, LLC.
+ *
+ * For Linux the vast majority of this enforcement is already handled via
+ * the standard Linux VFS permission checks. However certain administrative
+ * commands which bypass the standard mechanisms may need to make use of
+ * this functionality.
+ */
+
+#include <sys/policy.h>
+#include <linux/security.h>
+#include <linux/vfs_compat.h>
+
+/*
+ * The passed credentials cannot be directly verified because Linux only
+ * provides an interface to check the *current* process credentials. In
+ * order to handle this the capable() test is only run when the passed
+ * credentials match the current process credentials or the kcred. In
+ * all other cases this function must fail and return the passed err.
+ */
+static int
+priv_policy_ns(const cred_t *cr, int capability, int err,
+ struct user_namespace *ns)
+{
+ if (cr != CRED() && (cr != kcred))
+ return (err);
+
+#if defined(CONFIG_USER_NS)
+ if (!(ns ? ns_capable(ns, capability) : capable(capability)))
+#else
+ if (!capable(capability))
+#endif
+ return (err);
+
+ return (0);
+}
+
+static int
+priv_policy(const cred_t *cr, int capability, int err)
+{
+ return (priv_policy_ns(cr, capability, err, NULL));
+}
+
+static int
+priv_policy_user(const cred_t *cr, int capability, int err)
+{
+ /*
+ * All priv_policy_user checks are preceded by kuid/kgid_has_mapping()
+ * checks. If we cannot do them, we shouldn't be using ns_capable()
+ * since we don't know whether the affected files are valid in our
+ * namespace.
+ */
+#if defined(CONFIG_USER_NS)
+ return (priv_policy_ns(cr, capability, err, cr->user_ns));
+#else
+ return (priv_policy_ns(cr, capability, err, NULL));
+#endif
+}
+
+/*
+ * Checks for operations that are either client-only or are used by
+ * both clients and servers.
+ */
+int
+secpolicy_nfs(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, EPERM));
+}
+
+/*
+ * Catch all system configuration.
+ */
+int
+secpolicy_sys_config(const cred_t *cr, boolean_t checkonly)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, EPERM));
+}
+
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner,
+ mode_t curmode, mode_t wantmode)
+{
+ return (0);
+}
+
+/*
+ * This is a special routine for ZFS; it is used to determine whether
+ * any of the privileges in effect allow any form of access to the
+ * file. There's no reason to audit this or any reason to record
+ * this. More work is needed to do the "KPLD" stuff.
+ */
+int
+secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+ if (inode_owner_or_capable(ip))
+ return (0);
+
+#if defined(CONFIG_USER_NS)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ if (priv_policy_user(cr, CAP_DAC_OVERRIDE, EPERM) == 0)
+ return (0);
+
+ if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, EPERM) == 0)
+ return (0);
+
+ return (EPERM);
+}
+
+/*
+ * Determine if the subject can change the ownership of a file.
+ */
+int
+secpolicy_vnode_chown(const cred_t *cr, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+#if defined(CONFIG_USER_NS)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ return (priv_policy_user(cr, CAP_FOWNER, EPERM));
+}
+
+/*
+ * Determine if subject can change group ownership of a file.
+ */
+int
+secpolicy_vnode_create_gid(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SETGID, EPERM));
+}
+
+/*
+ * Policy determines whether we can remove an entry from a directory,
+ * regardless of permission bits.
+ */
+int
+secpolicy_vnode_remove(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_FOWNER, EPERM));
+}
+
+/*
+ * Determine that the subject can modify the mode of a file. The allzone
+ * privilege is needed when modifying a root-owned object.
+ */
+int
+secpolicy_vnode_setdac(const cred_t *cr, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+#if defined(CONFIG_USER_NS)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ return (priv_policy_user(cr, CAP_FOWNER, EPERM));
+}
+
+/*
+ * Are we allowed to retain the set-uid/set-gid bits when
+ * changing ownership or when writing to a file?
+ * "issuid" should be true when set-uid; only in that case
+ * root ownership is checked (setgid is assumed).
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_setid_retain(struct znode *zp __maybe_unused, const cred_t *cr,
+ boolean_t issuidroot)
+{
+ return (priv_policy_user(cr, CAP_FSETID, EPERM));
+}
+
+/*
+ * Determine that subject can set the file setgid flag.
+ */
+int
+secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid)
+{
+#if defined(CONFIG_USER_NS)
+ if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid)))
+ return (EPERM);
+#endif
+ if (crgetfsgid(cr) != gid && !groupmember(gid, cr))
+ return (priv_policy_user(cr, CAP_FSETID, EPERM));
+
+ return (0);
+}
+
+/*
+ * Determine if the subject can inject faults in the ZFS fault injection
+ * framework. Requires all privileges.
+ */
+int
+secpolicy_zinject(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, EACCES));
+}
+
+/*
+ * Determine if the subject has permission to manipulate ZFS datasets
+ * (not pools). Equivalent to the SYS_MOUNT privilege.
+ */
+int
+secpolicy_zfs(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, EACCES));
+}
+
+/*
+ * Equivalent to secpolicy_zfs(), but works even if the cred_t is not that of
+ * the current process. Takes both cred_t and proc_t so that this can work
+ * easily on all platforms.
+ *
+ * The has_capability() function was first exported in the 4.10 Linux kernel
+ * and then backported to some LTS kernels. Prior to this change there was no
+ * mechanism to perform this check, so EACCES is returned when the
+ * functionality is not present in the kernel.
+ */
+int
+secpolicy_zfs_proc(const cred_t *cr, proc_t *proc)
+{
+#if defined(HAVE_HAS_CAPABILITY)
+ if (!has_capability(proc, CAP_SYS_ADMIN))
+ return (EACCES);
+ return (0);
+#else
+ return (EACCES);
+#endif
+}
+
+void
+secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
+{
+ if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(NULL, cr,
+ (vap->va_mode & S_ISUID) != 0 &&
+ (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) {
+ vap->va_mask |= AT_MODE;
+ vap->va_mode &= ~(S_ISUID|S_ISGID);
+ }
+}
+
+/*
+ * Determine that subject can set the file setid flags.
+ */
+static int
+secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+#if defined(CONFIG_USER_NS)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ return (priv_policy_user(cr, CAP_FSETID, EPERM));
+}
+
+/*
+ * Determine that the subject can make a file "sticky".
+ *
+ * Enforced in the Linux VFS.
+ */
+static int
+secpolicy_vnode_stky_modify(const cred_t *cr)
+{
+ return (0);
+}
+
+int
+secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap,
+ const vattr_t *ovap, cred_t *cr)
+{
+ int error;
+
+ if ((vap->va_mode & S_ISUID) != 0 &&
+ (error = secpolicy_vnode_setid_modify(cr,
+ ovap->va_uid)) != 0) {
+ return (error);
+ }
+
+ /*
+ * Check privilege if attempting to set the
+ * sticky bit on a non-directory.
+ */
+ if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 &&
+ secpolicy_vnode_stky_modify(cr) != 0) {
+ vap->va_mode &= ~S_ISVTX;
+ }
+
+ /*
+ * Check for privilege if attempting to set the
+ * group-id bit.
+ */
+ if ((vap->va_mode & S_ISGID) != 0 &&
+ secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) {
+ vap->va_mode &= ~S_ISGID;
+ }
+
+ return (0);
+}
+
+/*
+ * Check privileges for setting xvattr attributes
+ */
+int
+secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, mode_t type)
+{
+ return (secpolicy_vnode_chown(cr, owner));
+}
+
+/*
+ * Check privileges for setattr attributes.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap,
+ const struct vattr *ovap, int flags,
+ int unlocked_access(void *, int, cred_t *), void *node)
+{
+ return (0);
+}
+
+/*
+ * Check privileges for links.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_basic_link(const cred_t *cr)
+{
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat.c b/sys/contrib/openzfs/module/os/linux/zfs/qat.c
new file mode 100644
index 000000000000..08613b3a2042
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat.c
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <sys/zfs_context.h>
+#include <sys/qat.h>
+
+qat_stats_t qat_stats = {
+ { "comp_requests", KSTAT_DATA_UINT64 },
+ { "comp_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "comp_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "decomp_requests", KSTAT_DATA_UINT64 },
+ { "decomp_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "decomp_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "dc_fails", KSTAT_DATA_UINT64 },
+ { "encrypt_requests", KSTAT_DATA_UINT64 },
+ { "encrypt_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "encrypt_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "decrypt_requests", KSTAT_DATA_UINT64 },
+ { "decrypt_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "decrypt_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "crypt_fails", KSTAT_DATA_UINT64 },
+ { "cksum_requests", KSTAT_DATA_UINT64 },
+ { "cksum_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "cksum_fails", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *qat_ksp = NULL;
+
+CpaStatus
+qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes)
+{
+ *pp_mem_addr = kmalloc(size_bytes, GFP_KERNEL);
+ if (*pp_mem_addr == NULL)
+ return (CPA_STATUS_RESOURCE);
+ return (CPA_STATUS_SUCCESS);
+}
+
+void
+qat_mem_free_contig(void **pp_mem_addr)
+{
+ if (*pp_mem_addr != NULL) {
+ kfree(*pp_mem_addr);
+ *pp_mem_addr = NULL;
+ }
+}
+
+int
+qat_init(void)
+{
+ qat_ksp = kstat_create("zfs", 0, "qat", "misc",
+ KSTAT_TYPE_NAMED, sizeof (qat_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (qat_ksp != NULL) {
+ qat_ksp->ks_data = &qat_stats;
+ kstat_install(qat_ksp);
+ }
+
+ /*
+ * If QAT initialization fails, just set the disable flag; QAT can be
+ * turned back on after the zfs module is loaded, e.g.:
+ * echo 0 > /sys/module/zfs/parameters/zfs_qat_compress_disable
+ */
+ if (qat_dc_init() != 0)
+ zfs_qat_compress_disable = 1;
+
+ if (qat_cy_init() != 0) {
+ zfs_qat_checksum_disable = 1;
+ zfs_qat_encrypt_disable = 1;
+ }
+
+ return (0);
+}
+
+void
+qat_fini(void)
+{
+ if (qat_ksp != NULL) {
+ kstat_delete(qat_ksp);
+ qat_ksp = NULL;
+ }
+
+ qat_cy_fini();
+ qat_dc_fini();
+}
+
+#endif
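
Once qat_init() has registered the kstat above, the counters in qat_stats
are exposed through the SPL kstat interface; on Linux, named kstats created
with kstat_create("zfs", ...) normally surface under /proc/spl/kstat/zfs/,
so the QAT statistics can be read from /proc/spl/kstat/zfs/qat. A minimal
user-space reader (illustrative only, not part of the patch):

    #include <stdio.h>

    int
    main(void)
    {
            char line[256];
            FILE *fp = fopen("/proc/spl/kstat/zfs/qat", "r");

            if (fp == NULL) {
                    perror("open qat kstat");
                    return (1);
            }
            /* Dump each counter line, e.g. "comp_requests ... <n>". */
            while (fgets(line, sizeof (line), fp) != NULL)
                    fputs(line, stdout);
            fclose(fp);
            return (0);
    }
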
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c b/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c
new file mode 100644
index 000000000000..ad3ead3b16e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c
@@ -0,0 +1,569 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/completion.h>
+#include <sys/zfs_context.h>
+#include <sys/byteorder.h>
+#include <sys/zio.h>
+#include <sys/qat.h>
+
+/*
+ * Max instances in a QAT device. Each instance is a channel for submitting
+ * jobs to the QAT hardware. This is only for pre-allocating the instance and
+ * session arrays; the actual number of instances is defined in the
+ * QAT driver's configuration file.
+ */
+#define QAT_DC_MAX_INSTANCES 48
+
+/*
+ * ZLIB head and foot size
+ */
+#define ZLIB_HEAD_SZ 2
+#define ZLIB_FOOT_SZ 4
+
+static CpaInstanceHandle dc_inst_handles[QAT_DC_MAX_INSTANCES];
+static CpaDcSessionHandle session_handles[QAT_DC_MAX_INSTANCES];
+static CpaBufferList **buffer_array[QAT_DC_MAX_INSTANCES];
+static Cpa16U num_inst = 0;
+static Cpa32U inst_num = 0;
+static boolean_t qat_dc_init_done = B_FALSE;
+int zfs_qat_compress_disable = 0;
+
+boolean_t
+qat_dc_use_accel(size_t s_len)
+{
+ return (!zfs_qat_compress_disable &&
+ qat_dc_init_done &&
+ s_len >= QAT_MIN_BUF_SIZE &&
+ s_len <= QAT_MAX_BUF_SIZE);
+}
+
+static void
+qat_dc_callback(void *p_callback, CpaStatus status)
+{
+ if (p_callback != NULL)
+ complete((struct completion *)p_callback);
+}
+
+static void
+qat_dc_clean(void)
+{
+ Cpa16U buff_num = 0;
+ Cpa16U num_inter_buff_lists = 0;
+
+ for (Cpa16U i = 0; i < num_inst; i++) {
+ cpaDcStopInstance(dc_inst_handles[i]);
+ QAT_PHYS_CONTIG_FREE(session_handles[i]);
+ /* free intermediate buffers */
+ if (buffer_array[i] != NULL) {
+ cpaDcGetNumIntermediateBuffers(
+ dc_inst_handles[i], &num_inter_buff_lists);
+ for (buff_num = 0; buff_num < num_inter_buff_lists;
+ buff_num++) {
+ CpaBufferList *buffer_inter =
+ buffer_array[i][buff_num];
+ if (buffer_inter->pBuffers) {
+ QAT_PHYS_CONTIG_FREE(
+ buffer_inter->pBuffers->pData);
+ QAT_PHYS_CONTIG_FREE(
+ buffer_inter->pBuffers);
+ }
+ QAT_PHYS_CONTIG_FREE(
+ buffer_inter->pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(buffer_inter);
+ }
+ }
+ }
+
+ num_inst = 0;
+ qat_dc_init_done = B_FALSE;
+}
+
+int
+qat_dc_init(void)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U sess_size = 0;
+ Cpa32U ctx_size = 0;
+ Cpa16U num_inter_buff_lists = 0;
+ Cpa16U buff_num = 0;
+ Cpa32U buff_meta_size = 0;
+ CpaDcSessionSetupData sd = {0};
+
+ if (qat_dc_init_done)
+ return (0);
+
+ status = cpaDcGetNumInstances(&num_inst);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+ /* if the user has configured no QAT compression units just return */
+ if (num_inst == 0)
+ return (0);
+
+ if (num_inst > QAT_DC_MAX_INSTANCES)
+ num_inst = QAT_DC_MAX_INSTANCES;
+
+ status = cpaDcGetInstances(num_inst, &dc_inst_handles[0]);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+ for (Cpa16U i = 0; i < num_inst; i++) {
+ cpaDcSetAddressTranslation(dc_inst_handles[i],
+ (void*)virt_to_phys);
+
+ status = cpaDcBufferListGetMetaSize(dc_inst_handles[i],
+ 1, &buff_meta_size);
+
+ if (status == CPA_STATUS_SUCCESS)
+ status = cpaDcGetNumIntermediateBuffers(
+ dc_inst_handles[i], &num_inter_buff_lists);
+
+ if (status == CPA_STATUS_SUCCESS && num_inter_buff_lists != 0)
+ status = QAT_PHYS_CONTIG_ALLOC(&buffer_array[i],
+ num_inter_buff_lists *
+ sizeof (CpaBufferList *));
+
+ for (buff_num = 0; buff_num < num_inter_buff_lists;
+ buff_num++) {
+ if (status == CPA_STATUS_SUCCESS)
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num],
+ sizeof (CpaBufferList));
+
+ if (status == CPA_STATUS_SUCCESS)
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num]->
+ pPrivateMetaData,
+ buff_meta_size);
+
+ if (status == CPA_STATUS_SUCCESS)
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num]->pBuffers,
+ sizeof (CpaFlatBuffer));
+
+ if (status == CPA_STATUS_SUCCESS) {
+ /*
+ * implementation requires an intermediate
+ * buffer approximately twice the size of
+ * output buffer, which is 2x max buffer
+ * size here.
+ */
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num]->pBuffers->
+ pData, 2 * QAT_MAX_BUF_SIZE);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ buffer_array[i][buff_num]->numBuffers = 1;
+ buffer_array[i][buff_num]->pBuffers->
+ dataLenInBytes = 2 * QAT_MAX_BUF_SIZE;
+ }
+ }
+
+ status = cpaDcStartInstance(dc_inst_handles[i],
+ num_inter_buff_lists, buffer_array[i]);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ sd.compLevel = CPA_DC_L1;
+ sd.compType = CPA_DC_DEFLATE;
+ sd.huffType = CPA_DC_HT_FULL_DYNAMIC;
+ sd.sessDirection = CPA_DC_DIR_COMBINED;
+ sd.sessState = CPA_DC_STATELESS;
+ sd.deflateWindowSize = 7;
+ sd.checksum = CPA_DC_ADLER32;
+ status = cpaDcGetSessionSize(dc_inst_handles[i],
+ &sd, &sess_size, &ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ QAT_PHYS_CONTIG_ALLOC(&session_handles[i], sess_size);
+ if (session_handles[i] == NULL)
+ goto fail;
+
+ status = cpaDcInitSession(dc_inst_handles[i],
+ session_handles[i],
+ &sd, NULL, qat_dc_callback);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ }
+
+ qat_dc_init_done = B_TRUE;
+ return (0);
+fail:
+ qat_dc_clean();
+ return (-1);
+}
+
+void
+qat_dc_fini(void)
+{
+ if (!qat_dc_init_done)
+ return;
+
+ qat_dc_clean();
+}
+
+/*
+ * The "add" parameter is an additional buffer which is passed
+ * to QAT as a scratch buffer alongside the destination buffer
+ * in case the "compressed" data ends up being larger than the
+ * original source data. This is necessary to prevent QAT from
+ * generating buffer overflow warnings for incompressible data.
+ */
+static int
+qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len,
+ char *dst, int dst_len, char *add, int add_len, size_t *c_len)
+{
+ CpaInstanceHandle dc_inst_handle;
+ CpaDcSessionHandle session_handle;
+ CpaBufferList *buf_list_src = NULL;
+ CpaBufferList *buf_list_dst = NULL;
+ CpaFlatBuffer *flat_buf_src = NULL;
+ CpaFlatBuffer *flat_buf_dst = NULL;
+ Cpa8U *buffer_meta_src = NULL;
+ Cpa8U *buffer_meta_dst = NULL;
+ Cpa32U buffer_meta_size = 0;
+ CpaDcRqResults dc_results;
+ CpaStatus status = CPA_STATUS_FAIL;
+ Cpa32U hdr_sz = 0;
+ Cpa32U compressed_sz;
+ Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2;
+ Cpa32U num_dst_buf = (dst_len >> PAGE_SHIFT) + 2;
+ Cpa32U num_add_buf = (add_len >> PAGE_SHIFT) + 2;
+ Cpa32U bytes_left;
+ Cpa32U dst_pages = 0;
+ Cpa32U adler32 = 0;
+ char *data;
+ struct page *page;
+ struct page **in_pages = NULL;
+ struct page **out_pages = NULL;
+ struct page **add_pages = NULL;
+ Cpa32U page_off = 0;
+ struct completion complete;
+ Cpa32U page_num = 0;
+ Cpa16U i;
+
+ /*
+ * We increment num_src_buf and num_dst_buf by 2 to allow
+ * us to handle non page-aligned buffer addresses and buffers
+ * whose sizes are not divisible by PAGE_SIZE.
+ */
+ Cpa32U src_buffer_list_mem_size = sizeof (CpaBufferList) +
+ (num_src_buf * sizeof (CpaFlatBuffer));
+ Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) +
+ ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer));
+
+ status = QAT_PHYS_CONTIG_ALLOC(&in_pages,
+ num_src_buf * sizeof (struct page *));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ status = QAT_PHYS_CONTIG_ALLOC(&out_pages,
+ num_dst_buf * sizeof (struct page *));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ status = QAT_PHYS_CONTIG_ALLOC(&add_pages,
+ num_add_buf * sizeof (struct page *));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+ dc_inst_handle = dc_inst_handles[i];
+ session_handle = session_handles[i];
+
+ cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf,
+ &buffer_meta_size);
+ status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf,
+ &buffer_meta_size);
+ status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ /* build source buffer list */
+ status = QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1);
+
+ buf_list_src->pBuffers = flat_buf_src; /* always point to first one */
+
+ /* build destination buffer list */
+ status = QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
+
+ buf_list_dst->pBuffers = flat_buf_dst; /* always point to first one */
+
+ buf_list_src->numBuffers = 0;
+ buf_list_src->pPrivateMetaData = buffer_meta_src;
+ bytes_left = src_len;
+ data = src;
+ page_num = 0;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ page = qat_mem_to_page(data);
+ in_pages[page_num] = page;
+ flat_buf_src->pData = kmap(page) + page_off;
+ flat_buf_src->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+ bytes_left -= flat_buf_src->dataLenInBytes;
+ data += flat_buf_src->dataLenInBytes;
+ flat_buf_src++;
+ buf_list_src->numBuffers++;
+ page_num++;
+ }
+
+ buf_list_dst->numBuffers = 0;
+ buf_list_dst->pPrivateMetaData = buffer_meta_dst;
+ bytes_left = dst_len;
+ data = dst;
+ page_num = 0;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ page = qat_mem_to_page(data);
+ flat_buf_dst->pData = kmap(page) + page_off;
+ out_pages[page_num] = page;
+ flat_buf_dst->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+ bytes_left -= flat_buf_dst->dataLenInBytes;
+ data += flat_buf_dst->dataLenInBytes;
+ flat_buf_dst++;
+ buf_list_dst->numBuffers++;
+ page_num++;
+ dst_pages++;
+ }
+
+ /* map additional scratch pages into the destination buffer list */
+ bytes_left = add_len;
+ data = add;
+ page_num = 0;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ page = qat_mem_to_page(data);
+ flat_buf_dst->pData = kmap(page) + page_off;
+ add_pages[page_num] = page;
+ flat_buf_dst->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+ bytes_left -= flat_buf_dst->dataLenInBytes;
+ data += flat_buf_dst->dataLenInBytes;
+ flat_buf_dst++;
+ buf_list_dst->numBuffers++;
+ page_num++;
+ }
+
+ init_completion(&complete);
+
+ if (dir == QAT_COMPRESS) {
+ QAT_STAT_BUMP(comp_requests);
+ QAT_STAT_INCR(comp_total_in_bytes, src_len);
+
+ cpaDcGenerateHeader(session_handle,
+ buf_list_dst->pBuffers, &hdr_sz);
+ buf_list_dst->pBuffers->pData += hdr_sz;
+ buf_list_dst->pBuffers->dataLenInBytes -= hdr_sz;
+ status = cpaDcCompressData(
+ dc_inst_handle, session_handle,
+ buf_list_src, buf_list_dst,
+ &dc_results, CPA_DC_FLUSH_FINAL,
+ &complete);
+ if (status != CPA_STATUS_SUCCESS) {
+ goto fail;
+ }
+
+ /* we now wait until the completion of the operation. */
+ wait_for_completion(&complete);
+
+ if (dc_results.status != CPA_STATUS_SUCCESS) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ compressed_sz = dc_results.produced;
+ if (compressed_sz + hdr_sz + ZLIB_FOOT_SZ > dst_len) {
+ status = CPA_STATUS_INCOMPRESSIBLE;
+ goto fail;
+ }
+
+ flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
+ /* move to the last page */
+ flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT;
+
+ /* no space for gzip footer in the last page */
+ if (((compressed_sz + hdr_sz) % PAGE_SIZE)
+ + ZLIB_FOOT_SZ > PAGE_SIZE) {
+ status = CPA_STATUS_INCOMPRESSIBLE;
+ goto fail;
+ }
+
+ /* jump to the end of the buffer and append footer */
+ flat_buf_dst->pData =
+ (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK)
+ + ((compressed_sz + hdr_sz) % PAGE_SIZE);
+ flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ;
+
+ dc_results.produced = 0;
+ status = cpaDcGenerateFooter(session_handle,
+ flat_buf_dst, &dc_results);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ *c_len = compressed_sz + dc_results.produced + hdr_sz;
+ QAT_STAT_INCR(comp_total_out_bytes, *c_len);
+ } else {
+ ASSERT3U(dir, ==, QAT_DECOMPRESS);
+ QAT_STAT_BUMP(decomp_requests);
+ QAT_STAT_INCR(decomp_total_in_bytes, src_len);
+
+ buf_list_src->pBuffers->pData += ZLIB_HEAD_SZ;
+ buf_list_src->pBuffers->dataLenInBytes -= ZLIB_HEAD_SZ;
+ status = cpaDcDecompressData(dc_inst_handle, session_handle,
+ buf_list_src, buf_list_dst, &dc_results, CPA_DC_FLUSH_FINAL,
+ &complete);
+
+ if (CPA_STATUS_SUCCESS != status) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ /* we now wait until the completion of the operation. */
+ wait_for_completion(&complete);
+
+ if (dc_results.status != CPA_STATUS_SUCCESS) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ /* verify adler checksum */
+ adler32 = *(Cpa32U *)(src + dc_results.consumed + ZLIB_HEAD_SZ);
+ if (adler32 != BSWAP_32(dc_results.checksum)) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+ *c_len = dc_results.produced;
+ QAT_STAT_INCR(decomp_total_out_bytes, *c_len);
+ }
+
+fail:
+ if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_INCOMPRESSIBLE)
+ QAT_STAT_BUMP(dc_fails);
+
+ if (in_pages) {
+ for (page_num = 0;
+ page_num < buf_list_src->numBuffers;
+ page_num++) {
+ kunmap(in_pages[page_num]);
+ }
+ QAT_PHYS_CONTIG_FREE(in_pages);
+ }
+
+ if (out_pages) {
+ for (page_num = 0; page_num < dst_pages; page_num++) {
+ kunmap(out_pages[page_num]);
+ }
+ QAT_PHYS_CONTIG_FREE(out_pages);
+ }
+
+ if (add_pages) {
+ for (page_num = 0;
+ page_num < buf_list_dst->numBuffers - dst_pages;
+ page_num++) {
+ kunmap(add_pages[page_num]);
+ }
+ QAT_PHYS_CONTIG_FREE(add_pages);
+ }
+
+ QAT_PHYS_CONTIG_FREE(buffer_meta_src);
+ QAT_PHYS_CONTIG_FREE(buffer_meta_dst);
+ QAT_PHYS_CONTIG_FREE(buf_list_src);
+ QAT_PHYS_CONTIG_FREE(buf_list_dst);
+
+ return (status);
+}
+
+/*
+ * Entry point for QAT accelerated compression / decompression.
+ */
+int
+qat_compress(qat_compress_dir_t dir, char *src, int src_len,
+ char *dst, int dst_len, size_t *c_len)
+{
+ int ret;
+ size_t add_len = 0;
+ void *add = NULL;
+
+ if (dir == QAT_COMPRESS) {
+ add_len = dst_len;
+ add = zio_data_buf_alloc(add_len);
+ }
+
+ ret = qat_compress_impl(dir, src, src_len, dst,
+ dst_len, add, add_len, c_len);
+
+ if (dir == QAT_COMPRESS)
+ zio_data_buf_free(add, add_len);
+
+ return (ret);
+}
+
+static int
+param_set_qat_compress(const char *val, zfs_kernel_param_t *kp)
+{
+ int ret;
+ int *pvalue = kp->arg;
+ ret = param_set_int(val, kp);
+ if (ret)
+ return (ret);
+ /*
+ * zfs_qat_compress_disable = 0: enable QAT compression.
+ * Try to initialize the QAT instance if that has not been done yet.
+ */
+ if (*pvalue == 0 && !qat_dc_init_done) {
+ ret = qat_dc_init();
+ if (ret != 0) {
+ zfs_qat_compress_disable = 1;
+ return (ret);
+ }
+ }
+ return (ret);
+}
+
+module_param_call(zfs_qat_compress_disable, param_set_qat_compress,
+ param_get_int, &zfs_qat_compress_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_compress_disable, "Enable/Disable QAT compression");
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c
new file mode 100644
index 000000000000..4771b2f3bec5
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c
@@ -0,0 +1,630 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * This file represents the QAT implementation of checksums and encryption.
+ * Internally, QAT shares the same cryptographic instances for both of these
+ * operations, so the code has been combined here. QAT data compression uses
+ * separate compression instances, so that code lives in qat_compress.c.
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/completion.h>
+#include <sys/zfs_context.h>
+#include <sys/zio_crypt.h>
+#include "lac/cpa_cy_im.h"
+#include "lac/cpa_cy_common.h"
+#include <sys/qat.h>
+
+/*
+ * Max instances in a QAT device. Each instance is a channel used to submit
+ * jobs to the QAT hardware. This limit is only used to pre-allocate the
+ * instance and session arrays; the actual number of instances is defined
+ * in the QAT driver's configuration file.
+ */
+#define QAT_CRYPT_MAX_INSTANCES 48
+
+#define MAX_PAGE_NUM 1024
+
+static Cpa32U inst_num = 0;
+static Cpa16U num_inst = 0;
+static CpaInstanceHandle cy_inst_handles[QAT_CRYPT_MAX_INSTANCES];
+static boolean_t qat_cy_init_done = B_FALSE;
+int zfs_qat_encrypt_disable = 0;
+int zfs_qat_checksum_disable = 0;
+
+typedef struct cy_callback {
+ CpaBoolean verify_result;
+ struct completion complete;
+} cy_callback_t;
+
+static void
+symcallback(void *p_callback, CpaStatus status, const CpaCySymOp operation,
+ void *op_data, CpaBufferList *buf_list_dst, CpaBoolean verify)
+{
+ cy_callback_t *cb = p_callback;
+
+ if (cb != NULL) {
+ /* indicate that the function has been called */
+ cb->verify_result = verify;
+ complete(&cb->complete);
+ }
+}
+
+boolean_t
+qat_crypt_use_accel(size_t s_len)
+{
+ return (!zfs_qat_encrypt_disable &&
+ qat_cy_init_done &&
+ s_len >= QAT_MIN_BUF_SIZE &&
+ s_len <= QAT_MAX_BUF_SIZE);
+}
+
+boolean_t
+qat_checksum_use_accel(size_t s_len)
+{
+ return (!zfs_qat_checksum_disable &&
+ qat_cy_init_done &&
+ s_len >= QAT_MIN_BUF_SIZE &&
+ s_len <= QAT_MAX_BUF_SIZE);
+}
+
+void
+qat_cy_clean(void)
+{
+ for (Cpa16U i = 0; i < num_inst; i++)
+ cpaCyStopInstance(cy_inst_handles[i]);
+
+ num_inst = 0;
+ qat_cy_init_done = B_FALSE;
+}
+
+int
+qat_cy_init(void)
+{
+ CpaStatus status = CPA_STATUS_FAIL;
+
+ if (qat_cy_init_done)
+ return (0);
+
+ status = cpaCyGetNumInstances(&num_inst);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+	/* if the user has configured no QAT encryption units, just return */
+ if (num_inst == 0)
+ return (0);
+
+ if (num_inst > QAT_CRYPT_MAX_INSTANCES)
+ num_inst = QAT_CRYPT_MAX_INSTANCES;
+
+ status = cpaCyGetInstances(num_inst, &cy_inst_handles[0]);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+ for (Cpa16U i = 0; i < num_inst; i++) {
+ status = cpaCySetAddressTranslation(cy_inst_handles[i],
+ (void *)virt_to_phys);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+
+ status = cpaCyStartInstance(cy_inst_handles[i]);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+ }
+
+ qat_cy_init_done = B_TRUE;
+ return (0);
+
+error:
+ qat_cy_clean();
+ return (-1);
+}
+
+void
+qat_cy_fini(void)
+{
+ if (!qat_cy_init_done)
+ return;
+
+ qat_cy_clean();
+}
+
+static CpaStatus
+qat_init_crypt_session_ctx(qat_encrypt_dir_t dir, CpaInstanceHandle inst_handle,
+ CpaCySymSessionCtx **cy_session_ctx, crypto_key_t *key,
+ Cpa64U crypt, Cpa32U aad_len)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U ctx_size;
+	Cpa32U cipher_algorithm;
+ Cpa32U hash_algorithm;
+ CpaCySymSessionSetupData sd = { 0 };
+
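+	/*
+	 * Only AES-GCM is supported here; CCM requests are rejected so the
+	 * caller can fall back to the software implementation.
+	 */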
+ if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_CCM) {
+ return (CPA_STATUS_FAIL);
+ } else {
+		cipher_algorithm = CPA_CY_SYM_CIPHER_AES_GCM;
+ hash_algorithm = CPA_CY_SYM_HASH_AES_GCM;
+ }
+
+	sd.cipherSetupData.cipherAlgorithm = cipher_algorithm;
+ sd.cipherSetupData.pCipherKey = key->ck_data;
+ sd.cipherSetupData.cipherKeyLenInBytes = key->ck_length / 8;
+ sd.hashSetupData.hashAlgorithm = hash_algorithm;
+ sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH;
+ sd.hashSetupData.digestResultLenInBytes = ZIO_DATA_MAC_LEN;
+ sd.hashSetupData.authModeSetupData.aadLenInBytes = aad_len;
+ sd.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+ sd.symOperation = CPA_CY_SYM_OP_ALGORITHM_CHAINING;
+ sd.digestIsAppended = CPA_FALSE;
+ sd.verifyDigest = CPA_FALSE;
+
+ if (dir == QAT_ENCRYPT) {
+ sd.cipherSetupData.cipherDirection =
+ CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT;
+ sd.algChainOrder =
+ CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER;
+ } else {
+ ASSERT3U(dir, ==, QAT_DECRYPT);
+ sd.cipherSetupData.cipherDirection =
+ CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT;
+ sd.algChainOrder =
+ CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH;
+ }
+
+ status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = cpaCySymInitSession(inst_handle, symcallback, &sd,
+ *cy_session_ctx);
+ if (status != CPA_STATUS_SUCCESS) {
+ QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
+ return (status);
+ }
+
+ return (CPA_STATUS_SUCCESS);
+}
+
+static CpaStatus
+qat_init_checksum_session_ctx(CpaInstanceHandle inst_handle,
+ CpaCySymSessionCtx **cy_session_ctx, Cpa64U cksum)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U ctx_size;
+ Cpa32U hash_algorithm;
+ CpaCySymSessionSetupData sd = { 0 };
+
+ /*
+ * ZFS's SHA512 checksum is actually SHA512/256, which uses
+ * a different IV from standard SHA512. QAT does not support
+ * SHA512/256, so we can only support SHA256.
+ */
+ if (cksum == ZIO_CHECKSUM_SHA256)
+ hash_algorithm = CPA_CY_SYM_HASH_SHA256;
+ else
+ return (CPA_STATUS_FAIL);
+
+ sd.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+ sd.symOperation = CPA_CY_SYM_OP_HASH;
+ sd.hashSetupData.hashAlgorithm = hash_algorithm;
+ sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_PLAIN;
+ sd.hashSetupData.digestResultLenInBytes = sizeof (zio_cksum_t);
+ sd.digestIsAppended = CPA_FALSE;
+ sd.verifyDigest = CPA_FALSE;
+
+ status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = cpaCySymInitSession(inst_handle, symcallback, &sd,
+ *cy_session_ctx);
+ if (status != CPA_STATUS_SUCCESS) {
+ QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
+ return (status);
+ }
+
+ return (CPA_STATUS_SUCCESS);
+}
+
+static CpaStatus
+qat_init_cy_buffer_lists(CpaInstanceHandle inst_handle, uint32_t nr_bufs,
+ CpaBufferList *src, CpaBufferList *dst)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U meta_size = 0;
+
+ status = cpaCyBufferListGetMetaSize(inst_handle, nr_bufs, &meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = QAT_PHYS_CONTIG_ALLOC(&src->pPrivateMetaData, meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+
+ if (src != dst) {
+ status = QAT_PHYS_CONTIG_ALLOC(&dst->pPrivateMetaData,
+ meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+ }
+
+ return (CPA_STATUS_SUCCESS);
+
+error:
+ QAT_PHYS_CONTIG_FREE(src->pPrivateMetaData);
+ if (src != dst)
+ QAT_PHYS_CONTIG_FREE(dst->pPrivateMetaData);
+
+ return (status);
+}
+
+int
+qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf,
+ uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf,
+ crypto_key_t *key, uint64_t crypt, uint32_t enc_len)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa16U i;
+ CpaInstanceHandle cy_inst_handle;
+ Cpa16U nr_bufs = (enc_len >> PAGE_SHIFT) + 2;
+ Cpa32U bytes_left = 0;
+ Cpa8S *data = NULL;
+ CpaCySymSessionCtx *cy_session_ctx = NULL;
+ cy_callback_t cb;
+ CpaCySymOpData op_data = { 0 };
+ CpaBufferList src_buffer_list = { 0 };
+ CpaBufferList dst_buffer_list = { 0 };
+ CpaFlatBuffer *flat_src_buf_array = NULL;
+ CpaFlatBuffer *flat_src_buf = NULL;
+ CpaFlatBuffer *flat_dst_buf_array = NULL;
+ CpaFlatBuffer *flat_dst_buf = NULL;
+ struct page *in_pages[MAX_PAGE_NUM];
+ struct page *out_pages[MAX_PAGE_NUM];
+ Cpa32U in_page_num = 0;
+ Cpa32U out_page_num = 0;
+ Cpa32U in_page_off = 0;
+ Cpa32U out_page_off = 0;
+
+ if (dir == QAT_ENCRYPT) {
+ QAT_STAT_BUMP(encrypt_requests);
+ QAT_STAT_INCR(encrypt_total_in_bytes, enc_len);
+ } else {
+ QAT_STAT_BUMP(decrypt_requests);
+ QAT_STAT_INCR(decrypt_total_in_bytes, enc_len);
+ }
+
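+	/* Spread requests across the available instances round-robin. */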
+ i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+ cy_inst_handle = cy_inst_handles[i];
+
+ status = qat_init_crypt_session_ctx(dir, cy_inst_handle,
+ &cy_session_ctx, key, crypt, aad_len);
+ if (status != CPA_STATUS_SUCCESS) {
+ /* don't count CCM as a failure since it's not supported */
+ if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_GCM)
+ QAT_STAT_BUMP(crypt_fails);
+ return (status);
+ }
+
+ /*
+ * We increment nr_bufs by 2 to allow us to handle non
+ * page-aligned buffer addresses and buffers whose sizes
+ * are not divisible by PAGE_SIZE.
+ */
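+	/*
+	 * For example (assuming 4 KiB pages), a 128 KiB buffer that starts
+	 * mid-page spans 33 pages, one more than enc_len >> PAGE_SHIFT; the
+	 * second spare entry is needed when the length is additionally not
+	 * a multiple of PAGE_SIZE.
+	 */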
+ status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
+ &src_buffer_list, &dst_buffer_list);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
+ nr_bufs * sizeof (CpaFlatBuffer));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&flat_dst_buf_array,
+ nr_bufs * sizeof (CpaFlatBuffer));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&op_data.pDigestResult,
+ ZIO_DATA_MAC_LEN);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&op_data.pIv,
+ ZIO_DATA_IV_LEN);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ if (aad_len > 0) {
+ status = QAT_PHYS_CONTIG_ALLOC(&op_data.pAdditionalAuthData,
+ aad_len);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ bcopy(aad_buf, op_data.pAdditionalAuthData, aad_len);
+ }
+
+ bytes_left = enc_len;
+ data = src_buf;
+ flat_src_buf = flat_src_buf_array;
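+	/*
+	 * Walk the source buffer a page at a time, kmap()ing each backing
+	 * page and describing it with one CpaFlatBuffer in the
+	 * scatter-gather list handed to the hardware.
+	 */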
+ while (bytes_left > 0) {
+ in_page_off = ((long)data & ~PAGE_MASK);
+ in_pages[in_page_num] = qat_mem_to_page(data);
+ flat_src_buf->pData = kmap(in_pages[in_page_num]) + in_page_off;
+ flat_src_buf->dataLenInBytes =
+ min((long)PAGE_SIZE - in_page_off, (long)bytes_left);
+ data += flat_src_buf->dataLenInBytes;
+ bytes_left -= flat_src_buf->dataLenInBytes;
+ flat_src_buf++;
+ in_page_num++;
+ }
+ src_buffer_list.pBuffers = flat_src_buf_array;
+ src_buffer_list.numBuffers = in_page_num;
+
+ bytes_left = enc_len;
+ data = dst_buf;
+ flat_dst_buf = flat_dst_buf_array;
+ while (bytes_left > 0) {
+ out_page_off = ((long)data & ~PAGE_MASK);
+ out_pages[out_page_num] = qat_mem_to_page(data);
+ flat_dst_buf->pData = kmap(out_pages[out_page_num]) +
+ out_page_off;
+ flat_dst_buf->dataLenInBytes =
+ min((long)PAGE_SIZE - out_page_off, (long)bytes_left);
+ data += flat_dst_buf->dataLenInBytes;
+ bytes_left -= flat_dst_buf->dataLenInBytes;
+ flat_dst_buf++;
+ out_page_num++;
+ }
+ dst_buffer_list.pBuffers = flat_dst_buf_array;
+ dst_buffer_list.numBuffers = out_page_num;
+
+ op_data.sessionCtx = cy_session_ctx;
+ op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+ op_data.cryptoStartSrcOffsetInBytes = 0;
+ op_data.messageLenToCipherInBytes = 0;
+ op_data.hashStartSrcOffsetInBytes = 0;
+ op_data.messageLenToHashInBytes = 0;
+ op_data.messageLenToCipherInBytes = enc_len;
+ op_data.ivLenInBytes = ZIO_DATA_IV_LEN;
+ bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN);
+ /* if dir is QAT_DECRYPT, copy digest_buf to pDigestResult */
+ if (dir == QAT_DECRYPT)
+ bcopy(digest_buf, op_data.pDigestResult, ZIO_DATA_MAC_LEN);
+
+ cb.verify_result = CPA_FALSE;
+ init_completion(&cb.complete);
+ status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
+ &src_buffer_list, &dst_buffer_list, NULL);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ /* we now wait until the completion of the operation. */
+ wait_for_completion(&cb.complete);
+
+ if (cb.verify_result == CPA_FALSE) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ if (dir == QAT_ENCRYPT) {
+ /* if dir is QAT_ENCRYPT, save pDigestResult to digest_buf */
+ bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN);
+ QAT_STAT_INCR(encrypt_total_out_bytes, enc_len);
+ } else {
+ QAT_STAT_INCR(decrypt_total_out_bytes, enc_len);
+ }
+
+fail:
+ if (status != CPA_STATUS_SUCCESS)
+ QAT_STAT_BUMP(crypt_fails);
+
+ for (i = 0; i < in_page_num; i++)
+ kunmap(in_pages[i]);
+ for (i = 0; i < out_page_num; i++)
+ kunmap(out_pages[i]);
+
+ cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
+ if (aad_len > 0)
+ QAT_PHYS_CONTIG_FREE(op_data.pAdditionalAuthData);
+ QAT_PHYS_CONTIG_FREE(op_data.pIv);
+ QAT_PHYS_CONTIG_FREE(op_data.pDigestResult);
+ QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(dst_buffer_list.pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(cy_session_ctx);
+ QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
+ QAT_PHYS_CONTIG_FREE(flat_dst_buf_array);
+
+ return (status);
+}
+
+int
+qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ CpaStatus status;
+ Cpa16U i;
+ CpaInstanceHandle cy_inst_handle;
+ Cpa16U nr_bufs = (size >> PAGE_SHIFT) + 2;
+ Cpa32U bytes_left = 0;
+ Cpa8S *data = NULL;
+ CpaCySymSessionCtx *cy_session_ctx = NULL;
+ cy_callback_t cb;
+ Cpa8U *digest_buffer = NULL;
+ CpaCySymOpData op_data = { 0 };
+ CpaBufferList src_buffer_list = { 0 };
+ CpaFlatBuffer *flat_src_buf_array = NULL;
+ CpaFlatBuffer *flat_src_buf = NULL;
+ struct page *in_pages[MAX_PAGE_NUM];
+ Cpa32U page_num = 0;
+ Cpa32U page_off = 0;
+
+ QAT_STAT_BUMP(cksum_requests);
+ QAT_STAT_INCR(cksum_total_in_bytes, size);
+
+ i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+ cy_inst_handle = cy_inst_handles[i];
+
+ status = qat_init_checksum_session_ctx(cy_inst_handle,
+ &cy_session_ctx, cksum);
+ if (status != CPA_STATUS_SUCCESS) {
+ /* don't count unsupported checksums as a failure */
+ if (cksum == ZIO_CHECKSUM_SHA256 ||
+ cksum == ZIO_CHECKSUM_SHA512)
+ QAT_STAT_BUMP(cksum_fails);
+ return (status);
+ }
+
+ /*
+ * We increment nr_bufs by 2 to allow us to handle non
+ * page-aligned buffer addresses and buffers whose sizes
+ * are not divisible by PAGE_SIZE.
+ */
+ status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
+ &src_buffer_list, &src_buffer_list);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
+ nr_bufs * sizeof (CpaFlatBuffer));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&digest_buffer,
+ sizeof (zio_cksum_t));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ bytes_left = size;
+ data = buf;
+ flat_src_buf = flat_src_buf_array;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ in_pages[page_num] = qat_mem_to_page(data);
+ flat_src_buf->pData = kmap(in_pages[page_num]) + page_off;
+ flat_src_buf->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+ data += flat_src_buf->dataLenInBytes;
+ bytes_left -= flat_src_buf->dataLenInBytes;
+ flat_src_buf++;
+ page_num++;
+ }
+ src_buffer_list.pBuffers = flat_src_buf_array;
+ src_buffer_list.numBuffers = page_num;
+
+ op_data.sessionCtx = cy_session_ctx;
+ op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+ op_data.hashStartSrcOffsetInBytes = 0;
+ op_data.messageLenToHashInBytes = size;
+ op_data.pDigestResult = digest_buffer;
+
+ cb.verify_result = CPA_FALSE;
+ init_completion(&cb.complete);
+ status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
+ &src_buffer_list, &src_buffer_list, NULL);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ /* we now wait until the completion of the operation. */
+ wait_for_completion(&cb.complete);
+
+ if (cb.verify_result == CPA_FALSE) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ bcopy(digest_buffer, zcp, sizeof (zio_cksum_t));
+
+fail:
+ if (status != CPA_STATUS_SUCCESS)
+ QAT_STAT_BUMP(cksum_fails);
+
+ for (i = 0; i < page_num; i++)
+ kunmap(in_pages[i]);
+
+ cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
+ QAT_PHYS_CONTIG_FREE(digest_buffer);
+ QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(cy_session_ctx);
+ QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
+
+ return (status);
+}
+
+static int
+param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp)
+{
+ int ret;
+ int *pvalue = kp->arg;
+ ret = param_set_int(val, kp);
+ if (ret)
+ return (ret);
+ /*
+ * zfs_qat_encrypt_disable = 0: enable qat encrypt
+ * try to initialize qat instance if it has not been done
+ */
+ if (*pvalue == 0 && !qat_cy_init_done) {
+ ret = qat_cy_init();
+ if (ret != 0) {
+ zfs_qat_encrypt_disable = 1;
+ return (ret);
+ }
+ }
+ return (ret);
+}
+
+static int
+param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp)
+{
+ int ret;
+ int *pvalue = kp->arg;
+ ret = param_set_int(val, kp);
+ if (ret)
+ return (ret);
+ /*
+	 * zfs_qat_checksum_disable = 0: enable qat checksum
+ * try to initialize qat instance if it has not been done
+ */
+ if (*pvalue == 0 && !qat_cy_init_done) {
+ ret = qat_cy_init();
+ if (ret != 0) {
+ zfs_qat_checksum_disable = 1;
+ return (ret);
+ }
+ }
+ return (ret);
+}
+
+module_param_call(zfs_qat_encrypt_disable, param_set_qat_encrypt,
+ param_get_int, &zfs_qat_encrypt_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_encrypt_disable, "Enable/Disable QAT encryption");
+
+module_param_call(zfs_qat_checksum_disable, param_set_qat_checksum,
+ param_get_int, &zfs_qat_checksum_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_checksum_disable, "Enable/Disable QAT checksumming");
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
new file mode 100644
index 000000000000..5672cd6d5c5e
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
@@ -0,0 +1,110 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fm/util.h>
+#include <sys/dsl_scan.h>
+#include <sys/fs/zfs.h>
+#include <sys/kstat.h>
+#include "zfs_prop.h"
+
+
+int
+param_set_deadman_failmode(const char *val, zfs_kernel_param_t *kp)
+{
+ int error;
+
+ error = -param_set_deadman_failmode_common(val);
+ if (error == 0)
+ error = param_set_charp(val, kp);
+
+ return (error);
+}
+
+int
+param_set_deadman_ziotime(const char *val, zfs_kernel_param_t *kp)
+{
+ int error;
+
+ error = param_set_ulong(val, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_ziotime_ms));
+
+ return (0);
+}
+
+int
+param_set_deadman_synctime(const char *val, zfs_kernel_param_t *kp)
+{
+ int error;
+
+ error = param_set_ulong(val, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms));
+
+ return (0);
+}
+
+int
+param_set_slop_shift(const char *buf, zfs_kernel_param_t *kp)
+{
+ unsigned long val;
+ int error;
+
+ error = kstrtoul(buf, 0, &val);
+ if (error)
+ return (SET_ERROR(error));
+
+ if (val < 1 || val > 31)
+ return (SET_ERROR(-EINVAL));
+
+ error = param_set_int(buf, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ return (0);
+}
+
+const char *
+spa_history_zone(void)
+{
+ return ("linux");
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/trace.c b/sys/contrib/openzfs/module/os/linux/zfs/trace.c
new file mode 100644
index 000000000000..a690822ae14c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/trace.c
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Each DTRACE_PROBE must define its trace point in one (and only one)
+ * source file, so this dummy file exists for that purpose.
+ */
+
+#include <sys/multilist.h>
+#include <sys/arc_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/zfs_znode.h>
+#include <sys/zil_impl.h>
+
+#ifdef _KERNEL
+#define CREATE_TRACE_POINTS
+#include <sys/trace.h>
+#include <sys/trace_acl.h>
+#include <sys/trace_arc.h>
+#include <sys/trace_dbgmsg.h>
+#include <sys/trace_dbuf.h>
+#include <sys/trace_dmu.h>
+#include <sys/trace_dnode.h>
+#include <sys/trace_multilist.h>
+#include <sys/trace_rrwlock.h>
+#include <sys/trace_txg.h>
+#include <sys/trace_vdev.h>
+#include <sys/trace_zil.h>
+#include <sys/trace_zio.h>
+#include <sys/trace_zrlock.h>
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
new file mode 100644
index 000000000000..b373f2c2e83c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -0,0 +1,919 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * LLNL-CODE-403049.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <linux/blkpg.h>
+#include <linux/msdos_fs.h>
+#include <linux/vfs_compat.h>
+
+typedef struct vdev_disk {
+ struct block_device *vd_bdev;
+ krwlock_t vd_lock;
+} vdev_disk_t;
+
+/*
+ * Unique identifier for the exclusive vdev holder.
+ */
+static void *zfs_vdev_holder = VDEV_HOLDER;
+
+/*
+ * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
+ * device is missing. The missing path may be transient since the links
+ * can be briefly removed and recreated in response to udev events.
+ */
+static unsigned zfs_vdev_open_timeout_ms = 1000;
+
+/*
+ * Size of the "reserved" partition, in blocks.
+ */
+#define EFI_MIN_RESV_SIZE (16 * 1024)
+
+/*
+ * Virtual device vector for disks.
+ */
+typedef struct dio_request {
+ zio_t *dr_zio; /* Parent ZIO */
+ atomic_t dr_ref; /* References */
+ int dr_error; /* Bio error */
+ int dr_bio_count; /* Count of bio's */
+ struct bio *dr_bio[0]; /* Attached bio's */
+} dio_request_t;
+
+static fmode_t
+vdev_bdev_mode(spa_mode_t spa_mode)
+{
+ fmode_t mode = 0;
+
+ if (spa_mode & SPA_MODE_READ)
+ mode |= FMODE_READ;
+
+ if (spa_mode & SPA_MODE_WRITE)
+ mode |= FMODE_WRITE;
+
+ return (mode);
+}
+
+/*
+ * Returns the usable capacity (in bytes) for the partition or disk.
+ */
+static uint64_t
+bdev_capacity(struct block_device *bdev)
+{
+ return (i_size_read(bdev->bd_inode));
+}
+
+#if !defined(HAVE_BDEV_WHOLE)
+static inline struct block_device *
+bdev_whole(struct block_device *bdev)
+{
+ return (bdev->bd_contains);
+}
+#endif
+
+/*
+ * Returns the maximum expansion capacity of the block device (in bytes).
+ *
+ * It is possible to expand a vdev when it has been created as a wholedisk
+ * and the containing block device has increased in capacity. Or when the
+ * partition containing the pool has been manually increased in size.
+ *
+ * This function is only responsible for calculating the potential expansion
+ * size so it can be reported by 'zpool list'. efi_use_whole_disk() is
+ * responsible for verifying the expected partition layout in the wholedisk
+ * case, and updating the partition table if appropriate. Once the partition
+ * size has been increased, the additional capacity will be visible using
+ * bdev_capacity().
+ *
+ * The returned maximum expansion capacity is always expected to be larger
+ * than, or at the very least equal to, the usable capacity, to prevent
+ * overestimating the pool expandsize.
+ */
+static uint64_t
+bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
+{
+ uint64_t psize;
+ int64_t available;
+
+ if (wholedisk && bdev != bdev_whole(bdev)) {
+ /*
+		 * When reporting maximum expansion capacity for a wholedisk,
+		 * deduct any capacity which is expected to be lost due to
+		 * alignment restrictions. Over-reporting this value isn't
+		 * harmful and would only result in slightly less capacity
+ * than expected post expansion.
+ * The estimated available space may be slightly smaller than
+ * bdev_capacity() for devices where the number of sectors is
+ * not a multiple of the alignment size and the partition layout
+ * is keeping less than PARTITION_END_ALIGNMENT bytes after the
+ * "reserved" EFI partition: in such cases return the device
+ * usable capacity.
+ */
+ available = i_size_read(bdev_whole(bdev)->bd_inode) -
+ ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
+ PARTITION_END_ALIGNMENT) << SECTOR_BITS);
+ psize = MAX(available, bdev_capacity(bdev));
+ } else {
+ psize = bdev_capacity(bdev);
+ }
+
+ return (psize);
+}
+
+static void
+vdev_disk_error(zio_t *zio)
+{
+ /*
+ * This function can be called in interrupt context, for instance while
+ * handling IRQs coming from a misbehaving disk device; use printk()
+ * which is safe from any context.
+ */
+ printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
+ "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
+ zio->io_vd->vdev_path, zio->io_error, zio->io_type,
+ (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
+ zio->io_flags);
+}
+
+static int
+vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ struct block_device *bdev;
+ fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
+ hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
+ vdev_disk_t *vd;
+
+ /* Must have a pathname and it must be absolute. */
+ if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
+ v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ vdev_dbgmsg(v, "invalid vdev_path");
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+	 * Reopen the device if it is currently open. When expanding a
+	 * partition, force a re-scan of the partition table if userland
+	 * did not take care of this already. We need to do this while closed
+	 * in order to get an accurate updated block device size. Then,
+	 * since udev may need to recreate the device links, increase the
+	 * open retry timeout before reporting the device as unavailable.
+ */
+ vd = v->vdev_tsd;
+ if (vd) {
+ char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
+ boolean_t reread_part = B_FALSE;
+
+ rw_enter(&vd->vd_lock, RW_WRITER);
+ bdev = vd->vd_bdev;
+ vd->vd_bdev = NULL;
+
+ if (bdev) {
+ if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
+ bdevname(bdev_whole(bdev), disk_name + 5);
+ /*
+ * If userland has BLKPG_RESIZE_PARTITION,
+ * then it should have updated the partition
+ * table already. We can detect this by
+ * comparing our current physical size
+ * with that of the device. If they are
+ * the same, then we must not have
+ * BLKPG_RESIZE_PARTITION or it failed to
+				 * update the partition table online. We
+				 * fall back to rescanning the partition
+				 * table from the kernel below. However,
+ * if the capacity already reflects the
+ * updated partition, then we skip
+ * rescanning the partition table here.
+ */
+ if (v->vdev_psize == bdev_capacity(bdev))
+ reread_part = B_TRUE;
+ }
+
+ blkdev_put(bdev, mode | FMODE_EXCL);
+ }
+
+ if (reread_part) {
+ bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
+ zfs_vdev_holder);
+ if (!IS_ERR(bdev)) {
+ int error = vdev_bdev_reread_part(bdev);
+ blkdev_put(bdev, mode | FMODE_EXCL);
+ if (error == 0) {
+ timeout = MSEC2NSEC(
+ zfs_vdev_open_timeout_ms * 2);
+ }
+ }
+ }
+ } else {
+ vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+ rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
+ rw_enter(&vd->vd_lock, RW_WRITER);
+ }
+
+ /*
+ * Devices are always opened by the path provided at configuration
+ * time. This means that if the provided path is a udev by-id path
+ * then drives may be re-cabled without an issue. If the provided
+ * path is a udev by-path path, then the physical location information
+ * will be preserved. This can be critical for more complicated
+ * configurations where drives are located in specific physical
+	 * locations to maximize the system's tolerance to component failure.
+ *
+ * Alternatively, you can provide your own udev rule to flexibly map
+ * the drives as you see fit. It is not advised that you use the
+ * /dev/[hd]d devices which may be reordered due to probing order.
+ * Devices in the wrong locations will be detected by the higher
+ * level vdev validation.
+ *
+ * The specified paths may be briefly removed and recreated in
+ * response to udev events. This should be exceptionally unlikely
+ * because the zpool command makes every effort to verify these paths
+ * have already settled prior to reaching this point. Therefore,
+	 * an ENOENT failure at this point is highly likely to be transient
+ * and it is reasonable to sleep and retry before giving up. In
+ * practice delays have been observed to be on the order of 100ms.
+ */
+ hrtime_t start = gethrtime();
+ bdev = ERR_PTR(-ENXIO);
+ while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
+ bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
+ zfs_vdev_holder);
+ if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
+ schedule_timeout(MSEC_TO_TICK(10));
+ } else if (IS_ERR(bdev)) {
+ break;
+ }
+ }
+
+ if (IS_ERR(bdev)) {
+ int error = -PTR_ERR(bdev);
+ vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
+ (u_longlong_t)(gethrtime() - start),
+ (u_longlong_t)timeout);
+ vd->vd_bdev = NULL;
+ v->vdev_tsd = vd;
+ rw_exit(&vd->vd_lock);
+ return (SET_ERROR(error));
+ } else {
+ vd->vd_bdev = bdev;
+ v->vdev_tsd = vd;
+ rw_exit(&vd->vd_lock);
+ }
+
+ struct request_queue *q = bdev_get_queue(vd->vd_bdev);
+
+ /* Determine the physical block size */
+ int physical_block_size = bdev_physical_block_size(vd->vd_bdev);
+
+ /* Determine the logical block size */
+ int logical_block_size = bdev_logical_block_size(vd->vd_bdev);
+
+ /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
+ v->vdev_nowritecache = B_FALSE;
+
+ /* Set when device reports it supports TRIM. */
+ v->vdev_has_trim = !!blk_queue_discard(q);
+
+ /* Set when device reports it supports secure TRIM. */
+ v->vdev_has_securetrim = !!blk_queue_discard_secure(q);
+
+ /* Inform the ZIO pipeline that we are non-rotational */
+ v->vdev_nonrot = blk_queue_nonrot(q);
+
+ /* Physical volume size in bytes for the partition */
+ *psize = bdev_capacity(vd->vd_bdev);
+
+ /* Physical volume size in bytes including possible expansion space */
+ *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
+
+ /* Based on the minimum sector size set the block size */
+ *physical_ashift = highbit64(MAX(physical_block_size,
+ SPA_MINBLOCKSIZE)) - 1;
+
+ *logical_ashift = highbit64(MAX(logical_block_size,
+ SPA_MINBLOCKSIZE)) - 1;
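+	/*
+	 * Illustrative example: a 4096-byte physical block size yields
+	 * highbit64(4096) - 1 = 12, i.e. an ashift of 12.
+	 */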
+
+ return (0);
+}
+
+static void
+vdev_disk_close(vdev_t *v)
+{
+ vdev_disk_t *vd = v->vdev_tsd;
+
+ if (v->vdev_reopening || vd == NULL)
+ return;
+
+ if (vd->vd_bdev != NULL) {
+ blkdev_put(vd->vd_bdev,
+ vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
+ }
+
+ rw_destroy(&vd->vd_lock);
+ kmem_free(vd, sizeof (vdev_disk_t));
+ v->vdev_tsd = NULL;
+}
+
+static dio_request_t *
+vdev_disk_dio_alloc(int bio_count)
+{
+ dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
+ sizeof (struct bio *) * bio_count, KM_SLEEP);
+ atomic_set(&dr->dr_ref, 0);
+ dr->dr_bio_count = bio_count;
+ dr->dr_error = 0;
+
+ for (int i = 0; i < dr->dr_bio_count; i++)
+ dr->dr_bio[i] = NULL;
+
+ return (dr);
+}
+
+static void
+vdev_disk_dio_free(dio_request_t *dr)
+{
+ int i;
+
+ for (i = 0; i < dr->dr_bio_count; i++)
+ if (dr->dr_bio[i])
+ bio_put(dr->dr_bio[i]);
+
+ kmem_free(dr, sizeof (dio_request_t) +
+ sizeof (struct bio *) * dr->dr_bio_count);
+}
+
+static void
+vdev_disk_dio_get(dio_request_t *dr)
+{
+ atomic_inc(&dr->dr_ref);
+}
+
+static int
+vdev_disk_dio_put(dio_request_t *dr)
+{
+ int rc = atomic_dec_return(&dr->dr_ref);
+
+ /*
+	 * Free the dio_request when the last reference is dropped and
+	 * ensure zio_delay_interrupt() is called only once with the
+	 * correct zio.
+ */
+ if (rc == 0) {
+ zio_t *zio = dr->dr_zio;
+ int error = dr->dr_error;
+
+ vdev_disk_dio_free(dr);
+
+ if (zio) {
+ zio->io_error = error;
+ ASSERT3S(zio->io_error, >=, 0);
+ if (zio->io_error)
+ vdev_disk_error(zio);
+
+ zio_delay_interrupt(zio);
+ }
+ }
+
+ return (rc);
+}
+
+BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
+{
+ dio_request_t *dr = bio->bi_private;
+ int rc;
+
+ if (dr->dr_error == 0) {
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ dr->dr_error = BIO_END_IO_ERROR(bio);
+#else
+ if (error)
+ dr->dr_error = -(error);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ dr->dr_error = EIO;
+#endif
+ }
+
+ /* Drop reference acquired by __vdev_disk_physio */
+ rc = vdev_disk_dio_put(dr);
+}
+
+static inline void
+vdev_submit_bio_impl(struct bio *bio)
+{
+#ifdef HAVE_1ARG_SUBMIT_BIO
+ submit_bio(bio);
+#else
+ submit_bio(0, bio);
+#endif
+}
+
+/*
+ * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
+ * replace it with preempt_schedule under the following condition:
+ */
+#if defined(CONFIG_ARM64) && \
+ defined(CONFIG_PREEMPTION) && \
+ defined(CONFIG_BLK_CGROUP)
+#define preempt_schedule_notrace(x) preempt_schedule(x)
+#endif
+
+#ifdef HAVE_BIO_SET_DEV
+#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
+/*
+ * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
+ * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
+ * As a side effect the function was converted to GPL-only. Define our
+ * own version when needed which uses rcu_read_lock_sched().
+ */
+#if defined(HAVE_BLKG_TRYGET_GPL_ONLY)
+static inline bool
+vdev_blkg_tryget(struct blkcg_gq *blkg)
+{
+ struct percpu_ref *ref = &blkg->refcnt;
+ unsigned long __percpu *count;
+ bool rc;
+
+ rcu_read_lock_sched();
+
+ if (__ref_is_percpu(ref, &count)) {
+ this_cpu_inc(*count);
+ rc = true;
+ } else {
+#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
+ rc = atomic_long_inc_not_zero(&ref->data->count);
+#else
+ rc = atomic_long_inc_not_zero(&ref->count);
+#endif
+ }
+
+ rcu_read_unlock_sched();
+
+ return (rc);
+}
+#elif defined(HAVE_BLKG_TRYGET)
+#define vdev_blkg_tryget(bg) blkg_tryget(bg)
+#endif
+/*
+ * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
+ * GPL-only bio_associate_blkg() symbol thus inadvertently converting
+ * the entire macro. Provide a minimal version which always assigns the
+ * request queue's root_blkg to the bio.
+ */
+static inline void
+vdev_bio_associate_blkg(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+
+ ASSERT3P(q, !=, NULL);
+ ASSERT3P(bio->bi_blkg, ==, NULL);
+
+ if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
+ bio->bi_blkg = q->root_blkg;
+}
+#define bio_associate_blkg vdev_bio_associate_blkg
+#endif
+#else
+/*
+ * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
+ */
+static inline void
+bio_set_dev(struct bio *bio, struct block_device *bdev)
+{
+ bio->bi_bdev = bdev;
+}
+#endif /* HAVE_BIO_SET_DEV */
+
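+/*
+ * Submit the bio with current->bio_list temporarily cleared so the block
+ * layer dispatches it immediately instead of deferring it onto the calling
+ * task's bio list (see the recursion handling in generic_make_request()).
+ */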
+static inline void
+vdev_submit_bio(struct bio *bio)
+{
+ struct bio_list *bio_list = current->bio_list;
+ current->bio_list = NULL;
+ vdev_submit_bio_impl(bio);
+ current->bio_list = bio_list;
+}
+
+static int
+__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
+ size_t io_size, uint64_t io_offset, int rw, int flags)
+{
+ dio_request_t *dr;
+ uint64_t abd_offset;
+ uint64_t bio_offset;
+ int bio_size;
+ int bio_count = 16;
+ int error = 0;
+ struct blk_plug plug;
+
+ /*
+ * Accessing outside the block device is never allowed.
+ */
+ if (io_offset + io_size > bdev->bd_inode->i_size) {
+ vdev_dbgmsg(zio->io_vd,
+ "Illegal access %llu size %llu, device size %llu",
+ io_offset, io_size, i_size_read(bdev->bd_inode));
+ return (SET_ERROR(EIO));
+ }
+
+retry:
+ dr = vdev_disk_dio_alloc(bio_count);
+
+ if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+ bio_set_flags_failfast(bdev, &flags);
+
+ dr->dr_zio = zio;
+
+ /*
+	 * Since bios can have up to BIO_MAX_PAGES=256 iovecs, each of which
+	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
+	 * can cover at least 128KB and at most 1MB. When the required number
+	 * of iovecs exceeds this, we are forced to break the I/O into
+	 * multiple bios and wait for them all to complete. This is likely if the
+ * recordsize property is increased beyond 1MB. The default
+ * bio_count=16 should typically accommodate the maximum-size zio of
+ * 16MB.
+ */
+
+ abd_offset = 0;
+ bio_offset = io_offset;
+ bio_size = io_size;
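+	/*
+	 * Note the inclusive loop bound: the extra pass detects that
+	 * bio_count bios were not enough, in which case the dio is freed
+	 * and the request is retried above with a doubled bio_count.
+	 */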
+ for (int i = 0; i <= dr->dr_bio_count; i++) {
+
+ /* Finished constructing bio's for given buffer */
+ if (bio_size <= 0)
+ break;
+
+ /*
+ * If additional bio's are required, we have to retry, but
+ * this should be rare - see the comment above.
+ */
+ if (dr->dr_bio_count == i) {
+ vdev_disk_dio_free(dr);
+ bio_count *= 2;
+ goto retry;
+ }
+
+ /* bio_alloc() with __GFP_WAIT never returns NULL */
+ dr->dr_bio[i] = bio_alloc(GFP_NOIO,
+ MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
+ BIO_MAX_PAGES));
+ if (unlikely(dr->dr_bio[i] == NULL)) {
+ vdev_disk_dio_free(dr);
+ return (SET_ERROR(ENOMEM));
+ }
+
+ /* Matching put called by vdev_disk_physio_completion */
+ vdev_disk_dio_get(dr);
+
+ bio_set_dev(dr->dr_bio[i], bdev);
+ BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
+ dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+ dr->dr_bio[i]->bi_private = dr;
+ bio_set_op_attrs(dr->dr_bio[i], rw, flags);
+
+ /* Remaining size is returned to become the new size */
+ bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
+ bio_size, abd_offset);
+
+ /* Advance in buffer and construct another bio if needed */
+ abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
+ bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
+ }
+
+ /* Extra reference to protect dio_request during vdev_submit_bio */
+ vdev_disk_dio_get(dr);
+
+ if (dr->dr_bio_count > 1)
+ blk_start_plug(&plug);
+
+ /* Submit all bio's associated with this dio */
+ for (int i = 0; i < dr->dr_bio_count; i++) {
+ if (dr->dr_bio[i])
+ vdev_submit_bio(dr->dr_bio[i]);
+ }
+
+ if (dr->dr_bio_count > 1)
+ blk_finish_plug(&plug);
+
+ (void) vdev_disk_dio_put(dr);
+
+ return (error);
+}
+
+BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
+{
+ zio_t *zio = bio->bi_private;
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ zio->io_error = BIO_END_IO_ERROR(bio);
+#else
+ zio->io_error = -error;
+#endif
+
+ if (zio->io_error && (zio->io_error == EOPNOTSUPP))
+ zio->io_vd->vdev_nowritecache = B_TRUE;
+
+ bio_put(bio);
+ ASSERT3S(zio->io_error, >=, 0);
+ if (zio->io_error)
+ vdev_disk_error(zio);
+ zio_interrupt(zio);
+}
+
+static int
+vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
+{
+ struct request_queue *q;
+ struct bio *bio;
+
+ q = bdev_get_queue(bdev);
+ if (!q)
+ return (SET_ERROR(ENXIO));
+
+ bio = bio_alloc(GFP_NOIO, 0);
+ /* bio_alloc() with __GFP_WAIT never returns NULL */
+ if (unlikely(bio == NULL))
+ return (SET_ERROR(ENOMEM));
+
+ bio->bi_end_io = vdev_disk_io_flush_completion;
+ bio->bi_private = zio;
+ bio_set_dev(bio, bdev);
+ bio_set_flush(bio);
+ vdev_submit_bio(bio);
+ invalidate_bdev(bdev);
+
+ return (0);
+}
+
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+ vdev_t *v = zio->io_vd;
+ vdev_disk_t *vd = v->vdev_tsd;
+ unsigned long trim_flags = 0;
+ int rw, error;
+
+ /*
+ * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+ * Nothing to be done here but return failure.
+ */
+ if (vd == NULL) {
+ zio->io_error = ENXIO;
+ zio_interrupt(zio);
+ return;
+ }
+
+ rw_enter(&vd->vd_lock, RW_READER);
+
+ /*
+ * If the vdev is closed, it's likely due to a failed reopen and is
+ * in the UNAVAIL state. Nothing to be done here but return failure.
+ */
+ if (vd->vd_bdev == NULL) {
+ rw_exit(&vd->vd_lock);
+ zio->io_error = ENXIO;
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_type) {
+ case ZIO_TYPE_IOCTL:
+
+ if (!vdev_readable(v)) {
+ rw_exit(&vd->vd_lock);
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+
+ if (zfs_nocacheflush)
+ break;
+
+ if (v->vdev_nowritecache) {
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ error = vdev_disk_io_flush(vd->vd_bdev, zio);
+ if (error == 0) {
+ rw_exit(&vd->vd_lock);
+ return;
+ }
+
+ zio->io_error = error;
+
+ break;
+
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+
+ rw_exit(&vd->vd_lock);
+ zio_execute(zio);
+ return;
+ case ZIO_TYPE_WRITE:
+ rw = WRITE;
+ break;
+
+ case ZIO_TYPE_READ:
+ rw = READ;
+ break;
+
+ case ZIO_TYPE_TRIM:
+#if defined(BLKDEV_DISCARD_SECURE)
+ if (zio->io_trim_flags & ZIO_TRIM_SECURE)
+ trim_flags |= BLKDEV_DISCARD_SECURE;
+#endif
+ zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
+ zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
+ trim_flags);
+
+ rw_exit(&vd->vd_lock);
+ zio_interrupt(zio);
+ return;
+
+ default:
+ rw_exit(&vd->vd_lock);
+ zio->io_error = SET_ERROR(ENOTSUP);
+ zio_interrupt(zio);
+ return;
+ }
+
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+ error = __vdev_disk_physio(vd->vd_bdev, zio,
+ zio->io_size, zio->io_offset, rw, 0);
+ rw_exit(&vd->vd_lock);
+
+ if (error) {
+ zio->io_error = error;
+ zio_interrupt(zio);
+ return;
+ }
+}
+
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+ /*
+ * If the device returned EIO, we revalidate the media. If it is
+ * determined the media has changed this triggers the asynchronous
+ * removal of the device from the configuration.
+ */
+ if (zio->io_error == EIO) {
+ vdev_t *v = zio->io_vd;
+ vdev_disk_t *vd = v->vdev_tsd;
+
+ if (zfs_check_media_change(vd->vd_bdev)) {
+ invalidate_bdev(vd->vd_bdev);
+ v->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+ }
+ }
+}
+
+static void
+vdev_disk_hold(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ /* We must have a pathname, and it must be absolute. */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
+ return;
+
+ /*
+ * Only prefetch path and devid info if the device has
+ * never been opened.
+ */
+ if (vd->vdev_tsd != NULL)
+ return;
+
+}
+
+static void
+vdev_disk_rele(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ /* XXX: Implement me as a vnode rele for the device */
+}
+
+vdev_ops_t vdev_disk_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_disk_open,
+ .vdev_op_close = vdev_disk_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_disk_io_start,
+ .vdev_op_io_done = vdev_disk_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_disk_hold,
+ .vdev_op_rele = vdev_disk_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+/*
+ * The zfs_vdev_scheduler module option has been deprecated. Setting this
+ * value no longer has any effect. It has not yet been entirely removed
+ * to allow the module to be loaded if this option is specified in the
+ * /etc/modprobe.d/zfs.conf file. The following message will be logged.
+ */
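+/*
+ * For example, a (hypothetical) line such as
+ * "options zfs zfs_vdev_scheduler=none" in /etc/modprobe.d/zfs.conf is
+ * still accepted, but only results in the message below being printed.
+ */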
+static int
+param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
+{
+ int error = param_set_charp(val, kp);
+ if (error == 0) {
+ printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
+ "is not supported.\n");
+ }
+
+ return (error);
+}
+
+char *zfs_vdev_scheduler = "unused";
+module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
+ param_get_charp, &zfs_vdev_scheduler, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
+
+int
+param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
+{
+ uint64_t val;
+ int error;
+
+ error = kstrtoull(buf, 0, &val);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
+ return (SET_ERROR(-EINVAL));
+
+ error = param_set_ulong(buf, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ return (0);
+}
+
+int
+param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
+{
+ uint64_t val;
+ int error;
+
+ error = kstrtoull(buf, 0, &val);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
+ return (SET_ERROR(-EINVAL));
+
+ error = param_set_ulong(buf, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
new file mode 100644
index 000000000000..bf8a13ae6154
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
@@ -0,0 +1,382 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/zfs_file.h>
+#ifdef _KERNEL
+#include <linux/falloc.h>
+#endif
+/*
+ * Virtual device vector for files.
+ */
+
+static taskq_t *vdev_file_taskq;
+
+/*
+ * By default, the logical/physical ashift for file vdevs is set to
+ * SPA_MINBLOCKSHIFT (9). This allows all file vdevs to use 512B (1 << 9)
+ * blocksizes. Users may opt to change one or both of these for testing
+ * or performance reasons. Care should be taken as these values will
+ * impact the vdev_ashift setting which can only be set at vdev creation
+ * time.
+ */
+unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
+unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
+
+static void
+vdev_file_hold(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static mode_t
+vdev_file_open_mode(spa_mode_t spa_mode)
+{
+ mode_t mode = 0;
+
+ if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) {
+ mode = O_RDWR;
+ } else if (spa_mode & SPA_MODE_READ) {
+ mode = O_RDONLY;
+ } else if (spa_mode & SPA_MODE_WRITE) {
+ mode = O_WRONLY;
+ }
+
+ return (mode | O_LARGEFILE);
+}
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_file_t *vf;
+ zfs_file_t *fp;
+ zfs_file_attr_t zfa;
+ int error;
+
+ /*
+ * Rotational optimizations only make sense on block devices.
+ */
+ vd->vdev_nonrot = B_TRUE;
+
+ /*
+ * Allow TRIM on file based vdevs. This may not always be supported,
+ * since it depends on your kernel version and underlying filesystem
+	 * type, but it is always safe to attempt.
+ */
+ vd->vdev_has_trim = B_TRUE;
+
+ /*
+ * Disable secure TRIM on file based vdevs. There is no way to
+ * request this behavior from the underlying filesystem.
+ */
+ vd->vdev_has_securetrim = B_FALSE;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (vd->vdev_tsd != NULL) {
+ ASSERT(vd->vdev_reopening);
+ vf = vd->vdev_tsd;
+ goto skip_open;
+ }
+
+ vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+ /*
+ * We always open the files from the root of the global zone, even if
+ * we're in a local zone. If the user has gotten to this point, the
+ * administrator has already decided that the pool should be available
+ * to local zone users, so the underlying devices should be as well.
+ */
+ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+
+ error = zfs_file_open(vd->vdev_path,
+ vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ vf->vf_file = fp;
+
+#ifdef _KERNEL
+ /*
+ * Make sure it's a regular file.
+ */
+ if (zfs_file_getattr(fp, &zfa)) {
+ return (SET_ERROR(ENODEV));
+ }
+ if (!S_ISREG(zfa.zfa_mode)) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (SET_ERROR(ENODEV));
+ }
+#endif
+
+skip_open:
+
+ error = zfs_file_getattr(vf->vf_file, &zfa);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ *max_psize = *psize = zfa.zfa_size;
+ *logical_ashift = vdev_file_logical_ashift;
+ *physical_ashift = vdev_file_physical_ashift;
+
+ return (0);
+}
+
+static void
+vdev_file_close(vdev_t *vd)
+{
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (vd->vdev_reopening || vf == NULL)
+ return;
+
+ if (vf->vf_file != NULL) {
+ (void) zfs_file_close(vf->vf_file);
+ }
+
+ vd->vdev_delayed_close = B_FALSE;
+ kmem_free(vf, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+}
+
+static void
+vdev_file_io_strategy(void *arg)
+{
+ zio_t *zio = (zio_t *)arg;
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+ ssize_t resid;
+ void *buf;
+ loff_t off;
+ ssize_t size;
+ int err;
+
+ off = zio->io_offset;
+ size = zio->io_size;
+ resid = 0;
+
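+	/*
+	 * Reads borrow a scratch buffer and copy the data back into the abd
+	 * when it is returned; writes borrow a copy of the abd contents and
+	 * return it without copying back.
+	 */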
+ if (zio->io_type == ZIO_TYPE_READ) {
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+ err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
+ abd_return_buf_copy(zio->io_abd, buf, size);
+ } else {
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
+ abd_return_buf(zio->io_abd, buf, size);
+ }
+ zio->io_error = err;
+ if (resid != 0 && zio->io_error == 0)
+ zio->io_error = SET_ERROR(ENOSPC);
+
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_file_io_fsync(void *arg)
+{
+ zio_t *zio = (zio_t *)arg;
+ vdev_file_t *vf = zio->io_vd->vdev_tsd;
+
+ zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
+
+ zio_interrupt(zio);
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+
+ if (zfs_nocacheflush)
+ break;
+
+ /*
+ * We cannot safely call vfs_fsync() when PF_FSTRANS
+ * is set in the current context. Filesystems like
+ * XFS include sanity checks to verify it is not
+ * already set, see xfs_vm_writepage(). Therefore
+ * the sync must be dispatched to a different context.
+ */
+ if (__spl_pf_fstrans_check()) {
+ VERIFY3U(taskq_dispatch(vdev_file_taskq,
+ vdev_file_io_fsync, zio, TQ_SLEEP), !=,
+ TASKQID_INVALID);
+ return;
+ }
+
+ zio->io_error = zfs_file_fsync(vf->vf_file,
+ O_SYNC | O_DSYNC);
+ break;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+
+ zio_execute(zio);
+ return;
+ } else if (zio->io_type == ZIO_TYPE_TRIM) {
+ int mode = 0;
+
+ ASSERT3U(zio->io_size, !=, 0);
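+		/*
+		 * TRIM is implemented by punching a hole in the backing
+		 * file (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE on Linux).
+		 */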
+#ifdef __linux__
+ mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+#endif
+ zio->io_error = zfs_file_fallocate(vf->vf_file,
+ mode, zio->io_offset, zio->io_size);
+ zio_execute(zio);
+ return;
+ }
+
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+
+ VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+ TQ_SLEEP), !=, TASKQID_INVALID);
+}
+
+/* ARGSUSED */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_file_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+void
+vdev_file_init(void)
+{
+ vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16),
+ minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC);
+
+ VERIFY(vdev_file_taskq);
+}
+
+void
+vdev_file_fini(void)
+{
+ taskq_destroy(vdev_file_taskq);
+}
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+#endif
+
+ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW,
+ "Logical ashift for file-based devices");
+ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW,
+ "Physical ashift for file-based devices");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
new file mode 100644
index 000000000000..2628325c0ba9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
@@ -0,0 +1,2932 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/sid.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/fs/zfs.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/trace_acl.h>
+#include <sys/zpl.h>
+
+#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
+#define DENY ACE_ACCESS_DENIED_ACE_TYPE
+#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
+#define MIN_ACE_TYPE ALLOW
+
+#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+
+#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
+ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
+ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
+ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
+
+#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
+ ACE_DELETE|ACE_DELETE_CHILD)
+#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
+
+#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
+
+#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
+ ZFS_ACL_PROTECTED)
+
+#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
+ ZFS_ACL_OBJ_ACE)
+
+#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
+
+#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U
+
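+/*
+ * Accessor vtables for the two on-disk ACE layouts: zfs_acl_v0_ops
+ * operates on the original fixed-size zfs_oldace_t entries
+ * (ZFS_ACL_VERSION_INITIAL), while zfs_acl_fuid_ops below operates on
+ * the FUID-aware, variable-size zfs_ace_t/zfs_object_ace_t entries.
+ * Generic ACL code goes through aclp->z_ops so it never needs to know
+ * which layout it is walking.
+ */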
+static uint16_t
+zfs_ace_v0_get_type(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_v0_get_flags(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_v0_get_mask(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_v0_get_who(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_v0_set_type(void *acep, uint16_t type)
+{
+ ((zfs_oldace_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_v0_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_oldace_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_v0_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_oldace_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_v0_set_who(void *acep, uint64_t who)
+{
+ ((zfs_oldace_t *)acep)->z_fuid = who;
+}
+
+/*ARGSUSED*/
+static size_t
+zfs_ace_v0_size(void *acep)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static size_t
+zfs_ace_v0_abstract_size(void)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static int
+zfs_ace_v0_mask_off(void)
+{
+ return (offsetof(zfs_oldace_t, z_access_mask));
+}
+
+/*ARGSUSED*/
+static int
+zfs_ace_v0_data(void *acep, void **datap)
+{
+ *datap = NULL;
+ return (0);
+}
+
+static acl_ops_t zfs_acl_v0_ops = {
+ .ace_mask_get = zfs_ace_v0_get_mask,
+ .ace_mask_set = zfs_ace_v0_set_mask,
+ .ace_flags_get = zfs_ace_v0_get_flags,
+ .ace_flags_set = zfs_ace_v0_set_flags,
+ .ace_type_get = zfs_ace_v0_get_type,
+ .ace_type_set = zfs_ace_v0_set_type,
+ .ace_who_get = zfs_ace_v0_get_who,
+ .ace_who_set = zfs_ace_v0_set_who,
+ .ace_size = zfs_ace_v0_size,
+ .ace_abstract_size = zfs_ace_v0_abstract_size,
+ .ace_mask_off = zfs_ace_v0_mask_off,
+ .ace_data = zfs_ace_v0_data
+};
+
+static uint16_t
+zfs_ace_fuid_get_type(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_fuid_get_flags(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_fuid_get_mask(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_access_mask);
+}
+
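+/*
+ * owner@, group@, and everyone@ entries are abstract and carry no FUID,
+ * so report -1 for them; all other entries return the stored FUID.
+ */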
+static uint64_t
+zfs_ace_fuid_get_who(void *args)
+{
+ uint16_t entry_type;
+ zfs_ace_t *acep = args;
+
+ entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (-1);
+ return (((zfs_ace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_fuid_set_type(void *acep, uint16_t type)
+{
+ ((zfs_ace_hdr_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_ace_hdr_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_fuid_set_who(void *arg, uint64_t who)
+{
+ zfs_ace_t *acep = arg;
+
+ uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return;
+ acep->z_fuid = who;
+}
+
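+/*
+ * FUID-style ACEs are variable length: object ACEs carry extra GUID
+ * data (zfs_object_ace_t), abstract owner@/group@/everyone@ ALLOW and
+ * DENY entries need only the header (zfs_ace_hdr_t), and everything
+ * else is a regular zfs_ace_t with an embedded FUID.
+ */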
+static size_t
+zfs_ace_fuid_size(void *acep)
+{
+ zfs_ace_hdr_t *zacep = acep;
+ uint16_t entry_type;
+
+ switch (zacep->z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ return (sizeof (zfs_object_ace_t));
+ case ALLOW:
+ case DENY:
+ entry_type =
+ (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
+ if (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (sizeof (zfs_ace_hdr_t));
+ /*FALLTHROUGH*/
+ default:
+ return (sizeof (zfs_ace_t));
+ }
+}
+
+static size_t
+zfs_ace_fuid_abstract_size(void)
+{
+ return (sizeof (zfs_ace_hdr_t));
+}
+
+static int
+zfs_ace_fuid_mask_off(void)
+{
+ return (offsetof(zfs_ace_hdr_t, z_access_mask));
+}
+
+static int
+zfs_ace_fuid_data(void *acep, void **datap)
+{
+ zfs_ace_t *zacep = acep;
+ zfs_object_ace_t *zobjp;
+
+ switch (zacep->z_hdr.z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjp = acep;
+ *datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
+ return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
+ default:
+ *datap = NULL;
+ return (0);
+ }
+}
+
+static acl_ops_t zfs_acl_fuid_ops = {
+ .ace_mask_get = zfs_ace_fuid_get_mask,
+ .ace_mask_set = zfs_ace_fuid_set_mask,
+ .ace_flags_get = zfs_ace_fuid_get_flags,
+ .ace_flags_set = zfs_ace_fuid_set_flags,
+ .ace_type_get = zfs_ace_fuid_get_type,
+ .ace_type_set = zfs_ace_fuid_set_type,
+ .ace_who_get = zfs_ace_fuid_get_who,
+ .ace_who_set = zfs_ace_fuid_set_who,
+ .ace_size = zfs_ace_fuid_size,
+ .ace_abstract_size = zfs_ace_fuid_abstract_size,
+ .ace_mask_off = zfs_ace_fuid_mask_off,
+ .ace_data = zfs_ace_fuid_data
+};
+
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL versions in order to determine if the file used to have
+ * an external ACL and what version of ACL previously existed on the
+ * file. It would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+ int error;
+
+ if (zp->z_is_sa)
+ return (0);
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+ * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_extern_obj);
+ else {
+ /*
+ * After upgrade the SA_ZPL_ZNODE_ACL should have been
+ * removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (0);
+ }
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+ zfs_acl_phys_t *aclphys)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t acl_count;
+ int size;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_is_sa) {
+ if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+ &size)) != 0)
+ return (error);
+ *aclsize = size;
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+ &acl_count, sizeof (acl_count))) != 0)
+ return (error);
+ *aclcount = acl_count;
+ } else {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ aclphys, sizeof (*aclphys))) != 0)
+ return (error);
+
+ if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+ *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+ *aclcount = aclphys->z_acl_size;
+ } else {
+ *aclsize = aclphys->z_acl_size;
+ *aclcount = aclphys->z_acl_count;
+ }
+ }
+ return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+
+ if (zp->z_is_sa)
+ return (ZFS_ACL_VERSION_FUID);
+ else {
+ int error;
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+ * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_version);
+ else {
+ /*
+ * After upgrade SA_ZPL_ZNODE_ACL should have
+ * been removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (ZFS_ACL_VERSION_FUID);
+ }
+ }
+}
+
+static int
+zfs_acl_version(int version)
+{
+ if (version < ZPL_VERSION_FUID)
+ return (ZFS_ACL_VERSION_INITIAL);
+ else
+ return (ZFS_ACL_VERSION_FUID);
+}
+
+static int
+zfs_acl_version_zp(znode_t *zp)
+{
+ return (zfs_acl_version(ZTOZSB(zp)->z_version));
+}
+
+zfs_acl_t *
+zfs_acl_alloc(int vers)
+{
+ zfs_acl_t *aclp;
+
+ aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+ list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
+ offsetof(zfs_acl_node_t, z_next));
+ aclp->z_version = vers;
+ if (vers == ZFS_ACL_VERSION_FUID)
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ else
+ aclp->z_ops = &zfs_acl_v0_ops;
+ return (aclp);
+}
+
+zfs_acl_node_t *
+zfs_acl_node_alloc(size_t bytes)
+{
+ zfs_acl_node_t *aclnode;
+
+ aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
+ if (bytes) {
+ aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
+ aclnode->z_allocdata = aclnode->z_acldata;
+ aclnode->z_allocsize = bytes;
+ aclnode->z_size = bytes;
+ }
+
+ return (aclnode);
+}
+
+static void
+zfs_acl_node_free(zfs_acl_node_t *aclnode)
+{
+ if (aclnode->z_allocsize)
+ kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
+ kmem_free(aclnode, sizeof (zfs_acl_node_t));
+}
+
+static void
+zfs_acl_release_nodes(zfs_acl_t *aclp)
+{
+ zfs_acl_node_t *aclnode;
+
+ while ((aclnode = list_head(&aclp->z_acl))) {
+ list_remove(&aclp->z_acl, aclnode);
+ zfs_acl_node_free(aclnode);
+ }
+ aclp->z_acl_count = 0;
+ aclp->z_acl_bytes = 0;
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+ zfs_acl_release_nodes(aclp);
+ list_destroy(&aclp->z_acl);
+ kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static boolean_t
+zfs_acl_valid_ace_type(uint_t type, uint_t flags)
+{
+ uint16_t entry_type;
+
+ switch (type) {
+ case ALLOW:
+ case DENY:
+ case ACE_SYSTEM_AUDIT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_ACE_TYPE:
+ entry_type = flags & ACE_TYPE_FLAGS;
+ return (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE || entry_type == 0 ||
+ entry_type == ACE_IDENTIFIER_GROUP);
+ default:
+ if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
+{
+ /*
+ * first check type of entry
+ */
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ return (B_FALSE);
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (aclp->z_version < ZFS_ACL_VERSION_FUID)
+ return (B_FALSE);
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ }
+
+ /*
+ * next check inheritance level flags
+ */
+
+ if (S_ISDIR(obj_mode) &&
+ (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ if ((iflags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void *
+zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
+ uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
+{
+ zfs_acl_node_t *aclnode;
+
+ ASSERT(aclp);
+
+ if (start == NULL) {
+ aclnode = list_head(&aclp->z_acl);
+ if (aclnode == NULL)
+ return (NULL);
+
+ aclp->z_next_ace = aclnode->z_acldata;
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ }
+
+ aclnode = aclp->z_curr_node;
+
+ if (aclnode == NULL)
+ return (NULL);
+
+ if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
+ aclnode = list_next(&aclp->z_acl, aclnode);
+ if (aclnode == NULL)
+ return (NULL);
+ else {
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ aclp->z_next_ace = aclnode->z_acldata;
+ }
+ }
+
+ if (aclnode->z_ace_idx < aclnode->z_ace_count) {
+ void *acep = aclp->z_next_ace;
+ size_t ace_size;
+
+ /*
+ * Make sure we don't overstep our bounds
+ */
+ ace_size = aclp->z_ops->ace_size(acep);
+
+ if (((caddr_t)acep + ace_size) >
+ ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
+ return (NULL);
+ }
+
+ *iflags = aclp->z_ops->ace_flags_get(acep);
+ *type = aclp->z_ops->ace_type_get(acep);
+ *access_mask = aclp->z_ops->ace_mask_get(acep);
+ *who = aclp->z_ops->ace_who_get(acep);
+ aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
+ aclnode->z_ace_idx++;
+
+ return ((void *)acep);
+ }
+ return (NULL);
+}
+
+/*ARGSUSED*/
+static uint64_t
+zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
+ uint16_t *flags, uint16_t *type, uint32_t *mask)
+{
+ zfs_acl_t *aclp = datap;
+ zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
+ uint64_t who;
+
+ acep = zfs_acl_next_ace(aclp, acep, &who, mask,
+ flags, type);
+ return ((uint64_t)(uintptr_t)acep);
+}
+
+/*
+ * Copy ACEs to internal ZFS format.
+ * While processing the ACL, each ACE will be validated for correctness.
+ * ACE FUIDs will be created later.
+ */
+static int
+zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp,
+ void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
+ zfs_fuid_info_t **fuidp, cred_t *cr)
+{
+ int i;
+ uint16_t entry_type;
+ zfs_ace_t *aceptr = z_acl;
+ ace_t *acep = datap;
+ zfs_object_ace_t *zobjacep;
+ ace_object_t *aceobjp;
+
+ for (i = 0; i != aclcnt; i++) {
+ aceptr->z_hdr.z_access_mask = acep->a_access_mask;
+ aceptr->z_hdr.z_flags = acep->a_flags;
+ aceptr->z_hdr.z_type = acep->a_type;
+ entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE) {
+ aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
+ cr, (entry_type == 0) ?
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
+ }
+
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type,
+ aceptr->z_hdr.z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+
+ switch (acep->a_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjacep = (zfs_object_ace_t *)aceptr;
+ aceobjp = (ace_object_t *)acep;
+
+ bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
+ sizeof (aceobjp->a_obj_type));
+ bcopy(aceobjp->a_inherit_obj_type,
+ zobjacep->z_inherit_type,
+ sizeof (aceobjp->a_inherit_obj_type));
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
+ break;
+ default:
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
+ }
+
+ aceptr = (zfs_ace_t *)((caddr_t)aceptr +
+ aclp->z_ops->ace_size(aceptr));
+ }
+
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+
+ return (0);
+}
+
+/*
+ * Copy ZFS ACEs to fixed size ace_t layout
+ */
+static void
+zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
+ void *datap, int filter)
+{
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, type;
+ zfs_ace_hdr_t *zacep = NULL;
+ ace_t *acep = datap;
+ ace_object_t *objacep;
+ zfs_object_ace_t *zobjacep;
+ size_t ace_size;
+ uint16_t entry_type;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (filter) {
+ continue;
+ }
+ zobjacep = (zfs_object_ace_t *)zacep;
+ objacep = (ace_object_t *)acep;
+ bcopy(zobjacep->z_object_type,
+ objacep->a_obj_type,
+ sizeof (zobjacep->z_object_type));
+ bcopy(zobjacep->z_inherit_type,
+ objacep->a_inherit_obj_type,
+ sizeof (zobjacep->z_inherit_type));
+ ace_size = sizeof (ace_object_t);
+ break;
+ default:
+ ace_size = sizeof (ace_t);
+ break;
+ }
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ if ((entry_type != ACE_OWNER &&
+ entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE)) {
+ acep->a_who = zfs_fuid_map_id(zfsvfs, who,
+ cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
+ ZFS_ACE_GROUP : ZFS_ACE_USER);
+ } else {
+ acep->a_who = (uid_t)(int64_t)who;
+ }
+ acep->a_access_mask = access_mask;
+ acep->a_flags = iflags;
+ acep->a_type = type;
+ acep = (ace_t *)((caddr_t)acep + ace_size);
+ }
+}
+
+static int
+zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep,
+ zfs_oldace_t *z_acl, int aclcnt, size_t *size)
+{
+ int i;
+ zfs_oldace_t *aceptr = z_acl;
+
+ for (i = 0; i != aclcnt; i++, aceptr++) {
+ aceptr->z_access_mask = acep[i].a_access_mask;
+ aceptr->z_type = acep[i].a_type;
+ aceptr->z_flags = acep[i].a_flags;
+ aceptr->z_fuid = acep[i].a_who;
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type,
+ aceptr->z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+ }
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+ return (0);
+}
+
+/*
+ * convert old ACL format to new
+ */
+void
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
+{
+ zfs_oldace_t *oldaclp;
+ int i;
+ uint16_t type, iflags;
+ uint32_t access_mask;
+ uint64_t who;
+ void *cookie = NULL;
+ zfs_acl_node_t *newaclnode;
+
+ ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
+ /*
+ * First copy the ACEs into a contiguous piece of memory
+ * for zfs_copy_ace_2_fuid().
+ *
+ * We only convert an ACL once, so this won't happen
+ * every time.
+ */
+ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
+ KM_SLEEP);
+ i = 0;
+ while ((cookie = zfs_acl_next_ace(aclp, cookie, &who,
+ &access_mask, &iflags, &type))) {
+ oldaclp[i].z_flags = iflags;
+ oldaclp[i].z_type = type;
+ oldaclp[i].z_fuid = who;
+ oldaclp[i++].z_access_mask = access_mask;
+ }
+
+ newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
+ sizeof (zfs_object_ace_t));
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ VERIFY(zfs_copy_ace_2_fuid(ZTOZSB(zp), ZTOI(zp)->i_mode,
+ aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
+ &newaclnode->z_size, NULL, cr) == 0);
+ newaclnode->z_ace_count = aclp->z_acl_count;
+ aclp->z_version = ZFS_ACL_VERSION;
+ kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
+
+ /*
+ * Release all previous ACL nodes
+ */
+
+ zfs_acl_release_nodes(aclp);
+
+ list_insert_head(&aclp->z_acl, newaclnode);
+
+ aclp->z_acl_bytes = newaclnode->z_size;
+ aclp->z_acl_count = newaclnode->z_ace_count;
+
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ if (access_mask & S_IXOTH)
+ new_mask |= ACE_EXECUTE;
+ if (access_mask & S_IWOTH)
+ new_mask |= ACE_WRITE_DATA;
+ if (access_mask & S_IROTH)
+ new_mask |= ACE_READ_DATA;
+ return (new_mask);
+}
+
+static void
+zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
+ uint16_t access_type, uint64_t fuid, uint16_t entry_type)
+{
+ uint16_t type = entry_type & ACE_TYPE_FLAGS;
+
+ aclp->z_ops->ace_mask_set(acep, access_mask);
+ aclp->z_ops->ace_type_set(acep, access_type);
+ aclp->z_ops->ace_flags_set(acep, entry_type);
+ if ((type != ACE_OWNER && type != OWNING_GROUP &&
+ type != ACE_EVERYONE))
+ aclp->z_ops->ace_who_set(acep, fuid);
+}
+
+/*
+ * Determine mode of file based on ACL.
+ */
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+ uint64_t *pflags, uint64_t fuid, uint64_t fgid)
+{
+ int entry_type;
+ mode_t mode;
+ mode_t seen = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ uint64_t who;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ boolean_t an_exec_denied = B_FALSE;
+
+ mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who,
+ &access_mask, &iflags, &type))) {
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ /*
+ * Skip over any inherit_only ACEs
+ */
+ if (iflags & ACE_INHERIT_ONLY_ACE)
+ continue;
+
+ if (entry_type == ACE_OWNER || (entry_type == 0 &&
+ who == fuid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRUSR))) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWUSR))) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXUSR))) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ } else if (entry_type == OWNING_GROUP ||
+ (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRGRP))) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWGRP))) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXGRP))) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ } else if (entry_type == ACE_EVERYONE) {
+ if ((access_mask & ACE_READ_DATA)) {
+ if (!(seen & S_IRUSR)) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if (!(seen & S_IRGRP)) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if (!(seen & S_IROTH)) {
+ seen |= S_IROTH;
+ if (type == ALLOW) {
+ mode |= S_IROTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA)) {
+ if (!(seen & S_IWUSR)) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if (!(seen & S_IWGRP)) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if (!(seen & S_IWOTH)) {
+ seen |= S_IWOTH;
+ if (type == ALLOW) {
+ mode |= S_IWOTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_EXECUTE)) {
+ if (!(seen & S_IXUSR)) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ if (!(seen & S_IXGRP)) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ if (!(seen & S_IXOTH)) {
+ seen |= S_IXOTH;
+ if (type == ALLOW) {
+ mode |= S_IXOTH;
+ }
+ }
+ }
+ } else {
+ /*
+ * We only care if this IDENTIFIER_GROUP or USER ACE
+ * denies execute access to someone; the mode is not
+ * affected.
+ */
+ if ((access_mask & ACE_EXECUTE) && type == DENY)
+ an_exec_denied = B_TRUE;
+ }
+ }
+
+ /*
+ * Failure to allow is effectively a deny, so execute permission
+ * is denied if it was never mentioned or if we explicitly
+ * weren't allowed it.
+ */
+ if (!an_exec_denied &&
+ ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
+ (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
+ an_exec_denied = B_TRUE;
+
+ if (an_exec_denied)
+ *pflags &= ~ZFS_NO_EXECS_DENIED;
+ else
+ *pflags |= ZFS_NO_EXECS_DENIED;
+
+ return (mode);
+}
+
+/*
+ * Read an external ACL object. If the intent is to modify, always
+ * create a new ACL and leave any cached ACL in place.
+ */
+int
+zfs_acl_node_read(struct znode *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+ boolean_t will_modify)
+{
+ zfs_acl_t *aclp;
+ int aclsize = 0;
+ int acl_count = 0;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_phys_t znode_acl;
+ int version;
+ int error;
+ boolean_t drop_lock = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if (zp->z_acl_cached && !will_modify) {
+ *aclpp = zp->z_acl_cached;
+ return (0);
+ }
+
+ /*
+ * Close the race where the znode could be upgraded while we
+ * are trying to read its attributes.
+ *
+ * This can only happen if the file isn't already an SA
+ * znode.
+ */
+ if (!zp->z_is_sa && !have_lock) {
+ mutex_enter(&zp->z_lock);
+ drop_lock = B_TRUE;
+ }
+ version = zfs_znode_acl_version(zp);
+
+ if ((error = zfs_acl_znode_info(zp, &aclsize,
+ &acl_count, &znode_acl)) != 0) {
+ goto done;
+ }
+
+ aclp = zfs_acl_alloc(version);
+
+ aclp->z_acl_count = acl_count;
+ aclp->z_acl_bytes = aclsize;
+
+ aclnode = zfs_acl_node_alloc(aclsize);
+ aclnode->z_ace_count = aclp->z_acl_count;
+ aclnode->z_size = aclsize;
+
+ if (!zp->z_is_sa) {
+ if (znode_acl.z_acl_extern_obj) {
+ error = dmu_read(ZTOZSB(zp)->z_os,
+ znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+ aclnode->z_acldata, DMU_READ_PREFETCH);
+ } else {
+ bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+ aclnode->z_size);
+ }
+ } else {
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(ZTOZSB(zp)),
+ aclnode->z_acldata, aclnode->z_size);
+ }
+
+ if (error != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ goto done;
+ }
+
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ *aclpp = aclp;
+ if (!will_modify)
+ zp->z_acl_cached = aclp;
+done:
+ if (drop_lock)
+ mutex_exit(&zp->z_lock);
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+ boolean_t start, void *userdata)
+{
+ zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+ if (start) {
+ cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+ } else {
+ cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+ cb->cb_acl_node);
+ }
+ *dataptr = cb->cb_acl_node->z_acldata;
+ *length = cb->cb_acl_node->z_size;
+}
+
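+/*
+ * Recompute the cached mode from the ACL after an ownership change so
+ * that owner@/group@ entries are evaluated against the new uid/gid.
+ * POSIX-ACL datasets are skipped, and a missing ACL SA (ENOENT) is not
+ * treated as an error so chown/chgrp can still succeed.
+ */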
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+ int error;
+ zfs_acl_t *aclp;
+
+ if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIX)
+ return (0);
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+ if (error == 0 && aclp->z_acl_count > 0)
+ zp->z_mode = ZTOI(zp)->i_mode =
+ zfs_mode_compute(zp->z_mode, aclp,
+ &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid),
+ KGID_TO_SGID(ZTOI(zp)->i_gid));
+
+ /*
+ * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL
+ * nor a DACL_ACES SA in which case ENOENT is returned from
+ * zfs_acl_node_read() when the SA can't be located.
+ * Allow chown/chgrp to succeed in these cases rather than
+ * returning an error that makes no sense in the context of
+ * the caller.
+ */
+ if (error == ENOENT)
+ return (0);
+
+ return (error);
+}
+
+typedef struct trivial_acl {
+ uint32_t allow0; /* allow mask for bits only in owner */
+ uint32_t deny1; /* deny mask for bits not in owner */
+ uint32_t deny2; /* deny mask for bits not in group */
+ uint32_t owner; /* allow mask matching mode */
+ uint32_t group; /* allow mask matching mode */
+ uint32_t everyone; /* allow mask matching mode */
+} trivial_acl_t;
+
+static void
+acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
+{
+ uint32_t read_mask = ACE_READ_DATA;
+ uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA;
+ uint32_t execute_mask = ACE_EXECUTE;
+
+ if (isdir)
+ write_mask |= ACE_DELETE_CHILD;
+
+ masks->deny1 = 0;
+
+ if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+ masks->deny1 |= read_mask;
+ if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+ masks->deny1 |= write_mask;
+ if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+ masks->deny1 |= execute_mask;
+
+ masks->deny2 = 0;
+ if (!(mode & S_IRGRP) && (mode & S_IROTH))
+ masks->deny2 |= read_mask;
+ if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+ masks->deny2 |= write_mask;
+ if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+ masks->deny2 |= execute_mask;
+
+ masks->allow0 = 0;
+ if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+ masks->allow0 |= read_mask;
+ if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+ masks->allow0 |= write_mask;
+ if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+ masks->allow0 |= execute_mask;
+
+ masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+ ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+ if (mode & S_IRUSR)
+ masks->owner |= read_mask;
+ if (mode & S_IWUSR)
+ masks->owner |= write_mask;
+ if (mode & S_IXUSR)
+ masks->owner |= execute_mask;
+
+ masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IRGRP)
+ masks->group |= read_mask;
+ if (mode & S_IWGRP)
+ masks->group |= write_mask;
+ if (mode & S_IXGRP)
+ masks->group |= execute_mask;
+
+ masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IROTH)
+ masks->everyone |= read_mask;
+ if (mode & S_IWOTH)
+ masks->everyone |= write_mask;
+ if (mode & S_IXOTH)
+ masks->everyone |= execute_mask;
+}
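+
+/*
+ * For example, a regular file with mode 0604 (rw----r--) yields
+ * allow0 = ACE_READ_DATA (explicitly allow the owner read ahead of the
+ * group deny, in case the owner is also a member of the owning group),
+ * deny1 = 0, deny2 = ACE_READ_DATA (the owning group lacks a read bit
+ * that everyone@ is granted), and owner/group/everyone allow masks
+ * that mirror the rw-, ---, and r-- mode bits on top of the baseline
+ * read_acl/read_attributes/read_named_attrs/synchronize bits (owner@
+ * additionally gets write_acl/write_owner/write_attributes/
+ * write_named_attrs).
+ */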
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t ACL is trivial
+ *
+ * Trivialness implies that the ACL is composed of only
+ * owner@, group@, and everyone@ entries, that read_acl is
+ * never denied, and that write_owner/write_acl/write_attributes
+ * may only appear on the owner@ entry.
+ */
+static int
+ace_trivial_common(void *acep, int aclcnt,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt,
+ uint16_t *, uint16_t *, uint32_t *))
+{
+ uint16_t flags;
+ uint32_t mask;
+ uint16_t type;
+ uint64_t cookie = 0;
+
+ while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) {
+ switch (flags & ACE_TYPE_FLAGS) {
+ case ACE_OWNER:
+ case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+ case ACE_EVERYONE:
+ break;
+ default:
+ return (1);
+ }
+
+ if (flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+ ACE_INHERIT_ONLY_ACE))
+ return (1);
+
+ /*
+ * Special checks for certain bits:
+ *
+ * Don't allow anybody to deny reading basic
+ * attributes or a file's ACL.
+ */
+ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ (type == ACE_ACCESS_DENIED_ACE_TYPE))
+ return (1);
+
+ /*
+ * Delete permission is never set by default
+ */
+ if (mask & ACE_DELETE)
+ return (1);
+
+ /*
+ * Child delete permission should be accompanied by write
+ */
+ if ((mask & ACE_DELETE_CHILD) && !(mask & ACE_WRITE_DATA))
+ return (1);
+
+ /*
+ * Only allow owner@ to have
+ * write_acl/write_owner/write_attributes/write_xattr.
+ */
+ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ (!(flags & ACE_OWNER) && (mask &
+ (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
+ ACE_WRITE_NAMED_ATTRS))))
+ return (1);
+
+ }
+
+ return (0);
+}
+
+/*
+ * Common code for setting ACLs.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ * It stores the ACL (in either SA or legacy znode_acl_phys_t form) along with
+ * the recomputed mode, pflags, and ctime in the given transaction.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
+{
+ int error;
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_object_type_t otype;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t mode;
+ sa_bulk_attr_t bulk[5];
+ uint64_t ctime[2];
+ int count = 0;
+ zfs_acl_phys_t acl_phys;
+
+ mode = zp->z_mode;
+
+ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+ KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid));
+
+ zp->z_mode = ZTOI(zp)->i_mode = mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ /*
+ * Upgrade needed?
+ */
+ if (!zfsvfs->z_use_fuids) {
+ otype = DMU_OT_OLDACL;
+ } else {
+ if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
+ (zfsvfs->z_version >= ZPL_VERSION_FUID))
+ zfs_acl_xform(zp, aclp, cr);
+ ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
+ otype = DMU_OT_ACL;
+ }
+
+ /*
+ * Arrgh, we have to handle the old on-disk format
+ * as well as the newer (preferred) SA format.
+ */
+
+ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+ locate.cb_aclp = aclp;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+ NULL, &aclp->z_acl_count, sizeof (uint64_t));
+ } else { /* Painful legacy way */
+ zfs_acl_node_t *aclnode;
+ uint64_t off = 0;
+ uint64_t aoid;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ &acl_phys, sizeof (acl_phys))) != 0)
+ return (error);
+
+ aoid = acl_phys.z_acl_extern_obj;
+
+ if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ /*
+ * If ACL was previously external and we are now
+ * converting to new ACL format then release old
+ * ACL object and create a new one.
+ */
+ if (aoid &&
+ aclp->z_version != acl_phys.z_acl_version) {
+ error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+ if (error)
+ return (error);
+ aoid = 0;
+ }
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ otype, aclp->z_acl_bytes,
+ otype == DMU_OT_ACL ?
+ DMU_OT_SYSACL : DMU_OT_NONE,
+ otype == DMU_OT_ACL ?
+ DN_OLD_MAX_BONUSLEN : 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os,
+ aoid, aclp->z_acl_bytes, 0, tx);
+ }
+ acl_phys.z_acl_extern_obj = aoid;
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ dmu_write(zfsvfs->z_os, aoid, off,
+ aclnode->z_size, aclnode->z_acldata, tx);
+ off += aclnode->z_size;
+ }
+ } else {
+ void *start = acl_phys.z_ace_data;
+ /*
+ * Migrating back embedded?
+ */
+ if (acl_phys.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ acl_phys.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ acl_phys.z_acl_extern_obj = 0;
+ }
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ }
+ /*
+ * If old version then swap count/bytes to match the old
+ * layout of znode_acl_phys_t.
+ */
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ acl_phys.z_acl_size = aclp->z_acl_count;
+ acl_phys.z_acl_count = aclp->z_acl_bytes;
+ } else {
+ acl_phys.z_acl_size = aclp->z_acl_bytes;
+ acl_phys.z_acl_count = aclp->z_acl_count;
+ }
+ acl_phys.z_acl_version = aclp->z_version;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (acl_phys));
+ }
+
+ /*
+ * Replace ACL wide bits, but first clear them.
+ */
+ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
+
+ zp->z_pflags |= aclp->z_hints;
+
+ if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
+ zp->z_pflags |= ZFS_ACL_TRIVIAL;
+
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
+ return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
+}
+
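+/*
+ * Rebuild an ACL to reflect a new mode.  The canonical allow0/deny1/
+ * deny2 entries computed by acl_trivial_access_masks() are emitted
+ * first (when nonzero), the existing ACEs are then copied over
+ * (optionally splitting inheritable owner@/group@/everyone@ entries
+ * into inherit-only copies and trimming ALLOW masks to the group
+ * permissions), and the final owner@/group@/everyone@ ALLOW entries
+ * matching the mode are appended.
+ */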
+static void
+zfs_acl_chmod(boolean_t isdir, uint64_t mode, boolean_t split, boolean_t trim,
+ zfs_acl_t *aclp)
+{
+ void *acep = NULL;
+ uint64_t who;
+ int new_count, new_bytes;
+ int ace_size;
+ int entry_type;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ zfs_acl_node_t *newnode;
+ size_t abstract_size = aclp->z_ops->ace_abstract_size();
+ void *zacep;
+ trivial_acl_t masks;
+
+ new_count = new_bytes = 0;
+
+ acl_trivial_access_masks((mode_t)mode, isdir, &masks);
+
+ newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+ zacep = newnode->z_acldata;
+ if (masks.allow0) {
+ zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny1) {
+ zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny2) {
+ zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ /*
+ * ACEs used to represent the file mode may be divided
+ * into an equivalent pair of inherit-only and regular
+ * ACEs, if they are inheritable.
+ * Skip regular ACEs, which are replaced by the new mode.
+ */
+ if (split && (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)) {
+ if (!isdir || !(iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ continue;
+ /*
+ * We preserve owner@, group@, or everyone@
+ * permissions, if they are inheritable, by
+ * copying them to inherit_only ACEs. This
+ * prevents inheritable permissions from being
+ * altered along with the file mode.
+ */
+ iflags |= ACE_INHERIT_ONLY_ACE;
+ }
+
+ /*
+ * If this ACL has any inheritable ACEs, mark that in
+ * the hints (which are later masked into the pflags)
+ * so create knows to do inheritance.
+ */
+ if (isdir && (iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if ((type != ALLOW && type != DENY) ||
+ (iflags & ACE_INHERIT_ONLY_ACE)) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ break;
+ }
+ } else {
+ /*
+ * Limit permissions to be no greater than
+ * group permissions.
+ * The "aclinherit" and "aclmode" properties
+ * affect policy for create and chmod(2),
+ * respectively.
+ */
+ if ((type == ALLOW) && trim)
+ access_mask &= masks.group;
+ }
+ zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+ ace_size = aclp->z_ops->ace_size(acep);
+ zacep = (void *)((uintptr_t)zacep + ace_size);
+ new_count++;
+ new_bytes += ace_size;
+ }
+ zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE);
+
+ new_count += 3;
+ new_bytes += abstract_size * 3;
+ zfs_acl_release_nodes(aclp);
+ aclp->z_acl_count = new_count;
+ aclp->z_acl_bytes = new_bytes;
+ newnode->z_ace_count = new_count;
+ newnode->z_size = new_bytes;
+ list_insert_tail(&aclp->z_acl, newnode);
+}
+
+int
+zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
+{
+ int error = 0;
+
+ mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+ if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_DISCARD)
+ *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+ else
+ error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+
+ if (error == 0) {
+ (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+ zfs_acl_chmod(S_ISDIR(ZTOI(zp)->i_mode), mode, B_TRUE,
+ (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
+ }
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Should ACE be inherited?
+ */
+static int
+zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags)
+{
+ int iflags = (acep_flags & 0xf);
+
+ if (S_ISDIR(obj_mode) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+ return (1);
+ else if (iflags & ACE_FILE_INHERIT_ACE)
+ return (!(S_ISDIR(obj_mode) &&
+ (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
+ return (0);
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t va_mode, zfs_acl_t *paclp,
+ uint64_t mode, boolean_t *need_chmod)
+{
+ void *pacep = NULL;
+ void *acep;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_t *aclp = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, newflags, type;
+ size_t ace_size;
+ void *data1, *data2;
+ size_t data1sz, data2sz;
+ uint_t aclinherit;
+ boolean_t isdir = S_ISDIR(va_mode);
+ boolean_t isreg = S_ISREG(va_mode);
+
+ *need_chmod = B_TRUE;
+
+ aclp = zfs_acl_alloc(paclp->z_version);
+ aclinherit = zfsvfs->z_acl_inherit;
+ if (aclinherit == ZFS_ACL_DISCARD || S_ISLNK(va_mode))
+ return (aclp);
+
+ while ((pacep = zfs_acl_next_ace(paclp, pacep, &who,
+ &access_mask, &iflags, &type))) {
+
+ /*
+ * don't inherit bogus ACEs
+ */
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ /*
+ * Check if ACE is inheritable by this vnode
+ */
+ if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) ||
+ !zfs_ace_can_use(va_mode, iflags))
+ continue;
+
+ /*
+ * If owner@, group@, or everyone@ ACEs are inheritable
+ * then zfs_acl_chmod() isn't needed.
+ */
+ if ((aclinherit == ZFS_ACL_PASSTHROUGH ||
+ aclinherit == ZFS_ACL_PASSTHROUGH_X) &&
+ ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
+ ((iflags & OWNING_GROUP) == OWNING_GROUP)) &&
+ (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE))))
+ *need_chmod = B_FALSE;
+
+ /*
+ * Strip inherited execute permission from file if
+ * not in mode
+ */
+ if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW &&
+ !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) {
+ access_mask &= ~ACE_EXECUTE;
+ }
+
+ /*
+ * Strip write_acl and write_owner from permissions
+ * when inheriting an ACE
+ */
+ if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) {
+ access_mask &= ~RESTRICTED_CLEAR;
+ }
+
+ ace_size = aclp->z_ops->ace_size(pacep);
+ aclnode = zfs_acl_node_alloc(ace_size);
+ list_insert_tail(&aclp->z_acl, aclnode);
+ acep = aclnode->z_acldata;
+
+ zfs_set_ace(aclp, acep, access_mask, type,
+ who, iflags|ACE_INHERITED_ACE);
+
+ /*
+ * Copy special opaque data if any
+ */
+ if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) {
+ VERIFY((data2sz = aclp->z_ops->ace_data(acep,
+ &data2)) == data1sz);
+ bcopy(data1, data2, data2sz);
+ }
+
+ aclp->z_acl_count++;
+ aclnode->z_ace_count++;
+ aclp->z_acl_bytes += aclnode->z_size;
+ newflags = aclp->z_ops->ace_flags_get(acep);
+
+ /*
+ * If ACE is not to be inherited further, or if the vnode is
+ * not a directory, remove all inheritance flags
+ */
+ if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ newflags &= ~ALL_INHERIT;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ continue;
+ }
+
+ /*
+ * This directory has an inheritable ACE
+ */
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ /*
+ * If only FILE_INHERIT is set then turn on
+ * inherit_only
+ */
+ if ((iflags & (ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
+ newflags |= ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ } else {
+ newflags &= ~ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ }
+ }
+ if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+ aclp->z_acl_count != 0) {
+ *need_chmod = B_FALSE;
+ }
+
+ return (aclp);
+}
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ * Also, create FUIDs for owner and group.
+ */
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+ vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
+{
+ int error;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zfs_acl_t *paclp;
+ gid_t gid = vap->va_gid;
+ boolean_t need_chmod = B_TRUE;
+ boolean_t trim = B_FALSE;
+ boolean_t inherited = B_FALSE;
+
+ bzero(acl_ids, sizeof (zfs_acl_ids_t));
+ acl_ids->z_mode = vap->va_mode;
+
+ if (vsecp)
+ if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_mode, vsecp,
+ cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+ return (error);
+
+ acl_ids->z_fuid = vap->va_uid;
+ acl_ids->z_fgid = vap->va_gid;
+#ifdef HAVE_KSID
+ /*
+ * Determine uid and gid.
+ */
+ if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
+ ((flag & IS_XATTR) && (S_ISDIR(vap->va_mode)))) {
+ acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid,
+ cr, ZFS_OWNER, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ } else {
+ acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+ cr, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = 0;
+ if (vap->va_mask & AT_GID) {
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ if (acl_ids->z_fgid != KGID_TO_SGID(ZTOI(dzp)->i_gid) &&
+ !groupmember(vap->va_gid, cr) &&
+ secpolicy_vnode_create_gid(cr) != 0)
+ acl_ids->z_fgid = 0;
+ }
+ if (acl_ids->z_fgid == 0) {
+ if (dzp->z_mode & S_ISGID) {
+ char *domain;
+ uint32_t rid;
+
+ acl_ids->z_fgid = KGID_TO_SGID(
+ ZTOI(dzp)->i_gid);
+ gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
+ cr, ZFS_GROUP);
+
+ if (zfsvfs->z_use_fuids &&
+ IS_EPHEMERAL(acl_ids->z_fgid)) {
+ domain = zfs_fuid_idx_domain(
+ &zfsvfs->z_fuid_idx,
+ FUID_INDEX(acl_ids->z_fgid));
+ rid = FUID_RID(acl_ids->z_fgid);
+ zfs_fuid_node_add(&acl_ids->z_fuidp,
+ domain, rid,
+ FUID_INDEX(acl_ids->z_fgid),
+ acl_ids->z_fgid, ZFS_GROUP);
+ }
+ } else {
+ acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
+ ZFS_GROUP, cr, &acl_ids->z_fuidp);
+ gid = crgetgid(cr);
+ }
+ }
+ }
+#endif /* HAVE_KSID */
+
+ /*
+ * If we're creating a directory, and the parent directory has the
+ * set-GID bit set, set it on the new directory.
+ * Otherwise, if the user is neither privileged nor a member of the
+ * file's new group, clear the file's set-GID bit.
+ */
+
+ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
+ (S_ISDIR(vap->va_mode))) {
+ acl_ids->z_mode |= S_ISGID;
+ } else {
+ if ((acl_ids->z_mode & S_ISGID) &&
+ secpolicy_vnode_setids_setgids(cr, gid) != 0)
+ acl_ids->z_mode &= ~S_ISGID;
+ }
+
+ if (acl_ids->z_aclp == NULL) {
+ mutex_enter(&dzp->z_acl_lock);
+ mutex_enter(&dzp->z_lock);
+ if (!(flag & IS_ROOT_NODE) &&
+ (dzp->z_pflags & ZFS_INHERIT_ACE) &&
+ !(dzp->z_pflags & ZFS_XATTR)) {
+ VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
+ &paclp, B_FALSE));
+ acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
+ vap->va_mode, paclp, acl_ids->z_mode, &need_chmod);
+ inherited = B_TRUE;
+ } else {
+ acl_ids->z_aclp =
+ zfs_acl_alloc(zfs_acl_version_zp(dzp));
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+ mutex_exit(&dzp->z_lock);
+ mutex_exit(&dzp->z_acl_lock);
+
+ if (need_chmod) {
+ if (S_ISDIR(vap->va_mode))
+ acl_ids->z_aclp->z_hints |=
+ ZFS_ACL_AUTO_INHERIT;
+
+ if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
+ trim = B_TRUE;
+ zfs_acl_chmod(vap->va_mode, acl_ids->z_mode, B_FALSE,
+ trim, acl_ids->z_aclp);
+ }
+ }
+
+ if (inherited || vsecp) {
+ acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+ acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+
+ return (0);
+}
+
+/*
+ * Free ACL and fuid_infop, but not the acl_ids structure
+ */
+void
+zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
+{
+ if (acl_ids->z_aclp)
+ zfs_acl_free(acl_ids->z_aclp);
+ if (acl_ids->z_fuidp)
+ zfs_fuid_info_free(acl_ids->z_fuidp);
+ acl_ids->z_aclp = NULL;
+ acl_ids->z_fuidp = NULL;
+}
+
+boolean_t
+zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid)
+{
+ return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) ||
+ zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) ||
+ (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid)));
+}
+
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ ulong_t mask;
+ int error;
+ int count = 0;
+ int largeace = 0;
+
+ mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
+ VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
+
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ /*
+ * Scan ACL to determine number of ACEs
+ */
+ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
+ void *zacep = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t type, iflags;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ largeace++;
+ continue;
+ default:
+ count++;
+ }
+ }
+ vsecp->vsa_aclcnt = count;
+ } else
+ count = (int)aclp->z_acl_count;
+
+ if (mask & VSA_ACECNT) {
+ vsecp->vsa_aclcnt = count;
+ }
+
+ if (mask & VSA_ACE) {
+ size_t aclsz;
+
+ aclsz = count * sizeof (ace_t) +
+ sizeof (ace_object_t) * largeace;
+
+ vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
+ vsecp->vsa_aclentsz = aclsz;
+
+ if (aclp->z_version == ZFS_ACL_VERSION_FUID)
+ zfs_copy_fuid_2_ace(ZTOZSB(zp), aclp, cr,
+ vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
+ else {
+ zfs_acl_node_t *aclnode;
+ void *start = vsecp->vsa_aclentp;
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
+ aclp->z_acl_bytes);
+ }
+ }
+ if (mask & VSA_ACE_ACLFLAGS) {
+ vsecp->vsa_aclflags = 0;
+ if (zp->z_pflags & ZFS_ACL_DEFAULTED)
+ vsecp->vsa_aclflags |= ACL_DEFAULTED;
+ if (zp->z_pflags & ZFS_ACL_PROTECTED)
+ vsecp->vsa_aclflags |= ACL_PROTECTED;
+ if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
+ vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ return (0);
+}
+
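+/*
+ * Convert a caller-supplied vsecattr_t (an array of ace_t entries plus
+ * optional ACL-wide flags) into an internal zfs_acl_t of the version
+ * appropriate for this filesystem, creating FUID mappings as needed.
+ */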
+int
+zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode,
+ vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
+{
+ zfs_acl_t *aclp;
+ zfs_acl_node_t *aclnode;
+ int aclcnt = vsecp->vsa_aclcnt;
+ int error;
+
+ if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
+ return (SET_ERROR(EINVAL));
+
+ aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
+
+ aclp->z_hints = 0;
+ aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp,
+ (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
+ aclcnt, &aclnode->z_size)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ } else {
+ if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp,
+ vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
+ &aclnode->z_size, fuidp, cr)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ }
+ aclp->z_acl_bytes = aclnode->z_size;
+ aclnode->z_ace_count = aclcnt;
+ aclp->z_acl_count = aclcnt;
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ /*
+ * If flags are being set then add them to z_hints
+ */
+ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
+ if (vsecp->vsa_aclflags & ACL_PROTECTED)
+ aclp->z_hints |= ZFS_ACL_PROTECTED;
+ if (vsecp->vsa_aclflags & ACL_DEFAULTED)
+ aclp->z_hints |= ZFS_ACL_DEFAULTED;
+ if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
+ aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+ }
+
+ *zaclp = aclp;
+
+ return (0);
+}
+
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zilog_t *zilog = zfsvfs->z_log;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_t *aclp;
+ zfs_fuid_info_t *fuidp = NULL;
+ boolean_t fuid_dirtied;
+ uint64_t acl_obj;
+
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ return (SET_ERROR(EPERM));
+
+ if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp,
+ &aclp);
+ if (error)
+ return (error);
+
+ /*
+ * If ACL wide flags aren't being set then preserve any
+ * existing flags.
+ */
+ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
+ aclp->z_hints |=
+ (zp->z_pflags & V4_ACL_WIDE_FLAGS);
+ }
+top:
+ mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ /*
+ * If old version and ACL won't fit in bonus and we aren't
+ * upgrading then take out necessary DMU holds
+ */
+
+ if ((acl_obj = zfs_external_acl(zp)) != 0) {
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+ }
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (error) {
+ mutex_exit(&zp->z_acl_lock);
+ mutex_exit(&zp->z_lock);
+
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zfs_acl_free(aclp);
+ return (error);
+ }
+
+ error = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT(error == 0);
+ ASSERT(zp->z_acl_cached == NULL);
+ zp->z_acl_cached = aclp;
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
+
+ if (fuidp)
+ zfs_fuid_info_free(fuidp);
+ dmu_tx_commit(tx);
+
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Check accesses of interest (AoI) against attributes of the dataset
+ * such as read-only. Returns zero if no AoI conflict with dataset
+ * attributes, otherwise an appropriate errno is returned.
+ */
+static int
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
+{
+ if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) &&
+ (!Z_ISDEV(ZTOI(zp)->i_mode) ||
+ (Z_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * Only check for READONLY on non-directories.
+ */
+ if ((v4_mode & WRITE_MASK_DATA) &&
+ ((!S_ISDIR(ZTOI(zp)->i_mode) &&
+ (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
+ (S_ISDIR(ZTOI(zp)->i_mode) &&
+ (zp->z_pflags & ZFS_IMMUTABLE)))) {
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
+ (zp->z_pflags & ZFS_NOUNLINK)) {
+ return (SET_ERROR(EPERM));
+ }
+
+ if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+ return (SET_ERROR(EACCES));
+ }
+
+ return (0);
+}
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied. The AoI are expressed as bits in
+ * the working_mode parameter. As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode. This removal
+ * facilitates two things. The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI. The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE. Removing the
+ * discovered access or denial enforces this rule. At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses. Returns:
+ * 0 if all AoI granted
+ * EACCES if the denied mask is non-zero
+ * other error if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted. If an ACE grants any access in
+ * the working_mode, we immediately short circuit out of the function.
+ * This mode is chosen by setting anyaccess to B_TRUE. The
+ * working_mode is not a denied access mask upon exit if the function
+ * is used in this manner.
+ */
+static int
+zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
+ boolean_t anyaccess, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zfs_acl_t *aclp;
+ int error;
+ uid_t uid = crgetuid(cr);
+ uint64_t who;
+ uint16_t type, iflags;
+ uint16_t entry_type;
+ uint32_t access_mask;
+ uint32_t deny_mask = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ boolean_t checkit;
+ uid_t gowner;
+ uid_t fowner;
+
+ zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ ASSERT(zp->z_acl_cached);
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ uint32_t mask_matched;
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ if (S_ISDIR(ZTOI(zp)->i_mode) &&
+ (iflags & ACE_INHERIT_ONLY_ACE))
+ continue;
+
+ /* Skip ACE if it does not affect any AoI */
+ mask_matched = (access_mask & *working_mode);
+ if (!mask_matched)
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ checkit = B_FALSE;
+
+ switch (entry_type) {
+ case ACE_OWNER:
+ if (uid == fowner)
+ checkit = B_TRUE;
+ break;
+ case OWNING_GROUP:
+ who = gowner;
+ /*FALLTHROUGH*/
+ case ACE_IDENTIFIER_GROUP:
+ checkit = zfs_groupmember(zfsvfs, who, cr);
+ break;
+ case ACE_EVERYONE:
+ checkit = B_TRUE;
+ break;
+
+ /* USER Entry */
+ default:
+ if (entry_type == 0) {
+ uid_t newid;
+
+ newid = zfs_fuid_map_id(zfsvfs, who, cr,
+ ZFS_ACE_USER);
+ if (newid != IDMAP_WK_CREATOR_OWNER_UID &&
+ uid == newid)
+ checkit = B_TRUE;
+ break;
+ } else {
+ mutex_exit(&zp->z_acl_lock);
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ if (checkit) {
+ if (type == DENY) {
+ DTRACE_PROBE3(zfs__ace__denies,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ deny_mask |= mask_matched;
+ } else {
+ DTRACE_PROBE3(zfs__ace__allows,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ if (anyaccess) {
+ mutex_exit(&zp->z_acl_lock);
+ return (0);
+ }
+ }
+ *working_mode &= ~mask_matched;
+ }
+
+ /* Are we done? */
+ if (*working_mode == 0)
+ break;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ /* Put the found 'denies' back on the working mode */
+ if (deny_mask) {
+ *working_mode |= deny_mask;
+ return (SET_ERROR(EACCES));
+ } else if (*working_mode) {
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * Return true if any access whatsoever is granted; we don't actually
+ * care what access is granted.
+ */
+boolean_t
+zfs_has_access(znode_t *zp, cred_t *cr)
+{
+ uint32_t have = ACE_ALL_PERMS;
+
+ if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
+ uid_t owner;
+
+ owner = zfs_fuid_map_id(ZTOZSB(zp),
+ KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER);
+ return (secpolicy_vnode_any_access(cr, ZTOI(zp), owner) == 0);
+ }
+ return (B_TRUE);
+}
+
+static int
+zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
+ boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int err;
+
+ *working_mode = v4_mode;
+ *check_privs = B_TRUE;
+
+ /*
+ * Short circuit empty requests
+ */
+ if (v4_mode == 0 || zfsvfs->z_replay) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
+ *check_privs = B_FALSE;
+ return (err);
+ }
+
+ /*
+ * The caller requested that the ACL check be skipped. This
+ * would only happen if the caller checked VOP_ACCESS() with a
+ * 32 bit ACE mask and already had the appropriate permissions.
+ */
+ if (skipaclchk) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
+}
+
+static int
+zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
+ cred_t *cr)
+{
+ if (*working_mode != ACE_WRITE_DATA)
+ return (SET_ERROR(EACCES));
+
+ return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
+ check_privs, B_FALSE, cr));
+}
+
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+ boolean_t owner = B_FALSE;
+ boolean_t groupmbr = B_FALSE;
+ boolean_t is_attr;
+ uid_t uid = crgetuid(cr);
+ int error;
+
+ if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+ return (SET_ERROR(EACCES));
+
+ is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
+ (S_ISDIR(ZTOI(zdp)->i_mode)));
+ if (is_attr)
+ goto slow;
+
+
+ mutex_enter(&zdp->z_acl_lock);
+
+ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ }
+
+ if (KUID_TO_SUID(ZTOI(zdp)->i_uid) != 0 ||
+ KGID_TO_SGID(ZTOI(zdp)->i_gid) != 0) {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+
+ if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) {
+ owner = B_TRUE;
+ if (zdp->z_mode & S_IXUSR) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ } else {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+ }
+ if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) {
+ groupmbr = B_TRUE;
+ if (zdp->z_mode & S_IXGRP) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ } else {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+ }
+ if (!owner && !groupmbr) {
+ if (zdp->z_mode & S_IXOTH) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ }
+ }
+
+ mutex_exit(&zdp->z_acl_lock);
+
+slow:
+ DTRACE_PROBE(zfs__fastpath__execute__access__miss);
+ ZFS_ENTER(ZTOZSB(zdp));
+ error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
+ ZFS_EXIT(ZTOZSB(zdp));
+ return (error);
+}
+
+/*
+ * Determine whether access should be granted/denied.
+ *
+ * The least priv subsystem is always consulted as a basic privilege
+ * can define any form of access.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+{
+ uint32_t working_mode;
+ int error;
+ int is_attr;
+ boolean_t check_privs;
+ znode_t *xzp;
+ znode_t *check_zp = zp;
+ mode_t needed_bits;
+ uid_t owner;
+
+ is_attr = ((zp->z_pflags & ZFS_XATTR) && S_ISDIR(ZTOI(zp)->i_mode));
+
+ /*
+ * If attribute then validate against base file
+ */
+ if (is_attr) {
+ if ((error = zfs_zget(ZTOZSB(zp),
+ zp->z_xattr_parent, &xzp)) != 0) {
+ return (error);
+ }
+
+ check_zp = xzp;
+
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+
+ owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid),
+ cr, ZFS_OWNER);
+ /*
+ * Map the bits required to the standard inode flags
+ * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits
+ * mapped by working_mode (currently missing) in missing_bits.
+ * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
+ * needed_bits.
+ */
+ needed_bits = 0;
+
+ working_mode = mode;
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ owner == crgetuid(cr))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= S_IRUSR;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= S_IWUSR;
+ if (working_mode & ACE_EXECUTE)
+ needed_bits |= S_IXUSR;
+
+ if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
+ &check_privs, skipaclchk, cr)) == 0) {
+ if (is_attr)
+ zrele(xzp);
+ return (secpolicy_vnode_access2(cr, ZTOI(zp), owner,
+ needed_bits, needed_bits));
+ }
+
+ if (error && !check_privs) {
+ if (is_attr)
+ zrele(xzp);
+ return (error);
+ }
+
+ if (error && (flags & V_APPEND)) {
+ error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
+ }
+
+ if (error && check_privs) {
+ mode_t checkmode = 0;
+
+ /*
+ * First check for implicit owner permission on
+ * read_acl/read_attributes
+ */
+
+ error = 0;
+ ASSERT(working_mode != 0);
+
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
+ owner == crgetuid(cr)))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= S_IRUSR;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= S_IWUSR;
+ if (working_mode & ACE_EXECUTE)
+ checkmode |= S_IXUSR;
+
+ error = secpolicy_vnode_access2(cr, ZTOI(check_zp), owner,
+ needed_bits & ~checkmode, needed_bits);
+
+ if (error == 0 && (working_mode & ACE_WRITE_OWNER))
+ error = secpolicy_vnode_chown(cr, owner);
+ if (error == 0 && (working_mode & ACE_WRITE_ACL))
+ error = secpolicy_vnode_setdac(cr, owner);
+
+ if (error == 0 && (working_mode &
+ (ACE_DELETE|ACE_DELETE_CHILD)))
+ error = secpolicy_vnode_remove(cr);
+
+ if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
+ error = secpolicy_vnode_chown(cr, owner);
+ }
+ if (error == 0) {
+ /*
+ * See if any bits other than those already checked
+ * for are still present. If so then return EACCES
+ */
+ if (working_mode & ~(ZFS_CHECKED_MASKS)) {
+ error = SET_ERROR(EACCES);
+ }
+ }
+ } else if (error == 0) {
+ error = secpolicy_vnode_access2(cr, ZTOI(zp), owner,
+ needed_bits, needed_bits);
+ }
+
+ if (is_attr)
+ zrele(xzp);
+
+ return (error);
+}
+
+/*
+ * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into
+ * NFSv4-style ZFS ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+{
+ return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+}
+
+/*
+ * Access function for secpolicy_vnode_setattr
+ */
+int
+zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+{
+ int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+ return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+}
+
+/* See zfs_zaccess_delete() */
+int zfs_write_implies_delete_child = 1;
+
+/*
+ * Determine whether delete access should be granted.
+ *
+ * The following chart outlines how we handle delete permissions which is
+ * how recent versions of Windows (e.g. Windows 2008) handle it. The efficiency
+ * comes from not having to check the parent ACL where the object itself grants
+ * delete:
+ *
+ * -------------------------------------------------------
+ * | Parent Dir | Target Object Permissions |
+ * | permissions | |
+ * -------------------------------------------------------
+ * | | ACL Allows | ACL Denies| Delete |
+ * | | Delete | Delete | unspecified|
+ * -------------------------------------------------------
+ * | ACL Allows | Permit | Deny * | Permit |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL Denies | Permit | Deny | Deny |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL specifies | | | |
+ * | only allow | Permit | Deny * | Permit |
+ * | write and | | | |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * | ACL denies | | | |
+ * | write and | Permit | Deny | Deny |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * ^
+ * |
+ * Re. execute permission on the directory: if that's missing,
+ * the vnode lookup of the target will fail before we get here.
+ *
+ * Re [*] in the table above: NFSv4 would normally Permit delete for
+ * these two cells of the matrix.
+ * See acl.h for notes on which ACE_... flags should be checked for which
+ * operations. Specifically, the NFSv4 committee recommendation is in
+ * conflict with the Windows interpretation of DENY ACEs, where DENY ACEs
+ * should take precedence over ALLOW ACEs.
+ *
+ * This implementation always consults the target object's ACL first.
+ * If a DENY ACE is present on the target object that specifies ACE_DELETE,
+ * delete access is denied. If an ALLOW ACE with ACE_DELETE is present on
+ * the target object, access is allowed. If and only if no entries with
+ * ACE_DELETE are present in the object's ACL, check the container's ACL
+ * for entries with ACE_DELETE_CHILD.
+ *
+ * A summary of the logic implemented from the table above is as follows:
+ *
+ * First check for DENY ACEs that apply.
+ * If either target or container has a deny, EACCES.
+ *
+ * Delete access can then be summarized as follows:
+ * 1: The object to be deleted grants ACE_DELETE, or
+ * 2: The containing directory grants ACE_DELETE_CHILD.
+ * In a Windows system, that would be the end of the story.
+ * In this system, (2) has some complications...
+ * 2a: "sticky" bit on a directory adds restrictions, and
+ * 2b: existing ACEs from previous versions of ZFS may
+ * not carry ACE_DELETE_CHILD where they should, so we
+ * also allow delete when ACE_WRITE_DATA is granted.
+ *
+ * Note: 2b is technically a work-around for a prior bug,
+ * which hopefully can go away some day. For those who
+ * no longer need the work around, and for testing, this
+ * work-around is made conditional via the tunable:
+ * zfs_write_implies_delete_child
+ */
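+/*
+ * Worked example of the table above (illustration only): an explicit DENY
+ * of ACE_DELETE on the target object results in EACCES even when the
+ * parent directory grants ACE_DELETE_CHILD. If the target's ACL says
+ * nothing about delete, the decision falls through to the parent's
+ * ACE_DELETE_CHILD (or ACE_WRITE_DATA when zfs_write_implies_delete_child
+ * is set), subject to the sticky bit check at the end of the function.
+ */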
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+ uint32_t wanted_dirperms;
+ uint32_t dzp_working_mode = 0;
+ uint32_t zp_working_mode = 0;
+ int dzp_error, zp_error;
+ boolean_t dzpcheck_privs;
+ boolean_t zpcheck_privs;
+
+ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+ return (SET_ERROR(EPERM));
+
+ /*
+ * Case 1:
+ * If target object grants ACE_DELETE then we are done. This is
+ * indicated by a return value of 0. For this case we don't worry
+ * about the sticky bit because sticky only applies to the parent
+ * directory and this is the child access result.
+ *
+ * If we encounter a DENY ACE here, we're also done (EACCES).
+ * Note that if we hit a DENY ACE here (on the target) it should
+ * take precedence over a DENY ACE on the container, so that when
+ * we have more complete auditing support we will be able to
+ * report an access failure against the specific target.
+ * (This is part of why we're checking the target first.)
+ */
+ zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
+ &zpcheck_privs, B_FALSE, cr);
+ if (zp_error == EACCES) {
+ /* We hit a DENY ACE. */
+ if (!zpcheck_privs)
+ return (SET_ERROR(zp_error));
+ return (secpolicy_vnode_remove(cr));
+
+ }
+ if (zp_error == 0)
+ return (0);
+
+ /*
+ * Case 2:
+ * If the containing directory grants ACE_DELETE_CHILD,
+ * or we're in backward compatibility mode and the
+ * containing directory has ACE_WRITE_DATA, allow.
+ * Case 2b is handled with wanted_dirperms.
+ */
+ wanted_dirperms = ACE_DELETE_CHILD;
+ if (zfs_write_implies_delete_child)
+ wanted_dirperms |= ACE_WRITE_DATA;
+ dzp_error = zfs_zaccess_common(dzp, wanted_dirperms,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
+ if (dzp_error == EACCES) {
+ /* We hit a DENY ACE. */
+ if (!dzpcheck_privs)
+ return (SET_ERROR(dzp_error));
+ return (secpolicy_vnode_remove(cr));
+ }
+
+ /*
+ * Cases 2a, 2b (continued)
+ *
+ * Note: dzp_working_mode now contains any permissions
+ * that were NOT granted. Therefore, if any of the
+ * wanted_dirperms WERE granted, we will have:
+ * dzp_working_mode != wanted_dirperms
+ * We're really asking if ANY of those permissions
+ * were granted, and if so, grant delete access.
+ */
+ if (dzp_working_mode != wanted_dirperms)
+ dzp_error = 0;
+
+ /*
+ * dzp_error is 0 if the container granted us permissions to "modify".
+ * If we do not have permission via one or more ACEs, our current
+ * privileges may still permit us to modify the container.
+ *
+	 * dzpcheck_privs is false when, for example, the FS is read-only.
+ * Otherwise, do privilege checks for the container.
+ */
+ if (dzp_error != 0 && dzpcheck_privs) {
+ uid_t owner;
+
+ /*
+ * The secpolicy call needs the requested access and
+ * the current access mode of the container, but it
+ * only knows about Unix-style modes (VEXEC, VWRITE),
+ * so this must condense the fine-grained ACE bits into
+ * Unix modes.
+ *
+ * The VEXEC flag is easy, because we know that has
+ * always been checked before we get here (during the
+ * lookup of the target vnode). The container has not
+ * granted us permissions to "modify", so we do not set
+ * the VWRITE flag in the current access mode.
+ */
+ owner = zfs_fuid_map_id(ZTOZSB(dzp),
+ KUID_TO_SUID(ZTOI(dzp)->i_uid), cr, ZFS_OWNER);
+ dzp_error = secpolicy_vnode_access2(cr, ZTOI(dzp),
+ owner, S_IXUSR, S_IWUSR|S_IXUSR);
+ }
+ if (dzp_error != 0) {
+ /*
+ * Note: We may have dzp_error = -1 here (from
+		 * zfs_zaccess_common). Don't return that.
+ */
+ return (SET_ERROR(EACCES));
+ }
+
+
+ /*
+ * At this point, we know that the directory permissions allow
+ * us to modify, but we still need to check for the additional
+ * restrictions that apply when the "sticky bit" is set.
+ *
+ * Yes, zfs_sticky_remove_access() also checks this bit, but
+ * checking it here and skipping the call below is nice when
+ * you're watching all of this with dtrace.
+ */
+ if ((dzp->z_mode & S_ISVTX) == 0)
+ return (0);
+
+ /*
+ * zfs_sticky_remove_access will succeed if:
+ * 1. The sticky bit is absent.
+ * 2. We pass the sticky bit restrictions.
+ * 3. We have privileges that always allow file removal.
+ */
+ return (zfs_sticky_remove_access(dzp, zp, cr));
+}
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+ znode_t *tzp, cred_t *cr)
+{
+ int add_perm;
+ int error;
+
+ if (szp->z_pflags & ZFS_AV_QUARANTINED)
+ return (SET_ERROR(EACCES));
+
+ add_perm = S_ISDIR(ZTOI(szp)->i_mode) ?
+ ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+ /*
+	 * Rename permissions are a combination of delete permission +
+ * add file/subdir permission.
+ */
+
+ /*
+ * first make sure we do the delete portion.
+ *
+ * If that succeeds then check for add_file/add_subdir permissions
+ */
+
+ if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
+ return (error);
+
+ /*
+	 * If we have a tzp, see if we can delete it.
+ */
+ if (tzp) {
+ if ((error = zfs_zaccess_delete(tdzp, tzp, cr)))
+ return (error);
+ }
+
+ /*
+ * Now check for add permissions
+ */
+ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+ return (error);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
new file mode 100644
index 000000000000..a1668e46e4f9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -0,0 +1,1260 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ *
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ * Rohan Puri <rohan.puri15@gmail.com>
+ * Brian Behlendorf <behlendorf1@llnl.gov>
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright (c) 2018 George Melikov. All Rights Reserved.
+ * Copyright (c) 2019 Datto, Inc. All rights reserved.
+ * Copyright (c) 2020 The MathWorks, Inc. All rights reserved.
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' and 'shares' directory, but this may
+ * expand in the future. The elements are built dynamically, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ * ctldir ------> snapshotdir -------> snapshot
+ * |
+ * |
+ * V
+ * mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding inode.
+ *
+ * All mounts are handled automatically by a user mode helper which invokes
+ * the mount procedure. Unmounts are handled by allowing the mount
+ * point to expire so the kernel may automatically unmount it.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
+ * zfsvfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
+ * (ie: snapshots) are complete ZFS filesystems and have their own unique
+ * zfsvfs_t. However, the fsid reported by these mounts will be the same
+ * as that used by the parent zfsvfs_t to make NFS happy.
+ */
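+/*
+ * Reserved inode numbers identify the control directory entries created
+ * below: ZFSCTL_INO_ROOT for '.zfs', ZFSCTL_INO_SNAPDIR for
+ * '.zfs/snapshot', ZFSCTL_INO_SHARES for '.zfs/shares', and
+ * ZFSCTL_INO_SNAPDIRS - objsetid for each '.zfs/snapshot/<snapname>'
+ * directory (see zfsctl_snapdir_lookup() and zfsctl_snapdir_fid()).
+ */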
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_deleg.h>
+#include <sys/zpl.h>
+#include <sys/mntent.h>
+#include "zfs_namecheck.h"
+
+/*
+ * Two AVL trees are maintained which contain all currently automounted
+ * snapshots. Every automounted snapshot maps to a single zfs_snapentry_t
+ * entry which MUST:
+ *
+ * - be attached to both trees, and
+ * - be unique, no duplicate entries are allowed.
+ *
+ * The zfs_snapshots_by_name tree is indexed by the full dataset name
+ * while the zfs_snapshots_by_objsetid tree is indexed by the unique
+ * objsetid. This allows for fast lookups either by name or objsetid.
+ */
+static avl_tree_t zfs_snapshots_by_name;
+static avl_tree_t zfs_snapshots_by_objsetid;
+static krwlock_t zfs_snapshot_lock;
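+
+/*
+ * Illustrative sketch (not compiled): a typical read-side lookup against
+ * these trees using the helpers defined below; the snapshot name is only
+ * an example.
+ *
+ *	rw_enter(&zfs_snapshot_lock, RW_READER);
+ *	zfs_snapentry_t *se = zfsctl_snapshot_find_by_name("pool/fs@snap");
+ *	if (se != NULL) {
+ *		... use se->se_path or se->se_objsetid ...
+ *		zfsctl_snapshot_rele(se);
+ *	}
+ *	rw_exit(&zfs_snapshot_lock);
+ */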
+
+/*
+ * Control Directory Tunables (.zfs)
+ */
+int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
+int zfs_admin_snapshot = 0;
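+
+/*
+ * Both tunables are exposed as module parameters (see the bottom of this
+ * file). For example, assuming the module is loaded as 'zfs':
+ *
+ *	echo 1   > /sys/module/zfs/parameters/zfs_admin_snapshot
+ *	echo 300 > /sys/module/zfs/parameters/zfs_expire_snapshot
+ */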
+
+typedef struct {
+ char *se_name; /* full snapshot name */
+ char *se_path; /* full mount path */
+ spa_t *se_spa; /* pool spa */
+ uint64_t se_objsetid; /* snapshot objset id */
+ struct dentry *se_root_dentry; /* snapshot root dentry */
+ taskqid_t se_taskqid; /* scheduled unmount taskqid */
+ avl_node_t se_node_name; /* zfs_snapshots_by_name link */
+ avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
+ zfs_refcount_t se_refcount; /* reference count */
+} zfs_snapentry_t;
+
+static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
+
+/*
+ * Allocate a new zfs_snapentry_t being careful to make a copy of
+ * the snapshot name and provided mount point. No reference is taken.
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa,
+ uint64_t objsetid, struct dentry *root_dentry)
+{
+ zfs_snapentry_t *se;
+
+ se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+
+ se->se_name = kmem_strdup(full_name);
+ se->se_path = kmem_strdup(full_path);
+ se->se_spa = spa;
+ se->se_objsetid = objsetid;
+ se->se_root_dentry = root_dentry;
+ se->se_taskqid = TASKQID_INVALID;
+
+ zfs_refcount_create(&se->se_refcount);
+
+ return (se);
+}
+
+/*
+ * Free a zfs_snapentry_t; the caller must ensure there are no active
+ * references.
+ */
+static void
+zfsctl_snapshot_free(zfs_snapentry_t *se)
+{
+ zfs_refcount_destroy(&se->se_refcount);
+ kmem_strfree(se->se_name);
+ kmem_strfree(se->se_path);
+
+ kmem_free(se, sizeof (zfs_snapentry_t));
+}
+
+/*
+ * Hold a reference on the zfs_snapentry_t.
+ */
+static void
+zfsctl_snapshot_hold(zfs_snapentry_t *se)
+{
+ zfs_refcount_add(&se->se_refcount, NULL);
+}
+
+/*
+ * Release a reference on the zfs_snapentry_t. When the number of
+ * references drops to zero the structure will be freed.
+ */
+static void
+zfsctl_snapshot_rele(zfs_snapentry_t *se)
+{
+ if (zfs_refcount_remove(&se->se_refcount, NULL) == 0)
+ zfsctl_snapshot_free(se);
+}
+
+/*
+ * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part
+ * of the trees a reference is held.
+ */
+static void
+zfsctl_snapshot_add(zfs_snapentry_t *se)
+{
+ ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+ zfsctl_snapshot_hold(se);
+ avl_add(&zfs_snapshots_by_name, se);
+ avl_add(&zfs_snapshots_by_objsetid, se);
+}
+
+/*
+ * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped,
+ * this can result in the structure being freed if that was the last
+ * remaining reference.
+ */
+static void
+zfsctl_snapshot_remove(zfs_snapentry_t *se)
+{
+ ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+ avl_remove(&zfs_snapshots_by_name, se);
+ avl_remove(&zfs_snapshots_by_objsetid, se);
+ zfsctl_snapshot_rele(se);
+}
+
+/*
+ * Snapshot name comparison function for the zfs_snapshots_by_name.
+ */
+static int
+snapentry_compare_by_name(const void *a, const void *b)
+{
+ const zfs_snapentry_t *se_a = a;
+ const zfs_snapentry_t *se_b = b;
+ int ret;
+
+ ret = strcmp(se_a->se_name, se_b->se_name);
+
+ if (ret < 0)
+ return (-1);
+ else if (ret > 0)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Snapshot objsetid comparison function for zfs_snapshots_by_objsetid.
+ */
+static int
+snapentry_compare_by_objsetid(const void *a, const void *b)
+{
+ const zfs_snapentry_t *se_a = a;
+ const zfs_snapentry_t *se_b = b;
+
+ if (se_a->se_spa != se_b->se_spa)
+ return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1);
+
+ if (se_a->se_objsetid < se_b->se_objsetid)
+ return (-1);
+ else if (se_a->se_objsetid > se_b->se_objsetid)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname
+ * is found a pointer to the zfs_snapentry_t is returned and a reference
+ * taken on the structure. The caller is responsible for dropping the
+ * reference with zfsctl_snapshot_rele(). If the snapname is not found
+ * NULL will be returned.
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_name(const char *snapname)
+{
+ zfs_snapentry_t *se, search;
+
+ ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
+
+ search.se_name = (char *)snapname;
+ se = avl_find(&zfs_snapshots_by_name, &search, NULL);
+ if (se)
+ zfsctl_snapshot_hold(se);
+
+ return (se);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
+ * rather than the snapname. In all other respects it behaves the same
+ * as zfsctl_snapshot_find_by_name().
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid)
+{
+ zfs_snapentry_t *se, search;
+
+ ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
+
+ search.se_spa = spa;
+ search.se_objsetid = objsetid;
+ se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
+ if (se)
+ zfsctl_snapshot_hold(se);
+
+ return (se);
+}
+
+/*
+ * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is
+ * removed, renamed, and added back to the new correct location in the tree.
+ */
+static int
+zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname)
+{
+ zfs_snapentry_t *se;
+
+ ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+
+ se = zfsctl_snapshot_find_by_name(old_snapname);
+ if (se == NULL)
+ return (SET_ERROR(ENOENT));
+
+ zfsctl_snapshot_remove(se);
+ kmem_strfree(se->se_name);
+ se->se_name = kmem_strdup(new_snapname);
+ zfsctl_snapshot_add(se);
+ zfsctl_snapshot_rele(se);
+
+ return (0);
+}
+
+/*
+ * Delayed task responsible for unmounting an expired automounted snapshot.
+ */
+static void
+snapentry_expire(void *data)
+{
+ zfs_snapentry_t *se = (zfs_snapentry_t *)data;
+ spa_t *spa = se->se_spa;
+ uint64_t objsetid = se->se_objsetid;
+
+ if (zfs_expire_snapshot <= 0) {
+ zfsctl_snapshot_rele(se);
+ return;
+ }
+
+ se->se_taskqid = TASKQID_INVALID;
+ (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
+ zfsctl_snapshot_rele(se);
+
+ /*
+ * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
+ * This can occur when the snapshot is busy.
+ */
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
+ zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+ zfsctl_snapshot_rele(se);
+ }
+ rw_exit(&zfs_snapshot_lock);
+}
+
+/*
+ * Cancel an automatic unmount of a snapname. This callback is responsible
+ * for dropping the reference on the zfs_snapentry_t which was taken
+ * during dispatch.
+ */
+static void
+zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
+{
+ if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) {
+ se->se_taskqid = TASKQID_INVALID;
+ zfsctl_snapshot_rele(se);
+ }
+}
+
+/*
+ * Dispatch the unmount task for delayed handling with a hold protecting it.
+ */
+static void
+zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
+{
+ ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID);
+
+ if (delay <= 0)
+ return;
+
+ zfsctl_snapshot_hold(se);
+ se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
+ snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
+}
+
+/*
+ * Schedule an automatic unmount of objset id to occur in delay seconds from
+ * now. Any previous delayed unmount will be cancelled in favor of the
+ * updated deadline. A reference is taken by zfsctl_snapshot_find_by_objsetid()
+ * and held until the outstanding task is handled or cancelled.
+ */
+int
+zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
+{
+ zfs_snapentry_t *se;
+ int error = ENOENT;
+
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
+ zfsctl_snapshot_unmount_cancel(se);
+ zfsctl_snapshot_unmount_delay_impl(se, delay);
+ zfsctl_snapshot_rele(se);
+ error = 0;
+ }
+ rw_exit(&zfs_snapshot_lock);
+
+ return (error);
+}
+
+/*
+ * Check if snapname is currently mounted. Returns B_TRUE when mounted
+ * and B_FALSE when unmounted.
+ */
+static boolean_t
+zfsctl_snapshot_ismounted(const char *snapname)
+{
+ zfs_snapentry_t *se;
+ boolean_t ismounted = B_FALSE;
+
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
+ zfsctl_snapshot_rele(se);
+ ismounted = B_TRUE;
+ }
+ rw_exit(&zfs_snapshot_lock);
+
+ return (ismounted);
+}
+
+/*
+ * Check if the given inode is a part of the virtual .zfs directory.
+ */
+boolean_t
+zfsctl_is_node(struct inode *ip)
+{
+ return (ITOZ(ip)->z_is_ctldir);
+}
+
+/*
+ * Check if the given inode is a '.zfs/snapshot/<snapname>' directory.
+ */
+boolean_t
+zfsctl_is_snapdir(struct inode *ip)
+{
+ return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
+}
+
+/*
+ * Allocate a new inode with the passed id and ops.
+ */
+static struct inode *
+zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
+ const struct file_operations *fops, const struct inode_operations *ops)
+{
+ inode_timespec_t now;
+ struct inode *ip;
+ znode_t *zp;
+
+ ip = new_inode(zfsvfs->z_sb);
+ if (ip == NULL)
+ return (NULL);
+
+ now = current_time(ip);
+ zp = ITOZ(ip);
+ ASSERT3P(zp->z_dirlocks, ==, NULL);
+ ASSERT3P(zp->z_acl_cached, ==, NULL);
+ ASSERT3P(zp->z_xattr_cached, ==, NULL);
+ zp->z_id = id;
+ zp->z_unlinked = B_FALSE;
+ zp->z_atime_dirty = B_FALSE;
+ zp->z_zn_prefetch = B_FALSE;
+ zp->z_is_sa = B_FALSE;
+ zp->z_is_mapped = B_FALSE;
+ zp->z_is_ctldir = B_TRUE;
+ zp->z_is_stale = B_FALSE;
+ zp->z_sa_hdl = NULL;
+ zp->z_blksz = 0;
+ zp->z_seq = 0;
+ zp->z_mapcnt = 0;
+ zp->z_size = 0;
+ zp->z_pflags = 0;
+ zp->z_mode = 0;
+ zp->z_sync_cnt = 0;
+ ip->i_generation = 0;
+ ip->i_ino = id;
+ ip->i_mode = (S_IFDIR | S_IRWXUGO);
+ ip->i_uid = SUID_TO_KUID(0);
+ ip->i_gid = SGID_TO_KGID(0);
+ ip->i_blkbits = SPA_MINBLOCKSHIFT;
+ ip->i_atime = now;
+ ip->i_mtime = now;
+ ip->i_ctime = now;
+ ip->i_fop = fops;
+ ip->i_op = ops;
+#if defined(IOP_XATTR)
+ ip->i_opflags &= ~IOP_XATTR;
+#endif
+
+ if (insert_inode_locked(ip)) {
+ unlock_new_inode(ip);
+ iput(ip);
+ return (NULL);
+ }
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes++;
+ membar_producer();
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ unlock_new_inode(ip);
+
+ return (ip);
+}
+
+/*
+ * Lookup the inode with given id, it will be allocated if needed.
+ */
+static struct inode *
+zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
+ const struct file_operations *fops, const struct inode_operations *ops)
+{
+ struct inode *ip = NULL;
+
+ while (ip == NULL) {
+ ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
+ if (ip)
+ break;
+
+ /* May fail due to concurrent zfsctl_inode_alloc() */
+ ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops);
+ }
+
+ return (ip);
+}
+
+/*
+ * Create the '.zfs' directory. This directory is cached as part of the VFS
+ * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1. This reference
+ * is removed when the ctldir is destroyed in the unmount. All other entities
+ * under the '.zfs' directory are created dynamically as needed.
+ *
+ * Because the dynamically created '.zfs' directory entries assume the use
+ * of 64-bit inode numbers, this support must be disabled on 32-bit systems.
+ */
+int
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+ ASSERT(zfsvfs->z_ctldir == NULL);
+
+ zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
+ &zpl_fops_root, &zpl_ops_root);
+ if (zfsvfs->z_ctldir == NULL)
+ return (SET_ERROR(ENOENT));
+
+ return (0);
+}
+
+/*
+ * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name.
+ * Only called when the filesystem is unmounted.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+ if (zfsvfs->z_issnap) {
+ zfs_snapentry_t *se;
+ spa_t *spa = zfsvfs->z_os->os_spa;
+ uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+
+ rw_enter(&zfs_snapshot_lock, RW_WRITER);
+ se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
+ if (se != NULL)
+ zfsctl_snapshot_remove(se);
+ rw_exit(&zfs_snapshot_lock);
+ if (se != NULL) {
+ zfsctl_snapshot_unmount_cancel(se);
+ zfsctl_snapshot_rele(se);
+ }
+ } else if (zfsvfs->z_ctldir) {
+ iput(zfsvfs->z_ctldir);
+ zfsvfs->z_ctldir = NULL;
+ }
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+struct inode *
+zfsctl_root(znode_t *zp)
+{
+ ASSERT(zfs_has_ctldir(zp));
+ igrab(ZTOZSB(zp)->z_ctldir);
+ return (ZTOZSB(zp)->z_ctldir);
+}
+
+/*
+ * Generate a long fid to indicate a snapdir. We encode whether the snapdir
+ * is already mounted in the gen field. We do this because an nfsd lookup
+ * will not trigger the automount. The next time nfsd does fh_to_dentry, we
+ * will notice this, perform the automount, and return ESTALE to force nfsd
+ * to revalidate and follow the mount.
+ */
+static int
+zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp)
+{
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint32_t gen = 0;
+ uint64_t object;
+ uint64_t objsetid;
+ int i;
+ struct dentry *dentry;
+
+ if (fidp->fid_len < LONG_FID_LEN) {
+ fidp->fid_len = LONG_FID_LEN;
+ return (SET_ERROR(ENOSPC));
+ }
+
+ object = ip->i_ino;
+ objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino;
+ zfid->zf_len = LONG_FID_LEN;
+
+ dentry = d_obtain_alias(igrab(ip));
+ if (!IS_ERR(dentry)) {
+ gen = !!d_mountpoint(dentry);
+ dput(dentry);
+ }
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ zlfid->zf_setgen[i] = 0;
+
+ return (0);
+}
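+
+/*
+ * For reference, the long fid produced above is laid out as follows:
+ * zf_object holds the snapdir entry's inode number, zf_gen is 1 when the
+ * snapdir is already a mountpoint and 0 otherwise, zf_setid holds the
+ * snapshot's objsetid, and zf_setgen is always 0. zfsctl_snapdir_vget()
+ * compares the gen bit against the current mount state and returns ENOENT
+ * on a mismatch so that nfsd revalidates.
+ */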
+
+/*
+ * Generate an appropriate fid for an entry in the .zfs directory.
+ */
+int
+zfsctl_fid(struct inode *ip, fid_t *fidp)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int i;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsctl_is_snapdir(ip)) {
+ ZFS_EXIT(zfsvfs);
+ return (zfsctl_snapdir_fid(ip, fidp));
+ }
+
+ if (fidp->fid_len < SHORT_FID_LEN) {
+ fidp->fid_len = SHORT_FID_LEN;
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = SHORT_FID_LEN;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* .zfs znodes always have a generation number of 0 */
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = 0;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Construct a full dataset name in full_name: "pool/dataset@snap_name"
+ */
+static int
+zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len,
+ char *full_name)
+{
+ objset_t *os = zfsvfs->z_os;
+
+ if (zfs_component_namecheck(snap_name, NULL, NULL) != 0)
+ return (SET_ERROR(EILSEQ));
+
+ dmu_objset_name(os, full_name);
+ if ((strlen(full_name) + 1 + strlen(snap_name)) >= len)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ (void) strcat(full_name, "@");
+ (void) strcat(full_name, snap_name);
+
+ return (0);
+}
+
+/*
+ * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
+ */
+static int
+zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid,
+ int path_len, char *full_path)
+{
+ objset_t *os = zfsvfs->z_os;
+ fstrans_cookie_t cookie;
+ char *snapname;
+ boolean_t case_conflict;
+ uint64_t id, pos = 0;
+ int error = 0;
+
+ if (zfsvfs->z_vfs->vfs_mntpoint == NULL)
+ return (SET_ERROR(ENOENT));
+
+ cookie = spl_fstrans_mark();
+ snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ while (error == 0) {
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os,
+ ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos,
+ &case_conflict);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (error)
+ goto out;
+
+ if (id == objsetid)
+ break;
+ }
+
+ snprintf(full_path, path_len, "%s/.zfs/snapshot/%s",
+ zfsvfs->z_vfs->vfs_mntpoint, snapname);
+out:
+ kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
+ spl_fstrans_unmark(cookie);
+
+ return (error);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+int
+zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp,
+ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ int error = 0;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (strcmp(name, "..") == 0) {
+ *ipp = dip->i_sb->s_root->d_inode;
+ } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
+ *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR,
+ &zpl_fops_snapdir, &zpl_ops_snapdir);
+ } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
+ *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES,
+ &zpl_fops_shares, &zpl_ops_shares);
+ } else {
+ *ipp = NULL;
+ }
+
+ if (*ipp == NULL)
+ error = SET_ERROR(ENOENT);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exists, creating the pseudo filesystem inode as necessary.
+ */
+int
+zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp,
+ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ uint64_t id;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id,
+ &simple_dir_operations, &simple_dir_inode_operations);
+ if (*ipp == NULL)
+ error = SET_ERROR(ENOENT);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Renaming a directory under '.zfs/snapshot' will automatically trigger
+ * a rename of the snapshot to the new given name. The rename is confined
+ * to the '.zfs/snapshot' directory; snapshots cannot be moved elsewhere.
+ */
+int
+zfsctl_snapdir_rename(struct inode *sdip, const char *snm,
+ struct inode *tdip, const char *tnm, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(sdip);
+ char *to, *from, *real, *fsname;
+ int error;
+
+ if (!zfs_admin_snapshot)
+ return (SET_ERROR(EACCES));
+
+ ZFS_ENTER(zfsvfs);
+
+ to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ error = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
+ ZFS_MAX_DATASET_NAME_LEN, NULL);
+ if (error == 0) {
+ snm = real;
+ } else if (error != ENOTSUP) {
+ goto out;
+ }
+ }
+
+ dmu_objset_name(zfsvfs->z_os, fsname);
+
+ error = zfsctl_snapshot_name(ITOZSB(sdip), snm,
+ ZFS_MAX_DATASET_NAME_LEN, from);
+ if (error == 0)
+ error = zfsctl_snapshot_name(ITOZSB(tdip), tnm,
+ ZFS_MAX_DATASET_NAME_LEN, to);
+ if (error == 0)
+ error = zfs_secpolicy_rename_perms(from, to, cr);
+ if (error != 0)
+ goto out;
+
+ /*
+ * Cannot move snapshots out of the snapdir.
+ */
+ if (sdip != tdip) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /*
+ * No-op when names are identical.
+ */
+ if (strcmp(snm, tnm) == 0) {
+ error = 0;
+ goto out;
+ }
+
+ rw_enter(&zfs_snapshot_lock, RW_WRITER);
+
+ error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
+ if (error == 0)
+ (void) zfsctl_snapshot_rename(snm, tnm);
+
+ rw_exit(&zfs_snapshot_lock);
+out:
+ kmem_free(from, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(to, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Removing a directory under '.zfs/snapshot' will automatically trigger
+ * the removal of the snapshot with the given name.
+ */
+int
+zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr,
+ int flags)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ char *snapname, *real;
+ int error;
+
+ if (!zfs_admin_snapshot)
+ return (SET_ERROR(EACCES));
+
+ ZFS_ENTER(zfsvfs);
+
+ snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ error = dmu_snapshot_realname(zfsvfs->z_os, name, real,
+ ZFS_MAX_DATASET_NAME_LEN, NULL);
+ if (error == 0) {
+ name = real;
+ } else if (error != ENOTSUP) {
+ goto out;
+ }
+ }
+
+ error = zfsctl_snapshot_name(ITOZSB(dip), name,
+ ZFS_MAX_DATASET_NAME_LEN, snapname);
+ if (error == 0)
+ error = zfs_secpolicy_destroy_perms(snapname, cr);
+ if (error != 0)
+ goto out;
+
+ error = zfsctl_snapshot_unmount(snapname, MNT_FORCE);
+ if ((error == 0) || (error == ENOENT))
+ error = dsl_destroy_snapshot(snapname, B_FALSE);
+out:
+ kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Creating a directory under '.zfs/snapshot' will automatically trigger
+ * the creation of a new snapshot with the given name.
+ */
+int
+zfsctl_snapdir_mkdir(struct inode *dip, const char *dirname, vattr_t *vap,
+ struct inode **ipp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ char *dsname;
+ int error;
+
+ if (!zfs_admin_snapshot)
+ return (SET_ERROR(EACCES));
+
+ dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ if (zfs_component_namecheck(dirname, NULL, NULL) != 0) {
+ error = SET_ERROR(EILSEQ);
+ goto out;
+ }
+
+ dmu_objset_name(zfsvfs->z_os, dsname);
+
+ error = zfs_secpolicy_snapshot_perms(dsname, cr);
+ if (error != 0)
+ goto out;
+
+ if (error == 0) {
+ error = dmu_objset_snapshot_one(dsname, dirname);
+ if (error != 0)
+ goto out;
+
+ error = zfsctl_snapdir_lookup(dip, dirname, ipp,
+ 0, cr, NULL, NULL);
+ }
+out:
+ kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ return (error);
+}
+
+/*
+ * Flush everything out of the kernel's export table and such.
+ * This is needed as once the snapshot is used over NFS, its
+ * entries in the svc_export and svc_expkey caches hold a reference
+ * to the snapshot mount point. There is no known way of flushing
+ * only the entries related to the snapshot.
+ */
+static void
+exportfs_flush(void)
+{
+ char *argv[] = { "/usr/sbin/exportfs", "-f", NULL };
+ char *envp[] = { NULL };
+
+ (void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+}
+
+/*
+ * Attempt to unmount a snapshot by making a call to user space.
+ * There is no assurance that this can or will succeed; it is just a
+ * best effort. In the case where it does fail, perhaps because
+ * it's in use, the unmount will fail harmlessly.
+ */
+int
+zfsctl_snapshot_unmount(const char *snapname, int flags)
+{
+ char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL,
+ NULL };
+ char *envp[] = { NULL };
+ zfs_snapentry_t *se;
+ int error;
+
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) {
+ rw_exit(&zfs_snapshot_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ rw_exit(&zfs_snapshot_lock);
+
+ exportfs_flush();
+
+ if (flags & MNT_FORCE)
+ argv[4] = "-fn";
+ argv[5] = se->se_path;
+ dprintf("unmount; path=%s\n", se->se_path);
+ error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ zfsctl_snapshot_rele(se);
+
+
+ /*
+ * The umount system utility will return 256 on error. We must
+	 * assume this error is because the file system is busy, so it is
+ * converted to the more sensible EBUSY.
+ */
+ if (error)
+ error = SET_ERROR(EBUSY);
+
+ return (error);
+}
+
+int
+zfsctl_snapshot_mount(struct path *path, int flags)
+{
+ struct dentry *dentry = path->dentry;
+ struct inode *ip = dentry->d_inode;
+ zfsvfs_t *zfsvfs;
+ zfsvfs_t *snap_zfsvfs;
+ zfs_snapentry_t *se;
+ char *full_name, *full_path;
+ char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL,
+ NULL };
+ char *envp[] = { NULL };
+ int error;
+ struct path spath;
+
+ if (ip == NULL)
+ return (SET_ERROR(EISDIR));
+
+ zfsvfs = ITOZSB(ip);
+ ZFS_ENTER(zfsvfs);
+
+ full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ error = zfsctl_snapshot_name(zfsvfs, dname(dentry),
+ ZFS_MAX_DATASET_NAME_LEN, full_name);
+ if (error)
+ goto error;
+
+ /*
+ * Construct a mount point path from sb of the ctldir inode and dirent
+ * name, instead of from d_path(), so that chroot'd process doesn't fail
+ * on mount.zfs(8).
+ */
+ snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s",
+ zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "",
+ dname(dentry));
+
+ /*
+ * Multiple concurrent automounts of a snapshot are never allowed.
+ * The snapshot may be manually mounted as many times as desired.
+ */
+ if (zfsctl_snapshot_ismounted(full_name)) {
+ error = 0;
+ goto error;
+ }
+
+ /*
+ * Attempt to mount the snapshot from user space. Normally this
+ * would be done using the vfs_kern_mount() function, however that
+	 * function is marked GPL-only and cannot be used. On error we are
+ * careful to log the real error to the console and return EISDIR
+ * to safely abort the automount. This should be very rare.
+ *
+ * If the user mode helper happens to return EBUSY, a concurrent
+ * mount is already in progress in which case the error is ignored.
+ * Take note that if the program was executed successfully the return
+ * value from call_usermodehelper() will be (exitcode << 8 + signal).
+ */
+ dprintf("mount; name=%s path=%s\n", full_name, full_path);
+ argv[5] = full_name;
+ argv[6] = full_path;
+ error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ if (error) {
+ if (!(error & MOUNT_BUSY << 8)) {
+ zfs_dbgmsg("Unable to automount %s error=%d",
+ full_path, error);
+ error = SET_ERROR(EISDIR);
+ } else {
+ /*
+ * EBUSY, this could mean a concurrent mount, or the
+			 * snapshot has already been mounted at a completely
+			 * different place. We return 0 so the VFS will retry. In
+			 * the latter case the VFS will retry several times
+			 * and return ELOOP, which is probably not very good
+			 * behavior.
+ */
+ error = 0;
+ }
+ goto error;
+ }
+
+ /*
+ * Follow down in to the mounted snapshot and set MNT_SHRINKABLE
+ * to identify this as an automounted filesystem.
+ */
+ spath = *path;
+ path_get(&spath);
+ if (follow_down_one(&spath)) {
+ snap_zfsvfs = ITOZSB(spath.dentry->d_inode);
+ snap_zfsvfs->z_parent = zfsvfs;
+ dentry = spath.dentry;
+ spath.mnt->mnt_flags |= MNT_SHRINKABLE;
+
+ rw_enter(&zfs_snapshot_lock, RW_WRITER);
+ se = zfsctl_snapshot_alloc(full_name, full_path,
+ snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
+ dentry);
+ zfsctl_snapshot_add(se);
+ zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+ rw_exit(&zfs_snapshot_lock);
+ }
+ path_put(&spath);
+error:
+ kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(full_path, MAXPATHLEN);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Get the snapdir inode from fid
+ */
+int
+zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen,
+ struct inode **ipp)
+{
+ int error;
+ struct path path;
+ char *mnt;
+ struct dentry *dentry;
+
+ mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid,
+ MAXPATHLEN, mnt);
+ if (error)
+ goto out;
+
+ /* Trigger automount */
+ error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
+ if (error)
+ goto out;
+
+ path_put(&path);
+ /*
+ * Get the snapdir inode. Note, we don't want to use the above
+ * path because it contains the root of the snapshot rather
+ * than the snapdir.
+ */
+ *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid);
+ if (*ipp == NULL) {
+ error = SET_ERROR(ENOENT);
+ goto out;
+ }
+
+ /* check gen, see zfsctl_snapdir_fid */
+ dentry = d_obtain_alias(igrab(*ipp));
+ if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) {
+ iput(*ipp);
+ *ipp = NULL;
+ error = SET_ERROR(ENOENT);
+ }
+ if (!IS_ERR(dentry))
+ dput(dentry);
+out:
+ kmem_free(mnt, MAXPATHLEN);
+ return (error);
+}
+
+int
+zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
+ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ znode_t *zp;
+ znode_t *dzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsvfs->z_shares_dir == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+ error = zfs_lookup(dzp, name, &zp, 0, cr, NULL, NULL);
+ zrele(dzp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Initialize the various pieces we'll need to create and manipulate .zfs
+ * directories. Currently this is unused but available.
+ */
+void
+zfsctl_init(void)
+{
+ avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name,
+ sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+ se_node_name));
+ avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid,
+ sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+ se_node_objsetid));
+ rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * Cleanup the various pieces we needed for .zfs directories. In particular
+ * ensure the expiry timer is canceled safely.
+ */
+void
+zfsctl_fini(void)
+{
+ avl_destroy(&zfs_snapshots_by_name);
+ avl_destroy(&zfs_snapshots_by_objsetid);
+ rw_destroy(&zfs_snapshot_lock);
+}
+
+module_param(zfs_admin_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot");
+
+module_param(zfs_expire_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c
new file mode 100644
index 000000000000..8d7f04097da8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c
@@ -0,0 +1,255 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/trace_zfs.h>
+
+typedef struct zfs_dbgmsg {
+ procfs_list_node_t zdm_node;
+ uint64_t zdm_timestamp;
+ int zdm_size;
+ char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+procfs_list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size = 0;
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+
+/*
+ * Internal ZFS debug messages are enabled by default.
+ *
+ * # Print debug messages
+ * cat /proc/spl/kstat/zfs/dbgmsg
+ *
+ * # Disable the kernel debug message log.
+ * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
+ *
+ * # Clear the kernel debug message log.
+ * echo 0 >/proc/spl/kstat/zfs/dbgmsg
+ */
+int zfs_dbgmsg_enable = 1;
+
+static int
+zfs_dbgmsg_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-12s %-8s\n", "timestamp", "message");
+ return (0);
+}
+
+static int
+zfs_dbgmsg_show(struct seq_file *f, void *p)
+{
+ zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p;
+ seq_printf(f, "%-12llu %-s\n",
+ (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
+ return (0);
+}
+
+static void
+zfs_dbgmsg_purge(int max_size)
+{
+ while (zfs_dbgmsg_size > max_size) {
+ zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list);
+ if (zdm == NULL)
+ return;
+
+ int size = zdm->zdm_size;
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+}
+
+static int
+zfs_dbgmsg_clear(procfs_list_t *procfs_list)
+{
+ mutex_enter(&zfs_dbgmsgs.pl_lock);
+ zfs_dbgmsg_purge(0);
+ mutex_exit(&zfs_dbgmsgs.pl_lock);
+ return (0);
+}
+
+void
+zfs_dbgmsg_init(void)
+{
+ procfs_list_install("zfs",
+ NULL,
+ "dbgmsg",
+ 0600,
+ &zfs_dbgmsgs,
+ zfs_dbgmsg_show,
+ zfs_dbgmsg_show_header,
+ zfs_dbgmsg_clear,
+ offsetof(zfs_dbgmsg_t, zdm_node));
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+ procfs_list_uninstall(&zfs_dbgmsgs);
+ zfs_dbgmsg_purge(0);
+
+ /*
+ * TODO - decide how to make this permanent
+ */
+#ifdef _KERNEL
+ procfs_list_destroy(&zfs_dbgmsgs);
+#endif
+}
+
+void
+__set_error(const char *file, const char *func, int line, int err)
+{
+ /*
+ * To enable this:
+ *
+ * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
+ */
+ if (zfs_flags & ZFS_DEBUG_SET_ERROR)
+		__dprintf(B_FALSE, file, func, line, "error %d", err);
+}
+
+void
+__zfs_dbgmsg(char *buf)
+{
+ int size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+ zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP);
+ zdm->zdm_size = size;
+ zdm->zdm_timestamp = gethrestime_sec();
+ strcpy(zdm->zdm_msg, buf);
+
+ mutex_enter(&zfs_dbgmsgs.pl_lock);
+ procfs_list_add(&zfs_dbgmsgs, zdm);
+ zfs_dbgmsg_size += size;
+ zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+ mutex_exit(&zfs_dbgmsgs.pl_lock);
+}
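+
+/*
+ * Illustrative usage (a sketch, not an exhaustive reference): callers
+ * normally reach __zfs_dbgmsg() through the zfs_dbgmsg() and, in debug
+ * builds, dprintf() macros, which format into a buffer via __dprintf():
+ *
+ *    zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
+ *    dprintf("freeing object %llu\n", (u_longlong_t)obj);
+ *
+ * 'obj' above is a hypothetical variable; the resulting text is readable
+ * from /proc/spl/kstat/zfs/dbgmsg as described at the top of this file.
+ */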
+
+#ifdef _KERNEL
+
+void
+__dprintf(boolean_t dprint, const char *file, const char *func,
+ int line, const char *fmt, ...)
+{
+ const char *newfile;
+ va_list adx;
+ size_t size;
+ char *buf;
+ char *nl;
+ int i;
+ char *prefix = (dprint) ? "dprintf: " : "";
+
+ size = 1024;
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ /*
+ * Get rid of annoying prefix to filename.
+ */
+ newfile = strrchr(file, '/');
+ if (newfile != NULL) {
+ newfile = newfile + 1; /* Get rid of leading / */
+ } else {
+ newfile = file;
+ }
+
+ i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func);
+
+ if (i < size) {
+ va_start(adx, fmt);
+ (void) vsnprintf(buf + i, size - i, fmt, adx);
+ va_end(adx);
+ }
+
+ /*
+ * Get rid of trailing newline for dprintf logs.
+ */
+ if (dprint && buf[0] != '\0') {
+ nl = &buf[strlen(buf) - 1];
+ if (*nl == '\n')
+ *nl = '\0';
+ }
+
+ /*
+ * To get this data enable the zfs__dprintf trace point as shown:
+ *
+ * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer
+ * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable
+ * $ echo 0 > /sys/kernel/debug/tracing/trace
+ *
+ * # Dump the ring buffer.
+ * $ cat /sys/kernel/debug/tracing/trace
+ */
+ DTRACE_PROBE1(zfs__dprintf, char *, buf);
+
+ /*
+ * To get this data:
+ *
+ * $ cat /proc/spl/kstat/zfs/dbgmsg
+ *
+ * To clear the buffer:
+ * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg
+ */
+ __zfs_dbgmsg(buf);
+
+ kmem_free(buf, size);
+}
+
+#else
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+ ssize_t ret __attribute__((unused));
+
+ /*
+ * We use write() in this function instead of printf()
+ * so it is safe to call from a signal handler.
+ */
+ ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
+ ret = write(STDOUT_FILENO, tag, strlen(tag));
+ ret = write(STDOUT_FILENO, ") START:\n", 9);
+
+ mutex_enter(&zfs_dbgmsgs.pl_lock);
+ for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL;
+ zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) {
+ ret = write(STDOUT_FILENO, zdm->zdm_msg,
+ strlen(zdm->zdm_msg));
+ ret = write(STDOUT_FILENO, "\n", 1);
+ }
+
+ ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
+ ret = write(STDOUT_FILENO, tag, strlen(tag));
+ ret = write(STDOUT_FILENO, ") END\n", 6);
+
+ mutex_exit(&zfs_dbgmsgs.pl_lock);
+}
+#endif /* _KERNEL */
+
+#ifdef _KERNEL
+module_param(zfs_dbgmsg_enable, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log");
+
+module_param(zfs_dbgmsg_maxsize, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size");
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
new file mode 100644
index 000000000000..207a51d75bc9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
@@ -0,0 +1,1225 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_vnops.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+ matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp,
+ uint64_t *zoid)
+{
+ boolean_t conflict = B_FALSE;
+ int error;
+
+ if (zfsvfs->z_norm) {
+ size_t bufsz = 0;
+ char *buf = NULL;
+
+ if (rpnp) {
+ buf = rpnp->pn_buf;
+ bufsz = rpnp->pn_bufsize;
+ }
+
+ /*
+ * In the non-mixed case we only expect there would ever
+ * be one match, but we need to use the normalizing lookup.
+ */
+ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
+ zoid, mt, buf, bufsz, &conflict);
+ } else {
+ error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
+ }
+
+ /*
+ * Allow multiple entries provided the first entry is
+ * the object id. Non-zpl consumers may safely make
+ * use of the additional space.
+ *
+ * XXX: This should be a feature flag for compatibility
+ */
+ if (error == EOVERFLOW)
+ error = 0;
+
+ if (zfsvfs->z_norm && !error && deflags)
+ *deflags = conflict ? ED_CASE_CONFLICT : 0;
+
+ *zoid = ZFS_DIRENT_OBJ(*zoid);
+
+ return (error);
+}
+
+/*
+ * Lock a directory entry. A dirlock on <dzp, name> protects that name
+ * in dzp's directory zap object. As long as you hold a dirlock, you can
+ * assume two things: (1) dzp cannot be reaped, and (2) no other thread
+ * can change the zap entry for (i.e. link or unlink) this name.
+ *
+ * Input arguments:
+ * dzp - znode for directory
+ * name - name of entry to lock
+ * flag - ZNEW: if the entry already exists, fail with EEXIST.
+ * ZEXISTS: if the entry does not exist, fail with ENOENT.
+ * ZSHARED: allow concurrent access with other ZSHARED callers.
+ * ZXATTR: we want dzp's xattr directory
+ * ZCILOOK: On a mixed sensitivity file system,
+ * this lookup should be case-insensitive.
+ * ZCIEXACT: On a purely case-insensitive file system,
+ * this lookup should be case-sensitive.
+ * ZRENAMING: we are locking for renaming, force narrow locks
+ * ZHAVELOCK: Don't grab the z_name_lock for this call. The
+ * current thread already holds it.
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ * dlpp - pointer to the dirlock for this entry (NULL on error)
+ * direntflags - (case-insensitive lookup only)
+ * flags if multiple case-sensitive matches exist in directory
+ * realpnp - (case-insensitive lookup only)
+ * actual name matched within the directory
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ * NOTE: For case-insensitive file systems we take wide locks (see below),
+ * but return znode pointers to a single match.
+ */
+int
+zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name,
+ znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zfs_dirlock_t *dl;
+ boolean_t update;
+ matchtype_t mt = 0;
+ uint64_t zoid;
+ int error = 0;
+ int cmpflags;
+
+ *zpp = NULL;
+ *dlpp = NULL;
+
+ /*
+ * Verify that we are not trying to lock '.', '..', or '.zfs'
+ */
+ if ((name[0] == '.' &&
+ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
+ (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
+ return (SET_ERROR(EEXIST));
+
+ /*
+ * Case sensitivity and normalization preferences are set when
+ * the file system is created. These are stored in the
+ * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
+ * affect what vnodes can be cached in the DNLC, how we
+ * perform zap lookups, and the "width" of our dirlocks.
+ *
+ * A normal dirlock locks a single name. Note that with
+ * normalization a name can be composed multiple ways, but
+ * when normalized, these names all compare equal. A wide
+ * dirlock locks multiple names. We need these when the file
+ * system is supporting mixed-mode access. It is sometimes
+ * necessary to lock all case permutations of file name at
+ * once so that simultaneous case-insensitive/case-sensitive
+ * behaves as rationally as possible.
+ */
+
+ /*
+ * When matching we may need to normalize & change case according to
+ * FS settings.
+ *
+ * Note that a normalized match is necessary for a case insensitive
+ * filesystem when the lookup request is not exact because normalization
+ * can fold case independent of normalizing code point sequences.
+ *
+ * See the table above zfs_dropname().
+ */
+ if (zfsvfs->z_norm != 0) {
+ mt = MT_NORMALIZE;
+
+ /*
+ * Determine if the match needs to honor the case specified in
+ * lookup, and if so keep track of that so that during
+ * normalization we don't fold case.
+ */
+ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&
+ (flag & ZCIEXACT)) ||
+ (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {
+ mt |= MT_MATCH_CASE;
+ }
+ }
+
+ /*
+ * Only look in or update the DNLC if we are looking for the
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name.
+ *
+ * Maybe can add TO-UPPERed version of name to dnlc in ci-only
+ * case for performance improvement?
+ */
+ update = !zfsvfs->z_norm ||
+ (zfsvfs->z_case == ZFS_CASE_MIXED &&
+ !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
+
+ /*
+ * ZRENAMING indicates we are in a situation where we should
+ * take narrow locks regardless of the file system's
+ * preferences for normalizing and case folding. This will
+ * prevent us deadlocking trying to grab the same wide lock
+ * twice if the two names happen to be case-insensitive
+ * matches.
+ */
+ if (flag & ZRENAMING)
+ cmpflags = 0;
+ else
+ cmpflags = zfsvfs->z_norm;
+
+ /*
+ * Wait until there are no locks on this name.
+ *
+ * Don't grab the lock if it is already held. However, cannot
+ * have both ZSHARED and ZHAVELOCK together.
+ */
+ ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
+ if (!(flag & ZHAVELOCK))
+ rw_enter(&dzp->z_name_lock, RW_READER);
+
+ mutex_enter(&dzp->z_lock);
+ for (;;) {
+ if (dzp->z_unlinked && !(flag & ZXATTR)) {
+ mutex_exit(&dzp->z_lock);
+ if (!(flag & ZHAVELOCK))
+ rw_exit(&dzp->z_name_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
+ if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
+ U8_UNICODE_LATEST, &error) == 0) || error != 0)
+ break;
+ }
+ if (error != 0) {
+ mutex_exit(&dzp->z_lock);
+ if (!(flag & ZHAVELOCK))
+ rw_exit(&dzp->z_name_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ if (dl == NULL) {
+ /*
+ * Allocate a new dirlock and add it to the list.
+ */
+ dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
+ cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
+ dl->dl_name = name;
+ dl->dl_sharecnt = 0;
+ dl->dl_namelock = 0;
+ dl->dl_namesize = 0;
+ dl->dl_dzp = dzp;
+ dl->dl_next = dzp->z_dirlocks;
+ dzp->z_dirlocks = dl;
+ break;
+ }
+ if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
+ break;
+ cv_wait(&dl->dl_cv, &dzp->z_lock);
+ }
+
+ /*
+	 * If the caller already held the z_name_lock (ZHAVELOCK), record that
+	 * so zfs_dirent_unlock() will not drop it.
+ */
+ if (flag & ZHAVELOCK)
+ dl->dl_namelock = 1;
+
+ if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
+ /*
+ * We're the second shared reference to dl. Make a copy of
+ * dl_name in case the first thread goes away before we do.
+ * Note that we initialize the new name before storing its
+ * pointer into dl_name, because the first thread may load
+ * dl->dl_name at any time. It'll either see the old value,
+ * which belongs to it, or the new shared copy; either is OK.
+ */
+ dl->dl_namesize = strlen(dl->dl_name) + 1;
+ name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
+ bcopy(dl->dl_name, name, dl->dl_namesize);
+ dl->dl_name = name;
+ }
+
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * We have a dirlock on the name. (Note that it is the dirlock,
+ * not the dzp's z_lock, that protects the name in the zap object.)
+ * See if there's an object by this name; if so, put a hold on it.
+ */
+ if (flag & ZXATTR) {
+ error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+ sizeof (zoid));
+ if (error == 0)
+ error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);
+ } else {
+ error = zfs_match_find(zfsvfs, dzp, name, mt,
+ update, direntflags, realpnp, &zoid);
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ zfs_dirent_unlock(dl);
+ return (SET_ERROR(EEXIST));
+ }
+ error = zfs_zget(zfsvfs, zoid, zpp);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ }
+
+ *dlpp = dl;
+
+ return (0);
+}
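+
+/*
+ * Typical calling pattern (an illustrative sketch; it mirrors the use in
+ * zfs_dirlook() below):
+ *
+ *    zfs_dirlock_t *dl;
+ *    znode_t *zp;
+ *
+ *    error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED,
+ *        NULL, NULL);
+ *    if (error == 0) {
+ *            ... use zp while the name is protected by dl ...
+ *            zfs_dirent_unlock(dl);
+ *    }
+ */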
+
+/*
+ * Unlock this directory entry and wake anyone who was waiting for it.
+ */
+void
+zfs_dirent_unlock(zfs_dirlock_t *dl)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfs_dirlock_t **prev_dl, *cur_dl;
+
+ mutex_enter(&dzp->z_lock);
+
+ if (!dl->dl_namelock)
+ rw_exit(&dzp->z_name_lock);
+
+ if (dl->dl_sharecnt > 1) {
+ dl->dl_sharecnt--;
+ mutex_exit(&dzp->z_lock);
+ return;
+ }
+ prev_dl = &dzp->z_dirlocks;
+ while ((cur_dl = *prev_dl) != dl)
+ prev_dl = &cur_dl->dl_next;
+ *prev_dl = dl->dl_next;
+ cv_broadcast(&dl->dl_cv);
+ mutex_exit(&dzp->z_lock);
+
+ if (dl->dl_namesize != 0)
+ kmem_free(dl->dl_name, dl->dl_namesize);
+ cv_destroy(&dl->dl_cv);
+ kmem_free(dl, sizeof (*dl));
+}
+
+/*
+ * Look up an entry in a directory.
+ *
+ * NOTE: '.' and '..' are handled as special cases because
+ * no directory entries are actually stored for them. If this is
+ * the root of a filesystem, then '.zfs' is also treated as a
+ * special pseudo-directory.
+ */
+int
+zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags,
+ int *deflg, pathname_t *rpnp)
+{
+ zfs_dirlock_t *dl;
+ znode_t *zp;
+ struct inode *ip;
+ int error = 0;
+ uint64_t parent;
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *zpp = dzp;
+ zhold(*zpp);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the inode pointer for the snapshot directory.
+ */
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+
+ if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
+ error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+ "snapshot", &ip, 0, kcred, NULL, NULL);
+ *zpp = ITOZ(ip);
+ return (error);
+ }
+ rw_enter(&dzp->z_parent_lock, RW_READER);
+ error = zfs_zget(zfsvfs, parent, &zp);
+ if (error == 0)
+ *zpp = zp;
+ rw_exit(&dzp->z_parent_lock);
+ } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
+ ip = zfsctl_root(dzp);
+ *zpp = ITOZ(ip);
+ } else {
+ int zf;
+
+ zf = ZEXISTS | ZSHARED;
+ if (flags & FIGNORECASE)
+ zf |= ZCILOOK;
+
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+ if (error == 0) {
+ *zpp = zp;
+ zfs_dirent_unlock(dl);
+ dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ }
+ rpnp = NULL;
+ }
+
+ if ((flags & FIGNORECASE) && rpnp && !error)
+ (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
+
+ return (error);
+}
+
+/*
+ * unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+ ASSERT(zp->z_unlinked);
+ ASSERT(ZTOI(zp)->i_nlink == 0);
+
+ VERIFY3U(0, ==,
+ zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+ dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+static void
+zfs_unlinked_drain_task(void *arg)
+{
+ zfsvfs_t *zfsvfs = arg;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ int error;
+
+ ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
+
+ /*
+ * Iterate over the contents of the unlinked set.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+ zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel;
+ zap_cursor_advance(&zc)) {
+
+ /*
+ * See what kind of object we have in list
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these list entries for deletion,
+ * so we pull them back into core and set zp->z_unlinked.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for deletion.
+ * This could happen during the purge of an extended attribute
+ * directory. All we need to do is skip over them, since they
+ * are already in the system marked z_unlinked.
+ */
+ if (error != 0)
+ continue;
+
+ zp->z_unlinked = B_TRUE;
+
+ /*
+ * zrele() decrements the znode's ref count and may cause
+ * it to be synchronously freed. We interrupt freeing
+ * of this znode by checking the return value of
+ * dmu_objset_zfs_unmounting() in dmu_free_long_range()
+ * when an unmount is requested.
+ */
+ zrele(zp);
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+ }
+ zap_cursor_fini(&zc);
+
+ zfsvfs->z_draining = B_FALSE;
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+}
+
+/*
+ * Sets z_draining then tries to dispatch async unlinked drain.
+ * If that fails executes synchronous unlinked drain.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+ ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
+
+ zfsvfs->z_draining = B_TRUE;
+ zfsvfs->z_drain_cancel = B_FALSE;
+
+ zfsvfs->z_drain_task = taskq_dispatch(
+ dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
+ zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
+ if (zfsvfs->z_drain_task == TASKQID_INVALID) {
+ zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
+ zfs_unlinked_drain_task(zfsvfs);
+ }
+}
+
+/*
+ * Wait for the unlinked drain taskq task to stop. This will interrupt the
+ * unlinked set processing if it is in progress.
+ */
+void
+zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
+{
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+
+ if (zfsvfs->z_draining) {
+ zfsvfs->z_drain_cancel = B_TRUE;
+ taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+ zfsvfs->z_draining = B_FALSE;
+ }
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contains *only* regular
+ * files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zfs_dirlock_t dl;
+ int skipped = 0;
+ int error;
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ error = zfs_zget(zfsvfs,
+ ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+ if (error) {
+ skipped += 1;
+ continue;
+ }
+
+ ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
+ S_ISLNK(ZTOI(xzp)->i_mode));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ /* Is this really needed ? */
+ zfs_sa_upgrade_txholds(tx, xzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_zrele_async(xzp);
+ skipped += 1;
+ continue;
+ }
+ bzero(&dl, sizeof (dl));
+ dl.dl_dzp = dzp;
+ dl.dl_name = zap.za_name;
+
+ error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+ if (error)
+ skipped += 1;
+ dmu_tx_commit(tx);
+
+ zfs_zrele_async(xzp);
+ }
+ zap_cursor_fini(&zc);
+ if (error != ENOENT)
+ skipped += 1;
+ return (skipped);
+}
+
+void
+zfs_rmnode(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ objset_t *os = zfsvfs->z_os;
+ znode_t *xzp = NULL;
+ dmu_tx_t *tx;
+ uint64_t acl_obj;
+ uint64_t xattr_obj;
+ uint64_t links;
+ int error;
+
+ ASSERT(ZTOI(zp)->i_nlink == 0);
+ ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0);
+
+ /*
+ * If this is an attribute directory, purge its contents.
+ */
+ if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
+ if (zfs_purgedir(zp) != 0) {
+ /*
+ * Not enough space to delete some xattrs.
+ * Leave it in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+
+ return;
+ }
+ }
+
+ /*
+ * Free up all the data in the file. We don't do this for directories
+ * because we need truncate and remove to be in the same tx, like in
+ * zfs_znode_delete(). Otherwise, if we crash here we'll end up with
+ * an inconsistent truncated zap object in the delete queue. Note a
+ * truncated file is harmless since it only contains user data.
+ */
+ if (S_ISREG(ZTOI(zp)->i_mode)) {
+ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+ if (error) {
+ /*
+ * Not enough space or we were interrupted by unmount.
+ * Leave the file in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ return;
+ }
+ }
+
+ /*
+ * If the file has extended attributes, we're going to unlink
+ * the xattr dir.
+ */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT(error == 0);
+ }
+
+ acl_obj = zfs_external_acl(zp);
+
+ /*
+ * Set up the final transaction.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ if (xzp) {
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ }
+ if (acl_obj)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ /*
+ * Not enough space to delete the file. Leave it in the
+ * unlinked set, leaking it until the fs is remounted (at
+ * which point we'll call zfs_unlinked_drain() to process it).
+ */
+ dmu_tx_abort(tx);
+ zfs_znode_dmu_fini(zp);
+ goto out;
+ }
+
+ if (xzp) {
+ ASSERT(error == 0);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
+ clear_nlink(ZTOI(xzp)); /* no more links to it */
+ links = 0;
+ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &links, sizeof (links), tx));
+ mutex_exit(&xzp->z_lock);
+ zfs_unlinked_add(xzp, tx);
+ }
+
+ mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+ /*
+	 * Remove this znode from the unlinked set. If a rollback has
+	 * occurred while a file was open and unlinked, then when the file
+	 * is closed post rollback it will not exist in the rolled back
+	 * version of the unlinked object.
+ */
+ error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+ zp->z_id, tx);
+ VERIFY(error == 0 || error == ENOENT);
+
+ uint64_t count;
+ if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
+ cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
+ }
+
+ mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+ dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
+
+ zfs_znode_delete(zp, tx);
+
+ dmu_tx_commit(tx);
+out:
+ if (xzp)
+ zfs_zrele_async(xzp);
+}
+
+static uint64_t
+zfs_dirent(znode_t *zp, uint64_t mode)
+{
+ uint64_t de = zp->z_id;
+
+ if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE)
+ de |= IFTODT(mode) << 60;
+ return (de);
+}
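+
+/*
+ * For illustration (editor's sketch): the low 48 bits of the returned
+ * value carry the object number and, when z_version is at least
+ * ZPL_VERSION_DIRENT_TYPE, the top 4 bits carry the IFTODT() file type,
+ * so a consumer can recover both:
+ *
+ *    uint64_t de = zfs_dirent(zp, zp->z_mode);
+ *    uint64_t obj = ZFS_DIRENT_OBJ(de);     see zfs_match_find() above
+ *    uint64_t dtype = de >> 60;             DT_* value, 0 on old versions
+ */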
+
+/*
+ * Link zp into dl. Can fail in the following cases :
+ * - if zp has been unlinked.
+ * - if the number of entries with the same hash (aka. colliding entries)
+ * exceed the capacity of a leaf-block of fatzap and splitting of the
+ * leaf-block does not help.
+ */
+int
+zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t value;
+ int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ uint64_t links;
+ int count = 0;
+ int error;
+
+ mutex_enter(&zp->z_lock);
+
+ if (!(flag & ZRENAMING)) {
+ if (zp->z_unlinked) { /* no new links to unlinked zp */
+ ASSERT(!(flag & (ZNEW | ZEXISTS)));
+ mutex_exit(&zp->z_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ if (!(flag & ZNEW)) {
+ /*
+ * ZNEW nodes come from zfs_mknode() where the link
+ * count has already been initialised
+ */
+ inc_nlink(ZTOI(zp));
+ links = ZTOI(zp)->i_nlink;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &links, sizeof (links));
+ }
+ }
+
+ value = zfs_dirent(zp, zp->z_mode);
+ error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
+ &value, tx);
+
+ /*
+ * zap_add could fail to add the entry if it exceeds the capacity of the
+ * leaf-block and zap_leaf_split() failed to help.
+ * The caller of this routine is responsible for failing the transaction
+ * which will rollback the SA updates done above.
+ */
+ if (error != 0) {
+ if (!(flag & ZRENAMING) && !(flag & ZNEW))
+ drop_nlink(ZTOI(zp));
+ mutex_exit(&zp->z_lock);
+ return (error);
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &dzp->z_id, sizeof (dzp->z_id));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (!(flag & ZNEW)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
+ mutex_exit(&zp->z_lock);
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_size++;
+ if (zp_is_dir)
+ inc_nlink(ZTOI(dzp));
+ links = ZTOI(dzp)->i_nlink;
+ count = 0;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &links, sizeof (links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+ mutex_exit(&dzp->z_lock);
+
+ return (0);
+}
+
+/*
+ * The match type in the code for this function should conform to:
+ *
+ * ------------------------------------------------------------------------
+ * fs type | z_norm | lookup type | match type
+ * ---------|-------------|-------------|----------------------------------
+ * CS !norm | 0 | 0 | 0 (exact)
+ * CS norm | formX | 0 | MT_NORMALIZE
+ * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE
+ * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE
+ * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | ZCILOOK | MT_NORMALIZE
+ * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE
+ *
+ * Abbreviations:
+ * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
+ * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
+ * formX = unicode normalization form set on fs creation
+ */
+static int
+zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+ int flag)
+{
+ int error;
+
+ if (ZTOZSB(zp)->z_norm) {
+ matchtype_t mt = MT_NORMALIZE;
+
+ if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE &&
+ (flag & ZCIEXACT)) ||
+ (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED &&
+ !(flag & ZCILOOK))) {
+ mt |= MT_MATCH_CASE;
+ }
+
+ error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id,
+ dl->dl_name, mt, tx);
+ } else {
+ error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
+ tx);
+ }
+
+ return (error);
+}
+
+/*
+ * Unlink zp from dl, and mark zp for deletion if this was the last link. Can
+ * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
+ boolean_t *unlinkedp)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
+ boolean_t unlinked = B_FALSE;
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ uint64_t links;
+ int count = 0;
+ int error;
+
+ if (!(flag & ZRENAMING)) {
+ mutex_enter(&zp->z_lock);
+
+ if (zp_is_dir && !zfs_dirempty(zp)) {
+ mutex_exit(&zp->z_lock);
+ return (SET_ERROR(ENOTEMPTY));
+ }
+
+ /*
+ * If we get here, we are going to try to remove the object.
+ * First try removing the name from the directory; if that
+ * fails, return the error.
+ */
+ error = zfs_dropname(dl, zp, dzp, tx, flag);
+ if (error != 0) {
+ mutex_exit(&zp->z_lock);
+ return (error);
+ }
+
+ if (ZTOI(zp)->i_nlink <= zp_is_dir) {
+ zfs_panic_recover("zfs: link count on %lu is %u, "
+ "should be at least %u", zp->z_id,
+ (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
+ set_nlink(ZTOI(zp), zp_is_dir + 1);
+ }
+ drop_nlink(ZTOI(zp));
+ if (ZTOI(zp)->i_nlink == zp_is_dir) {
+ zp->z_unlinked = B_TRUE;
+ clear_nlink(ZTOI(zp));
+ unlinked = B_TRUE;
+ } else {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ links = ZTOI(zp)->i_nlink;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &links, sizeof (links));
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ count = 0;
+ ASSERT(error == 0);
+ mutex_exit(&zp->z_lock);
+ } else {
+ error = zfs_dropname(dl, zp, dzp, tx, flag);
+ if (error != 0)
+ return (error);
+ }
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_size--; /* one dirent removed */
+ if (zp_is_dir)
+ drop_nlink(ZTOI(dzp)); /* ".." link from zp */
+ links = ZTOI(dzp)->i_nlink;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &links, sizeof (links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+ mutex_exit(&dzp->z_lock);
+
+ if (unlinkedp != NULL)
+ *unlinkedp = unlinked;
+ else if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ return (0);
+}
+
+/*
+ * Indicate whether the directory is empty. Works with or without z_lock
+ * held, but can only be considered a hint in the latter case. Returns true
+ * if only "." and ".." remain and there's no work in progress.
+ *
+ * The internal ZAP size, rather than zp->z_size, needs to be checked since
+ * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ uint64_t count;
+ int error;
+
+ if (dzp->z_dirlocks != NULL)
+ return (B_FALSE);
+
+ error = zap_count(zfsvfs->z_os, dzp->z_id, &count);
+ if (error != 0 || count != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+#ifdef ZFS_DEBUG
+ uint64_t parent;
+#endif
+
+ *xzpp = NULL;
+
+ if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)))
+ return (error);
+
+ if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+ &acl_ids)) != 0)
+ return (error);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+#ifdef ZFS_DEBUG
+ error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ sizeof (xzp->z_id), tx));
+
+ if (!zp->z_unlinked)
+ (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+ xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+
+ *xzpp = xzp;
+
+ return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ * flags - flags from the VOP_LOOKUP call
+ *
+ * OUT: xipp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ znode_t *xzp;
+ zfs_dirlock_t *dl;
+ vattr_t va;
+ int error;
+top:
+ error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+ if (error)
+ return (error);
+
+ if (xzp != NULL) {
+ *xzpp = xzp;
+ zfs_dirent_unlock(dl);
+ return (0);
+ }
+
+ if (!(flags & CREATE_XATTR_DIR)) {
+ zfs_dirent_unlock(dl);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (zfs_is_readonly(zfsvfs)) {
+ zfs_dirent_unlock(dl);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * The ability to 'create' files in an attribute
+ * directory comes from the write_xattr permission on the base file.
+ *
+ * The ability to 'search' an attribute directory requires
+ * read_xattr permission on the base file.
+ *
+ * Once in a directory the ability to read/write attributes
+ * is controlled by the permissions on the attribute file.
+ */
+ va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID;
+ va.va_mode = S_IFDIR | S_ISVTX | 0777;
+ zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
+
+ va.va_dentry = NULL;
+ error = zfs_make_xattrdir(zp, &va, xzpp, cr);
+ zfs_dirent_unlock(dl);
+
+ if (error == ERESTART) {
+ /* NB: we already did dmu_tx_wait() if necessary */
+ goto top;
+ }
+
+ return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * you have write access to the entry,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+ uid_t uid;
+ uid_t downer;
+ uid_t fowner;
+ zfsvfs_t *zfsvfs = ZTOZSB(zdp);
+
+ if (zfsvfs->z_replay)
+ return (0);
+
+ if ((zdp->z_mode & S_ISVTX) == 0)
+ return (0);
+
+ downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid),
+ cr, ZFS_OWNER);
+ fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid),
+ cr, ZFS_OWNER);
+
+ if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+ zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)
+ return (0);
+ else
+ return (secpolicy_vnode_remove(cr));
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
new file mode 100644
index 000000000000..99c6ffc95940
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
@@ -0,0 +1,440 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_file.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <linux/falloc.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#ifdef HAVE_FDTABLE_HEADER
+#include <linux/fdtable.h>
+#endif
+
+/*
+ * Open file
+ *
+ * path - fully qualified path to file
+ * flags - open(2) flags, e.g. O_RDONLY / O_WRONLY / O_CREAT / O_EXCL
+ * fpp - pointer to return file pointer
+ *
+ * Returns 0 on success underlying error on failure.
+ */
+int
+zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
+{
+ struct file *filp;
+ int saved_umask;
+
+ if (!(flags & O_CREAT) && (flags & O_WRONLY))
+ flags |= O_EXCL;
+
+ if (flags & O_CREAT)
+ saved_umask = xchg(&current->fs->umask, 0);
+
+ filp = filp_open(path, flags, mode);
+
+ if (flags & O_CREAT)
+ (void) xchg(&current->fs->umask, saved_umask);
+
+ if (IS_ERR(filp))
+ return (-PTR_ERR(filp));
+
+ *fpp = filp;
+ return (0);
+}
+
+void
+zfs_file_close(zfs_file_t *fp)
+{
+ filp_close(fp, 0);
+}
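+
+/*
+ * Illustrative usage of the open/read/close interface (a sketch only;
+ * the path and buffer are hypothetical):
+ *
+ *    zfs_file_t *fp;
+ *    char buf[32];
+ *    ssize_t resid;
+ *
+ *    if (zfs_file_open("/etc/hostid", O_RDONLY, 0, &fp) == 0) {
+ *            (void) zfs_file_read(fp, buf, sizeof (buf), &resid);
+ *            zfs_file_close(fp);
+ *    }
+ */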
+
+static ssize_t
+zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *off)
+{
+#if defined(HAVE_KERNEL_WRITE_PPOS)
+ return (kernel_write(fp, buf, count, off));
+#else
+ mm_segment_t saved_fs;
+ ssize_t rc;
+
+ saved_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+	rc = vfs_write(fp, (__force const char __user *)buf, count, off);
+
+ set_fs(saved_fs);
+
+ return (rc);
+#endif
+}
+
+/*
+ * Stateful write - use os internal file pointer to determine where to
+ * write and update on successful completion.
+ *
+ * fp - pointer to file (pipe, socket, etc) to write to
+ * buf - buffer to write
+ * count - # of bytes to write
+ * resid - pointer to count of unwritten bytes (if short write)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
+{
+ loff_t off = fp->f_pos;
+ ssize_t rc;
+
+ rc = zfs_file_write_impl(fp, buf, count, &off);
+ if (rc < 0)
+ return (-rc);
+
+ fp->f_pos = off;
+
+ if (resid) {
+ *resid = count - rc;
+ } else if (rc != count) {
+ return (EIO);
+ }
+
+ return (0);
+}
+
+/*
+ * Stateless write - os internal file pointer is not updated.
+ *
+ * fp - pointer to file (pipe, socket, etc) to write to
+ * buf - buffer to write
+ * count - # of bytes to write
+ * off - file offset to write to (only valid for seekable types)
+ * resid - pointer to count of unwritten bytes
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
+ ssize_t *resid)
+{
+ ssize_t rc;
+
+ rc = zfs_file_write_impl(fp, buf, count, &off);
+ if (rc < 0)
+ return (-rc);
+
+ if (resid) {
+ *resid = count - rc;
+ } else if (rc != count) {
+ return (EIO);
+ }
+
+ return (0);
+}
+
+static ssize_t
+zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *off)
+{
+#if defined(HAVE_KERNEL_READ_PPOS)
+ return (kernel_read(fp, buf, count, off));
+#else
+ mm_segment_t saved_fs;
+ ssize_t rc;
+
+ saved_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+ rc = vfs_read(fp, (void __user *)buf, count, off);
+ set_fs(saved_fs);
+
+ return (rc);
+#endif
+}
+
+/*
+ * Stateful read - use os internal file pointer to determine where to
+ * read and update on successful completion.
+ *
+ * fp - pointer to file (pipe, socket, etc) to read from
+ * buf - buffer to read into
+ * count - # of bytes to read
+ * resid - pointer to count of unread bytes (if short read)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
+{
+ loff_t off = fp->f_pos;
+ ssize_t rc;
+
+ rc = zfs_file_read_impl(fp, buf, count, &off);
+ if (rc < 0)
+ return (-rc);
+
+ fp->f_pos = off;
+
+ if (resid) {
+ *resid = count - rc;
+ } else if (rc != count) {
+ return (EIO);
+ }
+
+ return (0);
+}
+
+/*
+ * Stateless read - os internal file pointer is not updated.
+ *
+ * fp - pointer to file (pipe, socket, etc) to read from
+ * buf - buffer to read into
+ * count - # of bytes to read
+ * off - file offset to read from (only valid for seekable types)
+ * resid - pointer to count of unread bytes (if short read)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
+ ssize_t *resid)
+{
+ ssize_t rc;
+
+ rc = zfs_file_read_impl(fp, buf, count, &off);
+ if (rc < 0)
+ return (-rc);
+
+ if (resid) {
+ *resid = count - rc;
+ } else if (rc != count) {
+ return (EIO);
+ }
+
+ return (0);
+}
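+
+/*
+ * Example of the resid convention used by the read/write helpers above
+ * (a sketch only): when a resid pointer is passed a short transfer is
+ * not an error, so the caller must check how much was left over.
+ *
+ *    error = zfs_file_pread(fp, buf, len, off, &resid);
+ *    if (error == 0 && resid != 0)
+ *            ... only (len - resid) bytes were read at 'off' ...
+ */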
+
+/*
+ * lseek - set / get file pointer
+ *
+ * fp - pointer to file (pipe, socket, etc) to read from
+ * offp - value to seek to, returns current value plus passed offset
+ * whence - see man pages for standard lseek whence values
+ *
+ * Returns 0 on success errno on failure (ESPIPE for non seekable types)
+ */
+int
+zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
+{
+ loff_t rc;
+
+ if (*offp < 0 || *offp > MAXOFFSET_T)
+ return (EINVAL);
+
+ rc = vfs_llseek(fp, *offp, whence);
+ if (rc < 0)
+ return (-rc);
+
+ *offp = rc;
+
+ return (0);
+}
+
+/*
+ * Get file attributes
+ *
+ * filp - file pointer
+ * zfattr - pointer to file attr structure
+ *
+ * Currently only used for fetching size and file mode.
+ *
+ * Returns 0 on success or error code of underlying getattr call on failure.
+ */
+int
+zfs_file_getattr(zfs_file_t *filp, zfs_file_attr_t *zfattr)
+{
+ struct kstat stat;
+ int rc;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&filp->f_path, &stat);
+#else
+ rc = vfs_getattr(filp->f_path.mnt, filp->f_dentry, &stat);
+#endif
+ if (rc)
+ return (-rc);
+
+ zfattr->zfa_size = stat.size;
+ zfattr->zfa_mode = stat.mode;
+
+ return (0);
+}
+
+/*
+ * Sync file to disk
+ *
+ * filp - file pointer
+ * flags - O_SYNC and or O_DSYNC
+ *
+ * Returns 0 on success or error code of underlying sync call on failure.
+ */
+int
+zfs_file_fsync(zfs_file_t *filp, int flags)
+{
+ int datasync = 0;
+ int error;
+ int fstrans;
+
+ if (flags & O_DSYNC)
+ datasync = 1;
+
+ /*
+ * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over vfs_fsync() and then reset.
+ */
+ fstrans = __spl_pf_fstrans_check();
+ if (fstrans)
+ current->flags &= ~(__SPL_PF_FSTRANS);
+
+ error = -vfs_fsync(filp, datasync);
+
+ if (fstrans)
+ current->flags |= __SPL_PF_FSTRANS;
+
+ return (error);
+}
+
+/*
+ * fallocate - allocate or free space on disk
+ *
+ * fp - file pointer
+ * mode (non-standard options for hole punching etc)
+ * offset - offset to start allocating or freeing from
+ * len - length to free / allocate
+ *
+ * OPTIONAL
+ */
+int
+zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len)
+{
+ /*
+ * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over the fallocate() call and
+	 * then reset.
+ */
+ int fstrans = __spl_pf_fstrans_check();
+ if (fstrans)
+ current->flags &= ~(__SPL_PF_FSTRANS);
+
+ /*
+ * When supported by the underlying file system preferentially
+ * use the fallocate() callback to preallocate the space.
+ */
+ int error = EOPNOTSUPP;
+ if (fp->f_op->fallocate)
+ error = fp->f_op->fallocate(fp, mode, offset, len);
+
+ if (fstrans)
+ current->flags |= __SPL_PF_FSTRANS;
+
+ return (error);
+}
+
+/*
+ * Request current file pointer offset
+ *
+ * fp - pointer to file
+ *
+ * Returns current file offset.
+ */
+loff_t
+zfs_file_off(zfs_file_t *fp)
+{
+ return (fp->f_pos);
+}
+
+/*
+ * Request file pointer private data
+ *
+ * fp - pointer to file
+ *
+ * Returns pointer to file private data.
+ */
+void *
+zfs_file_private(zfs_file_t *fp)
+{
+ return (fp->private_data);
+}
+
+/*
+ * unlink file
+ *
+ * path - fully qualified file path
+ *
+ * Returns 0 on success.
+ *
+ * OPTIONAL
+ */
+int
+zfs_file_unlink(const char *path)
+{
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Get reference to file pointer
+ *
+ * fd - input file descriptor
+ * fpp - pointer to file pointer
+ *
+ * Returns 0 on success EBADF on failure.
+ */
+int
+zfs_file_get(int fd, zfs_file_t **fpp)
+{
+ zfs_file_t *fp;
+
+ fp = fget(fd);
+ if (fp == NULL)
+ return (EBADF);
+
+ *fpp = fp;
+
+ return (0);
+}
+
+/*
+ * Drop reference to file pointer
+ *
+ * fd - input file descriptor
+ */
+void
+zfs_file_put(int fd)
+{
+ struct file *fp;
+
+ if ((fp = fget(fd)) != NULL) {
+ fput(fp);
+ fput(fp);
+ }
+}
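+
+/*
+ * Illustrative pairing of zfs_file_get() and zfs_file_put() (a sketch
+ * only; 'fd' is a user supplied file descriptor, e.g. from an ioctl):
+ *
+ *    zfs_file_t *fp;
+ *
+ *    if (zfs_file_get(fd, &fp) == 0) {
+ *            ... use fp ...
+ *            zfs_file_put(fd);
+ *    }
+ */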
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
new file mode 100644
index 000000000000..b88e0497d000
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
@@ -0,0 +1,329 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 Martin Matuska
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Portions Copyright 2012 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright 2017 RackTop Systems.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2019 Datto Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/nvpair.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/fm/util.h>
+#include <sys/dsl_crypt.h>
+
+#include <sys/zfs_ioctl_impl.h>
+
+#include <sys/zfs_sysfs.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+
+boolean_t
+zfs_vfs_held(zfsvfs_t *zfsvfs)
+{
+ return (zfsvfs->z_sb != NULL);
+}
+
+int
+zfs_vfs_ref(zfsvfs_t **zfvp)
+{
+ if (*zfvp == NULL || (*zfvp)->z_sb == NULL ||
+ !atomic_inc_not_zero(&((*zfvp)->z_sb->s_active))) {
+ return (SET_ERROR(ESRCH));
+ }
+ return (0);
+}
+
+void
+zfs_vfs_rele(zfsvfs_t *zfsvfs)
+{
+ deactivate_super(zfsvfs->z_sb);
+}
+
+static int
+zfsdev_state_init(struct file *filp)
+{
+ zfsdev_state_t *zs, *zsprev = NULL;
+ minor_t minor;
+ boolean_t newzs = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ minor = zfsdev_minor_alloc();
+ if (minor == 0)
+ return (SET_ERROR(ENXIO));
+
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ if (zs->zs_minor == -1)
+ break;
+ zsprev = zs;
+ }
+
+ if (!zs) {
+ zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
+ newzs = B_TRUE;
+ }
+
+ filp->private_data = zs;
+
+ zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit);
+ zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent);
+
+ /*
+ * In order to provide for lock-free concurrent read access
+ * to the minor list in zfsdev_get_state_impl(), new entries
+ * must be completely written before linking them into the
+ * list whereas existing entries are already linked; the last
+ * operation must be updating zs_minor (from -1 to the new
+ * value).
+ */
+ if (newzs) {
+ zs->zs_minor = minor;
+ smp_wmb();
+ zsprev->zs_next = zs;
+ } else {
+ smp_wmb();
+ zs->zs_minor = minor;
+ }
+
+ return (0);
+}
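+
+/*
+ * Reader side of the publication protocol described above (an editor's
+ * sketch; the real lookup lives in zfsdev_get_state_impl()): because
+ * zs_minor is written last, a lock-free walker can safely skip entries
+ * that are still being constructed.
+ *
+ *    for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ *            if (zs->zs_minor == minor)
+ *                    return (zs);
+ *    }
+ */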
+
+static int
+zfsdev_state_destroy(struct file *filp)
+{
+ zfsdev_state_t *zs;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ ASSERT(filp->private_data != NULL);
+
+ zs = filp->private_data;
+ zs->zs_minor = -1;
+ zfs_onexit_destroy(zs->zs_onexit);
+ zfs_zevent_destroy(zs->zs_zevent);
+ zs->zs_onexit = NULL;
+ zs->zs_zevent = NULL;
+
+ return (0);
+}
+
+static int
+zfsdev_open(struct inode *ino, struct file *filp)
+{
+ int error;
+
+ mutex_enter(&zfsdev_state_lock);
+ error = zfsdev_state_init(filp);
+ mutex_exit(&zfsdev_state_lock);
+
+ return (-error);
+}
+
+static int
+zfsdev_release(struct inode *ino, struct file *filp)
+{
+ int error;
+
+ mutex_enter(&zfsdev_state_lock);
+ error = zfsdev_state_destroy(filp);
+ mutex_exit(&zfsdev_state_lock);
+
+ return (-error);
+}
+
+static long
+zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
+{
+ uint_t vecnum;
+ zfs_cmd_t *zc;
+ int error, rc;
+
+ vecnum = cmd - ZFS_IOC_FIRST;
+
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+
+ if (ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t), 0)) {
+ error = -SET_ERROR(EFAULT);
+ goto out;
+ }
+ error = -zfsdev_ioctl_common(vecnum, zc, 0);
+ rc = ddi_copyout(zc, (void *)(uintptr_t)arg, sizeof (zfs_cmd_t), 0);
+ if (error == 0 && rc != 0)
+ error = -SET_ERROR(EFAULT);
+out:
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ return (error);
+
+}
+
+uint64_t
+zfs_max_nvlist_src_size_os(void)
+{
+ if (zfs_max_nvlist_src_size != 0)
+ return (zfs_max_nvlist_src_size);
+
+ return (KMALLOC_MAX_SIZE);
+}
+
+void
+zfs_ioctl_init_os(void)
+{
+}
+
+#ifdef CONFIG_COMPAT
+static long
+zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
+{
+ return (zfsdev_ioctl(filp, cmd, arg));
+}
+#else
+#define zfsdev_compat_ioctl NULL
+#endif
+
+static const struct file_operations zfsdev_fops = {
+ .open = zfsdev_open,
+ .release = zfsdev_release,
+ .unlocked_ioctl = zfsdev_ioctl,
+ .compat_ioctl = zfsdev_compat_ioctl,
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice zfs_misc = {
+ .minor = ZFS_DEVICE_MINOR,
+ .name = ZFS_DRIVER,
+ .fops = &zfsdev_fops,
+};
+
+MODULE_ALIAS_MISCDEV(ZFS_DEVICE_MINOR);
+MODULE_ALIAS("devname:zfs");
+
+int
+zfsdev_attach(void)
+{
+ int error;
+
+ error = misc_register(&zfs_misc);
+ if (error == -EBUSY) {
+ /*
+ * Fallback to dynamic minor allocation in the event of a
+ * collision with a reserved minor in linux/miscdevice.h.
+ * In this case the kernel modules must be manually loaded.
+ */
+ printk(KERN_INFO "ZFS: misc_register() with static minor %d "
+ "failed %d, retrying with MISC_DYNAMIC_MINOR\n",
+ ZFS_DEVICE_MINOR, error);
+
+ zfs_misc.minor = MISC_DYNAMIC_MINOR;
+ error = misc_register(&zfs_misc);
+ }
+
+ if (error)
+ printk(KERN_INFO "ZFS: misc_register() failed %d\n", error);
+
+ return (error);
+}
+
+void
+zfsdev_detach(void)
+{
+ misc_deregister(&zfs_misc);
+}
+
+#ifdef ZFS_DEBUG
+#define ZFS_DEBUG_STR " (DEBUG mode)"
+#else
+#define ZFS_DEBUG_STR ""
+#endif
+
+static int __init
+_init(void)
+{
+ int error;
+
+ if ((error = zfs_kmod_init()) != 0) {
+ printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s"
+ ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE,
+ ZFS_DEBUG_STR, error);
+
+ return (-error);
+ }
+
+ zfs_sysfs_init();
+
+ printk(KERN_NOTICE "ZFS: Loaded module v%s-%s%s, "
+ "ZFS pool version %s, ZFS filesystem version %s\n",
+ ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR,
+ SPA_VERSION_STRING, ZPL_VERSION_STRING);
+#ifndef CONFIG_FS_POSIX_ACL
+ printk(KERN_NOTICE "ZFS: Posix ACLs disabled by kernel\n");
+#endif /* CONFIG_FS_POSIX_ACL */
+
+ return (0);
+}
+
+static void __exit
+_fini(void)
+{
+ zfs_sysfs_fini();
+ zfs_kmod_fini();
+
+ printk(KERN_NOTICE "ZFS: Unloaded module v%s-%s%s\n",
+ ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR);
+}
+
+#if defined(_KERNEL)
+module_init(_init);
+module_exit(_fini);
+#endif
+
+ZFS_MODULE_DESCRIPTION("ZFS");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
new file mode 100644
index 000000000000..fb7c68987360
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
@@ -0,0 +1,662 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_sysfs.h>
+#include <sys/kmem.h>
+#include <sys/fs/zfs.h>
+#include <linux/kobject.h>
+
+#include "zfs_prop.h"
+
+#if !defined(_KERNEL)
+#error kernel builds only
+#endif
+
+/*
+ * ZFS Module sysfs support
+ *
+ * This extends our sysfs '/sys/module/zfs' entry to include feature
+ * and property attributes. The primary consumer of this information
+ * is user processes, like the zfs CLI, that need to know what the
+ * currently loaded ZFS module supports. The libzfs binary will consult
+ * this information when instantiating the zfs|zpool property tables
+ * and the pool features table.
+ *
+ * The added top-level directories are:
+ * /sys/module/zfs
+ * ├── features.kernel
+ * ├── features.pool
+ * ├── properties.dataset
+ * └── properties.pool
+ *
+ * The local interface for the zfs kobjects includes:
+ * zfs_kobj_init()
+ * zfs_kobj_add()
+ * zfs_kobj_release()
+ * zfs_kobj_add_attr()
+ * zfs_kobj_fini()
+ */
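+
+/*
+ * Illustrative sketch (not part of this change): the typical lifecycle of
+ * the local kobject helpers listed above.  The "example" node and its
+ * "supported" attribute are hypothetical; see kernel_feature_to_kobj()
+ * further down for a real caller.  zfs_mod_kobj_t and the helpers are
+ * defined below in this file.
+ */
+#if 0
+static zfs_mod_kobj_t example_kobj;
+
+static void
+example_kobj_sketch(struct kobject *parent, sysfs_show_func show)
+{
+	/* one attribute slot, no children, and the show callback */
+	if (zfs_kobj_init(&example_kobj, 1, 0, show) != 0)
+		return;
+
+	/* name the attribute, then publish the node under the parent */
+	zfs_kobj_add_attr(&example_kobj, 0, "supported");
+	if (zfs_kobj_add(&example_kobj, parent, "example") != 0)
+		zfs_kobj_release(&example_kobj.zko_kobj);
+
+	/* later, zfs_kobj_fini(&example_kobj) removes and releases it */
+}
+#endif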
+
+/*
+ * A zfs_mod_kobj_t represents a zfs kobject under '/sys/module/zfs'
+ */
+struct zfs_mod_kobj;
+typedef struct zfs_mod_kobj zfs_mod_kobj_t;
+
+struct zfs_mod_kobj {
+ struct kobject zko_kobj;
+ struct kobj_type zko_kobj_type;
+ struct sysfs_ops zko_sysfs_ops;
+ size_t zko_attr_count;
+ struct attribute *zko_attr_list; /* allocated */
+ struct attribute **zko_default_attrs; /* allocated */
+ size_t zko_child_count;
+ zfs_mod_kobj_t *zko_children; /* allocated */
+};
+
+#define ATTR_TABLE_SIZE(cnt) (sizeof (struct attribute) * (cnt))
+/* Note +1 for NULL terminator slot */
+#define DEFAULT_ATTR_SIZE(cnt) (sizeof (struct attribute *) * (cnt + 1))
+#define CHILD_TABLE_SIZE(cnt) (sizeof (zfs_mod_kobj_t) * (cnt))
+
+/*
+ * These are the top-level kobjects under '/sys/module/zfs/'
+ */
+static zfs_mod_kobj_t kernel_features_kobj;
+static zfs_mod_kobj_t pool_features_kobj;
+static zfs_mod_kobj_t dataset_props_kobj;
+static zfs_mod_kobj_t pool_props_kobj;
+
+/*
+ * The show function is used to provide the content
+ * of an attribute into a PAGE_SIZE buffer.
+ */
+typedef ssize_t (*sysfs_show_func)(struct kobject *, struct attribute *,
+ char *);
+
+static void
+zfs_kobj_fini(zfs_mod_kobj_t *zkobj)
+{
+ /* finalize any child kobjects */
+ if (zkobj->zko_child_count != 0) {
+ ASSERT(zkobj->zko_children);
+ for (int i = 0; i < zkobj->zko_child_count; i++)
+ zfs_kobj_fini(&zkobj->zko_children[i]);
+ }
+
+ /* kobject_put() will call zfs_kobj_release() to release memory */
+ kobject_del(&zkobj->zko_kobj);
+ kobject_put(&zkobj->zko_kobj);
+}
+
+static void
+zfs_kobj_release(struct kobject *kobj)
+{
+ zfs_mod_kobj_t *zkobj = container_of(kobj, zfs_mod_kobj_t, zko_kobj);
+
+ if (zkobj->zko_attr_list != NULL) {
+ ASSERT3S(zkobj->zko_attr_count, !=, 0);
+ kmem_free(zkobj->zko_attr_list,
+ ATTR_TABLE_SIZE(zkobj->zko_attr_count));
+ zkobj->zko_attr_list = NULL;
+ }
+
+ if (zkobj->zko_default_attrs != NULL) {
+ kmem_free(zkobj->zko_default_attrs,
+ DEFAULT_ATTR_SIZE(zkobj->zko_attr_count));
+ zkobj->zko_default_attrs = NULL;
+ }
+
+ if (zkobj->zko_child_count != 0) {
+ ASSERT(zkobj->zko_children);
+
+ kmem_free(zkobj->zko_children,
+ CHILD_TABLE_SIZE(zkobj->zko_child_count));
+ zkobj->zko_child_count = 0;
+ zkobj->zko_children = NULL;
+ }
+
+ zkobj->zko_attr_count = 0;
+}
+
+#ifndef sysfs_attr_init
+#define sysfs_attr_init(attr) do {} while (0)
+#endif
+
+static void
+zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name)
+{
+ VERIFY3U(attr_num, <, zkobj->zko_attr_count);
+ ASSERT(zkobj->zko_attr_list);
+ ASSERT(zkobj->zko_default_attrs);
+
+ zkobj->zko_attr_list[attr_num].name = attr_name;
+ zkobj->zko_attr_list[attr_num].mode = 0444;
+ zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num];
+ sysfs_attr_init(&zkobj->zko_attr_list[attr_num]);
+}
+
+static int
+zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt,
+ sysfs_show_func show_func)
+{
+ /*
+ * Initialize object's attributes. Count can be zero.
+ */
+ if (attr_cnt > 0) {
+ zkobj->zko_attr_list = kmem_zalloc(ATTR_TABLE_SIZE(attr_cnt),
+ KM_SLEEP);
+ if (zkobj->zko_attr_list == NULL)
+ return (ENOMEM);
+ }
+ /* this will always have at least one slot for NULL termination */
+ zkobj->zko_default_attrs = kmem_zalloc(DEFAULT_ATTR_SIZE(attr_cnt),
+ KM_SLEEP);
+ if (zkobj->zko_default_attrs == NULL) {
+ if (zkobj->zko_attr_list != NULL) {
+ kmem_free(zkobj->zko_attr_list,
+ ATTR_TABLE_SIZE(attr_cnt));
+ }
+ return (ENOMEM);
+ }
+ zkobj->zko_attr_count = attr_cnt;
+ zkobj->zko_kobj_type.default_attrs = zkobj->zko_default_attrs;
+
+ if (child_cnt > 0) {
+ zkobj->zko_children = kmem_zalloc(CHILD_TABLE_SIZE(child_cnt),
+ KM_SLEEP);
+ if (zkobj->zko_children == NULL) {
+ if (zkobj->zko_default_attrs != NULL) {
+ kmem_free(zkobj->zko_default_attrs,
+ DEFAULT_ATTR_SIZE(attr_cnt));
+ }
+ if (zkobj->zko_attr_list != NULL) {
+ kmem_free(zkobj->zko_attr_list,
+ ATTR_TABLE_SIZE(attr_cnt));
+ }
+ return (ENOMEM);
+ }
+ zkobj->zko_child_count = child_cnt;
+ }
+
+ zkobj->zko_sysfs_ops.show = show_func;
+ zkobj->zko_kobj_type.sysfs_ops = &zkobj->zko_sysfs_ops;
+ zkobj->zko_kobj_type.release = zfs_kobj_release;
+
+ return (0);
+}
+
+static int
+zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name)
+{
+ /* zko_default_attrs must be NULL terminated */
+ ASSERT(zkobj->zko_default_attrs != NULL);
+ ASSERT(zkobj->zko_default_attrs[zkobj->zko_attr_count] == NULL);
+
+ kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type);
+ return (kobject_add(&zkobj->zko_kobj, parent, name));
+}
+
+/*
+ * Each zfs property has these common attributes
+ */
+static const char *zprop_attrs[] = {
+ "type",
+ "readonly",
+ "setonce",
+ "visible",
+ "values",
+ "default",
+ "datasets" /* zfs properties only */
+};
+
+#define ZFS_PROP_ATTR_COUNT ARRAY_SIZE(zprop_attrs)
+#define ZPOOL_PROP_ATTR_COUNT (ZFS_PROP_ATTR_COUNT - 1)
+
+static const char *zprop_types[] = {
+ "number",
+ "string",
+ "index",
+};
+
+typedef struct zfs_type_map {
+ zfs_type_t ztm_type;
+ const char *ztm_name;
+} zfs_type_map_t;
+
+static zfs_type_map_t type_map[] = {
+ {ZFS_TYPE_FILESYSTEM, "filesystem"},
+ {ZFS_TYPE_SNAPSHOT, "snapshot"},
+ {ZFS_TYPE_VOLUME, "volume"},
+ {ZFS_TYPE_BOOKMARK, "bookmark"}
+};
+
+/*
+ * Show the content for a zfs property attribute
+ */
+static ssize_t
+zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property,
+ char *buf, size_t buflen)
+{
+ const char *show_str;
+ char number[32];
+
+ /* For dataset properties list the dataset types that apply */
+ if (strcmp(attr_name, "datasets") == 0 &&
+ property->pd_types != ZFS_TYPE_POOL) {
+ int len = 0;
+
+ for (int i = 0; i < ARRAY_SIZE(type_map); i++) {
+ if (type_map[i].ztm_type & property->pd_types) {
+ len += snprintf(buf + len, buflen - len, "%s ",
+ type_map[i].ztm_name);
+ }
+ }
+ len += snprintf(buf + len, buflen - len, "\n");
+ return (len);
+ }
+
+ if (strcmp(attr_name, "type") == 0) {
+ show_str = zprop_types[property->pd_proptype];
+ } else if (strcmp(attr_name, "readonly") == 0) {
+ show_str = property->pd_attr == PROP_READONLY ? "1" : "0";
+ } else if (strcmp(attr_name, "setonce") == 0) {
+ show_str = property->pd_attr == PROP_ONETIME ? "1" : "0";
+ } else if (strcmp(attr_name, "visible") == 0) {
+ show_str = property->pd_visible ? "1" : "0";
+ } else if (strcmp(attr_name, "values") == 0) {
+ show_str = property->pd_values ? property->pd_values : "";
+ } else if (strcmp(attr_name, "default") == 0) {
+ switch (property->pd_proptype) {
+ case PROP_TYPE_NUMBER:
+ (void) snprintf(number, sizeof (number), "%llu",
+ (u_longlong_t)property->pd_numdefault);
+ show_str = number;
+ break;
+ case PROP_TYPE_STRING:
+ show_str = property->pd_strdefault ?
+ property->pd_strdefault : "";
+ break;
+ case PROP_TYPE_INDEX:
+ if (zprop_index_to_string(property->pd_propnum,
+ property->pd_numdefault, &show_str,
+ property->pd_types) != 0) {
+ show_str = "";
+ }
+ break;
+ default:
+ return (0);
+ }
+ } else {
+ return (0);
+ }
+
+ return (snprintf(buf, buflen, "%s\n", show_str));
+}
+
+static ssize_t
+dataset_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ zfs_prop_t prop = zfs_name_to_prop(kobject_name(kobj));
+ zprop_desc_t *prop_tbl = zfs_prop_get_table();
+ ssize_t len;
+
+ ASSERT3U(prop, <, ZFS_NUM_PROPS);
+
+ len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE);
+
+ return (len);
+}
+
+static ssize_t
+pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ zpool_prop_t prop = zpool_name_to_prop(kobject_name(kobj));
+ zprop_desc_t *prop_tbl = zpool_prop_get_table();
+ ssize_t len;
+
+ ASSERT3U(prop, <, ZPOOL_NUM_PROPS);
+
+ len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE);
+
+ return (len);
+}
+
+/*
+ * ZFS kernel feature attributes for '/sys/module/zfs/features.kernel'
+ *
+ * This list is intended for kernel features that don't have a pool feature
+ * association or that extend existing user/kernel interfaces.
+ *
+ * A user process can easily check if the running zfs kernel module
+ * supports the new feature.
+ */
+static const char *zfs_kernel_features[] = {
+ /* --> Add new kernel features here */
+ "com.delphix:vdev_initialize",
+ "org.zfsonlinux:vdev_trim",
+ "org.openzfs:l2arc_persistent",
+};
+
+#define KERNEL_FEATURE_COUNT ARRAY_SIZE(zfs_kernel_features)
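+
+/*
+ * Illustrative sketch (userspace, not part of this module): a process can
+ * test for a kernel feature by reading the feature's "supported" attribute,
+ * which kernel_feature_show() below answers with "yes".  The path layout
+ * follows the directories documented above; the helper name and buffer
+ * sizes are assumptions for the example only.
+ */
+#if 0
+#include <stdio.h>
+#include <string.h>
+
+static int
+zfs_kernel_feature_supported(const char *feature)
+{
+	char path[256], buf[8] = "";
+	FILE *fp;
+
+	(void) snprintf(path, sizeof (path),
+	    "/sys/module/zfs/features.kernel/%s/supported", feature);
+	if ((fp = fopen(path, "r")) == NULL)
+		return (0);
+	(void) fgets(buf, sizeof (buf), fp);
+	(void) fclose(fp);
+	return (strncmp(buf, "yes", 3) == 0);
+}
+#endif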
+
+static ssize_t
+kernel_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ if (strcmp(attr->name, "supported") == 0)
+ return (snprintf(buf, PAGE_SIZE, "yes\n"));
+ return (0);
+}
+
+static void
+kernel_feature_to_kobj(zfs_mod_kobj_t *parent, int slot, const char *name)
+{
+ zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[slot];
+
+ ASSERT3U(slot, <, KERNEL_FEATURE_COUNT);
+ ASSERT(name);
+
+ int err = zfs_kobj_init(zfs_kobj, 1, 0, kernel_feature_show);
+ if (err)
+ return;
+
+ zfs_kobj_add_attr(zfs_kobj, 0, "supported");
+
+ err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+ if (err)
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+}
+
+static int
+zfs_kernel_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
+{
+ /*
+ * Create a parent kobject to host kernel features.
+ *
+ * '/sys/module/zfs/features.kernel'
+ */
+ int err = zfs_kobj_init(zfs_kobj, 0, KERNEL_FEATURE_COUNT,
+ kernel_feature_show);
+ if (err)
+ return (err);
+ err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_KERNEL_FEATURES);
+ if (err) {
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+ return (err);
+ }
+
+ /*
+ * Now create a kobject for each feature.
+ *
+ * '/sys/module/zfs/features.kernel/<feature>'
+ */
+ for (int f = 0; f < KERNEL_FEATURE_COUNT; f++)
+ kernel_feature_to_kobj(zfs_kobj, f, zfs_kernel_features[f]);
+
+ return (0);
+}
+
+/*
+ * Each pool feature has these common attributes
+ */
+static const char *pool_feature_attrs[] = {
+ "description",
+ "guid",
+ "uname",
+ "readonly_compatible",
+ "required_for_mos",
+ "activate_on_enable",
+ "per_dataset"
+};
+
+#define ZPOOL_FEATURE_ATTR_COUNT ARRAY_SIZE(pool_feature_attrs)
+
+/*
+ * Show the content for the given zfs pool feature attribute
+ */
+static ssize_t
+pool_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ spa_feature_t fid;
+
+ if (zfeature_lookup_guid(kobject_name(kobj), &fid) != 0)
+ return (0);
+
+ ASSERT3U(fid, <, SPA_FEATURES);
+
+ zfeature_flags_t flags = spa_feature_table[fid].fi_flags;
+ const char *show_str = NULL;
+
+ if (strcmp(attr->name, "description") == 0) {
+ show_str = spa_feature_table[fid].fi_desc;
+ } else if (strcmp(attr->name, "guid") == 0) {
+ show_str = spa_feature_table[fid].fi_guid;
+ } else if (strcmp(attr->name, "uname") == 0) {
+ show_str = spa_feature_table[fid].fi_uname;
+ } else if (strcmp(attr->name, "readonly_compatible") == 0) {
+ show_str = flags & ZFEATURE_FLAG_READONLY_COMPAT ? "1" : "0";
+ } else if (strcmp(attr->name, "required_for_mos") == 0) {
+ show_str = flags & ZFEATURE_FLAG_MOS ? "1" : "0";
+ } else if (strcmp(attr->name, "activate_on_enable") == 0) {
+ show_str = flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE ? "1" : "0";
+ } else if (strcmp(attr->name, "per_dataset") == 0) {
+ show_str = flags & ZFEATURE_FLAG_PER_DATASET ? "1" : "0";
+ }
+ if (show_str == NULL)
+ return (0);
+
+ return (snprintf(buf, PAGE_SIZE, "%s\n", show_str));
+}
+
+static void
+pool_feature_to_kobj(zfs_mod_kobj_t *parent, spa_feature_t fid,
+ const char *name)
+{
+ zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[fid];
+
+ ASSERT3U(fid, <, SPA_FEATURES);
+ ASSERT(name);
+
+ int err = zfs_kobj_init(zfs_kobj, ZPOOL_FEATURE_ATTR_COUNT, 0,
+ pool_feature_show);
+ if (err)
+ return;
+
+ for (int i = 0; i < ZPOOL_FEATURE_ATTR_COUNT; i++)
+ zfs_kobj_add_attr(zfs_kobj, i, pool_feature_attrs[i]);
+
+ err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+ if (err)
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+}
+
+static int
+zfs_pool_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
+{
+ /*
+ * Create a parent kobject to host pool features.
+ *
+ * '/sys/module/zfs/features.pool'
+ */
+ int err = zfs_kobj_init(zfs_kobj, 0, SPA_FEATURES, pool_feature_show);
+ if (err)
+ return (err);
+ err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_POOL_FEATURES);
+ if (err) {
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+ return (err);
+ }
+
+ /*
+ * Now create a kobject for each feature.
+ *
+ * '/sys/module/zfs/features.pool/<feature>'
+ */
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++)
+ pool_feature_to_kobj(zfs_kobj, i, spa_feature_table[i].fi_guid);
+
+ return (0);
+}
+
+typedef struct prop_to_kobj_arg {
+ zprop_desc_t *p2k_table;
+ zfs_mod_kobj_t *p2k_parent;
+ sysfs_show_func p2k_show_func;
+ int p2k_attr_count;
+} prop_to_kobj_arg_t;
+
+static int
+zprop_to_kobj(int prop, void *args)
+{
+ prop_to_kobj_arg_t *data = args;
+ zfs_mod_kobj_t *parent = data->p2k_parent;
+ zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[prop];
+ const char *name = data->p2k_table[prop].pd_name;
+ int err;
+
+ ASSERT(name);
+
+ err = zfs_kobj_init(zfs_kobj, data->p2k_attr_count, 0,
+ data->p2k_show_func);
+ if (err)
+ return (ZPROP_CONT);
+
+ for (int i = 0; i < data->p2k_attr_count; i++)
+ zfs_kobj_add_attr(zfs_kobj, i, zprop_attrs[i]);
+
+ err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+ if (err)
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+
+ return (ZPROP_CONT);
+}
+
+static int
+zfs_sysfs_properties_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent,
+ zfs_type_t type)
+{
+ prop_to_kobj_arg_t context;
+ const char *name;
+ int err;
+
+ /*
+ * Create a parent kobject to host properties.
+ *
+ * '/sys/module/zfs/properties.<type>'
+ */
+ if (type == ZFS_TYPE_POOL) {
+ name = ZFS_SYSFS_POOL_PROPERTIES;
+ context.p2k_table = zpool_prop_get_table();
+ context.p2k_attr_count = ZPOOL_PROP_ATTR_COUNT;
+ context.p2k_parent = zfs_kobj;
+ context.p2k_show_func = pool_property_show;
+ err = zfs_kobj_init(zfs_kobj, 0, ZPOOL_NUM_PROPS,
+ pool_property_show);
+ } else {
+ name = ZFS_SYSFS_DATASET_PROPERTIES;
+ context.p2k_table = zfs_prop_get_table();
+ context.p2k_attr_count = ZFS_PROP_ATTR_COUNT;
+ context.p2k_parent = zfs_kobj;
+ context.p2k_show_func = dataset_property_show;
+ err = zfs_kobj_init(zfs_kobj, 0, ZFS_NUM_PROPS,
+ dataset_property_show);
+ }
+
+ if (err)
+ return (err);
+
+ err = zfs_kobj_add(zfs_kobj, parent, name);
+ if (err) {
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+ return (err);
+ }
+
+ /*
+ * Create a kobject for each property.
+ *
+ * '/sys/module/zfs/properties.<type>/<property>'
+ */
+ (void) zprop_iter_common(zprop_to_kobj, &context, B_TRUE,
+ B_FALSE, type);
+
+ return (err);
+}
+
+void
+zfs_sysfs_init(void)
+{
+ struct kobject *parent;
+#if defined(CONFIG_ZFS) && !defined(CONFIG_ZFS_MODULE)
+ parent = kobject_create_and_add("zfs", fs_kobj);
+#else
+ parent = &(((struct module *)(THIS_MODULE))->mkobj).kobj;
+#endif
+ int err;
+
+ if (parent == NULL)
+ return;
+
+ err = zfs_kernel_features_init(&kernel_features_kobj, parent);
+ if (err)
+ return;
+
+ err = zfs_pool_features_init(&pool_features_kobj, parent);
+ if (err) {
+ zfs_kobj_fini(&kernel_features_kobj);
+ return;
+ }
+
+ err = zfs_sysfs_properties_init(&pool_props_kobj, parent,
+ ZFS_TYPE_POOL);
+ if (err) {
+ zfs_kobj_fini(&kernel_features_kobj);
+ zfs_kobj_fini(&pool_features_kobj);
+ return;
+ }
+
+ err = zfs_sysfs_properties_init(&dataset_props_kobj, parent,
+ ZFS_TYPE_FILESYSTEM);
+ if (err) {
+ zfs_kobj_fini(&kernel_features_kobj);
+ zfs_kobj_fini(&pool_features_kobj);
+ zfs_kobj_fini(&pool_props_kobj);
+ return;
+ }
+}
+
+void
+zfs_sysfs_fini(void)
+{
+ /*
+	 * Remove top-level kobjects; each will remove any child kobjects
+ */
+ zfs_kobj_fini(&kernel_features_kobj);
+ zfs_kobj_fini(&pool_features_kobj);
+ zfs_kobj_fini(&dataset_props_kobj);
+ zfs_kobj_fini(&pool_props_kobj);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
new file mode 100644
index 000000000000..3b0f824115f8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
@@ -0,0 +1,333 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+/*
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+#ifdef _KERNEL
+
+#include <sys/types.h>
+#include <sys/uio_impl.h>
+#include <sys/sysmacros.h>
+#include <sys/strings.h>
+#include <linux/kmap_compat.h>
+#include <linux/uaccess.h>
+
+/*
+ * Move "n" bytes at byte address "p"; "rw" indicates the direction
+ * of the move, and the I/O parameters are provided in "uio", which is
+ * updated to reflect the data that was moved. Returns 0 on success or
+ * a non-zero errno on failure.
+ */
+static int
+zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ const struct iovec *iov = uio->uio_iov;
+ size_t skip = uio->uio_skip;
+ ulong_t cnt;
+
+ while (n && uio->uio_resid) {
+ cnt = MIN(iov->iov_len - skip, n);
+ switch (uio->uio_segflg) {
+ case UIO_USERSPACE:
+ /*
+ * p = kernel data pointer
+ * iov->iov_base = user data pointer
+ */
+ if (rw == UIO_READ) {
+ if (copy_to_user(iov->iov_base+skip, p, cnt))
+ return (EFAULT);
+ } else {
+ unsigned long b_left = 0;
+ if (uio->uio_fault_disable) {
+ if (!zfs_access_ok(VERIFY_READ,
+ (iov->iov_base + skip), cnt)) {
+ return (EFAULT);
+ }
+ pagefault_disable();
+ b_left =
+ __copy_from_user_inatomic(p,
+ (iov->iov_base + skip), cnt);
+ pagefault_enable();
+ } else {
+ b_left =
+ copy_from_user(p,
+ (iov->iov_base + skip), cnt);
+ }
+ if (b_left > 0) {
+ unsigned long c_bytes =
+ cnt - b_left;
+ uio->uio_skip += c_bytes;
+ ASSERT3U(uio->uio_skip, <,
+ iov->iov_len);
+ uio->uio_resid -= c_bytes;
+ uio->uio_loffset += c_bytes;
+ return (EFAULT);
+ }
+ }
+ break;
+ case UIO_SYSSPACE:
+ if (rw == UIO_READ)
+ bcopy(p, iov->iov_base + skip, cnt);
+ else
+ bcopy(iov->iov_base + skip, p, cnt);
+ break;
+ default:
+ ASSERT(0);
+ }
+ skip += cnt;
+ if (skip == iov->iov_len) {
+ skip = 0;
+ uio->uio_iov = (++iov);
+ uio->uio_iovcnt--;
+ }
+ uio->uio_skip = skip;
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+ p = (caddr_t)p + cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+
+static int
+zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ const struct bio_vec *bv = uio->uio_bvec;
+ size_t skip = uio->uio_skip;
+ ulong_t cnt;
+
+ while (n && uio->uio_resid) {
+ void *paddr;
+ cnt = MIN(bv->bv_len - skip, n);
+
+ paddr = zfs_kmap_atomic(bv->bv_page, KM_USER1);
+ if (rw == UIO_READ)
+ bcopy(p, paddr + bv->bv_offset + skip, cnt);
+ else
+ bcopy(paddr + bv->bv_offset + skip, p, cnt);
+ zfs_kunmap_atomic(paddr, KM_USER1);
+
+ skip += cnt;
+ if (skip == bv->bv_len) {
+ skip = 0;
+ uio->uio_bvec = (++bv);
+ uio->uio_iovcnt--;
+ }
+ uio->uio_skip = skip;
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+ p = (caddr_t)p + cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+
+#if defined(HAVE_VFS_IOV_ITER)
+static int
+zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
+ boolean_t revert)
+{
+ size_t cnt = MIN(n, uio->uio_resid);
+
+ if (uio->uio_skip)
+ iov_iter_advance(uio->uio_iter, uio->uio_skip);
+
+ if (rw == UIO_READ)
+ cnt = copy_to_iter(p, cnt, uio->uio_iter);
+ else
+ cnt = copy_from_iter(p, cnt, uio->uio_iter);
+
+ /*
+	 * When operating on a full pipe no bytes are processed; in that
+	 * case return EFAULT, which is converted to EAGAIN by the kernel's
+	 * generic_file_splice_read() function.
+ */
+ if (cnt == 0)
+ return (EFAULT);
+
+ /*
+ * Revert advancing the uio_iter. This is set by zfs_uiocopy()
+ * to avoid consuming the uio and its iov_iter structure.
+ */
+ if (revert)
+ iov_iter_revert(uio->uio_iter, cnt);
+
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+
+ return (0);
+}
+#endif
+
+int
+zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ if (uio->uio_segflg == UIO_BVEC)
+ return (zfs_uiomove_bvec(p, n, rw, uio));
+#if defined(HAVE_VFS_IOV_ITER)
+ else if (uio->uio_segflg == UIO_ITER)
+ return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
+#endif
+ else
+ return (zfs_uiomove_iov(p, n, rw, uio));
+}
+EXPORT_SYMBOL(zfs_uiomove);
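+
+/*
+ * Illustrative sketch (not part of this change): gathering data out of a
+ * kernel iovec with zfs_uiomove().  Direct field initialization is used
+ * here for brevity; real callers normally use the zfs_uio_*_init helpers
+ * from sys/uio_impl.h.
+ */
+#if 0
+static int
+zfs_uiomove_sketch(void *dst, struct iovec *iov, int iovcnt, size_t len)
+{
+	zfs_uio_t uio;
+
+	bzero(&uio, sizeof (uio));
+	uio.uio_iov = iov;		/* kernel source segments */
+	uio.uio_iovcnt = iovcnt;
+	uio.uio_segflg = UIO_SYSSPACE;	/* iovec points at kernel memory */
+	uio.uio_resid = len;		/* total bytes available */
+
+	/* UIO_WRITE copies from the iovec into the flat buffer "dst" */
+	return (zfs_uiomove(dst, len, UIO_WRITE, &uio));
+}
+#endif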
+
+/*
+ * Fault in the pages of the first n bytes specified by the uio structure.
+ * One byte in each page is touched and the uio struct is left unmodified.
+ * Any error terminates the prefault early, since this is only a best-effort
+ * attempt to get the pages resident.
+ */
+int
+zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
+{
+ if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) {
+ /* There's never a need to fault in kernel pages */
+ return (0);
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ /*
+		 * Since at least the Linux 4.9 kernel,
+		 * iov_iter_fault_in_readable() can be relied on to fault in
+		 * user pages when referenced.
+ */
+ if (iov_iter_fault_in_readable(uio->uio_iter, n))
+ return (EFAULT);
+#endif
+ } else {
+ /* Fault in all user pages */
+ ASSERT3S(uio->uio_segflg, ==, UIO_USERSPACE);
+ const struct iovec *iov = uio->uio_iov;
+ int iovcnt = uio->uio_iovcnt;
+ size_t skip = uio->uio_skip;
+ uint8_t tmp;
+ caddr_t p;
+
+ for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) {
+ ulong_t cnt = MIN(iov->iov_len - skip, n);
+ /* empty iov */
+ if (cnt == 0)
+ continue;
+ n -= cnt;
+ /* touch each page in this segment. */
+ p = iov->iov_base + skip;
+ while (cnt) {
+ if (get_user(tmp, (uint8_t *)p))
+ return (EFAULT);
+ ulong_t incr = MIN(cnt, PAGESIZE);
+ p += incr;
+ cnt -= incr;
+ }
+ /* touch the last byte in case it straddles a page. */
+ p--;
+ if (get_user(tmp, (uint8_t *)p))
+ return (EFAULT);
+ }
+ }
+
+ return (0);
+}
+EXPORT_SYMBOL(zfs_uio_prefaultpages);
+
+/*
+ * The same as zfs_uiomove() but doesn't modify the uio structure.
+ * Returns in cbytes how many bytes were copied.
+ */
+int
+zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
+{
+ zfs_uio_t uio_copy;
+ int ret;
+
+ bcopy(uio, &uio_copy, sizeof (zfs_uio_t));
+
+ if (uio->uio_segflg == UIO_BVEC)
+ ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
+#if defined(HAVE_VFS_IOV_ITER)
+ else if (uio->uio_segflg == UIO_ITER)
+ ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
+#endif
+ else
+ ret = zfs_uiomove_iov(p, n, rw, &uio_copy);
+
+ *cbytes = uio->uio_resid - uio_copy.uio_resid;
+
+ return (ret);
+}
+EXPORT_SYMBOL(zfs_uiocopy);
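+
+/*
+ * Illustrative sketch (not part of this change): zfs_uiocopy() lets a
+ * caller peek at the next bytes without consuming the uio; the bytes can
+ * then be consumed explicitly with zfs_uioskip(), defined below.
+ */
+#if 0
+static int
+zfs_uio_peek_sketch(zfs_uio_t *uio, void *buf, size_t len)
+{
+	size_t cbytes = 0;
+	int error;
+
+	/* copy out of the uio without advancing it */
+	error = zfs_uiocopy(buf, len, UIO_WRITE, uio, &cbytes);
+	if (error == 0 && cbytes > 0)
+		zfs_uioskip(uio, cbytes);	/* now consume what was read */
+
+	return (error);
+}
+#endif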
+
+/*
+ * Drop the next n bytes out of *uio.
+ */
+void
+zfs_uioskip(zfs_uio_t *uio, size_t n)
+{
+ if (n > uio->uio_resid)
+ return;
+
+ if (uio->uio_segflg == UIO_BVEC) {
+ uio->uio_skip += n;
+ while (uio->uio_iovcnt &&
+ uio->uio_skip >= uio->uio_bvec->bv_len) {
+ uio->uio_skip -= uio->uio_bvec->bv_len;
+ uio->uio_bvec++;
+ uio->uio_iovcnt--;
+ }
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ iov_iter_advance(uio->uio_iter, n);
+#endif
+ } else {
+ uio->uio_skip += n;
+ while (uio->uio_iovcnt &&
+ uio->uio_skip >= uio->uio_iov->iov_len) {
+ uio->uio_skip -= uio->uio_iov->iov_len;
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ }
+ }
+ uio->uio_loffset += n;
+ uio->uio_resid -= n;
+}
+EXPORT_SYMBOL(zfs_uioskip);
+
+#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
new file mode 100644
index 000000000000..3cc4b560e477
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -0,0 +1,2176 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_quota.h>
+#include <sys/sunddi.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/spa_boot.h>
+#include <sys/objlist.h>
+#include <sys/zpl.h>
+#include <linux/vfs_compat.h>
+#include "zfs_comutil.h"
+
+enum {
+ TOKEN_RO,
+ TOKEN_RW,
+ TOKEN_SETUID,
+ TOKEN_NOSETUID,
+ TOKEN_EXEC,
+ TOKEN_NOEXEC,
+ TOKEN_DEVICES,
+ TOKEN_NODEVICES,
+ TOKEN_DIRXATTR,
+ TOKEN_SAXATTR,
+ TOKEN_XATTR,
+ TOKEN_NOXATTR,
+ TOKEN_ATIME,
+ TOKEN_NOATIME,
+ TOKEN_RELATIME,
+ TOKEN_NORELATIME,
+ TOKEN_NBMAND,
+ TOKEN_NONBMAND,
+ TOKEN_MNTPOINT,
+ TOKEN_LAST,
+};
+
+static const match_table_t zpl_tokens = {
+ { TOKEN_RO, MNTOPT_RO },
+ { TOKEN_RW, MNTOPT_RW },
+ { TOKEN_SETUID, MNTOPT_SETUID },
+ { TOKEN_NOSETUID, MNTOPT_NOSETUID },
+ { TOKEN_EXEC, MNTOPT_EXEC },
+ { TOKEN_NOEXEC, MNTOPT_NOEXEC },
+ { TOKEN_DEVICES, MNTOPT_DEVICES },
+ { TOKEN_NODEVICES, MNTOPT_NODEVICES },
+ { TOKEN_DIRXATTR, MNTOPT_DIRXATTR },
+ { TOKEN_SAXATTR, MNTOPT_SAXATTR },
+ { TOKEN_XATTR, MNTOPT_XATTR },
+ { TOKEN_NOXATTR, MNTOPT_NOXATTR },
+ { TOKEN_ATIME, MNTOPT_ATIME },
+ { TOKEN_NOATIME, MNTOPT_NOATIME },
+ { TOKEN_RELATIME, MNTOPT_RELATIME },
+ { TOKEN_NORELATIME, MNTOPT_NORELATIME },
+ { TOKEN_NBMAND, MNTOPT_NBMAND },
+ { TOKEN_NONBMAND, MNTOPT_NONBMAND },
+ { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" },
+ { TOKEN_LAST, NULL },
+};
+
+static void
+zfsvfs_vfs_free(vfs_t *vfsp)
+{
+ if (vfsp != NULL) {
+ if (vfsp->vfs_mntpoint != NULL)
+ kmem_strfree(vfsp->vfs_mntpoint);
+
+ kmem_free(vfsp, sizeof (vfs_t));
+ }
+}
+
+static int
+zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp)
+{
+ switch (token) {
+ case TOKEN_RO:
+ vfsp->vfs_readonly = B_TRUE;
+ vfsp->vfs_do_readonly = B_TRUE;
+ break;
+ case TOKEN_RW:
+ vfsp->vfs_readonly = B_FALSE;
+ vfsp->vfs_do_readonly = B_TRUE;
+ break;
+ case TOKEN_SETUID:
+ vfsp->vfs_setuid = B_TRUE;
+ vfsp->vfs_do_setuid = B_TRUE;
+ break;
+ case TOKEN_NOSETUID:
+ vfsp->vfs_setuid = B_FALSE;
+ vfsp->vfs_do_setuid = B_TRUE;
+ break;
+ case TOKEN_EXEC:
+ vfsp->vfs_exec = B_TRUE;
+ vfsp->vfs_do_exec = B_TRUE;
+ break;
+ case TOKEN_NOEXEC:
+ vfsp->vfs_exec = B_FALSE;
+ vfsp->vfs_do_exec = B_TRUE;
+ break;
+ case TOKEN_DEVICES:
+ vfsp->vfs_devices = B_TRUE;
+ vfsp->vfs_do_devices = B_TRUE;
+ break;
+ case TOKEN_NODEVICES:
+ vfsp->vfs_devices = B_FALSE;
+ vfsp->vfs_do_devices = B_TRUE;
+ break;
+ case TOKEN_DIRXATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_DIR;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_SAXATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_SA;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_XATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_DIR;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_NOXATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_OFF;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_ATIME:
+ vfsp->vfs_atime = B_TRUE;
+ vfsp->vfs_do_atime = B_TRUE;
+ break;
+ case TOKEN_NOATIME:
+ vfsp->vfs_atime = B_FALSE;
+ vfsp->vfs_do_atime = B_TRUE;
+ break;
+ case TOKEN_RELATIME:
+ vfsp->vfs_relatime = B_TRUE;
+ vfsp->vfs_do_relatime = B_TRUE;
+ break;
+ case TOKEN_NORELATIME:
+ vfsp->vfs_relatime = B_FALSE;
+ vfsp->vfs_do_relatime = B_TRUE;
+ break;
+ case TOKEN_NBMAND:
+ vfsp->vfs_nbmand = B_TRUE;
+ vfsp->vfs_do_nbmand = B_TRUE;
+ break;
+ case TOKEN_NONBMAND:
+ vfsp->vfs_nbmand = B_FALSE;
+ vfsp->vfs_do_nbmand = B_TRUE;
+ break;
+ case TOKEN_MNTPOINT:
+ vfsp->vfs_mntpoint = match_strdup(&args[0]);
+ if (vfsp->vfs_mntpoint == NULL)
+ return (SET_ERROR(ENOMEM));
+
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * Parse the raw mntopts and return a vfs_t describing the options.
+ */
+static int
+zfsvfs_parse_options(char *mntopts, vfs_t **vfsp)
+{
+ vfs_t *tmp_vfsp;
+ int error;
+
+ tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP);
+
+ if (mntopts != NULL) {
+ substring_t args[MAX_OPT_ARGS];
+ char *tmp_mntopts, *p, *t;
+ int token;
+
+ tmp_mntopts = t = kmem_strdup(mntopts);
+ if (tmp_mntopts == NULL)
+ return (SET_ERROR(ENOMEM));
+
+ while ((p = strsep(&t, ",")) != NULL) {
+ if (!*p)
+ continue;
+
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, zpl_tokens, args);
+ error = zfsvfs_parse_option(p, token, args, tmp_vfsp);
+ if (error) {
+ kmem_strfree(tmp_mntopts);
+ zfsvfs_vfs_free(tmp_vfsp);
+ return (error);
+ }
+ }
+
+ kmem_strfree(tmp_mntopts);
+ }
+
+ *vfsp = tmp_vfsp;
+
+ return (0);
+}
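+
+/*
+ * Illustrative sketch (not part of this change): how the parser above is
+ * typically consumed.  The literal option strings are assumed to match the
+ * usual MNTOPT_* definitions ("ro", "noatime") from sys/mntent.h.
+ */
+#if 0
+static void
+zfsvfs_parse_options_sketch(void)
+{
+	char opts[] = "ro,noatime";
+	vfs_t *vfsp = NULL;
+
+	if (zfsvfs_parse_options(opts, &vfsp) == 0) {
+		/*
+		 * Both the value and its "do" flag are recorded so that
+		 * zfs_register_callbacks() can re-apply the temporary
+		 * overrides after the property callbacks are registered.
+		 */
+		ASSERT(vfsp->vfs_do_readonly && vfsp->vfs_readonly);
+		ASSERT(vfsp->vfs_do_atime && !vfsp->vfs_atime);
+		zfsvfs_vfs_free(vfsp);
+	}
+}
+#endif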
+
+boolean_t
+zfs_is_readonly(zfsvfs_t *zfsvfs)
+{
+ return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY));
+}
+
+/*ARGSUSED*/
+int
+zfs_sync(struct super_block *sb, int wait, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+
+ /*
+ * Semantically, the only requirement is that the sync be initiated.
+ * The DMU syncs out txgs frequently, so there's nothing to do.
+ */
+ if (!wait)
+ return (0);
+
+ if (zfsvfs != NULL) {
+ /*
+ * Sync a specific filesystem.
+ */
+ dsl_pool_t *dp;
+
+ ZFS_ENTER(zfsvfs);
+ dp = dmu_objset_pool(zfsvfs->z_os);
+
+ /*
+ * If the system is shutting down, then skip any
+ * filesystems which may exist on a suspended pool.
+ */
+ if (spa_suspended(dp->dp_spa)) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, 0);
+
+ ZFS_EXIT(zfsvfs);
+ } else {
+ /*
+ * Sync all ZFS filesystems. This is what happens when you
+ * run sync(1). Unlike other filesystems, ZFS honors the
+ * request by waiting for all pools to commit all dirty data.
+ */
+ spa_sync_allpools();
+ }
+
+ return (0);
+}
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ struct super_block *sb = zfsvfs->z_sb;
+
+ if (sb == NULL)
+ return;
+ /*
+ * Update SB_NOATIME bit in VFS super block. Since atime update is
+ * determined by atime_needs_update(), atime_needs_update() needs to
+ * return false if atime is turned off, and not unconditionally return
+ * false if atime is turned on.
+ */
+ if (newval)
+ sb->s_flags &= ~SB_NOATIME;
+ else
+ sb->s_flags |= SB_NOATIME;
+}
+
+static void
+relatime_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_relatime = newval;
+}
+
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == ZFS_XATTR_OFF) {
+ zfsvfs->z_flags &= ~ZSB_XATTR;
+ } else {
+ zfsvfs->z_flags |= ZSB_XATTR;
+
+ if (newval == ZFS_XATTR_SA)
+ zfsvfs->z_xattr_sa = B_TRUE;
+ else
+ zfsvfs->z_xattr_sa = B_FALSE;
+ }
+}
+
+static void
+acltype_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ switch (newval) {
+ case ZFS_ACLTYPE_NFSV4:
+ case ZFS_ACLTYPE_OFF:
+ zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF;
+ zfsvfs->z_sb->s_flags &= ~SB_POSIXACL;
+ break;
+ case ZFS_ACLTYPE_POSIX:
+#ifdef CONFIG_FS_POSIX_ACL
+ zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIX;
+ zfsvfs->z_sb->s_flags |= SB_POSIXACL;
+#else
+ zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF;
+ zfsvfs->z_sb->s_flags &= ~SB_POSIXACL;
+#endif /* CONFIG_FS_POSIX_ACL */
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+ ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ zfsvfs->z_max_blksz = newval;
+}
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ struct super_block *sb = zfsvfs->z_sb;
+
+ if (sb == NULL)
+ return;
+
+ if (newval)
+ sb->s_flags |= SB_RDONLY;
+ else
+ sb->s_flags &= ~SB_RDONLY;
+}
+
+static void
+devices_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+nbmand_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ struct super_block *sb = zfsvfs->z_sb;
+
+ if (sb == NULL)
+ return;
+
+ if (newval == TRUE)
+ sb->s_flags |= SB_MANDLOCK;
+ else
+ sb->s_flags &= ~SB_MANDLOCK;
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_show_ctldir = newval;
+}
+
+static void
+vscan_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_vscan = newval;
+}
+
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_mode = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_acl_inherit = newval;
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+ struct dsl_dataset *ds = NULL;
+ objset_t *os = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+ int error = 0;
+
+ ASSERT(vfsp);
+ zfsvfs = vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ os = zfsvfs->z_os;
+
+ /*
+ * The act of registering our callbacks will destroy any mount
+ * options we may have. In order to enable temporary overrides
+ * of mount options, we stash away the current values and
+ * restore them after we register the callbacks.
+ */
+ if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) {
+ vfsp->vfs_do_readonly = B_TRUE;
+ vfsp->vfs_readonly = B_TRUE;
+ }
+
+ /*
+ * Register property callbacks.
+ *
+ * It would probably be fine to just check for i/o error from
+ * the first prop_register(), but I guess I like to go
+ * overboard...
+ */
+ ds = dmu_objset_ds(os);
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ error = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
+ zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (error)
+ goto unregister;
+
+ /*
+ * Invoke our callbacks to restore temporary mount options.
+ */
+ if (vfsp->vfs_do_readonly)
+ readonly_changed_cb(zfsvfs, vfsp->vfs_readonly);
+ if (vfsp->vfs_do_setuid)
+ setuid_changed_cb(zfsvfs, vfsp->vfs_setuid);
+ if (vfsp->vfs_do_exec)
+ exec_changed_cb(zfsvfs, vfsp->vfs_exec);
+ if (vfsp->vfs_do_devices)
+ devices_changed_cb(zfsvfs, vfsp->vfs_devices);
+ if (vfsp->vfs_do_xattr)
+ xattr_changed_cb(zfsvfs, vfsp->vfs_xattr);
+ if (vfsp->vfs_do_atime)
+ atime_changed_cb(zfsvfs, vfsp->vfs_atime);
+ if (vfsp->vfs_do_relatime)
+ relatime_changed_cb(zfsvfs, vfsp->vfs_relatime);
+ if (vfsp->vfs_do_nbmand)
+ nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand);
+
+ return (0);
+
+unregister:
+ dsl_prop_unregister_all(ds, zfsvfs);
+ return (error);
+}
+
+/*
+ * Takes a dataset, a property, a value and that value's setpoint as
+ * found in the ZAP. Checks if the property has been changed in the vfs.
+ * If so, val and setpoint will be overwritten with updated content.
+ * Otherwise, they are left unchanged.
+ */
+int
+zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
+ char *setpoint)
+{
+ int error;
+ zfsvfs_t *zfvp;
+ vfs_t *vfsp;
+ objset_t *os;
+ uint64_t tmp = *val;
+
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+
+ if (dmu_objset_type(os) != DMU_OST_ZFS)
+ return (EINVAL);
+
+ mutex_enter(&os->os_user_ptr_lock);
+ zfvp = dmu_objset_get_user(os);
+ mutex_exit(&os->os_user_ptr_lock);
+ if (zfvp == NULL)
+ return (ESRCH);
+
+ vfsp = zfvp->z_vfs;
+
+ switch (zfs_prop) {
+ case ZFS_PROP_ATIME:
+ if (vfsp->vfs_do_atime)
+ tmp = vfsp->vfs_atime;
+ break;
+ case ZFS_PROP_RELATIME:
+ if (vfsp->vfs_do_relatime)
+ tmp = vfsp->vfs_relatime;
+ break;
+ case ZFS_PROP_DEVICES:
+ if (vfsp->vfs_do_devices)
+ tmp = vfsp->vfs_devices;
+ break;
+ case ZFS_PROP_EXEC:
+ if (vfsp->vfs_do_exec)
+ tmp = vfsp->vfs_exec;
+ break;
+ case ZFS_PROP_SETUID:
+ if (vfsp->vfs_do_setuid)
+ tmp = vfsp->vfs_setuid;
+ break;
+ case ZFS_PROP_READONLY:
+ if (vfsp->vfs_do_readonly)
+ tmp = vfsp->vfs_readonly;
+ break;
+ case ZFS_PROP_XATTR:
+ if (vfsp->vfs_do_xattr)
+ tmp = vfsp->vfs_xattr;
+ break;
+ case ZFS_PROP_NBMAND:
+ if (vfsp->vfs_do_nbmand)
+ tmp = vfsp->vfs_nbmand;
+ break;
+ default:
+ return (ENOENT);
+ }
+
+ if (tmp != *val) {
+ (void) strcpy(setpoint, "temporary");
+ *val = tmp;
+ }
+ return (0);
+}
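+
+/*
+ * Illustrative sketch (not part of this change): how a caller consumes
+ * zfs_get_temporary_prop().  The dataset and the on-disk value are assumed
+ * to have been obtained elsewhere (e.g. via dsl_prop_get_ds()).
+ */
+#if 0
+static void
+zfs_temporary_prop_sketch(dsl_dataset_t *ds, uint64_t ondisk_atime)
+{
+	uint64_t val = ondisk_atime;
+	char setpoint[ZFS_MAX_DATASET_NAME_LEN] = "";
+
+	/*
+	 * If "atime" was temporarily overridden at mount time, val is
+	 * replaced with the value in effect and setpoint becomes
+	 * "temporary"; otherwise both are left untouched.
+	 */
+	(void) zfs_get_temporary_prop(ds, ZFS_PROP_ATIME, &val, setpoint);
+}
+#endif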
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+ uint64_t val;
+
+ zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+ zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+ zfsvfs->z_os = os;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+ if (error != 0)
+ return (error);
+ if (zfsvfs->z_version >
+ zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+ (void) printk("Can't mount a version %lld file system "
+		    "on a version %lld pool. Pool must be upgraded to mount "
+ "this file system.\n", (u_longlong_t)zfsvfs->z_version,
+ (u_longlong_t)spa_version(dmu_objset_spa(os)));
+ return (SET_ERROR(ENOTSUP));
+ }
+ error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_norm = (int)val;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_utf8 = (val != 0);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_case = (uint_t)val;
+
+ if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0)
+ return (error);
+ zfsvfs->z_acl_type = (uint_t)val;
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ zfsvfs->z_case == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+ uint64_t sa_obj = 0;
+ if (zfsvfs->z_use_sa) {
+ /* should either have both of these objects or none */
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+ &sa_obj);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
+ if ((error == 0) && (val == ZFS_XATTR_SA))
+ zfsvfs->z_xattr_sa = B_TRUE;
+ }
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+ &zfsvfs->z_root);
+ if (error != 0)
+ return (error);
+ ASSERT(zfsvfs->z_root != 0);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &zfsvfs->z_unlinkedobj);
+ if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
+ 8, 1, &zfsvfs->z_userquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
+ 8, 1, &zfsvfs->z_groupquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
+ 8, 1, &zfsvfs->z_projectquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
+ 8, 1, &zfsvfs->z_userobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
+ 8, 1, &zfsvfs->z_groupobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
+ 8, 1, &zfsvfs->z_projectobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+ &zfsvfs->z_fuid_obj);
+ if (error == ENOENT)
+ zfsvfs->z_fuid_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
+ &zfsvfs->z_shares_dir);
+ if (error == ENOENT)
+ zfsvfs->z_shares_dir = 0;
+ else if (error != 0)
+ return (error);
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+ if (error != 0)
+ return (error);
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(os, zfs_sa_upgrade);
+
+ return (0);
+}
+
+int
+zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
+{
+ objset_t *os;
+ zfsvfs_t *zfsvfs;
+ int error;
+ boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os);
+ if (error != 0) {
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ error = zfsvfs_create_impl(zfvp, zfsvfs, os);
+ if (error != 0) {
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ }
+ return (error);
+}
+
+
+/*
+ * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function
+ * on a failure. Do not pass in a statically allocated zfsvfs.
+ */
+int
+zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+
+ zfsvfs->z_vfs = NULL;
+ zfsvfs->z_sb = NULL;
+ zfsvfs->z_parent = zfsvfs;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+ ZFS_TEARDOWN_INIT(zfsvfs);
+ rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
+
+ int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1),
+ ZFS_OBJ_MTX_MAX);
+ zfsvfs->z_hold_size = size;
+ zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
+ KM_SLEEP);
+ zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
+ for (int i = 0; i != size; i++) {
+ avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
+ sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
+ mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
+ }
+
+ error = zfsvfs_init(zfsvfs, os);
+ if (error != 0) {
+ *zfvp = NULL;
+ zfsvfs_free(zfsvfs);
+ return (error);
+ }
+
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+ zfsvfs->z_draining = B_FALSE;
+ zfsvfs->z_drain_cancel = B_TRUE;
+
+ *zfvp = zfsvfs;
+ return (0);
+}
+
+static int
+zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
+{
+ int error;
+ boolean_t readonly = zfs_is_readonly(zfsvfs);
+
+ error = zfs_register_callbacks(zfsvfs->z_vfs);
+ if (error)
+ return (error);
+
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+
+ /*
+ * If we are not mounting (ie: online recv), then we don't
+ * have to worry about replaying the log as we blocked all
+ * operations out since we closed the ZIL.
+ */
+ if (mounting) {
+ ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+
+ /*
+ * During replay we remove the read only flag to
+ * allow replays to succeed.
+ */
+ if (readonly != 0) {
+ readonly_changed_cb(zfsvfs, B_FALSE);
+ } else {
+ zap_stats_t zs;
+ if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+ &zs) == 0) {
+ dataset_kstats_update_nunlinks_kstat(
+ &zfsvfs->z_kstat, zs.zs_num_entries);
+ dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+ "num_entries in unlinked set: %llu",
+ zs.zs_num_entries);
+ }
+ zfs_unlinked_drain(zfsvfs);
+ dsl_dir_t *dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
+ dd->dd_activity_cancelled = B_FALSE;
+ }
+
+ /*
+ * Parse and replay the intent log.
+ *
+ * Because of ziltest, this must be done after
+ * zfs_unlinked_drain(). (Further note: ziltest
+ * doesn't use readonly mounts, where
+ * zfs_unlinked_drain() isn't called.) This is because
+ * ziltest causes spa_sync() to think it's committed,
+ * but actually it is not, so the intent log contains
+ * many txg's worth of changes.
+ *
+ * In particular, if object N is in the unlinked set in
+ * the last txg to actually sync, then it could be
+ * actually freed in a later txg and then reallocated
+ * in a yet later txg. This would write a "create
+ * object N" record to the intent log. Normally, this
+ * would be fine because the spa_sync() would have
+ * written out the fact that object N is free, before
+ * we could write the "create object N" intent log
+ * record.
+ *
+ * But when we are in ziltest mode, we advance the "open
+ * txg" without actually spa_sync()-ing the changes to
+ * disk. So we would see that object N is still
+ * allocated and in the unlinked set, and there is an
+ * intent log record saying to allocate it.
+ */
+ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+ if (zil_replay_disable) {
+ zil_destroy(zfsvfs->z_log, B_FALSE);
+ } else {
+ zfsvfs->z_replay = B_TRUE;
+ zil_replay(zfsvfs->z_os, zfsvfs,
+ zfs_replay_vector);
+ zfsvfs->z_replay = B_FALSE;
+ }
+ }
+
+ /* restore readonly bit */
+ if (readonly != 0)
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ }
+
+ /*
+ * Set the objset user_ptr to track its zfsvfs.
+ */
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
+ return (0);
+}
+
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
+{
+ int i, size = zfsvfs->z_hold_size;
+
+ zfs_fuid_destroy(zfsvfs);
+
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+ mutex_destroy(&zfsvfs->z_lock);
+ list_destroy(&zfsvfs->z_all_znodes);
+ ZFS_TEARDOWN_DESTROY(zfsvfs);
+ rw_destroy(&zfsvfs->z_teardown_inactive_lock);
+ rw_destroy(&zfsvfs->z_fuid_lock);
+ for (i = 0; i != size; i++) {
+ avl_destroy(&zfsvfs->z_hold_trees[i]);
+ mutex_destroy(&zfsvfs->z_hold_locks[i]);
+ }
+ vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
+ vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
+ zfsvfs_vfs_free(zfsvfs->z_vfs);
+ dataset_kstats_destroy(&zfsvfs->z_kstat);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+}
+
+static void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+
+ if (!dmu_objset_is_snapshot(os))
+ dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
+}
+
+#ifdef HAVE_MLSLABEL
+/*
+ * Check that the hex label string is appropriate for the dataset being
+ * mounted into the global_zone proper.
+ *
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high. For admin_low labels, the corresponding
+ * dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+ if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+ /* must be readonly */
+ uint64_t rdonly;
+
+ if (dsl_prop_get_integer(dsname,
+ zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+ return (SET_ERROR(EACCES));
+ return (rdonly ? 0 : SET_ERROR(EACCES));
+ }
+ return (SET_ERROR(EACCES));
+}
+#endif /* HAVE_MLSLABEL */
+
+static int
+zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp,
+ uint32_t bshift)
+{
+ char buf[20 + DMU_OBJACCT_PREFIX_LEN];
+ uint64_t offset = DMU_OBJACCT_PREFIX_LEN;
+ uint64_t quota;
+ uint64_t used;
+ int err;
+
+ strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
+ err = zfs_id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset,
+ sizeof (buf) - offset, B_FALSE);
+ if (err)
+ return (err);
+
+ if (zfsvfs->z_projectquota_obj == 0)
+ goto objs;
+
+ err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
+ buf + offset, 8, 1, &quota);
+ if (err == ENOENT)
+ goto objs;
+ else if (err)
+ return (err);
+
+ err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
+ buf + offset, 8, 1, &used);
+ if (unlikely(err == ENOENT)) {
+ uint32_t blksize;
+ u_longlong_t nblocks;
+
+ /*
+		 * Quota accounting is async, so a race is possible here;
+		 * there is at least one object with the given project ID.
+ */
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+ if (unlikely(zp->z_blksz == 0))
+ blksize = zfsvfs->z_max_blksz;
+
+ used = blksize * nblocks;
+ } else if (err) {
+ return (err);
+ }
+
+ statp->f_blocks = quota >> bshift;
+ statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0;
+ statp->f_bavail = statp->f_bfree;
+
+objs:
+ if (zfsvfs->z_projectobjquota_obj == 0)
+ return (0);
+
+ err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj,
+ buf + offset, 8, 1, &quota);
+ if (err == ENOENT)
+ return (0);
+ else if (err)
+ return (err);
+
+ err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
+ buf, 8, 1, &used);
+ if (unlikely(err == ENOENT)) {
+ /*
+		 * Quota accounting is async, so a race is possible here;
+		 * there is at least one object with the given project ID.
+ */
+ used = 1;
+ } else if (err) {
+ return (err);
+ }
+
+ statp->f_files = quota;
+ statp->f_ffree = (quota > used) ? (quota - used) : 0;
+
+ return (0);
+}
+
+int
+zfs_statvfs(struct inode *ip, struct kstatfs *statp)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint64_t refdbytes, availbytes, usedobjs, availobjs;
+ int err = 0;
+
+ ZFS_ENTER(zfsvfs);
+
+ dmu_objset_space(zfsvfs->z_os,
+ &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+ uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os);
+ /*
+ * The underlying storage pool actually uses multiple block
+	 * sizes. Under Solaris, frsize (fragment size) is reported as
+ * the smallest block size we support, and bsize (block size)
+ * as the filesystem's maximum block size. Unfortunately,
+ * under Linux the fragment size and block size are often used
+ * interchangeably. Thus we are forced to report both of them
+ * as the filesystem's maximum block size.
+ */
+ statp->f_frsize = zfsvfs->z_max_blksz;
+ statp->f_bsize = zfsvfs->z_max_blksz;
+ uint32_t bshift = fls(statp->f_bsize) - 1;
+
+ /*
+ * The following report "total" blocks of various kinds in
+ * the file system, but reported in terms of f_bsize - the
+ * "preferred" size.
+ */
+
+ /* Round up so we never have a filesystem using 0 blocks. */
+ refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize);
+ statp->f_blocks = (refdbytes + availbytes) >> bshift;
+ statp->f_bfree = availbytes >> bshift;
+ statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+ /*
+ * statvfs() should really be called statufs(), because it assumes
+ * static metadata. ZFS doesn't preallocate files, so the best
+ * we can do is report the max that could possibly fit in f_files,
+ * and that minus the number actually used in f_ffree.
+ * For f_ffree, report the smaller of the number of objects available
+ * and the number of blocks (each object will take at least a block).
+ */
+ statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT);
+ statp->f_files = statp->f_ffree + usedobjs;
+ statp->f_fsid.val[0] = (uint32_t)fsid;
+ statp->f_fsid.val[1] = (uint32_t)(fsid >> 32);
+ statp->f_type = ZFS_SUPER_MAGIC;
+ statp->f_namelen = MAXNAMELEN - 1;
+
+ /*
+ * We have all of 40 characters to stuff a string here.
+ * Is there anything useful we could/should provide?
+ */
+ bzero(statp->f_spare, sizeof (statp->f_spare));
+
+ if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ dmu_objset_projectquota_present(zfsvfs->z_os)) {
+ znode_t *zp = ITOZ(ip);
+
+ if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid &&
+ zpl_is_valid_projid(zp->z_projid))
+ err = zfs_statfs_project(zfsvfs, zp, statp, bshift);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+static int
+zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
+{
+ znode_t *rootzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0)
+ *ipp = ZTOI(rootzp);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Linux kernels older than 3.1 do not support a per-filesystem shrinker.
+ * To accommodate this we must improvise and manually walk the list of znodes
+ * attempting to prune dentries in order to be able to drop the inodes.
+ *
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list. New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+ znode_t **zp_array, *zp;
+ int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+ int objects = 0;
+ int i = 0, j = 0;
+
+ zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+ if ((i++ > nr_to_scan) || (j >= max_array))
+ break;
+
+ ASSERT(list_link_active(&zp->z_link_node));
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+ /* Skip active znodes and .zfs entries */
+ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+ continue;
+
+ if (igrab(ZTOI(zp)) == NULL)
+ continue;
+
+ zp_array[j] = zp;
+ j++;
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
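+ /*
+ * Prune dentries of the collected znodes outside of z_znodes_lock,
+ * then drop the reference taken by igrab() above.
+ */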
+ for (i = 0; i < j; i++) {
+ zp = zp_array[i];
+
+ ASSERT3P(zp, !=, NULL);
+ d_prune_aliases(ZTOI(zp));
+
+ if (atomic_read(&ZTOI(zp)->i_count) == 1)
+ objects++;
+
+ zrele(zp);
+ }
+
+ kmem_free(zp_array, max_array * sizeof (znode_t *));
+
+ return (objects);
+}
+
+/*
+ * The ARC has requested that the filesystem drop entries from the dentry
+ * and inode caches. This can occur when the ARC needs to free metadata
+ * blocks but can't because they are all pinned by entries in these caches.
+ */
+int
+zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ int error = 0;
+ struct shrinker *shrinker = &sb->s_shrink;
+ struct shrink_control sc = {
+ .nr_to_scan = nr_to_scan,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ ZFS_ENTER(zfsvfs);
+
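+ /*
+ * Invoke the kernel's per-superblock shrinker using whichever
+ * callback interface this kernel provides; fall back to manually
+ * pruning via zfs_prune_aliases() when no shrinker is available.
+ */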
+#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \
+ defined(SHRINK_CONTROL_HAS_NID) && \
+ defined(SHRINKER_NUMA_AWARE)
+ if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) {
+ *objects = 0;
+ for_each_online_node(sc.nid) {
+ *objects += (*shrinker->scan_objects)(shrinker, &sc);
+ }
+ } else {
+ *objects = (*shrinker->scan_objects)(shrinker, &sc);
+ }
+
+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+ *objects = (*shrinker->scan_objects)(shrinker, &sc);
+#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK)
+ *objects = (*shrinker->shrink)(shrinker, &sc);
+#elif defined(HAVE_D_PRUNE_ALIASES)
+#define D_PRUNE_ALIASES_IS_DEFAULT
+ *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+#else
+#error "No available dentry and inode cache pruning mechanism."
+#endif
+
+#if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT)
+#undef D_PRUNE_ALIASES_IS_DEFAULT
+ /*
+ * Fall back to zfs_prune_aliases if the kernel's per-superblock
+ * shrinker couldn't free anything, possibly due to the inodes being
+ * allocated in a different memcg.
+ */
+ if (*objects == 0)
+ *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+#endif
+
+ ZFS_EXIT(zfsvfs);
+
+ dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+ "pruning, nr_to_scan=%lu objects=%d error=%d\n",
+ nr_to_scan, *objects, error);
+
+ return (error);
+}
+
+/*
+ * Teardown the zfsvfs_t.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+ znode_t *zp;
+
+ zfs_unlinked_drain_stop_wait(zfsvfs);
+
+ /*
+ * If someone has not already unmounted this file system,
+ * drain the zrele_taskq to ensure all active references to the
+ * zfsvfs_t have been handled; only then can it be safely destroyed.
+ */
+ if (zfsvfs->z_os) {
+ /*
+ * If we're unmounting we have to wait for the list to
+ * drain completely.
+ *
+ * If we're not unmounting there's no guarantee the list
+ * will drain completely, but iputs run from the taskq
+ * may add the parents of dir-based xattrs to the taskq
+ * so we want to wait for these.
+ *
+ * We can safely read z_nr_znodes without locking because the
+ * VFS has already blocked operations which add to the
+ * z_all_znodes list and thus increment z_nr_znodes.
+ */
+ int round = 0;
+ while (zfsvfs->z_nr_znodes > 0) {
+ taskq_wait_outstanding(dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ if (++round > 1 && !unmounting)
+ break;
+ }
+ }
+
+ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
+
+ if (!unmounting) {
+ /*
+ * We purge the parent filesystem's super block as the
+ * parent filesystem and all of its snapshots have their
+ * inode's super block set to the parent's filesystem's
+ * super block. Note, 'z_parent' is self referential
+ * for non-snapshots.
+ */
+ shrink_dcache_sb(zfsvfs->z_parent->z_sb);
+ }
+
+ /*
+ * Close the zil. NB: Can't close the zil while zfs_inactive
+ * threads are blocked as zil_close can call zfs_inactive.
+ */
+ if (zfsvfs->z_log) {
+ zil_close(zfsvfs->z_log);
+ zfsvfs->z_log = NULL;
+ }
+
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
+
+ /*
+ * If we are not unmounting (ie: online recv) and someone already
+ * unmounted this file system while we were doing the switcheroo,
+ * or a reopen of z_os failed then just bail out now.
+ */
+ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * At this point there are no VFS ops active, and any new VFS ops
+ * will fail with EIO since we have z_teardown_lock for writer (only
+ * relevant for forced unmount).
+ *
+ * Release all holds on dbufs. We also grab an extra reference to all
+ * the remaining inodes so that the kernel does not attempt to free
+ * any inodes of a suspended fs. This can cause deadlocks since the
+ * zfs_resume_fs() process may involve starting threads, which might
+ * attempt to free unreferenced inodes to free up memory for the new
+ * thread.
+ */
+ if (!unmounting) {
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ if (zp->z_sa_hdl)
+ zfs_znode_dmu_fini(zp);
+ if (igrab(ZTOI(zp)) != NULL)
+ zp->z_suspended = B_TRUE;
+
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ }
+
+ /*
+ * If we are unmounting, set the unmounted flag and let new VFS ops
+ * unblock. zfs_inactive will have the unmounted behavior, and all
+ * other VFS ops will fail with EIO.
+ */
+ if (unmounting) {
+ zfsvfs->z_unmounted = B_TRUE;
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ }
+
+ /*
+ * z_os will be NULL if there was an error in attempting to reopen
+ * zfsvfs, so just return as the properties had already been
+ * unregistered and cached data had been evicted before.
+ */
+ if (zfsvfs->z_os == NULL)
+ return (0);
+
+ /*
+ * Unregister properties.
+ */
+ zfs_unregister_callbacks(zfsvfs);
+
+ /*
+ * Evict cached data. We must write out any dirty data before
+ * disowning the dataset.
+ */
+ objset_t *os = zfsvfs->z_os;
+ boolean_t os_dirty = B_FALSE;
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (dmu_objset_is_dirty(os, t)) {
+ os_dirty = B_TRUE;
+ break;
+ }
+ }
+ if (!zfs_is_readonly(zfsvfs) && os_dirty) {
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ }
+ dmu_objset_evict_dbufs(zfsvfs->z_os);
+ dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
+ dsl_dir_cancel_waiters(dd);
+
+ return (0);
+}
+
+#if defined(HAVE_SUPER_SETUP_BDI_NAME)
+atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0);
+#endif
+
+int
+zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
+{
+ const char *osname = zm->mnt_osname;
+ struct inode *root_inode = NULL;
+ uint64_t recordsize;
+ int error = 0;
+ zfsvfs_t *zfsvfs = NULL;
+ vfs_t *vfs = NULL;
+
+ ASSERT(zm);
+ ASSERT(osname);
+
+ error = zfsvfs_parse_options(zm->mnt_data, &vfs);
+ if (error)
+ return (error);
+
+ error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs);
+ if (error) {
+ zfsvfs_vfs_free(vfs);
+ goto out;
+ }
+
+ if ((error = dsl_prop_get_integer(osname, "recordsize",
+ &recordsize, NULL))) {
+ zfsvfs_vfs_free(vfs);
+ goto out;
+ }
+
+ vfs->vfs_data = zfsvfs;
+ zfsvfs->z_vfs = vfs;
+ zfsvfs->z_sb = sb;
+ sb->s_fs_info = zfsvfs;
+ sb->s_magic = ZFS_SUPER_MAGIC;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_time_gran = 1;
+ sb->s_blocksize = recordsize;
+ sb->s_blocksize_bits = ilog2(recordsize);
+
+ error = -zpl_bdi_setup(sb, "zfs");
+ if (error)
+ goto out;
+
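+ /* ZFS does its own prefetching, so disable generic VFS readahead. */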
+ sb->s_bdi->ra_pages = 0;
+
+ /* Set callback operations for the file system. */
+ sb->s_op = &zpl_super_operations;
+ sb->s_xattr = zpl_xattr_handlers;
+ sb->s_export_op = &zpl_export_operations;
+ sb->s_d_op = &zpl_dentry_operations;
+
+ /* Set features for file system. */
+ zfs_set_fuid_feature(zfsvfs);
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+ uint64_t pval;
+
+ atime_changed_cb(zfsvfs, B_FALSE);
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ if ((error = dsl_prop_get_integer(osname,
+ "xattr", &pval, NULL)))
+ goto out;
+ xattr_changed_cb(zfsvfs, pval);
+ if ((error = dsl_prop_get_integer(osname,
+ "acltype", &pval, NULL)))
+ goto out;
+ acltype_changed_cb(zfsvfs, pval);
+ zfsvfs->z_issnap = B_TRUE;
+ zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
+ zfsvfs->z_snap_defer_time = jiffies;
+
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+ } else {
+ if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
+ goto out;
+ }
+
+ /* Allocate a root inode for the filesystem. */
+ error = zfs_root(zfsvfs, &root_inode);
+ if (error) {
+ (void) zfs_umount(sb);
+ goto out;
+ }
+
+ /* Allocate a root dentry for the filesystem */
+ sb->s_root = d_make_root(root_inode);
+ if (sb->s_root == NULL) {
+ (void) zfs_umount(sb);
+ error = SET_ERROR(ENOMEM);
+ goto out;
+ }
+
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+
+ zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb);
+out:
+ if (error) {
+ if (zfsvfs != NULL) {
+ dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
+ zfsvfs_free(zfsvfs);
+ }
+ /*
+ * make sure we don't have dangling sb->s_fs_info which
+ * zfs_preumount will use.
+ */
+ sb->s_fs_info = NULL;
+ }
+
+ return (error);
+}
+
+/*
+ * Called when an unmount is requested and certain sanity checks have
+ * already passed. At this point no dentries or inodes have been reclaimed
+ * from their respective caches. We drop the extra reference on the .zfs
+ * control directory to allow everything to be reclaimed. All snapshots
+ * must already have been unmounted to reach this point.
+ */
+void
+zfs_preumount(struct super_block *sb)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+
+ /* zfsvfs is NULL when zfs_domount fails during mount */
+ if (zfsvfs) {
+ zfs_unlinked_drain_stop_wait(zfsvfs);
+ zfsctl_destroy(sb->s_fs_info);
+ /*
+ * Wait for zrele_async before entering evict_inodes in
+ * generic_shutdown_super. The reason we must finish before
+ * evict_inodes is that, when lazytime is on or when zfs_purgedir
+ * calls zfs_zget, zrele would bump i_count from 0 to 1. This
+ * would race with the i_count check in evict_inodes. This means
+ * it could destroy the inode while we are still using it.
+ *
+ * We wait for two passes. xattr directories in the first pass
+ * may add xattr entries in zfs_purgedir, so in the second pass
+ * we wait for them. We don't use taskq_wait here because it is
+ * a pool wide taskq. Other mounted filesystems can constantly
+ * do zrele_async and there's no guarantee when taskq will be
+ * empty.
+ */
+ taskq_wait_outstanding(dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ taskq_wait_outstanding(dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ }
+}
+
+/*
+ * Called once all other unmount-related teardown has occurred.
+ * It is our responsibility to release any remaining infrastructure.
+ */
+/*ARGSUSED*/
+int
+zfs_umount(struct super_block *sb)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ objset_t *os;
+
+ if (zfsvfs->z_arc_prune != NULL)
+ arc_remove_prune_callback(zfsvfs->z_arc_prune);
+ VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+ os = zfsvfs->z_os;
+ zpl_bdi_destroy(sb);
+
+ /*
+ * z_os will be NULL if there was an error in
+ * attempting to reopen zfsvfs.
+ */
+ if (os != NULL) {
+ /*
+ * Unset the objset user_ptr.
+ */
+ mutex_enter(&os->os_user_ptr_lock);
+ dmu_objset_set_user(os, NULL);
+ mutex_exit(&os->os_user_ptr_lock);
+
+ /*
+ * Finally release the objset
+ */
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ }
+
+ zfsvfs_free(zfsvfs);
+ return (0);
+}
+
+int
+zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ vfs_t *vfsp;
+ boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os);
+ int error;
+
+ if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) &&
+ !(*flags & SB_RDONLY)) {
+ *flags |= SB_RDONLY;
+ return (EROFS);
+ }
+
+ error = zfsvfs_parse_options(zm->mnt_data, &vfsp);
+ if (error)
+ return (error);
+
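+ /* When switching to read-only, flush any dirty data to disk first. */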
+ if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY))
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+
+ zfs_unregister_callbacks(zfsvfs);
+ zfsvfs_vfs_free(zfsvfs->z_vfs);
+
+ vfsp->vfs_data = zfsvfs;
+ zfsvfs->z_vfs = vfsp;
+ if (!issnap)
+ (void) zfs_register_callbacks(vfsp);
+
+ return (error);
+}
+
+int
+zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ znode_t *zp;
+ uint64_t object = 0;
+ uint64_t fid_gen = 0;
+ uint64_t gen_mask;
+ uint64_t zp_gen;
+ int i, err;
+
+ *ipp = NULL;
+
+ if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+ } else {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* LONG_FID_LEN means snapdirs */
+ if (fidp->fid_len == LONG_FID_LEN) {
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint64_t objsetid = 0;
+ uint64_t setgen = 0;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+ if (objsetid != ZFSCTL_INO_SNAPDIRS - object) {
+ dprintf("snapdir fid: objsetid (%llu) != "
+ "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n",
+ objsetid, ZFSCTL_INO_SNAPDIRS, object);
+
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (fid_gen > 1 || setgen != 0) {
+ dprintf("snapdir fid: fid_gen (%llu) and setgen "
+ "(%llu)\n", fid_gen, setgen);
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp));
+ }
+
+ ZFS_ENTER(zfsvfs);
+ /* A zero fid_gen means we are in the .zfs control directories */
+ if (fid_gen == 0 &&
+ (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
+ *ipp = zfsvfs->z_ctldir;
+ ASSERT(*ipp != NULL);
+ if (object == ZFSCTL_INO_SNAPDIR) {
+ VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp,
+ 0, kcred, NULL, NULL) == 0);
+ } else {
+ igrab(*ipp);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
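+ /*
+ * 'i' still holds sizeof (zfid->zf_gen) from the loop above, so
+ * the mask covers exactly the generation bytes stored in the fid.
+ */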
+ gen_mask = -1ULL >> (64 - 8 * i);
+
+ dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask);
+ if ((err = zfs_zget(zfsvfs, object, &zp))) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ /* Don't export xattr stuff */
+ if (zp->z_pflags & ZFS_XATTR) {
+ zrele(zp);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOENT));
+ }
+
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+ sizeof (uint64_t));
+ zp_gen = zp_gen & gen_mask;
+ if (zp_gen == 0)
+ zp_gen = 1;
+ if ((fid_gen == 0) && (zfsvfs->z_root == object))
+ fid_gen = zp_gen;
+ if (zp->z_unlinked || zp_gen != fid_gen) {
+ dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen,
+ fid_gen);
+ zrele(zp);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOENT));
+ }
+
+ *ipp = ZTOI(zp);
+ if (*ipp)
+ zfs_znode_update_vfs(ITOZ(*ipp));
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Block out VFS ops and close zfsvfs_t
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
+{
+ int error;
+
+ if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended. Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ int err, err2;
+ znode_t *zp;
+
+ ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
+ ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+ /*
+ * We already own this, so just update the objset_t, as the one we
+ * had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_config_exit(dp, FTAG);
+
+ err = zfsvfs_init(zfsvfs, os);
+ if (err != 0)
+ goto bail;
+
+ ds->ds_dir->dd_activity_cancelled = B_FALSE;
+ VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+ zfs_set_fuid_feature(zfsvfs);
+ zfsvfs->z_rollback_time = jiffies;
+
+ /*
+ * Attempt to re-establish all the active inodes with their
+ * dbufs. If a zfs_rezget() fails, then we unhash the inode
+ * and mark it stale. This prevents a collision if a new
+ * inode/object is created which must use the same inode
+ * number. The stale inode will be released when the
+ * VFS prunes the dentry holding the remaining references
+ * on the stale inode.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ err2 = zfs_rezget(zp);
+ if (err2) {
+ remove_inode_hash(ZTOI(zp));
+ zp->z_is_stale = B_TRUE;
+ }
+
+ /* see comment in zfs_suspend_fs() */
+ if (zp->z_suspended) {
+ zfs_zrele_async(zp);
+ zp->z_suspended = B_FALSE;
+ }
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) {
+ /*
+ * zfs_suspend_fs() could have interrupted freeing
+ * of dnodes. We need to restart this freeing so
+ * that we don't "leak" the space.
+ */
+ zfs_unlinked_drain(zfsvfs);
+ }
+
+ /*
+ * Most of the time zfs_suspend_fs is used for changing the contents
+ * of the underlying dataset. ZFS rollback and receive operations
+ * might create files for which negative dentries are present in
+ * the cache. Since walking the dcache would require a lot of GPL-only
+ * code duplication, it's much easier on these rather rare occasions
+ * just to flush the whole dcache for the given dataset/filesystem.
+ */
+ shrink_dcache_sb(zfsvfs->z_sb);
+
+bail:
+ if (err != 0)
+ zfsvfs->z_unmounted = B_TRUE;
+
+ /* release the VFS ops */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+
+ if (err != 0) {
+ /*
+ * Since we couldn't setup the sa framework, try to force
+ * unmount this file system.
+ */
+ if (zfsvfs->z_os)
+ (void) zfs_umount(zfsvfs->z_sb);
+ }
+ return (err);
+}
+
+/*
+ * Release VOPs and unmount a suspended filesystem.
+ */
+int
+zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
+ ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+ /*
+ * We already own this, so just hold and rele it to update the
+ * objset_t, as the one we had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_config_exit(dp, FTAG);
+ zfsvfs->z_os = os;
+
+ /* release the VOPs */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+
+ /*
+ * Try to force unmount this file system.
+ */
+ (void) zfs_umount(zfsvfs->z_sb);
+ zfsvfs->z_unmounted = B_TRUE;
+ return (0);
+}
+
+/*
+ * Automounted snapshots rely on periodic revalidation
+ * to defer their automatic unmounting while they remain in use.
+ */
+
+inline void
+zfs_exit_fs(zfsvfs_t *zfsvfs)
+{
+ if (!zfsvfs->z_issnap)
+ return;
+
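+ /*
+ * Push the snapshot's auto-unmount deadline forward, at most once
+ * per half of the expiry interval.
+ */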
+ if (time_after(jiffies, zfsvfs->z_snap_defer_time +
+ MAX(zfs_expire_snapshot * HZ / 2, HZ))) {
+ zfsvfs->z_snap_defer_time = jiffies;
+ zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa,
+ dmu_objset_id(zfsvfs->z_os),
+ zfs_expire_snapshot);
+ }
+}
+
+int
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
+{
+ int error;
+ objset_t *os = zfsvfs->z_os;
+ dmu_tx_t *tx;
+
+ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
+ return (SET_ERROR(EINVAL));
+
+ if (newvers < zfsvfs->z_version)
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_spa_version_map(newvers) >
+ spa_version(dmu_objset_spa(zfsvfs->z_os)))
+ return (SET_ERROR(ENOTSUP));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ ZFS_SA_ATTRS);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &newvers, tx);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ return (error);
+ }
+
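+ /*
+ * When upgrading to a version that supports system attributes,
+ * create the SA master node and register the SA upgrade callback.
+ */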
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ uint64_t sa_obj;
+
+ ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+ SPA_VERSION_SA);
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT0(error);
+
+ VERIFY(0 == sa_set_sa_object(os, sa_obj));
+ sa_register_update_callback(os, zfs_sa_upgrade);
+ }
+
+ spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
+ "from %llu to %llu", zfsvfs->z_version, newvers);
+
+ dmu_tx_commit(tx);
+
+ zfsvfs->z_version = newvers;
+ os->os_version = newvers;
+
+ zfs_set_fuid_feature(zfsvfs);
+
+ return (0);
+}
+
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+ uint64_t *cached_copy = NULL;
+
+ /*
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
+ */
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION)
+ pname = ZPL_VERSION_STR;
+ else
+ pname = zfs_prop_to_name(prop);
+
+ if (os != NULL) {
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+ }
+
+ if (error == ENOENT) {
+ /* No value set, use the default value */
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ *value = ZPL_VERSION;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ case ZFS_PROP_UTF8ONLY:
+ *value = 0;
+ break;
+ case ZFS_PROP_CASE:
+ *value = ZFS_CASE_SENSITIVE;
+ break;
+ case ZFS_PROP_ACLTYPE:
+ *value = ZFS_ACLTYPE_OFF;
+ break;
+ default:
+ return (error);
+ }
+ error = 0;
+ }
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
+ return (error);
+}
+
+/*
+ * Return true if the corresponding vfs's unmounted flag is set.
+ * Otherwise return false.
+ * If this function returns true we know VFS unmount has been initiated.
+ */
+boolean_t
+zfs_get_vfs_flag_unmounted(objset_t *os)
+{
+ zfsvfs_t *zfvp;
+ boolean_t unmounted = B_FALSE;
+
+ ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
+
+ mutex_enter(&os->os_user_ptr_lock);
+ zfvp = dmu_objset_get_user(os);
+ if (zfvp != NULL && zfvp->z_unmounted)
+ unmounted = B_TRUE;
+ mutex_exit(&os->os_user_ptr_lock);
+
+ return (unmounted);
+}
+
+/*ARGSUSED*/
+void
+zfsvfs_update_fromname(const char *oldname, const char *newname)
+{
+ /*
+ * We don't need to do anything here; the devname is always current by
+ * virtue of zfsvfs->z_sb->s_op->show_devname.
+ */
+}
+
+void
+zfs_init(void)
+{
+ zfsctl_init();
+ zfs_znode_init();
+ dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
+ register_filesystem(&zpl_fs_type);
+}
+
+void
+zfs_fini(void)
+{
+ /*
+ * We don't use taskq_wait_outstanding() here because
+ * zpl_posix_acl_free might add more tasks.
+ */
+ taskq_wait(system_delay_taskq);
+ taskq_wait(system_taskq);
+ unregister_filesystem(&zpl_fs_type);
+ zfs_znode_fini();
+ zfsctl_fini();
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_suspend_fs);
+EXPORT_SYMBOL(zfs_resume_fs);
+EXPORT_SYMBOL(zfs_set_version);
+EXPORT_SYMBOL(zfsvfs_create);
+EXPORT_SYMBOL(zfsvfs_free);
+EXPORT_SYMBOL(zfs_is_readonly);
+EXPORT_SYMBOL(zfs_domount);
+EXPORT_SYMBOL(zfs_preumount);
+EXPORT_SYMBOL(zfs_umount);
+EXPORT_SYMBOL(zfs_remount);
+EXPORT_SYMBOL(zfs_statvfs);
+EXPORT_SYMBOL(zfs_vget);
+EXPORT_SYMBOL(zfs_prune);
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
new file mode 100644
index 000000000000..84c33b541ea3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -0,0 +1,4010 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/vmsystm.h>
+#include <sys/atomic.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/sid.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_rlock.h>
+#include <sys/cred.h>
+#include <sys/zpl.h>
+#include <sys/zil.h>
+#include <sys/sa_impl.h>
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work. To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory. The example below illustrates the following Big Rules:
+ *
+ * (1) A check must be made in each zfs thread for a mounted file system.
+ * This is done while avoiding races by using ZFS_ENTER(zfsvfs).
+ * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
+ * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
+ * can return EIO from the calling function.
+ *
+ * (2) zrele() should always be the last thing except for zil_commit() (if
+ * necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the
+ * last reference, the vnode/znode can be freed, so the zp may point to
+ * freed memory. Second, the last reference will call zfs_zinactive(),
+ * which may induce a lot of work -- pushing cached pages (which acquires
+ * range locks) and syncing out cached atime changes. Third,
+ * zfs_zinactive() may require a new tx, which could deadlock the system
+ * if you were already holding one. This deadlock occurs because the tx
+ * currently being operated on prevents a txg from syncing, which
+ * prevents the new tx from progressing, resulting in a deadlock. If you
+ * must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
+ * is a synonym for zrele().
+ *
+ * (3) All range locks must be grabbed before calling dmu_tx_assign(),
+ * as they can span dmu_tx_assign() calls.
+ *
+ * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ * dmu_tx_assign(). This is critical because we don't want to block
+ * while holding locks.
+ *
+ * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
+ * reduces lock contention and CPU usage when we must wait (note that if
+ * throughput is constrained by the storage, nearly every transaction
+ * must wait).
+ *
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing
+ * to use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
+ * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ * to indicate that this operation has already called dmu_tx_wait().
+ * This will ensure that we don't retry forever, waiting a short bit
+ * each time.
+ *
+ * (5) If the operation succeeded, generate the intent log entry for it
+ * before dropping locks. This ensures that the ordering of events
+ * in the intent log matches the order in which they actually occurred.
+ * During ZIL replay the zfs_log_* functions will update the sequence
+ * number to indicate the zil transaction has replayed.
+ *
+ * (6) At the end of each vnode op, the DMU tx must always commit,
+ * regardless of whether there were any errors.
+ *
+ * (7) After dropping all locks, invoke zil_commit(zilog, foid)
+ * to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * top:
+ * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab())
+ * rw_enter(...); // grab any other locks you need
+ * tx = dmu_tx_create(...); // get DMU tx
+ * dmu_tx_hold_*(); // hold each object you might modify
+ * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ * if (error) {
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * zrele(...); // release held znodes
+ * if (error == ERESTART) {
+ * waited = B_TRUE;
+ * dmu_tx_wait(tx);
+ * dmu_tx_abort(tx);
+ * goto top;
+ * }
+ * dmu_tx_abort(tx); // abort DMU tx
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // really out of space
+ * }
+ * error = do_real_work(); // do whatever this VOP does
+ * if (error == 0)
+ * zfs_log_*(...); // on success, make ZIL entry
+ * dmu_tx_commit(tx); // commit DMU tx -- error or not
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * zrele(...); // release held znodes
+ * zil_commit(zilog, foid); // synchronous when necessary
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // done, report error
+ */
+
+/*
+ * Virus scanning is unsupported. It would be possible to add a hook
+ * here to perform the required virus scan. This could be done
+ * entirely in the kernel or potentially as an update to invoke a
+ * scanning utility.
+ */
+static int
+zfs_vscan(struct inode *ip, cred_t *cr, int async)
+{
+ return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* Honor ZFS_APPENDONLY file attribute */
+ if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
+ ((flag & O_APPEND) == 0)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /* Virus scan eligible files on open */
+ if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
+ if (zfs_vscan(ip, cr, 0) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+ }
+
+ /* Keep a count of the synchronous opens in the znode */
+ if (flag & O_SYNC)
+ atomic_inc_32(&zp->z_sync_cnt);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_close(struct inode *ip, int flag, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* Decrement the synchronous opens in the znode */
+ if (flag & O_SYNC)
+ atomic_dec_32(&zp->z_sync_cnt);
+
+ if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
+ VERIFY(zfs_vscan(ip, cr, 1) == 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+#if defined(_KERNEL)
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Write: If we find a memory mapped page, we write to *both*
+ * the page and the dmu buffer.
+ */
+void
+update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
+{
+ struct inode *ip = ZTOI(zp);
+ struct address_space *mp = ip->i_mapping;
+ struct page *pp;
+ uint64_t nbytes;
+ int64_t off;
+ void *pb;
+
+ off = start & (PAGE_SIZE-1);
+ for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
+ nbytes = MIN(PAGE_SIZE - off, len);
+
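+ /*
+ * If this range is resident in the page cache, copy the freshly
+ * written data from the DMU back into the page so that mmap()ed
+ * views of the file stay coherent.
+ */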
+ pp = find_lock_page(mp, start >> PAGE_SHIFT);
+ if (pp) {
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ pb = kmap(pp);
+ (void) dmu_read(os, zp->z_id, start + off, nbytes,
+ pb + off, DMU_READ_PREFETCH);
+ kunmap(pp);
+
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ mark_page_accessed(pp);
+ SetPageUptodate(pp);
+ ClearPageError(pp);
+ unlock_page(pp);
+ put_page(pp);
+ }
+
+ len -= nbytes;
+ off = 0;
+ }
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Read: We "read" preferentially from memory mapped pages,
+ * otherwise we fall back to the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ * the file is memory mapped.
+ */
+int
+mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
+{
+ struct inode *ip = ZTOI(zp);
+ struct address_space *mp = ip->i_mapping;
+ struct page *pp;
+ int64_t start, off;
+ uint64_t bytes;
+ int len = nbytes;
+ int error = 0;
+ void *pb;
+
+ start = uio->uio_loffset;
+ off = start & (PAGE_SIZE-1);
+ for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
+ bytes = MIN(PAGE_SIZE - off, len);
+
+ pp = find_lock_page(mp, start >> PAGE_SHIFT);
+ if (pp) {
+ ASSERT(PageUptodate(pp));
+ unlock_page(pp);
+
+ pb = kmap(pp);
+ error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
+ kunmap(pp);
+
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ mark_page_accessed(pp);
+ put_page(pp);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, bytes);
+ }
+
+ len -= bytes;
+ off = 0;
+ if (error)
+ break;
+ }
+ return (error);
+}
+#endif /* _KERNEL */
+
+unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
+
+/*
+ * Write the bytes to a file.
+ *
+ * IN: zp - znode of file to be written to
+ * data - bytes to write
+ * len - number of bytes to write
+ * pos - offset to start writing at
+ *
+ * OUT: resid - remaining bytes to write
+ *
+ * RETURN: 0 if success
+ * positive error code if failure. EIO is returned
+ * for a short write when residp isn't provided.
+ *
+ * Timestamps:
+ * zp - ctime|mtime updated if byte count > 0
+ */
+int
+zfs_write_simple(znode_t *zp, const void *data, size_t len,
+ loff_t pos, size_t *residp)
+{
+ fstrans_cookie_t cookie;
+ int error;
+
+ struct iovec iov;
+ iov.iov_base = (void *)data;
+ iov.iov_len = len;
+
+ zfs_uio_t uio;
+ zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
+
+ cookie = spl_fstrans_mark();
+ error = zfs_write(zp, &uio, 0, kcred);
+ spl_fstrans_unmark(cookie);
+
+ if (error == 0) {
+ if (residp != NULL)
+ *residp = zfs_uio_resid(&uio);
+ else if (zfs_uio_resid(&uio) != 0)
+ error = SET_ERROR(EIO);
+ }
+
+ return (error);
+}
+
+void
+zfs_zrele_async(znode_t *zp)
+{
+ struct inode *ip = ZTOI(zp);
+ objset_t *os = ITOZSB(ip)->z_os;
+
+ ASSERT(atomic_read(&ip->i_count) > 0);
+ ASSERT(os != NULL);
+
+ /*
+ * If decrementing the count would put us at 0, we can't do it inline
+ * here, because that would be synchronous. Instead, dispatch an iput
+ * to run later.
+ *
+ * For more information on the dangers of a synchronous iput, see the
+ * header comment of this file.
+ */
+ if (!atomic_add_unless(&ip->i_count, -1, 1)) {
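+ /*
+ * atomic_add_unless() refused to decrement, so i_count was 1 and
+ * we hold the last reference; hand the final iput to the taskq.
+ */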
+ VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
+ (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
+ }
+}
+
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held inode reference for it.
+ *
+ * IN: zdp - znode of directory to search.
+ * nm - name of entry to lookup.
+ * flags - LOOKUP_XATTR set if looking for an attribute.
+ * cr - credentials of caller.
+ * direntflags - directory lookup flags
+ * realpnp - returned pathname.
+ *
+ * OUT: zpp - znode of located entry, NULL if not found.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * NA
+ */
+/* ARGSUSED */
+int
+zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
+ int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zdp);
+ int error = 0;
+
+ /*
+ * Fast path lookup, however we must skip DNLC lookup
+ * for case folding or normalizing lookups because the
+ * DNLC code only stores the passed in name. This means
+ * creating 'a' and removing 'A' on a case insensitive
+ * file system would work, but DNLC still thinks 'a'
+ * exists and won't let you create it again on the next
+ * pass through fast path.
+ */
+ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
+
+ if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
+ return (SET_ERROR(ENOTDIR));
+ } else if (zdp->z_sa_hdl == NULL) {
+ return (SET_ERROR(EIO));
+ }
+
+ if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
+ error = zfs_fastaccesschk_execute(zdp, cr);
+ if (!error) {
+ *zpp = zdp;
+ zhold(*zpp);
+ return (0);
+ }
+ return (error);
+ }
+ }
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zdp);
+
+ *zpp = NULL;
+
+ if (flags & LOOKUP_XATTR) {
+ /*
+ * We don't allow recursive attributes.
+ * Maybe someday we will.
+ */
+ if (zdp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Do we have permission to get into the attribute directory?
+ */
+
+ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
+ B_FALSE, cr))) {
+ zrele(*zpp);
+ *zpp = NULL;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTDIR));
+ }
+
+ /*
+ * Check accessibility of directory.
+ */
+
+ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
+ if ((error == 0) && (*zpp))
+ zfs_znode_update_vfs(*zpp);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory. If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error. Return the ip of the created or trunc'd file.
+ *
+ * IN: dzp - znode of directory to put new file entry in.
+ * name - name of new file entry.
+ * vap - attributes of new file.
+ * excl - flag indicating exclusive or non-exclusive mode.
+ * mode - mode to open file with.
+ * cr - credentials of caller.
+ * flag - file flag.
+ * vsecp - ACL to be set
+ *
+ * OUT: zpp - znode of created or trunc'd entry.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dzp - ctime|mtime updated if new entry created
+ * zp - ctime|mtime always, atime if new
+ */
+
+/* ARGSUSED */
+int
+zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
+ int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zilog_t *zilog;
+ objset_t *os;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ uid_t uid;
+ gid_t gid;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ boolean_t have_acl = B_FALSE;
+ boolean_t waited = B_FALSE;
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+ * make sure the file system is at the proper version.
+ */
+
+ gid = crgetgid(cr);
+ uid = crgetuid(cr);
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ os = zfsvfs->z_os;
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (vap->va_mask & ATTR_XVATTR) {
+ if ((error = secpolicy_xvattr((xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_mode)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+top:
+ *zpp = NULL;
+ if (*name == '\0') {
+ /*
+ * Null component name refers to the directory itself.
+ */
+ zhold(dzp);
+ zp = dzp;
+ dl = NULL;
+ error = 0;
+ } else {
+ /* possible igrab(zp) */
+ int zflg = 0;
+
+ if (flag & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+ NULL, NULL);
+ if (error) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ if (strcmp(name, "..") == 0)
+ error = SET_ERROR(EISDIR);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if (zp == NULL) {
+ uint64_t txtype;
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ goto out;
+ }
+
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
+
+ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
+ have_acl = B_TRUE;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+ projid = zfs_inherit_projid(dzp);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ error = dmu_tx_assign(tx,
+ (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ error = zfs_link_create(dl, zp, tx, ZNEW);
+ if (error != 0) {
+ /*
+ * Since we failed to add the directory entry for it,
+ * delete the newly created dnode.
+ */
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ if (flag & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+ vsecp, acl_ids.z_fuidp, vap);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+ } else {
+ int aflags = (flag & O_APPEND) ? V_APPEND : 0;
+
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ have_acl = B_FALSE;
+
+ /*
+ * A directory entry already exists for this name.
+ */
+ /*
+ * Can't truncate an existing file if in exclusive mode.
+ */
+ if (excl) {
+ error = SET_ERROR(EEXIST);
+ goto out;
+ }
+ /*
+ * Can't open a directory for writing.
+ */
+ if (S_ISDIR(ZTOI(zp)->i_mode)) {
+ error = SET_ERROR(EISDIR);
+ goto out;
+ }
+ /*
+ * Verify requested access to file.
+ */
+ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
+ goto out;
+ }
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_seq++;
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * Truncate regular files if requested.
+ */
+ if (S_ISREG(ZTOI(zp)->i_mode) &&
+ (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
+ /* we can't hold any locks when calling zfs_freesp() */
+ if (dl) {
+ zfs_dirent_unlock(dl);
+ dl = NULL;
+ }
+ error = zfs_freesp(zp, 0, 0, mode, TRUE);
+ }
+ }
+out:
+
+ if (dl)
+ zfs_dirent_unlock(dl);
+
+ if (error) {
+ if (zp)
+ zrele(zp);
+ } else {
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+ *zpp = zp;
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
+ int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+ znode_t *zp = NULL, *dzp = ITOZ(dip);
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ objset_t *os;
+ dmu_tx_t *tx;
+ int error;
+ uid_t uid;
+ gid_t gid;
+ zfs_acl_ids_t acl_ids;
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ boolean_t fuid_dirtied;
+ boolean_t have_acl = B_FALSE;
+ boolean_t waited = B_FALSE;
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+ * make sure the file system is at the proper version.
+ */
+
+ gid = crgetgid(cr);
+ uid = crgetuid(cr);
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ os = zfsvfs->z_os;
+
+ if (vap->va_mask & ATTR_XVATTR) {
+ if ((error = secpolicy_xvattr((xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_mode)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+top:
+ *ipp = NULL;
+
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ goto out;
+ }
+
+ if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
+ have_acl = B_TRUE;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+ projid = zfs_inherit_projid(dzp);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ /* Add to unlinked set */
+ zp->z_unlinked = B_TRUE;
+ zfs_unlinked_add(zp, tx);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+out:
+
+ if (error) {
+ if (zp)
+ zrele(zp);
+ } else {
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+ *ipp = ZTOI(zp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ * IN: dzp - znode of directory to remove entry from.
+ * name - name of entry to remove.
+ * cr - credentials of caller.
+ * flags - case flags.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dzp - ctime|mtime
+ * ip - ctime (if nlink > 0)
+ */
+
+uint64_t null_xattr = 0;
+
+/*ARGSUSED*/
+int
+zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
+{
+ znode_t *zp;
+ znode_t *xzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zilog_t *zilog;
+ uint64_t acl_obj, xattr_obj;
+ uint64_t xattr_obj_unlinked = 0;
+ uint64_t obj = 0;
+ uint64_t links;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ boolean_t may_delete_now, delete_now = FALSE;
+ boolean_t unlinked, toobig = FALSE;
+ uint64_t txtype;
+ pathname_t *realnmp = NULL;
+ pathname_t realnm;
+ int error;
+ int zflg = ZEXISTS;
+ boolean_t waited = B_FALSE;
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (flags & FIGNORECASE) {
+ zflg |= ZCILOOK;
+ pn_alloc(&realnm);
+ realnmp = &realnm;
+ }
+
+top:
+ xattr_obj = 0;
+ xzp = NULL;
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+ NULL, realnmp))) {
+ if (realnmp)
+ pn_free(realnmp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ /*
+ * Need to use rmdir for removing directories.
+ */
+ if (S_ISDIR(ZTOI(zp)->i_mode)) {
+ error = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ mutex_enter(&zp->z_lock);
+ may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
+ !(zp->z_is_mapped);
+ mutex_exit(&zp->z_lock);
+
+ /*
+ * We may delete the znode now, or we may put it in the unlinked set;
+ * it depends on whether we're the last link, and on whether there are
+ * other holds on the inode. So we dmu_tx_hold() the right things to
+ * allow for either case.
+ */
+ obj = zp->z_id;
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ if (may_delete_now) {
+ toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
+ /* if the file is too big, only hold_free a token amount */
+ dmu_tx_hold_free(tx, zp->z_id, 0,
+ (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
+ }
+
+ /* are there any extended attributes? */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT0(error);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ }
+
+ mutex_enter(&zp->z_lock);
+ if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+ mutex_exit(&zp->z_lock);
+
+ /* charge as an update -- would be nice not to charge at all */
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ /*
+ * Mark this transaction as typically resulting in a net free of space
+ */
+ dmu_tx_mark_netfree(tx);
+
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ zrele(zp);
+ if (xzp)
+ zrele(xzp);
+ goto top;
+ }
+ if (realnmp)
+ pn_free(realnmp);
+ dmu_tx_abort(tx);
+ zrele(zp);
+ if (xzp)
+ zrele(xzp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Remove the directory entry.
+ */
+ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (unlinked) {
+ /*
+ * Hold z_lock so that we can make sure that the ACL obj
+ * hasn't changed. Could have been deleted due to
+ * zfs_sa_upgrade().
+ */
+ mutex_enter(&zp->z_lock);
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
+ delete_now = may_delete_now && !toobig &&
+ atomic_read(&ZTOI(zp)->i_count) == 1 &&
+ !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked &&
+ zfs_external_acl(zp) == acl_obj;
+ }
+
+ if (delete_now) {
+ if (xattr_obj_unlinked) {
+ ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_unlinked = B_TRUE;
+ clear_nlink(ZTOI(xzp));
+ links = 0;
+ error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &links, sizeof (links), tx);
+ ASSERT3U(error, ==, 0);
+ mutex_exit(&xzp->z_lock);
+ zfs_unlinked_add(xzp, tx);
+
+ if (zp->z_is_sa)
+ error = sa_remove(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), tx);
+ else
+ error = sa_update(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), &null_xattr,
+ sizeof (uint64_t), tx);
+ ASSERT0(error);
+ }
+ /*
+ * Add to the unlinked set because a new reference could be
+ * taken concurrently resulting in a deferred destruction.
+ */
+ zfs_unlinked_add(zp, tx);
+ mutex_exit(&zp->z_lock);
+ } else if (unlinked) {
+ mutex_exit(&zp->z_lock);
+ zfs_unlinked_add(zp, tx);
+ }
+
+ txtype = TX_REMOVE;
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
+
+ dmu_tx_commit(tx);
+out:
+ if (realnmp)
+ pn_free(realnmp);
+
+ zfs_dirent_unlock(dl);
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+
+ if (delete_now)
+ zrele(zp);
+ else
+ zfs_zrele_async(zp);
+
+ if (xzp) {
+ zfs_znode_update_vfs(xzp);
+ zfs_zrele_async(xzp);
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Create a new directory and insert it into dzp using the name
+ * provided. Return a pointer to the inserted directory.
+ *
+ * IN: dzp - znode of directory to add subdir to.
+ * dirname - name of new directory.
+ * vap - attributes of new directory.
+ * cr - credentials of caller.
+ * flags - case flags.
+ * vsecp - ACL to be set
+ *
+ * OUT: zpp - znode of created directory.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dzp - ctime|mtime updated
+ * zpp - ctime|mtime|atime updated
+ */
+/*ARGSUSED*/
+int
+zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
+ cred_t *cr, int flags, vsecattr_t *vsecp)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zilog_t *zilog;
+ zfs_dirlock_t *dl;
+ uint64_t txtype;
+ dmu_tx_t *tx;
+ int error;
+ int zf = ZNEW;
+ uid_t uid;
+ gid_t gid = crgetgid(cr);
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ boolean_t waited = B_FALSE;
+
+ ASSERT(S_ISDIR(vap->va_mode));
+
+ /*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure the file system is at the proper version.
+ */
+
+ uid = crgetuid(cr);
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ if (dirname == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (dzp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(dirname,
+ strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+ if (flags & FIGNORECASE)
+ zf |= ZCILOOK;
+
+ if (vap->va_mask & ATTR_XVATTR) {
+ if ((error = secpolicy_xvattr((xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_mode)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+ vsecp, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ /*
+ * First make sure the new directory doesn't exist.
+ *
+ * Existence is checked first to make sure we don't return
+ * EACCES instead of EEXIST which can cause some applications
+ * to fail.
+ */
+top:
+ *zpp = NULL;
+
+ if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
+ NULL, NULL))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ /*
+ * Add a new entry to the directory.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
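+	/*
+	 * Try to assign the transaction without blocking.  If it would
+	 * have to wait (ERESTART), drop the directory lock, wait for the
+	 * condition to clear, and retry from the top; the retry is exempt
+	 * from the write throttle (TXG_NOTHROTTLE).
+	 */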
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ error = zfs_link_create(dl, zp, tx, ZNEW);
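+	/*
+	 * If the entry could not be inserted, destroy the just-created
+	 * znode and drop it from the inode hash; the held reference is
+	 * released in the error path below.
+	 */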
+ if (error != 0) {
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ goto out;
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ *zpp = zp;
+
+ txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+ acl_ids.z_fuidp, vap);
+
+out:
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ if (error != 0) {
+ zrele(zp);
+ } else {
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove a directory subdir entry. If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dzp - znode of directory to remove from.
+ * name - name of directory to be removed.
+ *	cwd	- znode of current working directory.
+ * cr - credentials of caller.
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
+ int flags)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zilog_t *zilog;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ int zflg = ZEXISTS;
+ boolean_t waited = B_FALSE;
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+top:
+ zp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+ NULL, NULL))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ if (!S_ISDIR(ZTOI(zp)->i_mode)) {
+ error = SET_ERROR(ENOTDIR);
+ goto out;
+ }
+
+ if (zp == cwd) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /*
+ * Grab a lock on the directory to make sure that no one is
+ * trying to add (or lookup) entries while we are removing it.
+ */
+ rw_enter(&zp->z_name_lock, RW_WRITER);
+
+ /*
+ * Grab a lock on the parent pointer to make sure we play well
+ * with the treewalk and directory rename code.
+ */
+ rw_enter(&zp->z_parent_lock, RW_WRITER);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ zrele(zp);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zrele(zp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
+
+ if (error == 0) {
+ uint64_t txtype = TX_RMDIR;
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
+ B_FALSE);
+ }
+
+ dmu_tx_commit(tx);
+
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+out:
+ zfs_dirent_unlock(dl);
+
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+ zrele(zp);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Read directory entries from the given directory cursor position and emit
+ * name and position for each entry.
+ *
+ * IN: ip - inode of directory to read.
+ * ctx - directory entry context.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap are always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+int
+zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ objset_t *os;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ int error;
+ uint8_t prefetch;
+ uint8_t type;
+ int done = 0;
+ uint64_t parent;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent))) != 0)
+ goto out;
+
+ /*
+ * Quit if directory has been removed (posix)
+ */
+ if (zp->z_unlinked)
+ goto out;
+
+ error = 0;
+ os = zfsvfs->z_os;
+ offset = ctx->pos;
+ prefetch = zp->z_zn_prefetch;
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+ }
+
+ /*
+ * Transform to file-system independent format
+ */
+ while (!done) {
+ uint64_t objnum;
+ /*
+ * Special case `.', `..', and `.zfs'.
+ */
+ if (offset == 0) {
+ (void) strcpy(zap.za_name, ".");
+ zap.za_normalization_conflict = 0;
+ objnum = zp->z_id;
+ type = DT_DIR;
+ } else if (offset == 1) {
+ (void) strcpy(zap.za_name, "..");
+ zap.za_normalization_conflict = 0;
+ objnum = parent;
+ type = DT_DIR;
+ } else if (offset == 2 && zfs_show_ctldir(zp)) {
+ (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+ zap.za_normalization_conflict = 0;
+ objnum = ZFSCTL_INO_ROOT;
+ type = DT_DIR;
+ } else {
+ /*
+ * Grab next entry.
+ */
+ if ((error = zap_cursor_retrieve(&zc, &zap))) {
+ if (error == ENOENT)
+ break;
+ else
+ goto update;
+ }
+
+ /*
+ * Allow multiple entries provided the first entry is
+ * the object id. Non-zpl consumers may safely make
+ * use of the additional space.
+ *
+ * XXX: This should be a feature flag for compatibility
+ */
+ if (zap.za_integer_length != 8 ||
+ zap.za_num_integers == 0) {
+ cmn_err(CE_WARN, "zap_readdir: bad directory "
+ "entry, obj = %lld, offset = %lld, "
+ "length = %d, num = %lld\n",
+ (u_longlong_t)zp->z_id,
+ (u_longlong_t)offset,
+ zap.za_integer_length,
+ (u_longlong_t)zap.za_num_integers);
+ error = SET_ERROR(ENXIO);
+ goto update;
+ }
+
+ objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+ type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+ }
+
+ done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
+ objnum, type);
+ if (done)
+ break;
+
+ /* Prefetch znode */
+ if (prefetch) {
+ dmu_prefetch(os, objnum, 0, 0, 0,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+
+ /*
+ * Move to the next entry, fill in the previous offset.
+ */
+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+ zap_cursor_advance(&zc);
+ offset = zap_cursor_serialize(&zc);
+ } else {
+ offset += 1;
+ }
+ ctx->pos = offset;
+ }
+ zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+update:
+ zap_cursor_fini(&zc);
+ if (error == ENOENT)
+ error = 0;
+out:
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Get the basic file attributes and place them in the provided kstat
+ * structure. The inode is assumed to be the authoritative source
+ * for most of the attributes. However, the znode currently has the
+ * authoritative atime, blksize, and block count.
+ *
+ * IN: ip - inode of file.
+ *
+ * OUT: sp - kstat values.
+ *
+ * RETURN: 0 (always succeeds)
+ */
+/* ARGSUSED */
+int
+zfs_getattr_fast(struct inode *ip, struct kstat *sp)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint32_t blksize;
+ u_longlong_t nblocks;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ mutex_enter(&zp->z_lock);
+
+ generic_fillattr(ip, sp);
+ /*
+ * +1 link count for root inode with visible '.zfs' directory.
+ */
+ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
+ if (sp->nlink < ZFS_LINK_MAX)
+ sp->nlink++;
+
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+ sp->blksize = blksize;
+ sp->blocks = nblocks;
+
+ if (unlikely(zp->z_blksz == 0)) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ sp->blksize = zfsvfs->z_max_blksz;
+ }
+
+ mutex_exit(&zp->z_lock);
+
+ /*
+	 * Required to prevent the NFS client from detecting different inode
+	 * numbers for the snapshot root dentry before and after the snapshot
+	 * is mounted.
+ */
+ if (zfsvfs->z_issnap) {
+ if (ip->i_sb->s_root->d_inode == ip)
+ sp->ino = ZFSCTL_INO_SNAPDIRS -
+ dmu_objset_id(zfsvfs->z_os);
+ }
+
+ ZFS_EXIT(zfsvfs);
+
+ return (0);
+}
+
+/*
+ * For the operation of changing a file's user/group/project, we need to
+ * handle not only the main object that is assigned to the file directly,
+ * but also the objects that are used by the file via its hidden xattr
+ * directory.
+ *
+ * Because the xattr directory may contain many EA entries, it may be
+ * impossible to change all of them within the transaction that changes the
+ * main object's user/group/project attributes.  Instead we change them one
+ * by one via separate, independent transactions.  This is not an ideal
+ * solution, but we have no better one yet.
+ */
+static int
+zfs_setattr_dir(znode_t *dzp)
+{
+ struct inode *dxip = ZTOI(dzp);
+ struct inode *xip = NULL;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ objset_t *os = zfsvfs->z_os;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ zfs_dirlock_t *dl;
+ znode_t *zp = NULL;
+ dmu_tx_t *tx = NULL;
+ uint64_t uid, gid;
+ sa_bulk_attr_t bulk[4];
+ int count;
+ int err;
+
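+	/*
+	 * Walk every entry in the xattr directory.  For each xattr znode
+	 * whose uid, gid, or project ID differs from the parent's, copy
+	 * the parent's values in its own small transaction; entries that
+	 * disappear concurrently (ENOENT) are simply skipped.
+	 */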
+ zap_cursor_init(&zc, os, dzp->z_id);
+ while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
+ count = 0;
+ if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
+ err = ENXIO;
+ break;
+ }
+
+ err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
+ ZEXISTS, NULL, NULL);
+ if (err == ENOENT)
+ goto next;
+ if (err)
+ break;
+
+ xip = ZTOI(zp);
+ if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
+ KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
+ zp->z_projid == dzp->z_projid)
+ goto next;
+
+ tx = dmu_tx_create(os);
+ if (!(zp->z_pflags & ZFS_PROJID))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
+ break;
+
+ mutex_enter(&dzp->z_lock);
+
+ if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
+ xip->i_uid = dxip->i_uid;
+ uid = zfs_uid_read(dxip);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &uid, sizeof (uid));
+ }
+
+ if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
+ xip->i_gid = dxip->i_gid;
+ gid = zfs_gid_read(dxip);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &gid, sizeof (gid));
+ }
+
+ if (zp->z_projid != dzp->z_projid) {
+ if (!(zp->z_pflags & ZFS_PROJID)) {
+ zp->z_pflags |= ZFS_PROJID;
+ SA_ADD_BULK_ATTR(bulk, count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
+ sizeof (zp->z_pflags));
+ }
+
+ zp->z_projid = dzp->z_projid;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
+ NULL, &zp->z_projid, sizeof (zp->z_projid));
+ }
+
+ mutex_exit(&dzp->z_lock);
+
+ if (likely(count > 0)) {
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ tx = NULL;
+ if (err != 0 && err != ENOENT)
+ break;
+
+next:
+ if (zp) {
+ zrele(zp);
+ zp = NULL;
+ zfs_dirent_unlock(dl);
+ }
+ zap_cursor_advance(&zc);
+ }
+
+ if (tx)
+ dmu_tx_abort(tx);
+ if (zp) {
+ zrele(zp);
+ zfs_dirent_unlock(dl);
+ }
+ zap_cursor_fini(&zc);
+
+ return (err == ENOENT ? 0 : err);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ * IN: zp - znode of file to be modified.
+ * vap - new attribute values.
+ * If ATTR_XVATTR set, then optional attrs are being set
+ * flags - ATTR_UTIME set if non-default time values provided.
+ * - ATTR_NOACLCHECK (CIFS context only).
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+int
+zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
+{
+ struct inode *ip;
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ objset_t *os = zfsvfs->z_os;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ vattr_t oldva;
+ xvattr_t *tmpxvattr;
+ uint_t mask = vap->va_mask;
+ uint_t saved_mask = 0;
+ int trim_mask = 0;
+ uint64_t new_mode;
+ uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid;
+ uint64_t xattr_obj;
+ uint64_t mtime[2], ctime[2], atime[2];
+ uint64_t projid = ZFS_INVALID_PROJID;
+ znode_t *attrzp;
+ int need_policy = FALSE;
+ int err, err2 = 0;
+ zfs_fuid_info_t *fuidp = NULL;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap;
+ zfs_acl_t *aclp;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
+ boolean_t handle_eadir = B_FALSE;
+ sa_bulk_attr_t *bulk, *xattr_bulk;
+ int count = 0, xattr_count = 0, bulks = 8;
+
+ if (mask == 0)
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ ip = ZTOI(zp);
+
+ /*
+	 * If this is an xvattr_t, then get a pointer to the structure of
+ * optional attributes. If this is NULL, then we have a vattr_t.
+ */
+ xoap = xva_getxoptattr(xvap);
+ if (xoap != NULL && (mask & ATTR_XVATTR)) {
+ if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ if (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ projid = xoap->xoa_projid;
+ if (unlikely(projid == ZFS_INVALID_PROJID)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
+ projid = ZFS_INVALID_PROJID;
+ else
+ need_policy = TRUE;
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
+ (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
+ (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTSUP));
+ }
+ }
+
+ zilog = zfsvfs->z_log;
+
+ /*
+	 * Make sure that if an ephemeral uid/gid or an xvattr is specified,
+	 * the file system is at the proper version level.
+ */
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+ ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+ (mask & ATTR_XVATTR))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
+ xva_init(tmpxvattr);
+
+ bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
+ xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
+
+ /*
+	 * For immutable files, only the immutable bit and atime may be altered.
+ */
+ if ((zp->z_pflags & ZFS_IMMUTABLE) &&
+ ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
+ ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+ err = SET_ERROR(EPERM);
+ goto out3;
+ }
+
+ if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
+ err = SET_ERROR(EPERM);
+ goto out3;
+ }
+
+ /*
+	 * Verify that the timestamps don't overflow 32 bits.
+	 * ZFS can handle large timestamps, but 32-bit syscalls can't
+	 * handle times greater than 2039.  This check should be removed
+ * once large timestamps are fully supported.
+ */
+ if (mask & (ATTR_ATIME | ATTR_MTIME)) {
+ if (((mask & ATTR_ATIME) &&
+ TIMESPEC_OVERFLOW(&vap->va_atime)) ||
+ ((mask & ATTR_MTIME) &&
+ TIMESPEC_OVERFLOW(&vap->va_mtime))) {
+ err = SET_ERROR(EOVERFLOW);
+ goto out3;
+ }
+ }
+
+top:
+ attrzp = NULL;
+ aclp = NULL;
+
+ /* Can this be moved to before the top label? */
+ if (zfs_is_readonly(zfsvfs)) {
+ err = SET_ERROR(EROFS);
+ goto out3;
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & ATTR_SIZE) {
+ err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
+ if (err)
+ goto out3;
+
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ /* XXX - would it be OK to generate a log record here? */
+ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
+ if (err)
+ goto out3;
+ }
+
+ if (mask & (ATTR_ATIME|ATTR_MTIME) ||
+ ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
+ XVA_ISSET_REQ(xvap, XAT_READONLY) ||
+ XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+ XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+ XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
+ XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
+ XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
+ need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
+ skipaclchk, cr);
+ }
+
+ if (mask & (ATTR_UID|ATTR_GID)) {
+ int idmask = (mask & (ATTR_UID|ATTR_GID));
+ int take_owner;
+ int take_group;
+
+ /*
+ * NOTE: even if a new mode is being set,
+ * we may clear S_ISUID/S_ISGID bits.
+ */
+
+ if (!(mask & ATTR_MODE))
+ vap->va_mode = zp->z_mode;
+
+ /*
+ * Take ownership or chgrp to group we are a member of
+ */
+
+ take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
+ take_group = (mask & ATTR_GID) &&
+ zfs_groupmember(zfsvfs, vap->va_gid, cr);
+
+ /*
+ * If both ATTR_UID and ATTR_GID are set then take_owner and
+ * take_group must both be set in order to allow taking
+ * ownership.
+ *
+ * Otherwise, send the check through secpolicy_vnode_setattr()
+ *
+ */
+
+ if (((idmask == (ATTR_UID|ATTR_GID)) &&
+ take_owner && take_group) ||
+ ((idmask == ATTR_UID) && take_owner) ||
+ ((idmask == ATTR_GID) && take_group)) {
+ if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
+ skipaclchk, cr) == 0) {
+ /*
+ * Remove setuid/setgid for non-privileged users
+ */
+ (void) secpolicy_setid_clear(vap, cr);
+ trim_mask = (mask & (ATTR_UID|ATTR_GID));
+ } else {
+ need_policy = TRUE;
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ mutex_enter(&zp->z_lock);
+ oldva.va_mode = zp->z_mode;
+ zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
+ if (mask & ATTR_XVATTR) {
+ /*
+ * Update xvattr mask to include only those attributes
+ * that are actually changing.
+ *
+ * the bits will be restored prior to actually setting
+ * the attributes so the caller thinks they were set.
+ */
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ if (xoap->xoa_appendonly !=
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_APPENDONLY);
+ XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ if (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
+ XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ if (xoap->xoa_nounlink !=
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NOUNLINK);
+ XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ if (xoap->xoa_immutable !=
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
+ XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ if (xoap->xoa_nodump !=
+ ((zp->z_pflags & ZFS_NODUMP) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NODUMP);
+ XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ if (xoap->xoa_av_modified !=
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
+ XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ if ((!S_ISREG(ip->i_mode) &&
+ xoap->xoa_av_quarantined) ||
+ xoap->xoa_av_quarantined !=
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
+ XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ mutex_exit(&zp->z_lock);
+ err = SET_ERROR(EPERM);
+ goto out3;
+ }
+
+ if (need_policy == FALSE &&
+ (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
+ XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
+ need_policy = TRUE;
+ }
+ }
+
+ mutex_exit(&zp->z_lock);
+
+ if (mask & ATTR_MODE) {
+ if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
+ err = secpolicy_setid_setsticky_clear(ip, vap,
+ &oldva, cr);
+ if (err)
+ goto out3;
+
+ trim_mask |= ATTR_MODE;
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ if (need_policy) {
+ /*
+		 * If trim_mask is set then take-ownership has been granted
+		 * or write_acl is present and the user has the ability to
+		 * modify the mode.  In that case remove UID|GID and/or MODE
+		 * from the mask so that secpolicy_vnode_setattr() doesn't
+		 * revoke them.
+ */
+
+ if (trim_mask) {
+ saved_mask = vap->va_mask;
+ vap->va_mask &= ~trim_mask;
+ }
+ err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
+ if (err)
+ goto out3;
+
+ if (trim_mask)
+ vap->va_mask |= saved_mask;
+ }
+
+ /*
+	 * secpolicy_vnode_setattr() or the take-ownership path may have
+	 * changed va_mask.
+ */
+ mask = vap->va_mask;
+
+ if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
+ handle_eadir = B_TRUE;
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+
+ if (err == 0 && xattr_obj) {
+ err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
+ if (err)
+ goto out2;
+ }
+ if (mask & ATTR_UID) {
+ new_kuid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+ if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
+ zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
+ new_kuid)) {
+ if (attrzp)
+ zrele(attrzp);
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (mask & ATTR_GID) {
+ new_kgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
+ if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
+ zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ new_kgid)) {
+ if (attrzp)
+ zrele(attrzp);
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
+ if (attrzp)
+ zrele(attrzp);
+ err = EDQUOT;
+ goto out2;
+ }
+ }
+ tx = dmu_tx_create(os);
+
+ if (mask & ATTR_MODE) {
+ uint64_t pmode = zp->z_mode;
+ uint64_t acl_obj;
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
+ !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
+ err = EPERM;
+ goto out;
+ }
+
+ if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
+ goto out;
+
+ mutex_enter(&zp->z_lock);
+ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+ /*
+ * Are we upgrading ACL from old V0 format
+ * to V1 format?
+ */
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) ==
+ ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0,
+ aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ }
+ mutex_exit(&zp->z_lock);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ } else {
+ if (((mask & ATTR_XVATTR) &&
+ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
+ (projid != ZFS_INVALID_PROJID &&
+ !(zp->z_pflags & ZFS_PROJID)))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ }
+
+ if (attrzp) {
+ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+ }
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
+ goto out;
+
+ count = 0;
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
+
+ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
+ /*
+		 * For an existing object upgraded from an older system, the
+		 * on-disk layout has no slot for the project ID attribute.
+		 * But the quota accounting logic needs to access the related
+		 * slots by offset directly, so we adjust the old object's
+		 * layout to place the project ID at a unified, fixed offset.
+ */
+ if (attrzp)
+ err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
+ if (err == 0)
+ err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+
+ if (unlikely(err == EEXIST))
+ err = 0;
+ else if (err != 0)
+ goto out;
+ else
+ projid = ZFS_INVALID_PROJID;
+ }
+
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (attrzp) {
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_enter(&attrzp->z_acl_lock);
+ mutex_enter(&attrzp->z_lock);
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+ sizeof (attrzp->z_pflags));
+ if (projid != ZFS_INVALID_PROJID) {
+ attrzp->z_projid = projid;
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
+ sizeof (attrzp->z_projid));
+ }
+ }
+
+ if (mask & (ATTR_UID|ATTR_GID)) {
+
+ if (mask & ATTR_UID) {
+ ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
+ new_uid = zfs_uid_read(ZTOI(zp));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &new_uid, sizeof (new_uid));
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+ sizeof (new_uid));
+ ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
+ }
+ }
+
+ if (mask & ATTR_GID) {
+ ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
+ new_gid = zfs_gid_read(ZTOI(zp));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+ NULL, &new_gid, sizeof (new_gid));
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+ sizeof (new_gid));
+ ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
+ }
+ }
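+		/*
+		 * When the mode itself is not being changed, the current
+		 * mode is still added to the bulk update.  SA_ADD_BULK_ATTR()
+		 * records only the address of new_mode; the value is read
+		 * later by sa_bulk_update(), so it is safe to assign new_mode
+		 * after queueing it.
+		 */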
+ if (!(mask & ATTR_MODE)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+ NULL, &new_mode, sizeof (new_mode));
+ new_mode = zp->z_mode;
+ }
+ err = zfs_acl_chown_setattr(zp);
+ ASSERT(err == 0);
+ if (attrzp) {
+ err = zfs_acl_chown_setattr(attrzp);
+ ASSERT(err == 0);
+ }
+ }
+
+ if (mask & ATTR_MODE) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &new_mode, sizeof (new_mode));
+ zp->z_mode = ZTOI(zp)->i_mode = new_mode;
+ ASSERT3P(aclp, !=, NULL);
+ err = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT0(err);
+ if (zp->z_acl_cached)
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = aclp;
+ aclp = NULL;
+ }
+
+ if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
+ zp->z_atime_dirty = B_FALSE;
+ ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, sizeof (atime));
+ }
+
+ if (mask & (ATTR_MTIME | ATTR_SIZE)) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
+ vap->va_mtime, ZTOI(zp));
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ }
+
+ if (mask & (ATTR_CTIME | ATTR_SIZE)) {
+ ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
+ ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
+ ZTOI(zp));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ }
+
+ if (projid != ZFS_INVALID_PROJID) {
+ zp->z_projid = projid;
+ SA_ADD_BULK_ATTR(bulk, count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+ sizeof (zp->z_projid));
+ }
+
+ if (attrzp && mask) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
+ sizeof (ctime));
+ }
+
+ /*
+ * Do this after setting timestamps to prevent timestamp
+ * update from toggling bit
+ */
+
+ if (xoap && (mask & ATTR_XVATTR)) {
+
+ /*
+		 * Restore the trimmed-off mask bits so that the return
+		 * masks can be set for the caller.
+ */
+
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
+ XVA_SET_REQ(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
+ XVA_SET_REQ(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
+ XVA_SET_REQ(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
+ XVA_SET_REQ(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
+ XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
+ XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
+ XVA_SET_REQ(xvap, XAT_PROJINHERIT);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ ASSERT(S_ISREG(ip->i_mode));
+
+ zfs_xvattr_set(zp, xvap, tx);
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (mask != 0)
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
+
+ mutex_exit(&zp->z_lock);
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_exit(&zp->z_acl_lock);
+
+ if (attrzp) {
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_exit(&attrzp->z_acl_lock);
+ mutex_exit(&attrzp->z_lock);
+ }
+out:
+ if (err == 0 && xattr_count > 0) {
+ err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+ xattr_count, tx);
+ ASSERT(err2 == 0);
+ }
+
+ if (aclp)
+ zfs_acl_free(aclp);
+
+ if (fuidp) {
+ zfs_fuid_info_free(fuidp);
+ fuidp = NULL;
+ }
+
+ if (err) {
+ dmu_tx_abort(tx);
+ if (attrzp)
+ zrele(attrzp);
+ if (err == ERESTART)
+ goto top;
+ } else {
+ if (count > 0)
+ err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ dmu_tx_commit(tx);
+ if (attrzp) {
+ if (err2 == 0 && handle_eadir)
+ err2 = zfs_setattr_dir(attrzp);
+ zrele(attrzp);
+ }
+ zfs_znode_update_vfs(zp);
+ }
+
+out2:
+ if (os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+out3:
+ kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
+ kmem_free(tmpxvattr, sizeof (xvattr_t));
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+typedef struct zfs_zlock {
+ krwlock_t *zl_rwlock; /* lock we acquired */
+ znode_t *zl_znode; /* znode we held */
+ struct zfs_zlock *zl_next; /* next in list */
+} zfs_zlock_t;
+
+/*
+ * Drop locks and release vnodes that were held by zfs_rename_lock().
+ */
+static void
+zfs_rename_unlock(zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+
+ while ((zl = *zlpp) != NULL) {
+ if (zl->zl_znode != NULL)
+ zfs_zrele_async(zl->zl_znode);
+ rw_exit(zl->zl_rwlock);
+ *zlpp = zl->zl_next;
+ kmem_free(zl, sizeof (*zl));
+ }
+}
+
+/*
+ * Search back through the directory tree, using the ".." entries.
+ * Lock each directory in the chain to prevent concurrent renames.
+ * Fail any attempt to move a directory into one of its own descendants.
+ * XXX - z_parent_lock can overlap with map or grow locks
+ */
+static int
+zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+ znode_t *zp = tdzp;
+ uint64_t rootid = ZTOZSB(zp)->z_root;
+ uint64_t oidp = zp->z_id;
+ krwlock_t *rwlp = &szp->z_parent_lock;
+ krw_t rw = RW_WRITER;
+
+ /*
+ * First pass write-locks szp and compares to zp->z_id.
+ * Later passes read-lock zp and compare to zp->z_parent.
+ */
+ do {
+ if (!rw_tryenter(rwlp, rw)) {
+ /*
+ * Another thread is renaming in this path.
+ * Note that if we are a WRITER, we don't have any
+ * parent_locks held yet.
+ */
+ if (rw == RW_READER && zp->z_id > szp->z_id) {
+ /*
+ * Drop our locks and restart
+ */
+ zfs_rename_unlock(&zl);
+ *zlpp = NULL;
+ zp = tdzp;
+ oidp = zp->z_id;
+ rwlp = &szp->z_parent_lock;
+ rw = RW_WRITER;
+ continue;
+ } else {
+ /*
+ * Wait for other thread to drop its locks
+ */
+ rw_enter(rwlp, rw);
+ }
+ }
+
+ zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+ zl->zl_rwlock = rwlp;
+ zl->zl_znode = NULL;
+ zl->zl_next = *zlpp;
+ *zlpp = zl;
+
+ if (oidp == szp->z_id) /* We're a descendant of szp */
+ return (SET_ERROR(EINVAL));
+
+ if (oidp == rootid) /* We've hit the top */
+ return (0);
+
+ if (rw == RW_READER) { /* i.e. not the first pass */
+ int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
+ if (error)
+ return (error);
+ zl->zl_znode = zp;
+ }
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
+ &oidp, sizeof (oidp));
+ rwlp = &zp->z_parent_lock;
+ rw = RW_READER;
+
+ } while (zp->z_id != sdzp->z_id);
+
+ return (0);
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdzp - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdzp - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * sdzp,tdzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
+ cred_t *cr, int flags)
+{
+ znode_t *szp, *tzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(sdzp);
+ zilog_t *zilog;
+ zfs_dirlock_t *sdl, *tdl;
+ dmu_tx_t *tx;
+ zfs_zlock_t *zl;
+ int cmp, serr, terr;
+ int error = 0;
+ int zflg = 0;
+ boolean_t waited = B_FALSE;
+
+ if (snm == NULL || tnm == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(sdzp);
+ zilog = zfsvfs->z_log;
+
+ ZFS_VERIFY_ZP(tdzp);
+
+ /*
+ * We check i_sb because snapshots and the ctldir must have different
+ * super blocks.
+ */
+ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
+ zfsctl_is_node(ZTOI(tdzp))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(tnm,
+ strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+top:
+ szp = NULL;
+ tzp = NULL;
+ zl = NULL;
+
+ /*
+ * This is to prevent the creation of links into attribute space
+ * by renaming a linked file into/outof an attribute directory.
+	 * by renaming a linked file into/out of an attribute directory.
+ */
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Lock source and target directory entries. To prevent deadlock,
+ * a lock ordering must be defined. We lock the directory with
+ * the smallest object id first, or if it's a tie, the one with
+ * the lexically first name.
+ */
+ if (sdzp->z_id < tdzp->z_id) {
+ cmp = -1;
+ } else if (sdzp->z_id > tdzp->z_id) {
+ cmp = 1;
+ } else {
+ /*
+ * First compare the two name arguments without
+ * considering any case folding.
+ */
+ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
+
+ cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
+ ASSERT(error == 0 || !zfsvfs->z_utf8);
+ if (cmp == 0) {
+ /*
+ * POSIX: "If the old argument and the new argument
+ * both refer to links to the same existing file,
+ * the rename() function shall return successfully
+ * and perform no other action."
+ */
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+ /*
+ * If the file system is case-folding, then we may
+ * have some more checking to do. A case-folding file
+ * system is either supporting mixed case sensitivity
+ * access or is completely case-insensitive. Note
+ * that the file system is always case preserving.
+ *
+ * In mixed sensitivity mode case sensitive behavior
+ * is the default. FIGNORECASE must be used to
+ * explicitly request case insensitive behavior.
+ *
+ * If the source and target names provided differ only
+ * by case (e.g., a request to rename 'tim' to 'Tim'),
+ * we will treat this as a special case in the
+ * case-insensitive mode: as long as the source name
+ * is an exact match, we will allow this to proceed as
+ * a name-change request.
+ */
+ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ (zfsvfs->z_case == ZFS_CASE_MIXED &&
+ flags & FIGNORECASE)) &&
+ u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
+ &error) == 0) {
+ /*
+ * case preserving rename request, require exact
+ * name matches
+ */
+ zflg |= ZCIEXACT;
+ zflg &= ~ZCILOOK;
+ }
+ }
+
+ /*
+ * If the source and destination directories are the same, we should
+ * grab the z_name_lock of that directory only once.
+ */
+ if (sdzp == tdzp) {
+ zflg |= ZHAVELOCK;
+ rw_enter(&sdzp->z_name_lock, RW_READER);
+ }
+
+ if (cmp < 0) {
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
+ ZEXISTS | zflg, NULL, NULL);
+ terr = zfs_dirent_lock(&tdl,
+ tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
+ } else {
+ terr = zfs_dirent_lock(&tdl,
+ tdzp, tnm, &tzp, zflg, NULL, NULL);
+ serr = zfs_dirent_lock(&sdl,
+ sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
+ NULL, NULL);
+ }
+
+ if (serr) {
+ /*
+ * Source entry invalid or not there.
+ */
+ if (!terr) {
+ zfs_dirent_unlock(tdl);
+ if (tzp)
+ zrele(tzp);
+ }
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (strcmp(snm, "..") == 0)
+ serr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (serr);
+ }
+ if (terr) {
+ zfs_dirent_unlock(sdl);
+ zrele(szp);
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (strcmp(tnm, "..") == 0)
+ terr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (terr);
+ }
+
+ /*
+	 * If we are using project inheritance, meaning the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories inherit not
+	 * only the project ID, but also the ZFS_PROJINHERIT flag.  In that
+	 * case, we only allow renames into our tree when the project IDs
+	 * are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+
+ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+ goto out;
+
+ if (S_ISDIR(ZTOI(szp)->i_mode)) {
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ */
+ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
+ goto out;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if (S_ISDIR(ZTOI(szp)->i_mode)) {
+ if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
+ error = SET_ERROR(ENOTDIR);
+ goto out;
+ }
+ } else {
+ if (S_ISDIR(ZTOI(tzp)->i_mode)) {
+ error = SET_ERROR(EISDIR);
+ goto out;
+ }
+ }
+ /*
+ * POSIX dictates that when the source and target
+ * entries refer to the same file object, rename
+ * must do nothing and exit without error.
+ */
+ if (szp->z_id == tzp->z_id) {
+ error = 0;
+ goto out;
+ }
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp) {
+ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ }
+ if (tzp) {
+ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tzp);
+ }
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ zrele(szp);
+ if (tzp)
+ zrele(tzp);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zrele(szp);
+ if (tzp)
+ zrele(tzp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ if (error == 0) {
+ szp->z_pflags |= ZFS_AV_MODIFIED;
+ if (tdzp->z_pflags & ZFS_PROJINHERIT)
+ szp->z_pflags |= ZFS_PROJINHERIT;
+
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ ASSERT0(error);
+
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ if (error == 0) {
+ zfs_log_rename(zilog, tx, TX_RENAME |
+ (flags & FIGNORECASE ? TX_CI : 0), sdzp,
+ sdl->dl_name, tdzp, tdl->dl_name, szp);
+ } else {
+ /*
+ * At this point, we have successfully created
+ * the target name, but have failed to remove
+ * the source name. Since the create was done
+ * with the ZRENAMING flag, there are
+ * complications; for one, the link count is
+ * wrong. The easiest way to deal with this
+ * is to remove the newly created target, and
+ * return the original error. This must
+ * succeed; fortunately, it is very unlikely to
+ * fail, since we just created it.
+ */
+ VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+ ZRENAMING, NULL), ==, 0);
+ }
+ } else {
+ /*
+			 * If we had removed the existing target, the
+			 * subsequent call to zfs_link_create() to add back
+			 * the same entry, but with the new dnode (szp),
+			 * should not fail.
+ */
+ ASSERT(tzp == NULL);
+ }
+ }
+
+ dmu_tx_commit(tx);
+out:
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ zfs_znode_update_vfs(sdzp);
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (sdzp != tdzp)
+ zfs_znode_update_vfs(tdzp);
+
+ zfs_znode_update_vfs(szp);
+ zrele(szp);
+ if (tzp) {
+ zfs_znode_update_vfs(tzp);
+ zrele(tzp);
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN: dzp - Directory to contain new symbolic link.
+ *	name	- Name of directory entry in dzp.
+ * vap - Attributes of new entry.
+ * link - Name for new symlink entry.
+ * cr - credentials of caller.
+ * flags - case flags
+ *
+ * OUT: zpp - Znode for new symbolic link.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
+ znode_t **zpp, cred_t *cr, int flags)
+{
+ znode_t *zp;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zilog_t *zilog;
+ uint64_t len = strlen(link);
+ int error;
+ int zflg = ZNEW;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
+ boolean_t waited = B_FALSE;
+
+ ASSERT(S_ISLNK(vap->va_mode));
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0,
+ vap, cr, NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+top:
+ *zpp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create a new object for the symlink.
+	 * For version 4 ZPL datasets the symlink will be an SA attribute.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ link, len, tx);
+ else
+ zfs_sa_symlink(zp, link, len, tx);
+ mutex_exit(&zp->z_lock);
+
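+	/*
+	 * The size of a symlink is the length of its target path; record
+	 * it both in the in-core znode and in the SA.
+	 */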
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
+ /*
+ * Insert the new object into the directory.
+ */
+ error = zfs_link_create(dl, zp, tx, ZNEW);
+ if (error != 0) {
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ } else {
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+ }
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (error == 0) {
+ *zpp = zp;
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+ } else {
+ zrele(zp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by ip.
+ *
+ * IN: ip - inode of symbolic link
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - atime updated
+ */
+/* ARGSUSED */
+int
+zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_lookup_uio(zp->z_sa_hdl,
+ SA_ZPL_SYMLINK(zfsvfs), uio);
+ else
+ error = zfs_sa_readlink(zp, uio);
+ mutex_exit(&zp->z_lock);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert a new entry into directory tdzp referencing szp.
+ *
+ * IN: tdzp - Directory to contain new entry.
+ * szp - znode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ * flags - case flags.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * tdzp - ctime|mtime updated
+ * szp - ctime updated
+ */
+/* ARGSUSED */
+int
+zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
+ int flags)
+{
+ struct inode *sip = ZTOI(szp);
+ znode_t *tzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(tdzp);
+ zilog_t *zilog;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ int zf = ZNEW;
+ uint64_t parent;
+ uid_t owner;
+ boolean_t waited = B_FALSE;
+ boolean_t is_tmpfile = 0;
+ uint64_t txg;
+#ifdef HAVE_TMPFILE
+ is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
+#endif
+ ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(tdzp);
+ zilog = zfsvfs->z_log;
+
+ /*
+ * POSIX dictates that we return EPERM here.
+ * Better choices include ENOTSUP or EISDIR.
+ */
+ if (S_ISDIR(sip->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ ZFS_VERIFY_ZP(szp);
+
+ /*
+	 * If we are using project inheritance, meaning the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories inherit not
+	 * only the project ID, but also the ZFS_PROJINHERIT flag.  In that
+	 * case, we only allow hard link creation in our tree when the
+	 * project IDs are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * We check i_sb because snapshots and the ctldir must have different
+ * super blocks.
+ */
+ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /* Prevent links to .zfs/shares files */
+
+ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (parent == zfsvfs->z_shares_dir) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(name,
+ strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+ if (flags & FIGNORECASE)
+ zf |= ZCILOOK;
+
+ /*
+ * We do not support links between attributes and non-attributes
+ * because of the potential security risk of creating links
+ * into "normal" file space in order to circumvent restrictions
+ * imposed in attribute space.
+ */
+ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
+ cr, ZFS_OWNER);
+ if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+top:
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
+ if (is_tmpfile)
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ /* unmark z_unlinked so zfs_link_create will not reject */
+ if (is_tmpfile)
+ szp->z_unlinked = B_FALSE;
+ error = zfs_link_create(dl, szp, tx, 0);
+
+ if (error == 0) {
+ uint64_t txtype = TX_LINK;
+ /*
+		 * A tmpfile is created to be in z_unlinkedobj, so remove it
+		 * from there.  Also, we don't log to the ZIL, because all
+		 * previous file operations on the tmpfile are ignored by the
+		 * ZIL.  Instead we always wait for the txg to sync to make
+		 * sure all previous operations are sync safe.
+ */
+ if (is_tmpfile) {
+ VERIFY(zap_remove_int(zfsvfs->z_os,
+ zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
+ } else {
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
+ }
+ } else if (is_tmpfile) {
+		/* restore z_unlinked since linking failed */
+ szp->z_unlinked = B_TRUE;
+ }
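+	/*
+	 * Remember the txg of this transaction; for tmpfiles we wait below
+	 * for that txg to sync instead of logging to the ZIL.
+	 */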
+ txg = dmu_tx_get_txg(tx);
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
+
+ zfs_znode_update_vfs(tdzp);
+ zfs_znode_update_vfs(szp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static void
+zfs_putpage_commit_cb(void *arg)
+{
+ struct page *pp = arg;
+
+ ClearPageError(pp);
+ end_page_writeback(pp);
+}
+
+/*
+ * Push a page out to disk.  Once the page is on stable storage the
+ * registered commit callback will be run as notification of completion.
+ *
+ * IN:	ip	- inode the page belongs to.
+ * pp - page to push (page is locked)
+ * wbc - writeback control data
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ loff_t offset;
+ loff_t pgoff;
+ unsigned int pglen;
+ dmu_tx_t *tx;
+ caddr_t va;
+ int err = 0;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int cnt = 0;
+ struct address_space *mapping;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ ASSERT(PageLocked(pp));
+
+ pgoff = page_offset(pp); /* Page byte-offset in file */
+ offset = i_size_read(ip); /* File length in bytes */
+ pglen = MIN(PAGE_SIZE, /* Page length in bytes */
+ P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
+
+ /* Page is beyond end of file */
+ if (pgoff >= offset) {
+ unlock_page(pp);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /* Truncate page length to end of file */
+ if (pgoff + pglen > offset)
+ pglen = offset - pgoff;
+
+#if 0
+ /*
+	 * FIXME: mmap writes are currently allowed to exceed the quota.
+	 * The correct fix is to register a page_mkwrite() handler to count
+	 * the page against its quota when it is about to be dirtied.
+ */
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
+ KUID_TO_SUID(ip->i_uid)) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ KGID_TO_SGID(ip->i_gid)) ||
+ (zp->z_projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ zp->z_projid))) {
+ err = EDQUOT;
+ }
+#endif
+
+ /*
+ * The ordering here is critical and must adhere to the following
+ * rules in order to avoid deadlocking in either zfs_read() or
+ * zfs_free_range() due to a lock inversion.
+ *
+ * 1) The page must be unlocked prior to acquiring the range lock.
+ * This is critical because zfs_read() calls find_lock_page()
+ * which may block on the page lock while holding the range lock.
+ *
+ * 2) Before setting or clearing write back on a page the range lock
+ * must be held in order to prevent a lock inversion with the
+ * zfs_free_range() function.
+ *
+ * This presents a problem because upon entering this function the
+ * page lock is already held. To safely acquire the range lock the
+ * page lock must be dropped. This creates a window where another
+ * process could truncate, invalidate, dirty, or write out the page.
+ *
+ * Therefore, after successfully reacquiring the range and page locks
+ * the current page state is checked. In the common case everything
+ * will be as is expected and it can be written out. However, if
+ * the page state has changed it must be handled accordingly.
+ */
+ mapping = pp->mapping;
+ redirty_page_for_writepage(wbc, pp);
+ unlock_page(pp);
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
+ pgoff, pglen, RL_WRITER);
+ lock_page(pp);
+
+ /* Page mapping changed or it was no longer dirty, we're done */
+ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
+ unlock_page(pp);
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /* Another process started writeback; block if required */
+ if (PageWriteback(pp)) {
+ unlock_page(pp);
+ zfs_rangelock_exit(lr);
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ if (PageWriteback(pp))
+ wait_on_page_bit(pp, PG_writeback);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /* Clear the dirty flag now that the required locks are held */
+ if (!clear_page_dirty_for_io(pp)) {
+ unlock_page(pp);
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+ * Counterpart for redirty_page_for_writepage() above. This page
+ * was in fact not skipped and should not be counted as if it were.
+ */
+ wbc->pages_skipped--;
+ set_page_writeback(pp);
+ unlock_page(pp);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (err != 0) {
+ if (err == ERESTART)
+ dmu_tx_wait(tx);
+
+ dmu_tx_abort(tx);
+ __set_page_dirty_nobuffers(pp);
+ ClearPageError(pp);
+ end_page_writeback(pp);
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ va = kmap(pp);
+ ASSERT3U(pglen, <=, PAGE_SIZE);
+ dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
+ kunmap(pp);
+
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
+ /* Preserve the mtime and ctime provided by the inode */
+ ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
+ ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+ zp->z_atime_dirty = B_FALSE;
+ zp->z_seq++;
+
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
+
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
+ zfs_putpage_commit_cb, pp);
+ dmu_tx_commit(tx);
+
+ zfs_rangelock_exit(lr);
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ /*
+ * Note that this is rarely called under writepages(), because
+ * writepages() normally handles the entire commit for
+ * performance reasons.
+ */
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
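+
+/*
+ * Illustrative only (not part of the zpl_* glue in this tree): a minimal
+ * sketch of how a Linux ->writepage style callback could drive the
+ * function above. The wrapper name is hypothetical.
+ *
+ *	static int example_writepage(struct page *pp,
+ *	    struct writeback_control *wbc)
+ *	{
+ *		struct inode *ip = pp->mapping->host;
+ *
+ *		return (zfs_putpage(ip, pp, wbc));
+ *	}
+ */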
+
+/*
+ * Update the system attributes when the inode has been dirtied. For the
+ * moment we only update the mode, atime, mtime, and ctime.
+ */
+int
+zfs_dirty_inode(struct inode *ip, int flags)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ dmu_tx_t *tx;
+ uint64_t mode, atime[2], mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[4];
+ int error = 0;
+ int cnt = 0;
+
+ if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+#ifdef I_DIRTY_TIME
+ /*
+ * This is the lazytime semantic introduced in Linux 4.0.
+ * This flag is only passed from update_time() when lazytime is set.
+ * (Note, I_DIRTY_SYNC will also be set if lazytime is not enabled.)
+ * Fortunately mtime and ctime are managed within ZFS itself, so we
+ * only need to dirty atime.
+ */
+ if (flags == I_DIRTY_TIME) {
+ zp->z_atime_dirty = B_TRUE;
+ goto out;
+ }
+#endif
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ mutex_enter(&zp->z_lock);
+ zp->z_atime_dirty = B_FALSE;
+
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+
+ /* Preserve the mode, mtime and ctime provided by the inode */
+ ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
+ ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+ mode = ip->i_mode;
+
+ zp->z_mode = mode;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
+ mutex_exit(&zp->z_lock);
+
+ dmu_tx_commit(tx);
+out:
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
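+
+/*
+ * Illustrative only: a hypothetical super_operations ->dirty_inode hook
+ * would simply forward to the function above, e.g.
+ *
+ *	static void example_dirty_inode(struct inode *ip, int flags)
+ *	{
+ *		(void) zfs_dirty_inode(ip, flags);
+ *	}
+ */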
+
+/*ARGSUSED*/
+void
+zfs_inactive(struct inode *ip)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint64_t atime[2];
+ int error;
+ int need_unlock = 0;
+
+ /* Only read lock if we haven't already write locked, e.g. rollback */
+ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
+ need_unlock = 1;
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+ }
+ if (zp->z_sa_hdl == NULL) {
+ if (need_unlock)
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ return;
+ }
+
+ if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ mutex_enter(&zp->z_lock);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+ (void *)&atime, sizeof (atime), tx);
+ zp->z_atime_dirty = B_FALSE;
+ mutex_exit(&zp->z_lock);
+ dmu_tx_commit(tx);
+ }
+ }
+
+ zfs_zinactive(zp);
+ if (need_unlock)
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+}
+
+/*
+ * Fill pages with data from the disk.
+ */
+static int
+zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ objset_t *os;
+ struct page *cur_pp;
+ u_offset_t io_off, total;
+ size_t io_len;
+ loff_t i_size;
+ unsigned page_idx;
+ int err;
+
+ os = zfsvfs->z_os;
+ io_len = nr_pages << PAGE_SHIFT;
+ i_size = i_size_read(ip);
+ io_off = page_offset(pl[0]);
+
+ if (io_off + io_len > i_size)
+ io_len = i_size - io_off;
+
+ /*
+ * Iterate over list of pages and read each page individually.
+ */
+ page_idx = 0;
+ for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+ caddr_t va;
+
+ cur_pp = pl[page_idx++];
+ va = kmap(cur_pp);
+ err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
+ DMU_READ_PREFETCH);
+ kunmap(cur_pp);
+ if (err) {
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = SET_ERROR(EIO);
+ return (err);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Uses zfs_fillpage to read data from the file and fill the pages.
+ *
+ * IN: ip - inode of file to get data from.
+ * pl - list of pages to read
+ * nr_pages - number of pages to read
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+int
+zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int err;
+
+ if (pl == NULL)
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ err = zfs_fillpage(ip, pl, nr_pages);
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
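+
+/*
+ * Illustrative only: a hypothetical ->readpage style wrapper showing how a
+ * single page can be handed to zfs_getpage() as a one-entry array.
+ *
+ *	static int example_readpage(struct inode *ip, struct page *pp)
+ *	{
+ *		struct page *pl[1] = { pp };
+ *
+ *		return (zfs_getpage(ip, pl, 1));
+ *	}
+ */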
+
+/*
+ * Check ZFS specific permissions to memory map a section of a file.
+ *
+ * IN: ip - inode of the file to mmap
+ * off - file offset
+ * addrp - start address in memory region
+ * len - length of memory region
+ * vm_flags- address flags
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ */
+/*ARGSUSED*/
+int
+zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
+ unsigned long vm_flags)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((vm_flags & VM_WRITE) && (zp->z_pflags &
+ (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((vm_flags & (VM_READ | VM_EXEC)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+
+ if (off < 0 || len > MAXOFFSET_T - off) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENXIO));
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
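+
+/*
+ * Illustrative only: a hypothetical mmap hook would translate the VMA into
+ * the (offset, address, length, flags) tuple expected by zfs_map() before
+ * falling through to the generic mmap path, e.g.
+ *
+ *	caddr_t addr = (caddr_t)vma->vm_start;
+ *	error = zfs_map(ip, vma->vm_pgoff << PAGE_SHIFT, &addr,
+ *	    vma->vm_end - vma->vm_start, vma->vm_flags);
+ */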
+
+/*
+ * Free or allocate space in a file. Currently, this function only
+ * supports the `F_FREESP' command. However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ * IN: zp - znode of file to free data in.
+ * cmd - action to take (only F_FREESP supported).
+ * bfp - section of file to free/alloc.
+ * flag - current file open mode flags.
+ * offset - current file offset.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * zp - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
+ offset_t offset, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t off, len;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (cmd != F_FREESP) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(zfsvfs)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ if (bfp->l_len < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Permissions aren't checked on Solaris because on this OS
+ * zfs_space() can only be called with an opened file handle.
+ * On Linux we can get here through truncate_range() which
+ * operates directly on inodes, so we need to check access rights.
+ */
+ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ off = bfp->l_start;
+ len = bfp->l_len; /* 0 means from off to end of file */
+
+ error = zfs_freesp(zp, off, len, flag, TRUE);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_fid(struct inode *ip, fid_t *fidp)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint32_t gen;
+ uint64_t gen64;
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int size, i, error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+ &gen64, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ gen = (uint32_t)gen64;
+
+ size = SHORT_FID_LEN;
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = size;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* Must have a non-zero generation number to distinguish from .zfs */
+ if (gen == 0)
+ gen = 1;
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
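+
+/*
+ * Worked example of the encoding above (illustrative): for object number
+ * 0x1234 and generation 5 the loops store the values little-endian, so
+ * zf_object[] begins {0x34, 0x12, 0x00, ...} and zf_gen[] begins
+ * {0x05, 0x00, ...}; a generation of 0 would have been bumped to 1 first.
+ */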
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_open);
+EXPORT_SYMBOL(zfs_close);
+EXPORT_SYMBOL(zfs_lookup);
+EXPORT_SYMBOL(zfs_create);
+EXPORT_SYMBOL(zfs_tmpfile);
+EXPORT_SYMBOL(zfs_remove);
+EXPORT_SYMBOL(zfs_mkdir);
+EXPORT_SYMBOL(zfs_rmdir);
+EXPORT_SYMBOL(zfs_readdir);
+EXPORT_SYMBOL(zfs_getattr_fast);
+EXPORT_SYMBOL(zfs_setattr);
+EXPORT_SYMBOL(zfs_rename);
+EXPORT_SYMBOL(zfs_symlink);
+EXPORT_SYMBOL(zfs_readlink);
+EXPORT_SYMBOL(zfs_link);
+EXPORT_SYMBOL(zfs_inactive);
+EXPORT_SYMBOL(zfs_space);
+EXPORT_SYMBOL(zfs_fid);
+EXPORT_SYMBOL(zfs_getpage);
+EXPORT_SYMBOL(zfs_putpage);
+EXPORT_SYMBOL(zfs_dirty_inode);
+EXPORT_SYMBOL(zfs_map);
+
+/* BEGIN CSTYLED */
+module_param(zfs_delete_blocks, ulong, 0644);
+MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
+/* END CSTYLED */
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
new file mode 100644
index 000000000000..d59c1bb0716a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
@@ -0,0 +1,2244 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/mntent.h>
+#include <sys/u8_textprep.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/atomic.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zpl.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/*
+ * Functions needed for userland (i.e. libzpool) are not put under
+ * #ifdef _KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+
+static kmem_cache_t *znode_cache = NULL;
+static kmem_cache_t *znode_hold_cache = NULL;
+unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
+
+/*
+ * This is used by the test suite so that it can delay znodes from being
+ * freed in order to inspect the unlinked set.
+ */
+int zfs_unlink_suspend_progress = 0;
+
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
+{
+ znode_t *zp = arg;
+
+ /*
+ * If in append mode, convert to writer and lock starting at the
+ * current end of file.
+ */
+ if (new->lr_type == RL_APPEND) {
+ new->lr_offset = zp->z_size;
+ new->lr_type = RL_WRITER;
+ }
+
+ /*
+ * If we need to grow the block size then lock the whole file range.
+ */
+ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+ new->lr_offset = 0;
+ new->lr_length = UINT64_MAX;
+ }
+}
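+
+/*
+ * Example of the callback above (illustrative): an RL_APPEND request on a
+ * file whose z_size is 4096 becomes an RL_WRITER lock starting at offset
+ * 4096; if the resulting end-of-range would also require growing the block
+ * size (e.g. the file currently uses a single 512-byte block), the lock is
+ * widened to cover the entire file (offset 0, length UINT64_MAX).
+ */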
+
+/*ARGSUSED*/
+static int
+zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_t *zp = buf;
+
+ inode_init_once(ZTOI(zp));
+ list_link_init(&zp->z_link_node);
+
+ mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
+
+ zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+
+ zp->z_dirlocks = NULL;
+ zp->z_acl_cached = NULL;
+ zp->z_xattr_cached = NULL;
+ zp->z_xattr_parent = 0;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *arg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(!list_link_active(&zp->z_link_node));
+ mutex_destroy(&zp->z_lock);
+ rw_destroy(&zp->z_parent_lock);
+ rw_destroy(&zp->z_name_lock);
+ mutex_destroy(&zp->z_acl_lock);
+ rw_destroy(&zp->z_xattr_lock);
+ zfs_rangelock_fini(&zp->z_rangelock);
+
+ ASSERT(zp->z_dirlocks == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
+ ASSERT(zp->z_xattr_cached == NULL);
+}
+
+static int
+zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_hold_t *zh = buf;
+
+ mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
+ zfs_refcount_create(&zh->zh_refcount);
+ zh->zh_obj = ZFS_NO_OBJECT;
+
+ return (0);
+}
+
+static void
+zfs_znode_hold_cache_destructor(void *buf, void *arg)
+{
+ znode_hold_t *zh = buf;
+
+ mutex_destroy(&zh->zh_lock);
+ zfs_refcount_destroy(&zh->zh_refcount);
+}
+
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache. The KMC_SLAB hint is used so that the cache is
+ * backed by kmalloc() when on the Linux slab, which ensures that any
+ * wait_on_bit() operations on the related inode operate properly.
+ */
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
+
+ ASSERT(znode_hold_cache == NULL);
+ znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
+ sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
+ zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+zfs_znode_fini(void)
+{
+ /*
+ * Cleanup zcache
+ */
+ if (znode_cache)
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+
+ if (znode_hold_cache)
+ kmem_cache_destroy(znode_hold_cache);
+ znode_hold_cache = NULL;
+}
+
+/*
+ * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
+ * serialize access to a znode and its SA buffer while the object is being
+ * created or destroyed. This kind of locking would normally reside in the
+ * znode itself but in this case that's impossible because the znode and SA
+ * buffer may not yet exist. Therefore the locking is handled externally
+ * with an array of mutexes and AVL trees which contain per-object locks.
+ *
+ * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
+ * into the correct AVL tree and finally the per-object lock is held. In
+ * zfs_znode_hold_exit() the process is reversed. The per-object lock is
+ * released, removed from the AVL tree and destroyed if there are no waiters.
+ *
+ * This scheme has two important properties:
+ *
+ * 1) No memory allocations are performed while holding one of the z_hold_locks.
+ * This ensures evict(), which can be called from direct memory reclaim, will
+ * never block waiting on a z_hold_locks which just happens to have hashed
+ * to the same index.
+ *
+ * 2) All locks used to serialize access to an object are per-object and never
+ * shared. This minimizes lock contention without creating a large number
+ * of dedicated locks.
+ *
+ * On the downside it does require znode_hold_t structures to be frequently
+ * allocated and freed. However, because these are backed by a kmem cache
+ * and are very short lived, this cost is minimal.
+ */
+int
+zfs_znode_hold_compare(const void *a, const void *b)
+{
+ const znode_hold_t *zh_a = (const znode_hold_t *)a;
+ const znode_hold_t *zh_b = (const znode_hold_t *)b;
+
+ return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
+}
+
+static boolean_t __maybe_unused
+zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
+{
+ znode_hold_t *zh, search;
+ int i = ZFS_OBJ_HASH(zfsvfs, obj);
+ boolean_t held;
+
+ search.zh_obj = obj;
+
+ mutex_enter(&zfsvfs->z_hold_locks[i]);
+ zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
+ held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
+ mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+ return (held);
+}
+
+static znode_hold_t *
+zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
+{
+ znode_hold_t *zh, *zh_new, search;
+ int i = ZFS_OBJ_HASH(zfsvfs, obj);
+ boolean_t found = B_FALSE;
+
+ zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
+ zh_new->zh_obj = obj;
+ search.zh_obj = obj;
+
+ mutex_enter(&zfsvfs->z_hold_locks[i]);
+ zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
+ if (likely(zh == NULL)) {
+ zh = zh_new;
+ avl_add(&zfsvfs->z_hold_trees[i], zh);
+ } else {
+ ASSERT3U(zh->zh_obj, ==, obj);
+ found = B_TRUE;
+ }
+ zfs_refcount_add(&zh->zh_refcount, NULL);
+ mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+ if (found == B_TRUE)
+ kmem_cache_free(znode_hold_cache, zh_new);
+
+ ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
+ ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
+ mutex_enter(&zh->zh_lock);
+
+ return (zh);
+}
+
+static void
+zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
+{
+ int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
+ boolean_t remove = B_FALSE;
+
+ ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
+ ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
+ mutex_exit(&zh->zh_lock);
+
+ mutex_enter(&zfsvfs->z_hold_locks[i]);
+ if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) {
+ avl_remove(&zfsvfs->z_hold_trees[i], zh);
+ remove = B_TRUE;
+ }
+ mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+ if (remove == B_TRUE)
+ kmem_cache_free(znode_hold_cache, zh);
+}
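+
+/*
+ * Typical usage of the hold functions above (a sketch, mirroring the
+ * pattern used by zfs_mknode() and zfs_zget() below):
+ *
+ *	znode_hold_t *zh = zfs_znode_hold_enter(zfsvfs, obj);
+ *	... create, look up, or tear down the znode and its SA handle ...
+ *	zfs_znode_hold_exit(zfsvfs, zh);
+ */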
+
+dev_t
+zfs_cmpldev(uint64_t dev)
+{
+ return (dev);
+}
+
+static void
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+ dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
+{
+ ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
+
+ mutex_enter(&zp->z_lock);
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
+ if (sa_hdl == NULL) {
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ SA_HDL_SHARED, &zp->z_sa_hdl));
+ } else {
+ zp->z_sa_hdl = sa_hdl;
+ sa_set_userp(sa_hdl, zp);
+ }
+
+ zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
+
+ mutex_exit(&zp->z_lock);
+}
+
+void
+zfs_znode_dmu_fini(znode_t *zp)
+{
+ ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
+ RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
+
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
+}
+
+/*
+ * Called by new_inode() to allocate a new inode.
+ */
+int
+zfs_inode_alloc(struct super_block *sb, struct inode **ip)
+{
+ znode_t *zp;
+
+ zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ *ip = ZTOI(zp);
+
+ return (0);
+}
+
+/*
+ * Called in multiple places when an inode should be destroyed.
+ */
+void
+zfs_inode_destroy(struct inode *ip)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ if (list_link_active(&zp->z_link_node)) {
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes--;
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ if (zp->z_xattr_cached) {
+ nvlist_free(zp->z_xattr_cached);
+ zp->z_xattr_cached = NULL;
+ }
+
+ kmem_cache_free(znode_cache, zp);
+}
+
+static void
+zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
+{
+ uint64_t rdev = 0;
+
+ switch (ip->i_mode & S_IFMT) {
+ case S_IFREG:
+ ip->i_op = &zpl_inode_operations;
+ ip->i_fop = &zpl_file_operations;
+ ip->i_mapping->a_ops = &zpl_address_space_operations;
+ break;
+
+ case S_IFDIR:
+ ip->i_op = &zpl_dir_inode_operations;
+ ip->i_fop = &zpl_dir_file_operations;
+ ITOZ(ip)->z_zn_prefetch = B_TRUE;
+ break;
+
+ case S_IFLNK:
+ ip->i_op = &zpl_symlink_inode_operations;
+ break;
+
+ /*
+ * rdev is only stored in a SA for device files.
+ */
+ case S_IFCHR:
+ case S_IFBLK:
+ (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
+ sizeof (rdev));
+ /*FALLTHROUGH*/
+ case S_IFIFO:
+ case S_IFSOCK:
+ init_special_inode(ip, ip->i_mode, rdev);
+ ip->i_op = &zpl_special_inode_operations;
+ break;
+
+ default:
+ zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
+ (u_longlong_t)ip->i_ino, ip->i_mode);
+
+ /* Assume the inode is a file and attempt to continue */
+ ip->i_mode = S_IFREG | 0644;
+ ip->i_op = &zpl_inode_operations;
+ ip->i_fop = &zpl_file_operations;
+ ip->i_mapping->a_ops = &zpl_address_space_operations;
+ break;
+ }
+}
+
+static void
+zfs_set_inode_flags(znode_t *zp, struct inode *ip)
+{
+ /*
+ * Linux and Solaris have different sets of file attributes, so we
+ * restrict this conversion to the intersection of the two.
+ */
+#ifdef HAVE_INODE_SET_FLAGS
+ unsigned int flags = 0;
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ flags |= S_IMMUTABLE;
+ if (zp->z_pflags & ZFS_APPENDONLY)
+ flags |= S_APPEND;
+
+ inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
+#else
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ ip->i_flags |= S_IMMUTABLE;
+ else
+ ip->i_flags &= ~S_IMMUTABLE;
+
+ if (zp->z_pflags & ZFS_APPENDONLY)
+ ip->i_flags |= S_APPEND;
+ else
+ ip->i_flags &= ~S_APPEND;
+#endif
+}
+
+/*
+ * Update the embedded inode given the znode.
+ */
+void
+zfs_znode_update_vfs(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs;
+ struct inode *ip;
+ uint32_t blksize;
+ u_longlong_t i_blocks;
+
+ ASSERT(zp != NULL);
+ zfsvfs = ZTOZSB(zp);
+ ip = ZTOI(zp);
+
+ /* Skip .zfs control nodes which do not exist on disk. */
+ if (zfsctl_is_node(ip))
+ return;
+
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
+
+ spin_lock(&ip->i_lock);
+ ip->i_mode = zp->z_mode;
+ ip->i_blocks = i_blocks;
+ i_size_write(ip, zp->z_size);
+ spin_unlock(&ip->i_lock);
+}
+
+
+/*
+ * Construct a znode+inode and initialize.
+ *
+ * This does not call dmu_set_user(); that is
+ * up to the caller to do, in case you don't want to
+ * return the znode.
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+ dmu_object_type_t obj_type, sa_handle_t *hdl)
+{
+ znode_t *zp;
+ struct inode *ip;
+ uint64_t mode;
+ uint64_t parent;
+ uint64_t tmp_gen;
+ uint64_t links;
+ uint64_t z_uid, z_gid;
+ uint64_t atime[2], mtime[2], ctime[2];
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ sa_bulk_attr_t bulk[11];
+ int count = 0;
+
+ ASSERT(zfsvfs != NULL);
+
+ ip = new_inode(zfsvfs->z_sb);
+ if (ip == NULL)
+ return (NULL);
+
+ zp = ITOZ(ip);
+ ASSERT(zp->z_dirlocks == NULL);
+ ASSERT3P(zp->z_acl_cached, ==, NULL);
+ ASSERT3P(zp->z_xattr_cached, ==, NULL);
+ zp->z_unlinked = B_FALSE;
+ zp->z_atime_dirty = B_FALSE;
+ zp->z_is_mapped = B_FALSE;
+ zp->z_is_ctldir = B_FALSE;
+ zp->z_is_stale = B_FALSE;
+ zp->z_suspended = B_FALSE;
+ zp->z_sa_hdl = NULL;
+ zp->z_mapcnt = 0;
+ zp->z_id = db->db_object;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+ zp->z_sync_cnt = 0;
+
+ zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
+ (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ (zp->z_pflags & ZFS_PROJID) &&
+ sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
+ if (hdl == NULL)
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
+ goto error;
+ }
+
+ zp->z_projid = projid;
+ zp->z_mode = ip->i_mode = mode;
+ ip->i_generation = (uint32_t)tmp_gen;
+ ip->i_blkbits = SPA_MINBLOCKSHIFT;
+ set_nlink(ip, (uint32_t)links);
+ zfs_uid_write(ip, z_uid);
+ zfs_gid_write(ip, z_gid);
+ zfs_set_inode_flags(zp, ip);
+
+ /* Cache the xattr parent id */
+ if (zp->z_pflags & ZFS_XATTR)
+ zp->z_xattr_parent = parent;
+
+ ZFS_TIME_DECODE(&ip->i_atime, atime);
+ ZFS_TIME_DECODE(&ip->i_mtime, mtime);
+ ZFS_TIME_DECODE(&ip->i_ctime, ctime);
+
+ ip->i_ino = zp->z_id;
+ zfs_znode_update_vfs(zp);
+ zfs_inode_set_ops(zfsvfs, ip);
+
+ /*
+ * The only way insert_inode_locked() can fail is if the ip->i_ino
+ * number is already hashed for this super block. This can never
+ * happen because the inode numbers map 1:1 with the object numbers.
+ *
+ * The one exception is rolling back a mounted file system, but in
+ * this case all of the active inodes are unhashed during the rollback.
+ */
+ VERIFY3S(insert_inode_locked(ip), ==, 0);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes++;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ unlock_new_inode(ip);
+ return (zp);
+
+error:
+ iput(ip);
+ return (NULL);
+}
+
+/*
+ * Safely mark an inode dirty. Inodes which are part of a read-only
+ * file system or snapshot may not be dirtied.
+ */
+void
+zfs_mark_inode_dirty(struct inode *ip)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
+ return;
+
+ mark_inode_dirty(ip);
+}
+
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ * IN: dzp - parent directory for new znode
+ * vap - file attributes for new znode
+ * tx - dmu transaction id for zap operations
+ * cr - credentials of caller
+ * flag - flags:
+ * IS_ROOT_NODE - new object will be root
+ * IS_TMPFILE - new object is of O_TMPFILE
+ * IS_XATTR - new object is an attribute
+ * acl_ids - ACL related attributes
+ *
+ * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE)
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+ uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
+{
+ uint64_t crtime[2], atime[2], mtime[2], ctime[2];
+ uint64_t mode, size, links, parent, pflags;
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ uint64_t rdev = 0;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ dmu_buf_t *db;
+ inode_timespec_t now;
+ uint64_t gen, obj;
+ int bonuslen;
+ int dnodesize;
+ sa_handle_t *sa_hdl;
+ dmu_object_type_t obj_type;
+ sa_bulk_attr_t *sa_attrs;
+ int cnt = 0;
+ zfs_acl_locator_cb_t locate = { 0 };
+ znode_hold_t *zh;
+
+ if (zfsvfs->z_replay) {
+ obj = vap->va_nodeid;
+ now = vap->va_ctime; /* see zfs_replay_create() */
+ gen = vap->va_nblocks; /* ditto */
+ dnodesize = vap->va_fsid; /* ditto */
+ } else {
+ obj = 0;
+ gethrestime(&now);
+ gen = dmu_tx_get_txg(tx);
+ dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
+ }
+
+ if (dnodesize == 0)
+ dnodesize = DNODE_MIN_SIZE;
+
+ obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+
+ bonuslen = (obj_type == DMU_OT_SA) ?
+ DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ /*
+ * Create a new DMU object.
+ *
+ * There's currently no mechanism for pre-reading the blocks that will
+ * be needed to allocate a new object, so we accept the small chance
+ * that there will be an i/o error and we will fail one of the
+ * assertions below.
+ */
+ if (S_ISDIR(vap->va_mode)) {
+ if (zfsvfs->z_replay) {
+ VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = zap_create_norm_dnsize(zfsvfs->z_os,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ } else {
+ if (zfsvfs->z_replay) {
+ VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ }
+
+ zh = zfs_znode_hold_enter(zfsvfs, obj);
+ VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+
+ /*
+ * If this is the root, fix up the half-initialized parent pointer
+ * to reference the just-allocated physical data area.
+ */
+ if (flag & IS_ROOT_NODE) {
+ dzp->z_id = obj;
+ }
+
+ /*
+ * If parent is an xattr, so am I.
+ */
+ if (dzp->z_pflags & ZFS_XATTR) {
+ flag |= IS_XATTR;
+ }
+
+ if (zfsvfs->z_use_fuids)
+ pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ else
+ pflags = 0;
+
+ if (S_ISDIR(vap->va_mode)) {
+ size = 2; /* contents ("." and "..") */
+ links = 2;
+ } else {
+ size = 0;
+ links = (flag & IS_TMPFILE) ? 0 : 1;
+ }
+
+ if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
+ rdev = vap->va_rdev;
+
+ parent = dzp->z_id;
+ mode = acl_ids->z_mode;
+ if (flag & IS_XATTR)
+ pflags |= ZFS_XATTR;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
+ /*
+ * With the ZFS_PROJID flag we can easily tell whether a
+ * project ID is stored on disk. See zfs_space_delta_cb().
+ */
+ if (obj_type != DMU_OT_ZNODE &&
+ dmu_objset_projectquota_enabled(zfsvfs->z_os))
+ pflags |= ZFS_PROJID;
+
+ /*
+ * Inherit project ID from parent if required.
+ */
+ projid = zfs_inherit_projid(dzp);
+ if (dzp->z_pflags & ZFS_PROJINHERIT)
+ pflags |= ZFS_PROJINHERIT;
+ }
+
+ /*
+ * Whether "no execs denied" applies will be determined when
+ * zfs_mode_compute() is called.
+ */
+ pflags |= acl_ids->z_aclp->z_hints &
+ (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+ ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
+
+ ZFS_TIME_ENCODE(&now, crtime);
+ ZFS_TIME_ENCODE(&now, ctime);
+
+ if (vap->va_mask & ATTR_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, atime);
+ } else {
+ ZFS_TIME_ENCODE(&now, atime);
+ }
+
+ if (vap->va_mask & ATTR_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ } else {
+ ZFS_TIME_ENCODE(&now, mtime);
+ }
+
+ /* Now add in all of the "SA" attributes */
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ &sa_hdl));
+
+ /*
+ * Set up the array of attributes to be replaced/set on the new file.
+ *
+ * The order for DMU_OT_ZNODE is critical since it needs to be constructed
+ * in the old znode_phys_t format. Don't change this ordering.
+ */
+ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ } else {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
+ NULL, &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
+ NULL, &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ }
+
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+ &empty_xattr, 8);
+ } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ pflags & ZFS_PROJID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
+ NULL, &projid, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE ||
+ (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+ NULL, &rdev, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+ sizeof (uint64_t) * 4);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (zfs_acl_phys_t));
+ } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &acl_ids->z_aclp->z_acl_count, 8);
+ locate.cb_aclp = acl_ids->z_aclp;
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ acl_ids->z_aclp->z_acl_bytes);
+ mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ }
+
+ VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
+ if (!(flag & IS_ROOT_NODE)) {
+ /*
+ * The call to zfs_znode_alloc() may fail if memory is low
+ * via the call path: alloc_inode() -> inode_init_always() ->
+ * security_inode_alloc() -> inode_alloc_security(). Since
+ * the existing code is written such that zfs_mknode() cannot
+ * fail, retry until sufficient memory has been reclaimed.
+ */
+ do {
+ *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+ } while (*zpp == NULL);
+
+ VERIFY(*zpp != NULL);
+ VERIFY(dzp != NULL);
+ } else {
+ /*
+ * If we are creating the root node, the "parent" we
+ * passed in is the znode for the root.
+ */
+ *zpp = dzp;
+
+ (*zpp)->z_sa_hdl = sa_hdl;
+ }
+
+ (*zpp)->z_pflags = pflags;
+ (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
+ (*zpp)->z_dnodesize = dnodesize;
+ (*zpp)->z_projid = projid;
+
+ if (obj_type == DMU_OT_ZNODE ||
+ acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+ VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+ }
+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+ zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+/*
+ * Update in-core attributes. It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ xoptattr_t *xoap;
+ boolean_t update_inode = B_FALSE;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+ uint64_t times[2];
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
+ &times, sizeof (times), tx);
+ XVA_SET_RTN(xvap, XAT_CREATETIME);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+
+ update_inode = B_TRUE;
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+
+ update_inode = B_TRUE;
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+ xoap->xoa_av_quarantined, zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ zfs_sa_set_scanstamp(zp, xvap, tx);
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+ }
+
+ if (update_inode)
+ zfs_set_inode_flags(zp, ZTOI(zp));
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ znode_t *zp;
+ znode_hold_t *zh;
+ int err;
+ sa_handle_t *hdl;
+
+ *zpp = NULL;
+
+again:
+ zh = zfs_znode_hold_enter(zfsvfs, obj_num);
+
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EINVAL));
+ }
+
+ hdl = dmu_buf_get_user(db);
+ if (hdl != NULL) {
+ zp = sa_get_userdata(hdl);
+
+ /*
+ * Since "SA" does immediate eviction we
+ * should never find a sa handle that doesn't
+ * know about the znode.
+ */
+
+ ASSERT3P(zp, !=, NULL);
+
+ mutex_enter(&zp->z_lock);
+ ASSERT3U(zp->z_id, ==, obj_num);
+ /*
+ * If zp->z_unlinked is set, the znode is already marked
+ * for deletion and should not be discovered. Check this
+ * after checking igrab() due to fsetxattr() & O_TMPFILE.
+ *
+ * If igrab() returns NULL the VFS has independently
+ * determined the inode should be evicted and has
+ * called iput_final() to start the eviction process.
+ * The SA handle is still valid but because the VFS
+ * requires that the eviction succeed we must drop
+ * our locks and references to allow the eviction to
+ * complete. The zfs_zget() may then be retried.
+ *
+ * This unlikely case could be optimized by registering
+ * a sops->drop_inode() callback. The callback would
+ * need to detect the active SA hold thereby informing
+ * the VFS that this inode should not be evicted.
+ */
+ if (igrab(ZTOI(zp)) == NULL) {
+ if (zp->z_unlinked)
+ err = SET_ERROR(ENOENT);
+ else
+ err = SET_ERROR(EAGAIN);
+ } else {
+ *zpp = zp;
+ err = 0;
+ }
+
+ mutex_exit(&zp->z_lock);
+ sa_buf_rele(db, NULL);
+ zfs_znode_hold_exit(zfsvfs, zh);
+
+ if (err == EAGAIN) {
+ /* inode might need this to finish evict */
+ cond_resched();
+ goto again;
+ }
+ return (err);
+ }
+
+ /*
+ * Not found; create a new znode/vnode, but only if the file exists.
+ *
+ * There is a small window where zfs_vget() could
+ * find this object while a file create is still in
+ * progress. This is checked for in zfs_znode_alloc().
+ *
+ * If zfs_znode_alloc() fails it will drop the hold on the
+ * bonus buffer.
+ */
+ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+ doi.doi_bonus_type, NULL);
+ if (zp == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ *zpp = zp;
+ }
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (err);
+}
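+
+/*
+ * Sketch of the expected zfs_zget() caller pattern (illustrative):
+ *
+ *	znode_t *zp;
+ *	error = zfs_zget(zfsvfs, obj_num, &zp);
+ *	if (error == 0) {
+ *		... use zp ...
+ *		iput(ZTOI(zp));	 (drop the reference taken above)
+ *	}
+ */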
+
+int
+zfs_rezget(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ uint64_t obj_num = zp->z_id;
+ uint64_t mode;
+ uint64_t links;
+ sa_bulk_attr_t bulk[10];
+ int err;
+ int count = 0;
+ uint64_t gen;
+ uint64_t z_uid, z_gid;
+ uint64_t atime[2], mtime[2], ctime[2];
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ znode_hold_t *zh;
+
+ /*
+ * Skip ctldir znodes, otherwise they will always get invalidated. This
+ * causes odd behaviour for mounted snapdirs. In particular, for
+ * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
+ * anyone from automounting it again as long as someone is still using
+ * the detached mount.
+ */
+ if (zp->z_is_ctldir)
+ return (0);
+
+ zh = zfs_znode_hold_enter(zfsvfs, obj_num);
+
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ rw_enter(&zp->z_xattr_lock, RW_WRITER);
+ if (zp->z_xattr_cached) {
+ nvlist_free(zp->z_xattr_cached);
+ zp->z_xattr_cached = NULL;
+ }
+ rw_exit(&zp->z_xattr_lock);
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EINVAL));
+ }
+
+ zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+
+ /* reload cached values */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, sizeof (gen));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, sizeof (zp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &links, sizeof (links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &z_uid, sizeof (z_uid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &z_gid, sizeof (z_gid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EIO));
+ }
+
+ if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
+ &projid, 8);
+ if (err != 0 && err != ENOENT) {
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(err));
+ }
+ }
+
+ zp->z_projid = projid;
+ zp->z_mode = ZTOI(zp)->i_mode = mode;
+ zfs_uid_write(ZTOI(zp), z_uid);
+ zfs_gid_write(ZTOI(zp), z_gid);
+
+ ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
+ ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
+ ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
+
+ if ((uint32_t)gen != ZTOI(zp)->i_generation) {
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EIO));
+ }
+
+ set_nlink(ZTOI(zp), (uint32_t)links);
+ zfs_set_inode_flags(zp, ZTOI(zp));
+
+ zp->z_blksz = doi.doi_data_block_size;
+ zp->z_atime_dirty = B_FALSE;
+ zfs_znode_update_vfs(zp);
+
+ /*
+ * If the file has zero links, then it has been unlinked on the send
+ * side and it must be in the received unlinked set.
+ * We call zfs_znode_dmu_fini() now to prevent any accesses to the
+ * stale data and to prevent automatic removal of the file in
+ * zfs_zinactive(). The file will be removed either when it is removed
+ * on the send side and the next incremental stream is received or
+ * when the unlinked set gets processed.
+ */
+ zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
+ if (zp->z_unlinked)
+ zfs_znode_dmu_fini(zp);
+
+ zfs_znode_hold_exit(zfsvfs, zh);
+
+ return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ objset_t *os = zfsvfs->z_os;
+ uint64_t obj = zp->z_id;
+ uint64_t acl_obj = zfs_external_acl(zp);
+ znode_hold_t *zh;
+
+ zh = zfs_znode_hold_enter(zfsvfs, obj);
+ if (acl_obj) {
+ VERIFY(!zp->z_is_sa);
+ VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ }
+ VERIFY(0 == dmu_object_free(os, obj, tx));
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t z_id = zp->z_id;
+ znode_hold_t *zh;
+
+ ASSERT(zp->z_sa_hdl);
+
+ /*
+ * Don't allow a zfs_zget() while we're trying to release this znode.
+ */
+ zh = zfs_znode_hold_enter(zfsvfs, z_id);
+
+ mutex_enter(&zp->z_lock);
+
+ /*
+ * If this was the last reference to a file with no links, remove
+ * the file from the file system unless the file system is mounted
+ * read-only. That can happen, for example, if the file system was
+ * originally read-write, the file was opened, then unlinked and
+ * the file system was made read-only before the file was finally
+ * closed. The file will remain in the unlinked set.
+ */
+ if (zp->z_unlinked) {
+ ASSERT(!zfsvfs->z_issnap);
+ if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
+ mutex_exit(&zp->z_lock);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ zfs_rmnode(zp);
+ return;
+ }
+ }
+
+ mutex_exit(&zp->z_lock);
+ zfs_znode_dmu_fini(zp);
+
+ zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+#if defined(HAVE_INODE_TIMESPEC64_TIMES)
+#define zfs_compare_timespec timespec64_compare
+#else
+#define zfs_compare_timespec timespec_compare
+#endif
+
+/*
+ * Determine whether the znode's atime must be updated. The logic mostly
+ * duplicates the Linux kernel's relatime_need_update() functionality.
+ * This function is only called if the underlying filesystem actually has
+ * atime updates enabled.
+ */
+boolean_t
+zfs_relatime_need_update(const struct inode *ip)
+{
+ inode_timespec_t now;
+
+ gethrestime(&now);
+ /*
+ * In relatime mode, only update the atime if the previous atime
+ * is earlier than either the ctime or mtime or if at least a day
+ * has passed since the last update of atime.
+ */
+ if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
+ return (B_TRUE);
+
+ if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0)
+ return (B_TRUE);
+
+ if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
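+
+/*
+ * Example (illustrative): if the atime is older than either the mtime or
+ * the ctime, or is more than 24 hours old, the function above returns
+ * B_TRUE and the caller refreshes the atime; otherwise it is left alone.
+ */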
+
+/*
+ * Prepare to update znode time stamps.
+ *
+ * IN: zp - znode requiring timestamp update
+ * flag - ATTR_MTIME, ATTR_CTIME flags
+ *
+ * OUT: zp - z_seq
+ * mtime - new mtime
+ * ctime - new ctime
+ *
+ * Note: We don't update atime here, because we rely on Linux VFS to do
+ * atime updating.
+ */
+void
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2])
+{
+ inode_timespec_t now;
+
+ gethrestime(&now);
+
+ zp->z_seq++;
+
+ if (flag & ATTR_MTIME) {
+ ZFS_TIME_ENCODE(&now, mtime);
+ ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
+ if (ZTOZSB(zp)->z_use_fuids) {
+ zp->z_pflags |= (ZFS_ARCHIVE |
+ ZFS_AV_MODIFIED);
+ }
+ }
+
+ if (flag & ATTR_CTIME) {
+ ZFS_TIME_ENCODE(&now, ctime);
+ ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
+ if (ZTOZSB(zp)->z_use_fuids)
+ zp->z_pflags |= ZFS_ARCHIVE;
+ }
+}
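+
+/*
+ * Sketch of how callers typically consume the function above (illustrative;
+ * transaction setup and error handling omitted):
+ *
+ *	uint64_t mtime[2], ctime[2];
+ *	sa_bulk_attr_t bulk[2];
+ *	int count = 0;
+ *
+ *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ *	zfs_tstamp_update_setup(zp, ATTR_MTIME | ATTR_CTIME, mtime, ctime);
+ *	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ */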
+
+/*
+ * Grow the block size for a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * size - requested block size
+ * tx - open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+ int error;
+ u_longlong_t dummy;
+
+ if (size <= zp->z_blksz)
+ return;
+ /*
+ * If the file size is already greater than the current blocksize,
+ * we will not grow. If there is more than one block in a file,
+ * the blocksize cannot change.
+ */
+ if (zp->z_blksz && zp->z_size > zp->z_blksz)
+ return;
+
+ error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
+ size, 0, tx);
+
+ if (error == ENOTSUP)
+ return;
+ ASSERT0(error);
+
+ /* What blocksize did we actually get? */
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
+}
+
+/*
+ * Increase the file length
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_extend(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_tx_t *tx;
+ zfs_locked_range_t *lr;
+ uint64_t newblksz;
+ int error;
+
+ /*
+ * We will change zp_size; lock the whole file.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end <= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ if (end > zp->z_blksz &&
+ (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
+ } else {
+ newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
+ }
+ dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+ } else {
+ newblksz = 0;
+ }
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+
+ if (newblksz)
+ zfs_grow_blocksize(zp, newblksz, tx);
+
+ zp->z_size = end;
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
+ &zp->z_size, sizeof (zp->z_size), tx));
+
+ zfs_rangelock_exit(lr);
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * zfs_zero_partial_page - Modeled after update_pages() but
+ * with different arguments and semantics for use by zfs_freesp().
+ *
+ * Zeroes a piece of a single page cache entry for zp at offset
+ * start and length len.
+ *
+ * Caller must acquire a range lock on the file for the region
+ * being zeroed in order that the ARC and page cache stay in sync.
+ */
+static void
+zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
+{
+ struct address_space *mp = ZTOI(zp)->i_mapping;
+ struct page *pp;
+ int64_t off;
+ void *pb;
+
+ ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
+
+ off = start & (PAGE_SIZE - 1);
+ start &= PAGE_MASK;
+
+ pp = find_lock_page(mp, start >> PAGE_SHIFT);
+ if (pp) {
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ pb = kmap(pp);
+ bzero(pb + off, len);
+ kunmap(pp);
+
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ mark_page_accessed(pp);
+ SetPageUptodate(pp);
+ ClearPageError(pp);
+ unlock_page(pp);
+ put_page(pp);
+ }
+}
+
+/*
+ * Free space in a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of section to free.
+ * len - length of section to free.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zfs_locked_range_t *lr;
+ int error;
+
+ /*
+ * Lock the range being freed.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (off >= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+
+ if (off + len > zp->z_size)
+ len = zp->z_size - off;
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
+
+ /*
+ * Zero partial page cache entries. This must be done under a
+ * range lock in order to keep the ARC and page cache in sync.
+ */
+ if (zp->z_is_mapped) {
+ loff_t first_page, last_page, page_len;
+ loff_t first_page_offset, last_page_offset;
+
+ /* first possible full page in hole */
+ first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ /* last page of hole */
+ last_page = (off + len) >> PAGE_SHIFT;
+
+ /* offset of first_page */
+ first_page_offset = first_page << PAGE_SHIFT;
+ /* offset of last_page */
+ last_page_offset = last_page << PAGE_SHIFT;
+
+ /* truncate whole pages */
+ if (last_page_offset > first_page_offset) {
+ truncate_inode_pages_range(ZTOI(zp)->i_mapping,
+ first_page_offset, last_page_offset - 1);
+ }
+
+ /* truncate sub-page ranges */
+ if (first_page > last_page) {
+ /* entire punched area within a single page */
+ zfs_zero_partial_page(zp, off, len);
+ } else {
+ /* beginning of punched area at the end of a page */
+ page_len = first_page_offset - off;
+ if (page_len > 0)
+ zfs_zero_partial_page(zp, off, page_len);
+
+ /* end of punched area at the beginning of a page */
+ page_len = off + len - last_page_offset;
+ if (page_len > 0)
+ zfs_zero_partial_page(zp, last_page_offset,
+ page_len);
+ }
+ }
+ zfs_rangelock_exit(lr);
+
+ return (error);
+}
+
+/*
+ * Truncate a file
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_trunc(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_tx_t *tx;
+ zfs_locked_range_t *lr;
+ int error;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end >= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+ DMU_OBJECT_END);
+ if (error) {
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+
+ zp->z_size = end;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &zp->z_size, sizeof (zp->z_size));
+
+ if (end == 0) {
+ zp->z_pflags &= ~ZFS_SPARSE;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ }
+ VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+
+ dmu_tx_commit(tx);
+ zfs_rangelock_exit(lr);
+
+ return (0);
+}
+
+/*
+ * Free space in a file
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of range
+ * len - length of range to free (0 => free to EOF)
+ * flag - current file open mode flags.
+ * log - TRUE if this action should be logged
+ *
+ * RETURN: 0 on success, error code on failure
+ */
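+/*
+ * In short: a len of 0 truncates the file to "off" bytes via zfs_trunc(),
+ * while a non-zero len punches a hole of "len" bytes at "off" via
+ * zfs_free_range(), extending the file to off + len if that lies beyond
+ * the current EOF. When "off" alone is already past EOF the call reduces
+ * to zfs_extend(zp, off + len).
+ */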
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t mode;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+ int error;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+ sizeof (mode))) != 0)
+ return (error);
+
+ if (off > zp->z_size) {
+ error = zfs_extend(zp, off+len);
+ if (error == 0 && log)
+ goto log;
+ goto out;
+ }
+
+ if (len == 0) {
+ error = zfs_trunc(zp, off);
+ } else {
+ if ((error = zfs_free_range(zp, off, len)) == 0 &&
+ off + len > zp->z_size)
+ error = zfs_extend(zp, off+len);
+ }
+ if (error || !log)
+ goto out;
+log:
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
+ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+
+ dmu_tx_commit(tx);
+
+ zfs_znode_update_vfs(zp);
+ error = 0;
+
+out:
+ /*
+ * Truncate the page cache - for whole-file truncations use the
+ * purpose-built truncate_setsize() API. For hole punches the page
+ * cache is handled under a range lock in zfs_free_range.
+ */
+ if (len == 0)
+ truncate_setsize(ZTOI(zp), off);
+ return (error);
+}
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
+{
+ struct super_block *sb;
+ zfsvfs_t *zfsvfs;
+ uint64_t moid, obj, sa_obj, version;
+ uint64_t sense = ZFS_CASE_SENSITIVE;
+ uint64_t norm = 0;
+ nvpair_t *elem;
+ int size;
+ int error;
+ int i;
+ znode_t *rootzp = NULL;
+ vattr_t vattr;
+ znode_t *zp;
+ zfs_acl_ids_t acl_ids;
+
+ /*
+ * First attempt to create master node.
+ */
+ /*
+ * In an empty objset, there are no blocks to read and thus
+ * there can be no i/o errors (which we assert below).
+ */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
+ /* For the moment we expect all zpl props to be uint64_ts */
+ uint64_t val;
+ char *name;
+
+ ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
+ VERIFY(nvpair_value_uint64(elem, &val) == 0);
+ name = nvpair_name(elem);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
+ if (val < version)
+ version = val;
+ } else {
+ error = zap_update(os, moid, name, 8, 1, &val, tx);
+ }
+ ASSERT(error == 0);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
+ norm = val;
+ else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
+ sense = val;
+ }
+ ASSERT(version != 0);
+ error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
+
+ /*
+ * Create zap object used for SA attribute registration
+ */
+
+ if (version >= ZPL_VERSION_SA) {
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT(error == 0);
+ } else {
+ sa_obj = 0;
+ }
+ /*
+ * Create a delete queue.
+ */
+ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/inode/zfsvfs/sb
+ * to allow zfs_mknode to work.
+ */
+ vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = crgetuid(cr);
+ vattr.va_gid = crgetgid(cr);
+
+ rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ rootzp->z_unlinked = B_FALSE;
+ rootzp->z_atime_dirty = B_FALSE;
+ rootzp->z_is_sa = USE_SA(version, os);
+ rootzp->z_pflags = 0;
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+ zfsvfs->z_os = os;
+ zfsvfs->z_parent = zfsvfs;
+ zfsvfs->z_version = version;
+ zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs->z_use_sa = USE_SA(version, os);
+ zfsvfs->z_norm = norm;
+
+ sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
+ sb->s_fs_info = zfsvfs;
+
+ ZTOI(rootzp)->i_sb = sb;
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+
+ ASSERT(error == 0);
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
+ zfsvfs->z_hold_size = size;
+ zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
+ KM_SLEEP);
+ zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
+ for (i = 0; i != size; i++) {
+ avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
+ sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
+ mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
+ }
+
+ VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+ cr, NULL, &acl_ids));
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
+ ASSERT3P(zp, ==, rootzp);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
+ ASSERT(error == 0);
+ zfs_acl_ids_free(&acl_ids);
+
+ atomic_set(&ZTOI(rootzp)->i_count, 0);
+ sa_handle_destroy(rootzp->z_sa_hdl);
+ kmem_cache_free(znode_cache, rootzp);
+
+ for (i = 0; i != size; i++) {
+ avl_destroy(&zfsvfs->z_hold_trees[i]);
+ mutex_destroy(&zfsvfs->z_hold_locks[i]);
+ }
+
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+
+ vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
+ vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
+ kmem_free(sb, sizeof (struct super_block));
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+#endif /* _KERNEL */
+
+static int
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+ uint64_t sa_obj = 0;
+ int error;
+
+ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+ if (error != 0 && error != ENOENT)
+ return (error);
+
+ error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+ return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+ dmu_buf_t **db, void *tag)
+{
+ dmu_object_info_t doi;
+ int error;
+
+ if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
+ return (error);
+
+ dmu_object_info_from_db(*db, &doi);
+ if ((doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t))) {
+ sa_buf_rele(*db, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+ if (error != 0) {
+ sa_buf_rele(*db, tag);
+ return (error);
+ }
+
+ return (0);
+}
+
+static void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
+{
+ sa_handle_destroy(hdl);
+ sa_buf_rele(db, tag);
+}
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ uint64_t *pobjp, int *is_xattrdir)
+{
+ uint64_t parent;
+ uint64_t pflags;
+ uint64_t mode;
+ uint64_t parent_mode;
+ sa_bulk_attr_t bulk[3];
+ sa_handle_t *sa_hdl;
+ dmu_buf_t *sa_db;
+ int count = 0;
+ int error;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+ &parent, sizeof (parent));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+ &pflags, sizeof (pflags));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &mode, sizeof (mode));
+
+ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+ return (error);
+
+ /*
+ * When a link is removed its parent pointer is not changed and will
+ * be invalid. There are two cases where a link is removed but the
+ * file stays around: when it goes to the delete queue and when there
+ * are additional links.
+ */
+ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+ /*
+ * Extended attributes can be applied to files, directories, etc.
+ * Otherwise the parent must be a directory.
+ */
+ if (!*is_xattrdir && !S_ISDIR(parent_mode))
+ return (SET_ERROR(EINVAL));
+
+ *pobjp = parent;
+
+ return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ zfs_stat_t *sb)
+{
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &sb->zs_mode, sizeof (sb->zs_mode));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+ &sb->zs_gen, sizeof (sb->zs_gen));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+ &sb->zs_links, sizeof (sb->zs_links));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+ &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+ return (sa_bulk_lookup(hdl, bulk, count));
+}
+
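+/*
+ * Given an object number, construct its path by walking the parent
+ * pointers back to the root. Components are prepended at the tail end
+ * of "buf" and the finished string is moved to the front of the buffer
+ * before returning. Objects found on the delete queue return ESTALE.
+ */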
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+ sa_attr_type_t *sa_table, char *buf, int len)
+{
+ sa_handle_t *sa_hdl;
+ sa_handle_t *prevhdl = NULL;
+ dmu_buf_t *prevdb = NULL;
+ dmu_buf_t *sa_db = NULL;
+ char *path = buf + len - 1;
+ int error;
+
+ *path = '\0';
+ sa_hdl = hdl;
+
+ uint64_t deleteq_obj;
+ VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+ ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+ error = zap_lookup_int(osp, deleteq_obj, obj);
+ if (error == 0) {
+ return (ESTALE);
+ } else if (error != ENOENT) {
+ return (error);
+ }
+ error = 0;
+
+ for (;;) {
+ uint64_t pobj = 0;
+ char component[MAXNAMELEN + 2];
+ size_t complen;
+ int is_xattrdir = 0;
+
+ if (prevdb) {
+ ASSERT(prevhdl != NULL);
+ zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+ }
+
+ if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
+ &is_xattrdir)) != 0)
+ break;
+
+ if (pobj == obj) {
+ if (path[0] != '/')
+ *--path = '/';
+ break;
+ }
+
+ component[0] = '/';
+ if (is_xattrdir) {
+ (void) sprintf(component + 1, "<xattrdir>");
+ } else {
+ error = zap_value_search(osp, pobj, obj,
+ ZFS_DIRENT_OBJ(-1ULL), component + 1);
+ if (error != 0)
+ break;
+ }
+
+ complen = strlen(component);
+ path -= complen;
+ ASSERT(path >= buf);
+ bcopy(component, path, complen);
+ obj = pobj;
+
+ if (sa_hdl != hdl) {
+ prevhdl = sa_hdl;
+ prevdb = sa_db;
+ }
+ error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+ if (error != 0) {
+ sa_hdl = prevhdl;
+ sa_db = prevdb;
+ break;
+ }
+ }
+
+ if (sa_hdl != NULL && sa_hdl != hdl) {
+ ASSERT(sa_db != NULL);
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ }
+
+ if (error == 0)
+ (void) memmove(buf, path, buf + len - path);
+
+ return (error);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len)
+{
+ char *path = buf + len - 1;
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ *path = '\0';
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+ if (error != 0) {
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+ }
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_create_fs);
+EXPORT_SYMBOL(zfs_obj_to_path);
+
+/* CSTYLED */
+module_param(zfs_object_mutex_size, uint, 0644);
+MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
+module_param(zfs_unlink_suspend_progress, int, 0644);
+MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
+"(debug - leaks space into the unlinked set)");
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
new file mode 100644
index 000000000000..284ca706ede5
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
@@ -0,0 +1,2049 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/zio_crypt.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/sha2.h>
+#include <sys/hkdf.h>
+#include <sys/qat.h>
+
+/*
+ * This file is responsible for handling all of the details of generating
+ * encryption parameters and performing encryption and authentication.
+ *
+ * BLOCK ENCRYPTION PARAMETERS:
+ * Encryption / Authentication Algorithm Suite (crypt):
+ * The encryption algorithm, mode, and key length we are going to use. We
+ * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
+ * keys. All authentication is currently done with SHA512-HMAC.
+ *
+ * Plaintext:
+ * The unencrypted data that we want to encrypt.
+ *
+ * Initialization Vector (IV):
+ * An initialization vector for the encryption algorithms. This is used to
+ * "tweak" the encryption algorithms so that two blocks of the same data are
+ * encrypted into different ciphertext outputs, thus obfuscating block patterns.
+ * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
+ * never reused with the same encryption key. This value is stored unencrypted
+ * and must simply be provided to the decryption function. We use a 96 bit IV
+ * (as recommended by NIST) for all block encryption. For non-dedup blocks we
+ * derive the IV randomly. The first 64 bits of the IV are stored in the second
+ * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
+ * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
+ * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
+ * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
+ * level 0 blocks is the number of allocated dnodes in that block. The on-disk
+ * format supports at most 2^15 slots per L0 dnode block, because the maximum
+ * block size is 16MB (2^24). In either case, for level 0 blocks this number
+ * will still be smaller than UINT32_MAX so it is safe to store the IV in the
+ * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
+ * for the dnode code.
+ *
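+ * Laid out on disk, the salt and IV of an encrypted non-dedup block end
+ * up as (see zio_crypt_encode_params_bp()):
+ *
+ *	blk_dva[2].dva_word[0]      - 64 bit salt
+ *	blk_dva[2].dva_word[1]      - IV bytes 0-7
+ *	upper 32 bits of blk_fill   - IV bytes 8-11 ("IV2")
+ *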
+ * Master key:
+ * This is the most important secret data of an encrypted dataset. It is used
+ * along with the salt to generate the actual encryption keys via HKDF. We
+ * do not use the master key to directly encrypt any data because there are
+ * theoretical limits on how much data can actually be safely encrypted with
+ * any encryption mode. The master key is stored encrypted on disk with the
+ * user's wrapping key. Its length is determined by the encryption algorithm.
+ * For details on how this is stored see the block comment in dsl_crypt.c
+ *
+ * Salt:
+ * Used as an input to the HKDF function, along with the master key. We use a
+ * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
+ * can be used for encrypting many blocks, so we cache the current salt and the
+ * associated derived key in zio_crypt_t so we do not need to derive it again
+ * needlessly.
+ *
+ * Encryption Key:
+ * A secret binary key, generated from an HKDF function used to encrypt and
+ * decrypt data.
+ *
+ * Message Authentication Code (MAC)
+ * The MAC is an output of authenticated encryption modes such as AES-GCM and
+ * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
+ * data on disk and return garbage to the application. Effectively, it is a
+ * checksum that can not be reproduced by an attacker. We store the MAC in the
+ * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
+ * regular checksum of the ciphertext which can be used for scrubbing.
+ *
+ * OBJECT AUTHENTICATION:
+ * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
+ * they contain some info that always needs to be readable. To prevent this
+ * data from being altered, we authenticate this data using SHA512-HMAC. This
+ * will produce a MAC (similar to the one produced via encryption) which can
+ * be used to verify the object was not modified. HMACs do not require key
+ * rotation or IVs, so we can keep up to the full 3 copies of authenticated
+ * data.
+ *
+ * ZIL ENCRYPTION:
+ * ZIL blocks have their bp written to disk ahead of the associated data, so we
+ * cannot store the MAC there as we normally do. For these blocks the MAC is
+ * stored in the embedded checksum within the zil_chain_t header. The salt and
+ * IV are generated for the block on bp allocation instead of at encryption
+ * time. In addition, ZIL blocks have some pieces that must be left in plaintext
+ * for claiming even though all of the sensitive user data still needs to be
+ * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
+ * pieces of the block need to be encrypted. All data that is not encrypted is
+ * authenticated using the AAD mechanisms that the supported encryption modes
+ * provide for. In order to preserve the semantics of the ZIL for encrypted
+ * datasets, the ZIL is not protected at the objset level as described below.
+ *
+ * DNODE ENCRYPTION:
+ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
+ * in plaintext for scrubbing and claiming, but the bonus buffers might contain
+ * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
+ * which pieces of the block need to be encrypted. For more details about
+ * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
+ *
+ * OBJECT SET AUTHENTICATION:
+ * Up to this point, everything we have encrypted and authenticated has been
+ * at level 0 (or -2 for the ZIL). If we did not do any further work the
+ * on-disk format would be susceptible to attacks that deleted or rearranged
+ * the order of level 0 blocks. Ideally, the cleanest solution would be to
+ * maintain a tree of authentication MACs going up the bp tree. However, this
+ * presents a problem for raw sends. Send files do not send information about
+ * indirect blocks so there would be no convenient way to transfer the MACs and
+ * they cannot be recalculated on the receive side without the master key which
+ * would defeat one of the purposes of raw sends in the first place. Instead,
+ * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
+ * from the level below. We also include some portable fields from blk_prop such
+ * as the lsize and compression algorithm to prevent the data from being
+ * misinterpreted.
+ *
+ * At the objset level, we maintain 2 separate 256 bit MACs in the
+ * objset_phys_t. The first one is "portable" and is the logical root of the
+ * MAC tree maintained in the metadnode's bps. The second is "local" and is
+ * used as the root MAC for the user accounting objects, which are also not
+ * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
+ * of the send file. The useraccounting code ensures that the useraccounting
+ * info is not present upon a receive, so the local MAC can simply be cleared
+ * out at that time. For more info about objset_phys_t authentication, see
+ * zio_crypt_do_objset_hmacs().
+ *
+ * CONSIDERATIONS FOR DEDUP:
+ * In order for dedup to work, blocks that we want to dedup with one another
+ * need to use the same IV and encryption key, so that they will have the same
+ * ciphertext. Normally, one should never reuse an IV with the same encryption
+ * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
+ * blocks. In this case, however, since we are using the same plaintext as
+ * well, all that we end up with is a duplicate of the original ciphertext we
+ * already had. As a result, an attacker with read access to the raw disk will
+ * be able to tell which blocks are the same but this information is given away
+ * by dedup anyway. In order to get the same IVs and encryption keys for
+ * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
+ * here so that a reproducible checksum of the plaintext is never available to
+ * the attacker. The HMAC key is kept alongside the master key, encrypted on
+ * disk. The first 64 bits of the HMAC are used in place of the random salt, and
+ * the next 96 bits are used as the IV. As a result of this mechanism, dedup
+ * will only work within a clone family since encrypted dedup requires use of
+ * the same master and HMAC keys.
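+ *
+ * Concretely, the 64 byte HMAC-SHA512 of the plaintext is split as
+ * bytes 0-7 -> salt and bytes 8-19 -> IV (see
+ * zio_crypt_generate_iv_salt_dedup()), so two identical plaintexts
+ * encrypted under the same master and HMAC keys always produce
+ * identical ciphertexts.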
+ */
+
+/*
+ * After encrypting many blocks with the same key we may start to run up
+ * against the theoretical limits of how much data can securely be encrypted
+ * with a single key using the supported encryption modes. The most obvious
+ * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
+ * the more IVs we generate (which both GCM and CCM modes strictly forbid).
+ * This risk actually grows surprisingly quickly over time according to the
+ * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have
+ * generated n IVs with a cryptographically secure RNG, the approximate
+ * probability p(n) of a collision is given as:
+ *
+ * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96)))
+ *
+ * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
+ *
+ * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
+ * we must not write more than 398,065,730 blocks with the same encryption key.
+ * Therefore, we rotate our keys after 400,000,000 blocks have been written by
+ * generating a new random 64 bit salt for our HKDF encryption key generation
+ * function.
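+ *
+ * As a worked check with these numbers: n*(n-1)/(2*(2^96)) is roughly
+ * 1.0e-12 for n = 398,065,730, so 1 - e^(-1.0e-12) keeps the collision
+ * probability at about one in a trillion.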
+ */
+#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000
+#define ZFS_CURRENT_MAX_SALT_USES \
+ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
+unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
+
+typedef struct blkptr_auth_buf {
+ uint64_t bab_prop; /* blk_prop - portable mask */
+ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */
+ uint64_t bab_pad; /* reserved for future use */
+} blkptr_auth_buf_t;
+
+zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
+ {"", ZC_TYPE_NONE, 0, "inherit"},
+ {"", ZC_TYPE_NONE, 0, "on"},
+ {"", ZC_TYPE_NONE, 0, "off"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"}
+};
+
+void
+zio_crypt_key_destroy(zio_crypt_key_t *key)
+{
+ rw_destroy(&key->zk_salt_lock);
+
+ /* free crypto templates */
+ crypto_destroy_ctx_template(key->zk_current_tmpl);
+ crypto_destroy_ctx_template(key->zk_hmac_tmpl);
+
+ /* zero out sensitive data */
+ bzero(key, sizeof (zio_crypt_key_t));
+}
+
+int
+zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ uint_t keydata_len;
+
+ ASSERT(key != NULL);
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+ bzero(key, sizeof (zio_crypt_key_t));
+
+ /* fill keydata buffers and salt with random data */
+ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_master_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for the ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ /*
+ * Initialize the crypto templates. It's ok if this fails because
+ * this is just an optimization.
+ */
+ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+ &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
+ &key->zk_hmac_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_hmac_tmpl = NULL;
+
+ key->zk_crypt = crypt;
+ key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+ key->zk_salt_count = 0;
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy(key);
+ return (ret);
+}
+
+static int
+zio_crypt_key_change_salt(zio_crypt_key_t *key)
+{
+ int ret = 0;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ crypto_mechanism_t mech;
+ uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
+
+ /* generate a new salt */
+ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ rw_enter(&key->zk_salt_lock, RW_WRITER);
+
+ /* someone beat us to the salt rotation, just unlock and return */
+ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
+ goto out_unlock;
+
+ /* derive the current key from the master key and the new salt */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
+ if (ret != 0)
+ goto out_unlock;
+
+ /* assign the salt and reset the usage count */
+ bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
+ key->zk_salt_count = 0;
+
+ /* destroy the old context template and create the new one */
+ crypto_destroy_ctx_template(key->zk_current_tmpl);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+ &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ rw_exit(&key->zk_salt_lock);
+
+ return (0);
+
+out_unlock:
+ rw_exit(&key->zk_salt_lock);
+error:
+ return (ret);
+}
+
+/* See comment above zfs_key_max_salt_uses definition for details */
+int
+zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
+{
+ int ret;
+ boolean_t salt_change;
+
+ rw_enter(&key->zk_salt_lock, RW_READER);
+
+ bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
+ salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >=
+ ZFS_CURRENT_MAX_SALT_USES);
+
+ rw_exit(&key->zk_salt_lock);
+
+ if (salt_change) {
+ ret = zio_crypt_key_change_salt(key);
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * This function handles all encryption and decryption in zfs. When
+ * encrypting it expects puio to reference the plaintext and cuio to
+ * reference the ciphertext. cuio must have enough space for the
+ * ciphertext + room for a MAC. datalen should be the length of the
+ * plaintext / ciphertext alone.
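+ *
+ * A minimal layout sketch for a single ciphertext buffer (variable
+ * names here are illustrative only):
+ *
+ *	iovec_t cv[2];
+ *	cv[0].iov_base = cipherbuf;
+ *	cv[0].iov_len = datalen;
+ *	cv[1].iov_base = macbuf;
+ *	cv[1].iov_len = ZIO_DATA_MAC_LEN;
+ *	cuio->uio_iov = cv;
+ *	cuio->uio_iovcnt = 2;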
+ */
+static int
+zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen,
+ zfs_uio_t *puio, zfs_uio_t *cuio, uint8_t *authbuf, uint_t auth_len)
+{
+ int ret;
+ crypto_data_t plaindata, cipherdata;
+ CK_AES_CCM_PARAMS ccmp;
+ CK_AES_GCM_PARAMS gcmp;
+ crypto_mechanism_t mech;
+ zio_crypt_info_t crypt_info;
+ uint_t plain_full_len, maclen;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW);
+
+ /* lookup the encryption info */
+ crypt_info = zio_crypt_table[crypt];
+
+ /* the mac will always be the last iovec_t in the cipher uio */
+ maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len;
+
+ ASSERT(maclen <= ZIO_DATA_MAC_LEN);
+
+ /* setup encryption mechanism (same as crypt) */
+ mech.cm_type = crypto_mech2id(crypt_info.ci_mechname);
+
+ /*
+ * Strangely, the ICP requires that plain_full_len include
+ * the MAC length when decrypting, even though the UIO does not
+ * need to have the extra space allocated.
+ */
+ if (encrypt) {
+ plain_full_len = datalen;
+ } else {
+ plain_full_len = datalen + maclen;
+ }
+
+ /*
+ * setup encryption params (currently only AES CCM and AES GCM
+ * are supported)
+ */
+ if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) {
+ ccmp.ulNonceSize = ZIO_DATA_IV_LEN;
+ ccmp.ulAuthDataSize = auth_len;
+ ccmp.authData = authbuf;
+ ccmp.ulMACSize = maclen;
+ ccmp.nonce = ivbuf;
+ ccmp.ulDataSize = plain_full_len;
+
+ mech.cm_param = (char *)(&ccmp);
+ mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS);
+ } else {
+ gcmp.ulIvLen = ZIO_DATA_IV_LEN;
+ gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN);
+ gcmp.ulAADLen = auth_len;
+ gcmp.pAAD = authbuf;
+ gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen);
+ gcmp.pIv = ivbuf;
+
+ mech.cm_param = (char *)(&gcmp);
+ mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
+ }
+
+ /* populate the cipher and plain data structs. */
+ plaindata.cd_format = CRYPTO_DATA_UIO;
+ plaindata.cd_offset = 0;
+ plaindata.cd_uio = puio;
+ plaindata.cd_miscdata = NULL;
+ plaindata.cd_length = plain_full_len;
+
+ cipherdata.cd_format = CRYPTO_DATA_UIO;
+ cipherdata.cd_offset = 0;
+ cipherdata.cd_uio = cuio;
+ cipherdata.cd_miscdata = NULL;
+ cipherdata.cd_length = datalen + maclen;
+
+ /* perform the actual encryption */
+ if (encrypt) {
+ ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata,
+ NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+ } else {
+ ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata,
+ NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ASSERT3U(ret, ==, CRYPTO_INVALID_MAC);
+ ret = SET_ERROR(ECKSUM);
+ goto error;
+ }
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+int
+zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
+ uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
+{
+ int ret;
+ zfs_uio_t puio, cuio;
+ uint64_t aad[3];
+ iovec_t plain_iovecs[2], cipher_iovecs[3];
+ uint64_t crypt = key->zk_crypt;
+ uint_t enc_len, keydata_len, aad_len;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+ /* generate iv for wrapping the master and hmac key */
+ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* initialize zfs_uio_ts */
+ plain_iovecs[0].iov_base = key->zk_master_keydata;
+ plain_iovecs[0].iov_len = keydata_len;
+ plain_iovecs[1].iov_base = key->zk_hmac_keydata;
+ plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+
+ cipher_iovecs[0].iov_base = keydata_out;
+ cipher_iovecs[0].iov_len = keydata_len;
+ cipher_iovecs[1].iov_base = hmac_keydata_out;
+ cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+ cipher_iovecs[2].iov_base = mac;
+ cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
+
+ /*
+ * Although we don't support writing to the old format, we do
+ * support rewrapping the key so that the user can move and
+ * quarantine datasets on the old format.
+ */
+ if (key->zk_version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(key->zk_guid);
+ } else {
+ ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(key->zk_guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(key->zk_version);
+ }
+
+ enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN;
+ puio.uio_iov = plain_iovecs;
+ puio.uio_iovcnt = 2;
+ puio.uio_segflg = UIO_SYSSPACE;
+ cuio.uio_iov = cipher_iovecs;
+ cuio.uio_iovcnt = 3;
+ cuio.uio_segflg = UIO_SYSSPACE;
+
+ /* encrypt the keys and store the resulting ciphertext and mac */
+ ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len,
+ &puio, &cuio, (uint8_t *)aad, aad_len);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+int
+zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
+ uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
+ uint8_t *mac, zio_crypt_key_t *key)
+{
+ crypto_mechanism_t mech;
+ zfs_uio_t puio, cuio;
+ uint64_t aad[3];
+ iovec_t plain_iovecs[2], cipher_iovecs[3];
+ uint_t enc_len, keydata_len, aad_len;
+ int ret;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+ /* initialize zfs_uio_ts */
+ plain_iovecs[0].iov_base = key->zk_master_keydata;
+ plain_iovecs[0].iov_len = keydata_len;
+ plain_iovecs[1].iov_base = key->zk_hmac_keydata;
+ plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+
+ cipher_iovecs[0].iov_base = keydata;
+ cipher_iovecs[0].iov_len = keydata_len;
+ cipher_iovecs[1].iov_base = hmac_keydata;
+ cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+ cipher_iovecs[2].iov_base = mac;
+ cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
+
+ if (version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(guid);
+ } else {
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(version);
+ }
+
+ enc_len = keydata_len + SHA512_HMAC_KEYLEN;
+ puio.uio_iov = plain_iovecs;
+ puio.uio_segflg = UIO_SYSSPACE;
+ puio.uio_iovcnt = 2;
+ cuio.uio_iov = cipher_iovecs;
+ cuio.uio_iovcnt = 3;
+ cuio.uio_segflg = UIO_SYSSPACE;
+
+ /* decrypt the keys and store the result in the output buffers */
+ ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len,
+ &puio, &cuio, (uint8_t *)aad, aad_len);
+ if (ret != 0)
+ goto error;
+
+ /* generate a fresh salt */
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ /*
+ * Initialize the crypto templates. It's ok if this fails because
+ * this is just an optimization.
+ */
+ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+ &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
+ &key->zk_hmac_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_hmac_tmpl = NULL;
+
+ key->zk_crypt = crypt;
+ key->zk_version = version;
+ key->zk_guid = guid;
+ key->zk_salt_count = 0;
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy(key);
+ return (ret);
+}
+
+int
+zio_crypt_generate_iv(uint8_t *ivbuf)
+{
+ int ret;
+
+ /* randomly generate the IV */
+ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ bzero(ivbuf, ZIO_DATA_IV_LEN);
+ return (ret);
+}
+
+int
+zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
+ uint8_t *digestbuf, uint_t digestlen)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_data_t in_data, digest_data;
+ uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH];
+
+ ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
+
+ /* initialize sha512-hmac mechanism and crypto data */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ /* initialize the crypto data */
+ in_data.cd_format = CRYPTO_DATA_RAW;
+ in_data.cd_offset = 0;
+ in_data.cd_length = datalen;
+ in_data.cd_raw.iov_base = (char *)data;
+ in_data.cd_raw.iov_len = in_data.cd_length;
+
+ digest_data.cd_format = CRYPTO_DATA_RAW;
+ digest_data.cd_offset = 0;
+ digest_data.cd_length = SHA512_DIGEST_LENGTH;
+ digest_data.cd_raw.iov_base = (char *)raw_digestbuf;
+ digest_data.cd_raw.iov_len = digest_data.cd_length;
+
+ /* generate the hmac */
+ ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl,
+ &digest_data, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_digestbuf, digestbuf, digestlen);
+
+ return (0);
+
+error:
+ bzero(digestbuf, digestlen);
+ return (ret);
+}
+
+int
+zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
+ uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
+{
+ int ret;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ ret = zio_crypt_do_hmac(key, data, datalen,
+ digestbuf, SHA512_DIGEST_LENGTH);
+ if (ret != 0)
+ return (ret);
+
+ bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN);
+ bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN);
+
+ return (0);
+}
+
+/*
+ * The following functions are used to encode and decode encryption parameters
+ * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
+ * byte strings, which normally means that these strings would not need to deal
+ * with byteswapping at all. However, both blkptr_t and zil_header_t may be
+ * byteswapped by lower layers and so we must "undo" that byteswap here upon
+ * decoding and encoding in a non-native byteorder. These functions require
+ * that the byteorder bit is correct before being called.
+ */
+void
+zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_ENCRYPTED(bp));
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t));
+ bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t));
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, val32);
+ } else {
+ bcopy(salt, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[0] = BSWAP_64(val64);
+
+ bcopy(iv, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[1] = BSWAP_64(val64);
+
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, BSWAP_32(val32));
+ }
+}
+
+void
+zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_IS_AUTHENTICATED(bp)) {
+ bzero(salt, ZIO_DATA_SALT_LEN);
+ bzero(iv, ZIO_DATA_IV_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t));
+ bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t));
+
+ val32 = (uint32_t)BP_GET_IV2(bp);
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]);
+ bcopy(&val64, salt, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]);
+ bcopy(&val64, iv, sizeof (uint64_t));
+
+ val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp));
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp));
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3],
+ sizeof (uint64_t));
+ } else {
+ bcopy(mac, &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[2] = BSWAP_64(val64);
+
+ bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[3] = BSWAP_64(val64);
+ }
+}
+
+void
+zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ bzero(mac, ZIO_DATA_MAC_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[2]);
+ bcopy(&val64, mac, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[3]);
+ bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
+{
+ zil_chain_t *zilc = data;
+
+ bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
+ sizeof (uint64_t));
+}
+
+void
+zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
+{
+ /*
+ * The ZIL MAC is embedded in the block it protects, which will
+ * not have been byteswapped by the time this function has been called.
+ * As a result, we don't need to worry about byteswapping the MAC.
+ */
+ const zil_chain_t *zilc = data;
+
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+}
+
+/*
+ * This routine takes a block of dnodes (src_abd) and copies only the bonus
+ * buffers to the same offsets in the dst buffer. datalen should be the size
+ * of both the src_abd and the dst buffer (not just the length of the bonus
+ * buffers).
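+ *
+ * The block is walked in dnode slots: the loop index advances by
+ * dn_extra_slots + 1, so e.g. a 1K dnode occupying two 512 byte slots
+ * is visited once and its extra slot is skipped.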
+ */
+void
+zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
+{
+ uint_t i, max_dnp = datalen >> DNODE_SHIFT;
+ uint8_t *src;
+ dnode_phys_t *dnp, *sdnp, *ddnp;
+
+ src = abd_borrow_buf_copy(src_abd, datalen);
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]),
+ DN_MAX_BONUS_LEN(dnp));
+ }
+ }
+
+ abd_return_buf(src_abd, src, datalen);
+}
+
+/*
+ * This function decides which fields from blk_prop are included in
+ * the various on-disk MACs by zeroing out the non-portable fields.
+ */
+static void
+zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
+{
+ /*
+ * Version 0 did not properly zero out all non-portable fields
+ * as it should have done. We maintain this code so that we can
+ * do read-only imports of pools on this version.
+ */
+ if (version == 0) {
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
+ return;
+ }
+
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+
+ /*
+ * The hole_birth feature might set these fields even if this bp
+ * is a hole. We zero them out here to guarantee that raw sends
+ * will function with or without the feature.
+ */
+ if (BP_IS_HOLE(bp)) {
+ bp->blk_prop = 0ULL;
+ return;
+ }
+
+ /*
+ * At L0 we want to verify these fields to ensure that data blocks
+ * can not be reinterpreted. For instance, we do not want an attacker
+ * to trick us into returning raw lz4 compressed data to the user
+ * by modifying the compression bits. At higher levels, we cannot
+ * enforce this policy since raw sends do not convey any information
+ * about indirect blocks, so these values might be different on the
+ * receive side. Fortunately, this does not open any new attack
+ * vectors, since any alterations that can be made to a higher level
+ * bp must still verify the correct order of the layer below it.
+ */
+ if (BP_GET_LEVEL(bp) != 0) {
+ BP_SET_BYTEORDER(bp, 0);
+ BP_SET_COMPRESS(bp, 0);
+
+ /*
+ * psize cannot be set to zero or it will trigger
+ * asserts, but the value doesn't really matter as
+ * long as it is constant.
+ */
+ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
+ }
+
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+}
+
+static void
+zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
+ blkptr_auth_buf_t *bab, uint_t *bab_len)
+{
+ blkptr_t tmpbp = *bp;
+
+ if (should_bswap)
+ byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+ ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
+ ASSERT0(BP_IS_EMBEDDED(&tmpbp));
+
+ zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac);
+
+ /*
+ * We always MAC blk_prop in LE to ensure portability. This
+ * must be done after decoding the MAC, since decoding checks
+ * the byteorder bit of blk_prop, which may be zeroed out below.
+ */
+ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version);
+ bab->bab_prop = LE_64(tmpbp.blk_prop);
+ bab->bab_pad = 0ULL;
+
+ /* version 0 did not include the padding */
+ *bab_len = sizeof (blkptr_auth_buf_t);
+ if (version == 0)
+ *bab_len -= sizeof (uint64_t);
+}
+
+static int
+zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ int ret;
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+ crypto_data_t cd;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+ cd.cd_length = bab_len;
+ cd.cd_raw.iov_base = (char *)&bab;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+static void
+zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ SHA2Update(ctx, &bab, bab_len);
+}
+
+static void
+zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ bcopy(&bab, *aadp, bab_len);
+ *aadp += bab_len;
+ *aad_len += bab_len;
+}
+
+static int
+zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, dnode_phys_t *dnp)
+{
+ int ret, i;
+ dnode_phys_t *adnp;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ crypto_data_t cd;
+ uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)];
+
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+
+ /* authenticate the core dnode (masking out non-portable bits) */
+ bcopy(dnp, tmp_dncore, sizeof (tmp_dncore));
+ adnp = (dnode_phys_t *)tmp_dncore;
+ if (le_bswap) {
+ adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec);
+ adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen);
+ adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid);
+ adnp->dn_used = BSWAP_64(adnp->dn_used);
+ }
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+
+ cd.cd_length = sizeof (tmp_dncore);
+ cd.cd_raw.iov_base = (char *)adnp;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, &dnp->dn_blkptr[i]);
+ if (ret != 0)
+ goto error;
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, DN_SPILL_BLKPTR(dnp));
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * objset_phys_t blocks introduce a number of exceptions to the normal
+ * authentication process. objset_phys_t's contain 2 separate HMACs for
+ * protecting the integrity of their data. The portable_mac protects the
+ * metadnode. This MAC can be sent with a raw send and protects against
+ * reordering of data within the metadnode. The local_mac protects the user
+ * accounting objects which are not sent from one system to another.
+ *
+ * In addition, objset blocks are the only blocks that can be modified and
+ * written to disk without the key loaded under certain circumstances. During
+ * zil_claim() we need to be able to update the zil_header_t to complete
+ * claiming log blocks and during raw receives we need to write out the
+ * portable_mac from the send file. Both of these actions are possible
+ * because these fields are not protected by either MAC so neither one will
+ * need to modify the MACs without the key. However, when the modified blocks
+ * are written out they will be byteswapped into the host machine's native
+ * endianness which will modify fields protected by the MAC. As a result, MAC
+ * calculation for objset blocks works slightly differently from other block
+ * types. Where other block types MAC the data in whatever endianness it is
+ * written to disk in, objset blocks always MAC the little endian version of
+ * their values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
+ * and le_bswap indicates whether a byteswap is needed to get this block
+ * into little endian format.
+ */
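+/*
+ * Concretely, the portable MAC below covers os_type, the portable bits
+ * of os_flags and the metadnode, while the local MAC covers the
+ * non-portable bits of os_flags plus the userused, groupused and (when
+ * present) projectused dnodes.
+ */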
+int
+zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
+ boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_context_t ctx;
+ crypto_data_t cd;
+ objset_phys_t *osp = data;
+ uint64_t intval;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
+ uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
+
+ /* initialize HMAC mechanism */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+
+ /* calculate the portable MAC from the portable fields and metadnode */
+ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the os_type */
+ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in fields from the metadnode */
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_meta_dnode);
+ if (ret)
+ goto error;
+
+ /* store the final digest in a temporary buffer and copy what we need */
+ cd.cd_length = SHA512_DIGEST_LENGTH;
+ cd.cd_raw.iov_base = (char *)raw_portable_mac;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_final(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
+
+ /*
+ * This is necessary here as we check next whether
+ * OBJSET_FLAG_USERACCOUNTING_COMPLETE or
+ * OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE are set in order to
+ * decide if the local_mac should be zeroed out.
+ */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+
+ /*
+ * The local MAC protects the user, group and project accounting.
+ * If these objects are not present, the local MAC is zeroed out.
+ */
+ if ((datalen >= OBJSET_PHYS_SIZE_V3 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen >= OBJSET_PHYS_SIZE_V2 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen <= OBJSET_PHYS_SIZE_V1) ||
+ (((intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE) == 0 ||
+ (intval & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) == 0) &&
+ key->zk_version > 0)) {
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (0);
+ }
+
+ /* calculate the local MAC from the userused and groupused dnodes */
+ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the non-portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in fields from the user accounting dnodes */
+ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_userused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_groupused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
+ datalen >= OBJSET_PHYS_SIZE_V3) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_projectused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ /* store the final digest in a temporary buffer and copy what we need */
+ cd.cd_length = SHA512_DIGEST_LENGTH;
+ cd.cd_raw.iov_base = (char *)raw_local_mac;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_final(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
+
+ return (0);
+
+error:
+ bzero(portable_mac, ZIO_OBJSET_MAC_LEN);
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (ret);
+}
+
+static void
+zio_crypt_destroy_uio(zfs_uio_t *uio)
+{
+ if (uio->uio_iov)
+ kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t));
+}
+
+/*
+ * This function parses an uncompressed indirect block and returns a checksum
+ * of all the portable fields from all of the contained bps. The portable
+ * fields are the MAC and all of the fields from blk_prop except for the dedup,
+ * checksum, and psize bits. For an explanation of the purpose of this, see
+ * the comment block on object set authentication.
+ */
+static int
+zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
+ uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
+{
+ blkptr_t *bp;
+ int i, epb = datalen >> SPA_BLKPTRSHIFT;
+ SHA2_CTX ctx;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ /* checksum all of the MACs from the layer below */
+ SHA2Init(SHA512, &ctx);
+ for (i = 0, bp = buf; i < epb; i++, bp++) {
+ zio_crypt_bp_do_indrect_checksum_updates(&ctx, version,
+ byteswap, bp);
+ }
+ SHA2Final(digestbuf, &ctx);
+
+ if (generate) {
+ bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN);
+ return (0);
+ }
+
+ if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0)
+ return (SET_ERROR(ECKSUM));
+
+ return (0);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+
+ /*
+ * Unfortunately, callers of this function will not always have
+ * easy access to the on-disk format version. This info is
+ * normally found in the DSL Crypto Key, but the checksum-of-MACs
+ * is expected to be verifiable even when the key isn't loaded.
+ * Here, instead of doing a ZAP lookup for the version for each
+ * zio, we simply try both existing formats.
+ */
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
+ datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
+ if (ret == ECKSUM) {
+ ASSERT(!generate);
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate,
+ buf, datalen, 0, byteswap, cksum);
+ }
+
+ return (ret);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+ void *buf;
+
+ buf = abd_borrow_buf_copy(abd, datalen);
+ ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
+ byteswap, cksum);
+ abd_return_buf(abd, buf, datalen);
+
+ return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting ZIL blocks.
+ * We do not check for the older ZIL chain because the encryption feature
+ * was not available before the newer ZIL chain was introduced. The goal
+ * here is to encrypt everything except the blkptr_t of a lr_write_t and
+ * the zil_chain_t header. Everything that is not encrypted is authenticated.
+ */
+static int
+zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio,
+ zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+ boolean_t *no_crypt)
+{
+ int ret;
+ uint64_t txtype, lr_len;
+ uint_t nr_src, nr_dst, crypt_len;
+ uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+ iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+ uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
+ zil_chain_t *zilc;
+ lr_t *lr;
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+ /* cipherbuf always needs an extra iovec for the MAC */
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ nr_src = 0;
+ nr_dst = 1;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ nr_src = 1;
+ nr_dst = 0;
+ }
+
+ /* find the start and end record of the log block */
+ zilc = (zil_chain_t *)src;
+ slrp = src + sizeof (zil_chain_t);
+ aadp = aadbuf;
+ blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+
+ /* calculate the number of encrypted iovecs we will need */
+ for (; slrp < blkend; slrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ nr_iovecs++;
+ if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
+ nr_iovecs++;
+ }
+
+ nr_src += nr_iovecs;
+ nr_dst += nr_iovecs;
+
+ /* allocate the iovec arrays */
+ if (nr_src != 0) {
+ src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+ if (src_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ if (nr_dst != 0) {
+ dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+ if (dst_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ /*
+ * Copy the plain zil header over and authenticate everything except
+ * the checksum that will store our MAC. If we are writing the data
+ * the embedded checksum will not have been calculated yet, so we don't
+ * authenticate that.
+ */
+ bcopy(src, dst, sizeof (zil_chain_t));
+ bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
+ aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+ aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+
+ /* loop over records again, filling in iovecs */
+ nr_iovecs = 0;
+ slrp = src + sizeof (zil_chain_t);
+ dlrp = dst + sizeof (zil_chain_t);
+
+ for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ /* copy the common lr_t */
+ bcopy(slrp, dlrp, sizeof (lr_t));
+ bcopy(slrp, aadp, sizeof (lr_t));
+ aadp += sizeof (lr_t);
+ aad_len += sizeof (lr_t);
+
+ ASSERT3P(src_iovecs, !=, NULL);
+ ASSERT3P(dst_iovecs, !=, NULL);
+
+ /*
+ * If this is a TX_WRITE record we want to encrypt everything
+ * except the bp, if it exists. If the bp does exist we want to
+ * authenticate it.
+ */
+ if (txtype == TX_WRITE) {
+ crypt_len = sizeof (lr_write_t) -
+ sizeof (lr_t) - sizeof (blkptr_t);
+ src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+ /* copy the bp now since it will not be encrypted */
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ sizeof (blkptr_t));
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ aadp, sizeof (blkptr_t));
+ aadp += sizeof (blkptr_t);
+ aad_len += sizeof (blkptr_t);
+ nr_iovecs++;
+ total_len += crypt_len;
+
+ if (lr_len != sizeof (lr_write_t)) {
+ crypt_len = lr_len - sizeof (lr_write_t);
+ src_iovecs[nr_iovecs].iov_base =
+ slrp + sizeof (lr_write_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base =
+ dlrp + sizeof (lr_write_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+ nr_iovecs++;
+ total_len += crypt_len;
+ }
+ } else {
+ crypt_len = lr_len - sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+ nr_iovecs++;
+ total_len += crypt_len;
+ }
+ }
+
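+ /* with no log records there is nothing to encrypt, only authenticate */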
+ *no_crypt = (nr_iovecs == 0);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+
+ if (encrypt) {
+ puio->uio_iov = src_iovecs;
+ puio->uio_iovcnt = nr_src;
+ cuio->uio_iov = dst_iovecs;
+ cuio->uio_iovcnt = nr_dst;
+ } else {
+ puio->uio_iov = dst_iovecs;
+ puio->uio_iovcnt = nr_dst;
+ cuio->uio_iov = src_iovecs;
+ cuio->uio_iovcnt = nr_src;
+ }
+
+ return (0);
+
+error:
+ zio_buf_free(aadbuf, datalen);
+ if (src_iovecs != NULL)
+ kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+ if (dst_iovecs != NULL)
+ kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+ *enc_len = 0;
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting dnode blocks.
+ */
+static int
+zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
+ uint_t *auth_len, boolean_t *no_crypt)
+{
+ int ret;
+ uint_t nr_src, nr_dst, crypt_len;
+ uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+ uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
+ iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+ uint8_t *src, *dst, *aadp;
+ dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ nr_src = 0;
+ nr_dst = 1;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ nr_src = 1;
+ nr_dst = 0;
+ }
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+ aadp = aadbuf;
+
+ /*
+ * Count the number of iovecs we will need to do the encryption by
+ * counting the number of bonus buffers that need to be encrypted.
+ */
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ /*
+ * This block may still be byteswapped. However, all of the
+ * values we use are either uint8_t's (for which byteswapping
+ * is a noop) or a "!= 0" check, which will work regardless
+ * of whether or not we byteswap.
+ */
+ if (sdnp[i].dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
+ sdnp[i].dn_bonuslen != 0) {
+ nr_iovecs++;
+ }
+ }
+
+ nr_src += nr_iovecs;
+ nr_dst += nr_iovecs;
+
+ if (nr_src != 0) {
+ src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+ if (src_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ if (nr_dst != 0) {
+ dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+ if (dst_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ nr_iovecs = 0;
+
+ /*
+ * Iterate through the dnodes again, this time filling in the uios
+ * we allocated earlier. We also concatenate any data we want to
+ * authenticate onto aadbuf.
+ */
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+
+ /* copy over the core fields and blkptrs (kept as plaintext) */
+ bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
+ sizeof (blkptr_t));
+ }
+
+ /*
+ * Handle authenticated data. We authenticate everything in
+ * the dnode that can be brought over when we do a raw send.
+ * This includes all of the core fields as well as the MACs
+ * stored in the bp checksums and all of the portable bits
+ * from blk_prop. We include the dnode padding here in case it
+ * ever gets used in the future. Some dn_flags and dn_used are
+ * not portable so we mask those out values out of the
+ * authenticated data.
+ */
+ crypt_len = offsetof(dnode_phys_t, dn_blkptr);
+ bcopy(dnp, aadp, crypt_len);
+ adnp = (dnode_phys_t *)aadp;
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+ aadp += crypt_len;
+ aad_len += crypt_len;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, &dnp->dn_blkptr[j]);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, DN_SPILL_BLKPTR(dnp));
+ }
+
+ /*
+ * If this bonus buffer needs to be encrypted, we prepare an
+ * iovec_t. The encryption / decryption functions will fill
+ * this in for us with the encrypted or decrypted data.
+ * Otherwise we add the bonus buffer to the authenticated
+ * data buffer and copy it over to the destination. The
+ * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
+ * we can guarantee alignment with the AES block size
+ * (128 bits).
+ */
+ crypt_len = DN_MAX_BONUS_LEN(dnp);
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ ASSERT3U(nr_iovecs, <, nr_src);
+ ASSERT3U(nr_iovecs, <, nr_dst);
+ ASSERT3P(src_iovecs, !=, NULL);
+ ASSERT3P(dst_iovecs, !=, NULL);
+ src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+ nr_iovecs++;
+ total_len += crypt_len;
+ } else {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
+ bcopy(DN_BONUS(dnp), aadp, crypt_len);
+ aadp += crypt_len;
+ aad_len += crypt_len;
+ }
+ }
+
+ *no_crypt = (nr_iovecs == 0);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+
+ if (encrypt) {
+ puio->uio_iov = src_iovecs;
+ puio->uio_iovcnt = nr_src;
+ cuio->uio_iov = dst_iovecs;
+ cuio->uio_iovcnt = nr_dst;
+ } else {
+ puio->uio_iov = dst_iovecs;
+ puio->uio_iovcnt = nr_dst;
+ cuio->uio_iov = src_iovecs;
+ cuio->uio_iovcnt = nr_src;
+ }
+
+ return (0);
+
+error:
+ zio_buf_free(aadbuf, datalen);
+ if (src_iovecs != NULL)
+ kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+ if (dst_iovecs != NULL)
+ kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+ *enc_len = 0;
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
+
+static int
+zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio,
+ uint_t *enc_len)
+{
+ int ret;
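+ /* the second cipher iovec is reserved for the MAC */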
+ uint_t nr_plain = 1, nr_cipher = 2;
+ iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
+
+ /* allocate the iovecs for the plain and cipher data */
+ plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t),
+ KM_SLEEP);
+ if (!plain_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+
+ cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
+ KM_SLEEP);
+ if (!cipher_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+
+ plain_iovecs[0].iov_base = plainbuf;
+ plain_iovecs[0].iov_len = datalen;
+ cipher_iovecs[0].iov_base = cipherbuf;
+ cipher_iovecs[0].iov_len = datalen;
+
+ *enc_len = datalen;
+ puio->uio_iov = plain_iovecs;
+ puio->uio_iovcnt = nr_plain;
+ cuio->uio_iov = cipher_iovecs;
+ cuio->uio_iovcnt = nr_cipher;
+
+ return (0);
+
+error:
+ if (plain_iovecs != NULL)
+ kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
+ if (cipher_iovecs != NULL)
+ kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+ *enc_len = 0;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
+
+/*
+ * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
+ * that they can be used for encryption and decryption by zio_do_crypt_uio().
+ * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
+ * requiring special handling to parse out pieces that are to be encrypted. The
+ * authbuf is used by these special cases to store additional authenticated
+ * data (AAD) for the encryption modes.
+ */
+static int
+zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len,
+ uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt)
+{
+ int ret;
+ iovec_t *mac_iov;
+
+ ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
+
+ /* route to handler */
+ switch (ot) {
+ case DMU_OT_INTENT_LOG:
+ ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
+ datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+ no_crypt);
+ break;
+ case DMU_OT_DNODE:
+ ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
+ cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
+ auth_len, no_crypt);
+ break;
+ default:
+ ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
+ datalen, puio, cuio, enc_len);
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ break;
+ }
+
+ if (ret != 0)
+ goto error;
+
+ /* populate the uios */
+ puio->uio_segflg = UIO_SYSSPACE;
+ cuio->uio_segflg = UIO_SYSSPACE;
+
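+ /* the final cipher iovec is reserved for the per-block MAC */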
+ mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]);
+ mac_iov->iov_base = mac;
+ mac_iov->iov_len = ZIO_DATA_MAC_LEN;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * Primary encryption / decryption entrypoint for zio data.
+ */
+int
+zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
+ dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+ uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
+ boolean_t *no_crypt)
+{
+ int ret;
+ boolean_t locked = B_FALSE;
+ uint64_t crypt = key->zk_crypt;
+ uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
+ uint_t enc_len, auth_len;
+ zfs_uio_t puio, cuio;
+ uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
+ crypto_key_t tmp_ckey, *ckey = NULL;
+ crypto_ctx_template_t tmpl;
+ uint8_t *authbuf = NULL;
+
+ /*
+ * If the needed key is the current one, just use it. Otherwise we
+ * need to generate a temporary one from the given salt + master key.
+ * If we are encrypting, we must return a copy of the current salt
+ * so that it can be stored in the blkptr_t.
+ */
+ rw_enter(&key->zk_salt_lock, RW_READER);
+ locked = B_TRUE;
+
+ if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
+ ckey = &key->zk_current_key;
+ tmpl = key->zk_current_tmpl;
+ } else {
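+ /* a non-current salt means a one-off derived key; the current key is unused */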
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+
+ tmp_ckey.ck_format = CRYPTO_KEY_RAW;
+ tmp_ckey.ck_data = enc_keydata;
+ tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ ckey = &tmp_ckey;
+ tmpl = NULL;
+ }
+
+ /*
+ * Attempt to use QAT acceleration if we can. We currently don't
+ * do this for metadnode and ZIL blocks, since they have a much
+ * more involved buffer layout and the qat_crypt() function only
+ * works in-place.
+ */
+ if (qat_crypt_use_accel(datalen) &&
+ ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) {
+ uint8_t *srcbuf, *dstbuf;
+
+ if (encrypt) {
+ srcbuf = plainbuf;
+ dstbuf = cipherbuf;
+ } else {
+ srcbuf = cipherbuf;
+ dstbuf = plainbuf;
+ }
+
+ ret = qat_crypt((encrypt) ? QAT_ENCRYPT : QAT_DECRYPT, srcbuf,
+ dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen);
+ if (ret == CPA_STATUS_SUCCESS) {
+ if (locked) {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+ }
+
+ return (0);
+ }
+ /* If the hardware implementation fails, fall back to software */
+ }
+
+ bzero(&puio, sizeof (zfs_uio_t));
+ bzero(&cuio, sizeof (zfs_uio_t));
+
+ /* create uios for encryption */
+ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
+ cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
+ &authbuf, &auth_len, no_crypt);
+ if (ret != 0)
+ goto error;
+
+ /* perform the encryption / decryption in software */
+ ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len,
+ &puio, &cuio, authbuf, auth_len);
+ if (ret != 0)
+ goto error;
+
+ if (locked) {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+ }
+
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+
+ return (0);
+
+error:
+ if (locked)
+ rw_exit(&key->zk_salt_lock);
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+
+ return (ret);
+}
+
+/*
+ * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
+ * linear buffers.
+ */
+int
+zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
+ boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
+ uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
+{
+ int ret;
+ void *ptmp, *ctmp;
+
+ if (encrypt) {
+ ptmp = abd_borrow_buf_copy(pabd, datalen);
+ ctmp = abd_borrow_buf(cabd, datalen);
+ } else {
+ ptmp = abd_borrow_buf(pabd, datalen);
+ ctmp = abd_borrow_buf_copy(cabd, datalen);
+ }
+
+ ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
+ datalen, ptmp, ctmp, no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (0);
+
+error:
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (ret);
+}
+
+#if defined(_KERNEL)
+/* BEGIN CSTYLED */
+module_param(zfs_key_max_salt_uses, ulong, 0644);
+MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
+ "can be used for generating encryption keys before it is rotated");
+/* END CSTYLED */
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
new file mode 100644
index 000000000000..e6420f19ed87
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
@@ -0,0 +1,552 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ * Rohan Puri <rohan.puri15@gmail.com>
+ * Brian Behlendorf <behlendorf1@llnl.gov>
+ */
+
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+/*
+ * Common open routine. Disallow any write access.
+ */
+/* ARGSUSED */
+static int
+zpl_common_open(struct inode *ip, struct file *filp)
+{
+ if (filp->f_mode & FMODE_WRITE)
+ return (-EACCES);
+
+ return (generic_file_open(ip, filp));
+}
+
+/*
+ * Get root directory contents.
+ */
+static int
+zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+ int error = 0;
+
+ ZPL_ENTER(zfsvfs);
+
+ if (!zpl_dir_emit_dots(filp, ctx))
+ goto out;
+
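+ /* positions 2 and 3 are the snapshot and shares directories */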
+ if (ctx->pos == 2) {
+ if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME,
+ strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR))
+ goto out;
+
+ ctx->pos++;
+ }
+
+ if (ctx->pos == 3) {
+ if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME,
+ strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR))
+ goto out;
+
+ ctx->pos++;
+ }
+out:
+ ZPL_EXIT(zfsvfs);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_root_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+/*
+ * Get root directory attributes.
+ */
+/* ARGSUSED */
+static int
+zpl_root_getattr_impl(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *ip = path->dentry->d_inode;
+
+ generic_fillattr(ip, stat);
+ stat->atime = current_time(ip);
+
+ return (0);
+}
+ZPL_GETATTR_WRAPPER(zpl_root_getattr);
+
+static struct dentry *
+zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags)
+{
+ cred_t *cr = CRED();
+ struct inode *ip;
+ int error;
+
+ crhold(cr);
+ error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ if (error) {
+ if (error == -ENOENT)
+ return (d_splice_alias(NULL, dentry));
+ else
+ return (ERR_PTR(error));
+ }
+
+ return (d_splice_alias(ip, dentry));
+}
+
+/*
+ * The '.zfs' control directory file and inode operations.
+ */
+const struct file_operations zpl_fops_root = {
+ .open = zpl_common_open,
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+ .iterate_shared = zpl_root_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_root_iterate,
+#else
+ .readdir = zpl_root_readdir,
+#endif
+};
+
+const struct inode_operations zpl_ops_root = {
+ .lookup = zpl_root_lookup,
+ .getattr = zpl_root_getattr,
+};
+
+static struct vfsmount *
+zpl_snapdir_automount(struct path *path)
+{
+ int error;
+
+ error = -zfsctl_snapshot_mount(path, 0);
+ if (error)
+ return (ERR_PTR(error));
+
+ /*
+ * Rather than returning the new vfsmount for the snapshot we must
+ * return NULL to indicate a mount collision. This is done because
+ * the user space mount calls do_add_mount() which adds the vfsmount
+ * to the name space. If we returned the new mount here it would be
+ * added again to the vfsmount list resulting in list corruption.
+ */
+ return (NULL);
+}
+
+/*
+ * Negative dentries must always be revalidated so newly created snapshots
+ * can be detected and automounted. Normal dentries should be kept because
+ * as of the 3.18 kernel revalidating the mountpoint dentry will result in
+ * the snapshot being immediately unmounted.
+ */
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i)
+#else
+zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
+#endif
+{
+ return (!!dentry->d_inode);
+}
+
+dentry_operations_t zpl_dops_snapdirs = {
+/*
+ * Auto mounting of snapshots is only supported for 2.6.37 and
+ * newer kernels. Prior to this kernel the ops->follow_link()
+ * callback was used as a hack to trigger the mount. The
+ * resulting vfsmount was then explicitly grafted in to the
+ * name space. While it might be possible to add compatibility
+ * code to accomplish this it would require considerable care.
+ */
+ .d_automount = zpl_snapdir_automount,
+ .d_revalidate = zpl_snapdir_revalidate,
+};
+
+static struct dentry *
+zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
+ unsigned int flags)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ struct inode *ip = NULL;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip,
+ 0, cr, NULL, NULL);
+ ASSERT3S(error, <=, 0);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error && error != -ENOENT)
+ return (ERR_PTR(error));
+
+ ASSERT(error == 0 || ip == NULL);
+ d_clear_d_op(dentry);
+ d_set_d_op(dentry, &zpl_dops_snapdirs);
+ dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+
+ return (d_splice_alias(ip, dentry));
+}
+
+static int
+zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+ fstrans_cookie_t cookie;
+ char snapname[MAXNAMELEN];
+ boolean_t case_conflict;
+ uint64_t id, pos;
+ int error = 0;
+
+ ZPL_ENTER(zfsvfs);
+ cookie = spl_fstrans_mark();
+
+ if (!zpl_dir_emit_dots(filp, ctx))
+ goto out;
+
+ /* Start the position at 0 if "." and ".." have already been emitted */
+ pos = (ctx->pos == 2 ? 0 : ctx->pos);
+ while (error == 0) {
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN,
+ snapname, &id, &pos, &case_conflict);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error)
+ goto out;
+
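+ /* present each snapshot with a synthetic inode number derived from its id */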
+ if (!zpl_dir_emit(ctx, snapname, strlen(snapname),
+ ZFSCTL_INO_SHARES - id, DT_DIR))
+ goto out;
+
+ ctx->pos = pos;
+ }
+out:
+ spl_fstrans_unmark(cookie);
+ ZPL_EXIT(zfsvfs);
+
+ if (error == -ENOENT)
+ return (0);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_snapdir_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+static int
+zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+{
+ cred_t *cr = CRED();
+ int error;
+
+ /* We probably don't want to support renameat2(2) in ctldir */
+ if (flags)
+ return (-EINVAL);
+
+ crhold(cr);
+ error = -zfsctl_snapdir_rename(sdip, dname(sdentry),
+ tdip, dname(tdentry), cr, 0);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ return (error);
+}
+
+#ifndef HAVE_RENAME_WANTS_FLAGS
+static int
+zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry)
+{
+ return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0));
+}
+#endif
+
+static int
+zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ int error;
+
+ crhold(cr);
+ error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ return (error);
+}
+
+static int
+zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
+{
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ struct inode *ip;
+ int error;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dip, mode | S_IFDIR, cr);
+
+ error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
+ if (error == 0) {
+ d_clear_d_op(dentry);
+ d_set_d_op(dentry, &zpl_dops_snapdirs);
+ d_instantiate(dentry, ip);
+ }
+
+ kmem_free(vap, sizeof (vattr_t));
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ return (error);
+}
+
+/*
+ * Get snapshot directory attributes.
+ */
+/* ARGSUSED */
+static int
+zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *ip = path->dentry->d_inode;
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZPL_ENTER(zfsvfs);
+ generic_fillattr(ip, stat);
+
+ stat->nlink = stat->size = 2;
+ stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+ stat->atime = current_time(ip);
+ ZPL_EXIT(zfsvfs);
+
+ return (0);
+}
+ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr);
+
+/*
+ * The '.zfs/snapshot' directory file operations. These mainly control
+ * generating the list of available snapshots when doing an 'ls' in the
+ * directory. See zpl_snapdir_readdir().
+ */
+const struct file_operations zpl_fops_snapdir = {
+ .open = zpl_common_open,
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+ .iterate_shared = zpl_snapdir_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_snapdir_iterate,
+#else
+ .readdir = zpl_snapdir_readdir,
+#endif
+
+};
+
+/*
+ * The '.zfs/snapshot' directory inode operations. These mainly control
+ * creating an inode for a snapshot directory and initializing the needed
+ * infrastructure to automount the snapshot. See zpl_snapdir_lookup().
+ */
+const struct inode_operations zpl_ops_snapdir = {
+ .lookup = zpl_snapdir_lookup,
+ .getattr = zpl_snapdir_getattr,
+#ifdef HAVE_RENAME_WANTS_FLAGS
+ .rename = zpl_snapdir_rename2,
+#else
+ .rename = zpl_snapdir_rename,
+#endif
+ .rmdir = zpl_snapdir_rmdir,
+ .mkdir = zpl_snapdir_mkdir,
+};
+
+static struct dentry *
+zpl_shares_lookup(struct inode *dip, struct dentry *dentry,
+ unsigned int flags)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ struct inode *ip = NULL;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfsctl_shares_lookup(dip, dname(dentry), &ip,
+ 0, cr, NULL, NULL);
+ ASSERT3S(error, <=, 0);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error) {
+ if (error == -ENOENT)
+ return (d_splice_alias(NULL, dentry));
+ else
+ return (ERR_PTR(error));
+ }
+
+ return (d_splice_alias(ip, dentry));
+}
+
+static int
+zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+ znode_t *dzp;
+ int error = 0;
+
+ ZPL_ENTER(zfsvfs);
+ cookie = spl_fstrans_mark();
+
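+ /* if the shares directory has not been created, only "." and ".." exist */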
+ if (zfsvfs->z_shares_dir == 0) {
+ zpl_dir_emit_dots(filp, ctx);
+ goto out;
+ }
+
+ error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
+ if (error)
+ goto out;
+
+ crhold(cr);
+ error = -zfs_readdir(ZTOI(dzp), ctx, cr);
+ crfree(cr);
+
+ iput(ZTOI(dzp));
+out:
+ spl_fstrans_unmark(cookie);
+ ZPL_EXIT(zfsvfs);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_shares_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+/* ARGSUSED */
+static int
+zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *ip = path->dentry->d_inode;
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ znode_t *dzp;
+ int error;
+
+ ZPL_ENTER(zfsvfs);
+
+ if (zfsvfs->z_shares_dir == 0) {
+ generic_fillattr(path->dentry->d_inode, stat);
+ stat->nlink = stat->size = 2;
+ stat->atime = current_time(ip);
+ ZPL_EXIT(zfsvfs);
+ return (0);
+ }
+
+ error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
+ if (error == 0) {
+ error = -zfs_getattr_fast(ZTOI(dzp), stat);
+ iput(ZTOI(dzp));
+ }
+
+ ZPL_EXIT(zfsvfs);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+ZPL_GETATTR_WRAPPER(zpl_shares_getattr);
+
+/*
+ * The '.zfs/shares' directory file operations.
+ */
+const struct file_operations zpl_fops_shares = {
+ .open = zpl_common_open,
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+ .iterate_shared = zpl_shares_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_shares_iterate,
+#else
+ .readdir = zpl_shares_readdir,
+#endif
+
+};
+
+/*
+ * The '.zfs/shares' directory inode operations.
+ */
+const struct inode_operations zpl_ops_shares = {
+ .lookup = zpl_shares_lookup,
+ .getattr = zpl_shares_getattr,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
new file mode 100644
index 000000000000..eaf048c38db1
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Gunnar Beutner
+ * Copyright (c) 2012 Cyril Plisko. All rights reserved.
+ */
+
+
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+
+static int
+#ifdef HAVE_ENCODE_FH_WITH_INODE
+zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent)
+{
+#else
+zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable)
+{
+ /* CSTYLED */
+ struct inode *ip = dentry->d_inode;
+#endif /* HAVE_ENCODE_FH_WITH_INODE */
+ fstrans_cookie_t cookie;
+ fid_t *fid = (fid_t *)fh;
+ int len_bytes, rc;
+
+ len_bytes = *max_len * sizeof (__u32);
+
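+ /* refuse buffers too small to hold even an empty fid (255 is FILEID_INVALID) */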
+ if (len_bytes < offsetof(fid_t, fid_data))
+ return (255);
+
+ fid->fid_len = len_bytes - offsetof(fid_t, fid_data);
+ cookie = spl_fstrans_mark();
+
+ if (zfsctl_is_node(ip))
+ rc = zfsctl_fid(ip, fid);
+ else
+ rc = zfs_fid(ip, fid);
+
+ spl_fstrans_unmark(cookie);
+ len_bytes = offsetof(fid_t, fid_data) + fid->fid_len;
+ *max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32);
+
+ return (rc == 0 ? FILEID_INO32_GEN : 255);
+}
+
+static struct dentry *
+zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ fid_t *fid = (fid_t *)fh;
+ fstrans_cookie_t cookie;
+ struct inode *ip;
+ int len_bytes, rc;
+
+ len_bytes = fh_len * sizeof (__u32);
+
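+ /* reject handles of the wrong type or with an inconsistent length */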
+ if (fh_type != FILEID_INO32_GEN ||
+ len_bytes < offsetof(fid_t, fid_data) ||
+ len_bytes < offsetof(fid_t, fid_data) + fid->fid_len)
+ return (ERR_PTR(-EINVAL));
+
+ cookie = spl_fstrans_mark();
+ rc = zfs_vget(sb, &ip, fid);
+ spl_fstrans_unmark(cookie);
+
+ if (rc) {
+ /*
+ * If we see ENOENT it might mean that an NFSv4 client
+ * is using a cached inode value in a file handle and
+ * that the sought after file has had its inode changed
+ * by a third party. So change the error to ESTALE
+ * which will trigger a full lookup by the client and
+ * will find the new filename/inode pair if it still
+ * exists.
+ */
+ if (rc == ENOENT)
+ rc = ESTALE;
+
+ return (ERR_PTR(-rc));
+ }
+
+ ASSERT((ip != NULL) && !IS_ERR(ip));
+
+ return (d_obtain_alias(ip));
+}
+
+static struct dentry *
+zpl_get_parent(struct dentry *child)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ znode_t *zp;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_lookup(ITOZ(child->d_inode), "..", &zp, 0, cr, NULL, NULL);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ if (error)
+ return (ERR_PTR(error));
+
+ return (d_obtain_alias(ZTOI(zp)));
+}
+
+static int
+zpl_commit_metadata(struct inode *inode)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int error;
+
+ if (zfsctl_is_node(inode))
+ return (0);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_fsync(ITOZ(inode), 0, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+const struct export_operations zpl_export_operations = {
+ .encode_fh = zpl_encode_fh,
+ .fh_to_dentry = zpl_fh_to_dentry,
+ .get_parent = zpl_get_parent,
+ .commit_metadata = zpl_commit_metadata,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
new file mode 100644
index 000000000000..970db4a8b73a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -0,0 +1,1069 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+#include <sys/file.h>
+#include <sys/dmu_objset.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_project.h>
+
+/*
+ * When using fallocate(2) to preallocate space, inflate the requested
+ * capacity check by 10% to account for the required metadata blocks.
+ */
+unsigned int zfs_fallocate_reserve_percent = 110;
+
+static int
+zpl_open(struct inode *ip, struct file *filp)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ error = generic_file_open(ip, filp);
+ if (error)
+ return (error);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_release(struct inode *ip, struct file *filp)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ if (ITOZ(ip)->z_atime_dirty)
+ zfs_mark_inode_dirty(ip);
+
+ crhold(cr);
+ error = -zfs_close(ip, filp->f_flags, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_readdir(file_inode(filp), ctx, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+#if defined(HAVE_FSYNC_WITHOUT_DENTRY)
+/*
+ * Linux 2.6.35 - 3.0 API,
+ * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed
+ * redundant. The dentry is still accessible via filp->f_path.dentry,
+ * and we are guaranteed that filp will never be NULL.
+ */
+static int
+zpl_fsync(struct file *filp, int datasync)
+{
+ struct inode *inode = filp->f_mapping->host;
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_fsync(ITOZ(inode), datasync, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifdef HAVE_FILE_AIO_FSYNC
+static int
+zpl_aio_fsync(struct kiocb *kiocb, int datasync)
+{
+ return (zpl_fsync(kiocb->ki_filp, datasync));
+}
+#endif
+
+#elif defined(HAVE_FSYNC_RANGE)
+/*
+ * Linux 3.1 - 3.x API,
+ * As of 3.1 the responsibility to call filemap_write_and_wait_range() has
+ * been pushed down in to the .fsync() vfs hook. Additionally, the i_mutex
+ * lock is no longer held by the caller, for zfs we don't require the lock
+ * to be held so we don't acquire it.
+ */
+static int
+zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = filp->f_mapping->host;
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (error)
+ return (error);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_fsync(ITOZ(inode), datasync, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifdef HAVE_FILE_AIO_FSYNC
+static int
+zpl_aio_fsync(struct kiocb *kiocb, int datasync)
+{
+ return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync));
+}
+#endif
+
+#else
+#error "Unsupported fops->fsync() implementation"
+#endif
+
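+/*
+ * Map the I/O flags carried on the kiocb to the equivalent O_* flags
+ * expected by the common zfs_read() / zfs_write() code.
+ */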
+static inline int
+zfs_io_flags(struct kiocb *kiocb)
+{
+ int flags = 0;
+
+#if defined(IOCB_DSYNC)
+ if (kiocb->ki_flags & IOCB_DSYNC)
+ flags |= O_DSYNC;
+#endif
+#if defined(IOCB_SYNC)
+ if (kiocb->ki_flags & IOCB_SYNC)
+ flags |= O_SYNC;
+#endif
+#if defined(IOCB_APPEND)
+ if (kiocb->ki_flags & IOCB_APPEND)
+ flags |= O_APPEND;
+#endif
+#if defined(IOCB_DIRECT)
+ if (kiocb->ki_flags & IOCB_DIRECT)
+ flags |= O_DIRECT;
+#endif
+ return (flags);
+}
+
+/*
+ * If relatime is enabled, call file_accessed() if zfs_relatime_need_update()
+ * is true. This is needed since datasets with an inherited "relatime"
+ * property aren't necessarily mounted with the MNT_RELATIME flag (e.g.
+ * after `zfs set relatime=...`), which is what the VFS relatime check in
+ * relatime_need_update() is based on.
+ */
+static inline void
+zpl_file_accessed(struct file *filp)
+{
+ struct inode *ip = filp->f_mapping->host;
+
+ if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
+ if (zfs_relatime_need_update(ip))
+ file_accessed(filp);
+ } else {
+ file_accessed(filp);
+ }
+}
+
+#if defined(HAVE_VFS_RW_ITERATE)
+
+/*
+ * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports
+ * iovecs, kvecs, bvecs and pipes, plus all the required interfaces to
+ * manipulate the iov_iter are available. In which case the full iov_iter
+ * can be attached to the uio and correctly handled in the lower layers.
+ * Otherwise, for older kernels extract the iovec and pass it instead.
+ */
+static void
+zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to,
+ loff_t pos, ssize_t count, size_t skip)
+{
+#if defined(HAVE_VFS_IOV_ITER)
+ zfs_uio_iov_iter_init(uio, to, pos, count, skip);
+#else
+ zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos,
+ to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
+ count, skip);
+#endif
+}
+
+static ssize_t
+zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ struct file *filp = kiocb->ki_filp;
+ ssize_t count = iov_iter_count(to);
+ zfs_uio_t uio;
+
+ zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error < 0)
+ return (error);
+
+ ssize_t read = count - uio.uio_resid;
+ kiocb->ki_pos += read;
+
+ zpl_file_accessed(filp);
+
+ return (read);
+}
+
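+/*
+ * Perform the standard VFS write checks, storing the number of bytes
+ * which may be written in *countp; handles both the kiocb-based and
+ * file-based generic_write_checks() interfaces.
+ */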
+static inline ssize_t
+zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
+ size_t *countp)
+{
+#ifdef HAVE_GENERIC_WRITE_CHECKS_KIOCB
+ ssize_t ret = generic_write_checks(kiocb, from);
+ if (ret <= 0)
+ return (ret);
+
+ *countp = ret;
+#else
+ struct file *file = kiocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *ip = mapping->host;
+ int isblk = S_ISBLK(ip->i_mode);
+
+ *countp = iov_iter_count(from);
+ ssize_t ret = generic_write_checks(file, &kiocb->ki_pos, countp, isblk);
+ if (ret)
+ return (ret);
+#endif
+
+ return (0);
+}
+
+static ssize_t
+zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ struct file *filp = kiocb->ki_filp;
+ struct inode *ip = filp->f_mapping->host;
+ zfs_uio_t uio;
+ size_t count = 0;
+ ssize_t ret;
+
+ ret = zpl_generic_write_checks(kiocb, from, &count);
+ if (ret)
+ return (ret);
+
+ zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_write(ITOZ(ip), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error < 0)
+ return (error);
+
+ ssize_t wrote = count - uio.uio_resid;
+ kiocb->ki_pos += wrote;
+
+ if (wrote > 0)
+ iov_iter_advance(from, wrote);
+
+ return (wrote);
+}
+
+#else /* !HAVE_VFS_RW_ITERATE */
+
+static ssize_t
+zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ struct file *filp = kiocb->ki_filp;
+ size_t count;
+ ssize_t ret;
+
+ ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+ if (ret)
+ return (ret);
+
+ zfs_uio_t uio;
+ zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE,
+ count, 0);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error < 0)
+ return (error);
+
+ ssize_t read = count - uio.uio_resid;
+ kiocb->ki_pos += read;
+
+ zpl_file_accessed(filp);
+
+ return (read);
+}
+
+static ssize_t
+zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ struct file *filp = kiocb->ki_filp;
+ struct inode *ip = filp->f_mapping->host;
+ size_t count;
+ ssize_t ret;
+
+ ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+ if (ret)
+ return (ret);
+
+ ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode));
+ if (ret)
+ return (ret);
+
+ zfs_uio_t uio;
+ zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE,
+ count, 0);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_write(ITOZ(ip), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error < 0)
+ return (error);
+
+ ssize_t wrote = count - uio.uio_resid;
+ kiocb->ki_pos += wrote;
+
+ return (wrote);
+}
+#endif /* HAVE_VFS_RW_ITERATE */
+
+#if defined(HAVE_VFS_RW_ITERATE)
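+/*
+ * Direct I/O requests are serviced through the regular ARC-backed
+ * read and write paths rather than bypassing the cache.
+ */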
+static ssize_t
+zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
+{
+ if (rw == WRITE)
+ return (zpl_iter_write(kiocb, iter));
+ else
+ return (zpl_iter_read(kiocb, iter));
+}
+#if defined(HAVE_VFS_DIRECT_IO_ITER)
+static ssize_t
+zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
+{
+ return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+}
+#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
+static ssize_t
+zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
+{
+ ASSERT3S(pos, ==, kiocb->ki_pos);
+ return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+}
+#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
+static ssize_t
+zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
+{
+ ASSERT3S(pos, ==, kiocb->ki_pos);
+ return (zpl_direct_IO_impl(rw, kiocb, iter));
+}
+#else
+#error "Unknown direct IO interface"
+#endif
+
+#else /* HAVE_VFS_RW_ITERATE */
+
+#if defined(HAVE_VFS_DIRECT_IO_IOVEC)
+static ssize_t
+zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov,
+ loff_t pos, unsigned long nr_segs)
+{
+ if (rw == WRITE)
+ return (zpl_aio_write(kiocb, iov, nr_segs, pos));
+ else
+ return (zpl_aio_read(kiocb, iov, nr_segs, pos));
+}
+#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
+static ssize_t
+zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
+{
+ const struct iovec *iovp = iov_iter_iovec(iter);
+ unsigned long nr_segs = iter->nr_segs;
+
+ ASSERT3S(pos, ==, kiocb->ki_pos);
+ if (rw == WRITE)
+ return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
+ else
+ return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
+}
+#else
+#error "Unknown direct IO interface"
+#endif
+
+#endif /* HAVE_VFS_RW_ITERATE */
+
+static loff_t
+zpl_llseek(struct file *filp, loff_t offset, int whence)
+{
+#if defined(SEEK_HOLE) && defined(SEEK_DATA)
+ fstrans_cookie_t cookie;
+
+ if (whence == SEEK_DATA || whence == SEEK_HOLE) {
+ struct inode *ip = filp->f_mapping->host;
+ loff_t maxbytes = ip->i_sb->s_maxbytes;
+ loff_t error;
+
+ spl_inode_lock_shared(ip);
+ cookie = spl_fstrans_mark();
+ error = -zfs_holey(ITOZ(ip), whence, &offset);
+ spl_fstrans_unmark(cookie);
+ if (error == 0)
+ error = lseek_execute(filp, ip, offset, maxbytes);
+ spl_inode_unlock_shared(ip);
+
+ return (error);
+ }
+#endif /* SEEK_HOLE && SEEK_DATA */
+
+ return (generic_file_llseek(filp, offset, whence));
+}
+
+/*
+ * It's worth taking a moment to describe how mmap is implemented
+ * for zfs because it differs considerably from other Linux filesystems.
+ * However, this issue is handled the same way under OpenSolaris.
+ *
+ * The issue is that by design zfs bypasses the Linux page cache and
+ * leaves all caching up to the ARC. This has been shown to work
+ * well for the common read(2)/write(2) case. However, mmap(2)
+ * is a problem because it relies on being tightly integrated with the
+ * page cache. To handle this we cache mmap'ed files twice, once in
+ * the ARC and a second time in the page cache. The code is careful
+ * to keep both copies synchronized.
+ *
+ * When a file with an mmap'ed region is written to using write(2)
+ * both the data in the ARC and existing pages in the page cache
+ * are updated. For a read(2) data will be read first from the page
+ * cache then the ARC if needed. Neither a write(2) nor a read(2)
+ * will ever result in new pages being added to the page cache.
+ *
+ * New pages are added to the page cache only via .readpage() which
+ * is called when the vfs needs to read a page off disk to back the
+ * virtual memory region. These pages may be modified without
+ * notifying the ARC and will be written out periodically via
+ * .writepage(). This will occur due to either a sync or the usual
+ * page aging behavior. Note that because a read(2) of an mmap'ed file
+ * will always check the page cache first, correct data will still be
+ * returned even when the ARC is out of date.
+ *
+ * While this implementation ensures correct behavior it does have
+ * some drawbacks. The most obvious is that it increases the required
+ * memory footprint when accessing mmap'ed files. It also adds
+ * additional complexity to the code keeping both caches synchronized.
+ *
+ * Longer term it may be possible to cleanly resolve this wart by
+ * mapping page cache pages directly on to the ARC buffers. The
+ * Linux address space operations are flexible enough to allow
+ * selection of which pages back a particular index. The trick
+ * would be working out the details of which subsystem is in
+ * charge, the ARC, the page cache, or both. It may also prove
+ * helpful to move the ARC buffers to scatter-gather lists
+ * rather than a vmalloc'ed region.
+ */
+static int
+zpl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct inode *ip = filp->f_mapping->host;
+ znode_t *zp = ITOZ(ip);
+ int error;
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
+ (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
+ spl_fstrans_unmark(cookie);
+ if (error)
+ return (error);
+
+ error = generic_file_mmap(filp, vma);
+ if (error)
+ return (error);
+
+ mutex_enter(&zp->z_lock);
+ zp->z_is_mapped = B_TRUE;
+ mutex_exit(&zp->z_lock);
+
+ return (error);
+}
+
+/*
+ * Populate a page with data for the Linux page cache. This function is
+ * only used to support mmap(2). There will be an identical copy of the
+ * data in the ARC which is kept up to date via .write() and .writepage().
+ */
+static int
+zpl_readpage(struct file *filp, struct page *pp)
+{
+ struct inode *ip;
+ struct page *pl[1];
+ int error = 0;
+ fstrans_cookie_t cookie;
+
+ ASSERT(PageLocked(pp));
+ ip = pp->mapping->host;
+ pl[0] = pp;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_getpage(ip, pl, 1);
+ spl_fstrans_unmark(cookie);
+
+ if (error) {
+ SetPageError(pp);
+ ClearPageUptodate(pp);
+ } else {
+ ClearPageError(pp);
+ SetPageUptodate(pp);
+ flush_dcache_page(pp);
+ }
+
+ unlock_page(pp);
+ return (error);
+}
+
+/*
+ * Populate a set of pages with data for the Linux page cache. This
+ * function will only be called for read ahead and never for demand
+ * paging. For simplicity, the code relies on read_cache_pages() to
+ * correctly lock each page for IO and call zpl_readpage().
+ */
+static int
+zpl_readpages(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return (read_cache_pages(mapping, pages,
+ (filler_t *)zpl_readpage, filp));
+}
+
+static int
+zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
+{
+ struct address_space *mapping = data;
+ fstrans_cookie_t cookie;
+
+ ASSERT(PageLocked(pp));
+ ASSERT(!PageWriteback(pp));
+
+ cookie = spl_fstrans_mark();
+ (void) zfs_putpage(mapping->host, pp, wbc);
+ spl_fstrans_unmark(cookie);
+
+ return (0);
+}
+
+static int
+zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ znode_t *zp = ITOZ(mapping->host);
+ zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
+ enum writeback_sync_modes sync_mode;
+ int result;
+
+ ZPL_ENTER(zfsvfs);
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ wbc->sync_mode = WB_SYNC_ALL;
+ ZPL_EXIT(zfsvfs);
+ sync_mode = wbc->sync_mode;
+
+ /*
+ * We don't want to run write_cache_pages() in SYNC mode here, because
+ * that would make putpage() wait for a single page to be committed to
+ * disk every single time, resulting in atrocious performance. Instead
+ * we run it once in non-SYNC mode so that the ZIL gets all the data,
+ * and then we commit it all in one go.
+ */
+ wbc->sync_mode = WB_SYNC_NONE;
+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+ if (sync_mode != wbc->sync_mode) {
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZPL_EXIT(zfsvfs);
+
+ /*
+ * We need to call write_cache_pages() again (we can't just
+ * return after the commit) because the previous call in
+ * non-SYNC mode does not guarantee that we got all the dirty
+ * pages (see the implementation of write_cache_pages() for
+ * details). That being said, this is a no-op in most cases.
+ */
+ wbc->sync_mode = sync_mode;
+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+ }
+ return (result);
+}
+
+/*
+ * Write out dirty pages to the ARC; this function is only required to
+ * support mmap(2). Mapped pages may be dirtied by memory operations
+ * which never call .write(). These dirty pages are kept in sync with
+ * the ARC buffers via this hook.
+ */
+static int
+zpl_writepage(struct page *pp, struct writeback_control *wbc)
+{
+ if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ wbc->sync_mode = WB_SYNC_ALL;
+
+ return (zpl_putpage(pp, wbc, pp->mapping));
+}
+
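
The reverse direction, a store through the mapping that never goes through .write(), can be sketched the same way; the path below is again hypothetical and the program is not part of this patch. The dirtied page is visible to a following read(2) because the page cache is checked first, and it eventually reaches the ARC through the writeback hooks above.

/* Illustrative sketch: memory stores dirty the page cache, not the ARC, first. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	char buf[5];
	int fd = open("/tank/fs/mapped.dat", O_RDWR);	/* >= 1 page in size */

	if (fd < 0)
		return (1);

	char *map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return (1);

	/* Dirty the page purely via memory operations, never calling write(2). */
	memcpy(map, "dirty", 5);

	/* read(2) checks the page cache first, so the new data is returned. */
	if (pread(fd, buf, 5, 0) == 5)
		printf("%.5s\n", buf);		/* prints "dirty" */

	/* Explicitly request writeback of the dirtied range. */
	(void) msync(map, 4096, MS_SYNC);

	munmap(map, 4096);
	close(fd);
	return (0);
}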
+/*
+ * The flag combination which matches the behavior of zfs_space() is
+ * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
+ * flag was introduced in the 2.6.38 kernel.
+ *
+ * The original mode=0 (allocate space) behavior can be reasonably emulated
+ * by checking if enough space exists and creating a sparse file, as real
+ * persistent space reservation is not possible due to COW, snapshots, etc.
+ */
+static long
+zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
+{
+ cred_t *cr = CRED();
+ loff_t olen;
+ fstrans_cookie_t cookie;
+ int error = 0;
+
+ if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) != 0)
+ return (-EOPNOTSUPP);
+
+ if (offset < 0 || len <= 0)
+ return (-EINVAL);
+
+ spl_inode_lock(ip);
+ olen = i_size_read(ip);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ flock64_t bf;
+
+ if (offset > olen)
+ goto out_unmark;
+
+ if (offset + len > olen)
+ len = olen - offset;
+ bf.l_type = F_WRLCK;
+ bf.l_whence = SEEK_SET;
+ bf.l_start = offset;
+ bf.l_len = len;
+ bf.l_pid = 0;
+
+ error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
+ } else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
+ unsigned int percent = zfs_fallocate_reserve_percent;
+ struct kstatfs statfs;
+
+ /* Legacy mode, disable fallocate compatibility. */
+ if (percent == 0) {
+ error = -EOPNOTSUPP;
+ goto out_unmark;
+ }
+
+ /*
+ * Use zfs_statvfs() instead of dmu_objset_space() since it
+ * also checks project quota limits, which are relevant here.
+ */
+ error = zfs_statvfs(ip, &statfs);
+ if (error)
+ goto out_unmark;
+
+ /*
+ * Shrink available space a bit to account for overhead/races.
+ * We know the product previously fit into availbytes from
+ * dmu_objset_space(), so the smaller product will also fit.
+ */
+ if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
+ error = -ENOSPC;
+ goto out_unmark;
+ }
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
+ error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
+ }
+out_unmark:
+ spl_fstrans_unmark(cookie);
+ spl_inode_unlock(ip);
+
+ crfree(cr);
+
+ return (error);
+}
+
+static long
+zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
+{
+ return zpl_fallocate_common(file_inode(filp),
+ mode, offset, len);
+}
+
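
A hedged userspace sketch of the two request types this handler accepts (hypothetical path, not part of the patch): a mode 0 preallocation, which the code above emulates with the zfs_fallocate_reserve_percent capacity check, and a hole punch, which must combine FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE.

/* Illustrative sketch of the fallocate(2) requests serviced above. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	/* Hypothetical file on a ZFS mount. */
	int fd = open("/tank/fs/sparse.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return (1);

	/*
	 * mode 0: "allocate" 1 MiB. The handler above emulates this with
	 * the zfs_fallocate_reserve_percent capacity check and a size
	 * update, since COW makes a true reservation impossible.
	 */
	if (fallocate(fd, 0, 0, 1 << 20) != 0)
		return (1);

	/*
	 * Punch a 64 KiB hole. FALLOC_FL_PUNCH_HOLE must be paired with
	 * FALLOC_FL_KEEP_SIZE, matching the flag check above.
	 */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
	    0, 64 << 10) != 0)
		return (1);

	close(fd);
	return (0);
}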
+#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
+#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
+
+static uint32_t
+__zpl_ioctl_getflags(struct inode *ip)
+{
+ uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+ uint32_t ioctl_flags = 0;
+
+ if (zfs_flags & ZFS_IMMUTABLE)
+ ioctl_flags |= FS_IMMUTABLE_FL;
+
+ if (zfs_flags & ZFS_APPENDONLY)
+ ioctl_flags |= FS_APPEND_FL;
+
+ if (zfs_flags & ZFS_NODUMP)
+ ioctl_flags |= FS_NODUMP_FL;
+
+ if (zfs_flags & ZFS_PROJINHERIT)
+ ioctl_flags |= ZFS_PROJINHERIT_FL;
+
+ return (ioctl_flags & ZFS_FL_USER_VISIBLE);
+}
+
+/*
+ * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
+ * attributes common to both Linux and Solaris are mapped.
+ */
+static int
+zpl_ioctl_getflags(struct file *filp, void __user *arg)
+{
+ uint32_t flags;
+ int err;
+
+ flags = __zpl_ioctl_getflags(file_inode(filp));
+ err = copy_to_user(arg, &flags, sizeof (flags));
+
+ return (err);
+}
+
+/*
+ * fchange() is a helper macro to detect if we have been asked to change a
+ * flag. This is ugly, but the requirement that we do this is a consequence of
+ * how the Linux file attribute interface was designed. Another consequence is
+ * that concurrent modification of files suffers from a TOCTOU race. Neither
+ * are things we can fix without modifying the kernel-userland interface, which
+ * is outside of our jurisdiction.
+ */
+
+#define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
+
+static int
+__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
+{
+ uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+ xoptattr_t *xoap;
+
+ if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
+ ZFS_PROJINHERIT_FL))
+ return (-EOPNOTSUPP);
+
+ if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
+ return (-EACCES);
+
+ if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
+ fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ return (-EACCES);
+
+ if (!inode_owner_or_capable(ip))
+ return (-EACCES);
+
+ xva_init(xva);
+ xoap = xva_getxoptattr(xva);
+
+ XVA_SET_REQ(xva, XAT_IMMUTABLE);
+ if (ioctl_flags & FS_IMMUTABLE_FL)
+ xoap->xoa_immutable = B_TRUE;
+
+ XVA_SET_REQ(xva, XAT_APPENDONLY);
+ if (ioctl_flags & FS_APPEND_FL)
+ xoap->xoa_appendonly = B_TRUE;
+
+ XVA_SET_REQ(xva, XAT_NODUMP);
+ if (ioctl_flags & FS_NODUMP_FL)
+ xoap->xoa_nodump = B_TRUE;
+
+ XVA_SET_REQ(xva, XAT_PROJINHERIT);
+ if (ioctl_flags & ZFS_PROJINHERIT_FL)
+ xoap->xoa_projinherit = B_TRUE;
+
+ return (0);
+}
+
+static int
+zpl_ioctl_setflags(struct file *filp, void __user *arg)
+{
+ struct inode *ip = file_inode(filp);
+ uint32_t flags;
+ cred_t *cr = CRED();
+ xvattr_t xva;
+ int err;
+ fstrans_cookie_t cookie;
+
+ if (copy_from_user(&flags, arg, sizeof (flags)))
+ return (-EFAULT);
+
+ err = __zpl_ioctl_setflags(ip, flags, &xva);
+ if (err)
+ return (err);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (err);
+}
+
+static int
+zpl_ioctl_getxattr(struct file *filp, void __user *arg)
+{
+ zfsxattr_t fsx = { 0 };
+ struct inode *ip = file_inode(filp);
+ int err;
+
+ fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
+ fsx.fsx_projid = ITOZ(ip)->z_projid;
+ err = copy_to_user(arg, &fsx, sizeof (fsx));
+
+ return (err);
+}
+
+static int
+zpl_ioctl_setxattr(struct file *filp, void __user *arg)
+{
+ struct inode *ip = file_inode(filp);
+ zfsxattr_t fsx;
+ cred_t *cr = CRED();
+ xvattr_t xva;
+ xoptattr_t *xoap;
+ int err;
+ fstrans_cookie_t cookie;
+
+ if (copy_from_user(&fsx, arg, sizeof (fsx)))
+ return (-EFAULT);
+
+ if (!zpl_is_valid_projid(fsx.fsx_projid))
+ return (-EINVAL);
+
+ err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
+ if (err)
+ return (err);
+
+ xoap = xva_getxoptattr(&xva);
+ XVA_SET_REQ(&xva, XAT_PROJID);
+ xoap->xoa_projid = fsx.fsx_projid;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (err);
+}
+
+static long
+zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ return (zpl_ioctl_getflags(filp, (void *)arg));
+ case FS_IOC_SETFLAGS:
+ return (zpl_ioctl_setflags(filp, (void *)arg));
+ case ZFS_IOC_FSGETXATTR:
+ return (zpl_ioctl_getxattr(filp, (void *)arg));
+ case ZFS_IOC_FSSETXATTR:
+ return (zpl_ioctl_setxattr(filp, (void *)arg));
+ default:
+ return (-ENOTTY);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long
+zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ default:
+ return (-ENOTTY);
+ }
+ return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
+}
+#endif /* CONFIG_COMPAT */
+
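These are the same ioctls issued by lsattr(1) and chattr(1). The following sketch is illustrative only (hypothetical path, not part of the patch); toggling the append-only flag requires CAP_LINUX_IMMUTABLE, matching the capability check in __zpl_ioctl_setflags().

/* Illustrative sketch of the FS_IOC_GETFLAGS/SETFLAGS ioctls handled above. */
#include <fcntl.h>
#include <linux/fs.h>		/* FS_IOC_GETFLAGS, FS_APPEND_FL */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int
main(void)
{
	unsigned int flags;
	int fd = open("/tank/fs/log.txt", O_RDONLY);	/* hypothetical path */

	if (fd < 0)
		return (1);

	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) != 0)
		return (1);

	/* Set append-only; this maps to ZFS_APPENDONLY in the handler above. */
	flags |= FS_APPEND_FL;
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) != 0)
		return (1);

	printf("append-only set\n");
	close(fd);
	return (0);
}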
+
+const struct address_space_operations zpl_address_space_operations = {
+ .readpages = zpl_readpages,
+ .readpage = zpl_readpage,
+ .writepage = zpl_writepage,
+ .writepages = zpl_writepages,
+ .direct_IO = zpl_direct_IO,
+};
+
+const struct file_operations zpl_file_operations = {
+ .open = zpl_open,
+ .release = zpl_release,
+ .llseek = zpl_llseek,
+#ifdef HAVE_VFS_RW_ITERATE
+#ifdef HAVE_NEW_SYNC_READ
+ .read = new_sync_read,
+ .write = new_sync_write,
+#endif
+ .read_iter = zpl_iter_read,
+ .write_iter = zpl_iter_write,
+#ifdef HAVE_VFS_IOV_ITER
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+#endif
+#else
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = zpl_aio_read,
+ .aio_write = zpl_aio_write,
+#endif
+ .mmap = zpl_mmap,
+ .fsync = zpl_fsync,
+#ifdef HAVE_FILE_AIO_FSYNC
+ .aio_fsync = zpl_aio_fsync,
+#endif
+ .fallocate = zpl_fallocate,
+ .unlocked_ioctl = zpl_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = zpl_compat_ioctl,
+#endif
+};
+
+const struct file_operations zpl_dir_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#if defined(HAVE_VFS_ITERATE_SHARED)
+ .iterate_shared = zpl_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_iterate,
+#else
+ .readdir = zpl_readdir,
+#endif
+ .fsync = zpl_fsync,
+ .unlocked_ioctl = zpl_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = zpl_compat_ioctl,
+#endif
+};
+
+/* BEGIN CSTYLED */
+module_param(zfs_fallocate_reserve_percent, uint, 0644);
+MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
+ "Percentage of length to use for the available capacity check");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
new file mode 100644
index 000000000000..e79d334edc9b
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
@@ -0,0 +1,745 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/dmu_objset.h>
+#include <sys/vfs.h>
+#include <sys/zpl.h>
+#include <sys/file.h>
+
+
+static struct dentry *
+zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+ cred_t *cr = CRED();
+ struct inode *ip;
+ znode_t *zp;
+ int error;
+ fstrans_cookie_t cookie;
+ pathname_t *ppn = NULL;
+ pathname_t pn;
+ int zfs_flags = 0;
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+ if (dlen(dentry) >= ZAP_MAXNAMELEN)
+ return (ERR_PTR(-ENAMETOOLONG));
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ /* If we are a case insensitive fs, we need the real name */
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ zfs_flags = FIGNORECASE;
+ pn_alloc(&pn);
+ ppn = &pn;
+ }
+
+ error = -zfs_lookup(ITOZ(dir), dname(dentry), &zp,
+ zfs_flags, cr, NULL, ppn);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_time = jiffies;
+ spin_unlock(&dentry->d_lock);
+
+ if (error) {
+ /*
+ * If we have a case sensitive fs, we do not want to
+ * insert negative entries, so return NULL for ENOENT.
+ * Fall through if the error is not ENOENT. Also free memory.
+ */
+ if (ppn) {
+ pn_free(ppn);
+ if (error == -ENOENT)
+ return (NULL);
+ }
+
+ if (error == -ENOENT)
+ return (d_splice_alias(NULL, dentry));
+ else
+ return (ERR_PTR(error));
+ }
+ ip = ZTOI(zp);
+
+ /*
+ * If we are case insensitive, call the correct function
+ * to install the name.
+ */
+ if (ppn) {
+ struct dentry *new_dentry;
+ struct qstr ci_name;
+
+ if (strcmp(dname(dentry), pn.pn_buf) == 0) {
+ new_dentry = d_splice_alias(ip, dentry);
+ } else {
+ ci_name.name = pn.pn_buf;
+ ci_name.len = strlen(pn.pn_buf);
+ new_dentry = d_add_ci(dentry, ip, &ci_name);
+ }
+ pn_free(ppn);
+ return (new_dentry);
+ } else {
+ return (d_splice_alias(ip, dentry));
+ }
+}
+
+void
+zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr)
+{
+ vap->va_mask = ATTR_MODE;
+ vap->va_mode = mode;
+ vap->va_uid = crgetfsuid(cr);
+
+ if (dir && dir->i_mode & S_ISGID) {
+ vap->va_gid = KGID_TO_SGID(dir->i_gid);
+ if (S_ISDIR(mode))
+ vap->va_mode |= S_ISGID;
+ } else {
+ vap->va_gid = crgetfsgid(cr);
+ }
+}
+
+static int
+zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag)
+{
+ cred_t *cr = CRED();
+ znode_t *zp;
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, mode, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0,
+ mode, &zp, cr, 0, NULL);
+ if (error == 0) {
+ d_instantiate(dentry, ZTOI(zp));
+
+ error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ZTOI(zp), dir);
+
+ if (error)
+ (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+ dev_t rdev)
+{
+ cred_t *cr = CRED();
+ znode_t *zp;
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ /*
+ * We currently expect Linux to supply rdev=0 for all sockets
+ * and fifos, but we want to know if this behavior ever changes.
+ */
+ if (S_ISSOCK(mode) || S_ISFIFO(mode))
+ ASSERT(rdev == 0);
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, mode, cr);
+ vap->va_rdev = rdev;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0,
+ mode, &zp, cr, 0, NULL);
+ if (error == 0) {
+ d_instantiate(dentry, ZTOI(zp));
+
+ error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ZTOI(zp), dir);
+
+ if (error)
+ (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifdef HAVE_TMPFILE
+static int
+zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ cred_t *cr = CRED();
+ struct inode *ip;
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ /*
+ * The VFS does not apply the umask, therefore it is applied here
+ * when POSIX ACLs are not enabled.
+ */
+ if (!IS_POSIXACL(dir))
+ mode &= ~current_umask();
+ zpl_vap_init(vap, dir, mode, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL);
+ if (error == 0) {
+ /* d_tmpfile will do drop_nlink, so we should set it first */
+ set_nlink(ip, 1);
+ d_tmpfile(dentry, ip);
+
+ error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ip, dir);
+ /*
+ * don't need to handle error here, file is already in
+ * unlinked set.
+ */
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+#endif
+
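The .tmpfile hook above is what backs O_TMPFILE opens on a ZFS directory. A minimal sketch under stated assumptions: the directory /tank/fs is hypothetical, and the linkat(2) step follows the generic recipe from the open(2) manual page rather than anything ZFS specific.

/* Illustrative sketch: O_TMPFILE reaches zpl_tmpfile(), linkat(2) names it. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char path[64];
	/* Hypothetical directory on a ZFS mount. */
	int fd = open("/tank/fs", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0)
		return (1);

	/* The inode lives only in the unlinked set until it is linked below. */
	if (write(fd, "scratch", 7) != 7)
		return (1);

	/* Generic recipe from open(2): give the anonymous file a name. */
	snprintf(path, sizeof (path), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, path, AT_FDCWD, "/tank/fs/scratch.dat",
	    AT_SYMLINK_FOLLOW) != 0)
		return (1);

	close(fd);
	return (0);
}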
+static int
+zpl_unlink(struct inode *dir, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+
+ /*
+ * For a CI FS we must invalidate the dentry to prevent the
+ * creation of negative entries.
+ */
+ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE)
+ d_invalidate(dentry);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ znode_t *zp;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, mode | S_IFDIR, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL);
+ if (error == 0) {
+ d_instantiate(dentry, ZTOI(zp));
+
+ error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ZTOI(zp), dir);
+
+ if (error)
+ (void) zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0);
+
+ /*
+ * For a CI FS we must invalidate the dentry to prevent the
+ * creation of negative entries.
+ */
+ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE)
+ d_invalidate(dentry);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ int error;
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+
+ /*
+ * XXX request_mask and query_flags currently ignored.
+ */
+
+ error = -zfs_getattr_fast(path->dentry->d_inode, stat);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+ZPL_GETATTR_WRAPPER(zpl_getattr);
+
+static int
+zpl_setattr(struct dentry *dentry, struct iattr *ia)
+{
+ struct inode *ip = dentry->d_inode;
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ error = setattr_prepare(dentry, ia);
+ if (error)
+ return (error);
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK;
+ vap->va_mode = ia->ia_mode;
+ vap->va_uid = KUID_TO_SUID(ia->ia_uid);
+ vap->va_gid = KGID_TO_SGID(ia->ia_gid);
+ vap->va_size = ia->ia_size;
+ vap->va_atime = ia->ia_atime;
+ vap->va_mtime = ia->ia_mtime;
+ vap->va_ctime = ia->ia_ctime;
+
+ if (vap->va_mask & ATTR_ATIME)
+ ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_setattr(ITOZ(ip), vap, 0, cr);
+ if (!error && (ia->ia_valid & ATTR_MODE))
+ error = zpl_chmod_acl(ip);
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_rename2(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ /* We don't have renameat2(2) support */
+ if (flags)
+ return (-EINVAL);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip),
+ dname(tdentry), cr, 0);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifndef HAVE_RENAME_WANTS_FLAGS
+static int
+zpl_rename(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry)
+{
+ return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0));
+}
+#endif
+
+static int
+zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
+{
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ znode_t *zp;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_symlink(ITOZ(dir), dname(dentry), vap,
+ (char *)name, &zp, cr, 0);
+ if (error == 0) {
+ d_instantiate(dentry, ZTOI(zp));
+
+ error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+ if (error)
+ (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#if defined(HAVE_PUT_LINK_COOKIE)
+static void
+zpl_put_link(struct inode *unused, void *cookie)
+{
+ kmem_free(cookie, MAXPATHLEN);
+}
+#elif defined(HAVE_PUT_LINK_NAMEIDATA)
+static void
+zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
+{
+ const char *link = nd_get_link(nd);
+
+ if (!IS_ERR(link))
+ kmem_free(link, MAXPATHLEN);
+}
+#elif defined(HAVE_PUT_LINK_DELAYED)
+static void
+zpl_put_link(void *ptr)
+{
+ kmem_free(ptr, MAXPATHLEN);
+}
+#endif
+
+static int
+zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ int error;
+
+ crhold(cr);
+ *link = NULL;
+
+ struct iovec iov;
+ iov.iov_len = MAXPATHLEN;
+ iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ zfs_uio_t uio;
+ zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, MAXPATHLEN - 1, 0);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_readlink(ip, &uio, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error)
+ kmem_free(iov.iov_base, MAXPATHLEN);
+ else
+ *link = iov.iov_base;
+
+ return (error);
+}
+
+#if defined(HAVE_GET_LINK_DELAYED)
+static const char *
+zpl_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *done)
+{
+ char *link = NULL;
+ int error;
+
+ if (!dentry)
+ return (ERR_PTR(-ECHILD));
+
+ error = zpl_get_link_common(dentry, inode, &link);
+ if (error)
+ return (ERR_PTR(error));
+
+ set_delayed_call(done, zpl_put_link, link);
+
+ return (link);
+}
+#elif defined(HAVE_GET_LINK_COOKIE)
+static const char *
+zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie)
+{
+ char *link = NULL;
+ int error;
+
+ if (!dentry)
+ return (ERR_PTR(-ECHILD));
+
+ error = zpl_get_link_common(dentry, inode, &link);
+ if (error)
+ return (ERR_PTR(error));
+
+ return (*cookie = link);
+}
+#elif defined(HAVE_FOLLOW_LINK_COOKIE)
+static const char *
+zpl_follow_link(struct dentry *dentry, void **cookie)
+{
+ char *link = NULL;
+ int error;
+
+ error = zpl_get_link_common(dentry, dentry->d_inode, &link);
+ if (error)
+ return (ERR_PTR(error));
+
+ return (*cookie = link);
+}
+#elif defined(HAVE_FOLLOW_LINK_NAMEIDATA)
+static void *
+zpl_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ char *link = NULL;
+ int error;
+
+ error = zpl_get_link_common(dentry, dentry->d_inode, &link);
+ if (error)
+ nd_set_link(nd, ERR_PTR(error));
+ else
+ nd_set_link(nd, link);
+
+ return (NULL);
+}
+#endif
+
+static int
+zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ struct inode *ip = old_dentry->d_inode;
+ int error;
+ fstrans_cookie_t cookie;
+
+ if (ip->i_nlink >= ZFS_LINK_MAX)
+ return (-EMLINK);
+
+ crhold(cr);
+ ip->i_ctime = current_time(ip);
+ igrab(ip); /* Use ihold() if available */
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_link(ITOZ(dir), ITOZ(ip), dname(dentry), cr, 0);
+ if (error) {
+ iput(ip);
+ goto out;
+ }
+
+ d_instantiate(dentry, ip);
+out:
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ unsigned int flags = (nd ? nd->flags : 0);
+#else
+zpl_revalidate(struct dentry *dentry, unsigned int flags)
+{
+#endif /* HAVE_D_REVALIDATE_NAMEIDATA */
+ /* CSTYLED */
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+ int error;
+
+ if (flags & LOOKUP_RCU)
+ return (-ECHILD);
+
+ /*
+ * After a rollback negative dentries created before the rollback
+ * time must be invalidated. Otherwise they can obscure files which
+ * are only present in the rolled back dataset.
+ */
+ if (dentry->d_inode == NULL) {
+ spin_lock(&dentry->d_lock);
+ error = time_before(dentry->d_time, zfsvfs->z_rollback_time);
+ spin_unlock(&dentry->d_lock);
+
+ if (error)
+ return (0);
+ }
+
+ /*
+ * The dentry may reference a stale inode if a mounted file system
+ * was rolled back to a point in time where the object didn't exist.
+ */
+ if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale)
+ return (0);
+
+ return (1);
+}
+
+const struct inode_operations zpl_inode_operations = {
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+ .set_acl = zpl_set_acl,
+#endif /* HAVE_SET_ACL */
+ .get_acl = zpl_get_acl,
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+const struct inode_operations zpl_dir_inode_operations = {
+ .create = zpl_create,
+ .lookup = zpl_lookup,
+ .link = zpl_link,
+ .unlink = zpl_unlink,
+ .symlink = zpl_symlink,
+ .mkdir = zpl_mkdir,
+ .rmdir = zpl_rmdir,
+ .mknod = zpl_mknod,
+#ifdef HAVE_RENAME_WANTS_FLAGS
+ .rename = zpl_rename2,
+#else
+ .rename = zpl_rename,
+#endif
+#ifdef HAVE_TMPFILE
+ .tmpfile = zpl_tmpfile,
+#endif
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+ .set_acl = zpl_set_acl,
+#endif /* HAVE_SET_ACL */
+ .get_acl = zpl_get_acl,
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+const struct inode_operations zpl_symlink_inode_operations = {
+#ifdef HAVE_GENERIC_READLINK
+ .readlink = generic_readlink,
+#endif
+#if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE)
+ .get_link = zpl_get_link,
+#elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA)
+ .follow_link = zpl_follow_link,
+#endif
+#if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA)
+ .put_link = zpl_put_link,
+#endif
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+};
+
+const struct inode_operations zpl_special_inode_operations = {
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+ .set_acl = zpl_set_acl,
+#endif /* HAVE_SET_ACL */
+ .get_acl = zpl_get_acl,
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+dentry_operations_t zpl_dentry_operations = {
+ .d_revalidate = zpl_revalidate,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
new file mode 100644
index 000000000000..c2fd3fee1401
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -0,0 +1,365 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ */
+
+
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+
+static struct inode *
+zpl_inode_alloc(struct super_block *sb)
+{
+ struct inode *ip;
+
+ VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
+ inode_set_iversion(ip, 1);
+
+ return (ip);
+}
+
+static void
+zpl_inode_destroy(struct inode *ip)
+{
+ ASSERT(atomic_read(&ip->i_count) == 0);
+ zfs_inode_destroy(ip);
+}
+
+/*
+ * Called from __mark_inode_dirty() to reflect that something in the
+ * inode has changed. We use it to ensure the znode system attributes
+ * are always strictly up to date with respect to the inode.
+ */
+#ifdef HAVE_DIRTY_INODE_WITH_FLAGS
+static void
+zpl_dirty_inode(struct inode *ip, int flags)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ zfs_dirty_inode(ip, flags);
+ spl_fstrans_unmark(cookie);
+}
+#else
+static void
+zpl_dirty_inode(struct inode *ip)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ zfs_dirty_inode(ip, 0);
+ spl_fstrans_unmark(cookie);
+}
+#endif /* HAVE_DIRTY_INODE_WITH_FLAGS */
+
+/*
+ * When ->drop_inode() is called its return value indicates if the
+ * inode should be evicted from the inode cache. If the inode is
+ * unhashed and has no links the default policy is to evict it
+ * immediately.
+ *
+ * The ->evict_inode() callback must minimally truncate the inode pages,
+ * and call clear_inode(). For 2.6.35 and later kernels this will
+ * simply update the inode state, with the sync occurring before the
+ * truncate in evict(). For earlier kernels clear_inode() maps to
+ * end_writeback() which is responsible for completing all outstanding
+ * write back. In either case, once this is done it is safe to clean
+ * up any remaining inode specific data via zfs_inactive().
+ */
+static void
+zpl_evict_inode(struct inode *ip)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ truncate_setsize(ip, 0);
+ clear_inode(ip);
+ zfs_inactive(ip);
+ spl_fstrans_unmark(cookie);
+}
+
+static void
+zpl_put_super(struct super_block *sb)
+{
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_umount(sb);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+}
+
+static int
+zpl_sync_fs(struct super_block *sb, int wait)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_sync(sb, wait, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
+{
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_statvfs(dentry->d_inode, statp);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ /*
+ * If required by a 32-bit system call, dynamically scale the
+ * block size up to 16MiB and decrease the block counts. This
+ * allows for a maximum size of 64EiB to be reported. The file
+ * counts must be artificially capped at 2^32-1.
+ */
+ if (unlikely(zpl_is_32bit_api())) {
+ while (statp->f_blocks > UINT32_MAX &&
+ statp->f_bsize < SPA_MAXBLOCKSIZE) {
+ statp->f_frsize <<= 1;
+ statp->f_bsize <<= 1;
+
+ statp->f_blocks >>= 1;
+ statp->f_bfree >>= 1;
+ statp->f_bavail >>= 1;
+ }
+
+ uint64_t usedobjs = statp->f_files - statp->f_ffree;
+ statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
+ statp->f_files = statp->f_ffree + usedobjs;
+ }
+
+ return (error);
+}
+
+static int
+zpl_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+ zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_remount(sb, flags, &zm);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
+{
+ ZPL_ENTER(zfsvfs);
+
+ char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ dmu_objset_name(zfsvfs->z_os, fsname);
+
+ for (int i = 0; fsname[i] != 0; i++) {
+ /*
+ * Spaces in the dataset name must be converted to their
+ * octal escape sequence for getmntent(3) to correctly
+		 * parse the fsname portion of /proc/self/mounts.
+ */
+ if (fsname[i] == ' ') {
+ seq_puts(seq, "\\040");
+ } else {
+ seq_putc(seq, fsname[i]);
+ }
+ }
+
+ kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ ZPL_EXIT(zfsvfs);
+
+ return (0);
+}
+
+static int
+zpl_show_devname(struct seq_file *seq, struct dentry *root)
+{
+ return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
+}
+
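As a concrete illustration of the escaping above (the dataset name is assumed): a dataset called "tank/my data" is emitted as "tank/my\040data", and getmntent(3) decodes it back to a space. A small sketch, not part of the patch:

/* Illustrative sketch: getmntent(3) decoding the escaped dataset name. */
#include <mntent.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	FILE *fp = setmntent("/proc/self/mounts", "r");
	struct mntent *m;

	if (fp == NULL)
		return (1);

	/* "tank/my\040data" in /proc/self/mounts comes back as "tank/my data". */
	while ((m = getmntent(fp)) != NULL) {
		if (strcmp(m->mnt_type, "zfs") == 0)
			printf("%s mounted at %s\n", m->mnt_fsname, m->mnt_dir);
	}

	endmntent(fp);
	return (0);
}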
+static int
+__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
+{
+ seq_printf(seq, ",%s",
+ zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");
+
+#ifdef CONFIG_FS_POSIX_ACL
+ switch (zfsvfs->z_acl_type) {
+ case ZFS_ACLTYPE_POSIX:
+ seq_puts(seq, ",posixacl");
+ break;
+ default:
+ seq_puts(seq, ",noacl");
+ break;
+ }
+#endif /* CONFIG_FS_POSIX_ACL */
+
+ return (0);
+}
+
+static int
+zpl_show_options(struct seq_file *seq, struct dentry *root)
+{
+ return (__zpl_show_options(seq, root->d_sb->s_fs_info));
+}
+
+static int
+zpl_fill_super(struct super_block *sb, void *data, int silent)
+{
+ zfs_mnt_t *zm = (zfs_mnt_t *)data;
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_domount(sb, zm, silent);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_test_super(struct super_block *s, void *data)
+{
+ zfsvfs_t *zfsvfs = s->s_fs_info;
+ objset_t *os = data;
+
+ if (zfsvfs == NULL)
+ return (0);
+
+ return (os == zfsvfs->z_os);
+}
+
+static struct super_block *
+zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
+{
+ struct super_block *s;
+ objset_t *os;
+ int err;
+
+ err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
+ if (err)
+ return (ERR_PTR(-err));
+
+ /*
+ * The dsl pool lock must be released prior to calling sget().
+ * It is possible sget() may block on the lock in grab_super()
+ * while deactivate_super() holds that same lock and waits for
+ * a txg sync. If the dsl_pool lock is held over sget()
+ * this can prevent the pool sync and cause a deadlock.
+ */
+ dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
+ s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);
+
+ dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
+ dsl_dataset_rele(dmu_objset_ds(os), FTAG);
+
+ if (IS_ERR(s))
+ return (ERR_CAST(s));
+
+ if (s->s_root == NULL) {
+ err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
+ if (err) {
+ deactivate_locked_super(s);
+ return (ERR_PTR(err));
+ }
+ s->s_flags |= SB_ACTIVE;
+ } else if ((flags ^ s->s_flags) & SB_RDONLY) {
+ deactivate_locked_super(s);
+ return (ERR_PTR(-EBUSY));
+ }
+
+ return (s);
+}
+
+static struct dentry *
+zpl_mount(struct file_system_type *fs_type, int flags,
+ const char *osname, void *data)
+{
+ zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
+
+ struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
+ if (IS_ERR(sb))
+ return (ERR_CAST(sb));
+
+ return (dget(sb->s_root));
+}
+
+static void
+zpl_kill_sb(struct super_block *sb)
+{
+ zfs_preumount(sb);
+ kill_anon_super(sb);
+}
+
+void
+zpl_prune_sb(int64_t nr_to_scan, void *arg)
+{
+ struct super_block *sb = (struct super_block *)arg;
+ int objects = 0;
+
+ (void) -zfs_prune(sb, nr_to_scan, &objects);
+}
+
+const struct super_operations zpl_super_operations = {
+ .alloc_inode = zpl_inode_alloc,
+ .destroy_inode = zpl_inode_destroy,
+ .dirty_inode = zpl_dirty_inode,
+ .write_inode = NULL,
+ .evict_inode = zpl_evict_inode,
+ .put_super = zpl_put_super,
+ .sync_fs = zpl_sync_fs,
+ .statfs = zpl_statfs,
+ .remount_fs = zpl_remount_fs,
+ .show_devname = zpl_show_devname,
+ .show_options = zpl_show_options,
+ .show_stats = NULL,
+};
+
+struct file_system_type zpl_fs_type = {
+ .owner = THIS_MODULE,
+ .name = ZFS_DRIVER,
+ .mount = zpl_mount,
+ .kill_sb = zpl_kill_sb,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
new file mode 100644
index 000000000000..83812f2dcba8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -0,0 +1,1486 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ *
+ * Extended attributes (xattr) on Solaris are implemented as files
+ * which exist in a hidden xattr directory. These extended attributes
+ * can be accessed using the attropen() system call which opens
+ * the extended attribute. It can then be manipulated just like
+ * a standard file descriptor. This has a couple of advantages such
+ * as practically no size limit on the file, and the extended
+ * attributes' permissions may differ from those of the parent file.
+ * This interface is really quite clever, but it's also completely
+ * different from what is supported on Linux. It also comes with a
+ * steep performance penalty when accessing small xattrs because they
+ * are not stored with the parent file.
+ *
+ * Under Linux extended attributes are manipulated by the system
+ * calls getxattr(2), setxattr(2), and listxattr(2). They consider
+ * extended attributes to be name/value pairs where the name is a
+ * NULL terminated string. The name must also include one of the
+ * following namespace prefixes:
+ *
+ * user - No restrictions and is available to user applications.
+ * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use.
+ * system - Used for access control lists (system.nfs4_acl, etc).
+ * security - Used by SELinux to store a file's security context.
+ *
+ * The value under Linux is limited to 65536 bytes of binary data.
+ * In practice, individual xattrs tend to be much smaller than this
+ * and are typically less than 100 bytes. A good example of this
+ * are the security.selinux xattrs which are less than 100 bytes and
+ * exist for every file when xattr labeling is enabled.
+ *
+ * The Linux xattr implementation has been written to take advantage of
+ * this typical usage. When the dataset property 'xattr=sa' is set,
+ * then xattrs will be preferentially stored as System Attributes (SA).
+ * This allows tiny xattrs (~100 bytes) to be stored with the dnode and
+ * up to 64k of xattrs to be stored in the spill block. If additional
+ * xattr space is required, which is unlikely under Linux, they will
+ * be stored using the traditional directory approach.
+ *
+ * This optimization results in roughly a 3x performance improvement
+ * when accessing xattrs because it avoids the need to perform a seek
+ * for every xattr value. When multiple xattrs are stored per-file
+ * the performance improvements are even greater because all of the
+ * xattrs stored in the spill block will be cached.
+ *
+ * However, by default SA based xattrs are disabled in the Linux port
+ * to maximize compatibility with other implementations. If you do
+ * enable SA based xattrs then they will not be visible on platforms
+ * which do not support this feature.
+ *
+ * NOTE: One additional consequence of the xattr directory implementation
+ * is that when an extended attribute is manipulated an inode is created.
+ * This inode will exist in the Linux inode cache but there will be no
+ * associated entry in the dentry cache which references it. This is
+ * safe but it may result in some confusion. Enabling SA based xattrs
+ * largely avoids the issue except in the overflow case.
+ */
+
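A hedged sketch of the Linux interface described above (the path is hypothetical and the program is not part of this patch): setting, reading, and listing a small user.* attribute, exactly the kind of ~100 byte value that xattr=sa stores as a System Attribute.

/* Illustrative sketch of the xattr syscalls served by this file. */
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int
main(void)
{
	const char *path = "/tank/fs/file.txt";	/* hypothetical path */
	char value[64], list[256];
	ssize_t len;

	/* XATTR_CREATE fails with EEXIST if the attribute already exists. */
	if (setxattr(path, "user.mime_type", "text/plain", 10,
	    XATTR_CREATE) != 0)
		return (1);

	len = getxattr(path, "user.mime_type", value, sizeof (value));
	if (len < 0)
		return (1);
	printf("user.mime_type = %.*s\n", (int)len, value);

	/* listxattr(2) returns a buffer of NUL-terminated attribute names. */
	len = listxattr(path, list, sizeof (list));
	for (ssize_t off = 0; off < len; off += strlen(list + off) + 1)
		printf("xattr: %s\n", list + off);

	return (0);
}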
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zap.h>
+#include <sys/vfs.h>
+#include <sys/zpl.h>
+
+typedef struct xattr_filldir {
+ size_t size;
+ size_t offset;
+ char *buf;
+ struct dentry *dentry;
+} xattr_filldir_t;
+
+static const struct xattr_handler *zpl_xattr_handler(const char *);
+
+static int
+zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len)
+{
+ static const struct xattr_handler *handler;
+ struct dentry *d = xf->dentry;
+
+ handler = zpl_xattr_handler(name);
+ if (!handler)
+ return (0);
+
+ if (handler->list) {
+#if defined(HAVE_XATTR_LIST_SIMPLE)
+ if (!handler->list(d))
+ return (0);
+#elif defined(HAVE_XATTR_LIST_DENTRY)
+ if (!handler->list(d, NULL, 0, name, name_len, 0))
+ return (0);
+#elif defined(HAVE_XATTR_LIST_HANDLER)
+ if (!handler->list(handler, d, NULL, 0, name, name_len))
+ return (0);
+#endif
+ }
+
+ return (1);
+}
+
+/*
+ * Determine if a given xattr name should be visible and, if so, copy it
+ * into the provided buffer (xf->buf).
+ */
+static int
+zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len)
+{
+ /* Check permissions using the per-namespace list xattr handler. */
+ if (!zpl_xattr_permission(xf, name, name_len))
+ return (0);
+
+ /* When xf->buf is NULL only calculate the required size. */
+ if (xf->buf) {
+ if (xf->offset + name_len + 1 > xf->size)
+ return (-ERANGE);
+
+ memcpy(xf->buf + xf->offset, name, name_len);
+ xf->buf[xf->offset + name_len] = '\0';
+ }
+
+ xf->offset += (name_len + 1);
+
+ return (0);
+}
+
+/*
+ * Read as many directory entry names as will fit into the provided buffer,
+ * or when no buffer is provided calculate the required buffer size.
+ */
+static int
+zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ int error;
+
+ zap_cursor_init(&zc, ITOZSB(dxip)->z_os, ITOZ(dxip)->z_id);
+
+ while ((error = -zap_cursor_retrieve(&zc, &zap)) == 0) {
+
+ if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
+ error = -ENXIO;
+ break;
+ }
+
+ error = zpl_xattr_filldir(xf, zap.za_name, strlen(zap.za_name));
+ if (error)
+ break;
+
+ zap_cursor_advance(&zc);
+ }
+
+ zap_cursor_fini(&zc);
+
+ if (error == -ENOENT)
+ error = 0;
+
+ return (error);
+}
+
+static ssize_t
+zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr)
+{
+ struct inode *ip = xf->dentry->d_inode;
+ struct inode *dxip = NULL;
+ znode_t *dxzp;
+ int error;
+
+ /* Lookup the xattr directory */
+ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR,
+ cr, NULL, NULL);
+ if (error) {
+ if (error == -ENOENT)
+ error = 0;
+
+ return (error);
+ }
+
+ dxip = ZTOI(dxzp);
+ error = zpl_xattr_readdir(dxip, xf);
+ iput(dxip);
+
+ return (error);
+}
+
+static ssize_t
+zpl_xattr_list_sa(xattr_filldir_t *xf)
+{
+ znode_t *zp = ITOZ(xf->dentry->d_inode);
+ nvpair_t *nvp = NULL;
+ int error = 0;
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_xattr_cached == NULL)
+ error = -zfs_sa_get_xattr(zp);
+ mutex_exit(&zp->z_lock);
+
+ if (error)
+ return (error);
+
+ ASSERT(zp->z_xattr_cached);
+
+ while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
+ ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);
+
+ error = zpl_xattr_filldir(xf, nvpair_name(nvp),
+ strlen(nvpair_name(nvp)));
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+ssize_t
+zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ znode_t *zp = ITOZ(dentry->d_inode);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ xattr_filldir_t xf = { buffer_size, 0, buffer, dentry };
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int error = 0;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
+ rw_enter(&zp->z_xattr_lock, RW_READER);
+
+ if (zfsvfs->z_use_sa && zp->z_is_sa) {
+ error = zpl_xattr_list_sa(&xf);
+ if (error)
+ goto out;
+ }
+
+ error = zpl_xattr_list_dir(&xf, cr);
+ if (error)
+ goto out;
+
+ error = xf.offset;
+out:
+
+ rw_exit(&zp->z_xattr_lock);
+ ZPL_EXIT(zfsvfs);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (error);
+}
+
+static int
+zpl_xattr_get_dir(struct inode *ip, const char *name, void *value,
+ size_t size, cred_t *cr)
+{
+ fstrans_cookie_t cookie;
+ struct inode *xip = NULL;
+ znode_t *dxzp = NULL;
+ znode_t *xzp = NULL;
+ int error;
+
+ /* Lookup the xattr directory */
+ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR,
+ cr, NULL, NULL);
+ if (error)
+ goto out;
+
+ /* Lookup a specific xattr name in the directory */
+ error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL);
+ if (error)
+ goto out;
+
+ xip = ZTOI(xzp);
+ if (!size) {
+ error = i_size_read(xip);
+ goto out;
+ }
+
+ if (size < i_size_read(xip)) {
+ error = -ERANGE;
+ goto out;
+ }
+
+ struct iovec iov;
+ iov.iov_base = (void *)value;
+ iov.iov_len = size;
+
+ zfs_uio_t uio;
+ zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, size, 0);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_read(ITOZ(xip), &uio, 0, cr);
+ spl_fstrans_unmark(cookie);
+
+ if (error == 0)
+ error = size - zfs_uio_resid(&uio);
+out:
+ if (xzp)
+ zrele(xzp);
+
+ if (dxzp)
+ zrele(dxzp);
+
+ return (error);
+}
+
+static int
+zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size)
+{
+ znode_t *zp = ITOZ(ip);
+ uchar_t *nv_value;
+ uint_t nv_size;
+ int error = 0;
+
+ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_xattr_cached == NULL)
+ error = -zfs_sa_get_xattr(zp);
+ mutex_exit(&zp->z_lock);
+
+ if (error)
+ return (error);
+
+ ASSERT(zp->z_xattr_cached);
+ error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name,
+ &nv_value, &nv_size);
+ if (error)
+ return (error);
+
+ if (size == 0 || value == NULL)
+ return (nv_size);
+
+ if (size < nv_size)
+ return (-ERANGE);
+
+ memcpy(value, nv_value, nv_size);
+
+ return (nv_size);
+}
+
+static int
+__zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size,
+ cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+ if (zfsvfs->z_use_sa && zp->z_is_sa) {
+ error = zpl_xattr_get_sa(ip, name, value, size);
+ if (error != -ENOENT)
+ goto out;
+ }
+
+ error = zpl_xattr_get_dir(ip, name, value, size, cr);
+out:
+ if (error == -ENOENT)
+ error = -ENODATA;
+
+ return (error);
+}
+
+#define XATTR_NOENT 0x0
+#define XATTR_IN_SA 0x1
+#define XATTR_IN_DIR 0x2
+/* check where the xattr resides */
+static int
+__zpl_xattr_where(struct inode *ip, const char *name, int *where, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ASSERT(where);
+ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+ *where = XATTR_NOENT;
+ if (zfsvfs->z_use_sa && zp->z_is_sa) {
+ error = zpl_xattr_get_sa(ip, name, NULL, 0);
+ if (error >= 0)
+ *where |= XATTR_IN_SA;
+ else if (error != -ENOENT)
+ return (error);
+ }
+
+ error = zpl_xattr_get_dir(ip, name, NULL, 0, cr);
+ if (error >= 0)
+ *where |= XATTR_IN_DIR;
+ else if (error != -ENOENT)
+ return (error);
+
+ if (*where == (XATTR_IN_SA|XATTR_IN_DIR))
+ cmn_err(CE_WARN, "ZFS: inode %p has xattr \"%s\""
+ " in both SA and dir", ip, name);
+ if (*where == XATTR_NOENT)
+ error = -ENODATA;
+ else
+ error = 0;
+ return (error);
+}
+
+static int
+zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
+ rw_enter(&zp->z_xattr_lock, RW_READER);
+ error = __zpl_xattr_get(ip, name, value, size, cr);
+ rw_exit(&zp->z_xattr_lock);
+ ZPL_EXIT(zfsvfs);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (error);
+}
+
+static int
+zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
+ size_t size, int flags, cred_t *cr)
+{
+ znode_t *dxzp = NULL;
+ znode_t *xzp = NULL;
+ vattr_t *vap = NULL;
+ int lookup_flags, error;
+ const int xattr_mode = S_IFREG | 0644;
+ loff_t pos = 0;
+
+ /*
+ * Lookup the xattr directory. When we're adding an entry pass
+ * CREATE_XATTR_DIR to ensure the xattr directory is created.
+ * When removing an entry this flag is not passed to avoid
+ * unnecessarily creating a new xattr directory.
+ */
+ lookup_flags = LOOKUP_XATTR;
+ if (value != NULL)
+ lookup_flags |= CREATE_XATTR_DIR;
+
+ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, lookup_flags,
+ cr, NULL, NULL);
+ if (error)
+ goto out;
+
+ /* Lookup a specific xattr name in the directory */
+ error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL);
+ if (error && (error != -ENOENT))
+ goto out;
+
+ error = 0;
+
+	/* Remove the named xattr when value is set to NULL. */
+ if (value == NULL) {
+ if (xzp)
+ error = -zfs_remove(dxzp, (char *)name, cr, 0);
+
+ goto out;
+ }
+
+	/* Lookup failed, create a new xattr. */
+ if (xzp == NULL) {
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ vap->va_mode = xattr_mode;
+ vap->va_mask = ATTR_MODE;
+ vap->va_uid = crgetfsuid(cr);
+ vap->va_gid = crgetfsgid(cr);
+
+ error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp,
+ cr, 0, NULL);
+ if (error)
+ goto out;
+ }
+
+ ASSERT(xzp != NULL);
+
+ error = -zfs_freesp(xzp, 0, 0, xattr_mode, TRUE);
+ if (error)
+ goto out;
+
+ error = -zfs_write_simple(xzp, value, size, pos, NULL);
+out:
+ if (error == 0) {
+ ip->i_ctime = current_time(ip);
+ zfs_mark_inode_dirty(ip);
+ }
+
+ if (vap)
+ kmem_free(vap, sizeof (vattr_t));
+
+ if (xzp)
+ zrele(xzp);
+
+ if (dxzp)
+ zrele(dxzp);
+
+ if (error == -ENOENT)
+ error = -ENODATA;
+
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value,
+ size_t size, int flags, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ nvlist_t *nvl;
+ size_t sa_size;
+ int error = 0;
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_xattr_cached == NULL)
+ error = -zfs_sa_get_xattr(zp);
+ mutex_exit(&zp->z_lock);
+
+ if (error)
+ return (error);
+
+ ASSERT(zp->z_xattr_cached);
+ nvl = zp->z_xattr_cached;
+
+ if (value == NULL) {
+ error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY);
+ if (error == -ENOENT)
+ error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr);
+ } else {
+ /* Limited to 32k to keep nvpair memory allocations small */
+ if (size > DXATTR_MAX_ENTRY_SIZE)
+ return (-EFBIG);
+
+ /* Prevent the DXATTR SA from consuming the entire SA region */
+ error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
+ if (error)
+ return (error);
+
+ if (sa_size > DXATTR_MAX_SA_SIZE)
+ return (-EFBIG);
+
+ error = -nvlist_add_byte_array(nvl, name,
+ (uchar_t *)value, size);
+ }
+
+ /*
+ * Update the SA for additions, modifications, and removals. On
+	 * error drop the inconsistent cached version of the nvlist; it
+ * will be reconstructed from the ARC when next accessed.
+ */
+ if (error == 0)
+ error = -zfs_sa_set_xattr(zp);
+
+ if (error) {
+ nvlist_free(nvl);
+ zp->z_xattr_cached = NULL;
+ }
+
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_xattr_set(struct inode *ip, const char *name, const void *value,
+ size_t size, int flags)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int where;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
+ rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER);
+
+ /*
+ * Before setting the xattr check to see if it already exists.
+ * This is done to ensure the following optional flags are honored.
+ *
+ * XATTR_CREATE: fail if xattr already exists
+ * XATTR_REPLACE: fail if xattr does not exist
+ *
+ * We also want to know if it resides in sa or dir, so we can make
+	 * sure we don't end up with a duplicate in both places.
+ */
+ error = __zpl_xattr_where(ip, name, &where, cr);
+ if (error < 0) {
+ if (error != -ENODATA)
+ goto out;
+ if (flags & XATTR_REPLACE)
+ goto out;
+
+ /* The xattr to be removed already doesn't exist */
+ error = 0;
+ if (value == NULL)
+ goto out;
+ } else {
+ error = -EEXIST;
+ if (flags & XATTR_CREATE)
+ goto out;
+ }
+
+ /* Preferentially store the xattr as a SA for better performance */
+ if (zfsvfs->z_use_sa && zp->z_is_sa &&
+ (zfsvfs->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) {
+ error = zpl_xattr_set_sa(ip, name, value, size, flags, cr);
+ if (error == 0) {
+ /*
+ * Successfully put into SA, we need to clear the one
+ * in dir.
+ */
+ if (where & XATTR_IN_DIR)
+ zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr);
+ goto out;
+ }
+ }
+
+ error = zpl_xattr_set_dir(ip, name, value, size, flags, cr);
+ /*
+ * Successfully put into dir, we need to clear the one in SA.
+ */
+ if (error == 0 && (where & XATTR_IN_SA))
+ zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr);
+out:
+ rw_exit(&ITOZ(ip)->z_xattr_lock);
+ ZPL_EXIT(zfsvfs);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+/*
+ * Extended user attributes
+ *
+ * "Extended user attributes may be assigned to files and directories for
+ * storing arbitrary additional information such as the mime type,
+ * character set or encoding of a file. The access permissions for user
+ * attributes are defined by the file permission bits: read permission
+ * is required to retrieve the attribute value, and write permission is
+ * required to change it.
+ *
+ * The file permission bits of regular files and directories are
+ * interpreted differently from the file permission bits of special
+ * files and symbolic links. For regular files and directories the file
+ * permission bits define access to the file's contents, while for
+ * device special files they define access to the device described by
+ * the special file. The file permissions of symbolic links are not
+ * used in access checks. These differences would allow users to
+ * consume filesystem resources in a way not controllable by disk quotas
+ * for group or world writable special files and directories.
+ *
+ * For this reason, extended user attributes are allowed only for
+ * regular files and directories, and access to extended user attributes
+ * is restricted to the owner and to users with appropriate capabilities
+ * for directories with the sticky bit set (see the chmod(1) manual page
+ * for an explanation of the sticky bit)." - xattr(7)
+ *
+ * ZFS allows extended user attributes to be disabled administratively
+ * by setting the 'xattr=off' property on the dataset.
+ */
+static int
+__zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ return (ITOZSB(ip)->z_flags & ZSB_XATTR);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_user_list);
+
+static int
+__zpl_xattr_user_get(struct inode *ip, const char *name,
+ void *value, size_t size)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
+ return (-EOPNOTSUPP);
+
+ xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
+ error = zpl_xattr_get(ip, xattr_name, value, size);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get);
+
+static int
+__zpl_xattr_user_set(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
+ return (-EOPNOTSUPP);
+
+ xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
+ error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set);
+
+xattr_handler_t zpl_xattr_user_handler =
+{
+ .prefix = XATTR_USER_PREFIX,
+ .list = zpl_xattr_user_list,
+ .get = zpl_xattr_user_get,
+ .set = zpl_xattr_user_set,
+};
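+
+/*
+ * For illustration, a minimal userspace sketch: with xattr=on, a "user."
+ * attribute set from userspace is routed through zpl_xattr_user_set() and
+ * zpl_xattr_user_get() above. The path is hypothetical; per xattr(7), read
+ * permission on the file is needed to retrieve the value and write
+ * permission to change it.
+ *
+ *	#include <stdio.h>
+ *	#include <string.h>
+ *	#include <sys/types.h>
+ *	#include <sys/xattr.h>
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		const char *path = "/tank/fs/file";	// hypothetical
+ *		const char *mime = "text/plain";
+ *		char buf[64];
+ *		ssize_t len;
+ *
+ *		if (setxattr(path, "user.mime_type", mime, strlen(mime), 0))
+ *			return (1);
+ *
+ *		len = getxattr(path, "user.mime_type", buf, sizeof (buf));
+ *		if (len > 0)
+ *			printf("%.*s\n", (int)len, buf);
+ *		return (0);
+ *	}
+ */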
+
+/*
+ * Trusted extended attributes
+ *
+ * "Trusted extended attributes are visible and accessible only to
+ * processes that have the CAP_SYS_ADMIN capability. Attributes in this
+ * class are used to implement mechanisms in user space (i.e., outside
+ * the kernel) which keep information in extended attributes to which
+ * ordinary processes should not have access." - xattr(7)
+ */
+static int
+__zpl_xattr_trusted_list(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ return (capable(CAP_SYS_ADMIN));
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_trusted_list);
+
+static int
+__zpl_xattr_trusted_get(struct inode *ip, const char *name,
+ void *value, size_t size)
+{
+ char *xattr_name;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return (-EACCES);
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
+ error = zpl_xattr_get(ip, xattr_name, value, size);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get);
+
+static int
+__zpl_xattr_trusted_set(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ char *xattr_name;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return (-EACCES);
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
+ error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set);
+
+xattr_handler_t zpl_xattr_trusted_handler =
+{
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = zpl_xattr_trusted_list,
+ .get = zpl_xattr_trusted_get,
+ .set = zpl_xattr_trusted_set,
+};
+
+/*
+ * Extended security attributes
+ *
+ * "The security attribute namespace is used by kernel security modules,
+ * such as Security Enhanced Linux, and also to implement file
+ * capabilities (see capabilities(7)). Read and write access
+ * permissions to security attributes depend on the policy implemented
+ * for each security attribute by the security module. When no security
+ * module is loaded, all processes have read access to extended security
+ * attributes, and write access is limited to processes that have the
+ * CAP_SYS_ADMIN capability." - xattr(7)
+ */
+static int
+__zpl_xattr_security_list(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ return (1);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_security_list);
+
+static int
+__zpl_xattr_security_get(struct inode *ip, const char *name,
+ void *value, size_t size)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
+ error = zpl_xattr_get(ip, xattr_name, value, size);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get);
+
+static int
+__zpl_xattr_security_set(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
+ error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set);
+
+static int
+zpl_xattr_security_init_impl(struct inode *ip, const struct xattr *xattrs,
+ void *fs_info)
+{
+ const struct xattr *xattr;
+ int error = 0;
+
+ for (xattr = xattrs; xattr->name != NULL; xattr++) {
+ error = __zpl_xattr_security_set(ip,
+ xattr->name, xattr->value, xattr->value_len, 0);
+
+ if (error < 0)
+ break;
+ }
+
+ return (error);
+}
+
+int
+zpl_xattr_security_init(struct inode *ip, struct inode *dip,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(ip, dip, qstr,
+ &zpl_xattr_security_init_impl, NULL);
+}
+
+/*
+ * Security xattr namespace handlers.
+ */
+xattr_handler_t zpl_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = zpl_xattr_security_list,
+ .get = zpl_xattr_security_get,
+ .set = zpl_xattr_security_set,
+};
+
+/*
+ * Extended system attributes
+ *
+ * "Extended system attributes are used by the kernel to store system
+ * objects such as Access Control Lists. Read and write access permissions
+ * to system attributes depend on the policy implemented for each system
+ * attribute implemented by filesystems in the kernel." - xattr(7)
+ */
+#ifdef CONFIG_FS_POSIX_ACL
+#ifndef HAVE_SET_ACL
+static
+#endif
+int
+zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type)
+{
+ char *name, *value = NULL;
+ int error = 0;
+ size_t size = 0;
+
+ if (S_ISLNK(ip->i_mode))
+ return (-EOPNOTSUPP);
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ if (acl) {
+ umode_t mode = ip->i_mode;
+ error = posix_acl_equiv_mode(acl, &mode);
+ if (error < 0) {
+ return (error);
+ } else {
+ /*
+ * The mode bits will have been set by
+ * ->zfs_setattr()->zfs_acl_chmod_setattr()
+ * using the ZFS ACL conversion. If they
+ * differ from the Posix ACL conversion dirty
+ * the inode to write the Posix mode bits.
+ */
+ if (ip->i_mode != mode) {
+ ip->i_mode = mode;
+ ip->i_ctime = current_time(ip);
+ zfs_mark_inode_dirty(ip);
+ }
+
+ if (error == 0)
+ acl = NULL;
+ }
+ }
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(ip->i_mode))
+ return (acl ? -EACCES : 0);
+ break;
+
+ default:
+ return (-EINVAL);
+ }
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmem_alloc(size, KM_SLEEP);
+
+ error = zpl_acl_to_xattr(acl, value, size);
+ if (error < 0) {
+ kmem_free(value, size);
+ return (error);
+ }
+ }
+
+ error = zpl_xattr_set(ip, name, value, size, 0);
+ if (value)
+ kmem_free(value, size);
+
+ if (!error) {
+ if (acl)
+ zpl_set_cached_acl(ip, type, acl);
+ else
+ zpl_forget_cached_acl(ip, type);
+ }
+
+ return (error);
+}
+
+struct posix_acl *
+zpl_get_acl(struct inode *ip, int type)
+{
+ struct posix_acl *acl;
+ void *value = NULL;
+ char *name;
+ int size;
+
+ /*
+ * As of Linux 3.14, the kernel get_acl will check this for us.
+	 * Also, as of Linux 4.7, comparing against ACL_NOT_CACHED is wrong
+	 * as the kernel get_acl will set it to a temporary sentinel value.
+ */
+#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE
+ acl = get_cached_acl(ip, type);
+ if (acl != ACL_NOT_CACHED)
+ return (acl);
+#endif
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ return (ERR_PTR(-EINVAL));
+ }
+
+ size = zpl_xattr_get(ip, name, NULL, 0);
+ if (size > 0) {
+ value = kmem_alloc(size, KM_SLEEP);
+ size = zpl_xattr_get(ip, name, value, size);
+ }
+
+ if (size > 0) {
+ acl = zpl_acl_from_xattr(value, size);
+ } else if (size == -ENODATA || size == -ENOSYS) {
+ acl = NULL;
+ } else {
+ acl = ERR_PTR(-EIO);
+ }
+
+ if (size > 0)
+ kmem_free(value, size);
+
+ /* As of Linux 4.7, the kernel get_acl will set this for us */
+#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE
+ if (!IS_ERR(acl))
+ zpl_set_cached_acl(ip, type, acl);
+#endif
+
+ return (acl);
+}
+
+int
+zpl_init_acl(struct inode *ip, struct inode *dir)
+{
+ struct posix_acl *acl = NULL;
+ int error = 0;
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (0);
+
+ if (!S_ISLNK(ip->i_mode)) {
+ acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ if (!acl) {
+ ip->i_mode &= ~current_umask();
+ ip->i_ctime = current_time(ip);
+ zfs_mark_inode_dirty(ip);
+ return (0);
+ }
+ }
+
+ if (acl) {
+ umode_t mode;
+
+ if (S_ISDIR(ip->i_mode)) {
+ error = zpl_set_acl(ip, acl, ACL_TYPE_DEFAULT);
+ if (error)
+ goto out;
+ }
+
+ mode = ip->i_mode;
+ error = __posix_acl_create(&acl, GFP_KERNEL, &mode);
+ if (error >= 0) {
+ ip->i_mode = mode;
+ zfs_mark_inode_dirty(ip);
+ if (error > 0)
+ error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS);
+ }
+ }
+out:
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+
+int
+zpl_chmod_acl(struct inode *ip)
+{
+ struct posix_acl *acl;
+ int error;
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (0);
+
+ if (S_ISLNK(ip->i_mode))
+ return (-EOPNOTSUPP);
+
+ acl = zpl_get_acl(ip, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl) || !acl)
+ return (PTR_ERR(acl));
+
+ error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode);
+ if (!error)
+ error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS);
+
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
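+
+/*
+ * For illustration, a minimal userspace sketch, assuming acltype=posixacl:
+ * an ACL set through libacl is delivered as a "system.posix_acl_access"
+ * xattr and applied by zpl_set_acl() above via the handlers below.
+ * set_simple_acl() is a hypothetical helper; link with -lacl.
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/acl.h>
+ *
+ *	static int
+ *	set_simple_acl(const char *path)
+ *	{
+ *		// owner rw-, group r--, other r--, uid 1001 rw-, plus mask
+ *		acl_t acl = acl_from_text(
+ *		    "u::rw-,g::r--,o::r--,u:1001:rw-,m::rw-");
+ *		int err;
+ *
+ *		if (acl == NULL)
+ *			return (-1);
+ *		err = acl_set_file(path, ACL_TYPE_ACCESS, acl);
+ *		acl_free(acl);
+ *		return (err);
+ *	}
+ */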
+
+static int
+__zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
+ size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS);
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (0);
+
+ if (list && xattr_size <= list_size)
+ memcpy(list, xattr_name, xattr_size);
+
+ return (xattr_size);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_access);
+
+static int
+__zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT);
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (0);
+
+ if (list && xattr_size <= list_size)
+ memcpy(list, xattr_name, xattr_size);
+
+ return (xattr_size);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_default);
+
+static int
+__zpl_xattr_acl_get_access(struct inode *ip, const char *name,
+ void *buffer, size_t size)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_ACCESS;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (-EOPNOTSUPP);
+
+ acl = zpl_get_acl(ip, type);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ if (acl == NULL)
+ return (-ENODATA);
+
+ error = zpl_acl_to_xattr(acl, buffer, size);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_access);
+
+static int
+__zpl_xattr_acl_get_default(struct inode *ip, const char *name,
+ void *buffer, size_t size)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_DEFAULT;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (-EOPNOTSUPP);
+
+ acl = zpl_get_acl(ip, type);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ if (acl == NULL)
+ return (-ENODATA);
+
+ error = zpl_acl_to_xattr(acl, buffer, size);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default);
+
+static int
+__zpl_xattr_acl_set_access(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_ACCESS;
+ int error = 0;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (-EOPNOTSUPP);
+
+ if (!inode_owner_or_capable(ip))
+ return (-EPERM);
+
+ if (value) {
+ acl = zpl_acl_from_xattr(value, size);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ else if (acl) {
+ error = zpl_posix_acl_valid(ip, acl);
+ if (error) {
+ zpl_posix_acl_release(acl);
+ return (error);
+ }
+ }
+ } else {
+ acl = NULL;
+ }
+
+ error = zpl_set_acl(ip, acl, type);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access);
+
+static int
+__zpl_xattr_acl_set_default(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_DEFAULT;
+ int error = 0;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (-EOPNOTSUPP);
+
+ if (!inode_owner_or_capable(ip))
+ return (-EPERM);
+
+ if (value) {
+ acl = zpl_acl_from_xattr(value, size);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ else if (acl) {
+ error = zpl_posix_acl_valid(ip, acl);
+ if (error) {
+ zpl_posix_acl_release(acl);
+ return (error);
+ }
+ }
+ } else {
+ acl = NULL;
+ }
+
+ error = zpl_set_acl(ip, acl, type);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_default);
+
+/*
+ * ACL access xattr namespace handlers.
+ *
+ * Use .name instead of .prefix when available. xattr_resolve_name will match
+ * whole name and reject anything that has .name only as prefix.
+ */
+xattr_handler_t zpl_xattr_acl_access_handler =
+{
+#ifdef HAVE_XATTR_HANDLER_NAME
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
+#else
+ .prefix = XATTR_NAME_POSIX_ACL_ACCESS,
+#endif
+ .list = zpl_xattr_acl_list_access,
+ .get = zpl_xattr_acl_get_access,
+ .set = zpl_xattr_acl_set_access,
+#if defined(HAVE_XATTR_LIST_SIMPLE) || \
+ defined(HAVE_XATTR_LIST_DENTRY) || \
+ defined(HAVE_XATTR_LIST_HANDLER)
+ .flags = ACL_TYPE_ACCESS,
+#endif
+};
+
+/*
+ * ACL default xattr namespace handlers.
+ *
+ * Use .name instead of .prefix when available. xattr_resolve_name will match
+ * whole name and reject anything that has .name only as prefix.
+ */
+xattr_handler_t zpl_xattr_acl_default_handler =
+{
+#ifdef HAVE_XATTR_HANDLER_NAME
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
+#else
+ .prefix = XATTR_NAME_POSIX_ACL_DEFAULT,
+#endif
+ .list = zpl_xattr_acl_list_default,
+ .get = zpl_xattr_acl_get_default,
+ .set = zpl_xattr_acl_set_default,
+#if defined(HAVE_XATTR_LIST_SIMPLE) || \
+ defined(HAVE_XATTR_LIST_DENTRY) || \
+ defined(HAVE_XATTR_LIST_HANDLER)
+ .flags = ACL_TYPE_DEFAULT,
+#endif
+};
+
+#endif /* CONFIG_FS_POSIX_ACL */
+
+xattr_handler_t *zpl_xattr_handlers[] = {
+ &zpl_xattr_security_handler,
+ &zpl_xattr_trusted_handler,
+ &zpl_xattr_user_handler,
+#ifdef CONFIG_FS_POSIX_ACL
+ &zpl_xattr_acl_access_handler,
+ &zpl_xattr_acl_default_handler,
+#endif /* CONFIG_FS_POSIX_ACL */
+ NULL
+};
+
+static const struct xattr_handler *
+zpl_xattr_handler(const char *name)
+{
+ if (strncmp(name, XATTR_USER_PREFIX,
+ XATTR_USER_PREFIX_LEN) == 0)
+ return (&zpl_xattr_user_handler);
+
+ if (strncmp(name, XATTR_TRUSTED_PREFIX,
+ XATTR_TRUSTED_PREFIX_LEN) == 0)
+ return (&zpl_xattr_trusted_handler);
+
+ if (strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) == 0)
+ return (&zpl_xattr_security_handler);
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
+ sizeof (XATTR_NAME_POSIX_ACL_ACCESS)) == 0)
+ return (&zpl_xattr_acl_access_handler);
+
+ if (strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ sizeof (XATTR_NAME_POSIX_ACL_DEFAULT)) == 0)
+ return (&zpl_xattr_acl_default_handler);
+#endif /* CONFIG_FS_POSIX_ACL */
+
+ return (NULL);
+}
+
+#if !defined(HAVE_POSIX_ACL_RELEASE) || defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY)
+struct acl_rel_struct {
+ struct acl_rel_struct *next;
+ struct posix_acl *acl;
+ clock_t time;
+};
+
+#define ACL_REL_GRACE (60*HZ)
+#define ACL_REL_WINDOW (1*HZ)
+#define ACL_REL_SCHED (ACL_REL_GRACE+ACL_REL_WINDOW)
+
+/*
+ * Lockless multi-producer single-consumer fifo list.
+ * Nodes are added to tail and removed from head. Tail pointer is our
+ * synchronization point. It always points to the next pointer of the last
+ * node, or head if list is empty.
+ */
+static struct acl_rel_struct *acl_rel_head = NULL;
+static struct acl_rel_struct **acl_rel_tail = &acl_rel_head;
+
+static void
+zpl_posix_acl_free(void *arg)
+{
+ struct acl_rel_struct *freelist = NULL;
+ struct acl_rel_struct *a;
+ clock_t new_time;
+ boolean_t refire = B_FALSE;
+
+ ASSERT3P(acl_rel_head, !=, NULL);
+ while (acl_rel_head) {
+ a = acl_rel_head;
+ if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) {
+ /*
+ * If a is the last node we need to reset tail, but we
+ * need to use cmpxchg to make sure it is still the
+ * last node.
+ */
+ if (acl_rel_tail == &a->next) {
+ acl_rel_head = NULL;
+ if (cmpxchg(&acl_rel_tail, &a->next,
+ &acl_rel_head) == &a->next) {
+ ASSERT3P(a->next, ==, NULL);
+ a->next = freelist;
+ freelist = a;
+ break;
+ }
+ }
+			/*
+			 * a is not the last node; make sure its next pointer
+			 * has been set by the adder, then advance the head.
+			 */
+ while (READ_ONCE(a->next) == NULL)
+ cpu_relax();
+ acl_rel_head = a->next;
+ a->next = freelist;
+ freelist = a;
+ } else {
+			/*
+			 * a is still within its grace period. We are
+			 * responsible for rescheduling the free task, since
+			 * the adder only does so when the list is empty.
+			 */
+ new_time = a->time + ACL_REL_SCHED;
+ refire = B_TRUE;
+ break;
+ }
+ }
+
+ if (refire)
+ taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
+ NULL, TQ_SLEEP, new_time);
+
+ while (freelist) {
+ a = freelist;
+ freelist = a->next;
+ kfree(a->acl);
+ kmem_free(a, sizeof (struct acl_rel_struct));
+ }
+}
+
+void
+zpl_posix_acl_release_impl(struct posix_acl *acl)
+{
+ struct acl_rel_struct *a, **prev;
+
+ a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP);
+ a->next = NULL;
+ a->acl = acl;
+ a->time = ddi_get_lbolt();
+	/* atomically point tail at us and get the previous tail */
+ prev = xchg(&acl_rel_tail, &a->next);
+ ASSERT3P(*prev, ==, NULL);
+ *prev = a;
+ /* if it was empty before, schedule the free task */
+ if (prev == &acl_rel_head)
+ taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
+ NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED);
+}
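+
+/*
+ * For illustration, a minimal userspace sketch of the same lockless
+ * multi-producer single-consumer protocol using C11 atomics: producers
+ * atomically swing the tail to their node's next pointer and then link the
+ * previous tail; only the single consumer uses compare-and-swap, and only
+ * when removing what it believes is the last node. push() and pop() are
+ * hypothetical names.
+ *
+ *	#include <stdatomic.h>
+ *	#include <stddef.h>
+ *
+ *	struct node { _Atomic(struct node *) next; };
+ *
+ *	static _Atomic(struct node *) head;
+ *	static _Atomic(_Atomic(struct node *) *) tail = &head;
+ *
+ *	static void
+ *	push(struct node *n)			// any number of producers
+ *	{
+ *		_Atomic(struct node *) *prev;
+ *
+ *		atomic_store(&n->next, NULL);
+ *		prev = atomic_exchange(&tail, &n->next);
+ *		atomic_store(prev, n);		// publish the node
+ *	}
+ *
+ *	static struct node *
+ *	pop(void)				// single consumer only
+ *	{
+ *		struct node *n = atomic_load(&head);
+ *		_Atomic(struct node *) *expect;
+ *		struct node *next;
+ *
+ *		if (n == NULL)
+ *			return (NULL);
+ *		expect = &n->next;
+ *		atomic_store(&head, NULL);
+ *		// if n is still the last node, point tail back at head
+ *		if (atomic_compare_exchange_strong(&tail, &expect, &head))
+ *			return (n);
+ *		// otherwise wait for the producer to publish n->next
+ *		while ((next = atomic_load(&n->next)) == NULL)
+ *			;
+ *		atomic_store(&head, next);
+ *		return (n);
+ *	}
+ */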
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
new file mode 100644
index 000000000000..0caf31307718
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -0,0 +1,1098 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/dataset_kstats.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio.h>
+#include <sys/zfs_rlock.h>
+#include <sys/spa_impl.h>
+#include <sys/zvol.h>
+#include <sys/zvol_impl.h>
+
+#include <linux/blkdev_compat.h>
+#include <linux/task_io_accounting_ops.h>
+
+unsigned int zvol_major = ZVOL_MAJOR;
+unsigned int zvol_request_sync = 0;
+unsigned int zvol_prefetch_bytes = (128 * 1024);
+unsigned long zvol_max_discard_blocks = 16384;
+unsigned int zvol_threads = 32;
+
+struct zvol_state_os {
+ struct gendisk *zvo_disk; /* generic disk */
+ struct request_queue *zvo_queue; /* request queue */
+ dev_t zvo_dev; /* device id */
+};
+
+taskq_t *zvol_taskq;
+static struct ida zvol_ida;
+
+typedef struct zv_request {
+ zvol_state_t *zv;
+ struct bio *bio;
+ taskq_ent_t ent;
+} zv_request_t;
+
+/*
+ * Given a path, return TRUE if path is a ZVOL.
+ */
+static boolean_t
+zvol_is_zvol_impl(const char *path)
+{
+ dev_t dev = 0;
+
+ if (vdev_lookup_bdev(path, &dev) != 0)
+ return (B_FALSE);
+
+ if (MAJOR(dev) == zvol_major)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static void
+zvol_write(void *arg)
+{
+ zv_request_t *zvr = arg;
+ struct bio *bio = zvr->bio;
+ int error = 0;
+ zfs_uio_t uio;
+
+ zfs_uio_bvec_init(&uio, bio);
+
+ zvol_state_t *zv = zvr->zv;
+ ASSERT3P(zv, !=, NULL);
+ ASSERT3U(zv->zv_open_count, >, 0);
+ ASSERT3P(zv->zv_zilog, !=, NULL);
+
+	/* A bio marked as FLUSH needs to flush before the write */
+ if (bio_is_flush(bio))
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ /* Some requests are just for flush and nothing else. */
+ if (uio.uio_resid == 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ BIO_END_IO(bio, 0);
+ kmem_free(zvr, sizeof (zv_request_t));
+ return;
+ }
+
+ struct request_queue *q = zv->zv_zso->zvo_queue;
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
+ ssize_t start_resid = uio.uio_resid;
+ unsigned long start_time;
+
+ boolean_t acct = blk_queue_io_stat(q);
+ if (acct)
+ start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
+
+ boolean_t sync =
+ bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+ uio.uio_loffset, uio.uio_resid, RL_WRITER);
+
+ uint64_t volsize = zv->zv_volsize;
+ while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
+ uint64_t off = uio.uio_loffset;
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+ if (bytes > volsize - off) /* don't write past the end */
+ bytes = volsize - off;
+
+ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
+
+ /* This will only fail for ENOSPC */
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ break;
+ }
+ error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+ if (error == 0) {
+ zvol_log_write(zv, tx, off, bytes, sync);
+ }
+ dmu_tx_commit(tx);
+
+ if (error)
+ break;
+ }
+ zfs_rangelock_exit(lr);
+
+ int64_t nwritten = start_resid - uio.uio_resid;
+ dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
+ task_io_account_write(nwritten);
+
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ rw_exit(&zv->zv_suspend_lock);
+
+ if (acct)
+ blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+
+ BIO_END_IO(bio, -error);
+ kmem_free(zvr, sizeof (zv_request_t));
+}
+
+static void
+zvol_discard(void *arg)
+{
+ zv_request_t *zvr = arg;
+ struct bio *bio = zvr->bio;
+ zvol_state_t *zv = zvr->zv;
+ uint64_t start = BIO_BI_SECTOR(bio) << 9;
+ uint64_t size = BIO_BI_SIZE(bio);
+ uint64_t end = start + size;
+ boolean_t sync;
+ int error = 0;
+ dmu_tx_t *tx;
+
+ ASSERT3P(zv, !=, NULL);
+ ASSERT3U(zv->zv_open_count, >, 0);
+ ASSERT3P(zv->zv_zilog, !=, NULL);
+
+ struct request_queue *q = zv->zv_zso->zvo_queue;
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
+ unsigned long start_time;
+
+ boolean_t acct = blk_queue_io_stat(q);
+ if (acct)
+ start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
+
+ sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+ if (end > zv->zv_volsize) {
+ error = SET_ERROR(EIO);
+ goto unlock;
+ }
+
+ /*
+ * Align the request to volume block boundaries when a secure erase is
+	 * not required. This prevents dnode_free_range() from zeroing out
+	 * the unaligned parts, which is slow (read-modify-write) and useless
+	 * since we are not freeing any space by doing so.
+ */
+ if (!bio_is_secure_erase(bio)) {
+ start = P2ROUNDUP(start, zv->zv_volblocksize);
+ end = P2ALIGN(end, zv->zv_volblocksize);
+ size = end - start;
+ }
+
+ if (start >= end)
+ goto unlock;
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+ start, size, RL_WRITER);
+
+ tx = dmu_tx_create(zv->zv_objset);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ zvol_log_truncate(zv, tx, start, size, B_TRUE);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset,
+ ZVOL_OBJ, start, size);
+ }
+ zfs_rangelock_exit(lr);
+
+ if (error == 0 && sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+unlock:
+ rw_exit(&zv->zv_suspend_lock);
+
+ if (acct)
+ blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+
+ BIO_END_IO(bio, -error);
+ kmem_free(zvr, sizeof (zv_request_t));
+}
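+
+/*
+ * As a worked example of the alignment above, assuming zv_volblocksize is
+ * 8192: a non-secure-erase discard of the byte range [4096, 20480) becomes
+ *
+ *	start = P2ROUNDUP(4096, 8192);		// 8192
+ *	end = P2ALIGN(20480, 8192);		// 16384
+ *	size = end - start;			// 8192, one full block
+ *
+ * so only the single fully-covered volume block is freed and the partial
+ * blocks at either edge are left untouched (no read-modify-write).
+ */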
+
+static void
+zvol_read(void *arg)
+{
+ zv_request_t *zvr = arg;
+ struct bio *bio = zvr->bio;
+ int error = 0;
+ zfs_uio_t uio;
+
+ zfs_uio_bvec_init(&uio, bio);
+
+ zvol_state_t *zv = zvr->zv;
+ ASSERT3P(zv, !=, NULL);
+ ASSERT3U(zv->zv_open_count, >, 0);
+
+ struct request_queue *q = zv->zv_zso->zvo_queue;
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
+ ssize_t start_resid = uio.uio_resid;
+ unsigned long start_time;
+
+ boolean_t acct = blk_queue_io_stat(q);
+ if (acct)
+ start_time = blk_generic_start_io_acct(q, disk, READ, bio);
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+ uio.uio_loffset, uio.uio_resid, RL_READER);
+
+ uint64_t volsize = zv->zv_volsize;
+ while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
+
+ /* don't read past the end */
+ if (bytes > volsize - uio.uio_loffset)
+ bytes = volsize - uio.uio_loffset;
+
+ error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ }
+ zfs_rangelock_exit(lr);
+
+ int64_t nread = start_resid - uio.uio_resid;
+ dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
+ task_io_account_read(nread);
+
+ rw_exit(&zv->zv_suspend_lock);
+
+ if (acct)
+ blk_generic_end_io_acct(q, disk, READ, bio, start_time);
+
+ BIO_END_IO(bio, -error);
+ kmem_free(zvr, sizeof (zv_request_t));
+}
+
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+static blk_qc_t
+zvol_submit_bio(struct bio *bio)
+#else
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
+#endif
+{
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+ struct request_queue *q = bio->bi_disk->queue;
+#endif
+ zvol_state_t *zv = q->queuedata;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+ uint64_t size = BIO_BI_SIZE(bio);
+ int rw = bio_data_dir(bio);
+ zv_request_t *zvr;
+
+ if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
+ printk(KERN_INFO
+ "%s: bad access: offset=%llu, size=%lu\n",
+ zv->zv_zso->zvo_disk->disk_name,
+ (long long unsigned)offset,
+ (long unsigned)size);
+
+ BIO_END_IO(bio, -SET_ERROR(EIO));
+ goto out;
+ }
+
+ if (rw == WRITE) {
+ if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
+ BIO_END_IO(bio, -SET_ERROR(EROFS));
+ goto out;
+ }
+
+ /*
+ * Prevents the zvol from being suspended, or the ZIL being
+ * concurrently opened. Will be released after the i/o
+ * completes.
+ */
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+ /*
+ * Open a ZIL if this is the first time we have written to this
+ * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+ * than zv_state_lock so that we don't need to acquire an
+ * additional lock in this path.
+ */
+ if (zv->zv_zilog == NULL) {
+ rw_exit(&zv->zv_suspend_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+ if (zv->zv_zilog == NULL) {
+ zv->zv_zilog = zil_open(zv->zv_objset,
+ zvol_get_data);
+ zv->zv_flags |= ZVOL_WRITTEN_TO;
+ }
+ rw_downgrade(&zv->zv_suspend_lock);
+ }
+
+ zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
+ zvr->zv = zv;
+ zvr->bio = bio;
+ taskq_init_ent(&zvr->ent);
+
+ /*
+ * We don't want this thread to be blocked waiting for i/o to
+ * complete, so we instead wait from a taskq callback. The
+ * i/o may be a ZIL write (via zil_commit()), or a read of an
+ * indirect block, or a read of a data block (if this is a
+ * partial-block write). We will indicate that the i/o is
+ * complete by calling BIO_END_IO() from the taskq callback.
+ *
+ * This design allows the calling thread to continue and
+ * initiate more concurrent operations by calling
+ * zvol_request() again. There are typically only a small
+ * number of threads available to call zvol_request() (e.g.
+ * one per iSCSI target), so keeping the latency of
+ * zvol_request() low is important for performance.
+ *
+ * The zvol_request_sync module parameter allows this
+ * behavior to be altered, for performance evaluation
+ * purposes. If the callback blocks, setting
+ * zvol_request_sync=1 will result in much worse performance.
+ *
+ * We can have up to zvol_threads concurrent i/o's being
+ * processed for all zvols on the system. This is typically
+ * a vast improvement over the zvol_request_sync=1 behavior
+ * of one i/o at a time per zvol. However, an even better
+ * design would be for zvol_request() to initiate the zio
+ * directly, and then be notified by the zio_done callback,
+ * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL
+ * interfaces lack this functionality (they block waiting for
+ * the i/o to complete).
+ */
+ if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
+ if (zvol_request_sync) {
+ zvol_discard(zvr);
+ } else {
+ taskq_dispatch_ent(zvol_taskq,
+ zvol_discard, zvr, 0, &zvr->ent);
+ }
+ } else {
+ if (zvol_request_sync) {
+ zvol_write(zvr);
+ } else {
+ taskq_dispatch_ent(zvol_taskq,
+ zvol_write, zvr, 0, &zvr->ent);
+ }
+ }
+ } else {
+ /*
+ * The SCST driver, and possibly others, may issue READ I/Os
+ * with a length of zero bytes. These empty I/Os contain no
+ * data and require no additional handling.
+ */
+ if (size == 0) {
+ BIO_END_IO(bio, 0);
+ goto out;
+ }
+
+ zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
+ zvr->zv = zv;
+ zvr->bio = bio;
+ taskq_init_ent(&zvr->ent);
+
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+ /* See comment in WRITE case above. */
+ if (zvol_request_sync) {
+ zvol_read(zvr);
+ } else {
+ taskq_dispatch_ent(zvol_taskq,
+ zvol_read, zvr, 0, &zvr->ent);
+ }
+ }
+
+out:
+ spl_fstrans_unmark(cookie);
+#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
+ defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
+ return (BLK_QC_T_NONE);
+#endif
+}
+
+static int
+zvol_open(struct block_device *bdev, fmode_t flag)
+{
+ zvol_state_t *zv;
+ int error = 0;
+ boolean_t drop_suspend = B_TRUE;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+ /*
+ * Obtain a copy of private_data under the zvol_state_lock to make
+ * sure that either the result of zvol free code path setting
+ * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
+ * is not called on this zv because of the positive zv_open_count.
+ */
+ zv = bdev->bd_disk->private_data;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(-ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
+ if (error)
+ goto out_mutex;
+ }
+
+ if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ error = -EROFS;
+ goto out_open_count;
+ }
+
+ zv->zv_open_count++;
+
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+
+ zfs_check_media_change(bdev);
+
+ return (0);
+
+out_open_count:
+ if (zv->zv_open_count == 0)
+ zvol_last_close(zv);
+
+out_mutex:
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ if (error == -EINTR) {
+ error = -ERESTARTSYS;
+ schedule();
+ }
+ return (SET_ERROR(error));
+}
+
+static void
+zvol_release(struct gendisk *disk, fmode_t mode)
+{
+ zvol_state_t *zv;
+ boolean_t drop_suspend = B_TRUE;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+ zv = disk->private_data;
+
+ mutex_enter(&zv->zv_state_lock);
+ ASSERT3U(zv->zv_open_count, >, 0);
+ /*
+ * make sure zvol is not suspended during last close
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 1) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 1) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ zv->zv_open_count--;
+ if (zv->zv_open_count == 0) {
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ zvol_last_close(zv);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+}
+
+static int
+zvol_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ zvol_state_t *zv = bdev->bd_disk->private_data;
+ int error = 0;
+
+ ASSERT3U(zv->zv_open_count, >, 0);
+
+ switch (cmd) {
+ case BLKFLSBUF:
+ fsync_bdev(bdev);
+ invalidate_bdev(bdev);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+ if (!(zv->zv_flags & ZVOL_RDONLY))
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+
+ rw_exit(&zv->zv_suspend_lock);
+ break;
+
+ case BLKZNAME:
+ mutex_enter(&zv->zv_state_lock);
+ error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+ mutex_exit(&zv->zv_state_lock);
+ break;
+
+ default:
+ error = -ENOTTY;
+ break;
+ }
+
+ return (SET_ERROR(error));
+}
+
+#ifdef CONFIG_COMPAT
+static int
+zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned cmd, unsigned long arg)
+{
+ return (zvol_ioctl(bdev, mode, cmd, arg));
+}
+#else
+#define zvol_compat_ioctl NULL
+#endif
+
+static unsigned int
+zvol_check_events(struct gendisk *disk, unsigned int clearing)
+{
+ unsigned int mask = 0;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
+ zv->zv_changed = 0;
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+
+ return (mask);
+}
+
+static int
+zvol_revalidate_disk(struct gendisk *disk)
+{
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ set_capacity(zv->zv_zso->zvo_disk,
+ zv->zv_volsize >> SECTOR_BITS);
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+
+ return (0);
+}
+
+static int
+zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
+{
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
+
+#if defined(HAVE_REVALIDATE_DISK_SIZE)
+ revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
+#elif defined(HAVE_REVALIDATE_DISK)
+ revalidate_disk(disk);
+#else
+ zvol_revalidate_disk(disk);
+#endif
+ return (0);
+}
+
+static void
+zvol_clear_private(zvol_state_t *zv)
+{
+ /*
+ * Cleared while holding zvol_state_lock as a writer
+ * which will prevent zvol_open() from opening it.
+ */
+ zv->zv_zso->zvo_disk->private_data = NULL;
+}
+
+/*
+ * Provide a simple virtual geometry for legacy compatibility. For devices
+ * smaller than 1 MiB a small head and sector count is used to allow very
+ * tiny devices. For devices over 1 MiB a standard head and sector count
+ * is used to keep the cylinders count reasonable.
+ */
+static int
+zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ zvol_state_t *zv = bdev->bd_disk->private_data;
+ sector_t sectors;
+
+ ASSERT3U(zv->zv_open_count, >, 0);
+
+ sectors = get_capacity(zv->zv_zso->zvo_disk);
+
+ if (sectors > 2048) {
+ geo->heads = 16;
+ geo->sectors = 63;
+ } else {
+ geo->heads = 2;
+ geo->sectors = 4;
+ }
+
+ geo->start = 0;
+ geo->cylinders = sectors / (geo->heads * geo->sectors);
+
+ return (0);
+}
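+
+/*
+ * As a worked example of the geometry above: a 100 MiB zvol spans 204800
+ * 512-byte sectors (more than 2048), so it reports heads = 16, sectors = 63
+ * and
+ *
+ *	cylinders = 204800 / (16 * 63) = 203
+ *
+ * (integer division), keeping the cylinder count reasonable for legacy
+ * HDIO_GETGEO consumers.
+ */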
+
+static struct block_device_operations zvol_ops = {
+ .open = zvol_open,
+ .release = zvol_release,
+ .ioctl = zvol_ioctl,
+ .compat_ioctl = zvol_compat_ioctl,
+ .check_events = zvol_check_events,
+ .revalidate_disk = zvol_revalidate_disk,
+ .getgeo = zvol_getgeo,
+ .owner = THIS_MODULE,
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+ .submit_bio = zvol_submit_bio,
+#endif
+};
+
+/*
+ * Allocate memory for a new zvol_state_t and setup the required
+ * request queue and generic disk structures for the block device.
+ */
+static zvol_state_t *
+zvol_alloc(dev_t dev, const char *name)
+{
+ zvol_state_t *zv;
+ struct zvol_state_os *zso;
+ uint64_t volmode;
+
+ if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
+ return (NULL);
+
+ if (volmode == ZFS_VOLMODE_DEFAULT)
+ volmode = zvol_volmode;
+
+ if (volmode == ZFS_VOLMODE_NONE)
+ return (NULL);
+
+ zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
+ zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+ zv->zv_zso = zso;
+ zv->zv_volmode = volmode;
+
+ list_link_init(&zv->zv_next);
+ mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+ zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
+#else
+ zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
+#endif
+ if (zso->zvo_queue == NULL)
+ goto out_kmem;
+
+ blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
+
+ /* Limit read-ahead to a single page to prevent over-prefetching. */
+ blk_queue_set_read_ahead(zso->zvo_queue, 1);
+
+ /* Disable write merging in favor of the ZIO pipeline. */
+ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
+
+ zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+ if (zso->zvo_disk == NULL)
+ goto out_queue;
+
+ zso->zvo_queue->queuedata = zv;
+ zso->zvo_dev = dev;
+ zv->zv_open_count = 0;
+ strlcpy(zv->zv_name, name, MAXNAMELEN);
+
+ zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+ rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+
+ zso->zvo_disk->major = zvol_major;
+ zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
+
+ if (volmode == ZFS_VOLMODE_DEV) {
+		/*
+		 * ZFS_VOLMODE_DEV disables partitioning on ZVOL devices: set
+		 * gendisk->minors = 1 as noted in include/linux/genhd.h.
+		 * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
+		 * and suppress partition scanning (GENHD_FL_NO_PART_SCAN)
+		 * by setting gendisk->flags accordingly.
+		 */
+ zso->zvo_disk->minors = 1;
+#if defined(GENHD_FL_EXT_DEVT)
+ zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
+#endif
+#if defined(GENHD_FL_NO_PART_SCAN)
+ zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+#endif
+ }
+ zso->zvo_disk->first_minor = (dev & MINORMASK);
+ zso->zvo_disk->fops = &zvol_ops;
+ zso->zvo_disk->private_data = zv;
+ zso->zvo_disk->queue = zso->zvo_queue;
+ snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
+ ZVOL_DEV_NAME, (dev & MINORMASK));
+
+ return (zv);
+
+out_queue:
+ blk_cleanup_queue(zso->zvo_queue);
+out_kmem:
+ kmem_free(zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (zvol_state_t));
+ return (NULL);
+}
+
+/*
+ * Cleanup then free a zvol_state_t which was created by zvol_alloc().
+ * At this time, the structure is not opened by anyone, is taken off
+ * the zvol_state_list, and has its private data set to NULL.
+ * The zvol_state_lock is dropped.
+ *
+ * This function may take many milliseconds to complete (e.g. we've seen
+ * it take over 256ms), due to the calls to "blk_cleanup_queue" and
+ * "del_gendisk". Thus, consumers need to be careful to account for this
+ * latency when calling this function.
+ */
+static void
+zvol_free(zvol_state_t *zv)
+{
+
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+ ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
+
+ del_gendisk(zv->zv_zso->zvo_disk);
+ blk_cleanup_queue(zv->zv_zso->zvo_queue);
+ put_disk(zv->zv_zso->zvo_disk);
+
+ ida_simple_remove(&zvol_ida,
+ MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
+
+ mutex_destroy(&zv->zv_state_lock);
+ dataset_kstats_destroy(&zv->zv_kstat);
+
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (zvol_state_t));
+}
+
+void
+zvol_wait_close(zvol_state_t *zv)
+{
+}
+
+/*
+ * Create a block device minor node and setup the linkage between it
+ * and the specified volume. Once this function returns the block
+ * device is live and ready for use.
+ */
+static int
+zvol_os_create_minor(const char *name)
+{
+ zvol_state_t *zv;
+ objset_t *os;
+ dmu_object_info_t *doi;
+ uint64_t volsize;
+ uint64_t len;
+ unsigned minor = 0;
+ int error = 0;
+ int idx;
+ uint64_t hash = zvol_name_hash(name);
+
+ if (zvol_inhibit_dev)
+ return (0);
+
+ idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
+ if (idx < 0)
+ return (SET_ERROR(-idx));
+ minor = idx << ZVOL_MINOR_BITS;
+
+ zv = zvol_find_by_name_hash(name, hash, RW_NONE);
+ if (zv) {
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ mutex_exit(&zv->zv_state_lock);
+ ida_simple_remove(&zvol_ida, idx);
+ return (SET_ERROR(EEXIST));
+ }
+
+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
+ if (error)
+ goto out_doi;
+
+ error = dmu_object_info(os, ZVOL_OBJ, doi);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ zv = zvol_alloc(MKDEV(zvol_major, minor), name);
+ if (zv == NULL) {
+ error = SET_ERROR(EAGAIN);
+ goto out_dmu_objset_disown;
+ }
+ zv->zv_hash = hash;
+
+ if (dmu_objset_is_snapshot(os))
+ zv->zv_flags |= ZVOL_RDONLY;
+
+ zv->zv_volblocksize = doi->doi_data_block_size;
+ zv->zv_volsize = volsize;
+ zv->zv_objset = os;
+
+ set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
+
+ blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
+ (DMU_MAX_ACCESS / 4) >> 9);
+ blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
+ blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+ blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
+ zv->zv_volblocksize);
+ blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
+ blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
+ (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
+ blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
+ zv->zv_volblocksize);
+ blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
+#ifdef QUEUE_FLAG_NONROT
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
+#endif
+#ifdef QUEUE_FLAG_ADD_RANDOM
+ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
+#endif
+ /* This flag was introduced in kernel version 4.12. */
+#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
+ blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
+#endif
+
+ if (spa_writeable(dmu_objset_spa(os))) {
+ if (zil_replay_disable)
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+ else
+ zil_replay(os, zv, zvol_replay_vector);
+ }
+ ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
+
+ /*
+ * When udev detects the addition of the device it will immediately
+ * invoke blkid(8) to determine the type of content on the device.
+ * Prefetching the blocks commonly scanned by blkid(8) will speed
+ * up this process.
+ */
+ len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
+ if (len > 0) {
+ dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
+ dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+
+ zv->zv_objset = NULL;
+out_dmu_objset_disown:
+ dmu_objset_disown(os, B_TRUE, FTAG);
+out_doi:
+ kmem_free(doi, sizeof (dmu_object_info_t));
+
+ /*
+ * Keep in mind that once add_disk() is called, the zvol is
+ * announced to the world, and zvol_open()/zvol_release() can
+ * be called at any time. Incidentally, add_disk() itself calls
+ * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
+ * directly as well.
+ */
+ if (error == 0) {
+ rw_enter(&zvol_state_lock, RW_WRITER);
+ zvol_insert(zv);
+ rw_exit(&zvol_state_lock);
+ add_disk(zv->zv_zso->zvo_disk);
+ } else {
+ ida_simple_remove(&zvol_ida, idx);
+ }
+
+ return (error);
+}
+
+static void
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+ int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
+
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+
+ /* move to new hashtable entry */
+ zv->zv_hash = zvol_name_hash(zv->zv_name);
+ hlist_del(&zv->zv_hlink);
+ hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+
+ /*
+ * The block device's read-only state is briefly changed causing
+ * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
+ * the name change and fixes the symlinks. This does not change
+ * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
+ * changes. This would normally be done using kobject_uevent() but
+ * that is a GPL-only symbol which is why we need this workaround.
+ */
+ set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
+ set_disk_ro(zv->zv_zso->zvo_disk, readonly);
+}
+
+static void
+zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
+{
+
+ set_disk_ro(zv->zv_zso->zvo_disk, flags);
+}
+
+static void
+zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
+{
+
+ set_capacity(zv->zv_zso->zvo_disk, capacity);
+}
+
+const static zvol_platform_ops_t zvol_linux_ops = {
+ .zv_free = zvol_free,
+ .zv_rename_minor = zvol_rename_minor,
+ .zv_create_minor = zvol_os_create_minor,
+ .zv_update_volsize = zvol_update_volsize,
+ .zv_clear_private = zvol_clear_private,
+ .zv_is_zvol = zvol_is_zvol_impl,
+ .zv_set_disk_ro = zvol_set_disk_ro_impl,
+ .zv_set_capacity = zvol_set_capacity_impl,
+};
+
+int
+zvol_init(void)
+{
+ int error;
+ int threads = MIN(MAX(zvol_threads, 1), 1024);
+
+ error = register_blkdev(zvol_major, ZVOL_DRIVER);
+ if (error) {
+ printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
+ return (error);
+ }
+ zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
+ threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ if (zvol_taskq == NULL) {
+ unregister_blkdev(zvol_major, ZVOL_DRIVER);
+ return (-ENOMEM);
+ }
+ zvol_init_impl();
+ ida_init(&zvol_ida);
+ zvol_register_ops(&zvol_linux_ops);
+ return (0);
+}
+
+void
+zvol_fini(void)
+{
+ zvol_fini_impl();
+ unregister_blkdev(zvol_major, ZVOL_DRIVER);
+ taskq_destroy(zvol_taskq);
+ ida_destroy(&zvol_ida);
+}
+
+/* BEGIN CSTYLED */
+module_param(zvol_inhibit_dev, uint, 0644);
+MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
+
+module_param(zvol_major, uint, 0444);
+MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
+
+module_param(zvol_threads, uint, 0444);
+MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
+
+module_param(zvol_request_sync, uint, 0644);
+MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
+
+module_param(zvol_max_discard_blocks, ulong, 0444);
+MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
+
+module_param(zvol_prefetch_bytes, uint, 0644);
+MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
+
+module_param(zvol_volmode, uint, 0644);
+MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/spl/Makefile.in b/sys/contrib/openzfs/module/spl/Makefile.in
new file mode 100644
index 000000000000..cedbfe92b58a
--- /dev/null
+++ b/sys/contrib/openzfs/module/spl/Makefile.in
@@ -0,0 +1,13 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+mfdir = $(obj)
+else
+mfdir = $(srctree)/$(src)
+endif
+
+MODULE := spl
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+include $(mfdir)/../os/linux/spl/Makefile
diff --git a/sys/contrib/openzfs/module/unicode/Makefile.in b/sys/contrib/openzfs/module/unicode/Makefile.in
new file mode 100644
index 000000000000..59c07c4555b7
--- /dev/null
+++ b/sys/contrib/openzfs/module/unicode/Makefile.in
@@ -0,0 +1,11 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+endif
+
+MODULE := zunicode
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+$(MODULE)-objs += u8_textprep.o
+$(MODULE)-objs += uconv.o
diff --git a/sys/contrib/openzfs/module/unicode/u8_textprep.c b/sys/contrib/openzfs/module/unicode/u8_textprep.c
new file mode 100644
index 000000000000..be816d728359
--- /dev/null
+++ b/sys/contrib/openzfs/module/unicode/u8_textprep.c
@@ -0,0 +1,2151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+
+/*
+ * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
+ *
+ * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
+ * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
+ * the section 3C man pages.
+ * Interface stability: Committed.
+ */
+
+#include <sys/types.h>
+#include <sys/strings.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#include <sys/u8_textprep.h>
+#include <sys/byteorder.h>
+#include <sys/errno.h>
+#include <sys/u8_textprep_data.h>
+#include <sys/mod.h>
+
+/* The maximum possible number of bytes in a UTF-8 character. */
+#define U8_MB_CUR_MAX (4)
+
+/*
+ * The maximum number of bytes needed for a UTF-8 character to cover
+ * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.
+ */
+#define U8_MAX_BYTES_UCS2 (3)
+
+/* The maximum possible number of bytes in a Stream-Safe Text. */
+#define U8_STREAM_SAFE_TEXT_MAX (128)
+
+/*
+ * The maximum number of characters in a combining/conjoining sequence and
+ * the actual upper bound limit of a combining/conjoining sequence.
+ */
+#define U8_MAX_CHARS_A_SEQ (32)
+#define U8_UPPER_LIMIT_IN_A_SEQ (31)
+
+/* The combining class value for Starter. */
+#define U8_COMBINING_CLASS_STARTER (0)
+
+/*
+ * Some Hangul related macros follow.
+ *
+ * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
+ * Vowels, and optional Trailing consonants in Unicode scalar values.
+ *
+ * Note that U8_HANGUL_JAMO_T_FIRST below is 0x11A7, not the actual U+11A8.
+ * Because the trailing consonant is optional, we pre-calculate by
+ * subtracting one.
+ *
+ * Each of the 19 modern leading consonants yields 588 possible syllables,
+ * since Hangul has 21 modern vowels and 27 modern trailing consonants plus
+ * 1 for the no-trailing-consonant case, i.e., 21 x 28 = 588.
+ *
+ * A number of other Hangul related macros are also defined below. Bear in
+ * mind that U8_HANGUL_JAMO_1ST_BYTE can be used to check whether a byte
+ * could start a Hangul Jamo, but it does not guarantee that it does; it
+ * only makes it likely.
+ */
+#define U8_HANGUL_SYL_FIRST (0xAC00U)
+#define U8_HANGUL_SYL_LAST (0xD7A3U)
+
+#define U8_HANGUL_JAMO_L_FIRST (0x1100U)
+#define U8_HANGUL_JAMO_L_LAST (0x1112U)
+#define U8_HANGUL_JAMO_V_FIRST (0x1161U)
+#define U8_HANGUL_JAMO_V_LAST (0x1175U)
+#define U8_HANGUL_JAMO_T_FIRST (0x11A7U)
+#define U8_HANGUL_JAMO_T_LAST (0x11C2U)
+
+#define U8_HANGUL_V_COUNT (21)
+#define U8_HANGUL_VT_COUNT (588)
+#define U8_HANGUL_T_COUNT (28)
+
+#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U)
+
+#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
+ (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
+ (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
+ (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));
+
+#define U8_HANGUL_JAMO_L(u) \
+ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)
+
+#define U8_HANGUL_JAMO_V(u) \
+ ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)
+
+#define U8_HANGUL_JAMO_T(u) \
+ ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define U8_HANGUL_JAMO(u) \
+ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define U8_HANGUL_SYLLABLE(u) \
+ ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)
+
+#define U8_HANGUL_COMPOSABLE_L_V(s, u) \
+ ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))
+
+#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \
+ ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
+
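+/*
+ * As a worked example of the constants above: composing the Jamo sequence
+ * U+1100 (leading consonant index 0), U+1161 (vowel index 0) and U+11A8
+ * (trailing consonant index 1, relative to U8_HANGUL_JAMO_T_FIRST) yields
+ * the precomposed syllable
+ *
+ *	0xAC00 + (0 * U8_HANGUL_V_COUNT + 0) * U8_HANGUL_T_COUNT + 1 = 0xAC01
+ *
+ * i.e. U+AC01, consistent with U8_HANGUL_VT_COUNT = 21 * 28 = 588 possible
+ * syllables per leading consonant.
+ */
+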
+/* The types of decomposition mappings. */
+#define U8_DECOMP_BOTH (0xF5U)
+#define U8_DECOMP_CANONICAL (0xF6U)
+
+/* The indicator for 16-bit table. */
+#define U8_16BIT_TABLE_INDICATOR (0x8000U)
+
+/* The following are some convenience macros. */
+#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
+ (u) = ((((uint32_t)(b1) & 0x0F) << 12) | \
+ (((uint32_t)(b2) & 0x3F) << 6) | \
+ ((uint32_t)(b3) & 0x3F));
+
+#define U8_SIMPLE_SWAP(a, b, t) \
+ (t) = (a); \
+ (a) = (b); \
+ (b) = (t);
+
+#define U8_ASCII_TOUPPER(c) \
+ (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))
+
+#define U8_ASCII_TOLOWER(c) \
+ (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))
+
+#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U)
+/*
+ * The following macro assumes that the two characters that are to be
+ * swapped are adjacent to each other and 'a' comes before 'b'.
+ *
+ * If these assumptions are not met, the macro will fail.
+ */
+#define U8_SWAP_COMB_MARKS(a, b) \
+ for (k = 0; k < disp[(a)]; k++) \
+ u8t[k] = u8s[start[(a)] + k]; \
+ for (k = 0; k < disp[(b)]; k++) \
+ u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
+ start[(b)] = start[(a)] + disp[(b)]; \
+ for (k = 0; k < disp[(a)]; k++) \
+ u8s[start[(b)] + k] = u8t[k]; \
+ U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
+ U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
+
+/* The possible states during normalization. */
+typedef enum {
+ U8_STATE_START = 0,
+ U8_STATE_HANGUL_L = 1,
+ U8_STATE_HANGUL_LV = 2,
+ U8_STATE_HANGUL_LVT = 3,
+ U8_STATE_HANGUL_V = 4,
+ U8_STATE_HANGUL_T = 5,
+ U8_STATE_COMBINING_MARK = 6
+} u8_normalization_states_t;
+
+/*
+ * The three vectors below are used to check that the bytes of a given UTF-8
+ * character are valid and do not contain any malformed byte values.
+ *
+ * UTF-8 originally allowed a rather relaxed binary representation, but
+ * security-related issues led the Unicode Consortium to define and announce
+ * the UTF-8 Corrigendum in Unicode 3.1 and to refine it once more in
+ * Unicode 3.2. The following three tables are based on that.
+ */
+
+#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF)
+
+#define I_ U8_ILLEGAL_CHAR
+#define O_ U8_OUT_OF_RANGE_CHAR
+
+const int8_t u8_number_of_bytes[0x100] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
+ I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+
+/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
+ 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
+};
+
+#undef I_
+#undef O_
+
+const uint8_t u8_valid_min_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* C8 C9 CA CB CC CD CE CF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* D8 D9 DA DB DC DD DE DF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* E8 E9 EA EB EC ED EE EF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+const uint8_t u8_valid_max_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* C8 C9 CA CB CC CD CE CF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* D8 D9 DA DB DC DD DE DF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* E8 E9 EA EB EC ED EE EF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
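+
+/*
+ * For example, for the 2-byte sequence 0xC3 0xA9 (U+00E9),
+ * u8_number_of_bytes[0xC3] is 2 and 0xA9 lies within the [0x80, 0xBF]
+ * range allowed for a 0xC3 lead byte. The tighter bounds reject well-known
+ * bad forms: a 0xE0 lead byte requires at least 0xA0 and a 0xF0 lead byte
+ * at least 0x90 (overlong encodings), a 0xED lead byte allows at most 0x9F
+ * (UTF-16 surrogate code points), and a 0xF4 lead byte allows at most 0x8F
+ * (code points beyond U+10FFFF).
+ */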
+
+
+/*
+ * The u8_validate() function validates the given UTF-8 character string and
+ * calculates its byte length. It is quite similar to mblen(3C) except that
+ * it will also validate against a list of characters if requested and is
+ * specific to UTF-8 and Unicode.
+ */
+int
+u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum)
+{
+ uchar_t *ib;
+ uchar_t *ibtail;
+ uchar_t **p;
+ uchar_t *s1;
+ uchar_t *s2;
+ uchar_t f;
+ int sz;
+ size_t i;
+ int ret_val;
+ boolean_t second;
+ boolean_t no_need_to_validate_entire;
+ boolean_t check_additional;
+ boolean_t validate_ucs2_range_only;
+
+ if (! u8str)
+ return (0);
+
+ ib = (uchar_t *)u8str;
+ ibtail = ib + n;
+
+ ret_val = 0;
+
+ no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
+ check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
+ validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
+
+ while (ib < ibtail) {
+ /*
+ * The first byte of a UTF-8 character tells how many
+ * bytes will follow for the character. If the first byte
+ * is an illegal byte value or out of range value, we just
+ * return -1 with an appropriate error number.
+ */
+ sz = u8_number_of_bytes[*ib];
+ if (sz == U8_ILLEGAL_CHAR) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+
+ if (sz == U8_OUT_OF_RANGE_CHAR ||
+ (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
+ *errnum = ERANGE;
+ return (-1);
+ }
+
+ /*
+ * If we don't have enough bytes to check on, that's also
+ * an error. As you can see, we give illegal byte sequence
+		 * checking higher priority than EINVAL cases.
+ */
+ if ((ibtail - ib) < sz) {
+ *errnum = EINVAL;
+ return (-1);
+ }
+
+ if (sz == 1) {
+ ib++;
+ ret_val++;
+ } else {
+ /*
+ * Check on the multi-byte UTF-8 character. For more
+ * details on this, see comment added for the used
+ * data structures at the beginning of the file.
+ */
+ f = *ib++;
+ ret_val++;
+ second = B_TRUE;
+ for (i = 1; i < sz; i++) {
+ if (second) {
+ if (*ib < u8_valid_min_2nd_byte[f] ||
+ *ib > u8_valid_max_2nd_byte[f]) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+ second = B_FALSE;
+ } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+ ib++;
+ ret_val++;
+ }
+ }
+
+ if (check_additional) {
+ for (p = (uchar_t **)list, i = 0; p[i]; i++) {
+ s1 = ib - sz;
+ s2 = p[i];
+ while (s1 < ib) {
+ if (*s1 != *s2 || *s2 == '\0')
+ break;
+ s1++;
+ s2++;
+ }
+
+ if (s1 >= ib && *s2 == '\0') {
+ *errnum = EBADF;
+ return (-1);
+ }
+ }
+ }
+
+ if (no_need_to_validate_entire)
+ break;
+ }
+
+ return (ret_val);
+}
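+
+/*
+ * A minimal usage sketch (the buffer content is illustrative): validating
+ * the 5-byte UTF-8 string "caf\xC3\xA9" in its entirety with
+ *
+ *	int err;
+ *	int len = u8_validate("caf\xC3\xA9", 5, NULL, U8_VALIDATE_ENTIRE,
+ *	    &err);
+ *
+ * yields len == 5. Without U8_VALIDATE_ENTIRE only the first character is
+ * checked and 1 is returned. NULL is acceptable for the list since
+ * U8_VALIDATE_CHECK_ADDITIONAL is not requested.
+ */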
+
+/*
+ * The do_case_conv() function looks at the mapping tables and returns the
+ * mapped bytes if any are found. If not found, the input bytes are returned.
+ * The function always terminates the returned bytes with a null character,
+ * assuming that there is plenty of room to do so.
+ *
+ * The case conversions are simple case conversions mapping a character to
+ * another character as specified in the Unicode data. The byte size of
+ * the mapped character could be different from that of the input character.
+ *
+ * The return value is the byte length of the returned character excluding
+ * the terminating null byte.
+ */
+static size_t
+do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
+{
+ size_t i;
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+
+ /*
+ * At this point, the only possible values for sz are 2, 3, and 4.
+ * The u8s should point to a vector that is well beyond the size of
+ * 5 bytes.
+ */
+ if (sz == 2) {
+ b3 = u8s[0] = s[0];
+ b4 = u8s[1] = s[1];
+ } else if (sz == 3) {
+ b2 = u8s[0] = s[0];
+ b3 = u8s[1] = s[1];
+ b4 = u8s[2] = s[2];
+ } else if (sz == 4) {
+ b1 = u8s[0] = s[0];
+ b2 = u8s[1] = s[1];
+ b3 = u8s[2] = s[2];
+ b4 = u8s[3] = s[3];
+ } else {
+ /* This is not possible but just in case as a fallback. */
+ if (is_it_toupper)
+ *u8s = U8_ASCII_TOUPPER(*s);
+ else
+ *u8s = U8_ASCII_TOLOWER(*s);
+ u8s[1] = '\0';
+
+ return (1);
+ }
+ u8s[sz] = '\0';
+
+ /*
+ * Let's find out if we have a corresponding character.
+ */
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b2 = u8_case_common_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ if (is_it_toupper) {
+ b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
+
+ /* Either there is no match or an error at the table. */
+ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
+ return ((size_t)sz);
+
+ b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
+ } else {
+ b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
+
+ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
+ return ((size_t)sz);
+
+ b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
+ }
+
+ /*
+ * If i is still zero, that means there is no corresponding character.
+ */
+ if (i == 0)
+ return ((size_t)sz);
+
+ u8s[i] = '\0';
+
+ return (i);
+}
+
+/*
+ * The do_case_compare() function compares the two input strings, s1 and s2,
+ * one character at a time, doing case conversions if applicable, and returns
+ * the comparison result like strcmp() does.
+ *
+ * Since, empirically, most text data consists of 7-bit ASCII characters,
+ * we treat 7-bit ASCII characters as a special case to yield faster
+ * processing time.
+ */
+static int
+do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
+ size_t n2, boolean_t is_it_toupper, int *errnum)
+{
+ int f;
+ int sz1;
+ int sz2;
+ size_t j;
+ size_t i1;
+ size_t i2;
+ uchar_t u8s1[U8_MB_CUR_MAX + 1];
+ uchar_t u8s2[U8_MB_CUR_MAX + 1];
+
+ i1 = i2 = 0;
+ while (i1 < n1 && i2 < n2) {
+ /*
+		 * Find out what the byte length would be for this UTF-8
+		 * character in string s1, and also find out whether this is
+		 * an illegal start byte; if so, issue a proper
+		 * error number yet still treat this byte as a character.
+ */
+ sz1 = u8_number_of_bytes[*s1];
+ if (sz1 < 0) {
+ *errnum = EILSEQ;
+ sz1 = 1;
+ }
+
+ /*
+ * For 7-bit ASCII characters mainly, we do a quick case
+		 * conversion right here.
+ *
+ * If we don't have enough bytes for this character, issue
+ * an EINVAL error and use what are available.
+ *
+ * If we have enough bytes, find out if there is
+ * a corresponding uppercase character and if so, copy over
+ * the bytes for a comparison later. If there is no
+ * corresponding uppercase character, then, use what we have
+ * for the comparison.
+ */
+ if (sz1 == 1) {
+ if (is_it_toupper)
+ u8s1[0] = U8_ASCII_TOUPPER(*s1);
+ else
+ u8s1[0] = U8_ASCII_TOLOWER(*s1);
+ s1++;
+ u8s1[1] = '\0';
+ } else if ((i1 + sz1) > n1) {
+ *errnum = EINVAL;
+ for (j = 0; (i1 + j) < n1; )
+ u8s1[j++] = *s1++;
+ u8s1[j] = '\0';
+ } else {
+ (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
+ s1 += sz1;
+ }
+
+ /* Do the same for the string s2. */
+ sz2 = u8_number_of_bytes[*s2];
+ if (sz2 < 0) {
+ *errnum = EILSEQ;
+ sz2 = 1;
+ }
+
+ if (sz2 == 1) {
+ if (is_it_toupper)
+ u8s2[0] = U8_ASCII_TOUPPER(*s2);
+ else
+ u8s2[0] = U8_ASCII_TOLOWER(*s2);
+ s2++;
+ u8s2[1] = '\0';
+ } else if ((i2 + sz2) > n2) {
+ *errnum = EINVAL;
+ for (j = 0; (i2 + j) < n2; )
+ u8s2[j++] = *s2++;
+ u8s2[j] = '\0';
+ } else {
+ (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
+ s2 += sz2;
+ }
+
+ /* Now compare the two characters. */
+ if (sz1 == 1 && sz2 == 1) {
+ if (*u8s1 > *u8s2)
+ return (1);
+ if (*u8s1 < *u8s2)
+ return (-1);
+ } else {
+ f = strcmp((const char *)u8s1, (const char *)u8s2);
+ if (f != 0)
+ return (f);
+ }
+
+ /*
+ * They were the same. Let's move on to the next
+ * characters then.
+ */
+ i1 += sz1;
+ i2 += sz2;
+ }
+
+ /*
+ * We compared until the end of either or both strings.
+ *
+	 * If we reached or went past the end of both, that means
+	 * they are the same.
+	 *
+	 * If we reached only one of the two ends, that means the other string
+	 * still has something left, which can be used to determine
+	 * the return value.
+ */
+ if (i1 >= n1) {
+ if (i2 >= n2)
+ return (0);
+ return (-1);
+ }
+ return (1);
+}
+
+/*
+ * The combining_class() function checks the given bytes and finds the
+ * corresponding Unicode combining class value. The return value 0 means
+ * it is a Starter. Any illegal UTF-8 character will also be treated as
+ * a Starter.
+ */
+static uchar_t
+combining_class(size_t uv, uchar_t *s, size_t sz)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b4 = 0;
+
+ if (sz == 1 || sz > 4)
+ return (0);
+
+ if (sz == 2) {
+ b3 = s[0];
+ b4 = s[1];
+ } else if (sz == 3) {
+ b2 = s[0];
+ b3 = s[1];
+ b4 = s[2];
+ } else if (sz == 4) {
+ b1 = s[0];
+ b2 = s[1];
+ b3 = s[2];
+ b4 = s[3];
+ }
+
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ b2 = u8_combining_class_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ b3 = u8_combining_class_b3_tbl[uv][b2][b3];
+ if (b3 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ return (u8_combining_class_b4_tbl[uv][b3][b4]);
+}
+
+/*
+ * The do_decomp() function finds a matching decomposition, if any,
+ * and returns it. If there is no match, the input bytes are copied and
+ * returned. The function also checks whether this is a Hangul syllable,
+ * decomposes it if necessary, and returns.
+ *
+ * To save time, a single byte 7-bit ASCII character should be handled by
+ * the caller.
+ *
+ * The function returns the number of bytes returned, excluding the
+ * always-appended terminating null byte. It also returns a state that tells
+ * whether a Hangul character was decomposed, which is then used by the
+ * caller.
+ */
+static size_t
+do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
+ boolean_t canonical_decomposition, u8_normalization_states_t *state)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+ size_t i;
+ uint32_t u1;
+
+ if (sz == 2) {
+ b3 = u8s[0] = s[0];
+ b4 = u8s[1] = s[1];
+ u8s[2] = '\0';
+ } else if (sz == 3) {
+ /* Convert it to a Unicode scalar value. */
+ U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
+
+ /*
+ * If this is a Hangul syllable, we decompose it into
+ * a leading consonant, a vowel, and an optional trailing
+ * consonant and then return.
+ */
+ if (U8_HANGUL_SYLLABLE(u1)) {
+ u1 -= U8_HANGUL_SYL_FIRST;
+
+ b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
+ b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
+ / U8_HANGUL_T_COUNT;
+ b3 = u1 % U8_HANGUL_T_COUNT;
+
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
+ if (b3) {
+ b3 += U8_HANGUL_JAMO_T_FIRST;
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
+
+ u8s[9] = '\0';
+ *state = U8_STATE_HANGUL_LVT;
+ return (9);
+ }
+
+ u8s[6] = '\0';
+ *state = U8_STATE_HANGUL_LV;
+ return (6);
+ }
+
+ b2 = u8s[0] = s[0];
+ b3 = u8s[1] = s[1];
+ b4 = u8s[2] = s[2];
+ u8s[3] = '\0';
+
+ /*
+ * If this is a Hangul Jamo, we know there is nothing
+ * further that we can decompose.
+ */
+ if (U8_HANGUL_JAMO_L(u1)) {
+ *state = U8_STATE_HANGUL_L;
+ return (3);
+ }
+
+ if (U8_HANGUL_JAMO_V(u1)) {
+ if (*state == U8_STATE_HANGUL_L)
+ *state = U8_STATE_HANGUL_LV;
+ else
+ *state = U8_STATE_HANGUL_V;
+ return (3);
+ }
+
+ if (U8_HANGUL_JAMO_T(u1)) {
+ if (*state == U8_STATE_HANGUL_LV)
+ *state = U8_STATE_HANGUL_LVT;
+ else
+ *state = U8_STATE_HANGUL_T;
+ return (3);
+ }
+ } else if (sz == 4) {
+ b1 = u8s[0] = s[0];
+ b2 = u8s[1] = s[1];
+ b3 = u8s[2] = s[2];
+ b4 = u8s[3] = s[3];
+ u8s[4] = '\0';
+ } else {
+ /*
+ * This is a fallback and should not happen if the function
+ * was called properly.
+ */
+ u8s[0] = s[0];
+ u8s[1] = '\0';
+ *state = U8_STATE_START;
+ return (1);
+ }
+
+ /*
+ * At this point, this routine does not know what it would get.
+ * The caller should sort it out if the state isn't a Hangul one.
+ */
+ *state = U8_STATE_START;
+
+ /* Try to find matching decomposition mapping byte sequence. */
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b2 = u8_decomp_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ /*
+ * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
+ * which is 0x8000, this means we couldn't fit the mappings into
+	 * the cardinality of an unsigned byte.
+ */
+ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
+ b3_tbl -= U8_16BIT_TABLE_INDICATOR;
+ start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
+ end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
+ } else {
+ start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
+ }
+
+ /* This also means there wasn't any matching decomposition. */
+ if (start_id >= end_id)
+ return ((size_t)sz);
+
+ /*
+ * The final table for decomposition mappings has three types of
+ * byte sequences depending on whether a mapping is for compatibility
+ * decomposition, canonical decomposition, or both like the following:
+ *
+ * (1) Compatibility decomposition mappings:
+ *
+ * +---+---+-...-+---+
+ * | B0| B1| ... | Bm|
+ * +---+---+-...-+---+
+ *
+	 *	The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
+ *
+ * (2) Canonical decomposition mappings:
+ *
+ * +---+---+---+-...-+---+
+ * | T | b0| b1| ... | bn|
+ * +---+---+---+-...-+---+
+ *
+ * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
+ *
+ * (3) Both mappings:
+ *
+ * +---+---+---+---+-...-+---+---+---+-...-+---+
+ * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
+ * +---+---+---+---+-...-+---+---+---+-...-+---+
+ *
+ * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
+ * byte, b0 to bn are canonical mapping bytes and B0 to Bm are
+ * compatibility mapping bytes.
+ *
+ * Note that compatibility decomposition means doing recursive
+ * decompositions using both compatibility decomposition mappings and
+ * canonical decomposition mappings. On the other hand, canonical
+ * decomposition means doing recursive decompositions using only
+ * canonical decomposition mappings. Since the table we have has gone
+ * through the recursions already, we do not need to do so during
+ * runtime, i.e., the table has been completely flattened out
+ * already.
+ */
+
+ b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
+
+ /* Get the type, T, of the byte sequence. */
+ b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
+
+ /*
+ * If necessary, adjust start_id, end_id, or both. Note that if
+ * this is compatibility decomposition mapping, there is no
+ * adjustment.
+ */
+ if (canonical_decomposition) {
+ /* Is the mapping only for compatibility decomposition? */
+ if (b1 < U8_DECOMP_BOTH)
+ return ((size_t)sz);
+
+ start_id++;
+
+ if (b1 == U8_DECOMP_BOTH) {
+ end_id = start_id +
+ u8_decomp_final_tbl[uv][b3_base + start_id];
+ start_id++;
+ }
+ } else {
+ /*
+ * Unless this is a compatibility decomposition mapping,
+ * we adjust the start_id.
+ */
+ if (b1 == U8_DECOMP_BOTH) {
+ start_id++;
+ start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
+ } else if (b1 == U8_DECOMP_CANONICAL) {
+ start_id++;
+ }
+ }
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
+ u8s[i] = '\0';
+
+ return (i);
+}
+
+/*
+ * The find_composition_start() function uses the given character bytes to
+ * find the matching composition mappings, if any, and returns the address
+ * of the composition mappings as explained in do_composition().
+ */
+static uchar_t *
+find_composition_start(size_t uv, uchar_t *s, size_t sz)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+
+ if (sz == 1) {
+ b4 = s[0];
+ } else if (sz == 2) {
+ b3 = s[0];
+ b4 = s[1];
+ } else if (sz == 3) {
+ b2 = s[0];
+ b3 = s[1];
+ b4 = s[2];
+ } else if (sz == 4) {
+ b1 = s[0];
+ b2 = s[1];
+ b3 = s[2];
+ b4 = s[3];
+ } else {
+ /*
+ * This is a fallback and should not happen if the function
+ * was called properly.
+ */
+ return (NULL);
+ }
+
+ b1 = u8_composition_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ b2 = u8_composition_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
+ b3_tbl -= U8_16BIT_TABLE_INDICATOR;
+ start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
+ end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
+ } else {
+ start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
+ }
+
+ if (start_id >= end_id)
+ return (NULL);
+
+ b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
+
+ return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
+}
+
+/*
+ * The blocked() function checks the combining class values of the previous
+ * characters in this sequence and returns whether it is blocked or not.
+ */
+static boolean_t
+blocked(uchar_t *comb_class, size_t last)
+{
+ uchar_t my_comb_class;
+ size_t i;
+
+ my_comb_class = comb_class[last];
+ for (i = 1; i < last; i++)
+ if (comb_class[i] >= my_comb_class ||
+ comb_class[i] == U8_COMBINING_CLASS_STARTER)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
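+
+/*
+ * For example, in a collected sequence <Starter, U+0301, U+0301> both marks
+ * have combining class 230, so blocked(comb_class, 2) returns B_TRUE: the
+ * second mark is blocked from the Starter by the first one and may not be
+ * composed with it.
+ */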
+
+/*
+ * The do_composition() function reads the character string pointed to by
+ * 's', does the necessary canonical composition, and then copies the result
+ * back to 's'.
+ *
+ * The input argument 's' cannot contain more than 32 characters.
+ */
+static size_t
+do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
+ uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
+{
+ uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t tc[U8_MB_CUR_MAX] = { '\0' };
+ uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
+ size_t saved_marks_count;
+ uchar_t *p;
+ uchar_t *saved_p;
+ uchar_t *q;
+ size_t i;
+ size_t saved_i;
+ size_t j;
+ size_t k;
+ size_t l;
+ size_t C;
+ size_t saved_l;
+ size_t size;
+ uint32_t u1;
+ uint32_t u2;
+ boolean_t match_not_found = B_TRUE;
+
+ /*
+ * This should never happen unless the callers are doing some strange
+ * and unexpected things.
+ *
+	 * The "last" is the index of the last character, not last + 1.
+ */
+ if (last >= U8_MAX_CHARS_A_SEQ)
+ last = U8_UPPER_LIMIT_IN_A_SEQ;
+
+ for (i = l = 0; i <= last; i++) {
+ /*
+		 * For the last character, or for any non-Starters at the
+		 * beginning, there is no chance to do composition, so we just
+		 * copy them to the temporary buffer.
+ */
+ if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
+SAVE_THE_CHAR:
+ p = s + start[i];
+ size = disp[i];
+ for (k = 0; k < size; k++)
+ t[l++] = *p++;
+ continue;
+ }
+
+ /*
+ * If this could be a start of Hangul Jamos, then, we try to
+ * conjoin them.
+ */
+ if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
+ U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
+ s[start[i] + 1], s[start[i] + 2]);
+ U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
+ s[start[i] + 4], s[start[i] + 5]);
+
+ if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
+ u1 -= U8_HANGUL_JAMO_L_FIRST;
+ u2 -= U8_HANGUL_JAMO_V_FIRST;
+ u1 = U8_HANGUL_SYL_FIRST +
+ (u1 * U8_HANGUL_V_COUNT + u2) *
+ U8_HANGUL_T_COUNT;
+
+ i += 2;
+ if (i <= last) {
+ U8_PUT_3BYTES_INTO_UTF32(u2,
+ s[start[i]], s[start[i] + 1],
+ s[start[i] + 2]);
+
+ if (U8_HANGUL_JAMO_T(u2)) {
+ u1 += u2 -
+ U8_HANGUL_JAMO_T_FIRST;
+ i++;
+ }
+ }
+
+ U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
+ i--;
+ l += 3;
+ continue;
+ }
+ }
+
+ /*
+ * Let's then find out if this Starter has composition
+ * mapping.
+ */
+ p = find_composition_start(uv, s + start[i], disp[i]);
+ if (p == NULL)
+ goto SAVE_THE_CHAR;
+
+ /*
+ * We have a Starter with composition mapping and the next
+ * character is a non-Starter. Let's try to find out if
+ * we can do composition.
+ */
+
+ saved_p = p;
+ saved_i = i;
+ saved_l = l;
+ saved_marks_count = 0;
+
+TRY_THE_NEXT_MARK:
+ q = s + start[++i];
+ size = disp[i];
+
+ /*
+ * The next for() loop compares the non-Starter pointed by
+ * 'q' with the possible (joinable) characters pointed by 'p'.
+ *
+ * The composition final table entry pointed by the 'p'
+ * looks like the following:
+ *
+ * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+		 * | C | b0| b1| ... | bn| F | B0| B1| ... | Bm| F |
+ * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+ *
+ * where C is the count byte indicating the number of
+		 * mapping pairs where each pair would look like
+ * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
+ * character of a canonical decomposition and the B0-Bm are
+ * the bytes of a matching composite character. The F is
+ * a filler byte after each character as the separator.
+ */
+
+ match_not_found = B_TRUE;
+
+ for (C = *p++; C > 0; C--) {
+ for (k = 0; k < size; p++, k++)
+ if (*p != q[k])
+ break;
+
+ /* Have we found it? */
+ if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
+ match_not_found = B_FALSE;
+
+ l = saved_l;
+
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ t[l++] = *p;
+
+ break;
+ }
+
+ /* We didn't find; skip to the next pair. */
+ if (*p != U8_TBL_ELEMENT_FILLER)
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ ;
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ ;
+ p++;
+ }
+
+ /*
+ * If there was no match, we will need to save the combining
+ * mark for later appending. After that, if the next one
+ * is a non-Starter and not blocked, then, we try once
+ * again to do composition with the next non-Starter.
+ *
+ * If there was no match and this was a Starter, then,
+ * this is a new start.
+ *
+ * If there was a match and a composition done and we have
+ * more to check on, then, we retrieve a new composition final
+ * table entry for the composite and then try to do the
+ * composition again.
+ */
+
+ if (match_not_found) {
+ if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
+ i--;
+ goto SAVE_THE_CHAR;
+ }
+
+ saved_marks[saved_marks_count++] = i;
+ }
+
+ if (saved_l == l) {
+ while (i < last) {
+ if (blocked(comb_class, i + 1))
+ saved_marks[saved_marks_count++] = ++i;
+ else
+ break;
+ }
+ if (i < last) {
+ p = saved_p;
+ goto TRY_THE_NEXT_MARK;
+ }
+ } else if (i < last) {
+ p = find_composition_start(uv, t + saved_l,
+ l - saved_l);
+ if (p != NULL) {
+ saved_p = p;
+ goto TRY_THE_NEXT_MARK;
+ }
+ }
+
+ /*
+ * There is no more composition possible.
+ *
+		 * If there was no composition whatsoever, then we copy
+		 * over the original Starter and then sequentially append any
+		 * remaining non-Starters to the target string after that.
+ */
+
+ if (saved_l == l) {
+ p = s + start[saved_i];
+ size = disp[saved_i];
+ for (j = 0; j < size; j++)
+ t[l++] = *p++;
+ }
+
+ for (k = 0; k < saved_marks_count; k++) {
+ p = s + start[saved_marks[k]];
+ size = disp[saved_marks[k]];
+ for (j = 0; j < size; j++)
+ t[l++] = *p++;
+ }
+ }
+
+ /*
+ * If the last character is a Starter and if we have a character
+ * (possibly another Starter) that can be turned into a composite,
+	 * we do so, and we keep doing so until no more composition is
+	 * possible.
+ */
+ if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
+ p = *os;
+ saved_l = l - disp[last];
+
+ while (p < oslast) {
+ size = u8_number_of_bytes[*p];
+ if (size <= 1 || (p + size) > oslast)
+ break;
+
+ saved_p = p;
+
+ for (i = 0; i < size; i++)
+ tc[i] = *p++;
+
+ q = find_composition_start(uv, t + saved_l,
+ l - saved_l);
+ if (q == NULL) {
+ p = saved_p;
+ break;
+ }
+
+ match_not_found = B_TRUE;
+
+ for (C = *q++; C > 0; C--) {
+ for (k = 0; k < size; q++, k++)
+ if (*q != tc[k])
+ break;
+
+ if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
+ match_not_found = B_FALSE;
+
+ l = saved_l;
+
+ while (*++q != U8_TBL_ELEMENT_FILLER) {
+ /*
+ * This is practically
+ * impossible but we don't
+ * want to take any chances.
+ */
+ if (l >=
+ U8_STREAM_SAFE_TEXT_MAX) {
+ p = saved_p;
+ goto SAFE_RETURN;
+ }
+ t[l++] = *q;
+ }
+
+ break;
+ }
+
+ if (*q != U8_TBL_ELEMENT_FILLER)
+ while (*++q != U8_TBL_ELEMENT_FILLER)
+ ;
+ while (*++q != U8_TBL_ELEMENT_FILLER)
+ ;
+ q++;
+ }
+
+ if (match_not_found) {
+ p = saved_p;
+ break;
+ }
+ }
+SAFE_RETURN:
+ *os = p;
+ }
+
+ /*
+ * Now we copy over the temporary string to the target string.
+	 * Since composition always reduces the number of characters or
+	 * keeps it the same, we don't need to worry about
+	 * buffer overflow here.
+ */
+ for (i = 0; i < l; i++)
+ s[i] = t[i];
+ s[l] = '\0';
+
+ return (l);
+}
+
+/*
+ * The collect_a_seq() function checks the given string s, collects
+ * a sequence of characters into u8s, and returns the sequence. While it
+ * collects a sequence, it also applies case conversion, canonical or
+ * compatibility decomposition, and canonical composition (some or all of
+ * them, in that order).
+ *
+ * The collected sequence cannot be bigger than 32 characters since, if
+ * it has more than 31 characters, the sequence will be terminated
+ * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
+ * Stream-Safe Text. The collected sequence is always terminated with
+ * a null byte, and the return value is the byte length of the sequence,
+ * which may be 0. The return value does not include the terminating
+ * null byte.
+ */
+static size_t
+collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
+ boolean_t is_it_toupper, boolean_t is_it_tolower,
+ boolean_t canonical_decomposition, boolean_t compatibility_decomposition,
+ boolean_t canonical_composition,
+ int *errnum, u8_normalization_states_t *state)
+{
+ uchar_t *s;
+ int sz;
+ int saved_sz;
+ size_t i;
+ size_t j;
+ size_t k;
+ size_t l;
+ uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
+ uchar_t disp[U8_MAX_CHARS_A_SEQ];
+ uchar_t start[U8_MAX_CHARS_A_SEQ];
+ uchar_t u8t[U8_MB_CUR_MAX] = { '\0' };
+ uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t tc;
+ size_t last;
+ size_t saved_last;
+ uint32_t u1;
+
+ /*
+	 * Save the source string pointer; we will return a changed
+	 * pointer through it if we do any processing.
+ */
+ s = *source;
+
+ /*
+	 * The following is a fallback just in case callers are not
+	 * checking the string boundaries before calling.
+ */
+ if (s >= slast) {
+ u8s[0] = '\0';
+
+ return (0);
+ }
+
+ /*
+ * As the first thing, let's collect a character and do case
+ * conversion if necessary.
+ */
+
+ sz = u8_number_of_bytes[*s];
+
+ if (sz < 0) {
+ *errnum = EILSEQ;
+
+ u8s[0] = *s++;
+ u8s[1] = '\0';
+
+ *source = s;
+
+ return (1);
+ }
+
+ if (sz == 1) {
+ if (is_it_toupper)
+ u8s[0] = U8_ASCII_TOUPPER(*s);
+ else if (is_it_tolower)
+ u8s[0] = U8_ASCII_TOLOWER(*s);
+ else
+ u8s[0] = *s;
+ s++;
+ u8s[1] = '\0';
+ } else if ((s + sz) > slast) {
+ *errnum = EINVAL;
+
+ for (i = 0; s < slast; )
+ u8s[i++] = *s++;
+ u8s[i] = '\0';
+
+ *source = s;
+
+ return (i);
+ } else {
+ if (is_it_toupper || is_it_tolower) {
+ i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
+ s += sz;
+ sz = i;
+ } else {
+ for (i = 0; i < sz; )
+ u8s[i++] = *s++;
+ u8s[i] = '\0';
+ }
+ }
+
+ /*
+ * And then canonical/compatibility decomposition followed by
+	 * an optional canonical composition. Please note that
+ * canonical composition is done only when a decomposition is
+ * done.
+ */
+ if (canonical_decomposition || compatibility_decomposition) {
+ if (sz == 1) {
+ *state = U8_STATE_START;
+
+ saved_sz = 1;
+
+ comb_class[0] = 0;
+ start[0] = 0;
+ disp[0] = 1;
+
+ last = 1;
+ } else {
+ saved_sz = do_decomp(uv, u8s, u8s, sz,
+ canonical_decomposition, state);
+
+ last = 0;
+
+ for (i = 0; i < saved_sz; ) {
+ sz = u8_number_of_bytes[u8s[i]];
+
+ comb_class[last] = combining_class(uv,
+ u8s + i, sz);
+ start[last] = i;
+ disp[last] = sz;
+
+ last++;
+ i += sz;
+ }
+
+ /*
+			 * Decomposition yields various Hangul-related
+			 * states but not for combining marks. We need to
+			 * find that out here by checking the last
+			 * character.
+ */
+ if (*state == U8_STATE_START) {
+ if (comb_class[last - 1])
+ *state = U8_STATE_COMBINING_MARK;
+ }
+ }
+
+ saved_last = last;
+
+ while (s < slast) {
+ sz = u8_number_of_bytes[*s];
+
+ /*
+ * If this is an illegal character, an incomplete
+			 * character, or a 7-bit ASCII Starter character,
+			 * then we have collected a sequence; break and let
+			 * the next call deal with those cases.
+ *
+ * Note that this is okay only if you are using this
+ * function with a fixed length string, not on
+ * a buffer with multiple calls of one chunk at a time.
+ */
+ if (sz <= 1) {
+ break;
+ } else if ((s + sz) > slast) {
+ break;
+ } else {
+ /*
+ * If the previous character was a Hangul Jamo
+ * and this character is a Hangul Jamo that
+ * can be conjoined, we collect the Jamo.
+ */
+ if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
+ U8_PUT_3BYTES_INTO_UTF32(u1,
+ *s, *(s + 1), *(s + 2));
+
+ if (U8_HANGUL_COMPOSABLE_L_V(*state,
+ u1)) {
+ i = 0;
+ *state = U8_STATE_HANGUL_LV;
+ goto COLLECT_A_HANGUL;
+ }
+
+ if (U8_HANGUL_COMPOSABLE_LV_T(*state,
+ u1)) {
+ i = 0;
+ *state = U8_STATE_HANGUL_LVT;
+ goto COLLECT_A_HANGUL;
+ }
+ }
+
+ /*
+ * Regardless of whatever it was, if this is
+ * a Starter, we don't collect the character
+ * since that's a new start and we will deal
+ * with it at the next time.
+ */
+ i = combining_class(uv, s, sz);
+ if (i == U8_COMBINING_CLASS_STARTER)
+ break;
+
+ /*
+ * We know the current character is a combining
+ * mark. If the previous character wasn't
+ * a Starter (not Hangul) or a combining mark,
+ * then, we don't collect this combining mark.
+ */
+ if (*state != U8_STATE_START &&
+ *state != U8_STATE_COMBINING_MARK)
+ break;
+
+ *state = U8_STATE_COMBINING_MARK;
+COLLECT_A_HANGUL:
+ /*
+ * If we collected a Starter and combining
+ * marks up to 30, i.e., total 31 characters,
+ * then, we terminate this degenerately long
+ * combining sequence with a U+034F COMBINING
+ * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
+ * UTF-8 and turn this into a Stream-Safe
+ * Text. This will be extremely rare but
+ * possible.
+ *
+ * The following will also guarantee that
+ * we are not writing more than 32 characters
+ * plus a NULL at u8s[].
+ */
+ if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
+TURN_STREAM_SAFE:
+ *state = U8_STATE_START;
+ comb_class[last] = 0;
+ start[last] = saved_sz;
+ disp[last] = 2;
+ last++;
+
+ u8s[saved_sz++] = 0xCD;
+ u8s[saved_sz++] = 0x8F;
+
+ break;
+ }
+
+ /*
+ * Some combining marks also do decompose into
+ * another combining mark or marks.
+ */
+ if (*state == U8_STATE_COMBINING_MARK) {
+ k = last;
+ l = sz;
+ i = do_decomp(uv, uts, s, sz,
+ canonical_decomposition, state);
+ for (j = 0; j < i; ) {
+ sz = u8_number_of_bytes[uts[j]];
+
+ comb_class[last] =
+ combining_class(uv,
+ uts + j, sz);
+ start[last] = saved_sz + j;
+ disp[last] = sz;
+
+ last++;
+ if (last >=
+ U8_UPPER_LIMIT_IN_A_SEQ) {
+ last = k;
+ goto TURN_STREAM_SAFE;
+ }
+ j += sz;
+ }
+
+ *state = U8_STATE_COMBINING_MARK;
+ sz = i;
+ s += l;
+
+ for (i = 0; i < sz; i++)
+ u8s[saved_sz++] = uts[i];
+ } else {
+ comb_class[last] = i;
+ start[last] = saved_sz;
+ disp[last] = sz;
+ last++;
+
+ for (i = 0; i < sz; i++)
+ u8s[saved_sz++] = *s++;
+ }
+
+ /*
+ * If this is U+0345 COMBINING GREEK
+ * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
+				 * iota subscript, and it needs to be converted
+				 * to uppercase, we convert it to U+0399 GREEK
+ * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
+ * i.e., convert to capital adscript form as
+ * specified in the Unicode standard.
+ *
+ * This is the only special case of (ambiguous)
+ * case conversion at combining marks and
+ * probably the standard will never have
+				 * anything similar to this in the future.
+ */
+ if (is_it_toupper && sz >= 2 &&
+ u8s[saved_sz - 2] == 0xCD &&
+ u8s[saved_sz - 1] == 0x85) {
+ u8s[saved_sz - 2] = 0xCE;
+ u8s[saved_sz - 1] = 0x99;
+ }
+ }
+ }
+
+ /*
+ * Let's try to ensure a canonical ordering for the collected
+ * combining marks. We do this only if we have collected
+ * at least one more non-Starter. (The decomposition mapping
+ * data tables have fully (and recursively) expanded and
+ * canonically ordered decompositions.)
+ *
+ * The U8_SWAP_COMB_MARKS() convenience macro has some
+ * assumptions and we are meeting the assumptions.
+ */
+ last--;
+ if (last >= saved_last) {
+ for (i = 0; i < last; i++)
+ for (j = last; j > i; j--)
+ if (comb_class[j] &&
+ comb_class[j - 1] > comb_class[j]) {
+ U8_SWAP_COMB_MARKS(j - 1, j);
+ }
+ }
+
+ *source = s;
+
+ if (! canonical_composition) {
+ u8s[saved_sz] = '\0';
+ return (saved_sz);
+ }
+
+ /*
+ * Now do the canonical composition. Note that we do this
+ * only after a canonical or compatibility decomposition to
+ * finish up NFC or NFKC.
+ */
+ sz = do_composition(uv, u8s, comb_class, start, disp, last,
+ &s, slast);
+ }
+
+ *source = s;
+
+ return ((size_t)sz);
+}
+
+/*
+ * The do_norm_compare() function does string comparison based on Unicode
+ * simple case mappings and Unicode Normalization definitions.
+ *
+ * It does so by collecting one sequence of characters at a time and
+ * comparing the collected sequences from the two strings.
+ *
+ * The meanings of the return values are the same as for the usual strcmp().
+ */
+static int
+do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
+ int flag, int *errnum)
+{
+ int result;
+ size_t sz1;
+ size_t sz2;
+ uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t *s1last;
+ uchar_t *s2last;
+ boolean_t is_it_toupper;
+ boolean_t is_it_tolower;
+ boolean_t canonical_decomposition;
+ boolean_t compatibility_decomposition;
+ boolean_t canonical_composition;
+ u8_normalization_states_t state;
+
+ s1last = s1 + n1;
+ s2last = s2 + n2;
+
+ is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
+ is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
+ canonical_decomposition = flag & U8_CANON_DECOMP;
+ compatibility_decomposition = flag & U8_COMPAT_DECOMP;
+ canonical_composition = flag & U8_CANON_COMP;
+
+ while (s1 < s1last && s2 < s2last) {
+ /*
+		 * If the current character is 7-bit ASCII and is the last
+		 * character, or if the current character and the next
+		 * character are both 7-bit ASCII characters, then
+		 * we treat the current character as a sequence of its own.
+		 *
+		 * In any other case, we need to call collect_a_seq().
+ */
+
+ if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
+ ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
+ if (is_it_toupper)
+ u8s1[0] = U8_ASCII_TOUPPER(*s1);
+ else if (is_it_tolower)
+ u8s1[0] = U8_ASCII_TOLOWER(*s1);
+ else
+ u8s1[0] = *s1;
+ u8s1[1] = '\0';
+ sz1 = 1;
+ s1++;
+ } else {
+ state = U8_STATE_START;
+ sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
+ is_it_toupper, is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition, errnum, &state);
+ }
+
+ if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
+ ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
+ if (is_it_toupper)
+ u8s2[0] = U8_ASCII_TOUPPER(*s2);
+ else if (is_it_tolower)
+ u8s2[0] = U8_ASCII_TOLOWER(*s2);
+ else
+ u8s2[0] = *s2;
+ u8s2[1] = '\0';
+ sz2 = 1;
+ s2++;
+ } else {
+ state = U8_STATE_START;
+ sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
+ is_it_toupper, is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition, errnum, &state);
+ }
+
+ /*
+ * Now compare the two characters. If they are the same,
+ * we move on to the next character sequences.
+ */
+ if (sz1 == 1 && sz2 == 1) {
+ if (*u8s1 > *u8s2)
+ return (1);
+ if (*u8s1 < *u8s2)
+ return (-1);
+ } else {
+ result = strcmp((const char *)u8s1, (const char *)u8s2);
+ if (result != 0)
+ return (result);
+ }
+ }
+
+ /*
+ * We compared until the end of either or both strings.
+ *
+	 * If we reached or went past the end of both, that means
+	 * they are the same.
+	 *
+	 * If we reached only one end, that means the other string still has
+	 * something left, which can be used to determine the return value.
+ */
+ if (s1 >= s1last) {
+ if (s2 >= s2last)
+ return (0);
+ return (-1);
+ }
+ return (1);
+}
+
+/*
+ * The u8_strcmp() function compares two UTF-8 strings quite similarly to
+ * strcmp(). For the comparison, however, equivalency based on Unicode
+ * Normalization and on Unicode simple case conversion mappings
+ * can be requested and checked against.
+ */
+int
+u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
+ int *errnum)
+{
+ int f;
+ size_t n1;
+ size_t n2;
+
+ *errnum = 0;
+
+ /*
+ * Check on the requested Unicode version, case conversion, and
+ * normalization flag values.
+ */
+
+ if (uv > U8_UNICODE_LATEST) {
+ *errnum = ERANGE;
+ uv = U8_UNICODE_LATEST;
+ }
+
+ if (flag == 0) {
+ flag = U8_STRCMP_CS;
+ } else {
+ f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
+ U8_STRCMP_CI_LOWER);
+ if (f == 0) {
+ flag |= U8_STRCMP_CS;
+ } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
+ f != U8_STRCMP_CI_LOWER) {
+ *errnum = EBADF;
+ flag = U8_STRCMP_CS;
+ }
+
+ f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
+ if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
+ f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
+ *errnum = EBADF;
+ flag = U8_STRCMP_CS;
+ }
+ }
+
+ if (flag == U8_STRCMP_CS) {
+ return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
+ }
+
+ n1 = strlen(s1);
+ n2 = strlen(s2);
+ if (n != 0) {
+ if (n < n1)
+ n1 = n;
+ if (n < n2)
+ n2 = n;
+ }
+
+ /*
+ * Simple case conversion can be done much faster and so we do
+ * them separately here.
+ */
+ if (flag == U8_STRCMP_CI_UPPER) {
+ return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
+ n1, n2, B_TRUE, errnum));
+ } else if (flag == U8_STRCMP_CI_LOWER) {
+ return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
+ n1, n2, B_FALSE, errnum));
+ }
+
+ return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
+ flag, errnum));
+}
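+
+/*
+ * A minimal usage sketch (string contents are illustrative): comparing the
+ * precomposed "\xC3\xA9" (U+00E9) with the decomposed "e\xCC\x81"
+ * (U+0065 U+0301) as in
+ *
+ *	int err;
+ *	int r = u8_strcmp("\xC3\xA9", "e\xCC\x81", 0, U8_STRCMP_NFC,
+ *	    U8_UNICODE_LATEST, &err);
+ *
+ * yields r == 0 since the two strings are canonically equivalent, whereas
+ * the plain U8_STRCMP_CS flag would report them as different.
+ */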
+
+size_t
+u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
+ int flag, size_t unicode_version, int *errnum)
+{
+ int f;
+ int sz;
+ uchar_t *ib;
+ uchar_t *ibtail;
+ uchar_t *ob;
+ uchar_t *obtail;
+ boolean_t do_not_ignore_null;
+ boolean_t do_not_ignore_invalid;
+ boolean_t is_it_toupper;
+ boolean_t is_it_tolower;
+ boolean_t canonical_decomposition;
+ boolean_t compatibility_decomposition;
+ boolean_t canonical_composition;
+ size_t ret_val;
+ size_t i;
+ size_t j;
+ uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
+ u8_normalization_states_t state;
+
+ if (unicode_version > U8_UNICODE_LATEST) {
+ *errnum = ERANGE;
+ return ((size_t)-1);
+ }
+
+ f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
+ if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
+ *errnum = EBADF;
+ return ((size_t)-1);
+ }
+
+ f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
+ if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
+ f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
+ *errnum = EBADF;
+ return ((size_t)-1);
+ }
+
+ if (inarray == NULL || *inlen == 0)
+ return (0);
+
+ if (outarray == NULL) {
+ *errnum = E2BIG;
+ return ((size_t)-1);
+ }
+
+ ib = (uchar_t *)inarray;
+ ob = (uchar_t *)outarray;
+ ibtail = ib + *inlen;
+ obtail = ob + *outlen;
+
+ do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
+ do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
+ is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
+ is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
+
+ ret_val = 0;
+
+ /*
+	 * If we don't have a normalization flag set, we do the simple
+	 * case-conversion-based text preparation separately below. Text
+	 * preparation involving Normalization will be done in the else
+	 * branch, again separately, since it will take much more time and
+	 * resources than doing simple case conversions.
+ */
+ if (f == 0) {
+ while (ib < ibtail) {
+ if (*ib == '\0' && do_not_ignore_null)
+ break;
+
+ sz = u8_number_of_bytes[*ib];
+
+ if (sz < 0) {
+ if (do_not_ignore_invalid) {
+ *errnum = EILSEQ;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ sz = 1;
+ ret_val++;
+ }
+
+ if (sz == 1) {
+ if (ob >= obtail) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if (is_it_toupper)
+ *ob = U8_ASCII_TOUPPER(*ib);
+ else if (is_it_tolower)
+ *ob = U8_ASCII_TOLOWER(*ib);
+ else
+ *ob = *ib;
+ ib++;
+ ob++;
+ } else if ((ib + sz) > ibtail) {
+ if (do_not_ignore_invalid) {
+ *errnum = EINVAL;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if ((obtail - ob) < (ibtail - ib)) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ /*
+ * We treat the remaining incomplete character
+ * bytes as a character.
+ */
+ ret_val++;
+
+ while (ib < ibtail)
+ *ob++ = *ib++;
+ } else {
+ if (is_it_toupper || is_it_tolower) {
+ i = do_case_conv(unicode_version, u8s,
+ ib, sz, is_it_toupper);
+
+ if ((obtail - ob) < i) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ ib += sz;
+
+ for (sz = 0; sz < i; sz++)
+ *ob++ = u8s[sz];
+ } else {
+ if ((obtail - ob) < sz) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ for (i = 0; i < sz; i++)
+ *ob++ = *ib++;
+ }
+ }
+ }
+ } else {
+ canonical_decomposition = flag & U8_CANON_DECOMP;
+ compatibility_decomposition = flag & U8_COMPAT_DECOMP;
+ canonical_composition = flag & U8_CANON_COMP;
+
+ while (ib < ibtail) {
+ if (*ib == '\0' && do_not_ignore_null)
+ break;
+
+ /*
+ * If the current character is a 7-bit ASCII
+ * character and it is the last character, or,
+ * if the current character is a 7-bit ASCII
+ * character and the next character is also a 7-bit
+			 * ASCII character, then we copy over this
+			 * character without going through collect_a_seq().
+			 *
+			 * In any other case, we need to look further with
+ * the collect_a_seq() function.
+ */
+ if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
+ ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
+ if (ob >= obtail) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if (is_it_toupper)
+ *ob = U8_ASCII_TOUPPER(*ib);
+ else if (is_it_tolower)
+ *ob = U8_ASCII_TOLOWER(*ib);
+ else
+ *ob = *ib;
+ ib++;
+ ob++;
+ } else {
+ *errnum = 0;
+ state = U8_STATE_START;
+
+ j = collect_a_seq(unicode_version, u8s,
+ &ib, ibtail,
+ is_it_toupper,
+ is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition,
+ errnum, &state);
+
+ if (*errnum && do_not_ignore_invalid) {
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if ((obtail - ob) < j) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ for (i = 0; i < j; i++)
+ *ob++ = u8s[i];
+ }
+ }
+ }
+
+ *inlen = ibtail - ib;
+ *outlen = obtail - ob;
+
+ return (ret_val);
+}
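+
+/*
+ * A minimal usage sketch (buffer names and sizes are illustrative):
+ * normalizing the decomposed "e\xCC\x81" (U+0065 U+0301) to NFC with
+ *
+ *	char in[] = "e\xCC\x81", out[16];
+ *	size_t il = 3, ol = sizeof (out);
+ *	int err;
+ *	(void) u8_textprep_str(in, &il, out, &ol, U8_TEXTPREP_NFC,
+ *	    U8_UNICODE_LATEST, &err);
+ *
+ * leaves the 2-byte composed form 0xC3 0xA9 (U+00E9) in out[], with
+ * *inlen and *outlen updated to the unconsumed input and unused output
+ * byte counts, respectively.
+ */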
+
+#if defined(_KERNEL)
+static int __init
+unicode_init(void)
+{
+ return (0);
+}
+
+static void __exit
+unicode_fini(void)
+{
+}
+
+module_init(unicode_init);
+module_exit(unicode_fini);
+#endif
+
+ZFS_MODULE_DESCRIPTION("Unicode implementation");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+
+EXPORT_SYMBOL(u8_validate);
+EXPORT_SYMBOL(u8_strcmp);
+EXPORT_SYMBOL(u8_textprep_str);
diff --git a/sys/contrib/openzfs/module/unicode/uconv.c b/sys/contrib/openzfs/module/unicode/uconv.c
new file mode 100644
index 000000000000..fe84979d08b2
--- /dev/null
+++ b/sys/contrib/openzfs/module/unicode/uconv.c
@@ -0,0 +1,863 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+/*
+ * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
+ * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
+ * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
+ * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
+ * the section 3C man pages.
+ * Interface stability: Committed
+ */
+
+#include <sys/types.h>
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#else
+#include <sys/u8_textprep.h>
+#endif /* _KERNEL */
+#include <sys/byteorder.h>
+#include <sys/errno.h>
+
+
+/*
+ * The max and min values of high and low surrogate pairs of UTF-16,
+ * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
+ */
+#define UCONV_U16_HI_MIN (0xd800U)
+#define UCONV_U16_HI_MAX (0xdbffU)
+#define UCONV_U16_LO_MIN (0xdc00U)
+#define UCONV_U16_LO_MAX (0xdfffU)
+#define UCONV_U16_BIT_SHIFT (0x0400U)
+#define UCONV_U16_BIT_MASK (0x0fffffU)
+#define UCONV_U16_START (0x010000U)
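+
+/*
+ * For example, U+10437 is encoded in UTF-16 as the surrogate pair
+ * 0xD801 0xDC37: (0xD801 - 0xD800) * 0x0400 + (0xDC37 - 0xDC00) = 0x0437,
+ * and 0x0437 + 0x010000 = 0x10437. uconv_u16tou32() below reassembles such
+ * a pair using exactly this arithmetic.
+ */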
+
+/* The maximum value of Unicode coding space and ASCII coding space. */
+#define UCONV_UNICODE_MAX (0x10ffffU)
+#define UCONV_ASCII_MAX (0x7fU)
+
+/* The mask values for input and output endians. */
+#define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
+#define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)
+
+/* Native and reversed endian macros. */
+#ifdef _ZFS_BIG_ENDIAN
+#define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN
+#define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN
+#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN
+#define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN
+#else
+#define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN
+#define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN
+#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN
+#define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN
+#endif /* _BIG_ENDIAN */
+
+/* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
+#define UCONV_BOM_NORMAL (0xfeffU)
+#define UCONV_BOM_SWAPPED (0xfffeU)
+#define UCONV_BOM_SWAPPED_32 (0xfffe0000U)
+
+/* UTF-32 boundaries based on UTF-8 character byte lengths. */
+#define UCONV_U8_ONE_BYTE (0x7fU)
+#define UCONV_U8_TWO_BYTES (0x7ffU)
+#define UCONV_U8_THREE_BYTES (0xffffU)
+#define UCONV_U8_FOUR_BYTES (0x10ffffU)
+
+/* The common minimum and maximum values at the UTF-8 character bytes. */
+#define UCONV_U8_BYTE_MIN (0x80U)
+#define UCONV_U8_BYTE_MAX (0xbfU)
+
+/*
+ * The following "6" and "0x3f" come from the "10xx xxxx" bit representation
+ * of UTF-8 continuation bytes.
+ */
+#define UCONV_U8_BIT_SHIFT 6
+#define UCONV_U8_BIT_MASK 0x3f
+
+/*
+ * The following vector shows the number of remaining bytes in a UTF-8
+ * character. The index is the first byte of the character.
+ */
+static const uchar_t remaining_bytes_tbl[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
+ 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * The following is a vector of bit-masks to get the used bits in
+ * the first byte of a UTF-8 character. The index is the number of remaining
+ * bytes from the table above.
+ */
+#ifdef _KERNEL
+const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+#else
+static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+#endif /* _KERNEL */
+
+/*
+ * The following two vectors provide valid minimum and
+ * maximum values for the 2nd byte of a multibyte UTF-8 character for
+ * better illegal sequence checking. The index value must be the value of
+ * the first byte of the UTF-8 character.
+ */
+static const uchar_t valid_min_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* C8 C9 CA CB CC CD CE CF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* D8 D9 DA DB DC DD DE DF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* E8 E9 EA EB EC ED EE EF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static const uchar_t valid_max_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/* C8 C9 CA CB CC CD CE CF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/* D8 D9 DA DB DC DD DE DF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/* E8 E9 EA EB EC ED EE EF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
+
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+
+static int
+check_endian(int flag, int *in, int *out)
+{
+ *in = flag & UCONV_IN_ENDIAN_MASKS;
+
+ /* You cannot have both. */
+ if (*in == UCONV_IN_ENDIAN_MASKS)
+ return (EBADF);
+
+ if (*in == 0)
+ *in = UCONV_IN_NAT_ENDIAN;
+
+ *out = flag & UCONV_OUT_ENDIAN_MASKS;
+
+ /* You cannot have both. */
+ if (*out == UCONV_OUT_ENDIAN_MASKS)
+ return (EBADF);
+
+ if (*out == 0)
+ *out = UCONV_OUT_NAT_ENDIAN;
+
+ return (0);
+}
+
+static boolean_t
+check_bom16(const uint16_t *u16s, size_t u16l, int *in)
+{
+ if (u16l > 0) {
+ if (*u16s == UCONV_BOM_NORMAL) {
+ *in = UCONV_IN_NAT_ENDIAN;
+ return (B_TRUE);
+ }
+ if (*u16s == UCONV_BOM_SWAPPED) {
+ *in = UCONV_IN_REV_ENDIAN;
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static boolean_t
+check_bom32(const uint32_t *u32s, size_t u32l, int *in)
+{
+ if (u32l > 0) {
+ if (*u32s == UCONV_BOM_NORMAL) {
+ *in = UCONV_IN_NAT_ENDIAN;
+ return (B_TRUE);
+ }
+ if (*u32s == UCONV_BOM_SWAPPED_32) {
+ *in = UCONV_IN_REV_ENDIAN;
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+int
+uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
+ uint32_t *u32s, size_t *utf32len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u16l;
+ size_t u32l;
+ uint32_t hi;
+ uint32_t lo;
+ boolean_t do_not_ignore_null;
+
+ /*
+	 * Do preliminary validity checks on the parameters and collect
+	 * endianness information.
+ */
+ if (u16s == NULL || utf16len == NULL)
+ return (EILSEQ);
+
+ if (u32s == NULL || utf32len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ /*
+ * Initialize input and output parameter buffer indices and
+ * temporary variables.
+ */
+ u16l = u32l = 0;
+ hi = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ /*
+	 * Check for a BOM at the beginning of the input buffer if requested
+	 * and, if there is one, process it.
+ */
+ if ((flag & UCONV_IN_ACCEPT_BOM) &&
+ check_bom16(u16s, *utf16len, &inendian))
+ u16l++;
+
+ /*
+	 * Reset inendian and outendian so that from this point on they can
+	 * be used as boolean condition values.
+ */
+ inendian &= UCONV_IN_NAT_ENDIAN;
+ outendian &= UCONV_OUT_NAT_ENDIAN;
+
+ /*
+	 * If there is anything in the input buffer, room in the output
+	 * buffer, and a BOM was requested, emit the BOM into the output
+	 * buffer.
+ */
+ if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+ u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
+ UCONV_BOM_SWAPPED_32;
+
+ /*
+	 * Do the conversion; when we encounter a surrogate pair, assemble
+	 * the high and low halves into a single UTF-32 character. If half
+	 * of a pair appears on its own, it is either an illegal (EILSEQ)
+	 * or an incomplete (EINVAL) sequence.
+ */
+ for (; u16l < *utf16len; u16l++) {
+ if (u16s[u16l] == 0 && do_not_ignore_null)
+ break;
+
+ lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
+
+ if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
+ if (hi)
+ return (EILSEQ);
+ hi = lo;
+ continue;
+ } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
+ if (! hi)
+ return (EILSEQ);
+ lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
+ lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
+ + UCONV_U16_START;
+ hi = 0;
+ } else if (hi) {
+ return (EILSEQ);
+ }
+
+ if (u32l >= *utf32len)
+ return (E2BIG);
+
+ u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
+ }
+
+ /*
+	 * If a high surrogate was never followed by a low surrogate, the
+	 * input is most likely incomplete.
+ */
+ if (hi)
+ return (EINVAL);
+
+ /*
+	 * Save the number of consumed and emitted units. Neither count
+	 * includes the terminating NULL character (U+0000) at the end of
+ * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
+ * the input buffer length is big enough to include the terminating
+ * NULL character).
+ */
+ *utf16len = u16l;
+ *utf32len = u32l;
+
+ return (0);
+}
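+
+/*
+ * Minimal usage sketch for uconv_u16tou32(); the function below is a
+ * hypothetical example, and the buffer sizes and flag choice are arbitrary.
+ * The leading BOM is consumed and the surrogate pair 0xd83d 0xde00 is
+ * assembled into the single UTF-32 value 0x1f600.
+ */
+static int
+uconv_u16tou32_example(void)
+{
+	uint16_t in[3] = { 0xfeff, 0xd83d, 0xde00 };
+	uint32_t out[4];
+	size_t inlen = 3;
+	size_t outlen = 4;
+	int err;
+
+	err = uconv_u16tou32(in, &inlen, out, &outlen, UCONV_IN_ACCEPT_BOM);
+	/* On success, outlen is 1 and out[0] is 0x1f600. */
+	return (err);
+}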
+
+int
+uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
+ uchar_t *u8s, size_t *utf8len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u16l;
+ size_t u8l;
+ uint32_t hi;
+ uint32_t lo;
+ boolean_t do_not_ignore_null;
+
+ if (u16s == NULL || utf16len == NULL)
+ return (EILSEQ);
+
+ if (u8s == NULL || utf8len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ u16l = u8l = 0;
+ hi = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ if ((flag & UCONV_IN_ACCEPT_BOM) &&
+ check_bom16(u16s, *utf16len, &inendian))
+ u16l++;
+
+ inendian &= UCONV_IN_NAT_ENDIAN;
+
+ for (; u16l < *utf16len; u16l++) {
+ if (u16s[u16l] == 0 && do_not_ignore_null)
+ break;
+
+ lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
+
+ if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
+ if (hi)
+ return (EILSEQ);
+ hi = lo;
+ continue;
+ } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
+ if (! hi)
+ return (EILSEQ);
+ lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
+ lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
+ + UCONV_U16_START;
+ hi = 0;
+ } else if (hi) {
+ return (EILSEQ);
+ }
+
+ /*
+ * Now we convert a UTF-32 character into a UTF-8 character.
+ * Unicode coding space is between U+0000 and U+10FFFF;
+ * anything bigger is an illegal character.
+ */
+ if (lo <= UCONV_U8_ONE_BYTE) {
+ if (u8l >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)lo;
+ } else if (lo <= UCONV_U8_TWO_BYTES) {
+ if ((u8l + 1) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
+ } else if (lo <= UCONV_U8_THREE_BYTES) {
+ if ((u8l + 2) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
+ } else if (lo <= UCONV_U8_FOUR_BYTES) {
+ if ((u8l + 3) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
+ } else {
+ return (EILSEQ);
+ }
+ }
+
+ if (hi)
+ return (EINVAL);
+
+ *utf16len = u16l;
+ *utf8len = u8l;
+
+ return (0);
+}
+
+int
+uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
+ uint16_t *u16s, size_t *utf16len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u16l;
+ size_t u32l;
+ uint32_t hi;
+ uint32_t lo;
+ boolean_t do_not_ignore_null;
+
+ if (u32s == NULL || utf32len == NULL)
+ return (EILSEQ);
+
+ if (u16s == NULL || utf16len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ u16l = u32l = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ if ((flag & UCONV_IN_ACCEPT_BOM) &&
+ check_bom32(u32s, *utf32len, &inendian))
+ u32l++;
+
+ inendian &= UCONV_IN_NAT_ENDIAN;
+ outendian &= UCONV_OUT_NAT_ENDIAN;
+
+ if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+ u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
+ UCONV_BOM_SWAPPED;
+
+ for (; u32l < *utf32len; u32l++) {
+ if (u32s[u32l] == 0 && do_not_ignore_null)
+ break;
+
+ hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
+
+ /*
+ * Anything bigger than the Unicode coding space, i.e.,
+ * Unicode scalar value bigger than U+10FFFF, is an illegal
+ * character.
+ */
+ if (hi > UCONV_UNICODE_MAX)
+ return (EILSEQ);
+
+ /*
+ * Anything bigger than U+FFFF must be converted into
+ * a surrogate pair in UTF-16.
+ */
+ if (hi >= UCONV_U16_START) {
+ lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
+ UCONV_U16_LO_MIN;
+ hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
+ UCONV_U16_HI_MIN;
+
+ if ((u16l + 1) >= *utf16len)
+ return (E2BIG);
+
+ if (outendian) {
+ u16s[u16l++] = (uint16_t)hi;
+ u16s[u16l++] = (uint16_t)lo;
+ } else {
+ u16s[u16l++] = BSWAP_16(((uint16_t)hi));
+ u16s[u16l++] = BSWAP_16(((uint16_t)lo));
+ }
+ } else {
+ if (u16l >= *utf16len)
+ return (E2BIG);
+ u16s[u16l++] = (outendian) ? (uint16_t)hi :
+ BSWAP_16(((uint16_t)hi));
+ }
+ }
+
+ *utf16len = u16l;
+ *utf32len = u32l;
+
+ return (0);
+}
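+
+/*
+ * Worked example of the surrogate split above (an illustrative sketch, not
+ * used elsewhere): 0x1f600 - 0x10000 = 0xf600, 0xf600 / 0x400 = 0x3d and
+ * 0xf600 % 0x400 = 0x200, so the high surrogate is 0xd800 + 0x3d = 0xd83d
+ * and the low surrogate is 0xdc00 + 0x200 = 0xde00.
+ */
+static void
+uconv_u32tou16_example(void)
+{
+	uint32_t in[1] = { 0x1f600 };
+	uint16_t out[2];
+	size_t inlen = 1;
+	size_t outlen = 2;
+
+	/* On success, out[] holds 0xd83d followed by 0xde00. */
+	(void) uconv_u32tou16(in, &inlen, out, &outlen, 0);
+}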
+
+int
+uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
+ uchar_t *u8s, size_t *utf8len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u32l;
+ size_t u8l;
+ uint32_t lo;
+ boolean_t do_not_ignore_null;
+
+ if (u32s == NULL || utf32len == NULL)
+ return (EILSEQ);
+
+ if (u8s == NULL || utf8len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ u32l = u8l = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ if ((flag & UCONV_IN_ACCEPT_BOM) &&
+ check_bom32(u32s, *utf32len, &inendian))
+ u32l++;
+
+ inendian &= UCONV_IN_NAT_ENDIAN;
+
+ for (; u32l < *utf32len; u32l++) {
+ if (u32s[u32l] == 0 && do_not_ignore_null)
+ break;
+
+ lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
+
+ if (lo <= UCONV_U8_ONE_BYTE) {
+ if (u8l >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)lo;
+ } else if (lo <= UCONV_U8_TWO_BYTES) {
+ if ((u8l + 1) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
+ } else if (lo <= UCONV_U8_THREE_BYTES) {
+ if ((u8l + 2) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
+ } else if (lo <= UCONV_U8_FOUR_BYTES) {
+ if ((u8l + 3) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
+ } else {
+ return (EILSEQ);
+ }
+ }
+
+ *utf32len = u32l;
+ *utf8len = u8l;
+
+ return (0);
+}
+
+int
+uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
+ uint16_t *u16s, size_t *utf16len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u16l;
+ size_t u8l;
+ uint32_t hi;
+ uint32_t lo;
+ int remaining_bytes;
+ int first_b;
+ boolean_t do_not_ignore_null;
+
+ if (u8s == NULL || utf8len == NULL)
+ return (EILSEQ);
+
+ if (u16s == NULL || utf16len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ u16l = u8l = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ outendian &= UCONV_OUT_NAT_ENDIAN;
+
+ if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+ u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
+ UCONV_BOM_SWAPPED;
+
+ for (; u8l < *utf8len; ) {
+ if (u8s[u8l] == 0 && do_not_ignore_null)
+ break;
+
+ /*
+ * Collect a UTF-8 character and convert it to a UTF-32
+ * character. In doing so, we screen out illegally formed
+		 * UTF-8 sequences and treat them as illegal characters.
+		 * The algorithm below also screens out anything beyond
+		 * U+10FFFF.
+ *
+ * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
+ * more details on the illegal values of UTF-8 character
+ * bytes.
+ */
+ hi = (uint32_t)u8s[u8l++];
+
+ if (hi > UCONV_ASCII_MAX) {
+ if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
+ return (EILSEQ);
+
+ first_b = hi;
+ hi = hi & u8_masks_tbl[remaining_bytes];
+
+ for (; remaining_bytes > 0; remaining_bytes--) {
+ /*
+ * If we have no more bytes, the current
+ * UTF-8 character is incomplete.
+ */
+ if (u8l >= *utf8len)
+ return (EINVAL);
+
+ lo = (uint32_t)u8s[u8l++];
+
+ if (first_b) {
+ if (lo < valid_min_2nd_byte[first_b] ||
+ lo > valid_max_2nd_byte[first_b])
+ return (EILSEQ);
+ first_b = 0;
+ } else if (lo < UCONV_U8_BYTE_MIN ||
+ lo > UCONV_U8_BYTE_MAX) {
+ return (EILSEQ);
+ }
+ hi = (hi << UCONV_U8_BIT_SHIFT) |
+ (lo & UCONV_U8_BIT_MASK);
+ }
+ }
+
+ if (hi >= UCONV_U16_START) {
+ lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
+ UCONV_U16_LO_MIN;
+ hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
+ UCONV_U16_HI_MIN;
+
+ if ((u16l + 1) >= *utf16len)
+ return (E2BIG);
+
+ if (outendian) {
+ u16s[u16l++] = (uint16_t)hi;
+ u16s[u16l++] = (uint16_t)lo;
+ } else {
+ u16s[u16l++] = BSWAP_16(((uint16_t)hi));
+ u16s[u16l++] = BSWAP_16(((uint16_t)lo));
+ }
+ } else {
+ if (u16l >= *utf16len)
+ return (E2BIG);
+
+ u16s[u16l++] = (outendian) ? (uint16_t)hi :
+ BSWAP_16(((uint16_t)hi));
+ }
+ }
+
+ *utf16len = u16l;
+ *utf8len = u8l;
+
+ return (0);
+}
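+
+/*
+ * Usage sketch for uconv_u8tou16() (illustration only; this helper is not
+ * used elsewhere): the four-byte UTF-8 sequence f0 9f 98 80 encodes U+1F600
+ * and decodes to the UTF-16 surrogate pair 0xd83d 0xde00.
+ */
+static void
+uconv_u8tou16_example(void)
+{
+	uchar_t in[4] = { 0xf0, 0x9f, 0x98, 0x80 };
+	uint16_t out[2];
+	size_t inlen = 4;
+	size_t outlen = 2;
+
+	(void) uconv_u8tou16(in, &inlen, out, &outlen, 0);
+}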
+
+int
+uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
+ uint32_t *u32s, size_t *utf32len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u32l;
+ size_t u8l;
+ uint32_t hi;
+ uint32_t c;
+ int remaining_bytes;
+ int first_b;
+ boolean_t do_not_ignore_null;
+
+ if (u8s == NULL || utf8len == NULL)
+ return (EILSEQ);
+
+ if (u32s == NULL || utf32len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ u32l = u8l = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ outendian &= UCONV_OUT_NAT_ENDIAN;
+
+ if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+ u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
+ UCONV_BOM_SWAPPED_32;
+
+ for (; u8l < *utf8len; ) {
+ if (u8s[u8l] == 0 && do_not_ignore_null)
+ break;
+
+ hi = (uint32_t)u8s[u8l++];
+
+ if (hi > UCONV_ASCII_MAX) {
+ if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
+ return (EILSEQ);
+
+ first_b = hi;
+ hi = hi & u8_masks_tbl[remaining_bytes];
+
+ for (; remaining_bytes > 0; remaining_bytes--) {
+ if (u8l >= *utf8len)
+ return (EINVAL);
+
+ c = (uint32_t)u8s[u8l++];
+
+ if (first_b) {
+ if (c < valid_min_2nd_byte[first_b] ||
+ c > valid_max_2nd_byte[first_b])
+ return (EILSEQ);
+ first_b = 0;
+ } else if (c < UCONV_U8_BYTE_MIN ||
+ c > UCONV_U8_BYTE_MAX) {
+ return (EILSEQ);
+ }
+ hi = (hi << UCONV_U8_BIT_SHIFT) |
+ (c & UCONV_U8_BIT_MASK);
+ }
+ }
+
+ if (u32l >= *utf32len)
+ return (E2BIG);
+
+ u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
+ }
+
+ *utf32len = u32l;
+ *utf8len = u8l;
+
+ return (0);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(uconv_u16tou32);
+EXPORT_SYMBOL(uconv_u16tou8);
+EXPORT_SYMBOL(uconv_u32tou16);
+EXPORT_SYMBOL(uconv_u32tou8);
+EXPORT_SYMBOL(uconv_u8tou16);
+EXPORT_SYMBOL(uconv_u8tou32);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/Makefile.in b/sys/contrib/openzfs/module/zcommon/Makefile.in
new file mode 100644
index 000000000000..ebc538440445
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/Makefile.in
@@ -0,0 +1,28 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+endif
+
+MODULE := zcommon
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+# Suppress unused-value warnings in sparc64 architecture headers
+ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
+
+$(MODULE)-objs += cityhash.o
+$(MODULE)-objs += zfeature_common.o
+$(MODULE)-objs += zfs_comutil.o
+$(MODULE)-objs += zfs_deleg.o
+$(MODULE)-objs += zfs_fletcher.o
+$(MODULE)-objs += zfs_fletcher_superscalar.o
+$(MODULE)-objs += zfs_fletcher_superscalar4.o
+$(MODULE)-objs += zfs_namecheck.o
+$(MODULE)-objs += zfs_prop.o
+$(MODULE)-objs += zpool_prop.o
+$(MODULE)-objs += zprop_common.o
+
+$(MODULE)-$(CONFIG_X86) += zfs_fletcher_intel.o
+$(MODULE)-$(CONFIG_X86) += zfs_fletcher_sse.o
+$(MODULE)-$(CONFIG_X86) += zfs_fletcher_avx512.o
+$(MODULE)-$(CONFIG_ARM64) += zfs_fletcher_aarch64_neon.o
diff --git a/sys/contrib/openzfs/module/zcommon/cityhash.c b/sys/contrib/openzfs/module/zcommon/cityhash.c
new file mode 100644
index 000000000000..413a96df2cda
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/cityhash.c
@@ -0,0 +1,67 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#include <cityhash.h>
+
+#define HASH_K1 0xb492b66fbe98f273ULL
+#define HASH_K2 0x9ae16a3b2f90404fULL
+
+/*
+ * Bitwise right rotate. Normally this will compile to a single
+ * instruction.
+ */
+static inline uint64_t
+rotate(uint64_t val, int shift)
+{
+ // Avoid shifting by 64: doing so yields an undefined result.
+ return (shift == 0 ? val : (val >> shift) | (val << (64 - shift)));
+}
+
+static inline uint64_t
+cityhash_helper(uint64_t u, uint64_t v, uint64_t mul)
+{
+ uint64_t a = (u ^ v) * mul;
+ a ^= (a >> 47);
+ uint64_t b = (v ^ a) * mul;
+ b ^= (b >> 47);
+ b *= mul;
+ return (b);
+}
+
+uint64_t
+cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4)
+{
+ uint64_t mul = HASH_K2 + 64;
+ uint64_t a = w1 * HASH_K1;
+ uint64_t b = w2;
+ uint64_t c = w4 * mul;
+ uint64_t d = w3 * HASH_K2;
+ return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d,
+ a + rotate(b + HASH_K2, 18) + c, mul));
+}
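+
+/*
+ * Usage sketch (illustrative only): cityhash4() hashes four 64-bit inputs,
+ * for example the fields of a tuple, into a single 64-bit value. The inputs
+ * below are arbitrary.
+ */
+static uint64_t
+cityhash4_example(void)
+{
+	return (cityhash4(1ULL, 2ULL, 3ULL, 4ULL));
+}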
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(cityhash4);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zfeature_common.c b/sys/contrib/openzfs/module/zcommon/zfeature_common.c
new file mode 100644
index 000000000000..e95a85e89ba2
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfeature_common.c
@@ -0,0 +1,609 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#ifndef _KERNEL
+#include <errno.h>
+#include <string.h>
+#include <sys/stat.h>
+#endif
+#include <sys/debug.h>
+#include <sys/fs/zfs.h>
+#include <sys/inttypes.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/zfs_sysfs.h>
+#include "zfeature_common.h"
+
+/*
+ * Set to disable all feature checks while opening pools, allowing pools with
+ * unsupported features to be opened. Set for testing only.
+ */
+boolean_t zfeature_checks_disable = B_FALSE;
+
+zfeature_info_t spa_feature_table[SPA_FEATURES];
+
+/*
+ * Valid characters for feature guids. This list is mainly for aesthetic
+ * purposes and could be expanded in the future. Different characters are
+ * allowed in the guid's reverse-DNS portion (before the colon) and in its
+ * short name (after the colon).
+ */
+static int
+valid_char(char c, boolean_t after_colon)
+{
+ return ((c >= 'a' && c <= 'z') ||
+ (c >= '0' && c <= '9') ||
+ (after_colon && c == '_') ||
+ (!after_colon && (c == '.' || c == '-')));
+}
+
+/*
+ * Every feature guid must contain exactly one colon which separates a reverse
+ * dns organization name from the feature's "short" name (e.g.
+ * "com.company:feature_name").
+ */
+boolean_t
+zfeature_is_valid_guid(const char *name)
+{
+ int i;
+ boolean_t has_colon = B_FALSE;
+
+ i = 0;
+ while (name[i] != '\0') {
+ char c = name[i++];
+ if (c == ':') {
+ if (has_colon)
+ return (B_FALSE);
+ has_colon = B_TRUE;
+ continue;
+ }
+ if (!valid_char(c, has_colon))
+ return (B_FALSE);
+ }
+
+ return (has_colon);
+}
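+
+/*
+ * Examples of the rules above (an illustrative sketch; the function below is
+ * hypothetical and not used elsewhere): exactly one colon, with lower case
+ * letters, digits, '.' and '-' allowed before the colon and '_' after it.
+ */
+static void
+zfeature_guid_examples(void)
+{
+	ASSERT(zfeature_is_valid_guid("com.delphix:async_destroy"));
+	ASSERT(!zfeature_is_valid_guid("com.delphix"));		/* no colon */
+	ASSERT(!zfeature_is_valid_guid("com.delphix:a:b"));	/* two colons */
+	ASSERT(!zfeature_is_valid_guid("com.delphix:Async"));	/* upper case */
+}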
+
+boolean_t
+zfeature_is_supported(const char *guid)
+{
+ if (zfeature_checks_disable)
+ return (B_TRUE);
+
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t *feature = &spa_feature_table[i];
+ if (strcmp(guid, feature->fi_guid) == 0)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+int
+zfeature_lookup_guid(const char *guid, spa_feature_t *res)
+{
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t *feature = &spa_feature_table[i];
+ if (!feature->fi_zfs_mod_supported)
+ continue;
+ if (strcmp(guid, feature->fi_guid) == 0) {
+ if (res != NULL)
+ *res = i;
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+int
+zfeature_lookup_name(const char *name, spa_feature_t *res)
+{
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t *feature = &spa_feature_table[i];
+ if (!feature->fi_zfs_mod_supported)
+ continue;
+ if (strcmp(name, feature->fi_uname) == 0) {
+ if (res != NULL)
+ *res = i;
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+boolean_t
+zfeature_depends_on(spa_feature_t fid, spa_feature_t check)
+{
+ zfeature_info_t *feature = &spa_feature_table[fid];
+
+ for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) {
+ if (feature->fi_depends[i] == check)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+deps_contains_feature(const spa_feature_t *deps, const spa_feature_t feature)
+{
+ for (int i = 0; deps[i] != SPA_FEATURE_NONE; i++)
+ if (deps[i] == feature)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+#if !defined(_KERNEL) && !defined(LIB_ZPOOL_BUILD)
+static boolean_t
+zfs_mod_supported_impl(const char *scope, const char *name, const char *sysfs)
+{
+ boolean_t supported = B_FALSE;
+ char *path;
+
+ int len = asprintf(&path, "%s%s%s%s%s", sysfs,
+ scope == NULL ? "" : "/", scope == NULL ? "" : scope,
+ name == NULL ? "" : "/", name == NULL ? "" : name);
+ if (len > 0) {
+ struct stat64 statbuf;
+ supported = !!(stat64(path, &statbuf) == 0);
+ free(path);
+ }
+
+ return (supported);
+}
+
+boolean_t
+zfs_mod_supported(const char *scope, const char *name)
+{
+ boolean_t supported;
+
+ /*
+ * Check both the primary and alternate sysfs locations to determine
+ * if the required functionality is supported.
+ */
+ supported = (zfs_mod_supported_impl(scope, name, ZFS_SYSFS_DIR) ||
+ zfs_mod_supported_impl(scope, name, ZFS_SYSFS_ALT_DIR));
+
+ /*
+ * For backwards compatibility with kernel modules that predate
+	 * supported feature/property checking, report the feature/property
+ * as supported if the kernel module is loaded but the requested
+ * scope directory does not exist.
+ */
+ if (supported == B_FALSE) {
+ struct stat64 statbuf;
+ if ((stat64(ZFS_SYSFS_DIR, &statbuf) == 0) &&
+ !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_DIR) &&
+ !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_ALT_DIR)) {
+ supported = B_TRUE;
+ }
+ }
+
+ return (supported);
+}
+#endif
+
+static boolean_t
+zfs_mod_supported_feature(const char *name)
+{
+ /*
+ * The zfs module spa_feature_table[], whether in-kernel or in
+ * libzpool, always supports all the features. libzfs needs to
+ * query the running module, via sysfs, to determine which
+ * features are supported.
+ *
+ * The equivalent _can_ be done on FreeBSD by way of the sysctl
+ * tree, but this has not been done yet. Therefore, we return
+ * that all features except edonr are supported.
+ */
+#if defined(__FreeBSD__)
+ if (strcmp(name, "org.illumos:edonr") == 0)
+ return (B_FALSE);
+ else
+ return (B_TRUE);
+#elif defined(_KERNEL) || defined(LIB_ZPOOL_BUILD)
+ return (B_TRUE);
+#else
+ return (zfs_mod_supported(ZFS_SYSFS_POOL_FEATURES, name));
+#endif
+}
+
+static void
+zfeature_register(spa_feature_t fid, const char *guid, const char *name,
+ const char *desc, zfeature_flags_t flags, zfeature_type_t type,
+ const spa_feature_t *deps)
+{
+ zfeature_info_t *feature = &spa_feature_table[fid];
+ static spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
+
+ ASSERT(name != NULL);
+ ASSERT(desc != NULL);
+ ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 ||
+ (flags & ZFEATURE_FLAG_MOS) == 0);
+ ASSERT3U(fid, <, SPA_FEATURES);
+ ASSERT(zfeature_is_valid_guid(guid));
+
+ if (deps == NULL)
+ deps = nodeps;
+
+ VERIFY(((flags & ZFEATURE_FLAG_PER_DATASET) == 0) ||
+ (deps_contains_feature(deps, SPA_FEATURE_EXTENSIBLE_DATASET)));
+
+ feature->fi_feature = fid;
+ feature->fi_guid = guid;
+ feature->fi_uname = name;
+ feature->fi_desc = desc;
+ feature->fi_flags = flags;
+ feature->fi_type = type;
+ feature->fi_depends = deps;
+ feature->fi_zfs_mod_supported = zfs_mod_supported_feature(guid);
+}
+
+/*
+ * Every feature has a GUID of the form com.example:feature_name. The
+ * reversed DNS name ensures that the feature's GUID is unique across all ZFS
+ * implementations. This allows companies to independently develop and
+ * release features. Examples include org.delphix and org.datto. Previously,
+ * features developed on one implementation have used that implementation's
+ * domain name (e.g. org.illumos and org.zfsonlinux). Use of the org.openzfs
+ * domain name is recommended for new features which are developed by the
+ * OpenZFS community and its platforms. This domain may optionally be used by
+ * companies developing features for initial release through an OpenZFS
+ * implementation. Use of the org.openzfs domain requires reserving the
+ * feature name in advance with the OpenZFS project.
+ */
+void
+zpool_feature_init(void)
+{
+ zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
+ "com.delphix:async_destroy", "async_destroy",
+ "Destroy filesystems asynchronously.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
+ "com.delphix:empty_bpobj", "empty_bpobj",
+ "Snapshots use less space.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
+ "org.illumos:lz4_compress", "lz4_compress",
+ "LZ4 compression algorithm support.",
+ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
+ "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
+ "Crash dumps to multiple vdev pools.",
+ 0, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
+ "com.delphix:spacemap_histogram", "spacemap_histogram",
+ "Spacemaps maintain space histograms.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_ENABLED_TXG,
+ "com.delphix:enabled_txg", "enabled_txg",
+ "Record txg at which a feature is enabled",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ {
+ static const spa_feature_t hole_birth_deps[] = {
+ SPA_FEATURE_ENABLED_TXG,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_HOLE_BIRTH,
+ "com.delphix:hole_birth", "hole_birth",
+ "Retain hole birth txg for more precise zfs send",
+ ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ ZFEATURE_TYPE_BOOLEAN, hole_birth_deps);
+ }
+
+ zfeature_register(SPA_FEATURE_POOL_CHECKPOINT,
+ "com.delphix:zpool_checkpoint", "zpool_checkpoint",
+ "Pool state can be checkpointed, allowing rewind later.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_SPACEMAP_V2,
+ "com.delphix:spacemap_v2", "spacemap_v2",
+ "Space maps representing large segments are more efficient.",
+ ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
+ "com.delphix:extensible_dataset", "extensible_dataset",
+ "Enhanced dataset functionality, used by other features.",
+ 0, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ {
+ static const spa_feature_t bookmarks_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+
+ zfeature_register(SPA_FEATURE_BOOKMARKS,
+ "com.delphix:bookmarks", "bookmarks",
+ "\"zfs bookmark\" command",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
+ bookmarks_deps);
+ }
+
+ {
+ static const spa_feature_t filesystem_limits_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
+ "com.joyent:filesystem_limits", "filesystem_limits",
+ "Filesystem and snapshot limits.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
+ filesystem_limits_deps);
+ }
+
+ zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
+ "com.delphix:embedded_data", "embedded_data",
+ "Blocks which compress very well use even less space.",
+ ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ {
+ static const spa_feature_t livelist_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LIVELIST,
+ "com.delphix:livelist", "livelist",
+ "Improved clone deletion performance.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
+ livelist_deps);
+ }
+
+ {
+ static const spa_feature_t log_spacemap_deps[] = {
+ SPA_FEATURE_SPACEMAP_V2,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LOG_SPACEMAP,
+ "com.delphix:log_spacemap", "log_spacemap",
+ "Log metaslab changes on a single spacemap and "
+ "flush them periodically.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
+ log_spacemap_deps);
+ }
+
+ {
+ static const spa_feature_t large_blocks_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
+ "org.open-zfs:large_blocks", "large_blocks",
+ "Support for blocks larger than 128KB.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ large_blocks_deps);
+ }
+
+ {
+ static const spa_feature_t large_dnode_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LARGE_DNODE,
+ "org.zfsonlinux:large_dnode", "large_dnode",
+ "Variable on-disk size of dnodes.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ large_dnode_deps);
+ }
+
+ {
+ static const spa_feature_t sha512_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_SHA512,
+ "org.illumos:sha512", "sha512",
+ "SHA-512/256 hash algorithm.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ sha512_deps);
+ }
+
+ {
+ static const spa_feature_t skein_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_SKEIN,
+ "org.illumos:skein", "skein",
+ "Skein hash algorithm.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ skein_deps);
+ }
+
+ {
+ static const spa_feature_t edonr_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_EDONR,
+ "org.illumos:edonr", "edonr",
+ "Edon-R hash algorithm.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ edonr_deps);
+ }
+
+ {
+ static const spa_feature_t redact_books_deps[] = {
+ SPA_FEATURE_BOOKMARK_V2,
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_BOOKMARKS,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_REDACTION_BOOKMARKS,
+ "com.delphix:redaction_bookmarks", "redaction_bookmarks",
+ "Support for bookmarks which store redaction lists for zfs "
+ "redacted send/recv.", 0, ZFEATURE_TYPE_BOOLEAN,
+ redact_books_deps);
+ }
+
+ {
+ static const spa_feature_t redact_datasets_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_REDACTED_DATASETS,
+ "com.delphix:redacted_datasets", "redacted_datasets", "Support for "
+ "redacted datasets, produced by receiving a redacted zfs send "
+ "stream.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_UINT64_ARRAY,
+ redact_datasets_deps);
+ }
+
+ {
+ static const spa_feature_t bookmark_written_deps[] = {
+ SPA_FEATURE_BOOKMARK_V2,
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_BOOKMARKS,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_BOOKMARK_WRITTEN,
+ "com.delphix:bookmark_written", "bookmark_written",
+	    "Additional accounting, enabling the written#<bookmark> property "
+ "(space written since a bookmark), and estimates of send stream "
+ "sizes for incrementals from bookmarks.",
+ 0, ZFEATURE_TYPE_BOOLEAN, bookmark_written_deps);
+ }
+
+ zfeature_register(SPA_FEATURE_DEVICE_REMOVAL,
+ "com.delphix:device_removal", "device_removal",
+ "Top-level vdevs can be removed, reducing logical pool size.",
+ ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ {
+ static const spa_feature_t obsolete_counts_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_DEVICE_REMOVAL,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS,
+ "com.delphix:obsolete_counts", "obsolete_counts",
+ "Reduce memory used by removed devices when their blocks are "
+ "freed or remapped.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
+ obsolete_counts_deps);
+ }
+
+ {
+ static const spa_feature_t userobj_accounting_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_USEROBJ_ACCOUNTING,
+ "org.zfsonlinux:userobj_accounting", "userobj_accounting",
+ "User/Group object accounting.",
+ ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET,
+ ZFEATURE_TYPE_BOOLEAN, userobj_accounting_deps);
+ }
+
+ {
+ static const spa_feature_t bookmark_v2_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_BOOKMARKS,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_BOOKMARK_V2,
+ "com.datto:bookmark_v2", "bookmark_v2",
+ "Support for larger bookmarks",
+ 0, ZFEATURE_TYPE_BOOLEAN, bookmark_v2_deps);
+ }
+
+ {
+ static const spa_feature_t encryption_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_BOOKMARK_V2,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_ENCRYPTION,
+ "com.datto:encryption", "encryption",
+ "Support for dataset level encryption",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ encryption_deps);
+ }
+
+ {
+ static const spa_feature_t project_quota_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_PROJECT_QUOTA,
+ "org.zfsonlinux:project_quota", "project_quota",
+ "space/object accounting based on project ID.",
+ ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET,
+ ZFEATURE_TYPE_BOOLEAN, project_quota_deps);
+ }
+
+ zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES,
+ "org.zfsonlinux:allocation_classes", "allocation_classes",
+ "Support for separate allocation classes.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_RESILVER_DEFER,
+ "com.datto:resilver_defer", "resilver_defer",
+ "Support for deferring new resilvers when one is already running.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_DEVICE_REBUILD,
+ "org.openzfs:device_rebuild", "device_rebuild",
+ "Support for sequential mirror/dRAID device rebuilds",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ {
+ static const spa_feature_t zstd_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_ZSTD_COMPRESS,
+ "org.freebsd:zstd_compress", "zstd_compress",
+ "zstd compression algorithm support.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps);
+ }
+
+ zfeature_register(SPA_FEATURE_DRAID,
+ "org.openzfs:draid", "draid", "Support for distributed spare RAID",
+ ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfeature_lookup_guid);
+EXPORT_SYMBOL(zfeature_lookup_name);
+EXPORT_SYMBOL(zfeature_is_supported);
+EXPORT_SYMBOL(zfeature_is_valid_guid);
+EXPORT_SYMBOL(zfeature_depends_on);
+EXPORT_SYMBOL(zpool_feature_init);
+EXPORT_SYMBOL(spa_feature_table);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_comutil.c b/sys/contrib/openzfs/module/zcommon/zfs_comutil.c
new file mode 100644
index 000000000000..1cec60ac1d67
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_comutil.c
@@ -0,0 +1,263 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * This file is intended for functions that ought to be common between
+ * userland (libzfs) and the kernel. When many common routines need to be
+ * shared, a separate file should be created.
+ */
+
+#if !defined(_KERNEL)
+#include <string.h>
+#endif
+
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/nvpair.h>
+#include "zfs_comutil.h"
+#include <sys/zfs_ratelimit.h>
+
+/*
+ * Are there allocatable vdevs?
+ */
+boolean_t
+zfs_allocatable_devs(nvlist_t *nv)
+{
+ uint64_t is_log;
+ uint_t c;
+ nvlist_t **child;
+ uint_t children;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ return (B_FALSE);
+ }
+ for (c = 0; c < children; c++) {
+ is_log = 0;
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &is_log);
+ if (!is_log)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Are there special vdevs?
+ */
+boolean_t
+zfs_special_devs(nvlist_t *nv, char *type)
+{
+ char *bias;
+ uint_t c;
+ nvlist_t **child;
+ uint_t children;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ return (B_FALSE);
+ }
+ for (c = 0; c < children; c++) {
+ if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS,
+ &bias) == 0) {
+ if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
+ strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) {
+ if (type != NULL && strcmp(bias, type) == 0) {
+ return (B_TRUE);
+ } else if (type == NULL) {
+ return (B_TRUE);
+ }
+ }
+ }
+ }
+ return (B_FALSE);
+}
+
+void
+zpool_get_load_policy(nvlist_t *nvl, zpool_load_policy_t *zlpp)
+{
+ nvlist_t *policy;
+ nvpair_t *elem;
+ char *nm;
+
+ /* Defaults */
+ zlpp->zlp_rewind = ZPOOL_NO_REWIND;
+ zlpp->zlp_maxmeta = 0;
+ zlpp->zlp_maxdata = UINT64_MAX;
+ zlpp->zlp_txg = UINT64_MAX;
+
+ if (nvl == NULL)
+ return;
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+ nm = nvpair_name(elem);
+ if (strcmp(nm, ZPOOL_LOAD_POLICY) == 0) {
+ if (nvpair_value_nvlist(elem, &policy) == 0)
+ zpool_get_load_policy(policy, zlpp);
+ return;
+ } else if (strcmp(nm, ZPOOL_LOAD_REWIND_POLICY) == 0) {
+ if (nvpair_value_uint32(elem, &zlpp->zlp_rewind) == 0)
+ if (zlpp->zlp_rewind & ~ZPOOL_REWIND_POLICIES)
+ zlpp->zlp_rewind = ZPOOL_NO_REWIND;
+ } else if (strcmp(nm, ZPOOL_LOAD_REQUEST_TXG) == 0) {
+ (void) nvpair_value_uint64(elem, &zlpp->zlp_txg);
+ } else if (strcmp(nm, ZPOOL_LOAD_META_THRESH) == 0) {
+ (void) nvpair_value_uint64(elem, &zlpp->zlp_maxmeta);
+ } else if (strcmp(nm, ZPOOL_LOAD_DATA_THRESH) == 0) {
+ (void) nvpair_value_uint64(elem, &zlpp->zlp_maxdata);
+ }
+ }
+ if (zlpp->zlp_rewind == 0)
+ zlpp->zlp_rewind = ZPOOL_NO_REWIND;
+}
+
+typedef struct zfs_version_spa_map {
+ int version_zpl;
+ int version_spa;
+} zfs_version_spa_map_t;
+
+/*
+ * Keep this table in monotonically increasing version number order.
+ */
+static zfs_version_spa_map_t zfs_version_table[] = {
+ {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL},
+ {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL},
+ {ZPL_VERSION_FUID, SPA_VERSION_FUID},
+ {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
+ {ZPL_VERSION_SA, SPA_VERSION_SA},
+ {0, 0}
+};
+
+/*
+ * Return the max zpl version for a corresponding spa version.
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_zpl_version_map(int spa_version)
+{
+ int i;
+ int version = -1;
+
+ for (i = 0; zfs_version_table[i].version_spa; i++) {
+ if (spa_version >= zfs_version_table[i].version_spa)
+ version = zfs_version_table[i].version_zpl;
+ }
+
+ return (version);
+}
+
+/*
+ * Return the min spa version for a corresponding zpl version.
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_spa_version_map(int zpl_version)
+{
+ int i;
+ int version = -1;
+
+ for (i = 0; zfs_version_table[i].version_zpl; i++) {
+ if (zfs_version_table[i].version_zpl >= zpl_version)
+ return (zfs_version_table[i].version_spa);
+ }
+
+ return (version);
+}
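+
+/*
+ * Illustration of the two mappings above (a sketch, not used elsewhere):
+ * with the table as defined, a pool at SPA_VERSION_FUID supports ZPL
+ * versions up to ZPL_VERSION_FUID, and a filesystem at ZPL_VERSION_FUID
+ * requires a pool of at least SPA_VERSION_FUID; -1 means no mapping exists.
+ */
+static void
+zfs_version_map_example(void)
+{
+	int zpl = zfs_zpl_version_map(SPA_VERSION_FUID);
+	int spa = zfs_spa_version_map(ZPL_VERSION_FUID);
+
+	(void) zpl;
+	(void) spa;
+}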
+
+/*
+ * This is the table of legacy internal event names; it should not be modified.
+ * The internal events are now stored in the history log as strings.
+ */
+const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
+ "invalid event",
+ "pool create",
+ "vdev add",
+ "pool remove",
+ "pool destroy",
+ "pool export",
+ "pool import",
+ "vdev attach",
+ "vdev replace",
+ "vdev detach",
+ "vdev online",
+ "vdev offline",
+ "vdev upgrade",
+ "pool clear",
+ "pool scrub",
+ "pool property set",
+ "create",
+ "clone",
+ "destroy",
+ "destroy_begin_sync",
+ "inherit",
+ "property set",
+ "quota set",
+ "permission update",
+ "permission remove",
+ "permission who remove",
+ "promote",
+ "receive",
+ "rename",
+ "reservation set",
+ "replay_inc_sync",
+ "replay_full_sync",
+ "rollback",
+ "snapshot",
+ "filesystem version upgrade",
+ "refquota set",
+ "refreservation set",
+ "pool scrub done",
+ "user hold",
+ "user release",
+ "pool split",
+};
+
+boolean_t
+zfs_dataset_name_hidden(const char *name)
+{
+ /*
+ * Skip over datasets that are not visible in this zone,
+ * internal datasets (which have a $ in their name), and
+ * temporary datasets (which have a % in their name).
+ */
+ if (strchr(name, '$') != NULL)
+ return (B_TRUE);
+ if (strchr(name, '%') != NULL)
+ return (B_TRUE);
+ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_allocatable_devs);
+EXPORT_SYMBOL(zfs_special_devs);
+EXPORT_SYMBOL(zpool_get_load_policy);
+EXPORT_SYMBOL(zfs_zpl_version_map);
+EXPORT_SYMBOL(zfs_spa_version_map);
+EXPORT_SYMBOL(zfs_history_event_names);
+EXPORT_SYMBOL(zfs_dataset_name_hidden);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_deleg.c b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c
new file mode 100644
index 000000000000..e1f5a353b7a4
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c
@@ -0,0 +1,249 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
+ */
+
+#include <sys/zfs_context.h>
+
+#if defined(_KERNEL)
+#include <sys/sunddi.h>
+#include <sys/ctype.h>
+#else
+#include <stdio.h>
+#include <unistd.h>
+#include <libnvpair.h>
+#include <ctype.h>
+#endif
+#include <sys/strings.h>
+#include <sys/dsl_deleg.h>
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_namecheck.h"
+
+zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
+ {ZFS_DELEG_PERM_ALLOW},
+ {ZFS_DELEG_PERM_BOOKMARK},
+ {ZFS_DELEG_PERM_CLONE},
+ {ZFS_DELEG_PERM_CREATE},
+ {ZFS_DELEG_PERM_DESTROY},
+ {ZFS_DELEG_PERM_DIFF},
+ {ZFS_DELEG_PERM_MOUNT},
+ {ZFS_DELEG_PERM_PROMOTE},
+ {ZFS_DELEG_PERM_RECEIVE},
+ {ZFS_DELEG_PERM_RENAME},
+ {ZFS_DELEG_PERM_ROLLBACK},
+ {ZFS_DELEG_PERM_SNAPSHOT},
+ {ZFS_DELEG_PERM_SHARE},
+ {ZFS_DELEG_PERM_SEND},
+ {ZFS_DELEG_PERM_USERPROP},
+ {ZFS_DELEG_PERM_USERQUOTA},
+ {ZFS_DELEG_PERM_GROUPQUOTA},
+ {ZFS_DELEG_PERM_USERUSED},
+ {ZFS_DELEG_PERM_GROUPUSED},
+ {ZFS_DELEG_PERM_USEROBJQUOTA},
+ {ZFS_DELEG_PERM_GROUPOBJQUOTA},
+ {ZFS_DELEG_PERM_USEROBJUSED},
+ {ZFS_DELEG_PERM_GROUPOBJUSED},
+ {ZFS_DELEG_PERM_HOLD},
+ {ZFS_DELEG_PERM_RELEASE},
+ {ZFS_DELEG_PERM_LOAD_KEY},
+ {ZFS_DELEG_PERM_CHANGE_KEY},
+ {ZFS_DELEG_PERM_PROJECTUSED},
+ {ZFS_DELEG_PERM_PROJECTQUOTA},
+ {ZFS_DELEG_PERM_PROJECTOBJUSED},
+ {ZFS_DELEG_PERM_PROJECTOBJQUOTA},
+ {NULL}
+};
+
+static int
+zfs_valid_permission_name(const char *perm)
+{
+ if (zfs_deleg_canonicalize_perm(perm))
+ return (0);
+
+ return (permset_namecheck(perm, NULL, NULL));
+}
+
+const char *
+zfs_deleg_canonicalize_perm(const char *perm)
+{
+ int i;
+ zfs_prop_t prop;
+
+ for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) {
+ if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0)
+ return (perm);
+ }
+
+ prop = zfs_name_to_prop(perm);
+ if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop))
+ return (zfs_prop_to_name(prop));
+ return (NULL);
+
+}
+
+static int
+zfs_validate_who(char *who)
+{
+ char *p;
+
+ if (who[2] != ZFS_DELEG_FIELD_SEP_CHR)
+ return (-1);
+
+ switch (who[0]) {
+ case ZFS_DELEG_USER:
+ case ZFS_DELEG_GROUP:
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_GROUP_SETS:
+ if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
+ return (-1);
+ for (p = &who[3]; *p; p++)
+ if (!isdigit(*p))
+ return (-1);
+ break;
+
+ case ZFS_DELEG_NAMED_SET:
+ case ZFS_DELEG_NAMED_SET_SETS:
+ if (who[1] != ZFS_DELEG_NA)
+ return (-1);
+ return (permset_namecheck(&who[3], NULL, NULL));
+
+ case ZFS_DELEG_CREATE:
+ case ZFS_DELEG_CREATE_SETS:
+ if (who[1] != ZFS_DELEG_NA)
+ return (-1);
+ if (who[3] != '\0')
+ return (-1);
+ break;
+
+ case ZFS_DELEG_EVERYONE:
+ case ZFS_DELEG_EVERYONE_SETS:
+ if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
+ return (-1);
+ if (who[3] != '\0')
+ return (-1);
+ break;
+
+ default:
+ return (-1);
+ }
+
+ return (0);
+}
+
+int
+zfs_deleg_verify_nvlist(nvlist_t *nvp)
+{
+ nvpair_t *who, *perm_name;
+ nvlist_t *perms;
+ int error;
+
+ if (nvp == NULL)
+ return (-1);
+
+ who = nvlist_next_nvpair(nvp, NULL);
+ if (who == NULL)
+ return (-1);
+
+ do {
+ if (zfs_validate_who(nvpair_name(who)))
+ return (-1);
+
+ error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms);
+
+ if (error && error != ENOENT)
+ return (-1);
+ if (error == ENOENT)
+ continue;
+
+ perm_name = nvlist_next_nvpair(perms, NULL);
+ if (perm_name == NULL) {
+ return (-1);
+ }
+ do {
+ error = zfs_valid_permission_name(
+ nvpair_name(perm_name));
+ if (error)
+ return (-1);
+ } while ((perm_name = nvlist_next_nvpair(perms, perm_name))
+ != NULL);
+ } while ((who = nvlist_next_nvpair(nvp, who)) != NULL);
+ return (0);
+}
+
+/*
+ * Construct the base attribute name. The base attribute names
+ * are the "key" used to locate the jump objects which contain the actual
+ * permissions. The base attribute names are encoded based on the
+ * type of entry and on whether it is a local or descendent permission.
+ *
+ * Arguments:
+ * attr - attribute name return string; the buffer is assumed to be
+ *        ZFS_MAX_DELEG_NAME bytes long.
+ * type - type of entry to construct
+ * inheritchr - inheritance type (local, descendent, or NA for create and
+ *        permission set definitions)
+ * data - either a permission set name or a 64-bit uid/gid.
+ */
+void
+zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
+ char inheritchr, void *data)
+{
+ int len = ZFS_MAX_DELEG_NAME;
+ uint64_t *id = data;
+
+ switch (type) {
+ case ZFS_DELEG_USER:
+ case ZFS_DELEG_GROUP:
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_GROUP_SETS:
+ (void) snprintf(attr, len, "%c%c%c%lld", type, inheritchr,
+ ZFS_DELEG_FIELD_SEP_CHR, (longlong_t)*id);
+ break;
+ case ZFS_DELEG_NAMED_SET_SETS:
+ case ZFS_DELEG_NAMED_SET:
+ (void) snprintf(attr, len, "%c-%c%s", type,
+ ZFS_DELEG_FIELD_SEP_CHR, (char *)data);
+ break;
+ case ZFS_DELEG_CREATE:
+ case ZFS_DELEG_CREATE_SETS:
+ (void) snprintf(attr, len, "%c-%c", type,
+ ZFS_DELEG_FIELD_SEP_CHR);
+ break;
+ case ZFS_DELEG_EVERYONE:
+ case ZFS_DELEG_EVERYONE_SETS:
+ (void) snprintf(attr, len, "%c%c%c", type, inheritchr,
+ ZFS_DELEG_FIELD_SEP_CHR);
+ break;
+ default:
+ ASSERT(!"bad zfs_deleg_who_type_t");
+ }
+}
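+
+/*
+ * Usage sketch (illustration only; this helper is hypothetical): a local
+ * user entry for uid 1000 is encoded as the type character, the inheritance
+ * character, the ZFS_DELEG_FIELD_SEP_CHR separator, and the decimal uid.
+ */
+static void
+zfs_deleg_whokey_example(void)
+{
+	char attr[ZFS_MAX_DELEG_NAME];
+	uint64_t uid = 1000;
+
+	zfs_deleg_whokey(attr, ZFS_DELEG_USER, ZFS_DELEG_LOCAL, &uid);
+}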
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_deleg_verify_nvlist);
+EXPORT_SYMBOL(zfs_deleg_whokey);
+EXPORT_SYMBOL(zfs_deleg_canonicalize_perm);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c
new file mode 100644
index 000000000000..7a9de4a4309d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c
@@ -0,0 +1,991 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * Fletcher Checksums
+ * ------------------
+ *
+ * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
+ * recurrence relations:
+ *
+ * a = a + f
+ * i i-1 i-1
+ *
+ * b = b + a
+ * i i-1 i
+ *
+ * c = c + b (fletcher-4 only)
+ * i i-1 i
+ *
+ * d = d + c (fletcher-4 only)
+ * i i-1 i
+ *
+ * Where
+ * a_0 = b_0 = c_0 = d_0 = 0
+ * and
+ * f_0 .. f_(n-1) are the input data.
+ *
+ * Using standard techniques, these translate into the following series:
+ *
+ * __n_ __n_
+ * \ | \ |
+ * a = > f b = > i * f
+ * n /___| n - i n /___| n - i
+ * i = 1 i = 1
+ *
+ *
+ * __n_ __n_
+ * \ | i*(i+1) \ | i*(i+1)*(i+2)
+ * c = > ------- f d = > ------------- f
+ * n /___| 2 n - i n /___| 6 n - i
+ * i = 1 i = 1
+ *
+ * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
+ * Since the additions are done mod (2^64), errors in the high bits may not
+ * be noticed. For this reason, fletcher-2 is deprecated.
+ *
+ * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
+ * A conservative bound on how big the buffer can get before we overflow
+ * can be computed using f_i = 0xffffffff for all i:
+ *
+ * % bc
+ * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
+ * 2264
+ * quit
+ * %
+ *
+ * So blocks of up to 2k will not overflow. Our largest block size is
+ * 128k, which has 32k 4-byte words, so we can compute the largest possible
+ * accumulators, then divide by 2^64 to figure the max amount of overflow:
+ *
+ * % bc
+ * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
+ * a/2^64;b/2^64;c/2^64;d/2^64
+ * 0
+ * 0
+ * 1365
+ * 11186858
+ * quit
+ * %
+ *
+ * So a and b cannot overflow. To make sure each bit of input has some
+ * effect on the contents of c and d, we can look at what the factors of
+ * the coefficients in the equations for c_n and d_n are. The number of 2s
+ * in the factors determines the lowest set bit in the multiplier. Running
+ * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
+ * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow
+ * the 64-bit accumulators, every bit of every f_i affects every accumulator,
+ * even for 128k blocks.
+ *
+ * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
+ * we could do our calculations mod (2^32 - 1) by adding in the carries
+ * periodically, and store the number of carries in the top 32-bits.
+ *
+ * --------------------
+ * Checksum Performance
+ * --------------------
+ *
+ * There are two interesting components to checksum performance: cached and
+ * uncached performance. With cached data, fletcher-2 is about four times
+ * faster than fletcher-4. With uncached data, the performance difference is
+ * negligible, since the cost of a cache fill dominates the processing time.
+ * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
+ * efficient pass over the data.
+ *
+ * In normal operation, the data which is being checksummed is in a buffer
+ * which has been filled either by:
+ *
+ * 1. a compression step, which will be mostly cached, or
+ * 2. a bcopy() or copyin(), which will be uncached (because the
+ * copy is cache-bypassing).
+ *
+ * For both cached and uncached data, both fletcher checksums are much faster
+ * than sha-256, and slower than 'off', which doesn't touch the data at all.
+ */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/byteorder.h>
+#include <sys/spa.h>
+#include <sys/simd.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <zfs_fletcher.h>
+
+#define FLETCHER_MIN_SIMD_SIZE 64
+
+static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
+static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
+static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size);
+static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size);
+static boolean_t fletcher_4_scalar_valid(void);
+
+static const fletcher_4_ops_t fletcher_4_scalar_ops = {
+ .init_native = fletcher_4_scalar_init,
+ .fini_native = fletcher_4_scalar_fini,
+ .compute_native = fletcher_4_scalar_native,
+ .init_byteswap = fletcher_4_scalar_init,
+ .fini_byteswap = fletcher_4_scalar_fini,
+ .compute_byteswap = fletcher_4_scalar_byteswap,
+ .valid = fletcher_4_scalar_valid,
+ .name = "scalar"
+};
+
+static fletcher_4_ops_t fletcher_4_fastest_impl = {
+ .name = "fastest",
+ .valid = fletcher_4_scalar_valid
+};
+
+static const fletcher_4_ops_t *fletcher_4_impls[] = {
+ &fletcher_4_scalar_ops,
+ &fletcher_4_superscalar_ops,
+ &fletcher_4_superscalar4_ops,
+#if defined(HAVE_SSE2)
+ &fletcher_4_sse2_ops,
+#endif
+#if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
+ &fletcher_4_ssse3_ops,
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_AVX2)
+ &fletcher_4_avx2_ops,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX512F)
+ &fletcher_4_avx512f_ops,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
+ &fletcher_4_avx512bw_ops,
+#endif
+#if defined(__aarch64__) && !defined(__FreeBSD__)
+ &fletcher_4_aarch64_neon_ops,
+#endif
+};
+
+/* Hold all supported implementations */
+static uint32_t fletcher_4_supp_impls_cnt = 0;
+static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];
+
+/* Select fletcher4 implementation */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX - 1)
+#define IMPL_SCALAR (0)
+
+static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;
+
+#define IMPL_READ(i) (*(volatile uint32_t *) &(i))
+
+static struct fletcher_4_impl_selector {
+ const char *fis_name;
+ uint32_t fis_sel;
+} fletcher_4_impl_selectors[] = {
+ { "cycle", IMPL_CYCLE },
+ { "fastest", IMPL_FASTEST },
+ { "scalar", IMPL_SCALAR }
+};
+
+#if defined(_KERNEL)
+static kstat_t *fletcher_4_kstat;
+
+static struct fletcher_4_kstat {
+ uint64_t native;
+ uint64_t byteswap;
+} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
+#endif
+
+/* Indicate that benchmark has been completed */
+static boolean_t fletcher_4_initialized = B_FALSE;
+
+/*ARGSUSED*/
+void
+fletcher_init(zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+int
+fletcher_2_incremental_native(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ a0 = zcp->zc_word[0];
+ a1 = zcp->zc_word[1];
+ b0 = zcp->zc_word[2];
+ b1 = zcp->zc_word[3];
+
+ for (; ip < ipend; ip += 2) {
+ a0 += ip[0];
+ a1 += ip[1];
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+fletcher_2_native(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) fletcher_2_incremental_native((void *) buf, size, zcp);
+}
+
+int
+fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ a0 = zcp->zc_word[0];
+ a1 = zcp->zc_word[1];
+ b0 = zcp->zc_word[2];
+ b1 = zcp->zc_word[3];
+
+ for (; ip < ipend; ip += 2) {
+ a0 += BSWAP_64(ip[0]);
+ a1 += BSWAP_64(ip[1]);
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+fletcher_2_byteswap(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
+}
+
+static void
+fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
+{
+ ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
+}
+
+static void
+fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
+}
+
+static void
+fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
+ uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = ctx->scalar.zc_word[0];
+ b = ctx->scalar.zc_word[1];
+ c = ctx->scalar.zc_word[2];
+ d = ctx->scalar.zc_word[3];
+
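+ /*
+ * a, b, c and d are the first- through fourth-order running sums of
+ * the 32-bit input words, accumulated modulo 2^64.
+ */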
+ for (; ip < ipend; ip++) {
+ a += ip[0];
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
+}
+
+static void
+fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
+ uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = ctx->scalar.zc_word[0];
+ b = ctx->scalar.zc_word[1];
+ c = ctx->scalar.zc_word[2];
+ d = ctx->scalar.zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
+}
+
+static boolean_t
+fletcher_4_scalar_valid(void)
+{
+ return (B_TRUE);
+}
+
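+/*
+ * Select the fletcher 4 implementation by name.  Accepted values are the
+ * selector names ("cycle", "fastest", "scalar") and, once the benchmark
+ * has run, the name of any supported implementation (e.g. "sse2" or
+ * "avx2" on x86).
+ */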
+int
+fletcher_4_impl_set(const char *val)
+{
+ int err = -EINVAL;
+ uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+ size_t i, val_len;
+
+ val_len = strlen(val);
+ while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
+ val_len--;
+
+ /* check mandatory implementations */
+ for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
+ const char *name = fletcher_4_impl_selectors[i].fis_name;
+
+ if (val_len == strlen(name) &&
+ strncmp(val, name, val_len) == 0) {
+ impl = fletcher_4_impl_selectors[i].fis_sel;
+ err = 0;
+ break;
+ }
+ }
+
+ if (err != 0 && fletcher_4_initialized) {
+ /* check all supported implementations */
+ for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
+ const char *name = fletcher_4_supp_impls[i]->name;
+
+ if (val_len == strlen(name) &&
+ strncmp(val, name, val_len) == 0) {
+ impl = i;
+ err = 0;
+ break;
+ }
+ }
+ }
+
+ if (err == 0) {
+ atomic_swap_32(&fletcher_4_impl_chosen, impl);
+ membar_producer();
+ }
+
+ return (err);
+}
+
+/*
+ * Returns the Fletcher 4 operations for checksums.  When a SIMD
+ * implementation is not allowed in the current context, fall back to
+ * the fastest generic implementation.
+ */
+static inline const fletcher_4_ops_t *
+fletcher_4_impl_get(void)
+{
+ if (!kfpu_allowed())
+ return (&fletcher_4_superscalar4_ops);
+
+ const fletcher_4_ops_t *ops = NULL;
+ uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+
+ switch (impl) {
+ case IMPL_FASTEST:
+ ASSERT(fletcher_4_initialized);
+ ops = &fletcher_4_fastest_impl;
+ break;
+ case IMPL_CYCLE:
+ /* Cycle through supported implementations */
+ ASSERT(fletcher_4_initialized);
+ ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
+ static uint32_t cycle_count = 0;
+ uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
+ ops = fletcher_4_supp_impls[idx];
+ break;
+ default:
+ ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
+ ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
+ ops = fletcher_4_supp_impls[impl];
+ break;
+ }
+
+ ASSERT3P(ops, !=, NULL);
+
+ return (ops);
+}
+
+static inline void
+fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ fletcher_4_ctx_t ctx;
+ const fletcher_4_ops_t *ops = fletcher_4_impl_get();
+
+ ops->init_native(&ctx);
+ ops->compute_native(&ctx, buf, size);
+ ops->fini_native(&ctx, zcp);
+}
+
+/*ARGSUSED*/
+void
+fletcher_4_native(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
+
+ ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
+
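+ /*
+ * Checksum the largest prefix whose size is a multiple of
+ * FLETCHER_MIN_SIMD_SIZE with the selected (possibly SIMD)
+ * implementation, then finish the short tail with the scalar code,
+ * accumulating directly into *zcp.
+ */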
+ if (size == 0 || p2size == 0) {
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+
+ if (size > 0)
+ fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
+ buf, size);
+ } else {
+ fletcher_4_native_impl(buf, p2size, zcp);
+
+ if (p2size < size)
+ fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
+ (char *)buf + p2size, size - p2size);
+ }
+}
+
+void
+fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+ fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
+}
+
+static inline void
+fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ fletcher_4_ctx_t ctx;
+ const fletcher_4_ops_t *ops = fletcher_4_impl_get();
+
+ ops->init_byteswap(&ctx);
+ ops->compute_byteswap(&ctx, buf, size);
+ ops->fini_byteswap(&ctx, zcp);
+}
+
+/*ARGSUSED*/
+void
+fletcher_4_byteswap(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
+
+ ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
+
+ if (size == 0 || p2size == 0) {
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+
+ if (size > 0)
+ fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
+ buf, size);
+ } else {
+ fletcher_4_byteswap_impl(buf, p2size, zcp);
+
+ if (p2size < size)
+ fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
+ (char *)buf + p2size, size - p2size);
+ }
+}
+
+/* Incremental Fletcher 4 */
+
+#define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20)
+
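+/*
+ * Merging two partial Fletcher-4 states: if (a, b, c, d) is the state for a
+ * prefix and (a', b', c', d') the state for the following 'size' bytes
+ * (n = size / 4 words), the state of the concatenation is
+ *
+ *	a'' = a + a'
+ *	b'' = b + b' + n*a
+ *	c'' = c + c' + n*b + n(n+1)/2 * a
+ *	d'' = d + d' + n*c + n(n+1)/2 * b + n(n+1)(n+2)/6 * a
+ *
+ * which is what the c1, c2 and c3 coefficients below implement.
+ */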
+static inline void
+fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
+ const zio_cksum_t *nzcp)
+{
+ const uint64_t c1 = size / sizeof (uint32_t);
+ const uint64_t c2 = c1 * (c1 + 1) / 2;
+ const uint64_t c3 = c2 * (c1 + 2) / 3;
+
+ /*
+ * The value of 'c3' overflows for buffer sizes close to 16 MiB. For
+ * that reason, incremental fletcher4 computation of large buffers is
+ * split into steps of at most ZFS_FLETCHER_4_INC_MAX_SIZE bytes.
+ */
+ ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);
+
+ zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
+ c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
+ zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
+ c2 * zcp->zc_word[0];
+ zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
+ zcp->zc_word[0] += nzcp->zc_word[0];
+}
+
+static inline void
+fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
+ zio_cksum_t *zcp)
+{
+ while (size > 0) {
+ zio_cksum_t nzc;
+ uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);
+
+ if (native)
+ fletcher_4_native(buf, len, NULL, &nzc);
+ else
+ fletcher_4_byteswap(buf, len, NULL, &nzc);
+
+ fletcher_4_incremental_combine(zcp, len, &nzc);
+
+ size -= len;
+ buf += len;
+ }
+}
+
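+/*
+ * Feeding a buffer to the incremental functions in several pieces yields
+ * the same checksum as a single fletcher_4_native() call over the whole
+ * buffer, provided each piece is a multiple of 4 bytes.  A minimal usage
+ * sketch (the caller and 'buf', 'len1', 'len2' are hypothetical):
+ *
+ *	zio_cksum_t zc;
+ *	fletcher_init(&zc);
+ *	(void) fletcher_4_incremental_native(buf, len1, &zc);
+ *	(void) fletcher_4_incremental_native((char *)buf + len1, len2, &zc);
+ */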
+int
+fletcher_4_incremental_native(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+ /* Use scalar impl to directly update cksum of small blocks */
+ if (size < SPA_MINBLOCKSIZE)
+ fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
+ else
+ fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
+ return (0);
+}
+
+int
+fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+ /* Use scalar impl to directly update cksum of small blocks */
+ if (size < SPA_MINBLOCKSIZE)
+ fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
+ else
+ fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
+ return (0);
+}
+
+#if defined(_KERNEL)
+/*
+ * Fletcher 4 kstats
+ */
+static int
+fletcher_4_kstat_headers(char *buf, size_t size)
+{
+ ssize_t off = 0;
+
+ off += snprintf(buf + off, size, "%-17s", "implementation");
+ off += snprintf(buf + off, size - off, "%-15s", "native");
+ (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap");
+
+ return (0);
+}
+
+static int
+fletcher_4_kstat_data(char *buf, size_t size, void *data)
+{
+ struct fletcher_4_kstat *fastest_stat =
+ &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
+ struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data;
+ ssize_t off = 0;
+
+ if (curr_stat == fastest_stat) {
+ off += snprintf(buf + off, size - off, "%-17s", "fastest");
+ off += snprintf(buf + off, size - off, "%-15s",
+ fletcher_4_supp_impls[fastest_stat->native]->name);
+ off += snprintf(buf + off, size - off, "%-15s\n",
+ fletcher_4_supp_impls[fastest_stat->byteswap]->name);
+ } else {
+ ptrdiff_t id = curr_stat - fletcher_4_stat_data;
+
+ off += snprintf(buf + off, size - off, "%-17s",
+ fletcher_4_supp_impls[id]->name);
+ off += snprintf(buf + off, size - off, "%-15llu",
+ (u_longlong_t)curr_stat->native);
+ off += snprintf(buf + off, size - off, "%-15llu\n",
+ (u_longlong_t)curr_stat->byteswap);
+ }
+
+ return (0);
+}
+
+static void *
+fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
+{
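+ /* Row 'fletcher_4_supp_impls_cnt' is the synthetic "fastest" entry. */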
+ if (n <= fletcher_4_supp_impls_cnt)
+ ksp->ks_private = (void *) (fletcher_4_stat_data + n);
+ else
+ ksp->ks_private = NULL;
+
+ return (ksp->ks_private);
+}
+#endif
+
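+/*
+ * Copy the function pointers of the benchmark winner into
+ * fletcher_4_fastest_impl, so the IMPL_FASTEST selection can dispatch
+ * through it directly.
+ */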
+#define FLETCHER_4_FASTEST_FN_COPY(type, src) \
+{ \
+ fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \
+ fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \
+ fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
+}
+
+#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */
+
+typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
+ zio_cksum_t *);
+
+#if defined(_KERNEL)
+static void
+fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
+{
+
+ struct fletcher_4_kstat *fastest_stat =
+ &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
+ hrtime_t start;
+ uint64_t run_bw, run_time_ns, best_run = 0;
+ zio_cksum_t zc;
+ uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);
+
+ fletcher_checksum_func_t *fletcher_4_test = native ?
+ fletcher_4_native : fletcher_4_byteswap;
+
+ for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
+ struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
+ uint64_t run_count = 0;
+
+ /* temporarily set an implementation */
+ fletcher_4_impl_chosen = i;
+
+ kpreempt_disable();
+ start = gethrtime();
+ do {
+ for (l = 0; l < 32; l++, run_count++)
+ fletcher_4_test(data, data_size, NULL, &zc);
+
+ run_time_ns = gethrtime() - start;
+ } while (run_time_ns < FLETCHER_4_BENCH_NS);
+ kpreempt_enable();
+
+ run_bw = data_size * run_count * NANOSEC;
+ run_bw /= run_time_ns; /* B/s */
+
+ if (native)
+ stat->native = run_bw;
+ else
+ stat->byteswap = run_bw;
+
+ if (run_bw > best_run) {
+ best_run = run_bw;
+
+ if (native) {
+ fastest_stat->native = i;
+ FLETCHER_4_FASTEST_FN_COPY(native,
+ fletcher_4_supp_impls[i]);
+ } else {
+ fastest_stat->byteswap = i;
+ FLETCHER_4_FASTEST_FN_COPY(byteswap,
+ fletcher_4_supp_impls[i]);
+ }
+ }
+ }
+
+ /* restore original selection */
+ atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
+}
+#endif /* _KERNEL */
+
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+fletcher_4_benchmark(void)
+{
+ fletcher_4_ops_t *curr_impl;
+ int i, c;
+
+ /* Move supported implementations into fletcher_4_supp_impls */
+ for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
+ curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
+
+ if (curr_impl->valid && curr_impl->valid())
+ fletcher_4_supp_impls[c++] = curr_impl;
+ }
+ membar_producer(); /* complete fletcher_4_supp_impls[] init */
+ fletcher_4_supp_impls_cnt = c; /* number of supported impl */
+
+#if defined(_KERNEL)
+ static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
+ char *databuf = vmem_alloc(data_size, KM_SLEEP);
+
+ for (i = 0; i < data_size / sizeof (uint64_t); i++)
+ ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
+
+ fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
+ fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
+
+ vmem_free(databuf, data_size);
+#else
+ /*
+ * Skip the benchmark in user space to avoid impacting libzpool
+ * consumers (zdb, zhack, zinject, ztest). The last implementation
+ * is assumed to be the fastest and is used by default.
+ */
+ memcpy(&fletcher_4_fastest_impl,
+ fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
+ sizeof (fletcher_4_fastest_impl));
+ fletcher_4_fastest_impl.name = "fastest";
+ membar_producer();
+#endif /* _KERNEL */
+}
+
+void
+fletcher_4_init(void)
+{
+ /* Determine the fastest available implementation. */
+ fletcher_4_benchmark();
+
+#if defined(_KERNEL)
+ /* Install kstats for all implementations */
+ fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ if (fletcher_4_kstat != NULL) {
+ fletcher_4_kstat->ks_data = NULL;
+ fletcher_4_kstat->ks_ndata = UINT32_MAX;
+ kstat_set_raw_ops(fletcher_4_kstat,
+ fletcher_4_kstat_headers,
+ fletcher_4_kstat_data,
+ fletcher_4_kstat_addr);
+ kstat_install(fletcher_4_kstat);
+ }
+#endif
+
+ /* Finish initialization */
+ fletcher_4_initialized = B_TRUE;
+}
+
+void
+fletcher_4_fini(void)
+{
+#if defined(_KERNEL)
+ if (fletcher_4_kstat != NULL) {
+ kstat_delete(fletcher_4_kstat);
+ fletcher_4_kstat = NULL;
+ }
+#endif
+}
+
+/* ABD adapters */
+
+static void
+abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
+{
+ const fletcher_4_ops_t *ops = fletcher_4_impl_get();
+ cdp->acd_private = (void *) ops;
+
+ if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
+ ops->init_native(cdp->acd_ctx);
+ else
+ ops->init_byteswap(cdp->acd_ctx);
+}
+
+static void
+abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
+{
+ fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
+
+ ASSERT(ops);
+
+ if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
+ ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
+ else
+ ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
+}
+
+static void
+abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
+ zio_abd_checksum_data_t *cdp)
+{
+ zio_cksum_t *zcp = cdp->acd_zcp;
+
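+ /*
+ * Fold the SIMD state into *zcp and continue with the scalar
+ * incremental code, which accumulates directly into the zio_cksum_t.
+ */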
+ ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
+
+ abd_fletcher_4_fini(cdp);
+ cdp->acd_private = (void *)&fletcher_4_scalar_ops;
+
+ if (native)
+ fletcher_4_incremental_native(data, size, zcp);
+ else
+ fletcher_4_incremental_byteswap(data, size, zcp);
+}
+
+static int
+abd_fletcher_4_iter(void *data, size_t size, void *private)
+{
+ zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;
+ fletcher_4_ctx_t *ctx = cdp->acd_ctx;
+ fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
+ boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
+ uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
+
+ ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
+
+ if (asize > 0) {
+ if (native)
+ ops->compute_native(ctx, data, asize);
+ else
+ ops->compute_byteswap(ctx, data, asize);
+
+ size -= asize;
+ data = (char *)data + asize;
+ }
+
+ if (size > 0) {
+ ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
+ /* At this point we have to switch to scalar impl */
+ abd_fletcher_4_simd2scalar(native, data, size, cdp);
+ }
+
+ return (0);
+}
+
+zio_abd_checksum_func_t fletcher_4_abd_ops = {
+ .acf_init = abd_fletcher_4_init,
+ .acf_fini = abd_fletcher_4_fini,
+ .acf_iter = abd_fletcher_4_iter
+};
+
+#if defined(_KERNEL)
+
+#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ")
+
+#if defined(__linux__)
+
+static int
+fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused)
+{
+ const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+ char *fmt;
+ int cnt = 0;
+
+ /* list fastest */
+ fmt = IMPL_FMT(impl, IMPL_FASTEST);
+ cnt += sprintf(buffer + cnt, fmt, "fastest");
+
+ /* list all supported implementations */
+ for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
+ fmt = IMPL_FMT(impl, i);
+ cnt += sprintf(buffer + cnt, fmt,
+ fletcher_4_supp_impls[i]->name);
+ }
+
+ return (cnt);
+}
+
+static int
+fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused)
+{
+ return (fletcher_4_impl_set(val));
+}
+
+#else
+
+#include <sys/sbuf.h>
+
+static int
+fletcher_4_param(ZFS_MODULE_PARAM_ARGS)
+{
+ int err;
+
+ if (req->newptr == NULL) {
+ const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+ const int init_buflen = 64;
+ const char *fmt;
+ struct sbuf *s;
+
+ s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
+
+ /* list fastest */
+ fmt = IMPL_FMT(impl, IMPL_FASTEST);
+ (void) sbuf_printf(s, fmt, "fastest");
+
+ /* list all supported implementations */
+ for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
+ fmt = IMPL_FMT(impl, i);
+ (void) sbuf_printf(s, fmt,
+ fletcher_4_supp_impls[i]->name);
+ }
+
+ err = sbuf_finish(s);
+ sbuf_delete(s);
+
+ return (err);
+ }
+
+ char buf[16];
+
+ err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (err)
+ return (err);
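+ /*
+ * fletcher_4_impl_set() returns 0 or -EINVAL; sysctl expects a
+ * positive errno, hence the negation.
+ */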
+ return (-fletcher_4_impl_set(buf));
+}
+
+#endif
+
+#undef IMPL_FMT
+
+/*
+ * Choose a fletcher 4 implementation in ZFS.
+ * Users can choose "cycle" to exercise all implementations, but this is
+ * for testing purposes only, therefore it can only be set in user space.
+ */
+/* BEGIN CSTYLED */
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl,
+ fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW,
+ "Select fletcher 4 implementation.");
+/* END CSTYLED */
+
+EXPORT_SYMBOL(fletcher_init);
+EXPORT_SYMBOL(fletcher_2_incremental_native);
+EXPORT_SYMBOL(fletcher_2_incremental_byteswap);
+EXPORT_SYMBOL(fletcher_4_init);
+EXPORT_SYMBOL(fletcher_4_fini);
+EXPORT_SYMBOL(fletcher_2_native);
+EXPORT_SYMBOL(fletcher_2_byteswap);
+EXPORT_SYMBOL(fletcher_4_native);
+EXPORT_SYMBOL(fletcher_4_native_varsize);
+EXPORT_SYMBOL(fletcher_4_byteswap);
+EXPORT_SYMBOL(fletcher_4_incremental_native);
+EXPORT_SYMBOL(fletcher_4_incremental_byteswap);
+EXPORT_SYMBOL(fletcher_4_abd_ops);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_aarch64_neon.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_aarch64_neon.c
new file mode 100644
index 000000000000..c95a71681584
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_aarch64_neon.c
@@ -0,0 +1,215 @@
+/*
+ * Implement fast Fletcher4 with NEON instructions. (aarch64)
+ *
+ * Use the 128-bit NEON SIMD instructions and registers to compute
+ * Fletcher4 in two incremental 64-bit parallel accumulator streams,
+ * and then combine the streams to form the final four checksum words.
+ * This implementation is a derivative of the AVX SIMD implementation by
+ * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
+ *
+ * Copyright (C) 2016 Romain Dolbeau.
+ *
+ * Authors:
+ * Romain Dolbeau <romain.dolbeau@atos.net>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include <sys/simd.h>
+#include <sys/spa_checksum.h>
+#include <sys/strings.h>
+#include <zfs_fletcher.h>
+
+static void
+fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->aarch64_neon, 4 * sizeof (zfs_fletcher_aarch64_neon_t));
+}
+
+static void
+fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ uint64_t A, B, C, D;
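+ /*
+ * Combine the two 64-bit lanes of each accumulator; the weights are
+ * the same two-stream recombination used by the SSE2 implementation
+ * (see the mixing-matrix comment in zfs_fletcher_sse.c).
+ */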
+ A = ctx->aarch64_neon[0].v[0] + ctx->aarch64_neon[0].v[1];
+ B = 2 * ctx->aarch64_neon[1].v[0] + 2 * ctx->aarch64_neon[1].v[1] -
+ ctx->aarch64_neon[0].v[1];
+ C = 4 * ctx->aarch64_neon[2].v[0] - ctx->aarch64_neon[1].v[0] +
+ 4 * ctx->aarch64_neon[2].v[1] - 3 * ctx->aarch64_neon[1].v[1];
+ D = 8 * ctx->aarch64_neon[3].v[0] - 4 * ctx->aarch64_neon[2].v[0] +
+ 8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] +
+ ctx->aarch64_neon[1].v[1];
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+#define NEON_INIT_LOOP() \
+ asm("eor %[ZERO].16b,%[ZERO].16b,%[ZERO].16b\n" \
+ "ld1 { %[ACC0].4s }, %[CTX0]\n" \
+ "ld1 { %[ACC1].4s }, %[CTX1]\n" \
+ "ld1 { %[ACC2].4s }, %[CTX2]\n" \
+ "ld1 { %[ACC3].4s }, %[CTX3]\n" \
+ : [ZERO] "=w" (ZERO), \
+ [ACC0] "=w" (ACC0), [ACC1] "=w" (ACC1), \
+ [ACC2] "=w" (ACC2), [ACC3] "=w" (ACC3) \
+ : [CTX0] "Q" (ctx->aarch64_neon[0]), \
+ [CTX1] "Q" (ctx->aarch64_neon[1]), \
+ [CTX2] "Q" (ctx->aarch64_neon[2]), \
+ [CTX3] "Q" (ctx->aarch64_neon[3]))
+
+#define NEON_DO_REVERSE "rev32 %[SRC].16b, %[SRC].16b\n"
+
+#define NEON_DONT_REVERSE ""
+
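+/*
+ * Load four 32-bit words, optionally byteswap them, zero-extend them with
+ * zip1/zip2 against the ZERO register into two pairs of 64-bit lanes, and
+ * fold each pair into the four running-sum accumulators.
+ */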
+#define NEON_MAIN_LOOP(REVERSE) \
+ asm("ld1 { %[SRC].4s }, %[IP]\n" \
+ REVERSE \
+ "zip1 %[TMP1].4s, %[SRC].4s, %[ZERO].4s\n" \
+ "zip2 %[TMP2].4s, %[SRC].4s, %[ZERO].4s\n" \
+ "add %[ACC0].2d, %[ACC0].2d, %[TMP1].2d\n" \
+ "add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n" \
+ "add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n" \
+ "add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n" \
+ "add %[ACC0].2d, %[ACC0].2d, %[TMP2].2d\n" \
+ "add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n" \
+ "add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n" \
+ "add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n" \
+ : [SRC] "=&w" (SRC), \
+ [TMP1] "=&w" (TMP1), [TMP2] "=&w" (TMP2), \
+ [ACC0] "+w" (ACC0), [ACC1] "+w" (ACC1), \
+ [ACC2] "+w" (ACC2), [ACC3] "+w" (ACC3) \
+ : [ZERO] "w" (ZERO), [IP] "Q" (*ip))
+
+#define NEON_FINI_LOOP() \
+ asm("st1 { %[ACC0].4s },%[DST0]\n" \
+ "st1 { %[ACC1].4s },%[DST1]\n" \
+ "st1 { %[ACC2].4s },%[DST2]\n" \
+ "st1 { %[ACC3].4s },%[DST3]\n" \
+ : [DST0] "=Q" (ctx->aarch64_neon[0]), \
+ [DST1] "=Q" (ctx->aarch64_neon[1]), \
+ [DST2] "=Q" (ctx->aarch64_neon[2]), \
+ [DST3] "=Q" (ctx->aarch64_neon[3]) \
+ : [ACC0] "w" (ACC0), [ACC1] "w" (ACC1), \
+ [ACC2] "w" (ACC2), [ACC3] "w" (ACC3))
+
+static void
+fletcher_4_aarch64_neon_native(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+#if defined(_KERNEL)
+register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
+register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
+register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
+register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
+register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
+register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
+register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
+register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
+#else
+unsigned char ZERO __attribute__((vector_size(16)));
+unsigned char ACC0 __attribute__((vector_size(16)));
+unsigned char ACC1 __attribute__((vector_size(16)));
+unsigned char ACC2 __attribute__((vector_size(16)));
+unsigned char ACC3 __attribute__((vector_size(16)));
+unsigned char TMP1 __attribute__((vector_size(16)));
+unsigned char TMP2 __attribute__((vector_size(16)));
+unsigned char SRC __attribute__((vector_size(16)));
+#endif
+
+ kfpu_begin();
+
+ NEON_INIT_LOOP();
+
+ for (; ip < ipend; ip += 2) {
+ NEON_MAIN_LOOP(NEON_DONT_REVERSE);
+ }
+
+ NEON_FINI_LOOP();
+
+ kfpu_end();
+}
+
+static void
+fletcher_4_aarch64_neon_byteswap(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+#if defined(_KERNEL)
+register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
+register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
+register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
+register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
+register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
+register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
+register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
+register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
+#else
+unsigned char ZERO __attribute__((vector_size(16)));
+unsigned char ACC0 __attribute__((vector_size(16)));
+unsigned char ACC1 __attribute__((vector_size(16)));
+unsigned char ACC2 __attribute__((vector_size(16)));
+unsigned char ACC3 __attribute__((vector_size(16)));
+unsigned char TMP1 __attribute__((vector_size(16)));
+unsigned char TMP2 __attribute__((vector_size(16)));
+unsigned char SRC __attribute__((vector_size(16)));
+#endif
+
+ kfpu_begin();
+
+ NEON_INIT_LOOP();
+
+ for (; ip < ipend; ip += 2) {
+ NEON_MAIN_LOOP(NEON_DO_REVERSE);
+ }
+
+ NEON_FINI_LOOP();
+
+ kfpu_end();
+}
+
+static boolean_t fletcher_4_aarch64_neon_valid(void)
+{
+ return (kfpu_allowed());
+}
+
+const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
+ .init_native = fletcher_4_aarch64_neon_init,
+ .compute_native = fletcher_4_aarch64_neon_native,
+ .fini_native = fletcher_4_aarch64_neon_fini,
+ .init_byteswap = fletcher_4_aarch64_neon_init,
+ .compute_byteswap = fletcher_4_aarch64_neon_byteswap,
+ .fini_byteswap = fletcher_4_aarch64_neon_fini,
+ .valid = fletcher_4_aarch64_neon_valid,
+ .name = "aarch64_neon"
+};
+
+#endif /* defined(__aarch64__) */
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_avx512.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_avx512.c
new file mode 100644
index 000000000000..300ec4c1fb69
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_avx512.c
@@ -0,0 +1,225 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#if defined(__x86_64) && defined(HAVE_AVX512F)
+
+#include <sys/byteorder.h>
+#include <sys/frame.h>
+#include <sys/spa_checksum.h>
+#include <sys/strings.h>
+#include <sys/simd.h>
+#include <zfs_fletcher.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+static void
+fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->avx512, 4 * sizeof (zfs_fletcher_avx512_t));
+}
+
+static void
+fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
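+ /*
+ * With eight parallel 64-bit lanes per accumulator, each lane's
+ * contribution has to be recombined with weights that depend on the
+ * lane index; CcA/CcB and DcA/DcB/DcC below hold those per-lane
+ * coefficients for the C and D words.
+ */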
+ static const uint64_t
+ CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 },
+ CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 },
+ DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 },
+ DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 },
+ DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };
+
+ uint64_t A, B, C, D;
+ uint64_t i;
+
+ A = ctx->avx512[0].v[0];
+ B = 8 * ctx->avx512[1].v[0];
+ C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0];
+ D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] +
+ DcB[0] * ctx->avx512[1].v[0];
+
+ for (i = 1; i < 8; i++) {
+ A += ctx->avx512[0].v[i];
+ B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i];
+ C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] +
+ CcA[i] * ctx->avx512[0].v[i];
+ D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] +
+ DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i];
+ }
+
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \
+{ \
+ __asm("vmovdqu64 %0, %%zmm0" :: "m" ((ctx)->avx512[0])); \
+ __asm("vmovdqu64 %0, %%zmm1" :: "m" ((ctx)->avx512[1])); \
+ __asm("vmovdqu64 %0, %%zmm2" :: "m" ((ctx)->avx512[2])); \
+ __asm("vmovdqu64 %0, %%zmm3" :: "m" ((ctx)->avx512[3])); \
+}
+
+#define FLETCHER_4_AVX512_SAVE_CTX(ctx) \
+{ \
+ __asm("vmovdqu64 %%zmm0, %0" : "=m" ((ctx)->avx512[0])); \
+ __asm("vmovdqu64 %%zmm1, %0" : "=m" ((ctx)->avx512[1])); \
+ __asm("vmovdqu64 %%zmm2, %0" : "=m" ((ctx)->avx512[2])); \
+ __asm("vmovdqu64 %%zmm3, %0" : "=m" ((ctx)->avx512[3])); \
+}
+
+static void
+fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_AVX512_RESTORE_CTX(ctx);
+
+ for (; ip < ipend; ip += 8) {
+ __asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
+ __asm("vpaddq %zmm4, %zmm0, %zmm0");
+ __asm("vpaddq %zmm0, %zmm1, %zmm1");
+ __asm("vpaddq %zmm1, %zmm2, %zmm2");
+ __asm("vpaddq %zmm2, %zmm3, %zmm3");
+ }
+
+ FLETCHER_4_AVX512_SAVE_CTX(ctx);
+
+ kfpu_end();
+}
+STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_native);
+
+static void
+fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
+ uint64_t size)
+{
+ static const uint64_t byteswap_mask = 0xFFULL;
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_AVX512_RESTORE_CTX(ctx);
+
+ __asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask));
+ __asm("vpsllq $8, %zmm8, %zmm9");
+ __asm("vpsllq $16, %zmm8, %zmm10");
+ __asm("vpsllq $24, %zmm8, %zmm11");
+
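+ /*
+ * vpshufb on 512-bit registers requires AVX512BW, so each
+ * zero-extended 32-bit word is byteswapped by shifting it and masking
+ * out one byte at bit offsets 0, 8, 16 and 24, then OR-ing the pieces
+ * back together.
+ */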
+ for (; ip < ipend; ip += 8) {
+ __asm("vpmovzxdq %0, %%zmm5"::"m" (*ip));
+
+ __asm("vpsrlq $24, %zmm5, %zmm6");
+ __asm("vpandd %zmm8, %zmm6, %zmm6");
+ __asm("vpsrlq $8, %zmm5, %zmm7");
+ __asm("vpandd %zmm9, %zmm7, %zmm7");
+ __asm("vpord %zmm6, %zmm7, %zmm4");
+ __asm("vpsllq $8, %zmm5, %zmm6");
+ __asm("vpandd %zmm10, %zmm6, %zmm6");
+ __asm("vpord %zmm6, %zmm4, %zmm4");
+ __asm("vpsllq $24, %zmm5, %zmm5");
+ __asm("vpandd %zmm11, %zmm5, %zmm5");
+ __asm("vpord %zmm5, %zmm4, %zmm4");
+
+ __asm("vpaddq %zmm4, %zmm0, %zmm0");
+ __asm("vpaddq %zmm0, %zmm1, %zmm1");
+ __asm("vpaddq %zmm1, %zmm2, %zmm2");
+ __asm("vpaddq %zmm2, %zmm3, %zmm3");
+ }
+
+ FLETCHER_4_AVX512_SAVE_CTX(ctx)
+
+ kfpu_end();
+}
+STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
+
+static boolean_t
+fletcher_4_avx512f_valid(void)
+{
+ return (kfpu_allowed() && zfs_avx512f_available());
+}
+
+const fletcher_4_ops_t fletcher_4_avx512f_ops = {
+ .init_native = fletcher_4_avx512f_init,
+ .fini_native = fletcher_4_avx512f_fini,
+ .compute_native = fletcher_4_avx512f_native,
+ .init_byteswap = fletcher_4_avx512f_init,
+ .fini_byteswap = fletcher_4_avx512f_fini,
+ .compute_byteswap = fletcher_4_avx512f_byteswap,
+ .valid = fletcher_4_avx512f_valid,
+ .name = "avx512f"
+};
+
+#if defined(HAVE_AVX512BW)
+static void
+fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
+ uint64_t size)
+{
+ static const zfs_fletcher_avx512_t mask = {
+ .v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+ 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+ 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+ 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
+ };
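+ /*
+ * After vpmovzxdq each 64-bit lane holds one zero-extended 32-bit
+ * word; the shuffle mask above reverses the low four bytes of every
+ * lane and leaves the upper four bytes zero (indices with the high bit
+ * set produce zero), which is exactly the required byteswap.
+ */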
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_AVX512_RESTORE_CTX(ctx);
+
+ __asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));
+
+ for (; ip < ipend; ip += 8) {
+ __asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
+
+ __asm("vpshufb %zmm5, %zmm4, %zmm4");
+
+ __asm("vpaddq %zmm4, %zmm0, %zmm0");
+ __asm("vpaddq %zmm0, %zmm1, %zmm1");
+ __asm("vpaddq %zmm1, %zmm2, %zmm2");
+ __asm("vpaddq %zmm2, %zmm3, %zmm3");
+ }
+
+ FLETCHER_4_AVX512_SAVE_CTX(ctx)
+
+ kfpu_end();
+}
+STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);
+
+const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
+ .init_native = fletcher_4_avx512f_init,
+ .fini_native = fletcher_4_avx512f_fini,
+ .compute_native = fletcher_4_avx512f_native,
+ .init_byteswap = fletcher_4_avx512f_init,
+ .fini_byteswap = fletcher_4_avx512f_fini,
+ .compute_byteswap = fletcher_4_avx512bw_byteswap,
+ .valid = fletcher_4_avx512f_valid,
+ .name = "avx512bw"
+};
+#endif
+
+#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_intel.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_intel.c
new file mode 100644
index 000000000000..5136a01eca51
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_intel.c
@@ -0,0 +1,173 @@
+/*
+ * Implement fast Fletcher4 with AVX2 instructions. (x86_64)
+ *
+ * Use the 256-bit AVX2 SIMD instructions and registers to compute
+ * Fletcher4 in four incremental 64-bit parallel accumulator streams,
+ * and then combine the streams to form the final four checksum words.
+ *
+ * Copyright (C) 2015 Intel Corporation.
+ *
+ * Authors:
+ * James Guilford <james.guilford@intel.com>
+ * Jinshan Xiong <jinshan.xiong@intel.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(HAVE_AVX) && defined(HAVE_AVX2)
+
+#include <sys/spa_checksum.h>
+#include <sys/simd.h>
+#include <sys/strings.h>
+#include <zfs_fletcher.h>
+
+static void
+fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->avx, 4 * sizeof (zfs_fletcher_avx_t));
+}
+
+static void
+fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ uint64_t A, B, C, D;
+
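+ /*
+ * Fold the four 64-bit lanes of each order back into a single
+ * Fletcher-4 state; the per-lane weights account for the position of
+ * the words each lane consumed (the same recombination used by the
+ * superscalar4 variant).
+ */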
+ A = ctx->avx[0].v[0] + ctx->avx[0].v[1] +
+ ctx->avx[0].v[2] + ctx->avx[0].v[3];
+ B = 0 - ctx->avx[0].v[1] - 2 * ctx->avx[0].v[2] - 3 * ctx->avx[0].v[3] +
+ 4 * ctx->avx[1].v[0] + 4 * ctx->avx[1].v[1] + 4 * ctx->avx[1].v[2] +
+ 4 * ctx->avx[1].v[3];
+
+ C = ctx->avx[0].v[2] + 3 * ctx->avx[0].v[3] - 6 * ctx->avx[1].v[0] -
+ 10 * ctx->avx[1].v[1] - 14 * ctx->avx[1].v[2] -
+ 18 * ctx->avx[1].v[3] + 16 * ctx->avx[2].v[0] +
+ 16 * ctx->avx[2].v[1] + 16 * ctx->avx[2].v[2] +
+ 16 * ctx->avx[2].v[3];
+
+ D = 0 - ctx->avx[0].v[3] + 4 * ctx->avx[1].v[0] +
+ 10 * ctx->avx[1].v[1] + 20 * ctx->avx[1].v[2] +
+ 34 * ctx->avx[1].v[3] - 48 * ctx->avx[2].v[0] -
+ 64 * ctx->avx[2].v[1] - 80 * ctx->avx[2].v[2] -
+ 96 * ctx->avx[2].v[3] + 64 * ctx->avx[3].v[0] +
+ 64 * ctx->avx[3].v[1] + 64 * ctx->avx[3].v[2] +
+ 64 * ctx->avx[3].v[3];
+
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+#define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \
+{ \
+ asm volatile("vmovdqu %0, %%ymm0" :: "m" ((ctx)->avx[0])); \
+ asm volatile("vmovdqu %0, %%ymm1" :: "m" ((ctx)->avx[1])); \
+ asm volatile("vmovdqu %0, %%ymm2" :: "m" ((ctx)->avx[2])); \
+ asm volatile("vmovdqu %0, %%ymm3" :: "m" ((ctx)->avx[3])); \
+}
+
+#define FLETCHER_4_AVX2_SAVE_CTX(ctx) \
+{ \
+ asm volatile("vmovdqu %%ymm0, %0" : "=m" ((ctx)->avx[0])); \
+ asm volatile("vmovdqu %%ymm1, %0" : "=m" ((ctx)->avx[1])); \
+ asm volatile("vmovdqu %%ymm2, %0" : "=m" ((ctx)->avx[2])); \
+ asm volatile("vmovdqu %%ymm3, %0" : "=m" ((ctx)->avx[3])); \
+}
+
+
+static void
+fletcher_4_avx2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_AVX2_RESTORE_CTX(ctx);
+
+ for (; ip < ipend; ip += 2) {
+ asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
+ asm volatile("vpaddq %ymm4, %ymm0, %ymm0");
+ asm volatile("vpaddq %ymm0, %ymm1, %ymm1");
+ asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
+ asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
+ }
+
+ FLETCHER_4_AVX2_SAVE_CTX(ctx);
+ asm volatile("vzeroupper");
+
+ kfpu_end();
+}
+
+static void
+fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ static const zfs_fletcher_avx_t mask = {
+ .v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+ 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
+ };
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_AVX2_RESTORE_CTX(ctx);
+
+ asm volatile("vmovdqu %0, %%ymm5" :: "m" (mask));
+
+ for (; ip < ipend; ip += 2) {
+ asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
+ asm volatile("vpshufb %ymm5, %ymm4, %ymm4");
+
+ asm volatile("vpaddq %ymm4, %ymm0, %ymm0");
+ asm volatile("vpaddq %ymm0, %ymm1, %ymm1");
+ asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
+ asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
+ }
+
+ FLETCHER_4_AVX2_SAVE_CTX(ctx);
+ asm volatile("vzeroupper");
+
+ kfpu_end();
+}
+
+static boolean_t fletcher_4_avx2_valid(void)
+{
+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
+}
+
+const fletcher_4_ops_t fletcher_4_avx2_ops = {
+ .init_native = fletcher_4_avx2_init,
+ .fini_native = fletcher_4_avx2_fini,
+ .compute_native = fletcher_4_avx2_native,
+ .init_byteswap = fletcher_4_avx2_init,
+ .fini_byteswap = fletcher_4_avx2_fini,
+ .compute_byteswap = fletcher_4_avx2_byteswap,
+ .valid = fletcher_4_avx2_valid,
+ .name = "avx2"
+};
+
+#endif /* defined(HAVE_AVX) && defined(HAVE_AVX2) */
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_sse.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_sse.c
new file mode 100644
index 000000000000..15ce9b07ffbe
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_sse.c
@@ -0,0 +1,232 @@
+/*
+ * Implement fast Fletcher4 with SSE2/SSSE3 instructions. (x86)
+ *
+ * Use the 128-bit SSE2/SSSE3 SIMD instructions and registers to compute
+ * Fletcher4 in two incremental 64-bit parallel accumulator streams,
+ * and then combine the streams to form the final four checksum words.
+ * This implementation is a derivative of the AVX SIMD implementation by
+ * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
+ *
+ * Copyright (C) 2016 Tyler J. Stachecki.
+ *
+ * Authors:
+ * Tyler J. Stachecki <stachecki.tyler@gmail.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(HAVE_SSE2)
+
+#include <sys/simd.h>
+#include <sys/spa_checksum.h>
+#include <sys/byteorder.h>
+#include <sys/strings.h>
+#include <zfs_fletcher.h>
+
+static void
+fletcher_4_sse2_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->sse, 4 * sizeof (zfs_fletcher_sse_t));
+}
+
+static void
+fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ uint64_t A, B, C, D;
+
+ /*
+ * The mixing matrix for checksum calculation is:
+ * a = a0 + a1
+ * b = 2b0 + 2b1 - a1
+ * c = 4c0 - b0 + 4c1 - 3b1
+ * d = 8d0 - 4c0 + 8d1 - 8c1 + b1
+ *
+ * c and d are multiplied by 4 and 8, respectively,
+ * before spilling the vectors out to memory.
+ */
+ A = ctx->sse[0].v[0] + ctx->sse[0].v[1];
+ B = 2 * ctx->sse[1].v[0] + 2 * ctx->sse[1].v[1] - ctx->sse[0].v[1];
+ C = 4 * ctx->sse[2].v[0] - ctx->sse[1].v[0] + 4 * ctx->sse[2].v[1] -
+ 3 * ctx->sse[1].v[1];
+ D = 8 * ctx->sse[3].v[0] - 4 * ctx->sse[2].v[0] + 8 * ctx->sse[3].v[1] -
+ 8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];
+
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+#define FLETCHER_4_SSE_RESTORE_CTX(ctx) \
+{ \
+ asm volatile("movdqu %0, %%xmm0" :: "m" ((ctx)->sse[0])); \
+ asm volatile("movdqu %0, %%xmm1" :: "m" ((ctx)->sse[1])); \
+ asm volatile("movdqu %0, %%xmm2" :: "m" ((ctx)->sse[2])); \
+ asm volatile("movdqu %0, %%xmm3" :: "m" ((ctx)->sse[3])); \
+}
+
+#define FLETCHER_4_SSE_SAVE_CTX(ctx) \
+{ \
+ asm volatile("movdqu %%xmm0, %0" : "=m" ((ctx)->sse[0])); \
+ asm volatile("movdqu %%xmm1, %0" : "=m" ((ctx)->sse[1])); \
+ asm volatile("movdqu %%xmm2, %0" : "=m" ((ctx)->sse[2])); \
+ asm volatile("movdqu %%xmm3, %0" : "=m" ((ctx)->sse[3])); \
+}
+
+static void
+fletcher_4_sse2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_SSE_RESTORE_CTX(ctx);
+
+ asm volatile("pxor %xmm4, %xmm4");
+
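+ /*
+ * %xmm4 stays zero; punpckldq/punpckhdq interleave it with the four
+ * loaded 32-bit words, zero-extending them into two registers of two
+ * 64-bit lanes each before they are folded into the accumulators.
+ */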
+ for (; ip < ipend; ip += 2) {
+ asm volatile("movdqu %0, %%xmm5" :: "m"(*ip));
+ asm volatile("movdqa %xmm5, %xmm6");
+ asm volatile("punpckldq %xmm4, %xmm5");
+ asm volatile("punpckhdq %xmm4, %xmm6");
+ asm volatile("paddq %xmm5, %xmm0");
+ asm volatile("paddq %xmm0, %xmm1");
+ asm volatile("paddq %xmm1, %xmm2");
+ asm volatile("paddq %xmm2, %xmm3");
+ asm volatile("paddq %xmm6, %xmm0");
+ asm volatile("paddq %xmm0, %xmm1");
+ asm volatile("paddq %xmm1, %xmm2");
+ asm volatile("paddq %xmm2, %xmm3");
+ }
+
+ FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+ kfpu_end();
+}
+
+static void
+fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_SSE_RESTORE_CTX(ctx);
+
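+ /*
+ * SSE2 has no byte shuffle, so the words are byteswapped with
+ * BSWAP_32() in scalar code and only the 64-bit accumulation is done
+ * in the vector unit.
+ */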
+ for (; ip < ipend; ip += 2) {
+ uint32_t scratch1 = BSWAP_32(ip[0]);
+ uint32_t scratch2 = BSWAP_32(ip[1]);
+ asm volatile("movd %0, %%xmm5" :: "r"(scratch1));
+ asm volatile("movd %0, %%xmm6" :: "r"(scratch2));
+ asm volatile("punpcklqdq %xmm6, %xmm5");
+ asm volatile("paddq %xmm5, %xmm0");
+ asm volatile("paddq %xmm0, %xmm1");
+ asm volatile("paddq %xmm1, %xmm2");
+ asm volatile("paddq %xmm2, %xmm3");
+ }
+
+ FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+ kfpu_end();
+}
+
+static boolean_t fletcher_4_sse2_valid(void)
+{
+ return (kfpu_allowed() && zfs_sse2_available());
+}
+
+const fletcher_4_ops_t fletcher_4_sse2_ops = {
+ .init_native = fletcher_4_sse2_init,
+ .fini_native = fletcher_4_sse2_fini,
+ .compute_native = fletcher_4_sse2_native,
+ .init_byteswap = fletcher_4_sse2_init,
+ .fini_byteswap = fletcher_4_sse2_fini,
+ .compute_byteswap = fletcher_4_sse2_byteswap,
+ .valid = fletcher_4_sse2_valid,
+ .name = "sse2"
+};
+
+#endif /* defined(HAVE_SSE2) */
+
+#if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
+static void
+fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ static const zfs_fletcher_sse_t mask = {
+ .v = { 0x0405060700010203, 0x0C0D0E0F08090A0B }
+ };
+
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_SSE_RESTORE_CTX(ctx);
+
+ asm volatile("movdqu %0, %%xmm7"::"m" (mask));
+ asm volatile("pxor %xmm4, %xmm4");
+
+ for (; ip < ipend; ip += 2) {
+ asm volatile("movdqu %0, %%xmm5"::"m" (*ip));
+ asm volatile("pshufb %xmm7, %xmm5");
+ asm volatile("movdqa %xmm5, %xmm6");
+ asm volatile("punpckldq %xmm4, %xmm5");
+ asm volatile("punpckhdq %xmm4, %xmm6");
+ asm volatile("paddq %xmm5, %xmm0");
+ asm volatile("paddq %xmm0, %xmm1");
+ asm volatile("paddq %xmm1, %xmm2");
+ asm volatile("paddq %xmm2, %xmm3");
+ asm volatile("paddq %xmm6, %xmm0");
+ asm volatile("paddq %xmm0, %xmm1");
+ asm volatile("paddq %xmm1, %xmm2");
+ asm volatile("paddq %xmm2, %xmm3");
+ }
+
+ FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+ kfpu_end();
+}
+
+static boolean_t fletcher_4_ssse3_valid(void)
+{
+ return (kfpu_allowed() && zfs_sse2_available() &&
+ zfs_ssse3_available());
+}
+
+const fletcher_4_ops_t fletcher_4_ssse3_ops = {
+ .init_native = fletcher_4_sse2_init,
+ .fini_native = fletcher_4_sse2_fini,
+ .compute_native = fletcher_4_sse2_native,
+ .init_byteswap = fletcher_4_sse2_init,
+ .fini_byteswap = fletcher_4_sse2_fini,
+ .compute_byteswap = fletcher_4_ssse3_byteswap,
+ .valid = fletcher_4_ssse3_valid,
+ .name = "ssse3"
+};
+
+#endif /* defined(HAVE_SSE2) && defined(HAVE_SSSE3) */
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c
new file mode 100644
index 000000000000..153f5c7d75e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c
@@ -0,0 +1,163 @@
+/*
+ * Implement fast Fletcher4 using superscalar pipelines.
+ *
+ * Use regular C code to compute
+ * Fletcher4 in two incremental 64-bit parallel accumulator streams,
+ * and then combine the streams to form the final four checksum words.
+ * This implementation is a derivative of the AVX SIMD implementation by
+ * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
+ *
+ * Copyright (C) 2016 Romain Dolbeau.
+ *
+ * Authors:
+ * Romain Dolbeau <romain.dolbeau@atos.net>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/byteorder.h>
+#include <sys/spa_checksum.h>
+#include <sys/strings.h>
+#include <zfs_fletcher.h>
+
+static void
+fletcher_4_superscalar_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->superscalar, 4 * sizeof (zfs_fletcher_superscalar_t));
+}
+
+static void
+fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ uint64_t A, B, C, D;
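+ /*
+ * Same two-stream recombination as the 128-bit SIMD variants (see the
+ * mixing-matrix comment in zfs_fletcher_sse.c).
+ */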
+ A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1];
+ B = 2 * ctx->superscalar[1].v[0] + 2 * ctx->superscalar[1].v[1] -
+ ctx->superscalar[0].v[1];
+ C = 4 * ctx->superscalar[2].v[0] - ctx->superscalar[1].v[0] +
+ 4 * ctx->superscalar[2].v[1] - 3 * ctx->superscalar[1].v[1];
+ D = 8 * ctx->superscalar[3].v[0] - 4 * ctx->superscalar[2].v[0] +
+ 8 * ctx->superscalar[3].v[1] - 8 * ctx->superscalar[2].v[1] +
+ ctx->superscalar[1].v[1];
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+static void
+fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+ uint64_t a2, b2, c2, d2;
+
+ a = ctx->superscalar[0].v[0];
+ b = ctx->superscalar[1].v[0];
+ c = ctx->superscalar[2].v[0];
+ d = ctx->superscalar[3].v[0];
+ a2 = ctx->superscalar[0].v[1];
+ b2 = ctx->superscalar[1].v[1];
+ c2 = ctx->superscalar[2].v[1];
+ d2 = ctx->superscalar[3].v[1];
+
+ for (; ip < ipend; ip += 2) {
+ a += ip[0];
+ a2 += ip[1];
+ b += a;
+ b2 += a2;
+ c += b;
+ c2 += b2;
+ d += c;
+ d2 += c2;
+ }
+
+ ctx->superscalar[0].v[0] = a;
+ ctx->superscalar[1].v[0] = b;
+ ctx->superscalar[2].v[0] = c;
+ ctx->superscalar[3].v[0] = d;
+ ctx->superscalar[0].v[1] = a2;
+ ctx->superscalar[1].v[1] = b2;
+ ctx->superscalar[2].v[1] = c2;
+ ctx->superscalar[3].v[1] = d2;
+}
+
+static void
+fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+ uint64_t a2, b2, c2, d2;
+
+ a = ctx->superscalar[0].v[0];
+ b = ctx->superscalar[1].v[0];
+ c = ctx->superscalar[2].v[0];
+ d = ctx->superscalar[3].v[0];
+ a2 = ctx->superscalar[0].v[1];
+ b2 = ctx->superscalar[1].v[1];
+ c2 = ctx->superscalar[2].v[1];
+ d2 = ctx->superscalar[3].v[1];
+
+ for (; ip < ipend; ip += 2) {
+ a += BSWAP_32(ip[0]);
+ a2 += BSWAP_32(ip[1]);
+ b += a;
+ b2 += a2;
+ c += b;
+ c2 += b2;
+ d += c;
+ d2 += c2;
+ }
+
+ ctx->superscalar[0].v[0] = a;
+ ctx->superscalar[1].v[0] = b;
+ ctx->superscalar[2].v[0] = c;
+ ctx->superscalar[3].v[0] = d;
+ ctx->superscalar[0].v[1] = a2;
+ ctx->superscalar[1].v[1] = b2;
+ ctx->superscalar[2].v[1] = c2;
+ ctx->superscalar[3].v[1] = d2;
+}
+
+static boolean_t fletcher_4_superscalar_valid(void)
+{
+ return (B_TRUE);
+}
+
+const fletcher_4_ops_t fletcher_4_superscalar_ops = {
+ .init_native = fletcher_4_superscalar_init,
+ .compute_native = fletcher_4_superscalar_native,
+ .fini_native = fletcher_4_superscalar_fini,
+ .init_byteswap = fletcher_4_superscalar_init,
+ .compute_byteswap = fletcher_4_superscalar_byteswap,
+ .fini_byteswap = fletcher_4_superscalar_fini,
+ .valid = fletcher_4_superscalar_valid,
+ .name = "superscalar"
+};
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c
new file mode 100644
index 000000000000..75e6a3baf980
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c
@@ -0,0 +1,229 @@
+/*
+ * Implement fast Fletcher4 using superscalar pipelines.
+ *
+ * Use regular C code to compute
+ * Fletcher4 in four incremental 64-bit parallel accumulator streams,
+ * and then combine the streams to form the final four checksum words.
+ * This implementation is a derivative of the AVX SIMD implementation by
+ * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
+ *
+ * Copyright (C) 2016 Romain Dolbeau.
+ *
+ * Authors:
+ * Romain Dolbeau <romain.dolbeau@atos.net>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/byteorder.h>
+#include <sys/spa_checksum.h>
+#include <sys/strings.h>
+#include <zfs_fletcher.h>
+
+static void
+fletcher_4_superscalar4_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->superscalar, 4 * sizeof (zfs_fletcher_superscalar_t));
+}
+
+static void
+fletcher_4_superscalar4_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ uint64_t A, B, C, D;
+
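+ /*
+ * Recombine the four interleaved streams; the weights match the AVX2
+ * implementation, which also keeps four parallel accumulator streams.
+ */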
+ A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1] +
+ ctx->superscalar[0].v[2] + ctx->superscalar[0].v[3];
+ B = 0 - ctx->superscalar[0].v[1] - 2 * ctx->superscalar[0].v[2] -
+ 3 * ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] +
+ 4 * ctx->superscalar[1].v[1] + 4 * ctx->superscalar[1].v[2] +
+ 4 * ctx->superscalar[1].v[3];
+
+ C = ctx->superscalar[0].v[2] + 3 * ctx->superscalar[0].v[3] -
+ 6 * ctx->superscalar[1].v[0] - 10 * ctx->superscalar[1].v[1] -
+ 14 * ctx->superscalar[1].v[2] - 18 * ctx->superscalar[1].v[3] +
+ 16 * ctx->superscalar[2].v[0] + 16 * ctx->superscalar[2].v[1] +
+ 16 * ctx->superscalar[2].v[2] + 16 * ctx->superscalar[2].v[3];
+
+ D = 0 - ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] +
+ 10 * ctx->superscalar[1].v[1] + 20 * ctx->superscalar[1].v[2] +
+ 34 * ctx->superscalar[1].v[3] - 48 * ctx->superscalar[2].v[0] -
+ 64 * ctx->superscalar[2].v[1] - 80 * ctx->superscalar[2].v[2] -
+ 96 * ctx->superscalar[2].v[3] + 64 * ctx->superscalar[3].v[0] +
+ 64 * ctx->superscalar[3].v[1] + 64 * ctx->superscalar[3].v[2] +
+ 64 * ctx->superscalar[3].v[3];
+
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+static void
+fletcher_4_superscalar4_native(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+ uint64_t a2, b2, c2, d2;
+ uint64_t a3, b3, c3, d3;
+ uint64_t a4, b4, c4, d4;
+
+ a = ctx->superscalar[0].v[0];
+ b = ctx->superscalar[1].v[0];
+ c = ctx->superscalar[2].v[0];
+ d = ctx->superscalar[3].v[0];
+ a2 = ctx->superscalar[0].v[1];
+ b2 = ctx->superscalar[1].v[1];
+ c2 = ctx->superscalar[2].v[1];
+ d2 = ctx->superscalar[3].v[1];
+ a3 = ctx->superscalar[0].v[2];
+ b3 = ctx->superscalar[1].v[2];
+ c3 = ctx->superscalar[2].v[2];
+ d3 = ctx->superscalar[3].v[2];
+ a4 = ctx->superscalar[0].v[3];
+ b4 = ctx->superscalar[1].v[3];
+ c4 = ctx->superscalar[2].v[3];
+ d4 = ctx->superscalar[3].v[3];
+
+ for (; ip < ipend; ip += 4) {
+ a += ip[0];
+ a2 += ip[1];
+ a3 += ip[2];
+ a4 += ip[3];
+ b += a;
+ b2 += a2;
+ b3 += a3;
+ b4 += a4;
+ c += b;
+ c2 += b2;
+ c3 += b3;
+ c4 += b4;
+ d += c;
+ d2 += c2;
+ d3 += c3;
+ d4 += c4;
+ }
+
+ ctx->superscalar[0].v[0] = a;
+ ctx->superscalar[1].v[0] = b;
+ ctx->superscalar[2].v[0] = c;
+ ctx->superscalar[3].v[0] = d;
+ ctx->superscalar[0].v[1] = a2;
+ ctx->superscalar[1].v[1] = b2;
+ ctx->superscalar[2].v[1] = c2;
+ ctx->superscalar[3].v[1] = d2;
+ ctx->superscalar[0].v[2] = a3;
+ ctx->superscalar[1].v[2] = b3;
+ ctx->superscalar[2].v[2] = c3;
+ ctx->superscalar[3].v[2] = d3;
+ ctx->superscalar[0].v[3] = a4;
+ ctx->superscalar[1].v[3] = b4;
+ ctx->superscalar[2].v[3] = c4;
+ ctx->superscalar[3].v[3] = d4;
+}
+
+static void
+fletcher_4_superscalar4_byteswap(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+ uint64_t a2, b2, c2, d2;
+ uint64_t a3, b3, c3, d3;
+ uint64_t a4, b4, c4, d4;
+
+ a = ctx->superscalar[0].v[0];
+ b = ctx->superscalar[1].v[0];
+ c = ctx->superscalar[2].v[0];
+ d = ctx->superscalar[3].v[0];
+ a2 = ctx->superscalar[0].v[1];
+ b2 = ctx->superscalar[1].v[1];
+ c2 = ctx->superscalar[2].v[1];
+ d2 = ctx->superscalar[3].v[1];
+ a3 = ctx->superscalar[0].v[2];
+ b3 = ctx->superscalar[1].v[2];
+ c3 = ctx->superscalar[2].v[2];
+ d3 = ctx->superscalar[3].v[2];
+ a4 = ctx->superscalar[0].v[3];
+ b4 = ctx->superscalar[1].v[3];
+ c4 = ctx->superscalar[2].v[3];
+ d4 = ctx->superscalar[3].v[3];
+
+ for (; ip < ipend; ip += 4) {
+ a += BSWAP_32(ip[0]);
+ a2 += BSWAP_32(ip[1]);
+ a3 += BSWAP_32(ip[2]);
+ a4 += BSWAP_32(ip[3]);
+ b += a;
+ b2 += a2;
+ b3 += a3;
+ b4 += a4;
+ c += b;
+ c2 += b2;
+ c3 += b3;
+ c4 += b4;
+ d += c;
+ d2 += c2;
+ d3 += c3;
+ d4 += c4;
+ }
+
+ ctx->superscalar[0].v[0] = a;
+ ctx->superscalar[1].v[0] = b;
+ ctx->superscalar[2].v[0] = c;
+ ctx->superscalar[3].v[0] = d;
+ ctx->superscalar[0].v[1] = a2;
+ ctx->superscalar[1].v[1] = b2;
+ ctx->superscalar[2].v[1] = c2;
+ ctx->superscalar[3].v[1] = d2;
+ ctx->superscalar[0].v[2] = a3;
+ ctx->superscalar[1].v[2] = b3;
+ ctx->superscalar[2].v[2] = c3;
+ ctx->superscalar[3].v[2] = d3;
+ ctx->superscalar[0].v[3] = a4;
+ ctx->superscalar[1].v[3] = b4;
+ ctx->superscalar[2].v[3] = c4;
+ ctx->superscalar[3].v[3] = d4;
+}
+
+static boolean_t
+fletcher_4_superscalar4_valid(void)
+{
+ return (B_TRUE);
+}
+
+const fletcher_4_ops_t fletcher_4_superscalar4_ops = {
+ .init_native = fletcher_4_superscalar4_init,
+ .compute_native = fletcher_4_superscalar4_native,
+ .fini_native = fletcher_4_superscalar4_fini,
+ .init_byteswap = fletcher_4_superscalar4_init,
+ .compute_byteswap = fletcher_4_superscalar4_byteswap,
+ .fini_byteswap = fletcher_4_superscalar4_fini,
+ .valid = fletcher_4_superscalar4_valid,
+ .name = "superscalar4"
+};
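Editor's note, not part of the patch: the constants in fletcher_4_superscalar4_fini() fold the four striped lanes back into the result a plain serial Fletcher-4 pass would produce. Below is a minimal self-check sketch under that assumption, using only the types and macros this file already includes; the helper names are illustrative, and the buffer size is assumed to be a multiple of 16 bytes (the 4-way stride).

static void
fletcher_4_serial_reference(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a = 0, b = 0, c = 0, d = 0;

	/* Classic serial Fletcher-4 over 32-bit native-endian words. */
	for (; ip < ipend; ip++) {
		a += ip[0];
		b += a;
		c += b;
		d += c;
	}
	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
}

static boolean_t
fletcher_4_superscalar4_selfcheck(const void *buf, uint64_t size)
{
	fletcher_4_ctx_t ctx;
	zio_cksum_t got, want;

	/* Drive the 4-way implementation the way the framework would. */
	fletcher_4_superscalar4_init(&ctx);
	fletcher_4_superscalar4_native(&ctx, buf, size);
	fletcher_4_superscalar4_fini(&ctx, &got);

	fletcher_4_serial_reference(buf, size, &want);

	return (got.zc_word[0] == want.zc_word[0] &&
	    got.zc_word[1] == want.zc_word[1] &&
	    got.zc_word[2] == want.zc_word[2] &&
	    got.zc_word[3] == want.zc_word[3]);
}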
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c b/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c
new file mode 100644
index 000000000000..0011a971cacb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c
@@ -0,0 +1,473 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * Common name validation routines for ZFS. These routines are shared by the
+ * userland code as well as the ioctl() layer to ensure that we don't
+ * inadvertently expose a hole through direct ioctl()s that never gets tested.
+ * In userland, however, we want significantly more information about _why_ the
+ * name is invalid. In the kernel, we only care whether it's valid or not.
+ * Each routine therefore takes a 'namecheck_err_t' which describes exactly why
+ * the name failed to validate.
+ */
+
+#if !defined(_KERNEL)
+#include <string.h>
+#endif
+
+#include <sys/dsl_dir.h>
+#include <sys/param.h>
+#include <sys/nvpair.h>
+#include "zfs_namecheck.h"
+#include "zfs_deleg.h"
+
+/*
+ * Deeply nested datasets can overflow the stack, so we put a limit
+ * in the amount of nesting a path can have. zfs_max_dataset_nesting
+ * can be tuned temporarily to fix existing datasets that exceed our
+ * predefined limit.
+ */
+int zfs_max_dataset_nesting = 50;
+
+static int
+valid_char(char c)
+{
+ return ((c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= '0' && c <= '9') ||
+ c == '-' || c == '_' || c == '.' || c == ':' || c == ' ');
+}
+
+/*
+ * Looks at a path and returns its level of nesting (depth).
+ */
+int
+get_dataset_depth(const char *path)
+{
+ const char *loc = path;
+ int nesting = 0;
+
+ /*
+ * Keep track of nesting until you hit the end of the
+	 * path or find the snapshot/bookmark separator.
+ */
+ for (int i = 0; loc[i] != '\0' &&
+ loc[i] != '@' &&
+ loc[i] != '#'; i++) {
+ if (loc[i] == '/')
+ nesting++;
+ }
+
+ return (nesting);
+}
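Editor's note, not part of the patch: a few illustrative values, derived from the loop above (depth is simply the number of '/' separators before any '@' or '#').

/*
 * Illustrative results for get_dataset_depth():
 *   "tank"           -> 0
 *   "tank/a/b"       -> 2
 *   "tank/a/b@snap"  -> 2   ('@' ends the scan)
 *   "tank/a#bmark"   -> 1   ('#' ends the scan)
 */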
+
+/*
+ * Snapshot names must be made up of alphanumeric characters plus the following
+ * characters:
+ *
+ * [-_.: ]
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ const char *loc;
+
+ if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ if (path[0] == '\0') {
+ if (why)
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ return (-1);
+ }
+
+ for (loc = path; *loc; loc++) {
+ if (!valid_char(*loc)) {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *loc;
+ }
+ return (-1);
+ }
+ }
+ return (0);
+}
+
+
+/*
+ * Permissions set name must start with the letter '@' followed by the
+ * same character restrictions as snapshot names, except that the name
+ * cannot exceed 64 characters.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+permset_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ if (strlen(path) >= ZFS_PERMSET_MAXLEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ if (path[0] != '@') {
+ if (why) {
+ *why = NAME_ERR_NO_AT;
+ *what = path[0];
+ }
+ return (-1);
+ }
+
+ return (zfs_component_namecheck(&path[1], why, what));
+}
+
+/*
+ * Dataset paths should not be deeper than zfs_max_dataset_nesting
+ * in terms of nesting.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+dataset_nestcheck(const char *path)
+{
+ return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1);
+}
+
+/*
+ * Entity names must be of the following form:
+ *
+ * [component/]*[component][(@|#)component]?
+ *
+ * Where each component is made up of alphanumeric characters plus the following
+ * characters:
+ *
+ * [-_.: %]
+ *
+ * We allow '%' here as we use that character internally to create unique
+ * names for temporary clones (for online recv).
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+entity_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ const char *end;
+
+ EQUIV(why == NULL, what == NULL);
+
+ /*
+ * Make sure the name is not too long.
+ */
+ if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ /* Explicitly check for a leading slash. */
+ if (path[0] == '/') {
+ if (why)
+ *why = NAME_ERR_LEADING_SLASH;
+ return (-1);
+ }
+
+ if (path[0] == '\0') {
+ if (why)
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ return (-1);
+ }
+
+ const char *start = path;
+ boolean_t found_delim = B_FALSE;
+ for (;;) {
+ /* Find the end of this component */
+ end = start;
+ while (*end != '/' && *end != '@' && *end != '#' &&
+ *end != '\0')
+ end++;
+
+ if (*end == '\0' && end[-1] == '/') {
+ /* trailing slashes are not allowed */
+ if (why)
+ *why = NAME_ERR_TRAILING_SLASH;
+ return (-1);
+ }
+
+ /* Validate the contents of this component */
+ for (const char *loc = start; loc != end; loc++) {
+ if (!valid_char(*loc) && *loc != '%') {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *loc;
+ }
+ return (-1);
+ }
+ }
+
+ if (*end == '\0' || *end == '/') {
+ int component_length = end - start;
+			/* Validate that this component is not '.' */
+ if (component_length == 1) {
+ if (start[0] == '.') {
+ if (why)
+ *why = NAME_ERR_SELF_REF;
+ return (-1);
+ }
+ }
+
+			/* Validate that this component is not '..' */
+ if (component_length == 2) {
+ if (start[0] == '.' && start[1] == '.') {
+ if (why)
+ *why = NAME_ERR_PARENT_REF;
+ return (-1);
+ }
+ }
+ }
+
+ /* Snapshot or bookmark delimiter found */
+ if (*end == '@' || *end == '#') {
+ /* Multiple delimiters are not allowed */
+ if (found_delim != 0) {
+ if (why)
+ *why = NAME_ERR_MULTIPLE_DELIMITERS;
+ return (-1);
+ }
+
+ found_delim = B_TRUE;
+ }
+
+ /* Zero-length components are not allowed */
+ if (start == end) {
+ if (why)
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ return (-1);
+ }
+
+ /* If we've reached the end of the string, we're OK */
+ if (*end == '\0')
+ return (0);
+
+ /*
+ * If there is a '/' in a snapshot or bookmark name
+ * then report an error
+ */
+ if (*end == '/' && found_delim != 0) {
+ if (why)
+ *why = NAME_ERR_TRAILING_SLASH;
+ return (-1);
+ }
+
+ /* Update to the next component */
+ start = end + 1;
+ }
+}
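Editor's note, not part of the patch: a hedged sketch of how callers interrogate the 'why'/'what' out-parameters of entity_namecheck(); the wrapper name is hypothetical and the expected results follow from the checks above.

static void
entity_namecheck_example(void)
{
	namecheck_err_t why;
	char what;

	/* Accepted: two components and a single snapshot delimiter. */
	(void) entity_namecheck("tank/fs@snap", &why, &what); /* returns 0 */

	/* Rejected: empty component between the slashes. */
	if (entity_namecheck("tank//fs", &why, &what) != 0) {
		/* why == NAME_ERR_EMPTY_COMPONENT here */
	}

	/* Rejected: at most one '@' or '#' delimiter is allowed. */
	if (entity_namecheck("tank/fs@a@b", &why, &what) != 0) {
		/* why == NAME_ERR_MULTIPLE_DELIMITERS here */
	}
}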
+
+/*
+ * A dataset is any entity except a bookmark.
+ */
+int
+dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ int ret = entity_namecheck(path, why, what);
+
+ if (ret == 0 && strchr(path, '#') != NULL) {
+ if (why != NULL) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = '#';
+ }
+ return (-1);
+ }
+
+ return (ret);
+}
+
+/*
+ * Assert path is a valid bookmark name
+ */
+int
+bookmark_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ int ret = entity_namecheck(path, why, what);
+
+ if (ret == 0 && strchr(path, '#') == NULL) {
+ if (why != NULL) {
+ *why = NAME_ERR_NO_POUND;
+ *what = '#';
+ }
+ return (-1);
+ }
+
+ return (ret);
+}
+
+/*
+ * Assert path is a valid snapshot name
+ */
+int
+snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ int ret = entity_namecheck(path, why, what);
+
+ if (ret == 0 && strchr(path, '@') == NULL) {
+ if (why != NULL) {
+ *why = NAME_ERR_NO_AT;
+ *what = '@';
+ }
+ return (-1);
+ }
+
+ return (ret);
+}
+
+/*
+ * Mountpoint names must be of the following form:
+ *
+ * /[component][/]*[component][/]
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+mountpoint_namecheck(const char *path, namecheck_err_t *why)
+{
+ const char *start, *end;
+
+ /*
+ * Make sure none of the mountpoint component names are too long.
+ * If a component name is too long then the mkdir of the mountpoint
+ * will fail but then the mountpoint property will be set to a value
+ * that can never be mounted. Better to fail before setting the prop.
+ * Extra slashes are OK, they will be tossed by the mountpoint mkdir.
+ */
+
+ if (path == NULL || *path != '/') {
+ if (why)
+ *why = NAME_ERR_LEADING_SLASH;
+ return (-1);
+ }
+
+ /* Skip leading slash */
+ start = &path[1];
+ do {
+ end = start;
+ while (*end != '/' && *end != '\0')
+ end++;
+
+ if (end - start >= ZFS_MAX_DATASET_NAME_LEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+ start = end + 1;
+
+ } while (*end != '\0');
+
+ return (0);
+}
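Editor's note, not part of the patch: only the leading slash and per-component length are enforced here, so for example:

/*
 * Illustrative results for mountpoint_namecheck():
 *   "/data/projects"   -> 0
 *   "/data//projects/" -> 0   (extra slashes are tolerated)
 *   "data/projects"    -> -1  (*why = NAME_ERR_LEADING_SLASH)
 *   a path with any component of ZFS_MAX_DATASET_NAME_LEN or more chars
 *                      -> -1  (*why = NAME_ERR_TOOLONG)
 */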
+
+/*
+ * For pool names, we have the same set of valid characters as for dataset
+ * names, with the additional restriction that the pool name must begin with
+ * a letter. The pool names 'mirror', 'raidz', and 'draid' are reserved and
+ * cannot be used, nor can names that look like disk devices (e.g. 'c0t0d0').
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
+{
+ const char *c;
+
+ /*
+ * Make sure the name is not too long.
+	 * If we're creating a pool with version >= SPA_VERSION_DSL_SCRUB (v11),
+	 * we need to account for the additional space needed by the origin
+	 * dataset, which will also be snapshotted:
+	 * "poolname"+"/"+"$ORIGIN"+"@"+"$ORIGIN".
+	 * Play it safe and enforce this limit even if the pool version is < 11,
+	 * so it can be upgraded without issues.
+ */
+ if (strlen(pool) >= (ZFS_MAX_DATASET_NAME_LEN - 2 -
+ strlen(ORIGIN_DIR_NAME) * 2)) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ c = pool;
+ while (*c != '\0') {
+ if (!valid_char(*c)) {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *c;
+ }
+ return (-1);
+ }
+ c++;
+ }
+
+ if (!(*pool >= 'a' && *pool <= 'z') &&
+ !(*pool >= 'A' && *pool <= 'Z')) {
+ if (why)
+ *why = NAME_ERR_NOLETTER;
+ return (-1);
+ }
+
+ if (strcmp(pool, "mirror") == 0 ||
+ strcmp(pool, "raidz") == 0 ||
+ strcmp(pool, "draid") == 0) {
+ if (why)
+ *why = NAME_ERR_RESERVED;
+ return (-1);
+ }
+
+ if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) {
+ if (why)
+ *why = NAME_ERR_DISKLIKE;
+ return (-1);
+ }
+
+ return (0);
+}
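Editor's note, not part of the patch: a few illustrative inputs and results, derived from the checks in pool_namecheck() above.

/*
 * Illustrative results for pool_namecheck():
 *   "tank"    -> 0
 *   "2tank"   -> -1  (*why = NAME_ERR_NOLETTER, must start with a letter)
 *   "mirror"  -> -1  (*why = NAME_ERR_RESERVED)
 *   "c0t0d0"  -> -1  (*why = NAME_ERR_DISKLIKE)
 */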
+
+EXPORT_SYMBOL(entity_namecheck);
+EXPORT_SYMBOL(pool_namecheck);
+EXPORT_SYMBOL(dataset_namecheck);
+EXPORT_SYMBOL(bookmark_namecheck);
+EXPORT_SYMBOL(snapshot_namecheck);
+EXPORT_SYMBOL(zfs_component_namecheck);
+EXPORT_SYMBOL(dataset_nestcheck);
+EXPORT_SYMBOL(get_dataset_depth);
+EXPORT_SYMBOL(zfs_max_dataset_nesting);
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_dataset_nesting, INT, ZMOD_RW,
+ "Limit to the amount of nesting a path can have. Defaults to 50.");
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
new file mode 100644
index 000000000000..b78331187e13
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
@@ -0,0 +1,1052 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/u8_textprep.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/dsl_crypt.h>
+
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_fletcher.h"
+
+#if !defined(_KERNEL)
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS];
+
+/* Note: this is indexed by zfs_userquota_prop_t; keep the order the same. */
+const char *zfs_userquota_prop_prefixes[] = {
+ "userused@",
+ "userquota@",
+ "groupused@",
+ "groupquota@",
+ "userobjused@",
+ "userobjquota@",
+ "groupobjused@",
+ "groupobjquota@",
+ "projectused@",
+ "projectquota@",
+ "projectobjused@",
+ "projectobjquota@"
+};
+
+zprop_desc_t *
+zfs_prop_get_table(void)
+{
+ return (zfs_prop_table);
+}
+
+void
+zfs_prop_init(void)
+{
+ static zprop_index_t checksum_table[] = {
+ { "on", ZIO_CHECKSUM_ON },
+ { "off", ZIO_CHECKSUM_OFF },
+ { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
+ { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
+ { "sha256", ZIO_CHECKSUM_SHA256 },
+ { "noparity", ZIO_CHECKSUM_NOPARITY },
+ { "sha512", ZIO_CHECKSUM_SHA512 },
+ { "skein", ZIO_CHECKSUM_SKEIN },
+#if !defined(__FreeBSD__)
+ { "edonr", ZIO_CHECKSUM_EDONR },
+#endif
+ { NULL }
+ };
+
+ static zprop_index_t dedup_table[] = {
+ { "on", ZIO_CHECKSUM_ON },
+ { "off", ZIO_CHECKSUM_OFF },
+ { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
+ { "sha256", ZIO_CHECKSUM_SHA256 },
+ { "sha256,verify",
+ ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
+ { "sha512", ZIO_CHECKSUM_SHA512 },
+ { "sha512,verify",
+ ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY },
+ { "skein", ZIO_CHECKSUM_SKEIN },
+ { "skein,verify",
+ ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY },
+#if !defined(__FreeBSD__)
+ { "edonr,verify",
+ ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY },
+#endif
+ { NULL }
+ };
+
+ static zprop_index_t compress_table[] = {
+ { "on", ZIO_COMPRESS_ON },
+ { "off", ZIO_COMPRESS_OFF },
+ { "lzjb", ZIO_COMPRESS_LZJB },
+ { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */
+ { "gzip-1", ZIO_COMPRESS_GZIP_1 },
+ { "gzip-2", ZIO_COMPRESS_GZIP_2 },
+ { "gzip-3", ZIO_COMPRESS_GZIP_3 },
+ { "gzip-4", ZIO_COMPRESS_GZIP_4 },
+ { "gzip-5", ZIO_COMPRESS_GZIP_5 },
+ { "gzip-6", ZIO_COMPRESS_GZIP_6 },
+ { "gzip-7", ZIO_COMPRESS_GZIP_7 },
+ { "gzip-8", ZIO_COMPRESS_GZIP_8 },
+ { "gzip-9", ZIO_COMPRESS_GZIP_9 },
+ { "zle", ZIO_COMPRESS_ZLE },
+ { "lz4", ZIO_COMPRESS_LZ4 },
+ { "zstd", ZIO_COMPRESS_ZSTD },
+ { "zstd-fast",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_DEFAULT) },
+
+ /*
+ * ZSTD 1-19 are synthetic. We store the compression level in a
+ * separate hidden property to avoid wasting a large amount of
+ * space in the ZIO_COMPRESS enum.
+ *
+ * The compression level is also stored within the header of the
+ * compressed block since we may need it for later recompression
+ * to avoid checksum errors (L2ARC).
+ *
+		 * Note that the level here is encoded as a bit-shifted value
+		 * layered on top of the compression method.
+ */
+ { "zstd-1", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_1) },
+ { "zstd-2", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_2) },
+ { "zstd-3", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_3) },
+ { "zstd-4", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_4) },
+ { "zstd-5", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_5) },
+ { "zstd-6", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_6) },
+ { "zstd-7", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_7) },
+ { "zstd-8", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_8) },
+ { "zstd-9", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_9) },
+ { "zstd-10", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_10) },
+ { "zstd-11", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_11) },
+ { "zstd-12", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_12) },
+ { "zstd-13", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_13) },
+ { "zstd-14", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_14) },
+ { "zstd-15", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_15) },
+ { "zstd-16", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_16) },
+ { "zstd-17", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_17) },
+ { "zstd-18", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_18) },
+ { "zstd-19", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_19) },
+
+ /*
+ * The ZSTD-Fast levels are also synthetic.
+ */
+ { "zstd-fast-1",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1) },
+ { "zstd-fast-2",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_2) },
+ { "zstd-fast-3",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_3) },
+ { "zstd-fast-4",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_4) },
+ { "zstd-fast-5",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_5) },
+ { "zstd-fast-6",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_6) },
+ { "zstd-fast-7",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_7) },
+ { "zstd-fast-8",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_8) },
+ { "zstd-fast-9",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_9) },
+ { "zstd-fast-10",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_10) },
+ { "zstd-fast-20",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_20) },
+ { "zstd-fast-30",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_30) },
+ { "zstd-fast-40",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_40) },
+ { "zstd-fast-50",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_50) },
+ { "zstd-fast-60",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_60) },
+ { "zstd-fast-70",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_70) },
+ { "zstd-fast-80",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_80) },
+ { "zstd-fast-90",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_90) },
+ { "zstd-fast-100",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_100) },
+ { "zstd-fast-500",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_500) },
+ { "zstd-fast-1000",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1000) },
+ { NULL }
+ };
+
+ static zprop_index_t crypto_table[] = {
+ { "on", ZIO_CRYPT_ON },
+ { "off", ZIO_CRYPT_OFF },
+ { "aes-128-ccm", ZIO_CRYPT_AES_128_CCM },
+ { "aes-192-ccm", ZIO_CRYPT_AES_192_CCM },
+ { "aes-256-ccm", ZIO_CRYPT_AES_256_CCM },
+ { "aes-128-gcm", ZIO_CRYPT_AES_128_GCM },
+ { "aes-192-gcm", ZIO_CRYPT_AES_192_GCM },
+ { "aes-256-gcm", ZIO_CRYPT_AES_256_GCM },
+ { NULL }
+ };
+
+ static zprop_index_t keyformat_table[] = {
+ { "none", ZFS_KEYFORMAT_NONE },
+ { "raw", ZFS_KEYFORMAT_RAW },
+ { "hex", ZFS_KEYFORMAT_HEX },
+ { "passphrase", ZFS_KEYFORMAT_PASSPHRASE },
+ { NULL }
+ };
+
+ static zprop_index_t snapdir_table[] = {
+ { "hidden", ZFS_SNAPDIR_HIDDEN },
+ { "visible", ZFS_SNAPDIR_VISIBLE },
+ { NULL }
+ };
+
+ static zprop_index_t snapdev_table[] = {
+ { "hidden", ZFS_SNAPDEV_HIDDEN },
+ { "visible", ZFS_SNAPDEV_VISIBLE },
+ { NULL }
+ };
+
+ static zprop_index_t acl_mode_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "groupmask", ZFS_ACL_GROUPMASK },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+ { "restricted", ZFS_ACL_RESTRICTED },
+ { NULL }
+ };
+
+ static zprop_index_t acltype_table[] = {
+ { "off", ZFS_ACLTYPE_OFF },
+ { "posix", ZFS_ACLTYPE_POSIX },
+ { "nfsv4", ZFS_ACLTYPE_NFSV4 },
+ { "disabled", ZFS_ACLTYPE_OFF }, /* bkwrd compatibility */
+ { "noacl", ZFS_ACLTYPE_OFF }, /* bkwrd compatibility */
+ { "posixacl", ZFS_ACLTYPE_POSIX }, /* bkwrd compatibility */
+ { NULL }
+ };
+
+ static zprop_index_t acl_inherit_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "noallow", ZFS_ACL_NOALLOW },
+ { "restricted", ZFS_ACL_RESTRICTED },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+ { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatibility */
+ { "passthrough-x", ZFS_ACL_PASSTHROUGH_X },
+ { NULL }
+ };
+
+ static zprop_index_t case_table[] = {
+ { "sensitive", ZFS_CASE_SENSITIVE },
+ { "insensitive", ZFS_CASE_INSENSITIVE },
+ { "mixed", ZFS_CASE_MIXED },
+ { NULL }
+ };
+
+ static zprop_index_t copies_table[] = {
+ { "1", 1 },
+ { "2", 2 },
+ { "3", 3 },
+ { NULL }
+ };
+
+ /*
+ * Use the unique flags we have to send to u8_strcmp() and/or
+ * u8_textprep() to represent the various normalization property
+ * values.
+ */
+ static zprop_index_t normalize_table[] = {
+ { "none", 0 },
+ { "formD", U8_TEXTPREP_NFD },
+ { "formKC", U8_TEXTPREP_NFKC },
+ { "formC", U8_TEXTPREP_NFC },
+ { "formKD", U8_TEXTPREP_NFKD },
+ { NULL }
+ };
+
+ static zprop_index_t version_table[] = {
+ { "1", 1 },
+ { "2", 2 },
+ { "3", 3 },
+ { "4", 4 },
+ { "5", 5 },
+ { "current", ZPL_VERSION },
+ { NULL }
+ };
+
+ static zprop_index_t boolean_table[] = {
+ { "off", 0 },
+ { "on", 1 },
+ { NULL }
+ };
+
+ static zprop_index_t keystatus_table[] = {
+ { "none", ZFS_KEYSTATUS_NONE},
+ { "unavailable", ZFS_KEYSTATUS_UNAVAILABLE},
+ { "available", ZFS_KEYSTATUS_AVAILABLE},
+ { NULL }
+ };
+
+ static zprop_index_t logbias_table[] = {
+ { "latency", ZFS_LOGBIAS_LATENCY },
+ { "throughput", ZFS_LOGBIAS_THROUGHPUT },
+ { NULL }
+ };
+
+ static zprop_index_t canmount_table[] = {
+ { "off", ZFS_CANMOUNT_OFF },
+ { "on", ZFS_CANMOUNT_ON },
+ { "noauto", ZFS_CANMOUNT_NOAUTO },
+ { NULL }
+ };
+
+ static zprop_index_t cache_table[] = {
+ { "none", ZFS_CACHE_NONE },
+ { "metadata", ZFS_CACHE_METADATA },
+ { "all", ZFS_CACHE_ALL },
+ { NULL }
+ };
+
+ static zprop_index_t sync_table[] = {
+ { "standard", ZFS_SYNC_STANDARD },
+ { "always", ZFS_SYNC_ALWAYS },
+ { "disabled", ZFS_SYNC_DISABLED },
+ { NULL }
+ };
+
+ static zprop_index_t xattr_table[] = {
+ { "off", ZFS_XATTR_OFF },
+ { "on", ZFS_XATTR_DIR },
+ { "sa", ZFS_XATTR_SA },
+ { "dir", ZFS_XATTR_DIR },
+ { NULL }
+ };
+
+ static zprop_index_t dnsize_table[] = {
+ { "legacy", ZFS_DNSIZE_LEGACY },
+ { "auto", ZFS_DNSIZE_AUTO },
+ { "1k", ZFS_DNSIZE_1K },
+ { "2k", ZFS_DNSIZE_2K },
+ { "4k", ZFS_DNSIZE_4K },
+ { "8k", ZFS_DNSIZE_8K },
+ { "16k", ZFS_DNSIZE_16K },
+ { NULL }
+ };
+
+ static zprop_index_t redundant_metadata_table[] = {
+ { "all", ZFS_REDUNDANT_METADATA_ALL },
+ { "most", ZFS_REDUNDANT_METADATA_MOST },
+ { NULL }
+ };
+
+ static zprop_index_t volmode_table[] = {
+ { "default", ZFS_VOLMODE_DEFAULT },
+ { "full", ZFS_VOLMODE_GEOM },
+ { "geom", ZFS_VOLMODE_GEOM },
+ { "dev", ZFS_VOLMODE_DEV },
+ { "none", ZFS_VOLMODE_NONE },
+ { NULL }
+ };
+
+ /* inherit index properties */
+ zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata",
+ ZFS_REDUNDANT_METADATA_ALL,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "all | most", "REDUND_MD",
+ redundant_metadata_table);
+ zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "standard | always | disabled", "SYNC",
+ sync_table);
+ zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
+ ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_VOLUME,
+#if !defined(__FreeBSD__)
+ "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein"
+ " | edonr",
+#else
+ "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein",
+#endif
+ "CHECKSUM", checksum_table);
+ zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | verify | sha256[,verify] | sha512[,verify] | "
+#if !defined(__FreeBSD__)
+ "skein[,verify] | edonr,verify",
+#else
+ "skein[,verify]",
+#endif
+ "DEDUP", dedup_table);
+ zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
+ ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4 | "
+ "zstd | zstd-[1-19] | "
+ "zstd-fast-[1-10,20,30,40,50,60,70,80,90,100,500,1000]",
+ "COMPRESS", compress_table);
+ zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "hidden | visible", "SNAPDIR", snapdir_table);
+ zprop_register_index(ZFS_PROP_SNAPDEV, "snapdev", ZFS_SNAPDEV_HIDDEN,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "hidden | visible", "SNAPDEV", snapdev_table);
+ zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "discard | groupmask | passthrough | restricted", "ACLMODE",
+ acl_mode_table);
+ zprop_register_index(ZFS_PROP_ACLTYPE, "acltype",
+#ifdef __linux__
+ /* Linux doesn't natively support ZFS's NFSv4-style ACLs. */
+ ZFS_ACLTYPE_OFF,
+#else
+ ZFS_ACLTYPE_NFSV4,
+#endif
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "off | nfsv4 | posix", "ACLTYPE", acltype_table);
+ zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit",
+ ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "discard | noallow | restricted | passthrough | passthrough-x",
+ "ACLINHERIT", acl_inherit_table);
+ zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "1 | 2 | 3", "COPIES", copies_table);
+ zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
+ ZFS_CACHE_ALL, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "all | none | metadata", "PRIMARYCACHE", cache_table);
+ zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
+ ZFS_CACHE_ALL, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "all | none | metadata", "SECONDARYCACHE", cache_table);
+ zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "latency | throughput", "LOGBIAS", logbias_table);
+ zprop_register_index(ZFS_PROP_XATTR, "xattr", ZFS_XATTR_DIR,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "on | off | dir | sa", "XATTR", xattr_table);
+ zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize",
+ ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table);
+ zprop_register_index(ZFS_PROP_VOLMODE, "volmode",
+ ZFS_VOLMODE_DEFAULT, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "default | full | geom | dev | none", "VOLMODE", volmode_table);
+
+ /* inherit index (boolean) properties */
+ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
+ zprop_register_index(ZFS_PROP_RELATIME, "relatime", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "RELATIME", boolean_table);
+ zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
+ boolean_table);
+#ifdef __FreeBSD__
+ zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table);
+#else
+ zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table);
+#endif
+ zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table);
+ zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_OVERLAY, "overlay", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "OVERLAY", boolean_table);
+
+ /* default index properties */
+ zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table);
+ zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
+ "CANMOUNT", canmount_table);
+
+ /* readonly index properties */
+ zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
+ zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_KEYSTATUS, "keystatus",
+ ZFS_KEYSTATUS_NONE, PROP_READONLY, ZFS_TYPE_DATASET,
+ "none | unavailable | available",
+ "KEYSTATUS", keystatus_table);
+
+ /* set once index properties */
+ zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
+ PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "none | formC | formD | formKC | formKD", "NORMALIZATION",
+ normalize_table);
+ zprop_register_index(ZFS_PROP_CASE, "casesensitivity",
+ ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_SNAPSHOT,
+ "sensitive | insensitive | mixed", "CASE", case_table);
+ zprop_register_index(ZFS_PROP_KEYFORMAT, "keyformat",
+ ZFS_KEYFORMAT_NONE, PROP_ONETIME_DEFAULT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "none | raw | hex | passphrase", "KEYFORMAT", keyformat_table);
+ zprop_register_index(ZFS_PROP_ENCRYPTION, "encryption",
+ ZIO_CRYPT_DEFAULT, PROP_ONETIME, ZFS_TYPE_DATASET,
+ "on | off | aes-128-ccm | aes-192-ccm | aes-256-ccm | "
+ "aes-128-gcm | aes-192-gcm | aes-256-gcm", "ENCRYPTION",
+ crypto_table);
+
+ /* set once index (boolean) properties */
+ zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "on | off", "UTF8ONLY", boolean_table);
+
+ /* string properties */
+ zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
+ zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY,
+ ZFS_TYPE_SNAPSHOT, "<dataset>[,...]", "CLONES");
+ zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
+ "MOUNTPOINT");
+ zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | NFS share options",
+ "SHARENFS");
+ zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
+ "filesystem | volume | snapshot | bookmark", "TYPE");
+ zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "on | off | SMB share options", "SHARESMB");
+ zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
+ ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
+ "<sensitivity label>", "MLSLABEL");
+ zprop_register_string(ZFS_PROP_SELINUX_CONTEXT, "context",
+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "<selinux context>",
+ "CONTEXT");
+ zprop_register_string(ZFS_PROP_SELINUX_FSCONTEXT, "fscontext",
+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "<selinux fscontext>",
+ "FSCONTEXT");
+ zprop_register_string(ZFS_PROP_SELINUX_DEFCONTEXT, "defcontext",
+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "<selinux defcontext>",
+ "DEFCONTEXT");
+ zprop_register_string(ZFS_PROP_SELINUX_ROOTCONTEXT, "rootcontext",
+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "<selinux rootcontext>",
+ "ROOTCONTEXT");
+ zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN,
+ "receive_resume_token",
+ NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<string token>", "RESUMETOK");
+ zprop_register_string(ZFS_PROP_ENCRYPTION_ROOT, "encryptionroot", NULL,
+ PROP_READONLY, ZFS_TYPE_DATASET, "<filesystem | volume>",
+ "ENCROOT");
+ zprop_register_string(ZFS_PROP_KEYLOCATION, "keylocation",
+ "none", PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "prompt | <file URI>", "KEYLOCATION");
+ zprop_register_string(ZFS_PROP_REDACT_SNAPS,
+ "redact_snaps", NULL, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<snapshot>[,...]",
+ "RSNAPS");
+
+ /* readonly number properties */
+ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET, "<size>", "USED");
+ zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
+ zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<size>",
+ "REFER");
+ zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
+ "<1.00x or higher if compressed>", "RATIO");
+ zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET,
+ "<1.00x or higher if compressed>", "REFRATIO");
+ zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
+ ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
+ ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK");
+ zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDSNAP");
+ zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDDS");
+ zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDCHILD");
+ zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
+ PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
+ zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
+ ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
+ zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET, "<size>", "WRITTEN");
+ zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "LUSED");
+ zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced",
+ 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<size>",
+ "LREFER");
+ zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count",
+ UINT64_MAX, PROP_READONLY, ZFS_TYPE_FILESYSTEM,
+ "<count>", "FSCOUNT");
+ zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count",
+ UINT64_MAX, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<count>", "SSCOUNT");
+ zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "GUID");
+ zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "CREATETXG");
+ zprop_register_number(ZFS_PROP_PBKDF2_ITERS, "pbkdf2iters",
+ 0, PROP_ONETIME_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<iters>", "PBKDF2ITERS");
+ zprop_register_number(ZFS_PROP_OBJSETID, "objsetid", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET, "<uint64>", "OBJSETID");
+
+ /* default number properties */
+ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
+ zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<size> | none", "RESERV");
+ zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
+ ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
+ zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
+ zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<size> | none", "REFRESERV");
+ zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
+ "<count> | none", "FSLIMIT");
+ zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<count> | none", "SSLIMIT");
+
+ /* inherit number properties */
+ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
+ SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
+ zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS,
+ "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "zero or 512 to 1M, power of 2", "SPECIAL_SMALL_BLOCKS");
+
+ /* hidden properties */
+ zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
+ zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME");
+ zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions",
+ PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
+ zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
+ PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
+ "STMF_SBD_LU");
+ zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
+ "USERACCOUNTING");
+ zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
+ zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT");
+ zprop_register_hidden(ZFS_PROP_IVSET_GUID, "ivsetguid",
+ PROP_TYPE_NUMBER, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "IVSETGUID");
+ zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP");
+ zprop_register_hidden(ZFS_PROP_PBKDF2_SALT, "pbkdf2salt",
+ PROP_TYPE_NUMBER, PROP_ONETIME_DEFAULT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PBKDF2SALT");
+ zprop_register_hidden(ZFS_PROP_KEY_GUID, "keyguid", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "KEYGUID");
+ zprop_register_hidden(ZFS_PROP_REDACTED, "redacted", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "REDACTED");
+
+ /*
+ * Properties that are obsolete and not used. These are retained so
+ * that we don't have to change the values of the zfs_prop_t enum, or
+ * have NULL pointers in the zfs_prop_table[].
+ */
+ zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG");
+
+ /* oddball properties */
+ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0,
+ NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
+ "<date>", "CREATION", B_FALSE, B_TRUE, NULL);
+}
+
+boolean_t
+zfs_prop_delegatable(zfs_prop_t prop)
+{
+ zprop_desc_t *pd = &zfs_prop_table[prop];
+
+ /* The mlslabel property is never delegatable. */
+ if (prop == ZFS_PROP_MLSLABEL)
+ return (B_FALSE);
+
+ return (pd->pd_attr != PROP_READONLY);
+}
+
+/*
+ * Given a zfs dataset property name, returns the corresponding property ID.
+ */
+zfs_prop_t
+zfs_name_to_prop(const char *propname)
+{
+ return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET));
+}
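Editor's note, not part of the patch: an illustrative round trip between property names and IDs, assuming zfs_prop_init() has populated the table and the usual ZPROP_INVAL sentinel for unknown names.

/*
 * Illustrative results:
 *   zfs_name_to_prop("compression")        == ZFS_PROP_COMPRESSION
 *   zfs_prop_to_name(ZFS_PROP_COMPRESSION) returns "compression"
 *   zfs_name_to_prop("no-such-property")   == ZPROP_INVAL
 */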
+
+/*
+ * For user property names, we allow all lowercase alphanumeric characters, plus
+ * a few useful punctuation characters.
+ */
+static int
+valid_char(char c)
+{
+ return ((c >= 'a' && c <= 'z') ||
+ (c >= '0' && c <= '9') ||
+ c == '-' || c == '_' || c == '.' || c == ':');
+}
+
+/*
+ * Returns true if this is a valid user-defined property (one with a ':').
+ */
+boolean_t
+zfs_prop_user(const char *name)
+{
+ int i;
+ char c;
+ boolean_t foundsep = B_FALSE;
+
+ for (i = 0; i < strlen(name); i++) {
+ c = name[i];
+ if (!valid_char(c))
+ return (B_FALSE);
+ if (c == ':')
+ foundsep = B_TRUE;
+ }
+
+ if (!foundsep)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
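Editor's note, not part of the patch: a few illustrative inputs and results, derived from the character set and ':' requirement above.

/*
 * Illustrative results for zfs_prop_user():
 *   "com.example:backup-policy"  -> B_TRUE   (lowercase, contains ':')
 *   "compression"                -> B_FALSE  (no ':' separator)
 *   "Com.Example:backup"         -> B_FALSE  (uppercase is rejected)
 */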
+
+/*
+ * Returns true if this is a valid userspace-type property (one with a '@').
+ * Note that after the '@', any character is valid (e.g. another '@' for a
+ * SID such as user@domain).
+ */
+boolean_t
+zfs_prop_userquota(const char *name)
+{
+ zfs_userquota_prop_t prop;
+
+ for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) {
+ if (strncmp(name, zfs_userquota_prop_prefixes[prop],
+ strlen(zfs_userquota_prop_prefixes[prop])) == 0) {
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Returns true if this is a valid written@ property.
+ * Note that after the '@', any character is valid (e.g. another '@', as in
+ * written@pool/fs@origin).
+ */
+boolean_t
+zfs_prop_written(const char *name)
+{
+ static const char *prop_prefix = "written@";
+ static const char *book_prefix = "written#";
+ return (strncmp(name, prop_prefix, strlen(prop_prefix)) == 0 ||
+ strncmp(name, book_prefix, strlen(book_prefix)) == 0);
+}
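Editor's note, not part of the patch: a few illustrative inputs for the two prefix-based checks above.

/*
 * Illustrative results:
 *   zfs_prop_userquota("userquota@alice")         -> B_TRUE
 *   zfs_prop_userquota("projectobjused@100")      -> B_TRUE
 *   zfs_prop_userquota("quota")                   -> B_FALSE
 *   zfs_prop_written("written@pool/fs@yesterday") -> B_TRUE
 *   zfs_prop_written("written")                   -> B_FALSE
 */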
+
+/*
+ * Tables of index types, plus functions to convert between the user view
+ * (strings) and internal representation (uint64_t).
+ */
+int
+zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
+{
+ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET));
+}
+
+int
+zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
+{
+ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
+}
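Editor's note, not part of the patch: a hedged sketch of the string/index conversion pair in use, assuming zfs_prop_init() has already run; the wrapper name and variables are illustrative only.

static void
zfs_prop_index_example(void)
{
	uint64_t ival;
	const char *sval;

	/* "lz4" maps to its index value for the compression property. */
	if (zfs_prop_string_to_index(ZFS_PROP_COMPRESSION, "lz4",
	    &ival) == 0) {
		/* ... and the index maps back to the same string. */
		(void) zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,
		    ival, &sval);
	}
}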
+
+uint64_t
+zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
+{
+ return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET));
+}
+
+/*
+ * Returns TRUE if the property applies to any of the given dataset types.
+ */
+boolean_t
+zfs_prop_valid_for_type(int prop, zfs_type_t types, boolean_t headcheck)
+{
+ return (zprop_valid_for_type(prop, types, headcheck));
+}
+
+zprop_type_t
+zfs_prop_get_type(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_proptype);
+}
+
+/*
+ * Returns TRUE if the property is readonly.
+ */
+boolean_t
+zfs_prop_readonly(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_attr == PROP_READONLY ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT);
+}
+
+/*
+ * Returns TRUE if the property is visible (not hidden).
+ */
+boolean_t
+zfs_prop_visible(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_visible &&
+ zfs_prop_table[prop].pd_zfs_mod_supported);
+}
+
+/*
+ * Returns TRUE if the property is only allowed to be set once.
+ */
+boolean_t
+zfs_prop_setonce(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_attr == PROP_ONETIME ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT);
+}
+
+const char *
+zfs_prop_default_string(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_strdefault);
+}
+
+uint64_t
+zfs_prop_default_numeric(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_numdefault);
+}
+
+/*
+ * Given a dataset property ID, returns the corresponding name.
+ * The property ID is assumed to be valid.
+ */
+const char *
+zfs_prop_to_name(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_name);
+}
+
+/*
+ * Returns TRUE if the property is inheritable.
+ */
+boolean_t
+zfs_prop_inheritable(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_attr == PROP_INHERIT ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME);
+}
+
+/*
+ * Returns TRUE if property is one of the encryption properties that requires
+ * a loaded encryption key to modify.
+ */
+boolean_t
+zfs_prop_encryption_key_param(zfs_prop_t prop)
+{
+ /*
+ * keylocation does not count as an encryption property. It can be
+ * changed at will without needing the master keys.
+ */
+ return (prop == ZFS_PROP_PBKDF2_SALT || prop == ZFS_PROP_PBKDF2_ITERS ||
+ prop == ZFS_PROP_KEYFORMAT);
+}
+
+/*
+ * Helper function used by both kernelspace and userspace to check the
+ * keylocation property. If encrypted is set, the keylocation must be valid
+ * for an encrypted dataset.
+ */
+boolean_t
+zfs_prop_valid_keylocation(const char *str, boolean_t encrypted)
+{
+ if (strcmp("none", str) == 0)
+ return (!encrypted);
+ else if (strcmp("prompt", str) == 0)
+ return (B_TRUE);
+ else if (strlen(str) > 8 && strncmp("file:///", str, 8) == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
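Editor's note, not part of the patch: a few illustrative (str, encrypted) inputs and results, derived from the branches above.

/*
 * Illustrative results for zfs_prop_valid_keylocation():
 *   ("prompt", B_TRUE)                  -> B_TRUE
 *   ("file:///etc/zfs/key", B_TRUE)     -> B_TRUE
 *   ("none", B_TRUE)                    -> B_FALSE (encrypted needs a key)
 *   ("none", B_FALSE)                   -> B_TRUE
 *   ("https://example.com/key", B_TRUE) -> B_FALSE (only file:/// URIs here)
 */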
+
+
+#ifndef _KERNEL
+#include <libzfs.h>
+
+/*
+ * Returns a string describing the set of acceptable values for the given
+ * zfs property, or NULL if it cannot be set.
+ */
+const char *
+zfs_prop_values(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_values);
+}
+
+/*
+ * Returns TRUE if this property is a string type. Note that index types
+ * (compression, checksum) are treated as strings in userland, even though they
+ * are stored numerically on disk.
+ */
+int
+zfs_prop_is_string(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING ||
+ zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX);
+}
+
+/*
+ * Returns the column header for the given property. Used only in
+ * 'zfs list -o', but centralized here with the other property information.
+ */
+const char *
+zfs_prop_column_name(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_colname);
+}
+
+/*
+ * Returns whether the given property should be displayed right-justified for
+ * 'zfs list'.
+ */
+boolean_t
+zfs_prop_align_right(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_rightalign);
+}
+
+#endif
+
+#if defined(_KERNEL)
+
+#include <sys/simd.h>
+
+#if defined(HAVE_KERNEL_FPU_INTERNAL)
+union fpregs_state **zfs_kfpu_fpregs;
+EXPORT_SYMBOL(zfs_kfpu_fpregs);
+#endif /* HAVE_KERNEL_FPU_INTERNAL */
+
+static int __init
+zcommon_init(void)
+{
+ int error = kfpu_init();
+ if (error)
+ return (error);
+
+ fletcher_4_init();
+
+ return (0);
+}
+
+static void __exit
+zcommon_fini(void)
+{
+ fletcher_4_fini();
+ kfpu_fini();
+}
+
+module_init_early(zcommon_init);
+module_exit(zcommon_fini);
+
+#endif
+
+ZFS_MODULE_DESCRIPTION("Generic ZFS support");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+
+/* zfs dataset property functions */
+EXPORT_SYMBOL(zfs_userquota_prop_prefixes);
+EXPORT_SYMBOL(zfs_prop_init);
+EXPORT_SYMBOL(zfs_prop_get_type);
+EXPORT_SYMBOL(zfs_prop_get_table);
+EXPORT_SYMBOL(zfs_prop_delegatable);
+EXPORT_SYMBOL(zfs_prop_visible);
+
+/* Dataset property functions shared between libzfs and kernel. */
+EXPORT_SYMBOL(zfs_prop_default_string);
+EXPORT_SYMBOL(zfs_prop_default_numeric);
+EXPORT_SYMBOL(zfs_prop_readonly);
+EXPORT_SYMBOL(zfs_prop_inheritable);
+EXPORT_SYMBOL(zfs_prop_encryption_key_param);
+EXPORT_SYMBOL(zfs_prop_valid_keylocation);
+EXPORT_SYMBOL(zfs_prop_setonce);
+EXPORT_SYMBOL(zfs_prop_to_name);
+EXPORT_SYMBOL(zfs_name_to_prop);
+EXPORT_SYMBOL(zfs_prop_user);
+EXPORT_SYMBOL(zfs_prop_userquota);
+EXPORT_SYMBOL(zfs_prop_index_to_string);
+EXPORT_SYMBOL(zfs_prop_string_to_index);
+EXPORT_SYMBOL(zfs_prop_valid_for_type);
+EXPORT_SYMBOL(zfs_prop_written);
diff --git a/sys/contrib/openzfs/module/zcommon/zpool_prop.c b/sys/contrib/openzfs/module/zcommon/zpool_prop.c
new file mode 100644
index 000000000000..6299d371f25d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zpool_prop.c
@@ -0,0 +1,279 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+#if !defined(_KERNEL)
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS];
+
+zprop_desc_t *
+zpool_prop_get_table(void)
+{
+ return (zpool_prop_table);
+}
+
+void
+zpool_prop_init(void)
+{
+ static zprop_index_t boolean_table[] = {
+ { "off", 0},
+ { "on", 1},
+ { NULL }
+ };
+
+ static zprop_index_t failuremode_table[] = {
+ { "wait", ZIO_FAILURE_MODE_WAIT },
+ { "continue", ZIO_FAILURE_MODE_CONTINUE },
+ { "panic", ZIO_FAILURE_MODE_PANIC },
+ { NULL }
+ };
+
+ /* string properties */
+ zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<path>", "ALTROOT");
+ zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<filesystem>", "BOOTFS");
+ zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
+ zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<comment-string>", "COMMENT");
+ zprop_register_string(ZPOOL_PROP_COMPATIBILITY, "compatibility",
+ "off", PROP_DEFAULT, ZFS_TYPE_POOL,
+ "<file[,file...]> | off | legacy", "COMPATIBILITY");
+
+ /* readonly number properties */
+ zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "SIZE");
+ zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "FREE");
+ zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "FREEING");
+ zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CKPOINT");
+ zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "LEAKED");
+ zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
+ zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ");
+ zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<percent>", "FRAG");
+ zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "CAP");
+ zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<guid>", "GUID");
+ zprop_register_number(ZPOOL_PROP_LOAD_GUID, "load_guid", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<load_guid>", "LOAD_GUID");
+ zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<state>", "HEALTH");
+ zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
+ "DEDUP");
+
+ /* default number properties */
+ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
+ zprop_register_number(ZPOOL_PROP_ASHIFT, "ashift", 0, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<ashift, 9-16, or 0=default>", "ASHIFT");
+
+ /* default index (boolean) properties */
+ zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION",
+ boolean_table);
+ zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
+ zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS",
+ boolean_table);
+ zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
+ zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table);
+ zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST",
+ boolean_table);
+
+ /* default index properties */
+ zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
+ ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
+ "wait | continue | panic", "FAILMODE", failuremode_table);
+ zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim",
+ SPA_AUTOTRIM_DEFAULT, PROP_DEFAULT, ZFS_TYPE_POOL,
+ "on | off", "AUTOTRIM", boolean_table);
+
+ /* hidden properties */
+ zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+ zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
+ zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING,
+ PROP_ONETIME, ZFS_TYPE_POOL, "TNAME");
+ zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE");
+ zprop_register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto",
+ PROP_TYPE_NUMBER, PROP_DEFAULT, ZFS_TYPE_POOL, "DEDUPDITTO");
+}
+
+/*
+ * Given a pool property name, returns the corresponding property ID.
+ */
+zpool_prop_t
+zpool_name_to_prop(const char *propname)
+{
+ return (zprop_name_to_prop(propname, ZFS_TYPE_POOL));
+}
+
+/*
+ * Given a pool property ID, returns the corresponding name.
+ * The property ID is assumed to be valid.
+ */
+const char *
+zpool_prop_to_name(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_name);
+}
+
+zprop_type_t
+zpool_prop_get_type(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_proptype);
+}
+
+boolean_t
+zpool_prop_readonly(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_attr == PROP_READONLY);
+}
+
+boolean_t
+zpool_prop_setonce(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_attr == PROP_ONETIME);
+}
+
+const char *
+zpool_prop_default_string(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_strdefault);
+}
+
+uint64_t
+zpool_prop_default_numeric(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_numdefault);
+}
+
+/*
+ * Returns true if this is a valid feature@ property.
+ */
+boolean_t
+zpool_prop_feature(const char *name)
+{
+ static const char *prefix = "feature@";
+ return (strncmp(name, prefix, strlen(prefix)) == 0);
+}
+
+/*
+ * Returns true if this is a valid unsupported@ property.
+ */
+boolean_t
+zpool_prop_unsupported(const char *name)
+{
+ static const char *prefix = "unsupported@";
+ return (strncmp(name, prefix, strlen(prefix)) == 0);
+}
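Editor's note, not part of the patch: a few illustrative inputs for the two prefix checks above; "async_destroy" is used only as an example feature name.

/*
 * Illustrative results:
 *   zpool_prop_feature("feature@async_destroy")       -> B_TRUE
 *   zpool_prop_unsupported("unsupported@com.foo:bar") -> B_TRUE
 *   zpool_prop_feature("ashift")                      -> B_FALSE
 */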
+
+int
+zpool_prop_string_to_index(zpool_prop_t prop, const char *string,
+ uint64_t *index)
+{
+ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL));
+}
+
+int
+zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index,
+ const char **string)
+{
+ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL));
+}
+
+uint64_t
+zpool_prop_random_value(zpool_prop_t prop, uint64_t seed)
+{
+ return (zprop_random_value(prop, seed, ZFS_TYPE_POOL));
+}
+
+#ifndef _KERNEL
+#include <libzfs.h>
+
+const char *
+zpool_prop_values(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_values);
+}
+
+const char *
+zpool_prop_column_name(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_colname);
+}
+
+boolean_t
+zpool_prop_align_right(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_rightalign);
+}
+#endif
+
+#if defined(_KERNEL)
+/* zpool property functions */
+EXPORT_SYMBOL(zpool_prop_init);
+EXPORT_SYMBOL(zpool_prop_get_type);
+EXPORT_SYMBOL(zpool_prop_get_table);
+
+/* Pool property functions shared between libzfs and kernel. */
+EXPORT_SYMBOL(zpool_name_to_prop);
+EXPORT_SYMBOL(zpool_prop_to_name);
+EXPORT_SYMBOL(zpool_prop_default_string);
+EXPORT_SYMBOL(zpool_prop_default_numeric);
+EXPORT_SYMBOL(zpool_prop_readonly);
+EXPORT_SYMBOL(zpool_prop_feature);
+EXPORT_SYMBOL(zpool_prop_unsupported);
+EXPORT_SYMBOL(zpool_prop_index_to_string);
+EXPORT_SYMBOL(zpool_prop_string_to_index);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zprop_common.c b/sys/contrib/openzfs/module/zcommon/zprop_common.c
new file mode 100644
index 000000000000..faab9d9a74fd
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zprop_common.c
@@ -0,0 +1,480 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+/*
+ * Common routines used by zfs and zpool property management.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_sysfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+
+#if !defined(_KERNEL)
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <sys/stat.h>
+#endif
+
+static zprop_desc_t *
+zprop_get_proptable(zfs_type_t type)
+{
+ if (type == ZFS_TYPE_POOL)
+ return (zpool_prop_get_table());
+ else
+ return (zfs_prop_get_table());
+}
+
+static int
+zprop_get_numprops(zfs_type_t type)
+{
+ if (type == ZFS_TYPE_POOL)
+ return (ZPOOL_NUM_PROPS);
+ else
+ return (ZFS_NUM_PROPS);
+}
+
+static boolean_t
+zfs_mod_supported_prop(const char *name, zfs_type_t type)
+{
+/*
+ * The zfs module spa_feature_table[], whether in-kernel or in libzpool,
+ * always supports all the properties. libzfs needs to query the running
+ * module, via sysfs, to determine which properties are supported.
+ *
+ * The equivalent _can_ be done on FreeBSD by way of the sysctl
+ * tree, but this has not been done yet.
+ */
+#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || defined(__FreeBSD__)
+ return (B_TRUE);
+#else
+ return (zfs_mod_supported(type == ZFS_TYPE_POOL ?
+ ZFS_SYSFS_POOL_PROPERTIES : ZFS_SYSFS_DATASET_PROPERTIES, name));
+#endif
+}
+
+void
+zprop_register_impl(int prop, const char *name, zprop_type_t type,
+ uint64_t numdefault, const char *strdefault, zprop_attr_t attr,
+ int objset_types, const char *values, const char *colname,
+ boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl)
+{
+ zprop_desc_t *prop_tbl = zprop_get_proptable(objset_types);
+ zprop_desc_t *pd;
+
+ pd = &prop_tbl[prop];
+
+ ASSERT(pd->pd_name == NULL || pd->pd_name == name);
+ ASSERT(name != NULL);
+ ASSERT(colname != NULL);
+
+ pd->pd_name = name;
+ pd->pd_propnum = prop;
+ pd->pd_proptype = type;
+ pd->pd_numdefault = numdefault;
+ pd->pd_strdefault = strdefault;
+ pd->pd_attr = attr;
+ pd->pd_types = objset_types;
+ pd->pd_values = values;
+ pd->pd_colname = colname;
+ pd->pd_rightalign = rightalign;
+ pd->pd_visible = visible;
+ pd->pd_zfs_mod_supported = zfs_mod_supported_prop(name, objset_types);
+ pd->pd_table = idx_tbl;
+ pd->pd_table_size = 0;
+ while (idx_tbl && (idx_tbl++)->pi_name != NULL)
+ pd->pd_table_size++;
+}
+
+void
+zprop_register_string(int prop, const char *name, const char *def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname)
+{
+ zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
+ objset_types, values, colname, B_FALSE, B_TRUE, NULL);
+
+}
+
+void
+zprop_register_number(int prop, const char *name, uint64_t def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname)
+{
+ zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
+ objset_types, values, colname, B_TRUE, B_TRUE, NULL);
+}
+
+void
+zprop_register_index(int prop, const char *name, uint64_t def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname, const zprop_index_t *idx_tbl)
+{
+ zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
+ objset_types, values, colname, B_FALSE, B_TRUE, idx_tbl);
+}
+
+void
+zprop_register_hidden(int prop, const char *name, zprop_type_t type,
+ zprop_attr_t attr, int objset_types, const char *colname)
+{
+ zprop_register_impl(prop, name, type, 0, NULL, attr,
+ objset_types, NULL, colname,
+ type == PROP_TYPE_NUMBER, B_FALSE, NULL);
+}
+
+
+/*
+ * A comparison function we can use to order indexes into property tables.
+ */
+static int
+zprop_compare(const void *arg1, const void *arg2)
+{
+ const zprop_desc_t *p1 = *((zprop_desc_t **)arg1);
+ const zprop_desc_t *p2 = *((zprop_desc_t **)arg2);
+ boolean_t p1ro, p2ro;
+
+ p1ro = (p1->pd_attr == PROP_READONLY);
+ p2ro = (p2->pd_attr == PROP_READONLY);
+
+ if (p1ro == p2ro)
+ return (strcmp(p1->pd_name, p2->pd_name));
+
+ return (p1ro ? -1 : 1);
+}
+
+/*
+ * Iterate over all properties in the given property table, calling back
+ * into the specified function for each property. We will continue to
+ * iterate until we either reach the end or the callback function returns
+ * something other than ZPROP_CONT.
+ */
+int
+zprop_iter_common(zprop_func func, void *cb, boolean_t show_all,
+ boolean_t ordered, zfs_type_t type)
+{
+ int i, num_props, size, prop;
+ zprop_desc_t *prop_tbl;
+ zprop_desc_t **order;
+
+ prop_tbl = zprop_get_proptable(type);
+ num_props = zprop_get_numprops(type);
+ size = num_props * sizeof (zprop_desc_t *);
+
+#if defined(_KERNEL)
+ order = kmem_alloc(size, KM_SLEEP);
+#else
+ if ((order = malloc(size)) == NULL)
+ return (ZPROP_CONT);
+#endif
+
+ for (int j = 0; j < num_props; j++)
+ order[j] = &prop_tbl[j];
+
+ if (ordered) {
+ qsort((void *)order, num_props, sizeof (zprop_desc_t *),
+ zprop_compare);
+ }
+
+ prop = ZPROP_CONT;
+ for (i = 0; i < num_props; i++) {
+ if ((order[i]->pd_visible || show_all) &&
+ order[i]->pd_zfs_mod_supported &&
+ (func(order[i]->pd_propnum, cb) != ZPROP_CONT)) {
+ prop = order[i]->pd_propnum;
+ break;
+ }
+ }
+
+#if defined(_KERNEL)
+ kmem_free(order, size);
+#else
+ free(order);
+#endif
+ return (prop);
+}
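+
+/*
+ * A hypothetical callback illustrating the protocol above: returning
+ * anything other than ZPROP_CONT stops the walk, and zprop_iter_common()
+ * then returns the current property number.
+ *
+ *	static int
+ *	first_readonly_cb(int prop, void *cb)
+ *	{
+ *		zprop_desc_t *tbl = cb;
+ *		return (tbl[prop].pd_attr == PROP_READONLY ?
+ *		    prop : ZPROP_CONT);
+ *	}
+ */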
+
+static boolean_t
+propname_match(const char *p, size_t len, zprop_desc_t *prop_entry)
+{
+ const char *propname = prop_entry->pd_name;
+#ifndef _KERNEL
+ const char *colname = prop_entry->pd_colname;
+ int c;
+#endif
+
+ if (len == strlen(propname) &&
+ strncmp(p, propname, len) == 0)
+ return (B_TRUE);
+
+#ifndef _KERNEL
+ if (colname == NULL || len != strlen(colname))
+ return (B_FALSE);
+
+ for (c = 0; c < len; c++)
+ if (p[c] != tolower(colname[c]))
+ break;
+
+ return (colname[c] == '\0');
+#else
+ return (B_FALSE);
+#endif
+}
+
+typedef struct name_to_prop_cb {
+ const char *propname;
+ zprop_desc_t *prop_tbl;
+} name_to_prop_cb_t;
+
+static int
+zprop_name_to_prop_cb(int prop, void *cb_data)
+{
+ name_to_prop_cb_t *data = cb_data;
+
+ if (propname_match(data->propname, strlen(data->propname),
+ &data->prop_tbl[prop]))
+ return (prop);
+
+ return (ZPROP_CONT);
+}
+
+int
+zprop_name_to_prop(const char *propname, zfs_type_t type)
+{
+ int prop;
+ name_to_prop_cb_t cb_data;
+
+ cb_data.propname = propname;
+ cb_data.prop_tbl = zprop_get_proptable(type);
+
+ prop = zprop_iter_common(zprop_name_to_prop_cb, &cb_data,
+ B_TRUE, B_FALSE, type);
+
+ return (prop == ZPROP_CONT ? ZPROP_INVAL : prop);
+}
+
+int
+zprop_string_to_index(int prop, const char *string, uint64_t *index,
+ zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+ int i;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (-1);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
+ return (-1);
+
+ for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
+ if (strcmp(string, idx_tbl[i].pi_name) == 0) {
+ *index = idx_tbl[i].pi_value;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+int
+zprop_index_to_string(int prop, uint64_t index, const char **string,
+ zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+ int i;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (-1);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
+ return (-1);
+
+ for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
+ if (idx_tbl[i].pi_value == index) {
+ *string = idx_tbl[i].pi_name;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+/*
+ * Return a random valid property value. Used by ztest.
+ */
+uint64_t
+zprop_random_value(int prop, uint64_t seed, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+
+ ASSERT((uint_t)prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ idx_tbl = prop_tbl[prop].pd_table;
+
+ if (idx_tbl == NULL)
+ return (seed);
+
+ return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value);
+}
+
+const char *
+zprop_values(int prop, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+
+ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
+ ASSERT(prop < zprop_get_numprops(type));
+
+ prop_tbl = zprop_get_proptable(type);
+
+ return (prop_tbl[prop].pd_values);
+}
+
+/*
+ * Returns TRUE if the property applies to any of the given dataset types.
+ *
+ * If headcheck is set, the check is made against the head dataset type
+ * of a snapshot, which requires returning B_TRUE when the property is
+ * valid only for snapshots.
+ */
+boolean_t
+zprop_valid_for_type(int prop, zfs_type_t type, boolean_t headcheck)
+{
+ zprop_desc_t *prop_tbl;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (B_FALSE);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if (headcheck && prop_tbl[prop].pd_types == ZFS_TYPE_SNAPSHOT)
+ return (B_TRUE);
+ return ((prop_tbl[prop].pd_types & type) != 0);
+}
+
+#ifndef _KERNEL
+
+/*
+ * Determines the minimum width for the column, and indicates whether it's fixed
+ * or not. Only string columns are non-fixed.
+ */
+size_t
+zprop_width(int prop, boolean_t *fixed, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl, *pd;
+ const zprop_index_t *idx;
+ size_t ret;
+ int i;
+
+ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
+ ASSERT(prop < zprop_get_numprops(type));
+
+ prop_tbl = zprop_get_proptable(type);
+ pd = &prop_tbl[prop];
+
+ *fixed = B_TRUE;
+
+ /*
+ * Start with the width of the column name.
+ */
+ ret = strlen(pd->pd_colname);
+
+ /*
+ * For fixed-width values, make sure the width is large enough to hold
+ * any possible value.
+ */
+ switch (pd->pd_proptype) {
+ case PROP_TYPE_NUMBER:
+ /*
+ * The maximum length of a human-readable number is 5 characters
+ * ("20.4M", for example).
+ */
+ if (ret < 5)
+ ret = 5;
+ /*
+ * 'creation' is handled specially because it's a number
+ * internally, but displayed as a date string.
+ */
+ if (prop == ZFS_PROP_CREATION)
+ *fixed = B_FALSE;
+ /*
+ * 'health' is handled specially because it's a number
+ * internally, but displayed as a fixed 8 character string.
+ */
+ if (prop == ZPOOL_PROP_HEALTH)
+ ret = 8;
+ break;
+ case PROP_TYPE_INDEX:
+ idx = prop_tbl[prop].pd_table;
+ for (i = 0; idx[i].pi_name != NULL; i++) {
+ if (strlen(idx[i].pi_name) > ret)
+ ret = strlen(idx[i].pi_name);
+ }
+ break;
+
+ case PROP_TYPE_STRING:
+ *fixed = B_FALSE;
+ break;
+ }
+
+ return (ret);
+}
+
+#endif
+
+#if defined(_KERNEL)
+/* Common routines to initialize property tables */
+EXPORT_SYMBOL(zprop_register_impl);
+EXPORT_SYMBOL(zprop_register_string);
+EXPORT_SYMBOL(zprop_register_number);
+EXPORT_SYMBOL(zprop_register_index);
+EXPORT_SYMBOL(zprop_register_hidden);
+
+/* Common routines for zfs and zpool property management */
+EXPORT_SYMBOL(zprop_iter_common);
+EXPORT_SYMBOL(zprop_name_to_prop);
+EXPORT_SYMBOL(zprop_string_to_index);
+EXPORT_SYMBOL(zprop_index_to_string);
+EXPORT_SYMBOL(zprop_random_value);
+EXPORT_SYMBOL(zprop_values);
+EXPORT_SYMBOL(zprop_valid_for_type);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/Makefile.in b/sys/contrib/openzfs/module/zfs/Makefile.in
new file mode 100644
index 000000000000..653ea0da9bcc
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/Makefile.in
@@ -0,0 +1,157 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+mfdir = $(obj)
+else
+mfdir = $(srctree)/$(src)
+endif
+
+MODULE := zfs
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+# Suppress unused-value warnings in sparc64 architecture headers
+ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
+
+$(MODULE)-objs += abd.o
+$(MODULE)-objs += aggsum.o
+$(MODULE)-objs += arc.o
+$(MODULE)-objs += blkptr.o
+$(MODULE)-objs += bplist.o
+$(MODULE)-objs += bpobj.o
+$(MODULE)-objs += bptree.o
+$(MODULE)-objs += btree.o
+$(MODULE)-objs += bqueue.o
+$(MODULE)-objs += dataset_kstats.o
+$(MODULE)-objs += dbuf.o
+$(MODULE)-objs += dbuf_stats.o
+$(MODULE)-objs += ddt.o
+$(MODULE)-objs += ddt_zap.o
+$(MODULE)-objs += dmu.o
+$(MODULE)-objs += dmu_diff.o
+$(MODULE)-objs += dmu_object.o
+$(MODULE)-objs += dmu_objset.o
+$(MODULE)-objs += dmu_recv.o
+$(MODULE)-objs += dmu_redact.o
+$(MODULE)-objs += dmu_send.o
+$(MODULE)-objs += dmu_traverse.o
+$(MODULE)-objs += dmu_tx.o
+$(MODULE)-objs += dmu_zfetch.o
+$(MODULE)-objs += dnode.o
+$(MODULE)-objs += dnode_sync.o
+$(MODULE)-objs += dsl_bookmark.o
+$(MODULE)-objs += dsl_crypt.o
+$(MODULE)-objs += dsl_dataset.o
+$(MODULE)-objs += dsl_deadlist.o
+$(MODULE)-objs += dsl_deleg.o
+$(MODULE)-objs += dsl_destroy.o
+$(MODULE)-objs += dsl_dir.o
+$(MODULE)-objs += dsl_pool.o
+$(MODULE)-objs += dsl_prop.o
+$(MODULE)-objs += dsl_scan.o
+$(MODULE)-objs += dsl_synctask.o
+$(MODULE)-objs += dsl_userhold.o
+$(MODULE)-objs += edonr_zfs.o
+$(MODULE)-objs += fm.o
+$(MODULE)-objs += gzip.o
+$(MODULE)-objs += hkdf.o
+$(MODULE)-objs += lz4.o
+$(MODULE)-objs += lzjb.o
+$(MODULE)-objs += metaslab.o
+$(MODULE)-objs += mmp.o
+$(MODULE)-objs += multilist.o
+$(MODULE)-objs += objlist.o
+$(MODULE)-objs += pathname.o
+$(MODULE)-objs += range_tree.o
+$(MODULE)-objs += refcount.o
+$(MODULE)-objs += rrwlock.o
+$(MODULE)-objs += sa.o
+$(MODULE)-objs += sha256.o
+$(MODULE)-objs += skein_zfs.o
+$(MODULE)-objs += spa.o
+$(MODULE)-objs += spa_boot.o
+$(MODULE)-objs += spa_checkpoint.o
+$(MODULE)-objs += spa_config.o
+$(MODULE)-objs += spa_errlog.o
+$(MODULE)-objs += spa_history.o
+$(MODULE)-objs += spa_log_spacemap.o
+$(MODULE)-objs += spa_misc.o
+$(MODULE)-objs += spa_stats.o
+$(MODULE)-objs += space_map.o
+$(MODULE)-objs += space_reftree.o
+$(MODULE)-objs += txg.o
+$(MODULE)-objs += uberblock.o
+$(MODULE)-objs += unique.o
+$(MODULE)-objs += vdev.o
+$(MODULE)-objs += vdev_cache.o
+$(MODULE)-objs += vdev_draid.o
+$(MODULE)-objs += vdev_draid_rand.o
+$(MODULE)-objs += vdev_indirect.o
+$(MODULE)-objs += vdev_indirect_births.o
+$(MODULE)-objs += vdev_indirect_mapping.o
+$(MODULE)-objs += vdev_initialize.o
+$(MODULE)-objs += vdev_label.o
+$(MODULE)-objs += vdev_mirror.o
+$(MODULE)-objs += vdev_missing.o
+$(MODULE)-objs += vdev_queue.o
+$(MODULE)-objs += vdev_raidz.o
+$(MODULE)-objs += vdev_raidz_math.o
+$(MODULE)-objs += vdev_raidz_math_scalar.o
+$(MODULE)-objs += vdev_rebuild.o
+$(MODULE)-objs += vdev_removal.o
+$(MODULE)-objs += vdev_root.o
+$(MODULE)-objs += vdev_trim.o
+$(MODULE)-objs += zap.o
+$(MODULE)-objs += zap_leaf.o
+$(MODULE)-objs += zap_micro.o
+$(MODULE)-objs += zcp.o
+$(MODULE)-objs += zcp_get.o
+$(MODULE)-objs += zcp_global.o
+$(MODULE)-objs += zcp_iter.o
+$(MODULE)-objs += zcp_set.o
+$(MODULE)-objs += zcp_synctask.o
+$(MODULE)-objs += zfeature.o
+$(MODULE)-objs += zfs_byteswap.o
+$(MODULE)-objs += zfs_fm.o
+$(MODULE)-objs += zfs_fuid.o
+$(MODULE)-objs += zfs_ioctl.o
+$(MODULE)-objs += zfs_log.o
+$(MODULE)-objs += zfs_onexit.o
+$(MODULE)-objs += zfs_quota.o
+$(MODULE)-objs += zfs_ratelimit.o
+$(MODULE)-objs += zfs_replay.o
+$(MODULE)-objs += zfs_rlock.o
+$(MODULE)-objs += zfs_sa.o
+$(MODULE)-objs += zfs_vnops.o
+$(MODULE)-objs += zil.o
+$(MODULE)-objs += zio.o
+$(MODULE)-objs += zio_checksum.o
+$(MODULE)-objs += zio_compress.o
+$(MODULE)-objs += zio_inject.o
+$(MODULE)-objs += zle.o
+$(MODULE)-objs += zrlock.o
+$(MODULE)-objs += zthr.o
+$(MODULE)-objs += zvol.o
+
+# Suppress incorrect warnings from versions of objtool which are not
+# aware of x86 EVEX prefix instructions used for AVX512.
+OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y
+OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y
+
+$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse2.o
+$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_ssse3.o
+$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o
+$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512f.o
+$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o
+
+$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o
+$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o
+
+$(MODULE)-$(CONFIG_PPC) += vdev_raidz_math_powerpc_altivec.o
+$(MODULE)-$(CONFIG_PPC64) += vdev_raidz_math_powerpc_altivec.o
+
+ifeq ($(CONFIG_ALTIVEC),y)
+$(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec
+endif
+
+include $(mfdir)/../os/linux/zfs/Makefile
diff --git a/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash b/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash
new file mode 100644
index 000000000000..e558b2a50358
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash
@@ -0,0 +1,19 @@
+Copyright (c) 2011 Google, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash.descrip b/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash.descrip
new file mode 100644
index 000000000000..f98cb76dfc91
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash.descrip
@@ -0,0 +1 @@
+CITYHASH CHECKSUM FUNCTIONALITY IN ZFS
diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c
new file mode 100644
index 000000000000..7d3a2f6d69e2
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/abd.c
@@ -0,0 +1,1212 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+/*
+ * ARC buffer data (ABD).
+ *
+ * ABDs are an abstract data structure for the ARC which can use two
+ * different ways of storing the underlying data:
+ *
+ * (a) Linear buffer. In this case, all the data in the ABD is stored in one
+ * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
+ *
+ * +-------------------+
+ * | ABD (linear) |
+ * | abd_flags = ... |
+ * | abd_size = ... | +--------------------------------+
+ * | abd_buf ------------->| raw buffer of size abd_size |
+ * +-------------------+ +--------------------------------+
+ * no abd_chunks
+ *
+ * (b) Scattered buffer. In this case, the data in the ABD is split into
+ * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
+ * to the chunks recorded in an array at the end of the ABD structure.
+ *
+ * +-------------------+
+ * | ABD (scattered) |
+ * | abd_flags = ... |
+ * | abd_size = ... |
+ * | abd_offset = 0 | +-----------+
+ * | abd_chunks[0] ----------------------------->| chunk 0 |
+ * | abd_chunks[1] ---------------------+ +-----------+
+ * | ... | | +-----------+
+ * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
+ * +-------------------+ | +-----------+
+ * | ...
+ * | +-----------+
+ * +----------------->| chunk N-1 |
+ * +-----------+
+ *
+ * In addition to directly allocating a linear or scattered ABD, it is also
+ * possible to create an ABD by requesting the "sub-ABD" starting at an offset
+ * within an existing ABD. In linear buffers this is simple (set abd_buf of
+ * the new ABD to the starting point within the original raw buffer), but
+ * scattered ABDs are a little more complex. The new ABD makes a copy of the
+ * relevant abd_chunks pointers (but not the underlying data). However, to
+ * provide arbitrary rather than only chunk-aligned starting offsets, it also
+ * tracks an abd_offset field which represents the starting point of the data
+ * within the first chunk in abd_chunks. For both linear and scattered ABDs,
+ * creating an offset ABD marks the original ABD as the offset's parent, and the
+ * original ABD's abd_children refcount is incremented. This data allows us to
+ * ensure the root ABD isn't deleted before its children.
+ *
+ * Most consumers should never need to know what type of ABD they're using --
+ * the ABD public API ensures that it's possible to transparently switch from
+ * using a linear ABD to a scattered one when doing so would be beneficial.
+ *
+ * If you need to use the data within an ABD directly, if you know it's linear
+ * (because you allocated it) you can use abd_to_buf() to access the underlying
+ * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
+ * which will allocate a raw buffer if necessary. Use the abd_return_buf*
+ * functions to return any raw buffers that are no longer necessary when you're
+ * done using them.
+ *
+ * There are a variety of ABD APIs that implement basic buffer operations:
+ * compare, copy, read, write, and fill with zeroes. If you need a custom
+ * function which progressively accesses the whole ABD, use the abd_iterate_*
+ * functions.
+ *
+ * As an additional feature, linear and scatter ABDs can be stitched together
+ * by using the gang ABD type (abd_alloc_gang()). This allows for
+ * multiple ABDs to be viewed as a singular ABD.
+ *
+ * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
+ * B_FALSE.
+ */
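+
+/*
+ * A minimal usage sketch of the interfaces described above. A consumer that
+ * needs a temporary contiguous view of an ABD of unknown layout might do:
+ *
+ *	abd_t *abd = abd_alloc(SPA_MINBLOCKSIZE, B_FALSE);
+ *	void *buf = abd_borrow_buf_copy(abd, abd->abd_size);
+ *	... operate on the contiguous buffer ...
+ *	abd_return_buf_copy(abd, buf, abd->abd_size);
+ *	abd_free(abd);
+ */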
+
+#include <sys/abd_impl.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+/* see block comment above for description */
+int zfs_abd_scatter_enabled = B_TRUE;
+
+void
+abd_verify(abd_t *abd)
+{
+ ASSERT3U(abd->abd_size, >, 0);
+ ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
+ ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
+ ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
+ ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD));
+#ifdef ZFS_DEBUG
+ IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
+#endif
+ IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL);
+ } else if (abd_is_gang(abd)) {
+ uint_t child_sizes = 0;
+ for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
+ cabd != NULL;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ ASSERT(list_link_active(&cabd->abd_gang_link));
+ child_sizes += cabd->abd_size;
+ abd_verify(cabd);
+ }
+ ASSERT3U(abd->abd_size, ==, child_sizes);
+ } else {
+ abd_verify_scatter(abd);
+ }
+}
+
+static void
+abd_init_struct(abd_t *abd)
+{
+ list_link_init(&abd->abd_gang_link);
+ mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
+ abd->abd_flags = 0;
+#ifdef ZFS_DEBUG
+ zfs_refcount_create(&abd->abd_children);
+ abd->abd_parent = NULL;
+#endif
+ abd->abd_size = 0;
+}
+
+static void
+abd_fini_struct(abd_t *abd)
+{
+ mutex_destroy(&abd->abd_mtx);
+ ASSERT(!list_link_active(&abd->abd_gang_link));
+#ifdef ZFS_DEBUG
+ zfs_refcount_destroy(&abd->abd_children);
+#endif
+}
+
+abd_t *
+abd_alloc_struct(size_t size)
+{
+ abd_t *abd = abd_alloc_struct_impl(size);
+ abd_init_struct(abd);
+ abd->abd_flags |= ABD_FLAG_ALLOCD;
+ return (abd);
+}
+
+void
+abd_free_struct(abd_t *abd)
+{
+ abd_fini_struct(abd);
+ abd_free_struct_impl(abd);
+}
+
+/*
+ * Allocate an ABD, along with its own underlying data buffers. Use this if you
+ * don't care whether the ABD is linear or not.
+ */
+abd_t *
+abd_alloc(size_t size, boolean_t is_metadata)
+{
+ if (!zfs_abd_scatter_enabled || abd_size_alloc_linear(size))
+ return (abd_alloc_linear(size, is_metadata));
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ abd_t *abd = abd_alloc_struct(size);
+ abd->abd_flags |= ABD_FLAG_OWNER;
+ abd->abd_u.abd_scatter.abd_offset = 0;
+ abd_alloc_chunks(abd, size);
+
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+
+ abd_update_scatter_stats(abd, ABDSTAT_INCR);
+
+ return (abd);
+}
+
+/*
+ * Allocate an ABD that must be linear, along with its own underlying data
+ * buffer. Only use this when it would be very annoying to write your ABD
+ * consumer with a scattered ABD.
+ */
+abd_t *
+abd_alloc_linear(size_t size, boolean_t is_metadata)
+{
+ abd_t *abd = abd_alloc_struct(0);
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+
+ if (is_metadata) {
+ ABD_LINEAR_BUF(abd) = zio_buf_alloc(size);
+ } else {
+ ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size);
+ }
+
+ abd_update_linear_stats(abd, ABDSTAT_INCR);
+
+ return (abd);
+}
+
+static void
+abd_free_linear(abd_t *abd)
+{
+ if (abd_is_linear_page(abd)) {
+ abd_free_linear_page(abd);
+ return;
+ }
+ if (abd->abd_flags & ABD_FLAG_META) {
+ zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
+ } else {
+ zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
+ }
+
+ abd_update_linear_stats(abd, ABDSTAT_DECR);
+}
+
+static void
+abd_free_gang(abd_t *abd)
+{
+ ASSERT(abd_is_gang(abd));
+ abd_t *cabd;
+
+ while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) {
+ /*
+ * We must acquire the child ABD's mutex to ensure that, if it
+ * is being added to another gang ABD, the link is marked
+ * inactive when it is removed from this gang ABD and before
+ * it is added to the other gang ABD.
+ */
+ mutex_enter(&cabd->abd_mtx);
+ ASSERT(list_link_active(&cabd->abd_gang_link));
+ list_remove(&ABD_GANG(abd).abd_gang_chain, cabd);
+ mutex_exit(&cabd->abd_mtx);
+ if (cabd->abd_flags & ABD_FLAG_GANG_FREE)
+ abd_free(cabd);
+ }
+ list_destroy(&ABD_GANG(abd).abd_gang_chain);
+}
+
+static void
+abd_free_scatter(abd_t *abd)
+{
+ abd_free_chunks(abd);
+ abd_update_scatter_stats(abd, ABDSTAT_DECR);
+}
+
+/*
+ * Free an ABD. Use with any kind of abd: those created with abd_alloc_*()
+ * and abd_get_*(), including abd_get_offset_struct().
+ *
+ * If the ABD was created with abd_alloc_*(), the underlying data
+ * (scatterlist or linear buffer) will also be freed. (Subject to ownership
+ * changes via abd_*_ownership_of_buf().)
+ *
+ * Unless the ABD was created with abd_get_offset_struct(), the abd_t will
+ * also be freed.
+ */
+void
+abd_free(abd_t *abd)
+{
+ if (abd == NULL)
+ return;
+
+ abd_verify(abd);
+#ifdef ZFS_DEBUG
+ IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL);
+#endif
+
+ if (abd_is_gang(abd)) {
+ abd_free_gang(abd);
+ } else if (abd_is_linear(abd)) {
+ if (abd->abd_flags & ABD_FLAG_OWNER)
+ abd_free_linear(abd);
+ } else {
+ if (abd->abd_flags & ABD_FLAG_OWNER)
+ abd_free_scatter(abd);
+ }
+
+#ifdef ZFS_DEBUG
+ if (abd->abd_parent != NULL) {
+ (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
+ abd->abd_size, abd);
+ }
+#endif
+
+ abd_fini_struct(abd);
+ if (abd->abd_flags & ABD_FLAG_ALLOCD)
+ abd_free_struct_impl(abd);
+}
+
+/*
+ * Allocate an ABD of the same format (same metadata flag, same scatterize
+ * setting) as another ABD.
+ */
+abd_t *
+abd_alloc_sametype(abd_t *sabd, size_t size)
+{
+ boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
+ if (abd_is_linear(sabd) &&
+ !abd_is_linear_page(sabd)) {
+ return (abd_alloc_linear(size, is_metadata));
+ } else {
+ return (abd_alloc(size, is_metadata));
+ }
+}
+
+/*
+ * Create gang ABD that will be the head of a list of ABD's. This is used
+ * to "chain" scatter/gather lists together when constructing aggregated
+ * IO's. To free this abd, abd_free() must be called.
+ */
+abd_t *
+abd_alloc_gang(void)
+{
+ abd_t *abd = abd_alloc_struct(0);
+ abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER;
+ list_create(&ABD_GANG(abd).abd_gang_chain,
+ sizeof (abd_t), offsetof(abd_t, abd_gang_link));
+ return (abd);
+}
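+
+/*
+ * For illustration, aggregating two existing ABDs into one logical view
+ * (with the caller retaining ownership of both children) might look like:
+ *
+ *	abd_t *gabd = abd_alloc_gang();
+ *	abd_gang_add(gabd, abd1, B_FALSE);
+ *	abd_gang_add(gabd, abd2, B_FALSE);
+ *	... gabd now behaves as a single ABD of abd1's plus abd2's size ...
+ *	abd_free(gabd);
+ */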
+
+/*
+ * Add a child gang ABD to a parent gang ABD's chained list.
+ */
+static void
+abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
+{
+ ASSERT(abd_is_gang(pabd));
+ ASSERT(abd_is_gang(cabd));
+
+ if (free_on_free) {
+ /*
+ * If the parent is responsible for freeing the child gang
+ * ABD, we simply splice the child's chained list onto the
+ * parent's list and immediately free the child gang ABD
+ * struct. The children taken from the child gang ABD keep
+ * their free_on_free settings after being added to the
+ * parent's list.
+ */
+ pabd->abd_size += cabd->abd_size;
+ list_move_tail(&ABD_GANG(pabd).abd_gang_chain,
+ &ABD_GANG(cabd).abd_gang_chain);
+ ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
+ abd_verify(pabd);
+ abd_free(cabd);
+ } else {
+ for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain);
+ child != NULL;
+ child = list_next(&ABD_GANG(cabd).abd_gang_chain, child)) {
+ /*
+ * We always pass B_FALSE for free_on_free, as it is the
+ * original child gang ABD's responsibility to determine
+ * whether any of its child ABDs should be freed on the
+ * call to abd_free().
+ */
+ abd_gang_add(pabd, child, B_FALSE);
+ }
+ abd_verify(pabd);
+ }
+}
+
+/*
+ * Add a child ABD to a gang ABD's chained list.
+ */
+void
+abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
+{
+ ASSERT(abd_is_gang(pabd));
+ abd_t *child_abd = NULL;
+
+ /*
+ * If the child being added is a gang ABD, we will add the
+ * child's ABDs to the parent gang ABD. This allows us to account
+ * for the offset correctly in the parent gang ABD.
+ */
+ if (abd_is_gang(cabd)) {
+ ASSERT(!list_link_active(&cabd->abd_gang_link));
+ ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
+ return (abd_gang_add_gang(pabd, cabd, free_on_free));
+ }
+ ASSERT(!abd_is_gang(cabd));
+
+ /*
+ * In order to verify that an ABD is not already part of
+ * another gang ABD, we must lock the child ABD's abd_mtx
+ * to check its abd_gang_link status. We unlock the abd_mtx
+ * only after it has been added to a gang ABD, which
+ * will update the abd_gang_link's status. See comment below
+ * for how an ABD can be in multiple gang ABD's simultaneously.
+ */
+ mutex_enter(&cabd->abd_mtx);
+ if (list_link_active(&cabd->abd_gang_link)) {
+ /*
+ * If the child ABD is already part of another
+ * gang ABD then we must allocate a new
+ * ABD to use a separate link. We mark the newly
+ * allocated ABD with ABD_FLAG_GANG_FREE, before
+ * adding it to the gang ABD's list, to make the
+ * gang ABD aware that it is responsible to call
+ * abd_free(). We use abd_get_offset() in order
+ * to just allocate a new ABD but avoid copying the
+ * data over into the newly allocated ABD.
+ *
+ * An ABD may become part of multiple gang ABD's. For
+ * example, when writing ditto blocks, the same ABD
+ * is used to write 2 or 3 locations with 2 or 3
+ * zio_t's. Each of the zio's may be aggregated with
+ * different adjacent zio's. zio aggregation uses gang
+ * zio's, so the single ABD can become part of multiple
+ * gang zio's.
+ *
+ * The ASSERT below is to make sure that if
+ * free_on_free is passed as B_TRUE, the ABD can
+ * not be in multiple gang ABD's. The gang ABD
+ * can not be responsible for cleaning up the child
+ * ABD memory allocation if the ABD can be in
+ * multiple gang ABD's at one time.
+ */
+ ASSERT3B(free_on_free, ==, B_FALSE);
+ child_abd = abd_get_offset(cabd, 0);
+ child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
+ } else {
+ child_abd = cabd;
+ if (free_on_free)
+ child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
+ }
+ ASSERT3P(child_abd, !=, NULL);
+
+ list_insert_tail(&ABD_GANG(pabd).abd_gang_chain, child_abd);
+ mutex_exit(&cabd->abd_mtx);
+ pabd->abd_size += child_abd->abd_size;
+}
+
+/*
+ * Locate the ABD for the supplied offset in the gang ABD.
+ * Return a new offset relative to the returned ABD.
+ */
+abd_t *
+abd_gang_get_offset(abd_t *abd, size_t *off)
+{
+ abd_t *cabd;
+
+ ASSERT(abd_is_gang(abd));
+ ASSERT3U(*off, <, abd->abd_size);
+ for (cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ if (*off >= cabd->abd_size)
+ *off -= cabd->abd_size;
+ else
+ return (cabd);
+ }
+ VERIFY3P(cabd, !=, NULL);
+ return (cabd);
+}
+
+/*
+ * Allocate a new ABD, using the provided struct (if non-NULL, and if
+ * circumstances allow - otherwise allocate the struct). The returned ABD will
+ * point to offset off of sabd. It shares the underlying buffer data with sabd.
+ * Use abd_free() to free. sabd must not be freed while any derived ABDs exist.
+ */
+static abd_t *
+abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
+{
+ abd_verify(sabd);
+ ASSERT3U(off + size, <=, sabd->abd_size);
+
+ if (abd_is_linear(sabd)) {
+ if (abd == NULL)
+ abd = abd_alloc_struct(0);
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags |= ABD_FLAG_LINEAR;
+
+ ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
+ } else if (abd_is_gang(sabd)) {
+ size_t left = size;
+ if (abd == NULL) {
+ abd = abd_alloc_gang();
+ } else {
+ abd->abd_flags |= ABD_FLAG_GANG;
+ list_create(&ABD_GANG(abd).abd_gang_chain,
+ sizeof (abd_t), offsetof(abd_t, abd_gang_link));
+ }
+
+ abd->abd_flags &= ~ABD_FLAG_OWNER;
+ for (abd_t *cabd = abd_gang_get_offset(sabd, &off);
+ cabd != NULL && left > 0;
+ cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) {
+ int csize = MIN(left, cabd->abd_size - off);
+
+ abd_t *nabd = abd_get_offset_size(cabd, off, csize);
+ abd_gang_add(abd, nabd, B_TRUE);
+ left -= csize;
+ off = 0;
+ }
+ ASSERT3U(left, ==, 0);
+ } else {
+ abd = abd_get_offset_scatter(abd, sabd, off);
+ }
+
+ ASSERT3P(abd, !=, NULL);
+ abd->abd_size = size;
+#ifdef ZFS_DEBUG
+ abd->abd_parent = sabd;
+ (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
+#endif
+ return (abd);
+}
+
+/*
+ * Like abd_get_offset_size(), but memory for the abd_t is provided by the
+ * caller. Using this routine can improve performance by avoiding the cost
+ * of allocating memory for the abd_t struct, and updating the abd stats.
+ * Usually, the provided abd is returned, but in some circumstances (FreeBSD,
+ * if sabd is scatter and size is more than 2 pages) a new abd_t may need to
+ * be allocated. Therefore callers should be careful to use the returned
+ * abd_t*.
+ */
+abd_t *
+abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size)
+{
+ abd_init_struct(abd);
+ return (abd_get_offset_impl(abd, sabd, off, size));
+}
+
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+ size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
+ VERIFY3U(size, >, 0);
+ return (abd_get_offset_impl(NULL, sabd, off, size));
+}
+
+abd_t *
+abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
+{
+ ASSERT3U(off + size, <=, sabd->abd_size);
+ return (abd_get_offset_impl(NULL, sabd, off, size));
+}
+
+/*
+ * Return a scatter ABD of the given size containing only zeros.
+ */
+abd_t *
+abd_get_zeros(size_t size)
+{
+ ASSERT3P(abd_zero_scatter, !=, NULL);
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ return (abd_get_offset_size(abd_zero_scatter, 0, size));
+}
+
+/*
+ * Allocate a linear ABD structure for buf.
+ */
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+ abd_t *abd = abd_alloc_struct(0);
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that if we
+ * own the underlying data buffer, which is not true in this case.
+ * Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags |= ABD_FLAG_LINEAR;
+ abd->abd_size = size;
+
+ ABD_LINEAR_BUF(abd) = buf;
+
+ return (abd);
+}
+
+/*
+ * Get the raw buffer associated with a linear ABD.
+ */
+void *
+abd_to_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ abd_verify(abd);
+ return (ABD_LINEAR_BUF(abd));
+}
+
+/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
+#endif
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+ if (!abd_is_linear(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD and will ASSERT that you didn't modify
+ * the buffer since it was borrowed. If you want any changes you made to buf to
+ * be copied back to abd, use abd_return_buf_copy() instead.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(buf, ==, abd_to_buf(abd));
+ } else {
+ ASSERT0(abd_cmp_buf(abd, buf, n));
+ zio_buf_free(buf, n);
+ }
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
+#endif
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+ if (!abd_is_linear(abd)) {
+ abd_copy_from_buf(abd, buf, n);
+ }
+ abd_return_buf(abd, buf, n);
+}
+
+void
+abd_release_ownership_of_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+
+ /*
+ * abd_free() needs to handle LINEAR_PAGE ABD's specially.
+ * Since that flag does not survive the
+ * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
+ * abd_take_ownership_of_buf() sequence, we don't allow releasing
+ * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
+ */
+ ASSERT(!abd_is_linear_page(abd));
+
+ abd_verify(abd);
+
+ abd->abd_flags &= ~ABD_FLAG_OWNER;
+ /* Disable this flag since we no longer own the data buffer */
+ abd->abd_flags &= ~ABD_FLAG_META;
+
+ abd_update_linear_stats(abd, ABDSTAT_DECR);
+}
+
+
+/*
+ * Give this ABD ownership of the buffer that it's storing. Can only be used on
+ * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
+ * with abd_alloc_linear() which subsequently released ownership of their buf
+ * with abd_release_ownership_of_buf().
+ */
+void
+abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+ abd_verify(abd);
+
+ abd->abd_flags |= ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+
+ abd_update_linear_stats(abd, ABDSTAT_INCR);
+}
+
+/*
+ * Initializes an abd_iter based on whether the abd is a gang ABD
+ * or just a single ABD.
+ */
+static inline abd_t *
+abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off)
+{
+ abd_t *cabd = NULL;
+
+ if (abd_is_gang(abd)) {
+ cabd = abd_gang_get_offset(abd, &off);
+ if (cabd) {
+ abd_iter_init(aiter, cabd);
+ abd_iter_advance(aiter, off);
+ }
+ } else {
+ abd_iter_init(aiter, abd);
+ abd_iter_advance(aiter, off);
+ }
+ return (cabd);
+}
+
+/*
+ * Advances an abd_iter. We have to be careful with gang ABD as
+ * advancing could mean that we are at the end of a particular ABD and
+ * must grab the next ABD in the gang ABD's list.
+ */
+static inline abd_t *
+abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter,
+ size_t len)
+{
+ abd_iter_advance(aiter, len);
+ if (abd_is_gang(abd) && abd_iter_at_end(aiter)) {
+ ASSERT3P(cabd, !=, NULL);
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd);
+ if (cabd) {
+ abd_iter_init(aiter, cabd);
+ abd_iter_advance(aiter, 0);
+ }
+ }
+ return (cabd);
+}
+
+int
+abd_iterate_func(abd_t *abd, size_t off, size_t size,
+ abd_iter_func_t *func, void *private)
+{
+ struct abd_iter aiter;
+ int ret = 0;
+
+ if (size == 0)
+ return (0);
+
+ abd_verify(abd);
+ ASSERT3U(off + size, <=, abd->abd_size);
+
+ boolean_t gang = abd_is_gang(abd);
+ abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
+
+ while (size > 0) {
+ /* If we are at the end of the gang ABD we are done */
+ if (gang && !c_abd)
+ break;
+
+ abd_iter_map(&aiter);
+
+ size_t len = MIN(aiter.iter_mapsize, size);
+ ASSERT3U(len, >, 0);
+
+ ret = func(aiter.iter_mapaddr, len, private);
+
+ abd_iter_unmap(&aiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
+ }
+
+ return (ret);
+}
+
+struct buf_arg {
+ void *arg_buf;
+};
+
+static int
+abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(ba_ptr->arg_buf, buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy abd to buf. (off is the offset in abd.)
+ */
+void
+abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
+ &ba_ptr);
+}
+
+static int
+abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
+{
+ int ret;
+ struct buf_arg *ba_ptr = private;
+
+ ret = memcmp(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (ret);
+}
+
+/*
+ * Compare the contents of abd to buf. (off is the offset in abd.)
+ */
+int
+abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
+}
+
+static int
+abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy from buf to abd. (off is the offset in abd.)
+ */
+void
+abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
+ &ba_ptr);
+}
+
+/*ARGSUSED*/
+static int
+abd_zero_off_cb(void *buf, size_t size, void *private)
+{
+ (void) memset(buf, 0, size);
+ return (0);
+}
+
+/*
+ * Zero out the abd from a particular offset to the end.
+ */
+void
+abd_zero_off(abd_t *abd, size_t off, size_t size)
+{
+ (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
+}
+
+/*
+ * Iterate over two ABDs and call func incrementally on the two ABDs' data in
+ * equal-sized chunks (passed to func as raw buffers). func could be called many
+ * times during this iteration.
+ */
+int
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
+ size_t size, abd_iter_func2_t *func, void *private)
+{
+ int ret = 0;
+ struct abd_iter daiter, saiter;
+ boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
+ abd_t *c_dabd, *c_sabd;
+
+ if (size == 0)
+ return (0);
+
+ abd_verify(dabd);
+ abd_verify(sabd);
+
+ ASSERT3U(doff + size, <=, dabd->abd_size);
+ ASSERT3U(soff + size, <=, sabd->abd_size);
+
+ dabd_is_gang_abd = abd_is_gang(dabd);
+ sabd_is_gang_abd = abd_is_gang(sabd);
+ c_dabd = abd_init_abd_iter(dabd, &daiter, doff);
+ c_sabd = abd_init_abd_iter(sabd, &saiter, soff);
+
+ while (size > 0) {
+ /* if we are at the end of the gang ABD we are done */
+ if ((dabd_is_gang_abd && !c_dabd) ||
+ (sabd_is_gang_abd && !c_sabd))
+ break;
+
+ abd_iter_map(&daiter);
+ abd_iter_map(&saiter);
+
+ size_t dlen = MIN(daiter.iter_mapsize, size);
+ size_t slen = MIN(saiter.iter_mapsize, size);
+ size_t len = MIN(dlen, slen);
+ ASSERT(dlen > 0 || slen > 0);
+
+ ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
+ private);
+
+ abd_iter_unmap(&saiter);
+ abd_iter_unmap(&daiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ c_dabd =
+ abd_advance_abd_iter(dabd, c_dabd, &daiter, len);
+ c_sabd =
+ abd_advance_abd_iter(sabd, c_sabd, &saiter, len);
+ }
+
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ (void) memcpy(dbuf, sbuf, size);
+ return (0);
+}
+
+/*
+ * Copy from sabd to dabd starting from soff and doff.
+ */
+void
+abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
+{
+ (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
+ abd_copy_off_cb, NULL);
+}
+
+/*ARGSUSED*/
+static int
+abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
+{
+ return (memcmp(bufa, bufb, size));
+}
+
+/*
+ * Compares the contents of two ABDs.
+ */
+int
+abd_cmp(abd_t *dabd, abd_t *sabd)
+{
+ ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
+ return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
+ abd_cmp_cb, NULL));
+}
+
+/*
+ * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @dabd data ABD. Can be NULL (in this case @dsize = 0)
+ * @func_raidz_gen should be implemented so that its behaviour
+ * is the same whether it is given linear or scatter buffers
+ */
+void
+abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+ ssize_t csize, ssize_t dsize, const unsigned parity,
+ void (*func_raidz_gen)(void **, const void *, size_t, size_t))
+{
+ int i;
+ ssize_t len, dlen;
+ struct abd_iter caiters[3];
+ struct abd_iter daiter = {0};
+ void *caddrs[3];
+ unsigned long flags __maybe_unused = 0;
+ abd_t *c_cabds[3];
+ abd_t *c_dabd = NULL;
+ boolean_t cabds_is_gang_abd[3];
+ boolean_t dabd_is_gang_abd = B_FALSE;
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++) {
+ cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
+ c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0);
+ }
+
+ if (dabd) {
+ dabd_is_gang_abd = abd_is_gang(dabd);
+ c_dabd = abd_init_abd_iter(dabd, &daiter, 0);
+ }
+
+ ASSERT3S(dsize, >=, 0);
+
+ abd_enter_critical(flags);
+ while (csize > 0) {
+ /* if we are at the end of the gang ABD we are done */
+ if (dabd_is_gang_abd && !c_dabd)
+ break;
+
+ for (i = 0; i < parity; i++) {
+ /*
+ * If we are at the end of the gang ABD we are
+ * done.
+ */
+ if (cabds_is_gang_abd[i] && !c_cabds[i])
+ break;
+ abd_iter_map(&caiters[i]);
+ caddrs[i] = caiters[i].iter_mapaddr;
+ }
+
+ len = csize;
+
+ if (dabd && dsize > 0)
+ abd_iter_map(&daiter);
+
+ switch (parity) {
+ case 3:
+ len = MIN(caiters[2].iter_mapsize, len);
+ /* falls through */
+ case 2:
+ len = MIN(caiters[1].iter_mapsize, len);
+ /* falls through */
+ case 1:
+ len = MIN(caiters[0].iter_mapsize, len);
+ }
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+
+ if (dabd && dsize > 0) {
+ /* this needs precise iter.length */
+ len = MIN(daiter.iter_mapsize, len);
+ dlen = len;
+ } else
+ dlen = 0;
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+ * segment except the last one is not a multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&caiters[i]);
+ c_cabds[i] =
+ abd_advance_abd_iter(cabds[i], c_cabds[i],
+ &caiters[i], len);
+ }
+
+ if (dabd && dsize > 0) {
+ abd_iter_unmap(&daiter);
+ c_dabd =
+ abd_advance_abd_iter(dabd, c_dabd, &daiter,
+ dlen);
+ dsize -= dlen;
+ }
+
+ csize -= len;
+
+ ASSERT3S(dsize, >=, 0);
+ ASSERT3S(csize, >=, 0);
+ }
+ abd_exit_critical(flags);
+}
+
+/*
+ * Iterate over code ABDs and data reconstruction target ABDs and call
+ * @func_raidz_rec. Function maps at most 6 pages atomically.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @tabds rec target ABDs, at most 3
+ * @tsize size of data target columns
+ * @func_raidz_rec expects syndrome data in target columns. Function
+ * reconstructs data and overwrites target columns.
+ */
+void
+abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+ ssize_t tsize, const unsigned parity,
+ void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+ const unsigned *mul),
+ const unsigned *mul)
+{
+ int i;
+ ssize_t len;
+ struct abd_iter citers[3];
+ struct abd_iter xiters[3];
+ void *caddrs[3], *xaddrs[3];
+ unsigned long flags __maybe_unused = 0;
+ boolean_t cabds_is_gang_abd[3];
+ boolean_t tabds_is_gang_abd[3];
+ abd_t *c_cabds[3];
+ abd_t *c_tabds[3];
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++) {
+ cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
+ tabds_is_gang_abd[i] = abd_is_gang(tabds[i]);
+ c_cabds[i] =
+ abd_init_abd_iter(cabds[i], &citers[i], 0);
+ c_tabds[i] =
+ abd_init_abd_iter(tabds[i], &xiters[i], 0);
+ }
+
+ abd_enter_critical(flags);
+ while (tsize > 0) {
+
+ for (i = 0; i < parity; i++) {
+ /*
+ * If we are at the end of the gang ABD we
+ * are done.
+ */
+ if (cabds_is_gang_abd[i] && !c_cabds[i])
+ break;
+ if (tabds_is_gang_abd[i] && !c_tabds[i])
+ break;
+ abd_iter_map(&citers[i]);
+ abd_iter_map(&xiters[i]);
+ caddrs[i] = citers[i].iter_mapaddr;
+ xaddrs[i] = xiters[i].iter_mapaddr;
+ }
+
+ len = tsize;
+ switch (parity) {
+ case 3:
+ len = MIN(xiters[2].iter_mapsize, len);
+ len = MIN(citers[2].iter_mapsize, len);
+ /* falls through */
+ case 2:
+ len = MIN(xiters[1].iter_mapsize, len);
+ len = MIN(citers[1].iter_mapsize, len);
+ /* falls through */
+ case 1:
+ len = MIN(xiters[0].iter_mapsize, len);
+ len = MIN(citers[0].iter_mapsize, len);
+ }
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+ * segment except the last one is not a multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_rec(xaddrs, len, caddrs, mul);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&xiters[i]);
+ abd_iter_unmap(&citers[i]);
+ c_tabds[i] =
+ abd_advance_abd_iter(tabds[i], c_tabds[i],
+ &xiters[i], len);
+ c_cabds[i] =
+ abd_advance_abd_iter(cabds[i], c_cabds[i],
+ &citers[i], len);
+ }
+
+ tsize -= len;
+ ASSERT3S(tsize, >=, 0);
+ }
+ abd_exit_critical(flags);
+}
diff --git a/sys/contrib/openzfs/module/zfs/aggsum.c b/sys/contrib/openzfs/module/zfs/aggsum.c
new file mode 100644
index 000000000000..e46da95f676c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/aggsum.c
@@ -0,0 +1,240 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/aggsum.h>
+
+/*
+ * Aggregate-sum counters are a form of fanned-out counter, used when atomic
+ * instructions on a single field cause enough CPU cache line contention to
+ * slow system performance. Due to their increased overhead and the expense
+ * involved with precisely reading from them, they should only be used in cases
+ * where the write rate (increment/decrement) is much higher than the read rate
+ * (get value).
+ *
+ * Aggregate sum counters are comprised of two basic parts, the core and the
+ * buckets. The core counter contains a lock for the entire counter, as well
+ * as the current upper and lower bounds on the value of the counter. The
+ * aggsum_bucket structure contains a per-bucket lock to protect the contents of
+ * the bucket, the current amount that this bucket has changed from the global
+ * counter (called the delta), and the amount of increment and decrement we have
+ * "borrowed" from the core counter.
+ *
+ * The basic operation of an aggsum is simple. Threads that wish to modify the
+ * counter will modify one bucket's counter (determined by their current CPU, to
+ * help minimize lock and cache contention). If the bucket already has
+ * sufficient capacity borrowed from the core structure to handle their request,
+ * they simply modify the delta and return. If the bucket does not, we clear
+ * the bucket's current state (to prevent the borrowed amounts from getting too
+ * large), and borrow more from the core counter. Borrowing is done by adding to
+ * the upper bound (or subtracting from the lower bound) of the core counter,
+ * and setting the borrow value for the bucket to the amount added (or
+ * subtracted). Clearing the bucket is the opposite; we add the current delta
+ * to both the lower and upper bounds of the core counter, subtract the borrowed
+ * increment from the upper bound, and add the borrowed decrement to the
+ * lower bound. Note that only borrowing and clearing require access to the
+ * core counter; since all other operations access CPU-local resources,
+ * performance can be much higher than a traditional counter.
+ *
+ * Threads that wish to read from the counter have a slightly more challenging
+ * task. It is fast to determine the upper and lower bounds of the aggsum; this
+ * does not require grabbing any locks. This suffices for cases where an
+ * approximation of the aggsum's value is acceptable. However, if one needs to
+ * know whether some specific value is above or below the current value in the
+ * aggsum, they invoke aggsum_compare(). This function operates by repeatedly
+ * comparing the target value to the upper and lower bounds of the aggsum, and
+ * then clearing a bucket. This proceeds until the target is outside of the
+ * upper and lower bounds and we return a response, or the last bucket has been
+ * cleared and we know that the target is equal to the aggsum's value. Finally,
+ * the most expensive operation is determining the precise value of the aggsum.
+ * To do this, we clear every bucket and then return the upper bound (which must
+ * be equal to the lower bound). What makes aggsum_compare() and aggsum_value()
+ * expensive is clearing buckets. This involves grabbing the global lock
+ * (serializing against themselves and borrow operations), grabbing a bucket's
+ * lock (preventing threads on those CPUs from modifying their delta), and
+ * zeroing out the borrowed value (forcing that thread to borrow on its next
+ * request, which will also be expensive). This is what makes aggsums well
+ * suited for write-many read-rarely operations.
+ *
+ * Note that the aggsums do not expand if more CPUs are hot-added. In that
+ * case, we will have less fanout than boot_ncpus, but we don't want to always
+ * reserve the RAM necessary to create the extra slots for additional CPUs up
+ * front, and dynamically adding them is a complex task.
+ */
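As a minimal usage sketch of the interface described above (assuming a kernel context with <sys/aggsum.h> available; the counter and helper names here are hypothetical):

#include <sys/zfs_context.h>
#include <sys/aggsum.h>

/* Hypothetical fanned-out counter of in-use bytes. */
static aggsum_t example_bytes_inuse;

static void
example_setup(void)
{
        /* One bucket per CPU (boot_ncpus), starting value 0. */
        aggsum_init(&example_bytes_inuse, 0);
}

static void
example_track(int64_t delta)
{
        /* Hot path: normally touches only the current CPU's bucket. */
        aggsum_add(&example_bytes_inuse, delta);
}

static uint64_t
example_report(void)
{
        /* Slow path: flushes every bucket to obtain the exact value. */
        return (aggsum_value(&example_bytes_inuse));
}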
+
+/*
+ * We will borrow aggsum_borrow_multiplier times the current request, so we will
+ * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
+ * aggsum_add().
+ */
+static uint_t aggsum_borrow_multiplier = 10;
+
+void
+aggsum_init(aggsum_t *as, uint64_t value)
+{
+ bzero(as, sizeof (*as));
+ as->as_lower_bound = as->as_upper_bound = value;
+ mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
+ as->as_numbuckets = boot_ncpus;
+ as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
+ KM_SLEEP);
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ mutex_init(&as->as_buckets[i].asc_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+}
+
+void
+aggsum_fini(aggsum_t *as)
+{
+ for (int i = 0; i < as->as_numbuckets; i++)
+ mutex_destroy(&as->as_buckets[i].asc_lock);
+ kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t));
+ mutex_destroy(&as->as_lock);
+}
+
+int64_t
+aggsum_lower_bound(aggsum_t *as)
+{
+ return (as->as_lower_bound);
+}
+
+int64_t
+aggsum_upper_bound(aggsum_t *as)
+{
+ return (as->as_upper_bound);
+}
+
+static void
+aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
+{
+ ASSERT(MUTEX_HELD(&as->as_lock));
+ ASSERT(MUTEX_HELD(&asb->asc_lock));
+
+ /*
+ * We use atomic instructions for this because we read the upper and
+ * lower bounds without the lock, so we need stores to be atomic.
+ */
+ atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
+ asb->asc_delta + asb->asc_borrowed);
+ atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
+ asb->asc_delta - asb->asc_borrowed);
+ asb->asc_delta = 0;
+ asb->asc_borrowed = 0;
+}
+
+uint64_t
+aggsum_value(aggsum_t *as)
+{
+ int64_t rv;
+
+ mutex_enter(&as->as_lock);
+ if (as->as_lower_bound == as->as_upper_bound) {
+ rv = as->as_lower_bound;
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ ASSERT0(as->as_buckets[i].asc_delta);
+ ASSERT0(as->as_buckets[i].asc_borrowed);
+ }
+ mutex_exit(&as->as_lock);
+ return (rv);
+ }
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ struct aggsum_bucket *asb = &as->as_buckets[i];
+ mutex_enter(&asb->asc_lock);
+ aggsum_flush_bucket(as, asb);
+ mutex_exit(&asb->asc_lock);
+ }
+ VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
+ rv = as->as_lower_bound;
+ mutex_exit(&as->as_lock);
+
+ return (rv);
+}
+
+void
+aggsum_add(aggsum_t *as, int64_t delta)
+{
+ struct aggsum_bucket *asb;
+ int64_t borrow;
+
+ asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets];
+
+ /* Try fast path if we already borrowed enough before. */
+ mutex_enter(&asb->asc_lock);
+ if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
+ asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
+ asb->asc_delta += delta;
+ mutex_exit(&asb->asc_lock);
+ return;
+ }
+ mutex_exit(&asb->asc_lock);
+
+ /*
+ * We haven't borrowed enough. Take the global lock and borrow
+ * considering what is requested now and what we borrowed before.
+ */
+ borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
+ mutex_enter(&as->as_lock);
+ mutex_enter(&asb->asc_lock);
+ delta += asb->asc_delta;
+ asb->asc_delta = 0;
+ if (borrow >= asb->asc_borrowed)
+ borrow -= asb->asc_borrowed;
+ else
+ borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
+ asb->asc_borrowed += borrow;
+ atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
+ delta - borrow);
+ atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
+ delta + borrow);
+ mutex_exit(&asb->asc_lock);
+ mutex_exit(&as->as_lock);
+}
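To make the slow path above concrete (using the default aggsum_borrow_multiplier of 10): a first aggsum_add() of +5 on a fresh bucket finds asc_delta = 0 and asc_borrowed = 0, so the fast path fails. The slow path computes borrow = 5 * 10 = 50, sets asc_borrowed = 50, and adjusts the core bounds by delta - borrow = -45 (lower) and delta + borrow = +55 (upper). Subsequent adds on that CPU stay on the fast path as long as the bucket's running delta remains within ±50.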
+
+/*
+ * Compare the aggsum value to target efficiently. Returns -1 if the value
+ * represented by the aggsum is less than target, 1 if it's greater, and 0 if
+ * they are equal.
+ */
+int
+aggsum_compare(aggsum_t *as, uint64_t target)
+{
+ if (as->as_upper_bound < target)
+ return (-1);
+ if (as->as_lower_bound > target)
+ return (1);
+ mutex_enter(&as->as_lock);
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ struct aggsum_bucket *asb = &as->as_buckets[i];
+ mutex_enter(&asb->asc_lock);
+ aggsum_flush_bucket(as, asb);
+ mutex_exit(&asb->asc_lock);
+ if (as->as_upper_bound < target) {
+ mutex_exit(&as->as_lock);
+ return (-1);
+ }
+ if (as->as_lower_bound > target) {
+ mutex_exit(&as->as_lock);
+ return (1);
+ }
+ }
+ VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
+ ASSERT3U(as->as_lower_bound, ==, target);
+ mutex_exit(&as->as_lock);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
new file mode 100644
index 000000000000..b4f0c8a85b64
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -0,0 +1,10768 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc.
+ * Copyright (c) 2011, 2020, Delphix. All rights reserved.
+ * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2020, George Amanakis. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2020, The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+/*
+ * DVA-based Adjustable Replacement Cache
+ *
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
+ * presented by Megiddo and Modha at FAST 2003, there are some
+ * significant differences:
+ *
+ * 1. The Megiddo and Modha model assumes any page is evictable.
+ * Pages in its cache cannot be "locked" into memory. This makes
+ * the eviction algorithm simple: evict the last page in the list.
+ * This also makes the performance characteristics easy to reason
+ * about. Our cache is not so simple. At any given moment, some
+ * subset of the blocks in the cache are un-evictable because we
+ * have handed out a reference to them. Blocks are only evictable
+ * when there are no external references active. This makes
+ * eviction far more problematic: we choose to evict the evictable
+ * blocks that are the "lowest" in the list.
+ *
+ * There are times when it is not possible to evict the requested
+ * space. In these circumstances we are unable to adjust the cache
+ * size. To prevent the cache growing unbounded at these times we
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
+ *
+ * 2. The Megiddo and Modha model assumes a fixed cache size.
+ * Pages are evicted when the cache is full and there is a cache
+ * miss. Our model has a variable sized cache. It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size. So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict. In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes). We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists. The arc_read() interface
+ * uses method 1, while the internal ARC algorithms for
+ * adjusting the cache use method 2. We therefore provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * ARC list locks.
+ *
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table. It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each ARC state also has a mutex which is used to protect the
+ * buffer list associated with the state. When attempting to
+ * obtain a hash table lock while holding an ARC list lock you
+ * must use: mutex_tryenter() to avoid deadlock. Also note that
+ * the active state mutex must be held before the ghost state mutex.
+ *
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted. In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored. For example,
+ * when using the ZPL each dentry holds a reference on a znode. These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ *
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
+ *
+ * - L2ARC buflist creation
+ * - L2ARC buflist eviction
+ * - L2ARC write completion, which walks L2ARC buflists
+ * - ARC header destruction, as it removes from L2ARC buflists
+ * - ARC header release, as it removes from L2ARC buflists
+ */
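A small sketch of the hash-lock discipline described above (the helper is hypothetical; buf_hash_find() is the static routine defined further down in this file):

/*
 * Hypothetical lookup helper: on a hit, buf_hash_find() returns with the
 * appropriate hash lock held, so the caller is responsible for dropping it.
 * No ARC list lock is held here, so plain mutex_exit() ordering suffices.
 */
static arc_buf_hdr_t *
example_lookup(uint64_t spa_guid, const blkptr_t *bp)
{
        kmutex_t *hash_lock;
        arc_buf_hdr_t *hdr;

        hdr = buf_hash_find(spa_guid, bp, &hash_lock);
        if (hdr != NULL) {
                /* ... examine hdr while the hash lock protects it ... */
                mutex_exit(hash_lock);
        }
        return (hdr);
}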
+
+/*
+ * ARC operation:
+ *
+ * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
+ * This structure can point either to a block that is still in the cache or to
+ * one that is only accessible in an L2 ARC device, or it can provide
+ * information about a block that was recently evicted. If a block is
+ * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
+ * information to retrieve it from the L2ARC device. This information is
+ * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
+ * that is in this state cannot access the data directly.
+ *
+ * Blocks that are actively being referenced or have not been evicted
+ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
+ * the arc_buf_hdr_t that will point to the data block in memory. A block can
+ * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
+ * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
+ *
+ * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
+ * ability to store the physical data (b_pabd) associated with the DVA of the
+ * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
+ * it will match its on-disk compression characteristics. This behavior can be
+ * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pabd will point to an
+ * uncompressed version of the on-disk data.
+ *
+ * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
+ * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
+ * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer. The ARC will provide references to this data and will keep it
+ * cached until it is no longer in use. The ARC caches only the L1ARC's physical
+ * data block and will evict any arc_buf_t that is no longer referenced. The
+ * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
+ * "overhead_size" kstat.
+ *
+ * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
+ * compressed form. The typical case is that consumers will want uncompressed
+ * data, and when that happens a new data buffer is allocated where the data is
+ * decompressed for them to use. Currently the only consumer who wants
+ * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
+ * exists on disk. When this happens, the arc_buf_t's data buffer is shared
+ * with the arc_buf_hdr_t.
+ *
+ * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
+ * first one is owned by a compressed send consumer (and therefore references
+ * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
+ * used by any other consumer (and has its own uncompressed copy of the data
+ * buffer).
+ *
+ * arc_buf_hdr_t
+ * +-----------+
+ * | fields |
+ * | common to |
+ * | L1- and |
+ * | L2ARC |
+ * +-----------+
+ * | l2arc_buf_hdr_t
+ * | |
+ * +-----------+
+ * | l1arc_buf_hdr_t
+ * | | arc_buf_t
+ * | b_buf +------------>+-----------+ arc_buf_t
+ * | b_pabd +-+ |b_next +---->+-----------+
+ * +-----------+ | |-----------| |b_next +-->NULL
+ * | |b_comp = T | +-----------+
+ * | |b_data +-+ |b_comp = F |
+ * | +-----------+ | |b_data +-+
+ * +->+------+ | +-----------+ |
+ * compressed | | | |
+ * data | |<--------------+ | uncompressed
+ * +------+ compressed, | data
+ * shared +-->+------+
+ * data | |
+ * | |
+ * +------+
+ *
+ * When a consumer reads a block, the ARC must first look to see if the
+ * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
+ * arc_buf_t and either copies uncompressed data into a new data buffer from an
+ * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
+ * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
+ * hdr is compressed and the desired compression characteristics of the
+ * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
+ * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
+ * the last buffer in the hdr's b_buf list, however a shared compressed buf can
+ * be anywhere in the hdr's list.
+ *
+ * The diagram below shows an example of an uncompressed ARC hdr that is
+ * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
+ * the last element in the buf list):
+ *
+ * arc_buf_hdr_t
+ * +-----------+
+ * | |
+ * | |
+ * | |
+ * +-----------+
+ * l2arc_buf_hdr_t| |
+ * | |
+ * +-----------+
+ * l1arc_buf_hdr_t| |
+ * | | arc_buf_t (shared)
+ * | b_buf +------------>+---------+ arc_buf_t
+ * | | |b_next +---->+---------+
+ * | b_pabd +-+ |---------| |b_next +-->NULL
+ * +-----------+ | | | +---------+
+ * | |b_data +-+ | |
+ * | +---------+ | |b_data +-+
+ * +->+------+ | +---------+ |
+ * | | | |
+ * uncompressed | | | |
+ * data +------+ | |
+ * ^ +->+------+ |
+ * | uncompressed | | |
+ * | data | | |
+ * | +------+ |
+ * +---------------------------------+
+ *
+ * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
+ * since the physical block is about to be rewritten. The new data contents
+ * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
+ * it may compress the data before writing it to disk. The ARC will be called
+ * with the transformed data and will bcopy the transformed on-disk block into
+ * a newly allocated b_pabd. Writes are always done into buffers which have
+ * either been loaned (and hence are new and don't have other readers) or
+ * buffers which have been released (and hence have their own hdr, if there
+ * were originally other readers of the buf's original hdr). This ensures that
+ * the ARC only needs to update a single buf and its hdr after a write occurs.
+ *
+ * When the L2ARC is in use, it will also take advantage of the b_pabd. The
+ * L2ARC will always write the contents of b_pabd to the L2ARC. This means
+ * that when compressed ARC is enabled that the L2ARC blocks are identical
+ * to the on-disk block in the main data pool. This provides a significant
+ * advantage since the ARC can leverage the bp's checksum when reading from the
+ * L2ARC to determine if the contents are valid. However, if the compressed
+ * ARC is disabled, then the L2ARC's block must be transformed to look
+ * like the physical block in the main data pool before comparing the
+ * checksum and determining its validity.
+ *
+ * The L1ARC has a slightly different system for storing encrypted data.
+ * Raw (encrypted + possibly compressed) data has a few subtle differences from
+ * data that is just compressed. The biggest difference is that it is not
+ * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded.
+ * The other difference is that encryption cannot be treated as a suggestion.
+ * If a caller would prefer compressed data, but they actually wind up with
+ * uncompressed data the worst thing that could happen is there might be a
+ * performance hit. If the caller requests encrypted data, however, we must be
+ * sure they actually get it or else secret information could be leaked. Raw
+ * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
+ * may have both an encrypted version and a decrypted version of its data at
+ * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
+ * copied out of this header. To avoid complications with b_pabd, raw buffers
+ * cannot be shared.
+ */
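A brief sketch of how a consumer can tell which form of data it was handed, using accessors defined later in this file (the helper itself is hypothetical):

/* Hypothetical helper: report whether a buf holds compressed or logical data. */
static void
example_describe_buf(arc_buf_t *buf)
{
        uint64_t size = arc_buf_size(buf);      /* psize if compressed */
        uint64_t lsize = arc_buf_lsize(buf);    /* always the logical size */

        if (arc_get_compression(buf) != ZIO_COMPRESS_OFF) {
                /* e.g. a buf handed to a compressed "zfs send" stream */
                ASSERT3U(size, <=, lsize);
        } else {
                /* an uncompressed copy decompressed for the consumer */
                ASSERT3U(size, ==, lsize);
        }
}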
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/zfs_refcount.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/multilist.h>
+#include <sys/abd.h>
+#include <sys/zil.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/zthr.h>
+#include <zfs_fletcher.h>
+#include <sys/arc_impl.h>
+#include <sys/trace_zfs.h>
+#include <sys/aggsum.h>
+#include <cityhash.h>
+#include <sys/vdev_trim.h>
+#include <sys/zstd/zstd.h>
+
+#ifndef _KERNEL
+/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
+boolean_t arc_watch = B_FALSE;
+#endif
+
+/*
+ * This thread's job is to keep enough free memory in the system, by
+ * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
+ * arc_available_memory().
+ */
+static zthr_t *arc_reap_zthr;
+
+/*
+ * This thread's job is to keep arc_size under arc_c, by calling
+ * arc_evict(), which improves arc_is_overflowing().
+ */
+static zthr_t *arc_evict_zthr;
+
+static kmutex_t arc_evict_lock;
+static boolean_t arc_evict_needed = B_FALSE;
+
+/*
+ * Count of bytes evicted since boot.
+ */
+static uint64_t arc_evict_count;
+
+/*
+ * List of arc_evict_waiter_t's, representing threads waiting for the
+ * arc_evict_count to reach specific values.
+ */
+static list_t arc_evict_waiters;
+
+/*
+ * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
+ * the requested amount of data to be evicted. For example, by default for
+ * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
+ * Since this is above 100%, it ensures that progress is made towards getting
+ * arc_size under arc_c. Since this is finite, it ensures that allocations
+ * can still happen, even during the potentially long time that arc_size is
+ * more than arc_c.
+ */
+int zfs_arc_eviction_pct = 200;
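As a rough worked example of the default (200): a thread that needs 64 KB while the ARC is overflowing waits for 64 KB * 200 / 100 = 128 KB to be evicted, so at most half of the freed space can be consumed by its own allocation.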
+
+/*
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
+ */
+int zfs_arc_evict_batch_limit = 10;
+
+/* number of seconds before growing cache again */
+int arc_grow_retry = 5;
+
+/*
+ * Minimum time between calls to arc_kmem_reap_soon().
+ */
+int arc_kmem_cache_reap_retry_ms = 1000;
+
+/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
+int zfs_arc_overflow_shift = 8;
+
+/* shift of arc_c for calculating both min and max arc_p */
+int arc_p_min_shift = 4;
+
+/* log2(fraction of arc to reclaim) */
+int arc_shrink_shift = 7;
+
+/* percent of pagecache to reclaim arc to */
+#ifdef _KERNEL
+uint_t zfs_arc_pc_percent = 0;
+#endif
+
+/*
+ * log2(fraction of ARC which must be free to allow growing).
+ * I.e., if there is less than arc_c >> arc_no_grow_shift free memory,
+ * when reading a new block into the ARC, we will evict an equal-sized block
+ * from the ARC.
+ *
+ * This must be less than arc_shrink_shift, so that when we shrink the ARC,
+ * we will still not allow it to grow.
+ */
+int arc_no_grow_shift = 5;
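For a concrete sense of scale, assume arc_c is 4 GB: growth is blocked unless at least arc_c >> 5 = 128 MB is free, while one shrink step reclaims arc_c >> 7 = 32 MB; because 32 MB is well below 128 MB, a single shrink cannot by itself create enough free memory to re-enable growth.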
+
+
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+static int arc_min_prefetch_ms;
+static int arc_min_prescient_prefetch_ms;
+
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
+/*
+ * The arc has filled available memory and has now warmed up.
+ */
+boolean_t arc_warm;
+
+/*
+ * These tunables are for performance analysis.
+ */
+unsigned long zfs_arc_max = 0;
+unsigned long zfs_arc_min = 0;
+unsigned long zfs_arc_meta_limit = 0;
+unsigned long zfs_arc_meta_min = 0;
+unsigned long zfs_arc_dnode_limit = 0;
+unsigned long zfs_arc_dnode_reduce_percent = 10;
+int zfs_arc_grow_retry = 0;
+int zfs_arc_shrink_shift = 0;
+int zfs_arc_p_min_shift = 0;
+int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+
+/*
+ * ARC dirty data constraints for arc_tempreserve_space() throttle.
+ */
+unsigned long zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */
+unsigned long zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */
+unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
+
+/*
+ * Enable or disable compressed arc buffers.
+ */
+int zfs_compressed_arc_enabled = B_TRUE;
+
+/*
+ * ARC will evict meta buffers that exceed arc_meta_limit. This
+ * tunable makes arc_meta_limit adjustable for different workloads.
+ */
+unsigned long zfs_arc_meta_limit_percent = 75;
+
+/*
+ * Percentage that can be consumed by dnodes of ARC meta buffers.
+ */
+unsigned long zfs_arc_dnode_limit_percent = 10;
+
+/*
+ * These tunables are Linux specific
+ */
+unsigned long zfs_arc_sys_free = 0;
+int zfs_arc_min_prefetch_ms = 0;
+int zfs_arc_min_prescient_prefetch_ms = 0;
+int zfs_arc_p_dampener_disable = 1;
+int zfs_arc_meta_prune = 10000;
+int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
+int zfs_arc_meta_adjust_restarts = 4096;
+int zfs_arc_lotsfree_percent = 10;
+
+/* The 6 states: */
+arc_state_t ARC_anon;
+arc_state_t ARC_mru;
+arc_state_t ARC_mru_ghost;
+arc_state_t ARC_mfu;
+arc_state_t ARC_mfu_ghost;
+arc_state_t ARC_l2c_only;
+
+arc_stats_t arc_stats = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+ { "demand_data_hits", KSTAT_DATA_UINT64 },
+ { "demand_data_misses", KSTAT_DATA_UINT64 },
+ { "demand_metadata_hits", KSTAT_DATA_UINT64 },
+ { "demand_metadata_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_data_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_data_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
+ { "mru_hits", KSTAT_DATA_UINT64 },
+ { "mru_ghost_hits", KSTAT_DATA_UINT64 },
+ { "mfu_hits", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
+ { "deleted", KSTAT_DATA_UINT64 },
+ { "mutex_miss", KSTAT_DATA_UINT64 },
+ { "access_skip", KSTAT_DATA_UINT64 },
+ { "evict_skip", KSTAT_DATA_UINT64 },
+ { "evict_not_enough", KSTAT_DATA_UINT64 },
+ { "evict_l2_cached", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 },
+ { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_skip", KSTAT_DATA_UINT64 },
+ { "hash_elements", KSTAT_DATA_UINT64 },
+ { "hash_elements_max", KSTAT_DATA_UINT64 },
+ { "hash_collisions", KSTAT_DATA_UINT64 },
+ { "hash_chains", KSTAT_DATA_UINT64 },
+ { "hash_chain_max", KSTAT_DATA_UINT64 },
+ { "p", KSTAT_DATA_UINT64 },
+ { "c", KSTAT_DATA_UINT64 },
+ { "c_min", KSTAT_DATA_UINT64 },
+ { "c_max", KSTAT_DATA_UINT64 },
+ { "size", KSTAT_DATA_UINT64 },
+ { "compressed_size", KSTAT_DATA_UINT64 },
+ { "uncompressed_size", KSTAT_DATA_UINT64 },
+ { "overhead_size", KSTAT_DATA_UINT64 },
+ { "hdr_size", KSTAT_DATA_UINT64 },
+ { "data_size", KSTAT_DATA_UINT64 },
+ { "metadata_size", KSTAT_DATA_UINT64 },
+ { "dbuf_size", KSTAT_DATA_UINT64 },
+ { "dnode_size", KSTAT_DATA_UINT64 },
+ { "bonus_size", KSTAT_DATA_UINT64 },
+#if defined(COMPAT_FREEBSD11)
+ { "other_size", KSTAT_DATA_UINT64 },
+#endif
+ { "anon_size", KSTAT_DATA_UINT64 },
+ { "anon_evictable_data", KSTAT_DATA_UINT64 },
+ { "anon_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mru_size", KSTAT_DATA_UINT64 },
+ { "mru_evictable_data", KSTAT_DATA_UINT64 },
+ { "mru_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mru_ghost_size", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_size", KSTAT_DATA_UINT64 },
+ { "mfu_evictable_data", KSTAT_DATA_UINT64 },
+ { "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_size", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "l2_hits", KSTAT_DATA_UINT64 },
+ { "l2_misses", KSTAT_DATA_UINT64 },
+ { "l2_prefetch_asize", KSTAT_DATA_UINT64 },
+ { "l2_mru_asize", KSTAT_DATA_UINT64 },
+ { "l2_mfu_asize", KSTAT_DATA_UINT64 },
+ { "l2_bufc_data_asize", KSTAT_DATA_UINT64 },
+ { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 },
+ { "l2_feeds", KSTAT_DATA_UINT64 },
+ { "l2_rw_clash", KSTAT_DATA_UINT64 },
+ { "l2_read_bytes", KSTAT_DATA_UINT64 },
+ { "l2_write_bytes", KSTAT_DATA_UINT64 },
+ { "l2_writes_sent", KSTAT_DATA_UINT64 },
+ { "l2_writes_done", KSTAT_DATA_UINT64 },
+ { "l2_writes_error", KSTAT_DATA_UINT64 },
+ { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
+ { "l2_free_on_write", KSTAT_DATA_UINT64 },
+ { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_cksum_bad", KSTAT_DATA_UINT64 },
+ { "l2_io_error", KSTAT_DATA_UINT64 },
+ { "l2_size", KSTAT_DATA_UINT64 },
+ { "l2_asize", KSTAT_DATA_UINT64 },
+ { "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_asize", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_count", KSTAT_DATA_UINT64 },
+ { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_success", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_size", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_asize", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
+ { "memory_throttle_count", KSTAT_DATA_UINT64 },
+ { "memory_direct_count", KSTAT_DATA_UINT64 },
+ { "memory_indirect_count", KSTAT_DATA_UINT64 },
+ { "memory_all_bytes", KSTAT_DATA_UINT64 },
+ { "memory_free_bytes", KSTAT_DATA_UINT64 },
+ { "memory_available_bytes", KSTAT_DATA_INT64 },
+ { "arc_no_grow", KSTAT_DATA_UINT64 },
+ { "arc_tempreserve", KSTAT_DATA_UINT64 },
+ { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
+ { "arc_prune", KSTAT_DATA_UINT64 },
+ { "arc_meta_used", KSTAT_DATA_UINT64 },
+ { "arc_meta_limit", KSTAT_DATA_UINT64 },
+ { "arc_dnode_limit", KSTAT_DATA_UINT64 },
+ { "arc_meta_max", KSTAT_DATA_UINT64 },
+ { "arc_meta_min", KSTAT_DATA_UINT64 },
+ { "async_upgrade_sync", KSTAT_DATA_UINT64 },
+ { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+ { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
+ { "arc_need_free", KSTAT_DATA_UINT64 },
+ { "arc_sys_free", KSTAT_DATA_UINT64 },
+ { "arc_raw_size", KSTAT_DATA_UINT64 },
+ { "cached_only_in_progress", KSTAT_DATA_UINT64 },
+ { "abd_chunk_waste_size", KSTAT_DATA_UINT64 },
+};
+
+#define ARCSTAT_MAX(stat, val) { \
+ uint64_t m; \
+ while ((val) > (m = arc_stats.stat.value.ui64) && \
+ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
+ continue; \
+}
+
+#define ARCSTAT_MAXSTAT(stat) \
+ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
+
+/*
+ * We define a macro to allow ARC hits/misses to be easily broken down by
+ * two separate conditions, giving a total of four different subtypes for
+ * each of hits and misses (so eight statistics total).
+ */
+#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
+ if (cond1) { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
+ } \
+ } else { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
+ } \
+ }
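As an illustration of the macro above (a hypothetical invocation; the real call sites appear later in this file):

/* Hypothetical helper: classify a hit as demand/prefetch x data/metadata. */
static void
example_count_hit(arc_buf_hdr_t *hdr)
{
        /*
         * Expands to bump exactly one of arcstat_demand_data_hits,
         * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
         * arcstat_prefetch_metadata_hits.
         */
        ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
            HDR_ISTYPE_METADATA(hdr), metadata, data, hits);
}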
+
+/*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first factor it and the update value by
+ * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
+ * average. This macro assumes that integer loads and stores are atomic, but
+ * is not safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+#define ARCSTAT_F_AVG_FACTOR 3
+#define ARCSTAT_F_AVG(stat, value) \
+ do { \
+ uint64_t x = ARCSTAT(stat); \
+ x = x - x / ARCSTAT_F_AVG_FACTOR + \
+ (value) / ARCSTAT_F_AVG_FACTOR; \
+ ARCSTAT(stat) = x; \
+ _NOTE(CONSTCOND) \
+ } while (0)
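A worked update with ARCSTAT_F_AVG_FACTOR of 3: if the stored average is 900 and the new sample is 300, the macro computes 900 - 900/3 + 300/3 = 700, i.e. an exponential moving average that weights each new sample by 1/3.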
+
+kstat_t *arc_ksp;
+static arc_state_t *arc_anon;
+static arc_state_t *arc_mru_ghost;
+static arc_state_t *arc_mfu_ghost;
+static arc_state_t *arc_l2c_only;
+
+arc_state_t *arc_mru;
+arc_state_t *arc_mfu;
+
+/*
+ * There are several ARC variables that are critical to export as kstats --
+ * but we don't want to have to grovel around in the kstat whenever we wish to
+ * manipulate them. For these variables, we therefore define them to be in
+ * terms of the statistic variable. This assures that we are not introducing
+ * the possibility of inconsistency by having shadow copies of the variables,
+ * while still allowing the code to be readable.
+ */
+#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
+#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
+#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+/* max size for dnodes */
+#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit)
+#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
+#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
+#define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */
+
+/* size of all b_rabd's in entire arc */
+#define arc_raw_size ARCSTAT(arcstat_raw_size)
+/* compressed size of entire arc */
+#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
+/* uncompressed size of entire arc */
+#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
+/* number of bytes in the arc from arc_buf_t's */
+#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
+
+/*
+ * There are also some ARC variables that we want to export, but that are
+ * updated so often that having the canonical representation be the statistic
+ * variable causes a performance bottleneck. We want to use aggsum_t's for these
+ * instead, but still be able to export the kstat in the same way as before.
+ * The solution is to always use the aggsum version, except in the kstat update
+ * callback.
+ */
+aggsum_t arc_size;
+aggsum_t arc_meta_used;
+aggsum_t astat_data_size;
+aggsum_t astat_metadata_size;
+aggsum_t astat_dbuf_size;
+aggsum_t astat_dnode_size;
+aggsum_t astat_bonus_size;
+aggsum_t astat_hdr_size;
+aggsum_t astat_l2_hdr_size;
+aggsum_t astat_abd_chunk_waste_size;
+
+hrtime_t arc_growtime;
+list_t arc_prune_list;
+kmutex_t arc_prune_mtx;
+taskq_t *arc_prune_taskq;
+
+#define GHOST_STATE(state) \
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
+ (state) == arc_l2c_only)
+
+#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
+#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
+#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
+#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define HDR_PRESCIENT_PREFETCH(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+#define HDR_COMPRESSION_ENABLED(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
+
+#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define HDR_L2_READING(hdr) \
+ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
+ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
+#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
+#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
+#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED)
+#define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH)
+#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
+
+#define HDR_ISTYPE_METADATA(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
+
+#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
+#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
+#define HDR_HAS_RABD(hdr) \
+ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \
+ (hdr)->b_crypt_hdr.b_rabd != NULL)
+#define HDR_ENCRYPTED(hdr) \
+ (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
+#define HDR_AUTHENTICATED(hdr) \
+ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
+
+/* For storing compression mode in b_flags */
+#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1)
+
+#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \
+ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
+#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
+ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
+
+#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
+#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
+#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
+#define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
+
+/*
+ * Other sizes
+ */
+
+#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
+#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
+
+/*
+ * Hash table routines
+ */
+
+#define HT_LOCK_ALIGN 64
+#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
+
+struct ht_lock {
+ kmutex_t ht_lock;
+#ifdef _KERNEL
+ unsigned char pad[HT_LOCK_PAD];
+#endif
+};
+
+#define BUF_LOCKS 8192
+typedef struct buf_hash_table {
+ uint64_t ht_mask;
+ arc_buf_hdr_t **ht_table;
+ struct ht_lock ht_locks[BUF_LOCKS];
+} buf_hash_table_t;
+
+static buf_hash_table_t buf_hash_table;
+
+#define BUF_HASH_INDEX(spa, dva, birth) \
+ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
+#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
+#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define HDR_LOCK(hdr) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
+
+uint64_t zfs_crc64_table[256];
+
+/*
+ * Level 2 ARC
+ */
+
+#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 2 /* num of writes */
+
+/*
+ * If we discover any compressed buffers during an ARC scan, we boost
+ * our headroom for the next scanning cycle by this percentage multiple.
+ */
+#define L2ARC_HEADROOM_BOOST 200
+#define L2ARC_FEED_SECS 1 /* caching interval secs */
+#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
+
+/*
+ * We can feed L2ARC from two states of ARC buffers, mru and mfu,
+ * and each of these states has two types: data and metadata.
+ */
+#define L2ARC_FEED_TYPES 4
+
+#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
+#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
+
+/* L2ARC Performance Tunables */
+unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
+unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
+unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
+unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
+unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
+int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+int l2arc_feed_again = B_TRUE; /* turbo warmup */
+int l2arc_norw = B_FALSE; /* no reads during writes */
+int l2arc_meta_percent = 33; /* limit on headers size */
+
+/*
+ * L2ARC Internals
+ */
+static list_t L2ARC_dev_list; /* device list */
+static list_t *l2arc_dev_list; /* device list pointer */
+static kmutex_t l2arc_dev_mtx; /* device list mutex */
+static l2arc_dev_t *l2arc_dev_last; /* last device used */
+static list_t L2ARC_free_on_write; /* free after write buf list */
+static list_t *l2arc_free_on_write; /* free after write list ptr */
+static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
+static uint64_t l2arc_ndev; /* number of devices */
+
+typedef struct l2arc_read_callback {
+ arc_buf_hdr_t *l2rcb_hdr; /* read header */
+ blkptr_t l2rcb_bp; /* original blkptr */
+ zbookmark_phys_t l2rcb_zb; /* original bookmark */
+ int l2rcb_flags; /* original flags */
+ abd_t *l2rcb_abd; /* temporary buffer */
+} l2arc_read_callback_t;
+
+typedef struct l2arc_data_free {
+ /* protected by l2arc_free_on_write_mtx */
+ abd_t *l2df_abd;
+ size_t l2df_size;
+ arc_buf_contents_t l2df_type;
+ list_node_t l2df_list_node;
+} l2arc_data_free_t;
+
+typedef enum arc_fill_flags {
+ ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */
+ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */
+ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */
+ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */
+ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */
+} arc_fill_flags_t;
+
+static kmutex_t l2arc_feed_thr_lock;
+static kcondvar_t l2arc_feed_thr_cv;
+static uint8_t l2arc_thread_exit;
+
+static kmutex_t l2arc_rebuild_thr_lock;
+static kcondvar_t l2arc_rebuild_thr_cv;
+
+enum arc_hdr_alloc_flags {
+ ARC_HDR_ALLOC_RDATA = 0x1,
+ ARC_HDR_DO_ADAPT = 0x2,
+};
+
+
+static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
+static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
+static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
+static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
+static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
+static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
+static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int);
+static void arc_access(arc_buf_hdr_t *, kmutex_t *);
+static void arc_buf_watch(arc_buf_t *);
+
+static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
+static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+
+static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
+static void l2arc_read_done(zio_t *);
+static void l2arc_do_free_on_write(void);
+static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
+ boolean_t state_only);
+
+#define l2arc_hdr_arcstats_increment(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
+#define l2arc_hdr_arcstats_decrement(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
+#define l2arc_hdr_arcstats_increment_state(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
+#define l2arc_hdr_arcstats_decrement_state(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
+
+/*
+ * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
+ * metadata and data are cached from ARC into L2ARC.
+ */
+int l2arc_mfuonly = 0;
+
+/*
+ * L2ARC TRIM
+ * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
+ * the current write size (l2arc_write_max) we should TRIM if we
+ * have filled the device. It is defined as a percentage of the
+ * write size. If set to 100 we trim twice the space required to
+ * accommodate upcoming writes. A minimum of 64MB will be trimmed.
+ * It also enables TRIM of the whole L2ARC device upon creation or
+ * addition to an existing pool or if the header of the device is
+ * invalid upon importing a pool or onlining a cache device. The
+ * default is 0, which disables TRIM on L2ARC altogether as it can
+ * put significant stress on the underlying storage devices. This
+ * will vary depending on how well the specific device handles
+ * these commands.
+ */
+unsigned long l2arc_trim_ahead = 0;
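As a rough worked example of the description above: with the default l2arc_write_max of 8 MB and l2arc_trim_ahead set to 100, a filled device would be trimmed ahead by twice the write size, 16 MB, which is then raised to the 64 MB minimum; only much larger write sizes or percentages push the trimmed region past that floor.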
+
+/*
+ * Performance tuning of L2ARC persistence:
+ *
+ * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
+ * an L2ARC device (either at pool import or later) will attempt
+ * to rebuild L2ARC buffer contents.
+ * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
+ * whether log blocks are written to the L2ARC device. If the L2ARC
+ * device is less than 1GB, the amount of data l2arc_evict()
+ * evicts is significant compared to the amount of restored L2ARC
+ * data. In this case do not write log blocks in L2ARC in order
+ * not to waste space.
+ */
+int l2arc_rebuild_enabled = B_TRUE;
+unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
+
+/* L2ARC persistence rebuild control routines. */
+void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
+static void l2arc_dev_rebuild_thread(void *arg);
+static int l2arc_rebuild(l2arc_dev_t *dev);
+
+/* L2ARC persistence read I/O routines. */
+static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
+static int l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ zio_t *this_io, zio_t **next_io);
+static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
+ const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
+static void l2arc_log_blk_fetch_abort(zio_t *zio);
+
+/* L2ARC persistence block restoration routines. */
+static void l2arc_log_blk_restore(l2arc_dev_t *dev,
+ const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
+static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
+ l2arc_dev_t *dev);
+
+/* L2ARC persistence write I/O routines. */
+static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb);
+
+/* L2ARC persistence auxiliary routines. */
+boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *lbp);
+static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
+ const arc_buf_hdr_t *ab);
+boolean_t l2arc_range_check_overlap(uint64_t bottom,
+ uint64_t top, uint64_t check);
+static void l2arc_blk_fetch_done(zio_t *zio);
+static inline uint64_t
+ l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
+
+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
+ */
+static uint64_t
+buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
+{
+ return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
+}
+
+#define HDR_EMPTY(hdr) \
+ ((hdr)->b_dva.dva_word[0] == 0 && \
+ (hdr)->b_dva.dva_word[1] == 0)
+
+#define HDR_EMPTY_OR_LOCKED(hdr) \
+ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
+
+#define HDR_EQUAL(spa, dva, birth, hdr) \
+ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
+ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
+ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
+
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+ hdr->b_dva.dva_word[0] = 0;
+ hdr->b_dva.dva_word[1] = 0;
+ hdr->b_birth = 0;
+}
+
+static arc_buf_hdr_t *
+buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
+{
+ const dva_t *dva = BP_IDENTITY(bp);
+ uint64_t birth = BP_PHYSICAL_BIRTH(bp);
+ uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *hdr;
+
+ mutex_enter(hash_lock);
+ for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
+ hdr = hdr->b_hash_next) {
+ if (HDR_EQUAL(spa, dva, birth, hdr)) {
+ *lockp = hash_lock;
+ return (hdr);
+ }
+ }
+ mutex_exit(hash_lock);
+ *lockp = NULL;
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ * If lockp == NULL, the caller is assumed to already hold the hash lock.
+ */
+static arc_buf_hdr_t *
+buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *fhdr;
+ uint32_t i;
+
+ ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
+ ASSERT(hdr->b_birth != 0);
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+ if (lockp != NULL) {
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ } else {
+ ASSERT(MUTEX_HELD(hash_lock));
+ }
+
+ for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
+ fhdr = fhdr->b_hash_next, i++) {
+ if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
+ return (fhdr);
+ }
+
+ hdr->b_hash_next = buf_hash_table.ht_table[idx];
+ buf_hash_table.ht_table[idx] = hdr;
+ arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+
+ /* collect some hash table performance data */
+ if (i > 0) {
+ ARCSTAT_BUMP(arcstat_hash_collisions);
+ if (i == 1)
+ ARCSTAT_BUMP(arcstat_hash_chains);
+
+ ARCSTAT_MAX(arcstat_hash_chain_max, i);
+ }
+
+ ARCSTAT_BUMP(arcstat_hash_elements);
+ ARCSTAT_MAXSTAT(arcstat_hash_elements);
+
+ return (NULL);
+}
+
+static void
+buf_hash_remove(arc_buf_hdr_t *hdr)
+{
+ arc_buf_hdr_t *fhdr, **hdrp;
+ uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
+
+ ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+
+ hdrp = &buf_hash_table.ht_table[idx];
+ while ((fhdr = *hdrp) != hdr) {
+ ASSERT3P(fhdr, !=, NULL);
+ hdrp = &fhdr->b_hash_next;
+ }
+ *hdrp = hdr->b_hash_next;
+ hdr->b_hash_next = NULL;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+
+ /* collect some hash table performance data */
+ ARCSTAT_BUMPDOWN(arcstat_hash_elements);
+
+ if (buf_hash_table.ht_table[idx] &&
+ buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+ ARCSTAT_BUMPDOWN(arcstat_hash_chains);
+}
+
+/*
+ * Global data structures and functions for the buf kmem cache.
+ */
+
+static kmem_cache_t *hdr_full_cache;
+static kmem_cache_t *hdr_full_crypt_cache;
+static kmem_cache_t *hdr_l2only_cache;
+static kmem_cache_t *buf_cache;
+
+static void
+buf_fini(void)
+{
+ int i;
+
+#if defined(_KERNEL)
+ /*
+ * Large allocations which do not require contiguous pages
+ * should be using vmem_free() in the linux kernel.
+ */
+ vmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
+ kmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
+ for (i = 0; i < BUF_LOCKS; i++)
+ mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ kmem_cache_destroy(hdr_full_cache);
+ kmem_cache_destroy(hdr_full_crypt_cache);
+ kmem_cache_destroy(hdr_l2only_cache);
+ kmem_cache_destroy(buf_cache);
+}
+
+/*
+ * Constructor callback - called when the cache is empty
+ * and a new buf is requested.
+ */
+/* ARGSUSED */
+static int
+hdr_full_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ bzero(hdr, HDR_FULL_SIZE);
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+ cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
+ zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
+ mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_link_init(&hdr->b_l1hdr.b_arc_node);
+ list_link_init(&hdr->b_l2hdr.b_l2node);
+ multilist_link_init(&hdr->b_l1hdr.b_arc_node);
+ arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ hdr_full_cons(vbuf, unused, kmflag);
+ bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr));
+ arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ bzero(hdr, HDR_L2ONLY_SIZE);
+ arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+buf_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_t));
+ mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/*
+ * Destructor callback - called when a cached buf is
+ * no longer required.
+ */
+/* ARGSUSED */
+static void
+hdr_full_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ ASSERT(HDR_EMPTY(hdr));
+ cv_destroy(&hdr->b_l1hdr.b_cv);
+ zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
+ mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
+hdr_full_crypt_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ hdr_full_dest(vbuf, unused);
+ arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
+hdr_l2only_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr __maybe_unused = vbuf;
+
+ ASSERT(HDR_EMPTY(hdr));
+ arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
+}
+
+/* ARGSUSED */
+static void
+buf_dest(void *vbuf, void *unused)
+{
+ arc_buf_t *buf = vbuf;
+
+ mutex_destroy(&buf->b_evict_lock);
+ arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
+}
+
+static void
+buf_init(void)
+{
+ uint64_t *ct = NULL;
+ uint64_t hsize = 1ULL << 12;
+ int i, j;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average block size of zfs_arc_average_blocksize (default 8K).
+ * By default, the table will take up
+ * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
+ */
+ while (hsize * zfs_arc_average_blocksize < arc_all_memory())
+ hsize <<= 1;
+retry:
+ buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL)
+ /*
+ * Large allocations which do not require contiguous pages
+ * should be using vmem_alloc() in the linux kernel
+ */
+ buf_hash_table.ht_table =
+ vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
+ buf_hash_table.ht_table =
+ kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
+ if (buf_hash_table.ht_table == NULL) {
+ ASSERT(hsize > (1ULL << 8));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
+ 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
+ hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
+ HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
+ NULL, NULL, NULL, 0);
+ hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
+ HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
+ NULL, NULL, 0);
+ buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
+ 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < 256; i++)
+ for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
+ *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
+
+ for (i = 0; i < BUF_LOCKS; i++) {
+ mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+}
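A worked example of the sizing loop in buf_init() above: with 64 GB of physical memory and the default 8 KB zfs_arc_average_blocksize, hsize doubles from 2^12 until hsize * 8 KB covers 64 GB, i.e. 2^23 buckets; at 8 bytes per pointer the table occupies 64 MB, matching the "1MB per GB" note in the comment, and the BUF_LOCKS (8192) hash locks are striped across those buckets.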
+
+#define ARC_MINTIME (hz>>4) /* 62 ms */
+
+/*
+ * This is the size that the buf occupies in memory. If the buf is compressed,
+ * it will correspond to the compressed size. You should use this method of
+ * getting the buf size unless you explicitly need the logical size.
+ */
+uint64_t
+arc_buf_size(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
+}
+
+uint64_t
+arc_buf_lsize(arc_buf_t *buf)
+{
+ return (HDR_GET_LSIZE(buf->b_hdr));
+}
+
+/*
+ * This function will return B_TRUE if the buffer is encrypted in memory.
+ * This buffer can be decrypted by calling arc_untransform().
+ */
+boolean_t
+arc_is_encrypted(arc_buf_t *buf)
+{
+ return (ARC_BUF_ENCRYPTED(buf) != 0);
+}
+
+/*
+ * Returns B_TRUE if the buffer represents data that has not had its MAC
+ * verified yet.
+ */
+boolean_t
+arc_is_unauthenticated(arc_buf_t *buf)
+{
+ return (HDR_NOAUTH(buf->b_hdr) != 0);
+}
+
+void
+arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
+ uint8_t *iv, uint8_t *mac)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(HDR_PROTECTED(hdr));
+
+ bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
+ bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
+ bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
+ *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
+ ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
+}
+
+/*
+ * Indicates how this buffer is compressed in memory. If it is not compressed
+ * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
+ * arc_untransform() as long as it is also unencrypted.
+ */
+enum zio_compress
+arc_get_compression(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
+}
+
+/*
+ * Return the compression algorithm used to store this data in the ARC. If ARC
+ * compression is enabled or this is an encrypted block, this will be the same
+ * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
+ */
+static inline enum zio_compress
+arc_hdr_get_compress(arc_buf_hdr_t *hdr)
+{
+ return (HDR_COMPRESSION_ENABLED(hdr) ?
+ HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
+}
+
+uint8_t
+arc_get_complevel(arc_buf_t *buf)
+{
+ return (buf->b_hdr->b_complevel);
+}
+
+static inline boolean_t
+arc_buf_is_shared(arc_buf_t *buf)
+{
+ boolean_t shared = (buf->b_data != NULL &&
+ buf->b_hdr->b_l1hdr.b_pabd != NULL &&
+ abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
+ buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
+ IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+ IMPLY(shared, ARC_BUF_SHARED(buf));
+ IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
+
+ /*
+ * It would be nice to assert arc_can_share() too, but the "hdr isn't
+ * already being shared" requirement prevents us from doing that.
+ */
+
+ return (shared);
+}
+
+/*
+ * Free the checksum associated with this header. If there is no checksum, this
+ * is a no-op.
+ */
+static inline void
+arc_cksum_free(arc_buf_hdr_t *hdr)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_l1hdr.b_freeze_cksum = NULL;
+ }
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+}
+
+/*
+ * Return true iff at least one of the bufs on hdr is not compressed.
+ * Encrypted buffers count as compressed.
+ */
+static boolean_t
+arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
+{
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
+
+ for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
+ if (!ARC_BUF_COMPRESSED(b)) {
+ return (B_TRUE);
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
+ * matches the checksum that is stored in the hdr. If there is no checksum,
+ * or if the buf is compressed, this is a no-op.
+ */
+static void
+arc_cksum_verify(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ zio_cksum_t zc;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ if (ARC_BUF_COMPRESSED(buf))
+ return;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+
+ if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ return;
+ }
+
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
+ if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
+ panic("buffer modified while frozen!");
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+}
+
+/*
+ * This function makes the assumption that data stored in the L2ARC
+ * will be transformed exactly as it is in the main pool. Because of
+ * this we can verify the checksum against the reading process's bp.
+ */
+static boolean_t
+arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
+{
+ ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
+ VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
+
+ /*
+ * Block pointers always store the checksum for the logical data.
+ * If the block pointer has the gang bit set, then the checksum
+ * it represents is for the reconstituted data and not for an
+ * individual gang member. The zio pipeline, however, must be able to
+ * determine the checksum of each of the gang constituents so it
+ * treats the checksum comparison differently than what we need
+ * for l2arc blocks. This prevents us from using the
+ * zio_checksum_error() interface directly. Instead we must call
+ * zio_checksum_error_impl() so that we can ensure the checksum is
+ * generated using the correct checksum algorithm and accounts for the
+ * logical I/O size and not just a gang fragment.
+ */
+ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
+ BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
+ zio->io_offset, NULL) == 0);
+}
+
+/*
+ * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
+ * checksum and attaches it to the buf's hdr so that we can ensure that the buf
+ * isn't modified later on. If buf is compressed or there is already a checksum
+ * on the hdr, this is a no-op (we only checksum uncompressed bufs).
+ */
+static void
+arc_cksum_compute(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ return;
+ }
+
+ ASSERT(!ARC_BUF_ENCRYPTED(buf));
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
+ hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+ KM_SLEEP);
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
+ hdr->b_l1hdr.b_freeze_cksum);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ arc_buf_watch(buf);
+}
+
+#ifndef _KERNEL
+void
+arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
+{
+ panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr);
+}
+#endif
+
+/* ARGSUSED */
+static void
+arc_buf_unwatch(arc_buf_t *buf)
+{
+#ifndef _KERNEL
+ if (arc_watch) {
+ ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
+ PROT_READ | PROT_WRITE));
+ }
+#endif
+}
+
+/* ARGSUSED */
+static void
+arc_buf_watch(arc_buf_t *buf)
+{
+#ifndef _KERNEL
+ if (arc_watch)
+ ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
+ PROT_READ));
+#endif
+}
+
+static arc_buf_contents_t
+arc_buf_type(arc_buf_hdr_t *hdr)
+{
+ arc_buf_contents_t type;
+ if (HDR_ISTYPE_METADATA(hdr)) {
+ type = ARC_BUFC_METADATA;
+ } else {
+ type = ARC_BUFC_DATA;
+ }
+ VERIFY3U(hdr->b_type, ==, type);
+ return (type);
+}
+
+boolean_t
+arc_is_metadata(arc_buf_t *buf)
+{
+ return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
+}
+
+static uint32_t
+arc_bufc_to_flags(arc_buf_contents_t type)
+{
+ switch (type) {
+ case ARC_BUFC_DATA:
+ /* metadata field is 0 if buffer contains normal data */
+ return (0);
+ case ARC_BUFC_METADATA:
+ return (ARC_FLAG_BUFC_METADATA);
+ default:
+ break;
+ }
+ panic("undefined ARC buffer type!");
+ return ((uint32_t)-1);
+}
+
+void
+arc_buf_thaw(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+
+ arc_cksum_verify(buf);
+
+ /*
+ * Compressed buffers do not manipulate the b_freeze_cksum.
+ */
+ if (ARC_BUF_COMPRESSED(buf))
+ return;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ arc_cksum_free(hdr);
+ arc_buf_unwatch(buf);
+}
+
+void
+arc_buf_freeze(arc_buf_t *buf)
+{
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ if (ARC_BUF_COMPRESSED(buf))
+ return;
+
+ ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
+ arc_cksum_compute(buf);
+}
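+
+/*
+ * Example (a minimal sketch): the thaw/freeze protocol as a consumer might
+ * use it around modifying an anonymous buffer it has already handed to the
+ * ARC. The modify_fn callback is illustrative only.
+ */
+#if 0
+static void
+example_modify_anon_buf(arc_buf_t *buf, void (*modify_fn)(void *, uint64_t))
+{
+ /* verify and drop the frozen checksum so the change is legal */
+ arc_buf_thaw(buf);
+
+ modify_fn(buf->b_data, arc_buf_size(buf));
+
+ /* recompute the ZFS_DEBUG_MODIFY checksum over the new contents */
+ arc_buf_freeze(buf);
+}
+#endif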
+
+/*
+ * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
+ * the following functions should be used to ensure that the flags are
+ * updated in a thread-safe way. When manipulating the flags either
+ * the hash_lock must be held or the hdr must be undiscoverable. This
+ * ensures that we're not racing with any other threads when updating
+ * the flags.
+ */
+static inline void
+arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+ hdr->b_flags |= flags;
+}
+
+static inline void
+arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+ hdr->b_flags &= ~flags;
+}
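+
+/*
+ * Example (a minimal sketch): callers of these (file-local) helpers either
+ * hold the header's hash lock, as below, or operate on a header that is
+ * still undiscoverable. The choice of ARC_FLAG_L2CACHE is illustrative only.
+ */
+#if 0
+static void
+example_mark_l2cacheable(arc_buf_hdr_t *hdr)
+{
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ mutex_enter(hash_lock);
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ mutex_exit(hash_lock);
+}
+#endif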
+
+/*
+ * Setting the compression bits in the arc_buf_hdr_t's b_flags is
+ * done in a special way since we have to clear and set bits
+ * at the same time. Consumers that wish to set the compression bits
+ * must use this function to ensure that the flags are updated in
+ * thread-safe manner.
+ */
+static void
+arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
+{
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ /*
+ * Holes and embedded blocks will always have a psize = 0, so
+ * we ignore the compression of the blkptr and mark them as
+ * uncompressed.
+ */
+ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ ASSERT(HDR_COMPRESSION_ENABLED(hdr));
+ }
+
+ HDR_SET_COMPRESS(hdr, cmp);
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
+}
+
+/*
+ * Looks for another buf on the same hdr which has the data decompressed, copies
+ * from it, and returns true. If no such buf exists, returns false.
+ */
+static boolean_t
+arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t copied = B_FALSE;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
+
+ for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
+ from = from->b_next) {
+ /* can't use our own data buffer */
+ if (from == buf) {
+ continue;
+ }
+
+ if (!ARC_BUF_COMPRESSED(from)) {
+ bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
+ copied = B_TRUE;
+ break;
+ }
+ }
+
+ /*
+ * There were no decompressed bufs, so there should not be a
+ * checksum on the hdr either.
+ */
+ if (zfs_flags & ZFS_DEBUG_MODIFY)
+ EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
+
+ return (copied);
+}
+
+/*
+ * Allocates an ARC buf header that's in an evicted & L2-cached state.
+ * This is used during l2arc reconstruction to make empty ARC buffers
+ * which circumvent the regular disk->arc->l2arc path and instead come
+ * into being in the reverse order, i.e. l2arc->arc.
+ */
+static arc_buf_hdr_t *
+arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
+ dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth,
+ enum zio_compress compress, uint8_t complevel, boolean_t protected,
+ boolean_t prefetch, arc_state_type_t arcs_state)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT(size != 0);
+ hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
+ hdr->b_birth = birth;
+ hdr->b_type = type;
+ hdr->b_flags = 0;
+ arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
+ HDR_SET_LSIZE(hdr, size);
+ HDR_SET_PSIZE(hdr, psize);
+ arc_hdr_set_compress(hdr, compress);
+ hdr->b_complevel = complevel;
+ if (protected)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
+ if (prefetch)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
+
+ hdr->b_dva = dva;
+
+ hdr->b_l2hdr.b_dev = dev;
+ hdr->b_l2hdr.b_daddr = daddr;
+ hdr->b_l2hdr.b_arcs_state = arcs_state;
+
+ return (hdr);
+}
+
+/*
+ * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
+ */
+static uint64_t
+arc_hdr_size(arc_buf_hdr_t *hdr)
+{
+ uint64_t size;
+
+ if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
+ HDR_GET_PSIZE(hdr) > 0) {
+ size = HDR_GET_PSIZE(hdr);
+ } else {
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
+ size = HDR_GET_LSIZE(hdr);
+ }
+ return (size);
+}
+
+static int
+arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
+{
+ int ret;
+ uint64_t csize;
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ void *tmpbuf = NULL;
+ abd_t *abd = hdr->b_l1hdr.b_pabd;
+
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+ ASSERT(HDR_AUTHENTICATED(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ /*
+ * The MAC is calculated on the compressed data that is stored on disk.
+ * However, if compressed arc is disabled we will only have the
+ * decompressed data available to us now. Compress it into a temporary
+ * abd so we can verify the MAC. The performance overhead of this will
+ * be relatively low, since most objects in an encrypted objset will
+ * be encrypted (instead of authenticated) anyway.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) {
+ tmpbuf = zio_buf_alloc(lsize);
+ abd = abd_get_from_buf(tmpbuf, lsize);
+ abd_take_ownership_of_buf(abd, B_TRUE);
+ csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel);
+ ASSERT3U(csize, <=, psize);
+ abd_zero_off(abd, csize, psize - csize);
+ }
+
+ /*
+ * Authentication is best effort. We authenticate whenever the key is
+ * available. If we succeed we clear ARC_FLAG_NOAUTH.
+ */
+ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, psize);
+ ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
+ psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
+ } else {
+ ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
+ hdr->b_crypt_hdr.b_mac);
+ }
+
+ if (ret == 0)
+ arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
+ else if (ret != ENOENT)
+ goto error;
+
+ if (tmpbuf != NULL)
+ abd_free(abd);
+
+ return (0);
+
+error:
+ if (tmpbuf != NULL)
+ abd_free(abd);
+
+ return (ret);
+}
+
+/*
+ * This function will take a header that only has raw encrypted data in
+ * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
+ * b_l1hdr.b_pabd. If designated in the header flags, this function will
+ * also decompress the data.
+ */
+static int
+arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
+{
+ int ret;
+ abd_t *cabd = NULL;
+ void *tmp = NULL;
+ boolean_t no_crypt = B_FALSE;
+ boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
+
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+ ASSERT(HDR_ENCRYPTED(hdr));
+
+ arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
+
+ ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
+ B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
+ hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
+ hdr->b_crypt_hdr.b_rabd, &no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (no_crypt) {
+ abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
+ HDR_GET_PSIZE(hdr));
+ }
+
+ /*
+ * If this header has disabled arc compression but the b_pabd is
+ * compressed after decrypting it, we need to decompress the newly
+ * decrypted data.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) {
+ /*
+ * We want to make sure that we are correctly honoring the
+ * zfs_abd_scatter_enabled setting, so we allocate an abd here
+ * and then loan a buffer from it, rather than allocating a
+ * linear buffer and wrapping it in an abd later.
+ */
+ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE);
+ tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
+
+ ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
+ HDR_GET_LSIZE(hdr), &hdr->b_complevel);
+ if (ret != 0) {
+ abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
+ goto error;
+ }
+
+ abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
+ arc_hdr_size(hdr), hdr);
+ hdr->b_l1hdr.b_pabd = cabd;
+ }
+
+ return (0);
+
+error:
+ arc_hdr_free_abd(hdr, B_FALSE);
+ if (cabd != NULL)
+ arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr);
+
+ return (ret);
+}
+
+/*
+ * This function is called during arc_buf_fill() to prepare the header's
+ * abd plaintext pointer for use. This involves authenticating protected
+ * data and decrypting encrypted data into the plaintext abd.
+ */
+static int
+arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
+ const zbookmark_phys_t *zb, boolean_t noauth)
+{
+ int ret;
+
+ ASSERT(HDR_PROTECTED(hdr));
+
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+
+ if (HDR_NOAUTH(hdr) && !noauth) {
+ /*
+ * The caller requested authenticated data but our data has
+ * not been authenticated yet. Verify the MAC now if we can.
+ */
+ ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
+ if (ret != 0)
+ goto error;
+ } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
+ /*
+ * If we only have the encrypted version of the data, but the
+ * unencrypted version was requested we take this opportunity
+ * to store the decrypted version in the header for future use.
+ */
+ ret = arc_hdr_decrypt(hdr, spa, zb);
+ if (ret != 0)
+ goto error;
+ }
+
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ return (0);
+
+error:
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ return (ret);
+}
+
+/*
+ * This function is used by the dbuf code to decrypt bonus buffers in place.
+ * The dbuf code itself doesn't have any locking for decrypting a shared dnode
+ * block, so we use the hash lock here to protect against concurrent calls to
+ * arc_buf_fill().
+ */
+static void
+arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(HDR_ENCRYPTED(hdr));
+ ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
+ arc_buf_size(buf));
+ buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+ hdr->b_crypt_hdr.b_ebufcnt -= 1;
+}
+
+/*
+ * Given a buf that has a data buffer attached to it, this function will
+ * efficiently fill the buf with data of the specified compression setting from
+ * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
+ * are already sharing a data buf, no copy is performed.
+ *
+ * If the buf is marked as compressed but uncompressed data was requested, this
+ * will allocate a new data buffer for the buf, remove that flag, and fill the
+ * buf with uncompressed data. You can't request a compressed buf on a hdr with
+ * uncompressed data, and (since we haven't added support for it yet) if you
+ * want compressed data your buf must already be marked as compressed and have
+ * the correct-sized data buffer.
+ */
+static int
+arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
+ arc_fill_flags_t flags)
+{
+ int error = 0;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t hdr_compressed =
+ (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
+ boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
+ boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
+ dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
+ kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
+ IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
+ IMPLY(encrypted, HDR_ENCRYPTED(hdr));
+ IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
+ IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
+ IMPLY(encrypted, !ARC_BUF_SHARED(buf));
+
+ /*
+ * If the caller wanted encrypted data we just need to copy it from
+ * b_rabd and potentially byteswap it. We won't be able to do any
+ * further transforms on it.
+ */
+ if (encrypted) {
+ ASSERT(HDR_HAS_RABD(hdr));
+ abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
+ HDR_GET_PSIZE(hdr));
+ goto byteswap;
+ }
+
+ /*
+ * Adjust encrypted and authenticated headers to accommodate
+ * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
+ * allowed to fail decryption due to keys not being loaded
+ * without being marked as an IO error.
+ */
+ if (HDR_PROTECTED(hdr)) {
+ error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
+ zb, !!(flags & ARC_FILL_NOAUTH));
+ if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
+ return (error);
+ } else if (error != 0) {
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+ return (error);
+ }
+ }
+
+ /*
+ * There is a special case here for dnode blocks which are
+ * decrypting their bonus buffers. These blocks may request to
+ * be decrypted in-place. This is necessary because there may
+ * be many dnodes pointing into this buffer and there is
+ * currently no method to synchronize replacing the backing
+ * b_data buffer and updating all of the pointers. Here we use
+ * the hash lock to ensure there are no races. If the need
+ * arises for other types to be decrypted in-place, they must
+ * add handling here as well.
+ */
+ if ((flags & ARC_FILL_IN_PLACE) != 0) {
+ ASSERT(!hdr_compressed);
+ ASSERT(!compressed);
+ ASSERT(!encrypted);
+
+ if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
+ ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
+
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+ arc_buf_untransform_in_place(buf, hash_lock);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ /* Compute the hdr's checksum if necessary */
+ arc_cksum_compute(buf);
+ }
+
+ return (0);
+ }
+
+ if (hdr_compressed == compressed) {
+ if (!arc_buf_is_shared(buf)) {
+ abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
+ arc_buf_size(buf));
+ }
+ } else {
+ ASSERT(hdr_compressed);
+ ASSERT(!compressed);
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
+
+ /*
+ * If the buf is sharing its data with the hdr, unlink it and
+ * allocate a new data buffer for the buf.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+
+ /* We need to give the buf its own b_data */
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+
+ /* Previously overhead was 0; just add new overhead */
+ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+ } else if (ARC_BUF_COMPRESSED(buf)) {
+ /* We need to reallocate the buf's b_data */
+ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
+ buf);
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+
+ /* We increased the size of b_data; update overhead */
+ ARCSTAT_INCR(arcstat_overhead_size,
+ HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
+ }
+
+ /*
+ * Regardless of the buf's previous compression settings, it
+ * should not be compressed at the end of this function.
+ */
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+
+ /*
+ * Try copying the data from another buf which already has a
+ * decompressed version. If that's not possible, it's time to
+ * bite the bullet and decompress the data from the hdr.
+ */
+ if (arc_buf_try_copy_decompressed_data(buf)) {
+ /* Skip byteswapping and checksumming (already done) */
+ return (0);
+ } else {
+ error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, buf->b_data,
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
+ &hdr->b_complevel);
+
+ /*
+ * Absent hardware errors or software bugs, this should
+ * be impossible, but log it anyway so we can debug it.
+ */
+ if (error != 0) {
+ zfs_dbgmsg(
+ "hdr %px, compress %d, psize %d, lsize %d",
+ hdr, arc_hdr_get_compress(hdr),
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+ return (SET_ERROR(EIO));
+ }
+ }
+ }
+
+byteswap:
+ /* Byteswap the buf's data if necessary */
+ if (bswap != DMU_BSWAP_NUMFUNCS) {
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
+ dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
+ }
+
+ /* Compute the hdr's checksum if necessary */
+ arc_cksum_compute(buf);
+
+ return (0);
+}
+
+/*
+ * If this function is being called to decrypt an encrypted buffer or verify an
+ * authenticated one, the key must be loaded and a mapping must be made
+ * available in the keystore via spa_keystore_create_mapping() or one of its
+ * callers.
+ */
+int
+arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
+ boolean_t in_place)
+{
+ int ret;
+ arc_fill_flags_t flags = 0;
+
+ if (in_place)
+ flags |= ARC_FILL_IN_PLACE;
+
+ ret = arc_buf_fill(buf, spa, zb, flags);
+ if (ret == ECKSUM) {
+ /*
+ * Convert authentication and decryption errors to EIO
+ * (and generate an ereport) before leaving the ARC.
+ */
+ ret = SET_ERROR(EIO);
+ spa_log_error(spa, zb);
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
+ spa, NULL, zb, NULL, 0);
+ }
+
+ return (ret);
+}
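+
+/*
+ * Example (a minimal sketch): a consumer that finds an in-memory buffer
+ * still encrypted can ask the ARC to untransform it, provided a keystore
+ * mapping for the dataset already exists (see spa_keystore_create_mapping()
+ * as noted above). Passing B_FALSE requests the normal, not in-place, fill.
+ * The helper name and error handling are illustrative only.
+ */
+#if 0
+static int
+example_decrypt_buf(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb)
+{
+ if (!arc_is_encrypted(buf))
+ return (0);
+
+ return (arc_untransform(buf, spa, zb, B_FALSE));
+}
+#endif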
+
+/*
+ * Increment the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
+static void
+arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (GHOST_STATE(state)) {
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ HDR_GET_LSIZE(hdr), hdr);
+ return;
+ }
+
+ ASSERT(!GHOST_STATE(state));
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ arc_hdr_size(hdr), hdr);
+ }
+ if (HDR_HAS_RABD(hdr)) {
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ HDR_GET_PSIZE(hdr), hdr);
+ }
+
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ if (arc_buf_is_shared(buf))
+ continue;
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ arc_buf_size(buf), buf);
+ }
+}
+
+/*
+ * Decrement the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
+static void
+arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (GHOST_STATE(state)) {
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ HDR_GET_LSIZE(hdr), hdr);
+ return;
+ }
+
+ ASSERT(!GHOST_STATE(state));
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ arc_hdr_size(hdr), hdr);
+ }
+ if (HDR_HAS_RABD(hdr)) {
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ HDR_GET_PSIZE(hdr), hdr);
+ }
+
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ if (arc_buf_is_shared(buf))
+ continue;
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ arc_buf_size(buf), buf);
+ }
+}
+
+/*
+ * Add a reference to this hdr indicating that someone is actively
+ * referencing that memory. When the refcount transitions from 0 to 1,
+ * we remove it from the respective arc_state_t list to indicate that
+ * it is not evictable.
+ */
+static void
+add_reference(arc_buf_hdr_t *hdr, void *tag)
+{
+ arc_state_t *state;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon);
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ }
+
+ state = hdr->b_l1hdr.b_state;
+
+ if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
+ (state != arc_anon)) {
+ /* We don't use the L2-only state list. */
+ if (state != arc_l2c_only) {
+ multilist_remove(state->arcs_list[arc_buf_type(hdr)],
+ hdr);
+ arc_evictable_space_decrement(hdr, state);
+ }
+ /* remove the prefetch flag if we get a reference */
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+}
+
+/*
+ * Remove a reference from this hdr. When the reference transitions from
+ * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
+ * list making it eligible for eviction.
+ */
+static int
+remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
+{
+ int cnt;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
+ ASSERT(!GHOST_STATE(state));
+
+ /*
+ * arc_l2c_only counts as a ghost state so we don't need to explicitly
+ * check to prevent usage of the arc_l2c_only list.
+ */
+ if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
+ (state != arc_anon)) {
+ multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
+ arc_evictable_space_increment(hdr, state);
+ }
+ return (cnt);
+}
+
+/*
+ * Returns detailed information about a specific arc buffer. When the
+ * state_index argument is set the function will calculate the arc header
+ * list position for its arc state. Since this requires a linear traversal
+ * callers are strongly encourage not to do this. However, it can be helpful
+ * for targeted analysis so the functionality is provided.
+ */
+void
+arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
+{
+ arc_buf_hdr_t *hdr = ab->b_hdr;
+ l1arc_buf_hdr_t *l1hdr = NULL;
+ l2arc_buf_hdr_t *l2hdr = NULL;
+ arc_state_t *state = NULL;
+
+ memset(abi, 0, sizeof (arc_buf_info_t));
+
+ if (hdr == NULL)
+ return;
+
+ abi->abi_flags = hdr->b_flags;
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ l1hdr = &hdr->b_l1hdr;
+ state = l1hdr->b_state;
+ }
+ if (HDR_HAS_L2HDR(hdr))
+ l2hdr = &hdr->b_l2hdr;
+
+ if (l1hdr) {
+ abi->abi_bufcnt = l1hdr->b_bufcnt;
+ abi->abi_access = l1hdr->b_arc_access;
+ abi->abi_mru_hits = l1hdr->b_mru_hits;
+ abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
+ abi->abi_mfu_hits = l1hdr->b_mfu_hits;
+ abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
+ abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt);
+ }
+
+ if (l2hdr) {
+ abi->abi_l2arc_dattr = l2hdr->b_daddr;
+ abi->abi_l2arc_hits = l2hdr->b_hits;
+ }
+
+ abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
+ abi->abi_state_contents = arc_buf_type(hdr);
+ abi->abi_size = arc_hdr_size(hdr);
+}
+
+/*
+ * Move the supplied buffer to the indicated state. The hash lock
+ * for the buffer must be held by the caller.
+ */
+static void
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
+ kmutex_t *hash_lock)
+{
+ arc_state_t *old_state;
+ int64_t refcnt;
+ uint32_t bufcnt;
+ boolean_t update_old, update_new;
+ arc_buf_contents_t buftype = arc_buf_type(hdr);
+
+ /*
+ * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
+ * in arc_read() when bringing a buffer out of the L2ARC. However, the
+ * L1 hdr doesn't always exist when we change state to arc_anon before
+ * destroying a header, in which case reallocating to add the L1 hdr is
+ * pointless.
+ */
+ if (HDR_HAS_L1HDR(hdr)) {
+ old_state = hdr->b_l1hdr.b_state;
+ refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
+ bufcnt = hdr->b_l1hdr.b_bufcnt;
+ update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
+ } else {
+ old_state = arc_l2c_only;
+ refcnt = 0;
+ bufcnt = 0;
+ update_old = B_FALSE;
+ }
+ update_new = update_old;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT3P(new_state, !=, old_state);
+ ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
+ ASSERT(old_state != arc_anon || bufcnt <= 1);
+
+ /*
+ * If this buffer is evictable, transfer it from the
+ * old state list to the new state list.
+ */
+ if (refcnt == 0) {
+ if (old_state != arc_anon && old_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ multilist_remove(old_state->arcs_list[buftype], hdr);
+
+ if (GHOST_STATE(old_state)) {
+ ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ update_old = B_TRUE;
+ }
+ arc_evictable_space_decrement(hdr, old_state);
+ }
+ if (new_state != arc_anon && new_state != arc_l2c_only) {
+ /*
+ * An L1 header always exists here, since if we're
+ * moving to some L1-cached state (i.e. not l2c_only or
+ * anonymous), we realloc the header to add an L1hdr
+ * beforehand.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ multilist_insert(new_state->arcs_list[buftype], hdr);
+
+ if (GHOST_STATE(new_state)) {
+ ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ update_new = B_TRUE;
+ }
+ arc_evictable_space_increment(hdr, new_state);
+ }
+ }
+
+ ASSERT(!HDR_EMPTY(hdr));
+ if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
+
+ /* adjust state sizes (ignore arc_l2c_only) */
+
+ if (update_new && new_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(new_state)) {
+ ASSERT0(bufcnt);
+
+ /*
+ * When moving a header to a ghost state, we first
+ * remove all arc buffers. Thus, we'll have a
+ * bufcnt of zero, and no arc buffer to use for
+ * the reference. As a result, we use the arc
+ * header pointer for the reference.
+ */
+ (void) zfs_refcount_add_many(&new_state->arcs_size,
+ HDR_GET_LSIZE(hdr), hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ } else {
+ uint32_t buffers = 0;
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ ASSERT3U(bufcnt, !=, 0);
+ buffers++;
+
+ /*
+ * When the arc_buf_t is sharing the data
+ * block with the hdr, the owner of the
+ * reference belongs to the hdr. Only
+ * add to the refcount if the arc_buf_t is
+ * not shared.
+ */
+ if (arc_buf_is_shared(buf))
+ continue;
+
+ (void) zfs_refcount_add_many(
+ &new_state->arcs_size,
+ arc_buf_size(buf), buf);
+ }
+ ASSERT3U(bufcnt, ==, buffers);
+
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) zfs_refcount_add_many(
+ &new_state->arcs_size,
+ arc_hdr_size(hdr), hdr);
+ }
+
+ if (HDR_HAS_RABD(hdr)) {
+ (void) zfs_refcount_add_many(
+ &new_state->arcs_size,
+ HDR_GET_PSIZE(hdr), hdr);
+ }
+ }
+ }
+
+ if (update_old && old_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(old_state)) {
+ ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+
+ /*
+ * When moving a header off of a ghost state,
+ * the header will not contain any arc buffers.
+ * We use the arc header pointer for the reference
+ * which is exactly what we did when we put the
+ * header on the ghost state.
+ */
+
+ (void) zfs_refcount_remove_many(&old_state->arcs_size,
+ HDR_GET_LSIZE(hdr), hdr);
+ } else {
+ uint32_t buffers = 0;
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ ASSERT3U(bufcnt, !=, 0);
+ buffers++;
+
+ /*
+ * When the arc_buf_t is sharing the data
+ * block with the hdr, the owner of the
+ * reference belongs to the hdr. Only
+ * add to the refcount if the arc_buf_t is
+ * not shared.
+ */
+ if (arc_buf_is_shared(buf))
+ continue;
+
+ (void) zfs_refcount_remove_many(
+ &old_state->arcs_size, arc_buf_size(buf),
+ buf);
+ }
+ ASSERT3U(bufcnt, ==, buffers);
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
+
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) zfs_refcount_remove_many(
+ &old_state->arcs_size, arc_hdr_size(hdr),
+ hdr);
+ }
+
+ if (HDR_HAS_RABD(hdr)) {
+ (void) zfs_refcount_remove_many(
+ &old_state->arcs_size, HDR_GET_PSIZE(hdr),
+ hdr);
+ }
+ }
+ }
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ hdr->b_l1hdr.b_state = new_state;
+
+ if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ }
+
+ /*
+ * L2 headers should never be on the L2 state list since they don't
+ * have L1 headers allocated.
+ */
+ ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+ multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
+}
+
+void
+arc_space_consume(uint64_t space, arc_space_type_t type)
+{
+ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
+
+ switch (type) {
+ default:
+ break;
+ case ARC_SPACE_DATA:
+ aggsum_add(&astat_data_size, space);
+ break;
+ case ARC_SPACE_META:
+ aggsum_add(&astat_metadata_size, space);
+ break;
+ case ARC_SPACE_BONUS:
+ aggsum_add(&astat_bonus_size, space);
+ break;
+ case ARC_SPACE_DNODE:
+ aggsum_add(&astat_dnode_size, space);
+ break;
+ case ARC_SPACE_DBUF:
+ aggsum_add(&astat_dbuf_size, space);
+ break;
+ case ARC_SPACE_HDRS:
+ aggsum_add(&astat_hdr_size, space);
+ break;
+ case ARC_SPACE_L2HDRS:
+ aggsum_add(&astat_l2_hdr_size, space);
+ break;
+ case ARC_SPACE_ABD_CHUNK_WASTE:
+ /*
+ * Note: this includes space wasted by all scatter ABD's, not
+ * just those allocated by the ARC. But the vast majority of
+ * scatter ABD's come from the ARC, because other users are
+ * very short-lived.
+ */
+ aggsum_add(&astat_abd_chunk_waste_size, space);
+ break;
+ }
+
+ if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE)
+ aggsum_add(&arc_meta_used, space);
+
+ aggsum_add(&arc_size, space);
+}
+
+void
+arc_space_return(uint64_t space, arc_space_type_t type)
+{
+ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
+
+ switch (type) {
+ default:
+ break;
+ case ARC_SPACE_DATA:
+ aggsum_add(&astat_data_size, -space);
+ break;
+ case ARC_SPACE_META:
+ aggsum_add(&astat_metadata_size, -space);
+ break;
+ case ARC_SPACE_BONUS:
+ aggsum_add(&astat_bonus_size, -space);
+ break;
+ case ARC_SPACE_DNODE:
+ aggsum_add(&astat_dnode_size, -space);
+ break;
+ case ARC_SPACE_DBUF:
+ aggsum_add(&astat_dbuf_size, -space);
+ break;
+ case ARC_SPACE_HDRS:
+ aggsum_add(&astat_hdr_size, -space);
+ break;
+ case ARC_SPACE_L2HDRS:
+ aggsum_add(&astat_l2_hdr_size, -space);
+ break;
+ case ARC_SPACE_ABD_CHUNK_WASTE:
+ aggsum_add(&astat_abd_chunk_waste_size, -space);
+ break;
+ }
+
+ if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) {
+ ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
+ /*
+ * We use the upper bound here rather than the precise value
+ * because the arc_meta_max value doesn't need to be
+ * precise. It's only consumed by humans via arcstats.
+ */
+ if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
+ arc_meta_max = aggsum_upper_bound(&arc_meta_used);
+ aggsum_add(&arc_meta_used, -space);
+ }
+
+ ASSERT(aggsum_compare(&arc_size, space) >= 0);
+ aggsum_add(&arc_size, -space);
+}
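+
+/*
+ * Example (a minimal sketch): arc_space_consume() and arc_space_return() are
+ * meant to be called in balanced pairs with the same type around the lifetime
+ * of an externally accounted allocation. The 512-byte size and the
+ * ARC_SPACE_DNODE type below are illustrative only.
+ */
+#if 0
+ arc_space_consume(512, ARC_SPACE_DNODE);
+ /* ... the accounted object lives here ... */
+ arc_space_return(512, ARC_SPACE_DNODE);
+#endif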
+
+/*
+ * Given a hdr and a buf, returns whether that buf can share its b_data buffer
+ * with the hdr's b_pabd.
+ */
+static boolean_t
+arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ /*
+ * The criteria for sharing a hdr's data are:
+ * 1. the buffer is not encrypted
+ * 2. the hdr's compression matches the buf's compression
+ * 3. the hdr doesn't need to be byteswapped
+ * 4. the hdr isn't already being shared
+ * 5. the buf is either compressed or it is the last buf in the hdr list
+ *
+ * Criterion #5 maintains the invariant that shared uncompressed
+ * bufs must be the final buf in the hdr's b_buf list. Reading this, you
+ * might ask, "if a compressed buf is allocated first, won't that be the
+ * last thing in the list?", but in that case it's impossible to create
+ * a shared uncompressed buf anyway (because the hdr must be compressed
+ * to have the compressed buf). You might also think that #3 is
+ * sufficient to make this guarantee, however it's possible
+ * (specifically in the rare L2ARC write race mentioned in
+ * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
+ * is shareable, but wasn't at the time of its allocation. Rather than
+ * allow a new shared uncompressed buf to be created and then shuffle
+ * the list around to make it the last element, this simply disallows
+ * sharing if the new buf isn't the first to be added.
+ */
+ ASSERT3P(buf->b_hdr, ==, hdr);
+ boolean_t hdr_compressed =
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF;
+ boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
+ return (!ARC_BUF_ENCRYPTED(buf) &&
+ buf_compressed == hdr_compressed &&
+ hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
+ !HDR_SHARED_DATA(hdr) &&
+ (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
+}
+
+/*
+ * Allocate a buf for this hdr. If you care about the data that's in the hdr,
+ * or if you want a compressed buffer, pass those flags in. Returns 0 if the
+ * copy was made successfully, or an error code otherwise.
+ */
+static int
+arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
+ void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth,
+ boolean_t fill, arc_buf_t **ret)
+{
+ arc_buf_t *buf;
+ arc_fill_flags_t flags = ARC_FILL_LOCKED;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+ VERIFY(hdr->b_type == ARC_BUFC_DATA ||
+ hdr->b_type == ARC_BUFC_METADATA);
+ ASSERT3P(ret, !=, NULL);
+ ASSERT3P(*ret, ==, NULL);
+ IMPLY(encrypted, compressed);
+
+ hdr->b_l1hdr.b_mru_hits = 0;
+ hdr->b_l1hdr.b_mru_ghost_hits = 0;
+ hdr->b_l1hdr.b_mfu_hits = 0;
+ hdr->b_l1hdr.b_mfu_ghost_hits = 0;
+ hdr->b_l1hdr.b_l2_hits = 0;
+
+ buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_next = hdr->b_l1hdr.b_buf;
+ buf->b_flags = 0;
+
+ add_reference(hdr, tag);
+
+ /*
+ * We're about to change the hdr's b_flags. We must either
+ * hold the hash_lock or be undiscoverable.
+ */
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ /*
+ * Only honor requests for compressed bufs if the hdr is actually
+ * compressed. This must be overridden if the buffer is encrypted since
+ * encrypted buffers cannot be decompressed.
+ */
+ if (encrypted) {
+ buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
+ buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
+ flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
+ } else if (compressed &&
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
+ buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
+ flags |= ARC_FILL_COMPRESSED;
+ }
+
+ if (noauth) {
+ ASSERT0(encrypted);
+ flags |= ARC_FILL_NOAUTH;
+ }
+
+ /*
+ * If the hdr's data can be shared then we share the data buffer and
+ * set the appropriate bit in the hdr's b_flags to indicate the hdr is
+ * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
+ * buffer to store the buf's data.
+ *
+ * There are two additional restrictions here because we're sharing
+ * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
+ * actively involved in an L2ARC write, because if this buf is used by
+ * an arc_write() then the hdr's data buffer will be released when the
+ * write completes, even though the L2ARC write might still be using it.
+ * Second, the hdr's ABD must be linear so that the buf's user doesn't
+ * need to be ABD-aware. It must be allocated via
+ * zio_[data_]buf_alloc(), not as a page, because we need to be able
+ * to abd_release_ownership_of_buf(), which isn't allowed on "linear
+ * page" buffers because the ABD code needs to handle freeing them
+ * specially.
+ */
+ boolean_t can_share = arc_can_share(hdr, buf) &&
+ !HDR_L2_WRITING(hdr) &&
+ hdr->b_l1hdr.b_pabd != NULL &&
+ abd_is_linear(hdr->b_l1hdr.b_pabd) &&
+ !abd_is_linear_page(hdr->b_l1hdr.b_pabd);
+
+ /* Set up b_data and sharing */
+ if (can_share) {
+ buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
+ arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+ } else {
+ buf->b_data =
+ arc_get_data_buf(hdr, arc_buf_size(buf), buf);
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
+ }
+ VERIFY3P(buf->b_data, !=, NULL);
+
+ hdr->b_l1hdr.b_buf = buf;
+ hdr->b_l1hdr.b_bufcnt += 1;
+ if (encrypted)
+ hdr->b_crypt_hdr.b_ebufcnt += 1;
+
+ /*
+ * If the user wants the data from the hdr, we need to either copy or
+ * decompress the data.
+ */
+ if (fill) {
+ ASSERT3P(zb, !=, NULL);
+ return (arc_buf_fill(buf, spa, zb, flags));
+ }
+
+ return (0);
+}
+
+static char *arc_onloan_tag = "onloan";
+
+static inline void
+arc_loaned_bytes_update(int64_t delta)
+{
+ atomic_add_64(&arc_loaned_bytes, delta);
+
+ /* assert that it did not wrap around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+}
+
+/*
+ * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
+ * flight data by arc_tempreserve_space() until they are "returned". Loaned
+ * buffers must be returned to the arc before they can be used by the DMU or
+ * freed.
+ */
+arc_buf_t *
+arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
+{
+ arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
+ is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+
+ return (buf);
+}
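+
+/*
+ * Example (a minimal sketch): the loan protocol described above. The buffer
+ * is filled by the caller and then either handed back via arc_return_buf()
+ * (shown below, transferring the hold from the loan tag to the caller's tag)
+ * or attached to a dbuf. The 16K size and tag usage are illustrative only.
+ */
+#if 0
+static arc_buf_t *
+example_loan_fill_return(spa_t *spa, void *tag)
+{
+ arc_buf_t *buf = arc_loan_buf(spa, B_FALSE, 16 * 1024);
+
+ /* ... fill buf->b_data with up to arc_buf_size(buf) bytes ... */
+
+ arc_return_buf(buf, tag);
+ return (buf);
+}
+#endif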
+
+arc_buf_t *
+arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type, uint8_t complevel)
+{
+ arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
+ psize, lsize, compression_type, complevel);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+
+ return (buf);
+}
+
+arc_buf_t *
+arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
+ const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
+ dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type, uint8_t complevel)
+{
+ arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
+ byteorder, salt, iv, mac, ot, psize, lsize, compression_type,
+ complevel);
+
+ atomic_add_64(&arc_loaned_bytes, psize);
+ return (buf);
+}
+
+/*
+ * Return a loaned arc buffer to the arc.
+ */
+void
+arc_return_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
+ (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+
+ arc_loaned_bytes_update(-arc_buf_size(buf));
+}
+
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+ (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+}
+
+static void
+l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
+{
+ l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
+
+ df->l2df_abd = abd;
+ df->l2df_size = size;
+ df->l2df_type = type;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+static void
+arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ size, hdr);
+ }
+ (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_space_return(size, ARC_SPACE_DATA);
+ }
+
+ if (free_rdata) {
+ l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type);
+ } else {
+ l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
+ }
+}
+
+/*
+ * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
+ * data buffer, we transfer the refcount ownership to the hdr and update
+ * the appropriate kstats.
+ */
+static void
+arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ ASSERT(arc_can_share(hdr, buf));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!ARC_BUF_ENCRYPTED(buf));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ /*
+ * Start sharing the data buffer. We transfer the
+ * refcount ownership to the hdr since it always owns
+ * the refcount whenever an arc_buf_t is shared.
+ */
+ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
+ arc_hdr_size(hdr), buf, hdr);
+ hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
+ abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
+ HDR_ISTYPE_METADATA(hdr));
+ arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
+
+ /*
+ * Since we've transferred ownership to the hdr we need
+ * to increment its compressed and uncompressed kstats and
+ * decrement the overhead size.
+ */
+ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
+}
+
+static void
+arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ ASSERT(arc_buf_is_shared(buf));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ /*
+ * We are no longer sharing this buffer so we need
+ * to transfer its ownership to the rightful owner.
+ */
+ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
+ arc_hdr_size(hdr), hdr, buf);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
+ abd_free(hdr->b_l1hdr.b_pabd);
+ hdr->b_l1hdr.b_pabd = NULL;
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
+
+ /*
+ * Since the buffer is no longer shared between
+ * the arc buf and the hdr, count it as overhead.
+ */
+ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
+}
+
+/*
+ * Remove an arc_buf_t from the hdr's buf list and return the last
+ * arc_buf_t on the list. If no buffers remain on the list then return
+ * NULL.
+ */
+static arc_buf_t *
+arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
+ arc_buf_t *lastbuf = NULL;
+
+ /*
+ * Remove the buf from the hdr list and locate the last
+ * remaining buffer on the list.
+ */
+ while (*bufp != NULL) {
+ if (*bufp == buf)
+ *bufp = buf->b_next;
+
+ /*
+ * If we've removed a buffer in the middle of
+ * the list then update lastbuf and advance
+ * bufp.
+ */
+ if (*bufp != NULL) {
+ lastbuf = *bufp;
+ bufp = &(*bufp)->b_next;
+ }
+ }
+ buf->b_next = NULL;
+ ASSERT3P(lastbuf, !=, buf);
+ IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
+ IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
+ IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
+
+ return (lastbuf);
+}
+
+/*
+ * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
+ * list and free it.
+ */
+static void
+arc_buf_destroy_impl(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ /*
+ * Free up the data associated with the buf but only if we're not
+ * sharing this with the hdr. If we are sharing it with the hdr, the
+ * hdr is responsible for doing the free.
+ */
+ if (buf->b_data != NULL) {
+ /*
+ * We're about to change the hdr's b_flags. We must either
+ * hold the hash_lock or be undiscoverable.
+ */
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ arc_cksum_verify(buf);
+ arc_buf_unwatch(buf);
+
+ if (arc_buf_is_shared(buf)) {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ } else {
+ uint64_t size = arc_buf_size(buf);
+ arc_free_data_buf(hdr, buf->b_data, size, buf);
+ ARCSTAT_INCR(arcstat_overhead_size, -size);
+ }
+ buf->b_data = NULL;
+
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+ hdr->b_l1hdr.b_bufcnt -= 1;
+
+ if (ARC_BUF_ENCRYPTED(buf)) {
+ hdr->b_crypt_hdr.b_ebufcnt -= 1;
+
+ /*
+ * If we have no more encrypted buffers and we've
+ * already gotten a copy of the decrypted data we can
+ * free b_rabd to save some space.
+ */
+ if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
+ HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
+ !HDR_IO_IN_PROGRESS(hdr)) {
+ arc_hdr_free_abd(hdr, B_TRUE);
+ }
+ }
+ }
+
+ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
+
+ if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
+ /*
+ * If the current arc_buf_t is sharing its data buffer with the
+ * hdr, then reassign the hdr's b_pabd to share it with the new
+ * buffer at the end of the list. The shared buffer is always
+ * the last one on the hdr's buffer list.
+ *
+ * There is an equivalent case for compressed bufs, but since
+ * they aren't guaranteed to be the last buf in the list and
+ * that is an exceedingly rare case, we just allow that space to be
+ * wasted temporarily. We must also be careful not to share
+ * encrypted buffers, since they cannot be shared.
+ */
+ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
+ /* Only one buf can be shared at once */
+ VERIFY(!arc_buf_is_shared(lastbuf));
+ /* hdr is uncompressed so can't have compressed buf */
+ VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
+
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ arc_hdr_free_abd(hdr, B_FALSE);
+
+ /*
+ * We must setup a new shared block between the
+ * last buffer and the hdr. The data would have
+ * been allocated by the arc buf so we need to transfer
+ * ownership to the hdr since it's now being shared.
+ */
+ arc_share_buf(hdr, lastbuf);
+ }
+ } else if (HDR_SHARED_DATA(hdr)) {
+ /*
+ * Uncompressed shared buffers are always at the end
+ * of the list. Compressed buffers don't have the
+ * same requirements. This makes it hard to
+ * simply assert that the lastbuf is shared so
+ * we rely on the hdr's compression flags to determine
+ * if we have a compressed, shared buffer.
+ */
+ ASSERT3P(lastbuf, !=, NULL);
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
+ }
+
+ /*
+ * Free the checksum if we're removing the last uncompressed buf from
+ * this hdr.
+ */
+ if (!arc_hdr_has_uncompressed_buf(hdr)) {
+ arc_cksum_free(hdr);
+ }
+
+ /* clean up the buf */
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+}
+
+static void
+arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
+{
+ uint64_t size;
+ boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
+ boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0);
+
+ ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
+ IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
+
+ if (alloc_rdata) {
+ size = HDR_GET_PSIZE(hdr);
+ ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
+ hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
+ do_adapt);
+ ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
+ ARCSTAT_INCR(arcstat_raw_size, size);
+ } else {
+ size = arc_hdr_size(hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
+ do_adapt);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ }
+
+ ARCSTAT_INCR(arcstat_compressed_size, size);
+ ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+}
+
+static void
+arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
+{
+ uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
+ IMPLY(free_rdata, HDR_HAS_RABD(hdr));
+
+ /*
+ * If the hdr is currently being written to the l2arc then
+ * we defer freeing the data by adding it to the l2arc_free_on_write
+ * list. The l2arc will free the data once it's finished
+ * writing it to the l2arc device.
+ */
+ if (HDR_L2_WRITING(hdr)) {
+ arc_hdr_free_on_write(hdr, free_rdata);
+ ARCSTAT_BUMP(arcstat_l2_free_on_write);
+ } else if (free_rdata) {
+ arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
+ } else {
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr);
+ }
+
+ if (free_rdata) {
+ hdr->b_crypt_hdr.b_rabd = NULL;
+ ARCSTAT_INCR(arcstat_raw_size, -size);
+ } else {
+ hdr->b_l1hdr.b_pabd = NULL;
+ }
+
+ if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+
+ ARCSTAT_INCR(arcstat_compressed_size, -size);
+ ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+}
+
+static arc_buf_hdr_t *
+arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
+ boolean_t protected, enum zio_compress compression_type, uint8_t complevel,
+ arc_buf_contents_t type, boolean_t alloc_rdata)
+{
+ arc_buf_hdr_t *hdr;
+ int flags = ARC_HDR_DO_ADAPT;
+
+ VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
+ if (protected) {
+ hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
+ } else {
+ hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
+ }
+ flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0;
+
+ ASSERT(HDR_EMPTY(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+ HDR_SET_PSIZE(hdr, psize);
+ HDR_SET_LSIZE(hdr, lsize);
+ hdr->b_spa = spa;
+ hdr->b_type = type;
+ hdr->b_flags = 0;
+ arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
+ arc_hdr_set_compress(hdr, compression_type);
+ hdr->b_complevel = complevel;
+ if (protected)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
+
+ hdr->b_l1hdr.b_state = arc_anon;
+ hdr->b_l1hdr.b_arc_access = 0;
+ hdr->b_l1hdr.b_bufcnt = 0;
+ hdr->b_l1hdr.b_buf = NULL;
+
+ /*
+ * Allocate the hdr's buffer. This will contain either
+ * the compressed or uncompressed data depending on the block
+ * it references and compressed arc enablement.
+ */
+ arc_hdr_alloc_abd(hdr, flags);
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+
+ return (hdr);
+}
+
+/*
+ * Transition between the two allocation states for the arc_buf_hdr struct.
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ arc_buf_hdr_t *nhdr;
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+ ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+ (old == hdr_l2only_cache && new == hdr_full_cache));
+
+ /*
+ * If the caller wanted a new full header and the header is to be
+ * encrypted we will actually allocate the header from the full crypt
+ * cache instead. The same applies to freeing from the old cache.
+ */
+ if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
+ new = hdr_full_crypt_cache;
+ if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
+ old = hdr_full_crypt_cache;
+
+ nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+ buf_hash_remove(hdr);
+
+ bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+
+ if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
+ arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
+ /*
+ * arc_access and arc_change_state need to be aware that a
+ * header has just come out of L2ARC, so we set its state to
+ * l2c_only even though it's about to change.
+ */
+ nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+		/* Verify previous threads set these to NULL before freeing */
+ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ } else {
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ /*
+		 * If we've reached here, we must have been called from
+ * arc_evict_hdr(), as such we should have already been
+ * removed from any ghost list we were previously on
+ * (which protects us from racing with arc_evict_state),
+ * thus no locking is needed during this check.
+ */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ /*
+ * A buffer must not be moved into the arc_l2c_only
+		 * state if it hasn't finished being written out to the
+		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
+		 * might be accessed even though it has been removed.
+ */
+ VERIFY(!HDR_L2_WRITING(hdr));
+ VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+
+ arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
+ }
+ /*
+ * The header has been reallocated so we need to re-insert it into any
+ * lists it was on.
+ */
+ (void) buf_hash_insert(nhdr, NULL);
+
+ ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * We must place the realloc'ed header back into the list at
+ * the same spot. Otherwise, if it's placed earlier in the list,
+ * l2arc_write_buffers() could find it during the function's
+ * write phase, and try to write it out to the l2arc.
+ */
+ list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * Since we're using the pointer address as the tag when
+ * incrementing and decrementing the l2ad_alloc refcount, we
+ * must remove the old pointer (that we're about to destroy) and
+ * add the new pointer to the refcount. Otherwise we'd remove
+ * the wrong pointer address when calling arc_hdr_destroy() later.
+ */
+
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(nhdr), nhdr);
+
+ buf_discard_identity(hdr);
+ kmem_cache_free(old, hdr);
+
+ return (nhdr);
+}
+
+/*
+ * This function allows an L1 header to be reallocated as a crypt
+ * header and vice versa. If we are going to a crypt header, the
+ * new fields will be zeroed out.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
+{
+ arc_buf_hdr_t *nhdr;
+ arc_buf_t *buf;
+ kmem_cache_t *ncache, *ocache;
+ unsigned nsize, osize;
+
+ /*
+ * This function requires that hdr is in the arc_anon state.
+ * Therefore it won't have any L2ARC data for us to worry
+ * about copying.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!HDR_HAS_L2HDR(hdr));
+ ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+
+ if (need_crypt) {
+ ncache = hdr_full_crypt_cache;
+ nsize = sizeof (hdr->b_crypt_hdr);
+ ocache = hdr_full_cache;
+ osize = HDR_FULL_SIZE;
+ } else {
+ ncache = hdr_full_cache;
+ nsize = HDR_FULL_SIZE;
+ ocache = hdr_full_crypt_cache;
+ osize = sizeof (hdr->b_crypt_hdr);
+ }
+
+ nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE);
+
+ /*
+ * Copy all members that aren't locks or condvars to the new header.
+ * No lists are pointing to us (as we asserted above), so we don't
+ * need to worry about the list nodes.
+ */
+ nhdr->b_dva = hdr->b_dva;
+ nhdr->b_birth = hdr->b_birth;
+ nhdr->b_type = hdr->b_type;
+ nhdr->b_flags = hdr->b_flags;
+ nhdr->b_psize = hdr->b_psize;
+ nhdr->b_lsize = hdr->b_lsize;
+ nhdr->b_spa = hdr->b_spa;
+ nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
+ nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
+ nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
+ nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
+ nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
+ nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits;
+ nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
+ nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
+ nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
+ nhdr->b_l1hdr.b_l2_hits = hdr->b_l1hdr.b_l2_hits;
+ nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
+ nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
+
+ /*
+ * This zfs_refcount_add() exists only to ensure that the individual
+ * arc buffers always point to a header that is referenced, avoiding
+ * a small race condition that could trigger ASSERTs.
+ */
+ (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
+ nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
+ for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) {
+ mutex_enter(&buf->b_evict_lock);
+ buf->b_hdr = nhdr;
+ mutex_exit(&buf->b_evict_lock);
+ }
+
+ zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
+ (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
+ ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
+
+ if (need_crypt) {
+ arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED);
+ } else {
+ arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED);
+ }
+
+ /* unset all members of the original hdr */
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_type = ARC_BUFC_INVALID;
+ hdr->b_flags = 0;
+ hdr->b_psize = 0;
+ hdr->b_lsize = 0;
+ hdr->b_spa = 0;
+ hdr->b_l1hdr.b_freeze_cksum = NULL;
+ hdr->b_l1hdr.b_buf = NULL;
+ hdr->b_l1hdr.b_bufcnt = 0;
+ hdr->b_l1hdr.b_byteswap = 0;
+ hdr->b_l1hdr.b_state = NULL;
+ hdr->b_l1hdr.b_arc_access = 0;
+ hdr->b_l1hdr.b_mru_hits = 0;
+ hdr->b_l1hdr.b_mru_ghost_hits = 0;
+ hdr->b_l1hdr.b_mfu_hits = 0;
+ hdr->b_l1hdr.b_mfu_ghost_hits = 0;
+ hdr->b_l1hdr.b_l2_hits = 0;
+ hdr->b_l1hdr.b_acb = NULL;
+ hdr->b_l1hdr.b_pabd = NULL;
+
+ if (ocache == hdr_full_crypt_cache) {
+ ASSERT(!HDR_HAS_RABD(hdr));
+ hdr->b_crypt_hdr.b_ot = DMU_OT_NONE;
+ hdr->b_crypt_hdr.b_ebufcnt = 0;
+ hdr->b_crypt_hdr.b_dsobj = 0;
+ bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
+ bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
+ bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
+ }
+
+ buf_discard_identity(hdr);
+ kmem_cache_free(ocache, hdr);
+
+ return (nhdr);
+}
+
+/*
+ * This function is used by the send / receive code to convert a newly
+ * allocated arc_buf_t to one that is suitable for a raw encrypted write. It
+ * is also used to allow the root objset block to be updated without altering
+ * its embedded MACs. Both block types will always be uncompressed so we do not
+ * have to worry about compression type or psize.
+ */
+void
+arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
+ dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
+ const uint8_t *mac)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+
+ buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
+ if (!HDR_PROTECTED(hdr))
+ hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
+ hdr->b_crypt_hdr.b_dsobj = dsobj;
+ hdr->b_crypt_hdr.b_ot = ot;
+ hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
+ DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
+ if (!arc_hdr_has_uncompressed_buf(hdr))
+ arc_cksum_free(hdr);
+
+ if (salt != NULL)
+ bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
+ if (iv != NULL)
+ bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
+ if (mac != NULL)
+ bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
+}
+
+/*
+ * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
+ * The buf is returned thawed since we expect the consumer to modify it.
+ */
+arc_buf_t *
+arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
+{
+ arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
+ B_FALSE, ZIO_COMPRESS_OFF, 0, type, B_FALSE);
+
+ arc_buf_t *buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
+ B_FALSE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+
+ return (buf);
+}
+
+/*
+ * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
+ * for bufs containing metadata.
+ */
+arc_buf_t *
+arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type, uint8_t complevel)
+{
+ ASSERT3U(lsize, >, 0);
+ ASSERT3U(lsize, >=, psize);
+ ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
+ ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
+
+ arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+ B_FALSE, compression_type, complevel, ARC_BUFC_DATA, B_FALSE);
+
+ arc_buf_t *buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
+ B_TRUE, B_FALSE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ if (!arc_buf_is_shared(buf)) {
+ /*
+ * To ensure that the hdr has the correct data in it if we call
+ * arc_untransform() on this buf before it's been written to
+ * disk, it's easiest if we just set up sharing between the
+ * buf and the hdr.
+ */
+ arc_hdr_free_abd(hdr, B_FALSE);
+ arc_share_buf(hdr, buf);
+ }
+
+ return (buf);
+}
+
+arc_buf_t *
+arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
+ const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
+ dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type, uint8_t complevel)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ?
+ ARC_BUFC_METADATA : ARC_BUFC_DATA;
+
+ ASSERT3U(lsize, >, 0);
+ ASSERT3U(lsize, >=, psize);
+ ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
+ ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
+
+ hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
+ compression_type, complevel, type, B_TRUE);
+
+ hdr->b_crypt_hdr.b_dsobj = dsobj;
+ hdr->b_crypt_hdr.b_ot = ot;
+ hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
+ DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
+ bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
+ bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
+ bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
+
+ /*
+ * This buffer will be considered encrypted even if the ot is not an
+ * encrypted type. It will become authenticated instead in
+ * arc_write_ready().
+ */
+ buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
+ B_FALSE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ return (buf);
+}
+
+static void
+l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
+ boolean_t state_only)
+{
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+ l2arc_dev_t *dev = l2hdr->b_dev;
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ arc_buf_contents_t type = hdr->b_type;
+ int64_t lsize_s;
+ int64_t psize_s;
+ int64_t asize_s;
+
+ if (incr) {
+ lsize_s = lsize;
+ psize_s = psize;
+ asize_s = asize;
+ } else {
+ lsize_s = -lsize;
+ psize_s = -psize;
+ asize_s = -asize;
+ }
+
+ /* If the buffer is a prefetch, count it as such. */
+ if (HDR_PREFETCH(hdr)) {
+ ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
+ } else {
+ /*
+ * We use the value stored in the L2 header upon initial
+ * caching in L2ARC. This value will be updated in case
+ * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC
+ * metadata (log entry) cannot currently be updated. Having
+ * the ARC state in the L2 header solves the problem of a
+ * possibly absent L1 header (apparent in buffers restored
+ * from persistent L2ARC).
+ */
+ switch (hdr->b_l2hdr.b_arcs_state) {
+ case ARC_STATE_MRU_GHOST:
+ case ARC_STATE_MRU:
+ ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
+ break;
+ case ARC_STATE_MFU_GHOST:
+ case ARC_STATE_MFU:
+ ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (state_only)
+ return;
+
+ ARCSTAT_INCR(arcstat_l2_psize, psize_s);
+ ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
+
+ switch (type) {
+ case ARC_BUFC_DATA:
+ ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
+ break;
+ case ARC_BUFC_METADATA:
+ ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+ l2arc_dev_t *dev = l2hdr->b_dev;
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+
+ ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ l2arc_hdr_arcstats_decrement(hdr);
+ vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
+ hdr);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+}
+
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(hdr->b_l1hdr.b_buf == NULL ||
+ hdr->b_l1hdr.b_bufcnt > 0);
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ }
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+ boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
+
+ if (!buflist_held)
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * Even though we checked this conditional above, we
+ * need to check this again now that we have the
+ * l2ad_mtx. This is because we could be racing with
+ * another thread calling l2arc_evict() which might have
+ * destroyed this header's L2 portion as we were waiting
+ * to acquire the l2ad_mtx. If that happens, we don't
+ * want to re-destroy the header's L2 portion.
+ */
+ if (HDR_HAS_L2HDR(hdr))
+ arc_hdr_l2hdr_destroy(hdr);
+
+ if (!buflist_held)
+ mutex_exit(&dev->l2ad_mtx);
+ }
+
+ /*
+	 * The header's identity can only be safely discarded once it is no
+	 * longer discoverable. This requires removing it from the hash table
+	 * and the l2arc header list. After this point the hash lock can no
+	 * longer be used to protect the header.
+ */
+ if (!HDR_EMPTY(hdr))
+ buf_discard_identity(hdr);
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ arc_cksum_free(hdr);
+
+ while (hdr->b_l1hdr.b_buf != NULL)
+ arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
+
+ if (hdr->b_l1hdr.b_pabd != NULL)
+ arc_hdr_free_abd(hdr, B_FALSE);
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
+ }
+
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+
+ if (!HDR_PROTECTED(hdr)) {
+ kmem_cache_free(hdr_full_cache, hdr);
+ } else {
+ kmem_cache_free(hdr_full_crypt_cache, hdr);
+ }
+ } else {
+ kmem_cache_free(hdr_l2only_cache, hdr);
+ }
+}
+
+void
+arc_buf_destroy(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ VERIFY0(remove_reference(hdr, NULL, tag));
+ arc_hdr_destroy(hdr);
+ return;
+ }
+
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ ASSERT3P(hdr, ==, buf->b_hdr);
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
+ ASSERT3P(buf->b_data, !=, NULL);
+
+ (void) remove_reference(hdr, hash_lock, tag);
+ arc_buf_destroy_impl(buf);
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
+ *
+ * - arc_mru -> arc_mru_ghost
+ * - arc_mfu -> arc_mfu_ghost
+ * - arc_mru_ghost -> arc_l2c_only
+ * - arc_mru_ghost -> deleted
+ * - arc_mfu_ghost -> arc_l2c_only
+ * - arc_mfu_ghost -> deleted
+ */
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+{
+ arc_state_t *evicted_state, *state;
+ int64_t bytes_evicted = 0;
+ int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+ arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ state = hdr->b_l1hdr.b_state;
+ if (GHOST_STATE(state)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+
+ /*
+ * l2arc_write_buffers() relies on a header's L1 portion
+		 * (i.e. its b_pabd field) during its write phase.
+ * Thus, we cannot push a header onto the arc_l2c_only
+ * state (removing its L1 piece) until the header is
+ * done being written to the l2arc.
+ */
+ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+ ARCSTAT_BUMP(arcstat_evict_l2_skip);
+ return (bytes_evicted);
+ }
+
+ ARCSTAT_BUMP(arcstat_deleted);
+ bytes_evicted += HDR_GET_LSIZE(hdr);
+
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ ASSERT(hdr->b_l1hdr.b_pabd == NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
+ /*
+ * dropping from L1+L2 cached to L2-only,
+ * realloc to remove the L1 header.
+ */
+ hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+ hdr_l2only_cache);
+ } else {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ }
+ return (bytes_evicted);
+ }
+
+ ASSERT(state == arc_mru || state == arc_mfu);
+ evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ /* prefetch buffers have a minimum lifespan */
+ if (HDR_IO_IN_PROGRESS(hdr) ||
+ ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+ MSEC_TO_TICK(min_lifetime))) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ return (bytes_evicted);
+ }
+
+ ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ break;
+ }
+ if (buf->b_data != NULL)
+ bytes_evicted += HDR_GET_LSIZE(hdr);
+ mutex_exit(&buf->b_evict_lock);
+ arc_buf_destroy_impl(buf);
+ }
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
+ } else {
+ if (l2arc_write_eligible(hdr->b_spa, hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_eligible,
+ HDR_GET_LSIZE(hdr));
+
+ switch (state->arcs_state) {
+ case ARC_STATE_MRU:
+ ARCSTAT_INCR(
+ arcstat_evict_l2_eligible_mru,
+ HDR_GET_LSIZE(hdr));
+ break;
+ case ARC_STATE_MFU:
+ ARCSTAT_INCR(
+ arcstat_evict_l2_eligible_mfu,
+ HDR_GET_LSIZE(hdr));
+ break;
+ default:
+ break;
+ }
+ } else {
+ ARCSTAT_INCR(arcstat_evict_l2_ineligible,
+ HDR_GET_LSIZE(hdr));
+ }
+ }
+
+ if (hdr->b_l1hdr.b_bufcnt == 0) {
+ arc_cksum_free(hdr);
+
+ bytes_evicted += arc_hdr_size(hdr);
+
+ /*
+ * If this hdr is being evicted and has a compressed
+ * buffer then we discard it here before we change states.
+ * This ensures that the accounting is updated correctly
+ * in arc_free_data_impl().
+ */
+ if (hdr->b_l1hdr.b_pabd != NULL)
+ arc_hdr_free_abd(hdr, B_FALSE);
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
+
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+ }
+
+ return (bytes_evicted);
+}
+
+static void
+arc_set_need_free(void)
+{
+ ASSERT(MUTEX_HELD(&arc_evict_lock));
+ int64_t remaining = arc_free_memory() - arc_sys_free / 2;
+ arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters);
+ if (aw == NULL) {
+ arc_need_free = MAX(-remaining, 0);
+ } else {
+ arc_need_free =
+ MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count));
+ }
+}
+
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+ uint64_t spa, int64_t bytes)
+{
+ multilist_sublist_t *mls;
+ uint64_t bytes_evicted = 0;
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ int evict_count = 0;
+
+ ASSERT3P(marker, !=, NULL);
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ mls = multilist_sublist_lock(ml, idx);
+
+ for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+ hdr = multilist_sublist_prev(mls, marker)) {
+ if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+ (evict_count >= zfs_arc_evict_batch_limit))
+ break;
+
+ /*
+ * To keep our iteration location, move the marker
+ * forward. Since we're not holding hdr's hash lock, we
+ * must be very careful and not remove 'hdr' from the
+ * sublist. Otherwise, other consumers might mistake the
+ * 'hdr' as not being on a sublist when they call the
+ * multilist_link_active() function (they all rely on
+ * the hash lock protecting concurrent insertions and
+ * removals). multilist_sublist_move_forward() was
+ * specifically implemented to ensure this is the case
+ * (only 'marker' will be removed and re-inserted).
+ */
+ multilist_sublist_move_forward(mls, marker);
+
+ /*
+ * The only case where the b_spa field should ever be
+ * zero, is the marker headers inserted by
+ * arc_evict_state(). It's possible for multiple threads
+ * to be calling arc_evict_state() concurrently (e.g.
+ * dsl_pool_close() and zio_inject_fault()), so we must
+ * skip any markers we see from these other threads.
+ */
+ if (hdr->b_spa == 0)
+ continue;
+
+ /* we're only interested in evicting buffers of a certain spa */
+ if (spa != 0 && hdr->b_spa != spa) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ continue;
+ }
+
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We aren't calling this function from any code path
+ * that would already be holding a hash lock, so we're
+ * asserting on this assumption to be defensive in case
+ * this ever changes. Without this check, it would be
+ * possible to incorrectly increment arcstat_mutex_miss
+ * below (e.g. if the code changed such that we called
+ * this function with a hash lock held).
+ */
+ ASSERT(!MUTEX_HELD(hash_lock));
+
+ if (mutex_tryenter(hash_lock)) {
+ uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+ mutex_exit(hash_lock);
+
+ bytes_evicted += evicted;
+
+ /*
+ * If evicted is zero, arc_evict_hdr() must have
+ * decided to skip this header, don't increment
+ * evict_count in this case.
+ */
+ if (evicted != 0)
+ evict_count++;
+
+ } else {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ }
+ }
+
+ multilist_sublist_unlock(mls);
+
+ /*
+ * Increment the count of evicted bytes, and wake up any threads that
+ * are waiting for the count to reach this value. Since the list is
+ * ordered by ascending aew_count, we pop off the beginning of the
+ * list until we reach the end, or a waiter that's past the current
+ * "count". Doing this outside the loop reduces the number of times
+ * we need to acquire the global arc_evict_lock.
+ *
+ * Only wake when there's sufficient free memory in the system
+ * (specifically, arc_sys_free/2, which by default is a bit more than
+ * 1/64th of RAM). See the comments in arc_wait_for_eviction().
+ */
+ mutex_enter(&arc_evict_lock);
+ arc_evict_count += bytes_evicted;
+
+ if (arc_free_memory() > arc_sys_free / 2) {
+ arc_evict_waiter_t *aw;
+ while ((aw = list_head(&arc_evict_waiters)) != NULL &&
+ aw->aew_count <= arc_evict_count) {
+ list_remove(&arc_evict_waiters, aw);
+ cv_broadcast(&aw->aew_cv);
+ }
+ }
+ arc_set_need_free();
+ mutex_exit(&arc_evict_lock);
+
+ /*
+ * If the ARC size is reduced from arc_c_max to arc_c_min (especially
+ * if the average cached block is small), eviction can be on-CPU for
+ * many seconds. To ensure that other threads that may be bound to
+ * this CPU are able to make progress, make a voluntary preemption
+ * call here.
+ */
+ cond_resched();
+
+ return (bytes_evicted);
+}
+
+/*
+ * Evict buffers from the given arc state, until we've removed the
+ * specified number of bytes. Move the removed buffers to the
+ * appropriate evict state.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so, may not catch all candidates.
+ * It may also return without evicting as much space as requested.
+ *
+ * If bytes is specified using the special value ARC_EVICT_ALL, this
+ * will evict all available (i.e. unlocked and evictable) buffers from
+ * the given arc state; which is used by arc_flush().
+ */
+static uint64_t
+arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
+{
+ uint64_t total_evicted = 0;
+ multilist_t *ml = state->arcs_list[type];
+ int num_sublists;
+ arc_buf_hdr_t **markers;
+
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ num_sublists = multilist_get_num_sublists(ml);
+
+ /*
+ * If we've tried to evict from each sublist, made some
+ * progress, but still have not hit the target number of bytes
+ * to evict, we want to keep trying. The markers allow us to
+ * pick up where we left off for each individual sublist, rather
+ * than starting from the tail each time.
+ */
+ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+ for (int i = 0; i < num_sublists; i++) {
+ multilist_sublist_t *mls;
+
+ markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+ /*
+ * A b_spa of 0 is used to indicate that this header is
+ * a marker. This fact is used in arc_evict_type() and
+ * arc_evict_state_impl().
+ */
+ markers[i]->b_spa = 0;
+
+ mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_insert_tail(mls, markers[i]);
+ multilist_sublist_unlock(mls);
+ }
+
+ /*
+ * While we haven't hit our target number of bytes to evict, or
+ * we're evicting all available buffers.
+ */
+ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
+ int sublist_idx = multilist_get_random_index(ml);
+ uint64_t scan_evicted = 0;
+
+ /*
+ * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
+ * Request that 10% of the LRUs be scanned by the superblock
+ * shrinker.
+ */
+ if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
+ arc_dnode_size_limit) > 0) {
+ arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
+ arc_dnode_size_limit) / sizeof (dnode_t) /
+ zfs_arc_dnode_reduce_percent);
+ }
+
+ /*
+ * Start eviction using a randomly selected sublist,
+ * this is to try and evenly balance eviction across all
+ * sublists. Always starting at the same sublist
+ * (e.g. index 0) would cause evictions to favor certain
+ * sublists over others.
+ */
+ for (int i = 0; i < num_sublists; i++) {
+ uint64_t bytes_remaining;
+ uint64_t bytes_evicted;
+
+ if (bytes == ARC_EVICT_ALL)
+ bytes_remaining = ARC_EVICT_ALL;
+ else if (total_evicted < bytes)
+ bytes_remaining = bytes - total_evicted;
+ else
+ break;
+
+ bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
+ markers[sublist_idx], spa, bytes_remaining);
+
+ scan_evicted += bytes_evicted;
+ total_evicted += bytes_evicted;
+
+ /* we've reached the end, wrap to the beginning */
+ if (++sublist_idx >= num_sublists)
+ sublist_idx = 0;
+ }
+
+ /*
+ * If we didn't evict anything during this scan, we have
+ * no reason to believe we'll evict more during another
+ * scan, so break the loop.
+ */
+ if (scan_evicted == 0) {
+ /* This isn't possible, let's make that obvious */
+ ASSERT3S(bytes, !=, 0);
+
+ /*
+ * When bytes is ARC_EVICT_ALL, the only way to
+ * break the loop is when scan_evicted is zero.
+ * In that case, we actually have evicted enough,
+ * so we don't want to increment the kstat.
+ */
+ if (bytes != ARC_EVICT_ALL) {
+ ASSERT3S(total_evicted, <, bytes);
+ ARCSTAT_BUMP(arcstat_evict_not_enough);
+ }
+
+ break;
+ }
+ }
+
+ for (int i = 0; i < num_sublists; i++) {
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_remove(mls, markers[i]);
+ multilist_sublist_unlock(mls);
+
+ kmem_cache_free(hdr_full_cache, markers[i]);
+ }
+ kmem_free(markers, sizeof (*markers) * num_sublists);
+
+ return (total_evicted);
+}
+
+/*
+ * Flush all "evictable" data of the given type from the arc state
+ * specified. This will not evict any "active" buffers (i.e. referenced).
+ *
+ * When 'retry' is set to B_FALSE, the function will make a single pass
+ * over the state and evict any buffers that it can. Since it doesn't
+ * continually retry the eviction, it might end up leaving some buffers
+ * in the ARC due to lock misses.
+ *
+ * When 'retry' is set to B_TRUE, the function will continually retry the
+ * eviction until *all* evictable buffers have been removed from the
+ * state. As a result, if concurrent insertions into the state are
+ * allowed (e.g. if the ARC isn't shutting down), this function might
+ * wind up in an infinite loop, continually trying to evict buffers.
+ */
+static uint64_t
+arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
+ boolean_t retry)
+{
+ uint64_t evicted = 0;
+
+ while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
+ evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
+
+ if (!retry)
+ break;
+ }
+
+ return (evicted);
+}
+
+/*
+ * Evict the specified number of bytes from the state specified,
+ * restricting eviction to the spa and type given. This function
+ * prevents us from trying to evict more from a state's list than
+ * is "evictable", and to skip evicting altogether when passed a
+ * negative value for "bytes". In contrast, arc_evict_state() will
+ * evict everything it can, when passed a negative value for "bytes".
+ */
+static uint64_t
+arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
+{
+ int64_t delta;
+
+ if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
+ delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
+ bytes);
+ return (arc_evict_state(state, spa, delta, type));
+ }
+
+ return (0);
+}
+
+/*
+ * The goal of this function is to evict enough meta data buffers from the
+ * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
+ * more complicated than it appears because it is common for data buffers
+ * to have holds on meta data buffers. In addition, dnode meta data buffers
+ * will be held by the dnodes in the block preventing them from being freed.
+ * This means we can't simply traverse the ARC and expect to always find
+ * enough unheld meta data buffer to release.
+ *
+ * Therefore, this function has been updated to make alternating passes
+ * over the ARC releasing data buffers and then newly unheld meta data
+ * buffers. This ensures forward progress is maintained and meta_used
+ * will decrease. Normally this is sufficient, but if required the ARC
+ * will call the registered prune callbacks causing dentry and inodes to
+ * be dropped from the VFS cache. This will make dnode meta data buffers
+ * available for reclaim.
+ */
+static uint64_t
+arc_evict_meta_balanced(uint64_t meta_used)
+{
+ int64_t delta, prune = 0, adjustmnt;
+ uint64_t total_evicted = 0;
+ arc_buf_contents_t type = ARC_BUFC_DATA;
+ int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
+
+restart:
+ /*
+	 * This slightly differs from the way we evict from the mru in
+ * arc_evict because we don't have a "target" value (i.e. no
+ * "meta" arc_p). As a result, I think we can completely
+ * cannibalize the metadata in the MRU before we evict the
+ * metadata from the MFU. I think we probably need to implement a
+ * "metadata arc_p" value to do this properly.
+ */
+ adjustmnt = meta_used - arc_meta_limit;
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) {
+ delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]),
+ adjustmnt);
+ total_evicted += arc_evict_impl(arc_mru, 0, delta, type);
+ adjustmnt -= delta;
+ }
+
+ /*
+ * We can't afford to recalculate adjustmnt here. If we do,
+ * new metadata buffers can sneak into the MRU or ANON lists,
+	 * thus penalizing the MFU metadata. Although the fudge factor is
+ * small, it has been empirically shown to be significant for
+ * certain workloads (e.g. creating many empty directories). As
+ * such, we use the original calculation for adjustmnt, and
+ * simply decrement the amount of data evicted from the MRU.
+ */
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
+ delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]),
+ adjustmnt);
+ total_evicted += arc_evict_impl(arc_mfu, 0, delta, type);
+ }
+
+ adjustmnt = meta_used - arc_meta_limit;
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
+ delta = MIN(adjustmnt,
+ zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]));
+ total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type);
+ adjustmnt -= delta;
+ }
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
+ delta = MIN(adjustmnt,
+ zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]));
+ total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type);
+ }
+
+ /*
+ * If after attempting to make the requested adjustment to the ARC
+ * the meta limit is still being exceeded then request that the
+ * higher layers drop some cached objects which have holds on ARC
+ * meta buffers. Requests to the upper layers will be made with
+ * increasingly large scan sizes until the ARC is below the limit.
+ */
+ if (meta_used > arc_meta_limit) {
+ if (type == ARC_BUFC_DATA) {
+ type = ARC_BUFC_METADATA;
+ } else {
+ type = ARC_BUFC_DATA;
+
+ if (zfs_arc_meta_prune) {
+ prune += zfs_arc_meta_prune;
+ arc_prune_async(prune);
+ }
+ }
+
+ if (restarts > 0) {
+ restarts--;
+ goto restart;
+ }
+ }
+ return (total_evicted);
+}
+
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_evict_meta_only(uint64_t meta_used)
+{
+ uint64_t total_evicted = 0;
+ int64_t target;
+
+ /*
+ * If we're over the meta limit, we want to evict enough
+ * metadata to get back under the meta limit. We don't want to
+ * evict so much that we drop the MRU below arc_p, though. If
+ * we're over the meta limit more than we're over arc_p, we
+ * evict some from the MRU here, and some from the MFU below.
+ */
+ target = MIN((int64_t)(meta_used - arc_meta_limit),
+ (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
+ zfs_refcount_count(&arc_mru->arcs_size) - arc_p));
+
+ total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+
+ /*
+ * Similar to the above, we want to evict enough bytes to get us
+ * below the meta limit, but not so much as to drop us below the
+ * space allotted to the MFU (which is defined as arc_c - arc_p).
+ */
+ target = MIN((int64_t)(meta_used - arc_meta_limit),
+ (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) -
+ (arc_c - arc_p)));
+
+ total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+
+ return (total_evicted);
+}
+
+static uint64_t
+arc_evict_meta(uint64_t meta_used)
+{
+ if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
+ return (arc_evict_meta_only(meta_used));
+ else
+ return (arc_evict_meta_balanced(meta_used));
+}
+
+/*
+ * Return the type of the oldest buffer in the given arc state
+ *
+ * This function will select a random sublist of type ARC_BUFC_DATA and
+ * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
+ * is compared, and the type which contains the "older" buffer will be
+ * returned.
+ */
+static arc_buf_contents_t
+arc_evict_type(arc_state_t *state)
+{
+ multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
+ multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
+ int data_idx = multilist_get_random_index(data_ml);
+ int meta_idx = multilist_get_random_index(meta_ml);
+ multilist_sublist_t *data_mls;
+ multilist_sublist_t *meta_mls;
+ arc_buf_contents_t type;
+ arc_buf_hdr_t *data_hdr;
+ arc_buf_hdr_t *meta_hdr;
+
+ /*
+ * We keep the sublist lock until we're finished, to prevent
+ * the headers from being destroyed via arc_evict_state().
+ */
+ data_mls = multilist_sublist_lock(data_ml, data_idx);
+ meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
+
+ /*
+ * These two loops are to ensure we skip any markers that
+ * might be at the tail of the lists due to arc_evict_state().
+ */
+
+ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
+ data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
+ if (data_hdr->b_spa != 0)
+ break;
+ }
+
+ for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
+ meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
+ if (meta_hdr->b_spa != 0)
+ break;
+ }
+
+ if (data_hdr == NULL && meta_hdr == NULL) {
+ type = ARC_BUFC_DATA;
+ } else if (data_hdr == NULL) {
+ ASSERT3P(meta_hdr, !=, NULL);
+ type = ARC_BUFC_METADATA;
+ } else if (meta_hdr == NULL) {
+ ASSERT3P(data_hdr, !=, NULL);
+ type = ARC_BUFC_DATA;
+ } else {
+ ASSERT3P(data_hdr, !=, NULL);
+ ASSERT3P(meta_hdr, !=, NULL);
+
+ /* The headers can't be on the sublist without an L1 header */
+ ASSERT(HDR_HAS_L1HDR(data_hdr));
+ ASSERT(HDR_HAS_L1HDR(meta_hdr));
+
+ if (data_hdr->b_l1hdr.b_arc_access <
+ meta_hdr->b_l1hdr.b_arc_access) {
+ type = ARC_BUFC_DATA;
+ } else {
+ type = ARC_BUFC_METADATA;
+ }
+ }
+
+ multilist_sublist_unlock(meta_mls);
+ multilist_sublist_unlock(data_mls);
+
+ return (type);
+}
+
+/*
+ * Evict buffers from the cache, such that arc_size is capped by arc_c.
+ */
+static uint64_t
+arc_evict(void)
+{
+ uint64_t total_evicted = 0;
+ uint64_t bytes;
+ int64_t target;
+ uint64_t asize = aggsum_value(&arc_size);
+ uint64_t ameta = aggsum_value(&arc_meta_used);
+
+ /*
+ * If we're over arc_meta_limit, we want to correct that before
+ * potentially evicting data buffers below.
+ */
+ total_evicted += arc_evict_meta(ameta);
+
+ /*
+ * Adjust MRU size
+ *
+ * If we're over the target cache size, we want to evict enough
+ * from the list to get back to our target size. We don't want
+ * to evict too much from the MRU, such that it drops below
+ * arc_p. So, if we're over our target cache size more than
+ * the MRU is over arc_p, we'll evict enough to get back to
+ * arc_p here, and then evict more from the MFU below.
+ */
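+	/*
+	 * Illustration with hypothetical numbers: if asize = 10 GiB,
+	 * arc_c = 8 GiB, anon + mru + ameta = 5 GiB and arc_p = 4 GiB,
+	 * then target = MIN(2 GiB, 1 GiB) = 1 GiB, so up to 1 GiB is
+	 * evicted from the MRU here and the remaining excess is
+	 * handled by the MFU pass below.
+	 */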
+ target = MIN((int64_t)(asize - arc_c),
+ (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
+ zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
+
+ /*
+ * If we're below arc_meta_min, always prefer to evict data.
+ * Otherwise, try to satisfy the requested number of bytes to
+ * evict from the type which contains older buffers; in an
+ * effort to keep newer buffers in the cache regardless of their
+ * type. If we cannot satisfy the number of bytes from this
+ * type, spill over into the next type.
+ */
+ if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA &&
+ ameta > arc_meta_min) {
+ bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+ }
+
+ /*
+ * Re-sum ARC stats after the first round of evictions.
+ */
+ asize = aggsum_value(&arc_size);
+ ameta = aggsum_value(&arc_meta_used);
+
+ /*
+ * Adjust MFU size
+ *
+ * Now that we've tried to evict enough from the MRU to get its
+ * size back to arc_p, if we're still above the target cache
+ * size, we evict the rest from the MFU.
+ */
+ target = asize - arc_c;
+
+ if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA &&
+ ameta > arc_meta_min) {
+ bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+ }
+
+ /*
+ * Adjust ghost lists
+ *
+ * In addition to the above, the ARC also defines target values
+ * for the ghost lists. The sum of the mru list and mru ghost
+ * list should never exceed the target size of the cache, and
+ * the sum of the mru list, mfu list, mru ghost list, and mfu
+ * ghost list should never exceed twice the target size of the
+ * cache. The following logic enforces these limits on the ghost
+ * caches, and evicts from them as needed.
+ */
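+	/*
+	 * Hypothetical example: with arc_c = 8 GiB, mru = 6 GiB and
+	 * mru_ghost = 4 GiB, the target below is 6 + 4 - 8 = 2 GiB,
+	 * so up to 2 GiB is evicted from the mru ghost list (data
+	 * first, then metadata).
+	 */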
+ target = zfs_refcount_count(&arc_mru->arcs_size) +
+ zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
+
+ bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
+
+ /*
+ * We assume the sum of the mru list and mfu list is less than
+ * or equal to arc_c (we enforced this above), which means we
+ * can use the simpler of the two equations below:
+ *
+ * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
+ * mru ghost + mfu ghost <= arc_c
+ */
+ target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
+ zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
+
+ bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
+
+ return (total_evicted);
+}
+
+void
+arc_flush(spa_t *spa, boolean_t retry)
+{
+ uint64_t guid = 0;
+
+ /*
+ * If retry is B_TRUE, a spa must not be specified since we have
+ * no good way to determine if all of a spa's buffers have been
+ * evicted from an arc state.
+ */
+ ASSERT(!retry || spa == 0);
+
+ if (spa != NULL)
+ guid = spa_load_guid(spa);
+
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
+}
+
+void
+arc_reduce_target_size(int64_t to_free)
+{
+ uint64_t asize = aggsum_value(&arc_size);
+
+ /*
+ * All callers want the ARC to actually evict (at least) this much
+ * memory. Therefore we reduce from the lower of the current size and
+ * the target size. This way, even if arc_c is much higher than
+	 * arc_size (as can be the case after many calls to arc_freed()), we will
+ * immediately have arc_c < arc_size and therefore the arc_evict_zthr
+ * will evict.
+ */
+ uint64_t c = MIN(arc_c, asize);
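+	/*
+	 * For example (hypothetical numbers): with arc_c = 8 GiB,
+	 * arc_size = 6 GiB and to_free = 1 GiB, we reduce from
+	 * c = 6 GiB and the new target becomes 5 GiB, provided that is
+	 * still above arc_c_min; otherwise arc_c is clamped to
+	 * arc_c_min.
+	 */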
+
+ if (c > to_free && c - to_free > arc_c_min) {
+ arc_c = c - to_free;
+ atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+ if (arc_p > arc_c)
+ arc_p = (arc_c >> 1);
+ ASSERT(arc_c >= arc_c_min);
+ ASSERT((int64_t)arc_p >= 0);
+ } else {
+ arc_c = arc_c_min;
+ }
+
+ if (asize > arc_c) {
+ /* See comment in arc_evict_cb_check() on why lock+flag */
+ mutex_enter(&arc_evict_lock);
+ arc_evict_needed = B_TRUE;
+ mutex_exit(&arc_evict_lock);
+ zthr_wakeup(arc_evict_zthr);
+ }
+}
+
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of B_TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+boolean_t
+arc_reclaim_needed(void)
+{
+ return (arc_available_memory() < 0);
+}
+
+void
+arc_kmem_reap_soon(void)
+{
+ size_t i;
+ kmem_cache_t *prev_cache = NULL;
+ kmem_cache_t *prev_data_cache = NULL;
+ extern kmem_cache_t *zio_buf_cache[];
+ extern kmem_cache_t *zio_data_buf_cache[];
+
+#ifdef _KERNEL
+ if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) &&
+ zfs_arc_meta_prune) {
+ /*
+ * We are exceeding our meta-data cache limit.
+ * Prune some entries to release holds on meta-data.
+ */
+ arc_prune_async(zfs_arc_meta_prune);
+ }
+#if defined(_ILP32)
+ /*
+ * Reclaim unused memory from all kmem caches.
+ */
+ kmem_reap();
+#endif
+#endif
+
+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+#if defined(_ILP32)
+ /* reach upper limit of cache size on 32-bit */
+ if (zio_buf_cache[i] == NULL)
+ break;
+#endif
+ if (zio_buf_cache[i] != prev_cache) {
+ prev_cache = zio_buf_cache[i];
+ kmem_cache_reap_now(zio_buf_cache[i]);
+ }
+ if (zio_data_buf_cache[i] != prev_data_cache) {
+ prev_data_cache = zio_data_buf_cache[i];
+ kmem_cache_reap_now(zio_data_buf_cache[i]);
+ }
+ }
+ kmem_cache_reap_now(buf_cache);
+ kmem_cache_reap_now(hdr_full_cache);
+ kmem_cache_reap_now(hdr_l2only_cache);
+ kmem_cache_reap_now(zfs_btree_leaf_cache);
+ abd_cache_reap_now();
+}
+
+/* ARGSUSED */
+static boolean_t
+arc_evict_cb_check(void *arg, zthr_t *zthr)
+{
+#ifdef ZFS_DEBUG
+ /*
+ * This is necessary in order to keep the kstat information
+ * up to date for tools that display kstat data such as the
+ * mdb ::arc dcmd and the Linux crash utility. These tools
+ * typically do not call kstat's update function, but simply
+ * dump out stats from the most recent update. Without
+ * this call, these commands may show stale stats for the
+ * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+ * with this call, the data might be out of date if the
+ * evict thread hasn't been woken recently; but that should
+ * suffice. The arc_state_t structures can be queried
+ * directly if more accurate information is needed.
+ */
+ if (arc_ksp != NULL)
+ arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+#endif
+
+ /*
+ * We have to rely on arc_wait_for_eviction() to tell us when to
+ * evict, rather than checking if we are overflowing here, so that we
+ * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
+ * If we have become "not overflowing" since arc_wait_for_eviction()
+ * checked, we need to wake it up. We could broadcast the CV here,
+ * but arc_wait_for_eviction() may have not yet gone to sleep. We
+ * would need to use a mutex to ensure that this function doesn't
+ * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
+ * the arc_evict_lock). However, the lock ordering of such a lock
+ * would necessarily be incorrect with respect to the zthr_lock,
+ * which is held before this function is called, and is held by
+ * arc_wait_for_eviction() when it calls zthr_wakeup().
+ */
+ return (arc_evict_needed);
+}
+
+/*
+ * Keep arc_size under arc_c by running arc_evict which evicts data
+ * from the ARC.
+ */
+/* ARGSUSED */
+static void
+arc_evict_cb(void *arg, zthr_t *zthr)
+{
+ uint64_t evicted = 0;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ /* Evict from cache */
+ evicted = arc_evict();
+
+ /*
+ * If evicted is zero, we couldn't evict anything
+ * via arc_evict(). This could be due to hash lock
+ * collisions, but more likely due to the majority of
+ * arc buffers being unevictable. Therefore, even if
+ * arc_size is above arc_c, another pass is unlikely to
+ * be helpful and could potentially cause us to enter an
+ * infinite loop. Additionally, zthr_iscancelled() is
+ * checked here so that if the arc is shutting down, the
+ * broadcast will wake any remaining arc evict waiters.
+ */
+ mutex_enter(&arc_evict_lock);
+ arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) &&
+ evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
+ if (!arc_evict_needed) {
+ /*
+ * We're either no longer overflowing, or we
+ * can't evict anything more, so we should wake
+ * arc_get_data_impl() sooner.
+ */
+ arc_evict_waiter_t *aw;
+ while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
+ cv_broadcast(&aw->aew_cv);
+ }
+ arc_set_need_free();
+ }
+ mutex_exit(&arc_evict_lock);
+ spl_fstrans_unmark(cookie);
+}
+
+/* ARGSUSED */
+static boolean_t
+arc_reap_cb_check(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory = arc_available_memory();
+ static int reap_cb_check_counter = 0;
+
+ /*
+ * If a kmem reap is already active, don't schedule more. We must
+ * check for this because kmem_cache_reap_soon() won't actually
+ * block on the cache being reaped (this is to prevent callers from
+ * becoming implicitly blocked by a system-wide kmem reap -- which,
+ * on a system with many, many full magazines, can take minutes).
+ */
+ if (!kmem_cache_reap_active() && free_memory < 0) {
+
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
+ /*
+ * Wait at least zfs_grow_retry (default 5) seconds
+ * before considering growing.
+ */
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ return (B_TRUE);
+ } else if (free_memory < arc_c >> arc_no_grow_shift) {
+ arc_no_grow = B_TRUE;
+ } else if (gethrtime() >= arc_growtime) {
+ arc_no_grow = B_FALSE;
+ }
+
+ /*
+ * Called unconditionally every 60 seconds to reclaim unused
+ * zstd compression and decompression context. This is done
+ * here to avoid the need for an independent thread.
+ */
+ if (!((reap_cb_check_counter++) % 60))
+ zfs_zstd_cache_reap_now();
+
+ return (B_FALSE);
+}
+
+/*
+ * Keep enough free memory in the system by reaping the ARC's kmem
+ * caches. To cause more slabs to be reapable, we may reduce the
+ * target size of the cache (arc_c), causing the arc_evict_cb()
+ * to free more buffers.
+ */
+/* ARGSUSED */
+static void
+arc_reap_cb(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ /*
+ * Kick off asynchronous kmem_reap()'s of all our caches.
+ */
+ arc_kmem_reap_soon();
+
+ /*
+ * Wait at least arc_kmem_cache_reap_retry_ms between
+ * arc_kmem_reap_soon() calls. Without this check it is possible to
+ * end up in a situation where we spend lots of time reaping
+ * caches, while we're near arc_c_min. Waiting here also gives the
+ * subsequent free memory check a chance of finding that the
+ * asynchronous reap has already freed enough memory, and we don't
+ * need to call arc_reduce_target_size().
+ */
+ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
+
+ /*
+ * Reduce the target size as needed to maintain the amount of free
+ * memory in the system at a fraction of the arc_size (1/128th by
+ * default). If oversubscribed (free_memory < 0) then reduce the
+ * target arc_size by the deficit amount plus the fractional
+	 * amount. If free memory is positive but less than the fractional
+ * amount, reduce by what is needed to hit the fractional amount.
+ */
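+	/*
+	 * Worked example (assuming the default arc_shrink_shift of 7,
+	 * i.e. 1/128th): with arc_c = 8 GiB the fractional amount is
+	 * 64 MiB; if free_memory is 16 MiB, to_free below is 48 MiB.
+	 */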
+ free_memory = arc_available_memory();
+
+ int64_t to_free =
+ (arc_c >> arc_shrink_shift) - free_memory;
+ if (to_free > 0) {
+ arc_reduce_target_size(to_free);
+ }
+ spl_fstrans_unmark(cookie);
+}
+
+#ifdef _KERNEL
+/*
+ * Determine the amount of memory eligible for eviction contained in the
+ * ARC. All clean data reported by the ghost lists can always be safely
+ * evicted. Due to arc_c_min, the same does not hold for all clean data
+ * contained by the regular mru and mfu lists.
+ *
+ * In the case of the regular mru and mfu lists, we need to report as
+ * much clean data as possible, such that evicting that same reported
+ * data will not bring arc_size below arc_c_min. Thus, in certain
+ * circumstances, the total amount of clean data in the mru and mfu
+ * lists might not actually be evictable.
+ *
+ * The following two distinct cases are accounted for:
+ *
+ * 1. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is greater than or equal to arc_c_min.
+ * (i.e. amount of dirty data >= arc_c_min)
+ *
+ * This is the easy case; all clean data contained by the mru and mfu
+ * lists is evictable. Evicting all clean data can only drop arc_size
+ * to the amount of dirty data, which is greater than arc_c_min.
+ *
+ * 2. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is less than arc_c_min.
+ * (i.e. arc_c_min > amount of dirty data)
+ *
+ * 2.1. arc_size is greater than or equal to arc_c_min.
+ * (i.e. arc_size >= arc_c_min > amount of dirty data)
+ *
+ * In this case, not all clean data from the regular mru and mfu
+ * lists is actually evictable; we must leave enough clean data
+ * to keep arc_size above arc_c_min. Thus, the maximum amount of
+ * evictable data from the two lists combined, is exactly the
+ * difference between arc_size and arc_c_min.
+ *
+ * 2.2. arc_size is less than arc_c_min
+ * (i.e. arc_c_min > arc_size > amount of dirty data)
+ *
+ * In this case, none of the data contained in the mru and mfu
+ * lists is evictable, even if it's clean. Since arc_size is
+ * already below arc_c_min, evicting any more would only
+ * increase this negative difference.
+ */
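+
+/*
+ * A brief numeric sketch of case 2.1 above (hypothetical values): with
+ * arc_c_min = 2 GiB, 1 GiB of dirty data and arc_size = 5 GiB, at most
+ * arc_size - arc_c_min = 3 GiB of the clean data may be reported as
+ * evictable, even if more than 3 GiB of it is actually clean.
+ */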
+
+#endif /* _KERNEL */
+
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are coming from. This function is only called
+ * when we are adding new content to the cache.
+ */
+static void
+arc_adapt(int bytes, arc_state_t *state)
+{
+ int mult;
+ uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+ int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size);
+ int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size);
+
+ ASSERT(bytes > 0);
+ /*
+ * Adapt the target size of the MRU list:
+ * - if we just hit in the MRU ghost list, then increase
+ * the target size of the MRU list.
+ * - if we just hit in the MFU ghost list, then increase
+ * the target size of the MFU list by decreasing the
+ * target size of the MRU list.
+ */
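+	/*
+	 * Example (hypothetical sizes): on an MRU-ghost hit with
+	 * mrug_size = 1 GiB and mfug_size = 3 GiB, mult = 3, so arc_p
+	 * grows by 3 * bytes (capped by the dampener and by
+	 * arc_c - arc_p_min).
+	 */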
+ if (state == arc_mru_ghost) {
+ mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
+ if (!zfs_arc_p_dampener_disable)
+ mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
+
+ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
+ } else if (state == arc_mfu_ghost) {
+ uint64_t delta;
+
+ mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
+ if (!zfs_arc_p_dampener_disable)
+ mult = MIN(mult, 10);
+
+ delta = MIN(bytes * mult, arc_p);
+ arc_p = MAX(arc_p_min, arc_p - delta);
+ }
+ ASSERT((int64_t)arc_p >= 0);
+
+ /*
+ * Wake reap thread if we do not have any available memory
+ */
+ if (arc_reclaim_needed()) {
+ zthr_wakeup(arc_reap_zthr);
+ return;
+ }
+
+ if (arc_no_grow)
+ return;
+
+ if (arc_c >= arc_c_max)
+ return;
+
+ /*
+ * If we're within (2 * maxblocksize) bytes of the target
+ * cache size, increment the target cache size
+ */
+ ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
+ if (aggsum_upper_bound(&arc_size) >=
+ arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+ atomic_add_64(&arc_c, (int64_t)bytes);
+ if (arc_c > arc_c_max)
+ arc_c = arc_c_max;
+ else if (state == arc_anon)
+ atomic_add_64(&arc_p, (int64_t)bytes);
+ if (arc_p > arc_c)
+ arc_p = arc_c;
+ }
+ ASSERT((int64_t)arc_p >= 0);
+}
+
+/*
+ * Check if arc_size has grown past our upper threshold, determined by
+ * zfs_arc_overflow_shift.
+ */
+boolean_t
+arc_is_overflowing(void)
+{
+ /* Always allow at least one block of overflow */
+ int64_t overflow = MAX(SPA_MAXBLOCKSIZE,
+ arc_c >> zfs_arc_overflow_shift);
+
+ /*
+ * We just compare the lower bound here for performance reasons. Our
+ * primary goals are to make sure that the arc never grows without
+ * bound, and that it can reach its maximum size. This check
+ * accomplishes both goals. The maximum amount we could run over by is
+ * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
+ * in the ARC. In practice, that's in the tens of MB, which is low
+ * enough to be safe.
+ */
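+	/*
+	 * Hypothetical example: with arc_c = 4 GiB and
+	 * zfs_arc_overflow_shift at its usual default of 8, overflow is
+	 * MAX(16 MiB, 16 MiB) = 16 MiB, so the ARC is considered
+	 * overflowing once its lower-bound size exceeds ~4 GiB + 16 MiB.
+	 */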
+ return (aggsum_lower_bound(&arc_size) >= (int64_t)arc_c + overflow);
+}
+
+static abd_t *
+arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
+ boolean_t do_adapt)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag, do_adapt);
+ if (type == ARC_BUFC_METADATA) {
+ return (abd_alloc(size, B_TRUE));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (abd_alloc(size, B_FALSE));
+ }
+}
+
+static void *
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag, B_TRUE);
+ if (type == ARC_BUFC_METADATA) {
+ return (zio_buf_alloc(size));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (zio_data_buf_alloc(size));
+ }
+}
+
+/*
+ * Wait for the specified amount of data (in bytes) to be evicted from the
+ * ARC, and for there to be sufficient free memory in the system. Waiting for
+ * eviction ensures that the memory used by the ARC decreases. Waiting for
+ * free memory ensures that the system won't run out of free pages, regardless
+ * of ARC behavior and settings. See arc_lowmem_init().
+ */
+void
+arc_wait_for_eviction(uint64_t amount)
+{
+ mutex_enter(&arc_evict_lock);
+ if (arc_is_overflowing()) {
+ arc_evict_needed = B_TRUE;
+ zthr_wakeup(arc_evict_zthr);
+
+ if (amount != 0) {
+ arc_evict_waiter_t aw;
+ list_link_init(&aw.aew_node);
+ cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
+
+ uint64_t last_count = 0;
+ if (!list_is_empty(&arc_evict_waiters)) {
+ arc_evict_waiter_t *last =
+ list_tail(&arc_evict_waiters);
+ last_count = last->aew_count;
+ }
+ /*
+ * Note, the last waiter's count may be less than
+ * arc_evict_count if we are low on memory in which
+ * case arc_evict_state_impl() may have deferred
+ * wakeups (but still incremented arc_evict_count).
+ */
+ aw.aew_count =
+ MAX(last_count, arc_evict_count) + amount;
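+ /*
+ * Example with made-up numbers: if the most recently queued
+ * waiter is waiting for arc_evict_count to reach 10000000 and
+ * we were asked to wait for another 131072 bytes (128 KiB),
+ * we queue ourselves at 10131072, so waiters are released in
+ * FIFO order as eviction progresses.
+ */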
+
+ list_insert_tail(&arc_evict_waiters, &aw);
+
+ arc_set_need_free();
+
+ DTRACE_PROBE3(arc__wait__for__eviction,
+ uint64_t, amount,
+ uint64_t, arc_evict_count,
+ uint64_t, aw.aew_count);
+
+ /*
+ * We will be woken up either when arc_evict_count
+ * reaches aew_count, or when the ARC is no longer
+ * overflowing and eviction completes.
+ */
+ cv_wait(&aw.aew_cv, &arc_evict_lock);
+
+ /*
+ * In case of "false" wakeup, we will still be on the
+ * list.
+ */
+ if (list_link_active(&aw.aew_node))
+ list_remove(&arc_evict_waiters, &aw);
+
+ cv_destroy(&aw.aew_cv);
+ }
+ }
+ mutex_exit(&arc_evict_lock);
+}
+
+/*
+ * Allocate a block and return it to the caller. If we are hitting the
+ * hard limit for the cache size, we must sleep, waiting for the eviction
+ * thread to catch up. If we're past the target size but below the hard
+ * limit, we'll only signal the reclaim thread and continue on.
+ */
+static void
+arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
+ boolean_t do_adapt)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ if (do_adapt)
+ arc_adapt(size, state);
+
+ /*
+ * If arc_size is currently overflowing, we must be adding data
+ * faster than we are evicting. To ensure we don't compound the
+ * problem by adding more data and forcing arc_size to grow even
+ * further past its target size, we wait for the eviction thread to
+ * make some progress. We also wait for there to be sufficient free
+ * memory in the system, as measured by arc_free_memory().
+ *
+ * Specifically, we wait for zfs_arc_eviction_pct percent of the
+ * requested size to be evicted. This should be more than 100%, to
+ * ensure that progress is also made towards getting arc_size
+ * under arc_c. See the comment above zfs_arc_eviction_pct.
+ *
+ * We do the overflowing check without holding the arc_evict_lock to
+ * reduce lock contention in this hot path. Note that
+ * arc_wait_for_eviction() will acquire the lock and check again to
+ * ensure we are truly overflowing before blocking.
+ */
+ if (arc_is_overflowing()) {
+ arc_wait_for_eviction(size *
+ zfs_arc_eviction_pct / 100);
+ }
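+ /*
+ * For instance, with zfs_arc_eviction_pct at its usual default of
+ * 200, a 128 KiB allocation that finds the ARC overflowing blocks
+ * until roughly 256 KiB has been evicted (or until eviction
+ * catches up and the ARC is no longer overflowing).
+ */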
+
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_consume(size, ARC_SPACE_META);
+ } else {
+ arc_space_consume(size, ARC_SPACE_DATA);
+ }
+
+ /*
+ * Update the state size. Note that ghost states have a
+ * "ghost size" and so don't need to be updated.
+ */
+ if (!GHOST_STATE(state)) {
+
+ (void) zfs_refcount_add_many(&state->arcs_size, size, tag);
+
+ /*
+ * If this is reached via arc_read, the link is
+ * protected by the hash lock. If reached via
+ * arc_buf_alloc, the header should not be accessed by
+ * any other thread. And, if reached via arc_read_done,
+ * the hash lock will protect it if it's found in the
+ * hash table; otherwise no other thread should be
+ * trying to [add|remove]_reference it.
+ */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ size, tag);
+ }
+
+ /*
+ * If we are growing the cache, and we are adding anonymous
+ * data, and we have outgrown arc_p, update arc_p
+ */
+ if (aggsum_upper_bound(&arc_size) < arc_c &&
+ hdr->b_l1hdr.b_state == arc_anon &&
+ (zfs_refcount_count(&arc_anon->arcs_size) +
+ zfs_refcount_count(&arc_mru->arcs_size) > arc_p))
+ arc_p = MIN(arc_c, arc_p + size);
+ }
+}
+
+static void
+arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
+{
+ arc_free_data_impl(hdr, size, tag);
+ abd_free(abd);
+}
+
+static void
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_free_data_impl(hdr, size, tag);
+ if (type == ARC_BUFC_METADATA) {
+ zio_buf_free(buf, size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ zio_data_buf_free(buf, size);
+ }
+}
+
+/*
+ * Free the arc data buffer.
+ */
+static void
+arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ size, tag);
+ }
+ (void) zfs_refcount_remove_many(&state->arcs_size, size, tag);
+
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_space_return(size, ARC_SPACE_DATA);
+ }
+}
+
+/*
+ * This routine is called whenever a buffer is accessed.
+ * NOTE: the hash lock is dropped in this function.
+ */
+static void
+arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+{
+ clock_t now;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ /*
+ * This buffer is not in the cache, and does not
+ * appear in our "ghost" list. Add the new buffer
+ * to the MRU state.
+ */
+
+ ASSERT0(hdr->b_l1hdr.b_arc_access);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mru, hdr, hash_lock);
+
+ } else if (hdr->b_l1hdr.b_state == arc_mru) {
+ now = ddi_get_lbolt();
+
+ /*
+ * If this buffer is here because of a prefetch, then either:
+ * - clear the flag if this is a "referencing" read
+ * (any subsequent access will bump this into the MFU state).
+ * or
+ * - move the buffer to the head of the list if this is
+ * another prefetch (to make it less likely to be evicted).
+ */
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ /* link protected by hash lock */
+ ASSERT(multilist_link_active(
+ &hdr->b_l1hdr.b_arc_node));
+ } else {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ hdr->b_l1hdr.b_arc_access = now;
+ return;
+ }
+
+ /*
+ * This buffer has been "accessed" only once so far,
+ * but it is still in the cache. Move it to the MFU
+ * state.
+ */
+ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
+ ARC_MINTIME)) {
+ /*
+ * More than 125ms have passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ hdr->b_l1hdr.b_arc_access = now;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mfu, hdr, hash_lock);
+ }
+ atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
+ arc_state_t *new_state;
+ /*
+ * This buffer has been "accessed" recently, but
+ * was evicted from the cache. Move it to the
+ * MFU state.
+ */
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ new_state = arc_mru;
+ if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+ } else {
+ new_state = arc_mfu;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ }
+
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ arc_change_state(new_state, hdr, hash_lock);
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
+ ARCSTAT_BUMP(arcstat_mru_ghost_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_mfu) {
+ /*
+ * This buffer has been accessed more than once and is
+ * still in the cache. Keep it in the MFU state.
+ *
+ * NOTE: an add_reference() that occurred when we did
+ * the arc_read() will have kicked this off the list.
+ * If it was a prefetch, we will explicitly move it to
+ * the head of the list now.
+ */
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
+ ARCSTAT_BUMP(arcstat_mfu_hits);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
+ arc_state_t *new_state = arc_mfu;
+ /*
+ * This buffer has been accessed more than once but has
+ * been evicted from the cache. Move it back to the
+ * MFU state.
+ */
+
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ /*
+ * This is a prefetch access...
+ * move this block back to the MRU state.
+ */
+ new_state = arc_mru;
+ }
+
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(new_state, hdr, hash_lock);
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
+ ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
+ /*
+ * This buffer is on the 2nd Level ARC.
+ */
+
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mfu, hdr, hash_lock);
+ } else {
+ cmn_err(CE_PANIC, "invalid arc state 0x%p",
+ hdr->b_l1hdr.b_state);
+ }
+}
+
+/*
+ * This routine is called by dbuf_hold() to update the arc_access() state
+ * which otherwise would be skipped for entries in the dbuf cache.
+ */
+void
+arc_buf_access(arc_buf_t *buf)
+{
+ mutex_enter(&buf->b_evict_lock);
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ /*
+ * Avoid taking the hash_lock when possible as an optimization.
+ * The header must be checked again under the hash_lock in order
+ * to handle the case where it is concurrently being released.
+ */
+ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
+ mutex_exit(&buf->b_evict_lock);
+ return;
+ }
+
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
+ mutex_exit(hash_lock);
+ mutex_exit(&buf->b_evict_lock);
+ ARCSTAT_BUMP(arcstat_access_skip);
+ return;
+ }
+
+ mutex_exit(&buf->b_evict_lock);
+
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
+
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
+}
+
+/* a generic arc_read_done_func_t which you can use */
+/* ARGSUSED */
+void
+arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
+{
+ if (buf == NULL)
+ return;
+
+ bcopy(buf->b_data, arg, arc_buf_size(buf));
+ arc_buf_destroy(buf, arg);
+}
+
+/* a generic arc_read_done_func_t */
+/* ARGSUSED */
+void
+arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
+{
+ arc_buf_t **bufp = arg;
+
+ if (buf == NULL) {
+ ASSERT(zio == NULL || zio->io_error != 0);
+ *bufp = NULL;
+ } else {
+ ASSERT(zio == NULL || zio->io_error == 0);
+ *bufp = buf;
+ ASSERT(buf->b_data != NULL);
+ }
+}
+
+static void
+arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
+{
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
+ ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
+ ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
+ } else {
+ if (HDR_COMPRESSION_ENABLED(hdr)) {
+ ASSERT3U(arc_hdr_get_compress(hdr), ==,
+ BP_GET_COMPRESS(bp));
+ }
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
+ ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
+ ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
+ }
+}
+
+static void
+arc_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ arc_buf_hdr_t *hdr = zio->io_private;
+ kmutex_t *hash_lock = NULL;
+ arc_callback_t *callback_list;
+ arc_callback_t *acb;
+ boolean_t freeable = B_FALSE;
+
+ /*
+ * The hdr was inserted into hash-table and removed from lists
+ * prior to starting I/O. We should find this header, since
+ * it's in the hash table, and it should be legit since it's
+ * not possible to evict it during the I/O. The only possible
+ * reason for it not to be found is if we were freed during the
+ * read.
+ */
+ if (HDR_IN_HASH_TABLE(hdr)) {
+ arc_buf_hdr_t *found;
+
+ ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+ ASSERT3U(hdr->b_dva.dva_word[0], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[0]);
+ ASSERT3U(hdr->b_dva.dva_word[1], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[1]);
+
+ found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock);
+
+ ASSERT((found == hdr &&
+ DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+ ASSERT3P(hash_lock, !=, NULL);
+ }
+
+ if (BP_IS_PROTECTED(bp)) {
+ hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
+ hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
+ zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
+ hdr->b_crypt_hdr.b_iv);
+
+ if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
+ void *tmpbuf;
+
+ tmpbuf = abd_borrow_buf_copy(zio->io_abd,
+ sizeof (zil_chain_t));
+ zio_crypt_decode_mac_zil(tmpbuf,
+ hdr->b_crypt_hdr.b_mac);
+ abd_return_buf(zio->io_abd, tmpbuf,
+ sizeof (zil_chain_t));
+ } else {
+ zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
+ }
+ }
+
+ if (zio->io_error == 0) {
+ /* byteswap if necessary */
+ if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+ if (BP_GET_LEVEL(zio->io_bp) > 0) {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
+ } else {
+ hdr->b_l1hdr.b_byteswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
+ }
+ } else {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+ }
+ if (!HDR_L2_READING(hdr)) {
+ hdr->b_complevel = zio->io_prop.zp_complevel;
+ }
+ }
+
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
+ if (l2arc_noprefetch && HDR_PREFETCH(hdr))
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
+
+ callback_list = hdr->b_l1hdr.b_acb;
+ ASSERT3P(callback_list, !=, NULL);
+
+ if (hash_lock && zio->io_error == 0 &&
+ hdr->b_l1hdr.b_state == arc_anon) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ arc_access(hdr, hash_lock);
+ }
+
+ /*
+ * If a read request has a callback (i.e. acb_done is not NULL), then we
+ * make a buf containing the data according to the parameters which were
+ * passed in. The implementation of arc_buf_alloc_impl() ensures that we
+ * aren't needlessly decompressing the data multiple times.
+ */
+ int callback_cnt = 0;
+ for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
+ if (!acb->acb_done || acb->acb_nobuf)
+ continue;
+
+ callback_cnt++;
+
+ if (zio->io_error != 0)
+ continue;
+
+ int error = arc_buf_alloc_impl(hdr, zio->io_spa,
+ &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
+ acb->acb_compressed, acb->acb_noauth, B_TRUE,
+ &acb->acb_buf);
+
+ /*
+ * Assert non-speculative zios didn't fail because an
+ * encryption key wasn't loaded
+ */
+ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
+ error != EACCES);
+
+ /*
+ * If we failed to decrypt, report an error now (as the zio
+ * layer would have done if it had done the transforms).
+ */
+ if (error == ECKSUM) {
+ ASSERT(BP_IS_PROTECTED(bp));
+ error = SET_ERROR(EIO);
+ if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
+ spa_log_error(zio->io_spa, &acb->acb_zb);
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_AUTHENTICATION,
+ zio->io_spa, NULL, &acb->acb_zb, zio, 0);
+ }
+ }
+
+ if (error != 0) {
+ /*
+ * Decompression or decryption failed. Set
+ * io_error so that when we call acb_done
+ * (below), we will indicate that the read
+ * failed. Note that in the unusual case
+ * where one callback is compressed and another
+ * uncompressed, we will mark all of them
+ * as failed, even though the uncompressed
+ * one can't actually fail. In this case,
+ * the hdr will not be anonymous, because
+ * if there are multiple callbacks, it's
+ * because multiple threads found the same
+ * arc buf in the hash table.
+ */
+ zio->io_error = error;
+ }
+ }
+
+ /*
+ * If there are multiple callbacks, we must have the hash lock,
+ * because the only way for multiple threads to find this hdr is
+ * in the hash table. This ensures that if there are multiple
+ * callbacks, the hdr is not anonymous. If it were anonymous,
+ * we couldn't use arc_buf_destroy() in the error case below.
+ */
+ ASSERT(callback_cnt < 2 || hash_lock != NULL);
+
+ hdr->b_l1hdr.b_acb = NULL;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ if (callback_cnt == 0)
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
+
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
+ callback_list != NULL);
+
+ if (zio->io_error == 0) {
+ arc_hdr_verify(hdr, zio->io_bp);
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
+ if (hdr->b_l1hdr.b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
+ if (HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
+ freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
+ }
+
+ /*
+ * Broadcast before we drop the hash_lock to avoid the possibility
+ * that the hdr (and hence the cv) might be freed before we get to
+ * the cv_broadcast().
+ */
+ cv_broadcast(&hdr->b_l1hdr.b_cv);
+
+ if (hash_lock != NULL) {
+ mutex_exit(hash_lock);
+ } else {
+ /*
+ * This block was freed while we waited for the read to
+ * complete. It has been removed from the hash table and
+ * moved to the anonymous state (so that it won't show up
+ * in the cache).
+ */
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
+ }
+
+ /* execute each callback and free its structure */
+ while ((acb = callback_list) != NULL) {
+ if (acb->acb_done != NULL) {
+ if (zio->io_error != 0 && acb->acb_buf != NULL) {
+ /*
+ * If arc_buf_alloc_impl() fails during
+ * decompression, the buf will still be
+ * allocated, and needs to be freed here.
+ */
+ arc_buf_destroy(acb->acb_buf,
+ acb->acb_private);
+ acb->acb_buf = NULL;
+ }
+ acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
+ acb->acb_buf, acb->acb_private);
+ }
+
+ if (acb->acb_zio_dummy != NULL) {
+ acb->acb_zio_dummy->io_error = zio->io_error;
+ zio_nowait(acb->acb_zio_dummy);
+ }
+
+ callback_list = acb->acb_next;
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
+
+ if (freeable)
+ arc_hdr_destroy(hdr);
+}
+
+/*
+ * "Read" the block at the specified DVA (in bp) via the
+ * cache. If the block is found in the cache, invoke the provided
+ * callback immediately and return. Note that the `zio' parameter
+ * in the callback will be NULL in this case, since no IO was
+ * required. If the block is not in the cache pass the read request
+ * on to the spa with a substitute callback function, so that the
+ * requested block will be added to the cache.
+ *
+ * If a read request arrives for a block that has a read in-progress,
+ * either wait for the in-progress read to complete (and return the
+ * results); or, if this is a read with a "done" func, add a record
+ * to the read to invoke the "done" func when the read completes,
+ * and return; or just return.
+ *
+ * arc_read_done() will invoke all the requested "done" functions
+ * for readers of this block.
+ */
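+ /*
+ * A minimal usage sketch (hypothetical caller; spa, bp and zb are
+ * assumed to exist in the caller): read one block synchronously and
+ * receive it through arc_getbuf_func(), which stores the resulting
+ * arc_buf_t:
+ *
+ * arc_buf_t *abuf = NULL;
+ * arc_flags_t aflags = ARC_FLAG_WAIT;
+ * int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+ *     ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
+ * if (err == 0 && abuf != NULL) {
+ *         ... use abuf->b_data ...
+ *         arc_buf_destroy(abuf, &abuf);
+ * }
+ */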
+int
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ arc_read_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
+{
+ arc_buf_hdr_t *hdr = NULL;
+ kmutex_t *hash_lock = NULL;
+ zio_t *rzio;
+ uint64_t guid = spa_load_guid(spa);
+ boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
+ boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
+ (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
+ boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
+ (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
+ boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
+ boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
+ int rc = 0;
+
+ ASSERT(!embedded_bp ||
+ BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!BP_IS_REDACTED(bp));
+
+ /*
+ * Normally SPL_FSTRANS will already be set since kernel threads which
+ * expect to call the DMU interfaces will set it when created. System
+ * calls are similarly handled by setting/cleaning the bit in the
+ * registered callback (module/os/.../zfs/zpl_*).
+ *
+ * External consumers such as Lustre which call the exported DMU
+ * interfaces may not have set SPL_FSTRANS. To avoid a deadlock
+ * on the hash_lock, always set and clear the bit.
+ */
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+top:
+ if (!embedded_bp) {
+ /*
+ * Embedded BP's have no DVA and require no I/O to "read".
+ * Create an anonymous arc buf to back it.
+ */
+ hdr = buf_hash_find(guid, bp, &hash_lock);
+ }
+
+ /*
+ * Determine if we have an L1 cache hit or a cache miss. For simplicity
+ * we maintain encrypted data separately from compressed / uncompressed
+ * data. If the user is requesting raw encrypted data and we don't have
+ * that in the header we will read from disk to guarantee that we can
+ * get it even if the encryption keys aren't loaded.
+ */
+ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
+ (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
+ arc_buf_t *buf = NULL;
+ *arc_flags |= ARC_FLAG_CACHED;
+
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
+
+ if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_cached_only_in_progress);
+ rc = SET_ERROR(ENOENT);
+ goto out;
+ }
+
+ ASSERT3P(head_zio, !=, NULL);
+ if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
+ priority == ZIO_PRIORITY_SYNC_READ) {
+ /*
+ * This is a sync read that needs to wait for
+ * an in-flight async read. Request that the
+ * zio have its priority upgraded.
+ */
+ zio_change_priority(head_zio, priority);
+ DTRACE_PROBE1(arc__async__upgrade__sync,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_async_upgrade_sync);
+ }
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
+
+ if (*arc_flags & ARC_FLAG_WAIT) {
+ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
+
+ if (done) {
+ arc_callback_t *acb = NULL;
+
+ acb = kmem_zalloc(sizeof (arc_callback_t),
+ KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_compressed = compressed_read;
+ acb->acb_encrypted = encrypted_read;
+ acb->acb_noauth = noauth_read;
+ acb->acb_nobuf = no_buf;
+ acb->acb_zb = *zb;
+ if (pio != NULL)
+ acb->acb_zio_dummy = zio_null(pio,
+ spa, NULL, NULL, NULL, zio_flags);
+
+ ASSERT3P(acb->acb_done, !=, NULL);
+ acb->acb_zio_head = head_zio;
+ acb->acb_next = hdr->b_l1hdr.b_acb;
+ hdr->b_l1hdr.b_acb = acb;
+ }
+ mutex_exit(hash_lock);
+ goto out;
+ }
+
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
+
+ if (done && !no_buf) {
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ /*
+ * This is a demand read which does not have to
+ * wait for i/o because we did a predictive
+ * prefetch i/o for it, which has completed.
+ */
+ DTRACE_PROBE1(
+ arc__demand__hit__predictive__prefetch,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_predictive_prefetch);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
+
+ if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_prescient_prefetch);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ }
+
+ ASSERT(!embedded_bp || !BP_IS_HOLE(bp));
+
+ /* Get a buf with the desired data in it. */
+ rc = arc_buf_alloc_impl(hdr, spa, zb, private,
+ encrypted_read, compressed_read, noauth_read,
+ B_TRUE, &buf);
+ if (rc == ECKSUM) {
+ /*
+ * Convert authentication and decryption errors
+ * to EIO (and generate an ereport if needed)
+ * before leaving the ARC.
+ */
+ rc = SET_ERROR(EIO);
+ if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
+ spa_log_error(spa, zb);
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_AUTHENTICATION,
+ spa, NULL, zb, NULL, 0);
+ }
+ }
+ if (rc != 0) {
+ (void) remove_reference(hdr, hash_lock,
+ private);
+ arc_buf_destroy_impl(buf);
+ buf = NULL;
+ }
+
+ /* assert any errors weren't due to unloaded keys */
+ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
+ rc != EACCES);
+ } else if (*arc_flags & ARC_FLAG_PREFETCH &&
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+ if (*arc_flags & ARC_FLAG_L2CACHE)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
+ data, metadata, hits);
+
+ if (done)
+ done(NULL, zb, bp, buf, private);
+ } else {
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ arc_callback_t *acb;
+ vdev_t *vd = NULL;
+ uint64_t addr = 0;
+ boolean_t devw = B_FALSE;
+ uint64_t size;
+ abd_t *hdr_abd;
+ int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
+
+ if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
+ rc = SET_ERROR(ENOENT);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+ goto out;
+ }
+
+ /*
+ * Gracefully handle a damaged logical block size as a
+ * checksum error.
+ */
+ if (lsize > spa_maxblocksize(spa)) {
+ rc = SET_ERROR(ECKSUM);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+ goto out;
+ }
+
+ if (hdr == NULL) {
+ /*
+ * This block is not in the cache or it has
+ * embedded data.
+ */
+ arc_buf_hdr_t *exists = NULL;
+ arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
+ hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+ BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type,
+ encrypted_read);
+
+ if (!embedded_bp) {
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ }
+ if (exists != NULL) {
+ /* somebody beat us to the hash insert */
+ mutex_exit(hash_lock);
+ buf_discard_identity(hdr);
+ arc_hdr_destroy(hdr);
+ goto top; /* restart the IO request */
+ }
+ } else {
+ /*
+ * This block is in the ghost cache or encrypted data
+ * was requested and we didn't have it. If it was
+ * L2-only (and thus didn't have an L1 hdr),
+ * we realloc the header to add an L1 hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr)) {
+ hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
+ hdr_full_cache);
+ }
+
+ if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT0(zfs_refcount_count(
+ &hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+ } else if (HDR_IO_IN_PROGRESS(hdr)) {
+ /*
+ * If this header already had an IO in progress
+ * and we are performing another IO to fetch
+ * encrypted data we must wait until the first
+ * IO completes so as not to confuse
+ * arc_read_done(). This should be very rare
+ * and so the performance impact shouldn't
+ * matter.
+ */
+ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ /*
+ * This is a delicate dance that we play here.
+ * This hdr might be in the ghost list so we access
+ * it to move it out of the ghost list before we
+ * initiate the read. If it's a prefetch then
+ * it won't have a callback so we'll remove the
+ * reference that arc_buf_alloc_impl() created. We
+ * do this after we've called arc_access() to
+ * avoid hitting an assert in remove_reference().
+ */
+ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
+ arc_access(hdr, hash_lock);
+ arc_hdr_alloc_abd(hdr, alloc_flags);
+ }
+
+ if (encrypted_read) {
+ ASSERT(HDR_HAS_RABD(hdr));
+ size = HDR_GET_PSIZE(hdr);
+ hdr_abd = hdr->b_crypt_hdr.b_rabd;
+ zio_flags |= ZIO_FLAG_RAW;
+ } else {
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ size = arc_hdr_size(hdr);
+ hdr_abd = hdr->b_l1hdr.b_pabd;
+
+ if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
+ zio_flags |= ZIO_FLAG_RAW_COMPRESS;
+ }
+
+ /*
+ * For authenticated bp's, we do not ask the ZIO layer
+ * to authenticate them since this will cause the entire
+ * IO to fail if the key isn't loaded. Instead, we
+ * defer authentication until arc_buf_fill(), which will
+ * verify the data when the key is available.
+ */
+ if (BP_IS_AUTHENTICATED(bp))
+ zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
+ }
+
+ if (*arc_flags & ARC_FLAG_PREFETCH &&
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+ if (*arc_flags & ARC_FLAG_L2CACHE)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ if (BP_IS_AUTHENTICATED(bp))
+ arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
+ if (BP_GET_LEVEL(bp) > 0)
+ arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
+ if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
+ ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
+
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_compressed = compressed_read;
+ acb->acb_encrypted = encrypted_read;
+ acb->acb_noauth = noauth_read;
+ acb->acb_zb = *zb;
+
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ hdr->b_l1hdr.b_acb = acb;
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+ if (HDR_HAS_L2HDR(hdr) &&
+ (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
+ devw = hdr->b_l2hdr.b_dev->l2ad_writing;
+ addr = hdr->b_l2hdr.b_daddr;
+ /*
+ * Lock out L2ARC device removal.
+ */
+ if (vdev_is_dead(vd) ||
+ !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
+ vd = NULL;
+ }
+
+ /*
+ * We count both async reads and scrub IOs as asynchronous so
+ * that both can be upgraded in the event of a cache hit while
+ * the read IO is still in-flight.
+ */
+ if (priority == ZIO_PRIORITY_ASYNC_READ ||
+ priority == ZIO_PRIORITY_SCRUB)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+ else
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+
+ /*
+ * At this point, we have a level 1 cache miss or a blkptr
+ * with embedded data. Try again in L2ARC if possible.
+ */
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
+
+ /*
+ * Skip ARC stat bump for block pointers with embedded
+ * data. The data are read from the blkptr itself via
+ * decode_embedded_bp_compressed().
+ */
+ if (!embedded_bp) {
+ DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr,
+ blkptr_t *, bp, uint64_t, lsize,
+ zbookmark_phys_t *, zb);
+ ARCSTAT_BUMP(arcstat_misses);
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
+ metadata, misses);
+ }
+
+ /* Check if the spa even has l2 configured */
+ const boolean_t spa_has_l2 = l2arc_ndev != 0 &&
+ spa->spa_l2cache.sav_count > 0;
+
+ if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) {
+ /*
+ * Read from the L2ARC if the following are true:
+ * 1. The L2ARC vdev was previously cached.
+ * 2. This buffer still has L2ARC metadata.
+ * 3. This buffer isn't currently writing to the L2ARC.
+ * 4. The L2ARC entry wasn't evicted, which may
+ * also have invalidated the vdev.
+ * 5. This isn't prefetch or l2arc_noprefetch is 0.
+ */
+ if (HDR_HAS_L2HDR(hdr) &&
+ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
+ !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
+ l2arc_read_callback_t *cb;
+ abd_t *abd;
+ uint64_t asize;
+
+ DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_hits);
+ atomic_inc_32(&hdr->b_l2hdr.b_hits);
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
+ KM_SLEEP);
+ cb->l2rcb_hdr = hdr;
+ cb->l2rcb_bp = *bp;
+ cb->l2rcb_zb = *zb;
+ cb->l2rcb_flags = zio_flags;
+
+ /*
+ * When Compressed ARC is disabled, but the
+ * L2ARC block is compressed, arc_hdr_size()
+ * will have returned LSIZE rather than PSIZE.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr) &&
+ HDR_GET_PSIZE(hdr) != 0) {
+ size = HDR_GET_PSIZE(hdr);
+ }
+
+ asize = vdev_psize_to_asize(vd, size);
+ if (asize != size) {
+ abd = abd_alloc_for_io(asize,
+ HDR_ISTYPE_METADATA(hdr));
+ cb->l2rcb_abd = abd;
+ } else {
+ abd = hdr_abd;
+ }
+
+ ASSERT(addr >= VDEV_LABEL_START_SIZE &&
+ addr + asize <= vd->vdev_psize -
+ VDEV_LABEL_END_SIZE);
+
+ /*
+ * l2arc read. The SCL_L2ARC lock will be
+ * released by l2arc_read_done().
+ * Issue a null zio if the underlying buffer
+ * was squashed to zero size by compression.
+ */
+ ASSERT3U(arc_hdr_get_compress(hdr), !=,
+ ZIO_COMPRESS_EMPTY);
+ rzio = zio_read_phys(pio, vd, addr,
+ asize, abd,
+ ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority,
+ zio_flags | ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
+ acb->acb_zio_head = rzio;
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
+ zio_t *, rzio);
+ ARCSTAT_INCR(arcstat_l2_read_bytes,
+ HDR_GET_PSIZE(hdr));
+
+ if (*arc_flags & ARC_FLAG_NOWAIT) {
+ zio_nowait(rzio);
+ goto out;
+ }
+
+ ASSERT(*arc_flags & ARC_FLAG_WAIT);
+ if (zio_wait(rzio) == 0)
+ goto out;
+
+ /* l2arc read error; goto zio_read() */
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+ } else {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ if (HDR_L2_WRITING(hdr))
+ ARCSTAT_BUMP(arcstat_l2_rw_clash);
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ }
+ } else {
+ if (vd != NULL)
+ spa_config_exit(spa, SCL_L2ARC, vd);
+
+ /*
+ * Only a spa with l2 should contribute to l2
+ * miss stats. (Including the case of having a
+ * faulted cache device - that's also a miss.)
+ */
+ if (spa_has_l2) {
+ /*
+ * Skip ARC stat bump for block pointers with
+ * embedded data. The data are read from the
+ * blkptr itself via
+ * decode_embedded_bp_compressed().
+ */
+ if (!embedded_bp) {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ }
+ }
+ }
+
+ rzio = zio_read(pio, spa, bp, hdr_abd, size,
+ arc_read_done, hdr, priority, zio_flags, zb);
+ acb->acb_zio_head = rzio;
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ if (*arc_flags & ARC_FLAG_WAIT) {
+ rc = zio_wait(rzio);
+ goto out;
+ }
+
+ ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
+ zio_nowait(rzio);
+ }
+
+out:
+ /* embedded bps don't actually go to disk */
+ if (!embedded_bp)
+ spa_read_history_add(spa, zb, *arc_flags);
+ spl_fstrans_unmark(cookie);
+ return (rc);
+}
+
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+ arc_prune_t *p;
+
+ p = kmem_alloc(sizeof (*p), KM_SLEEP);
+ p->p_pfunc = func;
+ p->p_private = private;
+ list_link_init(&p->p_node);
+ zfs_refcount_create(&p->p_refcnt);
+
+ mutex_enter(&arc_prune_mtx);
+ zfs_refcount_add(&p->p_refcnt, &arc_prune_list);
+ list_insert_head(&arc_prune_list, p);
+ mutex_exit(&arc_prune_mtx);
+
+ return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+ boolean_t wait = B_FALSE;
+ mutex_enter(&arc_prune_mtx);
+ list_remove(&arc_prune_list, p);
+ if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
+ wait = B_TRUE;
+ mutex_exit(&arc_prune_mtx);
+
+ /* wait for arc_prune_task to finish */
+ if (wait)
+ taskq_wait_outstanding(arc_prune_taskq, 0);
+ ASSERT0(zfs_refcount_count(&p->p_refcnt));
+ zfs_refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+}
+
+/*
+ * Notify the arc that a block was freed, and thus will never be used again.
+ */
+void
+arc_freed(spa_t *spa, const blkptr_t *bp)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ uint64_t guid = spa_load_guid(spa);
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ hdr = buf_hash_find(guid, bp, &hash_lock);
+ if (hdr == NULL)
+ return;
+
+ /*
+ * We might be trying to free a block that is still doing I/O
+ * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
+ * dmu_sync-ed block). If this block is being prefetched, then it
+ * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
+ * until the I/O completes. A block may also have a reference if it is
+ * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
+ * have written the new block to its final resting place on disk but
+ * without the dedup flag set. This would have left the hdr in the MRU
+ * state and discoverable. When the txg finally syncs it detects that
+ * the block was overridden in open context and issues an override I/O.
+ * Since this is a dedup block, the override I/O will determine if the
+ * block is already in the DDT. If so, then it will replace the io_bp
+ * with the bp from the DDT and allow the I/O to finish. When the I/O
+ * reaches the done callback, dbuf_write_override_done, it will
+ * check to see if the io_bp and io_bp_override are identical.
+ * If they are not, then it indicates that the bp was replaced with
+ * the bp in the DDT and the override bp is freed. This allows
+ * us to arrive here with a reference on a block that is being
+ * freed. So if we have an I/O in progress, or a reference to
+ * this hdr, then we don't destroy the hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ mutex_exit(hash_lock);
+ } else {
+ mutex_exit(hash_lock);
+ }
+
+}
+
+/*
+ * Release this buffer from the cache, making it an anonymous buffer. This
+ * must be done after a read and prior to modifying the buffer contents.
+ * If the buffer has more than one reference, we must make
+ * a new hdr for the buffer.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ /*
+ * It would be nice to assert that if it's DMU metadata (level >
+ * 0 || it's the dnode file), then it must be syncing context.
+ * But we don't know that information at this level.
+ */
+
+ mutex_enter(&buf->b_evict_lock);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ /*
+ * We don't grab the hash lock prior to this check, because if
+ * the buffer's header is in the arc_anon state, it won't be
+ * linked into the hash table.
+ */
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ ASSERT(!HDR_HAS_L2HDR(hdr));
+ ASSERT(HDR_EMPTY(hdr));
+
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ hdr->b_l1hdr.b_arc_access = 0;
+
+ /*
+ * If the buf is being overridden then it may already
+ * have a hdr that is not empty.
+ */
+ buf_discard_identity(hdr);
+ arc_buf_thaw(buf);
+
+ return;
+ }
+
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ /*
+ * This assignment is only valid as long as the hash_lock is
+ * held, we must be careful not to reference state or the
+ * b_state field after dropping the lock.
+ */
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT3P(state, !=, arc_anon);
+
+ /* this buffer is not on any list */
+ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+
+ /*
+ * We have to recheck this conditional again now that
+ * we're holding the l2ad_mtx to prevent a race with
+ * another thread which might be concurrently calling
+ * l2arc_evict(). In that case, l2arc_evict() might have
+ * destroyed the header's L2 portion as we were waiting
+ * to acquire the l2ad_mtx.
+ */
+ if (HDR_HAS_L2HDR(hdr))
+ arc_hdr_l2hdr_destroy(hdr);
+
+ mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+ }
+
+ /*
+ * Do we have more than one buf?
+ */
+ if (hdr->b_l1hdr.b_bufcnt > 1) {
+ arc_buf_hdr_t *nhdr;
+ uint64_t spa = hdr->b_spa;
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ boolean_t protected = HDR_PROTECTED(hdr);
+ enum zio_compress compress = arc_hdr_get_compress(hdr);
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ VERIFY3U(hdr->b_type, ==, type);
+
+ ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
+ (void) remove_reference(hdr, hash_lock, tag);
+
+ if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
+ ASSERT(ARC_BUF_LAST(buf));
+ }
+
+ /*
+ * Pull the data off of this hdr and attach it to
+ * a new anonymous hdr. Also find the last buffer
+ * in the hdr's buffer list.
+ */
+ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
+ ASSERT3P(lastbuf, !=, NULL);
+
+ /*
+ * If the current arc_buf_t and the hdr are sharing their data
+ * buffer, then we must stop sharing that block.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
+ VERIFY(!arc_buf_is_shared(lastbuf));
+
+ /*
+ * First, sever the block sharing relationship between
+ * buf and the arc_buf_hdr_t.
+ */
+ arc_unshare_buf(hdr, buf);
+
+ /*
+ * Now we need to recreate the hdr's b_pabd. Since we
+ * have lastbuf handy, we try to share with it, but if
+ * we can't then we allocate a new b_pabd and copy the
+ * data from buf into it.
+ */
+ if (arc_can_share(hdr, lastbuf)) {
+ arc_share_buf(hdr, lastbuf);
+ } else {
+ arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
+ abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
+ buf->b_data, psize);
+ }
+ VERIFY3P(lastbuf->b_data, !=, NULL);
+ } else if (HDR_SHARED_DATA(hdr)) {
+ /*
+ * Uncompressed shared buffers are always at the end
+ * of the list. Compressed buffers don't have the
+ * same requirements. This makes it hard to
+ * simply assert that the lastbuf is shared so
+ * we rely on the hdr's compression flags to determine
+ * if we have a compressed, shared buffer.
+ */
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
+ ASSERT(!ARC_BUF_SHARED(buf));
+ }
+
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
+ ASSERT3P(state, !=, arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_size,
+ arc_buf_size(buf), buf);
+
+ if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ ASSERT3P(state, !=, arc_l2c_only);
+ (void) zfs_refcount_remove_many(
+ &state->arcs_esize[type],
+ arc_buf_size(buf), buf);
+ }
+
+ hdr->b_l1hdr.b_bufcnt -= 1;
+ if (ARC_BUF_ENCRYPTED(buf))
+ hdr->b_crypt_hdr.b_ebufcnt -= 1;
+
+ arc_cksum_verify(buf);
+ arc_buf_unwatch(buf);
+
+ /* if this is the last uncompressed buf free the checksum */
+ if (!arc_hdr_has_uncompressed_buf(hdr))
+ arc_cksum_free(hdr);
+
+ mutex_exit(hash_lock);
+
+ /*
+ * Allocate a new hdr. The new hdr will contain a b_pabd
+ * buffer which will be freed in arc_write().
+ */
+ nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
+ compress, hdr->b_complevel, type, HDR_HAS_RABD(hdr));
+ ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(nhdr->b_l1hdr.b_bufcnt);
+ ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
+ VERIFY3U(nhdr->b_type, ==, type);
+ ASSERT(!HDR_SHARED_DATA(nhdr));
+
+ nhdr->b_l1hdr.b_buf = buf;
+ nhdr->b_l1hdr.b_bufcnt = 1;
+ if (ARC_BUF_ENCRYPTED(buf))
+ nhdr->b_crypt_hdr.b_ebufcnt = 1;
+ nhdr->b_l1hdr.b_mru_hits = 0;
+ nhdr->b_l1hdr.b_mru_ghost_hits = 0;
+ nhdr->b_l1hdr.b_mfu_hits = 0;
+ nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
+ nhdr->b_l1hdr.b_l2_hits = 0;
+ (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
+ buf->b_hdr = nhdr;
+
+ mutex_exit(&buf->b_evict_lock);
+ (void) zfs_refcount_add_many(&arc_anon->arcs_size,
+ arc_buf_size(buf), buf);
+ } else {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
+ /* protected by hash lock, or hdr is on arc_anon */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ hdr->b_l1hdr.b_mru_hits = 0;
+ hdr->b_l1hdr.b_mru_ghost_hits = 0;
+ hdr->b_l1hdr.b_mfu_hits = 0;
+ hdr->b_l1hdr.b_mfu_ghost_hits = 0;
+ hdr->b_l1hdr.b_l2_hits = 0;
+ arc_change_state(arc_anon, hdr, hash_lock);
+ hdr->b_l1hdr.b_arc_access = 0;
+
+ mutex_exit(hash_lock);
+ buf_discard_identity(hdr);
+ arc_buf_thaw(buf);
+ }
+}
+
+int
+arc_released(arc_buf_t *buf)
+{
+ int released;
+
+ mutex_enter(&buf->b_evict_lock);
+ released = (buf->b_data != NULL &&
+ buf->b_hdr->b_l1hdr.b_state == arc_anon);
+ mutex_exit(&buf->b_evict_lock);
+ return (released);
+}
+
+#ifdef ZFS_DEBUG
+int
+arc_referenced(arc_buf_t *buf)
+{
+ int referenced;
+
+ mutex_enter(&buf->b_evict_lock);
+ referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
+ mutex_exit(&buf->b_evict_lock);
+ return (referenced);
+}
+#endif
+
+static void
+arc_write_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+
+ /*
+ * If we're reexecuting this zio because the pool suspended, then
+ * cleanup any state that was previously set the first time the
+ * callback was invoked.
+ */
+ if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
+ arc_cksum_free(hdr);
+ arc_buf_unwatch(buf);
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ if (arc_buf_is_shared(buf)) {
+ arc_unshare_buf(hdr, buf);
+ } else {
+ arc_hdr_free_abd(hdr, B_FALSE);
+ }
+ }
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
+ }
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(!arc_buf_is_shared(buf));
+
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+
+ if (HDR_IO_IN_PROGRESS(hdr))
+ ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
+
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+ if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
+ hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
+
+ if (BP_IS_PROTECTED(bp)) {
+ /* ZIL blocks are written through zio_rewrite */
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
+ ASSERT(HDR_PROTECTED(hdr));
+
+ if (BP_SHOULD_BYTESWAP(bp)) {
+ if (BP_GET_LEVEL(bp) > 0) {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
+ } else {
+ hdr->b_l1hdr.b_byteswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
+ }
+ } else {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+ }
+
+ hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
+ hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
+ zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
+ hdr->b_crypt_hdr.b_iv);
+ zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
+ }
+
+ /*
+ * If this block was written for raw encryption but the zio layer
+ * ended up only authenticating it, adjust the buffer flags now.
+ */
+ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
+ arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
+ buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
+ if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+ } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
+ buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+ }
+
+ /* this must be done after the buffer flags are adjusted */
+ arc_cksum_compute(buf);
+
+ enum zio_compress compress;
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
+ compress = BP_GET_COMPRESS(bp);
+ }
+ HDR_SET_PSIZE(hdr, psize);
+ arc_hdr_set_compress(hdr, compress);
+ hdr->b_complevel = zio->io_prop.zp_complevel;
+
+ if (zio->io_error != 0 || psize == 0)
+ goto out;
+
+ /*
+ * Fill the hdr with data. If the buffer is encrypted we have no choice
+ * but to copy the data into b_rabd. If the hdr is compressed, the data
+ * we want is available from the zio, otherwise we can take it from
+ * the buf.
+ *
+ * We might be able to share the buf's data with the hdr here. However,
+ * doing so would cause the ARC to be full of linear ABDs if we write a
+ * lot of shareable data. As a compromise, we check whether scattered
+ * ABDs are allowed, and assume that if they are then the user wants
+ * the ARC to be primarily filled with them regardless of the data being
+ * written. Therefore, if they're allowed then we allocate one and copy
+ * the data into it; otherwise, we share the data directly if we can.
+ */
+ if (ARC_BUF_ENCRYPTED(buf)) {
+ ASSERT3U(psize, >, 0);
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+ arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
+ abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
+ } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
+ /*
+ * Ideally, we would always copy the io_abd into b_pabd, but the
+ * user may have disabled compressed ARC, thus we must check the
+ * hdr's compression setting rather than the io_bp's.
+ */
+ if (BP_IS_ENCRYPTED(bp)) {
+ ASSERT3U(psize, >, 0);
+ arc_hdr_alloc_abd(hdr,
+ ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
+ abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
+ } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
+ !ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3U(psize, >, 0);
+ arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
+ abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
+ } else {
+ ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
+ arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
+ abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
+ arc_buf_size(buf));
+ }
+ } else {
+ ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
+ ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+
+ arc_share_buf(hdr, buf);
+ }
+
+out:
+ arc_hdr_verify(hdr, bp);
+ spl_fstrans_unmark(cookie);
+}
+
+static void
+arc_write_children_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+
+ callback->awcb_children_ready(zio, buf, callback->awcb_private);
+}
+
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write. See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+ arc_write_callback_t *cb = zio->io_private;
+ if (cb->awcb_physdone != NULL)
+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
+static void
+arc_write_done(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+
+ if (zio->io_error == 0) {
+ arc_hdr_verify(hdr, zio->io_bp);
+
+ if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+ buf_discard_identity(hdr);
+ } else {
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+ }
+ } else {
+ ASSERT(HDR_EMPTY(hdr));
+ }
+
+ /*
+ * If the block to be written was all-zero or compressed enough to be
+ * embedded in the BP, no write was performed so there will be no
+ * dva/birth/checksum. The buffer must therefore remain anonymous
+ * (and uncached).
+ */
+ if (!HDR_EMPTY(hdr)) {
+ arc_buf_hdr_t *exists;
+ kmutex_t *hash_lock;
+
+ ASSERT3U(zio->io_error, ==, 0);
+
+ arc_cksum_verify(buf);
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists != NULL) {
+ /*
+ * This can only happen if we overwrite for
+ * sync-to-convergence, because we remove
+ * buffers from the hash table when we arc_free().
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad overwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
+ ASSERT(zfs_refcount_is_zero(
+ &exists->b_l1hdr.b_refcnt));
+ arc_change_state(arc_anon, exists, hash_lock);
+ arc_hdr_destroy(exists);
+ mutex_exit(hash_lock);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+ /* nopwrite */
+ ASSERT(zio->io_prop.zp_nopwrite);
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad nopwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
+ } else {
+ /* Dedup */
+ ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon);
+ ASSERT(BP_GET_DEDUP(zio->io_bp));
+ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+ }
+ }
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ /* if it's not anon, we are doing a scrub */
+ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ }
+
+ ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ callback->awcb_done(zio, buf, callback->awcb_private);
+
+ abd_free(zio->io_abd);
+ kmem_free(callback, sizeof (arc_write_callback_t));
+}
+
+zio_t *
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc,
+ const zio_prop_t *zp, arc_write_done_func_t *ready,
+ arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
+ arc_write_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, const zbookmark_phys_t *zb)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ arc_write_callback_t *callback;
+ zio_t *zio;
+ zio_prop_t localprop = *zp;
+
+ ASSERT3P(ready, !=, NULL);
+ ASSERT3P(done, !=, NULL);
+ ASSERT(!HDR_IO_ERROR(hdr));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
+ if (l2arc)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+
+ if (ARC_BUF_ENCRYPTED(buf)) {
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+ localprop.zp_encrypt = B_TRUE;
+ localprop.zp_compress = HDR_GET_COMPRESS(hdr);
+ localprop.zp_complevel = hdr->b_complevel;
+ localprop.zp_byteorder =
+ (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
+ ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
+ bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt,
+ ZIO_DATA_SALT_LEN);
+ bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv,
+ ZIO_DATA_IV_LEN);
+ bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac,
+ ZIO_DATA_MAC_LEN);
+ if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
+ localprop.zp_nopwrite = B_FALSE;
+ localprop.zp_copies =
+ MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
+ }
+ zio_flags |= ZIO_FLAG_RAW;
+ } else if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
+ localprop.zp_compress = HDR_GET_COMPRESS(hdr);
+ localprop.zp_complevel = hdr->b_complevel;
+ zio_flags |= ZIO_FLAG_RAW_COMPRESS;
+ }
+ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+ callback->awcb_ready = ready;
+ callback->awcb_children_ready = children_ready;
+ callback->awcb_physdone = physdone;
+ callback->awcb_done = done;
+ callback->awcb_private = private;
+ callback->awcb_buf = buf;
+
+ /*
+ * The hdr's b_pabd is now stale, free it now. A new data block
+ * will be allocated when the zio pipeline calls arc_write_ready().
+ */
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ /*
+ * If the buf is currently sharing the data block with
+ * the hdr then we need to break that relationship here.
+ * The hdr will remain with a NULL data pointer and the
+ * buf will take sole ownership of the block.
+ */
+ if (arc_buf_is_shared(buf)) {
+ arc_unshare_buf(hdr, buf);
+ } else {
+ arc_hdr_free_abd(hdr, B_FALSE);
+ }
+ VERIFY3P(buf->b_data, !=, NULL);
+ }
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
+
+ if (!(zio_flags & ZIO_FLAG_RAW))
+ arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
+
+ ASSERT(!arc_buf_is_shared(buf));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+
+ zio = zio_write(pio, spa, txg, bp,
+ abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
+ HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
+ (children_ready != NULL) ? arc_write_children_ready : NULL,
+ arc_write_physdone, arc_write_done, callback,
+ priority, zio_flags, zb);
+
+ return (zio);
+}
+
+void
+arc_tempreserve_clear(uint64_t reserve)
+{
+ atomic_add_64(&arc_tempreserve, -reserve);
+ ASSERT((int64_t)arc_tempreserve >= 0);
+}
+
+int
+arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+ int error;
+ uint64_t anon_size;
+
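+	/*
+	 * If the ARC is allowed to grow and this reservation is large
+	 * relative to the current target size, opportunistically raise
+	 * arc_c toward four times the reservation (capped at arc_c_max)
+	 * so the throttle check below does not fail needlessly.
+	 */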
+ if (!arc_no_grow &&
+ reserve > arc_c/4 &&
+ reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
+ arc_c = MIN(arc_c_max, reserve * 4);
+
+ /*
+ * Throttle when the calculated memory footprint for the TXG
+ * exceeds the target ARC size.
+ */
+ if (reserve > arc_c) {
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
+ return (SET_ERROR(ERESTART));
+ }
+
+ /*
+ * Don't count loaned bufs as in flight dirty data to prevent long
+ * network delays from blocking transactions that are ready to be
+ * assigned to a txg.
+ */
+
+ /* assert that it has not wrapped around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+
+ anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) -
+ arc_loaned_bytes), 0);
+
+ /*
+ * Writes will, almost always, require additional memory allocations
+ * in order to compress/encrypt/etc the data. We therefore need to
+ * make sure that there is sufficient available memory for this.
+ */
+ error = arc_memory_throttle(spa, reserve, txg);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Throttle writes when the amount of dirty data in the cache
+ * gets too large. We try to keep the cache less than half full
+ * of dirty blocks so that our sync times don't grow too large.
+ *
+ * In the case of one pool being built on another pool, we want
+ * to make sure we don't end up throttling the lower (backing)
+ * pool when the upper pool is the majority contributor to dirty
+	 * data. To ensure we make forward progress during throttling, we
+ * also check the current pool's net dirty data and only throttle
+ * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
+ * data in the cache.
+ *
+ * Note: if two requests come in concurrently, we might let them
+ * both succeed, when one of them should fail. Not a huge deal.
+ */
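+	/*
+	 * Illustrative example (tunable values here are hypothetical, not
+	 * defaults): with rarc_c = 4 GiB, a dirty limit of 50% and an anon
+	 * limit of 25%, we throttle only if total_dirty exceeds 2 GiB,
+	 * anon_size exceeds 1 GiB, and this pool's own dirty data exceeds
+	 * the configured percentage of anon_size.
+	 */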
+ uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
+ uint64_t spa_dirty_anon = spa_dirty_data(spa);
+ uint64_t rarc_c = arc_warm ? arc_c : arc_c_max;
+ if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 &&
+ anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 &&
+ spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
+#ifdef ZFS_DEBUG
+ uint64_t meta_esize = zfs_refcount_count(
+ &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ uint64_t data_esize =
+ zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+ dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
+ "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n",
+ arc_tempreserve >> 10, meta_esize >> 10,
+ data_esize >> 10, reserve >> 10, rarc_c >> 10);
+#endif
+ DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
+ return (SET_ERROR(ERESTART));
+ }
+ atomic_add_64(&arc_tempreserve, reserve);
+ return (0);
+}
+
+static void
+arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+ kstat_named_t *evict_data, kstat_named_t *evict_metadata)
+{
+ size->value.ui64 = zfs_refcount_count(&state->arcs_size);
+ evict_data->value.ui64 =
+ zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
+ evict_metadata->value.ui64 =
+ zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
+}
+
+static int
+arc_kstat_update(kstat_t *ksp, int rw)
+{
+ arc_stats_t *as = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE) {
+ return (SET_ERROR(EACCES));
+ } else {
+ arc_kstat_update_state(arc_anon,
+ &as->arcstat_anon_size,
+ &as->arcstat_anon_evictable_data,
+ &as->arcstat_anon_evictable_metadata);
+ arc_kstat_update_state(arc_mru,
+ &as->arcstat_mru_size,
+ &as->arcstat_mru_evictable_data,
+ &as->arcstat_mru_evictable_metadata);
+ arc_kstat_update_state(arc_mru_ghost,
+ &as->arcstat_mru_ghost_size,
+ &as->arcstat_mru_ghost_evictable_data,
+ &as->arcstat_mru_ghost_evictable_metadata);
+ arc_kstat_update_state(arc_mfu,
+ &as->arcstat_mfu_size,
+ &as->arcstat_mfu_evictable_data,
+ &as->arcstat_mfu_evictable_metadata);
+ arc_kstat_update_state(arc_mfu_ghost,
+ &as->arcstat_mfu_ghost_size,
+ &as->arcstat_mfu_ghost_evictable_data,
+ &as->arcstat_mfu_ghost_evictable_metadata);
+
+ ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
+ ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
+ ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
+ ARCSTAT(arcstat_metadata_size) =
+ aggsum_value(&astat_metadata_size);
+ ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
+ ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
+ ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
+#if defined(COMPAT_FREEBSD11)
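+		/*
+		 * Legacy FreeBSD 11 consumers expect a single "other_size"
+		 * statistic; approximate it as the sum of the bonus, dnode
+		 * and dbuf aggsums.
+		 */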
+ ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) +
+ aggsum_value(&astat_dnode_size) +
+ aggsum_value(&astat_dbuf_size);
+#endif
+ ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
+ ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
+ ARCSTAT(arcstat_abd_chunk_waste_size) =
+ aggsum_value(&astat_abd_chunk_waste_size);
+
+ as->arcstat_memory_all_bytes.value.ui64 =
+ arc_all_memory();
+ as->arcstat_memory_free_bytes.value.ui64 =
+ arc_free_memory();
+ as->arcstat_memory_available_bytes.value.i64 =
+ arc_available_memory();
+ }
+
+ return (0);
+}
+
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the ARC eviction
+ * code is laid out; arc_evict_state() assumes ARC buffers are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+static unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+ arc_buf_hdr_t *hdr = obj;
+
+ /*
+ * We rely on b_dva to generate evenly distributed index
+ * numbers using buf_hash below. So, as an added precaution,
+ * let's make sure we never add empty buffers to the arc lists.
+ */
+ ASSERT(!HDR_EMPTY(hdr));
+
+ /*
+	 * The assumption here is that the hash value for a given
+ * arc_buf_hdr_t will remain constant throughout its lifetime
+ * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+ * Thus, we don't need to store the header's sublist index
+ * on insertion, as this index can be recalculated on removal.
+ *
+ * Also, the low order bits of the hash value are thought to be
+ * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power of two number of sublists, each sublist's usage
+ * would not be evenly distributed.
+ */
+ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+ multilist_get_num_sublists(ml));
+}
+
+#define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \
+ if ((do_warn) && (tuning) && ((tuning) != (value))) { \
+ cmn_err(CE_WARN, \
+ "ignoring tunable %s (using %llu instead)", \
+ (#tuning), (value)); \
+ } \
+} while (0)
+
+/*
+ * Called during module initialization and periodically thereafter to
+ * apply reasonable changes to the exposed performance tunings. Can also be
+ * called explicitly by param_set_arc_*() functions when ARC tunables are
+ * updated manually. Non-zero zfs_* values which differ from the currently set
+ * values will be applied.
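+ *
+ * For example (illustrative): on Linux these values typically arrive as
+ * zfs module parameters such as zfs_arc_max, while on FreeBSD they are
+ * set through vfs.zfs.* sysctls; either path is applied here.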
+ */
+void
+arc_tuning_update(boolean_t verbose)
+{
+ uint64_t allmem = arc_all_memory();
+ unsigned long limit;
+
+ /* Valid range: 32M - <arc_c_max> */
+ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
+ (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
+ (zfs_arc_min <= arc_c_max)) {
+ arc_c_min = zfs_arc_min;
+ arc_c = MAX(arc_c, arc_c_min);
+ }
+ WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose);
+
+ /* Valid range: 64M - <all physical memory> */
+ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
+ (zfs_arc_max >= 64 << 20) && (zfs_arc_max < allmem) &&
+ (zfs_arc_max > arc_c_min)) {
+ arc_c_max = zfs_arc_max;
+ arc_c = MIN(arc_c, arc_c_max);
+ arc_p = (arc_c >> 1);
+ if (arc_meta_limit > arc_c_max)
+ arc_meta_limit = arc_c_max;
+ if (arc_dnode_size_limit > arc_meta_limit)
+ arc_dnode_size_limit = arc_meta_limit;
+ }
+ WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose);
+
+ /* Valid range: 16M - <arc_c_max> */
+ if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
+ (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
+ (zfs_arc_meta_min <= arc_c_max)) {
+ arc_meta_min = zfs_arc_meta_min;
+ if (arc_meta_limit < arc_meta_min)
+ arc_meta_limit = arc_meta_min;
+ if (arc_dnode_size_limit < arc_meta_min)
+ arc_dnode_size_limit = arc_meta_min;
+ }
+ WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose);
+
+ /* Valid range: <arc_meta_min> - <arc_c_max> */
+ limit = zfs_arc_meta_limit ? zfs_arc_meta_limit :
+ MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100;
+ if ((limit != arc_meta_limit) &&
+ (limit >= arc_meta_min) &&
+ (limit <= arc_c_max))
+ arc_meta_limit = limit;
+ WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose);
+
+ /* Valid range: <arc_meta_min> - <arc_meta_limit> */
+ limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
+ MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100;
+ if ((limit != arc_dnode_size_limit) &&
+ (limit >= arc_meta_min) &&
+ (limit <= arc_meta_limit))
+ arc_dnode_size_limit = limit;
+ WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit,
+ verbose);
+
+ /* Valid range: 1 - N */
+ if (zfs_arc_grow_retry)
+ arc_grow_retry = zfs_arc_grow_retry;
+
+ /* Valid range: 1 - N */
+ if (zfs_arc_shrink_shift) {
+ arc_shrink_shift = zfs_arc_shrink_shift;
+ arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
+ }
+
+ /* Valid range: 1 - N */
+ if (zfs_arc_p_min_shift)
+ arc_p_min_shift = zfs_arc_p_min_shift;
+
+ /* Valid range: 1 - N ms */
+ if (zfs_arc_min_prefetch_ms)
+ arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
+
+ /* Valid range: 1 - N ms */
+ if (zfs_arc_min_prescient_prefetch_ms) {
+ arc_min_prescient_prefetch_ms =
+ zfs_arc_min_prescient_prefetch_ms;
+ }
+
+ /* Valid range: 0 - 100 */
+ if ((zfs_arc_lotsfree_percent >= 0) &&
+ (zfs_arc_lotsfree_percent <= 100))
+ arc_lotsfree_percent = zfs_arc_lotsfree_percent;
+ WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent,
+ verbose);
+
+ /* Valid range: 0 - <all physical memory> */
+ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
+ arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem);
+ WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose);
+}
+
+static void
+arc_state_init(void)
+{
+ arc_anon = &ARC_anon;
+ arc_mru = &ARC_mru;
+ arc_mru_ghost = &ARC_mru_ghost;
+ arc_mfu = &ARC_mfu;
+ arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_l2c_only = &ARC_l2c_only;
+
+ arc_mru->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+
+ zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+ zfs_refcount_create(&arc_anon->arcs_size);
+ zfs_refcount_create(&arc_mru->arcs_size);
+ zfs_refcount_create(&arc_mru_ghost->arcs_size);
+ zfs_refcount_create(&arc_mfu->arcs_size);
+ zfs_refcount_create(&arc_mfu_ghost->arcs_size);
+ zfs_refcount_create(&arc_l2c_only->arcs_size);
+
+ aggsum_init(&arc_meta_used, 0);
+ aggsum_init(&arc_size, 0);
+ aggsum_init(&astat_data_size, 0);
+ aggsum_init(&astat_metadata_size, 0);
+ aggsum_init(&astat_hdr_size, 0);
+ aggsum_init(&astat_l2_hdr_size, 0);
+ aggsum_init(&astat_bonus_size, 0);
+ aggsum_init(&astat_dnode_size, 0);
+ aggsum_init(&astat_dbuf_size, 0);
+ aggsum_init(&astat_abd_chunk_waste_size, 0);
+
+ arc_anon->arcs_state = ARC_STATE_ANON;
+ arc_mru->arcs_state = ARC_STATE_MRU;
+ arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
+ arc_mfu->arcs_state = ARC_STATE_MFU;
+ arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
+ arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
+}
+
+static void
+arc_state_fini(void)
+{
+ zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+ zfs_refcount_destroy(&arc_anon->arcs_size);
+ zfs_refcount_destroy(&arc_mru->arcs_size);
+ zfs_refcount_destroy(&arc_mru_ghost->arcs_size);
+ zfs_refcount_destroy(&arc_mfu->arcs_size);
+ zfs_refcount_destroy(&arc_mfu_ghost->arcs_size);
+ zfs_refcount_destroy(&arc_l2c_only->arcs_size);
+
+ multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
+
+ aggsum_fini(&arc_meta_used);
+ aggsum_fini(&arc_size);
+ aggsum_fini(&astat_data_size);
+ aggsum_fini(&astat_metadata_size);
+ aggsum_fini(&astat_hdr_size);
+ aggsum_fini(&astat_l2_hdr_size);
+ aggsum_fini(&astat_bonus_size);
+ aggsum_fini(&astat_dnode_size);
+ aggsum_fini(&astat_dbuf_size);
+ aggsum_fini(&astat_abd_chunk_waste_size);
+}
+
+uint64_t
+arc_target_bytes(void)
+{
+ return (arc_c);
+}
+
+void
+arc_set_limits(uint64_t allmem)
+{
+ /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
+ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
+
+ /* How to set default max varies by platform. */
+ arc_c_max = arc_default_max(arc_c_min, allmem);
+}
+
+void
+arc_init(void)
+{
+ uint64_t percent, allmem = arc_all_memory();
+ mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
+ offsetof(arc_evict_waiter_t, aew_node));
+
+ arc_min_prefetch_ms = 1000;
+ arc_min_prescient_prefetch_ms = 6000;
+
+#if defined(_KERNEL)
+ arc_lowmem_init();
+#endif
+
+ arc_set_limits(allmem);
+
+#ifndef _KERNEL
+ /*
+ * In userland, there's only the memory pressure that we artificially
+ * create (see arc_available_memory()). Don't let arc_c get too
+ * small, because it can cause transactions to be larger than
+ * arc_c, causing arc_tempreserve_space() to fail.
+ */
+ arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
+#endif
+
+ arc_c = arc_c_min;
+ arc_p = (arc_c >> 1);
+
+ /* Set min to 1/2 of arc_c_min */
+ arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
+ /* Initialize maximum observed usage to zero */
+ arc_meta_max = 0;
+ /*
+ * Set arc_meta_limit to a percent of arc_c_max with a floor of
+ * arc_meta_min, and a ceiling of arc_c_max.
+ */
+ percent = MIN(zfs_arc_meta_limit_percent, 100);
+ arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100);
+ percent = MIN(zfs_arc_dnode_limit_percent, 100);
+ arc_dnode_size_limit = (percent * arc_meta_limit) / 100;
+
+ /* Apply user specified tunings */
+ arc_tuning_update(B_TRUE);
+
+	/* if kmem_flags are set, let's try to use less memory */
+ if (kmem_debugging())
+ arc_c = arc_c / 2;
+ if (arc_c < arc_c_min)
+ arc_c = arc_c_min;
+
+ arc_register_hotplug();
+
+ arc_state_init();
+
+ buf_init();
+
+ list_create(&arc_prune_list, sizeof (arc_prune_t),
+ offsetof(arc_prune_t, p_node));
+ mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri,
+ boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+ TASKQ_THREADS_CPU_PCT);
+
+ arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+
+ if (arc_ksp != NULL) {
+ arc_ksp->ks_data = &arc_stats;
+ arc_ksp->ks_update = arc_kstat_update;
+ kstat_install(arc_ksp);
+ }
+
+ arc_evict_zthr = zthr_create("arc_evict",
+ arc_evict_cb_check, arc_evict_cb, NULL);
+ arc_reap_zthr = zthr_create_timer("arc_reap",
+ arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1));
+
+ arc_warm = B_FALSE;
+
+ /*
+ * Calculate maximum amount of dirty data per pool.
+ *
+ * If it has been set by a module parameter, take that.
+ * Otherwise, use a percentage of physical memory defined by
+ * zfs_dirty_data_max_percent (default 10%) with a cap at
+ * zfs_dirty_data_max_max (default 4G or 25% of physical memory).
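+	 *
+	 * As an illustration, a 64 GiB system using the 10% default would
+	 * suggest roughly 6.4 GiB, which the 4 GiB cap (on 64-bit kernels)
+	 * then reduces to 4 GiB.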
+ */
+#ifdef __LP64__
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
+ allmem * zfs_dirty_data_max_max_percent / 100);
+#else
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
+ allmem * zfs_dirty_data_max_max_percent / 100);
+#endif
+
+ if (zfs_dirty_data_max == 0) {
+ zfs_dirty_data_max = allmem *
+ zfs_dirty_data_max_percent / 100;
+ zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+ zfs_dirty_data_max_max);
+ }
+}
+
+void
+arc_fini(void)
+{
+ arc_prune_t *p;
+
+#ifdef _KERNEL
+ arc_lowmem_fini();
+#endif /* _KERNEL */
+
+ /* Use B_TRUE to ensure *all* buffers are evicted */
+ arc_flush(NULL, B_TRUE);
+
+ if (arc_ksp != NULL) {
+ kstat_delete(arc_ksp);
+ arc_ksp = NULL;
+ }
+
+ taskq_wait(arc_prune_taskq);
+ taskq_destroy(arc_prune_taskq);
+
+ mutex_enter(&arc_prune_mtx);
+ while ((p = list_head(&arc_prune_list)) != NULL) {
+ list_remove(&arc_prune_list, p);
+ zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
+ zfs_refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+
+ list_destroy(&arc_prune_list);
+ mutex_destroy(&arc_prune_mtx);
+
+ (void) zthr_cancel(arc_evict_zthr);
+ (void) zthr_cancel(arc_reap_zthr);
+
+ mutex_destroy(&arc_evict_lock);
+ list_destroy(&arc_evict_waiters);
+
+ /*
+ * Free any buffers that were tagged for destruction. This needs
+ * to occur before arc_state_fini() runs and destroys the aggsum
+ * values which are updated when freeing scatter ABDs.
+ */
+ l2arc_do_free_on_write();
+
+ /*
+	 * buf_fini() must precede arc_state_fini() because buf_fini() may
+	 * trigger the release of kmem magazines, which can call back to
+	 * arc_space_return(), which accesses aggsums freed in
+	 * arc_state_fini().
+ */
+ buf_fini();
+ arc_state_fini();
+
+ arc_unregister_hotplug();
+
+ /*
+ * We destroy the zthrs after all the ARC state has been
+ * torn down to avoid the case of them receiving any
+ * wakeup() signals after they are destroyed.
+ */
+ zthr_destroy(arc_evict_zthr);
+ zthr_destroy(arc_reap_zthr);
+
+ ASSERT0(arc_loaned_bytes);
+}
+
+/*
+ * Level 2 ARC
+ *
+ * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
+ * It uses dedicated storage devices to hold cached data, which are populated
+ * using large infrequent writes. The main role of this cache is to boost
+ * the performance of random read workloads. The intended L2ARC devices
+ * include short-stroked disks, solid state disks, and other media with
+ * substantially faster read latency than disk.
+ *
+ * +-----------------------+
+ * | ARC |
+ * +-----------------------+
+ * | ^ ^
+ * | | |
+ * l2arc_feed_thread() arc_read()
+ * | | |
+ * | l2arc read |
+ * V | |
+ * +---------------+ |
+ * | L2ARC | |
+ * +---------------+ |
+ * | ^ |
+ * l2arc_write() | |
+ * | | |
+ * V | |
+ * +-------+ +-------+
+ * | vdev | | vdev |
+ * | cache | | cache |
+ * +-------+ +-------+
+ * +=========+ .-----.
+ * : L2ARC : |-_____-|
+ * : devices : | Disks |
+ * +=========+ `-_____-'
+ *
+ * Read requests are satisfied from the following sources, in order:
+ *
+ * 1) ARC
+ * 2) vdev cache of L2ARC devices
+ * 3) L2ARC devices
+ * 4) vdev cache of disks
+ * 5) disks
+ *
+ * Some L2ARC device types exhibit extremely slow write performance.
+ * To accommodate this, there are some significant differences between
+ * the L2ARC and traditional cache design:
+ *
+ * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
+ * the ARC behave as usual, freeing buffers and placing headers on ghost
+ * lists. The ARC does not send buffers to the L2ARC during eviction as
+ * this would add inflated write latencies for all ARC memory pressure.
+ *
+ * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
+ * It does this by periodically scanning buffers from the eviction-end of
+ * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. If a compressible buffer is
+ * found during scanning and selected for writing to an L2ARC device, we
+ * temporarily boost scanning headroom during the next scan cycle to make
+ * sure we adapt to compression effects (which might significantly reduce
+ * the data volume we write to L2ARC). The thread that does this is
+ * l2arc_feed_thread(), illustrated below; example sizes are included to
+ * provide a better sense of ratio than this diagram:
+ *
+ * head --> tail
+ * +---------------------+----------+
+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
+ * +---------------------+----------+ | o L2ARC eligible
+ * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
+ * +---------------------+----------+ |
+ * 15.9 Gbytes ^ 32 Mbytes |
+ * headroom |
+ * l2arc_feed_thread()
+ * |
+ * l2arc write hand <--[oooo]--'
+ * | 8 Mbyte
+ * | write max
+ * V
+ * +==============================+
+ * L2ARC dev |####|#|###|###| |####| ... |
+ * +==============================+
+ * 32 Gbytes
+ *
+ * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
+ * evicted, then the L2ARC has cached a buffer much sooner than it probably
+ * needed to, potentially wasting L2ARC device bandwidth and storage. It is
+ * safe to say that this is an uncommon case, since buffers at the end of
+ * the ARC lists have moved there due to inactivity.
+ *
+ * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
+ * then the L2ARC simply misses copying some buffers. This serves as a
+ * pressure valve to prevent heavy read workloads from both stalling the ARC
+ * with waits and clogging the L2ARC with writes. This also helps prevent
+ * the potential for the L2ARC to churn if it attempts to cache content too
+ * quickly, such as during backups of the entire pool.
+ *
+ * 5. After system boot and before the ARC has filled main memory, there are
+ * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
+ * lists can remain mostly static. Instead of searching from the tail of these
+ * lists as pictured, the l2arc_feed_thread() will search from the list heads
+ * for eligible buffers, greatly increasing its chance of finding them.
+ *
+ * The L2ARC device write speed is also boosted during this time so that
+ * the L2ARC warms up faster. Since there have been no ARC evictions yet,
+ * there are no L2ARC reads, and no fear of degrading read performance
+ * through increased writes.
+ *
+ * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ * the vdev queue can aggregate them into larger and fewer writes. Each
+ * device is written to in a rotor fashion, sweeping writes through
+ * available space then repeating.
+ *
+ * 7. The L2ARC does not store dirty content. It never needs to flush
+ * write buffers back to disk based storage.
+ *
+ * 8. If an ARC buffer is written (and dirtied) which also exists in the
+ * L2ARC, the now stale L2ARC buffer is immediately dropped.
+ *
+ * The performance of the L2ARC can be tweaked by a number of tunables, which
+ * may be necessary for different workloads:
+ *
+ * l2arc_write_max max write bytes per interval
+ * l2arc_write_boost extra write bytes during device warmup
+ * l2arc_noprefetch skip caching prefetched buffers
+ * l2arc_headroom number of max device writes to precache
+ * l2arc_headroom_boost when we find compressed buffers during ARC
+ * scanning, we multiply headroom by this
+ * percentage factor for the next scan cycle,
+ * since more compressed buffers are likely to
+ * be present
+ * l2arc_feed_secs seconds between L2ARC writing
+ *
+ * Tunables may be removed or added as future performance improvements are
+ * integrated, and also may become zpool properties.
+ *
+ * There are three key functions that control how the L2ARC warms up:
+ *
+ * l2arc_write_eligible() check if a buffer is eligible to cache
+ * l2arc_write_size() calculate how much to write
+ * l2arc_write_interval() calculate sleep delay between writes
+ *
+ * These three functions determine what to write, how much, and how quickly
+ * to send writes.
+ *
+ * L2ARC persistence:
+ *
+ * When writing buffers to L2ARC, we periodically add some metadata to
+ * make sure we can pick them up after reboot, thus dramatically reducing
+ * the impact that any downtime has on the performance of storage systems
+ * with large caches.
+ *
+ * The implementation works fairly simply by integrating the following two
+ * modifications:
+ *
+ * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
+ * which is an additional piece of metadata which describes what's been
+ * written. This allows us to rebuild the arc_buf_hdr_t structures of the
+ * main ARC buffers. There are 2 linked-lists of log blocks headed by
+ * dh_start_lbps[2]. We alternate which chain we append to, so they are
+ * time-wise and offset-wise interleaved, but that is an optimization rather
+ * than for correctness. The log block also includes a pointer to the
+ * previous block in its chain.
+ *
+ * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
+ * for our header bookkeeping purposes. This contains a device header,
+ * which contains our top-level reference structures. We update it each
+ * time we write a new log block, so that we're able to locate it in the
+ * L2ARC device. If this write results in an inconsistent device header
+ * (e.g. due to power failure), we detect this by verifying the header's
+ * checksum and simply fail to reconstruct the L2ARC after reboot.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * | ___two newest log block pointers__.__________ |
+ * | / \dh_start_lbps[1] |
+ * | / \ \dh_start_lbps[0]|
+ * |.___/__. V V |
+ * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
+ * || hdr| ^ /^ /^ / / |
+ * |+------+ ...--\-------/ \-----/--\------/ / |
+ * | \--------------/ \--------------/ |
+ * +======================================================================+
+ *
+ * As can be seen in the diagram, rather than using a simple linked list,
+ * we use a pair of linked lists with alternating elements. This is a
+ * performance enhancement: we only learn the address of the next log
+ * block once the current block has been completely read in, so a single
+ * chain would keep the device's I/O queue only one operation deep and
+ * incur a large amount of I/O round-trip latency. Having two lists
+ * allows us to fetch two log blocks ahead of where we are currently
+ * rebuilding L2ARC buffers.
+ *
+ * On-device data structures:
+ *
+ * L2ARC device header: l2arc_dev_hdr_phys_t
+ * L2ARC log block: l2arc_log_blk_phys_t
+ *
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and simply writing new data over them (writing
+ * a new log block every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed log block (and its referenced data buffers), like so:
+ *
+ * current write head__ __old tail
+ * \ /
+ * V V
+ * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
+ * ^ ^^^^^^^^^___________________________________
+ * | \
+ * <<nextwrite>> may overwrite this blk and/or its bufs --'
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process (see l2arc_rebuild).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot update blocks which we've already written to amend
+ * them to remove buffers which were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for. So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA together
+ * with the birth TXG uniquely identifies a block in space and time - once
+ * created, a block is immutable on disk. The worst we will have done is
+ * waste some time and memory at l2arc rebuild reconstructing outdated ARC
+ * entries that will get dropped from the l2arc as it is being updated
+ * with new blocks.
+ *
+ * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
+ * hand are not restored. This is done by saving the offset (in bytes)
+ * l2arc_evict() has evicted to in the L2ARC device header and taking it
+ * into account when restoring buffers.
+ */
+
+static boolean_t
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
+{
+ /*
+ * A buffer is *not* eligible for the L2ARC if it:
+ * 1. belongs to a different spa.
+ * 2. is already cached on the L2ARC.
+ * 3. has an I/O in progress (it may be an incomplete read).
+ * 4. is flagged not eligible (zfs property).
+ */
+ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
+ HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+static uint64_t
+l2arc_write_size(l2arc_dev_t *dev)
+{
+ uint64_t size, dev_size, tsize;
+
+ /*
+ * Make sure our globals have meaningful values in case the user
+ * altered them.
+ */
+ size = l2arc_write_max;
+ if (size == 0) {
+ cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
+ "be greater than zero, resetting it to the default (%d)",
+ L2ARC_WRITE_SIZE);
+ size = l2arc_write_max = L2ARC_WRITE_SIZE;
+ }
+
+ if (arc_warm == B_FALSE)
+ size += l2arc_write_boost;
+
+ /*
+ * Make sure the write size does not exceed the size of the cache
+	 * device. This is important in l2arc_evict(); otherwise infinite
+ * iteration can occur.
+ */
+ dev_size = dev->l2ad_end - dev->l2ad_start;
+ tsize = size + l2arc_log_blk_overhead(size, dev);
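+	/*
+	 * If the device supports TRIM and trim-ahead is enabled, also
+	 * reserve room for it: at least 64 MiB, or l2arc_trim_ahead
+	 * percent of the projected footprint, whichever is larger (e.g.
+	 * a hypothetical l2arc_trim_ahead of 100 would at least double
+	 * tsize).
+	 */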
+ if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0)
+ tsize += MAX(64 * 1024 * 1024,
+ (tsize * l2arc_trim_ahead) / 100);
+
+ if (tsize >= dev_size) {
+ cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
+ "plus the overhead of log blocks (persistent L2ARC, "
+ "%llu bytes) exceeds the size of the cache device "
+ "(guid %llu), resetting them to the default (%d)",
+ l2arc_log_blk_overhead(size, dev),
+ dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
+ size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
+
+ if (arc_warm == B_FALSE)
+ size += l2arc_write_boost;
+ }
+
+ return (size);
+}
+
+static clock_t
+l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
+{
+ clock_t interval, next, now;
+
+ /*
+ * If the ARC lists are busy, increase our write rate; if the
+ * lists are stale, idle back. This is achieved by checking
+ * how much we previously wrote - if it was more than half of
+ * what we wanted, schedule the next write much sooner.
+ */
+ if (l2arc_feed_again && wrote > (wanted / 2))
+ interval = (hz * l2arc_feed_min_ms) / 1000;
+ else
+ interval = hz * l2arc_feed_secs;
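+	/*
+	 * Illustrative example (hypothetical values): with hz = 1000,
+	 * l2arc_feed_min_ms = 200 and l2arc_feed_secs = 1, a productive
+	 * pass (wrote more than half of wanted) schedules the next write
+	 * 200 ticks out instead of 1000.
+	 */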
+
+ now = ddi_get_lbolt();
+ next = MAX(now, MIN(now + interval, began + interval));
+
+ return (next);
+}
+
+/*
+ * Cycle through L2ARC devices. This is how L2ARC load balances.
+ * If a device is returned, this also returns holding the spa config lock.
+ */
+static l2arc_dev_t *
+l2arc_dev_get_next(void)
+{
+ l2arc_dev_t *first, *next = NULL;
+
+ /*
+ * Lock out the removal of spas (spa_namespace_lock), then removal
+ * of cache devices (l2arc_dev_mtx). Once a device has been selected,
+ * both locks will be dropped and a spa config lock held instead.
+ */
+ mutex_enter(&spa_namespace_lock);
+ mutex_enter(&l2arc_dev_mtx);
+
+ /* if there are no vdevs, there is nothing to do */
+ if (l2arc_ndev == 0)
+ goto out;
+
+ first = NULL;
+ next = l2arc_dev_last;
+ do {
+ /* loop around the list looking for a non-faulted vdev */
+ if (next == NULL) {
+ next = list_head(l2arc_dev_list);
+ } else {
+ next = list_next(l2arc_dev_list, next);
+ if (next == NULL)
+ next = list_head(l2arc_dev_list);
+ }
+
+ /* if we have come back to the start, bail out */
+ if (first == NULL)
+ first = next;
+ else if (next == first)
+ break;
+
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
+ next->l2ad_trim_all);
+
+ /* if we were unable to find any usable vdevs, return NULL */
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
+ next->l2ad_trim_all)
+ next = NULL;
+
+ l2arc_dev_last = next;
+
+out:
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Grab the config lock to prevent the 'next' device from being
+ * removed while we are writing to it.
+ */
+ if (next != NULL)
+ spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
+ mutex_exit(&spa_namespace_lock);
+
+ return (next);
+}
+
+/*
+ * Free buffers that were tagged for destruction.
+ */
+static void
+l2arc_do_free_on_write(void)
+{
+ list_t *buflist;
+ l2arc_data_free_t *df, *df_prev;
+
+ mutex_enter(&l2arc_free_on_write_mtx);
+ buflist = l2arc_free_on_write;
+
+ for (df = list_tail(buflist); df; df = df_prev) {
+ df_prev = list_prev(buflist, df);
+ ASSERT3P(df->l2df_abd, !=, NULL);
+ abd_free(df->l2df_abd);
+ list_remove(buflist, df);
+ kmem_free(df, sizeof (l2arc_data_free_t));
+ }
+
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+/*
+ * A write to a cache device has completed. Update all headers to allow
+ * reads from these buffers to begin.
+ */
+static void
+l2arc_write_done(zio_t *zio)
+{
+ l2arc_write_callback_t *cb;
+ l2arc_lb_abd_buf_t *abd_buf;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+ l2arc_dev_t *dev;
+ l2arc_dev_hdr_phys_t *l2dhdr;
+ list_t *buflist;
+ arc_buf_hdr_t *head, *hdr, *hdr_prev;
+ kmutex_t *hash_lock;
+ int64_t bytes_dropped = 0;
+
+ cb = zio->io_private;
+ ASSERT3P(cb, !=, NULL);
+ dev = cb->l2wcb_dev;
+ l2dhdr = dev->l2ad_dev_hdr;
+ ASSERT3P(dev, !=, NULL);
+ head = cb->l2wcb_head;
+ ASSERT3P(head, !=, NULL);
+ buflist = &dev->l2ad_buflist;
+ ASSERT3P(buflist, !=, NULL);
+ DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
+ l2arc_write_callback_t *, cb);
+
+ /*
+ * All writes completed, or an error was hit.
+ */
+top:
+ mutex_enter(&dev->l2ad_mtx);
+ for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
+ hdr_prev = list_prev(buflist, hdr);
+
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order
+ * the hash lock and l2ad_mtx are taken).
+ */
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. We must retry so we
+ * don't leave the ARC_FLAG_L2_WRITING bit set.
+ */
+ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
+
+ /*
+ * We don't want to rescan the headers we've
+ * already marked as having been written out, so
+ * we reinsert the head node so we can pick up
+ * where we left off.
+ */
+ list_remove(buflist, head);
+ list_insert_after(buflist, hdr, head);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * We wait for the hash lock to become available
+ * to try and prevent busy waiting, and increase
+ * the chance we'll be able to acquire the lock
+ * the next time around.
+ */
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ /*
+ * We could not have been moved into the arc_l2c_only
+ * state while in-flight due to our ARC_FLAG_L2_WRITING
+ * bit being set. Let's just ensure that's being enforced.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ /*
+ * Skipped - drop L2ARC entry and mark the header as no
+		 * longer L2 eligible.
+ */
+ if (zio->io_error != 0) {
+ /*
+ * Error - drop L2ARC entry.
+ */
+ list_remove(buflist, hdr);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ l2arc_hdr_arcstats_decrement(hdr);
+
+ bytes_dropped +=
+ vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
+ }
+
+ /*
+ * Allow ARC to begin reads and ghost list evictions to
+ * this L2ARC entry.
+ */
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
+
+ mutex_exit(hash_lock);
+ }
+
+ /*
+ * Free the allocated abd buffers for writing the log blocks.
+	 * If the zio failed, reclaim the allocated space and remove the
+ * pointers to these log blocks from the log block pointer list
+ * of the L2ARC device.
+ */
+ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
+ abd_free(abd_buf->abd);
+ zio_buf_free(abd_buf, sizeof (*abd_buf));
+ if (zio->io_error != 0) {
+ lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
+ /*
+ * L2BLK_GET_PSIZE returns aligned size for log
+ * blocks.
+ */
+ uint64_t asize =
+ L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
+ bytes_dropped += asize;
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
+ ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
+ zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
+ lb_ptr_buf);
+ zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+ kmem_free(lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+ }
+ }
+ list_destroy(&cb->l2wcb_abd_list);
+
+ if (zio->io_error != 0) {
+ ARCSTAT_BUMP(arcstat_l2_writes_error);
+
+ /*
+ * Restore the lbps array in the header to its previous state.
+ * If the list of log block pointers is empty, zero out the
+ * log block pointers in the device header.
+ */
+ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
+ for (int i = 0; i < 2; i++) {
+ if (lb_ptr_buf == NULL) {
+ /*
+ * If the list is empty zero out the device
+ * header. Otherwise zero out the second log
+ * block pointer in the header.
+ */
+ if (i == 0) {
+ bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
+ } else {
+ bzero(&l2dhdr->dh_start_lbps[i],
+ sizeof (l2arc_log_blkptr_t));
+ }
+ break;
+ }
+ bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i],
+ sizeof (l2arc_log_blkptr_t));
+ lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
+ lb_ptr_buf);
+ }
+ }
+
+ atomic_inc_64(&l2arc_writes_done);
+ list_remove(buflist, head);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
+ mutex_exit(&dev->l2ad_mtx);
+
+ ASSERT(dev->l2ad_vdev != NULL);
+ vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
+
+ l2arc_do_free_on_write();
+
+ kmem_free(cb, sizeof (l2arc_write_callback_t));
+}
+
+static int
+l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
+{
+ int ret;
+ spa_t *spa = zio->io_spa;
+ arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
+ blkptr_t *bp = zio->io_bp;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ boolean_t no_crypt = B_FALSE;
+
+ /*
+	 * ZIL data is never written to the L2ARC, so we don't need
+ * special handling for its unique MAC storage.
+ */
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ /*
+ * If the data was encrypted, decrypt it now. Note that
+ * we must check the bp here and not the hdr, since the
+ * hdr does not have its encryption parameters updated
+ * until arc_read_done().
+ */
+ if (BP_IS_ENCRYPTED(bp)) {
+ abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
+ B_TRUE);
+
+ zio_crypt_decode_params_bp(bp, salt, iv);
+ zio_crypt_decode_mac_bp(bp, mac);
+
+ ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
+ BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
+ salt, iv, mac, HDR_GET_PSIZE(hdr), eabd,
+ hdr->b_l1hdr.b_pabd, &no_crypt);
+ if (ret != 0) {
+ arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
+ goto error;
+ }
+
+ /*
+ * If we actually performed decryption, replace b_pabd
+ * with the decrypted data. Otherwise we can just throw
+ * our decryption buffer away.
+ */
+ if (!no_crypt) {
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
+ arc_hdr_size(hdr), hdr);
+ hdr->b_l1hdr.b_pabd = eabd;
+ zio->io_abd = eabd;
+ } else {
+ arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
+ }
+ }
+
+ /*
+ * If the L2ARC block was compressed, but ARC compression
+	 * is disabled, we decompress the data into a new buffer and
+ * replace the existing data.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) {
+ abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
+ B_TRUE);
+ void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
+
+ ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
+ HDR_GET_LSIZE(hdr), &hdr->b_complevel);
+ if (ret != 0) {
+ abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
+ arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
+ goto error;
+ }
+
+ abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
+ arc_hdr_size(hdr), hdr);
+ hdr->b_l1hdr.b_pabd = cabd;
+ zio->io_abd = cabd;
+ zio->io_size = HDR_GET_LSIZE(hdr);
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * A read to a cache device completed. Validate buffer contents before
+ * handing over to the regular ARC routines.
+ */
+static void
+l2arc_read_done(zio_t *zio)
+{
+ int tfm_error = 0;
+ l2arc_read_callback_t *cb = zio->io_private;
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ boolean_t valid_cksum;
+ boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
+ (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));
+
+ ASSERT3P(zio->io_vd, !=, NULL);
+ ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
+
+ spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
+
+ ASSERT3P(cb, !=, NULL);
+ hdr = cb->l2rcb_hdr;
+ ASSERT3P(hdr, !=, NULL);
+
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
+ /*
+ * If the data was read into a temporary buffer,
+ * move it and free the buffer.
+ */
+ if (cb->l2rcb_abd != NULL) {
+ ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
+ if (zio->io_error == 0) {
+ if (using_rdata) {
+ abd_copy(hdr->b_crypt_hdr.b_rabd,
+ cb->l2rcb_abd, arc_hdr_size(hdr));
+ } else {
+ abd_copy(hdr->b_l1hdr.b_pabd,
+ cb->l2rcb_abd, arc_hdr_size(hdr));
+ }
+ }
+
+ /*
+ * The following must be done regardless of whether
+ * there was an error:
+ * - free the temporary buffer
+ * - point zio to the real ARC buffer
+ * - set zio size accordingly
+ * These are required because zio is either re-used for
+ * an I/O of the block in the case of the error
+ * or the zio is passed to arc_read_done() and it
+ * needs real data.
+ */
+ abd_free(cb->l2rcb_abd);
+ zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
+
+ if (using_rdata) {
+ ASSERT(HDR_HAS_RABD(hdr));
+ zio->io_abd = zio->io_orig_abd =
+ hdr->b_crypt_hdr.b_rabd;
+ } else {
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
+ }
+ }
+
+ ASSERT3P(zio->io_abd, !=, NULL);
+
+ /*
+ * Check this survived the L2ARC journey.
+ */
+ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
+ (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
+ zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
+ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
+ zio->io_prop.zp_complevel = hdr->b_complevel;
+
+ valid_cksum = arc_cksum_is_equal(hdr, zio);
+
+ /*
+ * b_rabd will always match the data as it exists on disk if it is
+ * being used. Therefore if we are reading into b_rabd we do not
+ * attempt to untransform the data.
+ */
+ if (valid_cksum && !using_rdata)
+ tfm_error = l2arc_untransform(zio, cb);
+
+ if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
+ !HDR_L2_EVICTED(hdr)) {
+ mutex_exit(hash_lock);
+ zio->io_private = hdr;
+ arc_read_done(zio);
+ } else {
+ /*
+ * Buffer didn't survive caching. Increment stats and
+ * reissue to the original storage device.
+ */
+ if (zio->io_error != 0) {
+ ARCSTAT_BUMP(arcstat_l2_io_error);
+ } else {
+ zio->io_error = SET_ERROR(EIO);
+ }
+ if (!valid_cksum || tfm_error != 0)
+ ARCSTAT_BUMP(arcstat_l2_cksum_bad);
+
+ /*
+ * If there's no waiter, issue an async i/o to the primary
+ * storage now. If there *is* a waiter, the caller must
+ * issue the i/o in a context where it's OK to block.
+ */
+ if (zio->io_waiter == NULL) {
+ zio_t *pio = zio_unique_parent(zio);
+ void *abd = (using_rdata) ?
+ hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
+
+ ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ zio = zio_read(pio, zio->io_spa, zio->io_bp,
+ abd, zio->io_size, arc_read_done,
+ hdr, zio->io_priority, cb->l2rcb_flags,
+ &cb->l2rcb_zb);
+
+ /*
+ * Original ZIO will be freed, so we need to update
+ * ARC header with the new ZIO pointer to be used
+ * by zio_change_priority() in arc_read().
+ */
+ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
+ acb != NULL; acb = acb->acb_next)
+ acb->acb_zio_head = zio;
+
+ mutex_exit(hash_lock);
+ zio_nowait(zio);
+ } else {
+ mutex_exit(hash_lock);
+ }
+ }
+
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
+/*
+ * This is the list priority from which the L2ARC will search for pages to
+ * cache. This is used within loops (0..3) to cycle through lists in the
+ * desired order. This order can have a significant effect on cache
+ * performance.
+ *
+ * Currently the metadata lists are hit first, MFU then MRU, followed by
+ * the data lists. This function returns a locked list, and also returns
+ * the lock pointer.
+ */
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
+{
+ multilist_t *ml = NULL;
+ unsigned int idx;
+
+ ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
+
+ switch (list_num) {
+ case 0:
+ ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 1:
+ ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 2:
+ ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
+ break;
+ case 3:
+ ml = arc_mru->arcs_list[ARC_BUFC_DATA];
+ break;
+ default:
+ return (NULL);
+ }
+
+ /*
+ * Return a randomly-selected sublist. This is acceptable
+ * because the caller feeds only a little bit of data for each
+ * call (8MB). Subsequent calls will result in different
+ * sublists being selected.
+ */
+ idx = multilist_get_random_index(ml);
+ return (multilist_sublist_lock(ml, idx));
+}
+
+/*
+ * Calculates the maximum overhead of L2ARC metadata log blocks for a given
+ * L2ARC write size. l2arc_evict and l2arc_write_size need to include this
+ * overhead in processing to make sure there is enough headroom available
+ * when writing buffers.
+ */
+static inline uint64_t
+l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
+{
+ if (dev->l2ad_log_entries == 0) {
+ return (0);
+ } else {
+ uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
+
+ uint64_t log_blocks = (log_entries +
+ dev->l2ad_log_entries - 1) /
+ dev->l2ad_log_entries;
+
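+		/*
+		 * Illustrative example (hypothetical entry count): with
+		 * 1022 entries per log block, an 8 MiB write (16384
+		 * 512-byte units) would round up to 17 log blocks of
+		 * overhead.
+		 */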
+ return (vdev_psize_to_asize(dev->l2ad_vdev,
+ sizeof (l2arc_log_blk_phys_t)) * log_blocks);
+ }
+}
+
+/*
+ * Evict buffers from the device write hand to the distance specified in
+ * bytes. This distance may span populated buffers, or it may span nothing.
+ * This is clearing a region on the L2ARC device ready for writing.
+ * If the 'all' boolean is set, every buffer is evicted.
+ */
+static void
+l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
+{
+ list_t *buflist;
+ arc_buf_hdr_t *hdr, *hdr_prev;
+ kmutex_t *hash_lock;
+ uint64_t taddr;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
+ vdev_t *vd = dev->l2ad_vdev;
+ boolean_t rerun;
+
+ buflist = &dev->l2ad_buflist;
+
+ /*
+ * We need to add in the worst case scenario of log block overhead.
+ */
+ distance += l2arc_log_blk_overhead(distance, dev);
+ if (vd->vdev_has_trim && l2arc_trim_ahead > 0) {
+ /*
+ * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
+ * times the write size, whichever is greater.
+ */
+ distance += MAX(64 * 1024 * 1024,
+ (distance * l2arc_trim_ahead) / 100);
+ }
+
+top:
+ rerun = B_FALSE;
+ if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
+ /*
+ * When there is no space to accommodate upcoming writes,
+ * evict to the end. Then bump the write and evict hands
+ * to the start and iterate. This iteration does not
+ * happen indefinitely as we make sure in
+ * l2arc_write_size() that when the write hand is reset,
+ * the write size does not exceed the end of the device.
+ */
+ rerun = B_TRUE;
+ taddr = dev->l2ad_end;
+ } else {
+ taddr = dev->l2ad_hand + distance;
+ }
+ DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
+ uint64_t, taddr, boolean_t, all);
+
+ if (!all) {
+ /*
+ * This check has to be placed after deciding whether to
+ * iterate (rerun).
+ */
+ if (dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+			 * nothing to evict. We have already trimmed the
+ * whole device.
+ */
+ goto out;
+ } else {
+ /*
+ * Trim the space to be evicted.
+ */
+ if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
+ l2arc_trim_ahead > 0) {
+ /*
+ * We have to drop the spa_config lock because
+				 * vdev_trim_simple() will acquire it.
+				 * l2ad_evict already accounts for the label
+				 * size. To prevent the trim from adding it
+				 * again, we subtract it from
+ * l2ad_evict.
+ */
+ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
+ vdev_trim_simple(vd,
+ dev->l2ad_evict - VDEV_LABEL_START_SIZE,
+ taddr - dev->l2ad_evict);
+ spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
+ RW_READER);
+ }
+
+ /*
+ * When rebuilding L2ARC we retrieve the evict hand
+ * from the header of the device. Of note, l2arc_evict()
+ * does not actually delete buffers from the cache
+ * device, but trimming may do so depending on the
+ * hardware implementation. Thus keeping track of the
+ * evict hand is useful.
+ */
+ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+ }
+ }
+
+retry:
+ mutex_enter(&dev->l2ad_mtx);
+ /*
+ * We have to account for evicted log blocks. Run vdev_space_update()
+ * on log blocks whose offset (in bytes) is before the evicted offset
+ * (in bytes) by searching in the list of pointers to log blocks
+ * present in the L2ARC device.
+ */
+ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
+ lb_ptr_buf = lb_ptr_buf_prev) {
+
+ lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
+
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ uint64_t asize = L2BLK_GET_PSIZE(
+ (lb_ptr_buf->lb_ptr)->lbp_prop);
+
+ /*
+		 * We don't worry about log blocks left behind (i.e.
+ * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
+ * will never write more than l2arc_evict() evicts.
+ */
+ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
+ break;
+ } else {
+ vdev_space_update(vd, -asize, 0, 0);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
+ ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
+ zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
+ lb_ptr_buf);
+ zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+ list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ kmem_free(lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+ }
+ }
+
+ for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
+ hdr_prev = list_prev(buflist, hdr);
+
+ ASSERT(!HDR_EMPTY(hdr));
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order
+ * the hash lock and l2ad_mtx are taken).
+ */
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. Retry.
+ */
+ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
+ mutex_exit(&dev->l2ad_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto retry;
+ }
+
+ /*
+		 * A header can't be on this list if it doesn't have an
+		 * L2 header.
+ */
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ /* Ensure this header has finished being written. */
+ ASSERT(!HDR_L2_WRITING(hdr));
+ ASSERT(!HDR_L2_WRITE_HEAD(hdr));
+
+ if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
+ hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
+ /*
+ * We've evicted to the target address,
+ * or the end of the device.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (!HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!HDR_L2_READING(hdr));
+ /*
+ * This doesn't exist in the ARC. Destroy.
+ * arc_hdr_destroy() will call list_remove()
+ * and decrement arcstat_l2_lsize.
+ */
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ } else {
+ ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+ ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
+ /*
+ * Invalidate issued or about to be issued
+ * reads, since we may be about to write
+ * over this location.
+ */
+ if (HDR_L2_READING(hdr)) {
+ ARCSTAT_BUMP(arcstat_l2_evict_reading);
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
+ }
+
+ arc_hdr_l2hdr_destroy(hdr);
+ }
+ mutex_exit(hash_lock);
+ }
+ mutex_exit(&dev->l2ad_mtx);
+
+out:
+ /*
+	 * We need to check whether we are evicting all buffers; otherwise
+	 * we may iterate unnecessarily.
+ */
+ if (!all && rerun) {
+ /*
+ * Bump device hand to the device start if it is approaching the
+ * end. l2arc_evict() has already evicted ahead for this case.
+ */
+ dev->l2ad_hand = dev->l2ad_start;
+ dev->l2ad_evict = dev->l2ad_start;
+ dev->l2ad_first = B_FALSE;
+ goto top;
+ }
+
+ if (!all) {
+ /*
+ * In case of cache device removal (all) the following
+ * assertions may be violated without functional consequences
+ * as the device is about to be removed.
+ */
+ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
+ if (!dev->l2ad_first)
+ ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
+ }
+}
+
+/*
+ * Handle any abd transforms that might be required for writing to the L2ARC.
+ * If successful, this function will always return an abd with the data
+ * transformed as it is on disk in a new abd of asize bytes.
+ */
+static int
+l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
+ abd_t **abd_out)
+{
+ int ret;
+ void *tmp = NULL;
+ abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
+ enum zio_compress compress = HDR_GET_COMPRESS(hdr);
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t size = arc_hdr_size(hdr);
+ boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
+ boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
+ dsl_crypto_key_t *dck = NULL;
+ uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
+ boolean_t no_crypt = B_FALSE;
+
+ ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) ||
+ HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
+ ASSERT3U(psize, <=, asize);
+
+ /*
+	 * If this data simply needs its own buffer, we allocate it and copy
+	 * the data. This may be done to eliminate a dependency on a
+ * shared buffer or to reallocate the buffer to match asize.
+ */
+ if (HDR_HAS_RABD(hdr) && asize != psize) {
+ ASSERT3U(asize, >=, psize);
+ to_write = abd_alloc_for_io(asize, ismd);
+ abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
+ if (psize != asize)
+ abd_zero_off(to_write, psize, asize - psize);
+ goto out;
+ }
+
+ if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
+ !HDR_ENCRYPTED(hdr)) {
+ ASSERT3U(size, ==, psize);
+ to_write = abd_alloc_for_io(asize, ismd);
+ abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
+ if (size != asize)
+ abd_zero_off(to_write, size, asize - size);
+ goto out;
+ }
+
+ if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
+ cabd = abd_alloc_for_io(asize, ismd);
+ tmp = abd_borrow_buf(cabd, asize);
+
+ psize = zio_compress_data(compress, to_write, tmp, size,
+ hdr->b_complevel);
+
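+		/*
+		 * If compression did not shrink the buffer, store it
+		 * uncompressed instead and clear the compression flag on
+		 * the header.
+		 */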
+ if (psize >= size) {
+ abd_return_buf(cabd, tmp, asize);
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+ to_write = cabd;
+ abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
+ if (size != asize)
+ abd_zero_off(to_write, size, asize - size);
+ goto encrypt;
+ }
+ ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
+ if (psize < asize)
+ bzero((char *)tmp + psize, asize - psize);
+ psize = HDR_GET_PSIZE(hdr);
+ abd_return_buf_copy(cabd, tmp, asize);
+ to_write = cabd;
+ }
+
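+	/*
+	 * Re-encrypt the data with the key, salt and IV saved in the
+	 * header so that the copy written to the L2ARC matches the
+	 * encrypted data on the main pool.
+	 */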
+encrypt:
+ if (HDR_ENCRYPTED(hdr)) {
+ eabd = abd_alloc_for_io(asize, ismd);
+
+ /*
+ * If the dataset was disowned before the buffer
+ * made it to this point, the key to re-encrypt
+ * it won't be available. In this case we simply
+ * won't write the buffer to the L2ARC.
+ */
+ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
+ FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
+ hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
+ hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
+ &no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (no_crypt)
+ abd_copy(eabd, to_write, psize);
+
+ if (psize != asize)
+ abd_zero_off(eabd, psize, asize - psize);
+
+ /* assert that the MAC we got here matches the one we saved */
+ ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ if (to_write == cabd)
+ abd_free(cabd);
+
+ to_write = eabd;
+ }
+
+out:
+ ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
+ *abd_out = to_write;
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ if (cabd != NULL)
+ abd_free(cabd);
+ if (eabd != NULL)
+ abd_free(eabd);
+
+ *abd_out = NULL;
+ return (ret);
+}
+
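+/*
+ * Completion callback for the log block fetch zio issued by
+ * l2arc_log_blk_fetch(); frees the temporary abd and the read callback.
+ */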
+static void
+l2arc_blk_fetch_done(zio_t *zio)
+{
+ l2arc_read_callback_t *cb;
+
+ cb = zio->io_private;
+ if (cb->l2rcb_abd != NULL)
+ abd_free(cb->l2rcb_abd);
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
+/*
+ * Find and write ARC buffers to the L2ARC device.
+ *
+ * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * for reading until they have completed writing.
+ * The amount of ARC data searched on each pass is limited to a headroom
+ * scaled by l2arc_headroom and, when compressed ARC is enabled, by
+ * l2arc_headroom_boost.
+ *
+ * Returns the number of bytes actually written (which may be smaller than
+ * the delta by which the device hand has changed due to alignment and the
+ * writing of log blocks).
+ */
+static uint64_t
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+{
+ arc_buf_hdr_t *hdr, *hdr_prev, *head;
+ uint64_t write_asize, write_psize, write_lsize, headroom;
+ boolean_t full;
+ l2arc_write_callback_t *cb = NULL;
+ zio_t *pio, *wzio;
+ uint64_t guid = spa_load_guid(spa);
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+
+ ASSERT3P(dev->l2ad_vdev, !=, NULL);
+
+ pio = NULL;
+ write_lsize = write_asize = write_psize = 0;
+ full = B_FALSE;
+ head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
+ arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
+
+ /*
+ * Copy buffers for L2ARC writing.
+ */
+ for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
+		/*
+		 * Passes 0-3 iterate over the MFU metadata, MRU metadata,
+		 * MFU data and MRU data lists respectively (see
+		 * l2arc_sublist_lock()). When caching MFU content only,
+		 * skip the MRU passes (1 and 3).
+		 */
+ if (l2arc_mfuonly) {
+ if (pass == 1 || pass == 3)
+ continue;
+ }
+
+ multilist_sublist_t *mls = l2arc_sublist_lock(pass);
+ uint64_t passed_sz = 0;
+
+ VERIFY3P(mls, !=, NULL);
+
+ /*
+ * L2ARC fast warmup.
+ *
+ * Until the ARC is warm and starts to evict, read from the
+ * head of the ARC lists rather than the tail.
+ */
+ if (arc_warm == B_FALSE)
+ hdr = multilist_sublist_head(mls);
+ else
+ hdr = multilist_sublist_tail(mls);
+
+ headroom = target_sz * l2arc_headroom;
+ if (zfs_compressed_arc_enabled)
+ headroom = (headroom * l2arc_headroom_boost) / 100;
+
+ for (; hdr; hdr = hdr_prev) {
+ kmutex_t *hash_lock;
+ abd_t *to_write = NULL;
+
+ if (arc_warm == B_FALSE)
+ hdr_prev = multilist_sublist_next(mls, hdr);
+ else
+ hdr_prev = multilist_sublist_prev(mls, hdr);
+
+ hash_lock = HDR_LOCK(hdr);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Skip this buffer rather than waiting.
+ */
+ continue;
+ }
+
+ passed_sz += HDR_GET_LSIZE(hdr);
+ if (l2arc_headroom != 0 && passed_sz > headroom) {
+ /*
+ * Searched too far.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (!l2arc_write_eligible(guid, hdr)) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ /*
+ * We rely on the L1 portion of the header below, so
+ * it's invalid for this header to have been evicted out
+ * of the ghost cache, prior to being written out. The
+ * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
+ ASSERT3U(arc_hdr_size(hdr), >, 0);
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
+ psize);
+
+ if ((write_asize + asize) > target_sz) {
+ full = B_TRUE;
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ /*
+ * We rely on the L1 portion of the header below, so
+ * it's invalid for this header to have been evicted out
+ * of the ghost cache, prior to being written out. The
+ * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ */
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
+ ASSERT3U(arc_hdr_size(hdr), >, 0);
+
+ /*
+ * If this header has b_rabd, we can use this since it
+ * must always match the data exactly as it exists on
+ * disk. Otherwise, the L2ARC can normally use the
+ * hdr's data, but if we're sharing data between the
+ * hdr and one of its bufs, L2ARC needs its own copy of
+ * the data so that the ZIO below can't race with the
+ * buf consumer. To ensure that this copy will be
+ * available for the lifetime of the ZIO and be cleaned
+ * up afterwards, we add it to the l2arc_free_on_write
+ * queue. If we need to apply any transforms to the
+ * data (compression, encryption) we will also need the
+ * extra buffer.
+ */
+ if (HDR_HAS_RABD(hdr) && psize == asize) {
+ to_write = hdr->b_crypt_hdr.b_rabd;
+ } else if ((HDR_COMPRESSION_ENABLED(hdr) ||
+ HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
+ !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
+ psize == asize) {
+ to_write = hdr->b_l1hdr.b_pabd;
+ } else {
+ int ret;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ ret = l2arc_apply_transforms(spa, hdr, asize,
+ &to_write);
+ if (ret != 0) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_L2_WRITING);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ l2arc_free_abd_on_write(to_write, asize, type);
+ }
+
+ if (pio == NULL) {
+ /*
+ * Insert a dummy header on the buflist so
+ * l2arc_write_done() can find where the
+ * write buffers begin without searching.
+ */
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_buflist, head);
+ mutex_exit(&dev->l2ad_mtx);
+
+ cb = kmem_alloc(
+ sizeof (l2arc_write_callback_t), KM_SLEEP);
+ cb->l2wcb_dev = dev;
+ cb->l2wcb_head = head;
+ /*
+ * Create a list to save allocated abd buffers
+ * for l2arc_log_blk_commit().
+ */
+ list_create(&cb->l2wcb_abd_list,
+ sizeof (l2arc_lb_abd_buf_t),
+ offsetof(l2arc_lb_abd_buf_t, node));
+ pio = zio_root(spa, l2arc_write_done, cb,
+ ZIO_FLAG_CANFAIL);
+ }
+
+ hdr->b_l2hdr.b_dev = dev;
+ hdr->b_l2hdr.b_hits = 0;
+
+ hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+ hdr->b_l2hdr.b_arcs_state =
+ hdr->b_l1hdr.b_state->arcs_state;
+ arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
+
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_buflist, hdr);
+ mutex_exit(&dev->l2ad_mtx);
+
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
+
+ wzio = zio_write_phys(pio, dev->l2ad_vdev,
+ hdr->b_l2hdr.b_daddr, asize, to_write,
+ ZIO_CHECKSUM_OFF, NULL, hdr,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_CANFAIL, B_FALSE);
+
+ write_lsize += HDR_GET_LSIZE(hdr);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+
+ write_psize += psize;
+ write_asize += asize;
+ dev->l2ad_hand += asize;
+ l2arc_hdr_arcstats_increment(hdr);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ mutex_exit(hash_lock);
+
+ /*
+ * Append buf info to current log and commit if full.
+ * arcstat_l2_{size,asize} kstats are updated
+ * internally.
+ */
+ if (l2arc_log_blk_insert(dev, hdr))
+ l2arc_log_blk_commit(dev, pio, cb);
+
+ zio_nowait(wzio);
+ }
+
+ multilist_sublist_unlock(mls);
+
+ if (full == B_TRUE)
+ break;
+ }
+
+ /* No buffers selected for writing? */
+ if (pio == NULL) {
+ ASSERT0(write_lsize);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
+
+ /*
+		 * Although we did not write any buffers, l2ad_evict may
+ * have advanced.
+ */
+ if (dev->l2ad_evict != l2dhdr->dh_evict)
+ l2arc_dev_hdr_update(dev);
+
+ return (0);
+ }
+
+ if (!dev->l2ad_first)
+ ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
+
+ ASSERT3U(write_asize, <=, target_sz);
+ ARCSTAT_BUMP(arcstat_l2_writes_sent);
+ ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
+
+ dev->l2ad_writing = B_TRUE;
+ (void) zio_wait(pio);
+ dev->l2ad_writing = B_FALSE;
+
+ /*
+ * Update the device header after the zio completes as
+ * l2arc_write_done() may have updated the memory holding the log block
+ * pointers in the device header.
+ */
+ l2arc_dev_hdr_update(dev);
+
+ return (write_asize);
+}
+
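+/*
+ * Returns B_TRUE when the memory consumed by L2ARC-only headers grows large
+ * enough (relative to arc_meta_limit and l2arc_meta_percent) that feeding or
+ * rebuilding the L2ARC should back off to avoid adding to memory pressure.
+ */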
+static boolean_t
+l2arc_hdr_limit_reached(void)
+{
+ int64_t s = aggsum_upper_bound(&astat_l2_hdr_size);
+
+ return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) ||
+ (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100));
+}
+
+/*
+ * This thread feeds the L2ARC at regular intervals. This is the beating
+ * heart of the L2ARC.
+ */
+/* ARGSUSED */
+static void
+l2arc_feed_thread(void *unused)
+{
+ callb_cpr_t cpr;
+ l2arc_dev_t *dev;
+ spa_t *spa;
+ uint64_t size, wrote;
+ clock_t begin, next = ddi_get_lbolt();
+ fstrans_cookie_t cookie;
+
+ CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&l2arc_feed_thr_lock);
+
+ cookie = spl_fstrans_mark();
+ while (l2arc_thread_exit == 0) {
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait_idle(&l2arc_feed_thr_cv,
+ &l2arc_feed_thr_lock, next);
+ CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
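+		/*
+		 * By default wake up again in one second; this is
+		 * overridden below, e.g. by l2arc_write_interval() once a
+		 * feed cycle completes.
+		 */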
+ next = ddi_get_lbolt() + hz;
+
+ /*
+ * Quick check for L2ARC devices.
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ if (l2arc_ndev == 0) {
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+ begin = ddi_get_lbolt();
+
+ /*
+ * This selects the next l2arc device to write to, and in
+ * doing so the next spa to feed from: dev->l2ad_spa. This
+ * will return NULL if there are now no l2arc devices or if
+ * they are all faulted.
+ *
+ * If a device is returned, its spa's config lock is also
+ * held to prevent device removal. l2arc_dev_get_next()
+ * will grab and release l2arc_dev_mtx.
+ */
+ if ((dev = l2arc_dev_get_next()) == NULL)
+ continue;
+
+ spa = dev->l2ad_spa;
+ ASSERT3P(spa, !=, NULL);
+
+ /*
+ * If the pool is read-only then force the feed thread to
+ * sleep a little longer.
+ */
+ if (!spa_writeable(spa)) {
+ next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ /*
+ * Avoid contributing to memory pressure.
+ */
+ if (l2arc_hdr_limit_reached()) {
+ ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ ARCSTAT_BUMP(arcstat_l2_feeds);
+
+ size = l2arc_write_size(dev);
+
+ /*
+ * Evict L2ARC buffers that will be overwritten.
+ */
+ l2arc_evict(dev, size, B_FALSE);
+
+ /*
+ * Write ARC buffers.
+ */
+ wrote = l2arc_write_buffers(spa, dev, size);
+
+ /*
+ * Calculate interval between writes.
+ */
+ next = l2arc_write_interval(begin, size, wrote);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ }
+ spl_fstrans_unmark(cookie);
+
+ l2arc_thread_exit = 0;
+ cv_broadcast(&l2arc_feed_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
+ thread_exit();
+}
+
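+/*
+ * Returns B_TRUE if the given vdev is currently registered as an L2ARC
+ * device.
+ */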
+boolean_t
+l2arc_vdev_present(vdev_t *vd)
+{
+ return (l2arc_vdev_get(vd) != NULL);
+}
+
+/*
+ * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
+ * the vdev_t isn't an L2ARC device.
+ */
+l2arc_dev_t *
+l2arc_vdev_get(vdev_t *vd)
+{
+ l2arc_dev_t *dev;
+
+ mutex_enter(&l2arc_dev_mtx);
+ for (dev = list_head(l2arc_dev_list); dev != NULL;
+ dev = list_next(l2arc_dev_list, dev)) {
+ if (dev->l2ad_vdev == vd)
+ break;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+
+ return (dev);
+}
+
+/*
+ * Add a vdev for use by the L2ARC. By this point the spa has already
+ * validated the vdev and opened it.
+ */
+void
+l2arc_add_vdev(spa_t *spa, vdev_t *vd)
+{
+ l2arc_dev_t *adddev;
+ uint64_t l2dhdr_asize;
+
+ ASSERT(!l2arc_vdev_present(vd));
+
+ /*
+ * Create a new l2arc device entry.
+ */
+ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
+ adddev->l2ad_spa = spa;
+ adddev->l2ad_vdev = vd;
+ /* leave extra size for an l2arc device header */
+ l2dhdr_asize = adddev->l2ad_dev_hdr_asize =
+ MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize;
+ adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
+ ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
+ adddev->l2ad_hand = adddev->l2ad_start;
+ adddev->l2ad_evict = adddev->l2ad_start;
+ adddev->l2ad_first = B_TRUE;
+ adddev->l2ad_writing = B_FALSE;
+ adddev->l2ad_trim_all = B_FALSE;
+ list_link_init(&adddev->l2ad_node);
+ adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
+
+ mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
+ /*
+ * This is a list of all ARC buffers that are still valid on the
+ * device.
+ */
+ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
+
+ /*
+ * This is a list of pointers to log blocks that are still present
+ * on the device.
+ */
+ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
+ offsetof(l2arc_lb_ptr_buf_t, node));
+
+ vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
+ zfs_refcount_create(&adddev->l2ad_alloc);
+ zfs_refcount_create(&adddev->l2ad_lb_asize);
+ zfs_refcount_create(&adddev->l2ad_lb_count);
+
+ /*
+ * Add device to global list
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ list_insert_head(l2arc_dev_list, adddev);
+ atomic_inc_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Decide if vdev is eligible for L2ARC rebuild
+ */
+ l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE);
+}
+
+void
+l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
+{
+ l2arc_dev_t *dev = NULL;
+ l2arc_dev_hdr_phys_t *l2dhdr;
+ uint64_t l2dhdr_asize;
+ spa_t *spa;
+
+ dev = l2arc_vdev_get(vd);
+ ASSERT3P(dev, !=, NULL);
+ spa = dev->l2ad_spa;
+ l2dhdr = dev->l2ad_dev_hdr;
+ l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+
+ /*
+ * The L2ARC has to hold at least the payload of one log block for
+ * them to be restored (persistent L2ARC). The payload of a log block
+	 * depends on the number of its log entries. We always write log blocks
+ * with 1022 entries. How many of them are committed or restored depends
+ * on the size of the L2ARC device. Thus the maximum payload of
+ * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device
+ * is less than that, we reduce the amount of committed and restored
+ * log entries per block so as to enable persistence.
+ */
+ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
+ dev->l2ad_log_entries = 0;
+ } else {
+ dev->l2ad_log_entries = MIN((dev->l2ad_end -
+ dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
+ L2ARC_LOG_BLK_MAX_ENTRIES);
+ }
+
+ /*
+	 * Read the device header; if an error is returned, do not rebuild L2ARC.
+ */
+ if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) {
+ /*
+ * If we are onlining a cache device (vdev_reopen) that was
+ * still present (l2arc_vdev_present()) and rebuild is enabled,
+ * we should evict all ARC buffers and pointers to log blocks
+ * and reclaim their space before restoring its contents to
+ * L2ARC.
+ */
+ if (reopen) {
+ if (!l2arc_rebuild_enabled) {
+ return;
+ } else {
+ l2arc_evict(dev, 0, B_TRUE);
+ /* start a new log block */
+ dev->l2ad_log_ent_idx = 0;
+ dev->l2ad_log_blk_payload_asize = 0;
+ dev->l2ad_log_blk_payload_start = 0;
+ }
+ }
+ /*
+ * Just mark the device as pending for a rebuild. We won't
+ * be starting a rebuild in line here as it would block pool
+ * import. Instead spa_load_impl will hand that off to an
+ * async task which will call l2arc_spa_rebuild_start.
+ */
+ dev->l2ad_rebuild = B_TRUE;
+ } else if (spa_writeable(spa)) {
+ /*
+ * In this case TRIM the whole device if l2arc_trim_ahead > 0,
+ * otherwise create a new header. We zero out the memory holding
+ * the header to reset dh_start_lbps. If we TRIM the whole
+ * device the new header will be written by
+ * vdev_trim_l2arc_thread() at the end of the TRIM to update the
+ * trim_state in the header too. When reading the header, if
+ * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
+ * we opt to TRIM the whole device again.
+ */
+ if (l2arc_trim_ahead > 0) {
+ dev->l2ad_trim_all = B_TRUE;
+ } else {
+ bzero(l2dhdr, l2dhdr_asize);
+ l2arc_dev_hdr_update(dev);
+ }
+ }
+}
+
+/*
+ * Remove a vdev from the L2ARC.
+ */
+void
+l2arc_remove_vdev(vdev_t *vd)
+{
+ l2arc_dev_t *remdev = NULL;
+
+ /*
+ * Find the device by vdev
+ */
+ remdev = l2arc_vdev_get(vd);
+ ASSERT3P(remdev, !=, NULL);
+
+ /*
+ * Cancel any ongoing or scheduled rebuild.
+ */
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (remdev->l2ad_rebuild_began == B_TRUE) {
+ remdev->l2ad_rebuild_cancel = B_TRUE;
+ while (remdev->l2ad_rebuild == B_TRUE)
+ cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+
+ /*
+ * Remove device from global list
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ list_remove(l2arc_dev_list, remdev);
+ l2arc_dev_last = NULL; /* may have been invalidated */
+ atomic_dec_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Clear all buflists and ARC references. L2ARC device flush.
+ */
+ l2arc_evict(remdev, 0, B_TRUE);
+ list_destroy(&remdev->l2ad_buflist);
+ ASSERT(list_is_empty(&remdev->l2ad_lbptr_list));
+ list_destroy(&remdev->l2ad_lbptr_list);
+ mutex_destroy(&remdev->l2ad_mtx);
+ zfs_refcount_destroy(&remdev->l2ad_alloc);
+ zfs_refcount_destroy(&remdev->l2ad_lb_asize);
+ zfs_refcount_destroy(&remdev->l2ad_lb_count);
+ kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
+ vmem_free(remdev, sizeof (l2arc_dev_t));
+}
+
+void
+l2arc_init(void)
+{
+ l2arc_thread_exit = 0;
+ l2arc_ndev = 0;
+ l2arc_writes_sent = 0;
+ l2arc_writes_done = 0;
+
+ mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ l2arc_dev_list = &L2ARC_dev_list;
+ l2arc_free_on_write = &L2ARC_free_on_write;
+ list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
+ offsetof(l2arc_dev_t, l2ad_node));
+ list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
+ offsetof(l2arc_data_free_t, l2df_list_node));
+}
+
+void
+l2arc_fini(void)
+{
+ mutex_destroy(&l2arc_feed_thr_lock);
+ cv_destroy(&l2arc_feed_thr_cv);
+ mutex_destroy(&l2arc_rebuild_thr_lock);
+ cv_destroy(&l2arc_rebuild_thr_cv);
+ mutex_destroy(&l2arc_dev_mtx);
+ mutex_destroy(&l2arc_free_on_write_mtx);
+
+ list_destroy(l2arc_dev_list);
+ list_destroy(l2arc_free_on_write);
+}
+
+void
+l2arc_start(void)
+{
+ if (!(spa_mode_global & SPA_MODE_WRITE))
+ return;
+
+ (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
+ TS_RUN, defclsyspri);
+}
+
+void
+l2arc_stop(void)
+{
+ if (!(spa_mode_global & SPA_MODE_WRITE))
+ return;
+
+ mutex_enter(&l2arc_feed_thr_lock);
+ cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
+ l2arc_thread_exit = 1;
+ while (l2arc_thread_exit != 0)
+ cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
+ mutex_exit(&l2arc_feed_thr_lock);
+}
+
+/*
+ * Punches out rebuild threads for the L2ARC devices in a spa. This should
+ * be called after pool import from the spa async thread, since starting
+ * these threads directly from spa_import() will make them part of the
+ * "zpool import" context and delay process exit (and thus pool import).
+ */
+void
+l2arc_spa_rebuild_start(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /*
+ * Locate the spa's l2arc devices and kick off rebuild threads.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ l2arc_dev_t *dev =
+ l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
+ if (dev == NULL) {
+ /* Don't attempt a rebuild if the vdev is UNAVAIL */
+ continue;
+ }
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
+ dev->l2ad_rebuild_began = B_TRUE;
+ (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread,
+ dev, 0, &p0, TS_RUN, minclsyspri);
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ }
+}
+
+/*
+ * Main entry point for L2ARC rebuilding.
+ */
+static void
+l2arc_dev_rebuild_thread(void *arg)
+{
+ l2arc_dev_t *dev = arg;
+
+ VERIFY(!dev->l2ad_rebuild_cancel);
+ VERIFY(dev->l2ad_rebuild);
+ (void) l2arc_rebuild(dev);
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ dev->l2ad_rebuild_began = B_FALSE;
+ dev->l2ad_rebuild = B_FALSE;
+ mutex_exit(&l2arc_rebuild_thr_lock);
+
+ thread_exit();
+}
+
+/*
+ * This function implements the actual L2ARC metadata rebuild. It:
+ * starts reading the log block chain and restores each block's contents
+ * to memory (reconstructing arc_buf_hdr_t's).
+ *
+ * Operation stops under any of the following conditions:
+ *
+ * 1) We reach the end of the log block chain.
+ * 2) We encounter *any* error condition (cksum errors, io errors)
+ */
+static int
+l2arc_rebuild(l2arc_dev_t *dev)
+{
+ vdev_t *vd = dev->l2ad_vdev;
+ spa_t *spa = vd->vdev_spa;
+ int err = 0;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ l2arc_log_blk_phys_t *this_lb, *next_lb;
+ zio_t *this_io = NULL, *next_io = NULL;
+ l2arc_log_blkptr_t lbps[2];
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+ boolean_t lock_held;
+
+ this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP);
+ next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP);
+
+ /*
+	 * We prevent device removal while issuing reads to the device.
+	 * During the rebuilding phases we drop this lock again so that a
+	 * spa_unload or a device removal can be initiated; this is safe
+	 * because the spa will signal us to stop before removing our device
+	 * and will wait for us to stop.
+ */
+ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
+ lock_held = B_TRUE;
+
+ /*
+ * Retrieve the persistent L2ARC device state.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
+ dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
+ L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
+ dev->l2ad_start);
+ dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
+
+ vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
+ vd->vdev_trim_state = l2dhdr->dh_trim_state;
+
+ /*
+ * In case the zfs module parameter l2arc_rebuild_enabled is false
+ * we do not start the rebuild process.
+ */
+ if (!l2arc_rebuild_enabled)
+ goto out;
+
+ /* Prepare the rebuild process */
+ bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps));
+
+ /* Start the rebuild process */
+ for (;;) {
+ if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
+ break;
+
+ if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
+ this_lb, next_lb, this_io, &next_io)) != 0)
+ goto out;
+
+ /*
+ * Our memory pressure valve. If the system is running low
+ * on memory, rather than swamping memory with new ARC buf
+ * hdrs, we opt not to rebuild the L2ARC. At this point,
+ * however, we have already set up our L2ARC dev to chain in
+ * new metadata log blocks, so the user may choose to offline/
+ * online the L2ARC dev at a later time (or re-import the pool)
+ * to reconstruct it (when there's less memory pressure).
+ */
+ if (l2arc_hdr_limit_reached()) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
+ cmn_err(CE_NOTE, "System running low on memory, "
+ "aborting L2ARC rebuild.");
+ err = SET_ERROR(ENOMEM);
+ goto out;
+ }
+
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ lock_held = B_FALSE;
+
+ /*
+		 * Now that this log block has been read and validated, we
+		 * can start restoring its contents to the ARC.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
+ l2arc_log_blk_restore(dev, this_lb, asize);
+
+ /*
+		 * Log block restored; include its pointer in the list of
+ * pointers to log blocks present in the L2ARC device.
+ */
+ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
+ lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
+ KM_SLEEP);
+ bcopy(&lbps[0], lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_count);
+ zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
+ zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
+ mutex_exit(&dev->l2ad_mtx);
+ vdev_space_update(vd, asize, 0, 0);
+
+ /*
+ * Protection against loops of log blocks:
+ *
+ * l2ad_hand l2ad_evict
+ * V V
+ * l2ad_start |=======================================| l2ad_end
+ * -----|||----|||---|||----|||
+ * (3) (2) (1) (0)
+ * ---|||---|||----|||---|||
+ * (7) (6) (5) (4)
+ *
+ * In this situation the pointer of log block (4) passes
+ * l2arc_log_blkptr_valid() but the log block should not be
+ * restored as it is overwritten by the payload of log block
+ * (0). Only log blocks (0)-(3) should be restored. We check
+ * whether l2ad_evict lies in between the payload starting
+ * offset of the next log block (lbps[1].lbp_payload_start)
+ * and the payload starting offset of the present log block
+ * (lbps[0].lbp_payload_start). If true and this isn't the
+ * first pass, we are looping from the beginning and we should
+ * stop.
+ */
+ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
+ lbps[0].lbp_payload_start, dev->l2ad_evict) &&
+ !dev->l2ad_first)
+ goto out;
+
+ cond_resched();
+ for (;;) {
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (dev->l2ad_rebuild_cancel) {
+ dev->l2ad_rebuild = B_FALSE;
+ cv_signal(&l2arc_rebuild_thr_cv);
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ err = SET_ERROR(ECANCELED);
+ goto out;
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ if (spa_config_tryenter(spa, SCL_L2ARC, vd,
+ RW_READER)) {
+ lock_held = B_TRUE;
+ break;
+ }
+ /*
+			 * The L2ARC config lock is held by somebody as
+			 * writer, possibly because they are trying to remove
+			 * us. They likely want us to shut down, so after a
+			 * little delay we check l2ad_rebuild_cancel and
+			 * retry the lock.
+ */
+ delay(1);
+ }
+
+ /*
+ * Continue with the next log block.
+ */
+ lbps[0] = lbps[1];
+ lbps[1] = this_lb->lb_prev_lbp;
+ PTR_SWAP(this_lb, next_lb);
+ this_io = next_io;
+ next_io = NULL;
+ }
+
+ if (this_io != NULL)
+ l2arc_log_blk_fetch_abort(this_io);
+out:
+ if (next_io != NULL)
+ l2arc_log_blk_fetch_abort(next_io);
+ vmem_free(this_lb, sizeof (*this_lb));
+ vmem_free(next_lb, sizeof (*next_lb));
+
+ if (!l2arc_rebuild_enabled) {
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "disabled");
+ } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_success);
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "successful, restored %llu blocks",
+ (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
+ } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
+ /*
+ * No error but also nothing restored, meaning the lbps array
+ * in the device header points to invalid/non-present log
+ * blocks. Reset the header.
+ */
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "no valid log blocks");
+ bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
+ l2arc_dev_hdr_update(dev);
+ } else if (err == ECANCELED) {
+ /*
+ * In case the rebuild was canceled do not log to spa history
+ * log as the pool may be in the process of being removed.
+ */
+ zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
+ zfs_refcount_count(&dev->l2ad_lb_count));
+ } else if (err != 0) {
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "aborted, restored %llu blocks",
+ (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
+ }
+
+ if (lock_held)
+ spa_config_exit(spa, SCL_L2ARC, vd);
+
+ return (err);
+}
+
+/*
+ * Attempts to read the device header of the provided L2ARC device and writes
+ * it to dev->l2ad_dev_hdr. On success this function returns 0; otherwise the
+ * appropriate error code is returned.
+ */
+static int
+l2arc_dev_hdr_read(l2arc_dev_t *dev)
+{
+ int err;
+ uint64_t guid;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+ abd_t *abd;
+
+ guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+
+ abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
+
+ err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
+ ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_SPECULATIVE, B_FALSE));
+
+ abd_free(abd);
+
+ if (err != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
+ zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
+ "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid);
+ return (err);
+ }
+
+ if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
+ byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr));
+
+ if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC ||
+ l2dhdr->dh_spa_guid != guid ||
+ l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
+ l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION ||
+ l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
+ l2dhdr->dh_end != dev->l2ad_end ||
+ !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
+ l2dhdr->dh_evict) ||
+ (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE &&
+ l2arc_trim_ahead > 0)) {
+ /*
+ * Attempt to rebuild a device containing no actual dev hdr
+ * or containing a header from some other pool or from another
+ * version of persistent L2ARC.
+ */
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ return (0);
+}
+
+/*
+ * Reads L2ARC log blocks from storage and validates their contents.
+ *
+ * This function implements a simple fetcher to make sure that while
+ * we're processing one buffer the L2ARC is already fetching the next
+ * one in the chain.
+ *
+ * The arguments this_lbp and next_lbp point to the current and next log block
+ * address in the block chain. Similarly, this_lb and next_lb hold the
+ * l2arc_log_blk_phys_t's of the current and next L2ARC block.
+ *
+ * The `this_io' and `next_io' arguments are used for block fetching.
+ * When issuing the first blk IO during rebuild, you should pass NULL for
+ * `this_io'. This function will then issue a sync IO to read the block and
+ * also issue an async IO to fetch the next block in the block chain. The
+ * fetched IO is returned in `next_io'. On subsequent calls to this
+ * function, pass the value returned in `next_io' from the previous call
+ * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
+ * Prior to the call, you should initialize your `next_io' pointer to be
+ * NULL. If no fetch IO was issued, the pointer is left set at NULL.
+ *
+ * On success, this function returns 0, otherwise it returns an appropriate
+ * error code. On error the fetching IO is aborted and cleared before
+ * returning from this function. Therefore, if we return `success', the
+ * caller can assume that we have taken care of cleanup of fetch IOs.
+ */
+static int
+l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ zio_t *this_io, zio_t **next_io)
+{
+ int err = 0;
+ zio_cksum_t cksum;
+ abd_t *abd = NULL;
+ uint64_t asize;
+
+ ASSERT(this_lbp != NULL && next_lbp != NULL);
+ ASSERT(this_lb != NULL && next_lb != NULL);
+ ASSERT(next_io != NULL && *next_io == NULL);
+ ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
+
+ /*
+ * Check to see if we have issued the IO for this log block in a
+ * previous run. If not, this is the first call, so issue it now.
+ */
+ if (this_io == NULL) {
+ this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
+ this_lb);
+ }
+
+ /*
+ * Peek to see if we can start issuing the next IO immediately.
+ */
+ if (l2arc_log_blkptr_valid(dev, next_lbp)) {
+ /*
+ * Start issuing IO for the next log block early - this
+ * should help keep the L2ARC device busy while we
+ * decompress and restore this log block.
+ */
+ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
+ next_lb);
+ }
+
+ /* Wait for the IO to read this log block to complete */
+ if ((err = zio_wait(this_io)) != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+ zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
+ "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr,
+ dev->l2ad_vdev->vdev_guid);
+ goto cleanup;
+ }
+
+ /*
+ * Make sure the buffer checks out.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
+ fletcher_4_native(this_lb, asize, NULL, &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
+ zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
+ "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
+ this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid,
+ dev->l2ad_hand, dev->l2ad_evict);
+ err = SET_ERROR(ECKSUM);
+ goto cleanup;
+ }
+
+ /* Now we can take our time decoding this buffer */
+ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
+ case ZIO_COMPRESS_OFF:
+ break;
+ case ZIO_COMPRESS_LZ4:
+ abd = abd_alloc_for_io(asize, B_TRUE);
+ abd_copy_from_buf_off(abd, this_lb, 0, asize);
+ if ((err = zio_decompress_data(
+ L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
+ abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ break;
+ default:
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
+ byteswap_uint64_array(this_lb, sizeof (*this_lb));
+ if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+cleanup:
+ /* Abort an in-flight fetch I/O in case of error */
+ if (err != 0 && *next_io != NULL) {
+ l2arc_log_blk_fetch_abort(*next_io);
+ *next_io = NULL;
+ }
+ if (abd != NULL)
+ abd_free(abd);
+ return (err);
+}
+
+/*
+ * Restores the payload of a log block to ARC. This creates empty ARC hdr
+ * entries which only contain an l2arc hdr, essentially restoring the
+ * buffers to their L2ARC evicted state. This function also updates space
+ * usage on the L2ARC vdev to make sure it tracks restored buffers.
+ */
+static void
+l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
+ uint64_t lb_asize)
+{
+ uint64_t size = 0, asize = 0;
+ uint64_t log_entries = dev->l2ad_log_entries;
+
+ /*
+ * Usually arc_adapt() is called only for data, not headers, but
+	 * since we may allocate a significant amount of memory here, let ARC
+ * grow its arc_c.
+ */
+ arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only);
+
+ for (int i = log_entries - 1; i >= 0; i--) {
+ /*
+ * Restore goes in the reverse temporal direction to preserve
+ * correct temporal ordering of buffers in the l2ad_buflist.
+ * l2arc_hdr_restore also does a list_insert_tail instead of
+ * list_insert_head on the l2ad_buflist:
+ *
+ * LIST l2ad_buflist LIST
+ * HEAD <------ (time) ------ TAIL
+ * direction +-----+-----+-----+-----+-----+ direction
+ * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
+ * fill +-----+-----+-----+-----+-----+
+ * ^ ^
+ * | |
+ * | |
+ * l2arc_feed_thread l2arc_rebuild
+ * will place new bufs here restores bufs here
+ *
+ * During l2arc_rebuild() the device is not used by
+ * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
+ */
+ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop);
+ asize += vdev_psize_to_asize(dev->l2ad_vdev,
+ L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop));
+ l2arc_hdr_restore(&lb->lb_entries[i], dev);
+ }
+
+ /*
+ * Record rebuild stats:
+ * size Logical size of restored buffers in the L2ARC
+ * asize Aligned size of restored buffers in the L2ARC
+ */
+ ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
+ ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize);
+ ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
+}
+
+/*
+ * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
+ * into a state indicating that it has been evicted to L2ARC.
+ */
+static void
+l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
+{
+ arc_buf_hdr_t *hdr, *exists;
+ kmutex_t *hash_lock;
+ arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop);
+ uint64_t asize;
+
+ /*
+	 * Do all the allocation before grabbing any locks; this lets us
+	 * sleep if memory is full, and we don't have to deal with failed
+ * allocations.
+ */
+ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
+ dev, le->le_dva, le->le_daddr,
+ L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
+ L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel,
+ L2BLK_GET_PROTECTED((le)->le_prop),
+ L2BLK_GET_PREFETCH((le)->le_prop),
+ L2BLK_GET_STATE((le)->le_prop));
+ asize = vdev_psize_to_asize(dev->l2ad_vdev,
+ L2BLK_GET_PSIZE((le)->le_prop));
+
+ /*
+ * vdev_space_update() has to be called before arc_hdr_destroy() to
+ * avoid underflow since the latter also calls vdev_space_update().
+ */
+ l2arc_hdr_arcstats_increment(hdr);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_buflist, hdr);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
+ mutex_exit(&dev->l2ad_mtx);
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* Buffer was already cached, no need to restore it. */
+ arc_hdr_destroy(hdr);
+ /*
+ * If the buffer is already cached, check whether it has
+		 * L2ARC metadata. If not, fill it in and update the flag.
+		 * This is important in case of onlining a cache device, since
+ * we previously evicted all L2ARC metadata from ARC.
+ */
+ if (!HDR_HAS_L2HDR(exists)) {
+ arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
+ exists->b_l2hdr.b_dev = dev;
+ exists->b_l2hdr.b_daddr = le->le_daddr;
+ exists->b_l2hdr.b_arcs_state =
+ L2BLK_GET_STATE((le)->le_prop);
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_buflist, exists);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(exists), exists);
+ mutex_exit(&dev->l2ad_mtx);
+ l2arc_hdr_arcstats_increment(exists);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+ }
+ ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
+ }
+
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Starts an asynchronous read IO to read a log block. This is used in log
+ * block reconstruction to start reading the next block before we are done
+ * decoding and reconstructing the current block, to keep the l2arc device
+ * nice and hot with read IO to process.
+ * The returned zio will contain a newly allocated memory buffer for the IO
+ * data, which should then be freed by the caller once the zio is no longer
+ * needed (i.e. due to it having completed). If you wish to abort this
+ * zio, you should do so using l2arc_log_blk_fetch_abort, which takes
+ * care of disposing of the allocated buffers correctly.
+ */
+static zio_t *
+l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
+ l2arc_log_blk_phys_t *lb)
+{
+ uint32_t asize;
+ zio_t *pio;
+ l2arc_read_callback_t *cb;
+
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
+ ASSERT(asize <= sizeof (l2arc_log_blk_phys_t));
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
+ cb->l2rcb_abd = abd_get_from_buf(lb, asize);
+ pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY);
+ (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
+ cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
+
+ return (pio);
+}
+
+/*
+ * Aborts a zio returned from l2arc_log_blk_fetch and frees the data
+ * buffers allocated for it.
+ */
+static void
+l2arc_log_blk_fetch_abort(zio_t *zio)
+{
+ (void) zio_wait(zio);
+}
+
+/*
+ * Creates a zio to update the device header on an l2arc device.
+ */
+void
+l2arc_dev_hdr_update(l2arc_dev_t *dev)
+{
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+ abd_t *abd;
+ int err;
+
+ VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
+
+ l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
+ l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
+ l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+ l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
+ l2dhdr->dh_log_entries = dev->l2ad_log_entries;
+ l2dhdr->dh_evict = dev->l2ad_evict;
+ l2dhdr->dh_start = dev->l2ad_start;
+ l2dhdr->dh_end = dev->l2ad_end;
+ l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
+ l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
+ l2dhdr->dh_flags = 0;
+ l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time;
+ l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state;
+ if (dev->l2ad_first)
+ l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
+
+ abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
+
+ err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL,
+ NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
+
+ abd_free(abd);
+
+ if (err != 0) {
+ zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
+ "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid);
+ }
+}
+
+/*
+ * Commits a log block to the L2ARC device. This routine is invoked from
+ * l2arc_write_buffers when the log block fills up.
+ * This function allocates some memory to temporarily hold the serialized
+ * buffer to be written. This is then released in l2arc_write_done.
+ */
+static void
+l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ uint64_t psize, asize;
+ zio_t *wzio;
+ l2arc_lb_abd_buf_t *abd_buf;
+ uint8_t *tmpbuf;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+
+ VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
+
+ tmpbuf = zio_buf_alloc(sizeof (*lb));
+ abd_buf = zio_buf_alloc(sizeof (*abd_buf));
+ abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
+ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
+ lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
+
+ /* link the buffer into the block chain */
+ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
+ lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
+
+ /*
+ * l2arc_log_blk_commit() may be called multiple times during a single
+ * l2arc_write_buffers() call. Save the allocated abd buffers in a list
+ * so we can free them in l2arc_write_done() later on.
+ */
+ list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
+
+ /* try to compress the buffer */
+ psize = zio_compress_data(ZIO_COMPRESS_LZ4,
+ abd_buf->abd, tmpbuf, sizeof (*lb), 0);
+
+ /* a log block is never entirely zero */
+ ASSERT(psize != 0);
+ asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ ASSERT(asize <= sizeof (*lb));
+
+ /*
+ * Update the start log block pointer in the device header to point
+ * to the log block we're about to write.
+ */
+ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
+ l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
+ l2dhdr->dh_start_lbps[0].lbp_payload_asize =
+ dev->l2ad_log_blk_payload_asize;
+ l2dhdr->dh_start_lbps[0].lbp_payload_start =
+ dev->l2ad_log_blk_payload_start;
+ _NOTE(CONSTCOND)
+ L2BLK_SET_LSIZE(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
+ L2BLK_SET_PSIZE(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
+ L2BLK_SET_CHECKSUM(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_CHECKSUM_FLETCHER_4);
+ if (asize < sizeof (*lb)) {
+ /* compression succeeded */
+ bzero(tmpbuf + psize, asize - psize);
+ L2BLK_SET_COMPRESS(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_COMPRESS_LZ4);
+ } else {
+ /* compression failed */
+ bcopy(lb, tmpbuf, sizeof (*lb));
+ L2BLK_SET_COMPRESS(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_COMPRESS_OFF);
+ }
+
+ /* checksum what we're about to write */
+ fletcher_4_native(tmpbuf, asize, NULL,
+ &l2dhdr->dh_start_lbps[0].lbp_cksum);
+
+ abd_free(abd_buf->abd);
+
+ /* perform the write itself */
+ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
+ abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
+ asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ dev->l2ad_hand += asize;
+ /*
+ * Include the committed log block's pointer in the list of pointers
+ * to log blocks present in the L2ARC device.
+ */
+ bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_count);
+ zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
+ zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
+ mutex_exit(&dev->l2ad_mtx);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ /* bump the kstats */
+ ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
+ dev->l2ad_log_blk_payload_asize / asize);
+
+ /* start a new log block */
+ dev->l2ad_log_ent_idx = 0;
+ dev->l2ad_log_blk_payload_asize = 0;
+ dev->l2ad_log_blk_payload_start = 0;
+}
+
+/*
+ * Validates an L2ARC log block address to make sure that it can be read
+ * from the provided L2ARC device.
+ */
+boolean_t
+l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
+{
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
+ uint64_t end = lbp->lbp_daddr + asize - 1;
+ uint64_t start = lbp->lbp_payload_start;
+ boolean_t evicted = B_FALSE;
+
+ /*
+ * A log block is valid if all of the following conditions are true:
+ * - it fits entirely (including its payload) between l2ad_start and
+ * l2ad_end
+ * - it has a valid size
+ * - neither the log block itself nor part of its payload was evicted
+ * by l2arc_evict():
+ *
+ * l2ad_hand l2ad_evict
+ * | | lbp_daddr
+ * | start | | end
+ * | | | | |
+ * V V V V V
+ * l2ad_start ============================================ l2ad_end
+ * --------------------------||||
+ * ^ ^
+ * | log block
+ * payload
+ */
+
+ evicted =
+ l2arc_range_check_overlap(start, end, dev->l2ad_hand) ||
+ l2arc_range_check_overlap(start, end, dev->l2ad_evict) ||
+ l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) ||
+ l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end);
+
+ return (start >= dev->l2ad_start && end <= dev->l2ad_end &&
+ asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) &&
+ (!evicted || dev->l2ad_first));
+}
+
+/*
+ * Inserts ARC buffer header `hdr' into the current L2ARC log block on
+ * the device. The buffer being inserted must be present in L2ARC.
+ * Returns B_TRUE if the L2ARC log block is full and needs to be committed
+ * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
+ */
+static boolean_t
+l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_log_ent_phys_t *le;
+
+ if (dev->l2ad_log_entries == 0)
+ return (B_FALSE);
+
+ int index = dev->l2ad_log_ent_idx++;
+
+ ASSERT3S(index, <, dev->l2ad_log_entries);
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ le = &lb->lb_entries[index];
+ bzero(le, sizeof (*le));
+ le->le_dva = hdr->b_dva;
+ le->le_birth = hdr->b_birth;
+ le->le_daddr = hdr->b_l2hdr.b_daddr;
+ if (index == 0)
+ dev->l2ad_log_blk_payload_start = le->le_daddr;
+ L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr));
+ L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr));
+ L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr));
+ le->le_complevel = hdr->b_complevel;
+ L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
+ L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
+ L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
+ L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
+
+ dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
+ HDR_GET_PSIZE(hdr));
+
+ return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
+}
+
+/*
+ * Checks whether a given L2ARC device address sits in a time-sequential
+ * range. The trick here is that the L2ARC is a rotary buffer, so we can't
+ * just do a range comparison, we need to handle the situation in which the
+ * range wraps around the end of the L2ARC device. Arguments:
+ * bottom -- Lower end of the range to check (written to earlier).
+ * top -- Upper end of the range to check (written to later).
+ * check -- The address for which we want to determine if it sits in
+ * between the top and bottom.
+ *
+ * The 3-way conditional below represents the following cases:
+ *
+ * bottom < top : Sequentially ordered case:
+ * <check>--------+-------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |---------------<bottom>============<top>--------------|
+ *
+ * bottom > top: Looped-around case:
+ * <check>--------+------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |===============<top>---------------<bottom>===========|
+ * ^ ^
+ * | (or here?) |
+ * +---------------+---------<check>
+ *
+ * top == bottom : Just a single address comparison.
+ */
+boolean_t
+l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
+{
+ if (bottom < top)
+ return (bottom <= check && check <= top);
+ else if (bottom > top)
+ return (check <= top || bottom <= check);
+ else
+ return (check == top);
+}
+
+EXPORT_SYMBOL(arc_buf_size);
+EXPORT_SYMBOL(arc_write);
+EXPORT_SYMBOL(arc_read);
+EXPORT_SYMBOL(arc_buf_info);
+EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_long,
+ param_get_long, ZMOD_RW, "Min arc size");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_long,
+ param_get_long, ZMOD_RW, "Max arc size");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long,
+ param_get_long, ZMOD_RW, "Metadata limit for arc size");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent,
+ param_set_arc_long, param_get_long, ZMOD_RW,
+ "Percent of arc size for arc meta limit");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long,
+ param_get_long, ZMOD_RW, "Min arc metadata");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW,
+ "Meta objects to scan for prune");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW,
+ "Limit number of restarts in arc_evict_meta");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW,
+ "Meta reclaim strategy");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
+ param_get_int, ZMOD_RW, "Seconds before growing arc size");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW,
+ "Disable arc_p adapt dampener");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
+ param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
+ "Percent of pagecache to reclaim arc to");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int,
+ param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD,
+ "Target average block size");
+
+ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW,
+	"Enable compressed ARC buffers (set to 0 to disable)");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int,
+ param_get_int, ZMOD_RW, "Min life of prefetch block in ms");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
+ param_set_arc_int, param_get_int, ZMOD_RW,
+ "Min life of prescient prefetched block in ms");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW,
+ "Max write bytes per interval");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW,
+ "Extra write bytes during device warmup");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW,
+ "Number of max device writes to precache");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW,
+ "Compressed l2arc_headroom multiplier");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW,
+ "TRIM ahead L2ARC write size multiplier");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW,
+ "Seconds between L2ARC writing");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW,
+ "Min feed interval in milliseconds");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW,
+ "Skip caching prefetched buffers");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW,
+ "Turbo L2ARC warmup");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW,
+ "No reads during writes");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, INT, ZMOD_RW,
+ "Percent of ARC size allowed for L2ARC-only headers");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
+ "Rebuild the L2ARC when importing a pool");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW,
+ "Min size in bytes to write rebuild log blocks in L2ARC");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
+ "Cache only MFU data from ARC into L2ARC");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
+	param_get_int, ZMOD_RW, "System free memory I/O throttle as a percent");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long,
+ param_get_long, ZMOD_RW, "System free memory target size in bytes");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long,
+ param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
+ param_set_arc_long, param_get_long, ZMOD_RW,
+ "Percent of ARC meta buffers for dnodes");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW,
+ "Percentage of excess dnodes to try to unpin");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW,
+ "When full, ARC allocation waits for eviction of this % of alloc size");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW,
+ "The number of headers to evict per sublist before moving to the next");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/blkptr.c b/sys/contrib/openzfs/module/zfs/blkptr.c
new file mode 100644
index 000000000000..aa09ded8dba3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/blkptr.c
@@ -0,0 +1,153 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/blkptr.h>
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Embedded-data Block Pointers
+ *
+ * Normally, block pointers point (via their DVAs) to a block which holds data.
+ * If the data that we need to store is very small, this is an inefficient
+ * use of space, because a block must be at minimum 1 sector (typically 512
+ * bytes or 4KB). Additionally, reading these small blocks tends to generate
+ * more random reads.
+ *
+ * Embedded-data Block Pointers allow small pieces of data (the "payload",
+ * up to 112 bytes) to be stored in the block pointer itself, instead of
+ * being pointed to. The "Pointer" part of this name is a bit of a
+ * misnomer, as nothing is pointed to.
+ *
+ * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
+ * be embedded in the block pointer. The logic for this is handled in
+ * the SPA, by the zio pipeline. Therefore most code outside the zio
+ * pipeline doesn't need special-cases to handle these block pointers.
+ *
+ * See spa.h for details on the exact layout of embedded block pointers.
+ */
+
+void
+encode_embedded_bp_compressed(blkptr_t *bp, void *data,
+ enum zio_compress comp, int uncompressed_size, int compressed_size)
+{
+ uint64_t *bp64 = (uint64_t *)bp;
+ uint64_t w = 0;
+ uint8_t *data8 = data;
+
+ ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE);
+ ASSERT(uncompressed_size == compressed_size ||
+ comp != ZIO_COMPRESS_OFF);
+ ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
+ ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+
+ bzero(bp, sizeof (*bp));
+ BP_SET_EMBEDDED(bp, B_TRUE);
+ BP_SET_COMPRESS(bp, comp);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ BPE_SET_LSIZE(bp, uncompressed_size);
+ BPE_SET_PSIZE(bp, compressed_size);
+
+ /*
+ * Encode the byte array into the words of the block pointer.
+ * First byte goes into low bits of first word (little endian).
+ */
+ for (int i = 0; i < compressed_size; i++) {
+ BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]);
+ if (i % sizeof (w) == sizeof (w) - 1) {
+ /* we've reached the end of a word */
+ ASSERT3P(bp64, <, bp + 1);
+ *bp64 = w;
+ bp64++;
+ if (!BPE_IS_PAYLOADWORD(bp, bp64))
+ bp64++;
+ w = 0;
+ }
+ }
+ /* write last partial word */
+ if (bp64 < (uint64_t *)(bp + 1))
+ *bp64 = w;
+}
+
+/*
+ * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
+ * more than BPE_PAYLOAD_SIZE bytes).
+ */
+void
+decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
+{
+ int psize;
+ uint8_t *buf8 = buf;
+ uint64_t w = 0;
+ const uint64_t *bp64 = (const uint64_t *)bp;
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ psize = BPE_GET_PSIZE(bp);
+
+ /*
+ * Decode the words of the block pointer into the byte array.
+ * Low bits of first word are the first byte (little endian).
+ */
+ for (int i = 0; i < psize; i++) {
+ if (i % sizeof (w) == 0) {
+ /* beginning of a word */
+ ASSERT3P(bp64, <, bp + 1);
+ w = *bp64;
+ bp64++;
+ if (!BPE_IS_PAYLOADWORD(bp, bp64))
+ bp64++;
+ }
+ buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
+ }
+}
+
+/*
+ * Fill in the buffer with the (decompressed) payload of the embedded
+ * blkptr_t. Takes into account compression and byteorder (the payload is
+ * treated as a stream of bytes).
+ * Return 0 on success, or ENOSPC if it won't fit in the buffer.
+ */
+int
+decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
+{
+ int lsize, psize;
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ lsize = BPE_GET_LSIZE(bp);
+ psize = BPE_GET_PSIZE(bp);
+
+ if (lsize > buflen)
+ return (SET_ERROR(ENOSPC));
+ ASSERT3U(lsize, ==, buflen);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ uint8_t dstbuf[BPE_PAYLOAD_SIZE];
+ decode_embedded_bp_compressed(bp, dstbuf);
+ VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp),
+ dstbuf, buf, psize, buflen, NULL));
+ } else {
+ ASSERT3U(lsize, ==, psize);
+ decode_embedded_bp_compressed(bp, buf);
+ }
+
+ return (0);
+}
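To make the encode/decode interface above concrete, here is a minimal, hypothetical sketch (illustration only, not part of the patch): it stores a short uncompressed payload in an embedded block pointer and reads it back. The function name and payload are invented; a real caller, such as the ZIO write pipeline, would additionally set the BP type, level, embedded type, and birth time.

/*
 * Hypothetical round trip through an embedded block pointer.  Not part of
 * the patch; for illustration of encode/decode only.
 */
static int
embedded_bp_roundtrip_example(void)
{
        blkptr_t bp;
        char payload[] = "small embedded payload"; /* well under BPE_PAYLOAD_SIZE */
        char out[sizeof (payload)];

        /* Store the bytes uncompressed, so lsize == psize. */
        encode_embedded_bp_compressed(&bp, payload, ZIO_COMPRESS_OFF,
            sizeof (payload), sizeof (payload));

        /* Recover them; returns 0, or ENOSPC if "out" were too small. */
        return (decode_embedded_bp(&bp, out, sizeof (out)));
}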
diff --git a/sys/contrib/openzfs/module/zfs/bplist.c b/sys/contrib/openzfs/module/zfs/bplist.c
new file mode 100644
index 000000000000..47ea364ef26f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/bplist.c
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+
+void
+bplist_create(bplist_t *bpl)
+{
+ mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
+ offsetof(bplist_entry_t, bpe_node));
+}
+
+void
+bplist_destroy(bplist_t *bpl)
+{
+ list_destroy(&bpl->bpl_list);
+ mutex_destroy(&bpl->bpl_lock);
+}
+
+void
+bplist_append(bplist_t *bpl, const blkptr_t *bp)
+{
+ bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
+
+ mutex_enter(&bpl->bpl_lock);
+ bpe->bpe_blk = *bp;
+ list_insert_tail(&bpl->bpl_list, bpe);
+ mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * To aid debugging, we keep the most recently removed entry. This way if
+ * we are in the callback, we can easily locate the entry.
+ */
+static bplist_entry_t *bplist_iterate_last_removed;
+
+void
+bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
+{
+ bplist_entry_t *bpe;
+
+ mutex_enter(&bpl->bpl_lock);
+ while ((bpe = list_head(&bpl->bpl_list))) {
+ bplist_iterate_last_removed = bpe;
+ list_remove(&bpl->bpl_list, bpe);
+ mutex_exit(&bpl->bpl_lock);
+ func(arg, &bpe->bpe_blk, tx);
+ kmem_free(bpe, sizeof (*bpe));
+ mutex_enter(&bpl->bpl_lock);
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
+
+void
+bplist_clear(bplist_t *bpl)
+{
+ bplist_entry_t *bpe;
+
+ mutex_enter(&bpl->bpl_lock);
+ while ((bpe = list_head(&bpl->bpl_list))) {
+ bplist_iterate_last_removed = bpe;
+ list_remove(&bpl->bpl_list, bpe);
+ kmem_free(bpe, sizeof (*bpe));
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
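bplist is the simplest structure in this group: an in-memory, mutex-protected list of block pointers drained through a caller-supplied callback, with the lock dropped around each call. Below is a minimal, hypothetical usage sketch (not part of the patch); the callback and counter are invented, and it assumes the bplist_itor_t callback shape from bplist.h, i.e. (void *arg, const blkptr_t *bp, dmu_tx_t *tx) returning int.

/* Hypothetical consumer that counts the block pointers queued on a bplist. */
static int
count_bp_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
        uint64_t *countp = arg;

        (*countp)++;
        return (0);
}

static uint64_t
bplist_count_example(const blkptr_t *bp)
{
        bplist_t bpl;
        uint64_t count = 0;

        bplist_create(&bpl);
        bplist_append(&bpl, bp);
        /* This callback frees nothing, so no transaction is required. */
        bplist_iterate(&bpl, count_bp_cb, &count, NULL);
        bplist_destroy(&bpl);
        return (count);
}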
diff --git a/sys/contrib/openzfs/module/zfs/bpobj.c b/sys/contrib/openzfs/module/zfs/bpobj.c
new file mode 100644
index 000000000000..e75ba5cccde6
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/bpobj.c
@@ -0,0 +1,943 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2017 Datto Inc.
+ */
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_refcount.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfeature.h>
+#include <sys/zap.h>
+
+/*
+ * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
+ */
+uint64_t
+bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+ if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+ ASSERT0(dp->dp_empty_bpobj);
+ dp->dp_empty_bpobj =
+ bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY(zap_add(os,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+ &dp->dp_empty_bpobj, tx) == 0);
+ }
+ spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
+ ASSERT(dp->dp_empty_bpobj != 0);
+ return (dp->dp_empty_bpobj);
+ } else {
+ return (bpobj_alloc(os, blocksize, tx));
+ }
+}
+
+void
+bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
+ if (!spa_feature_is_active(dmu_objset_spa(os),
+ SPA_FEATURE_EMPTY_BPOBJ)) {
+ VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_EMPTY_BPOBJ, tx));
+ VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
+ dp->dp_empty_bpobj = 0;
+ }
+}
+
+uint64_t
+bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ int size;
+
+ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
+ size = BPOBJ_SIZE_V0;
+ else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+ size = BPOBJ_SIZE_V1;
+ else if (!spa_feature_is_active(dmu_objset_spa(os),
+ SPA_FEATURE_LIVELIST))
+ size = BPOBJ_SIZE_V2;
+ else
+ size = sizeof (bpobj_phys_t);
+
+ return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
+ DMU_OT_BPOBJ_HDR, size, tx));
+}
+
+void
+bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ int64_t i;
+ bpobj_t bpo;
+ dmu_object_info_t doi;
+ int epb;
+ dmu_buf_t *dbuf = NULL;
+
+ ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
+ VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
+
+ mutex_enter(&bpo.bpo_lock);
+
+ if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
+ goto out;
+
+ VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
+ epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+ for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+ uint64_t *objarray;
+ uint64_t offset, blkoff;
+
+ offset = i * sizeof (uint64_t);
+ blkoff = P2PHASE(i, epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ VERIFY3U(0, ==, dmu_buf_hold(os,
+ bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ objarray = dbuf->db_data;
+ bpobj_free(os, objarray[blkoff], tx);
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
+
+out:
+ mutex_exit(&bpo.bpo_lock);
+ bpobj_close(&bpo);
+
+ VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
+}
+
+int
+bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(os, object, &doi);
+ if (err)
+ return (err);
+
+ bzero(bpo, sizeof (*bpo));
+ mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ASSERT(bpo->bpo_dbuf == NULL);
+ ASSERT(bpo->bpo_phys == NULL);
+ ASSERT(object != 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+
+ err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
+ if (err)
+ return (err);
+
+ bpo->bpo_os = os;
+ bpo->bpo_object = object;
+ bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
+ bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+ bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+ bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
+ bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+ return (0);
+}
+
+boolean_t
+bpobj_is_open(const bpobj_t *bpo)
+{
+ return (bpo->bpo_object != 0);
+}
+
+void
+bpobj_close(bpobj_t *bpo)
+{
+ /* Lame workaround for closing a bpobj that was never opened. */
+ if (bpo->bpo_object == 0)
+ return;
+
+ dmu_buf_rele(bpo->bpo_dbuf, bpo);
+ if (bpo->bpo_cached_dbuf != NULL)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ bpo->bpo_dbuf = NULL;
+ bpo->bpo_phys = NULL;
+ bpo->bpo_cached_dbuf = NULL;
+ bpo->bpo_object = 0;
+
+ mutex_destroy(&bpo->bpo_lock);
+}
+
+static boolean_t
+bpobj_is_empty_impl(bpobj_t *bpo)
+{
+ ASSERT(MUTEX_HELD(&bpo->bpo_lock));
+ return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
+ (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
+}
+
+boolean_t
+bpobj_is_empty(bpobj_t *bpo)
+{
+ mutex_enter(&bpo->bpo_lock);
+ boolean_t is_empty = bpobj_is_empty_impl(bpo);
+ mutex_exit(&bpo->bpo_lock);
+ return (is_empty);
+}
+
+/*
+ * A recursive iteration of the bpobjs would be nice here but we run the risk
+ * of overflowing function stack space. Instead, find each subobj and add it
+ * to the head of our list so it can be scanned for subobjs. Like a
+ * recursive implementation, the "deepest" subobjs will be freed first.
+ * When a subobj is found to have no additional subobjs, free it.
+ */
+typedef struct bpobj_info {
+ bpobj_t *bpi_bpo;
+ /*
+ * This object is a subobj of bpi_parent,
+ * at bpi_index in its subobj array.
+ */
+ struct bpobj_info *bpi_parent;
+ uint64_t bpi_index;
+ /* How many of our subobj's are left to process. */
+ uint64_t bpi_unprocessed_subobjs;
+ /* True after having visited this bpo's directly referenced BPs. */
+ boolean_t bpi_visited;
+ list_node_t bpi_node;
+} bpobj_info_t;
+
+static bpobj_info_t *
+bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
+{
+ bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
+ bpi->bpi_bpo = bpo;
+ bpi->bpi_parent = parent;
+ bpi->bpi_index = index;
+ if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+ bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
+ }
+ return (bpi);
+}
+
+/*
+ * Update bpobj and all of its parents with new space accounting.
+ */
+static void
+propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
+ int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
+{
+ for (; bpi != NULL; bpi = bpi->bpi_parent) {
+ bpobj_t *p = bpi->bpi_bpo;
+ ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
+ p->bpo_phys->bpo_bytes -= freed;
+ ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
+ if (p->bpo_havecomp) {
+ p->bpo_phys->bpo_comp -= comp_freed;
+ p->bpo_phys->bpo_uncomp -= uncomp_freed;
+ }
+ }
+}
+
+static int
+bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
+ int64_t start, dmu_tx_t *tx, boolean_t free)
+{
+ int err = 0;
+ int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
+ dmu_buf_t *dbuf = NULL;
+ bpobj_t *bpo = bpi->bpi_bpo;
+
+ for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
+ uint64_t offset = i * sizeof (blkptr_t);
+ uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+ offset, FTAG, &dbuf, 0);
+ if (err)
+ break;
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ blkptr_t *bparray = dbuf->db_data;
+ blkptr_t *bp = &bparray[blkoff];
+
+ boolean_t bp_freed = BP_GET_FREE(bp);
+ err = func(arg, bp, bp_freed, tx);
+ if (err)
+ break;
+
+ if (free) {
+ int sign = bp_freed ? -1 : +1;
+ spa_t *spa = dmu_objset_spa(bpo->bpo_os);
+ freed += sign * bp_get_dsize_sync(spa, bp);
+ comp_freed += sign * BP_GET_PSIZE(bp);
+ uncomp_freed += sign * BP_GET_UCSIZE(bp);
+ ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
+ bpo->bpo_phys->bpo_num_blkptrs--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+ if (bp_freed) {
+ ASSERT(bpo->bpo_havefreed);
+ bpo->bpo_phys->bpo_num_freed--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
+ }
+ }
+ }
+ if (free) {
+ propagate_space_reduction(bpi, freed, comp_freed,
+ uncomp_freed, tx);
+ VERIFY0(dmu_free_range(bpo->bpo_os,
+ bpo->bpo_object,
+ bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
+ DMU_OBJECT_END, tx));
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ return (err);
+}
+
+/*
+ * Given an initial bpo, start by freeing the BPs that are directly referenced
+ * by that bpo. If the bpo has subobjs, read in its last subobj and push the
+ * subobj to our stack. By popping items off our stack, eventually we will
+ * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if
+ * requested also free the now-empty bpo from disk and decrement
+ * its parent's subobj count. We continue popping each subobj from our stack,
+ * visiting its last subobj until it too has no more subobjs, and so on.
+ */
+static int
+bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
+ dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
+{
+ list_t stack;
+ bpobj_info_t *bpi;
+ int err = 0;
+
+ /*
+ * Create a "stack" for us to work with without worrying about
+ * stack overflows. Initialize it with the initial_bpo.
+ */
+ list_create(&stack, sizeof (bpobj_info_t),
+ offsetof(bpobj_info_t, bpi_node));
+ mutex_enter(&initial_bpo->bpo_lock);
+
+ if (bpobj_size != NULL)
+ *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;
+
+ list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
+
+ while ((bpi = list_head(&stack)) != NULL) {
+ bpobj_t *bpo = bpi->bpi_bpo;
+
+ ASSERT3P(bpo, !=, NULL);
+ ASSERT(MUTEX_HELD(&bpo->bpo_lock));
+ ASSERT(bpobj_is_open(bpo));
+
+ if (free)
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+ if (bpi->bpi_visited == B_FALSE) {
+ err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
+ free);
+ bpi->bpi_visited = B_TRUE;
+ if (err != 0)
+ break;
+ }
+ /*
+ * We've finished with this bpo's directly-referenced BP's and
+ * it has no more unprocessed subobjs. We can free its
+ * bpobj_info_t (unless it is the topmost, initial_bpo).
+ * If we are freeing from disk, we can also do that.
+ */
+ if (bpi->bpi_unprocessed_subobjs == 0) {
+ /*
+ * If there are no entries, there should
+ * be no bytes.
+ */
+ if (bpobj_is_empty_impl(bpo)) {
+ ASSERT0(bpo->bpo_phys->bpo_bytes);
+ ASSERT0(bpo->bpo_phys->bpo_comp);
+ ASSERT0(bpo->bpo_phys->bpo_uncomp);
+ }
+
+ /* The initial_bpo has no parent and is not closed. */
+ if (bpi->bpi_parent != NULL) {
+ if (free) {
+ bpobj_t *p = bpi->bpi_parent->bpi_bpo;
+
+ ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
+ ASSERT3U(p->bpo_phys->bpo_num_subobjs,
+ >, 0);
+ ASSERT3U(bpi->bpi_index, ==,
+ p->bpo_phys->bpo_num_subobjs - 1);
+ ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
+ tx));
+
+ p->bpo_phys->bpo_num_subobjs--;
+
+ VERIFY0(dmu_free_range(p->bpo_os,
+ p->bpo_phys->bpo_subobjs,
+ bpi->bpi_index * sizeof (uint64_t),
+ sizeof (uint64_t), tx));
+
+ /* eliminate the empty subobj list */
+ if (bpo->bpo_havesubobj &&
+ bpo->bpo_phys->bpo_subobjs != 0) {
+ ASSERT0(bpo->bpo_phys->
+ bpo_num_subobjs);
+ err = dmu_object_free(
+ bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs,
+ tx);
+ if (err)
+ break;
+ bpo->bpo_phys->bpo_subobjs = 0;
+ }
+ err = dmu_object_free(p->bpo_os,
+ bpo->bpo_object, tx);
+ if (err)
+ break;
+ }
+
+ mutex_exit(&bpo->bpo_lock);
+ bpobj_close(bpo);
+ kmem_free(bpo, sizeof (bpobj_t));
+ } else {
+ mutex_exit(&bpo->bpo_lock);
+ }
+
+ /*
+ * Finished processing this bpo. Unlock, and free
+ * our "stack" info.
+ */
+ list_remove_head(&stack);
+ kmem_free(bpi, sizeof (bpobj_info_t));
+ } else {
+ /*
+ * We have unprocessed subobjs. Process the next one.
+ */
+ ASSERT(bpo->bpo_havecomp);
+ ASSERT3P(bpobj_size, ==, NULL);
+
+ /* Add the last subobj to stack. */
+ int64_t i = bpi->bpi_unprocessed_subobjs - 1;
+ uint64_t offset = i * sizeof (uint64_t);
+
+ uint64_t obj_from_sublist;
+ err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ offset, sizeof (uint64_t), &obj_from_sublist,
+ DMU_READ_PREFETCH);
+ if (err)
+ break;
+ bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
+ KM_SLEEP);
+
+ err = bpobj_open(sublist, bpo->bpo_os,
+ obj_from_sublist);
+ if (err)
+ break;
+
+ list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
+ mutex_enter(&sublist->bpo_lock);
+ bpi->bpi_unprocessed_subobjs--;
+ }
+ }
+ /*
+ * Cleanup anything left on the "stack" after we left the loop.
+ * Every bpo on the stack is locked so we must remember to undo
+ * that now (in LIFO order).
+ */
+ while ((bpi = list_remove_head(&stack)) != NULL) {
+ bpobj_t *bpo = bpi->bpi_bpo;
+ ASSERT(err != 0);
+ ASSERT3P(bpo, !=, NULL);
+
+ mutex_exit(&bpo->bpo_lock);
+
+ /* do not free the initial_bpo */
+ if (bpi->bpi_parent != NULL) {
+ bpobj_close(bpi->bpi_bpo);
+ kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
+ }
+ kmem_free(bpi, sizeof (bpobj_info_t));
+ }
+
+ list_destroy(&stack);
+
+ return (err);
+}
+
+/*
+ * Iterate and remove the entries. If func returns nonzero, iteration
+ * will stop and that entry will not be removed.
+ */
+int
+bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
+}
+
+/*
+ * Iterate the entries. If func returns nonzero, iteration will stop.
+ *
+ * If there are no subobjs:
+ *
+ * *bpobj_size can be used to return the number of block pointers in the
+ * bpobj. Note that this may be different from the number of block pointers
+ * that are iterated over, if iteration is terminated early (e.g. by the func
+ * returning nonzero).
+ *
+ * If there are concurrent (or subsequent) modifications to the bpobj then the
+ * returned *bpobj_size can be passed as "start" to
+ * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
+ */
+int
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
+ uint64_t *bpobj_size)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
+}
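As a concrete illustration of the read-only interface just described, the hypothetical callback below (not part of the patch; all names invented) counts the entries born after a cutoff txg in a bpobj with no subobjs, such as a livelist, and captures the entry count at the start of iteration so a later pass could resume from that index via livelist_bpobj_iterate_from_nofree(). space_range_cb() further down in this file is the in-tree callback following the same pattern.

/* Hypothetical read-only iteration: count entries born after a cutoff txg. */
typedef struct count_after_arg {
        uint64_t caa_cutoff_txg;
        uint64_t caa_count;
} count_after_arg_t;

static int
count_after_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
        count_after_arg_t *caa = arg;

        if (!bp_freed && bp->blk_birth > caa->caa_cutoff_txg)
                caa->caa_count++;
        return (0);
}

static int
bpobj_count_after_example(bpobj_t *bpo, uint64_t cutoff_txg,
    uint64_t *countp, uint64_t *size_at_startp)
{
        count_after_arg_t caa = { .caa_cutoff_txg = cutoff_txg };
        int err;

        err = bpobj_iterate_nofree(bpo, count_after_cb, &caa, size_at_startp);
        *countp = caa.caa_count;
        return (err);
}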
+
+/*
+ * Iterate over the blkptrs in the bpobj beginning at index start. If func
+ * returns nonzero, iteration will stop. This is a livelist specific function
+ * since it assumes that there are no subobjs present.
+ */
+int
+livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
+ int64_t start)
+{
+ if (bpo->bpo_havesubobj)
+ VERIFY0(bpo->bpo_phys->bpo_subobjs);
+ bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
+ int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
+ kmem_free(bpi, sizeof (bpobj_info_t));
+ return (err);
+}
+
+/*
+ * Logically add subobj's contents to the parent bpobj.
+ *
+ * In the most general case, this is accomplished in constant time by adding
+ * a reference to subobj. This case is used when enqueuing a large subobj:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj |
+ * +----+----+----+----+----+ +-----+-----+-----+
+ *
+ * +--------------+ +--------------+
+ * | sub-bpobj |----------------------> | subsubobj |
+ * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+
+ * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj |
+ * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+
+ *
+ * Result: sub-bpobj added to parent's subobj list.
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+-----+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ |
+ * +----+----+----+----+----+ +-----+-----+-----+--|--+
+ * |
+ * /-----------------------------------------------------/
+ * v
+ * +--------------+ +--------------+
+ * | sub-bpobj |----------------------> | subsubobj |
+ * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+
+ * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj |
+ * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+
+ *
+ *
+ * In a common case, the subobj is small: its bp's and its list of subobj's
+ * are each stored in a single block. In this case we copy the subobj's
+ * contents to the parent:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj |
+ * +----+----+----+----+----+ +-----+-----+-----+
+ * ^ ^
+ * +--------------+ | +--------------+ |
+ * | sub-bpobj |---------^------------> | subsubobj | ^
+ * +----+----+----+ | +-----+-----+--+ |
+ * | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/
+ * +----+----+ +-----+-----+
+ *
+ * Result: subobj destroyed, contents copied to parent:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+
+ * | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ |
+ * +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+
+ *
+ *
+ * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
+ * but retain the sub-bpobj:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj |
+ * +----+----+----+----+----+ +-----+-----+-----+
+ * ^
+ * +--------------+ +--------------+ |
+ * | sub-bpobj |----------------------> | subsubobj | ^
+ * +----+----+----+----+---------+----+ +-----+-----+--+ |
+ * | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/
+ * +----+----+----+----+---------+----+ +-----+-----+
+ *
+ * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
+ * +--------------+ +--------------+
+ * | bpobj |-------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* |
+ * +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+
+ * |
+ * /--------------------------------------------------------------/
+ * v
+ * +--------------+
+ * | sub-bpobj |
+ * +----+----+----+----+---------+----+
+ * | bp | bp | bp | bp | ... | bp |
+ * +----+----+----+----+---------+----+
+ */
+void
+bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
+{
+ bpobj_t subbpo;
+ uint64_t used, comp, uncomp, subsubobjs;
+ boolean_t copy_subsub = B_TRUE;
+ boolean_t copy_bps = B_TRUE;
+
+ ASSERT(bpobj_is_open(bpo));
+ ASSERT(subobj != 0);
+ ASSERT(bpo->bpo_havesubobj);
+ ASSERT(bpo->bpo_havecomp);
+ ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+ if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
+ bpobj_decr_empty(bpo->bpo_os, tx);
+ return;
+ }
+
+ VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
+ VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
+
+ if (bpobj_is_empty(&subbpo)) {
+ /* No point in having an empty subobj. */
+ bpobj_close(&subbpo);
+ bpobj_free(bpo->bpo_os, subobj, tx);
+ return;
+ }
+
+ mutex_enter(&bpo->bpo_lock);
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+ dmu_object_info_t doi;
+
+ if (bpo->bpo_phys->bpo_subobjs != 0) {
+ ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ &doi));
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+ }
+
+ /*
+ * If subobj has only one block of subobjs, then move subobj's
+ * subobjs to bpo's subobj list directly. This reduces recursion in
+ * bpobj_iterate due to nested subobjs.
+ */
+ subsubobjs = subbpo.bpo_phys->bpo_subobjs;
+ if (subsubobjs != 0) {
+ VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
+ if (doi.doi_max_offset > doi.doi_data_block_size) {
+ copy_subsub = B_FALSE;
+ }
+ }
+
+ /*
+ * If, in addition to having only one block of subobj's, subobj has
+ * only one block of bp's, then move subobj's bp's to bpo's bp list
+ * directly. This reduces recursion in bpobj_iterate due to nested
+ * subobjs.
+ */
+ VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
+ if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
+ copy_bps = B_FALSE;
+ }
+
+ if (copy_subsub && subsubobjs != 0) {
+ dmu_buf_t *subdb;
+ uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
+
+ VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
+ 0, FTAG, &subdb, 0));
+ /*
+ * Make sure that we are not asking dmu_write()
+ * to write more data than we have in our buffer.
+ */
+ VERIFY3U(subdb->db_size, >=,
+ numsubsub * sizeof (subobj));
+ if (bpo->bpo_phys->bpo_subobjs == 0) {
+ bpo->bpo_phys->bpo_subobjs =
+ dmu_object_alloc(bpo->bpo_os,
+ DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
+ }
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ numsubsub * sizeof (subobj), subdb->db_data, tx);
+ dmu_buf_rele(subdb, FTAG);
+ bpo->bpo_phys->bpo_num_subobjs += numsubsub;
+
+ dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
+ subbpo.bpo_phys->bpo_subobjs = 0;
+ VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
+ }
+
+ if (copy_bps) {
+ dmu_buf_t *bps;
+ uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;
+
+ ASSERT(copy_subsub);
+ VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
+ 0, FTAG, &bps, 0));
+
+ /*
+ * Make sure that we are not asking dmu_write()
+ * to write more data than we have in our buffer.
+ */
+ VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
+ dmu_write(bpo->bpo_os, bpo->bpo_object,
+ bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
+ numbps * sizeof (blkptr_t),
+ bps->db_data, tx);
+ dmu_buf_rele(bps, FTAG);
+ bpo->bpo_phys->bpo_num_blkptrs += numbps;
+
+ bpobj_close(&subbpo);
+ VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
+ } else {
+ bpobj_close(&subbpo);
+ if (bpo->bpo_phys->bpo_subobjs == 0) {
+ bpo->bpo_phys->bpo_subobjs =
+ dmu_object_alloc(bpo->bpo_os,
+ DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
+ }
+
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ sizeof (subobj), &subobj, tx);
+ bpo->bpo_phys->bpo_num_subobjs++;
+ }
+
+ bpo->bpo_phys->bpo_bytes += used;
+ bpo->bpo_phys->bpo_comp += comp;
+ bpo->bpo_phys->bpo_uncomp += uncomp;
+ mutex_exit(&bpo->bpo_lock);
+}
+
+void
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ blkptr_t stored_bp = *bp;
+ uint64_t offset;
+ int blkoff;
+ blkptr_t *bparray;
+
+ ASSERT(bpobj_is_open(bpo));
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+ if (BP_IS_EMBEDDED(bp)) {
+ /*
+ * The bpobj will compress better without the payload.
+ *
+ * Note that we store EMBEDDED bp's because they have an
+ * uncompressed size, which must be accounted for. An
+ * alternative would be to add their size to bpo_uncomp
+ * without storing the bp, but that would create additional
+ * complications: bpo_uncomp would be inconsistent with the
+ * set of BP's stored, and bpobj_iterate() wouldn't visit
+ * all the space accounted for in the bpobj.
+ */
+ bzero(&stored_bp, sizeof (stored_bp));
+ stored_bp.blk_prop = bp->blk_prop;
+ stored_bp.blk_birth = bp->blk_birth;
+ } else if (!BP_GET_DEDUP(bp)) {
+ /* The bpobj will compress better without the checksum */
+ bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+ }
+
+ stored_bp.blk_fill = 0;
+ BP_SET_FREE(&stored_bp, bp_freed);
+
+ mutex_enter(&bpo->bpo_lock);
+
+ offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
+ blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
+
+ if (bpo->bpo_cached_dbuf == NULL ||
+ offset < bpo->bpo_cached_dbuf->db_offset ||
+ offset >= bpo->bpo_cached_dbuf->db_offset +
+ bpo->bpo_cached_dbuf->db_size) {
+ if (bpo->bpo_cached_dbuf)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+ offset, bpo, &bpo->bpo_cached_dbuf, 0));
+ }
+
+ dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
+ bparray = bpo->bpo_cached_dbuf->db_data;
+ bparray[blkoff] = stored_bp;
+
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ bpo->bpo_phys->bpo_num_blkptrs++;
+ int sign = bp_freed ? -1 : +1;
+ bpo->bpo_phys->bpo_bytes += sign *
+ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+ if (bpo->bpo_havecomp) {
+ bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
+ bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
+ }
+ if (bp_freed) {
+ ASSERT(bpo->bpo_havefreed);
+ bpo->bpo_phys->bpo_num_freed++;
+ }
+ mutex_exit(&bpo->bpo_lock);
+}
+
+struct space_range_arg {
+ spa_t *spa;
+ uint64_t mintxg;
+ uint64_t maxtxg;
+ uint64_t used;
+ uint64_t comp;
+ uint64_t uncomp;
+};
+
+/* ARGSUSED */
+static int
+space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
+{
+ struct space_range_arg *sra = arg;
+
+ if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+ if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
+ sra->used += bp_get_dsize_sync(sra->spa, bp);
+ else
+ sra->used += bp_get_dsize(sra->spa, bp);
+ sra->comp += BP_GET_PSIZE(bp);
+ sra->uncomp += BP_GET_UCSIZE(bp);
+ }
+ return (0);
+}
+
+int
+bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ ASSERT(bpobj_is_open(bpo));
+ mutex_enter(&bpo->bpo_lock);
+
+ *usedp = bpo->bpo_phys->bpo_bytes;
+ if (bpo->bpo_havecomp) {
+ *compp = bpo->bpo_phys->bpo_comp;
+ *uncompp = bpo->bpo_phys->bpo_uncomp;
+ mutex_exit(&bpo->bpo_lock);
+ return (0);
+ } else {
+ mutex_exit(&bpo->bpo_lock);
+ return (bpobj_space_range(bpo, 0, UINT64_MAX,
+ usedp, compp, uncompp));
+ }
+}
+
+/*
+ * Return the amount of space in the bpobj which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ struct space_range_arg sra = { 0 };
+ int err;
+
+ ASSERT(bpobj_is_open(bpo));
+
+ /*
+ * As an optimization, if they want the whole txg range, just
+ * get bpo_bytes rather than iterating over the bps.
+ */
+ if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
+ return (bpobj_space(bpo, usedp, compp, uncompp));
+
+ sra.spa = dmu_objset_spa(bpo->bpo_os);
+ sra.mintxg = mintxg;
+ sra.maxtxg = maxtxg;
+
+ err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
+ *usedp = sra.used;
+ *compp = sra.comp;
+ *uncompp = sra.uncomp;
+ return (err);
+}
+
+/*
+ * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
+ * bpobj are designated as free or allocated that information is not preserved
+ * in bplists.
+ */
+/* ARGSUSED */
+int
+bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ bplist_t *bpl = arg;
+ bplist_append(bpl, bp);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/bptree.c b/sys/contrib/openzfs/module/zfs/bptree.c
new file mode 100644
index 000000000000..1827a3c4e326
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/bptree.c
@@ -0,0 +1,303 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/arc.h>
+#include <sys/bptree.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+
+/*
+ * A bptree is a queue of root block pointers from destroyed datasets. When a
+ * dataset is destroyed its root block pointer is put on the end of the pool's
+ * bptree queue so the dataset's blocks can be freed asynchronously by
+ * dsl_scan_sync. This allows the delete operation to finish without traversing
+ * all the dataset's blocks.
+ *
+ * Note that while bt_begin and bt_end are only ever incremented in this code,
+ * they are effectively reset to 0 every time the entire bptree is freed because
+ * the bptree's object is destroyed and re-created.
+ */
+
+struct bptree_args {
+ bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */
+ boolean_t ba_free; /* true if freeing during traversal */
+
+ bptree_itor_t *ba_func; /* function to call for each blockpointer */
+ void *ba_arg; /* caller supplied argument to ba_func */
+ dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */
+} bptree_args_t;
+
+uint64_t
+bptree_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+
+ obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+ sizeof (bptree_phys_t), tx);
+
+ /*
+ * Bonus buffer contents are already initialized to 0, but for
+ * readability we make it explicit.
+ */
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ bt = db->db_data;
+ bt->bt_begin = 0;
+ bt->bt_end = 0;
+ bt->bt_bytes = 0;
+ bt->bt_comp = 0;
+ bt->bt_uncomp = 0;
+ dmu_buf_rele(db, FTAG);
+
+ return (obj);
+}
+
+int
+bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ ASSERT3U(bt->bt_begin, ==, bt->bt_end);
+ ASSERT0(bt->bt_bytes);
+ ASSERT0(bt->bt_comp);
+ ASSERT0(bt->bt_uncomp);
+ dmu_buf_rele(db, FTAG);
+
+ return (dmu_object_free(os, obj, tx));
+}
+
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+ boolean_t rv;
+
+ VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ rv = (bt->bt_begin == bt->bt_end);
+ dmu_buf_rele(db, FTAG);
+ return (rv);
+}
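Because bt_begin only catches up with bt_end once every queued root has been fully traversed and freed, a caller that wants to reclaim the object can gate the free on emptiness. A minimal, hypothetical helper (not part of the patch; the name is invented) built from the two routines above:

/* Hypothetical helper: destroy a bptree object only once it is drained. */
static int
bptree_free_if_empty_example(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
        if (!bptree_is_empty(os, obj))
                return (SET_ERROR(EBUSY));

        /* bptree_free() re-asserts begin == end and zero byte counts. */
        return (bptree_free(os, obj, tx));
}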
+
+void
+bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
+ uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+ bptree_entry_phys_t *bte;
+
+ /*
+ * bptree objects are in the pool mos, therefore they can only be
+ * modified in syncing context. Furthermore, this is only modified
+ * by the sync thread, so no locking is necessary.
+ */
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+
+ bte = kmem_zalloc(sizeof (*bte), KM_SLEEP);
+ bte->be_birth_txg = birth_txg;
+ bte->be_bp = *bp;
+ dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
+ kmem_free(bte, sizeof (*bte));
+
+ dmu_buf_will_dirty(db, tx);
+ bt->bt_end++;
+ bt->bt_bytes += bytes;
+ bt->bt_comp += comp;
+ bt->bt_uncomp += uncomp;
+ dmu_buf_rele(db, FTAG);
+}
+
+/* ARGSUSED */
+static int
+bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ int err;
+ struct bptree_args *ba = arg;
+
+ if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
+ BP_IS_REDACTED(bp))
+ return (0);
+
+ err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
+ if (err == 0 && ba->ba_free) {
+ ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
+ ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
+ ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
+ }
+ return (err);
+}
+
+/*
+ * If "free" is set:
+ * - It is assumed that "func" will be freeing the block pointers.
+ * - If "func" returns nonzero, the bookmark will be remembered and
+ * iteration will be restarted from this point on next invocation.
+ * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ * bptree_iterate will remember the bookmark, continue traversing
+ * any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
+int
+bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
+ void *arg, dmu_tx_t *tx)
+{
+ boolean_t ioerr = B_FALSE;
+ int err;
+ uint64_t i;
+ dmu_buf_t *db;
+ struct bptree_args ba;
+
+ ASSERT(!free || dmu_tx_is_syncing(tx));
+
+ err = dmu_bonus_hold(os, obj, FTAG, &db);
+ if (err != 0)
+ return (err);
+
+ if (free)
+ dmu_buf_will_dirty(db, tx);
+
+ ba.ba_phys = db->db_data;
+ ba.ba_free = free;
+ ba.ba_func = func;
+ ba.ba_arg = arg;
+ ba.ba_tx = tx;
+
+ err = 0;
+ for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
+ bptree_entry_phys_t bte;
+ int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST |
+ TRAVERSE_NO_DECRYPT;
+
+ err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
+ &bte, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ break;
+
+ if (zfs_free_leak_on_eio)
+ flags |= TRAVERSE_HARD;
+ zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
+ "bookmark %lld/%lld/%lld/%lld",
+ (longlong_t)i,
+ (longlong_t)bte.be_birth_txg,
+ (longlong_t)bte.be_zb.zb_objset,
+ (longlong_t)bte.be_zb.zb_object,
+ (longlong_t)bte.be_zb.zb_level,
+ (longlong_t)bte.be_zb.zb_blkid);
+ err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
+ bte.be_birth_txg, &bte.be_zb, flags,
+ bptree_visit_cb, &ba);
+ if (free) {
+ /*
+ * The callback has freed the visited block pointers.
+ * Record our traversal progress on disk, either by
+ * updating this record's bookmark, or by logically
+ * removing this record by advancing bt_begin.
+ */
+ if (err != 0) {
+ /* save bookmark for future resume */
+ ASSERT3U(bte.be_zb.zb_objset, ==,
+ ZB_DESTROYED_OBJSET);
+ ASSERT0(bte.be_zb.zb_level);
+ dmu_write(os, obj, i * sizeof (bte),
+ sizeof (bte), &bte, tx);
+ if (err == EIO || err == ECKSUM ||
+ err == ENXIO) {
+ /*
+ * Skip the rest of this tree and
+ * continue on to the next entry.
+ */
+ err = 0;
+ ioerr = B_TRUE;
+ } else {
+ break;
+ }
+ } else if (ioerr) {
+ /*
+ * This entry is finished, but there were
+ * i/o errors on previous entries, so we
+ * can't adjust bt_begin. Set this entry's
+ * be_birth_txg such that it will be
+ * treated as a no-op in future traversals.
+ */
+ bte.be_birth_txg = UINT64_MAX;
+ dmu_write(os, obj, i * sizeof (bte),
+ sizeof (bte), &bte, tx);
+ }
+
+ if (!ioerr) {
+ ba.ba_phys->bt_begin++;
+ (void) dmu_free_range(os, obj,
+ i * sizeof (bte), sizeof (bte), tx);
+ }
+ } else if (err != 0) {
+ break;
+ }
+ }
+
+ ASSERT(!free || err != 0 || ioerr ||
+ ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+
+ /* if all blocks are free there should be no used space */
+ if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+ if (zfs_free_leak_on_eio) {
+ ba.ba_phys->bt_bytes = 0;
+ ba.ba_phys->bt_comp = 0;
+ ba.ba_phys->bt_uncomp = 0;
+ }
+
+ ASSERT0(ba.ba_phys->bt_bytes);
+ ASSERT0(ba.ba_phys->bt_comp);
+ ASSERT0(ba.ba_phys->bt_uncomp);
+ }
+
+ dmu_buf_rele(db, FTAG);
+
+ return (err);
+}
diff --git a/sys/contrib/openzfs/module/zfs/bqueue.c b/sys/contrib/openzfs/module/zfs/bqueue.c
new file mode 100644
index 000000000000..22539efc4e23
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/bqueue.c
@@ -0,0 +1,155 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/bqueue.h>
+#include <sys/zfs_context.h>
+
+static inline bqueue_node_t *
+obj2node(bqueue_t *q, void *data)
+{
+ return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
+}
+
+/*
+ * Initialize a blocking queue. The maximum capacity of the queue is set to
+ * size. Types that are stored in a bqueue must contain a bqueue_node_t,
+ * and node_offset must be its offset from the start of the struct.
+ * fill_fraction is a performance tuning value; when the queue is full, any
+ * threads attempting to enqueue records will block. They will block until
+ * they're signaled, which will occur when the queue is at least 1/fill_fraction
+ * empty. Similar behavior occurs on dequeue; if the queue is empty, threads
+ * block. They will be signaled when the queue is 1/fill_fraction full, or
+ * when bqueue_flush is called. As a result, you must call bqueue_flush when
+ * you enqueue your final record on a thread, in case the dequeueing threads are
+ * currently blocked and that enqueue does not cause them to be awoken.
+ * Alternatively, this behavior can be disabled (causing signaling to happen
+ * immediately) by setting fill_fraction to any value larger than size.
+ * Return 0 on success, or -1 on failure.
+ */
+int
+bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size,
+ size_t node_offset)
+{
+ if (fill_fraction == 0) {
+ return (-1);
+ }
+ list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
+ node_offset + offsetof(bqueue_node_t, bqn_node));
+ cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
+ q->bq_node_offset = node_offset;
+ q->bq_size = 0;
+ q->bq_maxsize = size;
+ q->bq_fill_fraction = fill_fraction;
+ return (0);
+}
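A minimal, hypothetical producer/consumer sketch of the API documented above (not part of the patch; the record type, sizes, and fill fraction are invented). Each queued type embeds a bqueue_node_t, and the producer's final record is pushed with bqueue_enqueue_flush() so a blocked consumer is always woken:

/* Hypothetical record type carried through a bqueue. */
typedef struct my_record {
        bqueue_node_t   mr_node;        /* embedded queue linkage */
        uint64_t        mr_payload;
        boolean_t       mr_eos;         /* end-of-stream marker */
} my_record_t;

static void
bqueue_usage_example(void)
{
        bqueue_t q;
        my_record_t *rec;

        /* Up to 16 MB of queued records; wake consumers at 1/4 full. */
        VERIFY0(bqueue_init(&q, 4, 16 * 1024 * 1024,
            offsetof(my_record_t, mr_node)));

        rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
        rec->mr_payload = 42;
        bqueue_enqueue(&q, rec, sizeof (*rec));

        rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
        rec->mr_eos = B_TRUE;
        bqueue_enqueue_flush(&q, rec, sizeof (*rec));   /* final record */

        /* Consumer side: pop until the end-of-stream record appears. */
        for (;;) {
                my_record_t *r = bqueue_dequeue(&q);
                boolean_t done = r->mr_eos;
                kmem_free(r, sizeof (*r));
                if (done)
                        break;
        }
        bqueue_destroy(&q);
}

The ZFS send/receive code uses the same end-of-stream pattern, which is why enqueue-and-flush exists as a single call rather than a separate flush operation.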
+
+/*
+ * Destroy a blocking queue. This function asserts that there are no
+ * elements in the queue, and no one is blocked on the condition
+ * variables.
+ */
+void
+bqueue_destroy(bqueue_t *q)
+{
+ mutex_enter(&q->bq_lock);
+ ASSERT0(q->bq_size);
+ cv_destroy(&q->bq_add_cv);
+ cv_destroy(&q->bq_pop_cv);
+ list_destroy(&q->bq_list);
+ mutex_exit(&q->bq_lock);
+ mutex_destroy(&q->bq_lock);
+}
+
+static void
+bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size,
+ boolean_t flush)
+{
+ ASSERT3U(item_size, >, 0);
+ ASSERT3U(item_size, <=, q->bq_maxsize);
+ mutex_enter(&q->bq_lock);
+ obj2node(q, data)->bqn_size = item_size;
+ while (q->bq_size + item_size > q->bq_maxsize) {
+ cv_wait_sig(&q->bq_add_cv, &q->bq_lock);
+ }
+ q->bq_size += item_size;
+ list_insert_tail(&q->bq_list, data);
+ if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction)
+ cv_signal(&q->bq_pop_cv);
+ if (flush)
+ cv_broadcast(&q->bq_pop_cv);
+ mutex_exit(&q->bq_lock);
+}
+
+/*
+ * Add data to q, consuming size units of capacity. If there is insufficient
+ * capacity to consume size units, block until capacity exists. Asserts size is
+ * > 0.
+ */
+void
+bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
+{
+ bqueue_enqueue_impl(q, data, item_size, B_FALSE);
+}
+
+/*
+ * Enqueue an entry, and then flush the queue. This forces the popping threads
+ * to wake up, even if we're below the fill fraction. We have this in a single
+ * function, rather than having a separate call, because it prevents race
+ * conditions between the enqueuing thread and the dequeueing thread, in
+ * which the enqueuing thread wakes up the dequeueing thread, and that thread
+ * then destroys the condvar before the enqueuing thread is done.
+ */
+void
+bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size)
+{
+ bqueue_enqueue_impl(q, data, item_size, B_TRUE);
+}
+
+/*
+ * Take the first element off of q. If there are no elements on the queue, wait
+ * until one is put there. Return the removed element.
+ */
+void *
+bqueue_dequeue(bqueue_t *q)
+{
+ void *ret = NULL;
+ uint64_t item_size;
+ mutex_enter(&q->bq_lock);
+ while (q->bq_size == 0) {
+ cv_wait_sig(&q->bq_pop_cv, &q->bq_lock);
+ }
+ ret = list_remove_head(&q->bq_list);
+ ASSERT3P(ret, !=, NULL);
+ item_size = obj2node(q, ret)->bqn_size;
+ q->bq_size -= item_size;
+ if (q->bq_size <= q->bq_maxsize - (q->bq_maxsize / q->bq_fill_fraction))
+ cv_signal(&q->bq_add_cv);
+ mutex_exit(&q->bq_lock);
+ return (ret);
+}
+
+/*
+ * Returns true if the space used is 0.
+ */
+boolean_t
+bqueue_empty(bqueue_t *q)
+{
+ return (q->bq_size == 0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/btree.c b/sys/contrib/openzfs/module/zfs/btree.c
new file mode 100644
index 000000000000..57b9dbbb2b50
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/btree.c
@@ -0,0 +1,2124 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/btree.h>
+#include <sys/bitops.h>
+#include <sys/zfs_context.h>
+
+kmem_cache_t *zfs_btree_leaf_cache;
+
+/*
+ * Control the extent of the verification that occurs when zfs_btree_verify is
+ * called. Primarily used for debugging when extending the btree logic and
+ * functionality. As the intensity is increased, new verification steps are
+ * added. These steps are cumulative; intensity = 3 includes the intensity = 1
+ * and intensity = 2 steps as well.
+ *
+ * Intensity 1: Verify that the tree's height is consistent throughout.
+ * Intensity 2: Verify that a core node's children's parent pointers point
+ * to the core node.
+ * Intensity 3: Verify that the total number of elements in the tree matches the
+ * sum of the number of elements in each node. Also verifies that each node's
+ * count obeys the invariants (less than or equal to maximum value, greater than
+ * or equal to half the maximum minus one).
+ * Intensity 4: Verify that each element compares less than the element
+ * immediately after it and greater than the one immediately before it using the
+ * comparator function. For core nodes, also checks that each element is greater
+ * than the last element in the first of the two nodes it separates, and less
+ * than the first element in the second of the two nodes.
+ * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
+ * of each node is poisoned appropriately. Note that poisoning always occurs if
+ * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
+ * operation.
+ *
+ * Intensity 4 and 5 are particularly expensive to perform; the previous levels
+ * are a few memory operations per node, while these levels require multiple
+ * operations per element. In addition, when creating large btrees, these
+ * operations are called at every step, resulting in extremely slow operation
+ * (while the asymptotic complexity of the other steps is the same, the
+ * importance of the constant factors cannot be denied).
+ */
+int zfs_btree_verify_intensity = 0;
+
+/*
+ * A convenience function to silence warnings from memmove's return value and
+ * change argument order to src, dest.
+ */
+static void
+bmov(const void *src, void *dest, size_t size)
+{
+ (void) memmove(dest, src, size);
+}
+
+#ifdef _ILP32
+#define BTREE_POISON 0xabadb10c
+#else
+#define BTREE_POISON 0xabadb10cdeadbeef
+#endif
+
+static void
+zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f,
+ BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) -
+ hdr->bth_count * size);
+ } else {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) {
+ node->btc_children[i] =
+ (zfs_btree_hdr_t *)BTREE_POISON;
+ }
+ (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
+ (BTREE_CORE_ELEMS - hdr->bth_count) * size);
+ }
+#endif
+}
+
+static inline void
+zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ uint64_t offset)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ ASSERT3U(offset, >=, hdr->bth_count);
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ (void) memset(leaf->btl_elems + offset * size, 0x0f, size);
+ } else {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ node->btc_children[offset + 1] =
+ (zfs_btree_hdr_t *)BTREE_POISON;
+ (void) memset(node->btc_elems + offset * size, 0x0f, size);
+ }
+#endif
+}
+
+static inline void
+zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ uint64_t offset)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ uint8_t eval = 0x0f;
+ if (hdr->bth_core) {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
+ VERIFY3P(node->btc_children[offset + 1], ==, cval);
+ for (int i = 0; i < size; i++)
+ VERIFY3U(node->btc_elems[offset * size + i], ==, eval);
+ } else {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ for (int i = 0; i < size; i++)
+ VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval);
+ }
+#endif
+}
+
+void
+zfs_btree_init(void)
+{
+ zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
+ BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL,
+ NULL, 0);
+}
+
+void
+zfs_btree_fini(void)
+{
+ kmem_cache_destroy(zfs_btree_leaf_cache);
+}
+
+void
+zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
+ size_t size)
+{
+ /*
+ * We need a minimum of 4 elements so that when we split a node we
+ * always have at least two elements in each node. This simplifies the
+ * logic in zfs_btree_bulk_finish, since it means the last leaf will
+ * always have a left sibling to share with (unless it's the root).
+ */
+ ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4);
+
+ bzero(tree, sizeof (*tree));
+ tree->bt_compar = compar;
+ tree->bt_elem_size = size;
+ tree->bt_height = -1;
+ tree->bt_bulk = NULL;
+}
+
+/*
+ * Find value in the array of elements provided. Uses a simple binary search.
+ */
+static void *
+zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems,
+ const void *value, zfs_btree_index_t *where)
+{
+ uint64_t max = nelems;
+ uint64_t min = 0;
+ while (max > min) {
+ uint64_t idx = (min + max) / 2;
+ uint8_t *cur = buf + idx * tree->bt_elem_size;
+ int comp = tree->bt_compar(cur, value);
+ if (comp == -1) {
+ min = idx + 1;
+ } else if (comp == 1) {
+ max = idx;
+ } else {
+ ASSERT0(comp);
+ where->bti_offset = idx;
+ where->bti_before = B_FALSE;
+ return (cur);
+ }
+ }
+
+ where->bti_offset = max;
+ where->bti_before = B_TRUE;
+ return (NULL);
+}
+
+/*
+ * Find the given value in the tree. where may be passed as null to use as a
+ * membership test or if the btree is being used as a map.
+ */
+void *
+zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
+{
+ if (tree->bt_height == -1) {
+ if (where != NULL) {
+ where->bti_node = NULL;
+ where->bti_offset = 0;
+ }
+ ASSERT0(tree->bt_num_elems);
+ return (NULL);
+ }
+
+ /*
+ * If we're in bulk-insert mode, we check the last spot in the tree
+ * and the last leaf in the tree before doing the normal search,
+ * because for most workloads the vast majority of finds in
+ * bulk-insert mode are to insert new elements.
+ */
+ zfs_btree_index_t idx;
+ if (tree->bt_bulk != NULL) {
+ zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
+ int compar = tree->bt_compar(last_leaf->btl_elems +
+ ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size),
+ value);
+ if (compar < 0) {
+ /*
+ * If what they're looking for is after the last
+ * element, it's not in the tree.
+ */
+ if (where != NULL) {
+ where->bti_node = (zfs_btree_hdr_t *)last_leaf;
+ where->bti_offset =
+ last_leaf->btl_hdr.bth_count;
+ where->bti_before = B_TRUE;
+ }
+ return (NULL);
+ } else if (compar == 0) {
+ if (where != NULL) {
+ where->bti_node = (zfs_btree_hdr_t *)last_leaf;
+ where->bti_offset =
+ last_leaf->btl_hdr.bth_count - 1;
+ where->bti_before = B_FALSE;
+ }
+ return (last_leaf->btl_elems +
+ ((last_leaf->btl_hdr.bth_count - 1) *
+ tree->bt_elem_size));
+ }
+ if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) {
+ /*
+ * If what they're looking for is after the first
+ * element in the last leaf, it's in the last leaf or
+ * it's not in the tree.
+ */
+ void *d = zfs_btree_find_in_buf(tree,
+ last_leaf->btl_elems, last_leaf->btl_hdr.bth_count,
+ value, &idx);
+
+ if (where != NULL) {
+ idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
+ *where = idx;
+ }
+ return (d);
+ }
+ }
+
+ zfs_btree_core_t *node = NULL;
+ uint64_t child = 0;
+ uint64_t depth = 0;
+
+ /*
+ * Iterate down the tree, finding which child the value should be in
+ * by comparing with the separators.
+ */
+ for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
+ node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
+ ASSERT3P(node, !=, NULL);
+ void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
+ node->btc_hdr.bth_count, value, &idx);
+ EQUIV(d != NULL, !idx.bti_before);
+ if (d != NULL) {
+ if (where != NULL) {
+ idx.bti_node = (zfs_btree_hdr_t *)node;
+ *where = idx;
+ }
+ return (d);
+ }
+ ASSERT(idx.bti_before);
+ child = idx.bti_offset;
+ }
+
+ /*
+ * The value is in this leaf, or it would be if it were in the
+ * tree. Find its proper location and return it.
+ */
+ zfs_btree_leaf_t *leaf = (depth == 0 ?
+ (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
+ void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems,
+ leaf->btl_hdr.bth_count, value, &idx);
+
+ if (where != NULL) {
+ idx.bti_node = (zfs_btree_hdr_t *)leaf;
+ *where = idx;
+ }
+
+ return (d);
+}
+
+/*
+ * To explain the following functions, it is useful to understand the four
+ * kinds of shifts used in btree operation. First, a shift is a movement of
+ * elements within a node. It is used to create gaps for inserting new
+ * elements and children, or cover gaps created when things are removed. A
+ * shift has two fundamental properties, each of which can be one of two
+ * values, making four types of shifts. There is the direction of the shift
+ * (left or right) and the shape of the shift (parallelogram or isosceles
+ * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
+ * applies to shifts of core nodes.
+ *
+ * The names derive from the following imagining of the layout of a node:
+ *
+ * Elements: * * * * * * * ... * * *
+ * Children: * * * * * * * * ... * * *
+ *
+ * This layout follows from the fact that the elements act as separators
+ * between pairs of children, and that children root subtrees "below" the
+ * current node. A left and right shift are fairly self-explanatory; a left
+ * shift moves things to the left, while a right shift moves things to the
+ * right. A parallelogram shift is a shift with the same number of elements
+ * and children being moved, while a trapezoid shift is a shift that moves one
+ * more child than elements. An example follows:
+ *
+ * A parallelogram shift could contain the following:
+ * _______________
+ * \* * * * \ * * * ... * * *
+ * * \ * * * *\ * * * ... * * *
+ * ---------------
+ * A trapezoid shift could contain the following:
+ * ___________
+ * * / * * * \ * * * ... * * *
+ * * / * * * *\ * * * ... * * *
+ * ---------------
+ *
+ * Note that a parallelogram shift is always shaped like a "left-leaning"
+ * parallelogram, where the starting index of the children being moved is
+ * always one higher than the starting index of the elements being moved. No
+ * "right-leaning" parallelogram shifts are needed (shifts where the starting
+ * element index and starting child index being moved are the same) to achieve
+ * any btree operations, so we ignore them.
+ */
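+
+/*
+ * A worked example with hypothetical indices, in terms of bt_shift_core()
+ * below: for a left shift with idx = 5, count = 3, and off = 1, elements
+ * 5..7 move to slots 4..6 under either shape. A trapezoid shift also moves
+ * children 5..8 to slots 4..7 (count + 1 children, starting at idx), while a
+ * parallelogram shift moves children 6..8 to slots 5..7 (count children,
+ * starting one past idx).
+ */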
+
+enum bt_shift_shape {
+ BSS_TRAPEZOID,
+ BSS_PARALLELOGRAM
+};
+
+enum bt_shift_direction {
+ BSD_LEFT,
+ BSD_RIGHT
+};
+
+/*
+ * Shift elements and children in the provided core node by off spots. The
+ * first element moved is idx, and count elements are moved. The shape of the
+ * shift is determined by shape. The direction is determined by dir.
+ */
+static inline void
+bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+ uint64_t count, uint64_t off, enum bt_shift_shape shape,
+ enum bt_shift_direction dir)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(node->btc_hdr.bth_core);
+
+ uint8_t *e_start = node->btc_elems + idx * size;
+ int sign = (dir == BSD_LEFT ? -1 : +1);
+ uint8_t *e_out = e_start + sign * off * size;
+ uint64_t e_count = count;
+ bmov(e_start, e_out, e_count * size);
+
+ zfs_btree_hdr_t **c_start = node->btc_children + idx +
+ (shape == BSS_TRAPEZOID ? 0 : 1);
+ zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
+ c_start + off);
+ uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+ bmov(c_start, c_out, c_count * sizeof (*c_start));
+}
+
+/*
+ * Shift elements and children in the provided core node left by one spot.
+ * The first element moved is idx, and count elements are moved. The shape of
+ * the shift is determined by shape.
+ */
+static inline void
+bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+ uint64_t count, enum bt_shift_shape shape)
+{
+ bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
+}
+
+/*
+ * Shift elements and children in the provided core node right by one spot.
+ * The first element moved is idx, and count elements are moved. The shape of
+ * the shift is determined by shape.
+ */
+static inline void
+bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+ uint64_t count, enum bt_shift_shape shape)
+{
+ bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
+}
+
+/*
+ * Shift elements in the provided leaf node by off spots. The first element
+ * moved is idx, and count elements are moved. The direction is determined
+ * by dir.
+ */
+static inline void
+bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx,
+ uint64_t count, uint64_t off, enum bt_shift_direction dir)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(!node->btl_hdr.bth_core);
+
+ uint8_t *start = node->btl_elems + idx * size;
+ int sign = (dir == BSD_LEFT ? -1 : +1);
+ uint8_t *out = start + sign * off * size;
+ bmov(start, out, count * size);
+}
+
+static inline void
+bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
+ uint64_t count)
+{
+ bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT);
+}
+
+static inline void
+bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
+ uint64_t count)
+{
+ bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT);
+}
+
+/*
+ * Move children and elements from one core node to another. The shape
+ * parameter behaves the same as it does in the shift logic.
+ */
+static inline void
+bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx,
+ uint64_t count, zfs_btree_core_t *dest, uint64_t didx,
+ enum bt_shift_shape shape)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(source->btc_hdr.bth_core);
+ ASSERT(dest->btc_hdr.bth_core);
+
+ bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
+ count * size);
+
+ uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+ bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
+ dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
+ c_count * sizeof (*source->btc_children));
+}
+
+static inline void
+bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx,
+ uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(!source->btl_hdr.bth_core);
+ ASSERT(!dest->btl_hdr.bth_core);
+
+ bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size,
+ count * size);
+}
+
+/*
+ * Find the first element in the subtree rooted at hdr, return its value and
+ * put its location in where if non-null.
+ */
+static void *
+zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where)
+{
+ zfs_btree_hdr_t *node;
+
+ for (node = hdr; node->bth_core; node =
+ ((zfs_btree_core_t *)node)->btc_children[0])
+ ;
+
+ ASSERT(!node->bth_core);
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
+ if (where != NULL) {
+ where->bti_node = node;
+ where->bti_offset = 0;
+ where->bti_before = B_FALSE;
+ }
+ return (&leaf->btl_elems[0]);
+}
+
+/* Insert an element and a child into a core node at the given offset. */
+static void
+zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
+ uint64_t offset, zfs_btree_hdr_t *new_node, void *buf)
+{
+ uint64_t size = tree->bt_elem_size;
+ zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
+ ASSERT3P(par_hdr, ==, new_node->bth_parent);
+ ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ zfs_btree_verify_poison_at(tree, par_hdr,
+ par_hdr->bth_count);
+ }
+ /* Shift existing elements and children */
+ uint64_t count = par_hdr->bth_count - offset;
+ bt_shift_core_right(tree, parent, offset, count,
+ BSS_PARALLELOGRAM);
+
+ /* Insert new values */
+ parent->btc_children[offset + 1] = new_node;
+ bmov(buf, parent->btc_elems + offset * size, size);
+ par_hdr->bth_count++;
+}
+
+/*
+ * Insert new_node into the parent of old_node directly after old_node, with
+ * buf as the dividing element between the two.
+ */
+static void
+zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
+ zfs_btree_hdr_t *new_node, void *buf)
+{
+ ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
+ uint64_t size = tree->bt_elem_size;
+ zfs_btree_core_t *parent = old_node->bth_parent;
+ zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
+
+ /*
+ * If this is the root node we were splitting, we create a new root
+ * and increase the height of the tree.
+ */
+ if (parent == NULL) {
+ ASSERT3P(old_node, ==, tree->bt_root);
+ tree->bt_num_nodes++;
+ zfs_btree_core_t *new_root =
+ kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
+ size, KM_SLEEP);
+ zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
+ new_root_hdr->bth_parent = NULL;
+ new_root_hdr->bth_core = B_TRUE;
+ new_root_hdr->bth_count = 1;
+
+ old_node->bth_parent = new_node->bth_parent = new_root;
+ new_root->btc_children[0] = old_node;
+ new_root->btc_children[1] = new_node;
+ bmov(buf, new_root->btc_elems, size);
+
+ tree->bt_height++;
+ tree->bt_root = new_root_hdr;
+ zfs_btree_poison_node(tree, new_root_hdr);
+ return;
+ }
+
+ /*
+ * Since we have the new separator, binary search for where to put
+ * new_node.
+ */
+ zfs_btree_index_t idx;
+ ASSERT(par_hdr->bth_core);
+ VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ par_hdr->bth_count, buf, &idx), ==, NULL);
+ ASSERT(idx.bti_before);
+ uint64_t offset = idx.bti_offset;
+ ASSERT3U(offset, <=, par_hdr->bth_count);
+ ASSERT3P(parent->btc_children[offset], ==, old_node);
+
+ /*
+ * If the parent isn't full, shift things to accommodate our insertions
+ * and return.
+ */
+ if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
+ zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
+ return;
+ }
+
+ /*
+ * We need to split this core node into two. Currently there are
+ * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for
+ * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
+ * current node, and the others will be moved to the new core node.
+ * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
+ * will be used as the new separator in our parent, and the others
+ * will be split among the two core nodes.
+ *
+ * Usually we will split the node in half evenly, with
+ * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
+ * instead move only about a quarter of the elements (and children) to
+ * the new node. Since the average state after a long time is a 3/4
+ * full node, shortcutting directly to that state improves efficiency.
+ *
+ * We do this in two stages: first we split into two nodes, and then we
+ * reuse our existing logic to insert the new element and child.
+ */
+ uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
+ 2 : 4)) - 1, 2);
+ uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
+ ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
+ tree->bt_num_nodes++;
+ zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
+ BTREE_CORE_ELEMS * size, KM_SLEEP);
+ zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
+ new_par_hdr->bth_parent = par_hdr->bth_parent;
+ new_par_hdr->bth_core = B_TRUE;
+ new_par_hdr->bth_count = move_count;
+ zfs_btree_poison_node(tree, new_par_hdr);
+
+ par_hdr->bth_count = keep_count;
+
+ bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
+ 0, BSS_TRAPEZOID);
+
+ /* Store the new separator in a buffer. */
+ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
+ bmov(parent->btc_elems + keep_count * size, tmp_buf,
+ size);
+ zfs_btree_poison_node(tree, par_hdr);
+
+ if (offset < keep_count) {
+ /* Insert the new node into the left half */
+ zfs_btree_insert_core_impl(tree, parent, offset, new_node,
+ buf);
+
+ /*
+ * Move the new separator to the existing buffer.
+ */
+ bmov(tmp_buf, buf, size);
+ } else if (offset > keep_count) {
+ /* Insert the new node into the right half */
+ new_node->bth_parent = new_parent;
+ zfs_btree_insert_core_impl(tree, new_parent,
+ offset - keep_count - 1, new_node, buf);
+
+ /*
+ * Move the new separator to the existing buffer.
+ */
+ bmov(tmp_buf, buf, size);
+ } else {
+ /*
+ * Move the new separator into the right half, and replace it
+ * with buf. We also need to shift back the elements in the
+ * right half to accommodate new_node.
+ */
+ bt_shift_core_right(tree, new_parent, 0, move_count,
+ BSS_TRAPEZOID);
+ new_parent->btc_children[0] = new_node;
+ bmov(tmp_buf, new_parent->btc_elems, size);
+ new_par_hdr->bth_count++;
+ }
+ kmem_free(tmp_buf, size);
+ zfs_btree_poison_node(tree, par_hdr);
+
+ for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++)
+ new_parent->btc_children[i]->bth_parent = new_parent;
+
+ for (int i = 0; i <= parent->btc_hdr.bth_count; i++)
+ ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);
+
+ /*
+ * Now that the node is split, we need to insert the new node into its
+ * parent. This may cause further splitting.
+ */
+ zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
+ &new_parent->btc_hdr, buf);
+}
+
+/* Insert an element into a leaf node at the given offset. */
+static void
+zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
+ uint64_t idx, const void *value)
+{
+ uint64_t size = tree->bt_elem_size;
+ uint8_t *start = leaf->btl_elems + (idx * size);
+ zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+ uint64_t capacity __maybe_unused = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2);
+ uint64_t count = leaf->btl_hdr.bth_count - idx;
+ ASSERT3U(leaf->btl_hdr.bth_count, <, capacity);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
+ leaf->btl_hdr.bth_count);
+ }
+
+ bt_shift_leaf_right(tree, leaf, idx, count);
+ bmov(value, start, size);
+ hdr->bth_count++;
+}
+
+/* Helper function for inserting a new value into leaf at the given index. */
+static void
+zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
+ const void *value, uint64_t idx)
+{
+ uint64_t size = tree->bt_elem_size;
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2);
+
+ /*
+ * If the leaf isn't full, shift the elements after idx and insert
+ * value.
+ */
+ if (leaf->btl_hdr.bth_count != capacity) {
+ zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
+ return;
+ }
+
+ /*
+ * Otherwise, we split the leaf node into two nodes. If we're not bulk
+ * inserting, each is of size (capacity / 2). If we are bulk
+ * inserting, we move a quarter of the elements to the new node so
+ * inserts into the old node don't cause immediate splitting but the
+ * tree stays relatively dense. Since the average state after a long
+ * time is a 3/4 full node, shortcutting directly to that state
+ * improves efficiency. At the end of the bulk insertion process
+ * we'll need to go through and fix up any nodes (the last leaf and
+ * its ancestors, potentially) that are below the minimum.
+ *
+ * In either case, we're left with one extra element. The leftover
+ * element will become the new dividing element between the two nodes.
+ */
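+	/*
+	 * As a worked example with a hypothetical capacity of 100: a normal
+	 * split below uses move_count = 100/2 - 1 = 49 and keep_count = 50,
+	 * so both halves end up about half full. A bulk-insert split uses
+	 * move_count = 100/4 - 1 = 24 and keep_count = 75, leaving this leaf
+	 * about 3/4 full and the new last leaf (which becomes bt_bulk) about
+	 * 1/4 full.
+	 */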
+ uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) -
+ 1, 2);
+ uint64_t keep_count = capacity - move_count - 1;
+ ASSERT3U(capacity - move_count, >=, 2);
+ tree->bt_num_nodes++;
+ zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
+ KM_SLEEP);
+ zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
+ new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
+ new_hdr->bth_core = B_FALSE;
+ new_hdr->bth_count = move_count;
+ zfs_btree_poison_node(tree, new_hdr);
+
+ leaf->btl_hdr.bth_count = keep_count;
+
+ if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
+ tree->bt_bulk = new_leaf;
+
+ /* Copy the back part to the new leaf. */
+ bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf,
+ 0);
+
+ /* We store the new separator in a buffer we control for simplicity. */
+ uint8_t *buf = kmem_alloc(size, KM_SLEEP);
+ bmov(leaf->btl_elems + (keep_count * size), buf, size);
+ zfs_btree_poison_node(tree, &leaf->btl_hdr);
+
+ if (idx < keep_count) {
+ /* Insert into the existing leaf. */
+ zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
+ } else if (idx > keep_count) {
+ /* Insert into the new leaf. */
+ zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
+ 1, value);
+ } else {
+ /*
+ * Shift the elements in the new leaf to make room for the
+ * separator, and use the new value as the new separator.
+ */
+ bt_shift_leaf_right(tree, new_leaf, 0, move_count);
+ bmov(buf, new_leaf->btl_elems, size);
+ bmov(value, buf, size);
+ new_hdr->bth_count++;
+ }
+
+ /*
+ * Now that the node is split, we need to insert the new node into its
+	 * parent. This may cause further splitting, but only of core nodes.
+ */
+ zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
+ buf);
+ kmem_free(buf, size);
+}
+
+static uint64_t
+zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ void *buf;
+ if (hdr->bth_core) {
+ buf = ((zfs_btree_core_t *)hdr)->btc_elems;
+ } else {
+ buf = ((zfs_btree_leaf_t *)hdr)->btl_elems;
+ }
+ zfs_btree_index_t idx;
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
+ ASSERT(idx.bti_before);
+ ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
+ ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
+ return (idx.bti_offset);
+}
+
+/*
+ * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
+ * nodes may violate the invariant that non-root nodes must be at least half
+ * full. Any node violating this invariant will be the last node at its
+ * level. To restore the invariant, we take values from its left neighbor
+ * until it is half full. Such a node must have a left neighbor at its level,
+ * because the last node at a level is never also the first node unless it is
+ * the root.
+ */
+static void
+zfs_btree_bulk_finish(zfs_btree_t *tree)
+{
+ ASSERT3P(tree->bt_bulk, !=, NULL);
+ ASSERT3P(tree->bt_root, !=, NULL);
+ zfs_btree_leaf_t *leaf = tree->bt_bulk;
+ zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ uint64_t size = tree->bt_elem_size;
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2);
+
+ /*
+ * The invariant doesn't apply to the root node, if that's the only
+ * node in the tree we're done.
+ */
+ if (parent == NULL) {
+ tree->bt_bulk = NULL;
+ return;
+ }
+
+ /* First, take elements to rebalance the leaf node. */
+ if (hdr->bth_count < capacity / 2) {
+ /*
+ * First, find the left neighbor. The simplest way to do this
+ * is to call zfs_btree_prev twice; the first time finds some
+ * ancestor of this node, and the second time finds the left
+ * neighbor. The ancestor found is the lowest common ancestor
+ * of leaf and the neighbor.
+ */
+ zfs_btree_index_t idx = {
+ .bti_node = hdr,
+ .bti_offset = 0
+ };
+ VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
+ ASSERT(idx.bti_node->bth_core);
+ zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
+ uint64_t common_idx = idx.bti_offset;
+
+ VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
+ ASSERT(!idx.bti_node->bth_core);
+ zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
+ zfs_btree_hdr_t *l_hdr = idx.bti_node;
+ uint64_t move_count = (capacity / 2) - hdr->bth_count;
+ ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
+ capacity / 2);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < move_count; i++) {
+ zfs_btree_verify_poison_at(tree, hdr,
+ leaf->btl_hdr.bth_count + i);
+ }
+ }
+
+ /* First, shift elements in leaf back. */
+ bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count,
+ BSD_RIGHT);
+
+ /* Next, move the separator from the common ancestor to leaf. */
+ uint8_t *separator = common->btc_elems + (common_idx * size);
+ uint8_t *out = leaf->btl_elems + ((move_count - 1) * size);
+ bmov(separator, out, size);
+ move_count--;
+
+ /*
+ * Now we move elements from the tail of the left neighbor to
+ * fill the remaining spots in leaf.
+ */
+ bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
+ move_count, move_count, leaf, 0);
+
+ /*
+ * Finally, move the new last element in the left neighbor to
+ * the separator.
+ */
+ bmov(l_neighbor->btl_elems + (l_hdr->bth_count -
+ move_count - 1) * size, separator, size);
+
+ /* Adjust the node's counts, and we're done. */
+ l_hdr->bth_count -= move_count + 1;
+ hdr->bth_count += move_count + 1;
+
+ ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
+ ASSERT3U(hdr->bth_count, >=, capacity / 2);
+ zfs_btree_poison_node(tree, l_hdr);
+ }
+
+ /*
+ * Now we have to rebalance any ancestors of leaf that may also
+ * violate the invariant.
+ */
+ capacity = BTREE_CORE_ELEMS;
+ while (parent->btc_hdr.bth_parent != NULL) {
+ zfs_btree_core_t *cur = parent;
+ zfs_btree_hdr_t *hdr = &cur->btc_hdr;
+ parent = hdr->bth_parent;
+ /*
+ * If the invariant isn't violated, move on to the next
+ * ancestor.
+ */
+ if (hdr->bth_count >= capacity / 2)
+ continue;
+
+ /*
+ * Because the smallest number of nodes we can move when
+ * splitting is 2, we never need to worry about not having a
+ * left sibling (a sibling is a neighbor with the same parent).
+ */
+ uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+ ASSERT3U(parent_idx, >, 0);
+ zfs_btree_core_t *l_neighbor =
+ (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
+ uint64_t move_count = (capacity / 2) - hdr->bth_count;
+ ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
+ capacity / 2);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < move_count; i++) {
+ zfs_btree_verify_poison_at(tree, hdr,
+ hdr->bth_count + i);
+ }
+ }
+ /* First, shift things in the right node back. */
+ bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
+ BSS_TRAPEZOID, BSD_RIGHT);
+
+ /* Next, move the separator to the right node. */
+ uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
+ size);
+ uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
+ bmov(separator, e_out, size);
+
+ /*
+ * Now, move elements and children from the left node to the
+ * right. We move one more child than elements.
+ */
+ move_count--;
+ uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
+ bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
+ BSS_TRAPEZOID);
+
+ /*
+ * Finally, move the last element in the left node to the
+ * separator's position.
+ */
+ move_idx--;
+ bmov(l_neighbor->btc_elems + move_idx * size, separator, size);
+
+ l_neighbor->btc_hdr.bth_count -= move_count + 1;
+ hdr->bth_count += move_count + 1;
+
+ ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
+ ASSERT3U(hdr->bth_count, >=, capacity / 2);
+
+ zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);
+
+ for (int i = 0; i <= hdr->bth_count; i++)
+ cur->btc_children[i]->bth_parent = cur;
+ }
+
+ tree->bt_bulk = NULL;
+}
+
+/*
+ * Insert value into tree at the location specified by where.
+ */
+void
+zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
+ const zfs_btree_index_t *where)
+{
+ zfs_btree_index_t idx = {0};
+
+ /* If we're not inserting in the last leaf, end bulk insert mode. */
+ if (tree->bt_bulk != NULL) {
+ if (where->bti_node != &tree->bt_bulk->btl_hdr) {
+ zfs_btree_bulk_finish(tree);
+ VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
+ where = &idx;
+ }
+ }
+
+ tree->bt_num_elems++;
+ /*
+ * If this is the first element in the tree, create a leaf root node
+ * and add the value to it.
+ */
+ if (where->bti_node == NULL) {
+ ASSERT3U(tree->bt_num_elems, ==, 1);
+ ASSERT3S(tree->bt_height, ==, -1);
+ ASSERT3P(tree->bt_root, ==, NULL);
+ ASSERT0(where->bti_offset);
+
+ tree->bt_num_nodes++;
+ zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
+ KM_SLEEP);
+ tree->bt_root = &leaf->btl_hdr;
+ tree->bt_height++;
+
+ zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+ hdr->bth_parent = NULL;
+ hdr->bth_core = B_FALSE;
+ hdr->bth_count = 0;
+ zfs_btree_poison_node(tree, hdr);
+
+ zfs_btree_insert_into_leaf(tree, leaf, value, 0);
+ tree->bt_bulk = leaf;
+ } else if (!where->bti_node->bth_core) {
+ /*
+ * If we're inserting into a leaf, go directly to the helper
+ * function.
+ */
+ zfs_btree_insert_into_leaf(tree,
+ (zfs_btree_leaf_t *)where->bti_node, value,
+ where->bti_offset);
+ } else {
+ /*
+ * If we're inserting into a core node, we can't just shift
+ * the existing element in that slot in the same node without
+ * breaking our ordering invariants. Instead we place the new
+ * value in the node at that spot and then insert the old
+ * separator into the first slot in the subtree to the right.
+ */
+ ASSERT(where->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;
+
+ /*
+ * We can ignore bti_before, because either way the value
+ * should end up in bti_offset.
+ */
+ uint64_t off = where->bti_offset;
+ zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
+ size_t size = tree->bt_elem_size;
+ uint8_t *buf = kmem_alloc(size, KM_SLEEP);
+ bmov(node->btc_elems + off * size, buf, size);
+ bmov(value, node->btc_elems + off * size, size);
+
+ /*
+ * Find the first slot in the subtree to the right, insert
+ * there.
+ */
+ zfs_btree_index_t new_idx;
+ VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL);
+ ASSERT0(new_idx.bti_offset);
+ ASSERT(!new_idx.bti_node->bth_core);
+ zfs_btree_insert_into_leaf(tree,
+ (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
+ kmem_free(buf, size);
+ }
+ zfs_btree_verify(tree);
+}
+
+/*
+ * Return the first element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+ if (tree->bt_height == -1) {
+ ASSERT0(tree->bt_num_elems);
+ return (NULL);
+ }
+ return (zfs_btree_first_helper(tree->bt_root, where));
+}
+
+/*
+ * Find the last element in the subtree rooted at hdr, return its value and
+ * put its location in where if non-null.
+ */
+static void *
+zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
+ zfs_btree_index_t *where)
+{
+ zfs_btree_hdr_t *node;
+
+ for (node = hdr; node->bth_core; node =
+ ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
+ ;
+
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
+ if (where != NULL) {
+ where->bti_node = node;
+ where->bti_offset = node->bth_count - 1;
+ where->bti_before = B_FALSE;
+ }
+ return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size);
+}
+
+/*
+ * Return the last element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+ if (tree->bt_height == -1) {
+ ASSERT0(tree->bt_num_elems);
+ return (NULL);
+ }
+ return (zfs_btree_last_helper(tree, tree->bt_root, where));
+}
+
+/*
+ * This function contains the logic to find the next element in the tree. A
+ * helper function is used because there are multiple internal consumers of
+ * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
+ * node after we've finished with it.
+ */
+static void *
+zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+ zfs_btree_index_t *out_idx,
+ void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
+{
+ if (idx->bti_node == NULL) {
+ ASSERT3S(tree->bt_height, ==, -1);
+ return (NULL);
+ }
+
+ uint64_t offset = idx->bti_offset;
+ if (!idx->bti_node->bth_core) {
+ /*
+ * When finding the next element of an element in a leaf,
+		 * there are two cases. If the element isn't the last one in
+		 * the leaf, we just return the next element in the leaf.
+		 * Otherwise, we need to traverse up our parents
+ * until we find one where our ancestor isn't the last child
+ * of its parent. Once we do, the next element is the
+ * separator after our ancestor in its parent.
+ */
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+ uint64_t new_off = offset + (idx->bti_before ? 0 : 1);
+ if (leaf->btl_hdr.bth_count > new_off) {
+ out_idx->bti_node = &leaf->btl_hdr;
+ out_idx->bti_offset = new_off;
+ out_idx->bti_before = B_FALSE;
+ return (leaf->btl_elems + new_off * tree->bt_elem_size);
+ }
+
+ zfs_btree_hdr_t *prev = &leaf->btl_hdr;
+ for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
+ node != NULL; node = node->btc_hdr.bth_parent) {
+ zfs_btree_hdr_t *hdr = &node->btc_hdr;
+ ASSERT(hdr->bth_core);
+ uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+ if (done_func != NULL)
+ done_func(tree, prev);
+ if (i == hdr->bth_count) {
+ prev = hdr;
+ continue;
+ }
+ out_idx->bti_node = hdr;
+ out_idx->bti_offset = i;
+ out_idx->bti_before = B_FALSE;
+ return (node->btc_elems + i * tree->bt_elem_size);
+ }
+ if (done_func != NULL)
+ done_func(tree, prev);
+ /*
+ * We've traversed all the way up and been at the end of the
+ * node every time, so this was the last element in the tree.
+ */
+ return (NULL);
+ }
+
+ /* If we were before an element in a core node, return that element. */
+ ASSERT(idx->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+ if (idx->bti_before) {
+ out_idx->bti_before = B_FALSE;
+ return (node->btc_elems + offset * tree->bt_elem_size);
+ }
+
+ /*
+ * The next element from one in a core node is the first element in
+ * the subtree just to the right of the separator.
+ */
+ zfs_btree_hdr_t *child = node->btc_children[offset + 1];
+ return (zfs_btree_first_helper(child, out_idx));
+}
+
+/*
+ * Return the next element in the tree. The same address can be safely
+ * passed for idx and out_idx.
+ */
+void *
+zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+ zfs_btree_index_t *out_idx)
+{
+ return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
+}
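+
+/*
+ * An illustrative iteration sketch (my_tree, my_elem_t, and do_something are
+ * hypothetical); the same index can be reused for each step:
+ *
+ *	zfs_btree_index_t where;
+ *	my_elem_t *e;
+ *
+ *	for (e = zfs_btree_first(&my_tree, &where); e != NULL;
+ *	    e = zfs_btree_next(&my_tree, &where, &where))
+ *		do_something(e);
+ */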
+
+/*
+ * Return the previous element in the tree. The same address can be safely
+ * passed for idx and out_idx.
+ */
+void *
+zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+ zfs_btree_index_t *out_idx)
+{
+ if (idx->bti_node == NULL) {
+ ASSERT3S(tree->bt_height, ==, -1);
+ return (NULL);
+ }
+
+ uint64_t offset = idx->bti_offset;
+ if (!idx->bti_node->bth_core) {
+ /*
+ * When finding the previous element of an element in a leaf,
+		 * there are two cases. If the element isn't the first one in
+		 * the leaf, we just return the previous element in the leaf.
+		 * Otherwise, we need to traverse up our parents until we find
+		 * an ancestor that isn't the first child of its parent. Once
+		 * we do, the previous element is the separator immediately to
+		 * the left of that ancestor in its parent.
+ */
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+ if (offset != 0) {
+ out_idx->bti_node = &leaf->btl_hdr;
+ out_idx->bti_offset = offset - 1;
+ out_idx->bti_before = B_FALSE;
+ return (leaf->btl_elems + (offset - 1) *
+ tree->bt_elem_size);
+ }
+ zfs_btree_hdr_t *prev = &leaf->btl_hdr;
+ for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
+ node != NULL; node = node->btc_hdr.bth_parent) {
+ zfs_btree_hdr_t *hdr = &node->btc_hdr;
+ ASSERT(hdr->bth_core);
+ uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+ if (i == 0) {
+ prev = hdr;
+ continue;
+ }
+ out_idx->bti_node = hdr;
+ out_idx->bti_offset = i - 1;
+ out_idx->bti_before = B_FALSE;
+ return (node->btc_elems + (i - 1) * tree->bt_elem_size);
+ }
+ /*
+ * We've traversed all the way up and been at the start of the
+ * node every time, so this was the first node in the tree.
+ */
+ return (NULL);
+ }
+
+ /*
+ * The previous element from one in a core node is the last element in
+ * the subtree just to the left of the separator.
+ */
+ ASSERT(idx->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+ zfs_btree_hdr_t *child = node->btc_children[offset];
+ return (zfs_btree_last_helper(tree, child, out_idx));
+}
+
+/*
+ * Get the value at the provided index in the tree.
+ *
+ * Note that the value returned from this function can be mutated, but only
+ * if it will not change the ordering of the element with respect to any other
+ * elements that could be in the tree.
+ */
+void *
+zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
+{
+ ASSERT(!idx->bti_before);
+ if (!idx->bti_node->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+ return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size);
+ }
+ ASSERT(idx->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+ return (node->btc_elems + idx->bti_offset * tree->bt_elem_size);
+}
+
+/* Add the given value to the tree. Must not already be in the tree. */
+void
+zfs_btree_add(zfs_btree_t *tree, const void *node)
+{
+ zfs_btree_index_t where = {0};
+ VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
+ zfs_btree_add_idx(tree, node, &where);
+}
+
+/* Helper function to free a tree node. */
+static void
+zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
+{
+ tree->bt_num_nodes--;
+ if (!node->bth_core) {
+ kmem_cache_free(zfs_btree_leaf_cache, node);
+ } else {
+ kmem_free(node, sizeof (zfs_btree_core_t) +
+ BTREE_CORE_ELEMS * tree->bt_elem_size);
+ }
+}
+
+/*
+ * Remove the rm_hdr and the separator to its left from the parent node. The
+ * buffer that rm_hdr was stored in may already be freed, so its contents
+ * cannot be accessed.
+ */
+static void
+zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
+ zfs_btree_hdr_t *rm_hdr)
+{
+ size_t size = tree->bt_elem_size;
+ uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
+ zfs_btree_hdr_t *hdr = &node->btc_hdr;
+ /*
+ * If the node is the root node and rm_hdr is one of two children,
+ * promote the other child to the root.
+ */
+ if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
+ ASSERT3U(hdr->bth_count, ==, 1);
+ ASSERT3P(tree->bt_root, ==, node);
+ ASSERT3P(node->btc_children[1], ==, rm_hdr);
+ tree->bt_root = node->btc_children[0];
+ node->btc_children[0]->bth_parent = NULL;
+ zfs_btree_node_destroy(tree, hdr);
+ tree->bt_height--;
+ return;
+ }
+
+ uint64_t idx;
+ for (idx = 0; idx <= hdr->bth_count; idx++) {
+ if (node->btc_children[idx] == rm_hdr)
+ break;
+ }
+ ASSERT3U(idx, <=, hdr->bth_count);
+
+ /*
+ * If the node is the root or it has more than the minimum number of
+ * children, just remove the child and separator, and return.
+ */
+ if (hdr->bth_parent == NULL ||
+ hdr->bth_count > min_count) {
+ /*
+ * Shift the element and children to the right of rm_hdr to
+ * the left by one spot.
+ */
+ bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
+ BSS_PARALLELOGRAM);
+ hdr->bth_count--;
+ zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+ return;
+ }
+
+ ASSERT3U(hdr->bth_count, ==, min_count);
+
+ /*
+ * Now we try to take a node from a neighbor. We check left, then
+ * right. If the neighbor exists and has more than the minimum number
+ * of elements, we move the separator between us and them to our
+ * node, move their closest element (last for left, first for right)
+ * to the separator, and move their closest child to our node. Along
+ * the way we need to collapse the gap made by idx, and (for our right
+ * neighbor) the gap made by removing their first element and child.
+ *
+ * Note: this logic currently doesn't support taking from a neighbor
+ * that isn't a sibling (i.e. a neighbor with a different
+ * parent). This isn't critical functionality, but may be worth
+ * implementing in the future for completeness' sake.
+ */
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+
+ zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
+ parent->btc_children[parent_idx - 1]);
+ if (l_hdr != NULL && l_hdr->bth_count > min_count) {
+ /* We can take a node from the left neighbor. */
+ ASSERT(l_hdr->bth_core);
+ zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;
+
+ /*
+ * Start by shifting the elements and children in the current
+ * node to the right by one spot.
+ */
+ bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);
+
+ /*
+ * Move the separator between node and neighbor to the first
+ * element slot in the current node.
+ */
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ bmov(separator, node->btc_elems, size);
+
+ /* Move the last child of neighbor to our first child slot. */
+ zfs_btree_hdr_t **take_child = neighbor->btc_children +
+ l_hdr->bth_count;
+ bmov(take_child, node->btc_children, sizeof (*take_child));
+ node->btc_children[0]->bth_parent = node;
+
+ /* Move the last element of neighbor to the separator spot. */
+ uint8_t *take_elem = neighbor->btc_elems +
+ (l_hdr->bth_count - 1) * size;
+ bmov(take_elem, separator, size);
+ l_hdr->bth_count--;
+ zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+ return;
+ }
+
+ zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
+ NULL : parent->btc_children[parent_idx + 1]);
+ if (r_hdr != NULL && r_hdr->bth_count > min_count) {
+ /* We can take a node from the right neighbor. */
+ ASSERT(r_hdr->bth_core);
+ zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;
+
+ /*
+ * Shift elements in node left by one spot to overwrite rm_hdr
+ * and the separator before it.
+ */
+ bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
+ BSS_PARALLELOGRAM);
+
+ /*
+ * Move the separator between node and neighbor to the last
+ * element spot in node.
+ */
+ uint8_t *separator = parent->btc_elems + parent_idx * size;
+ bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size,
+ size);
+
+ /*
+ * Move the first child of neighbor to the last child spot in
+ * node.
+ */
+ zfs_btree_hdr_t **take_child = neighbor->btc_children;
+ bmov(take_child, node->btc_children + hdr->bth_count,
+ sizeof (*take_child));
+ node->btc_children[hdr->bth_count]->bth_parent = node;
+
+ /* Move the first element of neighbor to the separator spot. */
+ uint8_t *take_elem = neighbor->btc_elems;
+ bmov(take_elem, separator, size);
+ r_hdr->bth_count--;
+
+ /*
+ * Shift the elements and children of neighbor to cover the
+ * stolen elements.
+ */
+ bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
+ BSS_TRAPEZOID);
+ zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+ return;
+ }
+
+ /*
+ * In this case, neither of our neighbors can spare an element, so we
+ * need to merge with one of them. We prefer the left one,
+ * arbitrarily. Move the separator into the leftmost merging node
+ * (which may be us or the left neighbor), and then move the right
+ * merging node's elements. Once that's done, we go back and delete
+ * the element we're removing. Finally, go into the parent and delete
+ * the right merging node and the separator. This may cause further
+ * merging.
+ */
+ zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
+ uint64_t new_idx = idx;
+ if (l_hdr != NULL) {
+ keep_hdr = l_hdr;
+ new_rm_hdr = hdr;
+ new_idx += keep_hdr->bth_count + 1;
+ } else {
+ ASSERT3P(r_hdr, !=, NULL);
+ keep_hdr = hdr;
+ new_rm_hdr = r_hdr;
+ parent_idx++;
+ }
+
+ ASSERT(keep_hdr->bth_core);
+ ASSERT(new_rm_hdr->bth_core);
+
+ zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
+ zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) {
+ zfs_btree_verify_poison_at(tree, keep_hdr,
+ keep_hdr->bth_count + i);
+ }
+ }
+
+ /* Move the separator into the left node. */
+ uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ bmov(separator, e_out, size);
+ keep_hdr->bth_count++;
+
+ /* Move all our elements and children into the left node. */
+ bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
+ keep_hdr->bth_count, BSS_TRAPEZOID);
+
+ uint64_t old_count = keep_hdr->bth_count;
+
+ /* Update bookkeeping */
+ keep_hdr->bth_count += new_rm_hdr->bth_count;
+ ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);
+
+ /*
+ * Shift the element and children to the right of rm_hdr to
+ * the left by one spot.
+ */
+ ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
+ bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
+ BSS_PARALLELOGRAM);
+ keep_hdr->bth_count--;
+
+ /* Reparent all our children to point to the left node. */
+ zfs_btree_hdr_t **new_start = keep->btc_children +
+ old_count - 1;
+ for (int i = 0; i < new_rm_hdr->bth_count + 1; i++)
+ new_start[i]->bth_parent = keep;
+ for (int i = 0; i <= keep_hdr->bth_count; i++) {
+ ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
+ ASSERT3P(keep->btc_children[i], !=, rm_hdr);
+ }
+ zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+
+ new_rm_hdr->bth_count = 0;
+ zfs_btree_node_destroy(tree, new_rm_hdr);
+ zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
+}
+
+/* Remove the element at the specific location. */
+void
+zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+ size_t size = tree->bt_elem_size;
+ zfs_btree_hdr_t *hdr = where->bti_node;
+ uint64_t idx = where->bti_offset;
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2);
+
+ ASSERT(!where->bti_before);
+ if (tree->bt_bulk != NULL) {
+ /*
+ * Leave bulk insert mode. Note that our index would be
+ * invalid after we correct the tree, so we copy the value
+ * we're planning to remove and find it again after
+ * bulk_finish.
+ */
+ uint8_t *value = zfs_btree_get(tree, where);
+ uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
+ bmov(value, tmp, size);
+ zfs_btree_bulk_finish(tree);
+ VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
+ kmem_free(tmp, size);
+ hdr = where->bti_node;
+ idx = where->bti_offset;
+ }
+
+ tree->bt_num_elems--;
+ /*
+ * If the element happens to be in a core node, we move a leaf node's
+	 * element into its place and then remove the leaf node element. This
+	 * way the rebalance logic only needs to work upwards from a leaf,
+	 * rather than recursing both upwards and downwards.
+ */
+ if (hdr->bth_core) {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
+ void *new_value = zfs_btree_last_helper(tree, left_subtree,
+ where);
+ ASSERT3P(new_value, !=, NULL);
+
+ bmov(new_value, node->btc_elems + idx * size, size);
+
+ hdr = where->bti_node;
+ idx = where->bti_offset;
+ ASSERT(!where->bti_before);
+ }
+
+ /*
+ * First, we'll update the leaf's metadata. Then, we shift any
+ * elements after the idx to the left. After that, we rebalance if
+ * needed.
+ */
+ ASSERT(!hdr->bth_core);
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ ASSERT3U(hdr->bth_count, >, 0);
+
+ uint64_t min_count = (capacity / 2) - 1;
+
+ /*
+ * If we're over the minimum size or this is the root, just overwrite
+ * the value and return.
+ */
+ if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
+ hdr->bth_count--;
+ bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx);
+ if (hdr->bth_parent == NULL) {
+ ASSERT0(tree->bt_height);
+ if (hdr->bth_count == 0) {
+ tree->bt_root = NULL;
+ tree->bt_height--;
+ zfs_btree_node_destroy(tree, &leaf->btl_hdr);
+ }
+ }
+ if (tree->bt_root != NULL)
+ zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+ zfs_btree_verify(tree);
+ return;
+ }
+ ASSERT3U(hdr->bth_count, ==, min_count);
+
+ /*
+ * Now we try to take a node from a sibling. We check left, then
+ * right. If they exist and have more than the minimum number of
+ * elements, we move the separator between us and them to our node
+ * and move their closest element (last for left, first for right) to
+ * the separator. Along the way we need to collapse the gap made by
+ * idx, and (for our right neighbor) the gap made by removing their
+ * first element.
+ *
+ * Note: this logic currently doesn't support taking from a neighbor
+ * that isn't a sibling. This isn't critical functionality, but may be
+ * worth implementing in the future for completeness' sake.
+ */
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+
+ zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
+ parent->btc_children[parent_idx - 1]);
+ if (l_hdr != NULL && l_hdr->bth_count > min_count) {
+ /* We can take a node from the left neighbor. */
+ ASSERT(!l_hdr->bth_core);
+
+ /*
+ * Move our elements back by one spot to make room for the
+ * stolen element and overwrite the element being removed.
+ */
+ bt_shift_leaf_right(tree, leaf, 0, idx);
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems +
+ (l_hdr->bth_count - 1) * size;
+ /* Move the separator to our first spot. */
+ bmov(separator, leaf->btl_elems, size);
+
+ /* Move our neighbor's last element to the separator. */
+ bmov(take_elem, separator, size);
+
+ /* Update the bookkeeping. */
+ l_hdr->bth_count--;
+ zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+
+ zfs_btree_verify(tree);
+ return;
+ }
+
+ zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
+ NULL : parent->btc_children[parent_idx + 1]);
+ if (r_hdr != NULL && r_hdr->bth_count > min_count) {
+ /* We can take a node from the right neighbor. */
+ ASSERT(!r_hdr->bth_core);
+ zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
+
+ /*
+ * Move our elements after the element being removed forwards
+ * by one spot to make room for the stolen element and
+ * overwrite the element being removed.
+ */
+ bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx -
+ 1);
+
+ uint8_t *separator = parent->btc_elems + parent_idx * size;
+ uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems;
+		/* Move the separator to our last spot. */
+ bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size,
+ size);
+
+ /* Move our neighbor's first element to the separator. */
+ bmov(take_elem, separator, size);
+
+ /* Update the bookkeeping. */
+ r_hdr->bth_count--;
+
+ /*
+		 * Move our neighbor's elements forwards to overwrite the
+ * stolen element.
+ */
+ bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count);
+ zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+ zfs_btree_verify(tree);
+ return;
+ }
+
+ /*
+ * In this case, neither of our neighbors can spare an element, so we
+ * need to merge with one of them. We prefer the left one,
+ * arbitrarily. Move the separator into the leftmost merging node
+ * (which may be us or the left neighbor), and then move the right
+ * merging node's elements. Once that's done, we go back and delete
+ * the element we're removing. Finally, go into the parent and delete
+ * the right merging node and the separator. This may cause further
+ * merging.
+ */
+ zfs_btree_hdr_t *rm_hdr, *keep_hdr;
+ uint64_t new_idx = idx;
+ if (l_hdr != NULL) {
+ keep_hdr = l_hdr;
+ rm_hdr = hdr;
+		new_idx += keep_hdr->bth_count + 1;
+ } else {
+ ASSERT3P(r_hdr, !=, NULL);
+ keep_hdr = hdr;
+ rm_hdr = r_hdr;
+ parent_idx++;
+ }
+
+ ASSERT(!keep_hdr->bth_core);
+ ASSERT(!rm_hdr->bth_core);
+ ASSERT3U(keep_hdr->bth_count, ==, min_count);
+ ASSERT3U(rm_hdr->bth_count, ==, min_count);
+
+ zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr;
+ zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < rm_hdr->bth_count + 1; i++) {
+ zfs_btree_verify_poison_at(tree, keep_hdr,
+ keep_hdr->bth_count + i);
+ }
+ }
+ /*
+ * Move the separator into the first open spot in the left
+ * neighbor.
+ */
+ uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size;
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ bmov(separator, out, size);
+ keep_hdr->bth_count++;
+
+ /* Move our elements to the left neighbor. */
+ bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep,
+ keep_hdr->bth_count);
+
+ /* Update the bookkeeping. */
+ keep_hdr->bth_count += rm_hdr->bth_count;
+ ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1);
+
+ /* Remove the value from the node */
+ keep_hdr->bth_count--;
+ bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count -
+ new_idx);
+ zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+
+ rm_hdr->bth_count = 0;
+ zfs_btree_node_destroy(tree, rm_hdr);
+ /* Remove the emptied node from the parent. */
+ zfs_btree_remove_from_node(tree, parent, rm_hdr);
+ zfs_btree_verify(tree);
+}
+
+/* Remove the given value from the tree. */
+void
+zfs_btree_remove(zfs_btree_t *tree, const void *value)
+{
+ zfs_btree_index_t where = {0};
+ VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
+ zfs_btree_remove_idx(tree, &where);
+}
+
+/* Return the number of elements in the tree. */
+ulong_t
+zfs_btree_numnodes(zfs_btree_t *tree)
+{
+ return (tree->bt_num_elems);
+}
+
+/*
+ * This function is used to visit all the elements in the tree before
+ * destroying the tree. This allows the calling code to perform any cleanup it
+ * needs to do. This is more efficient than just removing the first element
+ * over and over, because it avoids all rebalancing. Once the destroy_nodes()
+ * function has been called, no other btree operations are valid until it
+ * returns NULL, at which point the only valid operation is
+ * zfs_btree_destroy().
+ *
+ * example:
+ *
+ * zfs_btree_index_t *cookie = NULL;
+ * my_data_t *node;
+ *
+ * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
+ * free(node->ptr);
+ * zfs_btree_destroy(tree);
+ *
+ */
+void *
+zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
+{
+ if (*cookie == NULL) {
+ if (tree->bt_height == -1)
+ return (NULL);
+ *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
+ return (zfs_btree_first(tree, *cookie));
+ }
+
+ void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
+ zfs_btree_node_destroy);
+ if (rval == NULL) {
+ tree->bt_root = NULL;
+ tree->bt_height = -1;
+ tree->bt_num_elems = 0;
+ kmem_free(*cookie, sizeof (**cookie));
+ tree->bt_bulk = NULL;
+ }
+ return (rval);
+}
+
+static void
+zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ if (hdr->bth_core) {
+ zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ zfs_btree_clear_helper(tree, btc->btc_children[i]);
+ }
+ }
+
+ zfs_btree_node_destroy(tree, hdr);
+}
+
+void
+zfs_btree_clear(zfs_btree_t *tree)
+{
+ if (tree->bt_root == NULL) {
+ ASSERT0(tree->bt_num_elems);
+ return;
+ }
+
+ zfs_btree_clear_helper(tree, tree->bt_root);
+ tree->bt_num_elems = 0;
+ tree->bt_root = NULL;
+ tree->bt_num_nodes = 0;
+ tree->bt_height = -1;
+ tree->bt_bulk = NULL;
+}
+
+void
+zfs_btree_destroy(zfs_btree_t *tree)
+{
+ ASSERT0(tree->bt_num_elems);
+ ASSERT3P(tree->bt_root, ==, NULL);
+}
+
+/* Verify that every child of this node has the correct parent pointer. */
+static void
+zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ if (!hdr->bth_core)
+ return;
+
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
+ zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
+ }
+}
+
+/* Verify that every node has the correct parent pointer. */
+static void
+zfs_btree_verify_pointers(zfs_btree_t *tree)
+{
+ if (tree->bt_height == -1) {
+ VERIFY3P(tree->bt_root, ==, NULL);
+ return;
+ }
+ VERIFY3P(tree->bt_root->bth_parent, ==, NULL);
+ zfs_btree_verify_pointers_helper(tree, tree->bt_root);
+}
+
+/*
+ * Verify that the current node and all of its children satisfy the count
+ * invariants, and return the total count in the subtree rooted in this node.
+ */
+static uint64_t
+zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ if (!hdr->bth_core) {
+ if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) {
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2);
+ VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1);
+ }
+
+ return (hdr->bth_count);
+ } else {
+
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ uint64_t ret = hdr->bth_count;
+ if (tree->bt_root != hdr && tree->bt_bulk == NULL)
+ VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ ret += zfs_btree_verify_counts_helper(tree,
+ node->btc_children[i]);
+ }
+
+ return (ret);
+ }
+}
+
+/*
+ * Verify that all nodes satisfy the invariants and that the total number of
+ * elements is correct.
+ */
+static void
+zfs_btree_verify_counts(zfs_btree_t *tree)
+{
+ EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
+ if (tree->bt_height == -1) {
+ return;
+ }
+ VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
+ tree->bt_num_elems);
+}
+
+/*
+ * Check that the subtree rooted at this node has a uniform height. Returns
+ * the number of nodes under this node, to help verify bt_num_nodes.
+ */
+static uint64_t
+zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ int64_t height)
+{
+ if (!hdr->bth_core) {
+ VERIFY0(height);
+ return (1);
+ }
+
+ VERIFY(hdr->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ uint64_t ret = 1;
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ ret += zfs_btree_verify_height_helper(tree,
+ node->btc_children[i], height - 1);
+ }
+ return (ret);
+}
+
+/*
+ * Check that the entire tree has a uniform height, and that the
+ * bt_height in the tree is correct.
+ */
+static void
+zfs_btree_verify_height(zfs_btree_t *tree)
+{
+ EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
+ if (tree->bt_height == -1) {
+ return;
+ }
+
+ VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
+ tree->bt_height), ==, tree->bt_num_nodes);
+}
+
+/*
+ * Check that the elements in this node are sorted, and that if this is a core
+ * node, the separators are properly between the subtrees they separate and
+ * that the children also satisfy this requirement.
+ */
+static void
+zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ size_t size = tree->bt_elem_size;
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ for (int i = 1; i < hdr->bth_count; i++) {
+ VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) *
+ size, leaf->btl_elems + i * size), ==, -1);
+ }
+ return;
+ }
+
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ for (int i = 1; i < hdr->bth_count; i++) {
+ VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
+ node->btc_elems + i * size), ==, -1);
+ }
+ for (int i = 0; i < hdr->bth_count; i++) {
+ uint8_t *left_child_last = NULL;
+ zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
+ if (left_child_hdr->bth_core) {
+ zfs_btree_core_t *left_child =
+ (zfs_btree_core_t *)left_child_hdr;
+ left_child_last = left_child->btc_elems +
+ (left_child_hdr->bth_count - 1) * size;
+ } else {
+ zfs_btree_leaf_t *left_child =
+ (zfs_btree_leaf_t *)left_child_hdr;
+ left_child_last = left_child->btl_elems +
+ (left_child_hdr->bth_count - 1) * size;
+ }
+ if (tree->bt_compar(node->btc_elems + i * size,
+ left_child_last) != 1) {
+ panic("btree: compar returned %d (expected 1) at "
+ "%px %d: compar(%px, %px)", tree->bt_compar(
+ node->btc_elems + i * size, left_child_last),
+ (void *)node, i, (void *)(node->btc_elems + i *
+ size), (void *)left_child_last);
+ }
+
+ uint8_t *right_child_first = NULL;
+ zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
+ if (right_child_hdr->bth_core) {
+ zfs_btree_core_t *right_child =
+ (zfs_btree_core_t *)right_child_hdr;
+ right_child_first = right_child->btc_elems;
+ } else {
+ zfs_btree_leaf_t *right_child =
+ (zfs_btree_leaf_t *)right_child_hdr;
+ right_child_first = right_child->btl_elems;
+ }
+ if (tree->bt_compar(node->btc_elems + i * size,
+ right_child_first) != -1) {
+ panic("btree: compar returned %d (expected -1) at "
+ "%px %d: compar(%px, %px)", tree->bt_compar(
+ node->btc_elems + i * size, right_child_first),
+ (void *)node, i, (void *)(node->btc_elems + i *
+ size), (void *)right_child_first);
+ }
+ }
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ zfs_btree_verify_order_helper(tree, node->btc_children[i]);
+ }
+}
+
+/* Check that all elements in the tree are in sorted order. */
+static void
+zfs_btree_verify_order(zfs_btree_t *tree)
+{
+ EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
+ if (tree->bt_height == -1) {
+ return;
+ }
+
+ zfs_btree_verify_order_helper(tree, tree->bt_root);
+}
+
+#ifdef ZFS_DEBUG
+/* Check that all unused memory is poisoned correctly. */
+static void
+zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ size_t size = tree->bt_elem_size;
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ uint8_t val = 0x0f;
+ for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t); i++) {
+ VERIFY3U(leaf->btl_elems[i], ==, val);
+ }
+ } else {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ uint8_t val = 0x0f;
+ for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size;
+ i++) {
+ VERIFY3U(node->btc_elems[i], ==, val);
+ }
+
+ for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) {
+ VERIFY3P(node->btc_children[i], ==,
+ (zfs_btree_hdr_t *)BTREE_POISON);
+ }
+
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ zfs_btree_verify_poison_helper(tree,
+ node->btc_children[i]);
+ }
+ }
+}
+#endif
+
+/* Check that unused memory in the tree is still poisoned. */
+static void
+zfs_btree_verify_poison(zfs_btree_t *tree)
+{
+#ifdef ZFS_DEBUG
+ if (tree->bt_height == -1)
+ return;
+ zfs_btree_verify_poison_helper(tree, tree->bt_root);
+#endif
+}
+
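+/*
+ * Run the verification passes selected by zfs_btree_verify_intensity. The
+ * checks are cumulative: intensity 1 verifies the tree height, 2 adds
+ * parent/child pointer checks, 3 adds the element-count invariants, 4 adds
+ * element ordering, and 5 (and above) also verifies that unused memory is
+ * still poisoned.
+ */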
+void
+zfs_btree_verify(zfs_btree_t *tree)
+{
+ if (zfs_btree_verify_intensity == 0)
+ return;
+ zfs_btree_verify_height(tree);
+ if (zfs_btree_verify_intensity == 1)
+ return;
+ zfs_btree_verify_pointers(tree);
+ if (zfs_btree_verify_intensity == 2)
+ return;
+ zfs_btree_verify_counts(tree);
+ if (zfs_btree_verify_intensity == 3)
+ return;
+ zfs_btree_verify_order(tree);
+
+ if (zfs_btree_verify_intensity == 4)
+ return;
+ zfs_btree_verify_poison(tree);
+}
diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c
new file mode 100644
index 000000000000..e46a0926d557
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c
@@ -0,0 +1,215 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2018 Datto Inc.
+ */
+
+#include <sys/dataset_kstats.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+
+static dataset_kstat_values_t empty_dataset_kstats = {
+ { "dataset_name", KSTAT_DATA_STRING },
+ { "writes", KSTAT_DATA_UINT64 },
+ { "nwritten", KSTAT_DATA_UINT64 },
+ { "reads", KSTAT_DATA_UINT64 },
+ { "nread", KSTAT_DATA_UINT64 },
+ { "nunlinks", KSTAT_DATA_UINT64 },
+ { "nunlinked", KSTAT_DATA_UINT64 },
+};
+
+static int
+dataset_kstats_update(kstat_t *ksp, int rw)
+{
+ dataset_kstats_t *dk = ksp->ks_private;
+ ASSERT3P(dk->dk_kstats->ks_data, ==, ksp->ks_data);
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
+ dkv->dkv_writes.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_writes);
+ dkv->dkv_nwritten.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_nwritten);
+ dkv->dkv_reads.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_reads);
+ dkv->dkv_nread.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_nread);
+ dkv->dkv_nunlinks.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_nunlinks);
+ dkv->dkv_nunlinked.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_nunlinked);
+
+ return (0);
+}
+
+void
+dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
+{
+ /*
+	 * There should not be anything wrong with having kstats for
+	 * snapshots. However, since we are not sure how useful they would
+	 * be, nor how much their memory overhead would matter in a
+	 * filesystem with many snapshots, we skip them for now.
+ */
+ if (dmu_objset_is_snapshot(objset))
+ return;
+
+ /*
+ * At the time of this writing, KSTAT_STRLEN is 255 in Linux,
+ * and the spa_name can theoretically be up to 256 characters.
+	 * In reality, though, the spa_name can be at most 240 characters
+	 * [see the origin directory name check in pool_namecheck()]. Thus,
+	 * the naming scheme for the module name below should not cause
+	 * any truncations. In the event that a truncation does happen,
+	 * due to some future change, we skip creating the kstat and log
+	 * the event.
+ */
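+	/*
+	 * For example (names chosen for illustration only): a dataset in a
+	 * pool called "tank" whose objset id is 0x36 is published under the
+	 * kstat module "zfs/tank" with kstat name "objset-0x36", which on
+	 * Linux typically shows up as /proc/spl/kstat/zfs/tank/objset-0x36.
+	 */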
+ char kstat_module_name[KSTAT_STRLEN];
+ int n = snprintf(kstat_module_name, sizeof (kstat_module_name),
+ "zfs/%s", spa_name(dmu_objset_spa(objset)));
+ if (n < 0) {
+ zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+ " snprintf() for kstat module name returned %d",
+ (unsigned long long)dmu_objset_id(objset), n);
+ return;
+ } else if (n >= KSTAT_STRLEN) {
+ zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+ "kstat module name length (%d) exceeds limit (%d)",
+ (unsigned long long)dmu_objset_id(objset),
+ n, KSTAT_STRLEN);
+ return;
+ }
+
+ char kstat_name[KSTAT_STRLEN];
+ n = snprintf(kstat_name, sizeof (kstat_name), "objset-0x%llx",
+ (unsigned long long)dmu_objset_id(objset));
+ if (n < 0) {
+ zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+ " snprintf() for kstat name returned %d",
+ (unsigned long long)dmu_objset_id(objset), n);
+ return;
+ }
+ ASSERT3U(n, <, KSTAT_STRLEN);
+
+ kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name,
+ "dataset", KSTAT_TYPE_NAMED,
+ sizeof (empty_dataset_kstats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (kstat == NULL)
+ return;
+
+ dataset_kstat_values_t *dk_kstats =
+ kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP);
+ bcopy(&empty_dataset_kstats, dk_kstats,
+ sizeof (empty_dataset_kstats));
+
+ char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ dsl_dataset_name(objset->os_dsl_dataset, ds_name);
+ KSTAT_NAMED_STR_PTR(&dk_kstats->dkv_ds_name) = ds_name;
+ KSTAT_NAMED_STR_BUFLEN(&dk_kstats->dkv_ds_name) =
+ ZFS_MAX_DATASET_NAME_LEN;
+
+ kstat->ks_data = dk_kstats;
+ kstat->ks_update = dataset_kstats_update;
+ kstat->ks_private = dk;
+ kstat->ks_data_size += ZFS_MAX_DATASET_NAME_LEN;
+
+ kstat_install(kstat);
+ dk->dk_kstats = kstat;
+
+ aggsum_init(&dk->dk_aggsums.das_writes, 0);
+ aggsum_init(&dk->dk_aggsums.das_nwritten, 0);
+ aggsum_init(&dk->dk_aggsums.das_reads, 0);
+ aggsum_init(&dk->dk_aggsums.das_nread, 0);
+ aggsum_init(&dk->dk_aggsums.das_nunlinks, 0);
+ aggsum_init(&dk->dk_aggsums.das_nunlinked, 0);
+}
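+
+/*
+ * Typical usage (a minimal sketch; "dk" is assumed to be embedded in some
+ * per-dataset structure, and the lifecycle hooks shown are illustrative):
+ *
+ *	dataset_kstats_create(&dk, os);			at objset setup
+ *	dataset_kstats_update_write_kstats(&dk, nwritten);	per write
+ *	dataset_kstats_update_read_kstats(&dk, nread);		per read
+ *	dataset_kstats_destroy(&dk);			at objset teardown
+ */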
+
+void
+dataset_kstats_destroy(dataset_kstats_t *dk)
+{
+ if (dk->dk_kstats == NULL)
+ return;
+
+ dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
+ kmem_free(KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name),
+ KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
+ kmem_free(dkv, sizeof (empty_dataset_kstats));
+
+ kstat_delete(dk->dk_kstats);
+ dk->dk_kstats = NULL;
+
+ aggsum_fini(&dk->dk_aggsums.das_writes);
+ aggsum_fini(&dk->dk_aggsums.das_nwritten);
+ aggsum_fini(&dk->dk_aggsums.das_reads);
+ aggsum_fini(&dk->dk_aggsums.das_nread);
+ aggsum_fini(&dk->dk_aggsums.das_nunlinks);
+ aggsum_fini(&dk->dk_aggsums.das_nunlinked);
+}
+
+void
+dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
+ int64_t nwritten)
+{
+ ASSERT3S(nwritten, >=, 0);
+
+ if (dk->dk_kstats == NULL)
+ return;
+
+ aggsum_add(&dk->dk_aggsums.das_writes, 1);
+ aggsum_add(&dk->dk_aggsums.das_nwritten, nwritten);
+}
+
+void
+dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
+ int64_t nread)
+{
+ ASSERT3S(nread, >=, 0);
+
+ if (dk->dk_kstats == NULL)
+ return;
+
+ aggsum_add(&dk->dk_aggsums.das_reads, 1);
+ aggsum_add(&dk->dk_aggsums.das_nread, nread);
+}
+
+void
+dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *dk, int64_t delta)
+{
+ if (dk->dk_kstats == NULL)
+ return;
+
+ aggsum_add(&dk->dk_aggsums.das_nunlinks, delta);
+}
+
+void
+dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *dk, int64_t delta)
+{
+ if (dk->dk_kstats == NULL)
+ return;
+
+ aggsum_add(&dk->dk_aggsums.das_nunlinked, delta);
+}
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
new file mode 100644
index 000000000000..a6cdc017cd21
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -0,0 +1,4958 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/dmu.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
+#include <sys/range_tree.h>
+#include <sys/trace_zfs.h>
+#include <sys/callb.h>
+#include <sys/abd.h>
+#include <sys/vdev.h>
+#include <cityhash.h>
+#include <sys/spa_impl.h>
+
+kstat_t *dbuf_ksp;
+
+typedef struct dbuf_stats {
+ /*
+ * Various statistics about the size of the dbuf cache.
+ */
+ kstat_named_t cache_count;
+ kstat_named_t cache_size_bytes;
+ kstat_named_t cache_size_bytes_max;
+ /*
+ * Statistics regarding the bounds on the dbuf cache size.
+ */
+ kstat_named_t cache_target_bytes;
+ kstat_named_t cache_lowater_bytes;
+ kstat_named_t cache_hiwater_bytes;
+ /*
+ * Total number of dbuf cache evictions that have occurred.
+ */
+ kstat_named_t cache_total_evicts;
+ /*
+ * The distribution of dbuf levels in the dbuf cache and
+ * the total size of all dbufs at each level.
+ */
+ kstat_named_t cache_levels[DN_MAX_LEVELS];
+ kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
+ /*
+ * Statistics about the dbuf hash table.
+ */
+ kstat_named_t hash_hits;
+ kstat_named_t hash_misses;
+ kstat_named_t hash_collisions;
+ kstat_named_t hash_elements;
+ kstat_named_t hash_elements_max;
+ /*
+ * Number of sublists containing more than one dbuf in the dbuf
+ * hash table. Keep track of the longest hash chain.
+ */
+ kstat_named_t hash_chains;
+ kstat_named_t hash_chain_max;
+ /*
+ * Number of times a dbuf_create() discovers that a dbuf was
+ * already created and in the dbuf hash table.
+ */
+ kstat_named_t hash_insert_race;
+ /*
+ * Statistics about the size of the metadata dbuf cache.
+ */
+ kstat_named_t metadata_cache_count;
+ kstat_named_t metadata_cache_size_bytes;
+ kstat_named_t metadata_cache_size_bytes_max;
+ /*
+ * For diagnostic purposes, this is incremented whenever we can't add
+ * something to the metadata cache because it's full, and instead put
+ * the data in the regular dbuf cache.
+ */
+ kstat_named_t metadata_cache_overflow;
+} dbuf_stats_t;
+
+dbuf_stats_t dbuf_stats = {
+ { "cache_count", KSTAT_DATA_UINT64 },
+ { "cache_size_bytes", KSTAT_DATA_UINT64 },
+ { "cache_size_bytes_max", KSTAT_DATA_UINT64 },
+ { "cache_target_bytes", KSTAT_DATA_UINT64 },
+ { "cache_lowater_bytes", KSTAT_DATA_UINT64 },
+ { "cache_hiwater_bytes", KSTAT_DATA_UINT64 },
+ { "cache_total_evicts", KSTAT_DATA_UINT64 },
+ { { "cache_levels_N", KSTAT_DATA_UINT64 } },
+ { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } },
+ { "hash_hits", KSTAT_DATA_UINT64 },
+ { "hash_misses", KSTAT_DATA_UINT64 },
+ { "hash_collisions", KSTAT_DATA_UINT64 },
+ { "hash_elements", KSTAT_DATA_UINT64 },
+ { "hash_elements_max", KSTAT_DATA_UINT64 },
+ { "hash_chains", KSTAT_DATA_UINT64 },
+ { "hash_chain_max", KSTAT_DATA_UINT64 },
+ { "hash_insert_race", KSTAT_DATA_UINT64 },
+ { "metadata_cache_count", KSTAT_DATA_UINT64 },
+ { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
+ { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
+ { "metadata_cache_overflow", KSTAT_DATA_UINT64 }
+};
+
+#define DBUF_STAT_INCR(stat, val) \
+ atomic_add_64(&dbuf_stats.stat.value.ui64, (val));
+#define DBUF_STAT_DECR(stat, val) \
+ DBUF_STAT_INCR(stat, -(val));
+#define DBUF_STAT_BUMP(stat) \
+ DBUF_STAT_INCR(stat, 1);
+#define DBUF_STAT_BUMPDOWN(stat) \
+ DBUF_STAT_INCR(stat, -1);
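+/*
+ * Lock-free maximum: keep retrying a compare-and-swap of v into the stat
+ * until either the current value is already at least v or the swap succeeds.
+ */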
+#define DBUF_STAT_MAX(stat, v) { \
+ uint64_t _m; \
+ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
+ (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\
+ continue; \
+}
+
+static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
+static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
+static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
+
+extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
+ dmu_buf_evict_func_t *evict_func_sync,
+ dmu_buf_evict_func_t *evict_func_async,
+ dmu_buf_t **clear_on_evict_dbufp);
+
+/*
+ * Global data structures and functions for the dbuf cache.
+ */
+static kmem_cache_t *dbuf_kmem_cache;
+static taskq_t *dbu_evict_taskq;
+
+static kthread_t *dbuf_cache_evict_thread;
+static kmutex_t dbuf_evict_lock;
+static kcondvar_t dbuf_evict_cv;
+static boolean_t dbuf_evict_thread_exit;
+
+/*
+ * There are two dbuf caches; each dbuf can only be in one of them at a time.
+ *
+ * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
+ * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
+ * that represent the metadata that describes filesystems/snapshots/
+ * bookmarks/properties/etc. We only evict from this cache when we export a
+ * pool, to short-circuit as much I/O as possible for all administrative
+ * commands that need the metadata. There is no eviction policy for this
+ * cache, because we try to only include types in it which would occupy a
+ * very small amount of space per object but create a large impact on the
+ * performance of these commands. Instead, after it reaches a maximum size
+ * (which should only happen on very small memory systems with a very large
+ * number of filesystem objects), we stop taking new dbufs into the
+ * metadata cache, instead putting them in the normal dbuf cache.
+ *
+ * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
+ * are not currently held but have been recently released. These dbufs
+ * are not eligible for arc eviction until they are aged out of the cache.
+ * Dbufs that are aged out of the cache will be immediately destroyed and
+ * become eligible for arc eviction.
+ *
+ * Dbufs are added to these caches once the last hold is released. If a dbuf is
+ * later accessed and still exists in the dbuf cache, then it will be removed
+ * from the cache and later re-added to the head of the cache.
+ *
+ * If a given dbuf meets the requirements for the metadata cache, it will go
+ * there, otherwise it will be considered for the generic LRU dbuf cache. The
+ * caches and the refcounts tracking their sizes are stored in an array indexed
+ * by those caches' matching enum values (from dbuf_cached_state_t).
+ */
+typedef struct dbuf_cache {
+ multilist_t *cache;
+ zfs_refcount_t size;
+} dbuf_cache_t;
+dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
+
+/* Size limits for the caches */
+unsigned long dbuf_cache_max_bytes = ULONG_MAX;
+unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
+
+/* Set the default sizes of the caches to log2 fraction of arc size */
+int dbuf_cache_shift = 5;
+int dbuf_metadata_cache_shift = 6;
+
+static unsigned long dbuf_cache_target_bytes(void);
+static unsigned long dbuf_metadata_cache_target_bytes(void);
+
+/*
+ * The LRU dbuf cache uses a three-stage eviction policy:
+ * - A low water marker designates when the dbuf eviction thread
+ * should stop evicting from the dbuf cache.
+ * - When we reach the maximum size (aka mid water mark), we
+ * signal the eviction thread to run.
+ * - The high water mark indicates when the eviction thread
+ * is unable to keep up with the incoming load and eviction must
+ * happen in the context of the calling thread.
+ *
+ * The dbuf cache:
+ * (max size)
+ * low water mid water hi water
+ * +----------------------------------------+----------+----------+
+ * | | | |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +----------------------------------------+----------+----------+
+ * stop signal evict
+ * evicting eviction directly
+ * thread
+ *
+ * The high and low water marks indicate the operating range for the eviction
+ * thread. The low water mark is, by default, 90% of the total size of the
+ * cache and the high water mark is at 110% (both of these percentages can be
+ * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
+ * respectively). The eviction thread will try to ensure that the cache remains
+ * within this range by waking up every second and checking if the cache is
+ * above the low water mark. The thread can also be woken up by callers adding
+ * elements into the cache if the cache is larger than the mid water (i.e., max
+ * cache size). Once the eviction thread is woken up and eviction is required,
+ * it will continue evicting buffers until it's able to reduce the cache size
+ * to the low water mark. If the cache size continues to grow and hits the high
+ * water mark, then callers adding elements to the cache will begin to evict
+ * directly from the cache until the cache is no longer above the high water
+ * mark.
+ */
+
+/*
+ * The percentage above and below the maximum cache size.
+ */
+uint_t dbuf_cache_hiwater_pct = 10;
+uint_t dbuf_cache_lowater_pct = 10;
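+
+/*
+ * For example, with a (hypothetical) 100 MB cache target and the default
+ * 10% margins above, callers signal the evict thread once the cache grows
+ * past 100 MB, begin evicting directly above the 110 MB high water mark,
+ * and the evict thread keeps evicting until the cache drops back below the
+ * 90 MB low water mark.
+ */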
+
+/* ARGSUSED */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+ dmu_buf_impl_t *db = vdb;
+ bzero(db, sizeof (dmu_buf_impl_t));
+
+ mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL);
+ cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+ multilist_link_init(&db->db_cache_link);
+ zfs_refcount_create(&db->db_holds);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+ dmu_buf_impl_t *db = vdb;
+ mutex_destroy(&db->db_mtx);
+ rw_destroy(&db->db_rwlock);
+ cv_destroy(&db->db_changed);
+ ASSERT(!multilist_link_active(&db->db_cache_link));
+ zfs_refcount_destroy(&db->db_holds);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
+ */
+static uint64_t
+dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
+{
+ return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
+}
+
+#define DTRACE_SET_STATE(db, why) \
+ DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db, \
+ const char *, why)
+
+#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
+ ((dbuf)->db.db_object == (obj) && \
+ (dbuf)->db_objset == (os) && \
+ (dbuf)->db_level == (level) && \
+ (dbuf)->db_blkid == (blkid))
+
+dmu_buf_impl_t *
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ uint64_t hv;
+ uint64_t idx;
+ dmu_buf_impl_t *db;
+
+ hv = dbuf_hash(os, obj, level, blkid);
+ idx = hv & h->hash_table_mask;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
+ if (DBUF_EQUAL(db, os, obj, level, blkid)) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (db);
+ }
+ mutex_exit(&db->db_mtx);
+ }
+ }
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (NULL);
+}
+
+static dmu_buf_impl_t *
+dbuf_find_bonus(objset_t *os, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *db = NULL;
+
+ if (dnode_hold(os, object, FTAG, &dn) == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus != NULL) {
+ db = dn->dn_bonus;
+ mutex_enter(&db->db_mtx);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ }
+ return (db);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ */
+static dmu_buf_impl_t *
+dbuf_hash_insert(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ objset_t *os = db->db_objset;
+ uint64_t obj = db->db.db_object;
+ int level = db->db_level;
+ uint64_t blkid, hv, idx;
+ dmu_buf_impl_t *dbf;
+ uint32_t i;
+
+ blkid = db->db_blkid;
+ hv = dbuf_hash(os, obj, level, blkid);
+ idx = hv & h->hash_table_mask;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
+ dbf = dbf->db_hash_next, i++) {
+ if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
+ mutex_enter(&dbf->db_mtx);
+ if (dbf->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (dbf);
+ }
+ mutex_exit(&dbf->db_mtx);
+ }
+ }
+
+ if (i > 0) {
+ DBUF_STAT_BUMP(hash_collisions);
+ if (i == 1)
+ DBUF_STAT_BUMP(hash_chains);
+
+ DBUF_STAT_MAX(hash_chain_max, i);
+ }
+
+ mutex_enter(&db->db_mtx);
+ db->db_hash_next = h->hash_table[idx];
+ h->hash_table[idx] = db;
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_inc_64(&dbuf_hash_count);
+ DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count);
+
+ return (NULL);
+}
+
+/*
+ * This returns whether this dbuf should be stored in the metadata cache, which
+ * is based on whether it's from one of the dnode types that store data related
+ * to traversing dataset hierarchies.
+ */
+static boolean_t
+dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
+{
+ DB_DNODE_ENTER(db);
+ dmu_object_type_t type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ /* Check if this dbuf is one of the types we care about */
+ if (DMU_OT_IS_METADATA_CACHED(type)) {
+ /* If we hit this, then we set something up wrong in dmu_ot */
+ ASSERT(DMU_OT_IS_METADATA(type));
+
+ /*
+ * Sanity check for small-memory systems: don't allocate too
+ * much memory for this purpose.
+ */
+ if (zfs_refcount_count(
+ &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
+ dbuf_metadata_cache_target_bytes()) {
+ DBUF_STAT_BUMP(metadata_cache_overflow);
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Remove an entry from the hash table. It must be in the EVICTING state.
+ */
+static void
+dbuf_hash_remove(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ uint64_t hv, idx;
+ dmu_buf_impl_t *dbf, **dbp;
+
+ hv = dbuf_hash(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid);
+ idx = hv & h->hash_table_mask;
+
+ /*
+ * We mustn't hold db_mtx to maintain lock ordering:
+ * DBUF_HASH_MUTEX > db_mtx.
+ */
+ ASSERT(zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_state == DB_EVICTING);
+ ASSERT(!MUTEX_HELD(&db->db_mtx));
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ dbp = &h->hash_table[idx];
+ while ((dbf = *dbp) != db) {
+ dbp = &dbf->db_hash_next;
+ ASSERT(dbf != NULL);
+ }
+ *dbp = db->db_hash_next;
+ db->db_hash_next = NULL;
+ if (h->hash_table[idx] &&
+ h->hash_table[idx]->db_hash_next == NULL)
+ DBUF_STAT_BUMPDOWN(hash_chains);
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_dec_64(&dbuf_hash_count);
+}
+
+typedef enum {
+ DBVU_EVICTING,
+ DBVU_NOT_EVICTING
+} dbvu_verify_type_t;
+
+static void
+dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
+{
+#ifdef ZFS_DEBUG
+ int64_t holds;
+
+ if (db->db_user == NULL)
+ return;
+
+ /* Only data blocks support the attachment of user data. */
+ ASSERT(db->db_level == 0);
+
+ /* Clients must resolve a dbuf before attaching user data. */
+ ASSERT(db->db.db_data != NULL);
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+
+ holds = zfs_refcount_count(&db->db_holds);
+ if (verify_type == DBVU_EVICTING) {
+ /*
+ * Immediate eviction occurs when holds == dirtycnt.
+ * For normal eviction buffers, holds is zero on
+ * eviction, except when dbuf_fix_old_data() calls
+ * dbuf_clear_data(). However, the hold count can grow
+ * during eviction even though db_mtx is held (see
+ * dmu_bonus_hold() for an example), so we can only
+ * test the generic invariant that holds >= dirtycnt.
+ */
+ ASSERT3U(holds, >=, db->db_dirtycnt);
+ } else {
+ if (db->db_user_immediate_evict == TRUE)
+ ASSERT3U(holds, >=, db->db_dirtycnt);
+ else
+ ASSERT3U(holds, >, 0);
+ }
+#endif
+}
+
+static void
+dbuf_evict_user(dmu_buf_impl_t *db)
+{
+ dmu_buf_user_t *dbu = db->db_user;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (dbu == NULL)
+ return;
+
+ dbuf_verify_user(db, DBVU_EVICTING);
+ db->db_user = NULL;
+
+#ifdef ZFS_DEBUG
+ if (dbu->dbu_clear_on_evict_dbufp != NULL)
+ *dbu->dbu_clear_on_evict_dbufp = NULL;
+#endif
+
+ /*
+ * There are two eviction callbacks - one that we call synchronously
+ * and one that we invoke via a taskq. The async one is useful for
+ * avoiding lock order reversals and limiting stack depth.
+ *
+ * Note that if we have a sync callback but no async callback,
+ * it's likely that the sync callback will free the structure
+ * containing the dbu. In that case we need to take care to not
+ * dereference dbu after calling the sync evict func.
+ */
+ boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
+
+ if (dbu->dbu_evict_func_sync != NULL)
+ dbu->dbu_evict_func_sync(dbu);
+
+ if (has_async) {
+ taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
+ dbu, 0, &dbu->dbu_tqent);
+ }
+}
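+
+/*
+ * A minimal sketch of how a consumer attaches user data with the sync/async
+ * eviction callbacks handled above (the struct, callback names, and the
+ * dmu_buf_set_user() attach call from dmu.h are shown for illustration):
+ *
+ *	typedef struct my_user {
+ *		dmu_buf_user_t mu_dbu;
+ *		void *mu_state;
+ *	} my_user_t;
+ *
+ *	dmu_buf_init_user(&mu->mu_dbu, my_evict_sync, my_evict_async, NULL);
+ *	(void) dmu_buf_set_user(&db->db, &mu->mu_dbu);
+ */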
+
+boolean_t
+dbuf_is_metadata(dmu_buf_impl_t *db)
+{
+ /*
+ * Consider indirect blocks and spill blocks to be meta data.
+ */
+ if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
+ return (B_TRUE);
+ } else {
+ boolean_t is_metadata;
+
+ DB_DNODE_ENTER(db);
+ is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
+ DB_DNODE_EXIT(db);
+
+ return (is_metadata);
+ }
+}
+
+
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the dbuf eviction
+ * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+static unsigned int
+dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
+{
+ dmu_buf_impl_t *db = obj;
+
+ /*
+	 * The assumption here is that the hash value for a given
+	 * dmu_buf_impl_t will remain constant throughout its lifetime
+	 * (i.e. its objset, object, level and blkid fields don't change).
+	 * Thus, we don't need to store the dbuf's sublist index
+	 * on insertion, as this index can be recalculated on removal.
+	 *
+	 * Also, the low order bits of the hash value are thought to be
+	 * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power-of-two number of sublists, each sublist's usage
+	 * would not be evenly distributed.
+ */
+ return (dbuf_hash(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid) %
+ multilist_get_num_sublists(ml));
+}
+
+/*
+ * The target size of the dbuf cache can grow with the ARC target,
+ * unless limited by the tunable dbuf_cache_max_bytes.
+ */
+static inline unsigned long
+dbuf_cache_target_bytes(void)
+{
+ return (MIN(dbuf_cache_max_bytes,
+ arc_target_bytes() >> dbuf_cache_shift));
+}
+
+/*
+ * The target size of the dbuf metadata cache can grow with the ARC target,
+ * unless limited by the tunable dbuf_metadata_cache_max_bytes.
+ */
+static inline unsigned long
+dbuf_metadata_cache_target_bytes(void)
+{
+ return (MIN(dbuf_metadata_cache_max_bytes,
+ arc_target_bytes() >> dbuf_metadata_cache_shift));
+}
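+
+/*
+ * For example, with an ARC target of 4 GiB (an illustrative value), the two
+ * target functions above yield a dbuf cache target of 4 GiB >> 5 = 128 MiB
+ * and a metadata cache target of 4 GiB >> 6 = 64 MiB, unless the
+ * corresponding *_max_bytes tunables impose a smaller cap.
+ */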
+
+static inline uint64_t
+dbuf_cache_hiwater_bytes(void)
+{
+ uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
+ return (dbuf_cache_target +
+ (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
+}
+
+static inline uint64_t
+dbuf_cache_lowater_bytes(void)
+{
+ uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
+ return (dbuf_cache_target -
+ (dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
+}
+
+static inline boolean_t
+dbuf_cache_above_lowater(void)
+{
+ return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+ dbuf_cache_lowater_bytes());
+}
+
+/*
+ * Evict the oldest eligible dbuf from the dbuf cache.
+ */
+static void
+dbuf_evict_one(void)
+{
+ int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
+ multilist_sublist_t *mls = multilist_sublist_lock(
+ dbuf_caches[DB_DBUF_CACHE].cache, idx);
+
+ ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
+
+ dmu_buf_impl_t *db = multilist_sublist_tail(mls);
+ while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
+ db = multilist_sublist_prev(mls, db);
+ }
+
+ DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+ multilist_sublist_t *, mls);
+
+ if (db != NULL) {
+ multilist_sublist_remove(mls, db);
+ multilist_sublist_unlock(mls);
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
+ DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
+ DBUF_STAT_BUMPDOWN(cache_count);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
+ db->db.db_size);
+ ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
+ db->db_caching_status = DB_NO_CACHE;
+ dbuf_destroy(db);
+ DBUF_STAT_BUMP(cache_total_evicts);
+ } else {
+ multilist_sublist_unlock(mls);
+ }
+}
+
+/*
+ * The dbuf evict thread is responsible for aging out dbufs from the
+ * cache. Once the cache has reached its maximum size, dbufs are removed
+ * and destroyed. The eviction thread will continue running until the size
+ * of the dbuf cache is at or below the maximum size. Once a dbuf is aged
+ * out of the cache, it is destroyed and becomes eligible for arc eviction.
+ */
+/* ARGSUSED */
+static void
+dbuf_evict_thread(void *unused)
+{
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&dbuf_evict_lock);
+ while (!dbuf_evict_thread_exit) {
+ while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait_idle_hires(&dbuf_evict_cv,
+ &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+ CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
+ }
+ mutex_exit(&dbuf_evict_lock);
+
+ /*
+ * Keep evicting as long as we're above the low water mark
+ * for the cache. We do this without holding the locks to
+ * minimize lock contention.
+ */
+ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+ dbuf_evict_one();
+ }
+
+ mutex_enter(&dbuf_evict_lock);
+ }
+
+ dbuf_evict_thread_exit = B_FALSE;
+ cv_broadcast(&dbuf_evict_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
+ thread_exit();
+}
+
+/*
+ * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
+ * If the dbuf cache is at its high water mark, then evict a dbuf from the
+ * dbuf cache using the callers context.
+ */
+static void
+dbuf_evict_notify(uint64_t size)
+{
+ /*
+ * We check if we should evict without holding the dbuf_evict_lock,
+ * because it's OK to occasionally make the wrong decision here,
+ * and grabbing the lock results in massive lock contention.
+ */
+ if (size > dbuf_cache_target_bytes()) {
+ if (size > dbuf_cache_hiwater_bytes())
+ dbuf_evict_one();
+ cv_signal(&dbuf_evict_cv);
+ }
+}
+
+static int
+dbuf_kstat_update(kstat_t *ksp, int rw)
+{
+ dbuf_stats_t *ds = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE) {
+ return (SET_ERROR(EACCES));
+ } else {
+ ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
+ &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
+ ds->cache_size_bytes.value.ui64 =
+ zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
+ ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
+ ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
+ ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
+ ds->hash_elements.value.ui64 = dbuf_hash_count;
+ }
+
+ return (0);
+}
+
+void
+dbuf_init(void)
+{
+ uint64_t hsize = 1ULL << 16;
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average block size of zfs_arc_average_blocksize (default 8K).
+ * By default, the table will take up
+ * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
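+	 * For example, a machine with 16 GiB of memory and the default 8K
+	 * average block size ends up with 2^21 hash buckets, i.e. 16 MiB of
+	 * pointers.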
+ */
+ while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
+ hsize <<= 1;
+
+retry:
+ h->hash_table_mask = hsize - 1;
+#if defined(_KERNEL)
+ /*
+ * Large allocations which do not require contiguous pages
+ * should be using vmem_alloc() in the linux kernel
+ */
+ h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+#else
+ h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+#endif
+ if (h->hash_table == NULL) {
+ /* XXX - we should really return an error instead of assert */
+ ASSERT(hsize > (1ULL << 10));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
+ sizeof (dmu_buf_impl_t),
+ 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+
+ dbuf_stats_init(h);
+
+ /*
+ * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
+ * configuration is not required.
+ */
+ dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
+
+ for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+ dbuf_caches[dcs].cache =
+ multilist_create(sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_cache_link),
+ dbuf_cache_multilist_index_func);
+ zfs_refcount_create(&dbuf_caches[dcs].size);
+ }
+
+ dbuf_evict_thread_exit = B_FALSE;
+ mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
+ dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
+ NULL, 0, &p0, TS_RUN, minclsyspri);
+
+ dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (dbuf_ksp != NULL) {
+ for (i = 0; i < DN_MAX_LEVELS; i++) {
+ snprintf(dbuf_stats.cache_levels[i].name,
+ KSTAT_STRLEN, "cache_level_%d", i);
+ dbuf_stats.cache_levels[i].data_type =
+ KSTAT_DATA_UINT64;
+ snprintf(dbuf_stats.cache_levels_bytes[i].name,
+ KSTAT_STRLEN, "cache_level_%d_bytes", i);
+ dbuf_stats.cache_levels_bytes[i].data_type =
+ KSTAT_DATA_UINT64;
+ }
+ dbuf_ksp->ks_data = &dbuf_stats;
+ dbuf_ksp->ks_update = dbuf_kstat_update;
+ kstat_install(dbuf_ksp);
+ }
+}
+
+void
+dbuf_fini(void)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ dbuf_stats_destroy();
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_destroy(&h->hash_mutexes[i]);
+#if defined(_KERNEL)
+ /*
+ * Large allocations which do not require contiguous pages
+ * should be using vmem_free() in the linux kernel
+ */
+ vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#else
+ kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#endif
+ kmem_cache_destroy(dbuf_kmem_cache);
+ taskq_destroy(dbu_evict_taskq);
+
+ mutex_enter(&dbuf_evict_lock);
+ dbuf_evict_thread_exit = B_TRUE;
+ while (dbuf_evict_thread_exit) {
+ cv_signal(&dbuf_evict_cv);
+ cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
+ }
+ mutex_exit(&dbuf_evict_lock);
+
+ mutex_destroy(&dbuf_evict_lock);
+ cv_destroy(&dbuf_evict_cv);
+
+ for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+ zfs_refcount_destroy(&dbuf_caches[dcs].size);
+ multilist_destroy(dbuf_caches[dcs].cache);
+ }
+
+ if (dbuf_ksp != NULL) {
+ kstat_delete(dbuf_ksp);
+ dbuf_ksp = NULL;
+ }
+}
+
+/*
+ * Other stuff.
+ */
+
+#ifdef ZFS_DEBUG
+static void
+dbuf_verify(dmu_buf_impl_t *db)
+{
+ dnode_t *dn;
+ dbuf_dirty_record_t *dr;
+ uint32_t txg_prev;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
+ return;
+
+ ASSERT(db->db_objset != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn == NULL) {
+ ASSERT(db->db_parent == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ } else {
+ ASSERT3U(db->db.db_object, ==, dn->dn_object);
+ ASSERT3P(db->db_objset, ==, dn->dn_objset);
+ ASSERT3U(db->db_level, <, dn->dn_nlevels);
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID ||
+ !avl_is_empty(&dn->dn_dbufs));
+ }
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
+ } else if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT0(db->db.db_offset);
+ } else {
+ ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
+ }
+
+ if ((dr = list_head(&db->db_dirty_records)) != NULL) {
+ ASSERT(dr->dr_dbuf == db);
+ txg_prev = dr->dr_txg;
+ for (dr = list_next(&db->db_dirty_records, dr); dr != NULL;
+ dr = list_next(&db->db_dirty_records, dr)) {
+ ASSERT(dr->dr_dbuf == db);
+ ASSERT(txg_prev > dr->dr_txg);
+ txg_prev = dr->dr_txg;
+ }
+ }
+
+ /*
+ * We can't assert that db_size matches dn_datablksz because it
+ * can be momentarily different when another thread is doing
+ * dnode_set_blksz().
+ */
+ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dr = db->db_data_pending;
+ /*
+ * It should only be modified in syncing context, so
+ * make sure we only have one copy of the data.
+ */
+ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
+ }
+
+ /* verify db->db_blkptr */
+ if (db->db_blkptr) {
+ if (db->db_parent == dn->dn_dbuf) {
+ /* db is pointed to by the dnode */
+ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
+ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
+ ASSERT(db->db_parent == NULL);
+ else
+ ASSERT(db->db_parent != NULL);
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ } else {
+ /* db is pointed to by an indirect block */
+ int epb __maybe_unused = db->db_parent->db.db_size >>
+ SPA_BLKPTRSHIFT;
+ ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
+ ASSERT3U(db->db_parent->db.db_object, ==,
+ db->db.db_object);
+ /*
+ * dnode_grow_indblksz() can make this fail if we don't
+ * have the parent's rwlock. XXX indblksz no longer
+ * grows. safe to do this now?
+ */
+ if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
+ ASSERT3P(db->db_blkptr, ==,
+ ((blkptr_t *)db->db_parent->db.db_data +
+ db->db_blkid % epb));
+ }
+ }
+ }
+ if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+ (db->db_buf == NULL || db->db_buf->b_data) &&
+ db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_state != DB_FILL && !dn->dn_free_txg) {
+ /*
+		 * If the blkptr isn't set but the dbuf has nonzero data,
+		 * it had better be dirty; otherwise we'll lose that
+ * data when we evict this buffer.
+ *
+ * There is an exception to this rule for indirect blocks; in
+ * this case, if the indirect block is a hole, we fill in a few
+ * fields on each of the child blocks (importantly, birth time)
+ * to prevent hole birth times from being lost when you
+ * partially fill in a hole.
+ */
+ if (db->db_dirtycnt == 0) {
+ if (db->db_level == 0) {
+ uint64_t *buf = db->db.db_data;
+ int i;
+
+ for (i = 0; i < db->db.db_size >> 3; i++) {
+ ASSERT(buf[i] == 0);
+ }
+ } else {
+ blkptr_t *bps = db->db.db_data;
+ ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
+ db->db.db_size);
+ /*
+ * We want to verify that all the blkptrs in the
+ * indirect block are holes, but we may have
+ * automatically set up a few fields for them.
+ * We iterate through each blkptr and verify
+ * they only have those fields set.
+ */
+ for (int i = 0;
+ i < db->db.db_size / sizeof (blkptr_t);
+ i++) {
+ blkptr_t *bp = &bps[i];
+ ASSERT(ZIO_CHECKSUM_IS_ZERO(
+ &bp->blk_cksum));
+ ASSERT(
+ DVA_IS_EMPTY(&bp->blk_dva[0]) &&
+ DVA_IS_EMPTY(&bp->blk_dva[1]) &&
+ DVA_IS_EMPTY(&bp->blk_dva[2]));
+ ASSERT0(bp->blk_fill);
+ ASSERT0(bp->blk_pad[0]);
+ ASSERT0(bp->blk_pad[1]);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT0(bp->blk_phys_birth);
+ }
+ }
+ }
+ }
+ DB_DNODE_EXIT(db);
+}
+#endif
+
+static void
+dbuf_clear_data(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ dbuf_evict_user(db);
+ ASSERT3P(db->db_buf, ==, NULL);
+ db->db.db_data = NULL;
+ if (db->db_state != DB_NOFILL) {
+ db->db_state = DB_UNCACHED;
+ DTRACE_SET_STATE(db, "clear data");
+ }
+}
+
+static void
+dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(buf != NULL);
+
+ db->db_buf = buf;
+ ASSERT(buf->b_data != NULL);
+ db->db.db_data = buf->b_data;
+}
+
+static arc_buf_t *
+dbuf_alloc_arcbuf_from_arcbuf(dmu_buf_impl_t *db, arc_buf_t *data)
+{
+ objset_t *os = db->db_objset;
+ spa_t *spa = os->os_spa;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ enum zio_compress compress_type;
+ uint8_t complevel;
+ int psize, lsize;
+
+ psize = arc_buf_size(data);
+ lsize = arc_buf_lsize(data);
+ compress_type = arc_get_compression(data);
+ complevel = arc_get_complevel(data);
+
+ if (arc_is_encrypted(data)) {
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ dnode_t *dn = DB_DNODE(db);
+
+ arc_get_raw_params(data, &byteorder, salt, iv, mac);
+ data = arc_alloc_raw_buf(spa, db, dmu_objset_id(os),
+ byteorder, salt, iv, mac, dn->dn_type, psize, lsize,
+ compress_type, complevel);
+ } else if (compress_type != ZIO_COMPRESS_OFF) {
+ ASSERT3U(type, ==, ARC_BUFC_DATA);
+ data = arc_alloc_compressed_buf(spa, db,
+ psize, lsize, compress_type, complevel);
+ } else {
+ data = arc_alloc_buf(spa, db, type, psize);
+ }
+ return (data);
+}
+
+static arc_buf_t *
+dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
+{
+ spa_t *spa = db->db_objset->os_spa;
+
+ return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size));
+}
+
+/*
+ * Loan out an arc_buf for read. Return the loaned arc_buf.
+ */
+arc_buf_t *
+dbuf_loan_arcbuf(dmu_buf_impl_t *db)
+{
+ arc_buf_t *abuf;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ mutex_enter(&db->db_mtx);
+ if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) {
+ int blksz = db->db.db_size;
+ spa_t *spa = db->db_objset->os_spa;
+
+ mutex_exit(&db->db_mtx);
+ abuf = arc_loan_buf(spa, B_FALSE, blksz);
+ bcopy(db->db.db_data, abuf->b_data, blksz);
+ } else {
+ abuf = db->db_buf;
+ arc_loan_inuse_buf(abuf, db);
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ mutex_exit(&db->db_mtx);
+ }
+ return (abuf);
+}
+
+/*
+ * Calculate which level n block references the data at the level 0 offset
+ * provided.
+ */
+uint64_t
+dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
+{
+ if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
+ /*
+ * The level n blkid is equal to the level 0 blkid divided by
+ * the number of level 0s in a level n block.
+ *
+ * The level 0 blkid is offset >> datablkshift =
+ * offset / 2^datablkshift.
+ *
+ * The number of level 0s in a level n is the number of block
+ * pointers in an indirect block, raised to the power of level.
+ * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
+ * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
+ *
+ * Thus, the level n blkid is: offset /
+ * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT))))
+ * = offset / 2^(datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ * = offset >> (datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ */
+
+ const unsigned exp = dn->dn_datablkshift +
+ level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
+
+ if (exp >= 8 * sizeof (offset)) {
+ /* This only happens on the highest indirection level */
+ ASSERT3U(level, ==, dn->dn_nlevels - 1);
+ return (0);
+ }
+
+ ASSERT3U(exp, <, 8 * sizeof (offset));
+
+ return (offset >> exp);
+ } else {
+ ASSERT3U(offset, <, dn->dn_datablksz);
+ return (0);
+ }
+}
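+
+/*
+ * As a worked example of the math above: with 128K data blocks
+ * (datablkshift = 17) and 128K indirect blocks (indblkshift = 17), each
+ * indirect block holds 2^(17 - 7) = 1024 block pointers, so the level 1
+ * blkid for an offset is offset >> (17 + 10), i.e. each level 1 block
+ * covers 128 MiB of file data.
+ */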
+
+/*
+ * This function is used to lock the parent of the provided dbuf. This should be
+ * used when modifying or reading db_blkptr.
+ */
+db_lock_type_t
+dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag)
+{
+ enum db_lock_type ret = DLT_NONE;
+ if (db->db_parent != NULL) {
+ rw_enter(&db->db_parent->db_rwlock, rw);
+ ret = DLT_PARENT;
+ } else if (dmu_objset_ds(db->db_objset) != NULL) {
+ rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
+ tag);
+ ret = DLT_OBJSET;
+ }
+ /*
+ * We only return a DLT_NONE lock when it's the top-most indirect block
+ * of the meta-dnode of the MOS.
+ */
+ return (ret);
+}
+
+/*
+ * We need to pass the lock type in because it's possible that the block will
+ * move from being the topmost indirect block in a dnode (and thus, have no
+ * parent) to no longer being the topmost via an indirection increase. This
+ * would cause a panic if we didn't pass the lock type in.
+ */
+void
+dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag)
+{
+ if (type == DLT_PARENT)
+ rw_exit(&db->db_parent->db_rwlock);
+ else if (type == DLT_OBJSET)
+ rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
+}
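+
+/*
+ * A minimal sketch of the locking pattern above (mirroring how dbuf_read()
+ * uses it later in this file):
+ *
+ *	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+ *	... read or update db->db_blkptr ...
+ *	dmu_buf_unlock_parent(db, dblt, FTAG);
+ */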
+
+static void
+dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+
+ mutex_enter(&db->db_mtx);
+ ASSERT3U(db->db_state, ==, DB_READ);
+ /*
+ * All reads are synchronous, so we must have a hold on the dbuf
+ */
+ ASSERT(zfs_refcount_count(&db->db_holds) > 0);
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ if (buf == NULL) {
+ /* i/o error */
+ ASSERT(zio == NULL || zio->io_error != 0);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT3P(db->db_buf, ==, NULL);
+ db->db_state = DB_UNCACHED;
+ DTRACE_SET_STATE(db, "i/o error");
+ } else if (db->db_level == 0 && db->db_freed_in_flight) {
+ /* freed in flight */
+ ASSERT(zio == NULL || zio->io_error == 0);
+ arc_release(buf, db);
+ bzero(buf->b_data, db->db.db_size);
+ arc_buf_freeze(buf);
+ db->db_freed_in_flight = FALSE;
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ DTRACE_SET_STATE(db, "freed in flight");
+ } else {
+ /* success */
+ ASSERT(zio == NULL || zio->io_error == 0);
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ DTRACE_SET_STATE(db, "successful read");
+ }
+ cv_broadcast(&db->db_changed);
+ dbuf_rele_and_unlock(db, NULL, B_FALSE);
+}
+
+/*
+ * Shortcut for performing reads on bonus dbufs. Returns
+ * an error if we fail to verify the dnode associated with
+ * a decrypted block. Otherwise success.
+ */
+static int
+dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+{
+ int bonuslen, max_bonuslen, err;
+
+ err = dbuf_read_verify_dnode_crypt(db, flags);
+ if (err)
+ return (err);
+
+ bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
+ max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(DB_DNODE_HELD(db));
+ ASSERT3U(bonuslen, <=, db->db.db_size);
+ db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
+ arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
+ if (bonuslen < max_bonuslen)
+ bzero(db->db.db_data, max_bonuslen);
+ if (bonuslen)
+ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ db->db_state = DB_CACHED;
+ DTRACE_SET_STATE(db, "bonus buffer filled");
+ return (0);
+}
+
+static void
+dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
+{
+ blkptr_t *bps = db->db.db_data;
+ uint32_t indbs = 1ULL << dn->dn_indblkshift;
+ int n_bps = indbs >> SPA_BLKPTRSHIFT;
+
+ for (int i = 0; i < n_bps; i++) {
+ blkptr_t *bp = &bps[i];
+
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
+ BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
+ dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
+ BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
+ BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
+ BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+ }
+}
+
+/*
+ * Handle reads on dbufs that are holes, if necessary. This function
+ * requires that the dbuf's mutex is held. Returns success (0) if action
+ * was taken, ENOENT if no action was taken.
+ */
+static int
+dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
+ /*
+ * For level 0 blocks only, if the above check fails:
+ * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
+ * processes the delete record and clears the bp while we are waiting
+ * for the dn_mtx (resulting in a "no" from block_freed).
+ */
+ if (!is_hole && db->db_level == 0) {
+ is_hole = dnode_block_freed(dn, db->db_blkid) ||
+ BP_IS_HOLE(db->db_blkptr);
+ }
+
+ if (is_hole) {
+ dbuf_set_data(db, dbuf_alloc_arcbuf(db));
+ bzero(db->db.db_data, db->db.db_size);
+
+ if (db->db_blkptr != NULL && db->db_level > 0 &&
+ BP_IS_HOLE(db->db_blkptr) &&
+ db->db_blkptr->blk_birth != 0) {
+ dbuf_handle_indirect_hole(db, dn);
+ }
+ db->db_state = DB_CACHED;
+ DTRACE_SET_STATE(db, "hole read satisfied");
+ return (0);
+ }
+ return (ENOENT);
+}
+
+/*
+ * This function ensures that, when doing a decrypting read of a block,
+ * we make sure we have decrypted the dnode associated with it. We must do
+ * this so that we ensure we are fully authenticating the checksum-of-MACs
+ * tree from the root of the objset down to this block. Indirect blocks are
+ * always verified against their secure checksum-of-MACs assuming that the
+ * dnode containing them is correct. Now that we are doing a decrypting read,
+ * we can be sure that the key is loaded and verify that assumption. This is
+ * especially important considering that we always read encrypted dnode
+ * blocks as raw data (without verifying their MACs) to start, and
+ * decrypt / authenticate them when we need to read an encrypted bonus buffer.
+ */
+static int
+dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
+{
+ int err = 0;
+ objset_t *os = db->db_objset;
+ arc_buf_t *dnode_abuf;
+ dnode_t *dn;
+ zbookmark_phys_t zb;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!os->os_encrypted || os->os_raw_receive ||
+ (flags & DB_RF_NO_DECRYPT) != 0)
+ return (0);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
+
+ if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
+ DB_DNODE_EXIT(db);
+ return (0);
+ }
+
+ SET_BOOKMARK(&zb, dmu_objset_id(os),
+ DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
+ err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
+
+ /*
+ * An error code of EACCES tells us that the key is still not
+ * available. This is ok if we are only reading authenticated
+ * (and therefore non-encrypted) blocks.
+ */
+ if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&
+ !DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||
+ (db->db_blkid == DMU_BONUS_BLKID &&
+ !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
+ err = 0;
+
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Drops db_mtx and the parent lock specified by dblt and tag before
+ * returning.
+ */
+static int
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
+ db_lock_type_t dblt, void *tag)
+{
+ dnode_t *dn;
+ zbookmark_phys_t zb;
+ uint32_t aflags = ARC_FLAG_NOWAIT;
+ int err, zio_flags;
+ boolean_t bonus_read;
+
+ err = zio_flags = 0;
+ bonus_read = B_FALSE;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db_parent == NULL ||
+ RW_LOCK_HELD(&db->db_parent->db_rwlock));
+
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ err = dbuf_read_bonus(db, dn, flags);
+ goto early_unlock;
+ }
+
+ err = dbuf_read_hole(db, dn, flags);
+ if (err == 0)
+ goto early_unlock;
+
+ /*
+ * Any attempt to read a redacted block should result in an error. This
+ * will never happen under normal conditions, but can be useful for
+ * debugging purposes.
+ */
+ if (BP_IS_REDACTED(db->db_blkptr)) {
+ ASSERT(dsl_dataset_feature_is_active(
+ db->db_objset->os_dsl_dataset,
+ SPA_FEATURE_REDACTED_DATASETS));
+ err = SET_ERROR(EIO);
+ goto early_unlock;
+ }
+
+ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ /*
+ * All bps of an encrypted os should have the encryption bit set.
+ * If this is not true it indicates tampering and we report an error.
+ */
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
+ spa_log_error(db->db_objset->os_spa, &zb);
+ zfs_panic_recover("unencrypted block in encrypted "
+ "object set %llu", dmu_objset_id(db->db_objset));
+ err = SET_ERROR(EIO);
+ goto early_unlock;
+ }
+
+ err = dbuf_read_verify_dnode_crypt(db, flags);
+ if (err != 0)
+ goto early_unlock;
+
+ DB_DNODE_EXIT(db);
+
+ db->db_state = DB_READ;
+ DTRACE_SET_STATE(db, "read issued");
+ mutex_exit(&db->db_mtx);
+
+ if (DBUF_IS_L2CACHEABLE(db))
+ aflags |= ARC_FLAG_L2CACHE;
+
+ dbuf_add_ref(db, NULL);
+
+ zio_flags = (flags & DB_RF_CANFAIL) ?
+ ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
+
+ if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
+ zio_flags |= ZIO_FLAG_RAW;
+ /*
+	 * The zio layer will copy the provided blkptr later, but we need to
+	 * copy it now so that we can release the parent's rwlock before
+	 * issuing the read. Otherwise, if dbuf_read_done were called
+	 * synchronously (on an l1 cache hit), we could acquire the db_mtx
+	 * while holding the parent's rwlock, which would be a lock ordering
+	 * violation.
+ */
+ blkptr_t bp = *db->db_blkptr;
+ dmu_buf_unlock_parent(db, dblt, tag);
+ (void) arc_read(zio, db->db_objset->os_spa, &bp,
+ dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
+ &aflags, &zb);
+ return (err);
+early_unlock:
+ DB_DNODE_EXIT(db);
+ mutex_exit(&db->db_mtx);
+ dmu_buf_unlock_parent(db, dblt, tag);
+ return (err);
+}
+
+/*
+ * This is our just-in-time copy function. It makes a copy of buffers that
+ * have been modified in a previous transaction group before we access them in
+ * the current active group.
+ *
+ * This function is used in three places: when we are dirtying a buffer for the
+ * first time in a txg, when we are freeing a range in a dnode that includes
+ * this buffer, and when we are accessing a buffer which was received compressed
+ * and later referenced in a WRITE_BYREF record.
+ *
+ * Note that when we are called from dbuf_free_range() we do not put a hold on
+ * the buffer, we just traverse the active dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+ dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_level == 0);
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
+
+ if (dr == NULL ||
+ (dr->dt.dl.dr_data !=
+ ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ return;
+
+ /*
+ * If the last dirty record for this dbuf has not yet synced
+ * and it's referencing the dbuf data, either:
+ * reset the reference to point to a new copy,
+ * or (if there are no active holders)
+ * just null out the current db_data pointer.
+ */
+ ASSERT3U(dr->dr_txg, >=, txg - 2);
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dnode_t *dn = DB_DNODE(db);
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
+ arc_space_consume(bonuslen, ARC_SPACE_BONUS);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
+ } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ arc_buf_t *buf = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf);
+ dr->dt.dl.dr_data = buf;
+ bcopy(db->db.db_data, buf->b_data, arc_buf_size(buf));
+ } else {
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ }
+}
+
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+ int err = 0;
+ boolean_t prefetch;
+ dnode_t *dn;
+
+ /*
+ * We don't have to hold the mutex to check db_state because it
+ * can't be freed while we have a hold on the buffer.
+ */
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+ if (db->db_state == DB_NOFILL)
+ return (SET_ERROR(EIO));
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
+ DBUF_IS_CACHEABLE(db);
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_CACHED) {
+ spa_t *spa = dn->dn_objset->os_spa;
+
+ /*
+ * Ensure that this block's dnode has been decrypted if
+ * the caller has requested decrypted data.
+ */
+ err = dbuf_read_verify_dnode_crypt(db, flags);
+
+ /*
+ * If the arc buf is compressed or encrypted and the caller
+ * requested uncompressed data, we need to untransform it
+ * before returning. We also call arc_untransform() on any
+ * unauthenticated blocks, which will verify their MAC if
+ * the key is now available.
+ */
+ if (err == 0 && db->db_buf != NULL &&
+ (flags & DB_RF_NO_DECRYPT) == 0 &&
+ (arc_is_encrypted(db->db_buf) ||
+ arc_is_unauthenticated(db->db_buf) ||
+ arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+ zbookmark_phys_t zb;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
+ db->db.db_object, db->db_level, db->db_blkid);
+ dbuf_fix_old_data(db, spa_syncing_txg(spa));
+ err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
+ dbuf_set_data(db, db->db_buf);
+ }
+ mutex_exit(&db->db_mtx);
+ if (err == 0 && prefetch) {
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+ flags & DB_RF_HAVESTRUCT);
+ }
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_hits);
+ } else if (db->db_state == DB_UNCACHED) {
+ spa_t *spa = dn->dn_objset->os_spa;
+ boolean_t need_wait = B_FALSE;
+
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+
+ if (zio == NULL &&
+ db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ need_wait = B_TRUE;
+ }
+ err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
+ /*
+ * dbuf_read_impl has dropped db_mtx and our parent's rwlock
+ * for us
+ */
+ if (!err && prefetch) {
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+ flags & DB_RF_HAVESTRUCT);
+ }
+
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_misses);
+
+ /*
+ * If we created a zio_root we must execute it to avoid
+ * leaking it, even if it isn't attached to any work due
+ * to an error in dbuf_read_impl().
+ */
+ if (need_wait) {
+ if (err == 0)
+ err = zio_wait(zio);
+ else
+ VERIFY0(zio_wait(zio));
+ }
+ } else {
+ /*
+ * Another reader came in while the dbuf was in flight
+ * between UNCACHED and CACHED. Either a writer will finish
+ * writing the buffer (sending the dbuf to CACHED) or the
+ * first reader's request will reach the read_done callback
+ * and send the dbuf to CACHED. Otherwise, a failure
+ * occurred and the dbuf went to UNCACHED.
+ */
+ mutex_exit(&db->db_mtx);
+ if (prefetch) {
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+ flags & DB_RF_HAVESTRUCT);
+ }
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_misses);
+
+ /* Wait for the I/O unless the caller passed DB_RF_NEVERWAIT. */
+ if ((flags & DB_RF_NEVERWAIT) == 0) {
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL) {
+ ASSERT(db->db_state == DB_READ ||
+ (flags & DB_RF_HAVESTRUCT) == 0);
+ DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
+ db, zio_t *, zio);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ }
+ if (db->db_state == DB_UNCACHED)
+ err = SET_ERROR(EIO);
+ mutex_exit(&db->db_mtx);
+ }
+ }
+
+ return (err);
+}
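+
+/*
+ * Example (illustrative, assuming 'db' is a held dbuf): a typical caller
+ * reads with DB_RF_CANFAIL so that I/O errors are returned rather than
+ * treated as fatal, and DB_RF_NOPREFETCH to suppress speculative reads:
+ *
+ * err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+ * if (err != 0)
+ * return (err);
+ */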
+
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED) {
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ dbuf_set_data(db, dbuf_alloc_arcbuf(db));
+ db->db_state = DB_FILL;
+ DTRACE_SET_STATE(db, "assigning filled buffer");
+ } else if (db->db_state == DB_NOFILL) {
+ dbuf_clear_data(db);
+ } else {
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ }
+ mutex_exit(&db->db_mtx);
+}
+
+void
+dbuf_unoverride(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
+ uint64_t txg = dr->dr_txg;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ /*
+ * This assert is valid because dmu_sync() expects to be called by
+ * a zilog's get_data while holding a range lock. This call only
+ * comes from dbuf_dirty() callers who must also hold a range lock.
+ */
+ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
+ ASSERT(db->db_level == 0);
+
+ if (db->db_blkid == DMU_BONUS_BLKID ||
+ dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
+ return;
+
+ ASSERT(db->db_data_pending != dr);
+
+ /* free this block */
+ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
+ zio_free(db->db_objset->os_spa, txg, bp);
+
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ dr->dt.dl.dr_nopwrite = B_FALSE;
+ dr->dt.dl.dr_has_raw_params = B_FALSE;
+
+ /*
+ * Release the already-written buffer, so we leave it in
+ * a consistent dirty state. Note that all callers are
+ * modifying the buffer, so they will immediately do
+ * another (redundant) arc_release(). Therefore, leave
+ * the buf thawed to save the effort of freezing &
+ * immediately re-thawing it.
+ */
+ arc_release(dr->dt.dl.dr_data, db);
+}
+
+/*
+ * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks.
+ */
+void
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db_search;
+ dmu_buf_impl_t *db, *db_next;
+ uint64_t txg = tx->tx_txg;
+ avl_index_t where;
+ dbuf_dirty_record_t *dr;
+
+ if (end_blkid > dn->dn_maxblkid &&
+ !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
+ end_blkid = dn->dn_maxblkid;
+ dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+
+ db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+ db_search->db_level = 0;
+ db_search->db_blkid = start_blkid;
+ db_search->db_state = DB_SEARCH;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
+ ASSERT3P(db, ==, NULL);
+
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+ for (; db != NULL; db = db_next) {
+ db_next = AVL_NEXT(&dn->dn_dbufs, db);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ if (db->db_level != 0 || db->db_blkid > end_blkid) {
+ break;
+ }
+ ASSERT3U(db->db_blkid, >=, start_blkid);
+
+ /* found a level 0 buffer in the range */
+ mutex_enter(&db->db_mtx);
+ if (dbuf_undirty(db, tx)) {
+ /* mutex has been dropped and dbuf destroyed */
+ continue;
+ }
+
+ if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL ||
+ db->db_state == DB_EVICTING) {
+ ASSERT(db->db.db_data == NULL);
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+ /* will be handled in dbuf_read_done or dbuf_rele */
+ db->db_freed_in_flight = TRUE;
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (zfs_refcount_count(&db->db_holds) == 0) {
+ ASSERT(db->db_buf);
+ dbuf_destroy(db);
+ continue;
+ }
+ /* The dbuf is referenced */
+
+ dr = list_head(&db->db_dirty_records);
+ if (dr != NULL) {
+ if (dr->dr_txg == txg) {
+ /*
+ * This buffer is "in-use"; re-adjust the file
+ * size to reflect that this buffer may
+ * contain new data when we sync.
+ */
+ if (db->db_blkid != DMU_SPILL_BLKID &&
+ db->db_blkid > dn->dn_maxblkid)
+ dn->dn_maxblkid = db->db_blkid;
+ dbuf_unoverride(dr);
+ } else {
+ /*
+ * This dbuf is not dirty in the open context.
+ * Either uncache it (if it's not referenced in
+ * the open context) or reset its contents to
+ * empty.
+ */
+ dbuf_fix_old_data(db, txg);
+ }
+ }
+ /* clear the contents if it's cached */
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
+ arc_release(db->db_buf, db);
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ bzero(db->db.db_data, db->db.db_size);
+ rw_exit(&db->db_rwlock);
+ arc_buf_freeze(db->db_buf);
+ }
+
+ mutex_exit(&db->db_mtx);
+ }
+
+ kmem_free(db_search, sizeof (dmu_buf_impl_t));
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+ arc_buf_t *buf, *old_buf;
+ dbuf_dirty_record_t *dr;
+ int osize = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dnode_t *dn;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ /*
+ * XXX we should be doing a dbuf_read, checking the return
+ * value and returning that up to our callers
+ */
+ dmu_buf_will_dirty(&db->db, tx);
+
+ /* create the data buffer for the new block */
+ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
+
+ /* copy old block data to the new block */
+ old_buf = db->db_buf;
+ bcopy(old_buf->b_data, buf->b_data, MIN(osize, size));
+ /* zero the remainder */
+ if (size > osize)
+ bzero((uint8_t *)buf->b_data + osize, size - osize);
+
+ mutex_enter(&db->db_mtx);
+ dbuf_set_data(db, buf);
+ arc_buf_destroy(old_buf, db);
+ db->db.db_size = size;
+
+ dr = list_head(&db->db_dirty_records);
+ /* dirty record added by dmu_buf_will_dirty() */
+ VERIFY(dr != NULL);
+ if (db->db_level == 0)
+ dr->dt.dl.dr_data = buf;
+ ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+ ASSERT3U(dr->dr_accounted, ==, osize);
+ dr->dr_accounted = size;
+ mutex_exit(&db->db_mtx);
+
+ dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
+ DB_DNODE_EXIT(db);
+}
+
+void
+dbuf_release_bp(dmu_buf_impl_t *db)
+{
+ objset_t *os __maybe_unused = db->db_objset;
+
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ ASSERT(arc_released(os->os_phys_buf) ||
+ list_link_active(&os->os_dsl_dataset->ds_synced_link));
+ ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+ (void) arc_release(db->db_buf, db);
+}
+
+/*
+ * We already have a dirty record for this TXG, and we are being
+ * dirtied again.
+ */
+static void
+dbuf_redirty(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
+ /*
+ * If this buffer has already been written out,
+ * we now need to reset its state.
+ */
+ dbuf_unoverride(dr);
+ if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+ db->db_state != DB_NOFILL) {
+ /* Already released on initial dirty, so just thaw. */
+ ASSERT(arc_released(db->db_buf));
+ arc_buf_thaw(db->db_buf);
+ }
+ }
+}
+
+dbuf_dirty_record_t *
+dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);
+ dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);
+ ASSERT(dn->dn_maxblkid >= blkid);
+
+ dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);
+ list_link_init(&dr->dr_dirty_node);
+ list_link_init(&dr->dr_dbuf_node);
+ dr->dr_dnode = dn;
+ dr->dr_txg = tx->tx_txg;
+ dr->dt.dll.dr_blkid = blkid;
+ dr->dr_accounted = dn->dn_datablksz;
+
+ /*
+ * There should not be any dbuf for the block that we're dirtying.
+ * Otherwise the buffer contents could be inconsistent between the
+ * dbuf and the lightweight dirty record.
+ */
+ ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid));
+
+ mutex_enter(&dn->dn_mtx);
+ int txgoff = tx->tx_txg & TXG_MASK;
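+ /*
+ * Illustrative arithmetic for the mask above: TXG_MASK is
+ * TXG_SIZE - 1 (i.e. 3), so txgoff selects one of the four
+ * in-flight txg slots; e.g. txg 1001 maps to slot 1.
+ */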
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
+ }
+
+ if (dn->dn_nlevels == 1) {
+ ASSERT3U(blkid, <, dn->dn_nblkptr);
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_setdirty(dn, tx);
+ } else {
+ mutex_exit(&dn->dn_mtx);
+
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,
+ 1, blkid >> epbs, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (parent_db == NULL) {
+ kmem_free(dr, sizeof (*dr));
+ return (NULL);
+ }
+ int err = dbuf_read(parent_db, NULL,
+ (DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err != 0) {
+ dbuf_rele(parent_db, FTAG);
+ kmem_free(dr, sizeof (*dr));
+ return (NULL);
+ }
+
+ dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);
+ dbuf_rele(parent_db, FTAG);
+ mutex_enter(&parent_dr->dt.di.dr_mtx);
+ ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);
+ list_insert_tail(&parent_dr->dt.di.dr_children, dr);
+ mutex_exit(&parent_dr->dt.di.dr_mtx);
+ dr->dr_parent = parent_dr;
+ }
+
+ dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);
+
+ return (dr);
+}
+
+dbuf_dirty_record_t *
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ objset_t *os;
+ dbuf_dirty_record_t *dr, *dr_next, *dr_head;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ boolean_t drop_struct_rwlock = B_FALSE;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ DMU_TX_DIRTY_BUF(tx, db);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ /*
+ * Shouldn't dirty a regular buffer in syncing context. Private
+ * objects may be dirtied in syncing context, but only if they
+ * were already pre-dirtied in open context.
+ */
+#ifdef ZFS_DEBUG
+ if (dn->dn_objset->os_dsl_dataset != NULL) {
+ rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+ RW_READER, FTAG);
+ }
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
+ DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+ dn->dn_objset->os_dsl_dataset == NULL);
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
+ /*
+ * We make this assert for private objects as well, but after we
+ * check if we're already dirty. They are allowed to re-dirty
+ * in syncing context.
+ */
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * XXX make this true for indirects too? The problem is that
+ * transactions created with dmu_tx_create_assigned() from
+ * syncing context don't bother holding ahead.
+ */
+ ASSERT(db->db_level != 0 ||
+ db->db_state == DB_CACHED || db->db_state == DB_FILL ||
+ db->db_state == DB_NOFILL);
+
+ mutex_enter(&dn->dn_mtx);
+ dnode_set_dirtyctx(dn, tx, db);
+ if (tx->tx_txg > dn->dn_dirty_txg)
+ dn->dn_dirty_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ dn->dn_have_spill = B_TRUE;
+
+ /*
+ * If this buffer is already dirty, we're done.
+ */
+ dr_head = list_head(&db->db_dirty_records);
+ ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg ||
+ db->db.db_object == DMU_META_DNODE_OBJECT);
+ dr_next = dbuf_find_dirty_lte(db, tx->tx_txg);
+ if (dr_next && dr_next->dr_txg == tx->tx_txg) {
+ DB_DNODE_EXIT(db);
+
+ dbuf_redirty(dr_next);
+ mutex_exit(&db->db_mtx);
+ return (dr_next);
+ }
+
+ /*
+ * Only valid if not already dirty.
+ */
+ ASSERT(dn->dn_object == 0 ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ ASSERT3U(dn->dn_nlevels, >, db->db_level);
+
+ /*
+ * We should only be dirtying in syncing context if it's the
+ * mos or we're initializing the os or it's a special object.
+ * However, we are allowed to dirty in syncing context provided
+ * we already dirtied it in open context. Hence we must make
+ * this assertion only if we're not already dirty.
+ */
+ os = dn->dn_objset;
+ VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
+#ifdef ZFS_DEBUG
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+ os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
+ ASSERT(db->db.db_size != 0);
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ if (db->db_blkid != DMU_BONUS_BLKID) {
+ dmu_objset_willuse_space(os, db->db.db_size, tx);
+ }
+
+ /*
+ * If this buffer is dirty in an old transaction group we need
+ * to make a copy of it so that the changes we make in this
+ * transaction group won't leak out when we sync the older txg.
+ */
+ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+ list_link_init(&dr->dr_dirty_node);
+ list_link_init(&dr->dr_dbuf_node);
+ dr->dr_dnode = dn;
+ if (db->db_level == 0) {
+ void *data_old = db->db_buf;
+
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db.db_data;
+ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * Release the data buffer from the cache so
+ * that we can modify it without impacting
+ * possible other users of this cached data
+ * block. Note that indirect blocks and
+ * private objects are not released until the
+ * syncing state (since they are only modified
+ * then).
+ */
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db_buf;
+ }
+ ASSERT(data_old != NULL);
+ }
+ dr->dt.dl.dr_data = data_old;
+ } else {
+ mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
+ list_create(&dr->dt.di.dr_children,
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+ if (db->db_blkid != DMU_BONUS_BLKID)
+ dr->dr_accounted = db->db.db_size;
+ dr->dr_dbuf = db;
+ dr->dr_txg = tx->tx_txg;
+ list_insert_before(&db->db_dirty_records, dr_next, dr);
+
+ /*
+ * We could have been freed_in_flight between the dbuf_noread
+ * and dbuf_dirty. We win, as though the dbuf_noread() had
+ * happened after the free.
+ */
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ range_tree_clear(dn->dn_free_ranges[txgoff],
+ db->db_blkid, 1);
+ }
+ mutex_exit(&dn->dn_mtx);
+ db->db_freed_in_flight = FALSE;
+ }
+
+ /*
+ * This buffer is now part of this txg
+ */
+ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
+ db->db_dirtycnt += 1;
+ ASSERT3U(db->db_dirtycnt, <=, 3);
+
+ mutex_exit(&db->db_mtx);
+
+ if (db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
+ return (dr);
+ }
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_rwlock = B_TRUE;
+ }
+
+ /*
+ * If we are overwriting a dedup BP, then unless it is snapshotted,
+ * when we get to syncing context we will need to decrement its
+ * refcount in the DDT. Prefetch the relevant DDT block so that
+ * syncing context won't have to wait for the i/o.
+ */
+ if (db->db_blkptr != NULL) {
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+ ddt_prefetch(os->os_spa, db->db_blkptr);
+ dmu_buf_unlock_parent(db, dblt, FTAG);
+ }
+
+ /*
+ * We need to hold the dn_struct_rwlock to make this assertion,
+ * because it protects dn_phys / dn_next_nlevels from changing.
+ */
+ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+ dn->dn_phys->dn_nlevels > db->db_level ||
+ dn->dn_next_nlevels[txgoff] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
+ if (db->db_level == 0) {
+ ASSERT(!db->db_objset->os_raw_receive ||
+ dn->dn_maxblkid >= db->db_blkid);
+ dnode_new_blkid(dn, db->db_blkid, tx,
+ drop_struct_rwlock, B_FALSE);
+ ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ }
+
+ if (db->db_level+1 < dn->dn_nlevels) {
+ dmu_buf_impl_t *parent = db->db_parent;
+ dbuf_dirty_record_t *di;
+ int parent_held = FALSE;
+
+ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ parent = dbuf_hold_level(dn, db->db_level + 1,
+ db->db_blkid >> epbs, FTAG);
+ ASSERT(parent != NULL);
+ parent_held = TRUE;
+ }
+ if (drop_struct_rwlock)
+ rw_exit(&dn->dn_struct_rwlock);
+ ASSERT3U(db->db_level + 1, ==, parent->db_level);
+ di = dbuf_dirty(parent, tx);
+ if (parent_held)
+ dbuf_rele(parent, FTAG);
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * Since we've dropped the mutex, it's possible that
+ * dbuf_undirty() might have changed this out from under us.
+ */
+ if (list_head(&db->db_dirty_records) == dr ||
+ dn->dn_object == DMU_META_DNODE_OBJECT) {
+ mutex_enter(&di->dt.di.dr_mtx);
+ ASSERT3U(di->dr_txg, ==, tx->tx_txg);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&di->dt.di.dr_children, dr);
+ mutex_exit(&di->dt.di.dr_mtx);
+ dr->dr_parent = di;
+ }
+ mutex_exit(&db->db_mtx);
+ } else {
+ ASSERT(db->db_level + 1 == dn->dn_nlevels);
+ ASSERT(db->db_blkid < dn->dn_nblkptr);
+ ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ if (drop_struct_rwlock)
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
+ return (dr);
+}
+
+static void
+dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ if (dr->dt.dl.dr_data != db->db.db_data) {
+ struct dnode *dn = dr->dr_dnode;
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+
+ kmem_free(dr->dt.dl.dr_data, max_bonuslen);
+ arc_space_return(max_bonuslen, ARC_SPACE_BONUS);
+ }
+ db->db_data_pending = NULL;
+ ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
+ list_remove(&db->db_dirty_records, dr);
+ if (dr->dr_dbuf->db_level != 0) {
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ ASSERT3U(db->db_dirtycnt, >, 0);
+ db->db_dirtycnt -= 1;
+}
+
+/*
+ * Undirty a buffer in the transaction group referenced by the given
+ * transaction. Return whether this evicted the dbuf.
+ */
+static boolean_t
+dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ uint64_t txg = tx->tx_txg;
+
+ ASSERT(txg != 0);
+
+ /*
+ * Due to our use of dn_nlevels below, this can only be called
+ * in open context, unless we are operating on the MOS.
+ * From syncing context, dn_nlevels may be different from the
+ * dn_nlevels used when dbuf was dirtied.
+ */
+ ASSERT(db->db_objset ==
+ dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+ txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT0(db->db_level);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ /*
+ * If this buffer is not dirty, we're done.
+ */
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);
+ if (dr == NULL)
+ return (B_FALSE);
+ ASSERT(dr->dr_dbuf == db);
+
+ dnode_t *dn = dr->dr_dnode;
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ ASSERT(db->db.db_size != 0);
+
+ dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+ dr->dr_accounted, txg);
+
+ list_remove(&db->db_dirty_records, dr);
+
+ /*
+ * Note that there are three places in dbuf_dirty()
+ * where this dirty record may be put on a list.
+ * Make sure to do a list_remove corresponding to
+ * every one of those list_insert calls.
+ */
+ if (dr->dr_parent) {
+ mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
+ list_remove(&dr->dr_parent->dt.di.dr_children, dr);
+ mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
+ } else if (db->db_blkid == DMU_SPILL_BLKID ||
+ db->db_level + 1 == dn->dn_nlevels) {
+ ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (db->db_state != DB_NOFILL) {
+ dbuf_unoverride(dr);
+
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
+ if (dr->dt.dl.dr_data != db->db_buf)
+ arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
+
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+
+ if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
+ ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+ dbuf_destroy(db);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+static void
+dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+ /*
+ * Quick check for dirtiness. For already dirty blocks, this
+ * reduces runtime of this function by >90%, and overall performance
+ * by 50% for some workloads (e.g. file deletion with indirect blocks
+ * cached).
+ */
+ mutex_enter(&db->db_mtx);
+
+ if (db->db_state == DB_CACHED) {
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+ /*
+ * It's possible that it is already dirty but not cached,
+ * because there are some calls to dbuf_dirty() that don't
+ * go through dmu_buf_will_dirty().
+ */
+ if (dr != NULL) {
+ /* This dbuf is already dirty and cached. */
+ dbuf_redirty(dr);
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+ }
+ mutex_exit(&db->db_mtx);
+
+ DB_DNODE_ENTER(db);
+ if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
+ flags |= DB_RF_HAVESTRUCT;
+ DB_DNODE_EXIT(db);
+ (void) dbuf_read(db, NULL, flags);
+ (void) dbuf_dirty(db, tx);
+}
+
+void
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty_impl(db_fake,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
+}
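+
+/*
+ * Typical caller pattern (illustrative sketch, assuming 'db' is a held
+ * dmu_buf_t and 'tx' an assigned transaction): dirty the buffer in open
+ * context before modifying its contents.
+ *
+ * dmu_buf_will_dirty(db, tx);
+ * bcopy(src, db->db_data, db->db_size);
+ */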
+
+boolean_t
+dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dbuf_dirty_record_t *dr;
+
+ mutex_enter(&db->db_mtx);
+ dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+ mutex_exit(&db->db_mtx);
+ return (dr != NULL);
+}
+
+void
+dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_state = DB_NOFILL;
+ DTRACE_SET_STATE(db, "allocating NOFILL buffer");
+ dmu_buf_will_fill(db_fake, tx);
+}
+
+void
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(db->db_level == 0);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
+ dmu_tx_private_ok(tx));
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
+}
+
+/*
+ * This function is effectively the same as dmu_buf_will_dirty(), but
+ * indicates the caller expects raw encrypted data in the db, and provides
+ * the crypt params (byteorder, salt, iv, mac) which should be stored in the
+ * blkptr_t when this dbuf is written. This is only used for blocks of
+ * dnodes, during raw receive.
+ */
+void
+dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
+ const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dbuf_dirty_record_t *dr;
+
+ /*
+ * dr_has_raw_params is only processed for blocks of dnodes
+ * (see dbuf_sync_dnode_leaf_crypt()).
+ */
+ ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
+ ASSERT3U(db->db_level, ==, 0);
+ ASSERT(db->db_objset->os_raw_receive);
+
+ dmu_buf_will_dirty_impl(db_fake,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
+
+ dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+
+ ASSERT3P(dr, !=, NULL);
+
+ dr->dt.dl.dr_has_raw_params = B_TRUE;
+ dr->dt.dl.dr_byteorder = byteorder;
+ bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN);
+ bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN);
+ bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN);
+}
+
+static void
+dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ struct dirty_leaf *dl;
+ dbuf_dirty_record_t *dr;
+
+ dr = list_head(&db->db_dirty_records);
+ ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+ dl = &dr->dt.dl;
+ dl->dr_overridden_by = *bp;
+ dl->dr_override_state = DR_OVERRIDDEN;
+ dl->dr_overridden_by.blk_birth = dr->dr_txg;
+}
+
+/* ARGSUSED */
+void
+dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+ dbuf_states_t old_state;
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
+ old_state = db->db_state;
+ db->db_state = DB_CACHED;
+ if (old_state == DB_FILL) {
+ if (db->db_level == 0 && db->db_freed_in_flight) {
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ /* we were freed while filling */
+ /* XXX dbuf_undirty? */
+ bzero(db->db.db_data, db->db.db_size);
+ db->db_freed_in_flight = FALSE;
+ DTRACE_SET_STATE(db,
+ "fill done handling freed in flight");
+ } else {
+ DTRACE_SET_STATE(db, "fill done");
+ }
+ cv_broadcast(&db->db_changed);
+ }
+ mutex_exit(&db->db_mtx);
+}
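+
+/*
+ * Illustrative sketch of the fill protocol: a caller that will overwrite an
+ * entire block (and so has no need of its old contents) brackets the update
+ * with dmu_buf_will_fill() and dmu_buf_fill_done():
+ *
+ * dmu_buf_will_fill(db, tx);
+ * ... overwrite all of db->db_data ...
+ * dmu_buf_fill_done(db, tx);
+ */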
+
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+ bp_embedded_type_t etype, enum zio_compress comp,
+ int uncompressed_size, int compressed_size, int byteorder,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+ struct dirty_leaf *dl;
+ dmu_object_type_t type;
+ dbuf_dirty_record_t *dr;
+
+ if (etype == BP_EMBEDDED_TYPE_DATA) {
+ ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
+ SPA_FEATURE_EMBEDDED_DATA));
+ }
+
+ DB_DNODE_ENTER(db);
+ type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ ASSERT0(db->db_level);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ dmu_buf_will_not_fill(dbuf, tx);
+
+ dr = list_head(&db->db_dirty_records);
+ ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+ dl = &dr->dt.dl;
+ encode_embedded_bp_compressed(&dl->dr_overridden_by,
+ data, comp, uncompressed_size, compressed_size);
+ BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+ BP_SET_TYPE(&dl->dr_overridden_by, type);
+ BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+ BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+ dl->dr_override_state = DR_OVERRIDDEN;
+ dl->dr_overridden_by.blk_birth = dr->dr_txg;
+}
+
+void
+dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+ dmu_object_type_t type;
+ ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset,
+ SPA_FEATURE_REDACTED_DATASETS));
+
+ DB_DNODE_ENTER(db);
+ type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ ASSERT0(db->db_level);
+ dmu_buf_will_not_fill(dbuf, tx);
+
+ blkptr_t bp = { { { {0} } } };
+ BP_SET_TYPE(&bp, type);
+ BP_SET_LEVEL(&bp, 0);
+ BP_SET_BIRTH(&bp, tx->tx_txg, 0);
+ BP_SET_REDACTED(&bp);
+ BPE_SET_LSIZE(&bp, dbuf->db_size);
+
+ dbuf_override_impl(db, &bp, tx);
+}
+
+/*
+ * Directly assign a provided arc buf to a given dbuf if it's not referenced
+ * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
+ */
+void
+dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
+{
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(db->db_level == 0);
+ ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
+ ASSERT(buf != NULL);
+ ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
+ ASSERT(tx->tx_txg != 0);
+
+ arc_return_buf(buf, db);
+ ASSERT(arc_released(buf));
+
+ mutex_enter(&db->db_mtx);
+
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+
+ if (db->db_state == DB_CACHED &&
+ zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
+ /*
+ * In practice, we will never have a case where we have an
+ * encrypted arc buffer while additional holds exist on the
+ * dbuf. We don't handle this here so we simply assert that
+ * fact instead.
+ */
+ ASSERT(!arc_is_encrypted(buf));
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ bcopy(buf->b_data, db->db.db_data, db->db.db_size);
+ arc_buf_destroy(buf, db);
+ return;
+ }
+
+ if (db->db_state == DB_CACHED) {
+ dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+
+ ASSERT(db->db_buf != NULL);
+ if (dr != NULL && dr->dr_txg == tx->tx_txg) {
+ ASSERT(dr->dt.dl.dr_data == db->db_buf);
+
+ if (!arc_released(db->db_buf)) {
+ ASSERT(dr->dt.dl.dr_override_state ==
+ DR_OVERRIDDEN);
+ arc_release(db->db_buf, db);
+ }
+ dr->dt.dl.dr_data = buf;
+ arc_buf_destroy(db->db_buf, db);
+ } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
+ arc_release(db->db_buf, db);
+ arc_buf_destroy(db->db_buf, db);
+ }
+ db->db_buf = NULL;
+ }
+ ASSERT(db->db_buf == NULL);
+ dbuf_set_data(db, buf);
+ db->db_state = DB_FILL;
+ DTRACE_SET_STATE(db, "filling assigned arcbuf");
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ dmu_buf_fill_done(&db->db, tx);
+}
+
+void
+dbuf_destroy(dmu_buf_impl_t *db)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *parent = db->db_parent;
+ dmu_buf_impl_t *dndb;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(zfs_refcount_is_zero(&db->db_holds));
+
+ if (db->db_buf != NULL) {
+ arc_buf_destroy(db->db_buf, db);
+ db->db_buf = NULL;
+ }
+
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ if (db->db.db_data != NULL) {
+ kmem_free(db->db.db_data, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_BONUS);
+ db->db_state = DB_UNCACHED;
+ DTRACE_SET_STATE(db, "buffer cleared");
+ }
+ }
+
+ dbuf_clear_data(db);
+
+ if (multilist_link_active(&db->db_cache_link)) {
+ ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+ db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+ multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size,
+ db->db.db_size, db);
+
+ if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMPDOWN(metadata_cache_count);
+ } else {
+ DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
+ DBUF_STAT_BUMPDOWN(cache_count);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
+ db->db.db_size);
+ }
+ db->db_caching_status = DB_NO_CACHE;
+ }
+
+ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
+ ASSERT(db->db_data_pending == NULL);
+ ASSERT(list_is_empty(&db->db_dirty_records));
+
+ db->db_state = DB_EVICTING;
+ DTRACE_SET_STATE(db, "buffer eviction started");
+ db->db_blkptr = NULL;
+
+ /*
+ * Now that db_state is DB_EVICTING, nobody else can find this via
+ * the hash table. We can now drop db_mtx, which allows us to
+ * acquire the dn_dbufs_mtx.
+ */
+ mutex_exit(&db->db_mtx);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dndb = dn->dn_dbuf;
+ if (db->db_blkid != DMU_BONUS_BLKID) {
+ boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+ if (needlock)
+ mutex_enter_nested(&dn->dn_dbufs_mtx,
+ NESTED_SINGLE);
+ avl_remove(&dn->dn_dbufs, db);
+ membar_producer();
+ DB_DNODE_EXIT(db);
+ if (needlock)
+ mutex_exit(&dn->dn_dbufs_mtx);
+ /*
+ * Decrementing the dbuf count means that the hold corresponding
+ * to the removed dbuf is no longer discounted in dnode_move(),
+ * so the dnode cannot be moved until after we release the hold.
+ * The membar_producer() ensures visibility of the decremented
+ * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+ * release any lock.
+ */
+ mutex_enter(&dn->dn_mtx);
+ dnode_rele_and_unlock(dn, db, B_TRUE);
+ db->db_dnode_handle = NULL;
+
+ dbuf_hash_remove(db);
+ } else {
+ DB_DNODE_EXIT(db);
+ }
+
+ ASSERT(zfs_refcount_is_zero(&db->db_holds));
+
+ db->db_parent = NULL;
+
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ ASSERT(db->db_hash_next == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ ASSERT(db->db_data_pending == NULL);
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+ ASSERT(!multilist_link_active(&db->db_cache_link));
+
+ kmem_cache_free(dbuf_kmem_cache, db);
+ arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
+
+ /*
+ * If this dbuf is referenced from an indirect dbuf,
+ * decrement the ref count on the indirect dbuf.
+ */
+ if (parent && parent != dndb) {
+ mutex_enter(&parent->db_mtx);
+ dbuf_rele_and_unlock(parent, db, B_TRUE);
+ }
+}
+
+/*
+ * Note: While bpp will always be updated if the function returns success,
+ * parentp will not be updated if the dnode does not have dn_dbuf filled in;
+ * this happens when the dnode is the meta-dnode, or {user|group|project}used
+ * object.
+ */
+__attribute__((always_inline))
+static inline int
+dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+ dmu_buf_impl_t **parentp, blkptr_t **bpp)
+{
+ *parentp = NULL;
+ *bpp = NULL;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+
+ if (blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_have_spill &&
+ (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ *bpp = DN_SPILL_BLKPTR(dn->dn_phys);
+ else
+ *bpp = NULL;
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ mutex_exit(&dn->dn_mtx);
+ return (0);
+ }
+
+ int nlevels =
+ (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT3U(level * epbs, <, 64);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ /*
+ * This assertion shouldn't trip as long as the max indirect block size
+ * is less than 1M. The reason for this is that up to that point,
+ * the number of levels required to address an entire object with blocks
+ * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In
+ * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
+ * (i.e. we can address the entire object), objects will all use at most
+ * N-1 levels and the assertion won't overflow. However, once epbs is
+ * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be
+ * enough to address an entire object, so objects will have 5 levels,
+ * but then this assertion will overflow.
+ *
+ * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
+ * need to redo this logic to handle overflows.
+ */
+ ASSERT(level >= nlevels ||
+ ((nlevels - level - 1) * epbs) +
+ highbit64(dn->dn_phys->dn_nblkptr) <= 64);
+ if (level >= nlevels ||
+ blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
+ ((nlevels - level - 1) * epbs)) ||
+ (fail_sparse &&
+ blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
+ /* the buffer has no parent yet */
+ return (SET_ERROR(ENOENT));
+ } else if (level < nlevels-1) {
+ /* this block is referenced from an indirect block */
+ int err;
+
+ err = dbuf_hold_impl(dn, level + 1,
+ blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
+
+ if (err)
+ return (err);
+ err = dbuf_read(*parentp, NULL,
+ (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err) {
+ dbuf_rele(*parentp, NULL);
+ *parentp = NULL;
+ return (err);
+ }
+ rw_enter(&(*parentp)->db_rwlock, RW_READER);
+ *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+ (blkid & ((1ULL << epbs) - 1));
+ if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
+ ASSERT(BP_IS_HOLE(*bpp));
+ rw_exit(&(*parentp)->db_rwlock);
+ return (0);
+ } else {
+ /* the block is referenced from the dnode */
+ ASSERT3U(level, ==, nlevels-1);
+ ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
+ blkid < dn->dn_phys->dn_nblkptr);
+ if (dn->dn_dbuf) {
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ }
+ *bpp = &dn->dn_phys->dn_blkptr[blkid];
+ return (0);
+ }
+}
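+
+/*
+ * Worked example of the addressing arithmetic above (for illustration): with
+ * 128K indirect blocks, epbs = 17 - SPA_BLKPTRSHIFT (7) = 10, so each
+ * indirect block holds 1024 block pointers. Level-0 blkid 5000 is then found
+ * in the level-1 block at blkid 5000 >> 10 = 4, at slot 5000 & 1023 = 904.
+ */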
+
+static dmu_buf_impl_t *
+dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
+ dmu_buf_impl_t *parent, blkptr_t *blkptr)
+{
+ objset_t *os = dn->dn_objset;
+ dmu_buf_impl_t *db, *odb;
+
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
+
+ list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dbuf_node));
+
+ db->db_objset = os;
+ db->db.db_object = dn->dn_object;
+ db->db_level = level;
+ db->db_blkid = blkid;
+ db->db_dirtycnt = 0;
+ db->db_dnode_handle = dn->dn_handle;
+ db->db_parent = parent;
+ db->db_blkptr = blkptr;
+
+ db->db_user = NULL;
+ db->db_user_immediate_evict = FALSE;
+ db->db_freed_in_flight = FALSE;
+ db->db_pending_evict = FALSE;
+
+ if (blkid == DMU_BONUS_BLKID) {
+ ASSERT3P(parent, ==, dn->dn_dbuf);
+ db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ db->db.db_offset = DMU_BONUS_BLKID;
+ db->db_state = DB_UNCACHED;
+ DTRACE_SET_STATE(db, "bonus buffer created");
+ db->db_caching_status = DB_NO_CACHE;
+ /* the bonus dbuf is not placed in the hash table */
+ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
+ return (db);
+ } else if (blkid == DMU_SPILL_BLKID) {
+ db->db.db_size = (blkptr != NULL) ?
+ BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
+ db->db.db_offset = 0;
+ } else {
+ int blocksize =
+ db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
+ db->db.db_size = blocksize;
+ db->db.db_offset = db->db_blkid * blocksize;
+ }
+
+ /*
+ * Hold the dn_dbufs_mtx while we insert the new dbuf
+ * into the hash table *and* add it to the dn_dbufs list.
+ * This prevents a possible deadlock with someone
+ * trying to look up this dbuf before it's added to the
+ * dn_dbufs list.
+ */
+ mutex_enter(&dn->dn_dbufs_mtx);
+ db->db_state = DB_EVICTING; /* not worth logging this state change */
+ if ((odb = dbuf_hash_insert(db)) != NULL) {
+ /* someone else inserted it first */
+ kmem_cache_free(dbuf_kmem_cache, db);
+ mutex_exit(&dn->dn_dbufs_mtx);
+ DBUF_STAT_BUMP(hash_insert_race);
+ return (odb);
+ }
+ avl_add(&dn->dn_dbufs, db);
+
+ db->db_state = DB_UNCACHED;
+ DTRACE_SET_STATE(db, "regular buffer created");
+ db->db_caching_status = DB_NO_CACHE;
+ mutex_exit(&dn->dn_dbufs_mtx);
+ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
+
+ if (parent && parent != dn->dn_dbuf)
+ dbuf_add_ref(parent, db);
+
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ zfs_refcount_count(&dn->dn_holds) > 0);
+ (void) zfs_refcount_add(&dn->dn_holds, db);
+
+ dprintf_dbuf(db, "db=%p\n", db);
+
+ return (db);
+}
+
+/*
+ * This function returns a block pointer and information about the object,
+ * given a dnode and a block. This is a publicly accessible version of
+ * dbuf_findbp that only returns some information, rather than the
+ * dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock
+ * should be locked as (at least) a reader.
+ */
+int
+dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
+ blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift)
+{
+ dmu_buf_impl_t *dbp = NULL;
+ blkptr_t *bp2;
+ int err = 0;
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+ err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
+ if (err == 0) {
+ *bp = *bp2;
+ if (dbp != NULL)
+ dbuf_rele(dbp, NULL);
+ if (datablkszsec != NULL)
+ *datablkszsec = dn->dn_phys->dn_datablkszsec;
+ if (indblkshift != NULL)
+ *indblkshift = dn->dn_phys->dn_indblkshift;
+ }
+
+ return (err);
+}
+
+typedef struct dbuf_prefetch_arg {
+ spa_t *dpa_spa; /* The spa to issue the prefetch in. */
+ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
+ int dpa_curlevel; /* The current level that we're reading */
+ dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
+ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
+ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
+ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+ dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
+ void *dpa_arg; /* prefetch completion arg */
+} dbuf_prefetch_arg_t;
+
+static void
+dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
+{
+ if (dpa->dpa_cb != NULL)
+ dpa->dpa_cb(dpa->dpa_arg, io_done);
+ kmem_free(dpa, sizeof (*dpa));
+}
+
+static void
+dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *iobp, arc_buf_t *abuf, void *private)
+{
+ dbuf_prefetch_arg_t *dpa = private;
+
+ dbuf_prefetch_fini(dpa, B_TRUE);
+ if (abuf != NULL)
+ arc_buf_destroy(abuf, private);
+}
+
+/*
+ * Actually issue the prefetch read for the block given.
+ */
+static void
+dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
+{
+ ASSERT(!BP_IS_REDACTED(bp) ||
+ dsl_dataset_feature_is_active(
+ dpa->dpa_dnode->dn_objset->os_dsl_dataset,
+ SPA_FEATURE_REDACTED_DATASETS));
+
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
+ return (dbuf_prefetch_fini(dpa, B_FALSE));
+
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ arc_flags_t aflags =
+ dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+ ARC_FLAG_NO_BUF;
+
+ /* dnodes are always read as raw and then converted later */
+ if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
+ dpa->dpa_curlevel == 0)
+ zio_flags |= ZIO_FLAG_RAW;
+
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+ ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
+ ASSERT(dpa->dpa_zio != NULL);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
+ dbuf_issue_final_prefetch_done, dpa,
+ dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
+}
+
+/*
+ * Called when an indirect block above our prefetch target is read in. This
+ * will either read in the next indirect block down the tree or issue the actual
+ * prefetch if the next block down is our target.
+ */
+static void
+dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *iobp, arc_buf_t *abuf, void *private)
+{
+ dbuf_prefetch_arg_t *dpa = private;
+
+ ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
+ ASSERT3S(dpa->dpa_curlevel, >, 0);
+
+ if (abuf == NULL) {
+ ASSERT(zio == NULL || zio->io_error != 0);
+ return (dbuf_prefetch_fini(dpa, B_TRUE));
+ }
+ ASSERT(zio == NULL || zio->io_error == 0);
+
+ /*
+ * The dpa_dnode is only valid if we are called with a NULL
+ * zio. This indicates that the arc_read() returned without
+ * first calling zio_read() to issue a physical read. Once
+ * a physical read is made the dpa_dnode must be invalidated
+ * as the locks guarding it may have been dropped. If the
+ * dpa_dnode is still valid, then we want to add it to the dbuf
+ * cache. To do so, we must hold the dbuf associated with the block
+ * we just prefetched, read its contents so that we associate it
+ * with an arc_buf_t, and then release it.
+ */
+ if (zio != NULL) {
+ ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
+ if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
+ ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
+ } else {
+ ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
+ }
+ ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+
+ dpa->dpa_dnode = NULL;
+ } else if (dpa->dpa_dnode != NULL) {
+ uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
+ (dpa->dpa_epbs * (dpa->dpa_curlevel -
+ dpa->dpa_zb.zb_level));
+ dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
+ dpa->dpa_curlevel, curblkid, FTAG);
+ if (db == NULL) {
+ arc_buf_destroy(abuf, private);
+ return (dbuf_prefetch_fini(dpa, B_TRUE));
+ }
+ (void) dbuf_read(db, NULL,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
+ dbuf_rele(db, FTAG);
+ }
+
+ dpa->dpa_curlevel--;
+ uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
+ (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
+ blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
+ P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
+
+ ASSERT(!BP_IS_REDACTED(bp) ||
+ dsl_dataset_feature_is_active(
+ dpa->dpa_dnode->dn_objset->os_dsl_dataset,
+ SPA_FEATURE_REDACTED_DATASETS));
+ if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
+ dbuf_prefetch_fini(dpa, B_TRUE);
+ } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
+ ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
+ dbuf_issue_final_prefetch(dpa, bp);
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
+ iter_aflags |= ARC_FLAG_L2CACHE;
+
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+
+ SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
+ dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
+
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
+ }
+
+ arc_buf_destroy(abuf, private);
+}
+
+/*
+ * Issue prefetch reads for the given block on the given level. If the indirect
+ * blocks above that block are not in memory, we will read them in
+ * asynchronously. As a result, this call never blocks waiting for a read to
+ * complete. Note that the prefetch might fail if the dataset is encrypted and
+ * the encryption key is unmapped before the IO completes.
+ */
+int
+dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
+ zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
+ void *arg)
+{
+ blkptr_t bp;
+ int epbs, nlevels, curlevel;
+ uint64_t curblkid;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+ if (blkid > dn->dn_maxblkid)
+ goto no_issue;
+
+ if (level == 0 && dnode_block_freed(dn, blkid))
+ goto no_issue;
+
+ /*
+ * This dnode hasn't been written to disk yet, so there's nothing to
+ * prefetch.
+ */
+ nlevels = dn->dn_phys->dn_nlevels;
+ if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
+ goto no_issue;
+
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
+ goto no_issue;
+
+ dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
+ level, blkid);
+ if (db != NULL) {
+ mutex_exit(&db->db_mtx);
+ /*
+ * This dbuf already exists. It is either CACHED, or
+ * (we assume) about to be read or filled.
+ */
+ goto no_issue;
+ }
+
+ /*
+ * Find the closest ancestor (indirect block) of the target block
+ * that is present in the cache. In this indirect block, we will
+ * find the bp that is at curlevel, curblkid.
+ */
+ curlevel = level;
+ curblkid = blkid;
+ while (curlevel < nlevels - 1) {
+ int parent_level = curlevel + 1;
+ uint64_t parent_blkid = curblkid >> epbs;
+ dmu_buf_impl_t *db;
+
+ if (dbuf_hold_impl(dn, parent_level, parent_blkid,
+ FALSE, TRUE, FTAG, &db) == 0) {
+ blkptr_t *bpp = db->db_buf->b_data;
+ bp = bpp[P2PHASE(curblkid, 1 << epbs)];
+ dbuf_rele(db, FTAG);
+ break;
+ }
+
+ curlevel = parent_level;
+ curblkid = parent_blkid;
+ }
+
+ if (curlevel == nlevels - 1) {
+ /* No cached indirect blocks found. */
+ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
+ bp = dn->dn_phys->dn_blkptr[curblkid];
+ }
+ ASSERT(!BP_IS_REDACTED(&bp) ||
+ dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
+ SPA_FEATURE_REDACTED_DATASETS));
+ if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
+ goto no_issue;
+
+ ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
+
+ zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, level, blkid);
+ dpa->dpa_curlevel = curlevel;
+ dpa->dpa_prio = prio;
+ dpa->dpa_aflags = aflags;
+ dpa->dpa_spa = dn->dn_objset->os_spa;
+ dpa->dpa_dnode = dn;
+ dpa->dpa_epbs = epbs;
+ dpa->dpa_zio = pio;
+ dpa->dpa_cb = cb;
+ dpa->dpa_arg = arg;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+ dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
+
+ /*
+ * If we have the indirect just above us, no need to do the asynchronous
+ * prefetch chain; we'll just run the last step ourselves. If we're at
+ * a higher level, though, we want to issue the prefetches for all the
+ * indirect blocks asynchronously, so we can go on with whatever we were
+ * doing.
+ */
+ if (curlevel == level) {
+ ASSERT3U(curblkid, ==, blkid);
+ dbuf_issue_final_prefetch(dpa, &bp);
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+ iter_aflags |= ARC_FLAG_L2CACHE;
+
+ SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, curlevel, curblkid);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ &bp, dbuf_prefetch_indirect_done, dpa, prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
+ }
+ /*
+ * We use pio here instead of dpa_zio since it's possible that
+ * dpa may have already been freed.
+ */
+ zio_nowait(pio);
+ return (1);
+no_issue:
+ if (cb != NULL)
+ cb(arg, B_FALSE);
+ return (0);
+}
+
+int
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+ arc_flags_t aflags)
+{
+
+ return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
+}
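+
+/*
+ * Example (illustrative, assuming 'dn' is a held dnode and dn_struct_rwlock
+ * is held): start an asynchronous prefetch of level-0 block 'blkid' and
+ * return without waiting for the I/O.
+ *
+ * (void) dbuf_prefetch(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ, 0);
+ */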
+
+/*
+ * Helper function for dbuf_hold_impl() to copy a buffer. Handles
+ * the case of encrypted, compressed and uncompressed buffers by
+ * allocating the new buffer, respectively, with arc_alloc_raw_buf(),
+ * arc_alloc_compressed_buf() or arc_alloc_buf().
+ *
+ * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().
+ */
+noinline static void
+dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
+{
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ arc_buf_t *newdata, *data = dr->dt.dl.dr_data;
+
+ newdata = dbuf_alloc_arcbuf_from_arcbuf(db, data);
+ dbuf_set_data(db, newdata);
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
+ rw_exit(&db->db_rwlock);
+}
+
+/*
+ * Returns with db_holds incremented, and db_mtx not held.
+ * Note: dn_struct_rwlock must be held.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
+ void *tag, dmu_buf_impl_t **dbp)
+{
+ dmu_buf_impl_t *db, *parent = NULL;
+
+ /* If the pool has been created, verify the tx_sync_lock is not held */
+ spa_t *spa = dn->dn_objset->os_spa;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ if (dp != NULL) {
+ ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
+ }
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT3U(dn->dn_nlevels, >, level);
+
+ *dbp = NULL;
+
+ /* dbuf_find() returns with db_mtx held */
+ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
+
+ if (db == NULL) {
+ blkptr_t *bp = NULL;
+ int err;
+
+ if (fail_uncached)
+ return (SET_ERROR(ENOENT));
+
+ ASSERT3P(parent, ==, NULL);
+ err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
+ if (fail_sparse) {
+ if (err == 0 && bp && BP_IS_HOLE(bp))
+ err = SET_ERROR(ENOENT);
+ if (err) {
+ if (parent)
+ dbuf_rele(parent, NULL);
+ return (err);
+ }
+ }
+ if (err && err != ENOENT)
+ return (err);
+ db = dbuf_create(dn, level, blkid, parent, bp);
+ }
+
+ if (fail_uncached && db->db_state != DB_CACHED) {
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (db->db_buf != NULL) {
+ arc_buf_access(db->db_buf);
+ ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+ }
+
+ ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+
+ /*
+ * If this buffer is currently syncing out, and we are
+ * still referencing it from db_data, we need to make a copy
+ * of it in case we decide we want to dirty it again in this txg.
+ */
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
+ db->db_state == DB_CACHED && db->db_data_pending) {
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ if (dr->dt.dl.dr_data == db->db_buf)
+ dbuf_hold_copy(dn, db);
+ }
+
+ if (multilist_link_active(&db->db_cache_link)) {
+ ASSERT(zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+ db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+ multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size,
+ db->db.db_size, db);
+
+ if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMPDOWN(metadata_cache_count);
+ } else {
+ DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
+ DBUF_STAT_BUMPDOWN(cache_count);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
+ db->db.db_size);
+ }
+ db->db_caching_status = DB_NO_CACHE;
+ }
+ (void) zfs_refcount_add(&db->db_holds, tag);
+ DBUF_VERIFY(db);
+ mutex_exit(&db->db_mtx);
+
+ /* NOTE: we can't rele the parent until after we drop the db_mtx */
+ if (parent)
+ dbuf_rele(parent, NULL);
+
+ ASSERT3P(DB_DNODE(db), ==, dn);
+ ASSERT3U(db->db_blkid, ==, blkid);
+ ASSERT3U(db->db_level, ==, level);
+ *dbp = db;
+
+ return (0);
+}
+
+dmu_buf_impl_t *
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
+{
+ return (dbuf_hold_level(dn, 0, blkid, tag));
+}
+
+dmu_buf_impl_t *
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+{
+ dmu_buf_impl_t *db;
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
+ return (err ? NULL : db);
+}
+
+void
+dbuf_create_bonus(dnode_t *dn)
+{
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ ASSERT(dn->dn_bonus == NULL);
+ dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+}
+
+int
+dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ return (SET_ERROR(ENOTSUP));
+ if (blksz == 0)
+ blksz = SPA_MINBLOCKSIZE;
+ ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+
+ dbuf_new_size(db, blksz, tx);
+
+ return (0);
+}
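+
+/*
+ * Example (illustrative sketch, not part of this file): the spill block
+ * size handling above falls back to SPA_MINBLOCKSIZE (512 bytes) for a
+ * zero size and rounds everything else up to the next multiple of it.
+ * The two macros below are local stand-ins assumed to mirror the usual
+ * illumos/OpenZFS definitions.
+ */
+#include <stdio.h>
+#include <stdint.h>
+
+#define	EX_SPA_MINBLOCKSIZE	512ULL
+#define	EX_P2ROUNDUP(x, align)	(-(-(x) & -(align)))
+
+int
+main(void)
+{
+	uint64_t sizes[] = { 0, 1, 512, 1000, 4096 };
+
+	for (int i = 0; i < 5; i++) {
+		uint64_t blksz = sizes[i];
+
+		if (blksz == 0)
+			blksz = EX_SPA_MINBLOCKSIZE;
+		blksz = EX_P2ROUNDUP(blksz, EX_SPA_MINBLOCKSIZE);
+		/* prints 0->512, 1->512, 512->512, 1000->1024, 4096->4096 */
+		printf("%llu -> %llu\n", (unsigned long long)sizes[i],
+		    (unsigned long long)blksz);
+	}
+	return (0);
+}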
+
+void
+dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
+}
+
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
+void
+dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+{
+ int64_t holds = zfs_refcount_add(&db->db_holds, tag);
+ VERIFY3S(holds, >, 1);
+}
+
+#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
+boolean_t
+dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
+ void *tag)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dmu_buf_impl_t *found_db;
+ boolean_t result = B_FALSE;
+
+ if (blkid == DMU_BONUS_BLKID)
+ found_db = dbuf_find_bonus(os, obj);
+ else
+ found_db = dbuf_find(os, obj, 0, blkid);
+
+ if (found_db != NULL) {
+ if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
+ (void) zfs_refcount_add(&db->db_holds, tag);
+ result = B_TRUE;
+ }
+ mutex_exit(&found_db->db_mtx);
+ }
+ return (result);
+}
+
+/*
+ * If you call dbuf_rele() you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
+void
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
+{
+ mutex_enter(&db->db_mtx);
+ dbuf_rele_and_unlock(db, tag, B_FALSE);
+}
+
+void
+dmu_buf_rele(dmu_buf_t *db, void *tag)
+{
+ dbuf_rele((dmu_buf_impl_t *)db, tag);
+}
+
+/*
+ * dbuf_rele() for an already-locked dbuf. This is necessary to allow
+ * db_dirtycnt and db_holds to be updated atomically. The 'evicting'
+ * argument should be set if we are already in the dbuf-evicting code
+ * path, in which case we don't want to recursively evict. This allows us to
+ * avoid deeply nested stacks that would have a call flow similar to this:
+ *
+ * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
+ * ^ |
+ * | |
+ * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
+ *
+ */
+void
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
+{
+ int64_t holds;
+ uint64_t size;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ DBUF_VERIFY(db);
+
+ /*
+ * Remove the reference to the dbuf before removing its hold on the
+ * dnode so we can guarantee in dnode_move() that a referenced bonus
+ * buffer has a corresponding dnode hold.
+ */
+ holds = zfs_refcount_remove(&db->db_holds, tag);
+ ASSERT(holds >= 0);
+
+ /*
+ * We can't freeze indirects if there is a possibility that they
+ * may be modified in the current syncing context.
+ */
+ if (db->db_buf != NULL &&
+ holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
+ arc_buf_freeze(db->db_buf);
+ }
+
+ if (holds == db->db_dirtycnt &&
+ db->db_level == 0 && db->db_user_immediate_evict)
+ dbuf_evict_user(db);
+
+ if (holds == 0) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dnode_t *dn;
+ boolean_t evict_dbuf = db->db_pending_evict;
+
+ /*
+ * If the dnode moves here, we cannot cross this
+ * barrier until the move completes.
+ */
+ DB_DNODE_ENTER(db);
+
+ dn = DB_DNODE(db);
+ atomic_dec_32(&dn->dn_dbufs_count);
+
+ /*
+ * Decrementing the dbuf count means that the bonus
+ * buffer's dnode hold is no longer discounted in
+ * dnode_move(). The dnode cannot move until after
+ * the dnode_rele() below.
+ */
+ DB_DNODE_EXIT(db);
+
+ /*
+ * Do not reference db after its lock is dropped.
+ * Another thread may evict it.
+ */
+ mutex_exit(&db->db_mtx);
+
+ if (evict_dbuf)
+ dnode_evict_bonus(dn);
+
+ dnode_rele(dn, db);
+ } else if (db->db_buf == NULL) {
+ /*
+ * This is a special case: we never associated this
+ * dbuf with any data allocated from the ARC.
+ */
+ ASSERT(db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
+ dbuf_destroy(db);
+ } else if (arc_released(db->db_buf)) {
+ /*
+ * This dbuf has anonymous data associated with it.
+ */
+ dbuf_destroy(db);
+ } else {
+ boolean_t do_arc_evict = B_FALSE;
+ blkptr_t bp;
+ spa_t *spa = dmu_objset_spa(db->db_objset);
+
+ if (!DBUF_IS_CACHEABLE(db) &&
+ db->db_blkptr != NULL &&
+ !BP_IS_HOLE(db->db_blkptr) &&
+ !BP_IS_EMBEDDED(db->db_blkptr)) {
+ do_arc_evict = B_TRUE;
+ bp = *db->db_blkptr;
+ }
+
+ if (!DBUF_IS_CACHEABLE(db) ||
+ db->db_pending_evict) {
+ dbuf_destroy(db);
+ } else if (!multilist_link_active(&db->db_cache_link)) {
+ ASSERT3U(db->db_caching_status, ==,
+ DB_NO_CACHE);
+
+ dbuf_cached_state_t dcs =
+ dbuf_include_in_metadata_cache(db) ?
+ DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+ db->db_caching_status = dcs;
+
+ multilist_insert(dbuf_caches[dcs].cache, db);
+ size = zfs_refcount_add_many(
+ &dbuf_caches[dcs].size,
+ db->db.db_size, db);
+
+ if (dcs == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMP(metadata_cache_count);
+ DBUF_STAT_MAX(
+ metadata_cache_size_bytes_max,
+ size);
+ } else {
+ DBUF_STAT_BUMP(
+ cache_levels[db->db_level]);
+ DBUF_STAT_BUMP(cache_count);
+ DBUF_STAT_INCR(
+ cache_levels_bytes[db->db_level],
+ db->db.db_size);
+ DBUF_STAT_MAX(cache_size_bytes_max,
+ size);
+ }
+ mutex_exit(&db->db_mtx);
+
+ if (dcs == DB_DBUF_CACHE && !evicting)
+ dbuf_evict_notify(size);
+ }
+
+ if (do_arc_evict)
+ arc_freed(spa, &bp);
+ }
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+}
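+
+/*
+ * Example (illustrative sketch, not part of this file): a tiny user-space
+ * model of the 'evicting' guard described above.  Dropping the last hold
+ * may trigger eviction, and eviction itself drops a hold; passing
+ * evicting=true on that inner release keeps the call chain in the ASCII
+ * diagram from recursing.  All names below are hypothetical stand-ins.
+ */
+#include <stdbool.h>
+#include <stdio.h>
+
+static int nbufs = 3;			/* pretend dbuf cache population */
+
+static void release_hold(bool evicting);
+
+static void
+evict_one(void)
+{
+	nbufs--;
+	printf("evicted one buffer, %d left\n", nbufs);
+	/* destroying the evicted buffer drops its last hold ... */
+	release_hold(true);		/* ... but must not evict again */
+}
+
+static void
+release_hold(bool evicting)
+{
+	/* only kick off eviction when not already on the eviction path */
+	if (!evicting && nbufs > 0)
+		evict_one();
+}
+
+int
+main(void)
+{
+	release_hold(false);	/* evicts exactly one buffer, no recursion */
+	return (0);
+}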
+
+#pragma weak dmu_buf_refcount = dbuf_refcount
+uint64_t
+dbuf_refcount(dmu_buf_impl_t *db)
+{
+ return (zfs_refcount_count(&db->db_holds));
+}
+
+uint64_t
+dmu_buf_user_refcount(dmu_buf_t *db_fake)
+{
+ uint64_t holds;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ mutex_enter(&db->db_mtx);
+ ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);
+ holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt;
+ mutex_exit(&db->db_mtx);
+
+ return (holds);
+}
+
+void *
+dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
+ dmu_buf_user_t *new_user)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ mutex_enter(&db->db_mtx);
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ if (db->db_user == old_user)
+ db->db_user = new_user;
+ else
+ old_user = db->db_user;
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ mutex_exit(&db->db_mtx);
+
+ return (old_user);
+}
+
+void *
+dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+ return (dmu_buf_replace_user(db_fake, NULL, user));
+}
+
+void *
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_user_immediate_evict = TRUE;
+ return (dmu_buf_set_user(db_fake, user));
+}
+
+void *
+dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+ return (dmu_buf_replace_user(db_fake, user, NULL));
+}
+
+void *
+dmu_buf_get_user(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ return (db->db_user);
+}
+
+void
+dmu_buf_user_evict_wait()
+{
+ taskq_wait(dbu_evict_taskq);
+}
+
+blkptr_t *
+dmu_buf_get_blkptr(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ return (dbi->db_blkptr);
+}
+
+objset_t *
+dmu_buf_get_objset(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ return (dbi->db_objset);
+}
+
+dnode_t *
+dmu_buf_dnode_enter(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ DB_DNODE_ENTER(dbi);
+ return (DB_DNODE(dbi));
+}
+
+void
+dmu_buf_dnode_exit(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ DB_DNODE_EXIT(dbi);
+}
+
+static void
+dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
+{
+	/* ASSERT(dmu_tx_is_syncing(tx)) */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_blkptr != NULL)
+ return;
+
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
+ BP_ZERO(db->db_blkptr);
+ return;
+ }
+ if (db->db_level == dn->dn_phys->dn_nlevels-1) {
+ /*
+		 * This buffer was allocated at a time when there were
+		 * no blkptrs available from the dnode, or it was
+ * inappropriate to hook it in (i.e., nlevels mismatch).
+ */
+ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
+ ASSERT(db->db_parent == NULL);
+ db->db_parent = dn->dn_dbuf;
+ db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+ DBUF_VERIFY(db);
+ } else {
+ dmu_buf_impl_t *parent = db->db_parent;
+ int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT(dn->dn_phys->dn_nlevels > 1);
+ if (parent == NULL) {
+ mutex_exit(&db->db_mtx);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ parent = dbuf_hold_level(dn, db->db_level + 1,
+ db->db_blkid >> epbs, db);
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ db->db_parent = parent;
+ }
+ db->db_blkptr = (blkptr_t *)parent->db.db_data +
+ (db->db_blkid & ((1ULL << epbs) - 1));
+ DBUF_VERIFY(db);
+ }
+}
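+
+/*
+ * Example (illustrative sketch, not part of this file): the epbs shift
+ * arithmetic used above to locate a child within its parent indirect
+ * block.  SPA_BLKPTRSHIFT is 7 (128-byte block pointers); the 128K
+ * indirect block size (indblkshift == 17) is only the typical value and
+ * is assumed here for the worked numbers.
+ */
+#include <stdio.h>
+#include <stdint.h>
+
+int
+main(void)
+{
+	int indblkshift = 17;		/* 128K indirect blocks (typical) */
+	int spa_blkptrshift = 7;	/* sizeof (blkptr_t) == 128 */
+	int epbs = indblkshift - spa_blkptrshift;
+
+	uint64_t blkid = 123456;	/* arbitrary level-0 block id */
+	uint64_t parent_blkid = blkid >> epbs;
+	uint64_t slot = blkid & ((1ULL << epbs) - 1);
+
+	/* prints: parent blkid 120, slot 576 of 1024 */
+	printf("parent blkid %llu, slot %llu of %llu\n",
+	    (unsigned long long)parent_blkid, (unsigned long long)slot,
+	    (unsigned long long)(1ULL << epbs));
+	return (0);
+}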
+
+static void
+dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ void *data = dr->dt.dl.dr_data;
+
+ ASSERT0(db->db_level);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID);
+ ASSERT(data != NULL);
+
+ dnode_t *dn = dr->dr_dnode;
+ ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
+ DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
+ bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys));
+
+ dbuf_sync_leaf_verify_bonus_dnode(dr);
+
+ dbuf_undirty_bonus(dr);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
+}
+
+/*
+ * When syncing out a block of dnodes, adjust the block to deal with
+ * encryption. Normally, we make sure the block is decrypted before writing
+ * it. If we have crypt params, then we are writing a raw (encrypted) block,
+ * from a raw receive. In this case, set the ARC buf's crypt params so
+ * that the BP will be filled with the correct byteorder, salt, iv, and mac.
+ */
+static void
+dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
+{
+ int err;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
+ ASSERT3U(db->db_level, ==, 0);
+
+ if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
+ zbookmark_phys_t zb;
+
+ /*
+ * Unfortunately, there is currently no mechanism for
+ * syncing context to handle decryption errors. An error
+ * here is only possible if an attacker maliciously
+ * changed a dnode block and updated the associated
+ * checksums going up the block tree.
+ */
+ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
+ db->db.db_object, db->db_level, db->db_blkid);
+ err = arc_untransform(db->db_buf, db->db_objset->os_spa,
+ &zb, B_TRUE);
+ if (err)
+ panic("Invalid dnode block MAC");
+ } else if (dr->dt.dl.dr_has_raw_params) {
+ (void) arc_release(dr->dt.dl.dr_data, db);
+ arc_convert_to_raw(dr->dt.dl.dr_data,
+ dmu_objset_id(db->db_objset),
+ dr->dt.dl.dr_byteorder, DMU_OT_DNODE,
+ dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);
+ }
+}
+
+/*
+ * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
+ * is critical that we not allow the compiler to inline this function into
+ * dbuf_sync_list(), which would drastically bloat the stack usage.
+ */
+noinline static void
+dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = dr->dr_dnode;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+
+ ASSERT(db->db_level > 0);
+ DBUF_VERIFY(db);
+
+ /* Read the block if it hasn't been read yet. */
+ if (db->db_buf == NULL) {
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ mutex_enter(&db->db_mtx);
+ }
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ ASSERT(db->db_buf != NULL);
+
+ /* Indirect block size must match what the dnode thinks it is. */
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ dbuf_check_blkptr(dn, db);
+
+ /* Provide the pending dirty record to child dbufs */
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+
+ dbuf_write(dr, db->db_buf, tx);
+
+ zio_t *zio = dr->dr_zio;
+ mutex_enter(&dr->dt.di.dr_mtx);
+ dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ mutex_exit(&dr->dt.di.dr_mtx);
+ zio_nowait(zio);
+}
+
+/*
+ * Verify that the size of the data in our bonus buffer does not exceed
+ * its recorded size.
+ *
+ * The purpose of this verification is to catch any cases in development
+ * where the size of a phys structure (e.g., space_map_phys_t) grows and,
+ * due to incorrect feature management, older pools expect to read more
+ * data even though they didn't actually write it to begin with.
+ *
+ * For example, this would catch an error in the feature logic where we
+ * open an older pool and we expect to write the space map histogram of
+ * a space map with size SPACE_MAP_SIZE_V0.
+ */
+static void
+dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
+{
+#ifdef ZFS_DEBUG
+ dnode_t *dn = dr->dr_dnode;
+
+ /*
+ * Encrypted bonus buffers can have data past their bonuslen.
+ * Skip the verification of these blocks.
+ */
+ if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))
+ return;
+
+ uint16_t bonuslen = dn->dn_phys->dn_bonuslen;
+ uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ ASSERT3U(bonuslen, <=, maxbonuslen);
+
+ arc_buf_t *datap = dr->dt.dl.dr_data;
+ char *datap_end = ((char *)datap) + bonuslen;
+ char *datap_max = ((char *)datap) + maxbonuslen;
+
+ /* ensure that everything is zero after our data */
+ for (; datap_end < datap_max; datap_end++)
+ ASSERT(*datap_end == 0);
+#endif
+}
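+
+/*
+ * Example (illustrative sketch, not part of this file): the debug check
+ * above only insists that every byte between the recorded bonus length
+ * and the maximum bonus length is zero.  The 320-byte maximum used below
+ * is the single-slot dnode case and is assumed for illustration.
+ */
+#include <assert.h>
+#include <string.h>
+
+static void
+verify_bonus_tail(const char *buf, int bonuslen, int maxbonuslen)
+{
+	for (int i = bonuslen; i < maxbonuslen; i++)
+		assert(buf[i] == 0);	/* nothing written past bonuslen */
+}
+
+int
+main(void)
+{
+	char bonus[320];
+
+	memset(bonus, 0, sizeof (bonus));
+	memset(bonus, 0xab, 200);	/* 200 bytes of real bonus data */
+	verify_bonus_tail(bonus, 200, 320);
+	return (0);
+}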
+
+static blkptr_t *
+dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
+{
+ /* This must be a lightweight dirty record. */
+ ASSERT3P(dr->dr_dbuf, ==, NULL);
+ dnode_t *dn = dr->dr_dnode;
+
+ if (dn->dn_phys->dn_nlevels == 1) {
+ VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);
+ return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
+ } else {
+ dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ VERIFY3U(parent_db->db_level, ==, 1);
+ VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn);
+ VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
+ blkptr_t *bp = parent_db->db.db_data;
+ return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
+ }
+}
+
+static void
+dbuf_lightweight_ready(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_error != 0)
+ return;
+
+ dnode_t *dn = dr->dr_dnode;
+
+ blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
+ spa_t *spa = dmu_objset_spa(dn->dn_objset);
+ int64_t delta = bp_get_dsize_sync(spa, bp) -
+ bp_get_dsize_sync(spa, bp_orig);
+ dnode_diduse_space(dn, delta);
+
+ uint64_t blkid = dr->dt.dll.dr_blkid;
+ mutex_enter(&dn->dn_mtx);
+ if (blkid > dn->dn_phys->dn_maxblkid) {
+ ASSERT0(dn->dn_objset->os_raw_receive);
+ dn->dn_phys->dn_maxblkid = blkid;
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ if (!BP_IS_EMBEDDED(bp)) {
+ uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;
+ BP_SET_FILL(bp, fill);
+ }
+
+ dmu_buf_impl_t *parent_db;
+ EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
+ if (dr->dr_parent == NULL) {
+ parent_db = dn->dn_dbuf;
+ } else {
+ parent_db = dr->dr_parent->dr_dbuf;
+ }
+ rw_enter(&parent_db->db_rwlock, RW_WRITER);
+ *bp_orig = *bp;
+ rw_exit(&parent_db->db_rwlock);
+}
+
+static void
+dbuf_lightweight_physdone(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
+ ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+ /*
+ * The callback will be called io_phys_children times. Retire one
+ * portion of our dirty space each time we are called. Any rounding
+ * error will be cleaned up by dbuf_lightweight_done().
+ */
+ int delta = dr->dr_accounted / zio->io_phys_children;
+ dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
+
+static void
+dbuf_lightweight_done(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+
+ VERIFY0(zio->io_error);
+
+ objset_t *os = dr->dr_dnode->dn_objset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
+ ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
+ } else {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, zio->io_bp, tx);
+ }
+
+ /*
+ * See comment in dbuf_write_done().
+ */
+ if (zio->io_phys_children == 0) {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted, zio->io_txg);
+ } else {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted % zio->io_phys_children, zio->io_txg);
+ }
+
+ abd_free(dr->dt.dll.dr_abd);
+ kmem_free(dr, sizeof (*dr));
+}
+
+noinline static void
+dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dnode_t *dn = dr->dr_dnode;
+ zio_t *pio;
+ if (dn->dn_phys->dn_nlevels == 1) {
+ pio = dn->dn_zio;
+ } else {
+ pio = dr->dr_parent->dr_zio;
+ }
+
+ zbookmark_phys_t zb = {
+ .zb_objset = dmu_objset_id(dn->dn_objset),
+ .zb_object = dn->dn_object,
+ .zb_level = 0,
+ .zb_blkid = dr->dt.dll.dr_blkid,
+ };
+
+ /*
+ * See comment in dbuf_write(). This is so that zio->io_bp_orig
+ * will have the old BP in dbuf_lightweight_done().
+ */
+ dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
+
+ dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
+ dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
+ dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
+ &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
+ dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
+
+ zio_nowait(dr->dr_zio);
+}
+
+/*
+ * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
+ * critical that we not allow the compiler to inline this function into
+ * dbuf_sync_list(), which would drastically bloat the stack usage.
+ */
+noinline static void
+dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ arc_buf_t **datap = &dr->dt.dl.dr_data;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = dr->dr_dnode;
+ objset_t *os;
+ uint64_t txg = tx->tx_txg;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+ /*
+	 * To be synced, the dbuf must be dirty. But it
+	 * might have been freed after it was dirtied.
+ */
+ if (db->db_state == DB_UNCACHED) {
+ /* This buffer has been freed since it was dirtied */
+ ASSERT(db->db.db_data == NULL);
+ } else if (db->db_state == DB_FILL) {
+ /* This buffer was freed and is now being re-filled */
+ ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+ } else {
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
+ }
+ DBUF_VERIFY(db);
+
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+ /*
+ * In the previous transaction group, the bonus buffer
+ * was entirely used to store the attributes for the
+ * dnode which overrode the dn_spill field. However,
+ * when adding more attributes to the file a spill
+ * block was required to hold the extra attributes.
+ *
+ * Make sure to clear the garbage left in the dn_spill
+ * field from the previous attributes in the bonus
+ * buffer. Otherwise, after writing out the spill
+			 * block to the newly allocated dva, it will free
+ * the old block pointed to by the invalid dn_spill.
+ */
+ db->db_blkptr = NULL;
+ }
+ dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ /*
+ * If this is a bonus buffer, simply copy the bonus data into the
+ * dnode. It will be written out when the dnode is synced (and it
+ * will be synced, since it must have been dirty for dbuf_sync to
+ * be called).
+ */
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ ASSERT(dr->dr_dbuf == db);
+ dbuf_sync_bonus(dr, tx);
+ return;
+ }
+
+ os = dn->dn_objset;
+
+ /*
+ * This function may have dropped the db_mtx lock allowing a dmu_sync
+ * operation to sneak in. As a result, we need to ensure that we
+ * don't check the dr_override_state until we have returned from
+ * dbuf_check_blkptr.
+ */
+ dbuf_check_blkptr(dn, db);
+
+ /*
+ * If this buffer is in the middle of an immediate write,
+ * wait for the synchronous IO to complete.
+ */
+ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
+ }
+
+ /*
+ * If this is a dnode block, ensure it is appropriately encrypted
+ * or decrypted, depending on what we are writing to it this txg.
+ */
+ if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
+ dbuf_prepare_encrypted_dnode_leaf(dr);
+
+ if (db->db_state != DB_NOFILL &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
+ zfs_refcount_count(&db->db_holds) > 1 &&
+ dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
+ *datap == db->db_buf) {
+ /*
+ * If this buffer is currently "in use" (i.e., there
+ * are active holds and db_data still references it),
+ * then make a copy before we start the write so that
+ * any modifications from the open txg will not leak
+ * into this write.
+ *
+ * NOTE: this copy does not need to be made for
+ * objects only modified in the syncing context (e.g.
+		 * DMU_OT_DNODE blocks).
+ */
+ *datap = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf);
+ bcopy(db->db.db_data, (*datap)->b_data, arc_buf_size(*datap));
+ }
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+
+ dbuf_write(dr, *datap, tx);
+
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ if (dn->dn_object == DMU_META_DNODE_OBJECT) {
+ list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+ } else {
+ zio_nowait(dr->dr_zio);
+ }
+}
+
+void
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr;
+
+ while ((dr = list_head(list))) {
+ if (dr->dr_zio != NULL) {
+ /*
+ * If we find an already initialized zio then we
+ * are processing the meta-dnode, and we have finished.
+ * The dbufs for all dnodes are put back on the list
+ * during processing, so that we can zio_wait()
+ * these IOs after initiating all child IOs.
+ */
+ ASSERT3U(dr->dr_dbuf->db.db_object, ==,
+ DMU_META_DNODE_OBJECT);
+ break;
+ }
+ list_remove(list, dr);
+ if (dr->dr_dbuf == NULL) {
+ dbuf_sync_lightweight(dr, tx);
+ } else {
+ if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+ VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+ }
+ if (dr->dr_dbuf->db_level > 0)
+ dbuf_sync_indirect(dr, tx);
+ else
+ dbuf_sync_leaf(dr, tx);
+ }
+ }
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ dnode_t *dn;
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ spa_t *spa = zio->io_spa;
+ int64_t delta;
+ uint64_t fill = 0;
+ int i;
+
+ ASSERT3P(db->db_blkptr, !=, NULL);
+ ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
+ dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
+ zio->io_prev_space_delta = delta;
+
+ if (bp->blk_birth != 0) {
+ ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_type) ||
+ (db->db_blkid == DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+ BP_IS_EMBEDDED(bp));
+ ASSERT(BP_GET_LEVEL(bp) == db->db_level);
+ }
+
+ mutex_enter(&db->db_mtx);
+
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(bp)) &&
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
+ }
+#endif
+
+ if (db->db_level == 0) {
+ mutex_enter(&dn->dn_mtx);
+ if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+ db->db_blkid != DMU_SPILL_BLKID) {
+ ASSERT0(db->db_objset->os_raw_receive);
+ dn->dn_phys->dn_maxblkid = db->db_blkid;
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ if (dn->dn_type == DMU_OT_DNODE) {
+ i = 0;
+ while (i < db->db.db_size) {
+ dnode_phys_t *dnp =
+ (void *)(((char *)db->db.db_data) + i);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE) {
+ fill++;
+ i += dnp->dn_extra_slots *
+ DNODE_MIN_SIZE;
+ }
+ }
+ } else {
+ if (BP_IS_HOLE(bp)) {
+ fill = 0;
+ } else {
+ fill = 1;
+ }
+ }
+ } else {
+ blkptr_t *ibp = db->db.db_data;
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
+ if (BP_IS_HOLE(ibp))
+ continue;
+ fill += BP_GET_FILL(ibp);
+ }
+ }
+ DB_DNODE_EXIT(db);
+
+ if (!BP_IS_EMBEDDED(bp))
+ BP_SET_FILL(bp, fill);
+
+ mutex_exit(&db->db_mtx);
+
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
+ *db->db_blkptr = *bp;
+ dmu_buf_unlock_parent(db, dblt, FTAG);
+}
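+
+/*
+ * Example (illustrative sketch, not part of this file): how the fill
+ * count above is derived for a block of dnodes.  Dnodes are walked in
+ * DNODE_MIN_SIZE (512-byte) slots; an allocated dnode counts once and
+ * its dn_extra_slots spill-over slots are skipped.  The toy block
+ * layout and type values below are made up.
+ */
+#include <stdio.h>
+
+struct toy_dnode {
+	int dn_type;		/* 0 plays the role of DMU_OT_NONE */
+	int dn_extra_slots;
+};
+
+int
+main(void)
+{
+	/* a 4K dnode block holds eight 512-byte slots */
+	struct toy_dnode slots[8] = {
+		{ 1, 0 },	/* allocated, one slot */
+		{ 0, 0 },	/* free slot */
+		{ 2, 1 },	/* allocated, spans two slots */
+		{ 0, 0 },	/* covered by the extra slot above */
+		{ 0, 0 },
+		{ 3, 0 },	/* allocated */
+		{ 0, 0 },
+		{ 0, 0 },
+	};
+	int fill = 0;
+
+	for (int i = 0; i < 8; ) {
+		struct toy_dnode *dnp = &slots[i];
+
+		i += 1;				/* one DNODE_MIN_SIZE step */
+		if (dnp->dn_type != 0) {
+			fill++;
+			i += dnp->dn_extra_slots;	/* skip extra slots */
+		}
+	}
+	printf("fill = %d\n", fill);	/* prints fill = 3 */
+	return (0);
+}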
+
+/* ARGSUSED */
+/*
+ * This function gets called just prior to running through the compression
+ * stage of the zio pipeline. If we're an indirect block comprised of only
+ * holes, then we want this indirect to be compressed away to a hole. In
+ * order to do that we must zero out any information about the holes that
+ * this indirect points to before we try to compress it.
+ */
+static void
+dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ dnode_t *dn;
+ blkptr_t *bp;
+ unsigned int epbs, i;
+
+ ASSERT3U(db->db_level, >, 0);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(epbs, <, 31);
+
+ /* Determine if all our children are holes */
+ for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
+ if (!BP_IS_HOLE(bp))
+ break;
+ }
+
+ /*
+ * If all the children are holes, then zero them all out so that
+ * we may get compressed away.
+ */
+ if (i == 1ULL << epbs) {
+ /*
+ * We only found holes. Grab the rwlock to prevent
+ * anybody from reading the blocks we're about to
+ * zero out.
+ */
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ bzero(db->db.db_data, db->db.db_size);
+ rw_exit(&db->db_rwlock);
+ }
+ DB_DNODE_EXIT(db);
+}
+
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times). This
+ * allows the DMU to monitor the progress of each logical i/o. For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block. There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ dmu_buf_impl_t *db = arg;
+ objset_t *os = db->db_objset;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+ dbuf_dirty_record_t *dr;
+ int delta = 0;
+
+ dr = db->db_data_pending;
+ ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+ /*
+ * The callback will be called io_phys_children times. Retire one
+ * portion of our dirty space each time we are called. Any rounding
+ * error will be cleaned up by dbuf_write_done().
+ */
+ delta = dr->dr_accounted / zio->io_phys_children;
+ dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
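+
+/*
+ * Example (illustrative sketch, not part of this file): the accounting
+ * split described above.  Every physical child retires
+ * dr_accounted / io_phys_children bytes, and the integer-division
+ * remainder is retired once by the done callback, so the pieces always
+ * add back up to dr_accounted.  The numbers below are made up.
+ */
+#include <stdio.h>
+#include <stdint.h>
+
+int
+main(void)
+{
+	uint64_t accounted = 131077;	/* deliberately not divisible */
+	int phys_children = 3;
+	uint64_t retired = 0;
+
+	for (int i = 0; i < phys_children; i++)
+		retired += accounted / phys_children;	/* physdone calls */
+	retired += accounted % phys_children;		/* done callback */
+
+	/* prints: retired 131077 of 131077 */
+	printf("retired %llu of %llu\n", (unsigned long long)retired,
+	    (unsigned long long)accounted);
+	return (0);
+}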
+
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ blkptr_t *bp = db->db_blkptr;
+ objset_t *os = db->db_objset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ ASSERT0(zio->io_error);
+ ASSERT(db->db_blkptr == bp);
+
+ /*
+ * For nopwrites and rewrites we ensure that the bp matches our
+ * original and bypass all the accounting.
+ */
+ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ DBUF_VERIFY(db);
+
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ dnode_t *dn = dr->dr_dnode;
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ ASSERT(dr->dr_dbuf == db);
+ ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
+ list_remove(&db->db_dirty_records, dr);
+
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
+ }
+#endif
+
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+ if (db->db_state != DB_NOFILL) {
+ if (dr->dt.dl.dr_data != db->db_buf)
+ arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
+ } else {
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
+ if (!BP_IS_HOLE(db->db_blkptr)) {
+ int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ ASSERT3U(db->db_blkid, <=,
+ dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+ db->db.db_size);
+ }
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+
+ cv_broadcast(&db->db_changed);
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ db->db_data_pending = NULL;
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
+
+ /*
+ * If we didn't do a physical write in this ZIO and we
+ * still ended up here, it means that the space of the
+ * dbuf that we just released (and undirtied) above hasn't
+ * been marked as undirtied in the pool's accounting.
+ *
+ * Thus, we undirty that space in the pool's view of the
+ * world here. For physical writes this type of update
+ * happens in dbuf_write_physdone().
+ *
+ * If we did a physical write, cleanup any rounding errors
+ * that came up due to writing multiple copies of a block
+ * on disk [see dbuf_write_physdone()].
+ */
+ if (zio->io_phys_children == 0) {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted, zio->io_txg);
+ } else {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted % zio->io_phys_children, zio->io_txg);
+ }
+
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+}
+
+static void
+dbuf_write_nofill_ready(zio_t *zio)
+{
+ dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_nofill_done(zio_t *zio)
+{
+ dbuf_write_done(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_override_ready(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ dbuf_write_ready(zio, NULL, db);
+}
+
+static void
+dbuf_write_override_done(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
+
+ mutex_enter(&db->db_mtx);
+ if (!BP_EQUAL(zio->io_bp, obp)) {
+ if (!BP_IS_HOLE(obp))
+ dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
+ arc_release(dr->dt.dl.dr_data, db);
+ }
+ mutex_exit(&db->db_mtx);
+
+ dbuf_write_done(zio, NULL, db);
+
+ if (zio->io_abd != NULL)
+ abd_free(zio->io_abd);
+}
+
+typedef struct dbuf_remap_impl_callback_arg {
+ objset_t *drica_os;
+ uint64_t drica_blk_birth;
+ dmu_tx_t *drica_tx;
+} dbuf_remap_impl_callback_arg_t;
+
+static void
+dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
+ void *arg)
+{
+ dbuf_remap_impl_callback_arg_t *drica = arg;
+ objset_t *os = drica->drica_os;
+ spa_t *spa = dmu_objset_spa(os);
+ dmu_tx_t *tx = drica->drica_tx;
+
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ if (os == spa_meta_objset(spa)) {
+ spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
+ } else {
+ dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
+ size, drica->drica_blk_birth, tx);
+ }
+}
+
+static void
+dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
+{
+ blkptr_t bp_copy = *bp;
+ spa_t *spa = dmu_objset_spa(dn->dn_objset);
+ dbuf_remap_impl_callback_arg_t drica;
+
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ drica.drica_os = dn->dn_objset;
+ drica.drica_blk_birth = bp->blk_birth;
+ drica.drica_tx = tx;
+ if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
+ &drica)) {
+ /*
+ * If the blkptr being remapped is tracked by a livelist,
+ * then we need to make sure the livelist reflects the update.
+ * First, cancel out the old blkptr by appending a 'FREE'
+ * entry. Next, add an 'ALLOC' to track the new version. This
+ * way we avoid trying to free an inaccurate blkptr at delete.
+ * Note that embedded blkptrs are not tracked in livelists.
+ */
+ if (dn->dn_objset != spa_meta_objset(spa)) {
+ dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+ bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(dsl_dir_is_clone(ds->ds_dir));
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LIVELIST));
+ bplist_append(&ds->ds_dir->dd_pending_frees,
+ bp);
+ bplist_append(&ds->ds_dir->dd_pending_allocs,
+ &bp_copy);
+ }
+ }
+
+ /*
+ * The db_rwlock prevents dbuf_read_impl() from
+ * dereferencing the BP while we are changing it. To
+ * avoid lock contention, only grab it when we are actually
+ * changing the BP.
+ */
+ if (rw != NULL)
+ rw_enter(rw, RW_WRITER);
+ *bp = bp_copy;
+ if (rw != NULL)
+ rw_exit(rw);
+ }
+}
+
+/*
+ * Remap any existing BP's to concrete vdevs, if possible.
+ */
+static void
+dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(db->db_objset);
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
+ return;
+
+ if (db->db_level > 0) {
+ blkptr_t *bp = db->db.db_data;
+ for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
+ dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);
+ }
+ } else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dnode_phys_t *dnp = db->db.db_data;
+ ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
+ DMU_OT_DNODE);
+ for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
+ i += dnp[i].dn_extra_slots + 1) {
+ for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
+ krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :
+ &dn->dn_dbuf->db_rwlock);
+ dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,
+ tx);
+ }
+ }
+ }
+}
+
+/* Issue I/O to commit a dirty buffer to disk. */
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = dr->dr_dnode;
+ objset_t *os;
+ dmu_buf_impl_t *parent = db->db_parent;
+ uint64_t txg = tx->tx_txg;
+ zbookmark_phys_t zb;
+ zio_prop_t zp;
+ zio_t *pio; /* parent I/O */
+ int wp_flag = 0;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ os = dn->dn_objset;
+
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ if (BP_IS_HOLE(db->db_blkptr)) {
+ arc_buf_thaw(data);
+ } else {
+ dbuf_release_bp(db);
+ }
+ dbuf_remap(dn, db, tx);
+ }
+ }
+
+ if (parent != dn->dn_dbuf) {
+ /* Our parent is an indirect block. */
+ /* We have a dirty parent that has been scheduled for write. */
+ ASSERT(parent && parent->db_data_pending);
+ /* Our parent's buffer is one level closer to the dnode. */
+ ASSERT(db->db_level == parent->db_level-1);
+ /*
+ * We're about to modify our parent's db_data by modifying
+ * our block pointer, so the parent must be released.
+ */
+ ASSERT(arc_released(parent->db_buf));
+ pio = parent->db_data_pending->dr_zio;
+ } else {
+ /* Our parent is the dnode itself. */
+ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
+ db->db_blkid != DMU_SPILL_BLKID) ||
+ (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ pio = dn->dn_zio;
+ }
+
+ ASSERT(db->db_level == 0 || data == db->db_buf);
+ ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT(pio);
+
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ wp_flag = WP_SPILL;
+ wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+
+ dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+
+ /*
+ * We copy the blkptr now (rather than when we instantiate the dirty
+ * record), because its value can change between open context and
+ * syncing context. We do not need to hold dn_struct_rwlock to read
+ * db_blkptr because we are in syncing context.
+ */
+ dr->dr_bp_copy = *db->db_blkptr;
+
+ if (db->db_level == 0 &&
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * The BP for this block has been provided by open context
+ * (by dmu_sync() or dmu_buf_write_embedded()).
+ */
+ abd_t *contents = (data != NULL) ?
+ abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
+
+ dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
+ contents, db->db.db_size, db->db.db_size, &zp,
+ dbuf_write_override_ready, NULL, NULL,
+ dbuf_write_override_done,
+ dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ mutex_enter(&db->db_mtx);
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
+ dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+ mutex_exit(&db->db_mtx);
+ } else if (db->db_state == DB_NOFILL) {
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+ zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
+ dr->dr_zio = zio_write(pio, os->os_spa, txg,
+ &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
+ dbuf_write_nofill_ready, NULL, NULL,
+ dbuf_write_nofill_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
+ } else {
+ ASSERT(arc_released(data));
+
+ /*
+ * For indirect blocks, we want to setup the children
+ * ready callback so that we can properly handle an indirect
+ * block that only contains holes.
+ */
+ arc_write_done_func_t *children_ready_cb = NULL;
+ if (db->db_level != 0)
+ children_ready_cb = dbuf_write_children_ready;
+
+ dr->dr_zio = arc_write(pio, os->os_spa, txg,
+ &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
+ &zp, dbuf_write_ready,
+ children_ready_cb, dbuf_write_physdone,
+ dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED, &zb);
+ }
+}
+
+EXPORT_SYMBOL(dbuf_find);
+EXPORT_SYMBOL(dbuf_is_metadata);
+EXPORT_SYMBOL(dbuf_destroy);
+EXPORT_SYMBOL(dbuf_loan_arcbuf);
+EXPORT_SYMBOL(dbuf_whichblock);
+EXPORT_SYMBOL(dbuf_read);
+EXPORT_SYMBOL(dbuf_unoverride);
+EXPORT_SYMBOL(dbuf_free_range);
+EXPORT_SYMBOL(dbuf_new_size);
+EXPORT_SYMBOL(dbuf_release_bp);
+EXPORT_SYMBOL(dbuf_dirty);
+EXPORT_SYMBOL(dmu_buf_set_crypt_params);
+EXPORT_SYMBOL(dmu_buf_will_dirty);
+EXPORT_SYMBOL(dmu_buf_is_dirty);
+EXPORT_SYMBOL(dmu_buf_will_not_fill);
+EXPORT_SYMBOL(dmu_buf_will_fill);
+EXPORT_SYMBOL(dmu_buf_fill_done);
+EXPORT_SYMBOL(dmu_buf_rele);
+EXPORT_SYMBOL(dbuf_assign_arcbuf);
+EXPORT_SYMBOL(dbuf_prefetch);
+EXPORT_SYMBOL(dbuf_hold_impl);
+EXPORT_SYMBOL(dbuf_hold);
+EXPORT_SYMBOL(dbuf_hold_level);
+EXPORT_SYMBOL(dbuf_create_bonus);
+EXPORT_SYMBOL(dbuf_spill_set_blksz);
+EXPORT_SYMBOL(dbuf_rm_spill);
+EXPORT_SYMBOL(dbuf_add_ref);
+EXPORT_SYMBOL(dbuf_rele);
+EXPORT_SYMBOL(dbuf_rele_and_unlock);
+EXPORT_SYMBOL(dbuf_refcount);
+EXPORT_SYMBOL(dbuf_sync_list);
+EXPORT_SYMBOL(dmu_buf_set_user);
+EXPORT_SYMBOL(dmu_buf_set_user_ie);
+EXPORT_SYMBOL(dmu_buf_get_user);
+EXPORT_SYMBOL(dmu_buf_get_blkptr);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW,
+ "Maximum size in bytes of the dbuf cache.");
+
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
+ "Percentage over dbuf_cache_max_bytes when dbufs must be evicted "
+ "directly.");
+
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
+ "Percentage below dbuf_cache_max_bytes when the evict thread stops "
+ "evicting dbufs.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW,
+ "Maximum size in bytes of the dbuf metadata cache.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
+ "Set the size of the dbuf cache to a log2 fraction of arc size.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
+ "Set the size of the dbuf metadata cache to a log2 fraction of arc "
+ "size.");
+/* END CSTYLED */
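+
+/*
+ * Example (illustrative sketch, not part of this file): how the hiwater
+ * and lowater percentages above translate into byte thresholds around
+ * dbuf_cache_max_bytes.  The 10% values and the 64M cache size are
+ * assumptions used only for the worked numbers.
+ */
+#include <stdio.h>
+#include <stdint.h>
+
+int
+main(void)
+{
+	uint64_t max_bytes = 64ULL << 20;	/* pretend 64M dbuf cache */
+	unsigned int hiwater_pct = 10;		/* assumed setting */
+	unsigned int lowater_pct = 10;		/* assumed setting */
+
+	uint64_t hiwater = max_bytes + (max_bytes * hiwater_pct) / 100;
+	uint64_t lowater = max_bytes - (max_bytes * lowater_pct) / 100;
+
+	/* above ~70.4M callers evict directly; eviction stops below ~57.6M */
+	printf("hiwater %llu, lowater %llu\n", (unsigned long long)hiwater,
+	    (unsigned long long)lowater);
+	return (0);
+}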
diff --git a/sys/contrib/openzfs/module/zfs/dbuf_stats.c b/sys/contrib/openzfs/module/zfs/dbuf_stats.c
new file mode 100644
index 000000000000..12bb568a08cc
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dbuf_stats.c
@@ -0,0 +1,232 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+
+/*
+ * Calculate the index of the arc header for the state, disabled by default.
+ */
+int zfs_dbuf_state_index = 0;
+
+/*
+ * ==========================================================================
+ * Dbuf Hash Read Routines
+ * ==========================================================================
+ */
+typedef struct dbuf_stats_t {
+ kmutex_t lock;
+ kstat_t *kstat;
+ dbuf_hash_table_t *hash;
+ int idx;
+} dbuf_stats_t;
+
+static dbuf_stats_t dbuf_stats_hash_table;
+
+static int
+dbuf_stats_hash_table_headers(char *buf, size_t size)
+{
+ (void) snprintf(buf, size,
+ "%-96s | %-119s | %s\n"
+ "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | "
+ "%-5s %-5s %-9s %-6s %-8s %-12s "
+ "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | "
+ "%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n",
+ "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
+ "blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc",
+ "list", "atype", "flags", "count", "asize", "access",
+ "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
+ "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
+ "bsize", "lvls", "dholds", "blocks", "dsize");
+
+ return (0);
+}
+
+static int
+__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
+{
+ arc_buf_info_t abi = { 0 };
+ dmu_object_info_t doi = { 0 };
+ dnode_t *dn = DB_DNODE(db);
+ size_t nwritten;
+
+ if (db->db_buf)
+ arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);
+
+ __dmu_object_info_from_dnode(dn, &doi);
+
+ nwritten = snprintf(buf, size,
+ "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d "
+ "%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
+ "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | "
+ "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n",
+ /* dmu_buf_impl_t */
+ spa_name(dn->dn_objset->os_spa),
+ (u_longlong_t)dmu_objset_id(db->db_objset),
+ (longlong_t)db->db.db_object,
+ (longlong_t)db->db_level,
+ (longlong_t)db->db_blkid,
+ (u_longlong_t)db->db.db_offset,
+ (u_longlong_t)db->db.db_size,
+ !!dbuf_is_metadata(db),
+ db->db_state,
+ (ulong_t)zfs_refcount_count(&db->db_holds),
+ multilist_link_active(&db->db_cache_link),
+ /* arc_buf_info_t */
+ abi.abi_state_type,
+ abi.abi_state_contents,
+ abi.abi_flags,
+ (ulong_t)abi.abi_bufcnt,
+ (u_longlong_t)abi.abi_size,
+ (u_longlong_t)abi.abi_access,
+ (ulong_t)abi.abi_mru_hits,
+ (ulong_t)abi.abi_mru_ghost_hits,
+ (ulong_t)abi.abi_mfu_hits,
+ (ulong_t)abi.abi_mfu_ghost_hits,
+ (ulong_t)abi.abi_l2arc_hits,
+ (u_longlong_t)abi.abi_l2arc_dattr,
+ (u_longlong_t)abi.abi_l2arc_asize,
+ abi.abi_l2arc_compress,
+ (ulong_t)abi.abi_holds,
+ /* dmu_object_info_t */
+ doi.doi_type,
+ doi.doi_bonus_type,
+ (ulong_t)doi.doi_data_block_size,
+ (ulong_t)doi.doi_metadata_block_size,
+ (u_longlong_t)doi.doi_bonus_size,
+ (ulong_t)doi.doi_indirection,
+ (ulong_t)zfs_refcount_count(&dn->dn_holds),
+ (u_longlong_t)doi.doi_fill_count,
+ (u_longlong_t)doi.doi_max_offset);
+
+ if (nwritten >= size)
+ return (size);
+
+ return (nwritten + 1);
+}
+
+static int
+dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
+{
+ dbuf_stats_t *dsh = (dbuf_stats_t *)data;
+ dbuf_hash_table_t *h = dsh->hash;
+ dmu_buf_impl_t *db;
+ int length, error = 0;
+
+ ASSERT3S(dsh->idx, >=, 0);
+ ASSERT3S(dsh->idx, <=, h->hash_table_mask);
+ if (size)
+ buf[0] = 0;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
+ for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
+ /*
+ * Returning ENOMEM will cause the data and header functions
+		 * to be called again with a larger scratch buffer.
+ */
+ if (size < 512) {
+ error = SET_ERROR(ENOMEM);
+ break;
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ if (db->db_state != DB_EVICTING) {
+ length = __dbuf_stats_hash_table_data(buf, size, db);
+ buf += length;
+ size -= length;
+ }
+
+ mutex_exit(&db->db_mtx);
+ }
+ mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
+
+ return (error);
+}
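+
+/*
+ * Example (illustrative sketch, not part of this file): a user-space
+ * model of the contract noted above -- when the formatter runs out of
+ * room it returns ENOMEM and the caller retries with a larger scratch
+ * buffer.  The 512-byte floor mirrors the check in the loop above; the
+ * helper and its output are made up.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static int
+format_bucket(char *buf, size_t size)
+{
+	if (size < 512)
+		return (ENOMEM);	/* too small for even one record */
+	snprintf(buf, size, "pretend dbuf record\n");
+	return (0);
+}
+
+int
+main(void)
+{
+	size_t size = 128;
+	char *buf = malloc(size);
+
+	while (format_bucket(buf, size) == ENOMEM) {
+		size *= 2;		/* retry with a larger buffer */
+		buf = realloc(buf, size);
+	}
+	printf("formatted with a %zu-byte buffer:\n%s", size, buf);
+	free(buf);
+	return (0);
+}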
+
+static void *
+dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n)
+{
+ dbuf_stats_t *dsh = ksp->ks_private;
+
+ ASSERT(MUTEX_HELD(&dsh->lock));
+
+ if (n <= dsh->hash->hash_table_mask) {
+ dsh->idx = n;
+ return (dsh);
+ }
+
+ return (NULL);
+}
+
+static void
+dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
+{
+ dbuf_stats_t *dsh = &dbuf_stats_hash_table;
+ kstat_t *ksp;
+
+ mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL);
+ dsh->hash = hash;
+
+ ksp = kstat_create("zfs", 0, "dbufs", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ dsh->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &dsh->lock;
+ ksp->ks_ndata = UINT32_MAX;
+ ksp->ks_private = dsh;
+ kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers,
+ dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr);
+ kstat_install(ksp);
+ }
+}
+
+static void
+dbuf_stats_hash_table_destroy(void)
+{
+ dbuf_stats_t *dsh = &dbuf_stats_hash_table;
+ kstat_t *ksp;
+
+ ksp = dsh->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_destroy(&dsh->lock);
+}
+
+void
+dbuf_stats_init(dbuf_hash_table_t *hash)
+{
+ dbuf_stats_hash_table_init(hash);
+}
+
+void
+dbuf_stats_destroy(void)
+{
+ dbuf_stats_hash_table_destroy();
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, dbuf_state_index, INT, ZMOD_RW,
+ "Calculate arc header index");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c
new file mode 100644
index 000000000000..b94a9f54ece3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/ddt.c
@@ -0,0 +1,1187 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+
+static kmem_cache_t *ddt_cache;
+static kmem_cache_t *ddt_entry_cache;
+
+/*
+ * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
+ */
+int zfs_dedup_prefetch = 0;
+
+static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+ &ddt_zap_ops,
+};
+
+static const char *ddt_class_name[DDT_CLASSES] = {
+ "ditto",
+ "duplicate",
+ "unique",
+};
+
+static void
+ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp == 0);
+ VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+ ASSERT(*objectp != 0);
+
+ VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, objectp, tx) == 0);
+
+ VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+static void
+ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ uint64_t count;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp != 0);
+ ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+ VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0);
+ VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+ VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+ VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+ bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+
+ *objectp = 0;
+}
+
+static int
+ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ uint64_t count;
+ char name[DDT_NAMELEN];
+ int error;
+
+ ddt_object_name(ddt, type, class, name);
+
+ error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+ if (error != 0)
+ return (error);
+
+ error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class]);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Seed the cached statistics.
+ */
+ error = ddt_object_info(ddt, type, class, &doi);
+ if (error)
+ return (error);
+
+ error = ddt_object_count(ddt, type, class, &count);
+ if (error)
+ return (error);
+
+ ddo->ddo_count = count;
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+ return (0);
+}
+
+static void
+ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ uint64_t count;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+
+ /*
+ * Cache DDT statistics; this is the only time they'll change.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+ VERIFY(ddt_object_count(ddt, type, class, &count) == 0);
+
+ ddo->ddo_count = count;
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+}
+
+static int
+ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (SET_ERROR(ENOENT));
+
+ return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde));
+}
+
+static void
+ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return;
+
+ ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde);
+}
+
+int
+ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+static int
+ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+int
+ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ uint64_t *walk, ddt_entry_t *dde)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, walk));
+}
+
+int
+ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ uint64_t *count)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+ ddt->ddt_object[type][class], count));
+}
+
+int
+ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_object_info_t *doi)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (SET_ERROR(ENOENT));
+
+ return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+ doi));
+}
+
+boolean_t
+ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ return (!!ddt->ddt_object[type][class]);
+}
+
+void
+ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ char *name)
+{
+ (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
+ zio_checksum_table[ddt->ddt_checksum].ci_name,
+ ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+}
+
+void
+ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+{
+ ASSERT(txg != 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ bp->blk_dva[d] = ddp->ddp_dva[d];
+ BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+}
+
+/*
+ * The bp created via this function may be used for repairs and scrub, but it
+ * will be missing the salt / IV required to do a full decrypting read.
+ */
+void
+ddt_bp_create(enum zio_checksum checksum,
+ const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+{
+ BP_ZERO(bp);
+
+ if (ddp != NULL)
+ ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+
+ bp->blk_cksum = ddk->ddk_cksum;
+
+ BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+ BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+ BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+ BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
+ BP_SET_FILL(bp, 1);
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_TYPE(bp, DMU_OT_DEDUP);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 1);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+}
+
+void
+ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+{
+ ddk->ddk_cksum = bp->blk_cksum;
+ ddk->ddk_prop = 0;
+
+ ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));
+
+ DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+ DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
+}
+
+void
+ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+{
+ ASSERT(ddp->ddp_phys_birth == 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ ddp->ddp_dva[d] = bp->blk_dva[d];
+ ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+}
+
+void
+ddt_phys_clear(ddt_phys_t *ddp)
+{
+ bzero(ddp, sizeof (*ddp));
+}
+
+void
+ddt_phys_addref(ddt_phys_t *ddp)
+{
+ ddp->ddp_refcnt++;
+}
+
+void
+ddt_phys_decref(ddt_phys_t *ddp)
+{
+ if (ddp) {
+ ASSERT(ddp->ddp_refcnt > 0);
+ ddp->ddp_refcnt--;
+ }
+}
+
+void
+ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+{
+ blkptr_t blk;
+
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+
+ /*
+ * We clear the dedup bit so that zio_free() will actually free the
+ * space, rather than just decrementing the refcount in the DDT.
+ */
+ BP_SET_DEDUP(&blk, 0);
+
+ ddt_phys_clear(ddp);
+ zio_free(ddt->ddt_spa, txg, &blk);
+}
+
+ddt_phys_t *
+ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+{
+ ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+ BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+ return (ddp);
+ }
+ return (NULL);
+}
+
+uint64_t
+ddt_phys_total_refcnt(const ddt_entry_t *dde)
+{
+ uint64_t refcnt = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+ refcnt += dde->dde_phys[p].ddp_refcnt;
+
+ return (refcnt);
+}
+
+static void
+ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ uint64_t lsize = DDK_GET_LSIZE(ddk);
+ uint64_t psize = DDK_GET_PSIZE(ddk);
+
+ bzero(dds, sizeof (*dds));
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ uint64_t dsize = 0;
+ uint64_t refcnt = ddp->ddp_refcnt;
+
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+
+ for (int d = 0; d < DDE_GET_NDVAS(dde); d++)
+ dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+
+ dds->dds_blocks += 1;
+ dds->dds_lsize += lsize;
+ dds->dds_psize += psize;
+ dds->dds_dsize += dsize;
+
+ dds->dds_ref_blocks += refcnt;
+ dds->dds_ref_lsize += lsize * refcnt;
+ dds->dds_ref_psize += psize * refcnt;
+ dds->dds_ref_dsize += dsize * refcnt;
+ }
+}
+
+void
+ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+{
+ const uint64_t *s = (const uint64_t *)src;
+ uint64_t *d = (uint64_t *)dst;
+ uint64_t *d_end = (uint64_t *)(dst + 1);
+
+ ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
+
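+	/*
+	 * When neg == -1ULL, (x ^ neg) - neg == ~x + 1 == -x in two's
+	 * complement, so the loop below subtracts; with neg == 0 it adds.
+	 */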
+ while (d < d_end)
+ *d++ += (*s++ ^ neg) - neg;
+}
+
+static void
+ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+{
+ ddt_stat_t dds;
+ ddt_histogram_t *ddh;
+ int bucket;
+
+ ddt_stat_generate(ddt, dde, &dds);
+
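+	/* Histogram buckets are indexed by floor(log2(reference count)). */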
+ bucket = highbit64(dds.dds_ref_blocks) - 1;
+ ASSERT(bucket >= 0);
+
+ ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+ ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+}
+
+void
+ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+{
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+}
+
+void
+ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+{
+ bzero(dds, sizeof (*dds));
+
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+}
+
+boolean_t
+ddt_histogram_empty(const ddt_histogram_t *ddh)
+{
+ const uint64_t *s = (const uint64_t *)ddh;
+ const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+
+ while (s < s_end)
+ if (*s++ != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+void
+ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
+{
+ /* Sum the statistics we cached in ddt_object_sync(). */
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_object_t *ddo =
+ &ddt->ddt_object_stats[type][class];
+ ddo_total->ddo_count += ddo->ddo_count;
+ ddo_total->ddo_dspace += ddo->ddo_dspace;
+ ddo_total->ddo_mspace += ddo->ddo_mspace;
+ }
+ }
+ }
+
+ /* ... and compute the averages. */
+ if (ddo_total->ddo_count != 0) {
+ ddo_total->ddo_dspace /= ddo_total->ddo_count;
+ ddo_total->ddo_mspace /= ddo_total->ddo_count;
+ }
+}
+
+void
+ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_histogram_add(ddh,
+ &ddt->ddt_histogram_cache[type][class]);
+ }
+ }
+ }
+}
+
+void
+ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
+{
+ ddt_histogram_t *ddh_total;
+
+ ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh_total);
+ ddt_histogram_stat(dds_total, ddh_total);
+ kmem_free(ddh_total, sizeof (ddt_histogram_t));
+}
+
+uint64_t
+ddt_get_dedup_dspace(spa_t *spa)
+{
+ ddt_stat_t dds_total;
+
+ if (spa->spa_dedup_dspace != ~0ULL)
+ return (spa->spa_dedup_dspace);
+
+ bzero(&dds_total, sizeof (ddt_stat_t));
+
+ /* Calculate and cache the stats */
+ ddt_get_dedup_stats(spa, &dds_total);
+ spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize;
+ return (spa->spa_dedup_dspace);
+}
+
+uint64_t
+ddt_get_pool_dedup_ratio(spa_t *spa)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ if (dds_total.dds_dsize == 0)
+ return (100);
+
+ return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+}
+
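+/*
+ * The image written below is a single header byte (the compression function,
+ * with DDT_COMPRESS_BYTEORDER_MASK recording the writer's byte order)
+ * followed by the possibly-compressed dde_phys payload; ddt_decompress()
+ * reverses this.
+ */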
+size_t
+ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+{
+ uchar_t *version = dst++;
+ int cpfunc = ZIO_COMPRESS_ZLE;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ size_t c_len;
+
+ ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
+
+ c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+
+ if (c_len == s_len) {
+ cpfunc = ZIO_COMPRESS_OFF;
+ bcopy(src, dst, s_len);
+ }
+
+ *version = cpfunc;
+ /* CONSTCOND */
+ if (ZFS_HOST_BYTEORDER)
+ *version |= DDT_COMPRESS_BYTEORDER_MASK;
+
+ return (c_len + 1);
+}
+
+void
+ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+{
+ uchar_t version = *src++;
+ int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+ if (ci->ci_decompress != NULL)
+ (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+ else
+ bcopy(src, dst, d_len);
+
+ if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
+ (ZFS_HOST_BYTEORDER != 0))
+ byteswap_uint64_array(dst, d_len);
+}
+
+ddt_t *
+ddt_select(spa_t *spa, const blkptr_t *bp)
+{
+ return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+}
+
+void
+ddt_enter(ddt_t *ddt)
+{
+ mutex_enter(&ddt->ddt_lock);
+}
+
+void
+ddt_exit(ddt_t *ddt)
+{
+ mutex_exit(&ddt->ddt_lock);
+}
+
+void
+ddt_init(void)
+{
+ ddt_cache = kmem_cache_create("ddt_cache",
+ sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ ddt_entry_cache = kmem_cache_create("ddt_entry_cache",
+ sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+ddt_fini(void)
+{
+ kmem_cache_destroy(ddt_entry_cache);
+ kmem_cache_destroy(ddt_cache);
+}
+
+static ddt_entry_t *
+ddt_alloc(const ddt_key_t *ddk)
+{
+ ddt_entry_t *dde;
+
+ dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
+ bzero(dde, sizeof (ddt_entry_t));
+ cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+
+ dde->dde_key = *ddk;
+
+ return (dde);
+}
+
+static void
+ddt_free(ddt_entry_t *dde)
+{
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++)
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+
+ if (dde->dde_repair_abd != NULL)
+ abd_free(dde->dde_repair_abd);
+
+ cv_destroy(&dde->dde_cv);
+ kmem_cache_free(ddt_entry_cache, dde);
+}
+
+void
+ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+{
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(dde);
+}
+
+ddt_entry_t *
+ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+{
+ ddt_entry_t *dde, dde_search;
+ enum ddt_type type;
+ enum ddt_class class;
+ avl_index_t where;
+ int error;
+
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ ddt_key_fill(&dde_search.dde_key, bp);
+
+ dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+ if (dde == NULL) {
+ if (!add)
+ return (NULL);
+ dde = ddt_alloc(&dde_search.dde_key);
+ avl_insert(&ddt->ddt_tree, dde, where);
+ }
+
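+	/*
+	 * Only one thread loads a given entry from the on-disk tables;
+	 * anyone else who finds it with dde_loading set waits on dde_cv.
+	 */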
+ while (dde->dde_loading)
+ cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+
+ if (dde->dde_loaded)
+ return (dde);
+
+ dde->dde_loading = B_TRUE;
+
+ ddt_exit(ddt);
+
+ error = ENOENT;
+
+ for (type = 0; type < DDT_TYPES; type++) {
+ for (class = 0; class < DDT_CLASSES; class++) {
+ error = ddt_object_lookup(ddt, type, class, dde);
+ if (error != ENOENT) {
+ ASSERT0(error);
+ break;
+ }
+ }
+ if (error != ENOENT)
+ break;
+ }
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_loaded == B_FALSE);
+ ASSERT(dde->dde_loading == B_TRUE);
+
+ dde->dde_type = type; /* will be DDT_TYPES if no entry found */
+ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
+ dde->dde_loaded = B_TRUE;
+ dde->dde_loading = B_FALSE;
+
+ if (error == 0)
+ ddt_stat_update(ddt, dde, -1ULL);
+
+ cv_broadcast(&dde->dde_cv);
+
+ return (dde);
+}
+
+void
+ddt_prefetch(spa_t *spa, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
+ return;
+
+ /*
+ * We only remove the DDT once all tables are empty and only
+ * prefetch dedup blocks when there are entries in the DDT.
+ * Thus no locking is required as the DDT can't disappear on us.
+ */
+ ddt = ddt_select(spa, bp);
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ ddt_object_prefetch(ddt, type, class, &dde);
+ }
+ }
+}
+
+/*
+ * Opaque struct used for ddt_key comparison
+ */
+#define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t))
+
+typedef struct ddt_key_cmp {
+ uint16_t u16[DDT_KEY_CMP_LEN];
+} ddt_key_cmp_t;
+
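+/*
+ * Comparing the key 16 bits at a time keeps each int32_t difference exact
+ * (no overflow); any consistent total order is sufficient for the AVL trees.
+ */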
+int
+ddt_entry_compare(const void *x1, const void *x2)
+{
+ const ddt_entry_t *dde1 = x1;
+ const ddt_entry_t *dde2 = x2;
+ const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key;
+ const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key;
+ int32_t cmp = 0;
+
+ for (int i = 0; i < DDT_KEY_CMP_LEN; i++) {
+ cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i];
+ if (likely(cmp))
+ break;
+ }
+
+ return (TREE_ISIGN(cmp));
+}
+
+static ddt_t *
+ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+{
+ ddt_t *ddt;
+
+ ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
+ bzero(ddt, sizeof (ddt_t));
+
+ mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&ddt->ddt_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ ddt->ddt_checksum = c;
+ ddt->ddt_spa = spa;
+ ddt->ddt_os = spa->spa_meta_objset;
+
+ return (ddt);
+}
+
+static void
+ddt_table_free(ddt_t *ddt)
+{
+ ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+ ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+ avl_destroy(&ddt->ddt_tree);
+ avl_destroy(&ddt->ddt_repair_tree);
+ mutex_destroy(&ddt->ddt_lock);
+ kmem_cache_free(ddt_cache, ddt);
+}
+
+void
+ddt_create(spa_t *spa)
+{
+ spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+ spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+}
+
+int
+ddt_load(spa_t *spa)
+{
+ int error;
+
+ ddt_create(spa);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object);
+
+ if (error)
+ return (error == ENOENT ? 0 : error);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ error = ddt_object_load(ddt, type, class);
+ if (error != 0 && error != ENOENT)
+ return (error);
+ }
+ }
+
+ /*
+ * Seed the cached histograms.
+ */
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+ spa->spa_dedup_dspace = ~0ULL;
+ }
+
+ return (0);
+}
+
+void
+ddt_unload(spa_t *spa)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ if (spa->spa_ddt[c]) {
+ ddt_table_free(spa->spa_ddt[c]);
+ spa->spa_ddt[c] = NULL;
+ }
+ }
+}
+
+boolean_t
+ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t *dde;
+
+ if (!BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ if (max_class == DDT_CLASS_UNIQUE)
+ return (B_TRUE);
+
+ ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+ dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
+
+ ddt_key_fill(&(dde->dde_key), bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class <= max_class; class++) {
+ if (ddt_object_lookup(ddt, type, class, dde) == 0) {
+ kmem_cache_free(ddt_entry_cache, dde);
+ return (B_TRUE);
+ }
+ }
+ }
+
+ kmem_cache_free(ddt_entry_cache, dde);
+ return (B_FALSE);
+}
+
+ddt_entry_t *
+ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+{
+ ddt_key_t ddk;
+ ddt_entry_t *dde;
+
+ ddt_key_fill(&ddk, bp);
+
+ dde = ddt_alloc(&ddk);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ /*
+ * We can only do repair if there are multiple copies
+ * of the block. For anything in the UNIQUE class,
+ * there's definitely only one copy, so don't even try.
+ */
+ if (class != DDT_CLASS_UNIQUE &&
+ ddt_object_lookup(ddt, type, class, dde) == 0)
+ return (dde);
+ }
+ }
+
+ bzero(dde->dde_phys, sizeof (dde->dde_phys));
+
+ return (dde);
+}
+
+void
+ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+{
+ avl_index_t where;
+
+ ddt_enter(ddt);
+
+ if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
+ avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+ avl_insert(&ddt->ddt_repair_tree, dde, where);
+ else
+ ddt_free(dde);
+
+ ddt_exit(ddt);
+}
+
+static void
+ddt_repair_entry_done(zio_t *zio)
+{
+ ddt_entry_t *rdde = zio->io_private;
+
+ ddt_free(rdde);
+}
+
+static void
+ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+{
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *rddp = rdde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ ddt_key_t *rddk = &rdde->dde_key;
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio = zio_null(rio, rio->io_spa, NULL,
+ ddt_repair_entry_done, rdde, rio->io_flags);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+ bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+ rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+ }
+
+ zio_nowait(zio);
+}
+
+static void
+ddt_repair_table(ddt_t *ddt, zio_t *rio)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde, *rdde_next, *rdde;
+ avl_tree_t *t = &ddt->ddt_repair_tree;
+ blkptr_t blk;
+
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ ddt_enter(ddt);
+ for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+ rdde_next = AVL_NEXT(t, rdde);
+ avl_remove(&ddt->ddt_repair_tree, rdde);
+ ddt_exit(ddt);
+ ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+ dde = ddt_repair_start(ddt, &blk);
+ ddt_repair_entry(ddt, dde, rdde, rio);
+ ddt_repair_done(ddt, dde);
+ ddt_enter(ddt);
+ }
+ ddt_exit(ddt);
+}
+
+static void
+ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+{
+ dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ enum ddt_type otype = dde->dde_type;
+ enum ddt_type ntype = DDT_TYPE_CURRENT;
+ enum ddt_class oclass = dde->dde_class;
+ enum ddt_class nclass;
+ uint64_t total_refcnt = 0;
+
+ ASSERT(dde->dde_loaded);
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+ if (ddp->ddp_phys_birth == 0) {
+ ASSERT(ddp->ddp_refcnt == 0);
+ continue;
+ }
+ if (p == DDT_PHYS_DITTO) {
+ /*
+ * Note, we no longer create DDT-DITTO blocks, but we
+ * don't want to leak any written by older software.
+ */
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ continue;
+ }
+ if (ddp->ddp_refcnt == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ total_refcnt += ddp->ddp_refcnt;
+ }
+
+ /* We do not create new DDT-DITTO blocks. */
+ ASSERT0(dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth);
+ if (total_refcnt > 1)
+ nclass = DDT_CLASS_DUPLICATE;
+ else
+ nclass = DDT_CLASS_UNIQUE;
+
+ if (otype != DDT_TYPES &&
+ (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+ VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+ ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+ }
+
+ if (total_refcnt != 0) {
+ dde->dde_type = ntype;
+ dde->dde_class = nclass;
+ ddt_stat_update(ddt, dde, 0);
+ if (!ddt_object_exists(ddt, ntype, nclass))
+ ddt_object_create(ddt, ntype, nclass, tx);
+ VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+
+ /*
+ * If the class changes, the order that we scan this bp
+ * changes. If it decreases, we could miss it, so
+ * scan it right now. (This covers both class changing
+ * while we are doing ddt_walk(), and when we are
+ * traversing.)
+ */
+ if (nclass < oclass) {
+ dsl_scan_ddt_entry(dp->dp_scan,
+ ddt->ddt_checksum, dde, tx);
+ }
+ }
+}
+
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde;
+ void *cookie = NULL;
+
+ if (avl_numnodes(&ddt->ddt_tree) == 0)
+ return;
+
+ ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+
+ if (spa->spa_ddt_stat_object == 0) {
+ spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
+ DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, tx);
+ }
+
+ while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+ ddt_sync_entry(ddt, dde, tx, txg);
+ ddt_free(dde);
+ }
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ uint64_t add, count = 0;
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (ddt_object_exists(ddt, type, class)) {
+ ddt_object_sync(ddt, type, class, tx);
+ VERIFY(ddt_object_count(ddt, type, class,
+ &add) == 0);
+ count += add;
+ }
+ }
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (count == 0 && ddt_object_exists(ddt, type, class))
+ ddt_object_destroy(ddt, type, class, tx);
+ }
+ }
+
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+ spa->spa_dedup_dspace = ~0ULL;
+}
+
+void
+ddt_sync(spa_t *spa, uint64_t txg)
+{
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ dmu_tx_t *tx;
+ zio_t *rio;
+
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ rio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
+
+ /*
+ * This function may cause an immediate scan of ddt blocks (see
+ * the comment above dsl_scan_ddt() for details). We set the
+ * scan's root zio here so that we can wait for any scan IOs in
+ * addition to the regular ddt IOs.
+ */
+ ASSERT3P(scn->scn_zio_root, ==, NULL);
+ scn->scn_zio_root = rio;
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (ddt == NULL)
+ continue;
+ ddt_sync_table(ddt, tx, txg);
+ ddt_repair_table(ddt, rio);
+ }
+
+ (void) zio_wait(rio);
+ scn->scn_zio_root = NULL;
+
+ dmu_tx_commit(tx);
+}
+
+int
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+{
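+	/*
+	 * Resume from the bookmark: the innermost loop advances the checksum,
+	 * then the type, then the class, so every DDT object is visited once.
+	 */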
+ do {
+ do {
+ do {
+ ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+ int error = ENOENT;
+ if (ddt_object_exists(ddt, ddb->ddb_type,
+ ddb->ddb_class)) {
+ error = ddt_object_walk(ddt,
+ ddb->ddb_type, ddb->ddb_class,
+ &ddb->ddb_cursor, dde);
+ }
+ dde->dde_type = ddb->ddb_type;
+ dde->dde_class = ddb->ddb_class;
+ if (error == 0)
+ return (0);
+ if (error != ENOENT)
+ return (error);
+ ddb->ddb_cursor = 0;
+ } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ddb->ddb_checksum = 0;
+ } while (++ddb->ddb_type < DDT_TYPES);
+ ddb->ddb_type = 0;
+ } while (++ddb->ddb_class < DDT_CLASSES);
+
+ return (SET_ERROR(ENOENT));
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
+ "Enable prefetching dedup-ed blks");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/ddt_zap.c b/sys/contrib/openzfs/module/zfs/ddt_zap.c
new file mode 100644
index 000000000000..c5c9eda0b2d0
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/ddt_zap.c
@@ -0,0 +1,168 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+int ddt_zap_leaf_blockshift = 12;
+int ddt_zap_indirect_blockshift = 12;
+
+static int
+ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
+{
+ zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
+
+ if (prehash)
+ flags |= ZAP_FLAG_PRE_HASHED_KEY;
+
+ *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
+ ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+ DMU_OT_NONE, 0, tx);
+
+ return (*objectp == 0 ? SET_ERROR(ENOTSUP) : 0);
+}
+
+static int
+ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ return (zap_destroy(os, object, tx));
+}
+
+static int
+ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ uchar_t *cbuf;
+ uint64_t one, csize;
+ int error;
+
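+	/* One extra byte for the header prepended by ddt_compress(). */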
+ cbuf = kmem_alloc(sizeof (dde->dde_phys) + 1, KM_SLEEP);
+
+ error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, &one, &csize);
+ if (error)
+ goto out;
+
+ ASSERT(one == 1);
+ ASSERT(csize <= (sizeof (dde->dde_phys) + 1));
+
+ error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ if (error)
+ goto out;
+
+ ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
+out:
+ kmem_free(cbuf, sizeof (dde->dde_phys) + 1);
+
+ return (error);
+}
+
+static void
+ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS);
+}
+
+static int
+ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize;
+
+ csize = ddt_compress(dde->dde_phys, cbuf,
+ sizeof (dde->dde_phys), sizeof (cbuf));
+
+ return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf, tx));
+}
+
+static int
+ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, tx));
+}
+
+static int
+ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ if (*walk == 0) {
+ /*
+ * We don't want to prefetch the entire ZAP object, because
+ * it can be enormous. Also the primary use of DDT iteration
+ * is for scrubbing, in which case we will be issuing many
+ * scrub I/Os for each ZAP block that we read in, so
+ * reading the ZAP is unlikely to be the bottleneck.
+ */
+ zap_cursor_init_noprefetch(&zc, os, object);
+ } else {
+ zap_cursor_init_serialized(&zc, os, object, *walk);
+ }
+ if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize = za.za_num_integers;
+ ASSERT(za.za_integer_length == 1);
+ error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ ASSERT(error == 0);
+ if (error == 0) {
+ ddt_decompress(cbuf, dde->dde_phys, csize,
+ sizeof (dde->dde_phys));
+ dde->dde_key = *(ddt_key_t *)za.za_name;
+ }
+ zap_cursor_advance(&zc);
+ *walk = zap_cursor_serialize(&zc);
+ }
+ zap_cursor_fini(&zc);
+ return (error);
+}
+
+static int
+ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count)
+{
+ return (zap_count(os, object, count));
+}
+
+const ddt_ops_t ddt_zap_ops = {
+ "zap",
+ ddt_zap_create,
+ ddt_zap_destroy,
+ ddt_zap_lookup,
+ ddt_zap_prefetch,
+ ddt_zap_update,
+ ddt_zap_remove,
+ ddt_zap_walk,
+ ddt_zap_count,
+};
diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c
new file mode 100644
index 000000000000..ed345f0b6ec3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu.c
@@ -0,0 +1,2333 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/sa.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/trace_zfs.h>
+#include <sys/zfs_rlock.h>
+#ifdef _KERNEL
+#include <sys/vmsystm.h>
+#include <sys/zfs_znode.h>
+#endif
+
+/*
+ * Enable/disable nopwrite feature.
+ */
+int zfs_nopwrite_enabled = 1;
+
+/*
+ * Tunable to control percentage of dirtied L1 blocks from frees allowed into
+ * one TXG. After this threshold is crossed, additional dirty blocks from frees
+ * will wait until the next TXG.
+ * A value of zero will disable this throttle.
+ */
+unsigned long zfs_per_txg_dirty_frees_percent = 5;
+
+/*
+ * Enable/disable forcing txg sync when dirty in dmu_offset_next.
+ */
+int zfs_dmu_offset_next_sync = 0;
+
+/*
+ * Limit the amount we can prefetch with one call to this amount. This
+ * helps to limit the amount of memory that can be used by prefetching.
+ * Larger objects should be prefetched a bit at a time.
+ */
+int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
+ {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" },
+ {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" },
+ {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" },
+ {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" },
+ {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" },
+ {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"},
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" },
+ {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" },
+ {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" },
+ {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" },
+ {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" },
+ {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" },
+ {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" },
+ {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" },
+ {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" },
+ {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" },
+ {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" },
+ {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"},
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"},
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"},
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" },
+ {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" },
+ {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" },
+ {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" }
+};
+
+const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
+ { byteswap_uint8_array, "uint8" },
+ { byteswap_uint16_array, "uint16" },
+ { byteswap_uint32_array, "uint32" },
+ { byteswap_uint64_array, "uint64" },
+ { zap_byteswap, "zap" },
+ { dnode_buf_byteswap, "dnode" },
+ { dmu_objset_byteswap, "objset" },
+ { zfs_znode_byteswap, "znode" },
+ { zfs_oldacl_byteswap, "oldacl" },
+ { zfs_acl_byteswap, "acl" }
+};
+
+static int
+dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
+{
+ uint64_t blkid;
+ dmu_buf_impl_t *db;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, 0, offset);
+ db = dbuf_hold(dn, blkid, tag);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (db == NULL) {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+
+ *dbp = &db->db;
+ return (0);
+}
+int
+dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ dmu_buf_impl_t *db;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, 0, offset);
+ db = dbuf_hold(dn, blkid, tag);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+
+ if (db == NULL) {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+
+ *dbp = &db->db;
+ return (err);
+}
+
+int
+dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags)
+{
+ int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
+
+ err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
+ if (err == 0) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+ err = dbuf_read(db, NULL, db_flags);
+ if (err != 0) {
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ }
+ }
+
+ return (err);
+}
+
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags)
+{
+ int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
+
+ err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
+ if (err == 0) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+ err = dbuf_read(db, NULL, db_flags);
+ if (err != 0) {
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ }
+ }
+
+ return (err);
+}
+
+int
+dmu_bonus_max(void)
+{
+ return (DN_OLD_MAX_BONUSLEN);
+}
+
+int
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (dn->dn_bonus != db) {
+ error = SET_ERROR(EINVAL);
+ } else if (newsize < 0 || newsize > db_fake->db_size) {
+ error = SET_ERROR(EINVAL);
+ } else {
+ dnode_setbonuslen(dn, newsize, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+int
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (!DMU_OT_IS_VALID(type)) {
+ error = SET_ERROR(EINVAL);
+ } else if (dn->dn_bonus != db) {
+ error = SET_ERROR(EINVAL);
+ } else {
+ dnode_setbonus_type(dn, type, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ dmu_object_type_t type;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ type = dn->dn_bonustype;
+ DB_DNODE_EXIT(db);
+
+ return (type);
+}
+
+int
+dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int error;
+
+ error = dnode_hold(os, object, FTAG, &dn);
+ dbuf_rm_spill(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_rm_spill(dn, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (error);
+}
+
+/*
+ * Lookup and hold the bonus buffer for the provided dnode. If the dnode
+ * has not yet been allocated a bonus dbuf, a new one will be allocated.
+ * Returns ENOENT, EIO, or 0.
+ */
+int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp,
+ uint32_t flags)
+{
+ dmu_buf_impl_t *db;
+ int error;
+ uint32_t db_flags = DB_RF_MUST_SUCCEED;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dbuf_create_bonus(dn);
+ }
+ db = dn->dn_bonus;
+
+ /* as long as the bonus buf is held, the dnode will be held */
+ if (zfs_refcount_add(&db->db_holds, tag) == 1) {
+ VERIFY(dnode_add_ref(dn, db));
+ atomic_inc_32(&dn->dn_dbufs_count);
+ }
+
+ /*
+ * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+ * hold and incrementing the dbuf count to ensure that dnode_move() sees
+ * a dnode hold for every dbuf.
+ */
+ rw_exit(&dn->dn_struct_rwlock);
+
+ error = dbuf_read(db, NULL, db_flags);
+ if (error) {
+ dnode_evict_bonus(dn);
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ return (error);
+ }
+
+ *dbp = &db->db;
+ return (0);
+}
+
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ int error;
+
+ error = dnode_hold(os, object, FTAG, &dn);
+ if (error)
+ return (error);
+
+ error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH);
+ dnode_rele(dn, FTAG);
+
+ return (error);
+}
+
+/*
+ * Returns ENOENT, EIO, or 0.
+ *
+ * This interface will allocate a blank spill dbuf when a spill block
+ * doesn't already exist on the dnode.
+ *
+ * If you only want to find an already existing spill dbuf, then
+ * dmu_spill_hold_existing() should be used.
+ */
+int
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = NULL;
+ int err;
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (db == NULL) {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+ err = dbuf_read(db, NULL, flags);
+ if (err == 0)
+ *dbp = &db->db;
+ else {
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ }
+ return (err);
+}
+
+int
+dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+ err = SET_ERROR(EINVAL);
+ } else {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (!dn->dn_have_spill) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ err = dmu_spill_hold_by_dnode(dn,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ DB_DNODE_EXIT(db);
+ return (err);
+}
+
+int
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag,
+ dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+ uint32_t db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
+ * to take a held dnode rather than <os, object> -- the lookup is wasteful,
+ * and can induce severe lock contention when writing to several files
+ * whose dnodes are in the same block.
+ */
+int
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+ boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
+{
+ dmu_buf_t **dbp;
+ uint64_t blkid, nblks, i;
+ uint32_t dbuf_flags;
+ int err;
+ zio_t *zio = NULL;
+
+ ASSERT(length <= DMU_MAX_ACCESS);
+
+ /*
+ * Note: We directly notify the prefetch code of this read, so that
+ * we can tell it about the multi-block read. dbuf_read() only knows
+ * about the one block it is accessing.
+ */
+ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
+ DB_RF_NOPREFETCH;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_datablkshift) {
+ int blkshift = dn->dn_datablkshift;
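+		/*
+		 * Round the end up and the start down to block boundaries to
+		 * count every block the range touches.
+		 */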
+ nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
+ P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
+ } else {
+ if (offset + length > dn->dn_datablksz) {
+ zfs_panic_recover("zfs: accessing past end of object "
+ "%llx/%llx (size=%u access=%llu+%llu)",
+ (longlong_t)dn->dn_objset->
+ os_dsl_dataset->ds_object,
+ (longlong_t)dn->dn_object, dn->dn_datablksz,
+ (longlong_t)offset, (longlong_t)length);
+ rw_exit(&dn->dn_struct_rwlock);
+ return (SET_ERROR(EIO));
+ }
+ nblks = 1;
+ }
+ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+
+ if (read)
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ blkid = dbuf_whichblock(dn, 0, offset);
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
+ if (db == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_rele_array(dbp, nblks, tag);
+ if (read)
+ zio_nowait(zio);
+ return (SET_ERROR(EIO));
+ }
+
+ /* initiate async i/o */
+ if (read)
+ (void) dbuf_read(db, zio, dbuf_flags);
+ dbp[i] = &db->db;
+ }
+
+ if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
+ DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+ dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
+ read && DNODE_IS_CACHEABLE(dn), B_TRUE);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (read) {
+ /* wait for async read i/o */
+ err = zio_wait(zio);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+
+ /* wait for other io to complete */
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED)
+ err = SET_ERROR(EIO);
+ mutex_exit(&db->db_mtx);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+ }
+ }
+
+ *numbufsp = nblks;
+ *dbpp = dbp;
+ return (0);
+}
+
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp, DMU_READ_PREFETCH);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
+ uint64_t length, boolean_t read, void *tag, int *numbufsp,
+ dmu_buf_t ***dbpp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp, DMU_READ_PREFETCH);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+void
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
+{
+ int i;
+ dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
+
+ if (numbufs == 0)
+ return;
+
+ for (i = 0; i < numbufs; i++) {
+ if (dbp[i])
+ dbuf_rele(dbp[i], tag);
+ }
+
+ kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
+}
+
+/*
+ * Issue prefetch i/os for the given blocks. If level is greater than 0, the
+ * indirect blocks prefetched will be those that point to the blocks containing
+ * the data starting at offset, and continuing to offset + len.
+ *
+ * Note that if the indirect blocks above the blocks being prefetched are not
+ * in cache, they will be asynchronously read in.
+ */
+void
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+ uint64_t len, zio_priority_t pri)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ int nblks, err;
+
+ if (len == 0) { /* they're interested in the bonus buffer */
+ dn = DMU_META_DNODE(os);
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, level,
+ object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, level, blkid, pri, 0);
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+
+ /*
+ * See comment before the definition of dmu_prefetch_max.
+ */
+ len = MIN(len, dmu_prefetch_max);
+
+ /*
+ * XXX - Note, if the dnode for the requested object is not
+ * already cached, we will do a *synchronous* read in the
+ * dnode_hold() call. The same is true for any indirects.
+ */
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0)
+ return;
+
+ /*
+ * offset + len - 1 is the last byte we want to prefetch for, and offset
+ * is the first. Then dbuf_whichblock(dn, level, offset + len - 1) is the
+ * last block we want to prefetch, and dbuf_whichblock(dn, level,
+ * offset) is the first. Then the number we need to prefetch is the
+ * last - first + 1.
+ */
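+	/*
+	 * For example (hypothetical values): with a 128K data block size
+	 * (datablkshift of 17), offset 100000 and len 300000 at level 0 give
+	 * first block 0 and last block 3, so nblks = 3 - 0 + 1 = 4.
+	 */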
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (level > 0 || dn->dn_datablkshift != 0) {
+ nblks = dbuf_whichblock(dn, level, offset + len - 1) -
+ dbuf_whichblock(dn, level, offset) + 1;
+ } else {
+ nblks = (offset < dn->dn_datablksz);
+ }
+
+ if (nblks != 0) {
+ blkid = dbuf_whichblock(dn, level, offset);
+ for (int i = 0; i < nblks; i++)
+ dbuf_prefetch(dn, level, blkid + i, pri, 0);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ dnode_rele(dn, FTAG);
+}
+
+/*
+ * Get the next "chunk" of file data to free. We traverse the file from
+ * the end so that the file gets shorter over time (if we crash in the
+ * middle, this will leave us in a better state). We find allocated file
+ * data by simply searching the allocated level 1 indirects.
+ *
+ * On input, *start should be the first offset that does not need to be
+ * freed (e.g. "offset + length"). On return, *start will be the first
+ * offset that should be freed and l1blks is set to the number of level 1
+ * indirect blocks found within the chunk.
+ */
+static int
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
+{
+ uint64_t blks;
+ uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
+ /* bytes of data covered by a level-1 indirect block */
+ uint64_t iblkrange = (uint64_t)dn->dn_datablksz *
+ EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
+
+ ASSERT3U(minimum, <=, *start);
+
+ /*
+ * Check if we can free the entire range assuming that all of the
+ * L1 blocks in this range have data. If we can, we use this
+ * worst case value as an estimate so we can avoid having to look
+ * at the object's actual data.
+ */
+ uint64_t total_l1blks =
+ (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
+ iblkrange;
+ if (total_l1blks <= maxblks) {
+ *l1blks = total_l1blks;
+ *start = minimum;
+ return (0);
+ }
+ ASSERT(ISP2(iblkrange));
+
+ for (blks = 0; *start > minimum && blks < maxblks; blks++) {
+ int err;
+
+ /*
+ * dnode_next_offset(BACKWARDS) will find an allocated L1
+ * indirect block at or before the input offset. We must
+ * decrement *start so that it is at the end of the region
+ * to search.
+ */
+ (*start)--;
+
+ err = dnode_next_offset(dn,
+ DNODE_FIND_BACKWARDS, start, 2, 1, 0);
+
+ /* if there are no indirect blocks before start, we are done */
+ if (err == ESRCH) {
+ *start = minimum;
+ break;
+ } else if (err != 0) {
+ *l1blks = blks;
+ return (err);
+ }
+
+ /* set start to the beginning of this L1 indirect */
+ *start = P2ALIGN(*start, iblkrange);
+ }
+ if (*start < minimum)
+ *start = minimum;
+ *l1blks = blks;
+
+ return (0);
+}
+
+/*
+ * If this objset is of type DMU_OST_ZFS, return true if the vfs's unmounted
+ * flag is set; otherwise return false.
+ * Used below in dmu_free_long_range_impl() to allow aborting the free when
+ * the filesystem is being unmounted.
+ */
+/*ARGSUSED*/
+static boolean_t
+dmu_objset_zfs_unmounting(objset_t *os)
+{
+#ifdef _KERNEL
+ if (dmu_objset_type(os) == DMU_OST_ZFS)
+ return (zfs_get_vfs_flag_unmounted(os));
+#endif
+ return (B_FALSE);
+}
+
+static int
+dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
+ uint64_t length)
+{
+ uint64_t object_size;
+ int err;
+ uint64_t dirty_frees_threshold;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ if (dn == NULL)
+ return (SET_ERROR(EINVAL));
+
+ object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+ if (offset >= object_size)
+ return (0);
+
+ if (zfs_per_txg_dirty_frees_percent <= 100)
+ dirty_frees_threshold =
+ zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
+ else
+ dirty_frees_threshold = zfs_dirty_data_max / 20;
+
+ if (length == DMU_OBJECT_END || offset + length > object_size)
+ length = object_size - offset;
+
+ while (length != 0) {
+ uint64_t chunk_end, chunk_begin, chunk_len;
+ uint64_t l1blks;
+ dmu_tx_t *tx;
+
+ if (dmu_objset_zfs_unmounting(dn->dn_objset))
+ return (SET_ERROR(EINTR));
+
+ chunk_end = chunk_begin = offset + length;
+
+ /* move chunk_begin backwards to the beginning of this chunk */
+ err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
+ if (err)
+ return (err);
+ ASSERT3U(chunk_begin, >=, offset);
+ ASSERT3U(chunk_begin, <=, chunk_end);
+
+ chunk_len = chunk_end - chunk_begin;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
+
+ /*
+ * Mark this transaction as typically resulting in a net
+ * reduction in space used.
+ */
+ dmu_tx_mark_netfree(tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&dp->dp_lock);
+ uint64_t long_free_dirty =
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
+ mutex_exit(&dp->dp_lock);
+
+ /*
+ * To avoid filling up a TXG with just frees, wait for
+ * the next TXG to open before freeing more chunks if
+ * we have reached the threshold of frees.
+ */
+ if (dirty_frees_threshold != 0 &&
+ long_free_dirty >= dirty_frees_threshold) {
+ DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay);
+ dmu_tx_commit(tx);
+ txg_wait_open(dp, 0, B_TRUE);
+ continue;
+ }
+
+ /*
+ * In order to prevent unnecessary write throttling, for each
+ * TXG, we track the cumulative size of L1 blocks being dirtied
+ * in dnode_free_range() below. We compare this number to a
+ * tunable threshold, past which we prevent new L1 dirty freeing
+ * blocks from being added into the open TXG. See
+ * dmu_free_long_range_impl() for details. The threshold
+ * prevents write throttle activation due to dirty freeing L1
+ * blocks taking up a large percentage of zfs_dirty_data_max.
+ */
+ mutex_enter(&dp->dp_lock);
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
+ l1blks << dn->dn_indblkshift;
+ mutex_exit(&dp->dp_lock);
+ DTRACE_PROBE3(free__long__range,
+ uint64_t, long_free_dirty, uint64_t, chunk_len,
+ uint64_t, txg);
+ dnode_free_range(dn, chunk_begin, chunk_len, tx);
+
+ dmu_tx_commit(tx);
+
+ length -= chunk_len;
+ }
+ return (0);
+}
+
+int
+dmu_free_long_range(objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t length)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+ err = dmu_free_long_range_impl(os, dn, offset, length);
+
+ /*
+ * It is important to zero out the maxblkid when freeing the entire
+ * file, so that (a) subsequent calls to dmu_free_long_range_impl()
+ * will take the fast path, and (b) dnode_reallocate() can verify
+ * that the entire file has been freed.
+ */
+ if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
+ dn->dn_maxblkid = 0;
+
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_free_long_object(objset_t *os, uint64_t object)
+{
+ dmu_tx_t *tx;
+ int err;
+
+ err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
+ if (err != 0)
+ return (err);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, object);
+ dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+ dmu_tx_mark_netfree(tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+		err = dmu_object_free(os, object, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+
+ return (err);
+}
+
+int
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ ASSERT(offset < UINT64_MAX);
+ ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset);
+ dnode_free_range(dn, offset, size, tx);
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+static int
+dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
+ void *buf, uint32_t flags)
+{
+ dmu_buf_t **dbp;
+ int numbufs, err = 0;
+
+ /*
+ * Deal with odd block sizes, where there can't be data past the first
+ * block. If we ever do the tail block optimization, we will need to
+ * handle that here as well.
+ */
+ if (dn->dn_maxblkid == 0) {
+ uint64_t newsz = offset > dn->dn_datablksz ? 0 :
+ MIN(size, dn->dn_datablksz - offset);
+ bzero((char *)buf + newsz, size - newsz);
+ size = newsz;
+ }
+
+ while (size > 0) {
+ uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+ int i;
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
+ TRUE, FTAG, &numbufs, &dbp, flags);
+ if (err)
+ break;
+
+ for (i = 0; i < numbufs; i++) {
+ uint64_t tocpy;
+ int64_t bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = offset - db->db_offset;
+ tocpy = MIN(db->db_size - bufoff, size);
+
+ (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ }
+ return (err);
+}
+
+int
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf, uint32_t flags)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+
+ err = dmu_read_impl(dn, offset, size, buf, flags);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
+ uint32_t flags)
+{
+ return (dmu_read_impl(dn, offset, size, buf, flags));
+}
+
+static void
+dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ int i;
+
+ for (i = 0; i < numbufs; i++) {
+ uint64_t tocpy;
+ int64_t bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = offset - db->db_offset;
+ tocpy = MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ (void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+}
+
+void
+dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs;
+
+ if (size == 0)
+ return;
+
+ VERIFY0(dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+ dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+/*
+ * Note: Lustre is an external consumer of this interface.
+ */
+void
+dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs;
+
+ if (size == 0)
+ return;
+
+ VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
+ FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
+ dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+void
+dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+
+ if (size == 0)
+ return;
+
+ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+
+ for (i = 0; i < numbufs; i++) {
+ dmu_buf_t *db = dbp[i];
+
+ dmu_buf_will_not_fill(db, tx);
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+ void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+ int compressed_size, int byteorder, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+
+ ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
+ ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+ VERIFY0(dmu_buf_hold_noread(os, object, offset,
+ FTAG, &db));
+
+ dmu_buf_write_embedded(db,
+ data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
+ uncompressed_size, compressed_size, byteorder, tx);
+
+ dmu_buf_rele(db, FTAG);
+}
+
+void
+dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx)
+{
+ int numbufs, i;
+ dmu_buf_t **dbp;
+
+ VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG,
+ &numbufs, &dbp));
+ for (i = 0; i < numbufs; i++)
+ dmu_buf_redact(dbp[i], tx);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+#ifdef _KERNEL
+int
+dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i, err;
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
+ TRUE, FTAG, &numbufs, &dbp, 0);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ uint64_t tocpy;
+ int64_t bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = zfs_uio_offset(uio) - db->db_offset;
+ tocpy = MIN(db->db_size - bufoff, size);
+
+ err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy,
+ UIO_READ, uio);
+
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From object zdb->db_object.
+ * Starting at zfs_uio_offset(uio).
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_read_uio_dnode(dn, uio, size);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From the specified object
+ * Starting at offset zfs_uio_offset(uio).
+ */
+int
+dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
+{
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_read_uio_dnode(dn, uio, size);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs;
+ int err = 0;
+ int i;
+
+ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
+ FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ uint64_t tocpy;
+ int64_t bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = zfs_uio_offset(uio) - db->db_offset;
+ tocpy = MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ /*
+		 * XXX zfs_uiomove could block forever (e.g. nfs-backed
+ * pages). There needs to be a uiolockdown() function
+ * to lock the pages in memory, so that zfs_uiomove won't
+ * block.
+ */
+ err = zfs_uio_fault_move((char *)db->db_data + bufoff,
+ tocpy, UIO_WRITE, uio);
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To object zdb->db_object.
+ * Starting at offset zfs_uio_offset(uio).
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To the specified object.
+ * Starting at offset zfs_uio_offset(uio).
+ */
+int
+dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+#endif /* _KERNEL */
+
+/*
+ * Allocate a loaned anonymous arc buffer.
+ */
+arc_buf_t *
+dmu_request_arcbuf(dmu_buf_t *handle, int size)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+
+ return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
+}
+
+/*
+ * Free a loaned arc buffer.
+ */
+void
+dmu_return_arcbuf(arc_buf_t *buf)
+{
+ arc_return_buf(buf, FTAG);
+ arc_buf_destroy(buf, FTAG);
+}
+
+/*
+ * A "lightweight" write is faster than a regular write (e.g.
+ * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the
+ * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t. However, the
+ * data can not be read or overwritten until the transaction's txg has been
+ * synced. This makes it appropriate for workloads that are known to be
+ * (temporarily) write-only, like "zfs receive".
+ *
+ * A single block is written, starting at the specified offset in bytes. If
+ * the call is successful, it returns 0 and the provided abd has been
+ * consumed (the caller should not free it).
+ */
+int
+dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
+ const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr =
+ dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx);
+ if (dr == NULL)
+ return (SET_ERROR(EIO));
+ dr->dt.dll.dr_abd = abd;
+ dr->dt.dll.dr_props = *zp;
+ dr->dt.dll.dr_flags = flags;
+ return (0);
+}
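A hedged sketch of a receive-style consumer of this call (illustrative helper; it assumes the tx already covers the target block and that 'data' stays valid until the txg syncs, because abd_get_from_buf() wraps the buffer without copying):

static int
example_lightweight_write(dnode_t *dn, uint64_t off, void *data, uint64_t len,
    dmu_tx_t *tx)
{
	zio_prop_t zp;
	abd_t *abd = abd_get_from_buf(data, len);
	int err;

	/* Resolve the dataset's write policy for a plain level-0 block. */
	dmu_write_policy(dn->dn_objset, dn, 0, 0, &zp);

	err = dmu_lightweight_write_by_dnode(dn, off, abd, &zp, 0, tx);
	if (err != 0)
		abd_free(abd);	/* the abd is only consumed on success */
	return (err);
}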
+
+/*
+ * When possible directly assign passed loaned arc buffer to a dbuf.
+ * If this is not possible copy the contents of passed arc buf via
+ * dmu_write().
+ */
+int
+dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ objset_t *os = dn->dn_objset;
+ uint64_t object = dn->dn_object;
+ uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
+ uint64_t blkid;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, 0, offset);
+ db = dbuf_hold(dn, blkid, FTAG);
+ if (db == NULL)
+ return (SET_ERROR(EIO));
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /*
+ * We can only assign if the offset is aligned and the arc buf is the
+ * same size as the dbuf.
+ */
+ if (offset == db->db.db_offset && blksz == db->db.db_size) {
+ dbuf_assign_arcbuf(db, buf, tx);
+ dbuf_rele(db, FTAG);
+ } else {
+ /* compressed bufs must always be assignable to their dbuf */
+ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
+ ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
+
+ dbuf_rele(db, FTAG);
+ dmu_write(os, object, offset, blksz, buf->b_data, tx);
+ dmu_return_arcbuf(buf);
+ }
+
+ return (0);
+}
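The loaned-buffer path in sketch form (illustrative helper; 'db' is assumed to be a dbuf already held in the target object, and blksz should equal the object's block size for the zero-copy assignment to apply):

static int
example_arcbuf_write(dmu_buf_t *db, uint64_t off, const void *src, int blksz,
    dmu_tx_t *tx)
{
	arc_buf_t *abuf = dmu_request_arcbuf(db, blksz);
	int err;

	memcpy(abuf->b_data, src, blksz);
	err = dmu_assign_arcbuf_by_dbuf(db, off, abuf, tx);
	if (err != 0)
		dmu_return_arcbuf(abuf);	/* still ours on failure */
	return (err);
}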
+
+int
+dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
+ dmu_tx_t *tx)
+{
+ int err;
+ dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+
+ DB_DNODE_ENTER(dbuf);
+ err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx);
+ DB_DNODE_EXIT(dbuf);
+
+ return (err);
+}
+
+typedef struct {
+ dbuf_dirty_record_t *dsa_dr;
+ dmu_sync_cb_t *dsa_done;
+ zgd_t *dsa_zgd;
+ dmu_tx_t *dsa_tx;
+} dmu_sync_arg_t;
+
+/* ARGSUSED */
+static void
+dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ dmu_sync_arg_t *dsa = varg;
+ dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_error == 0) {
+ if (BP_IS_HOLE(bp)) {
+ /*
+ * A block of zeros may compress to a hole, but the
+ * block size still needs to be known for replay.
+ */
+ BP_SET_LSIZE(bp, db->db_size);
+ } else if (!BP_IS_EMBEDDED(bp)) {
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ BP_SET_FILL(bp, 1);
+ }
+ }
+}
+
+static void
+dmu_sync_late_arrival_ready(zio_t *zio)
+{
+ dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
+/* ARGSUSED */
+static void
+dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ dmu_sync_arg_t *dsa = varg;
+ dbuf_dirty_record_t *dr = dsa->dsa_dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ zgd_t *zgd = dsa->dsa_zgd;
+
+ /*
+ * Record the vdev(s) backing this blkptr so they can be flushed after
+ * the writes for the lwb have completed.
+ */
+ if (zio->io_error == 0) {
+ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
+ }
+
+ mutex_enter(&db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
+ if (zio->io_error == 0) {
+ dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
+ if (dr->dt.dl.dr_nopwrite) {
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
+
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ VERIFY(BP_EQUAL(bp, db->db_blkptr));
+ ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
+ VERIFY(zio_checksum_table[chksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE);
+ }
+ dr->dt.dl.dr_overridden_by = *zio->io_bp;
+ dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
+
+ /*
+ * Old style holes are filled with all zeros, whereas
+ * new-style holes maintain their lsize, type, level,
+ * and birth time (see zio_write_compress). While we
+ * need to reset the BP_SET_LSIZE() call that happened
+ * in dmu_sync_ready for old style holes, we do *not*
+ * want to wipe out the information contained in new
+ * style holes. Thus, only zero out the block pointer if
+ * it's an old style hole.
+ */
+ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
+ dr->dt.dl.dr_overridden_by.blk_birth == 0)
+ BP_ZERO(&dr->dt.dl.dr_overridden_by);
+ } else {
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ }
+ cv_broadcast(&db->db_changed);
+ mutex_exit(&db->db_mtx);
+
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static void
+dmu_sync_late_arrival_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dmu_sync_arg_t *dsa = zio->io_private;
+ zgd_t *zgd = dsa->dsa_zgd;
+
+ if (zio->io_error == 0) {
+ /*
+ * Record the vdev(s) backing this blkptr so they can be
+ * flushed after the writes for the lwb have completed.
+ */
+ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
+
+ if (!BP_IS_HOLE(bp)) {
+ blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
+ ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
+ ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+ ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+ zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+ }
+ }
+
+ dmu_tx_commit(dsa->dsa_tx);
+
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ abd_free(zio->io_abd);
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static int
+dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
+ zio_prop_t *zp, zbookmark_phys_t *zb)
+{
+ dmu_sync_arg_t *dsa;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
+ if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
+ dmu_tx_abort(tx);
+		/* Make zl_get_data do txg_wait_synced() */
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * In order to prevent the zgd's lwb from being free'd prior to
+ * dmu_sync_late_arrival_done() being called, we have to ensure
+ * the lwb's "max txg" takes this tx's txg into account.
+ */
+ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
+
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = NULL;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = tx;
+
+ /*
+ * Since we are currently syncing this txg, it's nontrivial to
+ * determine what BP to nopwrite against, so we disable nopwrite.
+ *
+ * When syncing, the db_blkptr is initially the BP of the previous
+ * txg. We can not nopwrite against it because it will be changed
+ * (this is similar to the non-late-arrival case where the dbuf is
+ * dirty in a future txg).
+ *
+	 * Then dbuf_write_ready() sets db_blkptr to the location we will write.
+ * We can not nopwrite against it because although the BP will not
+ * (typically) be changed, the data has not yet been persisted to this
+ * location.
+ *
+ * Finally, when dbuf_write_done() is called, it is theoretically
+ * possible to always nopwrite, because the data that was written in
+ * this txg is the same data that we are trying to write. However we
+ * would need to check that this dbuf is not dirty in any future
+ * txg's (as we do in the normal dmu_sync() path). For simplicity, we
+ * don't nopwrite in this case.
+ */
+ zp->zp_nopwrite = B_FALSE;
+
+ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+ abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
+ zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
+ dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
+ dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+
+ return (0);
+}
+
+/*
+ * Intent log support: sync the block associated with db to disk.
+ * N.B. and XXX: the caller is responsible for making sure that the
+ * data isn't changing while dmu_sync() is writing it.
+ *
+ * Return values:
+ *
+ * EEXIST: this txg has already been synced, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * EALREADY: this block is already in the process of being synced.
+ * The caller should track its progress (somehow).
+ *
+ * EIO: could not do the I/O.
+ * The caller should do a txg_wait_synced().
+ *
+ * 0: the I/O has been initiated.
+ * The caller should log this blkptr in the done callback.
+ * It is possible that the I/O will fail, in which case
+ * the error will be reported to the done callback and
+ * propagated to pio from zio_done().
+ */
+int
+dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
+ objset_t *os = db->db_objset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dbuf_dirty_record_t *dr, *dr_next;
+ dmu_sync_arg_t *dsa;
+ zbookmark_phys_t zb;
+ zio_prop_t zp;
+ dnode_t *dn;
+
+ ASSERT(pio != NULL);
+ ASSERT(txg != 0);
+
+ SET_BOOKMARK(&zb, ds->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+ DB_DNODE_EXIT(db);
+
+ /*
+ * If we're frozen (running ziltest), we always need to generate a bp.
+ */
+ if (txg > spa_freeze_txg(os->os_spa))
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
+
+ /*
+ * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
+ * and us. If we determine that this txg is not yet syncing,
+ * but it begins to sync a moment later, that's OK because the
+ * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
+ */
+ mutex_enter(&db->db_mtx);
+
+ if (txg <= spa_last_synced_txg(os->os_spa)) {
+ /*
+ * This txg has already synced. There's nothing to do.
+ */
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(EEXIST));
+ }
+
+ if (txg <= spa_syncing_txg(os->os_spa)) {
+ /*
+ * This txg is currently syncing, so we can't mess with
+ * the dirty record anymore; just write a new log block.
+ */
+ mutex_exit(&db->db_mtx);
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
+ }
+
+ dr = dbuf_find_dirty_eq(db, txg);
+
+ if (dr == NULL) {
+ /*
+ * There's no dr for this dbuf, so it must have been freed.
+ * There's no need to log writes to freed blocks, so we're done.
+ */
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(ENOENT));
+ }
+
+ dr_next = list_next(&db->db_dirty_records, dr);
+ ASSERT(dr_next == NULL || dr_next->dr_txg < txg);
+
+ if (db->db_blkptr != NULL) {
+ /*
+ * We need to fill in zgd_bp with the current blkptr so that
+ * the nopwrite code can check if we're writing the same
+ * data that's already on disk. We can only nopwrite if we
+ * are sure that after making the copy, db_blkptr will not
+ * change until our i/o completes. We ensure this by
+ * holding the db_mtx, and only allowing nopwrite if the
+ * block is not already dirty (see below). This is verified
+ * by dmu_sync_done(), which VERIFYs that the db_blkptr has
+ * not changed.
+ */
+ *zgd->zgd_bp = *db->db_blkptr;
+ }
+
+ /*
+ * Assume the on-disk data is X, the current syncing data (in
+ * txg - 1) is Y, and the current in-memory data is Z (currently
+ * in dmu_sync).
+ *
+ * We usually want to perform a nopwrite if X and Z are the
+ * same. However, if Y is different (i.e. the BP is going to
+ * change before this write takes effect), then a nopwrite will
+ * be incorrect - we would override with X, which could have
+ * been freed when Y was written.
+ *
+ * (Note that this is not a concern when we are nop-writing from
+ * syncing context, because X and Y must be identical, because
+ * all previous txgs have been synced.)
+ *
+ * Therefore, we disable nopwrite if the current BP could change
+ * before this TXG. There are two ways it could change: by
+ * being dirty (dr_next is non-NULL), or by being freed
+ * (dnode_block_freed()). This behavior is verified by
+ * zio_done(), which VERIFYs that the override BP is identical
+ * to the on-disk BP.
+ */
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
+ zp.zp_nopwrite = B_FALSE;
+ DB_DNODE_EXIT(db);
+
+ ASSERT(dr->dr_txg == txg);
+ if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * We have already issued a sync write for this buffer,
+ * or this buffer has already been synced. It could not
+ * have been dirtied since, or we would have cleared the state.
+ */
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(EALREADY));
+ }
+
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+ dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+ mutex_exit(&db->db_mtx);
+
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = dr;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = NULL;
+
+ zio_nowait(arc_write(pio, os->os_spa, txg,
+ zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
+ &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+
+ return (0);
+}
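How a ZIL get-data callback might act on the return values documented above (sketch only; the zgd/lwb setup is elided and the helper name is made up):

static void
example_handle_dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done_cb,
    zgd_t *zgd, objset_t *os)
{
	switch (dmu_sync(pio, txg, done_cb, zgd)) {
	case 0:		/* I/O issued; done_cb will log the blkptr */
	case EEXIST:	/* txg already synced; nothing to log */
	case ENOENT:	/* block was freed; nothing to log */
	case EALREADY:	/* an earlier dmu_sync() of this block is in flight */
		break;
	case EIO:
	default:
		/* per the comment above, fall back to waiting for the txg */
		txg_wait_synced(dmu_objset_pool(os), txg);
		break;
	}
}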
+
+int
+dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ err = dnode_set_nlevels(dn, nlevels, tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ err = dnode_set_blksz(dn, size, ibs, tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+void
+dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ /*
+ * Send streams include each object's checksum function. This
+ * check ensures that the receiving system can understand the
+ * checksum function transmitted.
+ */
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
+ dn->dn_checksum = checksum;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+void
+dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ /*
+ * Send streams include each object's compression function. This
+ * check ensures that the receiving system can understand the
+ * compression function transmitted.
+ */
+ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
+ dn->dn_compress = compress;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+/*
+ * When the "redundant_metadata" property is set to "most", only indirect
+ * blocks of this level and higher will have an additional ditto block.
+ */
+int zfs_redundant_metadata_most_ditto_level = 2;
+
+void
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+{
+ dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
+ boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
+ (wp & WP_SPILL));
+ enum zio_checksum checksum = os->os_checksum;
+ enum zio_compress compress = os->os_compress;
+ uint8_t complevel = os->os_complevel;
+ enum zio_checksum dedup_checksum = os->os_dedup_checksum;
+ boolean_t dedup = B_FALSE;
+ boolean_t nopwrite = B_FALSE;
+ boolean_t dedup_verify = os->os_dedup_verify;
+ boolean_t encrypt = B_FALSE;
+ int copies = os->os_copies;
+
+ /*
+ * We maintain different write policies for each of the following
+ * types of data:
+ * 1. metadata
+ * 2. preallocated blocks (i.e. level-0 blocks of a dump device)
+ * 3. all other level 0 blocks
+ */
+ if (ismd) {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ compress = zio_compress_select(os->os_spa,
+ ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
+
+ /*
+ * Metadata always gets checksummed. If the data
+ * checksum is multi-bit correctable, and it's not a
+ * ZBT-style checksum, then it's suitable for metadata
+ * as well. Otherwise, the metadata checksum defaults
+ * to fletcher4.
+ */
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_METADATA) ||
+ (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_EMBEDDED))
+ checksum = ZIO_CHECKSUM_FLETCHER_4;
+
+ if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
+ (os->os_redundant_metadata ==
+ ZFS_REDUNDANT_METADATA_MOST &&
+ (level >= zfs_redundant_metadata_most_ditto_level ||
+ DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
+ copies++;
+ } else if (wp & WP_NOFILL) {
+ ASSERT(level == 0);
+
+ /*
+ * If we're writing preallocated blocks, we aren't actually
+ * writing them so don't set any policy properties. These
+ * blocks are currently only used by an external subsystem
+ * outside of zfs (i.e. dump) and not written by the zio
+ * pipeline.
+ */
+ compress = ZIO_COMPRESS_OFF;
+ checksum = ZIO_CHECKSUM_OFF;
+ } else {
+ compress = zio_compress_select(os->os_spa, dn->dn_compress,
+ compress);
+ complevel = zio_complevel_select(os->os_spa, compress,
+ complevel, complevel);
+
+ checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
+ zio_checksum_select(dn->dn_checksum, checksum) :
+ dedup_checksum;
+
+ /*
+ * Determine dedup setting. If we are in dmu_sync(),
+ * we won't actually dedup now because that's all
+ * done in syncing context; but we do want to use the
+ * dedup checksum. If the checksum is not strong
+ * enough to ensure unique signatures, force
+ * dedup_verify.
+ */
+ if (dedup_checksum != ZIO_CHECKSUM_OFF) {
+ dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP))
+ dedup_verify = B_TRUE;
+ }
+
+ /*
+		 * Enable nopwrite if we have a secure enough checksum
+ * algorithm (see comment in zio_nop_write) and
+ * compression is enabled. We don't enable nopwrite if
+ * dedup is enabled as the two features are mutually
+ * exclusive.
+ */
+ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE) &&
+ compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
+ }
+
+ /*
+ * All objects in an encrypted objset are protected from modification
+ * via a MAC. Encrypted objects store their IV and salt in the last DVA
+ * in the bp, so we cannot use all copies. Encrypted objects are also
+ * not subject to nopwrite since writing the same data will still
+ * result in a new ciphertext. Only encrypted blocks can be dedup'd
+ * to avoid ambiguity in the dedup code since the DDT does not store
+ * object types.
+ */
+ if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
+ encrypt = B_TRUE;
+
+ if (DMU_OT_IS_ENCRYPTED(type)) {
+ copies = MIN(copies, SPA_DVAS_PER_BP - 1);
+ nopwrite = B_FALSE;
+ } else {
+ dedup = B_FALSE;
+ }
+
+ if (level <= 0 &&
+ (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) {
+ compress = ZIO_COMPRESS_EMPTY;
+ }
+ }
+
+ zp->zp_compress = compress;
+ zp->zp_complevel = complevel;
+ zp->zp_checksum = checksum;
+ zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
+ zp->zp_level = level;
+ zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
+ zp->zp_dedup = dedup;
+ zp->zp_dedup_verify = dedup && dedup_verify;
+ zp->zp_nopwrite = nopwrite;
+ zp->zp_encrypt = encrypt;
+ zp->zp_byteorder = ZFS_HOST_BYTEORDER;
+ bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
+ bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
+ bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);
+ zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
+ os->os_zpl_special_smallblock : 0;
+
+ ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
+}
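For orientation, roughly how a dbuf_write()-style caller consumes this (fragment; 'os' and 'dn' are assumed held, and level 0 with no WP_* flags describes an ordinary data write):

	zio_prop_t zp;

	dmu_write_policy(os, dn, 0 /* level */, 0 /* wp flags */, &zp);
	/*
	 * zp now carries the resolved policy, e.g.:
	 *   zp.zp_checksum - never ZIO_CHECKSUM_INHERIT
	 *   zp.zp_compress - ZIO_COMPRESS_OFF only for WP_NOFILL writes
	 *   zp.zp_copies   - capped at spa_max_replication()
	 *   zp.zp_nopwrite - set only when checksum and compression allow it
	 */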
+
+/*
+ * This function is only called from zfs_holey_common() for zpl_llseek()
+ * in order to determine the location of holes. In order to accurately
+ * report holes all dirty data must be synced to disk. This causes extremely
+ * poor performance when seeking for holes in a dirty file. As a compromise,
+ * only provide hole data when the dnode is clean. When a dnode is dirty
+ * report the dnode as having no holes which is always a safe thing to do.
+ */
+int
+dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
+{
+ dnode_t *dn;
+ int i, err;
+ boolean_t clean = B_TRUE;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ /*
+ * Check if dnode is dirty
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (multilist_link_active(&dn->dn_dirty_link[i])) {
+ clean = B_FALSE;
+ break;
+ }
+ }
+
+ /*
+ * If compatibility option is on, sync any current changes before
+ * we go trundling through the block pointers.
+ */
+ if (!clean && zfs_dmu_offset_next_sync) {
+ clean = B_TRUE;
+ dnode_rele(dn, FTAG);
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ }
+
+ if (clean)
+ err = dnode_next_offset(dn,
+ (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
+ else
+ err = SET_ERROR(EBUSY);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
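A sketch of the llseek(SEEK_HOLE)-style caller this serves (fragment; 'os', 'object' and 'start' are assumed set up, and the EBUSY fallback described is what hole-seeking callers typically do):

	uint64_t off = start;	/* search from here */
	int err = dmu_offset_next(os, object, B_TRUE /* hole */, &off);

	if (err == 0) {
		/* 'off' is now the start of the next hole */
	} else if (err == EBUSY) {
		/* dnode is dirty and the sync tunable is off; callers */
		/* typically report a single hole at end-of-file instead */
	} else if (err == ESRCH) {
		/* no hole found past 'start' */
	}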
+
+void
+__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ dnode_phys_t *dnp = dn->dn_phys;
+
+ doi->doi_data_block_size = dn->dn_datablksz;
+ doi->doi_metadata_block_size = dn->dn_indblkshift ?
+ 1ULL << dn->dn_indblkshift : 0;
+ doi->doi_type = dn->dn_type;
+ doi->doi_bonus_type = dn->dn_bonustype;
+ doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
+ doi->doi_indirection = dn->dn_nlevels;
+ doi->doi_checksum = dn->dn_checksum;
+ doi->doi_compress = dn->dn_compress;
+ doi->doi_nblkptr = dn->dn_nblkptr;
+ doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+ doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+ doi->doi_fill_count = 0;
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
+ doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
+}
+
+void
+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ mutex_enter(&dn->dn_mtx);
+
+ __dmu_object_info_from_dnode(dn, doi);
+
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Get information on a DMU object.
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int
+dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
+{
+ dnode_t *dn;
+ int err = dnode_hold(os, object, FTAG, &dn);
+
+ if (err)
+ return (err);
+
+ if (doi != NULL)
+ dmu_object_info_from_dnode(dn, doi);
+
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+/*
+ * As above, but faster; can be used when you have a held dbuf in hand.
+ */
+void
+dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ DB_DNODE_ENTER(db);
+ dmu_object_info_from_dnode(DB_DNODE(db), doi);
+ DB_DNODE_EXIT(db);
+}
+
+/*
+ * Faster still when you only care about the size.
+ */
+void
+dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
+ u_longlong_t *nblk512)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ *blksize = dn->dn_datablksz;
+ /* add in number of slots used for the dnode itself */
+ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
+ SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
+ DB_DNODE_EXIT(db);
+}
+
+void
+dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ *dnsize = dn->dn_num_slots << DNODE_SHIFT;
+ DB_DNODE_EXIT(db);
+}
+
+void
+byteswap_uint64_array(void *vbuf, size_t size)
+{
+ uint64_t *buf = vbuf;
+ size_t count = size >> 3;
+ int i;
+
+ ASSERT((size & 7) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_64(buf[i]);
+}
+
+void
+byteswap_uint32_array(void *vbuf, size_t size)
+{
+ uint32_t *buf = vbuf;
+ size_t count = size >> 2;
+ int i;
+
+ ASSERT((size & 3) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_32(buf[i]);
+}
+
+void
+byteswap_uint16_array(void *vbuf, size_t size)
+{
+ uint16_t *buf = vbuf;
+ size_t count = size >> 1;
+ int i;
+
+ ASSERT((size & 1) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_16(buf[i]);
+}
+
+/* ARGSUSED */
+void
+byteswap_uint8_array(void *vbuf, size_t size)
+{
+}
+
+void
+dmu_init(void)
+{
+ abd_init();
+ zfs_dbgmsg_init();
+ sa_cache_init();
+ dmu_objset_init();
+ dnode_init();
+ zfetch_init();
+ dmu_tx_init();
+ l2arc_init();
+ arc_init();
+ dbuf_init();
+}
+
+void
+dmu_fini(void)
+{
+ arc_fini(); /* arc depends on l2arc, so arc must go first */
+ l2arc_fini();
+ dmu_tx_fini();
+ zfetch_fini();
+ dbuf_fini();
+ dnode_fini();
+ dmu_objset_fini();
+ sa_cache_fini();
+ zfs_dbgmsg_fini();
+ abd_fini();
+}
+
+EXPORT_SYMBOL(dmu_bonus_hold);
+EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
+EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
+EXPORT_SYMBOL(dmu_buf_rele_array);
+EXPORT_SYMBOL(dmu_prefetch);
+EXPORT_SYMBOL(dmu_free_range);
+EXPORT_SYMBOL(dmu_free_long_range);
+EXPORT_SYMBOL(dmu_free_long_object);
+EXPORT_SYMBOL(dmu_read);
+EXPORT_SYMBOL(dmu_read_by_dnode);
+EXPORT_SYMBOL(dmu_write);
+EXPORT_SYMBOL(dmu_write_by_dnode);
+EXPORT_SYMBOL(dmu_prealloc);
+EXPORT_SYMBOL(dmu_object_info);
+EXPORT_SYMBOL(dmu_object_info_from_dnode);
+EXPORT_SYMBOL(dmu_object_info_from_db);
+EXPORT_SYMBOL(dmu_object_size_from_db);
+EXPORT_SYMBOL(dmu_object_dnsize_from_db);
+EXPORT_SYMBOL(dmu_object_set_nlevels);
+EXPORT_SYMBOL(dmu_object_set_blocksize);
+EXPORT_SYMBOL(dmu_object_set_maxblkid);
+EXPORT_SYMBOL(dmu_object_set_checksum);
+EXPORT_SYMBOL(dmu_object_set_compress);
+EXPORT_SYMBOL(dmu_offset_next);
+EXPORT_SYMBOL(dmu_write_policy);
+EXPORT_SYMBOL(dmu_sync);
+EXPORT_SYMBOL(dmu_request_arcbuf);
+EXPORT_SYMBOL(dmu_return_arcbuf);
+EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode);
+EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf);
+EXPORT_SYMBOL(dmu_buf_hold);
+EXPORT_SYMBOL(dmu_ot);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW,
+ "Enable NOP writes");
+
+ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, ULONG, ZMOD_RW,
+ "Percentage of dirtied blocks from frees in one TXG");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
+ "Enable forcing txg sync to find holes");
+
+ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, INT, ZMOD_RW,
+ "Limit one prefetch call to this size");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_diff.c b/sys/contrib/openzfs/module/zfs/dmu_diff.c
new file mode 100644
index 000000000000..a573a2e1bd41
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_diff.c
@@ -0,0 +1,240 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_file.h>
+
+
+typedef struct dmu_diffarg {
+ zfs_file_t *da_fp; /* file to which we are reporting */
+ offset_t *da_offp;
+ int da_err; /* error that stopped diff search */
+ dmu_diff_record_t da_ddr;
+} dmu_diffarg_t;
+
+static int
+write_record(dmu_diffarg_t *da)
+{
+ zfs_file_t *fp;
+ ssize_t resid;
+
+ if (da->da_ddr.ddr_type == DDR_NONE) {
+ da->da_err = 0;
+ return (0);
+ }
+
+ fp = da->da_fp;
+ da->da_err = zfs_file_write(fp, (caddr_t)&da->da_ddr,
+ sizeof (da->da_ddr), &resid);
+ *da->da_offp += sizeof (da->da_ddr);
+ return (da->da_err);
+}
+
+static int
+report_free_dnode_range(dmu_diffarg_t *da, uint64_t first, uint64_t last)
+{
+ ASSERT(first <= last);
+ if (da->da_ddr.ddr_type != DDR_FREE ||
+ first != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_FREE;
+ da->da_ddr.ddr_first = first;
+ da->da_ddr.ddr_last = last;
+ return (0);
+ }
+ da->da_ddr.ddr_last = last;
+ return (0);
+}
+
+static int
+report_dnode(dmu_diffarg_t *da, uint64_t object, dnode_phys_t *dnp)
+{
+ ASSERT(dnp != NULL);
+ if (dnp->dn_type == DMU_OT_NONE)
+ return (report_free_dnode_range(da, object, object));
+
+ if (da->da_ddr.ddr_type != DDR_INUSE ||
+ object != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_INUSE;
+ da->da_ddr.ddr_first = da->da_ddr.ddr_last = object;
+ return (0);
+ }
+ da->da_ddr.ddr_last = object;
+ return (0);
+}
+
+#define DBP_SPAN(dnp, level) \
+ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
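Worked numbers for DBP_SPAN, assuming the common meta-dnode geometry (16K dnode blocks, so dn_datablkszsec = 32, and 128K indirect blocks, so dn_indblkshift = 17; SPA_MINBLOCKSHIFT is 9 and SPA_BLKPTRSHIFT is 7):

/*
 *   level 0: 32 << 9              = 16 KiB  ->    32 dnodes per blkid
 *   level 1: 32 << (9 + (17 - 7)) = 16 MiB  -> 32768 dnodes per blkid
 *
 * diff_cb() below shifts the span right by DNODE_SHIFT to turn a hole at
 * (level, blkid) into the range of freed object numbers it covers.
 */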
+
+/* ARGSUSED */
+static int
+diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ dmu_diffarg_t *da = arg;
+ int err = 0;
+
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ return (SET_ERROR(EINTR));
+
+ if (zb->zb_level == ZB_DNODE_LEVEL ||
+ zb->zb_object != DMU_META_DNODE_OBJECT)
+ return (0);
+
+ if (BP_IS_HOLE(bp)) {
+ uint64_t span = DBP_SPAN(dnp, zb->zb_level);
+ uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
+
+ err = report_free_dnode_range(da, dnobj,
+ dnobj + (span >> DNODE_SHIFT) - 1);
+ if (err)
+ return (err);
+ } else if (zb->zb_level == 0) {
+ dnode_phys_t *blk;
+ arc_buf_t *abuf;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+ int zio_flags = ZIO_FLAG_CANFAIL;
+ int i;
+
+ if (BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &aflags, zb) != 0)
+ return (SET_ERROR(EIO));
+
+ blk = abuf->b_data;
+ for (i = 0; i < epb; i += blk[i].dn_extra_slots + 1) {
+ uint64_t dnobj = (zb->zb_blkid <<
+ (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+ err = report_dnode(da, dnobj, blk+i);
+ if (err)
+ break;
+ }
+ arc_buf_destroy(abuf, &abuf);
+ if (err)
+ return (err);
+ /* Don't care about the data blocks */
+ return (TRAVERSE_VISIT_NO_CHILDREN);
+ }
+ return (0);
+}
+
+int
+dmu_diff(const char *tosnap_name, const char *fromsnap_name,
+ zfs_file_t *fp, offset_t *offp)
+{
+ dmu_diffarg_t da;
+ dsl_dataset_t *fromsnap;
+ dsl_dataset_t *tosnap;
+ dsl_pool_t *dp;
+ int error;
+ uint64_t fromtxg;
+
+ if (strchr(tosnap_name, '@') == NULL ||
+ strchr(fromsnap_name, '@') == NULL)
+ return (SET_ERROR(EINVAL));
+
+ error = dsl_pool_hold(tosnap_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap);
+ if (error != 0) {
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) {
+ dsl_dataset_rele(fromsnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg;
+ dsl_dataset_rele(fromsnap, FTAG);
+
+ dsl_dataset_long_hold(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ da.da_fp = fp;
+ da.da_offp = offp;
+ da.da_ddr.ddr_type = DDR_NONE;
+ da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0;
+ da.da_err = 0;
+
+ /*
+ * Since zfs diff only looks at dnodes which are stored in plaintext
+ * (other than bonus buffers), we don't technically need to decrypt
+ * the dataset to perform this operation. However, the command line
+ * utility will still fail if the keys are not loaded because the
+ * dataset isn't mounted and because it will fail when it attempts to
+ * call the ZFS_IOC_OBJ_TO_STATS ioctl.
+ */
+ error = traverse_dataset(tosnap, fromtxg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT,
+ diff_cb, &da);
+
+ if (error != 0) {
+ da.da_err = error;
+ } else {
+ /* we set the da.da_err we return as side-effect */
+ (void) write_record(&da);
+ }
+
+ dsl_dataset_long_rele(tosnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+
+ return (da.da_err);
+}
diff --git a/sys/contrib/openzfs/module/zfs/dmu_object.c b/sys/contrib/openzfs/module/zfs/dmu_object.c
new file mode 100644
index 000000000000..12cdbd68b104
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_object.c
@@ -0,0 +1,523 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ */
+
+#include <sys/dbuf.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_dataset.h>
+
+/*
+ * Each of the concurrent object allocators will grab
+ * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
+ * grab 128 slots, which is 4 blocks worth. This was experimentally
+ * determined to be the lowest value that eliminates the measurable effect
+ * of lock contention from this code path.
+ */
+int dmu_object_alloc_chunk_shift = 7;
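Spelling out the arithmetic behind the comment above (values assume the usual 16K dnode block and 512-byte dnode, i.e. DNODES_PER_BLOCK = 32):

/*
 *   chunk size = 1 << dmu_object_alloc_chunk_shift = 1 << 7 = 128 slots
 *   128 slots / 32 slots per dnode block           = 4 blocks per chunk
 */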
+
+static uint64_t
+dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+{
+ uint64_t object;
+ uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
+ (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
+ dnode_t *dn = NULL;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ boolean_t restarted = B_FALSE;
+ uint64_t *cpuobj = NULL;
+ int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+ int error;
+
+ cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
+ os->os_obj_next_percpu_len];
+
+ if (dn_slots == 0) {
+ dn_slots = DNODE_MIN_SLOTS;
+ } else {
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+ }
+
+ /*
+ * The "chunk" of dnodes that is assigned to a CPU-specific
+ * allocator needs to be at least one block's worth, to avoid
+ * lock contention on the dbuf. It can be at most one L1 block's
+ * worth, so that the "rescan after polishing off a L1's worth"
+ * logic below will be sure to kick in.
+ */
+ if (dnodes_per_chunk < DNODES_PER_BLOCK)
+ dnodes_per_chunk = DNODES_PER_BLOCK;
+ if (dnodes_per_chunk > L1_dnode_count)
+ dnodes_per_chunk = L1_dnode_count;
+
+ /*
+ * The caller requested the dnode be returned as a performance
+ * optimization in order to avoid releasing the hold only to
+	 * immediately reacquire it. Since the caller is responsible
+	 * for releasing the hold, they must provide the tag.
+ */
+ if (allocated_dnode != NULL) {
+ ASSERT3P(tag, !=, NULL);
+ } else {
+ ASSERT3P(tag, ==, NULL);
+ tag = FTAG;
+ }
+
+ object = *cpuobj;
+ for (;;) {
+ /*
+ * If we finished a chunk of dnodes, get a new one from
+ * the global allocator.
+ */
+ if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
+ (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
+ dn_slots)) {
+ DNODE_STAT_BUMP(dnode_alloc_next_chunk);
+ mutex_enter(&os->os_obj_lock);
+ ASSERT0(P2PHASE(os->os_obj_next_chunk,
+ dnodes_per_chunk));
+ object = os->os_obj_next_chunk;
+
+ /*
+ * Each time we polish off a L1 bp worth of dnodes
+ * (2^12 objects), move to another L1 bp that's
+ * still reasonably sparse (at most 1/4 full). Look
+ * from the beginning at most once per txg. If we
+ * still can't allocate from that L1 block, search
+ * for an empty L0 block, which will quickly skip
+ * to the end of the metadnode if no nearby L0
+ * blocks are empty. This fallback avoids a
+ * pathology where full dnode blocks containing
+ * large dnodes appear sparse because they have a
+ * low blk_fill, leading to many failed allocation
+ * attempts. In the long term a better mechanism to
+ * search for sparse metadnode regions, such as
+ * spacemaps, could be implemented.
+ *
+ * os_scan_dnodes is set during txg sync if enough
+ * objects have been freed since the previous
+ * rescan to justify backfilling again.
+ *
+ * Note that dmu_traverse depends on the behavior
+ * that we use multiple blocks of the dnode object
+ * before going back to reuse objects. Any change
+ * to this algorithm should preserve that property
+ * or find another solution to the issues described
+ * in traverse_visitbp.
+ */
+ if (P2PHASE(object, L1_dnode_count) == 0) {
+ uint64_t offset;
+ uint64_t blkfill;
+ int minlvl;
+ if (os->os_rescan_dnodes) {
+ offset = 0;
+ os->os_rescan_dnodes = B_FALSE;
+ } else {
+ offset = object << DNODE_SHIFT;
+ }
+ blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
+ minlvl = restarted ? 1 : 2;
+ restarted = B_TRUE;
+ error = dnode_next_offset(DMU_META_DNODE(os),
+ DNODE_FIND_HOLE, &offset, minlvl,
+ blkfill, 0);
+ if (error == 0) {
+ object = offset >> DNODE_SHIFT;
+ }
+ }
+ /*
+ * Note: if "restarted", we may find a L0 that
+ * is not suitably aligned.
+ */
+ os->os_obj_next_chunk =
+ P2ALIGN(object, dnodes_per_chunk) +
+ dnodes_per_chunk;
+ (void) atomic_swap_64(cpuobj, object);
+ mutex_exit(&os->os_obj_lock);
+ }
+
+ /*
+ * The value of (*cpuobj) before adding dn_slots is the object
+ * ID assigned to us. The value afterwards is the object ID
+ * assigned to whoever wants to do an allocation next.
+ */
+ object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
+
+ /*
+ * XXX We should check for an i/o error here and return
+ * up to our caller. Actually we should pre-read it in
+ * dmu_tx_assign(), but there is currently no mechanism
+ * to do so.
+ */
+ error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
+ dn_slots, tag, &dn);
+ if (error == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ /*
+ * Another thread could have allocated it; check
+ * again now that we have the struct lock.
+ */
+ if (dn->dn_type == DMU_OT_NONE) {
+ dnode_allocate(dn, ot, blocksize,
+ indirect_blockshift, bonustype,
+ bonuslen, dn_slots, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_tx_add_new_object(tx, dn);
+
+ /*
+ * Caller requested the allocated dnode be
+ * returned and is responsible for the hold.
+ */
+ if (allocated_dnode != NULL)
+ *allocated_dnode = dn;
+ else
+ dnode_rele(dn, tag);
+
+ return (object);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, tag);
+ DNODE_STAT_BUMP(dnode_alloc_race);
+ }
+
+ /*
+ * Skip to next known valid starting point on error. This
+ * is the start of the next block of dnodes.
+ */
+ if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
+ object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
+ DNODE_STAT_BUMP(dnode_alloc_next_block);
+ }
+ (void) atomic_swap_64(cpuobj, object);
+ }
+}
+
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+ bonuslen, 0, NULL, NULL, tx);
+}
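A sketch of object allocation from open context (illustrative helper; DMU_NEW_OBJECT lets the tx hold cover an object that does not exist yet, and the object and bonus types chosen here are arbitrary):

static int
example_alloc_object(objset_t *os, uint64_t *objp)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	*objp = dmu_object_alloc(os, DMU_OT_UINT64_OTHER,
	    0 /* default block size */, DMU_OT_NONE, 0, tx);
	dmu_tx_commit(tx);
	return (0);
}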
+
+uint64_t
+dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ dmu_tx_t *tx)
+{
+ return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
+ bonustype, bonuslen, 0, NULL, NULL, tx);
+}
+
+uint64_t
+dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+ bonuslen, dnodesize, NULL, NULL, tx));
+}
+
+/*
+ * Allocate a new object and return a pointer to the newly allocated dnode
+ * via the allocated_dnode argument. The returned dnode will be held and
+ * the caller is responsible for releasing the hold by calling dnode_rele().
+ */
+uint64_t
+dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
+ bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
+}
+
+int
+dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ int err;
+
+ if (dn_slots == 0)
+ dn_slots = DNODE_MIN_SLOTS;
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+
+ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
+ return (SET_ERROR(EBADF));
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
+ dmu_tx_add_new_object(tx, dn);
+
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+int
+dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
+}
+
+int
+dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+ boolean_t keep_spill, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ int err;
+
+ if (dn_slots == 0)
+ dn_slots = DNODE_MIN_SLOTS;
+
+ if (object == DMU_META_DNODE_OBJECT)
+ return (SET_ERROR(EBADF));
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
+ keep_spill, tx);
+
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ dbuf_rm_spill(dn, tx);
+ dnode_rm_spill(dn, tx);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+ /*
+ * If we don't create this free range, we'll leak indirect blocks when
+ * we get to freeing the dnode in syncing context.
+ */
+ dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
+ dnode_free(dn, tx);
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+/*
+ * Return (in *objectp) the next object which is allocated (or a hole)
+ * after *object, taking into account only objects that may have been modified
+ * after the specified txg.
+ */
+int
+dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
+{
+ uint64_t offset;
+ uint64_t start_obj;
+ struct dsl_dataset *ds = os->os_dsl_dataset;
+ int error;
+
+ if (*objectp == 0) {
+ start_obj = 1;
+ } else if (ds && dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_LARGE_DNODE)) {
+ uint64_t i = *objectp + 1;
+ uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
+ dmu_object_info_t doi;
+
+ /*
+ * Scan through the remaining meta dnode block. The contents
+ * of each slot in the block are known so it can be quickly
+ * checked. If the block is exhausted without a match then
+ * hand off to dnode_next_offset() for further scanning.
+ */
+ while (i <= last_obj) {
+ error = dmu_object_info(os, i, &doi);
+ if (error == ENOENT) {
+ if (hole) {
+ *objectp = i;
+ return (0);
+ } else {
+ i++;
+ }
+ } else if (error == EEXIST) {
+ i++;
+ } else if (error == 0) {
+ if (hole) {
+ i += doi.doi_dnodesize >> DNODE_SHIFT;
+ } else {
+ *objectp = i;
+ return (0);
+ }
+ } else {
+ return (error);
+ }
+ }
+
+ start_obj = i;
+ } else {
+ start_obj = *objectp + 1;
+ }
+
+ offset = start_obj << DNODE_SHIFT;
+
+ error = dnode_next_offset(DMU_META_DNODE(os),
+ (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
+
+ *objectp = offset >> DNODE_SHIFT;
+
+ return (error);
+}
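An illustrative walk over every allocated object using this interface (helper name made up; the walk is assumed to end with ESRCH from dnode_next_offset() once the metadnode is exhausted):

static int
example_walk_objects(objset_t *os)
{
	uint64_t obj = 0;
	int err;

	while ((err = dmu_object_next(os, &obj, B_FALSE, 0)) == 0) {
		/* 'obj' is the next allocated object; examine it here */
	}
	return (err == ESRCH ? 0 : err);
}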
+
+/*
+ * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
+ * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
+ *
+ * Only for use from syncing context, on MOS objects.
+ */
+void
+dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+ if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
+ dnode_rele(dn, FTAG);
+ return;
+ }
+ ASSERT3U(dn->dn_type, ==, old_type);
+ ASSERT0(dn->dn_maxblkid);
+
+ /*
+ * We must initialize the ZAP data before changing the type,
+ * so that concurrent calls to *_is_zapified() can determine if
+ * the object has been completely zapified by checking the type.
+ */
+ mzap_create_impl(dn, 0, 0, tx);
+
+ dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
+ DMU_OTN_ZAP_METADATA;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+
+ spa_feature_incr(dmu_objset_spa(mos),
+ SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+}
+
+void
+dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ dmu_object_type_t t;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+ t = dn->dn_type;
+ dnode_rele(dn, FTAG);
+
+ if (t == DMU_OTN_ZAP_METADATA) {
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+ }
+ VERIFY0(dmu_object_free(mos, object, tx));
+}
+
+EXPORT_SYMBOL(dmu_object_alloc);
+EXPORT_SYMBOL(dmu_object_alloc_ibs);
+EXPORT_SYMBOL(dmu_object_alloc_dnsize);
+EXPORT_SYMBOL(dmu_object_alloc_hold);
+EXPORT_SYMBOL(dmu_object_claim);
+EXPORT_SYMBOL(dmu_object_claim_dnsize);
+EXPORT_SYMBOL(dmu_object_reclaim);
+EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
+EXPORT_SYMBOL(dmu_object_rm_spill);
+EXPORT_SYMBOL(dmu_object_free);
+EXPORT_SYMBOL(dmu_object_next);
+EXPORT_SYMBOL(dmu_object_zapify);
+EXPORT_SYMBOL(dmu_object_free_zapified);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, INT, ZMOD_RW,
+ "CPU-specific allocator grabs 2^N objects at once");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c
new file mode 100644
index 000000000000..bfb4adf262d5
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c
@@ -0,0 +1,3044 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/cred.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/zvol.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dmu_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/sa.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dsl_destroy.h>
+#include <sys/vdev.h>
+#include <sys/zfeature.h>
+#include <sys/policy.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu_recv.h>
+#include <sys/zfs_project.h>
+#include "zfs_namecheck.h"
+
+/*
+ * Needed to close a window in dnode_move() that allows the objset to be freed
+ * before it can be safely accessed.
+ */
+krwlock_t os_lock;
+
+/*
+ * Tunable to overwrite the maximum number of threads for the parallelization
+ * of dmu_objset_find_dp, needed to speed up the import of pools with many
+ * datasets.
+ * Default is 4 times the number of leaf vdevs.
+ */
+int dmu_find_threads = 0;
+
+/*
+ * Backfill lower metadnode objects after this many have been freed.
+ * Backfilling negatively impacts object creation rates, so only do it
+ * if there are enough holes to fill.
+ */
+int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT;
+
+static char *upgrade_tag = "upgrade_tag";
+
+static void dmu_objset_find_dp_cb(void *arg);
+
+static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb);
+static void dmu_objset_upgrade_stop(objset_t *os);
+
+void
+dmu_objset_init(void)
+{
+ rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
+}
+
+void
+dmu_objset_fini(void)
+{
+ rw_destroy(&os_lock);
+}
+
+spa_t *
+dmu_objset_spa(objset_t *os)
+{
+ return (os->os_spa);
+}
+
+zilog_t *
+dmu_objset_zil(objset_t *os)
+{
+ return (os->os_zil);
+}
+
+dsl_pool_t *
+dmu_objset_pool(objset_t *os)
+{
+ dsl_dataset_t *ds;
+
+ if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
+ return (ds->ds_dir->dd_pool);
+ else
+ return (spa_get_dsl(os->os_spa));
+}
+
+dsl_dataset_t *
+dmu_objset_ds(objset_t *os)
+{
+ return (os->os_dsl_dataset);
+}
+
+dmu_objset_type_t
+dmu_objset_type(objset_t *os)
+{
+ return (os->os_phys->os_type);
+}
+
+void
+dmu_objset_name(objset_t *os, char *buf)
+{
+ dsl_dataset_name(os->os_dsl_dataset, buf);
+}
+
+uint64_t
+dmu_objset_id(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+
+ return (ds ? ds->ds_object : 0);
+}
+
+uint64_t
+dmu_objset_dnodesize(objset_t *os)
+{
+ return (os->os_dnodesize);
+}
+
+zfs_sync_type_t
+dmu_objset_syncprop(objset_t *os)
+{
+ return (os->os_sync);
+}
+
+zfs_logbias_op_t
+dmu_objset_logbias(objset_t *os)
+{
+ return (os->os_logbias);
+}
+
+static void
+checksum_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+}
+
+static void
+compression_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval != ZIO_COMPRESS_INHERIT);
+
+ os->os_compress = zio_compress_select(os->os_spa,
+ ZIO_COMPRESS_ALGO(newval), ZIO_COMPRESS_ON);
+ os->os_complevel = zio_complevel_select(os->os_spa, os->os_compress,
+ ZIO_COMPRESS_LEVEL(newval), ZIO_COMPLEVEL_DEFAULT);
+}
+
+static void
+copies_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval > 0);
+ ASSERT(newval <= spa_max_replication(os->os_spa));
+
+ os->os_copies = newval;
+}
+
+static void
+dedup_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+ spa_t *spa = os->os_spa;
+ enum zio_checksum checksum;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
+
+ os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
+ os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
+}
+
+static void
+primary_cache_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
+ newval == ZFS_CACHE_METADATA);
+
+ os->os_primary_cache = newval;
+}
+
+static void
+secondary_cache_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
+ newval == ZFS_CACHE_METADATA);
+
+ os->os_secondary_cache = newval;
+}
+
+static void
+sync_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
+ newval == ZFS_SYNC_DISABLED);
+
+ os->os_sync = newval;
+ if (os->os_zil)
+ zil_set_sync(os->os_zil, newval);
+}
+
+static void
+redundant_metadata_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
+ newval == ZFS_REDUNDANT_METADATA_MOST);
+
+ os->os_redundant_metadata = newval;
+}
+
+static void
+dnodesize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ switch (newval) {
+ case ZFS_DNSIZE_LEGACY:
+ os->os_dnodesize = DNODE_MIN_SIZE;
+ break;
+ case ZFS_DNSIZE_AUTO:
+ /*
+ * Choose a dnode size that will work well for most
+ * workloads if the user specified "auto". Future code
+ * improvements could dynamically select a dnode size
+ * based on observed workload patterns.
+ */
+ os->os_dnodesize = DNODE_MIN_SIZE * 2;
+ break;
+ case ZFS_DNSIZE_1K:
+ case ZFS_DNSIZE_2K:
+ case ZFS_DNSIZE_4K:
+ case ZFS_DNSIZE_8K:
+ case ZFS_DNSIZE_16K:
+ os->os_dnodesize = newval;
+ break;
+ }
+}
+
+static void
+smallblk_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval <= SPA_MAXBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ os->os_zpl_special_smallblock = newval;
+}
+
+static void
+logbias_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
+ newval == ZFS_LOGBIAS_THROUGHPUT);
+ os->os_logbias = newval;
+ if (os->os_zil)
+ zil_set_logbias(os->os_zil, newval);
+}
+
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ os->os_recordsize = newval;
+}
+
+void
+dmu_objset_byteswap(void *buf, size_t size)
+{
+ objset_phys_t *osp = buf;
+
+ ASSERT(size == OBJSET_PHYS_SIZE_V1 || size == OBJSET_PHYS_SIZE_V2 ||
+ size == sizeof (objset_phys_t));
+ dnode_byteswap(&osp->os_meta_dnode);
+ byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
+ osp->os_type = BSWAP_64(osp->os_type);
+ osp->os_flags = BSWAP_64(osp->os_flags);
+ if (size >= OBJSET_PHYS_SIZE_V2) {
+ dnode_byteswap(&osp->os_userused_dnode);
+ dnode_byteswap(&osp->os_groupused_dnode);
+ if (size >= sizeof (objset_phys_t))
+ dnode_byteswap(&osp->os_projectused_dnode);
+ }
+}
+
+/*
+ * The hash is a CRC-based hash of the objset_t pointer and the object number.
+ */
+static uint64_t
+dnode_hash(const objset_t *os, uint64_t obj)
+{
+ uintptr_t osv = (uintptr_t)os;
+ uint64_t crc = -1ULL;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ /*
+ * The low 6 bits of the pointer don't have much entropy, because
+	 * the objset_t is more than 2^6 bytes long.
+ */
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
+
+ crc ^= (osv>>14) ^ (obj>>24);
+
+ return (crc);
+}
+
+static unsigned int
+dnode_multilist_index_func(multilist_t *ml, void *obj)
+{
+ dnode_t *dn = obj;
+ return (dnode_hash(dn->dn_objset, dn->dn_object) %
+ multilist_get_num_sublists(ml));
+}
+
+/*
+ * Instantiates the objset_t in-memory structure corresponding to the
+ * objset_phys_t that's pointed to by the specified blkptr_t.
+ */
+int
+dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ objset_t **osp)
+{
+ objset_t *os;
+ int i, err;
+
+ ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
+ ASSERT(!BP_IS_REDACTED(bp));
+
+ /*
+ * We need the pool config lock to get properties.
+ */
+ ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+ /*
+ * The $ORIGIN dataset (if it exists) doesn't have an associated
+ * objset, so there's no reason to open it. The $ORIGIN dataset
+ * will not exist on pools older than SPA_VERSION_ORIGIN.
+ */
+ if (ds != NULL && spa_get_dsl(spa) != NULL &&
+ spa_get_dsl(spa)->dp_origin_snap != NULL) {
+ ASSERT3P(ds->ds_dir, !=,
+ spa_get_dsl(spa)->dp_origin_snap->ds_dir);
+ }
+
+ os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
+ os->os_dsl_dataset = ds;
+ os->os_spa = spa;
+ os->os_rootbp = bp;
+ if (!BP_IS_HOLE(os->os_rootbp)) {
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ zbookmark_phys_t zb;
+ int size;
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ if (DMU_OS_IS_L2CACHEABLE(os))
+ aflags |= ARC_FLAG_L2CACHE;
+
+ if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) {
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ ASSERT(BP_IS_AUTHENTICATED(bp));
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+
+ dprintf_bp(os->os_rootbp, "reading %s", "");
+ err = arc_read(NULL, spa, os->os_rootbp,
+ arc_getbuf_func, &os->os_phys_buf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+ if (err != 0) {
+ kmem_free(os, sizeof (objset_t));
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = SET_ERROR(EIO);
+ return (err);
+ }
+
+ if (spa_version(spa) < SPA_VERSION_USERSPACE)
+ size = OBJSET_PHYS_SIZE_V1;
+ else if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_PROJECT_QUOTA))
+ size = OBJSET_PHYS_SIZE_V2;
+ else
+ size = sizeof (objset_phys_t);
+
+ /* Increase the blocksize if we are permitted. */
+ if (arc_buf_size(os->os_phys_buf) < size) {
+ arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
+ ARC_BUFC_METADATA, size);
+ bzero(buf->b_data, size);
+ bcopy(os->os_phys_buf->b_data, buf->b_data,
+ arc_buf_size(os->os_phys_buf));
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
+ os->os_phys_buf = buf;
+ }
+
+ os->os_phys = os->os_phys_buf->b_data;
+ os->os_flags = os->os_phys->os_flags;
+ } else {
+ int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
+ sizeof (objset_phys_t) : OBJSET_PHYS_SIZE_V1;
+ os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
+ ARC_BUFC_METADATA, size);
+ os->os_phys = os->os_phys_buf->b_data;
+ bzero(os->os_phys, size);
+ }
+ /*
+ * These properties will be filled in by the logic in zfs_get_zplprop()
+ * when they are queried for the first time.
+ */
+ os->os_version = OBJSET_PROP_UNINITIALIZED;
+ os->os_normalization = OBJSET_PROP_UNINITIALIZED;
+ os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
+ os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
+
+ /*
+ * Note: the changed_cb will be called once before the register
+ * func returns, thus changing the checksum/compression from the
+ * default (fletcher2/off). Snapshots don't need to know about
+ * checksum/compression/copies.
+ */
+ if (ds != NULL) {
+ os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0);
+
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
+ primary_cache_changed_cb, os);
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
+ secondary_cache_changed_cb, os);
+ }
+ if (!ds->ds_is_snapshot) {
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM),
+ checksum_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION),
+ compression_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_COPIES),
+ copies_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DEDUP),
+ dedup_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_LOGBIAS),
+ logbias_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SYNC),
+ sync_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(
+ ZFS_PROP_REDUNDANT_METADATA),
+ redundant_metadata_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ recordsize_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DNODESIZE),
+ dnodesize_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(
+ ZFS_PROP_SPECIAL_SMALL_BLOCKS),
+ smallblk_changed_cb, os);
+ }
+ }
+ if (err != 0) {
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
+ } else {
+ /* It's the meta-objset. */
+ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ os->os_compress = ZIO_COMPRESS_ON;
+ os->os_complevel = ZIO_COMPLEVEL_DEFAULT;
+ os->os_encrypted = B_FALSE;
+ os->os_copies = spa_max_replication(spa);
+ os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
+ os->os_dedup_verify = B_FALSE;
+ os->os_logbias = ZFS_LOGBIAS_LATENCY;
+ os->os_sync = ZFS_SYNC_STANDARD;
+ os->os_primary_cache = ZFS_CACHE_ALL;
+ os->os_secondary_cache = ZFS_CACHE_ALL;
+ os->os_dnodesize = DNODE_MIN_SIZE;
+ }
+
+ if (ds == NULL || !ds->ds_is_snapshot)
+ os->os_zil_header = os->os_phys->os_zil_header;
+ os->os_zil = zil_alloc(os, &os->os_zil_header);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]),
+ dnode_multilist_index_func);
+ }
+ list_create(&os->os_dnodes, sizeof (dnode_t),
+ offsetof(dnode_t, dn_link));
+ list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ list_link_init(&os->os_evicting_node);
+
+ mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
+ os->os_obj_next_percpu_len = boot_ncpus;
+ os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
+ sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);
+
+ dnode_special_open(os, &os->os_phys->os_meta_dnode,
+ DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
+ if (OBJSET_BUF_HAS_USERUSED(os->os_phys_buf)) {
+ dnode_special_open(os, &os->os_phys->os_userused_dnode,
+ DMU_USERUSED_OBJECT, &os->os_userused_dnode);
+ dnode_special_open(os, &os->os_phys->os_groupused_dnode,
+ DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
+ if (OBJSET_BUF_HAS_PROJECTUSED(os->os_phys_buf))
+ dnode_special_open(os,
+ &os->os_phys->os_projectused_dnode,
+ DMU_PROJECTUSED_OBJECT, &os->os_projectused_dnode);
+ }
+
+ mutex_init(&os->os_upgrade_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ *osp = os;
+ return (0);
+}
+
+int
+dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
+{
+ int err = 0;
+
+ /*
+ * We need the pool_config lock to manipulate the dsl_dataset_t.
+ * Even if the dataset is long-held, we need the pool_config lock
+ * to open the objset, as it needs to get properties.
+ */
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+ mutex_enter(&ds->ds_opening_lock);
+ if (ds->ds_objset == NULL) {
+ objset_t *os;
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
+ ds, dsl_dataset_get_blkptr(ds), &os);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ if (err == 0) {
+ mutex_enter(&ds->ds_lock);
+ ASSERT(ds->ds_objset == NULL);
+ ds->ds_objset = os;
+ mutex_exit(&ds->ds_lock);
+ }
+ }
+ *osp = ds->ds_objset;
+ mutex_exit(&ds->ds_opening_lock);
+ return (err);
+}
+
+/*
+ * Holds the pool while the objset is held. Therefore only one objset
+ * can be held at a time.
+ */
+int
+dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag,
+ objset_t **osp)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ err = dsl_pool_hold(name, tag, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, tag);
+ return (err);
+ }
+
+ err = dmu_objset_from_ds(ds, osp);
+ if (err != 0) {
+ dsl_dataset_rele(ds, tag);
+ dsl_pool_rele(dp, tag);
+ }
+
+ return (err);
+}
+
+int
+dmu_objset_hold(const char *name, void *tag, objset_t **osp)
+{
+ return (dmu_objset_hold_flags(name, B_FALSE, tag, osp));
+}
+
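+/*
+ * Example (illustrative only; the dataset name below is made up): a
+ * typical read-only access pattern pairs dmu_objset_hold() with
+ * dmu_objset_rele():
+ *
+ *	objset_t *os;
+ *	int err = dmu_objset_hold("tank/fs", FTAG, &os);
+ *	if (err == 0) {
+ *		(void) dmu_objset_type(os);
+ *		dmu_objset_rele(os, FTAG);
+ *	}
+ *
+ * As noted above, the hold also holds the pool, so a caller may only
+ * hold one objset at a time this way.
+ */
+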
+static int
+dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
+ boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
+{
+ int err;
+
+ err = dmu_objset_from_ds(ds, osp);
+ if (err != 0) {
+ return (err);
+ } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+ return (SET_ERROR(EINVAL));
+ } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+ return (SET_ERROR(EROFS));
+ } else if (!readonly && decrypt &&
+ dsl_dir_incompatible_encryption_version(ds->ds_dir)) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /* if we are decrypting, we can now check MACs in os->os_phys_buf */
+ if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) {
+ zbookmark_phys_t zb;
+
+ SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
+ ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa,
+ &zb, B_FALSE);
+ if (err != 0)
+ return (err);
+
+ ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf));
+ }
+
+ return (0);
+}
+
+/*
+ * dsl_pool must not be held when this is called.
+ * Upon successful return, there will be a longhold on the dataset,
+ * and the dsl_pool will not be held.
+ */
+int
+dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ err = dsl_pool_hold(name, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_own(dp, name, flags, tag, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+ err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
+ if (err != 0) {
+ dsl_dataset_disown(ds, flags, tag);
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ /*
+ * User accounting requires the dataset to be decrypted and rw.
+ * We also don't begin user accounting during claiming to help
+ * speed up pool import times and to keep this txg reserved
+ * completely for recovery work.
+ */
+ if (!readonly && !dp->dp_spa->spa_claiming &&
+ (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) {
+ if (dmu_objset_userobjspace_upgradable(*osp) ||
+ dmu_objset_projectquota_upgradable(*osp)) {
+ dmu_objset_id_quota_upgrade(*osp);
+ } else if (dmu_objset_userused_enabled(*osp)) {
+ dmu_objset_userspace_upgrade(*osp);
+ }
+ }
+
+ dsl_pool_rele(dp, FTAG);
+ return (0);
+}
+
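+/*
+ * Example (illustrative only; the dataset name is hypothetical):
+ * ownership is the long-lived counterpart of a hold and must be
+ * released with dmu_objset_disown() using the same tag and decrypt
+ * setting:
+ *
+ *	objset_t *os;
+ *	int err = dmu_objset_own("tank/fs", DMU_OST_ZFS,
+ *	    B_TRUE /* readonly */, B_FALSE /* decrypt */, FTAG, &os);
+ *	if (err == 0) {
+ *		...
+ *		dmu_objset_disown(os, B_FALSE, FTAG);
+ *	}
+ */
+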
+int
+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+ boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
+{
+ dsl_dataset_t *ds;
+ int err;
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds);
+ if (err != 0)
+ return (err);
+
+ err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
+ if (err != 0) {
+ dsl_dataset_disown(ds, flags, tag);
+ return (err);
+ }
+
+ return (0);
+}
+
+void
+dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag)
+{
+ ds_hold_flags_t flags;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag);
+ dsl_pool_rele(dp, tag);
+}
+
+void
+dmu_objset_rele(objset_t *os, void *tag)
+{
+ dmu_objset_rele_flags(os, B_FALSE, tag);
+}
+
+/*
+ * When we are called, os MUST refer to an objset associated with a dataset
+ * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
+ * == tag. We will then release and reacquire ownership of the dataset while
+ * holding the pool config_rwlock so that no intervening namespace or
+ * ownership changes can occur.
+ *
+ * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
+ * release the hold on its dataset and acquire a new one on the dataset of the
+ * same name so that it can be partially torn down and reconstructed.
+ */
+void
+dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
+ boolean_t decrypt, void *tag)
+{
+ dsl_pool_t *dp;
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ VERIFY3P(ds, !=, NULL);
+ VERIFY3P(ds->ds_owner, ==, tag);
+ VERIFY(dsl_dataset_long_held(ds));
+
+ dsl_dataset_name(ds, name);
+ dp = ds->ds_dir->dd_pool;
+ dsl_pool_config_enter(dp, FTAG);
+ dsl_dataset_disown(ds, flags, tag);
+ VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds));
+ dsl_pool_config_exit(dp, FTAG);
+}
+
+void
+dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag)
+{
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ /*
+ * Stop upgrading thread
+ */
+ dmu_objset_upgrade_stop(os);
+ dsl_dataset_disown(os->os_dsl_dataset, flags, tag);
+}
+
+void
+dmu_objset_evict_dbufs(objset_t *os)
+{
+ dnode_t *dn_marker;
+ dnode_t *dn;
+
+ dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP);
+
+ mutex_enter(&os->os_lock);
+ dn = list_head(&os->os_dnodes);
+ while (dn != NULL) {
+ /*
+ * Skip dnodes without holds. We have to do this dance
+ * because dnode_add_ref() only works if there is already a
+ * hold. If the dnode has no holds, then it has no dbufs.
+ */
+ if (dnode_add_ref(dn, FTAG)) {
+ list_insert_after(&os->os_dnodes, dn, dn_marker);
+ mutex_exit(&os->os_lock);
+
+ dnode_evict_dbufs(dn);
+ dnode_rele(dn, FTAG);
+
+ mutex_enter(&os->os_lock);
+ dn = list_next(&os->os_dnodes, dn_marker);
+ list_remove(&os->os_dnodes, dn_marker);
+ } else {
+ dn = list_next(&os->os_dnodes, dn);
+ }
+ }
+ mutex_exit(&os->os_lock);
+
+ kmem_free(dn_marker, sizeof (dnode_t));
+
+ if (DMU_USERUSED_DNODE(os) != NULL) {
+ if (DMU_PROJECTUSED_DNODE(os) != NULL)
+ dnode_evict_dbufs(DMU_PROJECTUSED_DNODE(os));
+ dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
+ dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
+ }
+ dnode_evict_dbufs(DMU_META_DNODE(os));
+}
+
+/*
+ * Objset eviction processing is split into two pieces.
+ * The first marks the objset as evicting, evicts any dbufs that
+ * have a refcount of zero, and then queues up the objset for the
+ * second phase of eviction. Once os->os_dnodes has been cleared by
+ * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
+ * The second phase closes the special dnodes, dequeues the objset from
+ * the list of those undergoing eviction, and finally frees the objset.
+ *
+ * NOTE: Due to asynchronous eviction processing (invocation of
+ * dnode_buf_pageout()), it is possible for the meta dnode for the
+ * objset to have no holds even though os->os_dnodes is not empty.
+ */
+void
+dmu_objset_evict(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!dmu_objset_is_dirty(os, t));
+
+ if (ds)
+ dsl_prop_unregister_all(ds, os);
+
+ if (os->os_sa)
+ sa_tear_down(os);
+
+ dmu_objset_evict_dbufs(os);
+
+ mutex_enter(&os->os_lock);
+ spa_evicting_os_register(os->os_spa, os);
+ if (list_is_empty(&os->os_dnodes)) {
+ mutex_exit(&os->os_lock);
+ dmu_objset_evict_done(os);
+ } else {
+ mutex_exit(&os->os_lock);
+ }
+}
+
+void
+dmu_objset_evict_done(objset_t *os)
+{
+ ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
+ dnode_special_close(&os->os_meta_dnode);
+ if (DMU_USERUSED_DNODE(os)) {
+ if (DMU_PROJECTUSED_DNODE(os))
+ dnode_special_close(&os->os_projectused_dnode);
+ dnode_special_close(&os->os_userused_dnode);
+ dnode_special_close(&os->os_groupused_dnode);
+ }
+ zil_free(os->os_zil);
+
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
+
+ /*
+ * This is a barrier to prevent the objset from going away in
+ * dnode_move() until we can safely ensure that the objset is still in
+ * use. We consider the objset valid before the barrier and invalid
+ * after the barrier.
+ */
+ rw_enter(&os_lock, RW_READER);
+ rw_exit(&os_lock);
+
+ kmem_free(os->os_obj_next_percpu,
+ os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));
+
+ mutex_destroy(&os->os_lock);
+ mutex_destroy(&os->os_userused_lock);
+ mutex_destroy(&os->os_obj_lock);
+ mutex_destroy(&os->os_user_ptr_lock);
+ mutex_destroy(&os->os_upgrade_lock);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ multilist_destroy(os->os_dirty_dnodes[i]);
+ }
+ spa_evicting_os_deregister(os->os_spa, os);
+ kmem_free(os, sizeof (objset_t));
+}
+
+inode_timespec_t
+dmu_objset_snap_cmtime(objset_t *os)
+{
+ return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
+}
+
+objset_t *
+dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx)
+{
+ objset_t *os;
+ dnode_t *mdn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ if (blksz == 0)
+ blksz = DNODE_BLOCK_SIZE;
+ if (ibs == 0)
+ ibs = DN_MAX_INDBLKSHIFT;
+
+ if (ds != NULL)
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ else
+ VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
+
+ mdn = DMU_META_DNODE(os);
+
+ dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0,
+ DNODE_MIN_SLOTS, tx);
+
+ /*
+ * We don't want to have to increase the meta-dnode's nlevels
+ * later, because then we could do it in quiescing context while
+ * we are also accessing it in open context.
+ *
+ * This precaution is not necessary for the MOS (ds == NULL),
+ * because the MOS is only updated in syncing context.
+ * This is most fortunate: the MOS is the only objset that
+ * needs to be synced multiple times as spa_sync() iterates
+ * to convergence, so minimizing its dn_nlevels matters.
+ */
+ if (ds != NULL) {
+ if (levels == 0) {
+ levels = 1;
+
+ /*
+ * Determine the number of levels necessary for the
+ * meta-dnode to contain DN_MAX_OBJECT dnodes. Note
+ * that in order to ensure that we do not overflow
+ * 64 bits, there has to be a nlevels that gives us a
+ * number of blocks > DN_MAX_OBJECT but < 2^64.
+ * Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)
+ * (10) must be less than (64 - log2(DN_MAX_OBJECT))
+ * (16).
+ */
+ while ((uint64_t)mdn->dn_nblkptr <<
+ (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) *
+ (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
+ DN_MAX_OBJECT)
+ levels++;
+ }
+
+ mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
+ mdn->dn_nlevels = levels;
+ }
+
+ ASSERT(type != DMU_OST_NONE);
+ ASSERT(type != DMU_OST_ANY);
+ ASSERT(type < DMU_OST_NUMTYPES);
+ os->os_phys->os_type = type;
+
+ /*
+	 * Mark user accounting as complete if it is enabled and this is
+	 * not an encrypted receive.
+ */
+ if (dmu_objset_userused_enabled(os) &&
+ (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
+ os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ if (dmu_objset_userobjused_enabled(os)) {
+ ds->ds_feature_activation[
+ SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
+ os->os_phys->os_flags |=
+ OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
+ }
+ if (dmu_objset_projectquota_enabled(os)) {
+ ds->ds_feature_activation[
+ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
+ os->os_phys->os_flags |=
+ OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
+ }
+ os->os_flags = os->os_phys->os_flags;
+ }
+
+ dsl_dataset_dirty(ds, tx);
+
+ return (os);
+}
+
+/* called from dsl for meta-objset */
+objset_t *
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, dmu_tx_t *tx)
+{
+ return (dmu_objset_create_impl_dnstats(spa, ds, bp, type, 0, 0, 0, tx));
+}
+
+typedef struct dmu_objset_create_arg {
+ const char *doca_name;
+ cred_t *doca_cred;
+ proc_t *doca_proc;
+ void (*doca_userfunc)(objset_t *os, void *arg,
+ cred_t *cr, dmu_tx_t *tx);
+ void *doca_userarg;
+ dmu_objset_type_t doca_type;
+ uint64_t doca_flags;
+ dsl_crypto_params_t *doca_dcp;
+} dmu_objset_create_arg_t;
+
+/*ARGSUSED*/
+static int
+dmu_objset_create_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_create_arg_t *doca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *pdd;
+ dsl_dataset_t *parentds;
+ objset_t *parentos;
+ const char *tail;
+ int error;
+
+ if (strchr(doca->doca_name, '@') != NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ if (dataset_nestcheck(doca->doca_name) != 0)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
+ if (error != 0)
+ return (error);
+ if (tail == NULL) {
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp, NULL);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+
+ error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ doca->doca_cred, doca->doca_proc);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+
+	/* can't create below anything but filesystems (e.g. no ZVOLs) */
+ error = dsl_dataset_hold_obj(pdd->dd_pool,
+ dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+ error = dmu_objset_from_ds(parentds, &parentos);
+ if (error != 0) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+ }
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+
+ return (error);
+}
+
+static void
+dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_create_arg_t *doca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ dsl_dir_t *pdd;
+ const char *tail;
+ dsl_dataset_t *ds;
+ uint64_t obj;
+ blkptr_t *bp;
+ objset_t *os;
+ zio_t *rzio;
+
+ VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
+
+ obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
+ doca->doca_cred, doca->doca_dcp, tx);
+
+ VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ bp = dsl_dataset_get_blkptr(ds);
+ os = dmu_objset_create_impl(spa, ds, bp, doca->doca_type, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ if (doca->doca_userfunc != NULL) {
+ doca->doca_userfunc(os, doca->doca_userarg,
+ doca->doca_cred, tx);
+ }
+
+ /*
+ * The doca_userfunc() may write out some data that needs to be
+ * encrypted if the dataset is encrypted (specifically the root
+ * directory). This data must be written out before the encryption
+ * key mapping is removed by dsl_dataset_rele_flags(). Force the
+ * I/O to occur immediately by invoking the relevant sections of
+ * dsl_pool_sync().
+ */
+ if (os->os_encrypted) {
+ dsl_dataset_t *tmpds = NULL;
+ boolean_t need_sync_done = B_FALSE;
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_owner = FTAG;
+ mutex_exit(&ds->ds_lock);
+
+ rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
+ tx->tx_txg);
+ if (tmpds != NULL) {
+ dsl_dataset_sync(ds, rzio, tx);
+ need_sync_done = B_TRUE;
+ }
+ VERIFY0(zio_wait(rzio));
+
+ dmu_objset_sync_done(os, tx);
+ taskq_wait(dp->dp_sync_taskq);
+ if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
+ ASSERT3P(ds->ds_key_mapping, !=, NULL);
+ key_mapping_rele(spa, ds->ds_key_mapping, ds);
+ }
+
+ rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
+ tx->tx_txg);
+ if (tmpds != NULL) {
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ dsl_dataset_sync(ds, rzio, tx);
+ }
+ VERIFY0(zio_wait(rzio));
+
+ if (need_sync_done) {
+ ASSERT3P(ds->ds_key_mapping, !=, NULL);
+ key_mapping_rele(spa, ds->ds_key_mapping, ds);
+ dsl_dataset_sync_done(ds, tx);
+ }
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_owner = NULL;
+ mutex_exit(&ds->ds_lock);
+ }
+
+ spa_history_log_internal_ds(ds, "create", tx, " ");
+
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+}
+
+int
+dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
+ dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg)
+{
+ dmu_objset_create_arg_t doca;
+ dsl_crypto_params_t tmp_dcp = { 0 };
+
+ doca.doca_name = name;
+ doca.doca_cred = CRED();
+ doca.doca_proc = curproc;
+ doca.doca_flags = flags;
+ doca.doca_userfunc = func;
+ doca.doca_userarg = arg;
+ doca.doca_type = type;
+
+ /*
+ * Some callers (mostly for testing) do not provide a dcp on their
+ * own but various code inside the sync task will require it to be
+ * allocated. Rather than adding NULL checks throughout this code
+ * or adding dummy dcp's to all of the callers we simply create a
+ * dummy one here and use that. This zero dcp will have the same
+ * effect as asking for inheritance of all encryption params.
+ */
+ doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp;
+
+ int rv = dsl_sync_task(name,
+ dmu_objset_create_check, dmu_objset_create_sync, &doca,
+ 6, ZFS_SPACE_CHECK_NORMAL);
+
+ if (rv == 0)
+ zvol_create_minor(name);
+ return (rv);
+}
+
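+/*
+ * Example (illustrative only): creating a plain ZFS objset with no
+ * crypto params and no setup callback; NULL dcp and func are accepted
+ * as described above:
+ *
+ *	int err = dmu_objset_create("tank/newfs", DMU_OST_ZFS, 0,
+ *	    NULL, NULL, NULL);
+ */
+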
+typedef struct dmu_objset_clone_arg {
+ const char *doca_clone;
+ const char *doca_origin;
+ cred_t *doca_cred;
+ proc_t *doca_proc;
+} dmu_objset_clone_arg_t;
+
+/*ARGSUSED*/
+static int
+dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_clone_arg_t *doca = arg;
+ dsl_dir_t *pdd;
+ const char *tail;
+ int error;
+ dsl_dataset_t *origin;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ if (strchr(doca->doca_clone, '@') != NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
+ if (error != 0)
+ return (error);
+ if (tail == NULL) {
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ doca->doca_cred, doca->doca_proc);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+
+ /* You can only clone snapshots, not the head datasets. */
+ if (!origin->ds_is_snapshot) {
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+
+ return (0);
+}
+
+static void
+dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_clone_arg_t *doca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *pdd;
+ const char *tail;
+ dsl_dataset_t *origin, *ds;
+ uint64_t obj;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
+ VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
+
+ obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
+ doca->doca_cred, NULL, tx);
+
+ VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
+ dsl_dataset_name(origin, namebuf);
+ spa_history_log_internal_ds(ds, "clone", tx,
+ "origin=%s (%llu)", namebuf, (u_longlong_t)origin->ds_object);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+}
+
+int
+dmu_objset_clone(const char *clone, const char *origin)
+{
+ dmu_objset_clone_arg_t doca;
+
+ doca.doca_clone = clone;
+ doca.doca_origin = origin;
+ doca.doca_cred = CRED();
+ doca.doca_proc = curproc;
+
+ int rv = dsl_sync_task(clone,
+ dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
+ 6, ZFS_SPACE_CHECK_NORMAL);
+
+ if (rv == 0)
+ zvol_create_minor(clone);
+
+ return (rv);
+}
+
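+/*
+ * Example (illustrative only): the origin must be a snapshot, per
+ * dmu_objset_clone_check():
+ *
+ *	int err = dmu_objset_clone("tank/clone", "tank/fs@snap");
+ */
+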
+int
+dmu_objset_snapshot_one(const char *fsname, const char *snapname)
+{
+ int err;
+ char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
+ nvlist_t *snaps = fnvlist_alloc();
+
+ fnvlist_add_boolean(snaps, longsnap);
+ kmem_strfree(longsnap);
+ err = dsl_dataset_snapshot(snaps, NULL, NULL);
+ fnvlist_free(snaps);
+ return (err);
+}
+
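+/*
+ * Example (illustrative only): dmu_objset_snapshot_one() is a thin
+ * wrapper that builds the single-entry "fs@snap" nvlist shown above:
+ *
+ *	int err = dmu_objset_snapshot_one("tank/fs", "today");
+ */
+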
+static void
+dmu_objset_upgrade_task_cb(void *data)
+{
+ objset_t *os = data;
+
+ mutex_enter(&os->os_upgrade_lock);
+ os->os_upgrade_status = EINTR;
+ if (!os->os_upgrade_exit) {
+ int status;
+
+ mutex_exit(&os->os_upgrade_lock);
+
+ status = os->os_upgrade_cb(os);
+
+ mutex_enter(&os->os_upgrade_lock);
+
+ os->os_upgrade_status = status;
+ }
+ os->os_upgrade_exit = B_TRUE;
+ os->os_upgrade_id = 0;
+ mutex_exit(&os->os_upgrade_lock);
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
+}
+
+static void
+dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb)
+{
+ if (os->os_upgrade_id != 0)
+ return;
+
+ ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
+ dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag);
+
+ mutex_enter(&os->os_upgrade_lock);
+ if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) {
+ os->os_upgrade_exit = B_FALSE;
+ os->os_upgrade_cb = cb;
+ os->os_upgrade_id = taskq_dispatch(
+ os->os_spa->spa_upgrade_taskq,
+ dmu_objset_upgrade_task_cb, os, TQ_SLEEP);
+ if (os->os_upgrade_id == TASKQID_INVALID) {
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
+ os->os_upgrade_status = ENOMEM;
+ }
+ } else {
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
+ }
+ mutex_exit(&os->os_upgrade_lock);
+}
+
+static void
+dmu_objset_upgrade_stop(objset_t *os)
+{
+ mutex_enter(&os->os_upgrade_lock);
+ os->os_upgrade_exit = B_TRUE;
+ if (os->os_upgrade_id != 0) {
+ taskqid_t id = os->os_upgrade_id;
+
+ os->os_upgrade_id = 0;
+ mutex_exit(&os->os_upgrade_lock);
+
+ if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) {
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
+ }
+ txg_wait_synced(os->os_spa->spa_dsl_pool, 0);
+ } else {
+ mutex_exit(&os->os_upgrade_lock);
+ }
+}
+
+static void
+dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ while ((dn = multilist_sublist_head(list)) != NULL) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ ASSERT(dn->dn_dbuf->db_data_pending);
+ /*
+ * Initialize dn_zio outside dnode_sync() because the
+ * meta-dnode needs to set it outside dnode_sync().
+ */
+ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
+ ASSERT(dn->dn_zio);
+
+ ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
+ multilist_sublist_remove(list, dn);
+
+ /*
+ * See the comment above dnode_rele_task() for an explanation
+ * of why this dnode hold is always needed (even when not
+ * doing user accounting).
+ */
+ multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
+ (void) dnode_add_ref(dn, newlist);
+ multilist_insert(newlist, dn);
+
+ dnode_sync(dn, tx);
+ }
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ blkptr_t *bp = zio->io_bp;
+ objset_t *os = arg;
+ dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+ uint64_t fill = 0;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
+ ASSERT0(BP_GET_LEVEL(bp));
+
+ /*
+ * Update rootbp fill count: it should be the number of objects
+ * allocated in the object set (not counting the "special"
+ * objects that are stored in the objset_phys_t -- the meta
+ * dnode and user/group/project accounting objects).
+ */
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
+ fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
+
+ BP_SET_FILL(bp, fill);
+
+ if (os->os_dsl_dataset != NULL)
+ rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
+ *os->os_rootbp = *bp;
+ if (os->os_dsl_dataset != NULL)
+ rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ objset_t *os = arg;
+
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
+ }
+ kmem_free(bp, sizeof (*bp));
+}
+
+typedef struct sync_dnodes_arg {
+ multilist_t *sda_list;
+ int sda_sublist_idx;
+ multilist_t *sda_newlist;
+ dmu_tx_t *sda_tx;
+} sync_dnodes_arg_t;
+
+static void
+sync_dnodes_task(void *arg)
+{
+ sync_dnodes_arg_t *sda = arg;
+
+ multilist_sublist_t *ms =
+ multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
+
+ dmu_objset_sync_dnodes(ms, sda->sda_tx);
+
+ multilist_sublist_unlock(ms);
+
+ kmem_free(sda, sizeof (*sda));
+}
+
+
+/* called from dsl */
+void
+dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
+{
+ int txgoff;
+ zbookmark_phys_t zb;
+ zio_prop_t zp;
+ zio_t *zio;
+ list_t *list;
+ dbuf_dirty_record_t *dr;
+ int num_sublists;
+ multilist_t *ml;
+ blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
+ *blkptr_copy = *os->os_rootbp;
+
+ dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* XXX the write_done callback should really give us the tx... */
+ os->os_synctx = tx;
+
+ if (os->os_dsl_dataset == NULL) {
+ /*
+ * This is the MOS. If we have upgraded,
+ * spa_max_replication() could change, so reset
+ * os_copies here.
+ */
+ os->os_copies = spa_max_replication(os->os_spa);
+ }
+
+ /*
+ * Create the root block IO
+ */
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+
+ dmu_write_policy(os, NULL, 0, 0, &zp);
+
+ /*
+ * If we are either claiming the ZIL or doing a raw receive, write
+	 * out the os_phys_buf raw. Neither of these actions will affect the
+ * MAC at this point.
+ */
+ if (os->os_raw_receive ||
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
+ ASSERT(os->os_encrypted);
+ arc_convert_to_raw(os->os_phys_buf,
+ os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER,
+ DMU_OT_OBJSET, NULL, NULL, NULL);
+ }
+
+ zio = arc_write(pio, os->os_spa, tx->tx_txg,
+ blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
+ &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
+ os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+
+ /*
+ * Sync special dnodes - the parent IO for the sync is the root block
+ */
+ DMU_META_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_META_DNODE(os), tx);
+
+ os->os_phys->os_flags = os->os_flags;
+
+ if (DMU_USERUSED_DNODE(os) &&
+ DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+ DMU_USERUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_USERUSED_DNODE(os), tx);
+ DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
+ }
+
+ if (DMU_PROJECTUSED_DNODE(os) &&
+ DMU_PROJECTUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+ DMU_PROJECTUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_PROJECTUSED_DNODE(os), tx);
+ }
+
+ txgoff = tx->tx_txg & TXG_MASK;
+
+ /*
+ * We must create the list here because it uses the
+ * dn_dirty_link[] of this txg. But it may already
+ * exist because we call dsl_dataset_sync() twice per txg.
+ */
+ if (os->os_synced_dnodes == NULL) {
+ os->os_synced_dnodes =
+ multilist_create(sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[txgoff]),
+ dnode_multilist_index_func);
+ } else {
+ ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
+ offsetof(dnode_t, dn_dirty_link[txgoff]));
+ }
+
+ ml = os->os_dirty_dnodes[txgoff];
+ num_sublists = multilist_get_num_sublists(ml);
+ for (int i = 0; i < num_sublists; i++) {
+ if (multilist_sublist_is_empty_idx(ml, i))
+ continue;
+ sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
+ sda->sda_list = ml;
+ sda->sda_sublist_idx = i;
+ sda->sda_tx = tx;
+ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
+ sync_dnodes_task, sda, 0);
+ /* callback frees sda */
+ }
+ taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
+
+ list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
+ while ((dr = list_head(list)) != NULL) {
+ ASSERT0(dr->dr_dbuf->db_level);
+ list_remove(list, dr);
+ zio_nowait(dr->dr_zio);
+ }
+
+ /* Enable dnode backfill if enough objects have been freed. */
+ if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
+ os->os_rescan_dnodes = B_TRUE;
+ os->os_freed_dnodes = 0;
+ }
+
+ /*
+ * Free intent log blocks up to this tx.
+ */
+ zil_sync(os->os_zil, tx);
+ os->os_phys->os_zil_header = os->os_zil_header;
+ zio_nowait(zio);
+}
+
+boolean_t
+dmu_objset_is_dirty(objset_t *os, uint64_t txg)
+{
+ return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK]));
+}
+
+static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES];
+
+void
+dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb)
+{
+ file_cbs[ost] = cb;
+}
+
+int
+dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, const void *data,
+ zfs_file_info_t *zfi)
+{
+ file_info_cb_t *cb = file_cbs[os->os_phys->os_type];
+ if (cb == NULL)
+ return (EINVAL);
+ return (cb(bonustype, data, zfi));
+}
+
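+/*
+ * Example (illustrative only; my_get_file_info is a hypothetical
+ * callback, not an actual ZPL function): consumers register a
+ * file_info_cb_t per objset type so that dmu_get_file_info() can map
+ * bonus/SA data to a zfs_file_info_t:
+ *
+ *	dmu_objset_register_type(DMU_OST_ZFS, my_get_file_info);
+ */
+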
+boolean_t
+dmu_objset_userused_enabled(objset_t *os)
+{
+ return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
+ file_cbs[os->os_phys->os_type] != NULL &&
+ DMU_USERUSED_DNODE(os) != NULL);
+}
+
+boolean_t
+dmu_objset_userobjused_enabled(objset_t *os)
+{
+ return (dmu_objset_userused_enabled(os) &&
+ spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING));
+}
+
+boolean_t
+dmu_objset_projectquota_enabled(objset_t *os)
+{
+ return (file_cbs[os->os_phys->os_type] != NULL &&
+ DMU_PROJECTUSED_DNODE(os) != NULL &&
+ spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA));
+}
+
+typedef struct userquota_node {
+	/* must be the first field, see userquota_update_cache() */
+ char uqn_id[20 + DMU_OBJACCT_PREFIX_LEN];
+ int64_t uqn_delta;
+ avl_node_t uqn_node;
+} userquota_node_t;
+
+typedef struct userquota_cache {
+ avl_tree_t uqc_user_deltas;
+ avl_tree_t uqc_group_deltas;
+ avl_tree_t uqc_project_deltas;
+} userquota_cache_t;
+
+static int
+userquota_compare(const void *l, const void *r)
+{
+ const userquota_node_t *luqn = l;
+ const userquota_node_t *ruqn = r;
+ int rv;
+
+ /*
+ * NB: can only access uqn_id because userquota_update_cache() doesn't
+ * pass in an entire userquota_node_t.
+ */
+ rv = strcmp(luqn->uqn_id, ruqn->uqn_id);
+
+ return (TREE_ISIGN(rv));
+}
+
+static void
+do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
+{
+ void *cookie;
+ userquota_node_t *uqn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ cookie = NULL;
+ while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
+ &cookie)) != NULL) {
+ /*
+ * os_userused_lock protects against concurrent calls to
+		 * zap_increment(). It's needed because zap_increment()
+ * is not thread-safe (i.e. not atomic).
+ */
+ mutex_enter(&os->os_userused_lock);
+ VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT,
+ uqn->uqn_id, uqn->uqn_delta, tx));
+ mutex_exit(&os->os_userused_lock);
+ kmem_free(uqn, sizeof (*uqn));
+ }
+ avl_destroy(&cache->uqc_user_deltas);
+
+ cookie = NULL;
+ while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
+ &cookie)) != NULL) {
+ mutex_enter(&os->os_userused_lock);
+ VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT,
+ uqn->uqn_id, uqn->uqn_delta, tx));
+ mutex_exit(&os->os_userused_lock);
+ kmem_free(uqn, sizeof (*uqn));
+ }
+ avl_destroy(&cache->uqc_group_deltas);
+
+ if (dmu_objset_projectquota_enabled(os)) {
+ cookie = NULL;
+ while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas,
+ &cookie)) != NULL) {
+ mutex_enter(&os->os_userused_lock);
+ VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT,
+ uqn->uqn_id, uqn->uqn_delta, tx));
+ mutex_exit(&os->os_userused_lock);
+ kmem_free(uqn, sizeof (*uqn));
+ }
+ avl_destroy(&cache->uqc_project_deltas);
+ }
+}
+
+static void
+userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta)
+{
+ userquota_node_t *uqn;
+ avl_index_t idx;
+
+ ASSERT(strlen(id) < sizeof (uqn->uqn_id));
+ /*
+ * Use id directly for searching because uqn_id is the first field of
+ * userquota_node_t and fields after uqn_id won't be accessed in
+ * avl_find().
+ */
+ uqn = avl_find(avl, (const void *)id, &idx);
+ if (uqn == NULL) {
+ uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
+ strlcpy(uqn->uqn_id, id, sizeof (uqn->uqn_id));
+ avl_insert(avl, uqn, idx);
+ }
+ uqn->uqn_delta += delta;
+}
+
+static void
+do_userquota_update(objset_t *os, userquota_cache_t *cache, uint64_t used,
+ uint64_t flags, uint64_t user, uint64_t group, uint64_t project,
+ boolean_t subtract)
+{
+ if (flags & DNODE_FLAG_USERUSED_ACCOUNTED) {
+ int64_t delta = DNODE_MIN_SIZE + used;
+ char name[20];
+
+ if (subtract)
+ delta = -delta;
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)user);
+ userquota_update_cache(&cache->uqc_user_deltas, name, delta);
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)group);
+ userquota_update_cache(&cache->uqc_group_deltas, name, delta);
+
+ if (dmu_objset_projectquota_enabled(os)) {
+ (void) snprintf(name, sizeof (name), "%llx",
+ (longlong_t)project);
+ userquota_update_cache(&cache->uqc_project_deltas,
+ name, delta);
+ }
+ }
+}
+
+static void
+do_userobjquota_update(objset_t *os, userquota_cache_t *cache, uint64_t flags,
+ uint64_t user, uint64_t group, uint64_t project, boolean_t subtract)
+{
+ if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) {
+ char name[20 + DMU_OBJACCT_PREFIX_LEN];
+ int delta = subtract ? -1 : 1;
+
+ (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
+ (longlong_t)user);
+ userquota_update_cache(&cache->uqc_user_deltas, name, delta);
+
+ (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
+ (longlong_t)group);
+ userquota_update_cache(&cache->uqc_group_deltas, name, delta);
+
+ if (dmu_objset_projectquota_enabled(os)) {
+ (void) snprintf(name, sizeof (name),
+ DMU_OBJACCT_PREFIX "%llx", (longlong_t)project);
+ userquota_update_cache(&cache->uqc_project_deltas,
+ name, delta);
+ }
+ }
+}
+
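+/*
+ * Illustrative note on the cache keys built above: ids are formatted
+ * with "%llx", so (for example) uid 1000 is accumulated under the ZAP
+ * name "3e8", while object counts use the same hex id prefixed with
+ * DMU_OBJACCT_PREFIX. The deltas are flushed to the USERUSED/
+ * GROUPUSED/PROJECTUSED objects in do_userquota_cacheflush().
+ */
+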
+typedef struct userquota_updates_arg {
+ objset_t *uua_os;
+ int uua_sublist_idx;
+ dmu_tx_t *uua_tx;
+} userquota_updates_arg_t;
+
+static void
+userquota_updates_task(void *arg)
+{
+ userquota_updates_arg_t *uua = arg;
+ objset_t *os = uua->uua_os;
+ dmu_tx_t *tx = uua->uua_tx;
+ dnode_t *dn;
+ userquota_cache_t cache = { { 0 } };
+
+ multilist_sublist_t *list =
+ multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
+
+ ASSERT(multilist_sublist_head(list) == NULL ||
+ dmu_objset_userused_enabled(os));
+ avl_create(&cache.uqc_user_deltas, userquota_compare,
+ sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+ avl_create(&cache.uqc_group_deltas, userquota_compare,
+ sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+ if (dmu_objset_projectquota_enabled(os))
+ avl_create(&cache.uqc_project_deltas, userquota_compare,
+ sizeof (userquota_node_t), offsetof(userquota_node_t,
+ uqn_node));
+
+ while ((dn = multilist_sublist_head(list)) != NULL) {
+ int flags;
+ ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
+ ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
+ dn->dn_phys->dn_flags &
+ DNODE_FLAG_USERUSED_ACCOUNTED);
+
+ flags = dn->dn_id_flags;
+ ASSERT(flags);
+ if (flags & DN_ID_OLD_EXIST) {
+ do_userquota_update(os, &cache, dn->dn_oldused,
+ dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid,
+ dn->dn_oldprojid, B_TRUE);
+ do_userobjquota_update(os, &cache, dn->dn_oldflags,
+ dn->dn_olduid, dn->dn_oldgid,
+ dn->dn_oldprojid, B_TRUE);
+ }
+ if (flags & DN_ID_NEW_EXIST) {
+ do_userquota_update(os, &cache,
+ DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags,
+ dn->dn_newuid, dn->dn_newgid,
+ dn->dn_newprojid, B_FALSE);
+ do_userobjquota_update(os, &cache,
+ dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid,
+ dn->dn_newprojid, B_FALSE);
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+ dn->dn_olduid = dn->dn_newuid;
+ dn->dn_oldgid = dn->dn_newgid;
+ dn->dn_oldprojid = dn->dn_newprojid;
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (dn->dn_bonuslen == 0)
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ else
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
+ mutex_exit(&dn->dn_mtx);
+
+ multilist_sublist_remove(list, dn);
+ dnode_rele(dn, os->os_synced_dnodes);
+ }
+ do_userquota_cacheflush(os, &cache, tx);
+ multilist_sublist_unlock(list);
+ kmem_free(uua, sizeof (*uua));
+}
+
+/*
+ * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being
+ * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be
+ * evicted because the block containing the dnode can't be evicted until it is
+ * written out. However, this hold is necessary to prevent the dnode_t from
+ * being moved (via dnode_move()) while it's still referenced by
+ * dbuf_dirty_record_t:dr_dnode. And dr_dnode is needed for
+ * dirty_lightweight_leaf-type dirty records.
+ *
+ * If we are doing user-object accounting, the dnode_rele() happens from
+ * userquota_updates_task() instead.
+ */
+static void
+dnode_rele_task(void *arg)
+{
+ userquota_updates_arg_t *uua = arg;
+ objset_t *os = uua->uua_os;
+
+ multilist_sublist_t *list =
+ multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
+
+ dnode_t *dn;
+ while ((dn = multilist_sublist_head(list)) != NULL) {
+ multilist_sublist_remove(list, dn);
+ dnode_rele(dn, os->os_synced_dnodes);
+ }
+ multilist_sublist_unlock(list);
+ kmem_free(uua, sizeof (*uua));
+}
+
+/*
+ * Return TRUE if userquota updates are needed.
+ */
+static boolean_t
+dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx)
+{
+ if (!dmu_objset_userused_enabled(os))
+ return (B_FALSE);
+
+ /*
+ * If this is a raw receive just return and handle accounting
+ * later when we have the keys loaded. We also don't do user
+ * accounting during claiming since the datasets are not owned
+ * for the duration of claiming and this txg should only be
+ * used for recovery.
+ */
+ if (os->os_encrypted && dmu_objset_is_receiving(os))
+ return (B_FALSE);
+
+ if (tx->tx_txg <= os->os_spa->spa_claim_max_txg)
+ return (B_FALSE);
+
+ /* Allocate the user/group/project used objects if necessary. */
+ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
+ VERIFY0(zap_create_claim(os,
+ DMU_USERUSED_OBJECT,
+ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+ VERIFY0(zap_create_claim(os,
+ DMU_GROUPUSED_OBJECT,
+ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+ }
+
+ if (dmu_objset_projectquota_enabled(os) &&
+ DMU_PROJECTUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
+ VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT,
+ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+ }
+ return (B_TRUE);
+}
+
+/*
+ * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and
+ * also release the holds on the dnodes from dmu_objset_sync_dnodes().
+ * The caller must taskq_wait(dp_sync_taskq).
+ */
+void
+dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx)
+{
+ boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx);
+
+ int num_sublists = multilist_get_num_sublists(os->os_synced_dnodes);
+ for (int i = 0; i < num_sublists; i++) {
+ userquota_updates_arg_t *uua =
+ kmem_alloc(sizeof (*uua), KM_SLEEP);
+ uua->uua_os = os;
+ uua->uua_sublist_idx = i;
+ uua->uua_tx = tx;
+
+ /*
+ * If we don't need to update userquotas, use
+ * dnode_rele_task() to call dnode_rele()
+ */
+ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
+ need_userquota ? userquota_updates_task : dnode_rele_task,
+ uua, 0);
+ /* callback frees uua */
+ }
+}
+
+
+/*
+ * Returns a pointer to the data from which to determine the uid/gid.
+ *
+ * If a dirty record for the transaction group that is syncing can't
+ * be found, then NULL is returned. In the NULL case it is assumed
+ * the uid/gid aren't changing.
+ */
+static void *
+dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr;
+ void *data;
+
+ if (db->db_dirtycnt == 0)
+ return (db->db.db_data); /* Nothing is changing */
+
+ dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+
+ if (dr == NULL) {
+ data = NULL;
+ } else {
+ if (dr->dr_dnode->dn_bonuslen == 0 &&
+ dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
+ data = dr->dt.dl.dr_data->b_data;
+ else
+ data = dr->dt.dl.dr_data;
+ }
+
+ return (data);
+}
+
+void
+dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
+{
+ objset_t *os = dn->dn_objset;
+ void *data = NULL;
+ dmu_buf_impl_t *db = NULL;
+ int flags = dn->dn_id_flags;
+ int error;
+ boolean_t have_spill = B_FALSE;
+
+ if (!dmu_objset_userused_enabled(dn->dn_objset))
+ return;
+
+ /*
+ * Raw receives introduce a problem with user accounting. Raw
+ * receives cannot update the user accounting info because the
+ * user ids and the sizes are encrypted. To guarantee that we
+ * never end up with bad user accounting, we simply disable it
+ * during raw receives. We also disable this for normal receives
+ * so that an incremental raw receive may be done on top of an
+ * existing non-raw receive.
+ */
+ if (os->os_encrypted && dmu_objset_is_receiving(os))
+ return;
+
+ if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
+ DN_ID_CHKED_SPILL)))
+ return;
+
+ if (before && dn->dn_bonuslen != 0)
+ data = DN_BONUS(dn->dn_phys);
+ else if (!before && dn->dn_bonuslen != 0) {
+ if (dn->dn_bonus) {
+ db = dn->dn_bonus;
+ mutex_enter(&db->db_mtx);
+ data = dmu_objset_userquota_find_data(db, tx);
+ } else {
+ data = DN_BONUS(dn->dn_phys);
+ }
+ } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
+ int rf = 0;
+
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
+ rf |= DB_RF_HAVESTRUCT;
+ error = dmu_spill_hold_by_dnode(dn,
+ rf | DB_RF_MUST_SUCCEED,
+ FTAG, (dmu_buf_t **)&db);
+ ASSERT(error == 0);
+ mutex_enter(&db->db_mtx);
+ data = (before) ? db->db.db_data :
+ dmu_objset_userquota_find_data(db, tx);
+ have_spill = B_TRUE;
+ } else {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ mutex_exit(&dn->dn_mtx);
+ return;
+ }
+
+	/*
+	 * We must always call the callback in case the object type has
+	 * changed and the new type isn't an object type we track.
+	 */
+ zfs_file_info_t zfi;
+ error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);
+
+ if (before) {
+ ASSERT(data);
+ dn->dn_olduid = zfi.zfi_user;
+ dn->dn_oldgid = zfi.zfi_group;
+ dn->dn_oldprojid = zfi.zfi_project;
+ } else if (data) {
+ dn->dn_newuid = zfi.zfi_user;
+ dn->dn_newgid = zfi.zfi_group;
+ dn->dn_newprojid = zfi.zfi_project;
+ }
+
+ /*
+ * Preserve existing uid/gid when the callback can't determine
+ * what the new uid/gid are and the callback returned EEXIST.
+ * The EEXIST error tells us to just use the existing uid/gid.
+ * If we don't know what the old values are then just assign
+ * them to 0, since that is a new file being created.
+ */
+ if (!before && data == NULL && error == EEXIST) {
+ if (flags & DN_ID_OLD_EXIST) {
+ dn->dn_newuid = dn->dn_olduid;
+ dn->dn_newgid = dn->dn_oldgid;
+ dn->dn_newprojid = dn->dn_oldprojid;
+ } else {
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_newprojid = ZFS_DEFAULT_PROJID;
+ }
+ error = 0;
+ }
+
+ if (db)
+ mutex_exit(&db->db_mtx);
+
+ mutex_enter(&dn->dn_mtx);
+ if (error == 0 && before)
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (error == 0 && !before)
+ dn->dn_id_flags |= DN_ID_NEW_EXIST;
+
+ if (have_spill) {
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ } else {
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ mutex_exit(&dn->dn_mtx);
+ if (have_spill)
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
+}
+
+boolean_t
+dmu_objset_userspace_present(objset_t *os)
+{
+ return (os->os_phys->os_flags &
+ OBJSET_FLAG_USERACCOUNTING_COMPLETE);
+}
+
+boolean_t
+dmu_objset_userobjspace_present(objset_t *os)
+{
+ return (os->os_phys->os_flags &
+ OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE);
+}
+
+boolean_t
+dmu_objset_projectquota_present(objset_t *os)
+{
+ return (os->os_phys->os_flags &
+ OBJSET_FLAG_PROJECTQUOTA_COMPLETE);
+}
+
+static int
+dmu_objset_space_upgrade(objset_t *os)
+{
+ uint64_t obj;
+ int err = 0;
+
+	/*
+	 * We simply need to mark every object dirty so that it will be
+	 * synced out and accounted for.  If this is called concurrently,
+	 * or if we already did some work before crashing, that's fine,
+	 * since we track each object's accounted state independently.
+	 */
+
+ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
+ dmu_tx_t *tx;
+ dmu_buf_t *db;
+ int objerr;
+
+ mutex_enter(&os->os_upgrade_lock);
+ if (os->os_upgrade_exit)
+ err = SET_ERROR(EINTR);
+ mutex_exit(&os->os_upgrade_lock);
+ if (err != 0)
+ return (err);
+
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ return (SET_ERROR(EINTR));
+
+ objerr = dmu_bonus_hold(os, obj, FTAG, &db);
+ if (objerr != 0)
+ continue;
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, obj);
+ objerr = dmu_tx_assign(tx, TXG_WAIT);
+ if (objerr != 0) {
+ dmu_buf_rele(db, FTAG);
+ dmu_tx_abort(tx);
+ continue;
+ }
+ dmu_buf_will_dirty(db, tx);
+ dmu_buf_rele(db, FTAG);
+ dmu_tx_commit(tx);
+ }
+ return (0);
+}
+
+static int
+dmu_objset_userspace_upgrade_cb(objset_t *os)
+{
+ int err = 0;
+
+ if (dmu_objset_userspace_present(os))
+ return (0);
+ if (dmu_objset_is_snapshot(os))
+ return (SET_ERROR(EINVAL));
+ if (!dmu_objset_userused_enabled(os))
+ return (SET_ERROR(ENOTSUP));
+
+ err = dmu_objset_space_upgrade(os);
+ if (err)
+ return (err);
+
+ os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ return (0);
+}
+
+void
+dmu_objset_userspace_upgrade(objset_t *os)
+{
+ dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb);
+}
+
+static int
+dmu_objset_id_quota_upgrade_cb(objset_t *os)
+{
+ int err = 0;
+
+ if (dmu_objset_userobjspace_present(os) &&
+ dmu_objset_projectquota_present(os))
+ return (0);
+ if (dmu_objset_is_snapshot(os))
+ return (SET_ERROR(EINVAL));
+ if (!dmu_objset_userused_enabled(os))
+ return (SET_ERROR(ENOTSUP));
+ if (!dmu_objset_projectquota_enabled(os) &&
+ dmu_objset_userobjspace_present(os))
+ return (SET_ERROR(ENOTSUP));
+
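+	/*
+	 * Flag the relevant features for activation; the activation
+	 * itself is performed when the dataset is next synced.
+	 */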
+ if (dmu_objset_userobjused_enabled(os))
+ dmu_objset_ds(os)->ds_feature_activation[
+ SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
+ if (dmu_objset_projectquota_enabled(os))
+ dmu_objset_ds(os)->ds_feature_activation[
+ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
+
+ err = dmu_objset_space_upgrade(os);
+ if (err)
+ return (err);
+
+ os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ if (dmu_objset_userobjused_enabled(os))
+ os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
+ if (dmu_objset_projectquota_enabled(os))
+ os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
+
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ return (0);
+}
+
+void
+dmu_objset_id_quota_upgrade(objset_t *os)
+{
+ dmu_objset_upgrade(os, dmu_objset_id_quota_upgrade_cb);
+}
+
+boolean_t
+dmu_objset_userobjspace_upgradable(objset_t *os)
+{
+ return (dmu_objset_type(os) == DMU_OST_ZFS &&
+ !dmu_objset_is_snapshot(os) &&
+ dmu_objset_userobjused_enabled(os) &&
+ !dmu_objset_userobjspace_present(os) &&
+ spa_writeable(dmu_objset_spa(os)));
+}
+
+boolean_t
+dmu_objset_projectquota_upgradable(objset_t *os)
+{
+ return (dmu_objset_type(os) == DMU_OST_ZFS &&
+ !dmu_objset_is_snapshot(os) &&
+ dmu_objset_projectquota_enabled(os) &&
+ !dmu_objset_projectquota_present(os) &&
+ spa_writeable(dmu_objset_spa(os)));
+}
+
+void
+dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+ dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
+ usedobjsp, availobjsp);
+}
+
+uint64_t
+dmu_objset_fsid_guid(objset_t *os)
+{
+ return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
+}
+
+void
+dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
+{
+ stat->dds_type = os->os_phys->os_type;
+ if (os->os_dsl_dataset)
+ dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
+}
+
+void
+dmu_objset_stats(objset_t *os, nvlist_t *nv)
+{
+ ASSERT(os->os_dsl_dataset ||
+ os->os_phys->os_type == DMU_OST_META);
+
+ if (os->os_dsl_dataset != NULL)
+ dsl_dataset_stats(os->os_dsl_dataset, nv);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
+ os->os_phys->os_type);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
+ dmu_objset_userspace_present(os));
+}
+
+int
+dmu_objset_is_snapshot(objset_t *os)
+{
+ if (os->os_dsl_dataset != NULL)
+ return (os->os_dsl_dataset->ds_is_snapshot);
+ else
+ return (B_FALSE);
+}
+
+int
+dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen,
+ boolean_t *conflict)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ uint64_t ignored;
+
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
+ return (SET_ERROR(ENOENT));
+
+ return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
+ MT_NORMALIZE, real, maxlen, conflict));
+}
+
+int
+dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
+
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
+ return (SET_ERROR(ENOENT));
+
+ zap_cursor_init_serialized(&cursor,
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (strlen(attr.za_name) + 1 > namelen) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ (void) strlcpy(name, attr.za_name, namelen);
+ if (idp)
+ *idp = attr.za_first_integer;
+ if (case_conflict)
+ *case_conflict = attr.za_normalization_conflict;
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+ zap_cursor_fini(&cursor);
+
+ return (0);
+}
+
+int
+dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value)
+{
+ return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value));
+}
+
+int
+dmu_dir_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp)
+{
+ dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ /* there is no next dir on a snapshot! */
+ if (os->os_dsl_dataset->ds_object !=
+ dsl_dir_phys(dd)->dd_head_dataset_obj)
+ return (SET_ERROR(ENOENT));
+
+ zap_cursor_init_serialized(&cursor,
+ dd->dd_pool->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (strlen(attr.za_name) + 1 > namelen) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ (void) strlcpy(name, attr.za_name, namelen);
+ if (idp)
+ *idp = attr.za_first_integer;
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+ zap_cursor_fini(&cursor);
+
+ return (0);
+}
+
+typedef struct dmu_objset_find_ctx {
+ taskq_t *dc_tq;
+ dsl_pool_t *dc_dp;
+ uint64_t dc_ddobj;
+ char *dc_ddname; /* last component of ddobj's name */
+ int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
+ void *dc_arg;
+ int dc_flags;
+ kmutex_t *dc_error_lock;
+ int *dc_error;
+} dmu_objset_find_ctx_t;
+
+static void
+dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
+{
+ dsl_pool_t *dp = dcp->dc_dp;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ zap_cursor_t zc;
+ zap_attribute_t *attr;
+ uint64_t thisobj;
+ int err = 0;
+
+ /* don't process if there already was an error */
+ if (*dcp->dc_error != 0)
+ goto out;
+
+ /*
+ * Note: passing the name (dc_ddname) here is optional, but it
+ * improves performance because we don't need to call
+ * zap_value_search() to determine the name.
+ */
+ err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
+ if (err != 0)
+ goto out;
+
+ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+ if (dd->dd_myname[0] == '$') {
+ dsl_dir_rele(dd, FTAG);
+ goto out;
+ }
+
+ thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+ attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /*
+ * Iterate over all children.
+ */
+ if (dcp->dc_flags & DS_FIND_CHILDREN) {
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
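+			/*
+			 * Each child dir gets its own context, dispatched to
+			 * the shared taskq when one exists, otherwise
+			 * processed recursively in this thread.
+			 */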
+ dmu_objset_find_ctx_t *child_dcp =
+ kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
+ *child_dcp = *dcp;
+ child_dcp->dc_ddobj = attr->za_first_integer;
+ child_dcp->dc_ddname = spa_strdup(attr->za_name);
+ if (dcp->dc_tq != NULL)
+ (void) taskq_dispatch(dcp->dc_tq,
+ dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
+ else
+ dmu_objset_find_dp_impl(child_dcp);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
+ dsl_dataset_t *ds;
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+ if (err == 0) {
+ uint64_t snapobj;
+
+ snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ dsl_dataset_rele(ds, FTAG);
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ err = dsl_dataset_hold_obj(dp,
+ attr->za_first_integer, FTAG, &ds);
+ if (err != 0)
+ break;
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
+ dsl_dataset_rele(ds, FTAG);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ }
+ }
+
+ kmem_free(attr, sizeof (zap_attribute_t));
+
+ if (err != 0) {
+ dsl_dir_rele(dd, FTAG);
+ goto out;
+ }
+
+ /*
+ * Apply to self.
+ */
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+ /*
+ * Note: we hold the dir while calling dsl_dataset_hold_obj() so
+ * that the dir will remain cached, and we won't have to re-instantiate
+ * it (which could be expensive due to finding its name via
+ * zap_value_search()).
+ */
+ dsl_dir_rele(dd, FTAG);
+ if (err != 0)
+ goto out;
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
+ dsl_dataset_rele(ds, FTAG);
+
+out:
+ if (err != 0) {
+ mutex_enter(dcp->dc_error_lock);
+ /* only keep first error */
+ if (*dcp->dc_error == 0)
+ *dcp->dc_error = err;
+ mutex_exit(dcp->dc_error_lock);
+ }
+
+ if (dcp->dc_ddname != NULL)
+ spa_strfree(dcp->dc_ddname);
+ kmem_free(dcp, sizeof (*dcp));
+}
+
+static void
+dmu_objset_find_dp_cb(void *arg)
+{
+ dmu_objset_find_ctx_t *dcp = arg;
+ dsl_pool_t *dp = dcp->dc_dp;
+
+ /*
+ * We need to get a pool_config_lock here, as there are several
+ * assert(pool_config_held) down the stack. Getting a lock via
+ * dsl_pool_config_enter is risky, as it might be stalled by a
+ * pending writer. This would deadlock, as the write lock can
+ * only be granted when our parent thread gives up the lock.
+ * The _prio interface gives us priority over a pending writer.
+ */
+ dsl_pool_config_enter_prio(dp, FTAG);
+
+ dmu_objset_find_dp_impl(dcp);
+
+ dsl_pool_config_exit(dp, FTAG);
+}
+
+/*
+ * Find objsets under and including ddobj, call func(ds) on each.
+ * The order for the enumeration is completely undefined.
+ * func is called with dsl_pool_config held.
+ */
+int
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+ int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+{
+ int error = 0;
+ taskq_t *tq = NULL;
+ int ntasks;
+ dmu_objset_find_ctx_t *dcp;
+ kmutex_t err_lock;
+
+ mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
+ dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
+ dcp->dc_tq = NULL;
+ dcp->dc_dp = dp;
+ dcp->dc_ddobj = ddobj;
+ dcp->dc_ddname = NULL;
+ dcp->dc_func = func;
+ dcp->dc_arg = arg;
+ dcp->dc_flags = flags;
+ dcp->dc_error_lock = &err_lock;
+ dcp->dc_error = &error;
+
+ if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
+ /*
+ * In case a write lock is held we can't make use of
+ * parallelism, as down the stack of the worker threads
+ * the lock is asserted via dsl_pool_config_held.
+ * In case of a read lock this is solved by getting a read
+ * lock in each worker thread, which isn't possible in case
+ * of a writer lock. So we fall back to the synchronous path
+ * here.
+		 * In the future, dsl_pool_config_held could be made to return
+		 * true for the worker threads so that a single lock held by
+		 * this thread suffices.  For now, stay single-threaded.
+ */
+ dmu_objset_find_dp_impl(dcp);
+ mutex_destroy(&err_lock);
+
+ return (error);
+ }
+
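+	/* Default to four worker tasks per leaf vdev. */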
+ ntasks = dmu_find_threads;
+ if (ntasks == 0)
+ ntasks = vdev_count_leaves(dp->dp_spa) * 4;
+ tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks,
+ INT_MAX, 0);
+ if (tq == NULL) {
+ kmem_free(dcp, sizeof (*dcp));
+ mutex_destroy(&err_lock);
+
+ return (SET_ERROR(ENOMEM));
+ }
+ dcp->dc_tq = tq;
+
+ /* dcp will be freed by task */
+ (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
+
+ /*
+ * PORTING: this code relies on the property of taskq_wait to wait
+ * until no more tasks are queued and no more tasks are active. As
+ * we always queue new tasks from within other tasks, task_wait
+ * reliably waits for the full recursion to finish, even though we
+ * enqueue new tasks after taskq_wait has been called.
+ * On platforms other than illumos, taskq_wait may not have this
+ * property.
+ */
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ mutex_destroy(&err_lock);
+
+ return (error);
+}
+
+/*
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * The dp_config_rwlock must not be held when this is called, and it
+ * will not be held when the callback is called.
+ * Therefore this function should only be used when the pool is not changing
+ * (e.g. in syncing context), or the callback can deal with the possible races.
+ */
+static int
+dmu_objset_find_impl(spa_t *spa, const char *name,
+ int func(const char *, void *), void *arg, int flags)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ dsl_dataset_t *ds;
+ zap_cursor_t zc;
+ zap_attribute_t *attr;
+ char *child;
+ uint64_t thisobj;
+ int err;
+
+ dsl_pool_config_enter(dp, FTAG);
+
+ err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
+ if (err != 0) {
+ dsl_pool_config_exit(dp, FTAG);
+ return (err);
+ }
+
+ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+ if (dd->dd_myname[0] == '$') {
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ return (0);
+ }
+
+ thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+ attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /*
+ * Iterate over all children.
+ */
+ if (flags & DS_FIND_CHILDREN) {
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ child = kmem_asprintf("%s/%s", name, attr->za_name);
+ dsl_pool_config_exit(dp, FTAG);
+ err = dmu_objset_find_impl(spa, child,
+ func, arg, flags);
+ dsl_pool_config_enter(dp, FTAG);
+ kmem_strfree(child);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+
+ if (err != 0) {
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ kmem_free(attr, sizeof (zap_attribute_t));
+ return (err);
+ }
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ if (flags & DS_FIND_SNAPSHOTS) {
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+ if (err == 0) {
+ uint64_t snapobj;
+
+ snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ dsl_dataset_rele(ds, FTAG);
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ child = kmem_asprintf("%s@%s",
+ name, attr->za_name);
+ dsl_pool_config_exit(dp, FTAG);
+ err = func(child, arg);
+ dsl_pool_config_enter(dp, FTAG);
+ kmem_strfree(child);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ }
+ }
+
+ dsl_dir_rele(dd, FTAG);
+ kmem_free(attr, sizeof (zap_attribute_t));
+ dsl_pool_config_exit(dp, FTAG);
+
+ if (err != 0)
+ return (err);
+
+ /* Apply to self. */
+ return (func(name, arg));
+}
+
+/*
+ * See comment above dmu_objset_find_impl().
+ */
+int
+dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
+ int flags)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ error = dmu_objset_find_impl(spa, name, func, arg, flags);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+boolean_t
+dmu_objset_incompatible_encryption_version(objset_t *os)
+{
+ return (dsl_dir_incompatible_encryption_version(
+ os->os_dsl_dataset->ds_dir));
+}
+
+void
+dmu_objset_set_user(objset_t *os, void *user_ptr)
+{
+ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+ os->os_user_ptr = user_ptr;
+}
+
+void *
+dmu_objset_get_user(objset_t *os)
+{
+ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+ return (os->os_user_ptr);
+}
+
+/*
+ * Determine name of filesystem, given name of snapshot.
+ * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
+ */
+int
+dmu_fsname(const char *snapname, char *buf)
+{
+ char *atp = strchr(snapname, '@');
+ if (atp == NULL)
+ return (SET_ERROR(EINVAL));
+ if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
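+	/* Copy everything before the '@'; strlcpy() NUL-terminates. */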
+ (void) strlcpy(buf, snapname, atp - snapname + 1);
+ return (0);
+}
+
+/*
+ * Call when we think we're going to write/free space in open context
+ * to track the amount of dirty data in the open txg, which is also the
+ * amount of memory that cannot be evicted until this txg syncs.
+ *
+ * Note that there are two conditions where this can be called from
+ * syncing context:
+ *
+ * [1] When we just created the dataset, in which case we go on with
+ * updating any accounting of dirty data as usual.
+ * [2] When we are dirtying MOS data, in which case we only update the
+ * pool's accounting of dirty data.
+ */
+void
+dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
+
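+	/*
+	 * Charge the dataset's dsl_dir with the worst-case allocated
+	 * size, but track the uninflated size in the pool-wide dirty
+	 * data accounting.
+	 */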
+ if (ds != NULL) {
+ dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
+ }
+
+ dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dmu_objset_zil);
+EXPORT_SYMBOL(dmu_objset_pool);
+EXPORT_SYMBOL(dmu_objset_ds);
+EXPORT_SYMBOL(dmu_objset_type);
+EXPORT_SYMBOL(dmu_objset_name);
+EXPORT_SYMBOL(dmu_objset_hold);
+EXPORT_SYMBOL(dmu_objset_hold_flags);
+EXPORT_SYMBOL(dmu_objset_own);
+EXPORT_SYMBOL(dmu_objset_rele);
+EXPORT_SYMBOL(dmu_objset_rele_flags);
+EXPORT_SYMBOL(dmu_objset_disown);
+EXPORT_SYMBOL(dmu_objset_from_ds);
+EXPORT_SYMBOL(dmu_objset_create);
+EXPORT_SYMBOL(dmu_objset_clone);
+EXPORT_SYMBOL(dmu_objset_stats);
+EXPORT_SYMBOL(dmu_objset_fast_stat);
+EXPORT_SYMBOL(dmu_objset_spa);
+EXPORT_SYMBOL(dmu_objset_space);
+EXPORT_SYMBOL(dmu_objset_fsid_guid);
+EXPORT_SYMBOL(dmu_objset_find);
+EXPORT_SYMBOL(dmu_objset_byteswap);
+EXPORT_SYMBOL(dmu_objset_evict_dbufs);
+EXPORT_SYMBOL(dmu_objset_snap_cmtime);
+EXPORT_SYMBOL(dmu_objset_dnodesize);
+
+EXPORT_SYMBOL(dmu_objset_sync);
+EXPORT_SYMBOL(dmu_objset_is_dirty);
+EXPORT_SYMBOL(dmu_objset_create_impl_dnstats);
+EXPORT_SYMBOL(dmu_objset_create_impl);
+EXPORT_SYMBOL(dmu_objset_open_impl);
+EXPORT_SYMBOL(dmu_objset_evict);
+EXPORT_SYMBOL(dmu_objset_register_type);
+EXPORT_SYMBOL(dmu_objset_sync_done);
+EXPORT_SYMBOL(dmu_objset_userquota_get_ids);
+EXPORT_SYMBOL(dmu_objset_userused_enabled);
+EXPORT_SYMBOL(dmu_objset_userspace_upgrade);
+EXPORT_SYMBOL(dmu_objset_userspace_present);
+EXPORT_SYMBOL(dmu_objset_userobjused_enabled);
+EXPORT_SYMBOL(dmu_objset_userobjspace_upgradable);
+EXPORT_SYMBOL(dmu_objset_userobjspace_present);
+EXPORT_SYMBOL(dmu_objset_projectquota_enabled);
+EXPORT_SYMBOL(dmu_objset_projectquota_present);
+EXPORT_SYMBOL(dmu_objset_projectquota_upgradable);
+EXPORT_SYMBOL(dmu_objset_id_quota_upgrade);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c
new file mode 100644
index 000000000000..a0fd157ebc5f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c
@@ -0,0 +1,3390 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_recv.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zvol.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
+#include <sys/avl.h>
+#include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
+#include <sys/bqueue.h>
+#include <sys/objlist.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/zfs_file.h>
+
+int zfs_recv_queue_length = SPA_MAXBLOCKSIZE;
+int zfs_recv_queue_ff = 20;
+int zfs_recv_write_batch_size = 1024 * 1024;
+
+static char *dmu_recv_tag = "dmu_recv_tag";
+const char *recv_clone_name = "%recv";
+
+static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len,
+ void *buf);
+
+struct receive_record_arg {
+ dmu_replay_record_t header;
+ void *payload; /* Pointer to a buffer containing the payload */
+ /*
+ * If the record is a WRITE or SPILL, pointer to the abd containing the
+ * payload.
+ */
+ abd_t *abd;
+ int payload_size;
+ uint64_t bytes_read; /* bytes read from stream when record created */
+ boolean_t eos_marker; /* Marks the end of the stream */
+ bqueue_node_t node;
+};
+
+struct receive_writer_arg {
+ objset_t *os;
+ boolean_t byteswap;
+ bqueue_t q;
+
+ /*
+ * These three members are used to signal to the main thread when
+ * we're done.
+ */
+ kmutex_t mutex;
+ kcondvar_t cv;
+ boolean_t done;
+
+ int err;
+ boolean_t resumable;
+ boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */
+ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */
+ boolean_t full; /* this is a full send stream */
+ uint64_t last_object;
+ uint64_t last_offset;
+ uint64_t max_object; /* highest object ID referenced in stream */
+ uint64_t bytes_read; /* bytes read when current record created */
+
+ list_t write_batch;
+
+ /* Encryption parameters for the last received DRR_OBJECT_RANGE */
+ boolean_t or_crypt_params_present;
+ uint64_t or_firstobj;
+ uint64_t or_numslots;
+ uint8_t or_salt[ZIO_DATA_SALT_LEN];
+ uint8_t or_iv[ZIO_DATA_IV_LEN];
+ uint8_t or_mac[ZIO_DATA_MAC_LEN];
+ boolean_t or_byteorder;
+};
+
+typedef struct dmu_recv_begin_arg {
+ const char *drba_origin;
+ dmu_recv_cookie_t *drba_cookie;
+ cred_t *drba_cred;
+ proc_t *drba_proc;
+ dsl_crypto_params_t *drba_dcp;
+} dmu_recv_begin_arg_t;
+
+static void
+byteswap_record(dmu_replay_record_t *drr)
+{
+#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
+#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
+ drr->drr_type = BSWAP_32(drr->drr_type);
+ drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
+
+ switch (drr->drr_type) {
+ case DRR_BEGIN:
+ DO64(drr_begin.drr_magic);
+ DO64(drr_begin.drr_versioninfo);
+ DO64(drr_begin.drr_creation_time);
+ DO32(drr_begin.drr_type);
+ DO32(drr_begin.drr_flags);
+ DO64(drr_begin.drr_toguid);
+ DO64(drr_begin.drr_fromguid);
+ break;
+ case DRR_OBJECT:
+ DO64(drr_object.drr_object);
+ DO32(drr_object.drr_type);
+ DO32(drr_object.drr_bonustype);
+ DO32(drr_object.drr_blksz);
+ DO32(drr_object.drr_bonuslen);
+ DO32(drr_object.drr_raw_bonuslen);
+ DO64(drr_object.drr_toguid);
+ DO64(drr_object.drr_maxblkid);
+ break;
+ case DRR_FREEOBJECTS:
+ DO64(drr_freeobjects.drr_firstobj);
+ DO64(drr_freeobjects.drr_numobjs);
+ DO64(drr_freeobjects.drr_toguid);
+ break;
+ case DRR_WRITE:
+ DO64(drr_write.drr_object);
+ DO32(drr_write.drr_type);
+ DO64(drr_write.drr_offset);
+ DO64(drr_write.drr_logical_size);
+ DO64(drr_write.drr_toguid);
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
+ DO64(drr_write.drr_key.ddk_prop);
+ DO64(drr_write.drr_compressed_size);
+ break;
+ case DRR_WRITE_EMBEDDED:
+ DO64(drr_write_embedded.drr_object);
+ DO64(drr_write_embedded.drr_offset);
+ DO64(drr_write_embedded.drr_length);
+ DO64(drr_write_embedded.drr_toguid);
+ DO32(drr_write_embedded.drr_lsize);
+ DO32(drr_write_embedded.drr_psize);
+ break;
+ case DRR_FREE:
+ DO64(drr_free.drr_object);
+ DO64(drr_free.drr_offset);
+ DO64(drr_free.drr_length);
+ DO64(drr_free.drr_toguid);
+ break;
+ case DRR_SPILL:
+ DO64(drr_spill.drr_object);
+ DO64(drr_spill.drr_length);
+ DO64(drr_spill.drr_toguid);
+ DO64(drr_spill.drr_compressed_size);
+ DO32(drr_spill.drr_type);
+ break;
+ case DRR_OBJECT_RANGE:
+ DO64(drr_object_range.drr_firstobj);
+ DO64(drr_object_range.drr_numslots);
+ DO64(drr_object_range.drr_toguid);
+ break;
+ case DRR_REDACT:
+ DO64(drr_redact.drr_object);
+ DO64(drr_redact.drr_offset);
+ DO64(drr_redact.drr_length);
+ DO64(drr_redact.drr_toguid);
+ break;
+ case DRR_END:
+ DO64(drr_end.drr_toguid);
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
+ break;
+ default:
+ break;
+ }
+
+ if (drr->drr_type != DRR_BEGIN) {
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
+ }
+
+#undef DO64
+#undef DO32
+}
+
+static boolean_t
+redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid)
+{
+ for (int i = 0; i < num_snaps; i++) {
+ if (snaps[i] == guid)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Check that the new stream we're trying to receive is redacted with respect to
+ * a subset of the snapshots that the origin was redacted with respect to. For
+ * the reasons behind this, see the man page on redacted zfs sends and receives.
+ */
+static boolean_t
+compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps,
+ uint64_t *redact_snaps, uint64_t num_redact_snaps)
+{
+ /*
+ * Short circuit the comparison; if we are redacted with respect to
+ * more snapshots than the origin, we can't be redacted with respect
+ * to a subset.
+ */
+ if (num_redact_snaps > origin_num_snaps) {
+ return (B_FALSE);
+ }
+
+ for (int i = 0; i < num_redact_snaps; i++) {
+ if (!redact_snaps_contains(origin_snaps, origin_num_snaps,
+ redact_snaps[i])) {
+ return (B_FALSE);
+ }
+ }
+ return (B_TRUE);
+}
+
+static boolean_t
+redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin)
+{
+ uint64_t *origin_snaps;
+ uint64_t origin_num_snaps;
+ dmu_recv_cookie_t *drc = drba->drba_cookie;
+ struct drr_begin *drrb = drc->drc_drrb;
+ int featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+ int err = 0;
+ boolean_t ret = B_TRUE;
+ uint64_t *redact_snaps;
+ uint_t numredactsnaps;
+
+ /*
+ * If this is a full send stream, we're safe no matter what.
+ */
+ if (drrb->drr_fromguid == 0)
+ return (ret);
+
+ VERIFY(dsl_dataset_get_uint64_array_feature(origin,
+ SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps));
+
+ if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+ BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) ==
+ 0) {
+		/*
+		 * If the send stream was sent from the redaction bookmark or
+		 * the redacted version of the dataset, then we're safe.  Verify
+		 * that this is from a compatible redaction bookmark or
+		 * redacted dataset.
+		 */
+ if (!compatible_redact_snaps(origin_snaps, origin_num_snaps,
+ redact_snaps, numredactsnaps)) {
+ err = EINVAL;
+ }
+ } else if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
+ /*
+ * If the stream is redacted, it must be redacted with respect
+ * to a subset of what the origin is redacted with respect to.
+ * See case number 2 in the zfs man page section on redacted zfs
+ * send.
+ */
+ err = nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+ BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps);
+
+ if (err != 0 || !compatible_redact_snaps(origin_snaps,
+ origin_num_snaps, redact_snaps, numredactsnaps)) {
+ err = EINVAL;
+ }
+ } else if (!redact_snaps_contains(origin_snaps, origin_num_snaps,
+ drrb->drr_toguid)) {
+ /*
+ * If the stream isn't redacted but the origin is, this must be
+ * one of the snapshots the origin is redacted with respect to.
+ * See case number 1 in the zfs man page section on redacted zfs
+ * send.
+ */
+ err = EINVAL;
+ }
+
+ if (err != 0)
+ ret = B_FALSE;
+ return (ret);
+}
+
+/*
+ * If we previously received a stream with --large-block, we don't support
+ * receiving an incremental on top of it without --large-block. This avoids
+ * forcing a read-modify-write or trying to re-aggregate a string of WRITE
+ * records.
+ */
+static int
+recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags)
+{
+ if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS) &&
+ !(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH));
+ return (0);
+}
+
+static int
+recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
+ uint64_t fromguid, uint64_t featureflags)
+{
+ uint64_t val;
+ uint64_t children;
+ int error;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0;
+ boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
+ boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0;
+
+ /* Temporary clone name must not exist. */
+ error = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
+ 8, 1, &val);
+ if (error != ENOENT)
+ return (error == 0 ? SET_ERROR(EBUSY) : error);
+
+ /* Resume state must not be set. */
+ if (dsl_dataset_has_resume_receive_state(ds))
+ return (SET_ERROR(EBUSY));
+
+ /* New snapshot name must not exist. */
+ error = zap_lookup(dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+ drba->drba_cookie->drc_tosnap, 8, 1, &val);
+ if (error != ENOENT)
+ return (error == 0 ? SET_ERROR(EEXIST) : error);
+
+ /* Must not have children if receiving a ZVOL. */
+ error = zap_count(dp->dp_meta_objset,
+ dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children);
+ if (error != 0)
+ return (error);
+ if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS &&
+ children > 0)
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+
+ /*
+ * Check snapshot limit before receiving. We'll recheck again at the
+ * end, but might as well abort before receiving if we're already over
+ * the limit.
+ *
+ * Note that we do not check the file system limit with
+ * dsl_dir_fscount_check because the temporary %clones don't count
+ * against that limit.
+ */
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
+ NULL, drba->drba_cred, drba->drba_proc);
+ if (error != 0)
+ return (error);
+
+ if (fromguid != 0) {
+ dsl_dataset_t *snap;
+ uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+
+ /* Can't perform a raw receive on top of a non-raw receive */
+ if (!encrypted && raw)
+ return (SET_ERROR(EINVAL));
+
+ /* Encryption is incompatible with embedded data */
+ if (encrypted && embed)
+ return (SET_ERROR(EINVAL));
+
+ /* Find snapshot in this dir that matches fromguid. */
+ while (obj != 0) {
+ error = dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap);
+ if (error != 0)
+ return (SET_ERROR(ENODEV));
+ if (snap->ds_dir != ds->ds_dir) {
+ dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(ENODEV));
+ }
+ if (dsl_dataset_phys(snap)->ds_guid == fromguid)
+ break;
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_dataset_rele(snap, FTAG);
+ }
+ if (obj == 0)
+ return (SET_ERROR(ENODEV));
+
+ if (drba->drba_cookie->drc_force) {
+ drba->drba_cookie->drc_fromsnapobj = obj;
+ } else {
+ /*
+ * If we are not forcing, there must be no
+ * changes since fromsnap. Raw sends have an
+ * additional constraint that requires that
+ * no "noop" snapshots exist between fromsnap
+ * and tosnap for the IVset checking code to
+ * work properly.
+ */
+ if (dsl_dataset_modified_since_snap(ds, snap) ||
+ (raw &&
+ dsl_dataset_phys(ds)->ds_prev_snap_obj !=
+ snap->ds_object)) {
+ dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(ETXTBSY));
+ }
+ drba->drba_cookie->drc_fromsnapobj =
+ ds->ds_prev->ds_object;
+ }
+
+ if (dsl_dataset_feature_is_active(snap,
+ SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba,
+ snap)) {
+ dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = recv_check_large_blocks(snap, featureflags);
+ if (error != 0) {
+ dsl_dataset_rele(snap, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_rele(snap, FTAG);
+ } else {
+ /* if full, then must be forced */
+ if (!drba->drba_cookie->drc_force)
+ return (SET_ERROR(EEXIST));
+
+ /*
+ * We don't support using zfs recv -F to blow away
+ * encrypted filesystems. This would require the
+ * dsl dir to point to the old encryption key and
+ * the new one at the same time during the receive.
+ */
+ if ((!encrypted && raw) || encrypted)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Perform the same encryption checks we would if
+ * we were creating a new dataset from scratch.
+ */
+ if (!raw) {
+ boolean_t will_encrypt;
+
+ error = dmu_objset_create_crypt_check(
+ ds->ds_dir->dd_parent, drba->drba_dcp,
+ &will_encrypt);
+ if (error != 0)
+ return (error);
+
+ if (will_encrypt && embed)
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Check that any feature flags used in the data stream we're receiving are
+ * supported by the pool we are receiving into.
+ *
+ * Note that some of the features we explicitly check here have additional
+ * (implicit) features they depend on, but those dependencies are enforced
+ * through the zfeature_register() calls declaring the features that we
+ * explicitly check.
+ */
+static int
+recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa)
+{
+ /*
+ * Check if there are any unsupported feature flags.
+ */
+ if (!DMU_STREAM_SUPPORTED(featureflags)) {
+ return (SET_ERROR(ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE));
+ }
+
+ /* Verify pool version supports SA if SA_SPILL feature set */
+ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+ spa_version(spa) < SPA_VERSION_SA)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * LZ4 compressed, ZSTD compressed, embedded, mooched, large blocks,
+ * and large_dnodes in the stream can only be used if those pool
+ * features are enabled because we don't attempt to decompress /
+ * un-embed / un-mooch / split up the blocks / dnodes during the
+ * receive process.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_ZSTD) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Receiving redacted streams requires that redacted datasets are
+ * enabled.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_REDACTED) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_REDACTED_DATASETS))
+ return (SET_ERROR(ENOTSUP));
+
+ return (0);
+}
+
+static int
+dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+ uint64_t fromguid = drrb->drr_fromguid;
+ int flags = drrb->drr_flags;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
+ int error;
+ uint64_t featureflags = drba->drba_cookie->drc_featureflags;
+ dsl_dataset_t *ds;
+ const char *tofs = drba->drba_cookie->drc_tofs;
+
+ /* already checked */
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+ ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
+
+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM ||
+ drrb->drr_type >= DMU_OST_NUMTYPES ||
+ ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
+ return (SET_ERROR(EINVAL));
+
+ error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa);
+ if (error != 0)
+ return (error);
+
+ /* Resumable receives require extensible datasets */
+ if (drba->drba_cookie->drc_resumable &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
+ return (SET_ERROR(ENOTSUP));
+
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ /* raw receives require the encryption feature */
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION))
+ return (SET_ERROR(ENOTSUP));
+
+ /* embedded data is incompatible with encryption and raw recv */
+ if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+ return (SET_ERROR(EINVAL));
+
+ /* raw receives require spill block allocation flag */
+ if (!(flags & DRR_FLAG_SPILL_BLOCK))
+ return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
+ } else {
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+ }
+
+ error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
+ if (error == 0) {
+ /* target fs already exists; recv into temp clone */
+
+ /* Can't recv a clone into an existing fs */
+ if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = recv_begin_check_existing_impl(drba, ds, fromguid,
+ featureflags);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ } else if (error == ENOENT) {
+ /* target fs does not exist; must be a full backup or clone */
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ objset_t *os;
+
+ /*
+ * If it's a non-clone incremental, we are missing the
+ * target fs, so fail the recv.
+ */
+ if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) ||
+ drba->drba_origin))
+ return (SET_ERROR(ENOENT));
+
+ /*
+ * If we're receiving a full send as a clone, and it doesn't
+ * contain all the necessary free records and freeobject
+ * records, reject it.
+ */
+ if (fromguid == 0 && drba->drba_origin != NULL &&
+ !(flags & DRR_FLAG_FREERECORDS))
+ return (SET_ERROR(EINVAL));
+
+ /* Open the parent of tofs */
+ ASSERT3U(strlen(tofs), <, sizeof (buf));
+ (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
+ error = dsl_dataset_hold(dp, buf, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
+ drba->drba_origin == NULL) {
+ boolean_t will_encrypt;
+
+ /*
+ * Check that we aren't breaking any encryption rules
+ * and that we have all the parameters we need to
+ * create an encrypted dataset if necessary. If we are
+ * making an encrypted dataset the stream can't have
+ * embedded data.
+ */
+ error = dmu_objset_create_crypt_check(ds->ds_dir,
+ drba->drba_dcp, &will_encrypt);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (will_encrypt &&
+ (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ /*
+ * Check filesystem and snapshot limits before receiving. We'll
+ * recheck snapshot limits again at the end (we create the
+ * filesystems and increment those counts during begin_sync).
+ */
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+ ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ drba->drba_cred, drba->drba_proc);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL,
+ drba->drba_cred, drba->drba_proc);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ /* can't recv below anything but filesystems (eg. no ZVOLs) */
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+ }
+
+ if (drba->drba_origin != NULL) {
+ dsl_dataset_t *origin;
+ error = dsl_dataset_hold_flags(dp, drba->drba_origin,
+ dsflags, FTAG, &origin);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ if (!origin->ds_is_snapshot) {
+ dsl_dataset_rele_flags(origin, dsflags, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
+ fromguid != 0) {
+ dsl_dataset_rele_flags(origin, dsflags, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENODEV));
+ }
+
+ if (origin->ds_dir->dd_crypto_obj != 0 &&
+ (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
+ dsl_dataset_rele_flags(origin, dsflags, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If the origin is redacted we need to verify that this
+ * send stream can safely be received on top of the
+ * origin.
+ */
+ if (dsl_dataset_feature_is_active(origin,
+ SPA_FEATURE_REDACTED_DATASETS)) {
+ if (!redact_check(drba, origin)) {
+ dsl_dataset_rele_flags(origin, dsflags,
+ FTAG);
+ dsl_dataset_rele_flags(ds, dsflags,
+ FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ error = recv_check_large_blocks(ds, featureflags);
+ if (error != 0) {
+ dsl_dataset_rele_flags(origin, dsflags, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_rele_flags(origin, dsflags, FTAG);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ error = 0;
+ }
+ return (error);
+}
+
+static void
+dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ dmu_recv_cookie_t *drc = drba->drba_cookie;
+ struct drr_begin *drrb = drc->drc_drrb;
+ const char *tofs = drc->drc_tofs;
+ uint64_t featureflags = drc->drc_featureflags;
+ dsl_dataset_t *ds, *newds;
+ objset_t *os;
+ uint64_t dsobj;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
+ int error;
+ uint64_t crflags = 0;
+ dsl_crypto_params_t dummy_dcp = { 0 };
+ dsl_crypto_params_t *dcp = drba->drba_dcp;
+
+ if (drrb->drr_flags & DRR_FLAG_CI_DATA)
+ crflags |= DS_FLAG_CI_DATASET;
+
+ if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+
+ /*
+ * Raw, non-incremental recvs always use a dummy dcp with
+ * the raw cmd set. Raw incremental recvs do not use a dcp
+ * since the encryption parameters are already set in stone.
+ */
+ if (dcp == NULL && drrb->drr_fromguid == 0 &&
+ drba->drba_origin == NULL) {
+ ASSERT3P(dcp, ==, NULL);
+ dcp = &dummy_dcp;
+
+ if (featureflags & DMU_BACKUP_FEATURE_RAW)
+ dcp->cp_cmd = DCP_CMD_RAW_RECV;
+ }
+
+ error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
+ if (error == 0) {
+ /* create temporary clone */
+ dsl_dataset_t *snap = NULL;
+
+ if (drba->drba_cookie->drc_fromsnapobj != 0) {
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ drba->drba_cookie->drc_fromsnapobj, FTAG, &snap));
+ ASSERT3P(dcp, ==, NULL);
+ }
+ dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
+ snap, crflags, drba->drba_cred, dcp, tx);
+ if (drba->drba_cookie->drc_fromsnapobj != 0)
+ dsl_dataset_rele(snap, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ } else {
+ dsl_dir_t *dd;
+ const char *tail;
+ dsl_dataset_t *origin = NULL;
+
+ VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
+
+ if (drba->drba_origin != NULL) {
+ VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
+ FTAG, &origin));
+ ASSERT3P(dcp, ==, NULL);
+ }
+
+ /* Create new dataset. */
+ dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1,
+ origin, crflags, drba->drba_cred, dcp, tx);
+ if (origin != NULL)
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ drc->drc_newfs = B_TRUE;
+ }
+ VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag,
+ &newds));
+ if (dsl_dataset_feature_is_active(newds,
+ SPA_FEATURE_REDACTED_DATASETS)) {
+ /*
+ * If the origin dataset is redacted, the child will be redacted
+ * when we create it. We clear the new dataset's
+ * redaction info; if it should be redacted, we'll fill
+ * in its information later.
+ */
+ dsl_dataset_deactivate_feature(newds,
+ SPA_FEATURE_REDACTED_DATASETS, tx);
+ }
+ VERIFY0(dmu_objset_from_ds(newds, &os));
+
+ if (drc->drc_resumable) {
+ dsl_dataset_zapify(newds, tx);
+ if (drrb->drr_fromguid != 0) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
+ 8, 1, &drrb->drr_fromguid, tx));
+ }
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
+ 8, 1, &drrb->drr_toguid, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
+ 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
+ uint64_t one = 1;
+ uint64_t zero = 0;
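+		/*
+		 * Initialize the resume cursor: start at object 1, offset 0,
+		 * with no bytes received yet.
+		 */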
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
+ 8, 1, &one, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
+ 8, 1, &zero, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
+ 8, 1, &zero, tx));
+ if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
+ 8, 1, &one, tx));
+ }
+ if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
+ 8, 1, &one, tx));
+ }
+ if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
+ 8, 1, &one, tx));
+ }
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK,
+ 8, 1, &one, tx));
+ }
+
+ uint64_t *redact_snaps;
+ uint_t numredactsnaps;
+ if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+ BEGINNV_REDACT_FROM_SNAPS, &redact_snaps,
+ &numredactsnaps) == 0) {
+ VERIFY0(zap_add(mos, dsobj,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS,
+ sizeof (*redact_snaps), numredactsnaps,
+ redact_snaps, tx));
+ }
+ }
+
+ /*
+ * Usually the os->os_encrypted value is tied to the presence of a
+ * DSL Crypto Key object in the dd. However, that will not be received
+ * until dmu_recv_stream(), so we set the value manually for now.
+ */
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ os->os_encrypted = B_TRUE;
+ drba->drba_cookie->drc_raw = B_TRUE;
+ }
+
+ if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
+ uint64_t *redact_snaps;
+ uint_t numredactsnaps;
+ VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+ BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps));
+ dsl_dataset_activate_redaction(newds, redact_snaps,
+ numredactsnaps, tx);
+ }
+
+ dmu_buf_will_dirty(newds->ds_dbuf, tx);
+ dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ /*
+ * If we actually created a non-clone, we need to create the objset
+ * in our new dataset. If this is a raw send we postpone this until
+ * dmu_recv_stream() so that we can allocate the metadnode with the
+ * properties from the DRR_BEGIN payload.
+ */
+ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
+ if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) &&
+ (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) {
+ (void) dmu_objset_create_impl(dp->dp_spa,
+ newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
+ }
+ rrw_exit(&newds->ds_bp_rwlock, FTAG);
+
+ drba->drba_cookie->drc_ds = newds;
+ drba->drba_cookie->drc_os = os;
+
+ spa_history_log_internal_ds(newds, "receive", tx, " ");
+}
+
+static int
+dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dmu_recv_cookie_t *drc = drba->drba_cookie;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ struct drr_begin *drrb = drc->drc_drrb;
+ int error;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
+ dsl_dataset_t *ds;
+ const char *tofs = drc->drc_tofs;
+
+ /* already checked */
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+ ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING);
+
+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM ||
+ drrb->drr_type >= DMU_OST_NUMTYPES)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * This is mostly a sanity check since we should have already done these
+ * checks during a previous attempt to receive the data.
+ */
+ error = recv_begin_check_feature_flags_impl(drc->drc_featureflags,
+ dp->dp_spa);
+ if (error != 0)
+ return (error);
+
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ (void) snprintf(recvname, sizeof (recvname), "%s/%s",
+ tofs, recv_clone_name);
+
+ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
+ /* raw receives require spill block allocation flag */
+ if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK))
+ return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
+ } else {
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+ }
+
+ if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
+ /* %recv does not exist; continue in tofs */
+ error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
+ if (error != 0)
+ return (error);
+ }
+
+ /* check that ds is marked inconsistent */
+ if (!DS_IS_INCONSISTENT(ds)) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* check that there is resuming data, and that the toguid matches */
+ if (!dsl_dataset_is_zapified(ds)) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ uint64_t val;
+ error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
+ if (error != 0 || drrb->drr_toguid != val) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Check if the receive is still running. If so, it will be owned.
+ * Note that nothing else can own the dataset (e.g. after the receive
+ * fails) because it will be marked inconsistent.
+ */
+ if (dsl_dataset_has_owner(ds)) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EBUSY));
+ }
+
+ /* There should not be any snapshots of this fs yet. */
+ if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Note: resume point will be checked when we process the first WRITE
+ * record.
+ */
+
+ /* check that the origin matches */
+ val = 0;
+ (void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
+ if (drrb->drr_fromguid != val) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (ds->ds_prev != NULL && drrb->drr_fromguid != 0)
+ drc->drc_fromsnapobj = ds->ds_prev->ds_object;
+
+ /*
+ * If we're resuming, and the send is redacted, then the original send
+ * must have been redacted, and must have been redacted with respect to
+ * the same snapshots.
+ */
+ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) {
+ uint64_t num_ds_redact_snaps;
+ uint64_t *ds_redact_snaps;
+
+ uint_t num_stream_redact_snaps;
+ uint64_t *stream_redact_snaps;
+
+ if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+ BEGINNV_REDACT_SNAPS, &stream_redact_snaps,
+ &num_stream_redact_snaps) != 0) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (!dsl_dataset_get_uint64_array_feature(ds,
+ SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps,
+ &ds_redact_snaps)) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (int i = 0; i < num_ds_redact_snaps; i++) {
+ if (!redact_snaps_contains(ds_redact_snaps,
+ num_ds_redact_snaps, stream_redact_snaps[i])) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+ }
+
+ error = recv_check_large_blocks(ds, drc->drc_featureflags);
+ if (error != 0) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (0);
+}
+
+static void
+dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ const char *tofs = drba->drba_cookie->drc_tofs;
+ uint64_t featureflags = drba->drba_cookie->drc_featureflags;
+ dsl_dataset_t *ds;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs,
+ recv_clone_name);
+
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ drba->drba_cookie->drc_raw = B_TRUE;
+ } else {
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+ }
+
+ if (dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds)
+ != 0) {
+ /* %recv does not exist; continue in tofs */
+ VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag,
+ &ds));
+ drba->drba_cookie->drc_newfs = B_TRUE;
+ }
+
+ ASSERT(DS_IS_INCONSISTENT(ds));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) ||
+ drba->drba_cookie->drc_raw);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ drba->drba_cookie->drc_ds = ds;
+ VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os));
+ drba->drba_cookie->drc_should_save = B_TRUE;
+
+ spa_history_log_internal_ds(ds, "resume receive", tx, " ");
+}
+
+/*
+ * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
+ * succeeds; otherwise we will leak the holds on the datasets.
+ */
+int
+dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
+ boolean_t force, boolean_t resumable, nvlist_t *localprops,
+ nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc,
+ zfs_file_t *fp, offset_t *voffp)
+{
+ dmu_recv_begin_arg_t drba = { 0 };
+ int err;
+
+ bzero(drc, sizeof (dmu_recv_cookie_t));
+ drc->drc_drr_begin = drr_begin;
+ drc->drc_drrb = &drr_begin->drr_u.drr_begin;
+ drc->drc_tosnap = tosnap;
+ drc->drc_tofs = tofs;
+ drc->drc_force = force;
+ drc->drc_resumable = resumable;
+ drc->drc_cred = CRED();
+ drc->drc_proc = curproc;
+ drc->drc_clone = (origin != NULL);
+
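+ /*
+ * Use the BEGIN record's magic to detect whether the stream was
+ * written in the opposite byte order, and start the running checksum.
+ */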
+ if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+ drc->drc_byteswap = B_TRUE;
+ (void) fletcher_4_incremental_byteswap(drr_begin,
+ sizeof (dmu_replay_record_t), &drc->drc_cksum);
+ byteswap_record(drr_begin);
+ } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
+ (void) fletcher_4_incremental_native(drr_begin,
+ sizeof (dmu_replay_record_t), &drc->drc_cksum);
+ } else {
+ return (SET_ERROR(EINVAL));
+ }
+
+ drc->drc_fp = fp;
+ drc->drc_voff = *voffp;
+ drc->drc_featureflags =
+ DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
+
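+ /*
+ * Read the BEGIN record's payload (a packed nvlist, if present) and
+ * the header of the record that follows it.
+ */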
+ uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
+ void *payload = NULL;
+ if (payloadlen != 0)
+ payload = kmem_alloc(payloadlen, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(drc, payloadlen,
+ payload);
+ if (err != 0) {
+ kmem_free(payload, payloadlen);
+ return (err);
+ }
+ if (payloadlen != 0) {
+ err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl,
+ KM_SLEEP);
+ kmem_free(payload, payloadlen);
+ if (err != 0) {
+ kmem_free(drc->drc_next_rrd,
+ sizeof (*drc->drc_next_rrd));
+ return (err);
+ }
+ }
+
+ if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)
+ drc->drc_spill = B_TRUE;
+
+ drba.drba_origin = origin;
+ drba.drba_cookie = drc;
+ drba.drba_cred = CRED();
+ drba.drba_proc = curproc;
+
+ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+ err = dsl_sync_task(tofs,
+ dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
+ &drba, 5, ZFS_SPACE_CHECK_NORMAL);
+ } else {
+ /*
+ * For non-raw, non-incremental, non-resuming receives the
+ * user can specify encryption parameters on the command line
+ * with "zfs recv -o". For these receives we create a dcp and
+ * pass it to the sync task. Creating the dcp will implicitly
+ * remove the encryption params from the localprops nvlist,
+ * which avoids errors when trying to set these normally
+ * read-only properties. Any other kind of receive that
+ * attempts to set these properties will fail as a result.
+ */
+ if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_RAW) == 0 &&
+ origin == NULL && drc->drc_drrb->drr_fromguid == 0) {
+ err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
+ localprops, hidden_args, &drba.drba_dcp);
+ }
+
+ if (err == 0) {
+ err = dsl_sync_task(tofs,
+ dmu_recv_begin_check, dmu_recv_begin_sync,
+ &drba, 5, ZFS_SPACE_CHECK_NORMAL);
+ dsl_crypto_params_free(drba.drba_dcp, !!err);
+ }
+ }
+
+ if (err != 0) {
+ kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
+ nvlist_free(drc->drc_begin_nvl);
+ }
+ return (err);
+}
+
+static int
+receive_read(dmu_recv_cookie_t *drc, int len, void *buf)
+{
+ int done = 0;
+
+ /*
+ * The code doesn't rely on this (lengths being multiples of 8). See
+ * comment in dump_bytes.
+ */
+ ASSERT(len % 8 == 0 ||
+ (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
+
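+ /*
+ * Loop until the full length is read; a single zfs_file_read() call
+ * may return less than requested.
+ */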
+ while (done < len) {
+ ssize_t resid;
+ zfs_file_t *fp = drc->drc_fp;
+ int err = zfs_file_read(fp, (char *)buf + done,
+ len - done, &resid);
+ if (resid == len - done) {
+ /*
+ * Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates
+ * that the receive was interrupted and can
+ * potentially be resumed.
+ */
+ err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED);
+ }
+ drc->drc_voff += len - done - resid;
+ done = len - resid;
+ if (err != 0)
+ return (err);
+ }
+
+ drc->drc_bytes_read += len;
+
+ ASSERT3U(done, ==, len);
+ return (0);
+}
+
+static inline uint8_t
+deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
+{
+ if (bonus_type == DMU_OT_SA) {
+ return (1);
+ } else {
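+ /*
+ * One block pointer, plus however many fit in the bonus space
+ * left unused relative to the old (fixed-size) dnode layout.
+ */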
+ return (1 +
+ ((DN_OLD_MAX_BONUSLEN -
+ MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
+ }
+}
+
+static void
+save_resume_state(struct receive_writer_arg *rwa,
+ uint64_t object, uint64_t offset, dmu_tx_t *tx)
+{
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ if (!rwa->resumable)
+ return;
+
+ /*
+ * We use ds_resume_bytes[] != 0 to indicate that we need to
+ * update this on disk, so it must not be 0.
+ */
+ ASSERT(rwa->bytes_read != 0);
+
+ /*
+ * We only resume from write records, which have a valid
+ * (non-meta-dnode) object number.
+ */
+ ASSERT(object != 0);
+
+ /*
+ * For resuming to work correctly, we must receive records in order,
+ * sorted by object,offset. This is checked by the callers, but
+ * assert it here for good measure.
+ */
+ ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
+ ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
+ offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
+ ASSERT3U(rwa->bytes_read, >=,
+ rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
+
+ rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
+ rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
+ rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
+}
+
+static int
+receive_object_is_same_generation(objset_t *os, uint64_t object,
+ dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type,
+ const void *new_bonus, boolean_t *samegenp)
+{
+ zfs_file_info_t zoi;
+ int err;
+
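+ /*
+ * Compare the ZPL generation numbers recorded in the existing
+ * on-disk bonus buffer and the one from the send stream.
+ */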
+ dmu_buf_t *old_bonus_dbuf;
+ err = dmu_bonus_hold(os, object, FTAG, &old_bonus_dbuf);
+ if (err != 0)
+ return (err);
+ err = dmu_get_file_info(os, old_bonus_type, old_bonus_dbuf->db_data,
+ &zoi);
+ dmu_buf_rele(old_bonus_dbuf, FTAG);
+ if (err != 0)
+ return (err);
+ uint64_t old_gen = zoi.zfi_generation;
+
+ err = dmu_get_file_info(os, new_bonus_type, new_bonus, &zoi);
+ if (err != 0)
+ return (err);
+ uint64_t new_gen = zoi.zfi_generation;
+
+ *samegenp = (old_gen == new_gen);
+ return (0);
+}
+
+static int
+receive_handle_existing_object(const struct receive_writer_arg *rwa,
+ const struct drr_object *drro, const dmu_object_info_t *doi,
+ const void *bonus_data,
+ uint64_t *object_to_hold, uint32_t *new_blksz)
+{
+ uint32_t indblksz = drro->drr_indblkshift ?
+ 1ULL << drro->drr_indblkshift : 0;
+ int nblkptr = deduce_nblkptr(drro->drr_bonustype,
+ drro->drr_bonuslen);
+ uint8_t dn_slots = drro->drr_dn_slots != 0 ?
+ drro->drr_dn_slots : DNODE_MIN_SLOTS;
+ boolean_t do_free_range = B_FALSE;
+ int err;
+
+ *object_to_hold = drro->drr_object;
+
+ /* nblkptr should be bounded by the bonus size and type */
+ if (rwa->raw && nblkptr != drro->drr_nblkptr)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * After the previous send stream, the sending system may
+ * have freed this object, and then happened to re-allocate
+ * this object number in a later txg. In this case, we are
+ * receiving a different logical file, and the block size may
+ * appear to be different. i.e. we may have a different
+ * block size for this object than what the send stream says.
+ * In this case we need to remove the object's contents,
+ * so that its structure can be changed and then its contents
+ * entirely replaced by subsequent WRITE records.
+ *
+ * If this is a -L (--large-block) incremental stream, and
+ * the previous stream was not -L, the block size may appear
+ * to increase. i.e. we may have a smaller block size for
+ * this object than what the send stream says. In this case
+ * we need to keep the object's contents and block size
+ * intact, so that we don't lose parts of the object's
+ * contents that are not changed by this incremental send
+ * stream.
+ *
+ * We can distinguish between the two above cases by using
+ * the ZPL's generation number (see
+ * receive_object_is_same_generation()). However, we only
+ * want to rely on the generation number when absolutely
+ * necessary, because with raw receives, the generation is
+ * encrypted. We also want to minimize dependence on the
+ * ZPL, so that other types of datasets can also be received
+ * (e.g. ZVOLs, although note that ZVOLS currently do not
+ * reallocate their objects or change their structure).
+ * Therefore, we check a number of different cases where we
+ * know it is safe to discard the object's contents, before
+ * using the ZPL's generation number to make the above
+ * distinction.
+ */
+ if (drro->drr_blksz != doi->doi_data_block_size) {
+ if (rwa->raw) {
+ /*
+ * RAW streams always have large blocks, so
+ * we are sure that the data is not needed
+ * due to changing --large-block to be on.
+ * Which is fortunate since the bonus buffer
+ * (which contains the ZPL generation) is
+ * encrypted, and the key might not be
+ * loaded.
+ */
+ do_free_range = B_TRUE;
+ } else if (rwa->full) {
+ /*
+ * This is a full send stream, so it always
+ * replaces what we have. Even if the
+ * generation numbers happen to match, this
+ * can not actually be the same logical file.
+ * This is relevant when receiving a full
+ * send as a clone.
+ */
+ do_free_range = B_TRUE;
+ } else if (drro->drr_type !=
+ DMU_OT_PLAIN_FILE_CONTENTS ||
+ doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) {
+ /*
+ * PLAIN_FILE_CONTENTS are the only type of
+ * objects that have ever been stored with
+ * large blocks, so we don't need the special
+ * logic below. ZAP blocks can shrink (when
+ * there's only one block), so we don't want
+ * to hit the error below about block size
+ * only increasing.
+ */
+ do_free_range = B_TRUE;
+ } else if (doi->doi_max_offset <=
+ doi->doi_data_block_size) {
+ /*
+ * There is only one block. We can free it,
+ * because its contents will be replaced by a
+ * WRITE record. This can not be the no-L ->
+ * -L case, because the no-L case would have
+ * resulted in multiple blocks. If we
+ * supported -L -> no-L, it would not be safe
+ * to free the file's contents. Fortunately,
+ * that is not allowed (see
+ * recv_check_large_blocks()).
+ */
+ do_free_range = B_TRUE;
+ } else {
+ boolean_t is_same_gen;
+ err = receive_object_is_same_generation(rwa->os,
+ drro->drr_object, doi->doi_bonus_type,
+ drro->drr_bonustype, bonus_data, &is_same_gen);
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (is_same_gen) {
+ /*
+ * This is the same logical file, and
+ * the block size must be increasing.
+ * It could only decrease if
+ * --large-block was changed to be
+ * off, which is checked in
+ * recv_check_large_blocks().
+ */
+ if (drro->drr_blksz <=
+ doi->doi_data_block_size)
+ return (SET_ERROR(EINVAL));
+ /*
+ * We keep the existing blocksize and
+ * contents.
+ */
+ *new_blksz =
+ doi->doi_data_block_size;
+ } else {
+ do_free_range = B_TRUE;
+ }
+ }
+ }
+
+ /* nblkptr can only decrease if the object was reallocated */
+ if (nblkptr < doi->doi_nblkptr)
+ do_free_range = B_TRUE;
+
+ /* number of slots can only change on reallocation */
+ if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT)
+ do_free_range = B_TRUE;
+
+ /*
+ * For raw sends we also check a few other fields to
+ * ensure we are preserving the objset structure exactly
+ * as it was on the send side:
+ * - A changed indirect block size
+ * - A smaller nlevels
+ */
+ if (rwa->raw) {
+ if (indblksz != doi->doi_metadata_block_size)
+ do_free_range = B_TRUE;
+ if (drro->drr_nlevels < doi->doi_indirection)
+ do_free_range = B_TRUE;
+ }
+
+ if (do_free_range) {
+ err = dmu_free_long_range(rwa->os, drro->drr_object,
+ 0, DMU_OBJECT_END);
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * The dmu does not currently support decreasing nlevels
+ * or changing the number of dnode slots on an object. For
+ * non-raw sends, this does not matter and the new object
+ * can just use the previous one's nlevels. For raw sends,
+ * however, the structure of the received dnode (including
+ * nlevels and dnode slots) must match that of the send
+ * side. Therefore, instead of using dmu_object_reclaim(),
+ * we must free the object completely and call
+ * dmu_object_claim_dnsize() instead.
+ */
+ if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) ||
+ dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {
+ err = dmu_free_long_object(rwa->os, drro->drr_object);
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ *object_to_hold = DMU_NEW_OBJECT;
+ }
+
+ /*
+ * For raw receives, free everything beyond the new incoming
+ * maxblkid. Normally this would be done with a DRR_FREE
+ * record that would come after this DRR_OBJECT record is
+ * processed. However, for raw receives we manually set the
+ * maxblkid from the drr_maxblkid and so we must first free
+ * everything above that blkid to ensure the DMU is always
+ * consistent with itself. We will never free the first block
+ * of the object here because a maxblkid of 0 could indicate
+ * an object with a single block or one with no blocks. This
+ * free may be skipped when dmu_free_long_range() was called
+ * above since it covers the entire object's contents.
+ */
+ if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) {
+ err = dmu_free_long_range(rwa->os, drro->drr_object,
+ (drro->drr_maxblkid + 1) * doi->doi_data_block_size,
+ DMU_OBJECT_END);
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+ }
+ return (0);
+}
+
+noinline static int
+receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+ void *data)
+{
+ dmu_object_info_t doi;
+ dmu_tx_t *tx;
+ int err;
+ uint32_t new_blksz = drro->drr_blksz;
+ uint8_t dn_slots = drro->drr_dn_slots != 0 ?
+ drro->drr_dn_slots : DNODE_MIN_SLOTS;
+
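+ /* Validate the DRR_OBJECT fields against pool and dnode limits. */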
+ if (drro->drr_type == DMU_OT_NONE ||
+ !DMU_OT_IS_VALID(drro->drr_type) ||
+ !DMU_OT_IS_VALID(drro->drr_bonustype) ||
+ drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
+ drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
+ P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
+ drro->drr_blksz < SPA_MINBLOCKSIZE ||
+ drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
+ drro->drr_bonuslen >
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
+ dn_slots >
+ (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (rwa->raw) {
+ /*
+ * We should have received a DRR_OBJECT_RANGE record
+ * containing this block and stored it in rwa.
+ */
+ if (drro->drr_object < rwa->or_firstobj ||
+ drro->drr_object >= rwa->or_firstobj + rwa->or_numslots ||
+ drro->drr_raw_bonuslen < drro->drr_bonuslen ||
+ drro->drr_indblkshift > SPA_MAXBLOCKSHIFT ||
+ drro->drr_nlevels > DN_MAX_LEVELS ||
+ drro->drr_nblkptr > DN_MAX_NBLKPTR ||
+ DN_SLOTS_TO_BONUSLEN(dn_slots) <
+ drro->drr_raw_bonuslen)
+ return (SET_ERROR(EINVAL));
+ } else {
+ /*
+ * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN
+ * record indicates this by setting DRR_FLAG_SPILL_BLOCK.
+ */
+ if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) ||
+ (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 ||
+ drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
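+ /*
+ * Look up the object as it currently exists; ENOENT means the slot is
+ * free, EEXIST means it is an interior slot of a multi-slot dnode.
+ */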
+ err = dmu_object_info(rwa->os, drro->drr_object, &doi);
+
+ if (err != 0 && err != ENOENT && err != EEXIST)
+ return (SET_ERROR(EINVAL));
+
+ if (drro->drr_object > rwa->max_object)
+ rwa->max_object = drro->drr_object;
+
+ /*
+ * If we are losing blkptrs or changing the block size this must
+ * be a new file instance. We must clear out the previous file
+ * contents before we can change this type of metadata in the dnode.
+ * Raw receives will also check that the indirect structure of the
+ * dnode hasn't changed.
+ */
+ uint64_t object_to_hold;
+ if (err == 0) {
+ err = receive_handle_existing_object(rwa, drro, &doi, data,
+ &object_to_hold, &new_blksz);
+ } else if (err == EEXIST) {
+ /*
+ * The object requested is currently an interior slot of a
+ * multi-slot dnode. This will be resolved when the next txg
+ * is synced out, since the send stream will have told us
+ * to free this slot when we freed the associated dnode
+ * earlier in the stream.
+ */
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+
+ if (dmu_object_info(rwa->os, drro->drr_object, NULL) != ENOENT)
+ return (SET_ERROR(EINVAL));
+
+ /* object was freed and we are about to allocate a new one */
+ object_to_hold = DMU_NEW_OBJECT;
+ } else {
+ /* object is free and we are about to allocate a new one */
+ object_to_hold = DMU_NEW_OBJECT;
+ }
+
+ /*
+ * If this is a multi-slot dnode there is a chance that this
+ * object will expand into a slot that is already used by
+ * another object from the previous snapshot. We must free
+ * these objects before we attempt to allocate the new dnode.
+ */
+ if (dn_slots > 1) {
+ boolean_t need_sync = B_FALSE;
+
+ for (uint64_t slot = drro->drr_object + 1;
+ slot < drro->drr_object + dn_slots;
+ slot++) {
+ dmu_object_info_t slot_doi;
+
+ err = dmu_object_info(rwa->os, slot, &slot_doi);
+ if (err == ENOENT || err == EEXIST)
+ continue;
+ else if (err != 0)
+ return (err);
+
+ err = dmu_free_long_object(rwa->os, slot);
+ if (err != 0)
+ return (err);
+
+ need_sync = B_TRUE;
+ }
+
+ if (need_sync)
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ }
+
+ tx = dmu_tx_create(rwa->os);
+ dmu_tx_hold_bonus(tx, object_to_hold);
+ dmu_tx_hold_write(tx, object_to_hold, 0, 0);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ if (object_to_hold == DMU_NEW_OBJECT) {
+ /* Currently free, wants to be allocated */
+ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
+ drro->drr_type, new_blksz,
+ drro->drr_bonustype, drro->drr_bonuslen,
+ dn_slots << DNODE_SHIFT, tx);
+ } else if (drro->drr_type != doi.doi_type ||
+ new_blksz != doi.doi_data_block_size ||
+ drro->drr_bonustype != doi.doi_bonus_type ||
+ drro->drr_bonuslen != doi.doi_bonus_size) {
+ /* Currently allocated, but with different properties */
+ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
+ drro->drr_type, new_blksz,
+ drro->drr_bonustype, drro->drr_bonuslen,
+ dn_slots << DNODE_SHIFT, rwa->spill ?
+ DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx);
+ } else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) {
+ /*
+ * Currently allocated, the existing version of this object
+ * may reference a spill block that is no longer allocated
+ * at the source and needs to be freed.
+ */
+ err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx);
+ }
+
+ if (err != 0) {
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (rwa->or_crypt_params_present) {
+ /*
+ * Set the crypt params for the buffer associated with this
+ * range of dnodes. This causes the blkptr_t to have the
+ * same crypt params (byteorder, salt, iv, mac) as on the
+ * sending side.
+ *
+ * Since we are committing this tx now, it is possible for
+ * the dnode block to end up on-disk with the incorrect MAC,
+ * if subsequent objects in this block are received in a
+ * different txg. However, since the dataset is marked as
+ * inconsistent, no code paths will do a non-raw read (or
+ * decrypt the block / verify the MAC). The receive code and
+ * scrub code can safely do raw reads and verify the
+ * checksum. They don't need to verify the MAC.
+ */
+ dmu_buf_t *db = NULL;
+ uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE;
+
+ err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os),
+ offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT);
+ if (err != 0) {
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINVAL));
+ }
+
+ dmu_buf_set_crypt_params(db, rwa->or_byteorder,
+ rwa->or_salt, rwa->or_iv, rwa->or_mac, tx);
+
+ dmu_buf_rele(db, FTAG);
+
+ rwa->or_crypt_params_present = B_FALSE;
+ }
+
+ dmu_object_set_checksum(rwa->os, drro->drr_object,
+ drro->drr_checksumtype, tx);
+ dmu_object_set_compress(rwa->os, drro->drr_object,
+ drro->drr_compress, tx);
+
+ /* handle more restrictive dnode structuring for raw recvs */
+ if (rwa->raw) {
+ /*
+ * Set the block size, indirect block shift, and nlevels.
+ * This will not fail because we ensured all of the
+ * blocks were freed earlier if this is a new object.
+ * For non-new objects block size and indirect block
+ * shift cannot change and nlevels can only increase.
+ */
+ ASSERT3U(new_blksz, ==, drro->drr_blksz);
+ VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object,
+ drro->drr_blksz, drro->drr_indblkshift, tx));
+ VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object,
+ drro->drr_nlevels, tx));
+
+ /*
+ * Set the maxblkid. This will always succeed because
+ * we freed all blocks beyond the new maxblkid above.
+ */
+ VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object,
+ drro->drr_maxblkid, tx));
+ }
+
+ if (data != NULL) {
+ dmu_buf_t *db;
+ dnode_t *dn;
+ uint32_t flags = DMU_READ_NO_PREFETCH;
+
+ if (rwa->raw)
+ flags |= DMU_READ_NO_DECRYPT;
+
+ VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn));
+ VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags));
+
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
+ bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro));
+
+ /*
+ * Raw bonus buffers have their byteorder determined by the
+ * DRR_OBJECT_RANGE record.
+ */
+ if (rwa->byteswap && !rwa->raw) {
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drro->drr_bonustype);
+ dmu_ot_byteswap[byteswap].ob_func(db->db_data,
+ DRR_OBJECT_PAYLOAD_SIZE(drro));
+ }
+ dmu_buf_rele(db, FTAG);
+ dnode_rele(dn, FTAG);
+ }
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/* ARGSUSED */
+noinline static int
+receive_freeobjects(struct receive_writer_arg *rwa,
+ struct drr_freeobjects *drrfo)
+{
+ uint64_t obj;
+ int next_err = 0;
+
+ if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
+ return (SET_ERROR(EINVAL));
+
+ for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
+ obj < drrfo->drr_firstobj + drrfo->drr_numobjs &&
+ obj < DN_MAX_OBJECT && next_err == 0;
+ next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(rwa->os, obj, &doi);
+ if (err == ENOENT)
+ continue;
+ else if (err != 0)
+ return (err);
+
+ err = dmu_free_long_object(rwa->os, obj);
+
+ if (err != 0)
+ return (err);
+ }
+ if (next_err != ESRCH)
+ return (next_err);
+ return (0);
+}
+
+/*
+ * Note: if this fails, the caller will clean up any records left on the
+ * rwa->write_batch list.
+ */
+static int
+flush_write_batch_impl(struct receive_writer_arg *rwa)
+{
+ dnode_t *dn;
+ int err;
+
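+ /*
+ * Every record in the batch targets the same object
+ * (rwa->last_object), so one dnode hold and one tx cover
+ * the whole batch.
+ */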
+ if (dnode_hold(rwa->os, rwa->last_object, FTAG, &dn) != 0)
+ return (SET_ERROR(EINVAL));
+
+ struct receive_record_arg *last_rrd = list_tail(&rwa->write_batch);
+ struct drr_write *last_drrw = &last_rrd->header.drr_u.drr_write;
+
+ struct receive_record_arg *first_rrd = list_head(&rwa->write_batch);
+ struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write;
+
+ ASSERT3U(rwa->last_object, ==, last_drrw->drr_object);
+ ASSERT3U(rwa->last_offset, ==, last_drrw->drr_offset);
+
+ dmu_tx_t *tx = dmu_tx_create(rwa->os);
+ dmu_tx_hold_write_by_dnode(tx, dn, first_drrw->drr_offset,
+ last_drrw->drr_offset - first_drrw->drr_offset +
+ last_drrw->drr_logical_size);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+ }
+
+ struct receive_record_arg *rrd;
+ while ((rrd = list_head(&rwa->write_batch)) != NULL) {
+ struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+ abd_t *abd = rrd->abd;
+
+ ASSERT3U(drrw->drr_object, ==, rwa->last_object);
+
+ if (drrw->drr_logical_size != dn->dn_datablksz) {
+ /*
+ * The WRITE record is larger than the object's block
+ * size. We must be receiving an incremental
+ * large-block stream into a dataset that previously did
+ * a non-large-block receive. Lightweight writes must
+ * be exactly one block, so we need to decompress the
+ * data (if compressed) and do a normal dmu_write().
+ */
+ ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz);
+ if (DRR_WRITE_COMPRESSED(drrw)) {
+ abd_t *decomp_abd =
+ abd_alloc_linear(drrw->drr_logical_size,
+ B_FALSE);
+
+ err = zio_decompress_data(
+ drrw->drr_compressiontype,
+ abd, abd_to_buf(decomp_abd),
+ abd_get_size(abd),
+ abd_get_size(decomp_abd), NULL);
+
+ if (err == 0) {
+ dmu_write_by_dnode(dn,
+ drrw->drr_offset,
+ drrw->drr_logical_size,
+ abd_to_buf(decomp_abd), tx);
+ }
+ abd_free(decomp_abd);
+ } else {
+ dmu_write_by_dnode(dn,
+ drrw->drr_offset,
+ drrw->drr_logical_size,
+ abd_to_buf(abd), tx);
+ }
+ if (err == 0)
+ abd_free(abd);
+ } else {
+ zio_prop_t zp;
+ dmu_write_policy(rwa->os, dn, 0, 0, &zp);
+
+ enum zio_flag zio_flags = 0;
+
+ if (rwa->raw) {
+ zp.zp_encrypt = B_TRUE;
+ zp.zp_compress = drrw->drr_compressiontype;
+ zp.zp_byteorder = ZFS_HOST_BYTEORDER ^
+ !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
+ rwa->byteswap;
+ bcopy(drrw->drr_salt, zp.zp_salt,
+ ZIO_DATA_SALT_LEN);
+ bcopy(drrw->drr_iv, zp.zp_iv,
+ ZIO_DATA_IV_LEN);
+ bcopy(drrw->drr_mac, zp.zp_mac,
+ ZIO_DATA_MAC_LEN);
+ if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) {
+ zp.zp_nopwrite = B_FALSE;
+ zp.zp_copies = MIN(zp.zp_copies,
+ SPA_DVAS_PER_BP - 1);
+ }
+ zio_flags |= ZIO_FLAG_RAW;
+ } else if (DRR_WRITE_COMPRESSED(drrw)) {
+ ASSERT3U(drrw->drr_compressed_size, >, 0);
+ ASSERT3U(drrw->drr_logical_size, >=,
+ drrw->drr_compressed_size);
+ zp.zp_compress = drrw->drr_compressiontype;
+ zio_flags |= ZIO_FLAG_RAW_COMPRESS;
+ } else if (rwa->byteswap) {
+ /*
+ * Note: compressed blocks never need to be
+ * byteswapped, because WRITE records for
+ * metadata blocks are never compressed. The
+ * exception is raw streams, which are written
+ * in the original byteorder, and the byteorder
+ * bit is preserved in the BP by setting
+ * zp_byteorder above.
+ */
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drrw->drr_type);
+ dmu_ot_byteswap[byteswap].ob_func(
+ abd_to_buf(abd),
+ DRR_WRITE_PAYLOAD_SIZE(drrw));
+ }
+
+ /*
+ * Since this data can't be read until the receive
+ * completes, we can do a "lightweight" write for
+ * improved performance.
+ */
+ err = dmu_lightweight_write_by_dnode(dn,
+ drrw->drr_offset, abd, &zp, zio_flags, tx);
+ }
+
+ if (err != 0) {
+ /*
+ * This rrd is left on the list, so the caller will
+ * free it (and the abd).
+ */
+ break;
+ }
+
+ /*
+ * Note: If the receive fails, we want the resume stream to
+ * start with the same record that we last successfully
+ * received (as opposed to the next record), so that we can
+ * verify that we are resuming from the correct location.
+ */
+ save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
+
+ list_remove(&rwa->write_batch, rrd);
+ kmem_free(rrd, sizeof (*rrd));
+ }
+
+ dmu_tx_commit(tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+noinline static int
+flush_write_batch(struct receive_writer_arg *rwa)
+{
+ if (list_is_empty(&rwa->write_batch))
+ return (0);
+ int err = rwa->err;
+ if (err == 0)
+ err = flush_write_batch_impl(rwa);
+ if (err != 0) {
+ struct receive_record_arg *rrd;
+ while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) {
+ abd_free(rrd->abd);
+ kmem_free(rrd, sizeof (*rrd));
+ }
+ }
+ ASSERT(list_is_empty(&rwa->write_batch));
+ return (err);
+}
+
+noinline static int
+receive_process_write_record(struct receive_writer_arg *rwa,
+ struct receive_record_arg *rrd)
+{
+ int err = 0;
+
+ ASSERT3U(rrd->header.drr_type, ==, DRR_WRITE);
+ struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+
+ if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
+ !DMU_OT_IS_VALID(drrw->drr_type))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * For resuming to work, records must be in increasing order
+ * by (object, offset).
+ */
+ if (drrw->drr_object < rwa->last_object ||
+ (drrw->drr_object == rwa->last_object &&
+ drrw->drr_offset < rwa->last_offset)) {
+ return (SET_ERROR(EINVAL));
+ }
+
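+ /*
+ * Flush the pending batch if this write is for a different object or
+ * falls outside the batch-size window of the first batched write.
+ */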
+ struct receive_record_arg *first_rrd = list_head(&rwa->write_batch);
+ struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write;
+ uint64_t batch_size =
+ MIN(zfs_recv_write_batch_size, DMU_MAX_ACCESS / 2);
+ if (first_rrd != NULL &&
+ (drrw->drr_object != first_drrw->drr_object ||
+ drrw->drr_offset >= first_drrw->drr_offset + batch_size)) {
+ err = flush_write_batch(rwa);
+ if (err != 0)
+ return (err);
+ }
+
+ rwa->last_object = drrw->drr_object;
+ rwa->last_offset = drrw->drr_offset;
+
+ if (rwa->last_object > rwa->max_object)
+ rwa->max_object = rwa->last_object;
+
+ list_insert_tail(&rwa->write_batch, rrd);
+ /*
+ * Return EAGAIN to indicate that we will use this rrd again,
+ * so the caller should not free it.
+ */
+ return (EAGAIN);
+}
+
+static int
+receive_write_embedded(struct receive_writer_arg *rwa,
+ struct drr_write_embedded *drrwe, void *data)
+{
+ dmu_tx_t *tx;
+ int err;
+
+ if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
+ return (SET_ERROR(EINVAL));
+
+ if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
+ return (SET_ERROR(EINVAL));
+
+ if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+ return (SET_ERROR(EINVAL));
+ if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+ return (SET_ERROR(EINVAL));
+ if (rwa->raw)
+ return (SET_ERROR(EINVAL));
+
+ if (drrwe->drr_object > rwa->max_object)
+ rwa->max_object = drrwe->drr_object;
+
+ tx = dmu_tx_create(rwa->os);
+
+ dmu_tx_hold_write(tx, drrwe->drr_object,
+ drrwe->drr_offset, drrwe->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ dmu_write_embedded(rwa->os, drrwe->drr_object,
+ drrwe->drr_offset, data, drrwe->drr_etype,
+ drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
+ rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+
+ /* See comment in restore_write. */
+ save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+static int
+receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
+ abd_t *abd)
+{
+ dmu_buf_t *db, *db_spill;
+ int err;
+
+ if (drrs->drr_length < SPA_MINBLOCKSIZE ||
+ drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * This is an unmodified spill block which was added to the stream
+ * to resolve an issue with incorrectly removing spill blocks. It
+ * should be ignored by current versions of the code which support
+ * the DRR_FLAG_SPILL_BLOCK flag.
+ */
+ if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) {
+ abd_free(abd);
+ return (0);
+ }
+
+ if (rwa->raw) {
+ if (!DMU_OT_IS_VALID(drrs->drr_type) ||
+ drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS ||
+ drrs->drr_compressed_size == 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (drrs->drr_object > rwa->max_object)
+ rwa->max_object = drrs->drr_object;
+
+ VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
+ if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG,
+ &db_spill)) != 0) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+
+ dmu_tx_t *tx = dmu_tx_create(rwa->os);
+
+ dmu_tx_hold_spill(tx, db->db_object);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ /*
+ * Spill blocks may both grow and shrink. When a change in size
+ * occurs any existing dbuf must be updated to match the logical
+ * size of the provided arc_buf_t.
+ */
+ if (db_spill->db_size != drrs->drr_length) {
+ dmu_buf_will_fill(db_spill, tx);
+ VERIFY0(dbuf_spill_set_blksz(db_spill,
+ drrs->drr_length, tx));
+ }
+
+ arc_buf_t *abuf;
+ if (rwa->raw) {
+ boolean_t byteorder = ZFS_HOST_BYTEORDER ^
+ !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
+ rwa->byteswap;
+
+ abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os),
+ drrs->drr_object, byteorder, drrs->drr_salt,
+ drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
+ drrs->drr_compressed_size, drrs->drr_length,
+ drrs->drr_compressiontype, 0);
+ } else {
+ abuf = arc_loan_buf(dmu_objset_spa(rwa->os),
+ DMU_OT_IS_METADATA(drrs->drr_type),
+ drrs->drr_length);
+ if (rwa->byteswap) {
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drrs->drr_type);
+ dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd),
+ DRR_SPILL_PAYLOAD_SIZE(drrs));
+ }
+ }
+
+ bcopy(abd_to_buf(abd), abuf->b_data, DRR_SPILL_PAYLOAD_SIZE(drrs));
+ abd_free(abd);
+ dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);
+
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+/* ARGSUSED */
+noinline static int
+receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
+{
+ int err;
+
+ if (drrf->drr_length != -1ULL &&
+ drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
+ return (SET_ERROR(EINVAL));
+
+ if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (drrf->drr_object > rwa->max_object)
+ rwa->max_object = drrf->drr_object;
+
+ err = dmu_free_long_range(rwa->os, drrf->drr_object,
+ drrf->drr_offset, drrf->drr_length);
+
+ return (err);
+}
+
+static int
+receive_object_range(struct receive_writer_arg *rwa,
+ struct drr_object_range *drror)
+{
+ /*
+ * By default, we assume this block is in our native format
+ * (ZFS_HOST_BYTEORDER). We then take into account whether
+ * the send stream is byteswapped (rwa->byteswap). Finally,
+ * we need to byteswap again if this particular block was
+ * in non-native format on the send side.
+ */
+ boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^
+ !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags);
+
+ /*
+ * Since dnode block sizes are constant, we should not need to worry
+ * about making sure that the dnode block size is the same on the
+ * sending and receiving sides for the time being. For non-raw sends,
+ * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE
+ * record at all). Raw sends require this record type because the
+ * encryption parameters are used to protect an entire block of bonus
+ * buffers. If the size of dnode blocks ever becomes variable,
+ * handling will need to be added to ensure that dnode block sizes
+ * match on the sending and receiving side.
+ */
+ if (drror->drr_numslots != DNODES_PER_BLOCK ||
+ P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 ||
+ !rwa->raw)
+ return (SET_ERROR(EINVAL));
+
+ if (drror->drr_firstobj > rwa->max_object)
+ rwa->max_object = drror->drr_firstobj;
+
+ /*
+ * The DRR_OBJECT_RANGE handling must be deferred to receive_object()
+ * so that the block of dnodes is not written out when it's empty,
+ * and converted to a HOLE BP.
+ */
+ rwa->or_crypt_params_present = B_TRUE;
+ rwa->or_firstobj = drror->drr_firstobj;
+ rwa->or_numslots = drror->drr_numslots;
+ bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN);
+ bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN);
+ bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN);
+ rwa->or_byteorder = byteorder;
+
+ return (0);
+}
+
+/*
+ * Until we have the ability to redact large ranges of data efficiently, we
+ * process these records as frees.
+ */
+/* ARGSUSED */
+noinline static int
+receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr)
+{
+ struct drr_free drrf = {0};
+ drrf.drr_length = drrr->drr_length;
+ drrf.drr_object = drrr->drr_object;
+ drrf.drr_offset = drrr->drr_offset;
+ drrf.drr_toguid = drrr->drr_toguid;
+ return (receive_free(rwa, &drrf));
+}
+
+/* used to destroy the drc_ds on error */
+static void
+dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
+{
+ dsl_dataset_t *ds = drc->drc_ds;
+ ds_hold_flags_t dsflags;
+
+ dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
+ /*
+ * Wait for the txg sync before cleaning up the receive. For
+ * resumable receives, this ensures that our resume state has
+ * been written out to disk. For raw receives, this ensures
+ * that the user accounting code will not attempt to do anything
+ * after we stopped receiving the dataset.
+ */
+ txg_wait_synced(ds->ds_dir->dd_pool, 0);
+ ds->ds_objset->os_raw_receive = B_FALSE;
+
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ if (drc->drc_resumable && drc->drc_should_save &&
+ !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) {
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
+ } else {
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ dsl_dataset_name(ds, name);
+ dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
+ (void) dsl_destroy_head(name);
+ }
+}
+
+static void
+receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf)
+{
+ if (drc->drc_byteswap) {
+ (void) fletcher_4_incremental_byteswap(buf, len,
+ &drc->drc_cksum);
+ } else {
+ (void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum);
+ }
+}
+
+/*
+ * Read the payload into a buffer of size len, and update the current record's
+ * payload field.
+ * Allocate drc->drc_next_rrd and read the next record's header into
+ * drc->drc_next_rrd->header.
+ * Verify checksum of payload and next record.
+ */
+static int
+receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf)
+{
+ int err;
+
+ if (len != 0) {
+ ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
+ err = receive_read(drc, len, buf);
+ if (err != 0)
+ return (err);
+ receive_cksum(drc, len, buf);
+
+ /* note: rrd is NULL when reading the begin record's payload */
+ if (drc->drc_rrd != NULL) {
+ drc->drc_rrd->payload = buf;
+ drc->drc_rrd->payload_size = len;
+ drc->drc_rrd->bytes_read = drc->drc_bytes_read;
+ }
+ } else {
+ ASSERT3P(buf, ==, NULL);
+ }
+
+ drc->drc_prev_cksum = drc->drc_cksum;
+
+ drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP);
+ err = receive_read(drc, sizeof (drc->drc_next_rrd->header),
+ &drc->drc_next_rrd->header);
+ drc->drc_next_rrd->bytes_read = drc->drc_bytes_read;
+
+ if (err != 0) {
+ kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
+ drc->drc_next_rrd = NULL;
+ return (err);
+ }
+ if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) {
+ kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
+ drc->drc_next_rrd = NULL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Note: checksum is of everything up to but not including the
+ * checksum itself.
+ */
+ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+ receive_cksum(drc,
+ offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ &drc->drc_next_rrd->header);
+
+ zio_cksum_t cksum_orig =
+ drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
+ zio_cksum_t *cksump =
+ &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
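+ /*
+ * cksum_orig preserves the on-the-wire checksum bytes so they can be
+ * folded into the running checksum after verification below.
+ */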
+
+ if (drc->drc_byteswap)
+ byteswap_record(&drc->drc_next_rrd->header);
+
+ if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
+ !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) {
+ kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
+ drc->drc_next_rrd = NULL;
+ return (SET_ERROR(ECKSUM));
+ }
+
+ receive_cksum(drc, sizeof (cksum_orig), &cksum_orig);
+
+ return (0);
+}
+
+/*
+ * Issue the prefetch reads for any necessary indirect blocks.
+ *
+ * We use the object ignore list to tell us whether or not to issue prefetches
+ * for a given object. We do this for both correctness (in case the blocksize
+ * of an object has changed) and performance (if the object doesn't exist, don't
+ * needlessly try to issue prefetches). We also trim the list as we go through
+ * the stream to prevent it from growing to an unbounded size.
+ *
+ * The object numbers within will always be in sorted order, and any write
+ * records we see will also be in sorted order, but they're not sorted with
+ * respect to each other (i.e. we can get several object records before
+ * receiving each object's write records). As a result, once we've reached a
+ * given object number, we can safely remove any reference to lower object
+ * numbers in the ignore list. In practice, we receive up to 32 object records
+ * before receiving write records, so the list can have up to 32 nodes in it.
+ */
+/* ARGSUSED */
+static void
+receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset,
+ uint64_t length)
+{
+ if (!objlist_exists(drc->drc_ignore_objlist, object)) {
+ dmu_prefetch(drc->drc_os, object, 1, offset, length,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+}
+
+/*
+ * Read records off the stream, issuing any necessary prefetches.
+ */
+static int
+receive_read_record(dmu_recv_cookie_t *drc)
+{
+ int err;
+
+ switch (drc->drc_rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro =
+ &drc->drc_rrd->header.drr_u.drr_object;
+ uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro);
+ void *buf = NULL;
+ dmu_object_info_t doi;
+
+ if (size != 0)
+ buf = kmem_zalloc(size, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(drc, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
+ return (err);
+ }
+ err = dmu_object_info(drc->drc_os, drro->drr_object, &doi);
+ /*
+ * See receive_read_prefetch for an explanation why we're
+ * storing this object in the ignore_obj_list.
+ */
+ if (err == ENOENT || err == EEXIST ||
+ (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
+ objlist_insert(drc->drc_ignore_objlist,
+ drro->drr_object);
+ err = 0;
+ }
+ return (err);
+ }
+ case DRR_FREEOBJECTS:
+ {
+ err = receive_read_payload_and_next_header(drc, 0, NULL);
+ return (err);
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write;
+ int size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+ abd_t *abd = abd_alloc_linear(size, B_FALSE);
+ err = receive_read_payload_and_next_header(drc, size,
+ abd_to_buf(abd));
+ if (err != 0) {
+ abd_free(abd);
+ return (err);
+ }
+ drc->drc_rrd->abd = abd;
+ receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset,
+ drrw->drr_logical_size);
+ return (err);
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &drc->drc_rrd->header.drr_u.drr_write_embedded;
+ uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
+ void *buf = kmem_zalloc(size, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(drc, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
+ return (err);
+ }
+
+ receive_read_prefetch(drc, drrwe->drr_object, drrwe->drr_offset,
+ drrwe->drr_length);
+ return (err);
+ }
+ case DRR_FREE:
+ case DRR_REDACT:
+ {
+ /*
+ * It might be beneficial to prefetch indirect blocks here, but
+ * we don't really have the data to decide for sure.
+ */
+ err = receive_read_payload_and_next_header(drc, 0, NULL);
+ return (err);
+ }
+ case DRR_END:
+ {
+ struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end;
+ if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum,
+ drre->drr_checksum))
+ return (SET_ERROR(ECKSUM));
+ return (0);
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill;
+ int size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+ abd_t *abd = abd_alloc_linear(size, B_FALSE);
+ err = receive_read_payload_and_next_header(drc, size,
+ abd_to_buf(abd));
+ if (err != 0)
+ abd_free(abd);
+ else
+ drc->drc_rrd->abd = abd;
+ return (err);
+ }
+ case DRR_OBJECT_RANGE:
+ {
+ err = receive_read_payload_and_next_header(drc, 0, NULL);
+ return (err);
+ }
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+}
+
+static void
+dprintf_drr(struct receive_record_arg *rrd, int err)
+{
+#ifdef ZFS_DEBUG
+ switch (rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &rrd->header.drr_u.drr_object;
+ dprintf("drr_type = OBJECT obj = %llu type = %u "
+ "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u "
+ "compress = %u dn_slots = %u err = %d\n",
+ drro->drr_object, drro->drr_type, drro->drr_bonustype,
+ drro->drr_blksz, drro->drr_bonuslen,
+ drro->drr_checksumtype, drro->drr_compress,
+ drro->drr_dn_slots, err);
+ break;
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects *drrfo =
+ &rrd->header.drr_u.drr_freeobjects;
+ dprintf("drr_type = FREEOBJECTS firstobj = %llu "
+ "numobjs = %llu err = %d\n",
+ drrfo->drr_firstobj, drrfo->drr_numobjs, err);
+ break;
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+ dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu "
+ "lsize = %llu cksumtype = %u flags = %u "
+ "compress = %u psize = %llu err = %d\n",
+ drrw->drr_object, drrw->drr_type, drrw->drr_offset,
+ drrw->drr_logical_size, drrw->drr_checksumtype,
+ drrw->drr_flags, drrw->drr_compressiontype,
+ drrw->drr_compressed_size, err);
+ break;
+ }
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref *drrwbr =
+ &rrd->header.drr_u.drr_write_byref;
+ dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu "
+ "length = %llu toguid = %llx refguid = %llx "
+ "refobject = %llu refoffset = %llu cksumtype = %u "
+ "flags = %u err = %d\n",
+ drrwbr->drr_object, drrwbr->drr_offset,
+ drrwbr->drr_length, drrwbr->drr_toguid,
+ drrwbr->drr_refguid, drrwbr->drr_refobject,
+ drrwbr->drr_refoffset, drrwbr->drr_checksumtype,
+ drrwbr->drr_flags, err);
+ break;
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &rrd->header.drr_u.drr_write_embedded;
+ dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu "
+ "length = %llu compress = %u etype = %u lsize = %u "
+ "psize = %u err = %d\n",
+ drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length,
+ drrwe->drr_compression, drrwe->drr_etype,
+ drrwe->drr_lsize, drrwe->drr_psize, err);
+ break;
+ }
+ case DRR_FREE:
+ {
+ struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+ dprintf("drr_type = FREE obj = %llu offset = %llu "
+ "length = %lld err = %d\n",
+ drrf->drr_object, drrf->drr_offset, drrf->drr_length,
+ err);
+ break;
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+ dprintf("drr_type = SPILL obj = %llu length = %llu "
+ "err = %d\n", drrs->drr_object, drrs->drr_length, err);
+ break;
+ }
+ case DRR_OBJECT_RANGE:
+ {
+ struct drr_object_range *drror =
+ &rrd->header.drr_u.drr_object_range;
+ dprintf("drr_type = OBJECT_RANGE firstobj = %llu "
+ "numslots = %llu flags = %u err = %d\n",
+ drror->drr_firstobj, drror->drr_numslots,
+ drror->drr_flags, err);
+ break;
+ }
+ default:
+ return;
+ }
+#endif
+}
+
+/*
+ * Commit the records to the pool.
+ */
+static int
+receive_process_record(struct receive_writer_arg *rwa,
+ struct receive_record_arg *rrd)
+{
+ int err;
+
+ /* Processing in order, therefore bytes_read should be increasing. */
+ ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
+ rwa->bytes_read = rrd->bytes_read;
+
+ if (rrd->header.drr_type != DRR_WRITE) {
+ err = flush_write_batch(rwa);
+ if (err != 0) {
+ if (rrd->abd != NULL) {
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ rrd->payload = NULL;
+ } else if (rrd->payload != NULL) {
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ }
+
+ return (err);
+ }
+ }
+
+ switch (rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &rrd->header.drr_u.drr_object;
+ err = receive_object(rwa, drro, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ break;
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects *drrfo =
+ &rrd->header.drr_u.drr_freeobjects;
+ err = receive_freeobjects(rwa, drrfo);
+ break;
+ }
+ case DRR_WRITE:
+ {
+ err = receive_process_write_record(rwa, rrd);
+ if (err != EAGAIN) {
+ /*
+ * On success, receive_process_write_record() returns
+ * EAGAIN to indicate that we do not want to free
+ * the rrd or arc_buf.
+ */
+ ASSERT(err != 0);
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ }
+ break;
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &rrd->header.drr_u.drr_write_embedded;
+ err = receive_write_embedded(rwa, drrwe, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ break;
+ }
+ case DRR_FREE:
+ {
+ struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+ err = receive_free(rwa, drrf);
+ break;
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+ err = receive_spill(rwa, drrs, rrd->abd);
+ if (err != 0)
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ rrd->payload = NULL;
+ break;
+ }
+ case DRR_OBJECT_RANGE:
+ {
+ struct drr_object_range *drror =
+ &rrd->header.drr_u.drr_object_range;
+ err = receive_object_range(rwa, drror);
+ break;
+ }
+ case DRR_REDACT:
+ {
+ struct drr_redact *drrr = &rrd->header.drr_u.drr_redact;
+ err = receive_redact(rwa, drrr);
+ break;
+ }
+ default:
+ err = (SET_ERROR(EINVAL));
+ }
+
+ if (err != 0)
+ dprintf_drr(rrd, err);
+
+ return (err);
+}
+
+/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record. When we're done, signal the main thread and exit.
+ */
+static void
+receive_writer_thread(void *arg)
+{
+ struct receive_writer_arg *rwa = arg;
+ struct receive_record_arg *rrd;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+ rrd = bqueue_dequeue(&rwa->q)) {
+ /*
+ * If there's an error, the main thread will stop putting things
+ * on the queue, but we need to clear everything in it before we
+ * can exit.
+ */
+ int err = 0;
+ if (rwa->err == 0) {
+ err = receive_process_record(rwa, rrd);
+ } else if (rrd->abd != NULL) {
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ rrd->payload = NULL;
+ } else if (rrd->payload != NULL) {
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ }
+ /*
+ * EAGAIN indicates that this record has been saved (on
+ * rwa->write_batch), and will be used again, so we don't
+ * free it.
+ */
+ if (err != EAGAIN) {
+ if (rwa->err == 0)
+ rwa->err = err;
+ kmem_free(rrd, sizeof (*rrd));
+ }
+ }
+ kmem_free(rrd, sizeof (*rrd));
+
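+ /* Flush any writes still batched when the end-of-stream marker arrived. */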
+ int err = flush_write_batch(rwa);
+ if (rwa->err == 0)
+ rwa->err = err;
+
+ mutex_enter(&rwa->mutex);
+ rwa->done = B_TRUE;
+ cv_signal(&rwa->cv);
+ mutex_exit(&rwa->mutex);
+ spl_fstrans_unmark(cookie);
+ thread_exit();
+}
+
+static int
+resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl)
+{
+ uint64_t val;
+ objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset;
+ uint64_t dsobj = dmu_objset_id(drc->drc_os);
+ uint64_t resume_obj, resume_off;
+
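+ /*
+ * The resume (object, offset) carried in the stream's BEGIN nvlist must
+ * match the values last recorded in the dataset's resume ZAP entries.
+ */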
+ if (nvlist_lookup_uint64(begin_nvl,
+ "resume_object", &resume_obj) != 0 ||
+ nvlist_lookup_uint64(begin_nvl,
+ "resume_offset", &resume_off) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
+ if (resume_obj != val)
+ return (SET_ERROR(EINVAL));
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
+ if (resume_off != val)
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
+/*
+ * Read in the stream's records, one by one, and apply them to the pool. There
+ * are two threads involved; the thread that calls this function will spin up a
+ * worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks. It will then push the records
+ * onto an internal blocking queue. The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU. This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
+ * NB: callers *must* call dmu_recv_end() if this succeeds.
+ */
+int
+dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp)
+{
+ int err = 0;
+ struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP);
+
+ if (dsl_dataset_is_zapified(drc->drc_ds)) {
+ uint64_t bytes;
+ (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
+ drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
+ sizeof (bytes), 1, &bytes);
+ drc->drc_bytes_read += bytes;
+ }
+
+ drc->drc_ignore_objlist = objlist_create();
+
+ /* these were verified in dmu_recv_begin */
+ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
+ DMU_SUBSTREAM);
+ ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
+
+ ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
+ ASSERT0(drc->drc_os->os_encrypted &&
+ (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA));
+
+ /* handle DSL encryption key payload */
+ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
+ nvlist_t *keynvl = NULL;
+
+ ASSERT(drc->drc_os->os_encrypted);
+ ASSERT(drc->drc_raw);
+
+ err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata",
+ &keynvl);
+ if (err != 0)
+ goto out;
+
+ /*
+ * If this is a new dataset we set the key immediately.
+ * Otherwise we don't want to change the key until we
+ * are sure the rest of the receive succeeded so we stash
+ * the keynvl away until then.
+ */
+ err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa),
+ drc->drc_ds->ds_object, drc->drc_fromsnapobj,
+ drc->drc_drrb->drr_type, keynvl, drc->drc_newfs);
+ if (err != 0)
+ goto out;
+
+ /* see comment in dmu_recv_end_sync() */
+ drc->drc_ivset_guid = 0;
+ (void) nvlist_lookup_uint64(keynvl, "to_ivset_guid",
+ &drc->drc_ivset_guid);
+
+ if (!drc->drc_newfs)
+ drc->drc_keynvl = fnvlist_dup(keynvl);
+ }
+
+ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+ err = resume_check(drc, drc->drc_begin_nvl);
+ if (err != 0)
+ goto out;
+ }
+
+ /*
+ * If we failed before this point we will clean up any new resume
+ * state that was created. Now that we've gotten past the initial
+ * checks we are ok to retain that resume state.
+ */
+ drc->drc_should_save = B_TRUE;
+
+ (void) bqueue_init(&rwa->q, zfs_recv_queue_ff,
+ MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct receive_record_arg, node));
+ cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
+ rwa->os = drc->drc_os;
+ rwa->byteswap = drc->drc_byteswap;
+ rwa->resumable = drc->drc_resumable;
+ rwa->raw = drc->drc_raw;
+ rwa->spill = drc->drc_spill;
+ rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0);
+ rwa->os->os_raw_receive = drc->drc_raw;
+ list_create(&rwa->write_batch, sizeof (struct receive_record_arg),
+ offsetof(struct receive_record_arg, node.bqn_node));
+
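+ /* Start the writer thread that drains rwa->q and applies the records. */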
+ (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
+ TS_RUN, minclsyspri);
+ /*
+ * We're reading rwa->err without locks, which is safe since we are the
+ * only reader, and the worker thread is the only writer. It's ok if we
+ * miss a write for an iteration or two of the loop, since the writer
+ * thread will keep freeing records we send it until we send it an eos
+ * marker.
+ *
+ * We can leave this loop in 3 ways: First, if rwa->err is
+ * non-zero. In that case, the writer thread will free the rrd we just
+ * pushed. Second, if we're interrupted; in that case, either it's the
+ * first loop and drc->drc_rrd was never allocated, or it's later, and
+ * drc->drc_rrd has been handed off to the writer thread who will free
+ * it. Finally, if receive_read_record fails or we're at the end of the
+ * stream, then we free drc->drc_rrd and exit.
+ */
+ while (rwa->err == 0) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ err = SET_ERROR(EINTR);
+ break;
+ }
+
+ ASSERT3P(drc->drc_rrd, ==, NULL);
+ drc->drc_rrd = drc->drc_next_rrd;
+ drc->drc_next_rrd = NULL;
+ /* Allocates and loads header into drc->drc_next_rrd */
+ err = receive_read_record(drc);
+
+ if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) {
+ kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd));
+ drc->drc_rrd = NULL;
+ break;
+ }
+
+ bqueue_enqueue(&rwa->q, drc->drc_rrd,
+ sizeof (struct receive_record_arg) +
+ drc->drc_rrd->payload_size);
+ drc->drc_rrd = NULL;
+ }
+
+ ASSERT3P(drc->drc_rrd, ==, NULL);
+ drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP);
+ drc->drc_rrd->eos_marker = B_TRUE;
+ bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1);
+
+ mutex_enter(&rwa->mutex);
+ while (!rwa->done) {
+ /*
+ * We need to use cv_wait_sig() so that any process that may
+ * be sleeping here can still fork.
+ */
+ (void) cv_wait_sig(&rwa->cv, &rwa->mutex);
+ }
+ mutex_exit(&rwa->mutex);
+
+ /*
+ * If we are receiving a full stream as a clone, all object IDs which
+ * are greater than the maximum ID referenced in the stream are
+ * by definition unused and must be freed.
+ */
+ if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
+ uint64_t obj = rwa->max_object + 1;
+ int free_err = 0;
+ int next_err = 0;
+
+ while (next_err == 0) {
+ free_err = dmu_free_long_object(rwa->os, obj);
+ if (free_err != 0 && free_err != ENOENT)
+ break;
+
+ next_err = dmu_object_next(rwa->os, &obj, FALSE, 0);
+ }
+
+ if (err == 0) {
+ if (free_err != 0 && free_err != ENOENT)
+ err = free_err;
+ else if (next_err != ESRCH)
+ err = next_err;
+ }
+ }
+
+ cv_destroy(&rwa->cv);
+ mutex_destroy(&rwa->mutex);
+ bqueue_destroy(&rwa->q);
+ list_destroy(&rwa->write_batch);
+ if (err == 0)
+ err = rwa->err;
+
+out:
+ /*
+ * If we hit an error before we started the receive_writer_thread
+ * we need to clean up the next_rrd we created by processing the
+ * DRR_BEGIN record.
+ */
+ if (drc->drc_next_rrd != NULL)
+ kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
+
+ /*
+ * The objset will be invalidated by dmu_recv_end() when we do
+ * dsl_dataset_clone_swap_sync_impl().
+ */
+ drc->drc_os = NULL;
+
+ kmem_free(rwa, sizeof (*rwa));
+ nvlist_free(drc->drc_begin_nvl);
+
+ if (err != 0) {
+ /*
+ * Clean up references. If receive is not resumable,
+ * destroy what we created, so we don't leave it in
+ * an inconsistent state.
+ */
+ dmu_recv_cleanup_ds(drc);
+ nvlist_free(drc->drc_keynvl);
+ }
+
+ objlist_destroy(drc->drc_ignore_objlist);
+ drc->drc_ignore_objlist = NULL;
+ *voffp = drc->drc_voff;
+ return (err);
+}
+
+static int
+dmu_recv_end_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_cookie_t *drc = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int error;
+
+ ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
+
+ if (!drc->drc_newfs) {
+ dsl_dataset_t *origin_head;
+
+ error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
+ if (error != 0)
+ return (error);
+ if (drc->drc_force) {
+ /*
+ * We will destroy any snapshots in tofs (i.e. before
+ * origin_head) that are after the origin (which is
+ * the snap before drc_ds, because drc_ds can not
+ * have any snaps of its own).
+ */
+ uint64_t obj;
+
+ obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+ while (obj !=
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+ dsl_dataset_t *snap;
+ error = dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap);
+ if (error != 0)
+ break;
+ if (snap->ds_dir != origin_head->ds_dir)
+ error = SET_ERROR(EINVAL);
+ if (error == 0) {
+ error = dsl_destroy_snapshot_check_impl(
+ snap, B_FALSE);
+ }
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_dataset_rele(snap, FTAG);
+ if (error != 0)
+ break;
+ }
+ if (error != 0) {
+ dsl_dataset_rele(origin_head, FTAG);
+ return (error);
+ }
+ }
+ if (drc->drc_keynvl != NULL) {
+ error = dsl_crypto_recv_raw_key_check(drc->drc_ds,
+ drc->drc_keynvl, tx);
+ if (error != 0) {
+ dsl_dataset_rele(origin_head, FTAG);
+ return (error);
+ }
+ }
+
+ error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
+ origin_head, drc->drc_force, drc->drc_owner, tx);
+ if (error != 0) {
+ dsl_dataset_rele(origin_head, FTAG);
+ return (error);
+ }
+ error = dsl_dataset_snapshot_check_impl(origin_head,
+ drc->drc_tosnap, tx, B_TRUE, 1,
+ drc->drc_cred, drc->drc_proc);
+ dsl_dataset_rele(origin_head, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
+ } else {
+ error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
+ drc->drc_tosnap, tx, B_TRUE, 1,
+ drc->drc_cred, drc->drc_proc);
+ }
+ return (error);
+}
+
+static void
+dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_cookie_t *drc = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;
+ uint64_t newsnapobj;
+
+ spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
+ tx, "snap=%s", drc->drc_tosnap);
+ drc->drc_ds->ds_objset->os_raw_receive = B_FALSE;
+
+ if (!drc->drc_newfs) {
+ dsl_dataset_t *origin_head;
+
+ VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
+ &origin_head));
+
+ if (drc->drc_force) {
+ /*
+ * Destroy any snapshots of drc_tofs (origin_head)
+ * after the origin (the snap before drc_ds).
+ */
+ uint64_t obj;
+
+ obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+ while (obj !=
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+ dsl_dataset_t *snap;
+ VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap));
+ ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_destroy_snapshot_sync_impl(snap,
+ B_FALSE, tx);
+ dsl_dataset_rele(snap, FTAG);
+ }
+ }
+ if (drc->drc_keynvl != NULL) {
+ dsl_crypto_recv_raw_key_sync(drc->drc_ds,
+ drc->drc_keynvl, tx);
+ nvlist_free(drc->drc_keynvl);
+ drc->drc_keynvl = NULL;
+ }
+
+ VERIFY3P(drc->drc_ds->ds_prev, ==,
+ origin_head->ds_prev);
+
+ dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
+ origin_head, tx);
+ /*
+ * The objset was evicted by dsl_dataset_clone_swap_sync_impl,
+ * so drc_os is no longer valid.
+ */
+ drc->drc_os = NULL;
+
+ dsl_dataset_snapshot_sync_impl(origin_head,
+ drc->drc_tosnap, tx);
+
+ /* set snapshot's creation time and guid */
+ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
+ dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
+ drc->drc_drrb->drr_creation_time;
+ dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
+ drc->drc_drrb->drr_toguid;
+ dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
+ ~DS_FLAG_INCONSISTENT;
+
+ dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+ dsl_dataset_phys(origin_head)->ds_flags &=
+ ~DS_FLAG_INCONSISTENT;
+
+ newsnapobj =
+ dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+
+ dsl_dataset_rele(origin_head, FTAG);
+ dsl_destroy_head_sync_impl(drc->drc_ds, tx);
+
+ if (drc->drc_owner != NULL)
+ VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
+ } else {
+ dsl_dataset_t *ds = drc->drc_ds;
+
+ dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
+
+ /* set snapshot's creation time and guid */
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
+ drc->drc_drrb->drr_creation_time;
+ dsl_dataset_phys(ds->ds_prev)->ds_guid =
+ drc->drc_drrb->drr_toguid;
+ dsl_dataset_phys(ds->ds_prev)->ds_flags &=
+ ~DS_FLAG_INCONSISTENT;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ if (dsl_dataset_has_resume_receive_state(ds)) {
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OBJECT, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OFFSET, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_BYTES, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx);
+ }
+ newsnapobj =
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
+ }
+
+ /*
+ * If this is a raw receive, the crypt_keydata nvlist will include
+ * a to_ivset_guid for us to set on the new snapshot. This value
+ * will override the value generated by the snapshot code. However,
+	 * this value may not be present, because older implementations of
+	 * the raw send code did not include it. Such streams can still be
+	 * received if the zfs_disable_ivset_guid_check tunable is set, in
+	 * which case we keep the newly-generated value.
+ */
+ if (drc->drc_raw && drc->drc_ivset_guid != 0) {
+ dmu_object_zapify(dp->dp_meta_objset, newsnapobj,
+ DMU_OT_DSL_DATASET, tx);
+ VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj,
+ DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1,
+ &drc->drc_ivset_guid, tx));
+ }
+
+ /*
+ * Release the hold from dmu_recv_begin. This must be done before
+ * we return to open context, so that when we free the dataset's dnode
+ * we can evict its bonus buffer. Since the dataset may be destroyed
+ * at this point (and therefore won't have a valid pointer to the spa)
+ * we release the key mapping manually here while we do have a valid
+ * pointer, if it exists.
+ */
+ if (!drc->drc_raw && encrypted) {
+ (void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa,
+ drc->drc_ds->ds_object, drc->drc_ds);
+ }
+ dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag);
+ drc->drc_ds = NULL;
+}
+
+static int dmu_recv_end_modified_blocks = 3;
+
+static int
+dmu_recv_existing_end(dmu_recv_cookie_t *drc)
+{
+#ifdef _KERNEL
+ /*
+ * We will be destroying the ds; make sure its origin is unmounted if
+ * necessary.
+ */
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(drc->drc_ds, name);
+ zfs_destroy_unmount_origin(name);
+#endif
+
+ return (dsl_sync_task(drc->drc_tofs,
+ dmu_recv_end_check, dmu_recv_end_sync, drc,
+ dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
+}
+
+static int
+dmu_recv_new_end(dmu_recv_cookie_t *drc)
+{
+ return (dsl_sync_task(drc->drc_tofs,
+ dmu_recv_end_check, dmu_recv_end_sync, drc,
+ dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
+{
+ int error;
+
+ drc->drc_owner = owner;
+
+ if (drc->drc_newfs)
+ error = dmu_recv_new_end(drc);
+ else
+ error = dmu_recv_existing_end(drc);
+
+ if (error != 0) {
+ dmu_recv_cleanup_ds(drc);
+ nvlist_free(drc->drc_keynvl);
+ } else {
+ if (drc->drc_newfs) {
+ zvol_create_minor(drc->drc_tofs);
+ }
+ char *snapname = kmem_asprintf("%s@%s",
+ drc->drc_tofs, drc->drc_tosnap);
+ zvol_create_minor(snapname);
+ kmem_strfree(snapname);
+ }
+ return (error);
+}
+
+/*
+ * Return TRUE if this objset is currently being received into.
+ */
+boolean_t
+dmu_objset_is_receiving(objset_t *os)
+{
+ return (os->os_dsl_dataset != NULL &&
+ os->os_dsl_dataset->ds_owner == dmu_recv_tag);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, INT, ZMOD_RW,
+ "Maximum receive queue length");
+
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, INT, ZMOD_RW,
+ "Receive queue fill fraction");
+
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, INT, ZMOD_RW,
+	"Maximum number of writes to batch into one transaction");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c
new file mode 100644
index 000000000000..62c7d01d4bd2
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c
@@ -0,0 +1,1199 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/txg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_redact.h>
+#include <sys/bqueue.h>
+#include <sys/objlist.h>
+#include <sys/dmu_tx.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#endif
+
+/*
+ * This controls the number of entries in the buffer the redaction_list_update
+ * synctask uses to buffer writes to the redaction list.
+ */
+int redact_sync_bufsize = 1024;
+
+/*
+ * Controls how often to update the redaction list when creating a redaction
+ * list.
+ */
+uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */
+
+/*
+ * This tunable controls the length of the queues that zfs redact worker threads
+ * use to communicate. If the dmu_redact_snap thread is blocking on these
+ * queues, this variable may need to be increased. If there is a significant
+ * slowdown at the start of a redact operation as these threads consume all the
+ * available IO resources, or the queues are consuming too much memory, this
+ * variable may need to be decreased.
+ */
+int zfs_redact_queue_length = 1024 * 1024;
+
+/*
+ * This tunable controls the fill fraction of the queues used by zfs redact.
+ * The fill fraction controls the frequency with which threads have to be
+ * cv_signaled. If a lot of cpu time is being spent on cv_signal, then this
+ * should be tuned down. If the queues empty before the signaled thread can
+ * catch up, then this should be tuned up.
+ */
+uint64_t zfs_redact_queue_ff = 20;
+
+struct redact_record {
+ bqueue_node_t ln;
+ boolean_t eos_marker; /* Marks the end of the stream */
+ uint64_t start_object;
+ uint64_t start_blkid;
+ uint64_t end_object;
+ uint64_t end_blkid;
+ uint8_t indblkshift;
+ uint32_t datablksz;
+};
+
+struct redact_thread_arg {
+ bqueue_t q;
+ objset_t *os; /* Objset to traverse */
+ dsl_dataset_t *ds; /* Dataset to traverse */
+ struct redact_record *current_record;
+ int error_code;
+ boolean_t cancel;
+ zbookmark_phys_t resume;
+ objlist_t *deleted_objs;
+ uint64_t *num_blocks_visited;
+ uint64_t ignore_object; /* ignore further callbacks on this */
+ uint64_t txg; /* txg to traverse since */
+};
+
+/*
+ * The redaction node is a wrapper around the redaction record that is used
+ * by the redaction merging thread to sort the records and determine overlaps.
+ *
+ * It contains two nodes; one sorts the records by their start_zb, and the other
+ * sorts the records by their end_zb.
+ */
+struct redact_node {
+ avl_node_t avl_node_start;
+ avl_node_t avl_node_end;
+ struct redact_record *record;
+ struct redact_thread_arg *rt_arg;
+ uint32_t thread_num;
+};
+
+struct merge_data {
+ list_t md_redact_block_pending;
+ redact_block_phys_t md_coalesce_block;
+ uint64_t md_last_time;
+ redact_block_phys_t md_furthest[TXG_SIZE];
+ /* Lists of struct redact_block_list_node. */
+ list_t md_blocks[TXG_SIZE];
+ boolean_t md_synctask_txg[TXG_SIZE];
+ uint64_t md_latest_synctask_txg;
+ redaction_list_t *md_redaction_list;
+};
+
+/*
+ * A wrapper around struct redact_block so it can be stored in a list_t.
+ */
+struct redact_block_list_node {
+ redact_block_phys_t block;
+ list_node_t node;
+};
+
+/*
+ * We've found a new redaction candidate. In order to improve performance, we
+ * coalesce these blocks when they're adjacent to each other. This function
+ * handles that. If the new candidate block range is immediately after the
+ * range we're building, coalesce it into the range we're building. Otherwise,
+ * put the record we're building on the queue, and update the build pointer to
+ * point to the new record.
+ */
+static void
+record_merge_enqueue(bqueue_t *q, struct redact_record **build,
+ struct redact_record *new)
+{
+ if (new->eos_marker) {
+ if (*build != NULL)
+			bqueue_enqueue(q, *build, sizeof (**build));
+ bqueue_enqueue_flush(q, new, sizeof (*new));
+ return;
+ }
+ if (*build == NULL) {
+ *build = new;
+ return;
+ }
+ struct redact_record *curbuild = *build;
+ if ((curbuild->end_object == new->start_object &&
+ curbuild->end_blkid + 1 == new->start_blkid &&
+ curbuild->end_blkid != UINT64_MAX) ||
+ (curbuild->end_object + 1 == new->start_object &&
+ curbuild->end_blkid == UINT64_MAX && new->start_blkid == 0)) {
+ curbuild->end_object = new->end_object;
+ curbuild->end_blkid = new->end_blkid;
+ kmem_free(new, sizeof (*new));
+ } else {
+ bqueue_enqueue(q, curbuild, sizeof (*curbuild));
+ *build = new;
+ }
+}
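+
+/*
+ * A minimal illustrative sketch of the adjacency test above; the example_*
+ * name is hypothetical and is not used elsewhere in this file. Two ranges
+ * coalesce when the new one begins exactly one block after the current one
+ * ends, including the case where the current range ends at the last block
+ * of an object and the new one begins at block 0 of the next object.
+ */
+static inline boolean_t
+example_ranges_adjacent(const struct redact_record *cur,
+    const struct redact_record *next)
+{
+	return ((cur->end_object == next->start_object &&
+	    cur->end_blkid + 1 == next->start_blkid &&
+	    cur->end_blkid != UINT64_MAX) ||
+	    (cur->end_object + 1 == next->start_object &&
+	    cur->end_blkid == UINT64_MAX && next->start_blkid == 0));
+}
+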
+#ifdef _KERNEL
+struct objnode {
+ avl_node_t node;
+ uint64_t obj;
+};
+
+static int
+objnode_compare(const void *o1, const void *o2)
+{
+ const struct objnode *obj1 = o1;
+ const struct objnode *obj2 = o2;
+ if (obj1->obj < obj2->obj)
+ return (-1);
+ if (obj1->obj > obj2->obj)
+ return (1);
+ return (0);
+}
+
+
+static objlist_t *
+zfs_get_deleteq(objset_t *os)
+{
+ objlist_t *deleteq_objlist = objlist_create();
+ uint64_t deleteq_obj;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ dmu_object_info_t doi;
+
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi));
+ ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE);
+
+ VERIFY0(zap_lookup(os, MASTER_NODE_OBJ,
+ ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+
+ /*
+ * In order to insert objects into the objlist, they must be in sorted
+ * order. We don't know what order we'll get them out of the ZAP in, so
+ * we insert them into and remove them from an avl_tree_t to sort them.
+ */
+ avl_tree_t at;
+ avl_create(&at, objnode_compare, sizeof (struct objnode),
+ offsetof(struct objnode, node));
+
+ for (zap_cursor_init(&zc, os, deleteq_obj);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+ struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP);
+ obj->obj = za.za_first_integer;
+ avl_add(&at, obj);
+ }
+ zap_cursor_fini(&zc);
+
+ struct objnode *next, *found = avl_first(&at);
+ while (found != NULL) {
+ next = AVL_NEXT(&at, found);
+ objlist_insert(deleteq_objlist, found->obj);
+ found = next;
+ }
+
+ void *cookie = NULL;
+ while ((found = avl_destroy_nodes(&at, &cookie)) != NULL)
+ kmem_free(found, sizeof (*found));
+ avl_destroy(&at);
+ return (deleteq_objlist);
+}
+#endif
+
+/*
+ * This is the callback function to traverse_dataset for the redaction threads
+ * for dmu_redact_snap. This thread is responsible for creating redaction
+ * records for all the data that is modified by the snapshots we're redacting
+ * with respect to. Redaction records represent ranges of data that have been
+ * modified by one of the redaction snapshots, and are stored in the
+ * redact_record struct. We need to create redaction records for three
+ * cases:
+ *
+ * First, if there's a normal write, we need to create a redaction record for
+ * that block.
+ *
+ * Second, if there's a hole, we need to create a redaction record that covers
+ * the whole range of the hole. If the hole is in the meta-dnode, it must cover
+ * every block in all of the objects in the hole.
+ *
+ * Third, if there is a deleted object, we need to create a redaction record for
+ * all of the blocks in that object.
+ */
+/*ARGSUSED*/
+static int
+redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+ struct redact_thread_arg *rta = arg;
+ struct redact_record *record;
+
+ ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+ zb->zb_object >= rta->resume.zb_object);
+
+ if (rta->cancel)
+ return (SET_ERROR(EINTR));
+
+ if (rta->ignore_object == zb->zb_object)
+ return (0);
+
+ /*
+ * If we're visiting a dnode, we need to handle the case where the
+ * object has been deleted.
+ */
+ if (zb->zb_level == ZB_DNODE_LEVEL) {
+ ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+
+ if (zb->zb_object == 0)
+ return (0);
+
+ /*
+ * If the object has been deleted, redact all of the blocks in
+ * it.
+ */
+ if (dnp->dn_type == DMU_OT_NONE ||
+ objlist_exists(rta->deleted_objs, zb->zb_object)) {
+ rta->ignore_object = zb->zb_object;
+ record = kmem_zalloc(sizeof (struct redact_record),
+ KM_SLEEP);
+
+ record->eos_marker = B_FALSE;
+ record->start_object = record->end_object =
+ zb->zb_object;
+ record->start_blkid = 0;
+ record->end_blkid = UINT64_MAX;
+ record_merge_enqueue(&rta->q,
+ &rta->current_record, record);
+ }
+ return (0);
+ } else if (zb->zb_level < 0) {
+ return (0);
+ } else if (zb->zb_level > 0 && !BP_IS_HOLE(bp)) {
+ /*
+ * If this is an indirect block, but not a hole, it doesn't
+ * provide any useful information for redaction, so ignore it.
+ */
+ return (0);
+ }
+
+ /*
+ * At this point, there are two options left for the type of block we're
+ * looking at. Either this is a hole (which could be in the dnode or
+ * the meta-dnode), or it's a level 0 block of some sort. If it's a
+ * hole, we create a redaction record that covers the whole range. If
+ * the hole is in a dnode, we need to redact all the blocks in that
+ * hole. If the hole is in the meta-dnode, we instead need to redact
+ * all blocks in every object covered by that hole. If it's a level 0
+ * block, we only need to redact that single block.
+ */
+ record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP);
+ record->eos_marker = B_FALSE;
+
+ record->start_object = record->end_object = zb->zb_object;
+ if (BP_IS_HOLE(bp)) {
+ record->start_blkid = zb->zb_blkid *
+ bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level);
+
+ record->end_blkid = ((zb->zb_blkid + 1) *
+ bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level)) - 1;
+
+ if (zb->zb_object == DMU_META_DNODE_OBJECT) {
+ record->start_object = record->start_blkid *
+ ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) /
+ sizeof (dnode_phys_t));
+ record->start_blkid = 0;
+ record->end_object = ((record->end_blkid +
+ 1) * ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) /
+ sizeof (dnode_phys_t))) - 1;
+ record->end_blkid = UINT64_MAX;
+ }
+ } else if (zb->zb_level != 0 ||
+ zb->zb_object == DMU_META_DNODE_OBJECT) {
+ kmem_free(record, sizeof (*record));
+ return (0);
+ } else {
+ record->start_blkid = record->end_blkid = zb->zb_blkid;
+ }
+ record->indblkshift = dnp->dn_indblkshift;
+ record->datablksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ record_merge_enqueue(&rta->q, &rta->current_record, record);
+
+ return (0);
+}
+
+static void
+redact_traverse_thread(void *arg)
+{
+ struct redact_thread_arg *rt_arg = arg;
+ int err;
+ struct redact_record *data;
+#ifdef _KERNEL
+ if (rt_arg->os->os_phys->os_type == DMU_OST_ZFS)
+ rt_arg->deleted_objs = zfs_get_deleteq(rt_arg->os);
+ else
+ rt_arg->deleted_objs = objlist_create();
+#else
+ rt_arg->deleted_objs = objlist_create();
+#endif
+
+ err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg,
+ &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+ redact_cb, rt_arg);
+
+ if (err != EINTR)
+ rt_arg->error_code = err;
+ objlist_destroy(rt_arg->deleted_objs);
+ data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+ data->eos_marker = B_TRUE;
+ record_merge_enqueue(&rt_arg->q, &rt_arg->current_record, data);
+ thread_exit();
+}
+
+static inline void
+create_zbookmark_from_obj_off(zbookmark_phys_t *zb, uint64_t object,
+ uint64_t blkid)
+{
+ zb->zb_object = object;
+ zb->zb_level = 0;
+ zb->zb_blkid = blkid;
+}
+
+/*
+ * This is a utility function that performs the comparison for either the
+ * starts or the ends of the ranges in two redact_records.
+ */
+static int
+redact_range_compare(uint64_t obj1, uint64_t off1, uint32_t dbss1,
+ uint64_t obj2, uint64_t off2, uint32_t dbss2)
+{
+ zbookmark_phys_t z1, z2;
+ create_zbookmark_from_obj_off(&z1, obj1, off1);
+ create_zbookmark_from_obj_off(&z2, obj2, off2);
+
+ return (zbookmark_compare(dbss1 >> SPA_MINBLOCKSHIFT, 0,
+ dbss2 >> SPA_MINBLOCKSHIFT, 0, &z1, &z2));
+}
+
+/*
+ * Compare two redaction records by their range's start location. Also makes
+ * eos records always compare last. We use the thread number in the redact_node
+ * to ensure that records do not compare equal (which is not allowed in our avl
+ * trees).
+ */
+static int
+redact_node_compare_start(const void *arg1, const void *arg2)
+{
+ const struct redact_node *rn1 = arg1;
+ const struct redact_node *rn2 = arg2;
+ const struct redact_record *rr1 = rn1->record;
+ const struct redact_record *rr2 = rn2->record;
+ if (rr1->eos_marker)
+ return (1);
+ if (rr2->eos_marker)
+ return (-1);
+
+ int cmp = redact_range_compare(rr1->start_object, rr1->start_blkid,
+ rr1->datablksz, rr2->start_object, rr2->start_blkid,
+ rr2->datablksz);
+ if (cmp == 0)
+ cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1);
+ return (cmp);
+}
+
+/*
+ * Compare two redaction records by their range's end location. Also makes
+ * eos records always compare last. We use the thread number in the redact_node
+ * to ensure that records do not compare equal (which is not allowed in our avl
+ * trees).
+ */
+static int
+redact_node_compare_end(const void *arg1, const void *arg2)
+{
+ const struct redact_node *rn1 = arg1;
+ const struct redact_node *rn2 = arg2;
+ const struct redact_record *srr1 = rn1->record;
+ const struct redact_record *srr2 = rn2->record;
+ if (srr1->eos_marker)
+ return (1);
+ if (srr2->eos_marker)
+ return (-1);
+
+ int cmp = redact_range_compare(srr1->end_object, srr1->end_blkid,
+ srr1->datablksz, srr2->end_object, srr2->end_blkid,
+ srr2->datablksz);
+ if (cmp == 0)
+ cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1);
+ return (cmp);
+}
+
+/*
+ * Utility function that compares two redaction records to determine if any part
+ * of the "from" record is before any part of the "to" record. Also causes End
+ * of Stream redaction records to compare after all others, so that the
+ * redaction merging logic can stay simple.
+ */
+static boolean_t
+redact_record_before(const struct redact_record *from,
+ const struct redact_record *to)
+{
+ if (from->eos_marker == B_TRUE)
+ return (B_FALSE);
+ else if (to->eos_marker == B_TRUE)
+ return (B_TRUE);
+ return (redact_range_compare(from->start_object, from->start_blkid,
+ from->datablksz, to->end_object, to->end_blkid,
+ to->datablksz) <= 0);
+}
+
+/*
+ * Pop a new redaction record off the queue, check that the records are in the
+ * right order, and free the old data.
+ */
+static struct redact_record *
+get_next_redact_record(bqueue_t *bq, struct redact_record *prev)
+{
+ struct redact_record *next = bqueue_dequeue(bq);
+ ASSERT(redact_record_before(prev, next));
+ kmem_free(prev, sizeof (*prev));
+ return (next);
+}
+
+/*
+ * Remove the given redaction node from both trees, pull a new redaction record
+ * off the queue, free the old redaction record, update the redaction node, and
+ * reinsert the node into the trees.
+ */
+static int
+update_avl_trees(avl_tree_t *start_tree, avl_tree_t *end_tree,
+ struct redact_node *redact_node)
+{
+ avl_remove(start_tree, redact_node);
+ avl_remove(end_tree, redact_node);
+ redact_node->record = get_next_redact_record(&redact_node->rt_arg->q,
+ redact_node->record);
+ avl_add(end_tree, redact_node);
+ avl_add(start_tree, redact_node);
+ return (redact_node->rt_arg->error_code);
+}
+
+/*
+ * Synctask for updating redaction lists. We first take this txg's list of
+ * redacted blocks and append those to the redaction list. We then update the
+ * redaction list's bonus buffer. We store the furthest blocks we visited and
+ * the list of snapshots that we're redacting with respect to. We need these so
+ * that redacted sends and receives can be correctly resumed.
+ */
+static void
+redaction_list_update_sync(void *arg, dmu_tx_t *tx)
+{
+ struct merge_data *md = arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ list_t *list = &md->md_blocks[txg & TXG_MASK];
+ redact_block_phys_t *furthest_visited =
+ &md->md_furthest[txg & TXG_MASK];
+ objset_t *mos = tx->tx_pool->dp_meta_objset;
+ redaction_list_t *rl = md->md_redaction_list;
+ int bufsize = redact_sync_bufsize;
+ redact_block_phys_t *buf = kmem_alloc(bufsize * sizeof (*buf),
+ KM_SLEEP);
+ int index = 0;
+
+ dmu_buf_will_dirty(rl->rl_dbuf, tx);
+
+ for (struct redact_block_list_node *rbln = list_remove_head(list);
+ rbln != NULL; rbln = list_remove_head(list)) {
+ ASSERT3U(rbln->block.rbp_object, <=,
+ furthest_visited->rbp_object);
+ ASSERT(rbln->block.rbp_object < furthest_visited->rbp_object ||
+ rbln->block.rbp_blkid <= furthest_visited->rbp_blkid);
+ buf[index] = rbln->block;
+ index++;
+ if (index == bufsize) {
+ dmu_write(mos, rl->rl_object,
+ rl->rl_phys->rlp_num_entries * sizeof (*buf),
+ bufsize * sizeof (*buf), buf, tx);
+ rl->rl_phys->rlp_num_entries += bufsize;
+ index = 0;
+ }
+ kmem_free(rbln, sizeof (*rbln));
+ }
+ if (index > 0) {
+ dmu_write(mos, rl->rl_object, rl->rl_phys->rlp_num_entries *
+ sizeof (*buf), index * sizeof (*buf), buf, tx);
+ rl->rl_phys->rlp_num_entries += index;
+ }
+ kmem_free(buf, bufsize * sizeof (*buf));
+
+ md->md_synctask_txg[txg & TXG_MASK] = B_FALSE;
+ rl->rl_phys->rlp_last_object = furthest_visited->rbp_object;
+ rl->rl_phys->rlp_last_blkid = furthest_visited->rbp_blkid;
+}
+
+static void
+commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object,
+ uint64_t blkid)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(os->os_spa)->dp_mos_dir);
+ dmu_tx_hold_space(tx, sizeof (struct redact_block_list_node));
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+ if (!md->md_synctask_txg[txg & TXG_MASK]) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx),
+ redaction_list_update_sync, md, tx);
+ md->md_synctask_txg[txg & TXG_MASK] = B_TRUE;
+ md->md_latest_synctask_txg = txg;
+ }
+ md->md_furthest[txg & TXG_MASK].rbp_object = object;
+ md->md_furthest[txg & TXG_MASK].rbp_blkid = blkid;
+ list_move_tail(&md->md_blocks[txg & TXG_MASK],
+ &md->md_redact_block_pending);
+ dmu_tx_commit(tx);
+ md->md_last_time = gethrtime();
+}
+
+/*
+ * We want to store the list of blocks that we're redacting in the bookmark's
+ * redaction list. However, this list is stored in the MOS, which means it can
+ * only be written to in syncing context. To get around this, we create a
+ * synctask that will write to the MOS for us. We tell it what to write by
+ * keeping a linked list of blocks for each transaction group; every time we
+ * decide to redact a block, we append it to the list for the transaction
+ * group that is currently in open context. We also update some progress
+ * information that the synctask will store to enable resumable redacted
+ * sends.
+ */
+static void
+update_redaction_list(struct merge_data *md, objset_t *os,
+ uint64_t object, uint64_t blkid, uint64_t endblkid, uint32_t blksz)
+{
+ boolean_t enqueue = B_FALSE;
+ redact_block_phys_t cur = {0};
+ uint64_t count = endblkid - blkid + 1;
+ while (count > REDACT_BLOCK_MAX_COUNT) {
+ update_redaction_list(md, os, object, blkid,
+ blkid + REDACT_BLOCK_MAX_COUNT - 1, blksz);
+ blkid += REDACT_BLOCK_MAX_COUNT;
+ count -= REDACT_BLOCK_MAX_COUNT;
+ }
+ redact_block_phys_t *coalesce = &md->md_coalesce_block;
+ boolean_t new;
+ if (coalesce->rbp_size_count == 0) {
+ new = B_TRUE;
+ enqueue = B_FALSE;
+ } else {
+ uint64_t old_count = redact_block_get_count(coalesce);
+ if (coalesce->rbp_object == object &&
+ coalesce->rbp_blkid + old_count == blkid &&
+ old_count + count <= REDACT_BLOCK_MAX_COUNT) {
+ ASSERT3U(redact_block_get_size(coalesce), ==, blksz);
+ redact_block_set_count(coalesce, old_count + count);
+ new = B_FALSE;
+ enqueue = B_FALSE;
+ } else {
+ new = B_TRUE;
+ enqueue = B_TRUE;
+ }
+ }
+
+ if (new) {
+ cur = *coalesce;
+ coalesce->rbp_blkid = blkid;
+ coalesce->rbp_object = object;
+
+ redact_block_set_count(coalesce, count);
+ redact_block_set_size(coalesce, blksz);
+ }
+
+ if (enqueue && redact_block_get_size(&cur) != 0) {
+ struct redact_block_list_node *rbln =
+ kmem_alloc(sizeof (struct redact_block_list_node),
+ KM_SLEEP);
+ rbln->block = cur;
+ list_insert_tail(&md->md_redact_block_pending, rbln);
+ }
+
+ if (gethrtime() > md->md_last_time +
+ redaction_list_update_interval_ns) {
+ commit_rl_updates(os, md, object, blkid);
+ }
+}
+
+/*
+ * This thread merges all the redaction records provided by the worker threads,
+ * and determines which blocks are redacted by all the snapshots. The algorithm
+ * for doing so is similar to performing a merge in mergesort with n sub-lists
+ * instead of 2, with some added complexity due to the fact that the entries are
+ * ranges, not just single blocks. This algorithm relies on the fact that the
+ * queues are sorted, which is ensured by the fact that traverse_dataset
+ * traverses the dataset in a consistent order. We pull one entry off the front
+ * of the queues of each secure dataset traversal thread. Then we repeat the
+ * following: each record represents a range of blocks modified by one of the
+ * redaction snapshots, and each block in that range may need to be redacted in
+ * the send stream. Find the record with the latest start of its range, and the
+ * record with the earliest end of its range. If the last start is before the
+ * first end, then we know that the blocks in the range [last_start, first_end]
+ * are covered by all of the ranges at the front of the queues, which means
+ * every thread redacts that whole range. For example, let's say the ranges on
+ * each queue look like this:
+ *
+ * Block Id 1 2 3 4 5 6 7 8 9 10 11
+ * Thread 1 | [====================]
+ * Thread 2 | [========]
+ * Thread 3 | [=================]
+ *
+ * Thread 3 has the last start (5), and thread 2 has the first end (6). All
+ * three threads modified the range [5,6], so that data should not be sent over
+ * the wire. After we've determined whether or not to redact anything, we take
+ * the record with the first end. We discard that record, and pull a new one
+ * off the front of the queue it came from. In the above example, we would
+ * discard Thread 2's record, and pull a new one. Let's say the next record we
+ * pulled from Thread 2 covered range [10,11]. The new layout would look like
+ * this:
+ *
+ * Block Id 1 2 3 4 5 6 7 8 9 10 11
+ * Thread 1 | [====================]
+ * Thread 2 | [==]
+ * Thread 3 | [=================]
+ *
+ * When we compare the last start (10, from Thread 2) and the first end (9, from
+ * Thread 1), we see that the last start is greater than the first end.
+ * Therefore, we do not redact anything from these records. We'll iterate by
+ * replacing the record from Thread 1.
+ *
+ * We iterate by replacing the record with the lowest end because we know
+ * that the record with the lowest end has helped us as much as it can. All the
+ * ranges before it that we will ever redact have been redacted. In addition,
+ * by replacing the one with the lowest end, we guarantee we catch all ranges
+ * that need to be redacted. For example, if in the case above we had replaced
+ * the record from Thread 1 instead, we might have ended up with the following:
+ *
+ * Block Id 1 2 3 4 5 6 7 8 9 10 11 12
+ * Thread 1 | [==]
+ * Thread 2 | [========]
+ * Thread 3 | [=================]
+ *
+ * If the next record from Thread 2 had been [8,10], for example, we should have
+ * redacted part of that range, but because we updated Thread 1's record, we
+ * missed it.
+ *
+ * We implement this algorithm by using two trees. The first sorts the
+ * redaction records by their start_zb, and the second sorts them by their
+ * end_zb. We use these to find the record with the last start and the record
+ * with the first end. We create a record with that start and end, and send it
+ * on. The overall runtime of this implementation is O(n log m), where n is the
+ * total number of redaction records from all the different redaction snapshots,
+ * and m is the number of redaction snapshots.
+ *
+ * If we redact with respect to zero snapshots, we create a single redaction
+ * record that starts at the first redactable object and block and ends at
+ * object and blkid UINT64_MAX. This results in every block being redacted.
+ */
+static int
+perform_thread_merge(bqueue_t *q, uint32_t num_threads,
+ struct redact_thread_arg *thread_args, boolean_t *cancel)
+{
+ struct redact_node *redact_nodes = NULL;
+ avl_tree_t start_tree, end_tree;
+ struct redact_record *record;
+ struct redact_record *current_record = NULL;
+ int err = 0;
+ struct merge_data md = { {0} };
+ list_create(&md.md_redact_block_pending,
+ sizeof (struct redact_block_list_node),
+ offsetof(struct redact_block_list_node, node));
+
+ /*
+ * If we're redacting with respect to zero snapshots, then no data is
+	 * permitted to be sent. We enqueue a record that redacts all
+	 * blocks; the caller (redact_merge_thread) enqueues the eos marker.
+ */
+ if (num_threads == 0) {
+ record = kmem_zalloc(sizeof (struct redact_record),
+ KM_SLEEP);
+		/* We can't redact object 0, so don't try. */
+ record->start_object = 1;
+ record->start_blkid = 0;
+ record->end_object = record->end_blkid = UINT64_MAX;
+ bqueue_enqueue(q, record, sizeof (*record));
+ return (0);
+ }
+ if (num_threads > 0) {
+ redact_nodes = kmem_zalloc(num_threads *
+ sizeof (*redact_nodes), KM_SLEEP);
+ }
+
+ avl_create(&start_tree, redact_node_compare_start,
+ sizeof (struct redact_node),
+ offsetof(struct redact_node, avl_node_start));
+ avl_create(&end_tree, redact_node_compare_end,
+ sizeof (struct redact_node),
+ offsetof(struct redact_node, avl_node_end));
+
+ for (int i = 0; i < num_threads; i++) {
+ struct redact_node *node = &redact_nodes[i];
+ struct redact_thread_arg *targ = &thread_args[i];
+ node->record = bqueue_dequeue(&targ->q);
+ node->rt_arg = targ;
+ node->thread_num = i;
+ avl_add(&start_tree, node);
+ avl_add(&end_tree, node);
+ }
+
+ /*
+ * Once the first record in the end tree has returned EOS, every record
+ * must be an EOS record, so we should stop.
+ */
+ while (err == 0 && !((struct redact_node *)avl_first(&end_tree))->
+ record->eos_marker) {
+ if (*cancel) {
+ err = EINTR;
+ break;
+ }
+ struct redact_node *last_start = avl_last(&start_tree);
+ struct redact_node *first_end = avl_first(&end_tree);
+
+ /*
+ * If the last start record is before the first end record,
+ * then we have blocks that are redacted by all threads.
+ * Therefore, we should redact them. Copy the record, and send
+ * it to the main thread.
+ */
+ if (redact_record_before(last_start->record,
+ first_end->record)) {
+ record = kmem_zalloc(sizeof (struct redact_record),
+ KM_SLEEP);
+ *record = *first_end->record;
+ record->start_object = last_start->record->start_object;
+ record->start_blkid = last_start->record->start_blkid;
+ record_merge_enqueue(q, &current_record,
+ record);
+ }
+ err = update_avl_trees(&start_tree, &end_tree, first_end);
+ }
+
+ /*
+ * We're done; if we were cancelled, we need to cancel our workers and
+ * clear out their queues. Either way, we need to remove every thread's
+ * redact_node struct from the avl trees.
+ */
+ for (int i = 0; i < num_threads; i++) {
+ if (err != 0) {
+ thread_args[i].cancel = B_TRUE;
+ while (!redact_nodes[i].record->eos_marker) {
+ (void) update_avl_trees(&start_tree, &end_tree,
+ &redact_nodes[i]);
+ }
+ }
+ avl_remove(&start_tree, &redact_nodes[i]);
+ avl_remove(&end_tree, &redact_nodes[i]);
+ kmem_free(redact_nodes[i].record,
+ sizeof (struct redact_record));
+ }
+
+ avl_destroy(&start_tree);
+ avl_destroy(&end_tree);
+ kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes));
+ if (current_record != NULL)
+		bqueue_enqueue(q, current_record, sizeof (*current_record));
+ return (err);
+}
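+
+/*
+ * A minimal sketch of the "last start vs. first end" test described in the
+ * block comment above, using its example ranges [2,9], [3,6] and [5,11].
+ * The example_* name is hypothetical and the values are hard-coded purely
+ * for illustration; the real code tracks these bounds with the two AVL
+ * trees instead of a linear scan.
+ */
+static inline boolean_t
+example_merge_step(void)
+{
+	const uint64_t starts[3] = { 2, 3, 5 };
+	const uint64_t ends[3] = { 9, 6, 11 };
+	uint64_t last_start = 0, first_end = UINT64_MAX;
+
+	for (int i = 0; i < 3; i++) {
+		if (starts[i] > last_start)
+			last_start = starts[i];
+		if (ends[i] < first_end)
+			first_end = ends[i];
+	}
+	/*
+	 * last_start is 5 and first_end is 6, so the blocks in [5,6] were
+	 * modified by every redaction snapshot and would be redacted.
+	 */
+	return (last_start <= first_end);
+}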
+
+struct redact_merge_thread_arg {
+ bqueue_t q;
+ spa_t *spa;
+ int numsnaps;
+ struct redact_thread_arg *thr_args;
+ boolean_t cancel;
+ int error_code;
+};
+
+static void
+redact_merge_thread(void *arg)
+{
+ struct redact_merge_thread_arg *rmta = arg;
+ rmta->error_code = perform_thread_merge(&rmta->q,
+ rmta->numsnaps, rmta->thr_args, &rmta->cancel);
+ struct redact_record *rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
+ rec->eos_marker = B_TRUE;
+ bqueue_enqueue_flush(&rmta->q, rec, 1);
+ thread_exit();
+}
+
+/*
+ * Find the next object in or after the redaction range passed in, and hold
+ * its dnode with the provided tag. Also update *object to contain the new
+ * object number.
+ */
+static int
+hold_next_object(objset_t *os, struct redact_record *rec, void *tag,
+ uint64_t *object, dnode_t **dn)
+{
+ int err = 0;
+ if (*dn != NULL)
+ dnode_rele(*dn, tag);
+ *dn = NULL;
+ if (*object < rec->start_object) {
+ *object = rec->start_object - 1;
+ }
+ err = dmu_object_next(os, object, B_FALSE, 0);
+ if (err != 0)
+ return (err);
+
+ err = dnode_hold(os, *object, tag, dn);
+ while (err == 0 && (*object < rec->start_object ||
+ DMU_OT_IS_METADATA((*dn)->dn_type))) {
+ dnode_rele(*dn, tag);
+ *dn = NULL;
+ err = dmu_object_next(os, object, B_FALSE, 0);
+ if (err != 0)
+ break;
+ err = dnode_hold(os, *object, tag, dn);
+ }
+ return (err);
+}
+
+static int
+perform_redaction(objset_t *os, redaction_list_t *rl,
+ struct redact_merge_thread_arg *rmta)
+{
+ int err = 0;
+ bqueue_t *q = &rmta->q;
+ struct redact_record *rec = NULL;
+ struct merge_data md = { {0} };
+
+ list_create(&md.md_redact_block_pending,
+ sizeof (struct redact_block_list_node),
+ offsetof(struct redact_block_list_node, node));
+ md.md_redaction_list = rl;
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ list_create(&md.md_blocks[i],
+ sizeof (struct redact_block_list_node),
+ offsetof(struct redact_block_list_node, node));
+ }
+ dnode_t *dn = NULL;
+ uint64_t prev_obj = 0;
+ for (rec = bqueue_dequeue(q); !rec->eos_marker && err == 0;
+ rec = get_next_redact_record(q, rec)) {
+ ASSERT3U(rec->start_object, !=, 0);
+ uint64_t object;
+ if (prev_obj != rec->start_object) {
+ object = rec->start_object - 1;
+ err = hold_next_object(os, rec, FTAG, &object, &dn);
+ } else {
+ object = prev_obj;
+ }
+ while (err == 0 && object <= rec->end_object) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ err = EINTR;
+ break;
+ }
+ /*
+ * Part of the current object is contained somewhere in
+ * the range covered by rec.
+ */
+ uint64_t startblkid;
+ uint64_t endblkid;
+ uint64_t maxblkid = dn->dn_phys->dn_maxblkid;
+
+ if (rec->start_object < object)
+ startblkid = 0;
+ else if (rec->start_blkid > maxblkid)
+ break;
+ else
+ startblkid = rec->start_blkid;
+
+ if (rec->end_object > object || rec->end_blkid >
+ maxblkid) {
+ endblkid = maxblkid;
+ } else {
+ endblkid = rec->end_blkid;
+ }
+ update_redaction_list(&md, os, object, startblkid,
+ endblkid, dn->dn_datablksz);
+
+ if (object == rec->end_object)
+ break;
+ err = hold_next_object(os, rec, FTAG, &object, &dn);
+ }
+ if (err == ESRCH)
+ err = 0;
+ if (dn != NULL)
+ prev_obj = object;
+ }
+ if (err == 0 && dn != NULL)
+ dnode_rele(dn, FTAG);
+
+ if (err == ESRCH)
+ err = 0;
+ rmta->cancel = B_TRUE;
+ while (!rec->eos_marker)
+ rec = get_next_redact_record(q, rec);
+ kmem_free(rec, sizeof (*rec));
+
+ /*
+	 * There may be a block that's still being coalesced; sync it out
+	 * before we return.
+ */
+ if (err == 0 && md.md_coalesce_block.rbp_size_count != 0) {
+ struct redact_block_list_node *rbln =
+ kmem_alloc(sizeof (struct redact_block_list_node),
+ KM_SLEEP);
+ rbln->block = md.md_coalesce_block;
+ list_insert_tail(&md.md_redact_block_pending, rbln);
+ }
+ commit_rl_updates(os, &md, UINT64_MAX, UINT64_MAX);
+
+ /*
+ * Wait for all the redaction info to sync out before we return, so that
+ * anyone who attempts to resume this redaction will have all the data
+ * they need.
+ */
+ dsl_pool_t *dp = spa_get_dsl(os->os_spa);
+ if (md.md_latest_synctask_txg != 0)
+ txg_wait_synced(dp, md.md_latest_synctask_txg);
+ for (int i = 0; i < TXG_SIZE; i++)
+ list_destroy(&md.md_blocks[i]);
+ return (err);
+}
+
+static boolean_t
+redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid)
+{
+ for (int i = 0; i < num_snaps; i++) {
+ if (snaps[i] == guid)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+int
+dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
+ const char *redactbook)
+{
+ int err = 0;
+ dsl_pool_t *dp = NULL;
+ dsl_dataset_t *ds = NULL;
+ int numsnaps = 0;
+ objset_t *os;
+ struct redact_thread_arg *args = NULL;
+ redaction_list_t *new_rl = NULL;
+ char *newredactbook;
+
+ if ((err = dsl_pool_hold(snapname, FTAG, &dp)) != 0)
+ return (err);
+
+ newredactbook = kmem_zalloc(sizeof (char) * ZFS_MAX_DATASET_NAME_LEN,
+ KM_SLEEP);
+
+ if ((err = dsl_dataset_hold_flags(dp, snapname, DS_HOLD_FLAG_DECRYPT,
+ FTAG, &ds)) != 0) {
+ goto out;
+ }
+ dsl_dataset_long_hold(ds, FTAG);
+ if (!ds->ds_is_snapshot || dmu_objset_from_ds(ds, &os) != 0) {
+ err = EINVAL;
+ goto out;
+ }
+ if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)) {
+ err = EALREADY;
+ goto out;
+ }
+
+ numsnaps = fnvlist_num_pairs(redactnvl);
+ if (numsnaps > 0)
+ args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);
+
+ nvpair_t *pair = NULL;
+ for (int i = 0; i < numsnaps; i++) {
+ pair = nvlist_next_nvpair(redactnvl, pair);
+ const char *name = nvpair_name(pair);
+ struct redact_thread_arg *rta = &args[i];
+ err = dsl_dataset_hold_flags(dp, name, DS_HOLD_FLAG_DECRYPT,
+ FTAG, &rta->ds);
+ if (err != 0)
+ break;
+ /*
+		 * We want to take the long hold before any other error can
+		 * occur, because the cleanup code will release the long
+		 * hold if rta->ds is filled in.
+ */
+ dsl_dataset_long_hold(rta->ds, FTAG);
+
+ err = dmu_objset_from_ds(rta->ds, &rta->os);
+ if (err != 0)
+ break;
+ if (!dsl_dataset_is_before(rta->ds, ds, 0)) {
+ err = EINVAL;
+ break;
+ }
+ if (dsl_dataset_feature_is_active(rta->ds,
+ SPA_FEATURE_REDACTED_DATASETS)) {
+ err = EALREADY;
+ break;
+ }
+ }
+ if (err != 0)
+ goto out;
+ VERIFY3P(nvlist_next_nvpair(redactnvl, pair), ==, NULL);
+
+ boolean_t resuming = B_FALSE;
+ zfs_bookmark_phys_t bookmark;
+
+ (void) strlcpy(newredactbook, snapname, ZFS_MAX_DATASET_NAME_LEN);
+ char *c = strchr(newredactbook, '@');
+ ASSERT3P(c, !=, NULL);
+ int n = snprintf(c, ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook),
+ "#%s", redactbook);
+ if (n >= ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook)) {
+ dsl_pool_rele(dp, FTAG);
+ kmem_free(newredactbook,
+ sizeof (char) * ZFS_MAX_DATASET_NAME_LEN);
+ if (args != NULL)
+ kmem_free(args, numsnaps * sizeof (*args));
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+ err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark);
+ if (err == 0) {
+ resuming = B_TRUE;
+ if (bookmark.zbm_redaction_obj == 0) {
+ err = EEXIST;
+ goto out;
+ }
+ err = dsl_redaction_list_hold_obj(dp,
+ bookmark.zbm_redaction_obj, FTAG, &new_rl);
+ if (err != 0) {
+ err = EIO;
+ goto out;
+ }
+ dsl_redaction_list_long_hold(dp, new_rl, FTAG);
+ if (new_rl->rl_phys->rlp_num_snaps != numsnaps) {
+ err = ESRCH;
+ goto out;
+ }
+ for (int i = 0; i < numsnaps; i++) {
+ struct redact_thread_arg *rta = &args[i];
+ if (!redact_snaps_contains(new_rl->rl_phys->rlp_snaps,
+ new_rl->rl_phys->rlp_num_snaps,
+ dsl_dataset_phys(rta->ds)->ds_guid)) {
+ err = ESRCH;
+ goto out;
+ }
+ }
+ if (new_rl->rl_phys->rlp_last_blkid == UINT64_MAX &&
+ new_rl->rl_phys->rlp_last_object == UINT64_MAX) {
+ err = EEXIST;
+ goto out;
+ }
+ dsl_pool_rele(dp, FTAG);
+ dp = NULL;
+ } else {
+ uint64_t *guids = NULL;
+ if (numsnaps > 0) {
+ guids = kmem_zalloc(numsnaps * sizeof (uint64_t),
+ KM_SLEEP);
+ }
+ for (int i = 0; i < numsnaps; i++) {
+ struct redact_thread_arg *rta = &args[i];
+ guids[i] = dsl_dataset_phys(rta->ds)->ds_guid;
+ }
+
+ dsl_pool_rele(dp, FTAG);
+ dp = NULL;
+ err = dsl_bookmark_create_redacted(newredactbook, snapname,
+ numsnaps, guids, FTAG, &new_rl);
+ kmem_free(guids, numsnaps * sizeof (uint64_t));
+ if (err != 0) {
+ goto out;
+ }
+ }
+
+ for (int i = 0; i < numsnaps; i++) {
+ struct redact_thread_arg *rta = &args[i];
+ (void) bqueue_init(&rta->q, zfs_redact_queue_ff,
+ zfs_redact_queue_length,
+ offsetof(struct redact_record, ln));
+ if (resuming) {
+ rta->resume.zb_blkid =
+ new_rl->rl_phys->rlp_last_blkid;
+ rta->resume.zb_object =
+ new_rl->rl_phys->rlp_last_object;
+ }
+ rta->txg = dsl_dataset_phys(ds)->ds_creation_txg;
+ (void) thread_create(NULL, 0, redact_traverse_thread, rta,
+ 0, curproc, TS_RUN, minclsyspri);
+ }
+
+ struct redact_merge_thread_arg *rmta;
+ rmta = kmem_zalloc(sizeof (struct redact_merge_thread_arg), KM_SLEEP);
+
+ (void) bqueue_init(&rmta->q, zfs_redact_queue_ff,
+ zfs_redact_queue_length, offsetof(struct redact_record, ln));
+ rmta->numsnaps = numsnaps;
+ rmta->spa = os->os_spa;
+ rmta->thr_args = args;
+ (void) thread_create(NULL, 0, redact_merge_thread, rmta, 0, curproc,
+ TS_RUN, minclsyspri);
+ err = perform_redaction(os, new_rl, rmta);
+ kmem_free(rmta, sizeof (struct redact_merge_thread_arg));
+
+out:
+ kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN);
+
+ if (new_rl != NULL) {
+ dsl_redaction_list_long_rele(new_rl, FTAG);
+ dsl_redaction_list_rele(new_rl, FTAG);
+ }
+ for (int i = 0; i < numsnaps; i++) {
+ struct redact_thread_arg *rta = &args[i];
+ /*
+ * rta->ds may be NULL if we got an error while filling
+ * it in.
+ */
+ if (rta->ds != NULL) {
+ dsl_dataset_long_rele(rta->ds, FTAG);
+ dsl_dataset_rele_flags(rta->ds,
+ DS_HOLD_FLAG_DECRYPT, FTAG);
+ }
+ }
+
+ if (args != NULL)
+ kmem_free(args, numsnaps * sizeof (*args));
+ if (dp != NULL)
+ dsl_pool_rele(dp, FTAG);
+ if (ds != NULL) {
+ dsl_dataset_long_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ }
+ return (SET_ERROR(err));
+}
diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c
new file mode 100644
index 000000000000..d654382237c0
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_send.c
@@ -0,0 +1,3094 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/spa_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
+#include <sys/avl.h>
+#include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_recv.h>
+#include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
+#include <sys/bqueue.h>
+#include <sys/zvol.h>
+#include <sys/policy.h>
+#include <sys/objlist.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
+int zfs_send_corrupt_data = B_FALSE;
+/*
+ * This tunable controls the amount of data (measured in bytes) that will be
+ * prefetched by zfs send. If the main thread is blocking on reads that haven't
+ * completed, this variable might need to be increased. If instead the main
+ * thread is issuing new reads because the prefetches have fallen out of the
+ * cache, this may need to be decreased.
+ */
+int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
+/*
+ * This tunable controls the length of the queues that zfs send worker threads
+ * use to communicate. If the send_main_thread is blocking on these queues,
+ * this variable may need to be increased. If there is a significant slowdown
+ * at the start of a send as these threads consume all the available IO
+ * resources, this variable may need to be decreased.
+ */
+int zfs_send_no_prefetch_queue_length = 1024 * 1024;
+/*
+ * These tunables control the fill fraction of the queues used by zfs send.
+ * The fill fraction controls the frequency with which threads have to be
+ * cv_signaled. If a lot of cpu time is being spent on cv_signal, then these
+ * should be tuned down. If the queues empty before the signaled thread can
+ * catch up, then these should be tuned up.
+ */
+int zfs_send_queue_ff = 20;
+int zfs_send_no_prefetch_queue_ff = 20;
+
+/*
+ * Use this to override the recordsize calculation for fast zfs send estimates.
+ */
+int zfs_override_estimate_recordsize = 0;
+
+/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
+int zfs_send_set_freerecords_bit = B_TRUE;
+
+/* Set this tunable to FALSE to disable sending unmodified spill blocks. */
+int zfs_send_unmodified_spill_blocks = B_TRUE;
+
+static inline boolean_t
+overflow_multiply(uint64_t a, uint64_t b, uint64_t *c)
+{
+ uint64_t temp = a * b;
+ if (b != 0 && temp / b != a)
+ return (B_FALSE);
+ *c = temp;
+ return (B_TRUE);
+}
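+
+/*
+ * A minimal usage sketch for overflow_multiply(); the example_* name and
+ * parameters are hypothetical. Callers must check the return value, since
+ * on overflow the product is not stored.
+ */
+static inline boolean_t
+example_blocks_to_bytes(uint64_t nblocks, uint64_t blksz, uint64_t *bytes)
+{
+	/* Fails (returns B_FALSE) instead of silently wrapping. */
+	return (overflow_multiply(nblocks, blksz, bytes));
+}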
+
+struct send_thread_arg {
+ bqueue_t q;
+ objset_t *os; /* Objset to traverse */
+ uint64_t fromtxg; /* Traverse from this txg */
+ int flags; /* flags to pass to traverse_dataset */
+ int error_code;
+ boolean_t cancel;
+ zbookmark_phys_t resume;
+ uint64_t *num_blocks_visited;
+};
+
+struct redact_list_thread_arg {
+ boolean_t cancel;
+ bqueue_t q;
+ zbookmark_phys_t resume;
+ redaction_list_t *rl;
+ boolean_t mark_redact;
+ int error_code;
+ uint64_t *num_blocks_visited;
+};
+
+struct send_merge_thread_arg {
+ bqueue_t q;
+ objset_t *os;
+ struct redact_list_thread_arg *from_arg;
+ struct send_thread_arg *to_arg;
+ struct redact_list_thread_arg *redact_arg;
+ int error;
+ boolean_t cancel;
+};
+
+struct send_range {
+ boolean_t eos_marker; /* Marks the end of the stream */
+ uint64_t object;
+ uint64_t start_blkid;
+ uint64_t end_blkid;
+ bqueue_node_t ln;
+ enum type {DATA, HOLE, OBJECT, OBJECT_RANGE, REDACT,
+ PREVIOUSLY_REDACTED} type;
+ union {
+ struct srd {
+ dmu_object_type_t obj_type;
+ uint32_t datablksz; // logical size
+ uint32_t datasz; // payload size
+ blkptr_t bp;
+ arc_buf_t *abuf;
+ abd_t *abd;
+ kmutex_t lock;
+ kcondvar_t cv;
+ boolean_t io_outstanding;
+ int io_err;
+ } data;
+ struct srh {
+ uint32_t datablksz;
+ } hole;
+ struct sro {
+ /*
+ * This is a pointer because embedding it in the
+ * struct causes these structures to be massively larger
+ * for all range types; this makes the code much less
+ * memory efficient.
+ */
+ dnode_phys_t *dnp;
+ blkptr_t bp;
+ } object;
+ struct srr {
+ uint32_t datablksz;
+ } redact;
+ struct sror {
+ blkptr_t bp;
+ } object_range;
+ } sru;
+};
+
+/*
+ * The list of data whose inclusion in a send stream can be pending from
+ * one call to backup_cb to another. Multiple calls to dump_free(),
+ * dump_freeobjects(), and dump_redact() can be aggregated into a single
+ * DRR_FREE, DRR_FREEOBJECTS, or DRR_REDACT replay record.
+ */
+typedef enum {
+ PENDING_NONE,
+ PENDING_FREE,
+ PENDING_FREEOBJECTS,
+ PENDING_REDACT
+} dmu_pendop_t;
+
+typedef struct dmu_send_cookie {
+ dmu_replay_record_t *dsc_drr;
+ dmu_send_outparams_t *dsc_dso;
+ offset_t *dsc_off;
+ objset_t *dsc_os;
+ zio_cksum_t dsc_zc;
+ uint64_t dsc_toguid;
+ uint64_t dsc_fromtxg;
+ int dsc_err;
+ dmu_pendop_t dsc_pending_op;
+ uint64_t dsc_featureflags;
+ uint64_t dsc_last_data_object;
+ uint64_t dsc_last_data_offset;
+ uint64_t dsc_resume_object;
+ uint64_t dsc_resume_offset;
+ boolean_t dsc_sent_begin;
+ boolean_t dsc_sent_end;
+} dmu_send_cookie_t;
+
+static int do_dump(dmu_send_cookie_t *dscp, struct send_range *range);
+
+static void
+range_free(struct send_range *range)
+{
+ if (range->type == OBJECT) {
+ size_t size = sizeof (dnode_phys_t) *
+ (range->sru.object.dnp->dn_extra_slots + 1);
+ kmem_free(range->sru.object.dnp, size);
+ } else if (range->type == DATA) {
+ mutex_enter(&range->sru.data.lock);
+ while (range->sru.data.io_outstanding)
+ cv_wait(&range->sru.data.cv, &range->sru.data.lock);
+ if (range->sru.data.abd != NULL)
+ abd_free(range->sru.data.abd);
+ if (range->sru.data.abuf != NULL) {
+ arc_buf_destroy(range->sru.data.abuf,
+ &range->sru.data.abuf);
+ }
+ mutex_exit(&range->sru.data.lock);
+
+ cv_destroy(&range->sru.data.cv);
+ mutex_destroy(&range->sru.data.lock);
+ }
+ kmem_free(range, sizeof (*range));
+}
+
+/*
+ * For all record types except BEGIN, fill in the checksum (overlaid in
+ * drr_u.drr_checksum.drr_checksum). The checksum verifies everything
+ * up to the start of the checksum itself.
+ */
+static int
+dump_record(dmu_send_cookie_t *dscp, void *payload, int payload_len)
+{
+ dmu_send_outparams_t *dso = dscp->dsc_dso;
+ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+ (void) fletcher_4_incremental_native(dscp->dsc_drr,
+ offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ &dscp->dsc_zc);
+ if (dscp->dsc_drr->drr_type == DRR_BEGIN) {
+ dscp->dsc_sent_begin = B_TRUE;
+ } else {
+ ASSERT(ZIO_CHECKSUM_IS_ZERO(&dscp->dsc_drr->drr_u.
+ drr_checksum.drr_checksum));
+ dscp->dsc_drr->drr_u.drr_checksum.drr_checksum = dscp->dsc_zc;
+ }
+ if (dscp->dsc_drr->drr_type == DRR_END) {
+ dscp->dsc_sent_end = B_TRUE;
+ }
+ (void) fletcher_4_incremental_native(&dscp->dsc_drr->
+ drr_u.drr_checksum.drr_checksum,
+ sizeof (zio_cksum_t), &dscp->dsc_zc);
+ *dscp->dsc_off += sizeof (dmu_replay_record_t);
+ dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, dscp->dsc_drr,
+ sizeof (dmu_replay_record_t), dso->dso_arg);
+ if (dscp->dsc_err != 0)
+ return (SET_ERROR(EINTR));
+ if (payload_len != 0) {
+ *dscp->dsc_off += payload_len;
+ /*
+ * payload is null when dso_dryrun == B_TRUE (i.e. when we're
+ * doing a send size calculation)
+ */
+ if (payload != NULL) {
+ (void) fletcher_4_incremental_native(
+ payload, payload_len, &dscp->dsc_zc);
+ }
+
+ /*
+ * The code does not rely on this (len being a multiple of 8).
+ * We keep this assertion because of the corresponding assertion
+ * in receive_read(). Keeping this assertion ensures that we do
+ * not inadvertently break backwards compatibility (causing the
+ * assertion in receive_read() to trigger on old software).
+ *
+ * Raw sends cannot be received on old software, and so can
+ * bypass this assertion.
+ */
+
+ ASSERT((payload_len % 8 == 0) ||
+ (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW));
+
+ dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, payload,
+ payload_len, dso->dso_arg);
+ if (dscp->dsc_err != 0)
+ return (SET_ERROR(EINTR));
+ }
+ return (0);
+}
+
+/*
+ * Fill in the drr_free struct, or perform aggregation if the previous record is
+ * also a free record, and the two are adjacent.
+ *
+ * Note that we send free records even for a full send, because we want to be
+ * able to receive a full send as a clone, which requires a list of all the free
+ * and freeobject records that were generated on the source.
+ */
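+/*
+ * For example, if the pending FREE record covers (object 5, offset 0,
+ * length 131072) and the next call frees (object 5, offset 131072,
+ * length 131072), the pending record is simply extended to length 262144.
+ * A free of a different object, or at a non-adjacent offset, pushes the
+ * pending record out to the stream first.
+ */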
+static int
+dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
+ uint64_t length)
+{
+ struct drr_free *drrf = &(dscp->dsc_drr->drr_u.drr_free);
+
+ /*
+ * When we receive a free record, dbuf_free_range() assumes
+ * that the receiving system doesn't have any dbufs in the range
+ * being freed. This is always true because there is a one-record
+ * constraint: we only send one WRITE record for any given
+ * object,offset. We know that the one-record constraint is
+ * true because we always send data in increasing order by
+ * object,offset.
+ *
+ * If the increasing-order constraint ever changes, we should find
+ * another way to assert that the one-record constraint is still
+ * satisfied.
+ */
+ ASSERT(object > dscp->dsc_last_data_object ||
+ (object == dscp->dsc_last_data_object &&
+ offset > dscp->dsc_last_data_offset));
+
+ /*
+ * If there is a pending op, but it's not PENDING_FREE, push it out,
+ * since free block aggregation can only be done for blocks of the
+ * same type (i.e., DRR_FREE records can only be aggregated with
+ * other DRR_FREE records. DRR_FREEOBJECTS records can only be
+ * aggregated with other DRR_FREEOBJECTS records).
+ */
+ if (dscp->dsc_pending_op != PENDING_NONE &&
+ dscp->dsc_pending_op != PENDING_FREE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ if (dscp->dsc_pending_op == PENDING_FREE) {
+ /*
+ * Check to see whether this free block can be aggregated
+ * with the pending one.
+ */
+ if (drrf->drr_object == object && drrf->drr_offset +
+ drrf->drr_length == offset) {
+ if (offset + length < offset || length == UINT64_MAX)
+ drrf->drr_length = UINT64_MAX;
+ else
+ drrf->drr_length += length;
+ return (0);
+ } else {
+ /* not a continuation. Push out pending record */
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+ }
+ /* create a FREE record and make it pending */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_FREE;
+ drrf->drr_object = object;
+ drrf->drr_offset = offset;
+ if (offset + length < offset)
+ drrf->drr_length = DMU_OBJECT_END;
+ else
+ drrf->drr_length = length;
+ drrf->drr_toguid = dscp->dsc_toguid;
+ if (length == DMU_OBJECT_END) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ } else {
+ dscp->dsc_pending_op = PENDING_FREE;
+ }
+
+ return (0);
+}
+
+/*
+ * Fill in the drr_redact struct, or perform aggregation if the previous record
+ * is also a redaction record, and the two are adjacent.
+ */
+static int
+dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
+ uint64_t length)
+{
+ struct drr_redact *drrr = &dscp->dsc_drr->drr_u.drr_redact;
+
+ /*
+ * If there is a pending op, but it's not PENDING_REDACT, push it out,
+ * since free block aggregation can only be done for blocks of the
+ * same type (i.e., DRR_REDACT records can only be aggregated with
+ * other DRR_REDACT records).
+ */
+ if (dscp->dsc_pending_op != PENDING_NONE &&
+ dscp->dsc_pending_op != PENDING_REDACT) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ if (dscp->dsc_pending_op == PENDING_REDACT) {
+ /*
+ * Check to see whether this redacted block can be aggregated
+ * with the pending one.
+ */
+ if (drrr->drr_object == object && drrr->drr_offset +
+ drrr->drr_length == offset) {
+ drrr->drr_length += length;
+ return (0);
+ } else {
+ /* not a continuation. Push out pending record */
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+ }
+ /* create a REDACT record and make it pending */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_REDACT;
+ drrr->drr_object = object;
+ drrr->drr_offset = offset;
+ drrr->drr_length = length;
+ drrr->drr_toguid = dscp->dsc_toguid;
+ dscp->dsc_pending_op = PENDING_REDACT;
+
+ return (0);
+}
+
+static int
+dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object,
+ uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data)
+{
+ uint64_t payload_size;
+ boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW);
+ struct drr_write *drrw = &(dscp->dsc_drr->drr_u.drr_write);
+
+ /*
+ * We send data in increasing object, offset order.
+ * See comment in dump_free() for details.
+ */
+ ASSERT(object > dscp->dsc_last_data_object ||
+ (object == dscp->dsc_last_data_object &&
+ offset > dscp->dsc_last_data_offset));
+ dscp->dsc_last_data_object = object;
+ dscp->dsc_last_data_offset = offset + lsize - 1;
+
+ /*
+ * If there is any kind of pending aggregation (currently either
+ * a grouping of free objects or free blocks), push it out to
+ * the stream, since aggregation can't be done across operations
+ * of different types.
+ */
+ if (dscp->dsc_pending_op != PENDING_NONE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+ /* write a WRITE record */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_WRITE;
+ drrw->drr_object = object;
+ drrw->drr_type = type;
+ drrw->drr_offset = offset;
+ drrw->drr_toguid = dscp->dsc_toguid;
+ drrw->drr_logical_size = lsize;
+
+ /* only set the compression fields if the buf is compressed or raw */
+ if (raw || lsize != psize) {
+ ASSERT(raw || dscp->dsc_featureflags &
+ DMU_BACKUP_FEATURE_COMPRESSED);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT3S(psize, >, 0);
+
+ if (raw) {
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ /*
+ * This is a raw protected block so we need to pass
+ * along everything the receiving side will need to
+ * interpret this block, including the byteswap, salt,
+ * IV, and MAC.
+ */
+ if (BP_SHOULD_BYTESWAP(bp))
+ drrw->drr_flags |= DRR_RAW_BYTESWAP;
+ zio_crypt_decode_params_bp(bp, drrw->drr_salt,
+ drrw->drr_iv);
+ zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
+ } else {
+ /* this is a compressed block */
+ ASSERT(dscp->dsc_featureflags &
+ DMU_BACKUP_FEATURE_COMPRESSED);
+ ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
+ ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
+ ASSERT3S(lsize, >=, psize);
+ }
+
+ /* set fields common to compressed and raw sends */
+ drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
+ drrw->drr_compressed_size = psize;
+ payload_size = drrw->drr_compressed_size;
+ } else {
+ payload_size = drrw->drr_logical_size;
+ }
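+ /*
+ * For example, a 128K block stored compressed (say, with LZ4) in 16K on
+ * disk is sent with drr_logical_size == 131072, drr_compressed_size ==
+ * 16384, and a 16384-byte payload; otherwise payload_size is the full
+ * 131072-byte logical size.
+ */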
+
+ if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
+ /*
+ * There's no pre-computed checksum for partial-block writes,
+ * embedded BP's, or encrypted BP's that are being sent as
+ * plaintext, so (like fletcher4-checksummed blocks) userland
+ * will have to compute a dedup-capable checksum itself.
+ */
+ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
+ } else {
+ drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+ if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP)
+ drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
+ DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+ DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
+ drrw->drr_key.ddk_cksum = bp->blk_cksum;
+ }
+
+ if (dump_record(dscp, data, payload_size) != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
+
+static int
+dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
+ int blksz, const blkptr_t *bp)
+{
+ char buf[BPE_PAYLOAD_SIZE];
+ struct drr_write_embedded *drrw =
+ &(dscp->dsc_drr->drr_u.drr_write_embedded);
+
+ if (dscp->dsc_pending_op != PENDING_NONE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED;
+ drrw->drr_object = object;
+ drrw->drr_offset = offset;
+ drrw->drr_length = blksz;
+ drrw->drr_toguid = dscp->dsc_toguid;
+ drrw->drr_compression = BP_GET_COMPRESS(bp);
+ drrw->drr_etype = BPE_GET_ETYPE(bp);
+ drrw->drr_lsize = BPE_GET_LSIZE(bp);
+ drrw->drr_psize = BPE_GET_PSIZE(bp);
+
+ decode_embedded_bp_compressed(bp, buf);
+
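+ /*
+ * The payload is rounded up to an 8-byte boundary (e.g. a 100-byte
+ * embedded payload is sent as 104 bytes) to satisfy the alignment
+ * assertion in dump_record().
+ */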
+ if (dump_record(dscp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
+
+static int
+dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
+ void *data)
+{
+ struct drr_spill *drrs = &(dscp->dsc_drr->drr_u.drr_spill);
+ uint64_t blksz = BP_GET_LSIZE(bp);
+ uint64_t payload_size = blksz;
+
+ if (dscp->dsc_pending_op != PENDING_NONE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ /* write a SPILL record */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_SPILL;
+ drrs->drr_object = object;
+ drrs->drr_length = blksz;
+ drrs->drr_toguid = dscp->dsc_toguid;
+
+ /* See comment in dump_dnode() for full details */
+ if (zfs_send_unmodified_spill_blocks &&
+ (bp->blk_birth <= dscp->dsc_fromtxg)) {
+ drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
+ }
+
+ /* handle raw send fields */
+ if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ if (BP_SHOULD_BYTESWAP(bp))
+ drrs->drr_flags |= DRR_RAW_BYTESWAP;
+ drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
+ drrs->drr_compressed_size = BP_GET_PSIZE(bp);
+ zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
+ zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
+ payload_size = drrs->drr_compressed_size;
+ }
+
+ if (dump_record(dscp, data, payload_size) != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
+
+static int
+dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs)
+{
+ struct drr_freeobjects *drrfo = &(dscp->dsc_drr->drr_u.drr_freeobjects);
+ uint64_t maxobj = DNODES_PER_BLOCK *
+ (DMU_META_DNODE(dscp->dsc_os)->dn_maxblkid + 1);
+
+ /*
+ * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
+ * leading to zfs recv never completing. To avoid this issue, don't
+ * send FREEOBJECTS records for object IDs which cannot exist on the
+ * receiving side.
+ */
+ if (maxobj > 0) {
+ if (maxobj <= firstobj)
+ return (0);
+
+ if (maxobj < firstobj + numobjs)
+ numobjs = maxobj - firstobj;
+ }
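+ /*
+ * For example, if maxobj is 1024 and we are asked to free objects
+ * [1000, 2000), the record is clamped to 24 objects; a request that
+ * starts at or beyond object 1024 is dropped entirely.
+ */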
+
+ /*
+ * If there is a pending op, but it's not PENDING_FREEOBJECTS,
+ * push it out, since free block aggregation can only be done for
+ * blocks of the same type (i.e., DRR_FREE records can only be
+ * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records
+ * can only be aggregated with other DRR_FREEOBJECTS records).
+ */
+ if (dscp->dsc_pending_op != PENDING_NONE &&
+ dscp->dsc_pending_op != PENDING_FREEOBJECTS) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ if (dscp->dsc_pending_op == PENDING_FREEOBJECTS) {
+ /*
+ * See whether this free object array can be aggregated
+ * with the pending one
+ */
+ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
+ drrfo->drr_numobjs += numobjs;
+ return (0);
+ } else {
+ /* can't be aggregated. Push out pending record */
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+ }
+
+ /* write a FREEOBJECTS record */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_FREEOBJECTS;
+ drrfo->drr_firstobj = firstobj;
+ drrfo->drr_numobjs = numobjs;
+ drrfo->drr_toguid = dscp->dsc_toguid;
+
+ dscp->dsc_pending_op = PENDING_FREEOBJECTS;
+
+ return (0);
+}
+
+static int
+dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
+ dnode_phys_t *dnp)
+{
+ struct drr_object *drro = &(dscp->dsc_drr->drr_u.drr_object);
+ int bonuslen;
+
+ if (object < dscp->dsc_resume_object) {
+ /*
+ * Note: when resuming, we will visit all the dnodes in
+ * the block of dnodes that we are resuming from. In
+ * this case it's unnecessary to send the dnodes prior to
+ * the one we are resuming from. We should be at most one
+ * block's worth of dnodes behind the resume point.
+ */
+ ASSERT3U(dscp->dsc_resume_object - object, <,
+ 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
+ return (0);
+ }
+
+ if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
+ return (dump_freeobjects(dscp, object, 1));
+
+ if (dscp->dsc_pending_op != PENDING_NONE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ /* write an OBJECT record */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_OBJECT;
+ drro->drr_object = object;
+ drro->drr_type = dnp->dn_type;
+ drro->drr_bonustype = dnp->dn_bonustype;
+ drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ drro->drr_bonuslen = dnp->dn_bonuslen;
+ drro->drr_dn_slots = dnp->dn_extra_slots + 1;
+ drro->drr_checksumtype = dnp->dn_checksum;
+ drro->drr_compress = dnp->dn_compress;
+ drro->drr_toguid = dscp->dsc_toguid;
+
+ if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+ drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
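+ /*
+ * For example, without DMU_BACKUP_FEATURE_LARGE_BLOCKS a dnode with 1M
+ * data blocks is advertised with a 128K (SPA_OLD_MAXBLOCKSIZE) block
+ * size here, and do_dump() splits each of its blocks into 128K WRITE
+ * records to match.
+ */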
+
+ bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);
+
+ if ((dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) {
+ ASSERT(BP_IS_ENCRYPTED(bp));
+
+ if (BP_SHOULD_BYTESWAP(bp))
+ drro->drr_flags |= DRR_RAW_BYTESWAP;
+
+ /* needed for reconstructing dnp on recv side */
+ drro->drr_maxblkid = dnp->dn_maxblkid;
+ drro->drr_indblkshift = dnp->dn_indblkshift;
+ drro->drr_nlevels = dnp->dn_nlevels;
+ drro->drr_nblkptr = dnp->dn_nblkptr;
+
+ /*
+ * Since we encrypt the entire bonus area, the (raw) part
+ * beyond the bonuslen is actually nonzero, so we need
+ * to send it.
+ */
+ if (bonuslen != 0) {
+ drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
+ bonuslen = drro->drr_raw_bonuslen;
+ }
+ }
+
+ /*
+ * DRR_OBJECT_SPILL is set for every dnode which references a
+ * spill block. This allows the receiving pool to definitively
+ * determine when a spill block should be kept or freed.
+ */
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ drro->drr_flags |= DRR_OBJECT_SPILL;
+
+ if (dump_record(dscp, DN_BONUS(dnp), bonuslen) != 0)
+ return (SET_ERROR(EINTR));
+
+ /* Free anything past the end of the file. */
+ if (dump_free(dscp, object, (dnp->dn_maxblkid + 1) *
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
+ return (SET_ERROR(EINTR));
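+ /*
+ * For example, a dnode with 128K data blocks and dn_maxblkid == 9
+ * generates a FREE record from offset 10 * 131072 == 1310720 to
+ * DMU_OBJECT_END.
+ */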
+
+ /*
+ * Send DRR_SPILL records for unmodified spill blocks. This is useful
+ * because changing certain attributes of the object (e.g. blocksize)
+ * can cause old versions of ZFS to incorrectly remove a spill block.
+ * Including these records in the stream forces an up to date version
+ * to always be written ensuring they're never lost. Current versions
+ * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
+ * ignore these unmodified spill blocks.
+ */
+ if (zfs_send_unmodified_spill_blocks &&
+ (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
+ (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) {
+ struct send_range record;
+ blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
+
+ bzero(&record, sizeof (struct send_range));
+ record.type = DATA;
+ record.object = object;
+ record.eos_marker = B_FALSE;
+ record.start_blkid = DMU_SPILL_BLKID;
+ record.end_blkid = record.start_blkid + 1;
+ record.sru.data.bp = *bp;
+ record.sru.data.obj_type = dnp->dn_type;
+ record.sru.data.datablksz = BP_GET_LSIZE(bp);
+
+ if (do_dump(dscp, &record) != 0)
+ return (SET_ERROR(EINTR));
+ }
+
+ if (dscp->dsc_err != 0)
+ return (SET_ERROR(EINTR));
+
+ return (0);
+}
+
+static int
+dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp,
+ uint64_t firstobj, uint64_t numslots)
+{
+ struct drr_object_range *drror =
+ &(dscp->dsc_drr->drr_u.drr_object_range);
+
+ /* we only use this record type for raw sends */
+ ASSERT(BP_IS_PROTECTED(bp));
+ ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW);
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
+ ASSERT0(BP_GET_LEVEL(bp));
+
+ if (dscp->dsc_pending_op != PENDING_NONE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE;
+ drror->drr_firstobj = firstobj;
+ drror->drr_numslots = numslots;
+ drror->drr_toguid = dscp->dsc_toguid;
+ if (BP_SHOULD_BYTESWAP(bp))
+ drror->drr_flags |= DRR_RAW_BYTESWAP;
+ zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
+ zio_crypt_decode_mac_bp(bp, drror->drr_mac);
+
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
+
+static boolean_t
+send_do_embed(const blkptr_t *bp, uint64_t featureflags)
+{
+ if (!BP_IS_EMBEDDED(bp))
+ return (B_FALSE);
+
+ /*
+ * Compression function must be legacy, or explicitly enabled.
+ */
+ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
+ !(featureflags & DMU_BACKUP_FEATURE_LZ4)))
+ return (B_FALSE);
+
+ /*
+ * If we have not set the ZSTD feature flag, we can't send ZSTD
+ * compressed embedded blocks, as the receiver may not support them.
+ */
+ if ((BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD &&
+ !(featureflags & DMU_BACKUP_FEATURE_ZSTD)))
+ return (B_FALSE);
+
+ /*
+ * Embed type must be explicitly enabled.
+ */
+ switch (BPE_GET_ETYPE(bp)) {
+ case BP_EMBEDDED_TYPE_DATA:
+ if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+ return (B_TRUE);
+ break;
+ default:
+ return (B_FALSE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * This function actually handles figuring out what kind of record needs to be
+ * dumped, and calling the appropriate helper function. In most cases,
+ * the data has already been read by send_reader_thread().
+ */
+static int
+do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
+{
+ int err = 0;
+ switch (range->type) {
+ case OBJECT:
+ err = dump_dnode(dscp, &range->sru.object.bp, range->object,
+ range->sru.object.dnp);
+ return (err);
+ case OBJECT_RANGE: {
+ ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
+ if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) {
+ return (0);
+ }
+ uint64_t epb = BP_GET_LSIZE(&range->sru.object_range.bp) >>
+ DNODE_SHIFT;
+ uint64_t firstobj = range->start_blkid * epb;
+ err = dump_object_range(dscp, &range->sru.object_range.bp,
+ firstobj, epb);
+ break;
+ }
+ case REDACT: {
+ struct srr *srrp = &range->sru.redact;
+ err = dump_redact(dscp, range->object, range->start_blkid *
+ srrp->datablksz, (range->end_blkid - range->start_blkid) *
+ srrp->datablksz);
+ return (err);
+ }
+ case DATA: {
+ struct srd *srdp = &range->sru.data;
+ blkptr_t *bp = &srdp->bp;
+ spa_t *spa =
+ dmu_objset_spa(dscp->dsc_os);
+
+ ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp));
+ ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
+ if (BP_GET_TYPE(bp) == DMU_OT_SA) {
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
+
+ if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
+ ASSERT(BP_IS_PROTECTED(bp));
+ zioflags |= ZIO_FLAG_RAW;
+ }
+
+ zbookmark_phys_t zb;
+ ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
+ zb.zb_objset = dmu_objset_id(dscp->dsc_os);
+ zb.zb_object = range->object;
+ zb.zb_level = 0;
+ zb.zb_blkid = range->start_blkid;
+
+ arc_buf_t *abuf = NULL;
+ if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa,
+ bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ zioflags, &aflags, &zb) != 0)
+ return (SET_ERROR(EIO));
+
+ err = dump_spill(dscp, bp, zb.zb_object,
+ (abuf == NULL ? NULL : abuf->b_data));
+ if (abuf != NULL)
+ arc_buf_destroy(abuf, &abuf);
+ return (err);
+ }
+ if (send_do_embed(bp, dscp->dsc_featureflags)) {
+ err = dump_write_embedded(dscp, range->object,
+ range->start_blkid * srdp->datablksz,
+ srdp->datablksz, bp);
+ return (err);
+ }
+ ASSERT(range->object > dscp->dsc_resume_object ||
+ (range->object == dscp->dsc_resume_object &&
+ range->start_blkid * srdp->datablksz >=
+ dscp->dsc_resume_offset));
+ /* it's a level-0 block of a regular object */
+
+ mutex_enter(&srdp->lock);
+ while (srdp->io_outstanding)
+ cv_wait(&srdp->cv, &srdp->lock);
+ err = srdp->io_err;
+ mutex_exit(&srdp->lock);
+
+ if (err != 0) {
+ if (zfs_send_corrupt_data &&
+ !dscp->dsc_dso->dso_dryrun) {
+ /*
+ * Send a block filled with 0x"zfs badd bloc"
+ */
+ srdp->abuf = arc_alloc_buf(spa, &srdp->abuf,
+ ARC_BUFC_DATA, srdp->datablksz);
+ uint64_t *ptr;
+ for (ptr = srdp->abuf->b_data;
+ (char *)ptr < (char *)srdp->abuf->b_data +
+ srdp->datablksz; ptr++)
+ *ptr = 0x2f5baddb10cULL;
+ } else {
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ ASSERT(dscp->dsc_dso->dso_dryrun ||
+ srdp->abuf != NULL || srdp->abd != NULL);
+
+ uint64_t offset = range->start_blkid * srdp->datablksz;
+
+ char *data = NULL;
+ if (srdp->abd != NULL) {
+ data = abd_to_buf(srdp->abd);
+ ASSERT3P(srdp->abuf, ==, NULL);
+ } else if (srdp->abuf != NULL) {
+ data = srdp->abuf->b_data;
+ }
+
+ /*
+ * If we have large blocks stored on disk but the send flags
+ * don't allow us to send large blocks, we split the data from
+ * the arc buf into chunks.
+ */
+ if (srdp->datablksz > SPA_OLD_MAXBLOCKSIZE &&
+ !(dscp->dsc_featureflags &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS)) {
+ while (srdp->datablksz > 0 && err == 0) {
+ int n = MIN(srdp->datablksz,
+ SPA_OLD_MAXBLOCKSIZE);
+ err = dmu_dump_write(dscp, srdp->obj_type,
+ range->object, offset, n, n, NULL, data);
+ offset += n;
+ /*
+ * When doing dry run, data==NULL is used as a
+ * sentinel value by
+ * dmu_dump_write()->dump_record().
+ */
+ if (data != NULL)
+ data += n;
+ srdp->datablksz -= n;
+ }
+ } else {
+ err = dmu_dump_write(dscp, srdp->obj_type,
+ range->object, offset,
+ srdp->datablksz, srdp->datasz, bp, data);
+ }
+ return (err);
+ }
+ case HOLE: {
+ struct srh *srhp = &range->sru.hole;
+ if (range->object == DMU_META_DNODE_OBJECT) {
+ uint32_t span = srhp->datablksz >> DNODE_SHIFT;
+ uint64_t first_obj = range->start_blkid * span;
+ uint64_t numobj = range->end_blkid * span - first_obj;
+ return (dump_freeobjects(dscp, first_obj, numobj));
+ }
+ uint64_t offset = 0;
+
+ /*
+ * If this multiply overflows, we don't need to send this block.
+ * Even if it has a birth time, it can never be anything but a hole,
+ * so we don't need to send records for it.
+ */
+ if (!overflow_multiply(range->start_blkid, srhp->datablksz,
+ &offset)) {
+ return (0);
+ }
+ uint64_t len = 0;
+
+ if (!overflow_multiply(range->end_blkid, srhp->datablksz, &len))
+ len = UINT64_MAX;
+ len = len - offset;
+ return (dump_free(dscp, range->object, offset, len));
+ }
+ default:
+ panic("Invalid range type in do_dump: %d", range->type);
+ }
+ return (err);
+}
+
+static struct send_range *
+range_alloc(enum type type, uint64_t object, uint64_t start_blkid,
+ uint64_t end_blkid, boolean_t eos)
+{
+ struct send_range *range = kmem_alloc(sizeof (*range), KM_SLEEP);
+ range->type = type;
+ range->object = object;
+ range->start_blkid = start_blkid;
+ range->end_blkid = end_blkid;
+ range->eos_marker = eos;
+ if (type == DATA) {
+ range->sru.data.abd = NULL;
+ range->sru.data.abuf = NULL;
+ mutex_init(&range->sru.data.lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&range->sru.data.cv, NULL, CV_DEFAULT, NULL);
+ range->sru.data.io_outstanding = 0;
+ range->sru.data.io_err = 0;
+ }
+ return (range);
+}
+
+/*
+ * This is the callback function to traverse_dataset that acts as a worker
+ * thread for dmu_send_impl.
+ */
+/*ARGSUSED*/
+static int
+send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+ struct send_thread_arg *sta = arg;
+ struct send_range *record;
+
+ ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+ zb->zb_object >= sta->resume.zb_object);
+
+ /*
+ * All bps of an encrypted os should have the encryption bit set.
+ * If this is not true it indicates tampering and we report an error.
+ */
+ if (sta->os->os_encrypted &&
+ !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
+ spa_log_error(spa, zb);
+ zfs_panic_recover("unencrypted block in encrypted "
+ "object set %llu", dmu_objset_id(sta->os));
+ return (SET_ERROR(EIO));
+ }
+
+ if (sta->cancel)
+ return (SET_ERROR(EINTR));
+ if (zb->zb_object != DMU_META_DNODE_OBJECT &&
+ DMU_OBJECT_IS_SPECIAL(zb->zb_object))
+ return (0);
+ atomic_inc_64(sta->num_blocks_visited);
+
+ if (zb->zb_level == ZB_DNODE_LEVEL) {
+ if (zb->zb_object == DMU_META_DNODE_OBJECT)
+ return (0);
+ record = range_alloc(OBJECT, zb->zb_object, 0, 0, B_FALSE);
+ record->sru.object.bp = *bp;
+ size_t size = sizeof (*dnp) * (dnp->dn_extra_slots + 1);
+ record->sru.object.dnp = kmem_alloc(size, KM_SLEEP);
+ bcopy(dnp, record->sru.object.dnp, size);
+ bqueue_enqueue(&sta->q, record, sizeof (*record));
+ return (0);
+ }
+ if (zb->zb_level == 0 && zb->zb_object == DMU_META_DNODE_OBJECT &&
+ !BP_IS_HOLE(bp)) {
+ record = range_alloc(OBJECT_RANGE, 0, zb->zb_blkid,
+ zb->zb_blkid + 1, B_FALSE);
+ record->sru.object_range.bp = *bp;
+ bqueue_enqueue(&sta->q, record, sizeof (*record));
+ return (0);
+ }
+ if (zb->zb_level < 0 || (zb->zb_level > 0 && !BP_IS_HOLE(bp)))
+ return (0);
+ if (zb->zb_object == DMU_META_DNODE_OBJECT && !BP_IS_HOLE(bp))
+ return (0);
+
+ uint64_t span = bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level);
+ uint64_t start;
+
+ /*
+ * If this multiply overflows, we don't need to send this block.
+ * Even if it has a birth time, it can never be anything but a hole,
+ * so we don't need to send records for it.
+ */
+ if (!overflow_multiply(span, zb->zb_blkid, &start) || (!(zb->zb_blkid ==
+ DMU_SPILL_BLKID || DMU_OT_IS_METADATA(dnp->dn_type)) &&
+ span * zb->zb_blkid > dnp->dn_maxblkid)) {
+ ASSERT(BP_IS_HOLE(bp));
+ return (0);
+ }
+
+ if (zb->zb_blkid == DMU_SPILL_BLKID)
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA);
+
+ enum type record_type = DATA;
+ if (BP_IS_HOLE(bp))
+ record_type = HOLE;
+ else if (BP_IS_REDACTED(bp))
+ record_type = REDACT;
+ else
+ record_type = DATA;
+
+ record = range_alloc(record_type, zb->zb_object, start,
+ (start + span < start ? 0 : start + span), B_FALSE);
+
+ uint64_t datablksz = (zb->zb_blkid == DMU_SPILL_BLKID ?
+ BP_GET_LSIZE(bp) : dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+
+ if (BP_IS_HOLE(bp)) {
+ record->sru.hole.datablksz = datablksz;
+ } else if (BP_IS_REDACTED(bp)) {
+ record->sru.redact.datablksz = datablksz;
+ } else {
+ record->sru.data.datablksz = datablksz;
+ record->sru.data.obj_type = dnp->dn_type;
+ record->sru.data.bp = *bp;
+ }
+
+ bqueue_enqueue(&sta->q, record, sizeof (*record));
+ return (0);
+}
+
+struct redact_list_cb_arg {
+ uint64_t *num_blocks_visited;
+ bqueue_t *q;
+ boolean_t *cancel;
+ boolean_t mark_redact;
+};
+
+static int
+redact_list_cb(redact_block_phys_t *rb, void *arg)
+{
+ struct redact_list_cb_arg *rlcap = arg;
+
+ atomic_inc_64(rlcap->num_blocks_visited);
+ if (*rlcap->cancel)
+ return (-1);
+
+ struct send_range *data = range_alloc(REDACT, rb->rbp_object,
+ rb->rbp_blkid, rb->rbp_blkid + redact_block_get_count(rb), B_FALSE);
+ ASSERT3U(data->end_blkid, >, rb->rbp_blkid);
+ if (rlcap->mark_redact) {
+ data->type = REDACT;
+ data->sru.redact.datablksz = redact_block_get_size(rb);
+ } else {
+ data->type = PREVIOUSLY_REDACTED;
+ }
+ bqueue_enqueue(rlcap->q, data, sizeof (*data));
+
+ return (0);
+}
+
+/*
+ * This function kicks off the traverse_dataset. It also handles setting the
+ * error code of the thread in case something goes wrong, and pushes the End of
+ * Stream record when the traverse_dataset call has finished.
+ */
+static void
+send_traverse_thread(void *arg)
+{
+ struct send_thread_arg *st_arg = arg;
+ int err = 0;
+ struct send_range *data;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ err = traverse_dataset_resume(st_arg->os->os_dsl_dataset,
+ st_arg->fromtxg, &st_arg->resume,
+ st_arg->flags, send_cb, st_arg);
+
+ if (err != EINTR)
+ st_arg->error_code = err;
+ data = range_alloc(DATA, 0, 0, 0, B_TRUE);
+ bqueue_enqueue_flush(&st_arg->q, data, sizeof (*data));
+ spl_fstrans_unmark(cookie);
+ thread_exit();
+}
+
+/*
+ * Utility function that causes End of Stream records to compare after all
+ * others, so that other threads' comparison logic can stay simple.
+ */
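+/*
+ * For example, with the default 16K dnode blocks and 512-byte dnodes
+ * (DNODES_PER_BLOCK_SHIFT is 5), a meta-dnode range covering blocks [3, 4)
+ * maps to object numbers [96, 128), so it compares after a range for
+ * object 90 and before one for object 130.
+ */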
+static int __attribute__((unused))
+send_range_after(const struct send_range *from, const struct send_range *to)
+{
+ if (from->eos_marker == B_TRUE)
+ return (1);
+ if (to->eos_marker == B_TRUE)
+ return (-1);
+
+ uint64_t from_obj = from->object;
+ uint64_t from_end_obj = from->object + 1;
+ uint64_t to_obj = to->object;
+ uint64_t to_end_obj = to->object + 1;
+ if (from_obj == 0) {
+ ASSERT(from->type == HOLE || from->type == OBJECT_RANGE);
+ from_obj = from->start_blkid << DNODES_PER_BLOCK_SHIFT;
+ from_end_obj = from->end_blkid << DNODES_PER_BLOCK_SHIFT;
+ }
+ if (to_obj == 0) {
+ ASSERT(to->type == HOLE || to->type == OBJECT_RANGE);
+ to_obj = to->start_blkid << DNODES_PER_BLOCK_SHIFT;
+ to_end_obj = to->end_blkid << DNODES_PER_BLOCK_SHIFT;
+ }
+
+ if (from_end_obj <= to_obj)
+ return (-1);
+ if (from_obj >= to_end_obj)
+ return (1);
+ int64_t cmp = TREE_CMP(to->type == OBJECT_RANGE, from->type ==
+ OBJECT_RANGE);
+ if (unlikely(cmp))
+ return (cmp);
+ cmp = TREE_CMP(to->type == OBJECT, from->type == OBJECT);
+ if (unlikely(cmp))
+ return (cmp);
+ if (from->end_blkid <= to->start_blkid)
+ return (-1);
+ if (from->start_blkid >= to->end_blkid)
+ return (1);
+ return (0);
+}
+
+/*
+ * Pop the new data off the queue, check that the records we receive are in
+ * the right order, but do not free the old data. This is used so that the
+ * records can be sent on to the main thread without copying the data.
+ */
+static struct send_range *
+get_next_range_nofree(bqueue_t *bq, struct send_range *prev)
+{
+ struct send_range *next = bqueue_dequeue(bq);
+ ASSERT3S(send_range_after(prev, next), ==, -1);
+ return (next);
+}
+
+/*
+ * Pop the new data off the queue, check that the records we receive are in
+ * the right order, and free the old data.
+ */
+static struct send_range *
+get_next_range(bqueue_t *bq, struct send_range *prev)
+{
+ struct send_range *next = get_next_range_nofree(bq, prev);
+ range_free(prev);
+ return (next);
+}
+
+static void
+redact_list_thread(void *arg)
+{
+ struct redact_list_thread_arg *rlt_arg = arg;
+ struct send_range *record;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ if (rlt_arg->rl != NULL) {
+ struct redact_list_cb_arg rlcba = {0};
+ rlcba.cancel = &rlt_arg->cancel;
+ rlcba.q = &rlt_arg->q;
+ rlcba.num_blocks_visited = rlt_arg->num_blocks_visited;
+ rlcba.mark_redact = rlt_arg->mark_redact;
+ int err = dsl_redaction_list_traverse(rlt_arg->rl,
+ &rlt_arg->resume, redact_list_cb, &rlcba);
+ if (err != EINTR)
+ rlt_arg->error_code = err;
+ }
+ record = range_alloc(DATA, 0, 0, 0, B_TRUE);
+ bqueue_enqueue_flush(&rlt_arg->q, record, sizeof (*record));
+ spl_fstrans_unmark(cookie);
+
+ thread_exit();
+}
+
+/*
+ * Compare the start point of the two provided ranges. End of stream ranges
+ * compare last; objects compare before any data or hole inside that object,
+ * and before multi-object holes that start at the same object.
+ */
+static int
+send_range_start_compare(struct send_range *r1, struct send_range *r2)
+{
+ uint64_t r1_objequiv = r1->object;
+ uint64_t r1_l0equiv = r1->start_blkid;
+ uint64_t r2_objequiv = r2->object;
+ uint64_t r2_l0equiv = r2->start_blkid;
+ int64_t cmp = TREE_CMP(r1->eos_marker, r2->eos_marker);
+ if (unlikely(cmp))
+ return (cmp);
+ if (r1->object == 0) {
+ r1_objequiv = r1->start_blkid * DNODES_PER_BLOCK;
+ r1_l0equiv = 0;
+ }
+ if (r2->object == 0) {
+ r2_objequiv = r2->start_blkid * DNODES_PER_BLOCK;
+ r2_l0equiv = 0;
+ }
+
+ cmp = TREE_CMP(r1_objequiv, r2_objequiv);
+ if (likely(cmp))
+ return (cmp);
+ cmp = TREE_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE);
+ if (unlikely(cmp))
+ return (cmp);
+ cmp = TREE_CMP(r2->type == OBJECT, r1->type == OBJECT);
+ if (unlikely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(r1_l0equiv, r2_l0equiv));
+}
+
+enum q_idx {
+ REDACT_IDX = 0,
+ TO_IDX,
+ FROM_IDX,
+ NUM_THREADS
+};
+
+/*
+ * This function returns the next range the send_merge_thread should operate on.
+ * The inputs are two arrays; the first one stores the range at the front of the
+ * queues stored in the second one. The ranges are sorted in descending
+ * priority order; the metadata from earlier ranges overrules metadata from
+ * later ranges. out_mask is used to return which threads the ranges came from;
+ * bit i is set if ranges[i] started at the same place as the returned range.
+ *
+ * This code is not hardcoded to compare a specific number of threads; it could
+ * be used with any number, just by changing the q_idx enum.
+ *
+ * The "next range" is the one with the earliest start; if two starts are equal,
+ * the highest-priority range is the next to operate on. If a higher-priority
+ * range starts in the middle of the first range, then the first range will be
+ * truncated to end where the higher-priority range starts, and we will operate
+ * on that one next time. In this way, we make sure that each block covered by
+ * some range gets covered by a returned range, and each block covered is
+ * returned using the metadata of the highest-priority range it appears in.
+ *
+ * For example, if the three ranges at the front of the queues were [2,4),
+ * [3,5), and [1,3), then the ranges returned would be [1,2) with the metadata
+ * from the third range, [2,4) with the metadata from the first range, and then
+ * [4,5) with the metadata from the second.
+ */
+static struct send_range *
+find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask)
+{
+ int idx = 0; // index of the range with the earliest start
+ int i;
+ uint64_t bmask = 0;
+ for (i = 1; i < NUM_THREADS; i++) {
+ if (send_range_start_compare(ranges[i], ranges[idx]) < 0)
+ idx = i;
+ }
+ if (ranges[idx]->eos_marker) {
+ struct send_range *ret = range_alloc(DATA, 0, 0, 0, B_TRUE);
+ *out_mask = 0;
+ return (ret);
+ }
+ /*
+ * Find all the ranges that start at that same point.
+ */
+ for (i = 0; i < NUM_THREADS; i++) {
+ if (send_range_start_compare(ranges[i], ranges[idx]) == 0)
+ bmask |= 1 << i;
+ }
+ *out_mask = bmask;
+ /*
+ * OBJECT_RANGE records only come from the TO thread, and should always
+ * be treated as overlapping with nothing and sent on immediately. They
+ * are only used in raw sends, and are never redacted.
+ */
+ if (ranges[idx]->type == OBJECT_RANGE) {
+ ASSERT3U(idx, ==, TO_IDX);
+ ASSERT3U(*out_mask, ==, 1 << TO_IDX);
+ struct send_range *ret = ranges[idx];
+ ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]);
+ return (ret);
+ }
+ /*
+ * Find the first start or end point after the start of the first range.
+ */
+ uint64_t first_change = ranges[idx]->end_blkid;
+ for (i = 0; i < NUM_THREADS; i++) {
+ if (i == idx || ranges[i]->eos_marker ||
+ ranges[i]->object > ranges[idx]->object ||
+ ranges[i]->object == DMU_META_DNODE_OBJECT)
+ continue;
+ ASSERT3U(ranges[i]->object, ==, ranges[idx]->object);
+ if (first_change > ranges[i]->start_blkid &&
+ (bmask & (1 << i)) == 0)
+ first_change = ranges[i]->start_blkid;
+ else if (first_change > ranges[i]->end_blkid)
+ first_change = ranges[i]->end_blkid;
+ }
+ /*
+ * Update all ranges to no longer overlap with the range we're
+ * returning. All such ranges must start at the same place as the range
+ * being returned, and end at or after first_change. Thus we update
+ * their start to first_change. If that makes them size 0, then free
+ * them and pull a new range from that thread.
+ */
+ for (i = 0; i < NUM_THREADS; i++) {
+ if (i == idx || (bmask & (1 << i)) == 0)
+ continue;
+ ASSERT3U(first_change, >, ranges[i]->start_blkid);
+ ranges[i]->start_blkid = first_change;
+ ASSERT3U(ranges[i]->start_blkid, <=, ranges[i]->end_blkid);
+ if (ranges[i]->start_blkid == ranges[i]->end_blkid)
+ ranges[i] = get_next_range(qs[i], ranges[i]);
+ }
+ /*
+ * Short-circuit the simple case; if the range doesn't overlap with
+ * anything else, or it only overlaps with things that start at the same
+ * place and are longer, send it on.
+ */
+ if (first_change == ranges[idx]->end_blkid) {
+ struct send_range *ret = ranges[idx];
+ ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]);
+ return (ret);
+ }
+
+ /*
+ * Otherwise, return a truncated copy of ranges[idx] and move the start
+ * of ranges[idx] back to first_change.
+ */
+ struct send_range *ret = kmem_alloc(sizeof (*ret), KM_SLEEP);
+ *ret = *ranges[idx];
+ ret->end_blkid = first_change;
+ ranges[idx]->start_blkid = first_change;
+ return (ret);
+}
+
+#define FROM_AND_REDACT_BITS ((1 << REDACT_IDX) | (1 << FROM_IDX))
+
+/*
+ * Merge the results from the from thread and the to thread, and then hand the
+ * records off to send_prefetch_thread to prefetch them. If this is not a
+ * send from a redaction bookmark, the from thread will push an end of stream
+ * record and stop, and we'll just send everything that was changed in the
+ * to_ds since the ancestor's creation txg. If it is, then since
+ * traverse_dataset has a canonical order, we can compare the changes as
+ * they're pulled off the queues. That will give us a stream that is
+ * appropriately sorted, and covers all records. In addition, we pull the
+ * data from the redact_list_thread and use that to determine which blocks
+ * should be redacted.
+ */
+static void
+send_merge_thread(void *arg)
+{
+ struct send_merge_thread_arg *smt_arg = arg;
+ struct send_range *front_ranges[NUM_THREADS];
+ bqueue_t *queues[NUM_THREADS];
+ int err = 0;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ if (smt_arg->redact_arg == NULL) {
+ front_ranges[REDACT_IDX] =
+ kmem_zalloc(sizeof (struct send_range), KM_SLEEP);
+ front_ranges[REDACT_IDX]->eos_marker = B_TRUE;
+ front_ranges[REDACT_IDX]->type = REDACT;
+ queues[REDACT_IDX] = NULL;
+ } else {
+ front_ranges[REDACT_IDX] =
+ bqueue_dequeue(&smt_arg->redact_arg->q);
+ queues[REDACT_IDX] = &smt_arg->redact_arg->q;
+ }
+ front_ranges[TO_IDX] = bqueue_dequeue(&smt_arg->to_arg->q);
+ queues[TO_IDX] = &smt_arg->to_arg->q;
+ front_ranges[FROM_IDX] = bqueue_dequeue(&smt_arg->from_arg->q);
+ queues[FROM_IDX] = &smt_arg->from_arg->q;
+ uint64_t mask = 0;
+ struct send_range *range;
+ for (range = find_next_range(front_ranges, queues, &mask);
+ !range->eos_marker && err == 0 && !smt_arg->cancel;
+ range = find_next_range(front_ranges, queues, &mask)) {
+ /*
+ * If the range in question was in both the from redact bookmark
+ * and the bookmark we're using to redact, then don't send it.
+ * It's already redacted on the receiving system, so a redaction
+ * record would be redundant.
+ */
+ if ((mask & FROM_AND_REDACT_BITS) == FROM_AND_REDACT_BITS) {
+ ASSERT3U(range->type, ==, REDACT);
+ range_free(range);
+ continue;
+ }
+ bqueue_enqueue(&smt_arg->q, range, sizeof (*range));
+
+ if (smt_arg->to_arg->error_code != 0) {
+ err = smt_arg->to_arg->error_code;
+ } else if (smt_arg->from_arg->error_code != 0) {
+ err = smt_arg->from_arg->error_code;
+ } else if (smt_arg->redact_arg != NULL &&
+ smt_arg->redact_arg->error_code != 0) {
+ err = smt_arg->redact_arg->error_code;
+ }
+ }
+ if (smt_arg->cancel && err == 0)
+ err = SET_ERROR(EINTR);
+ smt_arg->error = err;
+ if (smt_arg->error != 0) {
+ smt_arg->to_arg->cancel = B_TRUE;
+ smt_arg->from_arg->cancel = B_TRUE;
+ if (smt_arg->redact_arg != NULL)
+ smt_arg->redact_arg->cancel = B_TRUE;
+ }
+ for (int i = 0; i < NUM_THREADS; i++) {
+ while (!front_ranges[i]->eos_marker) {
+ front_ranges[i] = get_next_range(queues[i],
+ front_ranges[i]);
+ }
+ range_free(front_ranges[i]);
+ }
+ if (range == NULL)
+ range = kmem_zalloc(sizeof (*range), KM_SLEEP);
+ range->eos_marker = B_TRUE;
+ bqueue_enqueue_flush(&smt_arg->q, range, 1);
+ spl_fstrans_unmark(cookie);
+ thread_exit();
+}
+
+struct send_reader_thread_arg {
+ struct send_merge_thread_arg *smta;
+ bqueue_t q;
+ boolean_t cancel;
+ boolean_t issue_reads;
+ uint64_t featureflags;
+ int error;
+};
+
+static void
+dmu_send_read_done(zio_t *zio)
+{
+ struct send_range *range = zio->io_private;
+
+ mutex_enter(&range->sru.data.lock);
+ if (zio->io_error != 0) {
+ abd_free(range->sru.data.abd);
+ range->sru.data.abd = NULL;
+ range->sru.data.io_err = zio->io_error;
+ }
+
+ ASSERT(range->sru.data.io_outstanding);
+ range->sru.data.io_outstanding = B_FALSE;
+ cv_broadcast(&range->sru.data.cv);
+ mutex_exit(&range->sru.data.lock);
+}
+
+static void
+issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range)
+{
+ struct srd *srdp = &range->sru.data;
+ blkptr_t *bp = &srdp->bp;
+ objset_t *os = srta->smta->os;
+
+ ASSERT3U(range->type, ==, DATA);
+ ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
+ /*
+ * If we have large blocks stored on disk but
+ * the send flags don't allow us to send large
+ * blocks, we split the data from the arc buf
+ * into chunks.
+ */
+ boolean_t split_large_blocks =
+ srdp->datablksz > SPA_OLD_MAXBLOCKSIZE &&
+ !(srta->featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
+ /*
+ * We should only request compressed data from the ARC if all
+ * the following are true:
+ * - stream compression was requested
+ * - we aren't splitting large blocks into smaller chunks
+ * - the data won't need to be byteswapped before sending
+ * - this isn't an embedded block
+ * - this isn't metadata (if receiving on a different endian
+ * system it can be byteswapped more easily)
+ */
+ boolean_t request_compressed =
+ (srta->featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
+ !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
+ !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
+
+ enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
+
+ if (srta->featureflags & DMU_BACKUP_FEATURE_RAW)
+ zioflags |= ZIO_FLAG_RAW;
+ else if (request_compressed)
+ zioflags |= ZIO_FLAG_RAW_COMPRESS;
+
+ srdp->datasz = (zioflags & ZIO_FLAG_RAW_COMPRESS) ?
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp);
+
+ if (!srta->issue_reads)
+ return;
+ if (BP_IS_REDACTED(bp))
+ return;
+ if (send_do_embed(bp, srta->featureflags))
+ return;
+
+ zbookmark_phys_t zb = {
+ .zb_objset = dmu_objset_id(os),
+ .zb_object = range->object,
+ .zb_level = 0,
+ .zb_blkid = range->start_blkid,
+ };
+
+ arc_flags_t aflags = ARC_FLAG_CACHED_ONLY;
+
+ int arc_err = arc_read(NULL, os->os_spa, bp,
+ arc_getbuf_func, &srdp->abuf, ZIO_PRIORITY_ASYNC_READ,
+ zioflags, &aflags, &zb);
+ /*
+ * If the data is not already cached in the ARC, we read directly
+ * from zio. This avoids the performance overhead of adding a new
+ * entry to the ARC, and we also avoid polluting the ARC cache with
+ * data that is not likely to be used in the future.
+ */
+ if (arc_err != 0) {
+ srdp->abd = abd_alloc_linear(srdp->datasz, B_FALSE);
+ srdp->io_outstanding = B_TRUE;
+ zio_nowait(zio_read(NULL, os->os_spa, bp, srdp->abd,
+ srdp->datasz, dmu_send_read_done, range,
+ ZIO_PRIORITY_ASYNC_READ, zioflags, &zb));
+ }
+}
+
+/*
+ * Create a new record with the given values.
+ */
+static void
+enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn,
+ uint64_t blkid, uint64_t count, const blkptr_t *bp, uint32_t datablksz)
+{
+ enum type range_type = (bp == NULL || BP_IS_HOLE(bp) ? HOLE :
+ (BP_IS_REDACTED(bp) ? REDACT : DATA));
+
+ struct send_range *range = range_alloc(range_type, dn->dn_object,
+ blkid, blkid + count, B_FALSE);
+
+ if (blkid == DMU_SPILL_BLKID)
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA);
+
+ switch (range_type) {
+ case HOLE:
+ range->sru.hole.datablksz = datablksz;
+ break;
+ case DATA:
+ ASSERT3U(count, ==, 1);
+ range->sru.data.datablksz = datablksz;
+ range->sru.data.obj_type = dn->dn_type;
+ range->sru.data.bp = *bp;
+ issue_data_read(srta, range);
+ break;
+ case REDACT:
+ range->sru.redact.datablksz = datablksz;
+ break;
+ default:
+ break;
+ }
+ bqueue_enqueue(q, range, datablksz);
+}
+
+/*
+ * This thread is responsible for two things: First, it retrieves the correct
+ * blkptr in the to ds if we need to send the data because of something from
+ * the from thread. As a result of this, we're the first ones to discover that
+ * some indirect blocks can be discarded because they're not holes. Second,
+ * it issues prefetches for the data we need to send.
+ */
+static void
+send_reader_thread(void *arg)
+{
+ struct send_reader_thread_arg *srta = arg;
+ struct send_merge_thread_arg *smta = srta->smta;
+ bqueue_t *inq = &smta->q;
+ bqueue_t *outq = &srta->q;
+ objset_t *os = smta->os;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ struct send_range *range = bqueue_dequeue(inq);
+ int err = 0;
+
+ /*
+ * If the record we're analyzing is from a redaction bookmark from the
+ * fromds, then we need to know whether or not it exists in the tods so
+ * we know whether to create records for it or not. If it does, we need
+ * the datablksz so we can generate an appropriate record for it.
+ * Finally, if it isn't redacted, we need the blkptr so that we can send
+ * a WRITE record containing the actual data.
+ */
+ uint64_t last_obj = UINT64_MAX;
+ uint64_t last_obj_exists = B_TRUE;
+ while (!range->eos_marker && !srta->cancel && smta->error == 0 &&
+ err == 0) {
+ switch (range->type) {
+ case DATA:
+ issue_data_read(srta, range);
+ bqueue_enqueue(outq, range, range->sru.data.datablksz);
+ range = get_next_range_nofree(inq, range);
+ break;
+ case HOLE:
+ case OBJECT:
+ case OBJECT_RANGE:
+ case REDACT: // Redacted blocks must exist
+ bqueue_enqueue(outq, range, sizeof (*range));
+ range = get_next_range_nofree(inq, range);
+ break;
+ case PREVIOUSLY_REDACTED: {
+ /*
+ * This entry came from the "from bookmark" when
+ * sending from a bookmark that has a redaction
+ * list. We need to check if this object/blkid
+ * exists in the target ("to") dataset, and if
+ * not then we drop this entry. We also need
+ * to fill in the block pointer so that we know
+ * what to prefetch.
+ *
+ * To accomplish the above, we first cache whether or
+ * not the last object we examined exists. If it
+ * doesn't, we can drop this record. If it does, we hold
+ * the dnode and use it to call dbuf_dnode_findbp. We do
+ * this instead of dbuf_bookmark_findbp because we will
+ * often operate on large ranges, and holding the dnode
+ * once is more efficient.
+ */
+ boolean_t object_exists = B_TRUE;
+ /*
+ * If the data is redacted, we only care if it exists,
+ * so that we don't send records for objects that have
+ * been deleted.
+ */
+ dnode_t *dn;
+ if (range->object == last_obj && !last_obj_exists) {
+ /*
+ * If we're still examining the same object as
+ * previously, and it doesn't exist, we don't
+ * need to call dbuf_bookmark_findbp.
+ */
+ object_exists = B_FALSE;
+ } else {
+ err = dnode_hold(os, range->object, FTAG, &dn);
+ if (err == ENOENT) {
+ object_exists = B_FALSE;
+ err = 0;
+ }
+ last_obj = range->object;
+ last_obj_exists = object_exists;
+ }
+
+ if (err != 0) {
+ break;
+ } else if (!object_exists) {
+ /*
+ * The block was modified, but doesn't
+ * exist in the to dataset; if it was
+ * deleted in the to dataset, then we'll
+ * visit the hole bp for it at some point.
+ */
+ range = get_next_range(inq, range);
+ continue;
+ }
+ uint64_t file_max =
+ (dn->dn_maxblkid < range->end_blkid ?
+ dn->dn_maxblkid : range->end_blkid);
+ /*
+ * The object exists, so we need to try to find the
+ * blkptr for each block in the range we're processing.
+ */
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ for (uint64_t blkid = range->start_blkid;
+ blkid < file_max; blkid++) {
+ blkptr_t bp;
+ uint32_t datablksz =
+ dn->dn_phys->dn_datablkszsec <<
+ SPA_MINBLOCKSHIFT;
+ uint64_t offset = blkid * datablksz;
+ /*
+ * This call finds the next non-hole block in
+ * the object. This is to prevent a
+ * performance problem where we're unredacting
+ * a large hole. Using dnode_next_offset to
+ * skip over the large hole avoids iterating
+ * over every block in it.
+ */
+ err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+ &offset, 1, 1, 0);
+ if (err == ESRCH) {
+ offset = UINT64_MAX;
+ err = 0;
+ } else if (err != 0) {
+ break;
+ }
+ if (offset != blkid * datablksz) {
+ /*
+ * There is a hole from here (blkid)
+ * up to offset.
+ */
+ offset = MIN(offset, file_max *
+ datablksz);
+ uint64_t nblks = (offset / datablksz) -
+ blkid;
+ enqueue_range(srta, outq, dn, blkid,
+ nblks, NULL, datablksz);
+ blkid += nblks;
+ }
+ if (blkid >= file_max)
+ break;
+ err = dbuf_dnode_findbp(dn, 0, blkid, &bp,
+ NULL, NULL);
+ if (err != 0)
+ break;
+ ASSERT(!BP_IS_HOLE(&bp));
+ enqueue_range(srta, outq, dn, blkid, 1, &bp,
+ datablksz);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ range = get_next_range(inq, range);
+ }
+ }
+ }
+ if (srta->cancel || err != 0) {
+ smta->cancel = B_TRUE;
+ srta->error = err;
+ } else if (smta->error != 0) {
+ srta->error = smta->error;
+ }
+ while (!range->eos_marker)
+ range = get_next_range(inq, range);
+
+ bqueue_enqueue_flush(outq, range, 1);
+ spl_fstrans_unmark(cookie);
+ thread_exit();
+}
+
+#define NUM_SNAPS_NOT_REDACTED UINT64_MAX
+
+struct dmu_send_params {
+ /* Pool args */
+ void *tag; // Tag that dp was held with, will be used to release dp.
+ dsl_pool_t *dp;
+ /* To snapshot args */
+ const char *tosnap;
+ dsl_dataset_t *to_ds;
+ /* From snapshot args */
+ zfs_bookmark_phys_t ancestor_zb;
+ uint64_t *fromredactsnaps;
+ /* NUM_SNAPS_NOT_REDACTED if not sending from redaction bookmark */
+ uint64_t numfromredactsnaps;
+ /* Stream params */
+ boolean_t is_clone;
+ boolean_t embedok;
+ boolean_t large_block_ok;
+ boolean_t compressok;
+ boolean_t rawok;
+ boolean_t savedok;
+ uint64_t resumeobj;
+ uint64_t resumeoff;
+ uint64_t saved_guid;
+ zfs_bookmark_phys_t *redactbook;
+ /* Stream output params */
+ dmu_send_outparams_t *dso;
+
+ /* Stream progress params */
+ offset_t *off;
+ int outfd;
+ char saved_toname[MAXNAMELEN];
+};
+
+static int
+setup_featureflags(struct dmu_send_params *dspp, objset_t *os,
+ uint64_t *featureflags)
+{
+ dsl_dataset_t *to_ds = dspp->to_ds;
+ dsl_pool_t *dp = dspp->dp;
+#ifdef _KERNEL
+ if (dmu_objset_type(os) == DMU_OST_ZFS) {
+ uint64_t version;
+ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (version >= ZPL_VERSION_SA)
+ *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
+ }
+#endif
+
+ /* raw sends imply large_block_ok */
+ if ((dspp->rawok || dspp->large_block_ok) &&
+ dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS)) {
+ *featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+ }
+
+ /* encrypted datasets will not have embedded blocks */
+ if ((dspp->embedok || dspp->rawok) && !os->os_encrypted &&
+ spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ *featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
+ }
+
+ /* raw send implies compressok */
+ if (dspp->compressok || dspp->rawok)
+ *featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
+
+ if (dspp->rawok && os->os_encrypted)
+ *featureflags |= DMU_BACKUP_FEATURE_RAW;
+
+ if ((*featureflags &
+ (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
+ DMU_BACKUP_FEATURE_RAW)) != 0 &&
+ spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
+ *featureflags |= DMU_BACKUP_FEATURE_LZ4;
+ }
+
+ /*
+ * We specifically do not include DMU_BACKUP_FEATURE_EMBED_DATA here to
+ * allow sending ZSTD compressed datasets to a receiver that does not
+ * support ZSTD
+ */
+ if ((*featureflags &
+ (DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_RAW)) != 0 &&
+ dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_ZSTD_COMPRESS)) {
+ *featureflags |= DMU_BACKUP_FEATURE_ZSTD;
+ }
+
+ if (dspp->resumeobj != 0 || dspp->resumeoff != 0) {
+ *featureflags |= DMU_BACKUP_FEATURE_RESUMING;
+ }
+
+ if (dspp->redactbook != NULL) {
+ *featureflags |= DMU_BACKUP_FEATURE_REDACTED;
+ }
+
+ if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE)) {
+ *featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
+ }
+ return (0);
+}
+
+static dmu_replay_record_t *
+create_begin_record(struct dmu_send_params *dspp, objset_t *os,
+ uint64_t featureflags)
+{
+ dmu_replay_record_t *drr = kmem_zalloc(sizeof (dmu_replay_record_t),
+ KM_SLEEP);
+ drr->drr_type = DRR_BEGIN;
+
+ struct drr_begin *drrb = &drr->drr_u.drr_begin;
+ dsl_dataset_t *to_ds = dspp->to_ds;
+
+ drrb->drr_magic = DMU_BACKUP_MAGIC;
+ drrb->drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time;
+ drrb->drr_type = dmu_objset_type(os);
+ drrb->drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+ drrb->drr_fromguid = dspp->ancestor_zb.zbm_guid;
+
+ DMU_SET_STREAM_HDRTYPE(drrb->drr_versioninfo, DMU_SUBSTREAM);
+ DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, featureflags);
+
+ if (dspp->is_clone)
+ drrb->drr_flags |= DRR_FLAG_CLONE;
+ if (dsl_dataset_phys(dspp->to_ds)->ds_flags & DS_FLAG_CI_DATASET)
+ drrb->drr_flags |= DRR_FLAG_CI_DATA;
+ if (zfs_send_set_freerecords_bit)
+ drrb->drr_flags |= DRR_FLAG_FREERECORDS;
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;
+
+ if (dspp->savedok) {
+ drrb->drr_toguid = dspp->saved_guid;
+ strlcpy(drrb->drr_toname, dspp->saved_toname,
+ sizeof (drrb->drr_toname));
+ } else {
+ dsl_dataset_name(to_ds, drrb->drr_toname);
+ if (!to_ds->ds_is_snapshot) {
+ (void) strlcat(drrb->drr_toname, "@--head--",
+ sizeof (drrb->drr_toname));
+ }
+ }
+ return (drr);
+}
+
+static void
+setup_to_thread(struct send_thread_arg *to_arg, objset_t *to_os,
+ dmu_sendstatus_t *dssp, uint64_t fromtxg, boolean_t rawok)
+{
+ VERIFY0(bqueue_init(&to_arg->q, zfs_send_no_prefetch_queue_ff,
+ MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct send_range, ln)));
+ to_arg->error_code = 0;
+ to_arg->cancel = B_FALSE;
+ to_arg->os = to_os;
+ to_arg->fromtxg = fromtxg;
+ to_arg->flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA;
+ if (rawok)
+ to_arg->flags |= TRAVERSE_NO_DECRYPT;
+ to_arg->num_blocks_visited = &dssp->dss_blocks;
+ (void) thread_create(NULL, 0, send_traverse_thread, to_arg, 0,
+ curproc, TS_RUN, minclsyspri);
+}
+
+static void
+setup_from_thread(struct redact_list_thread_arg *from_arg,
+ redaction_list_t *from_rl, dmu_sendstatus_t *dssp)
+{
+ VERIFY0(bqueue_init(&from_arg->q, zfs_send_no_prefetch_queue_ff,
+ MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct send_range, ln)));
+ from_arg->error_code = 0;
+ from_arg->cancel = B_FALSE;
+ from_arg->rl = from_rl;
+ from_arg->mark_redact = B_FALSE;
+ from_arg->num_blocks_visited = &dssp->dss_blocks;
+ /*
+ * If from_rl is NULL, redact_list_thread just enqueues an eos marker
+ * and exits.
+ */
+ (void) thread_create(NULL, 0, redact_list_thread, from_arg, 0,
+ curproc, TS_RUN, minclsyspri);
+}
+
+static void
+setup_redact_list_thread(struct redact_list_thread_arg *rlt_arg,
+ struct dmu_send_params *dspp, redaction_list_t *rl, dmu_sendstatus_t *dssp)
+{
+ if (dspp->redactbook == NULL)
+ return;
+
+ rlt_arg->cancel = B_FALSE;
+ VERIFY0(bqueue_init(&rlt_arg->q, zfs_send_no_prefetch_queue_ff,
+ MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct send_range, ln)));
+ rlt_arg->error_code = 0;
+ rlt_arg->mark_redact = B_TRUE;
+ rlt_arg->rl = rl;
+ rlt_arg->num_blocks_visited = &dssp->dss_blocks;
+
+ (void) thread_create(NULL, 0, redact_list_thread, rlt_arg, 0,
+ curproc, TS_RUN, minclsyspri);
+}
+
+static void
+setup_merge_thread(struct send_merge_thread_arg *smt_arg,
+ struct dmu_send_params *dspp, struct redact_list_thread_arg *from_arg,
+ struct send_thread_arg *to_arg, struct redact_list_thread_arg *rlt_arg,
+ objset_t *os)
+{
+ VERIFY0(bqueue_init(&smt_arg->q, zfs_send_no_prefetch_queue_ff,
+ MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct send_range, ln)));
+ smt_arg->cancel = B_FALSE;
+ smt_arg->error = 0;
+ smt_arg->from_arg = from_arg;
+ smt_arg->to_arg = to_arg;
+ if (dspp->redactbook != NULL)
+ smt_arg->redact_arg = rlt_arg;
+
+ smt_arg->os = os;
+ (void) thread_create(NULL, 0, send_merge_thread, smt_arg, 0, curproc,
+ TS_RUN, minclsyspri);
+}
+
+static void
+setup_reader_thread(struct send_reader_thread_arg *srt_arg,
+ struct dmu_send_params *dspp, struct send_merge_thread_arg *smt_arg,
+ uint64_t featureflags)
+{
+ VERIFY0(bqueue_init(&srt_arg->q, zfs_send_queue_ff,
+ MAX(zfs_send_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct send_range, ln)));
+ srt_arg->smta = smt_arg;
+ srt_arg->issue_reads = !dspp->dso->dso_dryrun;
+ srt_arg->featureflags = featureflags;
+ (void) thread_create(NULL, 0, send_reader_thread, srt_arg, 0,
+ curproc, TS_RUN, minclsyspri);
+}
+
+static int
+setup_resume_points(struct dmu_send_params *dspp,
+ struct send_thread_arg *to_arg, struct redact_list_thread_arg *from_arg,
+ struct redact_list_thread_arg *rlt_arg,
+ struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os,
+ redaction_list_t *redact_rl, nvlist_t *nvl)
+{
+ dsl_dataset_t *to_ds = dspp->to_ds;
+ int err = 0;
+
+ uint64_t obj = 0;
+ uint64_t blkid = 0;
+ if (resuming) {
+ obj = dspp->resumeobj;
+ dmu_object_info_t to_doi;
+ err = dmu_object_info(os, obj, &to_doi);
+ if (err != 0)
+ return (err);
+
+ blkid = dspp->resumeoff / to_doi.doi_data_block_size;
+ }
+ /*
+ * If we're resuming a redacted send, we can skip to the appropriate
+ * point in the redaction bookmark by binary searching through it.
+ */
+ if (redact_rl != NULL) {
+ SET_BOOKMARK(&rlt_arg->resume, to_ds->ds_object, obj, 0, blkid);
+ }
+
+ SET_BOOKMARK(&to_arg->resume, to_ds->ds_object, obj, 0, blkid);
+ if (nvlist_exists(nvl, BEGINNV_REDACT_FROM_SNAPS)) {
+ uint64_t objset = dspp->ancestor_zb.zbm_redaction_obj;
+ /*
+ * Note: If the resume point is in an object whose
+ * blocksize is different in the from vs to snapshots,
+ * we will have divided by the "wrong" blocksize.
+ * However, in this case fromsnap's send_cb() will
+ * detect that the blocksize has changed and therefore
+ * ignore this object.
+ *
+ * If we're resuming a send from a redaction bookmark,
+ * we still cannot accidentally suggest blocks behind
+ * the to_ds. In addition, we know that any blocks in
+ * the object in the to_ds will have to be sent, since
+ * the size changed. Therefore, we can't cause any harm
+ * this way either.
+ */
+ SET_BOOKMARK(&from_arg->resume, objset, obj, 0, blkid);
+ }
+ if (resuming) {
+ fnvlist_add_uint64(nvl, BEGINNV_RESUME_OBJECT, dspp->resumeobj);
+ fnvlist_add_uint64(nvl, BEGINNV_RESUME_OFFSET, dspp->resumeoff);
+ }
+ return (0);
+}
+
+static dmu_sendstatus_t *
+setup_send_progress(struct dmu_send_params *dspp)
+{
+ dmu_sendstatus_t *dssp = kmem_zalloc(sizeof (*dssp), KM_SLEEP);
+ dssp->dss_outfd = dspp->outfd;
+ dssp->dss_off = dspp->off;
+ dssp->dss_proc = curproc;
+ mutex_enter(&dspp->to_ds->ds_sendstream_lock);
+ list_insert_head(&dspp->to_ds->ds_sendstreams, dssp);
+ mutex_exit(&dspp->to_ds->ds_sendstream_lock);
+ return (dssp);
+}
+
+/*
+ * Actually do the bulk of the work in a zfs send.
+ *
+ * The idea is that we want to do a send from ancestor_zb to to_ds. We also
+ * want to not send any data that has been modified by all the datasets in
+ * redactsnaparr, and store the list of blocks that are redacted in this way in
+ * a bookmark named redactbook, created on the to_ds. We do this by creating
+ * several worker threads, whose function is described below.
+ *
+ * There are three cases.
+ * The first case is a redacted zfs send. In this case there are 5 threads.
+ * The first thread is the to_ds traversal thread: it calls dataset_traverse on
+ * the to_ds and finds all the blocks that have changed since ancestor_zb (if
+ * it's a full send, that's all blocks in the dataset). It then sends those
+ * blocks on to the send merge thread. The redact list thread takes the data
+ * from the redaction bookmark and sends those blocks on to the send merge
+ * thread. The send merge thread takes the data from the to_ds traversal
+ * thread, and combines it with the redaction records from the redact list
+ * thread. If a block appears in both the to_ds's data and the redaction data,
+ * the send merge thread will mark it as redacted and send it on to the prefetch
+ * thread. Otherwise, the send merge thread will send the block on to the
+ * prefetch thread unchanged. The prefetch thread will issue prefetch reads for
+ * any data that isn't redacted, and then send the data on to the main thread.
+ * The main thread behaves the same as in a normal send case, issuing demand
+ * reads for data blocks and sending out records over the network.
+ *
+ * The graphic below diagrams the flow of data in the case of a redacted zfs
+ * send. Each box represents a thread, and each line represents the flow of
+ * data.
+ *
+ * Records from the |
+ * redaction bookmark |
+ * +--------------------+ | +---------------------------+
+ * | | v | Send Merge Thread |
+ * | Redact List Thread +----------> Apply redaction marks to |
+ * | | | records as specified by |
+ * +--------------------+ | redaction ranges |
+ * +----^---------------+------+
+ * | | Merged data
+ * | |
+ * | +------------v--------+
+ * | | Prefetch Thread |
+ * +--------------------+ | | Issues prefetch |
+ * | to_ds Traversal | | | reads of data blocks|
+ * | Thread (finds +---------------+ +------------+--------+
+ * | candidate blocks) | Blocks modified | Prefetched data
+ * +--------------------+ by to_ds since |
+ * ancestor_zb +------------v----+
+ * | Main Thread | File Descriptor
+ * | Sends data over +->(to zfs receive)
+ * | wire |
+ * +-----------------+
+ *
+ * The second case is an incremental send from a redaction bookmark. The to_ds
+ * traversal thread and the main thread behave the same as in the redacted
+ * send case. The new thread is the from bookmark traversal thread. It
+ * iterates over the redaction list in the redaction bookmark, and enqueues
+ * records for each block that was redacted in the original send. The send
+ * merge thread now has to merge the data from the two threads. For details
+ * about that process, see the header comment of send_merge_thread(). Any data
+ * it decides to send on will be prefetched by the prefetch thread. Note that
+ * you can perform a redacted send from a redaction bookmark; in that case,
+ * the data flow behaves very similarly to the flow in the redacted send case,
+ * except with the addition of the bookmark traversal thread iterating over the
+ * redaction bookmark. The send_merge_thread also has to take on the
+ * responsibility of merging the redact list thread's records, the bookmark
+ * traversal thread's records, and the to_ds records.
+ *
+ * +---------------------+
+ * | |
+ * | Redact List Thread +--------------+
+ * | | |
+ * +---------------------+ |
+ * Blocks in redaction list | Ranges modified by every secure snap
+ *             of from bookmark           |    (or EOS if not redacted)
+ * |
+ * +---------------------+ | +----v----------------------+
+ * | bookmark Traversal | v | Send Merge Thread |
+ * | Thread (finds +---------> Merges bookmark, rlt, and |
+ * | candidate blocks) | | to_ds send records |
+ * +---------------------+ +----^---------------+------+
+ * | | Merged data
+ * | +------------v--------+
+ * | | Prefetch Thread |
+ * +--------------------+ | | Issues prefetch |
+ * | to_ds Traversal | | | reads of data blocks|
+ * | Thread (finds +---------------+ +------------+--------+
+ * | candidate blocks) | Blocks modified | Prefetched data
+ * +--------------------+ by to_ds since +------------v----+
+ * ancestor_zb | Main Thread | File Descriptor
+ * | Sends data over +->(to zfs receive)
+ * | wire |
+ * +-----------------+
+ *
+ * The final case is a simple zfs full or incremental send. The to_ds traversal
+ * thread behaves the same as always. The redact list thread is never started.
+ * The send merge thread takes all the blocks that the to_ds traversal thread
+ * sends it, prefetches the data, and sends the blocks on to the main thread.
+ * The main thread sends the data over the wire.
+ *
+ * To keep performance acceptable, we want to prefetch the data in the worker
+ * threads. While the to_ds thread could simply use the TRAVERSE_PREFETCH
+ * feature built into traverse_dataset, the combining and deletion of records
+ * due to redaction and sends from redaction bookmarks mean that we could
+ * issue many unnecessary prefetches. As a result, we only prefetch data
+ * after we've determined that the record is not going to be redacted. To
+ * prevent the prefetching from getting too far ahead of the main thread, the
+ * blocking queues that are used for communication are capped not by the
+ * number of entries in the queue, but by the sum of the size of the
+ * prefetches associated with them. The limit on the amount of data that the
+ * thread can prefetch beyond what the main thread has reached is controlled
+ * by the global variable zfs_send_queue_length. In addition, to prevent poor
+ * performance at the beginning of a send, we also limit how far ahead the
+ * traversal threads can get. That distance is controlled by the
+ * zfs_send_no_prefetch_queue_length tunable.
+ *
+ * Note: Releases dp using the specified tag.
+ */
+static int
+dmu_send_impl(struct dmu_send_params *dspp)
+{
+ objset_t *os;
+ dmu_replay_record_t *drr;
+ dmu_sendstatus_t *dssp;
+ dmu_send_cookie_t dsc = {0};
+ int err;
+ uint64_t fromtxg = dspp->ancestor_zb.zbm_creation_txg;
+ uint64_t featureflags = 0;
+ struct redact_list_thread_arg *from_arg;
+ struct send_thread_arg *to_arg;
+ struct redact_list_thread_arg *rlt_arg;
+ struct send_merge_thread_arg *smt_arg;
+ struct send_reader_thread_arg *srt_arg;
+ struct send_range *range;
+ redaction_list_t *from_rl = NULL;
+ redaction_list_t *redact_rl = NULL;
+ boolean_t resuming = (dspp->resumeobj != 0 || dspp->resumeoff != 0);
+ boolean_t book_resuming = resuming;
+
+ dsl_dataset_t *to_ds = dspp->to_ds;
+ zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb;
+ dsl_pool_t *dp = dspp->dp;
+ void *tag = dspp->tag;
+
+ err = dmu_objset_from_ds(to_ds, &os);
+ if (err != 0) {
+ dsl_pool_rele(dp, tag);
+ return (err);
+ }
+
+ /*
+ * If this is a non-raw send of an encrypted ds, we can ensure that
+ * the objset_phys_t is authenticated. This is safe because this is
+ * either a snapshot or we have owned the dataset, ensuring that
+ * it can't be modified.
+ */
+ if (!dspp->rawok && os->os_encrypted &&
+ arc_is_unauthenticated(os->os_phys_buf)) {
+ zbookmark_phys_t zb;
+
+ SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
+ ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ err = arc_untransform(os->os_phys_buf, os->os_spa,
+ &zb, B_FALSE);
+ if (err != 0) {
+ dsl_pool_rele(dp, tag);
+ return (err);
+ }
+
+ ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
+ }
+
+ if ((err = setup_featureflags(dspp, os, &featureflags)) != 0) {
+ dsl_pool_rele(dp, tag);
+ return (err);
+ }
+
+ /*
+ * If we're doing a redacted send, hold the bookmark's redaction list.
+ */
+ if (dspp->redactbook != NULL) {
+ err = dsl_redaction_list_hold_obj(dp,
+ dspp->redactbook->zbm_redaction_obj, FTAG,
+ &redact_rl);
+ if (err != 0) {
+ dsl_pool_rele(dp, tag);
+ return (SET_ERROR(EINVAL));
+ }
+ dsl_redaction_list_long_hold(dp, redact_rl, FTAG);
+ }
+
+ /*
+ * If we're sending from a redaction bookmark, hold the redaction list
+ * so that we can consider sending the redacted blocks.
+ */
+ if (ancestor_zb->zbm_redaction_obj != 0) {
+ err = dsl_redaction_list_hold_obj(dp,
+ ancestor_zb->zbm_redaction_obj, FTAG, &from_rl);
+ if (err != 0) {
+ if (redact_rl != NULL) {
+ dsl_redaction_list_long_rele(redact_rl, FTAG);
+ dsl_redaction_list_rele(redact_rl, FTAG);
+ }
+ dsl_pool_rele(dp, tag);
+ return (SET_ERROR(EINVAL));
+ }
+ dsl_redaction_list_long_hold(dp, from_rl, FTAG);
+ }
+
+ dsl_dataset_long_hold(to_ds, FTAG);
+
+ from_arg = kmem_zalloc(sizeof (*from_arg), KM_SLEEP);
+ to_arg = kmem_zalloc(sizeof (*to_arg), KM_SLEEP);
+ rlt_arg = kmem_zalloc(sizeof (*rlt_arg), KM_SLEEP);
+ smt_arg = kmem_zalloc(sizeof (*smt_arg), KM_SLEEP);
+ srt_arg = kmem_zalloc(sizeof (*srt_arg), KM_SLEEP);
+
+ drr = create_begin_record(dspp, os, featureflags);
+ dssp = setup_send_progress(dspp);
+
+ dsc.dsc_drr = drr;
+ dsc.dsc_dso = dspp->dso;
+ dsc.dsc_os = os;
+ dsc.dsc_off = dspp->off;
+ dsc.dsc_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+ dsc.dsc_fromtxg = fromtxg;
+ dsc.dsc_pending_op = PENDING_NONE;
+ dsc.dsc_featureflags = featureflags;
+ dsc.dsc_resume_object = dspp->resumeobj;
+ dsc.dsc_resume_offset = dspp->resumeoff;
+
+ dsl_pool_rele(dp, tag);
+
+ void *payload = NULL;
+ size_t payload_len = 0;
+ nvlist_t *nvl = fnvlist_alloc();
+
+ /*
+	 * If we're doing a redacted send, include the snapshots the send is
+	 * redacted with respect to, so that the target system knows which
+	 * send streams can be correctly received on top of this dataset. If
+	 * we're instead sending a redacted dataset, include the snapshots
+	 * that the dataset was created with respect to.
+ */
+ if (dspp->redactbook != NULL) {
+ fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS,
+ redact_rl->rl_phys->rlp_snaps,
+ redact_rl->rl_phys->rlp_num_snaps);
+ } else if (dsl_dataset_feature_is_active(to_ds,
+ SPA_FEATURE_REDACTED_DATASETS)) {
+ uint64_t *tods_guids;
+ uint64_t length;
+ VERIFY(dsl_dataset_get_uint64_array_feature(to_ds,
+ SPA_FEATURE_REDACTED_DATASETS, &length, &tods_guids));
+ fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, tods_guids,
+ length);
+ }
+
+ /*
+ * If we're sending from a redaction bookmark, then we should retrieve
+ * the guids of that bookmark so we can send them over the wire.
+ */
+ if (from_rl != NULL) {
+ fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS,
+ from_rl->rl_phys->rlp_snaps,
+ from_rl->rl_phys->rlp_num_snaps);
+ }
+
+ /*
+ * If the snapshot we're sending from is redacted, include the redaction
+ * list in the stream.
+ */
+ if (dspp->numfromredactsnaps != NUM_SNAPS_NOT_REDACTED) {
+ ASSERT3P(from_rl, ==, NULL);
+ fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS,
+ dspp->fromredactsnaps, (uint_t)dspp->numfromredactsnaps);
+ if (dspp->numfromredactsnaps > 0) {
+ kmem_free(dspp->fromredactsnaps,
+ dspp->numfromredactsnaps * sizeof (uint64_t));
+ dspp->fromredactsnaps = NULL;
+ }
+ }
+
+ if (resuming || book_resuming) {
+ err = setup_resume_points(dspp, to_arg, from_arg,
+ rlt_arg, smt_arg, resuming, os, redact_rl, nvl);
+ if (err != 0)
+ goto out;
+ }
+
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ uint64_t ivset_guid = (ancestor_zb != NULL) ?
+ ancestor_zb->zbm_ivset_guid : 0;
+ nvlist_t *keynvl = NULL;
+ ASSERT(os->os_encrypted);
+
+ err = dsl_crypto_populate_key_nvlist(os, ivset_guid,
+ &keynvl);
+ if (err != 0) {
+ fnvlist_free(nvl);
+ goto out;
+ }
+
+ fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
+ fnvlist_free(keynvl);
+ }
+
+ if (!nvlist_empty(nvl)) {
+ payload = fnvlist_pack(nvl, &payload_len);
+ drr->drr_payloadlen = payload_len;
+ }
+
+ fnvlist_free(nvl);
+ err = dump_record(&dsc, payload, payload_len);
+ fnvlist_pack_free(payload, payload_len);
+ if (err != 0) {
+ err = dsc.dsc_err;
+ goto out;
+ }
+
+ setup_to_thread(to_arg, os, dssp, fromtxg, dspp->rawok);
+ setup_from_thread(from_arg, from_rl, dssp);
+ setup_redact_list_thread(rlt_arg, dspp, redact_rl, dssp);
+ setup_merge_thread(smt_arg, dspp, from_arg, to_arg, rlt_arg, os);
+ setup_reader_thread(srt_arg, dspp, smt_arg, featureflags);
+
+ range = bqueue_dequeue(&srt_arg->q);
+ while (err == 0 && !range->eos_marker) {
+ err = do_dump(&dsc, range);
+ range = get_next_range(&srt_arg->q, range);
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ err = SET_ERROR(EINTR);
+ }
+
+ /*
+ * If we hit an error or are interrupted, cancel our worker threads and
+ * clear the queue of any pending records. The threads will pass the
+ * cancel up the tree of worker threads, and each one will clean up any
+ * pending records before exiting.
+ */
+ if (err != 0) {
+ srt_arg->cancel = B_TRUE;
+ while (!range->eos_marker) {
+ range = get_next_range(&srt_arg->q, range);
+ }
+ }
+ range_free(range);
+
+ bqueue_destroy(&srt_arg->q);
+ bqueue_destroy(&smt_arg->q);
+ if (dspp->redactbook != NULL)
+ bqueue_destroy(&rlt_arg->q);
+ bqueue_destroy(&to_arg->q);
+ bqueue_destroy(&from_arg->q);
+
+ if (err == 0 && srt_arg->error != 0)
+ err = srt_arg->error;
+
+ if (err != 0)
+ goto out;
+
+ if (dsc.dsc_pending_op != PENDING_NONE)
+ if (dump_record(&dsc, NULL, 0) != 0)
+ err = SET_ERROR(EINTR);
+
+ if (err != 0) {
+ if (err == EINTR && dsc.dsc_err != 0)
+ err = dsc.dsc_err;
+ goto out;
+ }
+
+ /*
+ * Send the DRR_END record if this is not a saved stream.
+ * Otherwise, the omitted DRR_END record will signal to
+ * the receive side that the stream is incomplete.
+ */
+ if (!dspp->savedok) {
+ bzero(drr, sizeof (dmu_replay_record_t));
+ drr->drr_type = DRR_END;
+ drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc;
+ drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid;
+
+ if (dump_record(&dsc, NULL, 0) != 0)
+ err = dsc.dsc_err;
+ }
+out:
+ mutex_enter(&to_ds->ds_sendstream_lock);
+ list_remove(&to_ds->ds_sendstreams, dssp);
+ mutex_exit(&to_ds->ds_sendstream_lock);
+
+ VERIFY(err != 0 || (dsc.dsc_sent_begin &&
+ (dsc.dsc_sent_end || dspp->savedok)));
+
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+ kmem_free(dssp, sizeof (dmu_sendstatus_t));
+ kmem_free(from_arg, sizeof (*from_arg));
+ kmem_free(to_arg, sizeof (*to_arg));
+ kmem_free(rlt_arg, sizeof (*rlt_arg));
+ kmem_free(smt_arg, sizeof (*smt_arg));
+ kmem_free(srt_arg, sizeof (*srt_arg));
+
+ dsl_dataset_long_rele(to_ds, FTAG);
+ if (from_rl != NULL) {
+ dsl_redaction_list_long_rele(from_rl, FTAG);
+ dsl_redaction_list_rele(from_rl, FTAG);
+ }
+ if (redact_rl != NULL) {
+ dsl_redaction_list_long_rele(redact_rl, FTAG);
+ dsl_redaction_list_rele(redact_rl, FTAG);
+ }
+
+ return (err);
+}
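+
+/*
+ * Editor's note: the sketch below is not part of OpenZFS.  It is a minimal,
+ * self-contained userspace analogy (POSIX threads, made-up names) of the
+ * pipeline described above dmu_send_impl(): worker threads hand records to
+ * each other through queues whose capacity is measured in bytes rather than
+ * entries, and an end-of-stream marker tears the pipeline down, much as
+ * bqueue_enqueue()/bqueue_dequeue() and the eos_marker range do here.
+ */
+#if 0	/* illustrative sketch only; never compiled */
+#include <pthread.h>
+#include <stdlib.h>
+
+struct item {
+	struct item *next;
+	size_t size;			/* bytes this entry "costs" */
+	int eos;			/* end-of-stream marker */
+};
+
+struct byte_queue {
+	pthread_mutex_t lock;
+	pthread_cond_t cv;
+	struct item *head, *tail;
+	size_t bytes, max_bytes;	/* cap is on bytes, not entries */
+};
+
+static void
+bq_enqueue(struct byte_queue *q, struct item *it)
+{
+	pthread_mutex_lock(&q->lock);
+	while (q->bytes >= q->max_bytes)	/* throttle the producer */
+		pthread_cond_wait(&q->cv, &q->lock);
+	it->next = NULL;
+	if (q->tail != NULL)
+		q->tail->next = it;
+	else
+		q->head = it;
+	q->tail = it;
+	q->bytes += it->size;
+	pthread_cond_broadcast(&q->cv);
+	pthread_mutex_unlock(&q->lock);
+}
+
+static struct item *
+bq_dequeue(struct byte_queue *q)
+{
+	pthread_mutex_lock(&q->lock);
+	while (q->head == NULL)			/* wait for the producer */
+		pthread_cond_wait(&q->cv, &q->lock);
+	struct item *it = q->head;
+	q->head = it->next;
+	if (q->head == NULL)
+		q->tail = NULL;
+	q->bytes -= it->size;
+	pthread_cond_broadcast(&q->cv);
+	pthread_mutex_unlock(&q->lock);
+	return (it);
+}
+
+/* Consumer loop, analogous to the do_dump() loop in dmu_send_impl(). */
+static void *
+consumer(void *arg)
+{
+	struct byte_queue *q = arg;
+	for (;;) {
+		struct item *it = bq_dequeue(q);
+		int done = it->eos;
+		/* ... process the record here ... */
+		free(it);
+		if (done)
+			break;
+	}
+	return (NULL);
+}
+#endif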
+
+int
+dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+ boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
+ boolean_t rawok, boolean_t savedok, int outfd, offset_t *off,
+ dmu_send_outparams_t *dsop)
+{
+ int err;
+ dsl_dataset_t *fromds;
+ ds_hold_flags_t dsflags;
+ struct dmu_send_params dspp = {0};
+ dspp.embedok = embedok;
+ dspp.large_block_ok = large_block_ok;
+ dspp.compressok = compressok;
+ dspp.outfd = outfd;
+ dspp.off = off;
+ dspp.dso = dsop;
+ dspp.tag = FTAG;
+ dspp.rawok = rawok;
+ dspp.savedok = savedok;
+
+ dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
+ err = dsl_pool_hold(pool, FTAG, &dspp.dp);
+ if (err != 0)
+ return (err);
+
+ err = dsl_dataset_hold_obj_flags(dspp.dp, tosnap, dsflags, FTAG,
+ &dspp.to_ds);
+ if (err != 0) {
+ dsl_pool_rele(dspp.dp, FTAG);
+ return (err);
+ }
+
+ if (fromsnap != 0) {
+ err = dsl_dataset_hold_obj_flags(dspp.dp, fromsnap, dsflags,
+ FTAG, &fromds);
+ if (err != 0) {
+ dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
+ dsl_pool_rele(dspp.dp, FTAG);
+ return (err);
+ }
+ dspp.ancestor_zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+ dspp.ancestor_zb.zbm_creation_txg =
+ dsl_dataset_phys(fromds)->ds_creation_txg;
+ dspp.ancestor_zb.zbm_creation_time =
+ dsl_dataset_phys(fromds)->ds_creation_time;
+
+ if (dsl_dataset_is_zapified(fromds)) {
+ (void) zap_lookup(dspp.dp->dp_meta_objset,
+ fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
+ &dspp.ancestor_zb.zbm_ivset_guid);
+ }
+
+ /* See dmu_send for the reasons behind this. */
+ uint64_t *fromredact;
+
+ if (!dsl_dataset_get_uint64_array_feature(fromds,
+ SPA_FEATURE_REDACTED_DATASETS,
+ &dspp.numfromredactsnaps,
+ &fromredact)) {
+ dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
+ } else if (dspp.numfromredactsnaps > 0) {
+ uint64_t size = dspp.numfromredactsnaps *
+ sizeof (uint64_t);
+ dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP);
+ bcopy(fromredact, dspp.fromredactsnaps, size);
+ }
+
+ boolean_t is_before =
+ dsl_dataset_is_before(dspp.to_ds, fromds, 0);
+ dspp.is_clone = (dspp.to_ds->ds_dir !=
+ fromds->ds_dir);
+ dsl_dataset_rele(fromds, FTAG);
+ if (!is_before) {
+ dsl_pool_rele(dspp.dp, FTAG);
+ err = SET_ERROR(EXDEV);
+ } else {
+ err = dmu_send_impl(&dspp);
+ }
+ } else {
+ dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
+ err = dmu_send_impl(&dspp);
+ }
+ dsl_dataset_rele(dspp.to_ds, FTAG);
+ return (err);
+}
+
+int
+dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+ boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
+ boolean_t savedok, uint64_t resumeobj, uint64_t resumeoff,
+ const char *redactbook, int outfd, offset_t *off,
+ dmu_send_outparams_t *dsop)
+{
+ int err = 0;
+ ds_hold_flags_t dsflags;
+ boolean_t owned = B_FALSE;
+ dsl_dataset_t *fromds = NULL;
+ zfs_bookmark_phys_t book = {0};
+ struct dmu_send_params dspp = {0};
+
+ dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
+ dspp.tosnap = tosnap;
+ dspp.embedok = embedok;
+ dspp.large_block_ok = large_block_ok;
+ dspp.compressok = compressok;
+ dspp.outfd = outfd;
+ dspp.off = off;
+ dspp.dso = dsop;
+ dspp.tag = FTAG;
+ dspp.resumeobj = resumeobj;
+ dspp.resumeoff = resumeoff;
+ dspp.rawok = rawok;
+ dspp.savedok = savedok;
+
+ if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
+ return (SET_ERROR(EINVAL));
+
+ err = dsl_pool_hold(tosnap, FTAG, &dspp.dp);
+ if (err != 0)
+ return (err);
+
+ if (strchr(tosnap, '@') == NULL && spa_writeable(dspp.dp->dp_spa)) {
+ /*
+ * We are sending a filesystem or volume. Ensure
+ * that it doesn't change by owning the dataset.
+ */
+
+ if (savedok) {
+ /*
+ * We are looking for the dataset that represents the
+ * partially received send stream. If this stream was
+ * received as a new snapshot of an existing dataset,
+ * this will be saved in a hidden clone named
+ * "<pool>/<dataset>/%recv". Otherwise, the stream
+ * will be saved in the live dataset itself. In
+ * either case we need to use dsl_dataset_own_force()
+ * because the stream is marked as inconsistent,
+ * which would normally make it unavailable to be
+ * owned.
+ */
+ char *name = kmem_asprintf("%s/%s", tosnap,
+ recv_clone_name);
+ err = dsl_dataset_own_force(dspp.dp, name, dsflags,
+ FTAG, &dspp.to_ds);
+ if (err == ENOENT) {
+ err = dsl_dataset_own_force(dspp.dp, tosnap,
+ dsflags, FTAG, &dspp.to_ds);
+ }
+
+ if (err == 0) {
+ err = zap_lookup(dspp.dp->dp_meta_objset,
+ dspp.to_ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, 8, 1,
+ &dspp.saved_guid);
+ }
+
+ if (err == 0) {
+ err = zap_lookup(dspp.dp->dp_meta_objset,
+ dspp.to_ds->ds_object,
+ DS_FIELD_RESUME_TONAME, 1,
+ sizeof (dspp.saved_toname),
+ dspp.saved_toname);
+ }
+ if (err != 0)
+ dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
+
+ kmem_strfree(name);
+ } else {
+ err = dsl_dataset_own(dspp.dp, tosnap, dsflags,
+ FTAG, &dspp.to_ds);
+ }
+ owned = B_TRUE;
+ } else {
+ err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG,
+ &dspp.to_ds);
+ }
+
+ if (err != 0) {
+ dsl_pool_rele(dspp.dp, FTAG);
+ return (err);
+ }
+
+ if (redactbook != NULL) {
+ char path[ZFS_MAX_DATASET_NAME_LEN];
+ (void) strlcpy(path, tosnap, sizeof (path));
+ char *at = strchr(path, '@');
+ if (at == NULL) {
+ err = EINVAL;
+ } else {
+ (void) snprintf(at, sizeof (path) - (at - path), "#%s",
+ redactbook);
+ err = dsl_bookmark_lookup(dspp.dp, path,
+ NULL, &book);
+ dspp.redactbook = &book;
+ }
+ }
+
+ if (err != 0) {
+ dsl_pool_rele(dspp.dp, FTAG);
+ if (owned)
+ dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
+ else
+ dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
+ return (err);
+ }
+
+ if (fromsnap != NULL) {
+ zfs_bookmark_phys_t *zb = &dspp.ancestor_zb;
+ int fsnamelen;
+ if (strpbrk(tosnap, "@#") != NULL)
+ fsnamelen = strpbrk(tosnap, "@#") - tosnap;
+ else
+ fsnamelen = strlen(tosnap);
+
+ /*
+ * If the fromsnap is in a different filesystem, then
+ * mark the send stream as a clone.
+ */
+ if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
+ (fromsnap[fsnamelen] != '@' &&
+ fromsnap[fsnamelen] != '#')) {
+ dspp.is_clone = B_TRUE;
+ }
+
+ if (strchr(fromsnap, '@') != NULL) {
+ err = dsl_dataset_hold(dspp.dp, fromsnap, FTAG,
+ &fromds);
+
+ if (err != 0) {
+ ASSERT3P(fromds, ==, NULL);
+ } else {
+ /*
+ * We need to make a deep copy of the redact
+ * snapshots of the from snapshot, because the
+ * array will be freed when we evict from_ds.
+ */
+ uint64_t *fromredact;
+ if (!dsl_dataset_get_uint64_array_feature(
+ fromds, SPA_FEATURE_REDACTED_DATASETS,
+ &dspp.numfromredactsnaps,
+ &fromredact)) {
+ dspp.numfromredactsnaps =
+ NUM_SNAPS_NOT_REDACTED;
+ } else if (dspp.numfromredactsnaps > 0) {
+ uint64_t size =
+ dspp.numfromredactsnaps *
+ sizeof (uint64_t);
+ dspp.fromredactsnaps = kmem_zalloc(size,
+ KM_SLEEP);
+ bcopy(fromredact, dspp.fromredactsnaps,
+ size);
+ }
+ if (!dsl_dataset_is_before(dspp.to_ds, fromds,
+ 0)) {
+ err = SET_ERROR(EXDEV);
+ } else {
+ zb->zbm_creation_txg =
+ dsl_dataset_phys(fromds)->
+ ds_creation_txg;
+ zb->zbm_creation_time =
+ dsl_dataset_phys(fromds)->
+ ds_creation_time;
+ zb->zbm_guid =
+ dsl_dataset_phys(fromds)->ds_guid;
+ zb->zbm_redaction_obj = 0;
+
+ if (dsl_dataset_is_zapified(fromds)) {
+ (void) zap_lookup(
+ dspp.dp->dp_meta_objset,
+ fromds->ds_object,
+ DS_FIELD_IVSET_GUID, 8, 1,
+ &zb->zbm_ivset_guid);
+ }
+ }
+ dsl_dataset_rele(fromds, FTAG);
+ }
+ } else {
+ dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
+ err = dsl_bookmark_lookup(dspp.dp, fromsnap, dspp.to_ds,
+ zb);
+ if (err == EXDEV && zb->zbm_redaction_obj != 0 &&
+ zb->zbm_guid ==
+ dsl_dataset_phys(dspp.to_ds)->ds_guid)
+ err = 0;
+ }
+
+ if (err == 0) {
+ /* dmu_send_impl will call dsl_pool_rele for us. */
+ err = dmu_send_impl(&dspp);
+ } else {
+ dsl_pool_rele(dspp.dp, FTAG);
+ }
+ } else {
+ dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
+ err = dmu_send_impl(&dspp);
+ }
+ if (owned)
+ dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
+ else
+ dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
+ return (err);
+}
+
+static int
+dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
+ uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
+{
+ int err = 0;
+ uint64_t size;
+ /*
+ * Assume that space (both on-disk and in-stream) is dominated by
+ * data. We will adjust for indirect blocks and the copies property,
+ * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
+ */
+
+ uint64_t recordsize;
+ uint64_t record_count;
+ objset_t *os;
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+
+ /* Assume all (uncompressed) blocks are recordsize. */
+ if (zfs_override_estimate_recordsize != 0) {
+ recordsize = zfs_override_estimate_recordsize;
+ } else if (os->os_phys->os_type == DMU_OST_ZVOL) {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
+ } else {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
+ }
+ if (err != 0)
+ return (err);
+ record_count = uncompressed / recordsize;
+
+ /*
+ * If we're estimating a send size for a compressed stream, use the
+ * compressed data size to estimate the stream size. Otherwise, use the
+ * uncompressed data size.
+ */
+ size = stream_compressed ? compressed : uncompressed;
+
+ /*
+ * Subtract out approximate space used by indirect blocks.
+ * Assume most space is used by data blocks (non-indirect, non-dnode).
+ * Assume no ditto blocks or internal fragmentation.
+ *
+ * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
+ * block.
+ */
+ size -= record_count * sizeof (blkptr_t);
+
+ /* Add in the space for the record associated with each block. */
+ size += record_count * sizeof (dmu_replay_record_t);
+
+ *sizep = size;
+
+ return (0);
+}
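+
+/*
+ * Editor's note (illustrative, not part of OpenZFS): a rough worked example
+ * of the adjustment above.  Suppose recordsize is 128 KiB and the dataset
+ * holds 8 GiB of uncompressed data, sent uncompressed.  Then
+ *
+ *	record_count = 8 GiB / 128 KiB = 65536
+ *	size = 8 GiB
+ *	    - 65536 * sizeof (blkptr_t)			(indirect overhead)
+ *	    + 65536 * sizeof (dmu_replay_record_t)	(per-record header)
+ *
+ * With 128-byte block pointers the subtraction removes 8 MiB, and the
+ * per-record headers add a few hundred bytes per block, so the fast
+ * estimate stays within a fraction of a percent of the logical data size.
+ */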
+
+int
+dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds,
+ zfs_bookmark_phys_t *frombook, boolean_t stream_compressed,
+ boolean_t saved, uint64_t *sizep)
+{
+ int err;
+ dsl_dataset_t *ds = origds;
+ uint64_t uncomp, comp;
+
+ ASSERT(dsl_pool_config_held(origds->ds_dir->dd_pool));
+ ASSERT(fromds == NULL || frombook == NULL);
+
+ /*
+ * If this is a saved send we may actually be sending
+ * from the %recv clone used for resuming.
+ */
+ if (saved) {
+ objset_t *mos = origds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t guid;
+ char dsname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ dsl_dataset_name(origds, dsname);
+ (void) strcat(dsname, "/");
+ (void) strcat(dsname, recv_clone_name);
+
+ err = dsl_dataset_hold(origds->ds_dir->dd_pool,
+ dsname, FTAG, &ds);
+ if (err != ENOENT && err != 0) {
+ return (err);
+ } else if (err == ENOENT) {
+ ds = origds;
+ }
+
+ /* check that this dataset has partially received data */
+ err = zap_lookup(mos, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, 8, 1, &guid);
+ if (err != 0) {
+ err = SET_ERROR(err == ENOENT ? EINVAL : err);
+ goto out;
+ }
+
+ err = zap_lookup(mos, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, 1, sizeof (dsname), dsname);
+ if (err != 0) {
+ err = SET_ERROR(err == ENOENT ? EINVAL : err);
+ goto out;
+ }
+ }
+
+ /* tosnap must be a snapshot or the target of a saved send */
+ if (!ds->ds_is_snapshot && ds == origds)
+ return (SET_ERROR(EINVAL));
+
+ if (fromds != NULL) {
+ uint64_t used;
+ if (!fromds->ds_is_snapshot) {
+ err = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ if (!dsl_dataset_is_before(ds, fromds, 0)) {
+ err = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ err = dsl_dataset_space_written(fromds, ds, &used, &comp,
+ &uncomp);
+ if (err != 0)
+ goto out;
+ } else if (frombook != NULL) {
+ uint64_t used;
+ err = dsl_dataset_space_written_bookmark(frombook, ds, &used,
+ &comp, &uncomp);
+ if (err != 0)
+ goto out;
+ } else {
+ uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+ comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
+ }
+
+ err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
+ stream_compressed, sizep);
+ /*
+ * Add the size of the BEGIN and END records to the estimate.
+ */
+ *sizep += 2 * sizeof (dmu_replay_record_t);
+
+out:
+ if (ds != origds)
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW,
+ "Allow sending corrupt data");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, INT, ZMOD_RW,
+ "Maximum send queue length");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, unmodified_spill_blocks, INT, ZMOD_RW,
+ "Send unmodified spill blocks");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, INT, ZMOD_RW,
+ "Maximum send queue length for non-prefetch queues");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, INT, ZMOD_RW,
+ "Send queue fill fraction");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, INT, ZMOD_RW,
+ "Send queue fill fraction for non-prefetch queues");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, INT, ZMOD_RW,
+ "Override block size estimate with fixed size");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_traverse.c b/sys/contrib/openzfs/module/zfs/dmu_traverse.c
new file mode 100644
index 000000000000..31db49dae68c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_traverse.c
@@ -0,0 +1,788 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/callb.h>
+#include <sys/zfeature.h>
+
+int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */
+int32_t send_holes_without_birth_time = 1;
+
+typedef struct prefetch_data {
+ kmutex_t pd_mtx;
+ kcondvar_t pd_cv;
+ int32_t pd_bytes_fetched;
+ int pd_flags;
+ boolean_t pd_cancel;
+ boolean_t pd_exited;
+ zbookmark_phys_t pd_resume;
+} prefetch_data_t;
+
+typedef struct traverse_data {
+ spa_t *td_spa;
+ uint64_t td_objset;
+ blkptr_t *td_rootbp;
+ uint64_t td_min_txg;
+ zbookmark_phys_t *td_resume;
+ int td_flags;
+ prefetch_data_t *td_pfd;
+ boolean_t td_paused;
+ uint64_t td_hole_birth_enabled_txg;
+ blkptr_cb_t *td_func;
+ void *td_arg;
+ boolean_t td_realloc_possible;
+} traverse_data_t;
+
+static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
+ const dnode_phys_t *dnp, uint64_t objset, uint64_t object);
+static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
+ uint64_t objset, uint64_t object);
+
+static int
+traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
+ uint64_t claim_txg)
+{
+ traverse_data_t *td = arg;
+ zbookmark_phys_t zb;
+
+ if (BP_IS_HOLE(bp))
+ return (0);
+
+ if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
+ return (-1);
+
+ SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
+
+ return (0);
+}
+
+static int
+traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
+ uint64_t claim_txg)
+{
+ traverse_data_t *td = arg;
+
+ if (lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_phys_t zb;
+
+ if (BP_IS_HOLE(bp))
+ return (0);
+
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return (0);
+
+ SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+ (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
+ td->td_arg);
+ }
+ return (0);
+}
+
+static void
+traverse_zil(traverse_data_t *td, zil_header_t *zh)
+{
+ uint64_t claim_txg = zh->zh_claim_txg;
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed; plus blocks that are already stable in read-only mode.
+ */
+ if (claim_txg == 0 && spa_writeable(td->td_spa))
+ return;
+
+ zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
+ (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
+ claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
+ zil_free(zilog);
+}
+
+typedef enum resume_skip {
+ RESUME_SKIP_ALL,
+ RESUME_SKIP_NONE,
+ RESUME_SKIP_CHILDREN
+} resume_skip_t;
+
+/*
+ * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
+ * the block indicated by zb does not need to be visited at all. Returns
+ * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
+ * resume point. This indicates that this block should be visited but not its
+ * children (since they must have been visited in a previous traversal).
+ * Otherwise returns RESUME_SKIP_NONE.
+ */
+static resume_skip_t
+resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
+ const zbookmark_phys_t *zb)
+{
+ if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
+ /*
+ * If we already visited this bp & everything below,
+ * don't bother doing it again.
+ */
+ if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
+ return (RESUME_SKIP_ALL);
+
+ /*
+ * If we found the block we're trying to resume from, zero
+ * the bookmark out to indicate that we have resumed.
+ */
+ if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
+ bzero(td->td_resume, sizeof (*zb));
+ if (td->td_flags & TRAVERSE_POST)
+ return (RESUME_SKIP_CHILDREN);
+ }
+ }
+ return (RESUME_SKIP_NONE);
+}
+
+static void
+traverse_prefetch_metadata(traverse_data_t *td,
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+ arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+
+ if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
+ return;
+ /*
+ * If we are in the process of resuming, don't prefetch, because
+ * some children will not be needed (and in fact may have already
+ * been freed).
+ */
+ if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
+ return;
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
+ return;
+ if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
+ return;
+ ASSERT(!BP_IS_REDACTED(bp));
+
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ (void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+}
+
+static boolean_t
+prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
+{
+ ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
+ BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG || BP_IS_REDACTED(bp))
+ return (B_FALSE);
+ return (B_TRUE);
+}
+
+static int
+traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+ int err = 0;
+ arc_buf_t *buf = NULL;
+ prefetch_data_t *pd = td->td_pfd;
+
+ switch (resume_skip_check(td, dnp, zb)) {
+ case RESUME_SKIP_ALL:
+ return (0);
+ case RESUME_SKIP_CHILDREN:
+ goto post;
+ case RESUME_SKIP_NONE:
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ if (bp->blk_birth == 0) {
+ /*
+ * Since this block has a birth time of 0 it must be one of
+ * two things: a hole created before the
+ * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
+ * which has always been a hole in an object.
+ *
+ * If a file is written sparsely, then the unwritten parts of
+ * the file were "always holes" -- that is, they have been
+ * holes since this object was allocated. However, we (and
+ * our callers) can not necessarily tell when an object was
+ * allocated. Therefore, if it's possible that this object
+ * was freed and then its object number reused, we need to
+ * visit all the holes with birth==0.
+ *
+ * If it isn't possible that the object number was reused,
+ * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
+ * all the blocks we will visit as part of this traversal,
+ * then this hole must have always existed, so we can skip
+ * it. We visit blocks born after (exclusive) td_min_txg.
+ *
+ * Note that the meta-dnode cannot be reallocated.
+ */
+ if (!send_holes_without_birth_time &&
+ (!td->td_realloc_possible ||
+ zb->zb_object == DMU_META_DNODE_OBJECT) &&
+ td->td_hole_birth_enabled_txg <= td->td_min_txg)
+ return (0);
+ } else if (bp->blk_birth <= td->td_min_txg) {
+ return (0);
+ }
+
+ if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
+ uint64_t size = BP_GET_LSIZE(bp);
+ mutex_enter(&pd->pd_mtx);
+ ASSERT(pd->pd_bytes_fetched >= 0);
+ while (pd->pd_bytes_fetched < size && !pd->pd_exited)
+ cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
+ pd->pd_bytes_fetched -= size;
+ cv_broadcast(&pd->pd_cv);
+ mutex_exit(&pd->pd_mtx);
+ }
+
+ if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+ if (err != 0)
+ goto post;
+ return (0);
+ }
+
+ if (td->td_flags & TRAVERSE_PRE) {
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ goto post;
+ }
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ uint32_t flags = ARC_FLAG_WAIT;
+ int32_t i;
+ int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ zbookmark_phys_t *czb;
+
+ ASSERT(!BP_IS_PROTECTED(bp));
+
+ err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err != 0)
+ goto post;
+
+ czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
+
+ for (i = 0; i < epb; i++) {
+ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ traverse_prefetch_metadata(td,
+ &((blkptr_t *)buf->b_data)[i], czb);
+ }
+
+ /* recursively visitbp() blocks below this */
+ for (i = 0; i < epb; i++) {
+ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ err = traverse_visitbp(td, dnp,
+ &((blkptr_t *)buf->b_data)[i], czb);
+ if (err != 0)
+ break;
+ }
+
+ kmem_free(czb, sizeof (zbookmark_phys_t));
+
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ uint32_t flags = ARC_FLAG_WAIT;
+ uint32_t zio_flags = ZIO_FLAG_CANFAIL;
+ int32_t i;
+ int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+ dnode_phys_t *child_dnp;
+
+ /*
+ * dnode blocks might have their bonus buffers encrypted, so
+ * we must be careful to honor TRAVERSE_NO_DECRYPT
+ */
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err != 0)
+ goto post;
+
+ child_dnp = buf->b_data;
+
+ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
+ prefetch_dnode_metadata(td, &child_dnp[i],
+ zb->zb_objset, zb->zb_blkid * epb + i);
+ }
+
+ /* recursively visitbp() blocks below this */
+ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
+ err = traverse_dnode(td, bp, &child_dnp[i],
+ zb->zb_objset, zb->zb_blkid * epb + i);
+ if (err != 0)
+ break;
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ uint32_t zio_flags = ZIO_FLAG_CANFAIL;
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ objset_phys_t *osp;
+
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err != 0)
+ goto post;
+
+ osp = buf->b_data;
+ prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
+ DMU_META_DNODE_OBJECT);
+ /*
+ * See the block comment above for the goal of this variable.
+ * If the maxblkid of the meta-dnode is 0, then we know that
+ * we've never had more than DNODES_PER_BLOCK objects in the
+ * dataset, which means we can't have reused any object ids.
+ */
+ if (osp->os_meta_dnode.dn_maxblkid == 0)
+ td->td_realloc_possible = B_FALSE;
+
+ if (OBJSET_BUF_HAS_USERUSED(buf)) {
+ if (OBJSET_BUF_HAS_PROJECTUSED(buf))
+ prefetch_dnode_metadata(td,
+ &osp->os_projectused_dnode,
+ zb->zb_objset, DMU_PROJECTUSED_OBJECT);
+ prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
+ zb->zb_objset, DMU_GROUPUSED_OBJECT);
+ prefetch_dnode_metadata(td, &osp->os_userused_dnode,
+ zb->zb_objset, DMU_USERUSED_OBJECT);
+ }
+
+ err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset,
+ DMU_META_DNODE_OBJECT);
+ if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) {
+ if (OBJSET_BUF_HAS_PROJECTUSED(buf))
+ err = traverse_dnode(td, bp,
+ &osp->os_projectused_dnode, zb->zb_objset,
+ DMU_PROJECTUSED_OBJECT);
+ if (err == 0)
+ err = traverse_dnode(td, bp,
+ &osp->os_groupused_dnode, zb->zb_objset,
+ DMU_GROUPUSED_OBJECT);
+ if (err == 0)
+ err = traverse_dnode(td, bp,
+ &osp->os_userused_dnode, zb->zb_objset,
+ DMU_USERUSED_OBJECT);
+ }
+ }
+
+ if (buf)
+ arc_buf_destroy(buf, &buf);
+
+post:
+ if (err == 0 && (td->td_flags & TRAVERSE_POST))
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+
+ if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
+ /*
+ * Ignore this disk error as requested by the HARD flag,
+ * and continue traversal.
+ */
+ err = 0;
+ }
+
+ /*
+ * If we are stopping here, set td_resume.
+ */
+ if (td->td_resume != NULL && err != 0 && !td->td_paused) {
+ td->td_resume->zb_objset = zb->zb_objset;
+ td->td_resume->zb_object = zb->zb_object;
+ td->td_resume->zb_level = 0;
+ /*
+ * If we have stopped on an indirect block (e.g. due to
+ * i/o error), we have not visited anything below it.
+ * Set the bookmark to the first level-0 block that we need
+ * to visit. This way, the resuming code does not need to
+ * deal with resuming from indirect blocks.
+ *
+ * Note, if zb_level <= 0, dnp may be NULL, so we don't want
+ * to dereference it.
+ */
+ td->td_resume->zb_blkid = zb->zb_blkid;
+ if (zb->zb_level > 0) {
+ td->td_resume->zb_blkid <<= zb->zb_level *
+ (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
+ }
+ td->td_paused = B_TRUE;
+ }
+
+ return (err);
+}
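+
+/*
+ * Editor's note (illustrative, not part of OpenZFS): an example of the
+ * resume-point conversion above.  With 128 KiB indirect blocks,
+ * dn_indblkshift == 17 and SPA_BLKPTRSHIFT == 7, so each indirect block
+ * covers 2^(17 - 7) == 1024 level-0 block pointers.  Pausing at
+ * zb_level == 1, zb_blkid == 3 therefore records a resume point of
+ * level-0 blkid 3 << 10 == 3072, the first data block under that
+ * indirect block.
+ */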
+
+static void
+prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
+ uint64_t objset, uint64_t object)
+{
+ int j;
+ zbookmark_phys_t czb;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+ traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
+ traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
+ }
+}
+
+static int
+traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp,
+ uint64_t objset, uint64_t object)
+{
+ int j, err = 0;
+ zbookmark_phys_t czb;
+
+ if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
+ object < td->td_resume->zb_object)
+ return (0);
+
+ if (td->td_flags & TRAVERSE_PRE) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+ err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
+ if (err != 0)
+ break;
+ }
+
+ if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+ SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
+ err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
+ }
+
+ if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
+ return (err);
+}
+
+/* ARGSUSED */
+static int
+traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ prefetch_data_t *pfd = arg;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH;
+
+ ASSERT(pfd->pd_bytes_fetched >= 0);
+ if (zb->zb_level == ZB_DNODE_LEVEL)
+ return (0);
+ if (pfd->pd_cancel)
+ return (SET_ERROR(EINTR));
+
+ if (!prefetch_needed(pfd, bp))
+ return (0);
+
+ mutex_enter(&pfd->pd_mtx);
+ while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
+ cv_wait_sig(&pfd->pd_cv, &pfd->pd_mtx);
+ pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
+ cv_broadcast(&pfd->pd_cv);
+ mutex_exit(&pfd->pd_mtx);
+
+ if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ zio_flags, &aflags, zb);
+
+ return (0);
+}
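+
+/*
+ * Editor's note (illustrative, not part of OpenZFS): the throttle above
+ * keeps the prefetcher a bounded number of bytes ahead of the traversal.
+ * For example, with the default zfs_pd_bytes_max of 50 MiB and 128 KiB
+ * data blocks, the prefetch thread can run at most ~400 blocks ahead
+ * before it sleeps on pd_cv, waiting for traverse_visitbp() to consume
+ * part of its byte budget.
+ */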
+
+static void
+traverse_prefetch_thread(void *arg)
+{
+ traverse_data_t *td_main = arg;
+ traverse_data_t td = *td_main;
+ zbookmark_phys_t czb;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ td.td_func = traverse_prefetcher;
+ td.td_arg = td_main->td_pfd;
+ td.td_pfd = NULL;
+ td.td_resume = &td_main->td_pfd->pd_resume;
+
+ SET_BOOKMARK(&czb, td.td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
+
+ mutex_enter(&td_main->td_pfd->pd_mtx);
+ td_main->td_pfd->pd_exited = B_TRUE;
+ cv_broadcast(&td_main->td_pfd->pd_cv);
+ mutex_exit(&td_main->td_pfd->pd_mtx);
+ spl_fstrans_unmark(cookie);
+}
+
+/*
+ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
+ * in syncing context).
+ */
+static int
+traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
+ uint64_t txg_start, zbookmark_phys_t *resume, int flags,
+ blkptr_cb_t func, void *arg)
+{
+ traverse_data_t *td;
+ prefetch_data_t *pd;
+ zbookmark_phys_t *czb;
+ int err;
+
+ ASSERT(ds == NULL || objset == ds->ds_object);
+ ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
+
+ td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
+ pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
+ czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
+
+ td->td_spa = spa;
+ td->td_objset = objset;
+ td->td_rootbp = rootbp;
+ td->td_min_txg = txg_start;
+ td->td_resume = resume;
+ td->td_func = func;
+ td->td_arg = arg;
+ td->td_pfd = pd;
+ td->td_flags = flags;
+ td->td_paused = B_FALSE;
+ td->td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+ VERIFY(spa_feature_enabled_txg(spa,
+ SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg));
+ } else {
+ td->td_hole_birth_enabled_txg = UINT64_MAX;
+ }
+
+ pd->pd_flags = flags;
+ if (resume != NULL)
+ pd->pd_resume = *resume;
+ mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
+
+ SET_BOOKMARK(czb, td->td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ /* See comment on ZIL traversal in dsl_scan_visitds. */
+ if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ uint32_t flags = ARC_FLAG_WAIT;
+ objset_phys_t *osp;
+ arc_buf_t *buf;
+ ASSERT(!BP_IS_REDACTED(rootbp));
+
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) &&
+ BP_IS_PROTECTED(rootbp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
+ &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb);
+ if (err != 0) {
+ /*
+ * If both TRAVERSE_HARD and TRAVERSE_PRE are set,
+ * continue to visitbp so that td_func can be called
+ * in pre stage, and err will reset to zero.
+ */
+ if (!(td->td_flags & TRAVERSE_HARD) ||
+ !(td->td_flags & TRAVERSE_PRE))
+ goto out;
+ } else {
+ osp = buf->b_data;
+ traverse_zil(td, &osp->os_zil_header);
+ arc_buf_destroy(buf, &buf);
+ }
+ }
+
+ if (!(flags & TRAVERSE_PREFETCH_DATA) ||
+ taskq_dispatch(spa->spa_prefetch_taskq, traverse_prefetch_thread,
+ td, TQ_NOQUEUE) == TASKQID_INVALID)
+ pd->pd_exited = B_TRUE;
+
+ err = traverse_visitbp(td, NULL, rootbp, czb);
+
+ mutex_enter(&pd->pd_mtx);
+ pd->pd_cancel = B_TRUE;
+ cv_broadcast(&pd->pd_cv);
+ while (!pd->pd_exited)
+ cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
+ mutex_exit(&pd->pd_mtx);
+out:
+ mutex_destroy(&pd->pd_mtx);
+ cv_destroy(&pd->pd_cv);
+
+ kmem_free(czb, sizeof (zbookmark_phys_t));
+ kmem_free(pd, sizeof (struct prefetch_data));
+ kmem_free(td, sizeof (struct traverse_data));
+
+ return (err);
+}
+
+/*
+ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
+ * in syncing context).
+ */
+int
+traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
+ zbookmark_phys_t *resume,
+ int flags, blkptr_cb_t func, void *arg)
+{
+ return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
+ &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
+}
+
+int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
+ int flags, blkptr_cb_t func, void *arg)
+{
+ return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
+}
+
+int
+traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
+ uint64_t txg_start, zbookmark_phys_t *resume, int flags,
+ blkptr_cb_t func, void *arg)
+{
+ return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
+ blkptr, txg_start, resume, flags, func, arg));
+}
+
+/*
+ * NB: pool must not be changing on-disk (eg, from zdb or sync context).
+ */
+int
+traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
+ blkptr_cb_t func, void *arg)
+{
+ int err;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ objset_t *mos = dp->dp_meta_objset;
+ boolean_t hard = (flags & TRAVERSE_HARD);
+
+ /* visit the MOS */
+ err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
+ txg_start, NULL, flags, func, arg);
+ if (err != 0)
+ return (err);
+
+ /* visit each dataset */
+ for (uint64_t obj = 1; err == 0;
+ err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
+ dmu_object_info_t doi;
+
+ err = dmu_object_info(mos, obj, &doi);
+ if (err != 0) {
+ if (hard)
+ continue;
+ break;
+ }
+
+ if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
+ dsl_dataset_t *ds;
+ uint64_t txg = txg_start;
+
+ dsl_pool_config_enter(dp, FTAG);
+ err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+ dsl_pool_config_exit(dp, FTAG);
+ if (err != 0) {
+ if (hard)
+ continue;
+ break;
+ }
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
+ txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ err = traverse_dataset(ds, txg, flags, func, arg);
+ dsl_dataset_rele(ds, FTAG);
+ if (err != 0)
+ break;
+ }
+ }
+ if (err == ESRCH)
+ err = 0;
+ return (err);
+}
+
+EXPORT_SYMBOL(traverse_dataset);
+EXPORT_SYMBOL(traverse_pool);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW,
+ "Max number of bytes to prefetch");
+
+#if defined(_KERNEL)
+module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644);
+MODULE_PARM_DESC(ignore_hole_birth,
+ "Alias for send_holes_without_birth_time");
+#endif
+
+ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW,
+ "Ignore hole_birth txg for zfs send");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c
new file mode 100644
index 000000000000..73667915df0f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c
@@ -0,0 +1,1417 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/zap_impl.h>
+#include <sys/spa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/trace_zfs.h>
+
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+ uint64_t arg1, uint64_t arg2);
+
+dmu_tx_stats_t dmu_tx_stats = {
+ { "dmu_tx_assigned", KSTAT_DATA_UINT64 },
+ { "dmu_tx_delay", KSTAT_DATA_UINT64 },
+ { "dmu_tx_error", KSTAT_DATA_UINT64 },
+ { "dmu_tx_suspended", KSTAT_DATA_UINT64 },
+ { "dmu_tx_group", KSTAT_DATA_UINT64 },
+ { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 },
+ { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 },
+ { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
+ { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
+ { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
+ { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
+ { "dmu_tx_quota", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dmu_tx_ksp;
+
+dmu_tx_t *
+dmu_tx_create_dd(dsl_dir_t *dd)
+{
+ dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+ tx->tx_dir = dd;
+ if (dd != NULL)
+ tx->tx_pool = dd->dd_pool;
+ list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
+ offsetof(dmu_tx_hold_t, txh_node));
+ list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
+ tx->tx_start = gethrtime();
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create(objset_t *os)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
+ tx->tx_objset = os;
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(NULL);
+
+ TXG_VERIFY(dp->dp_spa, txg);
+ tx->tx_pool = dp;
+ tx->tx_txg = txg;
+ tx->tx_anyobj = TRUE;
+
+ return (tx);
+}
+
+int
+dmu_tx_is_syncing(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+int
+dmu_tx_private_ok(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+static dmu_tx_hold_t *
+dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
+ uint64_t arg1, uint64_t arg2)
+{
+ dmu_tx_hold_t *txh;
+
+ if (dn != NULL) {
+ (void) zfs_refcount_add(&dn->dn_holds, tx);
+ if (tx->tx_txg != 0) {
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
+ * problem, but there's no way for it to happen (for
+ * now, at least).
+ */
+ ASSERT(dn->dn_assigned_txg == 0);
+ dn->dn_assigned_txg = tx->tx_txg;
+ (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ }
+
+ txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+ txh->txh_tx = tx;
+ txh->txh_dnode = dn;
+ zfs_refcount_create(&txh->txh_space_towrite);
+ zfs_refcount_create(&txh->txh_memory_tohold);
+ txh->txh_type = type;
+ txh->txh_arg1 = arg1;
+ txh->txh_arg2 = arg2;
+ list_insert_tail(&tx->tx_holds, txh);
+
+ return (txh);
+}
+
+static dmu_tx_hold_t *
+dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
+ enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
+{
+ dnode_t *dn = NULL;
+ dmu_tx_hold_t *txh;
+ int err;
+
+ if (object != DMU_NEW_OBJECT) {
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0) {
+ tx->tx_err = err;
+ return (NULL);
+ }
+ }
+ txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
+ if (dn != NULL)
+ dnode_rele(dn, FTAG);
+ return (txh);
+}
+
+void
+dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
+{
+ /*
+ * If we're syncing, they can manipulate any object anyhow, and
+ * the hold on the dnode_t can cause problems.
+ */
+ if (!dmu_tx_is_syncing(tx))
+ (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
+}
+
+/*
+ * This function reads specified data from disk. The specified data will
+ * be needed to perform the transaction -- i.e., it will be read after
+ * we do dmu_tx_assign(). There are two reasons that we read the data now
+ * (before dmu_tx_assign()):
+ *
+ * 1. Reading it now has potentially better performance. The transaction
+ * has not yet been assigned, so the TXG is not held open, and also the
+ * caller typically has fewer locks held when calling dmu_tx_hold_*() than
+ * after the transaction has been assigned. This reduces the lock (and txg)
+ * hold times, thus reducing lock contention.
+ *
+ * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
+ * that are detected before they start making changes to the DMU state
+ * (i.e. now). Once the transaction has been assigned, and some DMU
+ * state has been changed, it can be difficult to recover from an i/o
+ * error (e.g. to undo the changes already made in memory at the DMU
+ * layer). Typically code to do so does not exist in the caller -- it
+ * assumes that the data has already been cached and thus i/o errors are
+ * not possible.
+ *
+ * It has been observed that the i/o initiated here can be a performance
+ * problem, and it appears to be optional, because we don't look at the
+ * data which is read. However, removing this read would only serve to
+ * move the work elsewhere (after the dmu_tx_assign()), where it may
+ * have a greater impact on performance (in addition to the impact on
+ * fault tolerance noted above).
+ */
+static int
+dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
+{
+ int err;
+ dmu_buf_impl_t *db;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold_level(dn, level, blkid, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db == NULL)
+ return (SET_ERROR(EIO));
+ err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+ dbuf_rele(db, FTAG);
+ return (err);
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ dnode_t *dn = txh->txh_dnode;
+ int err = 0;
+
+ if (len == 0)
+ return;
+
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
+
+ if (dn == NULL)
+ return;
+
+ /*
+ * For i/o error checking, read the blocks that will be needed
+ * to perform the write: the first and last level-0 blocks (if
+ * they are not aligned, i.e. if they are partial-block writes),
+ * and all the level-1 blocks.
+ */
+ if (dn->dn_maxblkid == 0) {
+ if (off < dn->dn_datablksz &&
+ (off > 0 || len < dn->dn_datablksz)) {
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+ } else {
+ zio_t *zio = zio_root(dn->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ /* first level-0 block */
+ uint64_t start = off >> dn->dn_datablkshift;
+ if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, start);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+
+ /* last level-0 block */
+ uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
+ if (end != start && end <= dn->dn_maxblkid &&
+ P2PHASE(off + len, dn->dn_datablksz)) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, end);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+
+ /* level-1 blocks */
+ if (dn->dn_nlevels > 1) {
+ int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (uint64_t i = (start >> shft) + 1;
+ i < end >> shft; i++) {
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+ }
+
+ err = zio_wait(zio);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+}
+
+static void
+dmu_tx_count_dnode(dmu_tx_hold_t *txh)
+{
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite,
+ DNODE_MIN_SIZE, FTAG);
+}
+
+void
+dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT3U(len, <=, DMU_MAX_ACCESS);
+ ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_WRITE, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_write(txh, off, len);
+ dmu_tx_count_dnode(txh);
+ }
+}
+
+void
+dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT3U(len, <=, DMU_MAX_ACCESS);
+ ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_write(txh, off, len);
+ dmu_tx_count_dnode(txh);
+ }
+}
+
+/*
+ * This function marks the transaction as being a "net free". The end
+ * result is that refquotas will be disabled for this transaction, and
+ * this transaction will be able to use half of the pool space overhead
+ * (see dsl_pool_adjustedsize()). Therefore this function should only
+ * be called for transactions that we expect will not cause a net increase
+ * in the amount of space used (but it's OK if that is occasionally not true).
+ */
+void
+dmu_tx_mark_netfree(dmu_tx_t *tx)
+{
+ tx->tx_netfree = B_TRUE;
+}
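+
+/*
+ * Illustrative sketch (assumed caller code, not part of this file): a
+ * transaction that only frees data, e.g. punching a hole in or truncating
+ * an object, typically looks like:
+ *
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_free(tx, object, off, len);
+ *	dmu_tx_mark_netfree(tx);
+ *	error = dmu_tx_assign(tx, TXG_WAIT);
+ *	if (error != 0) {
+ *		dmu_tx_abort(tx);
+ *		return (error);
+ *	}
+ *	... perform the frees under this tx ...
+ *	dmu_tx_commit(tx);
+ *
+ * "os", "object", "off" and "len" are placeholders for the caller's objset,
+ * object number and range, not names defined in this file.
+ */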
+
+static void
+dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ dmu_tx_t *tx = txh->txh_tx;
+ dnode_t *dn = txh->txh_dnode;
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_count_dnode(txh);
+
+ if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
+ return;
+ if (len == DMU_OBJECT_END)
+ len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
+
+ /*
+ * For i/o error checking, we read the first and last level-0
+ * blocks if they are not aligned, and all the level-1 blocks.
+ *
+ * Note: dbuf_free_range() assumes that we have not instantiated
+ * any level-0 dbufs that will be completely freed. Therefore we must
+ * exercise care to not read or count the first and last blocks
+ * if they are blocksize-aligned.
+ */
+ if (dn->dn_datablkshift == 0) {
+ if (off != 0 || len < dn->dn_datablksz)
+ dmu_tx_count_write(txh, 0, dn->dn_datablksz);
+ } else {
+ /* first block will be modified if it is not aligned */
+ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
+ dmu_tx_count_write(txh, off, 1);
+ /* last block will be modified if it is not aligned */
+ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
+ dmu_tx_count_write(txh, off + len, 1);
+ }
+
+ /*
+ * Check level-1 blocks.
+ */
+ if (dn->dn_nlevels > 1) {
+ int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ uint64_t start = off >> shift;
+ uint64_t end = (off + len) >> shift;
+
+ ASSERT(dn->dn_indblkshift != 0);
+
+ /*
+ * dnode_reallocate() can result in an object with indirect
+ * blocks having an odd data block size. In this case,
+ * just check the single block.
+ */
+ if (dn->dn_datablkshift == 0)
+ start = end = 0;
+
+ zio_t *zio = zio_root(tx->tx_pool->dp_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (uint64_t i = start; i <= end; i++) {
+ uint64_t ibyte = i << shift;
+ err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
+ i = ibyte >> shift;
+ if (err == ESRCH || i > end)
+ break;
+ if (err != 0) {
+ tx->tx_err = err;
+ (void) zio_wait(zio);
+ return;
+ }
+
+ (void) zfs_refcount_add_many(&txh->txh_memory_tohold,
+ 1 << dn->dn_indblkshift, FTAG);
+
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err != 0) {
+ tx->tx_err = err;
+ (void) zio_wait(zio);
+ return;
+ }
+ }
+ err = zio_wait(zio);
+ if (err != 0) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+}
+
+void
+dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
+{
+ dmu_tx_hold_t *txh;
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_FREE, off, len);
+ if (txh != NULL)
+ (void) dmu_tx_hold_free_impl(txh, off, len);
+}
+
+void
+dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+ dmu_tx_hold_t *txh;
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
+ if (txh != NULL)
+ (void) dmu_tx_hold_free_impl(txh, off, len);
+}
+
+static void
+dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
+{
+ dmu_tx_t *tx = txh->txh_tx;
+ dnode_t *dn = txh->txh_dnode;
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_count_dnode(txh);
+
+ /*
+	 * Modifying an almost-full microzap is around the worst case (128KB).
+ *
+ * If it is a fat zap, the worst case would be 7*16KB=112KB:
+ * - 3 blocks overwritten: target leaf, ptrtbl block, header block
+ * - 4 new blocks written if adding:
+ * - 2 blocks for possibly split leaves,
+ * - 2 grown ptrtbl blocks
+ */
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite,
+ MZAP_MAX_BLKSZ, FTAG);
+
+ if (dn == NULL)
+ return;
+
+ ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
+
+ if (dn->dn_maxblkid == 0 || name == NULL) {
+ /*
+ * This is a microzap (only one block), or we don't know
+ * the name. Check the first block for i/o errors.
+ */
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err != 0) {
+ tx->tx_err = err;
+ }
+ } else {
+ /*
+ * Access the name so that we'll check for i/o errors to
+ * the leaf blocks, etc. We ignore ENOENT, as this name
+ * may not yet exist.
+ */
+ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
+ if (err == EIO || err == ECKSUM || err == ENXIO) {
+ tx->tx_err = err;
+ }
+ }
+}
+
+void
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_ZAP, add, (uintptr_t)name);
+ if (txh != NULL)
+ dmu_tx_hold_zap_impl(txh, name);
+}
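+
+/*
+ * Illustrative sketch (assumed ZPL-style caller, not part of this file):
+ * adding an entry to a directory modifies the directory's ZAP object, so
+ * the caller holds that ZAP by object number before assigning the tx:
+ *
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_zap(tx, dir_object, B_TRUE, name);
+ *	dmu_tx_hold_bonus(tx, file_object);
+ *	error = dmu_tx_assign(tx, TXG_WAIT);
+ *
+ * "dir_object" and "file_object" are placeholders for the caller's own
+ * object numbers.  Passing the entry name (rather than NULL) lets
+ * dmu_tx_hold_zap_impl() above preread the relevant leaf blocks for i/o
+ * error checking.
+ */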
+
+void
+dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT(dn != NULL);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
+ if (txh != NULL)
+ dmu_tx_hold_zap_impl(txh, name);
+}
+
+void
+dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_BONUS, 0, 0);
+ if (txh)
+ dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
+ if (txh)
+ dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ DMU_NEW_OBJECT, THT_SPACE, space, 0);
+ if (txh) {
+ (void) zfs_refcount_add_many(
+ &txh->txh_space_towrite, space, FTAG);
+ }
+}
+
+#ifdef ZFS_DEBUG
+void
+dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+{
+ boolean_t match_object = B_FALSE;
+ boolean_t match_offset = B_FALSE;
+
+ DB_DNODE_ENTER(db);
+ dnode_t *dn = DB_DNODE(db);
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
+ ASSERT3U(dn->dn_object, ==, db->db.db_object);
+
+ if (tx->tx_anyobj) {
+ DB_DNODE_EXIT(db);
+ return;
+ }
+
+ /* XXX No checking on the meta dnode for now */
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ DB_DNODE_EXIT(db);
+ return;
+ }
+
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+ if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
+ match_object = TRUE;
+ if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
+ int datablkshift = dn->dn_datablkshift ?
+ dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int shift = datablkshift + epbs * db->db_level;
+ uint64_t beginblk = shift >= 64 ? 0 :
+ (txh->txh_arg1 >> shift);
+ uint64_t endblk = shift >= 64 ? 0 :
+ ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
+ uint64_t blkid = db->db_blkid;
+
+ /* XXX txh_arg2 better not be zero... */
+
+ dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
+ txh->txh_type, beginblk, endblk);
+
+ switch (txh->txh_type) {
+ case THT_WRITE:
+ if (blkid >= beginblk && blkid <= endblk)
+ match_offset = TRUE;
+ /*
+ * We will let this hold work for the bonus
+ * or spill buffer so that we don't need to
+ * hold it when creating a new object.
+ */
+ if (blkid == DMU_BONUS_BLKID ||
+ blkid == DMU_SPILL_BLKID)
+ match_offset = TRUE;
+ /*
+ * They might have to increase nlevels,
+			 * thus dirtying the new TLIBs.  Or they
+			 * might have to change the block size,
+			 * thus dirtying the new lvl=0 blk=0.
+ */
+ if (blkid == 0)
+ match_offset = TRUE;
+ break;
+ case THT_FREE:
+ /*
+ * We will dirty all the level 1 blocks in
+ * the free range and perhaps the first and
+ * last level 0 block.
+ */
+ if (blkid >= beginblk && (blkid <= endblk ||
+ txh->txh_arg2 == DMU_OBJECT_END))
+ match_offset = TRUE;
+ break;
+ case THT_SPILL:
+ if (blkid == DMU_SPILL_BLKID)
+ match_offset = TRUE;
+ break;
+ case THT_BONUS:
+ if (blkid == DMU_BONUS_BLKID)
+ match_offset = TRUE;
+ break;
+ case THT_ZAP:
+ match_offset = TRUE;
+ break;
+ case THT_NEWOBJECT:
+ match_object = TRUE;
+ break;
+ default:
+ cmn_err(CE_PANIC, "bad txh_type %d",
+ txh->txh_type);
+ }
+ }
+ if (match_object && match_offset) {
+ DB_DNODE_EXIT(db);
+ return;
+ }
+ }
+ DB_DNODE_EXIT(db);
+ panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
+ (u_longlong_t)db->db.db_object, db->db_level,
+ (u_longlong_t)db->db_blkid);
+}
+#endif
+
+/*
+ * If we can't do 10 iops, something is wrong. Let us go ahead
+ * and hit zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting. This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time. This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ * min_time = scale * (dirty - min) / (max - dirty)
+ * min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ * 10ms +-------------------------------------------------------------*+
+ * | *|
+ * 9ms + *+
+ * | *|
+ * 8ms + *+
+ * | * |
+ * 7ms + * +
+ * | * |
+ * 6ms + * +
+ * | * |
+ * 5ms + * +
+ * | * |
+ * 4ms + * +
+ * | * |
+ * 3ms + * +
+ * | * |
+ * 2ms + (midpoint) * +
+ * | | ** |
+ * 1ms + v *** +
+ * | zfs_delay_scale ----------> ******** |
+ * 0 +-------------------------------------*********----------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ * + +
+ * | |
+ * + *+
+ * 10ms + *+
+ * + ** +
+ * | (midpoint) ** |
+ * + | ** +
+ * 1ms + v **** +
+ * + zfs_delay_scale ----------> ***** +
+ * | **** |
+ * + **** +
+ * 100us + ** +
+ * + * +
+ * | * |
+ * + * +
+ * 10us + * +
+ * + +
+ * | |
+ * + +
+ * +--------------------------------------------------------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly. The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+static void
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+{
+ dsl_pool_t *dp = tx->tx_pool;
+ uint64_t delay_min_bytes =
+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+ hrtime_t wakeup, min_tx_time, now;
+
+ if (dirty <= delay_min_bytes)
+ return;
+
+ /*
+ * The caller has already waited until we are under the max.
+ * We make them pass us the amount of dirty data so we don't
+ * have to handle the case of it being >= the max, which could
+ * cause a divide-by-zero if it's == the max.
+ */
+ ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+ now = gethrtime();
+ min_tx_time = zfs_delay_scale *
+ (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+ min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+ if (now > tx->tx_start + min_tx_time)
+ return;
+
+ DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+ uint64_t, min_tx_time);
+
+ mutex_enter(&dp->dp_lock);
+ wakeup = MAX(tx->tx_start + min_tx_time,
+ dp->dp_last_wakeup + min_tx_time);
+ dp->dp_last_wakeup = wakeup;
+ mutex_exit(&dp->dp_lock);
+
+ zfs_sleep_until(wakeup);
+}
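+
+/*
+ * Worked example (illustrative only, not part of the original comment):
+ * assume zfs_dirty_data_max = 4 GB and the usual defaults
+ * zfs_delay_min_dirty_percent = 60 and zfs_delay_scale = 500,000 ns, so
+ * delay_min_bytes = 2.4 GB.
+ *
+ *	dirty = 3.2 GB (midway between 60% and 100%):
+ *	    min_tx_time = 500,000 * (3.2 - 2.4) / (4.0 - 3.2) = 500 us,
+ *	    i.e. roughly 2000 IOPS, the "midpoint" mentioned above.
+ *
+ *	dirty = 3.8 GB (95% of the limit):
+ *	    min_tx_time = 500,000 * (3.8 - 2.4) / (4.0 - 3.8) = 3.5 ms.
+ *
+ * Both values are still well under the zfs_delay_max_ns cap (100 ms),
+ * which is only reached as dirty data closely approaches the limit.
+ */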
+
+/*
+ * This routine attempts to assign the transaction to a transaction group.
+ * To do so, we must determine if there is sufficient free space on disk.
+ *
+ * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
+ * on it), then it is assumed that there is sufficient free space,
+ * unless there's insufficient slop space in the pool (see the comment
+ * above spa_slop_shift in spa_misc.c).
+ *
+ * If it is not a "netfree" transaction, then if the data already on disk
+ * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
+ * ENOSPC. Otherwise, if the current rough estimate of pending changes,
+ * plus the rough estimate of this transaction's changes, may exceed the
+ * allowed usage, then this will fail with ERESTART, which will cause the
+ * caller to wait for the pending changes to be written to disk (by waiting
+ * for the next TXG to open), and then check the space usage again.
+ *
+ * The rough estimate of pending changes is comprised of the sum of:
+ *
+ * - this transaction's holds' txh_space_towrite
+ *
+ * - dd_tempreserved[], which is the sum of in-flight transactions'
+ * holds' txh_space_towrite (i.e. those transactions that have called
+ * dmu_tx_assign() but not yet called dmu_tx_commit()).
+ *
+ * - dd_space_towrite[], which is the amount of dirtied dbufs.
+ *
+ * Note that all of these values are inflated by spa_get_worst_case_asize(),
+ * which means that we may get ERESTART well before we are actually in danger
+ * of running out of space, but this also mitigates any small inaccuracies
+ * in the rough estimate (e.g. txh_space_towrite doesn't take into account
+ * indirect blocks, and dd_space_towrite[] doesn't take into account changes
+ * to the MOS).
+ *
+ * Note that due to this algorithm, it is possible to exceed the allowed
+ * usage by one transaction. Also, as we approach the allowed usage,
+ * we will allow a very limited amount of changes into each TXG, thus
+ * decreasing performance.
+ */
+static int
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ spa_t *spa = tx->tx_pool->dp_spa;
+
+ ASSERT0(tx->tx_txg);
+
+ if (tx->tx_err) {
+ DMU_TX_STAT_BUMP(dmu_tx_error);
+ return (tx->tx_err);
+ }
+
+ if (spa_suspended(spa)) {
+ DMU_TX_STAT_BUMP(dmu_tx_suspended);
+
+ /*
+ * If the user has indicated a blocking failure mode
+ * then return ERESTART which will block in dmu_tx_wait().
+ * Otherwise, return EIO so that an error can get
+ * propagated back to the VOP calls.
+ *
+ * Note that we always honor the txg_how flag regardless
+ * of the failuremode setting.
+ */
+ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
+ !(txg_how & TXG_WAIT))
+ return (SET_ERROR(EIO));
+
+ return (SET_ERROR(ERESTART));
+ }
+
+ if (!tx->tx_dirty_delayed &&
+ dsl_pool_need_dirty_delay(tx->tx_pool)) {
+ tx->tx_wait_dirty = B_TRUE;
+ DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
+ return (SET_ERROR(ERESTART));
+ }
+
+ tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
+ tx->tx_needassign_txh = NULL;
+
+ /*
+ * NB: No error returns are allowed after txg_hold_open, but
+ * before processing the dnode holds, due to the
+ * dmu_tx_unassign() logic.
+ */
+
+ uint64_t towrite = 0;
+ uint64_t tohold = 0;
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+ if (dn != NULL) {
+ /*
+ * This thread can't hold the dn_struct_rwlock
+ * while assigning the tx, because this can lead to
+ * deadlock. Specifically, if this dnode is already
+ * assigned to an earlier txg, this thread may need
+ * to wait for that txg to sync (the ERESTART case
+ * below). The other thread that has assigned this
+ * dnode to an earlier txg prevents this txg from
+ * syncing until its tx can complete (calling
+ * dmu_tx_commit()), but it may need to acquire the
+ * dn_struct_rwlock to do so (e.g. via
+ * dmu_buf_hold*()).
+ *
+ * Note that this thread can't hold the lock for
+ * read either, but the rwlock doesn't record
+ * enough information to make that assertion.
+ */
+ ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_assigned_txg == tx->tx_txg - 1) {
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = txh;
+ DMU_TX_STAT_BUMP(dmu_tx_group);
+ return (SET_ERROR(ERESTART));
+ }
+ if (dn->dn_assigned_txg == 0)
+ dn->dn_assigned_txg = tx->tx_txg;
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+ (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ towrite += zfs_refcount_count(&txh->txh_space_towrite);
+ tohold += zfs_refcount_count(&txh->txh_memory_tohold);
+ }
+
+ /* needed allocation: worst-case estimate of write space */
+ uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
+ /* calculate memory footprint estimate */
+ uint64_t memory = towrite + tohold;
+
+ if (tx->tx_dir != NULL && asize != 0) {
+ int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
+ asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
+ if (err != 0)
+ return (err);
+ }
+
+ DMU_TX_STAT_BUMP(dmu_tx_assigned);
+
+ return (0);
+}
+
+static void
+dmu_tx_unassign(dmu_tx_t *tx)
+{
+ if (tx->tx_txg == 0)
+ return;
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ /*
+ * Walk the transaction's hold list, removing the hold on the
+ * associated dnode, and notifying waiters if the refcount drops to 0.
+ */
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
+ txh && txh != tx->tx_needassign_txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ txg_rele_to_sync(&tx->tx_txgh);
+
+ tx->tx_lasttried_txg = tx->tx_txg;
+ tx->tx_txg = 0;
+}
+
+/*
+ * Assign tx to a transaction group; txg_how is a bitmask:
+ *
+ * If TXG_WAIT is set and the currently open txg is full, this function
+ * will wait until there's a new txg. This should be used when no locks
+ * are being held. With this bit set, this function will only fail if
+ * we're truly out of space (or over quota).
+ *
+ * If TXG_WAIT is *not* set and we can't assign into the currently open
+ * txg without blocking, this function will return immediately with
+ * ERESTART. This should be used whenever locks are being held. On an
+ * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
+ * and try again.
+ *
+ * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
+ * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
+ * details on the throttle). This is used by the VFS operations, after
+ * they have already called dmu_tx_wait() (though most likely on a
+ * different tx).
+ *
+ * It is guaranteed that subsequent successful calls to dmu_tx_assign()
+ * will assign the tx to monotonically increasing txgs. Of course this is
+ * not strong monotonicity, because the same txg can be returned multiple
+ * times in a row. This guarantee holds both for subsequent calls from
+ * one thread and for multiple threads. For example, it is impossible to
+ * observe the following sequence of events:
+ *
+ * Thread 1 Thread 2
+ *
+ * dmu_tx_assign(T1, ...)
+ * 1 <- dmu_tx_get_txg(T1)
+ * dmu_tx_assign(T2, ...)
+ * 2 <- dmu_tx_get_txg(T2)
+ * dmu_tx_assign(T3, ...)
+ * 1 <- dmu_tx_get_txg(T3)
+ */
+int
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
+ ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+
+ /* If we might wait, we must not hold the config lock. */
+ IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
+
+ if ((txg_how & TXG_NOTHROTTLE))
+ tx->tx_dirty_delayed = B_TRUE;
+
+ while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
+ dmu_tx_unassign(tx);
+
+ if (err != ERESTART || !(txg_how & TXG_WAIT))
+ return (err);
+
+ dmu_tx_wait(tx);
+ }
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ return (0);
+}
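+
+/*
+ * Illustrative sketch (assumed ZPL-style caller, not part of this file) of
+ * the retry pattern described above; "waited" and the "top" label are
+ * placeholders for the caller's own state and restart point:
+ *
+ *	top:
+ *		tx = dmu_tx_create(os);
+ *		dmu_tx_hold_write(tx, object, off, len);
+ *		error = dmu_tx_assign(tx,
+ *		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ *		if (error != 0) {
+ *			if (error == ERESTART) {
+ *				waited = B_TRUE;
+ *				... drop the caller's locks ...
+ *				dmu_tx_wait(tx);
+ *				dmu_tx_abort(tx);
+ *				goto top;
+ *			}
+ *			dmu_tx_abort(tx);
+ *			return (error);
+ *		}
+ *		... dirty the held buffers ...
+ *		dmu_tx_commit(tx);
+ *
+ * A caller that holds no locks can instead pass TXG_WAIT and skip the
+ * ERESTART handling entirely.
+ */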
+
+void
+dmu_tx_wait(dmu_tx_t *tx)
+{
+ spa_t *spa = tx->tx_pool->dp_spa;
+ dsl_pool_t *dp = tx->tx_pool;
+ hrtime_t before;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(!dsl_pool_config_held(tx->tx_pool));
+
+ before = gethrtime();
+
+ if (tx->tx_wait_dirty) {
+ uint64_t dirty;
+
+ /*
+ * dmu_tx_try_assign() has determined that we need to wait
+ * because we've consumed much or all of the dirty buffer
+ * space.
+ */
+ mutex_enter(&dp->dp_lock);
+ if (dp->dp_dirty_total >= zfs_dirty_data_max)
+ DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
+ while (dp->dp_dirty_total >= zfs_dirty_data_max)
+ cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+ dirty = dp->dp_dirty_total;
+ mutex_exit(&dp->dp_lock);
+
+ dmu_tx_delay(tx, dirty);
+
+ tx->tx_wait_dirty = B_FALSE;
+
+ /*
+ * Note: setting tx_dirty_delayed only has effect if the
+		 * caller used TXG_WAIT.  Otherwise they are going to
+		 * destroy this tx and try again.  The common case,
+		 * zfs_write(), uses TXG_WAIT.
+ */
+ tx->tx_dirty_delayed = B_TRUE;
+ } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+ /*
+ * If the pool is suspended we need to wait until it
+ * is resumed. Note that it's possible that the pool
+ * has become active after this thread has tried to
+ * obtain a tx. If that's the case then tx_lasttried_txg
+ * would not have been set.
+ */
+ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
+ } else if (tx->tx_needassign_txh) {
+ dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
+
+ mutex_enter(&dn->dn_mtx);
+ while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
+ cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = NULL;
+ } else {
+ /*
+ * If we have a lot of dirty data just wait until we sync
+ * out a TXG at which point we'll hopefully have synced
+ * a portion of the changes.
+ */
+ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
+ }
+
+ spa_tx_assign_add_nsecs(spa, gethrtime() - before);
+}
+
+static void
+dmu_tx_destroy(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *txh;
+
+ while ((txh = list_head(&tx->tx_holds)) != NULL) {
+ dnode_t *dn = txh->txh_dnode;
+
+ list_remove(&tx->tx_holds, txh);
+ zfs_refcount_destroy_many(&txh->txh_space_towrite,
+ zfs_refcount_count(&txh->txh_space_towrite));
+ zfs_refcount_destroy_many(&txh->txh_memory_tohold,
+ zfs_refcount_count(&txh->txh_memory_tohold));
+ kmem_free(txh, sizeof (dmu_tx_hold_t));
+ if (dn != NULL)
+ dnode_rele(dn, tx);
+ }
+
+ list_destroy(&tx->tx_callbacks);
+ list_destroy(&tx->tx_holds);
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg != 0);
+
+ /*
+ * Go through the transaction's hold list and remove holds on
+ * associated dnodes, notifying waiters if no holds remain.
+ */
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ continue;
+
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (tx->tx_tempreserve_cookie)
+ dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+
+ if (!list_is_empty(&tx->tx_callbacks))
+ txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
+
+ if (tx->tx_anyobj == FALSE)
+ txg_rele_to_sync(&tx->tx_txgh);
+
+ dmu_tx_destroy(tx);
+}
+
+void
+dmu_tx_abort(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg == 0);
+
+ /*
+ * Call any registered callbacks with an error code.
+ */
+ if (!list_is_empty(&tx->tx_callbacks))
+ dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED));
+
+ dmu_tx_destroy(tx);
+}
+
+uint64_t
+dmu_tx_get_txg(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg != 0);
+ return (tx->tx_txg);
+}
+
+dsl_pool_t *
+dmu_tx_pool(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_pool != NULL);
+ return (tx->tx_pool);
+}
+
+void
+dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
+{
+ dmu_tx_callback_t *dcb;
+
+ dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
+
+ dcb->dcb_func = func;
+ dcb->dcb_data = data;
+
+ list_insert_tail(&tx->tx_callbacks, dcb);
+}
+
+/*
+ * Call all the commit callbacks on a list, with a given error code.
+ */
+void
+dmu_tx_do_callbacks(list_t *cb_list, int error)
+{
+ dmu_tx_callback_t *dcb;
+
+ while ((dcb = list_tail(cb_list)) != NULL) {
+ list_remove(cb_list, dcb);
+ dcb->dcb_func(dcb->dcb_data, error);
+ kmem_free(dcb, sizeof (dmu_tx_callback_t));
+ }
+}
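+
+/*
+ * Illustrative sketch (hypothetical callback, not part of this file): a
+ * caller that needs to know when its change has reached stable storage can
+ * register a commit callback before committing the tx:
+ *
+ *	static void
+ *	my_commit_cb(void *arg, int error)
+ *	{
+ *		if (error == 0) {
+ *			... the txg containing the change has synced ...
+ *		} else {
+ *			... the tx was aborted (e.g. error == ECANCELED) ...
+ *		}
+ *	}
+ *
+ *	dmu_tx_callback_register(tx, my_commit_cb, arg);
+ *
+ * The callback is invoked with error 0 once the assigned txg has synced,
+ * or with an error code such as ECANCELED if the tx is aborted instead of
+ * committed (see dmu_tx_abort() above).
+ */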
+
+/*
+ * Interface to hold a bunch of attributes; used for creating new files.
+ * attrsize is the total size of all attributes to be added during object
+ * creation.
+ *
+ * For updating/adding a single attribute, dmu_tx_hold_sa() should be used.
+ */
+
+/*
+ * Hold the ZAP needed to register any attribute names that are not yet
+ * registered.  This should be a very rare case; if it does happen, it would
+ * only happen on the first write to the file system.
+ */
+static void
+dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
+{
+ if (!sa->sa_need_attr_registration)
+ return;
+
+ for (int i = 0; i != sa->sa_num_attrs; i++) {
+ if (!sa->sa_attr_table[i].sa_registered) {
+ if (sa->sa_reg_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ else
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ }
+ }
+}
+
+void
+dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+ THT_SPILL, 0, 0);
+ if (txh != NULL)
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite,
+ SPA_OLD_MAXBLOCKSIZE, FTAG);
+}
+
+void
+dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
+{
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+ } else {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
+ return;
+
+ (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
+ THT_SPILL, 0, 0);
+}
+
+/*
+ * Hold SA attributes for an existing object.
+ *
+ * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+ *
+ * may_grow indicates whether the variable-sized portion of the attributes
+ * may grow (or new attributes may be added), in which case the spill block
+ * and, if present, the SA layout ZAP are also held.
+ */
+void
+dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+{
+ uint64_t object;
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ ASSERT(hdl != NULL);
+
+ object = sa_handle_object(hdl);
+
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db));
+ DB_DNODE_EXIT(db);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
+ tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+
+ if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ } else {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_have_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ }
+ DB_DNODE_EXIT(db);
+ }
+}
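+
+/*
+ * Illustrative sketch (assumed callers, not part of this file): updating
+ * existing fixed-size attributes (e.g. timestamps) on an object whose SA
+ * handle is "hdl" only needs
+ *
+ *	dmu_tx_hold_sa(tx, hdl, B_FALSE);
+ *
+ * while a caller that may add attributes or enlarge variable-sized ones
+ * passes may_grow = B_TRUE, so that the spill block (and, if present, the
+ * layout ZAP) is held as well.
+ */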
+
+void
+dmu_tx_init(void)
+{
+ dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc",
+ KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (dmu_tx_ksp != NULL) {
+ dmu_tx_ksp->ks_data = &dmu_tx_stats;
+ kstat_install(dmu_tx_ksp);
+ }
+}
+
+void
+dmu_tx_fini(void)
+{
+ if (dmu_tx_ksp != NULL) {
+ kstat_delete(dmu_tx_ksp);
+ dmu_tx_ksp = NULL;
+ }
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dmu_tx_create);
+EXPORT_SYMBOL(dmu_tx_hold_write);
+EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
+EXPORT_SYMBOL(dmu_tx_hold_free);
+EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
+EXPORT_SYMBOL(dmu_tx_hold_zap);
+EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode);
+EXPORT_SYMBOL(dmu_tx_hold_bonus);
+EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode);
+EXPORT_SYMBOL(dmu_tx_abort);
+EXPORT_SYMBOL(dmu_tx_assign);
+EXPORT_SYMBOL(dmu_tx_wait);
+EXPORT_SYMBOL(dmu_tx_commit);
+EXPORT_SYMBOL(dmu_tx_mark_netfree);
+EXPORT_SYMBOL(dmu_tx_get_txg);
+EXPORT_SYMBOL(dmu_tx_callback_register);
+EXPORT_SYMBOL(dmu_tx_do_callbacks);
+EXPORT_SYMBOL(dmu_tx_hold_spill);
+EXPORT_SYMBOL(dmu_tx_hold_sa_create);
+EXPORT_SYMBOL(dmu_tx_hold_sa);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
new file mode 100644
index 000000000000..5d061fe3813e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
@@ -0,0 +1,471 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+#include <sys/kstat.h>
+
+/*
+ * This tunable disables predictive prefetch. Note that it leaves "prescient"
+ * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
+ * prescient prefetch never issues i/os that end up not being needed,
+ * so it can't hurt performance.
+ */
+
+int zfs_prefetch_disable = B_FALSE;
+
+/* max # of streams per zfetch */
+unsigned int zfetch_max_streams = 8;
+/* min time before stream reclaim */
+unsigned int zfetch_min_sec_reap = 2;
+/* max bytes to prefetch per stream (default 8MB) */
+unsigned int zfetch_max_distance = 8 * 1024 * 1024;
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
+/* max number of bytes in an array_read in which we allow prefetching (1MB) */
+unsigned long zfetch_array_rd_sz = 1024 * 1024;
+
+typedef struct zfetch_stats {
+ kstat_named_t zfetchstat_hits;
+ kstat_named_t zfetchstat_misses;
+ kstat_named_t zfetchstat_max_streams;
+ kstat_named_t zfetchstat_max_completion_us;
+ kstat_named_t zfetchstat_last_completion_us;
+ kstat_named_t zfetchstat_io_issued;
+} zfetch_stats_t;
+
+static zfetch_stats_t zfetch_stats = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+ { "max_streams", KSTAT_DATA_UINT64 },
+ { "max_completion_us", KSTAT_DATA_UINT64 },
+ { "last_completion_us", KSTAT_DATA_UINT64 },
+ { "io_issued", KSTAT_DATA_UINT64 },
+};
+
+#define ZFETCHSTAT_BUMP(stat) \
+ atomic_inc_64(&zfetch_stats.stat.value.ui64)
+#define ZFETCHSTAT_ADD(stat, val) \
+ atomic_add_64(&zfetch_stats.stat.value.ui64, val)
+#define ZFETCHSTAT_SET(stat, val) \
+ zfetch_stats.stat.value.ui64 = val
+#define ZFETCHSTAT_GET(stat) \
+ zfetch_stats.stat.value.ui64
+
+
+kstat_t *zfetch_ksp;
+
+void
+zfetch_init(void)
+{
+ zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (zfetch_ksp != NULL) {
+ zfetch_ksp->ks_data = &zfetch_stats;
+ kstat_install(zfetch_ksp);
+ }
+}
+
+void
+zfetch_fini(void)
+{
+ if (zfetch_ksp != NULL) {
+ kstat_delete(zfetch_ksp);
+ zfetch_ksp = NULL;
+ }
+}
+
+/*
+ * This takes a pointer to a zfetch structure and a dnode. It performs the
+ * necessary setup for the zfetch structure, grokking data from the
+ * associated dnode.
+ */
+void
+dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
+{
+ if (zf == NULL)
+ return;
+ zf->zf_dnode = dno;
+ zf->zf_numstreams = 0;
+
+ list_create(&zf->zf_stream, sizeof (zstream_t),
+ offsetof(zstream_t, zs_node));
+
+ mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+dmu_zfetch_stream_fini(zstream_t *zs)
+{
+ mutex_destroy(&zs->zs_lock);
+ kmem_free(zs, sizeof (*zs));
+}
+
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(MUTEX_HELD(&zf->zf_lock));
+ list_remove(&zf->zf_stream, zs);
+ dmu_zfetch_stream_fini(zs);
+ zf->zf_numstreams--;
+}
+
+static void
+dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(MUTEX_HELD(&zf->zf_lock));
+ list_remove(&zf->zf_stream, zs);
+ zs->zs_fetch = NULL;
+ zf->zf_numstreams--;
+}
+
+/*
+ * Clean-up state associated with a zfetch structure (e.g. destroy the
+ * streams). This doesn't free the zfetch_t itself; that's left to the caller.
+ */
+void
+dmu_zfetch_fini(zfetch_t *zf)
+{
+ zstream_t *zs;
+
+ mutex_enter(&zf->zf_lock);
+ while ((zs = list_head(&zf->zf_stream)) != NULL) {
+ if (zfs_refcount_count(&zs->zs_blocks) != 0)
+ dmu_zfetch_stream_orphan(zf, zs);
+ else
+ dmu_zfetch_stream_remove(zf, zs);
+ }
+ mutex_exit(&zf->zf_lock);
+ list_destroy(&zf->zf_stream);
+ mutex_destroy(&zf->zf_lock);
+
+ zf->zf_dnode = NULL;
+}
+
+/*
+ * If there aren't too many streams already, create a new stream.
+ * The "blkid" argument is the next block that we expect this stream to access.
+ * While we're here, clean up old streams (which haven't been
+ * accessed for at least zfetch_min_sec_reap seconds).
+ */
+static void
+dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
+{
+ zstream_t *zs_next;
+ hrtime_t now = gethrtime();
+
+ ASSERT(MUTEX_HELD(&zf->zf_lock));
+
+ /*
+ * Clean up old streams.
+ */
+ for (zstream_t *zs = list_head(&zf->zf_stream);
+ zs != NULL; zs = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs);
+ /*
+		 * Do not reclaim streams that still have outstanding
+		 * references (prefetch i/os in flight).
+ */
+ if (zfs_refcount_count(&zs->zs_blocks) != 0)
+ continue;
+ if (((now - zs->zs_atime) / NANOSEC) >
+ zfetch_min_sec_reap)
+ dmu_zfetch_stream_remove(zf, zs);
+ }
+
+ /*
+ * The maximum number of streams is normally zfetch_max_streams,
+ * but for small files we lower it such that it's at least possible
+ * for all the streams to be non-overlapping.
+ *
+ * If we are already at the maximum number of streams for this file,
+ * even after removing old streams, then don't create this stream.
+ */
+ uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
+ zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+ zfetch_max_distance));
+ if (zf->zf_numstreams >= max_streams) {
+ ZFETCHSTAT_BUMP(zfetchstat_max_streams);
+ return;
+ }
+
+ zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
+ zs->zs_blkid = blkid;
+ zs->zs_pf_blkid = blkid;
+ zs->zs_ipf_blkid = blkid;
+ zs->zs_atime = now;
+ zs->zs_fetch = zf;
+ zfs_refcount_create(&zs->zs_blocks);
+ mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
+ zf->zf_numstreams++;
+ list_insert_head(&zf->zf_stream, zs);
+}
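+
+/*
+ * Illustrative arithmetic (not part of the original source), using the
+ * defaults above (zfetch_max_streams = 8, zfetch_max_distance = 8 MB) and
+ * a hypothetical 128 KB record size:
+ *
+ *	16 MB file: dn_maxblkid ~= 127, and 127 * 128 KB / 8 MB ~= 1,
+ *	    so max_streams = MAX(1, MIN(8, 1)) = 1;
+ *	1 GB file:  dn_maxblkid ~= 8191, and 8191 * 128 KB / 8 MB ~= 127,
+ *	    so max_streams = MIN(8, 127) = 8 (the normal cap).
+ */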
+
+static void
+dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
+{
+ zstream_t *zs = arg;
+
+ if (zs->zs_start_time && io_issued) {
+ hrtime_t now = gethrtime();
+ hrtime_t delta = NSEC2USEC(now - zs->zs_start_time);
+
+ zs->zs_start_time = 0;
+ ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta);
+ if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us))
+ ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);
+ }
+
+ if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0)
+ return;
+
+ /*
+ * The parent fetch structure has gone away
+ */
+ if (zs->zs_fetch == NULL)
+ dmu_zfetch_stream_fini(zs);
+}
+
+/*
+ * This is the predictive prefetch entry point. It associates the dnode
+ * access specified by the blkid and nblks arguments with a prefetch stream,
+ * predicts further accesses based on that stream's stats, and initiates
+ * speculative prefetch. The fetch_data argument specifies whether actual
+ * data blocks should be fetched:
+ * FALSE -- prefetch only indirect blocks for predicted data blocks;
+ * TRUE -- prefetch predicted data blocks plus following indirect blocks.
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
+ boolean_t have_lock)
+{
+ zstream_t *zs;
+ int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+ int64_t pf_ahead_blks, max_blks;
+ int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued;
+ uint64_t end_of_access_blkid;
+ end_of_access_blkid = blkid + nblks;
+ spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
+
+ if (zfs_prefetch_disable)
+ return;
+ /*
+ * If we haven't yet loaded the indirect vdevs' mappings, we
+ * can only read from blocks that we carefully ensure are on
+ * concrete vdevs (or previously-loaded indirect vdevs). So we
+ * can't allow the predictive prefetcher to attempt reads of other
+ * blocks (e.g. of the MOS's dnode object).
+ */
+ if (!spa_indirect_vdevs_loaded(spa))
+ return;
+
+ /*
+ * As a fast path for small (single-block) files, ignore access
+ * to the first block.
+ */
+ if (!have_lock && blkid == 0)
+ return;
+
+ if (!have_lock)
+ rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
+
+ /*
+ * A fast path for small files for which no prefetch will
+ * happen.
+ */
+ if (zf->zf_dnode->dn_maxblkid < 2) {
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+ return;
+ }
+ mutex_enter(&zf->zf_lock);
+
+ /*
+ * Find matching prefetch stream. Depending on whether the accesses
+ * are block-aligned, first block of the new access may either follow
+ * the last block of the previous access, or be equal to it.
+ */
+ for (zs = list_head(&zf->zf_stream); zs != NULL;
+ zs = list_next(&zf->zf_stream, zs)) {
+ if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
+ mutex_enter(&zs->zs_lock);
+ /*
+			 * zs_blkid could have changed before we
+			 * acquired zs_lock; re-check it here.
+ */
+ if (blkid == zs->zs_blkid) {
+ break;
+ } else if (blkid + 1 == zs->zs_blkid) {
+ blkid++;
+ nblks--;
+ if (nblks == 0) {
+ /* Already prefetched this before. */
+ mutex_exit(&zs->zs_lock);
+ mutex_exit(&zf->zf_lock);
+ if (!have_lock) {
+ rw_exit(&zf->zf_dnode->
+ dn_struct_rwlock);
+ }
+ return;
+ }
+ break;
+ }
+ mutex_exit(&zs->zs_lock);
+ }
+ }
+
+ if (zs == NULL) {
+ /*
+ * This access is not part of any existing stream. Create
+ * a new stream for it.
+ */
+ ZFETCHSTAT_BUMP(zfetchstat_misses);
+
+ dmu_zfetch_stream_create(zf, end_of_access_blkid);
+ mutex_exit(&zf->zf_lock);
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+ return;
+ }
+
+ /*
+ * This access was to a block that we issued a prefetch for on
+ * behalf of this stream. Issue further prefetches for this stream.
+ *
+ * Normally, we start prefetching where we stopped
+ * prefetching last (zs_pf_blkid). But when we get our first
+ * hit on this stream, zs_pf_blkid == zs_blkid, we don't
+ * want to prefetch the block we just accessed. In this case,
+ * start just after the block we just accessed.
+ */
+ pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
+
+ /*
+ * Double our amount of prefetched data, but don't let the
+ * prefetch get further ahead than zfetch_max_distance.
+ */
+ if (fetch_data) {
+ max_dist_blks =
+ zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
+ /*
+ * Previously, we were (zs_pf_blkid - blkid) ahead. We
+ * want to now be double that, so read that amount again,
+ * plus the amount we are catching up by (i.e. the amount
+ * read just now).
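+		 *
+		 * Illustrative numbers (hypothetical): if the reader was at
+		 * blkid 100, we had prefetched through zs_pf_blkid = 110
+		 * (10 blocks ahead), and this access read nblks = 4, then
+		 * pf_ahead_blks = 110 - 100 + 4 = 14.  Issuing 14 more
+		 * blocks from pf_start = 110 moves zs_pf_blkid to 124,
+		 * i.e. 20 blocks ahead of the new position (104) -- double
+		 * the previous lead (ignoring the zfetch_max_distance cap).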
+ */
+ pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+ max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+ pf_nblks = MIN(pf_ahead_blks, max_blks);
+ } else {
+ pf_nblks = 0;
+ }
+
+ zs->zs_pf_blkid = pf_start + pf_nblks;
+
+ /*
+ * Do the same for indirects, starting from where we stopped last,
+ * or where we will stop reading data blocks (and the indirects
+ * that point to them).
+ */
+ ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+ max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+ /*
+ * We want to double our distance ahead of the data prefetch
+ * (or reader, if we are not prefetching data). Previously, we
+ * were (zs_ipf_blkid - blkid) ahead. To double that, we read
+ * that amount again, plus the amount we are catching up by
+ * (i.e. the amount read now + the amount of data prefetched now).
+ */
+ pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
+ max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+ ipf_nblks = MIN(pf_ahead_blks, max_blks);
+ zs->zs_ipf_blkid = ipf_start + ipf_nblks;
+
+ epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+ ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
+
+ zs->zs_atime = gethrtime();
+ /* no prior reads in progress */
+ if (zfs_refcount_count(&zs->zs_blocks) == 0)
+ zs->zs_start_time = zs->zs_atime;
+ zs->zs_blkid = end_of_access_blkid;
+ zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart,
+ NULL);
+ mutex_exit(&zs->zs_lock);
+ mutex_exit(&zf->zf_lock);
+ issued = 0;
+
+ /*
+ * dbuf_prefetch() is asynchronous (even when it needs to read
+ * indirect blocks), but we still prefer to drop our locks before
+ * calling it to reduce the time we hold them.
+ */
+
+ for (int i = 0; i < pf_nblks; i++) {
+ issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
+ dmu_zfetch_stream_done, zs);
+ }
+ for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+ issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
+ dmu_zfetch_stream_done, zs);
+ }
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+ ZFETCHSTAT_BUMP(zfetchstat_hits);
+
+ if (issued)
+ ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
+ "Disable all ZFS prefetching");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
+ "Max number of streams per zfetch");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
+ "Min time before stream reclaim");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
+ "Max bytes to prefetch per stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
+ "Max bytes to prefetch indirects for per stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW,
+	"Number of bytes in an array_read");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c
new file mode 100644
index 000000000000..eaba9c0c0e7f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dnode.c
@@ -0,0 +1,2583 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/range_tree.h>
+#include <sys/trace_zfs.h>
+#include <sys/zfs_project.h>
+
+dnode_stats_t dnode_stats = {
+ { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 },
+ { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_hits", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
+ { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_allocate", KSTAT_DATA_UINT64 },
+ { "dnode_reallocate", KSTAT_DATA_UINT64 },
+ { "dnode_buf_evict", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_race", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_next_block", KSTAT_DATA_UINT64 },
+ { "dnode_move_invalid", KSTAT_DATA_UINT64 },
+ { "dnode_move_recheck1", KSTAT_DATA_UINT64 },
+ { "dnode_move_recheck2", KSTAT_DATA_UINT64 },
+ { "dnode_move_special", KSTAT_DATA_UINT64 },
+ { "dnode_move_handle", KSTAT_DATA_UINT64 },
+ { "dnode_move_rwlock", KSTAT_DATA_UINT64 },
+ { "dnode_move_active", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dnode_ksp;
+static kmem_cache_t *dnode_cache;
+
+static dnode_phys_t dnode_phys_zero __maybe_unused;
+
+int zfs_default_bs = SPA_MINBLOCKSHIFT;
+int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+
+#ifdef _KERNEL
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+#endif /* _KERNEL */
+
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+ const dmu_buf_impl_t *d1 = x1;
+ const dmu_buf_impl_t *d2 = x2;
+
+ int cmp = TREE_CMP(d1->db_level, d2->db_level);
+ if (likely(cmp))
+ return (cmp);
+
+ cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
+ if (likely(cmp))
+ return (cmp);
+
+ if (d1->db_state == DB_SEARCH) {
+ ASSERT3S(d2->db_state, !=, DB_SEARCH);
+ return (-1);
+ } else if (d2->db_state == DB_SEARCH) {
+ ASSERT3S(d1->db_state, !=, DB_SEARCH);
+ return (1);
+ }
+
+ return (TREE_PCMP(d1, d2));
+}
+
+/* ARGSUSED */
+static int
+dnode_cons(void *arg, void *unused, int kmflag)
+{
+ dnode_t *dn = arg;
+ int i;
+
+ rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
+ mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
+ cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);
+
+ /*
+ * Every dbuf has a reference, and dropping a tracked reference is
+ * O(number of references), so don't track dn_holds.
+ */
+ zfs_refcount_create_untracked(&dn->dn_holds);
+ zfs_refcount_create(&dn->dn_tx_holds);
+ list_link_init(&dn->dn_link);
+
+ bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
+ bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
+ bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
+ bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
+ bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
+ bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
+ bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
+ bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid));
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ multilist_link_init(&dn->dn_dirty_link[i]);
+ dn->dn_free_ranges[i] = NULL;
+ list_create(&dn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+ dn->dn_dirty_txg = 0;
+ dn->dn_dirtyctx = 0;
+ dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_bonus = NULL;
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_zio = NULL;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_newprojid = ZFS_DEFAULT_PROJID;
+ dn->dn_id_flags = 0;
+
+ dn->dn_dbufs_count = 0;
+ avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ dn->dn_moved = 0;
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dnode_dest(void *arg, void *unused)
+{
+ int i;
+ dnode_t *dn = arg;
+
+ rw_destroy(&dn->dn_struct_rwlock);
+ mutex_destroy(&dn->dn_mtx);
+ mutex_destroy(&dn->dn_dbufs_mtx);
+ cv_destroy(&dn->dn_notxholds);
+ cv_destroy(&dn->dn_nodnholds);
+ zfs_refcount_destroy(&dn->dn_holds);
+ zfs_refcount_destroy(&dn->dn_tx_holds);
+ ASSERT(!list_link_active(&dn->dn_link));
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
+ list_destroy(&dn->dn_dirty_records[i]);
+ ASSERT0(dn->dn_next_nblkptr[i]);
+ ASSERT0(dn->dn_next_nlevels[i]);
+ ASSERT0(dn->dn_next_indblkshift[i]);
+ ASSERT0(dn->dn_next_bonustype[i]);
+ ASSERT0(dn->dn_rm_spillblk[i]);
+ ASSERT0(dn->dn_next_bonuslen[i]);
+ ASSERT0(dn->dn_next_blksz[i]);
+ ASSERT0(dn->dn_next_maxblkid[i]);
+ }
+
+ ASSERT0(dn->dn_allocated_txg);
+ ASSERT0(dn->dn_free_txg);
+ ASSERT0(dn->dn_assigned_txg);
+ ASSERT0(dn->dn_dirty_txg);
+ ASSERT0(dn->dn_dirtyctx);
+ ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
+ ASSERT3P(dn->dn_bonus, ==, NULL);
+ ASSERT(!dn->dn_have_spill);
+ ASSERT3P(dn->dn_zio, ==, NULL);
+ ASSERT0(dn->dn_oldused);
+ ASSERT0(dn->dn_oldflags);
+ ASSERT0(dn->dn_olduid);
+ ASSERT0(dn->dn_oldgid);
+ ASSERT0(dn->dn_oldprojid);
+ ASSERT0(dn->dn_newuid);
+ ASSERT0(dn->dn_newgid);
+ ASSERT0(dn->dn_newprojid);
+ ASSERT0(dn->dn_id_flags);
+
+ ASSERT0(dn->dn_dbufs_count);
+ avl_destroy(&dn->dn_dbufs);
+}
+
+void
+dnode_init(void)
+{
+ ASSERT(dnode_cache == NULL);
+ dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
+ 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+ kmem_cache_set_move(dnode_cache, dnode_move);
+
+ dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (dnode_ksp != NULL) {
+ dnode_ksp->ks_data = &dnode_stats;
+ kstat_install(dnode_ksp);
+ }
+}
+
+void
+dnode_fini(void)
+{
+ if (dnode_ksp != NULL) {
+ kstat_delete(dnode_ksp);
+ dnode_ksp = NULL;
+ }
+
+ kmem_cache_destroy(dnode_cache);
+ dnode_cache = NULL;
+}
+
+
+#ifdef ZFS_DEBUG
+void
+dnode_verify(dnode_t *dn)
+{
+ int drop_struct_lock = FALSE;
+
+ ASSERT(dn->dn_phys);
+ ASSERT(dn->dn_objset);
+ ASSERT(dn->dn_handle->dnh_dnode == dn);
+
+ ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+
+ if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
+ return;
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
+ int i;
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
+ if (dn->dn_datablkshift) {
+ ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
+ ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
+ }
+ ASSERT3U(dn->dn_nlevels, <=, 30);
+ ASSERT(DMU_OT_IS_VALID(dn->dn_type));
+ ASSERT3U(dn->dn_nblkptr, >=, 1);
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+ ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
+ ASSERT3U(dn->dn_datablksz, ==,
+ dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
+ ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
+ dn->dn_bonuslen, <=, max_bonuslen);
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
+ }
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE)
+ ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
+ ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
+ if (dn->dn_dbuf != NULL) {
+ ASSERT3P(dn->dn_phys, ==,
+ (dnode_phys_t *)dn->dn_dbuf->db.db_data +
+ (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+}
+#endif
+
+void
+dnode_byteswap(dnode_phys_t *dnp)
+{
+ uint64_t *buf64 = (void*)&dnp->dn_blkptr;
+ int i;
+
+ if (dnp->dn_type == DMU_OT_NONE) {
+ bzero(dnp, sizeof (dnode_phys_t));
+ return;
+ }
+
+ dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
+ dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
+ dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
+ dnp->dn_used = BSWAP_64(dnp->dn_used);
+
+ /*
+ * dn_nblkptr is only one byte, so it's OK to read it in either
+	 * byte order. We can't read dn_bonuslen.
+ */
+ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
+ ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
+ for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
+ buf64[i] = BSWAP_64(buf64[i]);
+
+ /*
+ * OK to check dn_bonuslen for zero, because it won't matter if
+ * we have the wrong byte order. This is necessary because the
+ * dnode dnode is smaller than a regular dnode.
+ */
+ if (dnp->dn_bonuslen != 0) {
+ /*
+ * Note that the bonus length calculated here may be
+ * longer than the actual bonus buffer. This is because
+ * we always put the bonus buffer after the last block
+ * pointer (instead of packing it against the end of the
+ * dnode buffer).
+ */
+ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
+ int slots = dnp->dn_extra_slots + 1;
+ size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
+ dmu_object_byteswap_t byteswap;
+ ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
+ byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
+ dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
+ }
+
+ /* Swap SPILL block if we have one */
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
+}
+
+void
+dnode_buf_byteswap(void *vbuf, size_t size)
+{
+ int i = 0;
+
+ ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
+ ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+
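+	/*
+	 * Byteswap the first slot of each dnode, then skip any extra
+	 * slots consumed by a large dnode; dnode_byteswap() already
+	 * handles the extended bonus buffer stored in those slots.
+	 */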
+ while (i < size) {
+ dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
+ dnode_byteswap(dnp);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE)
+ i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
+ }
+}
+
+void
+dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
+{
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+
+ if (newsize < dn->dn_bonuslen) {
+ /* clear any data after the end of the new size */
+ size_t diff = dn->dn_bonuslen - newsize;
+ char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
+ bzero(data_end, diff);
+ }
+
+ dn->dn_bonuslen = newsize;
+ if (newsize == 0)
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
+ else
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
+{
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dn->dn_bonustype = newtype;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+ dnode_setdirty(dn, tx);
+ dn->dn_rm_spillblk[tx->tx_txg & TXG_MASK] = DN_KILL_SPILLBLK;
+ dn->dn_have_spill = B_FALSE;
+}
+
+static void
+dnode_setdblksz(dnode_t *dn, int size)
+{
+ ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
+ 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
+ dn->dn_datablksz = size;
+ dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
+ dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
+}
+
+static dnode_t *
+dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+ uint64_t object, dnode_handle_t *dnh)
+{
+ dnode_t *dn;
+
+ dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+ dn->dn_moved = 0;
+
+ /*
+ * Defer setting dn_objset until the dnode is ready to be a candidate
+ * for the dnode_move() callback.
+ */
+ dn->dn_object = object;
+ dn->dn_dbuf = db;
+ dn->dn_handle = dnh;
+ dn->dn_phys = dnp;
+
+ if (dnp->dn_datablkszsec) {
+ dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ } else {
+ dn->dn_datablksz = 0;
+ dn->dn_datablkszsec = 0;
+ dn->dn_datablkshift = 0;
+ }
+ dn->dn_indblkshift = dnp->dn_indblkshift;
+ dn->dn_nlevels = dnp->dn_nlevels;
+ dn->dn_type = dnp->dn_type;
+ dn->dn_nblkptr = dnp->dn_nblkptr;
+ dn->dn_checksum = dnp->dn_checksum;
+ dn->dn_compress = dnp->dn_compress;
+ dn->dn_bonustype = dnp->dn_bonustype;
+ dn->dn_bonuslen = dnp->dn_bonuslen;
+ dn->dn_num_slots = dnp->dn_extra_slots + 1;
+ dn->dn_maxblkid = dnp->dn_maxblkid;
+ dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
+ dn->dn_id_flags = 0;
+
+ dmu_zfetch_init(&dn->dn_zfetch, dn);
+
+ ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+ ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
+
+ mutex_enter(&os->os_lock);
+
+ /*
+ * Exclude special dnodes from os_dnodes so an empty os_dnodes
+ * signifies that the special dnodes have no references from
+ * their children (the entries in os_dnodes). This allows
+ * dnode_destroy() to easily determine if the last child has
+ * been removed and then complete eviction of the objset.
+ */
+ if (!DMU_OBJECT_IS_SPECIAL(object))
+ list_insert_head(&os->os_dnodes, dn);
+ membar_producer();
+
+ /*
+ * Everything else must be valid before assigning dn_objset
+ * makes the dnode eligible for dnode_move().
+ */
+ dn->dn_objset = os;
+
+ dnh->dnh_dnode = dn;
+ mutex_exit(&os->os_lock);
+
+ arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
+
+ return (dn);
+}
+
+/*
+ * Caller must be holding the dnode handle, which is released upon return.
+ */
+static void
+dnode_destroy(dnode_t *dn)
+{
+ objset_t *os = dn->dn_objset;
+ boolean_t complete_os_eviction = B_FALSE;
+
+ ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
+
+ mutex_enter(&os->os_lock);
+ POINTER_INVALIDATE(&dn->dn_objset);
+ if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ list_remove(&os->os_dnodes, dn);
+ complete_os_eviction =
+ list_is_empty(&os->os_dnodes) &&
+ list_link_active(&os->os_evicting_node);
+ }
+ mutex_exit(&os->os_lock);
+
+ /* the dnode can no longer move, so we can release the handle */
+ if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
+
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+ dn->dn_dirty_txg = 0;
+
+ dn->dn_dirtyctx = 0;
+ dn->dn_dirtyctx_firstset = NULL;
+ if (dn->dn_bonus != NULL) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_destroy(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
+ dn->dn_zio = NULL;
+
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_newprojid = ZFS_DEFAULT_PROJID;
+ dn->dn_id_flags = 0;
+
+ dmu_zfetch_fini(&dn->dn_zfetch);
+ kmem_cache_free(dnode_cache, dn);
+ arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
+
+ if (complete_os_eviction)
+ dmu_objset_evict_done(os);
+}
+
+void
+dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
+{
+ int i;
+
+ ASSERT3U(dn_slots, >, 0);
+ ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+ spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+ if (blocksize == 0)
+ blocksize = 1 << zfs_default_bs;
+ else
+ blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
+
+ if (ibs == 0)
+ ibs = zfs_default_ibs;
+
+ ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
+
+ dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
+ dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
+ DNODE_STAT_BUMP(dnode_allocate);
+
+ ASSERT(dn->dn_type == DMU_OT_NONE);
+ ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+ ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
+ ASSERT(ot != DMU_OT_NONE);
+ ASSERT(DMU_OT_IS_VALID(ot));
+ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype == DMU_OT_SA && bonuslen == 0) ||
+ (bonustype != DMU_OT_NONE && bonuslen != 0));
+ ASSERT(DMU_OT_IS_VALID(bonustype));
+ ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
+ ASSERT(dn->dn_type == DMU_OT_NONE);
+ ASSERT0(dn->dn_maxblkid);
+ ASSERT0(dn->dn_allocated_txg);
+ ASSERT0(dn->dn_assigned_txg);
+ ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
+ ASSERT(avl_is_empty(&dn->dn_dbufs));
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(dn->dn_next_nblkptr[i]);
+ ASSERT0(dn->dn_next_nlevels[i]);
+ ASSERT0(dn->dn_next_indblkshift[i]);
+ ASSERT0(dn->dn_next_bonuslen[i]);
+ ASSERT0(dn->dn_next_bonustype[i]);
+ ASSERT0(dn->dn_rm_spillblk[i]);
+ ASSERT0(dn->dn_next_blksz[i]);
+ ASSERT0(dn->dn_next_maxblkid[i]);
+ ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
+ ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
+ }
+
+ dn->dn_type = ot;
+ dnode_setdblksz(dn, blocksize);
+ dn->dn_indblkshift = ibs;
+ dn->dn_nlevels = 1;
+ dn->dn_num_slots = dn_slots;
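+	/*
+	 * Fit as many block pointers as possible into the space left
+	 * over after the bonus buffer, up to DN_MAX_NBLKPTR. An SA
+	 * bonus keeps a single block pointer so the bonus area is as
+	 * large as possible.
+	 */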
+ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+ dn->dn_nblkptr = 1;
+ else {
+ dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
+ }
+
+ dn->dn_bonustype = bonustype;
+ dn->dn_bonuslen = bonuslen;
+ dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+ dn->dn_compress = ZIO_COMPRESS_INHERIT;
+ dn->dn_dirtyctx = 0;
+
+ dn->dn_free_txg = 0;
+ dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_dirty_txg = 0;
+
+ dn->dn_allocated_txg = tx->tx_txg;
+ dn->dn_id_flags = 0;
+
+ dnode_setdirty(dn, tx);
+ dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+ dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
+}
+
+void
+dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots,
+ boolean_t keep_spill, dmu_tx_t *tx)
+{
+ int nblkptr;
+
+ ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+ ASSERT0(blocksize % SPA_MINBLOCKSIZE);
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+ ASSERT(tx->tx_txg != 0);
+ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype != DMU_OT_NONE && bonuslen != 0) ||
+ (bonustype == DMU_OT_SA && bonuslen == 0));
+ ASSERT(DMU_OT_IS_VALID(bonustype));
+ ASSERT3U(bonuslen, <=,
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+ ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
+
+ dnode_free_interior_slots(dn);
+ DNODE_STAT_BUMP(dnode_reallocate);
+
+ /* clean up any unreferenced dbufs */
+ dnode_evict_dbufs(dn);
+
+ dn->dn_id_flags = 0;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_setdirty(dn, tx);
+ if (dn->dn_datablksz != blocksize) {
+ /* change blocksize */
+ ASSERT0(dn->dn_maxblkid);
+ ASSERT(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
+ dnode_block_freed(dn, 0));
+
+ dnode_setdblksz(dn, blocksize);
+ dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = blocksize;
+ }
+ if (dn->dn_bonuslen != bonuslen)
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = bonuslen;
+
+ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+ nblkptr = 1;
+ else
+ nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
+ if (dn->dn_bonustype != bonustype)
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype;
+ if (dn->dn_nblkptr != nblkptr)
+ dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr;
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
+ dbuf_rm_spill(dn, tx);
+ dnode_rm_spill(dn, tx);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /* change type */
+ dn->dn_type = ot;
+
+ /* change bonus size and type */
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_bonustype = bonustype;
+ dn->dn_bonuslen = bonuslen;
+ dn->dn_num_slots = dn_slots;
+ dn->dn_nblkptr = nblkptr;
+ dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+ dn->dn_compress = ZIO_COMPRESS_INHERIT;
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+
+ /* fix up the bonus db_size */
+ if (dn->dn_bonus) {
+ dn->dn_bonus->db.db_size =
+ DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
+ }
+
+ dn->dn_allocated_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+}
+
+#ifdef _KERNEL
+static void
+dnode_move_impl(dnode_t *odn, dnode_t *ndn)
+{
+ int i;
+
+ ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
+ ASSERT(!MUTEX_HELD(&odn->dn_zfetch.zf_lock));
+
+ /* Copy fields. */
+ ndn->dn_objset = odn->dn_objset;
+ ndn->dn_object = odn->dn_object;
+ ndn->dn_dbuf = odn->dn_dbuf;
+ ndn->dn_handle = odn->dn_handle;
+ ndn->dn_phys = odn->dn_phys;
+ ndn->dn_type = odn->dn_type;
+ ndn->dn_bonuslen = odn->dn_bonuslen;
+ ndn->dn_bonustype = odn->dn_bonustype;
+ ndn->dn_nblkptr = odn->dn_nblkptr;
+ ndn->dn_checksum = odn->dn_checksum;
+ ndn->dn_compress = odn->dn_compress;
+ ndn->dn_nlevels = odn->dn_nlevels;
+ ndn->dn_indblkshift = odn->dn_indblkshift;
+ ndn->dn_datablkshift = odn->dn_datablkshift;
+ ndn->dn_datablkszsec = odn->dn_datablkszsec;
+ ndn->dn_datablksz = odn->dn_datablksz;
+ ndn->dn_maxblkid = odn->dn_maxblkid;
+ ndn->dn_num_slots = odn->dn_num_slots;
+ bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
+ sizeof (odn->dn_next_type));
+ bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+ sizeof (odn->dn_next_nblkptr));
+ bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+ sizeof (odn->dn_next_nlevels));
+ bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+ sizeof (odn->dn_next_indblkshift));
+ bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+ sizeof (odn->dn_next_bonustype));
+ bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+ sizeof (odn->dn_rm_spillblk));
+ bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+ sizeof (odn->dn_next_bonuslen));
+ bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+ sizeof (odn->dn_next_blksz));
+ bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0],
+ sizeof (odn->dn_next_maxblkid));
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_move_tail(&ndn->dn_dirty_records[i],
+ &odn->dn_dirty_records[i]);
+ }
+ bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+ sizeof (odn->dn_free_ranges));
+ ndn->dn_allocated_txg = odn->dn_allocated_txg;
+ ndn->dn_free_txg = odn->dn_free_txg;
+ ndn->dn_assigned_txg = odn->dn_assigned_txg;
+ ndn->dn_dirty_txg = odn->dn_dirty_txg;
+ ndn->dn_dirtyctx = odn->dn_dirtyctx;
+ ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
+ ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
+ zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+ ASSERT(avl_is_empty(&ndn->dn_dbufs));
+ avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
+ ndn->dn_dbufs_count = odn->dn_dbufs_count;
+ ndn->dn_bonus = odn->dn_bonus;
+ ndn->dn_have_spill = odn->dn_have_spill;
+ ndn->dn_zio = odn->dn_zio;
+ ndn->dn_oldused = odn->dn_oldused;
+ ndn->dn_oldflags = odn->dn_oldflags;
+ ndn->dn_olduid = odn->dn_olduid;
+ ndn->dn_oldgid = odn->dn_oldgid;
+ ndn->dn_oldprojid = odn->dn_oldprojid;
+ ndn->dn_newuid = odn->dn_newuid;
+ ndn->dn_newgid = odn->dn_newgid;
+ ndn->dn_newprojid = odn->dn_newprojid;
+ ndn->dn_id_flags = odn->dn_id_flags;
+ dmu_zfetch_init(&ndn->dn_zfetch, NULL);
+ list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
+ ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
+
+ /*
+ * Update back pointers. Updating the handle fixes the back pointer of
+ * every descendant dbuf as well as the bonus dbuf.
+ */
+ ASSERT(ndn->dn_handle->dnh_dnode == odn);
+ ndn->dn_handle->dnh_dnode = ndn;
+ if (ndn->dn_zfetch.zf_dnode == odn) {
+ ndn->dn_zfetch.zf_dnode = ndn;
+ }
+
+ /*
+ * Invalidate the original dnode by clearing all of its back pointers.
+ */
+ odn->dn_dbuf = NULL;
+ odn->dn_handle = NULL;
+ avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+ odn->dn_dbufs_count = 0;
+ odn->dn_bonus = NULL;
+ dmu_zfetch_fini(&odn->dn_zfetch);
+
+ /*
+ * Set the low bit of the objset pointer to ensure that dnode_move()
+ * recognizes the dnode as invalid in any subsequent callback.
+ */
+ POINTER_INVALIDATE(&odn->dn_objset);
+
+ /*
+ * Satisfy the destructor.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&odn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ odn->dn_free_ranges[i] = NULL;
+ odn->dn_next_nlevels[i] = 0;
+ odn->dn_next_indblkshift[i] = 0;
+ odn->dn_next_bonustype[i] = 0;
+ odn->dn_rm_spillblk[i] = 0;
+ odn->dn_next_bonuslen[i] = 0;
+ odn->dn_next_blksz[i] = 0;
+ }
+ odn->dn_allocated_txg = 0;
+ odn->dn_free_txg = 0;
+ odn->dn_assigned_txg = 0;
+ odn->dn_dirty_txg = 0;
+ odn->dn_dirtyctx = 0;
+ odn->dn_dirtyctx_firstset = NULL;
+ odn->dn_have_spill = B_FALSE;
+ odn->dn_zio = NULL;
+ odn->dn_oldused = 0;
+ odn->dn_oldflags = 0;
+ odn->dn_olduid = 0;
+ odn->dn_oldgid = 0;
+ odn->dn_oldprojid = ZFS_DEFAULT_PROJID;
+ odn->dn_newuid = 0;
+ odn->dn_newgid = 0;
+ odn->dn_newprojid = ZFS_DEFAULT_PROJID;
+ odn->dn_id_flags = 0;
+
+ /*
+ * Mark the dnode.
+ */
+ ndn->dn_moved = 1;
+ odn->dn_moved = (uint8_t)-1;
+}
+
+/*ARGSUSED*/
+static kmem_cbrc_t
+dnode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+ dnode_t *odn = buf, *ndn = newbuf;
+ objset_t *os;
+ int64_t refcount;
+ uint32_t dbufs;
+
+ /*
+ * The dnode is on the objset's list of known dnodes if the objset
+ * pointer is valid. We set the low bit of the objset pointer when
+ * freeing the dnode to invalidate it, and the memory patterns written
+ * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
+ * A newly created dnode sets the objset pointer last of all to indicate
+ * that the dnode is known and in a valid state to be moved by this
+ * function.
+ */
+ os = odn->dn_objset;
+ if (!POINTER_IS_VALID(os)) {
+ DNODE_STAT_BUMP(dnode_move_invalid);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * Ensure that the objset does not go away during the move.
+ */
+ rw_enter(&os_lock, RW_WRITER);
+ if (os != odn->dn_objset) {
+ rw_exit(&os_lock);
+ DNODE_STAT_BUMP(dnode_move_recheck1);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * If the dnode is still valid, then so is the objset. We know that no
+ * valid objset can be freed while we hold os_lock, so we can safely
+ * ensure that the objset remains in use.
+ */
+ mutex_enter(&os->os_lock);
+
+ /*
+ * Recheck the objset pointer in case the dnode was removed just before
+ * acquiring the lock.
+ */
+ if (os != odn->dn_objset) {
+ mutex_exit(&os->os_lock);
+ rw_exit(&os_lock);
+ DNODE_STAT_BUMP(dnode_move_recheck2);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * At this point we know that as long as we hold os->os_lock, the dnode
+ * cannot be freed and fields within the dnode can be safely accessed.
+ * The objset listing this dnode cannot go away as long as this dnode is
+ * on its list.
+ */
+ rw_exit(&os_lock);
+ if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_special);
+ return (KMEM_CBRC_NO);
+ }
+ ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
+
+ /*
+ * Lock the dnode handle to prevent the dnode from obtaining any new
+ * holds. This also prevents the descendant dbufs and the bonus dbuf
+ * from accessing the dnode, so that we can discount their holds. The
+ * handle is safe to access because we know that while the dnode cannot
+ * go away, neither can its handle. Once we hold dnh_zrlock, we can
+ * safely move any dnode referenced only by dbufs.
+ */
+ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_handle);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
+ * We need to guarantee that there is a hold for every dbuf in order to
+ * determine whether the dnode is actively referenced. Falsely matching
+ * a dbuf to an active hold would lead to an unsafe move. It's possible
+ * that a thread already having an active dnode hold is about to add a
+ * dbuf, and we can't compare hold and dbuf counts while the add is in
+ * progress.
+ */
+ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_rwlock);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * A dbuf may be removed (evicted) without an active dnode hold. In that
+ * case, the dbuf count is decremented under the handle lock before the
+ * dbuf's hold is released. This order ensures that if we count the hold
+ * after the dbuf is removed but before its hold is released, we will
+ * treat the unmatched hold as active and exit safely. If we count the
+ * hold before the dbuf is removed, the hold is discounted, and the
+ * removal is blocked until the move completes.
+ */
+ refcount = zfs_refcount_count(&odn->dn_holds);
+ ASSERT(refcount >= 0);
+ dbufs = DN_DBUFS_COUNT(odn);
+
+ /* We can't have more dbufs than dnode holds. */
+ ASSERT3U(dbufs, <=, refcount);
+ DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
+ uint32_t, dbufs);
+
+ if (refcount > dbufs) {
+ rw_exit(&odn->dn_struct_rwlock);
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_active);
+ return (KMEM_CBRC_LATER);
+ }
+
+ rw_exit(&odn->dn_struct_rwlock);
+
+ /*
+ * At this point we know that anyone with a hold on the dnode is not
+ * actively referencing it. The dnode is known and in a valid state to
+ * move. We're holding the locks needed to execute the critical section.
+ */
+ dnode_move_impl(odn, ndn);
+
+ list_link_replace(&odn->dn_link, &ndn->dn_link);
+ /* If the dnode was safe to move, the refcount cannot have changed. */
+ ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
+ ASSERT(dbufs == DN_DBUFS_COUNT(ndn));
+ zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
+ mutex_exit(&os->os_lock);
+
+ return (KMEM_CBRC_YES);
+}
+#endif /* _KERNEL */
+
+static void
+dnode_slots_hold(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ zrl_add(&dnh->dnh_zrlock);
+ }
+}
+
+static void
+dnode_slots_rele(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ if (zrl_is_locked(&dnh->dnh_zrlock))
+ zrl_exit(&dnh->dnh_zrlock);
+ else
+ zrl_remove(&dnh->dnh_zrlock);
+ }
+}
+
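+/*
+ * Try to take the zrlock on every handle in [idx, idx + slots). If any
+ * slot is already locked, back out the locks taken so far and return 0;
+ * return 1 once all of the requested slots are held.
+ */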
+static int
+dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ if (!zrl_tryenter(&dnh->dnh_zrlock)) {
+ for (int j = idx; j < i; j++) {
+ dnh = &children->dnc_children[j];
+ zrl_exit(&dnh->dnh_zrlock);
+ }
+
+ return (0);
+ }
+ }
+
+ return (1);
+}
+
+static void
+dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ dnh->dnh_dnode = ptr;
+ }
+}
+
+static boolean_t
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ /*
+ * If all dnode slots are either already free or
+	 * evictable, return B_TRUE.
+ */
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ dnode_t *dn = dnh->dnh_dnode;
+
+ if (dn == DN_SLOT_FREE) {
+ continue;
+ } else if (DN_SLOT_IS_PTR(dn)) {
+ mutex_enter(&dn->dn_mtx);
+ boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
+ zfs_refcount_is_zero(&dn->dn_holds) &&
+ !DNODE_IS_DIRTY(dn));
+ mutex_exit(&dn->dn_mtx);
+
+ if (!can_free)
+ return (B_FALSE);
+ else
+ continue;
+ } else {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+ dnode_destroy(dnh->dnh_dnode);
+ dnh->dnh_dnode = DN_SLOT_FREE;
+ }
+ }
+}
+
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+ dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+ int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+ int idx = (dn->dn_object & (epb - 1)) + 1;
+ int slots = dn->dn_num_slots - 1;
+
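+	/*
+	 * idx is the slot immediately after this dnode's own slot and
+	 * slots is the number of extra (interior) slots it consumes; a
+	 * single-slot dnode has no interior slots to release.
+	 */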
+ if (slots == 0)
+ return;
+
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ while (!dnode_slots_tryenter(children, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+ cond_resched();
+ }
+
+ dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+ dnode_slots_rele(children, idx, slots);
+}
+
+void
+dnode_special_close(dnode_handle_t *dnh)
+{
+ dnode_t *dn = dnh->dnh_dnode;
+
+ /*
+	 * Ensure dnode_rele_and_unlock() has released dn_mtx after the
+	 * final zfs_refcount_remove().
+ */
+ mutex_enter(&dn->dn_mtx);
+ if (zfs_refcount_count(&dn->dn_holds) > 0)
+ cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
+ mutex_exit(&dn->dn_mtx);
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);
+
+ ASSERT(dn->dn_dbuf == NULL ||
+ dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
+ zrl_add(&dnh->dnh_zrlock);
+ dnode_destroy(dn); /* implicit zrl_remove() */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = NULL;
+}
+
+void
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+ dnode_handle_t *dnh)
+{
+ dnode_t *dn;
+
+ zrl_init(&dnh->dnh_zrlock);
+ VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));
+
+ dn = dnode_create(os, dnp, NULL, object, dnh);
+ DNODE_VERIFY(dn);
+
+ zrl_exit(&dnh->dnh_zrlock);
+}
+
+static void
+dnode_buf_evict_async(void *dbu)
+{
+ dnode_children_t *dnc = dbu;
+
+ DNODE_STAT_BUMP(dnode_buf_evict);
+
+ for (int i = 0; i < dnc->dnc_count; i++) {
+ dnode_handle_t *dnh = &dnc->dnc_children[i];
+ dnode_t *dn;
+
+ /*
+ * The dnode handle lock guards against the dnode moving to
+ * another valid address, so there is no need here to guard
+ * against changes to or from NULL.
+ */
+ if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = DN_SLOT_UNINIT;
+ continue;
+ }
+
+ zrl_add(&dnh->dnh_zrlock);
+ dn = dnh->dnh_dnode;
+ /*
+ * If there are holds on this dnode, then there should
+ * be holds on the dnode's containing dbuf as well; thus
+ * it wouldn't be eligible for eviction and this function
+ * would not have been called.
+ */
+ ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
+ ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
+
+ dnode_destroy(dn); /* implicit zrl_remove() for first slot */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = DN_SLOT_UNINIT;
+ }
+ kmem_free(dnc, sizeof (dnode_children_t) +
+ dnc->dnc_count * sizeof (dnode_handle_t));
+}
+
+/*
+ * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
+ * to ensure the hole at the specified object offset is large enough to
+ * hold the dnode being created. The slots parameter is also used to ensure
+ * a dnode does not span multiple dnode blocks. In both of these cases, if
+ * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
+ * are only possible when using DNODE_MUST_BE_FREE.
+ *
+ * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
+ * dnode_hold_impl() will check if the requested dnode is already consumed
+ * as an extra dnode slot by a large dnode, in which case it returns
+ * ENOENT.
+ *
+ * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
+ * return whether the hold would succeed or not. tag and dnp should be set to
+ * NULL in this case.
+ *
+ * errors:
+ * EINVAL - Invalid object number or flags.
+ * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
+ * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
+ * - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
+ * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
+ * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
+ * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
+ * EIO - I/O error when reading the meta dnode dbuf.
+ *
+ * Note that, unlike dnode_hold(), dnode_hold_impl() can succeed even
+ * for free dnodes (when DNODE_MUST_BE_FREE is set).
+ */
+int
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
+ void *tag, dnode_t **dnp)
+{
+ int epb, idx, err;
+ int drop_struct_lock = FALSE;
+ int type;
+ uint64_t blk;
+ dnode_t *mdn, *dn;
+ dmu_buf_impl_t *db;
+ dnode_children_t *dnc;
+ dnode_phys_t *dn_block;
+ dnode_handle_t *dnh;
+
+ ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+ ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+ IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));
+
+ /*
+ * If you are holding the spa config lock as writer, you shouldn't
+ * be asking the DMU to do *anything* unless it's the root pool
+ * which may require us to read from the root filesystem while
+ * holding some (not all) of the locks as writer.
+ */
+ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
+ (spa_is_root(os->os_spa) &&
+ spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
+
+ ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
+
+ if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
+ object == DMU_PROJECTUSED_OBJECT) {
+ if (object == DMU_USERUSED_OBJECT)
+ dn = DMU_USERUSED_DNODE(os);
+ else if (object == DMU_GROUPUSED_OBJECT)
+ dn = DMU_GROUPUSED_DNODE(os);
+ else
+ dn = DMU_PROJECTUSED_DNODE(os);
+ if (dn == NULL)
+ return (SET_ERROR(ENOENT));
+ type = dn->dn_type;
+ if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
+ return (SET_ERROR(ENOENT));
+ if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
+ return (SET_ERROR(EEXIST));
+ DNODE_VERIFY(dn);
+ /* Don't actually hold if dry run, just return 0 */
+ if (!(flag & DNODE_DRY_RUN)) {
+ (void) zfs_refcount_add(&dn->dn_holds, tag);
+ *dnp = dn;
+ }
+ return (0);
+ }
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return (SET_ERROR(EINVAL));
+
+ mdn = DMU_META_DNODE(os);
+ ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
+
+ DNODE_VERIFY(mdn);
+
+ if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
+ rw_enter(&mdn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
+ db = dbuf_hold(mdn, blk, FTAG);
+ if (drop_struct_lock)
+ rw_exit(&mdn->dn_struct_rwlock);
+ if (db == NULL) {
+ DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * We do not need to decrypt to read the dnode so it doesn't matter
+ * if we get the encrypted or decrypted version.
+ */
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL |
+ DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
+ if (err) {
+ DNODE_STAT_BUMP(dnode_hold_dbuf_read);
+ dbuf_rele(db, FTAG);
+ return (err);
+ }
+
+ ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
+ epb = db->db.db_size >> DNODE_SHIFT;
+
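+	/*
+	 * epb is the number of dnode slots per dnode block; idx is this
+	 * object's slot within that block.
+	 */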
+ idx = object & (epb - 1);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
+ dnc = dmu_buf_get_user(&db->db);
+ dnh = NULL;
+ if (dnc == NULL) {
+ dnode_children_t *winner;
+ int skip = 0;
+
+ dnc = kmem_zalloc(sizeof (dnode_children_t) +
+ epb * sizeof (dnode_handle_t), KM_SLEEP);
+ dnc->dnc_count = epb;
+ dnh = &dnc->dnc_children[0];
+
+ /* Initialize dnode slot status from dnode_phys_t */
+ for (int i = 0; i < epb; i++) {
+ zrl_init(&dnh[i].dnh_zrlock);
+
+ if (skip) {
+ skip--;
+ continue;
+ }
+
+ if (dn_block[i].dn_type != DMU_OT_NONE) {
+ int interior = dn_block[i].dn_extra_slots;
+
+ dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
+ dnode_set_slots(dnc, i + 1, interior,
+ DN_SLOT_INTERIOR);
+ skip = interior;
+ } else {
+ dnh[i].dnh_dnode = DN_SLOT_FREE;
+ skip = 0;
+ }
+ }
+
+ dmu_buf_init_user(&dnc->dnc_dbu, NULL,
+ dnode_buf_evict_async, NULL);
+ winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
+ if (winner != NULL) {
+
+ for (int i = 0; i < epb; i++)
+ zrl_destroy(&dnh[i].dnh_zrlock);
+
+ kmem_free(dnc, sizeof (dnode_children_t) +
+ epb * sizeof (dnode_handle_t));
+ dnc = winner;
+ }
+ }
+
+ ASSERT(dnc->dnc_count == epb);
+
+ if (flag & DNODE_MUST_BE_ALLOCATED) {
+ slots = 1;
+
+ dnode_slots_hold(dnc, idx, slots);
+ dnh = &dnc->dnc_children[idx];
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_interior);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EEXIST));
+ } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOENT));
+ } else {
+ dnode_slots_rele(dnc, idx, slots);
+ while (!dnode_slots_tryenter(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
+ cond_resched();
+ }
+
+ /*
+ * Someone else won the race and called dnode_create()
+ * after we checked DN_SLOT_IS_PTR() above but before
+ * we acquired the lock.
+ */
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ /* Don't actually hold if dry run, just return 0 */
+ if (flag & DNODE_DRY_RUN) {
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (0);
+ }
+
+ DNODE_STAT_BUMP(dnode_hold_alloc_hits);
+ } else if (flag & DNODE_MUST_BE_FREE) {
+
+ if (idx + slots - 1 >= DNODES_PER_BLOCK) {
+ DNODE_STAT_BUMP(dnode_hold_free_overflow);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dnode_slots_hold(dnc, idx, slots);
+
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dnode_slots_rele(dnc, idx, slots);
+ while (!dnode_slots_tryenter(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
+ cond_resched();
+ }
+
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ /*
+ * Allocated but otherwise free dnodes which would
+		 * be in the interior of a multi-slot dnode need
+ * to be freed. Single slot dnodes can be safely
+ * re-purposed as a performance optimization.
+ */
+ if (slots > 1)
+ dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
+ dnh = &dnc->dnc_children[idx];
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
+ DNODE_STAT_BUMP(dnode_hold_free_refcount);
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /* Don't actually hold if dry run, just return 0 */
+ if (flag & DNODE_DRY_RUN) {
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (0);
+ }
+
+ dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
+ DNODE_STAT_BUMP(dnode_hold_free_hits);
+ } else {
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ ASSERT0(dn->dn_free_txg);
+
+ if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
+ dbuf_add_ref(db, dnh);
+
+ mutex_exit(&dn->dn_mtx);
+
+ /* Now we can rely on the hold to prevent the dnode from moving. */
+ dnode_slots_rele(dnc, idx, slots);
+
+ DNODE_VERIFY(dn);
+ ASSERT3P(dnp, !=, NULL);
+ ASSERT3P(dn->dn_dbuf, ==, db);
+ ASSERT3U(dn->dn_object, ==, object);
+ dbuf_rele(db, FTAG);
+
+ *dnp = dn;
+ return (0);
+}
+
+/*
+ * Return held dnode if the object is allocated, NULL if not.
+ */
+int
+dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
+{
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+ dnp));
+}
+
+/*
+ * Can only add a reference if there is already at least one
+ * reference on the dnode. Returns FALSE if unable to add a
+ * new reference.
+ */
+boolean_t
+dnode_add_ref(dnode_t *dn, void *tag)
+{
+ mutex_enter(&dn->dn_mtx);
+ if (zfs_refcount_is_zero(&dn->dn_holds)) {
+ mutex_exit(&dn->dn_mtx);
+ return (FALSE);
+ }
+ VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
+ mutex_exit(&dn->dn_mtx);
+ return (TRUE);
+}
+
+void
+dnode_rele(dnode_t *dn, void *tag)
+{
+ mutex_enter(&dn->dn_mtx);
+ dnode_rele_and_unlock(dn, tag, B_FALSE);
+}
+
+void
+dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
+{
+ uint64_t refs;
+	/* Get these while the hold still prevents the dnode from moving. */
+ dmu_buf_impl_t *db = dn->dn_dbuf;
+ dnode_handle_t *dnh = dn->dn_handle;
+
+ refs = zfs_refcount_remove(&dn->dn_holds, tag);
+ if (refs == 0)
+ cv_broadcast(&dn->dn_nodnholds);
+ mutex_exit(&dn->dn_mtx);
+ /* dnode could get destroyed at this point, so don't use it anymore */
+
+ /*
+ * It's unsafe to release the last hold on a dnode by dnode_rele() or
+ * indirectly by dbuf_rele() while relying on the dnode handle to
+ * prevent the dnode from moving, since releasing the last hold could
+ * result in the dnode's parent dbuf evicting its dnode handles. For
+ * that reason anyone calling dnode_rele() or dbuf_rele() without some
+ * other direct or indirect hold on the dnode must first drop the dnode
+ * handle.
+ */
+ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+
+ /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
+ if (refs == 0 && db != NULL) {
+ /*
+ * Another thread could add a hold to the dnode handle in
+ * dnode_hold_impl() while holding the parent dbuf. Since the
+ * hold on the parent dbuf prevents the handle from being
+ * destroyed, the hold on the handle is OK. We can't yet assert
+ * that the handle has zero references, but that will be
+ * asserted anyway when the handle gets destroyed.
+ */
+ mutex_enter(&db->db_mtx);
+ dbuf_rele_and_unlock(db, dnh, evicting);
+ }
+}
+
+/*
+ * Test whether we can create a dnode at the specified location.
+ */
+int
+dnode_try_claim(objset_t *os, uint64_t object, int slots)
+{
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
+ slots, NULL, NULL));
+}
+
+void
+dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
+{
+ objset_t *os = dn->dn_objset;
+ uint64_t txg = tx->tx_txg;
+
+ if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ dsl_dataset_dirty(os->os_dsl_dataset, tx);
+ return;
+ }
+
+ DNODE_VERIFY(dn);
+
+#ifdef ZFS_DEBUG
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
+ ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
+ mutex_exit(&dn->dn_mtx);
+#endif
+
+ /*
+ * Determine old uid/gid when necessary
+ */
+ dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
+
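+	/*
+	 * Each txg has its own multilist of dirty dnodes; lock the
+	 * sublist this dnode hashes to before checking for and
+	 * inserting it.
+	 */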
+ multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
+ multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
+
+ /*
+ * If we are already marked dirty, we're done.
+ */
+ if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
+ multilist_sublist_unlock(mls);
+ return;
+ }
+
+ ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
+ !avl_is_empty(&dn->dn_dbufs));
+ ASSERT(dn->dn_datablksz != 0);
+ ASSERT0(dn->dn_next_bonuslen[txg & TXG_MASK]);
+ ASSERT0(dn->dn_next_blksz[txg & TXG_MASK]);
+ ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]);
+
+ dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
+ dn->dn_object, txg);
+
+ multilist_sublist_insert_head(mls, dn);
+
+ multilist_sublist_unlock(mls);
+
+ /*
+ * The dnode maintains a hold on its containing dbuf as
+ * long as there are holds on it. Each instantiated child
+ * dbuf maintains a hold on the dnode. When the last child
+ * drops its hold, the dnode will drop its hold on the
+ * containing dbuf. We add a "dirty hold" here so that the
+ * dnode will hang around after we finish processing its
+ * children.
+ */
+ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
+
+ (void) dbuf_dirty(dn->dn_dbuf, tx);
+
+ dsl_dataset_dirty(os->os_dsl_dataset, tx);
+}
+
+void
+dnode_free(dnode_t *dn, dmu_tx_t *tx)
+{
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
+ mutex_exit(&dn->dn_mtx);
+ return;
+ }
+ dn->dn_free_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+
+ dnode_setdirty(dn, tx);
+}
+
+/*
+ * Try to change the block size for the indicated dnode. This can only
+ * succeed if there are no blocks allocated or dirty beyond the first block.
+ */
+int
+dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ int err;
+
+ ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+ if (size == 0)
+ size = SPA_MINBLOCKSIZE;
+ else
+ size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
+
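+	/*
+	 * A requested ibs equal to the current indirect block shift is
+	 * a no-op; ibs == 0 below means leave dn_indblkshift unchanged.
+	 */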
+ if (ibs == dn->dn_indblkshift)
+ ibs = 0;
+
+ if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+ return (0);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* Check for any allocated blocks beyond the first */
+ if (dn->dn_maxblkid != 0)
+ goto fail;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = avl_first(&dn->dn_dbufs); db != NULL;
+ db = AVL_NEXT(&dn->dn_dbufs, db)) {
+ if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
+ mutex_exit(&dn->dn_dbufs_mtx);
+ goto fail;
+ }
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ if (ibs && dn->dn_nlevels != 1)
+ goto fail;
+
+ /* resize the old block */
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
+ if (err == 0) {
+ dbuf_new_size(db, size, tx);
+ } else if (err != ENOENT) {
+ goto fail;
+ }
+
+ dnode_setdblksz(dn, size);
+ dnode_setdirty(dn, tx);
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
+ if (ibs) {
+ dn->dn_indblkshift = ibs;
+ dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+ }
+ /* release after we have fixed the blocksize in the dnode */
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+ return (0);
+
+fail:
+ rw_exit(&dn->dn_struct_rwlock);
+ return (SET_ERROR(ENOTSUP));
+}
+
+static void
+dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
+{
+ uint64_t txgoff = tx->tx_txg & TXG_MASK;
+ int old_nlevels = dn->dn_nlevels;
+ dmu_buf_impl_t *db;
+ list_t *list;
+ dbuf_dirty_record_t *new, *dr, *dr_next;
+
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ ASSERT3U(new_nlevels, >, dn->dn_nlevels);
+ dn->dn_nlevels = new_nlevels;
+
+ ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
+ dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+ /* dirty the left indirects */
+ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
+ new = dbuf_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+
+ /* transfer the dirty records to the new indirect */
+ mutex_enter(&dn->dn_mtx);
+ mutex_enter(&new->dt.di.dr_mtx);
+ list = &dn->dn_dirty_records[txgoff];
+ for (dr = list_head(list); dr; dr = dr_next) {
+ dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
+
+ IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1);
+ if (dr->dr_dbuf == NULL ||
+ (dr->dr_dbuf->db_level == old_nlevels - 1 &&
+ dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) {
+ list_remove(&dn->dn_dirty_records[txgoff], dr);
+ list_insert_tail(&new->dt.di.dr_children, dr);
+ dr->dr_parent = new;
+ }
+ }
+ mutex_exit(&new->dt.di.dr_mtx);
+ mutex_exit(&dn->dn_mtx);
+}
+
+int
+dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
+{
+ int ret = 0;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ if (dn->dn_nlevels == nlevels) {
+ ret = 0;
+ goto out;
+ } else if (nlevels < dn->dn_nlevels) {
+ ret = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ dnode_set_nlevels_impl(dn, nlevels, tx);
+
+out:
+ rw_exit(&dn->dn_struct_rwlock);
+ return (ret);
+}
+
+/* read-holding callers must not rely on the lock being continuously held */
+void
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
+ boolean_t force)
+{
+ int epbs, new_nlevels;
+ uint64_t sz;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+
+ ASSERT(have_read ?
+ RW_READ_HELD(&dn->dn_struct_rwlock) :
+ RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ /*
+ * if we have a read-lock, check to see if we need to do any work
+ * before upgrading to a write-lock.
+ */
+ if (have_read) {
+ if (blkid <= dn->dn_maxblkid)
+ return;
+
+ if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ }
+ }
+
+ /*
+ * Raw sends (indicated by the force flag) require that we take the
+ * given blkid even if the value is lower than the current value.
+ */
+ if (!force && blkid <= dn->dn_maxblkid)
+ goto out;
+
+ /*
+ * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
+ * to indicate that this field is set. This allows us to set the
+ * maxblkid to 0 on an existing object in dnode_sync().
+ */
+ dn->dn_maxblkid = blkid;
+ dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
+ blkid | DMU_NEXT_MAXBLKID_SET;
+
+ /*
+ * Compute the number of levels necessary to support the new maxblkid.
+ * Raw sends will ensure nlevels is set correctly for us.
+ */
+ new_nlevels = 1;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
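+	/*
+	 * epbs is log2 of the number of block pointers per indirect
+	 * block; each additional level multiplies the addressable
+	 * range by 2^epbs.
+	 */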
+ for (sz = dn->dn_nblkptr;
+ sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
+ new_nlevels++;
+
+ ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
+
+ if (!force) {
+ if (new_nlevels > dn->dn_nlevels)
+ dnode_set_nlevels_impl(dn, new_nlevels, tx);
+ } else {
+ ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
+ }
+
+out:
+ if (have_read)
+ rw_downgrade(&dn->dn_struct_rwlock);
+}
+
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+ if (db != NULL) {
+ dmu_buf_will_dirty(&db->db, tx);
+ dbuf_rele(db, FTAG);
+ }
+}
+
+/*
+ * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
+ * and end_blkid.
+ */
+static void
+dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db_search;
+ dmu_buf_impl_t *db;
+ avl_index_t where;
+
+ db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+
+ db_search->db_level = 1;
+ db_search->db_blkid = start_blkid + 1;
+ db_search->db_state = DB_SEARCH;
+ for (;;) {
+
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+ if (db == NULL || db->db_level != 1 ||
+ db->db_blkid >= end_blkid) {
+ break;
+ }
+
+ /*
+		 * Set up the next blkid we want to search for.
+ */
+ db_search->db_blkid = db->db_blkid + 1;
+ ASSERT3U(db->db_blkid, >=, start_blkid);
+
+ /*
+ * If the dbuf transitions to DB_EVICTING while we're trying
+ * to dirty it, then we will be unable to discover it in
+ * the dbuf hash table. This will result in a call to
+ * dbuf_create() which needs to acquire the dn_dbufs_mtx
+ * lock. To avoid a deadlock, we drop the lock before
+ * dirtying the level-1 dbuf.
+ */
+ mutex_exit(&dn->dn_dbufs_mtx);
+ dnode_dirty_l1(dn, db->db_blkid, tx);
+ mutex_enter(&dn->dn_dbufs_mtx);
+ }
+
+#ifdef ZFS_DEBUG
+ /*
+ * Walk all the in-core level-1 dbufs and verify they have been dirtied.
+ */
+ db_search->db_level = 1;
+ db_search->db_blkid = start_blkid + 1;
+ db_search->db_state = DB_SEARCH;
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+ for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
+ if (db->db_level != 1 || db->db_blkid >= end_blkid)
+ break;
+ if (db->db_state != DB_EVICTING)
+ ASSERT(db->db_dirtycnt > 0);
+ }
+#endif
+ kmem_free(db_search, sizeof (dmu_buf_impl_t));
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+void
+dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag)
+{
+ /*
+ * Don't set dirtyctx to SYNC if we're just modifying this as we
+ * initialize the objset.
+ */
+ if (dn->dn_dirtyctx == DN_UNDIRTIED) {
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+
+ if (ds != NULL) {
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag);
+ }
+ if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+ if (dmu_tx_is_syncing(tx))
+ dn->dn_dirtyctx = DN_DIRTY_SYNC;
+ else
+ dn->dn_dirtyctx = DN_DIRTY_OPEN;
+ dn->dn_dirtyctx_firstset = tag;
+ }
+ if (ds != NULL) {
+ rrw_exit(&ds->ds_bp_rwlock, tag);
+ }
+ }
+}
+
+void
+dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ uint64_t blkoff, blkid, nblks;
+ int blksz, blkshift, head, tail;
+ int trunc = FALSE;
+ int epbs;
+
+ blksz = dn->dn_datablksz;
+ blkshift = dn->dn_datablkshift;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ if (len == DMU_OBJECT_END) {
+ len = UINT64_MAX - off;
+ trunc = TRUE;
+ }
+
+ /*
+ * First, block align the region to free:
+ */
+ if (ISP2(blksz)) {
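+		/*
+		 * head is the number of bytes from off up to the next
+		 * block boundary; blkoff is the offset of off within
+		 * its block.
+		 */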
+ head = P2NPHASE(off, blksz);
+ blkoff = P2PHASE(off, blksz);
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ return;
+ } else {
+ ASSERT(dn->dn_maxblkid == 0);
+ if (off == 0 && len >= blksz) {
+ /*
+ * Freeing the whole block; fast-track this request.
+ */
+ blkid = 0;
+ nblks = 1;
+ if (dn->dn_nlevels > 1) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_dirty_l1(dn, 0, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+ goto done;
+ } else if (off >= blksz) {
+ /* Freeing past end-of-data */
+ return;
+ } else {
+ /* Freeing part of the block. */
+ head = blksz - off;
+ ASSERT3U(head, >, 0);
+ }
+ blkoff = off;
+ }
+ /* zero out any partial block data at the start of the range */
+ if (head) {
+ int res;
+ ASSERT3U(blkoff + head, ==, blksz);
+ if (len < head)
+ head = len;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+ TRUE, FALSE, FTAG, &db);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (res == 0) {
+ caddr_t data;
+ boolean_t dirty;
+
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER,
+ FTAG);
+ /* don't dirty if it isn't on disk and isn't dirty */
+ dirty = !list_is_empty(&db->db_dirty_records) ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+ dmu_buf_unlock_parent(db, dblt, FTAG);
+ if (dirty) {
+ dmu_buf_will_dirty(&db->db, tx);
+ data = db->db.db_data;
+ bzero(data + blkoff, head);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ off += head;
+ len -= head;
+ }
+
+ /* If the range was less than one block, we're done */
+ if (len == 0)
+ return;
+
+ /* If the remaining range is past end of file, we're done */
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ return;
+
+ ASSERT(ISP2(blksz));
+ if (trunc)
+ tail = 0;
+ else
+ tail = P2PHASE(len, blksz);
+
+ ASSERT0(P2PHASE(off, blksz));
+ /* zero out any partial block data at the end of the range */
+ if (tail) {
+ int res;
+ if (len < tail)
+ tail = len;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+ TRUE, FALSE, FTAG, &db);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (res == 0) {
+ boolean_t dirty;
+ /* don't dirty if not on disk and not dirty */
+ db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER,
+ FTAG);
+ dirty = !list_is_empty(&db->db_dirty_records) ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+ dmu_buf_unlock_parent(db, type, FTAG);
+ if (dirty) {
+ dmu_buf_will_dirty(&db->db, tx);
+ bzero(db->db.db_data, tail);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ len -= tail;
+ }
+
+ /* If the range did not include a full block, we are done */
+ if (len == 0)
+ return;
+
+ ASSERT(IS_P2ALIGNED(off, blksz));
+ ASSERT(trunc || IS_P2ALIGNED(len, blksz));
+ blkid = off >> blkshift;
+ nblks = len >> blkshift;
+ if (trunc)
+ nblks += 1;
+
+ /*
+ * Dirty all the indirect blocks in this range. Note that only
+ * the first and last indirect blocks can actually be written
+ * (if they were partially freed) -- they must be dirtied, even if
+ * they do not exist on disk yet. The interior blocks will
+ * be freed by free_children(), so they will not actually be written.
+ * Even though these interior blocks will not be written, we
+ * dirty them for two reasons:
+ *
+ * - It ensures that the indirect blocks remain in memory until
+ * syncing context. (They have already been prefetched by
+ * dmu_tx_hold_free(), so we don't have to worry about reading
+ * them serially here.)
+ *
+ * - The dirty space accounting will put pressure on the txg sync
+ * mechanism to begin syncing, and to delay transactions if there
+ * is a large amount of freeing. Even though these indirect
+ * blocks will not be written, we could need to write the same
+ * amount of space if we copy the freed BPs into deadlists.
+ */
+ if (dn->dn_nlevels > 1) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ uint64_t first, last;
+
+ first = blkid >> epbs;
+ dnode_dirty_l1(dn, first, tx);
+ if (trunc)
+ last = dn->dn_maxblkid >> epbs;
+ else
+ last = (blkid + nblks - 1) >> epbs;
+ if (last != first)
+ dnode_dirty_l1(dn, last, tx);
+
+ dnode_dirty_l1range(dn, first, last, tx);
+
+ int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ for (uint64_t i = first + 1; i < last; i++) {
+ /*
+ * Set i to the blockid of the next non-hole
+ * level-1 indirect block at or after i. Note
+ * that dnode_next_offset() operates in terms of
+ * level-0-equivalent bytes.
+ */
+ uint64_t ibyte = i << shift;
+ int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+ &ibyte, 2, 1, 0);
+ i = ibyte >> shift;
+ if (i >= last)
+ break;
+
+ /*
+ * Normally we should not see an error, either
+ * from dnode_next_offset() or dbuf_hold_level()
+ * (except for ESRCH from dnode_next_offset).
+ * If there is an i/o error, then when we read
+ * this block in syncing context, it will use
+ * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+ * to the "failmode" property. dnode_next_offset()
+ * doesn't have a flag to indicate MUSTSUCCEED.
+ */
+ if (err != 0)
+ break;
+
+ dnode_dirty_l1(dn, i, tx);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+done:
+ /*
+ * Add this range to the dnode range list.
+ * We will finish up this free operation in the syncing phase.
+ */
+ mutex_enter(&dn->dn_mtx);
+ {
+ int txgoff = tx->tx_txg & TXG_MASK;
+ if (dn->dn_free_ranges[txgoff] == NULL) {
+ dn->dn_free_ranges[txgoff] = range_tree_create(NULL,
+ RANGE_SEG64, NULL, 0, 0);
+ }
+ range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+ range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
+ }
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+ blkid, nblks, tx->tx_txg);
+ mutex_exit(&dn->dn_mtx);
+
+ dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
+ dnode_setdirty(dn, tx);
+}
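+
+/*
+ * Illustrative sketch, not part of the upstream change: the head/tail
+ * arithmetic dnode_free_range() uses above for a power-of-2 block size,
+ * ignoring the truncation and single-block special cases the function
+ * also handles.  "head" is the partial data zeroed at the front of the
+ * range and "tail" the partial data zeroed at the back; only the whole
+ * blocks in between are recorded in the free range.  The function name
+ * is hypothetical.
+ */
+static inline void
+example_partial_blocks(uint64_t off, uint64_t len, uint64_t blksz,
+ uint64_t *head, uint64_t *tail)
+{
+ *head = P2PHASE(off, blksz) == 0 ? 0 : blksz - P2PHASE(off, blksz);
+ if (*head > len)
+ *head = len;
+ *tail = P2PHASE(off + len, blksz);
+}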
+
+static boolean_t
+dnode_spill_freed(dnode_t *dn)
+{
+ int i;
+
+ mutex_enter(&dn->dn_mtx);
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
+ break;
+ }
+ mutex_exit(&dn->dn_mtx);
+ return (i < TXG_SIZE);
+}
+
+/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
+uint64_t
+dnode_block_freed(dnode_t *dn, uint64_t blkid)
+{
+ void *dp = spa_get_dsl(dn->dn_objset->os_spa);
+ int i;
+
+ if (blkid == DMU_BONUS_BLKID)
+ return (FALSE);
+
+ /*
+ * If we're in the process of opening the pool, dp will not be
+ * set yet, but there shouldn't be anything dirty.
+ */
+ if (dp == NULL)
+ return (FALSE);
+
+ if (dn->dn_free_txg)
+ return (TRUE);
+
+ if (blkid == DMU_SPILL_BLKID)
+ return (dnode_spill_freed(dn));
+
+ mutex_enter(&dn->dn_mtx);
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (dn->dn_free_ranges[i] != NULL &&
+ range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
+ break;
+ }
+ mutex_exit(&dn->dn_mtx);
+ return (i < TXG_SIZE);
+}
+
+/* call from syncing context when we actually write/free space for this dnode */
+void
+dnode_diduse_space(dnode_t *dn, int64_t delta)
+{
+ uint64_t space;
+ dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
+ dn, dn->dn_phys,
+ (u_longlong_t)dn->dn_phys->dn_used,
+ (longlong_t)delta);
+
+ mutex_enter(&dn->dn_mtx);
+ space = DN_USED_BYTES(dn->dn_phys);
+ if (delta > 0) {
+ ASSERT3U(space + delta, >=, space); /* no overflow */
+ } else {
+ ASSERT3U(space, >=, -delta); /* no underflow */
+ }
+ space += delta;
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
+ ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
+ ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
+ dn->dn_phys->dn_used = space >> DEV_BSHIFT;
+ } else {
+ dn->dn_phys->dn_used = space;
+ dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
+ }
+ mutex_exit(&dn->dn_mtx);
+}
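+
+/*
+ * Illustrative sketch, not part of the upstream change: how a reader of
+ * dn_used interprets the two encodings written by dnode_diduse_space()
+ * above.  Pools older than SPA_VERSION_DNODE_BYTES store the value in
+ * 512-byte units; newer pools store bytes and set DNODE_FLAG_USED_BYTES.
+ * This mirrors roughly what the DN_USED_BYTES() macro does; the function
+ * name is hypothetical.
+ */
+static inline uint64_t
+example_dnode_used_bytes(const dnode_phys_t *dnp)
+{
+ if (dnp->dn_flags & DNODE_FLAG_USED_BYTES)
+ return (dnp->dn_used);
+ return (dnp->dn_used << DEV_BSHIFT);
+}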
+
+/*
+ * Scans a block at the indicated "level" looking for a hole or data,
+ * depending on 'flags'.
+ *
+ * If level > 0, then we are scanning an indirect block looking at its
+ * pointers. If level == 0, then we are looking at a block of dnodes.
+ *
+ * If we don't find what we are looking for in the block, we return ESRCH.
+ * Otherwise, return with *offset pointing to the beginning (if searching
+ * forwards) or end (if searching backwards) of the range covered by the
+ * block pointer we matched on (or dnode).
+ *
+ * The basic search algorithm used below by dnode_next_offset() is to
+ * use this function to search up the block tree (widen the search) until
+ * we find something (i.e., we don't return ESRCH) and then search back
+ * down the tree (narrow the search) until we reach our original search
+ * level.
+ */
+static int
+dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
+ int lvl, uint64_t blkfill, uint64_t txg)
+{
+ dmu_buf_impl_t *db = NULL;
+ void *data = NULL;
+ uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t epb = 1ULL << epbs;
+ uint64_t minfill, maxfill;
+ boolean_t hole;
+ int i, inc, error, span;
+
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+ hole = ((flags & DNODE_FIND_HOLE) != 0);
+ inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
+ ASSERT(txg == 0 || !hole);
+
+ if (lvl == dn->dn_phys->dn_nlevels) {
+ error = 0;
+ epb = dn->dn_phys->dn_nblkptr;
+ data = dn->dn_phys->dn_blkptr;
+ } else {
+ uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
+ if (error) {
+ if (error != ENOENT)
+ return (error);
+ if (hole)
+ return (0);
+ /*
+ * This can only happen when we are searching up
+ * the block tree for data. We don't really need to
+ * adjust the offset, as we will just end up looking
+ * at the pointer to this block in its parent, and it's
+ * going to be unallocated, so we will skip over it.
+ */
+ return (SET_ERROR(ESRCH));
+ }
+ error = dbuf_read(db, NULL,
+ DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
+ DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
+ if (error) {
+ dbuf_rele(db, FTAG);
+ return (error);
+ }
+ data = db->db.db_data;
+ rw_enter(&db->db_rwlock, RW_READER);
+ }
+
+ if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
+ db->db_blkptr->blk_birth <= txg ||
+ BP_IS_HOLE(db->db_blkptr))) {
+ /*
+ * This can only happen when we are searching up the tree
+ * and these conditions mean that we need to keep climbing.
+ */
+ error = SET_ERROR(ESRCH);
+ } else if (lvl == 0) {
+ dnode_phys_t *dnp = data;
+
+ ASSERT(dn->dn_type == DMU_OT_DNODE);
+ ASSERT(!(flags & DNODE_FIND_BACKWARDS));
+
+ for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+ i < blkfill; i += dnp[i].dn_extra_slots + 1) {
+ if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
+ break;
+ }
+
+ if (i == blkfill)
+ error = SET_ERROR(ESRCH);
+
+ *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+ (i << DNODE_SHIFT);
+ } else {
+ blkptr_t *bp = data;
+ uint64_t start = *offset;
+ span = (lvl - 1) * epbs + dn->dn_datablkshift;
+ minfill = 0;
+ maxfill = blkfill << ((lvl - 1) * epbs);
+
+ if (hole)
+ maxfill--;
+ else
+ minfill++;
+
+ if (span >= 8 * sizeof (*offset)) {
+ /* This only happens on the highest indirection level */
+ ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
+ *offset = 0;
+ } else {
+ *offset = *offset >> span;
+ }
+
+ for (i = BF64_GET(*offset, 0, epbs);
+ i >= 0 && i < epb; i += inc) {
+ if (BP_GET_FILL(&bp[i]) >= minfill &&
+ BP_GET_FILL(&bp[i]) <= maxfill &&
+ (hole || bp[i].blk_birth > txg))
+ break;
+ if (inc > 0 || *offset > 0)
+ *offset += inc;
+ }
+
+ if (span >= 8 * sizeof (*offset)) {
+ *offset = start;
+ } else {
+ *offset = *offset << span;
+ }
+
+ if (inc < 0) {
+ /* traversing backwards; position offset at the end */
+ ASSERT3U(*offset, <=, start);
+ *offset = MIN(*offset + (1ULL << span) - 1, start);
+ } else if (*offset < start) {
+ *offset = start;
+ }
+ if (i < 0 || i >= epb)
+ error = SET_ERROR(ESRCH);
+ }
+
+ if (db != NULL) {
+ rw_exit(&db->db_rwlock);
+ dbuf_rele(db, FTAG);
+ }
+
+ return (error);
+}
+
+/*
+ * Find the next hole, data, or sparse region at or after *offset.
+ * The value 'blkfill' tells us how many items we expect to find
+ * in an L0 data block; this value is 1 for normal objects,
+ * DNODES_PER_BLOCK for the meta dnode, and some fraction of
+ * DNODES_PER_BLOCK when searching for sparse regions thereof.
+ *
+ * Examples:
+ *
+ * dnode_next_offset(dn, flags, offset, 1, 1, 0);
+ * Finds the next/previous hole/data in a file.
+ * Used in dmu_offset_next().
+ *
+ * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
+ * Finds the next free/allocated dnode in an objset's meta-dnode.
+ * Only finds objects that have new contents since txg (i.e.
+ * bonus buffer changes and content removal are ignored).
+ * Used in dmu_object_next().
+ *
+ * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ * Finds the next L2 meta-dnode bp that's at most 1/4 full.
+ * Used in dmu_object_alloc().
+ */
+int
+dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
+ int minlvl, uint64_t blkfill, uint64_t txg)
+{
+ uint64_t initial_offset = *offset;
+ int lvl, maxlvl;
+ int error = 0;
+
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (dn->dn_phys->dn_nlevels == 0) {
+ error = SET_ERROR(ESRCH);
+ goto out;
+ }
+
+ if (dn->dn_datablkshift == 0) {
+ if (*offset < dn->dn_datablksz) {
+ if (flags & DNODE_FIND_HOLE)
+ *offset = dn->dn_datablksz;
+ } else {
+ error = SET_ERROR(ESRCH);
+ }
+ goto out;
+ }
+
+ maxlvl = dn->dn_phys->dn_nlevels;
+
+ for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+ error = dnode_next_offset_level(dn,
+ flags, offset, lvl, blkfill, txg);
+ if (error != ESRCH)
+ break;
+ }
+
+ while (error == 0 && --lvl >= minlvl) {
+ error = dnode_next_offset_level(dn,
+ flags, offset, lvl, blkfill, txg);
+ }
+
+ /*
+ * There's always a "virtual hole" at the end of the object, even
+ * if all BP's which physically exist are non-holes.
+ */
+ if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
+ minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
+ error = 0;
+ }
+
+ if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
+ initial_offset < *offset : initial_offset > *offset))
+ error = SET_ERROR(ESRCH);
+out:
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_exit(&dn->dn_struct_rwlock);
+
+ return (error);
+}
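+
+/*
+ * Illustrative sketch, not part of the upstream change: how a caller in
+ * the style of dmu_offset_next() uses dnode_next_offset() to find the
+ * next hole or data region of a plain file object, matching the first
+ * example in the comment above (minlvl = 1, blkfill = 1, txg = 0).  The
+ * function name is hypothetical.
+ */
+static inline int
+example_next_region(dnode_t *dn, boolean_t hole, uint64_t *off)
+{
+ int flags = hole ? DNODE_FIND_HOLE : 0;
+
+ return (dnode_next_offset(dn, flags, off, 1, 1, 0));
+}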
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dnode_hold);
+EXPORT_SYMBOL(dnode_rele);
+EXPORT_SYMBOL(dnode_set_nlevels);
+EXPORT_SYMBOL(dnode_set_blksz);
+EXPORT_SYMBOL(dnode_free_range);
+EXPORT_SYMBOL(dnode_evict_dbufs);
+EXPORT_SYMBOL(dnode_evict_bonus);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dnode_sync.c b/sys/contrib/openzfs/module/zfs/dnode_sync.c
new file mode 100644
index 000000000000..66e48a1e17d4
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dnode_sync.c
@@ -0,0 +1,858 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2020 Oxide Computer Company
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_recv.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/range_tree.h>
+#include <sys/zfeature.h>
+
+static void
+dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ int nblkptr = dn->dn_phys->dn_nblkptr;
+ int old_toplvl = dn->dn_phys->dn_nlevels - 1;
+ int new_level = dn->dn_next_nlevels[txgoff];
+ int i;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* this dnode can't be paged out because it's dirty */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
+
+ db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
+
+ dn->dn_phys->dn_nlevels = new_level;
+ dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
+ dn->dn_object, dn->dn_phys->dn_nlevels);
+
+ /*
+ * Lock ordering requires that we hold the children's db_mutexes (by
+ * calling dbuf_find()) before holding the parent's db_rwlock. The lock
+ * order is imposed by dbuf_read's steps of "grab the lock to protect
+ * db_parent, get db_parent, hold db_parent's db_rwlock".
+ */
+ dmu_buf_impl_t *children[DN_MAX_NBLKPTR];
+ ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR);
+ for (i = 0; i < nblkptr; i++) {
+ children[i] =
+ dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
+ }
+
+ /* transfer dnode's block pointers to new indirect block */
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+ if (dn->dn_dbuf != NULL)
+ rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER);
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ ASSERT(db->db.db_data);
+ ASSERT(arc_released(db->db_buf));
+ ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+ bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+ sizeof (blkptr_t) * nblkptr);
+ arc_buf_freeze(db->db_buf);
+
+ /* set dbuf's parent pointers to new indirect buf */
+ for (i = 0; i < nblkptr; i++) {
+ dmu_buf_impl_t *child = children[i];
+
+ if (child == NULL)
+ continue;
+#ifdef ZFS_DEBUG
+ DB_DNODE_ENTER(child);
+ ASSERT3P(DB_DNODE(child), ==, dn);
+ DB_DNODE_EXIT(child);
+#endif /* DEBUG */
+ if (child->db_parent && child->db_parent != dn->dn_dbuf) {
+ ASSERT(child->db_parent->db_level == db->db_level);
+ ASSERT(child->db_blkptr !=
+ &dn->dn_phys->dn_blkptr[child->db_blkid]);
+ mutex_exit(&child->db_mtx);
+ continue;
+ }
+ ASSERT(child->db_parent == NULL ||
+ child->db_parent == dn->dn_dbuf);
+
+ child->db_parent = db;
+ dbuf_add_ref(db, child);
+ if (db->db.db_data)
+ child->db_blkptr = (blkptr_t *)db->db.db_data + i;
+ else
+ child->db_blkptr = NULL;
+ dprintf_dbuf_bp(child, child->db_blkptr,
+ "changed db_blkptr to new indirect %s", "");
+
+ mutex_exit(&child->db_mtx);
+ }
+
+ bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
+
+ rw_exit(&db->db_rwlock);
+ if (dn->dn_dbuf != NULL)
+ rw_exit(&dn->dn_dbuf->db_rwlock);
+
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+static void
+free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ uint64_t bytesfreed = 0;
+
+ dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
+
+ for (int i = 0; i < num; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+
+ bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
+ ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
+
+ /*
+ * Save some useful information on the holes being
+ * punched, including logical size, type, and indirection
+ * level. Retaining birth time enables detection of when
+ * holes are punched for reducing the number of free
+ * records transmitted during a zfs send.
+ */
+
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ dmu_object_type_t type = BP_GET_TYPE(bp);
+ uint64_t lvl = BP_GET_LEVEL(bp);
+
+ bzero(bp, sizeof (blkptr_t));
+
+ if (spa_feature_is_active(dn->dn_objset->os_spa,
+ SPA_FEATURE_HOLE_BIRTH)) {
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, type);
+ BP_SET_LEVEL(bp, lvl);
+ BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
+ }
+ }
+ dnode_diduse_space(dn, -bytesfreed);
+}
+
+#ifdef ZFS_DEBUG
+static void
+free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+ int off, num;
+ int i, err, epbs;
+ uint64_t txg = tx->tx_txg;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ off = start - (db->db_blkid * 1<<epbs);
+ num = end - start + 1;
+
+ ASSERT3U(off, >=, 0);
+ ASSERT3U(num, >=, 0);
+ ASSERT3U(db->db_level, >, 0);
+ ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
+ ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
+ ASSERT(db->db_blkptr != NULL);
+
+ for (i = off; i < off+num; i++) {
+ uint64_t *buf;
+ dmu_buf_impl_t *child;
+ dbuf_dirty_record_t *dr;
+ int j;
+
+ ASSERT(db->db_level == 1);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, db->db_level - 1,
+ (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (err == ENOENT)
+ continue;
+ ASSERT(err == 0);
+ ASSERT(child->db_level == 0);
+ dr = dbuf_find_dirty_eq(child, txg);
+
+ /* data_old better be zeroed */
+ if (dr) {
+ buf = dr->dt.dl.dr_data->b_data;
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ (void *)child, i, off, num);
+ }
+ }
+ }
+
+ /*
+ * db_data better be zeroed unless it's dirty in a
+ * future txg.
+ */
+ mutex_enter(&child->db_mtx);
+ buf = child->db.db_data;
+ if (buf != NULL && child->db_state != DB_FILL &&
+ list_is_empty(&child->db_dirty_records)) {
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ (void *)child, i, off, num);
+ }
+ }
+ }
+ mutex_exit(&child->db_mtx);
+
+ dbuf_rele(child, FTAG);
+ }
+ DB_DNODE_EXIT(db);
+}
+#endif
+
+/*
+ * We don't usually free the indirect blocks here. If in one txg we have a
+ * free_range and a write to the same indirect block, it's important that we
+ * preserve the hole's birth times. Therefore, we don't free any indirect
+ * blocks in free_children(). If an indirect block happens to turn into all
+ * holes, it will be freed by dbuf_write_children_ready, which happens at a
+ * point in the syncing process where we know for certain the contents of the
+ * indirect block.
+ *
+ * However, if we're freeing a dnode, its space accounting must go to zero
+ * before we actually try to free the dnode, or we will trip an assertion. In
+ * addition, we know the case described above cannot occur, because the dnode is
+ * being freed. Therefore, we free the indirect blocks immediately in that
+ * case.
+ */
+static void
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
+ boolean_t free_indirects, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ blkptr_t *bp;
+ dmu_buf_impl_t *subdb;
+ uint64_t start, end, dbstart, dbend;
+ unsigned int epbs, shift, i;
+
+ /*
+ * There is a small possibility that this block will not be cached:
+ * 1 - if level > 1 and there are no children with level <= 1
+ * 2 - if this block was evicted since we read it from
+ * dmu_tx_hold_free().
+ */
+ if (db->db_state != DB_CACHED)
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+
+ /*
+ * If we modify this indirect block, and we are not freeing the
+ * dnode (!free_indirects), then this indirect block needs to get
+ * written to disk by dbuf_write(). If it is dirty, we know it will
+ * be written (otherwise, we would have incorrect on-disk state
+ * because the space would be freed but still referenced by the BP
+ * in this indirect block). Therefore we VERIFY that it is
+ * dirty.
+ *
+ * Our VERIFY covers some cases that do not actually have to be
+ * dirty, but that the open-context code happens to dirty. E.g. if the
+ * blocks we are freeing are all holes, because in that case, we
+ * are only freeing part of this indirect block, so it is an
+ * ancestor of the first or last block to be freed. The first and
+ * last L1 indirect blocks are always dirtied by dnode_free_range().
+ */
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+ VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
+ dmu_buf_unlock_parent(db, dblt, FTAG);
+
+ dbuf_release_bp(db);
+ bp = db->db.db_data;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(epbs, <, 31);
+ shift = (db->db_level - 1) * epbs;
+ dbstart = db->db_blkid << epbs;
+ start = blkid >> shift;
+ if (dbstart < start) {
+ bp += start - dbstart;
+ } else {
+ start = dbstart;
+ }
+ dbend = ((db->db_blkid + 1) << epbs) - 1;
+ end = (blkid + nblks - 1) >> shift;
+ if (dbend <= end)
+ end = dbend;
+
+ ASSERT3U(start, <=, end);
+
+ if (db->db_level == 1) {
+ FREE_VERIFY(db, start, end, tx);
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ free_blocks(dn, bp, end - start + 1, tx);
+ rw_exit(&db->db_rwlock);
+ } else {
+ for (uint64_t id = start; id <= end; id++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
+ id, TRUE, FALSE, FTAG, &subdb));
+ rw_exit(&dn->dn_struct_rwlock);
+ ASSERT3P(bp, ==, subdb->db_blkptr);
+
+ free_children(subdb, blkid, nblks, free_indirects, tx);
+ dbuf_rele(subdb, FTAG);
+ }
+ }
+
+ if (free_indirects) {
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
+ ASSERT(BP_IS_HOLE(bp));
+ bzero(db->db.db_data, db->db.db_size);
+ free_blocks(dn, db->db_blkptr, 1, tx);
+ rw_exit(&db->db_rwlock);
+ }
+
+ DB_DNODE_EXIT(db);
+ arc_buf_freeze(db->db_buf);
+}
+
+/*
+ * Traverse the indicated range of the provided file
+ * and "free" all the blocks contained there.
+ */
+static void
+dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
+ boolean_t free_indirects, dmu_tx_t *tx)
+{
+ blkptr_t *bp = dn->dn_phys->dn_blkptr;
+ int dnlevel = dn->dn_phys->dn_nlevels;
+ boolean_t trunc = B_FALSE;
+
+ if (blkid > dn->dn_phys->dn_maxblkid)
+ return;
+
+ ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
+ if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
+ nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+ trunc = B_TRUE;
+ }
+
+ /* There are no indirect blocks in the object */
+ if (dnlevel == 1) {
+ if (blkid >= dn->dn_phys->dn_nblkptr) {
+ /* this range was never made persistent */
+ return;
+ }
+ ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
+ free_blocks(dn, bp + blkid, nblks, tx);
+ } else {
+ int shift = (dnlevel - 1) *
+ (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+ int start = blkid >> shift;
+ int end = (blkid + nblks - 1) >> shift;
+ dmu_buf_impl_t *db;
+
+ ASSERT(start < dn->dn_phys->dn_nblkptr);
+ bp += start;
+ for (int i = start; i <= end; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
+ TRUE, FALSE, FTAG, &db));
+ rw_exit(&dn->dn_struct_rwlock);
+ free_children(db, blkid, nblks, free_indirects, tx);
+ dbuf_rele(db, FTAG);
+ }
+ }
+
+ /*
+ * Do not truncate the maxblkid if we are performing a raw
+ * receive. The raw receive sets the maxblkid manually and
+ * must not be overridden. Usually, the last DRR_FREE record
+ * will be at the maxblkid, because the source system sets
+ * the maxblkid when truncating. However, if the last block
+ * was freed by overwriting with zeros and being compressed
+ * away to a hole, the source system will generate a DRR_FREE
+ * record while leaving the maxblkid after the end of that
+ * record. In this case we need to leave the maxblkid as
+ * indicated in the DRR_OBJECT record, so that it matches the
+ * source system, ensuring that the cryptographic hashes will
+ * match.
+ */
+ if (trunc && !dn->dn_objset->os_raw_receive) {
+ uint64_t off __maybe_unused;
+ dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
+
+ off = (dn->dn_phys->dn_maxblkid + 1) *
+ (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT(off < dn->dn_phys->dn_maxblkid ||
+ dn->dn_phys->dn_maxblkid == 0 ||
+ dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
+ }
+}
+
+typedef struct dnode_sync_free_range_arg {
+ dnode_t *dsfra_dnode;
+ dmu_tx_t *dsfra_tx;
+ boolean_t dsfra_free_indirects;
+} dnode_sync_free_range_arg_t;
+
+static void
+dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
+{
+ dnode_sync_free_range_arg_t *dsfra = arg;
+ dnode_t *dn = dsfra->dsfra_dnode;
+
+ mutex_exit(&dn->dn_mtx);
+ dnode_sync_free_range_impl(dn, blkid, nblks,
+ dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
+ mutex_enter(&dn->dn_mtx);
+}
+
+/*
+ * Try to kick all the dnode's dbufs out of the cache...
+ */
+void
+dnode_evict_dbufs(dnode_t *dn)
+{
+ dmu_buf_impl_t *db_marker;
+ dmu_buf_impl_t *db, *db_next;
+
+ db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
+
+#ifdef ZFS_DEBUG
+ DB_DNODE_ENTER(db);
+ ASSERT3P(DB_DNODE(db), ==, dn);
+ DB_DNODE_EXIT(db);
+#endif /* DEBUG */
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING &&
+ zfs_refcount_is_zero(&db->db_holds)) {
+ db_marker->db_level = db->db_level;
+ db_marker->db_blkid = db->db_blkid;
+ db_marker->db_state = DB_SEARCH;
+ avl_insert_here(&dn->dn_dbufs, db_marker, db,
+ AVL_BEFORE);
+
+ /*
+ * We need to use the "marker" dbuf rather than
+ * simply getting the next dbuf, because
+ * dbuf_destroy() may actually remove multiple dbufs.
+ * It can call itself recursively on the parent dbuf,
+ * which may also be removed from dn_dbufs. The code
+ * flow would look like:
+ *
+ * dbuf_destroy():
+ * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
+ * if (!cacheable || pending_evict)
+ * dbuf_destroy()
+ */
+ dbuf_destroy(db);
+
+ db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
+ avl_remove(&dn->dn_dbufs, db_marker);
+ } else {
+ db->db_pending_evict = TRUE;
+ mutex_exit(&db->db_mtx);
+ db_next = AVL_NEXT(&dn->dn_dbufs, db);
+ }
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ kmem_free(db_marker, sizeof (dmu_buf_impl_t));
+
+ dnode_evict_bonus(dn);
+}
+
+void
+dnode_evict_bonus(dnode_t *dn)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus != NULL) {
+ if (zfs_refcount_is_zero(&dn->dn_bonus->db_holds)) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_destroy(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ } else {
+ dn->dn_bonus->db_pending_evict = TRUE;
+ }
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+static void
+dnode_undirty_dbufs(list_t *list)
+{
+ dbuf_dirty_record_t *dr;
+
+ while ((dr = list_head(list))) {
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ uint64_t txg = dr->dr_txg;
+
+ if (db->db_level != 0)
+ dnode_undirty_dbufs(&dr->dt.di.dr_children);
+
+ mutex_enter(&db->db_mtx);
+ /* XXX - use dbuf_undirty()? */
+ list_remove(list, dr);
+ ASSERT(list_head(&db->db_dirty_records) == dr);
+ list_remove_head(&db->db_dirty_records);
+ ASSERT(list_is_empty(&db->db_dirty_records));
+ db->db_dirtycnt -= 1;
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+ dr->dt.dl.dr_data == db->db_buf);
+ dbuf_unoverride(dr);
+ } else {
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
+ }
+}
+
+static void
+dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+{
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /*
+ * Our contents should have been freed in dnode_sync() by the
+ * free range record inserted by the caller of dnode_free().
+ */
+ ASSERT0(DN_USED_BYTES(dn->dn_phys));
+ ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
+
+ dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
+ dnode_evict_dbufs(dn);
+
+ /*
+ * XXX - It would be nice to assert this, but we may still
+ * have residual holds from async evictions from the arc...
+ *
+ * zfs_obj_to_path() also depends on this being
+ * commented out.
+ *
+ * ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 1);
+ */
+
+ /* Undirty next bits */
+ dn->dn_next_nlevels[txgoff] = 0;
+ dn->dn_next_indblkshift[txgoff] = 0;
+ dn->dn_next_blksz[txgoff] = 0;
+ dn->dn_next_maxblkid[txgoff] = 0;
+
+ /* ASSERT(blkptrs are zero); */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ ASSERT(dn->dn_free_txg > 0);
+ if (dn->dn_allocated_txg != dn->dn_free_txg)
+ dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
+ bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+ dnode_free_interior_slots(dn);
+
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_type = DMU_OT_NONE;
+ dn->dn_maxblkid = 0;
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_num_slots = 1;
+ mutex_exit(&dn->dn_mtx);
+
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ /*
+ * Now that we've released our hold, the dnode may
+ * be evicted, so we mustn't access it.
+ */
+}
+
+/*
+ * Write out the dnode's dirty buffers.
+ */
+void
+dnode_sync(dnode_t *dn, dmu_tx_t *tx)
+{
+ objset_t *os = dn->dn_objset;
+ dnode_phys_t *dnp = dn->dn_phys;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ list_t *list = &dn->dn_dirty_records[txgoff];
+ static const dnode_phys_t zerodn __maybe_unused = { 0 };
+ boolean_t kill_spill = B_FALSE;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+ ASSERT(dnp->dn_type != DMU_OT_NONE ||
+ bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
+ DNODE_VERIFY(dn);
+
+ ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
+
+ /*
+ * Do user accounting if it is enabled and this is not
+ * an encrypted receive.
+ */
+ if (dmu_objset_userused_enabled(os) &&
+ !DMU_OBJECT_IS_SPECIAL(dn->dn_object) &&
+ (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
+ dn->dn_oldflags = dn->dn_phys->dn_flags;
+ dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+ if (dmu_objset_userobjused_enabled(dn->dn_objset))
+ dn->dn_phys->dn_flags |=
+ DNODE_FLAG_USEROBJUSED_ACCOUNTED;
+ mutex_exit(&dn->dn_mtx);
+ dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
+ } else {
+ /* Once we account for it, we should always account for it */
+ ASSERT(!(dn->dn_phys->dn_flags &
+ DNODE_FLAG_USERUSED_ACCOUNTED));
+ ASSERT(!(dn->dn_phys->dn_flags &
+ DNODE_FLAG_USEROBJUSED_ACCOUNTED));
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_allocated_txg == tx->tx_txg) {
+ /* The dnode is newly allocated or reallocated */
+ if (dnp->dn_type == DMU_OT_NONE) {
+ /* this is a first alloc, not a realloc */
+ dnp->dn_nlevels = 1;
+ dnp->dn_nblkptr = dn->dn_nblkptr;
+ }
+
+ dnp->dn_type = dn->dn_type;
+ dnp->dn_bonustype = dn->dn_bonustype;
+ dnp->dn_bonuslen = dn->dn_bonuslen;
+ }
+
+ dnp->dn_extra_slots = dn->dn_num_slots - 1;
+
+ ASSERT(dnp->dn_nlevels > 1 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
+ dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT(dnp->dn_nlevels < 2 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
+
+ if (dn->dn_next_type[txgoff] != 0) {
+ dnp->dn_type = dn->dn_type;
+ dn->dn_next_type[txgoff] = 0;
+ }
+
+ if (dn->dn_next_blksz[txgoff] != 0) {
+ ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
+ SPA_MINBLOCKSIZE) == 0);
+ ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ dn->dn_maxblkid == 0 || list_head(list) != NULL ||
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
+ dnp->dn_datablkszsec ||
+ !range_tree_is_empty(dn->dn_free_ranges[txgoff]));
+ dnp->dn_datablkszsec =
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
+ dn->dn_next_blksz[txgoff] = 0;
+ }
+
+ if (dn->dn_next_bonuslen[txgoff] != 0) {
+ if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
+ dnp->dn_bonuslen = 0;
+ else
+ dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
+ ASSERT(dnp->dn_bonuslen <=
+ DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1));
+ dn->dn_next_bonuslen[txgoff] = 0;
+ }
+
+ if (dn->dn_next_bonustype[txgoff] != 0) {
+ ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
+ dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
+ dn->dn_next_bonustype[txgoff] = 0;
+ }
+
+ boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
+ dn->dn_free_txg <= tx->tx_txg;
+
+ /*
+ * Remove the spill block if we have been explicitly asked to
+ * remove it, or if the object is being removed.
+ */
+ if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ kill_spill = B_TRUE;
+ dn->dn_rm_spillblk[txgoff] = 0;
+ }
+
+ if (dn->dn_next_indblkshift[txgoff] != 0) {
+ ASSERT(dnp->dn_nlevels == 1);
+ dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
+ dn->dn_next_indblkshift[txgoff] = 0;
+ }
+
+ /*
+ * Just take the live (open-context) values for checksum and compress.
+ * Strictly speaking it's a future leak, but nothing bad happens if we
+ * start using the new checksum or compress algorithm a little early.
+ */
+ dnp->dn_checksum = dn->dn_checksum;
+ dnp->dn_compress = dn->dn_compress;
+
+ mutex_exit(&dn->dn_mtx);
+
+ if (kill_spill) {
+ free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx);
+ mutex_enter(&dn->dn_mtx);
+ dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ /* process all the "freed" ranges in the file */
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ dnode_sync_free_range_arg_t dsfra;
+ dsfra.dsfra_dnode = dn;
+ dsfra.dsfra_tx = tx;
+ dsfra.dsfra_free_indirects = freeing_dnode;
+ mutex_enter(&dn->dn_mtx);
+ if (freeing_dnode) {
+ ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
+ 0, dn->dn_maxblkid + 1));
+ }
+ /*
+ * Because dnode_sync_free_range() must drop dn_mtx during its
+ * processing, using it as a callback to range_tree_vacate() is
+ * not safe. No other operations (besides destroy) are allowed
+ * once range_tree_vacate() has begun, and dropping dn_mtx
+ * would leave a window open for another thread to observe that
+ * invalid (and unsafe) state.
+ */
+ range_tree_walk(dn->dn_free_ranges[txgoff],
+ dnode_sync_free_range, &dsfra);
+ range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL);
+ range_tree_destroy(dn->dn_free_ranges[txgoff]);
+ dn->dn_free_ranges[txgoff] = NULL;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (freeing_dnode) {
+ dn->dn_objset->os_freed_dnodes++;
+ dnode_sync_free(dn, tx);
+ return;
+ }
+
+ if (dn->dn_num_slots > DNODE_MIN_SLOTS) {
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ mutex_enter(&ds->ds_lock);
+ ds->ds_feature_activation[SPA_FEATURE_LARGE_DNODE] =
+ (void *)B_TRUE;
+ mutex_exit(&ds->ds_lock);
+ }
+
+ if (dn->dn_next_nlevels[txgoff]) {
+ dnode_increase_indirection(dn, tx);
+ dn->dn_next_nlevels[txgoff] = 0;
+ }
+
+ /*
+ * This must be done after dnode_sync_free_range()
+ * and dnode_increase_indirection(). See dnode_new_blkid()
+ * for an explanation of the high bit being set.
+ */
+ if (dn->dn_next_maxblkid[txgoff]) {
+ mutex_enter(&dn->dn_mtx);
+ dnp->dn_maxblkid =
+ dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET;
+ dn->dn_next_maxblkid[txgoff] = 0;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (dn->dn_next_nblkptr[txgoff]) {
+ /* this should only happen on a realloc */
+ ASSERT(dn->dn_allocated_txg == tx->tx_txg);
+ if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
+ /* zero the new blkptrs we are gaining */
+ bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
+ sizeof (blkptr_t) *
+ (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
+#ifdef ZFS_DEBUG
+ } else {
+ int i;
+ ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
+ /* the blkptrs we are losing better be unallocated */
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ if (i >= dn->dn_next_nblkptr[txgoff])
+ ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
+ }
+#endif
+ }
+ mutex_enter(&dn->dn_mtx);
+ dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
+ dn->dn_next_nblkptr[txgoff] = 0;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
+
+ if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ ASSERT3P(list_head(list), ==, NULL);
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ }
+
+ /*
+ * Although we have dropped our reference to the dnode, it
+ * can't be evicted until it's written, and we haven't yet
+ * initiated the IO for the dnode's dbuf. Additionally, the caller
+ * has already added a reference to the dnode because it's on the
+ * os_synced_dnodes list.
+ */
+}
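+
+/*
+ * Illustrative sketch, not part of the upstream change: the per-txg
+ * indexing used throughout dnode_sync().  Dirty state (dn_next_*,
+ * dn_free_ranges, dn_dirty_records) lives in TXG_SIZE-entry arrays
+ * indexed by the transaction group masked with TXG_MASK, so a slot is
+ * recycled once its txg has synced.  The function name is hypothetical.
+ */
+static inline int
+example_txg_slot(uint64_t txg)
+{
+ return (txg & TXG_MASK);
+}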
diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
new file mode 100644
index 000000000000..2faf1af52991
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
@@ -0,0 +1,1734 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright 2019, 2020 by Christian Schwarz. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/spa.h>
+#include <sys/dsl_bookmark.h>
+#include <zfs_namecheck.h>
+#include <sys/dmu_send.h>
+
+static int
+dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname,
+ dsl_dataset_t **dsp, void *tag, char **shortnamep)
+{
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ char *hashp;
+
+ if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ hashp = strchr(fullname, '#');
+ if (hashp == NULL)
+ return (SET_ERROR(EINVAL));
+
+ *shortnamep = hashp + 1;
+ if (zfs_component_namecheck(*shortnamep, NULL, NULL))
+ return (SET_ERROR(EINVAL));
+ (void) strlcpy(buf, fullname, hashp - fullname + 1);
+ return (dsl_dataset_hold(dp, buf, tag, dsp));
+}
+
+/*
+ * When reading BOOKMARK_V1 bookmarks, the BOOKMARK_V2 fields are guaranteed
+ * to be zeroed.
+ *
+ * Returns ESRCH if bookmark is not found.
+ * Note, we need to use the ZAP rather than the AVL to look up bookmarks
+ * by name, because only the ZAP honors the case sensitivity setting.
+ */
+int
+dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname,
+ zfs_bookmark_phys_t *bmark_phys)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t bmark_zapobj = ds->ds_bookmarks_obj;
+ matchtype_t mt = 0;
+ int err;
+
+ if (bmark_zapobj == 0)
+ return (SET_ERROR(ESRCH));
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ /*
+ * Zero out the bookmark in case the one stored on disk
+ * is in an older, shorter format.
+ */
+ bzero(bmark_phys, sizeof (*bmark_phys));
+
+ err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t),
+ sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0,
+ NULL);
+
+ return (err == ENOENT ? SET_ERROR(ESRCH) : err);
+}
+
+/*
+ * If later_ds is non-NULL, this will return EXDEV if the specified bookmark
+ * does not represent an earlier point in later_ds's timeline. However,
+ * bmp will still be filled in if we return EXDEV.
+ *
+ * Returns ENOENT if the dataset containing the bookmark does not exist.
+ * Returns ESRCH if the dataset exists but the bookmark was not found in it.
+ */
+int
+dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname,
+ dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp)
+{
+ char *shortname;
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname);
+ if (error != 0)
+ return (error);
+
+ error = dsl_bookmark_lookup_impl(ds, shortname, bmp);
+ if (error == 0 && later_ds != NULL) {
+ if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg))
+ error = SET_ERROR(EXDEV);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
+
+/*
+ * Validates that
+ * - bmark is a full dataset path of a bookmark (bookmark_namecheck)
+ * - source is a full path of a snapshot or bookmark
+ * ({bookmark,snapshot}_namecheck)
+ *
+ * Returns 0 if valid, -1 otherwise.
+ */
+static int
+dsl_bookmark_create_nvl_validate_pair(const char *bmark, const char *source)
+{
+ if (bookmark_namecheck(bmark, NULL, NULL) != 0)
+ return (-1);
+
+ int is_bmark, is_snap;
+ is_bmark = bookmark_namecheck(source, NULL, NULL) == 0;
+ is_snap = snapshot_namecheck(source, NULL, NULL) == 0;
+ if (!is_bmark && !is_snap)
+ return (-1);
+
+ return (0);
+}
+
+/*
+ * Check that the given nvlist corresponds to the following schema:
+ * { newbookmark -> source, ... }
+ * where
+ * - each pair passes dsl_bookmark_create_nvl_validate_pair
+ * - all newbookmarks are in the same pool
+ * - all newbookmarks have unique names
+ *
+ * Note that this function only validates the above schema. Callers must ensure
+ * that the bookmarks can be created, e.g. that sources exist.
+ *
+ * Returns 0 if the nvlist adheres to above schema.
+ * Returns -1 if it doesn't.
+ */
+int
+dsl_bookmark_create_nvl_validate(nvlist_t *bmarks)
+{
+ char *first;
+ size_t first_len;
+
+ first = NULL;
+ for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) {
+
+ char *bmark = nvpair_name(pair);
+ char *source;
+
+ /* list structure: values must be snapshots XOR bookmarks */
+ if (nvpair_value_string(pair, &source) != 0)
+ return (-1);
+ if (dsl_bookmark_create_nvl_validate_pair(bmark, source) != 0)
+ return (-1);
+
+ /* same pool check */
+ if (first == NULL) {
+ char *cp = strpbrk(bmark, "/#");
+ if (cp == NULL)
+ return (-1);
+ first = bmark;
+ first_len = cp - bmark;
+ }
+ if (strncmp(first, bmark, first_len) != 0)
+ return (-1);
+ switch (*(bmark + first_len)) {
+ case '/': /* fallthrough */
+ case '#':
+ break;
+ default:
+ return (-1);
+ }
+
+ /* unique newbookmark names; todo: O(n^2) */
+ for (nvpair_t *pair2 = nvlist_next_nvpair(bmarks, pair);
+ pair2 != NULL; pair2 = nvlist_next_nvpair(bmarks, pair2)) {
+ if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0)
+ return (-1);
+ }
+
+ }
+ return (0);
+}
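+
+/*
+ * Illustrative sketch, not part of the upstream change: the nvlist shape
+ * accepted by dsl_bookmark_create_nvl_validate() above -- new bookmark
+ * names mapped to snapshot or bookmark sources, all in one pool.  The
+ * function name and the "pool/fs" names are hypothetical.
+ */
+static inline int
+example_validate_bookmark_nvl(void)
+{
+ nvlist_t *bmarks = fnvlist_alloc();
+ int err;
+
+ fnvlist_add_string(bmarks, "pool/fs#newbm", "pool/fs@snap");
+ fnvlist_add_string(bmarks, "pool/fs#copy", "pool/fs#oldbm");
+ err = dsl_bookmark_create_nvl_validate(bmarks);
+ fnvlist_free(bmarks);
+ return (err);
+}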
+
+/*
+ * expects that newbm and source have been validated using
+ * dsl_bookmark_create_nvl_validate_pair
+ */
+static int
+dsl_bookmark_create_check_impl(dsl_pool_t *dp,
+ const char *newbm, const char *source)
+{
+ ASSERT0(dsl_bookmark_create_nvl_validate_pair(newbm, source));
+ /* defer source namecheck until we know it's a snapshot or bookmark */
+
+ int error;
+ dsl_dataset_t *newbm_ds;
+ char *newbm_short;
+ zfs_bookmark_phys_t bmark_phys;
+
+ error = dsl_bookmark_hold_ds(dp, newbm, &newbm_ds, FTAG, &newbm_short);
+ if (error != 0)
+ return (error);
+
+ /* Verify that the new bookmark does not already exist */
+ error = dsl_bookmark_lookup_impl(newbm_ds, newbm_short, &bmark_phys);
+ switch (error) {
+ case ESRCH:
+ /* happy path: new bmark doesn't exist, proceed after switch */
+ error = 0;
+ break;
+ case 0:
+ error = SET_ERROR(EEXIST);
+ goto eholdnewbmds;
+ default:
+ /* dsl_bookmark_lookup_impl already did SET_ERROR */
+ goto eholdnewbmds;
+ }
+
+ /* error is retval of the following if-cascade */
+ if (strchr(source, '@') != NULL) {
+ dsl_dataset_t *source_snap_ds;
+ ASSERT3S(snapshot_namecheck(source, NULL, NULL), ==, 0);
+ error = dsl_dataset_hold(dp, source, FTAG, &source_snap_ds);
+ if (error == 0) {
+ VERIFY(source_snap_ds->ds_is_snapshot);
+ /*
+ * Verify that source snapshot is an earlier point in
+ * newbm_ds's timeline (source may be newbm_ds's origin)
+ */
+ if (!dsl_dataset_is_before(newbm_ds, source_snap_ds, 0))
+ error = SET_ERROR(
+ ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR);
+ dsl_dataset_rele(source_snap_ds, FTAG);
+ }
+ } else if (strchr(source, '#') != NULL) {
+ zfs_bookmark_phys_t source_phys;
+ ASSERT3S(bookmark_namecheck(source, NULL, NULL), ==, 0);
+ /*
+ * Source must exist and be an earlier point in newbm_ds's
+ * timeline (newbm_ds's origin may be a snap of source's ds)
+ */
+ error = dsl_bookmark_lookup(dp, source, newbm_ds, &source_phys);
+ switch (error) {
+ case 0:
+ break; /* happy path */
+ case EXDEV:
+ error = SET_ERROR(ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR);
+ break;
+ default:
+ /* dsl_bookmark_lookup already did SET_ERROR */
+ break;
+ }
+ } else {
+ /*
+ * dsl_bookmark_create_nvl_validate validates that source is
+ * either snapshot or bookmark
+ */
+ panic("unreachable code: %s", source);
+ }
+
+eholdnewbmds:
+ dsl_dataset_rele(newbm_ds, FTAG);
+ return (error);
+}
+
+int
+dsl_bookmark_create_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_create_arg_t *dbca = arg;
+ int rv = 0;
+ int schema_err = 0;
+ ASSERT3P(dbca, !=, NULL);
+ ASSERT3P(dbca->dbca_bmarks, !=, NULL);
+ /* dbca->dbca_errors is allowed to be NULL */
+
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
+ return (SET_ERROR(ENOTSUP));
+
+ if (dsl_bookmark_create_nvl_validate(dbca->dbca_bmarks) != 0)
+ rv = schema_err = SET_ERROR(EINVAL);
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
+ char *new = nvpair_name(pair);
+
+ int error = schema_err;
+ if (error == 0) {
+ char *source = fnvpair_value_string(pair);
+ error = dsl_bookmark_create_check_impl(dp, new, source);
+ if (error != 0)
+ error = SET_ERROR(error);
+ }
+
+ if (error != 0) {
+ rv = error;
+ if (dbca->dbca_errors != NULL)
+ fnvlist_add_int32(dbca->dbca_errors,
+ new, error);
+ }
+ }
+
+ return (rv);
+}
+
+static dsl_bookmark_node_t *
+dsl_bookmark_node_alloc(char *shortname)
+{
+ dsl_bookmark_node_t *dbn = kmem_alloc(sizeof (*dbn), KM_SLEEP);
+ dbn->dbn_name = spa_strdup(shortname);
+ dbn->dbn_dirty = B_FALSE;
+ mutex_init(&dbn->dbn_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (dbn);
+}
+
+/*
+ * Set the fields in the zfs_bookmark_phys_t based on the specified snapshot.
+ */
+static void
+dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap)
+{
+ spa_t *spa = dsl_dataset_get_spa(snap);
+ objset_t *mos = spa_get_dsl(spa)->dp_meta_objset;
+ dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap);
+ zbm->zbm_guid = dsp->ds_guid;
+ zbm->zbm_creation_txg = dsp->ds_creation_txg;
+ zbm->zbm_creation_time = dsp->ds_creation_time;
+ zbm->zbm_redaction_obj = 0;
+
+ /*
+ * If the dataset is encrypted create a larger bookmark to
+ * accommodate the IVset guid. The IVset guid was added
+ * after the encryption feature to prevent a problem with
+ * raw sends. If we encounter an encrypted dataset without
+ * an IVset guid we fall back to a normal bookmark.
+ */
+ if (snap->ds_dir->dd_crypto_obj != 0 &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
+ (void) zap_lookup(mos, snap->ds_object,
+ DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1,
+ &zbm->zbm_ivset_guid);
+ }
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_WRITTEN)) {
+ zbm->zbm_flags = ZBM_FLAG_SNAPSHOT_EXISTS | ZBM_FLAG_HAS_FBN;
+ zbm->zbm_referenced_bytes_refd = dsp->ds_referenced_bytes;
+ zbm->zbm_compressed_bytes_refd = dsp->ds_compressed_bytes;
+ zbm->zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes;
+
+ dsl_dataset_t *nextds;
+ VERIFY0(dsl_dataset_hold_obj(snap->ds_dir->dd_pool,
+ dsp->ds_next_snap_obj, FTAG, &nextds));
+ dsl_deadlist_space(&nextds->ds_deadlist,
+ &zbm->zbm_referenced_freed_before_next_snap,
+ &zbm->zbm_compressed_freed_before_next_snap,
+ &zbm->zbm_uncompressed_freed_before_next_snap);
+ dsl_dataset_rele(nextds, FTAG);
+ } else {
+ bzero(&zbm->zbm_flags,
+ sizeof (zfs_bookmark_phys_t) -
+ offsetof(zfs_bookmark_phys_t, zbm_flags));
+ }
+}
+
+/*
+ * Add dsl_bookmark_node_t `dbn` to the given dataset and increment appropriate
+ * SPA feature counters.
+ */
+void
+dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (hds->ds_bookmarks_obj == 0) {
+ hds->ds_bookmarks_obj = zap_create_norm(mos,
+ U8_TEXTPREP_TOUPPER, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0,
+ tx);
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+
+ dsl_dataset_zapify(hds, tx);
+ VERIFY0(zap_add(mos, hds->ds_object,
+ DS_FIELD_BOOKMARK_NAMES,
+ sizeof (hds->ds_bookmarks_obj), 1,
+ &hds->ds_bookmarks_obj, tx));
+ }
+
+ avl_add(&hds->ds_bookmarks, dbn);
+
+ /*
+ * To maintain backwards compatibility with software that doesn't
+ * understand SPA_FEATURE_BOOKMARK_V2, we need to use the smallest
+ * possible bookmark size.
+ */
+ uint64_t bookmark_phys_size = BOOKMARK_PHYS_SIZE_V1;
+ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2) &&
+ (dbn->dbn_phys.zbm_ivset_guid != 0 || dbn->dbn_phys.zbm_flags &
+ ZBM_FLAG_HAS_FBN || dbn->dbn_phys.zbm_redaction_obj != 0)) {
+ bookmark_phys_size = BOOKMARK_PHYS_SIZE_V2;
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2, tx);
+ }
+
+ __attribute__((unused)) zfs_bookmark_phys_t zero_phys = { 0 };
+ ASSERT0(bcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size,
+ &zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size));
+
+ VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name,
+ sizeof (uint64_t), bookmark_phys_size / sizeof (uint64_t),
+ &dbn->dbn_phys, tx));
+}
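+
+/*
+ * Illustrative sketch, not part of the upstream change: the on-disk size
+ * chosen by dsl_bookmark_node_add() above.  A V2-sized record is written
+ * only when SPA_FEATURE_BOOKMARK_V2 is enabled and at least one V2-only
+ * field is actually in use; otherwise the smaller V1 size keeps the ZAP
+ * entry readable by older software.  The function name is hypothetical.
+ */
+static inline uint64_t
+example_bookmark_phys_size(spa_t *spa, const zfs_bookmark_phys_t *zbm)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2) &&
+ (zbm->zbm_ivset_guid != 0 ||
+ (zbm->zbm_flags & ZBM_FLAG_HAS_FBN) ||
+ zbm->zbm_redaction_obj != 0))
+ return (BOOKMARK_PHYS_SIZE_V2);
+ return (BOOKMARK_PHYS_SIZE_V1);
+}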
+
+/*
+ * If redaction_list is non-null, we create a redacted bookmark and redaction
+ * list, and store the object number of the redaction list in redact_obj.
+ */
+static void
+dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot,
+ dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, void *tag,
+ redaction_list_t **redaction_list)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dataset_t *snapds, *bmark_fs;
+ char *shortname;
+ boolean_t bookmark_redacted;
+ uint64_t *dsredactsnaps;
+ uint64_t dsnumsnaps;
+
+ VERIFY0(dsl_dataset_hold(dp, snapshot, FTAG, &snapds));
+ VERIFY0(dsl_bookmark_hold_ds(dp, bookmark, &bmark_fs, FTAG,
+ &shortname));
+
+ dsl_bookmark_node_t *dbn = dsl_bookmark_node_alloc(shortname);
+ dsl_bookmark_set_phys(&dbn->dbn_phys, snapds);
+
+ bookmark_redacted = dsl_dataset_get_uint64_array_feature(snapds,
+ SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps);
+ if (redaction_list != NULL || bookmark_redacted) {
+ redaction_list_t *local_rl;
+ if (bookmark_redacted) {
+ redact_snaps = dsredactsnaps;
+ num_redact_snaps = dsnumsnaps;
+ }
+ dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos,
+ DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) +
+ num_redact_snaps * sizeof (uint64_t), tx);
+ spa_feature_incr(dp->dp_spa,
+ SPA_FEATURE_REDACTION_BOOKMARKS, tx);
+
+ VERIFY0(dsl_redaction_list_hold_obj(dp,
+ dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl));
+ dsl_redaction_list_long_hold(dp, local_rl, tag);
+
+ ASSERT3U((local_rl)->rl_dbuf->db_size, >=,
+ sizeof (redaction_list_phys_t) + num_redact_snaps *
+ sizeof (uint64_t));
+ dmu_buf_will_dirty(local_rl->rl_dbuf, tx);
+ bcopy(redact_snaps, local_rl->rl_phys->rlp_snaps,
+ sizeof (uint64_t) * num_redact_snaps);
+ local_rl->rl_phys->rlp_num_snaps = num_redact_snaps;
+ if (bookmark_redacted) {
+ ASSERT3P(redaction_list, ==, NULL);
+ local_rl->rl_phys->rlp_last_blkid = UINT64_MAX;
+ local_rl->rl_phys->rlp_last_object = UINT64_MAX;
+ dsl_redaction_list_long_rele(local_rl, tag);
+ dsl_redaction_list_rele(local_rl, tag);
+ } else {
+ *redaction_list = local_rl;
+ }
+ }
+
+ if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
+ spa_feature_incr(dp->dp_spa,
+ SPA_FEATURE_BOOKMARK_WRITTEN, tx);
+ }
+
+ dsl_bookmark_node_add(bmark_fs, dbn, tx);
+
+ spa_history_log_internal_ds(bmark_fs, "bookmark", tx,
+ "name=%s creation_txg=%llu target_snap=%llu redact_obj=%llu",
+ shortname, (longlong_t)dbn->dbn_phys.zbm_creation_txg,
+ (longlong_t)snapds->ds_object,
+ (longlong_t)dbn->dbn_phys.zbm_redaction_obj);
+
+ dsl_dataset_rele(bmark_fs, FTAG);
+ dsl_dataset_rele(snapds, FTAG);
+}
+
+
+static void
+dsl_bookmark_create_sync_impl_book(
+ const char *new_name, const char *source_name, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *bmark_fs_source, *bmark_fs_new;
+ char *source_shortname, *new_shortname;
+ zfs_bookmark_phys_t source_phys;
+
+ VERIFY0(dsl_bookmark_hold_ds(dp, source_name, &bmark_fs_source, FTAG,
+ &source_shortname));
+ VERIFY0(dsl_bookmark_hold_ds(dp, new_name, &bmark_fs_new, FTAG,
+ &new_shortname));
+
+ /*
+ * create a copy of the source bookmark by copying most of its members
+ *
+ * Caveat: bookmarking a redaction bookmark yields a normal bookmark
+ * -----------------------------------------------------------------
+ * Reasoning:
+ * - The zbm_redaction_obj would be referred to by both source and new
+ * bookmark, but would be destroyed once either source or new is
+ * destroyed, resulting in use-after-free of the referred object.
+ * - User expectation when issuing the `zfs bookmark` command is that
+ * a normal bookmark of the source is created
+ *
+ * Design Alternatives For Full Redaction Bookmark Copying:
+ * - reference-count the redaction object => would require on-disk
+ * format change for existing redaction objects
+ * - Copy the redaction object => cannot be done in syncing context
+ * because the redaction object might be too large
+ */
+
+ VERIFY0(dsl_bookmark_lookup_impl(bmark_fs_source, source_shortname,
+ &source_phys));
+ dsl_bookmark_node_t *new_dbn = dsl_bookmark_node_alloc(new_shortname);
+
+ memcpy(&new_dbn->dbn_phys, &source_phys, sizeof (source_phys));
+ new_dbn->dbn_phys.zbm_redaction_obj = 0;
+
+ /* update feature counters */
+ if (new_dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
+ spa_feature_incr(dp->dp_spa,
+ SPA_FEATURE_BOOKMARK_WRITTEN, tx);
+ }
+ /* no need for redaction bookmark counter; nulled zbm_redaction_obj */
+ /* dsl_bookmark_node_add bumps bookmarks and v2-bookmarks counter */
+
+ /*
+ * write new bookmark
+ *
+ * Note that dsl_bookmark_lookup_impl guarantees that, if source is a
+ * v1 bookmark, the v2-only fields are zeroed.
+ * And dsl_bookmark_node_add writes back a v1-sized bookmark if
+ * v2 bookmarks are disabled and/or v2-only fields are zeroed.
+ * => bookmark copying works on pre-bookmark-v2 pools
+ */
+ dsl_bookmark_node_add(bmark_fs_new, new_dbn, tx);
+
+ spa_history_log_internal_ds(bmark_fs_source, "bookmark", tx,
+ "name=%s creation_txg=%llu source_guid=%llu",
+ new_shortname, (longlong_t)new_dbn->dbn_phys.zbm_creation_txg,
+ (longlong_t)source_phys.zbm_guid);
+
+ dsl_dataset_rele(bmark_fs_source, FTAG);
+ dsl_dataset_rele(bmark_fs_new, FTAG);
+}
+
+void
+dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_create_arg_t *dbca = arg;
+
+ ASSERT(spa_feature_is_enabled(dmu_tx_pool(tx)->dp_spa,
+ SPA_FEATURE_BOOKMARKS));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
+
+ char *new = nvpair_name(pair);
+ char *source = fnvpair_value_string(pair);
+
+ if (strchr(source, '@') != NULL) {
+ dsl_bookmark_create_sync_impl_snap(new, source, tx,
+ 0, NULL, NULL, NULL);
+ } else if (strchr(source, '#') != NULL) {
+ dsl_bookmark_create_sync_impl_book(new, source, tx);
+ } else {
+ panic("unreachable code");
+ }
+
+ }
+}
+
+/*
+ * The bookmarks must all be in the same pool.
+ */
+int
+dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors)
+{
+ nvpair_t *pair;
+ dsl_bookmark_create_arg_t dbca;
+
+ pair = nvlist_next_nvpair(bmarks, NULL);
+ if (pair == NULL)
+ return (0);
+
+ dbca.dbca_bmarks = bmarks;
+ dbca.dbca_errors = errors;
+
+ return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check,
+ dsl_bookmark_create_sync, &dbca,
+ fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL));
+}
+
+static int
+dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_create_redacted_arg_t *dbcra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int rv = 0;
+
+ if (!spa_feature_is_enabled(dp->dp_spa,
+ SPA_FEATURE_REDACTION_BOOKMARKS))
+ return (SET_ERROR(ENOTSUP));
+ /*
+ * If the list of redact snaps will not fit in the bonus buffer with
+ * the furthest reached object and offset, fail.
+ */
+ if (dbcra->dbcra_numsnaps > (dmu_bonus_max() -
+ sizeof (redaction_list_phys_t)) / sizeof (uint64_t))
+ return (SET_ERROR(E2BIG));
+
+ if (dsl_bookmark_create_nvl_validate_pair(
+ dbcra->dbcra_bmark, dbcra->dbcra_snap) != 0)
+ return (SET_ERROR(EINVAL));
+
+ rv = dsl_bookmark_create_check_impl(dp,
+ dbcra->dbcra_bmark, dbcra->dbcra_snap);
+ return (rv);
+}
+
+static void
+dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_create_redacted_arg_t *dbcra = arg;
+ dsl_bookmark_create_sync_impl_snap(dbcra->dbcra_bmark,
+ dbcra->dbcra_snap, tx, dbcra->dbcra_numsnaps, dbcra->dbcra_snaps,
+ dbcra->dbcra_tag, dbcra->dbcra_rl);
+}
+
+int
+dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot,
+ uint64_t numsnaps, uint64_t *snapguids, void *tag, redaction_list_t **rl)
+{
+ dsl_bookmark_create_redacted_arg_t dbcra;
+
+ dbcra.dbcra_bmark = bookmark;
+ dbcra.dbcra_snap = snapshot;
+ dbcra.dbcra_rl = rl;
+ dbcra.dbcra_numsnaps = numsnaps;
+ dbcra.dbcra_snaps = snapguids;
+ dbcra.dbcra_tag = tag;
+
+ return (dsl_sync_task(bookmark, dsl_bookmark_create_redacted_check,
+ dsl_bookmark_create_redacted_sync, &dbcra, 5,
+ ZFS_SPACE_CHECK_NORMAL));
+}
+
+/*
+ * Retrieve the list of properties given in the 'props' nvlist for a bookmark.
+ * If 'props' is NULL, retrieves all properties.
+ */
+static void
+dsl_bookmark_fetch_props(dsl_pool_t *dp, zfs_bookmark_phys_t *bmark_phys,
+ nvlist_t *props, nvlist_t *out_props)
+{
+ ASSERT3P(dp, !=, NULL);
+ ASSERT3P(bmark_phys, !=, NULL);
+ ASSERT3P(out_props, !=, NULL);
+ ASSERT(RRW_LOCK_HELD(&dp->dp_config_rwlock));
+
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_GUID))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_GUID, bmark_phys->zbm_guid);
+ }
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_CREATETXG))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_CREATETXG, bmark_phys->zbm_creation_txg);
+ }
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_CREATION))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_CREATION, bmark_phys->zbm_creation_time);
+ }
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_IVSET_GUID))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_IVSET_GUID, bmark_phys->zbm_ivset_guid);
+ }
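+	/*
+	 * The referenced-space properties below are only recorded for
+	 * bookmarks that were created with ZBM_FLAG_HAS_FBN set.
+	 */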
+ if (bmark_phys->zbm_flags & ZBM_FLAG_HAS_FBN) {
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_REFERENCED))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_REFERENCED,
+ bmark_phys->zbm_referenced_bytes_refd);
+ }
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_LOGICALREFERENCED))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_LOGICALREFERENCED,
+ bmark_phys->zbm_uncompressed_bytes_refd);
+ }
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_REFRATIO))) {
+ uint64_t ratio =
+ bmark_phys->zbm_compressed_bytes_refd == 0 ? 100 :
+ bmark_phys->zbm_uncompressed_bytes_refd * 100 /
+ bmark_phys->zbm_compressed_bytes_refd;
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_REFRATIO, ratio);
+ }
+ }
+
+ if ((props == NULL || nvlist_exists(props, "redact_snaps") ||
+ nvlist_exists(props, "redact_complete")) &&
+ bmark_phys->zbm_redaction_obj != 0) {
+ redaction_list_t *rl;
+ int err = dsl_redaction_list_hold_obj(dp,
+ bmark_phys->zbm_redaction_obj, FTAG, &rl);
+ if (err == 0) {
+ if (nvlist_exists(props, "redact_snaps")) {
+ nvlist_t *nvl;
+ nvl = fnvlist_alloc();
+ fnvlist_add_uint64_array(nvl, ZPROP_VALUE,
+ rl->rl_phys->rlp_snaps,
+ rl->rl_phys->rlp_num_snaps);
+ fnvlist_add_nvlist(out_props, "redact_snaps",
+ nvl);
+ nvlist_free(nvl);
+ }
+ if (nvlist_exists(props, "redact_complete")) {
+ nvlist_t *nvl;
+ nvl = fnvlist_alloc();
+ fnvlist_add_boolean_value(nvl, ZPROP_VALUE,
+ rl->rl_phys->rlp_last_blkid == UINT64_MAX &&
+ rl->rl_phys->rlp_last_object == UINT64_MAX);
+ fnvlist_add_nvlist(out_props, "redact_complete",
+ nvl);
+ nvlist_free(nvl);
+ }
+ dsl_redaction_list_rele(rl, FTAG);
+ }
+ }
+}
+
+int
+dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ if (dsl_dataset_is_snapshot(ds))
+ return (SET_ERROR(EINVAL));
+
+ for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks);
+ dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) {
+ nvlist_t *out_props = fnvlist_alloc();
+
+ dsl_bookmark_fetch_props(dp, &dbn->dbn_phys, props, out_props);
+
+ fnvlist_add_nvlist(outnvl, dbn->dbn_name, out_props);
+ fnvlist_free(out_props);
+ }
+ return (0);
+}
+
+/*
+ * Comparison func for the ds_bookmarks AVL tree. We sort the bookmarks by
+ * their TXG, then by their FBN-ness. The "FBN-ness" component ensures
+ * that all bookmarks at the same TXG that have HAS_FBN set are adjacent,
+ * which dsl_bookmark_destroy_sync_impl() depends on. Note that there may
+ * be multiple bookmarks at the same TXG (with the same FBN-ness); in that
+ * case we differentiate them by an arbitrary metric: their names.
+ */
+static int
+dsl_bookmark_compare(const void *l, const void *r)
+{
+ const dsl_bookmark_node_t *ldbn = l;
+ const dsl_bookmark_node_t *rdbn = r;
+
+ int64_t cmp = TREE_CMP(ldbn->dbn_phys.zbm_creation_txg,
+ rdbn->dbn_phys.zbm_creation_txg);
+ if (likely(cmp))
+ return (cmp);
+ cmp = TREE_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN),
+ (rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN));
+ if (likely(cmp))
+ return (cmp);
+ cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name);
+ return (TREE_ISIGN(cmp));
+}
+
+/*
+ * Cache this (head) dataset's bookmarks in the ds_bookmarks AVL tree.
+ */
+int
+dsl_bookmark_init_ds(dsl_dataset_t *ds)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ ASSERT(!ds->ds_is_snapshot);
+
+ avl_create(&ds->ds_bookmarks, dsl_bookmark_compare,
+ sizeof (dsl_bookmark_node_t),
+ offsetof(dsl_bookmark_node_t, dbn_node));
+
+ if (!dsl_dataset_is_zapified(ds))
+ return (0);
+
+ int zaperr = zap_lookup(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES,
+ sizeof (ds->ds_bookmarks_obj), 1, &ds->ds_bookmarks_obj);
+ if (zaperr == ENOENT)
+ return (0);
+ if (zaperr != 0)
+ return (zaperr);
+
+ if (ds->ds_bookmarks_obj == 0)
+ return (0);
+
+ int err = 0;
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+
+ for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
+ (err = zap_cursor_retrieve(&zc, &attr)) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_bookmark_node_t *dbn =
+ dsl_bookmark_node_alloc(attr.za_name);
+
+ err = dsl_bookmark_lookup_impl(ds,
+ dbn->dbn_name, &dbn->dbn_phys);
+ ASSERT3U(err, !=, ENOENT);
+ if (err != 0) {
+ kmem_free(dbn, sizeof (*dbn));
+ break;
+ }
+ avl_add(&ds->ds_bookmarks, dbn);
+ }
+ zap_cursor_fini(&zc);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
+void
+dsl_bookmark_fini_ds(dsl_dataset_t *ds)
+{
+ void *cookie = NULL;
+ dsl_bookmark_node_t *dbn;
+
+ if (ds->ds_is_snapshot)
+ return;
+
+ while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) {
+ spa_strfree(dbn->dbn_name);
+ mutex_destroy(&dbn->dbn_lock);
+ kmem_free(dbn, sizeof (*dbn));
+ }
+ avl_destroy(&ds->ds_bookmarks);
+}
+
+/*
+ * Retrieve the bookmarks that exist in the specified dataset, and the
+ * requested properties of each bookmark.
+ *
+ * The "props" nvlist specifies which properties are requested.
+ * See lzc_get_bookmarks() for the list of valid properties.
+ */
+int
+dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_pool_hold(dsname, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ err = dsl_get_bookmarks_impl(ds, props, outnvl);
+
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+}
+
+/*
+ * Retrieve all properties for a single bookmark in the given dataset.
+ */
+int
+dsl_get_bookmark_props(const char *dsname, const char *bmname, nvlist_t *props)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ zfs_bookmark_phys_t bmark_phys = { 0 };
+ int err;
+
+ err = dsl_pool_hold(dsname, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ err = dsl_bookmark_lookup_impl(ds, bmname, &bmark_phys);
+ if (err != 0)
+ goto out;
+
+ dsl_bookmark_fetch_props(dp, &bmark_phys, NULL, props);
+out:
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+}
+
+typedef struct dsl_bookmark_destroy_arg {
+ nvlist_t *dbda_bmarks;
+ nvlist_t *dbda_success;
+ nvlist_t *dbda_errors;
+} dsl_bookmark_destroy_arg_t;
+
+static void
+dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name,
+ dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t bmark_zapobj = ds->ds_bookmarks_obj;
+ matchtype_t mt = 0;
+ uint64_t int_size, num_ints;
+ /*
+	 * 'search' must be zeroed so that dbn_phys.zbm_flags (which is used
+	 * in dsl_bookmark_compare()) will be zeroed even if the on-disk
+	 * (in ZAP) bookmark is shorter than
+	 * offsetof(zfs_bookmark_phys_t, zbm_flags).
+ */
+ dsl_bookmark_node_t search = { 0 };
+ char realname[ZFS_MAX_DATASET_NAME_LEN];
+
+ /*
+ * Find the real name of this bookmark, which may be different
+ * from the given name if the dataset is case-insensitive. Then
+ * use the real name to find the node in the ds_bookmarks AVL tree.
+ */
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ VERIFY0(zap_length(mos, bmark_zapobj, name, &int_size, &num_ints));
+
+ ASSERT3U(int_size, ==, sizeof (uint64_t));
+
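+	/*
+	 * A bookmark stored with more than the v1 payload counted against
+	 * the BOOKMARK_V2 feature when it was created, so release that
+	 * feature reference here.
+	 */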
+ if (num_ints * int_size > BOOKMARK_PHYS_SIZE_V1) {
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_BOOKMARK_V2, tx);
+ }
+ VERIFY0(zap_lookup_norm(mos, bmark_zapobj, name, sizeof (uint64_t),
+ num_ints, &search.dbn_phys, mt, realname, sizeof (realname), NULL));
+
+ search.dbn_name = realname;
+ dsl_bookmark_node_t *dbn = avl_find(&ds->ds_bookmarks, &search, NULL);
+ ASSERT(dbn != NULL);
+
+ if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
+ /*
+ * If this bookmark HAS_FBN, and it is before the most
+ * recent snapshot, then its TXG is a key in the head's
+ * deadlist (and all clones' heads' deadlists). If this is
+ * the last thing keeping the key (i.e. there are no more
+ * bookmarks with HAS_FBN at this TXG, and there is no
+ * snapshot at this TXG), then remove the key.
+ *
+ * Note that this algorithm depends on ds_bookmarks being
+ * sorted such that all bookmarks at the same TXG with
+ * HAS_FBN are adjacent (with no non-HAS_FBN bookmarks
+ * at the same TXG in between them). If this were not
+ * the case, we would need to examine *all* bookmarks
+ * at this TXG, rather than just the adjacent ones.
+ */
+
+ dsl_bookmark_node_t *dbn_prev =
+ AVL_PREV(&ds->ds_bookmarks, dbn);
+ dsl_bookmark_node_t *dbn_next =
+ AVL_NEXT(&ds->ds_bookmarks, dbn);
+
+ boolean_t more_bookmarks_at_this_txg =
+ (dbn_prev != NULL && dbn_prev->dbn_phys.zbm_creation_txg ==
+ dbn->dbn_phys.zbm_creation_txg &&
+ (dbn_prev->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) ||
+ (dbn_next != NULL && dbn_next->dbn_phys.zbm_creation_txg ==
+ dbn->dbn_phys.zbm_creation_txg &&
+ (dbn_next->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN));
+
+ if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS) &&
+ !more_bookmarks_at_this_txg &&
+ dbn->dbn_phys.zbm_creation_txg <
+ dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ dsl_dir_remove_clones_key(ds->ds_dir,
+ dbn->dbn_phys.zbm_creation_txg, tx);
+ dsl_deadlist_remove_key(&ds->ds_deadlist,
+ dbn->dbn_phys.zbm_creation_txg, tx);
+ }
+
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_BOOKMARK_WRITTEN, tx);
+ }
+
+ if (dbn->dbn_phys.zbm_redaction_obj != 0) {
+ VERIFY0(dmu_object_free(mos,
+ dbn->dbn_phys.zbm_redaction_obj, tx));
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_REDACTION_BOOKMARKS, tx);
+ }
+
+ avl_remove(&ds->ds_bookmarks, dbn);
+ spa_strfree(dbn->dbn_name);
+ mutex_destroy(&dbn->dbn_lock);
+ kmem_free(dbn, sizeof (*dbn));
+
+ VERIFY0(zap_remove_norm(mos, bmark_zapobj, name, mt, tx));
+}
+
+static int
+dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_destroy_arg_t *dbda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int rv = 0;
+
+ ASSERT(nvlist_empty(dbda->dbda_success));
+ ASSERT(nvlist_empty(dbda->dbda_errors));
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
+ return (0);
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) {
+ const char *fullname = nvpair_name(pair);
+ dsl_dataset_t *ds;
+ zfs_bookmark_phys_t bm;
+ int error;
+ char *shortname;
+
+ error = dsl_bookmark_hold_ds(dp, fullname, &ds,
+ FTAG, &shortname);
+ if (error == ENOENT) {
+ /* ignore it; the bookmark is "already destroyed" */
+ continue;
+ }
+ if (error == 0) {
+ error = dsl_bookmark_lookup_impl(ds, shortname, &bm);
+ dsl_dataset_rele(ds, FTAG);
+ if (error == ESRCH) {
+ /*
+ * ignore it; the bookmark is
+ * "already destroyed"
+ */
+ continue;
+ }
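+			/*
+			 * A redaction bookmark cannot be destroyed while its
+			 * redaction list has long holds (e.g. while it is
+			 * being used by an in-progress redacted send).
+			 */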
+ if (error == 0 && bm.zbm_redaction_obj != 0) {
+ redaction_list_t *rl = NULL;
+ error = dsl_redaction_list_hold_obj(tx->tx_pool,
+ bm.zbm_redaction_obj, FTAG, &rl);
+ if (error == ENOENT) {
+ error = 0;
+ } else if (error == 0 &&
+ dsl_redaction_list_long_held(rl)) {
+ error = SET_ERROR(EBUSY);
+ }
+ if (rl != NULL) {
+ dsl_redaction_list_rele(rl, FTAG);
+ }
+ }
+ }
+ if (error == 0) {
+ if (dmu_tx_is_syncing(tx)) {
+ fnvlist_add_boolean(dbda->dbda_success,
+ fullname);
+ }
+ } else {
+ fnvlist_add_int32(dbda->dbda_errors, fullname, error);
+ rv = error;
+ }
+ }
+ return (rv);
+}
+
+static void
+dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_destroy_arg_t *dbda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) {
+ dsl_dataset_t *ds;
+ char *shortname;
+ uint64_t zap_cnt;
+
+ VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
+ &ds, FTAG, &shortname));
+ dsl_bookmark_destroy_sync_impl(ds, shortname, tx);
+
+ /*
+ * If all of this dataset's bookmarks have been destroyed,
+ * free the zap object and decrement the feature's use count.
+ */
+ VERIFY0(zap_count(mos, ds->ds_bookmarks_obj, &zap_cnt));
+ if (zap_cnt == 0) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx));
+ ds->ds_bookmarks_obj = 0;
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+ VERIFY0(zap_remove(mos, ds->ds_object,
+ DS_FIELD_BOOKMARK_NAMES, tx));
+ }
+
+ spa_history_log_internal_ds(ds, "remove bookmark", tx,
+ "name=%s", shortname);
+
+ dsl_dataset_rele(ds, FTAG);
+ }
+}
+
+/*
+ * The bookmarks must all be in the same pool.
+ */
+int
+dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors)
+{
+ int rv;
+ dsl_bookmark_destroy_arg_t dbda;
+ nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
+ if (pair == NULL)
+ return (0);
+
+ dbda.dbda_bmarks = bmarks;
+ dbda.dbda_errors = errors;
+ dbda.dbda_success = fnvlist_alloc();
+
+ rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check,
+ dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks),
+ ZFS_SPACE_CHECK_RESERVED);
+ fnvlist_free(dbda.dbda_success);
+ return (rv);
+}
+
+/* Return B_TRUE if there are any long holds on this redaction list. */
+boolean_t
+dsl_redaction_list_long_held(redaction_list_t *rl)
+{
+ return (!zfs_refcount_is_zero(&rl->rl_longholds));
+}
+
+void
+dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, void *tag)
+{
+ ASSERT(dsl_pool_config_held(dp));
+ (void) zfs_refcount_add(&rl->rl_longholds, tag);
+}
+
+void
+dsl_redaction_list_long_rele(redaction_list_t *rl, void *tag)
+{
+ (void) zfs_refcount_remove(&rl->rl_longholds, tag);
+}
+
+/* ARGSUSED */
+static void
+redaction_list_evict_sync(void *rlu)
+{
+ redaction_list_t *rl = rlu;
+ zfs_refcount_destroy(&rl->rl_longholds);
+
+ kmem_free(rl, sizeof (redaction_list_t));
+}
+
+void
+dsl_redaction_list_rele(redaction_list_t *rl, void *tag)
+{
+ dmu_buf_rele(rl->rl_dbuf, tag);
+}
+
+int
+dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag,
+ redaction_list_t **rlp)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ dmu_buf_t *dbuf;
+ redaction_list_t *rl;
+ int err;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dmu_bonus_hold(mos, rlobj, tag, &dbuf);
+ if (err != 0)
+ return (err);
+
+ rl = dmu_buf_get_user(dbuf);
+ if (rl == NULL) {
+ redaction_list_t *winner = NULL;
+
+ rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP);
+ rl->rl_dbuf = dbuf;
+ rl->rl_object = rlobj;
+ rl->rl_phys = dbuf->db_data;
+ rl->rl_mos = dp->dp_meta_objset;
+ zfs_refcount_create(&rl->rl_longholds);
+ dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL,
+ &rl->rl_dbuf);
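+		/*
+		 * Attach our in-core state to the bonus buffer. If another
+		 * thread raced us and attached one first, free ours and use
+		 * the winner's.
+		 */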
+ if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) {
+ kmem_free(rl, sizeof (*rl));
+ rl = winner;
+ }
+ }
+ *rlp = rl;
+ return (0);
+}
+
+/*
+ * Snapshot ds is being destroyed.
+ *
+ * Adjust the "freed_before_next" of any bookmarks between this snap
+ * and the previous snapshot, because their "next snapshot" is changing.
+ *
+ * If there are any bookmarks with HAS_FBN at this snapshot, remove
+ * their HAS_SNAP flag (note: there can be at most one snapshot of
+ * each filesystem at a given txg), and return B_TRUE. In this case
+ * the caller can not remove the key in the deadlist at this TXG, because
+ * the HAS_FBN bookmarks require the key be there.
+ *
+ * Returns B_FALSE if there are no bookmarks with HAS_FBN at this
+ * snapshot's TXG. In this case the caller can remove the key in the
+ * deadlist at this TXG.
+ */
+boolean_t
+dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ dsl_dataset_t *head, *next;
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &head));
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &next));
+
+ /*
+ * Find the first bookmark that HAS_FBN at or after the
+ * previous snapshot.
+ */
+ dsl_bookmark_node_t search = { 0 };
+ avl_index_t idx;
+ search.dbn_phys.zbm_creation_txg =
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN;
+ /*
+ * The empty-string name can't be in the AVL, and it compares
+ * before any entries with this TXG.
+ */
+ search.dbn_name = "";
+ VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL);
+ dsl_bookmark_node_t *dbn =
+ avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER);
+
+ /*
+ * Iterate over all bookmarks that are at or after the previous
+ * snapshot, and before this (being deleted) snapshot. Adjust
+ * their FBN based on their new next snapshot.
+ */
+ for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg <
+ dsl_dataset_phys(ds)->ds_creation_txg;
+ dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
+ if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN))
+ continue;
+ /*
+ * Increase our FBN by the amount of space that was live
+ * (referenced) at the time of this bookmark (i.e.
+ * birth <= zbm_creation_txg), and killed between this
+ * (being deleted) snapshot and the next snapshot (i.e.
+		 * on the next snapshot's deadlist). (Space killed before
+		 * this (being deleted) snapshot is already on our FBN.)
+ */
+ uint64_t referenced, compressed, uncompressed;
+ dsl_deadlist_space_range(&next->ds_deadlist,
+ 0, dbn->dbn_phys.zbm_creation_txg,
+ &referenced, &compressed, &uncompressed);
+ dbn->dbn_phys.zbm_referenced_freed_before_next_snap +=
+ referenced;
+ dbn->dbn_phys.zbm_compressed_freed_before_next_snap +=
+ compressed;
+ dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap +=
+ uncompressed;
+ VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
+ dbn->dbn_name, sizeof (uint64_t),
+ sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
+ &dbn->dbn_phys, tx));
+ }
+ dsl_dataset_rele(next, FTAG);
+
+ /*
+ * There may be several bookmarks at this txg (the TXG of the
+ * snapshot being deleted). We need to clear the SNAPSHOT_EXISTS
+ * flag on all of them, and return TRUE if there is at least 1
+ * bookmark here with HAS_FBN (thus preventing the deadlist
+ * key from being removed).
+ */
+ boolean_t rv = B_FALSE;
+ for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg ==
+ dsl_dataset_phys(ds)->ds_creation_txg;
+ dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
+ if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
+ ASSERT(!(dbn->dbn_phys.zbm_flags &
+ ZBM_FLAG_SNAPSHOT_EXISTS));
+ continue;
+ }
+ ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS);
+ dbn->dbn_phys.zbm_flags &= ~ZBM_FLAG_SNAPSHOT_EXISTS;
+ VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
+ dbn->dbn_name, sizeof (uint64_t),
+ sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
+ &dbn->dbn_phys, tx));
+ rv = B_TRUE;
+ }
+ dsl_dataset_rele(head, FTAG);
+ return (rv);
+}
+
+/*
+ * A snapshot is being created of this (head) dataset.
+ *
+ * We don't keep keys in the deadlist for the most recent snapshot, or any
+ * bookmarks at or after it, because there can't be any blocks on the
+ * deadlist in this range. Now that the most recent snapshot is after
+ * all bookmarks, we need to add these keys. Note that the caller always
+ * adds a key at the previous snapshot, so we only add keys for bookmarks
+ * after that.
+ */
+void
+dsl_bookmark_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t last_key_added = UINT64_MAX;
+ for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
+ dbn != NULL && dbn->dbn_phys.zbm_creation_txg >
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
+ uint64_t creation_txg = dbn->dbn_phys.zbm_creation_txg;
+ ASSERT3U(creation_txg, <=, last_key_added);
+ /*
+ * Note, there may be multiple bookmarks at this TXG,
+ * and we only want to add the key for this TXG once.
+ * The ds_bookmarks AVL is sorted by TXG, so we will visit
+ * these bookmarks in sequence.
+ */
+ if ((dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) &&
+ creation_txg != last_key_added) {
+ dsl_deadlist_add_key(&ds->ds_deadlist,
+ creation_txg, tx);
+ last_key_added = creation_txg;
+ }
+ }
+}
+
+/*
+ * The next snapshot of the origin dataset has changed, due to
+ * promote or clone swap. If there are any bookmarks at this dataset,
+ * we need to update their zbm_*_freed_before_next_snap to reflect this.
+ * The head dataset has the relevant bookmarks in ds_bookmarks.
+ */
+void
+dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ /*
+ * Find the first bookmark that HAS_FBN at the origin snapshot.
+ */
+ dsl_bookmark_node_t search = { 0 };
+ avl_index_t idx;
+ search.dbn_phys.zbm_creation_txg =
+ dsl_dataset_phys(origin)->ds_creation_txg;
+ search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN;
+ /*
+ * The empty-string name can't be in the AVL, and it compares
+ * before any entries with this TXG.
+ */
+ search.dbn_name = "";
+ VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL);
+ dsl_bookmark_node_t *dbn =
+ avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER);
+
+ /*
+ * Iterate over all bookmarks that are at the origin txg.
+ * Adjust their FBN based on their new next snapshot.
+ */
+ for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg ==
+ dsl_dataset_phys(origin)->ds_creation_txg &&
+ (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN);
+ dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
+
+ /*
+ * Bookmark is at the origin, therefore its
+ * "next dataset" is changing, so we need
+ * to reset its FBN by recomputing it in
+ * dsl_bookmark_set_phys().
+ */
+ ASSERT3U(dbn->dbn_phys.zbm_guid, ==,
+ dsl_dataset_phys(origin)->ds_guid);
+ ASSERT3U(dbn->dbn_phys.zbm_referenced_bytes_refd, ==,
+ dsl_dataset_phys(origin)->ds_referenced_bytes);
+ ASSERT(dbn->dbn_phys.zbm_flags &
+ ZBM_FLAG_SNAPSHOT_EXISTS);
+ /*
+ * Save and restore the zbm_redaction_obj, which
+ * is zeroed by dsl_bookmark_set_phys().
+ */
+ uint64_t redaction_obj =
+ dbn->dbn_phys.zbm_redaction_obj;
+ dsl_bookmark_set_phys(&dbn->dbn_phys, origin);
+ dbn->dbn_phys.zbm_redaction_obj = redaction_obj;
+
+ VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
+ dbn->dbn_name, sizeof (uint64_t),
+ sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
+ &dbn->dbn_phys, tx));
+ }
+}
+
+/*
+ * This block is no longer referenced by this (head) dataset.
+ *
+ * Adjust the FBN of any bookmarks that reference this block, whose "next"
+ * is the head dataset.
+ */
+/* ARGSUSED */
+void
+dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ /*
+ * Iterate over bookmarks whose "next" is the head dataset.
+ */
+ for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
+ dbn != NULL && dbn->dbn_phys.zbm_creation_txg >=
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
+ /*
+ * If the block was live (referenced) at the time of this
+ * bookmark, add its space to the bookmark's FBN.
+ */
+ if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg &&
+ (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
+ mutex_enter(&dbn->dbn_lock);
+ dbn->dbn_phys.zbm_referenced_freed_before_next_snap +=
+ bp_get_dsize_sync(dsl_dataset_get_spa(ds), bp);
+ dbn->dbn_phys.zbm_compressed_freed_before_next_snap +=
+ BP_GET_PSIZE(bp);
+ dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap +=
+ BP_GET_UCSIZE(bp);
+ /*
+ * Changing the ZAP object here would be too
+ * expensive. Also, we may be called from the zio
+ * interrupt thread, which can't block on i/o.
+ * Therefore, we mark this bookmark as dirty and
+ * modify the ZAP once per txg, in
+ * dsl_bookmark_sync_done().
+ */
+ dbn->dbn_dirty = B_TRUE;
+ mutex_exit(&dbn->dbn_lock);
+ }
+ }
+}
+
+void
+dsl_bookmark_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ if (dsl_dataset_is_snapshot(ds))
+ return;
+
+ /*
+ * We only dirty bookmarks that are at or after the most recent
+ * snapshot. We can't create snapshots between
+ * dsl_bookmark_block_killed() and dsl_bookmark_sync_done(), so we
+ * don't need to look at any bookmarks before ds_prev_snap_txg.
+ */
+ for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
+ dbn != NULL && dbn->dbn_phys.zbm_creation_txg >=
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
+ if (dbn->dbn_dirty) {
+ /*
+ * We only dirty nodes with HAS_FBN, therefore
+ * we can always use the current bookmark struct size.
+ */
+ ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN);
+ VERIFY0(zap_update(dp->dp_meta_objset,
+ ds->ds_bookmarks_obj,
+ dbn->dbn_name, sizeof (uint64_t),
+ sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
+ &dbn->dbn_phys, tx));
+ dbn->dbn_dirty = B_FALSE;
+ }
+ }
+#ifdef ZFS_DEBUG
+ for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks);
+ dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) {
+ ASSERT(!dbn->dbn_dirty);
+ }
+#endif
+}
+
+/*
+ * Return the TXG of the most recent bookmark (or 0 if there are no bookmarks).
+ */
+uint64_t
+dsl_bookmark_latest_txg(dsl_dataset_t *ds)
+{
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+ dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
+ if (dbn == NULL)
+ return (0);
+ return (dbn->dbn_phys.zbm_creation_txg);
+}
+
+/*
+ * Compare the redact_block_phys_t to the bookmark. If the last block in the
+ * redact_block_phys_t is before the bookmark, return -1. If the first block in
+ * the redact_block_phys_t is after the bookmark, return 1. Otherwise, the
+ * bookmark is inside the range of the redact_block_phys_t, and we return 0.
+ */
+static int
+redact_block_zb_compare(redact_block_phys_t *first,
+ zbookmark_phys_t *second)
+{
+ /*
+ * If the block_phys is for a previous object, or the last block in the
+ * block_phys is strictly before the block in the bookmark, the
+ * block_phys is earlier.
+ */
+ if (first->rbp_object < second->zb_object ||
+ (first->rbp_object == second->zb_object &&
+ first->rbp_blkid + (redact_block_get_count(first) - 1) <
+ second->zb_blkid)) {
+ return (-1);
+ }
+
+ /*
+ * If the bookmark is for a previous object, or the block in the
+ * bookmark is strictly before the first block in the block_phys, the
+ * bookmark is earlier.
+ */
+ if (first->rbp_object > second->zb_object ||
+ (first->rbp_object == second->zb_object &&
+ first->rbp_blkid > second->zb_blkid)) {
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Traverse the redaction list in the provided object, and call the callback for
+ * each entry we find. Don't call the callback for any records before resume.
+ */
+int
+dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
+ rl_traverse_callback_t cb, void *arg)
+{
+ objset_t *mos = rl->rl_mos;
+ int err = 0;
+
+ if (rl->rl_phys->rlp_last_object != UINT64_MAX ||
+ rl->rl_phys->rlp_last_blkid != UINT64_MAX) {
+ /*
+ * When we finish a send, we update the last object and offset
+ * to UINT64_MAX. If a send fails partway through, the last
+ * object and offset will have some other value, indicating how
+ * far the send got. The redaction list must be complete before
+ * it can be traversed, so return EINVAL if the last object and
+ * blkid are not set to UINT64_MAX.
+ */
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * This allows us to skip the binary search and resume checking logic
+ * below, if we're not resuming a redacted send.
+ */
+ if (ZB_IS_ZERO(resume))
+ resume = NULL;
+
+ /*
+ * Binary search for the point to resume from.
+ */
+ uint64_t maxidx = rl->rl_phys->rlp_num_entries - 1;
+ uint64_t minidx = 0;
+ while (resume != NULL && maxidx > minidx) {
+ redact_block_phys_t rbp = { 0 };
+ ASSERT3U(maxidx, >, minidx);
+ uint64_t mididx = minidx + ((maxidx - minidx) / 2);
+ err = dmu_read(mos, rl->rl_object, mididx * sizeof (rbp),
+ sizeof (rbp), &rbp, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ break;
+
+ int cmp = redact_block_zb_compare(&rbp, resume);
+
+ if (cmp == 0) {
+ minidx = mididx;
+ break;
+ } else if (cmp > 0) {
+ maxidx =
+ (mididx == minidx ? minidx : mididx - 1);
+ } else {
+ minidx = mididx + 1;
+ }
+ }
+
+ unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE;
+ redact_block_phys_t *buf = zio_data_buf_alloc(bufsize);
+
+ unsigned int entries_per_buf = bufsize / sizeof (redact_block_phys_t);
+ uint64_t start_block = minidx / entries_per_buf;
+ err = dmu_read(mos, rl->rl_object, start_block * bufsize, bufsize, buf,
+ DMU_READ_PREFETCH);
+
+ for (uint64_t curidx = minidx;
+ err == 0 && curidx < rl->rl_phys->rlp_num_entries;
+ curidx++) {
+ /*
+ * We read in the redaction list one block at a time. Once we
+ * finish with all the entries in a given block, we read in a
+ * new one. The predictive prefetcher will take care of any
+ * prefetching, and this code shouldn't be the bottleneck, so we
+ * don't need to do manual prefetching.
+ */
+ if (curidx % entries_per_buf == 0) {
+ err = dmu_read(mos, rl->rl_object, curidx *
+ sizeof (*buf), bufsize, buf,
+ DMU_READ_PREFETCH);
+ if (err != 0)
+ break;
+ }
+ redact_block_phys_t *rb = &buf[curidx % entries_per_buf];
+ /*
+ * If resume is non-null, we should either not send the data, or
+ * null out resume so we don't have to keep doing these
+ * comparisons.
+ */
+ if (resume != NULL) {
+ /*
+			 * It is possible that after the binary search we got
+			 * a record before the resume point. There are two
+			 * cases where this can occur. If the record is the
+			 * last redaction record, and the resume point is
+			 * after the end of the redacted data, curidx will be
+			 * the last redaction record and the loop will end
+			 * after this iteration. The second case is when the
+			 * resume point lies between two redaction records;
+			 * the binary search can return either the record
+			 * before or after the resume point, and the next
+			 * iteration will be past the resume point.
+ */
+ if (redact_block_zb_compare(rb, resume) < 0) {
+ ASSERT3U(curidx, ==, minidx);
+ continue;
+ } else {
+ /*
+ * If the place to resume is in the middle of
+ * the range described by this
+ * redact_block_phys, then modify the
+ * redact_block_phys in memory so we generate
+ * the right records.
+ */
+ if (resume->zb_object == rb->rbp_object &&
+ resume->zb_blkid > rb->rbp_blkid) {
+ uint64_t diff = resume->zb_blkid -
+ rb->rbp_blkid;
+ rb->rbp_blkid = resume->zb_blkid;
+ redact_block_set_count(rb,
+ redact_block_get_count(rb) - diff);
+ }
+ resume = NULL;
+ }
+ }
+
+ if (cb(rb, arg) != 0) {
+ err = EINTR;
+ break;
+ }
+ }
+
+ zio_data_buf_free(buf, bufsize);
+ return (err);
+}
diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
new file mode 100644
index 000000000000..e38ec0cae827
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
@@ -0,0 +1,2863 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/dsl_crypt.h>
+#include <sys/dsl_pool.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/zvol.h>
+
+/*
+ * This file's primary purpose is for managing master encryption keys in
+ * memory and on disk. For more info on how these keys are used, see the
+ * block comment in zio_crypt.c.
+ *
+ * All master keys are stored encrypted on disk in the form of the DSL
+ * Crypto Key ZAP object. The binary key data in this object is always
+ * randomly generated and is encrypted with the user's wrapping key. This
+ * layer of indirection allows the user to change their key without
+ * needing to re-encrypt the entire dataset. The ZAP also holds on to the
+ * (non-encrypted) encryption algorithm identifier, IV, and MAC needed to
+ * safely decrypt the master key. For more info on the user's key see the
+ * block comment in libzfs_crypto.c
+ *
+ * In-memory encryption keys are managed through the spa_keystore. The
+ * keystore consists of 3 AVL trees, which are as follows:
+ *
+ * The Wrapping Key Tree:
+ * The wrapping key (wkey) tree stores the user's keys that are fed into the
+ * kernel through 'zfs load-key' and related commands. Datasets inherit their
+ * parent's wkey by default, so these structures are refcounted. The wrapping
+ * keys remain in memory until they are explicitly unloaded (with
+ * "zfs unload-key"). Unloading is only possible when no datasets are using
+ * them (refcount=0).
+ *
+ * The DSL Crypto Key Tree:
+ * The DSL Crypto Keys (DCK) are the in-memory representation of decrypted
+ * master keys. They are used by the functions in zio_crypt.c to perform
+ * encryption, decryption, and authentication. Snapshots and clones of a given
+ * dataset will share a DSL Crypto Key, so they are also refcounted. Once the
+ * refcount on a key hits zero, it is immediately zeroed out and freed.
+ *
+ * The Crypto Key Mapping Tree:
+ * The zio layer needs to look up master keys by their dataset object id. Since
+ * the DSL Crypto Keys can belong to multiple datasets, we maintain a tree of
+ * dsl_key_mapping_t's which essentially just map the dataset object id to its
+ * appropriate DSL Crypto Key. The management for creating and destroying these
+ * mappings hooks into the code for owning and disowning datasets. Usually,
+ * there will only be one active dataset owner, but there are times
+ * (particularly during dataset creation and destruction) when this may not be
+ * true or the dataset may not be initialized enough to own. As a result, this
+ * object is also refcounted.
+ */
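+
+/*
+ * A rough sketch of the relationships described above (field lists are
+ * abbreviated, not complete definitions):
+ *
+ *	spa_keystore
+ *	    sk_wkeys        AVL of dsl_wrapping_key_t, keyed by the
+ *	                    encryption root's ddobj (wk_ddobj)
+ *	    sk_dsl_keys     AVL of dsl_crypto_key_t, keyed by the DSL
+ *	                    Crypto Key ZAP object (dck_obj)
+ *	    sk_key_mappings AVL of dsl_key_mapping_t, keyed by dataset
+ *	                    object id (km_dsobj)
+ *
+ * Each dsl_crypto_key_t holds a reference on its dsl_wrapping_key_t, and
+ * each dsl_key_mapping_t holds a reference on its dsl_crypto_key_t.
+ */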
+
+/*
+ * This tunable allows datasets to be raw received even if the stream does
+ * not include IVset guids or if the guids don't match. This is used as part
+ * of the resolution for ZPOOL_ERRATA_ZOL_8308_ENCRYPTION.
+ */
+int zfs_disable_ivset_guid_check = 0;
+
+static void
+dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, void *tag)
+{
+ (void) zfs_refcount_add(&wkey->wk_refcnt, tag);
+}
+
+static void
+dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, void *tag)
+{
+ (void) zfs_refcount_remove(&wkey->wk_refcnt, tag);
+}
+
+static void
+dsl_wrapping_key_free(dsl_wrapping_key_t *wkey)
+{
+ ASSERT0(zfs_refcount_count(&wkey->wk_refcnt));
+
+ if (wkey->wk_key.ck_data) {
+ bzero(wkey->wk_key.ck_data,
+ CRYPTO_BITS2BYTES(wkey->wk_key.ck_length));
+ kmem_free(wkey->wk_key.ck_data,
+ CRYPTO_BITS2BYTES(wkey->wk_key.ck_length));
+ }
+
+ zfs_refcount_destroy(&wkey->wk_refcnt);
+ kmem_free(wkey, sizeof (dsl_wrapping_key_t));
+}
+
+static void
+dsl_wrapping_key_create(uint8_t *wkeydata, zfs_keyformat_t keyformat,
+ uint64_t salt, uint64_t iters, dsl_wrapping_key_t **wkey_out)
+{
+ dsl_wrapping_key_t *wkey;
+
+ /* allocate the wrapping key */
+ wkey = kmem_alloc(sizeof (dsl_wrapping_key_t), KM_SLEEP);
+
+ /* allocate and initialize the underlying crypto key */
+ wkey->wk_key.ck_data = kmem_alloc(WRAPPING_KEY_LEN, KM_SLEEP);
+
+ wkey->wk_key.ck_format = CRYPTO_KEY_RAW;
+ wkey->wk_key.ck_length = CRYPTO_BYTES2BITS(WRAPPING_KEY_LEN);
+ bcopy(wkeydata, wkey->wk_key.ck_data, WRAPPING_KEY_LEN);
+
+ /* initialize the rest of the struct */
+ zfs_refcount_create(&wkey->wk_refcnt);
+ wkey->wk_keyformat = keyformat;
+ wkey->wk_salt = salt;
+ wkey->wk_iters = iters;
+
+ *wkey_out = wkey;
+}
+
+int
+dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props,
+ nvlist_t *crypto_args, dsl_crypto_params_t **dcp_out)
+{
+ int ret;
+ uint64_t crypt = ZIO_CRYPT_INHERIT;
+ uint64_t keyformat = ZFS_KEYFORMAT_NONE;
+ uint64_t salt = 0, iters = 0;
+ dsl_crypto_params_t *dcp = NULL;
+ dsl_wrapping_key_t *wkey = NULL;
+ uint8_t *wkeydata = NULL;
+ uint_t wkeydata_len = 0;
+ char *keylocation = NULL;
+
+ dcp = kmem_zalloc(sizeof (dsl_crypto_params_t), KM_SLEEP);
+ dcp->cp_cmd = cmd;
+
+ /* get relevant arguments from the nvlists */
+ if (props != NULL) {
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_ENCRYPTION), &crypt);
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &keyformat);
+ (void) nvlist_lookup_string(props,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation);
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), &salt);
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &iters);
+
+ dcp->cp_crypt = crypt;
+ }
+
+ if (crypto_args != NULL) {
+ (void) nvlist_lookup_uint8_array(crypto_args, "wkeydata",
+ &wkeydata, &wkeydata_len);
+ }
+
+ /* check for valid command */
+ if (dcp->cp_cmd >= DCP_CMD_MAX) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ } else {
+ dcp->cp_cmd = cmd;
+ }
+
+ /* check for valid crypt */
+ if (dcp->cp_crypt >= ZIO_CRYPT_FUNCTIONS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ } else {
+ dcp->cp_crypt = crypt;
+ }
+
+ /* check for valid keyformat */
+ if (keyformat >= ZFS_KEYFORMAT_FORMATS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check for a valid keylocation (of any kind) and copy it in */
+ if (keylocation != NULL) {
+ if (!zfs_prop_valid_keylocation(keylocation, B_FALSE)) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ dcp->cp_keylocation = spa_strdup(keylocation);
+ }
+
+ /* check wrapping key length, if given */
+ if (wkeydata != NULL && wkeydata_len != WRAPPING_KEY_LEN) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* if the user asked for the default crypt, determine that now */
+ if (dcp->cp_crypt == ZIO_CRYPT_ON)
+ dcp->cp_crypt = ZIO_CRYPT_ON_VALUE;
+
+ /* create the wrapping key from the raw data */
+ if (wkeydata != NULL) {
+ /* create the wrapping key with the verified parameters */
+ dsl_wrapping_key_create(wkeydata, keyformat, salt,
+ iters, &wkey);
+ dcp->cp_wkey = wkey;
+ }
+
+ /*
+ * Remove the encryption properties from the nvlist since they are not
+ * maintained through the DSL.
+ */
+ (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION));
+ (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT));
+ (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT));
+ (void) nvlist_remove_all(props,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS));
+
+ *dcp_out = dcp;
+
+ return (0);
+
+error:
+ kmem_free(dcp, sizeof (dsl_crypto_params_t));
+ *dcp_out = NULL;
+ return (ret);
+}
+
+void
+dsl_crypto_params_free(dsl_crypto_params_t *dcp, boolean_t unload)
+{
+ if (dcp == NULL)
+ return;
+
+ if (dcp->cp_keylocation != NULL)
+ spa_strfree(dcp->cp_keylocation);
+ if (unload && dcp->cp_wkey != NULL)
+ dsl_wrapping_key_free(dcp->cp_wkey);
+
+ kmem_free(dcp, sizeof (dsl_crypto_params_t));
+}
+
+static int
+spa_crypto_key_compare(const void *a, const void *b)
+{
+ const dsl_crypto_key_t *dcka = a;
+ const dsl_crypto_key_t *dckb = b;
+
+ if (dcka->dck_obj < dckb->dck_obj)
+ return (-1);
+ if (dcka->dck_obj > dckb->dck_obj)
+ return (1);
+ return (0);
+}
+
+static int
+spa_key_mapping_compare(const void *a, const void *b)
+{
+ const dsl_key_mapping_t *kma = a;
+ const dsl_key_mapping_t *kmb = b;
+
+ if (kma->km_dsobj < kmb->km_dsobj)
+ return (-1);
+ if (kma->km_dsobj > kmb->km_dsobj)
+ return (1);
+ return (0);
+}
+
+static int
+spa_wkey_compare(const void *a, const void *b)
+{
+ const dsl_wrapping_key_t *wka = a;
+ const dsl_wrapping_key_t *wkb = b;
+
+ if (wka->wk_ddobj < wkb->wk_ddobj)
+ return (-1);
+ if (wka->wk_ddobj > wkb->wk_ddobj)
+ return (1);
+ return (0);
+}
+
+void
+spa_keystore_init(spa_keystore_t *sk)
+{
+ rw_init(&sk->sk_dk_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&sk->sk_km_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&sk->sk_wkeys_lock, NULL, RW_DEFAULT, NULL);
+ avl_create(&sk->sk_dsl_keys, spa_crypto_key_compare,
+ sizeof (dsl_crypto_key_t),
+ offsetof(dsl_crypto_key_t, dck_avl_link));
+ avl_create(&sk->sk_key_mappings, spa_key_mapping_compare,
+ sizeof (dsl_key_mapping_t),
+ offsetof(dsl_key_mapping_t, km_avl_link));
+ avl_create(&sk->sk_wkeys, spa_wkey_compare, sizeof (dsl_wrapping_key_t),
+ offsetof(dsl_wrapping_key_t, wk_avl_link));
+}
+
+void
+spa_keystore_fini(spa_keystore_t *sk)
+{
+ dsl_wrapping_key_t *wkey;
+ void *cookie = NULL;
+
+ ASSERT(avl_is_empty(&sk->sk_dsl_keys));
+ ASSERT(avl_is_empty(&sk->sk_key_mappings));
+
+ while ((wkey = avl_destroy_nodes(&sk->sk_wkeys, &cookie)) != NULL)
+ dsl_wrapping_key_free(wkey);
+
+ avl_destroy(&sk->sk_wkeys);
+ avl_destroy(&sk->sk_key_mappings);
+ avl_destroy(&sk->sk_dsl_keys);
+ rw_destroy(&sk->sk_wkeys_lock);
+ rw_destroy(&sk->sk_km_lock);
+ rw_destroy(&sk->sk_dk_lock);
+}
+
+static int
+dsl_dir_get_encryption_root_ddobj(dsl_dir_t *dd, uint64_t *rddobj)
+{
+ if (dd->dd_crypto_obj == 0)
+ return (SET_ERROR(ENOENT));
+
+ return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, rddobj));
+}
+
+static int
+dsl_dir_get_encryption_version(dsl_dir_t *dd, uint64_t *version)
+{
+ *version = 0;
+
+ if (dd->dd_crypto_obj == 0)
+ return (SET_ERROR(ENOENT));
+
+ /* version 0 is implied by ENOENT */
+ (void) zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_VERSION, 8, 1, version);
+
+ return (0);
+}
+
+boolean_t
+dsl_dir_incompatible_encryption_version(dsl_dir_t *dd)
+{
+ int ret;
+ uint64_t version = 0;
+
+ ret = dsl_dir_get_encryption_version(dd, &version);
+ if (ret != 0)
+ return (B_FALSE);
+
+ return (version != ZIO_CRYPT_KEY_CURRENT_VERSION);
+}
+
+static int
+spa_keystore_wkey_hold_ddobj_impl(spa_t *spa, uint64_t ddobj,
+ void *tag, dsl_wrapping_key_t **wkey_out)
+{
+ int ret;
+ dsl_wrapping_key_t search_wkey;
+ dsl_wrapping_key_t *found_wkey;
+
+ ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_wkeys_lock));
+
+ /* init the search wrapping key */
+ search_wkey.wk_ddobj = ddobj;
+
+ /* lookup the wrapping key */
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &search_wkey, NULL);
+ if (!found_wkey) {
+ ret = SET_ERROR(ENOENT);
+ goto error;
+ }
+
+ /* increment the refcount */
+ dsl_wrapping_key_hold(found_wkey, tag);
+
+ *wkey_out = found_wkey;
+ return (0);
+
+error:
+ *wkey_out = NULL;
+ return (ret);
+}
+
+static int
+spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag,
+ dsl_wrapping_key_t **wkey_out)
+{
+ int ret;
+ dsl_wrapping_key_t *wkey;
+ uint64_t rddobj;
+ boolean_t locked = B_FALSE;
+
+ if (!RW_WRITE_HELD(&spa->spa_keystore.sk_wkeys_lock)) {
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_READER);
+ locked = B_TRUE;
+ }
+
+ /* get the ddobj that the keylocation property was inherited from */
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj);
+ if (ret != 0)
+ goto error;
+
+ /* lookup the wkey in the avl tree */
+ ret = spa_keystore_wkey_hold_ddobj_impl(spa, rddobj, tag, &wkey);
+ if (ret != 0)
+ goto error;
+
+ /* unlock the wkey tree if we locked it */
+ if (locked)
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ *wkey_out = wkey;
+ return (0);
+
+error:
+ if (locked)
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ *wkey_out = NULL;
+ return (ret);
+}
+
+int
+dsl_crypto_can_set_keylocation(const char *dsname, const char *keylocation)
+{
+ int ret = 0;
+ dsl_dir_t *dd = NULL;
+ dsl_pool_t *dp = NULL;
+ uint64_t rddobj;
+
+ /* hold the dsl dir */
+ ret = dsl_pool_hold(dsname, FTAG, &dp);
+ if (ret != 0)
+ goto out;
+
+ ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL);
+ if (ret != 0) {
+ dd = NULL;
+ goto out;
+ }
+
+ /* if dd is not encrypted, the value may only be "none" */
+ if (dd->dd_crypto_obj == 0) {
+ if (strcmp(keylocation, "none") != 0) {
+ ret = SET_ERROR(EACCES);
+ goto out;
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ /* check for a valid keylocation for encrypted datasets */
+ if (!zfs_prop_valid_keylocation(keylocation, B_TRUE)) {
+ ret = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /* check that this is an encryption root */
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj);
+ if (ret != 0)
+ goto out;
+
+ if (rddobj != dd->dd_object) {
+ ret = SET_ERROR(EACCES);
+ goto out;
+ }
+
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ return (0);
+
+out:
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+ if (dp != NULL)
+ dsl_pool_rele(dp, FTAG);
+
+ return (ret);
+}
+
+static void
+dsl_crypto_key_free(dsl_crypto_key_t *dck)
+{
+ ASSERT(zfs_refcount_count(&dck->dck_holds) == 0);
+
+ /* destroy the zio_crypt_key_t */
+ zio_crypt_key_destroy(&dck->dck_key);
+
+ /* free the refcount, wrapping key, and lock */
+ zfs_refcount_destroy(&dck->dck_holds);
+ if (dck->dck_wkey)
+ dsl_wrapping_key_rele(dck->dck_wkey, dck);
+
+ /* free the key */
+ kmem_free(dck, sizeof (dsl_crypto_key_t));
+}
+
+static void
+dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag)
+{
+ if (zfs_refcount_remove(&dck->dck_holds, tag) == 0)
+ dsl_crypto_key_free(dck);
+}
+
+static int
+dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey,
+ uint64_t dckobj, void *tag, dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ uint64_t crypt = 0, guid = 0, version = 0;
+ uint8_t raw_keydata[MASTER_KEY_MAX_LEN];
+ uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN];
+ uint8_t iv[WRAPPING_IV_LEN];
+ uint8_t mac[WRAPPING_MAC_LEN];
+ dsl_crypto_key_t *dck;
+
+ /* allocate and initialize the key */
+ dck = kmem_zalloc(sizeof (dsl_crypto_key_t), KM_SLEEP);
+
+ /* fetch all of the values we need from the ZAP */
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1,
+ &crypt);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &guid);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1,
+ MASTER_KEY_MAX_LEN, raw_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1,
+ SHA512_HMAC_KEYLEN, raw_hmac_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN,
+ iv);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN,
+ mac);
+ if (ret != 0)
+ goto error;
+
+ /* the initial on-disk format for encryption did not have a version */
+ (void) zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_VERSION, 8, 1, &version);
+
+ /*
+ * Unwrap the keys. If there is an error return EACCES to indicate
+ * an authentication failure.
+ */
+ ret = zio_crypt_key_unwrap(&wkey->wk_key, crypt, version, guid,
+ raw_keydata, raw_hmac_keydata, iv, mac, &dck->dck_key);
+ if (ret != 0) {
+ ret = SET_ERROR(EACCES);
+ goto error;
+ }
+
+ /* finish initializing the dsl_crypto_key_t */
+ zfs_refcount_create(&dck->dck_holds);
+ dsl_wrapping_key_hold(wkey, dck);
+ dck->dck_wkey = wkey;
+ dck->dck_obj = dckobj;
+ zfs_refcount_add(&dck->dck_holds, tag);
+
+ *dck_out = dck;
+ return (0);
+
+error:
+ if (dck != NULL) {
+ bzero(dck, sizeof (dsl_crypto_key_t));
+ kmem_free(dck, sizeof (dsl_crypto_key_t));
+ }
+
+ *dck_out = NULL;
+ return (ret);
+}
+
+static int
+spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, void *tag,
+ dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ dsl_crypto_key_t search_dck;
+ dsl_crypto_key_t *found_dck;
+
+ ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_dk_lock));
+
+ /* init the search key */
+ search_dck.dck_obj = dckobj;
+
+ /* find the matching key in the keystore */
+ found_dck = avl_find(&spa->spa_keystore.sk_dsl_keys, &search_dck, NULL);
+ if (!found_dck) {
+ ret = SET_ERROR(ENOENT);
+ goto error;
+ }
+
+ /* increment the refcount */
+ zfs_refcount_add(&found_dck->dck_holds, tag);
+
+ *dck_out = found_dck;
+ return (0);
+
+error:
+ *dck_out = NULL;
+ return (ret);
+}
+
+static int
+spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag,
+ dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ avl_index_t where;
+ dsl_crypto_key_t *dck_io = NULL, *dck_ks = NULL;
+ dsl_wrapping_key_t *wkey = NULL;
+ uint64_t dckobj = dd->dd_crypto_obj;
+
+ /* Lookup the key in the tree of currently loaded keys */
+ rw_enter(&spa->spa_keystore.sk_dk_lock, RW_READER);
+ ret = spa_keystore_dsl_key_hold_impl(spa, dckobj, tag, &dck_ks);
+ rw_exit(&spa->spa_keystore.sk_dk_lock);
+ if (ret == 0) {
+ *dck_out = dck_ks;
+ return (0);
+ }
+
+ /* Lookup the wrapping key from the keystore */
+ ret = spa_keystore_wkey_hold_dd(spa, dd, FTAG, &wkey);
+ if (ret != 0) {
+ *dck_out = NULL;
+ return (SET_ERROR(EACCES));
+ }
+
+ /* Read the key from disk */
+ ret = dsl_crypto_key_open(spa->spa_meta_objset, wkey, dckobj,
+ tag, &dck_io);
+ if (ret != 0) {
+ dsl_wrapping_key_rele(wkey, FTAG);
+ *dck_out = NULL;
+ return (ret);
+ }
+
+ /*
+ * Add the key to the keystore. It may already exist if it was
+ * added while performing the read from disk. In this case discard
+ * it and return the key from the keystore.
+ */
+ rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER);
+ ret = spa_keystore_dsl_key_hold_impl(spa, dckobj, tag, &dck_ks);
+ if (ret != 0) {
+ avl_find(&spa->spa_keystore.sk_dsl_keys, dck_io, &where);
+ avl_insert(&spa->spa_keystore.sk_dsl_keys, dck_io, where);
+ *dck_out = dck_io;
+ } else {
+ dsl_crypto_key_free(dck_io);
+ *dck_out = dck_ks;
+ }
+
+ /* Release the wrapping key (the dsl key now has a reference to it) */
+ dsl_wrapping_key_rele(wkey, FTAG);
+ rw_exit(&spa->spa_keystore.sk_dk_lock);
+
+ return (0);
+}
+
+void
+spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, void *tag)
+{
+ rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER);
+
+ if (zfs_refcount_remove(&dck->dck_holds, tag) == 0) {
+ avl_remove(&spa->spa_keystore.sk_dsl_keys, dck);
+ dsl_crypto_key_free(dck);
+ }
+
+ rw_exit(&spa->spa_keystore.sk_dk_lock);
+}
+
+int
+spa_keystore_load_wkey_impl(spa_t *spa, dsl_wrapping_key_t *wkey)
+{
+ int ret;
+ avl_index_t where;
+ dsl_wrapping_key_t *found_wkey;
+
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+
+ /* insert the wrapping key into the keystore */
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where);
+ if (found_wkey != NULL) {
+ ret = SET_ERROR(EEXIST);
+ goto error_unlock;
+ }
+ avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where);
+
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+ return (ret);
+}
+
+int
+spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp,
+ boolean_t noop)
+{
+ int ret;
+ dsl_dir_t *dd = NULL;
+ dsl_crypto_key_t *dck = NULL;
+ dsl_wrapping_key_t *wkey = dcp->cp_wkey;
+ dsl_pool_t *dp = NULL;
+ uint64_t rddobj, keyformat, salt, iters;
+
+ /*
+ * We don't validate the wrapping key's keyformat, salt, or iters
+ * since they will never be needed after the DCK has been wrapped.
+ */
+ if (dcp->cp_wkey == NULL ||
+ dcp->cp_cmd != DCP_CMD_NONE ||
+ dcp->cp_crypt != ZIO_CRYPT_INHERIT ||
+ dcp->cp_keylocation != NULL)
+ return (SET_ERROR(EINVAL));
+
+ ret = dsl_pool_hold(dsname, FTAG, &dp);
+ if (ret != 0)
+ goto error;
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) {
+ ret = SET_ERROR(ENOTSUP);
+ goto error;
+ }
+
+ /* hold the dsl dir */
+ ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL);
+ if (ret != 0) {
+ dd = NULL;
+ goto error;
+ }
+
+ /* confirm that dd is the encryption root */
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj);
+ if (ret != 0 || rddobj != dd->dd_object) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* initialize the wkey's ddobj */
+ wkey->wk_ddobj = dd->dd_object;
+
+ /* verify that the wkey is correct by opening its dsl key */
+ ret = dsl_crypto_key_open(dp->dp_meta_objset, wkey,
+ dd->dd_crypto_obj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ /* initialize the wkey encryption parameters from the DSL Crypto Key */
+ ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &keyformat);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters);
+ if (ret != 0)
+ goto error;
+
+ ASSERT3U(keyformat, <, ZFS_KEYFORMAT_FORMATS);
+ ASSERT3U(keyformat, !=, ZFS_KEYFORMAT_NONE);
+ IMPLY(keyformat == ZFS_KEYFORMAT_PASSPHRASE, iters != 0);
+ IMPLY(keyformat == ZFS_KEYFORMAT_PASSPHRASE, salt != 0);
+ IMPLY(keyformat != ZFS_KEYFORMAT_PASSPHRASE, iters == 0);
+ IMPLY(keyformat != ZFS_KEYFORMAT_PASSPHRASE, salt == 0);
+
+ wkey->wk_keyformat = keyformat;
+ wkey->wk_salt = salt;
+ wkey->wk_iters = iters;
+
+ /*
+ * At this point we have verified the wkey and confirmed that it can
+ * be used to decrypt a DSL Crypto Key. We can simply cleanup and
+ * return if this is all the user wanted to do.
+ */
+ if (noop)
+ goto error;
+
+ /* insert the wrapping key into the keystore */
+ ret = spa_keystore_load_wkey_impl(dp->dp_spa, wkey);
+ if (ret != 0)
+ goto error;
+
+ dsl_crypto_key_rele(dck, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ /* create any zvols under this ds */
+ zvol_create_minors_recursive(dsname);
+
+ return (0);
+
+error:
+ if (dck != NULL)
+ dsl_crypto_key_rele(dck, FTAG);
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+ if (dp != NULL)
+ dsl_pool_rele(dp, FTAG);
+
+ return (ret);
+}
+
+int
+spa_keystore_unload_wkey_impl(spa_t *spa, uint64_t ddobj)
+{
+ int ret;
+ dsl_wrapping_key_t search_wkey;
+ dsl_wrapping_key_t *found_wkey;
+
+ /* init the search wrapping key */
+ search_wkey.wk_ddobj = ddobj;
+
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+
+ /* remove the wrapping key from the keystore */
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys,
+ &search_wkey, NULL);
+ if (!found_wkey) {
+ ret = SET_ERROR(EACCES);
+ goto error_unlock;
+ } else if (zfs_refcount_count(&found_wkey->wk_refcnt) != 0) {
+ ret = SET_ERROR(EBUSY);
+ goto error_unlock;
+ }
+ avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey);
+
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ /* free the wrapping key */
+ dsl_wrapping_key_free(found_wkey);
+
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+ return (ret);
+}
+
+int
+spa_keystore_unload_wkey(const char *dsname)
+{
+ int ret = 0;
+ dsl_dir_t *dd = NULL;
+ dsl_pool_t *dp = NULL;
+ spa_t *spa = NULL;
+
+ ret = spa_open(dsname, &spa, FTAG);
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * Wait for any outstanding txg IO to complete, releasing any
+ * remaining references on the wkey.
+ */
+ if (spa_mode(spa) != SPA_MODE_READ)
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ spa_close(spa, FTAG);
+
+ /* hold the dsl dir */
+ ret = dsl_pool_hold(dsname, FTAG, &dp);
+ if (ret != 0)
+ goto error;
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) {
+ ret = (SET_ERROR(ENOTSUP));
+ goto error;
+ }
+
+ ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL);
+ if (ret != 0) {
+ dd = NULL;
+ goto error;
+ }
+
+ /* unload the wkey */
+ ret = spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object);
+ if (ret != 0)
+ goto error;
+
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ /* remove any zvols under this ds */
+ zvol_remove_minors(dp->dp_spa, dsname, B_TRUE);
+
+ return (0);
+
+error:
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+ if (dp != NULL)
+ dsl_pool_rele(dp, FTAG);
+
+ return (ret);
+}
+
+void
+key_mapping_add_ref(dsl_key_mapping_t *km, void *tag)
+{
+ ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1);
+ zfs_refcount_add(&km->km_refcnt, tag);
+}
+
+/*
+ * The locking here is a little tricky to ensure we don't cause unnecessary
+ * performance problems. We want to release a key mapping whenever someone
+ * decrements the refcount to 0, but freeing the mapping requires removing
+ * it from the spa_keystore, which requires holding sk_km_lock as a writer.
+ * Most of the time we don't want to hold this lock as a writer, since the
+ * same lock is held as a reader for each IO that needs to encrypt / decrypt
+ * data for any dataset and in practice we will only actually free the
+ * mapping after unmounting a dataset.
+ */
+void
+key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag)
+{
+ ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1);
+
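+	/* if other holders remain, the mapping cannot be freed yet */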
+ if (zfs_refcount_remove(&km->km_refcnt, tag) != 0)
+ return;
+
+ /*
+ * We think we are going to need to free the mapping. Add a
+ * reference to prevent most other releasers from thinking
+ * this might be their responsibility. This is inherently
+ * racy, so we will confirm that we are legitimately the
+ * last holder once we have the sk_km_lock as a writer.
+ */
+ zfs_refcount_add(&km->km_refcnt, FTAG);
+
+ rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER);
+ if (zfs_refcount_remove(&km->km_refcnt, FTAG) != 0) {
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+ return;
+ }
+
+ avl_remove(&spa->spa_keystore.sk_key_mappings, km);
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ spa_keystore_dsl_key_rele(spa, km->km_key, km);
+ zfs_refcount_destroy(&km->km_refcnt);
+ kmem_free(km, sizeof (dsl_key_mapping_t));
+}
+
+int
+spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag,
+ dsl_key_mapping_t **km_out)
+{
+ int ret;
+ avl_index_t where;
+ dsl_key_mapping_t *km, *found_km;
+ boolean_t should_free = B_FALSE;
+
+ /* Allocate and initialize the mapping */
+ km = kmem_zalloc(sizeof (dsl_key_mapping_t), KM_SLEEP);
+ zfs_refcount_create(&km->km_refcnt);
+
+ ret = spa_keystore_dsl_key_hold_dd(spa, ds->ds_dir, km, &km->km_key);
+ if (ret != 0) {
+ zfs_refcount_destroy(&km->km_refcnt);
+ kmem_free(km, sizeof (dsl_key_mapping_t));
+
+ if (km_out != NULL)
+ *km_out = NULL;
+ return (ret);
+ }
+
+ km->km_dsobj = ds->ds_object;
+
+ rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER);
+
+ /*
+ * If a mapping already exists, simply increment its refcount and
+	 * clean up the one we made. We want to allocate / free outside of
+ * the lock because this lock is also used by the zio layer to lookup
+ * key mappings. Otherwise, use the one we created. Normally, there will
+ * only be one active reference at a time (the objset owner), but there
+ * are times when there could be multiple async users.
+ */
+ found_km = avl_find(&spa->spa_keystore.sk_key_mappings, km, &where);
+ if (found_km != NULL) {
+ should_free = B_TRUE;
+ zfs_refcount_add(&found_km->km_refcnt, tag);
+ if (km_out != NULL)
+ *km_out = found_km;
+ } else {
+ zfs_refcount_add(&km->km_refcnt, tag);
+ avl_insert(&spa->spa_keystore.sk_key_mappings, km, where);
+ if (km_out != NULL)
+ *km_out = km;
+ }
+
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ if (should_free) {
+ spa_keystore_dsl_key_rele(spa, km->km_key, km);
+ zfs_refcount_destroy(&km->km_refcnt);
+ kmem_free(km, sizeof (dsl_key_mapping_t));
+ }
+
+ return (0);
+}
+
+int
+spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, void *tag)
+{
+ int ret;
+ dsl_key_mapping_t search_km;
+ dsl_key_mapping_t *found_km;
+
+ /* init the search key mapping */
+ search_km.km_dsobj = dsobj;
+
+ rw_enter(&spa->spa_keystore.sk_km_lock, RW_READER);
+
+ /* find the matching mapping */
+ found_km = avl_find(&spa->spa_keystore.sk_key_mappings,
+ &search_km, NULL);
+ if (found_km == NULL) {
+ ret = SET_ERROR(ENOENT);
+ goto error_unlock;
+ }
+
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ key_mapping_rele(spa, found_km, tag);
+
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+ return (ret);
+}
+
+/*
+ * This function is primarily used by the zio and arc layer to lookup
+ * DSL Crypto Keys for encryption. Callers must release the key with
+ * spa_keystore_dsl_key_rele(). The function may also be called with
+ * dck_out == NULL and tag == NULL to simply check that a key exists
+ * without getting a reference to it.
+ */
+int
+spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag,
+ dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ dsl_key_mapping_t search_km;
+ dsl_key_mapping_t *found_km;
+
+ ASSERT((tag != NULL && dck_out != NULL) ||
+ (tag == NULL && dck_out == NULL));
+
+ /* init the search key mapping */
+ search_km.km_dsobj = dsobj;
+
+ rw_enter(&spa->spa_keystore.sk_km_lock, RW_READER);
+
+	/* find the mapping in the tree */
+ found_km = avl_find(&spa->spa_keystore.sk_key_mappings, &search_km,
+ NULL);
+ if (found_km == NULL) {
+ ret = SET_ERROR(ENOENT);
+ goto error_unlock;
+ }
+
+ if (found_km && tag)
+ zfs_refcount_add(&found_km->km_key->dck_holds, tag);
+
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ if (dck_out != NULL)
+ *dck_out = found_km->km_key;
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ if (dck_out != NULL)
+ *dck_out = NULL;
+ return (ret);
+}
+
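+/*
+ * Check that the wrapping key for this dsl dir is loaded by attempting to
+ * hold it from the keystore; any failure is reported as EACCES.
+ */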
+static int
+dmu_objset_check_wkey_loaded(dsl_dir_t *dd)
+{
+ int ret;
+ dsl_wrapping_key_t *wkey = NULL;
+
+ ret = spa_keystore_wkey_hold_dd(dd->dd_pool->dp_spa, dd, FTAG,
+ &wkey);
+ if (ret != 0)
+ return (SET_ERROR(EACCES));
+
+ dsl_wrapping_key_rele(wkey, FTAG);
+
+ return (0);
+}
+
+static zfs_keystatus_t
+dsl_dataset_get_keystatus(dsl_dir_t *dd)
+{
+	/* check if this dd has a dsl key */
+ if (dd->dd_crypto_obj == 0)
+ return (ZFS_KEYSTATUS_NONE);
+
+ return (dmu_objset_check_wkey_loaded(dd) == 0 ?
+ ZFS_KEYSTATUS_AVAILABLE : ZFS_KEYSTATUS_UNAVAILABLE);
+}
+
+static int
+dsl_dir_get_crypt(dsl_dir_t *dd, uint64_t *crypt)
+{
+ if (dd->dd_crypto_obj == 0) {
+ *crypt = ZIO_CRYPT_OFF;
+ return (0);
+ }
+
+ return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, crypt));
+}
+
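+/*
+ * Write every field of a DSL Crypto Key out to its ZAP object. The key
+ * material passed in here is already in wrapped (encrypted) form.
+ */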
+static void
+dsl_crypto_key_sync_impl(objset_t *mos, uint64_t dckobj, uint64_t crypt,
+ uint64_t root_ddobj, uint64_t guid, uint8_t *iv, uint8_t *mac,
+ uint8_t *keydata, uint8_t *hmac_keydata, uint64_t keyformat,
+ uint64_t salt, uint64_t iters, dmu_tx_t *tx)
+{
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1,
+ &crypt, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1,
+ &root_ddobj, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1,
+ &guid, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN,
+ iv, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN,
+ mac, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1,
+ MASTER_KEY_MAX_LEN, keydata, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1,
+ SHA512_HMAC_KEYLEN, hmac_keydata, tx));
+ VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT),
+ 8, 1, &keyformat, tx));
+ VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
+ 8, 1, &salt, tx));
+ VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
+ 8, 1, &iters, tx));
+}
+
+static void
+dsl_crypto_key_sync(dsl_crypto_key_t *dck, dmu_tx_t *tx)
+{
+ zio_crypt_key_t *key = &dck->dck_key;
+ dsl_wrapping_key_t *wkey = dck->dck_wkey;
+ uint8_t keydata[MASTER_KEY_MAX_LEN];
+ uint8_t hmac_keydata[SHA512_HMAC_KEYLEN];
+ uint8_t iv[WRAPPING_IV_LEN];
+ uint8_t mac[WRAPPING_MAC_LEN];
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3U(key->zk_crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+ /* encrypt and store the keys along with the IV and MAC */
+ VERIFY0(zio_crypt_key_wrap(&dck->dck_wkey->wk_key, key, iv, mac,
+ keydata, hmac_keydata));
+
+ /* update the ZAP with the obtained values */
+ dsl_crypto_key_sync_impl(tx->tx_pool->dp_meta_objset, dck->dck_obj,
+ key->zk_crypt, wkey->wk_ddobj, key->zk_guid, iv, mac, keydata,
+ hmac_keydata, wkey->wk_keyformat, wkey->wk_salt, wkey->wk_iters,
+ tx);
+}
+
+typedef struct spa_keystore_change_key_args {
+ const char *skcka_dsname;
+ dsl_crypto_params_t *skcka_cp;
+} spa_keystore_change_key_args_t;
+
+static int
+spa_keystore_change_key_check(void *arg, dmu_tx_t *tx)
+{
+ int ret;
+ dsl_dir_t *dd = NULL;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_keystore_change_key_args_t *skcka = arg;
+ dsl_crypto_params_t *dcp = skcka->skcka_cp;
+ uint64_t rddobj;
+
+ /* check for the encryption feature */
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) {
+ ret = SET_ERROR(ENOTSUP);
+ goto error;
+ }
+
+ /* check for valid key change command */
+ if (dcp->cp_cmd != DCP_CMD_NEW_KEY &&
+ dcp->cp_cmd != DCP_CMD_INHERIT &&
+ dcp->cp_cmd != DCP_CMD_FORCE_NEW_KEY &&
+ dcp->cp_cmd != DCP_CMD_FORCE_INHERIT) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* hold the dd */
+ ret = dsl_dir_hold(dp, skcka->skcka_dsname, FTAG, &dd, NULL);
+ if (ret != 0) {
+ dd = NULL;
+ goto error;
+ }
+
+ /* verify that the dataset is encrypted */
+ if (dd->dd_crypto_obj == 0) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* clones must always use their origin's key */
+ if (dsl_dir_is_clone(dd)) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* lookup the ddobj we are inheriting the keylocation from */
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj);
+ if (ret != 0)
+ goto error;
+
+ /* Handle inheritance */
+ if (dcp->cp_cmd == DCP_CMD_INHERIT ||
+ dcp->cp_cmd == DCP_CMD_FORCE_INHERIT) {
+ /* no other encryption params should be given */
+ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT ||
+ dcp->cp_keylocation != NULL ||
+ dcp->cp_wkey != NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that this is an encryption root */
+ if (dd->dd_object != rddobj) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that the parent is encrypted */
+ if (dd->dd_parent->dd_crypto_obj == 0) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* if we are rewrapping check that both keys are loaded */
+ if (dcp->cp_cmd == DCP_CMD_INHERIT) {
+ ret = dmu_objset_check_wkey_loaded(dd);
+ if (ret != 0)
+ goto error;
+
+ ret = dmu_objset_check_wkey_loaded(dd->dd_parent);
+ if (ret != 0)
+ goto error;
+ }
+
+ dsl_dir_rele(dd, FTAG);
+ return (0);
+ }
+
+ /* handle forcing an encryption root without rewrapping */
+ if (dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) {
+ /* no other encryption params should be given */
+ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT ||
+ dcp->cp_keylocation != NULL ||
+ dcp->cp_wkey != NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that this is not an encryption root */
+ if (dd->dd_object == rddobj) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ dsl_dir_rele(dd, FTAG);
+ return (0);
+ }
+
+ /* crypt cannot be changed after creation */
+ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+	/* we are not inheriting our parent's wkey so we need one ourselves */
+ if (dcp->cp_wkey == NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check for a valid keyformat for the new wrapping key */
+ if (dcp->cp_wkey->wk_keyformat >= ZFS_KEYFORMAT_FORMATS ||
+ dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_NONE) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /*
+ * If this dataset is not currently an encryption root we need a new
+ * keylocation for this dataset's new wrapping key. Otherwise we can
+ * just keep the one we already had.
+ */
+ if (dd->dd_object != rddobj && dcp->cp_keylocation == NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that the keylocation is valid if it is not NULL */
+ if (dcp->cp_keylocation != NULL &&
+ !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE)) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* passphrases require pbkdf2 salt and iters */
+ if (dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_PASSPHRASE) {
+ if (dcp->cp_wkey->wk_salt == 0 ||
+ dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+ } else {
+ if (dcp->cp_wkey->wk_salt != 0 || dcp->cp_wkey->wk_iters != 0) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+ }
+
+ /* make sure the dd's wkey is loaded */
+ ret = dmu_objset_check_wkey_loaded(dd);
+ if (ret != 0)
+ goto error;
+
+ dsl_dir_rele(dd, FTAG);
+
+ return (0);
+
+error:
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+
+ return (ret);
+}
+
+/*
+ * This function deals with the intricacies of updating wrapping
+ * key references and encryption roots recursively in the event
+ * of a call to 'zfs change-key' or 'zfs promote'. The 'skip'
+ * parameter should always be set to B_FALSE when called
+ * externally.
+ */
+static void
+spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj,
+ uint64_t new_rddobj, dsl_wrapping_key_t *wkey, boolean_t skip,
+ dmu_tx_t *tx)
+{
+ int ret;
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd = NULL;
+ dsl_crypto_key_t *dck = NULL;
+ uint64_t curr_rddobj;
+
+ ASSERT(RW_WRITE_HELD(&dp->dp_spa->spa_keystore.sk_wkeys_lock));
+
+ /* hold the dd */
+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
+
+ /* ignore special dsl dirs */
+ if (dd->dd_myname[0] == '$' || dd->dd_myname[0] == '%') {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj);
+ VERIFY(ret == 0 || ret == ENOENT);
+
+ /*
+	 * Stop recursing if this dsl dir has no encryption root (ENOENT),
+	 * didn't inherit from the root being changed, or is a clone.
+ */
+ if (ret == ENOENT ||
+ (!skip && (curr_rddobj != rddobj || dsl_dir_is_clone(dd)))) {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+
+ /*
+ * If we don't have a wrapping key just update the dck to reflect the
+ * new encryption root. Otherwise rewrap the entire dck and re-sync it
+ * to disk. If skip is set, we don't do any of this work.
+ */
+ if (!skip) {
+ if (wkey == NULL) {
+ VERIFY0(zap_update(dp->dp_meta_objset,
+ dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1,
+ &new_rddobj, tx));
+ } else {
+ VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd,
+ FTAG, &dck));
+ dsl_wrapping_key_hold(wkey, dck);
+ dsl_wrapping_key_rele(dck->dck_wkey, dck);
+ dck->dck_wkey = wkey;
+ dsl_crypto_key_sync(dck, tx);
+ spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG);
+ }
+ }
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /* Recurse into all child dsl dirs. */
+ for (zap_cursor_init(zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ spa_keystore_change_key_sync_impl(rddobj,
+ za->za_first_integer, new_rddobj, wkey, B_FALSE, tx);
+ }
+ zap_cursor_fini(zc);
+
+ /*
+ * Recurse into all dsl dirs of clones. We utilize the skip parameter
+ * here so that we don't attempt to process the clones directly. This
+ * is because the clone and its origin share the same dck, which has
+ * already been updated.
+ */
+ for (zap_cursor_init(zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_clones);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ dsl_dataset_t *clone;
+
+ VERIFY0(dsl_dataset_hold_obj(dp, za->za_first_integer,
+ FTAG, &clone));
+ spa_keystore_change_key_sync_impl(rddobj,
+ clone->ds_dir->dd_object, new_rddobj, wkey, B_TRUE, tx);
+ dsl_dataset_rele(clone, FTAG);
+ }
+ zap_cursor_fini(zc);
+
+ kmem_free(za, sizeof (zap_attribute_t));
+ kmem_free(zc, sizeof (zap_cursor_t));
+
+ dsl_dir_rele(dd, FTAG);
+}
+
+static void
+spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds;
+ avl_index_t where;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ spa_keystore_change_key_args_t *skcka = arg;
+ dsl_crypto_params_t *dcp = skcka->skcka_cp;
+ dsl_wrapping_key_t *wkey = NULL, *found_wkey;
+ dsl_wrapping_key_t wkey_search;
+ char *keylocation = dcp->cp_keylocation;
+ uint64_t rddobj, new_rddobj;
+
+ /* create and initialize the wrapping key */
+ VERIFY0(dsl_dataset_hold(dp, skcka->skcka_dsname, FTAG, &ds));
+ ASSERT(!ds->ds_is_snapshot);
+
+ if (dcp->cp_cmd == DCP_CMD_NEW_KEY ||
+ dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) {
+ /*
+ * We are changing to a new wkey. Set additional properties
+ * which can be sent along with this ioctl. Note that this
+ * command can set keylocation even if it can't normally be
+ * set via 'zfs set' due to a non-local keylocation.
+ */
+ if (dcp->cp_cmd == DCP_CMD_NEW_KEY) {
+ wkey = dcp->cp_wkey;
+ wkey->wk_ddobj = ds->ds_dir->dd_object;
+ } else {
+ keylocation = "prompt";
+ }
+
+ if (keylocation != NULL) {
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1,
+ keylocation, tx);
+ }
+
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj));
+ new_rddobj = ds->ds_dir->dd_object;
+ } else {
+ /*
+		 * We are inheriting the parent's wkey. Unset any local
+ * keylocation and grab a reference to the wkey.
+ */
+ if (dcp->cp_cmd == DCP_CMD_INHERIT) {
+ VERIFY0(spa_keystore_wkey_hold_dd(spa,
+ ds->ds_dir->dd_parent, FTAG, &wkey));
+ }
+
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION), ZPROP_SRC_NONE,
+ 0, 0, NULL, tx);
+
+ rddobj = ds->ds_dir->dd_object;
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir->dd_parent,
+ &new_rddobj));
+ }
+
+ if (wkey == NULL) {
+ ASSERT(dcp->cp_cmd == DCP_CMD_FORCE_INHERIT ||
+ dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY);
+ }
+
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+
+ /* recurse through all children and rewrap their keys */
+ spa_keystore_change_key_sync_impl(rddobj, ds->ds_dir->dd_object,
+ new_rddobj, wkey, B_FALSE, tx);
+
+ /*
+ * All references to the old wkey should be released now (if it
+ * existed). Replace the wrapping key.
+ */
+ wkey_search.wk_ddobj = ds->ds_dir->dd_object;
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &wkey_search, NULL);
+ if (found_wkey != NULL) {
+ ASSERT0(zfs_refcount_count(&found_wkey->wk_refcnt));
+ avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey);
+ dsl_wrapping_key_free(found_wkey);
+ }
+
+ if (dcp->cp_cmd == DCP_CMD_NEW_KEY) {
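+		/*
+		 * The new wkey cannot already be in the tree (any old entry
+		 * with this ddobj was removed above), so avl_find() is only
+		 * used to compute the insertion point for avl_insert().
+		 */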
+ avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where);
+ avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where);
+ } else if (wkey != NULL) {
+ dsl_wrapping_key_rele(wkey, FTAG);
+ }
+
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+spa_keystore_change_key(const char *dsname, dsl_crypto_params_t *dcp)
+{
+ spa_keystore_change_key_args_t skcka;
+
+ /* initialize the args struct */
+ skcka.skcka_dsname = dsname;
+ skcka.skcka_cp = dcp;
+
+ /*
+ * Perform the actual work in syncing context. The blocks modified
+ * here could be calculated but it would require holding the pool
+ * lock and traversing all of the datasets that will have their keys
+ * changed.
+ */
+ return (dsl_sync_task(dsname, spa_keystore_change_key_check,
+ spa_keystore_change_key_sync, &skcka, 15,
+ ZFS_SPACE_CHECK_RESERVED));
+}
+
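+/*
+ * Verify that moving dd under newparent would keep it within its current
+ * encryption root.
+ */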
+int
+dsl_dir_rename_crypt_check(dsl_dir_t *dd, dsl_dir_t *newparent)
+{
+ int ret;
+ uint64_t curr_rddobj, parent_rddobj;
+
+ if (dd->dd_crypto_obj == 0)
+ return (0);
+
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj);
+ if (ret != 0)
+ goto error;
+
+ /*
+ * if this is not an encryption root, we must make sure we are not
+ * moving dd to a new encryption root
+ */
+ if (dd->dd_object != curr_rddobj) {
+ ret = dsl_dir_get_encryption_root_ddobj(newparent,
+ &parent_rddobj);
+ if (ret != 0)
+ goto error;
+
+ if (parent_rddobj != curr_rddobj) {
+ ret = SET_ERROR(EACCES);
+ goto error;
+ }
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * Check to make sure that a promote from targetdd to origindd will not require
+ * any key rewraps.
+ */
+int
+dsl_dataset_promote_crypt_check(dsl_dir_t *target, dsl_dir_t *origin)
+{
+ int ret;
+ uint64_t rddobj, op_rddobj, tp_rddobj;
+
+ /* If the dataset is not encrypted we don't need to check anything */
+ if (origin->dd_crypto_obj == 0)
+ return (0);
+
+ /*
+ * If we are not changing the first origin snapshot in a chain
+ * the encryption root won't change either.
+ */
+ if (dsl_dir_is_clone(origin))
+ return (0);
+
+ /*
+ * If the origin is the encryption root we will update
+ * the DSL Crypto Key to point to the target instead.
+ */
+ ret = dsl_dir_get_encryption_root_ddobj(origin, &rddobj);
+ if (ret != 0)
+ return (ret);
+
+ if (rddobj == origin->dd_object)
+ return (0);
+
+ /*
+ * The origin is inheriting its encryption root from its parent.
+ * Check that the parent of the target has the same encryption root.
+ */
+ ret = dsl_dir_get_encryption_root_ddobj(origin->dd_parent, &op_rddobj);
+ if (ret == ENOENT)
+ return (SET_ERROR(EACCES));
+ else if (ret != 0)
+ return (ret);
+
+ ret = dsl_dir_get_encryption_root_ddobj(target->dd_parent, &tp_rddobj);
+ if (ret == ENOENT)
+ return (SET_ERROR(EACCES));
+ else if (ret != 0)
+ return (ret);
+
+ if (op_rddobj != tp_rddobj)
+ return (SET_ERROR(EACCES));
+
+ return (0);
+}
+
+void
+dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin,
+ dmu_tx_t *tx)
+{
+ uint64_t rddobj;
+ dsl_pool_t *dp = target->dd_pool;
+ dsl_dataset_t *targetds;
+ dsl_dataset_t *originds;
+ char *keylocation;
+
+ if (origin->dd_crypto_obj == 0)
+ return;
+ if (dsl_dir_is_clone(origin))
+ return;
+
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(origin, &rddobj));
+
+ if (rddobj != origin->dd_object)
+ return;
+
+ /*
+ * If the target is being promoted to the encryption root update the
+ * DSL Crypto Key and keylocation to reflect that. We also need to
+	 * update the DSL Crypto Keys of all children inheriting their
+ * encryption root to point to the new target. Otherwise, the check
+ * function ensured that the encryption root will not change.
+ */
+ keylocation = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(target)->dd_head_dataset_obj, FTAG, &targetds));
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(origin)->dd_head_dataset_obj, FTAG, &originds));
+
+ VERIFY0(dsl_prop_get_dd(origin, zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ 1, ZAP_MAXVALUELEN, keylocation, NULL, B_FALSE));
+ dsl_prop_set_sync_impl(targetds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1, keylocation, tx);
+ dsl_prop_set_sync_impl(originds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_NONE, 0, 0, NULL, tx);
+
+ rw_enter(&dp->dp_spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+ spa_keystore_change_key_sync_impl(rddobj, origin->dd_object,
+ target->dd_object, NULL, B_FALSE, tx);
+ rw_exit(&dp->dp_spa->spa_keystore.sk_wkeys_lock);
+
+ dsl_dataset_rele(targetds, FTAG);
+ dsl_dataset_rele(originds, FTAG);
+ kmem_free(keylocation, ZAP_MAXVALUELEN);
+}
+
+int
+dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp,
+ boolean_t *will_encrypt)
+{
+ int ret;
+ uint64_t pcrypt, crypt;
+ dsl_crypto_params_t dummy_dcp = { 0 };
+
+ if (will_encrypt != NULL)
+ *will_encrypt = B_FALSE;
+
+ if (dcp == NULL)
+ dcp = &dummy_dcp;
+
+ if (dcp->cp_cmd != DCP_CMD_NONE)
+ return (SET_ERROR(EINVAL));
+
+ if (parentdd != NULL) {
+ ret = dsl_dir_get_crypt(parentdd, &pcrypt);
+ if (ret != 0)
+ return (ret);
+ } else {
+ pcrypt = ZIO_CRYPT_OFF;
+ }
+
+ crypt = (dcp->cp_crypt == ZIO_CRYPT_INHERIT) ? pcrypt : dcp->cp_crypt;
+
+ ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT);
+ ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT);
+
+ /* check for valid dcp with no encryption (inherited or local) */
+ if (crypt == ZIO_CRYPT_OFF) {
+ /* Must not specify encryption params */
+ if (dcp->cp_wkey != NULL ||
+ (dcp->cp_keylocation != NULL &&
+ strcmp(dcp->cp_keylocation, "none") != 0))
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+ }
+
+ if (will_encrypt != NULL)
+ *will_encrypt = B_TRUE;
+
+ /*
+ * We will now definitely be encrypting. Check the feature flag. When
+ * creating the pool the caller will check this for us since we won't
+ * technically have the feature activated yet.
+ */
+ if (parentdd != NULL &&
+ !spa_feature_is_enabled(parentdd->dd_pool->dp_spa,
+ SPA_FEATURE_ENCRYPTION)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ /* Check for errata #4 (encryption enabled, bookmark_v2 disabled) */
+ if (parentdd != NULL &&
+ !spa_feature_is_enabled(parentdd->dd_pool->dp_spa,
+ SPA_FEATURE_BOOKMARK_V2)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ /* handle inheritance */
+ if (dcp->cp_wkey == NULL) {
+ ASSERT3P(parentdd, !=, NULL);
+
+ /* key must be fully unspecified */
+ if (dcp->cp_keylocation != NULL)
+ return (SET_ERROR(EINVAL));
+
+ /* parent must have a key to inherit */
+ if (pcrypt == ZIO_CRYPT_OFF)
+ return (SET_ERROR(EINVAL));
+
+ /* check for parent key */
+ ret = dmu_objset_check_wkey_loaded(parentdd);
+ if (ret != 0)
+ return (ret);
+
+ return (0);
+ }
+
+ /* At this point we should have a fully specified key. Check location */
+ if (dcp->cp_keylocation == NULL ||
+ !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE))
+ return (SET_ERROR(EINVAL));
+
+ /* Must have fully specified keyformat */
+ switch (dcp->cp_wkey->wk_keyformat) {
+ case ZFS_KEYFORMAT_HEX:
+ case ZFS_KEYFORMAT_RAW:
+ /* requires no pbkdf2 iters and salt */
+ if (dcp->cp_wkey->wk_salt != 0 || dcp->cp_wkey->wk_iters != 0)
+ return (SET_ERROR(EINVAL));
+ break;
+ case ZFS_KEYFORMAT_PASSPHRASE:
+ /* requires pbkdf2 iters and salt */
+ if (dcp->cp_wkey->wk_salt == 0 ||
+ dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS)
+ return (SET_ERROR(EINVAL));
+ break;
+ case ZFS_KEYFORMAT_NONE:
+ default:
+ /* keyformat must be specified and valid */
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (0);
+}
+
+void
+dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd,
+ dsl_dataset_t *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+ uint64_t crypt;
+ dsl_wrapping_key_t *wkey;
+
+ /* clones always use their origin's wrapping key */
+ if (dsl_dir_is_clone(dd)) {
+ ASSERT3P(dcp, ==, NULL);
+
+ /*
+ * If this is an encrypted clone we just need to clone the
+ * dck into dd. Zapify the dd so we can do that.
+ */
+ if (origin->ds_dir->dd_crypto_obj != 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_zapify(dd, tx);
+
+ dd->dd_crypto_obj =
+ dsl_crypto_key_clone_sync(origin->ds_dir, tx);
+ VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1,
+ &dd->dd_crypto_obj, tx));
+ }
+
+ return;
+ }
+
+ /*
+ * A NULL dcp at this point indicates this is the origin dataset
+ * which does not have an objset to encrypt. Raw receives will handle
+ * encryption separately later. In both cases we can simply return.
+ */
+ if (dcp == NULL || dcp->cp_cmd == DCP_CMD_RAW_RECV)
+ return;
+
+ crypt = dcp->cp_crypt;
+ wkey = dcp->cp_wkey;
+
+ /* figure out the effective crypt */
+ if (crypt == ZIO_CRYPT_INHERIT && dd->dd_parent != NULL)
+ VERIFY0(dsl_dir_get_crypt(dd->dd_parent, &crypt));
+
+ /* if we aren't doing encryption just return */
+ if (crypt == ZIO_CRYPT_OFF || crypt == ZIO_CRYPT_INHERIT)
+ return;
+
+ /* zapify the dd so that we can add the crypto key obj to it */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_zapify(dd, tx);
+
+ /* use the new key if given or inherit from the parent */
+ if (wkey == NULL) {
+ VERIFY0(spa_keystore_wkey_hold_dd(dp->dp_spa,
+ dd->dd_parent, FTAG, &wkey));
+ } else {
+ wkey->wk_ddobj = dd->dd_object;
+ }
+
+ ASSERT3P(wkey, !=, NULL);
+
+ /* Create or clone the DSL crypto key and activate the feature */
+ dd->dd_crypto_obj = dsl_crypto_key_create_sync(crypt, wkey, tx);
+ VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj,
+ tx));
+ dsl_dataset_activate_feature(dsobj, SPA_FEATURE_ENCRYPTION,
+ (void *)B_TRUE, tx);
+
+ /*
+ * If we inherited the wrapping key we release our reference now.
+ * Otherwise, this is a new key and we need to load it into the
+ * keystore.
+ */
+ if (dcp->cp_wkey == NULL) {
+ dsl_wrapping_key_rele(wkey, FTAG);
+ } else {
+ VERIFY0(spa_keystore_load_wkey_impl(dp->dp_spa, wkey));
+ }
+}
+
+typedef struct dsl_crypto_recv_key_arg {
+ uint64_t dcrka_dsobj;
+ uint64_t dcrka_fromobj;
+ dmu_objset_type_t dcrka_ostype;
+ nvlist_t *dcrka_nvl;
+ boolean_t dcrka_do_key;
+} dsl_crypto_recv_key_arg_t;
+
+static int
+dsl_crypto_recv_raw_objset_check(dsl_dataset_t *ds, dsl_dataset_t *fromds,
+ dmu_objset_type_t ostype, nvlist_t *nvl, dmu_tx_t *tx)
+{
+ int ret;
+ objset_t *os;
+ dnode_t *mdn;
+ uint8_t *buf = NULL;
+ uint_t len;
+ uint64_t intval, nlevels, blksz, ibs;
+ uint64_t nblkptr, maxblkid;
+
+ if (ostype != DMU_OST_ZFS && ostype != DMU_OST_ZVOL)
+ return (SET_ERROR(EINVAL));
+
+ /* raw receives also need info about the structure of the metadnode */
+ ret = nvlist_lookup_uint64(nvl, "mdn_compress", &intval);
+ if (ret != 0 || intval >= ZIO_COMPRESS_LEGACY_FUNCTIONS)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_checksum", &intval);
+ if (ret != 0 || intval >= ZIO_CHECKSUM_LEGACY_FUNCTIONS)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_nlevels", &nlevels);
+ if (ret != 0 || nlevels > DN_MAX_LEVELS)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_blksz", &blksz);
+ if (ret != 0 || blksz < SPA_MINBLOCKSIZE)
+ return (SET_ERROR(EINVAL));
+ else if (blksz > spa_maxblocksize(tx->tx_pool->dp_spa))
+ return (SET_ERROR(ENOTSUP));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_indblkshift", &ibs);
+ if (ret != 0 || ibs < DN_MIN_INDBLKSHIFT || ibs > DN_MAX_INDBLKSHIFT)
+ return (SET_ERROR(ENOTSUP));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_nblkptr", &nblkptr);
+ if (ret != 0 || nblkptr != DN_MAX_NBLKPTR)
+ return (SET_ERROR(ENOTSUP));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_maxblkid", &maxblkid);
+ if (ret != 0)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint8_array(nvl, "portable_mac", &buf, &len);
+ if (ret != 0 || len != ZIO_OBJSET_MAC_LEN)
+ return (SET_ERROR(EINVAL));
+
+ ret = dmu_objset_from_ds(ds, &os);
+ if (ret != 0)
+ return (ret);
+
+ mdn = DMU_META_DNODE(os);
+
+ /*
+ * If we already created the objset, make sure its unchangeable
+ * properties match the ones received in the nvlist.
+ */
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ if (!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) &&
+ (mdn->dn_nlevels != nlevels || mdn->dn_datablksz != blksz ||
+ mdn->dn_indblkshift != ibs || mdn->dn_nblkptr != nblkptr)) {
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ /*
+ * Check that the ivset guid of the fromds matches the one from the
+ * send stream. Older versions of the encryption code did not have
+ * an ivset guid on the from dataset and did not send one in the
+ * stream. For these streams we provide the
+ * zfs_disable_ivset_guid_check tunable to allow these datasets to
+ * be received with a generated ivset guid.
+ */
+ if (fromds != NULL && !zfs_disable_ivset_guid_check) {
+ uint64_t from_ivset_guid = 0;
+ intval = 0;
+
+ (void) nvlist_lookup_uint64(nvl, "from_ivset_guid", &intval);
+ (void) zap_lookup(tx->tx_pool->dp_meta_objset,
+ fromds->ds_object, DS_FIELD_IVSET_GUID,
+ sizeof (from_ivset_guid), 1, &from_ivset_guid);
+
+ if (intval == 0 || from_ivset_guid == 0)
+ return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISSING));
+
+ if (intval != from_ivset_guid)
+ return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISMATCH));
+ }
+
+ return (0);
+}
+
+static void
+dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ nvlist_t *nvl, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = tx->tx_pool;
+ objset_t *os;
+ dnode_t *mdn;
+ zio_t *zio;
+ uint8_t *portable_mac;
+ uint_t len;
+ uint64_t compress, checksum, nlevels, blksz, ibs, maxblkid;
+ boolean_t newds = B_FALSE;
+
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ mdn = DMU_META_DNODE(os);
+
+ /*
+ * Fetch the values we need from the nvlist. "to_ivset_guid" must
+ * be set on the snapshot, which doesn't exist yet. The receive
+ * code will take care of this for us later.
+ */
+ compress = fnvlist_lookup_uint64(nvl, "mdn_compress");
+ checksum = fnvlist_lookup_uint64(nvl, "mdn_checksum");
+ nlevels = fnvlist_lookup_uint64(nvl, "mdn_nlevels");
+ blksz = fnvlist_lookup_uint64(nvl, "mdn_blksz");
+ ibs = fnvlist_lookup_uint64(nvl, "mdn_indblkshift");
+ maxblkid = fnvlist_lookup_uint64(nvl, "mdn_maxblkid");
+ VERIFY0(nvlist_lookup_uint8_array(nvl, "portable_mac", &portable_mac,
+ &len));
+
+ /* if we haven't created an objset for the ds yet, do that now */
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ if (BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) {
+ (void) dmu_objset_create_impl_dnstats(dp->dp_spa, ds,
+ dsl_dataset_get_blkptr(ds), ostype, nlevels, blksz,
+ ibs, tx);
+ newds = B_TRUE;
+ }
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ /*
+ * Set the portable MAC. The local MAC will always be zero since the
+ * incoming data will all be portable and user accounting will be
+ * deferred until the next mount. Afterwards, flag the os to be
+ * written out raw next time.
+ */
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+ bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN);
+ os->os_phys->os_flags &= ~OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ os->os_phys->os_flags &= ~OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
+ os->os_flags = os->os_phys->os_flags;
+ bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN);
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
+
+ /* set metadnode compression and checksum */
+ mdn->dn_compress = compress;
+ mdn->dn_checksum = checksum;
+
+ rw_enter(&mdn->dn_struct_rwlock, RW_WRITER);
+ dnode_new_blkid(mdn, maxblkid, tx, B_FALSE, B_TRUE);
+ rw_exit(&mdn->dn_struct_rwlock);
+
+ /*
+ * We can't normally dirty the dataset in syncing context unless
+ * we are creating a new dataset. In this case, we perform a
+ * pseudo txg sync here instead.
+ */
+ if (newds) {
+ dsl_dataset_dirty(ds, tx);
+ } else {
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dsl_dataset_sync(ds, zio, tx);
+ VERIFY0(zio_wait(zio));
+
+ /* dsl_dataset_sync_done will drop this reference. */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+ dsl_dataset_sync_done(ds, tx);
+ }
+}
+
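+/*
+ * Validate the DSL Crypto Key portion of the nvlist received with a raw
+ * send stream before dsl_crypto_recv_raw_key_sync() writes it to disk.
+ */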
+int
+dsl_crypto_recv_raw_key_check(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx)
+{
+ int ret;
+ objset_t *mos = tx->tx_pool->dp_meta_objset;
+ uint8_t *buf = NULL;
+ uint_t len;
+ uint64_t intval, key_guid, version;
+ boolean_t is_passphrase = B_FALSE;
+
+ ASSERT(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT);
+
+ /*
+ * Read and check all the encryption values from the nvlist. We need
+ * all of the fields of a DSL Crypto Key, as well as a fully specified
+ * wrapping key.
+ */
+ ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, &intval);
+ if (ret != 0 || intval >= ZIO_CRYPT_FUNCTIONS ||
+ intval <= ZIO_CRYPT_OFF)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID, &intval);
+ if (ret != 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * If this is an incremental receive make sure the given key guid
+ * matches the one we already have.
+ */
+ if (ds->ds_dir->dd_crypto_obj != 0) {
+ ret = zap_lookup(mos, ds->ds_dir->dd_crypto_obj,
+ DSL_CRYPTO_KEY_GUID, 8, 1, &key_guid);
+ if (ret != 0)
+ return (ret);
+ if (intval != key_guid)
+ return (SET_ERROR(EACCES));
+ }
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
+ &buf, &len);
+ if (ret != 0 || len != MASTER_KEY_MAX_LEN)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
+ &buf, &len);
+ if (ret != 0 || len != SHA512_HMAC_KEYLEN)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &buf, &len);
+ if (ret != 0 || len != WRAPPING_IV_LEN)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &buf, &len);
+ if (ret != 0 || len != WRAPPING_MAC_LEN)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * We don't support receiving old on-disk formats. The version 0
+ * implementation protected several fields in an objset that were
+ * not always portable during a raw receive. As a result, we call
+ * the old version an on-disk errata #3.
+ */
+ ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_VERSION, &version);
+ if (ret != 0 || version != ZIO_CRYPT_KEY_CURRENT_VERSION)
+ return (SET_ERROR(ENOTSUP));
+
+ ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT),
+ &intval);
+ if (ret != 0 || intval >= ZFS_KEYFORMAT_FORMATS ||
+ intval == ZFS_KEYFORMAT_NONE)
+ return (SET_ERROR(EINVAL));
+
+ is_passphrase = (intval == ZFS_KEYFORMAT_PASSPHRASE);
+
+ /*
+	 * For raw receives we allow any number of pbkdf2iters since there
+	 * won't be a chance for the user to change it. The iters and salt
+	 * looked up below must be nonzero exactly when the keyformat is a
+	 * passphrase.
+ */
+ ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
+ &intval);
+ if (ret != 0 || (is_passphrase == (intval == 0)))
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
+ &intval);
+ if (ret != 0 || (is_passphrase == (intval == 0)))
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
+void
+dsl_crypto_recv_raw_key_sync(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = tx->tx_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dir_t *dd = ds->ds_dir;
+ uint_t len;
+ uint64_t rddobj, one = 1;
+ uint8_t *keydata, *hmac_keydata, *iv, *mac;
+ uint64_t crypt, key_guid, keyformat, iters, salt;
+ uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+ char *keylocation = "prompt";
+
+ /* lookup the values we need to create the DSL Crypto Key */
+ crypt = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE);
+ key_guid = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID);
+ keyformat = fnvlist_lookup_uint64(nvl,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT));
+ iters = fnvlist_lookup_uint64(nvl,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS));
+ salt = fnvlist_lookup_uint64(nvl,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
+ &keydata, &len));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
+ &hmac_keydata, &len));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &iv, &len));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &mac, &len));
+
+ /* if this is a new dataset setup the DSL Crypto Key. */
+ if (dd->dd_crypto_obj == 0) {
+ /* zapify the dsl dir so we can add the key object to it */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_zapify(dd, tx);
+
+ /* create the DSL Crypto Key on disk and activate the feature */
+ dd->dd_crypto_obj = zap_create(mos,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ dd->dd_crypto_obj, DSL_CRYPTO_KEY_REFCOUNT,
+ sizeof (uint64_t), 1, &one, tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ dd->dd_crypto_obj, DSL_CRYPTO_KEY_VERSION,
+ sizeof (uint64_t), 1, &version, tx));
+
+ dsl_dataset_activate_feature(ds->ds_object,
+ SPA_FEATURE_ENCRYPTION, (void *)B_TRUE, tx);
+ ds->ds_feature[SPA_FEATURE_ENCRYPTION] = (void *)B_TRUE;
+
+ /* save the dd_crypto_obj on disk */
+ VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_CRYPTO_KEY_OBJ,
+ sizeof (uint64_t), 1, &dd->dd_crypto_obj, tx));
+
+ /*
+ * Set the keylocation to prompt by default. If keylocation
+ * has been provided via the properties, this will be overridden
+ * later.
+ */
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1,
+ keylocation, tx);
+
+ rddobj = dd->dd_object;
+ } else {
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(dd, &rddobj));
+ }
+
+ /* sync the key data to the ZAP object on disk */
+ dsl_crypto_key_sync_impl(mos, dd->dd_crypto_obj, crypt,
+ rddobj, key_guid, iv, mac, keydata, hmac_keydata, keyformat, salt,
+ iters, tx);
+}
+
+static int
+dsl_crypto_recv_key_check(void *arg, dmu_tx_t *tx)
+{
+ int ret;
+ dsl_crypto_recv_key_arg_t *dcrka = arg;
+ dsl_dataset_t *ds = NULL, *fromds = NULL;
+
+ ret = dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_dsobj,
+ FTAG, &ds);
+ if (ret != 0)
+ goto out;
+
+ if (dcrka->dcrka_fromobj != 0) {
+ ret = dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_fromobj,
+ FTAG, &fromds);
+ if (ret != 0)
+ goto out;
+ }
+
+ ret = dsl_crypto_recv_raw_objset_check(ds, fromds,
+ dcrka->dcrka_ostype, dcrka->dcrka_nvl, tx);
+ if (ret != 0)
+ goto out;
+
+ /*
+ * We run this check even if we won't be doing this part of
+ * the receive now so that we don't make the user wait until
+ * the receive finishes to fail.
+ */
+ ret = dsl_crypto_recv_raw_key_check(ds, dcrka->dcrka_nvl, tx);
+ if (ret != 0)
+ goto out;
+
+out:
+ if (ds != NULL)
+ dsl_dataset_rele(ds, FTAG);
+ if (fromds != NULL)
+ dsl_dataset_rele(fromds, FTAG);
+ return (ret);
+}
+
+static void
+dsl_crypto_recv_key_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_crypto_recv_key_arg_t *dcrka = arg;
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_dsobj,
+ FTAG, &ds));
+ dsl_crypto_recv_raw_objset_sync(ds, dcrka->dcrka_ostype,
+ dcrka->dcrka_nvl, tx);
+ if (dcrka->dcrka_do_key)
+ dsl_crypto_recv_raw_key_sync(ds, dcrka->dcrka_nvl, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * This function is used to sync an nvlist representing a DSL Crypto Key and
+ * the associated encryption parameters. The key will be written exactly as is
+ * without wrapping it.
+ */
+int
+dsl_crypto_recv_raw(const char *poolname, uint64_t dsobj, uint64_t fromobj,
+ dmu_objset_type_t ostype, nvlist_t *nvl, boolean_t do_key)
+{
+ dsl_crypto_recv_key_arg_t dcrka;
+
+ dcrka.dcrka_dsobj = dsobj;
+ dcrka.dcrka_fromobj = fromobj;
+ dcrka.dcrka_ostype = ostype;
+ dcrka.dcrka_nvl = nvl;
+ dcrka.dcrka_do_key = do_key;
+
+ return (dsl_sync_task(poolname, dsl_crypto_recv_key_check,
+ dsl_crypto_recv_key_sync, &dcrka, 1, ZFS_SPACE_CHECK_NORMAL));
+}
+
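+/*
+ * Gather the wrapped DSL Crypto Key fields and metadnode properties for this
+ * objset into an nvlist for a raw send, mirroring the checks performed by
+ * dsl_crypto_recv_raw_objset_check() and dsl_crypto_recv_raw_key_check().
+ */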
+int
+dsl_crypto_populate_key_nvlist(objset_t *os, uint64_t from_ivset_guid,
+ nvlist_t **nvl_out)
+{
+ int ret;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dnode_t *mdn;
+ uint64_t rddobj;
+ nvlist_t *nvl = NULL;
+ uint64_t dckobj = ds->ds_dir->dd_crypto_obj;
+ dsl_dir_t *rdd = NULL;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t crypt = 0, key_guid = 0, format = 0;
+ uint64_t iters = 0, salt = 0, version = 0;
+ uint64_t to_ivset_guid = 0;
+ uint8_t raw_keydata[MASTER_KEY_MAX_LEN];
+ uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN];
+ uint8_t iv[WRAPPING_IV_LEN];
+ uint8_t mac[WRAPPING_MAC_LEN];
+
+ ASSERT(dckobj != 0);
+
+ mdn = DMU_META_DNODE(os);
+
+ nvl = fnvlist_alloc();
+
+ /* lookup values from the DSL Crypto Key */
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1,
+ &crypt);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &key_guid);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1,
+ MASTER_KEY_MAX_LEN, raw_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1,
+ SHA512_HMAC_KEYLEN, raw_hmac_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN,
+ iv);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN,
+ mac);
+ if (ret != 0)
+ goto error;
+
+ /* see zfs_disable_ivset_guid_check tunable for errata info */
+ ret = zap_lookup(mos, ds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
+ &to_ivset_guid);
+ if (ret != 0)
+ ASSERT3U(dp->dp_spa->spa_errata, !=, 0);
+
+ /*
+ * We don't support raw sends of legacy on-disk formats. See the
+ * comment in dsl_crypto_recv_key_check() for details.
+ */
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_VERSION, 8, 1, &version);
+ if (ret != 0 || version != ZIO_CRYPT_KEY_CURRENT_VERSION) {
+ dp->dp_spa->spa_errata = ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
+ ret = SET_ERROR(ENOTSUP);
+ goto error;
+ }
+
+ /*
+ * Lookup wrapping key properties. An early version of the code did
+ * not correctly add these values to the wrapping key or the DSL
+ * Crypto Key on disk for non encryption roots, so to be safe we
+ * always take the slightly circuitous route of looking it up from
+ * the encryption root's key.
+ */
+ ret = dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj);
+ if (ret != 0)
+ goto error;
+
+ dsl_pool_config_enter(dp, FTAG);
+
+ ret = dsl_dir_hold_obj(dp, rddobj, NULL, FTAG, &rdd);
+ if (ret != 0)
+ goto error_unlock;
+
+ ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &format);
+ if (ret != 0)
+ goto error_unlock;
+
+ if (format == ZFS_KEYFORMAT_PASSPHRASE) {
+ ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters);
+ if (ret != 0)
+ goto error_unlock;
+
+ ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt);
+ if (ret != 0)
+ goto error_unlock;
+ }
+
+ dsl_dir_rele(rdd, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+
+ fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, crypt);
+ fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_GUID, key_guid);
+ fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_VERSION, version);
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
+ raw_keydata, MASTER_KEY_MAX_LEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
+ raw_hmac_keydata, SHA512_HMAC_KEYLEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_IV, iv,
+ WRAPPING_IV_LEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, mac,
+ WRAPPING_MAC_LEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, "portable_mac",
+ os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN));
+ fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), format);
+ fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), iters);
+ fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt);
+ fnvlist_add_uint64(nvl, "mdn_checksum", mdn->dn_checksum);
+ fnvlist_add_uint64(nvl, "mdn_compress", mdn->dn_compress);
+ fnvlist_add_uint64(nvl, "mdn_nlevels", mdn->dn_nlevels);
+ fnvlist_add_uint64(nvl, "mdn_blksz", mdn->dn_datablksz);
+ fnvlist_add_uint64(nvl, "mdn_indblkshift", mdn->dn_indblkshift);
+ fnvlist_add_uint64(nvl, "mdn_nblkptr", mdn->dn_nblkptr);
+ fnvlist_add_uint64(nvl, "mdn_maxblkid", mdn->dn_maxblkid);
+ fnvlist_add_uint64(nvl, "to_ivset_guid", to_ivset_guid);
+ fnvlist_add_uint64(nvl, "from_ivset_guid", from_ivset_guid);
+
+ *nvl_out = nvl;
+ return (0);
+
+error_unlock:
+ dsl_pool_config_exit(dp, FTAG);
+error:
+ if (rdd != NULL)
+ dsl_dir_rele(rdd, FTAG);
+ nvlist_free(nvl);
+
+ *nvl_out = NULL;
+ return (ret);
+}
+
+uint64_t
+dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey,
+ dmu_tx_t *tx)
+{
+ dsl_crypto_key_t dck;
+ uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+ uint64_t one = 1ULL;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(crypt, >, ZIO_CRYPT_OFF);
+
+ /* create the DSL Crypto Key ZAP object */
+ dck.dck_obj = zap_create(tx->tx_pool->dp_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+
+ /* fill in the key (on the stack) and sync it to disk */
+ dck.dck_wkey = wkey;
+ VERIFY0(zio_crypt_key_init(crypt, &dck.dck_key));
+
+ dsl_crypto_key_sync(&dck, tx);
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, dck.dck_obj,
+ DSL_CRYPTO_KEY_REFCOUNT, sizeof (uint64_t), 1, &one, tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, dck.dck_obj,
+ DSL_CRYPTO_KEY_VERSION, sizeof (uint64_t), 1, &version, tx));
+
+ zio_crypt_key_destroy(&dck.dck_key);
+ bzero(&dck.dck_key, sizeof (zio_crypt_key_t));
+
+ return (dck.dck_obj);
+}
+
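+/*
+ * Clones share their origin's DSL Crypto Key, so cloning just takes another
+ * reference on the existing key object.
+ */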
+uint64_t
+dsl_crypto_key_clone_sync(dsl_dir_t *origindd, dmu_tx_t *tx)
+{
+ objset_t *mos = tx->tx_pool->dp_meta_objset;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(zap_increment(mos, origindd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_REFCOUNT, 1, tx));
+
+ return (origindd->dd_crypto_obj);
+}
+
+void
+dsl_crypto_key_destroy_sync(uint64_t dckobj, dmu_tx_t *tx)
+{
+ objset_t *mos = tx->tx_pool->dp_meta_objset;
+ uint64_t refcnt;
+
+ /* Decrement the refcount, destroy if this is the last reference */
+ VERIFY0(zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT,
+ sizeof (uint64_t), 1, &refcnt));
+
+ if (refcnt != 1) {
+ VERIFY0(zap_increment(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT,
+ -1, tx));
+ } else {
+ VERIFY0(zap_destroy(mos, dckobj, tx));
+ }
+}
+
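+/*
+ * Add this dataset's encryption-related properties (key status, suite, key
+ * guid, keyformat, pbkdf2 parameters, IV set guid, encryption root) to nv.
+ */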
+void
+dsl_dataset_crypt_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ uint64_t intval;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_dir_t *enc_root;
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+
+ if (dd->dd_crypto_obj == 0)
+ return;
+
+ intval = dsl_dataset_get_keystatus(dd);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYSTATUS, intval);
+
+ if (dsl_dir_get_crypt(dd, &intval) == 0)
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_ENCRYPTION, intval);
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_GUID, 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEY_GUID, intval);
+ }
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYFORMAT, intval);
+ }
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_SALT, intval);
+ }
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_ITERS, intval);
+ }
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, ds->ds_object,
+ DS_FIELD_IVSET_GUID, 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_IVSET_GUID, intval);
+ }
+
+ if (dsl_dir_get_encryption_root_ddobj(dd, &intval) == 0) {
+ if (dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG,
+ &enc_root) == 0) {
+ dsl_dir_name(enc_root, buf);
+ dsl_dir_rele(enc_root, FTAG);
+ dsl_prop_nvlist_add_string(nv,
+ ZFS_PROP_ENCRYPTION_ROOT, buf);
+ }
+ }
+}
+
+int
+spa_crypt_get_salt(spa_t *spa, uint64_t dsobj, uint8_t *salt)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ ret = zio_crypt_key_get_salt(&dck->dck_key, salt);
+ if (ret != 0)
+ goto error;
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ return (ret);
+}
+
+/*
+ * Objset blocks are a special case for MAC generation. These blocks have 2
+ * 256-bit MACs which are embedded within the block itself, rather than a
+ * single 128 bit MAC. As a result, this function handles encoding and decoding
+ * the MACs on its own, unlike other functions in this file.
+ */
+int
+spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj,
+ abd_t *abd, uint_t datalen, boolean_t byteswap)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+ void *buf = abd_borrow_buf_copy(abd, datalen);
+ objset_phys_t *osp = buf;
+ uint8_t portable_mac[ZIO_OBJSET_MAC_LEN];
+ uint8_t local_mac[ZIO_OBJSET_MAC_LEN];
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ /* calculate both HMACs */
+ ret = zio_crypt_do_objset_hmacs(&dck->dck_key, buf, datalen,
+ byteswap, portable_mac, local_mac);
+ if (ret != 0)
+ goto error;
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ /* if we are generating encode the HMACs in the objset_phys_t */
+ if (generate) {
+ bcopy(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN);
+ bcopy(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN);
+ abd_return_buf_copy(abd, buf, datalen);
+ return (0);
+ }
+
+ if (bcmp(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN) != 0 ||
+ bcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) {
+ abd_return_buf(abd, buf, datalen);
+ return (SET_ERROR(ECKSUM));
+ }
+
+ abd_return_buf(abd, buf, datalen);
+
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ abd_return_buf(abd, buf, datalen);
+ return (ret);
+}
+
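+/*
+ * Compute the truncated HMAC for a single encrypted block. If generate is
+ * set the result is copied into mac; otherwise it is compared against the
+ * expected mac and ECKSUM is returned on a mismatch.
+ */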
+int
+spa_do_crypt_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, abd_t *abd,
+ uint_t datalen, uint8_t *mac)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+ uint8_t *buf = abd_borrow_buf_copy(abd, datalen);
+ uint8_t digestbuf[ZIO_DATA_MAC_LEN];
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ /* perform the hmac */
+ ret = zio_crypt_do_hmac(&dck->dck_key, buf, datalen,
+ digestbuf, ZIO_DATA_MAC_LEN);
+ if (ret != 0)
+ goto error;
+
+ abd_return_buf(abd, buf, datalen);
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ /*
+ * Truncate and fill in mac buffer if we were asked to generate a MAC.
+ * Otherwise verify that the MAC matched what we expected.
+ */
+ if (generate) {
+ bcopy(digestbuf, mac, ZIO_DATA_MAC_LEN);
+ return (0);
+ }
+
+ if (bcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0)
+ return (SET_ERROR(ECKSUM));
+
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ abd_return_buf(abd, buf, datalen);
+ return (ret);
+}
+
+/*
+ * This function serves as a multiplexer for encryption and decryption of
+ * all blocks (except the L2ARC). For encryption, it will populate the IV,
+ * salt, MAC, and cabd (the ciphertext). On decryption it will simply use
+ * these fields to populate pabd (the plaintext).
+ */
+int
+spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb,
+ dmu_object_type_t ot, boolean_t dedup, boolean_t bswap, uint8_t *salt,
+ uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd,
+ boolean_t *no_crypt)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+ uint8_t *plainbuf = NULL, *cipherbuf = NULL;
+
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, zb->zb_objset, FTAG, &dck);
+ if (ret != 0) {
+ ret = SET_ERROR(EACCES);
+ return (ret);
+ }
+
+ if (encrypt) {
+ plainbuf = abd_borrow_buf_copy(pabd, datalen);
+ cipherbuf = abd_borrow_buf(cabd, datalen);
+ } else {
+ plainbuf = abd_borrow_buf(pabd, datalen);
+ cipherbuf = abd_borrow_buf_copy(cabd, datalen);
+ }
+
+ /*
+ * Both encryption and decryption functions need a salt for key
+ * generation and an IV. When encrypting a non-dedup block, we
+ * generate the salt and IV randomly to be stored by the caller. Dedup
+ * blocks perform a (more expensive) HMAC of the plaintext to obtain
+ * the salt and the IV. ZIL blocks have their salt and IV generated
+ * at allocation time in zio_alloc_zil(). On decryption, we simply use
+ * the provided values.
+ */
+ if (encrypt && ot != DMU_OT_INTENT_LOG && !dedup) {
+ ret = zio_crypt_key_get_salt(&dck->dck_key, salt);
+ if (ret != 0)
+ goto error;
+
+ ret = zio_crypt_generate_iv(iv);
+ if (ret != 0)
+ goto error;
+ } else if (encrypt && dedup) {
+ ret = zio_crypt_generate_iv_salt_dedup(&dck->dck_key,
+ plainbuf, datalen, iv, salt);
+ if (ret != 0)
+ goto error;
+ }
+
+ /* call lower level function to perform encryption / decryption */
+ ret = zio_do_crypt_data(encrypt, &dck->dck_key, ot, bswap, salt, iv,
+ mac, datalen, plainbuf, cipherbuf, no_crypt);
+
+ /*
+ * Handle injected decryption faults. Unfortunately, we cannot inject
+ * faults for dnode blocks because we might trigger the panic in
+ * dbuf_prepare_encrypted_dnode_leaf(), which exists because syncing
+ * context is not prepared to handle malicious decryption failures.
+ */
+ if (zio_injection_enabled && !encrypt && ot != DMU_OT_DNODE && ret == 0)
+ ret = zio_handle_decrypt_injection(spa, zb, ot, ECKSUM);
+ if (ret != 0)
+ goto error;
+
+ if (encrypt) {
+ abd_return_buf(pabd, plainbuf, datalen);
+ abd_return_buf_copy(cabd, cipherbuf, datalen);
+ } else {
+ abd_return_buf_copy(pabd, plainbuf, datalen);
+ abd_return_buf(cabd, cipherbuf, datalen);
+ }
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ return (0);
+
+error:
+ if (encrypt) {
+ /* zero out any state we might have changed while encrypting */
+ bzero(salt, ZIO_DATA_SALT_LEN);
+ bzero(iv, ZIO_DATA_IV_LEN);
+ bzero(mac, ZIO_DATA_MAC_LEN);
+ abd_return_buf(pabd, plainbuf, datalen);
+ abd_return_buf_copy(cabd, cipherbuf, datalen);
+ } else {
+ abd_return_buf_copy(pabd, plainbuf, datalen);
+ abd_return_buf(cabd, cipherbuf, datalen);
+ }
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ return (ret);
+}
+
+ZFS_MODULE_PARAM(zfs, zfs_, disable_ivset_guid_check, INT, ZMOD_RW,
+ "Set to allow raw receives without IVset guids");
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
new file mode 100644
index 000000000000..6da5faf01edf
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
@@ -0,0 +1,5014 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 RackTop Systems.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2020 The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/unique.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/policy.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_recv.h>
+#include <sys/zio_compress.h>
+#include <zfs_fletcher.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * The SPA supports block sizes up to 16MB. However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator. Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB). Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+int zfs_allow_redacted_dataset_mount = 0;
+
+#define SWITCH64(x, y) \
+ { \
+ uint64_t __tmp = (x); \
+ (x) = (y); \
+ (y) = __tmp; \
+ }
+
+#define DS_REF_MAX (1ULL << 62)
+
+extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
+
+static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
+ uint64_t obj, dmu_tx_t *tx);
+static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
+ dmu_tx_t *tx);
+
+static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f);
+
+extern int spa_asize_inflation;
+
+static zil_header_t zero_zil;
+
+/*
+ * Figure out how much of this delta should be propagated to the dsl_dir
+ * layer. If there's a refreservation, that space has already been
+ * partially accounted for in our ancestors.
+ */
+static int64_t
+parent_delta(dsl_dataset_t *ds, int64_t delta)
+{
+ dsl_dataset_phys_t *ds_phys;
+ uint64_t old_bytes, new_bytes;
+
+ if (ds->ds_reserved == 0)
+ return (delta);
+
+ ds_phys = dsl_dataset_phys(ds);
+ old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
+ new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+
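+ /*
+ * For example, with ds_reserved = 10M: going from 4M to 6M of unique
+ * space leaves both old_bytes and new_bytes clamped to the 10M
+ * reservation, so nothing is propagated; going from 9M to 11M
+ * propagates only the 1M that exceeds the refreservation.
+ */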
+ ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
+ return (new_bytes - old_bytes);
+}
+
+void
+dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ int used = bp_get_dsize_sync(spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+ int64_t delta;
+ spa_feature_t f;
+
+ dprintf_bp(bp, "ds=%p", ds);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* It could have been compressed away to nothing */
+ if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
+ return;
+ ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
+ ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
+ if (ds == NULL) {
+ dsl_pool_mos_diduse_space(tx->tx_pool,
+ used, compressed, uncompressed);
+ return;
+ }
+
+ ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ mutex_enter(&ds->ds_lock);
+ delta = parent_delta(ds, used);
+ dsl_dataset_phys(ds)->ds_referenced_bytes += used;
+ dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
+ dsl_dataset_phys(ds)->ds_unique_bytes += used;
+
+ if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
+ ds->ds_feature_activation[SPA_FEATURE_LARGE_BLOCKS] =
+ (void *)B_TRUE;
+ }
+
+ f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
+ if (f != SPA_FEATURE_NONE) {
+ ASSERT3S(spa_feature_table[f].fi_type, ==,
+ ZFEATURE_TYPE_BOOLEAN);
+ ds->ds_feature_activation[f] = (void *)B_TRUE;
+ }
+
+ f = zio_compress_to_feature(BP_GET_COMPRESS(bp));
+ if (f != SPA_FEATURE_NONE) {
+ ASSERT3S(spa_feature_table[f].fi_type, ==,
+ ZFEATURE_TYPE_BOOLEAN);
+ ds->ds_feature_activation[f] = (void *)B_TRUE;
+ }
+
+ /*
+ * Track block for livelist, but ignore embedded blocks because
+ * they do not need to be freed.
+ */
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+ bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+ !(BP_IS_EMBEDDED(bp))) {
+ ASSERT(dsl_dir_is_clone(ds->ds_dir));
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LIVELIST));
+ bplist_append(&ds->ds_dir->dd_pending_allocs, bp);
+ }
+
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
+ compressed, uncompressed, tx);
+ dsl_dir_transfer_space(ds->ds_dir, used - delta,
+ DD_USED_REFRSRV, DD_USED_HEAD, tx);
+}
+
+/*
+ * Called when the specified segment has been remapped, and is thus no
+ * longer referenced in the head dataset. The vdev must be indirect.
+ *
+ * If the segment is referenced by a snapshot, put it on the remap deadlist.
+ * Otherwise, add this segment to the obsolete spacemap.
+ */
+void
+dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
+ uint64_t size, uint64_t birth, dmu_tx_t *tx)
+{
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(birth <= tx->tx_txg);
+ ASSERT(!ds->ds_is_snapshot);
+
+ if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
+ } else {
+ blkptr_t fakebp;
+ dva_t *dva = &fakebp.blk_dva[0];
+
+ ASSERT(ds != NULL);
+
+ mutex_enter(&ds->ds_remap_deadlist_lock);
+ if (!dsl_dataset_remap_deadlist_exists(ds)) {
+ dsl_dataset_create_remap_deadlist(ds, tx);
+ }
+ mutex_exit(&ds->ds_remap_deadlist_lock);
+
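+ /*
+ * The deadlist entry only needs the freed segment's vdev, offset,
+ * size, and birth txg, so construct a synthetic block pointer with a
+ * single DVA carrying that information and leave the other fields
+ * zeroed.
+ */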
+ BP_ZERO(&fakebp);
+ fakebp.blk_birth = birth;
+ DVA_SET_VDEV(dva, vdev);
+ DVA_SET_OFFSET(dva, offset);
+ DVA_SET_ASIZE(dva, size);
+ dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE,
+ tx);
+ }
+}
+
+int
+dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
+ boolean_t async)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ int used = bp_get_dsize_sync(spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+
+ if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
+ return (0);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(bp->blk_birth <= tx->tx_txg);
+
+ if (ds == NULL) {
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
+ dsl_pool_mos_diduse_space(tx->tx_pool,
+ -used, -compressed, -uncompressed);
+ return (used);
+ }
+ ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+
+ ASSERT(!ds->ds_is_snapshot);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ /*
+ * Track block for livelist, but ignore embedded blocks because
+ * they do not need to be freed.
+ */
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+ bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+ !(BP_IS_EMBEDDED(bp))) {
+ ASSERT(dsl_dir_is_clone(ds->ds_dir));
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LIVELIST));
+ bplist_append(&ds->ds_dir->dd_pending_frees, bp);
+ }
+
+ if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ int64_t delta;
+
+ dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
+
+ mutex_enter(&ds->ds_lock);
+ ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
+ !DS_UNIQUE_IS_ACCURATE(ds));
+ delta = parent_delta(ds, -used);
+ dsl_dataset_phys(ds)->ds_unique_bytes -= used;
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+ delta, -compressed, -uncompressed, tx);
+ dsl_dir_transfer_space(ds->ds_dir, -used - delta,
+ DD_USED_REFRSRV, DD_USED_HEAD, tx);
+ } else {
+ dprintf_bp(bp, "putting on dead list: %s", "");
+ if (async) {
+ /*
+ * We are here as part of zio's write done callback,
+ * which means we're a zio interrupt thread. We can't
+ * call dsl_deadlist_insert() now because it may block
+ * waiting for I/O. Instead, put bp on the deferred
+ * queue and let dsl_pool_sync() finish the job.
+ */
+ bplist_append(&ds->ds_pending_deadlist, bp);
+ } else {
+ dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx);
+ }
+ ASSERT3U(ds->ds_prev->ds_object, ==,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
+ /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+ ds->ds_object && bp->blk_birth >
+ dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ mutex_enter(&ds->ds_prev->ds_lock);
+ dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
+ mutex_exit(&ds->ds_prev->ds_lock);
+ }
+ if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+ dsl_dir_transfer_space(ds->ds_dir, used,
+ DD_USED_HEAD, DD_USED_SNAP, tx);
+ }
+ }
+
+ dsl_bookmark_block_killed(ds, bp, tx);
+
+ mutex_enter(&ds->ds_lock);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
+ dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
+ dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
+ mutex_exit(&ds->ds_lock);
+
+ return (used);
+}
+
+struct feature_type_uint64_array_arg {
+ uint64_t length;
+ uint64_t *array;
+};
+
+static void
+unload_zfeature(dsl_dataset_t *ds, spa_feature_t f)
+{
+ switch (spa_feature_table[f].fi_type) {
+ case ZFEATURE_TYPE_BOOLEAN:
+ break;
+ case ZFEATURE_TYPE_UINT64_ARRAY:
+ {
+ struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f];
+ kmem_free(ftuaa->array, ftuaa->length * sizeof (uint64_t));
+ kmem_free(ftuaa, sizeof (*ftuaa));
+ break;
+ }
+ default:
+ panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
+ }
+}
+
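+/*
+ * Per-dataset feature state is stored as ZAP entries on the (zapified)
+ * dataset object, keyed by the feature's guid. For ZFEATURE_TYPE_BOOLEAN
+ * features the presence of the entry alone marks the feature active; for
+ * ZFEATURE_TYPE_UINT64_ARRAY features the entry holds the array itself,
+ * which is cached in ds_feature[] as a feature_type_uint64_array_arg.
+ */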
+static int
+load_zfeature(objset_t *mos, dsl_dataset_t *ds, spa_feature_t f)
+{
+ int err = 0;
+ switch (spa_feature_table[f].fi_type) {
+ case ZFEATURE_TYPE_BOOLEAN:
+ err = zap_contains(mos, ds->ds_object,
+ spa_feature_table[f].fi_guid);
+ if (err == 0) {
+ ds->ds_feature[f] = (void *)B_TRUE;
+ } else {
+ ASSERT3U(err, ==, ENOENT);
+ err = 0;
+ }
+ break;
+ case ZFEATURE_TYPE_UINT64_ARRAY:
+ {
+ uint64_t int_size, num_int;
+ uint64_t *data;
+ err = zap_length(mos, ds->ds_object,
+ spa_feature_table[f].fi_guid, &int_size, &num_int);
+ if (err != 0) {
+ ASSERT3U(err, ==, ENOENT);
+ err = 0;
+ break;
+ }
+ ASSERT3U(int_size, ==, sizeof (uint64_t));
+ data = kmem_alloc(int_size * num_int, KM_SLEEP);
+ VERIFY0(zap_lookup(mos, ds->ds_object,
+ spa_feature_table[f].fi_guid, int_size, num_int, data));
+ struct feature_type_uint64_array_arg *ftuaa =
+ kmem_alloc(sizeof (*ftuaa), KM_SLEEP);
+ ftuaa->length = num_int;
+ ftuaa->array = data;
+ ds->ds_feature[f] = ftuaa;
+ break;
+ }
+ default:
+ panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
+ }
+ return (err);
+}
+
+/*
+ * We have to release the fsid synchronously or we risk that a subsequent
+ * mount of the same dataset will fail to unique_insert the fsid. This
+ * failure would manifest itself as the fsid of this dataset changing
+ * between mounts, which makes NFS clients quite unhappy.
+ */
+static void
+dsl_dataset_evict_sync(void *dbu)
+{
+ dsl_dataset_t *ds = dbu;
+
+ ASSERT(ds->ds_owner == NULL);
+
+ unique_remove(ds->ds_fsid_guid);
+}
+
+static void
+dsl_dataset_evict_async(void *dbu)
+{
+ dsl_dataset_t *ds = dbu;
+
+ ASSERT(ds->ds_owner == NULL);
+
+ ds->ds_dbuf = NULL;
+
+ if (ds->ds_objset != NULL)
+ dmu_objset_evict(ds->ds_objset);
+
+ if (ds->ds_prev) {
+ dsl_dataset_rele(ds->ds_prev, ds);
+ ds->ds_prev = NULL;
+ }
+
+ dsl_bookmark_fini_ds(ds);
+
+ bplist_destroy(&ds->ds_pending_deadlist);
+ if (dsl_deadlist_is_open(&ds->ds_deadlist))
+ dsl_deadlist_close(&ds->ds_deadlist);
+ if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+ if (ds->ds_dir)
+ dsl_dir_async_rele(ds->ds_dir, ds);
+
+ ASSERT(!list_link_active(&ds->ds_synced_link));
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (dsl_dataset_feature_is_active(ds, f))
+ unload_zfeature(ds, f);
+ }
+
+ list_destroy(&ds->ds_prop_cbs);
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_sendstream_lock);
+ mutex_destroy(&ds->ds_remap_deadlist_lock);
+ zfs_refcount_destroy(&ds->ds_longholds);
+ rrw_destroy(&ds->ds_bp_rwlock);
+
+ kmem_free(ds, sizeof (dsl_dataset_t));
+}
+
+int
+dsl_dataset_get_snapname(dsl_dataset_t *ds)
+{
+ dsl_dataset_phys_t *headphys;
+ int err;
+ dmu_buf_t *headdbuf;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (ds->ds_snapname[0])
+ return (0);
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
+ return (0);
+
+ err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
+ FTAG, &headdbuf);
+ if (err != 0)
+ return (err);
+ headphys = headdbuf->db_data;
+ err = zap_value_search(dp->dp_meta_objset,
+ headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
+ if (err != 0 && zfs_recover == B_TRUE) {
+ (void) snprintf(ds->ds_snapname, sizeof (ds->ds_snapname),
+ "SNAPOBJ=%llu-ERR=%d",
+ (unsigned long long)ds->ds_object, err);
+ err = 0;
+ }
+ dmu_buf_rele(headdbuf, FTAG);
+ return (err);
+}
+
+int
+dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ matchtype_t mt = 0;
+ int err;
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ err = zap_lookup_norm(mos, snapobj, name, 8, 1,
+ value, mt, NULL, 0, NULL);
+ if (err == ENOTSUP && (mt & MT_NORMALIZE))
+ err = zap_lookup(mos, snapobj, name, 8, 1, value);
+ return (err);
+}
+
+int
+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+ boolean_t adj_cnt)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ matchtype_t mt = 0;
+ int err;
+
+ dsl_dir_snap_cmtime_update(ds->ds_dir);
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ err = zap_remove_norm(mos, snapobj, name, mt, tx);
+ if (err == ENOTSUP && (mt & MT_NORMALIZE))
+ err = zap_remove(mos, snapobj, name, tx);
+
+ if (err == 0 && adj_cnt)
+ dsl_fs_ss_count_adjust(ds->ds_dir, -1,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ return (err);
+}
+
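+/*
+ * Try to take a reference on the dataset without blocking: grab a hold on
+ * its bonus dbuf and then check that the dbuf's user is still this
+ * dsl_dataset_t, so a dataset that is concurrently being evicted is not
+ * handed back to the caller.
+ */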
+boolean_t
+dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
+{
+ dmu_buf_t *dbuf = ds->ds_dbuf;
+ boolean_t result = B_FALSE;
+
+ if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
+ ds->ds_object, DMU_BONUS_BLKID, tag)) {
+
+ if (ds == dmu_buf_get_user(dbuf))
+ result = B_TRUE;
+ else
+ dmu_buf_rele(dbuf, tag);
+ }
+
+ return (result);
+}
+
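+/*
+ * Hold a dataset by object number. The pool configuration lock must be
+ * held, and each successful hold must be released with dsl_dataset_rele()
+ * using the same tag. A minimal, illustrative caller (dp and dsobj are
+ * assumed to be in scope):
+ *
+ *     dsl_dataset_t *ds;
+ *     int err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ *     if (err == 0) {
+ *             ... use ds ...
+ *             dsl_dataset_rele(ds, FTAG);
+ *     }
+ */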
+int
+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+ dsl_dataset_t **dsp)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ dmu_buf_t *dbuf;
+ dsl_dataset_t *ds;
+ int err;
+ dmu_object_info_t doi;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
+ if (err != 0)
+ return (err);
+
+ /* Make sure dsobj has the correct object type. */
+ dmu_object_info_from_db(dbuf, &doi);
+ if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
+ dmu_buf_rele(dbuf, tag);
+ return (SET_ERROR(EINVAL));
+ }
+
+ ds = dmu_buf_get_user(dbuf);
+ if (ds == NULL) {
+ dsl_dataset_t *winner = NULL;
+
+ ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
+ ds->ds_dbuf = dbuf;
+ ds->ds_object = dsobj;
+ ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
+ list_link_init(&ds->ds_synced_link);
+
+ err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj,
+ NULL, ds, &ds->ds_dir);
+ if (err != 0) {
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+
+ mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_remap_deadlist_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ rrw_init(&ds->ds_bp_rwlock, B_FALSE);
+ zfs_refcount_create(&ds->ds_longholds);
+
+ bplist_create(&ds->ds_pending_deadlist);
+
+ list_create(&ds->ds_sendstreams, sizeof (dmu_sendstatus_t),
+ offsetof(dmu_sendstatus_t, dss_link));
+
+ list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
+ offsetof(dsl_prop_cb_record_t, cbr_ds_node));
+
+ if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+ spa_feature_t f;
+
+ for (f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET))
+ continue;
+ err = load_zfeature(mos, ds, f);
+ }
+ }
+
+ if (!ds->ds_is_snapshot) {
+ ds->ds_snapname[0] = '\0';
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ ds, &ds->ds_prev);
+ }
+ err = dsl_bookmark_init_ds(ds);
+ } else {
+ if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
+ err = dsl_dataset_get_snapname(ds);
+ if (err == 0 &&
+ dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+ err = zap_count(
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_userrefs_obj,
+ &ds->ds_userrefs);
+ }
+ }
+
+ if (err == 0 && !ds->ds_is_snapshot) {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ &ds->ds_reserved);
+ if (err == 0) {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+ &ds->ds_quota);
+ }
+ } else {
+ ds->ds_reserved = ds->ds_quota = 0;
+ }
+
+ if (err == 0 && ds->ds_dir->dd_crypto_obj != 0 &&
+ ds->ds_is_snapshot &&
+ zap_contains(mos, dsobj, DS_FIELD_IVSET_GUID) != 0) {
+ dp->dp_spa->spa_errata =
+ ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
+ }
+
+ dsl_deadlist_open(&ds->ds_deadlist,
+ mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
+ uint64_t remap_deadlist_obj =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ if (remap_deadlist_obj != 0) {
+ dsl_deadlist_open(&ds->ds_remap_deadlist, mos,
+ remap_deadlist_obj);
+ }
+
+ dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,
+ dsl_dataset_evict_async, &ds->ds_dbuf);
+ if (err == 0)
+ winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
+
+ if (err != 0 || winner != NULL) {
+ bplist_destroy(&ds->ds_pending_deadlist);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+ dsl_bookmark_fini_ds(ds);
+ if (ds->ds_prev)
+ dsl_dataset_rele(ds->ds_prev, ds);
+ dsl_dir_rele(ds->ds_dir, ds);
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (dsl_dataset_feature_is_active(ds, f))
+ unload_zfeature(ds, f);
+ }
+
+ list_destroy(&ds->ds_prop_cbs);
+ list_destroy(&ds->ds_sendstreams);
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_sendstream_lock);
+ mutex_destroy(&ds->ds_remap_deadlist_lock);
+ zfs_refcount_destroy(&ds->ds_longholds);
+ rrw_destroy(&ds->ds_bp_rwlock);
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ if (err != 0) {
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+ ds = winner;
+ } else {
+ ds->ds_fsid_guid =
+ unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
+ if (ds->ds_fsid_guid !=
+ dsl_dataset_phys(ds)->ds_fsid_guid) {
+ zfs_dbgmsg("ds_fsid_guid changed from "
+ "%llx to %llx for pool %s dataset id %llu",
+ (u_longlong_t)
+ dsl_dataset_phys(ds)->ds_fsid_guid,
+ (u_longlong_t)ds->ds_fsid_guid,
+ spa_name(dp->dp_spa),
+ (u_longlong_t)dsobj);
+ }
+ }
+ }
+
+ ASSERT3P(ds->ds_dbuf, ==, dbuf);
+ ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
+ ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
+ spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
+ dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
+ *dsp = ds;
+
+ return (0);
+}
+
+int
+dsl_dataset_create_key_mapping(dsl_dataset_t *ds)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+
+ if (dd->dd_crypto_obj == 0)
+ return (0);
+
+ return (spa_keystore_create_mapping(dd->dd_pool->dp_spa,
+ ds, ds, &ds->ds_key_mapping));
+}
+
+int
+dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj,
+ ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp)
+{
+ int err;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
+ if (err != 0)
+ return (err);
+
+ ASSERT3P(*dsp, !=, NULL);
+
+ if (flags & DS_HOLD_FLAG_DECRYPT) {
+ err = dsl_dataset_create_key_mapping(*dsp);
+ if (err != 0)
+ dsl_dataset_rele(*dsp, tag);
+ }
+
+ return (err);
+}
+
+int
+dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+ void *tag, dsl_dataset_t **dsp)
+{
+ dsl_dir_t *dd;
+ const char *snapname;
+ uint64_t obj;
+ int err = 0;
+ dsl_dataset_t *ds;
+
+ err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
+ if (err != 0)
+ return (err);
+
+ ASSERT(dsl_pool_config_held(dp));
+ obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+ if (obj != 0)
+ err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &ds);
+ else
+ err = SET_ERROR(ENOENT);
+
+ /* we may be looking for a snapshot */
+ if (err == 0 && snapname != NULL) {
+ dsl_dataset_t *snap_ds;
+
+ if (*snapname++ != '@') {
+ dsl_dataset_rele_flags(ds, flags, tag);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ dprintf("looking for snapshot '%s'\n", snapname);
+ err = dsl_dataset_snap_lookup(ds, snapname, &obj);
+ if (err == 0) {
+ err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag,
+ &snap_ds);
+ }
+ dsl_dataset_rele_flags(ds, flags, tag);
+
+ if (err == 0) {
+ mutex_enter(&snap_ds->ds_lock);
+ if (snap_ds->ds_snapname[0] == 0)
+ (void) strlcpy(snap_ds->ds_snapname, snapname,
+ sizeof (snap_ds->ds_snapname));
+ mutex_exit(&snap_ds->ds_lock);
+ ds = snap_ds;
+ }
+ }
+ if (err == 0)
+ *dsp = ds;
+ dsl_dir_rele(dd, FTAG);
+ return (err);
+}
+
+int
+dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag,
+ dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp));
+}
+
+static int
+dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
+ void *tag, boolean_t override, dsl_dataset_t **dsp)
+{
+ int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp);
+ if (err != 0)
+ return (err);
+ if (!dsl_dataset_tryown(*dsp, tag, override)) {
+ dsl_dataset_rele_flags(*dsp, flags, tag);
+ *dsp = NULL;
+ return (SET_ERROR(EBUSY));
+ }
+ return (0);
+}
+
+
+int
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
+ void *tag, dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp));
+}
+
+int
+dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj,
+ ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp));
+}
+
+static int
+dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+ void *tag, boolean_t override, dsl_dataset_t **dsp)
+{
+ int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp);
+ if (err != 0)
+ return (err);
+ if (!dsl_dataset_tryown(*dsp, tag, override)) {
+ dsl_dataset_rele_flags(*dsp, flags, tag);
+ return (SET_ERROR(EBUSY));
+ }
+ return (0);
+}
+
+int
+dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+ void *tag, dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp));
+}
+
+int
+dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+ void *tag, dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp));
+}
+
+/*
+ * See the comment above dsl_pool_hold() for details. In summary, a long
+ * hold is used to prevent destruction of a dataset while the pool hold
+ * is dropped, allowing other concurrent operations (e.g. spa_sync()).
+ *
+ * The dataset and pool must be held when this function is called. After it
+ * is called, the pool hold may be released while the dataset is still held
+ * and accessed.
+ */
+void
+dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
+{
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+ (void) zfs_refcount_add(&ds->ds_longholds, tag);
+}
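+
+/*
+ * An illustrative long-hold sequence (declarations and error handling
+ * omitted): take the pool hold, hold the dataset, take the long hold,
+ * then drop the pool hold while the dataset remains in use:
+ *
+ *     dsl_pool_hold(name, FTAG, &dp);
+ *     dsl_dataset_hold(dp, name, FTAG, &ds);
+ *     dsl_dataset_long_hold(ds, FTAG);
+ *     dsl_pool_rele(dp, FTAG);
+ *     ... long-running work on ds ...
+ *     dsl_dataset_long_rele(ds, FTAG);
+ *     dsl_dataset_rele(ds, FTAG);
+ */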
+
+void
+dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
+{
+ (void) zfs_refcount_remove(&ds->ds_longholds, tag);
+}
+
+/* Return B_TRUE if there are any long holds on this dataset. */
+boolean_t
+dsl_dataset_long_held(dsl_dataset_t *ds)
+{
+ return (!zfs_refcount_is_zero(&ds->ds_longholds));
+}
+
+void
+dsl_dataset_name(dsl_dataset_t *ds, char *name)
+{
+ if (ds == NULL) {
+ (void) strlcpy(name, "mos", ZFS_MAX_DATASET_NAME_LEN);
+ } else {
+ dsl_dir_name(ds->ds_dir, name);
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ if (ds->ds_snapname[0]) {
+ VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ /*
+ * We use a "recursive" mutex so that we
+ * can call dprintf_ds() with ds_lock held.
+ */
+ if (!MUTEX_HELD(&ds->ds_lock)) {
+ mutex_enter(&ds->ds_lock);
+ VERIFY3U(strlcat(name, ds->ds_snapname,
+ ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
+ mutex_exit(&ds->ds_lock);
+ } else {
+ VERIFY3U(strlcat(name, ds->ds_snapname,
+ ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
+ }
+ }
+ }
+}
+
+int
+dsl_dataset_namelen(dsl_dataset_t *ds)
+{
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ mutex_enter(&ds->ds_lock);
+ int len = strlen(ds->ds_snapname);
+ mutex_exit(&ds->ds_lock);
+ /* add '@' if ds is a snap */
+ if (len > 0)
+ len++;
+ len += dsl_dir_namelen(ds->ds_dir);
+ return (len);
+}
+
+void
+dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
+{
+ dmu_buf_rele(ds->ds_dbuf, tag);
+}
+
+void
+dsl_dataset_remove_key_mapping(dsl_dataset_t *ds)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+
+ if (dd == NULL || dd->dd_crypto_obj == 0)
+ return;
+
+ (void) spa_keystore_remove_mapping(dd->dd_pool->dp_spa,
+ ds->ds_object, ds);
+}
+
+void
+dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
+{
+ if (flags & DS_HOLD_FLAG_DECRYPT)
+ dsl_dataset_remove_key_mapping(ds);
+
+ dsl_dataset_rele(ds, tag);
+}
+
+void
+dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
+{
+ ASSERT3P(ds->ds_owner, ==, tag);
+ ASSERT(ds->ds_dbuf != NULL);
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_owner = NULL;
+ mutex_exit(&ds->ds_lock);
+ dsl_dataset_long_rele(ds, tag);
+ dsl_dataset_rele_flags(ds, flags, tag);
+}
+
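+/*
+ * Attempt to become the dataset's owner. On success a long hold is taken
+ * on behalf of the owner, so a successful dsl_dataset_tryown() must
+ * eventually be balanced by dsl_dataset_disown() with the same tag.
+ */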
+boolean_t
+dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override)
+{
+ boolean_t gotit = FALSE;
+
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_owner == NULL && (override || !(DS_IS_INCONSISTENT(ds) ||
+ (dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_REDACTED_DATASETS) &&
+ !zfs_allow_redacted_dataset_mount)))) {
+ ds->ds_owner = tag;
+ dsl_dataset_long_hold(ds, tag);
+ gotit = TRUE;
+ }
+ mutex_exit(&ds->ds_lock);
+ return (gotit);
+}
+
+boolean_t
+dsl_dataset_has_owner(dsl_dataset_t *ds)
+{
+ boolean_t rv;
+ mutex_enter(&ds->ds_lock);
+ rv = (ds->ds_owner != NULL);
+ mutex_exit(&ds->ds_lock);
+ return (rv);
+}
+
+static boolean_t
+zfeature_active(spa_feature_t f, void *arg)
+{
+ switch (spa_feature_table[f].fi_type) {
+ case ZFEATURE_TYPE_BOOLEAN: {
+ boolean_t val = (boolean_t)(uintptr_t)arg;
+ ASSERT(val == B_FALSE || val == B_TRUE);
+ return (val);
+ }
+ case ZFEATURE_TYPE_UINT64_ARRAY:
+ /*
+ * In this case, arg points to a feature_type_uint64_array_arg
+ * (a length plus a uint64_t array). The feature is active if
+ * the pointer is non-NULL.
+ */
+ return (arg != NULL);
+ default:
+ panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
+ return (B_FALSE);
+ }
+}
+
+boolean_t
+dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f)
+{
+ return (zfeature_active(f, ds->ds_feature[f]));
+}
+
+/*
+ * The buffers passed out by this function are references to internal buffers;
+ * they should not be freed by callers of this function, and they should not be
+ * used after the dataset has been released.
+ */
+boolean_t
+dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, spa_feature_t f,
+ uint64_t *outlength, uint64_t **outp)
+{
+ VERIFY(spa_feature_table[f].fi_type & ZFEATURE_TYPE_UINT64_ARRAY);
+ if (!dsl_dataset_feature_is_active(ds, f)) {
+ return (B_FALSE);
+ }
+ struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f];
+ *outp = ftuaa->array;
+ *outlength = ftuaa->length;
+ return (B_TRUE);
+}
+
+void
+dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t zero = 0;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ spa_feature_incr(spa, f, tx);
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+ switch (spa_feature_table[f].fi_type) {
+ case ZFEATURE_TYPE_BOOLEAN:
+ ASSERT3S((boolean_t)(uintptr_t)arg, ==, B_TRUE);
+ VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
+ sizeof (zero), 1, &zero, tx));
+ break;
+ case ZFEATURE_TYPE_UINT64_ARRAY:
+ {
+ struct feature_type_uint64_array_arg *ftuaa = arg;
+ VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
+ sizeof (uint64_t), ftuaa->length, ftuaa->array, tx));
+ break;
+ }
+ default:
+ panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
+ }
+}
+
+static void
+dsl_dataset_deactivate_feature_impl(dsl_dataset_t *ds, spa_feature_t f,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t dsobj = ds->ds_object;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
+ spa_feature_decr(spa, f, tx);
+ ds->ds_feature[f] = NULL;
+}
+
+void
+dsl_dataset_deactivate_feature(dsl_dataset_t *ds, spa_feature_t f, dmu_tx_t *tx)
+{
+ unload_zfeature(ds, f);
+ dsl_dataset_deactivate_feature_impl(ds, f, tx);
+}
+
+uint64_t
+dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
+ dsl_crypto_params_t *dcp, uint64_t flags, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ uint64_t dsobj;
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (origin == NULL)
+ origin = dp->dp_origin_snap;
+
+ ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
+ ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ bzero(dsphys, sizeof (dsl_dataset_phys_t));
+ dsphys->ds_dir_obj = dd->dd_object;
+ dsphys->ds_flags = flags;
+ dsphys->ds_fsid_guid = unique_create();
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ dsphys->ds_snapnames_zapobj =
+ zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
+ DMU_OT_NONE, 0, tx);
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
+
+ if (origin == NULL) {
+ dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
+ } else {
+ dsl_dataset_t *ohds; /* head of the origin snapshot */
+
+ dsphys->ds_prev_snap_obj = origin->ds_object;
+ dsphys->ds_prev_snap_txg =
+ dsl_dataset_phys(origin)->ds_creation_txg;
+ dsphys->ds_referenced_bytes =
+ dsl_dataset_phys(origin)->ds_referenced_bytes;
+ dsphys->ds_compressed_bytes =
+ dsl_dataset_phys(origin)->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes =
+ dsl_dataset_phys(origin)->ds_uncompressed_bytes;
+ rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);
+ dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
+ rrw_exit(&origin->ds_bp_rwlock, FTAG);
+
+ /*
+ * Inherit flags that describe the dataset's contents
+ * (INCONSISTENT) or properties (Case Insensitive).
+ */
+ dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
+ (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (zfeature_active(f, origin->ds_feature[f])) {
+ dsl_dataset_activate_feature(dsobj, f,
+ origin->ds_feature[f], tx);
+ }
+ }
+
+ dmu_buf_will_dirty(origin->ds_dbuf, tx);
+ dsl_dataset_phys(origin)->ds_num_children++;
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
+ FTAG, &ohds));
+ dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
+ dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
+ dsl_dataset_rele(ohds, FTAG);
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
+ if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
+ dsl_dataset_phys(origin)->ds_next_clones_obj =
+ zap_create(mos,
+ DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(mos,
+ dsl_dataset_phys(origin)->ds_next_clones_obj,
+ dsobj, tx));
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(origin->ds_dir)->dd_clones =
+ zap_create(mos,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(mos,
+ dsl_dir_phys(origin->ds_dir)->dd_clones,
+ dsobj, tx));
+ }
+ }
+
+ /* handle encryption */
+ dsl_dataset_create_crypt_sync(dsobj, dd, origin, dcp, tx);
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+ dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+ dmu_buf_rele(dbuf, FTAG);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
+
+ return (dsobj);
+}
+
+static void
+dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *os;
+
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ zio_t *zio;
+
+ bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+ if (os->os_encrypted)
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
+
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dsl_dataset_sync(ds, zio, tx);
+ VERIFY0(zio_wait(zio));
+
+ /* dsl_dataset_sync_done will drop this reference. */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+ dsl_dataset_sync_done(ds, tx);
+ }
+}
+
+uint64_t
+dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
+ dsl_dataset_t *origin, uint64_t flags, cred_t *cr,
+ dsl_crypto_params_t *dcp, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = pdd->dd_pool;
+ uint64_t dsobj, ddobj;
+ dsl_dir_t *dd;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(lastname[0] != '@');
+ /*
+ * Filesystems will eventually have their origin set to dp_origin_snap,
+ * but that's taken care of in dsl_dataset_create_sync_dd. When
+ * creating a filesystem, this function is called with origin equal to
+ * NULL.
+ */
+ if (origin != NULL)
+ ASSERT3P(origin, !=, dp->dp_origin_snap);
+
+ ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
+
+ dsobj = dsl_dataset_create_sync_dd(dd, origin, dcp,
+ flags & ~DS_CREATE_FLAG_NODIRTY, tx);
+
+ dsl_deleg_set_create_perms(dd, tx, cr);
+
+ /*
+ * If we are creating a clone and the livelist feature is enabled,
+ * add the entry DD_FIELD_LIVELIST to ZAP.
+ */
+ if (origin != NULL &&
+ spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) {
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dsl_dir_zapify(dd, tx);
+ uint64_t obj = dsl_deadlist_alloc(mos, tx);
+ VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST,
+ sizeof (uint64_t), 1, &obj, tx));
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx);
+ }
+
+ /*
+ * Since we're creating a new node we know it's a leaf, so we can
+ * initialize the counts if the limit feature is active.
+ */
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ uint64_t cnt = 0;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+
+ dsl_dir_zapify(dd, tx);
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (cnt), 1, &cnt, tx));
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (cnt), 1, &cnt, tx));
+ }
+
+ dsl_dir_rele(dd, FTAG);
+
+ /*
+ * If we are creating a clone, make sure we zero out any stale
+ * data from the origin snapshot's zil header.
+ */
+ if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ dsl_dataset_zero_zil(ds, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ return (dsobj);
+}
+
+/*
+ * The unique space in the head dataset can be calculated by subtracting
+ * the space in the most recent snapshot that is still referenced by
+ * this file system from the space currently in use. To figure out
+ * the space in the most recent snapshot still in use, we need to take
+ * the total space used in the snapshot and subtract out the space that
+ * has been freed up since the snapshot was taken.
+ */
+void
+dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
+{
+ uint64_t mrs_used;
+ uint64_t dlused, dlcomp, dluncomp;
+
+ ASSERT(!ds->ds_is_snapshot);
+
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
+ mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
+ else
+ mrs_used = 0;
+
+ dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
+
+ ASSERT3U(dlused, <=, mrs_used);
+ dsl_dataset_phys(ds)->ds_unique_bytes =
+ dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
+
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+ SPA_VERSION_UNIQUE_ACCURATE)
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+}
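+
+/*
+ * For example, in the calculation above: if the head currently references
+ * 10G, the most recent snapshot referenced 8G when it was taken, and 3G of
+ * that snapshot's blocks have since been freed from the head (and so live
+ * on the head's deadlist), then 5G of the snapshot is still shared with
+ * the head and the head's unique space is 10G - 5G = 5G.
+ */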
+
+void
+dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
+ dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t count __maybe_unused;
+ int err;
+
+ ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
+ err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+ obj, tx);
+ /*
+ * The err should not be ENOENT, but a bug in a previous version
+ * of the code could cause upgrade_clones_cb() to not set
+ * ds_next_snap_obj when it should, leading to a missing entry.
+ * If we knew that the pool was created after
+ * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
+ * ENOENT. However, at least we can check that we don't have
+ * too many entries in the next_clones_obj even after failing to
+ * remove this one.
+ */
+ if (err != ENOENT)
+ VERIFY0(err);
+ ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+ &count));
+ ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
+}
+
+
+blkptr_t *
+dsl_dataset_get_blkptr(dsl_dataset_t *ds)
+{
+ return (&dsl_dataset_phys(ds)->ds_bp);
+}
+
+spa_t *
+dsl_dataset_get_spa(dsl_dataset_t *ds)
+{
+ return (ds->ds_dir->dd_pool->dp_spa);
+}
+
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp;
+
+ if (ds == NULL) /* this is the meta-objset */
+ return;
+
+ ASSERT(ds->ds_objset != NULL);
+
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
+ panic("dirtying snapshot!");
+
+ /* Must not dirty a dataset in the same txg where it got snapshotted. */
+ ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+
+ dp = ds->ds_dir->dd_pool;
+ if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
+ objset_t *os = ds->ds_objset;
+
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+
+ /* if this dataset is encrypted, grab a reference to the DCK */
+ if (ds->ds_dir->dd_crypto_obj != 0 &&
+ !os->os_raw_receive &&
+ !os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
+ ASSERT3P(ds->ds_key_mapping, !=, NULL);
+ key_mapping_add_ref(ds->ds_key_mapping, ds);
+ }
+ }
+}
+
+static int
+dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t asize;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ /*
+ * If there's an fs-only reservation, any blocks that might become
+ * owned by the snapshot dataset must be accommodated by space
+ * outside of the reservation.
+ */
+ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
+ asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
+ if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
+ return (SET_ERROR(ENOSPC));
+
+ /*
+ * Propagate any reserved space for this snapshot to other
+ * snapshot checks in this sync group.
+ */
+ if (asize > 0)
+ dsl_dir_willuse_space(ds->ds_dir, asize, tx);
+
+ return (0);
+}
+
+int
+dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
+ dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr, proc_t *proc)
+{
+ int error;
+ uint64_t value;
+
+ ds->ds_trysnap_txg = tx->tx_txg;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ /*
+ * We don't allow multiple snapshots in the same txg. If there
+ * is already one, try again.
+ */
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
+ return (SET_ERROR(EAGAIN));
+
+ /*
+ * Check for conflicting snapshot name.
+ */
+ error = dsl_dataset_snap_lookup(ds, snapname, &value);
+ if (error == 0)
+ return (SET_ERROR(EEXIST));
+ if (error != ENOENT)
+ return (error);
+
+ /*
+ * We don't allow taking snapshots of inconsistent datasets, such as
+ * those into which we are currently receiving. However, if we are
+ * creating this snapshot as part of a receive, this check will be
+ * executed atomically with respect to the completion of the receive
+ * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
+ * case we ignore this, knowing it will be fixed up for us shortly in
+ * dmu_recv_end_sync().
+ */
+ if (!recv && DS_IS_INCONSISTENT(ds))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Skip the check for temporary snapshots or if we have already checked
+ * the counts in dsl_dataset_snapshot_check. This means we really only
+ * check the count here when we're receiving a stream.
+ */
+ if (cnt != 0 && cr != NULL) {
+ error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr, proc);
+ if (error != 0)
+ return (error);
+ }
+
+ error = dsl_dataset_snapshot_reserve_space(ds, tx);
+ if (error != 0)
+ return (error);
+
+ return (0);
+}
+
+int
+dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_arg_t *ddsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ nvpair_t *pair;
+ int rv = 0;
+
+ /*
+ * Pre-compute how many total new snapshots will be created for each
+ * level in the tree and below. This is needed for validating the
+ * snapshot limit when either taking a recursive snapshot or when
+ * taking multiple snapshots.
+ *
+ * The problem is that the counts are not actually adjusted when
+ * we are checking, only when we finally sync. For a single snapshot,
+ * this is easy, the count will increase by 1 at each node up the tree,
+ * but it's more complicated for the recursive/multiple snapshot case.
+ *
+ * The dsl_fs_ss_limit_check function does recursively check the count
+ * at each level up the tree but since it is validating each snapshot
+ * independently we need to be sure that we are validating the complete
+ * count for the entire set of snapshots. We do this by rolling up the
+ * counts for each component of the name into an nvlist and then
+ * checking each of those cases with the aggregated count.
+ *
+ * This approach properly handles not only the recursive snapshot
+ * case (where we get all of those on the ddsa_snaps list) but also
+ * the sibling case (e.g. snapshot a/b and a/c so that we will also
+ * validate the limit on 'a' using a count of 2).
+ *
+ * We validate the snapshot names in the third loop and only report
+ * name errors once.
+ */
+ if (dmu_tx_is_syncing(tx)) {
+ char *nm;
+ nvlist_t *cnt_track = NULL;
+ cnt_track = fnvlist_alloc();
+
+ nm = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ /* Rollup aggregated counts into the cnt_track list */
+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+ char *pdelim;
+ uint64_t val;
+
+ (void) strlcpy(nm, nvpair_name(pair), MAXPATHLEN);
+ pdelim = strchr(nm, '@');
+ if (pdelim == NULL)
+ continue;
+ *pdelim = '\0';
+
+ do {
+ if (nvlist_lookup_uint64(cnt_track, nm,
+ &val) == 0) {
+ /* update existing entry */
+ fnvlist_add_uint64(cnt_track, nm,
+ val + 1);
+ } else {
+ /* add to list */
+ fnvlist_add_uint64(cnt_track, nm, 1);
+ }
+
+ pdelim = strrchr(nm, '/');
+ if (pdelim != NULL)
+ *pdelim = '\0';
+ } while (pdelim != NULL);
+ }
+
+ kmem_free(nm, MAXPATHLEN);
+
+ /* Check aggregated counts at each level */
+ for (pair = nvlist_next_nvpair(cnt_track, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
+ int error = 0;
+ char *name;
+ uint64_t cnt = 0;
+ dsl_dataset_t *ds;
+
+ name = nvpair_name(pair);
+ cnt = fnvpair_value_uint64(pair);
+ ASSERT(cnt > 0);
+
+ error = dsl_dataset_hold(dp, name, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL,
+ ddsa->ddsa_cr, ddsa->ddsa_proc);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ if (error != 0) {
+ if (ddsa->ddsa_errors != NULL)
+ fnvlist_add_int32(ddsa->ddsa_errors,
+ name, error);
+ rv = error;
+ /* only report one error for this check */
+ break;
+ }
+ }
+ nvlist_free(cnt_track);
+ }
+
+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+ int error = 0;
+ dsl_dataset_t *ds;
+ char *name, *atp = NULL;
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
+
+ name = nvpair_name(pair);
+ if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
+ error = SET_ERROR(ENAMETOOLONG);
+ if (error == 0) {
+ atp = strchr(name, '@');
+ if (atp == NULL)
+ error = SET_ERROR(EINVAL);
+ if (error == 0)
+ (void) strlcpy(dsname, name, atp - name + 1);
+ }
+ if (error == 0)
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == 0) {
+ /* passing 0/NULL skips dsl_fs_ss_limit_check */
+ error = dsl_dataset_snapshot_check_impl(ds,
+ atp + 1, tx, B_FALSE, 0, NULL, NULL);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ if (error != 0) {
+ if (ddsa->ddsa_errors != NULL) {
+ fnvlist_add_int32(ddsa->ddsa_errors,
+ name, error);
+ }
+ rv = error;
+ }
+ }
+
+ return (rv);
+}
+
+void
+dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ uint64_t dsobj, crtxg;
+ objset_t *mos = dp->dp_meta_objset;
+ static zil_header_t zero_zil __maybe_unused;
+ objset_t *os __maybe_unused;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ /*
+ * If we are on an old pool, the zil must not be active, in which
+ * case it will be zeroed. Usually zil_suspend() accomplishes this.
+ */
+ ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
+ dmu_objset_from_ds(ds, &os) != 0 ||
+ bcmp(&os->os_phys->os_zil_header, &zero_zil,
+ sizeof (zero_zil)) == 0);
+
+ /* Should not snapshot a dirty dataset. */
+ ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
+ ds, tx->tx_txg));
+
+ dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ /*
+ * The origin's ds_creation_txg has to be < TXG_INITIAL
+ */
+ if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
+ crtxg = 1;
+ else
+ crtxg = tx->tx_txg;
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ bzero(dsphys, sizeof (dsl_dataset_phys_t));
+ dsphys->ds_dir_obj = ds->ds_dir->dd_object;
+ dsphys->ds_fsid_guid = unique_create();
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dsphys->ds_next_snap_obj = ds->ds_object;
+ dsphys->ds_num_children = 1;
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = crtxg;
+ dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+ dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
+ dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes =
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+ dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ dmu_buf_rele(dbuf, FTAG);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (zfeature_active(f, ds->ds_feature[f])) {
+ dsl_dataset_activate_feature(dsobj, f,
+ ds->ds_feature[f], tx);
+ }
+ }
+
+ ASSERT3U(ds->ds_prev != 0, ==,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
+ if (ds->ds_prev) {
+ uint64_t next_clones_obj =
+ dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
+ ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+ ds->ds_object ||
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+ ds->ds_object) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+ dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
+ } else if (next_clones_obj != 0) {
+ dsl_dataset_remove_from_next_clones(ds->ds_prev,
+ dsphys->ds_next_snap_obj, tx);
+ VERIFY0(zap_add_int(mos,
+ next_clones_obj, dsobj, tx));
+ }
+ }
+
+ /*
+ * If we have a reference-reservation on this dataset, we will
+ * need to increase the amount of refreservation being charged
+ * since our unique space is going to zero.
+ */
+ if (ds->ds_reserved) {
+ int64_t delta;
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
+ ds->ds_reserved);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
+ delta, 0, 0, tx);
+ }
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj =
+ dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_open(&ds->ds_deadlist, mos,
+ dsl_dataset_phys(ds)->ds_deadlist_obj);
+ dsl_deadlist_add_key(&ds->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ dsl_bookmark_snapshotted(ds, tx);
+
+ if (dsl_dataset_remap_deadlist_exists(ds)) {
+ uint64_t remap_deadlist_obj =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ /*
+ * Move the remap_deadlist to the snapshot. The head
+ * will create a new remap deadlist on demand, from
+ * dsl_dataset_block_remapped().
+ */
+ dsl_dataset_unset_remap_deadlist_object(ds, tx);
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST,
+ sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx));
+ }
+
+ /*
+ * Create an ivset guid for this snapshot if the dataset is
+ * encrypted. This may be overridden by a raw receive. A
+ * previous implementation of this code did not have this
+ * field as part of the on-disk format for ZFS encryption
+ * (see errata #4). As part of the remediation for this
+ * issue, we ask the user to enable the bookmark_v2 feature
+ * which is now a dependency of the encryption feature. We
+ * use this as a heuristic to determine when the user has
+ * elected to correct any datasets created with the old code.
+ * As a result, we only do this step if the bookmark_v2
+ * feature is enabled, which limits the number of states a
+ * given pool / dataset can be in with respect to correcting
+ * the issue.
+ */
+ if (ds->ds_dir->dd_crypto_obj != 0 &&
+ spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2)) {
+ uint64_t ivset_guid = unique_create();
+
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_IVSET_GUID,
+ sizeof (ivset_guid), 1, &ivset_guid, tx));
+ }
+
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
+ dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
+ dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
+ dsl_dataset_phys(ds)->ds_unique_bytes = 0;
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+ VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+ snapname, 8, 1, &dsobj, tx));
+
+ if (ds->ds_prev)
+ dsl_dataset_rele(ds->ds_prev, ds);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
+
+ dsl_scan_ds_snapshotted(ds, tx);
+
+ dsl_dir_snap_cmtime_update(ds->ds_dir);
+
+ spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " ");
+}
+
+void
+dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_arg_t *ddsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ nvpair_t *pair;
+
+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+ dsl_dataset_t *ds;
+ char *name, *atp;
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
+
+ name = nvpair_name(pair);
+ atp = strchr(name, '@');
+ (void) strlcpy(dsname, name, atp - name + 1);
+ VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
+
+ dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
+ if (ddsa->ddsa_props != NULL) {
+ dsl_props_set_sync_impl(ds->ds_prev,
+ ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ }
+}
+
+/*
+ * The snapshots must all be in the same pool.
+ * All-or-nothing: if there are any failures, nothing will be modified.
+ */
+int
+dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
+{
+ dsl_dataset_snapshot_arg_t ddsa;
+ nvpair_t *pair;
+ boolean_t needsuspend;
+ int error;
+ spa_t *spa;
+ char *firstname;
+ nvlist_t *suspended = NULL;
+
+ pair = nvlist_next_nvpair(snaps, NULL);
+ if (pair == NULL)
+ return (0);
+ firstname = nvpair_name(pair);
+
+ error = spa_open(firstname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+ spa_close(spa, FTAG);
+
+ if (needsuspend) {
+ suspended = fnvlist_alloc();
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ char *snapname = nvpair_name(pair);
+ char *atp;
+ void *cookie;
+
+ atp = strchr(snapname, '@');
+ if (atp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ (void) strlcpy(fsname, snapname, atp - snapname + 1);
+
+ error = zil_suspend(fsname, &cookie);
+ if (error != 0)
+ break;
+ fnvlist_add_uint64(suspended, fsname,
+ (uintptr_t)cookie);
+ }
+ }
+
+ ddsa.ddsa_snaps = snaps;
+ ddsa.ddsa_props = props;
+ ddsa.ddsa_errors = errors;
+ ddsa.ddsa_cr = CRED();
+ ddsa.ddsa_proc = curproc;
+
+ if (error == 0) {
+ error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
+ dsl_dataset_snapshot_sync, &ddsa,
+ fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
+ }
+
+ if (suspended != NULL) {
+ for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(suspended, pair)) {
+ zil_resume((void *)(uintptr_t)
+ fnvpair_value_uint64(pair));
+ }
+ fnvlist_free(suspended);
+ }
+
+ if (error == 0) {
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ zvol_create_minor(nvpair_name(pair));
+ }
+ }
+
+ return (error);
+}
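+
+/*
+ * An illustrative dsl_dataset_snapshot() caller (the dataset names and
+ * error handling here are hypothetical); the keys of the snaps nvlist are
+ * the full snapshot names and the pair values are ignored:
+ *
+ *     nvlist_t *snaps = fnvlist_alloc();
+ *     fnvlist_add_boolean(snaps, "tank/fs@today");
+ *     fnvlist_add_boolean(snaps, "tank/vol@today");
+ *     error = dsl_dataset_snapshot(snaps, NULL, NULL);
+ *     fnvlist_free(snaps);
+ */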
+
+typedef struct dsl_dataset_snapshot_tmp_arg {
+ const char *ddsta_fsname;
+ const char *ddsta_snapname;
+ minor_t ddsta_cleanup_minor;
+ const char *ddsta_htag;
+} dsl_dataset_snapshot_tmp_arg_t;
+
+static int
+dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ /* NULL cred means no limit check for tmp snapshot */
+ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
+ tx, B_FALSE, 0, NULL, NULL);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
+ B_TRUE, tx);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds = NULL;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
+
+ dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
+ dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
+ ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
+ dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
+ minor_t cleanup_minor, const char *htag)
+{
+ dsl_dataset_snapshot_tmp_arg_t ddsta;
+ int error;
+ spa_t *spa;
+ boolean_t needsuspend;
+ void *cookie;
+
+ ddsta.ddsta_fsname = fsname;
+ ddsta.ddsta_snapname = snapname;
+ ddsta.ddsta_cleanup_minor = cleanup_minor;
+ ddsta.ddsta_htag = htag;
+
+ error = spa_open(fsname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+ spa_close(spa, FTAG);
+
+ if (needsuspend) {
+ error = zil_suspend(fsname, &cookie);
+ if (error != 0)
+ return (error);
+ }
+
+ error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
+ dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
+
+ if (needsuspend)
+ zil_resume(cookie);
+ return (error);
+}
+
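+/*
+ * Write out this dataset's dirty data for the given txg: sync the fsid guid,
+ * persist any resumable-receive bookkeeping, sync the objset, and activate
+ * any per-dataset features that became pending during this txg.
+ */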
+void
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(ds->ds_objset != NULL);
+ ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
+
+ /*
+ * in case we had to change ds_fsid_guid when we opened it,
+ * sync it out now.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
+
+ if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
+ &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
+ &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
+ &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
+ ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
+ ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
+ ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
+ }
+
+ dmu_objset_sync(ds->ds_objset, zio, tx);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (zfeature_active(f, ds->ds_feature_activation[f])) {
+ if (zfeature_active(f, ds->ds_feature[f]))
+ continue;
+ dsl_dataset_activate_feature(ds->ds_object, f,
+ ds->ds_feature_activation[f], tx);
+ ds->ds_feature[f] = ds->ds_feature_activation[f];
+ }
+ }
+}
+
+/*
+ * Check if the percentage of blocks shared between the clone and the
+ * snapshot (as opposed to those that are clone only) is below a certain
+ * threshold
+ */
+static boolean_t
+dsl_livelist_should_disable(dsl_dataset_t *ds)
+{
+ uint64_t used, referenced;
+ int percent_shared;
+
+ used = dsl_dir_get_usedds(ds->ds_dir);
+ referenced = dsl_get_referenced(ds);
+ ASSERT3U(referenced, >=, 0);
+ ASSERT3U(used, >=, 0);
+ if (referenced == 0)
+ return (B_FALSE);
+ percent_shared = (100 * (referenced - used)) / referenced;
+ if (percent_shared <= zfs_livelist_min_percent_shared)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/*
+ * Check if it is possible to combine two livelist entries into one.
+ * This is the case if the combined number of 'live' blkptrs (ALLOCs that
+ * don't have a matching FREE) is under the maximum sublist size.
+ * We check this by subtracting twice the total number of frees from the total
+ * number of blkptrs. FREEs are counted twice because each FREE blkptr
+ * will cancel out an ALLOC blkptr when the livelist is processed.
+ */
+static boolean_t
+dsl_livelist_should_condense(dsl_deadlist_entry_t *first,
+ dsl_deadlist_entry_t *next)
+{
+ uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed +
+ next->dle_bpobj.bpo_phys->bpo_num_freed;
+ uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs +
+ next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+ if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+typedef struct try_condense_arg {
+ spa_t *spa;
+ dsl_dataset_t *ds;
+} try_condense_arg_t;
+
+/*
+ * Iterate over the livelist entries, searching for a pair to condense.
+ * A nonzero return value means stop, 0 means keep looking.
+ */
+static int
+dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first)
+{
+ try_condense_arg_t *tca = arg;
+ spa_t *spa = tca->spa;
+ dsl_dataset_t *ds = tca->ds;
+ dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
+ dsl_deadlist_entry_t *next;
+
+ /* The condense thread has not yet been created at import */
+ if (spa->spa_livelist_condense_zthr == NULL)
+ return (1);
+
+ /* A condense is already in progress */
+ if (spa->spa_to_condense.ds != NULL)
+ return (1);
+
+ next = AVL_NEXT(&ll->dl_tree, &first->dle_node);
+ /* The livelist has only one entry - don't condense it */
+ if (next == NULL)
+ return (1);
+
+ /* Next is the newest entry - don't condense it */
+ if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL)
+ return (1);
+
+ /* This pair is not ready to condense but keep looking */
+ if (!dsl_livelist_should_condense(first, next))
+ return (0);
+
+ /*
+ * Add a ref to prevent the dataset from being evicted while
+ * the condense zthr or synctask are running. Ref will be
+ * released at the end of the condense synctask
+ */
+ dmu_buf_add_ref(ds->ds_dbuf, spa);
+
+ spa->spa_to_condense.ds = ds;
+ spa->spa_to_condense.first = first;
+ spa->spa_to_condense.next = next;
+ spa->spa_to_condense.syncing = B_FALSE;
+ spa->spa_to_condense.cancelled = B_FALSE;
+
+ zthr_wakeup(spa->spa_livelist_condense_zthr);
+ return (1);
+}
+
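+/*
+ * Move this txg's pending ALLOC and FREE blkptrs onto the on-disk livelist,
+ * opening a new sub-livelist if the newest one is already full, and then
+ * look for adjacent sub-livelists that can be condensed.
+ */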
+static void
+dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+ dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist);
+
+ /* Check if we need to add a new sub-livelist */
+ if (last == NULL) {
+ /* The livelist is empty */
+ dsl_deadlist_add_key(&dd->dd_livelist,
+ tx->tx_txg - 1, tx);
+ } else if (spa_sync_pass(spa) == 1) {
+ /*
+ * Check if the newest entry is full. If it is, make a new one.
+ * We only do this once per sync because we could overfill a
+ * sublist in one sync pass and don't want to add another entry
+ * for a txg that is already represented. This ensures that
+ * blkptrs born in the same txg are stored in the same sublist.
+ */
+ bpobj_t bpobj = last->dle_bpobj;
+ uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs;
+ uint64_t free = bpobj.bpo_phys->bpo_num_freed;
+ uint64_t alloc = all - free;
+ if (alloc > zfs_livelist_max_entries) {
+ dsl_deadlist_add_key(&dd->dd_livelist,
+ tx->tx_txg - 1, tx);
+ }
+ }
+
+ /* Insert each entry into the on-disk livelist */
+ bplist_iterate(&dd->dd_pending_allocs,
+ dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx);
+ bplist_iterate(&dd->dd_pending_frees,
+ dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx);
+
+ /* Attempt to condense every pair of adjacent entries */
+ try_condense_arg_t arg = {
+ .spa = spa,
+ .ds = ds
+ };
+ dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense,
+ &arg);
+}
+
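+/*
+ * Finish syncing this dataset: flush the pending deadlist (and the livelist,
+ * if one is open), finalize bookmarks, and release the dataset's dbuf hold.
+ */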
+void
+dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *os = ds->ds_objset;
+
+ bplist_iterate(&ds->ds_pending_deadlist,
+ dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx);
+
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
+ dsl_flush_pending_livelist(ds, tx);
+ if (dsl_livelist_should_disable(ds)) {
+ dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE);
+ }
+ }
+
+ dsl_bookmark_sync_done(ds, tx);
+
+ multilist_destroy(os->os_synced_dnodes);
+ os->os_synced_dnodes = NULL;
+
+ if (os->os_encrypted)
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE;
+ else
+ ASSERT0(os->os_next_write_raw[tx->tx_txg & TXG_MASK]);
+
+ ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
+
+ dmu_buf_rele(ds->ds_dbuf, ds);
+}
+
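+/*
+ * Fill 'val' with the names of this snapshot's clones, as recorded in
+ * ds_next_clones_obj. Returns ENOENT if that list does not have the
+ * expected number of entries.
+ */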
+int
+get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val)
+{
+ uint64_t count = 0;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+ /*
+ * There may be missing entries in ds_next_clones_obj
+ * due to a bug in a previous version of the code.
+ * Only trust it if it has the right number of entries.
+ */
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+ &count));
+ }
+ if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {
+ return (SET_ERROR(ENOENT));
+ }
+ for (zap_cursor_init(&zc, mos,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *clone;
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+ za.za_first_integer, FTAG, &clone));
+ dsl_dir_name(clone->ds_dir, buf);
+ fnvlist_add_boolean(val, buf);
+ dsl_dataset_rele(clone, FTAG);
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+void
+get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ nvlist_t *propval = fnvlist_alloc();
+ nvlist_t *val = fnvlist_alloc();
+
+ if (get_clones_stat_impl(ds, val) == 0) {
+ fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
+ fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
+ propval);
+ }
+
+ nvlist_free(val);
+ nvlist_free(propval);
+}
+
+/*
+ * Returns a string that represents the receive resume stats token. It should
+ * be freed with kmem_strfree().
+ */
+char *
+get_receive_resume_stats_impl(dsl_dataset_t *ds)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ if (dsl_dataset_has_resume_receive_state(ds)) {
+ char *str;
+ void *packed;
+ uint8_t *compressed;
+ uint64_t val;
+ nvlist_t *token_nv = fnvlist_alloc();
+ size_t packed_size, compressed_size;
+
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "fromguid", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "object", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "offset", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "bytes", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "toguid", val);
+ }
+ char buf[MAXNAMELEN];
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
+ fnvlist_add_string(token_nv, "toname", buf);
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_LARGEBLOCK) == 0) {
+ fnvlist_add_boolean(token_nv, "largeblockok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_EMBEDOK) == 0) {
+ fnvlist_add_boolean(token_nv, "embedok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_COMPRESSOK) == 0) {
+ fnvlist_add_boolean(token_nv, "compressok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_RAWOK) == 0) {
+ fnvlist_add_boolean(token_nv, "rawok");
+ }
+ if (dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_REDACTED_DATASETS)) {
+ uint64_t num_redact_snaps;
+ uint64_t *redact_snaps;
+ VERIFY(dsl_dataset_get_uint64_array_feature(ds,
+ SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps,
+ &redact_snaps));
+ fnvlist_add_uint64_array(token_nv, "redact_snaps",
+ redact_snaps, num_redact_snaps);
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) {
+ uint64_t num_redact_snaps, int_size;
+ uint64_t *redact_snaps;
+ VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size,
+ &num_redact_snaps));
+ ASSERT3U(int_size, ==, sizeof (uint64_t));
+
+ redact_snaps = kmem_alloc(int_size * num_redact_snaps,
+ KM_SLEEP);
+ VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size,
+ num_redact_snaps, redact_snaps));
+ fnvlist_add_uint64_array(token_nv, "book_redact_snaps",
+ redact_snaps, num_redact_snaps);
+ kmem_free(redact_snaps, int_size * num_redact_snaps);
+ }
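+ /*
+ * Build the token string: pack the nvlist, gzip-compress it, checksum
+ * the compressed payload, hex-encode it, and format the result as
+ * "<version>-<checksum word>-<packed size>-<hex payload>".
+ */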
+ packed = fnvlist_pack(token_nv, &packed_size);
+ fnvlist_free(token_nv);
+ compressed = kmem_alloc(packed_size, KM_SLEEP);
+
+ compressed_size = gzip_compress(packed, compressed,
+ packed_size, packed_size, 6);
+
+ zio_cksum_t cksum;
+ fletcher_4_native_varsize(compressed, compressed_size, &cksum);
+
+ size_t alloc_size = compressed_size * 2 + 1;
+ str = kmem_alloc(alloc_size, KM_SLEEP);
+ for (int i = 0; i < compressed_size; i++) {
+ size_t offset = i * 2;
+ (void) snprintf(str + offset, alloc_size - offset,
+ "%02x", compressed[i]);
+ }
+ str[compressed_size * 2] = '\0';
+ char *propval = kmem_asprintf("%u-%llx-%llx-%s",
+ ZFS_SEND_RESUME_TOKEN_VERSION,
+ (longlong_t)cksum.zc_word[0],
+ (longlong_t)packed_size, str);
+ kmem_free(packed, packed_size);
+ kmem_free(str, alloc_size);
+ kmem_free(compressed, packed_size);
+ return (propval);
+ }
+ return (kmem_strdup(""));
+}
+
+/*
+ * Returns a string that represents the receive resume stats token of the
+ * dataset's child. It should be freed with kmem_strfree().
+ */
+char *
+get_child_receive_stats(dsl_dataset_t *ds)
+{
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+ dsl_dataset_t *recv_ds;
+ dsl_dataset_name(ds, recvname);
+ if (strlcat(recvname, "/", sizeof (recvname)) <
+ sizeof (recvname) &&
+ strlcat(recvname, recv_clone_name, sizeof (recvname)) <
+ sizeof (recvname) &&
+ dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG,
+ &recv_ds) == 0) {
+ char *propval = get_receive_resume_stats_impl(recv_ds);
+ dsl_dataset_rele(recv_ds, FTAG);
+ return (propval);
+ }
+ return (kmem_strdup(""));
+}
+
+static void
+get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ char *propval = get_receive_resume_stats_impl(ds);
+ if (strcmp(propval, "") != 0) {
+ dsl_prop_nvlist_add_string(nv,
+ ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
+ } else {
+ char *childval = get_child_receive_stats(ds);
+ if (strcmp(childval, "") != 0) {
+ dsl_prop_nvlist_add_string(nv,
+ ZFS_PROP_RECEIVE_RESUME_TOKEN, childval);
+ }
+ kmem_strfree(childval);
+ }
+ kmem_strfree(propval);
+}
+
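+/*
+ * refratio is the ratio of uncompressed to compressed referenced bytes,
+ * expressed as a percentage (100 means 1.00x).
+ */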
+uint64_t
+dsl_get_refratio(dsl_dataset_t *ds)
+{
+ uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
+ (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
+ dsl_dataset_phys(ds)->ds_compressed_bytes);
+ return (ratio);
+}
+
+uint64_t
+dsl_get_logicalreferenced(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_uncompressed_bytes);
+}
+
+uint64_t
+dsl_get_compressratio(dsl_dataset_t *ds)
+{
+ if (ds->ds_is_snapshot) {
+ return (dsl_get_refratio(ds));
+ } else {
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_lock);
+ uint64_t val = dsl_dir_get_compressratio(dd);
+ mutex_exit(&dd->dd_lock);
+ return (val);
+ }
+}
+
+uint64_t
+dsl_get_used(dsl_dataset_t *ds)
+{
+ if (ds->ds_is_snapshot) {
+ return (dsl_dataset_phys(ds)->ds_unique_bytes);
+ } else {
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_lock);
+ uint64_t val = dsl_dir_get_used(dd);
+ mutex_exit(&dd->dd_lock);
+ return (val);
+ }
+}
+
+uint64_t
+dsl_get_creation(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_creation_time);
+}
+
+uint64_t
+dsl_get_creationtxg(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_creation_txg);
+}
+
+uint64_t
+dsl_get_refquota(dsl_dataset_t *ds)
+{
+ return (ds->ds_quota);
+}
+
+uint64_t
+dsl_get_refreservation(dsl_dataset_t *ds)
+{
+ return (ds->ds_reserved);
+}
+
+uint64_t
+dsl_get_guid(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_guid);
+}
+
+uint64_t
+dsl_get_unique(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_unique_bytes);
+}
+
+uint64_t
+dsl_get_objsetid(dsl_dataset_t *ds)
+{
+ return (ds->ds_object);
+}
+
+uint64_t
+dsl_get_userrefs(dsl_dataset_t *ds)
+{
+ return (ds->ds_userrefs);
+}
+
+uint64_t
+dsl_get_defer_destroy(dsl_dataset_t *ds)
+{
+ return (DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
+}
+
+uint64_t
+dsl_get_referenced(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_referenced_bytes);
+}
+
+uint64_t
+dsl_get_numclones(dsl_dataset_t *ds)
+{
+ ASSERT(ds->ds_is_snapshot);
+ return (dsl_dataset_phys(ds)->ds_num_children - 1);
+}
+
+uint64_t
+dsl_get_inconsistent(dsl_dataset_t *ds)
+{
+ return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ?
+ 1 : 0);
+}
+
+uint64_t
+dsl_get_redacted(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_REDACTED_DATASETS));
+}
+
+uint64_t
+dsl_get_available(dsl_dataset_t *ds)
+{
+ uint64_t refdbytes = dsl_get_referenced(ds);
+ uint64_t availbytes = dsl_dir_space_available(ds->ds_dir,
+ NULL, 0, TRUE);
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
+ availbytes +=
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
+ }
+ if (ds->ds_quota != 0) {
+ /*
+ * Adjust available bytes according to refquota
+ */
+ if (refdbytes < ds->ds_quota) {
+ availbytes = MIN(availbytes,
+ ds->ds_quota - refdbytes);
+ } else {
+ availbytes = 0;
+ }
+ }
+ return (availbytes);
+}
+
+int
+dsl_get_written(dsl_dataset_t *ds, uint64_t *written)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_dataset_t *prev;
+ int err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+ if (err == 0) {
+ uint64_t comp, uncomp;
+ err = dsl_dataset_space_written(prev, ds, written,
+ &comp, &uncomp);
+ dsl_dataset_rele(prev, FTAG);
+ }
+ return (err);
+}
+
+/*
+ * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN.
+ */
+int
+dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
+ dsl_dataset_name(ds->ds_prev, snap);
+ return (0);
+ } else {
+ return (SET_ERROR(ENOENT));
+ }
+}
+
+void
+dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval)
+{
+ uint64_t nsnaps;
+ uint64_t *snaps;
+ if (dsl_dataset_get_uint64_array_feature(ds,
+ SPA_FEATURE_REDACTED_DATASETS, &nsnaps, &snaps)) {
+ fnvlist_add_uint64_array(propval, ZPROP_VALUE, snaps,
+ nsnaps);
+ }
+}
+
+/*
+ * Returns the mountpoint property and source for the given dataset in the
+ * value and source buffers. The value buffer must be at least as large as
+ * MAXPATHLEN and the source buffer at least as large as
+ * ZFS_MAX_DATASET_NAME_LEN.
+ * Returns 0 on success and an error on failure.
+ */
+int
+dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
+ char *source)
+{
+ int error;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /* Retrieve the mountpoint value stored in the zap object */
+ error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
+ ZAP_MAXVALUELEN, value, source);
+ if (error != 0) {
+ return (error);
+ }
+
+ /*
+ * Process the dsname and source to find the full mountpoint string.
+ * Can be skipped for 'legacy' or 'none'.
+ */
+ if (value[0] == '/') {
+ char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ char *root = buf;
+ const char *relpath;
+
+ /*
+ * If we inherit the mountpoint, even from a dataset
+ * with a received value, the source will be the path of
+ * the dataset we inherit from. If source is
+ * ZPROP_SOURCE_VAL_RECVD, the received value is not
+ * inherited.
+ */
+ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
+ relpath = "";
+ } else {
+ ASSERT0(strncmp(dsname, source, strlen(source)));
+ relpath = dsname + strlen(source);
+ if (relpath[0] == '/')
+ relpath++;
+ }
+
+ spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN);
+
+ /*
+ * Special case an alternate root of '/'. This will
+ * avoid having multiple leading slashes in the
+ * mountpoint path.
+ */
+ if (strcmp(root, "/") == 0)
+ root++;
+
+ /*
+ * If the mountpoint is '/' then skip over this
+ * if we are obtaining either an alternate root or
+ * an inherited mountpoint.
+ */
+ char *mnt = value;
+ if (value[1] == '\0' && (root[0] != '\0' ||
+ relpath[0] != '\0'))
+ mnt = value + 1;
+
+ if (relpath[0] == '\0') {
+ (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",
+ root, mnt);
+ } else {
+ (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s",
+ root, mnt, relpath[0] == '@' ? "" : "/",
+ relpath);
+ }
+ kmem_free(buf, ZAP_MAXVALUELEN);
+ }
+
+ return (0);
+}
+
+void
+dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO,
+ dsl_get_refratio(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
+ dsl_get_logicalreferenced(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
+ dsl_get_compressratio(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
+ dsl_get_used(ds));
+
+ if (ds->ds_is_snapshot) {
+ get_clones_stat(ds, nv);
+ } else {
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ if (dsl_get_prev_snap(ds, buf) == 0)
+ dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP,
+ buf);
+ dsl_dir_stats(ds->ds_dir, nv);
+ }
+
+ nvlist_t *propval = fnvlist_alloc();
+ dsl_get_redact_snaps(ds, propval);
+ fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS),
+ propval);
+ nvlist_free(propval);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
+ dsl_get_available(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
+ dsl_get_referenced(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
+ dsl_get_creation(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
+ dsl_get_creationtxg(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
+ dsl_get_refquota(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
+ dsl_get_refreservation(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
+ dsl_get_guid(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
+ dsl_get_unique(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
+ dsl_get_objsetid(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
+ dsl_get_userrefs(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
+ dsl_get_defer_destroy(ds));
+ dsl_dataset_crypt_stats(ds, nv);
+
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ uint64_t written;
+ if (dsl_get_written(ds, &written) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
+ written);
+ }
+ }
+
+ if (!dsl_dataset_is_snapshot(ds)) {
+ /*
+ * A failed "newfs" (e.g. full) resumable receive leaves
+ * the stats set on this dataset. Check here for the prop.
+ */
+ get_receive_resume_stats(ds, nv);
+
+ /*
+ * A failed incremental resumable receive leaves the
+ * stats set on our child named "%recv". Check the child
+ * for the prop.
+ */
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+ dsl_dataset_t *recv_ds;
+ dsl_dataset_name(ds, recvname);
+ if (strlcat(recvname, "/", sizeof (recvname)) <
+ sizeof (recvname) &&
+ strlcat(recvname, recv_clone_name, sizeof (recvname)) <
+ sizeof (recvname) &&
+ dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
+ get_receive_resume_stats(recv_ds, nv);
+ dsl_dataset_rele(recv_ds, FTAG);
+ }
+ }
+}
+
+void
+dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
+{
+ dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;
+ ASSERT(dsl_pool_config_held(dp));
+
+ stat->dds_creation_txg = dsl_get_creationtxg(ds);
+ stat->dds_inconsistent = dsl_get_inconsistent(ds);
+ stat->dds_guid = dsl_get_guid(ds);
+ stat->dds_redacted = dsl_get_redacted(ds);
+ stat->dds_origin[0] = '\0';
+ if (ds->ds_is_snapshot) {
+ stat->dds_is_snapshot = B_TRUE;
+ stat->dds_num_clones = dsl_get_numclones(ds);
+ } else {
+ stat->dds_is_snapshot = B_FALSE;
+ stat->dds_num_clones = 0;
+
+ if (dsl_dir_is_clone(ds->ds_dir)) {
+ dsl_dir_get_origin(ds->ds_dir, stat->dds_origin);
+ }
+ }
+}
+
+uint64_t
+dsl_dataset_fsid_guid(dsl_dataset_t *ds)
+{
+ return (ds->ds_fsid_guid);
+}
+
+void
+dsl_dataset_space(dsl_dataset_t *ds,
+ uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+ *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
+ *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
+ *availbytesp +=
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
+ if (ds->ds_quota != 0) {
+ /*
+ * Adjust available bytes according to refquota
+ */
+ if (*refdbytesp < ds->ds_quota)
+ *availbytesp = MIN(*availbytesp,
+ ds->ds_quota - *refdbytesp);
+ else
+ *availbytesp = 0;
+ }
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ *availobjsp = DN_MAX_OBJECT - *usedobjsp;
+}
+
+boolean_t
+dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
+{
+ dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;
+ uint64_t birth;
+
+ ASSERT(dsl_pool_config_held(dp));
+ if (snap == NULL)
+ return (B_FALSE);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ birth = dsl_dataset_get_blkptr(ds)->blk_birth;
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
+ objset_t *os, *os_snap;
+ /*
+ * It may be that only the ZIL differs, because it was
+ * reset in the head. Don't count that as being
+ * modified.
+ */
+ if (dmu_objset_from_ds(ds, &os) != 0)
+ return (B_TRUE);
+ if (dmu_objset_from_ds(snap, &os_snap) != 0)
+ return (B_TRUE);
+ return (bcmp(&os->os_phys->os_meta_dnode,
+ &os_snap->os_phys->os_meta_dnode,
+ sizeof (os->os_phys->os_meta_dnode)) != 0);
+ }
+ return (B_FALSE);
+}
+
+typedef struct dsl_dataset_rename_snapshot_arg {
+ const char *ddrsa_fsname;
+ const char *ddrsa_oldsnapname;
+ const char *ddrsa_newsnapname;
+ boolean_t ddrsa_recursive;
+ dmu_tx_t *ddrsa_tx;
+} dsl_dataset_rename_snapshot_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
+ dsl_dataset_t *hds, void *arg)
+{
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ int error;
+ uint64_t val;
+
+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+ if (error != 0) {
+ /* ignore nonexistent snapshots */
+ return (error == ENOENT ? 0 : error);
+ }
+
+ /* new name should not exist */
+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
+ if (error == 0)
+ error = SET_ERROR(EEXIST);
+ else if (error == ENOENT)
+ error = 0;
+
+ /* dataset name + 1 for the "@" + the new snapshot name must fit */
+ if (dsl_dir_namelen(hds->ds_dir) + 1 +
+ strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
+ error = SET_ERROR(ENAMETOOLONG);
+
+ return (error);
+}
+
+static int
+dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
+ if (error != 0)
+ return (error);
+
+ if (ddrsa->ddrsa_recursive) {
+ error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+ dsl_dataset_rename_snapshot_check_impl, ddrsa,
+ DS_FIND_CHILDREN);
+ } else {
+ error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
+ }
+ dsl_dataset_rele(hds, FTAG);
+ return (error);
+}
+
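+/*
+ * Rename a single snapshot: remove the old name from the head's snapnames
+ * ZAP, update the in-core ds_snapname, insert the new name, and rename any
+ * corresponding zvol minors.
+ */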
+static int
+dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
+ dsl_dataset_t *hds, void *arg)
+{
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ dsl_dataset_t *ds;
+ uint64_t val;
+ dmu_tx_t *tx = ddrsa->ddrsa_tx;
+ int error;
+
+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+ ASSERT(error == 0 || error == ENOENT);
+ if (error == ENOENT) {
+ /* ignore nonexistent snapshots */
+ return (0);
+ }
+
+ VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
+
+ /* log before we change the name */
+ spa_history_log_internal_ds(ds, "rename", tx,
+ "-> @%s", ddrsa->ddrsa_newsnapname);
+
+ VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
+ B_FALSE));
+ mutex_enter(&ds->ds_lock);
+ (void) strlcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname,
+ sizeof (ds->ds_snapname));
+ mutex_exit(&ds->ds_lock);
+ VERIFY0(zap_add(dp->dp_meta_objset,
+ dsl_dataset_phys(hds)->ds_snapnames_zapobj,
+ ds->ds_snapname, 8, 1, &ds->ds_object, tx));
+ zvol_rename_minors(dp->dp_spa, ddrsa->ddrsa_oldsnapname,
+ ddrsa->ddrsa_newsnapname, B_TRUE);
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds = NULL;
+
+ VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
+ ddrsa->ddrsa_tx = tx;
+ if (ddrsa->ddrsa_recursive) {
+ VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+ dsl_dataset_rename_snapshot_sync_impl, ddrsa,
+ DS_FIND_CHILDREN));
+ } else {
+ VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
+ }
+ dsl_dataset_rele(hds, FTAG);
+}
+
+int
+dsl_dataset_rename_snapshot(const char *fsname,
+ const char *oldsnapname, const char *newsnapname, boolean_t recursive)
+{
+ dsl_dataset_rename_snapshot_arg_t ddrsa;
+
+ ddrsa.ddrsa_fsname = fsname;
+ ddrsa.ddrsa_oldsnapname = oldsnapname;
+ ddrsa.ddrsa_newsnapname = newsnapname;
+ ddrsa.ddrsa_recursive = recursive;
+
+ return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
+ dsl_dataset_rename_snapshot_sync, &ddrsa,
+ 1, ZFS_SPACE_CHECK_RESERVED));
+}
+
+/*
+ * If we're doing an ownership handoff, we need to make sure that there is
+ * only one long hold on the dataset. We're not allowed to change anything here
+ * so we don't permanently release the long hold or regular hold here. We want
+ * to do this only when syncing to avoid the dataset unexpectedly going away
+ * when we release the long hold.
+ */
+static int
+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
+{
+ boolean_t held = B_FALSE;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_activity_lock);
+ uint64_t holds = zfs_refcount_count(&ds->ds_longholds) -
+ (owner != NULL ? 1 : 0);
+ /*
+ * The value of dd_activity_waiters can change as soon as we drop the
+ * lock, but we're fine with that; new waiters coming in or old
+ * waiters leaving doesn't cause problems, since we're going to cancel
+ * waiters later anyway. The goal of this check is to verify that no
+ * non-waiters have long-holds, and all new long-holds will be
+ * prevented because we're holding the pool config as writer.
+ */
+ if (holds != dd->dd_activity_waiters)
+ held = B_TRUE;
+ mutex_exit(&dd->dd_activity_lock);
+
+ if (held)
+ return (SET_ERROR(EBUSY));
+
+ return (0);
+}
+
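+/*
+ * Validate a rollback request: the target must be a filesystem or volume
+ * with at least one snapshot, the optional ddra_tosnap must name its latest
+ * snapshot, no bookmarks may be newer than that snapshot, and there must be
+ * no conflicting long holds and enough space for the temporary clone swap.
+ */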
+int
+dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rollback_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int64_t unused_refres_delta;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ /* must not be a snapshot */
+ if (ds->ds_is_snapshot) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* must have a most recent snapshot */
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ESRCH));
+ }
+
+ /*
+ * No rollback to a snapshot created in the current txg, because
+ * the rollback may dirty the dataset and create blocks that are
+ * not reachable from the rootbp while having a birth txg that
+ * falls into the snapshot's range.
+ */
+ if (dmu_tx_is_syncing(tx) &&
+ dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EAGAIN));
+ }
+
+ /*
+ * If the expected target snapshot is specified, then check that
+ * the latest snapshot is it.
+ */
+ if (ddra->ddra_tosnap != NULL) {
+ dsl_dataset_t *snapds;
+
+ /* Check if the target snapshot exists at all. */
+ error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds);
+ if (error != 0) {
+ /*
+ * ESRCH is used to signal that the target snapshot does
+ * not exist, while ENOENT is used to report that
+ * the rolled back dataset does not exist.
+ * ESRCH is also used to cover other cases where the
+ * target snapshot is not related to the dataset being
+ * rolled back such as being in a different pool.
+ */
+ if (error == ENOENT || error == EXDEV)
+ error = SET_ERROR(ESRCH);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ ASSERT(snapds->ds_is_snapshot);
+
+ /* Check if the snapshot is the latest snapshot indeed. */
+ if (snapds != ds->ds_prev) {
+ /*
+ * Distinguish between the case where the only problem
+ * is intervening snapshots (EEXIST) vs the snapshot
+ * not being a valid target for rollback (ESRCH).
+ */
+ if (snapds->ds_dir == ds->ds_dir ||
+ (dsl_dir_is_clone(ds->ds_dir) &&
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj ==
+ snapds->ds_object)) {
+ error = SET_ERROR(EEXIST);
+ } else {
+ error = SET_ERROR(ESRCH);
+ }
+ dsl_dataset_rele(snapds, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ dsl_dataset_rele(snapds, FTAG);
+ }
+
+ /* must not have any bookmarks after the most recent snapshot */
+ if (dsl_bookmark_latest_txg(ds) >
+ dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ /*
+ * Check if the snap we are rolling back to uses more than
+ * the refquota.
+ */
+ if (ds->ds_quota != 0 &&
+ dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ /*
+ * When we do the clone swap, we will temporarily use more space
+ * due to the refreservation (the head will no longer have any
+ * unique space, so the entire amount of the refreservation will need
+ * to be free). We will immediately destroy the clone, freeing
+ * this space, but the freeing happens over many txg's.
+ */
+ unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
+ dsl_dataset_phys(ds)->ds_unique_bytes);
+
+ if (unused_refres_delta > 0 &&
+ unused_refres_delta >
+ dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
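+/*
+ * Perform the rollback: clone the latest snapshot as "%rollback", swap the
+ * clone's contents with the head dataset, zero the head's ZIL, and destroy
+ * the temporary clone.
+ */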
+void
+dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rollback_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds, *clone;
+ uint64_t cloneobj;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
+
+ dsl_dataset_name(ds->ds_prev, namebuf);
+ fnvlist_add_string(ddra->ddra_result, "target", namebuf);
+
+ cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
+ ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, NULL, tx);
+
+ VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
+
+ dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
+ dsl_dataset_zero_zil(ds, tx);
+
+ dsl_destroy_head_sync_impl(clone, tx);
+
+ dsl_dataset_rele(clone, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Rolls back the given filesystem or volume to the most recent snapshot.
+ * The name of the most recent snapshot will be returned under key "target"
+ * in the result nvlist.
+ *
+ * If owner != NULL:
+ * - The existing dataset MUST be owned by the specified owner at entry
+ * - Upon return, dataset will still be held by the same owner, whether we
+ * succeed or not.
+ *
+ * This mode is required any time the existing filesystem is mounted. See
+ * notes above zfs_suspend_fs() for further details.
+ */
+int
+dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
+ nvlist_t *result)
+{
+ dsl_dataset_rollback_arg_t ddra;
+
+ ddra.ddra_fsname = fsname;
+ ddra.ddra_tosnap = tosnap;
+ ddra.ddra_owner = owner;
+ ddra.ddra_result = result;
+
+ return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
+ dsl_dataset_rollback_sync, &ddra,
+ 1, ZFS_SPACE_CHECK_RESERVED));
+}
+
+struct promotenode {
+ list_node_t link;
+ dsl_dataset_t *ds;
+};
+
+static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
+ void *tag);
+static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
+
+int
+dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_promote_arg_t *ddpa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds;
+ struct promotenode *snap;
+ dsl_dataset_t *origin_ds, *origin_head;
+ int err;
+ uint64_t unused;
+ uint64_t ss_mv_cnt;
+ size_t max_snap_len;
+ boolean_t conflicting_snaps;
+
+ err = promote_hold(ddpa, dp, FTAG);
+ if (err != 0)
+ return (err);
+
+ hds = ddpa->ddpa_clone;
+ max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
+
+ if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
+ promote_rele(ddpa, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ snap = list_head(&ddpa->shared_snaps);
+ if (snap == NULL) {
+ err = SET_ERROR(ENOENT);
+ goto out;
+ }
+ origin_head = snap->ds;
+ origin_ds = snap->ds;
+
+ /*
+ * Encrypted clones share a DSL Crypto Key with their origin's dsl dir.
+ * When doing a promote we must make sure the encryption root for
+ * both the target and the target's origin does not change to avoid
+ * needing to rewrap encryption keys
+ */
+ err = dsl_dataset_promote_crypt_check(hds->ds_dir, origin_ds->ds_dir);
+ if (err != 0)
+ goto out;
+
+ /*
+ * Compute and check the amount of space to transfer. Since this is
+ * so expensive, don't do the preliminary check.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ promote_rele(ddpa, FTAG);
+ return (0);
+ }
+
+ /* compute origin's new unique space */
+ snap = list_tail(&ddpa->clone_snaps);
+ ASSERT(snap != NULL);
+ ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+ origin_ds->ds_object);
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
+ &ddpa->unique, &unused, &unused);
+
+ /*
+ * Walk the snapshots that we are moving
+ *
+ * Compute space to transfer. Consider the incremental changes
+ * to used by each snapshot:
+ * (my used) = (prev's used) + (blocks born) - (blocks killed)
+ * So each snapshot gave birth to:
+ * (blocks born) = (my used) - (prev's used) + (blocks killed)
+ * So a sequence would look like:
+ * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
+ * Which simplifies to:
+ * uN + kN + kN-1 + ... + k1 + k0
+ * Note however, if we stop before we reach the ORIGIN we get:
+ * uN + kN + kN-1 + ... + kM - uM-1
+ */
+ conflicting_snaps = B_FALSE;
+ ss_mv_cnt = 0;
+ ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
+ ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
+ ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
+ for (snap = list_head(&ddpa->shared_snaps); snap;
+ snap = list_next(&ddpa->shared_snaps, snap)) {
+ uint64_t val, dlused, dlcomp, dluncomp;
+ dsl_dataset_t *ds = snap->ds;
+
+ ss_mv_cnt++;
+
+ /*
+ * If there are long holds, we won't be able to evict
+ * the objset.
+ */
+ if (dsl_dataset_long_held(ds)) {
+ err = SET_ERROR(EBUSY);
+ goto out;
+ }
+
+ /* Check that the snapshot name does not conflict */
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ if (strlen(ds->ds_snapname) >= max_snap_len) {
+ err = SET_ERROR(ENAMETOOLONG);
+ goto out;
+ }
+ err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
+ if (err == 0) {
+ fnvlist_add_boolean(ddpa->err_ds,
+ snap->ds->ds_snapname);
+ conflicting_snaps = B_TRUE;
+ } else if (err != ENOENT) {
+ goto out;
+ }
+
+ /* The very first snapshot does not have a deadlist */
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
+ continue;
+
+ dsl_deadlist_space(&ds->ds_deadlist,
+ &dlused, &dlcomp, &dluncomp);
+ ddpa->used += dlused;
+ ddpa->comp += dlcomp;
+ ddpa->uncomp += dluncomp;
+ }
+
+ /*
+ * Check that bookmarks that are being transferred don't have
+ * name conflicts.
+ */
+ for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks);
+ dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
+ dsl_dataset_phys(origin_ds)->ds_creation_txg;
+ dbn = AVL_NEXT(&origin_head->ds_bookmarks, dbn)) {
+ if (strlen(dbn->dbn_name) >= max_snap_len) {
+ err = SET_ERROR(ENAMETOOLONG);
+ goto out;
+ }
+ zfs_bookmark_phys_t bm;
+ err = dsl_bookmark_lookup_impl(ddpa->ddpa_clone,
+ dbn->dbn_name, &bm);
+
+ if (err == 0) {
+ fnvlist_add_boolean(ddpa->err_ds, dbn->dbn_name);
+ conflicting_snaps = B_TRUE;
+ } else if (err == ESRCH) {
+ err = 0;
+ } else if (err != 0) {
+ goto out;
+ }
+ }
+
+ /*
+ * In order to return the full list of conflicting snapshots, we check
+ * whether there was a conflict after traversing all of them.
+ */
+ if (conflicting_snaps) {
+ err = SET_ERROR(EEXIST);
+ goto out;
+ }
+
+ /*
+ * If we are a clone of a clone then we never reached ORIGIN,
+ * so we need to subtract out the clone origin's used space.
+ */
+ if (ddpa->origin_origin) {
+ ddpa->used -=
+ dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
+ ddpa->comp -=
+ dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
+ ddpa->uncomp -=
+ dsl_dataset_phys(ddpa->origin_origin)->
+ ds_uncompressed_bytes;
+ }
+
+ /* Check that there is enough space and limit headroom here */
+ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
+ 0, ss_mv_cnt, ddpa->used, ddpa->cr, ddpa->proc);
+ if (err != 0)
+ goto out;
+
+ /*
+ * Compute the amounts of space that will be used by snapshots
+ * after the promotion (for both origin and clone). For each,
+ * it is the amount of space that will be on all of their
+ * deadlists (that was not born before their new origin).
+ */
+ if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ uint64_t space;
+
+ /*
+ * Note, typically this will not be a clone of a clone,
+ * so dd_origin_txg will be < TXG_INITIAL, so
+ * these snaplist_space() -> dsl_deadlist_space_range()
+ * calls will be fast because they do not have to
+ * iterate over all bps.
+ */
+ snap = list_head(&ddpa->origin_snaps);
+ if (snap == NULL) {
+ err = SET_ERROR(ENOENT);
+ goto out;
+ }
+ err = snaplist_space(&ddpa->shared_snaps,
+ snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
+ if (err != 0)
+ goto out;
+
+ err = snaplist_space(&ddpa->clone_snaps,
+ snap->ds->ds_dir->dd_origin_txg, &space);
+ if (err != 0)
+ goto out;
+ ddpa->cloneusedsnap += space;
+ }
+ if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
+ DD_FLAG_USED_BREAKDOWN) {
+ err = snaplist_space(&ddpa->origin_snaps,
+ dsl_dataset_phys(origin_ds)->ds_creation_txg,
+ &ddpa->originusedsnap);
+ if (err != 0)
+ goto out;
+ }
+
+out:
+ promote_rele(ddpa, FTAG);
+ return (err);
+}
+
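+/*
+ * Sync task for promotion: re-parent the shared snapshots and bookmarks from
+ * the origin's dsl_dir to the clone's, swap the origin linkage between the
+ * two dsl_dirs, and adjust the space accounting on both sides.
+ */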
+void
+dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_promote_arg_t *ddpa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds;
+ struct promotenode *snap;
+ dsl_dataset_t *origin_ds;
+ dsl_dataset_t *origin_head;
+ dsl_dir_t *dd;
+ dsl_dir_t *odd = NULL;
+ uint64_t oldnext_obj;
+ int64_t delta;
+
+ ASSERT(nvlist_empty(ddpa->err_ds));
+
+ VERIFY0(promote_hold(ddpa, dp, FTAG));
+ hds = ddpa->ddpa_clone;
+
+ ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
+
+ snap = list_head(&ddpa->shared_snaps);
+ origin_ds = snap->ds;
+ dd = hds->ds_dir;
+
+ snap = list_head(&ddpa->origin_snaps);
+ origin_head = snap->ds;
+
+ /*
+ * We need to explicitly open odd, since origin_ds's dd will be
+ * changing.
+ */
+ VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
+ NULL, FTAG, &odd));
+
+ dsl_dataset_promote_crypt_sync(hds->ds_dir, odd, tx);
+
+ /* change origin's next snap */
+ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
+ oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
+ snap = list_tail(&ddpa->clone_snaps);
+ ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+ origin_ds->ds_object);
+ dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
+
+ /* change the origin's next clone */
+ if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
+ dsl_dataset_remove_from_next_clones(origin_ds,
+ snap->ds->ds_object, tx);
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
+ oldnext_obj, tx));
+ }
+
+ /* change origin */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
+ dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
+ dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
+ dmu_buf_will_dirty(odd->dd_dbuf, tx);
+ dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
+ origin_head->ds_dir->dd_origin_txg =
+ dsl_dataset_phys(origin_ds)->ds_creation_txg;
+
+ /* change dd_clone entries */
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY0(zap_remove_int(dp->dp_meta_objset,
+ dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
+ hds->ds_object, tx));
+
+ VERIFY0(zap_remove_int(dp->dp_meta_objset,
+ dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
+ origin_head->ds_object, tx));
+ if (dsl_dir_phys(dd)->dd_clones == 0) {
+ dsl_dir_phys(dd)->dd_clones =
+ zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
+ DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
+ }
+
+ /*
+ * Move bookmarks to this dir.
+ */
+ dsl_bookmark_node_t *dbn_next;
+ for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks);
+ dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
+ dsl_dataset_phys(origin_ds)->ds_creation_txg;
+ dbn = dbn_next) {
+ dbn_next = AVL_NEXT(&origin_head->ds_bookmarks, dbn);
+
+ avl_remove(&origin_head->ds_bookmarks, dbn);
+ VERIFY0(zap_remove(dp->dp_meta_objset,
+ origin_head->ds_bookmarks_obj, dbn->dbn_name, tx));
+
+ dsl_bookmark_node_add(hds, dbn, tx);
+ }
+
+ dsl_bookmark_next_changed(hds, origin_ds, tx);
+
+ /* move snapshots to this dir */
+ for (snap = list_head(&ddpa->shared_snaps); snap;
+ snap = list_next(&ddpa->shared_snaps, snap)) {
+ dsl_dataset_t *ds = snap->ds;
+
+ /*
+ * Property callbacks are registered to a particular
+ * dsl_dir. Since ours is changing, evict the objset
+ * so that they will be unregistered from the old dsl_dir.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
+ /* move snap name entry */
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ VERIFY0(dsl_dataset_snap_remove(origin_head,
+ ds->ds_snapname, tx, B_TRUE));
+ VERIFY0(zap_add(dp->dp_meta_objset,
+ dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
+ 8, 1, &ds->ds_object, tx));
+ dsl_fs_ss_count_adjust(hds->ds_dir, 1,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ /* change containing dsl_dir */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
+ dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
+ ASSERT3P(ds->ds_dir, ==, odd);
+ dsl_dir_rele(ds->ds_dir, ds);
+ VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
+ NULL, ds, &ds->ds_dir));
+
+ /* move any clone references */
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
+ spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *cnds;
+ uint64_t o;
+
+ if (za.za_first_integer == oldnext_obj) {
+ /*
+ * We've already moved the
+ * origin's reference.
+ */
+ continue;
+ }
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &cnds));
+ o = dsl_dir_phys(cnds->ds_dir)->
+ dd_head_dataset_obj;
+
+ VERIFY0(zap_remove_int(dp->dp_meta_objset,
+ dsl_dir_phys(odd)->dd_clones, o, tx));
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_clones, o, tx));
+ dsl_dataset_rele(cnds, FTAG);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ ASSERT(!dsl_prop_hascb(ds));
+ }
+
+ /*
+ * Change space accounting.
+ * Note, ddpa->*usedsnap and dd_used_breakdown[SNAP] will either
+ * both be valid, or both be 0 (resulting in delta == 0). This
+ * is true for each of {clone,origin} independently.
+ */
+
+ delta = ddpa->cloneusedsnap -
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
+ ASSERT3S(delta, >=, 0);
+ ASSERT3U(ddpa->used, >=, delta);
+ dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
+ dsl_dir_diduse_space(dd, DD_USED_HEAD,
+ ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
+
+ delta = ddpa->originusedsnap -
+ dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
+ ASSERT3S(delta, <=, 0);
+ ASSERT3U(ddpa->used, >=, -delta);
+ dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
+ dsl_dir_diduse_space(odd, DD_USED_HEAD,
+ -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
+
+ dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
+
+ /*
+ * Since livelists are specific to a clone's origin txg, they
+ * are no longer accurate. Destroy the livelist from the clone being
+ * promoted. If the origin dataset is a clone, destroy its livelist
+ * as well.
+ */
+ dsl_dir_remove_livelist(dd, tx, B_TRUE);
+ dsl_dir_remove_livelist(odd, tx, B_TRUE);
+
+ /* log history record */
+ spa_history_log_internal_ds(hds, "promote", tx, " ");
+
+ dsl_dir_rele(odd, FTAG);
+ promote_rele(ddpa, FTAG);
+}
+
+/*
+ * Make a list of dsl_dataset_t's for the snapshots between first_obj
+ * (exclusive) and last_obj (inclusive). The list will be in reverse
+ * order (last_obj will be the list_head()). If first_obj == 0, do all
+ * snapshots back to this dataset's origin.
+ */
+static int
+snaplist_make(dsl_pool_t *dp,
+ uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
+{
+ uint64_t obj = last_obj;
+
+ list_create(l, sizeof (struct promotenode),
+ offsetof(struct promotenode, link));
+
+ while (obj != first_obj) {
+ dsl_dataset_t *ds;
+ struct promotenode *snap;
+ int err;
+
+ err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
+ ASSERT(err != ENOENT);
+ if (err != 0)
+ return (err);
+
+ if (first_obj == 0)
+ first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
+
+ snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
+ snap->ds = ds;
+ list_insert_tail(l, snap);
+ obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ }
+
+ return (0);
+}
+
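+/*
+ * Sum the deadlist space born at or after 'mintxg' across every snapshot in
+ * the list.
+ */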
+static int
+snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
+{
+ struct promotenode *snap;
+
+ *spacep = 0;
+ for (snap = list_head(l); snap; snap = list_next(l, snap)) {
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ mintxg, UINT64_MAX, &used, &comp, &uncomp);
+ *spacep += used;
+ }
+ return (0);
+}
+
+static void
+snaplist_destroy(list_t *l, void *tag)
+{
+ struct promotenode *snap;
+
+ if (l == NULL || !list_link_active(&l->list_head))
+ return;
+
+ while ((snap = list_tail(l)) != NULL) {
+ list_remove(l, snap);
+ dsl_dataset_rele(snap->ds, tag);
+ kmem_free(snap, sizeof (*snap));
+ }
+ list_destroy(l);
+}
+
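+/*
+ * Hold the clone being promoted and build the shared, clone, and origin
+ * snapshot lists (plus the origin's origin, if any) used by the promote
+ * check and sync tasks.
+ */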
+static int
+promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
+{
+ int error;
+ dsl_dir_t *dd;
+ struct promotenode *snap;
+
+ error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
+ &ddpa->ddpa_clone);
+ if (error != 0)
+ return (error);
+ dd = ddpa->ddpa_clone->ds_dir;
+
+ if (ddpa->ddpa_clone->ds_is_snapshot ||
+ !dsl_dir_is_clone(dd)) {
+ dsl_dataset_rele(ddpa->ddpa_clone, tag);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
+ &ddpa->shared_snaps, tag);
+ if (error != 0)
+ goto out;
+
+ error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
+ &ddpa->clone_snaps, tag);
+ if (error != 0)
+ goto out;
+
+ snap = list_head(&ddpa->shared_snaps);
+ ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
+ error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
+ dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
+ &ddpa->origin_snaps, tag);
+ if (error != 0)
+ goto out;
+
+ if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
+ error = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
+ tag, &ddpa->origin_origin);
+ if (error != 0)
+ goto out;
+ }
+out:
+ if (error != 0)
+ promote_rele(ddpa, tag);
+ return (error);
+}
+
+static void
+promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
+{
+ snaplist_destroy(&ddpa->shared_snaps, tag);
+ snaplist_destroy(&ddpa->clone_snaps, tag);
+ snaplist_destroy(&ddpa->origin_snaps, tag);
+ if (ddpa->origin_origin != NULL)
+ dsl_dataset_rele(ddpa->origin_origin, tag);
+ dsl_dataset_rele(ddpa->ddpa_clone, tag);
+}
+
+/*
+ * Promote a clone.
+ *
+ * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
+ * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)
+ */
+int
+dsl_dataset_promote(const char *name, char *conflsnap)
+{
+ dsl_dataset_promote_arg_t ddpa = { 0 };
+ uint64_t numsnaps;
+ int error;
+ nvpair_t *snap_pair;
+ objset_t *os;
+
+ /*
+ * We will modify space proportional to the number of
+ * snapshots. Compute numsnaps.
+ */
+ error = dmu_objset_hold(name, FTAG, &os);
+ if (error != 0)
+ return (error);
+ error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
+ dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
+ &numsnaps);
+ dmu_objset_rele(os, FTAG);
+ if (error != 0)
+ return (error);
+
+ ddpa.ddpa_clonename = name;
+ ddpa.err_ds = fnvlist_alloc();
+ ddpa.cr = CRED();
+ ddpa.proc = curproc;
+
+ error = dsl_sync_task(name, dsl_dataset_promote_check,
+ dsl_dataset_promote_sync, &ddpa,
+ 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED);
+
+ /*
+ * Return the first conflicting snapshot found.
+ */
+ snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL);
+ if (snap_pair != NULL && conflsnap != NULL)
+ (void) strlcpy(conflsnap, nvpair_name(snap_pair),
+ ZFS_MAX_DATASET_NAME_LEN);
+
+ fnvlist_free(ddpa.err_ds);
+ return (error);
+}
+
+int
+dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
+ dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
+{
+ /*
+ * "slack" factor for received datasets with refquota set on them.
+ * See the bottom of this function for details on its use.
+ */
+ uint64_t refquota_slack = (uint64_t)DMU_MAX_ACCESS *
+ spa_asize_inflation;
+ int64_t unused_refres_delta;
+
+ /* they should both be heads */
+ if (clone->ds_is_snapshot ||
+ origin_head->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ /* if we are not forcing, the branch point should be just before them */
+ if (!force && clone->ds_prev != origin_head->ds_prev)
+ return (SET_ERROR(EINVAL));
+
+ /* clone should be the clone (unless they are unrelated) */
+ if (clone->ds_prev != NULL &&
+ clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
+ origin_head->ds_dir != clone->ds_prev->ds_dir)
+ return (SET_ERROR(EINVAL));
+
+ /* the clone should be a child of the origin */
+ if (clone->ds_dir->dd_parent != origin_head->ds_dir)
+ return (SET_ERROR(EINVAL));
+
+ /* origin_head shouldn't be modified unless 'force' */
+ if (!force &&
+ dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
+ return (SET_ERROR(ETXTBSY));
+
+ /* origin_head should have no long holds (e.g. is not mounted) */
+ if (dsl_dataset_handoff_check(origin_head, owner, tx))
+ return (SET_ERROR(EBUSY));
+
+ /* check amount of any unconsumed refreservation */
+ unused_refres_delta =
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(origin_head)->ds_unique_bytes) -
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(clone)->ds_unique_bytes);
+
+ if (unused_refres_delta > 0 &&
+ unused_refres_delta >
+ dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
+ return (SET_ERROR(ENOSPC));
+
+ /*
+ * The clone can't be too much over the head's refquota.
+ *
+ * To ensure that the entire refquota can be used, we allow one
+ * transaction to exceed the refquota. Therefore, this check
+ * needs to also allow for the space referenced to be more than the
+ * refquota. The maximum amount of space that one transaction can use
+ * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this
+ * overage ensures that we are able to receive a filesystem that
+ * exceeds the refquota on the source system.
+ *
+ * So that overage is the refquota_slack we use below.
+ */
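+	/*
+	 * Illustrative sketch only (hypothetical numbers, assuming the
+	 * default spa_asize_inflation of 24 and a DMU_MAX_ACCESS of 64MB):
+	 * under those defaults refquota_slack works out to roughly
+	 * 64MB * 24 = ~1.5GB, so a received clone whose referenced space
+	 * stays within refquota + ~1.5GB still passes the check below.
+	 */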
+ if (origin_head->ds_quota != 0 &&
+ dsl_dataset_phys(clone)->ds_referenced_bytes >
+ origin_head->ds_quota + refquota_slack)
+ return (SET_ERROR(EDQUOT));
+
+ return (0);
+}
+
+static void
+dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone,
+ dsl_dataset_t *origin, dmu_tx_t *tx)
+{
+ uint64_t clone_remap_dl_obj, origin_remap_dl_obj;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ ASSERT(dsl_pool_sync_context(dp));
+
+ clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone);
+ origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin);
+
+ if (clone_remap_dl_obj != 0) {
+ dsl_deadlist_close(&clone->ds_remap_deadlist);
+ dsl_dataset_unset_remap_deadlist_object(clone, tx);
+ }
+ if (origin_remap_dl_obj != 0) {
+ dsl_deadlist_close(&origin->ds_remap_deadlist);
+ dsl_dataset_unset_remap_deadlist_object(origin, tx);
+ }
+
+ if (clone_remap_dl_obj != 0) {
+ dsl_dataset_set_remap_deadlist_object(origin,
+ clone_remap_dl_obj, tx);
+ dsl_deadlist_open(&origin->ds_remap_deadlist,
+ dp->dp_meta_objset, clone_remap_dl_obj);
+ }
+ if (origin_remap_dl_obj != 0) {
+ dsl_dataset_set_remap_deadlist_object(clone,
+ origin_remap_dl_obj, tx);
+ dsl_deadlist_open(&clone->ds_remap_deadlist,
+ dp->dp_meta_objset, origin_remap_dl_obj);
+ }
+}
+
+void
+dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
+ dsl_dataset_t *origin_head, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int64_t unused_refres_delta;
+
+ ASSERT(clone->ds_reserved == 0);
+ /*
+ * NOTE: On DEBUG kernels there could be a race between this and
+ * the check function if spa_asize_inflation is adjusted...
+ */
+ ASSERT(origin_head->ds_quota == 0 ||
+ dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota +
+ DMU_MAX_ACCESS * spa_asize_inflation);
+ ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
+
+ dsl_dir_cancel_waiters(origin_head->ds_dir);
+
+ /*
+ * Swap per-dataset feature flags.
+ */
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET)) {
+ ASSERT(!dsl_dataset_feature_is_active(clone, f));
+ ASSERT(!dsl_dataset_feature_is_active(origin_head, f));
+ continue;
+ }
+
+ boolean_t clone_inuse = dsl_dataset_feature_is_active(clone, f);
+ void *clone_feature = clone->ds_feature[f];
+ boolean_t origin_head_inuse =
+ dsl_dataset_feature_is_active(origin_head, f);
+ void *origin_head_feature = origin_head->ds_feature[f];
+
+ if (clone_inuse)
+ dsl_dataset_deactivate_feature_impl(clone, f, tx);
+ if (origin_head_inuse)
+ dsl_dataset_deactivate_feature_impl(origin_head, f, tx);
+
+ if (clone_inuse) {
+ dsl_dataset_activate_feature(origin_head->ds_object, f,
+ clone_feature, tx);
+ origin_head->ds_feature[f] = clone_feature;
+ }
+ if (origin_head_inuse) {
+ dsl_dataset_activate_feature(clone->ds_object, f,
+ origin_head_feature, tx);
+ clone->ds_feature[f] = origin_head_feature;
+ }
+ }
+
+ dmu_buf_will_dirty(clone->ds_dbuf, tx);
+ dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+
+ if (clone->ds_objset != NULL) {
+ dmu_objset_evict(clone->ds_objset);
+ clone->ds_objset = NULL;
+ }
+
+ if (origin_head->ds_objset != NULL) {
+ dmu_objset_evict(origin_head->ds_objset);
+ origin_head->ds_objset = NULL;
+ }
+
+ unused_refres_delta =
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(origin_head)->ds_unique_bytes) -
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(clone)->ds_unique_bytes);
+
+ /*
+ * Reset origin's unique bytes.
+ */
+ {
+ dsl_dataset_t *origin = clone->ds_prev;
+ uint64_t comp, uncomp;
+
+ dmu_buf_will_dirty(origin->ds_dbuf, tx);
+ dsl_deadlist_space_range(&clone->ds_deadlist,
+ dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
+ &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
+ }
+
+ /* swap blkptrs */
+ {
+ rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
+ rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
+ blkptr_t tmp;
+ tmp = dsl_dataset_phys(origin_head)->ds_bp;
+ dsl_dataset_phys(origin_head)->ds_bp =
+ dsl_dataset_phys(clone)->ds_bp;
+ dsl_dataset_phys(clone)->ds_bp = tmp;
+ rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
+ rrw_exit(&clone->ds_bp_rwlock, FTAG);
+ }
+
+ /* set dd_*_bytes */
+ {
+ int64_t dused, dcomp, duncomp;
+ uint64_t cdl_used, cdl_comp, cdl_uncomp;
+ uint64_t odl_used, odl_comp, odl_uncomp;
+
+ ASSERT3U(dsl_dir_phys(clone->ds_dir)->
+ dd_used_breakdown[DD_USED_SNAP], ==, 0);
+
+ dsl_deadlist_space(&clone->ds_deadlist,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space(&origin_head->ds_deadlist,
+ &odl_used, &odl_comp, &odl_uncomp);
+
+ dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
+ cdl_used -
+ (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
+ odl_used);
+ dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
+ cdl_comp -
+ (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
+ odl_comp);
+ duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
+ cdl_uncomp -
+ (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
+ odl_uncomp);
+
+ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
+ dused, dcomp, duncomp, tx);
+ dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
+ -dused, -dcomp, -duncomp, tx);
+
+ /*
+ * The difference in the space used by snapshots is the
+ * difference in snapshot space due to the head's
+ * deadlist (since that's the only thing that's
+ * changing that affects the snapused).
+ */
+ dsl_deadlist_space_range(&clone->ds_deadlist,
+ origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space_range(&origin_head->ds_deadlist,
+ origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+ &odl_used, &odl_comp, &odl_uncomp);
+ dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
+ DD_USED_HEAD, DD_USED_SNAP, tx);
+ }
+
+ /* swap ds_*_bytes */
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
+ dsl_dataset_phys(clone)->ds_referenced_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
+ dsl_dataset_phys(clone)->ds_compressed_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
+ dsl_dataset_phys(clone)->ds_uncompressed_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
+ dsl_dataset_phys(clone)->ds_unique_bytes);
+
+ /* apply any parent delta for change in unconsumed refreservation */
+ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
+ unused_refres_delta, 0, 0, tx);
+
+ /*
+ * Swap deadlists.
+ */
+ dsl_deadlist_close(&clone->ds_deadlist);
+ dsl_deadlist_close(&origin_head->ds_deadlist);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
+ dsl_dataset_phys(clone)->ds_deadlist_obj);
+ dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
+ dsl_dataset_phys(clone)->ds_deadlist_obj);
+ dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
+ dsl_dataset_phys(origin_head)->ds_deadlist_obj);
+ dsl_dataset_swap_remap_deadlists(clone, origin_head, tx);
+
+ /*
+ * If there is a bookmark at the origin, its "next dataset" is
+ * changing, so we need to reset its FBN.
+ */
+ dsl_bookmark_next_changed(origin_head, origin_head->ds_prev, tx);
+
+ dsl_scan_ds_clone_swapped(origin_head, clone, tx);
+
+ /*
+ * Destroy any livelists associated with the clone or the origin,
+ * since after the swap the corresponding livelists are no longer
+ * valid.
+ */
+ dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE);
+ dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE);
+
+ spa_history_log_internal_ds(clone, "clone swap", tx,
+ "parent=%s", origin_head->ds_dir->dd_myname);
+}
+
+/*
+ * Given a pool name and a dataset object number in that pool,
+ * return the name of that dataset.
+ */
+int
+dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_pool_hold(pname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+ if (error == 0) {
+ dsl_dataset_name(ds, buf);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_pool_rele(dp, FTAG);
+
+ return (error);
+}
+
+int
+dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+ uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
+{
+ int error = 0;
+
+ ASSERT3S(asize, >, 0);
+
+ /*
+ * *ref_rsrv is the portion of asize that will come from any
+ * unconsumed refreservation space.
+ */
+ *ref_rsrv = 0;
+
+ mutex_enter(&ds->ds_lock);
+ /*
+ * Make a space adjustment for reserved bytes.
+ */
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
+ ASSERT3U(*used, >=,
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+ *used -=
+ (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+ *ref_rsrv =
+ asize - MIN(asize, parent_delta(ds, asize + inflight));
+ }
+
+ if (!check_quota || ds->ds_quota == 0) {
+ mutex_exit(&ds->ds_lock);
+ return (0);
+ }
+ /*
+ * If they are requesting more space, and our current estimate
+ * is over quota, they get to try again unless the actual
+ * on-disk is over quota and there are no pending changes (which
+ * may free up space for us).
+ */
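+	/*
+	 * Hypothetical illustration of the rule below: with ds_quota = 10G,
+	 * referenced = 11G and inflight = 0 we return EDQUOT (definitely
+	 * over quota, with nothing pending that could free space); with
+	 * referenced = 9G and inflight = 2G we return ERESTART, since the
+	 * pending changes may free enough space for a retry to succeed.
+	 */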
+ if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
+ ds->ds_quota) {
+ if (inflight > 0 ||
+ dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
+ error = SET_ERROR(ERESTART);
+ else
+ error = SET_ERROR(EDQUOT);
+ }
+ mutex_exit(&ds->ds_lock);
+
+ return (error);
+}
+
+typedef struct dsl_dataset_set_qr_arg {
+ const char *ddsqra_name;
+ zprop_source_t ddsqra_source;
+ uint64_t ddsqra_value;
+} dsl_dataset_set_qr_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+ uint64_t newval;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
+ return (SET_ERROR(ENOTSUP));
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (ds->ds_is_snapshot) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_prop_predict(ds->ds_dir,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (newval == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
+ newval < ds->ds_reserved) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds = NULL;
+ uint64_t newval;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+ &ddsqra->ddsqra_value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
+
+ if (ds->ds_quota != newval) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_quota = newval;
+ }
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
+ uint64_t refquota)
+{
+ dsl_dataset_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = dsname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = refquota;
+
+ return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
+ dsl_dataset_set_refquota_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+static int
+dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+ uint64_t newval, unique;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
+ return (SET_ERROR(ENOTSUP));
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (ds->ds_is_snapshot) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_prop_predict(ds->ds_dir,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ /*
+ * If we are doing the preliminary check in open context, the
+ * space estimates may be inaccurate.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ mutex_enter(&ds->ds_lock);
+ if (!DS_UNIQUE_IS_ACCURATE(ds))
+ dsl_dataset_recalc_head_uniq(ds);
+ unique = dsl_dataset_phys(ds)->ds_unique_bytes;
+ mutex_exit(&ds->ds_lock);
+
+ if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
+ uint64_t delta = MAX(unique, newval) -
+ MAX(unique, ds->ds_reserved);
+
+ if (delta >
+ dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
+ (ds->ds_quota > 0 && newval > ds->ds_quota)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+void
+dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
+ zprop_source_t source, uint64_t value, dmu_tx_t *tx)
+{
+ uint64_t newval;
+ uint64_t unique;
+ int64_t delta;
+
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ source, sizeof (value), 1, &value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ mutex_enter(&ds->ds_dir->dd_lock);
+ mutex_enter(&ds->ds_lock);
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ unique = dsl_dataset_phys(ds)->ds_unique_bytes;
+ delta = MAX(0, (int64_t)(newval - unique)) -
+ MAX(0, (int64_t)(ds->ds_reserved - unique));
+ ds->ds_reserved = newval;
+ mutex_exit(&ds->ds_lock);
+
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
+ mutex_exit(&ds->ds_dir->dd_lock);
+}
+
+static void
+dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds = NULL;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+ dsl_dataset_set_refreservation_sync_impl(ds,
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
+ uint64_t refreservation)
+{
+ dsl_dataset_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = dsname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = refreservation;
+
+ return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
+ dsl_dataset_set_refreservation_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+typedef struct dsl_dataset_set_compression_arg {
+ const char *ddsca_name;
+ zprop_source_t ddsca_source;
+ uint64_t ddsca_value;
+} dsl_dataset_set_compression_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_dataset_set_compression_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_compression_arg_t *ddsca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value);
+ spa_feature_t f = zio_compress_to_feature(compval);
+
+ if (f == SPA_FEATURE_NONE)
+ return (SET_ERROR(EINVAL));
+
+ if (!spa_feature_is_enabled(dp->dp_spa, f))
+ return (SET_ERROR(ENOTSUP));
+
+ return (0);
+}
+
+static void
+dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_compression_arg_t *ddsca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds = NULL;
+
+ uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value);
+ spa_feature_t f = zio_compress_to_feature(compval);
+ ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN);
+
+ VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds));
+ if (zfeature_active(f, ds->ds_feature[f]) != B_TRUE) {
+ ds->ds_feature_activation[f] = (void *)B_TRUE;
+ dsl_dataset_activate_feature(ds->ds_object, f,
+ ds->ds_feature_activation[f], tx);
+ ds->ds_feature[f] = ds->ds_feature_activation[f];
+ }
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_set_compression(const char *dsname, zprop_source_t source,
+ uint64_t compression)
+{
+ dsl_dataset_set_compression_arg_t ddsca;
+
+ /*
+ * The sync task is only required for zstd in order to activate
+ * the feature flag when the property is first set.
+ */
+ if (ZIO_COMPRESS_ALGO(compression) != ZIO_COMPRESS_ZSTD)
+ return (0);
+
+ ddsca.ddsca_name = dsname;
+ ddsca.ddsca_source = source;
+ ddsca.ddsca_value = compression;
+
+ return (dsl_sync_task(dsname, dsl_dataset_set_compression_check,
+ dsl_dataset_set_compression_sync, &ddsca, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+/*
+ * Return (in *usedp) the amount of space referenced by "new" that was not
+ * referenced at the time the bookmark corresponds to. "New" may be a
+ * snapshot or a head. The bookmark must be before new, in
+ * new's filesystem (or its origin) -- caller verifies this.
+ *
+ * The written space is calculated by considering two components: First, we
+ * ignore any freed space, and calculate the written as new's used space
+ * minus old's used space. Next, we add in the amount of space that was freed
+ * between the two time points, thus reducing new's used space relative to
+ * old's. Specifically, this is the space that was born before
+ * zbm_creation_txg, and freed before new (ie. on new's deadlist or a
+ * previous deadlist).
+ *
+ * space freed [---------------------]
+ * snapshots ---O-------O--------O-------O------
+ * bookmark new
+ *
+ * Note, the bookmark's zbm_*_bytes_refd must be valid, but if the HAS_FBN
+ * flag is not set, we will calculate the freed_before_next based on the
+ * next snapshot's deadlist, rather than using zbm_*_freed_before_next_snap.
+ */
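+/*
+ * Worked example with hypothetical numbers (for illustration only): if the
+ * bookmark referenced 10G, "new" references 12G, and 3G of data that existed
+ * at the bookmark has since been freed (it shows up on the deadlists walked
+ * below, or in the freed_before_next fields), then
+ * written = 12G - 10G + 3G = 5G: 2G of net growth plus 3G written in place
+ * of deleted data.
+ */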
+static int
+dsl_dataset_space_written_impl(zfs_bookmark_phys_t *bmp,
+ dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ int err = 0;
+ dsl_pool_t *dp = new->ds_dir->dd_pool;
+
+ ASSERT(dsl_pool_config_held(dp));
+ if (dsl_dataset_is_snapshot(new)) {
+ ASSERT3U(bmp->zbm_creation_txg, <,
+ dsl_dataset_phys(new)->ds_creation_txg);
+ }
+
+ *usedp = 0;
+ *usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
+ *usedp -= bmp->zbm_referenced_bytes_refd;
+
+ *compp = 0;
+ *compp += dsl_dataset_phys(new)->ds_compressed_bytes;
+ *compp -= bmp->zbm_compressed_bytes_refd;
+
+ *uncompp = 0;
+ *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
+ *uncompp -= bmp->zbm_uncompressed_bytes_refd;
+
+ dsl_dataset_t *snap = new;
+
+ while (dsl_dataset_phys(snap)->ds_prev_snap_txg >
+ bmp->zbm_creation_txg) {
+ uint64_t used, comp, uncomp;
+
+ dsl_deadlist_space_range(&snap->ds_deadlist,
+ 0, bmp->zbm_creation_txg,
+ &used, &comp, &uncomp);
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+
+ uint64_t snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ if (snap != new)
+ dsl_dataset_rele(snap, FTAG);
+ err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
+ if (err != 0)
+ break;
+ }
+
+ /*
+ * We might not have the FBN if we are calculating written from
+ * a snapshot (because we didn't know the correct "next" snapshot
+ * until now).
+ */
+ if (bmp->zbm_flags & ZBM_FLAG_HAS_FBN) {
+ *usedp += bmp->zbm_referenced_freed_before_next_snap;
+ *compp += bmp->zbm_compressed_freed_before_next_snap;
+ *uncompp += bmp->zbm_uncompressed_freed_before_next_snap;
+ } else {
+ ASSERT3U(dsl_dataset_phys(snap)->ds_prev_snap_txg, ==,
+ bmp->zbm_creation_txg);
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp);
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+ }
+ if (snap != new)
+ dsl_dataset_rele(snap, FTAG);
+ return (err);
+}
+
+/*
+ * Return (in *usedp) the amount of space written in new that was not
+ * present at the time the bookmark corresponds to. New may be a
+ * snapshot or the head. Old must be a bookmark before new, in
+ * new's filesystem (or its origin) -- caller verifies this.
+ */
+int
+dsl_dataset_space_written_bookmark(zfs_bookmark_phys_t *bmp,
+ dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ if (!(bmp->zbm_flags & ZBM_FLAG_HAS_FBN))
+ return (SET_ERROR(ENOTSUP));
+ return (dsl_dataset_space_written_impl(bmp, new,
+ usedp, compp, uncompp));
+}
+
+/*
+ * Return (in *usedp) the amount of space written in new that is not
+ * present in oldsnap. New may be a snapshot or the head. Old must be
+ * a snapshot before new, in new's filesystem (or its origin). If not then
+ * fail and return EINVAL.
+ */
+int
+dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ if (!dsl_dataset_is_before(new, oldsnap, 0))
+ return (SET_ERROR(EINVAL));
+
+ zfs_bookmark_phys_t zbm = { 0 };
+ dsl_dataset_phys_t *dsp = dsl_dataset_phys(oldsnap);
+ zbm.zbm_guid = dsp->ds_guid;
+ zbm.zbm_creation_txg = dsp->ds_creation_txg;
+ zbm.zbm_creation_time = dsp->ds_creation_time;
+ zbm.zbm_referenced_bytes_refd = dsp->ds_referenced_bytes;
+ zbm.zbm_compressed_bytes_refd = dsp->ds_compressed_bytes;
+ zbm.zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes;
+
+ /*
+ * If oldsnap is the origin (or origin's origin, ...) of new,
+ * we can't easily calculate the effective FBN. Therefore,
+ * we do not set ZBM_FLAG_HAS_FBN, so that the _impl will calculate
+ * it relative to the correct "next": the next snapshot towards "new",
+ * rather than the next snapshot in oldsnap's dsl_dir.
+ */
+ return (dsl_dataset_space_written_impl(&zbm, new,
+ usedp, compp, uncompp));
+}
+
+/*
+ * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
+ * lastsnap, and all snapshots in between are deleted.
+ *
+ * blocks that would be freed [---------------------------]
+ * snapshots ---O-------O--------O-------O--------O
+ * firstsnap lastsnap
+ *
+ * This is the set of blocks that were born after the snap before firstsnap
+ * (birth > firstsnap->prev_snap_txg), and died before the snap after the
+ * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
+ * We calculate this by iterating over the relevant deadlists (from the snap
+ * after lastsnap, backward to the snap after firstsnap), summing up the
+ * space on the deadlist that was born after the snap before firstsnap.
+ */
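+/*
+ * Hypothetical illustration: given snapshots A - B - C - D and a request to
+ * destroy B..C (firstsnap = B, lastsnap = C), the loop below walks the
+ * deadlists of D (the snap after lastsnap) and C, counting only the space on
+ * them that was born after A (firstsnap's previous snapshot). Anything older
+ * is still referenced by A, and anything still live in D is not on those
+ * deadlists at all, so neither would be freed by the deletion.
+ */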
+int
+dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
+ dsl_dataset_t *lastsnap,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ int err = 0;
+ uint64_t snapobj;
+ dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
+
+ ASSERT(firstsnap->ds_is_snapshot);
+ ASSERT(lastsnap->ds_is_snapshot);
+
+ /*
+ * Check that the snapshots are in the same dsl_dir, and firstsnap
+ * is before lastsnap.
+ */
+ if (firstsnap->ds_dir != lastsnap->ds_dir ||
+ dsl_dataset_phys(firstsnap)->ds_creation_txg >
+ dsl_dataset_phys(lastsnap)->ds_creation_txg)
+ return (SET_ERROR(EINVAL));
+
+ *usedp = *compp = *uncompp = 0;
+
+ snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
+ while (snapobj != firstsnap->ds_object) {
+ dsl_dataset_t *ds;
+ uint64_t used, comp, uncomp;
+
+ err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
+ if (err != 0)
+ break;
+
+ dsl_deadlist_space_range(&ds->ds_deadlist,
+ dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
+ &used, &comp, &uncomp);
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+
+ snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ ASSERT3U(snapobj, !=, 0);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ return (err);
+}
+
+/*
+ * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
+ * For example, they could both be snapshots of the same filesystem, and
+ * 'earlier' is before 'later'. Or 'earlier' could be the origin of
+ * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's
+ * filesystem. Or 'earlier' could be the origin's origin.
+ *
+ * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
+ */
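+/*
+ * Hypothetical examples (dataset names invented for illustration):
+ * pool/fs@snap1 is "before" pool/fs@snap2, and pool/fs@snap2 is before the
+ * head pool/fs. If pool/clone was cloned from pool/fs@snap1, then
+ * pool/fs@snap1 (and any snapshot of pool/fs taken before it) is before
+ * pool/clone and before pool/clone's snapshots.
+ */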
+boolean_t
+dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
+ uint64_t earlier_txg)
+{
+ dsl_pool_t *dp = later->ds_dir->dd_pool;
+ int error;
+ boolean_t ret;
+
+ ASSERT(dsl_pool_config_held(dp));
+ ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
+
+ if (earlier_txg == 0)
+ earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
+
+ if (later->ds_is_snapshot &&
+ earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
+ return (B_FALSE);
+
+ if (later->ds_dir == earlier->ds_dir)
+ return (B_TRUE);
+
+ /*
+ * We check dd_origin_obj explicitly here rather than using
+ * dsl_dir_is_clone() so that we will return TRUE if "earlier"
+ * is $ORIGIN@$ORIGIN. dsl_dataset_space_written() depends on
+ * this behavior.
+ */
+ if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == 0)
+ return (B_FALSE);
+
+ dsl_dataset_t *origin;
+ error = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
+ if (error != 0)
+ return (B_FALSE);
+ if (dsl_dataset_phys(origin)->ds_creation_txg == earlier_txg &&
+ origin->ds_dir == earlier->ds_dir) {
+ dsl_dataset_rele(origin, FTAG);
+ return (B_TRUE);
+ }
+ ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
+ dsl_dataset_rele(origin, FTAG);
+ return (ret);
+}
+
+void
+dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
+}
+
+boolean_t
+dsl_dataset_is_zapified(dsl_dataset_t *ds)
+{
+ dmu_object_info_t doi;
+
+ dmu_object_info_from_db(ds->ds_dbuf, &doi);
+ return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
+
+boolean_t
+dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_is_zapified(ds) &&
+ zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
+}
+
+uint64_t
+dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds)
+{
+ uint64_t remap_deadlist_obj;
+ int err;
+
+ if (!dsl_dataset_is_zapified(ds))
+ return (0);
+
+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
+ DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1,
+ &remap_deadlist_obj);
+
+ if (err != 0) {
+ VERIFY3S(err, ==, ENOENT);
+ return (0);
+ }
+
+ ASSERT(remap_deadlist_obj != 0);
+ return (remap_deadlist_obj);
+}
+
+boolean_t
+dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds)
+{
+ EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist),
+ dsl_dataset_get_remap_deadlist_object(ds) != 0);
+ return (dsl_deadlist_is_open(&ds->ds_remap_deadlist));
+}
+
+static void
+dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj,
+ dmu_tx_t *tx)
+{
+ ASSERT(obj != 0);
+ dsl_dataset_zapify(ds, tx);
+ VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
+ DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx));
+}
+
+static void
+dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx));
+}
+
+void
+dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t remap_deadlist_object;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_dataset_remap_deadlist_exists(ds));
+
+ remap_deadlist_object = ds->ds_remap_deadlist.dl_object;
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+ dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx);
+ dsl_dataset_unset_remap_deadlist_object(ds, tx);
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
+
+void
+dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t remap_deadlist_obj;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock));
+ /*
+ * Currently we only create remap deadlists when there are indirect
+ * vdevs with referenced mappings.
+ */
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ remap_deadlist_obj = dsl_deadlist_clone(
+ &ds->ds_deadlist, UINT64_MAX,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
+ dsl_dataset_set_remap_deadlist_object(ds,
+ remap_deadlist_obj, tx);
+ dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa),
+ remap_deadlist_obj);
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
+
+void
+dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps,
+ uint64_t num_redact_snaps, dmu_tx_t *tx)
+{
+ uint64_t dsobj = ds->ds_object;
+ struct feature_type_uint64_array_arg *ftuaa =
+ kmem_zalloc(sizeof (*ftuaa), KM_SLEEP);
+ ftuaa->length = (int64_t)num_redact_snaps;
+ if (num_redact_snaps > 0) {
+ ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t),
+ KM_SLEEP);
+ bcopy(redact_snaps, ftuaa->array, num_redact_snaps *
+ sizeof (uint64_t));
+ }
+ dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS,
+ ftuaa, tx);
+ ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa;
+}
+
+/* BEGIN CSTYLED */
+#if defined(_LP64)
+#define RECORDSIZE_PERM ZMOD_RW
+#else
+/* Limited to 1M on 32-bit platforms due to lack of virtual address space */
+#define RECORDSIZE_PERM ZMOD_RD
+#endif
+ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM,
+ "Max allowed record size");
+
+ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW,
+ "Allow mounting of redacted datasets");
+/* END CSTYLED */
+
+EXPORT_SYMBOL(dsl_dataset_hold);
+EXPORT_SYMBOL(dsl_dataset_hold_flags);
+EXPORT_SYMBOL(dsl_dataset_hold_obj);
+EXPORT_SYMBOL(dsl_dataset_hold_obj_flags);
+EXPORT_SYMBOL(dsl_dataset_own);
+EXPORT_SYMBOL(dsl_dataset_own_obj);
+EXPORT_SYMBOL(dsl_dataset_name);
+EXPORT_SYMBOL(dsl_dataset_rele);
+EXPORT_SYMBOL(dsl_dataset_rele_flags);
+EXPORT_SYMBOL(dsl_dataset_disown);
+EXPORT_SYMBOL(dsl_dataset_tryown);
+EXPORT_SYMBOL(dsl_dataset_create_sync);
+EXPORT_SYMBOL(dsl_dataset_create_sync_dd);
+EXPORT_SYMBOL(dsl_dataset_snapshot_check);
+EXPORT_SYMBOL(dsl_dataset_snapshot_sync);
+EXPORT_SYMBOL(dsl_dataset_promote);
+EXPORT_SYMBOL(dsl_dataset_user_hold);
+EXPORT_SYMBOL(dsl_dataset_user_release);
+EXPORT_SYMBOL(dsl_dataset_get_holds);
+EXPORT_SYMBOL(dsl_dataset_get_blkptr);
+EXPORT_SYMBOL(dsl_dataset_get_spa);
+EXPORT_SYMBOL(dsl_dataset_modified_since_snap);
+EXPORT_SYMBOL(dsl_dataset_space_written);
+EXPORT_SYMBOL(dsl_dataset_space_wouldfree);
+EXPORT_SYMBOL(dsl_dataset_sync);
+EXPORT_SYMBOL(dsl_dataset_block_born);
+EXPORT_SYMBOL(dsl_dataset_block_kill);
+EXPORT_SYMBOL(dsl_dataset_dirty);
+EXPORT_SYMBOL(dsl_dataset_stats);
+EXPORT_SYMBOL(dsl_dataset_fast_stat);
+EXPORT_SYMBOL(dsl_dataset_space);
+EXPORT_SYMBOL(dsl_dataset_fsid_guid);
+EXPORT_SYMBOL(dsl_dsobj_to_dsname);
+EXPORT_SYMBOL(dsl_dataset_check_quota);
+EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl);
+EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl);
diff --git a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
new file mode 100644
index 000000000000..bad2d56eefdd
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
@@ -0,0 +1,1012 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+
+/*
+ * Deadlist concurrency:
+ *
+ * Deadlists can only be modified from the syncing thread.
+ *
+ * Except for dsl_deadlist_insert(), it can only be modified with the
+ * dp_config_rwlock held with RW_WRITER.
+ *
+ * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
+ * be called concurrently, from open context, with the dp_config_rwlock held
+ * with RW_READER.
+ *
+ * Therefore, we only need to provide locking between dsl_deadlist_insert() and
+ * the accessors, protecting:
+ * dl_phys->dl_used,comp,uncomp
+ * and protecting the dl_tree from being loaded.
+ * The locking is provided by dl_lock. Note that locking on the bpobj_t
+ * provides its own locking, and dl_oldfmt is immutable.
+ */
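+/*
+ * A minimal sketch of the resulting locking contract, assuming a
+ * hypothetical open-context caller that holds the pool configuration lock
+ * as a reader, as described above:
+ *
+ *	dsl_pool_config_enter(dp, FTAG);
+ *	dsl_deadlist_space(&ds->ds_deadlist, &used, &comp, &uncomp);
+ *	dsl_pool_config_exit(dp, FTAG);
+ *
+ * dsl_deadlist_insert() runs only in syncing context and takes dl_lock
+ * internally, so it serializes with the accessors on dl_lock alone.
+ */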
+
+/*
+ * Livelist Overview
+ * ================
+ *
+ * Livelists use the same 'deadlist_t' struct as deadlists and are also used
+ * to track blkptrs over the lifetime of a dataset. Livelists, however, belong
+ * to clones and track the blkptrs that are clone-specific (were born after
+ * the clone's creation). The exception is embedded block pointers which are
+ * not included in livelists because they do not need to be freed.
+ *
+ * When it comes time to delete the clone, the livelist provides a quick
+ * reference as to what needs to be freed. For this reason, livelists also track
+ * when clone-specific blkptrs are freed before deletion to prevent double
+ * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the
+ * deletion algorithm iterates backwards over the livelist, matching
+ * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists
+ * are also updated in the case when blkptrs are remapped: the old version
+ * of the blkptr is cancelled out with a FREE and the new version is tracked
+ * with an ALLOC.
+ *
+ * To bound the amount of memory required for deletion, livelists over a
+ * certain size are spread over multiple entries. Entries are grouped by
+ * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
+ * be in the same entry. This allows us to delete livelists incrementally
+ * over multiple syncs, one entry at a time.
+ *
+ * During the lifetime of the clone, livelists can get extremely large.
+ * Their size is managed by periodic condensing (preemptively cancelling out
+ * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when
+ * the shared space between the clone and its origin is so small that it
+ * doesn't make sense to use livelists anymore.
+ */
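+/*
+ * Illustrative example (hypothetical blocks X and Y): if the clone writes
+ * block X (ALLOC X) and later overwrites it with Y (FREE X, ALLOC Y), the
+ * backward scan pairs FREE X with ALLOC X and cancels them, leaving only
+ * ALLOC Y. At deletion time only Y is freed; X is not freed a second time,
+ * since it was already freed while the clone was alive.
+ */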
+
+/*
+ * The threshold sublist size at which we create a new sub-livelist for the
+ * next txg. However, since blkptrs of the same transaction group must be in
+ * the same sub-list, the actual sublist size may exceed this. When picking the
+ * size we had to balance the fact that larger sublists mean fewer sublists
+ * (decreasing the cost of insertion) against the consideration that sublists
+ * will be loaded into memory and shouldn't take up an inordinate amount of
+ * space. We settled on ~500000 entries, corresponding to roughly 128M.
+ */
+unsigned long zfs_livelist_max_entries = 500000;
+
+/*
+ * We can approximate how much of a performance gain a livelist will give us
+ * based on the percentage of blocks shared between the clone and its origin.
+ * 0 percent shared means that the clone has completely diverged and that the
+ * old method is maximally effective: every read from the block tree will
+ * result in lots of frees. Livelists give us gains when they track blocks
+ * scattered across the tree, when one read in the old method might only
+ * result in a few frees. Once the clone has been overwritten enough,
+ * writes are no longer sparse and we'll no longer get much of a benefit from
+ * tracking them with a livelist. We chose a lower limit of 75 percent shared
+ * (25 percent overwritten). This means that 1/4 of all block pointers will be
+ * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists
+ * to make deletion 4x faster. Once the amount of shared space drops below this
+ * threshold, the clone will revert to the old deletion method.
+ */
+int zfs_livelist_min_percent_shared = 75;
+
+static int
+dsl_deadlist_compare(const void *arg1, const void *arg2)
+{
+ const dsl_deadlist_entry_t *dle1 = arg1;
+ const dsl_deadlist_entry_t *dle2 = arg2;
+
+ return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
+}
+
+static int
+dsl_deadlist_cache_compare(const void *arg1, const void *arg2)
+{
+ const dsl_deadlist_cache_entry_t *dlce1 = arg1;
+ const dsl_deadlist_cache_entry_t *dlce2 = arg2;
+
+ return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));
+}
+
+static void
+dsl_deadlist_load_tree(dsl_deadlist_t *dl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+
+ ASSERT(!dl->dl_oldfmt);
+ if (dl->dl_havecache) {
+ /*
+ * After loading the tree, the caller may modify the tree,
+ * e.g. to add or remove nodes, or to make a node no longer
+ * refer to the empty_bpobj. These changes would make the
+ * dl_cache incorrect. Therefore we discard the cache here,
+ * so that it can't become incorrect.
+ */
+ dsl_deadlist_cache_entry_t *dlce;
+ void *cookie = NULL;
+ while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
+ != NULL) {
+ kmem_free(dlce, sizeof (*dlce));
+ }
+ avl_destroy(&dl->dl_cache);
+ dl->dl_havecache = B_FALSE;
+ }
+ if (dl->dl_havetree)
+ return;
+
+ avl_create(&dl->dl_tree, dsl_deadlist_compare,
+ sizeof (dsl_deadlist_entry_t),
+ offsetof(dsl_deadlist_entry_t, dle_node));
+ for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);
+
+ /*
+ * Prefetch all the bpobj's so that we do that i/o
+ * in parallel. Then open them all in a second pass.
+ */
+ dle->dle_bpobj.bpo_object = za.za_first_integer;
+ dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object,
+ 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+
+ avl_add(&dl->dl_tree, dle);
+ }
+ VERIFY3U(error, ==, ENOENT);
+ zap_cursor_fini(&zc);
+
+ for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree);
+ dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os,
+ dle->dle_bpobj.bpo_object));
+ }
+ dl->dl_havetree = B_TRUE;
+}
+
+/*
+ * Load only the non-empty bpobj's into the dl_cache. The cache is an analog
+ * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It
+ * is used only for gathering space statistics. The dl_cache has two
+ * advantages over the dl_tree:
+ *
+ * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's
+ * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj
+ * many times and to inquire about its (zero) space stats many times.
+ *
+ * 2. The dl_cache uses less memory than the dl_tree. We only need to load
+ * the dl_tree of snapshots when deleting a snapshot, after which we free the
+ * dl_tree with dsl_deadlist_discard_tree
+ */
+static void
+dsl_deadlist_load_cache(dsl_deadlist_t *dl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+
+ ASSERT(!dl->dl_oldfmt);
+ if (dl->dl_havecache)
+ return;
+
+ uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;
+
+ avl_create(&dl->dl_cache, dsl_deadlist_cache_compare,
+ sizeof (dsl_deadlist_cache_entry_t),
+ offsetof(dsl_deadlist_cache_entry_t, dlce_node));
+ for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if (za.za_first_integer == empty_bpobj)
+ continue;
+ dsl_deadlist_cache_entry_t *dlce =
+ kmem_zalloc(sizeof (*dlce), KM_SLEEP);
+ dlce->dlce_mintxg = zfs_strtonum(za.za_name, NULL);
+
+ /*
+ * Prefetch all the bpobj's so that we do that i/o
+ * in parallel. Then open them all in a second pass.
+ */
+ dlce->dlce_bpobj = za.za_first_integer;
+ dmu_prefetch(dl->dl_os, dlce->dlce_bpobj,
+ 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+ avl_add(&dl->dl_cache, dlce);
+ }
+ VERIFY3U(error, ==, ENOENT);
+ zap_cursor_fini(&zc);
+
+ for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache);
+ dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
+ bpobj_t bpo;
+ VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj));
+
+ VERIFY0(bpobj_space(&bpo,
+ &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp));
+ bpobj_close(&bpo);
+ }
+ dl->dl_havecache = B_TRUE;
+}
+
+/*
+ * Discard the tree to save memory.
+ */
+void
+dsl_deadlist_discard_tree(dsl_deadlist_t *dl)
+{
+ mutex_enter(&dl->dl_lock);
+
+ if (!dl->dl_havetree) {
+ mutex_exit(&dl->dl_lock);
+ return;
+ }
+ dsl_deadlist_entry_t *dle;
+ void *cookie = NULL;
+ while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) {
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ }
+ avl_destroy(&dl->dl_tree);
+
+ dl->dl_havetree = B_FALSE;
+ mutex_exit(&dl->dl_lock);
+}
+
+void
+dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
+{
+ dsl_deadlist_entry_t *dle;
+
+ ASSERT(dsl_deadlist_is_open(dl));
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+ mutex_exit(&dl->dl_lock);
+ for (dle = avl_first(&dl->dl_tree); dle != NULL;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ if (func(args, dle) != 0)
+ break;
+ }
+}
+
+void
+dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+
+ ASSERT(!dsl_deadlist_is_open(dl));
+
+ mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
+ dl->dl_os = os;
+ dl->dl_object = object;
+ VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
+ dmu_object_info_from_db(dl->dl_dbuf, &doi);
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ dmu_buf_rele(dl->dl_dbuf, dl);
+ dl->dl_dbuf = NULL;
+ dl->dl_oldfmt = B_TRUE;
+ VERIFY0(bpobj_open(&dl->dl_bpobj, os, object));
+ return;
+ }
+
+ dl->dl_oldfmt = B_FALSE;
+ dl->dl_phys = dl->dl_dbuf->db_data;
+ dl->dl_havetree = B_FALSE;
+ dl->dl_havecache = B_FALSE;
+}
+
+boolean_t
+dsl_deadlist_is_open(dsl_deadlist_t *dl)
+{
+ return (dl->dl_os != NULL);
+}
+
+void
+dsl_deadlist_close(dsl_deadlist_t *dl)
+{
+ ASSERT(dsl_deadlist_is_open(dl));
+ mutex_destroy(&dl->dl_lock);
+
+ if (dl->dl_oldfmt) {
+ dl->dl_oldfmt = B_FALSE;
+ bpobj_close(&dl->dl_bpobj);
+ dl->dl_os = NULL;
+ dl->dl_object = 0;
+ return;
+ }
+
+ if (dl->dl_havetree) {
+ dsl_deadlist_entry_t *dle;
+ void *cookie = NULL;
+ while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
+ != NULL) {
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ }
+ avl_destroy(&dl->dl_tree);
+ }
+ if (dl->dl_havecache) {
+ dsl_deadlist_cache_entry_t *dlce;
+ void *cookie = NULL;
+ while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
+ != NULL) {
+ kmem_free(dlce, sizeof (*dlce));
+ }
+ avl_destroy(&dl->dl_cache);
+ }
+ dmu_buf_rele(dl->dl_dbuf, dl);
+ dl->dl_dbuf = NULL;
+ dl->dl_phys = NULL;
+ dl->dl_os = NULL;
+ dl->dl_object = 0;
+}
+
+uint64_t
+dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+ return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
+ return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
+ sizeof (dsl_deadlist_phys_t), tx));
+}
+
+void
+dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
+{
+ dmu_object_info_t doi;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ VERIFY0(dmu_object_info(os, dlobj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_free(os, dlobj, tx);
+ return;
+ }
+
+ for (zap_cursor_init(&zc, os, dlobj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t obj = za.za_first_integer;
+ if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
+ bpobj_decr_empty(os, tx);
+ else
+ bpobj_free(os, obj, tx);
+ }
+ VERIFY3U(error, ==, ENOENT);
+ zap_cursor_fini(&zc);
+ VERIFY0(dmu_object_free(os, dlobj, tx));
+}
+
+static void
+dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+ const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
+{
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+ if (dle->dle_bpobj.bpo_object ==
+ dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+ uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+ bpobj_close(&dle->dle_bpobj);
+ bpobj_decr_empty(dl->dl_os, tx);
+ VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, obj, tx));
+ }
+ bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);
+}
+
+static void
+dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+ uint64_t obj, dmu_tx_t *tx)
+{
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+ if (dle->dle_bpobj.bpo_object !=
+ dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+ bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+ } else {
+ bpobj_close(&dle->dle_bpobj);
+ bpobj_decr_empty(dl->dl_os, tx);
+ VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, obj, tx));
+ }
+}
+
+void
+dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ if (dl->dl_oldfmt) {
+ bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);
+ return;
+ }
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+
+ int sign = bp_freed ? -1 : +1;
+ dl->dl_phys->dl_used +=
+ sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
+ dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
+ dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
+
+ dle_tofind.dle_mintxg = bp->blk_birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ else
+ dle = AVL_PREV(&dl->dl_tree, dle);
+
+ if (dle == NULL) {
+ zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
+ bp, (longlong_t)bp->blk_birth);
+ dle = avl_first(&dl->dl_tree);
+ }
+
+ ASSERT3P(dle, !=, NULL);
+ dle_enqueue(dl, dle, bp, bp_freed, tx);
+ mutex_exit(&dl->dl_lock);
+}
+
+int
+dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, B_FALSE, tx);
+ return (0);
+}
+
+int
+dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, B_TRUE, tx);
+ return (0);
+}
+
+/*
+ * Insert new key in deadlist, which must be > all current entries.
+ * mintxg is not inclusive.
+ */
+void
+dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ dsl_deadlist_entry_t *dle;
+
+ if (dl->dl_oldfmt)
+ return;
+
+ dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = mintxg;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ avl_add(&dl->dl_tree, dle);
+
+ VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object,
+ mintxg, obj, tx));
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Remove this key, merging its entries into the previous key.
+ */
+void
+dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle, *dle_prev;
+
+ if (dl->dl_oldfmt)
+ return;
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+ ASSERT3P(dle, !=, NULL);
+ dle_prev = AVL_PREV(&dl->dl_tree, dle);
+
+ dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
+
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+
+ VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Remove a deadlist entry and all of its contents by removing the entry from
+ * the deadlist's avl tree, freeing the entry's bpobj and adjusting the
+ * deadlist's space accounting accordingly.
+ */
+void
+dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ objset_t *os = dl->dl_os;
+
+ if (dl->dl_oldfmt)
+ return;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+ VERIFY3P(dle, !=, NULL);
+
+ avl_remove(&dl->dl_tree, dle);
+ VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
+ VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dl->dl_phys->dl_used -= used;
+ dl->dl_phys->dl_comp -= comp;
+ dl->dl_phys->dl_uncomp -= uncomp;
+ if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {
+ bpobj_decr_empty(os, tx);
+ } else {
+ bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
+ }
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Clear out the contents of a deadlist_entry by freeing its bpobj,
+ * replacing it with an empty bpobj and adjusting the deadlist's
+ * space accounting
+ */
+void
+dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
+ dmu_tx_t *tx)
+{
+ uint64_t new_obj, used, comp, uncomp;
+ objset_t *os = dl->dl_os;
+
+ mutex_enter(&dl->dl_lock);
+ VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
+ VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dl->dl_phys->dl_used -= used;
+ dl->dl_phys->dl_comp -= comp;
+ dl->dl_phys->dl_uncomp -= uncomp;
+ if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)
+ bpobj_decr_empty(os, tx);
+ else
+ bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
+ bpobj_close(&dle->dle_bpobj);
+ new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));
+ VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,
+ new_obj, tx));
+ ASSERT(bpobj_is_empty(&dle->dle_bpobj));
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Return the first entry in deadlist's avl tree
+ */
+dsl_deadlist_entry_t *
+dsl_deadlist_first(dsl_deadlist_t *dl)
+{
+ dsl_deadlist_entry_t *dle;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+ dle = avl_first(&dl->dl_tree);
+ mutex_exit(&dl->dl_lock);
+
+ return (dle);
+}
+
+/*
+ * Return the last entry in deadlist's avl tree
+ */
+dsl_deadlist_entry_t *
+dsl_deadlist_last(dsl_deadlist_t *dl)
+{
+ dsl_deadlist_entry_t *dle;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+ dle = avl_last(&dl->dl_tree);
+ mutex_exit(&dl->dl_lock);
+
+ return (dle);
+}
+
+/*
+ * Walk ds's snapshots to regenerate the ZAP & AVL.
+ */
+static void
+dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_t dl = { 0 };
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ dsl_deadlist_open(&dl, os, dlobj);
+ if (dl.dl_oldfmt) {
+ dsl_deadlist_close(&dl);
+ return;
+ }
+
+ while (mrs_obj != 0) {
+ dsl_dataset_t *ds;
+ VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
+ dsl_deadlist_add_key(&dl,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_deadlist_close(&dl);
+}
+
+uint64_t
+dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t *dle;
+ uint64_t newobj;
+
+ newobj = dsl_deadlist_alloc(dl->dl_os, tx);
+
+ if (dl->dl_oldfmt) {
+ dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
+ return (newobj);
+ }
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ for (dle = avl_first(&dl->dl_tree); dle;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ uint64_t obj;
+
+ if (dle->dle_mintxg >= maxtxg)
+ break;
+
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY0(zap_add_int_key(dl->dl_os, newobj,
+ dle->dle_mintxg, obj, tx));
+ }
+ mutex_exit(&dl->dl_lock);
+ return (newobj);
+}
+
+void
+dsl_deadlist_space(dsl_deadlist_t *dl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ ASSERT(dsl_deadlist_is_open(dl));
+ if (dl->dl_oldfmt) {
+ VERIFY0(bpobj_space(&dl->dl_bpobj,
+ usedp, compp, uncompp));
+ return;
+ }
+
+ mutex_enter(&dl->dl_lock);
+ *usedp = dl->dl_phys->dl_used;
+ *compp = dl->dl_phys->dl_comp;
+ *uncompp = dl->dl_phys->dl_uncomp;
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Return the space used in the range (mintxg, maxtxg].
+ * Includes maxtxg, does not include mintxg.
+ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
+ * UINT64_MAX).
+ */
+void
+dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ dsl_deadlist_cache_entry_t *dlce;
+ dsl_deadlist_cache_entry_t dlce_tofind;
+ avl_index_t where;
+
+ if (dl->dl_oldfmt) {
+ VERIFY0(bpobj_space_range(&dl->dl_bpobj,
+ mintxg, maxtxg, usedp, compp, uncompp));
+ return;
+ }
+
+ *usedp = *compp = *uncompp = 0;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_cache(dl);
+ dlce_tofind.dlce_mintxg = mintxg;
+ dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where);
+
+ /*
+ * If this mintxg doesn't exist, it may be an empty_bpobj which
+ * is omitted from the sparse tree. Start at the next non-empty
+ * entry.
+ */
+ if (dlce == NULL)
+ dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER);
+
+ for (; dlce && dlce->dlce_mintxg < maxtxg;
+ dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
+ *usedp += dlce->dlce_bytes;
+ *compp += dlce->dlce_comp;
+ *uncompp += dlce->dlce_uncomp;
+ }
+
+ mutex_exit(&dl->dl_lock);
+}
+
+static void
+dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+ uint64_t used, comp, uncomp;
+ bpobj_t bpo;
+
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+
+ VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp));
+ bpobj_close(&bpo);
+
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dl->dl_phys->dl_used += used;
+ dl->dl_phys->dl_comp += comp;
+ dl->dl_phys->dl_uncomp += uncomp;
+
+ dle_tofind.dle_mintxg = birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ dle_enqueue_subobj(dl, dle, obj, tx);
+}
+
+static int
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, bp_freed, tx);
+ return (0);
+}
+
+/*
+ * Merge the deadlist pointed to by 'obj' into dl. obj will be left as
+ * an empty deadlist.
+ */
+void
+dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ dmu_buf_t *bonus;
+ dsl_deadlist_phys_t *dlp;
+ dmu_object_info_t doi;
+ int error;
+
+ VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_t bpo;
+ VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx));
+ bpobj_close(&bpo);
+ return;
+ }
+
+ mutex_enter(&dl->dl_lock);
+ for (zap_cursor_init(&zc, dl->dl_os, obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
+ dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+ VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
+ }
+ VERIFY3U(error, ==, ENOENT);
+ zap_cursor_fini(&zc);
+
+ VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
+ dlp = bonus->db_data;
+ dmu_buf_will_dirty(bonus, tx);
+ bzero(dlp, sizeof (*dlp));
+ dmu_buf_rele(bonus, FTAG);
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Remove entries on dl that are born > mintxg, and put them on the bpobj.
+ */
+void
+dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ ASSERT(!dl->dl_oldfmt);
+
+ mutex_enter(&dl->dl_lock);
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+ while (dle) {
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_entry_t *dle_next;
+
+ bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+
+ VERIFY0(bpobj_space(&dle->dle_bpobj,
+ &used, &comp, &uncomp));
+ ASSERT3U(dl->dl_phys->dl_used, >=, used);
+ ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
+ ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
+ dl->dl_phys->dl_used -= used;
+ dl->dl_phys->dl_comp -= comp;
+ dl->dl_phys->dl_uncomp -= uncomp;
+
+ VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, tx));
+
+ dle_next = AVL_NEXT(&dl->dl_tree, dle);
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ dle = dle_next;
+ }
+ mutex_exit(&dl->dl_lock);
+}
+
+typedef struct livelist_entry {
+ const blkptr_t *le_bp;
+ avl_node_t le_node;
+} livelist_entry_t;
+
+static int
+livelist_compare(const void *larg, const void *rarg)
+{
+ const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
+ const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
+
+ /* Sort them according to dva[0] */
+ uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
+ uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
+
+ if (l_dva0_vdev != r_dva0_vdev)
+ return (TREE_CMP(l_dva0_vdev, r_dva0_vdev));
+
+ /* if vdevs are equal, sort by offsets. */
+ uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
+ uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
+ if (l_dva0_offset == r_dva0_offset)
+ ASSERT3U(l->blk_birth, ==, r->blk_birth);
+ return (TREE_CMP(l_dva0_offset, r_dva0_offset));
+}
+
+struct livelist_iter_arg {
+ avl_tree_t *avl;
+ bplist_t *to_free;
+ zthr_t *t;
+};
+
+/*
+ * Expects an AVL tree which is incrementally filled with FREE blkptrs
+ * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
+ * corresponding FREE are stored in the supplied bplist.
+ */
+static int
+dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ struct livelist_iter_arg *lia = arg;
+ avl_tree_t *avl = lia->avl;
+ bplist_t *to_free = lia->to_free;
+ zthr_t *t = lia->t;
+ ASSERT(tx == NULL);
+
+ if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
+ return (SET_ERROR(EINTR));
+ if (bp_freed) {
+ livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
+ KM_SLEEP);
+ blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ *temp_bp = *bp;
+ node->le_bp = temp_bp;
+ avl_add(avl, node);
+ } else {
+ livelist_entry_t node;
+ node.le_bp = bp;
+ livelist_entry_t *found = avl_find(avl, &node, NULL);
+ if (found != NULL) {
+ avl_remove(avl, found);
+ kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
+ kmem_free(found, sizeof (livelist_entry_t));
+ } else {
+ bplist_append(to_free, bp);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs
+ * which have an ALLOC entry but no matching FREE.
+ */
+int
+dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
+ uint64_t *size)
+{
+ avl_tree_t avl;
+ avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),
+ offsetof(livelist_entry_t, le_node));
+
+ /* process the sublist */
+ struct livelist_iter_arg arg = {
+ .avl = &avl,
+ .to_free = to_free,
+ .t = t
+ };
+ int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
+
+ avl_destroy(&avl);
+ return (err);
+}
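
dsl_livelist_iterate() above keeps FREE records in an AVL tree keyed on DVA[0] so that each matching ALLOC cancels its FREE, and only ALLOCs with no matching FREE reach the bplist. A minimal stand-alone sketch of the same cancel-out idea, using a sorted array instead of an AVL tree and made-up (vdev, offset) keys, assuming nothing beyond standard C:

#include <stdio.h>
#include <stdlib.h>

typedef struct entry {
	unsigned long vdev;	/* stand-in for DVA_GET_VDEV(dva[0]) */
	unsigned long offset;	/* stand-in for DVA_GET_OFFSET(dva[0]) */
	int freed;		/* 1 = FREE record, 0 = ALLOC record */
} entry_t;

static int
entry_cmp(const void *l, const void *r)
{
	const entry_t *a = l, *b = r;

	if (a->vdev != b->vdev)
		return (a->vdev < b->vdev ? -1 : 1);
	if (a->offset != b->offset)
		return (a->offset < b->offset ? -1 : 1);
	return (0);
}

int
main(void)
{
	/* Toy log: the (0, 200) ALLOC never sees a matching FREE. */
	entry_t log[] = {
		{ 0, 100, 0 }, { 0, 200, 0 }, { 1, 50, 0 },
		{ 0, 100, 1 }, { 1, 50, 1 },
	};
	size_t n = sizeof (log) / sizeof (log[0]);

	qsort(log, n, sizeof (entry_t), entry_cmp);

	/* After sorting, an ALLOC and its FREE sit on adjacent slots. */
	for (size_t i = 0; i < n; ) {
		if (i + 1 < n && entry_cmp(&log[i], &log[i + 1]) == 0) {
			i += 2;		/* matched pair cancels out */
		} else {
			if (!log[i].freed)
				printf("unmatched ALLOC: vdev %lu "
				    "offset %lu\n",
				    log[i].vdev, log[i].offset);
			i++;
		}
	}
	return (0);
}
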
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, ULONG, ZMOD_RW,
+ "Size to start the next sub-livelist in a livelist");
+
+ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW,
+ "Threshold at which livelist is disabled");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dsl_deleg.c b/sys/contrib/openzfs/module/zfs/dsl_deleg.c
new file mode 100644
index 000000000000..cf8a3c9bbdfb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_deleg.c
@@ -0,0 +1,774 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ */
+
+/*
+ * DSL permissions are stored in a two level zap attribute
+ * mechanism. The first level identifies the "class" of
+ * entry. The class is identified by the first 2 letters of
+ * the attribute. The second letter "l" or "d" identifies whether
+ * it is a local or descendent permission. The first letter
+ * identifies the type of entry.
+ *
+ * ul$<id> identifies permissions granted locally for this userid.
+ * ud$<id> identifies permissions granted on descendent datasets for
+ * this userid.
+ * Ul$<id> identifies permission sets granted locally for this userid.
+ * Ud$<id> identifies permission sets granted on descendent datasets for
+ * this userid.
+ * gl$<id> identifies permissions granted locally for this groupid.
+ * gd$<id> identifies permissions granted on descendent datasets for
+ * this groupid.
+ * Gl$<id> identifies permission sets granted locally for this groupid.
+ * Gd$<id> identifies permission sets granted on descendent datasets for
+ * this groupid.
+ * el$ identifies permissions granted locally for everyone.
+ * ed$ identifies permissions granted on descendent datasets
+ * for everyone.
+ * El$ identifies permission sets granted locally for everyone.
+ * Ed$ identifies permission sets granted to descendent datasets for
+ * everyone.
+ * c-$ identifies permission to create at dataset creation time.
+ * C-$ identifies permission sets to grant locally at dataset creation
+ * time.
+ * s-$@<name> permissions defined in specified set @<name>
+ * S-$@<name> Sets defined in named set @<name>
+ *
+ * Each of the above entities points to another zap attribute that contains one
+ * attribute for each allowed permission, such as create, destroy,...
+ * All of the "upper" case class types will specify permission set names
+ * rather than permissions.
+ *
+ * Basically it looks something like this:
+ * ul$12 -> ZAP OBJ -> permissions...
+ *
+ * The ZAP OBJ is referred to as the jump object.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/cred.h>
+#include <sys/sunddi.h>
+
+#include "zfs_deleg.h"
+
+/*
+ * Validate that user is allowed to delegate specified permissions.
+ *
+ * In order to delegate "create" you must have "create"
+ * and "allow".
+ */
+int
+dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+ nvpair_t *whopair = NULL;
+ int error;
+
+ if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
+ return (error);
+
+ while ((whopair = nvlist_next_nvpair(nvp, whopair))) {
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+
+ VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
+
+ while ((permpair = nvlist_next_nvpair(perms, permpair))) {
+ const char *perm = nvpair_name(permpair);
+
+ if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0)
+ return (SET_ERROR(EPERM));
+
+ if ((error = dsl_deleg_access(ddname, perm, cr)) != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Validate that user is allowed to unallow specified permissions. They
+ * must have the 'allow' permission, and even then can only unallow
+ * perms for their uid.
+ */
+int
+dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+ nvpair_t *whopair = NULL;
+ int error;
+ char idstr[32];
+
+ if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
+ return (error);
+
+ (void) snprintf(idstr, sizeof (idstr), "%lld",
+ (longlong_t)crgetuid(cr));
+
+ while ((whopair = nvlist_next_nvpair(nvp, whopair))) {
+ zfs_deleg_who_type_t type = nvpair_name(whopair)[0];
+
+ if (type != ZFS_DELEG_USER &&
+ type != ZFS_DELEG_USER_SETS)
+ return (SET_ERROR(EPERM));
+
+ if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0)
+ return (SET_ERROR(EPERM));
+ }
+ return (0);
+}
+
+typedef struct dsl_deleg_arg {
+ const char *dda_name;
+ nvlist_t *dda_nvlist;
+} dsl_deleg_arg_t;
+
+static void
+dsl_deleg_set_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_deleg_arg_t *dda = arg;
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ nvpair_t *whopair = NULL;
+ uint64_t zapobj;
+
+ VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+ if (zapobj == 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
+ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ }
+
+ while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) {
+ const char *whokey = nvpair_name(whopair);
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+ uint64_t jumpobj;
+
+ perms = fnvpair_value_nvlist(whopair);
+
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
+ jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS,
+ zapobj, whokey, tx);
+ }
+
+ while ((permpair = nvlist_next_nvpair(perms, permpair))) {
+ const char *perm = nvpair_name(permpair);
+ uint64_t n = 0;
+
+ VERIFY(zap_update(mos, jumpobj,
+ perm, 8, 1, &n, tx) == 0);
+ spa_history_log_internal_dd(dd, "permission update", tx,
+ "%s %s", whokey, perm);
+ }
+ }
+ dsl_dir_rele(dd, FTAG);
+}
+
+static void
+dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_deleg_arg_t *dda = arg;
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ nvpair_t *whopair = NULL;
+ uint64_t zapobj;
+
+ VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+ if (zapobj == 0) {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+
+ while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) {
+ const char *whokey = nvpair_name(whopair);
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+ uint64_t jumpobj;
+
+ if (nvpair_value_nvlist(whopair, &perms) != 0) {
+ if (zap_lookup(mos, zapobj, whokey, 8,
+ 1, &jumpobj) == 0) {
+ (void) zap_remove(mos, zapobj, whokey, tx);
+ VERIFY(0 == zap_destroy(mos, jumpobj, tx));
+ }
+ spa_history_log_internal_dd(dd, "permission who remove",
+ tx, "%s", whokey);
+ continue;
+ }
+
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0)
+ continue;
+
+ while ((permpair = nvlist_next_nvpair(perms, permpair))) {
+ const char *perm = nvpair_name(permpair);
+ uint64_t n = 0;
+
+ (void) zap_remove(mos, jumpobj, perm, tx);
+ if (zap_count(mos, jumpobj, &n) == 0 && n == 0) {
+ (void) zap_remove(mos, zapobj,
+ whokey, tx);
+ VERIFY(0 == zap_destroy(mos,
+ jumpobj, tx));
+ }
+ spa_history_log_internal_dd(dd, "permission remove", tx,
+ "%s %s", whokey, perm);
+ }
+ }
+ dsl_dir_rele(dd, FTAG);
+}
+
+static int
+dsl_deleg_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_deleg_arg_t *dda = arg;
+ dsl_dir_t *dd;
+ int error;
+
+ if (spa_version(dmu_tx_pool(tx)->dp_spa) <
+ SPA_VERSION_DELEGATED_PERMS) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL);
+ if (error == 0)
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+}
+
+int
+dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
+{
+ dsl_deleg_arg_t dda;
+
+ /* nvp must already have been verified to be valid */
+
+ dda.dda_name = ddname;
+ dda.dda_nvlist = nvp;
+
+ return (dsl_sync_task(ddname, dsl_deleg_check,
+ unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
+ &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED));
+}
+
+/*
+ * Find all 'allow' permissions from a given point and then continue
+ * traversing up to the root.
+ *
+ * This function constructs an nvlist of nvlists: each setpoint is an
+ * nvlist of whokeys, each of which is in turn an nvlist of the
+ * individual user/group/everyone/create permissions.
+ *
+ * The nvlist will look like this.
+ *
+ * { source fsname -> { whokeys { permissions,...}, ...}}
+ *
+ * The fsname nvpairs will be arranged in a bottom up order. For example,
+ * if we have the following structure a/b/c then the nvpairs for the fsnames
+ * will be ordered a/b/c, a/b, a.
+ */
+int
+dsl_deleg_get(const char *ddname, nvlist_t **nvp)
+{
+ dsl_dir_t *dd, *startdd;
+ dsl_pool_t *dp;
+ int error;
+ objset_t *mos;
+ zap_cursor_t *basezc, *zc;
+ zap_attribute_t *baseza, *za;
+ char *source;
+
+ error = dsl_pool_hold(ddname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dp = startdd->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ basezc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ baseza = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ source = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (dd = startdd; dd != NULL; dd = dd->dd_parent) {
+ nvlist_t *sp_nvp;
+ uint64_t n;
+
+ if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 ||
+ zap_count(mos,
+ dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0)
+ continue;
+
+ sp_nvp = fnvlist_alloc();
+ for (zap_cursor_init(basezc, mos,
+ dsl_dir_phys(dd)->dd_deleg_zapobj);
+ zap_cursor_retrieve(basezc, baseza) == 0;
+ zap_cursor_advance(basezc)) {
+ nvlist_t *perms_nvp;
+
+ ASSERT(baseza->za_integer_length == 8);
+ ASSERT(baseza->za_num_integers == 1);
+
+ perms_nvp = fnvlist_alloc();
+ for (zap_cursor_init(zc, mos, baseza->za_first_integer);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ fnvlist_add_boolean(perms_nvp, za->za_name);
+ }
+ zap_cursor_fini(zc);
+ fnvlist_add_nvlist(sp_nvp, baseza->za_name, perms_nvp);
+ fnvlist_free(perms_nvp);
+ }
+
+ zap_cursor_fini(basezc);
+
+ dsl_dir_name(dd, source);
+ fnvlist_add_nvlist(*nvp, source, sp_nvp);
+ nvlist_free(sp_nvp);
+ }
+
+ kmem_free(source, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(baseza, sizeof (zap_attribute_t));
+ kmem_free(basezc, sizeof (zap_cursor_t));
+ kmem_free(za, sizeof (zap_attribute_t));
+ kmem_free(zc, sizeof (zap_cursor_t));
+
+ dsl_dir_rele(startdd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (0);
+}
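
A sketch of consuming the { fsname -> { whokey -> { perm, ... } } } structure that dsl_deleg_get() builds, using the standard libnvpair iteration calls. The toy nvlist constructed in main() stands in for a result actually obtained through the userland allow interfaces, which are out of scope here.

#include <stdio.h>
#include <libnvpair.h>

static void
print_delegations(nvlist_t *outer)
{
	nvpair_t *fspair = NULL;

	while ((fspair = nvlist_next_nvpair(outer, fspair)) != NULL) {
		nvlist_t *who_nvl;
		nvpair_t *whopair = NULL;

		if (nvpair_value_nvlist(fspair, &who_nvl) != 0)
			continue;
		printf("setpoint %s:\n", nvpair_name(fspair));

		while ((whopair = nvlist_next_nvpair(who_nvl,
		    whopair)) != NULL) {
			nvlist_t *perms;
			nvpair_t *permpair = NULL;

			if (nvpair_value_nvlist(whopair, &perms) != 0)
				continue;
			printf("  %s:", nvpair_name(whopair));
			while ((permpair = nvlist_next_nvpair(perms,
			    permpair)) != NULL)
				printf(" %s", nvpair_name(permpair));
			printf("\n");
		}
	}
}

int
main(void)
{
	/* Toy stand-in: { "pool/fs" -> { "ul$12" -> { create } } } */
	nvlist_t *perms = fnvlist_alloc();
	nvlist_t *who = fnvlist_alloc();
	nvlist_t *outer = fnvlist_alloc();

	fnvlist_add_boolean(perms, "create");
	fnvlist_add_nvlist(who, "ul$12", perms);
	fnvlist_add_nvlist(outer, "pool/fs", who);

	print_delegations(outer);

	fnvlist_free(perms);
	fnvlist_free(who);
	fnvlist_free(outer);
	return (0);
}
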
+
+/*
+ * Routines for dsl_deleg_access() -- access checking.
+ */
+typedef struct perm_set {
+ avl_node_t p_node;
+ boolean_t p_matched;
+ char p_setname[ZFS_MAX_DELEG_NAME];
+} perm_set_t;
+
+static int
+perm_set_compare(const void *arg1, const void *arg2)
+{
+ const perm_set_t *node1 = (const perm_set_t *)arg1;
+ const perm_set_t *node2 = (const perm_set_t *)arg2;
+ int val;
+
+ val = strcmp(node1->p_setname, node2->p_setname);
+
+ return (TREE_ISIGN(val));
+}
+
+/*
+ * Determine whether a specified permission exists.
+ *
+ * First the base attribute has to be retrieved, e.g. ul$12.
+ * Once the base object has been retrieved, the actual permission
+ * is looked up in the zap object the base object points to.
+ *
+ * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if
+ * there is no perm in that jumpobj.
+ */
+static int
+dsl_check_access(objset_t *mos, uint64_t zapobj,
+ char type, char checkflag, void *valp, const char *perm)
+{
+ int error;
+ uint64_t jumpobj, zero;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey, type, checkflag, valp);
+ error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
+ if (error == 0) {
+ error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero);
+ if (error == ENOENT)
+ error = SET_ERROR(EPERM);
+ }
+ return (error);
+}
+
+/*
+ * Check a specified user/group for a requested permission.
+ */
+static int
+dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm,
+ int checkflag, cred_t *cr)
+{
+ const gid_t *gids;
+ int ngids;
+ int i;
+ uint64_t id;
+
+ /* check for user */
+ id = crgetuid(cr);
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_USER, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check for the user's primary group */
+ id = crgetgid(cr);
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check for everyone entry */
+ id = -1;
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check each supplemental group the user is a member of */
+ ngids = crgetngroups(cr);
+ gids = crgetgroups(cr);
+ for (i = 0; i != ngids; i++) {
+ id = gids[i];
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
+ return (0);
+ }
+
+ return (SET_ERROR(EPERM));
+}
+
+/*
+ * Iterate over the sets specified in the given zapobj
+ * and load them into the permsets avl tree.
+ */
+static int
+dsl_load_sets(objset_t *mos, uint64_t zapobj,
+ char type, char checkflag, void *valp, avl_tree_t *avl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ perm_set_t *permnode;
+ avl_index_t idx;
+ uint64_t jumpobj;
+ int error;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey, type, checkflag, valp);
+
+ error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
+ if (error != 0)
+ return (error);
+
+ for (zap_cursor_init(&zc, mos, jumpobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP);
+ (void) strlcpy(permnode->p_setname, za.za_name,
+ sizeof (permnode->p_setname));
+ permnode->p_matched = B_FALSE;
+
+ if (avl_find(avl, permnode, &idx) == NULL) {
+ avl_insert(avl, permnode, idx);
+ } else {
+ kmem_free(permnode, sizeof (perm_set_t));
+ }
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+/*
+ * Load all permission sets that apply to the user described by cred.
+ */
+static void
+dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
+ char checkflag, cred_t *cr)
+{
+ const gid_t *gids;
+ int ngids, i;
+ uint64_t id;
+
+ id = crgetuid(cr);
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_USER_SETS, checkflag, &id, avl);
+
+ id = crgetgid(cr);
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl);
+
+ ngids = crgetngroups(cr);
+ gids = crgetgroups(cr);
+ for (i = 0; i != ngids; i++) {
+ id = gids[i];
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+ }
+}
+
+/*
+ * Check if user has requested permission.
+ */
+int
+dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp;
+ void *cookie;
+ int error;
+ char checkflag;
+ objset_t *mos;
+ avl_tree_t permsets;
+ perm_set_t *setnode;
+
+ dp = ds->ds_dir->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ if (dsl_delegation_on(mos) == B_FALSE)
+ return (SET_ERROR(ECANCELED));
+
+ if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
+ SPA_VERSION_DELEGATED_PERMS)
+ return (SET_ERROR(EPERM));
+
+ if (ds->ds_is_snapshot) {
+ /*
+ * Snapshots are treated as descendents only;
+ * local permissions do not apply.
+ */
+ checkflag = ZFS_DELEG_DESCENDENT;
+ } else {
+ checkflag = ZFS_DELEG_LOCAL;
+ }
+
+ avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
+ offsetof(perm_set_t, p_node));
+
+ ASSERT(dsl_pool_config_held(dp));
+ for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent,
+ checkflag = ZFS_DELEG_DESCENDENT) {
+ uint64_t zapobj;
+ boolean_t expanded;
+
+ /*
+ * If not in the global zone then make sure
+ * the zoned property is set.
+ */
+ if (!INGLOBALZONE(curproc)) {
+ uint64_t zoned;
+
+ if (dsl_prop_get_dd(dd,
+ zfs_prop_to_name(ZFS_PROP_ZONED),
+ 8, 1, &zoned, NULL, B_FALSE) != 0)
+ break;
+ if (!zoned)
+ break;
+ }
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+
+ if (zapobj == 0)
+ continue;
+
+ dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr);
+again:
+ expanded = B_FALSE;
+ for (setnode = avl_first(&permsets); setnode;
+ setnode = AVL_NEXT(&permsets, setnode)) {
+ if (setnode->p_matched == B_TRUE)
+ continue;
+
+ /* See if this set directly grants this permission */
+ error = dsl_check_access(mos, zapobj,
+ ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm);
+ if (error == 0)
+ goto success;
+ if (error == EPERM)
+ setnode->p_matched = B_TRUE;
+
+ /* See if this set includes other sets */
+ error = dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_NAMED_SET_SETS, 0,
+ setnode->p_setname, &permsets);
+ if (error == 0)
+ setnode->p_matched = expanded = B_TRUE;
+ }
+ /*
+ * If we expanded any sets, that will define more sets,
+ * which we need to check.
+ */
+ if (expanded)
+ goto again;
+
+ error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr);
+ if (error == 0)
+ goto success;
+ }
+ error = SET_ERROR(EPERM);
+success:
+
+ cookie = NULL;
+ while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
+ kmem_free(setnode, sizeof (perm_set_t));
+
+ return (error);
+}
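
The goto-again loop above expands permission sets to a fixed point: whenever a named set pulls in another set, the whole collection is rescanned. The same idea in a self-contained form, with a toy membership table standing in for the ZFS_DELEG_NAMED_SET_SETS data:

#include <stdio.h>

#define	NSETS	4

/* includes[i][j] != 0 means set i pulls in set j (toy data). */
static const int includes[NSETS][NSETS] = {
	{ 0, 1, 0, 0 },		/* set 0 includes set 1 */
	{ 0, 0, 1, 0 },		/* set 1 includes set 2 */
	{ 0, 0, 0, 0 },
	{ 0, 0, 0, 0 },
};

int
main(void)
{
	int member[NSETS] = { 1, 0, 0, 0 };	/* start from set 0 */
	int expanded[NSETS] = { 0 };
	int again;

	do {
		again = 0;
		for (int i = 0; i < NSETS; i++) {
			if (!member[i] || expanded[i])
				continue;
			expanded[i] = 1;	/* don't expand a set twice */
			for (int j = 0; j < NSETS; j++) {
				if (includes[i][j] && !member[j]) {
					member[j] = 1;
					again = 1;	/* new set: rescan */
				}
			}
		}
	} while (again);

	for (int i = 0; i < NSETS; i++)
		if (member[i])
			printf("set %d is reachable\n", i);
	return (0);
}
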
+
+int
+dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_pool_hold(dsname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_deleg_access_impl(ds, perm, cr);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_pool_rele(dp, FTAG);
+
+ return (error);
+}
+
+/*
+ * Other routines.
+ */
+
+static void
+copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj,
+ boolean_t dosets, uint64_t uid, dmu_tx_t *tx)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t jumpobj, pjumpobj;
+ uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey,
+ dosets ? ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE,
+ ZFS_DELEG_LOCAL, NULL);
+ if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0)
+ return;
+
+ if (zapobj == 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
+ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ }
+
+ zfs_deleg_whokey(whokey,
+ dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER,
+ ZFS_DELEG_LOCAL, &uid);
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) {
+ jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0);
+ }
+
+ for (zap_cursor_init(&zc, mos, pjumpobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t zero = 0;
+ ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
+
+ VERIFY(zap_update(mos, jumpobj, za.za_name,
+ 8, 1, &zero, tx) == 0);
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Set all create-time permissions on the new dataset.
+ */
+void
+dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr)
+{
+ dsl_dir_t *dd;
+ uint64_t uid = crgetuid(cr);
+
+ if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) <
+ SPA_VERSION_DELEGATED_PERMS)
+ return;
+
+ for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) {
+ uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+
+ if (pzapobj == 0)
+ continue;
+
+ copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx);
+ copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx);
+ }
+}
+
+int
+dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ if (zapobj == 0)
+ return (0);
+
+ for (zap_cursor_init(&zc, mos, zapobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
+ VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx));
+ }
+ zap_cursor_fini(&zc);
+ VERIFY(0 == zap_destroy(mos, zapobj, tx));
+ return (0);
+}
+
+boolean_t
+dsl_delegation_on(objset_t *os)
+{
+ return (!!spa_delegation(os->os_spa));
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dsl_deleg_get);
+EXPORT_SYMBOL(dsl_deleg_set);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
new file mode 100644
index 000000000000..837d78987e75
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
@@ -0,0 +1,1281 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
+#include <sys/zvol.h>
+#include <sys/zcp.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/zthr.h>
+#include <sys/spa_impl.h>
+
+int
+dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
+{
+ if (!ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ if (dsl_dataset_long_held(ds))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Only allow deferred destroy on pools that support it.
+ * NOTE: deferred destroy is only supported on snapshots.
+ */
+ if (defer) {
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+ SPA_VERSION_USERREFS)
+ return (SET_ERROR(ENOTSUP));
+ return (0);
+ }
+
+ /*
+ * If this snapshot has an elevated user reference count,
+ * we can't destroy it yet.
+ */
+ if (ds->ds_userrefs > 0)
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Can't delete a branch point.
+ */
+ if (dsl_dataset_phys(ds)->ds_num_children > 1)
+ return (SET_ERROR(EEXIST));
+
+ return (0);
+}
+
+int
+dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_snapshot_arg_t *ddsa = arg;
+ const char *dsname = ddsa->ddsa_name;
+ boolean_t defer = ddsa->ddsa_defer;
+
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int error = 0;
+ dsl_dataset_t *ds;
+
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+
+ /*
+ * If the snapshot does not exist, silently ignore it, and
+ * dsl_destroy_snapshot_sync() will be a no-op
+ * (it's "already destroyed").
+ */
+ if (error == ENOENT)
+ return (0);
+
+ if (error == 0) {
+ error = dsl_destroy_snapshot_check_impl(ds, defer);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ return (error);
+}
+
+struct process_old_arg {
+ dsl_dataset_t *ds;
+ dsl_dataset_t *ds_prev;
+ boolean_t after_branch_point;
+ zio_t *pio;
+ uint64_t used, comp, uncomp;
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
+{
+ struct process_old_arg *poa = arg;
+ dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
+ dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
+ if (poa->ds_prev && !poa->after_branch_point &&
+ bp->blk_birth >
+ dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
+ dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
+ bp_get_dsize_sync(dp->dp_spa, bp);
+ }
+ } else {
+ poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+ poa->comp += BP_GET_PSIZE(bp);
+ poa->uncomp += BP_GET_UCSIZE(bp);
+ dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+ }
+ return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+ dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+ struct process_old_arg poa = { 0 };
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t deadlist_obj;
+
+ ASSERT(ds->ds_deadlist.dl_oldfmt);
+ ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+ poa.ds = ds;
+ poa.ds_prev = ds_prev;
+ poa.after_branch_point = after_branch_point;
+ poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+ process_old_cb, &poa, tx));
+ VERIFY0(zio_wait(poa.pio));
+ ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
+
+ /* change snapused */
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -poa.used, -poa.comp, -poa.uncomp, tx);
+
+ /* swap next's deadlist to our deadlist */
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_close(&ds_next->ds_deadlist);
+ deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+ dsl_dataset_phys(ds)->ds_deadlist_obj =
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj;
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
+ dsl_deadlist_open(&ds->ds_deadlist, mos,
+ dsl_dataset_phys(ds)->ds_deadlist_obj);
+ dsl_deadlist_open(&ds_next->ds_deadlist, mos,
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj);
+}
+
+typedef struct remaining_clones_key {
+ dsl_dataset_t *rck_clone;
+ list_node_t rck_node;
+} remaining_clones_key_t;
+
+static remaining_clones_key_t *
+rck_alloc(dsl_dataset_t *clone)
+{
+ remaining_clones_key_t *rck = kmem_alloc(sizeof (*rck), KM_SLEEP);
+ rck->rck_clone = clone;
+ return (rck);
+}
+
+static void
+dsl_dir_remove_clones_key_impl(dsl_dir_t *dd, uint64_t mintxg, dmu_tx_t *tx,
+ list_t *stack, void *tag)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+
+ /*
+ * If it is the old version, dd_clones doesn't exist so we can't
+ * find the clones, but dsl_deadlist_remove_key() is a no-op so it
+ * doesn't matter.
+ */
+ if (dsl_dir_phys(dd)->dd_clones == 0)
+ return;
+
+ zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ dsl_dataset_t *clone;
+
+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+ za->za_first_integer, tag, &clone));
+
+ if (clone->ds_dir->dd_origin_txg > mintxg) {
+ dsl_deadlist_remove_key(&clone->ds_deadlist,
+ mintxg, tx);
+
+ if (dsl_dataset_remap_deadlist_exists(clone)) {
+ dsl_deadlist_remove_key(
+ &clone->ds_remap_deadlist, mintxg, tx);
+ }
+
+ list_insert_head(stack, rck_alloc(clone));
+ } else {
+ dsl_dataset_rele(clone, tag);
+ }
+ }
+ zap_cursor_fini(zc);
+
+ kmem_free(za, sizeof (zap_attribute_t));
+ kmem_free(zc, sizeof (zap_cursor_t));
+}
+
+void
+dsl_dir_remove_clones_key(dsl_dir_t *top_dd, uint64_t mintxg, dmu_tx_t *tx)
+{
+ list_t stack;
+
+ list_create(&stack, sizeof (remaining_clones_key_t),
+ offsetof(remaining_clones_key_t, rck_node));
+
+ dsl_dir_remove_clones_key_impl(top_dd, mintxg, tx, &stack, FTAG);
+ for (remaining_clones_key_t *rck = list_remove_head(&stack);
+ rck != NULL; rck = list_remove_head(&stack)) {
+ dsl_dataset_t *clone = rck->rck_clone;
+ dsl_dir_t *clone_dir = clone->ds_dir;
+
+ kmem_free(rck, sizeof (*rck));
+
+ dsl_dir_remove_clones_key_impl(clone_dir, mintxg, tx,
+ &stack, FTAG);
+ dsl_dataset_rele(clone, FTAG);
+ }
+
+ list_destroy(&stack);
+}
+
+static void
+dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /* Move blocks to be obsoleted to pool's obsolete list. */
+ if (dsl_dataset_remap_deadlist_exists(ds_next)) {
+ if (!bpobj_is_open(&dp->dp_obsolete_bpobj))
+ dsl_pool_create_obsolete_bpobj(dp, tx);
+
+ dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist,
+ &dp->dp_obsolete_bpobj,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ }
+
+ /* Merge our deadlist into next's and free it. */
+ if (dsl_dataset_remap_deadlist_exists(ds)) {
+ uint64_t remap_deadlist_object =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ ASSERT(remap_deadlist_object != 0);
+
+ mutex_enter(&ds_next->ds_remap_deadlist_lock);
+ if (!dsl_dataset_remap_deadlist_exists(ds_next))
+ dsl_dataset_create_remap_deadlist(ds_next, tx);
+ mutex_exit(&ds_next->ds_remap_deadlist_lock);
+
+ dsl_deadlist_merge(&ds_next->ds_remap_deadlist,
+ remap_deadlist_object, tx);
+ dsl_dataset_destroy_remap_deadlist(ds, tx);
+ }
+}
+
+void
+dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
+{
+ int after_branch_point = FALSE;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dataset_t *ds_prev = NULL;
+ uint64_t obj;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));
+
+ if (defer &&
+ (ds->ds_userrefs > 0 ||
+ dsl_dataset_phys(ds)->ds_num_children > 1)) {
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
+ spa_history_log_internal_ds(ds, "defer_destroy", tx, " ");
+ return;
+ }
+
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+
+ /* We need to log before removing it from the namespace. */
+ spa_history_log_internal_ds(ds, "destroy", tx, " ");
+
+ dsl_scan_ds_destroyed(ds, tx);
+
+ obj = ds->ds_object;
+
+ boolean_t book_exists = dsl_bookmark_ds_destroyed(ds, tx);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (dsl_dataset_feature_is_active(ds, f))
+ dsl_dataset_deactivate_feature(ds, f, tx);
+ }
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ ASSERT3P(ds->ds_prev, ==, NULL);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
+ after_branch_point =
+ (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
+
+ dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+ if (after_branch_point &&
+ dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
+ dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
+ VERIFY0(zap_add_int(mos,
+ dsl_dataset_phys(ds_prev)->
+ ds_next_clones_obj,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ tx));
+ }
+ }
+ if (!after_branch_point) {
+ dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
+ dsl_dataset_phys(ds)->ds_next_snap_obj;
+ }
+ }
+
+ dsl_dataset_t *ds_next;
+ uint64_t old_unique;
+ uint64_t used = 0, comp = 0, uncomp = 0;
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
+ ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
+
+ old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
+
+ dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+ dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
+ dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+ ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
+
+ if (ds_next->ds_deadlist.dl_oldfmt) {
+ process_old_deadlist(ds, ds_prev, ds_next,
+ after_branch_point, tx);
+ } else {
+ /* Adjust prev's unique space. */
+ if (ds_prev && !after_branch_point) {
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ &used, &comp, &uncomp);
+ dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
+ }
+
+ /* Adjust snapused. */
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
+ &used, &comp, &uncomp);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -used, -comp, -uncomp, tx);
+
+ /* Move blocks to be freed to pool's free list. */
+ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+ &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ tx);
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+ DD_USED_HEAD, used, comp, uncomp, tx);
+
+ /* Merge our deadlist into next's and free it. */
+ dsl_deadlist_merge(&ds_next->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+
+ /*
+ * We are done with the deadlist tree (generated/used
+ * by dsl_deadlist_move_bpobj() and dsl_deadlist_merge()).
+ * Discard it to save memory.
+ */
+ dsl_deadlist_discard_tree(&ds_next->ds_deadlist);
+ }
+
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+
+ dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx);
+
+ if (!book_exists) {
+ /* Collapse range in clone heads */
+ dsl_dir_remove_clones_key(ds->ds_dir,
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
+ }
+
+ if (ds_next->ds_is_snapshot) {
+ dsl_dataset_t *ds_nextnext;
+
+ /*
+ * Update next's unique to include blocks which
+ * were previously shared by only this snapshot
+ * and it. Those blocks will be born after the
+ * prev snap and before this snap, and will have
+ * died after the next snap and before the one
+ * after that (ie. be on the snap after next's
+ * deadlist).
+ */
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds_next)->ds_next_snap_obj,
+ FTAG, &ds_nextnext));
+ dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ dsl_dataset_phys(ds)->ds_creation_txg,
+ &used, &comp, &uncomp);
+ dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
+ dsl_dataset_rele(ds_nextnext, FTAG);
+ ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+ /* Collapse range in this head. */
+ dsl_dataset_t *hds;
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
+ FTAG, &hds));
+ if (!book_exists) {
+ /* Collapse range in this head. */
+ dsl_deadlist_remove_key(&hds->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
+ }
+ if (dsl_dataset_remap_deadlist_exists(hds)) {
+ dsl_deadlist_remove_key(&hds->ds_remap_deadlist,
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
+ }
+ dsl_dataset_rele(hds, FTAG);
+
+ } else {
+ ASSERT3P(ds_next->ds_prev, ==, ds);
+ dsl_dataset_rele(ds_next->ds_prev, ds_next);
+ ds_next->ds_prev = NULL;
+ if (ds_prev) {
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ ds_next, &ds_next->ds_prev));
+ }
+
+ dsl_dataset_recalc_head_uniq(ds_next);
+
+ /*
+ * Reduce the amount of our unconsumed refreservation
+ * being charged to our parent by the amount of
+ * new unique data we have gained.
+ */
+ if (old_unique < ds_next->ds_reserved) {
+ int64_t mrsdelta;
+ uint64_t new_unique =
+ dsl_dataset_phys(ds_next)->ds_unique_bytes;
+
+ ASSERT(old_unique <= new_unique);
+ mrsdelta = MIN(new_unique - old_unique,
+ ds_next->ds_reserved - old_unique);
+ dsl_dir_diduse_space(ds->ds_dir,
+ DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
+ }
+ }
+ dsl_dataset_rele(ds_next, FTAG);
+
+ /*
+ * This must be done after the dsl_traverse(), because it will
+ * re-open the objset.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
+ /* remove from snapshot namespace */
+ dsl_dataset_t *ds_head;
+ ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
+ VERIFY0(dsl_dataset_get_snapname(ds));
+#ifdef ZFS_DEBUG
+ {
+ uint64_t val;
+ int err;
+
+ err = dsl_dataset_snap_lookup(ds_head,
+ ds->ds_snapname, &val);
+ ASSERT0(err);
+ ASSERT3U(val, ==, obj);
+ }
+#endif
+ VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
+ dsl_dataset_rele(ds_head, FTAG);
+
+ if (ds_prev != NULL)
+ dsl_dataset_rele(ds_prev, FTAG);
+
+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ uint64_t count __maybe_unused;
+ ASSERT0(zap_count(mos,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
+ count == 0);
+ VERIFY0(dmu_object_free(mos,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
+ }
+ if (dsl_dataset_phys(ds)->ds_props_obj != 0)
+ VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
+ tx));
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
+ VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ tx));
+ dsl_dir_rele(ds->ds_dir, ds);
+ ds->ds_dir = NULL;
+ dmu_object_free_zapified(mos, obj, tx);
+}
+
+void
+dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_snapshot_arg_t *ddsa = arg;
+ const char *dsname = ddsa->ddsa_name;
+ boolean_t defer = ddsa->ddsa_defer;
+
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == ENOENT)
+ return;
+ ASSERT0(error);
+ dsl_destroy_snapshot_sync_impl(ds, defer, tx);
+ zvol_remove_minors(dp->dp_spa, dsname, B_TRUE);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * The semantics of this function are described in the comment above
+ * lzc_destroy_snaps(). To summarize:
+ *
+ * The snapshots must all be in the same pool.
+ *
+ * Snapshots that don't exist will be silently ignored (considered to be
+ * "already deleted").
+ *
+ * On success, all snaps will be destroyed and this will return 0.
+ * On failure, no snaps will be destroyed, the errlist will be filled in,
+ * and this will return an errno.
+ */
+int
+dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
+ nvlist_t *errlist)
+{
+ if (nvlist_next_nvpair(snaps, NULL) == NULL)
+ return (0);
+
+ /*
+ * lzc_destroy_snaps() is documented to take an nvlist whose
+ * values "don't matter". We need to convert that nvlist to
+ * one that we know can be converted to LUA.
+ */
+ nvlist_t *snaps_normalized = fnvlist_alloc();
+ for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) {
+ fnvlist_add_boolean_value(snaps_normalized,
+ nvpair_name(pair), B_TRUE);
+ }
+
+ nvlist_t *arg = fnvlist_alloc();
+ fnvlist_add_nvlist(arg, "snaps", snaps_normalized);
+ fnvlist_free(snaps_normalized);
+ fnvlist_add_boolean_value(arg, "defer", defer);
+
+ nvlist_t *wrapper = fnvlist_alloc();
+ fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg);
+ fnvlist_free(arg);
+
+ const char *program =
+ "arg = ...\n"
+ "snaps = arg['snaps']\n"
+ "defer = arg['defer']\n"
+ "errors = { }\n"
+ "has_errors = false\n"
+ "for snap, v in pairs(snaps) do\n"
+ " errno = zfs.check.destroy{snap, defer=defer}\n"
+ " zfs.debug('snap: ' .. snap .. ' errno: ' .. errno)\n"
+ " if errno == ENOENT then\n"
+ " snaps[snap] = nil\n"
+ " elseif errno ~= 0 then\n"
+ " errors[snap] = errno\n"
+ " has_errors = true\n"
+ " end\n"
+ "end\n"
+ "if has_errors then\n"
+ " return errors\n"
+ "end\n"
+ "for snap, v in pairs(snaps) do\n"
+ " errno = zfs.sync.destroy{snap, defer=defer}\n"
+ " assert(errno == 0)\n"
+ "end\n"
+ "return { }\n";
+
+ nvlist_t *result = fnvlist_alloc();
+ int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)),
+ program,
+ B_TRUE,
+ 0,
+ zfs_lua_max_memlimit,
+ fnvlist_lookup_nvpair(wrapper, ZCP_ARG_ARGLIST), result);
+ if (error != 0) {
+ char *errorstr = NULL;
+ (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr);
+ if (errorstr != NULL) {
+ zfs_dbgmsg(errorstr);
+ }
+ fnvlist_free(wrapper);
+ fnvlist_free(result);
+ return (error);
+ }
+ fnvlist_free(wrapper);
+
+ /*
+ * lzc_destroy_snaps() is documented to fill the errlist with
+ * int32 values, so we need to convert the int64 values that are
+ * returned from LUA.
+ */
+ int rv = 0;
+ nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN);
+ for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) {
+ int32_t val = (int32_t)fnvpair_value_int64(pair);
+ if (rv == 0)
+ rv = val;
+ fnvlist_add_int32(errlist, nvpair_name(pair), val);
+ }
+ fnvlist_free(result);
+ return (rv);
+}
+
+int
+dsl_destroy_snapshot(const char *name, boolean_t defer)
+{
+ int error;
+ nvlist_t *nvl = fnvlist_alloc();
+ nvlist_t *errlist = fnvlist_alloc();
+
+ fnvlist_add_boolean(nvl, name);
+ error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
+ fnvlist_free(errlist);
+ fnvlist_free(nvl);
+ return (error);
+}
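
For reference, the user-space path into dsl_destroy_snapshots_nvl() goes through lzc_destroy_snaps() in libzfs_core (see the comment above dsl_destroy_snapshots_nvl()). A hedged sketch of such a caller, with example snapshot names and assuming the libzfs_core entry points lzc_destroy_snaps() and libzfs_core_init()/libzfs_core_fini():

#include <stdio.h>
#include <libzfs_core.h>

int
main(void)
{
	nvlist_t *snaps = fnvlist_alloc();
	nvlist_t *errlist = NULL;
	int err;

	/* Values in the snaps nvlist don't matter, only the names do. */
	fnvlist_add_boolean(snaps, "pool/fs@snap1");
	fnvlist_add_boolean(snaps, "pool/fs@snap2");

	if (libzfs_core_init() != 0) {
		fnvlist_free(snaps);
		return (1);
	}

	err = lzc_destroy_snaps(snaps, B_FALSE, &errlist);
	if (err != 0 && errlist != NULL) {
		nvpair_t *pair = NULL;

		/* Per-snapshot errno values, as filled in by the kernel. */
		while ((pair = nvlist_next_nvpair(errlist, pair)) != NULL)
			(void) fprintf(stderr, "%s: error %d\n",
			    nvpair_name(pair),
			    (int)fnvpair_value_int32(pair));
	}

	if (errlist != NULL)
		fnvlist_free(errlist);
	fnvlist_free(snaps);
	libzfs_core_fini();
	return (err == 0 ? 0 : 1);
}
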
+
+struct killarg {
+ dsl_dataset_t *ds;
+ dmu_tx_t *tx;
+};
+
+/* ARGSUSED */
+static int
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ struct killarg *ka = arg;
+ dmu_tx_t *tx = ka->tx;
+
+ if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
+ BP_IS_EMBEDDED(bp))
+ return (0);
+
+ if (zb->zb_level == ZB_ZIL_LEVEL) {
+ ASSERT(zilog != NULL);
+ /*
+ * It's a block in the intent log. It has no
+ * accounting, so just free it.
+ */
+ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
+ } else {
+ ASSERT(zilog == NULL);
+ ASSERT3U(bp->blk_birth, >,
+ dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
+ (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
+ }
+
+ return (0);
+}
+
+static void
+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ struct killarg ka;
+
+ spa_history_log_internal_ds(ds, "destroy", tx,
+ "(synchronous, mintxg=%llu)",
+ (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg);
+
+ /*
+ * Free everything that we point to (that's born after
+ * the previous snapshot, if we are a clone)
+ *
+ * NB: this should be very quick, because we already
+ * freed all the objects in open context.
+ */
+ ka.ds = ds;
+ ka.tx = tx;
+ VERIFY0(traverse_dataset(ds,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST |
+ TRAVERSE_NO_DECRYPT, kill_blkptr, &ka));
+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+ dsl_dataset_phys(ds)->ds_unique_bytes == 0);
+}
+
+int
+dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
+{
+ int error;
+ uint64_t count;
+ objset_t *mos;
+
+ ASSERT(!ds->ds_is_snapshot);
+ if (ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_refcount_count(&ds->ds_longholds) != expected_holds)
+ return (SET_ERROR(EBUSY));
+
+ ASSERT0(ds->ds_dir->dd_activity_waiters);
+
+ mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+ /*
+ * Can't delete a head dataset if there are snapshots of it.
+ * (Except if the only snapshots are from the branch we cloned
+ * from.)
+ */
+ if (ds->ds_prev != NULL &&
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Can't delete if there are children of this fs.
+ */
+ error = zap_count(mos,
+ dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
+ if (error != 0)
+ return (error);
+ if (count != 0)
+ return (SET_ERROR(EEXIST));
+
+ if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
+ ds->ds_prev->ds_userrefs == 0) {
+ /* We need to remove the origin snapshot as well. */
+ if (!zfs_refcount_is_zero(&ds->ds_prev->ds_longholds))
+ return (SET_ERROR(EBUSY));
+ }
+ return (0);
+}
+
+int
+dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_head_arg_t *ddha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ error = dsl_destroy_head_check_impl(ds, 0);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
+
+static void
+dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ dd_used_t t;
+
+ ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
+
+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
+
+ ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
+
+ /* Decrement the filesystem count for all parent filesystems. */
+ if (dd->dd_parent != NULL)
+ dsl_fs_ss_count_adjust(dd->dd_parent, -1,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+
+ /*
+ * Remove our reservation. The impl() routine avoids setting the
+ * actual property, which would require the (already destroyed) ds.
+ */
+ dsl_dir_set_reservation_sync_impl(dd, 0, tx);
+
+ ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
+ ASSERT0(dsl_dir_phys(dd)->dd_reserved);
+ for (t = 0; t < DD_USED_NUM; t++)
+ ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
+
+ if (dd->dd_crypto_obj != 0) {
+ dsl_crypto_key_destroy_sync(dd->dd_crypto_obj, tx);
+ (void) spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object);
+ }
+
+ VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
+ VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
+ if (dsl_dir_phys(dd)->dd_clones != 0)
+ VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_clones, tx));
+ VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
+ VERIFY0(zap_remove(mos,
+ dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+ dd->dd_myname, tx));
+
+ dsl_dir_rele(dd, FTAG);
+ dmu_object_free_zapified(mos, ddobj, tx);
+}
+
+static void
+dsl_clone_destroy_assert(dsl_dir_t *dd)
+{
+ uint64_t used, comp, uncomp;
+
+ ASSERT(dsl_dir_is_clone(dd));
+ dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
+
+ ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used);
+ ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp);
+ /*
+ * Greater than or equal because we do not track embedded block
+ * pointers in the livelist
+ */
+ ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp);
+
+ ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list));
+ ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list));
+}
+
+/*
+ * Start the delete process for a clone. Free its zil, verify the space usage
+ * and queue the blkptrs for deletion by adding the livelist to the pool-wide
+ * delete queue.
+ */
+static void
+dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t zap_obj, to_delete, used, comp, uncomp;
+ objset_t *os;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+
+ uint64_t mintxg = 0;
+ dsl_deadlist_entry_t *dle = dsl_deadlist_first(&dd->dd_livelist);
+ if (dle != NULL)
+ mintxg = dle->dle_mintxg;
+
+ spa_history_log_internal_ds(ds, "destroy", tx,
+ "(livelist, mintxg=%llu)", (long long)mintxg);
+
+ /* Check that the clone is in a correct state to be deleted */
+ dsl_clone_destroy_assert(dd);
+
+ /* Destroy the zil */
+ zil_destroy_sync(dmu_objset_zil(os), tx);
+
+ VERIFY0(zap_lookup(mos, dd->dd_object,
+ DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete));
+ /* Initialize deleted_clones entry to track livelists to cleanup */
+ int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
+ if (error == ENOENT) {
+ zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA,
+ DMU_OT_NONE, 0, tx);
+ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1,
+ &(zap_obj), tx));
+ spa->spa_livelists_to_delete = zap_obj;
+ } else if (error != 0) {
+ zfs_panic_recover("zfs: error %d was returned while looking "
+ "up DMU_POOL_DELETED_CLONES in the zap", error);
+ return;
+ }
+ VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx));
+
+ /* Clone is no longer using space, now tracked by dp_free_dir */
+ dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
+ dsl_dir_diduse_space(dd, DD_USED_HEAD,
+ -used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes,
+ tx);
+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+ used, comp, uncomp, tx);
+ dsl_dir_remove_livelist(dd, tx, B_FALSE);
+ zthr_wakeup(spa->spa_livelist_delete_zthr);
+}
+
+/*
+ * Move the bptree into the pool's list of trees to clean up, update space
+ * accounting information and destroy the zil.
+ */
+static void
+dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t used, comp, uncomp;
+ objset_t *os;
+
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+
+ spa_history_log_internal_ds(ds, "destroy", tx,
+ "(bptree, mintxg=%llu)",
+ (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg);
+
+ zil_destroy_sync(dmu_objset_zil(os), tx);
+
+ if (!spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_ASYNC_DESTROY)) {
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
+ tx);
+ dp->dp_bptree_obj = bptree_alloc(mos, tx);
+ VERIFY0(zap_add(mos,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+ &dp->dp_bptree_obj, tx));
+ ASSERT(!scn->scn_async_destroying);
+ scn->scn_async_destroying = B_TRUE;
+ }
+
+ used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
+ comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
+ uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
+
+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+ dsl_dataset_phys(ds)->ds_unique_bytes == used);
+
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ bptree_add(mos, dp->dp_bptree_obj,
+ &dsl_dataset_phys(ds)->ds_bp,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ used, comp, uncomp, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+ -used, -comp, -uncomp, tx);
+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+ used, comp, uncomp, tx);
+}
+
+void
+dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t obj, ddobj, prevobj = 0;
+ boolean_t rmorigin;
+
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+ ASSERT(ds->ds_prev == NULL ||
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ dsl_dir_cancel_waiters(ds->ds_dir);
+
+ rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
+ DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
+ ds->ds_prev->ds_userrefs == 0);
+
+ /* Remove our reservation. */
+ if (ds->ds_reserved != 0) {
+ dsl_dataset_set_refreservation_sync_impl(ds,
+ (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+ 0, tx);
+ ASSERT0(ds->ds_reserved);
+ }
+
+ obj = ds->ds_object;
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (dsl_dataset_feature_is_active(ds, f))
+ dsl_dataset_deactivate_feature(ds, f, tx);
+ }
+
+ dsl_scan_ds_destroyed(ds, tx);
+
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ /* This is a clone */
+ ASSERT(ds->ds_prev != NULL);
+ ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
+ obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
+
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
+ dsl_dataset_remove_from_next_clones(ds->ds_prev,
+ obj, tx);
+ }
+
+ ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
+ }
+
+ /*
+ * Destroy the deadlist. Unless it's a clone, the
+ * deadlist should be empty since the dataset has no snapshots.
+ * (If it's a clone, it's safe to ignore the deadlist contents
+ * since they are still referenced by the origin snapshot.)
+ */
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+
+ if (dsl_dataset_remap_deadlist_exists(ds))
+ dsl_dataset_destroy_remap_deadlist(ds, tx);
+
+ /*
+ * Each destroy is responsible for destroying (or enqueuing for
+ * destruction) both the blkptrs comprising the dataset and
+ * those belonging to the zil.
+ */
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
+ dsl_async_clone_destroy(ds, tx);
+ } else if (spa_feature_is_enabled(dp->dp_spa,
+ SPA_FEATURE_ASYNC_DESTROY)) {
+ dsl_async_dataset_destroy(ds, tx);
+ } else {
+ old_synchronous_dataset_destroy(ds, tx);
+ }
+
+ if (ds->ds_prev != NULL) {
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY0(zap_remove_int(mos,
+ dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
+ ds->ds_object, tx));
+ }
+ prevobj = ds->ds_prev->ds_object;
+ dsl_dataset_rele(ds->ds_prev, ds);
+ ds->ds_prev = NULL;
+ }
+
+ /*
+ * This must be done after the dsl_traverse(), because it will
+ * re-open the objset.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
+ /* Erase the link in the dir */
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
+ ddobj = ds->ds_dir->dd_object;
+ ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
+ VERIFY0(zap_destroy(mos,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
+
+ if (ds->ds_bookmarks_obj != 0) {
+ void *cookie = NULL;
+ dsl_bookmark_node_t *dbn;
+
+ while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) !=
+ NULL) {
+ if (dbn->dbn_phys.zbm_redaction_obj != 0) {
+ VERIFY0(dmu_object_free(mos,
+ dbn->dbn_phys.zbm_redaction_obj, tx));
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_REDACTION_BOOKMARKS, tx);
+ }
+ if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_BOOKMARK_WRITTEN, tx);
+ }
+ spa_strfree(dbn->dbn_name);
+ mutex_destroy(&dbn->dbn_lock);
+ kmem_free(dbn, sizeof (*dbn));
+ }
+ avl_destroy(&ds->ds_bookmarks);
+ VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx));
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+ }
+
+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+ ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
+ dsl_dir_rele(ds->ds_dir, ds);
+ ds->ds_dir = NULL;
+ dmu_object_free_zapified(mos, obj, tx);
+
+ dsl_dir_destroy_sync(ddobj, tx);
+
+ if (rmorigin) {
+ dsl_dataset_t *prev;
+ VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
+ dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
+ dsl_dataset_rele(prev, FTAG);
+ }
+}
+
+void
+dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_head_arg_t *ddha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+ dsl_destroy_head_sync_impl(ds, tx);
+ zvol_remove_minors(dp->dp_spa, ddha->ddha_name, B_TRUE);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+static void
+dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_head_arg_t *ddha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+
+ /* Mark it as inconsistent on-disk, in case we crash */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ spa_history_log_internal_ds(ds, "destroy begin", tx, " ");
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_destroy_head(const char *name)
+{
+ dsl_destroy_head_arg_t ddha;
+ int error;
+ spa_t *spa;
+ boolean_t isenabled;
+
+#ifdef _KERNEL
+ zfs_destroy_unmount_origin(name);
+#endif
+
+ error = spa_open(name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
+ spa_close(spa, FTAG);
+
+ ddha.ddha_name = name;
+
+ if (!isenabled) {
+ objset_t *os;
+
+ error = dsl_sync_task(name, dsl_destroy_head_check,
+ dsl_destroy_head_begin_sync, &ddha,
+ 0, ZFS_SPACE_CHECK_DESTROY);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Head deletion is processed in one txg on old pools;
+ * remove the objects from open context so that the txg sync
+ * is not too long. This optimization can only work for
+ * encrypted datasets if the wrapping key is loaded.
+ */
+ error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_TRUE,
+ FTAG, &os);
+ if (error == 0) {
+ uint64_t prev_snap_txg =
+ dsl_dataset_phys(dmu_objset_ds(os))->
+ ds_prev_snap_txg;
+ for (uint64_t obj = 0; error == 0;
+ error = dmu_object_next(os, &obj, FALSE,
+ prev_snap_txg))
+ (void) dmu_free_long_object(os, obj);
+ /* sync out all frees */
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ }
+ }
+
+ return (dsl_sync_task(name, dsl_destroy_head_check,
+ dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY));
+}
+
+/*
+ * Note, this function is used as the callback for dmu_objset_find(). We
+ * always return 0 so that we will continue to find and process
+ * inconsistent datasets, even if we encounter an error trying to
+ * process one of them.
+ */
+/* ARGSUSED */
+int
+dsl_destroy_inconsistent(const char *dsname, void *arg)
+{
+ objset_t *os;
+
+ if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
+ boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+
+ /*
+ * If the dataset is inconsistent because a resumable receive
+ * has failed, then do not destroy it.
+ */
+ if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
+ need_destroy = B_FALSE;
+
+ dmu_objset_rele(os, FTAG);
+ if (need_destroy)
+ (void) dsl_destroy_head(dsname);
+ }
+ return (0);
+}
+
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dsl_destroy_head);
+EXPORT_SYMBOL(dsl_destroy_head_sync_impl);
+EXPORT_SYMBOL(dsl_dataset_user_hold_check_one);
+EXPORT_SYMBOL(dsl_destroy_snapshot_sync_impl);
+EXPORT_SYMBOL(dsl_destroy_inconsistent);
+EXPORT_SYMBOL(dsl_dataset_user_release_tmp);
+EXPORT_SYMBOL(dsl_destroy_head_check_impl);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c
new file mode 100644
index 000000000000..90dd787023be
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c
@@ -0,0 +1,2403 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/metaslab.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/sunddi.h>
+#include <sys/zfeature.h>
+#include <sys/policy.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zvol.h>
+#include <sys/zthr.h>
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+
+/*
+ * Filesystem and Snapshot Limits
+ * ------------------------------
+ *
+ * These limits are used to restrict the number of filesystems and/or snapshots
+ * that can be created at a given level in the tree or below. A typical
+ * use-case is with a delegated dataset where the administrator wants to ensure
+ * that a user within the zone is not creating too many additional filesystems
+ * or snapshots, even though they're not exceeding their space quota.
+ *
+ * The filesystem and snapshot counts are stored as extensible properties. This
+ * capability is controlled by a feature flag and must be enabled to be used.
+ * Once enabled, the feature is not active until the first limit is set. At
+ * that point, future operations to create/destroy filesystems or snapshots
+ * will validate and update the counts.
+ *
+ * Because the count properties will not exist before the feature is active,
+ * the counts are updated when a limit is first set on an uninitialized
+ * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
+ * all of the nested filesystems/snapshots. Thus, a new leaf node has a
+ * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
+ * snapshot count properties on a node indicate uninitialized counts on that
+ * node.) When first setting a limit on an uninitialized node, the code starts
+ * at the filesystem with the new limit and descends into all sub-filesystems
+ * to add the count properties.
+ *
+ * In practice this is lightweight since a limit is typically set when the
+ * filesystem is created and thus has no children. Once valid, changing the
+ * limit value won't require a re-traversal since the counts are already valid.
+ * When recursively fixing the counts, if a node with a limit is encountered
+ * during the descent, the counts are known to be valid and there is no need to
+ * descend into that filesystem's children. The counts on filesystems above the
+ * one with the new limit will still be uninitialized, unless a limit is
+ * eventually set on one of those filesystems. Setting a limit always
+ * recursively initializes the counts, unless they have already been
+ * initialized by an earlier limit.
+ * When a new limit value is set on a filesystem with an existing limit, it is
+ * possible for the new limit to be less than the current count at that level
+ * since a user who can change the limit is also allowed to exceed the limit.
+ *
+ * Once the feature is active, then whenever a filesystem or snapshot is
+ * created, the code recurses up the tree, validating the new count against the
+ * limit at each initialized level. In practice, most levels will not have a
+ * limit set. If there is a limit at any initialized level up the tree, the
+ * check must pass or the creation will fail. Likewise, when a filesystem or
+ * snapshot is destroyed, the counts are recursively adjusted all the way up
+ * the initialized nodes in the tree. Renaming a filesystem to a different
+ * point in the tree will first validate, then update the counts on each
+ * branch up to the common ancestor. A receive will also validate the counts
+ * and then update them.
+ *
+ * An exception to the above behavior is that the limit is not enforced if the
+ * user has permission to modify the limit. This is primarily so that
+ * recursive snapshots in the global zone always work. We want to prevent a
+ * denial-of-service in which a lower level delegated dataset could max out its
+ * limit and thus block recursive snapshots from being taken in the global zone.
+ * Because of this, it is possible for the snapshot count to be over the limit
+ * and snapshots taken in the global zone could cause a lower level dataset to
+ * hit or exceed its limit. The administrator taking the global zone recursive
+ * snapshot should be aware of this side-effect and behave accordingly.
+ * For consistency, the filesystem limit is also not enforced if the user can
+ * modify the limit.
+ *
+ * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
+ * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
+ * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
+ * dsl_dir_init_fs_ss_count().
+ */
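+
+/*
+ * For example (illustrative names and numbers only): given pool/a with
+ * children pool/a/b and pool/a/c, where pool/a/c has one snapshot,
+ * initializing the counts at pool/a yields:
+ *
+ *   pool/a     filesystem count = 2   snapshot count = 1
+ *   pool/a/b   filesystem count = 0   snapshot count = 0
+ *   pool/a/c   filesystem count = 0   snapshot count = 1
+ *
+ * A later snapshot of pool/a/b is then validated against any snapshot limit
+ * on pool/a/b and pool/a before the snapshot counts on both are incremented.
+ */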
+
+extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
+
+static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
+
+typedef struct ddulrt_arg {
+ dsl_dir_t *ddulrta_dd;
+ uint64_t ddlrta_txg;
+} ddulrt_arg_t;
+
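+/*
+ * dmu_buf user-eviction callback: tear down the in-core dsl_dir_t once its
+ * bonus buffer has been evicted. This runs from the eviction taskq, so the
+ * parent dir and spa references are dropped with the async variants.
+ */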
+static void
+dsl_dir_evict_async(void *dbu)
+{
+ dsl_dir_t *dd = dbu;
+ int t;
+ dsl_pool_t *dp __maybe_unused = dd->dd_pool;
+
+ dd->dd_dbuf = NULL;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
+ ASSERT(dd->dd_tempreserved[t] == 0);
+ ASSERT(dd->dd_space_towrite[t] == 0);
+ }
+
+ if (dd->dd_parent)
+ dsl_dir_async_rele(dd->dd_parent, dd);
+
+ spa_async_close(dd->dd_pool->dp_spa, dd);
+
+ if (dsl_deadlist_is_open(&dd->dd_livelist))
+ dsl_dir_livelist_close(dd);
+
+ dsl_prop_fini(dd);
+ cv_destroy(&dd->dd_activity_cv);
+ mutex_destroy(&dd->dd_activity_lock);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+}
+
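+/*
+ * Hold the dsl_dir_t for directory object "ddobj". On the first hold the
+ * in-core dsl_dir_t is constructed from the MOS and cached as the bonus
+ * buffer's user data. "tail", if non-NULL, supplies the dir's name and
+ * avoids a reverse lookup in the parent's child-dir ZAP.
+ */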
+int
+dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **ddp)
+{
+ dmu_buf_t *dbuf;
+ dsl_dir_t *dd;
+ dmu_object_info_t doi;
+ int err;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
+ if (err != 0)
+ return (err);
+ dd = dmu_buf_get_user(dbuf);
+
+ dmu_object_info_from_db(dbuf, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
+ ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
+
+ if (dd == NULL) {
+ dsl_dir_t *winner;
+
+ dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
+ dd->dd_object = ddobj;
+ dd->dd_dbuf = dbuf;
+ dd->dd_pool = dp;
+
+ mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL);
+ dsl_prop_init(dd);
+
+ if (dsl_dir_is_zapified(dd)) {
+ err = zap_lookup(dp->dp_meta_objset,
+ ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
+ sizeof (uint64_t), 1, &dd->dd_crypto_obj);
+ if (err == 0) {
+ /* check for on-disk format errata */
+ if (dsl_dir_incompatible_encryption_version(
+ dd)) {
+ dp->dp_spa->spa_errata =
+ ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
+ }
+ } else if (err != ENOENT) {
+ goto errout;
+ }
+ }
+
+ dsl_dir_snap_cmtime_update(dd);
+
+ if (dsl_dir_phys(dd)->dd_parent_obj) {
+ err = dsl_dir_hold_obj(dp,
+ dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
+ &dd->dd_parent);
+ if (err != 0)
+ goto errout;
+ if (tail) {
+#ifdef ZFS_DEBUG
+ uint64_t foundobj;
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(dd->dd_parent)->
+ dd_child_dir_zapobj, tail,
+ sizeof (foundobj), 1, &foundobj);
+ ASSERT(err || foundobj == ddobj);
+#endif
+ (void) strlcpy(dd->dd_myname, tail,
+ sizeof (dd->dd_myname));
+ } else {
+ err = zap_value_search(dp->dp_meta_objset,
+ dsl_dir_phys(dd->dd_parent)->
+ dd_child_dir_zapobj,
+ ddobj, 0, dd->dd_myname);
+ }
+ if (err != 0)
+ goto errout;
+ } else {
+ (void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa),
+ sizeof (dd->dd_myname));
+ }
+
+ if (dsl_dir_is_clone(dd)) {
+ dmu_buf_t *origin_bonus;
+ dsl_dataset_phys_t *origin_phys;
+
+ /*
+ * We can't open the origin dataset, because
+ * that would require opening this dsl_dir.
+ * Just look at its phys directly instead.
+ */
+ err = dmu_bonus_hold(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG,
+ &origin_bonus);
+ if (err != 0)
+ goto errout;
+ origin_phys = origin_bonus->db_data;
+ dd->dd_origin_txg =
+ origin_phys->ds_creation_txg;
+ dmu_buf_rele(origin_bonus, FTAG);
+ if (dsl_dir_is_zapified(dd)) {
+ uint64_t obj;
+ err = zap_lookup(dp->dp_meta_objset,
+ dd->dd_object, DD_FIELD_LIVELIST,
+ sizeof (uint64_t), 1, &obj);
+ if (err == 0)
+ dsl_dir_livelist_open(dd, obj);
+ else if (err != ENOENT)
+ goto errout;
+ }
+ }
+
+ dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
+ &dd->dd_dbuf);
+ winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
+ if (winner != NULL) {
+ if (dd->dd_parent)
+ dsl_dir_rele(dd->dd_parent, dd);
+ if (dsl_deadlist_is_open(&dd->dd_livelist))
+ dsl_dir_livelist_close(dd);
+ dsl_prop_fini(dd);
+ cv_destroy(&dd->dd_activity_cv);
+ mutex_destroy(&dd->dd_activity_lock);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dd = winner;
+ } else {
+ spa_open_ref(dp->dp_spa, dd);
+ }
+ }
+
+ /*
+ * The dsl_dir_t has both open-to-close and instantiate-to-evict
+ * holds on the spa. We need the open-to-close holds because
+ * otherwise the spa_refcnt wouldn't change when we open a
+ * dir which the spa also has open, so we could incorrectly
+ * think it was OK to unload/export/destroy the pool. We need
+ * the instantiate-to-evict hold because the dsl_dir_t has a
+ * pointer to the dd_pool, which has a pointer to the spa_t.
+ */
+ spa_open_ref(dp->dp_spa, tag);
+ ASSERT3P(dd->dd_pool, ==, dp);
+ ASSERT3U(dd->dd_object, ==, ddobj);
+ ASSERT3P(dd->dd_dbuf, ==, dbuf);
+ *ddp = dd;
+ return (0);
+
+errout:
+ if (dd->dd_parent)
+ dsl_dir_rele(dd->dd_parent, dd);
+ if (dsl_deadlist_is_open(&dd->dd_livelist))
+ dsl_dir_livelist_close(dd);
+ dsl_prop_fini(dd);
+ cv_destroy(&dd->dd_activity_cv);
+ mutex_destroy(&dd->dd_activity_lock);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+}
+
+void
+dsl_dir_rele(dsl_dir_t *dd, void *tag)
+{
+ dprintf_dd(dd, "%s\n", "");
+ spa_close(dd->dd_pool->dp_spa, tag);
+ dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/*
+ * Remove a reference to the given dsl dir that is being asynchronously
+ * released. Async releases occur from a taskq performing eviction of
+ * dsl datasets and dirs. This process is identical to a normal release
+ * with the exception of using the async API for releasing the reference on
+ * the spa.
+ */
+void
+dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
+{
+ dprintf_dd(dd, "%s\n", "");
+ spa_async_close(dd->dd_pool->dp_spa, tag);
+ dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
+void
+dsl_dir_name(dsl_dir_t *dd, char *buf)
+{
+ if (dd->dd_parent) {
+ dsl_dir_name(dd->dd_parent, buf);
+ VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
+ } else {
+ buf[0] = '\0';
+ }
+ if (!MUTEX_HELD(&dd->dd_lock)) {
+ /*
+ * recursive mutex so that we can use
+ * dprintf_dd() with dd_lock held
+ */
+ mutex_enter(&dd->dd_lock);
+ VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ mutex_exit(&dd->dd_lock);
+ } else {
+ VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ }
+}
+
+/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
+int
+dsl_dir_namelen(dsl_dir_t *dd)
+{
+ int result = 0;
+
+ if (dd->dd_parent) {
+ /* parent's name + 1 for the "/" */
+ result = dsl_dir_namelen(dd->dd_parent) + 1;
+ }
+
+ if (!MUTEX_HELD(&dd->dd_lock)) {
+ /* see dsl_dir_name */
+ mutex_enter(&dd->dd_lock);
+ result += strlen(dd->dd_myname);
+ mutex_exit(&dd->dd_lock);
+ } else {
+ result += strlen(dd->dd_myname);
+ }
+
+ return (result);
+}
+
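+/*
+ * Copy the first component of "path" (the text up to the first '/' or '@')
+ * into "component", and set *nextp to the remainder of the name (beginning
+ * with '@' if the next component is a snapshot), or to NULL if this was the
+ * last component. Returns ENOENT for an empty path, EINVAL for a malformed
+ * name, and ENAMETOOLONG if a component is too long.
+ */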
+static int
+getcomponent(const char *path, char *component, const char **nextp)
+{
+ char *p;
+
+ if ((path == NULL) || (path[0] == '\0'))
+ return (SET_ERROR(ENOENT));
+ /* This would be a good place to reserve some namespace... */
+ p = strpbrk(path, "/@");
+ if (p && (p[1] == '/' || p[1] == '@')) {
+ /* two separators in a row */
+ return (SET_ERROR(EINVAL));
+ }
+ if (p == NULL || p == path) {
+ /*
+ * if the first thing is an @ or /, it had better be an
+ * @ and it had better not have any more ats or slashes,
+ * and it had better have something after the @.
+ */
+ if (p != NULL &&
+ (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
+ return (SET_ERROR(EINVAL));
+ if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN);
+ p = NULL;
+ } else if (p[0] == '/') {
+ if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strncpy(component, path, p - path);
+ component[p - path] = '\0';
+ p++;
+ } else if (p[0] == '@') {
+ /*
+ * if the next separator is an @, there better not be
+ * any more slashes.
+ */
+ if (strchr(path, '/'))
+ return (SET_ERROR(EINVAL));
+ if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strncpy(component, path, p - path);
+ component[p - path] = '\0';
+ } else {
+ panic("invalid p=%p", (void *)p);
+ }
+ *nextp = p;
+ return (0);
+}
+
+/*
+ * Hold the dsl_dir_t for "name" in *ddp, and possibly return the last
+ * component which couldn't be found in *tailp. The name must be in the
+ * specified dsl_pool_t. This thread must hold the dp_config_rwlock for the
+ * pool. Returns an error if the path is bogus, or if tailp == NULL and we
+ * couldn't parse the whole name. (*tailp)[0] == '@' means that the last
+ * component is a snapshot.
+ */
+int
+dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
+ dsl_dir_t **ddp, const char **tailp)
+{
+ char *buf;
+ const char *spaname, *next, *nextnext = NULL;
+ int err;
+ dsl_dir_t *dd;
+ uint64_t ddobj;
+
+ buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ err = getcomponent(name, buf, &next);
+ if (err != 0)
+ goto error;
+
+ /* Make sure the name is in the specified pool. */
+ spaname = spa_name(dp->dp_spa);
+ if (strcmp(buf, spaname) != 0) {
+ err = SET_ERROR(EXDEV);
+ goto error;
+ }
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
+ if (err != 0) {
+ goto error;
+ }
+
+ while (next != NULL) {
+ dsl_dir_t *child_dd;
+ err = getcomponent(next, buf, &nextnext);
+ if (err != 0)
+ break;
+ ASSERT(next[0] != '\0');
+ if (next[0] == '@')
+ break;
+ dprintf("looking up %s in obj%lld\n",
+ buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj,
+ buf, sizeof (ddobj), 1, &ddobj);
+ if (err != 0) {
+ if (err == ENOENT)
+ err = 0;
+ break;
+ }
+
+ err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
+ if (err != 0)
+ break;
+ dsl_dir_rele(dd, tag);
+ dd = child_dd;
+ next = nextnext;
+ }
+
+ if (err != 0) {
+ dsl_dir_rele(dd, tag);
+ goto error;
+ }
+
+ /*
+ * It's an error if there's more than one component left, or
+ * tailp==NULL and there's any component left.
+ */
+ if (next != NULL &&
+ (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
+ /* bad path name */
+ dsl_dir_rele(dd, tag);
+ dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
+ err = SET_ERROR(ENOENT);
+ }
+ if (tailp != NULL)
+ *tailp = next;
+ if (err == 0)
+ *ddp = dd;
+error:
+ kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN);
+ return (err);
+}
+
+/*
+ * If the counts are already initialized for this filesystem and its
+ * descendants then do nothing, otherwise initialize the counts.
+ *
+ * The counts on this filesystem, and those below, may be uninitialized due to
+ * either the use of a pre-existing pool which did not support the
+ * filesystem/snapshot limit feature, or one in which the feature had not yet
+ * been enabled.
+ *
+ * Recursively descend the filesystem tree and update the filesystem/snapshot
+ * counts on each filesystem below, then update the cumulative count on the
+ * current filesystem. If the filesystem already has a count set on it,
+ * then we know that its counts, and the counts on the filesystems below it,
+ * are already correct, so we don't have to update this filesystem.
+ */
+static void
+dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ uint64_t my_fs_cnt = 0;
+ uint64_t my_ss_cnt = 0;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *os = dp->dp_meta_objset;
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+ dsl_dataset_t *ds;
+
+ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
+ ASSERT(dsl_pool_config_held(dp));
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dsl_dir_zapify(dd, tx);
+
+ /*
+ * If the filesystem count has already been initialized then we
+ * don't need to recurse down any further.
+ */
+ if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
+ return;
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /* Iterate my child dirs */
+ for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
+ dsl_dir_t *chld_dd;
+ uint64_t count;
+
+ VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
+ &chld_dd));
+
+ /*
+ * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets.
+ */
+ if (chld_dd->dd_myname[0] == '$') {
+ dsl_dir_rele(chld_dd, FTAG);
+ continue;
+ }
+
+ my_fs_cnt++; /* count this child */
+
+ dsl_dir_init_fs_ss_count(chld_dd, tx);
+
+ VERIFY0(zap_lookup(os, chld_dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
+ my_fs_cnt += count;
+ VERIFY0(zap_lookup(os, chld_dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
+ my_ss_cnt += count;
+
+ dsl_dir_rele(chld_dd, FTAG);
+ }
+ zap_cursor_fini(zc);
+ /* Count my snapshots (we counted children's snapshots above) */
+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+ dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
+
+ for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ /* Don't count temporary snapshots */
+ if (za->za_name[0] != '%')
+ my_ss_cnt++;
+ }
+ zap_cursor_fini(zc);
+
+ dsl_dataset_rele(ds, FTAG);
+
+ kmem_free(zc, sizeof (zap_cursor_t));
+ kmem_free(za, sizeof (zap_attribute_t));
+
+ /* we're in a sync task, update counts */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
+}
+
+static int
+dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
+{
+ char *ddname = (char *)arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ dsl_dir_t *dd;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ dd = ds->ds_dir;
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
+ dsl_dir_is_zapified(dd) &&
+ zap_contains(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT) == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EALREADY));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
+{
+ char *ddname = (char *)arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ spa_t *spa;
+
+ VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
+
+ spa = dsl_dataset_get_spa(ds);
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ /*
+ * Since the feature was not active and we're now setting a
+ * limit, increment the feature-active counter so that the
+ * feature becomes active for the first time.
+ *
+ * We are already in a sync task so we can update the MOS.
+ */
+ spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
+ }
+
+ /*
+ * Since we are now setting a non-UINT64_MAX limit on the filesystem,
+ * we need to ensure the counts are correct. Descend down the tree from
+ * this point and update all of the counts to be accurate.
+ */
+ dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Make sure the feature is enabled and activate it if necessary.
+ * Since we're setting a limit, ensure the on-disk counts are valid.
+ * This is only called by the ioctl path when setting a limit value.
+ *
+ * We do not need to validate the new limit, since users who can change the
+ * limit are also allowed to exceed the limit.
+ */
+int
+dsl_dir_activate_fs_ss_limit(const char *ddname)
+{
+ int error;
+
+ error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
+ dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
+ ZFS_SPACE_CHECK_RESERVED);
+
+ if (error == EALREADY)
+ error = 0;
+
+ return (error);
+}
+
+/*
+ * Used to determine if the filesystem_limit or snapshot_limit should be
+ * enforced. We allow the limit to be exceeded if the user has permission to
+ * write the property value. We pass in the creds that we got in the open
+ * context since we will always be the GZ root in syncing context. We also have
+ * to handle the case where we are allowed to change the limit on the current
+ * dataset, but there may be another limit in the tree above.
+ *
+ * We can never modify these two properties within a non-global zone. In
+ * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
+ * can't use that function since we are already holding the dp_config_rwlock.
+ * Finally, we already have the dd, and dealing with snapshots is simplified
+ * in this code.
+ */
+
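+/*
+ * ENFORCE_ALWAYS: enforce the limit at this level of the tree.
+ * ENFORCE_NEVER: the caller may change the limit, so no limit is enforced
+ * anywhere.
+ * ENFORCE_ABOVE: the caller may change the limit on this dataset, so only
+ * the limits of its ancestors are checked.
+ */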
+typedef enum {
+ ENFORCE_ALWAYS,
+ ENFORCE_NEVER,
+ ENFORCE_ABOVE
+} enforce_res_t;
+
+static enforce_res_t
+dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop,
+ cred_t *cr, proc_t *proc)
+{
+ enforce_res_t enforce = ENFORCE_ALWAYS;
+ uint64_t obj;
+ dsl_dataset_t *ds;
+ uint64_t zoned;
+ const char *zonedstr;
+
+ ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+ prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+#ifdef _KERNEL
+ if (crgetzoneid(cr) != GLOBAL_ZONEID)
+ return (ENFORCE_ALWAYS);
+
+ /*
+ * We are checking the saved credentials of the user process, which is
+ * not the current process. Note that we can't use secpolicy_zfs(),
+ * because it only works if the cred is that of the current process (on
+ * Linux).
+ */
+ if (secpolicy_zfs_proc(cr, proc) == 0)
+ return (ENFORCE_NEVER);
+#endif
+
+ if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
+ return (ENFORCE_ALWAYS);
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+
+ if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
+ return (ENFORCE_ALWAYS);
+
+ zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED);
+ if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) {
+ /* Only root can access zoned fs's from the GZ */
+ enforce = ENFORCE_ALWAYS;
+ } else {
+ if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
+ enforce = ENFORCE_ABOVE;
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (enforce);
+}
+
+/*
+ * Check if adding additional child filesystem(s) would exceed any filesystem
+ * limits or adding additional snapshot(s) would exceed any snapshot limits.
+ * The prop argument indicates which limit to check.
+ *
+ * Note that all filesystem limits up to the root (or the highest
+ * initialized) filesystem or the given ancestor must be satisfied.
+ */
+int
+dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
+ dsl_dir_t *ancestor, cred_t *cr, proc_t *proc)
+{
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t limit, count;
+ char *count_prop;
+ enforce_res_t enforce;
+ int err = 0;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+ ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+ prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+ /*
+ * If we're allowed to change the limit, don't enforce the limit; e.g.,
+ * this can happen if a snapshot is taken by an administrative
+ * user in the global zone (i.e. a recursive snapshot by root).
+ * However, we must handle the case of delegated permissions where we
+ * are allowed to change the limit on the current dataset, but there
+ * is another limit in the tree above.
+ */
+ enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc);
+ if (enforce == ENFORCE_NEVER)
+ return (0);
+
+ /*
+ * e.g. if renaming a dataset with no snapshots, count adjustment
+ * is 0.
+ */
+ if (delta == 0)
+ return (0);
+
+ if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
+ /*
+ * We don't enforce the limit for temporary snapshots. This is
+ * indicated by a NULL cred_t argument.
+ */
+ if (cr == NULL)
+ return (0);
+
+ count_prop = DD_FIELD_SNAPSHOT_COUNT;
+ } else {
+ count_prop = DD_FIELD_FILESYSTEM_COUNT;
+ }
+
+ /*
+ * If an ancestor has been provided, stop checking the limit once we
+ * hit that dir. We need this during rename so that we don't overcount
+ * the check once we recurse up to the common ancestor.
+ */
+ if (ancestor == dd)
+ return (0);
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know there is no limit here (or above). The counts are
+ * not valid on this node and we know we won't touch this node's counts.
+ */
+ if (!dsl_dir_is_zapified(dd))
+ return (0);
+ err = zap_lookup(os, dd->dd_object,
+ count_prop, sizeof (count), 1, &count);
+ if (err == ENOENT)
+ return (0);
+ if (err != 0)
+ return (err);
+
+ err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
+ B_FALSE);
+ if (err != 0)
+ return (err);
+
+ /* Is there a limit which we've hit? */
+ if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
+ return (SET_ERROR(EDQUOT));
+
+ if (dd->dd_parent != NULL)
+ err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
+ ancestor, cr, proc);
+
+ return (err);
+}
+
+/*
+ * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
+ * parents. When a new filesystem/snapshot is created, increment the count on
+ * all parents, and when a filesystem/snapshot is destroyed, decrement the
+ * count.
+ */
+void
+dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
+ dmu_tx_t *tx)
+{
+ int err;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t count;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
+ strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
+
+ /*
+ * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets.
+ */
+ if (dd->dd_myname[0] == '$' && strcmp(prop,
+ DD_FIELD_FILESYSTEM_COUNT) == 0) {
+ return;
+ }
+
+ /*
+ * e.g. if renaming a dataset with no snapshots, count adjustment is 0
+ */
+ if (delta == 0)
+ return;
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know the counts are not valid on this node and we
+ * know we shouldn't touch this node's counts. An uninitialized count
+ * on the node indicates that either the feature has not yet been
+ * activated or there are no limits on this part of the tree.
+ */
+ if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
+ prop, sizeof (count), 1, &count)) == ENOENT)
+ return;
+ VERIFY0(err);
+
+ count += delta;
+ /* Use a signed verify to make sure we're not neg. */
+ VERIFY3S(count, >=, 0);
+
+ VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
+ tx));
+
+ /* Roll up this additional count into our ancestors */
+ if (dd->dd_parent != NULL)
+ dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
+}
+
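+/*
+ * Allocate a new DSL directory object in the MOS, link it into the parent's
+ * child-dir ZAP (or record it as the pool's root, DMU_POOL_ROOT_DATASET,
+ * when pds is NULL), and return its object number.
+ */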
+uint64_t
+dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
+ dmu_tx_t *tx)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t ddobj;
+ dsl_dir_phys_t *ddphys;
+ dmu_buf_t *dbuf;
+
+ ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
+ DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
+ if (pds) {
+ VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
+ name, sizeof (uint64_t), 1, &ddobj, tx));
+ } else {
+ /* it's the root dir */
+ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
+ }
+ VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ ddphys = dbuf->db_data;
+
+ ddphys->dd_creation_time = gethrestime_sec();
+ if (pds) {
+ ddphys->dd_parent_obj = pds->dd_object;
+
+ /* update the filesystem counts */
+ dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
+ }
+ ddphys->dd_props_zapobj = zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ ddphys->dd_child_dir_zapobj = zap_create(mos,
+ DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
+ ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
+
+ dmu_buf_rele(dbuf, FTAG);
+
+ return (ddobj);
+}
+
+boolean_t
+dsl_dir_is_clone(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_origin_obj &&
+ (dd->dd_pool->dp_origin_snap == NULL ||
+ dsl_dir_phys(dd)->dd_origin_obj !=
+ dd->dd_pool->dp_origin_snap->ds_object));
+}
+
+uint64_t
+dsl_dir_get_used(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_bytes);
+}
+
+uint64_t
+dsl_dir_get_compressed(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_compressed_bytes);
+}
+
+uint64_t
+dsl_dir_get_quota(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_quota);
+}
+
+uint64_t
+dsl_dir_get_reservation(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_reserved);
+}
+
+uint64_t
+dsl_dir_get_compressratio(dsl_dir_t *dd)
+{
+ /* a fixed-point number, 100x the ratio; e.g. 250 means 2.50x */
+ return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
+ (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
+ dsl_dir_phys(dd)->dd_compressed_bytes));
+}
+
+uint64_t
+dsl_dir_get_logicalused(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_uncompressed_bytes);
+}
+
+uint64_t
+dsl_dir_get_usedsnap(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
+}
+
+uint64_t
+dsl_dir_get_usedds(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
+}
+
+uint64_t
+dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
+}
+
+uint64_t
+dsl_dir_get_usedchild(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
+}
+
+void
+dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
+{
+ dsl_dataset_t *ds;
+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
+
+ dsl_dataset_name(ds, buf);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
+{
+ if (dsl_dir_is_zapified(dd)) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (*count), 1, count));
+ } else {
+ return (SET_ERROR(ENOENT));
+ }
+}
+
+int
+dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
+{
+ if (dsl_dir_is_zapified(dd)) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (*count), 1, count));
+ } else {
+ return (SET_ERROR(ENOENT));
+ }
+}
+
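+/*
+ * Add this dir's space accounting (quota, reservation, logicalused and the
+ * used-space breakdown), its filesystem/snapshot counts, and its clone
+ * origin (if any) to the given property nvlist.
+ */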
+void
+dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
+{
+ mutex_enter(&dd->dd_lock);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
+ dsl_dir_get_quota(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
+ dsl_dir_get_reservation(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
+ dsl_dir_get_logicalused(dd));
+ if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
+ dsl_dir_get_usedsnap(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
+ dsl_dir_get_usedds(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
+ dsl_dir_get_usedrefreserv(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
+ dsl_dir_get_usedchild(dd));
+ }
+ mutex_exit(&dd->dd_lock);
+
+ uint64_t count;
+ if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
+ count);
+ }
+ if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
+ count);
+ }
+
+ if (dsl_dir_is_clone(dd)) {
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dir_get_origin(dd, buf);
+ dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
+ }
+}
+
+void
+dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+
+ ASSERT(dsl_dir_phys(dd));
+
+ if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(dd->dd_dbuf, dd);
+ }
+}
+
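+/*
+ * Change in the space accounted for this dir, MAX(used, dd_reserved), if
+ * "delta" were applied to "used". For example, with used = 10M,
+ * dd_reserved = 30M and delta = +5M, both MAX() terms remain at the 30M
+ * reservation floor and the result is 0; with used = 40M the same delta
+ * yields 5M.
+ */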
+static int64_t
+parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
+{
+ uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
+ uint64_t new_accounted =
+ MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
+ return (new_accounted - old_accounted);
+}
+
+void
+dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ mutex_enter(&dd->dd_lock);
+ ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]);
+ dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
+ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024);
+ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0;
+ mutex_exit(&dd->dd_lock);
+
+ /* release the hold from dsl_dir_dirty */
+ dmu_buf_rele(dd->dd_dbuf, dd);
+}
+
+static uint64_t
+dsl_dir_space_towrite(dsl_dir_t *dd)
+{
+ uint64_t space = 0;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ space += dd->dd_space_towrite[i & TXG_MASK];
+ ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
+ }
+ return (space);
+}
+
+/*
+ * How much space would dd have available if ancestor had delta applied
+ * to it? If ondiskonly is set, we're only interested in what's
+ * on-disk, not estimated pending changes.
+ */
+uint64_t
+dsl_dir_space_available(dsl_dir_t *dd,
+ dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
+{
+ uint64_t parentspace, myspace, quota, used;
+
+ /*
+ * If there are no restrictions otherwise, assume we have
+ * unlimited space available.
+ */
+ quota = UINT64_MAX;
+ parentspace = UINT64_MAX;
+
+ if (dd->dd_parent != NULL) {
+ parentspace = dsl_dir_space_available(dd->dd_parent,
+ ancestor, delta, ondiskonly);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ if (dsl_dir_phys(dd)->dd_quota != 0)
+ quota = dsl_dir_phys(dd)->dd_quota;
+ used = dsl_dir_phys(dd)->dd_used_bytes;
+ if (!ondiskonly)
+ used += dsl_dir_space_towrite(dd);
+
+ if (dd->dd_parent == NULL) {
+ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
+ ZFS_SPACE_CHECK_NORMAL);
+ quota = MIN(quota, poolsize);
+ }
+
+ if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
+ /*
+ * We have some space reserved, in addition to what our
+ * parent gave us.
+ */
+ parentspace += dsl_dir_phys(dd)->dd_reserved - used;
+ }
+
+ if (dd == ancestor) {
+ ASSERT(delta <= 0);
+ ASSERT(used >= -delta);
+ used += delta;
+ if (parentspace != UINT64_MAX)
+ parentspace -= delta;
+ }
+
+ if (used > quota) {
+ /* over quota */
+ myspace = 0;
+ } else {
+ /*
+ * the lesser of the space provided by our parent and
+ * the space left in our quota
+ */
+ myspace = MIN(parentspace, quota - used);
+ }
+
+ mutex_exit(&dd->dd_lock);
+
+ return (myspace);
+}
+
+struct tempreserve {
+ list_node_t tr_node;
+ dsl_dir_t *tr_ds;
+ uint64_t tr_size;
+};
+
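+/*
+ * Reserve "asize" bytes against this dir's quota for the current txg, then
+ * walk up the tree (iteratively, to bound stack usage) reserving whatever
+ * portion of that space is visible to each ancestor. Each reservation taken
+ * is appended to tr_list so that a failure partway up can be unwound by
+ * dsl_dir_tempreserve_clear().
+ */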
+static int
+dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
+ boolean_t ignorequota, list_t *tr_list,
+ dmu_tx_t *tx, boolean_t first)
+{
+ uint64_t txg;
+ uint64_t quota;
+ struct tempreserve *tr;
+ int retval;
+ uint64_t ref_rsrv;
+
+top_of_function:
+ txg = tx->tx_txg;
+ retval = EDQUOT;
+ ref_rsrv = 0;
+
+ ASSERT3U(txg, !=, 0);
+ ASSERT3S(asize, >, 0);
+
+ mutex_enter(&dd->dd_lock);
+
+ /*
+ * Check against the dsl_dir's quota. We don't add in the delta
+ * when checking for over-quota because they get one free hit.
+ */
+ uint64_t est_inflight = dsl_dir_space_towrite(dd);
+ for (int i = 0; i < TXG_SIZE; i++)
+ est_inflight += dd->dd_tempreserved[i];
+ uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
+
+ /*
+ * On the first iteration, fetch the dataset's used-on-disk and
+ * refreservation values. Also, if checkrefquota is set, test if
+ * allocating this space would exceed the dataset's refquota.
+ */
+ if (first && tx->tx_objset) {
+ int error;
+ dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
+
+ error = dsl_dataset_check_quota(ds, !netfree,
+ asize, est_inflight, &used_on_disk, &ref_rsrv);
+ if (error != 0) {
+ mutex_exit(&dd->dd_lock);
+ DMU_TX_STAT_BUMP(dmu_tx_quota);
+ return (error);
+ }
+ }
+
+ /*
+ * If this transaction will result in a net free of space,
+ * we want to let it through.
+ */
+ if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
+ quota = UINT64_MAX;
+ else
+ quota = dsl_dir_phys(dd)->dd_quota;
+
+ /*
+ * Adjust the quota against the actual pool size at the root
+ * minus any outstanding deferred frees.
+ * To ensure that it's possible to remove files from a full
+ * pool without inducing transient overcommits, we throttle
+ * netfree transactions against a quota that is slightly larger,
+ * but still within the pool's allocation slop. In cases where
+ * we're very close to full, this will allow a steady trickle of
+ * removes to get through.
+ */
+ uint64_t deferred = 0;
+ if (dd->dd_parent == NULL) {
+ uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
+ (netfree) ?
+ ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);
+
+ if (avail < quota) {
+ quota = avail;
+ retval = SET_ERROR(ENOSPC);
+ }
+ }
+
+ /*
+ * If they are requesting more space, and our current estimate
+ * is over quota, they get to try again unless the actual
+ * on-disk is over quota and there are no pending changes (which
+ * may free up space for us).
+ */
+ if (used_on_disk + est_inflight >= quota) {
+ if (est_inflight > 0 || used_on_disk < quota ||
+ (retval == ENOSPC && used_on_disk < quota + deferred))
+ retval = ERESTART;
+ dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
+ "quota=%lluK tr=%lluK err=%d\n",
+ used_on_disk>>10, est_inflight>>10,
+ quota>>10, asize>>10, retval);
+ mutex_exit(&dd->dd_lock);
+ DMU_TX_STAT_BUMP(dmu_tx_quota);
+ return (SET_ERROR(retval));
+ }
+
+ /* We need to up our estimated delta before dropping dd_lock */
+ dd->dd_tempreserved[txg & TXG_MASK] += asize;
+
+ uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
+ asize - ref_rsrv);
+ mutex_exit(&dd->dd_lock);
+
+ tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_ds = dd;
+ tr->tr_size = asize;
+ list_insert_tail(tr_list, tr);
+
+ /* see if it's OK with our parent */
+ if (dd->dd_parent != NULL && parent_rsrv != 0) {
+ /*
+		 * Recurse on our parent without actual recursion (loop back to
+		 * top_of_function instead). Recursive calls here have been
+		 * observed to use a large amount of stack, even within the
+		 * test suite; the largest stack seen was 7632 bytes on Linux.
+ */
+
+ dd = dd->dd_parent;
+ asize = parent_rsrv;
+ ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
+ first = B_FALSE;
+ goto top_of_function;
+
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Reserve space in this dsl_dir, to be used in this tx's txg.
+ * After the space has been dirtied (and dsl_dir_willuse_space()
+ * has been called), the reservation should be canceled, using
+ * dsl_dir_tempreserve_clear().
+ */
+int
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
+ boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
+{
+ int err;
+ list_t *tr_list;
+
+ if (asize == 0) {
+ *tr_cookiep = NULL;
+ return (0);
+ }
+
+ tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(tr_list, sizeof (struct tempreserve),
+ offsetof(struct tempreserve, tr_node));
+ ASSERT3S(asize, >, 0);
+
+ err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
+ if (err == 0) {
+ struct tempreserve *tr;
+
+ tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_size = lsize;
+ list_insert_tail(tr_list, tr);
+ } else {
+ if (err == EAGAIN) {
+ /*
+ * If arc_memory_throttle() detected that pageout
+ * is running and we are low on memory, we delay new
+ * non-pageout transactions to give pageout an
+ * advantage.
+ *
+ * It is unfortunate to be delaying while the caller's
+ * locks are held.
+ */
+ txg_delay(dd->dd_pool, tx->tx_txg,
+ MSEC2NSEC(10), MSEC2NSEC(10));
+ err = SET_ERROR(ERESTART);
+ }
+ }
+
+ if (err == 0) {
+ err = dsl_dir_tempreserve_impl(dd, asize, netfree,
+ B_FALSE, tr_list, tx, B_TRUE);
+ }
+
+ if (err != 0)
+ dsl_dir_tempreserve_clear(tr_list, tx);
+ else
+ *tr_cookiep = tr_list;
+
+ return (err);
+}
+
+/*
+ * Clear a temporary reservation that we previously made with
+ * dsl_dir_tempreserve_space().
+ */
+void
+dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
+{
+ int txgidx = tx->tx_txg & TXG_MASK;
+ list_t *tr_list = tr_cookie;
+ struct tempreserve *tr;
+
+ ASSERT3U(tx->tx_txg, !=, 0);
+
+ if (tr_cookie == NULL)
+ return;
+
+ while ((tr = list_head(tr_list)) != NULL) {
+ if (tr->tr_ds) {
+ mutex_enter(&tr->tr_ds->dd_lock);
+ ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
+ tr->tr_size);
+ tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
+ mutex_exit(&tr->tr_ds->dd_lock);
+ } else {
+ arc_tempreserve_clear(tr->tr_size);
+ }
+ list_remove(tr_list, tr);
+ kmem_free(tr, sizeof (struct tempreserve));
+ }
+
+ kmem_free(tr_list, sizeof (list_t));
+}
+
+/*
+ * This should be called from open context when we think we're going to write
+ * or free space, for example when dirtying data. Be conservative; it's okay
+ * to write less space or free more, but we don't want to write more or free
+ * less than the amount specified.
+ *
+ * NOTE: The behavior of this function is identical to the Illumos / FreeBSD
+ * version; however, it has been adjusted to use an iterative rather than
+ * recursive algorithm to minimize stack usage.
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+ int64_t parent_space;
+ uint64_t est_used;
+
+ do {
+ mutex_enter(&dd->dd_lock);
+ if (space > 0)
+ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
+
+ est_used = dsl_dir_space_towrite(dd) +
+ dsl_dir_phys(dd)->dd_used_bytes;
+ parent_space = parent_delta(dd, est_used, space);
+ mutex_exit(&dd->dd_lock);
+
+ /* Make sure that we clean up dd_space_to* */
+ dsl_dir_dirty(dd, tx);
+
+ dd = dd->dd_parent;
+ space = parent_space;
+ } while (space && dd);
+}
+
+/* call from syncing context when we actually write/free space for this dd */
+void
+dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
+ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
+{
+ int64_t accounted_delta;
+
+ /*
+ * dsl_dataset_set_refreservation_sync_impl() calls this with
+ * dd_lock held, so that it can atomically update
+ * ds->ds_reserved and the dsl_dir accounting, so that
+ * dsl_dataset_check_quota() can see dataset and dir accounting
+ * consistently.
+ */
+ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(type < DD_USED_NUM);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ if (needlock)
+ mutex_enter(&dd->dd_lock);
+ accounted_delta =
+ parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
+ ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
+ ASSERT(compressed >= 0 ||
+ dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
+ ASSERT(uncompressed >= 0 ||
+ dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
+ dsl_dir_phys(dd)->dd_used_bytes += used;
+ dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
+ dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
+
+ if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ ASSERT(used > 0 ||
+ dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
+ dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
+#ifdef ZFS_DEBUG
+ {
+ dd_used_t t;
+ uint64_t u = 0;
+ for (t = 0; t < DD_USED_NUM; t++)
+ u += dsl_dir_phys(dd)->dd_used_breakdown[t];
+ ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
+ }
+#endif
+ }
+ if (needlock)
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_parent != NULL) {
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
+ accounted_delta, compressed, uncompressed, tx);
+ dsl_dir_transfer_space(dd->dd_parent,
+ used - accounted_delta,
+ DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
+ }
+}
+
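+/*
+ * Move "delta" bytes of the used-space breakdown from oldtype to newtype,
+ * e.g. from DD_USED_CHILD_RSRV to DD_USED_CHILD as a child consumes its
+ * reservation; the total dd_used_bytes is unchanged.
+ */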
+void
+dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
+ dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(oldtype < DD_USED_NUM);
+ ASSERT(newtype < DD_USED_NUM);
+
+ if (delta == 0 ||
+ !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
+ return;
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ mutex_enter(&dd->dd_lock);
+ ASSERT(delta > 0 ?
+ dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
+ dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
+ ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
+ dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
+ dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
+ mutex_exit(&dd->dd_lock);
+}
+
+typedef struct dsl_dir_set_qr_arg {
+ const char *ddsqra_name;
+ zprop_source_t ddsqra_source;
+ uint64_t ddsqra_value;
+} dsl_dir_set_qr_arg_t;
+
+static int
+dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+ uint64_t towrite, newval;
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ error = dsl_prop_predict(ds->ds_dir, "quota",
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (newval == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ mutex_enter(&ds->ds_dir->dd_lock);
+ /*
+ * If we are doing the preliminary check in open context, and
+ * there are pending changes, then don't fail it, since the
+ * pending changes could under-estimate the amount of space to be
+ * freed up.
+ */
+ towrite = dsl_dir_space_towrite(ds->ds_dir);
+ if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
+ (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
+ newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
+ error = SET_ERROR(ENOSPC);
+ }
+ mutex_exit(&ds->ds_dir->dd_lock);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
+
+static void
+dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ uint64_t newval;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+ &ddsqra->ddsqra_value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
+ } else {
+ newval = ddsqra->ddsqra_value;
+ spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
+ zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
+ }
+
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ mutex_enter(&ds->ds_dir->dd_lock);
+ dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
+ mutex_exit(&ds->ds_dir->dd_lock);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
+{
+ dsl_dir_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = ddname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = quota;
+
+ return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
+ dsl_dir_set_quota_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+static int
+dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ dsl_dir_t *dd;
+ uint64_t newval, used, avail;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+ dd = ds->ds_dir;
+
+ /*
+ * If we are doing the preliminary check in open context, the
+ * space estimates may be inaccurate.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ error = dsl_prop_predict(ds->ds_dir,
+ zfs_prop_to_name(ZFS_PROP_RESERVATION),
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ used = dsl_dir_phys(dd)->dd_used_bytes;
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_parent) {
+ avail = dsl_dir_space_available(dd->dd_parent,
+ NULL, 0, FALSE);
+ } else {
+ avail = dsl_pool_adjustedsize(dd->dd_pool,
+ ZFS_SPACE_CHECK_NORMAL) - used;
+ }
+
+ if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
+ uint64_t delta = MAX(used, newval) -
+ MAX(used, dsl_dir_phys(dd)->dd_reserved);
+
+ if (delta > avail ||
+ (dsl_dir_phys(dd)->dd_quota > 0 &&
+ newval > dsl_dir_phys(dd)->dd_quota))
+ error = SET_ERROR(ENOSPC);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
+
+void
+dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
+{
+ uint64_t used;
+ int64_t delta;
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ mutex_enter(&dd->dd_lock);
+ used = dsl_dir_phys(dd)->dd_used_bytes;
+ delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
+ dsl_dir_phys(dd)->dd_reserved = value;
+
+ if (dd->dd_parent != NULL) {
+ /* Roll up this additional usage into our ancestors */
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+ delta, 0, 0, tx);
+ }
+ mutex_exit(&dd->dd_lock);
+}
+
+static void
+dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ uint64_t newval;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_RESERVATION),
+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+ &ddsqra->ddsqra_value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
+ } else {
+ newval = ddsqra->ddsqra_value;
+ spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
+ zfs_prop_to_name(ZFS_PROP_RESERVATION),
+ (longlong_t)newval);
+ }
+
+ dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+ uint64_t reservation)
+{
+ dsl_dir_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = ddname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = reservation;
+
+ return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
+ dsl_dir_set_reservation_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+static dsl_dir_t *
+closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
+{
+ for (; ds1; ds1 = ds1->dd_parent) {
+ dsl_dir_t *dd;
+ for (dd = ds2; dd; dd = dd->dd_parent) {
+ if (ds1 == dd)
+ return (dd);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * If delta is applied to dd, how much of that delta would be applied to
+ * ancestor? Syncing context only.
+ */
+static int64_t
+would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
+{
+ if (dd == ancestor)
+ return (delta);
+
+ mutex_enter(&dd->dd_lock);
+ delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
+ mutex_exit(&dd->dd_lock);
+ return (would_change(dd->dd_parent, delta, ancestor));
+}
+
+typedef struct dsl_dir_rename_arg {
+ const char *ddra_oldname;
+ const char *ddra_newname;
+ cred_t *ddra_cred;
+ proc_t *ddra_proc;
+} dsl_dir_rename_arg_t;
+
+typedef struct dsl_valid_rename_arg {
+ int char_delta;
+ int nest_delta;
+} dsl_valid_rename_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ dsl_valid_rename_arg_t *dvra = arg;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+
+ dsl_dataset_name(ds, namebuf);
+
+ ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ int namelen = strlen(namebuf) + dvra->char_delta;
+ int depth = get_dataset_depth(namebuf) + dvra->nest_delta;
+
+ if (namelen >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting)
+ return (SET_ERROR(ENAMETOOLONG));
+ return (0);
+}
+
+static int
+dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_rename_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd, *newparent;
+ dsl_valid_rename_arg_t dvra;
+ dsl_dataset_t *parentds;
+ objset_t *parentos;
+ const char *mynewname;
+ int error;
+
+ /* target dir should exist */
+ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
+ if (error != 0)
+ return (error);
+
+ /* new parent should exist */
+ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
+ &newparent, &mynewname);
+ if (error != 0) {
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+
+ /* can't rename to different pool */
+ if (dd->dd_pool != newparent->dd_pool) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /* new name should not already exist */
+ if (mynewname == NULL) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /* can't rename below anything but filesystems (e.g. no ZVOLs) */
+ error = dsl_dataset_hold_obj(newparent->dd_pool,
+ dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ error = dmu_objset_from_ds(parentds, &parentos);
+ if (error != 0) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+ }
+ dsl_dataset_rele(parentds, FTAG);
+
+ ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ dvra.char_delta = strlen(ddra->ddra_newname)
+ - strlen(ddra->ddra_oldname);
+ dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
+ - get_dataset_depth(ddra->ddra_oldname);
+
+ /* if the name length is growing, validate child name lengths */
+ if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
+ error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
+ &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ }
+
+ if (dmu_tx_is_syncing(tx)) {
+ if (spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_FS_SS_LIMIT)) {
+ /*
+ * Although this is the check function and we don't
+ * normally make on-disk changes in check functions,
+ * we need to do that here.
+ *
+ * Ensure this portion of the tree's counts have been
+ * initialized in case the new parent has limits set.
+ */
+ dsl_dir_init_fs_ss_count(dd, tx);
+ }
+ }
+
+ if (newparent != dd->dd_parent) {
+ /* is there enough space? */
+ uint64_t myspace =
+ MAX(dsl_dir_phys(dd)->dd_used_bytes,
+ dsl_dir_phys(dd)->dd_reserved);
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t fs_cnt = 0;
+ uint64_t ss_cnt = 0;
+
+ if (dsl_dir_is_zapified(dd)) {
+ int err;
+
+ err = zap_lookup(os, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+ &fs_cnt);
+ if (err != ENOENT && err != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (err);
+ }
+
+ /*
+ * have to add 1 for the filesystem itself that we're
+ * moving
+ */
+ fs_cnt++;
+
+ err = zap_lookup(os, dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+ &ss_cnt);
+ if (err != ENOENT && err != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (err);
+ }
+ }
+
+ /* check for encryption errors */
+ error = dsl_dir_rename_crypt_check(dd, newparent);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EACCES));
+ }
+
+ /* no rename into our descendant */
+ if (closest_common_ancestor(dd, newparent) == dd) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_dir_transfer_possible(dd->dd_parent,
+ newparent, fs_cnt, ss_cnt, myspace,
+ ddra->ddra_cred, ddra->ddra_proc);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ }
+
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (0);
+}
+
+static void
+dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_rename_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd, *newparent;
+ const char *mynewname;
+ objset_t *mos = dp->dp_meta_objset;
+
+ VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
+ VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
+ &mynewname));
+
+ /* Log this before we change the name. */
+ spa_history_log_internal_dd(dd, "rename", tx,
+ "-> %s", ddra->ddra_newname);
+
+ if (newparent != dd->dd_parent) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t fs_cnt = 0;
+ uint64_t ss_cnt = 0;
+
+ /*
+ * We already made sure the dd counts were initialized in the
+ * check function.
+ */
+ if (spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_FS_SS_LIMIT)) {
+ VERIFY0(zap_lookup(os, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+ &fs_cnt));
+ /* add 1 for the filesystem itself that we're moving */
+ fs_cnt++;
+
+ VERIFY0(zap_lookup(os, dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+ &ss_cnt));
+ }
+
+ dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+ dsl_fs_ss_count_adjust(newparent, fs_cnt,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+
+ dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+ dsl_fs_ss_count_adjust(newparent, ss_cnt,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
+ -dsl_dir_phys(dd)->dd_used_bytes,
+ -dsl_dir_phys(dd)->dd_compressed_bytes,
+ -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
+ dsl_dir_diduse_space(newparent, DD_USED_CHILD,
+ dsl_dir_phys(dd)->dd_used_bytes,
+ dsl_dir_phys(dd)->dd_compressed_bytes,
+ dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
+
+ if (dsl_dir_phys(dd)->dd_reserved >
+ dsl_dir_phys(dd)->dd_used_bytes) {
+ uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
+ dsl_dir_phys(dd)->dd_used_bytes;
+
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+ -unused_rsrv, 0, 0, tx);
+ dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
+ unused_rsrv, 0, 0, tx);
+ }
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ /* remove from old parent zapobj */
+ VERIFY0(zap_remove(mos,
+ dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+ dd->dd_myname, tx));
+
+ (void) strlcpy(dd->dd_myname, mynewname,
+ sizeof (dd->dd_myname));
+ dsl_dir_rele(dd->dd_parent, dd);
+ dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
+ VERIFY0(dsl_dir_hold_obj(dp,
+ newparent->dd_object, NULL, dd, &dd->dd_parent));
+
+ /* add to new parent zapobj */
+ VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
+ dd->dd_myname, 8, 1, &dd->dd_object, tx));
+
+ /* TODO: A rename callback to avoid these layering violations. */
+ zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
+ zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname,
+ ddra->ddra_newname, B_TRUE);
+
+ dsl_prop_notify_all(dd);
+
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+}
+
+int
+dsl_dir_rename(const char *oldname, const char *newname)
+{
+ dsl_dir_rename_arg_t ddra;
+
+ ddra.ddra_oldname = oldname;
+ ddra.ddra_newname = newname;
+ ddra.ddra_cred = CRED();
+ ddra.ddra_proc = curproc;
+
+ return (dsl_sync_task(oldname,
+ dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
+ 3, ZFS_SPACE_CHECK_RESERVED));
+}
+
+int
+dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+ uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space,
+ cred_t *cr, proc_t *proc)
+{
+ dsl_dir_t *ancestor;
+ int64_t adelta;
+ uint64_t avail;
+ int err;
+
+ ancestor = closest_common_ancestor(sdd, tdd);
+ adelta = would_change(sdd, -space, ancestor);
+ avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
+ if (avail < space)
+ return (SET_ERROR(ENOSPC));
+
+ err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
+ ancestor, cr, proc);
+ if (err != 0)
+ return (err);
+ err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
+ ancestor, cr, proc);
+ if (err != 0)
+ return (err);
+
+ return (0);
+}
+
+inode_timespec_t
+dsl_dir_snap_cmtime(dsl_dir_t *dd)
+{
+ inode_timespec_t t;
+
+ mutex_enter(&dd->dd_lock);
+ t = dd->dd_snap_cmtime;
+ mutex_exit(&dd->dd_lock);
+
+ return (t);
+}
+
+void
+dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
+{
+ inode_timespec_t t;
+
+ gethrestime(&t);
+ mutex_enter(&dd->dd_lock);
+ dd->dd_snap_cmtime = t;
+ mutex_exit(&dd->dd_lock);
+}
+
+void
+dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
+}
+
+boolean_t
+dsl_dir_is_zapified(dsl_dir_t *dd)
+{
+ dmu_object_info_t doi;
+
+ dmu_object_info_from_db(dd->dd_dbuf, &doi);
+ return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
+
+void
+dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
+ SPA_FEATURE_LIVELIST));
+ dsl_deadlist_open(&dd->dd_livelist, mos, obj);
+ bplist_create(&dd->dd_pending_allocs);
+ bplist_create(&dd->dd_pending_frees);
+}
+
+void
+dsl_dir_livelist_close(dsl_dir_t *dd)
+{
+ dsl_deadlist_close(&dd->dd_livelist);
+ bplist_destroy(&dd->dd_pending_allocs);
+ bplist_destroy(&dd->dd_pending_frees);
+}
+
+void
+dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
+{
+ uint64_t obj;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ livelist_condense_entry_t to_condense = spa->spa_to_condense;
+
+ if (!dsl_deadlist_is_open(&dd->dd_livelist))
+ return;
+
+ /*
+ * If the livelist being removed is set to be condensed, stop the
+ * condense zthr and indicate the cancellation in the spa_to_condense
+ * struct in case the condense no-wait synctask has already started
+ */
+ zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
+ if (ll_condense_thread != NULL &&
+ (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
+ /*
+ * We use zthr_wait_cycle_done instead of zthr_cancel
+ * because we don't want to destroy the zthr, just have
+ * it skip its current task.
+ */
+ spa->spa_to_condense.cancelled = B_TRUE;
+ zthr_wait_cycle_done(ll_condense_thread);
+ /*
+ * If we've returned from zthr_wait_cycle_done without
+ * clearing the to_condense data structure, it's either
+ * because the no-wait synctask has started (which is
+ * indicated by the 'syncing' field of to_condense), in
+ * which case we can expect it to clear to_condense on
+ * its own, or because we returned before the zthr ran.
+ * In the latter case the checkfunc will now fail as
+ * cancelled == B_TRUE, so we can safely NULL out ds,
+ * allowing a different dir's livelist to be condensed.
+ *
+ * We can be sure that the to_condense struct will not
+ * be repopulated at this stage because both this
+ * function and dsl_livelist_try_condense execute in
+ * syncing context.
+ */
+ if ((spa->spa_to_condense.ds != NULL) &&
+ !spa->spa_to_condense.syncing) {
+ dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
+ spa);
+ spa->spa_to_condense.ds = NULL;
+ }
+ }
+
+ dsl_dir_livelist_close(dd);
+ VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj));
+ VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_LIVELIST, tx));
+ if (total) {
+ dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
+ spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
+ }
+}
+
+static int
+dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds,
+ zfs_wait_activity_t activity, boolean_t *in_progress)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&dd->dd_activity_lock));
+
+ switch (activity) {
+ case ZFS_WAIT_DELETEQ: {
+#ifdef _KERNEL
+ objset_t *os;
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ break;
+
+ mutex_enter(&os->os_user_ptr_lock);
+ void *user = dmu_objset_get_user(os);
+ mutex_exit(&os->os_user_ptr_lock);
+ if (dmu_objset_type(os) != DMU_OST_ZFS ||
+ user == NULL || zfs_get_vfs_flag_unmounted(os)) {
+ *in_progress = B_FALSE;
+ return (0);
+ }
+
+ uint64_t readonly = B_FALSE;
+ error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly,
+ NULL);
+
+ if (error != 0)
+ break;
+
+ if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) {
+ *in_progress = B_FALSE;
+ return (0);
+ }
+
+ uint64_t count, unlinked_obj;
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &unlinked_obj);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+ error = zap_count(os, unlinked_obj, &count);
+
+ if (error == 0)
+ *in_progress = (count != 0);
+ break;
+#else
+ /*
+ * The delete queue is ZPL specific, and libzpool doesn't have
+ * it. It doesn't make sense to wait for it.
+ */
+ *in_progress = B_FALSE;
+ break;
+#endif
+ }
+ default:
+ panic("unrecognized value for activity %d", activity);
+ }
+
+ return (error);
+}
+
+int
+dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity,
+ boolean_t *waited)
+{
+ int error = 0;
+ boolean_t in_progress;
+ dsl_pool_t *dp = dd->dd_pool;
+ for (;;) {
+ dsl_pool_config_enter(dp, FTAG);
+ error = dsl_dir_activity_in_progress(dd, ds, activity,
+ &in_progress);
+ dsl_pool_config_exit(dp, FTAG);
+ if (error != 0 || !in_progress)
+ break;
+
+ *waited = B_TRUE;
+
+ if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) ==
+ 0 || dd->dd_activity_cancelled) {
+ error = SET_ERROR(EINTR);
+ break;
+ }
+ }
+ return (error);
+}
+
+void
+dsl_dir_cancel_waiters(dsl_dir_t *dd)
+{
+ mutex_enter(&dd->dd_activity_lock);
+ dd->dd_activity_cancelled = B_TRUE;
+ cv_broadcast(&dd->dd_activity_cv);
+ while (dd->dd_activity_waiters > 0)
+ cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock);
+ mutex_exit(&dd->dd_activity_lock);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dsl_dir_set_quota);
+EXPORT_SYMBOL(dsl_dir_set_reservation);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c
new file mode 100644
index 000000000000..c770eafa75d8
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c
@@ -0,0 +1,1417 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_scan.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/bptree.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_userhold.h>
+#include <sys/trace_zfs.h>
+#include <sys/mmp.h>
+
+/*
+ * ZFS Write Throttle
+ * ------------------
+ *
+ * ZFS must limit the rate of incoming writes to the rate at which it is able
+ * to sync data modifications to the backend storage. Throttling by too much
+ * creates an artificial limit; throttling by too little can only be sustained
+ * for short periods and would lead to highly lumpy performance. On a per-pool
+ * basis, ZFS tracks the amount of modified (dirty) data. As operations change
+ * data, the amount of dirty data increases; as ZFS syncs out data, the amount
+ * of dirty data decreases. When the amount of dirty data exceeds a
+ * predetermined threshold further modifications are blocked until the amount
+ * of dirty data decreases (as data is synced out).
+ *
+ * The limit on dirty data is tunable, and should be adjusted according to
+ * both the IO capacity and available memory of the system. The larger the
+ * window, the more ZFS is able to aggregate and amortize metadata (and data)
+ * changes. However, memory is a limited resource, and allowing for more dirty
+ * data comes at the cost of keeping other useful data in memory (for example
+ * ZFS data cached by the ARC).
+ *
+ * Implementation
+ *
+ * As buffers are modified, dsl_pool_dirty_space() increments both the per-
+ * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
+ * dirty space used; dsl_pool_undirty_space() decrements those values as data
+ * is synced out from dsl_pool_sync(). While only the poolwide value is
+ * relevant, the per-txg value is useful for debugging. The tunable
+ * zfs_dirty_data_max determines the dirty space limit. Once that value is
+ * exceeded, new writes are halted until space frees up.
+ *
+ * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we
+ * ensure that there is a txg syncing (see the comment in txg.c for a full
+ * description of transaction group stages).
+ *
+ * The IO scheduler uses both the dirty space limit and current amount of
+ * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
+ * issues. See the comment in vdev_queue.c for details of the IO scheduler.
+ *
+ * The delay is also calculated based on the amount of dirty data. See the
+ * comment above dmu_tx_delay() for details.
+ */
+
+/*
+ * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
+ * capped at zfs_dirty_data_max_max. It can also be overridden with a module
+ * parameter.
+ */
+unsigned long zfs_dirty_data_max = 0;
+unsigned long zfs_dirty_data_max_max = 0;
+int zfs_dirty_data_max_percent = 10;
+int zfs_dirty_data_max_max_percent = 25;
+
+/*
+ * If there's at least this much dirty data (as a percentage of
+ * zfs_dirty_data_max), push out a txg. This should be less than
+ * zfs_vdev_async_write_active_min_dirty_percent.
+ */
+int zfs_dirty_data_sync_percent = 20;
+
+/*
+ * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
+ * and delay each transaction.
+ * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
+ */
+int zfs_delay_min_dirty_percent = 60;
+
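+/*
+ * Worked example (illustrative only, assuming the defaults above on a
+ * hypothetical machine with 64 GiB of RAM): zfs_dirty_data_max starts at
+ * 10% of RAM = 6.4 GiB and any override is capped at 25% of RAM = 16 GiB
+ * (zfs_dirty_data_max_max). With zfs_dirty_data_sync_percent = 20, a txg
+ * sync is requested once roughly 1.3 GiB is dirty, and with
+ * zfs_delay_min_dirty_percent = 60, dmu_tx_delay() begins delaying writes
+ * at roughly 3.8 GiB of dirty data.
+ */
+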
+/*
+ * This controls how quickly the delay approaches infinity.
+ * Larger values cause it to delay more for a given amount of dirty data.
+ * Therefore larger values will cause there to be less dirty data for a
+ * given throughput.
+ *
+ * For the smoothest delay, this value should be about 1 billion divided
+ * by the maximum number of operations per second. This will smoothly
+ * handle between 10x and 1/10th this number.
+ *
+ * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
+ * multiply in dmu_tx_delay().
+ */
+unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
+
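+/*
+ * Worked example (illustrative only): for a pool whose backend storage can
+ * sustain roughly 2,000 write operations per second, the guideline above
+ * gives 1,000,000,000 / 2,000 = 500,000, which is the default value of
+ * zfs_delay_scale. A hypothetical all-flash pool sustaining 100,000 ops/sec
+ * would instead suggest 1,000,000,000 / 100,000 = 10,000.
+ */
+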
+/*
+ * This determines the number of threads used by the dp_sync_taskq.
+ */
+int zfs_sync_taskq_batch_pct = 75;
+
+/*
+ * These tunables determine the behavior of how zil_itxg_clean() is
+ * called via zil_clean() in the context of spa_sync(). When an itxg
+ * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
+ * If the dispatch fails, the call to zil_itxg_clean() will occur
+ * synchronously in the context of spa_sync(), which can negatively
+ * impact the performance of spa_sync() (e.g. in the case of the itxg
+ * list having a large number of itxs that need to be cleaned).
+ *
+ * Thus, these tunables can be used to manipulate the behavior of the
+ * taskq used by zil_clean(); they determine the number of taskq entries
+ * that are pre-populated when the taskq is first created (via the
+ * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
+ * taskq entries that are cached after an on-demand allocation (via the
+ * "zfs_zil_clean_taskq_maxalloc").
+ *
+ * The idea being, we want to try reasonably hard to ensure there will
+ * already be a taskq entry pre-allocated by the time that it is needed
+ * by zil_clean(). This way, we can avoid the possibility of an
+ * on-demand allocation of a new taskq entry from failing, which would
+ * result in zil_itxg_clean() being called synchronously from zil_clean()
+ * (which can adversely affect performance of spa_sync()).
+ *
+ * Additionally, the number of threads used by the taskq can be
+ * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
+ */
+int zfs_zil_clean_taskq_nthr_pct = 100;
+int zfs_zil_clean_taskq_minalloc = 1024;
+int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
+
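+/*
+ * Illustrative sketch (not the actual zil_clean() code) of the dispatch
+ * pattern described above: attempt a TQ_NOSLEEP dispatch, and fall back to
+ * cleaning the itxg list synchronously if no taskq entry is available:
+ *
+ *	if (taskq_dispatch(dp->dp_zil_clean_taskq,
+ *	    (void (*)(void *))zil_itxg_clean, itxs, TQ_NOSLEEP) ==
+ *	    TASKQID_INVALID)
+ *		zil_itxg_clean(itxs);
+ */
+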
+int
+dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
+{
+ uint64_t obj;
+ int err;
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
+ name, sizeof (obj), 1, &obj);
+ if (err)
+ return (err);
+
+ return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
+}
+
+static dsl_pool_t *
+dsl_pool_open_impl(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp;
+ blkptr_t *bp = spa_get_rootblkptr(spa);
+
+ dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
+ dp->dp_spa = spa;
+ dp->dp_meta_rootbp = *bp;
+ rrw_init(&dp->dp_config_rwlock, B_TRUE);
+ txg_init(dp, txg);
+ mmp_init(spa);
+
+ txg_list_create(&dp->dp_dirty_datasets, spa,
+ offsetof(dsl_dataset_t, ds_dirty_link));
+ txg_list_create(&dp->dp_dirty_zilogs, spa,
+ offsetof(zilog_t, zl_dirty_link));
+ txg_list_create(&dp->dp_dirty_dirs, spa,
+ offsetof(dsl_dir_t, dd_dirty_link));
+ txg_list_create(&dp->dp_sync_tasks, spa,
+ offsetof(dsl_sync_task_t, dst_node));
+ txg_list_create(&dp->dp_early_sync_tasks, spa,
+ offsetof(dsl_sync_task_t, dst_node));
+
+ dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
+ zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
+ TASKQ_THREADS_CPU_PCT);
+
+ dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
+ zfs_zil_clean_taskq_nthr_pct, minclsyspri,
+ zfs_zil_clean_taskq_minalloc,
+ zfs_zil_clean_taskq_maxalloc,
+ TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
+
+ mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
+
+ dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
+ boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+ TASKQ_THREADS_CPU_PCT);
+ dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
+ 100, defclsyspri, boot_ncpus, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
+
+ return (dp);
+}
+
+int
+dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
+{
+ int err;
+ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+
+ /*
+ * Initialize the caller's dsl_pool_t structure before we actually open
+ * the meta objset. This is done because a self-healing write zio may
+ * be issued as part of dmu_objset_open_impl() and the spa needs its
+ * dsl_pool_t initialized in order to handle the write.
+ */
+ *dpp = dp;
+
+ err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
+ &dp->dp_meta_objset);
+ if (err != 0) {
+ dsl_pool_close(dp);
+ *dpp = NULL;
+ }
+
+ return (err);
+}
+
+int
+dsl_pool_open(dsl_pool_t *dp)
+{
+ int err;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ uint64_t obj;
+
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
+ &dp->dp_root_dir_obj);
+ if (err)
+ goto out;
+
+ err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir);
+ if (err)
+ goto out;
+
+ err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
+ if (err)
+ goto out;
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
+ err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
+ if (err)
+ goto out;
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
+ if (err == 0) {
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
+ &dp->dp_origin_snap);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_dir_rele(dd, dp);
+ if (err)
+ goto out;
+ }
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
+ &dp->dp_free_dir);
+ if (err)
+ goto out;
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
+ if (err)
+ goto out;
+ VERIFY0(bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+ }
+
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj);
+ if (err == 0) {
+ VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj,
+ dp->dp_meta_objset, obj));
+ } else if (err == ENOENT) {
+ /*
+ * We might not have created the remap bpobj yet.
+ */
+ err = 0;
+ } else {
+ goto out;
+ }
+ }
+
+ /*
+ * Note: errors ignored, because these special dirs, used for
+ * space accounting, are only created on demand.
+ */
+ (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
+ &dp->dp_leak_dir);
+
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+ &dp->dp_bptree_obj);
+ if (err != 0)
+ goto out;
+ }
+
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+ &dp->dp_empty_bpobj);
+ if (err != 0)
+ goto out;
+ }
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
+ &dp->dp_tmp_userrefs_obj);
+ if (err == ENOENT)
+ err = 0;
+ if (err)
+ goto out;
+
+ err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
+
+out:
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ return (err);
+}
+
+void
+dsl_pool_close(dsl_pool_t *dp)
+{
+ /*
+ * Drop our references from dsl_pool_open().
+ *
+ * Since we held the origin_snap from "syncing" context (which
+ * includes pool-opening context), it actually only got a "ref"
+ * and not a hold, so just drop that here.
+ */
+ if (dp->dp_origin_snap != NULL)
+ dsl_dataset_rele(dp->dp_origin_snap, dp);
+ if (dp->dp_mos_dir != NULL)
+ dsl_dir_rele(dp->dp_mos_dir, dp);
+ if (dp->dp_free_dir != NULL)
+ dsl_dir_rele(dp->dp_free_dir, dp);
+ if (dp->dp_leak_dir != NULL)
+ dsl_dir_rele(dp->dp_leak_dir, dp);
+ if (dp->dp_root_dir != NULL)
+ dsl_dir_rele(dp->dp_root_dir, dp);
+
+ bpobj_close(&dp->dp_free_bpobj);
+ bpobj_close(&dp->dp_obsolete_bpobj);
+
+ /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
+ if (dp->dp_meta_objset != NULL)
+ dmu_objset_evict(dp->dp_meta_objset);
+
+ txg_list_destroy(&dp->dp_dirty_datasets);
+ txg_list_destroy(&dp->dp_dirty_zilogs);
+ txg_list_destroy(&dp->dp_sync_tasks);
+ txg_list_destroy(&dp->dp_early_sync_tasks);
+ txg_list_destroy(&dp->dp_dirty_dirs);
+
+ taskq_destroy(dp->dp_zil_clean_taskq);
+ taskq_destroy(dp->dp_sync_taskq);
+
+ /*
+ * We can't set retry to TRUE since we're explicitly specifying
+ * a spa to flush. This is good enough; any missed buffers for
+ * this spa won't cause trouble, and they'll eventually fall
+ * out of the ARC just like any other unused buffer.
+ */
+ arc_flush(dp->dp_spa, FALSE);
+
+ mmp_fini(dp->dp_spa);
+ txg_fini(dp);
+ dsl_scan_fini(dp);
+ dmu_buf_user_evict_wait();
+
+ rrw_destroy(&dp->dp_config_rwlock);
+ mutex_destroy(&dp->dp_lock);
+ cv_destroy(&dp->dp_spaceavail_cv);
+ taskq_destroy(dp->dp_unlinked_drain_taskq);
+ taskq_destroy(dp->dp_zrele_taskq);
+ if (dp->dp_blkstats != NULL) {
+ mutex_destroy(&dp->dp_blkstats->zab_lock);
+ vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+ }
+ kmem_free(dp, sizeof (dsl_pool_t));
+}
+
+void
+dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ /*
+ * Currently, we only create the obsolete_bpobj where there are
+ * indirect vdevs with referenced mappings.
+ */
+ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL));
+ /* create and open the obsolete_bpobj */
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj));
+ VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
+
+void
+dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, tx));
+ bpobj_free(dp->dp_meta_objset,
+ dp->dp_obsolete_bpobj.bpo_object, tx);
+ bpobj_close(&dp->dp_obsolete_bpobj);
+}
+
+dsl_pool_t *
+dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp,
+ uint64_t txg)
+{
+ int err;
+ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+#ifdef _KERNEL
+ objset_t *os;
+#else
+ objset_t *os __attribute__((unused));
+#endif
+ dsl_dataset_t *ds;
+ uint64_t obj;
+
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+
+ /* create and open the MOS (meta-objset) */
+ dp->dp_meta_objset = dmu_objset_create_impl(spa,
+ NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
+ spa->spa_meta_objset = dp->dp_meta_objset;
+
+ /* create the pool directory */
+ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
+ ASSERT0(err);
+
+ /* Initialize scan structures */
+ VERIFY0(dsl_scan_init(dp, txg));
+
+ /* create and open the root dir */
+ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
+ VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir));
+
+ /* create and open the meta-objset dir */
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ MOS_DIR_NAME, &dp->dp_mos_dir));
+
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ /* create and open the free dir */
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ FREE_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /* create and open the free_bplist */
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
+ VERIFY0(bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+ }
+
+ if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
+ dsl_pool_create_origin(dp, tx);
+
+ /*
+ * Some features may be needed when creating the root dataset, so we
+ * create the feature objects here.
+ */
+ if (spa_version(spa) >= SPA_VERSION_FEATURES)
+ spa_feature_create_zap_objects(spa, tx);
+
+ if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF &&
+ dcp->cp_crypt != ZIO_CRYPT_INHERIT)
+ spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx);
+
+ /* create the root dataset */
+ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx);
+
+ /* create the root objset */
+ VERIFY0(dsl_dataset_hold_obj_flags(dp, obj,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ os = dmu_objset_create_impl(dp->dp_spa, ds,
+ dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+#ifdef _KERNEL
+ zfs_create_fs(os, kcred, zplprops, tx);
+#endif
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+
+ dmu_tx_commit(tx);
+
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+
+ return (dp);
+}
+
+/*
+ * Account for the meta-objset space in its placeholder dsl_dir.
+ */
+void
+dsl_pool_mos_diduse_space(dsl_pool_t *dp,
+ int64_t used, int64_t comp, int64_t uncomp)
+{
+ ASSERT3U(comp, ==, uncomp); /* it's all metadata */
+ mutex_enter(&dp->dp_lock);
+ dp->dp_mos_used_delta += used;
+ dp->dp_mos_compressed_delta += comp;
+ dp->dp_mos_uncompressed_delta += uncomp;
+ mutex_exit(&dp->dp_lock);
+}
+
+static void
+dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dmu_objset_sync(dp->dp_meta_objset, zio, tx);
+ VERIFY0(zio_wait(zio));
+ dmu_objset_sync_done(dp->dp_meta_objset, tx);
+ taskq_wait(dp->dp_sync_taskq);
+ multilist_destroy(dp->dp_meta_objset->os_synced_dnodes);
+ dp->dp_meta_objset->os_synced_dnodes = NULL;
+
+ dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+}
+
+static void
+dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
+{
+ ASSERT(MUTEX_HELD(&dp->dp_lock));
+
+ if (delta < 0)
+ ASSERT3U(-delta, <=, dp->dp_dirty_total);
+
+ dp->dp_dirty_total += delta;
+
+ /*
+ * Note: we signal even when increasing dp_dirty_total.
+ * This ensures forward progress -- each thread wakes the next waiter.
+ */
+ if (dp->dp_dirty_total < zfs_dirty_data_max)
+ cv_signal(&dp->dp_spaceavail_cv);
+}
+
+#ifdef ZFS_DEBUG
+static boolean_t
+dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
+{
+ spa_t *spa = dp->dp_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ txg_list_t *tl = &vd->vdev_ms_list;
+ metaslab_t *ms;
+
+ for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
+ ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
+ VERIFY(range_tree_is_empty(ms->ms_freeing));
+ VERIFY(range_tree_is_empty(ms->ms_checkpointing));
+ }
+ }
+
+ return (B_TRUE);
+}
+#endif
+
+void
+dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
+{
+ zio_t *zio;
+ dmu_tx_t *tx;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ objset_t *mos = dp->dp_meta_objset;
+ list_t synced_datasets;
+
+ list_create(&synced_datasets, sizeof (dsl_dataset_t),
+ offsetof(dsl_dataset_t, ds_synced_link));
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * Run all early sync tasks before writing out any dirty blocks.
+ * For more info on early sync tasks see block comment in
+ * dsl_early_sync_task().
+ */
+ if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
+ dsl_sync_task_t *dst;
+
+ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+ while ((dst =
+ txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
+ ASSERT(dsl_early_sync_task_verify(dp, txg));
+ dsl_sync_task_sync(dst, tx);
+ }
+ ASSERT(dsl_early_sync_task_verify(dp, txg));
+ }
+
+ /*
+ * Write out all dirty blocks of dirty datasets.
+ */
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
+ /*
+ * We must not sync any non-MOS datasets twice, because
+ * we may have taken a snapshot of them. However, we
+ * may sync newly-created datasets on pass 2.
+ */
+ ASSERT(!list_link_active(&ds->ds_synced_link));
+ list_insert_tail(&synced_datasets, ds);
+ dsl_dataset_sync(ds, zio, tx);
+ }
+ VERIFY0(zio_wait(zio));
+
+ /*
+ * Update the long range free counter after
+ * we're done syncing user data
+ */
+ mutex_enter(&dp->dp_lock);
+ ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
+ mutex_exit(&dp->dp_lock);
+
+ /*
+ * After the data blocks have been written (ensured by the zio_wait()
+ * above), update the user/group/project space accounting. This happens
+ * in tasks dispatched to dp_sync_taskq, so wait for them before
+ * continuing.
+ */
+ for (ds = list_head(&synced_datasets); ds != NULL;
+ ds = list_next(&synced_datasets, ds)) {
+ dmu_objset_sync_done(ds->ds_objset, tx);
+ }
+ taskq_wait(dp->dp_sync_taskq);
+
+ /*
+ * Sync the datasets again to push out the changes due to
+ * userspace updates. This must be done before we process the
+ * sync tasks, so that any snapshots will have the correct
+ * user accounting information (and we won't get confused
+ * about which blocks are part of the snapshot).
+ */
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
+ objset_t *os = ds->ds_objset;
+
+ ASSERT(list_link_active(&ds->ds_synced_link));
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ dsl_dataset_sync(ds, zio, tx);
+
+ /*
+ * Release any key mappings created by calls to
+ * dsl_dataset_dirty() from the userquota accounting
+ * code paths.
+ */
+ if (os->os_encrypted && !os->os_raw_receive &&
+ !os->os_next_write_raw[txg & TXG_MASK]) {
+ ASSERT3P(ds->ds_key_mapping, !=, NULL);
+ key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
+ }
+ }
+ VERIFY0(zio_wait(zio));
+
+ /*
+ * Now that the datasets have been completely synced, we can
+ * clean up our in-memory structures accumulated while syncing:
+ *
+ * - move dead blocks from the pending deadlist and livelists
+ * to the on-disk versions
+ * - release hold from dsl_dataset_dirty()
+ * - release key mapping hold from dsl_dataset_dirty()
+ */
+ while ((ds = list_remove_head(&synced_datasets)) != NULL) {
+ objset_t *os = ds->ds_objset;
+
+ if (os->os_encrypted && !os->os_raw_receive &&
+ !os->os_next_write_raw[txg & TXG_MASK]) {
+ ASSERT3P(ds->ds_key_mapping, !=, NULL);
+ key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
+ }
+
+ dsl_dataset_sync_done(ds, tx);
+ }
+
+ while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
+ dsl_dir_sync(dd, tx);
+ }
+
+ /*
+ * The MOS's space is accounted for in the pool/$MOS
+ * (dp_mos_dir). We can't modify the mos while we're syncing
+ * it, so we remember the deltas and apply them here.
+ */
+ if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
+ dp->dp_mos_uncompressed_delta != 0) {
+ dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
+ dp->dp_mos_used_delta,
+ dp->dp_mos_compressed_delta,
+ dp->dp_mos_uncompressed_delta, tx);
+ dp->dp_mos_used_delta = 0;
+ dp->dp_mos_compressed_delta = 0;
+ dp->dp_mos_uncompressed_delta = 0;
+ }
+
+ if (dmu_objset_is_dirty(mos, txg)) {
+ dsl_pool_sync_mos(dp, tx);
+ }
+
+ /*
+ * We have written all of the accounted dirty data, so our
+ * dp_space_towrite should now be zero. However, some seldom-used
+ * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up
+ * the accounting of any dirtied space now.
+ *
+ * Note that, besides any dirty data from datasets, the amount of
+ * dirty data in the MOS is also accounted by the pool. Therefore,
+ * we want to do this cleanup after dsl_pool_sync_mos() so we don't
+ * attempt to update the accounting for the same dirty data twice.
+ * (i.e. at this point we only update the accounting for the space
+ * that we know that we "leaked").
+ */
+ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
+
+ /*
+ * If we modify a dataset in the same txg that we want to destroy it,
+ * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
+ * dsl_dir_destroy_check() will fail if there are unexpected holds.
+ * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
+ * and clearing the hold on it) before we process the sync_tasks.
+ * The MOS data dirtied by the sync_tasks will be synced on the next
+ * pass.
+ */
+ if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
+ dsl_sync_task_t *dst;
+ /*
+ * No more sync tasks should have been added while we
+ * were syncing.
+ */
+ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+ while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
+ dsl_sync_task_sync(dst, tx);
+ }
+
+ dmu_tx_commit(tx);
+
+ DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
+}
+
+void
+dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
+{
+ zilog_t *zilog;
+
+ while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg))) {
+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+ /*
+ * We don't remove the zilog from the dp_dirty_zilogs
+ * list until after we've cleaned it. This ensures that
+ * callers of zilog_is_dirty() receive an accurate
+ * answer when they are racing with the spa sync thread.
+ */
+ zil_clean(zilog, txg);
+ (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
+ ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
+ dmu_buf_rele(ds->ds_dbuf, zilog);
+ }
+ ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
+}
+
+/*
+ * TRUE if the current thread is the tx_sync_thread or if we
+ * are being called from SPA context during pool initialization.
+ */
+int
+dsl_pool_sync_context(dsl_pool_t *dp)
+{
+ return (curthread == dp->dp_tx.tx_sync_thread ||
+ spa_is_initializing(dp->dp_spa) ||
+ taskq_member(dp->dp_sync_taskq, curthread));
+}
+
+/*
+ * This function returns the amount of allocatable space in the pool
+ * minus whatever space is currently reserved by ZFS for specific
+ * purposes. Specifically:
+ *
+ * 1] Any reserved SLOP space
+ * 2] Any space used by the checkpoint
+ * 3] Any space used for deferred frees
+ *
+ * The latter 2 are especially important because they are needed to
+ * rectify the SPA's and DMU's different understanding of how much space
+ * is used. Now the DMU is aware of that extra space tracked by the SPA
+ * without having to maintain a separate special dir (e.g. similar to
+ * $MOS, $FREEING, and $LEAKED).
+ *
+ * Note: By deferred frees here, we mean the frees that were deferred
+ * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
+ * segments placed in ms_defer trees during metaslab_sync_done().
+ */
+uint64_t
+dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
+{
+ spa_t *spa = dp->dp_spa;
+ uint64_t space, resv, adjustedsize;
+ uint64_t spa_deferred_frees =
+ spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;
+
+ space = spa_get_dspace(spa)
+ - spa_get_checkpoint_space(spa) - spa_deferred_frees;
+ resv = spa_get_slop_space(spa);
+
+ switch (slop_policy) {
+ case ZFS_SPACE_CHECK_NORMAL:
+ break;
+ case ZFS_SPACE_CHECK_RESERVED:
+ resv >>= 1;
+ break;
+ case ZFS_SPACE_CHECK_EXTRA_RESERVED:
+ resv >>= 2;
+ break;
+ case ZFS_SPACE_CHECK_NONE:
+ resv = 0;
+ break;
+ default:
+ panic("invalid slop policy value: %d", slop_policy);
+ break;
+ }
+ adjustedsize = (space >= resv) ? (space - resv) : 0;
+
+ return (adjustedsize);
+}
+
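+/*
+ * Worked example (illustrative only): if the usable space computed above is
+ * 100 GiB and spa_get_slop_space() returns 3.2 GiB, then
+ * ZFS_SPACE_CHECK_NORMAL reports 96.8 GiB, ZFS_SPACE_CHECK_RESERVED holds
+ * back only half the slop (98.4 GiB), ZFS_SPACE_CHECK_EXTRA_RESERVED holds
+ * back a quarter (99.2 GiB), and ZFS_SPACE_CHECK_NONE reports the full
+ * 100 GiB.
+ */
+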
+uint64_t
+dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
+{
+ uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
+ uint64_t deferred =
+ metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+ uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
+ return (quota);
+}
+
+boolean_t
+dsl_pool_need_dirty_delay(dsl_pool_t *dp)
+{
+ uint64_t delay_min_bytes =
+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+ uint64_t dirty_min_bytes =
+ zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
+ uint64_t dirty;
+
+ mutex_enter(&dp->dp_lock);
+ dirty = dp->dp_dirty_total;
+ mutex_exit(&dp->dp_lock);
+ if (dirty > dirty_min_bytes)
+ txg_kick(dp);
+ return (dirty > delay_min_bytes);
+}
+
+void
+dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+{
+ if (space > 0) {
+ mutex_enter(&dp->dp_lock);
+ dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
+ dsl_pool_dirty_delta(dp, space);
+ mutex_exit(&dp->dp_lock);
+ }
+}
+
+void
+dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
+{
+ ASSERT3S(space, >=, 0);
+ if (space == 0)
+ return;
+
+ mutex_enter(&dp->dp_lock);
+ if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
+ /* XXX writing something we didn't dirty? */
+ space = dp->dp_dirty_pertxg[txg & TXG_MASK];
+ }
+ ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
+ dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
+ ASSERT3U(dp->dp_dirty_total, >=, space);
+ dsl_pool_dirty_delta(dp, -space);
+ mutex_exit(&dp->dp_lock);
+}
+
+/* ARGSUSED */
+static int
+upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds, *prev = NULL;
+ int err;
+
+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
+ break;
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ prev = NULL;
+ }
+
+ if (prev == NULL) {
+ prev = dp->dp_origin_snap;
+
+ /*
+ * The $ORIGIN can't have any data, or the accounting
+ * will be wrong.
+ */
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ /* The origin doesn't get attached to itself */
+ if (ds->ds_object == prev->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
+ dsl_dataset_phys(ds)->ds_prev_snap_txg =
+ dsl_dataset_phys(prev)->ds_creation_txg;
+
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
+
+ dmu_buf_will_dirty(prev->ds_dbuf, tx);
+ dsl_dataset_phys(prev)->ds_num_children++;
+
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
+ ASSERT(ds->ds_prev == NULL);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ ds, &ds->ds_prev));
+ }
+ }
+
+ ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
+
+ if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
+ dmu_buf_will_dirty(prev->ds_dbuf, tx);
+ dsl_dataset_phys(prev)->ds_next_clones_obj =
+ zap_create(dp->dp_meta_objset,
+ DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
+
+ dsl_dataset_rele(ds, FTAG);
+ if (prev != dp->dp_origin_snap)
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+}
+
+void
+dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dp->dp_origin_snap != NULL);
+
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
+ tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
+}
+
+/* ARGSUSED */
+static int
+upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
+ dsl_dataset_t *origin;
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
+
+ if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(origin->ds_dir)->dd_clones =
+ zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
+ 0, tx);
+ }
+
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(origin->ds_dir)->dd_clones,
+ ds->ds_object, tx));
+
+ dsl_dataset_rele(origin, FTAG);
+ }
+ return (0);
+}
+
+void
+dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t obj;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /*
+ * We can't use bpobj_alloc(), because spa_version() still
+ * returns the old version, and we need a new-version bpobj with
+ * subobj support. So call dmu_object_alloc() directly.
+ */
+ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+ VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+ VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
+
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
+}
+
+void
+dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t dsobj;
+ dsl_dataset_t *ds;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dp->dp_origin_snap == NULL);
+ ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
+
+ /* create the origin dir, ds, & snap-ds */
+ dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
+ NULL, 0, kcred, NULL, tx);
+ VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
+ VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ dp, &dp->dp_origin_snap));
+ dsl_dataset_rele(ds, FTAG);
+}
+
+taskq_t *
+dsl_pool_zrele_taskq(dsl_pool_t *dp)
+{
+ return (dp->dp_zrele_taskq);
+}
+
+taskq_t *
+dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
+{
+ return (dp->dp_unlinked_drain_taskq);
+}
+
+/*
+ * Walk through the pool-wide zap object of temporary snapshot user holds
+ * and release them.
+ */
+void
+dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
+{
+ zap_attribute_t za;
+ zap_cursor_t zc;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+ nvlist_t *holds;
+
+ if (zapobj == 0)
+ return;
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+
+ holds = fnvlist_alloc();
+
+ for (zap_cursor_init(&zc, mos, zapobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ char *htag;
+ nvlist_t *tags;
+
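+ /*
+ * Hold names are built by dsl_pool_user_hold_rele_impl() as
+ * "<dsobj in hex>-<tag>"; split at the first '-' to separate the
+ * dataset object number from the tag.
+ */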
+ htag = strchr(za.za_name, '-');
+ *htag = '\0';
+ ++htag;
+ if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
+ tags = fnvlist_alloc();
+ fnvlist_add_boolean(tags, htag);
+ fnvlist_add_nvlist(holds, za.za_name, tags);
+ fnvlist_free(tags);
+ } else {
+ fnvlist_add_boolean(tags, htag);
+ }
+ }
+ dsl_dataset_user_release_tmp(dp, holds);
+ fnvlist_free(holds);
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Create the pool-wide zap object for storing temporary snapshot holds.
+ */
+static void
+dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ objset_t *mos = dp->dp_meta_objset;
+
+ ASSERT(dp->dp_tmp_userrefs_obj == 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
+}
+
+static int
+dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+ char *name;
+ int error;
+
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /*
+ * If the pool was created prior to SPA_VERSION_USERREFS, the
+ * zap object for temporary holds might not exist yet.
+ */
+ if (zapobj == 0) {
+ if (holding) {
+ dsl_pool_user_hold_create_obj(dp, tx);
+ zapobj = dp->dp_tmp_userrefs_obj;
+ } else {
+ return (SET_ERROR(ENOENT));
+ }
+ }
+
+ name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
+ if (holding)
+ error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
+ else
+ error = zap_remove(mos, zapobj, name, tx);
+ kmem_strfree(name);
+
+ return (error);
+}
+
+/*
+ * Add a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+ uint64_t now, dmu_tx_t *tx)
+{
+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
+}
+
+/*
+ * Release a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+ dmu_tx_t *tx)
+{
+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
+ tx, B_FALSE));
+}
+
+/*
+ * DSL Pool Configuration Lock
+ *
+ * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
+ * creation / destruction / rename / property setting). It must be held for
+ * read to hold a dataset or dsl_dir. I.e. you must call
+ * dsl_pool_config_enter() or dsl_pool_hold() before calling
+ * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock
+ * must be held continuously until all datasets and dsl_dirs are released.
+ *
+ * The only exception to this rule is that if a "long hold" is placed on
+ * a dataset, then the dp_config_rwlock may be dropped while the dataset
+ * is still held. The long hold will prevent the dataset from being
+ * destroyed -- the destroy will fail with EBUSY. A long hold can be
+ * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
+ * (by calling dsl_{dataset,objset}_{try}own{_obj}).
+ *
+ * Legitimate long-holders (including owners) should be long-running, cancelable
+ * tasks that should cause "zfs destroy" to fail. This includes DMU
+ * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
+ * "zfs send", and "zfs diff". There are several other long-holders whose
+ * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
+ *
+ * The usual formula for long-holding would be:
+ * dsl_pool_hold()
+ * dsl_dataset_hold()
+ * ... perform checks ...
+ * dsl_dataset_long_hold()
+ * dsl_pool_rele()
+ * ... perform long-running task ...
+ * dsl_dataset_long_rele()
+ * dsl_dataset_rele()
+ *
+ * Note that when the long hold is released, the dataset is still held but
+ * the pool is not held. The dataset may change arbitrarily during this time
+ * (e.g. it could be destroyed). Therefore you shouldn't do anything to the
+ * dataset except release it.
+ *
+ * Operations generally fall somewhere into the following taxonomy:
+ *
+ * Read-Only Modifying
+ *
+ * Dataset Layer / MOS zfs get zfs destroy
+ *
+ * Individual Dataset read() write()
+ *
+ *
+ * Dataset Layer Operations
+ *
+ * Modifying operations should generally use dsl_sync_task(). The synctask
+ * infrastructure enforces proper locking strategy with respect to the
+ * dp_config_rwlock. See the comment above dsl_sync_task() for details.
+ *
+ * Read-only operations will manually hold the pool, then the dataset, obtain
+ * information from the dataset, then release the pool and dataset.
+ * dmu_objset_{hold,rele}() are convenience routines that also do the pool
+ * hold/rele.
+ *
+ *
+ * Operations On Individual Datasets
+ *
+ * Objects _within_ an objset should only be modified by the current 'owner'
+ * of the objset to prevent incorrect concurrent modification. Thus, use
+ * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner,
+ * and fail with EBUSY if there is already an owner. The owner can then
+ * implement its own locking strategy, independent of the dataset layer's
+ * locking infrastructure.
+ * (E.g., the ZPL has its own set of locks to control concurrency. A regular
+ * vnop will not reach into the dataset layer).
+ *
+ * Ideally, objects would also only be read by the objset's owner, so that we
+ * don't observe state mid-modification.
+ * (E.g. the ZPL is creating a new object and linking it into a directory; if
+ * you don't coordinate with the ZPL to hold ZPL-level locks, you could see an
+ * intermediate state. The ioctl level violates this but in pretty benign
+ * ways, e.g. reading the zpl props object.)
+ */
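+
+/*
+ * Illustrative sketch of the long-hold formula above, written out with the
+ * interfaces below; the dataset name and do_long_running_work() are
+ * placeholders.
+ *
+ *    error = dsl_pool_hold("tank/fs", FTAG, &dp);
+ *    if (error != 0)
+ *            return (error);
+ *    error = dsl_dataset_hold(dp, "tank/fs", FTAG, &ds);
+ *    if (error != 0) {
+ *            dsl_pool_rele(dp, FTAG);
+ *            return (error);
+ *    }
+ *    ... perform checks ...
+ *    dsl_dataset_long_hold(ds, FTAG);
+ *    dsl_pool_rele(dp, FTAG);
+ *    do_long_running_work(ds);
+ *    dsl_dataset_long_rele(ds, FTAG);
+ *    dsl_dataset_rele(ds, FTAG);
+ */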
+
+int
+dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(name, &spa, tag);
+ if (error == 0) {
+ *dp = spa_get_dsl(spa);
+ dsl_pool_config_enter(*dp, tag);
+ }
+ return (error);
+}
+
+void
+dsl_pool_rele(dsl_pool_t *dp, void *tag)
+{
+ dsl_pool_config_exit(dp, tag);
+ spa_close(dp->dp_spa, tag);
+}
+
+void
+dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
+{
+ /*
+ * We use a "reentrant" reader-writer lock, but not reentrantly.
+ *
+ * The rrwlock can (with the track_all flag) track all reading threads,
+ * which is very useful for debugging which code path failed to release
+ * the lock, and for verifying that the *current* thread does hold
+ * the lock.
+ *
+ * (Unlike a rwlock, which knows that N threads hold it for
+ * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
+ * if any thread holds it for read, even if this thread doesn't).
+ */
+ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+ rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
+}
+
+void
+dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
+{
+ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+ rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
+}
+
+void
+dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
+{
+ rrw_exit(&dp->dp_config_rwlock, tag);
+}
+
+boolean_t
+dsl_pool_config_held(dsl_pool_t *dp)
+{
+ return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
+}
+
+boolean_t
+dsl_pool_config_held_writer(dsl_pool_t *dp)
+{
+ return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
+}
+
+EXPORT_SYMBOL(dsl_pool_config_enter);
+EXPORT_SYMBOL(dsl_pool_config_exit);
+
+/* BEGIN CSTYLED */
+/* zfs_dirty_data_max_percent only applied at module load in arc_init(). */
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, INT, ZMOD_RD,
+ "Max percent of RAM allowed to be dirty");
+
+/* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, INT, ZMOD_RD,
+ "zfs_dirty_data_max upper bound as % of RAM");
+
+ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW,
+ "Transaction delay threshold");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
+ "Determines the dirty space limit");
+
+/* zfs_dirty_data_max_max only applied at module load in arc_init(). */
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
+ "zfs_dirty_data_max upper bound in bytes");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW,
+ "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max");
+
+ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW,
+ "How quickly delay approaches infinity");
+
+ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
+ "Max percent of CPUs that are used to sync dirty data");
+
+ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW,
+ "Max percent of CPUs that are used per dp_sync_taskq");
+
+ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW,
+ "Number of taskq entries that are pre-populated");
+
+ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW,
+ "Max number of taskq entries that are cached");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dsl_prop.c b/sys/contrib/openzfs/module/zfs/dsl_prop.c
new file mode 100644
index 000000000000..f6ff9ae47192
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_prop.c
@@ -0,0 +1,1287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+#define ZPROP_INHERIT_SUFFIX "$inherit"
+#define ZPROP_RECVD_SUFFIX "$recvd"
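+
+/*
+ * Worked example: for a property such as "compression", up to three keys
+ * may exist in a dataset's or dsl_dir's props ZAP:
+ *
+ *    compression           local value set with "zfs set"
+ *    compression$inherit   marker left by an explicit "zfs inherit"
+ *    compression$recvd     value received via "zfs receive"
+ *
+ * The lookup code below checks the local value first, then (unless an
+ * explicit $inherit marker is present) the received value, then walks up
+ * the dsl_dir parents, and finally falls back to the property's default.
+ */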
+
+static int
+dodefault(zfs_prop_t prop, int intsz, int numints, void *buf)
+{
+ /*
+ * The setonce properties are read-only, BUT they still
+ * have a default value that can be used as the initial
+ * value.
+ */
+ if (prop == ZPROP_INVAL ||
+ (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop)))
+ return (SET_ERROR(ENOENT));
+
+ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
+ if (intsz != 1)
+ return (SET_ERROR(EOVERFLOW));
+ (void) strncpy(buf, zfs_prop_default_string(prop),
+ numints);
+ } else {
+ if (intsz != 8 || numints < 1)
+ return (SET_ERROR(EOVERFLOW));
+
+ *(uint64_t *)buf = zfs_prop_default_numeric(prop);
+ }
+
+ return (0);
+}
+
+int
+dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot)
+{
+ int err;
+ dsl_dir_t *target = dd;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ zfs_prop_t prop;
+ boolean_t inheritable;
+ boolean_t inheriting = B_FALSE;
+ char *inheritstr;
+ char *recvdstr;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+
+ if (setpoint)
+ setpoint[0] = '\0';
+
+ prop = zfs_name_to_prop(propname);
+ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ /*
+ * Note: dd may become NULL, therefore we shouldn't dereference it
+ * after this loop.
+ */
+ for (; dd != NULL; dd = dd->dd_parent) {
+ if (dd != target || snapshot) {
+ if (!inheritable) {
+ err = SET_ERROR(ENOENT);
+ break;
+ }
+ inheriting = B_TRUE;
+ }
+
+ /* Check for a local value. */
+ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ propname, intsz, numints, buf);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ dsl_dir_name(dd, setpoint);
+ break;
+ }
+
+ /*
+ * Skip the check for a received value if there is an explicit
+ * inheritance entry.
+ */
+ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ inheritstr);
+ if (err != 0 && err != ENOENT)
+ break;
+
+ if (err == ENOENT) {
+ /* Check for a received value. */
+ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ recvdstr, intsz, numints, buf);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0) {
+ if (inheriting) {
+ dsl_dir_name(dd, setpoint);
+ } else {
+ (void) strlcpy(setpoint,
+ ZPROP_SOURCE_VAL_RECVD,
+ MAXNAMELEN);
+ }
+ }
+ break;
+ }
+ }
+
+ /*
+ * If we found an explicit inheritance entry, err is zero even
+ * though we haven't yet found the value, so reinitializing err
+ * at the end of the loop (instead of at the beginning) ensures
+ * that err has a valid post-loop value.
+ */
+ err = SET_ERROR(ENOENT);
+ }
+
+ if (err == ENOENT)
+ err = dodefault(prop, intsz, numints, buf);
+
+ kmem_strfree(inheritstr);
+ kmem_strfree(recvdstr);
+
+ return (err);
+}
+
+int
+dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ boolean_t inheritable;
+ uint64_t zapobj;
+
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ zapobj = dsl_dataset_phys(ds)->ds_props_obj;
+
+ if (zapobj != 0) {
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ int err;
+
+ ASSERT(ds->ds_is_snapshot);
+
+ /* Check for a local value. */
+ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ dsl_dataset_name(ds, setpoint);
+ return (err);
+ }
+
+ /*
+ * Skip the check for a received value if there is an explicit
+ * inheritance entry.
+ */
+ if (inheritable) {
+ char *inheritstr = kmem_asprintf("%s%s", propname,
+ ZPROP_INHERIT_SUFFIX);
+ err = zap_contains(mos, zapobj, inheritstr);
+ kmem_strfree(inheritstr);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ }
+
+ if (err == ENOENT) {
+ /* Check for a received value. */
+ char *recvdstr = kmem_asprintf("%s%s", propname,
+ ZPROP_RECVD_SUFFIX);
+ err = zap_lookup(mos, zapobj, recvdstr,
+ intsz, numints, buf);
+ kmem_strfree(recvdstr);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ (void) strlcpy(setpoint,
+ ZPROP_SOURCE_VAL_RECVD,
+ MAXNAMELEN);
+ return (err);
+ }
+ }
+ }
+
+ return (dsl_prop_get_dd(ds->ds_dir, propname,
+ intsz, numints, buf, setpoint, ds->ds_is_snapshot));
+}
+
+static dsl_prop_record_t *
+dsl_prop_record_find(dsl_dir_t *dd, const char *propname)
+{
+ dsl_prop_record_t *pr = NULL;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ for (pr = list_head(&dd->dd_props);
+ pr != NULL; pr = list_next(&dd->dd_props, pr)) {
+ if (strcmp(pr->pr_propname, propname) == 0)
+ break;
+ }
+
+ return (pr);
+}
+
+static dsl_prop_record_t *
+dsl_prop_record_create(dsl_dir_t *dd, const char *propname)
+{
+ dsl_prop_record_t *pr;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP);
+ pr->pr_propname = spa_strdup(propname);
+ list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t),
+ offsetof(dsl_prop_cb_record_t, cbr_pr_node));
+ list_insert_head(&dd->dd_props, pr);
+
+ return (pr);
+}
+
+void
+dsl_prop_init(dsl_dir_t *dd)
+{
+ list_create(&dd->dd_props, sizeof (dsl_prop_record_t),
+ offsetof(dsl_prop_record_t, pr_node));
+}
+
+void
+dsl_prop_fini(dsl_dir_t *dd)
+{
+ dsl_prop_record_t *pr;
+
+ while ((pr = list_remove_head(&dd->dd_props)) != NULL) {
+ list_destroy(&pr->pr_cbs);
+ spa_strfree((char *)pr->pr_propname);
+ kmem_free(pr, sizeof (dsl_prop_record_t));
+ }
+ list_destroy(&dd->dd_props);
+}
+
+/*
+ * Register interest in the named property. We'll call the callback
+ * once to notify it of the current property value, and again each time
+ * the property changes, until this callback is unregistered.
+ *
+ * Return 0 on success, errno if the prop is not an integer value.
+ */
+int
+dsl_prop_register(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ uint64_t value;
+ dsl_prop_record_t *pr;
+ dsl_prop_cb_record_t *cbr;
+ int err;
+ dsl_pool_t *dp __maybe_unused = dd->dd_pool;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dsl_prop_get_int_ds(ds, propname, &value);
+ if (err != 0)
+ return (err);
+
+ cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
+ cbr->cbr_ds = ds;
+ cbr->cbr_func = callback;
+ cbr->cbr_arg = cbarg;
+
+ mutex_enter(&dd->dd_lock);
+ pr = dsl_prop_record_find(dd, propname);
+ if (pr == NULL)
+ pr = dsl_prop_record_create(dd, propname);
+ cbr->cbr_pr = pr;
+ list_insert_head(&pr->pr_cbs, cbr);
+ list_insert_head(&ds->ds_prop_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+
+ cbr->cbr_func(cbr->cbr_arg, value);
+ return (0);
+}
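+
+/*
+ * A minimal usage sketch for dsl_prop_register(); my_state_t and
+ * my_blksz_changed_cb() are hypothetical placeholders.
+ *
+ *    static void
+ *    my_blksz_changed_cb(void *arg, uint64_t newval)
+ *    {
+ *            ((my_state_t *)arg)->cached_blksz = newval;
+ *    }
+ *
+ *    (with the pool configuration lock held for read)
+ *    error = dsl_prop_register(ds,
+ *        zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ *        my_blksz_changed_cb, state);
+ *
+ * The callback fires once immediately with the current value and again on
+ * each change until dsl_prop_unregister() or dsl_prop_unregister_all().
+ */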
+
+int
+dsl_prop_get(const char *dsname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_hold(dsname, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ error = dsl_prop_get_ds(dmu_objset_ds(os), propname,
+ intsz, numints, buf, setpoint);
+
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+/*
+ * Get the current property value. It may have changed by the time this
+ * function returns, so it is NOT safe to follow up with
+ * dsl_prop_register() and assume that the value has not changed in
+ * between.
+ *
+ * Return 0 on success, ENOENT if ddname is invalid.
+ */
+int
+dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint)
+{
+ return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
+}
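+
+/*
+ * A minimal usage sketch; the dataset name is a placeholder:
+ *
+ *    uint64_t quota;
+ *    char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+ *    error = dsl_prop_get_integer("tank/home", "quota", &quota, setpoint);
+ *
+ * On success, setpoint names the dataset or dsl_dir the value came from,
+ * is empty for a default value, or is ZPROP_SOURCE_VAL_RECVD for a
+ * received value.
+ */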
+
+int
+dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname,
+ uint64_t *valuep)
+{
+ return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL));
+}
+
+/*
+ * Predict the effective value of the given special property if it were set
+ * with the given value and source. This is not a general purpose function.
+ * It exists only to handle the special requirements of the quota and
+ * reservation properties. The fact that these properties are non-inheritable
+ * greatly simplifies the prediction logic.
+ *
+ * Returns 0 on success, a positive error code on failure, or -1 if called
+ * with a property not handled by this function.
+ */
+int
+dsl_prop_predict(dsl_dir_t *dd, const char *propname,
+ zprop_source_t source, uint64_t value, uint64_t *newvalp)
+{
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ objset_t *mos;
+ uint64_t zapobj;
+ uint64_t version;
+ char *recvdstr;
+ int err = 0;
+
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ case ZFS_PROP_RESERVATION:
+ case ZFS_PROP_REFQUOTA:
+ case ZFS_PROP_REFRESERVATION:
+ break;
+ default:
+ return (-1);
+ }
+
+ mos = dd->dd_pool->dp_meta_objset;
+ zapobj = dsl_dir_phys(dd)->dd_props_zapobj;
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ version = spa_version(dd->dd_pool->dp_spa);
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ if (source & ZPROP_SRC_NONE)
+ source = ZPROP_SRC_NONE;
+ else if (source & ZPROP_SRC_RECEIVED)
+ source = ZPROP_SRC_LOCAL;
+ }
+
+ switch ((int)source) {
+ case ZPROP_SRC_NONE:
+ /* Revert to the received value, if any. */
+ err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp);
+ if (err == ENOENT)
+ *newvalp = 0;
+ break;
+ case ZPROP_SRC_LOCAL:
+ *newvalp = value;
+ break;
+ case ZPROP_SRC_RECEIVED:
+ /*
+ * If there's no local setting, then the new received value will
+ * be the effective value.
+ */
+ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
+ if (err == ENOENT)
+ *newvalp = value;
+ break;
+ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+ /*
+ * We're clearing the received value, so the local setting (if
+ * it exists) remains the effective value.
+ */
+ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
+ if (err == ENOENT)
+ *newvalp = 0;
+ break;
+ default:
+ panic("unexpected property source: %d", source);
+ }
+
+ kmem_strfree(recvdstr);
+
+ if (err == ENOENT)
+ return (0);
+
+ return (err);
+}
+
+/*
+ * Unregister this callback. Return 0 on success, ENOENT if ddname is
+ * invalid, or ENOMSG if no matching callback registered.
+ *
+ * NOTE: This function is no longer used internally but has been preserved
+ * to prevent breaking external consumers (Lustre, etc).
+ */
+int
+dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_cb_record_t *cbr;
+
+ mutex_enter(&dd->dd_lock);
+ for (cbr = list_head(&ds->ds_prop_cbs);
+ cbr; cbr = list_next(&ds->ds_prop_cbs, cbr)) {
+ if (cbr->cbr_ds == ds &&
+ cbr->cbr_func == callback &&
+ cbr->cbr_arg == cbarg &&
+ strcmp(cbr->cbr_pr->pr_propname, propname) == 0)
+ break;
+ }
+
+ if (cbr == NULL) {
+ mutex_exit(&dd->dd_lock);
+ return (SET_ERROR(ENOMSG));
+ }
+
+ list_remove(&ds->ds_prop_cbs, cbr);
+ list_remove(&cbr->cbr_pr->pr_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+ kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+
+ return (0);
+}
+
+/*
+ * Unregister all callbacks that are registered with the
+ * given callback argument.
+ */
+void
+dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg)
+{
+ dsl_prop_cb_record_t *cbr, *next_cbr;
+
+ dsl_dir_t *dd = ds->ds_dir;
+
+ mutex_enter(&dd->dd_lock);
+ next_cbr = list_head(&ds->ds_prop_cbs);
+ while (next_cbr != NULL) {
+ cbr = next_cbr;
+ next_cbr = list_next(&ds->ds_prop_cbs, cbr);
+ if (cbr->cbr_arg == cbarg) {
+ list_remove(&ds->ds_prop_cbs, cbr);
+ list_remove(&cbr->cbr_pr->pr_cbs, cbr);
+ kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+}
+
+boolean_t
+dsl_prop_hascb(dsl_dataset_t *ds)
+{
+ return (!list_is_empty(&ds->ds_prop_cbs));
+}
+
+/* ARGSUSED */
+static int
+dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_record_t *pr;
+ dsl_prop_cb_record_t *cbr;
+
+ mutex_enter(&dd->dd_lock);
+ for (pr = list_head(&dd->dd_props);
+ pr; pr = list_next(&dd->dd_props, pr)) {
+ for (cbr = list_head(&pr->pr_cbs); cbr;
+ cbr = list_next(&pr->pr_cbs, cbr)) {
+ uint64_t value;
+
+ /*
+ * Callback entries do not have holds on their
+ * datasets so that datasets with registered
+ * callbacks are still eligible for eviction.
+ * Unlike operations to update properties on a
+ * single dataset, we are performing a recursive
+ * descent of related head datasets. The caller
+ * of this function only has a dataset hold on
+ * the passed in head dataset, not the snapshots
+ * associated with this dataset. Without a hold,
+ * the dataset pointer within callback records
+ * for snapshots can be invalidated by eviction
+ * at any time.
+ *
+ * Use dsl_dataset_try_add_ref() to verify
+ * that the dataset for a snapshot has not
+ * begun eviction processing and to prevent
+ * eviction from occurring for the duration of
+ * the callback. If the hold attempt fails,
+ * this object is already being evicted and the
+ * callback can be safely ignored.
+ */
+ if (ds != cbr->cbr_ds &&
+ !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+ continue;
+
+ if (dsl_prop_get_ds(cbr->cbr_ds,
+ cbr->cbr_pr->pr_propname, sizeof (value), 1,
+ &value, NULL) == 0)
+ cbr->cbr_func(cbr->cbr_arg, value);
+
+ if (ds != cbr->cbr_ds)
+ dsl_dataset_rele(cbr->cbr_ds, FTAG);
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+
+ return (0);
+}
+
+/*
+ * Update all property values for ddobj & its descendants. This is used
+ * when renaming the dir.
+ */
+void
+dsl_prop_notify_all(dsl_dir_t *dd)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+ (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb,
+ NULL, DS_FIND_CHILDREN);
+}
+
+static void
+dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
+ const char *propname, uint64_t value, int first)
+{
+ dsl_dir_t *dd;
+ dsl_prop_record_t *pr;
+ dsl_prop_cb_record_t *cbr;
+ objset_t *mos = dp->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t *za;
+ int err;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+ err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
+ if (err)
+ return;
+
+ if (!first) {
+ /*
+ * If the prop is set here, then this change is not
+ * being inherited here or below; stop the recursion.
+ */
+ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ propname);
+ if (err == 0) {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+ ASSERT3U(err, ==, ENOENT);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ pr = dsl_prop_record_find(dd, propname);
+ if (pr != NULL) {
+ for (cbr = list_head(&pr->pr_cbs); cbr;
+ cbr = list_next(&pr->pr_cbs, cbr)) {
+ uint64_t propobj;
+
+ /*
+ * cbr->cbr_ds may be invalidated due to eviction,
+ * requiring the use of dsl_dataset_try_add_ref().
+ * See comment block in dsl_prop_notify_all_cb()
+ * for details.
+ */
+ if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+ continue;
+
+ propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj;
+
+ /*
+ * If the property is not set on this ds, then it is
+ * inherited here; call the callback.
+ */
+ if (propobj == 0 ||
+ zap_contains(mos, propobj, propname) != 0)
+ cbr->cbr_func(cbr->cbr_arg, value);
+
+ dsl_dataset_rele(cbr->cbr_ds, FTAG);
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, mos,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_prop_changed_notify(dp, za->za_first_integer,
+ propname, value, FALSE);
+ }
+ kmem_free(za, sizeof (zap_attribute_t));
+ zap_cursor_fini(&zc);
+ dsl_dir_rele(dd, FTAG);
+}
+
+void
+dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
+ zprop_source_t source, int intsz, int numints, const void *value,
+ dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t zapobj, intval, dummy, count;
+ int isint;
+ char valbuf[32];
+ const char *valstr = NULL;
+ char *inheritstr;
+ char *recvdstr;
+ char *tbuf = NULL;
+ int err;
+ uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+
+ isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0);
+
+ if (ds->ds_is_snapshot) {
+ ASSERT(version >= SPA_VERSION_SNAP_PROPS);
+ if (dsl_dataset_phys(ds)->ds_props_obj == 0 &&
+ (source & ZPROP_SRC_NONE) == 0) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_props_obj =
+ zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ }
+ zapobj = dsl_dataset_phys(ds)->ds_props_obj;
+ } else {
+ zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj;
+ }
+
+ /* If we are removing objects from a non-existent ZAP just return */
+ if (zapobj == 0)
+ return;
+
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ if (source & ZPROP_SRC_NONE)
+ source = ZPROP_SRC_NONE;
+ else if (source & ZPROP_SRC_RECEIVED)
+ source = ZPROP_SRC_LOCAL;
+ }
+
+ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ switch ((int)source) {
+ case ZPROP_SRC_NONE:
+ /*
+ * revert to received value, if any (inherit -S)
+ * - remove propname
+ * - remove propname$inherit
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ break;
+ case ZPROP_SRC_LOCAL:
+ /*
+ * remove propname$inherit
+ * set propname -> value
+ */
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ VERIFY0(zap_update(mos, zapobj, propname,
+ intsz, numints, value, tx));
+ break;
+ case ZPROP_SRC_INHERITED:
+ /*
+ * explicitly inherit
+ * - remove propname
+ * - set propname$inherit
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ if (version >= SPA_VERSION_RECVD_PROPS &&
+ dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) {
+ dummy = 0;
+ VERIFY0(zap_update(mos, zapobj, inheritstr,
+ 8, 1, &dummy, tx));
+ }
+ break;
+ case ZPROP_SRC_RECEIVED:
+ /*
+ * set propname$recvd -> value
+ */
+ err = zap_update(mos, zapobj, recvdstr,
+ intsz, numints, value, tx);
+ ASSERT(err == 0);
+ break;
+ case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED):
+ /*
+ * clear local and received settings
+ * - remove propname
+ * - remove propname$inherit
+ * - remove propname$recvd
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ /* FALLTHRU */
+ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+ /*
+ * remove propname$recvd
+ */
+ err = zap_remove(mos, zapobj, recvdstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ break;
+ default:
+ cmn_err(CE_PANIC, "unexpected property source: %d", source);
+ }
+
+ kmem_strfree(inheritstr);
+ kmem_strfree(recvdstr);
+
+ /*
+ * If we are left with an empty snap zap we can destroy it.
+ * This will prevent unnecessary calls to zap_lookup() in
+ * the "zfs list" and "zfs get" code paths.
+ */
+ if (ds->ds_is_snapshot &&
+ zap_count(mos, zapobj, &count) == 0 && count == 0) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_props_obj = 0;
+ zap_destroy(mos, zapobj, tx);
+ }
+
+ if (isint) {
+ VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval));
+
+ if (ds->ds_is_snapshot) {
+ dsl_prop_cb_record_t *cbr;
+ /*
+ * It's a snapshot; nothing can inherit this
+ * property, so just look for callbacks on this
+ * ds here.
+ */
+ mutex_enter(&ds->ds_dir->dd_lock);
+ for (cbr = list_head(&ds->ds_prop_cbs); cbr;
+ cbr = list_next(&ds->ds_prop_cbs, cbr)) {
+ if (strcmp(cbr->cbr_pr->pr_propname,
+ propname) == 0)
+ cbr->cbr_func(cbr->cbr_arg, intval);
+ }
+ mutex_exit(&ds->ds_dir->dd_lock);
+ } else {
+ dsl_prop_changed_notify(ds->ds_dir->dd_pool,
+ ds->ds_dir->dd_object, propname, intval, TRUE);
+ }
+
+ (void) snprintf(valbuf, sizeof (valbuf),
+ "%lld", (longlong_t)intval);
+ valstr = valbuf;
+ } else {
+ if (source == ZPROP_SRC_LOCAL) {
+ valstr = value;
+ } else {
+ tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ if (dsl_prop_get_ds(ds, propname, 1,
+ ZAP_MAXVALUELEN, tbuf, NULL) == 0)
+ valstr = tbuf;
+ }
+ }
+
+ spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE ||
+ source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx,
+ "%s=%s", propname, (valstr == NULL ? "" : valstr));
+
+ if (tbuf != NULL)
+ kmem_free(tbuf, ZAP_MAXVALUELEN);
+}
+
+int
+dsl_prop_set_int(const char *dsname, const char *propname,
+ zprop_source_t source, uint64_t value)
+{
+ nvlist_t *nvl = fnvlist_alloc();
+ int error;
+
+ fnvlist_add_uint64(nvl, propname, value);
+ error = dsl_props_set(dsname, source, nvl);
+ fnvlist_free(nvl);
+ return (error);
+}
+
+int
+dsl_prop_set_string(const char *dsname, const char *propname,
+ zprop_source_t source, const char *value)
+{
+ nvlist_t *nvl = fnvlist_alloc();
+ int error;
+
+ fnvlist_add_string(nvl, propname, value);
+ error = dsl_props_set(dsname, source, nvl);
+ fnvlist_free(nvl);
+ return (error);
+}
+
+int
+dsl_prop_inherit(const char *dsname, const char *propname,
+ zprop_source_t source)
+{
+ nvlist_t *nvl = fnvlist_alloc();
+ int error;
+
+ fnvlist_add_boolean(nvl, propname);
+ error = dsl_props_set(dsname, source, nvl);
+ fnvlist_free(nvl);
+ return (error);
+}
+
+int
+dsl_props_set_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_props_set_arg_t *dpsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ uint64_t version;
+ nvpair_t *elem = NULL;
+ int err;
+
+ err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds);
+ if (err != 0)
+ return (err);
+
+ version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+ while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) {
+ if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ char *valstr = fnvpair_value_string(elem);
+ if (strlen(valstr) >= (version <
+ SPA_VERSION_STMF_PROP ?
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(E2BIG));
+ }
+ }
+ }
+
+ if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+void
+dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source,
+ nvlist_t *props, dmu_tx_t *tx)
+{
+ nvpair_t *elem = NULL;
+
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ nvpair_t *pair = elem;
+ const char *name = nvpair_name(pair);
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ /*
+ * This usually happens when we reuse the nvlist_t data
+ * returned by the counterpart dsl_prop_get_all_impl().
+ * For instance we do this to restore the original
+ * received properties when an error occurs in the
+ * zfs_ioc_recv() codepath.
+ */
+ nvlist_t *attrs = fnvpair_value_nvlist(pair);
+ pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE);
+ }
+
+ if (nvpair_type(pair) == DATA_TYPE_STRING) {
+ const char *value = fnvpair_value_string(pair);
+ dsl_prop_set_sync_impl(ds, name,
+ source, 1, strlen(value) + 1, value, tx);
+ } else if (nvpair_type(pair) == DATA_TYPE_UINT64) {
+ uint64_t intval = fnvpair_value_uint64(pair);
+ dsl_prop_set_sync_impl(ds, name,
+ source, sizeof (intval), 1, &intval, tx);
+ } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) {
+ dsl_prop_set_sync_impl(ds, name,
+ source, 0, 0, NULL, tx);
+ } else {
+ panic("invalid nvpair type");
+ }
+ }
+}
+
+void
+dsl_props_set_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_props_set_arg_t *dpsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds));
+ dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * All-or-nothing; if any prop can't be set, nothing will be modified.
+ */
+int
+dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
+{
+ dsl_props_set_arg_t dpsa;
+ int nblks = 0;
+
+ dpsa.dpsa_dsname = dsname;
+ dpsa.dpsa_source = source;
+ dpsa.dpsa_props = props;
+
+ /*
+ * If the source includes NONE, then we will only be removing entries
+ * from the ZAP object. In that case don't check for ENOSPC.
+ */
+ if ((source & ZPROP_SRC_NONE) == 0)
+ nblks = 2 * fnvlist_num_pairs(props);
+
+ return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync,
+ &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED));
+}
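+
+/*
+ * A minimal usage sketch, setting two properties atomically; the dataset
+ * name and values are placeholders.
+ *
+ *    nvlist_t *props = fnvlist_alloc();
+ *    fnvlist_add_uint64(props, "quota", 10ULL << 30);
+ *    fnvlist_add_string(props, "compression", "lz4");
+ *    error = dsl_props_set("tank/home", ZPROP_SRC_LOCAL, props);
+ *    fnvlist_free(props);
+ */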
+
+typedef enum dsl_prop_getflags {
+ DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */
+ DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */
+ DSL_PROP_GET_LOCAL = 0x4, /* local properties */
+ DSL_PROP_GET_RECEIVED = 0x8, /* received properties */
+} dsl_prop_getflags_t;
+
+static int
+dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
+ const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err = 0;
+
+ for (zap_cursor_init(&zc, mos, propobj);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ nvlist_t *propval;
+ zfs_prop_t prop;
+ char buf[ZAP_MAXNAMELEN];
+ char *valstr;
+ const char *suffix;
+ const char *propname;
+ const char *source;
+
+ suffix = strchr(za.za_name, '$');
+
+ if (suffix == NULL) {
+ /*
+ * Skip local properties if we only want received
+ * properties.
+ */
+ if (flags & DSL_PROP_GET_RECEIVED)
+ continue;
+
+ propname = za.za_name;
+ source = setpoint;
+ } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) {
+ /* Skip explicitly inherited entries. */
+ continue;
+ } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) {
+ if (flags & DSL_PROP_GET_LOCAL)
+ continue;
+
+ (void) strncpy(buf, za.za_name, (suffix - za.za_name));
+ buf[suffix - za.za_name] = '\0';
+ propname = buf;
+
+ if (!(flags & DSL_PROP_GET_RECEIVED)) {
+ /* Skip if locally overridden. */
+ err = zap_contains(mos, propobj, propname);
+ if (err == 0)
+ continue;
+ if (err != ENOENT)
+ break;
+
+ /* Skip if explicitly inherited. */
+ valstr = kmem_asprintf("%s%s", propname,
+ ZPROP_INHERIT_SUFFIX);
+ err = zap_contains(mos, propobj, valstr);
+ kmem_strfree(valstr);
+ if (err == 0)
+ continue;
+ if (err != ENOENT)
+ break;
+ }
+
+ source = ((flags & DSL_PROP_GET_INHERITING) ?
+ setpoint : ZPROP_SOURCE_VAL_RECVD);
+ } else {
+ /*
+ * For backward compatibility, skip suffixes we don't
+ * recognize.
+ */
+ continue;
+ }
+
+ prop = zfs_name_to_prop(propname);
+
+ /* Skip non-inheritable properties. */
+ if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL &&
+ !zfs_prop_inheritable(prop))
+ continue;
+
+ /* Skip properties not valid for this type. */
+ if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL &&
+ !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT, B_FALSE))
+ continue;
+
+ /* Skip properties already defined. */
+ if (nvlist_exists(nv, propname))
+ continue;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (za.za_integer_length == 1) {
+ /*
+ * String property
+ */
+ char *tmp = kmem_alloc(za.za_num_integers,
+ KM_SLEEP);
+ err = zap_lookup(mos, propobj,
+ za.za_name, 1, za.za_num_integers, tmp);
+ if (err != 0) {
+ kmem_free(tmp, za.za_num_integers);
+ break;
+ }
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
+ tmp) == 0);
+ kmem_free(tmp, za.za_num_integers);
+ } else {
+ /*
+ * Integer property
+ */
+ ASSERT(za.za_integer_length == 8);
+ (void) nvlist_add_uint64(propval, ZPROP_VALUE,
+ za.za_first_integer);
+ }
+
+ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0);
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+ nvlist_free(propval);
+ }
+ zap_cursor_fini(&zc);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
+/*
+ * Iterate over all properties for this dataset and return them in an nvlist.
+ */
+static int
+dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
+ dsl_prop_getflags_t flags)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ int err = 0;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ if (ds->ds_is_snapshot)
+ flags |= DSL_PROP_GET_SNAPSHOT;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ if (dsl_dataset_phys(ds)->ds_props_obj != 0) {
+ ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
+ dsl_dataset_name(ds, setpoint);
+ err = dsl_prop_get_all_impl(mos,
+ dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp);
+ if (err)
+ goto out;
+ }
+
+ for (; dd != NULL; dd = dd->dd_parent) {
+ if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) {
+ if (flags & (DSL_PROP_GET_LOCAL |
+ DSL_PROP_GET_RECEIVED))
+ break;
+ flags |= DSL_PROP_GET_INHERITING;
+ }
+ dsl_dir_name(dd, setpoint);
+ err = dsl_prop_get_all_impl(mos,
+ dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp);
+ if (err)
+ break;
+ }
+
+out:
+ if (err) {
+ nvlist_free(*nvp);
+ *nvp = NULL;
+ }
+ return (err);
+}
+
+boolean_t
+dsl_prop_get_hasrecvd(const char *dsname)
+{
+ uint64_t dummy;
+
+ return (0 ==
+ dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL));
+}
+
+static int
+dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source)
+{
+ uint64_t version;
+ spa_t *spa;
+ int error = 0;
+
+ VERIFY0(spa_open(dsname, &spa, FTAG));
+ version = spa_version(spa);
+ spa_close(spa, FTAG);
+
+ if (version >= SPA_VERSION_RECVD_PROPS)
+ error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0);
+ return (error);
+}
+
+/*
+ * Call after successfully receiving properties to ensure that only the first
+ * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties.
+ */
+int
+dsl_prop_set_hasrecvd(const char *dsname)
+{
+ int error = 0;
+ if (!dsl_prop_get_hasrecvd(dsname))
+ error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL);
+ return (error);
+}
+
+void
+dsl_prop_unset_hasrecvd(const char *dsname)
+{
+ VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE));
+}
+
+int
+dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
+{
+ return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0));
+}
+
+int
+dsl_prop_get_received(const char *dsname, nvlist_t **nvp)
+{
+ objset_t *os;
+ int error;
+
+ /*
+ * Received properties are not distinguishable from local properties
+ * until the dataset has received properties on or after
+ * SPA_VERSION_RECVD_PROPS.
+ */
+ dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ?
+ DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL);
+
+ error = dmu_objset_hold(dsname, FTAG, &os);
+ if (error != 0)
+ return (error);
+ error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags);
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+void
+dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
+{
+ nvlist_t *propval;
+ const char *propname = zfs_prop_to_name(prop);
+ uint64_t default_value;
+
+ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
+ return;
+ }
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
+ /* Indicate the default source if we can. */
+ if (dodefault(prop, 8, 1, &default_value) == 0 &&
+ value == default_value) {
+ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0);
+ }
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+ nvlist_free(propval);
+}
+
+void
+dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
+{
+ nvlist_t *propval;
+ const char *propname = zfs_prop_to_name(prop);
+
+ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
+ return;
+ }
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+ nvlist_free(propval);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dsl_prop_register);
+EXPORT_SYMBOL(dsl_prop_unregister);
+EXPORT_SYMBOL(dsl_prop_unregister_all);
+EXPORT_SYMBOL(dsl_prop_get);
+EXPORT_SYMBOL(dsl_prop_get_integer);
+EXPORT_SYMBOL(dsl_prop_get_all);
+EXPORT_SYMBOL(dsl_prop_get_received);
+EXPORT_SYMBOL(dsl_prop_get_ds);
+EXPORT_SYMBOL(dsl_prop_get_int_ds);
+EXPORT_SYMBOL(dsl_prop_get_dd);
+EXPORT_SYMBOL(dsl_props_set);
+EXPORT_SYMBOL(dsl_prop_set_int);
+EXPORT_SYMBOL(dsl_prop_set_string);
+EXPORT_SYMBOL(dsl_prop_inherit);
+EXPORT_SYMBOL(dsl_prop_predict);
+EXPORT_SYMBOL(dsl_prop_nvlist_add_uint64);
+EXPORT_SYMBOL(dsl_prop_nvlist_add_string);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c
new file mode 100644
index 000000000000..40adfbcee4e1
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c
@@ -0,0 +1,4422 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2016 Gary Mills
+ * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/dsl_scan.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/range_tree.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+/*
+ * Grand theory statement on scan queue sorting
+ *
+ * Scanning is implemented by recursively traversing all indirection levels
+ * in an object and reading all blocks referenced from said objects. This
+ * results in us approximately traversing the object from lowest logical
+ * offset to the highest. For best performance, we would want the logical
+ * blocks to be physically contiguous. However, this is frequently not the
+ * case with pools given the allocation patterns of copy-on-write filesystems.
+ * So instead, we put the I/Os into a reordering queue and issue them in a
+ * way that will most benefit physical disks (LBA-order).
+ *
+ * Queue management:
+ *
+ * Ideally, we would want to scan all metadata and queue up all block I/O
+ * prior to starting to issue it, because that allows us to do an optimal
+ * sorting job. This can however consume large amounts of memory. Therefore
+ * we continuously monitor the size of the queues and constrain them to 5%
+ * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
+ * limit, we clear out a few of the largest extents at the head of the queues
+ * to make room for more scanning. Hopefully, these extents will be fairly
+ * large and contiguous, allowing us to approach sequential I/O throughput
+ * even without a fully sorted tree.
+ *
+ * Metadata scanning takes place in dsl_scan_visit(), which is called from
+ * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
+ * metadata on the pool, or we need to make room in memory because our
+ * queues are too large, dsl_scan_visit() is postponed and
+ * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
+ * that metadata scanning and queued I/O issuing are mutually exclusive. This
+ * allows us to provide maximum sequential I/O throughput for the majority of
+ * I/O's issued since sequential I/O performance is significantly negatively
+ * impacted if it is interleaved with random I/O.
+ *
+ * Implementation Notes
+ *
+ * One side effect of the queued scanning algorithm is that the scanning code
+ * needs to be notified whenever a block is freed. This is needed to allow
+ * the scanning code to remove these I/Os from the issuing queue. Additionally,
+ * we do not attempt to queue gang blocks to be issued sequentially since this
+ * is very hard to do and would have an extremely limited performance benefit.
+ * Instead, we simply issue gang I/Os as soon as we find them using the legacy
+ * algorithm.
+ *
+ * Backwards compatibility
+ *
+ * This new algorithm is backwards compatible with the legacy on-disk data
+ * structures (and therefore does not require a new feature flag).
+ * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
+ * will stop scanning metadata (in logical order) and wait for all outstanding
+ * sorted I/O to complete. Once this is done, we write out a checkpoint
+ * bookmark, indicating that we have scanned everything logically before it.
+ * If the pool is imported on a machine without the new sorting algorithm,
+ * the scan simply resumes from the last checkpoint using the legacy algorithm.
+ */
+
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
+ const zbookmark_phys_t *);
+
+static scan_cb_t dsl_scan_scrub_cb;
+
+static int scan_ds_queue_compare(const void *a, const void *b);
+static int scan_prefetch_queue_compare(const void *a, const void *b);
+static void scan_ds_queue_clear(dsl_scan_t *scn);
+static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn);
+static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
+ uint64_t *txg);
+static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
+static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
+static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
+static uint64_t dsl_scan_count_leaves(vdev_t *vd);
+
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
+/*
+ * By default zfs will check to ensure it is not over the hard memory
+ * limit before each txg. If finer-grained control of this is needed
+ * this value can be set to 1 to enable checking before scanning each
+ * block.
+ */
+int zfs_scan_strict_mem_lim = B_FALSE;
+
+/*
+ * Maximum number of bytes issued in parallel per leaf vdev. We attempt
+ * to strike a balance here between keeping the vdev queues full of I/Os
+ * at all times and not overflowing the queues, which would cause long
+ * latency and therefore long txg sync times. No matter what, we will not
+ * overload the drives with I/O, since that is protected by
+ * zfs_vdev_scrub_max_active.
+ */
+unsigned long zfs_scan_vdev_limit = 4 << 20;
+
+int zfs_scan_issue_strategy = 0;
+int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */
+unsigned long zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
+
+/*
+ * fill_weight is non-tunable at runtime, so we copy it at module init from
+ * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
+ * break queue sorting.
+ */
+int zfs_scan_fill_weight = 3;
+static uint64_t fill_weight;
+
+/* See dsl_scan_should_clear() for details on the memory limit tunables */
+uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */
+uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */
+int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */
+int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */
+
+int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
+int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+int zfs_scan_checkpoint_intval = 7200; /* in seconds */
+int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
+int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+/* max number of blocks to free in a single TXG */
+unsigned long zfs_async_block_max_blocks = ULONG_MAX;
+/* max number of dedup blocks to free in a single TXG */
+unsigned long zfs_max_async_dedup_frees = 100000;
+
+int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */
+
+/*
+ * We wait a few txgs after importing a pool to begin scanning so that
+ * the import / mounting code isn't held up by scrub / resilver IO.
+ * Unfortunately, it is a bit difficult to determine exactly how long
+ * this will take since userspace will trigger fs mounts asynchronously
+ * and the kernel will create zvol minors asynchronously. As a result,
+ * the value provided here is a bit arbitrary, but represents a
+ * reasonable estimate of how many txgs it will take to finish fully
+ * importing a pool
+ */
+#define SCAN_IMPORT_WAIT_TXGS 5
+
+#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
+/*
+ * Enable/disable the processing of the free_bpobj object.
+ */
+int zfs_free_bpobj_enabled = 1;
+
+/* the order has to match pool_scan_type */
+static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+ NULL,
+ dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
+ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
+};
+
+/* In core node for the scn->scn_queue. Represents a dataset to be scanned */
+typedef struct {
+ uint64_t sds_dsobj;
+ uint64_t sds_txg;
+ avl_node_t sds_node;
+} scan_ds_t;
+
+/*
+ * This controls what conditions are placed on dsl_scan_sync_state():
+ * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
+ * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
+ * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
+ * write out the scn_phys_cached version.
+ * See dsl_scan_sync_state for details.
+ */
+typedef enum {
+ SYNC_OPTIONAL,
+ SYNC_MANDATORY,
+ SYNC_CACHED
+} state_sync_type_t;
+
+/*
+ * This struct represents the minimum information needed to reconstruct a
+ * zio for sequential scanning. This is useful because many of these will
+ * accumulate in the sequential IO queues before being issued, so saving
+ * memory matters here.
+ */
+typedef struct scan_io {
+ /* fields from blkptr_t */
+ uint64_t sio_blk_prop;
+ uint64_t sio_phys_birth;
+ uint64_t sio_birth;
+ zio_cksum_t sio_cksum;
+ uint32_t sio_nr_dvas;
+
+ /* fields from zio_t */
+ uint32_t sio_flags;
+ zbookmark_phys_t sio_zb;
+
+ /* members for queue sorting */
+ union {
+ avl_node_t sio_addr_node; /* link into issuing queue */
+ list_node_t sio_list_node; /* link for issuing to disk */
+ } sio_nodes;
+
+ /*
+ * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
+ * depending on how many were in the original bp. Only the
+ * first DVA is really used for sorting and issuing purposes.
+ * The other DVAs (if provided) simply exist so that the zio
+ * layer can find additional copies to repair from in the
+ * event of an error. This array must go at the end of the
+ * struct to allow this for the variable number of elements.
+ */
+ dva_t sio_dva[0];
+} scan_io_t;
+
+#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
+#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
+#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0])
+#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0])
+#define SIO_GET_END_OFFSET(sio) \
+ (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
+#define SIO_GET_MUSED(sio) \
+ (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
+
+struct dsl_scan_io_queue {
+ dsl_scan_t *q_scn; /* associated dsl_scan_t */
+ vdev_t *q_vd; /* top-level vdev that this queue represents */
+
+ /* trees used for sorting I/Os and extents of I/Os */
+ range_tree_t *q_exts_by_addr;
+ zfs_btree_t q_exts_by_size;
+ avl_tree_t q_sios_by_addr;
+ uint64_t q_sio_memused;
+
+ /* members for zio rate limiting */
+ uint64_t q_maxinflight_bytes;
+ uint64_t q_inflight_bytes;
+ kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
+
+ /* per txg statistics */
+ uint64_t q_total_seg_size_this_txg;
+ uint64_t q_segs_this_txg;
+ uint64_t q_total_zio_size_this_txg;
+ uint64_t q_zios_this_txg;
+};
+
+/* private data for dsl_scan_prefetch_cb() */
+typedef struct scan_prefetch_ctx {
+ zfs_refcount_t spc_refcnt; /* refcount for memory management */
+ dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */
+ boolean_t spc_root; /* is this prefetch for an objset? */
+ uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */
+ uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */
+} scan_prefetch_ctx_t;
+
+/* private data for dsl_scan_prefetch() */
+typedef struct scan_prefetch_issue_ctx {
+ avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */
+ scan_prefetch_ctx_t *spic_spc; /* spc for the callback */
+ blkptr_t spic_bp; /* bp to prefetch */
+ zbookmark_phys_t spic_zb; /* bookmark to prefetch */
+} scan_prefetch_issue_ctx_t;
+
+static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
+static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
+ scan_io_t *sio);
+
+static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
+static void scan_io_queues_destroy(dsl_scan_t *scn);
+
+static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
+
+/* sio->sio_nr_dvas must be set so we know which cache to free from */
+static void
+sio_free(scan_io_t *sio)
+{
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
+}
+
+/* It is up to the caller to set sio->sio_nr_dvas for freeing */
+static scan_io_t *
+sio_alloc(unsigned short nr_dvas)
+{
+ ASSERT3U(nr_dvas, >, 0);
+ ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
+}
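+
+/*
+ * Illustrative example of a typical caller pattern, shown as a sketch: a
+ * block pointer with two DVAs is carried by a scan_io_t from sio_cache[1],
+ * and the memory charged to its queue is SIO_GET_MUSED(sio), i.e.
+ * sizeof (scan_io_t) + 2 * sizeof (dva_t).
+ *
+ *    scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
+ *    bp2sio(bp, sio, dva_i);
+ *    queue->q_sio_memused += SIO_GET_MUSED(sio);
+ *    ...
+ *    queue->q_sio_memused -= SIO_GET_MUSED(sio);
+ *    sio_free(sio);
+ */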
+
+void
+scan_init(void)
+{
+ /*
+ * This is used in ext_size_compare() to weight segments
+ * based on how sparse they are. This cannot be changed
+ * mid-scan and the tree comparison functions don't currently
+ * have a mechanism for passing additional context to the
+ * compare functions. Thus we store this value globally and
+ * we only allow it to be set at module initialization time
+ */
+ fill_weight = zfs_scan_fill_weight;
+
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ char name[36];
+
+ (void) snprintf(name, sizeof (name), "sio_cache_%d", i);
+ sio_cache[i] = kmem_cache_create(name,
+ (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ }
+}
+
+void
+scan_fini(void)
+{
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ kmem_cache_destroy(sio_cache[i]);
+ }
+}
+
+static inline boolean_t
+dsl_scan_is_running(const dsl_scan_t *scn)
+{
+ return (scn->scn_phys.scn_state == DSS_SCANNING);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+ return (dsl_scan_is_running(dp->dp_scan) &&
+ dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+static inline void
+sio2bp(const scan_io_t *sio, blkptr_t *bp)
+{
+ bzero(bp, sizeof (*bp));
+ bp->blk_prop = sio->sio_blk_prop;
+ bp->blk_phys_birth = sio->sio_phys_birth;
+ bp->blk_birth = sio->sio_birth;
+ bp->blk_fill = 1; /* we always only work with data pointers */
+ bp->blk_cksum = sio->sio_cksum;
+
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
+}
+
+static inline void
+bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
+{
+ sio->sio_blk_prop = bp->blk_prop;
+ sio->sio_phys_birth = bp->blk_phys_birth;
+ sio->sio_birth = bp->blk_birth;
+ sio->sio_cksum = bp->blk_cksum;
+ sio->sio_nr_dvas = BP_GET_NDVAS(bp);
+
+ /*
+ * Copy the DVAs to the sio. We need all copies of the block so
+ * that the self healing code can use the alternate copies if the
+ * first is corrupted. We want the DVA at index dva_i to be first
+ * in the sio since this is the primary one that we want to issue.
+ */
+ for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
+ sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
+ }
+}
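+
+/*
+ * Worked example: for a bp with three DVAs and dva_i == 1, the loop above
+ * stores the DVAs in the order 1, 2, 0, so the copy being scanned sorts and
+ * issues first while the remaining copies stay available to the self
+ * healing code.
+ */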
+
+int
+dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+{
+ int err;
+ dsl_scan_t *scn;
+ spa_t *spa = dp->dp_spa;
+ uint64_t f;
+
+ scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+ scn->scn_dp = dp;
+
+ /*
+ * It's possible that we're resuming a scan after a reboot so
+ * make sure that the scan_async_destroying flag is initialized
+ * appropriately.
+ */
+ ASSERT(!scn->scn_async_destroying);
+ scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_ASYNC_DESTROY);
+
+ /*
+ * Calculate the max number of in-flight bytes for pool-wide
+ * scanning operations (minimum 1MB). Limits for the issuing
+ * phase are done per top-level vdev and are handled separately.
+ */
+ scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
+ dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20);
+
+ avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
+ offsetof(scan_ds_t, sds_node));
+ avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
+ sizeof (scan_prefetch_issue_ctx_t),
+ offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_func", sizeof (uint64_t), 1, &f);
+ if (err == 0) {
+ /*
+ * There was an old-style scrub in progress. Restart a
+ * new-style scrub from the beginning.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("old-style scrub was in progress; "
+ "restarting new-style scrub in txg %llu",
+ (longlong_t)scn->scn_restart_txg);
+
+ /*
+ * Load the queue obj from the old location so that it
+ * can be freed by dsl_scan_done().
+ */
+ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_queue", sizeof (uint64_t), 1,
+ &scn->scn_phys.scn_queue_obj);
+ } else {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys);
+ /*
+ * Detect if the pool contains the signature of #2094. If it
+ * does properly update the scn->scn_phys structure and notify
+ * the administrator by setting an errata for the pool.
+ */
+ if (err == EOVERFLOW) {
+ uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1];
+ VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24);
+ VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==,
+ (23 * sizeof (uint64_t)));
+
+ err = zap_lookup(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN,
+ sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp);
+ if (err == 0) {
+ uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS];
+
+ if (overflow & ~DSL_SCAN_FLAGS_MASK ||
+ scn->scn_async_destroying) {
+ spa->spa_errata =
+ ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY;
+ return (EOVERFLOW);
+ }
+
+ bcopy(zaptmp, &scn->scn_phys,
+ SCAN_PHYS_NUMINTS * sizeof (uint64_t));
+ scn->scn_phys.scn_flags = overflow;
+
+ /* Required scrub already in progress. */
+ if (scn->scn_phys.scn_state == DSS_FINISHED ||
+ scn->scn_phys.scn_state == DSS_CANCELED)
+ spa->spa_errata =
+ ZPOOL_ERRATA_ZOL_2094_SCRUB;
+ }
+ }
+
+ if (err == ENOENT)
+ return (0);
+ else if (err)
+ return (err);
+
+ /*
+ * We might be restarting after a reboot, so jump the issued
+ * counter to how far we've scanned. We know we're consistent
+ * up to here.
+ */
+ scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
+
+ if (dsl_scan_is_running(scn) &&
+ spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+ /*
+ * A new-type scrub was in progress on an old
+ * pool, and the pool was accessed by old
+ * software. Restart from the beginning, since
+ * the old software may have changed the pool in
+ * the meantime.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("new-style scrub was modified "
+ "by old software; restarting in txg %llu",
+ (longlong_t)scn->scn_restart_txg);
+ } else if (dsl_scan_resilvering(dp)) {
+ /*
+ * If a resilver is in progress and there are already
+ * errors, restart it instead of finishing this scan and
+ * then restarting it. If there haven't been any errors
+ * then remember that the incore DTL is valid.
+ */
+ if (scn->scn_phys.scn_errors > 0) {
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("resilver can't excise DTL_MISSING "
+ "when finished; restarting in txg %llu",
+ (u_longlong_t)scn->scn_restart_txg);
+ } else {
+ /* it's safe to excise DTL when finished */
+ spa->spa_scrub_started = B_TRUE;
+ }
+ }
+ }
+
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
+ /* reload the queue into the in-core state */
+ if (scn->scn_phys.scn_queue_obj != 0) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ scan_ds_queue_insert(scn,
+ zfs_strtonum(za.za_name, NULL),
+ za.za_first_integer);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ spa_scan_stat_init(spa);
+ return (0);
+}
+
+void
+dsl_scan_fini(dsl_pool_t *dp)
+{
+ if (dp->dp_scan != NULL) {
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (scn->scn_taskq != NULL)
+ taskq_destroy(scn->scn_taskq);
+
+ scan_ds_queue_clear(scn);
+ avl_destroy(&scn->scn_queue);
+ scan_ds_prefetch_queue_clear(scn);
+ avl_destroy(&scn->scn_prefetch_queue);
+
+ kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+ dp->dp_scan = NULL;
+ }
+}
+
+static boolean_t
+dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ return (scn->scn_restart_txg != 0 &&
+ scn->scn_restart_txg <= tx->tx_txg);
+}
+
+boolean_t
+dsl_scan_resilver_scheduled(dsl_pool_t *dp)
+{
+ return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) ||
+ (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER));
+}
+
+boolean_t
+dsl_scan_scrubbing(const dsl_pool_t *dp)
+{
+ dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys;
+
+ return (scn_phys->scn_state == DSS_SCANNING &&
+ scn_phys->scn_func == POOL_SCAN_SCRUB);
+}
+
+boolean_t
+dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
+{
+ return (dsl_scan_scrubbing(scn->scn_dp) &&
+ scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
+}
+
+/*
+ * Writes out a persistent dsl_scan_phys_t record to the pool directory.
+ * Because we can be running in the block sorting algorithm, we do not always
+ * want to write out the record, only when it is "safe" to do so. This safety
+ * condition is achieved by making sure that the sorting queues are empty
+ * (scn_bytes_pending == 0). When this condition is not true, the sync'd state
+ * is inconsistent with how much actual scanning progress has been made. The
+ * kind of sync to be performed is specified by the sync_type argument. If the
+ * sync is optional, we only sync if the queues are empty. If the sync is
+ * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
+ * third possible state is a "cached" sync. This is done in response to:
+ * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ * destroyed, so we wouldn't be able to restart scanning from it.
+ * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
+ * superseded by a newer snapshot.
+ * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ * swapped with its clone.
+ * In all cases, a cached sync simply rewrites the last record we've written,
+ * just slightly modified. For the modifications that are performed to the
+ * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
+ * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
+ */
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
+{
+ int i;
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0);
+ if (scn->scn_bytes_pending == 0) {
+ for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+ dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
+
+ if (q == NULL)
+ continue;
+
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
+ ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==,
+ NULL);
+ ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ if (scn->scn_phys.scn_queue_obj != 0)
+ scan_ds_queue_sync(scn, tx);
+ VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys, tx));
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached,
+ sizeof (scn->scn_phys));
+
+ if (scn->scn_checkpointing)
+ zfs_dbgmsg("finish scan checkpoint");
+
+ scn->scn_checkpointing = B_FALSE;
+ scn->scn_last_checkpoint = ddi_get_lbolt();
+ } else if (sync_type == SYNC_CACHED) {
+ VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys_cached, tx));
+ }
+}
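+
+ /*
+ * Summary of the sync types handled above, as implemented by
+ * dsl_scan_sync_state():
+ *
+ * SYNC_OPTIONAL - write scn_phys only if scn_bytes_pending == 0
+ * SYNC_MANDATORY - assert scn_bytes_pending == 0, then write scn_phys
+ * SYNC_CACHED - as SYNC_OPTIONAL, but if the queues are not empty,
+ * rewrite the last-synced scn_phys_cached instead
+ */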
+
+/* ARGSUSED */
+static int
+dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+
+ if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd))
+ return (SET_ERROR(EBUSY));
+
+ return (0);
+}
+
+void
+dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+ pool_scan_func_t *funcp = arg;
+ dmu_object_type_t ot = 0;
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(!dsl_scan_is_running(scn));
+ ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+ bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+ scn->scn_phys.scn_func = *funcp;
+ scn->scn_phys.scn_state = DSS_SCANNING;
+ scn->scn_phys.scn_min_txg = 0;
+ scn->scn_phys.scn_max_txg = tx->tx_txg;
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+ scn->scn_phys.scn_start_time = gethrestime_sec();
+ scn->scn_phys.scn_errors = 0;
+ scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+ scn->scn_issued_before_pass = 0;
+ scn->scn_restart_txg = 0;
+ scn->scn_done_txg = 0;
+ scn->scn_last_checkpoint = 0;
+ scn->scn_checkpointing = B_FALSE;
+ spa_scan_stat_init(spa);
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+
+ /* rewrite all disk labels */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ if (vdev_resilver_needed(spa->spa_root_vdev,
+ &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+ nvlist_t *aux = fnvlist_alloc();
+ fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
+ "healing");
+ spa_event_notify(spa, NULL, aux,
+ ESC_ZFS_RESILVER_START);
+ nvlist_free(aux);
+ } else {
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
+ }
+
+ spa->spa_scrub_started = B_TRUE;
+ /*
+ * If this is an incremental scrub, limit the DDT scrub phase
+ * to just the auto-ditto class (for correctness); the rest
+ * of the scrub should go faster using top-down pruning.
+ */
+ if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+
+ /*
+ * When starting a resilver, clear any existing rebuild state.
+ * This is required to prevent stale rebuild status from
+ * being reported when a rebuild is run, then a resilver, and
+ * finally a scrub, in which case only the scrub status
+ * should be reported by 'zpool status'.
+ */
+ if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+ vdev_rebuild_clear_sync(
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ }
+ }
+ }
+
+ /* back to the generic stuff */
+
+ if (dp->dp_blkstats == NULL) {
+ dp->dp_blkstats =
+ vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+ mutex_init(&dp->dp_blkstats->zab_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
+ bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type));
+
+ if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+ ot = DMU_OT_ZAP_OTHER;
+
+ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+ ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
+ dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
+
+ spa_history_log_internal(spa, "scan setup", tx,
+ "func=%u mintxg=%llu maxtxg=%llu",
+ *funcp, (u_longlong_t)scn->scn_phys.scn_min_txg,
+ (u_longlong_t)scn->scn_phys.scn_max_txg);
+}
+
+/*
+ * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
+ * Can also be called to resume a paused scrub.
+ */
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ /*
+ * Purge all vdev caches and probe all devices. We do this here
+ * rather than in sync context because this requires a writer lock
+ * on the spa_config lock, which we can't do from sync context. The
+ * spa_scrub_reopen flag indicates that vdev_open() should not
+ * attempt to start another scrub.
+ */
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa->spa_scrub_reopen = B_TRUE;
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+
+ if (func == POOL_SCAN_RESILVER) {
+ dsl_scan_restart_resilver(spa->spa_dsl_pool, 0);
+ return (0);
+ }
+
+ if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
+ /* got scrub start cmd, resume paused scrub */
+ int err = dsl_scrub_set_pause_resume(scn->scn_dp,
+ POOL_SCRUB_NORMAL);
+ if (err == 0) {
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
+ return (SET_ERROR(ECANCELED));
+ }
+
+ return (SET_ERROR(err));
+ }
+
+ return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
+ dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
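+
+ /*
+ * Typical flow, sketched from the code above: the ZFS_IOC_POOL_SCAN
+ * ioctl calls dsl_scan(dp, POOL_SCAN_SCRUB), which reopens the vdevs
+ * and then dispatches dsl_scan_setup_check()/dsl_scan_setup_sync() as
+ * a sync task; issuing the same request while a scrub is paused
+ * instead resumes it via dsl_scrub_set_pause_resume(POOL_SCRUB_NORMAL).
+ */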
+
+/* ARGSUSED */
+static void
+dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+ static const char *old_names[] = {
+ "scrub_bookmark",
+ "scrub_ddt_bookmark",
+ "scrub_ddt_class_max",
+ "scrub_queue",
+ "scrub_min_txg",
+ "scrub_max_txg",
+ "scrub_func",
+ "scrub_errors",
+ NULL
+ };
+
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+ int i;
+
+ /* Remove any remnants of an old-style scrub. */
+ for (i = 0; old_names[i]; i++) {
+ (void) zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+ }
+
+ if (scn->scn_phys.scn_queue_obj != 0) {
+ VERIFY0(dmu_object_free(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, tx));
+ scn->scn_phys.scn_queue_obj = 0;
+ }
+ scan_ds_queue_clear(scn);
+ scan_ds_prefetch_queue_clear(scn);
+
+ scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
+
+ /*
+ * If we were "restarted" from a stopped state, don't bother
+ * with anything else.
+ */
+ if (!dsl_scan_is_running(scn)) {
+ ASSERT(!scn->scn_is_sorted);
+ return;
+ }
+
+ if (scn->scn_is_sorted) {
+ scan_io_queues_destroy(scn);
+ scn->scn_is_sorted = B_FALSE;
+
+ if (scn->scn_taskq != NULL) {
+ taskq_destroy(scn->scn_taskq);
+ scn->scn_taskq = NULL;
+ }
+ }
+
+ scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED;
+
+ spa_notify_waiters(spa);
+
+ if (dsl_scan_restarting(scn, tx))
+ spa_history_log_internal(spa, "scan aborted, restarting", tx,
+ "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+ else if (!complete)
+ spa_history_log_internal(spa, "scan cancelled", tx,
+ "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+ else
+ spa_history_log_internal(spa, "scan done", tx,
+ "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ spa->spa_scrub_active = B_FALSE;
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to
+ * reflect this. Whether it succeeded or not, vacate
+ * all temporary scrub DTLs.
+ *
+ * As the scrub does not currently support traversing
+ * data that have been freed but are part of a checkpoint,
+ * we don't mark the scrub as done in the DTLs as faults
+ * may still exist in those vdevs.
+ */
+ if (complete &&
+ !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);
+
+ if (scn->scn_phys.scn_min_txg) {
+ nvlist_t *aux = fnvlist_alloc();
+ fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
+ "healing");
+ spa_event_notify(spa, NULL, aux,
+ ESC_ZFS_RESILVER_FINISH);
+ nvlist_free(aux);
+ } else {
+ spa_event_notify(spa, NULL, NULL,
+ ESC_ZFS_SCRUB_FINISH);
+ }
+ } else {
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ 0, B_TRUE, B_FALSE);
+ }
+ spa_errlog_rotate(spa);
+
+ /*
+ * Don't clear the flag until after vdev_dtl_reassess to ensure that
+ * DTL_MISSING will get updated when possible.
+ */
+ spa->spa_scrub_started = B_FALSE;
+
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+
+ /*
+ * Clear any resilver_deferred flags in the config.
+ * If there are drives that need resilvering, kick
+ * off an asynchronous request to start resilver.
+ * vdev_clear_resilver_deferred() may update the config
+ * before the resilver can restart. In the event of
+ * a crash during this period, the spa loading code
+ * will find the drives that need to be resilvered
+ * and start the resilver then.
+ */
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) &&
+ vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
+ spa_history_log_internal(spa,
+ "starting deferred resilver", tx, "errors=%llu",
+ (u_longlong_t)spa_get_errlog_size(spa));
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
+ }
+
+ scn->scn_phys.scn_end_time = gethrestime_sec();
+
+ if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
+ spa->spa_errata = 0;
+
+ ASSERT(!dsl_scan_is_running(scn));
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+ if (!dsl_scan_is_running(scn))
+ return (SET_ERROR(ENOENT));
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+ dsl_scan_done(scn, B_FALSE, tx);
+ dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
+ spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
+}
+
+int
+dsl_scan_cancel(dsl_pool_t *dp)
+{
+ return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
+ dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
+}
+
+static int
+dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
+{
+ pool_scrub_cmd_t *cmd = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (*cmd == POOL_SCRUB_PAUSE) {
+ /* can't pause a scrub when there is no in-progress scrub */
+ if (!dsl_scan_scrubbing(dp))
+ return (SET_ERROR(ENOENT));
+
+ /* can't pause a paused scrub */
+ if (dsl_scan_is_paused_scrub(scn))
+ return (SET_ERROR(EBUSY));
+ } else if (*cmd != POOL_SCRUB_NORMAL) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ return (0);
+}
+
+static void
+dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
+{
+ pool_scrub_cmd_t *cmd = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (*cmd == POOL_SCRUB_PAUSE) {
+ /* the check callback verified an in-progress scrub; record the pause */
+ spa->spa_scan_pass_scrub_pause = gethrestime_sec();
+ scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
+ scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
+ spa_notify_waiters(spa);
+ } else {
+ ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
+ if (dsl_scan_is_paused_scrub(scn)) {
+ /*
+ * We need to keep track of how much time we spend
+ * paused per pass so that we can adjust the scrub rate
+ * shown in the output of 'zpool status'.
+ */
+ spa->spa_scan_pass_scrub_spent_paused +=
+ gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
+ spa->spa_scan_pass_scrub_pause = 0;
+ scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
+ scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+ }
+ }
+}
+
+/*
+ * Set scrub pause/resume state if it makes sense to do so
+ */
+int
+dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
+{
+ return (dsl_sync_task(spa_name(dp->dp_spa),
+ dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
+ ZFS_SPACE_CHECK_RESERVED));
+}
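+
+ /*
+ * Minimal usage sketch for the pause/resume entry point above
+ * (hypothetical caller, error handling elided):
+ *
+ * (void) dsl_scrub_set_pause_resume(dp, POOL_SCRUB_PAUSE);
+ * ...
+ * (void) dsl_scrub_set_pause_resume(dp, POOL_SCRUB_NORMAL);
+ *
+ * The check function rejects a pause when no scrub is running (ENOENT)
+ * or when the scrub is already paused (EBUSY).
+ */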
+
+
+/* start a new scan, or restart an existing one. */
+void
+dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg)
+{
+ if (txg == 0) {
+ dmu_tx_t *tx;
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+ txg = dmu_tx_get_txg(tx);
+ dp->dp_scan->scn_restart_txg = txg;
+ dmu_tx_commit(tx);
+ } else {
+ dp->dp_scan->scn_restart_txg = txg;
+ }
+ zfs_dbgmsg("restarting resilver txg=%llu", (longlong_t)txg);
+}
+
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+{
+ zio_free(dp->dp_spa, txg, bp);
+}
+
+void
+dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+{
+ ASSERT(dsl_pool_sync_context(dp));
+ zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+}
+
+static int
+scan_ds_queue_compare(const void *a, const void *b)
+{
+ const scan_ds_t *sds_a = a, *sds_b = b;
+
+ if (sds_a->sds_dsobj < sds_b->sds_dsobj)
+ return (-1);
+ if (sds_a->sds_dsobj == sds_b->sds_dsobj)
+ return (0);
+ return (1);
+}
+
+static void
+scan_ds_queue_clear(dsl_scan_t *scn)
+{
+ void *cookie = NULL;
+ scan_ds_t *sds;
+ while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) {
+ kmem_free(sds, sizeof (*sds));
+ }
+}
+
+static boolean_t
+scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
+{
+ scan_ds_t srch, *sds;
+
+ srch.sds_dsobj = dsobj;
+ sds = avl_find(&scn->scn_queue, &srch, NULL);
+ if (sds != NULL && txg != NULL)
+ *txg = sds->sds_txg;
+ return (sds != NULL);
+}
+
+static void
+scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
+{
+ scan_ds_t *sds;
+ avl_index_t where;
+
+ sds = kmem_zalloc(sizeof (*sds), KM_SLEEP);
+ sds->sds_dsobj = dsobj;
+ sds->sds_txg = txg;
+
+ VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL);
+ avl_insert(&scn->scn_queue, sds, where);
+}
+
+static void
+scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
+{
+ scan_ds_t srch, *sds;
+
+ srch.sds_dsobj = dsobj;
+
+ sds = avl_find(&scn->scn_queue, &srch, NULL);
+ VERIFY(sds != NULL);
+ avl_remove(&scn->scn_queue, sds);
+ kmem_free(sds, sizeof (*sds));
+}
+
+static void
+scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+ dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
+ DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
+
+ ASSERT0(scn->scn_bytes_pending);
+ ASSERT(scn->scn_phys.scn_queue_obj != 0);
+
+ VERIFY0(dmu_object_free(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, tx));
+ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot,
+ DMU_OT_NONE, 0, tx);
+ for (scan_ds_t *sds = avl_first(&scn->scn_queue);
+ sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) {
+ VERIFY0(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, sds->sds_dsobj,
+ sds->sds_txg, tx));
+ }
+}
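+
+ /*
+ * The helpers above keep an in-memory AVL of (dsobj, txg) pairs that
+ * mirrors the on-disk DMU_POOL_SCAN queue ZAP. For example, a dataset
+ * is enqueued with scan_ds_queue_insert(scn, dsobj, txg), tested with
+ * scan_ds_queue_contains(), and the whole AVL is rewritten to the ZAP
+ * by scan_ds_queue_sync() when the scan state is synced out.
+ */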
+
+/*
+ * Computes the memory limit state that we're currently in. A sorted scan
+ * needs quite a bit of memory to hold the sorting queue, so we need to
+ * reasonably constrain the size so it doesn't impact overall system
+ * performance. We compute two limits:
+ * 1) Hard memory limit: if the amount of memory used by the sorting
+ * queues on a pool gets above this value, we stop the metadata
+ * scanning portion and start issuing the queued up and sorted
+ * I/Os to reduce memory usage.
+ * This limit is calculated as a fraction of physmem (by default 5%).
+ * We constrain the lower bound of the hard limit to an absolute
+ * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
+ * the upper bound to 5% of the total pool size - no chance we'll
+ * ever need that much memory, but just to keep the value in check.
+ * 2) Soft memory limit: once we hit the hard memory limit, we start
+ * issuing I/O to reduce queue memory usage, but we don't want to
+ * completely empty out the queues, since we might be able to find I/Os
+ * that will fill in the gaps of our non-sequential IOs at some point
+ * in the future. So once the hard limit is hit we keep issuing I/O
+ * until the amount of memory used drops below the soft limit, at which
+ * point we stop issuing I/O and go back to scanning metadata.
+ *
+ * This limit is calculated by subtracting a fraction of the hard
+ * limit from the hard limit. By default this fraction is 5%, so
+ * the soft limit is 95% of the hard limit. We cap the size of the
+ * difference between the hard and soft limits at an absolute
+ * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
+ * sufficient to not cause too frequent switching between the
+ * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
+ * worth of queues is about 1.2 GiB of on-pool data, so scanning
+ * that should take at least a decent fraction of a second).
+ */
+static boolean_t
+dsl_scan_should_clear(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+ uint64_t alloc, mlim_hard, mlim_soft, mused;
+
+ alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ alloc += metaslab_class_get_alloc(spa_special_class(spa));
+ alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+
+ mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
+ zfs_scan_mem_lim_min);
+ mlim_hard = MIN(mlim_hard, alloc / 20);
+ mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
+ zfs_scan_mem_lim_soft_max);
+ mused = 0;
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *tvd = rvd->vdev_child[i];
+ dsl_scan_io_queue_t *queue;
+
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+ queue = tvd->vdev_scan_io_queue;
+ if (queue != NULL) {
+ /* # extents in exts_by_size = # in exts_by_addr */
+ mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
+ sizeof (range_seg_gap_t) + queue->q_sio_memused;
+ }
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ }
+
+ dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
+
+ if (mused == 0)
+ ASSERT0(scn->scn_bytes_pending);
+
+ /*
+ * If we are above our hard limit, we need to clear out memory.
+ * If we are below our soft limit, we need to accumulate sequential IOs.
+ * Otherwise, we should keep doing whatever we are currently doing.
+ */
+ if (mused >= mlim_hard)
+ return (B_TRUE);
+ else if (mused < mlim_soft)
+ return (B_FALSE);
+ else
+ return (scn->scn_clearing);
+}
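+
+ /*
+ * Worked example, assuming the defaults described above (5% of physmem,
+ * 16 MiB minimum, 5% soft fraction, 128 MiB soft cap) on a hypothetical
+ * system with 32 GiB of RAM and 10 TiB allocated in the pool:
+ *
+ * mlim_hard = MAX(32 GiB / 20, 16 MiB) = 1.6 GiB
+ * mlim_hard = MIN(1.6 GiB, 10 TiB / 20) = 1.6 GiB
+ * mlim_soft = 1.6 GiB - MIN(1.6 GiB / 20, 128 MiB)
+ * = 1.6 GiB - 81.92 MiB ~= 1.52 GiB
+ *
+ * so queue memory above 1.6 GiB forces I/O issuing, and issuing stops
+ * again once usage falls below roughly 1.52 GiB.
+ */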
+
+static boolean_t
+dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
+{
+ /* we never skip user/group accounting objects */
+ if (zb && (int64_t)zb->zb_object < 0)
+ return (B_FALSE);
+
+ if (scn->scn_suspending)
+ return (B_TRUE); /* we're already suspending */
+
+ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
+ return (B_FALSE); /* we're resuming */
+
+ /* We only know how to resume from level-0 and objset blocks. */
+ if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL))
+ return (B_FALSE);
+
+ /*
+ * We suspend if:
+ * - we have scanned for at least the minimum time (default 1 sec
+ * for scrub, 3 sec for resilver), and either we have sufficient
+ * dirty data that we are starting to write more quickly
+ * (default 30%), someone is explicitly waiting for this txg
+ * to complete, or we have used up all of the time in the txg
+ * timeout (default 5 sec).
+ * or
+ * - the spa is shutting down because this pool is being exported
+ * or the machine is rebooting.
+ * or
+ * - the scan queue has reached its memory use limit
+ */
+ uint64_t curr_time_ns = gethrtime();
+ uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
+ uint64_t sync_time_ns = curr_time_ns -
+ scn->scn_dp->dp_spa->spa_sync_starttime;
+ int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+ if ((NSEC2MSEC(scan_time_ns) > mintime &&
+ (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+ txg_sync_waiting(scn->scn_dp) ||
+ NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa) ||
+ (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
+ if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
+ dprintf("suspending at first available bookmark "
+ "%llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+ zb->zb_objset, 0, 0, 0);
+ } else if (zb != NULL) {
+ dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ scn->scn_phys.scn_bookmark = *zb;
+ } else {
+#ifdef ZFS_DEBUG
+ dsl_scan_phys_t *scnp = &scn->scn_phys;
+ dprintf("suspending at at DDT bookmark "
+ "%llx/%llx/%llx/%llx\n",
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
+#endif
+ }
+ scn->scn_suspending = B_TRUE;
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+typedef struct zil_scan_arg {
+ dsl_pool_t *zsa_dp;
+ zil_header_t *zsa_zh;
+} zil_scan_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
+ uint64_t claim_txg)
+{
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ zbookmark_phys_t zb;
+
+ ASSERT(!BP_IS_REDACTED(bp));
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+ * One block ("stubby") may have been allocated a long time ago; we
+ * want to visit that one because it has been allocated
+ * (on-disk) even if it hasn't been claimed (even though for
+ * scrub there's nothing to do to it).
+ */
+ if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
+ uint64_t claim_txg)
+{
+ if (lrc->lrc_txtype == TX_WRITE) {
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ const lr_write_t *lr = (const lr_write_t *)lrc;
+ const blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_phys_t zb;
+
+ ASSERT(!BP_IS_REDACTED(bp));
+ if (BP_IS_HOLE(bp) ||
+ bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+ * birth can be < claim_txg if this record's txg is
+ * already txg sync'ed (but this log block contains
+ * other records that are not synced)
+ */
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ }
+ return (0);
+}
+
+static void
+dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+ uint64_t claim_txg = zh->zh_claim_txg;
+ zil_scan_arg_t zsa = { dp, zh };
+ zilog_t *zilog;
+
+ ASSERT(spa_writeable(dp->dp_spa));
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed (or, in read-only mode, blocks that *would* be claimed).
+ */
+ if (claim_txg == 0)
+ return;
+
+ zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+ (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+ claim_txg, B_FALSE);
+
+ zil_free(zilog);
+}
+
+/*
+ * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea
+ * here is to sort the AVL tree by the order each block will be needed.
+ */
+static int
+scan_prefetch_queue_compare(const void *a, const void *b)
+{
+ const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b;
+ const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc;
+ const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc;
+
+ return (zbookmark_compare(spc_a->spc_datablkszsec,
+ spc_a->spc_indblkshift, spc_b->spc_datablkszsec,
+ spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb));
+}
+
+static void
+scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
+{
+ if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) {
+ zfs_refcount_destroy(&spc->spc_refcnt);
+ kmem_free(spc, sizeof (scan_prefetch_ctx_t));
+ }
+}
+
+static scan_prefetch_ctx_t *
+scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
+{
+ scan_prefetch_ctx_t *spc;
+
+ spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
+ zfs_refcount_create(&spc->spc_refcnt);
+ zfs_refcount_add(&spc->spc_refcnt, tag);
+ spc->spc_scn = scn;
+ if (dnp != NULL) {
+ spc->spc_datablkszsec = dnp->dn_datablkszsec;
+ spc->spc_indblkshift = dnp->dn_indblkshift;
+ spc->spc_root = B_FALSE;
+ } else {
+ spc->spc_datablkszsec = 0;
+ spc->spc_indblkshift = 0;
+ spc->spc_root = B_TRUE;
+ }
+
+ return (spc);
+}
+
+static void
+scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag)
+{
+ zfs_refcount_add(&spc->spc_refcnt, tag);
+}
+
+static void
+scan_ds_prefetch_queue_clear(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ void *cookie = NULL;
+ scan_prefetch_issue_ctx_t *spic = NULL;
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while ((spic = avl_destroy_nodes(&scn->scn_prefetch_queue,
+ &cookie)) != NULL) {
+ scan_prefetch_ctx_rele(spic->spic_spc, scn);
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static boolean_t
+dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
+ const zbookmark_phys_t *zb)
+{
+ zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark;
+ dnode_phys_t tmp_dnp;
+ dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp;
+
+ if (zb->zb_objset != last_zb->zb_objset)
+ return (B_TRUE);
+ if ((int64_t)zb->zb_object < 0)
+ return (B_FALSE);
+
+ tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec;
+ tmp_dnp.dn_indblkshift = spc->spc_indblkshift;
+
+ if (zbookmark_subtree_completed(dnp, zb, last_zb))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static void
+dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
+{
+ avl_index_t idx;
+ dsl_scan_t *scn = spc->spc_scn;
+ spa_t *spa = scn->scn_dp->dp_spa;
+ scan_prefetch_issue_ctx_t *spic;
+
+ if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp))
+ return;
+
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
+ BP_GET_TYPE(bp) != DMU_OT_OBJSET))
+ return;
+
+ if (dsl_scan_check_prefetch_resume(spc, zb))
+ return;
+
+ scan_prefetch_ctx_add_ref(spc, scn);
+ spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
+ spic->spic_spc = spc;
+ spic->spic_bp = *bp;
+ spic->spic_zb = *zb;
+
+ /*
+ * Add the IO to the queue of blocks to prefetch. This allows us to
+ * prioritize blocks that we will need first for the main traversal
+ * thread.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) {
+ /* this block is already queued for prefetch */
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ scan_prefetch_ctx_rele(spc, scn);
+ mutex_exit(&spa->spa_scrub_lock);
+ return;
+ }
+
+ avl_insert(&scn->scn_prefetch_queue, spic, idx);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static void
+dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
+ uint64_t objset, uint64_t object)
+{
+ int i;
+ zbookmark_phys_t zb;
+ scan_prefetch_ctx_t *spc;
+
+ if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ return;
+
+ SET_BOOKMARK(&zb, objset, object, 0, 0);
+
+ spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
+
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]);
+ zb.zb_blkid = i;
+ dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zb.zb_level = 0;
+ zb.zb_blkid = DMU_SPILL_BLKID;
+ dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb);
+ }
+
+ scan_prefetch_ctx_rele(spc, FTAG);
+}
+
+static void
+dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *private)
+{
+ scan_prefetch_ctx_t *spc = private;
+ dsl_scan_t *scn = spc->spc_scn;
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ /* broadcast that the IO has completed for rate limiting purposes */
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+ spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /* if there was an error or we are done prefetching, just cleanup */
+ if (buf == NULL || scn->scn_prefetch_stop)
+ goto out;
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ zbookmark_phys_t czb;
+
+ for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1, zb->zb_blkid * epb + i);
+ dsl_scan_prefetch(spc, cbp, &czb);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ dnode_phys_t *cdnp;
+ int i;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
+ dsl_scan_prefetch_dnode(scn, cdnp,
+ zb->zb_objset, zb->zb_blkid * epb + i);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ objset_phys_t *osp = buf->b_data;
+
+ dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
+ zb->zb_objset, DMU_META_DNODE_OBJECT);
+
+ if (OBJSET_BUF_HAS_USERUSED(buf)) {
+ dsl_scan_prefetch_dnode(scn,
+ &osp->os_groupused_dnode, zb->zb_objset,
+ DMU_GROUPUSED_OBJECT);
+ dsl_scan_prefetch_dnode(scn,
+ &osp->os_userused_dnode, zb->zb_objset,
+ DMU_USERUSED_OBJECT);
+ }
+ }
+
+out:
+ if (buf != NULL)
+ arc_buf_destroy(buf, private);
+ scan_prefetch_ctx_rele(spc, scn);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_prefetch_thread(void *arg)
+{
+ dsl_scan_t *scn = arg;
+ spa_t *spa = scn->scn_dp->dp_spa;
+ scan_prefetch_issue_ctx_t *spic;
+
+ /* loop until we are told to stop */
+ while (!scn->scn_prefetch_stop) {
+ arc_flags_t flags = ARC_FLAG_NOWAIT |
+ ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * Wait until we have an IO to issue and are not above our
+ * maximum in flight limit.
+ */
+ while (!scn->scn_prefetch_stop &&
+ (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
+ spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ }
+
+ /* recheck if we should stop since we waited for the cv */
+ if (scn->scn_prefetch_stop) {
+ mutex_exit(&spa->spa_scrub_lock);
+ break;
+ }
+
+ /* remove the prefetch IO from the tree */
+ spic = avl_first(&scn->scn_prefetch_queue);
+ spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
+ avl_remove(&scn->scn_prefetch_queue, spic);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ if (BP_IS_PROTECTED(&spic->spic_bp)) {
+ ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE ||
+ BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET);
+ ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0);
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+
+ /* issue the prefetch asynchronously */
+ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
+
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ }
+
+ ASSERT(scn->scn_prefetch_stop);
+
+ /* free any prefetches we didn't get to complete */
+ mutex_enter(&spa->spa_scrub_lock);
+ while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
+ avl_remove(&scn->scn_prefetch_queue, spic);
+ scan_prefetch_ctx_rele(spic->spic_spc, scn);
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ }
+ ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
+ mutex_exit(&spa->spa_scrub_lock);
+}
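+
+ /*
+ * To summarize the prefetch machinery above: the main traversal
+ * enqueues upcoming blocks with dsl_scan_prefetch(), sorted by bookmark
+ * so the block needed soonest sits at the head of scn_prefetch_queue;
+ * this thread pulls entries off the head (bounded by
+ * scn_maxinflight_bytes) and issues them with arc_read(), whose
+ * callback dsl_scan_prefetch_cb() in turn enqueues the children of
+ * indirect, dnode and objset blocks.
+ */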
+
+static boolean_t
+dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+ const zbookmark_phys_t *zb)
+{
+ /*
+ * We never skip over user/group accounting objects (obj<0)
+ */
+ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
+ (int64_t)zb->zb_object >= 0) {
+ /*
+ * If we already visited this bp & everything below (in
+ * a prior txg sync), don't bother doing it again.
+ */
+ if (zbookmark_subtree_completed(dnp, zb,
+ &scn->scn_phys.scn_bookmark))
+ return (B_TRUE);
+
+ /*
+ * If we found the block we're trying to resume from, or
+ * we went past it to a different object, zero it out to
+ * indicate that it's OK to start checking for suspending
+ * again.
+ */
+ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+ zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+ dprintf("resuming at %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+ }
+ }
+ return (B_FALSE);
+}
+
+static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+ dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+ dmu_objset_type_t ostype, dmu_tx_t *tx);
+inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
+ dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Return nonzero on i/o error.
+ */
+inline __attribute__((always_inline)) static int
+dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+ int err;
+
+ ASSERT(!BP_IS_REDACTED(bp));
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ arc_buf_t *buf;
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+ zbookmark_phys_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ dsl_scan_visitbp(cbp, &czb, dnp,
+ ds, scn, ostype, tx);
+ }
+ arc_buf_destroy(buf, &buf);
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ dnode_phys_t *cdnp;
+ int i;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+ arc_buf_t *buf;
+
+ if (BP_IS_PROTECTED(bp)) {
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
+ dsl_scan_visitdnode(scn, ds, ostype,
+ cdnp, zb->zb_blkid * epb + i, tx);
+ }
+
+ arc_buf_destroy(buf, &buf);
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ objset_phys_t *osp;
+ arc_buf_t *buf;
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+
+ osp = buf->b_data;
+
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
+
+ if (OBJSET_BUF_HAS_USERUSED(buf)) {
+ /*
+ * We also always visit user/group/project accounting
+ * objects, and never skip them, even if we are
+ * suspending. This is necessary so that the
+ * space deltas from this txg get integrated.
+ */
+ if (OBJSET_BUF_HAS_PROJECTUSED(buf))
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_projectused_dnode,
+ DMU_PROJECTUSED_OBJECT, tx);
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_groupused_dnode,
+ DMU_GROUPUSED_OBJECT, tx);
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_userused_dnode,
+ DMU_USERUSED_OBJECT, tx);
+ }
+ arc_buf_destroy(buf, &buf);
+ }
+
+ return (0);
+}
+
+inline __attribute__((always_inline)) static void
+dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+ dmu_objset_type_t ostype, dnode_phys_t *dnp,
+ uint64_t object, dmu_tx_t *tx)
+{
+ int j;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zbookmark_phys_t czb;
+
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ dnp->dn_nlevels - 1, j);
+ dsl_scan_visitbp(&dnp->dn_blkptr[j],
+ &czb, dnp, ds, scn, ostype, tx);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zbookmark_phys_t czb;
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ 0, DMU_SPILL_BLKID);
+ dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
+ &czb, dnp, ds, scn, ostype, tx);
+ }
+}
+
+/*
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+static void
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+ dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+ dmu_objset_type_t ostype, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ blkptr_t *bp_toread = NULL;
+
+ if (dsl_scan_check_suspend(scn, zb))
+ return;
+
+ if (dsl_scan_check_resume(scn, dnp, zb))
+ return;
+
+ scn->scn_visited_this_txg++;
+
+ /*
+ * This debugging is commented out to conserve stack space. This
+ * function is called recursively and the debugging adds several
+ * bytes to the stack for each call. It can be commented back in
+ * if required to debug an issue in dsl_scan_visitbp().
+ *
+ * dprintf_bp(bp,
+ * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
+ * ds, ds ? ds->ds_object : 0,
+ * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+ * bp);
+ */
+
+ if (BP_IS_HOLE(bp)) {
+ scn->scn_holes_this_txg++;
+ return;
+ }
+
+ if (BP_IS_REDACTED(bp)) {
+ ASSERT(dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_REDACTED_DATASETS));
+ return;
+ }
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
+ scn->scn_lt_min_this_txg++;
+ return;
+ }
+
+ bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ *bp_toread = *bp;
+
+ if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0)
+ goto out;
+
+ /*
+ * If dsl_scan_ddt() has already visited this block, it will have
+ * already done any translations or scrubbing, so don't call the
+ * callback again.
+ */
+ if (ddt_class_contains(dp->dp_spa,
+ scn->scn_phys.scn_ddt_class_max, bp)) {
+ scn->scn_ddt_contained_this_txg++;
+ goto out;
+ }
+
+ /*
+ * If this block is from the future (after cur_max_txg), then we
+ * are doing this on behalf of a deleted snapshot, and we will
+ * revisit the future block on the next pass of this dataset.
+ * Don't scan it now unless we need to because something
+ * under it was modified.
+ */
+ if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+ scn->scn_gt_max_this_txg++;
+ goto out;
+ }
+
+ scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+
+out:
+ kmem_free(bp_toread, sizeof (blkptr_t));
+}
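+
+ /*
+ * Taken together, the traversal above works roughly as follows:
+ * dsl_scan_visitbp() filters out holes, redacted blocks and blocks born
+ * before cur_min_txg, then dsl_scan_recurse() reads the block and
+ * recurses into indirect children, per-dnode block pointers
+ * (dsl_scan_visitdnode()) and the objset's meta/accounting dnodes, and
+ * finally the block itself is handed to scan_funcs[scn_func] unless the
+ * DDT pass already covered it or it was born after cur_max_txg.
+ */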
+
+static void
+dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_tx_t *tx)
+{
+ zbookmark_phys_t zb;
+ scan_prefetch_ctx_t *spc;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
+ SET_BOOKMARK(&scn->scn_prefetch_bookmark,
+ zb.zb_objset, 0, 0, 0);
+ } else {
+ scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
+ }
+
+ scn->scn_objsets_visited_this_txg++;
+
+ spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
+ dsl_scan_prefetch(spc, bp, &zb);
+ scan_prefetch_ctx_rele(spc, FTAG);
+
+ dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
+
+ dprintf_ds(ds, "finished scan%s", "");
+}
+
+static void
+ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
+{
+ if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
+ if (ds->ds_is_snapshot) {
+ /*
+ * Note:
+ * - scn_cur_{min,max}_txg stays the same.
+ * - Setting the flag is not really necessary if
+ * scn_cur_max_txg == scn_max_txg, because there
+ * is nothing after this snapshot that we care
+ * about. However, we set it anyway and then
+ * ignore it when we retraverse it in
+ * dsl_scan_visitds().
+ */
+ scn_phys->scn_bookmark.zb_objset =
+ dsl_dataset_phys(ds)->ds_next_snap_obj;
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->
+ ds_next_snap_obj);
+ scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
+ } else {
+ SET_BOOKMARK(&scn_phys->scn_bookmark,
+ ZB_DESTROYED_OBJSET, 0, 0, 0);
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset bookmark to -1,0,0,0",
+ (u_longlong_t)ds->ds_object);
+ }
+ }
+}
+
+/*
+ * Invoked when a dataset is destroyed. We need to make sure that:
+ *
+ * 1) If it is the dataset that was currently being scanned, we write
+ * a new dsl_scan_phys_t, marking the objset reference in it
+ * as destroyed.
+ * 2) Remove it from the work queue, if it was present.
+ *
+ * If the dataset was actually a snapshot, instead of marking the dataset
+ * as destroyed, we instead substitute the next snapshot in line.
+ */
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ds_destroyed_scn_phys(ds, &scn->scn_phys);
+ ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
+
+ if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+ scan_ds_queue_remove(scn, ds->ds_object);
+ if (ds->ds_is_snapshot)
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, &mintxg) == 0) {
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ if (ds->ds_is_snapshot) {
+ /*
+ * We keep the same mintxg; it could be >
+ * ds_creation_txg if the previous snapshot was
+ * deleted too.
+ */
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ mintxg, tx) == 0);
+ zfs_dbgmsg("destroying ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->
+ ds_next_snap_obj);
+ } else {
+ zfs_dbgmsg("destroying ds %llu; in queue; removing",
+ (u_longlong_t)ds->ds_object);
+ }
+ }
+
+ /*
+ * dsl_scan_sync() should be called after this, and should sync
+ * out our changed state, but just to be safe, do it here.
+ */
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
+static void
+ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
+{
+ if (scn_bookmark->zb_objset == ds->ds_object) {
+ scn_bookmark->zb_objset =
+ dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ }
+}
+
+/*
+ * Called when a dataset is snapshotted. If we were currently traversing
+ * this snapshot, we reset our bookmark to point at the newly created
+ * snapshot. We also modify our work queue to remove the old snapshot and
+ * replace with the new one.
+ */
+void
+dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
+
+ ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
+ ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
+
+ if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+ scan_ds_queue_remove(scn, ds->ds_object);
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, &mintxg) == 0) {
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
+ zfs_dbgmsg("snapshotting ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ }
+
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
+static void
+ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
+ zbookmark_phys_t *scn_bookmark)
+{
+ if (scn_bookmark->zb_objset == ds1->ds_object) {
+ scn_bookmark->zb_objset = ds2->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (scn_bookmark->zb_objset == ds2->ds_object) {
+ scn_bookmark->zb_objset = ds1->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+}
+
+/*
+ * Called when an origin dataset and its clone are swapped. If we were
+ * currently traversing the dataset, we need to switch to traversing the
+ * newly promoted clone.
+ */
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg1, mintxg2;
+ boolean_t ds1_queued, ds2_queued;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
+ ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
+
+ /*
+ * Handle the in-memory scan queue.
+ */
+ ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1);
+ ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2);
+
+ /* Sanity checking. */
+ if (ds1_queued) {
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+ if (ds2_queued) {
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+
+ if (ds1_queued && ds2_queued) {
+ /*
+ * If both are queued, we don't need to do anything.
+ * The swapping code below would not handle this case correctly,
+ * since we can't insert ds2 if it is already there. That's
+ * because scan_ds_queue_insert() prohibits a duplicate insert
+ * and panics.
+ */
+ } else if (ds1_queued) {
+ scan_ds_queue_remove(scn, ds1->ds_object);
+ scan_ds_queue_insert(scn, ds2->ds_object, mintxg1);
+ } else if (ds2_queued) {
+ scan_ds_queue_remove(scn, ds2->ds_object);
+ scan_ds_queue_insert(scn, ds1->ds_object, mintxg2);
+ }
+
+ /*
+ * Handle the on-disk scan queue.
+ * The on-disk state is an out-of-date version of the in-memory state,
+ * so the in-memory and on-disk values for ds1_queued and ds2_queued may
+ * be different. Therefore we need to apply the swap logic to the
+ * on-disk state independently of the in-memory state.
+ */
+ ds1_queued = zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0;
+ ds2_queued = zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0;
+
+ /* Sanity checking. */
+ if (ds1_queued) {
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+ if (ds2_queued) {
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+
+ if (ds1_queued && ds2_queued) {
+ /*
+ * If both are queued, we don't need to do anything.
+ * Alternatively, we could check for EEXIST from
+ * zap_add_int_key() and back out to the original state, but
+ * that would be more work than checking for this case upfront.
+ */
+ } else if (ds1_queued) {
+ VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+ VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx));
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (ds2_queued) {
+ VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+ VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx));
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
+/* ARGSUSED */
+static int
+enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+ uint64_t originobj = *(uint64_t *)arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
+ return (0);
+
+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+
+ dsl_dataset_rele(ds, FTAG);
+ if (err)
+ return (err);
+ ds = prev;
+ }
+ scan_ds_queue_insert(scn, ds->ds_object,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ dsl_dataset_t *ds;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+ if (scn->scn_phys.scn_cur_min_txg >=
+ scn->scn_phys.scn_max_txg) {
+ /*
+ * This can happen if this snapshot was created after the
+ * scan started, and we already completed a previous snapshot
+ * that was created after the scan started. This snapshot
+ * only references blocks with:
+ *
+ * birth < our ds_creation_txg
+ * cur_min_txg is no less than ds_creation_txg.
+ * We have already visited these blocks.
+ * or
+ * birth > scn_max_txg
+ * The scan requested not to visit these blocks.
+ *
+ * Subsequent snapshots (and clones) can reference our
+ * blocks, or blocks with even higher birth times.
+ * Therefore we do not need to visit them either,
+ * so we do not add them to the work queue.
+ *
+ * Note that checking for cur_min_txg >= cur_max_txg
+ * is not sufficient, because in that case we may need to
+ * visit subsequent snapshots. This happens when min_txg > 0,
+ * which raises cur_min_txg. In this case we will visit
+ * this dataset but skip all of its blocks, because the
+ * rootbp's birth time is < cur_min_txg. Then we will
+ * add the next snapshots/clones to the work queue.
+ */
+ char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
+ "cur_min_txg (%llu) >= max_txg (%llu)",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_max_txg);
+ kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ goto out;
+ }
+
+ /*
+ * Only the ZIL in the head (non-snapshot) is valid. Even though
+ * snapshots can have ZIL block pointers (which may be the same
+ * BP as in the head), they must be ignored. In addition, $ORIGIN
+ * doesn't have an objset (i.e. its ds_bp is a hole), so we don't
+ * need to look for a ZIL in it either. So we traverse the ZIL here,
+ * rather than in dsl_scan_recurse(), because the regular snapshot
+ * block-sharing rules don't apply to it.
+ */
+ if (!dsl_dataset_is_snapshot(ds) &&
+ (dp->dp_origin_snap == NULL ||
+ ds->ds_dir != dp->dp_origin_snap->ds_dir)) {
+ objset_t *os;
+ if (dmu_objset_from_ds(ds, &os) != 0) {
+ goto out;
+ }
+ dsl_scan_zil(dp, &os->os_zil_header);
+ }
+
+ /*
+ * Iterate over the bps in this ds.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+ "suspending=%u",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_cur_max_txg,
+ (int)scn->scn_suspending);
+ kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ if (scn->scn_suspending)
+ goto out;
+
+ /*
+ * We've finished this pass over this dataset.
+ */
+
+ /*
+ * If we did not completely visit this dataset, do another pass.
+ */
+ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+ zfs_dbgmsg("incomplete pass; visiting again");
+ scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+ scan_ds_queue_insert(scn, ds->ds_object,
+ scn->scn_phys.scn_cur_max_txg);
+ goto out;
+ }
+
+ /*
+ * Add descendant datasets to work queue.
+ */
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ dsl_dataset_phys(ds)->ds_creation_txg);
+ }
+ if (dsl_dataset_phys(ds)->ds_num_children > 1) {
+ boolean_t usenext = B_FALSE;
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ uint64_t count;
+ /*
+ * A bug in a previous version of the code could
+ * cause upgrade_clones_cb() to not set
+ * ds_next_snap_obj when it should, leading to a
+ * missing entry. Therefore we can only use the
+ * next_clones_obj when its count is correct.
+ */
+ int err = zap_count(dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
+ if (err == 0 &&
+ count == dsl_dataset_phys(ds)->ds_num_children - 1)
+ usenext = B_TRUE;
+ }
+
+ if (usenext) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ scan_ds_queue_insert(scn,
+ zfs_strtonum(za.za_name, NULL),
+ dsl_dataset_phys(ds)->ds_creation_txg);
+ }
+ zap_cursor_fini(&zc);
+ } else {
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ enqueue_clones_cb, &ds->ds_object,
+ DS_FIND_CHILDREN));
+ }
+ }
+
+out:
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/* ARGSUSED */
+static int
+enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+ dsl_dataset_t *ds;
+ int err;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ /*
+ * If this is a clone, we don't need to worry about it for now.
+ */
+ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ }
+
+ scan_ds_queue_insert(scn, ds->ds_object,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ const ddt_key_t *ddk = &dde->dde_key;
+ ddt_phys_t *ddp = dde->dde_phys;
+ blkptr_t bp;
+ zbookmark_phys_t zb = { 0 };
+ int p;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ /*
+ * This function is special because it is the only thing
+ * that can add scan_io_t's to the vdev scan queues from
+ * outside dsl_scan_sync(). For the most part this is ok
+ * as long as it is called from within syncing context.
+ * However, dsl_scan_sync() expects that no new sio's will
+ * be added between when all the work for a scan is done
+ * and the next txg when the scan is actually marked as
+ * completed. This check ensures we do not issue new sio's
+ * during this period.
+ */
+ if (scn->scn_done_txg != 0)
+ return;
+
+ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
+ continue;
+ ddt_bp_create(checksum, ddk, ddp, &bp);
+
+ scn->scn_visited_this_txg++;
+ scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+ }
+}
+
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * We leverage the fact that the dde's replication class (enum ddt_class)
+ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Since there are two replication classes which contain blocks with
+ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
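+/*
+ * For example, a block with refcnt == 3 is scrubbed exactly once while
+ * walking the DDT (its replication class contains blocks with refcnt > 1),
+ * and the subsequent top-down phase skips it, since that phase only visits
+ * blocks with refcnt == 1.
+ */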
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+ ddt_entry_t dde;
+ int error;
+ uint64_t n = 0;
+
+ bzero(&dde, sizeof (ddt_entry_t));
+
+ while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+ ddt_t *ddt;
+
+ if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+ break;
+ dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+ (longlong_t)ddb->ddb_class,
+ (longlong_t)ddb->ddb_type,
+ (longlong_t)ddb->ddb_checksum,
+ (longlong_t)ddb->ddb_cursor);
+
+ /* There should be no pending changes to the dedup table */
+ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+ ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+ dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+ n++;
+
+ if (dsl_scan_check_suspend(scn, NULL))
+ break;
+ }
+
+ zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; "
+ "suspending=%u", (longlong_t)n,
+ (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
+
+ ASSERT(error == 0 || error == ENOENT);
+ ASSERT(error != ENOENT ||
+ ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
+
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+ uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+ if (ds->ds_is_snapshot)
+ return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
+ return (smt);
+}
+
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ scan_ds_t *sds;
+ dsl_pool_t *dp = scn->scn_dp;
+
+ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ scn->scn_phys.scn_ddt_class_max) {
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_ddt(scn, tx);
+ if (scn->scn_suspending)
+ return;
+ }
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+ /* First do the MOS & ORIGIN */
+
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_visit_rootbp(scn, NULL,
+ &dp->dp_meta_rootbp, tx);
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ if (scn->scn_suspending)
+ return;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ enqueue_cb, NULL, DS_FIND_CHILDREN));
+ } else {
+ dsl_scan_visitds(scn,
+ dp->dp_origin_snap->ds_object, tx);
+ }
+ ASSERT(!scn->scn_suspending);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset !=
+ ZB_DESTROYED_OBJSET) {
+ uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
+ /*
+ * If we were suspended, continue from here. Note if the
+ * ds we were suspended on was deleted, the zb_objset may
+ * be -1, so we will skip this and find a new objset
+ * below.
+ */
+ dsl_scan_visitds(scn, dsobj, tx);
+ if (scn->scn_suspending)
+ return;
+ }
+
+ /*
+ * In case we suspended right at the end of the ds, zero the
+ * bookmark so we don't think that we're still trying to resume.
+ */
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
+
+ /*
+ * Keep pulling things out of the dataset avl queue. Updates to the
+ * persistent zap-object-as-queue happen only at checkpoints.
+ */
+ while ((sds = avl_first(&scn->scn_queue)) != NULL) {
+ dsl_dataset_t *ds;
+ uint64_t dsobj = sds->sds_dsobj;
+ uint64_t txg = sds->sds_txg;
+
+ /* dequeue and free the ds from the queue */
+ scan_ds_queue_remove(scn, dsobj);
+ sds = NULL;
+
+ /* set up min / max txg */
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ if (txg != 0) {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg, txg);
+ } else {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ }
+ scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+ dsl_dataset_rele(ds, FTAG);
+
+ dsl_scan_visitds(scn, dsobj, tx);
+ if (scn->scn_suspending)
+ return;
+ }
+
+ /* No more objsets to fetch, we're done */
+ scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
+ ASSERT0(scn->scn_suspending);
+}
+
+static uint64_t
+dsl_scan_count_leaves(vdev_t *vd)
+{
+ uint64_t i, leaves = 0;
+
+ /* we only count leaves that belong to the main pool and are readable */
+ if (vd->vdev_islog || vd->vdev_isspare ||
+ vd->vdev_isl2cache || !vdev_readable(vd))
+ return (0);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ return (1);
+
+ for (i = 0; i < vd->vdev_children; i++) {
+ leaves += dsl_scan_count_leaves(vd->vdev_child[i]);
+ }
+
+ return (leaves);
+}
+
+static void
+scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
+{
+ int i;
+ uint64_t cur_size = 0;
+
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]);
+ }
+
+ q->q_total_zio_size_this_txg += cur_size;
+ q->q_zios_this_txg++;
+}
+
+static void
+scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
+ uint64_t end)
+{
+ q->q_total_seg_size_this_txg += end - start;
+ q->q_segs_this_txg++;
+}
+
+static boolean_t
+scan_io_queue_check_suspend(dsl_scan_t *scn)
+{
+ /* See comment in dsl_scan_check_suspend() */
+ uint64_t curr_time_ns = gethrtime();
+ uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
+ uint64_t sync_time_ns = curr_time_ns -
+ scn->scn_dp->dp_spa->spa_sync_starttime;
+ int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
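+ /*
+ * For example, with the usual defaults (zfs_scrub_min_time_ms of
+ * 1000 ms, zfs_txg_timeout of 5 s), a scrub pass will not suspend
+ * voluntarily during its first second; after that it suspends as
+ * soon as dirty data crosses the async-write dirty threshold, a
+ * txg sync is waiting, or the sync has already run for
+ * zfs_txg_timeout seconds. A pool shutdown suspends it immediately.
+ */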
+ return ((NSEC2MSEC(scan_time_ns) > mintime &&
+ (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+ txg_sync_waiting(scn->scn_dp) ||
+ NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+/*
+ * Given a list of scan_io_t's in io_list, this issues the I/Os out to
+ * disk. This consumes the io_list and frees the scan_io_t's. This is
+ * called when emptying queues, either when we're up against the memory
+ * limit or when we have finished scanning. Returns B_TRUE if we stopped
+ * processing the list before we finished. Any sios that were not issued
+ * will remain in the io_list.
+ */
+static boolean_t
+scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio;
+ int64_t bytes_issued = 0;
+ boolean_t suspended = B_FALSE;
+
+ while ((sio = list_head(io_list)) != NULL) {
+ blkptr_t bp;
+
+ if (scan_io_queue_check_suspend(scn)) {
+ suspended = B_TRUE;
+ break;
+ }
+
+ sio2bp(sio, &bp);
+ bytes_issued += SIO_GET_ASIZE(sio);
+ scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
+ &sio->sio_zb, queue);
+ (void) list_remove_head(io_list);
+ scan_io_queues_update_zio_stats(queue, &bp);
+ sio_free(sio);
+ }
+
+ atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
+
+ return (suspended);
+}
+
+/*
+ * This function removes sios that reside within a given range_seg_t from
+ * an IO queue and inserts them (in offset order) into a list. Note that
+ * we only ever return a maximum of 32 sios at once. If there are more
+ * sios within this segment that did not make it onto the list, we return
+ * B_TRUE; otherwise, B_FALSE.
+ */
+static boolean_t
+scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
+{
+ scan_io_t *srch_sio, *sio, *next_sio;
+ avl_index_t idx;
+ uint_t num_sios = 0;
+ int64_t bytes_issued = 0;
+
+ ASSERT(rs != NULL);
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ srch_sio = sio_alloc(1);
+ srch_sio->sio_nr_dvas = 1;
+ SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr));
+
+ /*
+ * The exact start of the extent might not contain any matching zios,
+ * so if that's the case, examine the next one in the tree.
+ */
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
+ if (sio == NULL)
+ sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
+
+ while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
+ queue->q_exts_by_addr) && num_sios <= 32) {
+ ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs,
+ queue->q_exts_by_addr));
+ ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs,
+ queue->q_exts_by_addr));
+
+ next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
+ avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
+
+ bytes_issued += SIO_GET_ASIZE(sio);
+ num_sios++;
+ list_insert_tail(list, sio);
+ sio = next_sio;
+ }
+
+ /*
+ * We limit the number of sios we process at once to 32 to avoid
+ * biting off more than we can chew. If we didn't take everything
+ * in the segment we update it to reflect the work we were able to
+ * complete. Otherwise, we remove it from the range tree entirely.
+ */
+ if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
+ queue->q_exts_by_addr)) {
+ range_tree_adjust_fill(queue->q_exts_by_addr, rs,
+ -bytes_issued);
+ range_tree_resize_segment(queue->q_exts_by_addr, rs,
+ SIO_GET_OFFSET(sio), rs_get_end(rs,
+ queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
+
+ return (B_TRUE);
+ } else {
+ uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr);
+ uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr);
+ range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart);
+ return (B_FALSE);
+ }
+}
+
+/*
+ * This is called from the queue emptying thread and selects the next
+ * extent from which we are to issue I/Os. The behavior of this function
+ * depends on the state of the scan, the current memory consumption and
+ * whether or not we are performing a scan shutdown.
+ * 1) We select extents in an elevator algorithm (LBA-order) if the scan
+ * needs to perform a checkpoint
+ * 2) We select the largest available extent if we are up against the
+ * memory limit.
+ * 3) Otherwise we don't select any extents.
+ */
+static range_seg_t *
+scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ range_tree_t *rt = queue->q_exts_by_addr;
+
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+ ASSERT(scn->scn_is_sorted);
+
+ /* handle tunable overrides */
+ if (scn->scn_checkpointing || scn->scn_clearing) {
+ if (zfs_scan_issue_strategy == 1) {
+ return (range_tree_first(rt));
+ } else if (zfs_scan_issue_strategy == 2) {
+ /*
+ * We need to get the original entry in the by_addr
+ * tree so we can modify it.
+ */
+ range_seg_t *size_rs =
+ zfs_btree_first(&queue->q_exts_by_size, NULL);
+ if (size_rs == NULL)
+ return (NULL);
+ uint64_t start = rs_get_start(size_rs, rt);
+ uint64_t size = rs_get_end(size_rs, rt) - start;
+ range_seg_t *addr_rs = range_tree_find(rt, start,
+ size);
+ ASSERT3P(addr_rs, !=, NULL);
+ ASSERT3U(rs_get_start(size_rs, rt), ==,
+ rs_get_start(addr_rs, rt));
+ ASSERT3U(rs_get_end(size_rs, rt), ==,
+ rs_get_end(addr_rs, rt));
+ return (addr_rs);
+ }
+ }
+
+ /*
+ * During normal clearing, we want to issue our largest segments
+ * first, keeping IO as sequential as possible, and leaving the
+ * smaller extents for later with the hope that they might eventually
+ * grow to larger sequential segments. However, when the scan is
+ * checkpointing, no new extents will be added to the sorting queue,
+ * so the way we are sorted now is as good as it will ever get.
+ * In this case, we instead switch to issuing extents in LBA order.
+ */
+ if (scn->scn_checkpointing) {
+ return (range_tree_first(rt));
+ } else if (scn->scn_clearing) {
+ /*
+ * We need to get the original entry in the by_addr
+ * tree so we can modify it.
+ */
+ range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size,
+ NULL);
+ if (size_rs == NULL)
+ return (NULL);
+ uint64_t start = rs_get_start(size_rs, rt);
+ uint64_t size = rs_get_end(size_rs, rt) - start;
+ range_seg_t *addr_rs = range_tree_find(rt, start, size);
+ ASSERT3P(addr_rs, !=, NULL);
+ ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs,
+ rt));
+ ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt));
+ return (addr_rs);
+ } else {
+ return (NULL);
+ }
+}
+
+static void
+scan_io_queues_run_one(void *arg)
+{
+ dsl_scan_io_queue_t *queue = arg;
+ kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
+ boolean_t suspended = B_FALSE;
+ range_seg_t *rs = NULL;
+ scan_io_t *sio = NULL;
+ list_t sio_list;
+ uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+ uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd);
+
+ ASSERT(queue->q_scn->scn_is_sorted);
+
+ list_create(&sio_list, sizeof (scan_io_t),
+ offsetof(scan_io_t, sio_nodes.sio_list_node));
+ mutex_enter(q_lock);
+
+ /* calculate maximum in-flight bytes for this txg (min 1MB) */
+ queue->q_maxinflight_bytes =
+ MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+
+ /* reset per-queue scan statistics for this txg */
+ queue->q_total_seg_size_this_txg = 0;
+ queue->q_segs_this_txg = 0;
+ queue->q_total_zio_size_this_txg = 0;
+ queue->q_zios_this_txg = 0;
+
+ /* loop until we run out of time or sios */
+ while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) {
+ uint64_t seg_start = 0, seg_end = 0;
+ boolean_t more_left = B_TRUE;
+
+ ASSERT(list_is_empty(&sio_list));
+
+ /* loop while we still have sios left to process in this rs */
+ while (more_left) {
+ scan_io_t *first_sio, *last_sio;
+
+ /*
+ * We have selected which extent needs to be
+ * processed next. Gather up the corresponding sios.
+ */
+ more_left = scan_io_queue_gather(queue, rs, &sio_list);
+ ASSERT(!list_is_empty(&sio_list));
+ first_sio = list_head(&sio_list);
+ last_sio = list_tail(&sio_list);
+
+ seg_end = SIO_GET_END_OFFSET(last_sio);
+ if (seg_start == 0)
+ seg_start = SIO_GET_OFFSET(first_sio);
+
+ /*
+ * Issuing sios can take a long time so drop the
+ * queue lock. The sio queue won't be updated by
+ * other threads since we're in syncing context so
+ * we can be sure that our trees will remain exactly
+ * as we left them.
+ */
+ mutex_exit(q_lock);
+ suspended = scan_io_queue_issue(queue, &sio_list);
+ mutex_enter(q_lock);
+
+ if (suspended)
+ break;
+ }
+
+ /* update statistics for debugging purposes */
+ scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
+
+ if (suspended)
+ break;
+ }
+
+ /*
+ * If we were suspended in the middle of processing,
+ * requeue any unfinished sios and exit.
+ */
+ while ((sio = list_head(&sio_list)) != NULL) {
+ list_remove(&sio_list, sio);
+ scan_io_queue_insert_impl(queue, sio);
+ }
+
+ mutex_exit(q_lock);
+ list_destroy(&sio_list);
+}
+
+/*
+ * Performs an emptying run on all scan queues in the pool. This just
+ * punches out one thread per top-level vdev, each of which processes
+ * only that vdev's scan queue. We can parallelize the I/O here because
+ * we know that each queue's I/Os only affect its own top-level vdev.
+ *
+ * This function waits for the queue runs to complete, and must be
+ * called from dsl_scan_sync (or in general, syncing context).
+ */
+static void
+scan_io_queues_run(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ ASSERT(scn->scn_is_sorted);
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ if (scn->scn_bytes_pending == 0)
+ return;
+
+ if (scn->scn_taskq == NULL) {
+ int nthreads = spa->spa_root_vdev->vdev_children;
+
+ /*
+ * We need to make this taskq *always* execute as many
+ * threads in parallel as we have top-level vdevs and no
+ * fewer; otherwise, calls to scan_io_queues_run_one can be
+ * strangely serialized during spa_sync runs, which
+ * significantly impacts performance.
+ */
+ scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads,
+ minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE);
+ }
+
+ for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ if (vd->vdev_scan_io_queue != NULL) {
+ VERIFY(taskq_dispatch(scn->scn_taskq,
+ scan_io_queues_run_one, vd->vdev_scan_io_queue,
+ TQ_SLEEP) != TASKQID_INVALID);
+ }
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ /*
+ * Wait for the queues to finish issuing their IOs for this run
+ * before we return. There may still be IOs in flight at this
+ * point.
+ */
+ taskq_wait(scn->scn_taskq);
+}
+
+static boolean_t
+dsl_scan_async_block_should_pause(dsl_scan_t *scn)
+{
+ uint64_t elapsed_nanosecs;
+
+ if (zfs_recover)
+ return (B_FALSE);
+
+ if (zfs_async_block_max_blocks != 0 &&
+ scn->scn_visited_this_txg >= zfs_async_block_max_blocks) {
+ return (B_TRUE);
+ }
+
+ if (zfs_max_async_dedup_frees != 0 &&
+ scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) {
+ return (B_TRUE);
+ }
+
+ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+ return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms &&
+ txg_sync_waiting(scn->scn_dp)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+static int
+dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg;
+
+ if (!scn->scn_is_bptree ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
+ if (dsl_scan_async_block_should_pause(scn))
+ return (SET_ERROR(ERESTART));
+ }
+
+ zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ dmu_tx_get_txg(tx), bp, 0));
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+ -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+ -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+ scn->scn_visited_this_txg++;
+ if (BP_GET_DEDUP(bp))
+ scn->scn_dedup_frees_this_txg++;
+ return (0);
+}
+
+static void
+dsl_scan_update_stats(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ uint64_t i;
+ uint64_t seg_size_total = 0, zio_size_total = 0;
+ uint64_t seg_count_total = 0, zio_count_total = 0;
+
+ for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+ dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue;
+
+ if (queue == NULL)
+ continue;
+
+ seg_size_total += queue->q_total_seg_size_this_txg;
+ zio_size_total += queue->q_total_zio_size_this_txg;
+ seg_count_total += queue->q_segs_this_txg;
+ zio_count_total += queue->q_zios_this_txg;
+ }
+
+ if (seg_count_total == 0 || zio_count_total == 0) {
+ scn->scn_avg_seg_size_this_txg = 0;
+ scn->scn_avg_zio_size_this_txg = 0;
+ scn->scn_segs_this_txg = 0;
+ scn->scn_zios_this_txg = 0;
+ return;
+ }
+
+ scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total;
+ scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total;
+ scn->scn_segs_this_txg = seg_count_total;
+ scn->scn_zios_this_txg = zio_count_total;
+}
+
+static int
+bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ ASSERT(!bp_freed);
+ return (dsl_scan_free_block_cb(arg, bp, tx));
+}
+
+static int
+dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ ASSERT(!bp_freed);
+ dsl_scan_t *scn = arg;
+ const dva_t *dva = &bp->blk_dva[0];
+
+ if (dsl_scan_async_block_should_pause(scn))
+ return (SET_ERROR(ERESTART));
+
+ spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa,
+ DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva),
+ DVA_GET_ASIZE(dva), tx);
+ scn->scn_visited_this_txg++;
+ return (0);
+}
+
+boolean_t
+dsl_scan_active(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ uint64_t used = 0, comp, uncomp;
+ boolean_t clones_left;
+
+ if (spa->spa_load_state != SPA_LOAD_NONE)
+ return (B_FALSE);
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+ if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) ||
+ (scn->scn_async_destroying && !scn->scn_async_stalled))
+ return (B_TRUE);
+
+ if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+ &used, &comp, &uncomp);
+ }
+ clones_left = spa_livelist_delete_check(spa);
+ return ((used != 0) || (clones_left));
+}
+
+static boolean_t
+dsl_scan_check_deferred(vdev_t *vd)
+{
+ boolean_t need_resilver = B_FALSE;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ need_resilver |=
+ dsl_scan_check_deferred(vd->vdev_child[c]);
+ }
+
+ if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+ !vd->vdev_ops->vdev_op_leaf)
+ return (need_resilver);
+
+ if (!vd->vdev_resilver_deferred)
+ need_resilver = B_TRUE;
+
+ return (need_resilver);
+}
+
+static boolean_t
+dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ vdev_t *vd;
+
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * The indirect vdev can point to multiple
+ * vdevs. For simplicity, always create
+ * the resilver zio_t. zio_vdev_io_start()
+ * will bypass the child resilver i/o's if
+ * they are on vdevs that don't have DTL's.
+ */
+ return (B_TRUE);
+ }
+
+ if (DVA_GET_GANG(dva)) {
+ /*
+ * Gang members may be spread across multiple
+ * vdevs, so the best estimate we have is the
+ * scrub range, which has already been checked.
+ * XXX -- it would be better to change our
+ * allocation policy to ensure that all
+ * gang members reside on the same vdev.
+ */
+ return (B_TRUE);
+ }
+
+ /*
+ * Check if the top-level vdev must resilver this offset.
+ * When the offset does not intersect with a dirty leaf DTL
+ * then it may be possible to skip the resilver IO. The psize
+ * is provided instead of asize to simplify the check for RAIDZ.
+ */
+ if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth))
+ return (B_FALSE);
+
+ /*
+ * Check that this top-level vdev has a device under it which
+ * is resilvering and is not deferred.
+ */
+ if (!dsl_scan_check_deferred(vd))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+static int
+dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ int err = 0;
+
+ if (spa_suspend_async_destroy(spa))
+ return (0);
+
+ if (zfs_free_bpobj_enabled &&
+ spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ scn->scn_is_bptree = B_FALSE;
+ scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
+ scn->scn_zio_root = zio_root(spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bpobj_iterate(&dp->dp_free_bpobj,
+ bpobj_dsl_scan_free_block_cb, scn, tx);
+ VERIFY0(zio_wait(scn->scn_zio_root));
+ scn->scn_zio_root = NULL;
+
+ if (err != 0 && err != ERESTART)
+ zfs_panic_recover("error %u from bpobj_iterate()", err);
+ }
+
+ if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+ ASSERT(scn->scn_async_destroying);
+ scn->scn_is_bptree = B_TRUE;
+ scn->scn_zio_root = zio_root(spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bptree_iterate(dp->dp_meta_objset,
+ dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
+ VERIFY0(zio_wait(scn->scn_zio_root));
+ scn->scn_zio_root = NULL;
+
+ if (err == EIO || err == ECKSUM) {
+ err = 0;
+ } else if (err != 0 && err != ERESTART) {
+ zfs_panic_recover("error %u from "
+ "traverse_dataset_destroyed()", err);
+ }
+
+ if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
+ /* finished; deactivate async destroy feature */
+ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
+ ASSERT(!spa_feature_is_active(spa,
+ SPA_FEATURE_ASYNC_DESTROY));
+ VERIFY0(zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, tx));
+ VERIFY0(bptree_free(dp->dp_meta_objset,
+ dp->dp_bptree_obj, tx));
+ dp->dp_bptree_obj = 0;
+ scn->scn_async_destroying = B_FALSE;
+ scn->scn_async_stalled = B_FALSE;
+ } else {
+ /*
+ * If we didn't make progress, mark the async
+ * destroy as stalled, so that we will not initiate
+ * a spa_sync() on its behalf. Note that we only
+ * check this if we are not finished, because if the
+ * bptree had no blocks for us to visit, we can
+ * finish without "making progress".
+ */
+ scn->scn_async_stalled =
+ (scn->scn_visited_this_txg == 0);
+ }
+ }
+ if (scn->scn_visited_this_txg) {
+ zfs_dbgmsg("freed %llu blocks in %llums from "
+ "free_bpobj/bptree txg %llu; err=%u",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)
+ NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
+ (longlong_t)tx->tx_txg, err);
+ scn->scn_visited_this_txg = 0;
+ scn->scn_dedup_frees_this_txg = 0;
+
+ /*
+ * Write out changes to the DDT that may be required as a
+ * result of the blocks freed. This ensures that the DDT
+ * is clean when a scrub/resilver runs.
+ */
+ ddt_sync(spa, tx->tx_txg);
+ }
+ if (err != 0)
+ return (err);
+ if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
+ zfs_free_leak_on_eio &&
+ (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
+ dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
+ dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
+ /*
+ * We have finished background destroying, but there is still
+ * some space left in the dp_free_dir. Transfer this leaked
+ * space to the dp_leak_dir.
+ */
+ if (dp->dp_leak_dir == NULL) {
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ LEAK_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ LEAK_DIR_NAME, &dp->dp_leak_dir));
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ }
+ dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
+ dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+ dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+ dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+ }
+
+ if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
+ !spa_livelist_delete_check(spa)) {
+ /* finished; verify that space accounting went to zero */
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
+ }
+
+ spa_notify_waiters(spa);
+
+ EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
+ 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ));
+ if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_OBSOLETE_COUNTS));
+
+ scn->scn_is_bptree = B_FALSE;
+ scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
+ err = bpobj_iterate(&dp->dp_obsolete_bpobj,
+ dsl_scan_obsolete_block_cb, scn, tx);
+ if (err != 0 && err != ERESTART)
+ zfs_panic_recover("error %u from bpobj_iterate()", err);
+
+ if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
+ dsl_pool_destroy_obsolete_bpobj(dp, tx);
+ }
+ return (0);
+}
+
+/*
+ * This is the primary entry point for scans; it is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * can guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ int err = 0;
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ state_sync_type_t sync_type = SYNC_OPTIONAL;
+
+ if (spa->spa_resilver_deferred &&
+ !spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
+ spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+
+ /*
+ * Check for scn_restart_txg before checking spa_load_state, so
+ * that we can restart an old-style scan while the pool is being
+ * imported (see dsl_scan_init). We also restart scans if there
+ * is a deferred resilver and the user has manually disabled
+ * deferred resilvers via the tunable.
+ */
+ if (dsl_scan_restarting(scn, tx) ||
+ (spa->spa_resilver_deferred && zfs_resilver_disable_defer)) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_done(scn, B_FALSE, tx);
+ if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ func = POOL_SCAN_RESILVER;
+ zfs_dbgmsg("restarting scan func=%u txg=%llu",
+ func, (longlong_t)tx->tx_txg);
+ dsl_scan_setup_sync(&func, tx);
+ }
+
+ /*
+ * Only process scans in sync pass 1.
+ */
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ /*
+ * If the spa is shutting down, then stop scanning. This will
+ * ensure that the scan does not dirty any new data during the
+ * shutdown phase.
+ */
+ if (spa_shutting_down(spa))
+ return;
+
+ /*
+ * If the scan is inactive due to a stalled async destroy, try again.
+ */
+ if (!scn->scn_async_stalled && !dsl_scan_active(scn))
+ return;
+
+ /* reset scan statistics */
+ scn->scn_visited_this_txg = 0;
+ scn->scn_dedup_frees_this_txg = 0;
+ scn->scn_holes_this_txg = 0;
+ scn->scn_lt_min_this_txg = 0;
+ scn->scn_gt_max_this_txg = 0;
+ scn->scn_ddt_contained_this_txg = 0;
+ scn->scn_objsets_visited_this_txg = 0;
+ scn->scn_avg_seg_size_this_txg = 0;
+ scn->scn_segs_this_txg = 0;
+ scn->scn_avg_zio_size_this_txg = 0;
+ scn->scn_zios_this_txg = 0;
+ scn->scn_suspending = B_FALSE;
+ scn->scn_sync_start_time = gethrtime();
+ spa->spa_scrub_active = B_TRUE;
+
+ /*
+ * First process the async destroys. If we suspend, don't do
+ * any scrubbing or resilvering. This ensures that there are no
+ * async destroys while we are scanning, so the scan code doesn't
+ * have to worry about traversing it. It is also faster to free the
+ * blocks than to scrub them.
+ */
+ err = dsl_process_async_destroys(dp, tx);
+ if (err != 0)
+ return;
+
+ if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
+ return;
+
+ /*
+ * Wait a few txgs after importing before beginning to scan, so
+ * that the pool import can complete quickly.
+ */
+ if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
+ return;
+
+ /*
+ * zfs_scan_suspend_progress can be set to disable scan progress.
+ * We don't want to spin the txg_sync thread, so we add a delay
+ * here to simulate the time spent doing a scan. This is mostly
+ * useful for testing and debugging.
+ */
+ if (zfs_scan_suspend_progress) {
+ uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+ while (zfs_scan_suspend_progress &&
+ !txg_sync_waiting(scn->scn_dp) &&
+ !spa_shutting_down(scn->scn_dp->dp_spa) &&
+ NSEC2MSEC(scan_time_ns) < mintime) {
+ delay(hz);
+ scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+ }
+ return;
+ }
+
+ /*
+ * It is possible to switch from unsorted to sorted at any time,
+ * but afterwards the scan will remain sorted unless reloaded from
+ * a checkpoint after a reboot.
+ */
+ if (!zfs_scan_legacy) {
+ scn->scn_is_sorted = B_TRUE;
+ if (scn->scn_last_checkpoint == 0)
+ scn->scn_last_checkpoint = ddi_get_lbolt();
+ }
+
+ /*
+ * For sorted scans, determine what kind of work we will be doing
+ * this txg based on our memory limitations and whether or not we
+ * need to perform a checkpoint.
+ */
+ if (scn->scn_is_sorted) {
+ /*
+ * If we are over our checkpoint interval, set scn_clearing
+ * so that we can begin checkpointing immediately. The
+ * checkpoint allows us to save a consistent bookmark
+ * representing how much data we have scrubbed so far.
+ * Otherwise, use the memory limit to determine if we should
+ * scan for metadata or start issuing scrub IOs. We accumulate
+ * metadata until we hit our hard memory limit at which point
+ * we issue scrub IOs until we are at our soft memory limit.
+ */
+ if (scn->scn_checkpointing ||
+ ddi_get_lbolt() - scn->scn_last_checkpoint >
+ SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
+ if (!scn->scn_checkpointing)
+ zfs_dbgmsg("begin scan checkpoint");
+
+ scn->scn_checkpointing = B_TRUE;
+ scn->scn_clearing = B_TRUE;
+ } else {
+ boolean_t should_clear = dsl_scan_should_clear(scn);
+ if (should_clear && !scn->scn_clearing) {
+ zfs_dbgmsg("begin scan clearing");
+ scn->scn_clearing = B_TRUE;
+ } else if (!should_clear && scn->scn_clearing) {
+ zfs_dbgmsg("finish scan clearing");
+ scn->scn_clearing = B_FALSE;
+ }
+ }
+ } else {
+ ASSERT0(scn->scn_checkpointing);
+ ASSERT0(scn->scn_clearing);
+ }
+
+ if (!scn->scn_clearing && scn->scn_done_txg == 0) {
+ /* Need to scan metadata for more blocks to scrub */
+ dsl_scan_phys_t *scnp = &scn->scn_phys;
+ taskqid_t prefetch_tqid;
+ uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+ uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);
+
+ /*
+ * Recalculate the max number of in-flight bytes for pool-wide
+ * scanning operations (minimum 1MB). Limits for the issuing
+ * phase are done per top-level vdev and are handled separately.
+ */
+ scn->scn_maxinflight_bytes =
+ MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
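+ /*
+ * For example, assuming the default zfs_scan_vdev_limit of 4 MiB,
+ * a pool with 16 readable leaf vdevs may have up to 64 MiB of scan
+ * I/O in flight during this metadata phase; the issuing phase
+ * applies the same per-leaf budget, but per top-level vdev.
+ */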
+
+ if (scnp->scn_ddt_bookmark.ddb_class <=
+ scnp->scn_ddt_class_max) {
+ ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
+ zfs_dbgmsg("doing scan sync txg %llu; "
+ "ddt bm=%llu/%llu/%llu/%llx",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
+ } else {
+ zfs_dbgmsg("doing scan sync txg %llu; "
+ "bm=%llu/%llu/%llu/%llu",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scnp->scn_bookmark.zb_objset,
+ (longlong_t)scnp->scn_bookmark.zb_object,
+ (longlong_t)scnp->scn_bookmark.zb_level,
+ (longlong_t)scnp->scn_bookmark.zb_blkid);
+ }
+
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
+
+ scn->scn_prefetch_stop = B_FALSE;
+ prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
+ dsl_scan_prefetch_thread, scn, TQ_SLEEP);
+ ASSERT(prefetch_tqid != TASKQID_INVALID);
+
+ dsl_pool_config_enter(dp, FTAG);
+ dsl_scan_visit(scn, tx);
+ dsl_pool_config_exit(dp, FTAG);
+
+ mutex_enter(&dp->dp_spa->spa_scrub_lock);
+ scn->scn_prefetch_stop = B_TRUE;
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&dp->dp_spa->spa_scrub_lock);
+
+ taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ zfs_dbgmsg("scan visited %llu blocks in %llums "
+ "(%llu os's, %llu holes, %llu < mintxg, "
+ "%llu in ddt, %llu > maxtxg)",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)NSEC2MSEC(gethrtime() -
+ scn->scn_sync_start_time),
+ (longlong_t)scn->scn_objsets_visited_this_txg,
+ (longlong_t)scn->scn_holes_this_txg,
+ (longlong_t)scn->scn_lt_min_this_txg,
+ (longlong_t)scn->scn_ddt_contained_this_txg,
+ (longlong_t)scn->scn_gt_max_this_txg);
+
+ if (!scn->scn_suspending) {
+ ASSERT0(avl_numnodes(&scn->scn_queue));
+ scn->scn_done_txg = tx->tx_txg + 1;
+ if (scn->scn_is_sorted) {
+ scn->scn_checkpointing = B_TRUE;
+ scn->scn_clearing = B_TRUE;
+ }
+ zfs_dbgmsg("scan complete txg %llu",
+ (longlong_t)tx->tx_txg);
+ }
+ } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+ ASSERT(scn->scn_clearing);
+
+ /* need to issue scrubbing IOs from per-vdev queues */
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
+ scan_io_queues_run(scn);
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ /* calculate and dprintf the current memory usage */
+ (void) dsl_scan_should_clear(scn);
+ dsl_scan_update_stats(scn);
+
+ zfs_dbgmsg("scan issued %llu blocks (%llu segs) in %llums "
+ "(avg_block_size = %llu, avg_seg_size = %llu)",
+ (longlong_t)scn->scn_zios_this_txg,
+ (longlong_t)scn->scn_segs_this_txg,
+ (longlong_t)NSEC2MSEC(gethrtime() -
+ scn->scn_sync_start_time),
+ (longlong_t)scn->scn_avg_zio_size_this_txg,
+ (longlong_t)scn->scn_avg_seg_size_this_txg);
+ } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
+ /* Finished with everything. Mark the scrub as complete */
+ zfs_dbgmsg("scan issuing complete txg %llu",
+ (longlong_t)tx->tx_txg);
+ ASSERT3U(scn->scn_done_txg, !=, 0);
+ ASSERT0(spa->spa_scrub_inflight);
+ ASSERT0(scn->scn_bytes_pending);
+ dsl_scan_done(scn, B_TRUE, tx);
+ sync_type = SYNC_MANDATORY;
+ }
+
+ dsl_scan_sync_state(scn, tx, sync_type);
+}
+
+static void
+count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+ int i;
+
+ /*
+ * Don't count embedded bp's, since we already did the work of
+ * scanning these when we scanned the containing block.
+ */
+ if (BP_IS_EMBEDDED(bp))
+ return;
+
+ /*
+ * Update the spa's stats on how many bytes we have issued.
+ * Sequential scrubs create a zio for each DVA of the bp. Each
+ * of these will include all DVAs for repair purposes, but the
+ * zio code will only try the first one unless there is an issue.
+ * Therefore, we should only count the first DVA for these IOs.
+ */
+ if (scn->scn_is_sorted) {
+ atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
+ DVA_GET_ASIZE(&bp->blk_dva[0]));
+ } else {
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ atomic_add_64(&spa->spa_scan_pass_issued,
+ DVA_GET_ASIZE(&bp->blk_dva[i]));
+ }
+ }
+
+ /*
+ * If we resume after a reboot, zab will be NULL; don't record
+ * incomplete stats in that case.
+ */
+ if (zab == NULL)
+ return;
+
+ mutex_enter(&zab->zab_lock);
+
+ for (i = 0; i < 4; i++) {
+ int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+ int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+
+ if (t & DMU_OT_NEWTYPE)
+ t = DMU_OT_OTHER;
+ zfs_blkstat_t *zb = &zab->zab_type[l][t];
+ int equal;
+
+ zb->zb_count++;
+ zb->zb_asize += BP_GET_ASIZE(bp);
+ zb->zb_lsize += BP_GET_LSIZE(bp);
+ zb->zb_psize += BP_GET_PSIZE(bp);
+ zb->zb_gangs += BP_COUNT_GANG(bp);
+
+ switch (BP_GET_NDVAS(bp)) {
+ case 2:
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1]))
+ zb->zb_ditto_2_of_2_samevdev++;
+ break;
+ case 3:
+ equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) +
+ (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2])) +
+ (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]));
+ if (equal == 1)
+ zb->zb_ditto_2_of_3_samevdev++;
+ else if (equal == 3)
+ zb->zb_ditto_3_of_3_samevdev++;
+ break;
+ }
+ }
+
+ mutex_exit(&zab->zab_lock);
+}
+
+static void
+scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
+{
+ avl_index_t idx;
+ int64_t asize = SIO_GET_ASIZE(sio);
+ dsl_scan_t *scn = queue->q_scn;
+
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
+ /* block is already scheduled for reading */
+ atomic_add_64(&scn->scn_bytes_pending, -asize);
+ sio_free(sio);
+ return;
+ }
+ avl_insert(&queue->q_sios_by_addr, sio, idx);
+ queue->q_sio_memused += SIO_GET_MUSED(sio);
+ range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
+}
+
+/*
+ * Given all the info we got from our metadata scanning process, we
+ * construct a scan_io_t and insert it into the scan sorting queue. The
+ * I/O must already be suitable for us to process. This is controlled
+ * by dsl_scan_enqueue().
+ */
+static void
+scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
+ int zio_flags, const zbookmark_phys_t *zb)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
+
+ ASSERT0(BP_IS_GANG(bp));
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ bp2sio(bp, sio, dva_i);
+ sio->sio_flags = zio_flags;
+ sio->sio_zb = *zb;
+
+ /*
+ * Increment the bytes pending counter now so that we cannot
+ * get an integer underflow if the worker processes the zio
+ * before we would otherwise have incremented the counter.
+ */
+ atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
+
+ scan_io_queue_insert_impl(queue, sio);
+}
+
+/*
+ * Given a set of I/O parameters as discovered by the metadata traversal
+ * process, attempts to place the I/O into the sorted queues (if allowed),
+ * or immediately executes the I/O.
+ */
+static void
+dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb)
+{
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ /*
+ * Gang blocks are hard to issue sequentially, so we just issue them
+ * here immediately instead of queuing them.
+ */
+ if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
+ scan_exec_io(dp, bp, zio_flags, zb, NULL);
+ return;
+ }
+
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+ dva_t dva;
+ vdev_t *vdev;
+
+ dva = bp->blk_dva[i];
+ vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
+ ASSERT(vdev != NULL);
+
+ mutex_enter(&vdev->vdev_scan_io_queue_lock);
+ if (vdev->vdev_scan_io_queue == NULL)
+ vdev->vdev_scan_io_queue = scan_io_queue_create(vdev);
+ ASSERT(dp->dp_scan != NULL);
+ scan_io_queue_insert(vdev->vdev_scan_io_queue, bp,
+ i, zio_flags, zb);
+ mutex_exit(&vdev->vdev_scan_io_queue_lock);
+ }
+}
+
+static int
+dsl_scan_scrub_cb(dsl_pool_t *dp,
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+ size_t psize = BP_GET_PSIZE(bp);
+ boolean_t needs_io = B_FALSE;
+ int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+
+
+ if (phys_birth <= scn->scn_phys.scn_min_txg ||
+ phys_birth >= scn->scn_phys.scn_max_txg) {
+ count_block(scn, dp->dp_blkstats, bp);
+ return (0);
+ }
+
+ /* Embedded BP's have phys_birth==0, so we reject them above. */
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+ if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+ zio_flags |= ZIO_FLAG_SCRUB;
+ needs_io = B_TRUE;
+ } else {
+ ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
+ zio_flags |= ZIO_FLAG_RESILVER;
+ needs_io = B_FALSE;
+ }
+
+ /* If it's an intent log block, failure is expected. */
+ if (zb->zb_level == ZB_ZIL_LEVEL)
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
+ const dva_t *dva = &bp->blk_dva[d];
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(8) status can make useful progress reports.
+ */
+ scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
+ spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
+
+ /* if it's a resilver, this may not be in the target range */
+ if (!needs_io)
+ needs_io = dsl_scan_need_resilver(spa, dva, psize,
+ phys_birth);
+ }
+
+ if (needs_io && !zfs_no_scrub_io) {
+ dsl_scan_enqueue(dp, bp, zio_flags, zb);
+ } else {
+ count_block(scn, dp->dp_blkstats, bp);
+ }
+
+ /* do not relocate this block */
+ return (0);
+}
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ dsl_scan_io_queue_t *queue = zio->io_private;
+
+ abd_free(zio->io_abd);
+
+ if (queue == NULL) {
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+ spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+ } else {
+ mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
+ ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
+ queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
+ cv_broadcast(&queue->q_zio_cv);
+ mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
+ }
+
+ if (zio->io_error && (zio->io_error != ECKSUM ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+ atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
+ }
+}
+
+/*
+ * Given a scanning zio's information, executes the zio. The zio need
+ * not necessarily be sortable; this function simply executes the
+ * zio, no matter what it is. The optional queue argument allows the
+ * caller to specify that they want per top level vdev IO rate limiting
+ * instead of the legacy global limiting.
+ */
+static void
+scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
+{
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+ size_t size = BP_GET_PSIZE(bp);
+ abd_t *data = abd_alloc_for_io(size, B_FALSE);
+
+ ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
+
+ if (queue == NULL) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
+ mutex_exit(&spa->spa_scrub_lock);
+ } else {
+ kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
+
+ mutex_enter(q_lock);
+ while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
+ cv_wait(&queue->q_zio_cv, q_lock);
+ queue->q_inflight_bytes += BP_GET_PSIZE(bp);
+ mutex_exit(q_lock);
+ }
+
+ count_block(scn, dp->dp_blkstats, bp);
+ zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size,
+ dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+}
+
+/*
+ * This is the primary extent sorting algorithm. We balance two parameters:
+ * 1) how many bytes of I/O are in an extent
+ * 2) how well the extent is filled with I/O (as a fraction of its total size)
+ * Since we allow extents to have gaps between their constituent I/Os, it's
+ * possible to have a fairly large extent that contains the same amount of
+ * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
+ * The algorithm sorts based on a score calculated from the extent's size,
+ * the relative fill volume (in %) and a "fill weight" parameter that controls
+ * the split between whether we prefer larger extents or more well populated
+ * extents:
+ *
+ * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
+ *
+ * Example:
+ * 1) assume extsz = 64 MiB
+ * 2) assume fill = 32 MiB (extent is half full)
+ * 3) assume fill_weight = 3
+ * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
+ * SCORE = 32M + (50 * 3 * 32M) / 100
+ * SCORE = 32M + (4800M / 100)
+ * SCORE = 32M + 48M
+ * ^ ^
+ * | +--- final total relative fill-based score
+ * +--------- final total fill-based score
+ * SCORE = 80M
+ *
+ * As can be seen, at fill_weight=3, the algorithm is slightly biased towards
+ * extents that are more completely filled (in a 3:2 ratio) vs just larger.
+ * Note that as an optimization, we replace multiplication and division by
+ * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
+ */
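+/*
+ * Working the same example through the shifted form used below
+ * (fill = 32M, extsz = 64M, fill_weight = 3):
+ * SCORE = 32M + ((((32M << 7) / 64M) * 3 * 32M) >> 7)
+ * = 32M + ((64 * 3 * 32M) >> 7)
+ * = 32M + (6144M >> 7)
+ * = 32M + 48M = 80M
+ * which matches the percentage-based calculation above.
+ */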
+static int
+ext_size_compare(const void *x, const void *y)
+{
+ const range_seg_gap_t *rsa = x, *rsb = y;
+
+ uint64_t sa = rsa->rs_end - rsa->rs_start;
+ uint64_t sb = rsb->rs_end - rsb->rs_start;
+ uint64_t score_a, score_b;
+
+ score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
+ fill_weight * rsa->rs_fill) >> 7);
+ score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
+ fill_weight * rsb->rs_fill) >> 7);
+
+ if (score_a > score_b)
+ return (-1);
+ if (score_a == score_b) {
+ if (rsa->rs_start < rsb->rs_start)
+ return (-1);
+ if (rsa->rs_start == rsb->rs_start)
+ return (0);
+ return (1);
+ }
+ return (1);
+}
+
+/*
+ * Comparator for the q_sios_by_addr tree. Sorting is simply performed
+ * based on LBA-order (from lowest to highest).
+ */
+static int
+sio_addr_compare(const void *x, const void *y)
+{
+ const scan_io_t *a = x, *b = y;
+
+ return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
+}
+
+/* IO queues are created on demand when they are needed. */
+static dsl_scan_io_queue_t *
+scan_io_queue_create(vdev_t *vd)
+{
+ dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+ dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP);
+
+ q->q_scn = scn;
+ q->q_vd = vd;
+ q->q_sio_memused = 0;
+ cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
+ q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP,
+ &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap);
+ avl_create(&q->q_sios_by_addr, sio_addr_compare,
+ sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
+
+ return (q);
+}
+
+/*
+ * Destroys a scan queue and all segments and scan_io_t's contained in it.
+ * No further execution of I/O occurs; anything pending in the queue is
+ * simply freed without being executed.
+ */
+void
+dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio;
+ void *cookie = NULL;
+ int64_t bytes_dequeued = 0;
+
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
+ NULL) {
+ ASSERT(range_tree_contains(queue->q_exts_by_addr,
+ SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
+ bytes_dequeued += SIO_GET_ASIZE(sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
+ sio_free(sio);
+ }
+
+ ASSERT0(queue->q_sio_memused);
+ atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
+ range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
+ range_tree_destroy(queue->q_exts_by_addr);
+ avl_destroy(&queue->q_sios_by_addr);
+ cv_destroy(&queue->q_zio_cv);
+
+ kmem_free(queue, sizeof (*queue));
+}
+
+/*
+ * Properly transfers a dsl_scan_io_queue_t from `svd' to `tvd'. This is
+ * called on behalf of vdev_top_transfer when creating or destroying
+ * a mirror vdev due to zpool attach/detach.
+ */
+void
+dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
+{
+ mutex_enter(&svd->vdev_scan_io_queue_lock);
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+
+ VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
+ tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
+ svd->vdev_scan_io_queue = NULL;
+ if (tvd->vdev_scan_io_queue != NULL)
+ tvd->vdev_scan_io_queue->q_vd = tvd;
+
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ mutex_exit(&svd->vdev_scan_io_queue_lock);
+}
+
+static void
+scan_io_queues_destroy(dsl_scan_t *scn)
+{
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *tvd = rvd->vdev_child[i];
+
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+ if (tvd->vdev_scan_io_queue != NULL)
+ dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
+ tvd->vdev_scan_io_queue = NULL;
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ }
+}
+
+static void
+dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ vdev_t *vdev;
+ kmutex_t *q_lock;
+ dsl_scan_io_queue_t *queue;
+ scan_io_t *srch_sio, *sio;
+ avl_index_t idx;
+ uint64_t start, size;
+
+ vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
+ ASSERT(vdev != NULL);
+ q_lock = &vdev->vdev_scan_io_queue_lock;
+ queue = vdev->vdev_scan_io_queue;
+
+ mutex_enter(q_lock);
+ if (queue == NULL) {
+ mutex_exit(q_lock);
+ return;
+ }
+
+ srch_sio = sio_alloc(BP_GET_NDVAS(bp));
+ bp2sio(bp, srch_sio, dva_i);
+ start = SIO_GET_OFFSET(srch_sio);
+ size = SIO_GET_ASIZE(srch_sio);
+
+ /*
+ * We can find the zio in two states:
+ * 1) Cold, just sitting in the queue of zio's to be issued at
+ * some point in the future. In this case, all we do is
+ * remove the zio from the q_sios_by_addr tree, decrement
+ * its data volume from the containing range_seg_t and
+ * resort the q_exts_by_size tree to reflect that the
+ * range_seg_t has lost some of its 'fill'. We don't shorten
+ * the range_seg_t - this is usually rare enough not to be
+ * worth the extra hassle of trying to keep track of precise
+ * extent boundaries.
+ * 2) Hot, where the zio is currently in-flight in
+ * dsl_scan_issue_ios. In this case, we can't simply
+ * reach in and stop the in-flight zio's, so we instead
+ * block the caller. Eventually, dsl_scan_issue_ios will
+ * be done with issuing the zio's it gathered and will
+ * signal us.
+ */
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
+ if (sio != NULL) {
+ int64_t asize = SIO_GET_ASIZE(sio);
+ blkptr_t tmpbp;
+
+ /* Got it while it was cold in the queue */
+ ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
+ ASSERT3U(size, ==, asize);
+ avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
+
+ ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
+ range_tree_remove_fill(queue->q_exts_by_addr, start, size);
+
+ /*
+ * We only update scn_bytes_pending in the cold path,
+ * otherwise it will already have been accounted for as
+ * part of the zio's execution.
+ */
+ atomic_add_64(&scn->scn_bytes_pending, -asize);
+
+ /* count the block as though we issued it */
+ sio2bp(sio, &tmpbp);
+ count_block(scn, dp->dp_blkstats, &tmpbp);
+
+ sio_free(sio);
+ }
+ mutex_exit(q_lock);
+}
+
+/*
+ * Callback invoked when a zio_free() zio is executing. This needs to be
+ * intercepted to prevent the zio from deallocating a particular portion
+ * of disk space that could then be reallocated and written to while we
+ * still have it queued up for processing.
+ */
+void
+dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(scn != NULL);
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++)
+ dsl_scan_freed_dva(spa, bp, i);
+}
+
+/*
+ * Check if a vdev needs resilvering (non-empty DTL); if so and a resilver
+ * has not started, start one. Otherwise, only restart if the max txg in the
+ * DTL range is greater than the max txg in the current scan. If the DTL max
+ * is not greater than the scan max, then the vdev has not missed any new
+ * data since the resilver started, so a restart is not needed.
+ */
+void
+dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd)
+{
+ uint64_t min, max;
+
+ if (!vdev_resilver_needed(vd, &min, &max))
+ return;
+
+ if (!dsl_scan_resilvering(dp)) {
+ spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
+ return;
+ }
+
+ if (max <= dp->dp_scan->scn_phys.scn_max_txg)
+ return;
+
+ /* restart is needed, check if it can be deferred */
+ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
+ vdev_defer_resilver(vd);
+ else
+ spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
+}
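+
+/*
+ * Illustrative note (not part of the upstream source): a quick worked
+ * example of the restart decision above. Suppose the current scan has
+ * scn_max_txg == 100. If vdev_resilver_needed() reports a DTL range
+ * whose max is 90, the vdev has missed nothing newer than the scan
+ * already covers, so we return without restarting. If the DTL max is
+ * 120, a restart is needed; with the RESILVER_DEFER feature enabled it
+ * is deferred via vdev_defer_resilver(), otherwise SPA_ASYNC_RESILVER
+ * is requested immediately.
+ */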
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, ULONG, ZMOD_RW,
+ "Max bytes in flight per leaf vdev for scrubs and resilvers");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, INT, ZMOD_RW,
+ "Min millisecs to scrub per txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, INT, ZMOD_RW,
+ "Min millisecs to obsolete per txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, INT, ZMOD_RW,
+ "Min millisecs to free per txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, INT, ZMOD_RW,
+ "Min millisecs to resilver per txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW,
+ "Set to prevent scans from progressing");
+
+ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW,
+ "Set to disable scrub I/O");
+
+ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW,
+ "Set to disable scrub prefetching");
+
+ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, ULONG, ZMOD_RW,
+ "Max number of blocks freed in one txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, ULONG, ZMOD_RW,
+ "Max number of dedup blocks freed in one txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW,
+ "Enable processing of the free_bpobj");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, INT, ZMOD_RW,
+ "Fraction of RAM for scan hard limit");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, INT, ZMOD_RW,
+ "IO issuing strategy during scrubbing. "
+ "0 = default, 1 = LBA, 2 = size");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW,
+ "Scrub using legacy non-sequential method");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, INT, ZMOD_RW,
+ "Scan progress on-disk checkpointing interval");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, ULONG, ZMOD_RW,
+ "Max gap in bytes between sequential scrub / resilver I/Os");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, INT, ZMOD_RW,
+ "Fraction of hard limit used as soft limit");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW,
+ "Tunable to attempt to reduce lock contention");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, INT, ZMOD_RW,
+ "Tunable to adjust bias towards more filled segments during scans");
+
+ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
+ "Process all resilvers immediately");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dsl_synctask.c b/sys/contrib/openzfs/module/zfs/dsl_synctask.c
new file mode 100644
index 000000000000..148e8fff2437
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_synctask.c
@@ -0,0 +1,257 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/metaslab.h>
+
+#define DST_AVG_BLKSHIFT 14
+
+/* ARGSUSED */
+static int
+dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
+{
+ return (0);
+}
+
+static int
+dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, boolean_t early)
+{
+ spa_t *spa;
+ dmu_tx_t *tx;
+ int err;
+ dsl_sync_task_t dst = { { { NULL } } };
+ dsl_pool_t *dp;
+
+ err = spa_open(pool, &spa, FTAG);
+ if (err != 0)
+ return (err);
+ dp = spa_get_dsl(spa);
+
+top:
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ dst.dst_pool = dp;
+ dst.dst_txg = dmu_tx_get_txg(tx);
+ dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+ dst.dst_space_check = space_check;
+ dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc;
+ dst.dst_syncfunc = syncfunc;
+ dst.dst_arg = arg;
+ dst.dst_error = 0;
+ dst.dst_nowaiter = B_FALSE;
+
+ dsl_pool_config_enter(dp, FTAG);
+ err = dst.dst_checkfunc(arg, tx);
+ dsl_pool_config_exit(dp, FTAG);
+
+ if (err != 0) {
+ dmu_tx_commit(tx);
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
+ txg_list_t *task_list = (early) ?
+ &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+ VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg));
+
+ dmu_tx_commit(tx);
+
+ if (sigfunc != NULL && txg_wait_synced_sig(dp, dst.dst_txg)) {
+ /* current contract is to call func once */
+ sigfunc(arg, tx);
+ sigfunc = NULL; /* in case we're performing an EAGAIN retry */
+ }
+ txg_wait_synced(dp, dst.dst_txg);
+
+ if (dst.dst_error == EAGAIN) {
+ txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE);
+ goto top;
+ }
+
+ spa_close(spa, FTAG);
+ return (dst.dst_error);
+}
+
+/*
+ * Called from open context to perform a callback in syncing context. Waits
+ * for the operation to complete.
+ *
+ * The checkfunc will be called from open context as a preliminary check
+ * which can quickly fail. If it succeeds, it will be called again from
+ * syncing context. The checkfunc should generally be designed to work
+ * properly in either context, but if necessary it can check
+ * dmu_tx_is_syncing(tx).
+ *
+ * The synctask infrastructure enforces proper locking strategy with respect
+ * to the dp_config_rwlock -- the lock will always be held when the callbacks
+ * are called. It will be held for read during the open-context (preliminary)
+ * call to the checkfunc, and then held for write from syncing context during
+ * the calls to the check and sync funcs.
+ *
+ * A dataset or pool name can be passed as the first argument. Typically,
+ * the check func will hold, check the return value of the hold, and then
+ * release the dataset. The sync func will VERIFY0(hold()) the dataset.
+ * This is safe because no changes can be made between the check and sync funcs,
+ * and the sync func will only be called if the check func successfully opened
+ * the dataset.
+ */
+int
+dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
+ blocks_modified, space_check, B_FALSE));
+}
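+
+/*
+ * Illustrative sketch (not part of the upstream source): how a caller
+ * typically pairs a checkfunc and a syncfunc with dsl_sync_task(). The
+ * names my_feature_check(), my_feature_sync() and my_feature_arg_t are
+ * hypothetical.
+ *
+ *	static int
+ *	my_feature_check(void *arg, dmu_tx_t *tx)
+ *	{
+ *		my_feature_arg_t *mfa = arg;
+ *		dsl_dataset_t *ds;
+ *		int err;
+ *
+ *		err = dsl_dataset_hold(dmu_tx_pool(tx), mfa->mfa_name,
+ *		    FTAG, &ds);
+ *		if (err != 0)
+ *			return (err);
+ *		// validate the request against ds here
+ *		dsl_dataset_rele(ds, FTAG);
+ *		return (0);
+ *	}
+ *
+ *	static void
+ *	my_feature_sync(void *arg, dmu_tx_t *tx)
+ *	{
+ *		my_feature_arg_t *mfa = arg;
+ *		dsl_dataset_t *ds;
+ *
+ *		VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), mfa->mfa_name,
+ *		    FTAG, &ds));
+ *		// apply the change; it must not fail at this point
+ *		dsl_dataset_rele(ds, FTAG);
+ *	}
+ *
+ *	my_feature_arg_t mfa = { .mfa_name = "tank/fs" };
+ *	error = dsl_sync_task("tank", my_feature_check, my_feature_sync,
+ *	    &mfa, 1, ZFS_SPACE_CHECK_RESERVED);
+ */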
+
+/*
+ * An early synctask works exactly like a standard synctask, with one
+ * important difference in the way it is handled in syncing context. Standard
+ * synctasks run after we've written out all the dirty blocks of dirty
+ * datasets. Early synctasks are executed before writing out any dirty data,
+ * and thus before standard synctasks.
+ *
+ * For that reason, early synctasks can affect the process of writing dirty
+ * changes to disk for the txg that they run and should be used with caution.
+ * In addition, early synctasks should not dirty any metaslabs as this would
+ * invalidate the precondition/invariant for subsequent early synctasks.
+ * [see dsl_pool_sync() and dsl_early_sync_task_verify()]
+ */
+int
+dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
+ blocks_modified, space_check, B_TRUE));
+}
+
+/*
+ * A standard synctask that can be interrupted from a signal. The sigfunc
+ * is called once if a signal occurred while waiting for the task to sync.
+ */
+int
+dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, sigfunc, arg,
+ blocks_modified, space_check, B_FALSE));
+}
+
+static void
+dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ dmu_tx_t *tx, boolean_t early)
+{
+ dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
+
+ dst->dst_pool = dp;
+ dst->dst_txg = dmu_tx_get_txg(tx);
+ dst->dst_space_check = ZFS_SPACE_CHECK_NONE;
+ dst->dst_checkfunc = dsl_null_checkfunc;
+ dst->dst_syncfunc = syncfunc;
+ dst->dst_arg = arg;
+ dst->dst_error = 0;
+ dst->dst_nowaiter = B_TRUE;
+
+ txg_list_t *task_list = (early) ?
+ &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+ VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg));
+}
+
+void
+dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ dmu_tx_t *tx)
+{
+ dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_FALSE);
+}
+
+void
+dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ dmu_tx_t *tx)
+{
+ dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_TRUE);
+}
+
+/*
+ * Called in syncing context to execute the synctask.
+ */
+void
+dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dst->dst_pool;
+
+ ASSERT0(dst->dst_error);
+
+ /*
+ * Check for sufficient space.
+ *
+ * When the sync task was created, the caller specified the
+ * type of space checking required. See the comment in
+ * zfs_space_check_t for details on the semantics of each
+ * type of space checking.
+ *
+ * We just check against what's on-disk; we don't want any
+ * in-flight accounting to get in our way, because open context
+ * may have already used up various in-core limits
+ * (arc_tempreserve, dsl_pool_tempreserve).
+ */
+ if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
+ uint64_t quota = dsl_pool_unreserved_space(dp,
+ dst->dst_space_check);
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+
+ /* MOS space is triple-dittoed, so we multiply by 3. */
+ if (used + dst->dst_space * 3 > quota) {
+ dst->dst_error = SET_ERROR(ENOSPC);
+ if (dst->dst_nowaiter)
+ kmem_free(dst, sizeof (*dst));
+ return;
+ }
+ }
+
+ /*
+ * Check for errors by calling checkfunc.
+ */
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx);
+ if (dst->dst_error == 0)
+ dst->dst_syncfunc(dst->dst_arg, tx);
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ if (dst->dst_nowaiter)
+ kmem_free(dst, sizeof (*dst));
+}
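+
+/*
+ * Illustrative note (not part of the upstream source): with
+ * DST_AVG_BLKSHIFT == 14, a caller that passes blocks_modified == 4
+ * reserves dst_space = 4 << 14 = 64 KiB. Because MOS space is
+ * triple-dittoed, the check above compares used + 3 * 64 KiB = 192 KiB
+ * of additional MOS usage against the unreserved quota for the
+ * requested space-check type.
+ */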
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dsl_sync_task);
+EXPORT_SYMBOL(dsl_sync_task_nowait);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dsl_userhold.c b/sys/contrib/openzfs/module/zfs/dsl_userhold.c
new file mode 100644
index 000000000000..75d153194a00
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_userhold.c
@@ -0,0 +1,691 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+
+typedef struct dsl_dataset_user_hold_arg {
+ nvlist_t *dduha_holds;
+ nvlist_t *dduha_chkholds;
+ nvlist_t *dduha_errlist;
+ minor_t dduha_minor;
+} dsl_dataset_user_hold_arg_t;
+
+/*
+ * If you add new checks here, you may need to add additional checks to the
+ * "temporary" case in snapshot_check() in dmu_objset.c.
+ */
+int
+dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag,
+ boolean_t temphold, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ int error = 0;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ if (strlen(htag) > MAXNAMELEN)
+ return (SET_ERROR(E2BIG));
+ /* Tempholds have a more restricted length */
+ if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+ return (SET_ERROR(E2BIG));
+
+ /* tags must be unique (if ds already exists) */
+ if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+ uint64_t value;
+
+ error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ htag, 8, 1, &value);
+ if (error == 0)
+ error = SET_ERROR(EEXIST);
+ else if (error == ENOENT)
+ error = 0;
+ }
+
+ return (error);
+}
+
+static int
+dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_hold_arg_t *dduha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ nvlist_t *tmp_holds;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS)
+ return (SET_ERROR(ENOTSUP));
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ /*
+ * Ensure the list has no duplicates by copying name/values from
+ * non-unique dduha_holds to unique tmp_holds, and comparing counts.
+ */
+ tmp_holds = fnvlist_alloc();
+ for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
+ size_t len = strlen(nvpair_name(pair)) +
+ strlen(fnvpair_value_string(pair));
+ char *nameval = kmem_zalloc(len + 2, KM_SLEEP);
+ (void) strlcpy(nameval, nvpair_name(pair), len + 2);
+ (void) strlcat(nameval, "@", len + 2);
+ (void) strlcat(nameval, fnvpair_value_string(pair), len + 2);
+ fnvlist_add_string(tmp_holds, nameval, "");
+ kmem_free(nameval, len + 2);
+ }
+ size_t tmp_count = fnvlist_num_pairs(tmp_holds);
+ fnvlist_free(tmp_holds);
+ if (tmp_count != fnvlist_num_pairs(dduha->dduha_holds))
+ return (SET_ERROR(EEXIST));
+ for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
+ dsl_dataset_t *ds;
+ int error = 0;
+ char *htag, *name;
+
+ /* must be a snapshot */
+ name = nvpair_name(pair);
+ if (strchr(name, '@') == NULL)
+ error = SET_ERROR(EINVAL);
+
+ if (error == 0)
+ error = nvpair_value_string(pair, &htag);
+
+ if (error == 0)
+ error = dsl_dataset_hold(dp, name, FTAG, &ds);
+
+ if (error == 0) {
+ error = dsl_dataset_user_hold_check_one(ds, htag,
+ dduha->dduha_minor != 0, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ if (error == 0) {
+ fnvlist_add_string(dduha->dduha_chkholds, name, htag);
+ } else {
+ /*
+ * We register ENOENT errors so they can be correctly
+ * reported if needed, such as when all holds fail.
+ */
+ fnvlist_add_int32(dduha->dduha_errlist, name, error);
+ if (error != ENOENT)
+ return (error);
+ }
+ }
+
+ return (0);
+}
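+
+/*
+ * Illustrative note (not part of the upstream source): the duplicate
+ * check above works because tmp_holds is allocated with unique names.
+ * For example, if dduha_holds contains the pair ("pool/fs@snap",
+ * "mytag") twice, both copies collapse into the single tmp_holds key
+ * "pool/fs@snap@mytag", so fnvlist_num_pairs(tmp_holds) comes up short
+ * of fnvlist_num_pairs(dduha_holds) and EEXIST is returned.
+ */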
+
+
+static void
+dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds,
+ const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
+ /*
+ * This is the first user hold for this dataset. Create
+ * the userrefs zap object.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj =
+ zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
+ } else {
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
+ }
+ ds->ds_userrefs++;
+
+ VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx));
+
+ if (minor != 0) {
+ char name[MAXNAMELEN];
+ nvlist_t *tags;
+
+ VERIFY0(dsl_pool_user_hold(dp, ds->ds_object,
+ htag, now, tx));
+ (void) snprintf(name, sizeof (name), "%llx",
+ (u_longlong_t)ds->ds_object);
+
+ if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) {
+ tags = fnvlist_alloc();
+ fnvlist_add_boolean(tags, htag);
+ fnvlist_add_nvlist(tmpholds, name, tags);
+ fnvlist_free(tags);
+ } else {
+ fnvlist_add_boolean(tags, htag);
+ }
+ }
+
+ spa_history_log_internal_ds(ds, "hold", tx,
+ "tag=%s temp=%d refs=%llu",
+ htag, minor != 0, (u_longlong_t)ds->ds_userrefs);
+}
+
+typedef struct zfs_hold_cleanup_arg {
+ char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t zhca_spa_load_guid;
+ nvlist_t *zhca_holds;
+} zfs_hold_cleanup_arg_t;
+
+static void
+dsl_dataset_user_release_onexit(void *arg)
+{
+ zfs_hold_cleanup_arg_t *ca = arg;
+ spa_t *spa;
+ int error;
+
+ error = spa_open(ca->zhca_spaname, &spa, FTAG);
+ if (error != 0) {
+ zfs_dbgmsg("couldn't release holds on pool=%s "
+ "because pool is no longer loaded",
+ ca->zhca_spaname);
+ return;
+ }
+ if (spa_load_guid(spa) != ca->zhca_spa_load_guid) {
+ zfs_dbgmsg("couldn't release holds on pool=%s "
+ "because pool is no longer loaded (guid doesn't match)",
+ ca->zhca_spaname);
+ spa_close(spa, FTAG);
+ return;
+ }
+
+ (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds);
+ fnvlist_free(ca->zhca_holds);
+ kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
+ spa_close(spa, FTAG);
+}
+
+static void
+dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor)
+{
+ zfs_hold_cleanup_arg_t *ca;
+
+ if (minor == 0 || nvlist_empty(holds)) {
+ fnvlist_free(holds);
+ return;
+ }
+
+ ASSERT(spa != NULL);
+ ca = kmem_alloc(sizeof (*ca), KM_SLEEP);
+
+ (void) strlcpy(ca->zhca_spaname, spa_name(spa),
+ sizeof (ca->zhca_spaname));
+ ca->zhca_spa_load_guid = spa_load_guid(spa);
+ ca->zhca_holds = holds;
+ VERIFY0(zfs_onexit_add_cb(minor,
+ dsl_dataset_user_release_onexit, ca, NULL));
+}
+
+void
+dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag,
+ minor_t minor, uint64_t now, dmu_tx_t *tx)
+{
+ nvlist_t *tmpholds;
+
+ if (minor != 0)
+ tmpholds = fnvlist_alloc();
+ else
+ tmpholds = NULL;
+ dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx);
+ dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor);
+}
+
+static void
+dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_hold_arg_t *dduha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ nvlist_t *tmpholds;
+ uint64_t now = gethrestime_sec();
+
+ if (dduha->dduha_minor != 0)
+ tmpholds = fnvlist_alloc();
+ else
+ tmpholds = NULL;
+ for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) {
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
+ dsl_dataset_user_hold_sync_one_impl(tmpholds, ds,
+ fnvpair_value_string(pair), dduha->dduha_minor, now, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor);
+}
+
+/*
+ * The full semantics of this function are described in the comment above
+ * lzc_hold().
+ *
+ * To summarize:
+ * holds is nvl of snapname -> holdname
+ * errlist will be filled in with snapname -> error
+ *
+ * The snapshots must all be in the same pool.
+ *
+ * Holds for snapshots that don't exist will be skipped.
+ *
+ * If none of the snapshots for requested holds exist then ENOENT will be
+ * returned.
+ *
+ * If cleanup_minor is not 0, the holds will be temporary and will be cleaned
+ * up when the process exits.
+ *
+ * On success all the holds, for snapshots that existed, will be created and 0
+ * will be returned.
+ *
+ * On failure no holds will be created, the errlist will be filled in,
+ * and an errno will be returned.
+ *
+ * In all cases the errlist will contain entries for holds where the snapshot
+ * didn't exist.
+ */
+int
+dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist)
+{
+ dsl_dataset_user_hold_arg_t dduha;
+ nvpair_t *pair;
+ int ret;
+
+ pair = nvlist_next_nvpair(holds, NULL);
+ if (pair == NULL)
+ return (0);
+
+ dduha.dduha_holds = holds;
+ /* chkholds can have non-unique name */
+ VERIFY(0 == nvlist_alloc(&dduha.dduha_chkholds, 0, KM_SLEEP));
+ dduha.dduha_errlist = errlist;
+ dduha.dduha_minor = cleanup_minor;
+
+ ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
+ dsl_dataset_user_hold_sync, &dduha,
+ fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED);
+ fnvlist_free(dduha.dduha_chkholds);
+
+ return (ret);
+}
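+
+/*
+ * Illustrative sketch (not part of the upstream source): building the
+ * holds nvlist described above and requesting two permanent holds in a
+ * single call. The snapshot and tag names are hypothetical.
+ *
+ *	nvlist_t *holds = fnvlist_alloc();
+ *	nvlist_t *errlist = fnvlist_alloc();
+ *
+ *	fnvlist_add_string(holds, "tank/fs@backup", "send-2021");
+ *	fnvlist_add_string(holds, "tank/fs@hourly", "send-2021");
+ *	error = dsl_dataset_user_hold(holds, 0, errlist);
+ *	// error == ENOENT only if none of the snapshots exist;
+ *	// errlist maps any missing snapshot name to its errno.
+ *	fnvlist_free(errlist);
+ *	fnvlist_free(holds);
+ */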
+
+typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag,
+ dsl_dataset_t **dsp);
+
+typedef struct dsl_dataset_user_release_arg {
+ dsl_holdfunc_t *ddura_holdfunc;
+ nvlist_t *ddura_holds;
+ nvlist_t *ddura_todelete;
+ nvlist_t *ddura_errlist;
+ nvlist_t *ddura_chkholds;
+} dsl_dataset_user_release_arg_t;
+
+/* Place a dataset hold on the snapshot identified by passed dsobj string */
+static int
+dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag,
+ dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_hold_obj(dp, zfs_strtonum(dsobj, NULL), tag, dsp));
+}
+
+static int
+dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura,
+ dsl_dataset_t *ds, nvlist_t *holds, const char *snapname)
+{
+ uint64_t zapobj;
+ nvlist_t *holds_found;
+ objset_t *mos;
+ int numholds;
+
+ if (!ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_empty(holds))
+ return (0);
+
+ numholds = 0;
+ mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
+ VERIFY0(nvlist_alloc(&holds_found, NV_UNIQUE_NAME, KM_SLEEP));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ uint64_t tmp;
+ int error;
+ const char *holdname = nvpair_name(pair);
+
+ if (zapobj != 0)
+ error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp);
+ else
+ error = SET_ERROR(ENOENT);
+
+ /*
+ * Non-existent holds are put on the errlist, but don't
+ * cause an overall failure.
+ */
+ if (error == ENOENT) {
+ if (ddura->ddura_errlist != NULL) {
+ char *errtag = kmem_asprintf("%s#%s",
+ snapname, holdname);
+ fnvlist_add_int32(ddura->ddura_errlist, errtag,
+ ENOENT);
+ kmem_strfree(errtag);
+ }
+ continue;
+ }
+
+ if (error != 0) {
+ fnvlist_free(holds_found);
+ return (error);
+ }
+
+ fnvlist_add_boolean(holds_found, holdname);
+ numholds++;
+ }
+
+ if (DS_IS_DEFER_DESTROY(ds) &&
+ dsl_dataset_phys(ds)->ds_num_children == 1 &&
+ ds->ds_userrefs == numholds) {
+ /* we need to destroy the snapshot as well */
+ if (dsl_dataset_long_held(ds)) {
+ fnvlist_free(holds_found);
+ return (SET_ERROR(EBUSY));
+ }
+ fnvlist_add_boolean(ddura->ddura_todelete, snapname);
+ }
+
+ if (numholds != 0) {
+ fnvlist_add_nvlist(ddura->ddura_chkholds, snapname,
+ holds_found);
+ }
+ fnvlist_free(holds_found);
+
+ return (0);
+}
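+
+/*
+ * Illustrative note (not part of the upstream source): the
+ * defer-destroy check above means that if a snapshot was destroyed
+ * with "defer" semantics, has no clones (ds_num_children == 1), and
+ * every remaining user reference is being released by this request,
+ * the snapshot itself is queued on ddura_todelete and destroyed in
+ * the sync phase below.
+ */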
+
+static int
+dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_release_arg_t *ddura;
+ dsl_holdfunc_t *holdfunc;
+ dsl_pool_t *dp;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ dp = dmu_tx_pool(tx);
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ ddura = arg;
+ holdfunc = ddura->ddura_holdfunc;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) {
+ int error;
+ dsl_dataset_t *ds;
+ nvlist_t *holds;
+ const char *snapname = nvpair_name(pair);
+
+ error = nvpair_value_nvlist(pair, &holds);
+ if (error != 0)
+ error = (SET_ERROR(EINVAL));
+ else
+ error = holdfunc(dp, snapname, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_dataset_user_release_check_one(ddura, ds,
+ holds, snapname);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ if (error != 0) {
+ if (ddura->ddura_errlist != NULL) {
+ fnvlist_add_int32(ddura->ddura_errlist,
+ snapname, error);
+ }
+ /*
+ * Non-existent snapshots are put on the errlist,
+ * but don't cause an overall failure.
+ */
+ if (error != ENOENT)
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+static void
+dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ int error;
+ const char *holdname = nvpair_name(pair);
+
+ /* Remove temporary hold if one exists. */
+ error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx);
+ VERIFY(error == 0 || error == ENOENT);
+
+ VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ holdname, tx));
+ ds->ds_userrefs--;
+
+ spa_history_log_internal_ds(ds, "release", tx,
+ "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs);
+ }
+}
+
+static void
+dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_release_arg_t *ddura = arg;
+ dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds,
+ pair)) {
+ dsl_dataset_t *ds;
+ const char *name = nvpair_name(pair);
+
+ VERIFY0(holdfunc(dp, name, FTAG, &ds));
+
+ dsl_dataset_user_release_sync_one(ds,
+ fnvpair_value_nvlist(pair), tx);
+ if (nvlist_exists(ddura->ddura_todelete, name)) {
+ ASSERT(ds->ds_userrefs == 0 &&
+ dsl_dataset_phys(ds)->ds_num_children == 1 &&
+ DS_IS_DEFER_DESTROY(ds));
+ dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ }
+}
+
+/*
+ * The full semantics of this function are described in the comment above
+ * lzc_release().
+ *
+ * To summarize:
+ * Releases holds specified in the nvl holds.
+ *
+ * holds is nvl of snapname -> { holdname, ... }
+ * errlist will be filled in with snapname -> error
+ *
+ * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots,
+ * otherwise they should be the names of snapshots.
+ *
+ * As a release may cause snapshots to be destroyed, this tries to ensure they
+ * aren't mounted.
+ *
+ * The release of non-existent holds is skipped.
+ *
+ * At least one hold must have been released for this function to succeed
+ * and return 0.
+ */
+static int
+dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
+ dsl_pool_t *tmpdp)
+{
+ dsl_dataset_user_release_arg_t ddura;
+ nvpair_t *pair;
+ char *pool;
+ int error;
+
+ pair = nvlist_next_nvpair(holds, NULL);
+ if (pair == NULL)
+ return (0);
+
+ /*
+ * The release may cause snapshots to be destroyed; make sure they
+ * are not mounted.
+ */
+ if (tmpdp != NULL) {
+ /* Temporary holds are specified by dsobj string. */
+ ddura.ddura_holdfunc = dsl_dataset_hold_obj_string;
+ pool = spa_name(tmpdp->dp_spa);
+#ifdef _KERNEL
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ dsl_dataset_t *ds;
+
+ dsl_pool_config_enter(tmpdp, FTAG);
+ error = dsl_dataset_hold_obj_string(tmpdp,
+ nvpair_name(pair), FTAG, &ds);
+ if (error == 0) {
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(ds, name);
+ dsl_pool_config_exit(tmpdp, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ (void) zfs_unmount_snap(name);
+ } else {
+ dsl_pool_config_exit(tmpdp, FTAG);
+ }
+ }
+#endif
+ } else {
+ /* Non-temporary holds are specified by name. */
+ ddura.ddura_holdfunc = dsl_dataset_hold;
+ pool = nvpair_name(pair);
+#ifdef _KERNEL
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ (void) zfs_unmount_snap(nvpair_name(pair));
+ }
+#endif
+ }
+
+ ddura.ddura_holds = holds;
+ ddura.ddura_errlist = errlist;
+ VERIFY0(nvlist_alloc(&ddura.ddura_todelete, NV_UNIQUE_NAME,
+ KM_SLEEP));
+ VERIFY0(nvlist_alloc(&ddura.ddura_chkholds, NV_UNIQUE_NAME,
+ KM_SLEEP));
+
+ error = dsl_sync_task(pool, dsl_dataset_user_release_check,
+ dsl_dataset_user_release_sync, &ddura, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
+ fnvlist_free(ddura.ddura_todelete);
+ fnvlist_free(ddura.ddura_chkholds);
+
+ return (error);
+}
+
+/*
+ * holds is nvl of snapname -> { holdname, ... }
+ * errlist will be filled in with snapname -> error
+ */
+int
+dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist)
+{
+ return (dsl_dataset_user_release_impl(holds, errlist, NULL));
+}
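+
+/*
+ * Illustrative sketch (not part of the upstream source): releasing a
+ * named hold using the nvlist shape described above
+ * (snapname -> { holdname, ... }). The names are hypothetical.
+ *
+ *	nvlist_t *holds = fnvlist_alloc();
+ *	nvlist_t *tags = fnvlist_alloc();
+ *	nvlist_t *errlist = fnvlist_alloc();
+ *
+ *	fnvlist_add_boolean(tags, "send-2021");
+ *	fnvlist_add_nvlist(holds, "tank/fs@backup", tags);
+ *	error = dsl_dataset_user_release(holds, errlist);
+ *	fnvlist_free(errlist);
+ *	fnvlist_free(tags);
+ *	fnvlist_free(holds);
+ */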
+
+/*
+ * holds is nvl of snapdsobj -> { holdname, ... }
+ */
+void
+dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds)
+{
+ ASSERT(dp != NULL);
+ (void) dsl_dataset_user_release_impl(holds, NULL, dp);
+}
+
+int
+dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_pool_hold(dsname, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+ zap_attribute_t *za;
+ zap_cursor_t zc;
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_userrefs_obj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ zap_cursor_advance(&zc)) {
+ fnvlist_add_uint64(nvl, za->za_name,
+ za->za_first_integer);
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (zap_attribute_t));
+ }
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/edonr_zfs.c b/sys/contrib/openzfs/module/zfs/edonr_zfs.c
new file mode 100644
index 000000000000..aa00e1c9417e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/edonr_zfs.c
@@ -0,0 +1,115 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/edonr.h>
+#include <sys/abd.h>
+
+#define EDONR_MODE 512
+#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE
+
+static int
+edonr_incremental(void *buf, size_t size, void *arg)
+{
+ EdonRState *ctx = arg;
+ EdonRUpdate(ctx, buf, size * 8);
+ return (0);
+}
+
+/*
+ * Native zio_checksum interface for the Edon-R hash function.
+ */
+/*ARGSUSED*/
+void
+abd_checksum_edonr_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ uint8_t digest[EDONR_MODE / 8];
+ EdonRState ctx;
+
+ ASSERT(ctx_template != NULL);
+ bcopy(ctx_template, &ctx, sizeof (ctx));
+ (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx);
+ EdonRFinal(&ctx, digest);
+ bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
+}
+
+/*
+ * Byteswapped zio_checksum interface for the Edon-R hash function.
+ */
+void
+abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ abd_checksum_edonr_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(zcp->zc_word[3]);
+}
+
+void *
+abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
+{
+ EdonRState *ctx;
+ uint8_t salt_block[EDONR_BLOCK_SIZE];
+
+ /*
+ * Edon-R needs all but the last hash invocation to be on full-size
+ * blocks, but the salt is too small. Rather than simply padding it
+ * with zeros, we expand the salt into a new salt block of proper
+ * size by double-hashing it (the new salt block will be composed of
+ * H(salt) || H(H(salt))).
+ */
+ CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8));
+ EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8,
+ salt_block);
+ EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block +
+ EDONR_MODE / 8);
+
+ /*
+ * Feed the new salt block into the hash function - this will serve
+ * as our MAC key.
+ */
+ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ EdonRInit(ctx, EDONR_MODE);
+ EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8);
+ return (ctx);
+}
+
+void
+abd_checksum_edonr_tmpl_free(void *ctx_template)
+{
+ EdonRState *ctx = ctx_template;
+
+ bzero(ctx, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+}
diff --git a/sys/contrib/openzfs/module/zfs/fm.c b/sys/contrib/openzfs/module/zfs/fm.c
new file mode 100644
index 000000000000..a5003f85d621
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/fm.c
@@ -0,0 +1,1686 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Fault Management Architecture (FMA) Resource and Protocol Support
+ *
+ * The routines contained herein provide services to support kernel subsystems
+ * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089).
+ *
+ * Name-Value Pair Lists
+ *
+ * The embodiment of an FMA protocol element (event, fmri or authority) is a
+ * name-value pair list (nvlist_t). FMA-specific nvlist constructor and
+ * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used
+ * to create an nvpair list using custom allocators. Callers may choose to
+ * allocate either from the kernel memory allocator, or from a preallocated
+ * buffer, useful in constrained contexts like high-level interrupt routines.
+ *
+ * Protocol Event and FMRI Construction
+ *
+ * Convenience routines are provided to construct nvlist events according to
+ * the FMA Event Protocol and Naming Schema specification for ereports and
+ * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes.
+ *
+ * ENA Manipulation
+ *
+ * Routines to generate ENA formats 0, 1 and 2 are available as well as
+ * routines to increment formats 1 and 2. Individual fields within the
+ * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(),
+ * fm_ena_format_get() and fm_ena_gen_get().
+ */
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/list.h>
+#include <sys/nvpair.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+#include <sys/sunddi.h>
+#include <sys/systeminfo.h>
+#include <sys/fm/util.h>
+#include <sys/fm/protocol.h>
+#include <sys/kstat.h>
+#include <sys/zfs_context.h>
+#ifdef _KERNEL
+#include <sys/atomic.h>
+#include <sys/condvar.h>
+#include <sys/console.h>
+#include <sys/time.h>
+#include <sys/zfs_ioctl.h>
+
+int zfs_zevent_len_max = 0;
+int zfs_zevent_cols = 80;
+int zfs_zevent_console = 0;
+
+static int zevent_len_cur = 0;
+static int zevent_waiters = 0;
+static int zevent_flags = 0;
+
+/* Num events rate limited since the last time zfs_zevent_next() was called */
+static uint64_t ratelimit_dropped = 0;
+
+/*
+ * The EID (Event IDentifier) is used to uniquely tag a zevent when it is
+ * posted. The posted EIDs are monotonically increasing but not persistent.
+ * They will be reset to the initial value (1) each time the kernel module is
+ * loaded.
+ */
+static uint64_t zevent_eid = 0;
+
+static kmutex_t zevent_lock;
+static list_t zevent_list;
+static kcondvar_t zevent_cv;
+#endif /* _KERNEL */
+
+
+/*
+ * Common fault management kstats to record event generation failures
+ */
+
+struct erpt_kstat {
+ kstat_named_t erpt_dropped; /* num erpts dropped on post */
+ kstat_named_t erpt_set_failed; /* num erpt set failures */
+ kstat_named_t fmri_set_failed; /* num fmri set failures */
+ kstat_named_t payload_set_failed; /* num payload set failures */
+ kstat_named_t erpt_duplicates; /* num duplicate erpts */
+};
+
+static struct erpt_kstat erpt_kstat_data = {
+ { "erpt-dropped", KSTAT_DATA_UINT64 },
+ { "erpt-set-failed", KSTAT_DATA_UINT64 },
+ { "fmri-set-failed", KSTAT_DATA_UINT64 },
+ { "payload-set-failed", KSTAT_DATA_UINT64 },
+ { "erpt-duplicates", KSTAT_DATA_UINT64 }
+};
+
+kstat_t *fm_ksp;
+
+#ifdef _KERNEL
+
+/*
+ * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of
+ * output so they aren't split across console lines, and return the end column.
+ */
+/*PRINTFLIKE4*/
+static int
+fm_printf(int depth, int c, int cols, const char *format, ...)
+{
+ va_list ap;
+ int width;
+ char c1;
+
+ va_start(ap, format);
+ width = vsnprintf(&c1, sizeof (c1), format, ap);
+ va_end(ap);
+
+ if (c + width >= cols) {
+ console_printf("\n");
+ c = 0;
+ if (format[0] != ' ' && depth > 0) {
+ console_printf(" ");
+ c++;
+ }
+ }
+
+ va_start(ap, format);
+ console_vprintf(format, ap);
+ va_end(ap);
+
+ return ((c + width) % cols);
+}
+
+/*
+ * Recursively print an nvlist in the specified column width and return the
+ * column we end up in. This function is called recursively by fm_nvprint(),
+ * below. We generically format the entire nvpair using hexadecimal
+ * integers and strings, and elide any integer arrays. Arrays are basically
+ * used for cache dumps right now, so we suppress them so as not to overwhelm
+ * the amount of console output we produce at panic time. This can be further
+ * enhanced as FMA technology grows based upon the needs of consumers. All
+ * FMA telemetry is logged using the dump device transport, so the console
+ * output serves only as a fallback in case this procedure is unsuccessful.
+ */
+static int
+fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
+{
+ nvpair_t *nvp;
+
+ for (nvp = nvlist_next_nvpair(nvl, NULL);
+ nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
+
+ data_type_t type = nvpair_type(nvp);
+ const char *name = nvpair_name(nvp);
+
+ boolean_t b;
+ uint8_t i8;
+ uint16_t i16;
+ uint32_t i32;
+ uint64_t i64;
+ char *str;
+ nvlist_t *cnv;
+
+ if (strcmp(name, FM_CLASS) == 0)
+ continue; /* already printed by caller */
+
+ c = fm_printf(d, c, cols, " %s=", name);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ c = fm_printf(d + 1, c, cols, " 1");
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) nvpair_value_boolean_value(nvp, &b);
+ c = fm_printf(d + 1, c, cols, b ? "1" : "0");
+ break;
+
+ case DATA_TYPE_BYTE:
+ (void) nvpair_value_byte(nvp, &i8);
+ c = fm_printf(d + 1, c, cols, "0x%x", i8);
+ break;
+
+ case DATA_TYPE_INT8:
+ (void) nvpair_value_int8(nvp, (void *)&i8);
+ c = fm_printf(d + 1, c, cols, "0x%x", i8);
+ break;
+
+ case DATA_TYPE_UINT8:
+ (void) nvpair_value_uint8(nvp, &i8);
+ c = fm_printf(d + 1, c, cols, "0x%x", i8);
+ break;
+
+ case DATA_TYPE_INT16:
+ (void) nvpair_value_int16(nvp, (void *)&i16);
+ c = fm_printf(d + 1, c, cols, "0x%x", i16);
+ break;
+
+ case DATA_TYPE_UINT16:
+ (void) nvpair_value_uint16(nvp, &i16);
+ c = fm_printf(d + 1, c, cols, "0x%x", i16);
+ break;
+
+ case DATA_TYPE_INT32:
+ (void) nvpair_value_int32(nvp, (void *)&i32);
+ c = fm_printf(d + 1, c, cols, "0x%x", i32);
+ break;
+
+ case DATA_TYPE_UINT32:
+ (void) nvpair_value_uint32(nvp, &i32);
+ c = fm_printf(d + 1, c, cols, "0x%x", i32);
+ break;
+
+ case DATA_TYPE_INT64:
+ (void) nvpair_value_int64(nvp, (void *)&i64);
+ c = fm_printf(d + 1, c, cols, "0x%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_UINT64:
+ (void) nvpair_value_uint64(nvp, &i64);
+ c = fm_printf(d + 1, c, cols, "0x%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ (void) nvpair_value_hrtime(nvp, (void *)&i64);
+ c = fm_printf(d + 1, c, cols, "0x%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_STRING:
+ (void) nvpair_value_string(nvp, &str);
+ c = fm_printf(d + 1, c, cols, "\"%s\"",
+ str ? str : "<NULL>");
+ break;
+
+ case DATA_TYPE_NVLIST:
+ c = fm_printf(d + 1, c, cols, "[");
+ (void) nvpair_value_nvlist(nvp, &cnv);
+ c = fm_nvprintr(cnv, d + 1, c, cols);
+ c = fm_printf(d + 1, c, cols, " ]");
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[");
+ (void) nvpair_value_nvlist_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++) {
+ c = fm_nvprintr(val[i], d + 1, c, cols);
+ }
+ c = fm_printf(d + 1, c, cols, " ]");
+ }
+ break;
+
+ case DATA_TYPE_INT8_ARRAY: {
+ int8_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_int8_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_UINT8_ARRAY: {
+ uint8_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_uint8_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_INT16_ARRAY: {
+ int16_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_int16_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_UINT16_ARRAY: {
+ uint16_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_uint16_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_INT32_ARRAY: {
+ int32_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_int32_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_UINT32_ARRAY: {
+ uint32_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_uint32_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_INT64_ARRAY: {
+ int64_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_int64_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_UINT64_ARRAY: {
+ uint64_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_uint64_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ c = fm_printf(d + 1, c, cols, "[...]");
+ break;
+
+ case DATA_TYPE_UNKNOWN:
+ case DATA_TYPE_DONTCARE:
+ c = fm_printf(d + 1, c, cols, "<unknown>");
+ break;
+ }
+ }
+
+ return (c);
+}
+
+void
+fm_nvprint(nvlist_t *nvl)
+{
+ char *class;
+ int c = 0;
+
+ console_printf("\n");
+
+ if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0)
+ c = fm_printf(0, c, zfs_zevent_cols, "%s", class);
+
+ if (fm_nvprintr(nvl, 0, c, zfs_zevent_cols) != 0)
+ console_printf("\n");
+
+ console_printf("\n");
+}
+
+static zevent_t *
+zfs_zevent_alloc(void)
+{
+ zevent_t *ev;
+
+ ev = kmem_zalloc(sizeof (zevent_t), KM_SLEEP);
+
+ list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t),
+ offsetof(zfs_zevent_t, ze_node));
+ list_link_init(&ev->ev_node);
+
+ return (ev);
+}
+
+static void
+zfs_zevent_free(zevent_t *ev)
+{
+ /* Run provided cleanup callback */
+ ev->ev_cb(ev->ev_nvl, ev->ev_detector);
+
+ list_destroy(&ev->ev_ze_list);
+ kmem_free(ev, sizeof (zevent_t));
+}
+
+static void
+zfs_zevent_drain(zevent_t *ev)
+{
+ zfs_zevent_t *ze;
+
+ ASSERT(MUTEX_HELD(&zevent_lock));
+ list_remove(&zevent_list, ev);
+
+ /* Remove references to this event in all private file data */
+ while ((ze = list_head(&ev->ev_ze_list)) != NULL) {
+ list_remove(&ev->ev_ze_list, ze);
+ ze->ze_zevent = NULL;
+ ze->ze_dropped++;
+ }
+
+ zfs_zevent_free(ev);
+}
+
+void
+zfs_zevent_drain_all(int *count)
+{
+ zevent_t *ev;
+
+ mutex_enter(&zevent_lock);
+ while ((ev = list_head(&zevent_list)) != NULL)
+ zfs_zevent_drain(ev);
+
+ *count = zevent_len_cur;
+ zevent_len_cur = 0;
+ mutex_exit(&zevent_lock);
+}
+
+/*
+ * New zevents are inserted at the head. If the maximum queue
+ * length is exceeded a zevent will be drained from the tail.
+ * As part of this any user space processes which currently have
+ * a reference to this zevent_t in their private data will have
+ * this reference set to NULL.
+ */
+static void
+zfs_zevent_insert(zevent_t *ev)
+{
+ ASSERT(MUTEX_HELD(&zevent_lock));
+ list_insert_head(&zevent_list, ev);
+
+ if (zevent_len_cur >= zfs_zevent_len_max)
+ zfs_zevent_drain(list_tail(&zevent_list));
+ else
+ zevent_len_cur++;
+}
+
+/*
+ * Post a zevent. The cb will be called when nvl and detector are no longer
+ * needed, i.e.:
+ * - An error happened and a zevent can't be posted. In this case, cb is called
+ * before zfs_zevent_post() returns.
+ * - The event is being drained and freed.
+ */
+int
+zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb)
+{
+ inode_timespec_t tv;
+ int64_t tv_array[2];
+ uint64_t eid;
+ size_t nvl_size = 0;
+ zevent_t *ev;
+ int error;
+
+ ASSERT(cb != NULL);
+
+ gethrestime(&tv);
+ tv_array[0] = tv.tv_sec;
+ tv_array[1] = tv.tv_nsec;
+
+ error = nvlist_add_int64_array(nvl, FM_EREPORT_TIME, tv_array, 2);
+ if (error) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ goto out;
+ }
+
+ eid = atomic_inc_64_nv(&zevent_eid);
+ error = nvlist_add_uint64(nvl, FM_EREPORT_EID, eid);
+ if (error) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ goto out;
+ }
+
+ error = nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE);
+ if (error) {
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
+ goto out;
+ }
+
+ if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
+ error = EOVERFLOW;
+ goto out;
+ }
+
+ if (zfs_zevent_console)
+ fm_nvprint(nvl);
+
+ ev = zfs_zevent_alloc();
+ if (ev == NULL) {
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
+ error = ENOMEM;
+ goto out;
+ }
+
+ ev->ev_nvl = nvl;
+ ev->ev_detector = detector;
+ ev->ev_cb = cb;
+ ev->ev_eid = eid;
+
+ mutex_enter(&zevent_lock);
+ zfs_zevent_insert(ev);
+ cv_broadcast(&zevent_cv);
+ mutex_exit(&zevent_lock);
+
+out:
+ if (error)
+ cb(nvl, detector);
+
+ return (error);
+}
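+
+/*
+ * Illustrative sketch (not part of the upstream source): the cleanup
+ * contract described above. The callback owns nvl and detector and is
+ * invoked exactly once, either on a failed post (before this function
+ * returns) or when the event is eventually drained. The name
+ * my_zevent_cb() is hypothetical.
+ *
+ *	static void
+ *	my_zevent_cb(nvlist_t *nvl, nvlist_t *detector)
+ *	{
+ *		fm_nvlist_destroy(nvl, FM_NVA_FREE);
+ *		if (detector != NULL)
+ *			fm_nvlist_destroy(detector, FM_NVA_FREE);
+ *	}
+ *
+ *	(void) zfs_zevent_post(nvl, detector, my_zevent_cb);
+ */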
+
+void
+zfs_zevent_track_duplicate(void)
+{
+ atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64);
+}
+
+static int
+zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze)
+{
+ *ze = zfsdev_get_state(minor, ZST_ZEVENT);
+ if (*ze == NULL)
+ return (SET_ERROR(EBADF));
+
+ return (0);
+}
+
+int
+zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze)
+{
+ int error;
+
+ error = zfsdev_getminor(fd, minorp);
+ if (error == 0)
+ error = zfs_zevent_minor_to_state(*minorp, ze);
+
+ if (error)
+ zfs_zevent_fd_rele(fd);
+
+ return (error);
+}
+
+void
+zfs_zevent_fd_rele(int fd)
+{
+ zfs_file_put(fd);
+}
+
+/*
+ * Get the next zevent in the stream and place a copy in 'event'. This
+ * may fail with ENOMEM if the encoded nvlist size exceeds the passed
+ * 'event_size'. In this case the stream pointer is not advanced and
+ * 'event_size' is set to the minimum required buffer size.
+ */
+int
+zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size,
+ uint64_t *dropped)
+{
+ zevent_t *ev;
+ size_t size;
+ int error = 0;
+
+ mutex_enter(&zevent_lock);
+ if (ze->ze_zevent == NULL) {
+ /* A new stream starts at the beginning/tail */
+ ev = list_tail(&zevent_list);
+ if (ev == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ } else {
+ /*
+ * An existing stream continues with the next element; remove
+ * ourselves from the wait queue of the previous element.
+ */
+ ev = list_prev(&zevent_list, ze->ze_zevent);
+ if (ev == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ }
+
+ VERIFY(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE) == 0);
+ if (size > *event_size) {
+ *event_size = size;
+ error = ENOMEM;
+ goto out;
+ }
+
+ if (ze->ze_zevent)
+ list_remove(&ze->ze_zevent->ev_ze_list, ze);
+
+ ze->ze_zevent = ev;
+ list_insert_head(&ev->ev_ze_list, ze);
+ (void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP);
+ *dropped = ze->ze_dropped;
+
+#ifdef _KERNEL
+ /* Include events dropped due to rate limiting */
+ *dropped += ratelimit_dropped;
+ ratelimit_dropped = 0;
+#endif
+ ze->ze_dropped = 0;
+out:
+ mutex_exit(&zevent_lock);
+
+ return (error);
+}
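+
+/*
+ * Illustrative sketch (not part of the upstream source): consuming the
+ * stream with the ENOMEM contract described above. When the copy does
+ * not fit, 'size' is updated to the required minimum and the same
+ * event can simply be requested again. The initial size is arbitrary.
+ *
+ *	uint64_t size = 1024, dropped;
+ *	nvlist_t *event;
+ *	int err;
+ *
+ *	do {
+ *		err = zfs_zevent_next(ze, &event, &size, &dropped);
+ *	} while (err == ENOMEM);
+ *	if (err == 0) {
+ *		// consume the event here
+ *		nvlist_free(event);
+ *	}
+ */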
+
+/*
+ * Wait in an interruptible state for any new events.
+ */
+int
+zfs_zevent_wait(zfs_zevent_t *ze)
+{
+ int error = EAGAIN;
+
+ mutex_enter(&zevent_lock);
+ zevent_waiters++;
+
+ while (error == EAGAIN) {
+ if (zevent_flags & ZEVENT_SHUTDOWN) {
+ error = SET_ERROR(ESHUTDOWN);
+ break;
+ }
+
+ error = cv_wait_sig(&zevent_cv, &zevent_lock);
+ if (signal_pending(current)) {
+ error = SET_ERROR(EINTR);
+ break;
+ } else if (!list_is_empty(&zevent_list)) {
+ error = 0;
+ continue;
+ } else {
+ error = EAGAIN;
+ }
+ }
+
+ zevent_waiters--;
+ mutex_exit(&zevent_lock);
+
+ return (error);
+}
+
+/*
+ * The caller may seek to a specific EID by passing that EID. If the EID
+ * is still available in the posted list of events the cursor is positioned
+ * there. Otherwise ENOENT is returned and the cursor is not moved.
+ *
+ * There are two reserved EIDs which may be passed and will never fail.
+ * ZEVENT_SEEK_START positions the cursor at the start of the list, and
+ * ZEVENT_SEEK_END positions the cursor at the end of the list.
+ */
+int
+zfs_zevent_seek(zfs_zevent_t *ze, uint64_t eid)
+{
+ zevent_t *ev;
+ int error = 0;
+
+ mutex_enter(&zevent_lock);
+
+ if (eid == ZEVENT_SEEK_START) {
+ if (ze->ze_zevent)
+ list_remove(&ze->ze_zevent->ev_ze_list, ze);
+
+ ze->ze_zevent = NULL;
+ goto out;
+ }
+
+ if (eid == ZEVENT_SEEK_END) {
+ if (ze->ze_zevent)
+ list_remove(&ze->ze_zevent->ev_ze_list, ze);
+
+ ev = list_head(&zevent_list);
+ if (ev) {
+ ze->ze_zevent = ev;
+ list_insert_head(&ev->ev_ze_list, ze);
+ } else {
+ ze->ze_zevent = NULL;
+ }
+
+ goto out;
+ }
+
+ for (ev = list_tail(&zevent_list); ev != NULL;
+ ev = list_prev(&zevent_list, ev)) {
+ if (ev->ev_eid == eid) {
+ if (ze->ze_zevent)
+ list_remove(&ze->ze_zevent->ev_ze_list, ze);
+
+ ze->ze_zevent = ev;
+ list_insert_head(&ev->ev_ze_list, ze);
+ break;
+ }
+ }
+
+ if (ev == NULL)
+ error = ENOENT;
+
+out:
+ mutex_exit(&zevent_lock);
+
+ return (error);
+}
+
+void
+zfs_zevent_init(zfs_zevent_t **zep)
+{
+ zfs_zevent_t *ze;
+
+ ze = *zep = kmem_zalloc(sizeof (zfs_zevent_t), KM_SLEEP);
+ list_link_init(&ze->ze_node);
+}
+
+void
+zfs_zevent_destroy(zfs_zevent_t *ze)
+{
+ mutex_enter(&zevent_lock);
+ if (ze->ze_zevent)
+ list_remove(&ze->ze_zevent->ev_ze_list, ze);
+ mutex_exit(&zevent_lock);
+
+ kmem_free(ze, sizeof (zfs_zevent_t));
+}
+#endif /* _KERNEL */
+
+/*
+ * Wrappers for FM nvlist allocators
+ */
+/* ARGSUSED */
+static void *
+i_fm_alloc(nv_alloc_t *nva, size_t size)
+{
+ return (kmem_zalloc(size, KM_SLEEP));
+}
+
+/* ARGSUSED */
+static void
+i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
+{
+ kmem_free(buf, size);
+}
+
+const nv_alloc_ops_t fm_mem_alloc_ops = {
+ .nv_ao_init = NULL,
+ .nv_ao_fini = NULL,
+ .nv_ao_alloc = i_fm_alloc,
+ .nv_ao_free = i_fm_free,
+ .nv_ao_reset = NULL
+};
+
+/*
+ * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer
+ * to the newly allocated nv_alloc_t structure is returned upon success or NULL
+ * is returned to indicate that the nv_alloc structure could not be created.
+ */
+nv_alloc_t *
+fm_nva_xcreate(char *buf, size_t bufsz)
+{
+ nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
+
+ if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) {
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ return (NULL);
+ }
+
+ return (nvhdl);
+}
+
+/*
+ * Destroy a previously allocated nv_alloc structure. The fixed buffer
+ * associated with nva must be freed by the caller.
+ */
+void
+fm_nva_xdestroy(nv_alloc_t *nva)
+{
+ nv_alloc_fini(nva);
+ kmem_free(nva, sizeof (nv_alloc_t));
+}
+
+/*
+ * Create a new nv list. A pointer to a new nv list structure is returned
+ * upon success or NULL is returned to indicate that the structure could
+ * not be created. The newly created nv list is created and managed by the
+ * operations installed in nva. If nva is NULL, the default FMA nva
+ * operations are installed and used.
+ *
+ * When called from the kernel and nva == NULL, this function must be called
+ * from passive kernel context with no locks held that can prevent a
+ * sleeping memory allocation from occurring. Otherwise, this function may
+ * be called from other kernel contexts as long as a valid nva created via
+ * fm_nva_xcreate() is supplied.
+ */
+nvlist_t *
+fm_nvlist_create(nv_alloc_t *nva)
+{
+ int hdl_alloced = 0;
+ nvlist_t *nvl;
+ nv_alloc_t *nvhdl;
+
+ if (nva == NULL) {
+ nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
+
+ if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) {
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ return (NULL);
+ }
+ hdl_alloced = 1;
+ } else {
+ nvhdl = nva;
+ }
+
+ if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) {
+ if (hdl_alloced) {
+ nv_alloc_fini(nvhdl);
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ }
+ return (NULL);
+ }
+
+ return (nvl);
+}
+
+/*
+ * Destroy a previously allocated nvlist structure. flag indicates whether
+ * or not the associated nva structure should be freed (FM_NVA_FREE) or
+ * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows
+ * it to be re-used for future nvlist creation operations.
+ */
+void
+fm_nvlist_destroy(nvlist_t *nvl, int flag)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl);
+
+ nvlist_free(nvl);
+
+ if (nva != NULL) {
+ if (flag == FM_NVA_FREE)
+ fm_nva_xdestroy(nva);
+ }
+}
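A short sketch of the intended lifecycle, assuming a caller that owns a fixed buffer (the function and buffer names are hypothetical):

static void
fixed_buffer_nvlist_example(char *buf, size_t bufsz)
{
	nv_alloc_t *nva;
	nvlist_t *nvl;

	if ((nva = fm_nva_xcreate(buf, bufsz)) == NULL)
		return;

	if ((nvl = fm_nvlist_create(nva)) != NULL) {
		/* ... add members to nvl ... */

		/* FM_NVA_RETAIN keeps nva usable for further nvlists. */
		fm_nvlist_destroy(nvl, FM_NVA_RETAIN);
	}

	/* The caller still owns buf; only the nv_alloc_t itself is freed. */
	fm_nva_xdestroy(nva);
}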
+
+int
+i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
+{
+ int nelem, ret = 0;
+ data_type_t type;
+
+ while (ret == 0 && name != NULL) {
+ type = va_arg(ap, data_type_t);
+ switch (type) {
+ case DATA_TYPE_BYTE:
+ ret = nvlist_add_byte(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_byte_array(payload, name,
+ va_arg(ap, uchar_t *), nelem);
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ ret = nvlist_add_boolean_value(payload, name,
+ va_arg(ap, boolean_t));
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_boolean_array(payload, name,
+ va_arg(ap, boolean_t *), nelem);
+ break;
+ case DATA_TYPE_INT8:
+ ret = nvlist_add_int8(payload, name,
+ va_arg(ap, int));
+ break;
+ case DATA_TYPE_INT8_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int8_array(payload, name,
+ va_arg(ap, int8_t *), nelem);
+ break;
+ case DATA_TYPE_UINT8:
+ ret = nvlist_add_uint8(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_UINT8_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint8_array(payload, name,
+ va_arg(ap, uint8_t *), nelem);
+ break;
+ case DATA_TYPE_INT16:
+ ret = nvlist_add_int16(payload, name,
+ va_arg(ap, int));
+ break;
+ case DATA_TYPE_INT16_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int16_array(payload, name,
+ va_arg(ap, int16_t *), nelem);
+ break;
+ case DATA_TYPE_UINT16:
+ ret = nvlist_add_uint16(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_UINT16_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint16_array(payload, name,
+ va_arg(ap, uint16_t *), nelem);
+ break;
+ case DATA_TYPE_INT32:
+ ret = nvlist_add_int32(payload, name,
+ va_arg(ap, int32_t));
+ break;
+ case DATA_TYPE_INT32_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int32_array(payload, name,
+ va_arg(ap, int32_t *), nelem);
+ break;
+ case DATA_TYPE_UINT32:
+ ret = nvlist_add_uint32(payload, name,
+ va_arg(ap, uint32_t));
+ break;
+ case DATA_TYPE_UINT32_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint32_array(payload, name,
+ va_arg(ap, uint32_t *), nelem);
+ break;
+ case DATA_TYPE_INT64:
+ ret = nvlist_add_int64(payload, name,
+ va_arg(ap, int64_t));
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int64_array(payload, name,
+ va_arg(ap, int64_t *), nelem);
+ break;
+ case DATA_TYPE_UINT64:
+ ret = nvlist_add_uint64(payload, name,
+ va_arg(ap, uint64_t));
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint64_array(payload, name,
+ va_arg(ap, uint64_t *), nelem);
+ break;
+ case DATA_TYPE_STRING:
+ ret = nvlist_add_string(payload, name,
+ va_arg(ap, char *));
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_string_array(payload, name,
+ va_arg(ap, char **), nelem);
+ break;
+ case DATA_TYPE_NVLIST:
+ ret = nvlist_add_nvlist(payload, name,
+ va_arg(ap, nvlist_t *));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_nvlist_array(payload, name,
+ va_arg(ap, nvlist_t **), nelem);
+ break;
+ default:
+ ret = EINVAL;
+ }
+
+ name = va_arg(ap, char *);
+ }
+ return (ret);
+}
+
+void
+fm_payload_set(nvlist_t *payload, ...)
+{
+ int ret;
+ const char *name;
+ va_list ap;
+
+ va_start(ap, payload);
+ name = va_arg(ap, char *);
+ ret = i_fm_payload_set(payload, name, ap);
+ va_end(ap);
+
+ if (ret)
+ atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
+}
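A sketch of the varargs convention consumed above: each payload member is a (name, type, [nelem,] value) tuple and the list is terminated by a NULL name. It assumes an ereport nvlist built elsewhere (e.g. via fm_nvlist_create()); the member names and values here are hypothetical:

	uint64_t sizes[2] = { 4096, 8192 };

	fm_payload_set(ereport,
	    "vdev_path", DATA_TYPE_STRING, "/dev/da0",
	    "zio_size", DATA_TYPE_UINT64, (uint64_t)4096,
	    "io_sizes", DATA_TYPE_UINT64_ARRAY, 2, sizes,
	    NULL);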
+
+/*
+ * Set-up and validate the members of an ereport event according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * class string ereport
+ * version uint8_t 0
+ * ena uint64_t <ena>
+ * detector nvlist_t <detector>
+ * ereport-payload nvlist_t <var args>
+ *
+ * We don't actually add a 'version' member to the payload. Really,
+ * the version quoted to us by our caller is that of the category 1
+ * "ereport" event class (and we require FM_EREPORT_VERS0) but
+ * the payload version of the actual leaf class event under construction
+ * may be something else. Callers should supply a version in the varargs,
+ * or (better) we could take two version arguments - one for the
+ * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
+ * for the leaf class.
+ */
+void
+fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
+ uint64_t ena, const nvlist_t *detector, ...)
+{
+ char ereport_class[FM_MAX_CLASS];
+ const char *name;
+ va_list ap;
+ int ret;
+
+ if (version != FM_EREPORT_VERS0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ return;
+ }
+
+ (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
+ FM_EREPORT_CLASS, erpt_class);
+ if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ }
+
+ if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
+ (nvlist_t *)detector) != 0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ }
+
+ va_start(ap, detector);
+ name = va_arg(ap, const char *);
+ ret = i_fm_payload_set(ereport, name, ap);
+ va_end(ap);
+
+ if (ret)
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+}
+
+/*
+ * Set-up and validate the members of an hc fmri according to:
+ *
+ * Member name Type Value
+ * ===================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * hc-name string <name>
+ * hc-id string <id>
+ *
+ * Note that auth and hc-id are optional members.
+ */
+
+#define HC_MAXPAIRS 20
+#define HC_MAXNAMELEN 50
+
+static int
+fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
+{
+ if (version != FM_HC_SCHEME_VERSION) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return (0);
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
+ nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return (0);
+ }
+
+ if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return (0);
+ }
+
+ return (1);
+}
+
+void
+fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
+ nvlist_t *snvl, int npairs, ...)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+ nvlist_t *pairs[HC_MAXPAIRS];
+ va_list ap;
+ int i;
+
+ if (!fm_fmri_hc_set_common(fmri, version, auth))
+ return;
+
+ npairs = MIN(npairs, HC_MAXPAIRS);
+
+ va_start(ap, npairs);
+ for (i = 0; i < npairs; i++) {
+ const char *name = va_arg(ap, const char *);
+ uint32_t id = va_arg(ap, uint32_t);
+ char idstr[11];
+
+ (void) snprintf(idstr, sizeof (idstr), "%u", id);
+
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+ va_end(ap);
+
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+
+ for (i = 0; i < npairs; i++)
+ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+
+ if (snvl != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+}
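A sketch of the variable arguments expected above: npairs (name, id) pairs, each becoming one element of the hc list. The component names are hypothetical:

	nvlist_t *fmri = fm_nvlist_create(NULL);

	/* Produces an hc list equivalent to /motherboard=0/cpu=1. */
	fm_fmri_hc_set(fmri, FM_HC_SCHEME_VERSION, NULL, NULL, 2,
	    "motherboard", (uint32_t)0,
	    "cpu", (uint32_t)1);

	fm_nvlist_destroy(fmri, FM_NVA_FREE);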
+
+void
+fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
+ nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+ nvlist_t *pairs[HC_MAXPAIRS];
+ nvlist_t **hcl;
+ uint_t n;
+ int i, j;
+ va_list ap;
+ char *hcname, *hcid;
+
+ if (!fm_fmri_hc_set_common(fmri, version, auth))
+ return;
+
+ /*
+ * copy the bboard nvpairs to the pairs array
+ */
+ if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
+ != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
+ &hcname) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
+ for (j = 0; j <= i; j++) {
+ if (pairs[j] != NULL)
+ fm_nvlist_destroy(pairs[j],
+ FM_NVA_RETAIN);
+ }
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ }
+
+ /*
+ * create the pairs from the passed-in pairs
+ */
+ npairs = MIN(npairs, HC_MAXPAIRS);
+
+ va_start(ap, npairs);
+ for (i = n; i < npairs + n; i++) {
+ const char *name = va_arg(ap, const char *);
+ uint32_t id = va_arg(ap, uint32_t);
+ char idstr[11];
+ (void) snprintf(idstr, sizeof (idstr), "%u", id);
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+ for (j = 0; j <= i; j++) {
+ if (pairs[j] != NULL)
+ fm_nvlist_destroy(pairs[j],
+ FM_NVA_RETAIN);
+ }
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ }
+ va_end(ap);
+
+ /*
+ * Create the fmri hc list
+ */
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
+ npairs + n) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ for (i = 0; i < npairs + n; i++) {
+ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+ }
+
+ if (snvl != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ }
+}
+
+/*
+ * Set-up and validate the members of a dev fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * devpath string <devpath>
+ * [devid] string <devid>
+ * [target-port-l0id] string <target-port-lun0-id>
+ *
+ * Note that auth and devid are optional members.
+ */
+void
+fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
+ const char *devpath, const char *devid, const char *tpl0)
+{
+ int err = 0;
+
+ if (version != DEV_SCHEME_VERSION0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
+
+ if (auth != NULL) {
+ err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth);
+ }
+
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);
+
+ if (devid != NULL)
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);
+
+ if (tpl0 != NULL)
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
+
+ if (err)
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+
+}
+
+/*
+ * Set-up and validate the members of a cpu fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * cpuid uint32_t <cpu_id>
+ * cpumask uint8_t <cpu_mask>
+ * serial uint64_t <serial_id>
+ *
+ * Note that auth, cpumask, and serial are optional members.
+ *
+ */
+void
+fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth,
+ uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp)
+{
+ uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
+
+ if (version < CPU_SCHEME_VERSION1) {
+ atomic_inc_64(failedp);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
+ atomic_inc_64(failedp);
+ return;
+ }
+
+ if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
+ FM_FMRI_SCHEME_CPU) != 0) {
+ atomic_inc_64(failedp);
+ return;
+ }
+
+ if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0)
+ atomic_inc_64(failedp);
+
+ if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
+ atomic_inc_64(failedp);
+
+ if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
+ *cpu_maskp) != 0)
+ atomic_inc_64(failedp);
+
+ if (serial_idp == NULL || nvlist_add_string(fmri_cpu,
+ FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
+ atomic_inc_64(failedp);
+}
+
+/*
+ * Set-up and validate the members of a mem fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth> [optional]
+ * unum string <unum>
+ * serial string <serial> [optional*]
+ * offset uint64_t <offset> [optional]
+ *
+ * * serial is required if offset is present
+ */
+void
+fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
+ const char *unum, const char *serial, uint64_t offset)
+{
+ if (version != MEM_SCHEME_VERSION0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (!serial && (offset != (uint64_t)-1)) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (auth != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+
+ if (serial != NULL) {
+ if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
+ (char **)&serial, 1) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri,
+ FM_FMRI_MEM_OFFSET, offset) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+}
+
+void
+fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
+ uint64_t vdev_guid)
+{
+ if (version != ZFS_SCHEME_VERSION0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+
+ if (vdev_guid != 0) {
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+}
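A sketch tying the zfs-scheme detector into an ereport. The pool_guid and vdev_guid variables and the "fs.zfs.example" leaf class string are hypothetical; everything else uses the functions defined in this file:

	nvlist_t *ereport = fm_nvlist_create(NULL);
	nvlist_t *detector = fm_nvlist_create(NULL);
	uint64_t ena = fm_ena_generate(0, FM_ENA_FMT1);

	fm_fmri_zfs_set(detector, ZFS_SCHEME_VERSION0, pool_guid, vdev_guid);
	fm_ereport_set(ereport, FM_EREPORT_VERS0, "fs.zfs.example",
	    ena, detector, NULL);

	/* ... hand ereport off to the event queue ... */
	fm_nvlist_destroy(detector, FM_NVA_FREE);
	fm_nvlist_destroy(ereport, FM_NVA_FREE);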
+
+uint64_t
+fm_ena_increment(uint64_t ena)
+{
+ uint64_t new_ena;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ new_ena = ena + (1 << ENA_FMT1_GEN_SHFT);
+ break;
+ case FM_ENA_FMT2:
+ new_ena = ena + (1 << ENA_FMT2_GEN_SHFT);
+ break;
+ default:
+ new_ena = 0;
+ }
+
+ return (new_ena);
+}
+
+uint64_t
+fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
+{
+ uint64_t ena = 0;
+
+ switch (format) {
+ case FM_ENA_FMT1:
+ if (timestamp) {
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((cpuid << ENA_FMT1_CPUID_SHFT) &
+ ENA_FMT1_CPUID_MASK) |
+ ((timestamp << ENA_FMT1_TIME_SHFT) &
+ ENA_FMT1_TIME_MASK));
+ } else {
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((cpuid << ENA_FMT1_CPUID_SHFT) &
+ ENA_FMT1_CPUID_MASK) |
+ ((gethrtime() << ENA_FMT1_TIME_SHFT) &
+ ENA_FMT1_TIME_MASK));
+ }
+ break;
+ case FM_ENA_FMT2:
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK));
+ break;
+ default:
+ break;
+ }
+
+ return (ena);
+}
+
+uint64_t
+fm_ena_generate(uint64_t timestamp, uchar_t format)
+{
+ uint64_t ena;
+
+ kpreempt_disable();
+ ena = fm_ena_generate_cpu(timestamp, getcpuid(), format);
+ kpreempt_enable();
+
+ return (ena);
+}
+
+uint64_t
+fm_ena_generation_get(uint64_t ena)
+{
+ uint64_t gen;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT;
+ break;
+ default:
+ gen = 0;
+ break;
+ }
+
+ return (gen);
+}
+
+uchar_t
+fm_ena_format_get(uint64_t ena)
+{
+
+ return (ENA_FORMAT(ena));
+}
+
+uint64_t
+fm_ena_id_get(uint64_t ena)
+{
+ uint64_t id;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT;
+ break;
+ default:
+ id = 0;
+ }
+
+ return (id);
+}
+
+uint64_t
+fm_ena_time_get(uint64_t ena)
+{
+ uint64_t time;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT;
+ break;
+ default:
+ time = 0;
+ }
+
+ return (time);
+}
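A short sketch of generating an ENA and pulling its fields back apart; for FM_ENA_FMT1 a timestamp of 0 means "use gethrtime()". The variables are illustrative only:

	uint64_t ena, t, gen;
	uchar_t fmt;

	ena = fm_ena_generate(0, FM_ENA_FMT1);	/* 0 => use gethrtime() */
	fmt = fm_ena_format_get(ena);		/* FM_ENA_FMT1 */
	t = fm_ena_time_get(ena);		/* truncated timestamp bits */
	gen = fm_ena_generation_get(ena);	/* 0 for a freshly generated ENA */
	ena = fm_ena_increment(ena);		/* bumps the generation field */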
+
+#ifdef _KERNEL
+/*
+ * Helper function to increment ereport dropped count. Used by the event
+ * rate limiting code to give feedback to the user about how many events were
+ * rate limited by including them in the 'dropped' count.
+ */
+void
+fm_erpt_dropped_increment(void)
+{
+ atomic_inc_64(&ratelimit_dropped);
+}
+
+void
+fm_init(void)
+{
+ zevent_len_cur = 0;
+ zevent_flags = 0;
+
+ if (zfs_zevent_len_max == 0)
+ zfs_zevent_len_max = ERPT_MAX_ERRS * MAX(max_ncpus, 4);
+
+ /* Initialize zevent allocation and generation kstats */
+ fm_ksp = kstat_create("zfs", 0, "fm", "misc", KSTAT_TYPE_NAMED,
+ sizeof (struct erpt_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (fm_ksp != NULL) {
+ fm_ksp->ks_data = &erpt_kstat_data;
+ kstat_install(fm_ksp);
+ } else {
+ cmn_err(CE_NOTE, "failed to create fm/misc kstat\n");
+ }
+
+ mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zevent_list, sizeof (zevent_t),
+ offsetof(zevent_t, ev_node));
+ cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL);
+
+ zfs_ereport_init();
+}
+
+void
+fm_fini(void)
+{
+ int count;
+
+ zfs_ereport_fini();
+
+ zfs_zevent_drain_all(&count);
+
+ mutex_enter(&zevent_lock);
+ cv_broadcast(&zevent_cv);
+
+ zevent_flags |= ZEVENT_SHUTDOWN;
+ while (zevent_waiters > 0) {
+ mutex_exit(&zevent_lock);
+ schedule();
+ mutex_enter(&zevent_lock);
+ }
+ mutex_exit(&zevent_lock);
+
+ cv_destroy(&zevent_cv);
+ list_destroy(&zevent_list);
+ mutex_destroy(&zevent_lock);
+
+ if (fm_ksp != NULL) {
+ kstat_delete(fm_ksp);
+ fm_ksp = NULL;
+ }
+}
+#endif /* _KERNEL */
+
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW,
+ "Max event queue length");
+
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, cols, INT, ZMOD_RW,
+ "Max event column width");
+
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, console, INT, ZMOD_RW,
+ "Log events to the console");
diff --git a/sys/contrib/openzfs/module/zfs/gzip.c b/sys/contrib/openzfs/module/zfs/gzip.c
new file mode 100644
index 000000000000..e2c6e59969d6
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/gzip.c
@@ -0,0 +1,106 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/strings.h>
+#include <sys/qat.h>
+#include <sys/zio_compress.h>
+
+#ifdef _KERNEL
+
+#include <sys/zmod.h>
+typedef size_t zlen_t;
+#define compress_func z_compress_level
+#define uncompress_func z_uncompress
+
+#else /* _KERNEL */
+
+#include <zlib.h>
+typedef uLongf zlen_t;
+#define compress_func compress2
+#define uncompress_func uncompress
+
+#endif
+
+size_t
+gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ int ret;
+ zlen_t dstlen = d_len;
+
+ ASSERT(d_len <= s_len);
+
+ /* check if hardware accelerator can be used */
+ if (qat_dc_use_accel(s_len)) {
+ ret = qat_compress(QAT_COMPRESS, s_start, s_len, d_start,
+ d_len, &dstlen);
+ if (ret == CPA_STATUS_SUCCESS) {
+ return ((size_t)dstlen);
+ } else if (ret == CPA_STATUS_INCOMPRESSIBLE) {
+ if (d_len != s_len)
+ return (s_len);
+
+ bcopy(s_start, d_start, s_len);
+ return (s_len);
+ }
+ /* if hardware compression fails, do it again with software */
+ }
+
+ if (compress_func(d_start, &dstlen, s_start, s_len, n) != Z_OK) {
+ if (d_len != s_len)
+ return (s_len);
+
+ bcopy(s_start, d_start, s_len);
+ return (s_len);
+ }
+
+ return ((size_t)dstlen);
+}
+
+/*ARGSUSED*/
+int
+gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ zlen_t dstlen = d_len;
+
+ ASSERT(d_len >= s_len);
+
+ /* check if hardware accelerator can be used */
+ if (qat_dc_use_accel(d_len)) {
+ if (qat_compress(QAT_DECOMPRESS, s_start, s_len,
+ d_start, d_len, &dstlen) == CPA_STATUS_SUCCESS)
+ return (0);
+ /* if hardware decompression fails, do it again with software */
+ }
+
+ if (uncompress_func(d_start, &dstlen, s_start, s_len) != Z_OK)
+ return (-1);
+
+ return (0);
+}
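A minimal round-trip sketch of the two entry points above, assuming pre-allocated source, compressed, and destination buffers (the function and buffer names are hypothetical):

static boolean_t
gzip_roundtrip_example(void *src, void *cbuf, void *dbuf, size_t len, int level)
{
	size_t c_len;

	/* Compress into a buffer one byte smaller than the source. */
	c_len = gzip_compress(src, cbuf, len, len - 1, level);
	if (c_len == len)
		return (B_FALSE);	/* incompressible; caller stores it raw */

	return (gzip_decompress(cbuf, dbuf, c_len, len, level) == 0);
}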
diff --git a/sys/contrib/openzfs/module/zfs/hkdf.c b/sys/contrib/openzfs/module/zfs/hkdf.c
new file mode 100644
index 000000000000..14265472df7d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/hkdf.c
@@ -0,0 +1,171 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/crypto/api.h>
+#include <sys/sha2.h>
+#include <sys/hkdf.h>
+
+static int
+hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material,
+ uint_t km_len, uint8_t *out_buf)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_key_t key;
+ crypto_data_t input_cd, output_cd;
+
+ /* initialize HMAC mechanism */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ /* initialize the salt as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = CRYPTO_BYTES2BITS(salt_len);
+ key.ck_data = salt;
+
+ /* initialize crypto data for the input and output data */
+ input_cd.cd_format = CRYPTO_DATA_RAW;
+ input_cd.cd_offset = 0;
+ input_cd.cd_length = km_len;
+ input_cd.cd_raw.iov_base = (char *)key_material;
+ input_cd.cd_raw.iov_len = input_cd.cd_length;
+
+ output_cd.cd_format = CRYPTO_DATA_RAW;
+ output_cd.cd_offset = 0;
+ output_cd.cd_length = SHA512_DIGEST_LENGTH;
+ output_cd.cd_raw.iov_base = (char *)out_buf;
+ output_cd.cd_raw.iov_len = output_cd.cd_length;
+
+ ret = crypto_mac(&mech, &input_cd, &key, NULL, &output_cd, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ return (0);
+}
+
+static int
+hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len,
+ uint8_t *out_buf, uint_t out_len)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_context_t ctx;
+ crypto_key_t key;
+ crypto_data_t T_cd, info_cd, c_cd;
+ uint_t i, T_len = 0, pos = 0;
+ uint8_t c;
+ uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH;
+ uint8_t T[SHA512_DIGEST_LENGTH];
+
+ if (N > 255)
+ return (SET_ERROR(EINVAL));
+
+ /* initialize HMAC mechanism */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ /* initialize the salt as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH);
+ key.ck_data = extract_key;
+
+ /* initialize crypto data for the input and output data */
+ T_cd.cd_format = CRYPTO_DATA_RAW;
+ T_cd.cd_offset = 0;
+ T_cd.cd_raw.iov_base = (char *)T;
+
+ c_cd.cd_format = CRYPTO_DATA_RAW;
+ c_cd.cd_offset = 0;
+ c_cd.cd_length = 1;
+ c_cd.cd_raw.iov_base = (char *)&c;
+ c_cd.cd_raw.iov_len = c_cd.cd_length;
+
+ info_cd.cd_format = CRYPTO_DATA_RAW;
+ info_cd.cd_offset = 0;
+ info_cd.cd_length = info_len;
+ info_cd.cd_raw.iov_base = (char *)info;
+ info_cd.cd_raw.iov_len = info_cd.cd_length;
+
+ for (i = 1; i <= N; i++) {
+ c = i;
+
+ T_cd.cd_length = T_len;
+ T_cd.cd_raw.iov_len = T_cd.cd_length;
+
+ ret = crypto_mac_init(&mech, &key, NULL, &ctx, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ ret = crypto_mac_update(ctx, &T_cd, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ ret = crypto_mac_update(ctx, &info_cd, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ ret = crypto_mac_update(ctx, &c_cd, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ T_len = SHA512_DIGEST_LENGTH;
+ T_cd.cd_length = T_len;
+ T_cd.cd_raw.iov_len = T_cd.cd_length;
+
+ ret = crypto_mac_final(ctx, &T_cd, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ bcopy(T, out_buf + pos,
+ (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos));
+ pos += SHA512_DIGEST_LENGTH;
+ }
+
+ return (0);
+}
+
+/*
+ * HKDF is designed to be a relatively fast function for deriving keys from a
+ * master key + a salt. We use this function to generate new encryption keys
+ * so as to avoid hitting the cryptographic limits of the underlying
+ * encryption modes. Note that, for the sake of deriving encryption keys, the
+ * info parameter is called the "salt" everywhere else in the code.
+ */
+int
+hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt,
+ uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key,
+ uint_t out_len)
+{
+ int ret;
+ uint8_t extract_key[SHA512_DIGEST_LENGTH];
+
+ ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len,
+ extract_key);
+ if (ret != 0)
+ return (ret);
+
+ ret = hkdf_sha512_expand(extract_key, info, info_len, output_key,
+ out_len);
+ if (ret != 0)
+ return (ret);
+
+ return (0);
+}
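A minimal usage sketch deriving a 256-bit key, assuming the caller already holds key material and a salt; wkeydata, wkeylen, salt, salt_len, and the "ZFS_EXAMPLE" info string are hypothetical:

	uint8_t keydata[32];
	int err;

	/* "ZFS_EXAMPLE" stands in for the caller's info string. */
	err = hkdf_sha512(wkeydata, wkeylen, salt, salt_len,
	    (uint8_t *)"ZFS_EXAMPLE", 11, keydata, sizeof (keydata));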
diff --git a/sys/contrib/openzfs/module/zfs/lz4.c b/sys/contrib/openzfs/module/zfs/lz4.c
new file mode 100644
index 000000000000..9da9d9e00635
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/lz4.c
@@ -0,0 +1,1084 @@
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Header File
+ * Copyright (C) 2011-2013, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_compress.h>
+
+static int real_LZ4_compress(const char *source, char *dest, int isize,
+ int osize);
+static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
+ int isize, int maxOutputSize);
+static int LZ4_compressCtx(void *ctx, const char *source, char *dest,
+ int isize, int osize);
+static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest,
+ int isize, int osize);
+
+static void *lz4_alloc(int flags);
+static void lz4_free(void *ctx);
+
+/*ARGSUSED*/
+size_t
+lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
+ size_t d_len, int n)
+{
+ uint32_t bufsiz;
+ char *dest = d_start;
+
+ ASSERT(d_len >= sizeof (bufsiz));
+
+ bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len,
+ d_len - sizeof (bufsiz));
+
+ /* Signal an error if the compression routine returned zero. */
+ if (bufsiz == 0)
+ return (s_len);
+
+ /*
+ * The exact compressed size is needed by the decompression routine,
+ * so it is stored at the start of the buffer. Note that this may be
+ * less than the compressed block size, which is rounded up to a
+ * multiple of 1<<ashift.
+ */
+ *(uint32_t *)dest = BE_32(bufsiz);
+
+ return (bufsiz + sizeof (bufsiz));
+}
+
+/*ARGSUSED*/
+int
+lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
+ size_t d_len, int n)
+{
+ const char *src = s_start;
+ uint32_t bufsiz = BE_IN32(src);
+
+ /* invalid compressed buffer size encoded at start */
+ if (bufsiz + sizeof (bufsiz) > s_len)
+ return (1);
+
+ /*
+ * Returns 0 on success (decompression function returned non-negative)
+ * and non-zero on failure (decompression function returned negative).
+ */
+ return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
+ d_start, bufsiz, d_len) < 0);
+}
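A minimal round-trip sketch of the two wrappers above, assuming lz4_init() has already run and the buffers are pre-allocated (the function and buffer names are hypothetical):

static boolean_t
lz4_roundtrip_example(void *src, void *cbuf, void *dbuf, size_t len)
{
	size_t c_len;

	/* A destination smaller than the source makes "no gain" return s_len. */
	c_len = lz4_compress_zfs(src, cbuf, len, len - 1, 0);
	if (c_len >= len)
		return (B_FALSE);	/* incompressible; stored uncompressed */

	return (lz4_decompress_zfs(cbuf, dbuf, c_len, len, 0) == 0);
}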
+
+/*
+ * LZ4 API Description:
+ *
+ * Simple Functions:
+ * real_LZ4_compress() :
+ * isize : is the input size. Max supported value is ~1.9GB
+ * return : the number of bytes written in buffer dest
+ * or 0 if the compression fails (if LZ4_COMPRESSMIN is set).
+ * note : destination buffer must be already allocated.
+ * destination buffer must be sized to handle worst-case
+ * situations (input data not compressible); worst-case size
+ * evaluation is provided by function LZ4_compressBound().
+ *
+ * real_LZ4_uncompress() :
+ * osize : is the output size, therefore the original size
+ * return : the number of bytes read in the source buffer.
+ * If the source stream is malformed, the function will stop
+ * decoding and return a negative result, indicating the byte
+ * position of the faulty instruction. This function never
+ * writes beyond dest + osize, and is therefore protected
+ * against malicious data packets.
+ * note : destination buffer must be already allocated
+ * note : real_LZ4_uncompress() is not used in ZFS so its code
+ * is not present here.
+ *
+ * Advanced Functions
+ *
+ * LZ4_compressBound() :
+ * Provides the maximum size that LZ4 may output in a "worst case"
+ * scenario (input data not compressible); primarily useful for memory
+ * allocation of the output buffer.
+ *
+ * isize : is the input size. Max supported value is ~1.9GB
+ * return : maximum output size in a "worst case" scenario
+ * note : this function is limited by "int" range (2^31-1)
+ *
+ * LZ4_uncompress_unknownOutputSize() :
+ * isize : is the input size, therefore the compressed size
+ * maxOutputSize : is the size of the destination buffer (which must be
+ * already allocated)
+ * return : the number of bytes decoded in the destination buffer
+ * (necessarily <= maxOutputSize). If the source stream is
+ * malformed, the function will stop decoding and return a
+ * negative result, indicating the byte position of the faulty
+ * instruction. This function never writes beyond dest +
+ * maxOutputSize, and is therefore protected against malicious
+ * data packets.
+ * note : Destination buffer must be already allocated.
+ * This version is slightly slower than real_LZ4_uncompress()
+ *
+ * LZ4_compressCtx() :
+ * This function explicitly handles the CTX memory structure.
+ *
+ * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ * by the caller (either on the stack or using kmem_cache_alloc). Passing
+ * NULL isn't valid.
+ *
+ * LZ4_compress64kCtx() :
+ * Same as LZ4_compressCtx(), but specific to small inputs (<64KB).
+ * isize *Must* be <64KB, otherwise the output will be corrupted.
+ *
+ * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ * by the caller (either on the stack or using kmem_cache_alloc). Passing
+ * NULL isn't valid.
+ */
+
+/*
+ * Tuning parameters
+ */
+
+/*
+ * COMPRESSIONLEVEL: Increasing this value improves the compression ratio.
+ * Lowering this value reduces memory usage. Reduced memory usage
+ * typically improves speed, due to cache effects (ex: L1 32KB for Intel,
+ * L1 64KB for AMD). Memory usage formula : N->2^(N+2) Bytes
+ * (examples : 12 -> 16KB ; 17 -> 512KB)
+ */
+#define COMPRESSIONLEVEL 12
+
+/*
+ * NOTCOMPRESSIBLE_CONFIRMATION: Decreasing this value will make the
+ * algorithm skip faster over data segments considered "incompressible".
+ * This may decrease compression ratio dramatically, but will be
+ * faster on incompressible data. Increasing this value will make
+ * the algorithm search more before declaring a segment "incompressible".
+ * This could improve compression a bit, but will be slower on
+ * incompressible data. The default value (6) is recommended.
+ */
+#define NOTCOMPRESSIBLE_CONFIRMATION 6
+
+/*
+ * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to
+ * performance for big-endian CPUs, but the resulting compressed stream
+ * will be incompatible with little-endian CPUs. You can set this option
+ * to 1 in situations where data will stay within a closed environment.
+ * This option is useless on little-endian CPUs (such as x86).
+ */
+/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */
+
+/*
+ * CPU Feature Detection
+ */
+
+/* 32 or 64 bits ? */
+#if defined(_LP64)
+#define LZ4_ARCH64 1
+#else
+#define LZ4_ARCH64 0
+#endif
+
+/*
+ * Little Endian or Big Endian?
+ * Note: overwrite the below #define if you know your architecture endianness.
+ */
+#if defined(_ZFS_BIG_ENDIAN)
+#define LZ4_BIG_ENDIAN 1
+#else
+/*
+ * Little Endian assumed. PDP Endian and other very rare endian formats
+ * are unsupported.
+ */
+#undef LZ4_BIG_ENDIAN
+#endif
+
+/*
+ * Unaligned memory access is automatically enabled for "common" CPUs,
+ * such as x86. For other CPUs, the compiler will be more cautious and
+ * insert extra code to ensure aligned access is respected. If you know
+ * your target CPU supports unaligned memory access, you may want to
+ * force this option manually to improve performance.
+ */
+#if defined(__ARM_FEATURE_UNALIGNED)
+#define LZ4_FORCE_UNALIGNED_ACCESS 1
+#endif
+
+/*
+ * Illumos : we can't use GCC's __builtin_ctz family of builtins in the
+ * kernel
+ * Linux : we can use GCC's __builtin_ctz family of builtins in the
+ * kernel
+ */
+#undef LZ4_FORCE_SW_BITCOUNT
+#if defined(__sparc)
+#define LZ4_FORCE_SW_BITCOUNT
+#endif
+
+/*
+ * Compiler Options
+ */
+/* Disable restrict */
+#define restrict
+
+/*
+ * Linux : GCC_VERSION is defined as of 3.9-rc1, so undefine it.
+ * torvalds/linux@3f3f8d2f48acfd8ed3b8e6b7377935da57b27b16
+ */
+#ifdef GCC_VERSION
+#undef GCC_VERSION
+#endif
+
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
+#define expect(expr, value) (__builtin_expect((expr), (value)))
+#else
+#define expect(expr, value) (expr)
+#endif
+
+#ifndef likely
+#define likely(expr) expect((expr) != 0, 1)
+#endif
+
+#ifndef unlikely
+#define unlikely(expr) expect((expr) != 0, 0)
+#endif
+
+#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \
+ (((x) & 0xffu) << 8)))
+
+/* Basic types */
+#define BYTE uint8_t
+#define U16 uint16_t
+#define U32 uint32_t
+#define S32 int32_t
+#define U64 uint64_t
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack(1)
+#endif
+
+typedef struct _U16_S {
+ U16 v;
+} U16_S;
+typedef struct _U32_S {
+ U32 v;
+} U32_S;
+typedef struct _U64_S {
+ U64 v;
+} U64_S;
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack()
+#endif
+
+#define A64(x) (((U64_S *)(x))->v)
+#define A32(x) (((U32_S *)(x))->v)
+#define A16(x) (((U16_S *)(x))->v)
+
+/*
+ * Constants
+ */
+#define MINMATCH 4
+
+#define HASH_LOG COMPRESSIONLEVEL
+#define HASHTABLESIZE (1 << HASH_LOG)
+#define HASH_MASK (HASHTABLESIZE - 1)
+
+#define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? \
+ NOTCOMPRESSIBLE_CONFIRMATION : 2)
+
+#define COPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH + MINMATCH)
+#define MINLENGTH (MFLIMIT + 1)
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define ML_BITS 4
+#define ML_MASK ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
+
+
+/*
+ * Architecture-specific macros
+ */
+#if LZ4_ARCH64
+#define STEPSIZE 8
+#define UARCH U64
+#define AARCH A64
+#define LZ4_COPYSTEP(s, d) A64(d) = A64(s); d += 8; s += 8;
+#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d)
+#define LZ4_SECURECOPY(s, d, e) if (d < e) LZ4_WILDCOPY(s, d, e)
+#define HTYPE U32
+#define INITBASE(base) const BYTE* const base = ip
+#else /* !LZ4_ARCH64 */
+#define STEPSIZE 4
+#define UARCH U32
+#define AARCH A32
+#define LZ4_COPYSTEP(s, d) A32(d) = A32(s); d += 4; s += 4;
+#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d);
+#define LZ4_SECURECOPY LZ4_WILDCOPY
+#define HTYPE const BYTE *
+#define INITBASE(base) const int base = 0
+#endif /* !LZ4_ARCH64 */
+
+#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
+#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \
+ { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
+#define LZ4_WRITE_LITTLEENDIAN_16(p, i) \
+ { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; }
+#else
+#define LZ4_READ_LITTLEENDIAN_16(d, s, p) { d = (s) - A16(p); }
+#define LZ4_WRITE_LITTLEENDIAN_16(p, v) { A16(p) = v; p += 2; }
+#endif
+
+
+/* Local structures */
+struct refTables {
+ HTYPE hashTable[HASHTABLESIZE];
+};
+
+
+/* Macros */
+#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \
+ HASH_LOG))
+#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p))
+#define LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e);
+#define LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \
+ d = e; }
+
+
+/* Private functions */
+#if LZ4_ARCH64
+
+static inline int
+LZ4_NbCommonBytes(register U64 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clzll(val) >> 3);
+#else
+ int r;
+ if (!(val >> 32)) {
+ r = 4;
+ } else {
+ r = 0;
+ val >>= 32;
+ }
+ if (!(val >> 16)) {
+ r += 2;
+ val >>= 8;
+ } else {
+ val >>= 24;
+ }
+ r += (!val);
+ return (r);
+#endif
+#else
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctzll(val) >> 3);
+#else
+ static const int DeBruijnBytePos[64] =
+ { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5,
+ 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5,
+ 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4,
+ 4, 5, 7, 2, 6, 5, 7, 6, 7, 7
+ };
+ return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >>
+ 58];
+#endif
+#endif
+}
+
+#else
+
+static inline int
+LZ4_NbCommonBytes(register U32 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clz(val) >> 3);
+#else
+ int r;
+ if (!(val >> 16)) {
+ r = 2;
+ val >>= 8;
+ } else {
+ r = 0;
+ val >>= 24;
+ }
+ r += (!val);
+ return (r);
+#endif
+#else
+#if defined(__GNUC__) && (GCC_VERSION >= 304) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctz(val) >> 3);
+#else
+ static const int DeBruijnBytePos[32] = {
+ 0, 0, 3, 0, 3, 1, 3, 0,
+ 3, 2, 2, 1, 3, 2, 0, 1,
+ 3, 3, 1, 2, 2, 2, 2, 0,
+ 3, 1, 2, 0, 1, 0, 1, 1
+ };
+ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >>
+ 27];
+#endif
+#endif
+}
+
+#endif
+
+/* Compression functions */
+
+/*ARGSUSED*/
+static int
+LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize,
+ int osize)
+{
+ struct refTables *srt = (struct refTables *)ctx;
+ HTYPE *HashTable = (HTYPE *) (srt->hashTable);
+
+ const BYTE *ip = (BYTE *) source;
+ INITBASE(base);
+ const BYTE *anchor = ip;
+ const BYTE *const iend = ip + isize;
+ const BYTE *const oend = (BYTE *) dest + osize;
+ const BYTE *const mflimit = iend - MFLIMIT;
+#define matchlimit (iend - LASTLITERALS)
+
+ BYTE *op = (BYTE *) dest;
+
+ int len, length;
+ const int skipStrength = SKIPSTRENGTH;
+ U32 forwardH;
+
+
+ /* Init */
+ if (isize < MINLENGTH)
+ goto _last_literals;
+
+ /* First Byte */
+ HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
+ ip++;
+ forwardH = LZ4_HASH_VALUE(ip);
+
+ /* Main Loop */
+ for (;;) {
+ int findMatchAttempts = (1U << skipStrength) + 3;
+ const BYTE *forwardIp = ip;
+ const BYTE *ref;
+ BYTE *token;
+
+ /* Find a match */
+ do {
+ U32 h = forwardH;
+ int step = findMatchAttempts++ >> skipStrength;
+ ip = forwardIp;
+ forwardIp = ip + step;
+
+ if (unlikely(forwardIp > mflimit)) {
+ goto _last_literals;
+ }
+
+ forwardH = LZ4_HASH_VALUE(forwardIp);
+ ref = base + HashTable[h];
+ HashTable[h] = ip - base;
+
+ } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip)));
+
+ /* Catch up */
+ while ((ip > anchor) && (ref > (BYTE *) source) &&
+ unlikely(ip[-1] == ref[-1])) {
+ ip--;
+ ref--;
+ }
+
+ /* Encode Literal length */
+ length = ip - anchor;
+ token = op++;
+
+ /* Check output limit */
+ if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
+ (length >> 8) > oend))
+ return (0);
+
+ if (length >= (int)RUN_MASK) {
+ *token = (RUN_MASK << ML_BITS);
+ len = length - RUN_MASK;
+ for (; len > 254; len -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)len;
+ } else
+ *token = (length << ML_BITS);
+
+ /* Copy Literals */
+ LZ4_BLINDCOPY(anchor, op, length);
+
+ _next_match:
+ /* Encode Offset */
+ LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
+
+ /* Start Counting */
+ ip += MINMATCH;
+ ref += MINMATCH; /* MinMatch verified */
+ anchor = ip;
+ while (likely(ip < matchlimit - (STEPSIZE - 1))) {
+ UARCH diff = AARCH(ref) ^ AARCH(ip);
+ if (!diff) {
+ ip += STEPSIZE;
+ ref += STEPSIZE;
+ continue;
+ }
+ ip += LZ4_NbCommonBytes(diff);
+ goto _endCount;
+ }
+#if LZ4_ARCH64
+ if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
+ ip += 4;
+ ref += 4;
+ }
+#endif
+ if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
+ ip += 2;
+ ref += 2;
+ }
+ if ((ip < matchlimit) && (*ref == *ip))
+ ip++;
+ _endCount:
+
+ /* Encode MatchLength */
+ len = (ip - anchor);
+ /* Check output limit */
+ if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
+ return (0);
+ if (len >= (int)ML_MASK) {
+ *token += ML_MASK;
+ len -= ML_MASK;
+ for (; len > 509; len -= 510) {
+ *op++ = 255;
+ *op++ = 255;
+ }
+ if (len > 254) {
+ len -= 255;
+ *op++ = 255;
+ }
+ *op++ = (BYTE)len;
+ } else
+ *token += len;
+
+ /* Test end of chunk */
+ if (ip > mflimit) {
+ anchor = ip;
+ break;
+ }
+ /* Fill table */
+ HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base;
+
+ /* Test next position */
+ ref = base + HashTable[LZ4_HASH_VALUE(ip)];
+ HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
+ if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) {
+ token = op++;
+ *token = 0;
+ goto _next_match;
+ }
+ /* Prepare next loop */
+ anchor = ip++;
+ forwardH = LZ4_HASH_VALUE(ip);
+ }
+
+ _last_literals:
+ /* Encode Last Literals */
+ {
+ int lastRun = iend - anchor;
+ if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
+ oend)
+ return (0);
+ if (lastRun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastRun -= RUN_MASK;
+ for (; lastRun > 254; lastRun -= 255) {
+ *op++ = 255;
+ }
+ *op++ = (BYTE)lastRun;
+ } else
+ *op++ = (lastRun << ML_BITS);
+ (void) memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+ }
+
+ /* End */
+ return (int)(((char *)op) - dest);
+}
+
+
+
+/* Note : this function is valid only if isize < LZ4_64KLIMIT */
+#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1))
+#define HASHLOG64K (HASH_LOG + 1)
+#define HASH64KTABLESIZE (1U << HASHLOG64K)
+#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \
+ HASHLOG64K))
+#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p))
+
+/*ARGSUSED*/
+static int
+LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize,
+ int osize)
+{
+ struct refTables *srt = (struct refTables *)ctx;
+ U16 *HashTable = (U16 *) (srt->hashTable);
+
+ const BYTE *ip = (BYTE *) source;
+ const BYTE *anchor = ip;
+ const BYTE *const base = ip;
+ const BYTE *const iend = ip + isize;
+ const BYTE *const oend = (BYTE *) dest + osize;
+ const BYTE *const mflimit = iend - MFLIMIT;
+#define matchlimit (iend - LASTLITERALS)
+
+ BYTE *op = (BYTE *) dest;
+
+ int len, length;
+ const int skipStrength = SKIPSTRENGTH;
+ U32 forwardH;
+
+ /* Init */
+ if (isize < MINLENGTH)
+ goto _last_literals;
+
+ /* First Byte */
+ ip++;
+ forwardH = LZ4_HASH64K_VALUE(ip);
+
+ /* Main Loop */
+ for (;;) {
+ int findMatchAttempts = (1U << skipStrength) + 3;
+ const BYTE *forwardIp = ip;
+ const BYTE *ref;
+ BYTE *token;
+
+ /* Find a match */
+ do {
+ U32 h = forwardH;
+ int step = findMatchAttempts++ >> skipStrength;
+ ip = forwardIp;
+ forwardIp = ip + step;
+
+ if (forwardIp > mflimit) {
+ goto _last_literals;
+ }
+
+ forwardH = LZ4_HASH64K_VALUE(forwardIp);
+ ref = base + HashTable[h];
+ HashTable[h] = ip - base;
+
+ } while (A32(ref) != A32(ip));
+
+ /* Catch up */
+ while ((ip > anchor) && (ref > (BYTE *) source) &&
+ (ip[-1] == ref[-1])) {
+ ip--;
+ ref--;
+ }
+
+ /* Encode Literal length */
+ length = ip - anchor;
+ token = op++;
+
+ /* Check output limit */
+ if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
+ (length >> 8) > oend))
+ return (0);
+
+ if (length >= (int)RUN_MASK) {
+ *token = (RUN_MASK << ML_BITS);
+ len = length - RUN_MASK;
+ for (; len > 254; len -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)len;
+ } else
+ *token = (length << ML_BITS);
+
+ /* Copy Literals */
+ LZ4_BLINDCOPY(anchor, op, length);
+
+ _next_match:
+ /* Encode Offset */
+ LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
+
+ /* Start Counting */
+ ip += MINMATCH;
+ ref += MINMATCH; /* MinMatch verified */
+ anchor = ip;
+ while (ip < matchlimit - (STEPSIZE - 1)) {
+ UARCH diff = AARCH(ref) ^ AARCH(ip);
+ if (!diff) {
+ ip += STEPSIZE;
+ ref += STEPSIZE;
+ continue;
+ }
+ ip += LZ4_NbCommonBytes(diff);
+ goto _endCount;
+ }
+#if LZ4_ARCH64
+ if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
+ ip += 4;
+ ref += 4;
+ }
+#endif
+ if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
+ ip += 2;
+ ref += 2;
+ }
+ if ((ip < matchlimit) && (*ref == *ip))
+ ip++;
+ _endCount:
+
+ /* Encode MatchLength */
+ len = (ip - anchor);
+ /* Check output limit */
+ if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
+ return (0);
+ if (len >= (int)ML_MASK) {
+ *token += ML_MASK;
+ len -= ML_MASK;
+ for (; len > 509; len -= 510) {
+ *op++ = 255;
+ *op++ = 255;
+ }
+ if (len > 254) {
+ len -= 255;
+ *op++ = 255;
+ }
+ *op++ = (BYTE)len;
+ } else
+ *token += len;
+
+ /* Test end of chunk */
+ if (ip > mflimit) {
+ anchor = ip;
+ break;
+ }
+ /* Fill table */
+ HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base;
+
+ /* Test next position */
+ ref = base + HashTable[LZ4_HASH64K_VALUE(ip)];
+ HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base;
+ if (A32(ref) == A32(ip)) {
+ token = op++;
+ *token = 0;
+ goto _next_match;
+ }
+ /* Prepare next loop */
+ anchor = ip++;
+ forwardH = LZ4_HASH64K_VALUE(ip);
+ }
+
+ _last_literals:
+ /* Encode Last Literals */
+ {
+ int lastRun = iend - anchor;
+ if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
+ oend)
+ return (0);
+ if (lastRun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastRun -= RUN_MASK;
+ for (; lastRun > 254; lastRun -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)lastRun;
+ } else
+ *op++ = (lastRun << ML_BITS);
+ (void) memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+ }
+
+ /* End */
+ return (int)(((char *)op) - dest);
+}
+
+static int
+real_LZ4_compress(const char *source, char *dest, int isize, int osize)
+{
+ void *ctx;
+ int result;
+
+ ctx = lz4_alloc(KM_SLEEP);
+
+ /*
+ * If we are out of kernel memory, gently fall through; this will
+ * disable compression in zio_compress_data()
+ */
+ if (ctx == NULL)
+ return (0);
+
+ memset(ctx, 0, sizeof (struct refTables));
+
+ if (isize < LZ4_64KLIMIT)
+ result = LZ4_compress64kCtx(ctx, source, dest, isize, osize);
+ else
+ result = LZ4_compressCtx(ctx, source, dest, isize, osize);
+
+ lz4_free(ctx);
+ return (result);
+}
+
+/* Decompression functions */
+
+/*
+ * Note: The decoding functions real_LZ4_uncompress() and
+ * LZ4_uncompress_unknownOutputSize() are safe against "buffer overflow"
+ * attacks. They will never write or read outside of the provided
+ * output buffers. LZ4_uncompress_unknownOutputSize() also ensures that
+ * it will never read outside of the input buffer. A corrupted input
+ * will produce an error result, a negative int, indicating the position
+ * of the error within input stream.
+ *
+ * Note[2]: real_LZ4_uncompress(), referred to above, is not used in ZFS so
+ * its code is not present here.
+ */
+
+static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
+#endif
+
+static int
+LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize,
+ int maxOutputSize)
+{
+ /* Local Variables */
+ const BYTE *restrict ip = (const BYTE *) source;
+ const BYTE *const iend = ip + isize;
+ const BYTE *ref;
+
+ BYTE *op = (BYTE *) dest;
+ BYTE *const oend = op + maxOutputSize;
+ BYTE *cpy;
+
+ /* Main Loop */
+ while (ip < iend) {
+ unsigned token;
+ size_t length;
+
+ /* get runlength */
+ token = *ip++;
+ if ((length = (token >> ML_BITS)) == RUN_MASK) {
+ int s = 255;
+ while ((ip < iend) && (s == 255)) {
+ s = *ip++;
+ if (unlikely(length > (size_t)(length + s)))
+ goto _output_error;
+ length += s;
+ }
+ }
+ /* copy literals */
+ cpy = op + length;
+ /* CORNER-CASE: cpy might overflow. */
+ if (cpy < op)
+ goto _output_error; /* cpy was overflowed, bail! */
+ if ((cpy > oend - COPYLENGTH) ||
+ (ip + length > iend - COPYLENGTH)) {
+ if (cpy > oend)
+ /* Error: writes beyond output buffer */
+ goto _output_error;
+ if (ip + length != iend)
+ /*
+ * Error: LZ4 format requires to consume all
+ * input at this stage
+ */
+ goto _output_error;
+ (void) memcpy(op, ip, length);
+ op += length;
+ /* Necessarily EOF, due to parsing restrictions */
+ break;
+ }
+ LZ4_WILDCOPY(ip, op, cpy);
+ ip -= (op - cpy);
+ op = cpy;
+
+ /* get offset */
+ LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
+ ip += 2;
+ if (ref < (BYTE * const) dest)
+ /*
+ * Error: offset creates reference outside of
+ * destination buffer
+ */
+ goto _output_error;
+
+ /* get matchlength */
+ if ((length = (token & ML_MASK)) == ML_MASK) {
+ while (ip < iend) {
+ int s = *ip++;
+ if (unlikely(length > (size_t)(length + s)))
+ goto _output_error;
+ length += s;
+ if (s == 255)
+ continue;
+ break;
+ }
+ }
+ /* copy repeated sequence */
+ if (unlikely(op - ref < STEPSIZE)) {
+#if LZ4_ARCH64
+ int dec64 = dec64table[op - ref];
+#else
+ const int dec64 = 0;
+#endif
+ op[0] = ref[0];
+ op[1] = ref[1];
+ op[2] = ref[2];
+ op[3] = ref[3];
+ op += 4;
+ ref += 4;
+ ref -= dec32table[op - ref];
+ A32(op) = A32(ref);
+ op += STEPSIZE - 4;
+ ref -= dec64;
+ } else {
+ LZ4_COPYSTEP(ref, op);
+ }
+ cpy = op + length - (STEPSIZE - 4);
+ if (cpy > oend - COPYLENGTH) {
+ if (cpy > oend)
+ /*
+ * Error: request to write outside of
+ * destination buffer
+ */
+ goto _output_error;
+#if LZ4_ARCH64
+ if ((ref + COPYLENGTH) > oend)
+#else
+ if ((ref + COPYLENGTH) > oend ||
+ (op + COPYLENGTH) > oend)
+#endif
+ goto _output_error;
+ LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
+ while (op < cpy)
+ *op++ = *ref++;
+ op = cpy;
+ if (op == oend)
+ /*
+ * Check EOF (should never happen, since
+ * last 5 bytes are supposed to be literals)
+ */
+ goto _output_error;
+ continue;
+ }
+ LZ4_SECURECOPY(ref, op, cpy);
+ op = cpy; /* correction */
+ }
+
+ /* end of decoding */
+ return (int)(((char *)op) - dest);
+
+ /* write overflow error detected */
+ _output_error:
+ return (-1);
+}
+
+#ifdef __FreeBSD__
+/*
+ * FreeBSD has 4, 8 and 16 KB malloc zones which can be used here.
+ * Should struct refTables get resized, this may need to be revisited;
+ * hence the compile-time asserts.
+ */
+_Static_assert(sizeof(struct refTables) <= 16384,
+ "refTables too big for malloc");
+_Static_assert((sizeof(struct refTables) % 4096) == 0,
+ "refTables not a multiple of page size");
+#else
+#define ZFS_LZ4_USE_CACHE
+#endif
+
+#ifdef ZFS_LZ4_USE_CACHE
+static kmem_cache_t *lz4_cache;
+
+void
+lz4_init(void)
+{
+ lz4_cache = kmem_cache_create("lz4_cache",
+ sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+lz4_fini(void)
+{
+ if (lz4_cache) {
+ kmem_cache_destroy(lz4_cache);
+ lz4_cache = NULL;
+ }
+}
+
+static void *
+lz4_alloc(int flags)
+{
+ ASSERT(lz4_cache != NULL);
+ return (kmem_cache_alloc(lz4_cache, flags));
+}
+
+static void
+lz4_free(void *ctx)
+{
+ kmem_cache_free(lz4_cache, ctx);
+}
+#else
+void
+lz4_init(void)
+{
+}
+
+void
+lz4_fini(void)
+{
+}
+
+static void *
+lz4_alloc(int flags)
+{
+ return (kmem_alloc(sizeof (struct refTables), flags));
+}
+
+static void
+lz4_free(void *ctx)
+{
+ kmem_free(ctx, sizeof (struct refTables));
+}
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/lzjb.c b/sys/contrib/openzfs/module/zfs/lzjb.c
new file mode 100644
index 000000000000..a478e64c5141
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/lzjb.c
@@ -0,0 +1,132 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * We keep our own copy of this algorithm for 3 main reasons:
+ * 1. If we didn't, anyone modifying common/os/compress.c would
+ * directly break our on-disk format.
+ * 2. Our version of lzjb does not have a number of checks that the
+ * common/os version needs and uses
+ * 3. We initialize the lempel to ensure deterministic results,
+ * so that identical blocks can always be deduplicated.
+ * In particular, we are adding the "feature" that compress() can
+ * take a destination buffer size and return the compressed length, or the
+ * source length if compression would overflow the destination buffer.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_compress.h>
+
+#define MATCH_BITS 6
+#define MATCH_MIN 3
+#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
+#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
+#define LEMPEL_SIZE 1024
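+
+/*
+ * Illustrative note, restating the encoding used by the code below rather
+ * than adding a new code path: a match is emitted as a two-byte copy
+ * tuple, with the high MATCH_BITS bits of the first byte holding
+ * (mlen - MATCH_MIN) and the remaining 16 - MATCH_BITS bits holding the
+ * backward offset:
+ *
+ *	byte0 = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | (offset >> NBBY)
+ *	byte1 = (uchar_t)offset
+ *
+ * With MATCH_BITS = 6 this supports matches of MATCH_MIN (3) to
+ * MATCH_MAX (66) bytes at offsets of up to OFFSET_MASK (1023) bytes back.
+ */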
+
+/*ARGSUSED*/
+size_t
+lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *cpy;
+ uchar_t *copymap = NULL;
+ int copymask = 1 << (NBBY - 1);
+ int mlen, offset, hash;
+ uint16_t *hp;
+ uint16_t *lempel;
+
+ lempel = kmem_zalloc(LEMPEL_SIZE * sizeof (uint16_t), KM_SLEEP);
+ while (src < (uchar_t *)s_start + s_len) {
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
+ kmem_free(lempel,
+ LEMPEL_SIZE*sizeof (uint16_t));
+ return (s_len);
+ }
+ copymask = 1;
+ copymap = dst;
+ *dst++ = 0;
+ }
+ if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
+ *dst++ = *src++;
+ continue;
+ }
+ hash = (src[0] << 16) + (src[1] << 8) + src[2];
+ hash += hash >> 9;
+ hash += hash >> 5;
+ hp = &lempel[hash & (LEMPEL_SIZE - 1)];
+ offset = (intptr_t)(src - *hp) & OFFSET_MASK;
+ *hp = (uint16_t)(uintptr_t)src;
+ cpy = src - offset;
+ if (cpy >= (uchar_t *)s_start && cpy != src &&
+ src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
+ *copymap |= copymask;
+ for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
+ if (src[mlen] != cpy[mlen])
+ break;
+ *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
+ (offset >> NBBY);
+ *dst++ = (uchar_t)offset;
+ src += mlen;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+
+ kmem_free(lempel, LEMPEL_SIZE * sizeof (uint16_t));
+ return (dst - (uchar_t *)d_start);
+}
+
+/*ARGSUSED*/
+int
+lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *d_end = (uchar_t *)d_start + d_len;
+ uchar_t *cpy;
+ uchar_t copymap = 0;
+ int copymask = 1 << (NBBY - 1);
+
+ while (dst < d_end) {
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ copymask = 1;
+ copymap = *src++;
+ }
+ if (copymap & copymask) {
+ int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
+ int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
+ src += 2;
+ if ((cpy = dst - offset) < (uchar_t *)d_start)
+ return (-1);
+ while (--mlen >= 0 && dst < d_end)
+ *dst++ = *cpy++;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c
new file mode 100644
index 000000000000..bc4f007b61a1
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@@ -0,0 +1,6287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/space_map.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/zap.h>
+#include <sys/btree.h>
+
+#define WITH_DF_BLOCK_ALLOCATOR
+
+#define GANG_ALLOCATION(flags) \
+ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
+
+/*
+ * Metaslab granularity, in bytes. This is roughly similar to what would be
+ * referred to as the "stripe size" in traditional RAID arrays. In normal
+ * operation, we will try to write this amount of data to a top-level vdev
+ * before moving on to the next one.
+ */
+unsigned long metaslab_aliquot = 512 << 10;
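+
+/*
+ * For example, with the default of 512K the group aliquot is scaled by
+ * the number of children of the top-level vdev (see
+ * metaslab_group_activate() below), so a 4-wide top-level vdev is handed
+ * roughly 2M before the rotor advances to the next one.
+ */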
+
+/*
+ * For testing, make some blocks above a certain size be gang blocks.
+ */
+unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
+
+/*
+ * In pools where the log space map feature is not enabled we touch
+ * multiple metaslabs (and their respective space maps) with each
+ * transaction group. Thus, we benefit from having a small space map
+ * block size since it allows us to issue more I/O operations scattered
+ * around the disk. So a sane default for the space map block size
+ * is 8~16K.
+ */
+int zfs_metaslab_sm_blksz_no_log = (1 << 14);
+
+/*
+ * When the log space map feature is enabled, we accumulate a lot of
+ * changes per metaslab that are flushed once in a while so we benefit
+ * from a bigger block size like 128K for the metaslab space maps.
+ */
+int zfs_metaslab_sm_blksz_with_log = (1 << 17);
+
+/*
+ * The in-core space map representation is more compact than its on-disk form.
+ * The zfs_condense_pct determines how much more compact the in-core
+ * space map representation must be before we compact it on-disk.
+ * Values should be greater than or equal to 100.
+ */
+int zfs_condense_pct = 200;
+
+/*
+ * Condensing a metaslab is not guaranteed to actually reduce the amount of
+ * space used on disk. In particular, a space map uses data in increments of
+ * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
+ * same number of blocks after condensing. Since the goal of condensing is to
+ * reduce the number of IOPs required to read the space map, we only want to
+ * condense when we can be sure we will reduce the number of blocks used by the
+ * space map. Unfortunately, we cannot precisely compute whether or not this is
+ * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+ * we apply the following heuristic: do not condense a spacemap unless the
+ * uncondensed size consumes more than zfs_metaslab_condense_block_threshold
+ * blocks.
+ */
+int zfs_metaslab_condense_block_threshold = 4;
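+
+/*
+ * Illustrative sketch only (placeholder names, not real variables); the
+ * authoritative check lives in metaslab_should_condense(). Combining the
+ * two tunables above, a space map is roughly worth condensing when
+ *
+ *	ondisk_size >= optimal_size * zfs_condense_pct / 100 &&
+ *	ondisk_size > zfs_metaslab_condense_block_threshold * sm_blksz
+ *
+ * i.e. with the defaults, when the on-disk representation is at least
+ * twice the size of its condensed form and spans more than four blocks.
+ */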
+
+/*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations. Once
+ * a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;
+
+/*
+ * Metaslab groups are considered eligible for allocations if their
+ * fragmentation metric (measured as a percentage) is less than or
+ * equal to zfs_mg_fragmentation_threshold. If a metaslab group
+ * exceeds this threshold then it will be skipped unless all metaslab
+ * groups within the metaslab class have also crossed this threshold.
+ *
+ * This tunable was introduced to avoid edge cases where we continue
+ * allocating from very fragmented disks in our pool while other, less
+ * fragmented disks exist. On the other hand, if all disks in the
+ * pool are uniformly approaching the threshold, the threshold can
+ * become a performance speed bump, where we keep switching the disks
+ * that we allocate from (e.g. we allocate some segments from disk A,
+ * pushing it past the threshold, while freeing segments from disk B
+ * brings its fragmentation back below the threshold).
+ *
+ * Empirically, we've seen that our vdev selection for allocations is
+ * good enough that fragmentation increases uniformly across all vdevs
+ * the majority of the time. Thus we set the threshold percentage high
+ * enough to avoid hitting the speed bump on pools that are being pushed
+ * to the edge.
+ */
+int zfs_mg_fragmentation_threshold = 95;
+
+/*
+ * Allow metaslabs to keep their active state as long as their fragmentation
+ * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
+ * active metaslab that exceeds this threshold will no longer keep its active
+ * status allowing better metaslabs to be selected.
+ */
+int zfs_metaslab_fragmentation_threshold = 70;
+
+/*
+ * When set will load all metaslabs when pool is first opened.
+ */
+int metaslab_debug_load = 0;
+
+/*
+ * When set will prevent metaslabs from being unloaded.
+ */
+int metaslab_debug_unload = 0;
+
+/*
+ * Minimum size which forces the dynamic allocator to change
+ * its allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using a more
+ * aggressive strategy (i.e. search by size rather than by offset).
+ */
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+int metaslab_df_free_pct = 4;
+
+/*
+ * Maximum distance to search forward from the last offset. Without this
+ * limit, fragmented pools can see >100,000 iterations and
+ * metaslab_block_picker() becomes the performance limiting factor on
+ * high-performance storage.
+ *
+ * With the default setting of 16MB, we typically see less than 500
+ * iterations, even with very fragmented, ashift=9 pools. The maximum number
+ * of iterations possible is:
+ * metaslab_df_max_search / (2 * (1<<ashift))
+ * With the default setting of 16MB this is 16*1024 (with ashift=9) or
+ * 2048 (with ashift=12).
+ */
+int metaslab_df_max_search = 16 * 1024 * 1024;
+
+/*
+ * Forces the metaslab_block_picker function to search for at least this many
+ * segments forwards until giving up on finding a segment that the allocation
+ * will fit into.
+ */
+uint32_t metaslab_min_search_count = 100;
+
+/*
+ * If we are not searching forward (due to metaslab_df_max_search,
+ * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
+ * controls what segment is used. If it is set, we will use the largest free
+ * segment. If it is not set, we will use a segment of exactly the requested
+ * size (or larger).
+ */
+int metaslab_df_use_largest_segment = B_FALSE;
+
+/*
+ * Percentage of all cpus that can be used by the metaslab taskq.
+ */
+int metaslab_load_pct = 50;
+
+/*
+ * These tunables control how long a metaslab will remain loaded after the
+ * last allocation from it. A metaslab can't be unloaded until at least
+ * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
+ * have elapsed. However, zfs_metaslab_mem_limit may cause it to be
+ * unloaded sooner. These settings are intended to be generous -- to keep
+ * metaslabs loaded for a long time, reducing the rate of metaslab loading.
+ */
+int metaslab_unload_delay = 32;
+int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
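+
+/*
+ * Sketch of the combined check (see metaslab_class_evict_old() below,
+ * T and t are placeholders): a metaslab selected in txg T at time t only
+ * becomes evictable once
+ *
+ *	current_txg > T + metaslab_unload_delay &&
+ *	gethrtime() > t + MSEC2NSEC(metaslab_unload_delay_ms)
+ *
+ * unless memory pressure (zfs_metaslab_mem_limit) forces it out earlier.
+ */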
+
+/*
+ * Max number of metaslabs per group to preload.
+ */
+int metaslab_preload_limit = 10;
+
+/*
+ * Enable/disable preloading of metaslabs.
+ */
+int metaslab_preload_enabled = B_TRUE;
+
+/*
+ * Enable/disable fragmentation weighting on metaslabs.
+ */
+int metaslab_fragmentation_factor_enabled = B_TRUE;
+
+/*
+ * Enable/disable lba weighting (i.e. outer tracks are given preference).
+ */
+int metaslab_lba_weighting_enabled = B_TRUE;
+
+/*
+ * Enable/disable metaslab group biasing.
+ */
+int metaslab_bias_enabled = B_TRUE;
+
+/*
+ * Enable/disable remapping of indirect DVAs to their concrete vdevs.
+ */
+boolean_t zfs_remap_blkptr_enable = B_TRUE;
+
+/*
+ * Enable/disable segment-based metaslab selection.
+ */
+int zfs_metaslab_segment_weight_enabled = B_TRUE;
+
+/*
+ * When using segment-based metaslab selection, we will continue
+ * allocating from the active metaslab until we have exhausted
+ * zfs_metaslab_switch_threshold of its buckets.
+ */
+int zfs_metaslab_switch_threshold = 2;
+
+/*
+ * Internal switch to enable/disable the metaslab allocation tracing
+ * facility.
+ */
+boolean_t metaslab_trace_enabled = B_FALSE;
+
+/*
+ * Maximum entries that the metaslab allocation tracing facility will keep
+ * in a given list when running in non-debug mode. We limit the number
+ * of entries in non-debug mode to prevent us from using up too much memory.
+ * The limit should be sufficiently large that we don't expect any allocation
+ * to ever exceed this value. In debug mode, the system will panic if this
+ * limit is ever reached, allowing for further investigation.
+ */
+uint64_t metaslab_trace_max_entries = 5000;
+
+/*
+ * Maximum number of metaslabs per group that can be disabled
+ * simultaneously.
+ */
+int max_disabled_ms = 3;
+
+/*
+ * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
+ * To avoid 64-bit overflow, don't set above UINT32_MAX.
+ */
+unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+
+/*
+ * Maximum percentage of memory to use on storing loaded metaslabs. If loading
+ * a metaslab would take it over this percentage, the oldest selected metaslab
+ * is automatically unloaded.
+ */
+int zfs_metaslab_mem_limit = 75;
+
+/*
+ * Force the per-metaslab range trees to use 64-bit integers to store
+ * segments. Used for debugging purposes.
+ */
+boolean_t zfs_metaslab_force_large_segs = B_FALSE;
+
+/*
+ * By default we only store segments over a certain size in the size-sorted
+ * metaslab trees (ms_allocatable_by_size and
+ * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
+ * improves load and unload times at the cost of causing us to use slightly
+ * larger segments than we would otherwise in some cases.
+ */
+uint32_t metaslab_by_size_min_shift = 14;
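+
+/*
+ * With the default of 14, segments smaller than 16K are left out of the
+ * size-sorted trees; metaslab_size_tree_full_load() temporarily drops the
+ * floor to 0 and reloads every segment when the full tree is needed.
+ */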
+
+/*
+ * If not set, we will first try normal allocation. If that fails then
+ * we will do a gang allocation. If that fails then we will do a "try hard"
+ * gang allocation. If that fails then we will have a multi-layer gang
+ * block.
+ *
+ * If set, we will first try normal allocation. If that fails then
+ * we will do a "try hard" allocation. If that fails we will do a gang
+ * allocation. If that fails we will do a "try hard" gang allocation. If
+ * that fails then we will have a multi-layer gang block.
+ */
+int zfs_metaslab_try_hard_before_gang = B_FALSE;
+
+/*
+ * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
+ * metaslabs. This improves performance, especially when there are many
+ * metaslabs per vdev and the allocation can't actually be satisfied (so we
+ * would otherwise iterate all the metaslabs). If there is a metaslab with a
+ * worse weight but it can actually satisfy the allocation, we won't find it
+ * until trying hard. This may happen if the worse metaslab is not loaded
+ * (and the true weight is better than we have calculated), or due to weight
+ * bucketization. E.g. we are looking for a 60K segment, and the best
+ * metaslabs all have free segments in the 32-63K bucket, but the best
+ * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
+ * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
+ * bucket, and therefore a lower weight).
+ */
+int zfs_metaslab_find_max_tries = 100;
+
+static uint64_t metaslab_weight(metaslab_t *, boolean_t);
+static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
+static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
+static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
+
+static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
+static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
+static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
+static unsigned int metaslab_idx_func(multilist_t *, void *);
+static void metaslab_evict(metaslab_t *, uint64_t);
+static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+kmem_cache_t *metaslab_alloc_trace_cache;
+
+typedef struct metaslab_stats {
+ kstat_named_t metaslabstat_trace_over_limit;
+ kstat_named_t metaslabstat_reload_tree;
+ kstat_named_t metaslabstat_too_many_tries;
+ kstat_named_t metaslabstat_try_hard;
+} metaslab_stats_t;
+
+static metaslab_stats_t metaslab_stats = {
+ { "trace_over_limit", KSTAT_DATA_UINT64 },
+ { "reload_tree", KSTAT_DATA_UINT64 },
+ { "too_many_tries", KSTAT_DATA_UINT64 },
+ { "try_hard", KSTAT_DATA_UINT64 },
+};
+
+#define METASLABSTAT_BUMP(stat) \
+ atomic_inc_64(&metaslab_stats.stat.value.ui64);
+
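+/*
+ * e.g. METASLABSTAT_BUMP(metaslabstat_reload_tree) atomically increments
+ * the named kstat counter published as zfs/metaslab_stats.
+ */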
+
+kstat_t *metaslab_ksp;
+
+void
+metaslab_stat_init(void)
+{
+ ASSERT(metaslab_alloc_trace_cache == NULL);
+ metaslab_alloc_trace_cache = kmem_cache_create(
+ "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
+ "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
+ sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (metaslab_ksp != NULL) {
+ metaslab_ksp->ks_data = &metaslab_stats;
+ kstat_install(metaslab_ksp);
+ }
+}
+
+void
+metaslab_stat_fini(void)
+{
+ if (metaslab_ksp != NULL) {
+ kstat_delete(metaslab_ksp);
+ metaslab_ksp = NULL;
+ }
+
+ kmem_cache_destroy(metaslab_alloc_trace_cache);
+ metaslab_alloc_trace_cache = NULL;
+}
+
+/*
+ * ==========================================================================
+ * Metaslab classes
+ * ==========================================================================
+ */
+metaslab_class_t *
+metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
+{
+ metaslab_class_t *mc;
+
+ mc = kmem_zalloc(offsetof(metaslab_class_t,
+ mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
+
+ mc->mc_spa = spa;
+ mc->mc_ops = ops;
+ mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
+ offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
+ mca->mca_rotor = NULL;
+ zfs_refcount_create_tracked(&mca->mca_alloc_slots);
+ }
+
+ return (mc);
+}
+
+void
+metaslab_class_destroy(metaslab_class_t *mc)
+{
+ spa_t *spa = mc->mc_spa;
+
+ ASSERT(mc->mc_alloc == 0);
+ ASSERT(mc->mc_deferred == 0);
+ ASSERT(mc->mc_space == 0);
+ ASSERT(mc->mc_dspace == 0);
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
+ ASSERT(mca->mca_rotor == NULL);
+ zfs_refcount_destroy(&mca->mca_alloc_slots);
+ }
+ mutex_destroy(&mc->mc_lock);
+ multilist_destroy(mc->mc_metaslab_txg_list);
+ kmem_free(mc, offsetof(metaslab_class_t,
+ mc_allocator[spa->spa_alloc_count]));
+}
+
+int
+metaslab_class_validate(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+ vdev_t *vd;
+
+ /*
+ * Must hold one of the spa_config locks.
+ */
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
+ spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
+
+ if ((mg = mc->mc_allocator[0].mca_rotor) == NULL)
+ return (0);
+
+ do {
+ vd = mg->mg_vd;
+ ASSERT(vd->vdev_mg != NULL);
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(mg->mg_class, ==, mc);
+ ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
+ } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor);
+
+ return (0);
+}
+
+static void
+metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
+{
+ atomic_add_64(&mc->mc_alloc, alloc_delta);
+ atomic_add_64(&mc->mc_deferred, defer_delta);
+ atomic_add_64(&mc->mc_space, space_delta);
+ atomic_add_64(&mc->mc_dspace, dspace_delta);
+}
+
+uint64_t
+metaslab_class_get_alloc(metaslab_class_t *mc)
+{
+ return (mc->mc_alloc);
+}
+
+uint64_t
+metaslab_class_get_deferred(metaslab_class_t *mc)
+{
+ return (mc->mc_deferred);
+}
+
+uint64_t
+metaslab_class_get_space(metaslab_class_t *mc)
+{
+ return (mc->mc_space);
+}
+
+uint64_t
+metaslab_class_get_dspace(metaslab_class_t *mc)
+{
+ return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
+}
+
+void
+metaslab_class_histogram_verify(metaslab_class_t *mc)
+{
+ spa_t *spa = mc->mc_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *mc_hist;
+ int i;
+
+ if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+ return;
+
+ mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+ KM_SLEEP);
+
+ mutex_enter(&mc->mc_lock);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = vdev_get_mg(tvd, mc);
+
+ /*
+ * Skip any holes, uninitialized top-levels, or
+		 * vdevs that are not in this metaslab class.
+ */
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ IMPLY(mg == mg->mg_vd->vdev_log_mg,
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ mc_hist[i] += mg->mg_histogram[i];
+ }
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
+ }
+
+ mutex_exit(&mc->mc_lock);
+ kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
+/*
+ * Calculate the metaslab class's fragmentation metric. The metric
+ * is weighted based on the space contribution of each metaslab group.
+ * The return value will be a number between 0 and 100 (inclusive), or
+ * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
+ * zfs_frag_table for more information about the metric.
+ */
+uint64_t
+metaslab_class_fragmentation(metaslab_class_t *mc)
+{
+ vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ uint64_t fragmentation = 0;
+
+ spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ /*
+ * Skip any holes, uninitialized top-levels,
+		 * or vdevs that are not in this metaslab class.
+ */
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ /*
+ * If a metaslab group does not contain a fragmentation
+ * metric then just bail out.
+ */
+ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (ZFS_FRAG_INVALID);
+ }
+
+ /*
+ * Determine how much this metaslab_group is contributing
+ * to the overall pool fragmentation metric.
+ */
+ fragmentation += mg->mg_fragmentation *
+ metaslab_group_get_space(mg);
+ }
+ fragmentation /= metaslab_class_get_space(mc);
+
+ ASSERT3U(fragmentation, <=, 100);
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (fragmentation);
+}
+
+/*
+ * Calculate the amount of expandable space that is available in
+ * this metaslab class. If a device is expanded then its expandable
+ * space will be the amount of allocatable space that is currently not
+ * part of this metaslab class.
+ */
+uint64_t
+metaslab_class_expandable_space(metaslab_class_t *mc)
+{
+ vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ uint64_t space = 0;
+
+ spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ /*
+ * Calculate if we have enough space to add additional
+ * metaslabs. We report the expandable space in terms
+ * of the metaslab size since that's the unit of expansion.
+ */
+ space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
+ 1ULL << tvd->vdev_ms_shift);
+ }
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (space);
+}
+
+void
+metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
+{
+ multilist_t *ml = mc->mc_metaslab_txg_list;
+ for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ metaslab_t *msp = multilist_sublist_head(mls);
+ multilist_sublist_unlock(mls);
+ while (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * If the metaslab has been removed from the list
+ * (which could happen if we were at the memory limit
+ * and it was evicted during this loop), then we can't
+ * proceed and we should restart the sublist.
+ */
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ mutex_exit(&msp->ms_lock);
+ i--;
+ break;
+ }
+ mls = multilist_sublist_lock(ml, i);
+ metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+ multilist_sublist_unlock(mls);
+ if (txg >
+ msp->ms_selected_txg + metaslab_unload_delay &&
+ gethrtime() > msp->ms_selected_time +
+ (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
+ metaslab_evict(msp, txg);
+ } else {
+ /*
+ * Once we've hit a metaslab selected too
+ * recently to evict, we're done evicting for
+ * now.
+ */
+ mutex_exit(&msp->ms_lock);
+ break;
+ }
+ mutex_exit(&msp->ms_lock);
+ msp = next_msp;
+ }
+ }
+}
+
+static int
+metaslab_compare(const void *x1, const void *x2)
+{
+ const metaslab_t *m1 = (const metaslab_t *)x1;
+ const metaslab_t *m2 = (const metaslab_t *)x2;
+
+ int sort1 = 0;
+ int sort2 = 0;
+ if (m1->ms_allocator != -1 && m1->ms_primary)
+ sort1 = 1;
+ else if (m1->ms_allocator != -1 && !m1->ms_primary)
+ sort1 = 2;
+ if (m2->ms_allocator != -1 && m2->ms_primary)
+ sort2 = 1;
+ else if (m2->ms_allocator != -1 && !m2->ms_primary)
+ sort2 = 2;
+
+ /*
+ * Sort inactive metaslabs first, then primaries, then secondaries. When
+ * selecting a metaslab to allocate from, an allocator first tries its
+ * primary, then secondary active metaslab. If it doesn't have active
+ * metaslabs, or can't allocate from them, it searches for an inactive
+ * metaslab to activate. If it can't find a suitable one, it will steal
+ * a primary or secondary metaslab from another allocator.
+ */
+ if (sort1 < sort2)
+ return (-1);
+ if (sort1 > sort2)
+ return (1);
+
+ int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
+ if (likely(cmp))
+ return (cmp);
+
+ IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
+
+ return (TREE_CMP(m1->ms_start, m2->ms_start));
+}
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+/*
+ * Update the allocatable flag and the metaslab group's capacity.
+ * The allocatable flag is set to true if the capacity is below
+ * the zfs_mg_noalloc_threshold or has a fragmentation value that is
+ * greater than zfs_mg_fragmentation_threshold. If a metaslab group
+ * transitions from allocatable to non-allocatable or vice versa then the
+ * metaslab group's class is updated to reflect the transition.
+ */
+static void
+metaslab_group_alloc_update(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ metaslab_class_t *mc = mg->mg_class;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ boolean_t was_allocatable;
+ boolean_t was_initialized;
+
+ ASSERT(vd == vd->vdev_top);
+ ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
+ SCL_ALLOC);
+
+ mutex_enter(&mg->mg_lock);
+ was_allocatable = mg->mg_allocatable;
+ was_initialized = mg->mg_initialized;
+
+ mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
+ (vs->vs_space + 1);
+
+ mutex_enter(&mc->mc_lock);
+
+ /*
+ * If the metaslab group was just added then it won't
+ * have any space until we finish syncing out this txg.
+ * At that point we will consider it initialized and available
+ * for allocations. We also don't consider non-activated
+ * metaslab groups (e.g. vdevs that are in the middle of being removed)
+ * to be initialized, because they can't be used for allocation.
+ */
+ mg->mg_initialized = metaslab_group_initialized(mg);
+ if (!was_initialized && mg->mg_initialized) {
+ mc->mc_groups++;
+ } else if (was_initialized && !mg->mg_initialized) {
+ ASSERT3U(mc->mc_groups, >, 0);
+ mc->mc_groups--;
+ }
+ if (mg->mg_initialized)
+ mg->mg_no_free_space = B_FALSE;
+
+ /*
+ * A metaslab group is considered allocatable if it has plenty
+ * of free space or is not heavily fragmented. We only take
+ * fragmentation into account if the metaslab group has a valid
+ * fragmentation metric (i.e. a value between 0 and 100).
+ */
+ mg->mg_allocatable = (mg->mg_activation_count > 0 &&
+ mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
+ (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
+ mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
+
+ /*
+ * The mc_alloc_groups maintains a count of the number of
+ * groups in this metaslab class that are still above the
+ * zfs_mg_noalloc_threshold. This is used by the allocating
+ * threads to determine if they should avoid allocations to
+ * a given group. The allocator will avoid allocations to a group
+ * if that group has reached or is below the zfs_mg_noalloc_threshold
+ * and there are still other groups that are above the threshold.
+ * When a group transitions from allocatable to non-allocatable or
+ * vice versa we update the metaslab class to reflect that change.
+ * When the mc_alloc_groups value drops to 0 that means that all
+ * groups have reached the zfs_mg_noalloc_threshold making all groups
+ * eligible for allocations. This effectively means that all devices
+ * are balanced again.
+ */
+ if (was_allocatable && !mg->mg_allocatable)
+ mc->mc_alloc_groups--;
+ else if (!was_allocatable && mg->mg_allocatable)
+ mc->mc_alloc_groups++;
+ mutex_exit(&mc->mc_lock);
+
+ mutex_exit(&mg->mg_lock);
+}
+
+int
+metaslab_sort_by_flushed(const void *va, const void *vb)
+{
+ const metaslab_t *a = va;
+ const metaslab_t *b = vb;
+
+ int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
+ if (likely(cmp))
+ return (cmp);
+
+ uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
+ uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
+ cmp = TREE_CMP(a_vdev_id, b_vdev_id);
+ if (cmp)
+ return (cmp);
+
+ return (TREE_CMP(a->ms_id, b->ms_id));
+}
+
+metaslab_group_t *
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
+{
+ metaslab_group_t *mg;
+
+ mg = kmem_zalloc(offsetof(metaslab_group_t,
+ mg_allocator[allocators]), KM_SLEEP);
+ mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
+ avl_create(&mg->mg_metaslab_tree, metaslab_compare,
+ sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
+ mg->mg_vd = vd;
+ mg->mg_class = mc;
+ mg->mg_activation_count = 0;
+ mg->mg_initialized = B_FALSE;
+ mg->mg_no_free_space = B_TRUE;
+ mg->mg_allocators = allocators;
+
+ for (int i = 0; i < allocators; i++) {
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
+ zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
+ }
+
+ mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
+ maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
+
+ return (mg);
+}
+
+void
+metaslab_group_destroy(metaslab_group_t *mg)
+{
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ /*
+ * We may have gone below zero with the activation count
+ * either because we never activated in the first place or
+ * because we're done, and possibly removing the vdev.
+ */
+ ASSERT(mg->mg_activation_count <= 0);
+
+ taskq_destroy(mg->mg_taskq);
+ avl_destroy(&mg->mg_metaslab_tree);
+ mutex_destroy(&mg->mg_lock);
+ mutex_destroy(&mg->mg_ms_disabled_lock);
+ cv_destroy(&mg->mg_ms_disabled_cv);
+
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
+ zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
+ }
+ kmem_free(mg, offsetof(metaslab_group_t,
+ mg_allocator[mg->mg_allocators]));
+}
+
+void
+metaslab_group_activate(metaslab_group_t *mg)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ spa_t *spa = mc->mc_spa;
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
+
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ ASSERT(mg->mg_activation_count <= 0);
+
+ if (++mg->mg_activation_count <= 0)
+ return;
+
+ mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+ metaslab_group_alloc_update(mg);
+
+ if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
+ mg->mg_prev = mg;
+ mg->mg_next = mg;
+ } else {
+ mgnext = mgprev->mg_next;
+ mg->mg_prev = mgprev;
+ mg->mg_next = mgnext;
+ mgprev->mg_next = mg;
+ mgnext->mg_prev = mg;
+ }
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mc->mc_allocator[i].mca_rotor = mg;
+ mg = mg->mg_next;
+ }
+}
+
+/*
+ * Passivate a metaslab group and remove it from the allocation rotor.
+ * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
+ * a metaslab group. This function will momentarily drop spa_config_locks
+ * that are lower than the SCL_ALLOC lock (see comment below).
+ */
+void
+metaslab_group_passivate(metaslab_group_t *mg)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ spa_t *spa = mc->mc_spa;
+ metaslab_group_t *mgprev, *mgnext;
+ int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
+
+ ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
+ (SCL_ALLOC | SCL_ZIO));
+
+ if (--mg->mg_activation_count != 0) {
+ for (int i = 0; i < spa->spa_alloc_count; i++)
+ ASSERT(mc->mc_allocator[i].mca_rotor != mg);
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ ASSERT(mg->mg_activation_count < 0);
+ return;
+ }
+
+ /*
+ * The spa_config_lock is an array of rwlocks, ordered as
+ * follows (from highest to lowest):
+ * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
+ * SCL_ZIO > SCL_FREE > SCL_VDEV
+ * (For more information about the spa_config_lock see spa_misc.c)
+ * The higher the lock, the broader its coverage. When we passivate
+ * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
+ * config locks. However, the metaslab group's taskq might be trying
+ * to preload metaslabs so we must drop the SCL_ZIO lock and any
+ * lower locks to allow the I/O to complete. At a minimum,
+ * we continue to hold the SCL_ALLOC lock, which prevents any future
+ * allocations from taking place and any changes to the vdev tree.
+ */
+ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
+ taskq_wait_outstanding(mg->mg_taskq, 0);
+ spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
+ metaslab_group_alloc_update(mg);
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
+ metaslab_t *msp = mga->mga_primary;
+ if (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+ msp = mga->mga_secondary;
+ if (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+
+ mgprev = mg->mg_prev;
+ mgnext = mg->mg_next;
+
+ if (mg == mgnext) {
+ mgnext = NULL;
+ } else {
+ mgprev->mg_next = mgnext;
+ mgnext->mg_prev = mgprev;
+ }
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ if (mc->mc_allocator[i].mca_rotor == mg)
+ mc->mc_allocator[i].mca_rotor = mgnext;
+ }
+
+ mg->mg_prev = NULL;
+ mg->mg_next = NULL;
+}
+
+boolean_t
+metaslab_group_initialized(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ return (vs->vs_space != 0 && mg->mg_activation_count > 0);
+}
+
+uint64_t
+metaslab_group_get_space(metaslab_group_t *mg)
+{
+ /*
+ * Note that the number of nodes in mg_metaslab_tree may be one less
+ * than vdev_ms_count, due to the embedded log metaslab.
+ */
+ mutex_enter(&mg->mg_lock);
+ uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree);
+ mutex_exit(&mg->mg_lock);
+ return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count);
+}
+
+void
+metaslab_group_histogram_verify(metaslab_group_t *mg)
+{
+ uint64_t *mg_hist;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+ if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+ return;
+
+ mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+ KM_SLEEP);
+
+ ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
+ SPACE_MAP_HISTOGRAM_SIZE + ashift);
+
+ mutex_enter(&mg->mg_lock);
+ for (metaslab_t *msp = avl_first(t);
+ msp != NULL; msp = AVL_NEXT(t, msp)) {
+ VERIFY3P(msp->ms_group, ==, mg);
+ /* skip if not active */
+ if (msp->ms_sm == NULL)
+ continue;
+
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ mg_hist[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+ }
+
+	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
+
+ mutex_exit(&mg->mg_lock);
+
+ kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
+static void
+metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_sm == NULL)
+ return;
+
+ mutex_enter(&mg->mg_lock);
+ mutex_enter(&mc->mc_lock);
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ IMPLY(mg == mg->mg_vd->vdev_log_mg,
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+ mg->mg_histogram[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ mc->mc_histogram[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+ mutex_exit(&mc->mc_lock);
+ mutex_exit(&mg->mg_lock);
+}
+
+void
+metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_sm == NULL)
+ return;
+
+ mutex_enter(&mg->mg_lock);
+ mutex_enter(&mc->mc_lock);
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ ASSERT3U(mg->mg_histogram[i + ashift], >=,
+ msp->ms_sm->sm_phys->smp_histogram[i]);
+ ASSERT3U(mc->mc_histogram[i + ashift], >=,
+ msp->ms_sm->sm_phys->smp_histogram[i]);
+ IMPLY(mg == mg->mg_vd->vdev_log_mg,
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+
+ mg->mg_histogram[i + ashift] -=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ mc->mc_histogram[i + ashift] -=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+ mutex_exit(&mc->mc_lock);
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
+{
+ ASSERT(msp->ms_group == NULL);
+ mutex_enter(&mg->mg_lock);
+ msp->ms_group = mg;
+ msp->ms_weight = 0;
+ avl_add(&mg->mg_metaslab_tree, msp);
+ mutex_exit(&mg->mg_lock);
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_group_histogram_add(mg, msp);
+ mutex_exit(&msp->ms_lock);
+}
+
+static void
+metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+ mutex_enter(&msp->ms_lock);
+ metaslab_group_histogram_remove(mg, msp);
+ mutex_exit(&msp->ms_lock);
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT(msp->ms_group == mg);
+ avl_remove(&mg->mg_metaslab_tree, msp);
+
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ multilist_sublist_unlock(mls);
+
+ msp->ms_group = NULL;
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(MUTEX_HELD(&mg->mg_lock));
+ ASSERT(msp->ms_group == mg);
+
+ avl_remove(&mg->mg_metaslab_tree, msp);
+ msp->ms_weight = weight;
+ avl_add(&mg->mg_metaslab_tree, msp);
+}
+
+static void
+metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+ /*
+ * Although in principle the weight can be any value, in
+ * practice we do not use values in the range [1, 511].
+ */
+ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ mutex_enter(&mg->mg_lock);
+ metaslab_group_sort_impl(mg, msp, weight);
+ mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * Calculate the fragmentation for a given metaslab group. We can use
+ * a simple average here since all metaslabs within the group must have
+ * the same size. The return value will be a value between 0 and 100
+ * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
+ * group have a fragmentation metric.
+ */
+uint64_t
+metaslab_group_fragmentation(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ uint64_t fragmentation = 0;
+ uint64_t valid_ms = 0;
+
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
+ continue;
+ if (msp->ms_group != mg)
+ continue;
+
+ valid_ms++;
+ fragmentation += msp->ms_fragmentation;
+ }
+
+ if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
+ return (ZFS_FRAG_INVALID);
+
+ fragmentation /= valid_ms;
+ ASSERT3U(fragmentation, <=, 100);
+ return (fragmentation);
+}
+
+/*
+ * Determine if a given metaslab group should skip allocations. A metaslab
+ * group should avoid allocations if its free capacity is less than the
+ * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
+ * zfs_mg_fragmentation_threshold and there is at least one metaslab group
+ * that can still handle allocations. If the allocation throttle is enabled
+ * then we skip allocations to devices that have reached their maximum
+ * allocation queue depth unless the selected metaslab group is the only
+ * eligible group remaining.
+ */
+static boolean_t
+metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
+ uint64_t psize, int allocator, int d)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_class_t *mc = mg->mg_class;
+
+ /*
+ * We can only consider skipping this metaslab group if it's
+ * in the normal metaslab class and there are other metaslab
+ * groups to select from. Otherwise, we always consider it eligible
+ * for allocations.
+ */
+ if ((mc != spa_normal_class(spa) &&
+ mc != spa_special_class(spa) &&
+ mc != spa_dedup_class(spa)) ||
+ mc->mc_groups <= 1)
+ return (B_TRUE);
+
+ /*
+ * If the metaslab group's mg_allocatable flag is set (see comments
+ * in metaslab_group_alloc_update() for more information) and
+ * the allocation throttle is disabled then allow allocations to this
+ * device. However, if the allocation throttle is enabled then
+ * check if we have reached our allocation limit (mga_alloc_queue_depth)
+ * to determine if we should allow allocations to this metaslab group.
+ * If all metaslab groups are no longer considered allocatable
+ * (mc_alloc_groups == 0) or we're trying to allocate the smallest
+ * gang block size then we allow allocations on this metaslab group
+ * regardless of the mg_allocatable or throttle settings.
+ */
+ if (mg->mg_allocatable) {
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ int64_t qdepth;
+ uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;
+
+ if (!mc->mc_alloc_throttle_enabled)
+ return (B_TRUE);
+
+ /*
+ * If this metaslab group does not have any free space, then
+ * there is no point in looking further.
+ */
+ if (mg->mg_no_free_space)
+ return (B_FALSE);
+
+ /*
+		 * Relax allocation throttling for ditto blocks. Due to
+		 * random imbalances in allocation, it tends to push copies
+		 * to the one vdev that looks a bit better at the moment.
+ */
+ qmax = qmax * (4 + d) / 4;
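+		/*
+		 * For example, the first DVA (d == 0) leaves qmax unchanged,
+		 * the second (d == 1) allows 25% more queued allocations,
+		 * and the third (d == 2) allows 50% more.
+		 */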
+
+ qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);
+
+ /*
+ * If this metaslab group is below its qmax or it's
+		 * the only allocatable metaslab group, then attempt
+ * to allocate from it.
+ */
+ if (qdepth < qmax || mc->mc_alloc_groups == 1)
+ return (B_TRUE);
+ ASSERT3U(mc->mc_alloc_groups, >, 1);
+
+ /*
+ * Since this metaslab group is at or over its qmax, we
+ * need to determine if there are metaslab groups after this
+ * one that might be able to handle this allocation. This is
+ * racy since we can't hold the locks for all metaslab
+ * groups at the same time when we make this check.
+ */
+ for (metaslab_group_t *mgp = mg->mg_next;
+ mgp != rotor; mgp = mgp->mg_next) {
+ metaslab_group_allocator_t *mgap =
+ &mgp->mg_allocator[allocator];
+ qmax = mgap->mga_cur_max_alloc_queue_depth;
+ qmax = qmax * (4 + d) / 4;
+ qdepth =
+ zfs_refcount_count(&mgap->mga_alloc_queue_depth);
+
+ /*
+ * If there is another metaslab group that
+ * might be able to handle the allocation, then
+ * we return false so that we skip this group.
+ */
+ if (qdepth < qmax && !mgp->mg_no_free_space)
+ return (B_FALSE);
+ }
+
+ /*
+ * We didn't find another group to handle the allocation
+ * so we can't skip this metaslab group even though
+ * we are at or over our qmax.
+ */
+ return (B_TRUE);
+
+ } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * ==========================================================================
+ * Range tree callbacks
+ * ==========================================================================
+ */
+
+/*
+ * Comparison function for the private size-ordered tree using 32-bit
+ * ranges. Tree is sorted by size, larger sizes at the end of the tree.
+ */
+static int
+metaslab_rangesize32_compare(const void *x1, const void *x2)
+{
+ const range_seg32_t *r1 = x1;
+ const range_seg32_t *r2 = x2;
+
+ uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+ uint64_t rs_size2 = r2->rs_end - r2->rs_start;
+
+ int cmp = TREE_CMP(rs_size1, rs_size2);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(r1->rs_start, r2->rs_start));
+}
+
+/*
+ * Comparison function for the private size-ordered tree using 64-bit
+ * ranges. Tree is sorted by size, larger sizes at the end of the tree.
+ */
+static int
+metaslab_rangesize64_compare(const void *x1, const void *x2)
+{
+ const range_seg64_t *r1 = x1;
+ const range_seg64_t *r2 = x2;
+
+ uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+ uint64_t rs_size2 = r2->rs_end - r2->rs_start;
+
+ int cmp = TREE_CMP(rs_size1, rs_size2);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(r1->rs_start, r2->rs_start));
+}
+
+typedef struct metaslab_rt_arg {
+ zfs_btree_t *mra_bt;
+ uint32_t mra_floor_shift;
+} metaslab_rt_arg_t;
+
+struct mssa_arg {
+ range_tree_t *rt;
+ metaslab_rt_arg_t *mra;
+};
+
+static void
+metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
+{
+ struct mssa_arg *mssap = arg;
+ range_tree_t *rt = mssap->rt;
+ metaslab_rt_arg_t *mrap = mssap->mra;
+ range_seg_max_t seg = {0};
+ rs_set_start(&seg, rt, start);
+ rs_set_end(&seg, rt, start + size);
+ metaslab_rt_add(rt, &seg, mrap);
+}
+
+static void
+metaslab_size_tree_full_load(range_tree_t *rt)
+{
+ metaslab_rt_arg_t *mrap = rt->rt_arg;
+ METASLABSTAT_BUMP(metaslabstat_reload_tree);
+ ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
+ mrap->mra_floor_shift = 0;
+ struct mssa_arg arg = {0};
+ arg.rt = rt;
+ arg.mra = mrap;
+ range_tree_walk(rt, metaslab_size_sorted_add, &arg);
+}
+
+/*
+ * Create any block allocator specific components. The current allocators
+ * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
+ */
+/* ARGSUSED */
+static void
+metaslab_rt_create(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ size_t size;
+ int (*compare) (const void *, const void *);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ compare = metaslab_rangesize32_compare;
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ compare = metaslab_rangesize64_compare;
+ break;
+ default:
+ panic("Invalid range seg type %d", rt->rt_type);
+ }
+ zfs_btree_create(size_tree, compare, size);
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_destroy(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ zfs_btree_destroy(size_tree);
+ kmem_free(mrap, sizeof (*mrap));
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
+ (1 << mrap->mra_floor_shift))
+ return;
+
+ zfs_btree_add(size_tree, rs);
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 <<
+ mrap->mra_floor_shift))
+ return;
+
+ zfs_btree_remove(size_tree, rs);
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_vacate(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+ zfs_btree_clear(size_tree);
+ zfs_btree_destroy(size_tree);
+
+ metaslab_rt_create(rt, arg);
+}
+
+static range_tree_ops_t metaslab_rt_ops = {
+ .rtop_create = metaslab_rt_create,
+ .rtop_destroy = metaslab_rt_destroy,
+ .rtop_add = metaslab_rt_add,
+ .rtop_remove = metaslab_rt_remove,
+ .rtop_vacate = metaslab_rt_vacate
+};
+
+/*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+
+/*
+ * Return the maximum contiguous segment within the metaslab.
+ */
+uint64_t
+metaslab_largest_allocatable(metaslab_t *msp)
+{
+ zfs_btree_t *t = &msp->ms_allocatable_by_size;
+ range_seg_t *rs;
+
+ if (t == NULL)
+ return (0);
+ if (zfs_btree_numnodes(t) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
+
+ rs = zfs_btree_last(t, NULL);
+ if (rs == NULL)
+ return (0);
+
+ return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
+ msp->ms_allocatable));
+}
+
+/*
+ * Return the maximum contiguous segment within the unflushed frees of this
+ * metaslab.
+ */
+static uint64_t
+metaslab_largest_unflushed_free(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if (msp->ms_unflushed_frees == NULL)
+ return (0);
+
+ if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
+ metaslab_size_tree_full_load(msp->ms_unflushed_frees);
+ range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
+ NULL);
+ if (rs == NULL)
+ return (0);
+
+ /*
+ * When a range is freed from the metaslab, that range is added to
+ * both the unflushed frees and the deferred frees. While the block
+ * will eventually be usable, if the metaslab were loaded the range
+ * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
+ * txgs had passed. As a result, when attempting to estimate an upper
+ * bound for the largest currently-usable free segment in the
+ * metaslab, we need to not consider any ranges currently in the defer
+ * trees. This algorithm approximates the largest available chunk in
+ * the largest range in the unflushed_frees tree by taking the first
+ * chunk. While this may be a poor estimate, it should only remain so
+ * briefly and should eventually self-correct as frees are no longer
+ * deferred. Similar logic applies to the ms_freed tree. See
+ * metaslab_load() for more details.
+ *
+ * There are two primary sources of inaccuracy in this estimate. Both
+ * are tolerated for performance reasons. The first source is that we
+ * only check the largest segment for overlaps. Smaller segments may
+ * have more favorable overlaps with the other trees, resulting in
+ * larger usable chunks. Second, we only look at the first chunk in
+ * the largest segment; there may be other usable chunks in the
+ * largest segment, but we ignore them.
+ */
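+	/*
+	 * Worked example (illustrative): if the largest unflushed-free
+	 * segment is [100K, 200K) and one of the defer trees holds a range
+	 * starting at 150K, the loop below trims the estimate to the first
+	 * chunk [100K, 150K), i.e. 50K, even if a larger usable chunk exists
+	 * later in the segment.
+	 */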
+ uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
+ uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ uint64_t start = 0;
+ uint64_t size = 0;
+ boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
+ rsize, &start, &size);
+ if (found) {
+ if (rstart == start)
+ return (0);
+ rsize = start - rstart;
+ }
+ }
+
+ uint64_t start = 0;
+ uint64_t size = 0;
+ boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
+ rsize, &start, &size);
+ if (found)
+ rsize = start - rstart;
+
+ return (rsize);
+}
+
+static range_seg_t *
+metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
+ uint64_t size, zfs_btree_index_t *where)
+{
+ range_seg_t *rs;
+ range_seg_max_t rsearch;
+
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, start + size);
+
+ rs = zfs_btree_find(t, &rsearch, where);
+ if (rs == NULL) {
+ rs = zfs_btree_next(t, where, where);
+ }
+
+ return (rs);
+}
+
+#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
+ defined(WITH_CF_BLOCK_ALLOCATOR)
+
+/*
+ * This is a helper function that can be used by the allocator to find a
+ * suitable block to allocate. This will search the specified B-tree looking
+ * for a block that matches the specified criteria.
+ */
+static uint64_t
+metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
+ uint64_t max_search)
+{
+ if (*cursor == 0)
+ *cursor = rt->rt_start;
+ zfs_btree_t *bt = &rt->rt_root;
+ zfs_btree_index_t where;
+ range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
+ uint64_t first_found;
+ int count_searched = 0;
+
+ if (rs != NULL)
+ first_found = rs_get_start(rs, rt);
+
+ while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
+ max_search || count_searched < metaslab_min_search_count)) {
+ uint64_t offset = rs_get_start(rs, rt);
+ if (offset + size <= rs_get_end(rs, rt)) {
+ *cursor = offset + size;
+ return (offset);
+ }
+ rs = zfs_btree_next(bt, &where, &where);
+ count_searched++;
+ }
+
+ *cursor = 0;
+ return (-1ULL);
+}
+#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */
+
+#if defined(WITH_DF_BLOCK_ALLOCATOR)
+/*
+ * ==========================================================================
+ * Dynamic Fit (df) block allocator
+ *
+ * Search for a free chunk of at least this size, starting from the last
+ * offset (for this block alignment), looking for up to
+ * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not
+ * found within 16MB, then return a free chunk of exactly the requested size (or
+ * larger).
+ *
+ * If it seems like searching from the last offset will be unproductive, skip
+ * that and just return a free chunk of exactly the requested size (or larger).
+ * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This
+ * mechanism is probably not very useful and may be removed in the future.
+ *
+ * The behavior when not searching can be changed to return the largest free
+ * chunk, instead of a free chunk of exactly the requested size, by setting
+ * metaslab_df_use_largest_segment.
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_df_alloc(metaslab_t *msp, uint64_t size)
+{
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area of the metaslab (i.e. same cursor
+	 * bucket), but it does not prevent allocations of other sizes from
+	 * landing in the same region.
+ */
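+	/*
+	 * For example, a 12K request has align = 4K (the largest power of
+	 * two dividing 12K), so it shares a cursor with other request sizes
+	 * that are 4K-aligned but not 8K-aligned.
+	 */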
+ uint64_t align = size & -size;
+ uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+ range_tree_t *rt = msp->ms_allocatable;
+ int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
+ uint64_t offset;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * If we're running low on space, find a segment based on size,
+ * rather than iterating based on offset.
+ */
+ if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
+ free_pct < metaslab_df_free_pct) {
+ offset = -1;
+ } else {
+ offset = metaslab_block_picker(rt,
+ cursor, size, metaslab_df_max_search);
+ }
+
+ if (offset == -1) {
+ range_seg_t *rs;
+ if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
+
+ if (metaslab_df_use_largest_segment) {
+ /* use largest free segment */
+ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
+ } else {
+ zfs_btree_index_t where;
+ /* use segment of this size, or next largest */
+ rs = metaslab_block_find(&msp->ms_allocatable_by_size,
+ rt, msp->ms_start, size, &where);
+ }
+ if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
+ rt)) {
+ offset = rs_get_start(rs, rt);
+ *cursor = offset + size;
+ }
+ }
+
+ return (offset);
+}
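+
+/*
+ * Editorial example (not part of the original source): for a 24K (0x6000)
+ * request, size & -size is 0x2000 (8K), so the DF allocator uses the cursor
+ * at ms_lbas[highbit64(0x2000) - 1] = ms_lbas[13]. All requests whose sizes
+ * share that largest power-of-two factor share this cursor, which keeps
+ * similarly aligned allocations clustered in the same region of the
+ * metaslab.
+ */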
+
+static metaslab_ops_t metaslab_df_ops = {
+ metaslab_df_alloc
+};
+
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+#endif /* WITH_DF_BLOCK_ALLOCATOR */
+
+#if defined(WITH_CF_BLOCK_ALLOCATOR)
+/*
+ * ==========================================================================
+ * Cursor fit block allocator -
+ * Select the largest region in the metaslab, set the cursor to the beginning
+ * of the range and the cursor_end to the end of the range. As allocations
+ * are made, advance the cursor. Continue allocating from the cursor until
+ * the range is exhausted and then find a new range.
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
+{
+ range_tree_t *rt = msp->ms_allocatable;
+ zfs_btree_t *t = &msp->ms_allocatable_by_size;
+ uint64_t *cursor = &msp->ms_lbas[0];
+ uint64_t *cursor_end = &msp->ms_lbas[1];
+ uint64_t offset = 0;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ ASSERT3U(*cursor_end, >=, *cursor);
+
+ if ((*cursor + size) > *cursor_end) {
+ range_seg_t *rs;
+
+ if (zfs_btree_numnodes(t) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
+ rs = zfs_btree_last(t, NULL);
+ if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
+ size)
+ return (-1ULL);
+
+ *cursor = rs_get_start(rs, rt);
+ *cursor_end = rs_get_end(rs, rt);
+ }
+
+ offset = *cursor;
+ *cursor += size;
+
+ return (offset);
+}
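+
+/*
+ * Editorial example (not part of the original source): if the largest free
+ * segment is [0x100000, 0x180000), the CF allocator sets *cursor to
+ * 0x100000 and *cursor_end to 0x180000. A series of 0x20000-byte requests
+ * then returns 0x100000, 0x120000, 0x140000 and 0x160000, each advancing
+ * the cursor; the next request no longer fits, so a new largest segment is
+ * selected from ms_allocatable_by_size.
+ */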
+
+static metaslab_ops_t metaslab_cf_ops = {
+ metaslab_cf_alloc
+};
+
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops;
+#endif /* WITH_CF_BLOCK_ALLOCATOR */
+
+#if defined(WITH_NDF_BLOCK_ALLOCATOR)
+/*
+ * ==========================================================================
+ * New dynamic fit allocator -
+ * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
+ * contiguous blocks. If no region is found then just use the largest segment
+ * that remains.
+ * ==========================================================================
+ */
+
+/*
+ * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
+ * to request from the allocator.
+ */
+uint64_t metaslab_ndf_clump_shift = 4;
+
+static uint64_t
+metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
+{
+ zfs_btree_t *t = &msp->ms_allocatable->rt_root;
+ range_tree_t *rt = msp->ms_allocatable;
+ zfs_btree_index_t where;
+ range_seg_t *rs;
+ range_seg_max_t rsearch;
+ uint64_t hbit = highbit64(size);
+ uint64_t *cursor = &msp->ms_lbas[hbit - 1];
+ uint64_t max_size = metaslab_largest_allocatable(msp);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if (max_size < size)
+ return (-1ULL);
+
+ rs_set_start(&rsearch, rt, *cursor);
+ rs_set_end(&rsearch, rt, *cursor + size);
+
+ rs = zfs_btree_find(t, &rsearch, &where);
+ if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
+ t = &msp->ms_allocatable_by_size;
+
+ rs_set_start(&rsearch, rt, 0);
+ rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
+ metaslab_ndf_clump_shift)));
+
+ rs = zfs_btree_find(t, &rsearch, &where);
+ if (rs == NULL)
+ rs = zfs_btree_next(t, &where, &where);
+ ASSERT(rs != NULL);
+ }
+
+ if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
+ *cursor = rs_get_start(rs, rt) + size;
+ return (rs_get_start(rs, rt));
+ }
+ return (-1ULL);
+}
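+
+/*
+ * Editorial example (not part of the original source): with
+ * metaslab_ndf_clump_shift = 4, a 128K request (hbit = 18) that cannot be
+ * satisfied at the current cursor falls back to the size-sorted tree and
+ * looks for a segment of up to MIN(max_size, 1 << (18 + 4)) = 4M, so that
+ * subsequent allocations of this size class can be carved from the same
+ * contiguous region.
+ */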
+
+static metaslab_ops_t metaslab_ndf_ops = {
+ metaslab_ndf_alloc
+};
+
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
+#endif /* WITH_NDF_BLOCK_ALLOCATOR */
+
+
+/*
+ * ==========================================================================
+ * Metaslabs
+ * ==========================================================================
+ */
+
+/*
+ * Wait for any in-progress metaslab loads to complete.
+ */
+static void
+metaslab_load_wait(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ while (msp->ms_loading) {
+ ASSERT(!msp->ms_loaded);
+ cv_wait(&msp->ms_load_cv, &msp->ms_lock);
+ }
+}
+
+/*
+ * Wait for any in-progress flushing to complete.
+ */
+static void
+metaslab_flush_wait(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ while (msp->ms_flushing)
+ cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
+}
+
+static unsigned int
+metaslab_idx_func(multilist_t *ml, void *arg)
+{
+ metaslab_t *msp = arg;
+ return (msp->ms_id % multilist_get_num_sublists(ml));
+}
+
+uint64_t
+metaslab_allocated_space(metaslab_t *msp)
+{
+ return (msp->ms_allocated_space);
+}
+
+/*
+ * Verify that the space accounting on disk matches the in-core range_trees.
+ */
+static void
+metaslab_verify_space(metaslab_t *msp, uint64_t txg)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ uint64_t allocating = 0;
+ uint64_t sm_free_space, msp_free_space;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(!msp->ms_condensing);
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /*
+ * We can only verify the metaslab space when we're called
+ * from syncing context with a loaded metaslab that has an
+ * allocated space map. Calling this in non-syncing context
+ * does not provide a consistent view of the metaslab since
+ * we're performing allocations in the future.
+ */
+ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
+ !msp->ms_loaded)
+ return;
+
+ /*
+ * Even though the smp_alloc field can get negative,
+ * when it comes to a metaslab's space map, that should
+ * never be the case.
+ */
+ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
+
+ ASSERT3U(space_map_allocated(msp->ms_sm), >=,
+ range_tree_space(msp->ms_unflushed_frees));
+
+ ASSERT3U(metaslab_allocated_space(msp), ==,
+ space_map_allocated(msp->ms_sm) +
+ range_tree_space(msp->ms_unflushed_allocs) -
+ range_tree_space(msp->ms_unflushed_frees));
+
+ sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
+
+ /*
+ * Account for future allocations since we would have
+ * already deducted that space from the ms_allocatable.
+ */
+ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
+ allocating +=
+ range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
+ }
+ ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
+ msp->ms_allocating_total);
+
+ ASSERT3U(msp->ms_deferspace, ==,
+ range_tree_space(msp->ms_defer[0]) +
+ range_tree_space(msp->ms_defer[1]));
+
+ msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
+ msp->ms_deferspace + range_tree_space(msp->ms_freed);
+
+ VERIFY3U(sm_free_space, ==, msp_free_space);
+}
+
+static void
+metaslab_aux_histograms_clear(metaslab_t *msp)
+{
+ /*
+ * Auxiliary histograms are only cleared when resetting them,
+ * which can only happen while the metaslab is loaded.
+ */
+ ASSERT(msp->ms_loaded);
+
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
+}
+
+static void
+metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
+ range_tree_t *rt)
+{
+ /*
+ * This is modeled after space_map_histogram_add(), so refer to that
+ * function for implementation details. We want this to work like
+ * the space map histogram, and not the range tree histogram, as we
+ * are essentially constructing a delta that will be later subtracted
+ * from the space map histogram.
+ */
+ int idx = 0;
+ for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ ASSERT3U(i, >=, idx + shift);
+ histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
+
+ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+ ASSERT3U(idx + shift, ==, i);
+ idx++;
+ ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+ }
+ }
+}
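+
+/*
+ * Editorial example (not part of the original source): with a space map
+ * shift of 9 (512-byte sectors), a free segment counted in range-tree
+ * bucket i = 12 (sizes in [4K, 8K)) is added to histogram[12 - 9] =
+ * histogram[3]. Range-tree buckets beyond the last space map bucket are
+ * folded into that last bucket, scaled by 1 << (i - idx - shift) so the
+ * total space they represent is preserved, mirroring
+ * space_map_histogram_add().
+ */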
+
+/*
+ * Called at every sync pass that the metaslab gets synced.
+ *
+ * The reason is that we want our auxiliary histograms to be updated
+ * whenever the metaslab's space map histogram is updated. This way
+ * we stay consistent on which parts of the metaslab space map's
+ * histogram are currently not available for allocations (e.g. because
+ * they are in the defer, freed, and freeing trees).
+ */
+static void
+metaslab_aux_histograms_update(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(sm != NULL);
+
+ /*
+ * This is similar to the metaslab's space map histogram updates
+ * that take place in metaslab_sync(). The only difference is that
+ * we only care about segments that haven't made it into the
+ * ms_allocatable tree yet.
+ */
+ if (msp->ms_loaded) {
+ metaslab_aux_histograms_clear(msp);
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freed);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ metaslab_aux_histogram_add(msp->ms_deferhist[t],
+ sm->sm_shift, msp->ms_defer[t]);
+ }
+ }
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freeing);
+}
+
+/*
+ * Called every time we are done syncing (writing to) the metaslab,
+ * i.e. at the end of each sync pass.
+ * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
+ */
+static void
+metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ space_map_t *sm = msp->ms_sm;
+
+ if (sm == NULL) {
+ /*
+ * We came here from metaslab_init() when creating/opening a
+ * pool, looking at a metaslab that hasn't had any allocations
+ * yet.
+ */
+ return;
+ }
+
+ /*
+ * This is similar to the actions that we take for the ms_freed
+ * and ms_defer trees in metaslab_sync_done().
+ */
+ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
+ if (defer_allowed) {
+ bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_synchist));
+ } else {
+ bzero(msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_deferhist[hist_index]));
+ }
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+}
+
+/*
+ * Ensure that the metaslab's weight and fragmentation are consistent
+ * with the contents of the histogram (either the range tree's histogram
+ * or the space map's depending whether the metaslab is loaded).
+ */
+static void
+metaslab_verify_weight_and_frag(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /*
+ * We can end up here from vdev_remove_complete(), in which case we
+ * cannot do these assertions because we hold spa config locks and
+ * thus we are not allowed to read from the DMU.
+ *
+ * We check if the metaslab group has been removed and if that's
+ * the case we return immediately as that would mean that we are
+ * here from the aforementioned code path.
+ */
+ if (msp->ms_group == NULL)
+ return;
+
+ /*
+ * Devices being removed always return a weight of 0 and leave
+ * fragmentation and ms_max_size as is - there is nothing for
+ * us to verify here.
+ */
+ vdev_t *vd = msp->ms_group->mg_vd;
+ if (vd->vdev_removing)
+ return;
+
+ /*
+ * If the metaslab is dirty it probably means that we've done
+ * some allocations or frees that have changed our histograms
+ * and thus the weight.
+ */
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (txg_list_member(&vd->vdev_ms_list, msp, t))
+ return;
+ }
+
+ /*
+ * This verification checks that our in-memory state is consistent
+ * with what's on disk. If the pool is read-only then there aren't
+ * any changes and we just have the initially-loaded state.
+ */
+ if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
+ return;
+
+	/* some extra verification for the in-core tree, if possible */
+ if (msp->ms_loaded) {
+ range_tree_stat_verify(msp->ms_allocatable);
+ VERIFY(space_map_histogram_verify(msp->ms_sm,
+ msp->ms_allocatable));
+ }
+
+ uint64_t weight = msp->ms_weight;
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
+ uint64_t frag = msp->ms_fragmentation;
+ uint64_t max_segsize = msp->ms_max_size;
+
+ msp->ms_weight = 0;
+ msp->ms_fragmentation = 0;
+
+ /*
+ * This function is used for verification purposes and thus should
+ * not introduce any side-effects/mutations on the system's state.
+ *
+ * Regardless of whether metaslab_weight() thinks this metaslab
+ * should be active or not, we want to ensure that the actual weight
+ * (and therefore the value of ms_weight) would be the same if it
+ * was to be recalculated at this point.
+ *
+ * In addition we set the nodirty flag so metaslab_weight() does
+ * not dirty the metaslab for future TXGs (e.g. when trying to
+ * force condensing to upgrade the metaslab spacemaps).
+ */
+ msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
+
+ VERIFY3U(max_segsize, ==, msp->ms_max_size);
+
+ /*
+ * If the weight type changed then there is no point in doing
+ * verification. Revert fields to their original values.
+ */
+ if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
+ (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
+ msp->ms_fragmentation = frag;
+ msp->ms_weight = weight;
+ return;
+ }
+
+ VERIFY3U(msp->ms_fragmentation, ==, frag);
+ VERIFY3U(msp->ms_weight, ==, weight);
+}
+
+/*
+ * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
+ * this class that was used longest ago, and attempt to unload it. We don't
+ * want to spend too much time in this loop to prevent performance
+ * degradation, and we expect that most of the time this operation will
+ * succeed. Between that and the normal unloading processing during txg sync,
+ * we expect this to keep the metaslab memory usage under control.
+ */
+static void
+metaslab_potentially_evict(metaslab_class_t *mc)
+{
+#ifdef _KERNEL
+ uint64_t allmem = arc_all_memory();
+ uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
+ uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
+ int tries = 0;
+ for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
+ tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
+ tries++) {
+ unsigned int idx = multilist_get_random_index(
+ mc->mc_metaslab_txg_list);
+ multilist_sublist_t *mls =
+ multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
+ metaslab_t *msp = multilist_sublist_head(mls);
+ multilist_sublist_unlock(mls);
+ while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
+ inuse * size) {
+ VERIFY3P(mls, ==, multilist_sublist_lock(
+ mc->mc_metaslab_txg_list, idx));
+ ASSERT3U(idx, ==,
+ metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
+
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ multilist_sublist_unlock(mls);
+ break;
+ }
+ metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+ multilist_sublist_unlock(mls);
+ /*
+ * If the metaslab is currently loading there are two
+ * cases. If it's the metaslab we're evicting, we
+ * can't continue on or we'll panic when we attempt to
+ * recursively lock the mutex. If it's another
+ * metaslab that's loading, it can be safely skipped,
+ * since we know it's very new and therefore not a
+ * good eviction candidate. We check later once the
+ * lock is held that the metaslab is fully loaded
+ * before actually unloading it.
+ */
+ if (msp->ms_loading) {
+ msp = next_msp;
+ inuse =
+ spl_kmem_cache_inuse(zfs_btree_leaf_cache);
+ continue;
+ }
+ /*
+ * We can't unload metaslabs with no spacemap because
+ * they're not ready to be unloaded yet. We can't
+ * unload metaslabs with outstanding allocations
+ * because doing so could cause the metaslab's weight
+ * to decrease while it's unloaded, which violates an
+ * invariant that we use to prevent unnecessary
+ * loading. We also don't unload metaslabs that are
+ * currently active because they are high-weight
+ * metaslabs that are likely to be used in the near
+ * future.
+ */
+ mutex_enter(&msp->ms_lock);
+ if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
+ msp->ms_allocating_total == 0) {
+ metaslab_unload(msp);
+ }
+ mutex_exit(&msp->ms_lock);
+ msp = next_msp;
+ inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
+ }
+ }
+#endif
+}
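+
+/*
+ * Editorial example (not part of the original source): if arc_all_memory()
+ * reports 16GB and zfs_metaslab_mem_limit is 25, eviction is attempted
+ * whenever the btree leaf cache (inuse * size) exceeds 4GB, and the loop
+ * keeps walking randomly chosen sublists of mc_metaslab_txg_list (up to
+ * twice the number of sublists) until usage drops back under that limit or
+ * the candidates are exhausted.
+ */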
+
+static int
+metaslab_load_impl(metaslab_t *msp)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
+
+ /*
+ * We temporarily drop the lock to unblock other operations while we
+ * are reading the space map. Therefore, metaslab_sync() and
+ * metaslab_sync_done() can run at the same time as we do.
+ *
+ * If we are using the log space maps, metaslab_sync() can't write to
+ * the metaslab's space map while we are loading as we only write to
+ * it when we are flushing the metaslab, and that can't happen while
+ * we are loading it.
+ *
+ * If we are not using log space maps though, metaslab_sync() can
+ * append to the space map while we are loading. Therefore we load
+ * only entries that existed when we started the load. Additionally,
+ * metaslab_sync_done() has to wait for the load to complete because
+ * there are potential races like metaslab_load() loading parts of the
+ * space map that are currently being appended by metaslab_sync(). If
+ * we didn't, the ms_allocatable would have entries that
+ * metaslab_sync_done() would try to re-add later.
+ *
+ * That's why before dropping the lock we remember the synced length
+ * of the metaslab and read up to that point of the space map,
+ * ignoring entries appended by metaslab_sync() that happen after we
+ * drop the lock.
+ */
+ uint64_t length = msp->ms_synced_length;
+ mutex_exit(&msp->ms_lock);
+
+ hrtime_t load_start = gethrtime();
+ metaslab_rt_arg_t *mrap;
+ if (msp->ms_allocatable->rt_arg == NULL) {
+ mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+ } else {
+ mrap = msp->ms_allocatable->rt_arg;
+ msp->ms_allocatable->rt_ops = NULL;
+ msp->ms_allocatable->rt_arg = NULL;
+ }
+ mrap->mra_bt = &msp->ms_allocatable_by_size;
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+
+ if (msp->ms_sm != NULL) {
+ error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
+ SM_FREE, length);
+
+ /* Now, populate the size-sorted tree. */
+ metaslab_rt_create(msp->ms_allocatable, mrap);
+ msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+ msp->ms_allocatable->rt_arg = mrap;
+
+ struct mssa_arg arg = {0};
+ arg.rt = msp->ms_allocatable;
+ arg.mra = mrap;
+ range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
+ &arg);
+ } else {
+ /*
+ * Add the size-sorted tree first, since we don't need to load
+ * the metaslab from the spacemap.
+ */
+ metaslab_rt_create(msp->ms_allocatable, mrap);
+ msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+ msp->ms_allocatable->rt_arg = mrap;
+ /*
+ * The space map has not been allocated yet, so treat
+ * all the space in the metaslab as free and add it to the
+ * ms_allocatable tree.
+ */
+ range_tree_add(msp->ms_allocatable,
+ msp->ms_start, msp->ms_size);
+
+ if (msp->ms_freed != NULL) {
+ /*
+ * If the ms_sm doesn't exist, this means that this
+ * metaslab hasn't gone through metaslab_sync() and
+ * thus has never been dirtied. So we shouldn't
+ * expect any unflushed allocs or frees from previous
+ * TXGs.
+ *
+ * Note: ms_freed and all the other trees except for
+			 * the ms_allocatable can be NULL at this point only
+ * if this is a new metaslab of a vdev that just got
+ * expanded.
+ */
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+ }
+ }
+
+ /*
+ * We need to grab the ms_sync_lock to prevent metaslab_sync() from
+ * changing the ms_sm (or log_sm) and the metaslab's range trees
+ * while we are about to use them and populate the ms_allocatable.
+ * The ms_lock is insufficient for this because metaslab_sync() doesn't
+ * hold the ms_lock while writing the ms_checkpointing tree to disk.
+ */
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ ASSERT(!msp->ms_condensing);
+ ASSERT(!msp->ms_flushing);
+
+ if (error != 0) {
+ mutex_exit(&msp->ms_sync_lock);
+ return (error);
+ }
+
+ ASSERT3P(msp->ms_group, !=, NULL);
+ msp->ms_loaded = B_TRUE;
+
+ /*
+ * Apply all the unflushed changes to ms_allocatable right
+ * away so any manipulations we do below have a clear view
+ * of what is allocated and what is free.
+ */
+ range_tree_walk(msp->ms_unflushed_allocs,
+ range_tree_remove, msp->ms_allocatable);
+ range_tree_walk(msp->ms_unflushed_frees,
+ range_tree_add, msp->ms_allocatable);
+
+ msp->ms_loaded = B_TRUE;
+
+ ASSERT3P(msp->ms_group, !=, NULL);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ if (spa_syncing_log_sm(spa) != NULL) {
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LOG_SPACEMAP));
+
+ /*
+ * If we use a log space map we add all the segments
+ * that are in ms_unflushed_frees so they are available
+ * for allocation.
+ *
+ * ms_allocatable needs to contain all free segments
+ * that are ready for allocations (thus not segments
+ * from ms_freeing, ms_freed, and the ms_defer trees).
+ * But if we grab the lock in this code path at a sync
+		 * pass later than 1, then it also contains the
+ * segments of ms_freed (they were added to it earlier
+ * in this path through ms_unflushed_frees). So we
+ * need to remove all the segments that exist in
+ * ms_freed from ms_allocatable as they will be added
+ * later in metaslab_sync_done().
+ *
+ * When there's no log space map, the ms_allocatable
+ * correctly doesn't contain any segments that exist
+ * in ms_freed [see ms_synced_length].
+ */
+ range_tree_walk(msp->ms_freed,
+ range_tree_remove, msp->ms_allocatable);
+ }
+
+ /*
+ * If we are not using the log space map, ms_allocatable
+ * contains the segments that exist in the ms_defer trees
+ * [see ms_synced_length]. Thus we need to remove them
+ * from ms_allocatable as they will be added again in
+ * metaslab_sync_done().
+ *
+ * If we are using the log space map, ms_allocatable still
+	 * contains the segments that exist in the ms_defer trees,
+	 * not because it read them through the ms_sm, but because
+	 * these segments are part of ms_unflushed_frees, which we
+	 * added to ms_allocatable earlier in this code path.
+ */
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
+ }
+
+ /*
+ * Call metaslab_recalculate_weight_and_sort() now that the
+ * metaslab is loaded so we get the metaslab's real weight.
+ *
+ * Unless this metaslab was created with older software and
+ * has not yet been converted to use segment-based weight, we
+ * expect the new weight to be better or equal to the weight
+ * that the metaslab had while it was not loaded. This is
+ * because the old weight does not take into account the
+ * consolidation of adjacent segments between TXGs. [see
+ * comment for ms_synchist and ms_deferhist[] for more info]
+ */
+ uint64_t weight = msp->ms_weight;
+ uint64_t max_size = msp->ms_max_size;
+ metaslab_recalculate_weight_and_sort(msp);
+ if (!WEIGHT_IS_SPACEBASED(weight))
+ ASSERT3U(weight, <=, msp->ms_weight);
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
+ ASSERT3U(max_size, <=, msp->ms_max_size);
+ hrtime_t load_end = gethrtime();
+ msp->ms_load_time = load_end;
+ zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
+ "ms_id %llu, smp_length %llu, "
+ "unflushed_allocs %llu, unflushed_frees %llu, "
+ "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
+ "loading_time %lld ms, ms_max_size %llu, "
+ "max size error %lld, "
+ "old_weight %llx, new_weight %llx",
+ spa_syncing_txg(spa), spa_name(spa),
+ msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+ space_map_length(msp->ms_sm),
+ range_tree_space(msp->ms_unflushed_allocs),
+ range_tree_space(msp->ms_unflushed_frees),
+ range_tree_space(msp->ms_freed),
+ range_tree_space(msp->ms_defer[0]),
+ range_tree_space(msp->ms_defer[1]),
+ (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
+ (longlong_t)((load_end - load_start) / 1000000),
+ msp->ms_max_size, msp->ms_max_size - max_size,
+ weight, msp->ms_weight);
+
+ metaslab_verify_space(msp, spa_syncing_txg(spa));
+ mutex_exit(&msp->ms_sync_lock);
+ return (0);
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * There may be another thread loading the same metaslab, if that's
+ * the case just wait until the other thread is done and return.
+ */
+ metaslab_load_wait(msp);
+ if (msp->ms_loaded)
+ return (0);
+ VERIFY(!msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
+
+ /*
+ * We set the loading flag BEFORE potentially dropping the lock to
+ * wait for an ongoing flush (see ms_flushing below). This way other
+ * threads know that there is already a thread that is loading this
+ * metaslab.
+ */
+ msp->ms_loading = B_TRUE;
+
+ /*
+ * Wait for any in-progress flushing to finish as we drop the ms_lock
+ * both here (during space_map_load()) and in metaslab_flush() (when
+ * we flush our changes to the ms_sm).
+ */
+ if (msp->ms_flushing)
+ metaslab_flush_wait(msp);
+
+ /*
+	 * In case we were waiting for the metaslab to be
+ * flushed (where we temporarily dropped the ms_lock), ensure that
+ * no one else loaded the metaslab somehow.
+ */
+ ASSERT(!msp->ms_loaded);
+
+ /*
+ * If we're loading a metaslab in the normal class, consider evicting
+ * another one to keep our memory usage under the limit defined by the
+ * zfs_metaslab_mem_limit tunable.
+ */
+ if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
+ msp->ms_group->mg_class) {
+ metaslab_potentially_evict(msp->ms_group->mg_class);
+ }
+
+ int error = metaslab_load_impl(msp);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ msp->ms_loading = B_FALSE;
+ cv_broadcast(&msp->ms_load_cv);
+
+ return (error);
+}
+
+void
+metaslab_unload(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * This can happen if a metaslab is selected for eviction (in
+ * metaslab_potentially_evict) and then unloaded during spa_sync (via
+ * metaslab_class_evict_old).
+ */
+ if (!msp->ms_loaded)
+ return;
+
+ range_tree_vacate(msp->ms_allocatable, NULL, NULL);
+ msp->ms_loaded = B_FALSE;
+ msp->ms_unload_time = gethrtime();
+
+ msp->ms_activation_weight = 0;
+ msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+
+ if (msp->ms_group != NULL) {
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ multilist_sublist_unlock(mls);
+
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
+ "ms_id %llu, weight %llx, "
+ "selected txg %llu (%llu ms ago), alloc_txg %llu, "
+ "loaded %llu ms ago, max_size %llu",
+ spa_syncing_txg(spa), spa_name(spa),
+ msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+ msp->ms_weight,
+ msp->ms_selected_txg,
+ (msp->ms_unload_time - msp->ms_selected_time) / 1000 / 1000,
+ msp->ms_alloc_txg,
+ (msp->ms_unload_time - msp->ms_load_time) / 1000 / 1000,
+ msp->ms_max_size);
+ }
+
+ /*
+ * We explicitly recalculate the metaslab's weight based on its space
+	 * map (as it is now not loaded). We want unloaded metaslabs to always
+ * have their weights calculated from the space map histograms, while
+ * loaded ones have it calculated from their in-core range tree
+ * [see metaslab_load()]. This way, the weight reflects the information
+ * available in-core, whether it is loaded or not.
+ *
+	 * If ms_group == NULL, it means that we came here from metaslab_fini(),
+ * at which point it doesn't make sense for us to do the recalculation
+ * and the sorting.
+ */
+ if (msp->ms_group != NULL)
+ metaslab_recalculate_weight_and_sort(msp);
+}
+
+/*
+ * We want to optimize the memory use of the per-metaslab range
+ * trees. To do this, we store the segments in the range trees in
+ * units of sectors, zero-indexing from the start of the metaslab. If
+ * vdev_ms_shift - vdev_ashift is less than 32, we can store
+ * the ranges using two uint32_ts, rather than two uint64_ts.
+ */
+range_seg_type_t
+metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
+ uint64_t *start, uint64_t *shift)
+{
+ if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
+ !zfs_metaslab_force_large_segs) {
+ *shift = vdev->vdev_ashift;
+ *start = msp->ms_start;
+ return (RANGE_SEG32);
+ } else {
+ *shift = 0;
+ *start = 0;
+ return (RANGE_SEG64);
+ }
+}
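+
+/*
+ * Editorial example (not part of the original source): for a vdev with
+ * vdev_ms_shift = 34 (16GB metaslabs) and vdev_ashift = 12 (4K sectors),
+ * 34 - 12 = 22 < 32, so segments are stored as RANGE_SEG32 offsets in 4K
+ * units relative to ms_start, roughly halving the memory used per segment
+ * compared to RANGE_SEG64.
+ */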
+
+void
+metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ msp->ms_selected_txg = txg;
+ msp->ms_selected_time = gethrtime();
+ multilist_sublist_insert_tail(mls, msp);
+ multilist_sublist_unlock(mls);
+}
+
+void
+metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta)
+{
+ vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
+
+ ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
+ vdev_deflated_space(vd, space_delta));
+}
+
+int
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
+ uint64_t txg, metaslab_t **msp)
+{
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ metaslab_t *ms;
+ int error;
+
+ ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+ mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
+ multilist_link_init(&ms->ms_class_txg_node);
+
+ ms->ms_id = id;
+ ms->ms_start = id << vd->vdev_ms_shift;
+ ms->ms_size = 1ULL << vd->vdev_ms_shift;
+ ms->ms_allocator = -1;
+ ms->ms_new = B_TRUE;
+
+ vdev_ops_t *ops = vd->vdev_ops;
+ if (ops->vdev_op_metaslab_init != NULL)
+ ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
+
+ /*
+ * We only open space map objects that already exist. All others
+ * will be opened when we finally allocate an object for it.
+ *
+ * Note:
+ * When called from vdev_expand(), we can't call into the DMU as
+ * we are holding the spa_config_lock as a writer and we would
+	 * deadlock [see relevant comment in vdev_metaslab_init()]. In
+	 * that case, though, the object parameter is zero, so we won't
+ * call into the DMU.
+ */
+ if (object != 0) {
+ error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
+ ms->ms_size, vd->vdev_ashift);
+
+ if (error != 0) {
+ kmem_free(ms, sizeof (metaslab_t));
+ return (error);
+ }
+
+ ASSERT(ms->ms_sm != NULL);
+ ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
+ }
+
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
+
+ /*
+ * We create the ms_allocatable here, but we don't create the
+ * other range trees until metaslab_sync_done(). This serves
+ * two purposes: it allows metaslab_sync_done() to detect the
+ * addition of new space; and for debugging, it ensures that
+	 * we'd take a data fault on any attempt to use this metaslab before
+ * it's ready.
+ */
+ ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
+
+ ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
+
+ metaslab_group_add(mg, ms);
+ metaslab_set_fragmentation(ms, B_FALSE);
+
+ /*
+ * If we're opening an existing pool (txg == 0) or creating
+ * a new one (txg == TXG_INITIAL), all space is available now.
+ * If we're adding space to an existing pool, the new space
+ * does not become available until after this txg has synced.
+ * The metaslab's weight will also be initialized when we sync
+ * out this txg. This ensures that we don't attempt to allocate
+ * from it before we have initialized it completely.
+ */
+ if (txg <= TXG_INITIAL) {
+ metaslab_sync_done(ms, 0);
+ metaslab_space_update(vd, mg->mg_class,
+ metaslab_allocated_space(ms), 0, 0);
+ }
+
+ if (txg != 0) {
+ vdev_dirty(vd, 0, NULL, txg);
+ vdev_dirty(vd, VDD_METASLAB, ms, txg);
+ }
+
+ *msp = ms;
+
+ return (0);
+}
+
+static void
+metaslab_fini_flush_data(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ if (metaslab_unflushed_txg(msp) == 0) {
+ ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
+ ==, NULL);
+ return;
+ }
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_remove(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+
+ spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
+ spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
+}
+
+uint64_t
+metaslab_unflushed_changes_memused(metaslab_t *ms)
+{
+ return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
+ range_tree_numsegs(ms->ms_unflushed_frees)) *
+ ms->ms_unflushed_allocs->rt_root.bt_elem_size);
+}
+
+void
+metaslab_fini(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+
+ metaslab_fini_flush_data(msp);
+
+ metaslab_group_remove(mg, msp);
+
+ mutex_enter(&msp->ms_lock);
+ VERIFY(msp->ms_group == NULL);
+ /*
+ * If the range trees haven't been allocated, this metaslab hasn't
+ * been through metaslab_sync_done() for the first time yet, so its
+ * space hasn't been accounted for in its vdev and doesn't need to be
+ * subtracted.
+ */
+ if (msp->ms_freed != NULL) {
+ metaslab_space_update(vd, mg->mg_class,
+ -metaslab_allocated_space(msp), 0, -msp->ms_size);
+ }
+ space_map_close(msp->ms_sm);
+ msp->ms_sm = NULL;
+
+ metaslab_unload(msp);
+
+ range_tree_destroy(msp->ms_allocatable);
+
+ if (msp->ms_freed != NULL) {
+ range_tree_destroy(msp->ms_freeing);
+ range_tree_destroy(msp->ms_freed);
+
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_destroy(msp->ms_unflushed_allocs);
+ range_tree_destroy(msp->ms_checkpointing);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+ range_tree_destroy(msp->ms_unflushed_frees);
+
+ for (int t = 0; t < TXG_SIZE; t++) {
+ range_tree_destroy(msp->ms_allocating[t]);
+ }
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_destroy(msp->ms_defer[t]);
+ }
+ }
+ ASSERT0(msp->ms_deferspace);
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
+
+ range_tree_vacate(msp->ms_trim, NULL, NULL);
+ range_tree_destroy(msp->ms_trim);
+
+ mutex_exit(&msp->ms_lock);
+ cv_destroy(&msp->ms_load_cv);
+ cv_destroy(&msp->ms_flush_cv);
+ mutex_destroy(&msp->ms_lock);
+ mutex_destroy(&msp->ms_sync_lock);
+ ASSERT3U(msp->ms_allocator, ==, -1);
+
+ kmem_free(msp, sizeof (metaslab_t));
+}
+
+#define FRAGMENTATION_TABLE_SIZE 17
+
+/*
+ * This table defines a segment size based fragmentation metric that will
+ * allow each metaslab to derive its own fragmentation value. This is done
+ * by calculating the space in each bucket of the spacemap histogram and
+ * multiplying that by the fragmentation metric in this table. Doing
+ * this for all buckets and dividing it by the total amount of free
+ * space in this metaslab (i.e. the total free space in all buckets) gives
+ * us the fragmentation metric. This means that a high fragmentation metric
+ * equates to most of the free space being comprised of small segments.
+ * Conversely, if the metric is low, then most of the free space is in
+ * large segments. A 10% change in fragmentation equates to approximately
+ * double the number of segments.
+ *
+ * This table defines 0% fragmented space using 16MB segments. Testing has
+ * shown that segments that are greater than or equal to 16MB do not suffer
+ * from drastic performance problems. Using this value, we derive the rest
+ * of the table. Since the fragmentation value is never stored on disk, it
+ * is possible to change these calculations in the future.
+ */
+int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
+ 100, /* 512B */
+ 100, /* 1K */
+ 98, /* 2K */
+ 95, /* 4K */
+ 90, /* 8K */
+ 80, /* 16K */
+ 70, /* 32K */
+ 60, /* 64K */
+ 50, /* 128K */
+ 40, /* 256K */
+ 30, /* 512K */
+ 20, /* 1M */
+ 15, /* 2M */
+ 10, /* 4M */
+ 5, /* 8M */
+ 0 /* 16M */
+};
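+
+/*
+ * Editorial example (not part of the original source): a metaslab whose
+ * free space is half in 8K segments (table value 90) and half in 1M
+ * segments (table value 20) gets a fragmentation of
+ * (0.5 * 90) + (0.5 * 20) = 55, since each histogram bucket's space is
+ * weighted by its table entry and the sum is divided by the total free
+ * space.
+ */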
+
+/*
+ * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
+ * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
+ * been upgraded and does not support this metric. Otherwise, the value
+ * will be in the range [0, 100].
+ */
+static void
+metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ uint64_t fragmentation = 0;
+ uint64_t total = 0;
+ boolean_t feature_enabled = spa_feature_is_enabled(spa,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM);
+
+ if (!feature_enabled) {
+ msp->ms_fragmentation = ZFS_FRAG_INVALID;
+ return;
+ }
+
+ /*
+ * A null space map means that the entire metaslab is free
+ * and thus is not fragmented.
+ */
+ if (msp->ms_sm == NULL) {
+ msp->ms_fragmentation = 0;
+ return;
+ }
+
+ /*
+ * If this metaslab's space map has not been upgraded, flag it
+ * so that we upgrade next time we encounter it.
+ */
+ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
+ uint64_t txg = spa_syncing_txg(spa);
+ vdev_t *vd = msp->ms_group->mg_vd;
+
+ /*
+ * If we've reached the final dirty txg, then we must
+ * be shutting down the pool. We don't want to dirty
+ * any data past this point so skip setting the condense
+ * flag. We can retry this action the next time the pool
+ * is imported. We also skip marking this metaslab for
+ * condensing if the caller has explicitly set nodirty.
+ */
+ if (!nodirty &&
+ spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
+ msp->ms_condense_wanted = B_TRUE;
+ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+ zfs_dbgmsg("txg %llu, requesting force condense: "
+ "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
+ vd->vdev_id);
+ }
+ msp->ms_fragmentation = ZFS_FRAG_INVALID;
+ return;
+ }
+
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ uint64_t space = 0;
+ uint8_t shift = msp->ms_sm->sm_shift;
+
+ int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
+ FRAGMENTATION_TABLE_SIZE - 1);
+
+ if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
+ continue;
+
+ space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
+ total += space;
+
+ ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
+ fragmentation += space * zfs_frag_table[idx];
+ }
+
+ if (total > 0)
+ fragmentation /= total;
+ ASSERT3U(fragmentation, <=, 100);
+
+ msp->ms_fragmentation = fragmentation;
+}
+
+/*
+ * Compute a weight -- a selection preference value -- for the given metaslab.
+ * This is based on the amount of free space, the level of fragmentation,
+ * the LBA range, and whether the metaslab is loaded.
+ */
+static uint64_t
+metaslab_space_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ uint64_t weight, space;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The baseline weight is the metaslab's free space.
+ */
+ space = msp->ms_size - metaslab_allocated_space(msp);
+
+ if (metaslab_fragmentation_factor_enabled &&
+ msp->ms_fragmentation != ZFS_FRAG_INVALID) {
+ /*
+ * Use the fragmentation information to inversely scale
+ * down the baseline weight. We need to ensure that we
+ * don't exclude this metaslab completely when it's 100%
+ * fragmented. To avoid this we reduce the fragmented value
+ * by 1.
+ */
+ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
+
+ /*
+ * If space < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. The fragmentation metric may have
+ * decreased the space to something smaller than
+ * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
+ * so that we can consume any remaining space.
+ */
+ if (space > 0 && space < SPA_MINBLOCKSIZE)
+ space = SPA_MINBLOCKSIZE;
+ }
+ weight = space;
+
+ /*
+ * Modern disks have uniform bit density and constant angular velocity.
+ * Therefore, the outer recording zones are faster (higher bandwidth)
+ * than the inner zones by the ratio of outer to inner track diameter,
+ * which is typically around 2:1. We account for this by assigning
+ * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
+ * In effect, this means that we'll select the metaslab with the most
+ * free bandwidth rather than simply the one with the most free space.
+ */
+ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
+ weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
+ ASSERT(weight >= space && weight <= 2 * space);
+ }
+
+ /*
+ * If this metaslab is one we're actively using, adjust its
+ * weight to make it preferable to any inactive metaslab so
+ * we'll polish it off. If the fragmentation on this metaslab
+	 * has exceeded our threshold, then don't mark it active.
+ */
+ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
+ msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
+ weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+ }
+
+ WEIGHT_SET_SPACEBASED(weight);
+ return (weight);
+}
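+
+/*
+ * Editorial example (not part of the original source): on a rotational
+ * vdev with 200 metaslabs and LBA weighting enabled, the adjustment above
+ * gives metaslab 0 a weight of 2 * space, metaslab 100 roughly
+ * 1.5 * space, and metaslab 199 just over 1 * space, so outer (faster)
+ * regions are preferred when their free space is comparable.
+ */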
+
+/*
+ * Return the weight of the specified metaslab, according to the segment-based
+ * weighting algorithm. The metaslab must be loaded. This function can
+ * be called within a sync pass since it relies only on the metaslab's
+ * range tree which is always accurate when the metaslab is loaded.
+ */
+static uint64_t
+metaslab_weight_from_range_tree(metaslab_t *msp)
+{
+ uint64_t weight = 0;
+ uint32_t segments = 0;
+
+ ASSERT(msp->ms_loaded);
+
+ for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
+ i--) {
+ uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ segments <<= 1;
+ segments += msp->ms_allocatable->rt_histogram[i];
+
+ /*
+ * The range tree provides more precision than the space map
+ * and must be downgraded so that all values fit within the
+ * space map's histogram. This allows us to compare loaded
+ * vs. unloaded metaslabs to determine which metaslab is
+ * considered "best".
+ */
+ if (i > max_idx)
+ continue;
+
+ if (segments != 0) {
+ WEIGHT_SET_COUNT(weight, segments);
+ WEIGHT_SET_INDEX(weight, i);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Calculate the weight based on the on-disk histogram. Should be applied
+ * only to unloaded metaslabs (i.e. no incoming allocations) in order to
+ * give results consistent with the on-disk state.
+ */
+static uint64_t
+metaslab_weight_from_spacemap(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(!msp->ms_loaded);
+ ASSERT(sm != NULL);
+ ASSERT3U(space_map_object(sm), !=, 0);
+ ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+ /*
+ * Create a joint histogram from all the segments that have made
+ * it to the metaslab's space map histogram, that are not yet
+ * available for allocation because they are still in the freeing
+ * pipeline (e.g. freeing, freed, and defer trees). Then subtract
+ * these segments from the space map's histogram to get a more
+ * accurate weight.
+ */
+ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+ deferspace_histogram[i] += msp->ms_synchist[i];
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ deferspace_histogram[i] += msp->ms_deferhist[t][i];
+ }
+ }
+
+ uint64_t weight = 0;
+ for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
+ ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
+ deferspace_histogram[i]);
+ uint64_t count =
+ sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
+ if (count != 0) {
+ WEIGHT_SET_COUNT(weight, count);
+ WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Compute a segment-based weight for the specified metaslab. The weight
+ * is determined by the highest bucket in the histogram. The information
+ * for the highest bucket is encoded into the weight value.
+ */
+static uint64_t
+metaslab_segment_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ uint64_t weight = 0;
+ uint8_t shift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The metaslab is completely free.
+ */
+ if (metaslab_allocated_space(msp) == 0) {
+ int idx = highbit64(msp->ms_size) - 1;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ if (idx < max_idx) {
+ WEIGHT_SET_COUNT(weight, 1ULL);
+ WEIGHT_SET_INDEX(weight, idx);
+ } else {
+ WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
+ WEIGHT_SET_INDEX(weight, max_idx);
+ }
+ WEIGHT_SET_ACTIVE(weight, 0);
+ ASSERT(!WEIGHT_IS_SPACEBASED(weight));
+ return (weight);
+ }
+
+ ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+ /*
+ * If the metaslab is fully allocated then just make the weight 0.
+ */
+ if (metaslab_allocated_space(msp) == msp->ms_size)
+ return (0);
+ /*
+ * If the metaslab is already loaded, then use the range tree to
+ * determine the weight. Otherwise, we rely on the space map information
+ * to generate the weight.
+ */
+ if (msp->ms_loaded) {
+ weight = metaslab_weight_from_range_tree(msp);
+ } else {
+ weight = metaslab_weight_from_spacemap(msp);
+ }
+
+ /*
+ * If the metaslab was active the last time we calculated its weight
+ * then keep it active. We want to consume the entire region that
+ * is associated with this weight.
+ */
+ if (msp->ms_activation_weight != 0 && weight != 0)
+ WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
+ return (weight);
+}
+
+/*
+ * Determine if we should attempt to allocate from this metaslab. If the
+ * metaslab is loaded, then we can determine if the desired allocation
+ * can be satisfied by looking at the size of the maximum free segment
+ * on that metaslab. Otherwise, we make our decision based on the metaslab's
+ * weight. For segment-based weighting we can determine the maximum
+ * allocation based on the index encoded in its value. For space-based
+ * weights we rely on the entire weight (excluding the weight-type bit).
+ */
+static boolean_t
+metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
+{
+ /*
+ * If the metaslab is loaded, ms_max_size is definitive and we can use
+ * the fast check. If it's not, the ms_max_size is a lower bound (once
+ * set), and we should use the fast check as long as we're not in
+ * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
+ * seconds since the metaslab was unloaded.
+ */
+ if (msp->ms_loaded ||
+ (msp->ms_max_size != 0 && !try_hard && gethrtime() <
+ msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
+ return (msp->ms_max_size >= asize);
+
+ boolean_t should_allocate;
+ if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
+ /*
+ * The metaslab segment weight indicates segments in the
+ * range [2^i, 2^(i+1)), where i is the index in the weight.
+ * Since the asize might be in the middle of the range, we
+ * should attempt the allocation if asize < 2^(i+1).
+ */
+ should_allocate = (asize <
+ 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
+ } else {
+ should_allocate = (asize <=
+ (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
+ }
+
+ return (should_allocate);
+}
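+
+/*
+ * Editorial example (not part of the original source): a segment-based
+ * weight with WEIGHT_GET_INDEX() == 17 advertises free segments in
+ * [128K, 256K), so any asize strictly below 256K is worth attempting,
+ * while a 512K request is rejected without loading the metaslab. For a
+ * space-based weight the comparison is simply against the free-space
+ * value encoded in the weight (excluding the weight-type bit).
+ */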
+
+static uint64_t
+metaslab_weight(metaslab_t *msp, boolean_t nodirty)
+{
+ vdev_t *vd = msp->ms_group->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ uint64_t weight;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ metaslab_set_fragmentation(msp, nodirty);
+
+ /*
+ * Update the maximum size. If the metaslab is loaded, this will
+ * ensure that we get an accurate maximum size if newly freed space
+ * has been added back into the free tree. If the metaslab is
+ * unloaded, we check if there's a larger free segment in the
+ * unflushed frees. This is a lower bound on the largest allocatable
+ * segment size. Coalescing of adjacent entries may reveal larger
+ * allocatable segments, but we aren't aware of those until loading
+ * the space map into a range tree.
+ */
+ if (msp->ms_loaded) {
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
+ } else {
+ msp->ms_max_size = MAX(msp->ms_max_size,
+ metaslab_largest_unflushed_free(msp));
+ }
+
+ /*
+ * Segment-based weighting requires space map histogram support.
+ */
+ if (zfs_metaslab_segment_weight_enabled &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+ (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
+ sizeof (space_map_phys_t))) {
+ weight = metaslab_segment_weight(msp);
+ } else {
+ weight = metaslab_space_weight(msp);
+ }
+ return (weight);
+}
+
+void
+metaslab_recalculate_weight_and_sort(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /* note: we preserve the mask (e.g. indication of primary, etc..) */
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ metaslab_group_sort(msp->ms_group, msp,
+ metaslab_weight(msp, B_FALSE) | was_active);
+}
+
+static int
+metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ int allocator, uint64_t activation_weight)
+{
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * If we're activating for the claim code, we don't want to actually
+ * set the metaslab up for a specific allocator.
+ */
+ if (activation_weight == METASLAB_WEIGHT_CLAIM) {
+ ASSERT0(msp->ms_activation_weight);
+ msp->ms_activation_weight = msp->ms_weight;
+ metaslab_group_sort(mg, msp, msp->ms_weight |
+ activation_weight);
+ return (0);
+ }
+
+ metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
+ &mga->mga_primary : &mga->mga_secondary);
+
+ mutex_enter(&mg->mg_lock);
+ if (*mspp != NULL) {
+ mutex_exit(&mg->mg_lock);
+ return (EEXIST);
+ }
+
+ *mspp = msp;
+ ASSERT3S(msp->ms_allocator, ==, -1);
+ msp->ms_allocator = allocator;
+ msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
+
+ ASSERT0(msp->ms_activation_weight);
+ msp->ms_activation_weight = msp->ms_weight;
+ metaslab_group_sort_impl(mg, msp,
+ msp->ms_weight | activation_weight);
+ mutex_exit(&mg->mg_lock);
+
+ return (0);
+}
+
+static int
+metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The current metaslab is already activated for us so there
+	 * is nothing to do. Being activated, though, doesn't mean that
+	 * this metaslab is activated for our allocator or with our
+ * requested activation weight. The metaslab could have started
+ * as an active one for our allocator but changed allocators
+ * while we were waiting to grab its ms_lock or we stole it
+ * [see find_valid_metaslab()]. This means that there is a
+ * possibility of passivating a metaslab of another allocator
+ * or from a different activation mask, from this thread.
+ */
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
+ ASSERT(msp->ms_loaded);
+ return (0);
+ }
+
+ int error = metaslab_load(msp);
+ if (error != 0) {
+ metaslab_group_sort(msp->ms_group, msp, 0);
+ return (error);
+ }
+
+ /*
+ * When entering metaslab_load() we may have dropped the
+ * ms_lock because we were loading this metaslab, or we
+ * were waiting for another thread to load it for us. In
+ * that scenario, we recheck the weight of the metaslab
+ * to see if it was activated by another thread.
+ *
+ * If the metaslab was activated for another allocator or
+ * it was activated with a different activation weight (e.g.
+ * we wanted to make it a primary but it was activated as
+ * secondary) we return error (EBUSY).
+ *
+ * If the metaslab was activated for the same allocator
+ * and requested activation mask, skip activating it.
+ */
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
+ if (msp->ms_allocator != allocator)
+ return (EBUSY);
+
+ if ((msp->ms_weight & activation_weight) == 0)
+ return (SET_ERROR(EBUSY));
+
+ EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
+ msp->ms_primary);
+ return (0);
+ }
+
+ /*
+ * If the metaslab has literally 0 space, it will have weight 0. In
+ * that case, don't bother activating it. This can happen if the
+ * metaslab had space during find_valid_metaslab, but another thread
+ * loaded it and used all that space while we were waiting to grab the
+ * lock.
+ */
+ if (msp->ms_weight == 0) {
+ ASSERT0(range_tree_space(msp->ms_allocatable));
+ return (SET_ERROR(ENOSPC));
+ }
+
+ if ((error = metaslab_activate_allocator(msp->ms_group, msp,
+ allocator, activation_weight)) != 0) {
+ return (error);
+ }
+
+ ASSERT(msp->ms_loaded);
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+
+ return (0);
+}
+
+static void
+metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ metaslab_group_sort(mg, msp, weight);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT3P(msp->ms_group, ==, mg);
+ ASSERT3S(0, <=, msp->ms_allocator);
+ ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
+
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
+ if (msp->ms_primary) {
+ ASSERT3P(mga->mga_primary, ==, msp);
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ mga->mga_primary = NULL;
+ } else {
+ ASSERT3P(mga->mga_secondary, ==, msp);
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ mga->mga_secondary = NULL;
+ }
+ msp->ms_allocator = -1;
+ metaslab_group_sort_impl(mg, msp, weight);
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_passivate(metaslab_t *msp, uint64_t weight)
+{
+ uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE;
+
+ /*
+ * If size < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. In that case, it had better be empty,
+ * or we would be leaving space on the table.
+ */
+ ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
+ size >= SPA_MINBLOCKSIZE ||
+ range_tree_space(msp->ms_allocatable) == 0);
+ ASSERT0(weight & METASLAB_ACTIVE_MASK);
+
+ ASSERT(msp->ms_activation_weight != 0);
+ msp->ms_activation_weight = 0;
+ metaslab_passivate_allocator(msp->ms_group, msp, weight);
+ ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
+}
+
+/*
+ * Segment-based metaslabs are activated once and remain active until
+ * we either fail an allocation attempt (similar to space-based metaslabs)
+ * or have exhausted the free space in zfs_metaslab_switch_threshold
+ * buckets since the metaslab was activated. This function checks to see
+ * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
+ * metaslab and passivates it proactively. This will allow us to select a
+ * metaslab with a larger contiguous region, if any, remaining within this
+ * metaslab group. If we're in sync pass > 1, then we continue using this
+ * metaslab so that we don't dirty more blocks and cause more sync passes.
+ */
+static void
+metaslab_segment_may_passivate(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
+ return;
+
+ /*
+ * Since we are in the middle of a sync pass, the most accurate
+ * information that is accessible to us is the in-core range tree
+ * histogram; calculate the new weight based on that information.
+ */
+ uint64_t weight = metaslab_weight_from_range_tree(msp);
+ int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
+ int current_idx = WEIGHT_GET_INDEX(weight);
+
+ if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
+ metaslab_passivate(msp, weight);
+}
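+
+/*
+ * Editorial example (not part of the original source): if a metaslab was
+ * activated with a weight index of 23 (free segments of 8M-16M) and
+ * zfs_metaslab_switch_threshold is 2, it is passivated once the current
+ * range-tree weight index drops to 21 (2M-4M segments) or lower, freeing
+ * the allocator to pick a metaslab with larger contiguous regions.
+ */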
+
+static void
+metaslab_preload(void *arg)
+{
+ metaslab_t *msp = arg;
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ spa_t *spa = mc->mc_spa;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
+
+ mutex_enter(&msp->ms_lock);
+ (void) metaslab_load(msp);
+ metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
+ mutex_exit(&msp->ms_lock);
+ spl_fstrans_unmark(cookie);
+}
+
+static void
+metaslab_group_preload(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_t *msp;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ int m = 0;
+
+ if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
+ taskq_wait_outstanding(mg->mg_taskq, 0);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+
+ /*
+ * Load the next potential metaslabs
+ */
+ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
+ ASSERT3P(msp->ms_group, ==, mg);
+
+ /*
+ * We preload only the maximum number of metaslabs specified
+ * by metaslab_preload_limit. If a metaslab is being forced
+ * to condense then we preload it too. This will ensure
+ * that force condensing happens in the next txg.
+ */
+ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
+ continue;
+ }
+
+ VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
+ msp, TQ_SLEEP) != TASKQID_INVALID);
+ }
+ mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * Determine if the space map's on-disk footprint is past our tolerance for
+ * inefficiency. We would like to use the following criteria to make our
+ * decision:
+ *
+ * 1. Do not condense if the size of the space map object would dramatically
+ * increase as a result of writing out the free space range tree.
+ *
+ * 2. Condense if the on-disk space map representation is at least
+ * zfs_condense_pct/100 times the size of the optimal representation
+ * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
+ *
+ * 3. Do not condense if the on-disk size of the space map does not actually
+ * decrease.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
+ */
+static boolean_t
+metaslab_should_condense(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ vdev_t *vd = msp->ms_group->mg_vd;
+ uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+ ASSERT(sm != NULL);
+ ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
+
+ /*
+ * We always condense metaslabs that are empty and metaslabs for
+ * which a condense request has been made.
+ */
+ if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
+ msp->ms_condense_wanted)
+ return (B_TRUE);
+
+ uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
+ uint64_t object_size = space_map_length(sm);
+ uint64_t optimal_size = space_map_estimate_optimal_size(sm,
+ msp->ms_allocatable, SM_NO_VDEVID);
+
+ return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
+ object_size > zfs_metaslab_condense_block_threshold * record_size);
+}
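+
+/*
+ * Worked example for the thresholds above (hypothetical values, not
+ * necessarily the tunable defaults): with zfs_condense_pct = 110, an
+ * estimated optimal size of 1MB, a record size of 128K and
+ * zfs_metaslab_condense_block_threshold = 16, a loaded metaslab is
+ * condensed only if its on-disk space map is both at least 1.1MB
+ * (110% of optimal) and larger than 16 * 128K = 2MB, so small space maps
+ * are left alone even when their representation is relatively inefficient.
+ */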
+
+/*
+ * Condense the on-disk space map representation to its minimized form.
+ * The minimized form consists of a small number of allocations followed
+ * by the entries of the free range tree (ms_allocatable). The condensed
+ * spacemap contains all the entries of previous TXGs (including those in
+ * the pool-wide log spacemaps; thus this is effectively a superset of
+ * metaslab_flush()), but this TXG's entries still need to be written.
+ */
+static void
+metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
+{
+ range_tree_t *condense_tree;
+ space_map_t *sm = msp->ms_sm;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+ ASSERT(msp->ms_sm != NULL);
+
+ /*
+ * In order to condense the space map, we need to change it so it
+ * only describes which segments are currently allocated and free.
+ *
+ * All the current free space resides in the ms_allocatable, all
+ * the ms_defer trees, and all the ms_allocating trees. We ignore
+ * ms_freed because it is empty because we're in sync pass 1. We
+ * ignore ms_freeing because these changes are not yet reflected
+ * in the spacemap (they will be written later this txg).
+ *
+ * So to truncate the space map to represent all the entries of
+ * previous TXGs we do the following:
+ *
+ * 1] We create a range tree (condense tree) that is 100% empty.
+ * 2] We add to it all segments found in the ms_defer trees
+ * as those segments are marked as free in the original space
+ * map. We do the same with the ms_allocating trees for the same
+ * reason. Adding these segments should be a relatively
+ * inexpensive operation since we expect these trees to have a
+ * small number of nodes.
+ * 3] We vacate any unflushed allocs, since they are not frees we
+ * need to add to the condense tree. Then we vacate any
+ * unflushed frees as they should already be part of ms_allocatable.
+ * 4] At this point, we would ideally like to add all segments
+ * in the ms_allocatable tree to the condense tree. This way
+ * we would write all the entries of the condense tree as the
+ * condensed space map, which would only contain freed
+ * segments with everything else assumed to be allocated.
+ *
+ * Doing so can be prohibitively expensive as ms_allocatable can
+ * be large, and therefore computationally expensive to add to
+ * the condense_tree. Instead we first sync out an entry marking
+ * everything as allocated, then the condense_tree and then the
+ * ms_allocatable, in the condensed space map. While this is not
+ * optimal, it is typically close to optimal and more importantly
+ * much cheaper to compute.
+ *
+ * 5] Finally, as both of the unflushed trees were written to our
+ * new and condensed metaslab space map, we basically flushed
+ * all the unflushed changes to disk, thus we call
+ * metaslab_flush_update().
+ */
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
+
+ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
+ "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
+ msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
+ spa->spa_name, space_map_length(msp->ms_sm),
+ range_tree_numsegs(msp->ms_allocatable),
+ msp->ms_condense_wanted ? "TRUE" : "FALSE");
+
+ msp->ms_condense_wanted = B_FALSE;
+
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
+ &start, &shift);
+
+ condense_tree = range_tree_create(NULL, type, NULL, start, shift);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_add, condense_tree);
+ }
+
+ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
+ range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
+ range_tree_add, condense_tree);
+ }
+
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+
+ /*
+ * We're about to drop the metaslab's lock thus allowing other
+ * consumers to change its contents. Set the metaslab's ms_condensing
+ * flag to ensure that allocations on this metaslab do not occur
+ * while we're in the middle of committing it to disk. This is only
+ * critical for ms_allocatable as all other range trees use per TXG
+ * views of their content.
+ */
+ msp->ms_condensing = B_TRUE;
+
+ mutex_exit(&msp->ms_lock);
+ uint64_t object = space_map_object(msp->ms_sm);
+ space_map_truncate(sm,
+ spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
+ zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
+
+ /*
+ * space_map_truncate() may have reallocated the spacemap object.
+ * If so, update the vdev_ms_array.
+ */
+ if (space_map_object(msp->ms_sm) != object) {
+ object = space_map_object(msp->ms_sm);
+ dmu_write(spa->spa_meta_objset,
+ msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
+ msp->ms_id, sizeof (uint64_t), &object, tx);
+ }
+
+ /*
+ * Note:
+ * When the log space map feature is enabled, each space map will
+ * always have ALLOCS followed by FREES for each sync pass. This is
+ * typically true even when the log space map feature is disabled,
+ * except in the case where a metaslab goes through metaslab_sync()
+ * and gets condensed. In that case the metaslab's space map will have
+ * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
+ * followed by FREES (due to space_map_write() in metaslab_sync()) for
+ * sync pass 1.
+ */
+ range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
+ shift);
+ range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
+ space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
+ space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
+ space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
+
+ range_tree_vacate(condense_tree, NULL, NULL);
+ range_tree_destroy(condense_tree);
+ range_tree_vacate(tmp_tree, NULL, NULL);
+ range_tree_destroy(tmp_tree);
+ mutex_enter(&msp->ms_lock);
+
+ msp->ms_condensing = B_FALSE;
+ metaslab_flush_update(msp, tx);
+}
+
+/*
+ * Called when the metaslab has been flushed (its own spacemap now reflects
+ * all the contents of the pool-wide spacemap log). Updates the metaslab's
+ * metadata and any pool-wide related log space map data (e.g. summary,
+ * obsolete logs, etc..) to reflect that.
+ */
+static void
+metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ spa_t *spa = mg->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+
+ /*
+ * Just because a metaslab got flushed, that doesn't mean that
+ * it will pass through metaslab_sync_done(). Thus, make sure to
+ * update ms_synced_length here in case it doesn't.
+ */
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
+
+ /*
+ * We may end up here from metaslab_condense() without the
+ * feature being active. In that case this is a no-op.
+ */
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ ASSERT(spa_syncing_log_sm(spa) != NULL);
+ ASSERT(msp->ms_sm != NULL);
+ ASSERT(metaslab_unflushed_txg(msp) != 0);
+ ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
+
+ VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
+
+ /* update metaslab's position in our flushing tree */
+ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_remove(&spa->spa_metaslabs_by_flushed, msp);
+ metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+ avl_add(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+
+ /* update metaslab counts of spa_log_sm_t nodes */
+ spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
+ spa_log_sm_increment_current_mscount(spa);
+
+ /* cleanup obsolete logs if any */
+ uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
+ spa_cleanup_old_sm_logs(spa, tx);
+ uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
+ VERIFY3U(log_blocks_after, <=, log_blocks_before);
+
+ /* update log space map summary */
+ uint64_t blocks_gone = log_blocks_before - log_blocks_after;
+ spa_log_summary_add_flushed_metaslab(spa);
+ spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
+ spa_log_summary_decrement_blkcount(spa, blocks_gone);
+}
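+
+/*
+ * Rough illustration of the bookkeeping above (hypothetical numbers): if
+ * this metaslab had last been flushed at txg 100, it is re-inserted into
+ * spa_metaslabs_by_flushed under the currently syncing txg, the per-txg
+ * metaslab counts shift from txg 100 to the current txg, and if
+ * spa_cleanup_old_sm_logs() freed 8 obsolete log blocks the log space map
+ * summary is charged 8 fewer blocks.
+ */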
+
+boolean_t
+metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ ASSERT(msp->ms_sm != NULL);
+ ASSERT(metaslab_unflushed_txg(msp) != 0);
+ ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
+
+ /*
+ * There is nothing wrong with flushing the same metaslab twice, as
+ * this codepath should work in that case. However, the current
+ * flushing scheme makes sure to avoid this situation as we would be
+ * making all these calls without having anything meaningful to write
+ * to disk. We assert this behavior here.
+ */
+ ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
+
+ /*
+ * We can not flush while loading, because then we would
+ * not load the ms_unflushed_{allocs,frees}.
+ */
+ if (msp->ms_loading)
+ return (B_FALSE);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+ metaslab_verify_weight_and_frag(msp);
+
+ /*
+ * Metaslab condensing is effectively flushing. Therefore if the
+ * metaslab can be condensed we can just condense it instead of
+ * flushing it.
+ *
+ * Note that metaslab_condense() does call metaslab_flush_update()
+ * so we can just return immediately after condensing. We also
+ * don't need to care about setting ms_flushing or broadcasting
+ * ms_flush_cv, even if we temporarily drop the ms_lock in
+ * metaslab_condense(), as the metaslab is already loaded.
+ */
+ if (msp->ms_loaded && metaslab_should_condense(msp)) {
+ metaslab_group_t *mg = msp->ms_group;
+
+ /*
+ * For all histogram operations below refer to the
+ * comments of metaslab_sync() where we follow a
+ * similar procedure.
+ */
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+ metaslab_group_histogram_remove(mg, msp);
+
+ metaslab_condense(msp, tx);
+
+ space_map_histogram_clear(msp->ms_sm);
+ space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
+ ASSERT(range_tree_is_empty(msp->ms_freed));
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ space_map_histogram_add(msp->ms_sm,
+ msp->ms_defer[t], tx);
+ }
+ metaslab_aux_histograms_update(msp);
+
+ metaslab_group_histogram_add(mg, msp);
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+
+ /*
+ * Since we recreated the histogram (and potentially
+ * the ms_sm too while condensing) ensure that the
+ * weight is updated too because we are not guaranteed
+ * that this metaslab is dirty and will go through
+ * metaslab_sync_done().
+ */
+ metaslab_recalculate_weight_and_sort(msp);
+ return (B_TRUE);
+ }
+
+ msp->ms_flushing = B_TRUE;
+ uint64_t sm_len_before = space_map_length(msp->ms_sm);
+
+ mutex_exit(&msp->ms_lock);
+ space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
+ SM_NO_VDEVID, tx);
+ space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
+ SM_NO_VDEVID, tx);
+ mutex_enter(&msp->ms_lock);
+
+ uint64_t sm_len_after = space_map_length(msp->ms_sm);
+ if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
+ zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
+ "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
+ "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa),
+ msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+ range_tree_space(msp->ms_unflushed_allocs),
+ range_tree_space(msp->ms_unflushed_frees),
+ (sm_len_after - sm_len_before));
+ }
+
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+ metaslab_verify_weight_and_frag(msp);
+
+ metaslab_flush_update(msp, tx);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+ metaslab_verify_weight_and_frag(msp);
+
+ msp->ms_flushing = B_FALSE;
+ cv_broadcast(&msp->ms_flush_cv);
+ return (B_TRUE);
+}
+
+/*
+ * Write a metaslab to disk in the context of the specified transaction group.
+ */
+void
+metaslab_sync(metaslab_t *msp, uint64_t txg)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa_meta_objset(spa);
+ range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
+ dmu_tx_t *tx;
+
+ ASSERT(!vd->vdev_ishole);
+
+ /*
+ * This metaslab has just been added so there's no work to do now.
+ */
+ if (msp->ms_freeing == NULL) {
+ ASSERT3P(alloctree, ==, NULL);
+ return;
+ }
+
+ ASSERT3P(alloctree, !=, NULL);
+ ASSERT3P(msp->ms_freeing, !=, NULL);
+ ASSERT3P(msp->ms_freed, !=, NULL);
+ ASSERT3P(msp->ms_checkpointing, !=, NULL);
+ ASSERT3P(msp->ms_trim, !=, NULL);
+
+ /*
+ * Normally, we don't want to process a metaslab if there are no
+ * allocations or frees to perform. However, if the metaslab is being
+ * forced to condense, it's loaded and we're not beyond the final
+ * dirty txg, we need to let it through. Not condensing beyond the
+ * final dirty txg prevents an issue where metaslabs that need to be
+ * condensed but were loaded for other reasons could cause a panic
+ * here. By only checking the txg in that branch of the conditional,
+ * we preserve the utility of the VERIFY statements in all other
+ * cases.
+ */
+ if (range_tree_is_empty(alloctree) &&
+ range_tree_is_empty(msp->ms_freeing) &&
+ range_tree_is_empty(msp->ms_checkpointing) &&
+ !(msp->ms_loaded && msp->ms_condense_wanted &&
+ txg <= spa_final_dirty_txg(spa)))
+ return;
+
+
+ VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
+
+ /*
+ * The only state that can actually be changing concurrently
+ * with metaslab_sync() is the metaslab's ms_allocatable. No
+ * other thread can be modifying this txg's alloc, freeing,
+ * freed, or space_map_phys_t. We drop ms_lock whenever we
+ * could call into the DMU, because the DMU can call down to
+ * us (e.g. via zio_free()) at any time.
+ *
+ * The spa_vdev_remove_thread() can be reading metaslab state
+ * concurrently, and it is locked out by the ms_sync_lock.
+ * Note that the ms_lock is insufficient for this, because it
+ * is dropped by space_map_write().
+ */
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ /*
+ * Generate a log space map if one doesn't exist already.
+ */
+ spa_generate_syncing_log_sm(spa, tx);
+
+ if (msp->ms_sm == NULL) {
+ uint64_t new_object = space_map_alloc(mos,
+ spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
+ zfs_metaslab_sm_blksz_with_log :
+ zfs_metaslab_sm_blksz_no_log, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
+ msp->ms_id, sizeof (uint64_t), &new_object, tx);
+
+ VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
+ msp->ms_start, msp->ms_size, vd->vdev_ashift));
+ ASSERT(msp->ms_sm != NULL);
+
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+ ASSERT0(metaslab_allocated_space(msp));
+ }
+
+ if (metaslab_unflushed_txg(msp) == 0 &&
+ spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
+ ASSERT(spa_syncing_log_sm(spa) != NULL);
+
+ metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+ spa_log_sm_increment_current_mscount(spa);
+ spa_log_summary_add_flushed_metaslab(spa);
+
+ ASSERT(msp->ms_sm != NULL);
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_add(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+ }
+
+ if (!range_tree_is_empty(msp->ms_checkpointing) &&
+ vd->vdev_checkpoint_sm == NULL) {
+ ASSERT(spa_has_checkpoint(spa));
+
+ uint64_t new_object = space_map_alloc(mos,
+ zfs_vdev_standard_sm_blksz, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
+ mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * We save the space map object as an entry in vdev_top_zap
+ * so it can be retrieved when the pool is reopened after an
+ * export or through zdb.
+ */
+ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (new_object), 1, &new_object, tx));
+ }
+
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Note: metaslab_condense() clears the space map's histogram.
+ * Therefore we must verify and remove this histogram before
+ * condensing.
+ */
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+ metaslab_group_histogram_remove(mg, msp);
+
+ if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
+ metaslab_should_condense(msp))
+ metaslab_condense(msp, tx);
+
+ /*
+ * We'll be going to disk to sync our space accounting, thus we
+ * drop the ms_lock during that time so allocations coming from
+ * open-context (ZIL) for future TXGs do not block.
+ */
+ mutex_exit(&msp->ms_lock);
+ space_map_t *log_sm = spa_syncing_log_sm(spa);
+ if (log_sm != NULL) {
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ space_map_write(log_sm, alloctree, SM_ALLOC,
+ vd->vdev_id, tx);
+ space_map_write(log_sm, msp->ms_freeing, SM_FREE,
+ vd->vdev_id, tx);
+ mutex_enter(&msp->ms_lock);
+
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_remove_xor_add(alloctree,
+ msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
+ range_tree_remove_xor_add(msp->ms_freeing,
+ msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
+ spa->spa_unflushed_stats.sus_memused +=
+ metaslab_unflushed_changes_memused(msp);
+ } else {
+ ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
+ SM_NO_VDEVID, tx);
+ space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
+ SM_NO_VDEVID, tx);
+ mutex_enter(&msp->ms_lock);
+ }
+
+ msp->ms_allocated_space += range_tree_space(alloctree);
+ ASSERT3U(msp->ms_allocated_space, >=,
+ range_tree_space(msp->ms_freeing));
+ msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
+
+ if (!range_tree_is_empty(msp->ms_checkpointing)) {
+ ASSERT(spa_has_checkpoint(spa));
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * Since we are doing writes to disk and the ms_checkpointing
+ * tree won't be changing during that time, we drop the
+ * ms_lock while writing to the checkpoint space map, for the
+ * same reason mentioned above.
+ */
+ mutex_exit(&msp->ms_lock);
+ space_map_write(vd->vdev_checkpoint_sm,
+ msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
+ mutex_enter(&msp->ms_lock);
+
+ spa->spa_checkpoint_info.sci_dspace +=
+ range_tree_space(msp->ms_checkpointing);
+ vd->vdev_stat.vs_checkpoint_space +=
+ range_tree_space(msp->ms_checkpointing);
+ ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
+ -space_map_allocated(vd->vdev_checkpoint_sm));
+
+ range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
+ }
+
+ if (msp->ms_loaded) {
+ /*
+ * When the space map is loaded, we have an accurate
+ * histogram in the range tree. This gives us an opportunity
+ * to bring the space map's histogram up-to-date so we clear
+ * it first before updating it.
+ */
+ space_map_histogram_clear(msp->ms_sm);
+ space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
+
+ /*
+ * Since we've cleared the histogram we need to add back
+ * any free space that has already been processed, plus
+ * any deferred space. This allows the on-disk histogram
+ * to accurately reflect all free space even if some space
+ * is not yet available for allocation (i.e. deferred).
+ */
+ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
+
+ /*
+ * Add back any deferred free space that has not been
+ * added back into the in-core free tree yet. This will
+ * ensure that we don't end up with a space map histogram
+ * that is completely empty unless the metaslab is fully
+ * allocated.
+ */
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ space_map_histogram_add(msp->ms_sm,
+ msp->ms_defer[t], tx);
+ }
+ }
+
+ /*
+ * Always add the free space from this sync pass to the space
+ * map histogram. We want to make sure that the on-disk histogram
+ * accounts for all free space. If the space map is not loaded,
+ * then we will lose some accuracy but will correct it the next
+ * time we load the space map.
+ */
+ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
+ metaslab_aux_histograms_update(msp);
+
+ metaslab_group_histogram_add(mg, msp);
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+
+ /*
+ * For sync pass 1, we avoid traversing this txg's free range tree
+ * and instead will just swap the pointers for freeing and freed.
+ * We can safely do this since the freed_tree is guaranteed to be
+ * empty on the initial pass.
+ *
+ * Keep in mind that even if we are currently using a log spacemap
+ * we want current frees to end up in the ms_allocatable (but not
+ * get appended to the ms_sm) so their ranges can be reused as usual.
+ */
+ if (spa_sync_pass(spa) == 1) {
+ range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
+ ASSERT0(msp->ms_allocated_this_txg);
+ } else {
+ range_tree_vacate(msp->ms_freeing,
+ range_tree_add, msp->ms_freed);
+ }
+ msp->ms_allocated_this_txg += range_tree_space(alloctree);
+ range_tree_vacate(alloctree, NULL, NULL);
+
+ ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
+ & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
+
+ mutex_exit(&msp->ms_lock);
+
+ /*
+ * Verify that the space map object ID has been recorded in the
+ * vdev_ms_array.
+ */
+ uint64_t object;
+ VERIFY0(dmu_read(mos, vd->vdev_ms_array,
+ msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
+ VERIFY3U(object, ==, space_map_object(msp->ms_sm));
+
+ mutex_exit(&msp->ms_sync_lock);
+ dmu_tx_commit(tx);
+}
+
+static void
+metaslab_evict(metaslab_t *msp, uint64_t txg)
+{
+ if (!msp->ms_loaded || msp->ms_disabled != 0)
+ return;
+
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ VERIFY0(range_tree_space(
+ msp->ms_allocating[(txg + t) & TXG_MASK]));
+ }
+ if (msp->ms_allocator != -1)
+ metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+
+ if (!metaslab_debug_unload)
+ metaslab_unload(msp);
+}
+
+/*
+ * Called after a transaction group has completely synced to mark
+ * all of the metaslab's free space as usable.
+ */
+void
+metaslab_sync_done(metaslab_t *msp, uint64_t txg)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ range_tree_t **defer_tree;
+ int64_t alloc_delta, defer_delta;
+ boolean_t defer_allowed = B_TRUE;
+
+ ASSERT(!vd->vdev_ishole);
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * If this metaslab is just becoming available, initialize its
+ * range trees and add its capacity to the vdev.
+ */
+ if (msp->ms_freed == NULL) {
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(vd, msp, &start,
+ &shift);
+
+ for (int t = 0; t < TXG_SIZE; t++) {
+ ASSERT(msp->ms_allocating[t] == NULL);
+
+ msp->ms_allocating[t] = range_tree_create(NULL, type,
+ NULL, start, shift);
+ }
+
+ ASSERT3P(msp->ms_freeing, ==, NULL);
+ msp->ms_freeing = range_tree_create(NULL, type, NULL, start,
+ shift);
+
+ ASSERT3P(msp->ms_freed, ==, NULL);
+ msp->ms_freed = range_tree_create(NULL, type, NULL, start,
+ shift);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ ASSERT3P(msp->ms_defer[t], ==, NULL);
+ msp->ms_defer[t] = range_tree_create(NULL, type, NULL,
+ start, shift);
+ }
+
+ ASSERT3P(msp->ms_checkpointing, ==, NULL);
+ msp->ms_checkpointing = range_tree_create(NULL, type, NULL,
+ start, shift);
+
+ ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
+ msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL,
+ start, shift);
+
+ metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+ mrap->mra_bt = &msp->ms_unflushed_frees_by_size;
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+ ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
+ msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
+ type, mrap, start, shift);
+
+ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
+ }
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
+
+ defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
+
+ uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
+ metaslab_class_get_alloc(spa_normal_class(spa));
+ if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
+ defer_allowed = B_FALSE;
+ }
+
+ defer_delta = 0;
+ alloc_delta = msp->ms_allocated_this_txg -
+ range_tree_space(msp->ms_freed);
+
+ if (defer_allowed) {
+ defer_delta = range_tree_space(msp->ms_freed) -
+ range_tree_space(*defer_tree);
+ } else {
+ defer_delta -= range_tree_space(*defer_tree);
+ }
+ metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
+ defer_delta, 0);
+
+ if (spa_syncing_log_sm(spa) == NULL) {
+ /*
+ * If there's a metaslab_load() in progress and we don't have
+ * a log space map, it means that we probably wrote to the
+ * metaslab's space map. If this is the case, we need to
+ * make sure that we wait for the load to complete so that we
+ * have a consistent view at the in-core side of the metaslab.
+ */
+ metaslab_load_wait(msp);
+ } else {
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+ }
+
+ /*
+ * When auto-trimming is enabled, free ranges which are added to
+ * ms_allocatable are also added to ms_trim. The ms_trim tree is
+ * periodically consumed by the vdev_autotrim_thread() which issues
+ * trims for all ranges and then vacates the tree. The ms_trim tree
+ * can be discarded at any time with the sole consequence of recent
+ * frees not being trimmed.
+ */
+ if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
+ range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
+ if (!defer_allowed) {
+ range_tree_walk(msp->ms_freed, range_tree_add,
+ msp->ms_trim);
+ }
+ } else {
+ range_tree_vacate(msp->ms_trim, NULL, NULL);
+ }
+
+ /*
+ * Move the frees from the defer_tree back to the free
+ * range tree (if it's loaded). Swap the freed_tree and
+ * the defer_tree -- this is safe to do because we've
+ * just emptied out the defer_tree.
+ */
+ range_tree_vacate(*defer_tree,
+ msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
+ if (defer_allowed) {
+ range_tree_swap(&msp->ms_freed, defer_tree);
+ } else {
+ range_tree_vacate(msp->ms_freed,
+ msp->ms_loaded ? range_tree_add : NULL,
+ msp->ms_allocatable);
+ }
+
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
+
+ msp->ms_deferspace += defer_delta;
+ ASSERT3S(msp->ms_deferspace, >=, 0);
+ ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
+ if (msp->ms_deferspace != 0) {
+ /*
+ * Keep syncing this metaslab until all deferred frees
+ * are back in circulation.
+ */
+ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+ }
+ metaslab_aux_histograms_update_done(msp, defer_allowed);
+
+ if (msp->ms_new) {
+ msp->ms_new = B_FALSE;
+ mutex_enter(&mg->mg_lock);
+ mg->mg_ms_ready++;
+ mutex_exit(&mg->mg_lock);
+ }
+
+ /*
+ * Re-sort metaslab within its group now that we've adjusted
+ * its allocatable space.
+ */
+ metaslab_recalculate_weight_and_sort(msp);
+
+ ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_freed));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
+ msp->ms_allocating_total -= msp->ms_allocated_this_txg;
+ msp->ms_allocated_this_txg = 0;
+ mutex_exit(&msp->ms_lock);
+}
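+
+/*
+ * Rough worked example of the space accounting above (hypothetical
+ * numbers): if 10MB were allocated from this metaslab this txg, 4MB were
+ * freed into ms_freed and the defer tree being rotated out held 3MB, then
+ * alloc_delta = 10MB - 4MB = 6MB and, with deferral allowed,
+ * defer_delta = 4MB - 3MB = 1MB. The vdev's allocated space is therefore
+ * charged alloc_delta + defer_delta = 7MB (the 3MB of old deferred frees
+ * return to circulation immediately) and ms_deferspace grows by 1MB; the
+ * newly deferred 4MB become allocatable again when this defer slot is
+ * processed TXG_DEFER_SIZE txgs later.
+ */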
+
+void
+metaslab_sync_reassess(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_class->mc_spa;
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+ metaslab_group_alloc_update(mg);
+ mg->mg_fragmentation = metaslab_group_fragmentation(mg);
+
+ /*
+ * Preload the next potential metaslabs but only on active
+ * metaslab groups. We can get into a state where the metaslab
+ * is no longer active since we dirty metaslabs as we remove
+ * a device, thus potentially making the metaslab group eligible
+ * for preloading.
+ */
+ if (mg->mg_activation_count > 0) {
+ metaslab_group_preload(mg);
+ }
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+}
+
+/*
+ * When writing a ditto block (i.e. more than one DVA for a given BP) on
+ * the same vdev as an existing DVA of this BP, then try to allocate it
+ * on a different metaslab than existing DVAs (i.e. a unique metaslab).
+ */
+static boolean_t
+metaslab_is_unique(metaslab_t *msp, dva_t *dva)
+{
+ uint64_t dva_ms_id;
+
+ if (DVA_GET_ASIZE(dva) == 0)
+ return (B_TRUE);
+
+ if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
+ return (B_TRUE);
+
+ dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
+
+ return (msp->ms_id != dva_ms_id);
+}
+
+/*
+ * ==========================================================================
+ * Metaslab allocation tracing facility
+ * ==========================================================================
+ */
+
+/*
+ * Add an allocation trace element to the allocation tracing list.
+ */
+static void
+metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
+ metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
+ int allocator)
+{
+ metaslab_alloc_trace_t *mat;
+
+ if (!metaslab_trace_enabled)
+ return;
+
+ /*
+ * When the tracing list reaches its maximum we remove
+ * the second element in the list before adding a new one.
+ * By removing the second element we preserve the original
+ * entry as a clue to what allocation steps have already been
+ * performed.
+ */
+ if (zal->zal_size == metaslab_trace_max_entries) {
+ metaslab_alloc_trace_t *mat_next;
+#ifdef ZFS_DEBUG
+ panic("too many entries in allocation list");
+#endif
+ METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
+ zal->zal_size--;
+ mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
+ list_remove(&zal->zal_list, mat_next);
+ kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
+ }
+
+ mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
+ list_link_init(&mat->mat_list_node);
+ mat->mat_mg = mg;
+ mat->mat_msp = msp;
+ mat->mat_size = psize;
+ mat->mat_dva_id = dva_id;
+ mat->mat_offset = offset;
+ mat->mat_weight = 0;
+ mat->mat_allocator = allocator;
+
+ if (msp != NULL)
+ mat->mat_weight = msp->ms_weight;
+
+ /*
+ * The list is part of the zio so locking is not required. Only
+ * a single thread will perform allocations for a given zio.
+ */
+ list_insert_tail(&zal->zal_list, mat);
+ zal->zal_size++;
+
+ ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
+}
+
+void
+metaslab_trace_init(zio_alloc_list_t *zal)
+{
+ list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
+ offsetof(metaslab_alloc_trace_t, mat_list_node));
+ zal->zal_size = 0;
+}
+
+void
+metaslab_trace_fini(zio_alloc_list_t *zal)
+{
+ metaslab_alloc_trace_t *mat;
+
+ while ((mat = list_remove_head(&zal->zal_list)) != NULL)
+ kmem_cache_free(metaslab_alloc_trace_cache, mat);
+ list_destroy(&zal->zal_list);
+ zal->zal_size = 0;
+}
+
+/*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
+static void
+metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
+ int allocator)
+{
+ if (!(flags & METASLAB_ASYNC_ALLOC) ||
+ (flags & METASLAB_DONT_THROTTLE))
+ return;
+
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ if (!mg->mg_class->mc_alloc_throttle_enabled)
+ return;
+
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
+}
+
+static void
+metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
+{
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ metaslab_class_allocator_t *mca =
+ &mg->mg_class->mc_allocator[allocator];
+ uint64_t max = mg->mg_max_alloc_queue_depth;
+ uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
+ while (cur < max) {
+ if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
+ cur, cur + 1) == cur) {
+ atomic_inc_64(&mca->mca_alloc_max_slots);
+ return;
+ }
+ cur = mga->mga_cur_max_alloc_queue_depth;
+ }
+}
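+
+/*
+ * Illustrative example for the ratchet above (hypothetical numbers): if
+ * mg_max_alloc_queue_depth is 128 and mga_cur_max_alloc_queue_depth is
+ * currently 4, each completed throttled I/O raises the per-allocator limit
+ * by one (4 -> 5 -> ...) and bumps the class-wide mca_alloc_max_slots in
+ * lockstep until the group's maximum of 128 is reached; the
+ * compare-and-swap simply keeps concurrent completions from racing past
+ * that maximum.
+ */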
+
+void
+metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
+ int allocator, boolean_t io_complete)
+{
+ if (!(flags & METASLAB_ASYNC_ALLOC) ||
+ (flags & METASLAB_DONT_THROTTLE))
+ return;
+
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ if (!mg->mg_class->mc_alloc_throttle_enabled)
+ return;
+
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
+ if (io_complete)
+ metaslab_group_increment_qdepth(mg, allocator);
+}
+
+void
+metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
+ int allocator)
+{
+#ifdef ZFS_DEBUG
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+
+ for (int d = 0; d < ndvas; d++) {
+ uint64_t vdev = DVA_GET_VDEV(&dva[d]);
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
+ }
+#endif
+}
+
+static uint64_t
+metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
+{
+ uint64_t start;
+ range_tree_t *rt = msp->ms_allocatable;
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ VERIFY(!msp->ms_condensing);
+ VERIFY0(msp->ms_disabled);
+
+ start = mc->mc_ops->msop_alloc(msp, size);
+ if (start != -1ULL) {
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+
+ VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
+ range_tree_remove(rt, start, size);
+ range_tree_clear(msp->ms_trim, start, size);
+
+ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
+ vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
+
+ range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
+ msp->ms_allocating_total += size;
+
+ /* Track the last successful allocation */
+ msp->ms_alloc_txg = txg;
+ metaslab_verify_space(msp, txg);
+ }
+
+ /*
+ * Now that we've attempted the allocation we need to update the
+ * metaslab's maximum block size since it may have changed.
+ */
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
+ return (start);
+}
+
+/*
+ * Find the metaslab with the highest weight that is less than what we've
+ * already tried. In the common case, this means that we will examine each
+ * metaslab at most once. Note that concurrent callers could reorder metaslabs
+ * by activation/passivation once we have dropped the mg_lock. If a metaslab is
+ * activated by another thread, and we fail to allocate from the metaslab we
+ * have selected, we may not try the newly-activated metaslab, and instead
+ * activate another metaslab. This is not optimal, but generally does not cause
+ * any problems (a possible exception being if every metaslab is completely full
+ * except for the newly-activated metaslab which we fail to examine).
+ */
+static metaslab_t *
+find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
+ dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
+ boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
+ boolean_t *was_active)
+{
+ avl_index_t idx;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ metaslab_t *msp = avl_find(t, search, &idx);
+ if (msp == NULL)
+ msp = avl_nearest(t, idx, AVL_AFTER);
+
+ int tries = 0;
+ for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+ int i;
+
+ if (!try_hard && tries > zfs_metaslab_find_max_tries) {
+ METASLABSTAT_BUMP(metaslabstat_too_many_tries);
+ return (NULL);
+ }
+ tries++;
+
+ if (!metaslab_should_allocate(msp, asize, try_hard)) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_TOO_SMALL, allocator);
+ continue;
+ }
+
+ /*
+ * If the selected metaslab is condensing or disabled,
+ * skip it.
+ */
+ if (msp->ms_condensing || msp->ms_disabled > 0)
+ continue;
+
+ *was_active = msp->ms_allocator != -1;
+ /*
+ * If we're activating as primary, this is our first allocation
+ * from this disk, so we don't need to check how close we are.
+ * If the metaslab under consideration was already active,
+ * we're getting desperate enough to steal another allocator's
+ * metaslab, so we still don't care about distances.
+ */
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
+ break;
+
+ for (i = 0; i < d; i++) {
+ if (want_unique &&
+ !metaslab_is_unique(msp, &dva[i]))
+ break; /* try another metaslab */
+ }
+ if (i == d)
+ break;
+ }
+
+ if (msp != NULL) {
+ search->ms_weight = msp->ms_weight;
+ search->ms_start = msp->ms_start + 1;
+ search->ms_allocator = msp->ms_allocator;
+ search->ms_primary = msp->ms_primary;
+ }
+ return (msp);
+}
+
+static void
+metaslab_active_mask_verify(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
+ return;
+
+ if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
+ VERIFY3S(msp->ms_allocator, !=, -1);
+ VERIFY(msp->ms_primary);
+ return;
+ }
+
+ if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
+ VERIFY3S(msp->ms_allocator, !=, -1);
+ VERIFY(!msp->ms_primary);
+ return;
+ }
+
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ VERIFY3S(msp->ms_allocator, ==, -1);
+ return;
+ }
+}
+
+/* ARGSUSED */
+static uint64_t
+metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
+ int allocator, boolean_t try_hard)
+{
+ metaslab_t *msp = NULL;
+ uint64_t offset = -1ULL;
+
+ uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
+ for (int i = 0; i < d; i++) {
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+ DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+ activation_weight = METASLAB_WEIGHT_SECONDARY;
+ } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+ DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+ activation_weight = METASLAB_WEIGHT_CLAIM;
+ break;
+ }
+ }
+
+ /*
+ * If we don't have enough metaslabs active to fill the entire array, we
+ * just use the 0th slot.
+ */
+ if (mg->mg_ms_ready < mg->mg_allocators * 3)
+ allocator = 0;
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+
+ ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
+
+ metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
+ search->ms_weight = UINT64_MAX;
+ search->ms_start = 0;
+ /*
+ * At the end of the metaslab tree are the already-active metaslabs,
+ * first the primaries, then the secondaries. When we resume searching
+ * through the tree, we need to consider ms_allocator and ms_primary so
+ * we start in the location right after where we left off, and don't
+ * accidentally loop forever considering the same metaslabs.
+ */
+ search->ms_allocator = -1;
+ search->ms_primary = B_TRUE;
+ for (;;) {
+ boolean_t was_active = B_FALSE;
+
+ mutex_enter(&mg->mg_lock);
+
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+ mga->mga_primary != NULL) {
+ msp = mga->mga_primary;
+
+ /*
+ * Even though we don't hold the ms_lock for the
+ * primary metaslab, those fields should not
+ * change while we hold the mg_lock. Thus it is
+ * safe to make assertions on them.
+ */
+ ASSERT(msp->ms_primary);
+ ASSERT3S(msp->ms_allocator, ==, allocator);
+ ASSERT(msp->ms_loaded);
+
+ was_active = B_TRUE;
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+ } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+ mga->mga_secondary != NULL) {
+ msp = mga->mga_secondary;
+
+ /*
+ * See comment above about the similar assertions
+ * for the primary metaslab.
+ */
+ ASSERT(!msp->ms_primary);
+ ASSERT3S(msp->ms_allocator, ==, allocator);
+ ASSERT(msp->ms_loaded);
+
+ was_active = B_TRUE;
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+ } else {
+ msp = find_valid_metaslab(mg, activation_weight, dva, d,
+ want_unique, asize, allocator, try_hard, zal,
+ search, &was_active);
+ }
+
+ mutex_exit(&mg->mg_lock);
+ if (msp == NULL) {
+ kmem_free(search, sizeof (*search));
+ return (-1ULL);
+ }
+ mutex_enter(&msp->ms_lock);
+
+ metaslab_active_mask_verify(msp);
+
+ /*
+ * This code is disabled because of issues with
+ * tracepoints in non-GPL kernel modules.
+ */
+#if 0
+ DTRACE_PROBE3(ms__activation__attempt,
+ metaslab_t *, msp, uint64_t, activation_weight,
+ boolean_t, was_active);
+#endif
+
+ /*
+ * Ensure that the metaslab we have selected is still
+ * capable of handling our request. It's possible that
+ * another thread may have changed the weight while we
+ * were blocked on the metaslab lock. We check the
+ * active status first to see if we need to select
+ * a new metaslab.
+ */
+ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
+ ASSERT3S(msp->ms_allocator, ==, -1);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * If the metaslab was activated for another allocator
+ * while we were waiting in the ms_lock above, or it's
+ * a primary and we're seeking a secondary (or vice versa),
+ * we go back and select a new metaslab.
+ */
+ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+ (msp->ms_allocator != -1) &&
+ (msp->ms_allocator != allocator || ((activation_weight ==
+ METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
+ ASSERT(msp->ms_loaded);
+ ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
+ msp->ms_allocator != -1);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * This metaslab was used for claiming regions allocated
+ * by the ZIL during pool import. Once these regions are
+ * claimed we don't need to keep the CLAIM bit set
+ * anymore. Passivate this metaslab to zero its activation
+ * mask.
+ */
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
+ activation_weight != METASLAB_WEIGHT_CLAIM) {
+ ASSERT(msp->ms_loaded);
+ ASSERT3S(msp->ms_allocator, ==, -1);
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_WEIGHT_CLAIM);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ metaslab_set_selected_txg(msp, txg);
+
+ int activation_error =
+ metaslab_activate(msp, allocator, activation_weight);
+ metaslab_active_mask_verify(msp);
+
+ /*
+ * If the metaslab was activated by another thread for
+ * another allocator or activation_weight (EBUSY), or it
+ * failed because another metaslab was assigned as primary
+ * for this allocator (EEXIST) we continue using this
+ * metaslab for our allocation, rather than going on to a
+ * worse metaslab (we waited for that metaslab to be loaded
+ * after all).
+ *
+ * If the activation failed due to an I/O error or ENOSPC we
+ * skip to the next metaslab.
+ */
+ boolean_t activated;
+ if (activation_error == 0) {
+ activated = B_TRUE;
+ } else if (activation_error == EBUSY ||
+ activation_error == EEXIST) {
+ activated = B_FALSE;
+ } else {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+ ASSERT(msp->ms_loaded);
+
+ /*
+ * Now that we have the lock, recheck to see if we should
+ * continue to use this metaslab for this allocation. The
+ * metaslab is now loaded so metaslab_should_allocate()
+ * can accurately determine if the allocation attempt should
+ * proceed.
+ */
+ if (!metaslab_should_allocate(msp, asize, try_hard)) {
+ /* Passivate this metaslab and select a new one. */
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_TOO_SMALL, allocator);
+ goto next;
+ }
+
+ /*
+ * If this metaslab is currently condensing then pick again
+ * as we can't manipulate this metaslab until it's committed
+ * to disk. If this metaslab is being initialized, we shouldn't
+ * allocate from it since the allocated region might be
+ * overwritten after allocation.
+ */
+ if (msp->ms_condensing) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_CONDENSING, allocator);
+ if (activated) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
+ mutex_exit(&msp->ms_lock);
+ continue;
+ } else if (msp->ms_disabled > 0) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_DISABLED, allocator);
+ if (activated) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ offset = metaslab_block_alloc(msp, asize, txg);
+ metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
+
+ if (offset != -1ULL) {
+ /* Proactively passivate the metaslab, if needed */
+ if (activated)
+ metaslab_segment_may_passivate(msp);
+ break;
+ }
+next:
+ ASSERT(msp->ms_loaded);
+
+ /*
+ * This code is disabled because of issues with
+ * tracepoints in non-GPL kernel modules.
+ */
+#if 0
+ DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
+ uint64_t, asize);
+#endif
+
+ /*
+ * We were unable to allocate from this metaslab so determine
+ * a new weight for this metaslab. Now that we have loaded
+ * the metaslab we can provide a better hint to the metaslab
+ * selector.
+ *
+ * For space-based metaslabs, we use the maximum block size.
+ * This information is only available when the metaslab
+ * is loaded and is more accurate than the generic free
+ * space weight that was calculated by metaslab_weight().
+ * This information allows us to quickly compare the maximum
+ * available allocation in the metaslab to the allocation
+ * size being requested.
+ *
+ * For segment-based metaslabs, determine the new weight
+ * based on the highest bucket in the range tree. We
+ * explicitly use the loaded segment weight (i.e. the range
+ * tree histogram) since it contains the space that is
+ * currently available for allocation and is accurate
+ * even within a sync pass.
+ */
+ uint64_t weight;
+ if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
+ weight = metaslab_largest_allocatable(msp);
+ WEIGHT_SET_SPACEBASED(weight);
+ } else {
+ weight = metaslab_weight_from_range_tree(msp);
+ }
+
+ if (activated) {
+ metaslab_passivate(msp, weight);
+ } else {
+ /*
+ * For the case where we use the metaslab that is
+ * active for another allocator we want to make
+ * sure that we retain the activation mask.
+ *
+ * Note that we could attempt to use something like
+ * metaslab_recalculate_weight_and_sort() that
+ * retains the activation mask here. That function
+ * uses metaslab_weight() to set the weight though
+ * which is not as accurate as the calculations
+ * above.
+ */
+ weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
+ metaslab_group_sort(mg, msp, weight);
+ }
+ metaslab_active_mask_verify(msp);
+
+ /*
+ * We have just failed an allocation attempt, check
+ * that metaslab_should_allocate() agrees. Otherwise,
+ * we may end up in an infinite loop retrying the same
+ * metaslab.
+ */
+ ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
+
+ mutex_exit(&msp->ms_lock);
+ }
+ mutex_exit(&msp->ms_lock);
+ kmem_free(search, sizeof (*search));
+ return (offset);
+}
+
+static uint64_t
+metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
+ int allocator, boolean_t try_hard)
+{
+ uint64_t offset;
+ ASSERT(mg->mg_initialized);
+
+ offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
+ dva, d, allocator, try_hard);
+
+ mutex_enter(&mg->mg_lock);
+ if (offset == -1ULL) {
+ mg->mg_failed_allocations++;
+ metaslab_trace_add(zal, mg, NULL, asize, d,
+ TRACE_GROUP_FAILURE, allocator);
+ if (asize == SPA_GANGBLOCKSIZE) {
+ /*
+ * This metaslab group was unable to allocate
+ * the minimum gang block size so it must be out of
+ * space. We must notify the allocation throttle
+ * to start skipping allocation attempts to this
+ * metaslab group until more space becomes available.
+ * Note: this failure cannot be caused by the
+ * allocation throttle since the allocation throttle
+ * is only responsible for skipping devices and
+ * not failing block allocations.
+ */
+ mg->mg_no_free_space = B_TRUE;
+ }
+ }
+ mg->mg_allocations++;
+ mutex_exit(&mg->mg_lock);
+ return (offset);
+}
+
+/*
+ * Allocate a block for the specified i/o.
+ */
+int
+metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+ dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
+ zio_alloc_list_t *zal, int allocator)
+{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
+ metaslab_group_t *mg, *fast_mg, *rotor;
+ vdev_t *vd;
+ boolean_t try_hard = B_FALSE;
+
+ ASSERT(!DVA_IS_VALID(&dva[d]));
+
+ /*
+ * For testing, make some blocks above a certain size be gang blocks.
+ * This will result in more split blocks when using device removal,
+ * and a large number of split blocks coupled with ztest-induced
+ * damage can result in extremely long reconstruction times. This
+ * will also test spilling from special to normal.
+ */
+ if (psize >= metaslab_force_ganging && (spa_get_random(100) < 3)) {
+ metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
+ allocator);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ /*
+ * Start at the rotor and loop through all mgs until we find something.
+ * Note that there's no locking on mca_rotor or mca_aliquot because
+ * nothing actually breaks if we miss a few updates -- we just won't
+ * allocate quite as evenly. It all balances out over time.
+ *
+ * If we are doing ditto or log blocks, try to spread them across
+ * consecutive vdevs. If we're forced to reuse a vdev before we've
+ * allocated all of our ditto blocks, then try and spread them out on
+ * that vdev as much as possible. If it turns out to not be possible,
+ * gradually lower our standards until anything becomes acceptable.
+ * Also, allocating on consecutive vdevs (as opposed to random vdevs)
+ * gives us hope of containing our fault domains to something we're
+ * able to reason about. Otherwise, any two top-level vdev failures
+ * will guarantee the loss of data. With consecutive allocation,
+ * only two adjacent top-level vdev failures will result in data loss.
+ *
+ * If we are doing gang blocks (hintdva is non-NULL), try to keep
+ * ourselves on the same vdev as our gang block header. That
+ * way, we can hope for locality in vdev_cache, plus it makes our
+ * fault domains something tractable.
+ */
+ if (hintdva) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
+
+ /*
+ * It's possible the vdev we're using as the hint no
+ * longer exists or its mg has been closed (e.g. by
+ * device removal). Consult the rotor when
+ * all else fails.
+ */
+ if (vd != NULL && vd->vdev_mg != NULL) {
+ mg = vdev_get_mg(vd, mc);
+
+ if (flags & METASLAB_HINTBP_AVOID &&
+ mg->mg_next != NULL)
+ mg = mg->mg_next;
+ } else {
+ mg = mca->mca_rotor;
+ }
+ } else if (d != 0) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
+ mg = vd->vdev_mg->mg_next;
+ } else if (flags & METASLAB_FASTWRITE) {
+ mg = fast_mg = mca->mca_rotor;
+
+ do {
+ if (fast_mg->mg_vd->vdev_pending_fastwrite <
+ mg->mg_vd->vdev_pending_fastwrite)
+ mg = fast_mg;
+ } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor);
+
+ } else {
+ ASSERT(mca->mca_rotor != NULL);
+ mg = mca->mca_rotor;
+ }
+
+ /*
+ * If the hint put us into the wrong metaslab class, or into a
+ * metaslab group that has been passivated, just follow the rotor.
+ */
+ if (mg->mg_class != mc || mg->mg_activation_count <= 0)
+ mg = mca->mca_rotor;
+
+ rotor = mg;
+top:
+ do {
+ boolean_t allocatable;
+
+ ASSERT(mg->mg_activation_count == 1);
+ vd = mg->mg_vd;
+
+ /*
+ * Don't allocate from faulted devices.
+ */
+ if (try_hard) {
+ spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
+ allocatable = vdev_allocatable(vd);
+ spa_config_exit(spa, SCL_ZIO, FTAG);
+ } else {
+ allocatable = vdev_allocatable(vd);
+ }
+
+ /*
+ * Determine if the selected metaslab group is eligible
+ * for allocations. If we're ganging then don't allow
+ * this metaslab group to skip allocations since that would
+ * inadvertently return ENOSPC and suspend the pool
+ * even though space is still available.
+ */
+ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
+ allocatable = metaslab_group_allocatable(mg, rotor,
+ psize, allocator, d);
+ }
+
+ if (!allocatable) {
+ metaslab_trace_add(zal, mg, NULL, psize, d,
+ TRACE_NOT_ALLOCATABLE, allocator);
+ goto next;
+ }
+
+ ASSERT(mg->mg_initialized);
+
+ /*
+ * Avoid writing single-copy data to a failing,
+ * non-redundant vdev, unless we've already tried all
+ * other vdevs.
+ */
+ if ((vd->vdev_stat.vs_write_errors > 0 ||
+ vd->vdev_state < VDEV_STATE_HEALTHY) &&
+ d == 0 && !try_hard && vd->vdev_children == 0) {
+ metaslab_trace_add(zal, mg, NULL, psize, d,
+ TRACE_VDEV_ERROR, allocator);
+ goto next;
+ }
+
+ ASSERT(mg->mg_class == mc);
+
+ uint64_t asize = vdev_psize_to_asize(vd, psize);
+ ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+
+ /*
+ * If we don't need to try hard, then require that the
+ * block be on a different metaslab from any other DVAs
+ * in this BP (unique=true). If we are trying hard, then
+ * allow any metaslab to be used (unique=false).
+ */
+ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
+ !try_hard, dva, d, allocator, try_hard);
+
+ if (offset != -1ULL) {
+ /*
+ * If we've just selected this metaslab group,
+ * figure out whether the corresponding vdev is
+ * over- or under-used relative to the pool,
+ * and set an allocation bias to even it out.
+ *
+ * Bias is also used to compensate for unequally
+ * sized vdevs so that space is allocated fairly.
+ */
+ if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
+ vdev_stat_t *vs = &vd->vdev_stat;
+ int64_t vs_free = vs->vs_space - vs->vs_alloc;
+ int64_t mc_free = mc->mc_space - mc->mc_alloc;
+ int64_t ratio;
+
+ /*
+ * Calculate how much more or less we should
+ * try to allocate from this device during
+ * this iteration around the rotor.
+ *
+ * This basically introduces a zero-centered
+ * bias towards the devices with the most
+ * free space, while compensating for vdev
+ * size differences.
+ *
+ * Examples:
+ * vdev V1 = 16M/128M
+ * vdev V2 = 16M/128M
+ * ratio(V1) = 100% ratio(V2) = 100%
+ *
+ * vdev V1 = 16M/128M
+ * vdev V2 = 64M/128M
+ * ratio(V1) = 127% ratio(V2) = 72%
+ *
+ * vdev V1 = 16M/128M
+ * vdev V2 = 64M/512M
+ * ratio(V1) = 40% ratio(V2) = 160%
+ */
+ ratio = (vs_free * mc->mc_alloc_groups * 100) /
+ (mc_free + 1);
+ mg->mg_bias = ((ratio - 100) *
+ (int64_t)mg->mg_aliquot) / 100;
+ } else if (!metaslab_bias_enabled) {
+ mg->mg_bias = 0;
+ }
+
+ if ((flags & METASLAB_FASTWRITE) ||
+ atomic_add_64_nv(&mca->mca_aliquot, asize) >=
+ mg->mg_aliquot + mg->mg_bias) {
+ mca->mca_rotor = mg->mg_next;
+ mca->mca_aliquot = 0;
+ }
+
+ DVA_SET_VDEV(&dva[d], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[d], offset);
+ DVA_SET_GANG(&dva[d],
+ ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
+ DVA_SET_ASIZE(&dva[d], asize);
+
+ if (flags & METASLAB_FASTWRITE) {
+ atomic_add_64(&vd->vdev_pending_fastwrite,
+ psize);
+ }
+
+ return (0);
+ }
+next:
+ mca->mca_rotor = mg->mg_next;
+ mca->mca_aliquot = 0;
+ } while ((mg = mg->mg_next) != rotor);
+
+ /*
+ * If we haven't tried hard, perhaps do so now.
+ */
+ if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
+ GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
+ psize <= 1 << spa->spa_min_ashift)) {
+ METASLABSTAT_BUMP(metaslabstat_try_hard);
+ try_hard = B_TRUE;
+ goto top;
+ }
+
+ bzero(&dva[d], sizeof (dva_t));
+
+ metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
+ return (SET_ERROR(ENOSPC));
+}
+
+void
+metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
+ boolean_t checkpoint)
+{
+ metaslab_t *msp;
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+ ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY3U(offset, >=, msp->ms_start);
+ VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
+
+ metaslab_check_free_impl(vd, offset, asize);
+
+ mutex_enter(&msp->ms_lock);
+ if (range_tree_is_empty(msp->ms_freeing) &&
+ range_tree_is_empty(msp->ms_checkpointing)) {
+ vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
+ }
+
+ if (checkpoint) {
+ ASSERT(spa_has_checkpoint(spa));
+ range_tree_add(msp->ms_checkpointing, offset, asize);
+ } else {
+ range_tree_add(msp->ms_freeing, offset, asize);
+ }
+ mutex_exit(&msp->ms_lock);
+}
+
+/* ARGSUSED */
+void
+metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ boolean_t *checkpoint = arg;
+
+ ASSERT3P(checkpoint, !=, NULL);
+
+ if (vd->vdev_ops->vdev_op_remap != NULL)
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ else
+ metaslab_free_impl(vd, offset, size, *checkpoint);
+}
+
+static void
+metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
+ boolean_t checkpoint)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
+ return;
+
+ if (spa->spa_vdev_removal != NULL &&
+ spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
+ vdev_is_concrete(vd)) {
+ /*
+ * Note: we check if the vdev is concrete because when
+ * we complete the removal, we first change the vdev to be
+ * an indirect vdev (in open context), and then (in syncing
+ * context) clear spa_vdev_removal.
+ */
+ free_from_removing_vdev(vd, offset, size);
+ } else if (vd->vdev_ops->vdev_op_remap != NULL) {
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_free_impl_cb, &checkpoint);
+ } else {
+ metaslab_free_concrete(vd, offset, size, checkpoint);
+ }
+}
+
+typedef struct remap_blkptr_cb_arg {
+ blkptr_t *rbca_bp;
+ spa_remap_cb_t rbca_cb;
+ vdev_t *rbca_remap_vd;
+ uint64_t rbca_remap_offset;
+ void *rbca_cb_arg;
+} remap_blkptr_cb_arg_t;
+
+static void
+remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ remap_blkptr_cb_arg_t *rbca = arg;
+ blkptr_t *bp = rbca->rbca_bp;
+
+ /* We can not remap split blocks. */
+ if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
+ return;
+ ASSERT0(inner_offset);
+
+ if (rbca->rbca_cb != NULL) {
+ /*
+ * At this point we know that we are not handling split
+ * blocks and we invoke the callback on the previous
+ * vdev which must be indirect.
+ */
+ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
+ rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
+
+ /* set up remap_blkptr_cb_arg for the next call */
+ rbca->rbca_remap_vd = vd;
+ rbca->rbca_remap_offset = offset;
+ }
+
+ /*
+ * The phys birth time is that of dva[0]. This ensures that we know
+ * when each dva was written, so that resilver can determine which
+ * blocks need to be scrubbed (i.e. those written during the time
+ * the vdev was offline). It also ensures that the key used in
+ * the ARC hash table is unique (i.e. dva[0] + phys_birth). If
+ * we didn't change the phys_birth, a lookup in the ARC for a
+ * remapped BP could find the data that was previously stored at
+ * this vdev + offset.
+ */
+ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]));
+ vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
+ bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
+ DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
+
+ DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&bp->blk_dva[0], offset);
+}
+
+/*
+ * If the block pointer contains any indirect DVAs, modify them to refer to
+ * concrete DVAs. Note that this will sometimes not be possible, leaving
+ * the indirect DVA in place. This happens if the indirect DVA spans multiple
+ * segments in the mapping (i.e. it is a "split block").
+ *
+ * If the BP was remapped, calls the callback on the original dva (note the
+ * callback can be called multiple times if the original indirect DVA refers
+ * to another indirect DVA, etc).
+ *
+ * Returns TRUE if the BP was remapped.
+ */
+boolean_t
+spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
+{
+ remap_blkptr_cb_arg_t rbca;
+
+ if (!zfs_remap_blkptr_enable)
+ return (B_FALSE);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
+ return (B_FALSE);
+
+ /*
+ * Dedup BP's can not be remapped, because ddt_phys_select() depends
+ * on DVA[0] being the same in the BP as in the DDT (dedup table).
+ */
+ if (BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ /*
+ * Gang blocks can not be remapped, because
+ * zio_checksum_gang_verifier() depends on the DVA[0] that's in
+ * the BP used to read the gang block header (GBH) being the same
+ * as the DVA[0] that we allocated for the GBH.
+ */
+ if (BP_IS_GANG(bp))
+ return (B_FALSE);
+
+ /*
+ * Embedded BP's have no DVA to remap.
+ */
+ if (BP_GET_NDVAS(bp) < 1)
+ return (B_FALSE);
+
+ /*
+ * Note: we only remap dva[0]. If we remapped other dvas, we
+ * would no longer know what their phys birth txg is.
+ */
+ dva_t *dva = &bp->blk_dva[0];
+
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+ if (vd->vdev_ops->vdev_op_remap == NULL)
+ return (B_FALSE);
+
+ rbca.rbca_bp = bp;
+ rbca.rbca_cb = callback;
+ rbca.rbca_remap_vd = vd;
+ rbca.rbca_remap_offset = offset;
+ rbca.rbca_cb_arg = arg;
+
+ /*
+ * remap_blkptr_cb() will be called in order for each level of
+ * indirection, until a concrete vdev is reached or a split block is
+	 * encountered. rbca_remap_vd and rbca_remap_offset are updated within
+	 * the callback as we go from one indirect vdev to the next (either
+	 * concrete or indirect again).
+ */
+ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
+
+ /* Check if the DVA wasn't remapped because it is a split block */
+ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
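+
+/*
+ * Illustrative sketch (not compiled here): a caller that wants to learn
+ * which indirect <vdev, offset> ranges a BP used to occupy can pass a
+ * spa_remap_cb_t callback; the callback below (its name and message are
+ * hypothetical) simply logs each prior mapping:
+ *
+ *	static void
+ *	note_prior_mapping_cb(uint64_t vdev, uint64_t offset, uint64_t size,
+ *	    void *arg)
+ *	{
+ *		zfs_dbgmsg("BP was at vdev %llu offset %llu asize %llu",
+ *		    (u_longlong_t)vdev, (u_longlong_t)offset,
+ *		    (u_longlong_t)size);
+ *	}
+ *
+ *	if (spa_remap_blkptr(spa, bp, note_prior_mapping_cb, NULL))
+ *		zfs_dbgmsg("bp->blk_dva[0] now refers to a concrete vdev");
+ */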
+
+/*
+ * Undo the allocation of a DVA which happened in the given transaction group.
+ */
+void
+metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+ metaslab_t *msp;
+ vdev_t *vd;
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+
+ ASSERT(DVA_IS_VALID(dva));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ if (txg > spa_freeze_txg(spa))
+ return;
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
+ (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
+ zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
+ (u_longlong_t)vdev, (u_longlong_t)offset,
+ (u_longlong_t)size);
+ return;
+ }
+
+ ASSERT(!vd->vdev_removing);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
+ ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ mutex_enter(&msp->ms_lock);
+ range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
+ offset, size);
+ msp->ms_allocating_total -= size;
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY3U(offset, >=, msp->ms_start);
+ VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
+ VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
+ msp->ms_size);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ range_tree_add(msp->ms_allocatable, offset, size);
+ mutex_exit(&msp->ms_lock);
+}
+
+/*
+ * Free the block represented by the given DVA.
+ */
+void
+metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+
+ ASSERT(DVA_IS_VALID(dva));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ if (DVA_GET_GANG(dva)) {
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ }
+
+ metaslab_free_impl(vd, offset, size, checkpoint);
+}
+
+/*
+ * Reserve some allocation slots. The reservation system must be called
+ * before we call into the allocator. If there aren't any available slots
+ * then the I/O will be throttled until an I/O completes and its slots are
+ * freed up. The function returns true if it was successful in placing
+ * the reservation.
+ */
+boolean_t
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
+ zio_t *zio, int flags)
+{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
+ uint64_t available_slots = 0;
+ boolean_t slot_reserved = B_FALSE;
+ uint64_t max = mca->mca_alloc_max_slots;
+
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ mutex_enter(&mc->mc_lock);
+
+ uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots);
+ if (reserved_slots < max)
+ available_slots = max - reserved_slots;
+
+ if (slots <= available_slots || GANG_ALLOCATION(flags) ||
+ flags & METASLAB_MUST_RESERVE) {
+ /*
+ * We reserve the slots individually so that we can unreserve
+ * them individually when an I/O completes.
+ */
+ for (int d = 0; d < slots; d++)
+ zfs_refcount_add(&mca->mca_alloc_slots, zio);
+ zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
+ slot_reserved = B_TRUE;
+ }
+
+ mutex_exit(&mc->mc_lock);
+ return (slot_reserved);
+}
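+
+/*
+ * Illustrative usage sketch (hypothetical caller, not compiled here; mc,
+ * copies, and zio stand in for the caller's state): a writer reserves one
+ * slot per copy before calling into the allocator and releases the same
+ * number when its zio completes:
+ *
+ *	if (!metaslab_class_throttle_reserve(mc, copies,
+ *	    zio->io_allocator, zio, 0)) {
+ *		...	(throttled: requeue the zio and retry later)
+ *	}
+ *	... allocate and issue the I/O; then, on zio completion:
+ *	metaslab_class_throttle_unreserve(mc, copies,
+ *	    zio->io_allocator, zio);
+ */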
+
+void
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
+ int allocator, zio_t *zio)
+{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
+
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ mutex_enter(&mc->mc_lock);
+ for (int d = 0; d < slots; d++)
+ zfs_refcount_remove(&mca->mca_alloc_slots, zio);
+ mutex_exit(&mc->mc_lock);
+}
+
+static int
+metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
+ uint64_t txg)
+{
+ metaslab_t *msp;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+
+ if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
+ return (SET_ERROR(ENXIO));
+
+ ASSERT3P(vd->vdev_ms, !=, NULL);
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ mutex_enter(&msp->ms_lock);
+
+ if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
+ error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
+ if (error == EBUSY) {
+ ASSERT(msp->ms_loaded);
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+ error = 0;
+ }
+ }
+
+ if (error == 0 &&
+ !range_tree_contains(msp->ms_allocatable, offset, size))
+ error = SET_ERROR(ENOENT);
+
+ if (error || txg == 0) { /* txg == 0 indicates dry run */
+ mutex_exit(&msp->ms_lock);
+ return (error);
+ }
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
+ msp->ms_size);
+ range_tree_remove(msp->ms_allocatable, offset, size);
+ range_tree_clear(msp->ms_trim, offset, size);
+
+ if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ msp->ms_selected_txg = txg;
+ multilist_sublist_insert_head(mls, msp);
+ }
+ multilist_sublist_unlock(mls);
+
+ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ range_tree_add(msp->ms_allocating[txg & TXG_MASK],
+ offset, size);
+ msp->ms_allocating_total += size;
+ }
+
+ mutex_exit(&msp->ms_lock);
+
+ return (0);
+}
+
+typedef struct metaslab_claim_cb_arg_t {
+ uint64_t mcca_txg;
+ int mcca_error;
+} metaslab_claim_cb_arg_t;
+
+/* ARGSUSED */
+static void
+metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ metaslab_claim_cb_arg_t *mcca_arg = arg;
+
+ if (mcca_arg->mcca_error == 0) {
+ mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
+ size, mcca_arg->mcca_txg);
+ }
+}
+
+int
+metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
+{
+ if (vd->vdev_ops->vdev_op_remap != NULL) {
+ metaslab_claim_cb_arg_t arg;
+
+ /*
+ * Only zdb(8) can claim on indirect vdevs. This is used
+ * to detect leaks of mapped space (that are not accounted
+ * for in the obsolete counts, spacemap, or bpobj).
+ */
+ ASSERT(!spa_writeable(vd->vdev_spa));
+ arg.mcca_error = 0;
+ arg.mcca_txg = txg;
+
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_claim_impl_cb, &arg);
+
+ if (arg.mcca_error == 0) {
+ arg.mcca_error = metaslab_claim_concrete(vd,
+ offset, size, txg);
+ }
+ return (arg.mcca_error);
+ } else {
+ return (metaslab_claim_concrete(vd, offset, size, txg));
+ }
+}
+
+/*
+ * Intent log support: upon opening the pool after a crash, notify the SPA
+ * of blocks that the intent log has allocated for immediate write, but
+ * which are still considered free by the SPA because the last transaction
+ * group didn't commit yet.
+ */
+static int
+metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd;
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ ASSERT(DVA_IS_VALID(dva));
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ return (metaslab_claim_impl(vd, offset, size, txg));
+}
+
+int
+metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
+ int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
+ zio_alloc_list_t *zal, zio_t *zio, int allocator)
+{
+ dva_t *dva = bp->blk_dva;
+ dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
+ int error = 0;
+
+ ASSERT(bp->blk_birth == 0);
+ ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+
+ if (mc->mc_allocator[allocator].mca_rotor == NULL) {
+ /* no vdevs in this class */
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
+ ASSERT(BP_GET_NDVAS(bp) == 0);
+ ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
+ ASSERT3P(zal, !=, NULL);
+
+ for (int d = 0; d < ndvas; d++) {
+ error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
+ txg, flags, zal, allocator);
+ if (error != 0) {
+ for (d--; d >= 0; d--) {
+ metaslab_unalloc_dva(spa, &dva[d], txg);
+ metaslab_group_alloc_decrement(spa,
+ DVA_GET_VDEV(&dva[d]), zio, flags,
+ allocator, B_FALSE);
+ bzero(&dva[d], sizeof (dva_t));
+ }
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+ return (error);
+ } else {
+ /*
+ * Update the metaslab group's queue depth
+ * based on the newly allocated dva.
+ */
+ metaslab_group_alloc_increment(spa,
+ DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
+ }
+ }
+ ASSERT(error == 0);
+ ASSERT(BP_GET_NDVAS(bp) == ndvas);
+
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ BP_SET_BIRTH(bp, txg, 0);
+
+ return (0);
+}
+
+void
+metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
+
+ /*
+ * If we have a checkpoint for the pool we need to make sure that
+ * the blocks that we free that are part of the checkpoint won't be
+ * reused until the checkpoint is discarded or we revert to it.
+ *
+ * The checkpoint flag is passed down the metaslab_free code path
+ * and is set whenever we want to add a block to the checkpoint's
+ * accounting. That is, we "checkpoint" blocks that existed at the
+ * time the checkpoint was created and are therefore referenced by
+ * the checkpointed uberblock.
+ *
+	 * Note that we don't checkpoint any blocks if the current
+ * syncing txg <= spa_checkpoint_txg. We want these frees to sync
+ * normally as they will be referenced by the checkpointed uberblock.
+ */
+ boolean_t checkpoint = B_FALSE;
+ if (bp->blk_birth <= spa->spa_checkpoint_txg &&
+ spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
+ /*
+ * At this point, if the block is part of the checkpoint
+ * there is no way it was created in the current txg.
+ */
+ ASSERT(!now);
+ ASSERT3U(spa_syncing_txg(spa), ==, txg);
+ checkpoint = B_TRUE;
+ }
+
+ spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
+
+ for (int d = 0; d < ndvas; d++) {
+ if (now) {
+ metaslab_unalloc_dva(spa, &dva[d], txg);
+ } else {
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+ metaslab_free_dva(spa, &dva[d], checkpoint);
+ }
+ }
+
+ spa_config_exit(spa, SCL_FREE, FTAG);
+}
+
+int
+metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+ int error = 0;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ if (txg != 0) {
+ /*
+ * First do a dry run to make sure all DVAs are claimable,
+ * so we don't have to unwind from partial failures below.
+ */
+ if ((error = metaslab_claim(spa, bp, 0)) != 0)
+ return (error);
+ }
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+
+ for (int d = 0; d < ndvas; d++) {
+ error = metaslab_claim_dva(spa, &dva[d], txg);
+ if (error != 0)
+ break;
+ }
+
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ ASSERT(error == 0 || txg == 0);
+
+ return (error);
+}
+
+void
+metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ int d;
+ vdev_t *vd;
+
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(psize > 0);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (d = 0; d < ndvas; d++) {
+ if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+ continue;
+ atomic_add_64(&vd->vdev_pending_fastwrite, psize);
+ }
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
+void
+metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ int d;
+ vdev_t *vd;
+
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(psize > 0);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (d = 0; d < ndvas; d++) {
+ if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+ continue;
+ ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
+ atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
+ }
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
+/* ARGSUSED */
+static void
+metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ if (vd->vdev_ops == &vdev_indirect_ops)
+ return;
+
+ metaslab_check_free_impl(vd, offset, size);
+}
+
+static void
+metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
+{
+ metaslab_t *msp;
+ spa_t *spa __maybe_unused = vd->vdev_spa;
+
+ if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
+ return;
+
+ if (vd->vdev_ops->vdev_op_remap != NULL) {
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_check_free_impl_cb, NULL);
+ return;
+ }
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ mutex_enter(&msp->ms_lock);
+ if (msp->ms_loaded) {
+ range_tree_verify_not_present(msp->ms_allocatable,
+ offset, size);
+ }
+
+ /*
+ * Check all segments that currently exist in the freeing pipeline.
+ *
+ * It would intuitively make sense to also check the current allocating
+ * tree since metaslab_unalloc_dva() exists for extents that are
+ * allocated and freed in the same sync pass within the same txg.
+ * Unfortunately there are places (e.g. the ZIL) where we allocate a
+ * segment but then we free part of it within the same txg
+ * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
+ * current allocating tree.
+ */
+ range_tree_verify_not_present(msp->ms_freeing, offset, size);
+ range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
+ range_tree_verify_not_present(msp->ms_freed, offset, size);
+ for (int j = 0; j < TXG_DEFER_SIZE; j++)
+ range_tree_verify_not_present(msp->ms_defer[j], offset, size);
+ range_tree_verify_not_present(msp->ms_trim, offset, size);
+ mutex_exit(&msp->ms_lock);
+}
+
+void
+metaslab_check_free(spa_t *spa, const blkptr_t *bp)
+{
+ if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
+ return;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+ uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+ uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+ uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
+
+ if (DVA_GET_GANG(&bp->blk_dva[i]))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ ASSERT3P(vd, !=, NULL);
+
+ metaslab_check_free_impl(vd, offset, size);
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
+static void
+metaslab_group_disable_wait(metaslab_group_t *mg)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
+ while (mg->mg_disabled_updating) {
+ cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
+ }
+}
+
+static void
+metaslab_group_disabled_increment(metaslab_group_t *mg)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
+ ASSERT(mg->mg_disabled_updating);
+
+ while (mg->mg_ms_disabled >= max_disabled_ms) {
+ cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
+ }
+ mg->mg_ms_disabled++;
+ ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
+}
+
+/*
+ * Mark the metaslab as disabled to prevent any allocations on this metaslab.
+ * We must also track how many metaslabs are currently disabled within a
+ * metaslab group and limit them to prevent allocation failures from
+ * occurring because all metaslabs are disabled.
+ */
+void
+metaslab_disable(metaslab_t *msp)
+{
+ ASSERT(!MUTEX_HELD(&msp->ms_lock));
+ metaslab_group_t *mg = msp->ms_group;
+
+ mutex_enter(&mg->mg_ms_disabled_lock);
+
+ /*
+ * To keep an accurate count of how many threads have disabled
+ * a specific metaslab group, we only allow one thread to mark
+ * the metaslab group at a time. This ensures that the value of
+ * ms_disabled will be accurate when we decide to mark a metaslab
+ * group as disabled. To do this we force all other threads
+	 * to wait until the metaslab group's mg_disabled_updating flag is
+	 * no longer set.
+ */
+ metaslab_group_disable_wait(mg);
+ mg->mg_disabled_updating = B_TRUE;
+ if (msp->ms_disabled == 0) {
+ metaslab_group_disabled_increment(mg);
+ }
+ mutex_enter(&msp->ms_lock);
+ msp->ms_disabled++;
+ mutex_exit(&msp->ms_lock);
+
+ mg->mg_disabled_updating = B_FALSE;
+ cv_broadcast(&mg->mg_ms_disabled_cv);
+ mutex_exit(&mg->mg_ms_disabled_lock);
+}
+
+void
+metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ spa_t *spa = mg->mg_vd->vdev_spa;
+
+ /*
+ * Wait for the outstanding IO to be synced to prevent newly
+	 * allocated blocks from being overwritten. This is used by
+	 * initialize and TRIM, which are modifying unallocated space.
+ */
+ if (sync)
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ mutex_enter(&mg->mg_ms_disabled_lock);
+ mutex_enter(&msp->ms_lock);
+ if (--msp->ms_disabled == 0) {
+ mg->mg_ms_disabled--;
+ cv_broadcast(&mg->mg_ms_disabled_cv);
+ if (unload)
+ metaslab_unload(msp);
+ }
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&mg->mg_ms_disabled_lock);
+}
+
+static void
+metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
+{
+ vdev_t *vd = ms->ms_group->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa_meta_objset(spa);
+
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ metaslab_unflushed_phys_t entry = {
+ .msp_unflushed_txg = metaslab_unflushed_txg(ms),
+ };
+ uint64_t entry_size = sizeof (entry);
+ uint64_t entry_offset = ms->ms_id * entry_size;
+
+ uint64_t object = 0;
+ int err = zap_lookup(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
+ &object);
+ if (err == ENOENT) {
+ object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+ VERIFY0(zap_add(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
+ &object, tx));
+ } else {
+ VERIFY0(err);
+ }
+
+ dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
+ &entry, tx);
+}
+
+void
+metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
+{
+ spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ ms->ms_unflushed_txg = txg;
+ metaslab_update_ondisk_flush_data(ms, tx);
+}
+
+uint64_t
+metaslab_unflushed_txg(metaslab_t *ms)
+{
+ return (ms->ms_unflushed_txg);
+}
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, ULONG, ZMOD_RW,
+ "Allocation granularity (a.k.a. stripe size)");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
+ "Load all metaslabs when pool is first opened");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
+ "Prevent metaslabs from being unloaded");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
+ "Preload potential metaslabs during reassessment");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, INT, ZMOD_RW,
+ "Delay in txgs after metaslab was last used before unloading");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, INT, ZMOD_RW,
+ "Delay in milliseconds after metaslab was last used before unloading");
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, INT, ZMOD_RW,
+ "Percentage of metaslab group size that should be free to make it "
+ "eligible for allocation");
+
+ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, INT, ZMOD_RW,
+ "Percentage of metaslab group size that should be considered eligible "
+ "for allocations unless all metaslab groups within the metaslab class "
+ "have also crossed this threshold");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, INT,
+ ZMOD_RW, "Fragmentation for metaslab to allow allocation");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW,
+ "Use the fragmentation metric to prefer less fragmented metaslabs");
+/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
+ "Prefer metaslabs with lower LBAs");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
+ "Enable metaslab group biasing");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
+ ZMOD_RW, "Enable segment-based metaslab selection");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
+ "Segment-based metaslab selection maximum buckets before switching");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, ULONG, ZMOD_RW,
+ "Blocks larger than this size are forced to be gang blocks");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, INT, ZMOD_RW,
+ "Max distance (bytes) to search forward before using size tree");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
+ "When looking in size tree, use largest segment instead of exact fit");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG,
+ ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW,
+ "Percentage of memory that can be used to store metaslab range trees");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
+ ZMOD_RW, "Try hard to allocate before ganging");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW,
+ "Normally only consider this many of the best metaslabs in each vdev");
diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c
new file mode 100644
index 000000000000..d05c9db24c20
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/mmp.c
@@ -0,0 +1,741 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/abd.h>
+#include <sys/mmp.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/time.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/callb.h>
+
+/*
+ * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
+ * or opening a pool on more than one host at a time. In particular, it
+ * prevents "zpool import -f" on a host from succeeding while the pool is
+ * already imported on another host. There are many other ways in which a
+ * device could be used by two hosts for different purposes at the same time
+ * resulting in pool damage. This implementation does not attempt to detect
+ * those cases.
+ *
+ * MMP operates by ensuring there are frequent visible changes on disk (a
+ * "heartbeat") at all times, and by altering the import process to check
+ * for these changes and fail the import when they are detected. This
+ * functionality is enabled by setting the 'multihost' pool property to on.
+ *
+ * Uberblocks written by the txg_sync thread always go into the first
+ * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP.
+ * They are used to hold uberblocks which are exactly the same as the last
+ * synced uberblock except that the ub_timestamp and mmp_config are frequently
+ * updated. Like all other uberblocks, the slot is written with an embedded
+ * checksum, and slots with invalid checksums are ignored. This provides the
+ * "heartbeat", with no risk of overwriting good uberblocks that must be
+ * preserved, e.g. previous txgs and associated block pointers.
+ *
+ * Three optional fields are added to the uberblock structure: ub_mmp_magic,
+ * ub_mmp_config, and ub_mmp_delay. The ub_mmp_magic value allows zfs to tell
+ * whether the other ub_mmp_* fields are valid. The ub_mmp_config field tells
+ * the importing host the settings of zfs_multihost_interval and
+ * zfs_multihost_fail_intervals on the host which last had (or currently has)
+ * the pool imported. These determine how long a host must wait to detect
+ * activity in the pool, before concluding the pool is not in use. The
+ * mmp_delay field is a decaying average of the amount of time between
+ * completion of successive MMP writes, in nanoseconds. It indicates whether
+ * MMP is enabled.
+ *
+ * During import an activity test may now be performed to determine if
+ * the pool is in use. The activity test is typically required if the
+ * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
+ * POOL_STATE_ACTIVE, and the pool is not a root pool.
+ *
+ * The activity test finds the "best" uberblock (highest txg, timestamp, and, if
+ * ub_mmp_magic is valid, sequence number from ub_mmp_config). It then waits
+ * some time, and finds the "best" uberblock again. If any of the mentioned
+ * fields have different values in the newly read uberblock, the pool is in use
+ * by another host and the import fails. In order to assure the accuracy of the
+ * activity test, the default values result in an activity test duration of 20x
+ * the mmp write interval.
+ *
+ * The duration of the "zpool import" activity test depends on the information
+ * available in the "best" uberblock:
+ *
+ * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0:
+ * ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2
+ *
+ * In this case, a weak guarantee is provided. Since the host which last had
+ * the pool imported will suspend the pool if no mmp writes land within
+ * fail_intervals * multihost_interval ms, the absence of writes during that
+ * time means either the pool is not imported, or it is imported but the pool
+ * is suspended and no further writes will occur.
+ *
+ * Note that resuming the suspended pool on the remote host would invalidate
+ * this guarantee, and so it is not allowed.
+ *
+ * The factor of 2 provides a conservative safety margin and derives from
+ * MMP_IMPORT_SAFETY_FACTOR.
+ *
+ * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0:
+ * (ub_mmp_config.multihost_interval + ub_mmp_delay) *
+ * zfs_multihost_import_intervals
+ *
+ * In this case no guarantee can be provided. However, as long as some devices
+ * are healthy and connected, it is likely that at least one write will land
+ * within (multihost_interval + mmp_delay) because multihost_interval is
+ * enough time for a write to be attempted to each leaf vdev, and mmp_delay
+ * is enough for one to land, based on past delays. Multiplying by
+ * zfs_multihost_import_intervals provides a conservative safety factor.
+ *
+ * 3) If uberblock was written by zfs-0.7:
+ * (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals
+ *
+ * The same logic as case #2 applies, but we do not know remote tunables.
+ *
+ * We use the local value for zfs_multihost_interval because the original MMP
+ * did not record this value in the uberblock.
+ *
+ * ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host
+ * has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect
+ * that. We will have waited enough time for zfs_multihost_import_intervals
+ * writes to be issued and all but one to land.
+ *
+ * single device pool example delays
+ *
+ * import_delay = (1 + 1) * 20 = 40s #defaults, no I/O delay
+ * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay
+ * import_delay = (10 + 10) * 20 = 400s #10s multihost_interval,
+ * no I/O delay
+ * 100 device pool example delays
+ *
+ * import_delay = (1 + .01) * 20 = 20s #defaults, no I/O delay
+ * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay
+ * import_delay = (10 + .1) * 20 = 202s #10s multihost_interval,
+ * no I/O delay
+ *
+ * 4) Otherwise, this uberblock was written by a pre-MMP zfs:
+ * zfs_multihost_import_intervals * zfs_multihost_interval
+ *
+ * In this case local tunables are used. By default this product = 10s, long
+ * enough for a pool with any activity at all to write at least one
+ * uberblock. No guarantee can be provided.
+ *
+ * Additionally, the duration is then extended by a random 25% to attempt to
+ * detect simultaneous imports, for example when both partner hosts are
+ * rebooted at the same time and automatically attempt to import the pool.
+ */
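+
+/*
+ * A minimal sketch of case 2 above (not compiled here; the helper name is
+ * hypothetical), reproducing the single-device example delays with the
+ * default multihost_interval of 1000 ms and import_intervals of 20:
+ *
+ *	static uint64_t
+ *	example_import_delay_ms(uint64_t interval_ms, uint64_t mmp_delay_ms,
+ *	    uint_t import_intervals)
+ *	{
+ *		return ((interval_ms + mmp_delay_ms) * import_intervals);
+ *	}
+ *
+ *	example_import_delay_ms(1000, 1000, 20)  ->  40000 ms ( 40s)
+ *	example_import_delay_ms(1000, 10000, 20) -> 220000 ms (220s)
+ */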
+
+/*
+ * Used to control the frequency of mmp writes which are performed when the
+ * 'multihost' pool property is on. This is one factor used to determine the
+ * length of the activity check during import.
+ *
+ * On average an mmp write will be issued for each leaf vdev every
+ * zfs_multihost_interval milliseconds. In practice, the observed period can
+ * vary with the I/O load, and this observed value is the ub_mmp_delay that is
+ * stored in the uberblock. The minimum allowed value is 100 ms.
+ */
+ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
+
+/*
+ * Used to control the duration of the activity test on import. Smaller values
+ * of zfs_multihost_import_intervals will reduce the import time but increase
+ * the risk of failing to detect an active pool. The total activity check time
+ * is never allowed to drop below one second. A value of 0 is ignored and
+ * treated as if it was set to 1.
+ */
+uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;
+
+/*
+ * Controls the behavior of the pool when mmp write failures or delays are
+ * detected.
+ *
+ * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are
+ * ignored. The failures will still be reported to the ZED which, depending on
+ * its configuration, may take action such as suspending the pool or taking a
+ * device offline.
+ *
+ * When zfs_multihost_fail_intervals > 0, the pool will be suspended if
+ * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass
+ * without a successful mmp write. This guarantees the activity test will see
+ * mmp writes if the pool is imported. A value of 1 is ignored and treated as
+ * if it was set to 2, because a single leaf vdev pool will issue a write once
+ * per multihost_interval and thus any variation in latency would cause the
+ * pool to be suspended.
+ */
+uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
+
+char *mmp_tag = "mmp_write_uberblock";
+static void mmp_thread(void *arg);
+
+void
+mmp_init(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ mmp->mmp_kstat_id = 1;
+}
+
+void
+mmp_fini(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_destroy(&mmp->mmp_thread_lock);
+ cv_destroy(&mmp->mmp_thread_cv);
+ mutex_destroy(&mmp->mmp_io_lock);
+}
+
+static void
+mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
+{
+ CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
+ mutex_enter(&mmp->mmp_thread_lock);
+}
+
+static void
+mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
+{
+ ASSERT(*mpp != NULL);
+ *mpp = NULL;
+ cv_broadcast(&mmp->mmp_thread_cv);
+ CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */
+ thread_exit();
+}
+
+void
+mmp_thread_start(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ if (spa_writeable(spa)) {
+ mutex_enter(&mmp->mmp_thread_lock);
+ if (!mmp->mmp_thread) {
+ mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
+ spa, 0, &p0, TS_RUN, defclsyspri);
+ zfs_dbgmsg("MMP thread started pool '%s' "
+ "gethrtime %llu", spa_name(spa), gethrtime());
+ }
+ mutex_exit(&mmp->mmp_thread_lock);
+ }
+}
+
+void
+mmp_thread_stop(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_enter(&mmp->mmp_thread_lock);
+ mmp->mmp_thread_exiting = 1;
+ cv_broadcast(&mmp->mmp_thread_cv);
+
+ while (mmp->mmp_thread) {
+ cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
+ }
+ mutex_exit(&mmp->mmp_thread_lock);
+ zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
+ spa_name(spa), gethrtime());
+
+ ASSERT(mmp->mmp_thread == NULL);
+ mmp->mmp_thread_exiting = 0;
+}
+
+typedef enum mmp_vdev_state_flag {
+ MMP_FAIL_NOT_WRITABLE = (1 << 0),
+ MMP_FAIL_WRITE_PENDING = (1 << 1),
+} mmp_vdev_state_flag_t;
+
+/*
+ * Find a leaf vdev to write an MMP block to. It must not have an outstanding
+ * mmp write (if so a new write will also likely block). If there is no usable
+ * leaf, a nonzero error value is returned. The error value returned is a bit
+ * field.
+ *
+ * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an
+ * outstanding MMP write.
+ * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable.
+ */
+
+static int
+mmp_next_leaf(spa_t *spa)
+{
+ vdev_t *leaf;
+ vdev_t *starting_leaf;
+ int fail_mask = 0;
+
+ ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock));
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_READER));
+ ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE);
+ ASSERT(!list_is_empty(&spa->spa_leaf_list));
+
+ if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) {
+ spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list);
+ spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen;
+ }
+
+ leaf = spa->spa_mmp.mmp_last_leaf;
+ if (leaf == NULL)
+ leaf = list_head(&spa->spa_leaf_list);
+ starting_leaf = leaf;
+
+ do {
+ leaf = list_next(&spa->spa_leaf_list, leaf);
+ if (leaf == NULL)
+ leaf = list_head(&spa->spa_leaf_list);
+
+ /*
+ * We skip unwritable, offline, detached, and dRAID spare
+ * devices as they are either not legal targets or the write
+ * may fail or not be seen by other hosts. Skipped dRAID
+ * spares can never be written so the fail mask is not set.
+ */
+ if (!vdev_writeable(leaf) || leaf->vdev_offline ||
+ leaf->vdev_detached) {
+ fail_mask |= MMP_FAIL_NOT_WRITABLE;
+ } else if (leaf->vdev_ops == &vdev_draid_spare_ops) {
+ continue;
+ } else if (leaf->vdev_mmp_pending != 0) {
+ fail_mask |= MMP_FAIL_WRITE_PENDING;
+ } else {
+ spa->spa_mmp.mmp_last_leaf = leaf;
+ return (0);
+ }
+ } while (leaf != starting_leaf);
+
+ ASSERT(fail_mask);
+
+ return (fail_mask);
+}
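+
+/*
+ * Illustrative sketch (not compiled here): because the return value is a
+ * bit field, a caller can tell "all writeable leaves are busy" apart from
+ * "some leaves are not writeable" without extra state:
+ *
+ *	int fail = mmp_next_leaf(spa);
+ *	if (fail & MMP_FAIL_WRITE_PENDING)
+ *		...	(a writeable leaf still has an MMP write in flight)
+ *	if (fail & MMP_FAIL_NOT_WRITABLE)
+ *		...	(at least one leaf could not be written)
+ */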
+
+/*
+ * MMP writes are issued on a fixed schedule, but may complete at variable,
+ * much longer, intervals. The mmp_delay captures long periods between
+ * successful writes for any reason, including disk latency, scheduling delays,
+ * etc.
+ *
+ * The mmp_delay is usually calculated as a decaying average, but if the latest
+ * delay is higher we do not average it, so that we do not hide sudden spikes
+ * which the importing host must wait for.
+ *
+ * If writes are occurring frequently, such as due to a high rate of txg syncs,
+ * the mmp_delay could become very small. Since those short delays depend on
+ * activity we cannot count on, we never allow mmp_delay to get lower than the
+ * rate expected if only mmp_thread writes occur.
+ *
+ * If an mmp write was skipped or fails, and we have already waited longer than
+ * mmp_delay, we need to update it so the next write reflects the longer delay.
+ *
+ * Do not set mmp_delay if the multihost property is not on, so as not to
+ * trigger an activity check on import.
+ */
+static void
+mmp_delay_update(spa_t *spa, boolean_t write_completed)
+{
+ mmp_thread_t *mts = &spa->spa_mmp;
+ hrtime_t delay = gethrtime() - mts->mmp_last_write;
+
+ ASSERT(MUTEX_HELD(&mts->mmp_io_lock));
+
+ if (spa_multihost(spa) == B_FALSE) {
+ mts->mmp_delay = 0;
+ return;
+ }
+
+ if (delay > mts->mmp_delay)
+ mts->mmp_delay = delay;
+
+ if (write_completed == B_FALSE)
+ return;
+
+ mts->mmp_last_write = gethrtime();
+
+ /*
+ * strictly less than, in case delay was changed above.
+ */
+ if (delay < mts->mmp_delay) {
+ hrtime_t min_delay =
+ MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) /
+ MAX(1, vdev_count_leaves(spa));
+ mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
+ min_delay);
+ }
+}
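+
+/*
+ * A minimal worked example of the decaying average above, with values
+ * assumed purely for illustration: given mmp_delay = 128 ms and a new
+ * delay of 0 ns (and ignoring the min_delay clamp),
+ *
+ *	(0 + 128 ms * 127) / 128 = 127 ms
+ *
+ * so one fast write only nudges the average down by about 1/128th, while
+ * a new delay larger than the average (say 500 ms) replaces it outright
+ * via the "delay > mts->mmp_delay" branch.
+ */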
+
+static void
+mmp_write_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ vdev_t *vd = zio->io_vd;
+ mmp_thread_t *mts = zio->io_private;
+
+ mutex_enter(&mts->mmp_io_lock);
+ uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
+ hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
+
+ mmp_delay_update(spa, (zio->io_error == 0));
+
+ vd->vdev_mmp_pending = 0;
+ vd->vdev_mmp_kstat_id = 0;
+
+ mutex_exit(&mts->mmp_io_lock);
+ spa_config_exit(spa, SCL_STATE, mmp_tag);
+
+ spa_mmp_history_set(spa, mmp_kstat_id, zio->io_error,
+ mmp_write_duration);
+
+ abd_free(zio->io_abd);
+}
+
+/*
+ * When the on-disk uberblock is updated by a spa_sync, creating a new
+ * "best" uberblock, update the copy stored in the mmp thread state,
+ * which is used for mmp writes.
+ */
+void
+mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_ub = *ub;
+ mmp->mmp_seq = 1;
+ mmp->mmp_ub.ub_timestamp = gethrestime_sec();
+ mmp_delay_update(spa, B_TRUE);
+ mutex_exit(&mmp->mmp_io_lock);
+}
+
+/*
+ * Choose a random vdev, label, and MMP block, and write over it
+ * with a copy of the last-synced uberblock, whose timestamp
+ * has been updated to reflect that the pool is in use.
+ */
+static void
+mmp_write_uberblock(spa_t *spa)
+{
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ mmp_thread_t *mmp = &spa->spa_mmp;
+ uberblock_t *ub;
+ vdev_t *vd = NULL;
+ int label, error;
+ uint64_t offset;
+
+ hrtime_t lock_acquire_time = gethrtime();
+ spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
+ lock_acquire_time = gethrtime() - lock_acquire_time;
+ if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
+ zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
+ "gethrtime %llu", spa_name(spa), lock_acquire_time,
+ gethrtime());
+
+ mutex_enter(&mmp->mmp_io_lock);
+
+ error = mmp_next_leaf(spa);
+
+ /*
+ * spa_mmp_history has two types of entries:
+ * Issued MMP write: records time issued, error status, etc.
+ * Skipped MMP write: an MMP write could not be issued because no
+ * suitable leaf vdev was available. See comment above struct
+ * spa_mmp_history for details.
+ */
+
+ if (error) {
+ mmp_delay_update(spa, B_FALSE);
+ if (mmp->mmp_skip_error == error) {
+ spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
+ } else {
+ mmp->mmp_skip_error = error;
+ spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
+ gethrestime_sec(), mmp->mmp_delay, NULL, 0,
+ mmp->mmp_kstat_id++, error);
+ zfs_dbgmsg("MMP error choosing leaf pool '%s' "
+ "gethrtime %llu fail_mask %#x", spa_name(spa),
+ gethrtime(), error);
+ }
+ mutex_exit(&mmp->mmp_io_lock);
+ spa_config_exit(spa, SCL_STATE, mmp_tag);
+ return;
+ }
+
+ vd = spa->spa_mmp.mmp_last_leaf;
+ if (mmp->mmp_skip_error != 0) {
+ mmp->mmp_skip_error = 0;
+ zfs_dbgmsg("MMP write after skipping due to unavailable "
+ "leaves, pool '%s' gethrtime %llu leaf %#llu",
+ spa_name(spa), gethrtime(), vd->vdev_guid);
+ }
+
+ if (mmp->mmp_zio_root == NULL)
+ mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
+ flags | ZIO_FLAG_GODFATHER);
+
+ if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) {
+ /*
+		 * We want to reset mmp_seq when the timestamp advances,
+		 * because after an mmp_seq wrap new values will not be
+		 * chosen by uberblock_compare() as the "best".
+ */
+ mmp->mmp_ub.ub_timestamp = gethrestime_sec();
+ mmp->mmp_seq = 1;
+ }
+
+ ub = &mmp->mmp_ub;
+ ub->ub_mmp_magic = MMP_MAGIC;
+ ub->ub_mmp_delay = mmp->mmp_delay;
+ ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) |
+ MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) |
+ MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK(
+ zfs_multihost_fail_intervals));
+ vd->vdev_mmp_pending = gethrtime();
+ vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;
+
+ zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
+ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
+ abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
+
+ mmp->mmp_seq++;
+ mmp->mmp_kstat_id++;
+ mutex_exit(&mmp->mmp_io_lock);
+
+ offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
+ MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));
+
+ label = spa_get_random(VDEV_LABELS);
+ vdev_label_write(zio, vd, label, ub_abd, offset,
+ VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
+ flags | ZIO_FLAG_DONT_PROPAGATE);
+
+ (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
+ ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);
+
+ zio_nowait(zio);
+}
+
+static void
+mmp_thread(void *arg)
+{
+ spa_t *spa = (spa_t *)arg;
+ mmp_thread_t *mmp = &spa->spa_mmp;
+ boolean_t suspended = spa_suspended(spa);
+ boolean_t multihost = spa_multihost(spa);
+ uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
+ zfs_multihost_interval));
+ uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK(
+ zfs_multihost_fail_intervals);
+ hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval;
+ boolean_t last_spa_suspended = suspended;
+ boolean_t last_spa_multihost = multihost;
+ uint64_t last_mmp_interval = mmp_interval;
+ uint32_t last_mmp_fail_intervals = mmp_fail_intervals;
+ hrtime_t last_mmp_fail_ns = mmp_fail_ns;
+ callb_cpr_t cpr;
+ int skip_wait = 0;
+
+ mmp_thread_enter(mmp, &cpr);
+
+ /*
+ * There have been no MMP writes yet. Setting mmp_last_write here gives
+ * us one mmp_fail_ns period, which is consistent with the activity
+ * check duration, to try to land an MMP write before MMP suspends the
+ * pool (if so configured).
+ */
+
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_last_write = gethrtime();
+ mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval));
+ mutex_exit(&mmp->mmp_io_lock);
+
+ while (!mmp->mmp_thread_exiting) {
+ hrtime_t next_time = gethrtime() +
+ MSEC2NSEC(MMP_DEFAULT_INTERVAL);
+ int leaves = MAX(vdev_count_leaves(spa), 1);
+
+ /* Detect changes in tunables or state */
+
+ last_spa_suspended = suspended;
+ last_spa_multihost = multihost;
+ suspended = spa_suspended(spa);
+ multihost = spa_multihost(spa);
+
+ last_mmp_interval = mmp_interval;
+ last_mmp_fail_intervals = mmp_fail_intervals;
+ last_mmp_fail_ns = mmp_fail_ns;
+ mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
+ zfs_multihost_interval));
+ mmp_fail_intervals = MMP_FAIL_INTVS_OK(
+ zfs_multihost_fail_intervals);
+
+ /* Smooth so pool is not suspended when reducing tunables */
+ if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) {
+ mmp_fail_ns = (mmp_fail_ns * 31 +
+ mmp_fail_intervals * mmp_interval) / 32;
+ } else {
+ mmp_fail_ns = mmp_fail_intervals *
+ mmp_interval;
+ }
+
+ if (mmp_interval != last_mmp_interval ||
+ mmp_fail_intervals != last_mmp_fail_intervals) {
+ /*
+ * We want other hosts to see new tunables as quickly as
+ * possible. Write out at higher frequency than usual.
+ */
+ skip_wait += leaves;
+ }
+
+ if (multihost)
+ next_time = gethrtime() + mmp_interval / leaves;
+
+ if (mmp_fail_ns != last_mmp_fail_ns) {
+ zfs_dbgmsg("MMP interval change pool '%s' "
+ "gethrtime %llu last_mmp_interval %llu "
+ "mmp_interval %llu last_mmp_fail_intervals %u "
+ "mmp_fail_intervals %u mmp_fail_ns %llu "
+ "skip_wait %d leaves %d next_time %llu",
+ spa_name(spa), gethrtime(), last_mmp_interval,
+ mmp_interval, last_mmp_fail_intervals,
+ mmp_fail_intervals, mmp_fail_ns, skip_wait, leaves,
+ next_time);
+ }
+
+ /*
+ * MMP off => on, or suspended => !suspended:
+ * No writes occurred recently. Update mmp_last_write to give
+ * us some time to try.
+ */
+ if ((!last_spa_multihost && multihost) ||
+ (last_spa_suspended && !suspended)) {
+ zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu "
+ "last_spa_multihost %u multihost %u "
+ "last_spa_suspended %u suspended %u",
+			    spa_name(spa), gethrtime(), last_spa_multihost,
+			    multihost, last_spa_suspended, suspended);
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_last_write = gethrtime();
+ mmp->mmp_delay = mmp_interval;
+ mutex_exit(&mmp->mmp_io_lock);
+ }
+
+ /*
+ * MMP on => off:
+ * mmp_delay == 0 tells importing node to skip activity check.
+ */
+ if (last_spa_multihost && !multihost) {
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_delay = 0;
+ mutex_exit(&mmp->mmp_io_lock);
+ }
+
+ /*
+ * Suspend the pool if no MMP write has succeeded in over
+ * mmp_interval * mmp_fail_intervals nanoseconds.
+ */
+ if (multihost && !suspended && mmp_fail_intervals &&
+ (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
+ zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
+ "mmp_last_write %llu mmp_interval %llu "
+ "mmp_fail_intervals %llu mmp_fail_ns %llu",
+ spa_name(spa), (u_longlong_t)gethrtime(),
+ (u_longlong_t)mmp->mmp_last_write,
+ (u_longlong_t)mmp_interval,
+ (u_longlong_t)mmp_fail_intervals,
+ (u_longlong_t)mmp_fail_ns);
+ cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
+ "succeeded in over %llu ms; suspending pool. "
+ "Hrtime %llu",
+ spa_name(spa),
+ NSEC2MSEC(gethrtime() - mmp->mmp_last_write),
+ gethrtime());
+ zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
+ }
+
+ if (multihost && !suspended)
+ mmp_write_uberblock(spa);
+
+ if (skip_wait > 0) {
+ next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) /
+ leaves;
+ skip_wait--;
+ }
+
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait_idle_hires(&mmp->mmp_thread_cv,
+ &mmp->mmp_thread_lock, next_time, USEC2NSEC(100),
+ CALLOUT_FLAG_ABSOLUTE);
+ CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
+ }
+
+ /* Outstanding writes are allowed to complete. */
+ zio_wait(mmp->mmp_zio_root);
+
+ mmp->mmp_zio_root = NULL;
+ mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
+}
+
+/*
+ * Signal the MMP thread to wake it when it is sleeping on
+ * its cv. Used when some module parameter has changed and
+ * we want the thread to know about it.
+ * Only signal if the pool is active and the mmp thread is
+ * running, otherwise there is no thread to wake.
+ */
+static void
+mmp_signal_thread(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_enter(&mmp->mmp_thread_lock);
+ if (mmp->mmp_thread)
+ cv_broadcast(&mmp->mmp_thread_cv);
+ mutex_exit(&mmp->mmp_thread_lock);
+}
+
+void
+mmp_signal_all_threads(void)
+{
+ spa_t *spa = NULL;
+
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa))) {
+ if (spa->spa_state == POOL_STATE_ACTIVE)
+ mmp_signal_thread(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval,
+ param_set_multihost_interval, param_get_ulong, ZMOD_RW,
+ "Milliseconds between mmp writes to each leaf");
+/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, fail_intervals, UINT, ZMOD_RW,
+ "Max allowed period without a successful mmp write");
+
+ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, import_intervals, UINT, ZMOD_RW,
+ "Number of zfs_multihost_interval periods to wait for activity");
diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c
new file mode 100644
index 000000000000..36c0d33bf1f6
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/multilist.c
@@ -0,0 +1,434 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/multilist.h>
+#include <sys/trace_zfs.h>
+
+/* needed for spa_get_random() */
+#include <sys/spa.h>
+
+/*
+ * This overrides the number of sublists in each multilist_t, which defaults
+ * to the number of CPUs in the system (see multilist_create()).
+ */
+int zfs_multilist_num_sublists = 0;
+
+/*
+ * Given an object contained on the list, return a pointer to the
+ * multilist_node_t structure embedded within it.
+ */
+#ifdef ZFS_DEBUG
+static multilist_node_t *
+multilist_d2l(multilist_t *ml, void *obj)
+{
+ return ((multilist_node_t *)((char *)obj + ml->ml_offset));
+}
+#endif
+
+/*
+ * Initialize a new multilist using the parameters specified.
+ *
+ * - 'size' denotes the size of the structure containing the
+ * multilist_node_t.
+ * - 'offset' denotes the byte offset of the multilist_node_t within
+ * the structure that contains it.
+ * - 'num' specifies the number of internal sublists to create.
+ * - 'index_func' is used to determine which sublist to insert into
+ * when the multilist_insert() function is called, as well as which
+ * sublist to remove from when multilist_remove() is called. The
+ * requirements this function must meet are the following:
+ *
+ * - It must always return the same value when called on the same
+ * object (to ensure the object is removed from the list it was
+ * inserted into).
+ *
+ * - It must return a value in the range [0, number of sublists).
+ * The multilist_get_num_sublists() function may be used to
+ * determine the number of sublists in the multilist.
+ *
+ * Also, in order to reduce internal contention between the sublists
+ * during insertion and removal, this function should choose evenly
+ * between all available sublists when inserting. This isn't a hard
+ * requirement, but a general rule of thumb in order to garner the
+ * best multi-threaded performance out of the data structure.
+ */
+static multilist_t *
+multilist_create_impl(size_t size, size_t offset,
+ unsigned int num, multilist_sublist_index_func_t *index_func)
+{
+ ASSERT3U(size, >, 0);
+ ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
+ ASSERT3U(num, >, 0);
+ ASSERT3P(index_func, !=, NULL);
+
+ multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP);
+ ml->ml_offset = offset;
+ ml->ml_num_sublists = num;
+ ml->ml_index_func = index_func;
+
+ ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+ ml->ml_num_sublists, KM_SLEEP);
+
+ ASSERT3P(ml->ml_sublists, !=, NULL);
+
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+ mutex_init(&mls->mls_lock, NULL, MUTEX_NOLOCKDEP, NULL);
+ list_create(&mls->mls_list, size, offset);
+ }
+ return (ml);
+}
+
+/*
+ * Allocate a new multilist, using the default number of sublists (the number
+ * of CPUs, or at least 4, or the tunable zfs_multilist_num_sublists). Note
+ * that the multilists do not expand if more CPUs are hot-added. In that case,
+ * we will have less fanout than boot_ncpus, but we don't want to always
+ * reserve the RAM necessary to create the extra slots for additional CPUs up
+ * front, and dynamically adding them is a complex task.
+ */
+multilist_t *
+multilist_create(size_t size, size_t offset,
+ multilist_sublist_index_func_t *index_func)
+{
+ int num_sublists;
+
+ if (zfs_multilist_num_sublists > 0) {
+ num_sublists = zfs_multilist_num_sublists;
+ } else {
+ num_sublists = MAX(boot_ncpus, 4);
+ }
+
+ return (multilist_create_impl(size, offset, num_sublists, index_func));
+}
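+
+/*
+ * Illustrative sketch (hypothetical usage): a minimal index function that
+ * satisfies the requirements described above multilist_create_impl(), for a
+ * made-up my_obj_t that embeds a multilist_node_t.  It derives a stable
+ * sublist index from the object's address, so the same object always maps
+ * to the same sublist and the result is always within range:
+ *
+ *	typedef struct my_obj {
+ *		uint64_t		mo_id;
+ *		multilist_node_t	mo_node;
+ *	} my_obj_t;
+ *
+ *	static unsigned int
+ *	my_obj_index_func(multilist_t *ml, void *obj)
+ *	{
+ *		return (((uintptr_t)obj >> 7) %
+ *		    multilist_get_num_sublists(ml));
+ *	}
+ *
+ *	multilist_t *ml = multilist_create(sizeof (my_obj_t),
+ *	    offsetof(my_obj_t, mo_node), my_obj_index_func);
+ */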
+
+/*
+ * Destroy the given multilist object, and free up any memory it holds.
+ */
+void
+multilist_destroy(multilist_t *ml)
+{
+ ASSERT(multilist_is_empty(ml));
+
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+
+ ASSERT(list_is_empty(&mls->mls_list));
+
+ list_destroy(&mls->mls_list);
+ mutex_destroy(&mls->mls_lock);
+ }
+
+ ASSERT3P(ml->ml_sublists, !=, NULL);
+ kmem_free(ml->ml_sublists,
+ sizeof (multilist_sublist_t) * ml->ml_num_sublists);
+
+ ml->ml_num_sublists = 0;
+ ml->ml_offset = 0;
+ kmem_free(ml, sizeof (multilist_t));
+}
+
+/*
+ * Insert the given object into the multilist.
+ *
+ * This function will insert the object specified into the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The sublist locks are automatically acquired if not already held, to
+ * ensure consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_insert(multilist_t *ml, void *obj)
+{
+ unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+ multilist_sublist_t *mls;
+ boolean_t need_lock;
+
+ DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
+ unsigned int, sublist_idx, void *, obj);
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+ mls = &ml->ml_sublists[sublist_idx];
+
+ /*
+ * Note: Callers may already hold the sublist lock by calling
+ * multilist_sublist_lock(). Here we rely on MUTEX_HELD()
+ * returning TRUE if and only if the current thread holds the
+ * lock. While it's a little ugly to make the lock recursive in
+ * this way, it works and allows the calling code to be much
+ * simpler -- otherwise it would have to pass around a flag
+ * indicating that it already has the lock.
+ */
+ need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
+
+ multilist_sublist_insert_head(mls, obj);
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+}
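+
+/*
+ * Illustrative sketch (hypothetical usage): because multilist_insert()
+ * detects an already-held sublist lock via MUTEX_HELD(), a caller that has
+ * explicitly locked the object's sublist may still use the generic insert
+ * path without deadlocking:
+ *
+ *	multilist_sublist_t *mls = multilist_sublist_lock_obj(ml, obj);
+ *	multilist_insert(ml, obj);	lock is detected as held, not retaken
+ *	multilist_sublist_unlock(mls);
+ */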
+
+/*
+ * Remove the given object from the multilist.
+ *
+ * This function will remove the object specified from the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The necessary sublist locks are automatically acquired, to ensure
+ * consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_remove(multilist_t *ml, void *obj)
+{
+ unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+ multilist_sublist_t *mls;
+ boolean_t need_lock;
+
+ DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
+ unsigned int, sublist_idx, void *, obj);
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+ mls = &ml->ml_sublists[sublist_idx];
+ /* See comment in multilist_insert(). */
+ need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
+
+ multilist_sublist_remove(mls, obj);
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Check to see if this multilist object is empty.
+ *
+ * This will return TRUE if it finds all of the sublists of this
+ * multilist to be empty, and FALSE otherwise. Each sublist lock will be
+ * automatically acquired as necessary.
+ *
+ * If concurrent insertions and removals are occurring, the semantics
+ * of this function become a little fuzzy. Instead of locking all
+ * sublists for the entire call time of the function, each sublist is
+ * only locked as it is individually checked for emptiness. Thus, it's
+ * possible for this function to return TRUE with non-empty sublists at
+ * the time the function returns. This would be due to another thread
+ * inserting into a given sublist, after that specific sublist was checked
+ * and deemed empty, but before all sublists have been checked.
+ */
+int
+multilist_is_empty(multilist_t *ml)
+{
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+ /* See comment in multilist_insert(). */
+ boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ if (!list_is_empty(&mls->mls_list)) {
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+
+ return (FALSE);
+ }
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+ }
+
+ return (TRUE);
+}
+
+/* Return the number of sublists composing this multilist */
+unsigned int
+multilist_get_num_sublists(multilist_t *ml)
+{
+ return (ml->ml_num_sublists);
+}
+
+/* Return a randomly selected, valid sublist index for this multilist */
+unsigned int
+multilist_get_random_index(multilist_t *ml)
+{
+ return (spa_get_random(ml->ml_num_sublists));
+}
+
+/* Lock and return the sublist specified at the given index */
+multilist_sublist_t *
+multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+{
+ multilist_sublist_t *mls;
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+ mls = &ml->ml_sublists[sublist_idx];
+ mutex_enter(&mls->mls_lock);
+
+ return (mls);
+}
+
+/* Lock and return the sublist that would be used to store the specified obj */
+multilist_sublist_t *
+multilist_sublist_lock_obj(multilist_t *ml, void *obj)
+{
+ return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
+}
+
+void
+multilist_sublist_unlock(multilist_sublist_t *mls)
+{
+ mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * We're allowing any object to be inserted into this specific sublist,
+ * but this can lead to trouble if multilist_remove() is called to
+ * remove this object. Specifically, if calling ml_index_func on this
+ * object returns an index for a sublist different from the one passed as
+ * a parameter here, any call to multilist_remove() with this newly
+ * inserted object is undefined! (the call to multilist_remove() will
+ * remove the object from a list that it isn't contained in)
+ */
+void
+multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_head(&mls->mls_list, obj);
+}
+
+/* Please see the comment above multilist_sublist_insert_head(). */
+void
+multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_tail(&mls->mls_list, obj);
+}
+
+/*
+ * Move the object one element forward in the list.
+ *
+ * This function will move the given object forward in the list (towards
+ * the head) by one object. So, in essence, it will swap its position in
+ * the list with its "prev" pointer. If the given object is already at the
+ * head of the list, it cannot be moved forward any more than it already
+ * is, so no action is taken.
+ *
+ * NOTE: This function **must not** remove any object from the list other
+ * than the object given as the parameter. This is relied upon in
+ * arc_evict_state_impl().
+ */
+void
+multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
+{
+ void *prev = list_prev(&mls->mls_list, obj);
+
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ ASSERT(!list_is_empty(&mls->mls_list));
+
+ /* 'obj' must be at the head of the list, nothing to do */
+ if (prev == NULL)
+ return;
+
+ list_remove(&mls->mls_list, obj);
+ list_insert_before(&mls->mls_list, prev, obj);
+}
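+
+/*
+ * Illustrative example: with a sublist ordered (head) A, B, C (tail),
+ * multilist_sublist_move_forward(mls, C) swaps C with its "prev" element,
+ * yielding A, C, B.  Calling it on A is a no-op, since A is already at
+ * the head.
+ */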
+
+void
+multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_remove(&mls->mls_list, obj);
+}
+
+int
+multilist_sublist_is_empty(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_is_empty(&mls->mls_list));
+}
+
+int
+multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx)
+{
+ multilist_sublist_t *mls;
+ int empty;
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+ mls = &ml->ml_sublists[sublist_idx];
+ ASSERT(!MUTEX_HELD(&mls->mls_lock));
+ mutex_enter(&mls->mls_lock);
+ empty = list_is_empty(&mls->mls_list);
+ mutex_exit(&mls->mls_lock);
+ return (empty);
+}
+
+void *
+multilist_sublist_head(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_head(&mls->mls_list));
+}
+
+void *
+multilist_sublist_tail(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_tail(&mls->mls_list));
+}
+
+void *
+multilist_sublist_next(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_next(&mls->mls_list, obj));
+}
+
+void *
+multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_prev(&mls->mls_list, obj));
+}
+
+void
+multilist_link_init(multilist_node_t *link)
+{
+ list_link_init(link);
+}
+
+int
+multilist_link_active(multilist_node_t *link)
+{
+ return (list_link_active(link));
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, INT, ZMOD_RW,
+ "Number of sublists used in each multilist");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/objlist.c b/sys/contrib/openzfs/module/zfs/objlist.c
new file mode 100644
index 000000000000..c80bab2a77bd
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/objlist.c
@@ -0,0 +1,84 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/objlist.h>
+#include <sys/zfs_context.h>
+
+objlist_t *
+objlist_create(void)
+{
+ objlist_t *list = kmem_alloc(sizeof (*list), KM_SLEEP);
+ list_create(&list->ol_list, sizeof (objlist_node_t),
+ offsetof(objlist_node_t, on_node));
+ list->ol_last_lookup = 0;
+ return (list);
+}
+
+void
+objlist_destroy(objlist_t *list)
+{
+ for (objlist_node_t *n = list_remove_head(&list->ol_list);
+ n != NULL; n = list_remove_head(&list->ol_list)) {
+ kmem_free(n, sizeof (*n));
+ }
+ list_destroy(&list->ol_list);
+ kmem_free(list, sizeof (*list));
+}
+
+/*
+ * This function looks through the objlist to see if the specified object number
+ * is contained in the objlist. In the process, it will remove all object
+ * numbers in the list that are smaller than the specified object number. Thus,
+ * any lookup of an object number smaller than a previously looked up object
+ * number will always return false; therefore, all lookups should be done in
+ * ascending order.
+ */
+boolean_t
+objlist_exists(objlist_t *list, uint64_t object)
+{
+ objlist_node_t *node = list_head(&list->ol_list);
+ ASSERT3U(object, >=, list->ol_last_lookup);
+ list->ol_last_lookup = object;
+ while (node != NULL && node->on_object < object) {
+ VERIFY3P(node, ==, list_remove_head(&list->ol_list));
+ kmem_free(node, sizeof (*node));
+ node = list_head(&list->ol_list);
+ }
+ return (node != NULL && node->on_object == object);
+}
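+
+/*
+ * Illustrative sketch (hypothetical usage): because lookups prune every
+ * object number below the one being queried, both inserts and lookups must
+ * be issued in ascending order:
+ *
+ *	objlist_t *ol = objlist_create();
+ *	objlist_insert(ol, 5);
+ *	objlist_insert(ol, 9);
+ *	(void) objlist_exists(ol, 5);	B_TRUE
+ *	(void) objlist_exists(ol, 7);	B_FALSE; 5 is pruned from the list
+ *	(void) objlist_exists(ol, 9);	B_TRUE
+ *	objlist_destroy(ol);
+ */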
+
+/*
+ * The objlist is a list of object numbers stored in ascending order. However,
+ * the insertion of new object numbers does not seek out the correct location to
+ * store a new object number; instead, it appends it to the list for simplicity.
+ * Thus, callers must take care to insert new object numbers only in ascending
+ * order.
+ */
+void
+objlist_insert(objlist_t *list, uint64_t object)
+{
+ objlist_node_t *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+ node->on_object = object;
+#ifdef ZFS_DEBUG
+ objlist_node_t *last_object = list_tail(&list->ol_list);
+ uint64_t last_objnum = (last_object != NULL ? last_object->on_object :
+ 0);
+ ASSERT3U(node->on_object, >, last_objnum);
+#endif
+ list_insert_tail(&list->ol_list, node);
+}
diff --git a/sys/contrib/openzfs/module/zfs/pathname.c b/sys/contrib/openzfs/module/zfs/pathname.c
new file mode 100644
index 000000000000..84ab7b7e1111
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/pathname.c
@@ -0,0 +1,88 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+
+#include <sys/types.h>
+#include <sys/pathname.h>
+#include <sys/kmem.h>
+#include <sys/sysmacros.h>
+
+/*
+ * Pathname utilities.
+ *
+ * In translating file names we copy each argument file
+ * name into a pathname structure where we operate on it.
+ * Each pathname structure can hold "pn_bufsize" characters
+ * including a terminating null, and operations here support
+ * allocating and freeing pathname structures, fetching
+ * strings from user space, getting the next character from
+ * a pathname, combining two pathnames (used in symbolic
+ * link processing), and peeling off the first component
+ * of a pathname.
+ */
+
+/*
+ * Allocate contents of pathname structure. Structure is typically
+ * an automatic variable in calling routine for convenience.
+ *
+ * May sleep in the call to kmem_alloc() and so must not be called
+ * from interrupt level.
+ */
+void
+pn_alloc(struct pathname *pnp)
+{
+ pn_alloc_sz(pnp, MAXPATHLEN);
+}
+
+void
+pn_alloc_sz(struct pathname *pnp, size_t sz)
+{
+ pnp->pn_buf = kmem_alloc(sz, KM_SLEEP);
+ pnp->pn_bufsize = sz;
+}
+
+/*
+ * Free pathname resources.
+ */
+void
+pn_free(struct pathname *pnp)
+{
+ /* pn_bufsize is usually MAXPATHLEN, but may not be */
+ kmem_free(pnp->pn_buf, pnp->pn_bufsize);
+ pnp->pn_buf = NULL;
+ pnp->pn_bufsize = 0;
+}
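+
+/*
+ * Illustrative sketch (hypothetical usage): typical on-stack use of the
+ * helpers above.  The path string and the strlcpy() call are made up for
+ * the example:
+ *
+ *	struct pathname pn;
+ *
+ *	pn_alloc(&pn);				MAXPATHLEN-byte buffer
+ *	(void) strlcpy(pn.pn_buf, "/some/path", pn.pn_bufsize);
+ *	...
+ *	pn_free(&pn);				frees pn_bufsize bytes
+ */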
diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c
new file mode 100644
index 000000000000..5219fd079b73
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/range_tree.c
@@ -0,0 +1,922 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zio.h>
+#include <sys/range_tree.h>
+
+/*
+ * Range trees are tree-based data structures that can be used to
+ * track free space or generally any space allocation information.
+ * A range tree keeps track of individual segments and automatically
+ * provides facilities such as adjacent extent merging and extent
+ * splitting in response to range add/remove requests.
+ *
+ * A range tree starts out completely empty, with no segments in it.
+ * Adding an allocation via range_tree_add to the range tree can either:
+ * 1) create a new extent
+ * 2) extend an adjacent extent
+ * 3) merge two adjacent extents
+ * Conversely, removing an allocation via range_tree_remove can:
+ * 1) completely remove an extent
+ * 2) shorten an extent (if the allocation was near one of its ends)
+ * 3) split an extent into two extents, in effect punching a hole
+ *
+ * A range tree is also capable of 'bridging' gaps when adding
+ * allocations. This is useful for cases when close proximity of
+ * allocations is an important detail that needs to be represented
+ * in the range tree. See range_tree_set_gap(). The default behavior
+ * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
+ *
+ * In order to traverse a range tree, use either the range_tree_walk()
+ * or range_tree_vacate() functions.
+ *
+ * To obtain more accurate information on individual segment
+ * operations that the range tree performs "under the hood", you can
+ * specify a set of callbacks by passing a range_tree_ops_t structure
+ * to the range_tree_create function. Any callbacks that are non-NULL
+ * are then called at the appropriate times.
+ *
+ * The range tree code also supports a special variant of range trees
+ * that can bridge small gaps between segments. This kind of tree is used
+ * by the dsl scanning code to group I/Os into mostly sequential chunks to
+ * optimize disk performance. The code here attempts to do this with as
+ * little memory and computational overhead as possible. One limitation of
+ * this implementation is that segments of range trees with gaps can only
+ * support removing complete segments.
+ */
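+
+/*
+ * Illustrative sketch (hypothetical usage): how adds and removes merge and
+ * split segments in an ordinary (gap == 0) tree.  The offsets are made up
+ * for the example:
+ *
+ *	range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ *
+ *	range_tree_add(rt, 0x1000, 0x1000);	creates [0x1000, 0x2000)
+ *	range_tree_add(rt, 0x2000, 0x1000);	merges into [0x1000, 0x3000)
+ *	range_tree_remove(rt, 0x1800, 0x400);	splits into [0x1000, 0x1800)
+ *						and [0x1c00, 0x3000)
+ *
+ *	range_tree_vacate(rt, NULL, NULL);	empty it before destroying
+ *	range_tree_destroy(rt);
+ */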
+
+static inline void
+rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ size_t size = 0;
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ break;
+ case RANGE_SEG_GAP:
+ size = sizeof (range_seg_gap_t);
+ break;
+ default:
+ VERIFY(0);
+ }
+ bcopy(src, dest, size);
+}
+
+void
+range_tree_stat_verify(range_tree_t *rt)
+{
+ range_seg_t *rs;
+ zfs_btree_index_t where;
+ uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
+ int i;
+
+ for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
+ rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
+ int idx = highbit64(size) - 1;
+
+ hist[idx]++;
+ ASSERT3U(hist[idx], !=, 0);
+ }
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ if (hist[i] != rt->rt_histogram[i]) {
+ zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu",
+ i, hist, hist[i], rt->rt_histogram[i]);
+ }
+ VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
+ }
+}
+
+static void
+range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
+{
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
+ int idx = highbit64(size) - 1;
+
+ ASSERT(size != 0);
+ ASSERT3U(idx, <,
+ sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
+
+ rt->rt_histogram[idx]++;
+ ASSERT3U(rt->rt_histogram[idx], !=, 0);
+}
+
+static void
+range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
+{
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
+ int idx = highbit64(size) - 1;
+
+ ASSERT(size != 0);
+ ASSERT3U(idx, <,
+ sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
+
+ ASSERT3U(rt->rt_histogram[idx], !=, 0);
+ rt->rt_histogram[idx]--;
+}
+
+static int
+range_tree_seg32_compare(const void *x1, const void *x2)
+{
+ const range_seg32_t *r1 = x1;
+ const range_seg32_t *r2 = x2;
+
+ ASSERT3U(r1->rs_start, <=, r1->rs_end);
+ ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
+static int
+range_tree_seg64_compare(const void *x1, const void *x2)
+{
+ const range_seg64_t *r1 = x1;
+ const range_seg64_t *r2 = x2;
+
+ ASSERT3U(r1->rs_start, <=, r1->rs_end);
+ ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
+static int
+range_tree_seg_gap_compare(const void *x1, const void *x2)
+{
+ const range_seg_gap_t *r1 = x1;
+ const range_seg_gap_t *r2 = x2;
+
+ ASSERT3U(r1->rs_start, <=, r1->rs_end);
+ ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
+range_tree_t *
+range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg,
+ uint64_t start, uint64_t shift,
+ int (*zfs_btree_compare) (const void *, const void *),
+ uint64_t gap)
+{
+ range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
+
+ ASSERT3U(shift, <, 64);
+ ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES);
+ size_t size;
+ int (*compare) (const void *, const void *);
+ switch (type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ compare = range_tree_seg32_compare;
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ compare = range_tree_seg64_compare;
+ break;
+ case RANGE_SEG_GAP:
+ size = sizeof (range_seg_gap_t);
+ compare = range_tree_seg_gap_compare;
+ break;
+ default:
+ panic("Invalid range seg type %d", type);
+ }
+ zfs_btree_create(&rt->rt_root, compare, size);
+
+ rt->rt_ops = ops;
+ rt->rt_gap = gap;
+ rt->rt_arg = arg;
+ rt->rt_type = type;
+ rt->rt_start = start;
+ rt->rt_shift = shift;
+ rt->rt_btree_compare = zfs_btree_compare;
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
+ rt->rt_ops->rtop_create(rt, rt->rt_arg);
+
+ return (rt);
+}
+
+range_tree_t *
+range_tree_create(range_tree_ops_t *ops, range_seg_type_t type,
+ void *arg, uint64_t start, uint64_t shift)
+{
+ return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0));
+}
+
+void
+range_tree_destroy(range_tree_t *rt)
+{
+ VERIFY0(rt->rt_space);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
+ rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
+
+ zfs_btree_destroy(&rt->rt_root);
+ kmem_free(rt, sizeof (*rt));
+}
+
+void
+range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
+{
+ if (delta < 0 && delta * -1 >= rs_get_fill(rs, rt)) {
+ zfs_panic_recover("zfs: attempting to decrease fill to or "
+ "below 0; probable double remove in segment [%llx:%llx]",
+ (longlong_t)rs_get_start(rs, rt),
+ (longlong_t)rs_get_end(rs, rt));
+ }
+ if (rs_get_fill(rs, rt) + delta > rs_get_end(rs, rt) -
+ rs_get_start(rs, rt)) {
+ zfs_panic_recover("zfs: attempting to increase fill beyond "
+ "max; probable double add in segment [%llx:%llx]",
+ (longlong_t)rs_get_start(rs, rt),
+ (longlong_t)rs_get_end(rs, rt));
+ }
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+ rs_set_fill(rs, rt, rs_get_fill(rs, rt) + delta);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+}
+
+static void
+range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
+{
+ range_tree_t *rt = arg;
+ zfs_btree_index_t where;
+ range_seg_t *rs_before, *rs_after, *rs;
+ range_seg_max_t tmp, rsearch;
+ uint64_t end = start + size, gap = rt->rt_gap;
+ uint64_t bridge_size = 0;
+ boolean_t merge_before, merge_after;
+
+ ASSERT3U(size, !=, 0);
+ ASSERT3U(fill, <=, size);
+ ASSERT3U(start + size, >, start);
+
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, end);
+ rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
+
+ /*
+ * If this is a gap-supporting range tree, it is possible that we
+ * are inserting into an existing segment. In this case simply
+ * bump the fill count and call the remove / add callbacks. If the
+ * new range will extend an existing segment, we remove the
+ * existing one, apply the new extent to it and re-insert it using
+ * the normal code paths.
+ */
+ if (rs != NULL) {
+ if (gap == 0) {
+ zfs_panic_recover("zfs: adding existent segment to "
+ "range tree (offset=%llx size=%llx)",
+ (longlong_t)start, (longlong_t)size);
+ return;
+ }
+ uint64_t rstart = rs_get_start(rs, rt);
+ uint64_t rend = rs_get_end(rs, rt);
+ if (rstart <= start && rend >= end) {
+ range_tree_adjust_fill(rt, rs, fill);
+ return;
+ }
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+ range_tree_stat_decr(rt, rs);
+ rt->rt_space -= rend - rstart;
+
+ fill += rs_get_fill(rs, rt);
+ start = MIN(start, rstart);
+ end = MAX(end, rend);
+ size = end - start;
+
+ zfs_btree_remove(&rt->rt_root, rs);
+ range_tree_add_impl(rt, start, size, fill);
+ return;
+ }
+
+ ASSERT3P(rs, ==, NULL);
+
+ /*
+ * Determine whether or not we will have to merge with our neighbors.
+ * If gap != 0, we might need to merge with our neighbors even if we
+ * aren't directly touching.
+ */
+ zfs_btree_index_t where_before, where_after;
+ rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
+ rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);
+
+ merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >=
+ start - gap);
+ merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end +
+ gap);
+
+ if (merge_before && gap != 0)
+ bridge_size += start - rs_get_end(rs_before, rt);
+ if (merge_after && gap != 0)
+ bridge_size += rs_get_start(rs_after, rt) - end;
+
+ if (merge_before && merge_after) {
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
+ rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
+ rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
+ }
+
+ range_tree_stat_decr(rt, rs_before);
+ range_tree_stat_decr(rt, rs_after);
+
+ rs_copy(rs_after, &tmp, rt);
+ uint64_t before_start = rs_get_start_raw(rs_before, rt);
+ uint64_t before_fill = rs_get_fill(rs_before, rt);
+ uint64_t after_fill = rs_get_fill(rs_after, rt);
+ zfs_btree_remove_idx(&rt->rt_root, &where_before);
+
+ /*
+ * We have to re-find the node because our old reference is
+ * invalid as soon as we do any mutating btree operations.
+ */
+ rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
+ rs_set_start_raw(rs_after, rt, before_start);
+ rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
+ rs = rs_after;
+ } else if (merge_before) {
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
+
+ range_tree_stat_decr(rt, rs_before);
+
+ uint64_t before_fill = rs_get_fill(rs_before, rt);
+ rs_set_end(rs_before, rt, end);
+ rs_set_fill(rs_before, rt, before_fill + fill);
+ rs = rs_before;
+ } else if (merge_after) {
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
+
+ range_tree_stat_decr(rt, rs_after);
+
+ uint64_t after_fill = rs_get_fill(rs_after, rt);
+ rs_set_start(rs_after, rt, start);
+ rs_set_fill(rs_after, rt, after_fill + fill);
+ rs = rs_after;
+ } else {
+ rs = &tmp;
+
+ rs_set_start(rs, rt, start);
+ rs_set_end(rs, rt, end);
+ rs_set_fill(rs, rt, fill);
+ zfs_btree_add_idx(&rt->rt_root, rs, &where);
+ }
+
+ if (gap != 0) {
+ ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
+ } else {
+ ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
+ }
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+
+ range_tree_stat_incr(rt, rs);
+ rt->rt_space += size + bridge_size;
+}
+
+void
+range_tree_add(void *arg, uint64_t start, uint64_t size)
+{
+ range_tree_add_impl(arg, start, size, size);
+}
+
+static void
+range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
+ boolean_t do_fill)
+{
+ zfs_btree_index_t where;
+ range_seg_t *rs;
+ range_seg_max_t rsearch, rs_tmp;
+ uint64_t end = start + size;
+ boolean_t left_over, right_over;
+
+ VERIFY3U(size, !=, 0);
+ VERIFY3U(size, <=, rt->rt_space);
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
+
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, end);
+ rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
+
+ /* Make sure we completely overlap with someone */
+ if (rs == NULL) {
+ zfs_panic_recover("zfs: removing nonexistent segment from "
+ "range tree (offset=%llx size=%llx)",
+ (longlong_t)start, (longlong_t)size);
+ return;
+ }
+
+ /*
+ * Range trees with gap support must only remove complete segments
+ * from the tree. This allows us to maintain accurate fill accounting
+ * and to ensure that bridged sections are not leaked. If we need to
+ * remove less than the full segment, we can only adjust the fill count.
+ */
+ if (rt->rt_gap != 0) {
+ if (do_fill) {
+ if (rs_get_fill(rs, rt) == size) {
+ start = rs_get_start(rs, rt);
+ end = rs_get_end(rs, rt);
+ size = end - start;
+ } else {
+ range_tree_adjust_fill(rt, rs, -size);
+ return;
+ }
+ } else if (rs_get_start(rs, rt) != start ||
+ rs_get_end(rs, rt) != end) {
+ zfs_panic_recover("zfs: freeing partial segment of "
+ "gap tree (offset=%llx size=%llx) of "
+ "(offset=%llx size=%llx)",
+ (longlong_t)start, (longlong_t)size,
+ (longlong_t)rs_get_start(rs, rt),
+ (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs,
+ rt));
+ return;
+ }
+ }
+
+ VERIFY3U(rs_get_start(rs, rt), <=, start);
+ VERIFY3U(rs_get_end(rs, rt), >=, end);
+
+ left_over = (rs_get_start(rs, rt) != start);
+ right_over = (rs_get_end(rs, rt) != end);
+
+ range_tree_stat_decr(rt, rs);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+ if (left_over && right_over) {
+ range_seg_max_t newseg;
+ rs_set_start(&newseg, rt, end);
+ rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt));
+ rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end);
+ range_tree_stat_incr(rt, &newseg);
+
+ /* This modifies the buffer already inside the range tree */
+ rs_set_end(rs, rt, start);
+
+ rs_copy(rs, &rs_tmp, rt);
+ if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
+ zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
+ else
+ zfs_btree_add(&rt->rt_root, &newseg);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
+ } else if (left_over) {
+ /* This modifies the buffer already inside the range tree */
+ rs_set_end(rs, rt, start);
+ rs_copy(rs, &rs_tmp, rt);
+ } else if (right_over) {
+ /* This modifies the buffer already inside the range tree */
+ rs_set_start(rs, rt, end);
+ rs_copy(rs, &rs_tmp, rt);
+ } else {
+ zfs_btree_remove_idx(&rt->rt_root, &where);
+ rs = NULL;
+ }
+
+ if (rs != NULL) {
+ /*
+ * The fill of the leftover segment will always be equal to
+ * the size, since we do not support removing partial segments
+ * of range trees with gaps.
+ */
+ rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) -
+ rs_get_start_raw(rs, rt));
+ range_tree_stat_incr(rt, &rs_tmp);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
+ }
+
+ rt->rt_space -= size;
+}
+
+void
+range_tree_remove(void *arg, uint64_t start, uint64_t size)
+{
+ range_tree_remove_impl(arg, start, size, B_FALSE);
+}
+
+void
+range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_tree_remove_impl(rt, start, size, B_TRUE);
+}
+
+void
+range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+ uint64_t newstart, uint64_t newsize)
+{
+ int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt));
+
+ range_tree_stat_decr(rt, rs);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+ rs_set_start(rs, rt, newstart);
+ rs_set_end(rs, rt, newstart + newsize);
+
+ range_tree_stat_incr(rt, rs);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+
+ rt->rt_space += delta;
+}
+
+static range_seg_t *
+range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_seg_max_t rsearch;
+ uint64_t end = start + size;
+
+ VERIFY(size != 0);
+
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, end);
+ return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
+}
+
+range_seg_t *
+range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
+
+ range_seg_t *rs = range_tree_find_impl(rt, start, size);
+ if (rs != NULL && rs_get_start(rs, rt) <= start &&
+ rs_get_end(rs, rt) >= start + size) {
+ return (rs);
+ }
+ return (NULL);
+}
+
+void
+range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
+{
+ range_seg_t *rs = range_tree_find(rt, off, size);
+ if (rs != NULL)
+ panic("segment already in tree; rs=%p", (void *)rs);
+}
+
+boolean_t
+range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ return (range_tree_find(rt, start, size) != NULL);
+}
+
+/*
+ * Returns the first subset of the given range which overlaps with the range
+ * tree. Returns true if there is a segment in the range, and false if there
+ * isn't.
+ */
+boolean_t
+range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
+ uint64_t *ostart, uint64_t *osize)
+{
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
+
+ range_seg_max_t rsearch;
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1);
+
+ zfs_btree_index_t where;
+ range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
+ if (rs != NULL) {
+ *ostart = start;
+ *osize = MIN(size, rs_get_end(rs, rt) - start);
+ return (B_TRUE);
+ }
+
+ rs = zfs_btree_next(&rt->rt_root, &where, &where);
+ if (rs == NULL || rs_get_start(rs, rt) > start + size)
+ return (B_FALSE);
+
+ *ostart = rs_get_start(rs, rt);
+ *osize = MIN(start + size, rs_get_end(rs, rt)) -
+ rs_get_start(rs, rt);
+ return (B_TRUE);
+}
+
+/*
+ * Ensure that this range is not in the tree: remove any portions of it
+ * that are currently present. It is safe to call this whether or not the
+ * range, or any part of it, is in the tree.
+ */
+void
+range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_seg_t *rs;
+
+ if (size == 0)
+ return;
+
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
+
+ while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
+ uint64_t free_start = MAX(rs_get_start(rs, rt), start);
+ uint64_t free_end = MIN(rs_get_end(rs, rt), start + size);
+ range_tree_remove(rt, free_start, free_end - free_start);
+ }
+}
+
+void
+range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
+{
+ range_tree_t *rt;
+
+ ASSERT0(range_tree_space(*rtdst));
+ ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));
+
+ rt = *rtsrc;
+ *rtsrc = *rtdst;
+ *rtdst = rt;
+}
+
+void
+range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
+{
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
+ rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
+
+ if (func != NULL) {
+ range_seg_t *rs;
+ zfs_btree_index_t *cookie = NULL;
+
+ while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
+ NULL) {
+ func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
+ }
+ } else {
+ zfs_btree_clear(&rt->rt_root);
+ }
+
+ bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
+ rt->rt_space = 0;
+}
+
+void
+range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
+{
+ zfs_btree_index_t where;
+ for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
+ rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+ func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
+ }
+}
+
+range_seg_t *
+range_tree_first(range_tree_t *rt)
+{
+ return (zfs_btree_first(&rt->rt_root, NULL));
+}
+
+uint64_t
+range_tree_space(range_tree_t *rt)
+{
+ return (rt->rt_space);
+}
+
+uint64_t
+range_tree_numsegs(range_tree_t *rt)
+{
+ return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
+}
+
+boolean_t
+range_tree_is_empty(range_tree_t *rt)
+{
+ ASSERT(rt != NULL);
+ return (range_tree_space(rt) == 0);
+}
+
+/* ARGSUSED */
+void
+rt_btree_create(range_tree_t *rt, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+
+ size_t size;
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ break;
+ case RANGE_SEG_GAP:
+ size = sizeof (range_seg_gap_t);
+ break;
+ default:
+ panic("Invalid range seg type %d", rt->rt_type);
+ }
+ zfs_btree_create(size_tree, rt->rt_btree_compare, size);
+}
+
+/* ARGSUSED */
+void
+rt_btree_destroy(range_tree_t *rt, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+ ASSERT0(zfs_btree_numnodes(size_tree));
+
+ zfs_btree_destroy(size_tree);
+}
+
+/* ARGSUSED */
+void
+rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+
+ zfs_btree_add(size_tree, rs);
+}
+
+/* ARGSUSED */
+void
+rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+
+ zfs_btree_remove(size_tree, rs);
+}
+
+/* ARGSUSED */
+void
+rt_btree_vacate(range_tree_t *rt, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+ zfs_btree_clear(size_tree);
+ zfs_btree_destroy(size_tree);
+
+ rt_btree_create(rt, arg);
+}
+
+range_tree_ops_t rt_btree_ops = {
+ .rtop_create = rt_btree_create,
+ .rtop_destroy = rt_btree_destroy,
+ .rtop_add = rt_btree_add,
+ .rtop_remove = rt_btree_remove,
+ .rtop_vacate = rt_btree_vacate
+};
+
+/*
+ * Remove from removefrom any ranges that overlap the given segment
+ * [start, end). Add the non-overlapping leftovers of that segment to addto.
+ */
+void
+range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
+ range_tree_t *removefrom, range_tree_t *addto)
+{
+ zfs_btree_index_t where;
+ range_seg_max_t starting_rs;
+ rs_set_start(&starting_rs, removefrom, start);
+ rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs,
+ removefrom) + 1);
+
+ range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
+ &starting_rs, &where);
+
+ if (curr == NULL)
+ curr = zfs_btree_next(&removefrom->rt_root, &where, &where);
+
+ range_seg_t *next;
+ for (; curr != NULL; curr = next) {
+ if (start == end)
+ return;
+ VERIFY3U(start, <, end);
+
+ /* there is no overlap */
+ if (end <= rs_get_start(curr, removefrom)) {
+ range_tree_add(addto, start, end - start);
+ return;
+ }
+
+ uint64_t overlap_start = MAX(rs_get_start(curr, removefrom),
+ start);
+ uint64_t overlap_end = MIN(rs_get_end(curr, removefrom),
+ end);
+ uint64_t overlap_size = overlap_end - overlap_start;
+ ASSERT3S(overlap_size, >, 0);
+ range_seg_max_t rs;
+ rs_copy(curr, &rs, removefrom);
+
+ range_tree_remove(removefrom, overlap_start, overlap_size);
+
+ if (start < overlap_start)
+ range_tree_add(addto, start, overlap_start - start);
+
+ start = overlap_end;
+ next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
+ /*
+ * If we find something here, we only removed part of the
+ * curr segment. Either there's some left at the end
+ * because we've reached the end of the range we're removing,
+ * or there's some left at the start because we started
+ * partway through the range. Either way, we continue with
+ * the loop. If it's the former, we'll return at the start of
+ * the loop, and if it's the latter we'll see if there is more
+ * area to process.
+ */
+ if (next != NULL) {
+ ASSERT(start == end || start == rs_get_end(&rs,
+ removefrom));
+ }
+
+ next = zfs_btree_next(&removefrom->rt_root, &where, &where);
+ }
+ VERIFY3P(curr, ==, NULL);
+
+ if (start != end) {
+ VERIFY3U(start, <, end);
+ range_tree_add(addto, start, end - start);
+ } else {
+ VERIFY3U(start, ==, end);
+ }
+}
+
+/*
+ * For each entry in rt, if it exists in removefrom, remove it
+ * from removefrom. Otherwise, add it to addto.
+ */
+void
+range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom,
+ range_tree_t *addto)
+{
+ zfs_btree_index_t where;
+ for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
+ rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+ range_tree_remove_xor_add_segment(rs_get_start(rs, rt),
+ rs_get_end(rs, rt), removefrom, addto);
+ }
+}
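+
+/*
+ * Illustrative example: with rt = {[0, 10)}, removefrom = {[2, 4)} and an
+ * initially empty addto, range_tree_remove_xor_add(rt, removefrom, addto)
+ * removes the overlapping [2, 4) from removefrom (leaving it empty) and
+ * adds the non-overlapping leftovers [0, 2) and [4, 10) to addto.
+ */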
+
+uint64_t
+range_tree_min(range_tree_t *rt)
+{
+ range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
+ return (rs != NULL ? rs_get_start(rs, rt) : 0);
+}
+
+uint64_t
+range_tree_max(range_tree_t *rt)
+{
+ range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
+ return (rs != NULL ? rs_get_end(rs, rt) : 0);
+}
+
+uint64_t
+range_tree_span(range_tree_t *rt)
+{
+ return (range_tree_max(rt) - range_tree_min(rt));
+}
diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c
new file mode 100644
index 000000000000..39476261edfb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/refcount.c
@@ -0,0 +1,327 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_refcount.h>
+
+/*
+ * Reference count tracking is disabled by default. Its memory requirements
+ * are reasonable; however, as implemented it consumes a significant amount of
+ * CPU time. Until its performance is improved it should be manually enabled.
+ */
+int reference_tracking_enable = FALSE;
+int reference_history = 3; /* tunable */
+
+#ifdef ZFS_DEBUG
+static kmem_cache_t *reference_cache;
+static kmem_cache_t *reference_history_cache;
+
+void
+zfs_refcount_init(void)
+{
+ reference_cache = kmem_cache_create("reference_cache",
+ sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ reference_history_cache = kmem_cache_create("reference_history_cache",
+ sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+zfs_refcount_fini(void)
+{
+ kmem_cache_destroy(reference_cache);
+ kmem_cache_destroy(reference_history_cache);
+}
+
+void
+zfs_refcount_create(zfs_refcount_t *rc)
+{
+ mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&rc->rc_list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&rc->rc_removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ rc->rc_count = 0;
+ rc->rc_removed_count = 0;
+ rc->rc_tracked = reference_tracking_enable;
+}
+
+void
+zfs_refcount_create_tracked(zfs_refcount_t *rc)
+{
+ zfs_refcount_create(rc);
+ rc->rc_tracked = B_TRUE;
+}
+
+void
+zfs_refcount_create_untracked(zfs_refcount_t *rc)
+{
+ zfs_refcount_create(rc);
+ rc->rc_tracked = B_FALSE;
+}
+
+void
+zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
+{
+ reference_t *ref;
+
+ ASSERT3U(rc->rc_count, ==, number);
+ while ((ref = list_head(&rc->rc_list))) {
+ list_remove(&rc->rc_list, ref);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_list);
+
+ while ((ref = list_head(&rc->rc_removed))) {
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache, ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_removed);
+ mutex_destroy(&rc->rc_mtx);
+}
+
+void
+zfs_refcount_destroy(zfs_refcount_t *rc)
+{
+ zfs_refcount_destroy_many(rc, 0);
+}
+
+int
+zfs_refcount_is_zero(zfs_refcount_t *rc)
+{
+ return (rc->rc_count == 0);
+}
+
+int64_t
+zfs_refcount_count(zfs_refcount_t *rc)
+{
+ return (rc->rc_count);
+}
+
+int64_t
+zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
+{
+ reference_t *ref = NULL;
+ int64_t count;
+
+ if (rc->rc_tracked) {
+ ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
+ ref->ref_holder = holder;
+ ref->ref_number = number;
+ }
+ mutex_enter(&rc->rc_mtx);
+ ASSERT3U(rc->rc_count, >=, 0);
+ if (rc->rc_tracked)
+ list_insert_head(&rc->rc_list, ref);
+ rc->rc_count += number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+
+ return (count);
+}
+
+int64_t
+zfs_refcount_add(zfs_refcount_t *rc, const void *holder)
+{
+ return (zfs_refcount_add_many(rc, 1, holder));
+}
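+
+/*
+ * Illustrative sketch (hypothetical usage, effective in ZFS_DEBUG builds
+ * with tracking enabled): a typical tracked-holder lifecycle, where the
+ * holder tag is usually the address of the structure taking the reference,
+ * or FTAG:
+ *
+ *	zfs_refcount_t rc;
+ *
+ *	zfs_refcount_create_tracked(&rc);
+ *	(void) zfs_refcount_add(&rc, FTAG);
+ *	ASSERT(zfs_refcount_held(&rc, FTAG));
+ *	(void) zfs_refcount_remove(&rc, FTAG);
+ *	ASSERT(zfs_refcount_is_zero(&rc));
+ *	zfs_refcount_destroy(&rc);
+ */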
+
+int64_t
+zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
+ const void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ mutex_enter(&rc->rc_mtx);
+ ASSERT3U(rc->rc_count, >=, number);
+
+ if (!rc->rc_tracked) {
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder && ref->ref_number == number) {
+ list_remove(&rc->rc_list, ref);
+ if (reference_history > 0) {
+ ref->ref_removed =
+ kmem_cache_alloc(reference_history_cache,
+ KM_SLEEP);
+ list_insert_head(&rc->rc_removed, ref);
+ rc->rc_removed_count++;
+ if (rc->rc_removed_count > reference_history) {
+ ref = list_tail(&rc->rc_removed);
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache,
+ ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ rc->rc_removed_count--;
+ }
+ } else {
+ kmem_cache_free(reference_cache, ref);
+ }
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+ }
+ panic("No such hold %p on refcount %llx", holder,
+ (u_longlong_t)(uintptr_t)rc);
+ return (-1);
+}
+
+int64_t
+zfs_refcount_remove(zfs_refcount_t *rc, const void *holder)
+{
+ return (zfs_refcount_remove_many(rc, 1, holder));
+}
+
+void
+zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
+{
+ int64_t count, removed_count;
+ list_t list, removed;
+
+ list_create(&list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+
+ mutex_enter(&src->rc_mtx);
+ count = src->rc_count;
+ removed_count = src->rc_removed_count;
+ src->rc_count = 0;
+ src->rc_removed_count = 0;
+ list_move_tail(&list, &src->rc_list);
+ list_move_tail(&removed, &src->rc_removed);
+ mutex_exit(&src->rc_mtx);
+
+ mutex_enter(&dst->rc_mtx);
+ dst->rc_count += count;
+ dst->rc_removed_count += removed_count;
+ list_move_tail(&dst->rc_list, &list);
+ list_move_tail(&dst->rc_removed, &removed);
+ mutex_exit(&dst->rc_mtx);
+
+ list_destroy(&list);
+ list_destroy(&removed);
+}
+
+void
+zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number,
+ const void *current_holder, const void *new_holder)
+{
+ reference_t *ref;
+ boolean_t found = B_FALSE;
+
+ mutex_enter(&rc->rc_mtx);
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return;
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == current_holder &&
+ ref->ref_number == number) {
+ ref->ref_holder = new_holder;
+ found = B_TRUE;
+ break;
+ }
+ }
+ ASSERT(found);
+ mutex_exit(&rc->rc_mtx);
+}
+
+void
+zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder,
+ const void *new_holder)
+{
+ return (zfs_refcount_transfer_ownership_many(rc, 1, current_holder,
+ new_holder));
+}
+
+/*
+ * If tracking is enabled, return true if a reference exists that matches
+ * the "holder" tag. If tracking is disabled, then return true if a reference
+ * might be held.
+ */
+boolean_t
+zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
+{
+ reference_t *ref;
+
+ mutex_enter(&rc->rc_mtx);
+
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return (rc->rc_count > 0);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+ }
+ }
+ mutex_exit(&rc->rc_mtx);
+ return (B_FALSE);
+}
+
+/*
+ * If tracking is enabled, return true if a reference does not exist that
+ * matches the "holder" tag. If tracking is disabled, always return true
+ * since the reference might not be held.
+ */
+boolean_t
+zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder)
+{
+ reference_t *ref;
+
+ mutex_enter(&rc->rc_mtx);
+
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_FALSE);
+ }
+ }
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+}
+#endif /* ZFS_DEBUG */
diff --git a/sys/contrib/openzfs/module/zfs/rrwlock.c b/sys/contrib/openzfs/module/zfs/rrwlock.c
new file mode 100644
index 000000000000..d23fc3ad1067
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/rrwlock.c
@@ -0,0 +1,396 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/rrwlock.h>
+#include <sys/trace_zfs.h>
+
+/*
+ * This file contains the implementation of a re-entrant read
+ * reader/writer lock (aka "rrwlock").
+ *
+ * This is a normal reader/writer lock with the additional feature
+ * of allowing threads who have already obtained a read lock to
+ * re-enter another read lock (re-entrant read) - even if there are
+ * waiting writers.
+ *
+ * Callers who have not obtained a read lock give waiting writers priority.
+ *
+ * The rrwlock_t lock does not allow re-entrant writers, nor does it
+ * allow a re-entrant mix of reads and writes (that is, it does not
+ * allow a caller who has already obtained a read lock to be able to
+ * then grab a write lock without first dropping all read locks, and
+ * vice versa).
+ *
+ * The rrwlock_t uses tsd (thread specific data) to keep a list of
+ * nodes (rrw_node_t), where each node keeps track of which specific
+ * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
+ * should be rare, a thread that grabs multiple reads on the same rrwlock_t
+ * will store multiple rrw_node_ts of the same 'rn_rrl'. Nodes on the
+ * tsd list can represent a different rrwlock_t. This allows a thread
+ * to enter multiple and unique rrwlock_ts for read locks at the same time.
+ *
+ * Since using tsd exposes some overhead, the rrwlock_t only needs to
+ * keep tsd data when writers are waiting. If no writers are waiting, then
+ * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
+ * is needed. Once a writer attempts to grab the lock, readers then
+ * keep tsd data and bump the linked readers count (rr_linked_rcount).
+ *
+ * If there are waiting writers and there are anonymous readers, then a
+ * reader doesn't know if it is a re-entrant lock. But since it may be one,
+ * we allow the read to proceed (otherwise it could deadlock). Since once
+ * waiting writers are active, readers no longer bump the anonymous count,
+ * the anonymous readers will eventually flush themselves out. At this point,
+ * readers will be able to tell if they are a re-entrant lock (have a
+ * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
+ * we must let them proceed. If they are not, then the reader blocks for the
+ * waiting writers. Hence, we do not starve writers.
+ */
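+
+/*
+ * Illustrative sketch (hypothetical usage): a thread that already holds a
+ * read lock may re-enter it even while a writer is waiting, but it must
+ * drop all of its read holds before taking the write lock:
+ *
+ *	rrw_enter_read(&rrl, FTAG);
+ *	rrw_enter_read(&rrl, FTAG);	re-entrant; not blocked by waiters
+ *	rrw_exit(&rrl, FTAG);
+ *	rrw_exit(&rrl, FTAG);
+ *
+ *	rrw_enter_write(&rrl);		only once all read holds are gone
+ *	rrw_exit(&rrl, FTAG);
+ */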
+
+/* global key for TSD */
+uint_t rrw_tsd_key;
+
+typedef struct rrw_node {
+ struct rrw_node *rn_next;
+ rrwlock_t *rn_rrl;
+ void *rn_tag;
+} rrw_node_t;
+
+static rrw_node_t *
+rrn_find(rrwlock_t *rrl)
+{
+ rrw_node_t *rn;
+
+ if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
+ return (NULL);
+
+ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+ if (rn->rn_rrl == rrl)
+ return (rn);
+ }
+ return (NULL);
+}
+
+/*
+ * Add a node to the head of the singly linked list.
+ */
+static void
+rrn_add(rrwlock_t *rrl, void *tag)
+{
+ rrw_node_t *rn;
+
+ rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
+ rn->rn_rrl = rrl;
+ rn->rn_next = tsd_get(rrw_tsd_key);
+ rn->rn_tag = tag;
+ VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
+}
+
+/*
+ * If a node is found for 'rrl', then remove the node from this
+ * thread's list and return TRUE; otherwise return FALSE.
+ */
+static boolean_t
+rrn_find_and_remove(rrwlock_t *rrl, void *tag)
+{
+ rrw_node_t *rn;
+ rrw_node_t *prev = NULL;
+
+ if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
+ return (B_FALSE);
+
+ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+ if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
+ if (prev)
+ prev->rn_next = rn->rn_next;
+ else
+ VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
+ kmem_free(rn, sizeof (*rn));
+ return (B_TRUE);
+ }
+ prev = rn;
+ }
+ return (B_FALSE);
+}
+
+void
+rrw_init(rrwlock_t *rrl, boolean_t track_all)
+{
+ mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
+ rrl->rr_writer = NULL;
+ zfs_refcount_create(&rrl->rr_anon_rcount);
+ zfs_refcount_create(&rrl->rr_linked_rcount);
+ rrl->rr_writer_wanted = B_FALSE;
+ rrl->rr_track_all = track_all;
+}
+
+void
+rrw_destroy(rrwlock_t *rrl)
+{
+ mutex_destroy(&rrl->rr_lock);
+ cv_destroy(&rrl->rr_cv);
+ ASSERT(rrl->rr_writer == NULL);
+ zfs_refcount_destroy(&rrl->rr_anon_rcount);
+ zfs_refcount_destroy(&rrl->rr_linked_rcount);
+}
+
+static void
+rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
+{
+ mutex_enter(&rrl->rr_lock);
+#if !defined(ZFS_DEBUG) && defined(_KERNEL)
+ if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
+ !rrl->rr_track_all) {
+ rrl->rr_anon_rcount.rc_count++;
+ mutex_exit(&rrl->rr_lock);
+ return;
+ }
+ DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
+#endif
+ ASSERT(rrl->rr_writer != curthread);
+ ASSERT(zfs_refcount_count(&rrl->rr_anon_rcount) >= 0);
+
+ while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
+ zfs_refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
+ rrn_find(rrl) == NULL))
+ cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+
+ if (rrl->rr_writer_wanted || rrl->rr_track_all) {
+ /* may or may not be a re-entrant enter */
+ rrn_add(rrl, tag);
+ (void) zfs_refcount_add(&rrl->rr_linked_rcount, tag);
+ } else {
+ (void) zfs_refcount_add(&rrl->rr_anon_rcount, tag);
+ }
+ ASSERT(rrl->rr_writer == NULL);
+ mutex_exit(&rrl->rr_lock);
+}
+
+void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+ rrw_enter_read_impl(rrl, B_FALSE, tag);
+}
+
+/*
+ * Take a read lock even if there are pending write lock requests. If we want
+ * to take a lock re-entrantly, but from different threads (that have a
+ * relationship to each other), the normal detection mechanism to overrule
+ * the pending writer does not work, so we have to give an explicit hint here.
+ */
+void
+rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
+{
+ rrw_enter_read_impl(rrl, B_TRUE, tag);
+}
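+
+/*
+ * Illustrative sketch of the "prio" case above (hypothetical callers):
+ * thread A holds the lock for read and hands work to thread B, which must
+ * also read-lock it while a writer may already be waiting.  B has no
+ * rrw_node_t of its own, so it must ask for priority explicitly:
+ *
+ *	A: rrw_enter_read(&lock, FTAG);
+ *	A: dispatch work to B, then wait for B to finish
+ *	B: rrw_enter_read_prio(&lock, FTAG);	(plain rrw_enter_read()
+ *						 could block behind the
+ *						 waiting writer and deadlock)
+ *	B: rrw_exit(&lock, FTAG);
+ *	A: rrw_exit(&lock, FTAG);
+ */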
+
+
+void
+rrw_enter_write(rrwlock_t *rrl)
+{
+ mutex_enter(&rrl->rr_lock);
+ ASSERT(rrl->rr_writer != curthread);
+
+ while (zfs_refcount_count(&rrl->rr_anon_rcount) > 0 ||
+ zfs_refcount_count(&rrl->rr_linked_rcount) > 0 ||
+ rrl->rr_writer != NULL) {
+ rrl->rr_writer_wanted = B_TRUE;
+ cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+ }
+ rrl->rr_writer_wanted = B_FALSE;
+ rrl->rr_writer = curthread;
+ mutex_exit(&rrl->rr_lock);
+}
+
+void
+rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
+{
+ if (rw == RW_READER)
+ rrw_enter_read(rrl, tag);
+ else
+ rrw_enter_write(rrl);
+}
+
+void
+rrw_exit(rrwlock_t *rrl, void *tag)
+{
+ mutex_enter(&rrl->rr_lock);
+#if !defined(ZFS_DEBUG) && defined(_KERNEL)
+ if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
+ rrl->rr_anon_rcount.rc_count--;
+ if (rrl->rr_anon_rcount.rc_count == 0)
+ cv_broadcast(&rrl->rr_cv);
+ mutex_exit(&rrl->rr_lock);
+ return;
+ }
+ DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
+#endif
+ ASSERT(!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
+ !zfs_refcount_is_zero(&rrl->rr_linked_rcount) ||
+ rrl->rr_writer != NULL);
+
+ if (rrl->rr_writer == NULL) {
+ int64_t count;
+ if (rrn_find_and_remove(rrl, tag)) {
+ count = zfs_refcount_remove(
+ &rrl->rr_linked_rcount, tag);
+ } else {
+ ASSERT(!rrl->rr_track_all);
+ count = zfs_refcount_remove(&rrl->rr_anon_rcount, tag);
+ }
+ if (count == 0)
+ cv_broadcast(&rrl->rr_cv);
+ } else {
+ ASSERT(rrl->rr_writer == curthread);
+ ASSERT(zfs_refcount_is_zero(&rrl->rr_anon_rcount) &&
+ zfs_refcount_is_zero(&rrl->rr_linked_rcount));
+ rrl->rr_writer = NULL;
+ cv_broadcast(&rrl->rr_cv);
+ }
+ mutex_exit(&rrl->rr_lock);
+}
+
+/*
+ * If the lock was created with track_all, rrw_held(RW_READER) will return
+ * B_TRUE iff the current thread has the lock for reader. Otherwise it may
+ * return B_TRUE if any thread has the lock for reader.
+ */
+boolean_t
+rrw_held(rrwlock_t *rrl, krw_t rw)
+{
+ boolean_t held;
+
+ mutex_enter(&rrl->rr_lock);
+ if (rw == RW_WRITER) {
+ held = (rrl->rr_writer == curthread);
+ } else {
+ held = (!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
+ rrn_find(rrl) != NULL);
+ }
+ mutex_exit(&rrl->rr_lock);
+
+ return (held);
+}
+
+void
+rrw_tsd_destroy(void *arg)
+{
+ rrw_node_t *rn = arg;
+ if (rn != NULL) {
+ panic("thread %p terminating with rrw lock %p held",
+ (void *)curthread, (void *)rn->rn_rrl);
+ }
+}
+
+/*
+ * A reader-mostly lock implementation, built on the reader/writer locks
+ * above and tuned for highly parallel read acquisitions, while pessimizing
+ * writes.
+ *
+ * The idea is to split a single busy lock into an array of locks, so that
+ * each reader can lock only one of them for read, depending on the result
+ * of a simple hash function. That proportionally reduces lock congestion.
+ * A writer at the same time has to sequentially acquire write on all the
+ * locks. That makes write acquisition proportionally slower, but in places
+ * where it is used (filesystem unmount) performance is not critical.
+ *
+ * All the functions below are direct wrappers around functions above.
+ */
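+
+/*
+ * Brief usage sketch (hypothetical caller): the rrmlock_t is used through
+ * the same enter/exit pattern as rrwlock_t, and the read/write asymmetry
+ * described above stays hidden behind the wrappers:
+ *
+ *	rrm_enter(&mlock, RW_READER, FTAG);	(locks one lock of the array)
+ *	...
+ *	rrm_exit(&mlock, FTAG);
+ *
+ *	rrm_enter(&mlock, RW_WRITER, FTAG);	(locks all locks of the array)
+ *	...
+ *	rrm_exit(&mlock, FTAG);
+ */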
+void
+rrm_init(rrmlock_t *rrl, boolean_t track_all)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_init(&rrl->locks[i], track_all);
+}
+
+void
+rrm_destroy(rrmlock_t *rrl)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_destroy(&rrl->locks[i]);
+}
+
+void
+rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
+{
+ if (rw == RW_READER)
+ rrm_enter_read(rrl, tag);
+ else
+ rrm_enter_write(rrl);
+}
+
+/*
+ * This maps the current thread to a specific lock. Note that the lock
+ * must be released by the same thread that acquired it. We do this
+ * mapping by taking the thread pointer mod a prime number. We examine
+ * only the low 32 bits of the thread pointer, because 32-bit division
+ * is faster than 64-bit division, and the high 32 bits have little
+ * entropy anyway.
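+ *
+ * For example (illustrative numbers only): a thread pointer of
+ * 0xffffa1b2c3d4e5f0 contributes only its low 32 bits, 0xc3d4e5f0, and
+ * the thread is mapped to lock index 0xc3d4e5f0 % RRM_NUM_LOCKS.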
+ */
+#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
+
+void
+rrm_enter_read(rrmlock_t *rrl, void *tag)
+{
+ rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
+}
+
+void
+rrm_enter_write(rrmlock_t *rrl)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_enter_write(&rrl->locks[i]);
+}
+
+void
+rrm_exit(rrmlock_t *rrl, void *tag)
+{
+ int i;
+
+ if (rrl->locks[0].rr_writer == curthread) {
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_exit(&rrl->locks[i], tag);
+ } else {
+ rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
+ }
+}
+
+boolean_t
+rrm_held(rrmlock_t *rrl, krw_t rw)
+{
+ if (rw == RW_WRITER) {
+ return (rrw_held(&rrl->locks[0], rw));
+ } else {
+ return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
+ }
+}
diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c
new file mode 100644
index 000000000000..5af0aaa7d0aa
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/sa.c
@@ -0,0 +1,2257 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sunddi.h>
+#include <sys/sa_impl.h>
+#include <sys/errno.h>
+#include <sys/zfs_context.h>
+
+#ifdef _KERNEL
+#include <sys/zfs_znode.h>
+#endif
+
+/*
+ * ZFS System attributes:
+ *
+ * A generic mechanism to allow for arbitrary attributes
+ * to be stored in a dnode. The data will be stored in the bonus buffer of
+ * the dnode and if necessary a special "spill" block will be used to handle
+ * overflow situations. The spill block will be sized to fit the data
+ * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the
+ * spill block is stored at the end of the current bonus buffer. Any
+ * attributes that would be in the way of the blkptr_t will be relocated
+ * into the spill block.
+ *
+ * Attribute registration:
+ *
+ * A mapping between attribute "string" names and their actual attribute
+ * numeric values, lengths, and byteswap functions is stored persistently
+ * on a per-dataset basis. The names are only used
+ * during registration. All attributes are known by their unique attribute
+ * id value. If an attribute can have a variable size then the value
+ * 0 will be used to indicate this.
+ *
+ * Attribute Layout:
+ *
+ * Attribute layouts are a way to compactly store multiple attributes, but
+ * without taking the overhead associated with managing each attribute
+ * individually. Since you will typically have the same set of attributes
+ * stored in the same order, a single table will be used to represent that
+ * layout. The ZPL, for example, will usually have only about 10 different
+ * layouts (regular files, device files, symlinks,
+ * regular files + scanstamp, files/dirs with extended attributes, and then
+ * any of those minus the ACL, because the ACL may be kicked out into the
+ * spill block).
+ *
+ * Layouts are simply an array of the attributes and their
+ * ordering i.e. [0, 1, 4, 5, 2]
+ *
+ * Each distinct layout is given a unique layout number and that is what's
+ * stored in the header at the beginning of the SA data buffer.
+ *
+ * A layout only covers a single dbuf (bonus or spill). If a set of
+ * attributes is split up between the bonus buffer and a spill buffer then
+ * two different layouts will be used. This allows us to byteswap the
+ * spill without looking at the bonus buffer and keeps the on disk format of
+ * the bonus and spill buffer the same.
+ *
+ * Adding a single attribute will cause the entire set of attributes to
+ * be rewritten and could result in a new layout number being constructed
+ * as part of the rewrite if no such layout exists for the new set of
+ * attributes. The new attribute will be appended to the end of the already
+ * existing attributes.
+ *
+ * Both the attribute registration and attribute layout information are
+ * stored in normal ZAP attributes. There should be a small number of
+ * known layouts, and the set of attributes is assumed to typically be quite
+ * small.
+ *
+ * The registered attributes and layout "table" information is maintained
+ * in core and a special "sa_os_t" is attached to the objset_t.
+ *
+ * A special interface is provided to allow for quickly applying
+ * a large set of attributes at once. sa_replace_all_by_template() is
+ * used to set an array of attributes. This is used by the ZPL when
+ * creating a brand new file. The template that is passed into the function
+ * specifies the attribute, size for variable length attributes, location of
+ * data and special "data locator" function if the data isn't in a contiguous
+ * location.
+ *
+ * Byteswap implications:
+ *
+ * Since the SA attributes are not entirely self-describing, we can't do
+ * the normal byteswap processing. The special ZAP layout attribute and
+ * attribute registration attributes define the byteswap function and the
+ * size of the attributes, unless it is variable sized.
+ * The normal ZFS byteswapping infrastructure assumes you don't need
+ * to read any objects in order to do the necessary byteswapping, whereas
+ * SA attributes can only be properly byteswapped if the dataset is opened
+ * and the layout/attribute ZAP attributes are available. Because of this
+ * the SA attributes will be byteswapped when they are first accessed by
+ * the SA code that will read the SA data.
+ */
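+
+/*
+ * A minimal consumer sketch (hypothetical caller; 'attr_table' stands for
+ * the attribute id table returned by sa_setup() and ZPL_SIZE for a ZPL
+ * attribute index):
+ *
+ *	sa_handle_t *hdl;
+ *	uint64_t size;
+ *
+ *	error = sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl);
+ *	if (error == 0) {
+ *		error = sa_lookup(hdl, attr_table[ZPL_SIZE], &size,
+ *		    sizeof (size));
+ *		sa_handle_destroy(hdl);
+ *	}
+ */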
+
+typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
+ uint16_t length, int length_idx, boolean_t, void *userp);
+
+static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
+static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
+static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
+ sa_hdr_phys_t *hdr);
+static void sa_idx_tab_rele(objset_t *os, void *arg);
+static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
+ int buflen);
+static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx);
+
+arc_byteswap_func_t sa_bswap_table[] = {
+ byteswap_uint64_array,
+ byteswap_uint32_array,
+ byteswap_uint16_array,
+ byteswap_uint8_array,
+ zfs_acl_byteswap,
+};
+
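+/*
+ * SA_COPY_DATA() copies attribute data, using the caller-supplied locator
+ * function when one is given and a plain copy otherwise.  On platforms
+ * with efficient unaligned access, the common 8- and 16-byte fixed-size
+ * cases are open-coded to avoid the bcopy() call.
+ */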
+#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS
+#define SA_COPY_DATA(f, s, t, l) \
+do { \
+ if (f == NULL) { \
+ if (l == 8) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ } else if (l == 16) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ *(uint64_t *)((uintptr_t)t + 8) = \
+ *(uint64_t *)((uintptr_t)s + 8); \
+ } else { \
+ bcopy(s, t, l); \
+ } \
+ } else { \
+ sa_copy_data(f, s, t, l); \
+ } \
+} while (0)
+#else
+#define SA_COPY_DATA(f, s, t, l) sa_copy_data(f, s, t, l)
+#endif
+
+/*
+ * This table is fixed and cannot be changed. Its purpose is to
+ * allow the SA code to work with both old/new ZPL file systems.
+ * It contains the list of legacy attributes. These attributes aren't
+ * stored in the "attribute" registry zap objects, since older ZPL file systems
+ * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will
+ * use this static table.
+ */
+sa_attr_reg_t sa_legacy_attrs[] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+};
+
+/*
+ * This is only used for objects of type DMU_OT_ZNODE
+ */
+sa_attr_type_t sa_legacy_zpl_layout[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+/*
+ * Special dummy layout used for buffers with no attributes.
+ */
+sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
+
+static int sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs);
+static kmem_cache_t *sa_cache = NULL;
+
+/*ARGSUSED*/
+static int
+sa_cache_constructor(void *buf, void *unused, int kmflag)
+{
+ sa_handle_t *hdl = buf;
+
+ mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_cache_destructor(void *buf, void *unused)
+{
+ sa_handle_t *hdl = buf;
+ mutex_destroy(&hdl->sa_lock);
+}
+
+void
+sa_cache_init(void)
+{
+ sa_cache = kmem_cache_create("sa_cache",
+ sizeof (sa_handle_t), 0, sa_cache_constructor,
+ sa_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+sa_cache_fini(void)
+{
+ if (sa_cache)
+ kmem_cache_destroy(sa_cache);
+}
+
+static int
+layout_num_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = (const sa_lot_t *)arg1;
+ const sa_lot_t *node2 = (const sa_lot_t *)arg2;
+
+ return (TREE_CMP(node1->lot_num, node2->lot_num));
+}
+
+static int
+layout_hash_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = (const sa_lot_t *)arg1;
+ const sa_lot_t *node2 = (const sa_lot_t *)arg2;
+
+ int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(node1->lot_instance, node2->lot_instance));
+}
+
+static boolean_t
+sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
+{
+ int i;
+
+ if (count != tbf->lot_attr_count)
+ return (1);
+
+ for (i = 0; i != count; i++) {
+ if (attrs[i] != tbf->lot_attrs[i])
+ return (1);
+ }
+ return (0);
+}
+
+#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
+
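+/*
+ * The layout hash is simply the XOR of one CRC table entry per attribute,
+ * so distinct attribute sets can collide; sa_find_layout() resolves
+ * collisions with sa_layout_equal() and the per-layout lot_instance value.
+ */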
+static uint64_t
+sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
+{
+ int i;
+ uint64_t crc = -1ULL;
+
+ for (i = 0; i != attr_count; i++)
+ crc ^= SA_ATTR_HASH(attrs[i]);
+
+ return (crc);
+}
+
+static int
+sa_get_spill(sa_handle_t *hdl)
+{
+ int rc;
+ if (hdl->sa_spill == NULL) {
+ if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
+ &hdl->sa_spill)) == 0)
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ } else {
+ rc = 0;
+ }
+
+ return (rc);
+}
+
+/*
+ * Main attribute lookup/update function
+ * returns 0 for success or non-zero for failure.
+ *
+ * Operates on a bulk array; the first failure will abort further processing.
+ */
+static int
+sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ sa_data_op_t data_op, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ int i;
+ int error = 0;
+ sa_buf_type_t buftypes;
+
+ buftypes = 0;
+
+ ASSERT(count > 0);
+ for (i = 0; i != count; i++) {
+ ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
+
+ bulk[i].sa_addr = NULL;
+ /* First check the bonus buffer */
+
+ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
+ hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
+ SA_GET_HDR(hdl, SA_BONUS),
+ bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
+ if (tx && !(buftypes & SA_BONUS)) {
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ buftypes |= SA_BONUS;
+ }
+ }
+ if (bulk[i].sa_addr == NULL &&
+ ((error = sa_get_spill(hdl)) == 0)) {
+ if (TOC_ATTR_PRESENT(
+ hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_spill_tab,
+ SA_GET_HDR(hdl, SA_SPILL),
+ bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
+ if (tx && !(buftypes & SA_SPILL) &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+ buftypes |= SA_SPILL;
+ }
+ }
+ }
+ if (error && error != ENOENT) {
+ return ((error == ECKSUM) ? EIO : error);
+ }
+
+ switch (data_op) {
+ case SA_LOOKUP:
+ if (bulk[i].sa_addr == NULL)
+ return (SET_ERROR(ENOENT));
+ if (bulk[i].sa_data) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_addr, bulk[i].sa_data,
+ bulk[i].sa_size);
+ }
+ continue;
+
+ case SA_UPDATE:
+ /* existing rewrite of attr */
+ if (bulk[i].sa_addr &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_addr,
+ bulk[i].sa_length);
+ continue;
+ } else if (bulk[i].sa_addr) { /* attr size change */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_REPLACE, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ } else { /* adding new attribute */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_ADD, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ }
+ if (error)
+ return (error);
+ break;
+ default:
+ break;
+ }
+ }
+ return (error);
+}
+
+static sa_lot_t *
+sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
+ uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, *findtb;
+ int i;
+ avl_index_t loc;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
+ tb->lot_attr_count = attr_count;
+ tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+ tb->lot_num = lot_num;
+ tb->lot_hash = hash;
+ tb->lot_instance = 0;
+
+ if (zapadd) {
+ char attr_name[8];
+
+ if (sa->sa_layout_attr_obj == 0) {
+ sa->sa_layout_attr_obj = zap_create_link(os,
+ DMU_OT_SA_ATTR_LAYOUTS,
+ sa->sa_master_obj, SA_LAYOUTS, tx);
+ }
+
+ (void) snprintf(attr_name, sizeof (attr_name),
+ "%d", (int)lot_num);
+ VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
+ attr_name, 2, attr_count, attrs, tx));
+ }
+
+ list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
+ offsetof(sa_idx_tab_t, sa_next));
+
+ for (i = 0; i != attr_count; i++) {
+ if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
+ tb->lot_var_sizes++;
+ }
+
+ avl_add(&sa->sa_layout_num_tree, tb);
+
+ /* verify we don't have a hash collision */
+ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
+ for (; findtb && findtb->lot_hash == hash;
+ findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
+ if (findtb->lot_instance != tb->lot_instance)
+ break;
+ tb->lot_instance++;
+ }
+ }
+ avl_add(&sa->sa_layout_hash_tree, tb);
+ return (tb);
+}
+
+static void
+sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
+ int count, dmu_tx_t *tx, sa_lot_t **lot)
+{
+ sa_lot_t *tb, tbsearch;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ boolean_t found = B_FALSE;
+
+ mutex_enter(&sa->sa_lock);
+ tbsearch.lot_hash = hash;
+ tbsearch.lot_instance = 0;
+ tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
+ if (tb) {
+ for (; tb && tb->lot_hash == hash;
+ tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
+ if (sa_layout_equal(tb, attrs, count) == 0) {
+ found = B_TRUE;
+ break;
+ }
+ }
+ }
+ if (!found) {
+ tb = sa_add_layout_entry(os, attrs, count,
+ avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
+ }
+ mutex_exit(&sa->sa_lock);
+ *lot = tb;
+}
+
+static int
+sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
+{
+ int error;
+ uint32_t blocksize;
+
+ if (size == 0) {
+ blocksize = SPA_MINBLOCKSIZE;
+ } else if (size > SPA_OLD_MAXBLOCKSIZE) {
+ ASSERT(0);
+ return (SET_ERROR(EFBIG));
+ } else {
+ blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
+ }
+
+ error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
+ ASSERT(error == 0);
+ return (error);
+}
+
+static void
+sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
+{
+ if (func == NULL) {
+ bcopy(datastart, target, buflen);
+ } else {
+ boolean_t start;
+ int bytes;
+ void *dataptr;
+ void *saptr = target;
+ uint32_t length;
+
+ start = B_TRUE;
+ bytes = 0;
+ while (bytes < buflen) {
+ func(&dataptr, &length, buflen, start, datastart);
+ bcopy(dataptr, saptr, length);
+ saptr = (void *)((caddr_t)saptr + length);
+ bytes += length;
+ start = B_FALSE;
+ }
+ }
+}
+
+/*
+ * Determine several different values pertaining to system attribute
+ * buffers.
+ *
+ * Return the size of the sa_hdr_phys_t header for the buffer. Each
+ * variable length attribute except the first contributes two bytes to
+ * the header size, which is then rounded up to an 8-byte boundary.
+ *
+ * The following output parameters are also computed.
+ *
+ * index - The index of the first attribute in attr_desc that will
+ * spill over. Only valid if will_spill is set.
+ *
+ * total - The total number of bytes of all system attributes described
+ * in attr_desc.
+ *
+ * will_spill - Set when spilling is necessary. It is only set when
+ * the buftype is SA_BONUS.
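+ *
+ * For example, following the rule above: a buffer with one variable-length
+ * attribute keeps the minimal 8-byte header, while one with three
+ * variable-length attributes needs 8 + 2 + 2 = 12 bytes, rounded up to 16.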
+ */
+static int
+sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index,
+ int *total, boolean_t *will_spill)
+{
+ int var_size_count = 0;
+ int i;
+ int hdrsize;
+ int extra_hdrsize;
+
+ if (buftype == SA_BONUS && sa->sa_force_spill) {
+ *total = 0;
+ *index = 0;
+ *will_spill = B_TRUE;
+ return (0);
+ }
+
+ *index = -1;
+ *total = 0;
+ *will_spill = B_FALSE;
+
+ extra_hdrsize = 0;
+ hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
+ sizeof (sa_hdr_phys_t);
+
+ ASSERT(IS_P2ALIGNED(full_space, 8));
+
+ for (i = 0; i != attr_count; i++) {
+ boolean_t is_var_sz, might_spill_here;
+ int tmp_hdrsize;
+
+ *total = P2ROUNDUP(*total, 8);
+ *total += attr_desc[i].sa_length;
+ if (*will_spill)
+ continue;
+
+ is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
+ if (is_var_sz)
+ var_size_count++;
+
+ /*
+ * Calculate what the SA header size would be if this
+ * attribute doesn't spill.
+ */
+ tmp_hdrsize = hdrsize + ((is_var_sz && var_size_count > 1) ?
+ sizeof (uint16_t) : 0);
+
+ /*
+ * Check whether this attribute spans into the space
+ * that would be used by the spill block pointer should
+ * a spill block be needed.
+ */
+ might_spill_here =
+ buftype == SA_BONUS && *index == -1 &&
+ (*total + P2ROUNDUP(tmp_hdrsize, 8)) >
+ (full_space - sizeof (blkptr_t));
+
+ if (is_var_sz && var_size_count > 1) {
+ if (buftype == SA_SPILL ||
+ tmp_hdrsize + *total < full_space) {
+ /*
+ * Record the extra header size in case this
+ * increase needs to be reversed due to
+ * spill-over.
+ */
+ hdrsize = tmp_hdrsize;
+ if (*index != -1 || might_spill_here)
+ extra_hdrsize += sizeof (uint16_t);
+ } else {
+ ASSERT(buftype == SA_BONUS);
+ if (*index == -1)
+ *index = i;
+ *will_spill = B_TRUE;
+ continue;
+ }
+ }
+
+ /*
+ * Store index of where spill *could* occur. Then
+ * continue to count the remaining attribute sizes. The
+ * sum is used later for sizing bonus and spill buffer.
+ */
+ if (might_spill_here)
+ *index = i;
+
+ if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
+ buftype == SA_BONUS)
+ *will_spill = B_TRUE;
+ }
+
+ if (*will_spill)
+ hdrsize -= extra_hdrsize;
+
+ hdrsize = P2ROUNDUP(hdrsize, 8);
+ return (hdrsize);
+}
+
+#define BUF_SPACE_NEEDED(total, header) (total + header)
+
+/*
+ * Find the layout that corresponds to the ordering of attributes.
+ * If none is found, a new layout number is created and added to the
+ * persistent layout tables.
+ */
+static int
+sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ uint64_t hash;
+ sa_buf_type_t buftype;
+ sa_hdr_phys_t *sahdr;
+ void *data_start;
+ sa_attr_type_t *attrs, *attrs_start;
+ int i, lot_count;
+ int dnodesize;
+ int spill_idx;
+ int hdrsize;
+ int spillhdrsize = 0;
+ int used;
+ dmu_object_type_t bonustype;
+ sa_lot_t *lot;
+ int len_idx;
+ int spill_used;
+ int bonuslen;
+ boolean_t spilling;
+
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
+ dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
+ bonuslen = DN_BONUS_SIZE(dnodesize);
+
+ /* first determine bonus header size and sum of all attributes */
+ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
+ SA_BONUS, bonuslen, &spill_idx, &used, &spilling);
+
+ if (used > SPA_OLD_MAXBLOCKSIZE)
+ return (SET_ERROR(EFBIG));
+
+ VERIFY0(dmu_set_bonus(hdl->sa_bonus, spilling ?
+ MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) :
+ used + hdrsize, tx));
+
+ ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
+ bonustype == DMU_OT_SA);
+
+ /* setup and size spill buffer when needed */
+ if (spilling) {
+ boolean_t dummy;
+
+ if (hdl->sa_spill == NULL) {
+ VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL,
+ &hdl->sa_spill) == 0);
+ }
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+
+ spillhdrsize = sa_find_sizes(sa, &attr_desc[spill_idx],
+ attr_count - spill_idx, hdl->sa_spill, SA_SPILL,
+ hdl->sa_spill->db_size, &i, &spill_used, &dummy);
+
+ if (spill_used > SPA_OLD_MAXBLOCKSIZE)
+ return (SET_ERROR(EFBIG));
+
+ if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
+ hdl->sa_spill->db_size)
+ VERIFY(0 == sa_resize_spill(hdl,
+ BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
+ }
+
+ /* setup starting pointers to lay down data */
+ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
+ sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
+ buftype = SA_BONUS;
+
+ attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ lot_count = 0;
+
+ for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
+ uint16_t length;
+
+ ASSERT(IS_P2ALIGNED(data_start, 8));
+ attrs[i] = attr_desc[i].sa_attr;
+ length = SA_REGISTERED_LEN(sa, attrs[i]);
+ if (length == 0)
+ length = attr_desc[i].sa_length;
+
+ if (spilling && i == spill_idx) { /* switch to spill buffer */
+ VERIFY(bonustype == DMU_OT_SA);
+ if (buftype == SA_BONUS && !sa->sa_force_spill) {
+ sa_find_layout(hdl->sa_os, hash, attrs_start,
+ lot_count, tx, &lot);
+ SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
+ }
+
+ buftype = SA_SPILL;
+ hash = -1ULL;
+ len_idx = 0;
+
+ sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
+ sahdr->sa_magic = SA_MAGIC;
+ data_start = (void *)((uintptr_t)sahdr +
+ spillhdrsize);
+ attrs_start = &attrs[i];
+ lot_count = 0;
+ }
+ hash ^= SA_ATTR_HASH(attrs[i]);
+ attr_desc[i].sa_addr = data_start;
+ attr_desc[i].sa_size = length;
+ SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
+ data_start, length);
+ if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
+ sahdr->sa_lengths[len_idx++] = length;
+ }
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ length), 8);
+ lot_count++;
+ }
+
+ sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
+
+ /*
+ * Verify that old znodes always have layout number 0.
+	 * Must be DMU_OT_SA for arbitrary layouts.
+ */
+ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
+ (bonustype == DMU_OT_SA && lot->lot_num > 1));
+
+ if (bonustype == DMU_OT_SA) {
+ SA_SET_HDR(sahdr, lot->lot_num,
+ buftype == SA_BONUS ? hdrsize : spillhdrsize);
+ }
+
+ kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
+ if (hdl->sa_bonus_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+ hdl->sa_bonus_tab = NULL;
+ }
+ if (!sa->sa_force_spill)
+ VERIFY(0 == sa_build_index(hdl, SA_BONUS));
+ if (hdl->sa_spill) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ if (!spilling) {
+ /*
+ * remove spill block that is no longer needed.
+ */
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ hdl->sa_spill = NULL;
+ hdl->sa_spill_tab = NULL;
+ VERIFY(0 == dmu_rm_spill(hdl->sa_os,
+ sa_handle_object(hdl), tx));
+ } else {
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ }
+ }
+
+ return (0);
+}
+
+static void
+sa_free_attr_table(sa_os_t *sa)
+{
+ int i;
+
+ if (sa->sa_attr_table == NULL)
+ return;
+
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_name)
+ kmem_free(sa->sa_attr_table[i].sa_name,
+ strlen(sa->sa_attr_table[i].sa_name) + 1);
+ }
+
+ kmem_free(sa->sa_attr_table,
+ sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+
+ sa->sa_attr_table = NULL;
+}
+
+static int
+sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
+{
+ sa_os_t *sa = os->os_sa;
+ uint64_t sa_attr_count = 0;
+ uint64_t sa_reg_count = 0;
+ int error = 0;
+ uint64_t attr_value;
+ sa_attr_table_t *tb;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int registered_count = 0;
+ int i;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+
+ sa->sa_user_table =
+ kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
+ sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
+
+ if (sa->sa_reg_attr_obj != 0) {
+ error = zap_count(os, sa->sa_reg_attr_obj,
+ &sa_attr_count);
+
+ /*
+ * Make sure we retrieved a count and that it isn't zero
+ */
+ if (error || (error == 0 && sa_attr_count == 0)) {
+ if (error == 0)
+ error = SET_ERROR(EINVAL);
+ goto bail;
+ }
+ sa_reg_count = sa_attr_count;
+ }
+
+ if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
+ sa_attr_count += sa_legacy_attr_count;
+
+ /* Allocate attribute numbers for attributes that aren't registered */
+ for (i = 0; i != count; i++) {
+ boolean_t found = B_FALSE;
+ int j;
+
+ if (ostype == DMU_OST_ZFS) {
+ for (j = 0; j != sa_legacy_attr_count; j++) {
+ if (strcmp(reg_attrs[i].sa_name,
+ sa_legacy_attrs[j].sa_name) == 0) {
+ sa->sa_user_table[i] =
+ sa_legacy_attrs[j].sa_attr;
+ found = B_TRUE;
+ }
+ }
+ }
+ if (found)
+ continue;
+
+ if (sa->sa_reg_attr_obj)
+ error = zap_lookup(os, sa->sa_reg_attr_obj,
+ reg_attrs[i].sa_name, 8, 1, &attr_value);
+ else
+ error = SET_ERROR(ENOENT);
+ switch (error) {
+ case ENOENT:
+ sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
+ sa_attr_count++;
+ break;
+ case 0:
+ sa->sa_user_table[i] = ATTR_NUM(attr_value);
+ break;
+ default:
+ goto bail;
+ }
+ }
+
+ sa->sa_num_attrs = sa_attr_count;
+ tb = sa->sa_attr_table =
+ kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
+
+ /*
+	 * The attribute table is constructed from the requested attribute
+	 * list, previously registered ("foreign") attributes, and also the
+	 * legacy ZPL set of attributes.
+ */
+
+ if (sa->sa_reg_attr_obj) {
+ for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t value;
+ value = za.za_first_integer;
+
+ registered_count++;
+ tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
+ tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
+ tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
+ tb[ATTR_NUM(value)].sa_registered = B_TRUE;
+
+ if (tb[ATTR_NUM(value)].sa_name) {
+ continue;
+ }
+ tb[ATTR_NUM(value)].sa_name =
+ kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
+ (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
+ strlen(za.za_name) +1);
+ }
+ zap_cursor_fini(&zc);
+ /*
+ * Make sure we processed the correct number of registered
+ * attributes
+ */
+ if (registered_count != sa_reg_count) {
+ ASSERT(error != 0);
+ goto bail;
+ }
+
+ }
+
+ if (ostype == DMU_OST_ZFS) {
+ for (i = 0; i != sa_legacy_attr_count; i++) {
+ if (tb[i].sa_name)
+ continue;
+ tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
+ tb[i].sa_length = sa_legacy_attrs[i].sa_length;
+ tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
+ tb[i].sa_registered = B_FALSE;
+ tb[i].sa_name =
+ kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
+ KM_SLEEP);
+ (void) strlcpy(tb[i].sa_name,
+ sa_legacy_attrs[i].sa_name,
+ strlen(sa_legacy_attrs[i].sa_name) + 1);
+ }
+ }
+
+ for (i = 0; i != count; i++) {
+ sa_attr_type_t attr_id;
+
+ attr_id = sa->sa_user_table[i];
+ if (tb[attr_id].sa_name)
+ continue;
+
+ tb[attr_id].sa_length = reg_attrs[i].sa_length;
+ tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
+ tb[attr_id].sa_attr = attr_id;
+ tb[attr_id].sa_name =
+ kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
+ (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
+ strlen(reg_attrs[i].sa_name) + 1);
+ }
+
+ sa->sa_need_attr_registration =
+ (sa_attr_count != registered_count);
+
+ return (0);
+bail:
+ kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
+ sa->sa_user_table = NULL;
+ sa_free_attr_table(sa);
+ ASSERT(error != 0);
+ return (error);
+}
+
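+/*
+ * Set up the per-objset SA infrastructure: build the attribute table,
+ * load any persistent layouts from the SA master object 'sa_obj', and
+ * hand back the translation table for the caller's requested attributes.
+ * If the objset already has an sa_os_t attached, the existing user table
+ * is simply returned.
+ */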
+int
+sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
+ sa_attr_type_t **user_table)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ sa_os_t *sa;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+ sa_attr_type_t *tb;
+ int error;
+
+ mutex_enter(&os->os_user_ptr_lock);
+ if (os->os_sa) {
+ mutex_enter(&os->os_sa->sa_lock);
+ mutex_exit(&os->os_user_ptr_lock);
+ tb = os->os_sa->sa_user_table;
+ mutex_exit(&os->os_sa->sa_lock);
+ *user_table = tb;
+ return (0);
+ }
+
+ sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
+ mutex_init(&sa->sa_lock, NULL, MUTEX_NOLOCKDEP, NULL);
+ sa->sa_master_obj = sa_obj;
+
+ os->os_sa = sa;
+ mutex_enter(&sa->sa_lock);
+ mutex_exit(&os->os_user_ptr_lock);
+ avl_create(&sa->sa_layout_num_tree, layout_num_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
+ avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
+
+ if (sa_obj) {
+ error = zap_lookup(os, sa_obj, SA_LAYOUTS,
+ 8, 1, &sa->sa_layout_attr_obj);
+ if (error != 0 && error != ENOENT)
+ goto fail;
+ error = zap_lookup(os, sa_obj, SA_REGISTRY,
+ 8, 1, &sa->sa_reg_attr_obj);
+ if (error != 0 && error != ENOENT)
+ goto fail;
+ }
+
+ if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
+ goto fail;
+
+ if (sa->sa_layout_attr_obj != 0) {
+ uint64_t layout_count;
+
+ error = zap_count(os, sa->sa_layout_attr_obj,
+ &layout_count);
+
+ /*
+ * Layout number count should be > 0
+ */
+ if (error || (error == 0 && layout_count == 0)) {
+ if (error == 0)
+ error = SET_ERROR(EINVAL);
+ goto fail;
+ }
+
+ for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ sa_attr_type_t *lot_attrs;
+ uint64_t lot_num;
+
+ lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
+ za.za_num_integers, KM_SLEEP);
+
+ if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
+ za.za_name, 2, za.za_num_integers,
+ lot_attrs))) != 0) {
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ break;
+ }
+ VERIFY(ddi_strtoull(za.za_name, NULL, 10,
+ (unsigned long long *)&lot_num) == 0);
+
+ (void) sa_add_layout_entry(os, lot_attrs,
+ za.za_num_integers, lot_num,
+ sa_layout_info_hash(lot_attrs,
+ za.za_num_integers), B_FALSE, NULL);
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ }
+ zap_cursor_fini(&zc);
+
+ /*
+ * Make sure layout count matches number of entries added
+ * to AVL tree
+ */
+ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
+ ASSERT(error != 0);
+ goto fail;
+ }
+ }
+
+ /* Add special layout number for old ZNODES */
+ if (ostype == DMU_OST_ZFS) {
+ (void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
+ sa_legacy_attr_count, 0,
+ sa_layout_info_hash(sa_legacy_zpl_layout,
+ sa_legacy_attr_count), B_FALSE, NULL);
+
+ (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
+ 0, B_FALSE, NULL);
+ }
+ *user_table = os->os_sa->sa_user_table;
+ mutex_exit(&sa->sa_lock);
+ return (0);
+fail:
+ os->os_sa = NULL;
+ sa_free_attr_table(sa);
+ if (sa->sa_user_table)
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+ mutex_exit(&sa->sa_lock);
+ avl_destroy(&sa->sa_layout_hash_tree);
+ avl_destroy(&sa->sa_layout_num_tree);
+ mutex_destroy(&sa->sa_lock);
+ kmem_free(sa, sizeof (sa_os_t));
+ return ((error == ECKSUM) ? EIO : error);
+}
+
+void
+sa_tear_down(objset_t *os)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *layout;
+ void *cookie;
+
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+
+ /* Free up attr table */
+
+ sa_free_attr_table(sa);
+
+ cookie = NULL;
+ while ((layout =
+ avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))) {
+ sa_idx_tab_t *tab;
+ while ((tab = list_head(&layout->lot_idx_tab))) {
+ ASSERT(zfs_refcount_count(&tab->sa_refcount));
+ sa_idx_tab_rele(os, tab);
+ }
+ }
+
+ cookie = NULL;
+ while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))) {
+ kmem_free(layout->lot_attrs,
+ sizeof (sa_attr_type_t) * layout->lot_attr_count);
+ kmem_free(layout, sizeof (sa_lot_t));
+ }
+
+ avl_destroy(&sa->sa_layout_hash_tree);
+ avl_destroy(&sa->sa_layout_num_tree);
+ mutex_destroy(&sa->sa_lock);
+
+ kmem_free(sa, sizeof (sa_os_t));
+ os->os_sa = NULL;
+}
+
+static void
+sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t var_length, void *userp)
+{
+ sa_idx_tab_t *idx_tab = userp;
+
+ if (var_length) {
+ ASSERT(idx_tab->sa_variable_lengths);
+ idx_tab->sa_variable_lengths[length_idx] = length;
+ }
+ TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
+ (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
+}
+
+static void
+sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
+ sa_iterfunc_t func, sa_lot_t *tab, void *userp)
+{
+ void *data_start;
+ sa_lot_t *tb = tab;
+ sa_lot_t search;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ int i;
+ uint16_t *length_start = NULL;
+ uint8_t length_idx = 0;
+
+ if (tab == NULL) {
+ search.lot_num = SA_LAYOUT_NUM(hdr, type);
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+ ASSERT(tb);
+ }
+
+ if (IS_SA_BONUSTYPE(type)) {
+ data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
+ offsetof(sa_hdr_phys_t, sa_lengths) +
+ (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
+ length_start = hdr->sa_lengths;
+ } else {
+ data_start = hdr;
+ }
+
+ for (i = 0; i != tb->lot_attr_count; i++) {
+ int attr_length, reg_length;
+ uint8_t idx_len;
+
+ reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
+ if (reg_length) {
+ attr_length = reg_length;
+ idx_len = 0;
+ } else {
+ attr_length = length_start[length_idx];
+ idx_len = length_idx++;
+ }
+
+ func(hdr, data_start, tb->lot_attrs[i], attr_length,
+ idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
+
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ attr_length), 8);
+ }
+}
+
+/*ARGSUSED*/
+static void
+sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t variable_length, void *userp)
+{
+ sa_handle_t *hdl = userp;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
+}
+
+static void
+sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+ dmu_buf_impl_t *db;
+ int num_lengths = 1;
+ int i;
+ sa_os_t *sa __maybe_unused = hdl->sa_os->os_sa;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ if (sa_hdr_phys->sa_magic == SA_MAGIC)
+ return;
+
+ db = SA_GET_DB(hdl, buftype);
+
+ if (buftype == SA_SPILL) {
+ arc_release(db->db_buf, NULL);
+ arc_buf_thaw(db->db_buf);
+ }
+
+ sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
+ sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
+
+ /*
+	 * Determine the number of variable lengths in the header.
+	 * The standard 8 byte header has one for free, and a
+	 * 16 byte header would have 4 + 1.
+ */
+ if (SA_HDR_SIZE(sa_hdr_phys) > 8)
+ num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
+ for (i = 0; i != num_lengths; i++)
+ sa_hdr_phys->sa_lengths[i] =
+ BSWAP_16(sa_hdr_phys->sa_lengths[i]);
+
+ sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
+ sa_byteswap_cb, NULL, hdl);
+
+ if (buftype == SA_SPILL)
+ arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
+}
+
+static int
+sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys;
+ dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
+ dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_idx_tab_t *idx_tab;
+
+ sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+
+ mutex_enter(&sa->sa_lock);
+
+ /* Do we need to byteswap? */
+
+ /* only check if not old znode */
+ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
+ sa_hdr_phys->sa_magic != 0) {
+ if (BSWAP_32(sa_hdr_phys->sa_magic) != SA_MAGIC) {
+ mutex_exit(&sa->sa_lock);
+ zfs_dbgmsg("Buffer Header: %x != SA_MAGIC:%x "
+ "object=%#llx\n", sa_hdr_phys->sa_magic, SA_MAGIC,
+ db->db.db_object);
+ return (SET_ERROR(EIO));
+ }
+ sa_byteswap(hdl, buftype);
+ }
+
+ idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
+
+ if (buftype == SA_BONUS)
+ hdl->sa_bonus_tab = idx_tab;
+ else
+ hdl->sa_spill_tab = idx_tab;
+
+ mutex_exit(&sa->sa_lock);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_evict_sync(void *dbu)
+{
+ panic("evicting sa dbuf\n");
+}
+
+static void
+sa_idx_tab_rele(objset_t *os, void *arg)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_idx_tab_t *idx_tab = arg;
+
+ if (idx_tab == NULL)
+ return;
+
+ mutex_enter(&sa->sa_lock);
+ if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
+ list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
+ if (idx_tab->sa_variable_lengths)
+ kmem_free(idx_tab->sa_variable_lengths,
+ sizeof (uint16_t) *
+ idx_tab->sa_layout->lot_var_sizes);
+ zfs_refcount_destroy(&idx_tab->sa_refcount);
+ kmem_free(idx_tab->sa_idx_tab,
+ sizeof (uint32_t) * sa->sa_num_attrs);
+ kmem_free(idx_tab, sizeof (sa_idx_tab_t));
+ }
+ mutex_exit(&sa->sa_lock);
+}
+
+static void
+sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
+{
+ sa_os_t *sa __maybe_unused = os->os_sa;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL);
+}
+
+void
+sa_spill_rele(sa_handle_t *hdl)
+{
+ mutex_enter(&hdl->sa_lock);
+ if (hdl->sa_spill) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ hdl->sa_spill = NULL;
+ hdl->sa_spill_tab = NULL;
+ }
+ mutex_exit(&hdl->sa_lock);
+}
+
+void
+sa_handle_destroy(sa_handle_t *hdl)
+{
+ dmu_buf_t *db = hdl->sa_bonus;
+
+ mutex_enter(&hdl->sa_lock);
+ (void) dmu_buf_remove_user(db, &hdl->sa_dbu);
+
+ if (hdl->sa_bonus_tab)
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+
+ if (hdl->sa_spill_tab)
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+
+ dmu_buf_rele(hdl->sa_bonus, NULL);
+
+ if (hdl->sa_spill)
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ mutex_exit(&hdl->sa_lock);
+
+ kmem_cache_free(sa_cache, hdl);
+}
+
+int
+sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ int error = 0;
+ sa_handle_t *handle = NULL;
+#ifdef ZFS_DEBUG
+ dmu_object_info_t doi;
+
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
+ doi.doi_bonus_type == DMU_OT_ZNODE);
+#endif
+ /* find handle, if it exists */
+ /* if one doesn't exist then create a new one, and initialize it */
+
+ if (hdl_type == SA_HDL_SHARED)
+ handle = dmu_buf_get_user(db);
+
+ if (handle == NULL) {
+ sa_handle_t *winner = NULL;
+
+ handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
+ handle->sa_dbu.dbu_evict_func_sync = NULL;
+ handle->sa_dbu.dbu_evict_func_async = NULL;
+ handle->sa_userp = userp;
+ handle->sa_bonus = db;
+ handle->sa_os = os;
+ handle->sa_spill = NULL;
+ handle->sa_bonus_tab = NULL;
+ handle->sa_spill_tab = NULL;
+
+ error = sa_build_index(handle, SA_BONUS);
+
+ if (hdl_type == SA_HDL_SHARED) {
+ dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL,
+ NULL);
+ winner = dmu_buf_set_user_ie(db, &handle->sa_dbu);
+ }
+
+ if (winner != NULL) {
+ kmem_cache_free(sa_cache, handle);
+ handle = winner;
+ }
+ }
+ *handlepp = handle;
+
+ return (error);
+}
+
+int
+sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ dmu_buf_t *db;
+ int error;
+
+ if ((error = dmu_bonus_hold(objset, objid, NULL, &db)))
+ return (error);
+
+ return (sa_handle_get_from_db(objset, db, userp, hdl_type,
+ handlepp));
+}
+
+int
+sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
+{
+ return (dmu_bonus_hold(objset, obj_num, tag, db));
+}
+
+void
+sa_buf_rele(dmu_buf_t *db, void *tag)
+{
+ dmu_buf_rele(db, tag);
+}
+
+static int
+sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
+}
+
+static int
+sa_lookup_locked(sa_handle_t *hdl, sa_attr_type_t attr, void *buf,
+ uint32_t buflen)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN);
+
+ bulk.sa_attr = attr;
+ bulk.sa_data = buf;
+ bulk.sa_length = buflen;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ error = sa_lookup_impl(hdl, &bulk, 1);
+ return (error);
+}
+
+int
+sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_lookup_locked(hdl, attr, buf, buflen);
+ mutex_exit(&hdl->sa_lock);
+
+ return (error);
+}
+
+#ifdef _KERNEL
+int
+sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, zfs_uio_t *uio)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+
+ mutex_enter(&hdl->sa_lock);
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
+ error = zfs_uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
+ zfs_uio_resid(uio)), UIO_READ, uio);
+ }
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+/*
+ * For an existing object that was upgraded from an old system, the on-disk
+ * layout has no slot for the project ID attribute. But the quota accounting
+ * logic needs to access the related slots by offset directly, so we need to
+ * adjust these old objects' layouts to place the project ID at a unified and
+ * fixed offset.
+ */
+int
+sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid)
+{
+ znode_t *zp = sa_get_userdata(hdl);
+ dmu_buf_t *db = sa_get_db(hdl);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int count = 0, err = 0;
+ sa_bulk_attr_t *bulk, *attrs;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t uid, gid, mode, rdev, xattr = 0, parent, gen, links;
+ uint64_t crtime[2], mtime[2], ctime[2], atime[2];
+ zfs_acl_phys_t znode_acl = { 0 };
+ char scanstamp[AV_SCANSTAMP_SZ];
+
+ if (zp->z_acl_cached == NULL) {
+ zfs_acl_t *aclp;
+
+ mutex_enter(&zp->z_acl_lock);
+ err = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ mutex_exit(&zp->z_acl_lock);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ }
+
+ bulk = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+ attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+ mutex_enter(&hdl->sa_lock);
+ mutex_enter(&zp->z_lock);
+
+ err = sa_lookup_locked(hdl, SA_ZPL_PROJID(zfsvfs), &projid,
+ sizeof (uint64_t));
+ if (unlikely(err == 0))
+ /* Someone has added project ID attr by race. */
+ err = EEXIST;
+ if (err != ENOENT)
+ goto out;
+
+ /* First do a bulk query of the attributes that aren't cached */
+ if (zp->z_is_sa) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp)))
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+ } else {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL,
+ &xattr, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &znode_acl, 88);
+ }
+ err = sa_bulk_lookup_locked(hdl, bulk, count);
+ if (err != 0)
+ goto out;
+
+ err = sa_lookup_locked(hdl, SA_ZPL_XATTR(zfsvfs), &xattr, 8);
+ if (err != 0 && err != ENOENT)
+ goto out;
+
+ zp->z_projid = projid;
+ zp->z_pflags |= ZFS_PROJID;
+ links = ZTONLNK(zp);
+ count = 0;
+ err = 0;
+
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8);
+
+ if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp)))
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+
+ if (zp->z_acl_cached != NULL) {
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &zp->z_acl_cached->z_acl_count, 8);
+ if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
+ zfs_acl_xform(zp, zp->z_acl_cached, CRED());
+ locate.cb_aclp = zp->z_acl_cached;
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ zp->z_acl_cached->z_acl_bytes);
+ }
+
+ if (xattr)
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_XATTR(zfsvfs), NULL,
+ &xattr, 8);
+
+ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
+ bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ scanstamp, AV_SCANSTAMP_SZ);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL,
+ scanstamp, AV_SCANSTAMP_SZ);
+ zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
+ }
+
+ VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
+ VERIFY(sa_replace_all_by_template_locked(hdl, attrs, count, tx) == 0);
+ if (znode_acl.z_acl_extern_obj) {
+ VERIFY(0 == dmu_object_free(zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, tx));
+ }
+
+ zp->z_is_sa = B_TRUE;
+
+out:
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&hdl->sa_lock);
+ kmem_free(attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END);
+ return (err);
+}
+#endif
+
+static sa_idx_tab_t *
+sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr)
+{
+ sa_idx_tab_t *idx_tab;
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, search;
+ avl_index_t loc;
+
+ /*
+	 * Determine the layout number. If SA node and header == 0 then
+ * force the index table to the dummy "1" empty layout.
+ *
+ * The layout number would only be zero for a newly created file
+ * that has not added any attributes yet, or with crypto enabled which
+ * doesn't write any attributes to the bonus buffer.
+ */
+
+ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
+
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+
+ /* Verify header size is consistent with layout information */
+ ASSERT(tb);
+ ASSERT((IS_SA_BONUSTYPE(bonustype) &&
+ SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb)) || !IS_SA_BONUSTYPE(bonustype) ||
+ (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
+
+ /*
+	 * See if any of the already existing TOC entries can be reused.
+ */
+
+ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
+ idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
+ boolean_t valid_idx = B_TRUE;
+ int i;
+
+ if (tb->lot_var_sizes != 0 &&
+ idx_tab->sa_variable_lengths != NULL) {
+ for (i = 0; i != tb->lot_var_sizes; i++) {
+ if (hdr->sa_lengths[i] !=
+ idx_tab->sa_variable_lengths[i]) {
+ valid_idx = B_FALSE;
+ break;
+ }
+ }
+ }
+ if (valid_idx) {
+ sa_idx_tab_hold(os, idx_tab);
+ return (idx_tab);
+ }
+ }
+
+ /* No such luck, create a new entry */
+ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
+ idx_tab->sa_idx_tab =
+ kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
+ idx_tab->sa_layout = tb;
+ zfs_refcount_create(&idx_tab->sa_refcount);
+ if (tb->lot_var_sizes)
+ idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
+ tb->lot_var_sizes, KM_SLEEP);
+
+ sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
+ tb, idx_tab);
+ sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */
+ sa_idx_tab_hold(os, idx_tab); /* one for layout */
+ list_insert_tail(&tb->lot_idx_tab, idx_tab);
+ return (idx_tab);
+}
+
+void
+sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
+ boolean_t start, void *userdata)
+{
+ ASSERT(start);
+
+ *dataptr = userdata;
+ *len = total_len;
+}
+
+static void
+sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ uint64_t attr_value = 0;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_attr_table_t *tb = sa->sa_attr_table;
+ int i;
+
+ mutex_enter(&sa->sa_lock);
+
+ if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
+ mutex_exit(&sa->sa_lock);
+ return;
+ }
+
+ if (sa->sa_reg_attr_obj == 0) {
+ sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
+ DMU_OT_SA_ATTR_REGISTRATION,
+ sa->sa_master_obj, SA_REGISTRY, tx);
+ }
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_registered)
+ continue;
+ ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
+ tb[i].sa_byteswap);
+ VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
+ tb[i].sa_name, 8, 1, &attr_value, tx));
+ tb[i].sa_registered = B_TRUE;
+ }
+ sa->sa_need_attr_registration = B_FALSE;
+ mutex_exit(&sa->sa_lock);
+}
+
+/*
+ * Replace all attributes with the attributes specified in the template.
+ * If the dnode had a spill buffer, then those attributes will also be
+ * replaced, possibly with just an empty spill block.
+ *
+ * This interface is intended to be used only for the bulk adding of
+ * attributes for a new file. It is also used by the ZPL when converting
+ * an old-format znode to native SA support.
+ */
+int
+sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+ return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
+}
+
+int
+sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_replace_all_by_template_locked(hdl, attr_desc,
+ attr_count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
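
As a minimal sketch of the bulk-template interface described in the comment above, the snippet below shows how a caller might fill a sa_bulk_attr_t array for a new object and pass it to sa_replace_all_by_template() inside an assigned transaction. The ZPL attribute macros (SA_ZPL_MODE, SA_ZPL_SIZE) and the zfsvfs handle are assumptions borrowed from the ZPL, not part of this patch.

    /*
     * Illustrative only: assumes ZPL attribute macros and a zfsvfs_t handle
     * from the caller; tx must already be assigned and hold the SA for
     * this object.
     */
    static int
    example_write_initial_attrs(sa_handle_t *hdl, zfsvfs_t *zfsvfs,
        uint64_t *mode, uint64_t *size, dmu_tx_t *tx)
    {
            sa_bulk_attr_t attrs[2];
            int count = 0;

            SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MODE(zfsvfs), NULL, mode, 8);
            SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, size, 8);

            /* Registers any unregistered attrs, then rebuilds the layout. */
            return (sa_replace_all_by_template(hdl, attrs, count, tx));
    }
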
+
+/*
+ * Add/remove a single attribute or replace a variable-sized attribute value
+ * with a value of a different size, and then rewrite the entire set
+ * of attributes.
+ * Same-length attribute value replacement (including fixed-length attributes)
+ * is handled more efficiently by the upper layers.
+ */
+static int
+sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
+ sa_bulk_attr_t *attr_desc;
+ void *old_data[2];
+ int bonus_attr_count = 0;
+ int bonus_data_size = 0;
+ int spill_data_size = 0;
+ int spill_attr_count = 0;
+ int error;
+ uint16_t length, reg_length;
+ int i, j, k, length_idx;
+ sa_hdr_phys_t *hdr;
+ sa_idx_tab_t *idx_tab;
+ int attr_count;
+ int count;
+
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+	/* First make a copy of the old data */
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_bonuslen != 0) {
+ bonus_data_size = hdl->sa_bonus->db_size;
+ old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
+ bcopy(hdl->sa_bonus->db_data, old_data[0],
+ hdl->sa_bonus->db_size);
+ bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
+ } else {
+ old_data[0] = NULL;
+ }
+ DB_DNODE_EXIT(db);
+
+ /* Bring spill buffer online if it isn't currently */
+
+ if ((error = sa_get_spill(hdl)) == 0) {
+ spill_data_size = hdl->sa_spill->db_size;
+ old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP);
+ bcopy(hdl->sa_spill->db_data, old_data[1],
+ hdl->sa_spill->db_size);
+ spill_attr_count =
+ hdl->sa_spill_tab->sa_layout->lot_attr_count;
+ } else if (error && error != ENOENT) {
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ return (error);
+ } else {
+ old_data[1] = NULL;
+ }
+
+ /* build descriptor of all attributes */
+
+ attr_count = bonus_attr_count + spill_attr_count;
+ if (action == SA_ADD)
+ attr_count++;
+ else if (action == SA_REMOVE)
+ attr_count--;
+
+ attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
+
+ /*
+	 * Loop through the bonus buffer and the spill buffer (if it exists),
+	 * and build up a new attribute descriptor to reset the attributes.
+ */
+ k = j = 0;
+ count = bonus_attr_count;
+ hdr = SA_GET_HDR(hdl, SA_BONUS);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
+ for (; k != 2; k++) {
+ /*
+ * Iterate over each attribute in layout. Fetch the
+ * size of variable-length attributes needing rewrite
+ * from sa_lengths[].
+ */
+ for (i = 0, length_idx = 0; i != count; i++) {
+ sa_attr_type_t attr;
+
+ attr = idx_tab->sa_layout->lot_attrs[i];
+ reg_length = SA_REGISTERED_LEN(sa, attr);
+ if (reg_length == 0) {
+ length = hdr->sa_lengths[length_idx];
+ length_idx++;
+ } else {
+ length = reg_length;
+ }
+ if (attr == newattr) {
+ /*
+ * There is nothing to do for SA_REMOVE,
+ * so it is just skipped.
+ */
+ if (action == SA_REMOVE)
+ continue;
+
+ /*
+ * Duplicate attributes are not allowed, so the
+ * action can not be SA_ADD here.
+ */
+ ASSERT3S(action, ==, SA_REPLACE);
+
+ /*
+ * Only a variable-sized attribute can be
+ * replaced here, and its size must be changing.
+ */
+ ASSERT3U(reg_length, ==, 0);
+ ASSERT3U(length, !=, buflen);
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ locator, datastart, buflen);
+ } else {
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ NULL, (void *)
+ (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
+ (uintptr_t)old_data[k]), length);
+ }
+ }
+ if (k == 0 && hdl->sa_spill) {
+ hdr = SA_GET_HDR(hdl, SA_SPILL);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
+ count = spill_attr_count;
+ } else {
+ break;
+ }
+ }
+ if (action == SA_ADD) {
+ reg_length = SA_REGISTERED_LEN(sa, newattr);
+ IMPLY(reg_length != 0, reg_length == buflen);
+ SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
+ datastart, buflen);
+ }
+ ASSERT3U(j, ==, attr_count);
+
+ error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
+
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ if (old_data[1])
+ vmem_free(old_data[1], spill_data_size);
+ kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
+
+ return (error);
+}
+
+static int
+sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ dmu_tx_t *tx)
+{
+ int error;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_object_type_t bonustype;
+ dmu_buf_t *saved_spill;
+
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+ bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
+ saved_spill = hdl->sa_spill;
+
+ /* sync out registration table if necessary */
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+
+ error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
+ if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
+ sa->sa_update_cb(hdl, tx);
+
+ /*
+	 * If saved_spill is NULL and the current sa_spill is not NULL, that
+ * means we increased the refcount of the spill buffer through
+ * sa_get_spill() or dmu_spill_hold_by_dnode(). Therefore we
+ * must release the hold before calling dmu_tx_commit() to avoid
+ * making a copy of this buffer in dbuf_sync_leaf() due to the
+ * reference count now being greater than 1.
+ */
+ if (!saved_spill && hdl->sa_spill) {
+ if (hdl->sa_spill_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ hdl->sa_spill_tab = NULL;
+ }
+
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ hdl->sa_spill = NULL;
+ }
+
+ return (error);
+}
+
+/*
+ * update or add new attribute
+ */
+int
+sa_update(sa_handle_t *hdl, sa_attr_type_t type,
+ void *buf, uint32_t buflen, dmu_tx_t *tx)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN);
+
+ bulk.sa_attr = type;
+ bulk.sa_data_func = NULL;
+ bulk.sa_length = buflen;
+ bulk.sa_data = buf;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
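
A minimal usage sketch for sa_update(): the caller is expected to have assigned a transaction that holds the SA for this object (via dmu_tx_hold_sa()) before updating a fixed-size attribute. The SA_ZPL_ATIME macro and zfsvfs handle are again assumptions for illustration.

    static int
    example_set_atime(sa_handle_t *hdl, zfsvfs_t *zfsvfs, uint64_t atime[2],
        dmu_tx_t *tx)
    {
            /* tx must already hold the SA for this object. */
            return (sa_update(hdl, SA_ZPL_ATIME(zfsvfs), atime,
                sizeof (uint64_t) * 2, tx));
    }
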
+
+/*
+ * Return size of an attribute
+ */
+
+int
+sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
+{
+ sa_bulk_attr_t bulk;
+ int error;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+ }
+ *size = bulk.sa_size;
+
+ mutex_exit(&hdl->sa_lock);
+ return (0);
+}
+
+int
+sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_lookup_impl(hdl, attrs, count));
+}
+
+int
+sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_lookup_locked(hdl, attrs, count);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
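
The bulk lookup path fetches several attributes under a single handle lock. A hedged sketch, once more assuming ZPL-style attribute macros:

    static int
    example_bulk_lookup(sa_handle_t *hdl, zfsvfs_t *zfsvfs,
        uint64_t *mode, uint64_t *size, uint64_t *links)
    {
            sa_bulk_attr_t bulk[3];
            int count = 0;

            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, mode, 8);
            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, size, 8);
            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, links, 8);

            /* Takes and drops hdl->sa_lock internally. */
            return (sa_bulk_lookup(hdl, bulk, count));
    }
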
+
+int
+sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, attrs, count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
+ NULL, 0, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+void
+sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
+{
+ dmu_object_info_from_db(hdl->sa_bonus, doi);
+}
+
+void
+sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
+{
+ dmu_object_size_from_db(hdl->sa_bonus,
+ blksize, nblocks);
+}
+
+void
+sa_set_userp(sa_handle_t *hdl, void *ptr)
+{
+ hdl->sa_userp = ptr;
+}
+
+dmu_buf_t *
+sa_get_db(sa_handle_t *hdl)
+{
+ return (hdl->sa_bonus);
+}
+
+void *
+sa_get_userdata(sa_handle_t *hdl)
+{
+ return (hdl->sa_userp);
+}
+
+void
+sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
+{
+ ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
+ os->os_sa->sa_update_cb = func;
+}
+
+void
+sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
+{
+
+ mutex_enter(&os->os_sa->sa_lock);
+ sa_register_update_callback_locked(os, func);
+ mutex_exit(&os->os_sa->sa_lock);
+}
+
+uint64_t
+sa_handle_object(sa_handle_t *hdl)
+{
+ return (hdl->sa_bonus->db_object);
+}
+
+boolean_t
+sa_enabled(objset_t *os)
+{
+ return (os->os_sa == NULL);
+}
+
+int
+sa_set_sa_object(objset_t *os, uint64_t sa_object)
+{
+ sa_os_t *sa = os->os_sa;
+
+ if (sa->sa_master_obj)
+ return (1);
+
+ sa->sa_master_obj = sa_object;
+
+ return (0);
+}
+
+int
+sa_hdrsize(void *arg)
+{
+ sa_hdr_phys_t *hdr = arg;
+
+ return (SA_HDR_SIZE(hdr));
+}
+
+void
+sa_handle_lock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+}
+
+void
+sa_handle_unlock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_exit(&hdl->sa_lock);
+}
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(sa_handle_get);
+EXPORT_SYMBOL(sa_handle_get_from_db);
+EXPORT_SYMBOL(sa_handle_destroy);
+EXPORT_SYMBOL(sa_buf_hold);
+EXPORT_SYMBOL(sa_buf_rele);
+EXPORT_SYMBOL(sa_spill_rele);
+EXPORT_SYMBOL(sa_lookup);
+EXPORT_SYMBOL(sa_update);
+EXPORT_SYMBOL(sa_remove);
+EXPORT_SYMBOL(sa_bulk_lookup);
+EXPORT_SYMBOL(sa_bulk_lookup_locked);
+EXPORT_SYMBOL(sa_bulk_update);
+EXPORT_SYMBOL(sa_size);
+EXPORT_SYMBOL(sa_object_info);
+EXPORT_SYMBOL(sa_object_size);
+EXPORT_SYMBOL(sa_get_userdata);
+EXPORT_SYMBOL(sa_set_userp);
+EXPORT_SYMBOL(sa_get_db);
+EXPORT_SYMBOL(sa_handle_object);
+EXPORT_SYMBOL(sa_register_update_callback);
+EXPORT_SYMBOL(sa_setup);
+EXPORT_SYMBOL(sa_replace_all_by_template);
+EXPORT_SYMBOL(sa_replace_all_by_template_locked);
+EXPORT_SYMBOL(sa_enabled);
+EXPORT_SYMBOL(sa_cache_init);
+EXPORT_SYMBOL(sa_cache_fini);
+EXPORT_SYMBOL(sa_set_sa_object);
+EXPORT_SYMBOL(sa_hdrsize);
+EXPORT_SYMBOL(sa_handle_lock);
+EXPORT_SYMBOL(sa_handle_unlock);
+EXPORT_SYMBOL(sa_lookup_uio);
+EXPORT_SYMBOL(sa_add_projid);
+#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/module/zfs/sha256.c b/sys/contrib/openzfs/module/zfs/sha256.c
new file mode 100644
index 000000000000..d297768eada5
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/sha256.c
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/sha2.h>
+#include <sys/abd.h>
+#include <sys/qat.h>
+
+static int
+sha_incremental(void *buf, size_t size, void *arg)
+{
+ SHA2_CTX *ctx = arg;
+ SHA2Update(ctx, buf, size);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA256(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ int ret;
+ SHA2_CTX ctx;
+ zio_cksum_t tmp;
+
+ if (qat_checksum_use_accel(size)) {
+ uint8_t *buf = abd_borrow_buf_copy(abd, size);
+ ret = qat_checksum(ZIO_CHECKSUM_SHA256, buf, size, &tmp);
+ abd_return_buf(abd, buf, size);
+ if (ret == CPA_STATUS_SUCCESS)
+ goto bswap;
+
+		/* If the hardware implementation fails, fall back to software */
+ }
+
+ SHA2Init(SHA256, &ctx);
+ (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
+ SHA2Final(&tmp, &ctx);
+
+bswap:
+ /*
+	 * A prior implementation of this function used a private SHA256
+	 * implementation that always wrote things out in big endian, and
+	 * there was no byteswap variant of it. To preserve on-disk
+	 * compatibility we need to force that behavior.
+ */
+ zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
+}
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ SHA2_CTX ctx;
+
+ SHA2Init(SHA512_256, &ctx);
+ (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
+ SHA2Final(zcp, &ctx);
+}
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
diff --git a/sys/contrib/openzfs/module/zfs/skein_zfs.c b/sys/contrib/openzfs/module/zfs/skein_zfs.c
new file mode 100644
index 000000000000..11b9940e027e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/skein_zfs.c
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/skein.h>
+
+#include <sys/abd.h>
+
+static int
+skein_incremental(void *buf, size_t size, void *arg)
+{
+ Skein_512_Ctxt_t *ctx = arg;
+ (void) Skein_512_Update(ctx, buf, size);
+ return (0);
+}
+/*
+ * Computes a native 256-bit skein MAC checksum. Please note that this
+ * function requires the presence of a ctx_template that should be allocated
+ * using abd_checksum_skein_tmpl_init.
+ */
+/*ARGSUSED*/
+void
+abd_checksum_skein_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ Skein_512_Ctxt_t ctx;
+
+ ASSERT(ctx_template != NULL);
+ bcopy(ctx_template, &ctx, sizeof (ctx));
+ (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx);
+ (void) Skein_512_Final(&ctx, (uint8_t *)zcp);
+ bzero(&ctx, sizeof (ctx));
+}
+
+/*
+ * Byteswapped version of abd_checksum_skein_native. This just invokes
+ * the native checksum function and byteswaps the resulting checksum (since
+ * skein is internally endian-insensitive).
+ */
+void
+abd_checksum_skein_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ abd_checksum_skein_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+/*
+ * Allocates a skein MAC template suitable for use in skein MAC checksum
+ * computations and returns a pointer to it.
+ */
+void *
+abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
+{
+ Skein_512_Ctxt_t *ctx;
+
+ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0,
+ salt->zcs_bytes, sizeof (salt->zcs_bytes));
+ return (ctx);
+}
+
+/*
+ * Frees a skein context template previously allocated using
+ * abd_checksum_skein_tmpl_init.
+ */
+void
+abd_checksum_skein_tmpl_free(void *ctx_template)
+{
+ Skein_512_Ctxt_t *ctx = ctx_template;
+
+ bzero(ctx, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+}
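
To make the template contract concrete, here is a minimal sketch of the expected lifecycle using only the functions defined in this file: initialize a template from the pool's checksum salt, pass it as ctx_template to the native checksum routine, and free it when done. The salt and abd arguments are assumed to be supplied by the caller.

    static void
    example_skein_checksum(const zio_cksum_salt_t *salt, abd_t *abd,
        uint64_t size, zio_cksum_t *zcp)
    {
            void *tmpl = abd_checksum_skein_tmpl_init(salt);

            abd_checksum_skein_native(abd, size, tmpl, zcp);
            abd_checksum_skein_tmpl_free(tmpl);
    }
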
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
new file mode 100644
index 000000000000..5170c9ca226f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -0,0 +1,9885 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
+ * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ */
+
+/*
+ * SPA: Storage Pool Allocator
+ *
+ * This file contains all the routines used when modifying on-disk SPA state.
+ * This includes opening, importing, destroying, exporting a pool, and syncing a
+ * pool.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/ddt.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_draid.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/mmp.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/callb.h>
+#include <sys/systeminfo.h>
+#include <sys/spa_boot.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_destroy.h>
+#include <sys/zvol.h>
+
+#ifdef _KERNEL
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/callb.h>
+#include <sys/zone.h>
+#include <sys/vmsystm.h>
+#endif /* _KERNEL */
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/*
+ * The interval, in seconds, at which failed configuration cache file writes
+ * should be retried.
+ */
+int zfs_ccw_retry_interval = 300;
+
+typedef enum zti_modes {
+ ZTI_MODE_FIXED, /* value is # of threads (min 1) */
+ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
+ ZTI_MODE_NULL, /* don't create a taskq */
+ ZTI_NMODES
+} zti_modes_t;
+
+#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
+#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
+#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
+#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
+
+#define ZTI_N(n) ZTI_P(n, 1)
+#define ZTI_ONE ZTI_N(1)
+
+typedef struct zio_taskq_info {
+ zti_modes_t zti_mode;
+ uint_t zti_value;
+ uint_t zti_count;
+} zio_taskq_info_t;
+
+static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
+ "iss", "iss_h", "int", "int_h"
+};
+
+/*
+ * This table defines the taskq settings for each ZFS I/O type. When
+ * initializing a pool, we use this table to create an appropriately sized
+ * taskq. Some operations are low volume and therefore have a small, static
+ * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
+ * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macro causes us to create a taskq oriented for throughput. Some operations
+ * are so high frequency and short-lived that the taskq itself can become a
+ * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
+ * additional degree of parallelism specified by the number of threads per-
+ * taskq and the number of taskqs; when dispatching an event in this case, the
+ * particular taskq is chosen at random.
+ *
+ * The different taskq priorities are to handle the different contexts (issue
+ * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
+ * need to be handled with minimum delay.
+ */
+const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+ /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
+ { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */
+ { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */
+ { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
+ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */
+};
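
To make the table easier to read, here is one row worked through the macros above; this annotation is illustrative and not part of the patch.

    /*
     * Worked example for the READ row:
     *   ISSUE: ZTI_N(8)     -> { ZTI_MODE_FIXED, 8, 1 }: one taskq, 8 threads
     *   INTR:  ZTI_P(12, 8) -> { ZTI_MODE_FIXED, 12, 8 }: 8 taskqs of 12
     *                          threads each, one chosen at random per dispatch
     */
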
+
+static void spa_sync_version(void *arg, dmu_tx_t *tx);
+static void spa_sync_props(void *arg, dmu_tx_t *tx);
+static boolean_t spa_has_active_shared_spare(spa_t *spa);
+static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
+static void spa_vdev_resilver_done(spa_t *spa);
+
+uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
+boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
+uint_t zio_taskq_basedc = 80; /* base duty cycle */
+
+boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
+
+/*
+ * Report any spa_load_verify errors found, but do not fail spa_load.
+ * This is used by zdb to analyze non-idle pools.
+ */
+boolean_t spa_load_verify_dryrun = B_FALSE;
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define TRYIMPORT_NAME "$import"
+
+/*
+ * For debugging purposes: print out vdev tree during pool import.
+ */
+int spa_load_print_vdev_tree = B_FALSE;
+
+/*
+ * A non-zero value for zfs_max_missing_tvds means that we allow importing
+ * pools with missing top-level vdevs. This is strictly intended for advanced
+ * pool recovery cases since missing data is almost inevitable. Pools with
+ * missing devices can only be imported read-only for safety reasons, and their
+ * fail-mode will be automatically set to "continue".
+ *
+ * With 1 missing vdev we should be able to import the pool and mount all
+ * datasets. User data that was not modified after the missing device has been
+ * added should be recoverable. This means that snapshots created prior to the
+ * addition of that device should be completely intact.
+ *
+ * With 2 missing vdevs, some datasets may fail to mount since there are
+ * dataset statistics that are stored as regular metadata. Some data might be
+ * recoverable if those vdevs were added recently.
+ *
+ * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
+ * may be missing entirely. Chances of data recovery are very low. Note that
+ * there are also risks of performing an inadvertent rewind as we might be
+ * missing all the vdevs with the latest uberblocks.
+ */
+unsigned long zfs_max_missing_tvds = 0;
+
+/*
+ * The parameters below are similar to zfs_max_missing_tvds but are only
+ * intended for a preliminary open of the pool with an untrusted config which
+ * might be incomplete or out-dated.
+ *
+ * We are more tolerant for pools opened from a cachefile since we could have
+ * an out-dated cachefile where a device removal was not registered.
+ * We could have set the limit arbitrarily high but in the case where devices
+ * are really missing we would want to return the proper error codes; we chose
+ * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
+ * and we get a chance to retrieve the trusted config.
+ */
+uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
+
+/*
+ * In the case where config was assembled by scanning device paths (/dev/dsks
+ * by default) we are less tolerant since all the existing devices should have
+ * been detected and we want spa_load to return the right error codes.
+ */
+uint64_t zfs_max_missing_tvds_scan = 0;
+
+/*
+ * Debugging aid that pauses spa_sync() towards the end.
+ */
+boolean_t zfs_pause_spa_sync = B_FALSE;
+
+/*
+ * Variables to indicate that the livelist condense zthr function should wait
+ * at certain points for the livelist to be removed; used to test
+ * condense/destroy races.
+ */
+int zfs_livelist_condense_zthr_pause = 0;
+int zfs_livelist_condense_sync_pause = 0;
+
+/*
+ * Variables to track whether or not condense cancellation has been
+ * triggered in testing.
+ */
+int zfs_livelist_condense_sync_cancel = 0;
+int zfs_livelist_condense_zthr_cancel = 0;
+
+/*
+ * Variable to track whether or not extra ALLOC blkptrs were added to a
+ * livelist entry while it was being condensed (caused by the way we track
+ * remapped blkptrs in dbuf_remap_impl)
+ */
+int zfs_livelist_condense_new_alloc = 0;
+
+/*
+ * ==========================================================================
+ * SPA properties routines
+ * ==========================================================================
+ */
+
+/*
+ * Add a (source=src, propname=propval) list to an nvlist.
+ */
+static void
+spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
+ uint64_t intval, zprop_source_t src)
+{
+ const char *propname = zpool_prop_to_name(prop);
+ nvlist_t *propval;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
+
+ if (strval != NULL)
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
+ else
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
+
+ VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
+ nvlist_free(propval);
+}
+
+/*
+ * Get property values from the spa configuration.
+ */
+static void
+spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ dsl_pool_t *pool = spa->spa_dsl_pool;
+ uint64_t size, alloc, cap, version;
+ const zprop_source_t src = ZPROP_SRC_NONE;
+ spa_config_dirent_t *dp;
+ metaslab_class_t *mc = spa_normal_class(spa);
+
+ ASSERT(MUTEX_HELD(&spa->spa_props_lock));
+
+ if (rvd != NULL) {
+ alloc = metaslab_class_get_alloc(mc);
+ alloc += metaslab_class_get_alloc(spa_special_class(spa));
+ alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+ alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
+
+ size = metaslab_class_get_space(mc);
+ size += metaslab_class_get_space(spa_special_class(spa));
+ size += metaslab_class_get_space(spa_dedup_class(spa));
+ size += metaslab_class_get_space(spa_embedded_log_class(spa));
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
+ size - alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
+ spa->spa_checkpoint_info.sci_dspace, src);
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
+ metaslab_class_fragmentation(mc), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
+ metaslab_class_expandable_space(mc), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
+ (spa_mode(spa) == SPA_MODE_READ), src);
+
+ cap = (size == 0) ? 0 : (alloc * 100 / size);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
+ ddt_get_pool_dedup_ratio(spa), src);
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
+ rvd->vdev_state, src);
+
+ version = spa_version(spa);
+ if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
+ version, ZPROP_SRC_DEFAULT);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
+ version, ZPROP_SRC_LOCAL);
+ }
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID,
+ NULL, spa_load_guid(spa), src);
+ }
+
+ if (pool != NULL) {
+ /*
+		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
+		 * when opening pools created before this version, freedir will
+		 * be NULL.
+ */
+ if (pool->dp_free_dir != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
+ dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
+ src);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
+ NULL, 0, src);
+ }
+
+ if (pool->dp_leak_dir != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
+ dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
+ src);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
+ NULL, 0, src);
+ }
+ }
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
+
+ if (spa->spa_comment != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
+ 0, ZPROP_SRC_LOCAL);
+ }
+
+ if (spa->spa_compatibility != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY,
+ spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
+ }
+
+ if (spa->spa_root != NULL)
+ spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
+ 0, ZPROP_SRC_LOCAL);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+ }
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MAX_SIZE, ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MIN_SIZE, ZPROP_SRC_NONE);
+ }
+
+ if ((dp = list_head(&spa->spa_config_list)) != NULL) {
+ if (dp->scd_path == NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
+ "none", 0, ZPROP_SRC_LOCAL);
+ } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
+ dp->scd_path, 0, ZPROP_SRC_LOCAL);
+ }
+ }
+}
+
+/*
+ * Get zpool property values.
+ */
+int
+spa_prop_get(spa_t *spa, nvlist_t **nvp)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ dsl_pool_t *dp;
+ int err;
+
+ err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP);
+ if (err)
+ return (err);
+
+ dp = spa_get_dsl(spa);
+ dsl_pool_config_enter(dp, FTAG);
+ mutex_enter(&spa->spa_props_lock);
+
+ /*
+ * Get properties from the spa config.
+ */
+ spa_prop_get_config(spa, nvp);
+
+	/* If there is no pool property object, there is nothing more to get. */
+ if (mos == NULL || spa->spa_pool_props_object == 0)
+ goto out;
+
+ /*
+ * Get properties from the MOS pool property object.
+ */
+ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t intval = 0;
+ char *strval = NULL;
+ zprop_source_t src = ZPROP_SRC_DEFAULT;
+ zpool_prop_t prop;
+
+ if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
+ continue;
+
+ switch (za.za_integer_length) {
+ case 8:
+ /* integer property */
+ if (za.za_first_integer !=
+ zpool_prop_default_numeric(prop))
+ src = ZPROP_SRC_LOCAL;
+
+ if (prop == ZPOOL_PROP_BOOTFS) {
+ dsl_dataset_t *ds = NULL;
+
+ err = dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &ds);
+ if (err != 0)
+ break;
+
+ strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
+ KM_SLEEP);
+ dsl_dataset_name(ds, strval);
+ dsl_dataset_rele(ds, FTAG);
+ } else {
+ strval = NULL;
+ intval = za.za_first_integer;
+ }
+
+ spa_prop_add_list(*nvp, prop, strval, intval, src);
+
+ if (strval != NULL)
+ kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
+
+ break;
+
+ case 1:
+ /* string property */
+ strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
+ err = zap_lookup(mos, spa->spa_pool_props_object,
+ za.za_name, 1, za.za_num_integers, strval);
+ if (err) {
+ kmem_free(strval, za.za_num_integers);
+ break;
+ }
+ spa_prop_add_list(*nvp, prop, strval, 0, src);
+ kmem_free(strval, za.za_num_integers);
+ break;
+
+ default:
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+out:
+ mutex_exit(&spa->spa_props_lock);
+ dsl_pool_config_exit(dp, FTAG);
+ if (err && err != ENOENT) {
+ nvlist_free(*nvp);
+ *nvp = NULL;
+ return (err);
+ }
+
+ return (0);
+}
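
As an illustration of the getter above, a caller typically lets spa_prop_get() allocate the nvlist, looks up the nested property it needs, and frees the list. Reading the GUID property is used here purely as a hedged example.

    static int
    example_read_pool_guid(spa_t *spa, uint64_t *guidp)
    {
            nvlist_t *props = NULL;
            nvlist_t *propval;
            int err;

            if ((err = spa_prop_get(spa, &props)) != 0)
                    return (err);

            /* Each property is a nested nvlist with a ZPROP_VALUE entry. */
            err = nvlist_lookup_nvlist(props,
                zpool_prop_to_name(ZPOOL_PROP_GUID), &propval);
            if (err == 0)
                    err = nvlist_lookup_uint64(propval, ZPROP_VALUE, guidp);

            nvlist_free(props);
            return (err);
    }
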
+
+/*
+ * Validate the given pool properties nvlist and modify the list
+ * for the property values to be set.
+ */
+static int
+spa_prop_validate(spa_t *spa, nvlist_t *props)
+{
+ nvpair_t *elem;
+ int error = 0, reset_bootfs = 0;
+ uint64_t objnum = 0;
+ boolean_t has_feature = B_FALSE;
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ uint64_t intval;
+ char *strval, *slash, *check, *fname;
+ const char *propname = nvpair_name(elem);
+ zpool_prop_t prop = zpool_name_to_prop(propname);
+
+ switch (prop) {
+ case ZPOOL_PROP_INVAL:
+ if (!zpool_prop_feature(propname)) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ /*
+ * Sanitize the input.
+ */
+ if (nvpair_type(elem) != DATA_TYPE_UINT64) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ if (intval != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ fname = strchr(propname, '@') + 1;
+ if (zfeature_lookup_name(fname, NULL) != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ has_feature = B_TRUE;
+ break;
+
+ case ZPOOL_PROP_VERSION:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error &&
+ (intval < spa_version(spa) ||
+ intval > SPA_VERSION_BEFORE_FEATURES ||
+ has_feature))
+ error = SET_ERROR(EINVAL);
+ break;
+
+ case ZPOOL_PROP_DELEGATION:
+ case ZPOOL_PROP_AUTOREPLACE:
+ case ZPOOL_PROP_LISTSNAPS:
+ case ZPOOL_PROP_AUTOEXPAND:
+ case ZPOOL_PROP_AUTOTRIM:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && intval > 1)
+ error = SET_ERROR(EINVAL);
+ break;
+
+ case ZPOOL_PROP_MULTIHOST:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && intval > 1)
+ error = SET_ERROR(EINVAL);
+
+ if (!error) {
+ uint32_t hostid = zone_get_hostid(NULL);
+ if (hostid)
+ spa->spa_hostid = hostid;
+ else
+ error = SET_ERROR(ENOTSUP);
+ }
+
+ break;
+
+ case ZPOOL_PROP_BOOTFS:
+ /*
+ * If the pool version is less than SPA_VERSION_BOOTFS,
+ * or the pool is still being created (version == 0),
+ * the bootfs property cannot be set.
+ */
+ if (spa_version(spa) < SPA_VERSION_BOOTFS) {
+ error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ /*
+ * Make sure the vdev config is bootable
+ */
+ if (!vdev_is_bootable(spa->spa_root_vdev)) {
+ error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ reset_bootfs = 1;
+
+ error = nvpair_value_string(elem, &strval);
+
+ if (!error) {
+ objset_t *os;
+
+ if (strval == NULL || strval[0] == '\0') {
+ objnum = zpool_prop_default_numeric(
+ ZPOOL_PROP_BOOTFS);
+ break;
+ }
+
+ error = dmu_objset_hold(strval, FTAG, &os);
+ if (error != 0)
+ break;
+
+ /* Must be ZPL. */
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ error = SET_ERROR(ENOTSUP);
+ } else {
+ objnum = dmu_objset_id(os);
+ }
+ dmu_objset_rele(os, FTAG);
+ }
+ break;
+
+ case ZPOOL_PROP_FAILUREMODE:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && intval > ZIO_FAILURE_MODE_PANIC)
+ error = SET_ERROR(EINVAL);
+
+ /*
+ * This is a special case which only occurs when
+ * the pool has completely failed. This allows
+ * the user to change the in-core failmode property
+ * without syncing it out to disk (I/Os might
+ * currently be blocked). We do this by returning
+ * EIO to the caller (spa_prop_set) to trick it
+ * into thinking we encountered a property validation
+ * error.
+ */
+ if (!error && spa_suspended(spa)) {
+ spa->spa_failmode = intval;
+ error = SET_ERROR(EIO);
+ }
+ break;
+
+ case ZPOOL_PROP_CACHEFILE:
+ if ((error = nvpair_value_string(elem, &strval)) != 0)
+ break;
+
+ if (strval[0] == '\0')
+ break;
+
+ if (strcmp(strval, "none") == 0)
+ break;
+
+ if (strval[0] != '/') {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ slash = strrchr(strval, '/');
+ ASSERT(slash != NULL);
+
+ if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
+ strcmp(slash, "/..") == 0)
+ error = SET_ERROR(EINVAL);
+ break;
+
+ case ZPOOL_PROP_COMMENT:
+ if ((error = nvpair_value_string(elem, &strval)) != 0)
+ break;
+ for (check = strval; *check != '\0'; check++) {
+ if (!isprint(*check)) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ }
+ if (strlen(strval) > ZPROP_MAX_COMMENT)
+ error = SET_ERROR(E2BIG);
+ break;
+
+ default:
+ break;
+ }
+
+ if (error)
+ break;
+ }
+
+ (void) nvlist_remove_all(props,
+ zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
+
+ if (!error && reset_bootfs) {
+ error = nvlist_remove(props,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
+
+ if (!error) {
+ error = nvlist_add_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
+ }
+ }
+
+ return (error);
+}
+
+void
+spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
+{
+ char *cachefile;
+ spa_config_dirent_t *dp;
+
+ if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
+ &cachefile) != 0)
+ return;
+
+ dp = kmem_alloc(sizeof (spa_config_dirent_t),
+ KM_SLEEP);
+
+ if (cachefile[0] == '\0')
+ dp->scd_path = spa_strdup(spa_config_path);
+ else if (strcmp(cachefile, "none") == 0)
+ dp->scd_path = NULL;
+ else
+ dp->scd_path = spa_strdup(cachefile);
+
+ list_insert_head(&spa->spa_config_list, dp);
+ if (need_sync)
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+}
+
+int
+spa_prop_set(spa_t *spa, nvlist_t *nvp)
+{
+ int error;
+ nvpair_t *elem = NULL;
+ boolean_t need_sync = B_FALSE;
+
+ if ((error = spa_prop_validate(spa, nvp)) != 0)
+ return (error);
+
+ while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
+ zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
+
+ if (prop == ZPOOL_PROP_CACHEFILE ||
+ prop == ZPOOL_PROP_ALTROOT ||
+ prop == ZPOOL_PROP_READONLY)
+ continue;
+
+ if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
+ uint64_t ver;
+
+ if (prop == ZPOOL_PROP_VERSION) {
+ VERIFY(nvpair_value_uint64(elem, &ver) == 0);
+ } else {
+ ASSERT(zpool_prop_feature(nvpair_name(elem)));
+ ver = SPA_VERSION_FEATURES;
+ need_sync = B_TRUE;
+ }
+
+ /* Save time if the version is already set. */
+ if (ver == spa_version(spa))
+ continue;
+
+ /*
+ * In addition to the pool directory object, we might
+ * create the pool properties object, the features for
+ * read object, the features for write object, or the
+ * feature descriptions object.
+ */
+ error = dsl_sync_task(spa->spa_name, NULL,
+ spa_sync_version, &ver,
+ 6, ZFS_SPACE_CHECK_RESERVED);
+ if (error)
+ return (error);
+ continue;
+ }
+
+ need_sync = B_TRUE;
+ break;
+ }
+
+ if (need_sync) {
+ return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
+ nvp, 6, ZFS_SPACE_CHECK_RESERVED));
+ }
+
+ return (0);
+}
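
A minimal sketch of driving the setter above: build an nvlist keyed by property name and hand it to spa_prop_set(). Setting the comment property is chosen only as an example.

    static int
    example_set_pool_comment(spa_t *spa, const char *comment)
    {
            nvlist_t *props;
            int err;

            VERIFY0(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP));
            VERIFY0(nvlist_add_string(props,
                zpool_prop_to_name(ZPOOL_PROP_COMMENT), comment));

            err = spa_prop_set(spa, props);
            nvlist_free(props);
            return (err);
    }
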
+
+/*
+ * If the bootfs property value is dsobj, clear it.
+ */
+void
+spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
+{
+ if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
+ VERIFY(zap_remove(spa->spa_meta_objset,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
+ spa->spa_bootfs = 0;
+ }
+}
+
+/*ARGSUSED*/
+static int
+spa_change_guid_check(void *arg, dmu_tx_t *tx)
+{
+ uint64_t *newguid __maybe_unused = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t vdev_state;
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ int error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (SET_ERROR(error));
+ }
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_state = rvd->vdev_state;
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ if (vdev_state != VDEV_STATE_HEALTHY)
+ return (SET_ERROR(ENXIO));
+
+ ASSERT3U(spa_guid(spa), !=, *newguid);
+
+ return (0);
+}
+
+static void
+spa_change_guid_sync(void *arg, dmu_tx_t *tx)
+{
+ uint64_t *newguid = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ uint64_t oldguid;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ oldguid = spa_guid(spa);
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ rvd->vdev_guid = *newguid;
+ rvd->vdev_guid_sum += (*newguid - oldguid);
+ vdev_config_dirty(rvd);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
+ (u_longlong_t)oldguid, (u_longlong_t)*newguid);
+}
+
+/*
+ * Change the GUID for the pool. This is done so that we can later
+ * re-import a pool built from a clone of our own vdevs. We will modify
+ * the root vdev's guid, our own pool guid, and then mark all of our
+ * vdevs dirty. Note that we must make sure that all our vdevs are
+ * online when we do this, or else any vdevs that weren't present
+ * would be orphaned from our pool. We are also going to issue a
+ * sysevent to update any watchers.
+ */
+int
+spa_change_guid(spa_t *spa)
+{
+ int error;
+ uint64_t guid;
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+ mutex_enter(&spa_namespace_lock);
+ guid = spa_generate_guid(NULL);
+
+ error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
+ spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
+
+ if (error == 0) {
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
+ }
+
+ mutex_exit(&spa_namespace_lock);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * SPA state manipulation (open/create/destroy/import/export)
+ * ==========================================================================
+ */
+
+static int
+spa_error_entry_compare(const void *a, const void *b)
+{
+ const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
+ const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
+ int ret;
+
+ ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
+ sizeof (zbookmark_phys_t));
+
+ return (TREE_ISIGN(ret));
+}
+
+/*
+ * Utility function which retrieves copies of the current logs and
+ * re-initializes them in the process.
+ */
+void
+spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
+{
+ ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
+
+ bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
+ bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+}
+
+static void
+spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
+{
+ const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+ enum zti_modes mode = ztip->zti_mode;
+ uint_t value = ztip->zti_value;
+ uint_t count = ztip->zti_count;
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ uint_t flags = 0;
+ boolean_t batch = B_FALSE;
+
+ if (mode == ZTI_MODE_NULL) {
+ tqs->stqs_count = 0;
+ tqs->stqs_taskq = NULL;
+ return;
+ }
+
+ ASSERT3U(count, >, 0);
+
+ tqs->stqs_count = count;
+ tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
+
+ switch (mode) {
+ case ZTI_MODE_FIXED:
+ ASSERT3U(value, >=, 1);
+ value = MAX(value, 1);
+ flags |= TASKQ_DYNAMIC;
+ break;
+
+ case ZTI_MODE_BATCH:
+ batch = B_TRUE;
+ flags |= TASKQ_THREADS_CPU_PCT;
+ value = MIN(zio_taskq_batch_pct, 100);
+ break;
+
+ default:
+ panic("unrecognized mode for %s_%s taskq (%u:%u) in "
+ "spa_activate()",
+ zio_type_name[t], zio_taskq_types[q], mode, value);
+ break;
+ }
+
+ for (uint_t i = 0; i < count; i++) {
+ taskq_t *tq;
+ char name[32];
+
+ (void) snprintf(name, sizeof (name), "%s_%s",
+ zio_type_name[t], zio_taskq_types[q]);
+
+ if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+ if (batch)
+ flags |= TASKQ_DC_BATCH;
+
+ tq = taskq_create_sysdc(name, value, 50, INT_MAX,
+ spa->spa_proc, zio_taskq_basedc, flags);
+ } else {
+ pri_t pri = maxclsyspri;
+ /*
+ * The write issue taskq can be extremely CPU
+			 * intensive. Run it at a slightly lower priority
+			 * than the other taskqs.
+ *
+ * Under Linux and FreeBSD this means incrementing
+ * the priority value as opposed to platforms like
+ * illumos where it should be decremented.
+ *
+ * On FreeBSD, if priorities divided by four (RQ_PPQ)
+ * are equal then a difference between them is
+ * insignificant.
+ */
+ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
+#if defined(__linux__)
+ pri++;
+#elif defined(__FreeBSD__)
+ pri += 4;
+#else
+#error "unknown OS"
+#endif
+ }
+ tq = taskq_create_proc(name, value, pri, 50,
+ INT_MAX, spa->spa_proc, flags);
+ }
+
+ tqs->stqs_taskq[i] = tq;
+ }
+}
+
+static void
+spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
+{
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+
+ if (tqs->stqs_taskq == NULL) {
+ ASSERT3U(tqs->stqs_count, ==, 0);
+ return;
+ }
+
+ for (uint_t i = 0; i < tqs->stqs_count; i++) {
+ ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
+ taskq_destroy(tqs->stqs_taskq[i]);
+ }
+
+ kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
+ tqs->stqs_taskq = NULL;
+}
+
+/*
+ * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
+ * Note that a type may have multiple discrete taskqs to avoid lock contention
+ * on the taskq itself. In that case we choose which taskq at random by using
+ * the low bits of gethrtime().
+ */
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+{
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ taskq_t *tq;
+
+ ASSERT3P(tqs->stqs_taskq, !=, NULL);
+ ASSERT3U(tqs->stqs_count, !=, 0);
+
+ if (tqs->stqs_count == 1) {
+ tq = tqs->stqs_taskq[0];
+ } else {
+ tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
+ }
+
+ taskq_dispatch_ent(tq, func, arg, flags, ent);
+}
+
+/*
+ * Same as spa_taskq_dispatch_ent() but block on the task until completion.
+ */
+void
+spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, void *arg, uint_t flags)
+{
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ taskq_t *tq;
+ taskqid_t id;
+
+ ASSERT3P(tqs->stqs_taskq, !=, NULL);
+ ASSERT3U(tqs->stqs_count, !=, 0);
+
+ if (tqs->stqs_count == 1) {
+ tq = tqs->stqs_taskq[0];
+ } else {
+ tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
+ }
+
+ id = taskq_dispatch(tq, func, arg, flags);
+ if (id)
+ taskq_wait_id(tq, id);
+}
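
For context, a hedged sketch of dispatching work synchronously to one of these taskqs; the task function and its argument are placeholders, not symbols from this patch.

    static void
    example_task_func(void *arg)
    {
            (void) arg;             /* per-pool work would go here */
    }

    static void
    example_dispatch(spa_t *spa, void *arg)
    {
            /* Runs example_task_func on a WRITE/ISSUE taskq and waits. */
            spa_taskq_dispatch_sync(spa, ZIO_TYPE_WRITE, ZIO_TASKQ_ISSUE,
                example_task_func, arg, TQ_SLEEP);
    }
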
+
+static void
+spa_create_zio_taskqs(spa_t *spa)
+{
+ for (int t = 0; t < ZIO_TYPES; t++) {
+ for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ spa_taskqs_init(spa, t, q);
+ }
+ }
+}
+
+/*
+ * Disabled until spa_thread() can be adapted for Linux.
+ */
+#undef HAVE_SPA_THREAD
+
+#if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
+static void
+spa_thread(void *arg)
+{
+ psetid_t zio_taskq_psrset_bind = PS_NONE;
+ callb_cpr_t cprinfo;
+
+ spa_t *spa = arg;
+ user_t *pu = PTOU(curproc);
+
+ CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
+ spa->spa_name);
+
+ ASSERT(curproc != &p0);
+ (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
+ "zpool-%s", spa->spa_name);
+ (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
+
+ /* bind this thread to the requested psrset */
+ if (zio_taskq_psrset_bind != PS_NONE) {
+ pool_lock();
+ mutex_enter(&cpu_lock);
+ mutex_enter(&pidlock);
+ mutex_enter(&curproc->p_lock);
+
+ if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
+ 0, NULL, NULL) == 0) {
+ curthread->t_bind_pset = zio_taskq_psrset_bind;
+ } else {
+ cmn_err(CE_WARN,
+ "Couldn't bind process for zfs pool \"%s\" to "
+ "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
+ }
+
+ mutex_exit(&curproc->p_lock);
+ mutex_exit(&pidlock);
+ mutex_exit(&cpu_lock);
+ pool_unlock();
+ }
+
+ if (zio_taskq_sysdc) {
+ sysdc_thread_enter(curthread, 100, 0);
+ }
+
+ spa->spa_proc = curproc;
+ spa->spa_did = curthread->t_did;
+
+ spa_create_zio_taskqs(spa);
+
+ mutex_enter(&spa->spa_proc_lock);
+ ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
+
+ spa->spa_proc_state = SPA_PROC_ACTIVE;
+ cv_broadcast(&spa->spa_proc_cv);
+
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ while (spa->spa_proc_state == SPA_PROC_ACTIVE)
+ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
+
+ ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
+ spa->spa_proc_state = SPA_PROC_GONE;
+ spa->spa_proc = &p0;
+ cv_broadcast(&spa->spa_proc_cv);
+ CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
+
+ mutex_enter(&curproc->p_lock);
+ lwp_exit();
+}
+#endif
+
+/*
+ * Activate an uninitialized pool.
+ */
+static void
+spa_activate(spa_t *spa, spa_mode_t mode)
+{
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_mode = mode;
+
+ spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_embedded_log_class =
+ metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
+
+ /* Try to create a covering process */
+ mutex_enter(&spa->spa_proc_lock);
+ ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
+ ASSERT(spa->spa_proc == &p0);
+ spa->spa_did = 0;
+
+#ifdef HAVE_SPA_THREAD
+ /* Only create a process if we're going to be around a while. */
+ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
+ if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
+ NULL, 0) == 0) {
+ spa->spa_proc_state = SPA_PROC_CREATED;
+ while (spa->spa_proc_state == SPA_PROC_CREATED) {
+ cv_wait(&spa->spa_proc_cv,
+ &spa->spa_proc_lock);
+ }
+ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+ ASSERT(spa->spa_proc != &p0);
+ ASSERT(spa->spa_did != 0);
+ } else {
+#ifdef _KERNEL
+ cmn_err(CE_WARN,
+ "Couldn't create process for zfs pool \"%s\"\n",
+ spa->spa_name);
+#endif
+ }
+ }
+#endif /* HAVE_SPA_THREAD */
+ mutex_exit(&spa->spa_proc_lock);
+
+ /* If we didn't create a process, we need to create our taskqs. */
+ if (spa->spa_proc == &p0) {
+ spa_create_zio_taskqs(spa);
+ }
+
+ for (size_t i = 0; i < TXG_SIZE; i++) {
+ spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
+
+ list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_config_dirty_node));
+ list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
+ offsetof(objset_t, os_evicting_node));
+ list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_state_dirty_node));
+
+ txg_list_create(&spa->spa_vdev_txg_list, spa,
+ offsetof(struct vdev, vdev_txg_node));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+
+ spa_keystore_init(&spa->spa_keystore);
+
+ /*
+ * This taskq is used to perform zvol-minor-related tasks
+ * asynchronously. This has several advantages, including easy
+ * resolution of various deadlocks.
+ *
+ * The taskq must be single threaded to ensure tasks are always
+ * processed in the order in which they were dispatched.
+ *
+ * A taskq per pool allows one to keep the pools independent.
+ * This way if one pool is suspended, it will not impact another.
+ *
+ * The preferred location to dispatch a zvol minor task is a sync
+ * task. In this context, there is easy access to the spa_t and minimal
+ * error handling is required because the sync task must succeed.
+ */
+ spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
+ 1, INT_MAX, 0);
+
+ /*
+ * Taskq dedicated to prefetcher threads: this is used to prevent the
+ * pool traverse code from monopolizing the global (and limited)
+ * system_taskq by inappropriately scheduling long running tasks on it.
+ */
+ spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
+ defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
+
+ /*
+ * The taskq to upgrade datasets in this pool. Currently used by
+ * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
+ */
+ spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
+ defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
+}
+
+/*
+ * Opposite of spa_activate().
+ */
+static void
+spa_deactivate(spa_t *spa)
+{
+ ASSERT(spa->spa_sync_on == B_FALSE);
+ ASSERT(spa->spa_dsl_pool == NULL);
+ ASSERT(spa->spa_root_vdev == NULL);
+ ASSERT(spa->spa_async_zio_root == NULL);
+ ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
+
+ spa_evicting_os_wait(spa);
+
+ if (spa->spa_zvol_taskq) {
+ taskq_destroy(spa->spa_zvol_taskq);
+ spa->spa_zvol_taskq = NULL;
+ }
+
+ if (spa->spa_prefetch_taskq) {
+ taskq_destroy(spa->spa_prefetch_taskq);
+ spa->spa_prefetch_taskq = NULL;
+ }
+
+ if (spa->spa_upgrade_taskq) {
+ taskq_destroy(spa->spa_upgrade_taskq);
+ spa->spa_upgrade_taskq = NULL;
+ }
+
+ txg_list_destroy(&spa->spa_vdev_txg_list);
+
+ list_destroy(&spa->spa_config_dirty_list);
+ list_destroy(&spa->spa_evicting_os_list);
+ list_destroy(&spa->spa_state_dirty_list);
+
+ taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+
+ for (int t = 0; t < ZIO_TYPES; t++) {
+ for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ spa_taskqs_fini(spa, t, q);
+ }
+ }
+
+ for (size_t i = 0; i < TXG_SIZE; i++) {
+ ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
+ VERIFY0(zio_wait(spa->spa_txg_zio[i]));
+ spa->spa_txg_zio[i] = NULL;
+ }
+
+ metaslab_class_destroy(spa->spa_normal_class);
+ spa->spa_normal_class = NULL;
+
+ metaslab_class_destroy(spa->spa_log_class);
+ spa->spa_log_class = NULL;
+
+ metaslab_class_destroy(spa->spa_embedded_log_class);
+ spa->spa_embedded_log_class = NULL;
+
+ metaslab_class_destroy(spa->spa_special_class);
+ spa->spa_special_class = NULL;
+
+ metaslab_class_destroy(spa->spa_dedup_class);
+ spa->spa_dedup_class = NULL;
+
+ /*
+ * If this was part of an import or the open otherwise failed, we may
+ * still have errors left in the queues. Empty them just in case.
+ */
+ spa_errlog_drain(spa);
+ avl_destroy(&spa->spa_errlist_scrub);
+ avl_destroy(&spa->spa_errlist_last);
+
+ spa_keystore_fini(&spa->spa_keystore);
+
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+
+ mutex_enter(&spa->spa_proc_lock);
+ if (spa->spa_proc_state != SPA_PROC_NONE) {
+ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+ spa->spa_proc_state = SPA_PROC_DEACTIVATE;
+ cv_broadcast(&spa->spa_proc_cv);
+ while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
+ ASSERT(spa->spa_proc != &p0);
+ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+ }
+ ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
+ spa->spa_proc_state = SPA_PROC_NONE;
+ }
+ ASSERT(spa->spa_proc == &p0);
+ mutex_exit(&spa->spa_proc_lock);
+
+ /*
+ * We want to make sure spa_thread() has actually exited the ZFS
+ * module, so that the module can't be unloaded out from underneath
+ * it.
+ */
+ if (spa->spa_did != 0) {
+ thread_join(spa->spa_did);
+ spa->spa_did = 0;
+ }
+}
+
+/*
+ * Verify a pool configuration, and construct the vdev tree appropriately. This
+ * will create all the necessary vdevs in the appropriate layout, with each vdev
+ * in the CLOSED state. This will prep the pool before open/creation/import.
+ * All vdev validation is done by the vdev_alloc() routine.
+ */
+int
+spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
+ uint_t id, int atype)
+{
+ nvlist_t **child;
+ uint_t children;
+ int error;
+
+ if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
+ return (error);
+
+ if ((*vdp)->vdev_ops->vdev_op_leaf)
+ return (0);
+
+ error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children);
+
+ if (error == ENOENT)
+ return (0);
+
+ if (error) {
+ vdev_free(*vdp);
+ *vdp = NULL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (int c = 0; c < children; c++) {
+ vdev_t *vd;
+ if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
+ atype)) != 0) {
+ vdev_free(*vdp);
+ *vdp = NULL;
+ return (error);
+ }
+ }
+
+ ASSERT(*vdp != NULL);
+
+ return (0);
+}
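+
+/*
+ * For illustration, the config nvlist parsed above mirrors the shape of
+ * the vdev tree it produces. A hypothetical two-way mirror would look
+ * roughly like:
+ *
+ *	type='root'
+ *	  children[0]: type='mirror'
+ *	    children[0]: type='disk', path='/dev/...'
+ *	    children[1]: type='disk', path='/dev/...'
+ *
+ * Leaf vdevs terminate the recursion; interior vdevs descend into their
+ * ZPOOL_CONFIG_CHILDREN array.
+ */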
+
+static boolean_t
+spa_should_flush_logs_on_unload(spa_t *spa)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return (B_FALSE);
+
+ if (!spa_writeable(spa))
+ return (B_FALSE);
+
+ if (!spa->spa_sync_on)
+ return (B_FALSE);
+
+ if (spa_state(spa) != POOL_STATE_EXPORTED)
+ return (B_FALSE);
+
+ if (zfs_keep_log_spacemaps_at_export)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Opens a transaction that sets the flag instructing spa_sync to
+ * attempt to flush all the metaslabs for that txg, then waits for
+ * that txg to be synced.
+ */
+static void
+spa_unload_log_sm_flush_all(spa_t *spa)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
+ spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
+
+ dmu_tx_commit(tx);
+ txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
+}
+
+static void
+spa_unload_log_sm_metadata(spa_t *spa)
+{
+ void *cookie = NULL;
+ spa_log_sm_t *sls;
+ while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
+ &cookie)) != NULL) {
+ VERIFY0(sls->sls_mscount);
+ kmem_free(sls, sizeof (spa_log_sm_t));
+ }
+
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e != NULL; e = list_head(&spa->spa_log_summary)) {
+ VERIFY0(e->lse_mscount);
+ list_remove(&spa->spa_log_summary, e);
+ kmem_free(e, sizeof (log_summary_entry_t));
+ }
+
+ spa->spa_unflushed_stats.sus_nblocks = 0;
+ spa->spa_unflushed_stats.sus_memused = 0;
+ spa->spa_unflushed_stats.sus_blocklimit = 0;
+}
+
+static void
+spa_destroy_aux_threads(spa_t *spa)
+{
+ if (spa->spa_condense_zthr != NULL) {
+ zthr_destroy(spa->spa_condense_zthr);
+ spa->spa_condense_zthr = NULL;
+ }
+ if (spa->spa_checkpoint_discard_zthr != NULL) {
+ zthr_destroy(spa->spa_checkpoint_discard_zthr);
+ spa->spa_checkpoint_discard_zthr = NULL;
+ }
+ if (spa->spa_livelist_delete_zthr != NULL) {
+ zthr_destroy(spa->spa_livelist_delete_zthr);
+ spa->spa_livelist_delete_zthr = NULL;
+ }
+ if (spa->spa_livelist_condense_zthr != NULL) {
+ zthr_destroy(spa->spa_livelist_condense_zthr);
+ spa->spa_livelist_condense_zthr = NULL;
+ }
+}
+
+/*
+ * Opposite of spa_load().
+ */
+static void
+spa_unload(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
+
+ spa_import_progress_remove(spa_guid(spa));
+ spa_load_note(spa, "UNLOADING");
+
+ spa_wake_waiters(spa);
+
+ /*
+ * If the log space map feature is enabled and the pool is getting
+ * exported (but not destroyed), we want to spend some time flushing
+ * as many metaslabs as we can in an attempt to destroy log space
+ * maps and save import time.
+ */
+ if (spa_should_flush_logs_on_unload(spa))
+ spa_unload_log_sm_flush_all(spa);
+
+ /*
+ * Stop async tasks.
+ */
+ spa_async_suspend(spa);
+
+ if (spa->spa_root_vdev) {
+ vdev_t *root_vdev = spa->spa_root_vdev;
+ vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
+ vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
+ vdev_autotrim_stop_all(spa);
+ vdev_rebuild_stop_all(spa);
+ }
+
+ /*
+ * Stop syncing.
+ */
+ if (spa->spa_sync_on) {
+ txg_sync_stop(spa->spa_dsl_pool);
+ spa->spa_sync_on = B_FALSE;
+ }
+
+ /*
+ * This ensures that there is no async metaslab prefetching
+ * while we attempt to unload the spa.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
+ vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
+ if (vc->vdev_mg != NULL)
+ taskq_wait(vc->vdev_mg->mg_taskq);
+ }
+ }
+
+ if (spa->spa_mmp.mmp_thread)
+ mmp_thread_stop(spa);
+
+ /*
+ * Wait for any outstanding async I/O to complete.
+ */
+ if (spa->spa_async_zio_root != NULL) {
+ for (int i = 0; i < max_ncpus; i++)
+ (void) zio_wait(spa->spa_async_zio_root[i]);
+ kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
+ spa->spa_async_zio_root = NULL;
+ }
+
+ if (spa->spa_vdev_removal != NULL) {
+ spa_vdev_removal_destroy(spa->spa_vdev_removal);
+ spa->spa_vdev_removal = NULL;
+ }
+
+ spa_destroy_aux_threads(spa);
+
+ spa_condense_fini(spa);
+
+ bpobj_close(&spa->spa_deferred_bpobj);
+
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+
+ /*
+ * Close all vdevs.
+ */
+ if (spa->spa_root_vdev)
+ vdev_free(spa->spa_root_vdev);
+ ASSERT(spa->spa_root_vdev == NULL);
+
+ /*
+ * Close the dsl pool.
+ */
+ if (spa->spa_dsl_pool) {
+ dsl_pool_close(spa->spa_dsl_pool);
+ spa->spa_dsl_pool = NULL;
+ spa->spa_meta_objset = NULL;
+ }
+
+ ddt_unload(spa);
+ spa_unload_log_sm_metadata(spa);
+
+ /*
+ * Drop and purge level 2 cache
+ */
+ spa_l2cache_drop(spa);
+
+ for (int i = 0; i < spa->spa_spares.sav_count; i++)
+ vdev_free(spa->spa_spares.sav_vdevs[i]);
+ if (spa->spa_spares.sav_vdevs) {
+ kmem_free(spa->spa_spares.sav_vdevs,
+ spa->spa_spares.sav_count * sizeof (void *));
+ spa->spa_spares.sav_vdevs = NULL;
+ }
+ if (spa->spa_spares.sav_config) {
+ nvlist_free(spa->spa_spares.sav_config);
+ spa->spa_spares.sav_config = NULL;
+ }
+ spa->spa_spares.sav_count = 0;
+
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
+ vdev_free(spa->spa_l2cache.sav_vdevs[i]);
+ }
+ if (spa->spa_l2cache.sav_vdevs) {
+ kmem_free(spa->spa_l2cache.sav_vdevs,
+ spa->spa_l2cache.sav_count * sizeof (void *));
+ spa->spa_l2cache.sav_vdevs = NULL;
+ }
+ if (spa->spa_l2cache.sav_config) {
+ nvlist_free(spa->spa_l2cache.sav_config);
+ spa->spa_l2cache.sav_config = NULL;
+ }
+ spa->spa_l2cache.sav_count = 0;
+
+ spa->spa_async_suspended = 0;
+
+ spa->spa_indirect_vdevs_loaded = B_FALSE;
+
+ if (spa->spa_comment != NULL) {
+ spa_strfree(spa->spa_comment);
+ spa->spa_comment = NULL;
+ }
+ if (spa->spa_compatibility != NULL) {
+ spa_strfree(spa->spa_compatibility);
+ spa->spa_compatibility = NULL;
+ }
+
+ spa_config_exit(spa, SCL_ALL, spa);
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active spares for
+ * this pool. When this is called, we have some form of basic information in
+ * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
+ * then re-generate a more complete list including status information.
+ */
+void
+spa_load_spares(spa_t *spa)
+{
+ nvlist_t **spares;
+ uint_t nspares;
+ int i;
+ vdev_t *vd, *tvd;
+
+#ifndef _KERNEL
+ /*
+ * zdb opens both the current state of the pool and the
+ * checkpointed state (if present), with a different spa_t.
+ *
+ * As spare vdevs are shared among open pools, we skip loading
+ * them when we load the checkpointed state of the pool.
+ */
+ if (!spa_writeable(spa))
+ return;
+#endif
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /*
+ * First, close and free any existing spare vdevs.
+ */
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ vd = spa->spa_spares.sav_vdevs[i];
+
+ /* Undo the call to spa_activate() below */
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
+ B_FALSE)) != NULL && tvd->vdev_isspare)
+ spa_spare_remove(tvd);
+ vdev_close(vd);
+ vdev_free(vd);
+ }
+
+ if (spa->spa_spares.sav_vdevs)
+ kmem_free(spa->spa_spares.sav_vdevs,
+ spa->spa_spares.sav_count * sizeof (void *));
+
+ if (spa->spa_spares.sav_config == NULL)
+ nspares = 0;
+ else
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+ spa->spa_spares.sav_count = (int)nspares;
+ spa->spa_spares.sav_vdevs = NULL;
+
+ if (nspares == 0)
+ return;
+
+ /*
+ * Construct the array of vdevs, opening them to get status in the
+ * process. For each spare, there are potentially two different vdev_t
+ * structures associated with it: one in the list of spares (used only
+ * for basic validation purposes) and one in the active vdev
+ * configuration (if it's spared in). During this phase we open and
+ * validate each vdev on the spare list. If the vdev also exists in the
+ * active configuration, then we also mark this vdev as an active spare.
+ */
+ spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
+ VDEV_ALLOC_SPARE) == 0);
+ ASSERT(vd != NULL);
+
+ spa->spa_spares.sav_vdevs[i] = vd;
+
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
+ B_FALSE)) != NULL) {
+ if (!tvd->vdev_isspare)
+ spa_spare_add(tvd);
+
+ /*
+ * We only mark the spare active if we were successfully
+ * able to load the vdev. Otherwise, importing a pool
+ * with a bad active spare would result in strange
+ * behavior, because multiple pools would think the spare
+ * is actively in use.
+ *
+ * There is a vulnerability here to an equally bizarre
+ * circumstance, where a dead active spare is later
+ * brought back to life (onlined or otherwise). Given
+ * the rarity of this scenario, and the extra complexity
+ * it adds, we ignore the possibility.
+ */
+ if (!vdev_is_dead(tvd))
+ spa_spare_activate(tvd);
+ }
+
+ vd->vdev_top = vd;
+ vd->vdev_aux = &spa->spa_spares;
+
+ if (vdev_open(vd) != 0)
+ continue;
+
+ if (vdev_validate_aux(vd) == 0)
+ spa_spare_add(vd);
+ }
+
+ /*
+ * Recompute the stashed list of spares, with status information
+ * this time.
+ */
+ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ spares[i] = vdev_config_generate(spa,
+ spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ nvlist_free(spares[i]);
+ kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active l2cache for
+ * this pool. When this is called, we have some form of basic information in
+ * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
+ * then re-generate a more complete list including status information.
+ * Devices which are already active have their details maintained, and are
+ * not re-opened.
+ */
+void
+spa_load_l2cache(spa_t *spa)
+{
+ nvlist_t **l2cache = NULL;
+ uint_t nl2cache;
+ int i, j, oldnvdevs;
+ uint64_t guid;
+ vdev_t *vd, **oldvdevs, **newvdevs;
+ spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+#ifndef _KERNEL
+ /*
+ * zdb opens both the current state of the pool and the
+ * checkpointed state (if present), with a different spa_t.
+ *
+ * As L2 caches are part of the ARC which is shared among open
+ * pools, we skip loading them when we load the checkpointed
+ * state of the pool.
+ */
+ if (!spa_writeable(spa))
+ return;
+#endif
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ oldvdevs = sav->sav_vdevs;
+ oldnvdevs = sav->sav_count;
+ sav->sav_vdevs = NULL;
+ sav->sav_count = 0;
+
+ if (sav->sav_config == NULL) {
+ nl2cache = 0;
+ newvdevs = NULL;
+ goto out;
+ }
+
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+ newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
+
+ /*
+ * Process new nvlist of vdevs.
+ */
+ for (i = 0; i < nl2cache; i++) {
+ VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
+ &guid) == 0);
+
+ newvdevs[i] = NULL;
+ for (j = 0; j < oldnvdevs; j++) {
+ vd = oldvdevs[j];
+ if (vd != NULL && guid == vd->vdev_guid) {
+ /*
+ * Retain previous vdev for add/remove ops.
+ */
+ newvdevs[i] = vd;
+ oldvdevs[j] = NULL;
+ break;
+ }
+ }
+
+ if (newvdevs[i] == NULL) {
+ /*
+ * Create new vdev
+ */
+ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
+ VDEV_ALLOC_L2CACHE) == 0);
+ ASSERT(vd != NULL);
+ newvdevs[i] = vd;
+
+ /*
+ * Commit this vdev as an l2cache device,
+ * even if it fails to open.
+ */
+ spa_l2cache_add(vd);
+
+ vd->vdev_top = vd;
+ vd->vdev_aux = sav;
+
+ spa_l2cache_activate(vd);
+
+ if (vdev_open(vd) != 0)
+ continue;
+
+ (void) vdev_validate_aux(vd);
+
+ if (!vdev_is_dead(vd))
+ l2arc_add_vdev(spa, vd);
+
+ /*
+ * When a cache device is added to a pool, when a pool
+ * is created with a cache device, or when the header
+ * of the device is invalid, we issue an async TRIM
+ * command for the whole device; it will only run if
+ * l2arc_trim_ahead > 0.
+ */
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
+ }
+ }
+
+ sav->sav_vdevs = newvdevs;
+ sav->sav_count = (int)nl2cache;
+
+ /*
+ * Recompute the stashed list of l2cache devices, with status
+ * information this time.
+ */
+ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ if (sav->sav_count > 0)
+ l2cache = kmem_alloc(sav->sav_count * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < sav->sav_count; i++)
+ l2cache[i] = vdev_config_generate(spa,
+ sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
+
+out:
+ /*
+ * Purge vdevs that were dropped
+ */
+ for (i = 0; i < oldnvdevs; i++) {
+ uint64_t pool;
+
+ vd = oldvdevs[i];
+ if (vd != NULL) {
+ ASSERT(vd->vdev_isl2cache);
+
+ if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL && l2arc_vdev_present(vd))
+ l2arc_remove_vdev(vd);
+ vdev_clear_stats(vd);
+ vdev_free(vd);
+ }
+ }
+
+ if (oldvdevs)
+ kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
+
+ for (i = 0; i < sav->sav_count; i++)
+ nvlist_free(l2cache[i]);
+ if (sav->sav_count)
+ kmem_free(l2cache, sav->sav_count * sizeof (void *));
+}
+
+static int
+load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
+{
+ dmu_buf_t *db;
+ char *packed = NULL;
+ size_t nvsize = 0;
+ int error;
+ *value = NULL;
+
+ error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
+ if (error)
+ return (error);
+
+ nvsize = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db, FTAG);
+
+ packed = vmem_alloc(nvsize, KM_SLEEP);
+ error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
+ DMU_READ_PREFETCH);
+ if (error == 0)
+ error = nvlist_unpack(packed, nvsize, value, 0);
+ vmem_free(packed, nvsize);
+
+ return (error);
+}
+
+/*
+ * Concrete top-level vdevs that are not missing and are not logs. At every
+ * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
+ */
+static uint64_t
+spa_healthy_core_tvds(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t tvds = 0;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+ if (vd->vdev_islog)
+ continue;
+ if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
+ tvds++;
+ }
+
+ return (tvds);
+}
+
+/*
+ * Checks to see if the given vdev could not be opened, in which case we post a
+ * sysevent to notify the autoreplace code that the device has been removed.
+ */
+static void
+spa_check_removed(vdev_t *vd)
+{
+ for (uint64_t c = 0; c < vd->vdev_children; c++)
+ spa_check_removed(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
+ vdev_is_concrete(vd)) {
+ zfs_post_autoreplace(vd->vdev_spa, vd);
+ spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
+ }
+}
+
+static int
+spa_check_for_missing_logs(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * If we're doing a normal import, then build up any additional
+ * diagnostic information about missing log devices.
+ * We'll pass this up to the user for further processing.
+ */
+ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
+ nvlist_t **child, *nv;
+ uint64_t idx = 0;
+
+ child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
+ KM_SLEEP);
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ /*
+ * We consider a device as missing only if it failed
+ * to open (i.e. offline or faulted devices are not
+ * considered missing).
+ */
+ if (tvd->vdev_islog &&
+ tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
+ child[idx++] = vdev_config_generate(spa, tvd,
+ B_FALSE, VDEV_CONFIG_MISSING);
+ }
+ }
+
+ if (idx > 0) {
+ fnvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child, idx);
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_MISSING_DEVICES, nv);
+
+ for (uint64_t i = 0; i < idx; i++)
+ nvlist_free(child[i]);
+ }
+ nvlist_free(nv);
+ kmem_free(child, rvd->vdev_children * sizeof (char **));
+
+ if (idx > 0) {
+ spa_load_failed(spa, "some log devices are missing");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ return (SET_ERROR(ENXIO));
+ }
+ } else {
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_islog &&
+ tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ spa_load_note(spa, "some log devices are "
+ "missing, ZIL is dropped.");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ break;
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Check for missing log devices
+ */
+static boolean_t
+spa_check_logs(spa_t *spa)
+{
+ boolean_t rv = B_FALSE;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ switch (spa->spa_log_state) {
+ default:
+ break;
+ case SPA_LOG_MISSING:
+ /* need to recheck in case slog has been restored */
+ case SPA_LOG_UNKNOWN:
+ rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
+ if (rv)
+ spa_set_log_state(spa, SPA_LOG_MISSING);
+ break;
+ }
+ return (rv);
+}
+
+/*
+ * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
+ */
+static boolean_t
+spa_passivate_log(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ boolean_t slog_found = B_FALSE;
+
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_islog) {
+ ASSERT3P(tvd->vdev_log_mg, ==, NULL);
+ metaslab_group_passivate(tvd->vdev_mg);
+ slog_found = B_TRUE;
+ }
+ }
+
+ return (slog_found);
+}
+
+/*
+ * Activate any log vdevs (note, does not apply to embedded log metaslabs).
+ */
+static void
+spa_activate_log(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_islog) {
+ ASSERT3P(tvd->vdev_log_mg, ==, NULL);
+ metaslab_group_activate(tvd->vdev_mg);
+ }
+ }
+}
+
+int
+spa_reset_logs(spa_t *spa)
+{
+ int error;
+
+ error = dmu_objset_find(spa_name(spa), zil_reset,
+ NULL, DS_FIND_CHILDREN);
+ if (error == 0) {
+ /*
+ * We successfully offlined the log device, sync out the
+ * current txg so that the "stubby" block can be removed
+ * by zil_sync().
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ }
+ return (error);
+}
+
+static void
+spa_aux_check_removed(spa_aux_vdev_t *sav)
+{
+ for (int i = 0; i < sav->sav_count; i++)
+ spa_check_removed(sav->sav_vdevs[i]);
+}
+
+void
+spa_claim_notify(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ if (zio->io_error)
+ return;
+
+ mutex_enter(&spa->spa_props_lock); /* any mutex will do */
+ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
+ spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+ mutex_exit(&spa->spa_props_lock);
+}
+
+typedef struct spa_load_error {
+ uint64_t sle_meta_count;
+ uint64_t sle_data_count;
+} spa_load_error_t;
+
+static void
+spa_load_verify_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ spa_load_error_t *sle = zio->io_private;
+ dmu_object_type_t type = BP_GET_TYPE(bp);
+ int error = zio->io_error;
+ spa_t *spa = zio->io_spa;
+
+ abd_free(zio->io_abd);
+ if (error) {
+ if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
+ type != DMU_OT_INTENT_LOG)
+ atomic_inc_64(&sle->sle_meta_count);
+ else
+ atomic_inc_64(&sle->sle_data_count);
+ }
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * The maximum number of inflight bytes is a power-of-two fraction of the
+ * ARC size (arc_target_bytes() >> spa_load_verify_shift); by default,
+ * 1/16th of the ARC.
+ */
+int spa_load_verify_shift = 4;
+int spa_load_verify_metadata = B_TRUE;
+int spa_load_verify_data = B_TRUE;
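+
+/*
+ * For example, assuming a hypothetical ARC target of 32 GiB, the default
+ * shift of 4 caps inflight verification I/O at 32 GiB >> 4 = 2 GiB;
+ * raising the shift to 5 would halve that limit to 1 GiB.
+ */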
+
+/*ARGSUSED*/
+static int
+spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
+ BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
+ return (0);
+ /*
+ * Note: normally this routine will not be called if
+ * spa_load_verify_metadata is not set. However, it may be useful
+ * to manually set the flag after the traversal has begun.
+ */
+ if (!spa_load_verify_metadata)
+ return (0);
+ if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
+ return (0);
+
+ uint64_t maxinflight_bytes =
+ arc_target_bytes() >> spa_load_verify_shift;
+ zio_t *rio = arg;
+ size_t size = BP_GET_PSIZE(bp);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_load_verify_bytes >= maxinflight_bytes)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_load_verify_bytes += size;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
+ spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ return (0);
+}
+
+static int
+spa_load_verify(spa_t *spa)
+{
+ zio_t *rio;
+ spa_load_error_t sle = { 0 };
+ zpool_load_policy_t policy;
+ boolean_t verify_ok = B_FALSE;
+ int error = 0;
+
+ zpool_get_load_policy(spa->spa_config, &policy);
+
+ if (policy.zlp_rewind & ZPOOL_NEVER_REWIND)
+ return (0);
+
+ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
+ error = dmu_objset_find_dp(spa->spa_dsl_pool,
+ spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
+ DS_FIND_CHILDREN);
+ dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+ if (error != 0)
+ return (error);
+
+ rio = zio_root(spa, NULL, &sle,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+ if (spa_load_verify_metadata) {
+ if (spa->spa_extreme_rewind) {
+ spa_load_note(spa, "performing a complete scan of the "
+ "pool since extreme rewind is on. This may take "
+ "a very long time.\n (spa_load_verify_data=%u, "
+ "spa_load_verify_metadata=%u)",
+ spa_load_verify_data, spa_load_verify_metadata);
+ }
+
+ error = traverse_pool(spa, spa->spa_verify_min_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+ TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
+ }
+
+ (void) zio_wait(rio);
+ ASSERT0(spa->spa_load_verify_bytes);
+
+ spa->spa_load_meta_errors = sle.sle_meta_count;
+ spa->spa_load_data_errors = sle.sle_data_count;
+
+ if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
+ spa_load_note(spa, "spa_load_verify found %llu metadata errors "
+ "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
+ (u_longlong_t)sle.sle_data_count);
+ }
+
+ if (spa_load_verify_dryrun ||
+ (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
+ sle.sle_data_count <= policy.zlp_maxdata)) {
+ int64_t loss = 0;
+
+ verify_ok = B_TRUE;
+ spa->spa_load_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
+ VERIFY(nvlist_add_int64(spa->spa_load_info,
+ ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
+ } else {
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
+ }
+
+ if (spa_load_verify_dryrun)
+ return (0);
+
+ if (error) {
+ if (error != ENXIO && error != EIO)
+ error = SET_ERROR(EIO);
+ return (error);
+ }
+
+ return (verify_ok ? 0 : EIO);
+}
+
+/*
+ * Find a value in the pool props object.
+ */
+static void
+spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
+{
+ (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
+ zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
+}
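+
+/*
+ * Callers typically pass the address of a cached spa_t field, e.g.
+ * (illustratively) spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs).
+ * A missing property leaves *val untouched, since the zap_lookup() error
+ * is deliberately ignored.
+ */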
+
+/*
+ * Find a value in the pool directory object.
+ */
+static int
+spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
+{
+ int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ name, sizeof (uint64_t), 1, val);
+
+ if (error != 0 && (error != ENOENT || log_enoent)) {
+ spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
+ "[error=%d]", name, error);
+ }
+
+ return (error);
+}
+
+static int
+spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
+{
+ vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
+ return (SET_ERROR(err));
+}
+
+boolean_t
+spa_livelist_delete_check(spa_t *spa)
+{
+ return (spa->spa_livelists_to_delete != 0);
+}
+
+/* ARGSUSED */
+static boolean_t
+spa_livelist_delete_cb_check(void *arg, zthr_t *z)
+{
+ spa_t *spa = arg;
+ return (spa_livelist_delete_check(spa));
+}
+
+static int
+delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ spa_t *spa = arg;
+ zio_free(spa, tx->tx_txg, bp);
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+ -bp_get_dsize_sync(spa, bp),
+ -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+ return (0);
+}
+
+static int
+dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
+{
+ int err;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zap_cursor_init(&zc, os, zap_obj);
+ err = zap_cursor_retrieve(&zc, &za);
+ zap_cursor_fini(&zc);
+ if (err == 0)
+ *llp = za.za_first_integer;
+ return (err);
+}
+
+/*
+ * Components of livelist deletion that must be performed in syncing
+ * context: freeing block pointers and updating the pool-wide data
+ * structures to indicate how much work is left to do.
+ */
+typedef struct sublist_delete_arg {
+ spa_t *spa;
+ dsl_deadlist_t *ll;
+ uint64_t key;
+ bplist_t *to_free;
+} sublist_delete_arg_t;
+
+static void
+sublist_delete_sync(void *arg, dmu_tx_t *tx)
+{
+ sublist_delete_arg_t *sda = arg;
+ spa_t *spa = sda->spa;
+ dsl_deadlist_t *ll = sda->ll;
+ uint64_t key = sda->key;
+ bplist_t *to_free = sda->to_free;
+
+ bplist_iterate(to_free, delete_blkptr_cb, spa, tx);
+ dsl_deadlist_remove_entry(ll, key, tx);
+}
+
+typedef struct livelist_delete_arg {
+ spa_t *spa;
+ uint64_t ll_obj;
+ uint64_t zap_obj;
+} livelist_delete_arg_t;
+
+static void
+livelist_delete_sync(void *arg, dmu_tx_t *tx)
+{
+ livelist_delete_arg_t *lda = arg;
+ spa_t *spa = lda->spa;
+ uint64_t ll_obj = lda->ll_obj;
+ uint64_t zap_obj = lda->zap_obj;
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t count;
+
+ /* free the livelist and decrement the feature count */
+ VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx));
+ dsl_deadlist_free(mos, ll_obj, tx);
+ spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
+ VERIFY0(zap_count(mos, zap_obj, &count));
+ if (count == 0) {
+ /* no more livelists to delete */
+ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DELETED_CLONES, tx));
+ VERIFY0(zap_destroy(mos, zap_obj, tx));
+ spa->spa_livelists_to_delete = 0;
+ spa_notify_waiters(spa);
+ }
+}
+
+/*
+ * Load in the value for the livelist to be removed and open it. Then,
+ * load its first sublist and determine which block pointers should actually
+ * be freed. Then, call a synctask which performs the actual frees and updates
+ * the pool-wide livelist data.
+ */
+/* ARGSUSED */
+static void
+spa_livelist_delete_cb(void *arg, zthr_t *z)
+{
+ spa_t *spa = arg;
+ uint64_t ll_obj = 0, count;
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t zap_obj = spa->spa_livelists_to_delete;
+ /*
+ * Determine the next livelist to delete. This function should only
+ * be called if there is at least one deleted clone.
+ */
+ VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
+ VERIFY0(zap_count(mos, ll_obj, &count));
+ if (count > 0) {
+ dsl_deadlist_t *ll;
+ dsl_deadlist_entry_t *dle;
+ bplist_t to_free;
+ ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
+ dsl_deadlist_open(ll, mos, ll_obj);
+ dle = dsl_deadlist_first(ll);
+ ASSERT3P(dle, !=, NULL);
+ bplist_create(&to_free);
+ int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
+ z, NULL);
+ if (err == 0) {
+ sublist_delete_arg_t sync_arg = {
+ .spa = spa,
+ .ll = ll,
+ .key = dle->dle_mintxg,
+ .to_free = &to_free
+ };
+ zfs_dbgmsg("deleting sublist (id %llu) from"
+ " livelist %llu, %d remaining",
+ dle->dle_bpobj.bpo_object, ll_obj, count - 1);
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+ sublist_delete_sync, &sync_arg, 0,
+ ZFS_SPACE_CHECK_DESTROY));
+ } else {
+ VERIFY3U(err, ==, EINTR);
+ }
+ bplist_clear(&to_free);
+ bplist_destroy(&to_free);
+ dsl_deadlist_close(ll);
+ kmem_free(ll, sizeof (dsl_deadlist_t));
+ } else {
+ livelist_delete_arg_t sync_arg = {
+ .spa = spa,
+ .ll_obj = ll_obj,
+ .zap_obj = zap_obj
+ };
+ zfs_dbgmsg("deletion of livelist %llu completed", ll_obj);
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
+ &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
+ }
+}
+
+static void
+spa_start_livelist_destroy_thread(spa_t *spa)
+{
+ ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
+ spa->spa_livelist_delete_zthr =
+ zthr_create("z_livelist_destroy",
+ spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa);
+}
+
+typedef struct livelist_new_arg {
+ bplist_t *allocs;
+ bplist_t *frees;
+} livelist_new_arg_t;
+
+static int
+livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ ASSERT(tx == NULL);
+ livelist_new_arg_t *lna = arg;
+ if (bp_freed) {
+ bplist_append(lna->frees, bp);
+ } else {
+ bplist_append(lna->allocs, bp);
+ zfs_livelist_condense_new_alloc++;
+ }
+ return (0);
+}
+
+typedef struct livelist_condense_arg {
+ spa_t *spa;
+ bplist_t to_keep;
+ uint64_t first_size;
+ uint64_t next_size;
+} livelist_condense_arg_t;
+
+static void
+spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
+{
+ livelist_condense_arg_t *lca = arg;
+ spa_t *spa = lca->spa;
+ bplist_t new_frees;
+ dsl_dataset_t *ds = spa->spa_to_condense.ds;
+
+ /* Have we been cancelled? */
+ if (spa->spa_to_condense.cancelled) {
+ zfs_livelist_condense_sync_cancel++;
+ goto out;
+ }
+
+ dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
+ dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
+ dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
+
+ /*
+ * It's possible that the livelist was changed while the zthr was
+ * running. Therefore, we need to check for new blkptrs in the two
+ * entries being condensed and continue to track them in the livelist.
+ * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
+ * it's possible that the newly added blkptrs are FREEs or ALLOCs so
+ * we need to sort them into two different bplists.
+ */
+ uint64_t first_obj = first->dle_bpobj.bpo_object;
+ uint64_t next_obj = next->dle_bpobj.bpo_object;
+ uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+ uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+
+ bplist_create(&new_frees);
+ livelist_new_arg_t new_bps = {
+ .allocs = &lca->to_keep,
+ .frees = &new_frees,
+ };
+
+ if (cur_first_size > lca->first_size) {
+ VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
+ livelist_track_new_cb, &new_bps, lca->first_size));
+ }
+ if (cur_next_size > lca->next_size) {
+ VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
+ livelist_track_new_cb, &new_bps, lca->next_size));
+ }
+
+ dsl_deadlist_clear_entry(first, ll, tx);
+ ASSERT(bpobj_is_empty(&first->dle_bpobj));
+ dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
+
+ bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
+ bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
+ bplist_destroy(&new_frees);
+
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
+ "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
+ "(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj,
+ cur_first_size, next_obj, cur_next_size,
+ first->dle_bpobj.bpo_object,
+ first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
+out:
+ dmu_buf_rele(ds->ds_dbuf, spa);
+ spa->spa_to_condense.ds = NULL;
+ bplist_clear(&lca->to_keep);
+ bplist_destroy(&lca->to_keep);
+ kmem_free(lca, sizeof (livelist_condense_arg_t));
+ spa->spa_to_condense.syncing = B_FALSE;
+}
+
+static void
+spa_livelist_condense_cb(void *arg, zthr_t *t)
+{
+ while (zfs_livelist_condense_zthr_pause &&
+ !(zthr_has_waiters(t) || zthr_iscancelled(t)))
+ delay(1);
+
+ spa_t *spa = arg;
+ dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
+ dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
+ uint64_t first_size, next_size;
+
+ livelist_condense_arg_t *lca =
+ kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
+ bplist_create(&lca->to_keep);
+
+ /*
+ * Process the livelists (matching FREEs and ALLOCs) in open context
+ * so we have minimal work in syncing context to condense.
+ *
+ * We save bpobj sizes (first_size and next_size) to use later in
+ * syncing context to determine if entries were added to these sublists
+ * while in open context. This is possible because the clone is still
+ * active and open for normal writes and we want to make sure the new,
+ * unprocessed blockpointers are inserted into the livelist normally.
+ *
+ * Note that dsl_process_sub_livelist() both records the number of
+ * block pointers and iterates over them while the bpobj's lock is
+ * held, so the sizes returned to us are consistent with what was
+ * actually processed.
+ */
+ int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
+ &first_size);
+ if (err == 0)
+ err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
+ t, &next_size);
+
+ if (err == 0) {
+ while (zfs_livelist_condense_sync_pause &&
+ !(zthr_has_waiters(t) || zthr_iscancelled(t)))
+ delay(1);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ dmu_tx_mark_netfree(tx);
+ dmu_tx_hold_space(tx, 1);
+ err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
+ if (err == 0) {
+ /*
+ * Prevent the condense zthr from restarting before
+ * the synctask completes.
+ */
+ spa->spa_to_condense.syncing = B_TRUE;
+ lca->spa = spa;
+ lca->first_size = first_size;
+ lca->next_size = next_size;
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ spa_livelist_condense_sync, lca, tx);
+ dmu_tx_commit(tx);
+ return;
+ }
+ }
+ /*
+ * Condensing cannot continue: either it was externally stopped or
+ * we were unable to assign to a tx because the pool has run out of
+ * space. In the second case, we'll just end up trying to condense
+ * again in a later txg.
+ */
+ ASSERT(err != 0);
+ bplist_clear(&lca->to_keep);
+ bplist_destroy(&lca->to_keep);
+ kmem_free(lca, sizeof (livelist_condense_arg_t));
+ dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
+ spa->spa_to_condense.ds = NULL;
+ if (err == EINTR)
+ zfs_livelist_condense_zthr_cancel++;
+}
+
+/* ARGSUSED */
+/*
+ * Check that there is something to condense but that a condense is not
+ * already in progress and that condensing has not been cancelled.
+ */
+static boolean_t
+spa_livelist_condense_cb_check(void *arg, zthr_t *z)
+{
+ spa_t *spa = arg;
+ if ((spa->spa_to_condense.ds != NULL) &&
+ (spa->spa_to_condense.syncing == B_FALSE) &&
+ (spa->spa_to_condense.cancelled == B_FALSE)) {
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static void
+spa_start_livelist_condensing_thread(spa_t *spa)
+{
+ spa->spa_to_condense.ds = NULL;
+ spa->spa_to_condense.first = NULL;
+ spa->spa_to_condense.next = NULL;
+ spa->spa_to_condense.syncing = B_FALSE;
+ spa->spa_to_condense.cancelled = B_FALSE;
+
+ ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
+ spa->spa_livelist_condense_zthr =
+ zthr_create("z_livelist_condense",
+ spa_livelist_condense_cb_check,
+ spa_livelist_condense_cb, spa);
+}
+
+static void
+spa_spawn_aux_threads(spa_t *spa)
+{
+ ASSERT(spa_writeable(spa));
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_start_indirect_condensing_thread(spa);
+ spa_start_livelist_destroy_thread(spa);
+ spa_start_livelist_condensing_thread(spa);
+
+ ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
+ spa->spa_checkpoint_discard_zthr =
+ zthr_create("z_checkpoint_discard",
+ spa_checkpoint_discard_thread_check,
+ spa_checkpoint_discard_thread, spa);
+}
+
+/*
+ * Fix up config after a partly-completed split. This is done with the
+ * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
+ * pool have that entry in their config, but only the splitting one contains
+ * a list of all the guids of the vdevs that are being split off.
+ *
+ * This function determines what to do with that list: either rejoin
+ * all the disks to the pool, or complete the splitting process. To attempt
+ * the rejoin, each disk that is offlined is marked online again, and
+ * we do a reopen() call. If the vdev label for every disk that was
+ * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
+ * then we call vdev_split() on each disk, and complete the split.
+ *
+ * Otherwise we leave the config alone, with all the vdevs in place in
+ * the original pool.
+ */
+static void
+spa_try_repair(spa_t *spa, nvlist_t *config)
+{
+ uint_t extracted;
+ uint64_t *glist;
+ uint_t i, gcount;
+ nvlist_t *nvl;
+ vdev_t **vd;
+ boolean_t attempt_reopen;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
+ return;
+
+ /* check that the config is complete */
+ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ &glist, &gcount) != 0)
+ return;
+
+ vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
+
+ /* attempt to online all the vdevs & validate */
+ attempt_reopen = B_TRUE;
+ for (i = 0; i < gcount; i++) {
+ if (glist[i] == 0) /* vdev is hole */
+ continue;
+
+ vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
+ if (vd[i] == NULL) {
+ /*
+ * Don't bother attempting to reopen the disks;
+ * just do the split.
+ */
+ attempt_reopen = B_FALSE;
+ } else {
+ /* attempt to re-online it */
+ vd[i]->vdev_offline = B_FALSE;
+ }
+ }
+
+ if (attempt_reopen) {
+ vdev_reopen(spa->spa_root_vdev);
+
+ /* check each device to see what state it's in */
+ for (extracted = 0, i = 0; i < gcount; i++) {
+ if (vd[i] != NULL &&
+ vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
+ break;
+ ++extracted;
+ }
+ }
+
+ /*
+ * If every disk has been moved to the new pool, or if we never
+ * even attempted to look at them, then we split them off for
+ * good.
+ */
+ if (!attempt_reopen || gcount == extracted) {
+ for (i = 0; i < gcount; i++)
+ if (vd[i] != NULL)
+ vdev_split(vd[i]);
+ vdev_reopen(spa->spa_root_vdev);
+ }
+
+ kmem_free(vd, gcount * sizeof (vdev_t *));
+}
+
+static int
+spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
+{
+ char *ereport = FM_EREPORT_ZFS_POOL;
+ int error;
+
+ spa->spa_load_state = state;
+ (void) spa_import_progress_set_state(spa_guid(spa),
+ spa_load_state(spa));
+
+ gethrestime(&spa->spa_loaded_ts);
+ error = spa_load_impl(spa, type, &ereport);
+
+ /*
+ * Don't count references from objsets that are already closed
+ * and are making their way through the eviction process.
+ */
+ spa_evicting_os_wait(spa);
+ spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
+ if (error) {
+ if (error != EEXIST) {
+ spa->spa_loaded_ts.tv_sec = 0;
+ spa->spa_loaded_ts.tv_nsec = 0;
+ }
+ if (error != EBADF) {
+ (void) zfs_ereport_post(ereport, spa,
+ NULL, NULL, NULL, 0);
+ }
+ }
+ spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
+ spa->spa_ena = 0;
+
+ (void) spa_import_progress_set_state(spa_guid(spa),
+ spa_load_state(spa));
+
+ return (error);
+}
+
+#ifdef ZFS_DEBUG
+/*
+ * Count the number of per-vdev ZAPs associated with all of the vdevs in the
+ * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
+ * spa's per-vdev ZAP list.
+ */
+static uint64_t
+vdev_count_verify_zaps(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t total = 0;
+
+ if (vd->vdev_top_zap != 0) {
+ total++;
+ ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, vd->vdev_top_zap));
+ }
+ if (vd->vdev_leaf_zap != 0) {
+ total++;
+ ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ total += vdev_count_verify_zaps(vd->vdev_child[i]);
+ }
+
+ return (total);
+}
+#endif
+
+/*
+ * Determine whether the activity check is required.
+ */
+static boolean_t
+spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
+ nvlist_t *config)
+{
+ uint64_t state = 0;
+ uint64_t hostid = 0;
+ uint64_t tryconfig_txg = 0;
+ uint64_t tryconfig_timestamp = 0;
+ uint16_t tryconfig_mmp_seq = 0;
+ nvlist_t *nvinfo;
+
+ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
+ nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
+ (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
+ &tryconfig_txg);
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
+ &tryconfig_timestamp);
+ (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
+ &tryconfig_mmp_seq);
+ }
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
+
+ /*
+ * Disable the MMP activity check; this is used by zdb, which
+ * is intended to be used on potentially active pools.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
+ return (B_FALSE);
+
+ /*
+ * Skip the activity check when the MMP feature is disabled.
+ */
+ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
+ return (B_FALSE);
+
+ /*
+ * If the tryconfig_ values are nonzero, they are the results of an
+ * earlier tryimport. If they all match the uberblock we just found,
+ * then the pool has not changed and we return false so we do not test
+ * a second time.
+ */
+ if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
+ tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
+ tryconfig_mmp_seq && tryconfig_mmp_seq ==
+ (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
+ return (B_FALSE);
+
+ /*
+ * Allow the activity check to be skipped when importing the pool
+ * on the same host that last imported it. Since the hostid in the
+ * configuration may be stale, use the one read from the label.
+ */
+ if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
+ hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
+
+ if (hostid == spa_get_hostid(spa))
+ return (B_FALSE);
+
+ /*
+ * Skip the activity test when the pool was cleanly exported.
+ */
+ if (state != POOL_STATE_ACTIVE)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * The duration, in nanoseconds, for which the activity check must watch
+ * for changes on disk.
+ */
+static uint64_t
+spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
+{
+ uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
+ uint64_t multihost_interval = MSEC2NSEC(
+ MMP_INTERVAL_OK(zfs_multihost_interval));
+ uint64_t import_delay = MAX(NANOSEC, import_intervals *
+ multihost_interval);
+
+ /*
+ * Local tunables determine a minimum duration except for the case
+ * where we know when the remote host will suspend the pool if MMP
+ * writes do not land.
+ *
+ * See Big Theory comment at the top of mmp.c for the reasoning behind
+ * these cases and times.
+ */
+
+ ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
+
+ if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
+ MMP_FAIL_INT(ub) > 0) {
+
+ /* MMP on remote host will suspend pool after failed writes */
+ import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
+ MMP_IMPORT_SAFETY_FACTOR / 100;
+
+ zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
+ "mmp_fails=%llu ub_mmp mmp_interval=%llu "
+ "import_intervals=%u", import_delay, MMP_FAIL_INT(ub),
+ MMP_INTERVAL(ub), import_intervals);
+
+ } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
+ MMP_FAIL_INT(ub) == 0) {
+
+ /* MMP on remote host will never suspend pool */
+ import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
+ ub->ub_mmp_delay) * import_intervals);
+
+ zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
+ "mmp_interval=%llu ub_mmp_delay=%llu "
+ "import_intervals=%u", import_delay, MMP_INTERVAL(ub),
+ ub->ub_mmp_delay, import_intervals);
+
+ } else if (MMP_VALID(ub)) {
+ /*
+ * zfs-0.7 compatibility case
+ */
+
+ import_delay = MAX(import_delay, (multihost_interval +
+ ub->ub_mmp_delay) * import_intervals);
+
+ zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
+ "import_intervals=%u leaves=%u", import_delay,
+ ub->ub_mmp_delay, import_intervals,
+ vdev_count_leaves(spa));
+ } else {
+ /* Using local tunings is the only reasonable option */
+ zfs_dbgmsg("pool last imported on non-MMP aware "
+ "host using import_delay=%llu multihost_interval=%llu "
+ "import_intervals=%u", import_delay, multihost_interval,
+ import_intervals);
+ }
+
+ return (import_delay);
+}
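+
+/*
+ * Worked example with assumed values: if the uberblock reports
+ * MMP_FAIL_INT(ub) == 10 and MMP_INTERVAL(ub) == 1000ms, the first case
+ * above gives import_delay = 10 * 1s * MMP_IMPORT_SAFETY_FACTOR / 100,
+ * i.e. 20 seconds for a safety factor of 200%.
+ */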
+
+/*
+ * Perform the import activity check. If the user canceled the import or
+ * we detected activity, then fail.
+ */
+static int
+spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
+{
+ uint64_t txg = ub->ub_txg;
+ uint64_t timestamp = ub->ub_timestamp;
+ uint64_t mmp_config = ub->ub_mmp_config;
+ uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
+ uint64_t import_delay;
+ hrtime_t import_expire;
+ nvlist_t *mmp_label = NULL;
+ vdev_t *rvd = spa->spa_root_vdev;
+ kcondvar_t cv;
+ kmutex_t mtx;
+ int error = 0;
+
+ cv_init(&cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_enter(&mtx);
+
+ /*
+ * If ZPOOL_CONFIG_MMP_TXG is present, an activity check was performed
+ * during the earlier tryimport. If the txg recorded there is 0 then
+ * the pool is known to be active on another host.
+ *
+ * Otherwise, the pool might be in use on another host. Check for
+ * changes in the uberblocks on disk if necessary.
+ */
+ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
+ nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_LOAD_INFO);
+
+ if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
+ fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
+ vdev_uberblock_load(rvd, ub, &mmp_label);
+ error = SET_ERROR(EREMOTEIO);
+ goto out;
+ }
+ }
+
+ import_delay = spa_activity_check_duration(spa, ub);
+
+ /* Add a small random factor in case of simultaneous imports (0-25%) */
+ import_delay += import_delay * spa_get_random(250) / 1000;
+
+ import_expire = gethrtime() + import_delay;
+
+ while (gethrtime() < import_expire) {
+ (void) spa_import_progress_set_mmp_check(spa_guid(spa),
+ NSEC2SEC(import_expire - gethrtime()));
+
+ vdev_uberblock_load(rvd, ub, &mmp_label);
+
+ if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
+ mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
+ zfs_dbgmsg("multihost activity detected "
+ "txg %llu ub_txg %llu "
+ "timestamp %llu ub_timestamp %llu "
+ "mmp_config %#llx ub_mmp_config %#llx",
+ txg, ub->ub_txg, timestamp, ub->ub_timestamp,
+ mmp_config, ub->ub_mmp_config);
+
+ error = SET_ERROR(EREMOTEIO);
+ break;
+ }
+
+ if (mmp_label) {
+ nvlist_free(mmp_label);
+ mmp_label = NULL;
+ }
+
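+ /*
+ * cv_timedwait_sig() returns -1 only when the timeout expires.
+ * Nothing ever signals this cv, so any other return value means
+ * the wait was interrupted and is treated as the user cancelling
+ * the import.
+ */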
+ error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
+ if (error != -1) {
+ error = SET_ERROR(EINTR);
+ break;
+ }
+ error = 0;
+ }
+
+out:
+ mutex_exit(&mtx);
+ mutex_destroy(&mtx);
+ cv_destroy(&cv);
+
+ /*
+ * If the pool is determined to be active, store the status in the
+ * spa->spa_load_info nvlist. If the remote hostname or hostid are
+ * available from the configuration read from disk, store them as well.
+ * This allows 'zpool import' to generate a more useful message.
+ *
+ * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory)
+ * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
+ * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool
+ */
+ if (error == EREMOTEIO) {
+ char *hostname = "<unknown>";
+ uint64_t hostid = 0;
+
+ if (mmp_label) {
+ if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
+ hostname = fnvlist_lookup_string(mmp_label,
+ ZPOOL_CONFIG_HOSTNAME);
+ fnvlist_add_string(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
+ }
+
+ if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
+ hostid = fnvlist_lookup_uint64(mmp_label,
+ ZPOOL_CONFIG_HOSTID);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_HOSTID, hostid);
+ }
+ }
+
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_TXG, 0);
+
+ error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
+ }
+
+ if (mmp_label)
+ nvlist_free(mmp_label);
+
+ return (error);
+}
+
+static int
+spa_verify_host(spa_t *spa, nvlist_t *mos_config)
+{
+ uint64_t hostid;
+ char *hostname;
+ uint64_t myhostid = 0;
+
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
+ ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
+ hostname = fnvlist_lookup_string(mos_config,
+ ZPOOL_CONFIG_HOSTNAME);
+
+ myhostid = zone_get_hostid(NULL);
+
+ if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
+ cmn_err(CE_WARN, "pool '%s' could not be "
+ "loaded as it was last accessed by "
+ "another system (host: %s hostid: 0x%llx). "
+ "See: https://openzfs.github.io/openzfs-docs/msg/"
+ "ZFS-8000-EY",
+ spa_name(spa), hostname, (u_longlong_t)hostid);
+ spa_load_failed(spa, "hostid verification failed: pool "
+ "last accessed by host: %s (hostid: 0x%llx)",
+ hostname, (u_longlong_t)hostid);
+ return (SET_ERROR(EBADF));
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
+{
+ int error = 0;
+ nvlist_t *nvtree, *nvl, *config = spa->spa_config;
+ int parse;
+ vdev_t *rvd;
+ uint64_t pool_guid;
+ char *comment;
+ char *compatibility;
+
+ /*
+ * Versioning wasn't explicitly added to the label until later, so if
+ * it's not present treat it as the initial version.
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+ spa_load_failed(spa, "invalid config provided: '%s' missing",
+ ZPOOL_CONFIG_POOL_GUID);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If we are doing an import, ensure that the pool is not already
+ * imported by checking if its pool guid already exists in the
+ * spa namespace.
+ *
+ * The only case in which we allow an already imported pool to be
+ * imported again is when the pool is checkpointed and we want to
+ * look at its checkpointed state from userland tools like zdb.
+ */
+#ifdef _KERNEL
+ if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0)) {
+#else
+ if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0) &&
+ !spa_importing_readonly_checkpoint(spa)) {
+#endif
+ spa_load_failed(spa, "a pool with guid %llu is already open",
+ (u_longlong_t)pool_guid);
+ return (SET_ERROR(EEXIST));
+ }
+
+ spa->spa_config_guid = pool_guid;
+
+ nvlist_free(spa->spa_load_info);
+ spa->spa_load_info = fnvlist_alloc();
+
+ ASSERT(spa->spa_comment == NULL);
+ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
+ spa->spa_comment = spa_strdup(comment);
+
+ ASSERT(spa->spa_compatibility == NULL);
+ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY,
+ &compatibility) == 0)
+ spa->spa_compatibility = spa_strdup(compatibility);
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &spa->spa_config_txg);
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
+ spa->spa_config_splitting = fnvlist_dup(nvl);
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
+ spa_load_failed(spa, "invalid config provided: '%s' missing",
+ ZPOOL_CONFIG_VDEV_TREE);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Create "The Godfather" zio to hold all async IOs
+ */
+ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+ KM_SLEEP);
+ for (int i = 0; i < max_ncpus; i++) {
+ spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+ }
+
+ /*
+ * Parse the configuration into a vdev tree. We explicitly set the
+ * value that will be returned by spa_version() since parsing the
+ * configuration requires knowing the version number.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ parse = (type == SPA_IMPORT_EXISTING ?
+ VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
+ error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_load_failed(spa, "unable to parse config [error=%d]",
+ error);
+ return (error);
+ }
+
+ ASSERT(spa->spa_root_vdev == rvd);
+ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
+
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_guid(spa) == pool_guid);
+ }
+
+ return (0);
+}
+
+/*
+ * Recursively open all vdevs in the vdev tree. This function is called twice:
+ * first with the untrusted config, then with the trusted config.
+ */
+static int
+spa_ld_open_vdevs(spa_t *spa)
+{
+ int error = 0;
+
+ /*
+ * spa_missing_tvds_allowed defines how many top-level vdevs can be
+ * missing/unopenable for the root vdev to still be considered openable.
+ */
+ if (spa->spa_trust_config) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
+ } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
+ } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
+ } else {
+ spa->spa_missing_tvds_allowed = 0;
+ }
+
+ spa->spa_missing_tvds_allowed =
+ MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_open(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (spa->spa_missing_tvds != 0) {
+ spa_load_note(spa, "vdev tree has %lld missing top-level "
+ "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
+ if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
+ /*
+ * Although theoretically we could allow users to open
+ * incomplete pools in RW mode, we'd need to add a lot
+ * of extra logic (e.g. adjust pool space to account
+ * for missing vdevs).
+ * This limitation also prevents users from accidentally
+ * opening the pool in RW mode during data recovery and
+ * damaging it further.
+ */
+ spa_load_note(spa, "pools with missing top-level "
+ "vdevs can only be opened in read-only mode.");
+ error = SET_ERROR(ENXIO);
+ } else {
+ spa_load_note(spa, "current settings allow for maximum "
+ "%lld missing top-level vdevs at this stage.",
+ (u_longlong_t)spa->spa_missing_tvds_allowed);
+ }
+ }
+ if (error != 0) {
+ spa_load_failed(spa, "unable to open vdev tree [error=%d]",
+ error);
+ }
+ if (spa->spa_missing_tvds != 0 || error != 0)
+ vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
+
+ return (error);
+}
+
+/*
+ * We need to validate the vdev labels against the configuration that
+ * we have in hand. This function is called twice: first with an untrusted
+ * config, then with a trusted config. The validation is more strict when the
+ * config is trusted.
+ */
+static int
+spa_ld_validate_vdevs(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_validate(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
+ return (error);
+ }
+
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+ spa_load_failed(spa, "cannot open vdev tree after invalidating "
+ "some vdevs");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ return (SET_ERROR(ENXIO));
+ }
+
+ return (0);
+}
+
+static void
+spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
+{
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+ TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+ spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+ spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+ spa->spa_claim_max_txg = spa->spa_first_txg;
+ spa->spa_prev_software_version = ub->ub_software_version;
+}
+
+static int
+spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ nvlist_t *label;
+ uberblock_t *ub = &spa->spa_uberblock;
+ boolean_t activity_check = B_FALSE;
+
+ /*
+ * If we are opening the checkpointed state of the pool by
+ * rewinding to it, at this point we will have written the
+ * checkpointed uberblock to the vdev labels, so searching
+ * the labels will find the right uberblock. However, if
+ * we are opening the checkpointed state read-only, we have
+ * not modified the labels. Therefore, we must ignore the
+ * labels and continue using the spa_uberblock that was set
+ * by spa_ld_checkpoint_rewind.
+ *
+ * Note that it would be fine to ignore the labels when
+ * rewinding (opening writeable) as well. However, if we
+ * crash just after writing the labels, we will end up
+ * searching the labels. Doing so in the common case means
+ * that this code path gets exercised normally, rather than
+ * just in the edge case.
+ */
+ if (ub->ub_checkpoint_txg != 0 &&
+ spa_importing_readonly_checkpoint(spa)) {
+ spa_ld_select_uberblock_done(spa, ub);
+ return (0);
+ }
+
+ /*
+ * Find the best uberblock.
+ */
+ vdev_uberblock_load(rvd, ub, &label);
+
+ /*
+ * If we weren't able to find a single valid uberblock, return failure.
+ */
+ if (ub->ub_txg == 0) {
+ nvlist_free(label);
+ spa_load_failed(spa, "no valid uberblock found");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
+ }
+
+ if (spa->spa_load_max_txg != UINT64_MAX) {
+ (void) spa_import_progress_set_max_txg(spa_guid(spa),
+ (u_longlong_t)spa->spa_load_max_txg);
+ }
+ spa_load_note(spa, "using uberblock with txg=%llu",
+ (u_longlong_t)ub->ub_txg);
+
+ /*
+ * For pools that have the multihost property enabled, determine
+ * whether the pool is truly inactive and can be safely imported.
+ * Prevent hosts which don't have a hostid set from importing the pool.
+ */
+ activity_check = spa_activity_check_required(spa, ub, label,
+ spa->spa_config);
+ if (activity_check) {
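+ /* An MMP-protected pool cannot be imported by a host with no hostid. */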
+ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
+ spa_get_hostid(spa) == 0) {
+ nvlist_free(label);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
+ return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
+ }
+
+ int error = spa_activity_check(spa, ub, spa->spa_config);
+ if (error) {
+ nvlist_free(label);
+ return (error);
+ }
+
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
+ fnvlist_add_uint16(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_SEQ,
+ (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
+ }
+
+ /*
+ * If the pool has an unsupported version we can't open it.
+ */
+ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
+ nvlist_free(label);
+ spa_load_failed(spa, "version %llu is not supported",
+ (u_longlong_t)ub->ub_version);
+ return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
+ }
+
+ if (ub->ub_version >= SPA_VERSION_FEATURES) {
+ nvlist_t *features;
+
+ /*
+ * If we weren't able to find what's necessary for reading the
+ * MOS in the label, return failure.
+ */
+ if (label == NULL) {
+ spa_load_failed(spa, "label config unavailable");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ ENXIO));
+ }
+
+ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
+ &features) != 0) {
+ nvlist_free(label);
+ spa_load_failed(spa, "invalid label: '%s' missing",
+ ZPOOL_CONFIG_FEATURES_FOR_READ);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ ENXIO));
+ }
+
+ /*
+ * Update our in-core representation with the definitive values
+ * from the label.
+ */
+ nvlist_free(spa->spa_label_features);
+ VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * Look through entries in the label nvlist's features_for_read. If
+ * there is a feature listed there that we don't understand, then we
+ * cannot open the pool.
+ */
+ if (ub->ub_version >= SPA_VERSION_FEATURES) {
+ nvlist_t *unsup_feat;
+
+ VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
+ 0);
+
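+ /* Record every label feature this build does not support. */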
+ for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
+ NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
+ if (!zfeature_is_supported(nvpair_name(nvp))) {
+ VERIFY(nvlist_add_string(unsup_feat,
+ nvpair_name(nvp), "") == 0);
+ }
+ }
+
+ if (!nvlist_empty(unsup_feat)) {
+ VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
+ nvlist_free(unsup_feat);
+ spa_load_failed(spa, "some features are unsupported");
+ return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP));
+ }
+
+ nvlist_free(unsup_feat);
+ }
+
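+ /* If a vdev split was left in progress, try to repair it now. */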
+ if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_try_repair(spa, spa->spa_config);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
+ }
+
+ /*
+ * Initialize internal SPA structures.
+ */
+ spa_ld_select_uberblock_done(spa, ub);
+
+ return (0);
+}
+
+static int
+spa_ld_open_rootbp(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+ if (error != 0) {
+ spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
+ "[error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+ spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+ return (0);
+}
+
+static int
+spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
+ boolean_t reloading)
+{
+ vdev_t *mrvd, *rvd = spa->spa_root_vdev;
+ nvlist_t *nv, *mos_config, *policy;
+ int error = 0, copy_error;
+ uint64_t healthy_tvds, healthy_tvds_mos;
+ uint64_t mos_config_txg;
+
+ if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
+ != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * If we're assembling a pool from a split, the config provided is
+ * already trusted so there is nothing to do.
+ */
+ if (type == SPA_IMPORT_ASSEMBLE)
+ return (0);
+
+ healthy_tvds = spa_healthy_core_tvds(spa);
+
+ if (load_nvlist(spa, spa->spa_config_object, &mos_config)
+ != 0) {
+ spa_load_failed(spa, "unable to retrieve MOS config");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * If we are doing an open, the pool owner hasn't been verified yet,
+ * so do the verification here.
+ */
+ if (spa->spa_load_state == SPA_LOAD_OPEN) {
+ error = spa_verify_host(spa, mos_config);
+ if (error != 0) {
+ nvlist_free(mos_config);
+ return (error);
+ }
+ }
+
+ nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ /*
+ * Build a new vdev tree from the trusted config
+ */
+ error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
+ if (error != 0) {
+ nvlist_free(mos_config);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_load_failed(spa, "spa_config_parse failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+
+ /*
+ * Vdev paths in the MOS may be obsolete. If the untrusted config was
+ * obtained by scanning /dev/dsk, then it will have the right vdev
+ * paths. We update the trusted MOS config with this information.
+ * We first try to copy the paths with vdev_copy_path_strict, which
+ * succeeds only when both configs have exactly the same vdev tree.
+ * If that fails, we fall back to a more flexible method with a
+ * best-effort policy.
+ */
+ copy_error = vdev_copy_path_strict(rvd, mrvd);
+ if (copy_error != 0 || spa_load_print_vdev_tree) {
+ spa_load_note(spa, "provided vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ spa_load_note(spa, "MOS vdev tree:");
+ vdev_dbgmsg_print_tree(mrvd, 2);
+ }
+ if (copy_error != 0) {
+ spa_load_note(spa, "vdev_copy_path_strict failed, falling "
+ "back to vdev_copy_path_relaxed");
+ vdev_copy_path_relaxed(rvd, mrvd);
+ }
+
+ vdev_close(rvd);
+ vdev_free(rvd);
+ spa->spa_root_vdev = mrvd;
+ rvd = mrvd;
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * We will use spa_config if we decide to reload the spa or if spa_load
+ * fails and we rewind. We must thus regenerate the config using the
+ * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
+ * pass settings on how to load the pool and is not stored in the MOS.
+ * We copy it over to our new, trusted config.
+ */
+ mos_config_txg = fnvlist_lookup_uint64(mos_config,
+ ZPOOL_CONFIG_POOL_TXG);
+ nvlist_free(mos_config);
+ mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
+ if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
+ &policy) == 0)
+ fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
+ spa_config_set(spa, mos_config);
+ spa->spa_config_source = SPA_CONFIG_SRC_MOS;
+
+ /*
+ * Now that we have the config from the MOS, we should be more strict
+ * in checking blkptrs and can make assumptions about the consistency
+ * of the vdev tree. spa_trust_config must be set to true before opening
+ * vdevs in order for them to be writeable.
+ */
+ spa->spa_trust_config = B_TRUE;
+
+ /*
+ * Open and validate the new vdev tree
+ */
+ error = spa_ld_open_vdevs(spa);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_validate_vdevs(spa);
+ if (error != 0)
+ return (error);
+
+ if (copy_error != 0 || spa_load_print_vdev_tree) {
+ spa_load_note(spa, "final vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ }
+
+ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
+ !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
+ /*
+ * Sanity check to make sure that we are indeed loading the
+ * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
+ * in the config provided and they happened to be the only ones
+ * to have the latest uberblock, we could involuntarily perform
+ * an extreme rewind.
+ */
+ healthy_tvds_mos = spa_healthy_core_tvds(spa);
+ if (healthy_tvds_mos - healthy_tvds >=
+ SPA_SYNC_MIN_VDEVS) {
+ spa_load_note(spa, "config provided misses too many "
+ "top-level vdevs compared to MOS (%lld vs %lld). ",
+ (u_longlong_t)healthy_tvds,
+ (u_longlong_t)healthy_tvds_mos);
+ spa_load_note(spa, "vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ if (reloading) {
+ spa_load_failed(spa, "config was already "
+ "provided from MOS. Aborting.");
+ return (spa_vdev_err(rvd,
+ VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+ spa_load_note(spa, "spa must be reloaded using MOS "
+ "config");
+ return (SET_ERROR(EAGAIN));
+ }
+ }
+
+ error = spa_check_for_missing_logs(spa);
+ if (error != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+ if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
+ spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
+ "guid sum (%llu != %llu)",
+ (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
+ (u_longlong_t)rvd->vdev_guid_sum);
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+ ENXIO));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_open_indirect_vdev_metadata(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * Everything that we read before spa_remove_init() must be stored
+ * on concrete vdevs. Therefore we do this as early as possible.
+ */
+ error = spa_remove_init(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_remove_init failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * Retrieve information needed to condense indirect vdev mappings.
+ */
+ error = spa_condense_init(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_condense_init failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ if (spa_version(spa) >= SPA_VERSION_FEATURES) {
+ boolean_t missing_feat_read = B_FALSE;
+ nvlist_t *unsup_feat, *enabled_feat;
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
+ &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
+ &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
+ &spa->spa_feat_desc_obj, B_TRUE) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ enabled_feat = fnvlist_alloc();
+ unsup_feat = fnvlist_alloc();
+
+ if (!spa_features_check(spa, B_FALSE,
+ unsup_feat, enabled_feat))
+ missing_feat_read = B_TRUE;
+
+ if (spa_writeable(spa) ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
+ if (!spa_features_check(spa, B_TRUE,
+ unsup_feat, enabled_feat)) {
+ *missing_feat_writep = B_TRUE;
+ }
+ }
+
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
+
+ if (!nvlist_empty(unsup_feat)) {
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
+ }
+
+ fnvlist_free(enabled_feat);
+ fnvlist_free(unsup_feat);
+
+ if (!missing_feat_read) {
+ fnvlist_add_boolean(spa->spa_load_info,
+ ZPOOL_CONFIG_CAN_RDONLY);
+ }
+
+ /*
+ * If the state is SPA_LOAD_TRYIMPORT, our objective is
+ * twofold: to determine whether the pool is available for
+ * import in read-write mode and (if it is not) whether the
+ * pool is available for import in read-only mode. If the pool
+ * is available for import in read-write mode, it is displayed
+ * as available in userland; if it is not available for import
+ * in read-only mode, it is displayed as unavailable in
+ * userland. If the pool is available for import in read-only
+ * mode but not read-write mode, it is displayed as unavailable
+ * in userland with a special note that the pool is actually
+ * available for open in read-only mode.
+ *
+ * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
+ * missing a feature for write, we must first determine whether
+ * the pool can be opened read-only before returning to
+ * userland in order to know whether to display the
+ * abovementioned note.
+ */
+ if (missing_feat_read || (*missing_feat_writep &&
+ spa_writeable(spa))) {
+ spa_load_failed(spa, "pool uses unsupported features");
+ return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP));
+ }
+
+ /*
+ * Load refcounts for ZFS features from disk into an in-memory
+ * cache during SPA initialization.
+ */
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ uint64_t refcount;
+
+ error = feature_get_refcount_from_disk(spa,
+ &spa_feature_table[i], &refcount);
+ if (error == 0) {
+ spa->spa_feat_refcount_cache[i] = refcount;
+ } else if (error == ENOTSUP) {
+ spa->spa_feat_refcount_cache[i] =
+ SPA_FEATURE_DISABLED;
+ } else {
+ spa_load_failed(spa, "error getting refcount "
+ "for feature %s [error=%d]",
+ spa_feature_table[i].fi_guid, error);
+ return (spa_vdev_err(rvd,
+ VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+ }
+ }
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
+ if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
+ &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * Encryption was added before bookmark_v2, even though bookmark_v2
+ * is now a dependency. If this pool has encryption enabled without
+ * bookmark_v2, trigger an errata message.
+ */
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
+ spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_load_special_directories(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ spa->spa_is_initializing = B_TRUE;
+ error = dsl_pool_open(spa->spa_dsl_pool);
+ spa->spa_is_initializing = B_FALSE;
+ if (error != 0) {
+ spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_get_props(spa_t *spa)
+{
+ int error = 0;
+ uint64_t obj;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /* Grab the checksum salt from the MOS. */
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CHECKSUM_SALT, 1,
+ sizeof (spa->spa_cksum_salt.zcs_bytes),
+ spa->spa_cksum_salt.zcs_bytes);
+ if (error == ENOENT) {
+ /* Generate a new salt for subsequent use */
+ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+ sizeof (spa->spa_cksum_salt.zcs_bytes));
+ } else if (error != 0) {
+ spa_load_failed(spa, "unable to retrieve checksum salt from "
+ "MOS [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
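+ /* Open the deferred-frees bpobj referenced by the MOS directory. */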
+ if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
+ if (error != 0) {
+ spa_load_failed(spa, "error opening deferred-frees bpobj "
+ "[error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * Load the bit that tells us to use the new accounting function
+ * (raid-z deflation). If we have an older pool, this will not
+ * be present.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
+ &spa->spa_creation_version, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the persistent error log. If we have an older pool, this will
+ * not be present.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
+ B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
+ &spa->spa_errlog_scrub, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the livelist deletion field. If a livelist is queued for
+ * deletion, indicate that in the spa.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
+ &spa->spa_livelists_to_delete, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the history object. If we have an older pool, this
+ * will not be present.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the per-vdev ZAP map. If we have an older pool, this will not
+ * be present; in this case, defer its creation to a later time to
+ * avoid dirtying the MOS this early (outside of a sync context). See
+ * spa_sync_config_object.
+ */
+
+ /* The sentinel is only available in the MOS config. */
+ nvlist_t *mos_config;
+ if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
+ spa_load_failed(spa, "unable to retrieve MOS config");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
+ &spa->spa_all_vdev_zaps, B_FALSE);
+
+ if (error == ENOENT) {
+ VERIFY(!nvlist_exists(mos_config,
+ ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
+ spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
+ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
+ } else if (error != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
+ /*
+ * An older version of ZFS overwrote the sentinel value, so
+ * we have orphaned per-vdev ZAPs in the MOS. Defer their
+ * destruction to later; see spa_sync_config_object.
+ */
+ spa->spa_avz_action = AVZ_ACTION_DESTROY;
+ /*
+ * We're assuming that no vdevs have had their ZAPs created
+ * before this. Better be sure of it.
+ */
+ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
+ }
+ nvlist_free(mos_config);
+
+ spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
+
+ error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
+ B_FALSE);
+ if (error && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ if (error == 0) {
+ uint64_t autoreplace;
+
+ spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
+ spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
+ spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
+ spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
+ spa->spa_autoreplace = (autoreplace != 0);
+ }
+
+ /*
+ * If we are importing a pool with missing top-level vdevs,
+ * we enforce that the pool doesn't panic or get suspended on
+ * error since the likelihood of missing data is extremely high.
+ */
+ if (spa->spa_missing_tvds > 0 &&
+ spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
+ spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+ spa_load_note(spa, "forcing failmode to 'continue' "
+ "as some top level vdevs are missing");
+ spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * If we're assembling the pool from the split-off vdevs of
+ * an existing pool, we don't want to attach the spares & cache
+ * devices.
+ */
+
+ /*
+ * Load any hot spares for this pool.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
+ B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
+ if (load_nvlist(spa, spa->spa_spares.sav_object,
+ &spa->spa_spares.sav_config) != 0) {
+ spa_load_failed(spa, "error loading spares nvlist");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_spares(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ } else if (error == 0) {
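+ /* Assembling from a split: skip loading spares, just mark for sync. */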
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ /*
+ * Load any level 2 ARC devices for this pool.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
+ &spa->spa_l2cache.sav_object, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
+ if (load_nvlist(spa, spa->spa_l2cache.sav_object,
+ &spa->spa_l2cache.sav_config) != 0) {
+ spa_load_failed(spa, "error loading l2cache nvlist");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ } else if (error == 0) {
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_load_vdev_metadata(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * If the 'multihost' property is set, then never allow a pool to
+ * be imported when the system hostid is zero. The exception to
+ * this rule is zdb, which is always allowed to access pools.
+ */
+ if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
+ (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
+ return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
+ }
+
+ /*
+ * If the 'autoreplace' property is set, then post a resource notifying
+ * the ZFS DE that it should not issue any faults for unopenable
+ * devices. We also iterate over the vdevs, and post a sysevent for any
+ * unopenable vdevs so that the normal autoreplace handler can take
+ * over.
+ */
+ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+ spa_check_removed(spa->spa_root_vdev);
+ /*
+ * For the import case, this is done in spa_import(), because
+ * at this point we're using the spare definitions from
+ * the MOS config, not necessarily from the userland config.
+ */
+ if (spa->spa_load_state != SPA_LOAD_IMPORT) {
+ spa_aux_check_removed(&spa->spa_spares);
+ spa_aux_check_removed(&spa->spa_l2cache);
+ }
+ }
+
+ /*
+ * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
+ */
+ error = vdev_load(rvd);
+ if (error != 0) {
+ spa_load_failed(spa, "vdev_load failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+
+ error = spa_ld_log_spacemaps(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+
+ /*
+ * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ return (0);
+}
+
+static int
+spa_ld_load_dedup_tables(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ error = ddt_load(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "ddt_load failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
+ boolean_t missing = spa_check_logs(spa);
+ if (missing) {
+ if (spa->spa_missing_tvds != 0) {
+ spa_load_note(spa, "spa_check_logs failed "
+ "so dropping the logs");
+ } else {
+ *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ spa_load_failed(spa, "spa_check_logs failed");
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
+ ENXIO));
+ }
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_verify_pool_data(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * We've successfully opened the pool, verify that we're ready
+ * to start pushing transactions.
+ */
+ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+ error = spa_load_verify(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_load_verify failed "
+ "[error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ error));
+ }
+ }
+
+ return (0);
+}
+
+static void
+spa_ld_claim_log_blocks(spa_t *spa)
+{
+ dmu_tx_t *tx;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ /*
+ * Claim log blocks that haven't been committed yet.
+ * This must all happen in a single txg.
+ * Note: spa_claim_max_txg is updated by spa_claim_notify(),
+ * invoked from zil_claim_log_block()'s i/o done callback.
+ * Price of rollback is that we abandon the log.
+ */
+ spa->spa_claiming = B_TRUE;
+
+ tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
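+ /* Walk every dataset and claim its ZIL blocks within this txg. */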
+ (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ zil_claim, tx, DS_FIND_CHILDREN);
+ dmu_tx_commit(tx);
+
+ spa->spa_claiming = B_FALSE;
+
+ spa_set_log_state(spa, SPA_LOG_GOOD);
+}
+
+static void
+spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
+ boolean_t update_config_cache)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ int need_update = B_FALSE;
+
+ /*
+ * If the config cache is stale, or we have uninitialized
+ * metaslabs (see spa_vdev_add()), then update the config.
+ *
+ * If this is a verbatim import, trust the current
+ * in-core spa_config and update the disk labels.
+ */
+ if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
+ spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_RECOVER ||
+ (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
+ need_update = B_TRUE;
+
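+ /* A top-level vdev without a metaslab array is still uninitialized. */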
+ for (int c = 0; c < rvd->vdev_children; c++)
+ if (rvd->vdev_child[c]->vdev_ms_array == 0)
+ need_update = B_TRUE;
+
+ /*
+ * Update the config cache asynchronously in case we're the
+ * root pool, in which case the config cache isn't writable yet.
+ */
+ if (need_update)
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+}
+
+static void
+spa_ld_prepare_for_reload(spa_t *spa)
+{
+ spa_mode_t mode = spa->spa_mode;
+ int async_suspended = spa->spa_async_suspended;
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_activate(spa, mode);
+
+ /*
+ * We save the value of spa_async_suspended as it gets reset to 0 by
+ * spa_unload(). We want to restore it to the original value before
+ * returning, as we might call spa_async_resume() later.
+ */
+ spa->spa_async_suspended = async_suspended;
+}
+
+static int
+spa_ld_read_checkpoint_txg(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error = 0;
+
+ ASSERT0(spa->spa_checkpoint_txg);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
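+ /* The checkpointed uberblock, if any, lives in the MOS directory. */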
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error == ENOENT)
+ return (0);
+
+ if (error != 0)
+ return (error);
+
+ ASSERT3U(checkpoint.ub_txg, !=, 0);
+ ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
+ ASSERT3U(checkpoint.ub_timestamp, !=, 0);
+ spa->spa_checkpoint_txg = checkpoint.ub_txg;
+ spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
+
+ return (0);
+}
+
+static int
+spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
+
+ /*
+ * Never trust the config that is provided unless we are assembling
+ * a pool following a split.
+ * This means don't trust blkptrs and the vdev tree in general. This
+ * also effectively puts the spa in read-only mode since
+ * spa_writeable() checks for spa_trust_config to be true.
+ * We will later load a trusted config from the MOS.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE)
+ spa->spa_trust_config = B_FALSE;
+
+ /*
+ * Parse the config provided to create a vdev tree.
+ */
+ error = spa_ld_parse_config(spa, type);
+ if (error != 0)
+ return (error);
+
+ spa_import_progress_add(spa);
+
+ /*
+ * Now that we have the vdev tree, try to open each vdev. This involves
+ * opening the underlying physical device, retrieving its geometry and
+ * probing the vdev with a dummy I/O. The state of each vdev will be set
+ * based on the success of those operations. After this we'll be ready
+ * to read from the vdevs.
+ */
+ error = spa_ld_open_vdevs(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Read the label of each vdev and make sure that the GUIDs stored
+ * there match the GUIDs in the config provided.
+ * If we're assembling a new pool that's been split off from an
+ * existing pool, the labels haven't yet been updated so we skip
+ * validation for now.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ error = spa_ld_validate_vdevs(spa);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * Read all vdev labels to find the best uberblock (i.e. latest,
+ * unless spa_load_max_txg is set) and store it in spa_uberblock. We
+ * get the list of features required to read blkptrs in the MOS from
+ * the vdev label with the best uberblock and verify that our version
+ * of zfs supports them all.
+ */
+ error = spa_ld_select_uberblock(spa, type);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Pass that uberblock to the dsl_pool layer which will open the root
+ * blkptr. This blkptr points to the latest version of the MOS and will
+ * allow us to read its contents.
+ */
+ error = spa_ld_open_rootbp(spa);
+ if (error != 0)
+ return (error);
+
+ return (0);
+}
+
+static int
+spa_ld_checkpoint_rewind(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error != 0) {
+ spa_load_failed(spa, "unable to retrieve checkpointed "
+ "uberblock from the MOS config [error=%d]", error);
+
+ if (error == ENOENT)
+ error = ZFS_ERR_NO_CHECKPOINT;
+
+ return (error);
+ }
+
+ ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
+ ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
+
+ /*
+ * We need to update the txg and timestamp of the checkpointed
+ * uberblock to be higher than the latest one. This ensures that
+ * the checkpointed uberblock is selected if we were to close and
+ * reopen the pool right after we've written it in the vdev labels.
+ * (also see block comment in vdev_uberblock_compare)
+ */
+ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
+ checkpoint.ub_timestamp = gethrestime_sec();
+
+ /*
+ * Set current uberblock to be the checkpointed uberblock.
+ */
+ spa->spa_uberblock = checkpoint;
+
+ /*
+ * If we are doing a normal rewind, then the pool is open for
+ * writing and we sync the "updated" checkpointed uberblock to
+ * disk. Once this is done, we've basically rewound the whole
+ * pool and there is no way back.
+ *
+ * There are cases where we don't want to attempt to sync the
+ * checkpointed uberblock to disk because we are opening a
+ * pool as read-only. Specifically, verifying the checkpointed
+ * state with zdb, and importing the checkpointed state to get
+ * a "preview" of its content.
+ */
+ if (spa_writeable(spa)) {
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
+ int svdcount = 0;
+ int children = rvd->vdev_children;
+ int c0 = spa_get_random(children);
+
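+ /*
+ * Choose up to SPA_SYNC_MIN_VDEVS concrete, non-log top-level
+ * vdevs, starting from a random child, to sync the labels to.
+ */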
+ for (int c = 0; c < children; c++) {
+ vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
+
+ /* Stop when revisiting the first vdev */
+ if (c > 0 && svd[0] == vd)
+ break;
+
+ if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+ !vdev_is_concrete(vd))
+ continue;
+
+ svd[svdcount++] = vd;
+ if (svdcount == SPA_SYNC_MIN_VDEVS)
+ break;
+ }
+ error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
+ if (error == 0)
+ spa->spa_last_synced_guid = rvd->vdev_guid;
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_load_failed(spa, "failed to write checkpointed "
+ "uberblock to the vdev labels [error=%d]", error);
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
+ boolean_t *update_config_cache)
+{
+ int error;
+
+ /*
+ * Parse the config for pool, open and validate vdevs,
+ * select an uberblock, and use that uberblock to open
+ * the MOS.
+ */
+ error = spa_ld_mos_init(spa, type);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the trusted config stored in the MOS and use it to create
+ * a new, exact version of the vdev tree, then reopen all vdevs.
+ */
+ error = spa_ld_trusted_config(spa, type, B_FALSE);
+ if (error == EAGAIN) {
+ if (update_config_cache != NULL)
+ *update_config_cache = B_TRUE;
+
+ /*
+ * Redo the loading process with the trusted config if it is
+ * too different from the untrusted config.
+ */
+ spa_ld_prepare_for_reload(spa);
+ spa_load_note(spa, "RELOADING");
+ error = spa_ld_mos_init(spa, type);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_trusted_config(spa, type, B_TRUE);
+ if (error != 0)
+ return (error);
+
+ } else if (error != 0) {
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * Load an existing storage pool, using the config provided. This config
+ * describes which vdevs are part of the pool and is later validated against
+ * partial configs present in each vdev's label and an entire copy of the
+ * config stored in the MOS.
+ */
+static int
+spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+ int error = 0;
+ boolean_t missing_feat_write = B_FALSE;
+ boolean_t checkpoint_rewind =
+ (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ boolean_t update_config_cache = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
+
+ spa_load_note(spa, "LOADING");
+
+ error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
+ if (error != 0)
+ return (error);
+
+ /*
+ * If we are rewinding to the checkpoint then we need to repeat
+ * everything we've done so far in this function but this time
+ * selecting the checkpointed uberblock and using that to open
+ * the MOS.
+ */
+ if (checkpoint_rewind) {
+ /*
+ * If we are rewinding to the checkpoint update config cache
+ * anyway.
+ */
+ update_config_cache = B_TRUE;
+
+ /*
+ * Extract the checkpointed uberblock from the current MOS
+ * and use this as the pool's uberblock from now on. If the
+ * pool is imported as writeable we also write the checkpoint
+ * uberblock to the labels, making the rewind permanent.
+ */
+ error = spa_ld_checkpoint_rewind(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Redo the loading process again with the
+ * checkpointed uberblock.
+ */
+ spa_ld_prepare_for_reload(spa);
+ spa_load_note(spa, "LOADING checkpointed uberblock");
+ error = spa_ld_mos_with_trusted_config(spa, type, NULL);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * Retrieve the checkpoint txg if the pool has a checkpoint.
+ */
+ error = spa_ld_read_checkpoint_txg(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the mapping of indirect vdevs. Those vdevs were removed
+ * from the pool and their contents were re-mapped to other vdevs. Note
+ * that everything that we read before this step must have been
+ * rewritten on concrete vdevs after the last device removal was
+ * initiated. Otherwise we could be reading from indirect vdevs before
+ * we have loaded their mappings.
+ */
+ error = spa_ld_open_indirect_vdev_metadata(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the full list of active features from the MOS and check if
+ * they are all supported.
+ */
+ error = spa_ld_check_features(spa, &missing_feat_write);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Load several special directories from the MOS needed by the dsl_pool
+ * layer.
+ */
+ error = spa_ld_load_special_directories(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve pool properties from the MOS.
+ */
+ error = spa_ld_get_props(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the list of auxiliary devices - cache devices and spares -
+ * and open them.
+ */
+ error = spa_ld_open_aux_vdevs(spa, type);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Load the metadata for all vdevs. Also check if unopenable devices
+ * should be autoreplaced.
+ */
+ error = spa_ld_load_vdev_metadata(spa);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_load_dedup_tables(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Verify the logs now to make sure we don't have any unexpected errors
+ * when we claim log blocks later.
+ */
+ error = spa_ld_verify_logs(spa, type, ereport);
+ if (error != 0)
+ return (error);
+
+ if (missing_feat_write) {
+ ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
+
+ /*
+ * At this point, we know that we can open the pool in
+ * read-only mode but not read-write mode. We now have enough
+ * information and can return to userland.
+ */
+ return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP));
+ }
+
+ /*
+ * Traverse the last txgs to make sure the pool was left off in a safe
+ * state. When performing an extreme rewind, we verify the whole pool,
+ * which can take a very long time.
+ */
+ error = spa_ld_verify_pool_data(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Calculate the deflated space for the pool. This must be done before
+ * we write anything to the pool because we'd need to update the space
+ * accounting using the deflated sizes.
+ */
+ spa_update_dspace(spa);
+
+ /*
+ * We have now retrieved all the information we needed to open the
+ * pool. If we are importing the pool in read-write mode, a few
+ * additional steps must be performed to finish the import.
+ */
+ if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
+ spa->spa_load_max_txg == UINT64_MAX)) {
+ uint64_t config_cache_txg = spa->spa_config_txg;
+
+ ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
+
+ /*
+ * In case of a checkpoint rewind, log the original txg
+ * of the checkpointed uberblock.
+ */
+ if (checkpoint_rewind) {
+ spa_history_log_internal(spa, "checkpoint rewind",
+ NULL, "rewound state to txg=%llu",
+ (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
+ }
+
+ /*
+ * Traverse the ZIL and claim all blocks.
+ */
+ spa_ld_claim_log_blocks(spa);
+
+ /*
+ * Kick-off the syncing thread.
+ */
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+ mmp_thread_start(spa);
+
+ /*
+ * Wait for all claims to sync. We sync up to the highest
+ * claimed log block birth time so that claimed log blocks
+ * don't appear to be from the future. spa_claim_max_txg
+ * will have been set for us by ZIL traversal operations
+ * performed above.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
+
+ /*
+ * Check if we need to request an update of the config. On the
+ * next sync, we would update the config stored in vdev labels
+ * and the cachefile (by default /etc/zfs/zpool.cache).
+ */
+ spa_ld_check_for_config_update(spa, config_cache_txg,
+ update_config_cache);
+
+ /*
+ * Check if a rebuild was in progress and if so resume it.
+ * Then check all DTLs to see if anything needs resilvering.
+ * The resilver will be deferred if a rebuild was started.
+ */
+ if (vdev_rebuild_active(spa->spa_root_vdev)) {
+ vdev_rebuild_restart(spa);
+ } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
+
+ /*
+ * Log the fact that we booted up (so that we can detect if
+ * we rebooted in the middle of an operation).
+ */
+ spa_history_log_version(spa, "open", NULL);
+
+ spa_restart_removal(spa);
+ spa_spawn_aux_threads(spa);
+
+ /*
+ * Delete any inconsistent datasets.
+ *
+ * Note:
+ * Since we may be issuing deletes for clones here,
+ * we make sure to do so after we've spawned all the
+ * auxiliary threads above (of which the livelist
+ * deletion zthr is one).
+ */
+ (void) dmu_objset_find(spa_name(spa),
+ dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
+
+ /*
+ * Clean up any stale temporary dataset userrefs.
+ */
+ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ vdev_trim_restart(spa->spa_root_vdev);
+ vdev_autotrim_restart(spa);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ spa_import_progress_remove(spa_guid(spa));
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+
+ spa_load_note(spa, "LOADED");
+
+ return (0);
+}
+
+static int
+spa_load_retry(spa_t *spa, spa_load_state_t state)
+{
+ spa_mode_t mode = spa->spa_mode;
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+
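+ /* Retry with the uberblock search capped below the txg that just failed. */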
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
+
+ spa_activate(spa, mode);
+ spa_async_suspend(spa);
+
+ spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
+ (u_longlong_t)spa->spa_load_max_txg);
+
+ return (spa_load(spa, state, SPA_IMPORT_EXISTING));
+}
+
+/*
+ * If spa_load() fails this function will try loading prior txg's. If
+ * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
+ * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
+ * function will not rewind the pool and will return the same error as
+ * spa_load().
+ */
+static int
+spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
+ int rewind_flags)
+{
+ nvlist_t *loadinfo = NULL;
+ nvlist_t *config = NULL;
+ int load_error, rewind_error;
+ uint64_t safe_rewind_txg;
+ uint64_t min_txg;
+
+ if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
+ spa->spa_load_max_txg = spa->spa_load_txg;
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ spa->spa_load_max_txg = max_request;
+ if (max_request != UINT64_MAX)
+ spa->spa_extreme_rewind = B_TRUE;
+ }
+
+ load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
+ if (load_error == 0)
+ return (0);
+ if (load_error == ZFS_ERR_NO_CHECKPOINT) {
+ /*
+ * When attempting checkpoint-rewind on a pool with no
+ * checkpoint, we should not attempt to load uberblocks
+ * from previous txgs when spa_load fails.
+ */
+ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ spa_import_progress_remove(spa_guid(spa));
+ return (load_error);
+ }
+
+ if (spa->spa_root_vdev != NULL)
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ if (rewind_flags & ZPOOL_NEVER_REWIND) {
+ nvlist_free(config);
+ spa_import_progress_remove(spa_guid(spa));
+ return (load_error);
+ }
+
+ if (state == SPA_LOAD_RECOVER) {
+ /* Price of rolling back is discarding txgs, including log */
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ /*
+ * If we aren't rolling back save the load info from our first
+ * import attempt so that we can restore it after attempting
+ * to rewind.
+ */
+ loadinfo = spa->spa_load_info;
+ spa->spa_load_info = fnvlist_alloc();
+ }
+
+ spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
+ safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
+ min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
+ TXG_INITIAL : safe_rewind_txg;
+
+ /*
+ * Continue as long as we're finding errors, we're still within
+ * the acceptable rewind range, and we're still finding uberblocks.
+ */
+ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
+ spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
+ if (spa->spa_load_max_txg < safe_rewind_txg)
+ spa->spa_extreme_rewind = B_TRUE;
+ rewind_error = spa_load_retry(spa, state);
+ }
+
+ spa->spa_extreme_rewind = B_FALSE;
+ spa->spa_load_max_txg = UINT64_MAX;
+
+ if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+ spa_config_set(spa, config);
+ else
+ nvlist_free(config);
+
+ if (state == SPA_LOAD_RECOVER) {
+ ASSERT3P(loadinfo, ==, NULL);
+ spa_import_progress_remove(spa_guid(spa));
+ return (rewind_error);
+ } else {
+ /* Store the rewind info as part of the initial load info */
+ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
+ spa->spa_load_info);
+
+ /* Restore the initial load info */
+ fnvlist_free(spa->spa_load_info);
+ spa->spa_load_info = loadinfo;
+
+ spa_import_progress_remove(spa_guid(spa));
+ return (load_error);
+ }
+}
+
+/*
+ * Pool Open/Import
+ *
+ * The import case is identical to an open except that the configuration is sent
+ * down from userland, instead of grabbed from the configuration cache. For the
+ * case of an open, the pool configuration will exist in the
+ * POOL_STATE_UNINITIALIZED state.
+ *
+ * The stats information (gen/count/ustats) is used to gather vdev statistics
+ * while we open the pool, without having to keep around the spa_t in some
+ * ambiguous state.
+ */
+static int
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+ nvlist_t **config)
+{
+ spa_t *spa;
+ spa_load_state_t state = SPA_LOAD_OPEN;
+ int error;
+ int locked = B_FALSE;
+ int firstopen = B_FALSE;
+
+ *spapp = NULL;
+
+ /*
+ * As disgusting as this is, we need to support recursive calls to this
+ * function because dsl_dir_open() is called during spa_load(), and ends
+ * up calling spa_open() again. The real fix is to figure out how to
+ * avoid dsl_dir_open() calling this in the first place.
+ */
+ if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ if ((spa = spa_lookup(pool)) == NULL) {
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+ zpool_load_policy_t policy;
+
+ firstopen = B_TRUE;
+
+ zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
+ &policy);
+ if (policy.zlp_rewind & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
+
+ spa_activate(spa, spa_mode_global);
+
+ if (state != SPA_LOAD_RECOVER)
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+ spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
+
+ zfs_dbgmsg("spa_open_common: opening %s", pool);
+ error = spa_load_best(spa, state, policy.zlp_txg,
+ policy.zlp_rewind);
+
+ if (error == EBADF) {
+ /*
+ * If vdev_validate() returns failure (indicated by
+ * EBADF), one of the vdev labels indicates that the
+ * pool has been exported or destroyed. If this is
+ * the case, the config cache is out of sync and
+ * we should remove the pool from the namespace.
+ */
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_remove(spa);
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (error) {
+ /*
+ * We can't open the pool, but we still have useful
+ * information: the state of each vdev after the
+ * attempted vdev_open(). Return this to the user.
+ */
+ if (config != NULL && spa->spa_config) {
+ VERIFY(nvlist_dup(spa->spa_config, config,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist(*config,
+ ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa->spa_last_open_failed = error;
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ *spapp = NULL;
+ return (error);
+ }
+ }
+
+ spa_open_ref(spa, tag);
+
+ if (config != NULL)
+ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ /*
+ * If we've recovered the pool, pass back any information we
+ * gathered while doing the load.
+ */
+ if (state == SPA_LOAD_RECOVER) {
+ VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
+
+ if (locked) {
+ spa->spa_last_open_failed = 0;
+ spa->spa_last_ubsync_txg = 0;
+ spa->spa_load_txg = 0;
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ if (firstopen)
+ zvol_create_minors_recursive(spa_name(spa));
+
+ *spapp = spa;
+
+ return (0);
+}
+
+int
+spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
+ nvlist_t **config)
+{
+ return (spa_open_common(name, spapp, tag, policy, config));
+}
+
+int
+spa_open(const char *name, spa_t **spapp, void *tag)
+{
+ return (spa_open_common(name, spapp, tag, NULL, NULL));
+}
+
+/*
+ * Lookup the given spa_t, incrementing the inject count in the process,
+ * preventing it from being exported or destroyed.
+ */
+spa_t *
+spa_inject_addref(char *name)
+{
+ spa_t *spa;
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (NULL);
+ }
+ spa->spa_inject_ref++;
+ mutex_exit(&spa_namespace_lock);
+
+ return (spa);
+}
+
+void
+spa_inject_delref(spa_t *spa)
+{
+ mutex_enter(&spa_namespace_lock);
+ spa->spa_inject_ref--;
+ mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * Add spares device information to the nvlist.
+ */
+static void
+spa_add_spares(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t **spares;
+ uint_t i, nspares;
+ nvlist_t *nvroot;
+ uint64_t guid;
+ vdev_stat_t *vs;
+ uint_t vsc;
+ uint64_t pool;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ if (spa->spa_spares.sav_count == 0)
+ return;
+
+ VERIFY(nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+ if (nspares != 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+ /*
+ * Go through and find any spares which have since been
+ * repurposed as active spares. If this is the case, update
+ * their status appropriately.
+ */
+ for (i = 0; i < nspares; i++) {
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &guid) == 0);
+ if (spa_spare_exists(guid, &pool, NULL) &&
+ pool != 0ULL) {
+ VERIFY(nvlist_lookup_uint64_array(
+ spares[i], ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &vsc) == 0);
+ vs->vs_state = VDEV_STATE_CANT_OPEN;
+ vs->vs_aux = VDEV_AUX_SPARED;
+ }
+ }
+ }
+}
+
+/*
+ * Add l2cache device information to the nvlist, including vdev stats.
+ */
+static void
+spa_add_l2cache(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t **l2cache;
+ uint_t i, j, nl2cache;
+ nvlist_t *nvroot;
+ uint64_t guid;
+ vdev_t *vd;
+ vdev_stat_t *vs;
+ uint_t vsc;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ if (spa->spa_l2cache.sav_count == 0)
+ return;
+
+ VERIFY(nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+ if (nl2cache != 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+
+ /*
+ * Update level 2 cache device stats.
+ */
+
+ for (i = 0; i < nl2cache; i++) {
+ VERIFY(nvlist_lookup_uint64(l2cache[i],
+ ZPOOL_CONFIG_GUID, &guid) == 0);
+
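+ /* Find the in-core l2cache vdev with this guid. */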
+ vd = NULL;
+ for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
+ if (guid ==
+ spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
+ vd = spa->spa_l2cache.sav_vdevs[j];
+ break;
+ }
+ }
+ ASSERT(vd != NULL);
+
+ VERIFY(nvlist_lookup_uint64_array(l2cache[i],
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+ == 0);
+ vdev_get_stats(vd, vs);
+ vdev_config_generate_stats(vd, l2cache[i]);
+
+ }
+ }
+}
+
+static void
+spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ if (spa->spa_feat_for_read_obj != 0) {
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_feat_for_read_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+ za.za_num_integers == 1);
+ VERIFY0(nvlist_add_uint64(features, za.za_name,
+ za.za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ if (spa->spa_feat_for_write_obj != 0) {
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_feat_for_write_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+ za.za_num_integers == 1);
+ VERIFY0(nvlist_add_uint64(features, za.za_name,
+ za.za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+ }
+}
+
+static void
+spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
+{
+ int i;
+
+ for (i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t feature = spa_feature_table[i];
+ uint64_t refcount;
+
+ if (feature_get_refcount(spa, &feature, &refcount) != 0)
+ continue;
+
+ VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
+ }
+}
+
+/*
+ * Store a list of pool features and their reference counts in the
+ * config.
+ *
+ * The first time this is called on a spa, allocate a new nvlist, fetch
+ * the pool features and reference counts from disk, then save the list
+ * in the spa. In subsequent calls on the same spa use the saved nvlist
+ * and refresh its values from the cached reference counts. This
+ * ensures we don't block here on I/O on a suspended pool so 'zpool
+ * clear' can resume the pool.
+ */
+static void
+spa_add_feature_stats(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t *features;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ mutex_enter(&spa->spa_feat_stats_lock);
+ features = spa->spa_feat_stats;
+
+ if (features != NULL) {
+ spa_feature_stats_from_cache(spa, features);
+ } else {
+ VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
+ spa->spa_feat_stats = features;
+ spa_feature_stats_from_disk(spa, features);
+ }
+
+ VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
+ features));
+
+ mutex_exit(&spa->spa_feat_stats_lock);
+}
+
+int
+spa_get_stats(const char *name, nvlist_t **config,
+ char *altroot, size_t buflen)
+{
+ int error;
+ spa_t *spa;
+
+ *config = NULL;
+ error = spa_open_common(name, &spa, FTAG, NULL, config);
+
+ if (spa != NULL) {
+ /*
+ * This still leaves a window of inconsistency where the spares
+ * or l2cache devices could change and the config would be
+ * self-inconsistent.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ if (*config != NULL) {
+ uint64_t loadtimes[2];
+
+ loadtimes[0] = spa->spa_loaded_ts.tv_sec;
+ loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
+ VERIFY(nvlist_add_uint64_array(*config,
+ ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
+
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_ERRCOUNT,
+ spa_get_errlog_size(spa)) == 0);
+
+ if (spa_suspended(spa)) {
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_SUSPENDED,
+ spa->spa_failmode) == 0);
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_SUSPENDED_REASON,
+ spa->spa_suspended) == 0);
+ }
+
+ spa_add_spares(spa, *config);
+ spa_add_l2cache(spa, *config);
+ spa_add_feature_stats(spa, *config);
+ }
+ }
+
+ /*
+ * We want to get the alternate root even for faulted pools, so we cheat
+ * and call spa_lookup() directly.
+ */
+ if (altroot) {
+ if (spa == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(name);
+ if (spa)
+ spa_altroot(spa, altroot, buflen);
+ else
+ altroot[0] = '\0';
+ spa = NULL;
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ spa_altroot(spa, altroot, buflen);
+ }
+ }
+
+ if (spa != NULL) {
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ spa_close(spa, FTAG);
+ }
+
+ return (error);
+}
+
+/*
+ * Validate that the auxiliary device array is well formed. We must have an
+ * array of nvlists, each of which describes a valid leaf vdev. If this is an
+ * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
+ * specified, as long as they are well-formed.
+ */
+static int
+spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
+ spa_aux_vdev_t *sav, const char *config, uint64_t version,
+ vdev_labeltype_t label)
+{
+ nvlist_t **dev;
+ uint_t i, ndev;
+ vdev_t *vd;
+ int error;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /*
+ * It's acceptable to have no devs specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
+ return (0);
+
+ if (ndev == 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Make sure the pool is formatted with a version that supports this
+ * device type.
+ */
+ if (spa_version(spa) < version)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Set the pending device list so we correctly handle device in-use
+ * checking.
+ */
+ sav->sav_pending = dev;
+ sav->sav_npending = ndev;
+
+ for (i = 0; i < ndev; i++) {
+ if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
+ mode)) != 0)
+ goto out;
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ vdev_free(vd);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
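+ /*
+ * Each aux device is treated as its own top-level vdev for the
+ * open and label-init calls below.
+ */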
+ vd->vdev_top = vd;
+
+ if ((error = vdev_open(vd)) == 0 &&
+ (error = vdev_label_init(vd, crtxg, label)) == 0) {
+ VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ }
+
+ vdev_free(vd);
+
+ if (error &&
+ (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
+ goto out;
+ else
+ error = 0;
+ }
+
+out:
+ sav->sav_pending = NULL;
+ sav->sav_npending = 0;
+ return (error);
+}
+
+static int
+spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
+{
+ int error;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+ &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
+ VDEV_LABEL_SPARE)) != 0) {
+ return (error);
+ }
+
+ return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+ &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
+ VDEV_LABEL_L2CACHE));
+}
+
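+/*
+ * Record the given aux devices (spares or l2cache, per 'config') in
+ * sav_config, appending to any device list already present.
+ */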
+static void
+spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
+ const char *config)
+{
+ int i;
+
+ if (sav->sav_config != NULL) {
+ nvlist_t **olddevs;
+ uint_t oldndevs;
+ nvlist_t **newdevs;
+
+ /*
+ * Generate new dev list by concatenating with the
+ * current dev list.
+ */
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
+ &olddevs, &oldndevs) == 0);
+
+ newdevs = kmem_alloc(sizeof (void *) *
+ (ndevs + oldndevs), KM_SLEEP);
+ for (i = 0; i < oldndevs; i++)
+ VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
+ KM_SLEEP) == 0);
+ for (i = 0; i < ndevs; i++)
+ VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
+ KM_SLEEP) == 0);
+
+ VERIFY(nvlist_remove(sav->sav_config, config,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ config, newdevs, ndevs + oldndevs) == 0);
+ for (i = 0; i < oldndevs + ndevs; i++)
+ nvlist_free(newdevs[i]);
+ kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
+ } else {
+ /*
+ * Generate a new dev list.
+ */
+ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
+ devs, ndevs) == 0);
+ }
+}
+
+/*
+ * Stop and drop level 2 ARC devices
+ */
+void
+spa_l2cache_drop(spa_t *spa)
+{
+ vdev_t *vd;
+ int i;
+ spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+ for (i = 0; i < sav->sav_count; i++) {
+ uint64_t pool;
+
+ vd = sav->sav_vdevs[i];
+ ASSERT(vd != NULL);
+
+ if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL && l2arc_vdev_present(vd))
+ l2arc_remove_vdev(vd);
+ }
+}
+
+/*
+ * Verify encryption parameters for spa creation. If we are encrypting, we must
+ * have the encryption feature flag enabled.
+ */
+static int
+spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
+ boolean_t has_encryption)
+{
+ if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
+ dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
+ !has_encryption)
+ return (SET_ERROR(ENOTSUP));
+
+ return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
+}
+
+/*
+ * Pool Creation
+ */
+int
+spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
+ nvlist_t *zplprops, dsl_crypto_params_t *dcp)
+{
+ spa_t *spa;
+ char *altroot = NULL;
+ vdev_t *rvd;
+ dsl_pool_t *dp;
+ dmu_tx_t *tx;
+ int error = 0;
+ uint64_t txg = TXG_INITIAL;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+ uint64_t version, obj, ndraid = 0;
+ boolean_t has_features;
+ boolean_t has_encryption;
+ boolean_t has_allocclass;
+ spa_feature_t feat;
+ char *feat_name;
+ char *poolname;
+ nvlist_t *nvl;
+
+ if (props == NULL ||
+ nvlist_lookup_string(props, "tname", &poolname) != 0)
+ poolname = (char *)pool;
+
+ /*
+ * If this pool already exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(poolname) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /*
+ * Allocate a new spa_t structure.
+ */
+ nvl = fnvlist_alloc();
+ fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+ spa = spa_add(poolname, nvl, altroot);
+ fnvlist_free(nvl);
+ spa_activate(spa, spa_mode_global);
+
+ if (props && (error = spa_prop_validate(spa, props))) {
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Temporary pool names should never be written to disk.
+ */
+ if (poolname != pool)
+ spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
+
+ has_features = B_FALSE;
+ has_encryption = B_FALSE;
+ has_allocclass = B_FALSE;
+ for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
+ elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
+ if (zpool_prop_feature(nvpair_name(elem))) {
+ has_features = B_TRUE;
+
+ feat_name = strchr(nvpair_name(elem), '@') + 1;
+ VERIFY0(zfeature_lookup_name(feat_name, &feat));
+ if (feat == SPA_FEATURE_ENCRYPTION)
+ has_encryption = B_TRUE;
+ if (feat == SPA_FEATURE_ALLOCATION_CLASSES)
+ has_allocclass = B_TRUE;
+ }
+ }
+
+ /* verify encryption params, if they were provided */
+ if (dcp != NULL) {
+ error = spa_create_check_encryption_params(dcp, has_encryption);
+ if (error != 0) {
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+ }
+ if (!has_allocclass && zfs_special_devs(nvroot, NULL)) {
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (ENOTSUP);
+ }
+
+ if (has_features || nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
+ version = SPA_VERSION;
+ }
+ ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+
+ spa->spa_first_txg = txg;
+ spa->spa_uberblock.ub_txg = txg - 1;
+ spa->spa_uberblock.ub_version = version;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_load_state = SPA_LOAD_CREATE;
+ spa->spa_removing_phys.sr_state = DSS_NONE;
+ spa->spa_removing_phys.sr_removing_vdev = -1;
+ spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
+ spa->spa_indirect_vdevs_loaded = B_TRUE;
+
+ /*
+ * Create "The Godfather" zio to hold all async IOs
+ */
+ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+ KM_SLEEP);
+ for (int i = 0; i < max_ncpus; i++) {
+ spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+ }
+
+ /*
+ * Create the root vdev.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+ ASSERT(error != 0 || rvd != NULL);
+ ASSERT(error != 0 || spa->spa_root_vdev == rvd);
+
+ if (error == 0 && !zfs_allocatable_devs(nvroot))
+ error = SET_ERROR(EINVAL);
+
+ if (error == 0 &&
+ (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
+ (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
+ (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
+ /*
+ * Instantiate the metaslab groups (this will dirty the vdevs);
+ * we can no longer error exit past this point.
+ */
+ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ vdev_metaslab_set_size(vd);
+ vdev_expand(vd, txg);
+ }
+ }
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Get the list of spares, if specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_spares(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ /*
+ * Get the list of level 2 cache devices, if specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
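+ /*
+ * Create the DSL pool. The spa_is_initializing flag marks the
+ * window during which the pool's datasets are being set up.
+ */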
+ spa->spa_is_initializing = B_TRUE;
+ spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
+ spa->spa_is_initializing = B_FALSE;
+
+ /*
+ * Create DDTs (dedup tables).
+ */
+ ddt_create(spa);
+
+ spa_update_dspace(spa);
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * Create the pool's history object.
+ */
+ if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history)
+ spa_history_create_obj(spa, tx);
+
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
+ spa_history_log_version(spa, "create", tx);
+
+ /*
+ * Create the pool config object.
+ */
+ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
+ DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+ sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool config");
+ }
+
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
+ sizeof (uint64_t), 1, &version, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool version");
+ }
+
+ /* Newly created pools with the right version are always deflated. */
+ if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
+ spa->spa_deflate = TRUE;
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add deflate");
+ }
+ }
+
+ /*
+ * Create the deferred-free bpobj. Turn off compression
+ * because sync-to-convergence takes longer if the blocksize
+ * keeps changing.
+ */
+ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
+ dmu_object_set_compress(spa->spa_meta_objset, obj,
+ ZIO_COMPRESS_OFF, tx);
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
+ sizeof (uint64_t), 1, &obj, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add bpobj");
+ }
+ VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
+ spa->spa_meta_objset, obj));
+
+ /*
+ * Generate some random noise for salted checksums to operate on.
+ */
+ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+ sizeof (spa->spa_cksum_salt.zcs_bytes));
+
+ /*
+ * Set pool properties.
+ */
+ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
+ spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
+ spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
+ spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
+ spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
+ spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
+
+ if (props != NULL) {
+ spa_configfile_set(spa, props, B_FALSE);
+ spa_sync_props(props, tx);
+ }
+
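+ /* Account for any dRAID vdevs created above. */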
+ for (int i = 0; i < ndraid; i++)
+ spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
+
+ dmu_tx_commit(tx);
+
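+ /*
+ * Start the sync and MMP threads, then wait for the initial txg
+ * to be synced to disk.
+ */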
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(dp);
+ mmp_thread_start(spa);
+ txg_wait_synced(dp, txg);
+
+ spa_spawn_aux_threads(spa);
+
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+
+ /*
+ * Don't count references from objsets that are already closed
+ * and are making their way through the eviction process.
+ */
+ spa_evicting_os_wait(spa);
+ spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
+ spa->spa_load_state = SPA_LOAD_NONE;
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Import a non-root pool into the system.
+ */
+int
+spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
+{
+ spa_t *spa;
+ char *altroot = NULL;
+ spa_load_state_t state = SPA_LOAD_IMPORT;
+ zpool_load_policy_t policy;
+ spa_mode_t mode = spa_mode_global;
+ uint64_t readonly = B_FALSE;
+ int error;
+ nvlist_t *nvroot;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+
+ /*
+ * If a pool with this name exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+ (void) nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
+ if (readonly)
+ mode = SPA_MODE_READ;
+ spa = spa_add(pool, config, altroot);
+ spa->spa_import_flags = flags;
+
+ /*
+ * Verbatim import - Take a pool and insert it into the namespace
+ * as if it had been loaded at boot.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
+ if (props != NULL)
+ spa_configfile_set(spa, props, B_FALSE);
+
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
+ zfs_dbgmsg("spa_import: verbatim import of %s", pool);
+ mutex_exit(&spa_namespace_lock);
+ return (0);
+ }
+
+ spa_activate(spa, mode);
+
+ /*
+ * Don't start async tasks until we know everything is healthy.
+ */
+ spa_async_suspend(spa);
+
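+ /*
+ * Honor the caller's load policy; a rewind request switches the
+ * import into recovery mode.
+ */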
+ zpool_get_load_policy(config, &policy);
+ if (policy.zlp_rewind & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
+
+ spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
+
+ if (state != SPA_LOAD_RECOVER) {
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+ zfs_dbgmsg("spa_import: importing %s", pool);
+ } else {
+ zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
+ "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
+ }
+ error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
+
+ /*
+ * Propagate anything learned while loading the pool and pass it
+ * back to caller (i.e. rewind info, missing devices, etc).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ /*
+ * Toss any existing sparelist, as it doesn't have any validity
+ * anymore, and conflicts with spa_has_spare().
+ */
+ if (spa->spa_spares.sav_config) {
+ nvlist_free(spa->spa_spares.sav_config);
+ spa->spa_spares.sav_config = NULL;
+ spa_load_spares(spa);
+ }
+ if (spa->spa_l2cache.sav_config) {
+ nvlist_free(spa->spa_l2cache.sav_config);
+ spa->spa_l2cache.sav_config = NULL;
+ spa_load_l2cache(spa);
+ }
+
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (props != NULL)
+ spa_configfile_set(spa, props, B_FALSE);
+
+ if (error != 0 || (props && spa_writeable(spa) &&
+ (error = spa_prop_set(spa, props)))) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ spa_async_resume(spa);
+
+ /*
+ * Override any spares and level 2 cache devices as specified by
+ * the user, as these may have correct device names/devids, etc.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ if (spa->spa_spares.sav_config)
+ VERIFY(nvlist_remove(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
+ else
+ VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_spares(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ if (spa->spa_l2cache.sav_config)
+ VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
+ else
+ VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ /*
+ * Check for any removed devices.
+ */
+ if (spa->spa_autoreplace) {
+ spa_aux_check_removed(&spa->spa_spares);
+ spa_aux_check_removed(&spa->spa_l2cache);
+ }
+
+ if (spa_writeable(spa)) {
+ /*
+ * Update the config cache to include the newly-imported pool.
+ */
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ }
+
+ /*
+ * It's possible that the pool was expanded while it was exported.
+ * We kick off an async task to handle this for us.
+ */
+ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
+
+ spa_history_log_version(spa, "import", NULL);
+
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
+
+ mutex_exit(&spa_namespace_lock);
+
+ zvol_create_minors_recursive(pool);
+
+ return (0);
+}
+
+nvlist_t *
+spa_tryimport(nvlist_t *tryconfig)
+{
+ nvlist_t *config = NULL;
+ char *poolname, *cachefile;
+ spa_t *spa;
+ uint64_t state;
+ int error;
+ zpool_load_policy_t policy;
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
+ return (NULL);
+
+ if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
+ return (NULL);
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
+ spa_activate(spa, SPA_MODE_READ);
+
+ /*
+ * Rewind pool if a max txg was provided.
+ */
+ zpool_get_load_policy(spa->spa_config, &policy);
+ if (policy.zlp_txg != UINT64_MAX) {
+ spa->spa_load_max_txg = policy.zlp_txg;
+ spa->spa_extreme_rewind = B_TRUE;
+ zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
+ poolname, (longlong_t)policy.zlp_txg);
+ } else {
+ zfs_dbgmsg("spa_tryimport: importing %s", poolname);
+ }
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
+ == 0) {
+ zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
+ spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
+ } else {
+ spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
+ }
+
+ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
+
+ /*
+ * If 'tryconfig' was at least parsable, return the current config.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ poolname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ state) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
+ spa->spa_uberblock.ub_timestamp) == 0);
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
+ spa->spa_errata) == 0);
+
+ /*
+ * If the bootfs property exists on this pool then we
+ * copy it out so that external consumers can tell which
+ * pools are bootable.
+ */
+ if ((!error || error == EEXIST) && spa->spa_bootfs) {
+ char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ /*
+ * We have to play games with the name since the
+ * pool was opened as TRYIMPORT_NAME.
+ */
+ if (dsl_dsobj_to_dsname(spa_name(spa),
+ spa->spa_bootfs, tmpname) == 0) {
+ char *cp;
+ char *dsname;
+
+ dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ cp = strchr(tmpname, '/');
+ if (cp == NULL) {
+ (void) strlcpy(dsname, tmpname,
+ MAXPATHLEN);
+ } else {
+ (void) snprintf(dsname, MAXPATHLEN,
+ "%s/%s", poolname, ++cp);
+ }
+ VERIFY(nvlist_add_string(config,
+ ZPOOL_CONFIG_BOOTFS, dsname) == 0);
+ kmem_free(dsname, MAXPATHLEN);
+ }
+ kmem_free(tmpname, MAXPATHLEN);
+ }
+
+ /*
+ * Add the list of hot spares and level 2 cache devices.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_add_spares(spa, config);
+ spa_add_l2cache(spa, config);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+
+ return (config);
+}
+
+/*
+ * Pool export/destroy
+ *
+ * The act of destroying or exporting a pool is very simple. We make sure there
+ * is no more pending I/O and any references to the pool are gone. Then, we
+ * update the pool state and sync all the labels to disk, removing the
+ * configuration from the cache afterwards. If the 'hardforce' flag is set, then
+ * we don't sync the labels or remove the configuration cache.
+ */
+static int
+spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
+ boolean_t force, boolean_t hardforce)
+{
+ int error;
+ spa_t *spa;
+
+ if (oldconfig)
+ *oldconfig = NULL;
+
+ if (!(spa_mode_global & SPA_MODE_WRITE))
+ return (SET_ERROR(EROFS));
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pool)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (spa->spa_is_exporting) {
+ /* the pool is being exported by another thread */
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
+ }
+ spa->spa_is_exporting = B_TRUE;
+
+ /*
+ * Put a hold on the pool, drop the namespace lock, stop async tasks,
+ * reacquire the namespace lock, and see if we can export.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ if (spa->spa_zvol_taskq) {
+ zvol_remove_minors(spa, spa_name(spa), B_TRUE);
+ taskq_wait(spa->spa_zvol_taskq);
+ }
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ goto export_spa;
+ /*
+ * The pool will be in core if it's openable, in which case we can
+ * modify its state. Objsets may be open only because they're dirty,
+ * so we have to force the pool to sync before checking spa_refcnt.
+ */
+ if (spa->spa_sync_on) {
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ spa_evicting_os_wait(spa);
+ }
+
+ /*
+ * A pool cannot be exported or destroyed if there are active
+ * references. If we are resetting a pool, allow references by
+ * fault injection handlers.
+ */
+ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
+ error = SET_ERROR(EBUSY);
+ goto fail;
+ }
+
+ if (spa->spa_sync_on) {
+ /*
+ * A pool cannot be exported if it has an active shared spare.
+ * This is to prevent other pools stealing the active spare
+ * from an exported pool. At the user's discretion, such a pool
+ * can be forcibly exported.
+ */
+ if (!force && new_state == POOL_STATE_EXPORTED &&
+ spa_has_active_shared_spare(spa)) {
+ error = SET_ERROR(EXDEV);
+ goto fail;
+ }
+
+ /*
+ * We're about to export or destroy this pool. Make sure
+ * we stop all initialization and trim activity here before
+ * we set the spa_final_txg. This will ensure that all
+ * dirty data resulting from the initialization is
+ * committed to disk before we unload the pool.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
+ vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
+ vdev_autotrim_stop_all(spa);
+ vdev_rebuild_stop_all(spa);
+ }
+
+ /*
+ * We want this to be reflected on every label,
+ * so mark them all dirty. spa_unload() will do the
+ * final sync that pushes these changes out.
+ */
+ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa->spa_state = new_state;
+ spa->spa_final_txg = spa_last_synced_txg(spa) +
+ TXG_DEFER_SIZE + 1;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ }
+ }
+
+export_spa:
+ if (new_state == POOL_STATE_DESTROYED)
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
+ else if (new_state == POOL_STATE_EXPORTED)
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+
+ if (oldconfig && spa->spa_config)
+ VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
+
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ if (!hardforce)
+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_remove(spa);
+ } else {
+ /*
+ * If spa_remove() is not called for this spa_t and
+ * there is any possibility that it can be reused,
+ * we make sure to reset the exporting flag.
+ */
+ spa->spa_is_exporting = B_FALSE;
+ }
+
+ mutex_exit(&spa_namespace_lock);
+ return (0);
+
+fail:
+ spa->spa_is_exporting = B_FALSE;
+ spa_async_resume(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+}
+
+/*
+ * Destroy a storage pool.
+ */
+int
+spa_destroy(const char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
+ B_FALSE, B_FALSE));
+}
+
+/*
+ * Export a storage pool.
+ */
+int
+spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
+ boolean_t hardforce)
+{
+ return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
+ force, hardforce));
+}
+
+/*
+ * Similar to spa_export(), this unloads the spa_t without actually removing it
+ * from the namespace in any way.
+ */
+int
+spa_reset(const char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
+ B_FALSE, B_FALSE));
+}
+
+/*
+ * ==========================================================================
+ * Device manipulation
+ * ==========================================================================
+ */
+
+/*
+ * This is called as a synctask to increment the draid feature flag
+ */
+static void
+spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ int draid = (int)(uintptr_t)arg;
+
+ for (int c = 0; c < draid; c++)
+ spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
+}
+
+/*
+ * Add a device to a storage pool.
+ */
+int
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+{
+ uint64_t txg, ndraid = 0;
+ int error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *tvd;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
+ VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
+ &nspares) != 0)
+ nspares = 0;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
+ &nl2cache) != 0)
+ nl2cache = 0;
+
+ if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+
+ if (vd->vdev_children != 0 &&
+ (error = vdev_create(vd, txg, B_FALSE)) != 0) {
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
+
+ /*
+ * The virtual dRAID spares must be added after vdev tree is created
+ * and the vdev guids are generated. The guid of their associated
+ * dRAID is stored in the config and used when opening the spare.
+ */
+ if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
+ rvd->vdev_children)) == 0) {
+ if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
+ nspares = 0;
+ } else {
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
+
+ /*
+ * We must validate the spares and l2cache devices after checking the
+ * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
+ */
+ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, vd, txg, error));
+
+ /*
+ * If we are in the middle of a device removal, we can only add
+ * devices which match the existing devices in the pool.
+ * If we are in the middle of a removal, or have some indirect
+ * vdevs, we cannot add raidz or dRAID top levels.
+ */
+ if (spa->spa_vdev_removal != NULL ||
+ spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ if (spa->spa_vdev_removal != NULL &&
+ tvd->vdev_ashift != spa->spa_max_ashift) {
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+ }
+ /* Fail if top level vdev is raidz or a dRAID */
+ if (vdev_get_nparity(tvd) != 0)
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+
+ /*
+ * Need the top level mirror to be
+ * a mirror of leaf vdevs only
+ */
+ if (tvd->vdev_ops == &vdev_mirror_ops) {
+ for (uint64_t cid = 0;
+ cid < tvd->vdev_children; cid++) {
+ vdev_t *cvd = tvd->vdev_child[cid];
+ if (!cvd->vdev_ops->vdev_op_leaf) {
+ return (spa_vdev_exit(spa, vd,
+ txg, EINVAL));
+ }
+ }
+ }
+ }
+ }
+
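+ /*
+ * Transfer each new top-level vdev from the temporary root 'vd'
+ * to the pool's root vdev.
+ */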
+ for (int c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ vdev_remove_child(vd, tvd);
+ tvd->vdev_id = rvd->vdev_children;
+ vdev_add_child(rvd, tvd);
+ vdev_config_dirty(tvd);
+ }
+
+ if (nspares != 0) {
+ spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
+ ZPOOL_CONFIG_SPARES);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ if (nl2cache != 0) {
+ spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
+ ZPOOL_CONFIG_L2CACHE);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ /*
+ * We can't increment a feature while holding spa_vdev so we
+ * have to do it in a synctask.
+ */
+ if (ndraid != 0) {
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
+ (void *)(uintptr_t)ndraid, tx);
+ dmu_tx_commit(tx);
+ }
+
+ /*
+ * We have to be careful when adding new vdevs to an existing pool.
+ * If other threads start allocating from these vdevs before we
+ * sync the config cache, and we lose power, then upon reboot we may
+ * fail to open the pool because there are DVAs that the config cache
+ * can't translate. Therefore, we first add the vdevs without
+ * initializing metaslabs; sync the config cache (via spa_vdev_exit());
+ * and then let spa_config_update() initialize the new metaslabs.
+ *
+ * spa_load() checks for added-but-not-initialized vdevs, so that
+ * if we lose power at any point in this sequence, the remaining
+ * steps will be completed the next time we load the pool.
+ */
+ (void) spa_vdev_exit(spa, vd, txg, 0);
+
+ mutex_enter(&spa_namespace_lock);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Attach a device to a mirror. The arguments are the guid of any device
+ * in the mirror, and the nvroot for the new device. If the guid identifies
+ * a device that is not mirrored, we automatically insert the mirror vdev.
+ *
+ * If 'replacing' is specified, the new device is intended to replace the
+ * existing device; in this case the two devices are made into their own
+ * mirror using the 'replacing' vdev, which is functionally identical to
+ * the mirror vdev (it actually reuses all the same ops) but has a few
+ * extra rules: you can't attach to it after it's been created, and upon
+ * completion of resilvering, the first disk (the one being replaced)
+ * is automatically detached.
+ *
+ * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
+ * should be performed instead of traditional healing reconstruction. From
+ * an administrator's perspective these are both resilver operations.
+ */
+int
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
+ int rebuild)
+{
+ uint64_t txg, dtl_max_txg;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
+ vdev_ops_t *pvops;
+ char *oldvdpath, *newvdpath;
+ int newvd_isspare;
+ int error;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ if (rebuild) {
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ if (dsl_scan_resilvering(spa_get_dsl(spa)))
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_RESILVER_IN_PROGRESS));
+ } else {
+ if (vdev_rebuild_active(rvd))
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_REBUILD_IN_PROGRESS));
+ }
+
+ if (spa->spa_vdev_removal != NULL)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ if (oldvd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!oldvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = oldvd->vdev_parent;
+
+ if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
+ VDEV_ALLOC_ATTACH)) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ if (newrootvd->vdev_children != 1)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ newvd = newrootvd->vdev_child[0];
+
+ if (!newvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, error));
+
+ /*
+ * Spares can't replace logs
+ */
+ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * A dRAID spare can only replace a child of its parent dRAID vdev.
+ */
+ if (newvd->vdev_ops == &vdev_draid_spare_ops &&
+ oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ }
+
+ if (rebuild) {
+ /*
+ * For rebuilds, the top vdev must support reconstruction
+ * using only space maps. This means the only allowable
+ * vdev types are the root vdev, a mirror, or dRAID.
+ */
+ tvd = pvd;
+ if (pvd->vdev_top != NULL)
+ tvd = pvd->vdev_top;
+
+ if (tvd->vdev_ops != &vdev_mirror_ops &&
+ tvd->vdev_ops != &vdev_root_ops &&
+ tvd->vdev_ops != &vdev_draid_ops) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ }
+ }
+
+ if (!replacing) {
+ /*
+ * For attach, the only allowable parent is a mirror or the root
+ * vdev.
+ */
+ if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_root_ops)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ pvops = &vdev_mirror_ops;
+ } else {
+ /*
+ * Active hot spares can only be replaced by inactive hot
+ * spares.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops &&
+ oldvd->vdev_isspare &&
+ !spa_has_spare(spa, newvd->vdev_guid))
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * If the source is a hot spare, and the parent isn't already a
+ * spare, then we want to create a new hot spare. Otherwise, we
+ * want to create a replacing vdev. The user is not allowed to
+ * attach to a spared vdev child unless the 'isspare' state is
+ * the same (spare replaces spare, non-spare replaces
+ * non-spare).
+ */
+ if (pvd->vdev_ops == &vdev_replacing_ops &&
+ spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ } else if (pvd->vdev_ops == &vdev_spare_ops &&
+ newvd->vdev_isspare != oldvd->vdev_isspare) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ }
+
+ if (newvd->vdev_isspare)
+ pvops = &vdev_spare_ops;
+ else
+ pvops = &vdev_replacing_ops;
+ }
+
+ /*
+ * Make sure the new device is big enough.
+ */
+ if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
+ return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
+
+ /*
+ * The new device cannot have a higher alignment requirement
+ * than the top-level vdev.
+ */
+ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * If this is an in-place replacement, update oldvd's path and devid
+ * to make it distinguishable from newvd, and unopenable from now on.
+ */
+ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ spa_strfree(oldvd->vdev_path);
+ oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ KM_SLEEP);
+ (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5,
+ "%s/%s", newvd->vdev_path, "old");
+ if (oldvd->vdev_devid != NULL) {
+ spa_strfree(oldvd->vdev_devid);
+ oldvd->vdev_devid = NULL;
+ }
+ }
+
+ /*
+ * If the parent is not a mirror, or if we're replacing, insert the new
+ * mirror/replacing/spare vdev above oldvd.
+ */
+ if (pvd->vdev_ops != pvops)
+ pvd = vdev_add_parent(oldvd, pvops);
+
+ ASSERT(pvd->vdev_top->vdev_parent == rvd);
+ ASSERT(pvd->vdev_ops == pvops);
+ ASSERT(oldvd->vdev_parent == pvd);
+
+ /*
+ * Extract the new device from its root and add it to pvd.
+ */
+ vdev_remove_child(newrootvd, newvd);
+ newvd->vdev_id = pvd->vdev_children;
+ newvd->vdev_crtxg = oldvd->vdev_crtxg;
+ vdev_add_child(pvd, newvd);
+
+ /*
+ * Reevaluate the parent vdev state.
+ */
+ vdev_propagate_state(pvd);
+
+ tvd = newvd->vdev_top;
+ ASSERT(pvd->vdev_top == tvd);
+ ASSERT(tvd->vdev_parent == rvd);
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
+ * for any dmu_sync-ed blocks. It will propagate upward when
+ * spa_vdev_exit() calls vdev_dtl_reassess().
+ */
+ dtl_max_txg = txg + TXG_CONCURRENT_STATES;
+
+ vdev_dtl_dirty(newvd, DTL_MISSING,
+ TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
+
+ if (newvd->vdev_isspare) {
+ spa_spare_activate(newvd);
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
+ }
+
+ oldvdpath = spa_strdup(oldvd->vdev_path);
+ newvdpath = spa_strdup(newvd->vdev_path);
+ newvd_isspare = newvd->vdev_isspare;
+
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, newvd, txg);
+
+ /*
+ * Schedule the resilver or rebuild to restart in the future. We do
+ * this to ensure that dmu_sync-ed blocks have been stitched into the
+ * respective datasets.
+ */
+ if (rebuild) {
+ newvd->vdev_rebuild_txg = txg;
+
+ vdev_rebuild(tvd);
+ } else {
+ newvd->vdev_resilver_txg = txg;
+
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
+ vdev_defer_resilver(newvd);
+ } else {
+ dsl_scan_restart_resilver(spa->spa_dsl_pool,
+ dtl_max_txg);
+ }
+ }
+
+ if (spa->spa_bootfs)
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
+
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
+
+ /*
+ * Commit the config
+ */
+ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
+
+ spa_history_log_internal(spa, "vdev attach", NULL,
+ "%s vdev=%s %s vdev=%s",
+ replacing && newvd_isspare ? "spare in" :
+ replacing ? "replace" : "attach", newvdpath,
+ replacing ? "for" : "to", oldvdpath);
+
+ spa_strfree(oldvdpath);
+ spa_strfree(newvdpath);
+
+ return (0);
+}
+
+/*
+ * Detach a device from a mirror or replacing vdev.
+ *
+ * If 'replace_done' is specified, only detach if the parent
+ * is a replacing vdev.
+ */
+int
+spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
+{
+ uint64_t txg;
+ int error;
+ vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
+ vdev_t *vd, *pvd, *cvd, *tvd;
+ boolean_t unspare = B_FALSE;
+ uint64_t unspare_guid = 0;
+ char *vdpath;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_detach_enter(spa, guid);
+
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ /*
+ * Besides being called directly from the userland through the
+ * ioctl interface, spa_vdev_detach() can be potentially called
+ * at the end of spa_vdev_resilver_done().
+ *
+ * In the regular case, when we have a checkpoint this shouldn't
+ * happen as we never empty the DTLs of a vdev during the scrub
+ * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
+ * should never get here when we have a checkpoint.
+ *
+ * That said, even in a case when we checkpoint the pool exactly
+ * as spa_vdev_resilver_done() calls this function everything
+ * should be fine as the resilver will return right away.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ if (vd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = vd->vdev_parent;
+
+ /*
+ * If the parent/child relationship is not as expected, don't do it.
+ * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
+ * vdev that's replacing B with C. The user's intent in replacing
+ * is to go from M(A,B) to M(A,C). If the user decides to cancel
+ * the replace by detaching C, the expected behavior is to end up
+ * M(A,B). But suppose that right after deciding to detach C,
+ * the replacement of B completes. We would have M(A,C), and then
+ * ask to detach C, which would leave us with just A -- not what
+ * the user wanted. To prevent this, we make sure that the
+ * parent/child relationship hasn't changed -- in this example,
+ * that C's parent is still the replacing vdev R.
+ */
+ if (pvd->vdev_guid != pguid && pguid != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * Only 'replacing' or 'spare' vdevs can be replaced.
+ */
+ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
+ spa_version(spa) >= SPA_VERSION_SPARES);
+
+ /*
+ * Only mirror, replacing, and spare vdevs support detach.
+ */
+ if (pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * If this device has the only valid copy of some data,
+ * we cannot safely detach it.
+ */
+ if (vdev_dtl_required(vd))
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ ASSERT(pvd->vdev_children >= 2);
+
+ /*
+ * If we are detaching the second disk from a replacing vdev, then
+ * check to see if we changed the original vdev's path to have "/old"
+ * at the end in spa_vdev_attach(). If so, undo that change now.
+ */
+ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
+ vd->vdev_path != NULL) {
+ size_t len = strlen(vd->vdev_path);
+
+ for (int c = 0; c < pvd->vdev_children; c++) {
+ cvd = pvd->vdev_child[c];
+
+ if (cvd == vd || cvd->vdev_path == NULL)
+ continue;
+
+ if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
+ strcmp(cvd->vdev_path + len, "/old") == 0) {
+ spa_strfree(cvd->vdev_path);
+ cvd->vdev_path = spa_strdup(vd->vdev_path);
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we are detaching the original disk from a normal spare, then it
+ * implies that the spare should become a real disk, and be removed
+ * from the active spare list for the pool. dRAID spares on the
+ * other hand are coupled to the pool and thus should never be removed
+ * from the spares list.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
+ vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
+
+ if (last_cvd->vdev_isspare &&
+ last_cvd->vdev_ops != &vdev_draid_spare_ops) {
+ unspare = B_TRUE;
+ }
+ }
+
+ /*
+ * Erase the disk labels so the disk can be used for other things.
+ * This must be done after all other error cases are handled,
+ * but before we disembowel vd (so we can still do I/O to it).
+ * But if we can't do it, don't treat the error as fatal --
+ * it may be that the unwritability of the disk is the reason
+ * it's being detached!
+ */
+ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ /*
+ * Remove vd from its parent and compact the parent's children.
+ */
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ /*
+ * Remember one of the remaining children so we can get tvd below.
+ */
+ cvd = pvd->vdev_child[pvd->vdev_children - 1];
+
+ /*
+ * If we need to remove the remaining child from the list of hot spares,
+ * do it now, marking the vdev as no longer a spare in the process.
+ * We must do this before vdev_remove_parent(), because that can
+ * change the GUID if it creates a new toplevel GUID. For a similar
+ * reason, we must remove the spare now, in the same txg as the detach;
+ * otherwise someone could attach a new sibling, change the GUID, and
+ * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
+ */
+ if (unspare) {
+ ASSERT(cvd->vdev_isspare);
+ spa_spare_remove(cvd);
+ unspare_guid = cvd->vdev_guid;
+ (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ cvd->vdev_unspare = B_TRUE;
+ }
+
+ /*
+ * If the parent mirror/replacing vdev only has one child,
+ * the parent is no longer needed. Remove it from the tree.
+ */
+ if (pvd->vdev_children == 1) {
+ if (pvd->vdev_ops == &vdev_spare_ops)
+ cvd->vdev_unspare = B_FALSE;
+ vdev_remove_parent(cvd);
+ }
+
+ /*
+ * We don't set tvd until now because the parent we just removed
+ * may have been the previous top-level vdev.
+ */
+ tvd = cvd->vdev_top;
+ ASSERT(tvd->vdev_parent == rvd);
+
+ /*
+ * Reevaluate the parent vdev state.
+ */
+ vdev_propagate_state(cvd);
+
+ /*
+ * If the 'autoexpand' property is set on the pool then automatically
+ * try to expand the size of the pool. For example if the device we
+ * just detached was smaller than the others, it may be possible to
+ * add metaslabs (i.e. grow the pool). We need to reopen the vdev
+ * first so that we can obtain the updated sizes of the leaf vdevs.
+ */
+ if (spa->spa_autoexpand) {
+ vdev_reopen(tvd);
+ vdev_expand(tvd, txg);
+ }
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
+ * vd->vdev_detached is set and free vd's DTL object in syncing context.
+ * But first make sure we're not on any *other* txg's DTL list, to
+ * prevent vd from being accessed after it's freed.
+ */
+ vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
+ for (int t = 0; t < TXG_SIZE; t++)
+ (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+ vd->vdev_detached = B_TRUE;
+ vdev_dirty(tvd, VDD_DTL, vd, txg);
+
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
+ spa_notify_waiters(spa);
+
+ /* hang on to the spa before we release the lock */
+ spa_open_ref(spa, FTAG);
+
+ error = spa_vdev_exit(spa, vd, txg, 0);
+
+ spa_history_log_internal(spa, "detach", NULL,
+ "vdev=%s", vdpath);
+ spa_strfree(vdpath);
+
+ /*
+ * If this was the removal of the original device in a hot spare vdev,
+ * then we want to go through and remove the device from the hot spare
+ * list of every other pool.
+ */
+ if (unspare) {
+ spa_t *altspa = NULL;
+
+ mutex_enter(&spa_namespace_lock);
+ while ((altspa = spa_next(altspa)) != NULL) {
+ if (altspa->spa_state != POOL_STATE_ACTIVE ||
+ altspa == spa)
+ continue;
+
+ spa_open_ref(altspa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(altspa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ /* search the rest of the vdevs for spares to remove */
+ spa_vdev_resilver_done(spa);
+ }
+
+ /* all done with the spa; OK to release */
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ return (error);
+}
+
+static int
+spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
+ list_t *vd_list)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+ /* Look up vdev and ensure it's a leaf. */
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_detached) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(ENODEV));
+ } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EINVAL));
+ } else if (!vdev_writeable(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EROFS));
+ }
+ mutex_enter(&vd->vdev_initialize_lock);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ /*
+ * When we activate an initialize action we check to see
+ * if the vdev_initialize_thread is NULL. We do this instead
+ * of using the vdev_initialize_state since there might be
+ * a previous initialization process which has completed but
+ * the thread has not yet exited.
+ */
+ if (cmd_type == POOL_INITIALIZE_START &&
+ (vd->vdev_initialize_thread != NULL ||
+ vd->vdev_top->vdev_removing)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(EBUSY));
+ } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
+ (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(ESRCH));
+ }
+
+ switch (cmd_type) {
+ case POOL_INITIALIZE_START:
+ vdev_initialize(vd);
+ break;
+ case POOL_INITIALIZE_CANCEL:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
+ break;
+ case POOL_INITIALIZE_SUSPEND:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
+ break;
+ default:
+ panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ return (0);
+}
+
+int
+spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
+ nvlist_t *vdev_errlist)
+{
+ int total_errors = 0;
+ list_t vd_list;
+
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_initialize_node));
+
+ /*
+ * We hold the namespace lock through the whole function
+ * to prevent any changes to the pool while we're starting or
+ * stopping initialization. The config and state locks are held so that
+ * we can properly assess the vdev state before we commit to
+ * the initializing operation.
+ */
+ mutex_enter(&spa_namespace_lock);
+
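+ /*
+ * Start, cancel, or suspend initialization on each requested vdev,
+ * collecting per-vdev errors in 'vdev_errlist'.
+ */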
+ for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
+ uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+ int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type,
+ &vd_list);
+ if (error != 0) {
+ char guid_as_str[MAXNAMELEN];
+
+ (void) snprintf(guid_as_str, sizeof (guid_as_str),
+ "%llu", (unsigned long long)vdev_guid);
+ fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+ total_errors++;
+ }
+ }
+
+ /* Wait for all initialize threads to stop. */
+ vdev_initialize_stop_wait(spa, &vd_list);
+
+ /* Sync out the initializing state */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ mutex_exit(&spa_namespace_lock);
+
+ list_destroy(&vd_list);
+
+ return (total_errors);
+}
+
+static int
+spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
+ uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+ /* Look up vdev and ensure it's a leaf. */
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_detached) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(ENODEV));
+ } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EINVAL));
+ } else if (!vdev_writeable(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EROFS));
+ } else if (!vd->vdev_has_trim) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EOPNOTSUPP));
+ } else if (secure && !vd->vdev_has_securetrim) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+ mutex_enter(&vd->vdev_trim_lock);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ /*
+ * When we activate a TRIM action we check to see if the
+ * vdev_trim_thread is NULL. We do this instead of using the
+ * vdev_trim_state since there might be a previous TRIM process
+ * which has completed but the thread has not yet exited.
+ */
+ if (cmd_type == POOL_TRIM_START &&
+ (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
+ mutex_exit(&vd->vdev_trim_lock);
+ return (SET_ERROR(EBUSY));
+ } else if (cmd_type == POOL_TRIM_CANCEL &&
+ (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
+ vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
+ mutex_exit(&vd->vdev_trim_lock);
+ return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_TRIM_SUSPEND &&
+ vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
+ mutex_exit(&vd->vdev_trim_lock);
+ return (SET_ERROR(ESRCH));
+ }
+
+ switch (cmd_type) {
+ case POOL_TRIM_START:
+ vdev_trim(vd, rate, partial, secure);
+ break;
+ case POOL_TRIM_CANCEL:
+ vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
+ break;
+ case POOL_TRIM_SUSPEND:
+ vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
+ break;
+ default:
+ panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+ }
+ mutex_exit(&vd->vdev_trim_lock);
+
+ return (0);
+}
+
+/*
+ * Initiates a manual TRIM for the requested vdevs. This kicks off individual
+ * TRIM threads for each child vdev. These threads pass over all of the free
+ * space in the vdev's metaslabs and issue TRIM commands for that space.
+ */
+int
+spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
+ boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
+{
+ int total_errors = 0;
+ list_t vd_list;
+
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_trim_node));
+
+ /*
+ * We hold the namespace lock through the whole function
+ * to prevent any changes to the pool while we're starting or
+ * stopping TRIM. The config and state locks are held so that
+ * we can properly assess the vdev state before we commit to
+ * the TRIM operation.
+ */
+ mutex_enter(&spa_namespace_lock);
+
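+ /*
+ * Start, cancel, or suspend TRIM on each requested vdev,
+ * collecting per-vdev errors in 'vdev_errlist'.
+ */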
+ for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
+ uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+ int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type,
+ rate, partial, secure, &vd_list);
+ if (error != 0) {
+ char guid_as_str[MAXNAMELEN];
+
+ (void) snprintf(guid_as_str, sizeof (guid_as_str),
+ "%llu", (unsigned long long)vdev_guid);
+ fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+ total_errors++;
+ }
+ }
+
+ /* Wait for all TRIM threads to stop. */
+ vdev_trim_stop_wait(spa, &vd_list);
+
+ /* Sync out the TRIM state */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ mutex_exit(&spa_namespace_lock);
+
+ list_destroy(&vd_list);
+
+ return (total_errors);
+}
+
+/*
+ * Split a set of devices from their mirrors, and create a new pool from them.
+ */
+int
+spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+ nvlist_t *props, boolean_t exp)
+{
+ int error = 0;
+ uint64_t txg, *glist;
+ spa_t *newspa;
+ uint_t c, children, lastlog;
+ nvlist_t **child, *nvl, *tmp;
+ dmu_tx_t *tx;
+ char *altroot = NULL;
+ vdev_t *rvd, **vml = NULL; /* vdev modify list */
+ boolean_t activate_slog;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ /* clear the log and flush everything up to now */
+ activate_slog = spa_passivate_log(spa);
+ (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ error = spa_reset_logs(spa);
+ txg = spa_vdev_config_enter(spa);
+
+ if (activate_slog)
+ spa_activate_log(spa);
+
+ if (error != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ /* check new spa name before going any further */
+ if (spa_lookup(newname) != NULL)
+ return (spa_vdev_exit(spa, NULL, txg, EEXIST));
+
+ /*
+ * scan through all the children to ensure they're all mirrors
+ */
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
+ nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* first, check to ensure we've got the right child count */
+ rvd = spa->spa_root_vdev;
+ lastlog = 0;
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ /* don't count the holes & logs as children */
+ if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops &&
+ !vdev_is_concrete(vd))) {
+ if (lastlog == 0)
+ lastlog = c;
+ continue;
+ }
+
+ lastlog = 0;
+ }
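+ /*
+ * The split request must cover every top-level vdev, excluding any
+ * trailing log or hole vdevs.
+ */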
+ if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* next, ensure no spare or cache devices are part of the split */
+ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
+ nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
+ glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
+
+ /* then, loop over each vdev and validate it */
+ for (c = 0; c < children; c++) {
+ uint64_t is_hole = 0;
+
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+ &is_hole);
+
+ if (is_hole != 0) {
+ if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
+ spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
+ continue;
+ } else {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ }
+
+ /* deal with indirect vdevs */
+ if (spa->spa_root_vdev->vdev_child[c]->vdev_ops ==
+ &vdev_indirect_ops)
+ continue;
+
+ /* which disk is going to be split? */
+ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
+ &glist[c]) != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ /* look it up in the spa */
+ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
+ if (vml[c] == NULL) {
+ error = SET_ERROR(ENODEV);
+ break;
+ }
+
+ /* make sure there's nothing stopping the split */
+ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
+ vml[c]->vdev_islog ||
+ !vdev_is_concrete(vml[c]) ||
+ vml[c]->vdev_isspare ||
+ vml[c]->vdev_isl2cache ||
+ !vdev_writeable(vml[c]) ||
+ vml[c]->vdev_children != 0 ||
+ vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
+ c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ if (vdev_dtl_required(vml[c]) ||
+ vdev_resilver_needed(vml[c], NULL, NULL)) {
+ error = SET_ERROR(EBUSY);
+ break;
+ }
+
+ /* we need certain info from the top level */
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
+ vml[c]->vdev_top->vdev_ms_array) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
+ vml[c]->vdev_top->vdev_ms_shift) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
+ vml[c]->vdev_top->vdev_asize) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
+ vml[c]->vdev_top->vdev_ashift) == 0);
+
+ /* transfer per-vdev ZAPs */
+ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
+ VERIFY0(nvlist_add_uint64(child[c],
+ ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
+
+ ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
+ VERIFY0(nvlist_add_uint64(child[c],
+ ZPOOL_CONFIG_VDEV_TOP_ZAP,
+ vml[c]->vdev_parent->vdev_top_zap));
+ }
+
+ if (error != 0) {
+ kmem_free(vml, children * sizeof (vdev_t *));
+ kmem_free(glist, children * sizeof (uint64_t));
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ /* stop writers from using the disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_TRUE;
+ }
+ vdev_reopen(spa->spa_root_vdev);
+
+ /*
+ * Temporarily record the splitting vdevs in the spa config. This
+ * will disappear once the config is regenerated.
+ */
+ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ glist, children) == 0);
+ kmem_free(glist, children * sizeof (uint64_t));
+
+ mutex_enter(&spa->spa_props_lock);
+ VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
+ nvl) == 0);
+ mutex_exit(&spa->spa_props_lock);
+ spa->spa_config_splitting = nvl;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ /* configure and create the new pool */
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ spa->spa_config_txg) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ spa_generate_guid(NULL)) == 0);
+ VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+
+ /* add the new pool to the namespace */
+ newspa = spa_add(newname, config, altroot);
+ newspa->spa_avz_action = AVZ_ACTION_REBUILD;
+ newspa->spa_config_txg = spa->spa_config_txg;
+ spa_set_log_state(newspa, SPA_LOG_CLEAR);
+
+ /* release the spa config lock, retaining the namespace lock */
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 1);
+
+ spa_activate(newspa, spa_mode_global);
+ spa_async_suspend(newspa);
+
+ /*
+ * Temporarily stop the initializing and TRIM activity. We set the
+ * state to ACTIVE so that we know to resume initializing or TRIM
+ * once the split has completed.
+ */
+ list_t vd_initialize_list;
+ list_create(&vd_initialize_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_initialize_node));
+
+ list_t vd_trim_list;
+ list_create(&vd_trim_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_trim_node));
+
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
+ mutex_enter(&vml[c]->vdev_initialize_lock);
+ vdev_initialize_stop(vml[c],
+ VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
+ mutex_exit(&vml[c]->vdev_initialize_lock);
+
+ mutex_enter(&vml[c]->vdev_trim_lock);
+ vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
+ mutex_exit(&vml[c]->vdev_trim_lock);
+ }
+ }
+
+ vdev_initialize_stop_wait(spa, &vd_initialize_list);
+ vdev_trim_stop_wait(spa, &vd_trim_list);
+
+ list_destroy(&vd_initialize_list);
+ list_destroy(&vd_trim_list);
+
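+ /*
+ * Note that this config came from a split and that a split is in
+ * progress before loading the new pool.
+ */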
+ newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
+ newspa->spa_is_splitting = B_TRUE;
+
+ /* create the new pool from the disks of the original pool */
+ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
+ if (error)
+ goto out;
+
+ /* if that worked, generate a real config for the new pool */
+ if (newspa->spa_root_vdev != NULL) {
+ VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
+ ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
+ spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
+ B_TRUE));
+ }
+
+ /* set the props */
+ if (props != NULL) {
+ spa_configfile_set(newspa, props, B_FALSE);
+ error = spa_prop_set(newspa, props);
+ if (error)
+ goto out;
+ }
+
+ /* flush everything */
+ txg = spa_vdev_config_enter(newspa);
+ vdev_config_dirty(newspa->spa_root_vdev);
+ (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 2);
+
+ spa_async_resume(newspa);
+
+ /* finally, update the original pool's config */
+ txg = spa_vdev_config_enter(spa);
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0)
+ dmu_tx_abort(tx);
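+ /*
+ * Detach each split leaf from the original pool's vdev tree and
+ * free its in-core state.
+ */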
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
+ vdev_t *tvd = vml[c]->vdev_top;
+
+ /*
+ * Need to be sure the detachable VDEV is not
+ * on any *other* txg's DTL list to prevent it
+ * from being accessed after it's freed.
+ */
+ for (int t = 0; t < TXG_SIZE; t++) {
+ (void) txg_list_remove_this(
+ &tvd->vdev_dtl_list, vml[c], t);
+ }
+
+ vdev_split(vml[c]);
+ if (error == 0)
+ spa_history_log_internal(spa, "detach", tx,
+ "vdev=%s", vml[c]->vdev_path);
+
+ vdev_free(vml[c]);
+ }
+ }
+ spa->spa_avz_action = AVZ_ACTION_REBUILD;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa->spa_config_splitting = NULL;
+ nvlist_free(nvl);
+ if (error == 0)
+ dmu_tx_commit(tx);
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 3);
+
+ /* split is complete; log a history record */
+ spa_history_log_internal(newspa, "split", NULL,
+ "from pool %s", spa_name(spa));
+
+ newspa->spa_is_splitting = B_FALSE;
+ kmem_free(vml, children * sizeof (vdev_t *));
+
+ /* if we're not going to mount the filesystems in userland, export */
+ if (exp)
+ error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
+ B_FALSE, B_FALSE);
+
+ return (error);
+
+out:
+ spa_unload(newspa);
+ spa_deactivate(newspa);
+ spa_remove(newspa);
+
+ txg = spa_vdev_config_enter(spa);
+
+ /* re-online all offlined disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_FALSE;
+ }
+
+ /* restart initializing or trimming disks as necessary */
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+ spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
+ spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
+
+ vdev_reopen(spa->spa_root_vdev);
+
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
+ (void) spa_vdev_exit(spa, NULL, txg, error);
+
+ kmem_free(vml, children * sizeof (vdev_t *));
+ return (error);
+}
+
+/*
+ * Find any device that's done replacing, or a vdev marked 'unspare' that's
+ * currently spared, so we can detach it.
+ */
+static vdev_t *
+spa_vdev_resilver_done_hunt(vdev_t *vd)
+{
+ vdev_t *newvd, *oldvd;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
+ if (oldvd != NULL)
+ return (oldvd);
+ }
+
+ /*
+ * Check for a completed replacement. We always consider the first
+ * vdev in the list to be the oldest vdev, and the last one to be
+ * the newest (see spa_vdev_attach() for how that works). In
+ * the case where the newest vdev is faulted, we will not automatically
+ * remove it after a resilver completes. This is OK as it will require
+ * user intervention to determine which disk the admin wishes to keep.
+ */
+ if (vd->vdev_ops == &vdev_replacing_ops) {
+ ASSERT(vd->vdev_children > 1);
+
+ newvd = vd->vdev_child[vd->vdev_children - 1];
+ oldvd = vd->vdev_child[0];
+
+ if (vdev_dtl_empty(newvd, DTL_MISSING) &&
+ vdev_dtl_empty(newvd, DTL_OUTAGE) &&
+ !vdev_dtl_required(oldvd))
+ return (oldvd);
+ }
+
+ /*
+ * Check for a completed resilver with the 'unspare' flag set.
+ * Also potentially update faulted state.
+ */
+ if (vd->vdev_ops == &vdev_spare_ops) {
+ vdev_t *first = vd->vdev_child[0];
+ vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
+
+ if (last->vdev_unspare) {
+ oldvd = first;
+ newvd = last;
+ } else if (first->vdev_unspare) {
+ oldvd = last;
+ newvd = first;
+ } else {
+ oldvd = NULL;
+ }
+
+ if (oldvd != NULL &&
+ vdev_dtl_empty(newvd, DTL_MISSING) &&
+ vdev_dtl_empty(newvd, DTL_OUTAGE) &&
+ !vdev_dtl_required(oldvd))
+ return (oldvd);
+
+ vdev_propagate_state(vd);
+
+ /*
+ * If there are more than two spares attached to a disk,
+ * and those spares are not required, then we want to
+ * attempt to free them up now so that they can be used
+ * by other pools. Once we're back down to a single
+ * disk+spare, we stop removing them.
+ */
+ if (vd->vdev_children > 2) {
+ newvd = vd->vdev_child[1];
+
+ if (newvd->vdev_isspare && last->vdev_isspare &&
+ vdev_dtl_empty(last, DTL_MISSING) &&
+ vdev_dtl_empty(last, DTL_OUTAGE) &&
+ !vdev_dtl_required(newvd))
+ return (newvd);
+ }
+ }
+
+ return (NULL);
+}
+
+static void
+spa_vdev_resilver_done(spa_t *spa)
+{
+ vdev_t *vd, *pvd, *ppvd;
+ uint64_t guid, sguid, pguid, ppguid;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
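+ /*
+ * Repeatedly find and detach devices that have finished replacing,
+ * dropping the config lock around each detach.
+ */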
+ while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
+ pvd = vd->vdev_parent;
+ ppvd = pvd->vdev_parent;
+ guid = vd->vdev_guid;
+ pguid = pvd->vdev_guid;
+ ppguid = ppvd->vdev_guid;
+ sguid = 0;
+ /*
+ * If we have just finished replacing a hot spared device, then
+ * we need to detach the parent's first child (the original hot
+ * spare) as well.
+ */
+ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
+ ppvd->vdev_children == 2) {
+ ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
+ sguid = ppvd->vdev_child[1]->vdev_guid;
+ }
+ ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
+ return;
+ if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
+ return;
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ }
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * If a detach was not performed above, replace waiters will not have
+ * been notified; in that case we must do so now.
+ */
+ spa_notify_waiters(spa);
+}
+
+/*
+ * Update the stored path or FRU for this vdev.
+ */
+static int
+spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
+ boolean_t ispath)
+{
+ vdev_t *vd;
+ boolean_t sync = B_FALSE;
+
+ ASSERT(spa_writeable(spa));
+
+ spa_vdev_state_enter(spa, SCL_ALL);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, ENOENT));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
+ if (ispath) {
+ if (strcmp(value, vd->vdev_path) != 0) {
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = spa_strdup(value);
+ sync = B_TRUE;
+ }
+ } else {
+ if (vd->vdev_fru == NULL) {
+ vd->vdev_fru = spa_strdup(value);
+ sync = B_TRUE;
+ } else if (strcmp(value, vd->vdev_fru) != 0) {
+ spa_strfree(vd->vdev_fru);
+ vd->vdev_fru = spa_strdup(value);
+ sync = B_TRUE;
+ }
+ }
+
+ return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
+}
+
+int
+spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
+{
+ return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
+}
+
+int
+spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
+{
+ return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
+}
+
+/*
+ * ==========================================================================
+ * SPA Scanning
+ * ==========================================================================
+ */
+int
+spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+ if (dsl_scan_resilvering(spa->spa_dsl_pool))
+ return (SET_ERROR(EBUSY));
+
+ return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
+}
+
+int
+spa_scan_stop(spa_t *spa)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+ if (dsl_scan_resilvering(spa->spa_dsl_pool))
+ return (SET_ERROR(EBUSY));
+ return (dsl_scan_cancel(spa->spa_dsl_pool));
+}
+
+int
+spa_scan(spa_t *spa, pool_scan_func_t func)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+ if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
+ return (SET_ERROR(ENOTSUP));
+
+ if (func == POOL_SCAN_RESILVER &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * If a resilver was requested, but there is no DTL on a
+ * writeable leaf device, we have nothing to do.
+ */
+ if (func == POOL_SCAN_RESILVER &&
+ !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+ return (0);
+ }
+
+ return (dsl_scan(spa->spa_dsl_pool, func));
+}
+
+/*
+ * ==========================================================================
+ * SPA async task processing
+ * ==========================================================================
+ */
+
+static void
+spa_async_remove(spa_t *spa, vdev_t *vd)
+{
+ if (vd->vdev_remove_wanted) {
+ vd->vdev_remove_wanted = B_FALSE;
+ vd->vdev_delayed_close = B_FALSE;
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
+
+ /*
+ * We want to clear the stats, but we don't want to do a full
+ * vdev_clear() as that will cause us to throw away
+ * degraded/faulted state as well as attempt to reopen the
+ * device, all of which is a waste.
+ */
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
+
+ vdev_state_dirty(vd->vdev_top);
+
+ /* Tell userspace that the vdev is gone. */
+ zfs_post_remove(spa, vd);
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ spa_async_remove(spa, vd->vdev_child[c]);
+}
+
+static void
+spa_async_probe(spa_t *spa, vdev_t *vd)
+{
+ if (vd->vdev_probe_wanted) {
+ vd->vdev_probe_wanted = B_FALSE;
+ vdev_reopen(vd); /* vdev_open() does the actual probe */
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ spa_async_probe(spa, vd->vdev_child[c]);
+}
+
+static void
+spa_async_autoexpand(spa_t *spa, vdev_t *vd)
+{
+ if (!spa->spa_autoexpand)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ spa_async_autoexpand(spa, cvd);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
+ return;
+
+ spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND);
+}
+
+static void
+spa_async_thread(void *arg)
+{
+ spa_t *spa = (spa_t *)arg;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ int tasks;
+
+ ASSERT(spa->spa_sync_on);
+
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+ spa->spa_async_tasks = 0;
+ mutex_exit(&spa->spa_async_lock);
+
+ /*
+ * See if the config needs to be updated.
+ */
+ if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
+ uint64_t old_space, new_space;
+
+ mutex_enter(&spa_namespace_lock);
+ old_space = metaslab_class_get_space(spa_normal_class(spa));
+ old_space += metaslab_class_get_space(spa_special_class(spa));
+ old_space += metaslab_class_get_space(spa_dedup_class(spa));
+ old_space += metaslab_class_get_space(
+ spa_embedded_log_class(spa));
+
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+
+ new_space = metaslab_class_get_space(spa_normal_class(spa));
+ new_space += metaslab_class_get_space(spa_special_class(spa));
+ new_space += metaslab_class_get_space(spa_dedup_class(spa));
+ new_space += metaslab_class_get_space(
+ spa_embedded_log_class(spa));
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * If the pool grew as a result of the config update,
+ * then log an internal history event.
+ */
+ if (new_space != old_space) {
+ spa_history_log_internal(spa, "vdev online", NULL,
+ "pool '%s' size: %llu(+%llu)",
+ spa_name(spa), (u_longlong_t)new_space,
+ (u_longlong_t)(new_space - old_space));
+ }
+ }
+
+ /*
+ * See if any devices need to be marked REMOVED.
+ */
+ if (tasks & SPA_ASYNC_REMOVE) {
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa_async_remove(spa, spa->spa_root_vdev);
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
+ spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
+ for (int i = 0; i < spa->spa_spares.sav_count; i++)
+ spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ }
+
+ if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_async_autoexpand(spa, spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ /*
+ * See if any devices need to be probed.
+ */
+ if (tasks & SPA_ASYNC_PROBE) {
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa_async_probe(spa, spa->spa_root_vdev);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ }
+
+ /*
+ * If any devices are done replacing, detach them.
+ */
+ if (tasks & SPA_ASYNC_RESILVER_DONE ||
+ tasks & SPA_ASYNC_REBUILD_DONE) {
+ spa_vdev_resilver_done(spa);
+ }
+
+ /*
+ * Kick off a resilver, unless a sequential rebuild is in progress or
+ * a resilver is already running with the resilver_defer feature
+ * enabled (in which case the new request is deferred).
+ */
+ if (tasks & SPA_ASYNC_RESILVER &&
+ !vdev_rebuild_active(spa->spa_root_vdev) &&
+ (!dsl_scan_resilvering(dp) ||
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
+ dsl_scan_restart_resilver(dp, 0);
+
+ if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ if (tasks & SPA_ASYNC_TRIM_RESTART) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_trim_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_autotrim_restart(spa);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * Kick off L2 cache whole device TRIM.
+ */
+ if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_trim_l2arc(spa);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * Kick off L2 cache rebuilding.
+ */
+ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
+ l2arc_spa_rebuild_start(spa);
+ spa_config_exit(spa, SCL_L2ARC, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * Let the world know that we're done.
+ */
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_thread = NULL;
+ cv_broadcast(&spa->spa_async_cv);
+ mutex_exit(&spa->spa_async_lock);
+ thread_exit();
+}
+
+void
+spa_async_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_suspended++;
+ while (spa->spa_async_thread != NULL)
+ cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
+ mutex_exit(&spa->spa_async_lock);
+
+ spa_vdev_remove_suspend(spa);
+
+ zthr_t *condense_thread = spa->spa_condense_zthr;
+ if (condense_thread != NULL)
+ zthr_cancel(condense_thread);
+
+ zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+ if (discard_thread != NULL)
+ zthr_cancel(discard_thread);
+
+ zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
+ if (ll_delete_thread != NULL)
+ zthr_cancel(ll_delete_thread);
+
+ zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
+ if (ll_condense_thread != NULL)
+ zthr_cancel(ll_condense_thread);
+}
+
+void
+spa_async_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ ASSERT(spa->spa_async_suspended != 0);
+ spa->spa_async_suspended--;
+ mutex_exit(&spa->spa_async_lock);
+ spa_restart_removal(spa);
+
+ zthr_t *condense_thread = spa->spa_condense_zthr;
+ if (condense_thread != NULL)
+ zthr_resume(condense_thread);
+
+ zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+ if (discard_thread != NULL)
+ zthr_resume(discard_thread);
+
+ zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
+ if (ll_delete_thread != NULL)
+ zthr_resume(ll_delete_thread);
+
+ zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
+ if (ll_condense_thread != NULL)
+ zthr_resume(ll_condense_thread);
+}
+
+static boolean_t
+spa_async_tasks_pending(spa_t *spa)
+{
+ uint_t non_config_tasks;
+ uint_t config_task;
+ boolean_t config_task_suspended;
+
+ non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
+ config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
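+ /*
+ * After a config cache write failure, suppress retries until
+ * zfs_ccw_retry_interval has elapsed.
+ */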
+ if (spa->spa_ccw_fail_time == 0) {
+ config_task_suspended = B_FALSE;
+ } else {
+ config_task_suspended =
+ (gethrtime() - spa->spa_ccw_fail_time) <
+ ((hrtime_t)zfs_ccw_retry_interval * NANOSEC);
+ }
+
+ return (non_config_tasks || (config_task && !config_task_suspended));
+}
+
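+/*
+ * Dispatch the async thread if work is pending, async processing is not
+ * suspended, and no async thread is already running.
+ */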
+static void
+spa_async_dispatch(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ if (spa_async_tasks_pending(spa) &&
+ !spa->spa_async_suspended &&
+ spa->spa_async_thread == NULL)
+ spa->spa_async_thread = thread_create(NULL, 0,
+ spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_request(spa_t *spa, int task)
+{
+ zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_tasks |= task;
+ mutex_exit(&spa->spa_async_lock);
+}
+
+int
+spa_async_tasks(spa_t *spa)
+{
+ return (spa->spa_async_tasks);
+}
+
+/*
+ * ==========================================================================
+ * SPA syncing routines
+ * ==========================================================================
+ */
+
+
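+/*
+ * Callback helpers for iterating block pointer lists during sync: either
+ * re-enqueue the bp onto a bpobj or issue a free for it.
+ */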
+static int
+bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ bpobj_t *bpo = arg;
+ bpobj_enqueue(bpo, bp, bp_freed, tx);
+ return (0);
+}
+
+int
+bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx));
+}
+
+int
+bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx));
+}
+
+static int
+spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ zio_t *pio = arg;
+
+ zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp,
+ pio->io_flags));
+ return (0);
+}
+
+static int
+bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ ASSERT(!bp_freed);
+ return (spa_free_sync_cb(arg, bp, tx));
+}
+
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing frees.
+ */
+static void
+spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
+{
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
+ VERIFY(zio_wait(zio) == 0);
+}
+
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing deferred frees.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
+{
+ if (spa_sync_pass(spa) != 1)
+ return;
+
+ /*
+ * Note:
+ * If the log space map feature is active, we stop deferring
+ * frees to the next TXG and therefore running this function
+ * would be considered a no-op as spa_deferred_bpobj should
+ * not have any entries.
+ *
+ * That said, we run this function anyway (instead of returning
+ * immediately) for the edge-case scenario where we just
+ * activated the log space map feature in this TXG but still have
+ * deferred frees from the previous TXG.
+ */
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
+ bpobj_spa_free_sync_cb, zio, tx), ==, 0);
+ VERIFY0(zio_wait(zio));
+}
+
+static void
+spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
+{
+ char *packed = NULL;
+ size_t bufsize;
+ size_t nvsize = 0;
+ dmu_buf_t *db;
+
+ VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
+
+ /*
+ * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
+ * information. This avoids the dmu_buf_will_dirty() path and
+ * saves us a pre-read to get data we don't actually care about.
+ */
+ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
+ packed = vmem_alloc(bufsize, KM_SLEEP);
+
+ VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
+ bzero(packed + nvsize, bufsize - nvsize);
+
+ dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
+
+ vmem_free(packed, bufsize);
+
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = nvsize;
+ dmu_buf_rele(db, FTAG);
+}
+
+static void
+spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
+ const char *config, const char *entry)
+{
+ nvlist_t *nvroot;
+ nvlist_t **list;
+ int i;
+
+ if (!sav->sav_sync)
+ return;
+
+ /*
+ * Update the MOS nvlist describing the list of available devices.
+ * spa_validate_aux() will have already made sure this nvlist is
+ * valid and the vdevs are labeled appropriately.
+ */
+ if (sav->sav_object == 0) {
+ sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
+ sizeof (uint64_t), tx);
+ VERIFY(zap_update(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
+ &sav->sav_object, tx) == 0);
+ }
+
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (sav->sav_count == 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
+ } else {
+ list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < sav->sav_count; i++)
+ list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
+ B_FALSE, VDEV_CONFIG_L2CACHE);
+ VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
+ sav->sav_count) == 0);
+ for (i = 0; i < sav->sav_count; i++)
+ nvlist_free(list[i]);
+ kmem_free(list, sav->sav_count * sizeof (void *));
+ }
+
+ spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
+ nvlist_free(nvroot);
+
+ sav->sav_sync = B_FALSE;
+}
+
+/*
+ * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
+ * The all-vdev ZAP must be empty.
+ */
+static void
+spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd->vdev_top_zap != 0) {
+ VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+ vd->vdev_top_zap, tx));
+ }
+ if (vd->vdev_leaf_zap != 0) {
+ VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+ vd->vdev_leaf_zap, tx));
+ }
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ spa_avz_build(vd->vdev_child[i], avz, tx);
+ }
+}
+
+static void
+spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
+{
+ nvlist_t *config;
+
+ /*
+ * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
+ * its config may not be dirty but we still need to build per-vdev ZAPs.
+ * Similarly, if the pool is being assembled (e.g. after a split), we
+ * need to rebuild the AVZ although the config may not be dirty.
+ */
+ if (list_is_empty(&spa->spa_config_dirty_list) &&
+ spa->spa_avz_action == AVZ_ACTION_NONE)
+ return;
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+ ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
+ spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
+ spa->spa_all_vdev_zaps != 0);
+
+ if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
+ /* Make and build the new AVZ */
+ uint64_t new_avz = zap_create(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+ spa_avz_build(spa->spa_root_vdev, new_avz, tx);
+
+ /* Diff old AVZ with new one */
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t vdzap = za.za_first_integer;
+ if (zap_lookup_int(spa->spa_meta_objset, new_avz,
+ vdzap) == ENOENT) {
+ /*
+ * ZAP is listed in old AVZ but not in new one;
+ * destroy it
+ */
+ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
+ tx));
+ }
+ }
+
+ zap_cursor_fini(&zc);
+
+ /* Destroy the old AVZ */
+ VERIFY0(zap_destroy(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, tx));
+
+ /* Replace the old AVZ in the dir obj with the new one */
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
+ sizeof (new_avz), 1, &new_avz, tx));
+
+ spa->spa_all_vdev_zaps = new_avz;
+ } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ /* Walk through the AVZ and destroy all listed ZAPs */
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t zap = za.za_first_integer;
+ VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
+ }
+
+ zap_cursor_fini(&zc);
+
+ /* Destroy and unlink the AVZ itself */
+ VERIFY0(zap_destroy(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, tx));
+ VERIFY0(zap_remove(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
+ spa->spa_all_vdev_zaps = 0;
+ }
+
+ if (spa->spa_all_vdev_zaps == 0) {
+ spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_VDEV_ZAP_MAP, tx);
+ }
+ spa->spa_avz_action = AVZ_ACTION_NONE;
+
+ /* Create ZAPs for vdevs that don't have them. */
+ vdev_construct_zaps(spa->spa_root_vdev, tx);
+
+ config = spa_config_generate(spa, spa->spa_root_vdev,
+ dmu_tx_get_txg(tx), B_FALSE);
+
+ /*
+ * If we're upgrading the spa version then make sure that
+ * the config object gets updated with the correct version.
+ */
+ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa->spa_uberblock.ub_version);
+
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ nvlist_free(spa->spa_config_syncing);
+ spa->spa_config_syncing = config;
+
+ spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
+}
+
+static void
+spa_sync_version(void *arg, dmu_tx_t *tx)
+{
+ uint64_t *versionp = arg;
+ uint64_t version = *versionp;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ /*
+ * Setting the version is special cased when first creating the pool.
+ */
+ ASSERT(tx->tx_txg != TXG_INITIAL);
+
+ ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+ ASSERT(version >= spa_version(spa));
+
+ spa->spa_uberblock.ub_version = version;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_history_log_internal(spa, "set", tx, "version=%lld",
+ (longlong_t)version);
+}
+
+/*
+ * Set zpool properties.
+ */
+static void
+spa_sync_props(void *arg, dmu_tx_t *tx)
+{
+ nvlist_t *nvp = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ nvpair_t *elem = NULL;
+
+ mutex_enter(&spa->spa_props_lock);
+
+ while ((elem = nvlist_next_nvpair(nvp, elem))) {
+ uint64_t intval;
+ char *strval, *fname;
+ zpool_prop_t prop;
+ const char *propname;
+ zprop_type_t proptype;
+ spa_feature_t fid;
+
+ switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
+ case ZPOOL_PROP_INVAL:
+ /*
+ * We checked this earlier in spa_prop_validate().
+ */
+ ASSERT(zpool_prop_feature(nvpair_name(elem)));
+
+ fname = strchr(nvpair_name(elem), '@') + 1;
+ VERIFY0(zfeature_lookup_name(fname, &fid));
+
+ spa_feature_enable(spa, fid, tx);
+ spa_history_log_internal(spa, "set", tx,
+ "%s=enabled", nvpair_name(elem));
+ break;
+
+ case ZPOOL_PROP_VERSION:
+ intval = fnvpair_value_uint64(elem);
+ /*
+ * The version is synced separately before other
+ * properties and should be correct by now.
+ */
+ ASSERT3U(spa_version(spa), >=, intval);
+ break;
+
+ case ZPOOL_PROP_ALTROOT:
+ /*
+ * 'altroot' is a non-persistent property. It should
+ * have been set temporarily at creation or import time.
+ */
+ ASSERT(spa->spa_root != NULL);
+ break;
+
+ case ZPOOL_PROP_READONLY:
+ case ZPOOL_PROP_CACHEFILE:
+ /*
+ * 'readonly' and 'cachefile' are also non-persistent
+ * properties.
+ */
+ break;
+ case ZPOOL_PROP_COMMENT:
+ strval = fnvpair_value_string(elem);
+ if (spa->spa_comment != NULL)
+ spa_strfree(spa->spa_comment);
+ spa->spa_comment = spa_strdup(strval);
+ /*
+ * We need to dirty the configuration on all the vdevs
+ * so that their labels get updated. It's unnecessary
+ * to do this for pool creation since the vdev's
+ * configuration has already been dirtied.
+ */
+ if (tx->tx_txg != TXG_INITIAL)
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%s", nvpair_name(elem), strval);
+ break;
+ case ZPOOL_PROP_COMPATIBILITY:
+ strval = fnvpair_value_string(elem);
+ if (spa->spa_compatibility != NULL)
+ spa_strfree(spa->spa_compatibility);
+ spa->spa_compatibility = spa_strdup(strval);
+ /*
+ * Dirty the configuration on vdevs as above.
+ */
+ if (tx->tx_txg != TXG_INITIAL)
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%s", nvpair_name(elem), strval);
+ break;
+
+ default:
+ /*
+ * Set pool property values in the poolprops mos object.
+ */
+ if (spa->spa_pool_props_object == 0) {
+ spa->spa_pool_props_object =
+ zap_create_link(mos, DMU_OT_POOL_PROPS,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
+ tx);
+ }
+
+ /* normalize the property name */
+ propname = zpool_prop_to_name(prop);
+ proptype = zpool_prop_get_type(prop);
+
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ ASSERT(proptype == PROP_TYPE_STRING);
+ strval = fnvpair_value_string(elem);
+ VERIFY0(zap_update(mos,
+ spa->spa_pool_props_object, propname,
+ 1, strlen(strval) + 1, strval, tx));
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%s", nvpair_name(elem), strval);
+ } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+ intval = fnvpair_value_uint64(elem);
+
+ if (proptype == PROP_TYPE_INDEX) {
+ const char *unused;
+ VERIFY0(zpool_prop_index_to_string(
+ prop, intval, &unused));
+ }
+ VERIFY0(zap_update(mos,
+ spa->spa_pool_props_object, propname,
+ 8, 1, &intval, tx));
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%lld", nvpair_name(elem),
+ (longlong_t)intval);
+ } else {
+ ASSERT(0); /* not allowed */
+ }
+
+ switch (prop) {
+ case ZPOOL_PROP_DELEGATION:
+ spa->spa_delegation = intval;
+ break;
+ case ZPOOL_PROP_BOOTFS:
+ spa->spa_bootfs = intval;
+ break;
+ case ZPOOL_PROP_FAILUREMODE:
+ spa->spa_failmode = intval;
+ break;
+ case ZPOOL_PROP_AUTOTRIM:
+ spa->spa_autotrim = intval;
+ spa_async_request(spa,
+ SPA_ASYNC_AUTOTRIM_RESTART);
+ break;
+ case ZPOOL_PROP_AUTOEXPAND:
+ spa->spa_autoexpand = intval;
+ if (tx->tx_txg != TXG_INITIAL)
+ spa_async_request(spa,
+ SPA_ASYNC_AUTOEXPAND);
+ break;
+ case ZPOOL_PROP_MULTIHOST:
+ spa->spa_multihost = intval;
+ break;
+ default:
+ break;
+ }
+ }
+
+ }
+
+ mutex_exit(&spa->spa_props_lock);
+}
+
+/*
+ * Perform one-time upgrade on-disk changes. spa_version() does not
+ * reflect the new version this txg, so there must be no changes this
+ * txg to anything that the upgrade code depends on after it executes.
+ * Therefore this must be called after dsl_pool_sync() does the sync
+ * tasks.
+ */
+static void
+spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
+{
+ if (spa_sync_pass(spa) != 1)
+ return;
+
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+ dsl_pool_create_origin(dp, tx);
+
+ /* Keeping the origin open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+ dsl_pool_upgrade_clones(dp, tx);
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
+ dsl_pool_upgrade_dir_clones(dp, tx);
+
+ /* Keeping the freedir open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+ spa_feature_create_zap_objects(spa, tx);
+ }
+
+ /*
+ * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
+ * when the ability to use lz4 compression for metadata was added. Old
+ * pools that have this feature enabled must be upgraded to have this
+ * feature active.
+ */
+ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+ boolean_t lz4_en = spa_feature_is_enabled(spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+ boolean_t lz4_ac = spa_feature_is_active(spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+
+ if (lz4_en && !lz4_ac)
+ spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
+ }
+
+ /*
+ * If we haven't written the salt, do so now. Note that the
+ * feature may not be activated yet, but that's fine since
+ * the presence of this ZAP entry is backwards compatible.
+ */
+ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CHECKSUM_SALT) == ENOENT) {
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
+ sizeof (spa->spa_cksum_salt.zcs_bytes),
+ spa->spa_cksum_salt.zcs_bytes, tx));
+ }
+
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+}
+
+static void
+vdev_indirect_state_sync_verify(vdev_t *vd)
+{
+ vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
+ vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;
+
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ ASSERT(vim != NULL);
+ ASSERT(vib != NULL);
+ }
+
+ uint64_t obsolete_sm_object = 0;
+ ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object != 0) {
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
+ ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
+ ASSERT3U(obsolete_sm_object, ==,
+ space_map_object(vd->vdev_obsolete_sm));
+ ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
+ space_map_allocated(vd->vdev_obsolete_sm));
+ }
+ ASSERT(vd->vdev_obsolete_segments != NULL);
+
+ /*
+ * Since frees / remaps to an indirect vdev can only
+ * happen in syncing context, the obsolete segments
+ * tree must be empty when we start syncing.
+ */
+ ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
+}
+
+/*
+ * Set the top-level vdev's max queue depth. Evaluate each top-level's
+ * async write queue depth in case it changed. The max queue depth will
+ * not change in the middle of syncing out this txg.
+ */
+static void
+spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
+{
+ ASSERT(spa_writeable(spa));
+
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
+ zfs_vdev_queue_depth_pct / 100;
+ metaslab_class_t *normal = spa_normal_class(spa);
+ metaslab_class_t *special = spa_special_class(spa);
+ metaslab_class_t *dedup = spa_dedup_class(spa);
+
+ uint64_t slots_per_allocator = 0;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ metaslab_group_t *mg = tvd->vdev_mg;
+ if (mg == NULL || !metaslab_group_initialized(mg))
+ continue;
+
+ metaslab_class_t *mc = mg->mg_class;
+ if (mc != normal && mc != special && mc != dedup)
+ continue;
+
+ /*
+ * It is safe to do a lock-free check here because only async
+ * allocations look at mg_max_alloc_queue_depth, and async
+ * allocations all happen from spa_sync().
+ */
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ ASSERT0(zfs_refcount_count(
+ &(mg->mg_allocator[i].mga_alloc_queue_depth)));
+ }
+ mg->mg_max_alloc_queue_depth = max_queue_depth;
+
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
+ zfs_vdev_def_queue_depth;
+ }
+ slots_per_allocator += zfs_vdev_def_queue_depth;
+ }
+
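+ /*
+ * Apply the accumulated per-allocator slot budget to the normal,
+ * special, and dedup classes used by the allocation throttle.
+ */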
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ ASSERT0(zfs_refcount_count(&normal->mc_allocator[i].
+ mca_alloc_slots));
+ ASSERT0(zfs_refcount_count(&special->mc_allocator[i].
+ mca_alloc_slots));
+ ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i].
+ mca_alloc_slots));
+ normal->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
+ special->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
+ dedup->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
+ }
+ normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+}
+
+static void
+spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
+{
+ ASSERT(spa_writeable(spa));
+
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ vdev_indirect_state_sync_verify(vd);
+
+ if (vdev_indirect_should_condense(vd)) {
+ spa_condense_indirect_start_sync(vd, tx);
+ break;
+ }
+ }
+}
+
+static void
+spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ uint64_t txg = tx->tx_txg;
+ bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
+
+ do {
+ int pass = ++spa->spa_sync_pass;
+
+ spa_sync_config_object(spa, tx);
+ spa_sync_aux_dev(spa, &spa->spa_spares, tx,
+ ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
+ spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
+ ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
+ spa_errlog_sync(spa, txg);
+ dsl_pool_sync(dp, txg);
+
+ if (pass < zfs_sync_pass_deferred_free ||
+ spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
+ /*
+ * If the log space map feature is active we don't
+ * care about deferred frees and the deferred bpobj
+ * as the log space map should effectively have the
+ * same results (i.e. appending only to one object).
+ */
+ spa_sync_frees(spa, free_bpl, tx);
+ } else {
+ /*
+ * We cannot defer frees in pass 1, because
+ * we sync the deferred frees later in pass 1.
+ */
+ ASSERT3U(pass, >, 1);
+ bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
+ &spa->spa_deferred_bpobj, tx);
+ }
+
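+ /*
+ * Sync dedup tables, scan state, device removal progress, and any
+ * one-time upgrades, then flush dirty metaslabs.
+ */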
+ ddt_sync(spa, txg);
+ dsl_scan_sync(dp, tx);
+ svr_sync(spa, tx);
+ spa_sync_upgrades(spa, tx);
+
+ spa_flush_metaslabs(spa, tx);
+
+ vdev_t *vd = NULL;
+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
+ != NULL)
+ vdev_sync(vd, txg);
+
+ /*
+ * Note: We need to check if the MOS is dirty because we could
+ * have marked the MOS dirty without updating the uberblock
+ * (e.g. if we have sync tasks but no dirty user data). We need
+ * to check the uberblock's rootbp because it is updated if we
+ * have synced out dirty data (though in this case the MOS will
+ * most likely also be dirty due to second order effects, we
+ * don't want to rely on that here).
+ */
+ if (pass == 1 &&
+ spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ !dmu_objset_is_dirty(mos, txg)) {
+ /*
+ * Nothing changed on the first pass, therefore this
+ * TXG is a no-op. Avoid syncing deferred frees, so
+ * that we can keep this TXG as a no-op.
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+ ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
+ break;
+ }
+
+ spa_sync_deferred_frees(spa, tx);
+ } while (dmu_objset_is_dirty(mos, txg));
+}
+
+/*
+ * Rewrite the vdev configuration (which includes the uberblock) to
+ * commit the transaction group.
+ *
+ * If there are no dirty vdevs, we sync the uberblock to a few random
+ * top-level vdevs that are known to be visible in the config cache
+ * (see spa_vdev_add() for a complete description). If there *are* dirty
+ * vdevs, sync the uberblock to all vdevs.
+ */
+static void
+spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg = tx->tx_txg;
+
+ for (;;) {
+ int error = 0;
+
+ /*
+ * We hold SCL_STATE to prevent vdev open/close/etc.
+ * while we're attempting to write the vdev labels.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+ if (list_is_empty(&spa->spa_config_dirty_list)) {
+ vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
+ int svdcount = 0;
+ int children = rvd->vdev_children;
+ int c0 = spa_get_random(children);
+
+ for (int c = 0; c < children; c++) {
+ vdev_t *vd =
+ rvd->vdev_child[(c0 + c) % children];
+
+ /* Stop when revisiting the first vdev */
+ if (c > 0 && svd[0] == vd)
+ break;
+
+ if (vd->vdev_ms_array == 0 ||
+ vd->vdev_islog ||
+ !vdev_is_concrete(vd))
+ continue;
+
+ svd[svdcount++] = vd;
+ if (svdcount == SPA_SYNC_MIN_VDEVS)
+ break;
+ }
+ error = vdev_config_sync(svd, svdcount, txg);
+ } else {
+ error = vdev_config_sync(rvd->vdev_child,
+ rvd->vdev_children, txg);
+ }
+
+ if (error == 0)
+ spa->spa_last_synced_guid = rvd->vdev_guid;
+
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ if (error == 0)
+ break;
+ zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
+ zio_resume_wait(spa);
+ }
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ vdev_t *vd = NULL;
+
+ VERIFY(spa_writeable(spa));
+
+ /*
+ * Wait for i/os issued in open context that need to complete
+ * before this txg syncs.
+ */
+ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
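+ /*
+ * Verify that the per-allocator allocation queues are empty before
+ * syncing begins.
+ */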
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
+
+ /*
+ * If there are any pending vdev state changes, convert them
+ * into config changes that go out with this transaction group.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ while (list_head(&spa->spa_state_dirty_list) != NULL) {
+ /*
+ * We need the write lock here because, for aux vdevs,
+ * calling vdev_config_dirty() modifies sav_config.
+ * This is ugly and will become unnecessary when we
+ * eliminate the aux vdev wart by integrating all vdevs
+ * into the root vdev tree.
+ */
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+ while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+ vdev_state_clean(vd);
+ vdev_config_dirty(vd);
+ }
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+
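+ /*
+ * Record the sync start time and (re)arm the deadman timer so a hung
+ * sync can be detected.
+ */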
+ spa->spa_sync_starttime = gethrtime();
+ taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+ spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
+ spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+ NSEC_TO_TICK(spa->spa_deadman_synctime));
+
+ /*
+ * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
+ * set spa_deflate if we have no raid-z vdevs.
+ */
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ int i;
+ for (i = 0; i < rvd->vdev_children; i++) {
+ vd = rvd->vdev_child[i];
+ if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
+ break;
+ }
+ if (i == rvd->vdev_children) {
+ spa->spa_deflate = TRUE;
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx));
+ }
+ }
+
+ spa_sync_adjust_vdev_max_queue_depth(spa);
+
+ spa_sync_condense_indirect(spa, tx);
+
+ spa_sync_iterate_to_convergence(spa, tx);
+
+#ifdef ZFS_DEBUG
+ if (!list_is_empty(&spa->spa_config_dirty_list)) {
+ /*
+ * Make sure that the number of ZAPs for all the vdevs matches
+ * the number of ZAPs in the per-vdev ZAP list. This only gets
+ * called if the config is dirty; otherwise there may be
+ * outstanding AVZ operations that weren't completed in
+ * spa_sync_config_object.
+ */
+ uint64_t all_vdev_zap_entry_count;
+ ASSERT0(zap_count(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+ ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+ all_vdev_zap_entry_count);
+ }
+#endif
+
+ if (spa->spa_vdev_removal != NULL) {
+ ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
+ }
+
+ spa_sync_rewrite_vdev_config(spa, tx);
+ dmu_tx_commit(tx);
+
+ taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+ spa->spa_deadman_tqid = 0;
+
+ /*
+ * Clear the dirty config list.
+ */
+ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
+ vdev_config_clean(vd);
+
+ /*
+ * Now that the new config has synced transactionally,
+ * let it become visible to the config cache.
+ */
+ if (spa->spa_config_syncing != NULL) {
+ spa_config_set(spa, spa->spa_config_syncing);
+ spa->spa_config_txg = txg;
+ spa->spa_config_syncing = NULL;
+ }
+
+ dsl_pool_sync_done(dp, txg);
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
+
+ /*
+ * Update usable space statistics.
+ */
+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ != NULL)
+ vdev_sync_done(vd, txg);
+
+ metaslab_class_evict_old(spa->spa_normal_class, txg);
+ metaslab_class_evict_old(spa->spa_log_class, txg);
+
+ spa_sync_close_syncing_log_sm(spa);
+
+ spa_update_dspace(spa);
+
+ /*
+ * It had better be the case that we didn't dirty anything
+ * since vdev_config_sync().
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
+
+ while (zfs_pause_spa_sync)
+ delay(1);
+
+ spa->spa_sync_pass = 0;
+
+ /*
+ * Update the last synced uberblock here. We want to do this at
+ * the end of spa_sync() so that consumers of spa_last_synced_txg()
+ * will be guaranteed that all the processing associated with
+ * that txg has been completed.
+ */
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ spa_handle_ignored_writes(spa);
+
+ /*
+ * If any async tasks have been requested, kick them off.
+ */
+ spa_async_dispatch(spa);
+}
+
+/*
+ * Sync all pools. We don't want to hold the namespace lock across these
+ * operations, so we take a reference on the spa_t and drop the lock during the
+ * sync.
+ */
+void
+spa_sync_allpools(void)
+{
+ spa_t *spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa_state(spa) != POOL_STATE_ACTIVE ||
+ !spa_writeable(spa) || spa_suspended(spa))
+ continue;
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous routines
+ * ==========================================================================
+ */
+
+/*
+ * Remove all pools in the system.
+ */
+void
+spa_evict_all(void)
+{
+ spa_t *spa;
+
+ /*
+ * Remove all cached state. All pools should be closed now,
+ * so every spa in the AVL tree should be unreferenced.
+ */
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(NULL)) != NULL) {
+ /*
+ * Stop async tasks. The async thread may need to detach
+ * a device that's been replaced, which requires grabbing
+ * spa_namespace_lock, so we must drop it here.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+ spa_remove(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
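+/*
+ * Look up a vdev by guid in the pool's vdev tree; if 'aux' is set, also
+ * search the L2ARC and spare aux vdev lists.
+ */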
+vdev_t *
+spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
+{
+ vdev_t *vd;
+ int i;
+
+ if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
+ return (vd);
+
+ if (aux) {
+ for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vd = spa->spa_l2cache.sav_vdevs[i];
+ if (vd->vdev_guid == guid)
+ return (vd);
+ }
+
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ vd = spa->spa_spares.sav_vdevs[i];
+ if (vd->vdev_guid == guid)
+ return (vd);
+ }
+ }
+
+ return (NULL);
+}
+
+void
+spa_upgrade(spa_t *spa, uint64_t version)
+{
+ ASSERT(spa_writeable(spa));
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ /*
+ * This should only be called for a non-faulted pool, and since a
+ * future version would result in an unopenable pool, this shouldn't be
+ * possible.
+ */
+ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
+ ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
+
+ spa->spa_uberblock.ub_version = version;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+}
+
+boolean_t
+spa_has_spare(spa_t *spa, uint64_t guid)
+{
+ int i;
+ uint64_t spareguid;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
+
+ for (i = 0; i < sav->sav_count; i++)
+ if (sav->sav_vdevs[i]->vdev_guid == guid)
+ return (B_TRUE);
+
+ for (i = 0; i < sav->sav_npending; i++) {
+ if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
+ &spareguid) == 0 && spareguid == guid)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Check if a pool has an active shared spare device.
+ * Note: the reference count of an active spare is 2: once as a spare and
+ * once as a replacement.
+ */
+static boolean_t
+spa_has_active_shared_spare(spa_t *spa)
+{
+ int i, refcnt;
+ uint64_t pool;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
+
+ for (i = 0; i < sav->sav_count; i++) {
+ if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
+ &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
+ refcnt > 2)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+uint64_t
+spa_total_metaslabs(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ uint64_t m = 0;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ if (!vdev_is_concrete(vd))
+ continue;
+ m += vd->vdev_ms_count;
+ }
+ return (m);
+}
+
+/*
+ * Notify any waiting threads that some activity has switched from being in-
+ * progress to not-in-progress so that they can wake up and determine
+ * whether they are finished waiting.
+ */
+void
+spa_notify_waiters(spa_t *spa)
+{
+ /*
+ * Acquiring spa_activities_lock here prevents the cv_broadcast from
+ * happening between the waiting thread's check and cv_wait.
+ */
+ mutex_enter(&spa->spa_activities_lock);
+ cv_broadcast(&spa->spa_activities_cv);
+ mutex_exit(&spa->spa_activities_lock);
+}
+
+/*
+ * Notify any waiting threads that the pool is exporting, and then block until
+ * they are finished using the spa_t.
+ */
+void
+spa_wake_waiters(spa_t *spa)
+{
+ mutex_enter(&spa->spa_activities_lock);
+ spa->spa_waiters_cancel = B_TRUE;
+ cv_broadcast(&spa->spa_activities_cv);
+ while (spa->spa_waiters != 0)
+ cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
+ spa->spa_waiters_cancel = B_FALSE;
+ mutex_exit(&spa->spa_activities_lock);
+}
+
+/* Whether the vdev or any of its descendants are being initialized/trimmed. */
+static boolean_t
+spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
+ ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
+ ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
+ activity == ZPOOL_WAIT_TRIM);
+
+ kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
+ &vd->vdev_initialize_lock : &vd->vdev_trim_lock;
+
+ mutex_exit(&spa->spa_activities_lock);
+ mutex_enter(lock);
+ mutex_enter(&spa->spa_activities_lock);
+
+ boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
+ (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
+ (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
+ mutex_exit(lock);
+
+ if (in_progress)
+ return (B_TRUE);
+
+ for (int i = 0; i < vd->vdev_children; i++) {
+ if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
+ activity))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * If use_guid is true, this checks whether the vdev specified by guid is
+ * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
+ * is being initialized/trimmed. The caller must hold the config lock and
+ * spa_activities_lock.
+ */
+static int
+spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
+ zpool_wait_activity_t activity, boolean_t *in_progress)
+{
+ mutex_exit(&spa->spa_activities_lock);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ mutex_enter(&spa->spa_activities_lock);
+
+ vdev_t *vd;
+ if (use_guid) {
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (EINVAL);
+ }
+ } else {
+ vd = spa->spa_root_vdev;
+ }
+
+ *in_progress = spa_vdev_activity_in_progress_impl(vd, activity);
+
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (0);
+}
+
+/*
+ * Locking for waiting threads
+ * ---------------------------
+ *
+ * Waiting threads need a way to check whether a given activity is in progress,
+ * and then, if it is, wait for it to complete. Each activity will have some
+ * in-memory representation of the relevant on-disk state which can be used to
+ * determine whether or not the activity is in progress. The in-memory state and
+ * the locking used to protect it will be different for each activity, and may
+ * not be suitable for use with a cvar (e.g., some state is protected by the
+ * config lock). To allow waiting threads to wait without any races, another
+ * lock, spa_activities_lock, is used.
+ *
+ * When the state is checked, both the activity-specific lock (if there is one)
+ * and spa_activities_lock are held. In some cases, the activity-specific lock
+ * is acquired explicitly (e.g. the config lock). In others, the locking is
+ * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
+ * thread releases the activity-specific lock and, if the activity is in
+ * progress, then cv_waits using spa_activities_lock.
+ *
+ * The waiting thread is woken when another thread, one completing some
+ * activity, updates the state of the activity and then calls
+ * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
+ * needs to hold its activity-specific lock when updating the state, and this
+ * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
+ *
+ * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
+ * and because it is held when the waiting thread checks the state of the
+ * activity, it can never be the case that the completing thread both updates
+ * the activity state and cv_broadcasts in between the waiting thread's check
+ * and cv_wait. Thus, a waiting thread can never miss a wakeup.
+ *
+ * In order to prevent deadlock, when the waiting thread does its check, in some
+ * cases it will temporarily drop spa_activities_lock in order to acquire the
+ * activity-specific lock. The order in which spa_activities_lock and the
+ * activity-specific lock are acquired in the waiting thread is determined by
+ * the order in which they are acquired in the completing thread; if the
+ * completing thread calls spa_notify_waiters with the activity-specific lock
+ * held, then the waiting thread must also acquire the activity-specific lock
+ * first.
+ */
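+
+/*
+ * A minimal sketch of the pattern described above; the helpers
+ * activity_in_progress() and update_activity_state() are hypothetical
+ * stand-ins for the activity-specific checks and updates:
+ *
+ *	// waiting thread
+ *	mutex_enter(&spa->spa_activities_lock);
+ *	while (activity_in_progress(spa))	// may drop/retake the lock
+ *		cv_wait(&spa->spa_activities_cv, &spa->spa_activities_lock);
+ *	mutex_exit(&spa->spa_activities_lock);
+ *
+ *	// completing thread
+ *	update_activity_state(spa);	// under its activity-specific lock
+ *	spa_notify_waiters(spa);	// takes spa_activities_lock and
+ *					// cv_broadcasts before dropping it
+ */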
+
+static int
+spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
+ boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
+
+ switch (activity) {
+ case ZPOOL_WAIT_CKPT_DISCARD:
+ *in_progress =
+ (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
+ zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
+ ENOENT);
+ break;
+ case ZPOOL_WAIT_FREE:
+ *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
+ !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
+ spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
+ spa_livelist_delete_check(spa));
+ break;
+ case ZPOOL_WAIT_INITIALIZE:
+ case ZPOOL_WAIT_TRIM:
+ error = spa_vdev_activity_in_progress(spa, use_tag, tag,
+ activity, in_progress);
+ break;
+ case ZPOOL_WAIT_REPLACE:
+ mutex_exit(&spa->spa_activities_lock);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ mutex_enter(&spa->spa_activities_lock);
+
+ *in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ break;
+ case ZPOOL_WAIT_REMOVE:
+ *in_progress = (spa->spa_removing_phys.sr_state ==
+ DSS_SCANNING);
+ break;
+ case ZPOOL_WAIT_RESILVER:
+ if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
+ break;
+ /* fall through */
+ case ZPOOL_WAIT_SCRUB:
+ {
+ boolean_t scanning, paused, is_scrub;
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
+ is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
+ scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
+ paused = dsl_scan_is_paused_scrub(scn);
+ *in_progress = (scanning && !paused &&
+ is_scrub == (activity == ZPOOL_WAIT_SCRUB));
+ break;
+ }
+ default:
+ panic("unrecognized value for activity %d", activity);
+ }
+
+ return (error);
+}
+
+static int
+spa_wait_common(const char *pool, zpool_wait_activity_t activity,
+ boolean_t use_tag, uint64_t tag, boolean_t *waited)
+{
+ /*
+ * The tag is used to distinguish between instances of an activity.
+ * 'initialize' and 'trim' are the only activities that we use this for.
+ * The other activities can only have a single instance in progress in a
+ * pool at one time, making the tag unnecessary.
+ *
+ * There can be multiple devices being replaced at once, but since they
+ * all finish once resilvering finishes, we don't bother keeping track
+ * of them individually, we just wait for them all to finish.
+ */
+ if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
+ activity != ZPOOL_WAIT_TRIM)
+ return (EINVAL);
+
+ if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
+ return (EINVAL);
+
+ spa_t *spa;
+ int error = spa_open(pool, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Increment the spa's waiter count so that we can call spa_close and
+ * still ensure that the spa_t doesn't get freed before this thread is
+ * finished with it when the pool is exported. We want to call spa_close
+ * before we start waiting because otherwise the additional ref would
+ * prevent the pool from being exported or destroyed throughout the
+ * potentially long wait.
+ */
+ mutex_enter(&spa->spa_activities_lock);
+ spa->spa_waiters++;
+ spa_close(spa, FTAG);
+
+ *waited = B_FALSE;
+ for (;;) {
+ boolean_t in_progress;
+ error = spa_activity_in_progress(spa, activity, use_tag, tag,
+ &in_progress);
+
+ if (error || !in_progress || spa->spa_waiters_cancel)
+ break;
+
+ *waited = B_TRUE;
+
+ if (cv_wait_sig(&spa->spa_activities_cv,
+ &spa->spa_activities_lock) == 0) {
+ error = EINTR;
+ break;
+ }
+ }
+
+ spa->spa_waiters--;
+ cv_signal(&spa->spa_waiters_cv);
+ mutex_exit(&spa->spa_activities_lock);
+
+ return (error);
+}
+
+/*
+ * Wait for a particular instance of the specified activity to complete, where
+ * the instance is identified by 'tag'.
+ */
+int
+spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
+ boolean_t *waited)
+{
+ return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
+}
+
+/*
+ * Wait for all instances of the specified activity to complete.
+ */
+int
+spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
+{
+ return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
+}
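+
+/*
+ * Example usage, as a sketch (the ioctl plumbing that normally drives these
+ * calls is omitted; "tank" and vdev_guid are placeholders):
+ *
+ *	boolean_t waited;
+ *	int error = spa_wait("tank", ZPOOL_WAIT_SCRUB, &waited);
+ *
+ *	// wait for one device's initialize instance, identified by its guid:
+ *	error = spa_wait_tag("tank", ZPOOL_WAIT_INITIALIZE, vdev_guid, &waited);
+ */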
+
+sysevent_t *
+spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
+{
+ sysevent_t *ev = NULL;
+#ifdef _KERNEL
+ nvlist_t *resource;
+
+ resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
+ if (resource) {
+ ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
+ ev->resource = resource;
+ }
+#endif
+ return (ev);
+}
+
+void
+spa_event_post(sysevent_t *ev)
+{
+#ifdef _KERNEL
+ if (ev) {
+ zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
+ kmem_free(ev, sizeof (*ev));
+ }
+#endif
+}
+
+/*
+ * Post a zevent corresponding to the given sysevent. The 'name' must be one
+ * of the event definitions in sys/sysevent/eventdefs.h. The payload will be
+ * filled in from the spa and (optionally) the vdev. This doesn't do anything
+ * in the userland libzpool, as we don't want consumers to misinterpret ztest
+ * or zdb as real changes.
+ */
+void
+spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
+{
+ spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
+}
+
+/* state manipulation functions */
+EXPORT_SYMBOL(spa_open);
+EXPORT_SYMBOL(spa_open_rewind);
+EXPORT_SYMBOL(spa_get_stats);
+EXPORT_SYMBOL(spa_create);
+EXPORT_SYMBOL(spa_import);
+EXPORT_SYMBOL(spa_tryimport);
+EXPORT_SYMBOL(spa_destroy);
+EXPORT_SYMBOL(spa_export);
+EXPORT_SYMBOL(spa_reset);
+EXPORT_SYMBOL(spa_async_request);
+EXPORT_SYMBOL(spa_async_suspend);
+EXPORT_SYMBOL(spa_async_resume);
+EXPORT_SYMBOL(spa_inject_addref);
+EXPORT_SYMBOL(spa_inject_delref);
+EXPORT_SYMBOL(spa_scan_stat_init);
+EXPORT_SYMBOL(spa_scan_get_stats);
+
+/* device manipulation */
+EXPORT_SYMBOL(spa_vdev_add);
+EXPORT_SYMBOL(spa_vdev_attach);
+EXPORT_SYMBOL(spa_vdev_detach);
+EXPORT_SYMBOL(spa_vdev_setpath);
+EXPORT_SYMBOL(spa_vdev_setfru);
+EXPORT_SYMBOL(spa_vdev_split_mirror);
+
+/* spare state (which is global across all pools) */
+EXPORT_SYMBOL(spa_spare_add);
+EXPORT_SYMBOL(spa_spare_remove);
+EXPORT_SYMBOL(spa_spare_exists);
+EXPORT_SYMBOL(spa_spare_activate);
+
+/* L2ARC state (which is global across all pools) */
+EXPORT_SYMBOL(spa_l2cache_add);
+EXPORT_SYMBOL(spa_l2cache_remove);
+EXPORT_SYMBOL(spa_l2cache_exists);
+EXPORT_SYMBOL(spa_l2cache_activate);
+EXPORT_SYMBOL(spa_l2cache_drop);
+
+/* scanning */
+EXPORT_SYMBOL(spa_scan);
+EXPORT_SYMBOL(spa_scan_stop);
+
+/* spa syncing */
+EXPORT_SYMBOL(spa_sync); /* only for DMU use */
+EXPORT_SYMBOL(spa_sync_allpools);
+
+/* properties */
+EXPORT_SYMBOL(spa_prop_set);
+EXPORT_SYMBOL(spa_prop_get);
+EXPORT_SYMBOL(spa_prop_clear_bootfs);
+
+/* asynchronous event notification */
+EXPORT_SYMBOL(spa_event_notify);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW,
+ "log2(fraction of arc that can be used by inflight I/Os when "
+ "verifying pool during import");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
+ "Set to traverse metadata on pool import");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
+ "Set to traverse data on pool import");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
+ "Print vdev tree to zfs_dbgmsg during pool import");
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
+ "Percentage of CPUs to run an IO worker thread");
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW,
+ "Allow importing pool with up to this number of missing top-level "
+ "vdevs (in read-only mode)");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW,
+ "Set the livelist condense zthr to pause");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW,
+ "Set the livelist condense synctask to pause");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW,
+ "Whether livelist condensing was canceled in the synctask");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW,
+ "Whether livelist condensing was canceled in the zthr function");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW,
+ "Whether extra ALLOC blkptrs were added to a livelist entry while it "
+ "was being condensed");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/spa_boot.c b/sys/contrib/openzfs/module/zfs/spa_boot.c
new file mode 100644
index 000000000000..674394650f82
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_boot.c
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifdef _KERNEL
+
+#include <sys/zio.h>
+#include <sys/spa_boot.h>
+#include <sys/sunddi.h>
+
+char *
+spa_get_bootprop(char *propname)
+{
+ char *value;
+
+ if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
+ DDI_PROP_DONTPASS, propname, &value) != DDI_SUCCESS)
+ return (NULL);
+ return (value);
+}
+
+void
+spa_free_bootprop(char *value)
+{
+ ddi_prop_free(value);
+}
+
+#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c
new file mode 100644
index 000000000000..5fb614467273
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c
@@ -0,0 +1,636 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * Storage Pool Checkpoint
+ *
+ * A storage pool checkpoint can be thought of as a pool-wide snapshot or
+ * a stable version of extreme rewind that guarantees no blocks from the
+ * checkpointed state will have been overwritten. It remembers the entire
+ * state of the storage pool (e.g. snapshots, dataset names, etc.) from the
+ * point that it was taken and the user can rewind back to that point even if
+ * they applied destructive operations on their datasets or even enabled new
+ * zpool on-disk features. If a pool has a checkpoint that is no longer
+ * needed, the user can discard it.
+ *
+ * == On disk data structures used ==
+ *
+ * - The pool has a new feature flag and a new entry in the MOS. The feature
+ * flag is set to active when we create the checkpoint and remains active
+ * until the checkpoint is fully discarded. The entry in the MOS config
+ * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
+ * references the state of the pool when we take the checkpoint. The entry
+ * remains populated until we start discarding the checkpoint or we rewind
+ * back to it.
+ *
+ * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
+ * which persists until the checkpoint is fully discarded. The space map
+ * contains entries that have been freed in the current state of the pool
+ * but we want to keep around in case we decide to rewind to the checkpoint.
+ * [see vdev_checkpoint_sm]
+ *
+ * - Each metaslab's ms_sm space map behaves the same as without the
+ * checkpoint, with the only exception being the scenario when we free
+ * blocks that belong to the checkpoint. In this case, these blocks remain
+ * ALLOCATED in the metaslab's space map and they are added as FREE in the
+ * vdev's checkpoint space map.
+ *
+ * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that
+ * the uberblock was checkpointed. For normal uberblocks this field is 0.
+ *
+ * == Overview of operations ==
+ *
+ * - To create a checkpoint, we first wait for the current TXG to be synced,
+ * so we can use the most recently synced uberblock (spa_ubsync) as the
+ * checkpointed uberblock. Then we use an early synctask to place that
+ * uberblock in MOS config, increment the feature flag for the checkpoint
+ * (marking it active), and set spa_checkpoint_txg (see its use below)
+ * to the TXG of the checkpointed uberblock. We use an early synctask for
+ * the aforementioned operations to ensure that no blocks were dirtied
+ * between the current TXG and the TXG of the checkpointed uberblock
+ * (i.e. the previous txg).
+ *
+ * - When a checkpoint exists, we need to ensure that the blocks that
+ * belong to the checkpoint are freed but never reused. This means that
+ * these blocks should never end up in the ms_allocatable or the ms_freeing
+ * trees of a metaslab. Therefore, whenever there is a checkpoint the new
+ * ms_checkpointing tree is used in addition to the aforementioned ones.
+ *
+ * Whenever a block is freed and we find out that it is referenced by the
+ * checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
+ * we place it in the ms_checkpointing tree instead of the ms_freeing tree.
+ * This way, we divide the blocks that are being freed into checkpointed
+ * and not-checkpointed blocks.
+ *
+ * In order to persist these frees, we write the extents from the
+ * ms_freeing tree to the ms_sm as usual, and the extents from the
+ * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
+ * checkpointed extents will remain allocated in the metaslab's ms_sm space
+ * map, and therefore won't be reused [see metaslab_sync()]. In addition,
+ * when we discard the checkpoint, we can find the entries that have
+ * actually been freed in vdev_checkpoint_sm.
+ * [see spa_checkpoint_discard_thread_sync()]
+ *
+ * - To discard the checkpoint we use an early synctask to delete the
+ * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
+ * and wakeup the discarding zthr thread (an open-context async thread).
+ * We use an early synctask to ensure that the operation happens before any
+ * new data end up in the checkpoint's data structures.
+ *
+ * Once the synctask is done and the discarding zthr is awake, we discard
+ * the checkpointed data over multiple TXGs by having the zthr prefetch
+ * entries from vdev_checkpoint_sm and then start a synctask that places
+ * them as free blocks into their respective ms_allocatable and ms_sm
+ * structures.
+ * [see spa_checkpoint_discard_thread()]
+ *
+ * When there are no entries left in the vdev_checkpoint_sm of all
+ * top-level vdevs, a final synctask runs that decrements the feature flag.
+ *
+ * - To rewind to the checkpoint, we first use the current uberblock and
+ * open the MOS so we can access the checkpointed uberblock from the MOS
+ * config. After we retrieve the checkpointed uberblock, we use it as the
+ * current uberblock for the pool by writing it to disk with an updated
+ * TXG, opening its version of the MOS, and moving on as usual from there.
+ * [see spa_ld_checkpoint_rewind()]
+ *
+ * An important note on rewinding to the checkpoint has to do with how we
+ * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
+ * blocks that have not been claimed by the time we took the checkpoint
+ * as they should no longer be valid.
+ * [see comment in zil_claim()]
+ *
+ * == Miscellaneous information ==
+ *
+ * - In the hypothetical event that we take a checkpoint, remove a vdev,
+ * and attempt to rewind, the rewind would fail as the checkpointed
+ * uberblock would reference data in the removed device. For this reason
+ * and others of similar nature, we disallow the following operations that
+ * can change the config:
+ * vdev removal and attach/detach, mirror splitting, and pool reguid.
+ *
+ * - As most of the checkpoint logic is implemented in the SPA and doesn't
+ * distinguish datasets when it comes to space accounting, having a
+ * checkpoint can potentially break the boundaries set by dataset
+ * reservations.
+ */
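+
+/*
+ * The externally visible entry points below boil down to the following
+ * lifecycle, shown here as a minimal sketch (error handling omitted):
+ *
+ *	error = spa_checkpoint("tank");		// take a checkpoint
+ *	...
+ *	error = spa_checkpoint_discard("tank");	// begin discarding it; the
+ *						// discard zthr completes the
+ *						// work over multiple txgs
+ */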
+
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/metaslab_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/spa_checkpoint.h>
+#include <sys/vdev_impl.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+
+/*
+ * The following parameter limits the amount of memory to be used for the
+ * prefetching of the checkpoint space map done on each vdev while
+ * discarding the checkpoint.
+ *
+ * The reason it exists is because top-level vdevs with long checkpoint
+ * space maps can potentially take up a lot of memory depending on the
+ * amount of checkpointed data that has been freed within them while
+ * the pool had a checkpoint.
+ */
+unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
+
+int
+spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
+
+ bzero(pcs, sizeof (pool_checkpoint_stat_t));
+
+ int error = zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
+ ASSERT(error == 0 || error == ENOENT);
+
+ if (error == ENOENT)
+ pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
+ else
+ pcs->pcs_state = CS_CHECKPOINT_EXISTS;
+
+ pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
+ pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;
+
+ return (0);
+}
+
+static void
+spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = arg;
+
+ spa->spa_checkpoint_info.sci_timestamp = 0;
+
+ spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
+ spa_notify_waiters(spa);
+
+ spa_history_log_internal(spa, "spa discard checkpoint", tx,
+ "finished discarding checkpointed state from the pool");
+}
+
+typedef struct spa_checkpoint_discard_sync_callback_arg {
+ vdev_t *sdc_vd;
+ uint64_t sdc_txg;
+ uint64_t sdc_entry_limit;
+} spa_checkpoint_discard_sync_callback_arg_t;
+
+static int
+spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
+{
+ spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
+ vdev_t *vd = sdc->sdc_vd;
+ metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+ uint64_t end = sme->sme_offset + sme->sme_run;
+
+ if (sdc->sdc_entry_limit == 0)
+ return (SET_ERROR(EINTR));
+
+ /*
+ * Since the space map is not condensed, we know that
+ * none of its entries crosses the boundaries of
+ * its respective metaslab.
+ *
+ * That said, there is no fundamental requirement that
+ * the checkpoint's space map entries should not cross
+ * metaslab boundaries. So if needed we could add code
+ * that handles metaslab-crossing segments in the future.
+ */
+ VERIFY3U(sme->sme_type, ==, SM_FREE);
+ VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * At this point we should not be processing any
+ * other frees concurrently, so the lock is technically
+ * unnecessary. We use the lock anyway though to
+ * potentially save ourselves from future headaches.
+ */
+ mutex_enter(&ms->ms_lock);
+ if (range_tree_is_empty(ms->ms_freeing))
+ vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
+ range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
+ mutex_exit(&ms->ms_lock);
+
+ ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
+ sme->sme_run);
+ ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
+
+ vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
+ vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
+ sdc->sdc_entry_limit--;
+
+ return (0);
+}
+
+#ifdef ZFS_DEBUG
+static void
+spa_checkpoint_accounting_verify(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t ckpoint_sm_space_sum = 0;
+ uint64_t vs_ckpoint_space_sum = 0;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ if (vd->vdev_checkpoint_sm != NULL) {
+ ckpoint_sm_space_sum +=
+ -space_map_allocated(vd->vdev_checkpoint_sm);
+ vs_ckpoint_space_sum +=
+ vd->vdev_stat.vs_checkpoint_space;
+ ASSERT3U(ckpoint_sm_space_sum, ==,
+ vs_ckpoint_space_sum);
+ } else {
+ ASSERT0(vd->vdev_stat.vs_checkpoint_space);
+ }
+ }
+ ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum);
+}
+#endif
+
+static void
+spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_t *vd = arg;
+ int error;
+
+ /*
+ * The space map callback is applied only to non-debug entries.
+ * Because the number of debug entries is less than or equal to the
+ * number of non-debug entries, we want to ensure that we only
+ * read what we prefetched from open-context.
+ *
+ * Thus, we set the maximum entries that the space map callback
+ * will be applied to be half the entries that could fit in the
+ * imposed memory limit.
+ *
+ * Note that since this is a conservative estimate we also
+ * assume the worst case scenario in our computation where each
+ * entry is two-word.
+ */
+ uint64_t max_entry_limit =
+ (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
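+ /*
+ * With the default zfs_spa_discard_memory_limit of 16 MiB and a
+ * worst-case two-word (16-byte) entry, this works out to
+ * 16777216 / 16 / 2 = 524288 entries per synctask invocation.
+ */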
+
+ /*
+ * Iterate from the end of the space map towards the beginning,
+ * placing its entries on ms_freeing and removing them from the
+ * space map. The iteration stops if one of the following
+ * conditions is true:
+ *
+ * 1] We reached the beginning of the space map. At this point
+ * the space map should be completely empty and
+ * space_map_incremental_destroy should have returned 0.
+ * The next step would be to free and close the space map
+ * and remove its entry from its vdev's top zap. This allows
+ * spa_checkpoint_discard_thread() to move on to the next vdev.
+ *
+ * 2] We reached the memory limit (amount of memory used to hold
+ * space map entries in memory) and space_map_incremental_destroy
+ * returned EINTR. This means that there are entries remaining
+ * in the space map that will be cleared in a future invocation
+ * of this function by spa_checkpoint_discard_thread().
+ */
+ spa_checkpoint_discard_sync_callback_arg_t sdc;
+ sdc.sdc_vd = vd;
+ sdc.sdc_txg = tx->tx_txg;
+ sdc.sdc_entry_limit = max_entry_limit;
+
+ uint64_t words_before =
+ space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+
+ error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
+ spa_checkpoint_discard_sync_callback, &sdc, tx);
+
+ uint64_t words_after =
+ space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+
+#ifdef ZFS_DEBUG
+ spa_checkpoint_accounting_verify(vd->vdev_spa);
+#endif
+
+ zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
+ "deleted %llu words - %llu words are left",
+ tx->tx_txg, vd->vdev_id, (words_before - words_after),
+ words_after);
+
+ if (error != EINTR) {
+ if (error != 0) {
+ zfs_panic_recover("zfs: error %d was returned "
+ "while incrementally destroying the checkpoint "
+ "space map of vdev %llu\n",
+ error, vd->vdev_id);
+ }
+ ASSERT0(words_after);
+ ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
+ ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
+
+ space_map_free(vd->vdev_checkpoint_sm, tx);
+ space_map_close(vd->vdev_checkpoint_sm);
+ vd->vdev_checkpoint_sm = NULL;
+
+ VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
+ }
+}
+
+static boolean_t
+spa_checkpoint_discard_is_done(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(!spa_has_checkpoint(spa));
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
+ return (B_FALSE);
+ ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
+ }
+
+ return (B_TRUE);
+}
+
+/* ARGSUSED */
+boolean_t
+spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (B_FALSE);
+
+ if (spa_has_checkpoint(spa))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+void
+spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ while (vd->vdev_checkpoint_sm != NULL) {
+ space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
+ int numbufs;
+ dmu_buf_t **dbp;
+
+ if (zthr_iscancelled(zthr))
+ return;
+
+ ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
+
+ uint64_t size = MIN(space_map_length(checkpoint_sm),
+ zfs_spa_discard_memory_limit);
+ uint64_t offset =
+ space_map_length(checkpoint_sm) - size;
+
+ /*
+ * Ensure that the part of the space map that will
+ * be destroyed by the synctask is prefetched into
+ * memory before the synctask runs.
+ */
+ int error = dmu_buf_hold_array_by_bonus(
+ checkpoint_sm->sm_dbuf, offset, size,
+ B_TRUE, FTAG, &numbufs, &dbp);
+ if (error != 0) {
+ zfs_panic_recover("zfs: error %d was returned "
+ "while prefetching checkpoint space map "
+ "entries of vdev %llu\n",
+ error, vd->vdev_id);
+ }
+
+ VERIFY0(dsl_sync_task(spa->spa_name, NULL,
+ spa_checkpoint_discard_thread_sync, vd,
+ 0, ZFS_SPACE_CHECK_NONE));
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ }
+ }
+
+ VERIFY(spa_checkpoint_discard_is_done(spa));
+ VERIFY0(spa->spa_checkpoint_info.sci_dspace);
+ VERIFY0(dsl_sync_task(spa->spa_name, NULL,
+ spa_checkpoint_discard_complete_sync, spa,
+ 0, ZFS_SPACE_CHECK_NONE));
+}
+
+
+/* ARGSUSED */
+static int
+spa_checkpoint_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ENOTSUP));
+
+ if (!spa_top_vdevs_spacemap_addressable(spa))
+ return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));
+
+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
+ return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
+
+ if (spa->spa_checkpoint_txg != 0)
+ return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ uberblock_t checkpoint = spa->spa_ubsync;
+
+ /*
+ * At this point, there should not be a checkpoint in the MOS.
+ */
+ ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);
+
+ ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
+ ASSERT0(spa->spa_checkpoint_info.sci_dspace);
+
+ /*
+ * Since the checkpointed uberblock is the one that just got synced
+ * (we use spa_ubsync), its txg must be equal to the txg number of
+ * the txg we are syncing, minus 1.
+ */
+ ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
+
+ /*
+ * Once the checkpoint is in place, we need to ensure that none of
+ * its blocks will be marked for reuse after it has been freed.
+ * When there is a checkpoint and a block is freed, we compare its
+ * birth txg to the txg of the checkpointed uberblock to see if the
+ * block is part of the checkpoint or not. Therefore, we have to set
+ * spa_checkpoint_txg before any frees happen in this txg (which is
+ * why this is done as an early_synctask as explained in the comment
+ * in spa_checkpoint()).
+ */
+ spa->spa_checkpoint_txg = checkpoint.ub_txg;
+ spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
+
+ checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
+ VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
+ sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
+ &checkpoint, tx));
+
+ /*
+ * Increment the feature refcount and thus activate the feature.
+ * Note that the feature will be deactivated when we've
+ * completely discarded all checkpointed state (both vdev
+ * space maps and uberblock).
+ */
+ spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
+
+ spa_history_log_internal(spa, "spa checkpoint", tx,
+ "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg);
+}
+
+/*
+ * Create a checkpoint for the pool.
+ */
+int
+spa_checkpoint(const char *pool)
+{
+ int error;
+ spa_t *spa;
+
+ error = spa_open(pool, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+
+ /*
+ * Wait for current syncing txg to finish so the latest synced
+ * uberblock (spa_ubsync) has all the changes that we expect
+ * to see if we were to revert later to the checkpoint. In other
+ * words we want the checkpointed uberblock to include/reference
+ * all the changes that were pending at the time that we issued
+ * the checkpoint command.
+ */
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+ * As the checkpointed uberblock references blocks from the previous
+ * txg (spa_ubsync), we want to ensure that we are not freeing any of
+ * these blocks in the same txg that the following synctask will
+ * run. Thus, we run it as an early synctask, so the dirty changes
+ * that are synced to disk afterwards during zios and other synctasks
+ * do not reuse checkpointed blocks.
+ */
+ error = dsl_early_sync_task(pool, spa_checkpoint_check,
+ spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
+
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
+
+ if (spa->spa_checkpoint_txg == 0)
+ return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
+
+ VERIFY0(zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, tx));
+
+ spa->spa_checkpoint_txg = 0;
+
+ zthr_wakeup(spa->spa_checkpoint_discard_zthr);
+
+ spa_history_log_internal(spa, "spa discard checkpoint", tx,
+ "started discarding checkpointed state from the pool");
+}
+
+/*
+ * Discard the checkpoint from a pool.
+ */
+int
+spa_checkpoint_discard(const char *pool)
+{
+ /*
+ * Similarly to spa_checkpoint(), we want our synctask to run
+ * before any pending dirty data are written to disk so they
+ * won't end up in the checkpoint's data structures (e.g.
+ * ms_checkpointing and vdev_checkpoint_sm) and re-create any
+ * space maps that the discarding open-context thread has
+ * deleted.
+ * [see spa_checkpoint_discard_sync and spa_checkpoint_discard_thread]
+ */
+ return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
+ spa_checkpoint_discard_sync, NULL, 0,
+ ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
+}
+
+EXPORT_SYMBOL(spa_checkpoint_get_stats);
+EXPORT_SYMBOL(spa_checkpoint_discard_thread);
+EXPORT_SYMBOL(spa_checkpoint_discard_thread_check);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, ULONG, ZMOD_RW,
+ "Limit for memory used in prefetching the checkpoint space map done "
+ "on each vdev while discarding the checkpoint");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c
new file mode 100644
index 000000000000..4a3144313267
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_config.c
@@ -0,0 +1,623 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ */
+
+#include <sys/spa.h>
+#include <sys/file.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/nvpair.h>
+#include <sys/uio.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/systeminfo.h>
+#include <sys/sunddi.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_file.h>
+#ifdef _KERNEL
+#include <sys/zone.h>
+#endif
+
+/*
+ * Pool configuration repository.
+ *
+ * Pool configuration is stored as a packed nvlist on the filesystem. By
+ * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
+ * (when the ZFS module is loaded). Pools can also have the 'cachefile'
+ * property set that allows them to be stored in an alternate location under
+ * the control of external software.
+ *
+ * For each cache file, we have a single nvlist which holds all the
+ * configuration information. When the module loads, we read this information
+ * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is
+ * maintained independently in spa.c. Whenever the namespace is modified, or
+ * the configuration of a pool is changed, we call spa_write_cachefile(), which
+ * walks through all the active pools and writes the configuration to disk.
+ */
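+
+/*
+ * Unpacked, a cache file is simply an nvlist keyed by pool name, e.g.
+ * (the pool names here are placeholders):
+ *
+ *	"tank"   -> { pool config nvlist }
+ *	"backup" -> { pool config nvlist }
+ */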
+
+static uint64_t spa_config_generation = 1;
+
+/*
+ * This can be overridden in userland to preserve an alternate namespace for
+ * userland pools when doing testing.
+ */
+char *spa_config_path = ZPOOL_CACHE;
+int zfs_autoimport_disable = 1;
+
+/*
+ * Called when the module is first loaded, this routine loads the configuration
+ * file into the SPA namespace. It does not actually open or load the pools; it
+ * only populates the namespace.
+ */
+void
+spa_config_load(void)
+{
+ void *buf = NULL;
+ nvlist_t *nvlist, *child;
+ nvpair_t *nvpair;
+ char *pathname;
+ zfs_file_t *fp;
+ zfs_file_attr_t zfa;
+ uint64_t fsize;
+ int err;
+
+#ifdef _KERNEL
+ if (zfs_autoimport_disable)
+ return;
+#endif
+
+ /*
+ * Open the configuration file.
+ */
+ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
+
+ err = zfs_file_open(pathname, O_RDONLY, 0, &fp);
+
+#ifdef __FreeBSD__
+ if (err)
+ err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp);
+#endif
+ kmem_free(pathname, MAXPATHLEN);
+
+ if (err)
+ return;
+
+ if (zfs_file_getattr(fp, &zfa))
+ goto out;
+
+ fsize = zfa.zfa_size;
+ buf = kmem_alloc(fsize, KM_SLEEP);
+
+ /*
+ * Read the nvlist from the file.
+ */
+ if (zfs_file_read(fp, buf, fsize, NULL) < 0)
+ goto out;
+
+ /*
+ * Unpack the nvlist.
+ */
+ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
+ goto out;
+
+ /*
+ * Iterate over all elements in the nvlist, creating a new spa_t for
+ * each one with the specified configuration.
+ */
+ mutex_enter(&spa_namespace_lock);
+ nvpair = NULL;
+ while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
+ if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
+ continue;
+
+ child = fnvpair_value_nvlist(nvpair);
+
+ if (spa_lookup(nvpair_name(nvpair)) != NULL)
+ continue;
+ (void) spa_add(nvpair_name(nvpair), child, NULL);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(nvlist);
+
+out:
+ if (buf != NULL)
+ kmem_free(buf, fsize);
+
+ zfs_file_close(fp);
+}
+
+static int
+spa_config_remove(spa_config_dirent_t *dp)
+{
+ int error = 0;
+
+ /*
+ * Remove the cache file. If zfs_file_unlink() is not supported by the
+ * platform, fall back to truncating the file, which is functionally
+ * equivalent.
+ */
+ error = zfs_file_unlink(dp->scd_path);
+ if (error == EOPNOTSUPP) {
+ int flags = O_RDWR | O_TRUNC;
+ zfs_file_t *fp;
+
+ error = zfs_file_open(dp->scd_path, flags, 0644, &fp);
+ if (error == 0) {
+ (void) zfs_file_fsync(fp, O_SYNC);
+ (void) zfs_file_close(fp);
+ }
+ }
+
+ return (error);
+}
+
+static int
+spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
+{
+ size_t buflen;
+ char *buf;
+ int oflags = O_RDWR | O_TRUNC | O_CREAT | O_LARGEFILE;
+ char *temp;
+ int err;
+ zfs_file_t *fp;
+
+ /*
+ * If the nvlist is empty (NULL), then remove the old cachefile.
+ */
+ if (nvl == NULL) {
+ err = spa_config_remove(dp);
+ if (err == ENOENT)
+ err = 0;
+
+ return (err);
+ }
+
+ /*
+ * Pack the configuration into a buffer.
+ */
+ buf = fnvlist_pack(nvl, &buflen);
+ temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ /*
+ * Write the configuration to disk. Due to the complexity involved
+ * in performing a rename and remove from within the kernel the file
+ * is instead truncated and overwritten in place. This way we always
+ * have a consistent view of the data or a zero length file.
+ */
+ err = zfs_file_open(dp->scd_path, oflags, 0644, &fp);
+ if (err == 0) {
+ err = zfs_file_write(fp, buf, buflen, NULL);
+ if (err == 0)
+ err = zfs_file_fsync(fp, O_SYNC);
+
+ zfs_file_close(fp);
+ if (err)
+ (void) spa_config_remove(dp);
+ }
+ fnvlist_pack_free(buf, buflen);
+ kmem_free(temp, MAXPATHLEN);
+ return (err);
+}
+
+/*
+ * Synchronize pool configuration to disk. This must be called with the
+ * namespace lock held. Synchronizing the pool cache is typically done after
+ * the configuration has been synced to the MOS. This exposes a window where
+ * the MOS config will have been updated but the cache file has not. If
+ * the system were to crash at that instant then the cached config may not
+ * contain the correct information to open the pool and an explicit import
+ * would be required.
+ */
+void
+spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
+{
+ spa_config_dirent_t *dp, *tdp;
+ nvlist_t *nvl;
+ char *pool_name;
+ boolean_t ccw_failure;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (!(spa_mode_global & SPA_MODE_WRITE))
+ return;
+
+ /*
+ * Iterate over all cachefiles for the pool, past or present. When the
+ * cachefile is changed, the new one is pushed onto this list, allowing
+ * us to update previous cachefiles that no longer contain this pool.
+ */
+ ccw_failure = B_FALSE;
+ for (dp = list_head(&target->spa_config_list); dp != NULL;
+ dp = list_next(&target->spa_config_list, dp)) {
+ spa_t *spa = NULL;
+ if (dp->scd_path == NULL)
+ continue;
+
+ /*
+ * Iterate over all pools, adding any matching pools to 'nvl'.
+ */
+ nvl = NULL;
+ while ((spa = spa_next(spa)) != NULL) {
+ /*
+ * Skip over our own pool if we're about to remove
+ * ourselves from the spa namespace or any pool that
+ * is readonly. Since we cannot guarantee that a
+ * readonly pool would successfully import upon reboot,
+ * we don't allow them to be written to the cache file.
+ */
+ if ((spa == target && removing) ||
+ !spa_writeable(spa))
+ continue;
+
+ mutex_enter(&spa->spa_props_lock);
+ tdp = list_head(&spa->spa_config_list);
+ if (spa->spa_config == NULL ||
+ tdp == NULL ||
+ tdp->scd_path == NULL ||
+ strcmp(tdp->scd_path, dp->scd_path) != 0) {
+ mutex_exit(&spa->spa_props_lock);
+ continue;
+ }
+
+ if (nvl == NULL)
+ nvl = fnvlist_alloc();
+
+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME)
+ pool_name = fnvlist_lookup_string(
+ spa->spa_config, ZPOOL_CONFIG_POOL_NAME);
+ else
+ pool_name = spa_name(spa);
+
+ fnvlist_add_nvlist(nvl, pool_name, spa->spa_config);
+ mutex_exit(&spa->spa_props_lock);
+ }
+
+ error = spa_config_write(dp, nvl);
+ if (error != 0)
+ ccw_failure = B_TRUE;
+ nvlist_free(nvl);
+ }
+
+ if (ccw_failure) {
+ /*
+ * Keep trying so that configuration data is
+ * written if/when any temporary filesystem
+ * resource issues are resolved.
+ */
+ if (target->spa_ccw_fail_time == 0) {
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
+ target, NULL, NULL, NULL, 0);
+ }
+ target->spa_ccw_fail_time = gethrtime();
+ spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
+ } else {
+ /*
+ * Do not rate limit future attempts to update
+ * the config cache.
+ */
+ target->spa_ccw_fail_time = 0;
+ }
+
+ /*
+ * Remove any config entries older than the current one.
+ */
+ dp = list_head(&target->spa_config_list);
+ while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) {
+ list_remove(&target->spa_config_list, tdp);
+ if (tdp->scd_path != NULL)
+ spa_strfree(tdp->scd_path);
+ kmem_free(tdp, sizeof (spa_config_dirent_t));
+ }
+
+ spa_config_generation++;
+
+ if (postsysevent)
+ spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
+}
+
+/*
+ * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
+ * and we don't want to allow the local zone to see all the pools anyway.
+ * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
+ * information for all pools visible within the zone.
+ */
+nvlist_t *
+spa_all_configs(uint64_t *generation)
+{
+ nvlist_t *pools;
+ spa_t *spa = NULL;
+
+ if (*generation == spa_config_generation)
+ return (NULL);
+
+ pools = fnvlist_alloc();
+
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(spa_name(spa), NULL)) {
+ mutex_enter(&spa->spa_props_lock);
+ fnvlist_add_nvlist(pools, spa_name(spa),
+ spa->spa_config);
+ mutex_exit(&spa->spa_props_lock);
+ }
+ }
+ *generation = spa_config_generation;
+ mutex_exit(&spa_namespace_lock);
+
+ return (pools);
+}
+
+void
+spa_config_set(spa_t *spa, nvlist_t *config)
+{
+ mutex_enter(&spa->spa_props_lock);
+ if (spa->spa_config != NULL && spa->spa_config != config)
+ nvlist_free(spa->spa_config);
+ spa->spa_config = config;
+ mutex_exit(&spa->spa_props_lock);
+}
+
+/*
+ * Generate the pool's configuration based on the current in-core state.
+ *
+ * We infer whether to generate a complete config or just one top-level config
+ * based on whether vd is the root vdev.
+ */
+nvlist_t *
+spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
+{
+ nvlist_t *config, *nvroot;
+ vdev_t *rvd = spa->spa_root_vdev;
+ unsigned long hostid = 0;
+ boolean_t locked = B_FALSE;
+ uint64_t split_guid;
+ char *pool_name;
+
+ if (vd == NULL) {
+ vd = rvd;
+ locked = B_TRUE;
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ }
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
+ (SCL_CONFIG | SCL_STATE));
+
+ /*
+ * If txg is -1, report the current value of spa->spa_config_txg.
+ */
+ if (txg == -1ULL)
+ txg = spa->spa_config_txg;
+
+ /*
+ * Originally, users had to handle spa namespace collisions by either
+ * exporting the already imported pool or by specifying a new name for
+ * the pool with a conflicting name. In the case of root pools from
+ * virtual guests, neither approach to collision resolution is
+ * reasonable. This is addressed by extending the new name syntax with
+ * an option to specify that the new name is temporary. When specified,
+ * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us
+ * to use the previous name, which we do below.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) {
+ VERIFY0(nvlist_lookup_string(spa->spa_config,
+ ZPOOL_CONFIG_POOL_NAME, &pool_name));
+ } else
+ pool_name = spa_name(spa);
+
+ config = fnvlist_alloc();
+
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
+ fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa));
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata);
+ if (spa->spa_comment != NULL)
+ fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
+ spa->spa_comment);
+ if (spa->spa_compatibility != NULL)
+ fnvlist_add_string(config, ZPOOL_CONFIG_COMPATIBILITY,
+ spa->spa_compatibility);
+
+ hostid = spa_get_hostid(spa);
+ if (hostid != 0)
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid);
+ fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname()->nodename);
+
+ int config_gen_flags = 0;
+ if (vd != rvd) {
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+ vd->vdev_top->vdev_guid);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid);
+ if (vd->vdev_isspare)
+ fnvlist_add_uint64(config,
+ ZPOOL_CONFIG_IS_SPARE, 1ULL);
+ if (vd->vdev_islog)
+ fnvlist_add_uint64(config,
+ ZPOOL_CONFIG_IS_LOG, 1ULL);
+ vd = vd->vdev_top; /* label contains top config */
+ } else {
+ /*
+ * Only add the (potentially large) split information
+ * in the mos config, and not in the vdev labels
+ */
+ if (spa->spa_config_splitting != NULL)
+ fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
+ spa->spa_config_splitting);
+
+ fnvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS);
+
+ config_gen_flags |= VDEV_CONFIG_MOS;
+ }
+
+ /*
+ * Add the top-level config. We even add this on pools which
+ * don't support holes in the namespace.
+ */
+ vdev_top_config_generate(spa, config);
+
+ /*
+ * If we're splitting, record the original pool's guid.
+ */
+ if (spa->spa_config_splitting != NULL &&
+ nvlist_lookup_uint64(spa->spa_config_splitting,
+ ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, split_guid);
+ }
+
+ nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags);
+ fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot);
+ nvlist_free(nvroot);
+
+ /*
+ * Store what's necessary for reading the MOS in the label.
+ */
+ fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+ spa->spa_label_features);
+
+ if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
+ ddt_histogram_t *ddh;
+ ddt_stat_t *dds;
+ ddt_object_t *ddo;
+
+ ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh);
+ fnvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_HISTOGRAM,
+ (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t));
+ kmem_free(ddh, sizeof (ddt_histogram_t));
+
+ ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
+ ddt_get_dedup_object_stats(spa, ddo);
+ fnvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_OBJ_STATS,
+ (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t));
+ kmem_free(ddo, sizeof (ddt_object_t));
+
+ dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
+ ddt_get_dedup_stats(spa, dds);
+ fnvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_STATS,
+ (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t));
+ kmem_free(dds, sizeof (ddt_stat_t));
+ }
+
+ if (locked)
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ return (config);
+}
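+
+/*
+ * A sketch of the two modes described above (getstats left at 0):
+ *
+ *	// complete pool config; the root vdev is implied and the config
+ *	// lock is taken internally:
+ *	nvlist_t *config = spa_config_generate(spa, NULL, txg, 0);
+ *
+ *	// config as stored in a given top-level vdev's label:
+ *	nvlist_t *label_cfg = spa_config_generate(spa, tvd, txg, 0);
+ */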
+
+/*
+ * Update all disk labels, generate a fresh config based on the current
+ * in-core state, and sync the global config cache (do not sync the config
+ * cache if this is a booting rootpool).
+ */
+void
+spa_config_update(spa_t *spa, int what)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg;
+ int c;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ txg = spa_last_synced_txg(spa) + 1;
+ if (what == SPA_CONFIG_UPDATE_POOL) {
+ vdev_config_dirty(rvd);
+ } else {
+ /*
+ * If we have top-level vdevs that were added but have
+ * not yet been prepared for allocation, do that now.
+ * (It's safe now because the config cache is up to date,
+ * so it will be able to translate the new DVAs.)
+ * See comments in spa_vdev_add() for full details.
+ */
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ /*
+ * Explicitly skip vdevs that are indirect or
+ * log vdevs that are being removed. The reason
+ * is that both of those can have vdev_ms_array
+ * set to 0 and we wouldn't want to change their
+ * metaslab size nor call vdev_expand() on them.
+ */
+ if (!vdev_is_concrete(tvd) ||
+ (tvd->vdev_islog && tvd->vdev_removing))
+ continue;
+
+ if (tvd->vdev_ms_array == 0)
+ vdev_metaslab_set_size(tvd);
+ vdev_expand(tvd, txg);
+ }
+ }
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * Wait for the mosconfig to be regenerated and synced.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ /*
+ * Update the global config cache to reflect the new mosconfig.
+ */
+ if (!spa->spa_is_root) {
+ spa_write_cachefile(spa, B_FALSE,
+ what != SPA_CONFIG_UPDATE_POOL);
+ }
+
+ if (what == SPA_CONFIG_UPDATE_POOL)
+ spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
+}
+
+EXPORT_SYMBOL(spa_config_load);
+EXPORT_SYMBOL(spa_all_configs);
+EXPORT_SYMBOL(spa_config_set);
+EXPORT_SYMBOL(spa_config_generate);
+EXPORT_SYMBOL(spa_config_update);
+
+/* BEGIN CSTYLED */
+#ifdef __linux__
+/* string sysctls require a char array on FreeBSD */
+ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD,
+ "SPA config file (/etc/zfs/zpool.cache)");
+#endif
+
+ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW,
+ "Disable pool import at module load");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c
new file mode 100644
index 000000000000..fa5120eb61b3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c
@@ -0,0 +1,416 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the old last log is thrown out, the current log becomes
+ * the new last log, and a fresh current log is started. This way, if an error
+ * is somehow
+ * corrected, a new scrub will show that it no longer exists, and will be
+ * deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name and
+ * the value is the object name. Userland is then responsible for uniquifying
+ * this list and displaying it to the user.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+
+
+/*
+ * Convert a bookmark to a string.
+ */
+static void
+bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
+{
+ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+ (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
+}
+
+/*
+ * Convert a string to a bookmark
+ */
+#ifdef _KERNEL
+static void
+name_to_bookmark(char *buf, zbookmark_phys_t *zb)
+{
+ zb->zb_objset = zfs_strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_object = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
+#endif
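A minimal userland sketch (not part of the patch) of the ZAP key format that bookmark_to_name() emits and name_to_bookmark() parses, using strtoull() in place of the kernel-only zfs_strtonum():

#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

int
main(void)
{
	uint64_t objset = 0x36, object = 0x1c441, level = 0, blkid = 0x7;
	char key[64], *p;

	/* objset:object:level:blkid, all in hex, as in bookmark_to_name() */
	(void) snprintf(key, sizeof (key), "%llx:%llx:%llx:%llx",
	    (unsigned long long)objset, (unsigned long long)object,
	    (unsigned long long)level, (unsigned long long)blkid);
	(void) printf("ZAP key: %s\n", key);		/* "36:1c441:0:7" */

	/* Parse it back the way name_to_bookmark() does. */
	objset = strtoull(key, &p, 16);
	object = strtoull(p + 1, &p, 16);
	level = strtoull(p + 1, &p, 16);
	blkid = strtoull(p + 1, &p, 16);
	(void) printf("objset=%" PRIx64 " object=%" PRIx64 " level=%" PRIx64
	    " blkid=%" PRIx64 "\n", objset, object, level, blkid);
	return (0);
}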
+
+/*
+ * Log an uncorrectable error to the persistent error log. We add it to the
+ * spa's list of pending errors. The changes are actually synced out to disk
+ * during spa_errlog_sync().
+ */
+void
+spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
+{
+ spa_error_entry_t search;
+ spa_error_entry_t *new;
+ avl_tree_t *tree;
+ avl_index_t where;
+
+ /*
+ * If we are trying to import a pool, ignore any errors, as we won't be
+ * writing to the pool any time soon.
+ */
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
+ return;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * If we have had a request to rotate the log, log it to the next list
+ * instead of the current one.
+ */
+ if (spa->spa_scrub_active || spa->spa_scrub_finished)
+ tree = &spa->spa_errlist_scrub;
+ else
+ tree = &spa->spa_errlist_last;
+
+ search.se_bookmark = *zb;
+ if (avl_find(tree, &search, &where) != NULL) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+ new->se_bookmark = *zb;
+ avl_insert(tree, new, where);
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Return the number of errors currently in the error log. This is actually the
+ * sum of both the last log and the current log, since we don't know the union
+ * of these logs until we reach userland.
+ */
+uint64_t
+spa_get_errlog_size(spa_t *spa)
+{
+ uint64_t total = 0, count;
+
+ mutex_enter(&spa->spa_errlog_lock);
+ if (spa->spa_errlog_scrub != 0 &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+ &count) == 0)
+ total += count;
+
+ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+ &count) == 0)
+ total += count;
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += avl_numnodes(&spa->spa_errlist_last);
+ total += avl_numnodes(&spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ return (total);
+}
+
+#ifdef _KERNEL
+static int
+process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_phys_t zb;
+
+ if (obj == 0)
+ return (0);
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+
+ if (*count == 0) {
+ zap_cursor_fini(&zc);
+ return (SET_ERROR(ENOMEM));
+ }
+
+ name_to_bookmark(za.za_name, &zb);
+
+ if (copyout(&zb, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0) {
+ zap_cursor_fini(&zc);
+ return (SET_ERROR(EFAULT));
+ }
+
+ *count -= 1;
+ }
+
+ zap_cursor_fini(&zc);
+
+ return (0);
+}
+
+static int
+process_error_list(avl_tree_t *list, void *addr, size_t *count)
+{
+ spa_error_entry_t *se;
+
+ for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+
+ if (*count == 0)
+ return (SET_ERROR(ENOMEM));
+
+ if (copyout(&se->se_bookmark, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0)
+ return (SET_ERROR(EFAULT));
+
+ *count -= 1;
+ }
+
+ return (0);
+}
+#endif
+
+/*
+ * Copy all known errors to userland as an array of bookmarks. This is
+ * actually a union of the on-disk last log and current log, as well as any
+ * pending error requests.
+ *
+ * Because the act of reading the on-disk log could cause errors to be
+ * generated, we have two separate locks: one for the error log and one for the
+ * in-core error lists. We only need the error list lock to log an error, so
+ * we grab the error log lock while we read the on-disk logs, and only pick up
+ * the error list lock when we are finished.
+ */
+int
+spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+{
+ int ret = 0;
+
+#ifdef _KERNEL
+ mutex_enter(&spa->spa_errlog_lock);
+
+ ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
+
+ if (!ret && !spa->spa_scrub_finished)
+ ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
+ count);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+ count);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_last, uaddr,
+ count);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ mutex_exit(&spa->spa_errlog_lock);
+#endif
+
+ return (ret);
+}
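Note that process_error_log() and process_error_list() fill the caller's array from the back, decrementing *count as they go, so after spa_get_errlog() returns the valid bookmarks live at indices [*count, original count). A small userland sketch of that convention (not part of the patch; bookmark_t and fill_from_back() are illustrative stand-ins for zbookmark_phys_t and the kernel code):

#include <stdio.h>

typedef struct {
	unsigned long long objset, object, level, blkid;
} bookmark_t;

static int
fill_from_back(const bookmark_t *src, int nsrc, bookmark_t *dst, size_t *count)
{
	for (int i = 0; i < nsrc; i++) {
		if (*count == 0)
			return (-1);	/* caller's buffer was too small */
		dst[*count - 1] = src[i];
		*count -= 1;
	}
	return (0);
}

int
main(void)
{
	bookmark_t pending[2] = { { 0x36, 0x10, 0, 1 }, { 0x36, 0x11, 0, 2 } };
	bookmark_t out[8];
	size_t count = 8;

	(void) fill_from_back(pending, 2, out, &count);
	(void) printf("valid entries start at index %zu\n", count);	/* 6 */
	return (0);
}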
+
+/*
+ * Called when a scrub completes. This simply sets a bit telling us which AVL
+ * tree to add new errors to. spa_errlog_sync() is responsible for actually
+ * syncing the changes to the underlying objects.
+ */
+void
+spa_errlog_rotate(spa_t *spa)
+{
+ mutex_enter(&spa->spa_errlist_lock);
+ spa->spa_scrub_finished = B_TRUE;
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Discard any pending errors from the spa_t. Called when unloading a faulted
+ * pool, as the errors encountered during the open cannot be synced to disk.
+ */
+void
+spa_errlog_drain(spa_t *spa)
+{
+ spa_error_entry_t *se;
+ void *cookie;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Process a list of errors into the current on-disk log.
+ */
+static void
+sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
+{
+ spa_error_entry_t *se;
+ char buf[64];
+ void *cookie;
+
+ if (avl_numnodes(t) != 0) {
+ /* create log if necessary */
+ if (*obj == 0)
+ *obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE,
+ 0, tx);
+
+ /* add errors to the current log */
+ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
+ char *name = se->se_name ? se->se_name : "";
+
+ bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
+
+ (void) zap_update(spa->spa_meta_objset,
+ *obj, buf, 1, strlen(name) + 1, name, tx);
+ }
+
+ /* purge the error list */
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ }
+}
+
+/*
+ * Sync the error log out to disk. This is a little tricky because the act of
+ * writing the error log requires the spa_errlist_lock. So, we need to lock the
+ * error lists, take a copy of the lists, and then reinitialize them. Then, we
+ * drop the error list lock and take the error log lock, at which point we
+ * do the errlog processing. Then, if we encounter an I/O error during this
+ * process, we can successfully add the error to the list. Note that this will
+ * result in the perpetual recycling of errors, but it is an unlikely situation
+ * and not a performance critical operation.
+ */
+void
+spa_errlog_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ avl_tree_t scrub, last;
+ int scrub_finished;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Bail out early under normal circumstances.
+ */
+ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
+ avl_numnodes(&spa->spa_errlist_last) == 0 &&
+ !spa->spa_scrub_finished) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ spa_get_errlists(spa, &last, &scrub);
+ scrub_finished = spa->spa_scrub_finished;
+ spa->spa_scrub_finished = B_FALSE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+ mutex_enter(&spa->spa_errlog_lock);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ /*
+ * Sync out the current list of errors.
+ */
+ sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
+
+ /*
+ * Rotate the log if necessary.
+ */
+ if (scrub_finished) {
+ if (spa->spa_errlog_last != 0)
+ VERIFY(dmu_object_free(spa->spa_meta_objset,
+ spa->spa_errlog_last, tx) == 0);
+ spa->spa_errlog_last = spa->spa_errlog_scrub;
+ spa->spa_errlog_scrub = 0;
+
+ sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
+ }
+
+ /*
+ * Sync out any pending scrub errors.
+ */
+ sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
+
+ /*
+ * Update the MOS to reflect the new values.
+ */
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
+ &spa->spa_errlog_last, tx);
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
+ &spa->spa_errlog_scrub, tx);
+
+ dmu_tx_commit(tx);
+
+ mutex_exit(&spa->spa_errlog_lock);
+}
+
+#if defined(_KERNEL)
+/* error handling */
+EXPORT_SYMBOL(spa_log_error);
+EXPORT_SYMBOL(spa_get_errlog_size);
+EXPORT_SYMBOL(spa_get_errlog);
+EXPORT_SYMBOL(spa_errlog_rotate);
+EXPORT_SYMBOL(spa_errlog_drain);
+EXPORT_SYMBOL(spa_errlog_sync);
+EXPORT_SYMBOL(spa_get_errlists);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/spa_history.c b/sys/contrib/openzfs/module/zfs/spa_history.c
new file mode 100644
index 000000000000..0482e0f6c39d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_history.c
@@ -0,0 +1,634 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/cmn_err.h>
+#include <sys/sunddi.h>
+#include <sys/cred.h>
+#include "zfs_comutil.h"
+#include "zfs_gitrev.h"
+#ifdef _KERNEL
+#include <sys/zone.h>
+#endif
+
+/*
+ * Routines to manage the on-disk history log.
+ *
+ * The history log is stored as a dmu object containing
+ * <packed record length, record nvlist> tuples.
+ *
+ * Where "record nvlist" is an nvlist containing uint64_ts and strings, and
+ * "packed record length" is the packed length of the "record nvlist" stored
+ * as a little endian uint64_t.
+ *
+ * The log is implemented as a ring buffer, though the original creation
+ * of the pool ('zpool create') is never overwritten.
+ *
+ * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
+ * of 'spa_history' stores the offsets for logging/retrieving history as
+ * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
+ * where the 'zpool create' record is stored. This allows us to never
+ * overwrite the original creation of the pool. 'sh_phys_max_off' is the
+ * physical ending offset in bytes of the log. This tells you the length of
+ * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
+ * is added, 'sh_eof' is incremented by the size of the record.
+ * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
+ * This is where the consumer should start reading from after reading in
+ * the 'zpool create' portion of the log.
+ *
+ * 'sh_records_lost' keeps track of how many records have been overwritten
+ * and permanently lost.
+ */
+
+/* convert a logical offset to physical */
+static uint64_t
+spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
+{
+ uint64_t phys_len;
+
+ phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
+ return ((log_off - shpp->sh_pool_create_len) % phys_len
+ + shpp->sh_pool_create_len);
+}
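A standalone sketch (not part of the patch) applying the same mapping with concrete numbers: with sh_pool_create_len = 512 and sh_phys_max_off = 2048 the wrapping region is 1536 bytes long, so logical offsets 512, 2048, and 3584 all land on physical offset 512:

#include <stdio.h>
#include <stdint.h>

static uint64_t
log_to_phys(uint64_t log_off, uint64_t pool_create_len, uint64_t phys_max_off)
{
	uint64_t phys_len = phys_max_off - pool_create_len;

	/* Same arithmetic as spa_history_log_to_phys(). */
	return ((log_off - pool_create_len) % phys_len + pool_create_len);
}

int
main(void)
{
	for (uint64_t off = 512; off <= 3584; off += 1536)
		(void) printf("logical %4llu -> physical %llu\n",
		    (unsigned long long)off,
		    (unsigned long long)log_to_phys(off, 512, 2048));
	return (0);
}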
+
+void
+spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
+{
+ dmu_buf_t *dbp;
+ spa_history_phys_t *shpp;
+ objset_t *mos = spa->spa_meta_objset;
+
+ ASSERT0(spa->spa_history);
+ spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+ sizeof (spa_history_phys_t), tx);
+
+ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_HISTORY, sizeof (uint64_t), 1,
+ &spa->spa_history, tx));
+
+ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+ ASSERT3U(dbp->db_size, >=, sizeof (spa_history_phys_t));
+
+ shpp = dbp->db_data;
+ dmu_buf_will_dirty(dbp, tx);
+
+ /*
+ * Figure out maximum size of history log. We set it at
+ * 0.1% of pool size, with a max of 1G and min of 128KB.
+ */
+ shpp->sh_phys_max_off =
+ metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
+ shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30);
+ shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
+
+ dmu_buf_rele(dbp, FTAG);
+}
+
+/*
+ * Change 'sh_bof' to the beginning of the next record.
+ */
+static int
+spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t firstread, reclen, phys_bof;
+ char buf[sizeof (reclen)];
+ int err;
+
+ phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
+ firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
+
+ if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
+ buf, DMU_READ_PREFETCH)) != 0)
+ return (err);
+ if (firstread != sizeof (reclen)) {
+ if ((err = dmu_read(mos, spa->spa_history,
+ shpp->sh_pool_create_len, sizeof (reclen) - firstread,
+ buf + firstread, DMU_READ_PREFETCH)) != 0)
+ return (err);
+ }
+
+ reclen = LE_64(*((uint64_t *)buf));
+ shpp->sh_bof += reclen + sizeof (reclen);
+ shpp->sh_records_lost++;
+ return (0);
+}
+
+static int
+spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
+ dmu_tx_t *tx)
+{
+ uint64_t firstwrite, phys_eof;
+ objset_t *mos = spa->spa_meta_objset;
+ int err;
+
+ ASSERT(MUTEX_HELD(&spa->spa_history_lock));
+
+ /* see if we need to reset logical BOF */
+ while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
+ (shpp->sh_eof - shpp->sh_bof) <= len) {
+ if ((err = spa_history_advance_bof(spa, shpp)) != 0) {
+ return (err);
+ }
+ }
+
+ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+ firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
+ shpp->sh_eof += len;
+ dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);
+
+ len -= firstwrite;
+ if (len > 0) {
+ /* write out the rest at the beginning of physical file */
+ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
+ len, (char *)buf + firstwrite, tx);
+ }
+
+ return (0);
+}
+
+/*
+ * Post a history sysevent.
+ *
+ * The nvlist_t* passed into this function will be transformed into a new
+ * nvlist where:
+ *
+ * 1. Nested nvlists will be flattened to a single level
+ * 2. Keys will have their names normalized (to remove any problematic
+ * characters, such as whitespace)
+ *
+ * The nvlist_t passed into this function will be duplicated and should be
+ * freed by the caller.
+ *
+ */
+static void
+spa_history_log_notify(spa_t *spa, nvlist_t *nvl)
+{
+ nvlist_t *hist_nvl = fnvlist_alloc();
+ uint64_t uint64;
+ char *string;
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_WHO, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64);
+
+ spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT);
+
+ nvlist_free(hist_nvl);
+}
+
+/*
+ * Write out a history event.
+ */
+/*ARGSUSED*/
+static void
+spa_history_log_sync(void *arg, dmu_tx_t *tx)
+{
+ nvlist_t *nvl = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ spa_history_phys_t *shpp;
+ size_t reclen;
+ uint64_t le_len;
+ char *record_packed = NULL;
+ int ret;
+
+ /*
+ * If we have an older pool that doesn't have a command
+ * history object, create it now.
+ */
+ mutex_enter(&spa->spa_history_lock);
+ if (!spa->spa_history)
+ spa_history_create_obj(spa, tx);
+ mutex_exit(&spa->spa_history_lock);
+
+ /*
+ * Get the offset of where we need to write via the bonus buffer.
+ * Update the offset when the write completes.
+ */
+ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+ shpp = dbp->db_data;
+
+ dmu_buf_will_dirty(dbp, tx);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname()->nodename);
+
+ if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
+ zfs_dbgmsg("command: %s",
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD));
+ } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) {
+ if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) {
+ zfs_dbgmsg("txg %lld %s %s (id %llu) %s",
+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME),
+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
+ } else {
+ zfs_dbgmsg("txg %lld %s %s",
+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
+ }
+ /*
+ * The history sysevent is posted only for internal history
+ * messages to show what has happened, not how it happened. For
+ * example, the following command:
+ *
+ * # zfs destroy -r tank/foo
+ *
+ * will result in one sysevent posted per dataset that is
+ * destroyed as a result of the command - which could be more
+ * than one event in total. By contrast, if the sysevent was
+ * posted as a result of the ZPOOL_HIST_CMD key being present
+ * it would result in only one sysevent being posted with the
+ * full command line arguments, requiring the consumer to know
+ * how to parse and understand zfs(8) command invocations.
+ */
+ spa_history_log_notify(spa, nvl);
+ } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
+ zfs_dbgmsg("ioctl %s",
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
+ }
+
+ VERIFY3U(nvlist_pack(nvl, &record_packed, &reclen, NV_ENCODE_NATIVE,
+ KM_SLEEP), ==, 0);
+
+ mutex_enter(&spa->spa_history_lock);
+
+ /* write out the packed length as little endian */
+ le_len = LE_64((uint64_t)reclen);
+ ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
+ if (!ret)
+ ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
+
+ /* The first command is the create, which we keep forever */
+ if (ret == 0 && shpp->sh_pool_create_len == 0 &&
+ nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
+ shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof;
+ }
+
+ mutex_exit(&spa->spa_history_lock);
+ fnvlist_pack_free(record_packed, reclen);
+ dmu_buf_rele(dbp, FTAG);
+ fnvlist_free(nvl);
+}
+
+/*
+ * Write out a history event.
+ */
+int
+spa_history_log(spa_t *spa, const char *msg)
+{
+ int err;
+ nvlist_t *nvl = fnvlist_alloc();
+
+ fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg);
+ err = spa_history_log_nvl(spa, nvl);
+ fnvlist_free(nvl);
+ return (err);
+}
+
+int
+spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
+{
+ int err = 0;
+ dmu_tx_t *tx;
+ nvlist_t *nvarg, *in_nvl = NULL;
+
+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
+ return (SET_ERROR(EINVAL));
+
+ err = nvlist_lookup_nvlist(nvl, ZPOOL_HIST_INPUT_NVL, &in_nvl);
+ if (err == 0) {
+ (void) nvlist_remove_all(in_nvl, ZPOOL_HIDDEN_ARGS);
+ }
+
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ VERIFY0(nvlist_dup(nvl, &nvarg, KM_SLEEP));
+ if (spa_history_zone() != NULL) {
+ fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE,
+ spa_history_zone());
+ }
+ fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));
+
+ /*
+ * Since the history is recorded asynchronously, the effective time is
+ * now, which may be considerably before the change is made on disk.
+ */
+ fnvlist_add_uint64(nvarg, ZPOOL_HIST_TIME, gethrestime_sec());
+
+ /* Kick this off asynchronously; errors are ignored. */
+ dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvarg, tx);
+ dmu_tx_commit(tx);
+
+ /* spa_history_log_sync will free nvl */
+ return (err);
+}
+
+/*
+ * Read out the command history.
+ */
+int
+spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ uint64_t read_len, phys_read_off, phys_eof;
+ uint64_t leftover = 0;
+ spa_history_phys_t *shpp;
+ int err;
+
+ /*
+ * If the command history doesn't exist (older pool),
+ * that's ok, just return ENOENT.
+ */
+ if (!spa->spa_history)
+ return (SET_ERROR(ENOENT));
+
+ /*
+ * The history is logged asynchronously, so when the consumer requests
+ * the first chunk of history, make sure everything has been
+ * synced to disk so that we get it.
+ */
+ if (*offp == 0 && spa_writeable(spa))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
+ return (err);
+ shpp = dbp->db_data;
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ mutex_enter(&spa->spa_history_lock);
+ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+
+ if (*offp < shpp->sh_pool_create_len) {
+ /* read in just the zpool create history */
+ phys_read_off = *offp;
+ read_len = MIN(*len, shpp->sh_pool_create_len -
+ phys_read_off);
+ } else {
+ /*
+ * Need to reset passed in offset to BOF if the passed in
+ * offset has since been overwritten.
+ */
+ *offp = MAX(*offp, shpp->sh_bof);
+ phys_read_off = spa_history_log_to_phys(*offp, shpp);
+
+ /*
+ * Read up to the minimum of what the user passed down or
+ * the EOF (physical or logical). If we hit physical EOF,
+ * use 'leftover' to read from the physical BOF.
+ */
+ if (phys_read_off <= phys_eof) {
+ read_len = MIN(*len, phys_eof - phys_read_off);
+ } else {
+ read_len = MIN(*len,
+ shpp->sh_phys_max_off - phys_read_off);
+ if (phys_read_off + *len > shpp->sh_phys_max_off) {
+ leftover = MIN(*len - read_len,
+ phys_eof - shpp->sh_pool_create_len);
+ }
+ }
+ }
+
+ /* offset for consumer to use next */
+ *offp += read_len + leftover;
+
+ /* tell the consumer how much you actually read */
+ *len = read_len + leftover;
+
+ if (read_len == 0) {
+ mutex_exit(&spa->spa_history_lock);
+ dmu_buf_rele(dbp, FTAG);
+ return (0);
+ }
+
+ err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
+ DMU_READ_PREFETCH);
+ if (leftover && err == 0) {
+ err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
+ leftover, buf + read_len, DMU_READ_PREFETCH);
+ }
+ mutex_exit(&spa->spa_history_lock);
+
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+}
+
+/*
+ * The nvlist will be consumed by this call.
+ */
+static void
+log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
+ dmu_tx_t *tx, const char *fmt, va_list adx)
+{
+ char *msg;
+
+ /*
+ * If this is part of creating a pool, not everything is
+ * initialized yet, so don't bother logging the internal events.
+ * Likewise if the pool is not writeable.
+ */
+ if (spa_is_initializing(spa) || !spa_writeable(spa)) {
+ fnvlist_free(nvl);
+ return;
+ }
+
+ msg = kmem_vasprintf(fmt, adx);
+ fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg);
+ kmem_strfree(msg);
+
+ fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation);
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg);
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec());
+
+ if (dmu_tx_is_syncing(tx)) {
+ spa_history_log_sync(nvl, tx);
+ } else {
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ spa_history_log_sync, nvl, tx);
+ }
+ /* spa_history_log_sync() will free nvl */
+}
+
+void
+spa_history_log_internal(spa_t *spa, const char *operation,
+ dmu_tx_t *tx, const char *fmt, ...)
+{
+ dmu_tx_t *htx = tx;
+ va_list adx;
+
+ /* create a tx if we didn't get one */
+ if (tx == NULL) {
+ htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
+ dmu_tx_abort(htx);
+ return;
+ }
+ }
+
+ va_start(adx, fmt);
+ log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx);
+ va_end(adx);
+
+ /* if we didn't get a tx from the caller, commit the one we made */
+ if (tx == NULL)
+ dmu_tx_commit(htx);
+}
+
+void
+spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation,
+ dmu_tx_t *tx, const char *fmt, ...)
+{
+ va_list adx;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+ nvlist_t *nvl = fnvlist_alloc();
+
+ ASSERT(tx != NULL);
+
+ dsl_dataset_name(ds, namebuf);
+ fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object);
+
+ va_start(adx, fmt);
+ log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx);
+ va_end(adx);
+}
+
+void
+spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
+ dmu_tx_t *tx, const char *fmt, ...)
+{
+ va_list adx;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+ nvlist_t *nvl = fnvlist_alloc();
+
+ ASSERT(tx != NULL);
+
+ dsl_dir_name(dd, namebuf);
+ fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID,
+ dsl_dir_phys(dd)->dd_head_dataset_obj);
+
+ va_start(adx, fmt);
+ log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx);
+ va_end(adx);
+}
+
+void
+spa_history_log_version(spa_t *spa, const char *operation, dmu_tx_t *tx)
+{
+ utsname_t *u = utsname();
+
+ spa_history_log_internal(spa, operation, tx,
+ "pool version %llu; software version %s; uts %s %s %s %s",
+ (u_longlong_t)spa_version(spa), ZFS_META_GITREV,
+ u->nodename, u->release, u->version, u->machine);
+}
+
+#ifndef _KERNEL
+const char *
+spa_history_zone(void)
+{
+ return (NULL);
+}
+#endif
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(spa_history_create_obj);
+EXPORT_SYMBOL(spa_history_get);
+EXPORT_SYMBOL(spa_history_log);
+EXPORT_SYMBOL(spa_history_log_internal);
+EXPORT_SYMBOL(spa_history_log_version);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
new file mode 100644
index 000000000000..5c55d32ec066
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
@@ -0,0 +1,1322 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu_objset.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/spa_log_spacemap.h>
+#include <sys/vdev_impl.h>
+#include <sys/zap.h>
+
+/*
+ * Log Space Maps
+ *
+ * Log space maps are an optimization in ZFS metadata allocations for pools
+ * whose workloads are primarily random-writes. Random-write workloads are also
+ * typically random-free, meaning that they are freeing from locations scattered
+ * throughout the pool. This means that each TXG we will have to append some
+ * FREE records to almost every metaslab. With log space maps, we hold their
+ * changes in memory and log them altogether in one pool-wide space map on-disk
+ * for persistence. As more blocks are accumulated in the log space maps and
+ * more unflushed changes are accounted in memory, we flush a selected group
+ * of metaslabs every TXG to relieve memory pressure and potential overheads
+ * when loading the pool. Flushing a metaslab to disk relieves memory as we
+ * flush any unflushed changes from memory to disk (i.e. the metaslab's space
+ * map) and saves import time by making old log space maps obsolete and
+ * eventually destroying them. [A log space map is said to be obsolete when all
+ * its entries have made it to their corresponding metaslab space maps].
+ *
+ * == On disk data structures used ==
+ *
+ * - The pool has a new feature flag and a new entry in the MOS. The feature
+ * is activated when we create the first log space map and remains active
+ * for the lifetime of the pool. The new entry in the MOS Directory [refer
+ * to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
+ * pairs are of the form <key: txg, value: log space map object for that txg>.
+ * This entry is our on-disk reference of the log space maps that exist in
+ * the pool for each TXG and it is used during import to load all the
+ * metaslab unflushed changes in memory. To see how this structure is first
+ * created and later populated refer to spa_generate_syncing_log_sm(). To see
+ * how it is used during import time refer to spa_ld_log_sm_metadata().
+ *
+ * - Each vdev has a new entry in its vdev_top_zap (see field
+ * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the ms_unflushed_txg of
+ * each metaslab in this vdev. This field is the on-disk counterpart of the
+ * in-memory field ms_unflushed_txg which tells us from which TXG onwards
+ * the metaslab hasn't had its changes flushed. During import, we use this
+ * to ignore any entries in the space map log that are for this metaslab but
+ * from a TXG before ms_unflushed_txg. At that point, we also populate its
+ * in-memory counterpart and from there both fields are updated every time
+ * we flush that metaslab.
+ *
+ * - A space map is created every TXG and, during that TXG, it is used to log
+ * all incoming changes (the log space map). When created, the log space map
+ * is referenced in memory by spa_syncing_log_sm and its object ID is inserted
+ * to the space map ZAP mentioned above. The log space map is closed at the
+ * end of the TXG and will be destroyed when it becomes fully obsolete. We
+ * know when a log space map has become obsolete by looking at the oldest
+ * (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger
+ * than the log space map's TXG, then it means that no metaslab is missing the
+ * changes from that log and we can therefore destroy it.
+ * [see spa_cleanup_old_sm_logs()].
+ *
+ * == Important in-memory structures ==
+ *
+ * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
+ * the pool by their ms_unflushed_txg field. It is primarily used for three
+ * reasons. First of all, it is used during flushing where we try to flush
+ * metaslabs in-order from the oldest-flushed to the most recently flushed
+ * every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
+ * oldest flushed metaslab to distinguish which log space maps have become
+ * obsolete and which ones are still relevant. Finally it tells us which
+ * metaslabs have unflushed changes in a pool where this feature was just
+ * enabled, as we don't immediately add all of the pool's metaslabs but we
+ * add them over time as they go through metaslab_sync(). The reason that
+ * we do that is to ease these pools into the behavior of the flushing
+ * algorithm (described later on).
+ *
+ * - The per-spa field spa_sm_logs_by_txg can be thought as the in-memory
+ * counterpart of the space map ZAP mentioned above. It's an AVL tree whose
+ * nodes represent the log space maps in the pool. This in-memory
+ * representation of log space maps in the pool sorts the log space maps by
+ * the TXG that they were created (which is also the TXG of their unflushed
+ * changes). It also contains the following extra information for each
+ * space map:
+ * [1] The number of metaslabs that were last flushed on that TXG. This is
+ * important because if that counter is zero and this is the oldest
+ * log then it means that it is also obsolete.
+ * [2] The number of blocks of that space map. This field is used by the
+ * block heuristic of our flushing algorithm (described later on).
+ * It represents how many blocks of metadata changes ZFS had to write
+ * to disk for that TXG.
+ *
+ * - The per-spa field spa_log_summary is a list of entries that summarizes
+ * the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
+ * AVL tree mentioned above. The reason this exists is that our flushing
+ * algorithm (described later) tries to estimate how many metaslabs to flush
+ * in each TXG by iterating over all the log space maps and looking at their
+ * block counts. Summarizing that information means that we don't have to
+ * iterate through each space map, minimizing the runtime overhead of the
+ * flushing algorithm which would be induced in syncing context. In terms of
+ * implementation the log summary is used as a queue:
+ * * we modify or pop entries from its head when we flush metaslabs
+ * * we modify or append entries to its tail when we sync changes.
+ *
+ * - Each metaslab has two new range trees that hold its unflushed changes,
+ * ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
+ *
+ * == Flushing algorithm ==
+ *
+ * The decision of how many metaslabs to flush on a given TXG is guided by
+ * two heuristics:
+ *
+ * [1] The memory heuristic -
+ * We keep track of the memory used by the unflushed trees from all the
+ * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
+ * stays below a certain threshold which is determined by an arbitrary hard
+ * limit and an arbitrary percentage of the system's memory [see
+ * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
+ * unflushed changes is passing that threshold, we flush metaslabs, which
+ * empties their unflushed range trees, reducing the memory used.
+ *
+ * [2] The block heuristic -
+ * We try to keep the total number of blocks in the log space maps in check
+ * so the log doesn't grow indefinitely and we don't induce a lot of overhead
+ * when loading the pool. At the same time we don't want to flush a lot of
+ * metaslabs too often as this would defeat the purpose of the log space map.
+ * As a result we set a limit on the number of blocks that we think is
+ * acceptable for the log space maps to have, and we try not to cross it.
+ * [see sus_blocklimit from spa_unflushed_stats].
+ *
+ * In order to stay below the block limit every TXG we have to estimate how
+ * many metaslabs we need to flush based on the current rate of incoming blocks
+ * and our history of log space map blocks. The main idea here is to answer
+ * the question of how many metaslabs we need to flush in order to get rid of
+ * at least X log space map blocks. We can answer this question by iterating
+ * backwards from the oldest log space map to the newest one and looking at
+ * their metaslab and block counts. At this point the log summary mentioned
+ * above comes in handy as it reduces the number of entries we have to iterate
+ * over (even though it may reduce the precision of our estimates due to its
+ * aggregation of data). So with that in mind, we project the incoming rate of
+ * the current TXG into the future and attempt to approximate how many
+ * metaslabs we would need to flush from now on in order to avoid exceeding
+ * our block limit at different points in the future (granted that we would
+ * keep flushing the same number of metaslabs every TXG). Then we take the
+ * maximum number from all these estimates to be on the safe side. For the
+ * exact implementation details of the algorithm refer to
+ * spa_estimate_metaslabs_to_flush.
+ */
+
+/*
+ * This is used as the block size for the space maps used for the
+ * log space map feature. These space maps benefit from a bigger
+ * block size as we expect to be writing a lot of data to them at
+ * once.
+ */
+unsigned long zfs_log_sm_blksz = 1ULL << 17;
+
+/*
+ * Percentage of the overall system's memory that ZFS allows to be
+ * used for unflushed changes (e.g. the sum of size of all the nodes
+ * in the unflushed trees).
+ *
+ * Note that this value is calculated over 1000000 for finer granularity
+ * (thus the _ppm suffix; reads as "parts per million"). As an example,
+ * the default of 1000 allows 0.1% of memory to be used.
+ */
+unsigned long zfs_unflushed_max_mem_ppm = 1000;
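As a rough illustration (a sketch with assumed numbers, not the kernel code; the actual check lives in spa_log_exceeds_memlimit() below): on a machine with 64 GiB of memory the default of 1000 ppm works out to roughly 65 MiB of unflushed changes, with zfs_unflushed_max_mem_amt acting as an additional hard cap:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t physmem = 64ULL << 30;			/* 64 GiB */
	uint64_t ppm = 1000;				/* 0.1% */
	uint64_t hard_limit = 1ULL << 30;		/* 1 GiB */
	uint64_t limit = physmem / 1000000 * ppm;

	if (limit > hard_limit)
		limit = hard_limit;
	(void) printf("unflushed-changes memory limit: %llu MiB\n",
	    (unsigned long long)(limit >> 20));		/* 65 MiB */
	return (0);
}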
+
+/*
+ * Specific hard-limit in memory that ZFS allows to be used for
+ * unflushed changes.
+ */
+unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
+
+/*
+ * The following tunable determines the number of blocks that can be used for
+ * the log space maps. It is expressed as a percentage of the total number of
+ * metaslabs in the pool (i.e. the default of 400 means that the number of log
+ * blocks is capped at 4 times the number of metaslabs).
+ *
+ * This value exists to tune our flushing algorithm, with higher values
+ * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
+ * flushing metaslabs more aggressively with the upside of saving overheads
+ * when loading the pool. Another factor in this tradeoff is that flushing
+ * less often can potentially lead to better utilization of the metaslab space
+ * map's block size as we accumulate more changes per flush.
+ *
+ * Given that this tunable indirectly controls the flush rate (metaslabs
+ * flushed per txg), expressing it as a percentage of the number of metaslabs
+ * in the pool makes sense here.
+ *
+ * As a rule of thumb we default this tunable to 400% based on the following:
+ *
+ * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
+ * it is reasonable to expect that the amount of obsolete entries changes
+ * linearly from txg to txg (e.g. the oldest log should have the most
+ * obsolete entries, and the most recent one the least). With this we could
+ * say that, at any given time, about half of the entries in the whole space
+ * map log are obsolete. Thus for every two entries for a metaslab in the
+ * log space map, only one of them is valid and actually makes it to the
+ * metaslab's space map.
+ * [factor of 2]
+ * 2] Each entry in the log space map is guaranteed to be two words while
+ * entries in metaslab space maps are generally single-word.
+ * [an extra factor of 2 - 400% overall]
+ * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
+ * account any consolidation of segments from the log space map to the
+ * unflushed range trees nor their history (e.g. a segment being allocated,
+ * then freed, then allocated again means 3 log space map entries but 0
+ * metaslab space map entries). Depending on the workload, we've seen ~1.8
+ * non-obsolete log space map entries per metaslab entry, for a total of
+ * ~600%. Since most of these estimates though are workload dependent, we
+ * default on 400% to be conservative.
+ *
+ * Thus we could say that even in the worst
+ * case of [1] and [2], the factor should end up being 4.
+ *
+ * That said, regardless of the number of metaslabs in the pool we need to
+ * provide upper and lower bounds for the log block limit.
+ * [see zfs_unflushed_log_block_{min,max}]
+ */
+unsigned long zfs_unflushed_log_block_pct = 400;
+
+/*
+ * If the number of metaslabs is small and our incoming rate is high, we could
+ * get into a situation where we are flushing all our metaslabs every TXG. Thus
+ * we always allow at least this many log blocks.
+ */
+unsigned long zfs_unflushed_log_block_min = 1000;
+
+/*
+ * If the log becomes too big, the import time of the pool can take a hit in
+ * terms of performance. Thus we have a hard limit in the size of the log in
+ * terms of blocks.
+ */
+unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
+
+/*
+ * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
+ * stability of the flushing algorithm (longer summary) vs its runtime overhead
+ * (smaller summary is faster to traverse).
+ */
+unsigned long zfs_max_logsm_summary_length = 10;
+
+/*
+ * Tunable that sets the lower bound on the metaslabs to flush every TXG.
+ *
+ * Setting this to 0 has no effect since if the pool is idle we won't even be
+ * creating log space maps and therefore we won't be flushing. On the other
+ * hand if the pool has any incoming workload our block heuristic will start
+ * flushing metaslabs anyway.
+ *
+ * The point of this tunable is to be used in extreme cases where we really
+ * want to flush more metaslabs than our adaptable heuristic plans to flush.
+ */
+unsigned long zfs_min_metaslabs_to_flush = 1;
+
+/*
+ * Tunable that specifies how far in the past we want to look when trying to
+ * estimate the incoming log blocks for the current TXG.
+ *
+ * Setting this too high may not only increase runtime but also minimize the
+ * effect of the incoming rates from the most recent TXGs as we take the
+ * average over all the blocks that we walk
+ * [see spa_estimate_incoming_log_blocks].
+ */
+unsigned long zfs_max_log_walking = 5;
+
+/*
+ * This tunable exists solely for testing purposes. It ensures that the log
+ * spacemaps are not flushed and destroyed during export in order for the
+ * relevant log spacemap import code paths to be tested (effectively simulating
+ * a crash).
+ */
+int zfs_keep_log_spacemaps_at_export = 0;
+
+static uint64_t
+spa_estimate_incoming_log_blocks(spa_t *spa)
+{
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ uint64_t steps = 0, sum = 0;
+ for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
+ sls != NULL && steps < zfs_max_log_walking;
+ sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
+ if (sls->sls_txg == spa_syncing_txg(spa)) {
+ /*
+ * skip the log created in this TXG as this would
+ * make our estimations inaccurate.
+ */
+ continue;
+ }
+ sum += sls->sls_nblocks;
+ steps++;
+ }
+ return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0);
+}
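A tiny sketch of the same averaging with example numbers (not part of the patch): closed logs of 120, 80, and 100 blocks yield an estimated incoming rate of 100 blocks per TXG:

#include <stdio.h>

#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int
main(void)
{
	unsigned long nblocks[3] = { 120, 80, 100 };
	unsigned long sum = 0, steps = 0, max_walk = 5;

	/* Walk at most max_walk closed logs (newest first in the real code). */
	for (int i = 0; i < 3 && steps < max_walk; i++, steps++)
		sum += nblocks[i];
	(void) printf("estimated incoming blocks per TXG: %lu\n",
	    steps ? DIV_ROUND_UP(sum, steps) : 0);	/* 100 */
	return (0);
}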
+
+uint64_t
+spa_log_sm_blocklimit(spa_t *spa)
+{
+ return (spa->spa_unflushed_stats.sus_blocklimit);
+}
+
+void
+spa_log_sm_set_blocklimit(spa_t *spa)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
+ ASSERT0(spa_log_sm_blocklimit(spa));
+ return;
+ }
+
+ uint64_t calculated_limit =
+ (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
+ spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
+ zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
+}
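A worked example of the limit computation above (assumed numbers, not part of the patch): a pool with 1000 metaslabs at the default 400% gets a limit of 4000 log blocks, which at the 128K block size used for log space maps is roughly 500 MiB of log:

#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t total_metaslabs = 1000, pct = 400;
	uint64_t blk_min = 1000, blk_max = 1ULL << 18;
	uint64_t limit = MIN(MAX(total_metaslabs * pct / 100, blk_min),
	    blk_max);

	(void) printf("log block limit: %llu blocks (~%llu MiB at 128K each)\n",
	    (unsigned long long)limit,
	    (unsigned long long)(limit * (1ULL << 17) >> 20));
	return (0);
}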
+
+uint64_t
+spa_log_sm_nblocks(spa_t *spa)
+{
+ return (spa->spa_unflushed_stats.sus_nblocks);
+}
+
+/*
+ * Ensure that the in-memory log space map structures and the summary
+ * have the same block and metaslab counts.
+ */
+static void
+spa_log_summary_verify_counts(spa_t *spa)
+{
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0)
+ return;
+
+ uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
+
+ uint64_t ms_in_summary = 0, blk_in_summary = 0;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e; e = list_next(&spa->spa_log_summary, e)) {
+ ms_in_summary += e->lse_mscount;
+ blk_in_summary += e->lse_blkcount;
+ }
+
+ uint64_t ms_in_logs = 0, blk_in_logs = 0;
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ ms_in_logs += sls->sls_mscount;
+ blk_in_logs += sls->sls_nblocks;
+ }
+
+ VERIFY3U(ms_in_logs, ==, ms_in_summary);
+ VERIFY3U(ms_in_logs, ==, ms_in_avl);
+ VERIFY3U(blk_in_logs, ==, blk_in_summary);
+ VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
+}
+
+static boolean_t
+summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
+{
+ uint64_t blocks_per_row = MAX(1,
+ DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
+ zfs_max_logsm_summary_length));
+ return (blocks_per_row <= e->lse_blkcount);
+}
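For intuition (assumed numbers, not part of the patch): with a block limit of 4000 and the default summary length of 10, a summary row is considered full once it accounts for about 400 log blocks:

#include <stdio.h>

#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int
main(void)
{
	unsigned long blocklimit = 4000, summary_len = 10;

	(void) printf("blocks per summary row: %lu\n",
	    DIV_ROUND_UP(blocklimit, summary_len));	/* 400 */
	return (0);
}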
+
+/*
+ * Update the log summary information to reflect the fact that a metaslab
+ * was flushed or destroyed (e.g. due to device removal or pool export/destroy).
+ *
+ * We typically flush the oldest flushed metaslab so the first (and oldest)
+ * entry of the summary is updated. However if that metaslab is getting loaded
+ * we may flush the second oldest one which may be part of an entry later in
+ * the summary. Moreover, if we call into this function from metaslab_fini()
+ * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
+ * for a txg as an argument so we can locate the appropriate summary entry for
+ * the metaslab.
+ */
+void
+spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
+{
+ /*
+ * We don't track summary data for read-only pools and this function
+ * can be called from metaslab_fini(). In that case return immediately.
+ */
+ if (!spa_writeable(spa))
+ return;
+
+ log_summary_entry_t *target = NULL;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e != NULL; e = list_next(&spa->spa_log_summary, e)) {
+ if (e->lse_start > txg)
+ break;
+ target = e;
+ }
+
+ if (target == NULL || target->lse_mscount == 0) {
+ /*
+ * We didn't find a summary entry for this metaslab. We must be
+ * at the teardown of a spa_load() attempt that got an error
+ * while reading the log space maps.
+ */
+ VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
+ return;
+ }
+
+ target->lse_mscount--;
+}
+
+/*
+ * Update the log summary information to reflect the fact that we destroyed
+ * old log space maps. Since we can only destroy the oldest log space maps,
+ * we decrement the block count of the oldest summary entry and potentially
+ * destroy it when that count hits 0.
+ *
+ * This function is called after a metaslab is flushed and typically that
+ * metaslab is the oldest flushed, which means that this function will
+ * typically decrement the block count of the first entry of the summary and
+ * potentially free it if the block count gets to zero (its metaslab count
+ * should be zero too at that point).
+ *
+ * There are certain scenarios though that don't work exactly like that so we
+ * need to account for them:
+ *
+ * Scenario [1]: It is possible that after we flushed the oldest flushed
+ * metaslab and we destroyed the oldest log space map, more recent logs had 0
+ * metaslabs pointing to them so we got rid of them too. This can happen due
+ * to metaslabs being destroyed through device removal, or because the oldest
+ * flushed metaslab was loading but we kept flushing more recently flushed
+ * metaslabs due to the memory pressure of unflushed changes. Because of that,
+ * we always iterate from the beginning of the summary; if blocks_gone is
+ * bigger than the block count of the current entry, we free that entry (we
+ * expect its metaslab count to be zero), decrement blocks_gone, and move on to
+ * the next entry, repeating until blocks_gone gets decremented to 0. Doing
+ * this also works for the typical case mentioned above.
+ *
+ * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
+ * the first (and oldest) entry in the summary. If the first few entries of
+ * the summary were only accounting metaslabs from a device that was just
+ * removed, then the current oldest flushed metaslab could be accounted by an
+ * entry somewhere in the middle of the summary. Moreover flushing that
+ * metaslab will destroy all the log space maps older than its ms_unflushed_txg
+ * because they became obsolete after the removal. Thus, iterating as we did
+ * for scenario [1] works out for this case too.
+ *
+ * Scenario [3]: At times we decide to flush all the metaslabs in the pool
+ * in one TXG (either because we are exporting the pool or because our flushing
+ * heuristics decided to do so). When that happens all the log space maps get
+ * destroyed except the one created for the current TXG which doesn't have
+ * any log blocks yet. As log space maps get destroyed with every metaslab that
+ * we flush, entries in the summary are also destroyed. This brings a weird
+ * corner-case when we flush the last metaslab and the log space map of the
+ * current TXG is in the same summary entry with other log space maps that
+ * are older. When that happens we are eventually left with this one last
+ * summary entry whose blocks are gone (blocks_gone equals the entry's block
+ * count) but its metaslab count is non-zero (because it accounts all the
+ * metaslabs in the pool as they all got flushed). Under this scenario we can't
+ * free this last summary entry as it's referencing all the metaslabs in the
+ * pool and its block count will get incremented at the end of this sync (when
+ * we close the syncing log space map). Thus we just decrement its current
+ * block count and leave it alone. In the case that the pool gets exported,
+ * its metaslab count will be decremented over time as we call metaslab_fini()
+ * for all the metaslabs in the pool and the entry will be freed at
+ * spa_unload_log_sm_metadata().
+ */
+void
+spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
+{
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e != NULL; e = list_head(&spa->spa_log_summary)) {
+ if (e->lse_blkcount > blocks_gone) {
+ /*
+ * Assert that we stopped at an entry that is not
+ * obsolete.
+ */
+ ASSERT(e->lse_mscount != 0);
+
+ e->lse_blkcount -= blocks_gone;
+ blocks_gone = 0;
+ break;
+ } else if (e->lse_mscount == 0) {
+ /* remove obsolete entry */
+ blocks_gone -= e->lse_blkcount;
+ list_remove(&spa->spa_log_summary, e);
+ kmem_free(e, sizeof (log_summary_entry_t));
+ } else {
+ /* Verify that this is scenario [3] mentioned above. */
+ VERIFY3U(blocks_gone, ==, e->lse_blkcount);
+
+ /*
+ * Assert that this is scenario [3] further by ensuring
+ * that this is the only entry in the summary.
+ */
+ VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
+ ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
+
+ blocks_gone = e->lse_blkcount = 0;
+ break;
+ }
+ }
+
+ /*
+ * Ensure that there is no way we are trying to remove more blocks
+ * than the # of blocks in the summary.
+ */
+ ASSERT0(blocks_gone);
+}
+
+void
+spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
+{
+ spa_log_sm_t target = { .sls_txg = txg };
+ spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
+ &target, NULL);
+
+ if (sls == NULL) {
+ /*
+ * We must be at the teardown of a spa_load() attempt that
+ * got an error while reading the log space maps.
+ */
+ VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
+ return;
+ }
+
+ ASSERT(sls->sls_mscount > 0);
+ sls->sls_mscount--;
+}
+
+void
+spa_log_sm_increment_current_mscount(spa_t *spa)
+{
+ spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
+ ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
+ last_sls->sls_mscount++;
+}
+
+static void
+summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
+ uint64_t nblocks)
+{
+ log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
+
+ if (e == NULL || summary_entry_is_full(spa, e)) {
+ e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
+ e->lse_start = txg;
+ list_insert_tail(&spa->spa_log_summary, e);
+ }
+
+ ASSERT3U(e->lse_start, <=, txg);
+ e->lse_mscount += metaslabs_flushed;
+ e->lse_blkcount += nblocks;
+}
+
+static void
+spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
+{
+ summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
+}
+
+void
+spa_log_summary_add_flushed_metaslab(spa_t *spa)
+{
+ summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
+}
+
+/*
+ * This function attempts to estimate how many metaslabs we should
+ * flush to satisfy our block heuristic for the log spacemap
+ * for the upcoming TXGs.
+ *
+ * Specifically, it first tries to estimate the number of incoming
+ * blocks in this TXG. Then by projecting that incoming rate to
+ * future TXGs and using the log summary, it figures out how many
+ * flushes we would need to do for future TXGs individually to
+ * stay below our block limit and returns the maximum number of
+ * flushes from those estimates.
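+ *
+ * As an illustration only (hypothetical numbers, not derived from this
+ * code): if the block limit were 100, the log already held 95 blocks,
+ * and we estimated 10 incoming blocks per TXG, then one TXG from now we
+ * would be 5 blocks over the limit; if the oldest summary entry covered
+ * 12 blocks across 3 metaslabs, flushing those 3 metaslabs in that TXG
+ * would bring us back under the limit, so the estimate would be 3.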
+ */
+static uint64_t
+spa_estimate_metaslabs_to_flush(spa_t *spa)
+{
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(spa_log_sm_blocklimit(spa) != 0);
+
+ /*
+ * This variable contains the incoming rate that will be projected
+ * and used for our flushing estimates in the future.
+ */
+ uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
+
+ /*
+ * At any point in time this variable tells us how many
+ * TXGs in the future we are, so we can make our estimations.
+ */
+ uint64_t txgs_in_future = 1;
+
+ /*
+ * This variable tells us how much room we have until we hit
+ * our limit. When it goes negative, it means that we've exceeded
+ * our limit and we need to flush.
+ *
+ * Note that since we start at the first TXG in the future (i.e.
+ * txgs_in_future starts from 1) we already decrement this
+ * variable by the incoming rate.
+ */
+ int64_t available_blocks =
+ spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
+
+ /*
+ * This variable tells us the total number of flushes needed to
+ * keep the log size within the limit when we reach txgs_in_future.
+ */
+ uint64_t total_flushes = 0;
+
+ /* Holds the current maximum of our estimates so far. */
+ uint64_t max_flushes_pertxg =
+ MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
+ zfs_min_metaslabs_to_flush);
+
+ /*
+ * For our estimations we only look as far into the future
+ * as the summary allows us to.
+ */
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e; e = list_next(&spa->spa_log_summary, e)) {
+
+ /*
+ * If there is still room before we exceed our limit,
+ * then keep skipping TXGs, accumulating more blocks
+ * based on the incoming rate until we exceed it.
+ */
+ if (available_blocks >= 0) {
+ uint64_t skip_txgs = (available_blocks / incoming) + 1;
+ available_blocks -= (skip_txgs * incoming);
+ txgs_in_future += skip_txgs;
+ ASSERT3S(available_blocks, >=, -incoming);
+ }
+
+ /*
+ * At this point we're far enough into the future that the
+ * limit has just been exceeded, so we flush metaslabs
+ * based on the current entry in the summary, updating
+ * our available_blocks.
+ */
+ ASSERT3S(available_blocks, <, 0);
+ available_blocks += e->lse_blkcount;
+ total_flushes += e->lse_mscount;
+
+ /*
+ * Keep the running maximum of total_flushes divided by the
+ * number of TXGs in the future that we are. The idea here
+ * is to estimate
+ * the average number of flushes that we should do
+ * every TXG so that when we are that many TXGs in the
+ * future we stay under the limit.
+ */
+ max_flushes_pertxg = MAX(max_flushes_pertxg,
+ DIV_ROUND_UP(total_flushes, txgs_in_future));
+ ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
+ max_flushes_pertxg);
+ }
+ return (max_flushes_pertxg);
+}
+
+uint64_t
+spa_log_sm_memused(spa_t *spa)
+{
+ return (spa->spa_unflushed_stats.sus_memused);
+}
+
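+/*
+ * Check whether the memory used by unflushed changes exceeds either the
+ * absolute limit (zfs_unflushed_max_mem_amt) or the limit expressed as a
+ * fraction of physical memory. As a purely illustrative example, on a
+ * system with 16 GiB of RAM and zfs_unflushed_max_mem_ppm set to 1000,
+ * the relative allowance would be 16 GiB * 1000 / 1000000, i.e. roughly
+ * 16 MiB.
+ */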
+static boolean_t
+spa_log_exceeds_memlimit(spa_t *spa)
+{
+ if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt)
+ return (B_TRUE);
+
+ uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
+ zfs_unflushed_max_mem_ppm) / 1000000;
+ if (spa_log_sm_memused(spa) > system_mem_allowed)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+boolean_t
+spa_flush_all_logs_requested(spa_t *spa)
+{
+ return (spa->spa_log_flushall_txg != 0);
+}
+
+void
+spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
+{
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ if (spa_sync_pass(spa) != 1)
+ return;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ /*
+ * If we don't have any metaslabs with unflushed changes
+ * return immediately.
+ */
+ if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
+ return;
+
+ /*
+ * During SPA export we leave a few empty TXGs to go by [see
+ * spa_final_dirty_txg() to understand why]. For this specific
+ * case, it is important to not flush any metaslabs as that
+ * would dirty this TXG.
+ *
+ * That said, during one of these dirty TXGs that is less than
+ * or equal to spa_final_dirty_txg(), spa_unload() will request
+ * that we try to flush all the metaslabs for that TXG before
+ * exporting the pool. Thus we ensure that we didn't get a
+ * request to flush everything before we attempt to return
+ * immediately.
+ */
+ if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
+ !spa_flush_all_logs_requested(spa))
+ return;
+
+ /*
+ * We need to generate a log space map before flushing because this
+ * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
+ * for this TXG's flushed metaslab count (aka sls_mscount which is
+ * manipulated in many ways down the metaslab_flush() codepath).
+ *
+ * That is not to say that we may end up generating a log space map
+ * we don't need. If we are flushing metaslabs, that means that we
+ * were going to write changes to disk anyway, so even if we were
+ * not flushing, a log space map would have been created anyway in
+ * metaslab_sync().
+ */
+ spa_generate_syncing_log_sm(spa, tx);
+
+ /*
+ * This variable tells us how many metaslabs we want to flush based
+ * on the block-heuristic of our flushing algorithm (see block comment
+ * of log space map feature). We also decrement this as we flush
+ * metaslabs and attempt to destroy old log space maps.
+ */
+ uint64_t want_to_flush;
+ if (spa_flush_all_logs_requested(spa)) {
+ ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
+ want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
+ } else {
+ want_to_flush = spa_estimate_metaslabs_to_flush(spa);
+ }
+
+ ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
+ want_to_flush);
+
+ /* Used purely for verification purposes */
+ uint64_t visited = 0;
+
+ /*
+ * Ideally we would iterate through spa_metaslabs_by_flushed
+ * using only one variable (curr). We can't do that because
+ * metaslab_flush() mutates the position of curr in the AVL when
+ * it flushes that metaslab by moving it to the end of the tree.
+ * Thus we always keep track of the original next node of the
+ * current node (curr) in another variable (next).
+ */
+ metaslab_t *next = NULL;
+ for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
+ curr != NULL; curr = next) {
+ next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);
+
+ /*
+ * If this metaslab has been flushed this txg then we've done
+ * a full circle over the metaslabs.
+ */
+ if (metaslab_unflushed_txg(curr) == txg)
+ break;
+
+ /*
+ * If we are done flushing for the block heuristic and the
+ * unflushed changes don't exceed the memory limit, just stop.
+ */
+ if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
+ break;
+
+ mutex_enter(&curr->ms_sync_lock);
+ mutex_enter(&curr->ms_lock);
+ boolean_t flushed = metaslab_flush(curr, tx);
+ mutex_exit(&curr->ms_lock);
+ mutex_exit(&curr->ms_sync_lock);
+
+ /*
+ * If we failed to flush a metaslab (because it was loading),
+ * then we are done with the block heuristic as it's not
+ * possible to destroy any log space maps once you've skipped
+ * a metaslab. In that case we just set our counter to 0 but
+ * we continue looping in case there is still memory pressure
+ * due to unflushed changes. Note that flushing a metaslab
+ * that is not the oldest flushed in the pool will never
+ * destroy any log space maps [see spa_cleanup_old_sm_logs()].
+ */
+ if (!flushed) {
+ want_to_flush = 0;
+ } else if (want_to_flush > 0) {
+ want_to_flush--;
+ }
+
+ visited++;
+ }
+ ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
+}
+
+/*
+ * Close the log space map for this TXG and update the block counts
+ * for the log's in-memory structure and the summary.
+ */
+void
+spa_sync_close_syncing_log_sm(spa_t *spa)
+{
+ if (spa_syncing_log_sm(spa) == NULL)
+ return;
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
+ ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
+
+ sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
+ spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
+
+ /*
+ * Note that we can't assert that sls_mscount is not 0,
+ * because there is the case where the first metaslab
+ * in spa_metaslabs_by_flushed is loading and we were
+ * not able to flush any metaslabs in the current TXG.
+ */
+ ASSERT(sls->sls_nblocks != 0);
+
+ spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
+ spa_log_summary_verify_counts(spa);
+
+ space_map_close(spa->spa_syncing_log_sm);
+ spa->spa_syncing_log_sm = NULL;
+
+ /*
+ * At this point we have tried to flush as many metaslabs as
+ * we could, as the pool is getting exported. Reset the "flush
+ * all" request so the last few TXGs before closing the pool
+ * can be empty (i.e. not dirty).
+ */
+ if (spa_flush_all_logs_requested(spa)) {
+ ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
+ spa->spa_log_flushall_txg = 0;
+ }
+}
+
+void
+spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
+{
+ objset_t *mos = spa_meta_objset(spa);
+
+ uint64_t spacemap_zap;
+ int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
+ if (error == ENOENT) {
+ ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
+ return;
+ }
+ VERIFY0(error);
+
+ metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
+ uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);
+
+ /* Free all log space maps older than the oldest_flushed_txg. */
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls && sls->sls_txg < oldest_flushed_txg;
+ sls = avl_first(&spa->spa_sm_logs_by_txg)) {
+ ASSERT0(sls->sls_mscount);
+ avl_remove(&spa->spa_sm_logs_by_txg, sls);
+ space_map_free_obj(mos, sls->sls_sm_obj, tx);
+ VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
+ spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
+ kmem_free(sls, sizeof (spa_log_sm_t));
+ }
+}
+
+static spa_log_sm_t *
+spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
+{
+ spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
+ sls->sls_sm_obj = sm_obj;
+ sls->sls_txg = txg;
+ return (sls);
+}
+
+void
+spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
+{
+ uint64_t txg = dmu_tx_get_txg(tx);
+ objset_t *mos = spa_meta_objset(spa);
+
+ if (spa_syncing_log_sm(spa) != NULL)
+ return;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ uint64_t spacemap_zap;
+ int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
+ if (error == ENOENT) {
+ ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
+
+ error = 0;
+ spacemap_zap = zap_create(mos,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
+ &spacemap_zap, tx));
+ spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
+ }
+ VERIFY0(error);
+
+ uint64_t sm_obj;
+ ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
+ ==, ENOENT);
+ sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
+ VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
+ avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg));
+
+ /*
+ * We pass UINT64_MAX as the space map's representation size
+ * and SPA_MINBLOCKSHIFT as the shift, to make the space map
+ * accept any sort of segment since there's no real advantage
+ * to being more restrictive (given that we're already going
+ * to be using 2-word entries).
+ */
+ VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
+ 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
+
+ /*
+ * If the log space map feature was just enabled, the blocklimit
+ * has not yet been set.
+ */
+ if (spa_log_sm_blocklimit(spa) == 0)
+ spa_log_sm_set_blocklimit(spa);
+}
+
+/*
+ * Find all the log space maps stored in the space map ZAP and sort
+ * them by their TXG in spa_sm_logs_by_txg.
+ */
+static int
+spa_ld_log_sm_metadata(spa_t *spa)
+{
+ int error;
+ uint64_t spacemap_zap;
+
+ ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
+
+ error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
+ if (error == ENOENT) {
+ /* the space map ZAP doesn't exist yet */
+ return (0);
+ } else if (error != 0) {
+ spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
+ "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
+ error);
+ return (error);
+ }
+
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t log_txg = zfs_strtonum(za.za_name, NULL);
+ spa_log_sm_t *sls =
+ spa_log_sm_alloc(za.za_first_integer, log_txg);
+ avl_add(&spa->spa_sm_logs_by_txg, sls);
+ }
+ zap_cursor_fini(&zc);
+ if (error != ENOENT) {
+ spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
+ "zap_cursor_retrieve(spacemap_zap) [error %d]",
+ error);
+ return (error);
+ }
+
+ for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
+ m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
+ spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) };
+ spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
+ &target, NULL);
+
+ /*
+ * At this point, if sls is NULL it means that a bug occurred
+ * in ZFS the last time the pool was open or earlier in the
+ * import code path. In general, we would have placed a
+ * VERIFY() here or, in this case, just let the kernel panic
+ * with a NULL pointer dereference when incrementing sls_mscount,
+ * but since this is the import code path we can be a bit more
+ * lenient. Thus, for DEBUG bits we always cause a panic, while
+ * in production we log the error and just fail the import.
+ */
+ ASSERT(sls != NULL);
+ if (sls == NULL) {
+ spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
+ "encountered: could not find log spacemap for "
+ "TXG %ld [error %d]",
+ metaslab_unflushed_txg(m), ENOENT);
+ return (ENOENT);
+ }
+ sls->sls_mscount++;
+ }
+
+ return (0);
+}
+
+typedef struct spa_ld_log_sm_arg {
+ spa_t *slls_spa;
+ uint64_t slls_txg;
+} spa_ld_log_sm_arg_t;
+
+static int
+spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
+{
+ uint64_t offset = sme->sme_offset;
+ uint64_t size = sme->sme_run;
+ uint32_t vdev_id = sme->sme_vdev;
+
+ spa_ld_log_sm_arg_t *slls = arg;
+ spa_t *spa = slls->slls_spa;
+
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+
+ /*
+ * If the vdev has been removed (i.e. it is indirect or a hole)
+ * skip this entry. The contents of this vdev have already moved
+ * elsewhere.
+ */
+ if (!vdev_is_concrete(vd))
+ return (0);
+
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ ASSERT(!ms->ms_loaded);
+
+ /*
+ * If we have already flushed entries for this TXG to this
+ * metaslab's space map, then ignore it. Note that we flush
+ * before processing any allocations/frees for that TXG, so
+ * the metaslab's space map only has entries from *before*
+ * the unflushed TXG.
+ */
+ if (slls->slls_txg < metaslab_unflushed_txg(ms))
+ return (0);
+
+ switch (sme->sme_type) {
+ case SM_ALLOC:
+ range_tree_remove_xor_add_segment(offset, offset + size,
+ ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
+ break;
+ case SM_FREE:
+ range_tree_remove_xor_add_segment(offset, offset + size,
+ ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
+ break;
+ default:
+ panic("invalid maptype_t");
+ break;
+ }
+ return (0);
+}
+
+static int
+spa_ld_log_sm_data(spa_t *spa)
+{
+ int error = 0;
+
+ /*
+ * If we are not going to do any writes there is no need
+ * to read the log space maps.
+ */
+ if (!spa_writeable(spa))
+ return (0);
+
+ ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
+ ASSERT0(spa->spa_unflushed_stats.sus_memused);
+
+ hrtime_t read_logs_starttime = gethrtime();
+ /* this is a no-op when we don't have space map logs */
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ space_map_t *sm = NULL;
+ error = space_map_open(&sm, spa_meta_objset(spa),
+ sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
+ "space_map_open(obj=%llu) [error %d]",
+ (u_longlong_t)sls->sls_sm_obj, error);
+ goto out;
+ }
+
+ struct spa_ld_log_sm_arg vla = {
+ .slls_spa = spa,
+ .slls_txg = sls->sls_txg
+ };
+ error = space_map_iterate(sm, space_map_length(sm),
+ spa_ld_log_sm_cb, &vla);
+ if (error != 0) {
+ space_map_close(sm);
+ spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
+ "at space_map_iterate(obj=%llu) [error %d]",
+ (u_longlong_t)sls->sls_sm_obj, error);
+ goto out;
+ }
+
+ ASSERT0(sls->sls_nblocks);
+ sls->sls_nblocks = space_map_nblocks(sm);
+ spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
+ summary_add_data(spa, sls->sls_txg,
+ sls->sls_mscount, sls->sls_nblocks);
+
+ space_map_close(sm);
+ }
+ hrtime_t read_logs_endtime = gethrtime();
+ spa_load_note(spa,
+ "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
+ "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg),
+ (u_longlong_t)spa_log_sm_nblocks(spa),
+ (u_longlong_t)zfs_log_sm_blksz,
+ (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
+
+out:
+ /*
+ * Now that the metaslabs contain their unflushed changes:
+ * [1] recalculate their actual allocated space
+ * [2] recalculate their weights
+ * [3] sum up the memory usage of their unflushed range trees
+ * [4] optionally load them, if debug_load is set
+ *
+ * Note that even in the case where we get here because of an
+ * error (i.e. error != 0), we still want to update the fields
+ * below in order to have a proper teardown in spa_unload().
+ */
+ for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
+ m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
+ mutex_enter(&m->ms_lock);
+ m->ms_allocated_space = space_map_allocated(m->ms_sm) +
+ range_tree_space(m->ms_unflushed_allocs) -
+ range_tree_space(m->ms_unflushed_frees);
+
+ vdev_t *vd = m->ms_group->mg_vd;
+ metaslab_space_update(vd, m->ms_group->mg_class,
+ range_tree_space(m->ms_unflushed_allocs), 0, 0);
+ metaslab_space_update(vd, m->ms_group->mg_class,
+ -range_tree_space(m->ms_unflushed_frees), 0, 0);
+
+ ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
+ metaslab_recalculate_weight_and_sort(m);
+
+ spa->spa_unflushed_stats.sus_memused +=
+ metaslab_unflushed_changes_memused(m);
+
+ if (metaslab_debug_load && m->ms_sm != NULL) {
+ VERIFY0(metaslab_load(m));
+ metaslab_set_selected_txg(m, 0);
+ }
+ mutex_exit(&m->ms_lock);
+ }
+
+ return (error);
+}
+
+static int
+spa_ld_unflushed_txgs(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa_meta_objset(spa);
+
+ if (vd->vdev_top_zap == 0)
+ return (0);
+
+ uint64_t object = 0;
+ int error = zap_lookup(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
+ sizeof (uint64_t), 1, &object);
+ if (error == ENOENT)
+ return (0);
+ else if (error != 0) {
+ spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
+ "zap_lookup(vdev_top_zap=%llu) [error %d]",
+ (u_longlong_t)vd->vdev_top_zap, error);
+ return (error);
+ }
+
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *ms = vd->vdev_ms[m];
+ ASSERT(ms != NULL);
+
+ metaslab_unflushed_phys_t entry;
+ uint64_t entry_size = sizeof (entry);
+ uint64_t entry_offset = ms->ms_id * entry_size;
+
+ error = dmu_read(mos, object,
+ entry_offset, entry_size, &entry, 0);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
+ "failed at dmu_read(obj=%llu) [error %d]",
+ (u_longlong_t)object, error);
+ return (error);
+ }
+
+ ms->ms_unflushed_txg = entry.msp_unflushed_txg;
+ if (ms->ms_unflushed_txg != 0) {
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_add(&spa->spa_metaslabs_by_flushed, ms);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Read all the log space map entries into their respective
+ * metaslab unflushed trees and keep them sorted by TXG in the
+ * SPA's metadata. In addition, set up all the metadata for the
+ * memory and the block heuristics.
+ */
+int
+spa_ld_log_spacemaps(spa_t *spa)
+{
+ int error;
+
+ spa_log_sm_set_blocklimit(spa);
+
+ for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
+ error = spa_ld_unflushed_txgs(vd);
+ if (error != 0)
+ return (error);
+ }
+
+ error = spa_ld_log_sm_metadata(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Note: we don't actually expect anything to change at this point
+ * but we grab the config lock so we don't fail any assertions
+ * when using vdev_lookup_top().
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ error = spa_ld_log_sm_data(spa);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ return (error);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW,
+ "Hard limit on the amount of memory that ZFS allows to be used for "
+ "unflushed changes");
+
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW,
+ "Percentage of the overall system memory that ZFS allows to be "
+ "used for unflushed changes (value is calculated over 1000000 for "
+ "finer granularity)");
+
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW,
+ "Hard limit (upper-bound) on the size of the space map log "
+ "in terms of blocks.");
+
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW,
+ "Lower-bound limit for the maximum number of blocks allowed in "
+ "the log spacemap (see zfs_unflushed_log_block_max)");
+
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
+ "Tunable used to determine the number of blocks that can be used for "
+ "the spacemap log, expressed as a percentage of the total number of "
+ "metaslabs in the pool (e.g. 400 means the number of log blocks is "
+ "capped at 4 times the number of metaslabs)");
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW,
+ "The number of past TXGs that the flushing algorithm of the log "
+ "spacemap feature uses to estimate incoming log blocks");
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW,
+ "Maximum number of rows allowed in the summary of the spacemap log");
+
+ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW,
+ "Minimum number of metaslabs to flush per dirty TXG");
+
+ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
+ "Prevent the log spacemaps from being flushed and destroyed "
+ "during pool export/destroy");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
new file mode 100644
index 000000000000..b4c73f58d3bc
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -0,0 +1,2953 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fm/util.h>
+#include <sys/dsl_scan.h>
+#include <sys/fs/zfs.h>
+#include <sys/metaslab_impl.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
+#include <sys/kstat.h>
+#include "zfs_prop.h"
+#include <sys/btree.h>
+#include <sys/zfeature.h>
+#include <sys/qat.h>
+#include <sys/zstd/zstd.h>
+
+/*
+ * SPA locking
+ *
+ * There are three basic locks for managing spa_t structures:
+ *
+ * spa_namespace_lock (global mutex)
+ *
+ * This lock must be acquired to do any of the following:
+ *
+ * - Lookup a spa_t by name
+ * - Add or remove a spa_t from the namespace
+ * - Increase spa_refcount from non-zero
+ * - Check if spa_refcount is zero
+ * - Rename a spa_t
+ * - add/remove/attach/detach devices
+ * - Held for the duration of create/destroy/import/export
+ *
+ * It does not need to handle recursion. A create or destroy may
+ * reference objects (files or zvols) in other pools, but by
+ * definition they must have an existing reference, and will never need
+ * to lookup a spa_t by name.
+ *
+ * spa_refcount (per-spa zfs_refcount_t protected by mutex)
+ *
+ * This reference count keeps track of any active users of the spa_t. The
+ * spa_t cannot be destroyed or freed while this is non-zero. Internally,
+ * the refcount is never really 'zero' - opening a pool implicitly keeps
+ * some references in the DMU. Internally we check against spa_minref, but
+ * present the image of a zero/non-zero value to consumers.
+ *
+ * spa_config_lock[] (per-spa array of rwlocks)
+ *
+ * This protects the spa_t from config changes, and must be held in
+ * the following circumstances:
+ *
+ * - RW_READER to perform I/O to the spa
+ * - RW_WRITER to change the vdev config
+ *
+ * The locking order is fairly straightforward:
+ *
+ * spa_namespace_lock -> spa_refcount
+ *
+ * The namespace lock must be acquired to increase the refcount from 0
+ * or to check if it is zero.
+ *
+ * spa_refcount -> spa_config_lock[]
+ *
+ * There must be at least one valid reference on the spa_t to acquire
+ * the config lock.
+ *
+ * spa_namespace_lock -> spa_config_lock[]
+ *
+ * The namespace lock must always be taken before the config lock.
+ *
+ *
+ * The spa_namespace_lock can be acquired directly and is globally visible.
+ *
+ * The namespace is manipulated using the following functions, all of which
+ * require the spa_namespace_lock to be held.
+ *
+ * spa_lookup() Lookup a spa_t by name.
+ *
+ * spa_add() Create a new spa_t in the namespace.
+ *
+ * spa_remove() Remove a spa_t from the namespace. This also
+ * frees up any memory associated with the spa_t.
+ *
+ * spa_next() Returns the next spa_t in the system, or the
+ * first if NULL is passed.
+ *
+ * spa_evict_all() Shutdown and remove all spa_t structures in
+ * the system.
+ *
+ * spa_guid_exists() Determine whether a pool/device guid exists.
+ *
+ * The spa_refcount is manipulated using the following functions:
+ *
+ * spa_open_ref() Adds a reference to the given spa_t. Must be
+ * called with spa_namespace_lock held if the
+ * refcount is currently zero.
+ *
+ * spa_close() Remove a reference from the spa_t. This will
+ * not free the spa_t or remove it from the
+ * namespace. No locking is required.
+ *
+ * spa_refcount_zero() Returns true if the refcount is currently
+ * zero. Must be called with spa_namespace_lock
+ * held.
+ *
+ * The spa_config_lock[] is an array of rwlocks, ordered as follows:
+ * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
+ * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
+ *
+ * To read the configuration, it suffices to hold one of these locks as reader.
+ * To modify the configuration, you must hold all locks as writer. To modify
+ * vdev state without altering the vdev tree's topology (e.g. online/offline),
+ * you must hold SCL_STATE and SCL_ZIO as writer.
+ *
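+ * As an illustrative sketch only (not a prescription), a read-side
+ * caller typically looks like:
+ *
+ *     spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ *     vd = vdev_lookup_top(spa, vdev_id);
+ *     ...
+ *     spa_config_exit(spa, SCL_VDEV, FTAG);
+ *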
+ * We use these distinct config locks to avoid recursive lock entry.
+ * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
+ * block allocations (SCL_ALLOC), which may require reading space maps
+ * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
+ *
+ * The spa config locks cannot be normal rwlocks because we need the
+ * ability to hand off ownership. For example, SCL_ZIO is acquired
+ * by the issuing thread and later released by an interrupt thread.
+ * They do, however, obey the usual write-wanted semantics to prevent
+ * writer (i.e. system administrator) starvation.
+ *
+ * The lock acquisition rules are as follows:
+ *
+ * SCL_CONFIG
+ * Protects changes to the vdev tree topology, such as vdev
+ * add/remove/attach/detach. Protects the dirty config list
+ * (spa_config_dirty_list) and the set of spares and l2arc devices.
+ *
+ * SCL_STATE
+ * Protects changes to pool state and vdev state, such as vdev
+ * online/offline/fault/degrade/clear. Protects the dirty state list
+ * (spa_state_dirty_list) and global pool state (spa_state).
+ *
+ * SCL_ALLOC
+ * Protects changes to metaslab groups and classes.
+ * Held as reader by metaslab_alloc() and metaslab_claim().
+ *
+ * SCL_ZIO
+ * Held by bp-level zios (those which have no io_vd upon entry)
+ * to prevent changes to the vdev tree. The bp-level zio implicitly
+ * protects all of its vdev child zios, which do not hold SCL_ZIO.
+ *
+ * SCL_FREE
+ * Protects changes to metaslab groups and classes.
+ * Held as reader by metaslab_free(). SCL_FREE is distinct from
+ * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
+ * blocks in zio_done() while another i/o that holds either
+ * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
+ *
+ * SCL_VDEV
+ * Held as reader to prevent changes to the vdev tree during trivial
+ * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
+ * other locks, and lower than all of them, to ensure that it's safe
+ * to acquire regardless of caller context.
+ *
+ * In addition, the following rules apply:
+ *
+ * (a) spa_props_lock protects pool properties, spa_config and spa_config_list.
+ * The lock ordering is SCL_CONFIG > spa_props_lock.
+ *
+ * (b) I/O operations on leaf vdevs. For any zio operation that takes
+ * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
+ * or zio_write_phys() -- the caller must ensure that the config cannot
+ * change in the interim, and that the vdev cannot be reopened.
+ * SCL_STATE as reader suffices for both.
+ *
+ * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
+ *
+ * spa_vdev_enter() Acquire the namespace lock and the config lock
+ * for writing.
+ *
+ * spa_vdev_exit() Release the config lock, wait for all I/O
+ * to complete, sync the updated configs to the
+ * cache, and release the namespace lock.
+ *
+ * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
+ * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
+ * locking is, always, based on spa_namespace_lock and spa_config_lock[].
+ */
+
+static avl_tree_t spa_namespace_avl;
+kmutex_t spa_namespace_lock;
+static kcondvar_t spa_namespace_cv;
+int spa_max_replication_override = SPA_DVAS_PER_BP;
+
+static kmutex_t spa_spare_lock;
+static avl_tree_t spa_spare_avl;
+static kmutex_t spa_l2cache_lock;
+static avl_tree_t spa_l2cache_avl;
+
+kmem_cache_t *spa_buffer_pool;
+spa_mode_t spa_mode_global = SPA_MODE_UNINIT;
+
+#ifdef ZFS_DEBUG
+/*
+ * Everything except dprintf, set_error, and indirect_remap is on
+ * by default in debug builds.
+ */
+int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR |
+ ZFS_DEBUG_INDIRECT_REMAP);
+#else
+int zfs_flags = 0;
+#endif
+
+/*
+ * zfs_recover can be set to nonzero to attempt to recover from
+ * otherwise-fatal errors, typically caused by on-disk corruption. When
+ * set, calls to zfs_panic_recover() will turn into warning messages.
+ * This should only be used as a last resort, as it typically results
+ * in leaked space, or worse.
+ */
+int zfs_recover = B_FALSE;
+
+/*
+ * If destroy encounters an EIO while reading metadata (e.g. indirect
+ * blocks), space referenced by the missing metadata can not be freed.
+ * Normally this causes the background destroy to become "stalled", as
+ * it is unable to make forward progress. While in this stalled state,
+ * all remaining space to free from the error-encountering filesystem is
+ * "temporarily leaked". Set this flag to cause it to ignore the EIO,
+ * permanently leak the space from indirect blocks that can not be read,
+ * and continue to free everything else that it can.
+ *
+ * The default, "stalling" behavior is useful if the storage partially
+ * fails (i.e. some but not all i/os fail), and then later recovers. In
+ * this case, we will be able to continue pool operations while it is
+ * partially failed, and when it recovers, we can continue to free the
+ * space, with no leaks. However, note that this case is actually
+ * fairly rare.
+ *
+ * Typically pools either (a) fail completely (but perhaps temporarily,
+ * e.g. a top-level vdev going offline), or (b) have localized,
+ * permanent errors (e.g. disk returns the wrong data due to bit flip or
+ * firmware bug). In case (a), this setting does not matter because the
+ * pool will be suspended and the sync thread will not be able to make
+ * forward progress regardless. In case (b), because the error is
+ * permanent, the best we can do is leak the minimum amount of space,
+ * which is what setting this flag will do. Therefore, it is reasonable
+ * for this flag to normally be set, but we chose the more conservative
+ * approach of not setting it, so that there is no possibility of
+ * leaking space in the "partial temporary" failure case.
+ */
+int zfs_free_leak_on_eio = B_FALSE;
+
+/*
+ * Expiration time in milliseconds. This value has two meanings. First it is
+ * used to determine when the spa_deadman() logic should fire. By default the
+ * spa_deadman() will fire if spa_sync() has not completed in 600 seconds.
+ * Secondly, the value determines if an I/O is considered "hung". Any I/O that
+ * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
+ * in one of three behaviors controlled by zfs_deadman_failmode.
+ */
+unsigned long zfs_deadman_synctime_ms = 600000UL;
+
+/*
+ * This value controls the maximum amount of time zio_wait() will block for an
+ * outstanding IO. By default this is 300 seconds, at which point the "hung"
+ * behavior will be applied as described for zfs_deadman_synctime_ms.
+ */
+unsigned long zfs_deadman_ziotime_ms = 300000UL;
+
+/*
+ * Check time in milliseconds. This defines the frequency at which we check
+ * for hung I/O.
+ */
+unsigned long zfs_deadman_checktime_ms = 60000UL;
+
+/*
+ * By default the deadman is enabled.
+ */
+int zfs_deadman_enabled = 1;
+
+/*
+ * Controls the behavior of the deadman when it detects a "hung" I/O.
+ * Valid values are zfs_deadman_failmode=<wait|continue|panic>.
+ *
+ * wait - Wait for the "hung" I/O (default)
+ * continue - Attempt to recover from a "hung" I/O
+ * panic - Panic the system
+ */
+char *zfs_deadman_failmode = "wait";
+
+/*
+ * The worst case is single-sector max-parity RAID-Z blocks, in which
+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+ * times the size; so just assume that. Add to this the fact that
+ * we can have up to 3 DVAs per bp, and one more factor of 2 because
+ * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together,
+ * the worst case is:
+ * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
+ */
+int spa_asize_inflation = 24;
+
+/*
+ * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
+ * the pool to be consumed. This ensures that we don't run the pool
+ * completely out of space, due to unaccounted changes (e.g. to the MOS).
+ * It also limits the worst-case time to allocate space. If we have less than
+ * this amount of free space, most ZPL operations (e.g. write, create) will
+ * return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are also part of
+ * this 3.2% of space which can't be consumed by normal writes; the slop space
+ * "proper" (spa_get_slop_space()) is decreased by the embedded log space.
+ *
+ * Certain operations (e.g. file removal, most administrative actions) can
+ * use half the slop space. They will only return ENOSPC if less than half
+ * the slop space is free. Typically, once the pool has less than the slop
+ * space free, the user will use these operations to free up space in the pool.
+ * These are the operations that call dsl_pool_adjustedsize() with the netfree
+ * argument set to TRUE.
+ *
+ * Operations that are almost guaranteed to free up space in the absence of
+ * a pool checkpoint can use up to three quarters of the slop space
+ * (e.g. zfs destroy).
+ *
+ * A very restricted set of operations are always permitted, regardless of
+ * the amount of free space. These are the operations that call
+ * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
+ * increase in the amount of space used, it is possible to run the pool
+ * completely out of space, causing it to be permanently read-only.
+ *
+ * Note that on very small pools, the slop space will be larger than
+ * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
+ * but we never allow it to be more than half the pool size.
+ *
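+ * As a worked example with illustrative pool sizes: at the default
+ * spa_slop_shift of 5, a 10 TiB pool reserves 1/32 of its space, i.e.
+ * 320 GiB; a 1 GiB pool would nominally reserve 32 MiB, but the
+ * spa_min_slop floor raises that to 128 MiB, which is still below the
+ * half-the-pool cap of 512 MiB.
+ *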
+ * See also the comments in zfs_space_check_t.
+ */
+int spa_slop_shift = 5;
+uint64_t spa_min_slop = 128 * 1024 * 1024;
+int spa_allocators = 4;
+
+
+/*PRINTFLIKE2*/
+void
+spa_load_failed(spa_t *spa, const char *fmt, ...)
+{
+ va_list adx;
+ char buf[256];
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
+ spa->spa_trust_config ? "trusted" : "untrusted", buf);
+}
+
+/*PRINTFLIKE2*/
+void
+spa_load_note(spa_t *spa, const char *fmt, ...)
+{
+ va_list adx;
+ char buf[256];
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
+ spa->spa_trust_config ? "trusted" : "untrusted", buf);
+}
+
+/*
+ * By default dedup and user data indirects land in the special class
+ */
+int zfs_ddt_data_is_special = B_TRUE;
+int zfs_user_indirect_is_special = B_TRUE;
+
+/*
+ * The percentage of the special class's space that is reserved for metadata
+ * only. Once we allocate 100 - zfs_special_class_metadata_reserve_pct percent
+ * of the class we only let metadata into the class.
+ */
+int zfs_special_class_metadata_reserve_pct = 25;
+
+/*
+ * ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+static void
+spa_config_lock_init(spa_t *spa)
+{
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
+ zfs_refcount_create_untracked(&scl->scl_count);
+ scl->scl_writer = NULL;
+ scl->scl_write_wanted = 0;
+ }
+}
+
+static void
+spa_config_lock_destroy(spa_t *spa)
+{
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ mutex_destroy(&scl->scl_lock);
+ cv_destroy(&scl->scl_cv);
+ zfs_refcount_destroy(&scl->scl_count);
+ ASSERT(scl->scl_writer == NULL);
+ ASSERT(scl->scl_write_wanted == 0);
+ }
+}
+
+int
+spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
+{
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (!(locks & (1 << i)))
+ continue;
+ mutex_enter(&scl->scl_lock);
+ if (rw == RW_READER) {
+ if (scl->scl_writer || scl->scl_write_wanted) {
+ mutex_exit(&scl->scl_lock);
+ spa_config_exit(spa, locks & ((1 << i) - 1),
+ tag);
+ return (0);
+ }
+ } else {
+ ASSERT(scl->scl_writer != curthread);
+ if (!zfs_refcount_is_zero(&scl->scl_count)) {
+ mutex_exit(&scl->scl_lock);
+ spa_config_exit(spa, locks & ((1 << i) - 1),
+ tag);
+ return (0);
+ }
+ scl->scl_writer = curthread;
+ }
+ (void) zfs_refcount_add(&scl->scl_count, tag);
+ mutex_exit(&scl->scl_lock);
+ }
+ return (1);
+}
+
+void
+spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
+{
+ int wlocks_held = 0;
+
+ ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
+
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (scl->scl_writer == curthread)
+ wlocks_held |= (1 << i);
+ if (!(locks & (1 << i)))
+ continue;
+ mutex_enter(&scl->scl_lock);
+ if (rw == RW_READER) {
+ while (scl->scl_writer || scl->scl_write_wanted) {
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ }
+ } else {
+ ASSERT(scl->scl_writer != curthread);
+ while (!zfs_refcount_is_zero(&scl->scl_count)) {
+ scl->scl_write_wanted++;
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ scl->scl_write_wanted--;
+ }
+ scl->scl_writer = curthread;
+ }
+ (void) zfs_refcount_add(&scl->scl_count, tag);
+ mutex_exit(&scl->scl_lock);
+ }
+ ASSERT3U(wlocks_held, <=, locks);
+}
+
+void
+spa_config_exit(spa_t *spa, int locks, const void *tag)
+{
+ for (int i = SCL_LOCKS - 1; i >= 0; i--) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (!(locks & (1 << i)))
+ continue;
+ mutex_enter(&scl->scl_lock);
+ ASSERT(!zfs_refcount_is_zero(&scl->scl_count));
+ if (zfs_refcount_remove(&scl->scl_count, tag) == 0) {
+ ASSERT(scl->scl_writer == NULL ||
+ scl->scl_writer == curthread);
+ scl->scl_writer = NULL; /* OK in either case */
+ cv_broadcast(&scl->scl_cv);
+ }
+ mutex_exit(&scl->scl_lock);
+ }
+}
+
+int
+spa_config_held(spa_t *spa, int locks, krw_t rw)
+{
+ int locks_held = 0;
+
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (!(locks & (1 << i)))
+ continue;
+ if ((rw == RW_READER &&
+ !zfs_refcount_is_zero(&scl->scl_count)) ||
+ (rw == RW_WRITER && scl->scl_writer == curthread))
+ locks_held |= 1 << i;
+ }
+
+ return (locks_held);
+}
+
+/*
+ * ==========================================================================
+ * SPA namespace functions
+ * ==========================================================================
+ */
+
+/*
+ * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
+ * Returns NULL if no matching spa_t is found.
+ */
+spa_t *
+spa_lookup(const char *name)
+{
+ static spa_t search; /* spa_t is large; don't allocate on stack */
+ spa_t *spa;
+ avl_index_t where;
+ char *cp;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ (void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
+
+ /*
+ * If it's a full dataset name, figure out the pool name and
+ * just use that.
+ */
+ cp = strpbrk(search.spa_name, "/@#");
+ if (cp != NULL)
+ *cp = '\0';
+
+ spa = avl_find(&spa_namespace_avl, &search, &where);
+
+ return (spa);
+}
+
+/*
+ * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
+ * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
+ * looking for potentially hung I/Os.
+ */
+void
+spa_deadman(void *arg)
+{
+ spa_t *spa = arg;
+
+ /* Disable the deadman if the pool is suspended. */
+ if (spa_suspended(spa))
+ return;
+
+ zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
+ (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
+ ++spa->spa_deadman_calls);
+ if (zfs_deadman_enabled)
+ vdev_deadman(spa->spa_root_vdev, FTAG);
+
+ spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
+ spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+ MSEC_TO_TICK(zfs_deadman_checktime_ms));
+}
+
+static int
+spa_log_sm_sort_by_txg(const void *va, const void *vb)
+{
+ const spa_log_sm_t *a = va;
+ const spa_log_sm_t *b = vb;
+
+ return (TREE_CMP(a->sls_txg, b->sls_txg));
+}
+
+/*
+ * Create an uninitialized spa_t with the given name. Requires
+ * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
+ * exist by calling spa_lookup() first.
+ */
+spa_t *
+spa_add(const char *name, nvlist_t *config, const char *altroot)
+{
+ spa_t *spa;
+ spa_config_dirent_t *dp;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
+
+ mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL);
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_create(&spa->spa_free_bplist[t]);
+
+ (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+ spa->spa_freeze_txg = UINT64_MAX;
+ spa->spa_final_txg = UINT64_MAX;
+ spa->spa_load_max_txg = UINT64_MAX;
+ spa->spa_proc = &p0;
+ spa->spa_proc_state = SPA_PROC_NONE;
+ spa->spa_trust_config = B_TRUE;
+ spa->spa_hostid = zone_get_hostid(NULL);
+
+ spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
+ spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
+ spa_set_deadman_failmode(spa, zfs_deadman_failmode);
+
+ zfs_refcount_create(&spa->spa_refcount);
+ spa_config_lock_init(spa);
+ spa_stats_init(spa);
+
+ avl_add(&spa_namespace_avl, spa);
+
+ /*
+ * Set the alternate root, if there is one.
+ */
+ if (altroot)
+ spa->spa_root = spa_strdup(altroot);
+
+ spa->spa_alloc_count = spa_allocators;
+ spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (kmutex_t), KM_SLEEP);
+ spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (avl_tree_t), KM_SLEEP);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
+ sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+ }
+ avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
+ sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
+ avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg,
+ sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node));
+ list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t),
+ offsetof(log_summary_entry_t, lse_node));
+
+ /*
+ * Every pool starts with the default cachefile.
+ */
+ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
+ offsetof(spa_config_dirent_t, scd_link));
+
+ dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
+ dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
+ list_insert_head(&spa->spa_config_list, dp);
+
+ VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+
+ if (config != NULL) {
+ nvlist_t *features;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+ &features) == 0) {
+ VERIFY(nvlist_dup(features, &spa->spa_label_features,
+ 0) == 0);
+ }
+
+ VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+ }
+
+ if (spa->spa_label_features == NULL) {
+ VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ }
+
+ spa->spa_min_ashift = INT_MAX;
+ spa->spa_max_ashift = 0;
+ spa->spa_min_alloc = INT_MAX;
+
+ /* Reset cached value */
+ spa->spa_dedup_dspace = ~0ULL;
+
+ /*
+ * As a pool is being created, treat all features as disabled by
+ * setting SPA_FEATURE_DISABLED for all entries in the feature
+ * refcount cache.
+ */
+ for (int i = 0; i < SPA_FEATURES; i++) {
+ spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
+ }
+
+ list_create(&spa->spa_leaf_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_leaf_node));
+
+ return (spa);
+}
+
+/*
+ * Removes a spa_t from the namespace, freeing up any memory used. Requires
+ * spa_namespace_lock. This is called only after the spa_t has been closed and
+ * deactivated.
+ */
+void
+spa_remove(spa_t *spa)
+{
+ spa_config_dirent_t *dp;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED);
+ ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0);
+ ASSERT0(spa->spa_waiters);
+
+ nvlist_free(spa->spa_config_splitting);
+
+ avl_remove(&spa_namespace_avl, spa);
+ cv_broadcast(&spa_namespace_cv);
+
+ if (spa->spa_root)
+ spa_strfree(spa->spa_root);
+
+ while ((dp = list_head(&spa->spa_config_list)) != NULL) {
+ list_remove(&spa->spa_config_list, dp);
+ if (dp->scd_path != NULL)
+ spa_strfree(dp->scd_path);
+ kmem_free(dp, sizeof (spa_config_dirent_t));
+ }
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ avl_destroy(&spa->spa_alloc_trees[i]);
+ mutex_destroy(&spa->spa_alloc_locks[i]);
+ }
+ kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
+ sizeof (kmutex_t));
+ kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
+ sizeof (avl_tree_t));
+
+ avl_destroy(&spa->spa_metaslabs_by_flushed);
+ avl_destroy(&spa->spa_sm_logs_by_txg);
+ list_destroy(&spa->spa_log_summary);
+ list_destroy(&spa->spa_config_list);
+ list_destroy(&spa->spa_leaf_list);
+
+ nvlist_free(spa->spa_label_features);
+ nvlist_free(spa->spa_load_info);
+ nvlist_free(spa->spa_feat_stats);
+ spa_config_set(spa, NULL);
+
+ zfs_refcount_destroy(&spa->spa_refcount);
+
+ spa_stats_destroy(spa);
+ spa_config_lock_destroy(spa);
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_destroy(&spa->spa_free_bplist[t]);
+
+ zio_checksum_templates_free(spa);
+
+ cv_destroy(&spa->spa_async_cv);
+ cv_destroy(&spa->spa_evicting_os_cv);
+ cv_destroy(&spa->spa_proc_cv);
+ cv_destroy(&spa->spa_scrub_io_cv);
+ cv_destroy(&spa->spa_suspend_cv);
+ cv_destroy(&spa->spa_activities_cv);
+ cv_destroy(&spa->spa_waiters_cv);
+
+ mutex_destroy(&spa->spa_flushed_ms_lock);
+ mutex_destroy(&spa->spa_async_lock);
+ mutex_destroy(&spa->spa_errlist_lock);
+ mutex_destroy(&spa->spa_errlog_lock);
+ mutex_destroy(&spa->spa_evicting_os_lock);
+ mutex_destroy(&spa->spa_history_lock);
+ mutex_destroy(&spa->spa_proc_lock);
+ mutex_destroy(&spa->spa_props_lock);
+ mutex_destroy(&spa->spa_cksum_tmpls_lock);
+ mutex_destroy(&spa->spa_scrub_lock);
+ mutex_destroy(&spa->spa_suspend_lock);
+ mutex_destroy(&spa->spa_vdev_top_lock);
+ mutex_destroy(&spa->spa_feat_stats_lock);
+ mutex_destroy(&spa->spa_activities_lock);
+
+ kmem_free(spa, sizeof (spa_t));
+}
+
+/*
+ * Given a pool, return the next pool in the namespace, or NULL if there is
+ * none. If 'prev' is NULL, return the first pool.
+ */
+spa_t *
+spa_next(spa_t *prev)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (prev)
+ return (AVL_NEXT(&spa_namespace_avl, prev));
+ else
+ return (avl_first(&spa_namespace_avl));
+}
+
+/*
+ * ==========================================================================
+ * SPA refcount functions
+ * ==========================================================================
+ */
+
+/*
+ * Add a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_open_ref(spa_t *spa, void *tag)
+{
+ ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
+ MUTEX_HELD(&spa_namespace_lock));
+ (void) zfs_refcount_add(&spa->spa_refcount, tag);
+}
+
+/*
+ * Remove a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_close(spa_t *spa, void *tag)
+{
+ ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
+ MUTEX_HELD(&spa_namespace_lock));
+ (void) zfs_refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Remove a reference to the given spa_t held by a dsl dir that is
+ * being asynchronously released. Async releases occur from a taskq
+ * performing eviction of dsl datasets and dirs. The namespace lock
+ * isn't held and the hold by the object being evicted may contribute to
+ * spa_minref (e.g. dataset or directory released during pool export),
+ * so the asserts in spa_close() do not apply.
+ */
+void
+spa_async_close(spa_t *spa, void *tag)
+{
+ (void) zfs_refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Check to see if the spa refcount is zero. Must be called with
+ * spa_namespace_lock held. We really compare against spa_minref, which is the
+ * number of references acquired when opening a pool.
+ */
+boolean_t
+spa_refcount_zero(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref);
+}
+
+/*
+ * ==========================================================================
+ * SPA spare and l2cache tracking
+ * ==========================================================================
+ */
+
+/*
+ * Hot spares and cache devices are tracked using the same code below,
+ * for 'auxiliary' devices.
+ */
+
+typedef struct spa_aux {
+ uint64_t aux_guid;
+ uint64_t aux_pool;
+ avl_node_t aux_avl;
+ int aux_count;
+} spa_aux_t;
+
+static inline int
+spa_aux_compare(const void *a, const void *b)
+{
+ const spa_aux_t *sa = (const spa_aux_t *)a;
+ const spa_aux_t *sb = (const spa_aux_t *)b;
+
+ return (TREE_CMP(sa->aux_guid, sb->aux_guid));
+}
+
+static void
+spa_aux_add(vdev_t *vd, avl_tree_t *avl)
+{
+ avl_index_t where;
+ spa_aux_t search;
+ spa_aux_t *aux;
+
+ search.aux_guid = vd->vdev_guid;
+ if ((aux = avl_find(avl, &search, &where)) != NULL) {
+ aux->aux_count++;
+ } else {
+ aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
+ aux->aux_guid = vd->vdev_guid;
+ aux->aux_count = 1;
+ avl_insert(avl, aux, where);
+ }
+}
+
+static void
+spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
+{
+ spa_aux_t search;
+ spa_aux_t *aux;
+ avl_index_t where;
+
+ search.aux_guid = vd->vdev_guid;
+ aux = avl_find(avl, &search, &where);
+
+ ASSERT(aux != NULL);
+
+ if (--aux->aux_count == 0) {
+ avl_remove(avl, aux);
+ kmem_free(aux, sizeof (spa_aux_t));
+ } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
+ aux->aux_pool = 0ULL;
+ }
+}
+
+static boolean_t
+spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
+{
+ spa_aux_t search, *found;
+
+ search.aux_guid = guid;
+ found = avl_find(avl, &search, NULL);
+
+ if (pool) {
+ if (found)
+ *pool = found->aux_pool;
+ else
+ *pool = 0ULL;
+ }
+
+ if (refcnt) {
+ if (found)
+ *refcnt = found->aux_count;
+ else
+ *refcnt = 0;
+ }
+
+ return (found != NULL);
+}
+
+static void
+spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
+{
+ spa_aux_t search, *found;
+ avl_index_t where;
+
+ search.aux_guid = vd->vdev_guid;
+ found = avl_find(avl, &search, &where);
+ ASSERT(found != NULL);
+ ASSERT(found->aux_pool == 0ULL);
+
+ found->aux_pool = spa_guid(vd->vdev_spa);
+}
+
+/*
+ * Spares are tracked globally due to the following constraints:
+ *
+ * - A spare may be part of multiple pools.
+ * - A spare may be added to a pool even if it's actively in use within
+ * another pool.
+ * - A spare in use in any pool can only be the source of a replacement if
+ * the target is a spare in the same pool.
+ *
+ * We keep track of all spares on the system through the use of a reference
+ * counted AVL tree. When a vdev is added as a spare, or used as a replacement
+ * spare, we bump the reference count in the AVL tree. In addition, we set
+ * the 'vdev_isspare' member to indicate that the device is a spare (active or
+ * inactive). When a spare is made active (used to replace a device in the
+ * pool), we also keep track of which pool it's been made a part of.
+ *
+ * The 'spa_spare_lock' protects the AVL tree. These functions are normally
+ * called under the spa_namespace lock as part of vdev reconfiguration. The
+ * separate spare lock exists for the status query path, which does not need to
+ * be completely consistent with respect to other vdev configuration changes.
+ */
+
+static int
+spa_spare_compare(const void *a, const void *b)
+{
+ return (spa_aux_compare(a, b));
+}
+
+void
+spa_spare_add(vdev_t *vd)
+{
+ mutex_enter(&spa_spare_lock);
+ ASSERT(!vd->vdev_isspare);
+ spa_aux_add(vd, &spa_spare_avl);
+ vd->vdev_isspare = B_TRUE;
+ mutex_exit(&spa_spare_lock);
+}
+
+void
+spa_spare_remove(vdev_t *vd)
+{
+ mutex_enter(&spa_spare_lock);
+ ASSERT(vd->vdev_isspare);
+ spa_aux_remove(vd, &spa_spare_avl);
+ vd->vdev_isspare = B_FALSE;
+ mutex_exit(&spa_spare_lock);
+}
+
+boolean_t
+spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
+{
+ boolean_t found;
+
+ mutex_enter(&spa_spare_lock);
+ found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
+ mutex_exit(&spa_spare_lock);
+
+ return (found);
+}
+
+void
+spa_spare_activate(vdev_t *vd)
+{
+ mutex_enter(&spa_spare_lock);
+ ASSERT(vd->vdev_isspare);
+ spa_aux_activate(vd, &spa_spare_avl);
+ mutex_exit(&spa_spare_lock);
+}
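+
+/*
+ * A rough sketch of the expected spare lifecycle (the actual call sites
+ * live in the vdev/spa configuration code, not in this file):
+ *
+ *	spa_spare_add(vd);		// config lists vd as a spare
+ *	spa_spare_activate(vd);		// vd is now replacing a failed device
+ *	...
+ *	spa_spare_remove(vd);		// spare dropped from the config
+ *
+ * spa_spare_exists() can be queried at any point to learn whether a guid
+ * is a known spare and, if active, which pool is using it.
+ */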
+
+/*
+ * Level 2 ARC devices are tracked globally for the same reasons as spares.
+ * Cache devices currently only support one pool per cache device, so for
+ * these devices the aux reference count never exceeds 1.
+ */
+
+static int
+spa_l2cache_compare(const void *a, const void *b)
+{
+ return (spa_aux_compare(a, b));
+}
+
+void
+spa_l2cache_add(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(!vd->vdev_isl2cache);
+ spa_aux_add(vd, &spa_l2cache_avl);
+ vd->vdev_isl2cache = B_TRUE;
+ mutex_exit(&spa_l2cache_lock);
+}
+
+void
+spa_l2cache_remove(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(vd->vdev_isl2cache);
+ spa_aux_remove(vd, &spa_l2cache_avl);
+ vd->vdev_isl2cache = B_FALSE;
+ mutex_exit(&spa_l2cache_lock);
+}
+
+boolean_t
+spa_l2cache_exists(uint64_t guid, uint64_t *pool)
+{
+ boolean_t found;
+
+ mutex_enter(&spa_l2cache_lock);
+ found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
+ mutex_exit(&spa_l2cache_lock);
+
+ return (found);
+}
+
+void
+spa_l2cache_activate(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(vd->vdev_isl2cache);
+ spa_aux_activate(vd, &spa_l2cache_avl);
+ mutex_exit(&spa_l2cache_lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA vdev locking
+ * ==========================================================================
+ */
+
+/*
+ * Lock the given spa_t for the purpose of adding or removing a vdev.
+ * Grabs the global spa_namespace_lock plus the spa config lock for writing.
+ * It returns the next transaction group for the spa_t.
+ */
+uint64_t
+spa_vdev_enter(spa_t *spa)
+{
+ mutex_enter(&spa->spa_vdev_top_lock);
+ mutex_enter(&spa_namespace_lock);
+
+ vdev_autotrim_stop_all(spa);
+
+ return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * The same as spa_vdev_enter() above but additionally takes the guid of
+ * the vdev being detached. When there is a rebuild in progress it will be
+ * suspended while the vdev tree is modified, then resumed by spa_vdev_exit().
+ * The rebuild is canceled if only a single child remains after the detach.
+ */
+uint64_t
+spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
+{
+ mutex_enter(&spa->spa_vdev_top_lock);
+ mutex_enter(&spa_namespace_lock);
+
+ vdev_autotrim_stop_all(spa);
+
+ if (guid != 0) {
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd) {
+ vdev_rebuild_stop_wait(vd->vdev_top);
+ }
+ }
+
+ return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * Internal implementation for spa_vdev_enter(). Used when a vdev
+ * operation requires multiple syncs (e.g. removing a device) while
+ * keeping the spa_namespace_lock held.
+ */
+uint64_t
+spa_vdev_config_enter(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+
+ return (spa_last_synced_txg(spa) + 1);
+}
+
+/*
+ * Used in combination with spa_vdev_config_enter() to allow the syncing
+ * of multiple transactions without releasing the spa_namespace_lock.
+ */
+void
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ int config_changed = B_FALSE;
+
+ ASSERT(txg > spa_last_synced_txg(spa));
+
+ spa->spa_pending_vdev = NULL;
+
+ /*
+ * Reassess the DTLs.
+ */
+ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE);
+
+ if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
+ config_changed = B_TRUE;
+ spa->spa_config_generation++;
+ }
+
+ /*
+ * Verify the metaslab classes.
+ */
+ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
+
+ spa_config_exit(spa, SCL_ALL, spa);
+
+ /*
+ * Panic the system if the specified tag requires it. This
+ * is useful for ensuring that configurations are updated
+ * transactionally.
+ */
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, tag, 0);
+
+ /*
+ * Note: this txg_wait_synced() is important because it ensures
+ * that there won't be more than one config change per txg.
+ * This allows us to use the txg as the generation number.
+ */
+ if (error == 0)
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ if (vd != NULL) {
+ ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
+ NULL);
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
+ mutex_exit(&vd->vdev_trim_lock);
+ }
+
+ /*
+ * The vdev may be both a leaf and top-level device.
+ */
+ vdev_autotrim_stop_wait(vd);
+
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+ vdev_free(vd);
+ spa_config_exit(spa, SCL_ALL, spa);
+ }
+
+ /*
+ * If the config changed, update the config cache.
+ */
+ if (config_changed)
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+}
+
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ vdev_autotrim_restart(spa);
+ vdev_rebuild_restart(spa);
+
+ spa_vdev_config_exit(spa, vd, txg, error, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
+}
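+
+/*
+ * A minimal usage sketch of the enter/exit pairing above (the top-level
+ * vdev operations follow this general shape):
+ *
+ *	uint64_t txg = spa_vdev_enter(spa);
+ *	... modify the vdev tree ...
+ *	return (spa_vdev_exit(spa, NULL, txg, error));
+ *
+ * On success spa_vdev_exit() waits for the txg to sync, so the
+ * configuration change is on disk before control returns to the caller.
+ * A non-NULL vd passed to spa_vdev_exit() is freed once it has synced.
+ */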
+
+/*
+ * Lock the given spa_t for the purpose of changing vdev state.
+ */
+void
+spa_vdev_state_enter(spa_t *spa, int oplocks)
+{
+ int locks = SCL_STATE_ALL | oplocks;
+
+ /*
+ * Root pools may need to read from the underlying devfs filesystem
+ * when opening up a vdev. Unfortunately, if we're holding the
+ * SCL_ZIO lock it will result in a deadlock when we try to issue
+ * the read from the root filesystem. Instead we "prefetch"
+ * the associated vnodes that we need prior to opening the
+ * underlying devices and cache them so that we can prevent
+ * any I/O when we are doing the actual open.
+ */
+ if (spa_is_root(spa)) {
+ int low = locks & ~(SCL_ZIO - 1);
+ int high = locks & ~low;
+
+ spa_config_enter(spa, high, spa, RW_WRITER);
+ vdev_hold(spa->spa_root_vdev);
+ spa_config_enter(spa, low, spa, RW_WRITER);
+ } else {
+ spa_config_enter(spa, locks, spa, RW_WRITER);
+ }
+ spa->spa_vdev_locks = locks;
+}
+
+int
+spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
+{
+ boolean_t config_changed = B_FALSE;
+ vdev_t *vdev_top;
+
+ if (vd == NULL || vd == spa->spa_root_vdev) {
+ vdev_top = spa->spa_root_vdev;
+ } else {
+ vdev_top = vd->vdev_top;
+ }
+
+ if (vd != NULL || error == 0)
+ vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE);
+
+ if (vd != NULL) {
+ if (vd != spa->spa_root_vdev)
+ vdev_state_dirty(vdev_top);
+
+ config_changed = B_TRUE;
+ spa->spa_config_generation++;
+ }
+
+ if (spa_is_root(spa))
+ vdev_rele(spa->spa_root_vdev);
+
+ ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
+ spa_config_exit(spa, spa->spa_vdev_locks, spa);
+
+ /*
+ * If anything changed, wait for it to sync. This ensures that,
+ * from the system administrator's perspective, zpool(8) commands
+ * are synchronous. This is important for things like zpool offline:
+ * when the command completes, you expect no further I/O from ZFS.
+ */
+ if (vd != NULL)
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * If the config changed, update the config cache.
+ */
+ if (config_changed) {
+ mutex_enter(&spa_namespace_lock);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (error);
+}
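+
+/*
+ * Sketch of the state-change pairing (vdev online/offline and similar
+ * paths follow roughly this pattern):
+ *
+ *	spa_vdev_state_enter(spa, SCL_NONE);
+ *	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+ *	... change vd's state ...
+ *	return (spa_vdev_state_exit(spa, vd, error));
+ *
+ * Passing the affected vdev back to spa_vdev_state_exit() dirties its
+ * top-level vdev and waits for the state change to sync.
+ */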
+
+/*
+ * ==========================================================================
+ * Miscellaneous functions
+ * ==========================================================================
+ */
+
+void
+spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
+{
+ if (!nvlist_exists(spa->spa_label_features, feature)) {
+ fnvlist_add_boolean(spa->spa_label_features, feature);
+ /*
+ * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
+ * dirty the vdev config because lock SCL_CONFIG is not held.
+ * Thankfully, in this case we don't need to dirty the config
+ * because it will be written out anyway when we finish
+ * creating the pool.
+ */
+ if (tx->tx_txg != TXG_INITIAL)
+ vdev_config_dirty(spa->spa_root_vdev);
+ }
+}
+
+void
+spa_deactivate_mos_feature(spa_t *spa, const char *feature)
+{
+ if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
+ vdev_config_dirty(spa->spa_root_vdev);
+}
+
+/*
+ * Return the spa_t associated with the given pool_guid, if it exists. If
+ * device_guid is non-zero, determine whether the pool exists *and* contains
+ * a device with the specified device_guid.
+ */
+spa_t *
+spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
+{
+ spa_t *spa;
+ avl_tree_t *t = &spa_namespace_avl;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ continue;
+ if (spa->spa_root_vdev == NULL)
+ continue;
+ if (spa_guid(spa) == pool_guid) {
+ if (device_guid == 0)
+ break;
+
+ if (vdev_lookup_by_guid(spa->spa_root_vdev,
+ device_guid) != NULL)
+ break;
+
+ /*
+ * Check any devices we may be in the process of adding.
+ */
+ if (spa->spa_pending_vdev) {
+ if (vdev_lookup_by_guid(spa->spa_pending_vdev,
+ device_guid) != NULL)
+ break;
+ }
+ }
+ }
+
+ return (spa);
+}
+
+/*
+ * Determine whether a pool with the given pool_guid exists.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+ return (spa_by_guid(pool_guid, device_guid) != NULL);
+}
+
+char *
+spa_strdup(const char *s)
+{
+ size_t len;
+ char *new;
+
+ len = strlen(s);
+ new = kmem_alloc(len + 1, KM_SLEEP);
+ bcopy(s, new, len);
+ new[len] = '\0';
+
+ return (new);
+}
+
+void
+spa_strfree(char *s)
+{
+ kmem_free(s, strlen(s) + 1);
+}
+
+uint64_t
+spa_get_random(uint64_t range)
+{
+ uint64_t r;
+
+ ASSERT(range != 0);
+
+ if (range == 1)
+ return (0);
+
+ (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
+
+ return (r % range);
+}
+
+uint64_t
+spa_generate_guid(spa_t *spa)
+{
+ uint64_t guid = spa_get_random(-1ULL);
+
+ if (spa != NULL) {
+ while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
+ guid = spa_get_random(-1ULL);
+ } else {
+ while (guid == 0 || spa_guid_exists(guid, 0))
+ guid = spa_get_random(-1ULL);
+ }
+
+ return (guid);
+}
+
+void
+snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
+{
+ char type[256];
+ char *checksum = NULL;
+ char *compress = NULL;
+
+ if (bp != NULL) {
+ if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
+ dmu_object_byteswap_t bswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
+ (void) snprintf(type, sizeof (type), "bswap %s %s",
+ DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
+ "metadata" : "data",
+ dmu_ot_byteswap[bswap].ob_name);
+ } else {
+ (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
+ sizeof (type));
+ }
+ if (!BP_IS_EMBEDDED(bp)) {
+ checksum =
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ }
+ compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
+ }
+
+ SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
+ compress);
+}
+
+void
+spa_freeze(spa_t *spa)
+{
+ uint64_t freeze_txg = 0;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ if (spa->spa_freeze_txg == UINT64_MAX) {
+ freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
+ spa->spa_freeze_txg = freeze_txg;
+ }
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (freeze_txg != 0)
+ txg_wait_synced(spa_get_dsl(spa), freeze_txg);
+}
+
+void
+zfs_panic_recover(const char *fmt, ...)
+{
+ va_list adx;
+
+ va_start(adx, fmt);
+ vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
+ va_end(adx);
+}
+
+/*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ */
+uint64_t
+zfs_strtonum(const char *str, char **nptr)
+{
+ uint64_t val = 0;
+ char c;
+ int digit;
+
+ while ((c = *str) != '\0') {
+ if (c >= '0' && c <= '9')
+ digit = c - '0';
+ else if (c >= 'a' && c <= 'f')
+ digit = 10 + c - 'a';
+ else
+ break;
+
+ val *= 16;
+ val += digit;
+
+ str++;
+ }
+
+ if (nptr)
+ *nptr = (char *)str;
+
+ return (val);
+}
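+
+/*
+ * For example, zfs_strtonum("1a2b", &end) returns 0x1a2b and leaves 'end'
+ * pointing at the terminating NUL; parsing stops at the first character
+ * that is not a lowercase hexadecimal digit.
+ */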
+
+void
+spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
+{
+ /*
+ * We bump the feature refcount for each special vdev added to the pool.
+ */
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
+ spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
+}
+
+/*
+ * ==========================================================================
+ * Accessor functions
+ * ==========================================================================
+ */
+
+boolean_t
+spa_shutting_down(spa_t *spa)
+{
+ return (spa->spa_async_suspended);
+}
+
+dsl_pool_t *
+spa_get_dsl(spa_t *spa)
+{
+ return (spa->spa_dsl_pool);
+}
+
+boolean_t
+spa_is_initializing(spa_t *spa)
+{
+ return (spa->spa_is_initializing);
+}
+
+boolean_t
+spa_indirect_vdevs_loaded(spa_t *spa)
+{
+ return (spa->spa_indirect_vdevs_loaded);
+}
+
+blkptr_t *
+spa_get_rootblkptr(spa_t *spa)
+{
+ return (&spa->spa_ubsync.ub_rootbp);
+}
+
+void
+spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
+{
+ spa->spa_uberblock.ub_rootbp = *bp;
+}
+
+void
+spa_altroot(spa_t *spa, char *buf, size_t buflen)
+{
+ if (spa->spa_root == NULL)
+ buf[0] = '\0';
+ else
+ (void) strncpy(buf, spa->spa_root, buflen);
+}
+
+int
+spa_sync_pass(spa_t *spa)
+{
+ return (spa->spa_sync_pass);
+}
+
+char *
+spa_name(spa_t *spa)
+{
+ return (spa->spa_name);
+}
+
+uint64_t
+spa_guid(spa_t *spa)
+{
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ uint64_t guid;
+
+ /*
+ * If we fail to parse the config during spa_load(), we can go through
+ * the error path (which posts an ereport) and end up here with no root
+ * vdev. We stash the original pool guid in 'spa_config_guid' to handle
+ * this case.
+ */
+ if (spa->spa_root_vdev == NULL)
+ return (spa->spa_config_guid);
+
+ guid = spa->spa_last_synced_guid != 0 ?
+ spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
+
+ /*
+ * Return the most recently synced out guid unless we're
+ * in syncing context.
+ */
+ if (dp && dsl_pool_sync_context(dp))
+ return (spa->spa_root_vdev->vdev_guid);
+ else
+ return (guid);
+}
+
+uint64_t
+spa_load_guid(spa_t *spa)
+{
+ /*
+ * This is a GUID that exists solely as a reference for the
+ * purposes of the arc. It is generated at load time, and
+ * is never written to persistent storage.
+ */
+ return (spa->spa_load_guid);
+}
+
+uint64_t
+spa_last_synced_txg(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_txg);
+}
+
+uint64_t
+spa_first_txg(spa_t *spa)
+{
+ return (spa->spa_first_txg);
+}
+
+uint64_t
+spa_syncing_txg(spa_t *spa)
+{
+ return (spa->spa_syncing_txg);
+}
+
+/*
+ * Return the last txg where data can be dirtied. The final txgs
+ * will be used just to clear out any deferred frees that remain.
+ */
+uint64_t
+spa_final_dirty_txg(spa_t *spa)
+{
+ return (spa->spa_final_txg - TXG_DEFER_SIZE);
+}
+
+pool_state_t
+spa_state(spa_t *spa)
+{
+ return (spa->spa_state);
+}
+
+spa_load_state_t
+spa_load_state(spa_t *spa)
+{
+ return (spa->spa_load_state);
+}
+
+uint64_t
+spa_freeze_txg(spa_t *spa)
+{
+ return (spa->spa_freeze_txg);
+}
+
+/*
+ * Return the inflated asize for a logical write in bytes. This is used by the
+ * DMU to calculate the space a logical write will require on disk.
+ * If lsize is smaller than the largest physical block size allocatable on this
+ * pool, we use its value instead, since the write will end up using the whole
+ * block anyway.
+ */
+uint64_t
+spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
+{
+ if (lsize == 0)
+ return (0); /* No inflation needed */
+ return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation);
+}
+
+/*
+ * Return the amount of slop space in bytes. It is typically 1/32 of the pool
+ * (3.2%), minus the embedded log space. On very small pools, it may be
+ * slightly larger than this. The embedded log space is not included in
+ * spa_dspace. By subtracting it, the usable space (per "zfs list") is a
+ * constant 97% of the total space, regardless of metaslab size (assuming the
+ * default spa_slop_shift=5 and a non-tiny pool).
+ *
+ * See the comment above spa_slop_shift for more details.
+ */
+uint64_t
+spa_get_slop_space(spa_t *spa)
+{
+ uint64_t space = spa_get_dspace(spa);
+ uint64_t slop = space >> spa_slop_shift;
+
+ /*
+ * Subtract the embedded log space, but no more than half the (3.2%)
+ * unusable space. Note, the "no more than half" is only relevant if
+ * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by
+ * default.
+ */
+ uint64_t embedded_log =
+ metaslab_class_get_dspace(spa_embedded_log_class(spa));
+ slop -= MIN(embedded_log, slop >> 1);
+
+ /*
+ * Slop space should be at least spa_min_slop, but no more than half
+ * the entire pool.
+ */
+ slop = MAX(slop, MIN(space >> 1, spa_min_slop));
+ return (slop);
+}
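+
+/*
+ * Worked example, assuming the default spa_slop_shift of 5: for a pool
+ * with 1 TiB of dspace the initial slop is 1 TiB / 32 = 32 GiB.  If the
+ * embedded log class holds 1 GiB, the slop becomes 31 GiB (the
+ * subtraction is capped at half the slop).  On a tiny pool the final
+ * MAX() keeps at least spa_min_slop, or half the pool if that is
+ * smaller, in reserve.
+ */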
+
+uint64_t
+spa_get_dspace(spa_t *spa)
+{
+ return (spa->spa_dspace);
+}
+
+uint64_t
+spa_get_checkpoint_space(spa_t *spa)
+{
+ return (spa->spa_checkpoint_info.sci_dspace);
+}
+
+void
+spa_update_dspace(spa_t *spa)
+{
+ spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
+ ddt_get_dedup_dspace(spa);
+ if (spa->spa_vdev_removal != NULL) {
+ /*
+ * We can't allocate from the removing device, so subtract
+ * its size if it was included in dspace (i.e. if this is a
+ * normal-class vdev, not special/dedup). This prevents the
+ * DMU/DSL from filling up the (now smaller) pool while we
+ * are in the middle of removing the device.
+ *
+ * Note that the DMU/DSL doesn't actually know or care
+ * how much space is allocated (it does its own tracking
+ * of how much space has been logically used). So it
+ * doesn't matter that the data we are moving may be
+ * allocated twice (on the old device and the new
+ * device).
+ */
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vdev_t *vd =
+ vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+ if (vd->vdev_mg->mg_class == spa_normal_class(spa)) {
+ spa->spa_dspace -= spa_deflate(spa) ?
+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+ }
+}
+
+/*
+ * Return the failure mode that has been set for this pool. The default
+ * behavior will be to block all I/Os when a complete failure occurs.
+ */
+uint64_t
+spa_get_failmode(spa_t *spa)
+{
+ return (spa->spa_failmode);
+}
+
+boolean_t
+spa_suspended(spa_t *spa)
+{
+ return (spa->spa_suspended != ZIO_SUSPEND_NONE);
+}
+
+uint64_t
+spa_version(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_version);
+}
+
+boolean_t
+spa_deflate(spa_t *spa)
+{
+ return (spa->spa_deflate);
+}
+
+metaslab_class_t *
+spa_normal_class(spa_t *spa)
+{
+ return (spa->spa_normal_class);
+}
+
+metaslab_class_t *
+spa_log_class(spa_t *spa)
+{
+ return (spa->spa_log_class);
+}
+
+metaslab_class_t *
+spa_embedded_log_class(spa_t *spa)
+{
+ return (spa->spa_embedded_log_class);
+}
+
+metaslab_class_t *
+spa_special_class(spa_t *spa)
+{
+ return (spa->spa_special_class);
+}
+
+metaslab_class_t *
+spa_dedup_class(spa_t *spa)
+{
+ return (spa->spa_dedup_class);
+}
+
+/*
+ * Locate an appropriate allocation class
+ */
+metaslab_class_t *
+spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
+ uint_t level, uint_t special_smallblk)
+{
+ /*
+ * ZIL allocations determine their class in zio_alloc_zil().
+ */
+ ASSERT(objtype != DMU_OT_INTENT_LOG);
+
+ boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
+
+ if (DMU_OT_IS_DDT(objtype)) {
+ if (spa->spa_dedup_class->mc_groups != 0)
+ return (spa_dedup_class(spa));
+ else if (has_special_class && zfs_ddt_data_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /* Indirect blocks for user data can land in special if allowed */
+ if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
+ if (has_special_class && zfs_user_indirect_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ if (DMU_OT_IS_METADATA(objtype) || level > 0) {
+ if (has_special_class)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /*
+ * Allow small file blocks in special class in some cases (like
+ * for the dRAID vdev feature). But always leave a reserve of
+ * zfs_special_class_metadata_reserve_pct exclusively for metadata.
+ */
+ if (DMU_OT_IS_FILE(objtype) &&
+ has_special_class && size <= special_smallblk) {
+ metaslab_class_t *special = spa_special_class(spa);
+ uint64_t alloc = metaslab_class_get_alloc(special);
+ uint64_t space = metaslab_class_get_space(special);
+ uint64_t limit =
+ (space * (100 - zfs_special_class_metadata_reserve_pct))
+ / 100;
+
+ if (alloc < limit)
+ return (special);
+ }
+
+ return (spa_normal_class(spa));
+}
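+
+/*
+ * For instance, on a pool with special vdevs configured: metadata blocks
+ * are steered to the special class; DDT blocks prefer a dedup class and
+ * fall back to special when zfs_ddt_data_is_special is set; user-data
+ * indirect blocks go there only when zfs_user_indirect_is_special is
+ * set; and a level-0 file block lands there only when its size is at or
+ * below special_smallblk and the class still has room above the
+ * zfs_special_class_metadata_reserve_pct reserve.  Everything else falls
+ * back to the normal class.
+ */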
+
+void
+spa_evicting_os_register(spa_t *spa, objset_t *os)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ list_insert_head(&spa->spa_evicting_os_list, os);
+ mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_deregister(spa_t *spa, objset_t *os)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ list_remove(&spa->spa_evicting_os_list, os);
+ cv_broadcast(&spa->spa_evicting_os_cv);
+ mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_wait(spa_t *spa)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ while (!list_is_empty(&spa->spa_evicting_os_list))
+ cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
+ mutex_exit(&spa->spa_evicting_os_lock);
+
+ dmu_buf_user_evict_wait();
+}
+
+int
+spa_max_replication(spa_t *spa)
+{
+ /*
+ * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
+ * handle BPs with more than one DVA allocated. Set our max
+ * replication level accordingly.
+ */
+ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
+ return (1);
+ return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
+}
+
+int
+spa_prev_software_version(spa_t *spa)
+{
+ return (spa->spa_prev_software_version);
+}
+
+uint64_t
+spa_deadman_synctime(spa_t *spa)
+{
+ return (spa->spa_deadman_synctime);
+}
+
+spa_autotrim_t
+spa_get_autotrim(spa_t *spa)
+{
+ return (spa->spa_autotrim);
+}
+
+uint64_t
+spa_deadman_ziotime(spa_t *spa)
+{
+ return (spa->spa_deadman_ziotime);
+}
+
+uint64_t
+spa_get_deadman_failmode(spa_t *spa)
+{
+ return (spa->spa_deadman_failmode);
+}
+
+void
+spa_set_deadman_failmode(spa_t *spa, const char *failmode)
+{
+ if (strcmp(failmode, "wait") == 0)
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
+ else if (strcmp(failmode, "continue") == 0)
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE;
+ else if (strcmp(failmode, "panic") == 0)
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
+ else
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
+}
+
+void
+spa_set_deadman_ziotime(hrtime_t ns)
+{
+ spa_t *spa = NULL;
+
+ if (spa_mode_global != SPA_MODE_UNINIT) {
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL)
+ spa->spa_deadman_ziotime = ns;
+ mutex_exit(&spa_namespace_lock);
+ }
+}
+
+void
+spa_set_deadman_synctime(hrtime_t ns)
+{
+ spa_t *spa = NULL;
+
+ if (spa_mode_global != SPA_MODE_UNINIT) {
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL)
+ spa->spa_deadman_synctime = ns;
+ mutex_exit(&spa_namespace_lock);
+ }
+}
+
+uint64_t
+dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
+{
+ uint64_t asize = DVA_GET_ASIZE(dva);
+ uint64_t dsize = asize;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ if (asize != 0 && spa->spa_deflate) {
+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+ if (vd != NULL)
+ dsize = (asize >> SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio;
+ }
+
+ return (dsize);
+}
+
+uint64_t
+bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
+
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+ return (dsize);
+}
+
+uint64_t
+bp_get_dsize(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ return (dsize);
+}
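+
+/*
+ * Note that dsize sums over every DVA in the block pointer, so a block
+ * written with copies=2 reports the deflated size of both allocations.
+ * bp_get_dsize() is the variant that takes SCL_VDEV itself for callers
+ * that do not already hold a config lock.
+ */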
+
+uint64_t
+spa_dirty_data(spa_t *spa)
+{
+ return (spa->spa_dsl_pool->dp_dirty_total);
+}
+
+/*
+ * ==========================================================================
+ * SPA Import Progress Routines
+ * ==========================================================================
+ */
+
+typedef struct spa_import_progress {
+ uint64_t pool_guid; /* unique id for updates */
+ char *pool_name;
+ spa_load_state_t spa_load_state;
+ uint64_t mmp_sec_remaining; /* MMP activity check */
+ uint64_t spa_load_max_txg; /* rewind txg */
+ procfs_list_node_t smh_node;
+} spa_import_progress_t;
+
+spa_history_list_t *spa_import_progress_list = NULL;
+
+static int
+spa_import_progress_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid",
+ "load_state", "multihost_secs", "max_txg",
+ "pool_name");
+ return (0);
+}
+
+static int
+spa_import_progress_show(struct seq_file *f, void *data)
+{
+ spa_import_progress_t *sip = (spa_import_progress_t *)data;
+
+ seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n",
+ (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state,
+ (u_longlong_t)sip->mmp_sec_remaining,
+ (u_longlong_t)sip->spa_load_max_txg,
+ (sip->pool_name ? sip->pool_name : "-"));
+
+ return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size)
+{
+ spa_import_progress_t *sip;
+ while (shl->size > size) {
+ sip = list_remove_head(&shl->procfs_list.pl_list);
+ if (sip->pool_name)
+ spa_strfree(sip->pool_name);
+ kmem_free(sip, sizeof (spa_import_progress_t));
+ shl->size--;
+ }
+
+ IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list));
+}
+
+static void
+spa_import_progress_init(void)
+{
+ spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t),
+ KM_SLEEP);
+
+ spa_import_progress_list->size = 0;
+
+ spa_import_progress_list->procfs_list.pl_private =
+ spa_import_progress_list;
+
+ procfs_list_install("zfs",
+ NULL,
+ "import_progress",
+ 0644,
+ &spa_import_progress_list->procfs_list,
+ spa_import_progress_show,
+ spa_import_progress_show_header,
+ NULL,
+ offsetof(spa_import_progress_t, smh_node));
+}
+
+static void
+spa_import_progress_destroy(void)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ procfs_list_uninstall(&shl->procfs_list);
+ spa_import_progress_truncate(shl, 0);
+ procfs_list_destroy(&shl->procfs_list);
+ kmem_free(shl, sizeof (spa_history_list_t));
+}
+
+int
+spa_import_progress_set_state(uint64_t pool_guid,
+ spa_load_state_t load_state)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ spa_import_progress_t *sip;
+ int error = ENOENT;
+
+ if (shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
+ sip = list_prev(&shl->procfs_list.pl_list, sip)) {
+ if (sip->pool_guid == pool_guid) {
+ sip->spa_load_state = load_state;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+int
+spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ spa_import_progress_t *sip;
+ int error = ENOENT;
+
+ if (shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
+ sip = list_prev(&shl->procfs_list.pl_list, sip)) {
+ if (sip->pool_guid == pool_guid) {
+ sip->spa_load_max_txg = load_max_txg;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+int
+spa_import_progress_set_mmp_check(uint64_t pool_guid,
+ uint64_t mmp_sec_remaining)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ spa_import_progress_t *sip;
+ int error = ENOENT;
+
+ if (shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
+ sip = list_prev(&shl->procfs_list.pl_list, sip)) {
+ if (sip->pool_guid == pool_guid) {
+ sip->mmp_sec_remaining = mmp_sec_remaining;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+/*
+ * A new import is in progress; add an entry.
+ */
+void
+spa_import_progress_add(spa_t *spa)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ spa_import_progress_t *sip;
+ char *poolname = NULL;
+
+ sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP);
+ sip->pool_guid = spa_guid(spa);
+
+ (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME,
+ &poolname);
+ if (poolname == NULL)
+ poolname = spa_name(spa);
+ sip->pool_name = spa_strdup(poolname);
+ sip->spa_load_state = spa_load_state(spa);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ procfs_list_add(&shl->procfs_list, sip);
+ shl->size++;
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+void
+spa_import_progress_remove(uint64_t pool_guid)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ spa_import_progress_t *sip;
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
+ sip = list_prev(&shl->procfs_list.pl_list, sip)) {
+ if (sip->pool_guid == pool_guid) {
+ if (sip->pool_name)
+ spa_strfree(sip->pool_name);
+ list_remove(&shl->procfs_list.pl_list, sip);
+ shl->size--;
+ kmem_free(sip, sizeof (spa_import_progress_t));
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
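+
+/*
+ * Rough lifetime of an entry (the callers live in the import path):
+ * spa_import_progress_add() creates it when an import starts, the
+ * spa_import_progress_set_*() helpers update it as the load state,
+ * rewind txg, or MMP countdown change, and spa_import_progress_remove()
+ * drops it once the import completes or fails.
+ */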
+
+/*
+ * ==========================================================================
+ * Initialization and Termination
+ * ==========================================================================
+ */
+
+static int
+spa_name_compare(const void *a1, const void *a2)
+{
+ const spa_t *s1 = a1;
+ const spa_t *s2 = a2;
+ int s;
+
+ s = strcmp(s1->spa_name, s2->spa_name);
+
+ return (TREE_ISIGN(s));
+}
+
+void
+spa_boot_init(void)
+{
+ spa_config_load();
+}
+
+void
+spa_init(spa_mode_t mode)
+{
+ mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
+
+ avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
+ offsetof(spa_t, spa_avl));
+
+ avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
+ offsetof(spa_aux_t, aux_avl));
+
+ avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
+ offsetof(spa_aux_t, aux_avl));
+
+ spa_mode_global = mode;
+
+#ifndef _KERNEL
+ if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) {
+ struct sigaction sa;
+
+ sa.sa_flags = SA_SIGINFO;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_sigaction = arc_buf_sigsegv;
+
+ if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+ perror("could not enable watchpoints: "
+ "sigaction(SIGSEGV, ...) = ");
+ } else {
+ arc_watch = B_TRUE;
+ }
+ }
+#endif
+
+ fm_init();
+ zfs_refcount_init();
+ unique_init();
+ zfs_btree_init();
+ metaslab_stat_init();
+ ddt_init();
+ zio_init();
+ dmu_init();
+ zil_init();
+ vdev_cache_stat_init();
+ vdev_mirror_stat_init();
+ vdev_raidz_math_init();
+ vdev_file_init();
+ zfs_prop_init();
+ zpool_prop_init();
+ zpool_feature_init();
+ spa_config_load();
+ l2arc_start();
+ scan_init();
+ qat_init();
+ spa_import_progress_init();
+}
+
+void
+spa_fini(void)
+{
+ l2arc_stop();
+
+ spa_evict_all();
+
+ vdev_file_fini();
+ vdev_cache_stat_fini();
+ vdev_mirror_stat_fini();
+ vdev_raidz_math_fini();
+ zil_fini();
+ dmu_fini();
+ zio_fini();
+ ddt_fini();
+ metaslab_stat_fini();
+ zfs_btree_fini();
+ unique_fini();
+ zfs_refcount_fini();
+ fm_fini();
+ scan_fini();
+ qat_fini();
+ spa_import_progress_destroy();
+
+ avl_destroy(&spa_namespace_avl);
+ avl_destroy(&spa_spare_avl);
+ avl_destroy(&spa_l2cache_avl);
+
+ cv_destroy(&spa_namespace_cv);
+ mutex_destroy(&spa_namespace_lock);
+ mutex_destroy(&spa_spare_lock);
+ mutex_destroy(&spa_l2cache_lock);
+}
+
+/*
+ * Return whether this pool has a dedicated slog device. No locking needed.
+ * It's not a problem if the wrong answer is returned as it's only for
+ * performance and not correctness.
+ */
+boolean_t
+spa_has_slogs(spa_t *spa)
+{
+ return (spa->spa_log_class->mc_groups != 0);
+}
+
+spa_log_state_t
+spa_get_log_state(spa_t *spa)
+{
+ return (spa->spa_log_state);
+}
+
+void
+spa_set_log_state(spa_t *spa, spa_log_state_t state)
+{
+ spa->spa_log_state = state;
+}
+
+boolean_t
+spa_is_root(spa_t *spa)
+{
+ return (spa->spa_is_root);
+}
+
+boolean_t
+spa_writeable(spa_t *spa)
+{
+ return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config);
+}
+
+/*
+ * Returns true if there is a pending sync task in any of the current
+ * syncing txg, the current quiescing txg, or the current open txg.
+ */
+boolean_t
+spa_has_pending_synctask(spa_t *spa)
+{
+ return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) ||
+ !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks));
+}
+
+spa_mode_t
+spa_mode(spa_t *spa)
+{
+ return (spa->spa_mode);
+}
+
+uint64_t
+spa_bootfs(spa_t *spa)
+{
+ return (spa->spa_bootfs);
+}
+
+uint64_t
+spa_delegation(spa_t *spa)
+{
+ return (spa->spa_delegation);
+}
+
+objset_t *
+spa_meta_objset(spa_t *spa)
+{
+ return (spa->spa_meta_objset);
+}
+
+enum zio_checksum
+spa_dedup_checksum(spa_t *spa)
+{
+ return (spa->spa_dedup_checksum);
+}
+
+/*
+ * Reset pool scan stats per scan pass (or reboot).
+ */
+void
+spa_scan_stat_init(spa_t *spa)
+{
+ /* data not stored on disk */
+ spa->spa_scan_pass_start = gethrestime_sec();
+ if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan))
+ spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
+ else
+ spa->spa_scan_pass_scrub_pause = 0;
+ spa->spa_scan_pass_scrub_spent_paused = 0;
+ spa->spa_scan_pass_exam = 0;
+ spa->spa_scan_pass_issued = 0;
+ vdev_scan_stat_init(spa->spa_root_vdev);
+}
+
+/*
+ * Get scan stats for zpool status reports
+ */
+int
+spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
+{
+ dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
+
+ if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+ return (SET_ERROR(ENOENT));
+ bzero(ps, sizeof (pool_scan_stat_t));
+
+ /* data stored on disk */
+ ps->pss_func = scn->scn_phys.scn_func;
+ ps->pss_state = scn->scn_phys.scn_state;
+ ps->pss_start_time = scn->scn_phys.scn_start_time;
+ ps->pss_end_time = scn->scn_phys.scn_end_time;
+ ps->pss_to_examine = scn->scn_phys.scn_to_examine;
+ ps->pss_examined = scn->scn_phys.scn_examined;
+ ps->pss_to_process = scn->scn_phys.scn_to_process;
+ ps->pss_processed = scn->scn_phys.scn_processed;
+ ps->pss_errors = scn->scn_phys.scn_errors;
+
+ /* data not stored on disk */
+ ps->pss_pass_exam = spa->spa_scan_pass_exam;
+ ps->pss_pass_start = spa->spa_scan_pass_start;
+ ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
+ ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
+ ps->pss_pass_issued = spa->spa_scan_pass_issued;
+ ps->pss_issued =
+ scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
+
+ return (0);
+}
+
+int
+spa_maxblocksize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SPA_MAXBLOCKSIZE);
+ else
+ return (SPA_OLD_MAXBLOCKSIZE);
+}
+
+
+/*
+ * Returns the txg in which the last device removal completed. No indirect
+ * mappings
+ * have been added since this txg.
+ */
+uint64_t
+spa_get_last_removal_txg(spa_t *spa)
+{
+ uint64_t vdevid;
+ uint64_t ret = -1ULL;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ /*
+ * sr_prev_indirect_vdev is only modified while holding all the
+ * config locks, so it is sufficient to hold SCL_VDEV as reader when
+ * examining it.
+ */
+ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev;
+
+ while (vdevid != -1ULL) {
+ vdev_t *vd = vdev_lookup_top(spa, vdevid);
+ vdev_indirect_births_t *vib = vd->vdev_indirect_births;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ /*
+ * If the removal did not remap any data, we don't care.
+ */
+ if (vdev_indirect_births_count(vib) != 0) {
+ ret = vdev_indirect_births_last_entry_txg(vib);
+ break;
+ }
+
+ vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev;
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ IMPLY(ret != -1ULL,
+ spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ return (ret);
+}
+
+int
+spa_maxdnodesize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
+ return (DNODE_MAX_SIZE);
+ else
+ return (DNODE_MIN_SIZE);
+}
+
+boolean_t
+spa_multihost(spa_t *spa)
+{
+ return (spa->spa_multihost ? B_TRUE : B_FALSE);
+}
+
+uint32_t
+spa_get_hostid(spa_t *spa)
+{
+ return (spa->spa_hostid);
+}
+
+boolean_t
+spa_trust_config(spa_t *spa)
+{
+ return (spa->spa_trust_config);
+}
+
+uint64_t
+spa_missing_tvds_allowed(spa_t *spa)
+{
+ return (spa->spa_missing_tvds_allowed);
+}
+
+space_map_t *
+spa_syncing_log_sm(spa_t *spa)
+{
+ return (spa->spa_syncing_log_sm);
+}
+
+void
+spa_set_missing_tvds(spa_t *spa, uint64_t missing)
+{
+ spa->spa_missing_tvds = missing;
+}
+
+/*
+ * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc.).
+ */
+const char *
+spa_state_to_name(spa_t *spa)
+{
+ ASSERT3P(spa, !=, NULL);
+
+ /*
+	 * It is possible for the spa to exist without a root vdev
+	 * while it transitions during import/export.
+ */
+ vdev_t *rvd = spa->spa_root_vdev;
+ if (rvd == NULL) {
+ return ("TRANSITIONING");
+ }
+ vdev_state_t state = rvd->vdev_state;
+ vdev_aux_t aux = rvd->vdev_stat.vs_aux;
+
+ if (spa_suspended(spa) &&
+ (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE))
+ return ("SUSPENDED");
+
+ switch (state) {
+ case VDEV_STATE_CLOSED:
+ case VDEV_STATE_OFFLINE:
+ return ("OFFLINE");
+ case VDEV_STATE_REMOVED:
+ return ("REMOVED");
+ case VDEV_STATE_CANT_OPEN:
+ if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
+ return ("FAULTED");
+ else if (aux == VDEV_AUX_SPLIT_POOL)
+ return ("SPLIT");
+ else
+ return ("UNAVAIL");
+ case VDEV_STATE_FAULTED:
+ return ("FAULTED");
+ case VDEV_STATE_DEGRADED:
+ return ("DEGRADED");
+ case VDEV_STATE_HEALTHY:
+ return ("ONLINE");
+ default:
+ break;
+ }
+
+ return ("UNKNOWN");
+}
+
+boolean_t
+spa_top_vdevs_spacemap_addressable(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ if (!vdev_is_spacemap_addressable(rvd->vdev_child[c]))
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+boolean_t
+spa_has_checkpoint(spa_t *spa)
+{
+ return (spa->spa_checkpoint_txg != 0);
+}
+
+boolean_t
+spa_importing_readonly_checkpoint(spa_t *spa)
+{
+ return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
+ spa->spa_mode == SPA_MODE_READ);
+}
+
+uint64_t
+spa_min_claim_txg(spa_t *spa)
+{
+ uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;
+
+ if (checkpoint_txg != 0)
+ return (checkpoint_txg + 1);
+
+ return (spa->spa_first_txg);
+}
+
+/*
+ * If there is a checkpoint, async destroys may consume more space from
+ * the pool instead of freeing it. In an attempt to save the pool from
+ * getting suspended when it is about to run out of space, we stop
+ * processing async destroys.
+ */
+boolean_t
+spa_suspend_async_destroy(spa_t *spa)
+{
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ uint64_t unreserved = dsl_pool_unreserved_space(dp,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+ uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;
+
+ if (spa_has_checkpoint(spa) && avail == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+#if defined(_KERNEL)
+
+int
+param_set_deadman_failmode_common(const char *val)
+{
+ spa_t *spa = NULL;
+ char *p;
+
+ if (val == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if ((p = strchr(val, '\n')) != NULL)
+ *p = '\0';
+
+ if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 &&
+	    strcmp(val, "panic") != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (spa_mode_global != SPA_MODE_UNINIT) {
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL)
+ spa_set_deadman_failmode(spa, val);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (0);
+}
+#endif
+
+/* Namespace manipulation */
+EXPORT_SYMBOL(spa_lookup);
+EXPORT_SYMBOL(spa_add);
+EXPORT_SYMBOL(spa_remove);
+EXPORT_SYMBOL(spa_next);
+
+/* Refcount functions */
+EXPORT_SYMBOL(spa_open_ref);
+EXPORT_SYMBOL(spa_close);
+EXPORT_SYMBOL(spa_refcount_zero);
+
+/* Pool configuration lock */
+EXPORT_SYMBOL(spa_config_tryenter);
+EXPORT_SYMBOL(spa_config_enter);
+EXPORT_SYMBOL(spa_config_exit);
+EXPORT_SYMBOL(spa_config_held);
+
+/* Pool vdev add/remove lock */
+EXPORT_SYMBOL(spa_vdev_enter);
+EXPORT_SYMBOL(spa_vdev_exit);
+
+/* Pool vdev state change lock */
+EXPORT_SYMBOL(spa_vdev_state_enter);
+EXPORT_SYMBOL(spa_vdev_state_exit);
+
+/* Accessor functions */
+EXPORT_SYMBOL(spa_shutting_down);
+EXPORT_SYMBOL(spa_get_dsl);
+EXPORT_SYMBOL(spa_get_rootblkptr);
+EXPORT_SYMBOL(spa_set_rootblkptr);
+EXPORT_SYMBOL(spa_altroot);
+EXPORT_SYMBOL(spa_sync_pass);
+EXPORT_SYMBOL(spa_name);
+EXPORT_SYMBOL(spa_guid);
+EXPORT_SYMBOL(spa_last_synced_txg);
+EXPORT_SYMBOL(spa_first_txg);
+EXPORT_SYMBOL(spa_syncing_txg);
+EXPORT_SYMBOL(spa_version);
+EXPORT_SYMBOL(spa_state);
+EXPORT_SYMBOL(spa_load_state);
+EXPORT_SYMBOL(spa_freeze_txg);
+EXPORT_SYMBOL(spa_get_dspace);
+EXPORT_SYMBOL(spa_update_dspace);
+EXPORT_SYMBOL(spa_deflate);
+EXPORT_SYMBOL(spa_normal_class);
+EXPORT_SYMBOL(spa_log_class);
+EXPORT_SYMBOL(spa_special_class);
+EXPORT_SYMBOL(spa_preferred_class);
+EXPORT_SYMBOL(spa_max_replication);
+EXPORT_SYMBOL(spa_prev_software_version);
+EXPORT_SYMBOL(spa_get_failmode);
+EXPORT_SYMBOL(spa_suspended);
+EXPORT_SYMBOL(spa_bootfs);
+EXPORT_SYMBOL(spa_delegation);
+EXPORT_SYMBOL(spa_meta_objset);
+EXPORT_SYMBOL(spa_maxblocksize);
+EXPORT_SYMBOL(spa_maxdnodesize);
+
+/* Miscellaneous support routines */
+EXPORT_SYMBOL(spa_guid_exists);
+EXPORT_SYMBOL(spa_strdup);
+EXPORT_SYMBOL(spa_strfree);
+EXPORT_SYMBOL(spa_get_random);
+EXPORT_SYMBOL(spa_generate_guid);
+EXPORT_SYMBOL(snprintf_blkptr);
+EXPORT_SYMBOL(spa_freeze);
+EXPORT_SYMBOL(spa_upgrade);
+EXPORT_SYMBOL(spa_evict_all);
+EXPORT_SYMBOL(spa_lookup_by_guid);
+EXPORT_SYMBOL(spa_has_spare);
+EXPORT_SYMBOL(dva_get_dsize_sync);
+EXPORT_SYMBOL(bp_get_dsize_sync);
+EXPORT_SYMBOL(bp_get_dsize);
+EXPORT_SYMBOL(spa_has_slogs);
+EXPORT_SYMBOL(spa_is_root);
+EXPORT_SYMBOL(spa_writeable);
+EXPORT_SYMBOL(spa_mode);
+EXPORT_SYMBOL(spa_namespace_lock);
+EXPORT_SYMBOL(spa_trust_config);
+EXPORT_SYMBOL(spa_missing_tvds_allowed);
+EXPORT_SYMBOL(spa_set_missing_tvds);
+EXPORT_SYMBOL(spa_state_to_name);
+EXPORT_SYMBOL(spa_importing_readonly_checkpoint);
+EXPORT_SYMBOL(spa_min_claim_txg);
+EXPORT_SYMBOL(spa_suspend_async_destroy);
+EXPORT_SYMBOL(spa_has_checkpoint);
+EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable);
+
+ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW,
+ "Set additional debugging flags");
+
+ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW,
+ "Set to attempt to recover from fatal errors");
+
+ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW,
+ "Set to ignore IO errors during free and permanently leak the space");
+
+ZFS_MODULE_PARAM(zfs, zfs_, deadman_checktime_ms, ULONG, ZMOD_RW,
+ "Dead I/O check interval in milliseconds");
+
+ZFS_MODULE_PARAM(zfs, zfs_, deadman_enabled, INT, ZMOD_RW,
+ "Enable deadman timer");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, INT, ZMOD_RW,
+ "SPA size estimate multiplication factor");
+
+ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW,
+ "Place DDT data into the special class");
+
+ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW,
+ "Place user data indirect blocks into the special class");
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode,
+ param_set_deadman_failmode, param_get_charp, ZMOD_RW,
+ "Failmode for deadman timer");
+
+ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms,
+ param_set_deadman_synctime, param_get_ulong, ZMOD_RW,
+ "Pool sync expiration time in milliseconds");
+
+ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms,
+ param_set_deadman_ziotime, param_get_ulong, ZMOD_RW,
+ "IO expiration time in milliseconds");
+
+ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, INT, ZMOD_RW,
+ "Small file blocks in special vdevs depends on this much "
+ "free space available");
+/* END CSTYLED */
+
+ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
+ param_get_int, ZMOD_RW, "Reserved free space in pool");
diff --git a/sys/contrib/openzfs/module/zfs/spa_stats.c b/sys/contrib/openzfs/module/zfs/spa_stats.c
new file mode 100644
index 000000000000..c3eacc14239e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_stats.c
@@ -0,0 +1,1029 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/spa.h>
+#include <zfs_comutil.h>
+
+/*
+ * Keeps stats on last N reads per spa_t, disabled by default.
+ */
+int zfs_read_history = 0;
+
+/*
+ * Include cache hits in history, disabled by default.
+ */
+int zfs_read_history_hits = 0;
+
+/*
+ * Keeps stats on the last 100 txgs by default.
+ */
+int zfs_txg_history = 100;
+
+/*
+ * Keeps stats on the last N MMP updates, disabled by default.
+ */
+int zfs_multihost_history = 0;
+
+/*
+ * ==========================================================================
+ * SPA Read History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Read statistics - Information exported regarding each arc_read call
+ */
+typedef struct spa_read_history {
+ hrtime_t start; /* time read completed */
+ uint64_t objset; /* read from this objset */
+ uint64_t object; /* read of this object number */
+ uint64_t level; /* block's indirection level */
+ uint64_t blkid; /* read of this block id */
+ char origin[24]; /* read originated from here */
+ uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */
+ pid_t pid; /* PID of task doing read */
+ char comm[16]; /* process name of task doing read */
+ procfs_list_node_t srh_node;
+} spa_read_history_t;
+
+static int
+spa_read_history_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
+ "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
+ "level", "blkid", "aflags", "origin", "pid", "process");
+
+ return (0);
+}
+
+static int
+spa_read_history_show(struct seq_file *f, void *data)
+{
+ spa_read_history_t *srh = (spa_read_history_t *)data;
+
+ seq_printf(f, "%-8llu %-16llu 0x%-6llx "
+ "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
+ (u_longlong_t)srh->srh_node.pln_id, srh->start,
+ (longlong_t)srh->objset, (longlong_t)srh->object,
+ (longlong_t)srh->level, (longlong_t)srh->blkid,
+ srh->aflags, srh->origin, srh->pid, srh->comm);
+
+ return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_read_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+ spa_read_history_t *srh;
+ while (shl->size > size) {
+ srh = list_remove_head(&shl->procfs_list.pl_list);
+ ASSERT3P(srh, !=, NULL);
+ kmem_free(srh, sizeof (spa_read_history_t));
+ shl->size--;
+ }
+
+ if (size == 0)
+ ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+}
+
+static int
+spa_read_history_clear(procfs_list_t *procfs_list)
+{
+ spa_history_list_t *shl = procfs_list->pl_private;
+ mutex_enter(&procfs_list->pl_lock);
+ spa_read_history_truncate(shl, 0);
+ mutex_exit(&procfs_list->pl_lock);
+ return (0);
+}
+
+static void
+spa_read_history_init(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.read_history;
+
+ shl->size = 0;
+ shl->procfs_list.pl_private = shl;
+ procfs_list_install("zfs",
+ spa_name(spa),
+ "reads",
+ 0600,
+ &shl->procfs_list,
+ spa_read_history_show,
+ spa_read_history_show_header,
+ spa_read_history_clear,
+ offsetof(spa_read_history_t, srh_node));
+}
+
+static void
+spa_read_history_destroy(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.read_history;
+ procfs_list_uninstall(&shl->procfs_list);
+ spa_read_history_truncate(shl, 0);
+ procfs_list_destroy(&shl->procfs_list);
+}
+
+void
+spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
+{
+ spa_history_list_t *shl = &spa->spa_stats.read_history;
+ spa_read_history_t *srh;
+
+ ASSERT3P(spa, !=, NULL);
+ ASSERT3P(zb, !=, NULL);
+
+ if (zfs_read_history == 0 && shl->size == 0)
+ return;
+
+ if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
+ return;
+
+ srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
+ strlcpy(srh->comm, getcomm(), sizeof (srh->comm));
+ srh->start = gethrtime();
+ srh->objset = zb->zb_objset;
+ srh->object = zb->zb_object;
+ srh->level = zb->zb_level;
+ srh->blkid = zb->zb_blkid;
+ srh->aflags = aflags;
+ srh->pid = getpid();
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+
+ procfs_list_add(&shl->procfs_list, srh);
+ shl->size++;
+
+ spa_read_history_truncate(shl, zfs_read_history);
+
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
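+
+/*
+ * Usage note (the exact proc path is platform dependent): setting the
+ * zfs_read_history module parameter to a non-zero value enables
+ * collection, zfs_read_history_hits additionally records ARC hits, and
+ * the history is exposed through the procfs_list installed above
+ * (e.g. /proc/spl/kstat/zfs/<pool>/reads on Linux).  Writing to that
+ * file clears the list via spa_read_history_clear().
+ */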
+
+/*
+ * ==========================================================================
+ * SPA TXG History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Txg statistics - Information exported regarding each txg sync
+ */
+
+typedef struct spa_txg_history {
+ uint64_t txg; /* txg id */
+ txg_state_t state; /* active txg state */
+ uint64_t nread; /* number of bytes read */
+ uint64_t nwritten; /* number of bytes written */
+ uint64_t reads; /* number of read operations */
+ uint64_t writes; /* number of write operations */
+ uint64_t ndirty; /* number of dirty bytes */
+ hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */
+ procfs_list_node_t sth_node;
+} spa_txg_history_t;
+
+static int
+spa_txg_history_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s "
+ "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
+ "ndirty", "nread", "nwritten", "reads", "writes",
+ "otime", "qtime", "wtime", "stime");
+ return (0);
+}
+
+static int
+spa_txg_history_show(struct seq_file *f, void *data)
+{
+ spa_txg_history_t *sth = (spa_txg_history_t *)data;
+ uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
+ char state;
+
+ switch (sth->state) {
+ case TXG_STATE_BIRTH: state = 'B'; break;
+ case TXG_STATE_OPEN: state = 'O'; break;
+ case TXG_STATE_QUIESCED: state = 'Q'; break;
+ case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break;
+ case TXG_STATE_SYNCED: state = 'S'; break;
+ case TXG_STATE_COMMITTED: state = 'C'; break;
+ default: state = '?'; break;
+ }
+
+ if (sth->times[TXG_STATE_OPEN])
+ open = sth->times[TXG_STATE_OPEN] -
+ sth->times[TXG_STATE_BIRTH];
+
+ if (sth->times[TXG_STATE_QUIESCED])
+ quiesce = sth->times[TXG_STATE_QUIESCED] -
+ sth->times[TXG_STATE_OPEN];
+
+ if (sth->times[TXG_STATE_WAIT_FOR_SYNC])
+ wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] -
+ sth->times[TXG_STATE_QUIESCED];
+
+ if (sth->times[TXG_STATE_SYNCED])
+ sync = sth->times[TXG_STATE_SYNCED] -
+ sth->times[TXG_STATE_WAIT_FOR_SYNC];
+
+ seq_printf(f, "%-8llu %-16llu %-5c %-12llu "
+ "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
+ (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
+ (u_longlong_t)sth->ndirty,
+ (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
+ (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
+ (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait,
+ (u_longlong_t)sync);
+
+ return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+ spa_txg_history_t *sth;
+ while (shl->size > size) {
+ sth = list_remove_head(&shl->procfs_list.pl_list);
+ ASSERT3P(sth, !=, NULL);
+ kmem_free(sth, sizeof (spa_txg_history_t));
+ shl->size--;
+ }
+
+ if (size == 0)
+ ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+
+}
+
+static int
+spa_txg_history_clear(procfs_list_t *procfs_list)
+{
+ spa_history_list_t *shl = procfs_list->pl_private;
+ mutex_enter(&procfs_list->pl_lock);
+ spa_txg_history_truncate(shl, 0);
+ mutex_exit(&procfs_list->pl_lock);
+ return (0);
+}
+
+static void
+spa_txg_history_init(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+
+ shl->size = 0;
+ shl->procfs_list.pl_private = shl;
+ procfs_list_install("zfs",
+ spa_name(spa),
+ "txgs",
+ 0644,
+ &shl->procfs_list,
+ spa_txg_history_show,
+ spa_txg_history_show_header,
+ spa_txg_history_clear,
+ offsetof(spa_txg_history_t, sth_node));
+}
+
+static void
+spa_txg_history_destroy(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ procfs_list_uninstall(&shl->procfs_list);
+ spa_txg_history_truncate(shl, 0);
+ procfs_list_destroy(&shl->procfs_list);
+}
+
+/*
+ * Add a new txg to the historical record.
+ */
+void
+spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+
+ if (zfs_txg_history == 0 && shl->size == 0)
+ return;
+
+ sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
+ sth->txg = txg;
+ sth->state = TXG_STATE_OPEN;
+ sth->times[TXG_STATE_BIRTH] = birth_time;
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ procfs_list_add(&shl->procfs_list, sth);
+ shl->size++;
+ spa_txg_history_truncate(shl, zfs_txg_history);
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+/*
+ * Set txg state completion time and increment current state.
+ */
+int
+spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
+ hrtime_t completed_time)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+ int error = ENOENT;
+
+ if (zfs_txg_history == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
+ sth = list_prev(&shl->procfs_list.pl_list, sth)) {
+ if (sth->txg == txg) {
+ sth->times[completed_state] = completed_time;
+ sth->state++;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+/*
+ * Set txg IO stats.
+ */
+static int
+spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
+ uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+ int error = ENOENT;
+
+ if (zfs_txg_history == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
+ sth = list_prev(&shl->procfs_list.pl_list, sth)) {
+ if (sth->txg == txg) {
+ sth->nread = nread;
+ sth->nwritten = nwritten;
+ sth->reads = reads;
+ sth->writes = writes;
+ sth->ndirty = ndirty;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+txg_stat_t *
+spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
+{
+ txg_stat_t *ts;
+
+ if (zfs_txg_history == 0)
+ return (NULL);
+
+ ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_get_stats(spa->spa_root_vdev, &ts->vs1);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ ts->txg = txg;
+ ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
+
+ spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());
+
+ return (ts);
+}
+
+void
+spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
+{
+ if (ts == NULL)
+ return;
+
+ if (zfs_txg_history == 0) {
+ kmem_free(ts, sizeof (txg_stat_t));
+ return;
+ }
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
+ spa_txg_history_set_io(spa, ts->txg,
+ ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ],
+ ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE],
+ ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ],
+ ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE],
+ ts->ndirty);
+
+ kmem_free(ts, sizeof (txg_stat_t));
+}
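+/*
+ * Illustrative sketch (not part of the upstream import): the two helpers
+ * above are meant to bracket a txg sync so that the recorded read/write
+ * deltas cover exactly that sync. A caller on the sync path would do
+ * roughly the following; spa_sync() is shown here only as the assumed
+ * sync step.
+ */
+#if 0 /* example only */
+static void
+spa_sync_with_txg_stats(spa_t *spa, dsl_pool_t *dp, uint64_t txg)
+{
+ /* Snapshot vdev stats and record TXG_STATE_WAIT_FOR_SYNC. */
+ txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp);
+
+ spa_sync(spa, txg);
+
+ /* Record TXG_STATE_SYNCED and the read/write deltas for this txg. */
+ spa_txg_history_fini_io(spa, ts);
+}
+#endif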
+
+/*
+ * ==========================================================================
+ * SPA TX Assign Histogram Routines
+ * ==========================================================================
+ */
+
+/*
+ * Tx statistics - Information exported regarding dmu_tx_assign time.
+ */
+
+/*
+ * When the kstat is written, zero all buckets. When the kstat is read,
+ * count the number of trailing buckets set to zero and update ks_ndata
+ * so that they are not output.
+ */
+static int
+spa_tx_assign_update(kstat_t *ksp, int rw)
+{
+ spa_t *spa = ksp->ks_private;
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ int i;
+
+ if (rw == KSTAT_WRITE) {
+ for (i = 0; i < shk->count; i++)
+ ((kstat_named_t *)shk->priv)[i].value.ui64 = 0;
+ }
+
+ for (i = shk->count; i > 0; i--)
+ if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0)
+ break;
+
+ ksp->ks_ndata = i;
+ ksp->ks_data_size = i * sizeof (kstat_named_t);
+
+ return (0);
+}
+
+static void
+spa_tx_assign_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ char *name;
+ kstat_named_t *ks;
+ kstat_t *ksp;
+ int i;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ shk->count = 42; /* power of two buckets for 1ns to 2,199s */
+ shk->size = shk->count * sizeof (kstat_named_t);
+ shk->priv = kmem_alloc(shk->size, KM_SLEEP);
+
+ name = kmem_asprintf("zfs/%s", spa_name(spa));
+
+ for (i = 0; i < shk->count; i++) {
+ ks = &((kstat_named_t *)shk->priv)[i];
+ ks->data_type = KSTAT_DATA_UINT64;
+ ks->value.ui64 = 0;
+ (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
+ (u_longlong_t)1 << i);
+ }
+
+ ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
+ KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
+ shk->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_data = shk->priv;
+ ksp->ks_ndata = shk->count;
+ ksp->ks_data_size = shk->size;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_tx_assign_update;
+ kstat_install(ksp);
+ }
+ kmem_strfree(name);
+}
+
+static void
+spa_tx_assign_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ kstat_t *ksp;
+
+ ksp = shk->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ kmem_free(shk->priv, shk->size);
+ mutex_destroy(&shk->lock);
+}
+
+void
+spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ uint64_t idx = 0;
+
+ while (((1ULL << idx) < nsecs) && (idx < shk->size - 1))
+ idx++;
+
+ atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64);
+}
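+/*
+ * Example (illustrative, not part of the upstream import): the loop above
+ * advances idx until 2^idx >= nsecs, so a call such as
+ * spa_tx_assign_add_nsecs(spa, 1500) increments bucket index 11, i.e. the
+ * "2048 ns" bucket. Each bucket therefore counts assigns whose latency is
+ * at most 2^idx ns (and greater than 2^(idx-1) ns).
+ */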
+
+/*
+ * ==========================================================================
+ * SPA IO History Routines
+ * ==========================================================================
+ */
+static int
+spa_io_history_update(kstat_t *ksp, int rw)
+{
+ if (rw == KSTAT_WRITE)
+ memset(ksp->ks_data, 0, ksp->ks_data_size);
+
+ return (0);
+}
+
+static void
+spa_io_history_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+ char *name;
+ kstat_t *ksp;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ name = kmem_asprintf("zfs/%s", spa_name(spa));
+
+ ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
+ shk->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_io_history_update;
+ kstat_install(ksp);
+ }
+ kmem_strfree(name);
+}
+
+static void
+spa_io_history_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+ if (shk->kstat)
+ kstat_delete(shk->kstat);
+
+ mutex_destroy(&shk->lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA MMP History Routines
+ * ==========================================================================
+ */
+
+/*
+ * MMP statistics - Information exported regarding attempted MMP writes.
+ * For MMP writes issued, the fields are used as described in the
+ * comments below. For MMP writes skipped, an entry represents a span of
+ * time during which writes were skipped for the same reason (error from
+ * mmp_random_leaf). The differences are:
+ * timestamp time first write skipped, if >1 skipped in a row
+ * mmp_delay delay value at timestamp
+ * vdev_guid number of writes skipped
+ * io_error one of enum mmp_error
+ * duration time span (ns) of skipped writes
+ */
+
+typedef struct spa_mmp_history {
+ uint64_t mmp_node_id; /* unique # for updates */
+ uint64_t txg; /* txg of last sync */
+ uint64_t timestamp; /* UTC time MMP write issued */
+ uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */
+ uint64_t vdev_guid; /* unique ID of leaf vdev */
+ char *vdev_path;
+ int vdev_label; /* vdev label */
+ int io_error; /* error status of MMP write */
+ hrtime_t error_start; /* hrtime of start of error period */
+ hrtime_t duration; /* time from submission to completion */
+ procfs_list_node_t smh_node;
+} spa_mmp_history_t;
+
+static int
+spa_mmp_history_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
+ "%-10s %s\n", "id", "txg", "timestamp", "error", "duration",
+ "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");
+ return (0);
+}
+
+static int
+spa_mmp_history_show(struct seq_file *f, void *data)
+{
+ spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
+ char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
+ "%-10lld %s\n";
+ char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
+ "%-10lld %s\n";
+
+ seq_printf(f, (smh->error_start ? skip_fmt : write_fmt),
+ (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg,
+ (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
+ (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
+ (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label,
+ (smh->vdev_path ? smh->vdev_path : "-"));
+
+ return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+ spa_mmp_history_t *smh;
+ while (shl->size > size) {
+ smh = list_remove_head(&shl->procfs_list.pl_list);
+ if (smh->vdev_path)
+ kmem_strfree(smh->vdev_path);
+ kmem_free(smh, sizeof (spa_mmp_history_t));
+ shl->size--;
+ }
+
+ if (size == 0)
+ ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+
+}
+
+static int
+spa_mmp_history_clear(procfs_list_t *procfs_list)
+{
+ spa_history_list_t *shl = procfs_list->pl_private;
+ mutex_enter(&procfs_list->pl_lock);
+ spa_mmp_history_truncate(shl, 0);
+ mutex_exit(&procfs_list->pl_lock);
+ return (0);
+}
+
+static void
+spa_mmp_history_init(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+
+ shl->size = 0;
+
+ shl->procfs_list.pl_private = shl;
+ procfs_list_install("zfs",
+ spa_name(spa),
+ "multihost",
+ 0644,
+ &shl->procfs_list,
+ spa_mmp_history_show,
+ spa_mmp_history_show_header,
+ spa_mmp_history_clear,
+ offsetof(spa_mmp_history_t, smh_node));
+}
+
+static void
+spa_mmp_history_destroy(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ procfs_list_uninstall(&shl->procfs_list);
+ spa_mmp_history_truncate(shl, 0);
+ procfs_list_destroy(&shl->procfs_list);
+}
+
+/*
+ * Set duration in existing "skip" record to how long we have waited for a leaf
+ * vdev to become available.
+ *
+ * It is important that we start the search at the tail of the list, where
+ * new records are inserted, so this is normally an O(1) operation.
+ */
+int
+spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ spa_mmp_history_t *smh;
+ int error = ENOENT;
+
+ if (zfs_multihost_history == 0 && shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+ smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+ if (smh->mmp_node_id == mmp_node_id) {
+ ASSERT3U(smh->io_error, !=, 0);
+ smh->duration = gethrtime() - smh->error_start;
+ smh->vdev_guid++;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+/*
+ * Set MMP write duration and error status in existing record.
+ * See comment re: search order above spa_mmp_history_set_skip().
+ */
+int
+spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
+ hrtime_t duration)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ spa_mmp_history_t *smh;
+ int error = ENOENT;
+
+ if (zfs_multihost_history == 0 && shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+ smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+ if (smh->mmp_node_id == mmp_node_id) {
+ ASSERT(smh->io_error == 0);
+ smh->io_error = io_error;
+ smh->duration = duration;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+/*
+ * Add a new MMP historical record.
+ * error == 0 : a write was issued.
+ * error != 0 : a write was not issued because no leaves were found.
+ */
+void
+spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
+ uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
+ int error)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ spa_mmp_history_t *smh;
+
+ if (zfs_multihost_history == 0 && shl->size == 0)
+ return;
+
+ smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
+ smh->txg = txg;
+ smh->timestamp = timestamp;
+ smh->mmp_delay = mmp_delay;
+ if (vd) {
+ smh->vdev_guid = vd->vdev_guid;
+ if (vd->vdev_path)
+ smh->vdev_path = kmem_strdup(vd->vdev_path);
+ }
+ smh->vdev_label = label;
+ smh->mmp_node_id = mmp_node_id;
+
+ if (error) {
+ smh->io_error = error;
+ smh->error_start = gethrtime();
+ smh->vdev_guid = 1;
+ }
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ procfs_list_add(&shl->procfs_list, smh);
+ shl->size++;
+ spa_mmp_history_truncate(shl, zfs_multihost_history);
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+static void *
+spa_state_addr(kstat_t *ksp, loff_t n)
+{
+ if (n == 0)
+ return (ksp->ks_private); /* return the spa_t */
+ return (NULL);
+}
+
+static int
+spa_state_data(char *buf, size_t size, void *data)
+{
+ spa_t *spa = (spa_t *)data;
+ (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
+ return (0);
+}
+
+/*
+ * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
+ *
+ * This is a lock-less read of the pool's state (unlike using 'zpool', which
+ * can potentially block for seconds). Because it doesn't block, it can be
+ * useful as a pool heartbeat value.
+ */
+static void
+spa_state_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.state;
+ char *name;
+ kstat_t *ksp;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ name = kmem_asprintf("zfs/%s", spa_name(spa));
+ ksp = kstat_create(name, 0, "state", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+ shk->kstat = ksp;
+ if (ksp) {
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_data = NULL;
+ ksp->ks_private = spa;
+ ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
+ kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
+ kstat_install(ksp);
+ }
+
+ kmem_strfree(name);
+}
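+/*
+ * Illustrative sketch (not part of the upstream import): a userland
+ * heartbeat check could simply read the file exported above, e.g.
+ * /proc/spl/kstat/zfs/<pool>/state on Linux. "tank" below is a
+ * hypothetical pool name.
+ */
+#if 0 /* example only, userland code */
+#include <stdio.h>
+#include <string.h>
+
+static int
+pool_is_online(void)
+{
+ char state[32] = { 0 };
+ FILE *fp = fopen("/proc/spl/kstat/zfs/tank/state", "r");
+
+ if (fp == NULL)
+ return (0);
+ if (fgets(state, sizeof (state), fp) == NULL)
+ state[0] = '\0';
+ fclose(fp);
+
+ return (strncmp(state, "ONLINE", 6) == 0);
+}
+#endif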
+
+static void
+spa_health_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.state;
+ kstat_t *ksp = shk->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_destroy(&shk->lock);
+}
+
+static spa_iostats_t spa_iostats_template = {
+ { "trim_extents_written", KSTAT_DATA_UINT64 },
+ { "trim_bytes_written", KSTAT_DATA_UINT64 },
+ { "trim_extents_skipped", KSTAT_DATA_UINT64 },
+ { "trim_bytes_skipped", KSTAT_DATA_UINT64 },
+ { "trim_extents_failed", KSTAT_DATA_UINT64 },
+ { "trim_bytes_failed", KSTAT_DATA_UINT64 },
+ { "autotrim_extents_written", KSTAT_DATA_UINT64 },
+ { "autotrim_bytes_written", KSTAT_DATA_UINT64 },
+ { "autotrim_extents_skipped", KSTAT_DATA_UINT64 },
+ { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 },
+ { "autotrim_extents_failed", KSTAT_DATA_UINT64 },
+ { "autotrim_bytes_failed", KSTAT_DATA_UINT64 },
+ { "simple_trim_extents_written", KSTAT_DATA_UINT64 },
+ { "simple_trim_bytes_written", KSTAT_DATA_UINT64 },
+ { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 },
+ { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 },
+ { "simple_trim_extents_failed", KSTAT_DATA_UINT64 },
+ { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 },
+};
+
+#define SPA_IOSTATS_ADD(stat, val) \
+ atomic_add_64(&iostats->stat.value.ui64, (val));
+
+void
+spa_iostats_trim_add(spa_t *spa, trim_type_t type,
+ uint64_t extents_written, uint64_t bytes_written,
+ uint64_t extents_skipped, uint64_t bytes_skipped,
+ uint64_t extents_failed, uint64_t bytes_failed)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+ spa_iostats_t *iostats;
+
+ if (ksp == NULL)
+ return;
+
+ iostats = ksp->ks_data;
+ if (type == TRIM_TYPE_MANUAL) {
+ SPA_IOSTATS_ADD(trim_extents_written, extents_written);
+ SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
+ SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
+ SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
+ SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
+ SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
+ } else if (type == TRIM_TYPE_AUTO) {
+ SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
+ SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
+ SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
+ SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
+ SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
+ SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
+ } else {
+ SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written);
+ SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written);
+ SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped);
+ SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped);
+ SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed);
+ SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed);
+ }
+}
+
+static int
+spa_iostats_update(kstat_t *ksp, int rw)
+{
+ if (rw == KSTAT_WRITE) {
+ memcpy(ksp->ks_data, &spa_iostats_template,
+ sizeof (spa_iostats_t));
+ }
+
+ return (0);
+}
+
+static void
+spa_iostats_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ char *name = kmem_asprintf("zfs/%s", spa_name(spa));
+ kstat_t *ksp = kstat_create(name, 0, "iostats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ shk->kstat = ksp;
+ if (ksp) {
+ int size = sizeof (spa_iostats_t);
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_iostats_update;
+ ksp->ks_data = kmem_alloc(size, KM_SLEEP);
+ memcpy(ksp->ks_data, &spa_iostats_template, size);
+ kstat_install(ksp);
+ }
+
+ kmem_strfree(name);
+}
+
+static void
+spa_iostats_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+ if (ksp) {
+ kmem_free(ksp->ks_data, sizeof (spa_iostats_t));
+ kstat_delete(ksp);
+ }
+
+ mutex_destroy(&shk->lock);
+}
+
+void
+spa_stats_init(spa_t *spa)
+{
+ spa_read_history_init(spa);
+ spa_txg_history_init(spa);
+ spa_tx_assign_init(spa);
+ spa_io_history_init(spa);
+ spa_mmp_history_init(spa);
+ spa_state_init(spa);
+ spa_iostats_init(spa);
+}
+
+void
+spa_stats_destroy(spa_t *spa)
+{
+ spa_iostats_destroy(spa);
+ spa_health_destroy(spa);
+ spa_tx_assign_destroy(spa);
+ spa_txg_history_destroy(spa);
+ spa_read_history_destroy(spa);
+ spa_io_history_destroy(spa);
+ spa_mmp_history_destroy(spa);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW,
+ "Historical statistics for the last N reads");
+
+ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW,
+ "Include cache hits in read history");
+
+ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW,
+ "Historical statistics for the last N txgs");
+
+ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW,
+ "Historical statistics for last N multihost writes");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/space_map.c b/sys/contrib/openzfs/module/zfs/space_map.c
new file mode 100644
index 000000000000..3db7d199199c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/space_map.c
@@ -0,0 +1,1105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio.h>
+#include <sys/space_map.h>
+#include <sys/zfeature.h>
+
+/*
+ * Note on space map block size:
+ *
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer I/O operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more I/O bandwidth
+ * when only a few blocks have changed since the last transaction group.
+ */
+
+/*
+ * Enabled whenever we want to stress test the use of double-word
+ * space map entries.
+ */
+boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
+
+/*
+ * Override the default indirect block size of 128K, instead use 16K for
+ * spacemaps (2^14 bytes). This dramatically reduces write inflation since
+ * appending to a spacemap typically has to write one data block (4KB) and one
+ * or two indirect blocks (16K-32K, rather than 128K).
+ */
+int space_map_ibs = 14;
+
+boolean_t
+sm_entry_is_debug(uint64_t e)
+{
+ return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
+}
+
+boolean_t
+sm_entry_is_single_word(uint64_t e)
+{
+ uint8_t prefix = SM_PREFIX_DECODE(e);
+ return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
+}
+
+boolean_t
+sm_entry_is_double_word(uint64_t e)
+{
+ return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
+}
+
+/*
+ * Iterate through the space map, invoking the callback on each (non-debug)
+ * space map entry. Stop after reading 'end' bytes of the space map.
+ */
+int
+space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
+{
+ uint64_t blksz = sm->sm_blksz;
+
+ ASSERT3U(blksz, !=, 0);
+ ASSERT3U(end, <=, space_map_length(sm));
+ ASSERT0(P2PHASE(end, sizeof (uint64_t)));
+
+ dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
+ ZIO_PRIORITY_SYNC_READ);
+
+ int error = 0;
+ uint64_t txg = 0, sync_pass = 0;
+ for (uint64_t block_base = 0; block_base < end && error == 0;
+ block_base += blksz) {
+ dmu_buf_t *db;
+ error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
+ block_base, FTAG, &db, DMU_READ_PREFETCH);
+ if (error != 0)
+ return (error);
+
+ uint64_t *block_start = db->db_data;
+ uint64_t block_length = MIN(end - block_base, blksz);
+ uint64_t *block_end = block_start +
+ (block_length / sizeof (uint64_t));
+
+ VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
+ VERIFY3U(block_length, !=, 0);
+ ASSERT3U(blksz, ==, db->db_size);
+
+ for (uint64_t *block_cursor = block_start;
+ block_cursor < block_end && error == 0; block_cursor++) {
+ uint64_t e = *block_cursor;
+
+ if (sm_entry_is_debug(e)) {
+ /*
+ * Debug entries are only needed to record the
+ * current TXG and sync pass if available.
+ *
+ * Note though that sometimes there can be
+ * debug entries that are used as padding
+ * at the end of space map blocks in order
+ * to avoid splitting a double-word entry
+ * across two blocks. These entries
+ * have their TXG field set to 0 and we
+ * skip them without recording the TXG.
+ * [see comment in space_map_write_seg()]
+ */
+ uint64_t e_txg = SM_DEBUG_TXG_DECODE(e);
+ if (e_txg != 0) {
+ txg = e_txg;
+ sync_pass = SM_DEBUG_SYNCPASS_DECODE(e);
+ } else {
+ ASSERT0(SM_DEBUG_SYNCPASS_DECODE(e));
+ }
+ continue;
+ }
+
+ uint64_t raw_offset, raw_run, vdev_id;
+ maptype_t type;
+ if (sm_entry_is_single_word(e)) {
+ type = SM_TYPE_DECODE(e);
+ vdev_id = SM_NO_VDEVID;
+ raw_offset = SM_OFFSET_DECODE(e);
+ raw_run = SM_RUN_DECODE(e);
+ } else {
+ /* it is a two-word entry */
+ ASSERT(sm_entry_is_double_word(e));
+ raw_run = SM2_RUN_DECODE(e);
+ vdev_id = SM2_VDEV_DECODE(e);
+
+ /* move on to the second word */
+ block_cursor++;
+ e = *block_cursor;
+ VERIFY3P(block_cursor, <=, block_end);
+
+ type = SM2_TYPE_DECODE(e);
+ raw_offset = SM2_OFFSET_DECODE(e);
+ }
+
+ uint64_t entry_offset = (raw_offset << sm->sm_shift) +
+ sm->sm_start;
+ uint64_t entry_run = raw_run << sm->sm_shift;
+
+ VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+ VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
+ ASSERT3U(entry_offset, >=, sm->sm_start);
+ ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
+ ASSERT3U(entry_run, <=, sm->sm_size);
+ ASSERT3U(entry_offset + entry_run, <=,
+ sm->sm_start + sm->sm_size);
+
+ space_map_entry_t sme = {
+ .sme_type = type,
+ .sme_vdev = vdev_id,
+ .sme_offset = entry_offset,
+ .sme_run = entry_run,
+ .sme_txg = txg,
+ .sme_sync_pass = sync_pass
+ };
+ error = callback(&sme, arg);
+ }
+ dmu_buf_rele(db, FTAG);
+ }
+ return (error);
+}
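+/*
+ * Illustrative sketch (not part of the upstream import): a minimal sm_cb_t
+ * callback that tallies the net allocated space seen while iterating a
+ * space map. A caller would use it roughly as:
+ *
+ * int64_t alloc = 0;
+ * error = space_map_iterate(sm, space_map_length(sm),
+ * space_map_tally_cb, &alloc);
+ */
+#if 0 /* example only */
+static int
+space_map_tally_cb(space_map_entry_t *sme, void *arg)
+{
+ int64_t *alloc = arg;
+
+ if (sme->sme_type == SM_ALLOC)
+ *alloc += sme->sme_run;
+ else
+ *alloc -= sme->sme_run;
+
+ return (0);
+}
+#endif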
+
+/*
+ * Reads the entries from the last block of the space map into
+ * buf in reverse order. Populates nwords with number of words
+ * in the last block.
+ *
+ * Refer to block comment within space_map_incremental_destroy()
+ * to understand why this function is needed.
+ */
+static int
+space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
+ uint64_t bufsz, uint64_t *nwords)
+{
+ int error = 0;
+ dmu_buf_t *db;
+
+ /*
+ * Find the offset of the last word in the space map and use
+ * that to read the last block of the space map with
+ * dmu_buf_hold().
+ */
+ uint64_t last_word_offset =
+ sm->sm_phys->smp_length - sizeof (uint64_t);
+ error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
+ FTAG, &db, DMU_READ_NO_PREFETCH);
+ if (error != 0)
+ return (error);
+
+ ASSERT3U(sm->sm_object, ==, db->db_object);
+ ASSERT3U(sm->sm_blksz, ==, db->db_size);
+ ASSERT3U(bufsz, >=, db->db_size);
+ ASSERT(nwords != NULL);
+
+ uint64_t *words = db->db_data;
+ *nwords =
+ (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
+
+ ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
+
+ uint64_t n = *nwords;
+ uint64_t j = n - 1;
+ for (uint64_t i = 0; i < n; i++) {
+ uint64_t entry = words[i];
+ if (sm_entry_is_double_word(entry)) {
+ /*
+ * Since we are populating the buffer backwards
+ * we have to be extra careful and add the two
+ * words of the double-word entry in the right
+ * order.
+ */
+ ASSERT3U(j, >, 0);
+ buf[j - 1] = entry;
+
+ i++;
+ ASSERT3U(i, <, n);
+ entry = words[i];
+ buf[j] = entry;
+ j -= 2;
+ } else {
+ ASSERT(sm_entry_is_debug(entry) ||
+ sm_entry_is_single_word(entry));
+ buf[j] = entry;
+ j--;
+ }
+ }
+
+ /*
+ * Assert that we wrote backwards all the
+ * way to the beginning of the buffer.
+ */
+ ASSERT3S(j, ==, -1);
+
+ dmu_buf_rele(db, FTAG);
+ return (error);
+}
+
+/*
+ * Note: This function performs destructive actions - specifically
+ * it deletes entries from the end of the space map. Thus, callers
+ * should ensure that they are holding the appropriate locks for
+ * the space map that they provide.
+ */
+int
+space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
+ dmu_tx_t *tx)
+{
+ uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
+ uint64_t *buf = zio_buf_alloc(bufsz);
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ /*
+ * Ideally we would want to iterate from the beginning of the
+ * space map to the end in incremental steps. The issue with this
+ * approach is that we don't have any field on-disk that points
+ * us where to start between each step. We could try zeroing out
+ * entries that we've destroyed, but this doesn't work either as
+ * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
+ *
+ * As a result, we destroy its entries incrementally starting from
+ * the end after applying the callback to each of them.
+ *
+ * The problem with this approach is that we cannot literally
+ * iterate through the words in the space map backwards as we
+ * can't distinguish two-word space map entries from their second
+ * word. Thus we do the following:
+ *
+ * 1] We get all the entries from the last block of the space map
+ * and put them into a buffer in reverse order. This way the
+ * last entry comes first in the buffer, the second to last is
+ * second, etc.
+ * 2] We iterate through the entries in the buffer and we apply
+ * the callback to each one. As we move from entry to entry we
+ * decrease the size of the space map, effectively deleting
+ * each entry.
+ * 3] If there are no more entries in the space map or the callback
+ * returns a value other than 0, we stop iterating over the
+ * space map. If there are entries remaining and the callback
+ * returned 0, we go back to step [1].
+ */
+ int error = 0;
+ while (space_map_length(sm) > 0 && error == 0) {
+ uint64_t nwords = 0;
+ error = space_map_reversed_last_block_entries(sm, buf, bufsz,
+ &nwords);
+ if (error != 0)
+ break;
+
+ ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
+
+ for (uint64_t i = 0; i < nwords; i++) {
+ uint64_t e = buf[i];
+
+ if (sm_entry_is_debug(e)) {
+ sm->sm_phys->smp_length -= sizeof (uint64_t);
+ continue;
+ }
+
+ int words = 1;
+ uint64_t raw_offset, raw_run, vdev_id;
+ maptype_t type;
+ if (sm_entry_is_single_word(e)) {
+ type = SM_TYPE_DECODE(e);
+ vdev_id = SM_NO_VDEVID;
+ raw_offset = SM_OFFSET_DECODE(e);
+ raw_run = SM_RUN_DECODE(e);
+ } else {
+ ASSERT(sm_entry_is_double_word(e));
+ words = 2;
+
+ raw_run = SM2_RUN_DECODE(e);
+ vdev_id = SM2_VDEV_DECODE(e);
+
+ /* move to the second word */
+ i++;
+ e = buf[i];
+
+ ASSERT3P(i, <=, nwords);
+
+ type = SM2_TYPE_DECODE(e);
+ raw_offset = SM2_OFFSET_DECODE(e);
+ }
+
+ uint64_t entry_offset =
+ (raw_offset << sm->sm_shift) + sm->sm_start;
+ uint64_t entry_run = raw_run << sm->sm_shift;
+
+ VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+ VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
+ VERIFY3U(entry_offset, >=, sm->sm_start);
+ VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
+ VERIFY3U(entry_run, <=, sm->sm_size);
+ VERIFY3U(entry_offset + entry_run, <=,
+ sm->sm_start + sm->sm_size);
+
+ space_map_entry_t sme = {
+ .sme_type = type,
+ .sme_vdev = vdev_id,
+ .sme_offset = entry_offset,
+ .sme_run = entry_run
+ };
+ error = callback(&sme, arg);
+ if (error != 0)
+ break;
+
+ if (type == SM_ALLOC)
+ sm->sm_phys->smp_alloc -= entry_run;
+ else
+ sm->sm_phys->smp_alloc += entry_run;
+ sm->sm_phys->smp_length -= words * sizeof (uint64_t);
+ }
+ }
+
+ if (space_map_length(sm) == 0) {
+ ASSERT0(error);
+ ASSERT0(space_map_allocated(sm));
+ }
+
+ zio_buf_free(buf, bufsz);
+ return (error);
+}
+
+typedef struct space_map_load_arg {
+ space_map_t *smla_sm;
+ range_tree_t *smla_rt;
+ maptype_t smla_type;
+} space_map_load_arg_t;
+
+static int
+space_map_load_callback(space_map_entry_t *sme, void *arg)
+{
+ space_map_load_arg_t *smla = arg;
+ if (sme->sme_type == smla->smla_type) {
+ VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
+ smla->smla_sm->sm_size);
+ range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
+ } else {
+ range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
+ }
+
+ return (0);
+}
+
+/*
+ * Load the space map into the range tree, as space_map_load() does, but
+ * only read the first 'length' bytes of the space map.
+ */
+int
+space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t length)
+{
+ space_map_load_arg_t smla;
+
+ VERIFY0(range_tree_space(rt));
+
+ if (maptype == SM_FREE)
+ range_tree_add(rt, sm->sm_start, sm->sm_size);
+
+ smla.smla_rt = rt;
+ smla.smla_sm = sm;
+ smla.smla_type = maptype;
+ int err = space_map_iterate(sm, length,
+ space_map_load_callback, &smla);
+
+ if (err != 0)
+ range_tree_vacate(rt, NULL, NULL);
+
+ return (err);
+}
+
+/*
+ * Load the space map from disk into the specified range tree. Segments of
+ * maptype are added to the range tree; other segment types are removed.
+ */
+int
+space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+{
+ return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
+}
+
+void
+space_map_histogram_clear(space_map_t *sm)
+{
+ if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+ return;
+
+ bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
+}
+
+boolean_t
+space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
+{
+ /*
+ * Verify that the in-core range tree does not have any
+ * ranges smaller than our sm_shift size.
+ */
+ for (int i = 0; i < sm->sm_shift; i++) {
+ if (rt->rt_histogram[i] != 0)
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+void
+space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
+{
+ int idx = 0;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ VERIFY3U(space_map_object(sm), !=, 0);
+
+ if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+ return;
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ ASSERT(space_map_histogram_verify(sm, rt));
+ /*
+ * Transfer the content of the range tree histogram to the space
+ * map histogram. The space map histogram contains 32 buckets ranging
+ * between 2^sm_shift and 2^(32+sm_shift-1). The range tree,
+ * however, can represent ranges from 2^0 to 2^63. Since the space
+ * map only cares about allocatable blocks (minimum of sm_shift) we
+ * can safely ignore all ranges in the range tree smaller than sm_shift.
+ */
+ for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+
+ /*
+ * Since the largest histogram bucket in the space map is
+ * 2^(32+sm_shift-1), we need to normalize the values in
+ * the range tree for any bucket larger than that size. For
+ * example given an sm_shift of 9, ranges larger than 2^40
+ * would get normalized as if they were 1TB ranges. Assume
+ * the range tree had a count of 5 in the 2^44 (16TB) bucket,
+ * the calculation below would normalize this to 5 * 2^4 (16).
+ */
+ ASSERT3U(i, >=, idx + sm->sm_shift);
+ sm->sm_phys->smp_histogram[idx] +=
+ rt->rt_histogram[i] << (i - idx - sm->sm_shift);
+
+ /*
+ * Increment the space map's index as long as we haven't
+ * reached the maximum bucket size. Accumulate all ranges
+ * larger than the max bucket size into the last bucket.
+ */
+ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+ ASSERT3U(idx + sm->sm_shift, ==, i);
+ idx++;
+ ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+ }
+ }
+}
+
+static void
+space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+ SM_DEBUG_ACTION_ENCODE(maptype) |
+ SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
+ SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+ dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
+ sizeof (dentry), &dentry, tx);
+
+ sm->sm_phys->smp_length += sizeof (dentry);
+}
+
+/*
+ * Writes one or more entries given a segment.
+ *
+ * Note: The function may release the dbuf from the pointer initially
+ * passed to it, and return a different dbuf. Also, the space map's
+ * dbuf must be dirty for the changes in sm_phys to take effect.
+ */
+static void
+space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend,
+ maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp,
+ void *tag, dmu_tx_t *tx)
+{
+ ASSERT3U(words, !=, 0);
+ ASSERT3U(words, <=, 2);
+
+ /* ensure the vdev_id can be represented by the space map */
+ ASSERT3U(vdev_id, <=, SM_NO_VDEVID);
+
+ /*
+ * if this is a single word entry, ensure that no vdev was
+ * specified.
+ */
+ IMPLY(words == 1, vdev_id == SM_NO_VDEVID);
+
+ dmu_buf_t *db = *dbp;
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ uint64_t *block_base = db->db_data;
+ uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
+ uint64_t *block_cursor = block_base +
+ (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
+
+ ASSERT3P(block_cursor, <=, block_end);
+
+ uint64_t size = (rend - rstart) >> sm->sm_shift;
+ uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift;
+ uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
+
+ ASSERT3U(rstart, >=, sm->sm_start);
+ ASSERT3U(rstart, <, sm->sm_start + sm->sm_size);
+ ASSERT3U(rend - rstart, <=, sm->sm_size);
+ ASSERT3U(rend, <=, sm->sm_start + sm->sm_size);
+
+ while (size != 0) {
+ ASSERT3P(block_cursor, <=, block_end);
+
+ /*
+ * If we are at the end of this block, flush it and start
+ * writing again from the beginning.
+ */
+ if (block_cursor == block_end) {
+ dmu_buf_rele(db, tag);
+
+ uint64_t next_word_offset = sm->sm_phys->smp_length;
+ VERIFY0(dmu_buf_hold(sm->sm_os,
+ space_map_object(sm), next_word_offset,
+ tag, &db, DMU_READ_PREFETCH));
+ dmu_buf_will_dirty(db, tx);
+
+ /* update caller's dbuf */
+ *dbp = db;
+
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ block_base = db->db_data;
+ block_cursor = block_base;
+ block_end = block_base +
+ (db->db_size / sizeof (uint64_t));
+ }
+
+ /*
+ * If we are writing a two-word entry and we only have one
+ * word left on this block, just pad it with an empty debug
+ * entry and write the two-word entry in the next block.
+ */
+ uint64_t *next_entry = block_cursor + 1;
+ if (next_entry == block_end && words > 1) {
+ ASSERT3U(words, ==, 2);
+ *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+ SM_DEBUG_ACTION_ENCODE(0) |
+ SM_DEBUG_SYNCPASS_ENCODE(0) |
+ SM_DEBUG_TXG_ENCODE(0);
+ block_cursor++;
+ sm->sm_phys->smp_length += sizeof (uint64_t);
+ ASSERT3P(block_cursor, ==, block_end);
+ continue;
+ }
+
+ uint64_t run_len = MIN(size, run_max);
+ switch (words) {
+ case 1:
+ *block_cursor = SM_OFFSET_ENCODE(start) |
+ SM_TYPE_ENCODE(maptype) |
+ SM_RUN_ENCODE(run_len);
+ block_cursor++;
+ break;
+ case 2:
+ /* write the first word of the entry */
+ *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
+ SM2_RUN_ENCODE(run_len) |
+ SM2_VDEV_ENCODE(vdev_id);
+ block_cursor++;
+
+ /* move on to the second word of the entry */
+ ASSERT3P(block_cursor, <, block_end);
+ *block_cursor = SM2_TYPE_ENCODE(maptype) |
+ SM2_OFFSET_ENCODE(start);
+ block_cursor++;
+ break;
+ default:
+ panic("%d-word space map entries are not supported",
+ words);
+ break;
+ }
+ sm->sm_phys->smp_length += words * sizeof (uint64_t);
+
+ start += run_len;
+ size -= run_len;
+ }
+ ASSERT0(size);
+
+}
+
+/*
+ * Note: The space map's dbuf must be dirty for the changes in sm_phys to
+ * take effect.
+ */
+static void
+space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t vdev_id, dmu_tx_t *tx)
+{
+ spa_t *spa = tx->tx_pool->dp_spa;
+ dmu_buf_t *db;
+
+ space_map_write_intro_debug(sm, maptype, tx);
+
+#ifdef ZFS_DEBUG
+ /*
+ * We do this right after we write the intro debug entry
+ * because the estimate does not take it into account.
+ */
+ uint64_t initial_objsize = sm->sm_phys->smp_length;
+ uint64_t estimated_growth =
+ space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
+ uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
+#endif
+
+ /*
+ * Find the offset right after the last word in the space map
+ * and use that to get a hold of the last block, so we can
+ * start appending to it.
+ */
+ uint64_t next_word_offset = sm->sm_phys->smp_length;
+ VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
+ next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ dmu_buf_will_dirty(db, tx);
+
+ zfs_btree_t *t = &rt->rt_root;
+ zfs_btree_index_t where;
+ for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL;
+ rs = zfs_btree_next(t, &where, &where)) {
+ uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >>
+ sm->sm_shift;
+ uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >>
+ sm->sm_shift;
+ uint8_t words = 1;
+
+ /*
+ * We only write two-word entries when both of the following
+ * are true:
+ *
+ * [1] The feature is enabled.
+ * [2] The offset or run is too big for a single-word entry,
+ * or the vdev_id is set (meaning not equal to
+ * SM_NO_VDEVID).
+ *
+ * Note that for purposes of testing we've added the case that
+ * we write two-word entries occasionally when the feature is
+ * enabled and zfs_force_some_double_word_sm_entries has been
+ * set.
+ */
+ if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
+ (offset >= (1ULL << SM_OFFSET_BITS) ||
+ length > SM_RUN_MAX ||
+ vdev_id != SM_NO_VDEVID ||
+ (zfs_force_some_double_word_sm_entries &&
+ spa_get_random(100) == 0)))
+ words = 2;
+
+ space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs,
+ rt), maptype, vdev_id, words, &db, FTAG, tx);
+ }
+
+ dmu_buf_rele(db, FTAG);
+
+#ifdef ZFS_DEBUG
+ /*
+ * We expect our estimation to be based on the worst case
+ * scenario [see comment in space_map_estimate_optimal_size()].
+ * Therefore we expect the actual objsize to be equal or less
+ * than whatever we estimated it to be.
+ */
+ ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
+#endif
+}
+
+/*
+ * Note: This function manipulates the state of the given space map but
+ * does not hold any locks implicitly. Thus the caller is responsible
+ * for synchronizing writes to the space map.
+ */
+void
+space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t vdev_id, dmu_tx_t *tx)
+{
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(sm->sm_os)));
+ VERIFY3U(space_map_object(sm), !=, 0);
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ /*
+ * This field is no longer necessary since the in-core space map
+ * now contains the object number but is maintained for backwards
+ * compatibility.
+ */
+ sm->sm_phys->smp_object = sm->sm_object;
+
+ if (range_tree_is_empty(rt)) {
+ VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
+ return;
+ }
+
+ if (maptype == SM_ALLOC)
+ sm->sm_phys->smp_alloc += range_tree_space(rt);
+ else
+ sm->sm_phys->smp_alloc -= range_tree_space(rt);
+
+ uint64_t nodes = zfs_btree_numnodes(&rt->rt_root);
+ uint64_t rt_space = range_tree_space(rt);
+
+ space_map_write_impl(sm, rt, maptype, vdev_id, tx);
+
+ /*
+ * Ensure that the space_map's accounting wasn't changed
+ * while we were in the middle of writing it out.
+ */
+ VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root));
+ VERIFY3U(range_tree_space(rt), ==, rt_space);
+}
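+/*
+ * Illustrative sketch (not part of the upstream import): a sync-context
+ * caller that persists the segments accumulated in an in-core range tree
+ * and then empties the tree. 'allocs' is a hypothetical tree of newly
+ * allocated segments maintained by the caller.
+ */
+#if 0 /* example only */
+static void
+example_sync_allocs(space_map_t *sm, range_tree_t *allocs, dmu_tx_t *tx)
+{
+ space_map_write(sm, allocs, SM_ALLOC, SM_NO_VDEVID, tx);
+ range_tree_vacate(allocs, NULL, NULL);
+}
+#endif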
+
+static int
+space_map_open_impl(space_map_t *sm)
+{
+ int error;
+ u_longlong_t blocks;
+
+ error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
+ if (error)
+ return (error);
+
+ dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
+ sm->sm_phys = sm->sm_dbuf->db_data;
+ return (0);
+}
+
+int
+space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
+ uint64_t start, uint64_t size, uint8_t shift)
+{
+ space_map_t *sm;
+ int error;
+
+ ASSERT(*smp == NULL);
+ ASSERT(os != NULL);
+ ASSERT(object != 0);
+
+ sm = kmem_alloc(sizeof (space_map_t), KM_SLEEP);
+
+ sm->sm_start = start;
+ sm->sm_size = size;
+ sm->sm_shift = shift;
+ sm->sm_os = os;
+ sm->sm_object = object;
+ sm->sm_blksz = 0;
+ sm->sm_dbuf = NULL;
+ sm->sm_phys = NULL;
+
+ error = space_map_open_impl(sm);
+ if (error != 0) {
+ space_map_close(sm);
+ return (error);
+ }
+ *smp = sm;
+
+ return (0);
+}
+
+void
+space_map_close(space_map_t *sm)
+{
+ if (sm == NULL)
+ return;
+
+ if (sm->sm_dbuf != NULL)
+ dmu_buf_rele(sm->sm_dbuf, sm);
+ sm->sm_dbuf = NULL;
+ sm->sm_phys = NULL;
+
+ kmem_free(sm, sizeof (*sm));
+}
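+/*
+ * Illustrative sketch (not part of the upstream import): the typical
+ * consumer pattern for the functions above - open a space map object,
+ * load its contents into an (initially empty) range tree, and close it
+ * again. The parameters are placeholders supplied by the caller.
+ */
+#if 0 /* example only */
+static int
+space_map_load_example(objset_t *os, uint64_t smobj, uint64_t start,
+ uint64_t size, uint8_t shift, range_tree_t *rt)
+{
+ space_map_t *sm = NULL;
+ int error = space_map_open(&sm, os, smobj, start, size, shift);
+
+ if (error != 0)
+ return (error);
+
+ /* Adds all SM_ALLOC segments of the space map to 'rt'. */
+ error = space_map_load(sm, rt, SM_ALLOC);
+
+ space_map_close(sm);
+ return (error);
+}
+#endif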
+
+void
+space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
+{
+ objset_t *os = sm->sm_os;
+ spa_t *spa = dmu_objset_spa(os);
+ dmu_object_info_t doi;
+
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ ASSERT(dmu_tx_is_syncing(tx));
+ VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa));
+
+ dmu_object_info_from_db(sm->sm_dbuf, &doi);
+
+ /*
+ * If the space map has the wrong bonus size (because
+ * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
+ * the wrong block size (because space_map_blksz has changed),
+ * free and re-allocate its object with the updated sizes.
+ *
+ * Otherwise, just truncate the current object.
+ */
+ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+ doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
+ doi.doi_data_block_size != blocksize ||
+ doi.doi_metadata_block_size != 1 << space_map_ibs) {
+ zfs_dbgmsg("txg %llu, spa %s, sm %px, reallocating "
+ "object[%llu]: old bonus %u, old blocksz %u",
+ dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
+ doi.doi_bonus_size, doi.doi_data_block_size);
+
+ space_map_free(sm, tx);
+ dmu_buf_rele(sm->sm_dbuf, sm);
+
+ sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
+ VERIFY0(space_map_open_impl(sm));
+ } else {
+ VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
+
+ /*
+ * If the spacemap is reallocated, its histogram
+ * will be reset. Do the same in the common case so that
+ * bugs related to the uncommon case do not go unnoticed.
+ */
+ bzero(sm->sm_phys->smp_histogram,
+ sizeof (sm->sm_phys->smp_histogram));
+ }
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+ sm->sm_phys->smp_length = 0;
+ sm->sm_phys->smp_alloc = 0;
+}
+
+uint64_t
+space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ uint64_t object;
+ int bonuslen;
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+ spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
+ bonuslen = sizeof (space_map_phys_t);
+ ASSERT3U(bonuslen, <=, dmu_bonus_max());
+ } else {
+ bonuslen = SPACE_MAP_SIZE_V0;
+ }
+
+ object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
+ space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
+
+ return (object);
+}
+
+void
+space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+ dmu_object_info_t doi;
+
+ VERIFY0(dmu_object_info(os, smobj, &doi));
+ if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
+ spa_feature_decr(spa,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
+ }
+ }
+
+ VERIFY0(dmu_object_free(os, smobj, tx));
+}
+
+void
+space_map_free(space_map_t *sm, dmu_tx_t *tx)
+{
+ if (sm == NULL)
+ return;
+
+ space_map_free_obj(sm->sm_os, space_map_object(sm), tx);
+ sm->sm_object = 0;
+}
+
+/*
+ * Given a range tree, make a worst-case estimate of how much space
+ * the tree's segments would take if they were written to the given
+ * space map.
+ */
+uint64_t
+space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
+ uint64_t vdev_id)
+{
+ spa_t *spa = dmu_objset_spa(sm->sm_os);
+ uint64_t shift = sm->sm_shift;
+ uint64_t *histogram = rt->rt_histogram;
+ uint64_t entries_for_seg = 0;
+
+ /*
+ * In order to get a quick estimate of the optimal size that this
+ * range tree would have on-disk as a space map, we iterate through
+ * its histogram buckets instead of iterating through its nodes.
+ *
+ * Note that this is a highest-bound/worst-case estimate for the
+ * following reasons:
+ *
+ * 1] We assume that we always add a debug padding for each block
+ * we write and we also assume that we start at the last word
+ * of a block attempting to write a two-word entry.
+ * 2] Rounding up errors due to the way segments are distributed
+ * in the buckets of the range tree's histogram.
+ * 3] The activation of zfs_force_some_double_word_sm_entries
+ * (tunable) when testing.
+ *
+ * = Math and Rounding Errors =
+ *
+ * rt_histogram[i] bucket of a range tree represents the number
+ * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given
+ * that, we want to divide the buckets into groups: Buckets that
+ * can be represented using a single-word entry, ones that can
+ * be represented with a double-word entry, and ones that can
+ * only be represented with multiple two-word entries.
+ *
+ * [Note that if the new encoding feature is not enabled there
+ * are only two groups: single-word entry buckets and multiple
+ * single-word entry buckets. The information below assumes
+ * two-word entries are enabled, but it can easily be applied when
+ * the feature is not enabled.]
+ *
+ * To find the highest bucket that can be represented with a
+ * single-word entry we look at the maximum run that such entry
+ * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that
+ * the run of a space map entry is shifted by sm_shift, thus we
+ * add it to the exponent]. This way, excluding the value of the
+ * maximum run that can be represented by a single-word entry,
+ * all runs that are smaller exist in buckets 0 to
+ * SM_RUN_BITS + shift - 1.
+ *
+ * To find the highest bucket that can be represented with a
+ * double-word entry, we follow the same approach. Finally, any
+ * buckets higher than that are represented with multiple two-word
+ * entries. To be more specific, if the highest bucket whose
+ * segments can be represented with a single two-word entry is X,
+ * then bucket X+1 will need 2 two-word entries for each of its
+ * segments, X+2 will need 4, X+3 will need 8, ...etc.
+ *
+ * With all of the above we make our estimation based on bucket
+ * groups. There is a rounding error though. As we mentioned in
+ * the example with the one-word entry, the maximum run that can
+ * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is
+ * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of
+ * that length fall into the next bucket (and bucket group) where
+ * we start counting two-word entries and this is one more reason
+ * why the estimated size may end up being bigger than the actual
+ * size written.
+ */
+ uint64_t size = 0;
+ uint64_t idx = 0;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) ||
+ (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) {
+
+ /*
+ * If we are trying to force some double-word entries, just
+ * assume the worst case of every single-word entry being
+ * written as a double-word entry.
+ */
+ uint64_t entry_size =
+ (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) &&
+ zfs_force_some_double_word_sm_entries) ?
+ (2 * sizeof (uint64_t)) : sizeof (uint64_t);
+
+ uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1;
+ for (; idx <= single_entry_max_bucket; idx++)
+ size += histogram[idx] * entry_size;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) {
+ for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+ ASSERT3U(idx, >=, single_entry_max_bucket);
+ entries_for_seg =
+ 1ULL << (idx - single_entry_max_bucket);
+ size += histogram[idx] *
+ entries_for_seg * entry_size;
+ }
+ return (size);
+ }
+ }
+
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2));
+
+ uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1;
+ for (; idx <= double_entry_max_bucket; idx++)
+ size += histogram[idx] * 2 * sizeof (uint64_t);
+
+ for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+ ASSERT3U(idx, >=, double_entry_max_bucket);
+ entries_for_seg = 1ULL << (idx - double_entry_max_bucket);
+ size += histogram[idx] *
+ entries_for_seg * 2 * sizeof (uint64_t);
+ }
+
+ /*
+ * Assume the worst case where we start with the padding at the end
+ * of the current block and we add an extra padding entry at the end
+ * of all subsequent blocks.
+ */
+ size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);
+
+ return (size);
+}
+
+uint64_t
+space_map_object(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_object : 0);
+}
+
+int64_t
+space_map_allocated(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
+}
+
+uint64_t
+space_map_length(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_phys->smp_length : 0);
+}
+
+uint64_t
+space_map_nblocks(space_map_t *sm)
+{
+ if (sm == NULL)
+ return (0);
+ return (DIV_ROUND_UP(space_map_length(sm), sm->sm_blksz));
+}
diff --git a/sys/contrib/openzfs/module/zfs/space_reftree.c b/sys/contrib/openzfs/module/zfs/space_reftree.c
new file mode 100644
index 000000000000..080fc6646512
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/space_reftree.c
@@ -0,0 +1,152 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/range_tree.h>
+#include <sys/space_reftree.h>
+
+/*
+ * Space reference trees.
+ *
+ * A range tree is a collection of integers. Every integer is either
+ * in the tree, or it's not. A space reference tree generalizes
+ * the idea: it allows its members to have arbitrary reference counts,
+ * as opposed to the implicit reference count of 0 or 1 in a range tree.
+ * This representation comes in handy when computing the union or
+ * intersection of multiple space maps. For example, the union of
+ * N range trees is the subset of the reference tree with refcnt >= 1.
+ * The intersection of N range trees is the subset with refcnt >= N.
+ *
+ * [It's very much like a Fourier transform. Unions and intersections
+ * are hard to perform in the 'range tree domain', so we convert the trees
+ * into the 'reference count domain', where it's trivial, then invert.]
+ *
+ * vdev_dtl_reassess() uses computations of this form to determine
+ * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
+ * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
+ * has an outage wherever refcnt >= vdev_children.
+ */
+static int
+space_reftree_compare(const void *x1, const void *x2)
+{
+ const space_ref_t *sr1 = (const space_ref_t *)x1;
+ const space_ref_t *sr2 = (const space_ref_t *)x2;
+
+ int cmp = TREE_CMP(sr1->sr_offset, sr2->sr_offset);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_PCMP(sr1, sr2));
+}
+
+void
+space_reftree_create(avl_tree_t *t)
+{
+ avl_create(t, space_reftree_compare,
+ sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
+}
+
+void
+space_reftree_destroy(avl_tree_t *t)
+{
+ space_ref_t *sr;
+ void *cookie = NULL;
+
+ while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(sr, sizeof (*sr));
+
+ avl_destroy(t);
+}
+
+static void
+space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
+{
+ space_ref_t *sr;
+
+ sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
+ sr->sr_offset = offset;
+ sr->sr_refcnt = refcnt;
+
+ avl_add(t, sr);
+}
+
+void
+space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
+ int64_t refcnt)
+{
+ space_reftree_add_node(t, start, refcnt);
+ space_reftree_add_node(t, end, -refcnt);
+}
+
+/*
+ * Convert (or add) a range tree into a reference tree.
+ */
+void
+space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt)
+{
+ zfs_btree_index_t where;
+
+ for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs =
+ zfs_btree_next(&rt->rt_root, &where, &where)) {
+ space_reftree_add_seg(t, rs_get_start(rs, rt), rs_get_end(rs,
+ rt), refcnt);
+ }
+}
+
+/*
+ * Convert a reference tree into a range tree. The range tree will contain
+ * all members of the reference tree for which refcnt >= minref.
+ */
+void
+space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref)
+{
+ uint64_t start = -1ULL;
+ int64_t refcnt = 0;
+ space_ref_t *sr;
+
+ range_tree_vacate(rt, NULL, NULL);
+
+ for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
+ refcnt += sr->sr_refcnt;
+ if (refcnt >= minref) {
+ if (start == -1ULL) {
+ start = sr->sr_offset;
+ }
+ } else {
+ if (start != -1ULL) {
+ uint64_t end = sr->sr_offset;
+ ASSERT(start <= end);
+ if (end > start)
+ range_tree_add(rt, start, end - start);
+ start = -1ULL;
+ }
+ }
+ }
+ ASSERT(refcnt == 0);
+ ASSERT(start == -1ULL);
+}
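+/*
+ * Illustrative sketch (not part of the upstream import): computing the
+ * union of two range trees with the reference tree primitives above.
+ * Every segment present in either 'rt1' or 'rt2' ends up in 'result'
+ * (refcnt >= 1); passing minref = 2 instead would yield the intersection.
+ */
+#if 0 /* example only */
+static void
+range_tree_union_example(range_tree_t *rt1, range_tree_t *rt2,
+ range_tree_t *result)
+{
+ avl_tree_t reftree;
+
+ space_reftree_create(&reftree);
+ space_reftree_add_map(&reftree, rt1, 1);
+ space_reftree_add_map(&reftree, rt2, 1);
+ space_reftree_generate_map(&reftree, result, 1);
+ space_reftree_destroy(&reftree);
+}
+#endif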
diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c
new file mode 100644
index 000000000000..497e19dd58eb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/txg.c
@@ -0,0 +1,1076 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 Martin Matuska
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/txg_impl.h>
+#include <sys/dmu_impl.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
+#include <sys/zil.h>
+#include <sys/callb.h>
+#include <sys/trace_zfs.h>
+
+/*
+ * ZFS Transaction Groups
+ * ----------------------
+ *
+ * ZFS transaction groups are, as the name implies, groups of transactions
+ * that act on persistent state. ZFS asserts consistency at the granularity of
+ * these transaction groups. Each successive transaction group (txg) is
+ * assigned a 64-bit consecutive identifier. There are three active
+ * transaction group states: open, quiescing, or syncing. At any given time,
+ * there may be an active txg associated with each state; each active txg may
+ * either be processing, or blocked waiting to enter the next state. There may
+ * be up to three active txgs, and there is always a txg in the open state
+ * (though it may be blocked waiting to enter the quiescing state). In broad
+ * strokes, transactions -- operations that change in-memory structures -- are
+ * accepted into the txg in the open state, and are completed while the txg is
+ * in the open or quiescing states. The accumulated changes are written to
+ * disk in the syncing state.
+ *
+ * Open
+ *
+ * When a new txg becomes active, it first enters the open state. New
+ * transactions -- updates to in-memory structures -- are assigned to the
+ * currently open txg. There is always a txg in the open state so that ZFS can
+ * accept new changes (though the txg may refuse new changes if it has hit
+ * some limit). ZFS advances the open txg to the next state for a variety of
+ * reasons such as it hitting a time or size threshold, or the execution of an
+ * administrative action that must be completed in the syncing state.
+ *
+ * Quiescing
+ *
+ * After a txg exits the open state, it enters the quiescing state. The
+ * quiescing state is intended to provide a buffer between accepting new
+ * transactions in the open state and writing them out to stable storage in
+ * the syncing state. While quiescing, transactions can continue their
+ * operation without delaying either of the other states. Typically, a txg is
+ * in the quiescing state very briefly since the operations are bounded by
+ * software latencies rather than, say, slower I/O latencies. After all
+ * transactions complete, the txg is ready to enter the next state.
+ *
+ * Syncing
+ *
+ * In the syncing state, the in-memory state built up during the open and (to
+ * a lesser degree) the quiescing states is written to stable storage. The
+ * process of writing out modified data can, in turn, modify more data. For
+ * example, when we write new blocks, we need to allocate space for them; those
+ * allocations modify metadata (space maps)... which themselves must be
+ * written to stable storage. During the sync state, ZFS iterates, writing out
+ * data until it converges and all in-memory changes have been written out.
+ * The first such pass is the largest as it encompasses all the modified user
+ * data (as opposed to filesystem metadata). Subsequent passes typically have
+ * far less data to write as they consist exclusively of filesystem metadata.
+ *
+ * To ensure convergence, after a certain number of passes ZFS begins
+ * overwriting locations on stable storage that had been allocated earlier in
+ * the syncing state (and subsequently freed). ZFS usually allocates new
+ * blocks to optimize for large, continuous writes. For the syncing state to
+ * converge, however, it must complete a pass where no new blocks are allocated
+ * since each allocation requires a modification of persistent metadata.
+ * Further, to hasten convergence, after a prescribed number of passes, ZFS
+ * also defers frees, and stops compressing.
+ *
+ * In addition to writing out user data, we must also execute synctasks during
+ * the syncing context. A synctask is the mechanism by which some
+ * administrative activities work, such as creating and destroying snapshots or
+ * datasets. Note that when a synctask is initiated it enters the open txg,
+ * and ZFS then pushes that txg as quickly as possible to completion of the
+ * syncing state in order to reduce the latency of the administrative
+ * activity. To complete the syncing state, ZFS writes out a new uberblock,
+ * the root of the tree of blocks that comprise all state stored on the ZFS
+ * pool. Finally, if there is a quiesced txg waiting, we signal that it can
+ * now transition to the syncing state.
+ */
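+
+/*
+ * A minimal sketch of the pipeline described above (informal; the txg
+ * numbers are hypothetical and the labels do not correspond to symbols
+ * in this file):
+ *
+ *   txg 10: open       - new transactions are assigned here
+ *   txg  9: quiescing  - waiting for its open handles to be released
+ *   txg  8: syncing    - spa_sync() is writing it to stable storage
+ *
+ * When txg 8 finishes syncing, the quiesced txg 9 is handed to the sync
+ * thread, txg 10 can move to the quiescing state, and txg 11 is opened.
+ */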
+
+static void txg_sync_thread(void *arg);
+static void txg_quiesce_thread(void *arg);
+
+int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */
+
+/*
+ * Prepare the txg subsystem.
+ */
+void
+txg_init(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c;
+ bzero(tx, sizeof (tx_state_t));
+
+ tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
+
+ for (c = 0; c < max_ncpus; c++) {
+ int i;
+
+ mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_NOLOCKDEP,
+ NULL);
+ for (i = 0; i < TXG_SIZE; i++) {
+ cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
+ NULL);
+ list_create(&tx->tx_cpu[c].tc_callbacks[i],
+ sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
+ }
+ }
+
+ mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
+
+ tx->tx_open_txg = txg;
+}
+
+/*
+ * Close down the txg subsystem.
+ */
+void
+txg_fini(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c;
+
+ ASSERT0(tx->tx_threads);
+
+ mutex_destroy(&tx->tx_sync_lock);
+
+ cv_destroy(&tx->tx_sync_more_cv);
+ cv_destroy(&tx->tx_sync_done_cv);
+ cv_destroy(&tx->tx_quiesce_more_cv);
+ cv_destroy(&tx->tx_quiesce_done_cv);
+ cv_destroy(&tx->tx_exit_cv);
+
+ for (c = 0; c < max_ncpus; c++) {
+ int i;
+
+ mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
+ mutex_destroy(&tx->tx_cpu[c].tc_lock);
+ for (i = 0; i < TXG_SIZE; i++) {
+ cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
+ list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
+ }
+ }
+
+ if (tx->tx_commit_cb_taskq != NULL)
+ taskq_destroy(tx->tx_commit_cb_taskq);
+
+ vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
+
+ bzero(tx, sizeof (tx_state_t));
+}
+
+/*
+ * Start syncing transaction groups.
+ */
+void
+txg_sync_start(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+
+ dprintf("pool %p\n", dp);
+
+ ASSERT0(tx->tx_threads);
+
+ tx->tx_threads = 2;
+
+ tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
+ dp, 0, &p0, TS_RUN, defclsyspri);
+
+ /*
+ * The sync thread can need a larger-than-default stack size on
+ * 32-bit x86. This is due in part to nested pools and
+ * scrub_visitbp() recursion.
+ */
+ tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
+ dp, 0, &p0, TS_RUN, defclsyspri);
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+static void
+txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
+{
+ CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
+ mutex_enter(&tx->tx_sync_lock);
+}
+
+static void
+txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
+{
+ ASSERT(*tpp != NULL);
+ *tpp = NULL;
+ tx->tx_threads--;
+ cv_broadcast(&tx->tx_exit_cv);
+ CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */
+ thread_exit();
+}
+
+static void
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
+{
+ CALLB_CPR_SAFE_BEGIN(cpr);
+
+ if (time) {
+ (void) cv_timedwait_idle(cv, &tx->tx_sync_lock,
+ ddi_get_lbolt() + time);
+ } else {
+ cv_wait_idle(cv, &tx->tx_sync_lock);
+ }
+
+ CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
+}
+
+/*
+ * Stop syncing transaction groups.
+ */
+void
+txg_sync_stop(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ dprintf("pool %p\n", dp);
+ /*
+ * Finish off any work in progress.
+ */
+ ASSERT3U(tx->tx_threads, ==, 2);
+
+ /*
+ * We need to ensure that we've vacated the deferred metaslab trees.
+ */
+ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
+
+ /*
+ * Wake all sync threads and wait for them to die.
+ */
+ mutex_enter(&tx->tx_sync_lock);
+
+ ASSERT3U(tx->tx_threads, ==, 2);
+
+ tx->tx_exiting = 1;
+
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ cv_broadcast(&tx->tx_sync_more_cv);
+
+ while (tx->tx_threads != 0)
+ cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
+
+ tx->tx_exiting = 0;
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Get a handle on the currently open txg and keep it open.
+ *
+ * The txg is guaranteed to stay open until txg_rele_to_quiesce() is called for
+ * the handle. Once txg_rele_to_quiesce() has been called, the txg stays
+ * in quiescing state until txg_rele_to_sync() is called for the handle.
+ *
+ * It is guaranteed that subsequent calls return monotonically increasing
+ * txgs for the same dsl_pool_t. Of course this is not strong monotonicity,
+ * because the same txg can be returned multiple times in a row. This
+ * guarantee holds both for subsequent calls from one thread and for multiple
+ * threads. For example, it is impossible to observe the following sequence
+ * of events:
+ *
+ * Thread 1 Thread 2
+ *
+ * 1 <- txg_hold_open(P, ...)
+ * 2 <- txg_hold_open(P, ...)
+ * 1 <- txg_hold_open(P, ...)
+ *
+ */
+uint64_t
+txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ tx_cpu_t *tc;
+ uint64_t txg;
+
+ /*
+ * The processor id is used only as a "random" number to index
+ * into the array; there is no other significance to the chosen
+ * tx_cpu. Using the current CPU spreads concurrent holds across
+ * the per-CPU structures and so reduces lock contention.
+ */
+ tc = &tx->tx_cpu[CPU_SEQID_UNSTABLE];
+
+ mutex_enter(&tc->tc_open_lock);
+ txg = tx->tx_open_txg;
+
+ mutex_enter(&tc->tc_lock);
+ tc->tc_count[txg & TXG_MASK]++;
+ mutex_exit(&tc->tc_lock);
+
+ th->th_cpu = tc;
+ th->th_txg = txg;
+
+ return (txg);
+}
+
+void
+txg_rele_to_quiesce(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+
+ ASSERT(!MUTEX_HELD(&tc->tc_lock));
+ mutex_exit(&tc->tc_open_lock);
+}
+
+void
+txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
+{
+ tx_cpu_t *tc = th->th_cpu;
+ int g = th->th_txg & TXG_MASK;
+
+ mutex_enter(&tc->tc_lock);
+ list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
+ mutex_exit(&tc->tc_lock);
+}
+
+void
+txg_rele_to_sync(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+ int g = th->th_txg & TXG_MASK;
+
+ mutex_enter(&tc->tc_lock);
+ ASSERT(tc->tc_count[g] != 0);
+ if (--tc->tc_count[g] == 0)
+ cv_broadcast(&tc->tc_cv[g]);
+ mutex_exit(&tc->tc_lock);
+
+ th->th_cpu = NULL; /* defensive */
+}
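+
+/*
+ * Typical use of the handle protocol above (hypothetical sketch; the
+ * real callers live in dmu_tx.c and are more involved):
+ *
+ *   txg_handle_t th;
+ *   uint64_t txg = txg_hold_open(dp, &th);
+ *   txg_rele_to_quiesce(&th);     (txg may now begin quiescing)
+ *   ... apply the in-memory changes assigned to txg ...
+ *   txg_rele_to_sync(&th);        (txg may now finish quiescing)
+ */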
+
+/*
+ * Blocks until all transactions in the group are committed.
+ *
+ * On return, the transaction group has reached a stable state in which it can
+ * then be passed off to the syncing context.
+ */
+static void
+txg_quiesce(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ uint64_t tx_open_time;
+ int g = txg & TXG_MASK;
+ int c;
+
+ /*
+ * Grab all tc_open_locks so nobody else can get into this txg.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_enter(&tx->tx_cpu[c].tc_open_lock);
+
+ ASSERT(txg == tx->tx_open_txg);
+ tx->tx_open_txg++;
+ tx->tx_open_time = tx_open_time = gethrtime();
+
+ DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
+ DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
+
+ /*
+ * Now that we've incremented tx_open_txg, we can let threads
+ * enter the next transaction group.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_exit(&tx->tx_cpu[c].tc_open_lock);
+
+ spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx_open_time);
+ spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time);
+
+ /*
+ * Quiesce the transaction group by waiting for everyone to
+ * call txg_rele_to_sync() for their open transaction handles.
+ */
+ for (c = 0; c < max_ncpus; c++) {
+ tx_cpu_t *tc = &tx->tx_cpu[c];
+ mutex_enter(&tc->tc_lock);
+ while (tc->tc_count[g] != 0)
+ cv_wait(&tc->tc_cv[g], &tc->tc_lock);
+ mutex_exit(&tc->tc_lock);
+ }
+
+ spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime());
+}
+
+static void
+txg_do_callbacks(list_t *cb_list)
+{
+ dmu_tx_do_callbacks(cb_list, 0);
+
+ list_destroy(cb_list);
+
+ kmem_free(cb_list, sizeof (list_t));
+}
+
+/*
+ * Dispatch the commit callbacks registered on this txg to worker threads.
+ *
+ * If no callbacks are registered for a given TXG, nothing happens.
+ * This function creates a taskq for the associated pool, if needed.
+ */
+static void
+txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
+{
+ int c;
+ tx_state_t *tx = &dp->dp_tx;
+ list_t *cb_list;
+
+ for (c = 0; c < max_ncpus; c++) {
+ tx_cpu_t *tc = &tx->tx_cpu[c];
+ /*
+ * No need to lock tx_cpu_t at this point, since this can
+ * only be called once a txg has been synced.
+ */
+
+ int g = txg & TXG_MASK;
+
+ if (list_is_empty(&tc->tc_callbacks[g]))
+ continue;
+
+ if (tx->tx_commit_cb_taskq == NULL) {
+ /*
+ * Commit callback taskq hasn't been created yet.
+ */
+ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
+ 100, defclsyspri, boot_ncpus, boot_ncpus * 2,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+ TASKQ_THREADS_CPU_PCT);
+ }
+
+ cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(cb_list, sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
+
+ list_move_tail(cb_list, &tc->tc_callbacks[g]);
+
+ (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
+ txg_do_callbacks, cb_list, TQ_SLEEP);
+ }
+}
+
+/*
+ * Wait for pending commit callbacks of already-synced transactions to finish
+ * processing.
+ * Calling this function from within a commit callback will deadlock.
+ */
+void
+txg_wait_callbacks(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ if (tx->tx_commit_cb_taskq != NULL)
+ taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0);
+}
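+
+/*
+ * Commit callbacks are normally attached to a transaction with
+ * dmu_tx_callback_register() (see dmu_tx.h).  A hypothetical sketch of
+ * the life cycle, assuming an objset "os" and callback argument "arg":
+ *
+ *   static void
+ *   my_commit_cb(void *arg, int error)
+ *   {
+ *           ... runs from the tx_commit_cb taskq once the callback's
+ *               txg has been synced to stable storage ...
+ *   }
+ *
+ *   dmu_tx_t *tx = dmu_tx_create(os);
+ *   VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ *   dmu_tx_callback_register(tx, my_commit_cb, arg);
+ *   dmu_tx_commit(tx);
+ */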
+
+static boolean_t
+txg_is_syncing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_syncing_txg != 0);
+}
+
+static boolean_t
+txg_is_quiescing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiescing_txg != 0);
+}
+
+static boolean_t
+txg_has_quiesced_to_sync(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiesced_txg != 0);
+}
+
+static void
+txg_sync_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ spa_t *spa = dp->dp_spa;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+ clock_t start, delta;
+
+ (void) spl_fstrans_mark();
+ txg_thread_enter(tx, &cpr);
+
+ start = delta = 0;
+ for (;;) {
+ clock_t timeout = zfs_txg_timeout * hz;
+ clock_t timer;
+ uint64_t txg;
+ uint64_t dirty_min_bytes =
+ zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
+
+ /*
+ * We sync when we're scanning, when there's someone waiting
+ * on us, when the quiesce thread has handed off a txg to us,
+ * when we have reached our timeout, or when enough dirty data
+ * has accumulated.
+ */
+ timer = (delta >= timeout ? 0 : timeout - delta);
+ while (!dsl_scan_active(dp->dp_scan) &&
+ !tx->tx_exiting && timer > 0 &&
+ tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
+ !txg_has_quiesced_to_sync(dp) &&
+ dp->dp_dirty_total < dirty_min_bytes) {
+ dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
+ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+ txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
+ delta = ddi_get_lbolt() - start;
+ timer = (delta > timeout ? 0 : timeout - delta);
+ }
+
+ /*
+ * Wait until the quiesce thread hands off a txg to us,
+ * prompting it to do so if necessary.
+ */
+ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
+ if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
+ tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+ }
+
+ if (tx->tx_exiting)
+ txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
+
+ /*
+ * Consume the quiesced txg which has been handed off to
+ * us. This may cause the quiescing thread to now be
+ * able to quiesce another txg, so we must signal it.
+ */
+ ASSERT(tx->tx_quiesced_txg != 0);
+ txg = tx->tx_quiesced_txg;
+ tx->tx_quiesced_txg = 0;
+ tx->tx_syncing_txg = txg;
+ DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ mutex_exit(&tx->tx_sync_lock);
+
+ txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp);
+ start = ddi_get_lbolt();
+ spa_sync(spa, txg);
+ delta = ddi_get_lbolt() - start;
+ spa_txg_history_fini_io(spa, ts);
+
+ mutex_enter(&tx->tx_sync_lock);
+ tx->tx_synced_txg = txg;
+ tx->tx_syncing_txg = 0;
+ DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
+ cv_broadcast(&tx->tx_sync_done_cv);
+
+ /*
+ * Dispatch commit callbacks to worker threads.
+ */
+ txg_dispatch_callbacks(dp, txg);
+ }
+}
+
+static void
+txg_quiesce_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+
+ txg_thread_enter(tx, &cpr);
+
+ for (;;) {
+ uint64_t txg;
+
+ /*
+ * We quiesce when there's someone waiting on us.
+ * However, we can only have one txg in "quiescing" or
+ * "quiesced, waiting to sync" state. So we wait until
+ * the "quiesced, waiting to sync" txg has been consumed
+ * by the sync thread.
+ */
+ while (!tx->tx_exiting &&
+ (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
+ txg_has_quiesced_to_sync(dp)))
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
+
+ if (tx->tx_exiting)
+ txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
+
+ txg = tx->tx_open_txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting,
+ tx->tx_sync_txg_waiting);
+ tx->tx_quiescing_txg = txg;
+
+ mutex_exit(&tx->tx_sync_lock);
+ txg_quiesce(dp, txg);
+ mutex_enter(&tx->tx_sync_lock);
+
+ /*
+ * Hand this txg off to the sync thread.
+ */
+ dprintf("quiesce done, handing off txg %llu\n", txg);
+ tx->tx_quiescing_txg = 0;
+ tx->tx_quiesced_txg = txg;
+ DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
+ cv_broadcast(&tx->tx_sync_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ }
+}
+
+/*
+ * Delay this thread by 'delay' nanoseconds if we are still in the open
+ * transaction group and there is already a waiting txg quiescing or quiesced.
+ * Abort the delay if this txg stalls or enters the quiescing state.
+ */
+void
+txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ hrtime_t start = gethrtime();
+
+ /* don't delay if this txg could transition to quiescing immediately */
+ if (tx->tx_open_txg > txg ||
+ tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
+ return;
+
+ mutex_enter(&tx->tx_sync_lock);
+ if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
+ mutex_exit(&tx->tx_sync_lock);
+ return;
+ }
+
+ while (gethrtime() - start < delay &&
+ tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
+ (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
+ &tx->tx_sync_lock, delay, resolution, 0);
+ }
+
+ DMU_TX_STAT_BUMP(dmu_tx_delay);
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+static boolean_t
+txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ ASSERT(!dsl_pool_config_held(dp));
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT3U(tx->tx_threads, ==, 2);
+ if (txg == 0)
+ txg = tx->tx_open_txg + TXG_DEFER_SIZE;
+ if (tx->tx_sync_txg_waiting < txg)
+ tx->tx_sync_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_synced_txg < txg) {
+ dprintf("broadcasting sync more "
+ "tx_synced=%llu waiting=%llu dp=%px\n",
+ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+ cv_broadcast(&tx->tx_sync_more_cv);
+ if (wait_sig) {
+ /*
+ * Condition wait here but stop if the thread receives a
+ * signal. The caller may call txg_wait_synced*() again
+ * to resume waiting for this txg.
+ */
+ if (cv_wait_io_sig(&tx->tx_sync_done_cv,
+ &tx->tx_sync_lock) == 0) {
+ mutex_exit(&tx->tx_sync_lock);
+ return (B_TRUE);
+ }
+ } else {
+ cv_wait_io(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
+ }
+ }
+ mutex_exit(&tx->tx_sync_lock);
+ return (B_FALSE);
+}
+
+void
+txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
+{
+ VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE));
+}
+
+/*
+ * Similar to txg_wait_synced(), but it can be interrupted by a signal.
+ * Returns B_TRUE if the thread was signaled while waiting.
+ */
+boolean_t
+txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg)
+{
+ return (txg_wait_synced_impl(dp, txg, B_TRUE));
+}
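+
+/*
+ * A common durability pattern built on the waiters above (hypothetical
+ * sketch; "tx" is an assigned dmu_tx_t):
+ *
+ *   uint64_t txg = dmu_tx_get_txg(tx);
+ *   dmu_tx_commit(tx);
+ *   txg_wait_synced(dp, txg);     (block until the change is on disk)
+ *
+ * Passing txg == 0 instead waits for the currently open txg plus
+ * TXG_DEFER_SIZE txgs to sync, which covers everything that was dirty
+ * at the time of the call.
+ */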
+
+/*
+ * Wait for the specified open transaction group. Set should_quiesce
+ * when the current open txg should be quiesced immediately.
+ */
+void
+txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ ASSERT(!dsl_pool_config_held(dp));
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT3U(tx->tx_threads, ==, 2);
+ if (txg == 0)
+ txg = tx->tx_open_txg + 1;
+ if (tx->tx_quiesce_txg_waiting < txg && should_quiesce)
+ tx->tx_quiesce_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_open_txg < txg) {
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ /*
+ * Callers setting should_quiesce will use cv_wait_io() and
+ * be accounted for as iowait time. Otherwise, the caller is
+ * understood to be idle and cv_wait_idle() is used to prevent
+ * incorrectly inflating the system load average.
+ */
+ if (should_quiesce == B_TRUE) {
+ cv_wait_io(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+ } else {
+ cv_wait_idle(&tx->tx_quiesce_done_cv,
+ &tx->tx_sync_lock);
+ }
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * If there isn't a txg syncing or in the pipeline, push another txg through
+ * the pipeline by quiescing the open txg.
+ */
+void
+txg_kick(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ ASSERT(!dsl_pool_config_held(dp));
+
+ mutex_enter(&tx->tx_sync_lock);
+ if (!txg_is_syncing(dp) &&
+ !txg_is_quiescing(dp) &&
+ tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
+ tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
+ tx->tx_quiesced_txg <= tx->tx_synced_txg) {
+ tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+boolean_t
+txg_stalled(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
+}
+
+boolean_t
+txg_sync_waiting(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
+ tx->tx_quiesced_txg != 0);
+}
+
+/*
+ * Verify that this txg is active (open, quiescing, syncing). Non-active
+ * txgs should not be manipulated.
+ */
+#ifdef ZFS_DEBUG
+void
+txg_verify(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp __maybe_unused = spa_get_dsl(spa);
+ if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
+ return;
+ ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
+ ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
+ ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
+}
+#endif
+
+/*
+ * Per-txg object lists.
+ */
+void
+txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
+{
+ int t;
+
+ mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ tl->tl_offset = offset;
+ tl->tl_spa = spa;
+
+ for (t = 0; t < TXG_SIZE; t++)
+ tl->tl_head[t] = NULL;
+}
+
+static boolean_t
+txg_list_empty_impl(txg_list_t *tl, uint64_t txg)
+{
+ ASSERT(MUTEX_HELD(&tl->tl_lock));
+ TXG_VERIFY(tl->tl_spa, txg);
+ return (tl->tl_head[txg & TXG_MASK] == NULL);
+}
+
+boolean_t
+txg_list_empty(txg_list_t *tl, uint64_t txg)
+{
+ mutex_enter(&tl->tl_lock);
+ boolean_t ret = txg_list_empty_impl(tl, txg);
+ mutex_exit(&tl->tl_lock);
+
+ return (ret);
+}
+
+void
+txg_list_destroy(txg_list_t *tl)
+{
+ int t;
+
+ mutex_enter(&tl->tl_lock);
+ for (t = 0; t < TXG_SIZE; t++)
+ ASSERT(txg_list_empty_impl(tl, t));
+ mutex_exit(&tl->tl_lock);
+
+ mutex_destroy(&tl->tl_lock);
+}
+
+/*
+ * Returns true if all txg lists are empty.
+ *
+ * Warning: this is inherently racy (an item could be added immediately
+ * after this function returns).
+ */
+boolean_t
+txg_all_lists_empty(txg_list_t *tl)
+{
+ mutex_enter(&tl->tl_lock);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ if (!txg_list_empty_impl(tl, i)) {
+ mutex_exit(&tl->tl_lock);
+ return (B_FALSE);
+ }
+ }
+ mutex_exit(&tl->tl_lock);
+ return (B_TRUE);
+}
+
+/*
+ * Add an entry to the list (unless it's already on the list).
+ * Returns B_TRUE if it was actually added.
+ */
+boolean_t
+txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ boolean_t add;
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+ add = (tn->tn_member[t] == 0);
+ if (add) {
+ tn->tn_member[t] = 1;
+ tn->tn_next[t] = tl->tl_head[t];
+ tl->tl_head[t] = tn;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (add);
+}
+
+/*
+ * Add an entry to the end of the list, unless it's already on the list.
+ * (walks list to find end)
+ * Returns B_TRUE if it was actually added.
+ */
+boolean_t
+txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ boolean_t add;
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+ add = (tn->tn_member[t] == 0);
+ if (add) {
+ txg_node_t **tp;
+
+ for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
+ continue;
+
+ tn->tn_member[t] = 1;
+ tn->tn_next[t] = NULL;
+ *tp = tn;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (add);
+}
+
+/*
+ * Remove the head of the list and return it.
+ */
+void *
+txg_list_remove(txg_list_t *tl, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn;
+ void *p = NULL;
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+ if ((tn = tl->tl_head[t]) != NULL) {
+ ASSERT(tn->tn_member[t]);
+ ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
+ p = (char *)tn - tl->tl_offset;
+ tl->tl_head[t] = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (p);
+}
+
+/*
+ * Remove a specific item from the list and return it.
+ */
+void *
+txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn, **tp;
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+
+ for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
+ if ((char *)tn - tl->tl_offset == p) {
+ *tp = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ mutex_exit(&tl->tl_lock);
+ return (p);
+ }
+ }
+
+ mutex_exit(&tl->tl_lock);
+
+ return (NULL);
+}
+
+boolean_t
+txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ return (tn->tn_member[t] != 0);
+}
+
+/*
+ * Walk a txg list
+ */
+void *
+txg_list_head(txg_list_t *tl, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn;
+
+ mutex_enter(&tl->tl_lock);
+ tn = tl->tl_head[t];
+ mutex_exit(&tl->tl_lock);
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
+
+void *
+txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+ TXG_VERIFY(tl->tl_spa, txg);
+
+ mutex_enter(&tl->tl_lock);
+ tn = tn->tn_next[t];
+ mutex_exit(&tl->tl_lock);
+
+ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
+
+EXPORT_SYMBOL(txg_init);
+EXPORT_SYMBOL(txg_fini);
+EXPORT_SYMBOL(txg_sync_start);
+EXPORT_SYMBOL(txg_sync_stop);
+EXPORT_SYMBOL(txg_hold_open);
+EXPORT_SYMBOL(txg_rele_to_quiesce);
+EXPORT_SYMBOL(txg_rele_to_sync);
+EXPORT_SYMBOL(txg_register_callbacks);
+EXPORT_SYMBOL(txg_delay);
+EXPORT_SYMBOL(txg_wait_synced);
+EXPORT_SYMBOL(txg_wait_open);
+EXPORT_SYMBOL(txg_wait_callbacks);
+EXPORT_SYMBOL(txg_stalled);
+EXPORT_SYMBOL(txg_sync_waiting);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, INT, ZMOD_RW,
+ "Max seconds worth of delta per txg");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/uberblock.c b/sys/contrib/openzfs/module/zfs/uberblock.c
new file mode 100644
index 000000000000..b8857d74d810
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/uberblock.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/uberblock_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/mmp.h>
+
+int
+uberblock_verify(uberblock_t *ub)
+{
+ if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
+ byteswap_uint64_array(ub, sizeof (uberblock_t));
+
+ if (ub->ub_magic != UBERBLOCK_MAGIC)
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
+/*
+ * Update the uberblock and return TRUE if anything changed in this
+ * transaction group.
+ */
+boolean_t
+uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay)
+{
+ ASSERT(ub->ub_txg < txg);
+
+ /*
+ * We explicitly do not set ub_version here, so that older versions
+ * continue to be written with the previous uberblock version.
+ */
+ ub->ub_magic = UBERBLOCK_MAGIC;
+ ub->ub_txg = txg;
+ ub->ub_guid_sum = rvd->vdev_guid_sum;
+ ub->ub_timestamp = gethrestime_sec();
+ ub->ub_software_version = SPA_VERSION;
+ ub->ub_mmp_magic = MMP_MAGIC;
+ if (spa_multihost(rvd->vdev_spa)) {
+ ub->ub_mmp_delay = mmp_delay;
+ ub->ub_mmp_config = MMP_SEQ_SET(0) |
+ MMP_INTERVAL_SET(zfs_multihost_interval) |
+ MMP_FAIL_INT_SET(zfs_multihost_fail_intervals);
+ } else {
+ ub->ub_mmp_delay = 0;
+ ub->ub_mmp_config = 0;
+ }
+ ub->ub_checkpoint_txg = 0;
+
+ return (ub->ub_rootbp.blk_birth == txg);
+}
diff --git a/sys/contrib/openzfs/module/zfs/unique.c b/sys/contrib/openzfs/module/zfs/unique.c
new file mode 100644
index 000000000000..0e076797a002
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/unique.c
@@ -0,0 +1,112 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+
+static avl_tree_t unique_avl;
+static kmutex_t unique_mtx;
+
+typedef struct unique {
+ avl_node_t un_link;
+ uint64_t un_value;
+} unique_t;
+
+#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
+
+static int
+unique_compare(const void *a, const void *b)
+{
+ const unique_t *una = (const unique_t *)a;
+ const unique_t *unb = (const unique_t *)b;
+
+ return (TREE_CMP(una->un_value, unb->un_value));
+}
+
+void
+unique_init(void)
+{
+ avl_create(&unique_avl, unique_compare,
+ sizeof (unique_t), offsetof(unique_t, un_link));
+ mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+unique_fini(void)
+{
+ avl_destroy(&unique_avl);
+ mutex_destroy(&unique_mtx);
+}
+
+uint64_t
+unique_create(void)
+{
+ uint64_t value = unique_insert(0);
+ unique_remove(value);
+ return (value);
+}
+
+uint64_t
+unique_insert(uint64_t value)
+{
+ avl_index_t idx;
+ unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
+
+ un->un_value = value;
+
+ mutex_enter(&unique_mtx);
+ while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
+ avl_find(&unique_avl, un, &idx)) {
+ mutex_exit(&unique_mtx);
+ (void) random_get_pseudo_bytes((void*)&un->un_value,
+ sizeof (un->un_value));
+ un->un_value &= UNIQUE_MASK;
+ mutex_enter(&unique_mtx);
+ }
+
+ avl_insert(&unique_avl, un, idx);
+ mutex_exit(&unique_mtx);
+
+ return (un->un_value);
+}
+
+void
+unique_remove(uint64_t value)
+{
+ unique_t un_tofind;
+ unique_t *un;
+
+ un_tofind.un_value = value;
+ mutex_enter(&unique_mtx);
+ un = avl_find(&unique_avl, &un_tofind, NULL);
+ if (un != NULL) {
+ avl_remove(&unique_avl, un);
+ kmem_free(un, sizeof (unique_t));
+ }
+ mutex_exit(&unique_mtx);
+}
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
new file mode 100644
index 000000000000..36001e0a6626
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -0,0 +1,5420 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Datto Inc. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/vdev_draid.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/space_reftree.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/zil.h>
+#include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
+#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
+#include <sys/zvol.h>
+#include <sys/zfs_ratelimit.h>
+
+/*
+ * One metaslab from each (normal-class) vdev is used by the ZIL. These are
+ * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
+ * part of the spa_embedded_log_class. The metaslab with the most free space
+ * in each vdev is selected for this purpose when the pool is opened (or a
+ * vdev is added). See vdev_metaslab_init().
+ *
+ * Log blocks can be allocated from the following locations. Each one is tried
+ * in order until the allocation succeeds:
+ * 1. dedicated log vdevs, aka "slog" (spa_log_class)
+ * 2. embedded slog metaslabs (spa_embedded_log_class)
+ * 3. other metaslabs in normal vdevs (spa_normal_class)
+ *
+ * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
+ * than this number of metaslabs in the vdev. This ensures that we don't set
+ * aside an unreasonable amount of space for the ZIL. If set to less than
+ * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
+ * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
+ */
+int zfs_embedded_slog_min_ms = 64;
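+
+/*
+ * Illustrative sketch of the fallback order above (hypothetical; the
+ * real selection is made by the ZIL block allocator, zio_alloc_zil()):
+ *
+ *   metaslab_class_t *mc = spa_log_class(spa);          (1. slog)
+ *   if the allocation fails:
+ *           mc = spa_embedded_log_class(spa);           (2. embedded slog)
+ *   if the allocation fails:
+ *           mc = spa_normal_class(spa);                 (3. normal)
+ */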
+
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
+
+/* minimum number of metaslabs per top-level vdev */
+int zfs_vdev_min_ms_count = 16;
+
+/* practical upper limit of total metaslabs per top-level vdev */
+int zfs_vdev_ms_count_limit = 1ULL << 17;
+
+/* lower limit for metaslab size (512M) */
+int zfs_vdev_default_ms_shift = 29;
+
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
+
+int vdev_validate_skip = B_FALSE;
+
+/*
+ * Since the DTL space map of a vdev is not expected to have a lot of
+ * entries, we default its block size to 4K.
+ */
+int zfs_vdev_dtl_sm_blksz = (1 << 12);
+
+/*
+ * Rate limit slow IO (delay) events to this many per second.
+ */
+unsigned int zfs_slow_io_events_per_second = 20;
+
+/*
+ * Rate limit checksum events after this many checksum errors per second.
+ */
+unsigned int zfs_checksum_events_per_second = 20;
+
+/*
+ * Ignore errors during scrub/resilver. Allows a resilver that is triggered
+ * upon import to make progress when there are pool errors.
+ */
+int zfs_scan_ignore_errors = 0;
+
+/*
+ * vdev-wide space maps that have lots of entries written to them at
+ * the end of each transaction can benefit from a higher I/O bandwidth
+ * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
+ */
+int zfs_vdev_standard_sm_blksz = (1 << 17);
+
+/*
+ * Tunable parameter for debugging or performance analysis. Setting this
+ * will cause pool corruption on power loss if a volatile out-of-order
+ * write cache is enabled.
+ */
+int zfs_nocacheflush = 0;
+
+uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX;
+uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
+
+/*PRINTFLIKE2*/
+void
+vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
+{
+ va_list adx;
+ char buf[256];
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ if (vd->vdev_path != NULL) {
+ zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
+ vd->vdev_path, buf);
+ } else {
+ zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
+ vd->vdev_ops->vdev_op_type,
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)vd->vdev_guid, buf);
+ }
+}
+
+void
+vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
+{
+ char state[20];
+
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
+ zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
+ vd->vdev_ops->vdev_op_type);
+ return;
+ }
+
+ switch (vd->vdev_state) {
+ case VDEV_STATE_UNKNOWN:
+ (void) snprintf(state, sizeof (state), "unknown");
+ break;
+ case VDEV_STATE_CLOSED:
+ (void) snprintf(state, sizeof (state), "closed");
+ break;
+ case VDEV_STATE_OFFLINE:
+ (void) snprintf(state, sizeof (state), "offline");
+ break;
+ case VDEV_STATE_REMOVED:
+ (void) snprintf(state, sizeof (state), "removed");
+ break;
+ case VDEV_STATE_CANT_OPEN:
+ (void) snprintf(state, sizeof (state), "can't open");
+ break;
+ case VDEV_STATE_FAULTED:
+ (void) snprintf(state, sizeof (state), "faulted");
+ break;
+ case VDEV_STATE_DEGRADED:
+ (void) snprintf(state, sizeof (state), "degraded");
+ break;
+ case VDEV_STATE_HEALTHY:
+ (void) snprintf(state, sizeof (state), "healthy");
+ break;
+ default:
+ (void) snprintf(state, sizeof (state), "<state %u>",
+ (uint_t)vd->vdev_state);
+ }
+
+ zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
+ "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
+ vd->vdev_islog ? " (log)" : "",
+ (u_longlong_t)vd->vdev_guid,
+ vd->vdev_path ? vd->vdev_path : "N/A", state);
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
+}
+
+/*
+ * Virtual device management.
+ */
+
+static vdev_ops_t *vdev_ops_table[] = {
+ &vdev_root_ops,
+ &vdev_raidz_ops,
+ &vdev_draid_ops,
+ &vdev_draid_spare_ops,
+ &vdev_mirror_ops,
+ &vdev_replacing_ops,
+ &vdev_spare_ops,
+ &vdev_disk_ops,
+ &vdev_file_ops,
+ &vdev_missing_ops,
+ &vdev_hole_ops,
+ &vdev_indirect_ops,
+ NULL
+};
+
+/*
+ * Given a vdev type, return the appropriate ops vector.
+ */
+static vdev_ops_t *
+vdev_getops(const char *type)
+{
+ vdev_ops_t *ops, **opspp;
+
+ for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
+ if (strcmp(ops->vdev_op_type, type) == 0)
+ break;
+
+ return (ops);
+}
+
+/*
+ * Given a vdev and a metaslab class, find which metaslab group we're
+ * interested in. A vdev may belong to two different metaslab classes.
+ * Dedicated slog devices use only the primary metaslab group, rather than a
+ * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
+ */
+metaslab_group_t *
+vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
+{
+ if (mc == spa_embedded_log_class(vd->vdev_spa) &&
+ vd->vdev_log_mg != NULL)
+ return (vd->vdev_log_mg);
+ else
+ return (vd->vdev_mg);
+}
+
+/* ARGSUSED */
+void
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+{
+ physical_rs->rs_start = logical_rs->rs_start;
+ physical_rs->rs_end = logical_rs->rs_end;
+}
+
+/*
+ * Derive the enumerated allocation bias from string input.
+ * String origin is either the per-vdev zap or zpool(8).
+ */
+static vdev_alloc_bias_t
+vdev_derive_alloc_bias(const char *bias)
+{
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+
+ if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
+ alloc_bias = VDEV_BIAS_LOG;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
+ alloc_bias = VDEV_BIAS_SPECIAL;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
+ alloc_bias = VDEV_BIAS_DEDUP;
+
+ return (alloc_bias);
+}
+
+/*
+ * Default asize function: return the MAX of psize with the asize of
+ * all children. This is what's used by anything other than RAID-Z.
+ */
+uint64_t
+vdev_default_asize(vdev_t *vd, uint64_t psize)
+{
+ uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
+ uint64_t csize;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+ asize = MAX(asize, csize);
+ }
+
+ return (asize);
+}
+
+uint64_t
+vdev_default_min_asize(vdev_t *vd)
+{
+ return (vd->vdev_min_asize);
+}
+
+/*
+ * Get the minimum allocatable size. We define the allocatable size as
+ * the vdev's asize rounded to the nearest metaslab. This allows us to
+ * replace or attach devices which don't have the same physical size but
+ * can still satisfy the same number of allocations.
+ */
+uint64_t
+vdev_get_min_asize(vdev_t *vd)
+{
+ vdev_t *pvd = vd->vdev_parent;
+
+ /*
+ * If our parent is NULL (inactive spare or cache) or is the root,
+ * just return our own asize.
+ */
+ if (pvd == NULL)
+ return (vd->vdev_asize);
+
+ /*
+ * The top-level vdev just returns the allocatable size rounded
+ * to the nearest metaslab.
+ */
+ if (vd == vd->vdev_top)
+ return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
+
+ return (pvd->vdev_ops->vdev_op_min_asize(pvd));
+}
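+
+/*
+ * Example with hypothetical numbers: a top-level vdev with a
+ * vdev_ms_shift of 30 (1 GiB metaslabs) and an asize of 931.5 GiB has
+ * a minimum allocatable size of P2ALIGN(931.5 GiB, 1 GiB) = 931 GiB,
+ * so a replacement device only needs to cover 931 full metaslabs
+ * rather than match the original size exactly.
+ */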
+
+void
+vdev_set_min_asize(vdev_t *vd)
+{
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_set_min_asize(vd->vdev_child[c]);
+}
+
+/*
+ * Get the minimal allocation size for the top-level vdev.
+ */
+uint64_t
+vdev_get_min_alloc(vdev_t *vd)
+{
+ uint64_t min_alloc = 1ULL << vd->vdev_ashift;
+
+ if (vd->vdev_ops->vdev_op_min_alloc != NULL)
+ min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
+
+ return (min_alloc);
+}
+
+/*
+ * Get the parity level for a top-level vdev.
+ */
+uint64_t
+vdev_get_nparity(vdev_t *vd)
+{
+ uint64_t nparity = 0;
+
+ if (vd->vdev_ops->vdev_op_nparity != NULL)
+ nparity = vd->vdev_ops->vdev_op_nparity(vd);
+
+ return (nparity);
+}
+
+/*
+ * Get the number of data disks for a top-level vdev.
+ */
+uint64_t
+vdev_get_ndisks(vdev_t *vd)
+{
+ uint64_t ndisks = 1;
+
+ if (vd->vdev_ops->vdev_op_ndisks != NULL)
+ ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
+
+ return (ndisks);
+}
+
+vdev_t *
+vdev_lookup_top(spa_t *spa, uint64_t vdev)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ if (vdev < rvd->vdev_children) {
+ ASSERT(rvd->vdev_child[vdev] != NULL);
+ return (rvd->vdev_child[vdev]);
+ }
+
+ return (NULL);
+}
+
+vdev_t *
+vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
+{
+ vdev_t *mvd;
+
+ if (vd->vdev_guid == guid)
+ return (vd);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
+ NULL)
+ return (mvd);
+
+ return (NULL);
+}
+
+static int
+vdev_count_leaves_impl(vdev_t *vd)
+{
+ int n = 0;
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ return (1);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ n += vdev_count_leaves_impl(vd->vdev_child[c]);
+
+ return (n);
+}
+
+int
+vdev_count_leaves(spa_t *spa)
+{
+ int rc;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ rc = vdev_count_leaves_impl(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ return (rc);
+}
+
+void
+vdev_add_child(vdev_t *pvd, vdev_t *cvd)
+{
+ size_t oldsize, newsize;
+ uint64_t id = cvd->vdev_id;
+ vdev_t **newchild;
+
+ ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(cvd->vdev_parent == NULL);
+
+ cvd->vdev_parent = pvd;
+
+ if (pvd == NULL)
+ return;
+
+ ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
+
+ oldsize = pvd->vdev_children * sizeof (vdev_t *);
+ pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
+ newsize = pvd->vdev_children * sizeof (vdev_t *);
+
+ newchild = kmem_alloc(newsize, KM_SLEEP);
+ if (pvd->vdev_child != NULL) {
+ bcopy(pvd->vdev_child, newchild, oldsize);
+ kmem_free(pvd->vdev_child, oldsize);
+ }
+
+ pvd->vdev_child = newchild;
+ pvd->vdev_child[id] = cvd;
+
+ cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
+ ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
+
+ /*
+ * Walk up all ancestors to update guid sum.
+ */
+ for (; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum += cvd->vdev_guid_sum;
+
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
+ cvd->vdev_spa->spa_leaf_list_gen++;
+ }
+}
+
+void
+vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
+{
+ int c;
+ uint_t id = cvd->vdev_id;
+
+ ASSERT(cvd->vdev_parent == pvd);
+
+ if (pvd == NULL)
+ return;
+
+ ASSERT(id < pvd->vdev_children);
+ ASSERT(pvd->vdev_child[id] == cvd);
+
+ pvd->vdev_child[id] = NULL;
+ cvd->vdev_parent = NULL;
+
+ for (c = 0; c < pvd->vdev_children; c++)
+ if (pvd->vdev_child[c])
+ break;
+
+ if (c == pvd->vdev_children) {
+ kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
+ pvd->vdev_child = NULL;
+ pvd->vdev_children = 0;
+ }
+
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ spa_t *spa = cvd->vdev_spa;
+ list_remove(&spa->spa_leaf_list, cvd);
+ spa->spa_leaf_list_gen++;
+ }
+
+ /*
+ * Walk up all ancestors to update guid sum.
+ */
+ for (; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
+}
+
+/*
+ * Remove any holes in the child array.
+ */
+void
+vdev_compact_children(vdev_t *pvd)
+{
+ vdev_t **newchild, *cvd;
+ int oldc = pvd->vdev_children;
+ int newc;
+
+ ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if (oldc == 0)
+ return;
+
+ for (int c = newc = 0; c < oldc; c++)
+ if (pvd->vdev_child[c])
+ newc++;
+
+ if (newc > 0) {
+ newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP);
+
+ for (int c = newc = 0; c < oldc; c++) {
+ if ((cvd = pvd->vdev_child[c]) != NULL) {
+ newchild[newc] = cvd;
+ cvd->vdev_id = newc++;
+ }
+ }
+ } else {
+ newchild = NULL;
+ }
+
+ kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
+ pvd->vdev_child = newchild;
+ pvd->vdev_children = newc;
+}
+
+/*
+ * Allocate and minimally initialize a vdev_t.
+ */
+vdev_t *
+vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
+{
+ vdev_t *vd;
+ vdev_indirect_config_t *vic;
+
+ vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+ vic = &vd->vdev_indirect_config;
+
+ if (spa->spa_root_vdev == NULL) {
+ ASSERT(ops == &vdev_root_ops);
+ spa->spa_root_vdev = vd;
+ spa->spa_load_guid = spa_generate_guid(NULL);
+ }
+
+ if (guid == 0 && ops != &vdev_hole_ops) {
+ if (spa->spa_root_vdev == vd) {
+ /*
+ * The root vdev's guid will also be the pool guid,
+ * which must be unique among all pools.
+ */
+ guid = spa_generate_guid(NULL);
+ } else {
+ /*
+ * Any other vdev's guid must be unique within the pool.
+ */
+ guid = spa_generate_guid(spa);
+ }
+ ASSERT(!spa_guid_exists(spa_guid(spa), guid));
+ }
+
+ vd->vdev_spa = spa;
+ vd->vdev_id = id;
+ vd->vdev_guid = guid;
+ vd->vdev_guid_sum = guid;
+ vd->vdev_ops = ops;
+ vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_ishole = (ops == &vdev_hole_ops);
+ vic->vic_prev_indirect_vdev = UINT64_MAX;
+
+ rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
+ vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
+
+ /*
+ * Initialize rate limit structs for events. We rate limit ZIO delay
+ * and checksum events so that we don't overwhelm ZED with thousands
+ * of events when a disk is acting up.
+ */
+ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
+ 1);
+ zfs_ratelimit_init(&vd->vdev_checksum_rl,
+ &zfs_checksum_events_per_second, 1);
+
+ list_link_init(&vd->vdev_config_dirty_node);
+ list_link_init(&vd->vdev_state_dirty_node);
+ list_link_init(&vd->vdev_initialize_node);
+ list_link_init(&vd->vdev_leaf_node);
+ list_link_init(&vd->vdev_trim_node);
+
+ mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
+ mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
+
+ for (int t = 0; t < DTL_TYPES; t++) {
+ vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
+ 0);
+ }
+
+ txg_list_create(&vd->vdev_ms_list, spa,
+ offsetof(struct metaslab, ms_txg_node));
+ txg_list_create(&vd->vdev_dtl_list, spa,
+ offsetof(struct vdev, vdev_dtl_node));
+ vd->vdev_stat.vs_timestamp = gethrtime();
+ vdev_queue_init(vd);
+ vdev_cache_init(vd);
+
+ return (vd);
+}
+
+/*
+ * Allocate a new vdev. The 'alloctype' is used to control whether we are
+ * creating a new vdev or loading an existing one - the behavior is slightly
+ * different for each case.
+ */
+int
+vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
+ int alloctype)
+{
+ vdev_ops_t *ops;
+ char *type;
+ uint64_t guid = 0, islog;
+ vdev_t *vd;
+ vdev_indirect_config_t *vic;
+ char *tmp = NULL;
+ int rc;
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+ boolean_t top_level = (parent && !parent->vdev_parent);
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((ops = vdev_getops(type)) == NULL)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * If this is a load, get the vdev guid from the nvlist.
+ * Otherwise, vdev_alloc_common() will generate one for us.
+ */
+ if (alloctype == VDEV_ALLOC_LOAD) {
+ uint64_t label_id;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
+ label_id != id)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ } else if (alloctype == VDEV_ALLOC_SPARE) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ } else if (alloctype == VDEV_ALLOC_L2CACHE) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * The first allocated vdev must be of type 'root'.
+ */
+ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Determine whether we're a log vdev.
+ */
+ islog = 0;
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
+ if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
+ return (SET_ERROR(ENOTSUP));
+
+ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
+ return (SET_ERROR(ENOTSUP));
+
+ if (top_level && alloctype == VDEV_ALLOC_ADD) {
+ char *bias;
+
+ /*
+ * If creating a top-level vdev, check for allocation
+ * classes input.
+ */
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ &bias) == 0) {
+ alloc_bias = vdev_derive_alloc_bias(bias);
+
+ /* spa_vdev_add() expects feature to be enabled */
+ if (spa->spa_load_state != SPA_LOAD_CREATE &&
+ !spa_feature_is_enabled(spa,
+ SPA_FEATURE_ALLOCATION_CLASSES)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ }
+
+ /* spa_vdev_add() expects feature to be enabled */
+ if (ops == &vdev_draid_ops &&
+ spa->spa_load_state != SPA_LOAD_CREATE &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ }
+
+ /*
+ * Initialize the vdev specific data. This is done before calling
+ * vdev_alloc_common() since it may fail and this simplifies the
+ * error reporting and cleanup code paths.
+ */
+ void *tsd = NULL;
+ if (ops->vdev_op_init != NULL) {
+ rc = ops->vdev_op_init(spa, nv, &tsd);
+ if (rc != 0) {
+ return (rc);
+ }
+ }
+
+ vd = vdev_alloc_common(spa, id, guid, ops);
+ vd->vdev_tsd = tsd;
+ vd->vdev_islog = islog;
+
+ if (top_level && alloc_bias != VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = alloc_bias;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
+ vd->vdev_path = spa_strdup(vd->vdev_path);
+
+ /*
+ * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
+ * fault on a vdev and want it to persist across imports (like with
+ * zpool offline -f).
+ */
+ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
+ if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
+ vd->vdev_faulted = 1;
+ vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
+ }
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
+ vd->vdev_devid = spa_strdup(vd->vdev_devid);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ &vd->vdev_physpath) == 0)
+ vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+ &vd->vdev_enc_sysfs_path) == 0)
+ vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path);
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
+ vd->vdev_fru = spa_strdup(vd->vdev_fru);
+
+ /*
+ * Set the whole_disk property. If it's not specified, leave the value
+ * as -1.
+ */
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ &vd->vdev_wholedisk) != 0)
+ vd->vdev_wholedisk = -1ULL;
+
+ vic = &vd->vdev_indirect_config;
+
+ ASSERT0(vic->vic_mapping_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
+ &vic->vic_mapping_object);
+ ASSERT0(vic->vic_births_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
+ &vic->vic_births_object);
+ ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
+ &vic->vic_prev_indirect_vdev);
+
+ /*
+ * Look for the 'not present' flag. This will only be set if the device
+ * was not present at the time of import.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &vd->vdev_not_present);
+
+ /*
+ * Get the alignment requirement.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+
+ /*
+ * Retrieve the vdev creation time.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ &vd->vdev_crtxg);
+
+ /*
+ * If we're a top-level vdev, try to load the allocation parameters.
+ */
+ if (top_level &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ &vd->vdev_ms_array);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ &vd->vdev_ms_shift);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ &vd->vdev_asize);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ &vd->vdev_removing);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
+ &vd->vdev_top_zap);
+ } else {
+ ASSERT0(vd->vdev_top_zap);
+ }
+
+ if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
+ ASSERT(alloctype == VDEV_ALLOC_LOAD ||
+ alloctype == VDEV_ALLOC_ADD ||
+ alloctype == VDEV_ALLOC_SPLIT ||
+ alloctype == VDEV_ALLOC_ROOTPOOL);
+ /* Note: metaslab_group_create() is now deferred */
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
+ (void) nvlist_lookup_uint64(nv,
+ ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
+ } else {
+ ASSERT0(vd->vdev_leaf_zap);
+ }
+
+ /*
+ * If we're a leaf vdev, try to load the DTL object and other state.
+ */
+
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
+ alloctype == VDEV_ALLOC_ROOTPOOL)) {
+ if (alloctype == VDEV_ALLOC_LOAD) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
+ &vd->vdev_dtl_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
+ &vd->vdev_unspare);
+ }
+
+ if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ uint64_t spare = 0;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
+ &spare) == 0 && spare)
+ spa_spare_add(vd);
+ }
+
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
+ &vd->vdev_offline);
+
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+ &vd->vdev_resilver_txg);
+
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
+ &vd->vdev_rebuild_txg);
+
+ if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
+ vdev_defer_resilver(vd);
+
+ /*
+ * In general, when importing a pool we want to ignore the
+ * persistent fault state, as the diagnosis made on another
+ * system may not be valid in the current context. The only
+ * exception is if we forced a vdev to a persistently faulted
+ * state with 'zpool offline -f'. The persistent fault will
+ * remain across imports until cleared.
+ *
+ * Local vdevs will remain in the faulted state.
+ */
+ if (spa_load_state(spa) == SPA_LOAD_OPEN ||
+ spa_load_state(spa) == SPA_LOAD_IMPORT) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
+ &vd->vdev_faulted);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
+ &vd->vdev_degraded);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
+ &vd->vdev_removed);
+
+ if (vd->vdev_faulted || vd->vdev_degraded) {
+ char *aux;
+
+ vd->vdev_label_aux =
+ VDEV_AUX_ERR_EXCEEDED;
+ if (nvlist_lookup_string(nv,
+ ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
+ strcmp(aux, "external") == 0)
+ vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
+ else
+ vd->vdev_faulted = 0ULL;
+ }
+ }
+ }
+
+ /*
+ * Add ourselves to the parent's list of children.
+ */
+ vdev_add_child(parent, vd);
+
+ *vdp = vd;
+
+ return (0);
+}
+
+void
+vdev_free(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
+ ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
+
+ /*
+ * Scan queues are normally destroyed at the end of a scan. If the
+ * queue exists here, that implies the vdev is being removed while
+ * the scan is still running.
+ */
+ if (vd->vdev_scan_io_queue != NULL) {
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
+ vd->vdev_scan_io_queue = NULL;
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ /*
+ * vdev_free() implies closing the vdev first. This is simpler than
+ * trying to ensure complicated semantics for all callers.
+ */
+ vdev_close(vd);
+
+ ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
+
+ /*
+ * Free all children.
+ */
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_free(vd->vdev_child[c]);
+
+ ASSERT(vd->vdev_child == NULL);
+ ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+
+ if (vd->vdev_ops->vdev_op_fini != NULL)
+ vd->vdev_ops->vdev_op_fini(vd);
+
+ /*
+ * Discard allocation state.
+ */
+ if (vd->vdev_mg != NULL) {
+ vdev_metaslab_fini(vd);
+ metaslab_group_destroy(vd->vdev_mg);
+ vd->vdev_mg = NULL;
+ }
+ if (vd->vdev_log_mg != NULL) {
+ ASSERT0(vd->vdev_ms_count);
+ metaslab_group_destroy(vd->vdev_log_mg);
+ vd->vdev_log_mg = NULL;
+ }
+
+ ASSERT0(vd->vdev_stat.vs_space);
+ ASSERT0(vd->vdev_stat.vs_dspace);
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
+ /*
+ * Remove this vdev from its parent's child list.
+ */
+ vdev_remove_child(vd->vdev_parent, vd);
+
+ ASSERT(vd->vdev_parent == NULL);
+ ASSERT(!list_link_active(&vd->vdev_leaf_node));
+
+ /*
+ * Clean up vdev structure.
+ */
+ vdev_queue_fini(vd);
+ vdev_cache_fini(vd);
+
+ if (vd->vdev_path)
+ spa_strfree(vd->vdev_path);
+ if (vd->vdev_devid)
+ spa_strfree(vd->vdev_devid);
+ if (vd->vdev_physpath)
+ spa_strfree(vd->vdev_physpath);
+
+ if (vd->vdev_enc_sysfs_path)
+ spa_strfree(vd->vdev_enc_sysfs_path);
+
+ if (vd->vdev_fru)
+ spa_strfree(vd->vdev_fru);
+
+ if (vd->vdev_isspare)
+ spa_spare_remove(vd);
+ if (vd->vdev_isl2cache)
+ spa_l2cache_remove(vd);
+
+ txg_list_destroy(&vd->vdev_ms_list);
+ txg_list_destroy(&vd->vdev_dtl_list);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_close(vd->vdev_dtl_sm);
+ for (int t = 0; t < DTL_TYPES; t++) {
+ range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
+ range_tree_destroy(vd->vdev_dtl[t]);
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ EQUIV(vd->vdev_indirect_births != NULL,
+ vd->vdev_indirect_mapping != NULL);
+ if (vd->vdev_indirect_births != NULL) {
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vdev_indirect_births_close(vd->vdev_indirect_births);
+ }
+
+ if (vd->vdev_obsolete_sm != NULL) {
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ }
+ range_tree_destroy(vd->vdev_obsolete_segments);
+ rw_destroy(&vd->vdev_indirect_rwlock);
+ mutex_destroy(&vd->vdev_obsolete_lock);
+
+ mutex_destroy(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_stat_lock);
+ mutex_destroy(&vd->vdev_probe_lock);
+ mutex_destroy(&vd->vdev_scan_io_queue_lock);
+
+ mutex_destroy(&vd->vdev_initialize_lock);
+ mutex_destroy(&vd->vdev_initialize_io_lock);
+ cv_destroy(&vd->vdev_initialize_io_cv);
+ cv_destroy(&vd->vdev_initialize_cv);
+
+ mutex_destroy(&vd->vdev_trim_lock);
+ mutex_destroy(&vd->vdev_autotrim_lock);
+ mutex_destroy(&vd->vdev_trim_io_lock);
+ cv_destroy(&vd->vdev_trim_cv);
+ cv_destroy(&vd->vdev_autotrim_cv);
+ cv_destroy(&vd->vdev_trim_io_cv);
+
+ mutex_destroy(&vd->vdev_rebuild_lock);
+ cv_destroy(&vd->vdev_rebuild_cv);
+
+ zfs_ratelimit_fini(&vd->vdev_delay_rl);
+ zfs_ratelimit_fini(&vd->vdev_checksum_rl);
+
+ if (vd == spa->spa_root_vdev)
+ spa->spa_root_vdev = NULL;
+
+ kmem_free(vd, sizeof (vdev_t));
+}
+
+/*
+ * Transfer top-level vdev state from svd to tvd.
+ */
+static void
+vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
+{
+ spa_t *spa = svd->vdev_spa;
+ metaslab_t *msp;
+ vdev_t *vd;
+ int t;
+
+ ASSERT(tvd == tvd->vdev_top);
+
+ tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
+ tvd->vdev_ms_array = svd->vdev_ms_array;
+ tvd->vdev_ms_shift = svd->vdev_ms_shift;
+ tvd->vdev_ms_count = svd->vdev_ms_count;
+ tvd->vdev_top_zap = svd->vdev_top_zap;
+
+ svd->vdev_ms_array = 0;
+ svd->vdev_ms_shift = 0;
+ svd->vdev_ms_count = 0;
+ svd->vdev_top_zap = 0;
+
+ if (tvd->vdev_mg)
+ ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
+ if (tvd->vdev_log_mg)
+ ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
+ tvd->vdev_mg = svd->vdev_mg;
+ tvd->vdev_log_mg = svd->vdev_log_mg;
+ tvd->vdev_ms = svd->vdev_ms;
+
+ svd->vdev_mg = NULL;
+ svd->vdev_log_mg = NULL;
+ svd->vdev_ms = NULL;
+
+ if (tvd->vdev_mg != NULL)
+ tvd->vdev_mg->mg_vd = tvd;
+ if (tvd->vdev_log_mg != NULL)
+ tvd->vdev_log_mg->mg_vd = tvd;
+
+ tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
+ svd->vdev_checkpoint_sm = NULL;
+
+ tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
+ svd->vdev_alloc_bias = VDEV_BIAS_NONE;
+
+ tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
+ tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
+ tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
+
+ svd->vdev_stat.vs_alloc = 0;
+ svd->vdev_stat.vs_space = 0;
+ svd->vdev_stat.vs_dspace = 0;
+
+ /*
+ * State which may be set on a top-level vdev that's in the
+ * process of being removed.
+ */
+ ASSERT0(tvd->vdev_indirect_config.vic_births_object);
+ ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
+ ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
+ ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
+ ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
+ ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
+ ASSERT0(tvd->vdev_removing);
+ ASSERT0(tvd->vdev_rebuilding);
+ tvd->vdev_removing = svd->vdev_removing;
+ tvd->vdev_rebuilding = svd->vdev_rebuilding;
+ tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
+ tvd->vdev_indirect_config = svd->vdev_indirect_config;
+ tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
+ tvd->vdev_indirect_births = svd->vdev_indirect_births;
+ range_tree_swap(&svd->vdev_obsolete_segments,
+ &tvd->vdev_obsolete_segments);
+ tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
+ svd->vdev_indirect_config.vic_mapping_object = 0;
+ svd->vdev_indirect_config.vic_births_object = 0;
+ svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
+ svd->vdev_indirect_mapping = NULL;
+ svd->vdev_indirect_births = NULL;
+ svd->vdev_obsolete_sm = NULL;
+ svd->vdev_removing = 0;
+ svd->vdev_rebuilding = 0;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
+ while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
+ if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
+ (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
+ }
+
+ if (list_link_active(&svd->vdev_config_dirty_node)) {
+ vdev_config_clean(svd);
+ vdev_config_dirty(tvd);
+ }
+
+ if (list_link_active(&svd->vdev_state_dirty_node)) {
+ vdev_state_clean(svd);
+ vdev_state_dirty(tvd);
+ }
+
+ tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
+ svd->vdev_deflate_ratio = 0;
+
+ tvd->vdev_islog = svd->vdev_islog;
+ svd->vdev_islog = 0;
+
+ dsl_scan_io_queue_vdev_xfer(svd, tvd);
+}
+
+static void
+vdev_top_update(vdev_t *tvd, vdev_t *vd)
+{
+ if (vd == NULL)
+ return;
+
+ vd->vdev_top = tvd;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_top_update(tvd, vd->vdev_child[c]);
+}
+
+/*
+ * Add a mirror/replacing vdev above an existing vdev. There is no need to
+ * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
+ */
+vdev_t *
+vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
+{
+ spa_t *spa = cvd->vdev_spa;
+ vdev_t *pvd = cvd->vdev_parent;
+ vdev_t *mvd;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
+
+ mvd->vdev_asize = cvd->vdev_asize;
+ mvd->vdev_min_asize = cvd->vdev_min_asize;
+ mvd->vdev_max_asize = cvd->vdev_max_asize;
+ mvd->vdev_psize = cvd->vdev_psize;
+ mvd->vdev_ashift = cvd->vdev_ashift;
+ mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
+ mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
+ mvd->vdev_state = cvd->vdev_state;
+ mvd->vdev_crtxg = cvd->vdev_crtxg;
+
+ vdev_remove_child(pvd, cvd);
+ vdev_add_child(pvd, mvd);
+ cvd->vdev_id = mvd->vdev_children;
+ vdev_add_child(mvd, cvd);
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (mvd == mvd->vdev_top)
+ vdev_top_transfer(cvd, mvd);
+
+ return (mvd);
+}
+
+/*
+ * Remove a 1-way mirror/replacing vdev from the tree.
+ */
+void
+vdev_remove_parent(vdev_t *cvd)
+{
+ vdev_t *mvd = cvd->vdev_parent;
+ vdev_t *pvd = mvd->vdev_parent;
+
+ ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ ASSERT(mvd->vdev_children == 1);
+ ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
+ mvd->vdev_ops == &vdev_replacing_ops ||
+ mvd->vdev_ops == &vdev_spare_ops);
+ cvd->vdev_ashift = mvd->vdev_ashift;
+ cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
+ cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
+ vdev_remove_child(mvd, cvd);
+ vdev_remove_child(pvd, mvd);
+
+ /*
+ * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
+ * Otherwise, we could have detached an offline device, and when we
+ * go to import the pool we'll think we have two top-level vdevs,
+ * instead of a different version of the same top-level vdev.
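+ * Concretely, adding guid_delta below makes cvd take over mvd's guid
+ * (and keeps vdev_guid_sum consistent), while vdev_orig_guid records
+ * cvd's previous guid.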
+ */
+ if (mvd->vdev_top == mvd) {
+ uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
+ cvd->vdev_orig_guid = cvd->vdev_guid;
+ cvd->vdev_guid += guid_delta;
+ cvd->vdev_guid_sum += guid_delta;
+
+ /*
+ * If the pool is not set to autoexpand, we also need to preserve
+ * mvd's asize to prevent automatic expansion of cvd.
+ * Otherwise, if we are adjusting the mirror by attaching and
+ * detaching children of non-uniform sizes, the mirror could
+ * autoexpand, unexpectedly requiring larger devices to
+ * re-establish the mirror.
+ */
+ if (!cvd->vdev_spa->spa_autoexpand)
+ cvd->vdev_asize = mvd->vdev_asize;
+ }
+ cvd->vdev_id = mvd->vdev_id;
+ vdev_add_child(pvd, cvd);
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (cvd == cvd->vdev_top)
+ vdev_top_transfer(mvd, cvd);
+
+ ASSERT(mvd->vdev_children == 0);
+ vdev_free(mvd);
+}
+
+void
+vdev_metaslab_group_create(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /*
+ * metaslab_group_create was delayed until the allocation bias was available.
+ */
+ if (vd->vdev_mg == NULL) {
+ metaslab_class_t *mc;
+
+ if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = VDEV_BIAS_LOG;
+
+ ASSERT3U(vd->vdev_islog, ==,
+ (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ mc = spa_log_class(spa);
+ break;
+ case VDEV_BIAS_SPECIAL:
+ mc = spa_special_class(spa);
+ break;
+ case VDEV_BIAS_DEDUP:
+ mc = spa_dedup_class(spa);
+ break;
+ default:
+ mc = spa_normal_class(spa);
+ }
+
+ vd->vdev_mg = metaslab_group_create(mc, vd,
+ spa->spa_alloc_count);
+
+ if (!vd->vdev_islog) {
+ vd->vdev_log_mg = metaslab_group_create(
+ spa_embedded_log_class(spa), vd, 1);
+ }
+
+ /*
+ * The spa ashift min/max only apply to the normal metaslab
+ * class. The metaslab class is bound late, so the ashift
+ * boundaries could not be set until now.
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
+ if (vd->vdev_ashift > spa->spa_max_ashift)
+ spa->spa_max_ashift = vd->vdev_ashift;
+ if (vd->vdev_ashift < spa->spa_min_ashift)
+ spa->spa_min_ashift = vd->vdev_ashift;
+
+ uint64_t min_alloc = vdev_get_min_alloc(vd);
+ if (min_alloc < spa->spa_min_alloc)
+ spa->spa_min_alloc = min_alloc;
+ }
+ }
+}
+
+int
+vdev_metaslab_init(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t oldc = vd->vdev_ms_count;
+ uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
+ metaslab_t **mspp;
+ int error;
+ boolean_t expanding = (oldc != 0);
+
+ ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ /*
+ * This vdev is not being allocated from yet or is a hole.
+ */
+ if (vd->vdev_ms_shift == 0)
+ return (0);
+
+ ASSERT(!vd->vdev_ishole);
+
+ ASSERT(oldc <= newc);
+
+ mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
+
+ if (expanding) {
+ bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
+ vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
+ }
+
+ vd->vdev_ms = mspp;
+ vd->vdev_ms_count = newc;
+
+ for (uint64_t m = oldc; m < newc; m++) {
+ uint64_t object = 0;
+ /*
+ * vdev_ms_array may be 0 if we are creating the "fake"
+ * metaslabs for an indirect vdev for zdb's leak detection.
+ * See zdb_leak_init().
+ */
+ if (txg == 0 && vd->vdev_ms_array != 0) {
+ error = dmu_read(spa->spa_meta_objset,
+ vd->vdev_ms_array,
+ m * sizeof (uint64_t), sizeof (uint64_t), &object,
+ DMU_READ_PREFETCH);
+ if (error != 0) {
+ vdev_dbgmsg(vd, "unable to read the metaslab "
+ "array [error=%d]", error);
+ return (error);
+ }
+ }
+
+ error = metaslab_init(vd->vdev_mg, m, object, txg,
+ &(vd->vdev_ms[m]));
+ if (error != 0) {
+ vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
+ error);
+ return (error);
+ }
+ }
+
+ /*
+ * Find the emptiest metaslab on the vdev and mark it for use as the
+ * embedded slog by moving it from the regular to the log metaslab
+ * group.
+ */
+ if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
+ vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
+ avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
+ uint64_t slog_msid = 0;
+ uint64_t smallest = UINT64_MAX;
+
+ /*
+ * Note that we only search the new metaslabs, because the old
+ * (pre-existing) ones may be active (e.g. have non-empty
+ * range_trees), and we don't move them to the new
+ * metaslab_t.
+ */
+ for (uint64_t m = oldc; m < newc; m++) {
+ uint64_t alloc =
+ space_map_allocated(vd->vdev_ms[m]->ms_sm);
+ if (alloc < smallest) {
+ slog_msid = m;
+ smallest = alloc;
+ }
+ }
+ metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
+ /*
+ * The metaslab was marked as dirty at the end of
+ * metaslab_init(). Remove it from the dirty list so that we
+ * can uninitialize and reinitialize it to the new class.
+ */
+ if (txg != 0) {
+ (void) txg_list_remove_this(&vd->vdev_ms_list,
+ slog_ms, txg);
+ }
+ uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
+ metaslab_fini(slog_ms);
+ VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
+ &vd->vdev_ms[slog_msid]));
+ }
+
+ if (txg == 0)
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
+
+ /*
+ * If the vdev is being removed we don't activate
+ * the metaslabs since we want to ensure that no new
+ * allocations are performed on this device.
+ */
+ if (!expanding && !vd->vdev_removing) {
+ metaslab_group_activate(vd->vdev_mg);
+ if (vd->vdev_log_mg != NULL)
+ metaslab_group_activate(vd->vdev_log_mg);
+ }
+
+ if (txg == 0)
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ /*
+ * Regardless of whether this vdev was just added or it is being
+ * expanded, the metaslab count has changed. Recalculate the
+ * block limit.
+ */
+ spa_log_sm_set_blocklimit(spa);
+
+ return (0);
+}
+
+void
+vdev_metaslab_fini(vdev_t *vd)
+{
+ if (vd->vdev_checkpoint_sm != NULL) {
+ ASSERT(spa_feature_is_active(vd->vdev_spa,
+ SPA_FEATURE_POOL_CHECKPOINT));
+ space_map_close(vd->vdev_checkpoint_sm);
+ /*
+ * Even though we close the space map, we need to set its
+ * pointer to NULL. The reason is that vdev_metaslab_fini()
+ * may be called multiple times for certain operations
+ * (e.g. when destroying a pool), so we need to ensure that
+ * this clause never executes twice. This logic is similar
+ * to the one used for the vdev_ms clause below.
+ */
+ vd->vdev_checkpoint_sm = NULL;
+ }
+
+ if (vd->vdev_ms != NULL) {
+ metaslab_group_t *mg = vd->vdev_mg;
+
+ metaslab_group_passivate(mg);
+ if (vd->vdev_log_mg != NULL) {
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_passivate(vd->vdev_log_mg);
+ }
+
+ uint64_t count = vd->vdev_ms_count;
+ for (uint64_t m = 0; m < count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ if (msp != NULL)
+ metaslab_fini(msp);
+ }
+ vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
+ vd->vdev_ms = NULL;
+ vd->vdev_ms_count = 0;
+
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ ASSERT0(mg->mg_histogram[i]);
+ if (vd->vdev_log_mg != NULL)
+ ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
+ }
+ }
+ ASSERT0(vd->vdev_ms_count);
+ ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
+}
+
+typedef struct vdev_probe_stats {
+ boolean_t vps_readable;
+ boolean_t vps_writeable;
+ int vps_flags;
+} vdev_probe_stats_t;
+
+static void
+vdev_probe_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ vdev_t *vd = zio->io_vd;
+ vdev_probe_stats_t *vps = zio->io_private;
+
+ ASSERT(vd->vdev_probe_zio != NULL);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (zio->io_error == 0)
+ vps->vps_readable = 1;
+ if (zio->io_error == 0 && spa_writeable(spa)) {
+ zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
+ zio->io_offset, zio->io_size, zio->io_abd,
+ ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
+ ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
+ } else {
+ abd_free(zio->io_abd);
+ }
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ if (zio->io_error == 0)
+ vps->vps_writeable = 1;
+ abd_free(zio->io_abd);
+ } else if (zio->io_type == ZIO_TYPE_NULL) {
+ zio_t *pio;
+ zio_link_t *zl;
+
+ vd->vdev_cant_read |= !vps->vps_readable;
+ vd->vdev_cant_write |= !vps->vps_writeable;
+
+ if (vdev_readable(vd) &&
+ (vdev_writeable(vd) || !spa_writeable(spa))) {
+ zio->io_error = 0;
+ } else {
+ ASSERT(zio->io_error != 0);
+ vdev_dbgmsg(vd, "failed probe");
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
+ spa, vd, NULL, NULL, 0);
+ zio->io_error = SET_ERROR(ENXIO);
+ }
+
+ mutex_enter(&vd->vdev_probe_lock);
+ ASSERT(vd->vdev_probe_zio == zio);
+ vd->vdev_probe_zio = NULL;
+ mutex_exit(&vd->vdev_probe_lock);
+
+ zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL)
+ if (!vdev_accessible(vd, pio))
+ pio->io_error = SET_ERROR(ENXIO);
+
+ kmem_free(vps, sizeof (*vps));
+ }
+}
+
+/*
+ * Determine whether this device is accessible.
+ *
+ * Read and write to several known locations: the pad regions of each
+ * vdev label but the first, which we leave alone in case it contains
+ * a VTOC.
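+ *
+ * Concretely, the loop at the bottom of this function issues a
+ * VDEV_PAD_SIZE read from the vl_be pad of labels 1 through
+ * VDEV_LABELS - 1; when a read succeeds and the pool is writeable,
+ * vdev_probe_done() writes the same data back to the same offset.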
+ */
+zio_t *
+vdev_probe(vdev_t *vd, zio_t *zio)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_probe_stats_t *vps = NULL;
+ zio_t *pio;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ /*
+ * Don't probe the probe.
+ */
+ if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
+ return (NULL);
+
+ /*
+ * To prevent 'probe storms' when a device fails, we create
+ * just one probe i/o at a time. All zios that want to probe
+ * this vdev will become parents of the probe io.
+ */
+ mutex_enter(&vd->vdev_probe_lock);
+
+ if ((pio = vd->vdev_probe_zio) == NULL) {
+ vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
+
+ vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
+ ZIO_FLAG_TRYHARD;
+
+ if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
+ /*
+ * vdev_cant_read and vdev_cant_write can only
+ * transition from TRUE to FALSE when we have the
+ * SCL_ZIO lock as writer; otherwise they can only
+ * transition from FALSE to TRUE. This ensures that
+ * any zio looking at these values can assume that
+ * failures persist for the life of the I/O. That's
+ * important because when a device has intermittent
+ * connectivity problems, we want to ensure that
+ * they're ascribed to the device (ENXIO) and not
+ * the zio (EIO).
+ *
+ * Since we hold SCL_ZIO as writer here, clear both
+ * values so the probe can reevaluate from first
+ * principles.
+ */
+ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+ }
+
+ vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
+ vdev_probe_done, vps,
+ vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
+
+ /*
+ * We can't change the vdev state in this context, so we
+ * kick off an async task to do it on our behalf.
+ */
+ if (zio != NULL) {
+ vd->vdev_probe_wanted = B_TRUE;
+ spa_async_request(spa, SPA_ASYNC_PROBE);
+ }
+ }
+
+ if (zio != NULL)
+ zio_add_child(zio, pio);
+
+ mutex_exit(&vd->vdev_probe_lock);
+
+ if (vps == NULL) {
+ ASSERT(zio != NULL);
+ return (NULL);
+ }
+
+ for (int l = 1; l < VDEV_LABELS; l++) {
+ zio_nowait(zio_read_phys(pio, vd,
+ vdev_label_offset(vd->vdev_psize, l,
+ offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
+ abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
+ ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
+ ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
+ }
+
+ if (zio == NULL)
+ return (pio);
+
+ zio_nowait(pio);
+ return (NULL);
+}
+
+static void
+vdev_load_child(void *arg)
+{
+ vdev_t *vd = arg;
+
+ vd->vdev_load_error = vdev_load(vd);
+}
+
+static void
+vdev_open_child(void *arg)
+{
+ vdev_t *vd = arg;
+
+ vd->vdev_open_thread = curthread;
+ vd->vdev_open_error = vdev_open(vd);
+ vd->vdev_open_thread = NULL;
+}
+
+static boolean_t
+vdev_uses_zvols(vdev_t *vd)
+{
+#ifdef _KERNEL
+ if (zvol_is_zvol(vd->vdev_path))
+ return (B_TRUE);
+#endif
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_uses_zvols(vd->vdev_child[c]))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * Returns B_TRUE if the passed child should be opened.
+ */
+static boolean_t
+vdev_default_open_children_func(vdev_t *vd)
+{
+ return (B_TRUE);
+}
+
+/*
+ * Open the requested child vdevs. If any of the leaf vdevs are using
+ * a ZFS volume then do the opens in a single thread. This avoids a
+ * deadlock when the current thread is holding the spa_namespace_lock.
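+ * Otherwise each child is opened via vdev_open_child() on the
+ * "vdev_open" taskq, allowing the opens to proceed in parallel.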
+ */
+static void
+vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+ int children = vd->vdev_children;
+
+ taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ vd->vdev_nonrot = B_TRUE;
+
+ for (int c = 0; c < children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (open_func(cvd) == B_FALSE)
+ continue;
+
+ if (tq == NULL || vdev_uses_zvols(vd)) {
+ cvd->vdev_open_error = vdev_open(cvd);
+ } else {
+ VERIFY(taskq_dispatch(tq, vdev_open_child,
+ cvd, TQ_SLEEP) != TASKQID_INVALID);
+ }
+
+ vd->vdev_nonrot &= cvd->vdev_nonrot;
+ }
+
+ if (tq != NULL) {
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ }
+}
+
+/*
+ * Open all child vdevs.
+ */
+void
+vdev_open_children(vdev_t *vd)
+{
+ vdev_open_children_impl(vd, vdev_default_open_children_func);
+}
+
+/*
+ * Conditionally open a subset of child vdevs.
+ */
+void
+vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+ vdev_open_children_impl(vd, open_func);
+}
+
+/*
+ * Compute the raidz-deflation ratio. Note that we hard-code
+ * 128k (1 << 17) because it is the "typical" blocksize.
+ * Even though SPA_MAXBLOCKSIZE has changed, this algorithm cannot change;
+ * otherwise it would inconsistently account for existing bp's.
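+ *
+ * For example (illustrative, assuming vdev_psize_to_asize() is the
+ * identity for the vdev in question): a 128k block maps to
+ * 128k >> SPA_MINBLOCKSHIFT = 256 512-byte sectors, giving a
+ * deflate ratio of 131072 / 256 = 512; raidz vdevs yield a smaller
+ * value, reflecting their parity and padding overhead.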
+ */
+static void
+vdev_set_deflate_ratio(vdev_t *vd)
+{
+ if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
+ vd->vdev_deflate_ratio = (1 << 17) /
+ (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+ }
+}
+
+/*
+ * Maximize performance by inflating the configured ashift for top level
+ * vdevs to be as close to the physical ashift as possible while maintaining
+ * administrator defined limits and ensuring it doesn't go below the
+ * logical ashift.
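+ *
+ * For example (tunable values assumed for illustration): a device
+ * reporting logical_ashift = 9 and physical_ashift = 12, with
+ * zfs_vdev_min_auto_ashift = 9 and zfs_vdev_max_auto_ashift = 14,
+ * ends up with vdev_ashift = MIN(MAX(14, 9), MAX(9, 12)) = 12.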
+ */
+static void
+vdev_ashift_optimize(vdev_t *vd)
+{
+ ASSERT(vd == vd->vdev_top);
+
+ if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+ vd->vdev_ashift = MIN(
+ MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
+ MAX(zfs_vdev_min_auto_ashift,
+ vd->vdev_physical_ashift));
+ } else {
+ /*
+ * If the logical and physical ashifts are the same, then
+ * we ensure that the top-level vdev's ashift is not smaller
+ * than our minimum ashift value. For the unusual case
+ * where logical ashift > physical ashift, we can't cap
+ * the calculated ashift based on max ashift as that
+ * would cause failures.
+ * We still check if we need to increase it to match
+ * the min ashift.
+ */
+ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
+ vd->vdev_ashift);
+ }
+}
+
+/*
+ * Prepare a virtual device for access.
+ */
+int
+vdev_open(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ int error;
+ uint64_t osize = 0;
+ uint64_t max_osize = 0;
+ uint64_t asize, max_asize, psize;
+ uint64_t logical_ashift = 0;
+ uint64_t physical_ashift = 0;
+
+ ASSERT(vd->vdev_open_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
+ vd->vdev_state == VDEV_STATE_CANT_OPEN ||
+ vd->vdev_state == VDEV_STATE_OFFLINE);
+
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
+
+ /*
+ * If this vdev is not removed, check its fault status. If it's
+ * faulted, bail out of the open.
+ */
+ if (!vd->vdev_removed && vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ vd->vdev_label_aux);
+ return (SET_ERROR(ENXIO));
+ } else if (vd->vdev_offline) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
+ return (SET_ERROR(ENXIO));
+ }
+
+ error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
+ &logical_ashift, &physical_ashift);
+ /*
+ * Physical volume size should never be larger than its max size, unless
+ * the disk has shrunk while we were reading it or the device is buggy
+ * or damaged: either way it's not safe to use, so bail out of the open.
+ */
+ if (osize > max_osize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_OPEN_FAILED);
+ return (SET_ERROR(ENXIO));
+ }
+
+ /*
+ * Reset the vdev_reopening flag so that we actually close
+ * the vdev on error.
+ */
+ vd->vdev_reopening = B_FALSE;
+ if (zio_injection_enabled && error == 0)
+ error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));
+
+ if (error) {
+ if (vd->vdev_removed &&
+ vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
+ vd->vdev_removed = B_FALSE;
+
+ if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
+ vd->vdev_stat.vs_aux);
+ } else {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ vd->vdev_stat.vs_aux);
+ }
+ return (error);
+ }
+
+ vd->vdev_removed = B_FALSE;
+
+ /*
+ * Recheck the faulted flag now that we have confirmed that
+ * the vdev is accessible. If we're faulted, bail.
+ */
+ if (vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ vd->vdev_label_aux);
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (vd->vdev_degraded) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_ERR_EXCEEDED);
+ } else {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
+ }
+
+ /*
+ * For hole or missing vdevs we just return success.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ return (0);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_NONE);
+ break;
+ }
+ }
+
+ osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
+ max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
+
+ if (vd->vdev_children == 0) {
+ if (osize < SPA_MINDEVSIZE) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (SET_ERROR(EOVERFLOW));
+ }
+ psize = osize;
+ asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
+ max_asize = max_osize - (VDEV_LABEL_START_SIZE +
+ VDEV_LABEL_END_SIZE);
+ } else {
+ if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
+ (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (SET_ERROR(EOVERFLOW));
+ }
+ psize = 0;
+ asize = osize;
+ max_asize = max_osize;
+ }
+
+ /*
+ * If the vdev was expanded, record this so that we can re-create the
+ * uberblock rings in labels {2,3} during the next sync.
+ */
+ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
+ vd->vdev_copy_uberblocks = B_TRUE;
+
+ vd->vdev_psize = psize;
+
+ /*
+ * Make sure the allocatable size hasn't shrunk too much.
+ */
+ if (asize < vd->vdev_min_asize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * We can always set the logical/physical ashift members since
+ * their values are only used to calculate the vdev_ashift when
+ * the device is first added to the config. These values should
+ * not be used for anything else since they may change whenever
+ * the device is reopened and we don't store them in the label.
+ */
+ vd->vdev_physical_ashift =
+ MAX(physical_ashift, vd->vdev_physical_ashift);
+ vd->vdev_logical_ashift = MAX(logical_ashift,
+ vd->vdev_logical_ashift);
+
+ if (vd->vdev_asize == 0) {
+ /*
+ * This is the first-ever open, so use the computed values.
+ * For compatibility, a different ashift can be requested.
+ */
+ vd->vdev_asize = asize;
+ vd->vdev_max_asize = max_asize;
+
+ /*
+ * If the vdev_ashift was not overridden at creation time,
+ * then set it to the logical ashift and optimize the ashift.
+ */
+ if (vd->vdev_ashift == 0) {
+ vd->vdev_ashift = vd->vdev_logical_ashift;
+
+ if (vd->vdev_logical_ashift > ASHIFT_MAX) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_ASHIFT_TOO_BIG);
+ return (SET_ERROR(EDOM));
+ }
+
+ if (vd->vdev_top == vd) {
+ vdev_ashift_optimize(vd);
+ }
+ }
+ if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
+ vd->vdev_ashift > ASHIFT_MAX)) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_ASHIFT);
+ return (SET_ERROR(EDOM));
+ }
+ } else {
+ /*
+ * Make sure the alignment required hasn't increased.
+ */
+ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
+ vd->vdev_ops->vdev_op_leaf) {
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
+ spa, vd, NULL, NULL, 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (SET_ERROR(EDOM));
+ }
+ vd->vdev_max_asize = max_asize;
+ }
+
+ /*
+ * If all children are healthy we update asize if either:
+ * The asize has increased, due to a device expansion caused by dynamic
+ * LUN growth or vdev replacement, and automatic expansion is enabled,
+ * making the additional space available.
+ *
+ * The asize has decreased, due to a device shrink usually caused by a
+ * vdev replace with a smaller device. This ensures that calculations
+ * based on max_asize and asize (e.g. esize) are always valid. It's safe
+ * to do this as we've already validated that asize is greater than
+ * vdev_min_asize.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY &&
+ ((asize > vd->vdev_asize &&
+ (vd->vdev_expanding || spa->spa_autoexpand)) ||
+ (asize < vd->vdev_asize)))
+ vd->vdev_asize = asize;
+
+ vdev_set_min_asize(vd);
+
+ /*
+ * Ensure we can issue some IO before declaring the
+ * vdev open for business.
+ */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
+ return (error);
+ }
+
+ /*
+ * Track the minimum allocation size.
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
+ uint64_t min_alloc = vdev_get_min_alloc(vd);
+ if (min_alloc < spa->spa_min_alloc)
+ spa->spa_min_alloc = min_alloc;
+ }
+
+ /*
+ * If this is a leaf vdev, assess whether a resilver is needed.
+ * But don't do this if we are doing a reopen for a scrub, since
+ * this would just restart the scrub we are already doing.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
+ dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
+
+ return (0);
+}
+
+static void
+vdev_validate_child(void *arg)
+{
+ vdev_t *vd = arg;
+
+ vd->vdev_validate_thread = curthread;
+ vd->vdev_validate_error = vdev_validate(vd);
+ vd->vdev_validate_thread = NULL;
+}
+
+/*
+ * Called once the vdevs are all opened, this routine validates the label
+ * contents. This needs to be done before vdev_load() so that we don't
+ * inadvertently do repair I/Os to the wrong device.
+ *
+ * This function will only return failure if one of the vdevs indicates that it
+ * has since been destroyed or exported. This is only possible if
+ * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
+ * will be updated but the function will return 0.
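+ *
+ * Children are validated recursively; where possible they are dispatched
+ * in parallel on a "vdev_validate" taskq, falling back to the caller's
+ * thread for zvol-backed vdevs (see vdev_uses_zvols()).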
+ */
+int
+vdev_validate(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ taskq_t *tq = NULL;
+ nvlist_t *label;
+ uint64_t guid = 0, aux_guid = 0, top_guid;
+ uint64_t state;
+ nvlist_t *nvl;
+ uint64_t txg;
+ int children = vd->vdev_children;
+
+ if (vdev_validate_skip)
+ return (0);
+
+ if (children > 0) {
+ tq = taskq_create("vdev_validate", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ }
+
+ for (uint64_t c = 0; c < children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (tq == NULL || vdev_uses_zvols(cvd)) {
+ vdev_validate_child(cvd);
+ } else {
+ VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
+ TQ_SLEEP) != TASKQID_INVALID);
+ }
+ }
+ if (tq != NULL) {
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ }
+ for (int c = 0; c < children; c++) {
+ int error = vd->vdev_child[c]->vdev_validate_error;
+
+ if (error != 0)
+ return (SET_ERROR(EBADF));
+ }
+
+
+ /*
+ * If the device has already failed, or was marked offline, don't do
+ * any further validation. Otherwise, label I/O will fail and we will
+ * overwrite the previous state.
+ */
+ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
+ return (0);
+
+ /*
+ * If we are performing an extreme rewind, we allow for a label that
+ * was modified at a point after the current txg.
+ * If the config lock is not held, do not check the txg: spa_sync could
+ * be updating the vdev's label before updating spa_last_synced_txg.
+ */
+ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
+ spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
+ txg = UINT64_MAX;
+ else
+ txg = spa_last_synced_txg(spa);
+
+ if ((label = vdev_label_read_config(vd, txg)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
+ "txg %llu", (u_longlong_t)txg);
+ return (0);
+ }
+
+ /*
+ * Determine if this vdev has been split off into another
+ * pool. If so, then refuse to open it.
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
+ &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_SPLIT_POOL);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_POOL_GUID);
+ return (0);
+ }
+
+ /*
+ * If config is not trusted then ignore the spa guid check. This is
+ * necessary because if the machine crashed during a re-guid the new
+ * guid might have been written to all of the vdev labels, but not the
+ * cached config. The check will be performed again once we have the
+ * trusted config from the MOS.
+ */
+ if (spa->spa_trust_config && guid != spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
+ "match config (%llu != %llu)", (u_longlong_t)guid,
+ (u_longlong_t)spa_guid(spa));
+ return (0);
+ }
+
+ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
+ != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
+ &aux_guid) != 0)
+ aux_guid = 0;
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_GUID);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
+ != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_TOP_GUID);
+ return (0);
+ }
+
+ /*
+ * If this vdev just became a top-level vdev because its sibling was
+ * detached, it will have adopted the parent's vdev guid -- but the
+ * label may or may not be on disk yet. Fortunately, either version
+ * of the label will have the same top guid, so if we're a top-level
+ * vdev, we can safely compare to that instead.
+ * However, if the config comes from a cachefile that failed to update
+ * after the detach, a top-level vdev will appear as a non top-level
+ * vdev in the config. Also relax the constraints if we perform an
+ * extreme rewind.
+ *
+ * If we split this vdev off instead, then we also check the
+ * original pool's guid. We don't want to consider the vdev
+ * corrupt if it is partway through a split operation.
+ */
+ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
+ boolean_t mismatch = B_FALSE;
+ if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
+ if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
+ mismatch = B_TRUE;
+ } else {
+ if (vd->vdev_guid != top_guid &&
+ vd->vdev_top->vdev_guid != guid)
+ mismatch = B_TRUE;
+ }
+
+ if (mismatch) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: config guid "
+ "doesn't match label guid");
+ vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
+ (u_longlong_t)vd->vdev_guid,
+ (u_longlong_t)vd->vdev_top->vdev_guid);
+ vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
+ "aux_guid %llu", (u_longlong_t)guid,
+ (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
+ return (0);
+ }
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_POOL_STATE);
+ return (0);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * If this is a verbatim import, no need to check the
+ * state of the pool.
+ */
+ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
+ spa_load_state(spa) == SPA_LOAD_OPEN &&
+ state != POOL_STATE_ACTIVE) {
+ vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
+ "for spa %s", (u_longlong_t)state, spa->spa_name);
+ return (SET_ERROR(EBADF));
+ }
+
+ /*
+ * If we were able to open and validate a vdev that was
+ * previously marked permanently unavailable, clear that state
+ * now.
+ */
+ if (vd->vdev_not_present)
+ vd->vdev_not_present = 0;
+
+ return (0);
+}
+
+static void
+vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
+{
+ if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
+ if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
+ zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
+ "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
+ dvd->vdev_path, svd->vdev_path);
+ spa_strfree(dvd->vdev_path);
+ dvd->vdev_path = spa_strdup(svd->vdev_path);
+ }
+ } else if (svd->vdev_path != NULL) {
+ dvd->vdev_path = spa_strdup(svd->vdev_path);
+ zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
+ (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
+ }
+}
+
+/*
+ * Recursively copy vdev paths from one vdev to another. Source and destination
+ * vdev trees must have the same geometry, otherwise an error is returned.
+ * Intended to copy paths from userland config into MOS config.
+ */
+int
+vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
+{
+ if ((svd->vdev_ops == &vdev_missing_ops) ||
+ (svd->vdev_ishole && dvd->vdev_ishole) ||
+ (dvd->vdev_ops == &vdev_indirect_ops))
+ return (0);
+
+ if (svd->vdev_ops != dvd->vdev_ops) {
+ vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
+ svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (svd->vdev_guid != dvd->vdev_guid) {
+ vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
+ "%llu)", (u_longlong_t)svd->vdev_guid,
+ (u_longlong_t)dvd->vdev_guid);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (svd->vdev_children != dvd->vdev_children) {
+ vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
+ "%llu != %llu", (u_longlong_t)svd->vdev_children,
+ (u_longlong_t)dvd->vdev_children);
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (uint64_t i = 0; i < svd->vdev_children; i++) {
+ int error = vdev_copy_path_strict(svd->vdev_child[i],
+ dvd->vdev_child[i]);
+ if (error != 0)
+ return (error);
+ }
+
+ if (svd->vdev_ops->vdev_op_leaf)
+ vdev_copy_path_impl(svd, dvd);
+
+ return (0);
+}
+
+static void
+vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
+{
+ ASSERT(stvd->vdev_top == stvd);
+ ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
+
+ for (uint64_t i = 0; i < dvd->vdev_children; i++) {
+ vdev_copy_path_search(stvd, dvd->vdev_child[i]);
+ }
+
+ if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
+ return;
+
+ /*
+ * The idea here is that while a vdev can shift positions within
+ * a top vdev (when replacing, attaching a mirror, etc.), it cannot
+ * step outside of it.
+ */
+ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
+
+ if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
+ return;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ vdev_copy_path_impl(vd, dvd);
+}
+
+/*
+ * Recursively copy vdev paths from one root vdev to another. Source and
+ * destination vdev trees may differ in geometry. For each destination leaf
+ * vdev, search for a vdev with the same guid and top vdev id in the source.
+ * Intended to copy paths from userland config into MOS config.
+ */
+void
+vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
+{
+ uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
+ ASSERT(srvd->vdev_ops == &vdev_root_ops);
+ ASSERT(drvd->vdev_ops == &vdev_root_ops);
+
+ for (uint64_t i = 0; i < children; i++) {
+ vdev_copy_path_search(srvd->vdev_child[i],
+ drvd->vdev_child[i]);
+ }
+}
+
+/*
+ * Close a virtual device.
+ */
+void
+vdev_close(vdev_t *vd)
+{
+ vdev_t *pvd = vd->vdev_parent;
+ spa_t *spa __maybe_unused = vd->vdev_spa;
+
+ ASSERT(vd != NULL);
+ ASSERT(vd->vdev_open_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ /*
+ * If our parent is reopening, then we are as well, unless we are
+ * going offline.
+ */
+ if (pvd != NULL && pvd->vdev_reopening)
+ vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
+
+ vd->vdev_ops->vdev_op_close(vd);
+
+ vdev_cache_purge(vd);
+
+ /*
+ * We record the previous state before we close it, so that if we are
+ * doing a reopen(), we don't generate FMA ereports if we notice that
+ * it's still faulted.
+ */
+ vd->vdev_prevstate = vd->vdev_state;
+
+ if (vd->vdev_offline)
+ vd->vdev_state = VDEV_STATE_OFFLINE;
+ else
+ vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+}
+
+void
+vdev_hold(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_is_root(spa));
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_hold(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_hold(vd);
+}
+
+void
+vdev_rele(vdev_t *vd)
+{
+ ASSERT(spa_is_root(vd->vdev_spa));
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_rele(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_rele(vd);
+}
+
+/*
+ * Reopen all interior vdevs and any unopened leaves. We don't actually
+ * reopen leaf vdevs which had previously been opened as they might deadlock
+ * on the spa_config_lock. Instead we only obtain the leaf's physical size.
+ * If the leaf has never been opened then open it, as usual.
+ */
+void
+vdev_reopen(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ /* set the reopening flag unless we're taking the vdev offline */
+ vd->vdev_reopening = !vd->vdev_offline;
+ vdev_close(vd);
+ (void) vdev_open(vd);
+
+ /*
+ * Call vdev_validate() here to make sure we have the same device.
+ * Otherwise, a device with an invalid label could be successfully
+ * opened in response to vdev_reopen().
+ */
+ if (vd->vdev_aux) {
+ (void) vdev_validate_aux(vd);
+ if (vdev_readable(vd) && vdev_writeable(vd) &&
+ vd->vdev_aux == &spa->spa_l2cache) {
+ /*
+ * In case the vdev is present we should evict all ARC
+ * buffers and pointers to log blocks and reclaim their
+ * space before restoring its contents to L2ARC.
+ */
+ if (l2arc_vdev_present(vd)) {
+ l2arc_rebuild_vdev(vd, B_TRUE);
+ } else {
+ l2arc_add_vdev(spa, vd);
+ }
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
+ }
+ } else {
+ (void) vdev_validate(vd);
+ }
+
+ /*
+ * Reassess parent vdev's health.
+ */
+ vdev_propagate_state(vd);
+}
+
+int
+vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
+{
+ int error;
+
+ /*
+ * Normally, partial opens (e.g. of a mirror) are allowed.
+ * For a create, however, we want to fail the request if
+ * there are any components we can't open.
+ */
+ error = vdev_open(vd);
+
+ if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_close(vd);
+ return (error ? error : SET_ERROR(ENXIO));
+ }
+
+ /*
+ * Recursively load DTLs and initialize all labels.
+ */
+ if ((error = vdev_dtl_load(vd)) != 0 ||
+ (error = vdev_label_init(vd, txg, isreplacing ?
+ VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
+ vdev_close(vd);
+ return (error);
+ }
+
+ return (0);
+}
+
+void
+vdev_metaslab_set_size(vdev_t *vd)
+{
+ uint64_t asize = vd->vdev_asize;
+ uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
+ uint64_t ms_shift;
+
+ /*
+ * There are two dimensions to the metaslab sizing calculation:
+ * the size of the metaslab and the count of metaslabs per vdev.
+ *
+ * The default values used below are a good balance between memory
+ * usage (larger metaslab size means more memory needed for loaded
+ * metaslabs; more metaslabs means more memory needed for the
+ * metaslab_t structs), metaslab load time (larger metaslabs take
+ * longer to load), and metaslab sync time (more metaslabs means
+ * more time spent syncing all of them).
+ *
+ * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+ * The range of the dimensions are as follows:
+ *
+ * 2^29 <= ms_size <= 2^34
+ * 16 <= ms_count <= 131,072
+ *
+ * On the lower end of vdev sizes, we aim for metaslab sizes of
+ * at least 512MB (2^29) to minimize fragmentation effects when
+ * testing with smaller devices. However, the count constraint
+ * of at least 16 metaslabs will override this minimum size goal.
+ *
+ * On the upper end of vdev sizes, we aim for a maximum metaslab
+ * size of 16GB. However, we will cap the total count to 2^17
+ * metaslabs to keep our memory footprint in check and let the
+ * metaslab size grow from there if that limit is hit.
+ *
+ * The net effect of applying the above constraints is summarized below.
+ *
+ * vdev size metaslab count
+ * --------------|-----------------
+ * < 8GB ~16
+ * 8GB - 100GB one per 512MB
+ * 100GB - 3TB ~200
+ * 3TB - 2PB one per 16GB
+ * > 2PB ~131,072
+ * --------------------------------
+ *
+ * Finally, note that all of the above calculations determine the initial
+ * number of metaslabs. Expanding a top-level vdev will result
+ * in additional metaslabs being allocated, making it possible
+ * to exceed the zfs_vdev_ms_count_limit.
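+ *
+ * As a worked example, a 100GB vdev falls in the middle band above:
+ * with the default 512MB (2^29) metaslab size it gets
+ * 100GB >> 29 = 200 metaslabs, which is exactly the
+ * zfs_vdev_default_ms_count target.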
+ */
+
+ if (ms_count < zfs_vdev_min_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+ else if (ms_count > zfs_vdev_default_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
+ else
+ ms_shift = zfs_vdev_default_ms_shift;
+
+ if (ms_shift < SPA_MAXBLOCKSHIFT) {
+ ms_shift = SPA_MAXBLOCKSHIFT;
+ } else if (ms_shift > zfs_vdev_max_ms_shift) {
+ ms_shift = zfs_vdev_max_ms_shift;
+ /* cap the total count to constrain memory footprint */
+ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+ ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
+ }
+
+ vd->vdev_ms_shift = ms_shift;
+ ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
+}
+
+void
+vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
+{
+ ASSERT(vd == vd->vdev_top);
+ /* indirect vdevs don't have metaslabs or dtls */
+ ASSERT(vdev_is_concrete(vd) || flags == 0);
+ ASSERT(ISP2(flags));
+ ASSERT(spa_writeable(vd->vdev_spa));
+
+ if (flags & VDD_METASLAB)
+ (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
+
+ if (flags & VDD_DTL)
+ (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
+
+ (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
+}
+
+void
+vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vdev_dirty(vd->vdev_top, flags, vd, txg);
+}
+
+/*
+ * DTLs.
+ *
+ * A vdev's DTL (dirty time log) is the set of transaction groups for which
+ * the vdev has less than perfect replication. There are four kinds of DTL:
+ *
+ * DTL_MISSING: txgs for which the vdev has no valid copies of the data
+ *
+ * DTL_PARTIAL: txgs for which data is available, but not fully replicated
+ *
+ * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
+ * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
+ * txgs that was scrubbed.
+ *
+ * DTL_OUTAGE: txgs which cannot currently be read, whether due to
+ * persistent errors or just some device being offline.
+ * Unlike the other three, the DTL_OUTAGE map is not generally
+ * maintained; it's only computed when needed, typically to
+ * determine whether a device can be detached.
+ *
+ * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
+ * either has the data or it doesn't.
+ *
+ * For interior vdevs such as mirror and RAID-Z the picture is more complex.
+ * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
+ * if any child is less than fully replicated, then so is its parent.
+ * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
+ * comprising only those txgs which appear in more than 'maxfaults' children;
+ * those are the txgs we don't have enough replication to read. For example,
+ * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
+ * thus, its DTL_MISSING consists of the set of txgs that appear in more than
+ * two child DTL_MISSING maps.
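+ *
+ * As a concrete illustration, consider a two-way mirror: a txg missing
+ * from just one child appears in the mirror's DTL_PARTIAL (it is not
+ * fully replicated) but not in its DTL_MISSING, since the other child
+ * can still supply the data; only a txg missing from both children
+ * lands in the mirror's DTL_MISSING.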
+ *
+ * It should be clear from the above that to compute the DTLs and outage maps
+ * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
+ * Therefore, that is all we keep on disk. When loading the pool, or after
+ * a configuration change, we generate all other DTLs from first principles.
+ */
+void
+vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+
+ ASSERT(t < DTL_TYPES);
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+ ASSERT(spa_writeable(vd->vdev_spa));
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ if (!range_tree_contains(rt, txg, size))
+ range_tree_add(rt, txg, size);
+ mutex_exit(&vd->vdev_dtl_lock);
+}
+
+boolean_t
+vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+ boolean_t dirty = B_FALSE;
+
+ ASSERT(t < DTL_TYPES);
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+ /*
+ * While we are loading the pool, the DTLs have not been loaded yet.
+ * This isn't a problem, but it can result in devices being tried
+ * that are known not to have the data. In that case, the import
+ * relies on the checksum to ensure that we get the right data.
+ * Note that while importing we are only reading the MOS, which is
+ * always checksummed.
+ */
+ mutex_enter(&vd->vdev_dtl_lock);
+ if (!range_tree_is_empty(rt))
+ dirty = range_tree_contains(rt, txg, size);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (dirty);
+}
+
+boolean_t
+vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+ boolean_t empty;
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ empty = range_tree_is_empty(rt);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (empty);
+}
+
+/*
+ * Check if the txg falls within the range which must be
+ * resilvered. DVAs outside this range can always be skipped.
+ */
+boolean_t
+vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ /* Set by sequential resilver. */
+ if (phys_birth == TXG_UNKNOWN)
+ return (B_TRUE);
+
+ return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
+}
+
+/*
+ * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
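+ * Leaf vdevs, and vdev types that do not provide a need_resilver
+ * callback, conservatively report B_TRUE; other vdevs defer to their
+ * vdev_op_need_resilver method.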
+ */
+boolean_t
+vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+ if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
+ vd->vdev_ops->vdev_op_leaf)
+ return (B_TRUE);
+
+ return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
+ phys_birth));
+}
+
+/*
+ * Returns the lowest txg in the DTL range.
+ */
+static uint64_t
+vdev_dtl_min(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1);
+}
+
+/*
+ * Returns the highest txg in the DTL.
+ */
+static uint64_t
+vdev_dtl_max(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ return (range_tree_max(vd->vdev_dtl[DTL_MISSING]));
+}
+
+/*
+ * Determine if a resilvering vdev should remove any DTL entries from
+ * its range. If the vdev was resilvering for the entire duration of the
+ * scan then it should excise that range from its DTLs. Otherwise, this
+ * vdev is considered partially resilvered and should leave its DTL
+ * entries intact. The comment in vdev_dtl_reassess() describes how we
+ * excise the DTLs.
+ */
+static boolean_t
+vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
+{
+ ASSERT0(vd->vdev_children);
+
+ if (vd->vdev_state < VDEV_STATE_DEGRADED)
+ return (B_FALSE);
+
+ if (vd->vdev_resilver_deferred)
+ return (B_FALSE);
+
+ if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
+ return (B_TRUE);
+
+ if (rebuild_done) {
+ vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ /* Rebuild not initiated by attach */
+ if (vd->vdev_rebuild_txg == 0)
+ return (B_TRUE);
+
+ /*
+	 * When a rebuild completes without error, all missing data
+ * up to the rebuild max txg has been reconstructed and the DTL
+ * is eligible for excision.
+ */
+ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
+ vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
+ ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
+ ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
+ ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
+ return (B_TRUE);
+ }
+ } else {
+ dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+ dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
+
+ /* Resilver not initiated by attach */
+ if (vd->vdev_resilver_txg == 0)
+ return (B_TRUE);
+
+ /*
+	 * When a resilver is initiated, the scan assigns scn_max_txg the
+	 * highest txg value that exists in all DTLs. If this device's
+	 * max DTL is not part of this scan (i.e. it is not in the range
+	 * (scn_min_txg, scn_max_txg]), then it is not eligible for
+	 * excision.
+ */
+ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
+ ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
+ ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
+ ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Reassess DTLs after a config change or scrub completion. If txg == 0 no
+ * write operations will be issued to the pool.
+ */
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+ boolean_t scrub_done, boolean_t rebuild_done)
+{
+ spa_t *spa = vd->vdev_spa;
+ avl_tree_t reftree;
+ int minref;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_dtl_reassess(vd->vdev_child[c], txg,
+ scrub_txg, scrub_done, rebuild_done);
+
+ if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
+ return;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
+ boolean_t check_excise = B_FALSE;
+ boolean_t wasempty = B_TRUE;
+
+ mutex_enter(&vd->vdev_dtl_lock);
+
+ /*
+ * If requested, pretend the scan or rebuild completed cleanly.
+ */
+ if (zfs_scan_ignore_errors) {
+ if (scn != NULL)
+ scn->scn_phys.scn_errors = 0;
+ if (vr != NULL)
+ vr->vr_rebuild_phys.vrp_errors = 0;
+ }
+
+ if (scrub_txg != 0 &&
+ !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
+ wasempty = B_FALSE;
+ zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
+ "dtl:%llu/%llu errors:%llu",
+ (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
+ (u_longlong_t)scrub_txg, spa->spa_scrub_started,
+ (u_longlong_t)vdev_dtl_min(vd),
+ (u_longlong_t)vdev_dtl_max(vd),
+ (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
+ }
+
+ /*
+ * If we've completed a scrub/resilver or a rebuild cleanly
+ * then determine if this vdev should remove any DTLs. We
+ * only want to excise regions on vdevs that were available
+ * during the entire duration of this scan.
+ */
+ if (rebuild_done &&
+ vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
+ check_excise = B_TRUE;
+ } else {
+ if (spa->spa_scrub_started ||
+ (scn != NULL && scn->scn_phys.scn_errors == 0)) {
+ check_excise = B_TRUE;
+ }
+ }
+
+ if (scrub_txg && check_excise &&
+ vdev_dtl_should_excise(vd, rebuild_done)) {
+ /*
+ * We completed a scrub, resilver or rebuild up to
+ * scrub_txg. If we did it without rebooting, then
+ * the scrub dtl will be valid, so excise the old
+ * region and fold in the scrub dtl. Otherwise,
+ * leave the dtl as-is if there was an error.
+ *
+	 * There's a little trick here: to excise the beginning
+ * of the DTL_MISSING map, we put it into a reference
+ * tree and then add a segment with refcnt -1 that
+ * covers the range [0, scrub_txg). This means
+ * that each txg in that range has refcnt -1 or 0.
+ * We then add DTL_SCRUB with a refcnt of 2, so that
+ * entries in the range [0, scrub_txg) will have a
+ * positive refcnt -- either 1 or 2. We then convert
+ * the reference tree into the new DTL_MISSING map.
+ */
+ space_reftree_create(&reftree);
+ space_reftree_add_map(&reftree,
+ vd->vdev_dtl[DTL_MISSING], 1);
+ space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
+ space_reftree_add_map(&reftree,
+ vd->vdev_dtl[DTL_SCRUB], 2);
+ space_reftree_generate_map(&reftree,
+ vd->vdev_dtl[DTL_MISSING], 1);
+ space_reftree_destroy(&reftree);
+
+ if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
+ zfs_dbgmsg("update DTL_MISSING:%llu/%llu",
+ (u_longlong_t)vdev_dtl_min(vd),
+ (u_longlong_t)vdev_dtl_max(vd));
+ } else if (!wasempty) {
+ zfs_dbgmsg("DTL_MISSING is now empty");
+ }
+ }
+ range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
+ range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+ range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
+ if (scrub_done)
+ range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
+ range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
+ if (!vdev_readable(vd))
+ range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
+ else
+ range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+ range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
+
+ /*
+ * If the vdev was resilvering or rebuilding and no longer
+ * has any DTLs then reset the appropriate flag and dirty
+ * the top level so that we persist the change.
+ */
+ if (txg != 0 &&
+ range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
+ range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
+ if (vd->vdev_rebuild_txg != 0) {
+ vd->vdev_rebuild_txg = 0;
+ vdev_config_dirty(vd->vdev_top);
+ } else if (vd->vdev_resilver_txg != 0) {
+ vd->vdev_resilver_txg = 0;
+ vdev_config_dirty(vd->vdev_top);
+ }
+ }
+
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ if (txg != 0)
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
+ return;
+ }
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ for (int t = 0; t < DTL_TYPES; t++) {
+ /* account for child's outage in parent's missing map */
+		int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
+ if (t == DTL_SCRUB)
+ continue; /* leaf vdevs only */
+ if (t == DTL_PARTIAL)
+ minref = 1; /* i.e. non-zero */
+ else if (vdev_get_nparity(vd) != 0)
+ minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
+ else
+ minref = vd->vdev_children; /* any kind of mirror */
+ space_reftree_create(&reftree);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ mutex_enter(&cvd->vdev_dtl_lock);
+ space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
+ mutex_exit(&cvd->vdev_dtl_lock);
+ }
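+		/*
+		 * Keep only those txgs referenced by at least 'minref'
+		 * children; they become this vdev's DTL of type 't'.
+		 */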
+ space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
+ space_reftree_destroy(&reftree);
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
+}
+
+int
+vdev_dtl_load(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ range_tree_t *rt;
+ int error = 0;
+
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
+ ASSERT(vdev_is_concrete(vd));
+
+ error = space_map_open(&vd->vdev_dtl_sm, mos,
+ vd->vdev_dtl_object, 0, -1ULL, 0);
+ if (error)
+ return (error);
+ ASSERT(vd->vdev_dtl_sm != NULL);
+
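+		/*
+		 * Load the on-disk space map into a temporary range tree,
+		 * then merge its contents into DTL_MISSING under the DTL lock.
+		 */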
+ rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
+ if (error == 0) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ range_tree_walk(rt, range_tree_add,
+ vd->vdev_dtl[DTL_MISSING]);
+ mutex_exit(&vd->vdev_dtl_lock);
+ }
+
+ range_tree_vacate(rt, NULL, NULL);
+ range_tree_destroy(rt);
+
+ return (error);
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ error = vdev_dtl_load(vd->vdev_child[c]);
+ if (error != 0)
+ break;
+ }
+
+ return (error);
+}
+
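+/*
+ * Record this vdev's allocation bias (log, special, or dedup) in its
+ * top-level ZAP.  For special and dedup vdevs this also activates the
+ * allocation classes feature.
+ */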
+static void
+vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
+ const char *string;
+
+ ASSERT(alloc_bias != VDEV_BIAS_NONE);
+
+ string =
+ (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
+ (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
+ (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
+
+ ASSERT(string != NULL);
+ VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
+ 1, strlen(string) + 1, string, tx));
+
+ if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
+ spa_activate_allocation_classes(spa, tx);
+ }
+}
+
+void
+vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
+ VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
+ zapobj, tx));
+}
+
+uint64_t
+vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
+ DMU_OT_NONE, 0, tx);
+
+ ASSERT(zap != 0);
+ VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
+ zap, tx));
+
+ return (zap);
+}
+
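+/*
+ * Recursively create any missing leaf and top-level ZAPs in this vdev
+ * subtree.  ZAPs are not created for hole, missing, or root vdevs, nor
+ * for vdevs whose top-level vdev is being removed.
+ */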
+void
+vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
+{
+ if (vd->vdev_ops != &vdev_hole_ops &&
+ vd->vdev_ops != &vdev_missing_ops &&
+ vd->vdev_ops != &vdev_root_ops &&
+ !vd->vdev_top->vdev_removing) {
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
+ vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
+ }
+ if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
+ vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
+ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
+ vdev_zap_allocation_data(vd, tx);
+ }
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_construct_zaps(vd->vdev_child[i], tx);
+ }
+}
+
+static void
+vdev_dtl_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
+ objset_t *mos = spa->spa_meta_objset;
+ range_tree_t *rtsync;
+ dmu_tx_t *tx;
+ uint64_t object = space_map_object(vd->vdev_dtl_sm);
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_free(vd->vdev_dtl_sm, tx);
+ space_map_close(vd->vdev_dtl_sm);
+ vd->vdev_dtl_sm = NULL;
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ /*
+ * We only destroy the leaf ZAP for detached leaves or for
+ * removed log devices. Removed data devices handle leaf ZAP
+ * cleanup later, once cancellation is no longer possible.
+ */
+ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
+ vd->vdev_top->vdev_islog)) {
+ vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
+ vd->vdev_leaf_zap = 0;
+ }
+
+ dmu_tx_commit(tx);
+ return;
+ }
+
+ if (vd->vdev_dtl_sm == NULL) {
+ uint64_t new_object;
+
+ new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
+ 0, -1ULL, 0));
+ ASSERT(vd->vdev_dtl_sm != NULL);
+ }
+
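+	/*
+	 * Snapshot the current DTL_MISSING contents into a private range
+	 * tree under the DTL lock, then write the copy to the space map
+	 * without holding the lock.
+	 */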
+ rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ range_tree_walk(rt, range_tree_add, rtsync);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx);
+ space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
+ range_tree_vacate(rtsync, NULL, NULL);
+
+ range_tree_destroy(rtsync);
+
+ /*
+ * If the object for the space map has changed then dirty
+ * the top level so that we update the config.
+ */
+ if (object != space_map_object(vd->vdev_dtl_sm)) {
+ vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
+ "new object %llu", (u_longlong_t)txg, spa_name(spa),
+ (u_longlong_t)object,
+ (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
+ vdev_config_dirty(vd->vdev_top);
+ }
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Determine whether the specified vdev can be offlined/detached/removed
+ * without losing data.
+ */
+boolean_t
+vdev_dtl_required(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *tvd = vd->vdev_top;
+ uint8_t cant_read = vd->vdev_cant_read;
+ boolean_t required;
+
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ if (vd == spa->spa_root_vdev || vd == tvd)
+ return (B_TRUE);
+
+ /*
+ * Temporarily mark the device as unreadable, and then determine
+ * whether this results in any DTL outages in the top-level vdev.
+ * If not, we can safely offline/detach/remove the device.
+ */
+ vd->vdev_cant_read = B_TRUE;
+ vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
+ required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
+ vd->vdev_cant_read = cant_read;
+ vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
+
+ if (!required && zio_injection_enabled) {
+ required = !!zio_handle_device_injection(vd, NULL,
+ SET_ERROR(ECHILD));
+ }
+
+ return (required);
+}
+
+/*
+ * Determine if resilver is needed, and if so the txg range.
+ */
+boolean_t
+vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
+{
+ boolean_t needed = B_FALSE;
+ uint64_t thismin = UINT64_MAX;
+ uint64_t thismax = 0;
+
+ if (vd->vdev_children == 0) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
+ vdev_writeable(vd)) {
+
+ thismin = vdev_dtl_min(vd);
+ thismax = vdev_dtl_max(vd);
+ needed = B_TRUE;
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
+ } else {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ uint64_t cmin, cmax;
+
+ if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
+ thismin = MIN(thismin, cmin);
+ thismax = MAX(thismax, cmax);
+ needed = B_TRUE;
+ }
+ }
+ }
+
+ if (needed && minp) {
+ *minp = thismin;
+ *maxp = thismax;
+ }
+ return (needed);
+}
+
+/*
+ * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj
+ * will contain either the checkpoint spacemap object or zero if none exists.
+ * All other errors are returned to the caller.
+ */
+int
+vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_top_zap == 0) {
+ *sm_obj = 0;
+ return (0);
+ }
+
+ int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj);
+ if (error == ENOENT) {
+ *sm_obj = 0;
+ error = 0;
+ }
+
+ return (error);
+}
+
+int
+vdev_load(vdev_t *vd)
+{
+ int children = vd->vdev_children;
+ int error = 0;
+ taskq_t *tq = NULL;
+
+ /*
+ * It's only worthwhile to use the taskq for the root vdev, because the
+ * slow part is metaslab_init, and that only happens for top-level
+ * vdevs.
+ */
+ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
+ tq = taskq_create("vdev_load", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ }
+
+ /*
+ * Recursively load all children.
+ */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (tq == NULL || vdev_uses_zvols(cvd)) {
+ cvd->vdev_load_error = vdev_load(cvd);
+ } else {
+ VERIFY(taskq_dispatch(tq, vdev_load_child,
+ cvd, TQ_SLEEP) != TASKQID_INVALID);
+ }
+ }
+
+ if (tq != NULL) {
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ int error = vd->vdev_child[c]->vdev_load_error;
+
+ if (error != 0)
+ return (error);
+ }
+
+ vdev_set_deflate_ratio(vd);
+
+ /*
+	 * On the spa_load path, grab the allocation bias from our ZAP.
+ */
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ spa_t *spa = vd->vdev_spa;
+ char bias_str[64];
+
+ error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
+ bias_str);
+ if (error == 0) {
+ ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
+ vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
+ } else if (error != ENOENT) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
+ "failed [error=%d]", vd->vdev_top_zap, error);
+ return (error);
+ }
+ }
+
+ /*
+ * Load any rebuild state from the top-level vdev zap.
+ */
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ error = vdev_rebuild_load(vd);
+ if (error && error != ENOTSUP) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
+ "failed [error=%d]", error);
+ return (error);
+ }
+ }
+
+ /*
+ * If this is a top-level vdev, initialize its metaslabs.
+ */
+ if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
+
+ if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
+ "asize=%llu", (u_longlong_t)vd->vdev_ashift,
+ (u_longlong_t)vd->vdev_asize);
+ return (SET_ERROR(ENXIO));
+ }
+
+ error = vdev_metaslab_init(vd, 0);
+ if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
+ "[error=%d]", error);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (error);
+ }
+
+ uint64_t checkpoint_sm_obj;
+ error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
+ if (error == 0 && checkpoint_sm_obj != 0) {
+ objset_t *mos = spa_meta_objset(vd->vdev_spa);
+ ASSERT(vd->vdev_asize != 0);
+ ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
+
+ error = space_map_open(&vd->vdev_checkpoint_sm,
+ mos, checkpoint_sm_obj, 0, vd->vdev_asize,
+ vd->vdev_ashift);
+ if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: space_map_open "
+ "failed for checkpoint spacemap (obj %llu) "
+ "[error=%d]",
+ (u_longlong_t)checkpoint_sm_obj, error);
+ return (error);
+ }
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * Since the checkpoint_sm contains free entries
+			 * exclusively, we can use space_map_allocated() to
+ * indicate the cumulative checkpointed space that
+ * has been freed.
+ */
+ vd->vdev_stat.vs_checkpoint_space =
+ -space_map_allocated(vd->vdev_checkpoint_sm);
+ vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
+ vd->vdev_stat.vs_checkpoint_space;
+ } else if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
+ "checkpoint space map object from vdev ZAP "
+ "[error=%d]", error);
+ return (error);
+ }
+ }
+
+ /*
+ * If this is a leaf vdev, load its DTL.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
+ "[error=%d]", error);
+ return (error);
+ }
+
+ uint64_t obsolete_sm_object;
+ error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
+ if (error == 0 && obsolete_sm_object != 0) {
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ ASSERT(vd->vdev_asize != 0);
+ ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
+
+ if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
+ obsolete_sm_object, 0, vd->vdev_asize, 0))) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
+ "obsolete spacemap (obj %llu) [error=%d]",
+ (u_longlong_t)obsolete_sm_object, error);
+ return (error);
+ }
+ } else if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
+ "space map object from vdev ZAP [error=%d]", error);
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * The special vdev case is used for hot spares and l2cache devices. Its
+ * sole purpose is to set the vdev state for the associated vdev. To do this,
+ * we make sure that we can open the underlying device, then try to read the
+ * label, and make sure that the label is sane and that it hasn't been
+ * repurposed to another pool.
+ */
+int
+vdev_validate_aux(vdev_t *vd)
+{
+ nvlist_t *label;
+ uint64_t guid, version;
+ uint64_t state;
+
+ if (!vdev_readable(vd))
+ return (0);
+
+ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (-1);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
+ !SPA_VERSION_IS_SUPPORTED(version) ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
+ guid != vd->vdev_guid ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (-1);
+ }
+
+ /*
+ * We don't actually check the pool state here. If it's in fact in
+ * use by another pool, we update this fact on the fly when requested.
+ */
+ nvlist_free(label);
+ return (0);
+}
+
+static void
+vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
+{
+ objset_t *mos = spa_meta_objset(vd->vdev_spa);
+
+ if (vd->vdev_top_zap == 0)
+ return;
+
+ uint64_t object = 0;
+ int err = zap_lookup(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object);
+ if (err == ENOENT)
+ return;
+ VERIFY0(err);
+
+ VERIFY0(dmu_object_free(mos, object, tx));
+ VERIFY0(zap_remove(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
+}
+
+/*
+ * Free the objects used to store this vdev's spacemaps, and the array
+ * that points to them.
+ */
+void
+vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
+{
+ if (vd->vdev_ms_array == 0)
+ return;
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
+ size_t array_bytes = array_count * sizeof (uint64_t);
+ uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
+ VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
+ array_bytes, smobj_array, 0));
+
+ for (uint64_t i = 0; i < array_count; i++) {
+ uint64_t smobj = smobj_array[i];
+ if (smobj == 0)
+ continue;
+
+ space_map_free_obj(mos, smobj, tx);
+ }
+
+ kmem_free(smobj_array, array_bytes);
+ VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
+ vdev_destroy_ms_flush_data(vd, tx);
+ vd->vdev_ms_array = 0;
+}
+
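+/*
+ * Free the space maps and the top-level ZAP of an empty log device
+ * that is being removed.
+ */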
+static void
+vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(vd->vdev_islog);
+ ASSERT(vd == vd->vdev_top);
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ vdev_destroy_spacemaps(vd, tx);
+ if (vd->vdev_top_zap != 0) {
+ vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
+ vd->vdev_top_zap = 0;
+ }
+
+ dmu_tx_commit(tx);
+}
+
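+/*
+ * Complete metaslab syncing for this vdev at the end of the txg and,
+ * if any metaslabs were synced this txg, reassess the vdev's metaslab
+ * groups.
+ */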
+void
+vdev_sync_done(vdev_t *vd, uint64_t txg)
+{
+ metaslab_t *msp;
+ boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
+
+ ASSERT(vdev_is_concrete(vd));
+
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ != NULL)
+ metaslab_sync_done(msp, txg);
+
+ if (reassess) {
+ metaslab_sync_reassess(vd->vdev_mg);
+ if (vd->vdev_log_mg != NULL)
+ metaslab_sync_reassess(vd->vdev_log_mg);
+ }
+}
+
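+/*
+ * Sync this vdev's dirty metaslabs and DTLs for the given txg, creating
+ * the metaslab array object on first use and destroying the metadata of
+ * an empty log device that is being removed.
+ */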
+void
+vdev_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *lvd;
+ metaslab_t *msp;
+
+ ASSERT3U(txg, ==, spa->spa_syncing_txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+
+ vdev_indirect_sync_obsolete(vd, tx);
+
+ /*
+ * If the vdev is indirect, it can't have dirty
+ * metaslabs or DTLs.
+ */
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
+ ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
+ dmu_tx_commit(tx);
+ return;
+ }
+ }
+
+ ASSERT(vdev_is_concrete(vd));
+
+ if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
+ !vd->vdev_removing) {
+ ASSERT(vd == vd->vdev_top);
+ ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
+ vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
+ ASSERT(vd->vdev_ms_array != 0);
+ vdev_config_dirty(vd);
+ }
+
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
+ metaslab_sync(msp, txg);
+ (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
+ }
+
+ while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
+ vdev_dtl_sync(lvd, txg);
+
+ /*
+ * If this is an empty log device being removed, destroy the
+ * metadata associated with it.
+ */
+ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
+ vdev_remove_empty_log(vd, txg);
+
+ (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+ dmu_tx_commit(tx);
+}
+
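+/*
+ * Convert a physical block size to the amount of space allocated for it
+ * on this vdev, accounting for any layout overhead (e.g. RAID-Z parity
+ * and padding).
+ */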
+uint64_t
+vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize));
+}
+
+/*
+ * Mark the given vdev faulted. A faulted vdev behaves as if the device could
+ * not be opened, and no I/O is attempted.
+ */
+int
+vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
+{
+ vdev_t *vd, *tvd;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
+
+ tvd = vd->vdev_top;
+
+ /*
+	 * If the user did a 'zpool offline -f', make the fault persist across
+ * reboots.
+ */
+ if (aux == VDEV_AUX_EXTERNAL_PERSIST) {
+ /*
+ * There are two kinds of forced faults: temporary and
+ * persistent. Temporary faults go away at pool import, while
+ * persistent faults stay set. Both types of faults can be
+ * cleared with a zpool clear.
+ *
+ * We tell if a vdev is persistently faulted by looking at the
+ * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at
+ * import then it's a persistent fault. Otherwise, it's
+ * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external"
+ * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This
+ * tells vdev_config_generate() (which gets run later) to set
+ * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
+ */
+ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
+ vd->vdev_tmpoffline = B_FALSE;
+ aux = VDEV_AUX_EXTERNAL;
+ } else {
+ vd->vdev_tmpoffline = B_TRUE;
+ }
+
+ /*
+ * We don't directly use the aux state here, but if we do a
+ * vdev_reopen(), we need this value to be present to remember why we
+ * were faulted.
+ */
+ vd->vdev_label_aux = aux;
+
+ /*
+ * Faulted state takes precedence over degraded.
+ */
+ vd->vdev_delayed_close = B_FALSE;
+ vd->vdev_faulted = 1ULL;
+ vd->vdev_degraded = 0ULL;
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
+
+ /*
+ * If this device has the only valid copy of the data, then
+ * back off and simply mark the vdev as degraded instead.
+ */
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
+ vd->vdev_degraded = 1ULL;
+ vd->vdev_faulted = 0ULL;
+
+ /*
+ * If we reopen the device and it's not dead, only then do we
+ * mark it degraded.
+ */
+ vdev_reopen(tvd);
+
+ if (vdev_readable(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
+ }
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+/*
+ * Mark the given vdev degraded. A degraded vdev is purely an indication to the
+ * user that something is wrong. The vdev continues to operate as normal as far
+ * as I/O is concerned.
+ */
+int
+vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
+{
+ vdev_t *vd;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
+
+ /*
+ * If the vdev is already faulted, then don't do anything.
+ */
+ if (vd->vdev_faulted || vd->vdev_degraded)
+ return (spa_vdev_state_exit(spa, NULL, 0));
+
+ vd->vdev_degraded = 1ULL;
+ if (!vdev_is_dead(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
+ aux);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+/*
+ * Online the given vdev.
+ *
+ * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached
+ * spare device should be detached when the device finishes resilvering.
+ * Second, the online should be treated like a 'test' online case, so no FMA
+ * events are generated if the device fails to open.
+ */
+int
+vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
+{
+ vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
+ boolean_t wasoffline;
+ vdev_state_t oldstate;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
+
+ wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
+ oldstate = vd->vdev_state;
+
+ tvd = vd->vdev_top;
+ vd->vdev_offline = B_FALSE;
+ vd->vdev_tmpoffline = B_FALSE;
+ vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
+ vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
+ spa->spa_autoexpand);
+ vd->vdev_expansion_time = gethrestime_sec();
+ }
+
+ vdev_reopen(tvd);
+ vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
+
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = B_FALSE;
+ }
+
+ if (newstate)
+ *newstate = vd->vdev_state;
+ if ((flags & ZFS_ONLINE_UNSPARE) &&
+ !vdev_is_dead(vd) && vd->vdev_parent &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
+
+ if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (vd->vdev_aux)
+ return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ }
+
+ /* Restart initializing if necessary */
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (vdev_writeable(vd) &&
+ vd->vdev_initialize_thread == NULL &&
+ vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+ (void) vdev_initialize(vd);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ /*
+	 * Restart trimming if necessary.  We do not restart trimming for cache
+	 * devices here; for them, trimming is triggered asynchronously by
+	 * l2arc_rebuild_vdev() for the whole device, or by l2arc_evict() as it
+	 * evicts space for upcoming writes.
+ */
+ mutex_enter(&vd->vdev_trim_lock);
+ if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
+ vd->vdev_trim_thread == NULL &&
+ vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
+ (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
+ vd->vdev_trim_secure);
+ }
+ mutex_exit(&vd->vdev_trim_lock);
+
+ if (wasoffline ||
+ (oldstate < VDEV_STATE_DEGRADED &&
+ vd->vdev_state >= VDEV_STATE_DEGRADED))
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+static int
+vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+ vdev_t *vd, *tvd;
+ int error = 0;
+ uint64_t generation;
+ metaslab_group_t *mg;
+
+top:
+ spa_vdev_state_enter(spa, SCL_ALLOC);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
+
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
+ tvd = vd->vdev_top;
+ mg = tvd->vdev_mg;
+ generation = spa->spa_config_generation + 1;
+
+ /*
+ * If the device isn't already offline, try to offline it.
+ */
+ if (!vd->vdev_offline) {
+ /*
+ * If this device has the only valid copy of some data,
+ * don't allow it to be offlined. Log devices are always
+ * expendable.
+ */
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
+ vdev_dtl_required(vd))
+ return (spa_vdev_state_exit(spa, NULL,
+ SET_ERROR(EBUSY)));
+
+ /*
+ * If the top-level is a slog and it has had allocations
+ * then proceed. We check that the vdev's metaslab group
+ * is not NULL since it's possible that we may have just
+ * added this vdev but not yet initialized its metaslabs.
+ */
+ if (tvd->vdev_islog && mg != NULL) {
+ /*
+ * Prevent any future allocations.
+ */
+ ASSERT3P(tvd->vdev_log_mg, ==, NULL);
+ metaslab_group_passivate(mg);
+ (void) spa_vdev_state_exit(spa, vd, 0);
+
+ error = spa_reset_logs(spa);
+
+ /*
+ * If the log device was successfully reset but has
+ * checkpointed data, do not offline it.
+ */
+ if (error == 0 &&
+ tvd->vdev_checkpoint_sm != NULL) {
+ ASSERT3U(space_map_allocated(
+ tvd->vdev_checkpoint_sm), !=, 0);
+ error = ZFS_ERR_CHECKPOINT_EXISTS;
+ }
+
+ spa_vdev_state_enter(spa, SCL_ALLOC);
+
+ /*
+ * Check to see if the config has changed.
+ */
+ if (error || generation != spa->spa_config_generation) {
+ metaslab_group_activate(mg);
+ if (error)
+ return (spa_vdev_state_exit(spa,
+ vd, error));
+ (void) spa_vdev_state_exit(spa, vd, 0);
+ goto top;
+ }
+ ASSERT0(tvd->vdev_stat.vs_alloc);
+ }
+
+ /*
+ * Offline this device and reopen its top-level vdev.
+ * If the top-level vdev is a log device then just offline
+ * it. Otherwise, if this action results in the top-level
+ * vdev becoming unusable, undo it and fail the request.
+ */
+ vd->vdev_offline = B_TRUE;
+ vdev_reopen(tvd);
+
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
+ vdev_is_dead(tvd)) {
+ vd->vdev_offline = B_FALSE;
+ vdev_reopen(tvd);
+ return (spa_vdev_state_exit(spa, NULL,
+ SET_ERROR(EBUSY)));
+ }
+
+ /*
+ * Add the device back into the metaslab rotor so that
+ * once we online the device it's open for business.
+ */
+ if (tvd->vdev_islog && mg != NULL)
+ metaslab_group_activate(mg);
+ }
+
+ vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+int
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+ int error;
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+ error = vdev_offline_locked(spa, guid, flags);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
+}
+
+/*
+ * Clear the error counts associated with this vdev. Unlike vdev_online() and
+ * vdev_offline(), we assume the spa config is locked. We also clear all
+ * children. If 'vd' is NULL, then the user wants to clear all vdevs.
+ */
+void
+vdev_clear(spa_t *spa, vdev_t *vd)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ if (vd == NULL)
+ vd = rvd;
+
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
+ vd->vdev_stat.vs_slow_ios = 0;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_clear(spa, vd->vdev_child[c]);
+
+ /*
+ * It makes no sense to "clear" an indirect vdev.
+ */
+ if (!vdev_is_concrete(vd))
+ return;
+
+ /*
+ * If we're in the FAULTED state or have experienced failed I/O, then
+ * clear the persistent state and attempt to reopen the device. We
+ * also mark the vdev config dirty, so that the new faulted state is
+ * written out to disk.
+ */
+ if (vd->vdev_faulted || vd->vdev_degraded ||
+ !vdev_readable(vd) || !vdev_writeable(vd)) {
+ /*
+ * When reopening in response to a clear event, it may be due to
+ * a fmadm repair request. In this case, if the device is
+		 * still broken, we still want to post the ereport again.
+ */
+ vd->vdev_forcefault = B_TRUE;
+
+ vd->vdev_faulted = vd->vdev_degraded = 0ULL;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+ vd->vdev_stat.vs_aux = 0;
+
+ vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
+
+ vd->vdev_forcefault = B_FALSE;
+
+ if (vd != rvd && vdev_writeable(vd->vdev_top))
+ vdev_state_dirty(vd->vdev_top);
+
+ /* If a resilver isn't required, check if vdevs can be culled */
+ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
+ !dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
+ }
+
+ /*
+ * When clearing a FMA-diagnosed fault, we always want to
+ * unspare the device, as we assume that the original spare was
+ * done in response to the FMA fault.
+ */
+ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
+}
+
+boolean_t
+vdev_is_dead(vdev_t *vd)
+{
+ /*
+ * Holes and missing devices are always considered "dead".
+ * This simplifies the code since we don't have to check for
+ * these types of devices in the various code paths.
+ * Instead we rely on the fact that we skip over dead devices
+ * before issuing I/O to them.
+ */
+ return (vd->vdev_state < VDEV_STATE_DEGRADED ||
+ vd->vdev_ops == &vdev_hole_ops ||
+ vd->vdev_ops == &vdev_missing_ops);
+}
+
+boolean_t
+vdev_readable(vdev_t *vd)
+{
+ return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
+}
+
+boolean_t
+vdev_writeable(vdev_t *vd)
+{
+ return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
+ vdev_is_concrete(vd));
+}
+
+boolean_t
+vdev_allocatable(vdev_t *vd)
+{
+ uint64_t state = vd->vdev_state;
+
+ /*
+ * We currently allow allocations from vdevs which may be in the
+ * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
+ * fails to reopen then we'll catch it later when we're holding
+ * the proper locks. Note that we have to get the vdev state
+ * in a local variable because although it changes atomically,
+ * we're asking two separate questions about it.
+ */
+ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
+ !vd->vdev_cant_write && vdev_is_concrete(vd) &&
+ vd->vdev_mg->mg_initialized);
+}
+
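+/*
+ * Determine whether this vdev can service the given I/O, based on its
+ * current state and its cant_read/cant_write flags.
+ */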
+boolean_t
+vdev_accessible(vdev_t *vd, zio_t *zio)
+{
+ ASSERT(zio->io_vd == vd);
+
+ if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
+ return (B_FALSE);
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ return (!vd->vdev_cant_read);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ return (!vd->vdev_cant_write);
+
+ return (B_TRUE);
+}
+
+static void
+vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
+{
+ /*
+ * Exclude the dRAID spare when aggregating to avoid double counting
+ * the ops and bytes. These IOs are counted by the physical leaves.
+ */
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
+ for (int t = 0; t < VS_ZIO_TYPES; t++) {
+ vs->vs_ops[t] += cvs->vs_ops[t];
+ vs->vs_bytes[t] += cvs->vs_bytes[t];
+ }
+
+ cvs->vs_scan_removing = cvd->vdev_removing;
+}
+
+/*
+ * Get extended stats
+ */
+static void
+vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
+{
+ int t, b;
+ for (t = 0; t < ZIO_TYPES; t++) {
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++)
+ vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b];
+
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) {
+ vsx->vsx_total_histo[t][b] +=
+ cvsx->vsx_total_histo[t][b];
+ }
+ }
+
+ for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) {
+ vsx->vsx_queue_histo[t][b] +=
+ cvsx->vsx_queue_histo[t][b];
+ }
+ vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t];
+ vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t];
+
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++)
+ vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b];
+
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++)
+ vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b];
+ }
+
+}
+
+boolean_t
+vdev_is_spacemap_addressable(vdev_t *vd)
+{
+ if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+ return (B_TRUE);
+
+ /*
+ * If double-word space map entries are not enabled we assume
+ * 47 bits of the space map entry are dedicated to the entry's
+ * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+ * to calculate the maximum address that can be described by a
+ * space map entry for the given device.
+ */
+ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
+
+ if (shift >= 63) /* detect potential overflow */
+ return (B_TRUE);
+
+ return (vd->vdev_asize < (1ULL << shift));
+}
+
+/*
+ * Get statistics for the given vdev.
+ */
+static void
+vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
+{
+ int t;
+ /*
+ * If we're getting stats on the root vdev, aggregate the I/O counts
+ * over all top-level vdevs (i.e. the direct children of the root).
+ */
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ if (vs) {
+ memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
+ memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
+ }
+ if (vsx)
+ memset(vsx, 0, sizeof (*vsx));
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ vdev_stat_t *cvs = &cvd->vdev_stat;
+ vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;
+
+ vdev_get_stats_ex_impl(cvd, cvs, cvsx);
+ if (vs)
+ vdev_get_child_stat(cvd, vs, cvs);
+ if (vsx)
+ vdev_get_child_stat_ex(cvd, vsx, cvsx);
+ }
+ } else {
+ /*
+ * We're a leaf. Just copy our ZIO active queue stats in. The
+ * other leaf stats are updated in vdev_stat_update().
+ */
+ if (!vsx)
+ return;
+
+ memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
+
+ for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
+ vsx->vsx_active_queue[t] =
+ vd->vdev_queue.vq_class[t].vqc_active;
+ vsx->vsx_pend_queue[t] = avl_numnodes(
+ &vd->vdev_queue.vq_class[t].vqc_queued_tree);
+ }
+ }
+}
+
+void
+vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
+{
+ vdev_t *tvd = vd->vdev_top;
+ mutex_enter(&vd->vdev_stat_lock);
+ if (vs) {
+ bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+ vs->vs_state = vd->vdev_state;
+ vs->vs_rsize = vdev_get_min_asize(vd);
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vs->vs_rsize += VDEV_LABEL_START_SIZE +
+ VDEV_LABEL_END_SIZE;
+ /*
+ * Report initializing progress. Since we don't
+ * have the initializing locks held, this is only
+ * an estimate (although a fairly accurate one).
+ */
+ vs->vs_initialize_bytes_done =
+ vd->vdev_initialize_bytes_done;
+ vs->vs_initialize_bytes_est =
+ vd->vdev_initialize_bytes_est;
+ vs->vs_initialize_state = vd->vdev_initialize_state;
+ vs->vs_initialize_action_time =
+ vd->vdev_initialize_action_time;
+
+ /*
+ * Report manual TRIM progress. Since we don't have
+ * the manual TRIM locks held, this is only an
+			 * estimate (although a fairly accurate one).
+ */
+ vs->vs_trim_notsup = !vd->vdev_has_trim;
+ vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
+ vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
+ vs->vs_trim_state = vd->vdev_trim_state;
+ vs->vs_trim_action_time = vd->vdev_trim_action_time;
+
+ /* Set when there is a deferred resilver. */
+ vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
+ }
+
+ /*
+ * Report expandable space on top-level, non-auxiliary devices
+		 * only.  The expandable space is reported in terms of
+		 * metaslab-sized units, since that determines how much space the pool
+ * can expand.
+ */
+ if (vd->vdev_aux == NULL && tvd != NULL) {
+ vs->vs_esize = P2ALIGN(
+ vd->vdev_max_asize - vd->vdev_asize,
+ 1ULL << tvd->vdev_ms_shift);
+ }
+
+ vs->vs_configured_ashift = vd->vdev_top != NULL
+ ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
+ vs->vs_logical_ashift = vd->vdev_logical_ashift;
+ vs->vs_physical_ashift = vd->vdev_physical_ashift;
+
+ /*
+ * Report fragmentation and rebuild progress for top-level,
+ * non-auxiliary, concrete devices.
+ */
+ if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
+ vdev_is_concrete(vd)) {
+ /*
+ * The vdev fragmentation rating doesn't take into
+ * account the embedded slog metaslab (vdev_log_mg).
+ * Since it's only one metaslab, it would have a tiny
+ * impact on the overall fragmentation.
+ */
+ vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
+ vd->vdev_mg->mg_fragmentation : 0;
+ }
+ }
+
+ vdev_get_stats_ex_impl(vd, vs, vsx);
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+ return (vdev_get_stats_ex(vd, vs, NULL));
+}
+
+void
+vdev_clear_stats(vdev_t *vd)
+{
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_space = 0;
+ vd->vdev_stat.vs_dspace = 0;
+ vd->vdev_stat.vs_alloc = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_scan_stat_init(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_scan_stat_init(vd->vdev_child[c]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_scan_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_stat_update(zio_t *zio, uint64_t psize)
+{
+ spa_t *spa = zio->io_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
+ vdev_t *pvd;
+ uint64_t txg = zio->io_txg;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
+ zio_type_t type = zio->io_type;
+ int flags = zio->io_flags;
+
+ /*
+ * If this i/o is a gang leader, it didn't do any actual work.
+ */
+ if (zio->io_gang_tree)
+ return;
+
+ if (zio->io_error == 0) {
+ /*
+ * If this is a root i/o, don't count it -- we've already
+ * counted the top-level vdevs, and vdev_get_stats() will
+ * aggregate them when asked. This reduces contention on
+ * the root vdev_stat_lock and implicitly handles blocks
+ * that compress away to holes, for which there is no i/o.
+ * (Holes never create vdev children, so all the counters
+ * remain zero, which is what we want.)
+ *
+ * Note: this only applies to successful i/o (io_error == 0)
+ * because unlike i/o counts, errors are not additive.
+ * When reading a ditto block, for example, failure of
+ * one top-level vdev does not imply a root-level error.
+ */
+ if (vd == rvd)
+ return;
+
+ ASSERT(vd == zio->io_vd);
+
+ if (flags & ZIO_FLAG_IO_BYPASS)
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+
+ if (flags & ZIO_FLAG_IO_REPAIR) {
+ /*
+ * Repair is the result of a resilver issued by the
+ * scan thread (spa_sync).
+ */
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ dsl_scan_phys_t *scn_phys = &scn->scn_phys;
+ uint64_t *processed = &scn_phys->scn_processed;
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ atomic_add_64(processed, psize);
+ vs->vs_scan_processed += psize;
+ }
+
+ /*
+ * Repair is the result of a rebuild issued by the
+ * rebuild thread (vdev_rebuild_thread). To avoid
+ * double counting repaired bytes the virtual dRAID
+ * spare vdev is excluded from the processed bytes.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
+ vdev_t *tvd = vd->vdev_top;
+ vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
+
+ if (vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_ops != &vdev_draid_spare_ops) {
+ atomic_add_64(rebuilt, psize);
+ }
+ vs->vs_rebuild_processed += psize;
+ }
+
+ if (flags & ZIO_FLAG_SELF_HEAL)
+ vs->vs_self_healed += psize;
+ }
+
+ /*
+ * The bytes/ops/histograms are recorded at the leaf level and
+ * aggregated into the higher level vdevs in vdev_get_stats().
+ */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
+ zio_type_t vs_type = type;
+ zio_priority_t priority = zio->io_priority;
+
+ /*
+ * TRIM ops and bytes are reported to user space as
+ * ZIO_TYPE_IOCTL. This is done to preserve the
+ * vdev_stat_t structure layout for user space.
+ */
+ if (type == ZIO_TYPE_TRIM)
+ vs_type = ZIO_TYPE_IOCTL;
+
+ /*
+ * Solely for the purposes of 'zpool iostat -lqrw'
+			 * reporting, use the priority to categorize the IO.
+ * Only the following are reported to user space:
+ *
+ * ZIO_PRIORITY_SYNC_READ,
+ * ZIO_PRIORITY_SYNC_WRITE,
+ * ZIO_PRIORITY_ASYNC_READ,
+ * ZIO_PRIORITY_ASYNC_WRITE,
+ * ZIO_PRIORITY_SCRUB,
+ * ZIO_PRIORITY_TRIM.
+ */
+ if (priority == ZIO_PRIORITY_REBUILD) {
+ priority = ((type == ZIO_TYPE_WRITE) ?
+ ZIO_PRIORITY_ASYNC_WRITE :
+ ZIO_PRIORITY_SCRUB);
+ } else if (priority == ZIO_PRIORITY_INITIALIZING) {
+ ASSERT3U(type, ==, ZIO_TYPE_WRITE);
+ priority = ZIO_PRIORITY_ASYNC_WRITE;
+ } else if (priority == ZIO_PRIORITY_REMOVAL) {
+ priority = ((type == ZIO_TYPE_WRITE) ?
+ ZIO_PRIORITY_ASYNC_WRITE :
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+
+ vs->vs_ops[vs_type]++;
+ vs->vs_bytes[vs_type] += psize;
+
+ if (flags & ZIO_FLAG_DELEGATED) {
+ vsx->vsx_agg_histo[priority]
+ [RQ_HISTO(zio->io_size)]++;
+ } else {
+ vsx->vsx_ind_histo[priority]
+ [RQ_HISTO(zio->io_size)]++;
+ }
+
+ if (zio->io_delta && zio->io_delay) {
+ vsx->vsx_queue_histo[priority]
+ [L_HISTO(zio->io_delta - zio->io_delay)]++;
+ vsx->vsx_disk_histo[type]
+ [L_HISTO(zio->io_delay)]++;
+ vsx->vsx_total_histo[type]
+ [L_HISTO(zio->io_delta)]++;
+ }
+ }
+
+ mutex_exit(&vd->vdev_stat_lock);
+ return;
+ }
+
+ if (flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ /*
+ * If this is an I/O error that is going to be retried, then ignore the
+ * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
+ * hard errors, when in reality they can happen for any number of
+ * innocuous reasons (bus resets, MPxIO link failure, etc).
+ */
+ if (zio->io_error == EIO &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY))
+ return;
+
+ /*
+	 * Intent log writes won't propagate their error to the root
+ * I/O so don't mark these types of failures as pool-level
+ * errors.
+ */
+ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ return;
+
+ if (type == ZIO_TYPE_WRITE && txg != 0 &&
+ (!(flags & ZIO_FLAG_IO_REPAIR) ||
+ (flags & ZIO_FLAG_SCAN_THREAD) ||
+ spa->spa_claiming)) {
+ /*
+ * This is either a normal write (not a repair), or it's
+ * a repair induced by the scrub thread, or it's a repair
+ * made by zil_claim() during spa_load() in the first txg.
+ * In the normal case, we commit the DTL change in the same
+ * txg as the block was born. In the scrub-induced repair
+ * case, we know that scrubs run in first-pass syncing context,
+ * so we commit the DTL change in spa_syncing_txg(spa).
+ * In the zil_claim() case, we commit in spa_first_txg(spa).
+ *
+ * We currently do not make DTL entries for failed spontaneous
+ * self-healing writes triggered by normal (non-scrubbing)
+ * reads, because we have no transactional context in which to
+ * do so -- and it's not clear that it'd be desirable anyway.
+ */
+ if (vd->vdev_ops->vdev_op_leaf) {
+ uint64_t commit_txg = txg;
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ ASSERT(spa_sync_pass(spa) == 1);
+ vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
+ commit_txg = spa_syncing_txg(spa);
+ } else if (spa->spa_claiming) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ commit_txg = spa_first_txg(spa);
+ }
+ ASSERT(commit_txg >= spa_syncing_txg(spa));
+ if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
+ return;
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
+ }
+ if (vd != rvd)
+ vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
+ }
+}
+
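+/*
+ * Convert a raw space delta to deflated space using this vdev's deflate
+ * ratio, which compensates for psize-to-asize (e.g. RAID-Z) expansion.
+ */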
+int64_t
+vdev_deflated_space(vdev_t *vd, int64_t space)
+{
+ ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
+ ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
+
+ return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
+}
+
+/*
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
+ */
+void
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta)
+{
+ int64_t dspace_delta;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+	 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
+ * factor. We must calculate this here and not at the root vdev
+ * because the root vdev's psize-to-asize is simply the max of its
+ * children's, thus not accurate enough for us.
+ */
+ dspace_delta = vdev_deflated_space(vd, space_delta);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ /* ensure we won't underflow */
+ if (alloc_delta < 0) {
+ ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
+ }
+
+ vd->vdev_stat.vs_alloc += alloc_delta;
+ vd->vdev_stat.vs_space += space_delta;
+ vd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ /* every class but log contributes to root space stats */
+ if (vd->vdev_mg != NULL && !vd->vdev_islog) {
+ ASSERT(!vd->vdev_isl2cache);
+ mutex_enter(&rvd->vdev_stat_lock);
+ rvd->vdev_stat.vs_alloc += alloc_delta;
+ rvd->vdev_stat.vs_space += space_delta;
+ rvd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&rvd->vdev_stat_lock);
+ }
+ /* Note: metaslab_class_space_update moved to metaslab_space_update */
+}
+
+/*
+ * Mark a top-level vdev's config as dirty, placing it on the dirty list
+ * so that it will be written out next time the vdev configuration is synced.
+ * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
+ */
+void
+vdev_config_dirty(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int c;
+
+ ASSERT(spa_writeable(spa));
+
+ /*
+ * If this is an aux vdev (as with l2cache and spare devices), then we
+ * update the vdev config manually and set the sync flag.
+ */
+ if (vd->vdev_aux != NULL) {
+ spa_aux_vdev_t *sav = vd->vdev_aux;
+ nvlist_t **aux;
+ uint_t naux;
+
+ for (c = 0; c < sav->sav_count; c++) {
+ if (sav->sav_vdevs[c] == vd)
+ break;
+ }
+
+ if (c == sav->sav_count) {
+ /*
+ * We're being removed. There's nothing more to do.
+ */
+ ASSERT(sav->sav_sync == B_TRUE);
+ return;
+ }
+
+ sav->sav_sync = B_TRUE;
+
+ if (nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
+ }
+
+ ASSERT(c < naux);
+
+ /*
+		 * Setting the nvlist in the middle of the array is a little
+ * sketchy, but it will work.
+ */
+ nvlist_free(aux[c]);
+ aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
+
+ return;
+ }
+
+ /*
+ * The dirty list is protected by the SCL_CONFIG lock. The caller
+ * must either hold SCL_CONFIG as writer, or must be the sync thread
+ * (which holds SCL_CONFIG as reader). There's only one sync thread,
+ * so this is sufficient to ensure mutual exclusion.
+ */
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_CONFIG, RW_READER)));
+
+ if (vd == rvd) {
+ for (c = 0; c < rvd->vdev_children; c++)
+ vdev_config_dirty(rvd->vdev_child[c]);
+ } else {
+ ASSERT(vd == vd->vdev_top);
+
+ if (!list_link_active(&vd->vdev_config_dirty_node) &&
+ vdev_is_concrete(vd)) {
+ list_insert_head(&spa->spa_config_dirty_list, vd);
+ }
+ }
+}
+
+void
+vdev_config_clean(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_CONFIG, RW_READER)));
+
+ ASSERT(list_link_active(&vd->vdev_config_dirty_node));
+ list_remove(&spa->spa_config_dirty_list, vd);
+}
+
+/*
+ * Mark a top-level vdev's state as dirty, so that the next pass of
+ * spa_sync() can convert this into vdev_config_dirty(). We distinguish
+ * the state changes from larger config changes because they require
+ * much less locking, and are often needed for administrative actions.
+ */
+void
+vdev_state_dirty(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_writeable(spa));
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * The state list is protected by the SCL_STATE lock. The caller
+ * must either hold SCL_STATE as writer, or must be the sync thread
+ * (which holds SCL_STATE as reader). There's only one sync thread,
+ * so this is sufficient to ensure mutual exclusion.
+ */
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_STATE, RW_READER)));
+
+ if (!list_link_active(&vd->vdev_state_dirty_node) &&
+ vdev_is_concrete(vd))
+ list_insert_head(&spa->spa_state_dirty_list, vd);
+}
+
+void
+vdev_state_clean(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_STATE, RW_READER)));
+
+ ASSERT(list_link_active(&vd->vdev_state_dirty_node));
+ list_remove(&spa->spa_state_dirty_list, vd);
+}
+
+/*
+ * Propagate vdev state up from children to parent.
+ */
+void
+vdev_propagate_state(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int degraded = 0, faulted = 0;
+ int corrupted = 0;
+ vdev_t *child;
+
+ if (vd->vdev_children > 0) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ child = vd->vdev_child[c];
+
+ /*
+ * Don't factor holes or indirect vdevs into the
+ * decision.
+ */
+ if (!vdev_is_concrete(child))
+ continue;
+
+ if (!vdev_readable(child) ||
+ (!vdev_writeable(child) && spa_writeable(spa))) {
+ /*
+ * Root special: if there is a top-level log
+ * device, treat the root vdev as if it were
+ * degraded.
+ */
+ if (child->vdev_islog && vd == rvd)
+ degraded++;
+ else
+ faulted++;
+ } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
+ degraded++;
+ }
+
+ if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+ corrupted++;
+ }
+
+ vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+
+ /*
+ * Root special: if there is a top-level vdev that cannot be
+ * opened due to corrupted metadata, then propagate the root
+ * vdev's aux state as 'corrupt' rather than 'insufficient
+ * replicas'.
+ */
+ if (corrupted && vd == rvd &&
+ rvd->vdev_state == VDEV_STATE_CANT_OPEN)
+ vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ }
+
+ if (vd->vdev_parent)
+ vdev_propagate_state(vd->vdev_parent);
+}
+
+/*
+ * Set a vdev's state. If this is during an open, we don't update the parent
+ * state, because we're in the process of opening children depth-first.
+ * Otherwise, we propagate the change to the parent.
+ *
+ * If this routine places a device in a faulted state, an appropriate ereport is
+ * generated.
+ */
+void
+vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
+{
+ uint64_t save_state;
+ spa_t *spa = vd->vdev_spa;
+
+ if (state == vd->vdev_state) {
+ /*
+ * Since the vdev_offline() code path may already have the device
+ * in an offline state, we can miss a statechange event to
+ * OFFLINE. Check the previous state to catch this condition.
+ */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (state == VDEV_STATE_OFFLINE) &&
+ (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
+ /* post an offline state change */
+ zfs_post_state_change(spa, vd, vd->vdev_prevstate);
+ }
+ vd->vdev_stat.vs_aux = aux;
+ return;
+ }
+
+ save_state = vd->vdev_state;
+
+ vd->vdev_state = state;
+ vd->vdev_stat.vs_aux = aux;
+
+ /*
+ * If we are setting the vdev state to anything but an open state, then
+ * always close the underlying device unless the device has requested
+ * a delayed close (i.e. we're about to remove or fault the device).
+ * Otherwise, we keep accessible but invalid devices open forever.
+ * We don't call vdev_close() itself, because that implies some extra
+ * checks (offline, etc) that we don't want here. This is limited to
+ * leaf devices, because otherwise closing the device will affect other
+ * children.
+ */
+ if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
+ vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_close(vd);
+
+ if (vd->vdev_removed &&
+ state == VDEV_STATE_CANT_OPEN &&
+ (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
+ /*
+ * If the previous state is set to VDEV_STATE_REMOVED, then this
+ * device was previously marked removed and someone attempted to
+ * reopen it. If this failed due to a nonexistent device, then
+ * keep the device in the REMOVED state. We also let this be if
+ * it is one of our special test online cases, which is only
+ * attempting to online the device and shouldn't generate an FMA
+ * fault.
+ */
+ vd->vdev_state = VDEV_STATE_REMOVED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+ } else if (state == VDEV_STATE_REMOVED) {
+ vd->vdev_removed = B_TRUE;
+ } else if (state == VDEV_STATE_CANT_OPEN) {
+ /*
+ * If we fail to open a vdev during an import or recovery, we
+ * mark it as "not available", which signifies that it was
+ * never there to begin with. Failure to open such a device
+ * is not considered an error.
+ */
+ if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER) &&
+ vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_not_present = 1;
+
+ /*
+ * Post the appropriate ereport. If the 'prevstate' field is
+ * set to something other than VDEV_STATE_UNKNOWN, it indicates
+ * that this is part of a vdev_reopen(). In this case, we don't
+ * want to post the ereport if the device was already in the
+ * CANT_OPEN state beforehand.
+ *
+ * If the 'checkremove' flag is set, then this is an attempt to
+ * online the device in response to an insertion event. If we
+ * hit this case, then we have detected an insertion event for a
+ * faulted or offline device that wasn't in the removed state.
+ * In this scenario, we don't post an ereport because we are
+ * about to replace the device, or attempt an online with
+ * vdev_forcefault, which will generate the fault for us.
+ */
+ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
+ !vd->vdev_not_present && !vd->vdev_checkremove &&
+ vd != spa->spa_root_vdev) {
+ const char *class;
+
+ switch (aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
+ break;
+ case VDEV_AUX_CORRUPT_DATA:
+ class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
+ break;
+ case VDEV_AUX_NO_REPLICAS:
+ class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
+ break;
+ case VDEV_AUX_BAD_GUID_SUM:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
+ break;
+ case VDEV_AUX_TOO_SMALL:
+ class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
+ break;
+ case VDEV_AUX_BAD_LABEL:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
+ break;
+ case VDEV_AUX_BAD_ASHIFT:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
+ break;
+ default:
+ class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
+ }
+
+ (void) zfs_ereport_post(class, spa, vd, NULL, NULL,
+ save_state);
+ }
+
+ /* Erase any notion of persistent removed state */
+ vd->vdev_removed = B_FALSE;
+ } else {
+ vd->vdev_removed = B_FALSE;
+ }
+
+ /*
+ * Notify ZED of any significant state-change on a leaf vdev.
+ */
+ if (vd->vdev_ops->vdev_op_leaf) {
+ /* preserve original state from a vdev_reopen() */
+ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
+ (vd->vdev_prevstate != vd->vdev_state) &&
+ (save_state <= VDEV_STATE_CLOSED))
+ save_state = vd->vdev_prevstate;
+
+ /* filter out state change due to initial vdev_open */
+ if (save_state > VDEV_STATE_CLOSED)
+ zfs_post_state_change(spa, vd, save_state);
+ }
+
+ if (!isopen && vd->vdev_parent)
+ vdev_propagate_state(vd->vdev_parent);
+}
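+
+/*
+ * For example (hypothetical caller), marking a leaf faulted due to corrupt
+ * metadata and letting the change propagate to its parents would look like:
+ *
+ *     vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_CORRUPT_DATA);
+ *
+ * whereas vdev_open() passes isopen = B_TRUE so that parent state is only
+ * recomputed once the entire subtree has been opened.
+ */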
+
+boolean_t
+vdev_children_are_offline(vdev_t *vd)
+{
+ ASSERT(!vd->vdev_ops->vdev_op_leaf);
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Check the vdev configuration to ensure that it's capable of supporting
+ * a root pool. We do not support partial configuration.
+ */
+boolean_t
+vdev_is_bootable(vdev_t *vd)
+{
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ const char *vdev_type = vd->vdev_ops->vdev_op_type;
+
+ if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 ||
+ strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (!vdev_is_bootable(vd->vdev_child[c]))
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+boolean_t
+vdev_is_concrete(vdev_t *vd)
+{
+ vdev_ops_t *ops = vd->vdev_ops;
+ if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
+ ops == &vdev_missing_ops || ops == &vdev_root_ops) {
+ return (B_FALSE);
+ } else {
+ return (B_TRUE);
+ }
+}
+
+/*
+ * Determine if a log device has valid content. If the vdev was
+ * removed or faulted in the MOS config then we know that
+ * the content on the log device has already been written to the pool.
+ */
+boolean_t
+vdev_log_state_valid(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
+ !vd->vdev_removed)
+ return (B_TRUE);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_log_state_valid(vd->vdev_child[c]))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * Expand a vdev if possible.
+ */
+void
+vdev_expand(vdev_t *vd, uint64_t txg)
+{
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(vdev_is_concrete(vd));
+
+ vdev_set_deflate_ratio(vd);
+
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+ vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
+ VERIFY(vdev_metaslab_init(vd, txg) == 0);
+ vdev_config_dirty(vd);
+ }
+}
+
+/*
+ * Split a vdev.
+ */
+void
+vdev_split(vdev_t *vd)
+{
+ vdev_t *cvd, *pvd = vd->vdev_parent;
+
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ cvd = pvd->vdev_child[0];
+ if (pvd->vdev_children == 1) {
+ vdev_remove_parent(cvd);
+ cvd->vdev_splitting = B_TRUE;
+ }
+ vdev_propagate_state(cvd);
+}
+
+void
+vdev_deadman(vdev_t *vd, char *tag)
+{
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ vdev_deadman(cvd, tag);
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ mutex_enter(&vq->vq_lock);
+ if (avl_numnodes(&vq->vq_active_tree) > 0) {
+ spa_t *spa = vd->vdev_spa;
+ zio_t *fio;
+ uint64_t delta;
+
+ zfs_dbgmsg("slow vdev: %s has %d active IOs",
+ vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+
+ /*
+ * Look at the head of all the pending queues;
+ * if any I/O has been outstanding for longer than
+ * the spa_deadman_synctime, invoke the deadman logic.
+ */
+ fio = avl_first(&vq->vq_active_tree);
+ delta = gethrtime() - fio->io_timestamp;
+ if (delta > spa_deadman_synctime(spa))
+ zio_deadman(fio, tag);
+ }
+ mutex_exit(&vq->vq_lock);
+ }
+}
+
+void
+vdev_defer_resilver(vdev_t *vd)
+{
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ vd->vdev_resilver_deferred = B_TRUE;
+ vd->vdev_spa->spa_resilver_deferred = B_TRUE;
+}
+
+/*
+ * Clears the resilver deferred flag on all leaf devs under vd. Returns
+ * B_TRUE if we have devices that need to be resilvered and are available to
+ * accept resilver I/Os.
+ */
+boolean_t
+vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
+{
+ boolean_t resilver_needed = B_FALSE;
+ spa_t *spa = vd->vdev_spa;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ resilver_needed |= vdev_clear_resilver_deferred(cvd, tx);
+ }
+
+ if (vd == spa->spa_root_vdev &&
+ spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
+ spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+ vdev_config_dirty(vd);
+ spa->spa_resilver_deferred = B_FALSE;
+ return (resilver_needed);
+ }
+
+ if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+ !vd->vdev_ops->vdev_op_leaf)
+ return (resilver_needed);
+
+ vd->vdev_resilver_deferred = B_FALSE;
+
+ return (!vdev_is_dead(vd) && !vd->vdev_offline &&
+ vdev_resilver_needed(vd, NULL, NULL));
+}
+
+boolean_t
+vdev_xlate_is_empty(range_seg64_t *rs)
+{
+ return (rs->rs_start == rs->rs_end);
+}
+
+/*
+ * Translate a logical range to the first contiguous physical range for the
+ * specified vdev_t. This function is initially called with a leaf vdev and
+ * will walk each parent vdev until it reaches a top-level vdev. Once the
+ * top-level is reached the physical range is initialized and the recursive
+ * function begins to unwind. As it unwinds it calls the parent's vdev
+ * specific translation function to do the real conversion.
+ */
+void
+vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+{
+ /*
+ * Walk up the vdev tree
+ */
+ if (vd != vd->vdev_top) {
+ vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
+ remain_rs);
+ } else {
+ /*
+ * We've reached the top-level vdev, initialize the physical
+ * range to the logical range and set an empty remaining
+ * range then start to unwind.
+ */
+ physical_rs->rs_start = logical_rs->rs_start;
+ physical_rs->rs_end = logical_rs->rs_end;
+
+ remain_rs->rs_start = logical_rs->rs_start;
+ remain_rs->rs_end = logical_rs->rs_start;
+
+ return;
+ }
+
+ vdev_t *pvd = vd->vdev_parent;
+ ASSERT3P(pvd, !=, NULL);
+ ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
+
+ /*
+ * As this recursive function unwinds, translate the logical
+ * range into its physical and any remaining components by calling
+ * the vdev specific translate function.
+ */
+ range_seg64_t intermediate = { 0 };
+ pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
+
+ physical_rs->rs_start = intermediate.rs_start;
+ physical_rs->rs_end = intermediate.rs_end;
+}
+
+void
+vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
+ vdev_xlate_func_t *func, void *arg)
+{
+ range_seg64_t iter_rs = *logical_rs;
+ range_seg64_t physical_rs;
+ range_seg64_t remain_rs;
+
+ while (!vdev_xlate_is_empty(&iter_rs)) {
+
+ vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
+
+ /*
+ * With raidz and dRAID, it's possible that the logical range
+ * does not live on this leaf vdev. Only when there is a non-
+ * zero physical size do we call the provided function.
+ */
+ if (!vdev_xlate_is_empty(&physical_rs))
+ func(arg, &physical_rs);
+
+ iter_rs = remain_rs;
+ }
+}
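+
+/*
+ * Illustrative usage sketch: accumulate the total physical size backing a
+ * logical range on a leaf vdev. Only vdev_xlate_walk() and range_seg64_t
+ * are real interfaces here; the callback name and the leaf_vd/start/end
+ * variables are hypothetical.
+ *
+ *     static void
+ *     xlate_sum_cb(void *arg, range_seg64_t *physical_rs)
+ *     {
+ *             *(uint64_t *)arg += physical_rs->rs_end - physical_rs->rs_start;
+ *     }
+ *
+ *     uint64_t bytes = 0;
+ *     range_seg64_t logical = { .rs_start = start, .rs_end = end };
+ *     vdev_xlate_walk(leaf_vd, &logical, xlate_sum_cb, &bytes);
+ */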
+
+/*
+ * Look at the vdev tree and determine whether any devices are currently being
+ * replaced.
+ */
+boolean_t
+vdev_replace_in_progress(vdev_t *vdev)
+{
+ ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0);
+
+ if (vdev->vdev_ops == &vdev_replacing_ops)
+ return (B_TRUE);
+
+ /*
+ * A 'spare' vdev indicates that we have a replace in progress, unless
+ * it has exactly two children, and the second, the hot spare, has
+ * finished being resilvered.
+ */
+ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 ||
+ !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING)))
+ return (B_TRUE);
+
+ for (int i = 0; i < vdev->vdev_children; i++) {
+ if (vdev_replace_in_progress(vdev->vdev_child[i]))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+EXPORT_SYMBOL(vdev_fault);
+EXPORT_SYMBOL(vdev_degrade);
+EXPORT_SYMBOL(vdev_online);
+EXPORT_SYMBOL(vdev_offline);
+EXPORT_SYMBOL(vdev_clear);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW,
+ "Target number of metaslabs per top-level vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW,
+ "Default limit for metaslab size");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW,
+ "Minimum number of metaslabs per top-level vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW,
+ "Practical upper limit of total metaslabs per top-level vdev");
+
+ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
+ "Rate limit slow IO (delay) events to this many per second");
+
+ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
+ "Rate limit checksum events to this many checksum errors per second "
+ "(do not set below zed threshold).");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
+ "Ignore errors during resilver/scrub");
+
+ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
+ "Bypass vdev_validate()");
+
+ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
+ "Disable cache flushes");
+
+ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW,
+ "Minimum number of metaslabs required to dedicate one for log blocks");
+
+ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
+ param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
+ "Minimum ashift used when creating new top-level vdevs");
+
+ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
+ param_set_max_auto_ashift, param_get_ulong, ZMOD_RW,
+ "Maximum ashift used when optimizing for logical -> physical sector "
+ "size on new top-level vdevs");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_cache.c b/sys/contrib/openzfs/module/zfs/vdev_cache.c
new file mode 100644
index 000000000000..6e82184b800d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_cache.c
@@ -0,0 +1,437 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/kstat.h>
+#include <sys/abd.h>
+
+/*
+ * Virtual device read-ahead caching.
+ *
+ * This file implements a simple LRU read-ahead cache. When the DMU reads
+ * a given block, it will often want other, nearby blocks soon thereafter.
+ * We take advantage of this by reading a larger disk region and caching
+ * the result. In the best case, this can turn 128 back-to-back 512-byte
+ * reads into a single 64k read followed by 127 cache hits; this reduces
+ * latency dramatically. In the worst case, it can turn an isolated 512-byte
+ * read into a 64k read, which doesn't affect latency all that much but is
+ * terribly wasteful of bandwidth. A more intelligent version of the cache
+ * could keep track of access patterns and not do read-ahead unless it sees
+ * at least two temporally close I/Os to the same region. Currently, only
+ * metadata I/O is inflated. A further enhancement could take advantage of
+ * more semantic information about the I/O. And it could use something
+ * faster than an AVL tree; that was chosen solely for convenience.
+ *
+ * There are five cache operations: allocate, fill, read, write, evict.
+ *
+ * (1) Allocate. This reserves a cache entry for the specified region.
+ * We separate the allocate and fill operations so that multiple threads
+ * don't generate I/O for the same cache miss.
+ *
+ * (2) Fill. When the I/O for a cache miss completes, the fill routine
+ * places the data in the previously allocated cache entry.
+ *
+ * (3) Read. Read data from the cache.
+ *
+ * (4) Write. Update cache contents after write completion.
+ *
+ * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
+ * if the total cache size exceeds zfs_vdev_cache_size.
+ */
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * All i/os smaller than zfs_vdev_cache_max will be turned into
+ * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
+ * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
+ * vdev's vdev_cache.
+ *
+ * TODO: Note that with the current ZFS code, it turns out that the
+ * vdev cache is not helpful, and in some cases actually harmful. It
+ * is better if we disable this. Once some time has passed, we should
+ * actually remove this to simplify the code. For now we just disable
+ * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11
+ * has made these same changes.
+ */
+int zfs_vdev_cache_max = 1<<14; /* 16KB */
+int zfs_vdev_cache_size = 0;
+int zfs_vdev_cache_bshift = 16;
+
+#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */
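+
+/*
+ * Worked example (illustrative numbers, assuming the cache is enabled):
+ * with the default zfs_vdev_cache_bshift of 16, VCBS is 64KB. An 8KB
+ * metadata read at offset 0x31000 is smaller than zfs_vdev_cache_max and
+ * does not straddle a 64KB boundary, so it is expanded to a 64KB fill of
+ * the cache line at P2ALIGN(0x31000, VCBS) = 0x30000, and the caller's
+ * data is copied out of that line at phase P2PHASE(0x31000, VCBS) = 0x1000.
+ * Later reads within the same 64KB line are then satisfied from the cache.
+ */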
+
+kstat_t *vdc_ksp = NULL;
+
+typedef struct vdc_stats {
+ kstat_named_t vdc_stat_delegations;
+ kstat_named_t vdc_stat_hits;
+ kstat_named_t vdc_stat_misses;
+} vdc_stats_t;
+
+static vdc_stats_t vdc_stats = {
+ { "delegations", KSTAT_DATA_UINT64 },
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 }
+};
+
+#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64);
+
+static inline int
+vdev_cache_offset_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
+ const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
+
+ return (TREE_CMP(ve1->ve_offset, ve2->ve_offset));
+}
+
+static int
+vdev_cache_lastused_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
+ const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
+
+ int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused);
+ if (likely(cmp))
+ return (cmp);
+
+ /*
+ * Among equally old entries, sort by offset to ensure uniqueness.
+ */
+ return (vdev_cache_offset_compare(a1, a2));
+}
+
+/*
+ * Evict the specified entry from the cache.
+ */
+static void
+vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
+{
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
+ ASSERT3P(ve->ve_abd, !=, NULL);
+
+ avl_remove(&vc->vc_lastused_tree, ve);
+ avl_remove(&vc->vc_offset_tree, ve);
+ abd_free(ve->ve_abd);
+ kmem_free(ve, sizeof (vdev_cache_entry_t));
+}
+
+/*
+ * Allocate an entry in the cache. At this point we don't have the data,
+ * we're just creating a placeholder so that multiple threads don't all
+ * go off and read the same blocks.
+ */
+static vdev_cache_entry_t *
+vdev_cache_allocate(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
+ vdev_cache_entry_t *ve;
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+
+ if (zfs_vdev_cache_size == 0)
+ return (NULL);
+
+ /*
+ * If adding a new entry would exceed the cache size,
+ * evict the oldest entry (LRU).
+ */
+ if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
+ zfs_vdev_cache_size) {
+ ve = avl_first(&vc->vc_lastused_tree);
+ if (ve->ve_fill_io != NULL)
+ return (NULL);
+ ASSERT3U(ve->ve_hits, !=, 0);
+ vdev_cache_evict(vc, ve);
+ }
+
+ ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+ ve->ve_offset = offset;
+ ve->ve_lastused = ddi_get_lbolt();
+ ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
+
+ avl_add(&vc->vc_offset_tree, ve);
+ avl_add(&vc->vc_lastused_tree, ve);
+
+ return (ve);
+}
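+
+/*
+ * Sizing note (illustrative, since zfs_vdev_cache_size defaults to 0 and
+ * the cache is disabled): each entry pins a full VCBS buffer, so the check
+ * above compares (entries << zfs_vdev_cache_bshift) against
+ * zfs_vdev_cache_size. A hypothetical 10MB cache with the default 64KB
+ * line size would therefore hold roughly 160 entries before the oldest
+ * (LRU) entry is evicted to make room.
+ */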
+
+static void
+vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
+{
+ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
+
+ if (ve->ve_lastused != ddi_get_lbolt()) {
+ avl_remove(&vc->vc_lastused_tree, ve);
+ ve->ve_lastused = ddi_get_lbolt();
+ avl_add(&vc->vc_lastused_tree, ve);
+ }
+
+ ve->ve_hits++;
+ abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
+}
+
+/*
+ * Fill a previously allocated cache entry with data.
+ */
+static void
+vdev_cache_fill(zio_t *fio)
+{
+ vdev_t *vd = fio->io_vd;
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve = fio->io_private;
+ zio_t *pio;
+
+ ASSERT3U(fio->io_size, ==, VCBS);
+
+ /*
+ * Add data to the cache.
+ */
+ mutex_enter(&vc->vc_lock);
+
+ ASSERT3P(ve->ve_fill_io, ==, fio);
+ ASSERT3U(ve->ve_offset, ==, fio->io_offset);
+ ASSERT3P(ve->ve_abd, ==, fio->io_abd);
+
+ ve->ve_fill_io = NULL;
+
+ /*
+ * Even if this cache line was invalidated by a missed write update,
+ * any reads that were queued up before the missed update are still
+ * valid, so we can satisfy them from this line before we evict it.
+ */
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(fio, &zl)) != NULL)
+ vdev_cache_hit(vc, ve, pio);
+
+ if (fio->io_error || ve->ve_missed_update)
+ vdev_cache_evict(vc, ve);
+
+ mutex_exit(&vc->vc_lock);
+}
+
+/*
+ * Read data from the cache. Returns B_TRUE on a cache hit, B_FALSE on a miss.
+ */
+boolean_t
+vdev_cache_read(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, *ve_search;
+ uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
+ zio_t *fio;
+ uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS);
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
+ return (B_FALSE);
+
+ if (zio->io_size > zfs_vdev_cache_max)
+ return (B_FALSE);
+
+ /*
+ * If the I/O straddles two or more cache blocks, don't cache it.
+ */
+ if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
+ return (B_FALSE);
+
+ ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search = kmem_alloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+ ve_search->ve_offset = cache_offset;
+ ve = avl_find(&vc->vc_offset_tree, ve_search, NULL);
+ kmem_free(ve_search, sizeof (vdev_cache_entry_t));
+
+ if (ve != NULL) {
+ if (ve->ve_missed_update) {
+ mutex_exit(&vc->vc_lock);
+ return (B_FALSE);
+ }
+
+ if ((fio = ve->ve_fill_io) != NULL) {
+ zio_vdev_io_bypass(zio);
+ zio_add_child(zio, fio);
+ mutex_exit(&vc->vc_lock);
+ VDCSTAT_BUMP(vdc_stat_delegations);
+ return (B_TRUE);
+ }
+
+ vdev_cache_hit(vc, ve, zio);
+ zio_vdev_io_bypass(zio);
+
+ mutex_exit(&vc->vc_lock);
+ VDCSTAT_BUMP(vdc_stat_hits);
+ return (B_TRUE);
+ }
+
+ ve = vdev_cache_allocate(zio);
+
+ if (ve == NULL) {
+ mutex_exit(&vc->vc_lock);
+ return (B_FALSE);
+ }
+
+ fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
+ ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
+
+ ve->ve_fill_io = fio;
+ zio_vdev_io_bypass(zio);
+ zio_add_child(zio, fio);
+
+ mutex_exit(&vc->vc_lock);
+ zio_nowait(fio);
+ VDCSTAT_BUMP(vdc_stat_misses);
+
+ return (B_TRUE);
+}
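+
+/*
+ * To summarize the three paths above: a read that finds an entry still
+ * being filled is attached as a child of the in-flight fill I/O
+ * (vdc_stat_delegations), a read that finds completed data is copied out
+ * directly (vdc_stat_hits), and a read that finds nothing allocates a new
+ * entry and issues the VCBS-sized fill itself (vdc_stat_misses).
+ */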
+
+/*
+ * Update cache contents upon write completion.
+ */
+void
+vdev_cache_write(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t io_start = zio->io_offset;
+ uint64_t io_end = io_start + zio->io_size;
+ uint64_t min_offset = P2ALIGN(io_start, VCBS);
+ uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
+ avl_index_t where;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search.ve_offset = min_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
+
+ if (ve == NULL)
+ ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
+
+ while (ve != NULL && ve->ve_offset < max_offset) {
+ uint64_t start = MAX(ve->ve_offset, io_start);
+ uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
+
+ if (ve->ve_fill_io != NULL) {
+ ve->ve_missed_update = 1;
+ } else {
+ abd_copy_off(ve->ve_abd, zio->io_abd,
+ start - ve->ve_offset, start - io_start,
+ end - start);
+ }
+ ve = AVL_NEXT(&vc->vc_offset_tree, ve);
+ }
+ mutex_exit(&vc->vc_lock);
+}
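+
+/*
+ * Worked example for the loop above (illustrative offsets): a 128KB write
+ * at offset 96KB spans min_offset = 64KB to max_offset = 256KB. Cached
+ * lines at offsets 64KB, 128KB and 192KB all overlap it; each copy is
+ * clipped to the intersection, i.e. the last 32KB of the line at 64KB,
+ * all 64KB of the line at 128KB, and the first 32KB of the line at 192KB.
+ * Lines still being filled are instead flagged with ve_missed_update.
+ */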
+
+void
+vdev_cache_purge(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve;
+
+ mutex_enter(&vc->vc_lock);
+ while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+ vdev_cache_evict(vc, ve);
+ mutex_exit(&vc->vc_lock);
+}
+
+void
+vdev_cache_init(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_offset_node));
+
+ avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_lastused_node));
+}
+
+void
+vdev_cache_fini(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ vdev_cache_purge(vd);
+
+ avl_destroy(&vc->vc_offset_tree);
+ avl_destroy(&vc->vc_lastused_tree);
+
+ mutex_destroy(&vc->vc_lock);
+}
+
+void
+vdev_cache_stat_init(void)
+{
+ vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (vdc_ksp != NULL) {
+ vdc_ksp->ks_data = &vdc_stats;
+ kstat_install(vdc_ksp);
+ }
+}
+
+void
+vdev_cache_stat_fini(void)
+{
+ if (vdc_ksp != NULL) {
+ kstat_delete(vdc_ksp);
+ vdc_ksp = NULL;
+ }
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, INT, ZMOD_RW,
+ "Inflate reads small than max");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, INT, ZMOD_RD,
+ "Total size of the per-disk cache");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, INT, ZMOD_RW,
+ "Shift size to inflate reads too");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c
new file mode 100644
index 000000000000..a4f48cf744b0
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c
@@ -0,0 +1,2976 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 Intel Corporation.
+ * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/abd.h>
+#include <sys/zio.h>
+#include <sys/nvpair.h>
+#include <sys/zio_checksum.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <zfs_fletcher.h>
+
+#ifdef ZFS_DEBUG
+#include <sys/vdev.h> /* For vdev_xlate() in vdev_draid_io_verify() */
+#endif
+
+/*
+ * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is
+ * comprised of multiple raidz redundancy groups which are spread over the
+ * dRAID children. To ensure an even distribution, and avoid hot spots, a
+ * permutation mapping is applied to the order of the dRAID children.
+ * This mixing effectively distributes the parity columns evenly over all
+ * of the disks in the dRAID.
+ *
+ * This is beneficial because, when resilvering, all of the disks can
+ * participate, thereby increasing the available IOPS and bandwidth.
+ * Furthermore, by reserving a small fraction of each child's total capacity,
+ * virtual distributed spare disks can be created. These spares similarly
+ * benefit from the performance gains of spanning all of the children. The
+ * consequence is that resilvering to a distributed spare can substantially
+ * reduce the time required to restore full parity to a pool with a failed
+ * disk.
+ *
+ * === dRAID group layout ===
+ *
+ * First, let's define a "row" in the configuration to be a 16M chunk from
+ * each physical drive at the same offset. This is the minimum allowable
+ * size since it must be possible to store a full 16M block when there is
+ * only a single data column. Next, we define a "group" to be a set of
+ * sequential disks containing both the parity and data columns. We allow
+ * groups to span multiple rows in order to align any group size to any
+ * number of physical drives. Finally, a "slice" is comprised of the rows
+ * which contain the target number of groups. The permutation mappings
+ * are applied in a round robin fashion to each slice.
+ *
+ * Given D+P drives in a group (including parity drives) and C-S physical
+ * drives (not including the spare drives), we can distribute the groups
+ * across R rows without remainder by selecting the least common multiple
+ * of D+P and C-S as the number of groups; i.e. ngroups = LCM(D+P, C-S).
+ *
+ * In the example below, there are C=14 physical drives in the configuration
+ * with S=2 drives worth of spare capacity. Each group has a width of 9
+ * which includes D=8 data and P=1 parity drive. There are 4 groups and
+ * 3 rows per slice. Each group has a size of 144M (16M * 9) and a slice
+ * size is 576M (144M * 4). When allocating from a dRAID each group is
+ * filled before moving on to the next, as shown in slice0 below.
+ *
+ * data disks (8 data + 1 parity) spares (2)
+ * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * ^ | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0
+ * | +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * | | group 0 | group 1..| |
+ * | +-----------------------------------+-----------+-------|
+ * | | 0 1 2 3 4 5 6 7 8 | 36 37 38| | r
+ * | | 9 10 11 12 13 14 15 16 17| 45 46 47| | o
+ * | | 18 19 20 21 22 23 24 25 26| 54 55 56| | w
+ * | 27 28 29 30 31 32 33 34 35| 63 64 65| | 0
+ * s +-----------------------+-----------------------+-------+
+ * l | ..group 1 | group 2.. | |
+ * i +-----------------------+-----------------------+-------+
+ * c | 39 40 41 42 43 44| 72 73 74 75 76 77| | r
+ * e | 48 49 50 51 52 53| 81 82 83 84 85 86| | o
+ * 0 | 57 58 59 60 61 62| 90 91 92 93 94 95| | w
+ * | 66 67 68 69 70 71| 99 100 101 102 103 104| | 1
+ * | +-----------+-----------+-----------------------+-------+
+ * | |..group 2 | group 3 | |
+ * | +-----------+-----------+-----------------------+-------+
+ * | | 78 79 80|108 109 110 111 112 113 114 115 116| | r
+ * | | 87 88 89|117 118 119 120 121 122 123 124 125| | o
+ * | | 96 97 98|126 127 128 129 130 131 132 133 134| | w
+ * v |105 106 107|135 136 137 138 139 140 141 142 143| | 2
+ * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1
+ * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * l | group 4 | group 5..| | row 3
+ * i +-----------------------+-----------+-----------+-------|
+ * c | ..group 5 | group 6.. | | row 4
+ * e +-----------+-----------+-----------------------+-------+
+ * 1 |..group 6 | group 7 | | row 5
+ * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2
+ * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * l | group 8 | group 9..| | row 6
+ * i +-----------------------------------------------+-------|
+ * c | ..group 9 | group 10.. | | row 7
+ * e +-----------------------+-----------------------+-------+
+ * 2 |..group 10 | group 11 | | row 8
+ * +-----------+-----------------------------------+-------+
+ *
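+ * Working through the numbers in the example above: with D+P = 9 and
+ * C-S = 12, LCM(9, 12) = 36, which divides evenly into 36 / 9 = 4 groups
+ * and 36 / 12 = 3 rows, giving the 4-group by 3-row slice shown as slice0.
+ * Each group is 9 * 16M = 144M and each slice is 4 * 144M = 576M, matching
+ * the sizes quoted above.
+ *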
+ * This layout has several advantages over requiring that each row contain
+ * a whole number of groups.
+ *
+ * 1. The group count is not a relevant parameter when defining a dRAID
+ * layout. Only the group width is needed, and *all* groups will have
+ * the desired size.
+ *
+ * 2. All possible group widths (<= physical disk count) can be supported.
+ *
+ * 3. The logic within vdev_draid.c is simplified when the group width is
+ * the same for all groups (although some of the logic around computing
+ * permutation numbers and drive offsets is more complicated).
+ *
+ * N.B. The following array describes all valid dRAID permutation maps.
+ * Each row is used to generate a permutation map for a different number
+ * of children from a unique seed. The seeds were generated and carefully
+ * evaluated by the 'draid' utility in order to provide balanced mappings.
+ * In addition to the seed a checksum of the in-memory mapping is stored
+ * for verification.
+ *
+ * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed,
+ * with a given permutation map) is the ratio of the amounts of I/O that will
+ * be sent to the least and most busy disks when resilvering. The average
+ * imbalance ratio (of a given number of disks and permutation map) is the
+ * average of the ratios of all possible single and double disk failures.
+ *
+ * In order to achieve a low imbalance ratio the number of permutations in
+ * the mapping must be significantly larger than the number of children.
+ * For dRAID the number of permutations has been limited to 512 to minimize
+ * the map size. This does result in a gradually increasing imbalance ratio
+ * as seen in the table below. Increasing the number of permutations for
+ * larger child counts would reduce the imbalance ratio. However, in practice
+ * when there are a large number of children each child is responsible for
+ * fewer total IOs so it's less of a concern.
+ *
+ * Note these values are hard coded and must never be changed. Existing
+ * pools depend on the same mapping always being generated in order to
+ * read and write from the correct locations. Any change would make
+ * existing pools completely inaccessible.
+ */
+static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = {
+ { 2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d }, /* 1.000 */
+ { 3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 }, /* 1.000 */
+ { 4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 }, /* 1.000 */
+ { 5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 }, /* 1.010 */
+ { 6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 }, /* 1.031 */
+ { 7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee }, /* 1.043 */
+ { 8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 }, /* 1.059 */
+ { 9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 }, /* 1.056 */
+ { 10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 }, /* 1.072 */
+ { 11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c }, /* 1.083 */
+ { 12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e }, /* 1.097 */
+ { 13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 }, /* 1.100 */
+ { 14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 }, /* 1.121 */
+ { 15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 }, /* 1.103 */
+ { 16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 }, /* 1.111 */
+ { 17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe }, /* 1.133 */
+ { 18, 256, 0xaa549746b1cbb81c, 0x00000026f02494c9 }, /* 1.131 */
+ { 19, 256, 0x892e343f2f31d690, 0x00000029eb392835 }, /* 1.130 */
+ { 20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c }, /* 1.141 */
+ { 21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 }, /* 1.139 */
+ { 22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 }, /* 1.150 */
+ { 23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f }, /* 1.174 */
+ { 24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 }, /* 1.168 */
+ { 25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 }, /* 1.180 */
+ { 26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba }, /* 1.226 */
+ { 27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 }, /* 1.228 */
+ { 28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c }, /* 1.217 */
+ { 29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c }, /* 1.239 */
+ { 30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 }, /* 1.238 */
+ { 31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f }, /* 1.273 */
+ { 32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 }, /* 1.191 */
+ { 33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 }, /* 1.199 */
+ { 34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 }, /* 1.195 */
+ { 35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 }, /* 1.201 */
+ { 36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef }, /* 1.194 */
+ { 37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 }, /* 1.237 */
+ { 38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 }, /* 1.242 */
+ { 39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd }, /* 1.231 */
+ { 40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 }, /* 1.233 */
+ { 41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 }, /* 1.271 */
+ { 42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 }, /* 1.263 */
+ { 43, 512, 0xbaa5125faa781854, 0x000001c76789e278 }, /* 1.270 */
+ { 44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb }, /* 1.281 */
+ { 45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 }, /* 1.282 */
+ { 46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b }, /* 1.286 */
+ { 47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 }, /* 1.329 */
+ { 48, 512, 0xbecd9c2571312c5d, 0x000002320fe2872b }, /* 1.286 */
+ { 49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 }, /* 1.322 */
+ { 50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 }, /* 1.335 */
+ { 51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 }, /* 1.305 */
+ { 52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf }, /* 1.330 */
+ { 53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 }, /* 1.365 */
+ { 54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 }, /* 1.334 */
+ { 55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 }, /* 1.364 */
+ { 56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e }, /* 1.374 */
+ { 57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 }, /* 1.363 */
+ { 58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 }, /* 1.401 */
+ { 59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c }, /* 1.392 */
+ { 60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 }, /* 1.360 */
+ { 61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd }, /* 1.396 */
+ { 62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c }, /* 1.453 */
+ { 63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 }, /* 1.437 */
+ { 64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 }, /* 1.402 */
+ { 65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 }, /* 1.459 */
+ { 66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 }, /* 1.423 */
+ { 67, 512, 0x910b9714f698a877, 0x00000451ea65d5db }, /* 1.447 */
+ { 68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 }, /* 1.450 */
+ { 69, 512, 0x836d4968fbaa3706, 0x000004954068a380 }, /* 1.455 */
+ { 70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d }, /* 1.463 */
+ { 71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 }, /* 1.463 */
+ { 72, 512, 0x42763a680d5bed8e, 0x000005084275c680 }, /* 1.452 */
+ { 73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab }, /* 1.498 */
+ { 74, 512, 0x9fa08548b1621a44, 0x0000054708019247 }, /* 1.526 */
+ { 75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 }, /* 1.491 */
+ { 76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 }, /* 1.470 */
+ { 77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 }, /* 1.527 */
+ { 78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 }, /* 1.509 */
+ { 79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e }, /* 1.569 */
+ { 80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c }, /* 1.555 */
+ { 81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 }, /* 1.509 */
+ { 82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 }, /* 1.596 */
+ { 83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e }, /* 1.568 */
+ { 84, 512, 0xba02545069ddc6dc, 0x000006d19861364f }, /* 1.541 */
+ { 85, 512, 0x447c73192c35073e, 0x000006fce315ce35 }, /* 1.623 */
+ { 86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b }, /* 1.620 */
+ { 87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 }, /* 1.597 */
+ { 88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b }, /* 1.575 */
+ { 89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc }, /* 1.627 */
+ { 90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb }, /* 1.596 */
+ { 91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 }, /* 1.622 */
+ { 92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e }, /* 1.695 */
+ { 93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c }, /* 1.605 */
+ { 94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc }, /* 1.625 */
+ { 95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 }, /* 1.687 */
+ { 96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a }, /* 1.621 */
+ { 97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 }, /* 1.699 */
+ { 98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b }, /* 1.688 */
+ { 99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce }, /* 1.642 */
+ { 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc }, /* 1.683 */
+ { 101, 512, 0xf7d4dd8445b46765, 0x000009e5dc542259 }, /* 1.755 */
+ { 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 }, /* 1.692 */
+ { 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 }, /* 1.747 */
+ { 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 }, /* 1.751 */
+ { 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 }, /* 1.751 */
+ { 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f }, /* 1.726 */
+ { 107, 512, 0xca09c50ccee2d054, 0x00000b1c359c047d }, /* 1.788 */
+ { 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 }, /* 1.740 */
+ { 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 }, /* 1.780 */
+ { 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 }, /* 1.836 */
+ { 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 }, /* 1.778 */
+ { 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 }, /* 1.831 */
+ { 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df }, /* 1.825 */
+ { 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 }, /* 1.826 */
+ { 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 }, /* 1.843 */
+ { 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d }, /* 1.826 */
+ { 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b }, /* 1.803 */
+ { 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 }, /* 1.857 */
+ { 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 }, /* 1.877 */
+ { 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 }, /* 1.849 */
+ { 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d }, /* 1.867 */
+ { 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 }, /* 1.978 */
+ { 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d }, /* 1.947 */
+ { 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea }, /* 1.865 */
+ { 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f }, /* 1.881 */
+ { 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b }, /* 1.882 */
+ { 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e }, /* 1.867 */
+ { 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e }, /* 1.972 */
+ { 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 }, /* 1.896 */
+ { 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d }, /* 1.965 */
+ { 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 }, /* 1.963 */
+ { 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 }, /* 1.925 */
+ { 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 }, /* 1.862 */
+ { 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 }, /* 2.042 */
+ { 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 }, /* 1.935 */
+ { 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 }, /* 2.005 */
+ { 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c }, /* 2.041 */
+ { 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 }, /* 1.997 */
+ { 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 }, /* 1.996 */
+ { 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d }, /* 2.053 */
+ { 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a }, /* 1.971 */
+ { 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 }, /* 2.018 */
+ { 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd }, /* 1.961 */
+ { 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 }, /* 2.046 */
+ { 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb }, /* 1.968 */
+ { 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 }, /* 2.143 */
+ { 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 }, /* 2.064 */
+ { 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 }, /* 2.023 */
+ { 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c }, /* 2.136 */
+ { 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 }, /* 2.063 */
+ { 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 }, /* 1.974 */
+ { 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 }, /* 2.210 */
+ { 153, 512, 0xf7fd345307c2480e, 0x000016e251f28b6a }, /* 2.006 */
+ { 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 }, /* 2.193 */
+ { 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 }, /* 2.163 */
+ { 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc }, /* 2.046 */
+ { 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 }, /* 2.084 */
+ { 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 }, /* 2.264 */
+ { 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 }, /* 2.074 */
+ { 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 }, /* 2.282 */
+ { 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf }, /* 2.148 */
+ { 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 }, /* 2.355 */
+ { 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 }, /* 2.164 */
+ { 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a }, /* 2.393 */
+ { 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 }, /* 2.178 */
+ { 166, 512, 0x896967bf495c34d2, 0x00001afb8319b9fc }, /* 2.334 */
+ { 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b }, /* 2.266 */
+ { 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 }, /* 2.304 */
+ { 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d }, /* 2.218 */
+ { 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff }, /* 2.377 */
+ { 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 }, /* 2.155 */
+ { 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 }, /* 2.404 */
+ { 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 }, /* 2.205 */
+ { 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d }, /* 2.359 */
+ { 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 }, /* 2.158 */
+ { 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b }, /* 2.614 */
+ { 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc }, /* 2.239 */
+ { 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc }, /* 2.493 */
+ { 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c }, /* 2.327 */
+ { 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 }, /* 2.231 */
+ { 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c }, /* 2.237 */
+ { 182, 512, 0xe6035defea48f933, 0x00002038e3346658 }, /* 2.691 */
+ { 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e }, /* 2.170 */
+ { 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 }, /* 2.600 */
+ { 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc }, /* 2.391 */
+ { 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 }, /* 2.677 */
+ { 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c }, /* 2.410 */
+ { 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 }, /* 2.776 */
+ { 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 }, /* 2.266 */
+ { 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 }, /* 2.717 */
+ { 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c }, /* 2.474 */
+ { 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 }, /* 2.673 */
+ { 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 }, /* 2.420 */
+ { 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 }, /* 2.898 */
+ { 195, 512, 0x08d903b4daedc2e0, 0x0000250d1e15886c }, /* 2.363 */
+ { 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e }, /* 2.747 */
+ { 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 }, /* 2.531 */
+ { 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 }, /* 2.707 */
+ { 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 }, /* 2.315 */
+ { 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf }, /* 3.012 */
+ { 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 }, /* 2.378 */
+ { 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 }, /* 2.969 */
+ { 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d }, /* 2.594 */
+ { 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd }, /* 2.763 */
+ { 205, 512, 0xd47b160a1b1022c8, 0x00002903e8b52411 }, /* 2.457 */
+ { 206, 512, 0xc02fc96684715a16, 0x0000297515608601 }, /* 3.057 */
+ { 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 }, /* 2.590 */
+ { 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b }, /* 3.047 */
+ { 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 }, /* 2.676 */
+ { 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 }, /* 2.993 */
+ { 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 }, /* 2.457 */
+ { 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 }, /* 3.182 */
+ { 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 }, /* 2.563 */
+ { 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 }, /* 3.025 */
+ { 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f }, /* 2.730 */
+ { 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 }, /* 3.036 */
+ { 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 }, /* 2.722 */
+ { 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 }, /* 3.356 */
+ { 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 }, /* 2.697 */
+ { 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 }, /* 2.979 */
+ { 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 }, /* 2.858 */
+ { 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e }, /* 3.258 */
+ { 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 }, /* 2.693 */
+ { 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 }, /* 3.259 */
+ { 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c }, /* 2.733 */
+ { 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 }, /* 3.235 */
+ { 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 }, /* 2.983 */
+ { 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e }, /* 3.308 */
+ { 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 }, /* 2.715 */
+ { 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f }, /* 3.540 */
+ { 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 }, /* 2.779 */
+ { 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c }, /* 3.084 */
+ { 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc }, /* 2.987 */
+ { 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae }, /* 3.341 */
+ { 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 }, /* 2.793 */
+ { 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 }, /* 3.518 */
+ { 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 }, /* 2.962 */
+ { 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 }, /* 3.196 */
+ { 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 }, /* 2.914 */
+ { 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 }, /* 3.408 */
+ { 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 }, /* 2.903 */
+ { 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 }, /* 3.778 */
+ { 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c }, /* 3.026 */
+ { 244, 512, 0xc740263f0301efa8, 0x00003a147146512d }, /* 3.347 */
+ { 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d }, /* 3.212 */
+ { 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 }, /* 3.482 */
+ { 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 }, /* 3.146 */
+ { 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f }, /* 3.626 */
+ { 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 }, /* 2.952 */
+ { 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e }, /* 3.463 */
+ { 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 }, /* 3.131 */
+ { 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c }, /* 3.538 */
+ { 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac }, /* 2.974 */
+ { 254, 512, 0xc4674129750499de, 0x00003e99e86d3e95 }, /* 3.843 */
+ { 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 }, /* 3.088 */
+};
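+
+/*
+ * Reading the table above: for example, { 14, 256, 0x559b8c44065f8967,
+ * 0x00000016ab2ff079 } is the map used for a 14-child dRAID. 256
+ * permutation rows are generated from the 64-bit seed, the resulting
+ * in-memory array must fletcher-4 checksum to the fourth value, and the
+ * trailing comment (1.121) is the average imbalance ratio described above.
+ */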
+
+/*
+ * Verify the map is valid. Each device index must appear exactly
+ * once in every row, and the permutation array checksum must match.
+ */
+static int
+verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms,
+ uint64_t checksum)
+{
+ int countssz = sizeof (uint16_t) * children;
+ uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP);
+
+ for (int i = 0; i < nperms; i++) {
+ for (int j = 0; j < children; j++) {
+ uint8_t val = perms[(i * children) + j];
+
+ if (val >= children || counts[val] != i) {
+ kmem_free(counts, countssz);
+ return (EINVAL);
+ }
+
+ counts[val]++;
+ }
+ }
+
+ if (checksum != 0) {
+ int permssz = sizeof (uint8_t) * children * nperms;
+ zio_cksum_t cksum;
+
+ fletcher_4_native_varsize(perms, permssz, &cksum);
+
+ if (checksum != cksum.zc_word[0]) {
+ kmem_free(counts, countssz);
+ return (ECKSUM);
+ }
+ }
+
+ kmem_free(counts, countssz);
+
+ return (0);
+}
+
+/*
+ * Generate the permutation array for the draid_map_t. These maps control
+ * the placement of all data in a dRAID. Therefore it's critical that the
+ * seed always generates the same mapping. We provide our own pseudo-random
+ * number generator for this purpose.
+ */
+int
+vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp)
+{
+ VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN);
+ VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN);
+ VERIFY3U(map->dm_seed, !=, 0);
+ VERIFY3U(map->dm_nperms, !=, 0);
+ VERIFY3P(map->dm_perms, ==, NULL);
+
+#ifdef _KERNEL
+ /*
+ * The kernel code always provides both a map_seed and checksum.
+ * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide
+ * a zero checksum when generating new candidate maps.
+ */
+ VERIFY3U(map->dm_checksum, !=, 0);
+#endif
+ uint64_t children = map->dm_children;
+ uint64_t nperms = map->dm_nperms;
+ int rowsz = sizeof (uint8_t) * children;
+ int permssz = rowsz * nperms;
+ uint8_t *perms;
+
+ /* Allocate the permutation array */
+ perms = vmem_alloc(permssz, KM_SLEEP);
+
+ /* Set up an initial row with a known pattern */
+ uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP);
+ for (int i = 0; i < children; i++)
+ initial_row[i] = i;
+
+ uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed };
+ uint8_t *current_row, *previous_row = initial_row;
+
+ /*
+ * Perform a Fisher-Yates shuffle of each row using the previous
+ * row as the starting point. An initial_row with known pattern
+ * is used as the input for the first row.
+ */
+ for (int i = 0; i < nperms; i++) {
+ current_row = &perms[i * children];
+ memcpy(current_row, previous_row, rowsz);
+
+ for (int j = children - 1; j > 0; j--) {
+ uint64_t k = vdev_draid_rand(draid_seed) % (j + 1);
+ uint8_t val = current_row[j];
+ current_row[j] = current_row[k];
+ current_row[k] = val;
+ }
+
+ previous_row = current_row;
+ }
+
+ kmem_free(initial_row, rowsz);
+
+ int error = verify_perms(perms, children, nperms, map->dm_checksum);
+ if (error) {
+ vmem_free(perms, permssz);
+ return (error);
+ }
+
+ *permsp = perms;
+
+ return (0);
+}
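+/*
+ * Illustrative usage sketch (not part of the original code): how a caller
+ * might pair vdev_draid_lookup_map() below with vdev_draid_generate_perms()
+ * above. The 'children' value and the error handling are hypothetical.
+ *
+ *	const draid_map_t *map;
+ *	uint8_t *perms;
+ *
+ *	if (vdev_draid_lookup_map(children, &map) == 0 &&
+ *	    vdev_draid_generate_perms(map, &perms) == 0) {
+ *		... use the dm_nperms x dm_children permutation rows ...
+ *		vmem_free(perms, sizeof (uint8_t) *
+ *		    map->dm_children * map->dm_nperms);
+ *	}
+ *
+ * Because the generator is seeded from dm_seed, the same rows are produced
+ * on every import, which is what keeps the on-disk layout reproducible.
+ */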
+
+/*
+ * Lookup the fixed draid_map_t for the requested number of children.
+ */
+int
+vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp)
+{
+ for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) {
+ if (draid_maps[i].dm_children == children) {
+ *mapp = &draid_maps[i];
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+/*
+ * Lookup the permutation array and iteration id for the provided offset.
+ */
+static void
+vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex,
+ uint8_t **base, uint64_t *iter)
+{
+ uint64_t ncols = vdc->vdc_children;
+ uint64_t poff = pindex % (vdc->vdc_nperms * ncols);
+
+ *base = vdc->vdc_perms + (poff / ncols) * ncols;
+ *iter = poff % ncols;
+}
+
+static inline uint64_t
+vdev_draid_permute_id(vdev_draid_config_t *vdc,
+ uint8_t *base, uint64_t iter, uint64_t index)
+{
+ return ((base[index] + iter) % vdc->vdc_children);
+}
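+/*
+ * Worked example (illustrative only; the values are hypothetical and smaller
+ * than any real dRAID): with ncols = 5 and nperms = 64, a pindex of 137
+ * gives poff = 137 % 320 = 137, so base points at permutation row
+ * 137 / 5 = 27 and iter = 137 % 5 = 2. If that row is { 3, 0, 4, 1, 2 },
+ * then vdev_draid_permute_id() maps logical position 0 to child
+ * (3 + 2) % 5 = 0, position 1 to child 2, position 2 to child 1, and so on.
+ */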
+
+/*
+ * Return the asize, which is the psize rounded up to a full group width.
+ * i.e. vdev_draid_psize_to_asize().
+ */
+static uint64_t
+vdev_draid_asize(vdev_t *vd, uint64_t psize)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ uint64_t ashift = vd->vdev_ashift;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ uint64_t rows = ((psize - 1) / (vdc->vdc_ndata << ashift)) + 1;
+ uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift;
+
+ ASSERT3U(asize, !=, 0);
+ ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0);
+
+ return (asize);
+}
+
+/*
+ * Deflate the asize to the psize; this includes stripping parity.
+ */
+uint64_t
+vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT0(asize % vdc->vdc_groupwidth);
+
+ return ((asize / vdc->vdc_groupwidth) * vdc->vdc_ndata);
+}
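+/*
+ * Worked example (hypothetical draid2:8d layout, ashift = 12): a 20 KiB
+ * psize (5 x 4 KiB sectors) needs rows = ((20480 - 1) / (8 << 12)) + 1 = 1,
+ * so vdev_draid_asize() returns (1 * 10) << 12 = 40 KiB. Converting back,
+ * vdev_draid_asize_to_psize(40 KiB) = 40960 / 10 * 8 = 32 KiB, i.e. the
+ * psize rounded up to a full data stripe; the unused data sectors become
+ * the zero-filled skip sectors handled by the map allocation code below.
+ */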
+
+/*
+ * Convert a logical offset to the corresponding group number.
+ */
+static uint64_t
+vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (offset / vdc->vdc_groupsz);
+}
+
+/*
+ * Convert a group number to the logical starting offset for that group.
+ */
+static uint64_t
+vdev_draid_group_to_offset(vdev_t *vd, uint64_t group)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (group * vdc->vdc_groupsz);
+}
+
+
+static void
+vdev_draid_map_free_vsd(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ ASSERT0(rm->rm_freed);
+ rm->rm_freed = B_TRUE;
+
+ if (rm->rm_reports == 0) {
+ vdev_raidz_map_free(rm);
+ }
+}
+
+/*ARGSUSED*/
+static void
+vdev_draid_cksum_free(void *arg, size_t ignored)
+{
+ raidz_map_t *rm = arg;
+
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (--rm->rm_reports == 0 && rm->rm_freed)
+ vdev_raidz_map_free(rm);
+}
+
+static void
+vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
+{
+ raidz_map_t *rm = zcr->zcr_cbdata;
+ const size_t c = zcr->zcr_cbinfo;
+ uint64_t skip_size = zcr->zcr_sector;
+ uint64_t parity_size;
+ size_t x, offset, size;
+
+ if (good_data == NULL) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+ return;
+ }
+
+ /*
+ * Detailed cksum reporting is currently only supported for single
+ * row draid mappings, which covers the vast majority of zios. Only
+ * a dRAID zio which spans groups will have multiple rows.
+ */
+ if (rm->rm_nrows != 1) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+ return;
+ }
+
+ raidz_row_t *rr = rm->rm_row[0];
+ const abd_t *good = NULL;
+ const abd_t *bad = rr->rr_col[c].rc_abd;
+
+ if (c < rr->rr_firstdatacol) {
+ /*
+ * The first time through, calculate the parity blocks for
+ * the good data (this relies on the fact that the good
+ * data never changes for a given logical zio)
+ */
+ if (rr->rr_col[0].rc_gdata == NULL) {
+ abd_t *bad_parity[VDEV_DRAID_MAXPARITY];
+
+ /*
+ * Set up the rr_col[]s to generate the parity for
+ * good_data, first saving the parity bufs and
+ * replacing them with buffers to hold the result.
+ */
+ for (x = 0; x < rr->rr_firstdatacol; x++) {
+ bad_parity[x] = rr->rr_col[x].rc_abd;
+ rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata =
+ abd_alloc_sametype(rr->rr_col[x].rc_abd,
+ rr->rr_col[x].rc_size);
+ }
+
+ /*
+ * Fill in the data columns from good_data being
+ * careful to pad short columns and empty columns
+ * with a skip sector.
+ */
+ uint64_t good_size = abd_get_size((abd_t *)good_data);
+
+ offset = 0;
+ for (; x < rr->rr_cols; x++) {
+ abd_free(rr->rr_col[x].rc_abd);
+
+ if (offset == good_size) {
+ /* empty data column (small write) */
+ rr->rr_col[x].rc_abd =
+ abd_get_zeros(skip_size);
+ } else if (x < rr->rr_bigcols) {
+ /* this is a "big column" */
+ size = rr->rr_col[x].rc_size;
+ rr->rr_col[x].rc_abd =
+ abd_get_offset_size(
+ (abd_t *)good_data, offset, size);
+ offset += size;
+ } else {
+ /* short data column, add skip sector */
+ size = rr->rr_col[x].rc_size - skip_size;
+ rr->rr_col[x].rc_abd = abd_alloc(
+ rr->rr_col[x].rc_size, B_TRUE);
+ abd_copy_off(rr->rr_col[x].rc_abd,
+ (abd_t *)good_data, 0, offset,
+ size);
+ abd_zero_off(rr->rr_col[x].rc_abd,
+ size, skip_size);
+ offset += size;
+ }
+ }
+
+ /*
+ * Construct the parity from the good data.
+ */
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ /* restore everything back to its original state */
+ for (x = 0; x < rr->rr_firstdatacol; x++)
+ rr->rr_col[x].rc_abd = bad_parity[x];
+
+ offset = 0;
+ for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
+ abd_free(rr->rr_col[x].rc_abd);
+ rr->rr_col[x].rc_abd = abd_get_offset_size(
+ rr->rr_abd_copy, offset,
+ rr->rr_col[x].rc_size);
+ offset += rr->rr_col[x].rc_size;
+ }
+ }
+
+ ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL);
+ good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0,
+ rr->rr_col[c].rc_size);
+ } else {
+ /* adjust good_data to point at the start of our column */
+ parity_size = size = rr->rr_col[0].rc_size;
+ if (c >= rr->rr_bigcols) {
+ size -= skip_size;
+ zcr->zcr_length = size;
+ }
+
+ /* empty column */
+ if (size == 0) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_TRUE);
+ return;
+ }
+
+ offset = 0;
+ for (x = rr->rr_firstdatacol; x < c; x++) {
+ if (x < rr->rr_bigcols) {
+ offset += parity_size;
+ } else {
+ offset += parity_size - skip_size;
+ }
+ }
+
+ good = abd_get_offset_size((abd_t *)good_data, offset, size);
+ }
+
+ /* we drop the ereport if it ends up that the data was good */
+ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+ abd_free((abd_t *)good);
+}
+
+/*
+ * Invoked indirectly by zfs_ereport_start_checksum(), called
+ * below when our read operation fails completely. The main point
+ * is to keep a copy of everything we read from disk, so that at
+ * vdev_draid_cksum_finish() time we can compare it with the good data.
+ */
+static void
+vdev_draid_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
+{
+ size_t c = (size_t)(uintptr_t)arg;
+ raidz_map_t *rm = zio->io_vsd;
+
+ /* set up the report and bump the refcount */
+ zcr->zcr_cbdata = rm;
+ zcr->zcr_cbinfo = c;
+ zcr->zcr_finish = vdev_draid_cksum_finish;
+ zcr->zcr_free = vdev_draid_cksum_free;
+
+ rm->rm_reports++;
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (rm->rm_row[0]->rr_abd_copy != NULL)
+ return;
+
+ /*
+ * It's the first time we're called for this raidz_map_t, so we need
+ * to copy the data aside; there's no guarantee that our zio's buffer
+ * won't be re-used for something else.
+ *
+ * Our parity data is already in separate buffers, so there's no need
+ * to copy them. Furthermore, all columns should have been expanded
+ * by vdev_draid_map_alloc_empty() when attempting reconstruction.
+ */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ size_t offset = 0;
+ size_t size = 0;
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ ASSERT3U(rr->rr_col[c].rc_size, ==,
+ rr->rr_col[0].rc_size);
+ size += rr->rr_col[c].rc_size;
+ }
+
+ rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE);
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+ abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy,
+ offset, col->rc_size);
+
+ abd_copy(tmp, col->rc_abd, col->rc_size);
+ abd_free(col->rc_abd);
+
+ col->rc_abd = tmp;
+ offset += col->rc_size;
+ }
+ ASSERT3U(offset, ==, size);
+ }
+}
+
+const zio_vsd_ops_t vdev_draid_vsd_ops = {
+ .vsd_free = vdev_draid_map_free_vsd,
+ .vsd_cksum_report = vdev_draid_cksum_report
+};
+
+/*
+ * Full stripe writes. When writing, all columns (D+P) are required. Parity
+ * is calculated over all the columns, including empty zero filled sectors,
+ * and each is written to disk. While only the data columns are needed for
+ * a normal read, all of the columns are required for reconstruction when
+ * performing a sequential resilver.
+ *
+ * For "big columns" it's sufficient to map the correct range of the zio ABD.
+ * Partial columns require allocating a gang ABD in order to zero fill the
+ * empty sectors. When the column is empty a zero filled sector must be
+ * mapped. In all cases the data ABDs must be the same size as the parity
+ * ABDs (e.g. rc->rc_size == parity_size).
+ */
+static void
+vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+ uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+ uint64_t parity_size = rr->rr_col[0].rc_size;
+ uint64_t abd_off = abd_offset;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3U(parity_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size == 0) {
+ /* empty data column (small write), add a skip sector */
+ ASSERT3U(skip_size, ==, parity_size);
+ rc->rc_abd = abd_get_zeros(skip_size);
+ } else if (rc->rc_size == parity_size) {
+ /* this is a "big column" */
+ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
+ zio->io_abd, abd_off, rc->rc_size);
+ } else {
+ /* short data column, add a skip sector */
+ ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
+ rc->rc_abd = abd_alloc_gang();
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ zio->io_abd, abd_off, rc->rc_size), B_TRUE);
+ abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size),
+ B_TRUE);
+ }
+
+ ASSERT3U(abd_get_size(rc->rc_abd), ==, parity_size);
+
+ abd_off += rc->rc_size;
+ rc->rc_size = parity_size;
+ }
+
+ IMPLY(abd_offset != 0, abd_off == zio->io_size);
+}
+
+/*
+ * Scrub/resilver reads. In order to store the contents of the skip sectors
+ * an additional ABD is allocated. The columns are handled in the same way
+ * as a full stripe write except instead of using the zero ABD the newly
+ * allocated skip ABD is used to back the skip sectors. In all cases the
+ * data ABD must be the same size as the parity ABDs.
+ */
+static void
+vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+ uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+ uint64_t parity_size = rr->rr_col[0].rc_size;
+ uint64_t abd_off = abd_offset;
+ uint64_t skip_off = 0;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ ASSERT3P(rr->rr_abd_empty, ==, NULL);
+
+ if (rr->rr_nempty > 0) {
+ rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
+ B_FALSE);
+ }
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size == 0) {
+ /* empty data column (small read), add a skip sector */
+ ASSERT3U(skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
+ skip_off, skip_size);
+ skip_off += skip_size;
+ } else if (rc->rc_size == parity_size) {
+ /* this is a "big column" */
+ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
+ zio->io_abd, abd_off, rc->rc_size);
+ } else {
+ /* short data column, add a skip sector */
+ ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ rc->rc_abd = abd_alloc_gang();
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ zio->io_abd, abd_off, rc->rc_size), B_TRUE);
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
+ skip_off += skip_size;
+ }
+
+ uint64_t abd_size = abd_get_size(rc->rc_abd);
+ ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
+
+ /*
+ * Increase rc_size so the skip ABD is included in subsequent
+ * parity calculations.
+ */
+ abd_off += rc->rc_size;
+ rc->rc_size = abd_size;
+ }
+
+ IMPLY(abd_offset != 0, abd_off == zio->io_size);
+ ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
+}
+
+/*
+ * Normal reads. In this common case only the columns containing data
+ * are read into the zio ABDs. Neither the parity columns nor the empty skip
+ * sectors are read unless the checksum fails verification, in which case
+ * vdev_raidz_read_all() will call vdev_draid_map_alloc_empty() to expand
+ * the raid map in order to allow reconstruction using the parity data and
+ * skip sectors.
+ */
+static void
+vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+ uint64_t abd_off = abd_offset;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size > 0) {
+ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
+ zio->io_abd, abd_off, rc->rc_size);
+ abd_off += rc->rc_size;
+ }
+ }
+
+ IMPLY(abd_offset != 0, abd_off == zio->io_size);
+}
+
+/*
+ * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key
+ * difference is that an ABD is allocated to back skip sectors so they may
+ * be read into memory, verified, and repaired if needed.
+ */
+void
+vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr)
+{
+ uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+ uint64_t parity_size = rr->rr_col[0].rc_size;
+ uint64_t skip_off = 0;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ ASSERT3P(rr->rr_abd_empty, ==, NULL);
+
+ if (rr->rr_nempty > 0) {
+ rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
+ B_FALSE);
+ }
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size == 0) {
+ /* empty data column (small read), add a skip sector */
+ ASSERT3U(skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ ASSERT3P(rc->rc_abd, ==, NULL);
+ rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
+ skip_off, skip_size);
+ skip_off += skip_size;
+ } else if (rc->rc_size == parity_size) {
+ /* this is a "big column", nothing to add */
+ ASSERT3P(rc->rc_abd, !=, NULL);
+ } else {
+ /* short data column, add a skip sector */
+ ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ ASSERT3P(rc->rc_abd, !=, NULL);
+ ASSERT(!abd_is_gang(rc->rc_abd));
+ abd_t *read_abd = rc->rc_abd;
+ rc->rc_abd = abd_alloc_gang();
+ abd_gang_add(rc->rc_abd, read_abd, B_TRUE);
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
+ skip_off += skip_size;
+ }
+
+ /*
+ * Increase rc_size so the empty ABD is included in subsequent
+ * parity calculations.
+ */
+ rc->rc_size = parity_size;
+ }
+
+ ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
+}
+
+/*
+ * Given a logical address within a dRAID configuration, return the physical
+ * address on the first drive in the group that this address maps to
+ * (at position 'start' in permutation number 'perm').
+ */
+static uint64_t
+vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset,
+ uint64_t *perm, uint64_t *start)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ /* b is the dRAID (parent) sector offset. */
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t b_offset = logical_offset >> ashift;
+
+ /*
+ * The height of a row in units of the vdev's minimum sector size.
+ * This is the amount of data written to each disk of each group
+ * in a given permutation.
+ */
+ uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift;
+
+ /*
+ * We cycle through a disk permutation every groupsz * ngroups chunk
+ * of address space. Note that ngroups * groupsz must be a multiple
+ * of the number of data drives (ndisks) in order to guarantee
+ * alignment. So, for example, if our row height is 16MB, our group
+ * size is 10, and there are 13 data drives in the draid, then ngroups
+ * will be 13, we will change permutation every 2.08GB and each
+ * disk will have 160MB of data per chunk.
+ */
+ uint64_t groupwidth = vdc->vdc_groupwidth;
+ uint64_t ngroups = vdc->vdc_ngroups;
+ uint64_t ndisks = vdc->vdc_ndisks;
+
+ /*
+ * groupstart is where the group this IO will land in "starts" in
+ * the permutation array.
+ */
+ uint64_t group = logical_offset / vdc->vdc_groupsz;
+ uint64_t groupstart = (group * groupwidth) % ndisks;
+ ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart);
+ *start = groupstart;
+
+ /* b_offset is the sector offset within a group chunk */
+ b_offset = b_offset % (rowheight_sectors * groupwidth);
+ ASSERT0(b_offset % groupwidth);
+
+ /*
+ * Find the starting byte offset on each child vdev:
+ * - within a permutation there are ngroups groups spread over the
+ * rows, where each row covers a slice portion of the disk
+ * - each permutation has (groupwidth * ngroups) / ndisks rows
+ * - so each permutation covers rows * slice portion of the disk
+ * - so we need to find the row where this IO group target begins
+ */
+ *perm = group / ngroups;
+ uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) +
+ (((group % ngroups) * groupwidth) / ndisks);
+
+ return (((rowheight_sectors * row) +
+ (b_offset / groupwidth)) << ashift);
+}
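+/*
+ * Worked example, following the hypothetical layout in the comment above
+ * (16 MiB row height, groupwidth = 10, ndisks = 13, ngroups = 13, so
+ * groupsz = 160 MiB): for logical_offset = 480 MiB (the start of group 3),
+ * groupstart = (3 * 10) % 13 = 4, perm = 3 / 13 = 0, and
+ * row = 0 + (3 * 10) / 13 = 2, so the returned physical offset is
+ * 2 * 16 MiB = 32 MiB. The group occupies permuted positions 4-12 at that
+ * offset and wraps one column to position 0, which the map allocation code
+ * below places one row height further in (48 MiB).
+ */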
+
+static uint64_t
+vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
+ uint64_t abd_offset, uint64_t abd_size)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t io_size = abd_size;
+ uint64_t io_asize = vdev_draid_asize(vd, io_size);
+ uint64_t group = vdev_draid_offset_to_group(vd, io_offset);
+ uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1);
+
+ /*
+ * Limit the io_size to the space remaining in the group. A second
+ * row in the raidz_map_t is created for the remainder.
+ */
+ if (io_offset + io_asize > start_offset) {
+ io_size = vdev_draid_asize_to_psize(vd,
+ start_offset - io_offset);
+ }
+
+ /*
+ * At most a block may span the logical end of one group and the start
+ * of the next group. Therefore, at the end of a group the io_size must
+ * span the group width evenly and the remainder must be aligned to the
+ * start of the next group.
+ */
+ IMPLY(abd_offset == 0 && io_size < zio->io_size,
+ (io_asize >> ashift) % vdc->vdc_groupwidth == 0);
+ IMPLY(abd_offset != 0,
+ vdev_draid_group_to_offset(vd, group) == io_offset);
+
+ /* Lookup starting byte offset on each child vdev */
+ uint64_t groupstart, perm;
+ uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+ io_offset, &perm, &groupstart);
+
+ /*
+ * If there are fewer than groupwidth drives available after the group
+ * start, the group is going to wrap onto the next row. 'wrap' is the
+ * group disk number that starts on the next row.
+ */
+ uint64_t ndisks = vdc->vdc_ndisks;
+ uint64_t groupwidth = vdc->vdc_groupwidth;
+ uint64_t wrap = groupwidth;
+
+ if (groupstart + groupwidth > ndisks)
+ wrap = ndisks - groupstart;
+
+ /* The io size in units of the vdev's minimum sector size. */
+ const uint64_t psize = io_size >> ashift;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ */
+ uint64_t q = psize / vdc->vdc_ndata;
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ uint64_t r = psize - q * vdc->vdc_ndata;
+
+ /* The number of "big columns" - those which contain remainder data. */
+ uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity);
+ ASSERT3U(bc, <, groupwidth);
+
+ /* The total number of data and parity sectors for this I/O. */
+ uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 0 : 1)));
+
+ raidz_row_t *rr;
+ rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP);
+ rr->rr_cols = groupwidth;
+ rr->rr_scols = groupwidth;
+ rr->rr_bigcols = bc;
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+ rr->rr_firstdatacol = vdc->vdc_nparity;
+ rr->rr_abd_copy = NULL;
+ rr->rr_abd_empty = NULL;
+#ifdef ZFS_DEBUG
+ rr->rr_offset = io_offset;
+ rr->rr_size = io_size;
+#endif
+ *rrp = rr;
+
+ uint8_t *base;
+ uint64_t iter, asize = 0;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+ for (uint64_t i = 0; i < groupwidth; i++) {
+ raidz_col_t *rc = &rr->rr_col[i];
+ uint64_t c = (groupstart + i) % ndisks;
+
+ /* increment the offset if we wrap to the next row */
+ if (i == wrap)
+ physical_offset += VDEV_DRAID_ROWHEIGHT;
+
+ rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
+ rc->rc_offset = physical_offset;
+ rc->rc_abd = NULL;
+ rc->rc_gdata = NULL;
+ rc->rc_orig_data = NULL;
+ rc->rc_error = 0;
+ rc->rc_tried = 0;
+ rc->rc_skipped = 0;
+ rc->rc_repair = 0;
+ rc->rc_need_orig_restore = B_FALSE;
+
+ if (q == 0 && i >= bc)
+ rc->rc_size = 0;
+ else if (i < bc)
+ rc->rc_size = (q + 1) << ashift;
+ else
+ rc->rc_size = q << ashift;
+
+ asize += rc->rc_size;
+ }
+
+ ASSERT3U(asize, ==, tot << ashift);
+ rr->rr_nempty = roundup(tot, groupwidth) - tot;
+ IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc);
+
+ /* Allocate buffers for the parity columns */
+ for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
+ }
+
+ /*
+ * Map buffers for data columns and allocate/map buffers for skip
+ * sectors. There are three distinct cases for dRAID which are
+ * required to support sequential rebuild.
+ */
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ vdev_draid_map_alloc_write(zio, abd_offset, rr);
+ } else if ((rr->rr_nempty > 0) &&
+ (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+ vdev_draid_map_alloc_scrub(zio, abd_offset, rr);
+ } else {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ vdev_draid_map_alloc_read(zio, abd_offset, rr);
+ }
+
+ return (io_size);
+}
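+/*
+ * Worked example of the row geometry above (hypothetical draid2:4d layout,
+ * groupwidth = 6, ashift = 12): a 36 KiB write is psize = 9 sectors, so
+ * q = 9 / 4 = 2, r = 1, bc = 1 + 2 = 3 and tot = 9 + 2 * 3 = 15. Columns
+ * 0-2 (two parity plus one "big" data column) get q + 1 = 3 sectors,
+ * columns 3-5 get q = 2 sectors, and rr_nempty = roundup(15, 6) - 15 = 3
+ * skip sectors pad the short columns out to the full 72 KiB asize.
+ */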
+
+/*
+ * Allocate the raidz mapping to be applied to the dRAID I/O. The parity
+ * calculations for dRAID are identical to raidz; however, there are a few
+ * differences in the layout.
+ *
+ * - dRAID always allocates a full stripe width. Any extra sectors due
+ * to this padding are zero filled and written to disk. They will be read
+ * back during a scrub or repair operation since they are included in
+ * the parity calculation. This property enables sequential resilvering.
+ *
+ * - When the block at the logical offset spans redundancy groups then two
+ * rows are allocated in the raidz_map_t. One row resides at the end of
+ * the first group and the other at the start of the following group.
+ */
+static raidz_map_t *
+vdev_draid_map_alloc(zio_t *zio)
+{
+ raidz_row_t *rr[2];
+ uint64_t abd_offset = 0;
+ uint64_t abd_size = zio->io_size;
+ uint64_t io_offset = zio->io_offset;
+ uint64_t size;
+ int nrows = 1;
+
+ size = vdev_draid_map_alloc_row(zio, &rr[0], io_offset,
+ abd_offset, abd_size);
+ if (size < abd_size) {
+ vdev_t *vd = zio->io_vd;
+
+ io_offset += vdev_draid_asize(vd, size);
+ abd_offset += size;
+ abd_size -= size;
+ nrows++;
+
+ ASSERT3U(io_offset, ==, vdev_draid_group_to_offset(
+ vd, vdev_draid_offset_to_group(vd, io_offset)));
+ ASSERT3U(abd_offset, <, zio->io_size);
+ ASSERT3U(abd_size, !=, 0);
+
+ size = vdev_draid_map_alloc_row(zio, &rr[1],
+ io_offset, abd_offset, abd_size);
+ VERIFY3U(size, ==, abd_size);
+ }
+
+ raidz_map_t *rm;
+ rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]), KM_SLEEP);
+ rm->rm_ops = vdev_raidz_math_get_ops();
+ rm->rm_nrows = nrows;
+ rm->rm_row[0] = rr[0];
+ if (nrows == 2)
+ rm->rm_row[1] = rr[1];
+
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_draid_vsd_ops;
+
+ return (rm);
+}
+
+/*
+ * Given an offset into a dRAID, return the next group-width-aligned offset
+ * which can be used to start an allocation.
+ */
+static uint64_t
+vdev_draid_get_astart(vdev_t *vd, const uint64_t start)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (roundup(start, vdc->vdc_groupwidth << vd->vdev_ashift));
+}
+
+/*
+ * Allocatable space for dRAID is (children - nspares) * sizeof(smallest child)
+ * rounded down to the last full slice. So each child must provide at least
+ * 1 / (children - nspares) of its asize.
+ */
+static uint64_t
+vdev_draid_min_asize(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return ((vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks));
+}
+
+/*
+ * When using dRAID the minimum allocation size is determined by the number
+ * of data disks in the redundancy group. Full stripes are always used.
+ */
+static uint64_t
+vdev_draid_min_alloc(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (vdc->vdc_ndata << vd->vdev_ashift);
+}
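+/*
+ * Worked example (hypothetical draid2:8d with 13 children and 1 distributed
+ * spare, ashift = 12): vdc_ndisks = 12, so vdev_draid_min_asize() requires
+ * each child to provide at least 1/12 of the top-level minimum asize, and
+ * vdev_draid_min_alloc() reports a minimum allocation of 8 << 12 = 32 KiB,
+ * one full data stripe.
+ */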
+
+/*
+ * Returns true if the txg range does not exist on any leaf vdev.
+ *
+ * A dRAID spare does not fit into the DTL model. While it has child vdevs
+ * there is no redundancy among them, and the effective child vdev is
+ * determined by offset. Essentially we do a vdev_dtl_reassess() on the
+ * fly by replacing a dRAID spare with the child vdev under the offset.
+ * Note that it is a recursive process because the child vdev can be
+ * another dRAID spare and so on.
+ */
+boolean_t
+vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
+ uint64_t size)
+{
+ if (vd->vdev_ops == &vdev_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops) {
+ /*
+ * Check all of the readable children; if any child
+ * contains the txg range, the data is not missing.
+ */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!vdev_readable(cvd))
+ continue;
+
+ if (!vdev_draid_missing(cvd, physical_offset,
+ txg, size))
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+ }
+
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ /*
+ * When sequentially resilvering we don't have a proper
+ * txg range so instead we must presume all txgs are
+ * missing on this vdev until the resilver completes.
+ */
+ if (vd->vdev_rebuild_txg != 0)
+ return (B_TRUE);
+
+ /*
+ * DTL_MISSING is set for all prior txgs when a resilver
+ * is started in spa_vdev_attach().
+ */
+ if (vdev_dtl_contains(vd, DTL_MISSING, txg, size))
+ return (B_TRUE);
+
+ /*
+ * Consult the DTL on the relevant vdev. Either a leaf
+ * vdev or a spare/replacing mirror child may be returned,
+ * so we must call vdev_draid_missing() recursively.
+ */
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_TRUE);
+
+ return (vdev_draid_missing(vd, physical_offset,
+ txg, size));
+ }
+
+ return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
+/*
+ * Returns true if the txg is only partially replicated on the leaf vdevs.
+ */
+static boolean_t
+vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
+ uint64_t size)
+{
+ if (vd->vdev_ops == &vdev_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops) {
+ /*
+ * Check all of the readable children; if any child is
+ * missing the txg range then it is partially replicated.
+ */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!vdev_readable(cvd))
+ continue;
+
+ if (vdev_draid_partial(cvd, physical_offset, txg, size))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+ }
+
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ /*
+ * When sequentially resilvering we don't have a proper
+ * txg range so instead we must presume all txgs are
+ * missing on this vdev until the resilver completes.
+ */
+ if (vd->vdev_rebuild_txg != 0)
+ return (B_TRUE);
+
+ /*
+ * DTL_MISSING is set for all prior txgs when a resilver
+ * is started in spa_vdev_attach().
+ */
+ if (vdev_dtl_contains(vd, DTL_MISSING, txg, size))
+ return (B_TRUE);
+
+ /*
+ * Consult the DTL on the relevant vdev. Either a leaf
+ * vdev or a spare/replacing mirror child may be returned,
+ * so we must call vdev_draid_partial() recursively.
+ */
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_TRUE);
+
+ return (vdev_draid_partial(vd, physical_offset, txg, size));
+ }
+
+ return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
+/*
+ * Determine if the vdev is readable at the given offset.
+ */
+boolean_t
+vdev_draid_readable(vdev_t *vd, uint64_t physical_offset)
+{
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_FALSE);
+ }
+
+ if (vd->vdev_ops == &vdev_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops) {
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!vdev_readable(cvd))
+ continue;
+
+ if (vdev_draid_readable(cvd, physical_offset))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+ }
+
+ return (vdev_readable(vd));
+}
+
+/*
+ * Returns the first distributed spare found under the provided vdev tree.
+ */
+static vdev_t *
+vdev_draid_find_spare(vdev_t *vd)
+{
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return (vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *svd = vdev_draid_find_spare(vd->vdev_child[c]);
+ if (svd != NULL)
+ return (svd);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Returns B_TRUE if the passed in vdev is currently "faulted".
+ * Faulted, in this context, means that the vdev represents a
+ * replacing or sparing vdev tree.
+ */
+static boolean_t
+vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset)
+{
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_FALSE);
+
+ /*
+ * After resolving the distributed spare to a leaf vdev
+ * check the parent to determine if it's "faulted".
+ */
+ vd = vd->vdev_parent;
+ }
+
+ return (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+}
+
+/*
+ * Determine if the dRAID block at the logical offset is degraded.
+ * Used by sequential resilver.
+ */
+static boolean_t
+vdev_draid_group_degraded(vdev_t *vd, uint64_t offset)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
+
+ uint64_t groupstart, perm;
+ uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+ offset, &perm, &groupstart);
+
+ uint8_t *base;
+ uint64_t iter;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+ uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+ uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c);
+ vdev_t *cvd = vd->vdev_child[cid];
+
+ /* Group contains a faulted vdev. */
+ if (vdev_draid_faulted(cvd, physical_offset))
+ return (B_TRUE);
+
+ /*
+ * Always check groups with active distributed spares
+ * because any vdev failure in the pool will affect them.
+ */
+ if (vdev_draid_find_spare(cvd) != NULL)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Determine if the txg is missing. Used by healing resilver.
+ */
+static boolean_t
+vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg,
+ uint64_t size)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
+
+ uint64_t groupstart, perm;
+ uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+ offset, &perm, &groupstart);
+
+ uint8_t *base;
+ uint64_t iter;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+ uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+ uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c);
+ vdev_t *cvd = vd->vdev_child[cid];
+
+ /* Transaction group is known to be partially replicated. */
+ if (vdev_draid_partial(cvd, physical_offset, txg, size))
+ return (B_TRUE);
+
+ /*
+ * Always check groups with active distributed spares
+ * because any vdev failure in the pool will affect them.
+ */
+ if (vdev_draid_find_spare(cvd) != NULL)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Find the smallest child asize and largest sector size to calculate the
+ * available capacity. Distributed spares are ignored since their capacity
+ * is also based on the minimum child size in the top-level dRAID.
+ */
+static void
+vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep,
+ uint64_t *logical_ashiftp, uint64_t *physical_ashiftp)
+{
+ uint64_t logical_ashift = 0, physical_ashift = 0;
+ uint64_t asize = 0, max_asize = 0;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ continue;
+
+ asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1;
+ max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+ logical_ashift = MAX(logical_ashift, cvd->vdev_ashift);
+ physical_ashift = MAX(physical_ashift,
+ cvd->vdev_physical_ashift);
+ }
+
+ *asizep = asize;
+ *max_asizep = max_asize;
+ *logical_ashiftp = logical_ashift;
+ *physical_ashiftp = physical_ashift;
+}
+
+/*
+ * Open spare vdevs.
+ */
+static boolean_t
+vdev_draid_open_spares(vdev_t *vd)
+{
+ return (vd->vdev_ops == &vdev_draid_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+}
+
+/*
+ * Open all children, excluding spares.
+ */
+static boolean_t
+vdev_draid_open_children(vdev_t *vd)
+{
+ return (!vdev_draid_open_spares(vd));
+}
+
+/*
+ * Open a top-level dRAID vdev.
+ */
+static int
+vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ uint64_t nparity = vdc->vdc_nparity;
+ int open_errors = 0;
+
+ if (nparity > VDEV_DRAID_MAXPARITY ||
+ vd->vdev_children < nparity + 1) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * First open the normal children then the distributed spares. This
+ * ordering is important to ensure the distributed spares calculate
+ * the correct psize in the event that the dRAID vdevs were expanded.
+ */
+ vdev_open_children_subset(vd, vdev_draid_open_children);
+ vdev_open_children_subset(vd, vdev_draid_open_spares);
+
+ /* Verify enough of the children are available to continue. */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c]->vdev_open_error != 0) {
+ if ((++open_errors) > nparity) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (SET_ERROR(ENXIO));
+ }
+ }
+ }
+
+ /*
+ * Allocatable capacity is the sum of the space on all children less
+ * the number of distributed spares, rounded down to the last full row
+ * and then to the last full group. An additional 32MB of scratch
+ * space is reserved at the end of each child for use by the dRAID
+ * expansion feature.
+ */
+ uint64_t child_asize, child_max_asize;
+ vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize,
+ logical_ashift, physical_ashift);
+
+ /*
+ * Should be unreachable since the minimum child size is 64MB, but
+ * we want to make sure an underflow absolutely cannot occur here.
+ */
+ if (child_asize < VDEV_DRAID_REFLOW_RESERVE ||
+ child_max_asize < VDEV_DRAID_REFLOW_RESERVE) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) /
+ VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
+ child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) /
+ VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
+
+ *asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
+ vdc->vdc_groupsz);
+ *max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
+ vdc->vdc_groupsz);
+
+ return (0);
+}
+
+/*
+ * Close a top-level dRAID vdev.
+ */
+static void
+vdev_draid_close(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c] != NULL)
+ vdev_close(vd->vdev_child[c]);
+ }
+}
+
+/*
+ * Return the maximum asize for a rebuild zio in the provided range
+ * given the following constraints. A dRAID chunk may not:
+ *
+ * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or
+ * - Span dRAID redundancy groups.
+ */
+static uint64_t
+vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
+ uint64_t max_segment)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ uint64_t ashift = vd->vdev_ashift;
+ uint64_t ndata = vdc->vdc_ndata;
+ uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift),
+ SPA_MAXBLOCKSIZE);
+
+ ASSERT3U(vdev_draid_get_astart(vd, start), ==, start);
+ ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0);
+
+ /* Chunks must evenly span all data columns in the group. */
+ psize = (((psize >> ashift) / ndata) * ndata) << ashift;
+ uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize));
+
+ /* Reduce the chunk size to the group space remaining. */
+ uint64_t group = vdev_draid_offset_to_group(vd, start);
+ uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start;
+ chunk_size = MIN(chunk_size, left);
+
+ ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0);
+ ASSERT3U(vdev_draid_offset_to_group(vd, start), ==,
+ vdev_draid_offset_to_group(vd, start + chunk_size - 1));
+
+ return (chunk_size);
+}
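+/*
+ * Worked example (hypothetical draid2:4d layout, ashift = 12): with
+ * max_segment = 32 KiB the candidate psize is P2ROUNDUP(32 KiB * 4, 4 KiB)
+ * = 128 KiB, already a multiple of ndata sectors. That psize maps to an
+ * asize of 8 rows * 6 columns * 4 KiB = 192 KiB, so the rebuild chunk is
+ * 192 KiB unless less space remains in the current redundancy group.
+ */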
+
+/*
+ * Align the start of the metaslab to the group width and slightly reduce
+ * its size to a multiple of the group width. Since full stripe writes are
+ * required by dRAID this space is unallocable. Furthermore, aligning the
+ * metaslab start is important for vdev initialize and TRIM which both operate
+ * on metaslab boundaries which vdev_xlate() expects to be aligned.
+ */
+static void
+vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ uint64_t sz = vdc->vdc_groupwidth << vd->vdev_ashift;
+ uint64_t astart = vdev_draid_get_astart(vd, *ms_start);
+ uint64_t asize = ((*ms_size - (astart - *ms_start)) / sz) * sz;
+
+ *ms_start = astart;
+ *ms_size = asize;
+
+ ASSERT0(*ms_start % sz);
+ ASSERT0(*ms_size % sz);
+}
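+/*
+ * Worked example (hypothetical groupwidth = 6, ashift = 12, so sz = 24 KiB):
+ * a metaslab described as start = 132 KiB, size = 1024 KiB is adjusted to
+ * start = roundup(132 KiB, 24 KiB) = 144 KiB and
+ * size = ((1024 KiB - 12 KiB) / 24 KiB) * 24 KiB = 1008 KiB, both multiples
+ * of the group width as the asserts above require.
+ */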
+
+/*
+ * Add virtual dRAID spares to the list of valid spares. In order to accomplish
+ * this the existing array must be freed and reallocated with the additional
+ * entries.
+ */
+int
+vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp,
+ uint64_t next_vdev_id)
+{
+ uint64_t draid_nspares = 0;
+ uint64_t ndraid = 0;
+ int error;
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_t *cvd = vd->vdev_child[i];
+
+ if (cvd->vdev_ops == &vdev_draid_ops) {
+ vdev_draid_config_t *vdc = cvd->vdev_tsd;
+ draid_nspares += vdc->vdc_nspares;
+ ndraid++;
+ }
+ }
+
+ if (draid_nspares == 0) {
+ *ndraidp = ndraid;
+ return (0);
+ }
+
+ nvlist_t **old_spares, **new_spares;
+ uint_t old_nspares;
+ error = nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &old_spares, &old_nspares);
+ if (error)
+ old_nspares = 0;
+
+ /* Allocate memory and copy the existing spares. */
+ new_spares = kmem_alloc(sizeof (nvlist_t *) *
+ (draid_nspares + old_nspares), KM_SLEEP);
+ for (uint_t i = 0; i < old_nspares; i++)
+ new_spares[i] = fnvlist_dup(old_spares[i]);
+
+ /* Add new distributed spares to ZPOOL_CONFIG_SPARES. */
+ uint64_t n = old_nspares;
+ for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) {
+ vdev_t *cvd = vd->vdev_child[vdev_id];
+ char path[64];
+
+ if (cvd->vdev_ops != &vdev_draid_ops)
+ continue;
+
+ vdev_draid_config_t *vdc = cvd->vdev_tsd;
+ uint64_t nspares = vdc->vdc_nspares;
+ uint64_t nparity = vdc->vdc_nparity;
+
+ for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) {
+ bzero(path, sizeof (path));
+ (void) snprintf(path, sizeof (path) - 1,
+ "%s%llu-%llu-%llu", VDEV_TYPE_DRAID,
+ (u_longlong_t)nparity,
+ (u_longlong_t)next_vdev_id + vdev_id,
+ (u_longlong_t)spare_id);
+
+ nvlist_t *spare = fnvlist_alloc();
+ fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path);
+ fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_DRAID_SPARE);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID,
+ cvd->vdev_guid);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID,
+ spare_id);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT,
+ cvd->vdev_ashift);
+
+ new_spares[n] = spare;
+ n++;
+ }
+ }
+
+ if (n > 0) {
+ (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES);
+ fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ new_spares, n);
+ }
+
+ for (int i = 0; i < n; i++)
+ nvlist_free(new_spares[i]);
+
+ kmem_free(new_spares, sizeof (*new_spares) * n);
+ *ndraidp = ndraid;
+
+ return (0);
+}
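+/*
+ * Illustrative example (hypothetical pool): a draid2 top-level vdev that is
+ * assigned id 0 and configured with two distributed spares results in two
+ * new ZPOOL_CONFIG_SPARES entries named "draid2-0-0" and "draid2-0-1",
+ * following the draid<parity>-<top-level id>-<spare id> convention built by
+ * the snprintf() above.
+ */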
+
+/*
+ * Determine if any portion of the provided block resides on a child vdev
+ * with a dirty DTL and therefore needs to be resilvered.
+ */
+static boolean_t
+vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t asize = vdev_draid_asize(vd, psize);
+
+ if (phys_birth == TXG_UNKNOWN) {
+ /*
+ * Sequential resilver. There is no meaningful phys_birth
+ * for this block; we can only determine whether the block
+ * resides in a degraded group, in which case it must be
+ * resilvered.
+ */
+ ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==,
+ vdev_draid_offset_to_group(vd, offset + asize - 1));
+
+ return (vdev_draid_group_degraded(vd, offset));
+ } else {
+ /*
+ * Healing resilver. TXGs not in DTL_PARTIAL are intact,
+ * as are blocks in non-degraded groups.
+ */
+ if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+ return (B_FALSE);
+
+ if (vdev_draid_group_missing(vd, offset, phys_birth, 1))
+ return (B_TRUE);
+
+ /* The block may span groups in which case check both. */
+ if (vdev_draid_offset_to_group(vd, offset) !=
+ vdev_draid_offset_to_group(vd, offset + asize - 1)) {
+ if (vdev_draid_group_missing(vd,
+ offset + asize, phys_birth, 1))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+ }
+}
+
+static boolean_t
+vdev_draid_rebuilding(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
+ return (B_TRUE);
+
+ for (int i = 0; i < vd->vdev_children; i++) {
+ if (vdev_draid_rebuilding(vd->vdev_child[i])) {
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static void
+vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
+{
+#ifdef ZFS_DEBUG
+ range_seg64_t logical_rs, physical_rs, remain_rs;
+ logical_rs.rs_start = rr->rr_offset;
+ logical_rs.rs_end = logical_rs.rs_start +
+ vdev_draid_asize(vd, rr->rr_size);
+
+ raidz_col_t *rc = &rr->rr_col[col];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
+ ASSERT(vdev_xlate_is_empty(&remain_rs));
+ ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+ ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+ ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end);
+#endif
+}
+
+/*
+ * For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity. A gang ABD is allocated by vdev_draid_map_alloc()
+ * if a skip sector needs to be added to a column.
+ */
+static void
+vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+ raidz_map_t *rm = zio->io_vsd;
+
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ /*
+ * Empty columns are zero filled and included in the parity
+ * calculation and therefore must be written.
+ */
+ ASSERT3U(rc->rc_size, !=, 0);
+
+ /* Verify physical to logical translation */
+ vdev_draid_io_verify(vd, rr, c);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx], rc->rc_offset,
+ rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
+ 0, vdev_raidz_child_done, rc));
+ }
+}
+
+/*
+ * For read operations:
+ * 1. The vdev_draid_map_alloc() function will create a minimal raidz
+ * mapping for the read based on the zio->io_flags. There are two
+ * possible mappings either 1) a normal read, or 2) a scrub/resilver.
+ * 2. Create the zio read operations. This will include all parity
+ * columns and skip sectors for a scrub/resilver.
+ */
+static void
+vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+
+ /* Sequential rebuild must do IO at a redundancy group boundary. */
+ IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0);
+
+ /*
+ * Iterate over the columns in reverse order so that we hit the parity
+ * last. Any errors along the way will force us to read the parity.
+ * For scrub/resilver IOs which verify skip sectors, a gang ABD will
+ * have been allocated to store them and rc->rc_size is increased.
+ */
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (!vdev_draid_readable(cvd, rc->rc_offset)) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(ENXIO);
+ rc->rc_tried = 1;
+ rc->rc_skipped = 1;
+ continue;
+ }
+
+ if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(ESTALE);
+ rc->rc_skipped = 1;
+ continue;
+ }
+
+ /*
+ * Empty columns may be read during vdev_draid_io_done().
+ * Only skip them after the readable and missing checks
+ * verify they are available.
+ */
+ if (rc->rc_size == 0) {
+ rc->rc_skipped = 1;
+ continue;
+ }
+
+ if (zio->io_flags & ZIO_FLAG_RESILVER) {
+ vdev_t *svd;
+
+ /*
+ * If this child is a distributed spare then the
+ * offset might reside on the vdev being replaced,
+ * in which case this data must be written to the
+ * new device. Failure to do so would result in
+ * checksum errors when the old device is detached
+ * and the pool is scrubbed.
+ */
+ if ((svd = vdev_draid_find_spare(cvd)) != NULL) {
+ svd = vdev_draid_spare_get_child(svd,
+ rc->rc_offset);
+ if (svd && (svd->vdev_ops == &vdev_spare_ops ||
+ svd->vdev_ops == &vdev_replacing_ops)) {
+ rc->rc_repair = 1;
+ }
+ }
+
+ /*
+ * Always issue a repair IO to this child when it's
+ * a spare or replacing vdev with an active rebuild.
+ */
+ if ((cvd->vdev_ops == &vdev_spare_ops ||
+ cvd->vdev_ops == &vdev_replacing_ops) &&
+ vdev_draid_rebuilding(cvd)) {
+ rc->rc_repair = 1;
+ }
+ }
+ }
+
+ /*
+ * Either a parity or data column is missing, which means a repair
+ * may be attempted by vdev_draid_io_done(). Expand the raid map
+ * to read in empty columns which are needed along with the parity
+ * during reconstruction.
+ */
+ if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) &&
+ rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) {
+ vdev_draid_map_alloc_empty(zio, rr);
+ }
+
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error || rc->rc_size == 0)
+ continue;
+
+ if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
+ (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+ }
+}
+
+/*
+ * Start an IO operation to a dRAID vdev.
+ */
+static void
+vdev_draid_io_start(zio_t *zio)
+{
+ vdev_t *vd __maybe_unused = zio->io_vd;
+ raidz_map_t *rm;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset));
+
+ rm = vdev_draid_map_alloc(zio);
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_draid_io_start_write(zio, rm->rm_row[i]);
+ }
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_draid_io_start_read(zio, rm->rm_row[i]);
+ }
+ }
+
+ zio_execute(zio);
+}
+
+/*
+ * Complete an IO operation on a dRAID vdev. The raidz logic can be applied
+ * to dRAID since the layout is fully described by the raidz_map_t.
+ */
+static void
+vdev_draid_io_done(zio_t *zio)
+{
+ vdev_raidz_io_done(zio);
+}
+
+static void
+vdev_draid_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ ASSERT(vd->vdev_ops == &vdev_draid_ops);
+
+ if (faulted > vdc->vdc_nparity)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+static void
+vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+{
+ vdev_t *raidvd = cvd->vdev_parent;
+ ASSERT(raidvd->vdev_ops == &vdev_draid_ops);
+
+ vdev_draid_config_t *vdc = raidvd->vdev_tsd;
+ uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+
+ /* Make sure the offsets are block-aligned */
+ ASSERT0(logical_rs->rs_start % (1 << ashift));
+ ASSERT0(logical_rs->rs_end % (1 << ashift));
+
+ uint64_t logical_start = logical_rs->rs_start;
+ uint64_t logical_end = logical_rs->rs_end;
+
+ /*
+ * Unaligned ranges must be skipped. All metaslabs are correctly
+ * aligned so this should not happen, but this case is handled in
+ * case it's needed by future callers.
+ */
+ uint64_t astart = vdev_draid_get_astart(raidvd, logical_start);
+ if (astart != logical_start) {
+ physical_rs->rs_start = logical_start;
+ physical_rs->rs_end = logical_start;
+ remain_rs->rs_start = MIN(astart, logical_end);
+ remain_rs->rs_end = logical_end;
+ return;
+ }
+
+ /*
+ * Unlike with mirrors and raidz a dRAID logical range can map
+ * to multiple non-contiguous physical ranges. This is handled by
+ * limiting the size of the logical range to a single group and
+ * setting the remain argument such that it describes the remaining
+ * unmapped logical range. This is stricter than absolutely
+ * necessary but helps simplify the logic below.
+ */
+ uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start);
+ uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1);
+ if (logical_end > nextstart)
+ logical_end = nextstart;
+
+ /* Find the starting offset for each vdev in the group */
+ uint64_t perm, groupstart;
+ uint64_t start = vdev_draid_logical_to_physical(raidvd,
+ logical_start, &perm, &groupstart);
+ uint64_t end = start;
+
+ uint8_t *base;
+ uint64_t iter, id;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ /*
+ * Check if the passed child falls within the group. If it does,
+ * update the start and end to reflect the physical range.
+ * Otherwise, leave them unmodified which will result in an empty
+ * (zero-length) physical range being returned.
+ */
+ for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+ uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+
+ if (c == 0 && i != 0) {
+ /* the group wrapped, increment the start */
+ start += VDEV_DRAID_ROWHEIGHT;
+ end = start;
+ }
+
+ id = vdev_draid_permute_id(vdc, base, iter, c);
+ if (id == cvd->vdev_id) {
+ uint64_t b_size = (logical_end >> ashift) -
+ (logical_start >> ashift);
+ ASSERT3U(b_size, >, 0);
+ end = start + ((((b_size - 1) /
+ vdc->vdc_groupwidth) + 1) << ashift);
+ break;
+ }
+ }
+ physical_rs->rs_start = start;
+ physical_rs->rs_end = end;
+
+ /*
+ * Only top-level vdevs are allowed to set remain_rs because
+ * when .vdev_op_xlate() is called for their children the full
+ * logical range is not provided by vdev_xlate().
+ */
+ remain_rs->rs_start = logical_end;
+ remain_rs->rs_end = logical_rs->rs_end;
+
+ ASSERT3U(physical_rs->rs_start, <=, logical_start);
+ ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
+ logical_end - logical_start);
+}
+
+/*
+ * Add dRAID specific fields to the config nvlist.
+ */
+static void
+vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups);
+}
+
+/*
+ * Initialize private dRAID specific fields from the nvlist.
+ */
+static int
+vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+ uint64_t ndata, nparity, nspares, ngroups;
+ int error;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata))
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) ||
+ nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ uint_t children;
+ nvlist_t **child;
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0 || children == 0 ||
+ children > VDEV_DRAID_MAX_CHILDREN) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) ||
+ nspares > 100 || nspares > (children - (ndata + nparity))) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) ||
+ ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Validate the minimum number of children exist per group for the
+ * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4).
+ */
+ if (children < (ndata + nparity + nspares))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Create the dRAID configuration using the pool nvlist configuration
+ * and the fixed mapping for the correct number of children.
+ */
+ vdev_draid_config_t *vdc;
+ const draid_map_t *map;
+
+ error = vdev_draid_lookup_map(children, &map);
+ if (error)
+ return (SET_ERROR(EINVAL));
+
+ vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP);
+ vdc->vdc_ndata = ndata;
+ vdc->vdc_nparity = nparity;
+ vdc->vdc_nspares = nspares;
+ vdc->vdc_children = children;
+ vdc->vdc_ngroups = ngroups;
+ vdc->vdc_nperms = map->dm_nperms;
+
+ error = vdev_draid_generate_perms(map, &vdc->vdc_perms);
+ if (error) {
+ kmem_free(vdc, sizeof (*vdc));
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Derived constants.
+ */
+ vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity;
+ vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares;
+ vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT;
+ vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) /
+ vdc->vdc_ndisks;
+
+ ASSERT3U(vdc->vdc_groupwidth, >=, 2);
+ ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks);
+ ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT);
+ ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT);
+ ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0);
+ ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) %
+ vdc->vdc_ndisks, ==, 0);
+
+ *tsd = vdc;
+
+ return (0);
+}
+
+static void
+vdev_draid_fini(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ vmem_free(vdc->vdc_perms, sizeof (uint8_t) *
+ vdc->vdc_children * vdc->vdc_nperms);
+ kmem_free(vdc, sizeof (*vdc));
+}
+
+static uint64_t
+vdev_draid_nparity(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ return (vdc->vdc_nparity);
+}
+
+static uint64_t
+vdev_draid_ndisks(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ return (vdc->vdc_ndisks);
+}
+
+vdev_ops_t vdev_draid_ops = {
+ .vdev_op_init = vdev_draid_init,
+ .vdev_op_fini = vdev_draid_fini,
+ .vdev_op_open = vdev_draid_open,
+ .vdev_op_close = vdev_draid_close,
+ .vdev_op_asize = vdev_draid_asize,
+ .vdev_op_min_asize = vdev_draid_min_asize,
+ .vdev_op_min_alloc = vdev_draid_min_alloc,
+ .vdev_op_io_start = vdev_draid_io_start,
+ .vdev_op_io_done = vdev_draid_io_done,
+ .vdev_op_state_change = vdev_draid_state_change,
+ .vdev_op_need_resilver = vdev_draid_need_resilver,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_draid_xlate,
+ .vdev_op_rebuild_asize = vdev_draid_rebuild_asize,
+ .vdev_op_metaslab_init = vdev_draid_metaslab_init,
+ .vdev_op_config_generate = vdev_draid_config_generate,
+ .vdev_op_nparity = vdev_draid_nparity,
+ .vdev_op_ndisks = vdev_draid_ndisks,
+ .vdev_op_type = VDEV_TYPE_DRAID,
+ .vdev_op_leaf = B_FALSE,
+};
+
+
+/*
+ * A dRAID distributed spare is a virtual leaf vdev which is included in the
+ * parent dRAID configuration. The last N columns of the dRAID permutation
+ * table are used to determine on which dRAID children a specific offset
+ * should be written. These spare leaf vdevs can only be used to replace
+ * faulted children in the same dRAID configuration.
+ */
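+
+/*
+ * For example (illustrative numbers only): with 11 children and 2
+ * distributed spares, spare id 0 occupies permuted column 10 and spare
+ * id 1 occupies permuted column 9 of each permutation row, i.e. column
+ * (children - 1) - spare_id.
+ */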
+
+/*
+ * Distributed spare state. All fields are set when the distributed spare is
+ * first opened and are immutable.
+ */
+typedef struct {
+ vdev_t *vds_draid_vdev; /* top-level parent dRAID vdev */
+ uint64_t vds_top_guid; /* top-level parent dRAID guid */
+ uint64_t vds_spare_id; /* spare id (0 - vdc->vdc_nspares-1) */
+} vdev_draid_spare_t;
+
+/*
+ * Returns the parent dRAID vdev to which the distributed spare belongs.
+ * This may be safely called even when the vdev is not open.
+ */
+vdev_t *
+vdev_draid_spare_get_parent(vdev_t *vd)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+ if (vds->vds_draid_vdev != NULL)
+ return (vds->vds_draid_vdev);
+
+ return (vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev,
+ vds->vds_top_guid));
+}
+
+/*
+ * A dRAID spare is active when it's the child of a vdev using the
+ * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops.
+ */
+static boolean_t
+vdev_draid_spare_is_active(vdev_t *vd)
+{
+ vdev_t *pvd = vd->vdev_parent;
+
+ if (pvd != NULL && (pvd->vdev_ops == &vdev_spare_ops ||
+ pvd->vdev_ops == &vdev_replacing_ops ||
+ pvd->vdev_ops == &vdev_draid_ops)) {
+ return (B_TRUE);
+ } else {
+ return (B_FALSE);
+ }
+}
+
+/*
+ * Given a dRAID distributed spare vdev, returns the physical child vdev
+ * on which the provided offset resides. This may involve recursing through
+ * multiple layers of distributed spares. Note that offset is relative to
+ * this vdev.
+ */
+vdev_t *
+vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+ /* The vdev is closed */
+ if (vds->vds_draid_vdev == NULL)
+ return (NULL);
+
+ vdev_t *tvd = vds->vds_draid_vdev;
+ vdev_draid_config_t *vdc = tvd->vdev_tsd;
+
+ ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares);
+
+ uint8_t *base;
+ uint64_t iter;
+ uint64_t perm = physical_offset / vdc->vdc_devslicesz;
+
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ uint64_t cid = vdev_draid_permute_id(vdc, base, iter,
+ (tvd->vdev_children - 1) - vds->vds_spare_id);
+ vdev_t *cvd = tvd->vdev_child[cid];
+
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ return (vdev_draid_spare_get_child(cvd, physical_offset));
+
+ return (cvd);
+}
+
+/* ARGSUSED */
+static void
+vdev_draid_spare_close(vdev_t *vd)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+ vds->vds_draid_vdev = NULL;
+}
+
+/*
+ * Opening a dRAID spare device is done by looking up the associated dRAID
+ * top-level vdev guid from the spare configuration.
+ */
+static int
+vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ uint64_t asize, max_asize;
+
+ vdev_t *tvd = vdev_lookup_by_guid(rvd, vds->vds_top_guid);
+ if (tvd == NULL) {
+ /*
+ * When spa_vdev_add() is labeling new spares, the
+ * associated dRAID is not attached to the root vdev,
+ * nor does this spare have a parent. Simulate a valid
+ * device in order to allow the label to be initialized
+ * and the distributed spare added to the configuration.
+ */
+ if (vd->vdev_parent == NULL) {
+ *psize = *max_psize = SPA_MINDEVSIZE;
+ *logical_ashift = *physical_ashift = ASHIFT_MIN;
+ return (0);
+ }
+
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_draid_config_t *vdc = tvd->vdev_tsd;
+ if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (vds->vds_spare_id >= vdc->vdc_nspares)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Neither tvd->vdev_asize nor tvd->vdev_max_asize can be used here
+ * because the caller may be vdev_draid_open() in which case the
+ * values are stale as they haven't yet been updated by vdev_open().
+ * To avoid this always recalculate the dRAID asize and max_asize.
+ */
+ vdev_draid_calculate_asize(tvd, &asize, &max_asize,
+ logical_ashift, physical_ashift);
+
+ *psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ *max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+
+ vds->vds_draid_vdev = tvd;
+
+ return (0);
+}
+
+/*
+ * Completed distributed spare IO. Store the result in the parent zio
+ * as if it had performed the operation itself. Only the first error is
+ * preserved if there are multiple errors.
+ */
+static void
+vdev_draid_spare_child_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_private;
+
+ /*
+ * IOs are issued to non-writable vdevs in order to keep their
+ * DTLs accurate. However, we don't want to propagate the
+ * error into the distributed spare's DTL. When resilvering
+ * vdev_draid_need_resilver() will consult the relevant DTL
+ * to determine if the data is missing and must be repaired.
+ */
+ if (!vdev_writeable(zio->io_vd))
+ return;
+
+ if (pio->io_error == 0)
+ pio->io_error = zio->io_error;
+}
+
+/*
+ * Returns a valid label nvlist for the distributed spare vdev. This is
+ * used to bypass the IO pipeline to avoid the complexity of constructing
+ * a complete label with valid checksum to return when read.
+ */
+nvlist_t *
+vdev_draid_read_config_spare(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
+ uint64_t guid = vd->vdev_guid;
+
+ nvlist_t *nv = fnvlist_alloc();
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa));
+ fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE,
+ vdev_draid_spare_is_active(vd) ?
+ POOL_STATE_ACTIVE : POOL_STATE_SPARE);
+
+ /* Set the vdev guid based on the matching entry in the sav_vdevs list. */
+ for (int i = 0; i < sav->sav_count; i++) {
+ if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops &&
+ strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) {
+ guid = sav->sav_vdevs[i]->vdev_guid;
+ break;
+ }
+ }
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid);
+
+ return (nv);
+}
+
+/*
+ * Handle any ioctl requested of the distributed spare. Only flushes
+ * are supported, in which case all children must be flushed.
+ */
+static int
+vdev_draid_spare_ioctl(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ int error = 0;
+
+ if (zio->io_cmd == DKIOCFLUSHWRITECACHE) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[c], zio->io_offset, zio->io_abd,
+ zio->io_size, zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ } else {
+ error = SET_ERROR(ENOTSUP);
+ }
+
+ return (error);
+}
+
+/*
+ * Initiate an IO to the distributed spare. For normal IOs this entails using
+ * the zio->io_offset and permutation table to calculate which child dRAID vdev
+ * is responsible for the data, and then passing the zio along to that child
+ * to perform the actual IO. The label ranges are not stored on disk and require
+ * some special handling which is described below.
+ */
+static void
+vdev_draid_spare_io_start(zio_t *zio)
+{
+ vdev_t *cvd = NULL, *vd = zio->io_vd;
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+ uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE;
+
+ /*
+ * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+ * Nothing to be done here but return failure.
+ */
+ if (vds == NULL) {
+ zio->io_error = ENXIO;
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_type) {
+ case ZIO_TYPE_IOCTL:
+ zio->io_error = vdev_draid_spare_ioctl(zio);
+ break;
+
+ case ZIO_TYPE_WRITE:
+ if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
+ /*
+ * Accept probe IOs and config writers to simulate the
+ * existence of an on disk label. vdev_label_sync(),
+ * vdev_uberblock_sync() and vdev_copy_uberblocks()
+ * skip the distributed spares. This only leaves
+ * vdev_label_init() which is allowed to succeed to
+ * avoid adding special cases to the function.
+ */
+ if (zio->io_flags & ZIO_FLAG_PROBE ||
+ zio->io_flags & ZIO_FLAG_CONFIG_WRITER) {
+ zio->io_error = 0;
+ } else {
+ zio->io_error = SET_ERROR(EIO);
+ }
+ } else {
+ cvd = vdev_draid_spare_get_child(vd, offset);
+
+ if (cvd == NULL) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ }
+ break;
+
+ case ZIO_TYPE_READ:
+ if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
+ /*
+ * Accept probe IOs to simulate the existence of a
+ * label. vdev_label_read_config() bypasses the
+ * pipeline to read the label configuration and
+ * vdev_uberblock_load() skips distributed spares
+ * when attempting to locate the best uberblock.
+ */
+ if (zio->io_flags & ZIO_FLAG_PROBE) {
+ zio->io_error = 0;
+ } else {
+ zio->io_error = SET_ERROR(EIO);
+ }
+ } else {
+ cvd = vdev_draid_spare_get_child(vd, offset);
+
+ if (cvd == NULL || !vdev_readable(cvd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ }
+ break;
+
+ case ZIO_TYPE_TRIM:
+ /* The vdev label ranges are never trimmed */
+ ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset));
+
+ cvd = vdev_draid_spare_get_child(vd, offset);
+
+ if (cvd == NULL || !cvd->vdev_has_trim) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ break;
+
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ zio_execute(zio);
+}
+
+/* ARGSUSED */
+static void
+vdev_draid_spare_io_done(zio_t *zio)
+{
+}
+
+/*
+ * Lookup the full spare config in spa->spa_spares.sav_config and
+ * return the top_guid and spare_id for the named spare.
+ */
+static int
+vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp,
+ uint64_t *spare_idp)
+{
+ nvlist_t **spares;
+ uint_t nspares;
+ int error;
+
+ if ((spa->spa_spares.sav_config == NULL) ||
+ (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)) {
+ return (SET_ERROR(ENOENT));
+ }
+
+ char *spare_name;
+ error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name);
+ if (error != 0)
+ return (SET_ERROR(EINVAL));
+
+ for (int i = 0; i < nspares; i++) {
+ nvlist_t *spare = spares[i];
+ uint64_t top_guid, spare_id;
+ char *type, *path;
+
+ /* Skip non-distributed spares */
+ error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type);
+ if (error != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0)
+ continue;
+
+ /* Skip spares with the wrong name */
+ error = nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &path);
+ if (error != 0 || strcmp(path, spare_name) != 0)
+ continue;
+
+ /* Found the matching spare */
+ error = nvlist_lookup_uint64(spare,
+ ZPOOL_CONFIG_TOP_GUID, &top_guid);
+ if (error == 0) {
+ error = nvlist_lookup_uint64(spare,
+ ZPOOL_CONFIG_SPARE_ID, &spare_id);
+ }
+
+ if (error != 0) {
+ return (SET_ERROR(EINVAL));
+ } else {
+ *top_guidp = top_guid;
+ *spare_idp = spare_id;
+ return (0);
+ }
+ }
+
+ return (SET_ERROR(ENOENT));
+}
+
+/*
+ * Initialize private dRAID spare specific fields from the nvlist.
+ */
+static int
+vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+ vdev_draid_spare_t *vds;
+ uint64_t top_guid = 0;
+ uint64_t spare_id;
+
+ /*
+ * In the normal case, check the list of spares stored in the spa
+ * to look up the top_guid and spare_id for the provided spare config.
+ * When creating a new pool or adding vdevs the spare list is not
+ * yet populated and the values are provided in the passed config.
+ */
+ if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID,
+ &top_guid) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID,
+ &spare_id) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ vds = kmem_alloc(sizeof (vdev_draid_spare_t), KM_SLEEP);
+ vds->vds_draid_vdev = NULL;
+ vds->vds_top_guid = top_guid;
+ vds->vds_spare_id = spare_id;
+
+ *tsd = vds;
+
+ return (0);
+}
+
+static void
+vdev_draid_spare_fini(vdev_t *vd)
+{
+ kmem_free(vd->vdev_tsd, sizeof (vdev_draid_spare_t));
+}
+
+static void
+vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vds->vds_top_guid);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, vds->vds_spare_id);
+}
+
+vdev_ops_t vdev_draid_spare_ops = {
+ .vdev_op_init = vdev_draid_spare_init,
+ .vdev_op_fini = vdev_draid_spare_fini,
+ .vdev_op_open = vdev_draid_spare_open,
+ .vdev_op_close = vdev_draid_spare_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_draid_spare_io_start,
+ .vdev_op_io_done = vdev_draid_spare_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = vdev_draid_spare_config_generate,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DRAID_SPARE,
+ .vdev_op_leaf = B_TRUE,
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c b/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c
new file mode 100644
index 000000000000..fe1a75c11312
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c
@@ -0,0 +1,40 @@
+/*
+ * Xorshift Pseudo Random Number Generator based on work by David Blackman
+ * and Sebastiano Vigna (vigna@acm.org).
+ *
+ * "Further scramblings of Marsaglia's xorshift generators"
+ * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
+ * http://prng.di.unimi.it/xoroshiro128plusplus.c
+ *
+ * To the extent possible under law, the author has dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * See <http://creativecommons.org/publicdomain/zero/1.0/>.
+ *
+ * This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid,
+ * small-state generators. It is extremely (sub-ns) fast and it passes all
+ * tests we are aware of, but its state space is large enough only for
+ * mild parallelism.
+ */
+
+#include <sys/vdev_draid.h>
+
+static inline uint64_t rotl(const uint64_t x, int k)
+{
+ return (x << k) | (x >> (64 - k));
+}
+
+uint64_t
+vdev_draid_rand(uint64_t *s)
+{
+ const uint64_t s0 = s[0];
+ uint64_t s1 = s[1];
+ const uint64_t result = rotl(s0 + s1, 17) + s0;
+
+ s1 ^= s0;
+ s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b
+ s[1] = rotl(s1, 28); // c
+
+ return (result);
+}
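+
+/*
+ * Illustrative usage sketch, not part of the original patch: the seed
+ * values and the shuffle loop below are hypothetical. Callers provide
+ * the 128-bit state as two nonzero 64-bit words and draw successive
+ * values, e.g. for a Fisher-Yates style shuffle of an array 'a' of
+ * length n (modulo bias ignored for brevity):
+ *
+ *	uint64_t state[2] = { 0x9E3779B97F4A7C15ULL, 0xD1B54A32D192ED03ULL };
+ *	for (uint64_t i = n - 1; i > 0; i--) {
+ *		uint64_t j = vdev_draid_rand(state) % (i + 1);
+ *		uint64_t tmp = a[i]; a[i] = a[j]; a[j] = tmp;
+ *	}
+ */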
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
new file mode 100644
index 000000000000..b26d0993711a
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
@@ -0,0 +1,1911 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2014, 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/metaslab.h>
+#include <sys/dmu.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/abd.h>
+#include <sys/zthr.h>
+
+/*
+ * An indirect vdev corresponds to a vdev that has been removed. Since
+ * we cannot rewrite block pointers of snapshots, etc., we keep a
+ * mapping from old location on the removed device to the new location
+ * on another device in the pool and use this mapping whenever we need
+ * to access the DVA. Unfortunately, this mapping did not respect
+ * logical block boundaries when it was first created, and so a DVA on
+ * this indirect vdev may be "split" into multiple sections that each
+ * map to a different location. As a consequence, not all DVAs can be
+ * translated to an equivalent new DVA. Instead we must provide a
+ * "vdev_remap" operation that executes a callback on each contiguous
+ * segment of the new location. This function is used in multiple ways:
+ *
+ * - i/os to this vdev use the callback to determine where the
+ * data is now located, and issue child i/os for each segment's new
+ * location.
+ *
+ * - frees and claims to this vdev use the callback to free or claim
+ * each mapped segment. (Note that we don't actually need to claim
+ * log blocks on indirect vdevs, because we don't allocate to
+ * removing vdevs. However, zdb uses zio_claim() for its leak
+ * detection.)
+ */
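+
+/*
+ * Minimal sketch of a remap callback (illustrative only; the name
+ * example_remap_cb is hypothetical, but the signature matches the func
+ * argument taken by vdev_indirect_remap() below):
+ *
+ *	static void
+ *	example_remap_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+ *	    uint64_t size, void *arg)
+ *	{
+ *		(invoked once per contiguous segment of the new location)
+ *	}
+ */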
+
+/*
+ * "Big theory statement" for how we mark blocks obsolete.
+ *
+ * When a block on an indirect vdev is freed or remapped, a section of
+ * that vdev's mapping may no longer be referenced (aka "obsolete"). We
+ * keep track of how much of each mapping entry is obsolete. When
+ * an entry becomes completely obsolete, we can remove it, thus reducing
+ * the memory used by the mapping. The complete picture of obsolescence
+ * is given by the following data structures, described below:
+ * - the entry-specific obsolete count
+ * - the vdev-specific obsolete spacemap
+ * - the pool-specific obsolete bpobj
+ *
+ * == On disk data structures used ==
+ *
+ * We track the obsolete space for the pool using several objects. Each
+ * of these objects is created on demand and freed when no longer
+ * needed, and is assumed to be empty if it does not exist.
+ * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
+ *
+ * - Each vic_mapping_object (associated with an indirect vdev) can
+ * have a vimp_counts_object. This is an array of uint32_t's
+ * with the same number of entries as the vic_mapping_object. When
+ * the mapping is condensed, entries from the vic_obsolete_sm_object
+ * (see below) are folded into the counts. Therefore, each
+ * obsolete_counts entry tells us the number of bytes in the
+ * corresponding mapping entry that were not referenced when the
+ * mapping was last condensed.
+ *
+ * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
+ * This is a space map containing an alloc entry for every DVA that
+ * has been obsoleted since the last time this indirect vdev was
+ * condensed. We use this object in order to improve performance
+ * when marking a DVA as obsolete. Instead of modifying an arbitrary
+ * offset of the vimp_counts_object, we only need to append an entry
+ * to the end of this object. When a DVA becomes obsolete, it is
+ * added to the obsolete space map. This happens when the DVA is
+ * freed, remapped and not referenced by a snapshot, or the last
+ * snapshot referencing it is destroyed.
+ *
+ * - Each dataset can have a ds_remap_deadlist object. This is a
+ * deadlist object containing all blocks that were remapped in this
+ * dataset but referenced in a previous snapshot. Blocks can *only*
+ * appear on this list if they were remapped (dsl_dataset_block_remapped);
+ * blocks that were killed in a head dataset are put on the normal
+ * ds_deadlist and marked obsolete when they are freed.
+ *
+ * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
+ * in the pool that need to be marked obsolete. When a snapshot is
+ * destroyed, we move some of the ds_remap_deadlist to the obsolete
+ * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
+ * asynchronously process the obsolete bpobj, moving its entries to
+ * the specific vdevs' obsolete space maps.
+ *
+ * == Summary of how we mark blocks as obsolete ==
+ *
+ * - When freeing a block: if any DVA is on an indirect vdev, append to
+ * vic_obsolete_sm_object.
+ * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
+ * references; otherwise append to vic_obsolete_sm_object).
+ * - When freeing a snapshot: move parts of ds_remap_deadlist to
+ * dp_obsolete_bpobj (same algorithm as ds_deadlist).
+ * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
+ * individual vdev's vic_obsolete_sm_object.
+ */
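+
+/*
+ * Concrete example (illustrative only; the DVA values are hypothetical):
+ * freeing a block whose DVA is <vdev=3, offset=0x8000, asize=0x20000>,
+ * where vdev 3 is indirect, appends an alloc record for the range
+ * [0x8000, 0x28000) to vdev 3's vic_obsolete_sm_object; that record is
+ * folded into the corresponding obsolete_counts entries the next time
+ * vdev 3's mapping is condensed.
+ */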
+
+/*
+ * "Big theory statement" for how we condense indirect vdevs.
+ *
+ * Condensing an indirect vdev's mapping is the process of determining
+ * the precise counts of obsolete space for each mapping entry (by
+ * integrating the obsolete spacemap into the obsolete counts) and
+ * writing out a new mapping that contains only referenced entries.
+ *
+ * We condense a vdev when we expect the mapping to shrink (see
+ * vdev_indirect_should_condense()), but only perform one condense at a
+ * time to limit the memory usage. In addition, we use a separate
+ * open-context thread (spa_condense_indirect_thread) to incrementally
+ * create the new mapping object in a way that minimizes the impact on
+ * the rest of the system.
+ *
+ * == Generating a new mapping ==
+ *
+ * To generate a new mapping, we follow these steps:
+ *
+ * 1. Save the old obsolete space map and create a new mapping object
+ * (see spa_condense_indirect_start_sync()). This initializes the
+ * spa_condensing_indirect_phys with the "previous obsolete space map",
+ * which is now read only. Newly obsolete DVAs will be added to a
+ * new (initially empty) obsolete space map, and will not be
+ * considered as part of this condense operation.
+ *
+ * 2. Construct in memory the precise counts of obsolete space for each
+ * mapping entry, by incorporating the obsolete space map into the
+ * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
+ *
+ * 3. Iterate through each mapping entry, writing to the new mapping any
+ * entries that are not completely obsolete (i.e. which don't have
+ * obsolete count == mapping length). (See
+ * spa_condense_indirect_generate_new_mapping().)
+ *
+ * 4. Destroy the old mapping object and switch over to the new one
+ * (spa_condense_indirect_complete_sync).
+ *
+ * == Restarting from failure ==
+ *
+ * To restart the condense when we import/open the pool, we must start
+ * at the 2nd step above: reconstruct the precise counts in memory,
+ * based on the space map + counts. Then in the 3rd step, we start
+ * iterating where we left off: at vimp_max_offset of the new mapping
+ * object.
+ */
+
+int zfs_condense_indirect_vdevs_enable = B_TRUE;
+
+/*
+ * Condense if at least this percent of the bytes in the mapping is
+ * obsolete. With the default of 25%, the amount of space mapped
+ * will be reduced to 1% of its original size after at most 16
+ * condenses, since 0.75^16 ~= 0.01. Higher values will condense less
+ * often (causing less
+ * i/o); lower values will reduce the mapping size more quickly.
+ */
+int zfs_indirect_condense_obsolete_pct = 25;
+
+/*
+ * Condense if the obsolete space map takes up more than this amount of
+ * space on disk (logically). This limits the amount of disk space
+ * consumed by the obsolete space map; the default of 1GB is small enough
+ * that we typically don't mind "wasting" it.
+ */
+unsigned long zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
+
+/*
+ * Don't bother condensing if the mapping uses less than this amount of
+ * memory. The default of 128KB is considered a "trivial" amount of
+ * memory and not worth reducing.
+ */
+unsigned long zfs_condense_min_mapping_bytes = 128 * 1024;
+
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a condense (which might otherwise
+ * complete too quickly). If used to reduce the performance impact of
+ * condensing in production, a maximum value of 1 should be sufficient.
+ */
+int zfs_condense_indirect_commit_entry_delay_ms = 0;
+
+/*
+ * If an indirect split block contains more than this many possible unique
+ * combinations when being reconstructed, consider it too computationally
+ * expensive to check them all. Instead, try at most 100 randomly-selected
+ * combinations each time the block is accessed. This allows all segment
+ * copies to participate fairly in the reconstruction when all combinations
+ * cannot be checked and prevents repeated use of one bad copy.
+ */
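+/*
+ * For example (illustrative only): a block split into 12 segments, each
+ * with 2 unique mirror copies, has 2^12 = 4096 unique combinations and is
+ * still checked exhaustively with the default limit; a 13th split (8192
+ * combinations) would switch reconstruction to random sampling.
+ */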
+int zfs_reconstruct_indirect_combinations_max = 4096;
+
+/*
+ * Enable to simulate damaged segments and validate reconstruction. This
+ * is intentionally not exposed as a module parameter.
+ */
+unsigned long zfs_reconstruct_indirect_damage_fraction = 0;
+
+/*
+ * The indirect_child_t represents the vdev that we will read from, when we
+ * need to read all copies of the data (e.g. for scrub or reconstruction).
+ * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
+ * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
+ * ic_vdev is a child of the mirror.
+ */
+typedef struct indirect_child {
+ abd_t *ic_data;
+ vdev_t *ic_vdev;
+
+ /*
+ * ic_duplicate is NULL when the ic_data contents are unique; when it
+ * is determined to be a duplicate, it references the primary child.
+ */
+ struct indirect_child *ic_duplicate;
+ list_node_t ic_node; /* node on is_unique_child */
+ int ic_error; /* set when a child does not contain the data */
+} indirect_child_t;
+
+/*
+ * The indirect_split_t represents one mapped segment of an i/o to the
+ * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
+ * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
+ * For split blocks, there will be several of these.
+ */
+typedef struct indirect_split {
+ list_node_t is_node; /* link on iv_splits */
+
+ /*
+ * is_split_offset is the offset into the i/o.
+ * This is the sum of the previous splits' is_size's.
+ */
+ uint64_t is_split_offset;
+
+ vdev_t *is_vdev; /* top-level vdev */
+ uint64_t is_target_offset; /* offset on is_vdev */
+ uint64_t is_size;
+ int is_children; /* number of entries in is_child[] */
+ int is_unique_children; /* number of entries in is_unique_child */
+ list_t is_unique_child;
+
+ /*
+ * is_good_child is the child that we are currently using to
+ * attempt reconstruction.
+ */
+ indirect_child_t *is_good_child;
+
+ indirect_child_t is_child[1]; /* variable-length */
+} indirect_split_t;
+
+/*
+ * The indirect_vsd_t is associated with each i/o to the indirect vdev.
+ * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
+ */
+typedef struct indirect_vsd {
+ boolean_t iv_split_block;
+ boolean_t iv_reconstruct;
+ uint64_t iv_unique_combinations;
+ uint64_t iv_attempts;
+ uint64_t iv_attempts_max;
+
+ list_t iv_splits; /* list of indirect_split_t's */
+} indirect_vsd_t;
+
+static void
+vdev_indirect_map_free(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ indirect_split_t *is;
+ while ((is = list_head(&iv->iv_splits)) != NULL) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic->ic_data != NULL)
+ abd_free(ic->ic_data);
+ }
+ list_remove(&iv->iv_splits, is);
+
+ indirect_child_t *ic;
+ while ((ic = list_head(&is->is_unique_child)) != NULL)
+ list_remove(&is->is_unique_child, ic);
+
+ list_destroy(&is->is_unique_child);
+
+ kmem_free(is,
+ offsetof(indirect_split_t, is_child[is->is_children]));
+ }
+ kmem_free(iv, sizeof (*iv));
+}
+
+static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
+ .vsd_free = vdev_indirect_map_free,
+ .vsd_cksum_report = zio_vsd_default_cksum_report
+};
+
+/*
+ * Mark the given offset and size as being obsolete.
+ */
+void
+vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
+ ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(size > 0);
+ VERIFY(vdev_indirect_mapping_entry_for_offset(
+ vd->vdev_indirect_mapping, offset) != NULL);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ mutex_enter(&vd->vdev_obsolete_lock);
+ range_tree_add(vd->vdev_obsolete_segments, offset, size);
+ mutex_exit(&vd->vdev_obsolete_lock);
+ vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
+ }
+}
+
+/*
+ * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
+ * wrapper is provided because the DMU does not know about vdev_t's and
+ * cannot directly call vdev_indirect_mark_obsolete.
+ */
+void
+spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx)
+{
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /* The DMU can only remap indirect vdevs. */
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ vdev_indirect_mark_obsolete(vd, offset, size);
+}
+
+static spa_condensing_indirect_t *
+spa_condensing_indirect_create(spa_t *spa)
+{
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
+ objset_t *mos = spa->spa_meta_objset;
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ list_create(&sci->sci_new_mapping_entries[i],
+ sizeof (vdev_indirect_mapping_entry_t),
+ offsetof(vdev_indirect_mapping_entry_t, vime_node));
+ }
+
+ sci->sci_new_mapping =
+ vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
+
+ return (sci);
+}
+
+static void
+spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
+{
+ for (int i = 0; i < TXG_SIZE; i++)
+ list_destroy(&sci->sci_new_mapping_entries[i]);
+
+ if (sci->sci_new_mapping != NULL)
+ vdev_indirect_mapping_close(sci->sci_new_mapping);
+
+ kmem_free(sci, sizeof (*sci));
+}
+
+boolean_t
+vdev_indirect_should_condense(vdev_t *vd)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
+
+ if (!zfs_condense_indirect_vdevs_enable)
+ return (B_FALSE);
+
+ /*
+ * We can only condense one indirect vdev at a time.
+ */
+ if (spa->spa_condensing_indirect != NULL)
+ return (B_FALSE);
+
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+
+ /*
+ * The mapping object size must not change while we are
+ * condensing, so we can only condense indirect vdevs
+ * (not vdevs that are still in the middle of being removed).
+ */
+ if (vd->vdev_ops != &vdev_indirect_ops)
+ return (B_FALSE);
+
+ /*
+ * If nothing new has been marked obsolete, there is no
+ * point in condensing.
+ */
+ uint64_t obsolete_sm_obj __maybe_unused;
+ ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj));
+ if (vd->vdev_obsolete_sm == NULL) {
+ ASSERT0(obsolete_sm_obj);
+ return (B_FALSE);
+ }
+
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+
+ ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm));
+
+ uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
+ uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
+ uint64_t mapping_size = vdev_indirect_mapping_size(vim);
+ uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
+
+ ASSERT3U(bytes_obsolete, <=, bytes_mapped);
+
+ /*
+ * If a high percentage of the bytes that are mapped have become
+ * obsolete, condense (unless the mapping is already small enough).
+ * This has a good chance of reducing the amount of memory used
+ * by the mapping.
+ */
+ if (bytes_obsolete * 100 / bytes_mapped >=
+ zfs_indirect_condense_obsolete_pct &&
+ mapping_size > zfs_condense_min_mapping_bytes) {
+ zfs_dbgmsg("should condense vdev %llu because obsolete "
+ "spacemap covers %d%% of %lluMB mapping",
+ (u_longlong_t)vd->vdev_id,
+ (int)(bytes_obsolete * 100 / bytes_mapped),
+ (u_longlong_t)bytes_mapped / 1024 / 1024);
+ return (B_TRUE);
+ }
+
+ /*
+ * If the obsolete space map takes up too much space on disk,
+ * condense in order to free up this disk space.
+ */
+ if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
+ zfs_dbgmsg("should condense vdev %llu because obsolete sm "
+ "length %lluMB >= max size %lluMB",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)obsolete_sm_size / 1024 / 1024,
+ (u_longlong_t)zfs_condense_max_obsolete_bytes /
+ 1024 / 1024);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * This sync task completes (finishes) a condense, deleting the old
+ * mapping and replacing it with the new one.
+ */
+static void
+spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_condensing_indirect_t *sci = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ objset_t *mos = spa->spa_meta_objset;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
+ uint64_t new_count =
+ vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT3P(sci, ==, spa->spa_condensing_indirect);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
+ }
+ ASSERT(vic->vic_mapping_object != 0);
+ ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
+ ASSERT(scip->scip_next_mapping_object != 0);
+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+
+ /*
+ * Reset vdev_indirect_mapping to refer to the new object.
+ */
+ rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vd->vdev_indirect_mapping = sci->sci_new_mapping;
+ rw_exit(&vd->vdev_indirect_rwlock);
+
+ sci->sci_new_mapping = NULL;
+ vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
+ vic->vic_mapping_object = scip->scip_next_mapping_object;
+ scip->scip_next_mapping_object = 0;
+
+ space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ scip->scip_prev_obsolete_sm_object = 0;
+
+ scip->scip_vdev = 0;
+
+ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, tx));
+ spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
+ spa->spa_condensing_indirect = NULL;
+
+ zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
+ "new mapping object %llu has %llu entries "
+ "(was %llu entries)",
+ vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
+ new_count, old_count);
+
+ vdev_config_dirty(spa->spa_root_vdev);
+}
+
+/*
+ * This sync task appends entries to the new mapping object.
+ */
+static void
+spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_condensing_indirect_t *sci = arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(sci, ==, spa->spa_condensing_indirect);
+
+ vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
+ &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
+}
+
+/*
+ * Open-context function to add one entry to the new mapping. The new
+ * entry will be remembered and written from syncing context.
+ */
+static void
+spa_condense_indirect_commit_entry(spa_t *spa,
+ vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
+{
+ spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
+
+ ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ /*
+ * If we are the first entry committed this txg, kick off the sync
+ * task to write to the MOS on our behalf.
+ */
+ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx),
+ spa_condense_indirect_commit_sync, sci, tx);
+ }
+
+ vdev_indirect_mapping_entry_t *vime =
+ kmem_alloc(sizeof (*vime), KM_SLEEP);
+ vime->vime_mapping = *vimep;
+ vime->vime_obsolete_count = count;
+ list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
+
+ dmu_tx_commit(tx);
+}
+
+static void
+spa_condense_indirect_generate_new_mapping(vdev_t *vd,
+ uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t mapi = start_index;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ uint64_t old_num_entries =
+ vdev_indirect_mapping_num_entries(old_mapping);
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
+
+ zfs_dbgmsg("starting condense of vdev %llu from index %llu",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)mapi);
+
+ while (mapi < old_num_entries) {
+
+ if (zthr_iscancelled(zthr)) {
+ zfs_dbgmsg("pausing condense of vdev %llu "
+ "at index %llu", (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)mapi);
+ break;
+ }
+
+ vdev_indirect_mapping_entry_phys_t *entry =
+ &old_mapping->vim_entries[mapi];
+ uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
+ ASSERT3U(obsolete_counts[mapi], <=, entry_size);
+ if (obsolete_counts[mapi] < entry_size) {
+ spa_condense_indirect_commit_entry(spa, entry,
+ obsolete_counts[mapi]);
+
+ /*
+ * This delay may be requested for testing, debugging,
+ * or performance reasons.
+ */
+ hrtime_t now = gethrtime();
+ hrtime_t sleep_until = now + MSEC2NSEC(
+ zfs_condense_indirect_commit_entry_delay_ms);
+ zfs_sleep_until(sleep_until);
+ }
+
+ mapi++;
+ }
+}
+
+/* ARGSUSED */
+static boolean_t
+spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+
+ return (spa->spa_condensing_indirect != NULL);
+}
+
+/* ARGSUSED */
+static void
+spa_condense_indirect_thread(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+ vdev_t *vd;
+
+ ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
+ ASSERT3P(vd, !=, NULL);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ uint32_t *counts;
+ uint64_t start_index;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ space_map_t *prev_obsolete_sm = NULL;
+
+ ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
+ ASSERT(scip->scip_next_mapping_object != 0);
+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ /*
+ * The list must start out empty in order for the
+ * _commit_sync() sync task to be properly registered
+ * on the first call to _commit_entry(); so it's wise
+ * to double check and ensure we actually are starting
+ * with empty lists.
+ */
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
+ }
+
+ VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
+ scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
+ counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
+ if (prev_obsolete_sm != NULL) {
+ vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
+ counts, prev_obsolete_sm);
+ }
+ space_map_close(prev_obsolete_sm);
+
+ /*
+ * Generate new mapping. Determine what index to continue from
+ * based on the max offset that we've already written in the
+ * new mapping.
+ */
+ uint64_t max_offset =
+ vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
+ if (max_offset == 0) {
+ /* We haven't written anything to the new mapping yet. */
+ start_index = 0;
+ } else {
+ /*
+ * Pick up from where we left off. _entry_for_offset_or_next()
+ * returns a pointer into the vim_entries array. If
+ * max_offset is greater than any of the mappings
+ * contained in the table, NULL will be returned,
+ * indicating we've exhausted our iteration of the
+ * old_mapping.
+ */
+
+ vdev_indirect_mapping_entry_phys_t *entry =
+ vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
+ max_offset);
+
+ if (entry == NULL) {
+ /*
+ * We've already written the whole new mapping.
+ * This special value will cause us to skip the
+ * generate_new_mapping step and just do the sync
+ * task to complete the condense.
+ */
+ start_index = UINT64_MAX;
+ } else {
+ start_index = entry - old_mapping->vim_entries;
+ ASSERT3U(start_index, <,
+ vdev_indirect_mapping_num_entries(old_mapping));
+ }
+ }
+
+ spa_condense_indirect_generate_new_mapping(vd, counts,
+ start_index, zthr);
+
+ vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
+
+ /*
+ * If the zthr has received a cancellation signal while running
+ * in generate_new_mapping() or at any point after that, then bail
+ * early. We don't want to complete the condense if the spa is
+ * shutting down.
+ */
+ if (zthr_iscancelled(zthr))
+ return;
+
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+ spa_condense_indirect_complete_sync, sci, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+/*
+ * Sync task to begin the condensing process.
+ */
+void
+spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+
+ ASSERT0(scip->scip_next_mapping_object);
+ ASSERT0(scip->scip_prev_obsolete_sm_object);
+ ASSERT0(scip->scip_vdev);
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
+ ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
+
+ uint64_t obsolete_sm_obj;
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj));
+ ASSERT3U(obsolete_sm_obj, !=, 0);
+
+ scip->scip_vdev = vd->vdev_id;
+ scip->scip_next_mapping_object =
+ vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
+
+ scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
+
+ /*
+ * We don't need to allocate a new space map object, since
+ * vdev_indirect_sync_obsolete will allocate one when needed.
+ */
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
+
+ VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
+ sizeof (*scip) / sizeof (uint64_t), scip, tx));
+
+ ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
+ spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
+
+ zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
+ "posm=%llu nm=%llu",
+ vd->vdev_id, dmu_tx_get_txg(tx),
+ (u_longlong_t)scip->scip_prev_obsolete_sm_object,
+ (u_longlong_t)scip->scip_next_mapping_object);
+
+ zthr_wakeup(spa->spa_condense_zthr);
+}
+
+/*
+ * Sync to the given vdev's obsolete space map any segments that are no longer
+ * referenced as of the given txg.
+ *
+ * If the obsolete space map doesn't exist yet, create and open it.
+ */
+void
+vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config;
+
+ ASSERT3U(vic->vic_mapping_object, !=, 0);
+ ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
+ ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
+
+ uint64_t obsolete_sm_object;
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object == 0) {
+ obsolete_sm_object = space_map_alloc(spa->spa_meta_objset,
+ zfs_vdev_standard_sm_blksz, tx);
+
+ ASSERT(vd->vdev_top_zap != 0);
+ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
+ sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
+ ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ ASSERT3U(obsolete_sm_object, !=, 0);
+
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
+ spa->spa_meta_objset, obsolete_sm_object,
+ 0, vd->vdev_asize, 0));
+ }
+
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT3U(obsolete_sm_object, ==,
+ space_map_object(vd->vdev_obsolete_sm));
+
+ space_map_write(vd->vdev_obsolete_sm,
+ vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
+ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
+}
+
+int
+spa_condense_init(spa_t *spa)
+{
+ int error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
+ sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
+ &spa->spa_condensing_indirect_phys);
+ if (error == 0) {
+ if (spa_writeable(spa)) {
+ spa->spa_condensing_indirect =
+ spa_condensing_indirect_create(spa);
+ }
+ return (0);
+ } else if (error == ENOENT) {
+ return (0);
+ } else {
+ return (error);
+ }
+}
+
+void
+spa_condense_fini(spa_t *spa)
+{
+ if (spa->spa_condensing_indirect != NULL) {
+ spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
+ spa->spa_condensing_indirect = NULL;
+ }
+}
+
+void
+spa_start_indirect_condensing_thread(spa_t *spa)
+{
+ ASSERT3P(spa->spa_condense_zthr, ==, NULL);
+ spa->spa_condense_zthr = zthr_create("z_indirect_condense",
+ spa_condense_indirect_thread_check,
+ spa_condense_indirect_thread, spa);
+}
+
+/*
+ * Gets the obsolete spacemap object from the vdev's ZAP. On success sm_obj
+ * will contain either the obsolete spacemap object or zero if none exists.
+ * All other errors are returned to the caller.
+ */
+int
+vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_top_zap == 0) {
+ *sm_obj = 0;
+ return (0);
+ }
+
+ int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (uint64_t), 1, sm_obj);
+ if (error == ENOENT) {
+ *sm_obj = 0;
+ error = 0;
+ }
+
+ return (error);
+}
+
+/*
+ * Reads the 'obsolete counts are precise' entry from the vdev's ZAP.
+ * On success are_precise will be set to reflect if the counts are precise.
+ * All other errors are returned to the caller.
+ */
+int
+vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_top_zap == 0) {
+ *are_precise = B_FALSE;
+ return (0);
+ }
+
+ uint64_t val = 0;
+ int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
+ if (error == 0) {
+ *are_precise = (val != 0);
+ } else if (error == ENOENT) {
+ *are_precise = B_FALSE;
+ error = 0;
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static void
+vdev_indirect_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */
+static int
+vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ *psize = *max_psize = vd->vdev_asize +
+ VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ *logical_ashift = vd->vdev_ashift;
+ *physical_ashift = vd->vdev_physical_ashift;
+ return (0);
+}
+
+typedef struct remap_segment {
+ vdev_t *rs_vd;
+ uint64_t rs_offset;
+ uint64_t rs_asize;
+ uint64_t rs_split_offset;
+ list_node_t rs_node;
+} remap_segment_t;
+
+static remap_segment_t *
+rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
+{
+ remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
+ rs->rs_vd = vd;
+ rs->rs_offset = offset;
+ rs->rs_asize = asize;
+ rs->rs_split_offset = split_offset;
+ return (rs);
+}
+
+/*
+ * Given an indirect vdev and an extent on that vdev, it duplicates the
+ * physical entries of the indirect mapping that correspond to the extent
+ * to a new array and returns a pointer to it. In addition, copied_entries
+ * is populated with the number of mapping entries that were duplicated.
+ *
+ * Note that the function assumes that the caller holds vdev_indirect_rwlock.
+ * This ensures that the mapping won't change due to condensing as we
+ * copy over its contents.
+ *
+ * Finally, since we are doing an allocation, it is up to the caller to
+ * free the array allocated in this function.
+ */
+static vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
+ uint64_t asize, uint64_t *copied_entries)
+{
+ vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t entries = 0;
+
+ ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
+
+ vdev_indirect_mapping_entry_phys_t *first_mapping =
+ vdev_indirect_mapping_entry_for_offset(vim, offset);
+ ASSERT3P(first_mapping, !=, NULL);
+
+ vdev_indirect_mapping_entry_phys_t *m = first_mapping;
+ while (asize > 0) {
+ uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+
+ ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
+ ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);
+
+ uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
+ uint64_t inner_size = MIN(asize, size - inner_offset);
+
+ offset += inner_size;
+ asize -= inner_size;
+ entries++;
+ m++;
+ }
+
+ size_t copy_length = entries * sizeof (*first_mapping);
+ duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
+ bcopy(first_mapping, duplicate_mappings, copy_length);
+ *copied_entries = entries;
+
+ return (duplicate_mappings);
+}
+
+/*
+ * Goes through the relevant indirect mappings until it hits a concrete vdev
+ * and issues the callback. On the way to the concrete vdev, if any other
+ * indirect vdevs are encountered, then the callback will also be called on
+ * each of those indirect vdevs. For example, if the segment is mapped to
+ * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
+ * mapped to segment B on concrete vdev 2, then the callback will be called on
+ * both vdev 1 and vdev 2.
+ *
+ * While the callback passed to vdev_indirect_remap() is called on every vdev
+ * the function encounters, certain callbacks only care about concrete vdevs.
+ * These types of callbacks should return immediately and explicitly when they
+ * are called on an indirect vdev.
+ *
+ * Because there is a possibility that a DVA section in the indirect device
+ * has been split into multiple sections in our mapping, we keep track
+ * of the relevant contiguous segments of the new location (remap_segment_t)
+ * in a stack. This way we can call the callback for each of the new sections
+ * created by a single section of the indirect device. Note though, that in
+ * this scenario the callbacks in each split block won't occur in-order in
+ * terms of offset, so callers should not make any assumptions about that.
+ *
+ * For callbacks that don't handle split blocks and immediately return when
+ * they encounter them (as is the case for remap_blkptr_cb), the caller can
+ * assume that its callback will be applied from the first indirect vdev
+ * encountered to the last one and then the concrete vdev, in that order.
+ */
+static void
+vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
+ void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
+{
+ list_t stack;
+ spa_t *spa = vd->vdev_spa;
+
+ list_create(&stack, sizeof (remap_segment_t),
+ offsetof(remap_segment_t, rs_node));
+
+ for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
+ rs != NULL; rs = list_remove_head(&stack)) {
+ vdev_t *v = rs->rs_vd;
+ uint64_t num_entries = 0;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+ ASSERT(rs->rs_asize > 0);
+
+ /*
+ * Note: As this function can be called from open context
+ * (e.g. zio_read()), we need the following rwlock to
+ * prevent the mapping from being changed by condensing.
+ *
+ * So we grab the lock and we make a copy of the entries
+ * that are relevant to the extent that we are working on.
+ * Once that is done, we drop the lock and iterate over
+ * our copy of the mapping. Once we are done with
+ * the remap segment and we free it, we also free our copy
+ * of the indirect mapping entries that are relevant to it.
+ *
+ * This way we don't need to wait until the function is
+ * finished with a segment, to condense it. In addition, we
+ * don't need a recursive rwlock for the case that a call to
+ * vdev_indirect_remap() needs to call itself (through the
+ * codepath of its callback) for the same vdev in the middle
+ * of its execution.
+ */
+ rw_enter(&v->vdev_indirect_rwlock, RW_READER);
+ ASSERT3P(v->vdev_indirect_mapping, !=, NULL);
+
+ vdev_indirect_mapping_entry_phys_t *mapping =
+ vdev_indirect_mapping_duplicate_adjacent_entries(v,
+ rs->rs_offset, rs->rs_asize, &num_entries);
+ ASSERT3P(mapping, !=, NULL);
+ ASSERT3U(num_entries, >, 0);
+ rw_exit(&v->vdev_indirect_rwlock);
+
+ for (uint64_t i = 0; i < num_entries; i++) {
+ /*
+ * Note: the vdev_indirect_mapping can not change
+ * while we are running. It only changes while the
+ * removal is in progress, and then only from syncing
+ * context. While a removal is in progress, this
+ * function is only called for frees, which also only
+ * happen from syncing context.
+ */
+ vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
+
+ ASSERT3P(m, !=, NULL);
+ ASSERT3U(rs->rs_asize, >, 0);
+
+ uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+ uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
+ uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
+
+ ASSERT3U(rs->rs_offset, >=,
+ DVA_MAPPING_GET_SRC_OFFSET(m));
+ ASSERT3U(rs->rs_offset, <,
+ DVA_MAPPING_GET_SRC_OFFSET(m) + size);
+ ASSERT3U(dst_vdev, !=, v->vdev_id);
+
+ uint64_t inner_offset = rs->rs_offset -
+ DVA_MAPPING_GET_SRC_OFFSET(m);
+ uint64_t inner_size =
+ MIN(rs->rs_asize, size - inner_offset);
+
+ vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
+ ASSERT3P(dst_v, !=, NULL);
+
+ if (dst_v->vdev_ops == &vdev_indirect_ops) {
+ list_insert_head(&stack,
+ rs_alloc(dst_v, dst_offset + inner_offset,
+ inner_size, rs->rs_split_offset));
+
+ }
+
+ if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
+ IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
+ /*
+ * Note: This clause exists solely for
+ * testing purposes. We use it to ensure that
+ * split blocks work and that the callbacks
+ * using them yield the same result if issued
+ * in reverse order.
+ */
+ uint64_t inner_half = inner_size / 2;
+
+ func(rs->rs_split_offset + inner_half, dst_v,
+ dst_offset + inner_offset + inner_half,
+ inner_half, arg);
+
+ func(rs->rs_split_offset, dst_v,
+ dst_offset + inner_offset,
+ inner_half, arg);
+ } else {
+ func(rs->rs_split_offset, dst_v,
+ dst_offset + inner_offset,
+ inner_size, arg);
+ }
+
+ rs->rs_offset += inner_size;
+ rs->rs_asize -= inner_size;
+ rs->rs_split_offset += inner_size;
+ }
+ VERIFY0(rs->rs_asize);
+
+ kmem_free(mapping, num_entries * sizeof (*mapping));
+ kmem_free(rs, sizeof (remap_segment_t));
+ }
+ list_destroy(&stack);
+}
+
+static void
+vdev_indirect_child_io_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_private;
+
+ mutex_enter(&pio->io_lock);
+ pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
+ mutex_exit(&pio->io_lock);
+
+ abd_free(zio->io_abd);
+}
+
+/*
+ * This is a callback for vdev_indirect_remap() which allocates an
+ * indirect_split_t for each split segment and adds it to iv_splits.
+ */
+static void
+vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ zio_t *zio = arg;
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ ASSERT3P(vd, !=, NULL);
+
+ if (vd->vdev_ops == &vdev_indirect_ops)
+ return;
+
+ int n = 1;
+ if (vd->vdev_ops == &vdev_mirror_ops)
+ n = vd->vdev_children;
+
+ indirect_split_t *is =
+ kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
+
+ is->is_children = n;
+ is->is_size = size;
+ is->is_split_offset = split_offset;
+ is->is_target_offset = offset;
+ is->is_vdev = vd;
+ list_create(&is->is_unique_child, sizeof (indirect_child_t),
+ offsetof(indirect_child_t, ic_node));
+
+ /*
+ * Note that we only consider multiple copies of the data for
+ * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even
+ * though they use the same ops as mirror, because there's only one
+ * "good" copy under the replacing/spare.
+ */
+ if (vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < n; i++) {
+ is->is_child[i].ic_vdev = vd->vdev_child[i];
+ list_link_init(&is->is_child[i].ic_node);
+ }
+ } else {
+ is->is_child[0].ic_vdev = vd;
+ }
+
+ list_insert_tail(&iv->iv_splits, is);
+}
+
+static void
+vdev_indirect_read_split_done(zio_t *zio)
+{
+ indirect_child_t *ic = zio->io_private;
+
+ if (zio->io_error != 0) {
+ /*
+ * Clear ic_data to indicate that we do not have data for this
+ * child.
+ */
+ abd_free(ic->ic_data);
+ ic->ic_data = NULL;
+ }
+}
+
+/*
+ * Issue reads for all copies (mirror children) of all splits.
+ */
+static void
+vdev_indirect_read_all(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic = &is->is_child[i];
+
+ if (!vdev_readable(ic->ic_vdev))
+ continue;
+
+ /*
+			 * If a child is missing the data, set ic_error; it is
+			 * checked in vdev_indirect_repair(). We still perform
+			 * the read, which provides the opportunity to
+			 * reconstruct the split block if at all possible.
+ */
+ if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING,
+ zio->io_txg, 1))
+ ic->ic_error = SET_ERROR(ESTALE);
+
+ ic->ic_data = abd_alloc_sametype(zio->io_abd,
+ is->is_size);
+ ic->ic_duplicate = NULL;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset, ic->ic_data,
+ is->is_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_read_split_done, ic));
+ }
+ }
+ iv->iv_reconstruct = B_TRUE;
+}
+
+static void
+vdev_indirect_io_start(zio_t *zio)
+{
+ spa_t *spa __maybe_unused = zio->io_spa;
+ indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
+ list_create(&iv->iv_splits,
+ sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
+
+ zio->io_vsd = iv;
+ zio->io_vsd_ops = &vdev_indirect_vsd_ops;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+ if (zio->io_type != ZIO_TYPE_READ) {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ /*
+ * Note: this code can handle other kinds of writes,
+ * but we don't expect them.
+ */
+ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
+ }
+
+ vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
+ vdev_indirect_gather_splits, zio);
+
+ indirect_split_t *first = list_head(&iv->iv_splits);
+ if (first->is_size == zio->io_size) {
+ /*
+ * This is not a split block; we are pointing to the entire
+ * data, which will checksum the same as the original data.
+ * Pass the BP down so that the child i/o can verify the
+ * checksum, and try a different location if available
+ * (e.g. on a mirror).
+ *
+ * While this special case could be handled the same as the
+ * general (split block) case, doing it this way ensures
+ * that the vast majority of blocks on indirect vdevs
+ * (which are not split) are handled identically to blocks
+ * on non-indirect vdevs. This allows us to be less strict
+ * about performance in the general (but rare) case.
+ */
+ ASSERT0(first->is_split_offset);
+ ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ first->is_vdev, first->is_target_offset,
+ abd_get_offset(zio->io_abd, 0),
+ zio->io_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ } else {
+ iv->iv_split_block = B_TRUE;
+ if (zio->io_type == ZIO_TYPE_READ &&
+ zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+ /*
+ * Read all copies. Note that for simplicity,
+ * we don't bother consulting the DTL in the
+ * resilver case.
+ */
+ vdev_indirect_read_all(zio);
+ } else {
+ /*
+ * If this is a read zio, we read one copy of each
+ * split segment, from the top-level vdev. Since
+ * we don't know the checksum of each split
+ * individually, the child zio can't ensure that
+ * we get the right data. E.g. if it's a mirror,
+ * it will just read from a random (healthy) leaf
+ * vdev. We have to verify the checksum in
+ * vdev_indirect_io_done().
+ *
+ * For write zios, the vdev code will ensure we write
+ * to all children.
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ is->is_vdev, is->is_target_offset,
+ abd_get_offset(zio->io_abd,
+ is->is_split_offset), is->is_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ }
+ }
+ }
+
+ zio_execute(zio);
+}
+
+/*
+ * Report a checksum error for a child.
+ */
+static void
+vdev_indirect_checksum_error(zio_t *zio,
+ indirect_split_t *is, indirect_child_t *ic)
+{
+ vdev_t *vd = ic->ic_vdev;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zio_bad_cksum_t zbc = {{{ 0 }}};
+ abd_t *bad_abd = ic->ic_data;
+ abd_t *good_abd = is->is_good_child->ic_data;
+ (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
+ is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc);
+}
+
+/*
+ * Issue repair i/os for any incorrect copies. We do this by comparing
+ * each split segment's correct data (is_good_child's ic_data) with each
+ * other copy of the data. If they differ, then we overwrite the bad data
+ * with the good copy. The DTL is checked in vdev_indirect_read_all() and
+ * if a vdev is missing a copy of the data, we set ic_error and the read is
+ * still performed. This provides the opportunity to reconstruct the split
+ * block if at all possible. ic_error is checked here and, if set, it
+ * suppresses incrementing the checksum counter. Aside from this, DTLs are
+ * not checked,
+ * which simplifies this code and also issues the optimal number of writes
+ * (based on which copies actually read bad data, as opposed to which we
+ * think might be wrong). For the same reason, we always use
+ * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
+ */
+static void
+vdev_indirect_repair(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (!spa_writeable(zio->io_spa))
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic == is->is_good_child)
+ continue;
+ if (ic->ic_data == NULL)
+ continue;
+ if (ic->ic_duplicate == is->is_good_child)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset,
+ is->is_good_child->ic_data, is->is_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
+ NULL, NULL));
+
+ /*
+ * If ic_error is set the current child does not have
+ * a copy of the data, so suppress incrementing the
+ * checksum counter.
+ */
+ if (ic->ic_error == ESTALE)
+ continue;
+
+ vdev_indirect_checksum_error(zio, is, ic);
+ }
+ }
+}
+
+/*
+ * Report checksum errors on all children that we read from.
+ */
+static void
+vdev_indirect_all_checksum_errors(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+
+ if (ic->ic_data == NULL)
+ continue;
+
+ vdev_t *vd = ic->ic_vdev;
+
+ int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
+ NULL, zio, is->is_target_offset, is->is_size,
+ NULL, NULL, NULL);
+ if (ret != EALREADY) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ }
+ }
+}
+
+/*
+ * Copy data from all the splits to the main zio, then validate the checksum.
+ * If the checksum is successfully validated, return success.
+ */
+static int
+vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
+{
+ zio_bad_cksum_t zbc;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+
+ ASSERT3P(is->is_good_child->ic_data, !=, NULL);
+ ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL);
+
+ abd_copy_off(zio->io_abd, is->is_good_child->ic_data,
+ is->is_split_offset, 0, is->is_size);
+ }
+
+ return (zio_checksum_error(zio, &zbc));
+}
+
+/*
+ * There are relatively few possible combinations, making it feasible to
+ * deterministically check them all. We do this by setting the good_child
+ * to the next unique split version. If we reach the end of the list then
+ * "carry over" to the next unique split version (like counting in base
+ * is_unique_children, but each digit can have a different base).
+ */
+static int
+vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
+{
+ boolean_t more = B_TRUE;
+
+ iv->iv_attempts = 0;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is))
+ is->is_good_child = list_head(&is->is_unique_child);
+
+ while (more == B_TRUE) {
+ iv->iv_attempts++;
+ more = B_FALSE;
+
+ if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
+ return (0);
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child = list_next(&is->is_unique_child,
+ is->is_good_child);
+ if (is->is_good_child != NULL) {
+ more = B_TRUE;
+ break;
+ }
+
+ is->is_good_child = list_head(&is->is_unique_child);
+ }
+ }
+
+ ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
+
+ return (SET_ERROR(ECKSUM));
+}
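
The "carry over" scheme described in the comment above vdev_indirect_splits_enumerate_all() is mixed-radix counting: each split contributes one digit, and the digit's base is that split's number of unique copies. The following standalone sketch uses made-up bases (2, 1, and 3 unique copies for three splits) and plain arrays instead of the ZFS lists:

    #include <stdio.h>

    int
    main(void)
    {
        int base[3] = { 2, 1, 3 };      /* unique copies per split (made up) */
        int digit[3] = { 0, 0, 0 };     /* currently selected copy per split */
        int more = 1;

        while (more) {
            /* here the real code would validate the checksum */
            printf("try combination %d %d %d\n", digit[0], digit[1], digit[2]);

            more = 0;
            for (int i = 0; i < 3; i++) {
                if (++digit[i] < base[i]) {
                    more = 1;           /* no carry needed, try this one */
                    break;
                }
                digit[i] = 0;           /* wrapped: carry into the next digit */
            }
        }
        return (0);                     /* prints 2 * 1 * 3 = 6 combinations */
    }
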
+
+/*
+ * There are too many combinations to try all of them in a reasonable amount
+ * of time. So try a fixed number of random combinations from the unique
+ * split versions, after which we'll consider the block unrecoverable.
+ */
+static int
+vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
+{
+ iv->iv_attempts = 0;
+
+ while (iv->iv_attempts < iv->iv_attempts_max) {
+ iv->iv_attempts++;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *ic = list_head(&is->is_unique_child);
+ int children = is->is_unique_children;
+
+ for (int i = spa_get_random(children); i > 0; i--)
+ ic = list_next(&is->is_unique_child, ic);
+
+ ASSERT3P(ic, !=, NULL);
+ is->is_good_child = ic;
+ }
+
+ if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
+ return (0);
+ }
+
+ return (SET_ERROR(ECKSUM));
+}
+
+/*
+ * This is a validation function for reconstruction. It randomly selects
+ * a good combination, if one can be found, and then it intentionally
+ * damages all other segment copies by zeroing them. This forces the
+ * reconstruction algorithm to locate the one remaining known good copy.
+ */
+static int
+vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
+{
+ int error;
+
+ /* Presume all the copies are unique for initial selection. */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_unique_children = 0;
+
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic = &is->is_child[i];
+ if (ic->ic_data != NULL) {
+ is->is_unique_children++;
+ list_insert_tail(&is->is_unique_child, ic);
+ }
+ }
+
+ if (list_is_empty(&is->is_unique_child)) {
+ error = SET_ERROR(EIO);
+ goto out;
+ }
+ }
+
+ /*
+ * Set each is_good_child to a randomly-selected child which
+ * is known to contain validated data.
+ */
+ error = vdev_indirect_splits_enumerate_randomly(iv, zio);
+ if (error)
+ goto out;
+
+ /*
+ * Damage all but the known good copy by zeroing it. This will
+	 * result in at most two unique copies per indirect_split_t.
+	 * Both may need to be checked in order to reconstruct the block.
+	 * Set iv->iv_attempts_max such that all unique combinations will
+	 * be enumerated, but limit the damage to at most 12 indirect splits.
+ */
+ iv->iv_attempts_max = 1;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+
+ if (ic == is->is_good_child)
+ continue;
+ if (ic->ic_data == NULL)
+ continue;
+
+ abd_zero(ic->ic_data, abd_get_size(ic->ic_data));
+ }
+
+ iv->iv_attempts_max *= 2;
+ if (iv->iv_attempts_max >= (1ULL << 12)) {
+ iv->iv_attempts_max = UINT64_MAX;
+ break;
+ }
+ }
+
+out:
+ /* Empty the unique children lists so they can be reconstructed. */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *ic;
+ while ((ic = list_head(&is->is_unique_child)) != NULL)
+ list_remove(&is->is_unique_child, ic);
+
+ is->is_unique_children = 0;
+ }
+
+ return (error);
+}
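
To put numbers on the limit above: iv_attempts_max starts at 1 and doubles once per damaged split, so after zeroing copies in 5 splits it is 2^5 = 32, which is enough to enumerate every combination the zeroing can leave behind (at most two unique copies per split). Once the doubling reaches 2^12 = 4096, the value is clamped to UINT64_MAX and the loop stops damaging further splits, which is the "at most 12 indirect splits" limit mentioned in the comment.
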
+
+/*
+ * This function is called when we have read all copies of the data and need
+ * to try to find a combination of copies that gives us the right checksum.
+ *
+ * If we pointed to any mirror vdevs, this effectively does the job of the
+ * mirror. The mirror vdev code can't do its own job because we don't know
+ * the checksum of each split segment individually.
+ *
+ * We have to try every unique combination of copies of split segments, until
+ * we find one that checksums correctly. Duplicate segment copies are first
+ * identified and later skipped during reconstruction. This optimization
+ * reduces the search space and ensures that of the remaining combinations
+ * at most one is correct.
+ *
+ * When the total number of combinations is small they can all be checked.
+ * For example, if we have 3 segments in the split, and each points to a
+ * 2-way mirror with unique copies, we will have the following pieces of data:
+ *
+ * | mirror child
+ * split | [0] [1]
+ * ======|=====================
+ * A | data_A_0 data_A_1
+ * B | data_B_0 data_B_1
+ * C | data_C_0 data_C_1
+ *
+ * We will try the following (mirror children)^(number of splits) (2^3=8)
+ * combinations, which is similar to bitwise-little-endian counting in
+ * binary. In general each "digit" corresponds to a split segment, and the
+ * base of each digit is is_children, which can be different for each
+ * digit.
+ *
+ * "low bit" "high bit"
+ * v v
+ * data_A_0 data_B_0 data_C_0
+ * data_A_1 data_B_0 data_C_0
+ * data_A_0 data_B_1 data_C_0
+ * data_A_1 data_B_1 data_C_0
+ * data_A_0 data_B_0 data_C_1
+ * data_A_1 data_B_0 data_C_1
+ * data_A_0 data_B_1 data_C_1
+ * data_A_1 data_B_1 data_C_1
+ *
+ * Note that the split segments may be on the same or different top-level
+ * vdevs. In either case, we may need to try lots of combinations (see
+ * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror
+ * has small silent errors on all of its children, we can still reconstruct
+ * the correct data, as long as those errors are at sufficiently-separated
+ * offsets (specifically, separated by the largest block size - default of
+ * 128KB, but up to 16MB).
+ */
+static void
+vdev_indirect_reconstruct_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+ boolean_t known_good = B_FALSE;
+ int error;
+
+ iv->iv_unique_combinations = 1;
+ iv->iv_attempts_max = UINT64_MAX;
+
+ if (zfs_reconstruct_indirect_combinations_max > 0)
+ iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;
+
+ /*
+ * If nonzero, every 1/x blocks will be damaged, in order to validate
+ * reconstruction when there are split segments with damaged copies.
+	 * known_good will be B_TRUE when reconstruction is known to be
+	 * possible.
+ */
+ if (zfs_reconstruct_indirect_damage_fraction != 0 &&
+ spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0)
+ known_good = (vdev_indirect_splits_damage(iv, zio) == 0);
+
+ /*
+ * Determine the unique children for a split segment and add them
+ * to the is_unique_child list. By restricting reconstruction
+ * to these children, only unique combinations will be considered.
+ * This can vastly reduce the search space when there are a large
+ * number of indirect splits.
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_unique_children = 0;
+
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic_i = &is->is_child[i];
+
+ if (ic_i->ic_data == NULL ||
+ ic_i->ic_duplicate != NULL)
+ continue;
+
+ for (int j = i + 1; j < is->is_children; j++) {
+ indirect_child_t *ic_j = &is->is_child[j];
+
+ if (ic_j->ic_data == NULL ||
+ ic_j->ic_duplicate != NULL)
+ continue;
+
+ if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0)
+ ic_j->ic_duplicate = ic_i;
+ }
+
+ is->is_unique_children++;
+ list_insert_tail(&is->is_unique_child, ic_i);
+ }
+
+ /* Reconstruction is impossible, no valid children */
+ EQUIV(list_is_empty(&is->is_unique_child),
+ is->is_unique_children == 0);
+ if (list_is_empty(&is->is_unique_child)) {
+ zio->io_error = EIO;
+ vdev_indirect_all_checksum_errors(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ iv->iv_unique_combinations *= is->is_unique_children;
+ }
+
+ if (iv->iv_unique_combinations <= iv->iv_attempts_max)
+ error = vdev_indirect_splits_enumerate_all(iv, zio);
+ else
+ error = vdev_indirect_splits_enumerate_randomly(iv, zio);
+
+ if (error != 0) {
+ /* All attempted combinations failed. */
+ ASSERT3B(known_good, ==, B_FALSE);
+ zio->io_error = error;
+ vdev_indirect_all_checksum_errors(zio);
+ } else {
+ /*
+ * The checksum has been successfully validated. Issue
+ * repair I/Os to any copies of splits which don't match
+ * the validated version.
+ */
+ ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
+ vdev_indirect_repair(zio);
+ zio_checksum_verified(zio);
+ }
+}
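
To make the sizing concrete (illustrative numbers only): with the three 2-way-mirrored splits from the table in the comment above vdev_indirect_reconstruct_io_done(), iv_unique_combinations starts at 2 * 2 * 2 = 8, so vdev_indirect_splits_enumerate_all() is chosen whenever zfs_reconstruct_indirect_combinations_max is at least 8 (or 0, meaning unlimited). If the two copies of split B happen to be byte-identical, the abd_cmp() pass above marks one of them as a duplicate and only 2 * 1 * 2 = 4 combinations remain to be tried.
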
+
+static void
+vdev_indirect_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (iv->iv_reconstruct) {
+ /*
+ * We have read all copies of the data (e.g. from mirrors),
+ * either because this was a scrub/resilver, or because the
+ * one-copy read didn't checksum correctly.
+ */
+ vdev_indirect_reconstruct_io_done(zio);
+ return;
+ }
+
+ if (!iv->iv_split_block) {
+ /*
+ * This was not a split block, so we passed the BP down,
+ * and the checksum was handled by the (one) child zio.
+ */
+ return;
+ }
+
+ zio_bad_cksum_t zbc;
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret == 0) {
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ /*
+ * The checksum didn't match. Read all copies of all splits, and
+ * then we will try to reconstruct. The next time
+ * vdev_indirect_io_done() is called, iv_reconstruct will be set.
+ */
+ vdev_indirect_read_all(zio);
+
+ zio_vdev_io_redone(zio);
+}
+
+vdev_ops_t vdev_indirect_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_indirect_open,
+ .vdev_op_close = vdev_indirect_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_indirect_io_start,
+ .vdev_op_io_done = vdev_indirect_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = vdev_indirect_remap,
+ .vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* leaf vdev */
+};
+
+EXPORT_SYMBOL(spa_condense_fini);
+EXPORT_SYMBOL(spa_start_indirect_condensing_thread);
+EXPORT_SYMBOL(spa_condense_indirect_start_sync);
+EXPORT_SYMBOL(spa_condense_init);
+EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete);
+EXPORT_SYMBOL(vdev_indirect_mark_obsolete);
+EXPORT_SYMBOL(vdev_indirect_should_condense);
+EXPORT_SYMBOL(vdev_indirect_sync_obsolete);
+EXPORT_SYMBOL(vdev_obsolete_counts_are_precise);
+EXPORT_SYMBOL(vdev_obsolete_sm_object);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW,
+ "Whether to attempt condensing indirect vdev mappings");
+
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, ULONG, ZMOD_RW,
+ "Don't bother condensing if the mapping uses less than this amount of "
+ "memory");
+
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, ULONG, ZMOD_RW,
+ "Minimum size obsolete spacemap to attempt condensing");
+
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, INT, ZMOD_RW,
+ "Used by tests to ensure certain actions happen in the middle of a "
+ "condense. A maximum value of 1 should be sufficient.");
+
+ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, INT, ZMOD_RW,
+ "Maximum number of combinations when reconstructing split segments");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c
new file mode 100644
index 000000000000..99b83c392257
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c
@@ -0,0 +1,226 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/vdev_indirect_births.h>
+
+#ifdef ZFS_DEBUG
+static boolean_t
+vdev_indirect_births_verify(vdev_indirect_births_t *vib)
+{
+ ASSERT(vib != NULL);
+
+ ASSERT(vib->vib_object != 0);
+ ASSERT(vib->vib_objset != NULL);
+ ASSERT(vib->vib_phys != NULL);
+ ASSERT(vib->vib_dbuf != NULL);
+
+ EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL);
+
+ return (B_TRUE);
+}
+#endif
+
+uint64_t
+vdev_indirect_births_count(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib->vib_phys->vib_count);
+}
+
+uint64_t
+vdev_indirect_births_object(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib->vib_object);
+}
+
+static uint64_t
+vdev_indirect_births_size_impl(vdev_indirect_births_t *vib)
+{
+ return (vib->vib_phys->vib_count * sizeof (*vib->vib_entries));
+}
+
+void
+vdev_indirect_births_close(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ if (vib->vib_phys->vib_count > 0) {
+ uint64_t births_size = vdev_indirect_births_size_impl(vib);
+
+ vmem_free(vib->vib_entries, births_size);
+ vib->vib_entries = NULL;
+ }
+
+ dmu_buf_rele(vib->vib_dbuf, vib);
+
+ vib->vib_objset = NULL;
+ vib->vib_object = 0;
+ vib->vib_dbuf = NULL;
+ vib->vib_phys = NULL;
+
+ kmem_free(vib, sizeof (*vib));
+}
+
+uint64_t
+vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ return (dmu_object_alloc(os,
+ DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t),
+ tx));
+}
+
+vdev_indirect_births_t *
+vdev_indirect_births_open(objset_t *os, uint64_t births_object)
+{
+ vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
+
+ vib->vib_objset = os;
+ vib->vib_object = births_object;
+
+ VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf));
+ vib->vib_phys = vib->vib_dbuf->db_data;
+
+ if (vib->vib_phys->vib_count > 0) {
+ uint64_t births_size = vdev_indirect_births_size_impl(vib);
+ vib->vib_entries = vmem_alloc(births_size, KM_SLEEP);
+ VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0,
+ births_size, vib->vib_entries, DMU_READ_PREFETCH));
+ }
+
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib);
+}
+
+void
+vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ VERIFY0(dmu_object_free(os, object, tx));
+}
+
+void
+vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
+ uint64_t max_offset, uint64_t txg, dmu_tx_t *tx)
+{
+ vdev_indirect_birth_entry_phys_t vibe;
+ uint64_t old_size;
+ uint64_t new_size;
+ vdev_indirect_birth_entry_phys_t *new_entries;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ dmu_buf_will_dirty(vib->vib_dbuf, tx);
+
+ vibe.vibe_offset = max_offset;
+ vibe.vibe_phys_birth_txg = txg;
+
+ old_size = vdev_indirect_births_size_impl(vib);
+ dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe),
+ &vibe, tx);
+ vib->vib_phys->vib_count++;
+ new_size = vdev_indirect_births_size_impl(vib);
+
+ new_entries = vmem_alloc(new_size, KM_SLEEP);
+ if (old_size > 0) {
+ bcopy(vib->vib_entries, new_entries, old_size);
+ vmem_free(vib->vib_entries, old_size);
+ }
+ new_entries[vib->vib_phys->vib_count - 1] = vibe;
+ vib->vib_entries = new_entries;
+}
+
+uint64_t
+vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+ ASSERT(vib->vib_phys->vib_count > 0);
+
+ vdev_indirect_birth_entry_phys_t *last =
+ &vib->vib_entries[vib->vib_phys->vib_count - 1];
+ return (last->vibe_phys_birth_txg);
+}
+
+/*
+ * Return the txg in which the given range was copied (i.e. its physical
+ * birth txg). The specified offset+asize must be contiguously mapped
+ * (i.e. not a split block).
+ *
+ * The entries are sorted by increasing phys_birth, and also by increasing
+ * offset. We find the specified offset by binary search. Note that we
+ * can not use bsearch() because looking at each entry independently is
+ * insufficient to find the correct entry. Each entry implicitly relies
+ * on the previous entry: an entry indicates that the offsets from the
+ * end of the previous entry to the end of this entry were written in the
+ * specified txg.
+ */
+uint64_t
+vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset,
+ uint64_t asize)
+{
+ vdev_indirect_birth_entry_phys_t *base;
+ vdev_indirect_birth_entry_phys_t *last;
+
+ ASSERT(vdev_indirect_births_verify(vib));
+ ASSERT(vib->vib_phys->vib_count > 0);
+
+ base = vib->vib_entries;
+ last = base + vib->vib_phys->vib_count - 1;
+
+ ASSERT3U(offset, <, last->vibe_offset);
+
+ while (last >= base) {
+ vdev_indirect_birth_entry_phys_t *p =
+ base + ((last - base) / 2);
+ if (offset >= p->vibe_offset) {
+ base = p + 1;
+ } else if (p == vib->vib_entries ||
+ offset >= (p - 1)->vibe_offset) {
+ ASSERT3U(offset + asize, <=, p->vibe_offset);
+ return (p->vibe_phys_birth_txg);
+ } else {
+ last = p - 1;
+ }
+ }
+ ASSERT(!"offset not found");
+ return (-1);
+}
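
The following is a self-contained sketch of the lookup semantics above, using a made-up in-memory array instead of an on-disk births object (the struct and function names are hypothetical). Entry i covers the offsets in (entries[i-1].end, entries[i].end], so each entry implicitly relies on where the previous one stopped:

    #include <stdio.h>
    #include <stdint.h>

    typedef struct birth {
        uint64_t b_end;     /* like vibe_offset: end of the covered range */
        uint64_t b_txg;     /* like vibe_phys_birth_txg */
    } birth_t;

    static uint64_t
    physbirth(const birth_t *e, int n, uint64_t off)
    {
        int lo = 0, hi = n - 1;

        while (lo <= hi) {
            int mid = lo + (hi - lo) / 2;

            if (off >= e[mid].b_end)
                lo = mid + 1;
            else if (mid == 0 || off >= e[mid - 1].b_end)
                return (e[mid].b_txg);      /* off falls in entry mid */
            else
                hi = mid - 1;
        }
        return (UINT64_MAX);                /* off is beyond the last entry */
    }

    int
    main(void)
    {
        birth_t e[] = { { 100, 10 }, { 250, 11 }, { 400, 15 } };

        printf("%llu\n", (unsigned long long)physbirth(e, 3, 0));    /* 10 */
        printf("%llu\n", (unsigned long long)physbirth(e, 3, 100));  /* 11 */
        printf("%llu\n", (unsigned long long)physbirth(e, 3, 399));  /* 15 */
        return (0);
    }
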
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(vdev_indirect_births_add_entry);
+EXPORT_SYMBOL(vdev_indirect_births_alloc);
+EXPORT_SYMBOL(vdev_indirect_births_close);
+EXPORT_SYMBOL(vdev_indirect_births_count);
+EXPORT_SYMBOL(vdev_indirect_births_free);
+EXPORT_SYMBOL(vdev_indirect_births_last_entry_txg);
+EXPORT_SYMBOL(vdev_indirect_births_object);
+EXPORT_SYMBOL(vdev_indirect_births_open);
+EXPORT_SYMBOL(vdev_indirect_births_physbirth);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c
new file mode 100644
index 000000000000..bb484a401b1b
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c
@@ -0,0 +1,616 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/zfeature.h>
+#include <sys/dmu_objset.h>
+
+#ifdef ZFS_DEBUG
+static boolean_t
+vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vim != NULL);
+
+ ASSERT(vim->vim_object != 0);
+ ASSERT(vim->vim_objset != NULL);
+ ASSERT(vim->vim_phys != NULL);
+ ASSERT(vim->vim_dbuf != NULL);
+
+ EQUIV(vim->vim_phys->vimp_num_entries > 0,
+ vim->vim_entries != NULL);
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ vdev_indirect_mapping_entry_phys_t *last_entry __maybe_unused =
+ &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1];
+ uint64_t offset __maybe_unused =
+ DVA_MAPPING_GET_SRC_OFFSET(last_entry);
+ uint64_t size __maybe_unused =
+ DVA_GET_ASIZE(&last_entry->vimep_dst);
+
+ ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
+ }
+ if (vim->vim_havecounts) {
+ ASSERT(vim->vim_phys->vimp_counts_object != 0);
+ }
+
+ return (B_TRUE);
+}
+#endif
+
+uint64_t
+vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_num_entries);
+}
+
+uint64_t
+vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_max_offset);
+}
+
+uint64_t
+vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_object);
+}
+
+uint64_t
+vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_bytes_mapped);
+}
+
+/*
+ * The length (in bytes) of the mapping object array in memory and
+ * (logically) on disk.
+ *
+ * Note that unlike most of our accessor functions,
+ * we don't assert that the struct is consistent; therefore it can be
+ * called while there may be concurrent changes, if we don't care about
+ * the value being immediately stale (e.g. from spa_removal_get_stats()).
+ */
+uint64_t
+vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
+{
+ return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
+}
+
+/*
+ * Compare an offset with an indirect mapping entry; there are three
+ * possible scenarios:
+ *
+ * 1. The offset is "less than" the mapping entry; meaning the
+ * offset is less than the source offset of the mapping entry. In
+ * this case, there is no overlap between the offset and the
+ * mapping entry and -1 will be returned.
+ *
+ * 2. The offset is "greater than" the mapping entry; meaning the
+ * offset is greater than the mapping entry's source offset plus
+ * the entry's size. In this case, there is no overlap between
+ * the offset and the mapping entry and 1 will be returned.
+ *
+ * NOTE: If the offset is actually equal to the entry's offset
+ * plus size, this is considered to be "greater" than the entry,
+ * and this case applies (i.e. 1 will be returned). Thus, the
+ * entry's "range" can be considered to be inclusive at its
+ * start, but exclusive at its end: e.g. [src, src + size).
+ *
+ * 3. The last case to consider is if the offset actually falls
+ * within the mapping entry's range. If this is the case, the
+ * offset is considered to be "equal to" the mapping entry and
+ * 0 will be returned.
+ *
+ * NOTE: If the offset is equal to the entry's source offset,
+ * this case applies and 0 will be returned. If the offset is
+ * equal to the entry's source plus its size, this case does
+ * *not* apply (see "NOTE" above for scenario 2), and 1 will be
+ * returned.
+ */
+static int
+dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
+{
+ const uint64_t * const key = v_key;
+ const vdev_indirect_mapping_entry_phys_t * const array_elem =
+ v_array_elem;
+ uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
+
+ if (*key < src_offset) {
+ return (-1);
+ } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
+ return (0);
+ } else {
+ return (1);
+ }
+}
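
As a concrete illustration of the three scenarios (with a made-up entry, not on-disk data): for a mapping entry whose source range is [100, 150), i.e. src_offset = 100 and asize = 50, the compare returns:

    key =  99  ->  -1   (entirely before the entry)
    key = 100  ->   0   (the start of the range is inclusive)
    key = 149  ->   0   (the last offset inside the range)
    key = 150  ->   1   (src + size is exclusive, already past the entry)
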
+
+/*
+ * Returns the mapping entry for the given offset.
+ *
+ * It's possible that the given offset will not be in the mapping table
+ * (i.e. no mapping entries contain this offset), in which case, the
+ * return value depends on the "next_if_missing" parameter.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_FALSE, then NULL will always be returned. The behavior is intended
+ * to allow consumers to get the entry corresponding to the offset
+ * parameter, iff the offset overlaps with an entry in the table.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_TRUE, then the entry nearest to the given offset will be returned,
+ * such that the entry's source offset is greater than the offset
+ * passed in (i.e. the "next" mapping entry in the table is returned, if
+ * the offset is missing from the table). If there are no entries whose
+ * source offset is greater than the passed in offset, NULL is returned.
+ */
+static vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
+ uint64_t offset, boolean_t next_if_missing)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+ ASSERT(vim->vim_phys->vimp_num_entries > 0);
+
+ vdev_indirect_mapping_entry_phys_t *entry = NULL;
+
+ uint64_t last = vim->vim_phys->vimp_num_entries - 1;
+ uint64_t base = 0;
+
+ /*
+ * We don't define these inside of the while loop because we use
+ * their value in the case that offset isn't in the mapping.
+ */
+ uint64_t mid;
+ int result;
+
+ while (last >= base) {
+ mid = base + ((last - base) >> 1);
+
+ result = dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[mid]);
+
+ if (result == 0) {
+ entry = &vim->vim_entries[mid];
+ break;
+ } else if (result < 0) {
+ last = mid - 1;
+ } else {
+ base = mid + 1;
+ }
+ }
+
+ if (entry == NULL && next_if_missing) {
+ ASSERT3U(base, ==, last + 1);
+ ASSERT(mid == base || mid == last);
+ ASSERT3S(result, !=, 0);
+
+ /*
+ * The offset we're looking for isn't actually contained
+ * in the mapping table, thus we need to return the
+ * closest mapping entry that is greater than the
+ * offset. We reuse the result of the last comparison,
+ * comparing the mapping entry at index "mid" and the
+ * offset. The offset is guaranteed to lie between
+ * indices one less than "mid", and one greater than
+ * "mid"; we just need to determine if offset is greater
+ * than, or less than the mapping entry contained at
+ * index "mid".
+ */
+
+ uint64_t index;
+ if (result < 0)
+ index = mid;
+ else
+ index = mid + 1;
+
+ ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);
+
+ if (index == vim->vim_phys->vimp_num_entries) {
+ /*
+ * If "index" is past the end of the entries
+ * array, then not only is the offset not in the
+ * mapping table, but it's actually greater than
+ * all entries in the table. In this case, we
+ * can't return a mapping entry greater than the
+ * offset (since none exist), so we return NULL.
+ */
+
+ ASSERT3S(dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index - 1]), >, 0);
+
+ return (NULL);
+ } else {
+ /*
+ * Just to be safe, we verify the offset falls
+ * in between the mapping entries at index and
+ * one less than index. Since we know the offset
+ * doesn't overlap an entry, and we're supposed
+ * to return the entry just greater than the
+ * offset, both of the following tests must be
+ * true.
+ */
+ ASSERT3S(dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index]), <, 0);
+ IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index - 1]) > 0);
+
+ return (&vim->vim_entries[index]);
+ }
+ } else {
+ return (entry);
+ }
+}
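
For example (with made-up entries whose source ranges are [0, 100) and [200, 300)): an offset of 50 returns the first entry regardless of next_if_missing; an offset of 150 returns NULL when next_if_missing is B_FALSE and the [200, 300) entry when it is B_TRUE; and an offset of 350 returns NULL in both cases, since no entry starts beyond it.
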
+
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
+ uint64_t offset)
+{
+ return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
+ B_FALSE));
+}
+
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
+ uint64_t offset)
+{
+ return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
+ B_TRUE));
+}
+
+void
+vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ uint64_t map_size = vdev_indirect_mapping_size(vim);
+ vmem_free(vim->vim_entries, map_size);
+ vim->vim_entries = NULL;
+ }
+
+ dmu_buf_rele(vim->vim_dbuf, vim);
+
+ vim->vim_objset = NULL;
+ vim->vim_object = 0;
+ vim->vim_dbuf = NULL;
+ vim->vim_phys = NULL;
+
+ kmem_free(vim, sizeof (*vim));
+}
+
+uint64_t
+vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ uint64_t object;
+ ASSERT(dmu_tx_is_syncing(tx));
+ uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;
+
+ if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ bonus_size = sizeof (vdev_indirect_mapping_phys_t);
+ }
+
+ object = dmu_object_alloc(os,
+ DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, bonus_size,
+ tx);
+
+ if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ dmu_buf_t *dbuf;
+ vdev_indirect_mapping_phys_t *vimp;
+
+ VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ vimp = dbuf->db_data;
+ vimp->vimp_counts_object = dmu_object_alloc(os,
+ DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
+ spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ dmu_buf_rele(dbuf, FTAG);
+ }
+
+ return (object);
+}
+
+
+vdev_indirect_mapping_t *
+vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
+{
+ vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(os, mapping_object, &doi));
+
+ vim->vim_objset = os;
+ vim->vim_object = mapping_object;
+
+ VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
+ &vim->vim_dbuf));
+ vim->vim_phys = vim->vim_dbuf->db_data;
+
+ vim->vim_havecounts =
+ (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);
+
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ uint64_t map_size = vdev_indirect_mapping_size(vim);
+ vim->vim_entries = vmem_alloc(map_size, KM_SLEEP);
+ VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
+ vim->vim_entries, DMU_READ_PREFETCH));
+ }
+
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim);
+}
+
+void
+vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
+ if (vim->vim_havecounts) {
+ VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
+ tx));
+ spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ }
+ vdev_indirect_mapping_close(vim);
+
+ VERIFY0(dmu_object_free(os, object, tx));
+}
+
+/*
+ * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
+ * mapping object. Also remove the entries from the list and free them.
+ * This also implicitly extends the max_offset of the mapping (to the end
+ * of the last entry).
+ */
+void
+vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
+ list_t *list, dmu_tx_t *tx)
+{
+ vdev_indirect_mapping_entry_phys_t *mapbuf;
+ uint64_t old_size;
+ uint32_t *countbuf = NULL;
+ vdev_indirect_mapping_entry_phys_t *old_entries;
+ uint64_t old_count;
+ uint64_t entries_written = 0;
+
+ ASSERT(vdev_indirect_mapping_verify(vim));
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
+ ASSERT(!list_is_empty(list));
+
+ old_size = vdev_indirect_mapping_size(vim);
+ old_entries = vim->vim_entries;
+ old_count = vim->vim_phys->vimp_num_entries;
+
+ dmu_buf_will_dirty(vim->vim_dbuf, tx);
+
+ mapbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP);
+ if (vim->vim_havecounts) {
+ countbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP);
+ ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
+ SPA_FEATURE_OBSOLETE_COUNTS));
+ }
+ while (!list_is_empty(list)) {
+ uint64_t i;
+ /*
+ * Write entries from the list to the
+ * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
+ */
+ for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
+ vdev_indirect_mapping_entry_t *entry =
+ list_remove_head(list);
+ if (entry == NULL)
+ break;
+
+ uint64_t size =
+ DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
+ uint64_t src_offset =
+ DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);
+
+ /*
+ * We shouldn't be adding an entry which is fully
+ * obsolete.
+ */
+ ASSERT3U(entry->vime_obsolete_count, <, size);
+ IMPLY(entry->vime_obsolete_count != 0,
+ vim->vim_havecounts);
+
+ mapbuf[i] = entry->vime_mapping;
+ if (vim->vim_havecounts)
+ countbuf[i] = entry->vime_obsolete_count;
+
+ vim->vim_phys->vimp_bytes_mapped += size;
+ ASSERT3U(src_offset, >=,
+ vim->vim_phys->vimp_max_offset);
+ vim->vim_phys->vimp_max_offset = src_offset + size;
+
+ entries_written++;
+
+ vmem_free(entry, sizeof (*entry));
+ }
+ dmu_write(vim->vim_objset, vim->vim_object,
+ vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
+ i * sizeof (*mapbuf),
+ mapbuf, tx);
+ if (vim->vim_havecounts) {
+ dmu_write(vim->vim_objset,
+ vim->vim_phys->vimp_counts_object,
+ vim->vim_phys->vimp_num_entries *
+ sizeof (*countbuf),
+ i * sizeof (*countbuf), countbuf, tx);
+ }
+ vim->vim_phys->vimp_num_entries += i;
+ }
+ vmem_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
+ if (vim->vim_havecounts)
+ vmem_free(countbuf, SPA_OLD_MAXBLOCKSIZE);
+
+ /*
+ * Update the entry array to reflect the new entries. First, copy
+ * over any old entries then read back the new entries we just wrote.
+ */
+ uint64_t new_size = vdev_indirect_mapping_size(vim);
+ ASSERT3U(new_size, >, old_size);
+ ASSERT3U(new_size - old_size, ==,
+ entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
+ vim->vim_entries = vmem_alloc(new_size, KM_SLEEP);
+ if (old_size > 0) {
+ bcopy(old_entries, vim->vim_entries, old_size);
+ vmem_free(old_entries, old_size);
+ }
+ VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
+ new_size - old_size, &vim->vim_entries[old_count],
+ DMU_READ_PREFETCH));
+
+ zfs_dbgmsg("txg %llu: wrote %llu entries to "
+ "indirect mapping obj %llu; max offset=0x%llx",
+ (u_longlong_t)dmu_tx_get_txg(tx),
+ (u_longlong_t)entries_written,
+ (u_longlong_t)vim->vim_object,
+ (u_longlong_t)vim->vim_phys->vimp_max_offset);
+}
+
+/*
+ * Increment the relevant counts for the specified offset and length.
+ * The counts array must be obtained from
+ * vdev_indirect_mapping_load_obsolete_counts().
+ */
+void
+vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
+ uint64_t offset, uint64_t length, uint32_t *counts)
+{
+ vdev_indirect_mapping_entry_phys_t *mapping;
+ uint64_t index;
+
+ mapping = vdev_indirect_mapping_entry_for_offset(vim, offset);
+
+ ASSERT(length > 0);
+ ASSERT3P(mapping, !=, NULL);
+
+ index = mapping - vim->vim_entries;
+
+ while (length > 0) {
+ ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));
+
+ uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
+ uint64_t inner_offset = offset -
+ DVA_MAPPING_GET_SRC_OFFSET(mapping);
+ VERIFY3U(inner_offset, <, size);
+ uint64_t inner_size = MIN(length, size - inner_offset);
+
+ VERIFY3U(counts[index] + inner_size, <=, size);
+ counts[index] += inner_size;
+
+ offset += inner_size;
+ length -= inner_size;
+ mapping++;
+ index++;
+ }
+}
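
Worked example (made-up mapping): suppose entry i covers source offsets [0, 64K) and entry i+1 covers [64K, 192K). A call with offset = 48K and length = 32K first charges 16K to entry i (inner_offset = 48K, inner_size = MIN(32K, 64K - 48K) = 16K), then advances to entry i+1 and charges the remaining 16K there (inner_offset = 0, inner_size = 16K).
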
+
+typedef struct load_obsolete_space_map_arg {
+ vdev_indirect_mapping_t *losma_vim;
+ uint32_t *losma_counts;
+} load_obsolete_space_map_arg_t;
+
+static int
+load_obsolete_sm_callback(space_map_entry_t *sme, void *arg)
+{
+ load_obsolete_space_map_arg_t *losma = arg;
+ ASSERT3S(sme->sme_type, ==, SM_ALLOC);
+
+ vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
+ sme->sme_offset, sme->sme_run, losma->losma_counts);
+
+ return (0);
+}
+
+/*
+ * Modify the counts (increment them) based on the spacemap.
+ */
+void
+vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
+ uint32_t *counts, space_map_t *obsolete_space_sm)
+{
+ load_obsolete_space_map_arg_t losma;
+ losma.losma_counts = counts;
+ losma.losma_vim = vim;
+ VERIFY0(space_map_iterate(obsolete_space_sm,
+ space_map_length(obsolete_space_sm),
+ load_obsolete_sm_callback, &losma));
+}
+
+/*
+ * Read the obsolete counts from disk, returning them in an array.
+ */
+uint32_t *
+vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ uint64_t counts_size =
+ vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
+ uint32_t *counts = vmem_alloc(counts_size, KM_SLEEP);
+ if (vim->vim_havecounts) {
+ VERIFY0(dmu_read(vim->vim_objset,
+ vim->vim_phys->vimp_counts_object,
+ 0, counts_size,
+ counts, DMU_READ_PREFETCH));
+ } else {
+ bzero(counts, counts_size);
+ }
+ return (counts);
+}
+
+extern void
+vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
+ uint32_t *counts)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ vmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(vdev_indirect_mapping_add_entries);
+EXPORT_SYMBOL(vdev_indirect_mapping_alloc);
+EXPORT_SYMBOL(vdev_indirect_mapping_bytes_mapped);
+EXPORT_SYMBOL(vdev_indirect_mapping_close);
+EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset);
+EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset_or_next);
+EXPORT_SYMBOL(vdev_indirect_mapping_free);
+EXPORT_SYMBOL(vdev_indirect_mapping_free_obsolete_counts);
+EXPORT_SYMBOL(vdev_indirect_mapping_increment_obsolete_count);
+EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_counts);
+EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_spacemap);
+EXPORT_SYMBOL(vdev_indirect_mapping_max_offset);
+EXPORT_SYMBOL(vdev_indirect_mapping_num_entries);
+EXPORT_SYMBOL(vdev_indirect_mapping_object);
+EXPORT_SYMBOL(vdev_indirect_mapping_open);
+EXPORT_SYMBOL(vdev_indirect_mapping_size);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
new file mode 100644
index 000000000000..083ad2861b5b
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
@@ -0,0 +1,766 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/vdev_initialize.h>
+
+/*
+ * Value that is written to disk during initialization.
+ */
+#ifdef _ILP32
+unsigned long zfs_initialize_value = 0xdeadbeefUL;
+#else
+unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
+#endif
+
+/* maximum number of I/Os outstanding per leaf vdev */
+int zfs_initialize_limit = 1;
+
+/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
+unsigned long zfs_initialize_chunk_size = 1024 * 1024;
+
+static boolean_t
+vdev_initialize_should_stop(vdev_t *vd)
+{
+ return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
+ vd->vdev_detached || vd->vdev_top->vdev_removing);
+}
+
+static void
+vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+ /*
+ * We pass in the guid instead of the vdev_t since the vdev may
+ * have been freed prior to the sync task being processed. This
+ * happens when a vdev is detached as we call spa_config_vdev_exit(),
+ * stop the initializing thread, schedule the sync task, and free
+ * the vdev. Later when the scheduled sync task is invoked, it would
+ * find that the vdev has been freed.
+ */
+ uint64_t guid = *(uint64_t *)arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ kmem_free(arg, sizeof (uint64_t));
+
+ vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ return;
+
+ uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
+ vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
+
+ VERIFY(vd->vdev_leaf_zap != 0);
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+
+ if (last_offset > 0) {
+ vd->vdev_initialize_last_offset = last_offset;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+ sizeof (last_offset), 1, &last_offset, tx));
+ }
+ if (vd->vdev_initialize_action_time > 0) {
+ uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
+ 1, &val, tx));
+ }
+
+ uint64_t initialize_state = vd->vdev_initialize_state;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
+ &initialize_state, tx));
+}
+
+static void
+vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ spa_t *spa = vd->vdev_spa;
+
+ if (new_state == vd->vdev_initialize_state)
+ return;
+
+ /*
+	 * Copy the vd's guid; this will be freed by the sync task.
+ */
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /*
+	 * If we're suspending, then preserve the original start time.
+ */
+ if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
+ vd->vdev_initialize_action_time = gethrestime_sec();
+ }
+
+ vdev_initializing_state_t old_state = vd->vdev_initialize_state;
+ vd->vdev_initialize_state = new_state;
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
+ guid, tx);
+
+ switch (new_state) {
+ case VDEV_INITIALIZE_ACTIVE:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s activated", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_SUSPENDED:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s suspended", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_CANCELED:
+ if (old_state == VDEV_INITIALIZE_ACTIVE ||
+ old_state == VDEV_INITIALIZE_SUSPENDED)
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s canceled", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_COMPLETE:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s complete", vd->vdev_path);
+ break;
+ default:
+ panic("invalid state %llu", (unsigned long long)new_state);
+ }
+
+ dmu_tx_commit(tx);
+
+ if (new_state != VDEV_INITIALIZE_ACTIVE)
+ spa_notify_waiters(spa);
+}
+
+static void
+vdev_initialize_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+ /*
+ * The I/O failed because the vdev was unavailable; roll the
+ * last offset back. (This works because spa_sync waits on
+ * spa_txg_zio before it runs sync tasks.)
+ */
+ uint64_t *off =
+ &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
+ *off = MIN(*off, zio->io_offset);
+ } else {
+ /*
+ * Since initializing is best-effort, we ignore I/O errors and
+ * rely on vdev_probe to determine if the errors are more
+ * critical.
+ */
+ if (zio->io_error != 0)
+ vd->vdev_stat.vs_initialize_errors++;
+
+ vd->vdev_initialize_bytes_done += zio->io_orig_size;
+ }
+ ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+ vd->vdev_initialize_inflight--;
+ cv_broadcast(&vd->vdev_initialize_io_cv);
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/* Takes care of physical writing and limiting # of concurrent ZIOs. */
+static int
+vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /* Limit inflight initializing I/Os */
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ vd->vdev_initialize_inflight++;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /* This is the first write of this txg. */
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_initialize_zap_update_sync, guid, tx);
+ }
+
+ /*
+ * We know the vdev struct will still be around since all
+ * consumers of vdev_free must stop the initialization first.
+ */
+ if (vdev_initialize_should_stop(vd)) {
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+ vd->vdev_initialize_inflight--;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+ mutex_exit(&vd->vdev_initialize_lock);
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINTR));
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
+ zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
+ size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
+ ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
+ /* vdev_initialize_cb releases SCL_STATE_ALL */
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Callback to fill each ABD chunk with zfs_initialize_value. len must be
+ * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
+ * allocation will guarantee these for us.
+ */
+/* ARGSUSED */
+static int
+vdev_initialize_block_fill(void *buf, size_t len, void *unused)
+{
+ ASSERT0(len % sizeof (uint64_t));
+#ifdef _ILP32
+ for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) {
+ *(uint32_t *)((char *)(buf) + i) = zfs_initialize_value;
+ }
+#else
+ for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
+ *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
+ }
+#endif
+ return (0);
+}
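
The following is a standalone sketch of the same fill pattern outside the ABD machinery, using a plain buffer and a hypothetical helper name; it also shows why len must be a multiple of sizeof (uint64_t) and the buffer 8-byte aligned:

    #include <stdint.h>
    #include <stddef.h>
    #include <assert.h>

    /* Fill a buffer with a repeating 64-bit pattern, one word at a time. */
    static void
    fill_pattern(void *buf, size_t len, uint64_t pattern)
    {
        uint64_t *words = buf;

        assert(len % sizeof (uint64_t) == 0);
        for (size_t i = 0; i < len / sizeof (uint64_t); i++)
            words[i] = pattern;
    }

    int
    main(void)
    {
        uint64_t chunk[16];     /* naturally 8-byte aligned */

        fill_pattern(chunk, sizeof (chunk), 0xdeadbeefdeadbeeeULL);
        assert(chunk[0] == 0xdeadbeefdeadbeeeULL);
        assert(chunk[15] == 0xdeadbeefdeadbeeeULL);
        return (0);
    }
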
+
+static abd_t *
+vdev_initialize_block_alloc(void)
+{
+ /* Allocate ABD for filler data */
+ abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
+
+ ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
+ (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
+ vdev_initialize_block_fill, NULL);
+
+ return (data);
+}
+
+static void
+vdev_initialize_block_free(abd_t *data)
+{
+ abd_free(data);
+}
+
+static int
+vdev_initialize_ranges(vdev_t *vd, abd_t *data)
+{
+ range_tree_t *rt = vd->vdev_initialize_tree;
+ zfs_btree_t *bt = &rt->rt_root;
+ zfs_btree_index_t where;
+
+ for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
+ rs = zfs_btree_next(bt, &where, &where)) {
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
+
+ /* Split range into legally-sized physical chunks */
+ uint64_t writes_required =
+ ((size - 1) / zfs_initialize_chunk_size) + 1;
+
+ for (uint64_t w = 0; w < writes_required; w++) {
+ int error;
+
+ error = vdev_initialize_write(vd,
+ VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) +
+ (w * zfs_initialize_chunk_size),
+ MIN(size - (w * zfs_initialize_chunk_size),
+ zfs_initialize_chunk_size), data);
+ if (error != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
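
Worked example of the chunking above (illustrative numbers): for a 2.5 MiB free segment (2,621,440 bytes) and the default zfs_initialize_chunk_size of 1 MiB, writes_required = ((2621440 - 1) / 1048576) + 1 = 3, and the MIN() clamps the final write to the remaining 0.5 MiB, so the segment is written as 1 MiB + 1 MiB + 0.5 MiB.
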
+
+static void
+vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+{
+ uint64_t *last_rs_end = (uint64_t *)arg;
+
+ if (physical_rs->rs_end > *last_rs_end)
+ *last_rs_end = physical_rs->rs_end;
+}
+
+static void
+vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
+{
+ vdev_t *vd = (vdev_t *)arg;
+
+ uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
+ vd->vdev_initialize_bytes_est += size;
+
+ if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
+ vd->vdev_initialize_bytes_done += size;
+ } else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
+ vd->vdev_initialize_last_offset < physical_rs->rs_end) {
+ vd->vdev_initialize_bytes_done +=
+ vd->vdev_initialize_last_offset - physical_rs->rs_start;
+ }
+}
+
+static void
+vdev_initialize_calculate_progress(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ vd->vdev_initialize_bytes_est = 0;
+ vd->vdev_initialize_bytes_done = 0;
+
+ for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+ mutex_enter(&msp->ms_lock);
+
+ uint64_t ms_free = (msp->ms_size -
+ metaslab_allocated_space(msp)) /
+ vdev_get_ndisks(vd->vdev_top);
+
+ /*
+ * Convert the metaslab range to a physical range
+ * on our vdev. We use this to determine if we are
+ * in the middle of this metaslab range.
+ */
+ range_seg64_t logical_rs, physical_rs, remain_rs;
+ logical_rs.rs_start = msp->ms_start;
+ logical_rs.rs_end = msp->ms_start + msp->ms_size;
+
+ /* Metaslab space after this offset has not been initialized */
+ vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
+ if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
+ vd->vdev_initialize_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /* Metaslab space before this offset has been initialized */
+ uint64_t last_rs_end = physical_rs.rs_end;
+ if (!vdev_xlate_is_empty(&remain_rs)) {
+ vdev_xlate_walk(vd, &remain_rs,
+ vdev_initialize_xlate_last_rs_end, &last_rs_end);
+ }
+
+ if (vd->vdev_initialize_last_offset > last_rs_end) {
+ vd->vdev_initialize_bytes_done += ms_free;
+ vd->vdev_initialize_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * If we get here, we're in the middle of initializing this
+ * metaslab. Load it and walk the free tree for more accurate
+ * progress estimation.
+ */
+ VERIFY0(metaslab_load(msp));
+
+ zfs_btree_index_t where;
+ range_tree_t *rt = msp->ms_allocatable;
+ for (range_seg_t *rs =
+ zfs_btree_first(&rt->rt_root, &where); rs;
+ rs = zfs_btree_next(&rt->rt_root, &where,
+ &where)) {
+ logical_rs.rs_start = rs_get_start(rs, rt);
+ logical_rs.rs_end = rs_get_end(rs, rt);
+
+ vdev_xlate_walk(vd, &logical_rs,
+ vdev_initialize_xlate_progress, vd);
+ }
+ mutex_exit(&msp->ms_lock);
+ }
+}
+
+static int
+vdev_initialize_load(vdev_t *vd)
+{
+ int err = 0;
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
+ vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+ sizeof (vd->vdev_initialize_last_offset), 1,
+ &vd->vdev_initialize_last_offset);
+ if (err == ENOENT) {
+ vd->vdev_initialize_last_offset = 0;
+ err = 0;
+ }
+ }
+
+ vdev_initialize_calculate_progress(vd);
+ return (err);
+}
+
+static void
+vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+{
+ vdev_t *vd = arg;
+
+ /* Only add segments that we have not visited yet */
+ if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
+ return;
+
+ /* Pick up where we left off mid-range. */
+ if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
+ zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
+ "(%llu, %llu)", vd->vdev_path,
+ (u_longlong_t)physical_rs->rs_start,
+ (u_longlong_t)physical_rs->rs_end,
+ (u_longlong_t)vd->vdev_initialize_last_offset,
+ (u_longlong_t)physical_rs->rs_end);
+ ASSERT3U(physical_rs->rs_end, >,
+ vd->vdev_initialize_last_offset);
+ physical_rs->rs_start = vd->vdev_initialize_last_offset;
+ }
+
+ ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
+
+ range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
+ physical_rs->rs_end - physical_rs->rs_start);
+}
+
+/*
+ * Convert the logical range into a physical range and add it to the
+ * vdev's initialize range tree.
+ */
+static void
+vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = arg;
+ range_seg64_t logical_rs;
+ logical_rs.rs_start = start;
+ logical_rs.rs_end = start + size;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
+}
+
+static void
+vdev_initialize_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+ uint64_t ms_count = 0;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ vd->vdev_initialize_last_offset = 0;
+ VERIFY0(vdev_initialize_load(vd));
+
+ abd_t *deadbeef = vdev_initialize_block_alloc();
+
+ vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
+
+ for (uint64_t i = 0; !vd->vdev_detached &&
+ i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+ boolean_t unload_when_done = B_FALSE;
+
+ /*
+ * If we've expanded the top-level vdev or it's our
+ * first pass, calculate our progress.
+ */
+ if (vd->vdev_top->vdev_ms_count != ms_count) {
+ vdev_initialize_calculate_progress(vd);
+ ms_count = vd->vdev_top->vdev_ms_count;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ metaslab_disable(msp);
+ mutex_enter(&msp->ms_lock);
+ if (!msp->ms_loaded && !msp->ms_loading)
+ unload_when_done = B_TRUE;
+ VERIFY0(metaslab_load(msp));
+
+ range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
+ vd);
+ mutex_exit(&msp->ms_lock);
+
+ error = vdev_initialize_ranges(vd, deadbeef);
+ metaslab_enable(msp, B_TRUE, unload_when_done);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
+ if (error != 0)
+ break;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight > 0) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ range_tree_destroy(vd->vdev_initialize_tree);
+ vdev_initialize_block_free(deadbeef);
+ vd->vdev_initialize_tree = NULL;
+
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
+ }
+ ASSERT(vd->vdev_initialize_thread != NULL ||
+ vd->vdev_initialize_inflight == 0);
+
+ /*
+ * Drop the vdev_initialize_lock while we sync out the
+ * txg since it's possible that a device might be trying to
+ * come online and must check to see if it needs to restart an
+ * initialization. That thread will be holding the spa_config_lock
+ * which would prevent the txg_wait_synced from completing.
+ */
+ mutex_exit(&vd->vdev_initialize_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ vd->vdev_initialize_thread = NULL;
+ cv_broadcast(&vd->vdev_initialize_cv);
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ thread_exit();
+}
+
+/*
+ * Initiates initialization of a device. The caller must hold
+ * vdev_initialize_lock. The device must be a leaf and must not already be
+ * initializing.
+ */
+void
+vdev_initialize(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_initialize_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
+ vd->vdev_initialize_thread = thread_create(NULL, 0,
+ vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+}
+
+/*
+ * Wait for the initialize thread to be terminated (cancelled or stopped).
+ */
+static void
+vdev_initialize_stop_wait_impl(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+
+ while (vd->vdev_initialize_thread != NULL)
+ cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
+
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ vd->vdev_initialize_exit_wanted = B_FALSE;
+}
+
+/*
+ * Wait for the initialize threads of the vdevs on the provided list to
+ * cleanly exit.
+ */
+void
+vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
+{
+ vdev_t *vd;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ while ((vd = list_remove_head(vd_list)) != NULL) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop_wait_impl(vd);
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+}
+
+/*
+ * Stop initializing a device, with the resultant initializing state being
+ * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when
+ * a list_t is provided the stopping vdev is inserted into the list. Callers
+ * are then required to call vdev_initialize_stop_wait() to block for all the
+ * initialization threads to exit. The caller must hold vdev_initialize_lock
+ * and must not be writing to the spa config, as the initializing thread may
+ * try to enter the config as a reader before exiting.
+ */
+void
+vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
+ list_t *vd_list)
+{
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+
+ /*
+ * Allow cancel requests to proceed even if the initialize thread
+ * has stopped.
+ */
+ if (vd->vdev_initialize_thread == NULL &&
+ tgt_state != VDEV_INITIALIZE_CANCELED) {
+ return;
+ }
+
+ vdev_initialize_change_state(vd, tgt_state);
+ vd->vdev_initialize_exit_wanted = B_TRUE;
+
+ if (vd_list == NULL) {
+ vdev_initialize_stop_wait_impl(vd);
+ } else {
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ list_insert_tail(vd_list, vd);
+ }
+}
+
+static void
+vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
+ list_t *vd_list)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, tgt_state, vd_list);
+ mutex_exit(&vd->vdev_initialize_lock);
+ return;
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
+ vd_list);
+ }
+}
+
+/*
+ * Convenience function to stop initialization of a vdev tree and set all
+ * initialize thread pointers to NULL.
+ */
+void
+vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ spa_t *spa = vd->vdev_spa;
+ list_t vd_list;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_initialize_node));
+
+ vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
+ vdev_initialize_stop_wait(spa, &vd_list);
+
+ if (vd->vdev_spa->spa_sync_on) {
+ /* Make sure that our state has been synced to disk */
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ }
+
+ list_destroy(&vd_list);
+}
+
+void
+vdev_initialize_restart(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_leaf_zap != 0) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ uint64_t initialize_state = VDEV_INITIALIZE_NONE;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
+ sizeof (initialize_state), 1, &initialize_state);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_state = initialize_state;
+
+ uint64_t timestamp = 0;
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
+ sizeof (timestamp), 1, &timestamp);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_action_time = timestamp;
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vd->vdev_offline) {
+ /* load progress for reporting, but don't resume */
+ VERIFY0(vdev_initialize_load(vd));
+ } else if (vd->vdev_initialize_state ==
+ VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
+ !vd->vdev_top->vdev_removing &&
+ vd->vdev_initialize_thread == NULL) {
+ vdev_initialize(vd);
+ }
+
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_restart(vd->vdev_child[i]);
+ }
+}
+
+EXPORT_SYMBOL(vdev_initialize);
+EXPORT_SYMBOL(vdev_initialize_stop);
+EXPORT_SYMBOL(vdev_initialize_stop_all);
+EXPORT_SYMBOL(vdev_initialize_stop_wait);
+EXPORT_SYMBOL(vdev_initialize_restart);
+
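+/*
+ * The tunables below are exposed as ZFS module parameters (on Linux,
+ * typically under /sys/module/zfs/parameters/); being ZMOD_RW, they may
+ * be changed at runtime.
+ */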
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW,
+ "Value written during zpool initialize");
+
+ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW,
+ "Size in bytes of writes by zpool initialize");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
new file mode 100644
index 000000000000..04202a9f8960
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -0,0 +1,1992 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+/*
+ * Virtual Device Labels
+ * ---------------------
+ *
+ * The vdev label serves several distinct purposes:
+ *
+ * 1. Uniquely identify this device as part of a ZFS pool and confirm its
+ * identity within the pool.
+ *
+ * 2. Verify that all the devices given in a configuration are present
+ * within the pool.
+ *
+ * 3. Determine the uberblock for the pool.
+ *
+ * 4. In case of an import operation, determine the configuration of the
+ * toplevel vdev of which it is a part.
+ *
+ * 5. If an import operation cannot find all the devices in the pool,
+ * provide enough information to the administrator to determine which
+ * devices are missing.
+ *
+ * It is important to note that while the kernel is responsible for writing the
+ * label, it only consumes the information in the first three cases. The
+ * latter information is only consumed in userland when determining the
+ * configuration to import a pool.
+ *
+ *
+ * Label Organization
+ * ------------------
+ *
+ * Before describing the contents of the label, it's important to understand how
+ * the labels are written and updated with respect to the uberblock.
+ *
+ * When the pool configuration is altered, either because it was newly created
+ * or a device was added, we want to update all the labels such that we can deal
+ * with fatal failure at any point. To this end, each disk has two labels which
+ * are updated before and after the uberblock is synced. Assuming we have
+ * labels and an uberblock with the following transaction groups:
+ *
+ * L1 UB L2
+ * +------+ +------+ +------+
+ * | | | | | |
+ * | t10 | | t10 | | t10 |
+ * | | | | | |
+ * +------+ +------+ +------+
+ *
+ * In this stable state, the labels and the uberblock were all updated within
+ * the same transaction group (10). Each label is mirrored and checksummed, so
+ * that we can detect when we fail partway through writing the label.
+ *
+ * In order to identify which labels are valid, the labels are written in the
+ * following manner:
+ *
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
+ *
+ * Given arbitrary failure, we can determine the correct label to use based on
+ * the transaction group. If we fail after updating L1 but before updating the
+ * UB, we will notice that L1's transaction group is greater than the uberblock,
+ * so L2 must be valid. If we fail after writing the uberblock but before
+ * writing L2, we will notice that L2's transaction group is less than L1, and
+ * therefore L1 is valid.
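+ *
+ * For example, while moving from txg 10 to txg 11, a crash can leave:
+ *
+ *   L1 = t11, UB = t10, L2 = t10   L1 is newer than the UB, so L2 is used
+ *   L1 = t11, UB = t11, L2 = t10   L2 is older than L1, so L1 is used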
+ *
+ * Another added complexity is that not every label is updated when the config
+ * is synced. If we add a single device, we do not want to have to re-write
+ * every label for every device in the pool. This means that both L1 and L2 may
+ * be older than the pool uberblock, because the necessary information is stored
+ * on another vdev.
+ *
+ *
+ * On-disk Format
+ * --------------
+ *
+ * The vdev label consists of two distinct parts, and is wrapped within the
+ * vdev_label_t structure. The label includes 8k of padding to permit legacy
+ * VTOC disk labels; this padding is otherwise ignored.
+ *
+ * The first half of the label is a packed nvlist which contains pool wide
+ * properties, per-vdev properties, and configuration information. It is
+ * described in more detail below.
+ *
+ * The latter half of the label consists of a redundant array of uberblocks.
+ * These uberblocks are updated whenever a transaction group is committed,
+ * or when the configuration is updated. When a pool is loaded, we scan each
+ * vdev for the 'best' uberblock.
+ *
+ *
+ * Configuration Information
+ * -------------------------
+ *
+ * The nvlist describing the pool and vdev contains the following elements:
+ *
+ * version ZFS on-disk version
+ * name Pool name
+ * state Pool state
+ * txg Transaction group in which this label was written
+ * pool_guid Unique identifier for this pool
+ * vdev_tree An nvlist describing vdev tree.
+ * features_for_read
+ * An nvlist of the features necessary for reading the MOS.
+ *
+ * Each leaf device label also contains the following:
+ *
+ * top_guid Unique ID for top-level vdev in which this is contained
+ * guid Unique ID for the leaf vdev
+ *
+ * The 'vs' configuration follows the format described in 'spa_config.c'.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/zio.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/byteorder.h>
+#include <sys/zfs_bootenv.h>
+
+/*
+ * Basic routines to read and write from a vdev label.
+ * Used throughout the rest of this file.
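+ *
+ * Each device carries VDEV_LABELS (four) copies of the label: two at the
+ * front of the device and two at the end. With the usual 256K label size,
+ * labels 0 and 1 live at offsets 0 and 256K, while labels 2 and 3 live at
+ * psize - 512K and psize - 256K. vdev_label_offset() maps a label index
+ * and an offset within the label to a physical offset on the device.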
+ */
+uint64_t
+vdev_label_offset(uint64_t psize, int l, uint64_t offset)
+{
+ ASSERT(offset < sizeof (vdev_label_t));
+ ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);
+
+ return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
+ 0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
+}
+
+/*
+ * Returns the vdev label number associated with the given offset, or -1 if
+ * the offset does not fall within a label.
+ */
+int
+vdev_label_number(uint64_t psize, uint64_t offset)
+{
+ int l;
+
+ if (offset >= psize - VDEV_LABEL_END_SIZE) {
+ offset -= psize - VDEV_LABEL_END_SIZE;
+ offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t);
+ }
+ l = offset / sizeof (vdev_label_t);
+ return (l < VDEV_LABELS ? l : -1);
+}
+
+static void
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private, int flags)
+{
+ ASSERT(
+ spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
+ spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
+ ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
+
+ zio_nowait(zio_read_phys(zio, vd,
+ vdev_label_offset(vd->vdev_psize, l, offset),
+ size, buf, ZIO_CHECKSUM_LABEL, done, private,
+ ZIO_PRIORITY_SYNC_READ, flags, B_TRUE));
+}
+
+void
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private, int flags)
+{
+ ASSERT(
+ spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
+ spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
+ ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
+
+ zio_nowait(zio_write_phys(zio, vd,
+ vdev_label_offset(vd->vdev_psize, l, offset),
+ size, buf, ZIO_CHECKSUM_LABEL, done, private,
+ ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
+}
+
+/*
+ * Generate the nvlist representing this vdev's stats
+ */
+void
+vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
+{
+ nvlist_t *nvx;
+ vdev_stat_t *vs;
+ vdev_stat_ex_t *vsx;
+
+ vs = kmem_alloc(sizeof (*vs), KM_SLEEP);
+ vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP);
+
+ vdev_get_stats_ex(vd, vs, vsx);
+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t));
+
+ /*
+ * Add extended stats into a special extended stats nvlist. This keeps
+ * all the extended stats nicely grouped together. The extended stats
+ * nvlist is then added to the main nvlist.
+ */
+ nvx = fnvlist_alloc();
+
+ /* ZIOs in flight to disk */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]);
+
+ /* ZIOs pending */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]);
+
+ /* Histograms */
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ vsx->vsx_total_histo[ZIO_TYPE_READ],
+ ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ vsx->vsx_total_histo[ZIO_TYPE_WRITE],
+ ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ vsx->vsx_disk_histo[ZIO_TYPE_READ],
+ ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ vsx->vsx_disk_histo[ZIO_TYPE_WRITE],
+ ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM]));
+
+ /* Request sizes */
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM]));
+
+ /* IO delays */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
+
+ /* Add extended stats nvlist to main nvlist */
+ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
+
+ fnvlist_free(nvx);
+ kmem_free(vs, sizeof (*vs));
+ kmem_free(vsx, sizeof (*vsx));
+}
+
+static void
+root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd != spa->spa_root_vdev)
+ return;
+
+ /* provide either current or previous scan information */
+ pool_scan_stat_t ps;
+ if (spa_scan_get_stats(spa, &ps) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+ sizeof (pool_scan_stat_t) / sizeof (uint64_t));
+ }
+
+ pool_removal_stat_t prs;
+ if (spa_removal_get_stats(spa, &prs) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
+ sizeof (prs) / sizeof (uint64_t));
+ }
+
+ pool_checkpoint_stat_t pcs;
+ if (spa_checkpoint_get_stats(spa, &pcs) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
+ sizeof (pcs) / sizeof (uint64_t));
+ }
+}
+
+static void
+top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
+{
+ if (vd == vd->vdev_top) {
+ vdev_rebuild_stat_t vrs;
+ if (vdev_rebuild_get_stats(vd, &vrs) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs,
+ sizeof (vrs) / sizeof (uint64_t));
+ }
+ }
+}
+
+/*
+ * Generate the nvlist representing this vdev's config.
+ */
+nvlist_t *
+vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
+ vdev_config_flag_t flags)
+{
+ nvlist_t *nv = NULL;
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ nv = fnvlist_alloc();
+
+ fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
+
+ if (vd->vdev_path != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
+
+ if (vd->vdev_devid != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
+
+ if (vd->vdev_physpath != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ vd->vdev_physpath);
+
+ if (vd->vdev_enc_sysfs_path != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+ vd->vdev_enc_sysfs_path);
+
+ if (vd->vdev_fru != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
+
+ if (vd->vdev_ops->vdev_op_config_generate != NULL)
+ vd->vdev_ops->vdev_op_config_generate(vd, nv);
+
+ if (vd->vdev_wholedisk != -1ULL) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ vd->vdev_wholedisk);
+ }
+
+ if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
+
+ if (vd->vdev_isspare)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
+
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+ vd == vd->vdev_top) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ vd->vdev_ms_array);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ vd->vdev_ms_shift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ vd->vdev_asize);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
+ if (vd->vdev_removing) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ vd->vdev_removing);
+ }
+
+ /* zpool command expects alloc class data */
+ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
+ const char *bias = NULL;
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ bias = VDEV_ALLOC_BIAS_LOG;
+ break;
+ case VDEV_BIAS_SPECIAL:
+ bias = VDEV_ALLOC_BIAS_SPECIAL;
+ break;
+ case VDEV_BIAS_DEDUP:
+ bias = VDEV_ALLOC_BIAS_DEDUP;
+ break;
+ default:
+ ASSERT3U(vd->vdev_alloc_bias, ==,
+ VDEV_BIAS_NONE);
+ }
+ fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ bias);
+ }
+ }
+
+ if (vd->vdev_dtl_sm != NULL) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+ space_map_object(vd->vdev_dtl_sm));
+ }
+
+ if (vic->vic_mapping_object != 0) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
+ vic->vic_mapping_object);
+ }
+
+ if (vic->vic_births_object != 0) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
+ vic->vic_births_object);
+ }
+
+ if (vic->vic_prev_indirect_vdev != UINT64_MAX) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
+ vic->vic_prev_indirect_vdev);
+ }
+
+ if (vd->vdev_crtxg)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
+
+ if (vd->vdev_expansion_time)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_EXPANSION_TIME,
+ vd->vdev_expansion_time);
+
+ if (flags & VDEV_CONFIG_MOS) {
+ if (vd->vdev_leaf_zap != 0) {
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
+ vd->vdev_leaf_zap);
+ }
+
+ if (vd->vdev_top_zap != 0) {
+ ASSERT(vd == vd->vdev_top);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
+ vd->vdev_top_zap);
+ }
+
+ if (vd->vdev_resilver_deferred) {
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(spa->spa_resilver_deferred);
+ fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER);
+ }
+ }
+
+ if (getstats) {
+ vdev_config_generate_stats(vd, nv);
+
+ root_vdev_actions_getprogress(vd, nv);
+ top_vdev_actions_getprogress(vd, nv);
+
+ /*
+ * Note: this can be called from open context
+ * (spa_get_stats()), so we need the rwlock to prevent
+ * the mapping from being changed by condensing.
+ */
+ rw_enter(&vd->vdev_indirect_rwlock, RW_READER);
+ if (vd->vdev_indirect_mapping != NULL) {
+ ASSERT(vd->vdev_indirect_births != NULL);
+ vdev_indirect_mapping_t *vim =
+ vd->vdev_indirect_mapping;
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
+ vdev_indirect_mapping_size(vim));
+ }
+ rw_exit(&vd->vdev_indirect_rwlock);
+ if (vd->vdev_mg != NULL &&
+ vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) {
+ /*
+ * Compute approximately how much memory would be used
+ * for the indirect mapping if this device were to
+ * be removed.
+ *
+ * Note: If the frag metric is invalid, then not
+ * enough metaslabs have been converted to have
+ * histograms.
+ */
+ uint64_t seg_count = 0;
+ uint64_t to_alloc = vd->vdev_stat.vs_alloc;
+
+ /*
+ * There are the same number of allocated segments
+ * as free segments, so we will have at least one
+ * entry per free segment. However, small free
+ * segments (smaller than vdev_removal_max_span)
+ * will be combined with adjacent allocated segments
+ * as a single mapping.
+ */
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ if (i + 1 < highbit64(vdev_removal_max_span)
+ - 1) {
+ to_alloc +=
+ vd->vdev_mg->mg_histogram[i] <<
+ (i + 1);
+ } else {
+ seg_count +=
+ vd->vdev_mg->mg_histogram[i];
+ }
+ }
+
+ /*
+ * The maximum length of a mapping is
+ * zfs_remove_max_segment, so we need at least one entry
+ * per zfs_remove_max_segment of allocated data.
+ */
+ seg_count += to_alloc / spa_remove_max_segment(spa);
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
+ seg_count *
+ sizeof (vdev_indirect_mapping_entry_phys_t));
+ }
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t **child;
+ int c, idx;
+
+ ASSERT(!vd->vdev_ishole);
+
+ child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
+ KM_SLEEP);
+
+ for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ /*
+ * If we're generating an nvlist of removing
+ * vdevs then skip over any device which is
+ * not being removed.
+ */
+ if ((flags & VDEV_CONFIG_REMOVING) &&
+ !cvd->vdev_removing)
+ continue;
+
+ child[idx++] = vdev_config_generate(spa, cvd,
+ getstats, flags);
+ }
+
+ if (idx) {
+ fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ child, idx);
+ }
+
+ for (c = 0; c < idx; c++)
+ nvlist_free(child[c]);
+
+ kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
+
+ } else {
+ const char *aux = NULL;
+
+ if (vd->vdev_offline && !vd->vdev_tmpoffline)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
+ if (vd->vdev_resilver_txg != 0)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+ vd->vdev_resilver_txg);
+ if (vd->vdev_rebuild_txg != 0)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
+ vd->vdev_rebuild_txg);
+ if (vd->vdev_faulted)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
+ if (vd->vdev_degraded)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
+ if (vd->vdev_removed)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
+ if (vd->vdev_unspare)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
+ if (vd->vdev_ishole)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
+
+ /* Set the reason why we're FAULTED/DEGRADED. */
+ switch (vd->vdev_stat.vs_aux) {
+ case VDEV_AUX_ERR_EXCEEDED:
+ aux = "err_exceeded";
+ break;
+
+ case VDEV_AUX_EXTERNAL:
+ aux = "external";
+ break;
+ }
+
+ if (aux != NULL && !vd->vdev_tmpoffline) {
+ fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
+ } else {
+ /*
+ * We're healthy - clear any previous AUX_STATE values.
+ */
+ if (nvlist_exists(nv, ZPOOL_CONFIG_AUX_STATE))
+ nvlist_remove_all(nv, ZPOOL_CONFIG_AUX_STATE);
+ }
+
+ if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+ vd->vdev_orig_guid);
+ }
+ }
+
+ return (nv);
+}
+
+/*
+ * Generate a view of the top-level vdevs. If we currently have holes
+ * in the namespace, then generate an array which contains a list of holey
+ * vdevs. Additionally, add the number of top-level children that currently
+ * exist.
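+ *
+ * For example, if a pool has three top-level slots and slot 1 is a hole,
+ * ZPOOL_CONFIG_HOLE_ARRAY will contain [1] and ZPOOL_CONFIG_VDEV_CHILDREN
+ * will be 3.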
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *array;
+ uint_t c, idx;
+
+ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
+
+ for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_ishole) {
+ array[idx++] = c;
+ }
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+ array, idx) == 0);
+ }
+
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ rvd->vdev_children) == 0);
+
+ kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
+
+/*
+ * Returns the configuration from the label of the given vdev. For vdevs
+ * which don't have a txg value stored on their label (i.e. spares/cache)
+ * or have not been completely initialized (txg = 0) just return
+ * the configuration from the first valid label we find. Otherwise,
+ * find the most up-to-date label that does not exceed the specified
+ * 'txg' value.
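+ *
+ * For example, if the labels carry txgs 8, 10, 10 and 12 and the caller
+ * passes txg 10, the configuration from one of the txg 10 labels is
+ * returned.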
+ */
+nvlist_t *
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *config = NULL;
+ vdev_phys_t *vp[VDEV_LABELS];
+ abd_t *vp_abd[VDEV_LABELS];
+ zio_t *zio[VDEV_LABELS];
+ uint64_t best_txg = 0;
+ uint64_t label_txg = 0;
+ int error = 0;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE;
+
+ ASSERT(vd->vdev_validate_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ if (!vdev_readable(vd))
+ return (NULL);
+
+ /*
+ * The label for a dRAID distributed spare is not stored on disk.
+	 * Instead it is generated when needed, which allows us to bypass
+ * the pipeline when reading the config from the label.
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return (vdev_draid_read_config_spare(vd));
+
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ vp_abd[l] = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ vp[l] = abd_to_buf(vp_abd[l]);
+ }
+
+retry:
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ zio[l] = zio_root(spa, NULL, NULL, flags);
+
+ vdev_label_read(zio[l], vd, l, vp_abd[l],
+ offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
+ NULL, NULL, flags);
+ }
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ nvlist_t *label = NULL;
+
+ if (zio_wait(zio[l]) == 0 &&
+ nvlist_unpack(vp[l]->vp_nvlist, sizeof (vp[l]->vp_nvlist),
+ &label, 0) == 0) {
+ /*
+ * Auxiliary vdevs won't have txg values in their
+ * labels and newly added vdevs may not have been
+ * completely initialized so just return the
+ * configuration from the first valid label we
+ * encounter.
+ */
+ error = nvlist_lookup_uint64(label,
+ ZPOOL_CONFIG_POOL_TXG, &label_txg);
+ if ((error || label_txg == 0) && !config) {
+ config = label;
+ for (l++; l < VDEV_LABELS; l++)
+ zio_wait(zio[l]);
+ break;
+ } else if (label_txg <= txg && label_txg > best_txg) {
+ best_txg = label_txg;
+ nvlist_free(config);
+ config = fnvlist_dup(label);
+ }
+ }
+
+ if (label != NULL) {
+ nvlist_free(label);
+ label = NULL;
+ }
+ }
+
+ if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ /*
+ * We found a valid label but it didn't pass txg restrictions.
+ */
+ if (config == NULL && label_txg != 0) {
+ vdev_dbgmsg(vd, "label discarded as txg is too large "
+ "(%llu > %llu)", (u_longlong_t)label_txg,
+ (u_longlong_t)txg);
+ }
+
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ abd_free(vp_abd[l]);
+ }
+
+ return (config);
+}
+
+/*
+ * Determine if a device is in use. The 'spare_guid' parameter will be filled
+ * in with the device guid if this spare is active elsewhere on the system.
+ */
+static boolean_t
+vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
+ uint64_t *spare_guid, uint64_t *l2cache_guid)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t state, pool_guid, device_guid, txg, spare_pool;
+ uint64_t vdtxg = 0;
+ nvlist_t *label;
+
+ if (spare_guid)
+ *spare_guid = 0ULL;
+ if (l2cache_guid)
+ *l2cache_guid = 0ULL;
+
+ /*
+ * Read the label, if any, and perform some basic sanity checks.
+ */
+ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
+ return (B_FALSE);
+
+ (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ &vdtxg);
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+ &device_guid) != 0) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &pool_guid) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &txg) != 0)) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * Check to see if this device indeed belongs to the pool it claims to
+ * be a part of. The only way this is allowed is if the device is a hot
+ * spare (which we check for later on).
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ !spa_guid_exists(pool_guid, device_guid) &&
+ !spa_spare_exists(device_guid, NULL, NULL) &&
+ !spa_l2cache_exists(device_guid, NULL))
+ return (B_FALSE);
+
+ /*
+	 * If the transaction group is zero, then this is an initialized (but
+ * unused) label. This is only an error if the create transaction
+ * on-disk is the same as the one we're using now, in which case the
+ * user has attempted to add the same vdev multiple times in the same
+ * transaction.
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ txg == 0 && vdtxg == crtxg)
+ return (B_TRUE);
+
+ /*
+ * Check to see if this is a spare device. We do an explicit check for
+ * spa_has_spare() here because it may be on our pending list of spares
+ * to add. We also check if it is an l2cache device.
+ */
+ if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
+ spa_has_spare(spa, device_guid)) {
+ if (spare_guid)
+ *spare_guid = device_guid;
+
+ switch (reason) {
+ case VDEV_LABEL_CREATE:
+ case VDEV_LABEL_L2CACHE:
+ return (B_TRUE);
+
+ case VDEV_LABEL_REPLACE:
+ return (!spa_has_spare(spa, device_guid) ||
+ spare_pool != 0ULL);
+
+ case VDEV_LABEL_SPARE:
+ return (spa_has_spare(spa, device_guid));
+ default:
+ break;
+ }
+ }
+
+ /*
+ * Check to see if this is an l2cache device.
+ */
+ if (spa_l2cache_exists(device_guid, NULL))
+ return (B_TRUE);
+
+ /*
+ * We can't rely on a pool's state if it's been imported
+	 * read-only. Instead we look to see if the pool is marked
+ * read-only in the namespace and set the state to active.
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+ spa_mode(spa) == SPA_MODE_READ)
+ state = POOL_STATE_ACTIVE;
+
+ /*
+ * If the device is marked ACTIVE, then this device is in use by another
+ * pool on the system.
+ */
+ return (state == POOL_STATE_ACTIVE);
+}
+
+/*
+ * Initialize a vdev label. We check to make sure each leaf device is not in
+ * use and is writable. We put down an initial label which we will later
+ * overwrite with a complete label. Note that it's important to do this
+ * sequentially, not in parallel, so that we catch cases of multiple use of the
+ * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
+ * itself.
+ */
+int
+vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ abd_t *vp_abd;
+ abd_t *bootenv;
+ uberblock_t *ub;
+ abd_t *ub_abd;
+ zio_t *zio;
+ char *buf;
+ size_t buflen;
+ int error;
+ uint64_t spare_guid = 0, l2cache_guid = 0;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if ((error = vdev_label_init(vd->vdev_child[c],
+ crtxg, reason)) != 0)
+ return (error);
+
+ /* Track the creation time for this vdev */
+ vd->vdev_crtxg = crtxg;
+
+ if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
+ return (0);
+
+ /*
+ * Dead vdevs cannot be initialized.
+ */
+ if (vdev_is_dead(vd))
+ return (SET_ERROR(EIO));
+
+ /*
+ * Determine if the vdev is in use.
+ */
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
+ vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * If this is a request to add or replace a spare or l2cache device
+ * that is in use elsewhere on the system, then we must update the
+ * guid (which was initialized to a random value) to reflect the
+ * actual GUID (which is shared between multiple pools).
+ */
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
+ spare_guid != 0ULL) {
+ uint64_t guid_delta = spare_guid - vd->vdev_guid;
+
+ vd->vdev_guid += guid_delta;
+
+ for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum += guid_delta;
+
+ /*
+		 * If this is a replacement, then we want to fall through to
+		 * the rest of the code. If we're adding a spare, then it's
+		 * already labeled appropriately and we can just return.
+ */
+ if (reason == VDEV_LABEL_SPARE)
+ return (0);
+ ASSERT(reason == VDEV_LABEL_REPLACE ||
+ reason == VDEV_LABEL_SPLIT);
+ }
+
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
+ l2cache_guid != 0ULL) {
+ uint64_t guid_delta = l2cache_guid - vd->vdev_guid;
+
+ vd->vdev_guid += guid_delta;
+
+ for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum += guid_delta;
+
+ /*
+		 * If this is a replacement, then we want to fall through to
+		 * the rest of the code. If we're adding an l2cache, then it's
+		 * already labeled appropriately and we can just return.
+ */
+ if (reason == VDEV_LABEL_L2CACHE)
+ return (0);
+ ASSERT(reason == VDEV_LABEL_REPLACE);
+ }
+
+ /*
+ * Initialize its label.
+ */
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ abd_zero(vp_abd, sizeof (vdev_phys_t));
+ vp = abd_to_buf(vp_abd);
+
+ /*
+ * Generate a label describing the pool and our top-level vdev.
+ * We mark it as being from txg 0 to indicate that it's not
+ * really part of an active pool just yet. The labels will
+ * be written again with a meaningful txg by spa_sync().
+ */
+ if (reason == VDEV_LABEL_SPARE ||
+ (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
+ /*
+ * For inactive hot spares, we generate a special label that
+		 * identifies it as a mutually shared hot spare. We write the
+ * label if we are adding a hot spare, or if we are removing an
+ * active hot spare (in which case we want to revert the
+ * labels).
+ */
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ POOL_STATE_SPARE) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ } else if (reason == VDEV_LABEL_L2CACHE ||
+ (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) {
+ /*
+ * For level 2 ARC devices, add a special label.
+ */
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ POOL_STATE_L2CACHE) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ } else {
+ uint64_t txg = 0ULL;
+
+ if (reason == VDEV_LABEL_SPLIT)
+ txg = spa->spa_uberblock.ub_txg;
+ label = spa_config_generate(spa, vd, txg, B_FALSE);
+
+ /*
+		 * Add our creation time. This allows us to detect multiple
+		 * vdev uses as described above, and the label automatically
+		 * expires if the create fails.
+ */
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ crtxg) == 0);
+ }
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
+ if (error != 0) {
+ nvlist_free(label);
+ abd_free(vp_abd);
+ /* EFAULT means nvlist_pack ran out of room */
+ return (SET_ERROR(error == EFAULT ? ENAMETOOLONG : EINVAL));
+ }
+
+ /*
+ * Initialize uberblock template.
+ */
+ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
+ abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
+ ub = abd_to_buf(ub_abd);
+ ub->ub_txg = 0;
+
+ /* Initialize the 2nd padding area. */
+ bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(bootenv, VDEV_PAD_SIZE);
+
+ /*
+ * Write everything in parallel.
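+	 * If this fails, the whole set is retried once with ZIO_FLAG_TRYHARD.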
+ */
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (int l = 0; l < VDEV_LABELS; l++) {
+
+ vdev_label_write(zio, vd, l, vp_abd,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t), NULL, NULL, flags);
+
+ /*
+ * Skip the 1st padding area.
+		 * Zero out the 2nd padding area, which might contain
+		 * leftover data from a previous filesystem format.
+ */
+ vdev_label_write(zio, vd, l, bootenv,
+ offsetof(vdev_label_t, vl_be),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+
+ vdev_label_write(zio, vd, l, ub_abd,
+ offsetof(vdev_label_t, vl_uberblock),
+ VDEV_UBERBLOCK_RING, NULL, NULL, flags);
+ }
+
+ error = zio_wait(zio);
+
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ nvlist_free(label);
+ abd_free(bootenv);
+ abd_free(ub_abd);
+ abd_free(vp_abd);
+
+ /*
+ * If this vdev hasn't been previously identified as a spare, then we
+ * mark it as such only if a) we are labeling it as a spare, or b) it
+ * exists as a spare elsewhere in the system. Do the same for
+ * level 2 ARC devices.
+ */
+ if (error == 0 && !vd->vdev_isspare &&
+ (reason == VDEV_LABEL_SPARE ||
+ spa_spare_exists(vd->vdev_guid, NULL, NULL)))
+ spa_spare_add(vd);
+
+ if (error == 0 && !vd->vdev_isl2cache &&
+ (reason == VDEV_LABEL_L2CACHE ||
+ spa_l2cache_exists(vd->vdev_guid, NULL)))
+ spa_l2cache_add(vd);
+
+ return (error);
+}
+
+/*
+ * Done callback for vdev_label_read_bootenv_impl. If this is the first
+ * callback to finish, store our abd in the callback pointer. Otherwise, we
+ * just free our abd and return.
+ */
+static void
+vdev_label_read_bootenv_done(zio_t *zio)
+{
+ zio_t *rio = zio->io_private;
+ abd_t **cbp = rio->io_private;
+
+ ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE);
+
+ if (zio->io_error == 0) {
+ mutex_enter(&rio->io_lock);
+ if (*cbp == NULL) {
+ /* Will free this buffer in vdev_label_read_bootenv. */
+ *cbp = zio->io_abd;
+ } else {
+ abd_free(zio->io_abd);
+ }
+ mutex_exit(&rio->io_lock);
+ } else {
+ abd_free(zio->io_abd);
+ }
+}
+
+static void
+vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags);
+
+ /*
+ * We just use the first label that has a correct checksum; the
+ * bootloader should have rewritten them all to be the same on boot,
+ * and any changes we made since boot have been the same across all
+ * labels.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ vdev_label_read(zio, vd, l,
+ abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE),
+ offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE,
+ vdev_label_read_bootenv_done, zio, flags);
+ }
+ }
+}
+
+int
+vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv)
+{
+ nvlist_t *config;
+ spa_t *spa = rvd->vdev_spa;
+ abd_t *abd = NULL;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+
+ ASSERT(bootenv);
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ zio_t *zio = zio_root(spa, NULL, &abd, flags);
+ vdev_label_read_bootenv_impl(zio, rvd, flags);
+ int err = zio_wait(zio);
+
+ if (abd != NULL) {
+ char *buf;
+ vdev_boot_envblock_t *vbe = abd_to_buf(abd);
+
+ vbe->vbe_version = ntohll(vbe->vbe_version);
+ switch (vbe->vbe_version) {
+ case VB_RAW:
+ /*
+			 * If we have textual data in vbe_bootenv, create an
+			 * nvlist with the key "envmap".
+ */
+ fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW);
+ vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
+ fnvlist_add_string(bootenv, GRUB_ENVMAP,
+ vbe->vbe_bootenv);
+ break;
+
+ case VB_NVLIST:
+ err = nvlist_unpack(vbe->vbe_bootenv,
+ sizeof (vbe->vbe_bootenv), &config, 0);
+ if (err == 0) {
+ fnvlist_merge(bootenv, config);
+ nvlist_free(config);
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ /* Check for FreeBSD zfs bootonce command string */
+ buf = abd_to_buf(abd);
+ if (*buf == '\0') {
+ fnvlist_add_uint64(bootenv, BOOTENV_VERSION,
+ VB_NVLIST);
+ break;
+ }
+ fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf);
+ }
+
+ /*
+ * abd was allocated in vdev_label_read_bootenv_impl()
+ */
+ abd_free(abd);
+ /*
+ * If we managed to read any successfully,
+ * return success.
+ */
+ return (0);
+ }
+ return (err);
+}
+
+int
+vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env)
+{
+ zio_t *zio;
+ spa_t *spa = vd->vdev_spa;
+ vdev_boot_envblock_t *bootenv;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ int error;
+ size_t nvsize;
+ char *nvbuf;
+
+ error = nvlist_size(env, &nvsize, NV_ENCODE_XDR);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ if (nvsize >= sizeof (bootenv->vbe_bootenv)) {
+ return (SET_ERROR(E2BIG));
+ }
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ error = ENXIO;
+ for (int c = 0; c < vd->vdev_children; c++) {
+ int child_err;
+
+ child_err = vdev_label_write_bootenv(vd->vdev_child[c], env);
+ /*
+ * As long as any of the disks managed to write all of their
+ * labels successfully, return success.
+ */
+ if (child_err == 0)
+ error = child_err;
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) ||
+ !vdev_writeable(vd)) {
+ return (error);
+ }
+ ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE);
+ abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(abd, VDEV_PAD_SIZE);
+
+ bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE);
+ nvbuf = bootenv->vbe_bootenv;
+ nvsize = sizeof (bootenv->vbe_bootenv);
+
+ bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION);
+ switch (bootenv->vbe_version) {
+ case VB_RAW:
+ if (nvlist_lookup_string(env, GRUB_ENVMAP, &nvbuf) == 0) {
+ (void) strlcpy(bootenv->vbe_bootenv, nvbuf, nvsize);
+ }
+ error = 0;
+ break;
+
+ case VB_NVLIST:
+ error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ if (error == 0) {
+ bootenv->vbe_version = htonll(bootenv->vbe_version);
+ abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE);
+ } else {
+ abd_free(abd);
+ return (SET_ERROR(error));
+ }
+
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ vdev_label_write(zio, vd, l, abd,
+ offsetof(vdev_label_t, vl_be),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+ }
+
+ error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ abd_free(abd);
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * uberblock load/sync
+ * ==========================================================================
+ */
+
+/*
+ * Consider the following situation: txg is safely synced to disk. We've
+ * written the first uberblock for txg + 1, and then we lose power. When we
+ * come back up, we fail to see the uberblock for txg + 1 because, say,
+ * it was on a mirrored device and the replica to which we wrote txg + 1
+ * is now offline. If we then make some changes and sync txg + 1, and then
+ * the missing replica comes back, then for a few seconds we'll have two
+ * conflicting uberblocks on disk with the same txg. The solution is simple:
+ * among uberblocks with equal txg, choose the one with the latest timestamp.
+ */
+static int
+vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
+{
+ int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg);
+
+ if (likely(cmp))
+ return (cmp);
+
+ cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
+ if (likely(cmp))
+ return (cmp);
+
+ /*
+ * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware
+ * ZFS, e.g. OpenZFS >= 0.7.
+ *
+ * If one ub has MMP and the other does not, they were written by
+ * different hosts, which matters for MMP. So we treat no MMP/no SEQ as
+ * a 0 value.
+ *
+ * Since timestamp and txg are the same if we get this far, either is
+ * acceptable for importing the pool.
+ */
+ unsigned int seq1 = 0;
+ unsigned int seq2 = 0;
+
+ if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
+ seq1 = MMP_SEQ(ub1);
+
+ if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
+ seq2 = MMP_SEQ(ub2);
+
+ return (TREE_CMP(seq1, seq2));
+}
+
+struct ubl_cbdata {
+ uberblock_t *ubl_ubbest; /* Best uberblock */
+ vdev_t *ubl_vd; /* vdev associated with the above */
+};
+
+static void
+vdev_uberblock_load_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ spa_t *spa = zio->io_spa;
+ zio_t *rio = zio->io_private;
+ uberblock_t *ub = abd_to_buf(zio->io_abd);
+ struct ubl_cbdata *cbp = rio->io_private;
+
+ ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
+
+ if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
+ mutex_enter(&rio->io_lock);
+ if (ub->ub_txg <= spa->spa_load_max_txg &&
+ vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
+ /*
+ * Keep track of the vdev in which this uberblock
+ * was found. We will use this information later
+ * to obtain the config nvlist associated with
+ * this uberblock.
+ */
+ *cbp->ubl_ubbest = *ub;
+ cbp->ubl_vd = vd;
+ }
+ mutex_exit(&rio->io_lock);
+ }
+
+ abd_free(zio->io_abd);
+}
+
+static void
+vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
+ struct ubl_cbdata *cbp)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
+
+ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) &&
+ vd->vdev_ops != &vdev_draid_spare_ops) {
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ vdev_label_read(zio, vd, l,
+ abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
+ B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd),
+ vdev_uberblock_load_done, zio, flags);
+ }
+ }
+ }
+}
+
+/*
+ * Reads the 'best' uberblock from disk along with its associated
+ * configuration. First, we read the uberblock array of each label of each
+ * vdev, keeping track of the uberblock with the highest txg in each array.
+ * Then, we read the configuration from the same vdev as the best uberblock.
+ */
+void
+vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
+{
+ zio_t *zio;
+ spa_t *spa = rvd->vdev_spa;
+ struct ubl_cbdata cb;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+
+ ASSERT(ub);
+ ASSERT(config);
+
+ bzero(ub, sizeof (uberblock_t));
+ *config = NULL;
+
+ cb.ubl_ubbest = ub;
+ cb.ubl_vd = NULL;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ zio = zio_root(spa, NULL, &cb, flags);
+ vdev_uberblock_load_impl(zio, rvd, flags, &cb);
+ (void) zio_wait(zio);
+
+ /*
+ * It's possible that the best uberblock was discovered on a label
+ * that has a configuration which was written in a future txg.
+ * Search all labels on this vdev to find the configuration that
+ * matches the txg for our uberblock.
+ */
+ if (cb.ubl_vd != NULL) {
+ vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. "
+ "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg);
+
+ *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
+ if (*config == NULL && spa->spa_extreme_rewind) {
+ vdev_dbgmsg(cb.ubl_vd, "failed to read label config. "
+ "Trying again without txg restrictions.");
+ *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX);
+ }
+ if (*config == NULL) {
+ vdev_dbgmsg(cb.ubl_vd, "failed to read label config");
+ }
+ }
+ spa_config_exit(spa, SCL_ALL, FTAG);
+}
+
+/*
+ * For use when a leaf vdev is expanded.
+ * The location of labels 2 and 3 changed, and at the new location the
+ * uberblock rings are either empty or contain garbage. The sync will write
+ * new configs there because the vdev is dirty, but expansion also needs the
+ * uberblock rings copied. Read them from label 0 which did not move.
+ *
+ * Since the point is to populate labels {2,3} with valid uberblocks,
+ * we zero uberblocks we fail to read or which are not valid.
+ */
+
+static void
+vdev_copy_uberblocks(vdev_t *vd)
+{
+ abd_t *ub_abd;
+ zio_t *write_zio;
+ int locks = (SCL_L2ARC | SCL_ZIO);
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE;
+
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_READER) ==
+ SCL_STATE);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ /*
+ * No uberblocks are stored on distributed spares, they may be
+ * safely skipped when expanding a leaf vdev.
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
+ spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER);
+
+ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+
+ write_zio = zio_root(vd->vdev_spa, NULL, NULL, flags);
+ for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ const int src_label = 0;
+ zio_t *zio;
+
+ zio = zio_root(vd->vdev_spa, NULL, NULL, flags);
+ vdev_label_read(zio, vd, src_label, ub_abd,
+ VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
+ NULL, NULL, flags);
+
+ if (zio_wait(zio) || uberblock_verify(abd_to_buf(ub_abd)))
+ abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
+
+ for (int l = 2; l < VDEV_LABELS; l++)
+ vdev_label_write(write_zio, vd, l, ub_abd,
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd), NULL, NULL,
+ flags | ZIO_FLAG_DONT_PROPAGATE);
+ }
+ (void) zio_wait(write_zio);
+
+ spa_config_exit(vd->vdev_spa, locks, FTAG);
+
+ abd_free(ub_abd);
+}
+
+/*
+ * On success, increment root zio's count of good writes.
+ * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
+ */
+static void
+vdev_uberblock_sync_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_private;
+
+ if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
+ atomic_inc_64(good_writes);
+}
+
+/*
+ * Write the uberblock to all labels of all leaves of the specified vdev.
+ */
+static void
+vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
+ uberblock_t *ub, vdev_t *vd, int flags)
+{
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_uberblock_sync(zio, good_writes,
+ ub, vd->vdev_child[c], flags);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (!vdev_writeable(vd))
+ return;
+
+ /*
+ * There's no need to write uberblocks to a distributed spare, they
+ * are already stored on all the leaves of the parent dRAID. For
+ * this same reason vdev_uberblock_load_impl() skips distributed
+ * spares when reading uberblocks.
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
+ /* If the vdev was expanded, need to copy uberblock rings. */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY &&
+ vd->vdev_copy_uberblocks == B_TRUE) {
+ vdev_copy_uberblocks(vd);
+ vd->vdev_copy_uberblocks = B_FALSE;
+ }
+
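+	/*
+	 * Pick the uberblock ring slot for this txg. When multihost is
+	 * enabled the last MMP_BLOCKS_PER_LABEL slots are reserved for
+	 * MMP writes, so regular uberblocks rotate through the rest.
+	 */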
+ int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0;
+ int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m);
+
+ /* Copy the uberblock_t into the ABD */
+ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
+ abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
+
+ for (int l = 0; l < VDEV_LABELS; l++)
+ vdev_label_write(zio, vd, l, ub_abd,
+ VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
+ vdev_uberblock_sync_done, good_writes,
+ flags | ZIO_FLAG_DONT_PROPAGATE);
+
+ abd_free(ub_abd);
+}
+
+/* Sync the uberblocks to all vdevs in svd[] */
+static int
+vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
+{
+ spa_t *spa = svd[0]->vdev_spa;
+ zio_t *zio;
+ uint64_t good_writes = 0;
+
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (int v = 0; v < svdcount; v++)
+ vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);
+
+ (void) zio_wait(zio);
+
+ /*
+ * Flush the uberblocks to disk. This ensures that the odd labels
+ * are no longer needed (because the new uberblocks and the even
+ * labels are safely on disk), so it is safe to overwrite them.
+ */
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (int v = 0; v < svdcount; v++) {
+ if (vdev_writeable(svd[v])) {
+ zio_flush(zio, svd[v]);
+ }
+ }
+
+ (void) zio_wait(zio);
+
+ return (good_writes >= 1 ? 0 : EIO);
+}
+
+/*
+ * On success, increment the count of good writes for our top-level vdev.
+ */
+static void
+vdev_label_sync_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_private;
+
+ if (zio->io_error == 0)
+ atomic_inc_64(good_writes);
+}
+
+/*
+ * If there weren't enough good writes, indicate failure to the parent.
+ */
+static void
+vdev_label_sync_top_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_private;
+
+ if (*good_writes == 0)
+ zio->io_error = SET_ERROR(EIO);
+
+ kmem_free(good_writes, sizeof (uint64_t));
+}
+
+/*
+ * We ignore errors for log and cache devices, simply free the private data.
+ */
+static void
+vdev_label_sync_ignore_done(zio_t *zio)
+{
+ kmem_free(zio->io_private, sizeof (uint64_t));
+}
+
+/*
+ * Write all even or odd labels to all leaves of the specified vdev.
+ */
+static void
+vdev_label_sync(zio_t *zio, uint64_t *good_writes,
+ vdev_t *vd, int l, uint64_t txg, int flags)
+{
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ abd_t *vp_abd;
+ char *buf;
+ size_t buflen;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_label_sync(zio, good_writes,
+ vd->vdev_child[c], l, txg, flags);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (!vdev_writeable(vd))
+ return;
+
+ /*
+ * The top-level config never needs to be written to a distributed
+	 * spare. When read, vdev_dspare_label_read_config() will generate
+	 * the config on behalf of vdev_label_read_config().
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
+ /*
+ * Generate a label describing the top-level config to which we belong.
+ */
+ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
+
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ abd_zero(vp_abd, sizeof (vdev_phys_t));
+ vp = abd_to_buf(vp_abd);
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
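+	/*
+	 * l is 0 or 1 on entry; stepping by two below writes either the
+	 * even labels (L0, L2) or the odd labels (L1, L3).
+	 */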
+ if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) {
+ for (; l < VDEV_LABELS; l += 2) {
+ vdev_label_write(zio, vd, l, vp_abd,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t),
+ vdev_label_sync_done, good_writes,
+ flags | ZIO_FLAG_DONT_PROPAGATE);
+ }
+ }
+
+ abd_free(vp_abd);
+ nvlist_free(label);
+}
+
+static int
+vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
+{
+ list_t *dl = &spa->spa_config_dirty_list;
+ vdev_t *vd;
+ zio_t *zio;
+ int error;
+
+ /*
+ * Write the new labels to disk.
+ */
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
+ uint64_t *good_writes;
+
+ ASSERT(!vd->vdev_ishole);
+
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ zio_t *vio = zio_null(zio, spa, NULL,
+ (vd->vdev_islog || vd->vdev_aux != NULL) ?
+ vdev_label_sync_ignore_done : vdev_label_sync_top_done,
+ good_writes, flags);
+ vdev_label_sync(vio, good_writes, vd, l, txg, flags);
+ zio_nowait(vio);
+ }
+
+ error = zio_wait(zio);
+
+ /*
+ * Flush the new labels to disk.
+ */
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd))
+ zio_flush(zio, vd);
+
+ (void) zio_wait(zio);
+
+ return (error);
+}
+
+/*
+ * Sync the uberblock and any changes to the vdev configuration.
+ *
+ * The order of operations is carefully crafted to ensure that
+ * if the system panics or loses power at any time, the state on disk
+ * is still transactionally consistent. The in-line comments below
+ * describe the failure semantics at each stage.
+ *
+ * Moreover, vdev_config_sync() is designed to be idempotent: if it fails
+ * at any time, you can just call it again, and it will resume its work.
+ */
+int
+vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
+{
+ spa_t *spa = svd[0]->vdev_spa;
+ uberblock_t *ub = &spa->spa_uberblock;
+ int error = 0;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+
+ ASSERT(svdcount != 0);
+retry:
+ /*
+ * Normally, we don't want to try too hard to write every label and
+ * uberblock. If there is a flaky disk, we don't want the rest of the
+ * sync process to block while we retry. But if we can't write a
+ * single label out, we should retry with ZIO_FLAG_TRYHARD before
+ * bailing out and declaring the pool faulted.
+ */
+ if (error != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0)
+ return (error);
+ flags |= ZIO_FLAG_TRYHARD;
+ }
+
+ ASSERT(ub->ub_txg <= txg);
+
+ /*
+ * If this isn't a resync due to I/O errors,
+ * and nothing changed in this transaction group,
+ * and the vdev configuration hasn't changed,
+ * then there's nothing to do.
+ */
+ if (ub->ub_txg < txg) {
+ boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
+ txg, spa->spa_mmp.mmp_delay);
+
+ if (!changed && list_is_empty(&spa->spa_config_dirty_list))
+ return (0);
+ }
+
+ if (txg > spa_freeze_txg(spa))
+ return (0);
+
+ ASSERT(txg <= spa->spa_final_txg);
+
+ /*
+ * Flush the write cache of every disk that's been written to
+ * in this transaction group. This ensures that all blocks
+ * written in this txg will be committed to stable storage
+ * before any uberblock that references them.
+ */
+ zio_t *zio = zio_root(spa, NULL, NULL, flags);
+
+ for (vdev_t *vd =
+ txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
+ vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
+ zio_flush(zio, vd);
+
+ (void) zio_wait(zio);
+
+ /*
+ * Sync out the even labels (L0, L2) for every dirty vdev. If the
+ * system dies in the middle of this process, that's OK: all of the
+ * even labels that made it to disk will be newer than any uberblock,
+ * and will therefore be considered invalid. The odd labels (L1, L3),
+ * which have not yet been touched, will still be valid. We flush
+ * the new labels to disk to ensure that all even-label updates
+ * are committed to stable storage before the uberblock update.
+ */
+ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+ "for pool '%s' when syncing out the even labels "
+ "of dirty vdevs", error, spa_name(spa));
+ }
+ goto retry;
+ }
+
+ /*
+ * Sync the uberblocks to all vdevs in svd[].
+ * If the system dies in the middle of this step, there are two cases
+ * to consider, and the on-disk state is consistent either way:
+ *
+ * (1) If none of the new uberblocks made it to disk, then the
+ * previous uberblock will be the newest, and the odd labels
+ * (which had not yet been touched) will be valid with respect
+ * to that uberblock.
+ *
+ * (2) If one or more new uberblocks made it to disk, then they
+ * will be the newest, and the even labels (which had all
+ * been successfully committed) will be valid with respect
+ * to the new uberblocks.
+ */
+ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_uberblock_sync_list() returned error "
+ "%d for pool '%s'", error, spa_name(spa));
+ }
+ goto retry;
+ }
+
+ if (spa_multihost(spa))
+ mmp_update_uberblock(spa, ub);
+
+ /*
+ * Sync out odd labels for every dirty vdev. If the system dies
+ * in the middle of this process, the even labels and the new
+ * uberblocks will suffice to open the pool. The next time
+ * the pool is opened, the first thing we'll do -- before any
+ * user data is modified -- is mark every vdev dirty so that
+ * all labels will be brought up to date. We flush the new labels
+ * to disk to ensure that all odd-label updates are committed to
+ * stable storage before the next transaction group begins.
+ */
+ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+ "for pool '%s' when syncing out the odd labels of "
+ "dirty vdevs", error, spa_name(spa));
+ }
+ goto retry;
+ }
+
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
new file mode 100644
index 000000000000..71ca43caec1a
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
@@ -0,0 +1,972 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
+#include <sys/zio.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Vdev mirror kstats
+ */
+static kstat_t *mirror_ksp = NULL;
+
+typedef struct mirror_stats {
+ kstat_named_t vdev_mirror_stat_rotating_linear;
+ kstat_named_t vdev_mirror_stat_rotating_offset;
+ kstat_named_t vdev_mirror_stat_rotating_seek;
+ kstat_named_t vdev_mirror_stat_non_rotating_linear;
+ kstat_named_t vdev_mirror_stat_non_rotating_seek;
+
+ kstat_named_t vdev_mirror_stat_preferred_found;
+ kstat_named_t vdev_mirror_stat_preferred_not_found;
+} mirror_stats_t;
+
+static mirror_stats_t mirror_stats = {
+ /* New I/O follows directly the last I/O */
+ { "rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
+ { "rotating_offset", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek */
+ { "rotating_seek", KSTAT_DATA_UINT64 },
+ /* New I/O follows directly the last I/O (nonrot) */
+ { "non_rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek (nonrot) */
+ { "non_rotating_seek", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev found */
+ { "preferred_found", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev not found or equal load */
+ { "preferred_not_found", KSTAT_DATA_UINT64 },
+};
+
+#define MIRROR_STAT(stat) (mirror_stats.stat.value.ui64)
+#define MIRROR_INCR(stat, val) atomic_add_64(&MIRROR_STAT(stat), val)
+#define MIRROR_BUMP(stat) MIRROR_INCR(stat, 1)
+
+void
+vdev_mirror_stat_init(void)
+{
+ mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
+ "misc", KSTAT_TYPE_NAMED,
+ sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (mirror_ksp != NULL) {
+ mirror_ksp->ks_data = &mirror_stats;
+ kstat_install(mirror_ksp);
+ }
+}
+
+void
+vdev_mirror_stat_fini(void)
+{
+ if (mirror_ksp != NULL) {
+ kstat_delete(mirror_ksp);
+ mirror_ksp = NULL;
+ }
+}
+
+/*
+ * Virtual device vector for mirroring.
+ */
+typedef struct mirror_child {
+ vdev_t *mc_vd;
+ uint64_t mc_offset;
+ int mc_error;
+ int mc_load;
+ uint8_t mc_tried;
+ uint8_t mc_skipped;
+ uint8_t mc_speculative;
+ uint8_t mc_rebuilding;
+} mirror_child_t;
+
+typedef struct mirror_map {
+ int *mm_preferred;
+ int mm_preferred_cnt;
+ int mm_children;
+ boolean_t mm_resilvering;
+ boolean_t mm_rebuilding;
+ boolean_t mm_root;
+ mirror_child_t mm_child[];
+} mirror_map_t;
+
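+/*
+ * vdev_mirror_preferred_child_randomize() uses 1 << vdev_mirror_shift (2MB)
+ * slices of the I/O offset to spread reads across equally loaded children.
+ */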
+static int vdev_mirror_shift = 21;
+
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
+ * as it will direct more reads to the non-rotating vdevs, which typically
+ * offer higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int zfs_vdev_mirror_rotating_inc = 0;
+static int zfs_vdev_mirror_rotating_seek_inc = 5;
+static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
+
+/* Non-rotating media load calculation configuration. */
+static int zfs_vdev_mirror_non_rotating_inc = 0;
+static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+ return (offsetof(mirror_map_t, mm_child[children]) +
+ sizeof (int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
+{
+ mirror_map_t *mm;
+
+ mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+ mm->mm_children = children;
+ mm->mm_resilvering = resilvering;
+ mm->mm_root = root;
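+	/*
+	 * The mm_preferred index array lives in the same allocation,
+	 * immediately after the flexible mm_child[] array (see
+	 * vdev_mirror_map_size()).
+	 */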
+ mm->mm_preferred = (int *)((uintptr_t)mm +
+ offsetof(mirror_map_t, mm_child[children]));
+
+ return (mm);
+}
+
+static void
+vdev_mirror_map_free(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+
+ kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
+}
+
+static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
+ .vsd_free = vdev_mirror_map_free,
+ .vsd_cksum_report = zio_vsd_default_cksum_report
+};
+
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+ uint64_t last_offset;
+ int64_t offset_diff;
+ int load;
+
+ /* All DVAs have equal weight at the root. */
+ if (mm->mm_root)
+ return (INT_MAX);
+
+ /*
+	 * We don't return INT_MAX if the device is resilvering (i.e.
+	 * vdev_resilver_txg != 0); in testing, overall performance was
+	 * slightly worse when we did so than when we did not.
+ */
+
+ /* Fix zio_offset for leaf vdevs */
+ if (vd->vdev_ops->vdev_op_leaf)
+ zio_offset += VDEV_LABEL_START_SIZE;
+
+ /* Standard load based on pending queue length. */
+ load = vdev_queue_length(vd);
+ last_offset = vdev_queue_last_offset(vd);
+
+ if (vd->vdev_nonrot) {
+ /* Non-rotating media. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
+ return (load + zfs_vdev_mirror_non_rotating_inc);
+ }
+
+ /*
+ * Apply a seek penalty even for non-rotating devices as
+ * sequential I/O's can be aggregated into fewer operations on
+ * the device, thus avoiding unnecessary per-command overhead
+ * and boosting performance.
+ */
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
+ return (load + zfs_vdev_mirror_non_rotating_seek_inc);
+ }
+
+ /* Rotating media I/O's which directly follow the last I/O. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
+ return (load + zfs_vdev_mirror_rotating_inc);
+ }
+
+ /*
+ * Apply half the seek increment to I/O's within seek offset
+ * of the last I/O issued to this vdev as they should incur less
+ * of a seek increment.
+ */
+ offset_diff = (int64_t)(last_offset - zio_offset);
+ if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
+ return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
+ }
+
+ /* Apply the full seek increment to all other I/O's. */
+ MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
+ return (load + zfs_vdev_mirror_rotating_seek_inc);
+}
+
+static boolean_t
+vdev_mirror_rebuilding(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
+ return (B_TRUE);
+
+ for (int i = 0; i < vd->vdev_children; i++) {
+ if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Avoid inlining the function to keep vdev_mirror_io_start(), which
+ * is this function's only caller, as small as possible on the stack.
+ */
+noinline static mirror_map_t *
+vdev_mirror_map_init(zio_t *zio)
+{
+ mirror_map_t *mm = NULL;
+ mirror_child_t *mc;
+ vdev_t *vd = zio->io_vd;
+ int c;
+
+ if (vd == NULL) {
+ dva_t *dva = zio->io_bp->blk_dva;
+ spa_t *spa = zio->io_spa;
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ dva_t dva_copy[SPA_DVAS_PER_BP];
+
+ /*
+ * The sequential scrub code sorts and issues all DVAs
+ * of a bp separately. Each of these IOs includes all
+ * original DVA copies so that repairs can be performed
+ * in the event of an error, but we only actually want
+ * to check the first DVA since the others will be
+ * checked by their respective sorted IOs. Only if we
+ * hit an error will we try all DVAs upon retrying.
+ *
+ * Note: This check is safe even if the user switches
+ * from a legacy scrub to a sequential one in the middle
+ * of processing, since scn_is_sorted isn't updated until
+ * all outstanding IOs from the previous scrub pass
+ * complete.
+ */
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
+ dsl_scan_scrubbing(spa->spa_dsl_pool) &&
+ scn->scn_is_sorted) {
+ c = 1;
+ } else {
+ c = BP_GET_NDVAS(zio->io_bp);
+ }
+
+ /*
+ * If the pool cannot be written to, then infer that some
+ * DVAs might be invalid or point to vdevs that do not exist.
+ * We skip them.
+ */
+ if (!spa_writeable(spa)) {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ int j = 0;
+ for (int i = 0; i < c; i++) {
+ if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
+ dva_copy[j++] = dva[i];
+ }
+ if (j == 0) {
+ zio->io_vsd = NULL;
+ zio->io_error = ENXIO;
+ return (NULL);
+ }
+ if (j < c) {
+ dva = dva_copy;
+ c = j;
+ }
+ }
+
+ mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+
+ mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
+ mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
+ if (mc->mc_vd == NULL) {
+ kmem_free(mm, vdev_mirror_map_size(
+ mm->mm_children));
+ zio->io_vsd = NULL;
+ zio->io_error = ENXIO;
+ return (NULL);
+ }
+ }
+ } else {
+ /*
+ * If we are resilvering, then we should handle scrub reads
+ * differently; we shouldn't issue them to the resilvering
+ * device because it might not have those blocks.
+ *
+ * We are resilvering iff:
+		 * 1) We are a replacing vdev (i.e. our name is "replacing-1" or
+ * "spare-1" or something like that), and
+ * 2) The pool is currently being resilvered.
+ *
+ * We cannot simply check vd->vdev_resilver_txg, because it's
+ * not set in this path.
+ *
+ * Nor can we just check our vdev_ops; there are cases (such as
+ * when a user types "zpool replace pool odev spare_dev" and
+ * spare_dev is in the spare list, or when a spare device is
+ * automatically used to replace a DEGRADED device) when
+ * resilvering is complete but both the original vdev and the
+ * spare vdev remain in the pool. That behavior is intentional.
+ * It helps implement the policy that a spare should be
+ * automatically removed from the pool after the user replaces
+ * the device that originally failed.
+ *
+ * If a spa load is in progress, then spa_dsl_pool may be
+ * uninitialized. But we shouldn't be resilvering during a spa
+ * load anyway.
+ */
+ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) &&
+ spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
+ dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
+ mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
+ B_FALSE);
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ mc->mc_vd = vd->vdev_child[c];
+ mc->mc_offset = zio->io_offset;
+
+ if (vdev_mirror_rebuilding(mc->mc_vd))
+ mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
+ }
+ }
+
+ zio->io_vsd = mm;
+ zio->io_vsd_ops = &vdev_mirror_vsd_ops;
+ return (mm);
+}
+
+static int
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ int numerrors = 0;
+ int lasterror = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_open_children(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error) {
+ lasterror = cvd->vdev_open_error;
+ numerrors++;
+ continue;
+ }
+
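+		/*
+		 * The -1/+1 arithmetic lets an initial value of 0 act as
+		 * "unset": 0 - 1 wraps to UINT64_MAX, so the first opened
+		 * child simply establishes the size.
+		 */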
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+ *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+ *physical_ashift = MAX(*physical_ashift,
+ cvd->vdev_physical_ashift);
+ }
+
+ if (numerrors == vd->vdev_children) {
+ if (vdev_children_are_offline(vd))
+ vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
+ else
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+static void
+vdev_mirror_close(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+static void
+vdev_mirror_child_done(zio_t *zio)
+{
+ mirror_child_t *mc = zio->io_private;
+
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
+}
+
+static void
+vdev_mirror_scrub_done(zio_t *zio)
+{
+ mirror_child_t *mc = zio->io_private;
+
+ if (zio->io_error == 0) {
+ zio_t *pio;
+ zio_link_t *zl = NULL;
+
+ mutex_enter(&zio->io_lock);
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
+ mutex_enter(&pio->io_lock);
+ ASSERT3U(zio->io_size, >=, pio->io_size);
+ abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
+ mutex_exit(&pio->io_lock);
+ }
+ mutex_exit(&zio->io_lock);
+ }
+
+ abd_free(zio->io_abd);
+
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
+}
+
+/*
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked. If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+ dva_t *dva = zio->io_bp->blk_dva;
+ mirror_map_t *mm = zio->io_vsd;
+ int preferred;
+ int c;
+
+ preferred = mm->mm_preferred[p];
+ for (p--; p >= 0; p--) {
+ c = mm->mm_preferred[p];
+ if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+ preferred = c;
+ }
+ return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ int p;
+
+ if (mm->mm_root) {
+ p = spa_get_random(mm->mm_preferred_cnt);
+ return (vdev_mirror_dva_select(zio, p));
+ }
+
+ /*
+ * To ensure we don't always favour the first matching vdev,
+	 * which could lead to wear leveling issues on SSDs, we
+	 * use the I/O offset as a pseudo-random seed into the vdevs
+ * which have the lowest load.
+ */
+ p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+ return (mm->mm_preferred[p]);
+}
+
+static boolean_t
+vdev_mirror_child_readable(mirror_child_t *mc)
+{
+ vdev_t *vd = mc->mc_vd;
+
+ if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
+ return (vdev_draid_readable(vd, mc->mc_offset));
+ else
+ return (vdev_readable(vd));
+}
+
+static boolean_t
+vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
+{
+ vdev_t *vd = mc->mc_vd;
+
+ if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
+ return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
+ else
+ return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read,
+ * preferring vdevs based on the determined load. If we can't, try the read on
+ * any vdev we haven't already tried.
+ *
+ * Distributed spares are an exception to the above load rule. They are
+ * always preferred in order to detect gaps in the distributed spare which
+ * are created when another disk in the dRAID fails. In order to restore
+ * redundancy those gaps must be read to trigger the required repair IO.
+ */
+static int
+vdev_mirror_child_select(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ uint64_t txg = zio->io_txg;
+ int c, lowest_load;
+
+ ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
+
+ lowest_load = INT_MAX;
+ mm->mm_preferred_cnt = 0;
+ for (c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc;
+
+ mc = &mm->mm_child[c];
+ if (mc->mc_tried || mc->mc_skipped)
+ continue;
+
+ if (mc->mc_vd == NULL ||
+ !vdev_mirror_child_readable(mc)) {
+ mc->mc_error = SET_ERROR(ENXIO);
+ mc->mc_tried = 1; /* don't even try */
+ mc->mc_skipped = 1;
+ continue;
+ }
+
+ if (vdev_mirror_child_missing(mc, txg, 1)) {
+ mc->mc_error = SET_ERROR(ESTALE);
+ mc->mc_skipped = 1;
+ mc->mc_speculative = 1;
+ continue;
+ }
+
+ if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
+ mm->mm_preferred[0] = c;
+ mm->mm_preferred_cnt = 1;
+ break;
+ }
+
+ mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+ if (mc->mc_load > lowest_load)
+ continue;
+
+ if (mc->mc_load < lowest_load) {
+ lowest_load = mc->mc_load;
+ mm->mm_preferred_cnt = 0;
+ }
+ mm->mm_preferred[mm->mm_preferred_cnt] = c;
+ mm->mm_preferred_cnt++;
+ }
+
+ if (mm->mm_preferred_cnt == 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_found);
+ return (mm->mm_preferred[0]);
+ }
+
+ if (mm->mm_preferred_cnt > 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
+ return (vdev_mirror_preferred_child_randomize(zio));
+ }
+
+ /*
+ * Every device is either missing or has this txg in its DTL.
+ * Look for any child we haven't already tried before giving up.
+ */
+ for (c = 0; c < mm->mm_children; c++) {
+ if (!mm->mm_child[c].mc_tried)
+ return (c);
+ }
+
+ /*
+ * Every child failed. There's no place left to look.
+ */
+ return (-1);
+}
+
+static void
+vdev_mirror_io_start(zio_t *zio)
+{
+ mirror_map_t *mm;
+ mirror_child_t *mc;
+ int c, children;
+
+ mm = vdev_mirror_map_init(zio);
+
+ if (mm == NULL) {
+ ASSERT(!spa_trust_config(zio->io_spa));
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ zio_execute(zio);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (zio->io_bp != NULL &&
+ (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
+ /*
+ * For scrubbing reads (if we can verify the
+ * checksum here, as indicated by io_bp being
+ * non-NULL) we need to allocate a read buffer for
+ * each child and issue reads to all children. If
+ * any child succeeds, it will copy its data into
+ * zio->io_data in vdev_mirror_scrub_done.
+ */
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset,
+ abd_alloc_sametype(zio->io_abd,
+ zio->io_size), zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_mirror_scrub_done, mc));
+ }
+ zio_execute(zio);
+ return;
+ }
+ /*
+ * For normal reads just pick one child.
+ */
+ c = vdev_mirror_child_select(zio);
+ children = (c >= 0);
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ /*
+ * Writes go to all children.
+ */
+ c = 0;
+ children = mm->mm_children;
+ }
+
+ while (children--) {
+ mc = &mm->mm_child[c];
+ c++;
+
+ /*
+ * When sequentially resilvering only issue write repair
+ * IOs to the vdev which is being rebuilt since performance
+ * is limited by the slowest child. This is an issue for
+ * faster replacement devices such as distributed spares.
+ */
+ if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
+ (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+ !(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ mm->mm_rebuilding && !mc->mc_rebuilding) {
+ continue;
+ }
+
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_mirror_child_done, mc));
+ }
+
+ zio_execute(zio);
+}
+
+static int
+vdev_mirror_worst_error(mirror_map_t *mm)
+{
+ int error[2] = { 0, 0 };
+
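+	/*
+	 * error[0] collects errors from non-speculative children and
+	 * error[1] from speculative ones; a real error takes precedence.
+	 */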
+ for (int c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc = &mm->mm_child[c];
+ int s = mc->mc_speculative;
+ error[s] = zio_worst_error(error[s], mc->mc_error);
+ }
+
+ return (error[0] ? error[0] : error[1]);
+}
+
+static void
+vdev_mirror_io_done(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ mirror_child_t *mc;
+ int c;
+ int good_copies = 0;
+ int unexpected_errors = 0;
+
+ if (mm == NULL)
+ return;
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_error) {
+ if (!mc->mc_skipped)
+ unexpected_errors++;
+ } else if (mc->mc_tried) {
+ good_copies++;
+ }
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * XXX -- for now, treat partial writes as success.
+ *
+ * Now that we support write reallocation, it would be better
+ * to treat partial failure as real failure unless there are
+ * no non-degraded top-level vdevs left, and not update DTLs
+ * if we intend to reallocate.
+ */
+ /* XXPOLICY */
+ if (good_copies != mm->mm_children) {
+ /*
+ * Always require at least one good copy.
+ *
+ * For ditto blocks (io_vd == NULL), require
+ * all copies to be good.
+ *
+ * XXX -- for replacing vdevs, there's no great answer.
+ * If the old device is really dead, we may not even
+ * be able to access it -- so we only want to
+ * require good writes to the new device. But if
+ * the new device turns out to be flaky, we want
+ * to be able to detach it -- which requires all
+ * writes to the old device to have succeeded.
+ */
+ if (good_copies == 0 || zio->io_vd == NULL)
+ zio->io_error = vdev_mirror_worst_error(mm);
+ }
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /*
+ * If we don't have a good copy yet, keep trying other children.
+ */
+ /* XXPOLICY */
+ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
+ ASSERT(c >= 0 && c < mm->mm_children);
+ mc = &mm->mm_child[c];
+ zio_vdev_io_redone(zio);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
+ ZIO_TYPE_READ, zio->io_priority, 0,
+ vdev_mirror_child_done, mc));
+ return;
+ }
+
+ /* XXPOLICY */
+ if (good_copies == 0) {
+ zio->io_error = vdev_mirror_worst_error(mm);
+ ASSERT(zio->io_error != 0);
+ }
+
+ if (good_copies && spa_writeable(zio->io_spa) &&
+ (unexpected_errors ||
+ (zio->io_flags & ZIO_FLAG_RESILVER) ||
+ ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ */
+ for (c = 0; c < mm->mm_children; c++) {
+ /*
+ * Don't rewrite known good children.
+ * Not only is it unnecessary, it could
+ * actually be harmful: if the system lost
+ * power while rewriting the only good copy,
+ * there would be no good copies left!
+ */
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_error == 0) {
+ vdev_ops_t *ops = mc->mc_vd->vdev_ops;
+
+ if (mc->mc_tried)
+ continue;
+ /*
+ * We didn't try this child. We need to
+ * repair it if:
+ * 1. it's a scrub (in which case we have
+ * tried everything that was healthy)
+ * - or -
+ * 2. it's an indirect or distributed spare
+ * vdev (in which case it could point to any
+ * other vdev, which might have a bad DTL)
+ * - or -
+ * 3. the DTL indicates that this data is
+ * missing from this vdev
+ */
+ if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ ops != &vdev_indirect_ops &&
+ ops != &vdev_draid_spare_ops &&
+ !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
+ zio->io_txg, 1))
+ continue;
+ mc->mc_error = SET_ERROR(ESTALE);
+ }
+
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset,
+ zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
+ zio->io_priority == ZIO_PRIORITY_REBUILD ?
+ ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
+ }
+ }
+}
+
+static void
+vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (faulted == vd->vdev_children) {
+ if (vdev_children_are_offline(vd)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
+ VDEV_AUX_CHILDREN_OFFLINE);
+ } else {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ }
+ } else if (degraded + faulted != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ } else {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ }
+}
+
+/*
+ * Return the maximum asize for a rebuild zio in the provided range.
+ */
+static uint64_t
+vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
+ uint64_t max_segment)
+{
+ uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
+ SPA_MAXBLOCKSIZE);
+
+ return (MIN(asize, vdev_psize_to_asize(vd, psize)));
+}
+
+vdev_ops_t vdev_mirror_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_mirror_open,
+ .vdev_op_close = vdev_mirror_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_mirror_io_start,
+ .vdev_op_io_done = vdev_mirror_io_done,
+ .vdev_op_state_change = vdev_mirror_state_change,
+ .vdev_op_need_resilver = vdev_default_need_resilver,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_replacing_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_mirror_open,
+ .vdev_op_close = vdev_mirror_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_mirror_io_start,
+ .vdev_op_io_done = vdev_mirror_io_done,
+ .vdev_op_state_change = vdev_mirror_state_change,
+ .vdev_op_need_resilver = vdev_default_need_resilver,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_spare_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_mirror_open,
+ .vdev_op_close = vdev_mirror_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_mirror_io_start,
+ .vdev_op_io_done = vdev_mirror_io_done,
+ .vdev_op_state_change = vdev_mirror_state_change,
+ .vdev_op_need_resilver = vdev_default_need_resilver,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* not a leaf vdev */
+};
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
+ "Rotating media load increment for non-seeking I/O's");
+
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW,
+ "Rotating media load increment for seeking I/O's");
+
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW,
+ "Offset in bytes from the last I/O which triggers "
+ "a reduced rotating media seek increment");
+
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW,
+ "Non-rotating media load increment for non-seeking I/O's");
+
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, ZMOD_RW,
+ "Non-rotating media load increment for seeking I/O's");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_missing.c b/sys/contrib/openzfs/module/zfs/vdev_missing.c
new file mode 100644
index 000000000000..e9145fd012d7
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_missing.c
@@ -0,0 +1,131 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * The 'missing' vdev is a special vdev type used only during import. It
+ * signifies a placeholder in the root vdev for some vdev that we know is
+ * missing. We pass it down to the kernel to allow the rest of the
+ * configuration to be parsed and an attempt made to open all available
+ * devices.
+ * Because its GUID is always 0, we know that the guid sum will mismatch and we
+ * won't be able to open the pool anyway.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+/* ARGSUSED */
+static int
+vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *ashift, uint64_t *pshift)
+{
+ /*
+ * Really this should just fail. But then the root vdev will be in the
+ * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is
+ * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
+ * will fail the GUID sum check before ever trying to open the pool.
+ */
+ *psize = 0;
+ *max_psize = 0;
+ *ashift = 0;
+ *pshift = 0;
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_io_start(zio_t *zio)
+{
+ zio->io_error = SET_ERROR(ENOTSUP);
+ zio_execute(zio);
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_missing_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_missing_open,
+ .vdev_op_close = vdev_missing_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_missing_io_start,
+ .vdev_op_io_done = vdev_missing_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+vdev_ops_t vdev_hole_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_missing_open,
+ .vdev_op_close = vdev_missing_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_missing_io_start,
+ .vdev_op_io_done = vdev_missing_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c
new file mode 100644
index 000000000000..25a4bc69cc23
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c
@@ -0,0 +1,1164 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/vdev_impl.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+#include <sys/dsl_pool.h>
+#include <sys/metaslab_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/kstat.h>
+#include <sys/abd.h>
+
+/*
+ * ZFS I/O Scheduler
+ * ---------------
+ *
+ * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The
+ * I/O scheduler determines when and in what order those operations are
+ * issued. The I/O scheduler divides operations into five I/O classes
+ * prioritized in the following order: sync read, sync write, async read,
+ * async write, and scrub/resilver. Each queue defines the minimum and
+ * maximum number of concurrent operations that may be issued to the device.
+ * In addition, the device has an aggregate maximum. Note that the sum of the
+ * per-queue minimums must not exceed the aggregate maximum. If the
+ * sum of the per-queue maximums exceeds the aggregate maximum, then the
+ * number of active i/os may reach zfs_vdev_max_active, in which case no
+ * further i/os will be issued regardless of whether all per-queue
+ * minimums have been met.
+ *
+ * For many physical devices, throughput increases with the number of
+ * concurrent operations, but latency typically suffers. Further, physical
+ * devices typically have a limit at which more concurrent operations have no
+ * effect on throughput or can actually cause it to decrease.
+ *
+ * The scheduler selects the next operation to issue by first looking for an
+ * I/O class whose minimum has not been satisfied. Once all are satisfied and
+ * the aggregate maximum has not been hit, the scheduler looks for classes
+ * whose maximum has not been satisfied. Iteration through the I/O classes is
+ * done in the order specified above. No further operations are issued if the
+ * aggregate maximum number of concurrent operations has been hit or if there
+ * are no operations queued for an I/O class that has not hit its maximum.
+ * Every time an i/o is queued or an operation completes, the I/O scheduler
+ * looks for new operations to issue.
+ *
+ * All I/O classes have a fixed maximum number of outstanding operations
+ * except for the async write class. Asynchronous writes represent the data
+ * that is committed to stable storage during the syncing stage for
+ * transaction groups (see txg.c). Transaction groups enter the syncing state
+ * periodically so the number of queued async writes will quickly burst up and
+ * then bleed down to zero. Rather than servicing them as quickly as possible,
+ * the I/O scheduler changes the maximum number of active async write i/os
+ * according to the amount of dirty data in the pool (see dsl_pool.c). Since
+ * both throughput and latency typically increase with the number of
+ * concurrent operations issued to physical devices, reducing the burstiness
+ * in the number of concurrent operations also stabilizes the response time of
+ * operations from other -- and in particular synchronous -- queues. In broad
+ * strokes, the I/O scheduler will issue more concurrent operations from the
+ * async write queue as there's more dirty data in the pool.
+ *
+ * Async Writes
+ *
+ * The number of concurrent operations issued for the async write I/O class
+ * follows a piece-wise linear function defined by a few adjustable points.
+ *
+ * | o---------| <-- zfs_vdev_async_write_max_active
+ * ^ | /^ |
+ * | | / | |
+ * active | / | |
+ * I/O | / | |
+ * count | / | |
+ * | / | |
+ * |------------o | | <-- zfs_vdev_async_write_min_active
+ * 0|____________^______|_________|
+ * 0% | | 100% of zfs_dirty_data_max
+ * | |
+ * | `-- zfs_vdev_async_write_active_max_dirty_percent
+ * `--------- zfs_vdev_async_write_active_min_dirty_percent
+ *
+ * Until the amount of dirty data exceeds a minimum percentage of the dirty
+ * data allowed in the pool, the I/O scheduler will limit the number of
+ * concurrent operations to the minimum. As that threshold is crossed, the
+ * number of concurrent operations issued increases linearly to the maximum at
+ * the specified maximum percentage of the dirty data allowed in the pool.
+ *
+ * Ideally, the amount of dirty data on a busy pool will stay in the sloped
+ * part of the function between zfs_vdev_async_write_active_min_dirty_percent
+ * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
+ * maximum percentage, this indicates that the rate of incoming data is
+ * greater than the rate that the backend storage can handle. In this case, we
+ * must further throttle incoming writes (see dmu_tx_delay() for details).
+ */
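+
+/*
+ * Illustrative example (not part of the code): with min_active = 2,
+ * max_active = 10, min_dirty = 30% and max_dirty = 60%, a pool holding
+ * 45% of zfs_dirty_data_max would be allowed roughly
+ * 2 + (10 - 2) * (45 - 30) / (60 - 30) = 6 concurrent async writes.
+ */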
+
+/*
+ * The maximum number of i/os active to each device. Ideally, this will be >=
+ * the sum of each queue's max_active.
+ */
+uint32_t zfs_vdev_max_active = 1000;
+
+/*
+ * Per-queue limits on the number of i/os active to each device. If the
+ * number of active i/os is < zfs_vdev_max_active, then the min_active comes
+ * into play. We will send min_active from each queue round-robin, and then
+ * send from queues in the order defined by zio_priority_t up to max_active.
+ * Some queues have additional mechanisms to limit number of active I/Os in
+ * addition to min_active and max_active, see below.
+ *
+ * In general, smaller max_active's will lead to lower latency of synchronous
+ * operations. Larger max_active's may lead to higher overall throughput,
+ * depending on underlying storage.
+ *
+ * The ratio of the queues' max_actives determines the balance of performance
+ * between reads, writes, and scrubs. E.g., increasing
+ * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
+ * more quickly, but reads and writes to have higher latency and lower
+ * throughput.
+ */
+uint32_t zfs_vdev_sync_read_min_active = 10;
+uint32_t zfs_vdev_sync_read_max_active = 10;
+uint32_t zfs_vdev_sync_write_min_active = 10;
+uint32_t zfs_vdev_sync_write_max_active = 10;
+uint32_t zfs_vdev_async_read_min_active = 1;
+uint32_t zfs_vdev_async_read_max_active = 3;
+uint32_t zfs_vdev_async_write_min_active = 2;
+uint32_t zfs_vdev_async_write_max_active = 10;
+uint32_t zfs_vdev_scrub_min_active = 1;
+uint32_t zfs_vdev_scrub_max_active = 3;
+uint32_t zfs_vdev_removal_min_active = 1;
+uint32_t zfs_vdev_removal_max_active = 2;
+uint32_t zfs_vdev_initializing_min_active = 1;
+uint32_t zfs_vdev_initializing_max_active = 1;
+uint32_t zfs_vdev_trim_min_active = 1;
+uint32_t zfs_vdev_trim_max_active = 2;
+uint32_t zfs_vdev_rebuild_min_active = 1;
+uint32_t zfs_vdev_rebuild_max_active = 3;
+
+/*
+ * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
+ * dirty data, use zfs_vdev_async_write_min_active. When it has more than
+ * zfs_vdev_async_write_active_max_dirty_percent, use
+ * zfs_vdev_async_write_max_active. The value is linearly interpolated
+ * between min and max.
+ */
+int zfs_vdev_async_write_active_min_dirty_percent = 30;
+int zfs_vdev_async_write_active_max_dirty_percent = 60;
+
+/*
+ * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
+ * the number of concurrently-active I/O's is limited to *_min_active, unless
+ * the vdev is "idle". When there are no interactive I/Os active (sync or
+ * async), and zfs_vdev_nia_delay I/Os have completed since the last
+ * interactive I/O, then the vdev is considered to be "idle", and the number
+ * of concurrently-active non-interactive I/O's is increased to *_max_active.
+ */
+uint_t zfs_vdev_nia_delay = 5;
+
+/*
+ * Some HDDs tend to prioritize sequential I/O so high that concurrent
+ * random I/O latency reaches several seconds. On some HDDs it happens
+ * even if sequential I/Os are submitted one at a time, and so setting
+ * *_max_active to 1 does not help. To prevent non-interactive I/Os, like
+ * scrub, from monopolizing the device, no more than zfs_vdev_nia_credit
+ * I/Os can be sent while there are outstanding incomplete interactive
+ * I/Os. This enforced wait ensures the HDD services the interactive I/O
+ * within a reasonable amount of time.
+ */
+uint_t zfs_vdev_nia_credit = 5;
+
+/*
+ * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
+ * For read I/Os, we also aggregate across small adjacency gaps; for writes
+ * we include spans of optional I/Os to aid aggregation at the disk even when
+ * they aren't able to help us aggregate at this level.
+ */
+int zfs_vdev_aggregation_limit = 1 << 20;
+int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE;
+int zfs_vdev_read_gap_limit = 32 << 10;
+int zfs_vdev_write_gap_limit = 4 << 10;
+
+/*
+ * Define the queue depth percentage for each top-level vdev. This percentage
+ * is used in conjunction with zfs_vdev_async_write_max_active to determine
+ * how many allocations a specific top-level vdev should handle. Once the
+ * queue depth reaches zfs_vdev_queue_depth_pct *
+ * zfs_vdev_async_write_max_active / 100, the allocator will stop allocating
+ * blocks on that top-level vdev. The default kernel setting is 1000%, which
+ * will yield 100 allocations per device. For userland testing, the default
+ * setting is 300%, which equates to 30 allocations per device.
+ */
+#ifdef _KERNEL
+int zfs_vdev_queue_depth_pct = 1000;
+#else
+int zfs_vdev_queue_depth_pct = 300;
+#endif
+
+/*
+ * When performing allocations for a given metaslab, we want to make sure that
+ * there are enough IOs to aggregate together to improve throughput. We want to
+ * ensure that there are at least 128k worth of IOs that can be aggregated, and
+ * we assume that the average allocation size is 4k, so we need the queue depth
+ * to be 32 per allocator to get good aggregation of sequential writes.
+ */
+int zfs_vdev_def_queue_depth = 32;
+
+/*
+ * Allow TRIM I/Os to be aggregated. This should normally not be needed since
+ * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
+ * by the TRIM code in vdev_trim.c.
+ */
+int zfs_vdev_aggregate_trim = 0;
+
+static int
+vdev_queue_offset_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = (const zio_t *)x1;
+ const zio_t *z2 = (const zio_t *)x2;
+
+ int cmp = TREE_CMP(z1->io_offset, z2->io_offset);
+
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_PCMP(z1, z2));
+}
+
+static inline avl_tree_t *
+vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
+{
+ return (&vq->vq_class[p].vqc_queued_tree);
+}
+
+static inline avl_tree_t *
+vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
+{
+ ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
+ if (t == ZIO_TYPE_READ)
+ return (&vq->vq_read_offset_tree);
+ else if (t == ZIO_TYPE_WRITE)
+ return (&vq->vq_write_offset_tree);
+ else
+ return (&vq->vq_trim_offset_tree);
+}
+
+static int
+vdev_queue_timestamp_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = (const zio_t *)x1;
+ const zio_t *z2 = (const zio_t *)x2;
+
+ int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
+
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_PCMP(z1, z2));
+}
+
+static int
+vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
+{
+ switch (p) {
+ case ZIO_PRIORITY_SYNC_READ:
+ return (zfs_vdev_sync_read_min_active);
+ case ZIO_PRIORITY_SYNC_WRITE:
+ return (zfs_vdev_sync_write_min_active);
+ case ZIO_PRIORITY_ASYNC_READ:
+ return (zfs_vdev_async_read_min_active);
+ case ZIO_PRIORITY_ASYNC_WRITE:
+ return (zfs_vdev_async_write_min_active);
+ case ZIO_PRIORITY_SCRUB:
+ return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active));
+ case ZIO_PRIORITY_REMOVAL:
+ return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active));
+ case ZIO_PRIORITY_INITIALIZING:
+ return (vq->vq_ia_active == 0 ? zfs_vdev_initializing_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active));
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_min_active);
+ case ZIO_PRIORITY_REBUILD:
+ return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active));
+ default:
+ panic("invalid priority %u", p);
+ return (0);
+ }
+}
+
+static int
+vdev_queue_max_async_writes(spa_t *spa)
+{
+ int writes;
+ uint64_t dirty = 0;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ uint64_t min_bytes = zfs_dirty_data_max *
+ zfs_vdev_async_write_active_min_dirty_percent / 100;
+ uint64_t max_bytes = zfs_dirty_data_max *
+ zfs_vdev_async_write_active_max_dirty_percent / 100;
+
+ /*
+ * Async writes may occur before the assignment of the spa's
+ * dsl_pool_t if a self-healing zio is issued prior to the
+ * completion of dmu_objset_open_impl().
+ */
+ if (dp == NULL)
+ return (zfs_vdev_async_write_max_active);
+
+ /*
+ * Sync tasks correspond to interactive user actions. To reduce the
+ * execution time of those actions we push data out as fast as possible.
+ */
+ dirty = dp->dp_dirty_total;
+ if (dirty > max_bytes || spa_has_pending_synctask(spa))
+ return (zfs_vdev_async_write_max_active);
+
+ if (dirty < min_bytes)
+ return (zfs_vdev_async_write_min_active);
+
+ /*
+ * linear interpolation:
+ * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
+ * move right by min_bytes
+ * move up by min_writes
+ */
+ writes = (dirty - min_bytes) *
+ (zfs_vdev_async_write_max_active -
+ zfs_vdev_async_write_min_active) /
+ (max_bytes - min_bytes) +
+ zfs_vdev_async_write_min_active;
+ ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
+ ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
+ return (writes);
+}
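+
+/*
+ * Worked example of the interpolation above, with the default tunables
+ * (min_active = 2, max_active = 10, 30%/60% dirty thresholds): a pool
+ * that is 45% dirty sits halfway between the thresholds, so
+ * writes = (dirty - min_bytes) * (10 - 2) / (max_bytes - min_bytes) + 2
+ *        = 8 / 2 + 2 = 6 concurrent async writes per vdev.
+ */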
+
+static int
+vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
+{
+ switch (p) {
+ case ZIO_PRIORITY_SYNC_READ:
+ return (zfs_vdev_sync_read_max_active);
+ case ZIO_PRIORITY_SYNC_WRITE:
+ return (zfs_vdev_sync_write_max_active);
+ case ZIO_PRIORITY_ASYNC_READ:
+ return (zfs_vdev_async_read_max_active);
+ case ZIO_PRIORITY_ASYNC_WRITE:
+ return (vdev_queue_max_async_writes(spa));
+ case ZIO_PRIORITY_SCRUB:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_scrub_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_scrub_min_active));
+ return (zfs_vdev_scrub_max_active);
+ case ZIO_PRIORITY_REMOVAL:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_removal_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_removal_min_active));
+ return (zfs_vdev_removal_max_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_initializing_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_initializing_min_active));
+ return (zfs_vdev_initializing_max_active);
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_max_active);
+ case ZIO_PRIORITY_REBUILD:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_rebuild_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_rebuild_min_active));
+ return (zfs_vdev_rebuild_max_active);
+ default:
+ panic("invalid priority %u", p);
+ return (0);
+ }
+}
+
+/*
+ * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
+ * there is no eligible class.
+ */
+static zio_priority_t
+vdev_queue_class_to_issue(vdev_queue_t *vq)
+{
+ spa_t *spa = vq->vq_vdev->vdev_spa;
+ zio_priority_t p, n;
+
+ if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+ return (ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ /*
+ * Find a queue that has not reached its minimum # outstanding i/os.
+ * Do round-robin to reduce starvation due to zfs_vdev_max_active
+ * and vq_nia_credit limits.
+ */
+ for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
+ p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+ vq->vq_class[p].vqc_active <
+ vdev_queue_class_min_active(vq, p)) {
+ vq->vq_last_prio = p;
+ return (p);
+ }
+ }
+
+ /*
+ * If we haven't found a queue, look for one that hasn't reached its
+ * maximum # outstanding i/os.
+ */
+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+ vq->vq_class[p].vqc_active <
+ vdev_queue_class_max_active(spa, vq, p)) {
+ vq->vq_last_prio = p;
+ return (p);
+ }
+ }
+
+ /* No eligible queued i/os */
+ return (ZIO_PRIORITY_NUM_QUEUEABLE);
+}
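+
+/*
+ * For example, with the defaults a vdev that already has 10 sync reads
+ * active has met that class's minimum, so the first loop above moves on
+ * (round-robin) to classes still below their min_active; only when no
+ * class with queued i/os is below its minimum does the second loop hand
+ * out the remaining zfs_vdev_max_active slots in zio_priority_t order.
+ */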
+
+void
+vdev_queue_init(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+ zio_priority_t p;
+
+ mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+ vq->vq_vdev = vd;
+ taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent);
+
+ avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+
+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ int (*compfn) (const void *, const void *);
+
+ /*
+ * The synchronous/trim i/o queues are dispatched in FIFO rather
+ * than LBA order. This provides more consistent latency for
+ * these i/os.
+ */
+ if (p == ZIO_PRIORITY_SYNC_READ ||
+ p == ZIO_PRIORITY_SYNC_WRITE ||
+ p == ZIO_PRIORITY_TRIM) {
+ compfn = vdev_queue_timestamp_compare;
+ } else {
+ compfn = vdev_queue_offset_compare;
+ }
+ avl_create(vdev_queue_class_tree(vq, p), compfn,
+ sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ }
+
+ vq->vq_last_offset = 0;
+}
+
+void
+vdev_queue_fini(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
+ avl_destroy(vdev_queue_class_tree(vq, p));
+ avl_destroy(&vq->vq_active_tree);
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
+
+ mutex_destroy(&vq->vq_lock);
+}
+
+static void
+vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
+
+ if (shk->kstat != NULL) {
+ mutex_enter(&shk->lock);
+ kstat_waitq_enter(shk->kstat->ks_data);
+ mutex_exit(&shk->lock);
+ }
+}
+
+static void
+vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
+
+ if (shk->kstat != NULL) {
+ mutex_enter(&shk->lock);
+ kstat_waitq_exit(shk->kstat->ks_data);
+ mutex_exit(&shk->lock);
+ }
+}
+
+static boolean_t
+vdev_queue_is_interactive(zio_priority_t p)
+{
+ switch (p) {
+ case ZIO_PRIORITY_SCRUB:
+ case ZIO_PRIORITY_REMOVAL:
+ case ZIO_PRIORITY_INITIALIZING:
+ case ZIO_PRIORITY_REBUILD:
+ return (B_FALSE);
+ default:
+ return (B_TRUE);
+ }
+}
+
+static void
+vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ vq->vq_class[zio->io_priority].vqc_active++;
+ if (vdev_queue_is_interactive(zio->io_priority)) {
+ if (++vq->vq_ia_active == 1)
+ vq->vq_nia_credit = 1;
+ } else if (vq->vq_ia_active > 0) {
+ vq->vq_nia_credit--;
+ }
+ avl_add(&vq->vq_active_tree, zio);
+
+ if (shk->kstat != NULL) {
+ mutex_enter(&shk->lock);
+ kstat_runq_enter(shk->kstat->ks_data);
+ mutex_exit(&shk->lock);
+ }
+}
+
+static void
+vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ vq->vq_class[zio->io_priority].vqc_active--;
+ if (vdev_queue_is_interactive(zio->io_priority)) {
+ if (--vq->vq_ia_active == 0)
+ vq->vq_nia_credit = 0;
+ else
+ vq->vq_nia_credit = zfs_vdev_nia_credit;
+ } else if (vq->vq_ia_active == 0)
+ vq->vq_nia_credit++;
+ avl_remove(&vq->vq_active_tree, zio);
+
+ if (shk->kstat != NULL) {
+ kstat_io_t *ksio = shk->kstat->ks_data;
+
+ mutex_enter(&shk->lock);
+ kstat_runq_exit(ksio);
+ if (zio->io_type == ZIO_TYPE_READ) {
+ ksio->reads++;
+ ksio->nread += zio->io_size;
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ ksio->writes++;
+ ksio->nwritten += zio->io_size;
+ }
+ mutex_exit(&shk->lock);
+ }
+}
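+
+/*
+ * Illustrative trace of the vq_nia_credit accounting above, with the
+ * default tunables (zfs_vdev_nia_credit = 5, zfs_vdev_nia_delay = 5):
+ * when a sync read becomes active, vq_ia_active goes 0 -> 1 and
+ * vq_nia_credit is set to 1, so at most one scrub i/o may be issued
+ * alongside it; issuing that scrub i/o drops the credit to 0, blocking
+ * further scrub i/os until an interactive i/o completes (credit refilled
+ * to 5) or the last one completes (credit reset to 0). With no
+ * interactive i/os active, each non-interactive completion increments
+ * the credit; once it reaches zfs_vdev_nia_delay the vdev is considered
+ * idle and scrub may ramp up to zfs_vdev_scrub_max_active.
+ */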
+
+static void
+vdev_queue_agg_io_done(zio_t *aio)
+{
+ abd_free(aio->io_abd);
+}
+
+/*
+ * Compute the range spanned by two i/os, which is the endpoint of the last
+ * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
+ * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
+ * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
+ */
+#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
+#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
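+
+/*
+ * For example, for a 4K i/o at offset 0 (fio) and a 4K i/o at offset 8K
+ * (lio): IO_SPAN(fio, lio) = 8K + 4K - 0 = 12K is the range an aggregate
+ * i/o covering both would span, and IO_GAP(fio, lio) = 8K - (0 + 4K) = 4K
+ * is the gap that would have to be filled or skipped.
+ */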
+
+/*
+ * ZIOs with sufficiently adjacent io_offsets will be aggregated. We do this
+ * by creating a gang ABD from the adjacent ZIOs' io_abd's. By using a
+ * gang ABD we avoid doing memory copies to and from the parent and
+ * child ZIOs. The gang ABD also accounts for gaps between adjacent
+ * io_offsets by simply getting the zero ABD for writes or allocating
+ * a new ABD for reads and placing them in the gang ABD as well.
+ */
+static zio_t *
+vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
+{
+ zio_t *first, *last, *aio, *dio, *mandatory, *nio;
+ zio_link_t *zl = NULL;
+ uint64_t maxgap = 0;
+ uint64_t size;
+ uint64_t limit;
+ int maxblocksize;
+ boolean_t stretch = B_FALSE;
+ avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
+ enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+ uint64_t next_offset;
+ abd_t *abd;
+
+ maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
+ if (vq->vq_vdev->vdev_nonrot)
+ limit = zfs_vdev_aggregation_limit_non_rotating;
+ else
+ limit = zfs_vdev_aggregation_limit;
+ limit = MAX(MIN(limit, maxblocksize), 0);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
+ return (NULL);
+
+ /*
+ * While TRIM commands could be aggregated based on offset this
+ * behavior is disabled until it's determined to be beneficial.
+ */
+ if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
+ return (NULL);
+
+ /*
+ * I/Os to distributed spares are directly dispatched to the dRAID
+ * leaf vdevs for aggregation. See the comment at the end of the
+ * zio_vdev_io_start() function.
+ */
+ ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops);
+
+ first = last = zio;
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ maxgap = zfs_vdev_read_gap_limit;
+
+ /*
+ * We can aggregate I/Os that are sufficiently adjacent and of
+ * the same flavor, as expressed by the AGG_INHERIT flags.
+ * The latter requirement is necessary so that certain
+ * attributes of the I/O, such as whether it's a normal I/O
+ * or a scrub/resilver, can be preserved in the aggregate.
+ * We can include optional I/Os, but don't allow them
+ * to begin a range as they add no benefit in that situation.
+ */
+
+ /*
+ * We keep track of the last non-optional I/O.
+ */
+ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
+
+ /*
+ * Walk backwards through sufficiently contiguous I/Os
+ * recording the last non-optional I/O.
+ */
+ while ((dio = AVL_PREV(t, first)) != NULL &&
+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+ IO_SPAN(dio, last) <= limit &&
+ IO_GAP(dio, first) <= maxgap &&
+ dio->io_type == zio->io_type) {
+ first = dio;
+ if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
+ mandatory = first;
+ }
+
+ /*
+ * Skip any initial optional I/Os.
+ */
+ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
+ first = AVL_NEXT(t, first);
+ ASSERT(first != NULL);
+ }
+
+
+ /*
+ * Walk forward through sufficiently contiguous I/Os.
+ * The aggregation limit does not apply to optional i/os, so that
+ * we can issue contiguous writes even if they are larger than the
+ * aggregation limit.
+ */
+ while ((dio = AVL_NEXT(t, last)) != NULL &&
+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+ (IO_SPAN(first, dio) <= limit ||
+ (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
+ IO_SPAN(first, dio) <= maxblocksize &&
+ IO_GAP(last, dio) <= maxgap &&
+ dio->io_type == zio->io_type) {
+ last = dio;
+ if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
+ mandatory = last;
+ }
+
+ /*
+ * Now that we've established the range of the I/O aggregation
+ * we must decide what to do with trailing optional I/Os.
+ * For reads, there's nothing to do. For writes, while we are unable to
+ * aggregate further, it's possible that a trailing optional
+ * I/O would allow the underlying device to aggregate with
+ * subsequent I/Os. We must therefore determine if the next
+ * non-optional I/O is close enough to make aggregation
+ * worthwhile.
+ */
+ if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
+ zio_t *nio = last;
+ while ((dio = AVL_NEXT(t, nio)) != NULL &&
+ IO_GAP(nio, dio) == 0 &&
+ IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
+ nio = dio;
+ if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+ stretch = B_TRUE;
+ break;
+ }
+ }
+ }
+
+ if (stretch) {
+ /*
+ * We are going to include an optional io in our aggregated
+ * span, thus closing the write gap. Only mandatory i/os can
+ * start aggregated spans, so make sure that the next i/o
+ * after our span is mandatory.
+ */
+ dio = AVL_NEXT(t, last);
+ dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+ } else {
+ /* do not include the optional i/o */
+ while (last != mandatory && last != first) {
+ ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
+ last = AVL_PREV(t, last);
+ ASSERT(last != NULL);
+ }
+ }
+
+ if (first == last)
+ return (NULL);
+
+ size = IO_SPAN(first, last);
+ ASSERT3U(size, <=, maxblocksize);
+
+ abd = abd_alloc_gang();
+ if (abd == NULL)
+ return (NULL);
+
+ aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
+ abd, size, first->io_type, zio->io_priority,
+ flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+ vdev_queue_agg_io_done, NULL);
+ aio->io_timestamp = first->io_timestamp;
+
+ nio = first;
+ next_offset = first->io_offset;
+ do {
+ dio = nio;
+ nio = AVL_NEXT(t, dio);
+ zio_add_child(dio, aio);
+ vdev_queue_io_remove(vq, dio);
+
+ if (dio->io_offset != next_offset) {
+ /* allocate a buffer for a read gap */
+ ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ);
+ ASSERT3U(dio->io_offset, >, next_offset);
+ abd = abd_alloc_for_io(
+ dio->io_offset - next_offset, B_TRUE);
+ abd_gang_add(aio->io_abd, abd, B_TRUE);
+ }
+ if (dio->io_abd &&
+ (dio->io_size != abd_get_size(dio->io_abd))) {
+ /* abd size not the same as IO size */
+ ASSERT3U(abd_get_size(dio->io_abd), >, dio->io_size);
+ abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size);
+ abd_gang_add(aio->io_abd, abd, B_TRUE);
+ } else {
+ if (dio->io_flags & ZIO_FLAG_NODATA) {
+ /* allocate a buffer for a write gap */
+ ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3P(dio->io_abd, ==, NULL);
+ abd_gang_add(aio->io_abd,
+ abd_get_zeros(dio->io_size), B_TRUE);
+ } else {
+ /*
+ * We pass B_FALSE to abd_gang_add()
+ * because we did not allocate a new
+ * ABD, so it is assumed the caller
+ * will free this ABD.
+ */
+ abd_gang_add(aio->io_abd, dio->io_abd,
+ B_FALSE);
+ }
+ }
+ next_offset = dio->io_offset + dio->io_size;
+ } while (dio != last);
+ ASSERT3U(abd_get_size(aio->io_abd), ==, aio->io_size);
+
+ /*
+ * We need to drop the vdev queue's lock during zio_execute() to
+ * avoid a deadlock that we could encounter due to lock order
+ * reversal between vq_lock and io_lock in zio_change_priority().
+ */
+ mutex_exit(&vq->vq_lock);
+ while ((dio = zio_walk_parents(aio, &zl)) != NULL) {
+ ASSERT3U(dio->io_type, ==, aio->io_type);
+
+ zio_vdev_io_bypass(dio);
+ zio_execute(dio);
+ }
+ mutex_enter(&vq->vq_lock);
+
+ return (aio);
+}
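+
+/*
+ * Aggregation sketch (simplified): two queued 4K reads at offsets 0 and
+ * 8K fall within zfs_vdev_read_gap_limit (32K by default), so the code
+ * above builds a single 12K delegated read whose gang ABD holds the first
+ * read's buffer, a freshly allocated 4K buffer covering the gap, and the
+ * second read's buffer; both original zios become children of the
+ * aggregate and are bypassed.
+ */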
+
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq)
+{
+ zio_t *zio, *aio;
+ zio_priority_t p;
+ avl_index_t idx;
+ avl_tree_t *tree;
+
+again:
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ p = vdev_queue_class_to_issue(vq);
+
+ if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
+ /* No eligible queued i/os */
+ return (NULL);
+ }
+
+ /*
+ * For LBA-ordered queues (async / scrub / initializing), issue the
+ * i/o which follows the most recently issued i/o in LBA (offset) order.
+ *
+ * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
+ */
+ tree = vdev_queue_class_tree(vq, p);
+ vq->vq_io_search.io_timestamp = 0;
+ vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
+ VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
+ zio = avl_nearest(tree, idx, AVL_AFTER);
+ if (zio == NULL)
+ zio = avl_first(tree);
+ ASSERT3U(zio->io_priority, ==, p);
+
+ aio = vdev_queue_aggregate(vq, zio);
+ if (aio != NULL)
+ zio = aio;
+ else
+ vdev_queue_io_remove(vq, zio);
+
+ /*
+ * If the I/O is or was optional and therefore has no data, we need to
+ * simply discard it. We need to drop the vdev queue's lock to avoid a
+ * deadlock that we could encounter since this I/O will complete
+ * immediately.
+ */
+ if (zio->io_flags & ZIO_FLAG_NODATA) {
+ mutex_exit(&vq->vq_lock);
+ zio_vdev_io_bypass(zio);
+ zio_execute(zio);
+ mutex_enter(&vq->vq_lock);
+ goto again;
+ }
+
+ vdev_queue_pending_add(vq, zio);
+ vq->vq_last_offset = zio->io_offset + zio->io_size;
+
+ return (zio);
+}
+
+zio_t *
+vdev_queue_io(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+
+ if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
+ return (zio);
+
+ /*
+ * Children i/os inherit their parent's priority, which might
+ * not match the child's i/o type. Fix it up here.
+ */
+ if (zio->io_type == ZIO_TYPE_READ) {
+ ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
+
+ if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
+ zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
+ zio->io_priority != ZIO_PRIORITY_SCRUB &&
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
+ zio->io_priority != ZIO_PRIORITY_REBUILD) {
+ zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
+ }
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
+
+ if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
+ zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
+ zio->io_priority != ZIO_PRIORITY_REBUILD) {
+ zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+ }
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_TRIM);
+ ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
+ }
+
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+
+ mutex_enter(&vq->vq_lock);
+ zio->io_timestamp = gethrtime();
+ vdev_queue_io_add(vq, zio);
+ nio = vdev_queue_io_to_issue(vq);
+ mutex_exit(&vq->vq_lock);
+
+ if (nio == NULL)
+ return (NULL);
+
+ if (nio->io_done == vdev_queue_agg_io_done) {
+ zio_nowait(nio);
+ return (NULL);
+ }
+
+ return (nio);
+}
+
+void
+vdev_queue_io_done(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+
+ mutex_enter(&vq->vq_lock);
+
+ vdev_queue_pending_remove(vq, zio);
+
+ zio->io_delta = gethrtime() - zio->io_timestamp;
+ vq->vq_io_complete_ts = gethrtime();
+ vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
+
+ while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
+ mutex_exit(&vq->vq_lock);
+ if (nio->io_done == vdev_queue_agg_io_done) {
+ zio_nowait(nio);
+ } else {
+ zio_vdev_io_reissue(nio);
+ zio_execute(nio);
+ }
+ mutex_enter(&vq->vq_lock);
+ }
+
+ mutex_exit(&vq->vq_lock);
+}
+
+void
+vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ avl_tree_t *tree;
+
+ /*
+ * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
+ * code to issue IOs without adding them to the vdev queue. In this
+ * case, the zio is already going to be issued as quickly as possible
+ * and so it doesn't need any reprioritization to help.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_NOW)
+ return;
+
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (priority != ZIO_PRIORITY_SYNC_READ &&
+ priority != ZIO_PRIORITY_ASYNC_READ &&
+ priority != ZIO_PRIORITY_SCRUB)
+ priority = ZIO_PRIORITY_ASYNC_READ;
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ if (priority != ZIO_PRIORITY_SYNC_WRITE &&
+ priority != ZIO_PRIORITY_ASYNC_WRITE)
+ priority = ZIO_PRIORITY_ASYNC_WRITE;
+ }
+
+ mutex_enter(&vq->vq_lock);
+
+ /*
+ * If the zio is in none of the queues we can simply change
+ * the priority. If the zio is waiting to be submitted we must
+ * remove it from the queue and re-insert it with the new priority.
+ * Otherwise, the zio is currently active and we cannot change its
+ * priority.
+ */
+ tree = vdev_queue_class_tree(vq, zio->io_priority);
+ if (avl_find(tree, zio, NULL) == zio) {
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ zio->io_priority = priority;
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+ zio->io_priority = priority;
+ }
+
+ mutex_exit(&vq->vq_lock);
+}
+
+/*
+ * As these two methods are only used for load calculations, we're not
+ * concerned if we get an incorrect value on 32-bit platforms due to the lack
+ * of vq_lock mutex use here; instead, we prefer to keep it lock free for
+ * performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+ return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_last_offset(vdev_t *vd)
+{
+ return (vd->vdev_queue.vq_last_offset);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, INT, ZMOD_RW,
+ "Max vdev I/O aggregation size");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, INT, ZMOD_RW,
+ "Max vdev I/O aggregation size for non-rotating media");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, INT, ZMOD_RW,
+ "Allow TRIM I/O to be aggregated");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, INT, ZMOD_RW,
+ "Aggregate read I/O over gap");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, INT, ZMOD_RW,
+ "Aggregate write I/O over gap");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, INT, ZMOD_RW,
+ "Maximum number of active I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, INT, ZMOD_RW,
+ "Async write concurrency max threshold");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, INT, ZMOD_RW,
+ "Async write concurrency min threshold");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, INT, ZMOD_RW,
+ "Max active async read I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, INT, ZMOD_RW,
+ "Min active async read I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, INT, ZMOD_RW,
+ "Max active async write I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, INT, ZMOD_RW,
+ "Min active async write I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, INT, ZMOD_RW,
+ "Max active initializing I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, INT, ZMOD_RW,
+ "Min active initializing I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, INT, ZMOD_RW,
+ "Max active removal I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, INT, ZMOD_RW,
+ "Min active removal I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, INT, ZMOD_RW,
+ "Max active scrub I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, INT, ZMOD_RW,
+ "Min active scrub I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, INT, ZMOD_RW,
+ "Max active sync read I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, INT, ZMOD_RW,
+ "Min active sync read I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, INT, ZMOD_RW,
+ "Max active sync write I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, INT, ZMOD_RW,
+ "Min active sync write I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW,
+ "Max active trim/discard I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW,
+ "Min active trim/discard I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
+ "Max active rebuild I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
+ "Min active rebuild I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW,
+ "Number of non-interactive I/Os to allow in sequence");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW,
+ "Number of non-interactive I/Os before _max_active");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
+ "Queue depth percentage for each top-level vdev");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
new file mode 100644
index 000000000000..f4812e61252c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
@@ -0,0 +1,2747 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <sys/vdev_draid.h>
+
+#ifdef ZFS_DEBUG
+#include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
+#endif
+
+/*
+ * Virtual device vector for RAID-Z.
+ *
+ * This vdev supports single, double, and triple parity. For single parity,
+ * we use a simple XOR of all the data columns. For double or triple parity,
+ * we use a special case of Reed-Solomon coding. This extends the
+ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
+ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
+ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
+ * former is also based. The latter is designed to provide higher performance
+ * for writes.
+ *
+ * Note that the Plank paper claimed to support arbitrary N+M, but was then
+ * amended six years later identifying a critical flaw that invalidates its
+ * claims. Nevertheless, the technique can be adapted to work for up to
+ * triple parity. For additional parity, the amendment "Note: Correction to
+ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
+ * is viable, but the additional complexity means that write performance will
+ * suffer.
+ *
+ * All of the methods above operate on a Galois field with 2^N elements.
+ * In our case we choose N=8, i.e. GF(2^8), so that all elements
+ * can be expressed with a single byte. Briefly, the operations on the
+ * field are defined as follows:
+ *
+ * o addition (+) is represented by a bitwise XOR
+ * o subtraction (-) is therefore identical to addition: A + B = A - B
+ * o multiplication of A by 2 is defined by the following bitwise expression:
+ *
+ * (A * 2)_7 = A_6
+ * (A * 2)_6 = A_5
+ * (A * 2)_5 = A_4
+ * (A * 2)_4 = A_3 + A_7
+ * (A * 2)_3 = A_2 + A_7
+ * (A * 2)_2 = A_1 + A_7
+ * (A * 2)_1 = A_0
+ * (A * 2)_0 = A_7
+ *
+ * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ * As an aside, this multiplication is derived from the error correcting
+ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
+ *
+ * Observe that any number in the field (except for 0) can be expressed as a
+ * power of 2 -- a generator for the field. We store a table of the powers of
+ * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
+ * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
+ * than field addition). The inverse of a field element A (A^-1) is therefore
+ * A ^ (255 - 1) = A^254.
+ *
+ * The up-to-three parity columns, P, Q, R over several data columns,
+ * D_0, ... D_n-1, can be expressed by field operations:
+ *
+ * P = D_0 + D_1 + ... + D_n-2 + D_n-1
+ * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
+ * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
+ * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
+ *
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
+ * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
+ * independent coefficients. (There are no additional coefficients that have
+ * this property which is why the uncorrected Plank method breaks down.)
+ *
+ * See the reconstruction code below for how P, Q and R can be used individually
+ * or in concert to recover missing data columns.
+ */
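+
+/*
+ * Tiny worked example of the parity math above, for three data bytes
+ * D_0 = 0x01, D_1 = 0x02, D_2 = 0x03 (+ is XOR, * 2 is the field
+ * doubling defined below):
+ *
+ *   P = 0x01 + 0x02 + 0x03 = 0x00
+ *   Q = (D_0 * 2 + D_1) * 2 + D_2 = (0x02 + 0x02) * 2 + 0x03 = 0x03
+ */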
+
+#define VDEV_RAIDZ_P 0
+#define VDEV_RAIDZ_Q 1
+#define VDEV_RAIDZ_R 2
+
+#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
+#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
+
+/*
+ * We provide a mechanism to perform the field multiplication operation on a
+ * 64-bit value all at once rather than a byte at a time. This works by
+ * creating a mask from the top bit in each byte and using that to
+ * conditionally apply the XOR of 0x1d.
+ */
+#define VDEV_RAIDZ_64MUL_2(x, mask) \
+{ \
+ (mask) = (x) & 0x8080808080808080ULL; \
+ (mask) = ((mask) << 1) - ((mask) >> 7); \
+ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
+ ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
+}
+
+#define VDEV_RAIDZ_64MUL_4(x, mask) \
+{ \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+}
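+
+/*
+ * For a single byte the doubling reduces to the C expression given above,
+ * e.g. 0x53 * 2 = 0xa6 (top bit clear, plain shift) while
+ * 0x80 * 2 = 0x00 ^ 0x1d = 0x1d (top bit set, so the reduction constant
+ * 0x1d is folded in). The 64-bit macros do the same for eight bytes at
+ * once: the mask expands each byte's top bit into 0xff or 0x00, selecting
+ * which byte lanes get XORed with 0x1d after the shift.
+ */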
+
+static void
+vdev_raidz_row_free(raidz_row_t *rr)
+{
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size != 0)
+ abd_free(rc->rc_abd);
+ if (rc->rc_gdata != NULL)
+ abd_free(rc->rc_gdata);
+ if (rc->rc_orig_data != NULL)
+ zio_buf_free(rc->rc_orig_data, rc->rc_size);
+ }
+
+ if (rr->rr_abd_copy != NULL)
+ abd_free(rr->rr_abd_copy);
+
+ if (rr->rr_abd_empty != NULL)
+ abd_free(rr->rr_abd_empty);
+
+ kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
+}
+
+void
+vdev_raidz_map_free(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++)
+ vdev_raidz_row_free(rm->rm_row[i]);
+
+ kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
+}
+
+static void
+vdev_raidz_map_free_vsd(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ ASSERT0(rm->rm_freed);
+ rm->rm_freed = B_TRUE;
+
+ if (rm->rm_reports == 0) {
+ vdev_raidz_map_free(rm);
+ }
+}
+
+/*ARGSUSED*/
+static void
+vdev_raidz_cksum_free(void *arg, size_t ignored)
+{
+ raidz_map_t *rm = arg;
+
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (--rm->rm_reports == 0 && rm->rm_freed)
+ vdev_raidz_map_free(rm);
+}
+
+static void
+vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
+{
+ raidz_map_t *rm = zcr->zcr_cbdata;
+ const size_t c = zcr->zcr_cbinfo;
+ size_t x, offset;
+
+ if (good_data == NULL) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+ return;
+ }
+
+ ASSERT3U(rm->rm_nrows, ==, 1);
+ raidz_row_t *rr = rm->rm_row[0];
+
+ const abd_t *good = NULL;
+ const abd_t *bad = rr->rr_col[c].rc_abd;
+
+ if (c < rr->rr_firstdatacol) {
+ /*
+ * The first time through, calculate the parity blocks for
+ * the good data (this relies on the fact that the good
+ * data never changes for a given logical ZIO)
+ */
+ if (rr->rr_col[0].rc_gdata == NULL) {
+ abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
+
+ /*
+ * Set up the rr_col[]s to generate the parity for
+ * good_data, first saving the parity bufs and
+ * replacing them with buffers to hold the result.
+ */
+ for (x = 0; x < rr->rr_firstdatacol; x++) {
+ bad_parity[x] = rr->rr_col[x].rc_abd;
+ rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata =
+ abd_alloc_sametype(rr->rr_col[x].rc_abd,
+ rr->rr_col[x].rc_size);
+ }
+
+ /* fill in the data columns from good_data */
+ offset = 0;
+ for (; x < rr->rr_cols; x++) {
+ abd_free(rr->rr_col[x].rc_abd);
+
+ rr->rr_col[x].rc_abd =
+ abd_get_offset_size((abd_t *)good_data,
+ offset, rr->rr_col[x].rc_size);
+ offset += rr->rr_col[x].rc_size;
+ }
+
+ /*
+ * Construct the parity from the good data.
+ */
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ /* restore everything back to its original state */
+ for (x = 0; x < rr->rr_firstdatacol; x++)
+ rr->rr_col[x].rc_abd = bad_parity[x];
+
+ offset = 0;
+ for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
+ abd_free(rr->rr_col[x].rc_abd);
+ rr->rr_col[x].rc_abd = abd_get_offset_size(
+ rr->rr_abd_copy, offset,
+ rr->rr_col[x].rc_size);
+ offset += rr->rr_col[x].rc_size;
+ }
+ }
+
+ ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL);
+ good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0,
+ rr->rr_col[c].rc_size);
+ } else {
+ /* adjust good_data to point at the start of our column */
+ offset = 0;
+ for (x = rr->rr_firstdatacol; x < c; x++)
+ offset += rr->rr_col[x].rc_size;
+
+ good = abd_get_offset_size((abd_t *)good_data, offset,
+ rr->rr_col[c].rc_size);
+ }
+
+ /* we drop the ereport if it ends up that the data was good */
+ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+ abd_free((abd_t *)good);
+}
+
+/*
+ * Invoked indirectly by zfs_ereport_start_checksum(), called
+ * below when our read operation fails completely. The main point
+ * is to keep a copy of everything we read from disk, so that at
+ * vdev_raidz_cksum_finish() time we can compare it with the good data.
+ */
+static void
+vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
+{
+ size_t c = (size_t)(uintptr_t)arg;
+ raidz_map_t *rm = zio->io_vsd;
+
+ /* set up the report and bump the refcount */
+ zcr->zcr_cbdata = rm;
+ zcr->zcr_cbinfo = c;
+ zcr->zcr_finish = vdev_raidz_cksum_finish;
+ zcr->zcr_free = vdev_raidz_cksum_free;
+
+ rm->rm_reports++;
+ ASSERT3U(rm->rm_reports, >, 0);
+ ASSERT3U(rm->rm_nrows, ==, 1);
+
+ if (rm->rm_row[0]->rr_abd_copy != NULL)
+ return;
+
+ /*
+ * It's the first time we're called for this raidz_map_t, so we need
+ * to copy the data aside; there's no guarantee that our zio's buffer
+ * won't be re-used for something else.
+ *
+ * Our parity data is already in separate buffers, so there's no need
+ * to copy them.
+ */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ size_t offset = 0;
+ size_t size = 0;
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++)
+ size += rr->rr_col[c].rc_size;
+
+ rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE);
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+ abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy,
+ offset, col->rc_size);
+
+ abd_copy(tmp, col->rc_abd, col->rc_size);
+
+ abd_free(col->rc_abd);
+ col->rc_abd = tmp;
+
+ offset += col->rc_size;
+ }
+ ASSERT3U(offset, ==, size);
+ }
+}
+
+static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
+ .vsd_free = vdev_raidz_map_free_vsd,
+ .vsd_cksum_report = vdev_raidz_cksum_report
+};
+
+/*
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ *
+ * Avoid inlining the function to keep vdev_raidz_io_start(), which
+ * is this function's only caller, as small as possible on the stack.
+ */
+noinline raidz_map_t *
+vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
+ uint64_t nparity)
+{
+ raidz_row_t *rr;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
+ uint64_t b = zio->io_offset >> ashift;
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = zio->io_size >> ashift;
+ /* The first column for this stripe. */
+ uint64_t f = b % dcols;
+ /* The starting byte offset on each child vdev. */
+ uint64_t o = (b / dcols) << ashift;
+ uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+
+ raidz_map_t *rm =
+ kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
+ rm->rm_nrows = 1;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ */
+ q = s / (dcols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ r = s - q * (dcols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
+ bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ /*
+ * acols: The columns that will be accessed.
+ * scols: The columns that will be accessed or skipped.
+ */
+ if (q == 0) {
+ /* Our I/O request doesn't span all child vdevs. */
+ acols = bc;
+ scols = MIN(dcols, roundup(bc, nparity + 1));
+ } else {
+ acols = dcols;
+ scols = dcols;
+ }
+
+ ASSERT3U(acols, <=, scols);
+
+ rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP);
+ rm->rm_row[0] = rr;
+
+ rr->rr_cols = acols;
+ rr->rr_scols = scols;
+ rr->rr_bigcols = bc;
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+ rr->rr_firstdatacol = nparity;
+ rr->rr_abd_copy = NULL;
+ rr->rr_abd_empty = NULL;
+ rr->rr_nempty = 0;
+#ifdef ZFS_DEBUG
+ rr->rr_offset = zio->io_offset;
+ rr->rr_size = zio->io_size;
+#endif
+
+ asize = 0;
+
+ for (c = 0; c < scols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ col = f + c;
+ coff = o;
+ if (col >= dcols) {
+ col -= dcols;
+ coff += 1ULL << ashift;
+ }
+ rc->rc_devidx = col;
+ rc->rc_offset = coff;
+ rc->rc_abd = NULL;
+ rc->rc_gdata = NULL;
+ rc->rc_orig_data = NULL;
+ rc->rc_error = 0;
+ rc->rc_tried = 0;
+ rc->rc_skipped = 0;
+ rc->rc_repair = 0;
+ rc->rc_need_orig_restore = B_FALSE;
+
+ if (c >= acols)
+ rc->rc_size = 0;
+ else if (c < bc)
+ rc->rc_size = (q + 1) << ashift;
+ else
+ rc->rc_size = q << ashift;
+
+ asize += rc->rc_size;
+ }
+
+ ASSERT3U(asize, ==, tot << ashift);
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+ rm->rm_skipstart = bc;
+
+ for (c = 0; c < rr->rr_firstdatacol; c++)
+ rr->rr_col[c].rc_abd =
+ abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
+
+ for (uint64_t off = 0; c < acols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
+ zio->io_abd, off, rc->rc_size);
+ off += rc->rc_size;
+ }
+
+ /*
+ * If all data stored spans all columns, there's a danger that parity
+ * will always be on the same device and, since parity isn't read
+ * during normal operation, that device's I/O bandwidth won't be
+ * used effectively. We therefore switch the parity every 1MB.
+ *
+ * ... at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices evenly, we
+ * won't see any benefit. Further, occasional writes that aren't a
+ * multiple of the LCM of the number of children and the minimum
+ * stripe width are sufficient to avoid pessimal behavior.
+ * Unfortunately, this decision created an implicit on-disk format
+ * requirement that we need to support for all eternity, but only
+ * for single-parity RAID-Z.
+ *
+ * If we intend to skip a sector in the zeroth column for padding
+ * we must make sure to note this swap. We will never intend to
+ * skip the first column since at least one data and one parity
+ * column must appear in each row.
+ */
+ ASSERT(rr->rr_cols >= 2);
+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
+
+ if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+ devidx = rr->rr_col[0].rc_devidx;
+ o = rr->rr_col[0].rc_offset;
+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
+ rr->rr_col[1].rc_devidx = devidx;
+ rr->rr_col[1].rc_offset = o;
+
+ if (rm->rm_skipstart == 0)
+ rm->rm_skipstart = 1;
+ }
+
+ /* init RAIDZ parity ops */
+ rm->rm_ops = vdev_raidz_math_get_ops();
+
+ return (rm);
+}
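+
+/*
+ * Worked example of the geometry above (illustrative values): a raidz1
+ * vdev with dcols = 5 children, ashift = 12 (4K sectors) and a 20K zio
+ * gives s = 5, q = 5 / 4 = 1, r = 1, bc = r + nparity = 2 and
+ * tot = 5 + 1 * (1 + 1) = 7 sectors. The two "big columns" get
+ * (q + 1) << ashift = 8K each and the remaining three columns 4K each,
+ * so asize = 28K = tot << ashift, with rm_nskip = roundup(7, 2) - 7 = 1
+ * skipped sector starting at column rm_skipstart = bc = 2.
+ */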
+
+struct pqr_struct {
+ uint64_t *p;
+ uint64_t *q;
+ uint64_t *r;
+};
+
+static int
+vdev_raidz_p_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && !pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++)
+ *pqr->p ^= *src;
+
+ return (0);
+}
+
+static int
+vdev_raidz_pq_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_pqr_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
+ *pqr->r ^= *src;
+ }
+
+ return (0);
+}
+
+static void
+vdev_raidz_generate_parity_p(raidz_row_t *rr)
+{
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
+
+ if (c == rr->rr_firstdatacol) {
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ } else {
+ struct pqr_struct pqr = { p, NULL, NULL };
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
+ vdev_raidz_p_func, &pqr);
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pq(raidz_row_t *rr)
+{
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);
+
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
+
+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
+
+ if (c == rr->rr_firstdatacol) {
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
+
+ for (uint64_t i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
+ }
+ } else {
+ struct pqr_struct pqr = { p, q, NULL };
+
+ ASSERT(ccnt <= pcnt);
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
+ vdev_raidz_pq_func, &pqr);
+
+ /*
+ * Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
+ */
+ uint64_t mask;
+ for (uint64_t i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
+{
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_R].rc_size);
+
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
+
+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
+
+ if (c == rr->rr_firstdatacol) {
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
+ (void) memcpy(r, p, rr->rr_col[c].rc_size);
+
+ for (uint64_t i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
+ r[i] = 0;
+ }
+ } else {
+ struct pqr_struct pqr = { p, q, r };
+
+ ASSERT(ccnt <= pcnt);
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
+ vdev_raidz_pqr_func, &pqr);
+
+ /*
+ * Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
+ */
+ uint64_t mask;
+ for (uint64_t i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
+ VDEV_RAIDZ_64MUL_4(r[i], mask);
+ }
+ }
+ }
+}
+
+/*
+ * Generate RAID parity in the first virtual columns according to the number of
+ * parity columns available.
+ */
+void
+vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
+{
+ ASSERT3U(rr->rr_cols, !=, 0);
+
+ /* Generate using the new math implementation */
+ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
+ return;
+
+ switch (rr->rr_firstdatacol) {
+ case 1:
+ vdev_raidz_generate_parity_p(rr);
+ break;
+ case 2:
+ vdev_raidz_generate_parity_pq(rr);
+ break;
+ case 3:
+ vdev_raidz_generate_parity_pqr(rr);
+ break;
+ default:
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration");
+ }
+}
+
+void
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ vdev_raidz_generate_parity_row(rm, rr);
+ }
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ int cnt = size / sizeof (src[0]);
+
+ for (int i = 0; i < cnt; i++) {
+ dst[i] ^= src[i];
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
+ void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++, src++) {
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ *dst ^= *src;
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
+{
+ uint64_t *dst = buf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++) {
+ /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ }
+
+ return (0);
+}
+
+struct reconst_q_struct {
+ uint64_t *q;
+ int exp;
+};
+
+static int
+vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
+{
+ struct reconst_q_struct *rq = private;
+ uint64_t *dst = buf;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++, rq->q++) {
+ int j;
+ uint8_t *b;
+
+ *dst ^= *rq->q;
+ for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
+ *b = vdev_raidz_exp2(*b, rq->exp);
+ }
+ }
+
+ return (0);
+}
+
+struct reconst_pq_struct {
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *pxy;
+ uint8_t *qxy;
+ int aexp;
+ int bexp;
+};
+
+static int
+vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+ uint8_t *yd = ybuf;
+
+ for (int i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ *yd = *rpq->p ^ *rpq->pxy ^ *xd;
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+
+ for (int i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
+ /* same operation as vdev_raidz_reconst_pq_func() on xd */
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
+{
+ int x = tgts[0];
+ abd_t *dst, *src;
+
+ ASSERT3U(ntgts, ==, 1);
+ ASSERT3U(x, >=, rr->rr_firstdatacol);
+ ASSERT3U(x, <, rr->rr_cols);
+
+ ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
+
+ src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
+
+ abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
+
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ uint64_t size = MIN(rr->rr_col[x].rc_size,
+ rr->rr_col[c].rc_size);
+
+ src = rr->rr_col[c].rc_abd;
+
+ if (c == x)
+ continue;
+
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_p_func, NULL);
+ }
+
+ return (1 << VDEV_RAIDZ_P);
+}
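+
+/*
+ * In other words, with a single missing data column x, P makes
+ * reconstruction a plain XOR: D_x = P + D_0 + ... (all surviving data
+ * columns), which is what the loop above computes. Continuing the worked
+ * parity example above (D_0 = 0x01, D_1 = 0x02, D_2 = 0x03, P = 0x00):
+ * if D_1 is lost, D_1 = P + D_0 + D_2 = 0x00 ^ 0x01 ^ 0x03 = 0x02.
+ */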
+
+static int
+vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
+{
+ int x = tgts[0];
+ int c, exp;
+ abd_t *dst, *src;
+
+ ASSERT(ntgts == 1);
+
+ ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
+ rr->rr_col[c].rc_size);
+
+ src = rr->rr_col[c].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
+
+ if (c == rr->rr_firstdatacol) {
+ abd_copy(dst, src, size);
+ if (rr->rr_col[x].rc_size > size) {
+ abd_zero_off(dst, size,
+ rr->rr_col[x].rc_size - size);
+ }
+ } else {
+ ASSERT3U(size, <=, rr->rr_col[x].rc_size);
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_q_pre_func, NULL);
+ (void) abd_iterate_func(dst,
+ size, rr->rr_col[x].rc_size - size,
+ vdev_raidz_reconst_q_pre_tail_func, NULL);
+ }
+ }
+
+ src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
+ exp = 255 - (rr->rr_cols - 1 - x);
+
+ struct reconst_q_struct rq = { abd_to_buf(src), exp };
+ (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
+ vdev_raidz_reconst_q_post_func, &rq);
+
+ return (1 << VDEV_RAIDZ_Q);
+}
+
+static int
+vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
+{
+ uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
+ abd_t *pdata, *qdata;
+ uint64_t xsize, ysize;
+ int x = tgts[0];
+ int y = tgts[1];
+ abd_t *xd, *yd;
+
+ ASSERT(ntgts == 2);
+ ASSERT(x < y);
+ ASSERT(x >= rr->rr_firstdatacol);
+ ASSERT(y < rr->rr_cols);
+
+ ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
+
+ /*
+ * Move the parity data aside -- we're going to compute parity as
+ * though columns x and y were full of zeros -- Pxy and Qxy. We want to
+ * reuse the parity generation mechanism without trashing the actual
+ * parity so we make those columns appear to be full of zeros by
+ * setting their lengths to zero.
+ */
+ pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
+ qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
+ xsize = rr->rr_col[x].rc_size;
+ ysize = rr->rr_col[y].rc_size;
+
+ rr->rr_col[VDEV_RAIDZ_P].rc_abd =
+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
+ rr->rr_col[x].rc_size = 0;
+ rr->rr_col[y].rc_size = 0;
+
+ vdev_raidz_generate_parity_pq(rr);
+
+ rr->rr_col[x].rc_size = xsize;
+ rr->rr_col[y].rc_size = ysize;
+
+ p = abd_to_buf(pdata);
+ q = abd_to_buf(qdata);
+ pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ xd = rr->rr_col[x].rc_abd;
+ yd = rr->rr_col[y].rc_abd;
+
+ /*
+ * We now have:
+ * Pxy = P + D_x + D_y
+ * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
+ *
+ * We can then solve for D_x:
+ * D_x = A * (P + Pxy) + B * (Q + Qxy)
+ * where
+ * A = 2^(x - y) * (2^(x - y) + 1)^-1
+ * B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
+ *
+ * With D_x in hand, we can easily solve for D_y:
+ * D_y = P + Pxy + D_x
+ */
+
+ a = vdev_raidz_pow2[255 + x - y];
+ b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
+ tmp = 255 - vdev_raidz_log2[a ^ 1];
+
+ aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
+ bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
+
+ ASSERT3U(xsize, >=, ysize);
+ struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
+
+ (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
+ vdev_raidz_reconst_pq_func, &rpq);
+ (void) abd_iterate_func(xd, ysize, xsize - ysize,
+ vdev_raidz_reconst_pq_tail_func, &rpq);
+
+ abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+
+ /*
+ * Restore the saved parity data.
+ */
+ rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
+
+ return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
+}
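+
+/*
+ * Illustrative sketch only -- the helper below is not called by the
+ * reconstruction paths and its name is invented for this example. It
+ * shows the GF(2^8) multiplication underlying vdev_raidz_exp2() and the
+ * aexp/bexp coefficients derived in vdev_raidz_reconstruct_pq(): multiply
+ * by adding base-2 logarithms modulo 255, with zero as a special case.
+ */
+static inline uint8_t
+vdev_raidz_gf_mul_sketch(uint8_t a, uint8_t b)
+{
+	if (a == 0 || b == 0)
+		return (0);
+
+	/* log2(a) + log2(b), reduced mod 255, mapped back through pow2 */
+	int l = vdev_raidz_log2[a] + vdev_raidz_log2[b];
+	if (l >= 255)
+		l -= 255;
+
+	return (vdev_raidz_pow2[l]);
+}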
+
+/* BEGIN CSTYLED */
+/*
+ * In the general case of reconstruction, we must solve the system of linear
+ * equations defined by the coefficients used to generate parity as well as
+ * the contents of the data and parity disks. This can be expressed with
+ * vectors for the original data (D) and the actual data (d) and parity (p)
+ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
+ *
+ * __ __ __ __
+ * | | __ __ | p_0 |
+ * | V | | D_0 | | p_m-1 |
+ * | | x | : | = | d_0 |
+ * | I | | D_n-1 | | : |
+ * | | ~~ ~~ | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * I is simply a square identity matrix of size n, and V is a Vandermonde
+ * matrix defined by the coefficients we chose for the various parity columns
+ * (1, 2, 4). Note that these values were chosen for simplicity, speed of
+ * computation, and linear separability.
+ *
+ * __ __ __ __
+ * | 1 .. 1 1 1 | | p_0 |
+ * | 2^n-1 .. 4 2 1 | __ __ | : |
+ * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
+ * | 1 .. 0 0 0 | | D_1 | | d_0 |
+ * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
+ * | : : : : | | : | | d_2 |
+ * | 0 .. 1 0 0 | | D_n-1 | | : |
+ * | 0 .. 0 1 0 | ~~ ~~ | : |
+ * | 0 .. 0 0 1 | | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * Note that I, V, d, and p are known. To compute D, we must invert the
+ * matrix and use the known data and parity values to reconstruct the unknown
+ * data values. We begin by removing the rows in V|I and d|p that correspond
+ * to failed or missing columns; we then make V|I square (n x n) and d|p
+ * sized n by removing rows corresponding to unused parity from the bottom up
+ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
+ * using Gauss-Jordan elimination. In the example below we use m=3 parity
+ * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
+ * | 19 205 116 29 64 16 4 1 | / /
+ * | 1 0 0 0 0 0 0 0 | / /
+ * | 0 1 0 0 0 0 0 0 | <--' /
+ * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
+ * | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 19 205 116 29 64 16 4 1 |
+ * | 1 0 0 0 0 0 0 0 |
+ * (V|I)' = | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
+ * have carefully chosen the seed values 1, 2, and 4 to ensure that this
+ * matrix is not singular.
+ * __ __
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 0 0 1 0 0 0 0 0 |
+ * | 167 100 5 41 159 169 217 208 |
+ * | 166 100 4 40 158 168 216 209 |
+ * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
+ * of the missing data.
+ *
+ * As is apparent from the example above, the only non-trivial rows in the
+ * inverse matrix correspond to the data disks that we're trying to
+ * reconstruct. Indeed, those are the only rows we need as the others would
+ * only be useful for reconstructing data known or assumed to be valid. For
+ * that reason, we only build the coefficients in the rows that correspond to
+ * targeted columns.
+ */
+/* END CSTYLED */
+
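+/*
+ * Illustrative sketch only (the function name is invented for this
+ * example): build one row of the dispersal matrix the same way
+ * vdev_raidz_matrix_init() below does, i.e. row m holds the coefficients
+ * 2^(m*(n-1)), ..., 2^(m*1), 2^(m*0), read out of the exp table by
+ * stepping the exponent down by m (mod 255). With m = 2 and n = 8 this
+ * reproduces the "19 205 116 29 64 16 4 1" row from the example above.
+ */
+static inline void
+vdev_raidz_vandermonde_row_sketch(int m, int n, uint8_t *row)
+{
+	int pow = (m * n) % 255;
+
+	for (int j = 0; j < n; j++) {
+		pow -= m;
+		if (pow < 0)
+			pow += 255;
+		row[j] = vdev_raidz_pow2[pow];
+	}
+}
+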
+static void
+vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
+ uint8_t **rows)
+{
+ int i, j;
+ int pow;
+
+ ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
+
+ /*
+ * Fill in the missing rows of interest.
+ */
+ for (i = 0; i < nmap; i++) {
+ ASSERT3S(0, <=, map[i]);
+ ASSERT3S(map[i], <=, 2);
+
+ pow = map[i] * n;
+ if (pow > 255)
+ pow -= 255;
+ ASSERT(pow <= 255);
+
+ for (j = 0; j < n; j++) {
+ pow -= map[i];
+ if (pow < 0)
+ pow += 255;
+ rows[i][j] = vdev_raidz_pow2[pow];
+ }
+ }
+}
+
+static void
+vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
+ uint8_t **rows, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, ii, jj;
+ uint8_t log;
+
+ /*
+ * Assert that the first nmissing entries from the array of used
+ * columns correspond to parity columns and that subsequent entries
+ * correspond to data columns.
+ */
+ for (i = 0; i < nmissing; i++) {
+ ASSERT3S(used[i], <, rr->rr_firstdatacol);
+ }
+ for (; i < n; i++) {
+ ASSERT3S(used[i], >=, rr->rr_firstdatacol);
+ }
+
+ /*
+ * First initialize the storage where we'll compute the inverse rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ invrows[i][j] = (i == j) ? 1 : 0;
+ }
+ }
+
+ /*
+ * Subtract all trivial rows from the rows of consequence.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = nmissing; j < n; j++) {
+ ASSERT3U(used[j], >=, rr->rr_firstdatacol);
+ jj = used[j] - rr->rr_firstdatacol;
+ ASSERT3S(jj, <, n);
+ invrows[i][j] = rows[i][jj];
+ rows[i][jj] = 0;
+ }
+ }
+
+ /*
+ * For each of the rows of interest, we must normalize it and subtract
+ * a multiple of it from the other rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < missing[i]; j++) {
+ ASSERT0(rows[i][j]);
+ }
+ ASSERT3U(rows[i][missing[i]], !=, 0);
+
+ /*
+ * Compute the inverse of the first element and multiply each
+ * element in the row by that value.
+ */
+ log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
+ invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
+ }
+
+ for (ii = 0; ii < nmissing; ii++) {
+ if (i == ii)
+ continue;
+
+ ASSERT3U(rows[ii][missing[i]], !=, 0);
+
+ log = vdev_raidz_log2[rows[ii][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[ii][j] ^=
+ vdev_raidz_exp2(rows[i][j], log);
+ invrows[ii][j] ^=
+ vdev_raidz_exp2(invrows[i][j], log);
+ }
+ }
+ }
+
+ /*
+ * Verify that the data left in the rows forms part of an identity
+ * matrix.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ if (j == missing[i]) {
+ ASSERT3U(rows[i][j], ==, 1);
+ } else {
+ ASSERT0(rows[i][j]);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
+ int *missing, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, x, cc, c;
+ uint8_t *src;
+ uint64_t ccount;
+ uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
+ uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
+ uint8_t log = 0;
+ uint8_t val;
+ int ll;
+ uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *p, *pp;
+ size_t psize;
+
+ psize = sizeof (invlog[0][0]) * n * nmissing;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing; i++) {
+ invlog[i] = pp;
+ pp += n;
+ }
+
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ ASSERT3U(invrows[i][j], !=, 0);
+ invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
+ }
+ }
+
+ for (i = 0; i < n; i++) {
+ c = used[i];
+ ASSERT3U(c, <, rr->rr_cols);
+
+ ccount = rr->rr_col[c].rc_size;
+ ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
+ if (ccount == 0)
+ continue;
+ src = abd_to_buf(rr->rr_col[c].rc_abd);
+ for (j = 0; j < nmissing; j++) {
+ cc = missing[j] + rr->rr_firstdatacol;
+ ASSERT3U(cc, >=, rr->rr_firstdatacol);
+ ASSERT3U(cc, <, rr->rr_cols);
+ ASSERT3U(cc, !=, c);
+
+ dcount[j] = rr->rr_col[cc].rc_size;
+ if (dcount[j] != 0)
+ dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
+ }
+
+ for (x = 0; x < ccount; x++, src++) {
+ if (*src != 0)
+ log = vdev_raidz_log2[*src];
+
+ for (cc = 0; cc < nmissing; cc++) {
+ if (x >= dcount[cc])
+ continue;
+
+ if (*src == 0) {
+ val = 0;
+ } else {
+ if ((ll = log + invlog[cc][i]) >= 255)
+ ll -= 255;
+ val = vdev_raidz_pow2[ll];
+ }
+
+ if (i == 0)
+ dst[cc][x] = val;
+ else
+ dst[cc][x] ^= val;
+ }
+ }
+ }
+
+ kmem_free(p, psize);
+}
+
+static int
+vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
+{
+ int n, i, c, t, tt;
+ int nmissing_rows;
+ int missing_rows[VDEV_RAIDZ_MAXPARITY];
+ int parity_map[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *p, *pp;
+ size_t psize;
+ uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *used;
+
+ abd_t **bufs = NULL;
+
+ int code = 0;
+
+ /*
+ * Matrix reconstruction can't use scatter ABDs yet, so we allocate
+ * temporary linear ABDs if any non-linear ABDs are found.
+ */
+ for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
+ if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
+ bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
+ KM_PUSHPAGE);
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+
+ bufs[c] = col->rc_abd;
+ if (bufs[c] != NULL) {
+ col->rc_abd = abd_alloc_linear(
+ col->rc_size, B_TRUE);
+ abd_copy(col->rc_abd, bufs[c],
+ col->rc_size);
+ }
+ }
+
+ break;
+ }
+ }
+
+ n = rr->rr_cols - rr->rr_firstdatacol;
+
+ /*
+ * Figure out which data columns are missing.
+ */
+ nmissing_rows = 0;
+ for (t = 0; t < ntgts; t++) {
+ if (tgts[t] >= rr->rr_firstdatacol) {
+ missing_rows[nmissing_rows++] =
+ tgts[t] - rr->rr_firstdatacol;
+ }
+ }
+
+ /*
+ * Figure out which parity columns to use to help generate the missing
+ * data columns.
+ */
+ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
+ ASSERT(tt < ntgts);
+ ASSERT(c < rr->rr_firstdatacol);
+
+ /*
+ * Skip any targeted parity columns.
+ */
+ if (c == tgts[tt]) {
+ tt++;
+ continue;
+ }
+
+ code |= 1 << c;
+
+ parity_map[i] = c;
+ i++;
+ }
+
+ ASSERT(code != 0);
+ ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
+
+ psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
+ nmissing_rows * n + sizeof (used[0]) * n;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing_rows; i++) {
+ rows[i] = pp;
+ pp += n;
+ invrows[i] = pp;
+ pp += n;
+ }
+ used = pp;
+
+ for (i = 0; i < nmissing_rows; i++) {
+ used[i] = parity_map[i];
+ }
+
+ for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ if (tt < nmissing_rows &&
+ c == missing_rows[tt] + rr->rr_firstdatacol) {
+ tt++;
+ continue;
+ }
+
+ ASSERT3S(i, <, n);
+ used[i] = c;
+ i++;
+ }
+
+ /*
+ * Initialize the interesting rows of the matrix.
+ */
+ vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
+
+ /*
+ * Invert the matrix.
+ */
+ vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
+ invrows, used);
+
+ /*
+ * Reconstruct the missing data using the generated matrix.
+ */
+ vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
+ invrows, used);
+
+ kmem_free(p, psize);
+
+ /*
+ * copy back from temporary linear abds and free them
+ */
+ if (bufs) {
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+
+ if (bufs[c] != NULL) {
+ abd_copy(bufs[c], col->rc_abd, col->rc_size);
+ abd_free(col->rc_abd);
+ }
+ col->rc_abd = bufs[c];
+ }
+ kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
+ }
+
+ return (code);
+}
+
+static int
+vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
+ const int *t, int nt)
+{
+ int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
+ int ntgts;
+ int i, c, ret;
+ int code;
+ int nbadparity, nbaddata;
+ int parity_valid[VDEV_RAIDZ_MAXPARITY];
+
+ nbadparity = rr->rr_firstdatacol;
+ nbaddata = rr->rr_cols - nbadparity;
+ ntgts = 0;
+ for (i = 0, c = 0; c < rr->rr_cols; c++) {
+ if (c < rr->rr_firstdatacol)
+ parity_valid[c] = B_FALSE;
+
+ if (i < nt && c == t[i]) {
+ tgts[ntgts++] = c;
+ i++;
+ } else if (rr->rr_col[c].rc_error != 0) {
+ tgts[ntgts++] = c;
+ } else if (c >= rr->rr_firstdatacol) {
+ nbaddata--;
+ } else {
+ parity_valid[c] = B_TRUE;
+ nbadparity--;
+ }
+ }
+
+ ASSERT(ntgts >= nt);
+ ASSERT(nbaddata >= 0);
+ ASSERT(nbaddata + nbadparity == ntgts);
+
+ dt = &tgts[nbadparity];
+
+ /* Reconstruct using the new math implementation */
+ ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
+ if (ret != RAIDZ_ORIGINAL_IMPL)
+ return (ret);
+
+ /*
+ * See if we can use any of our optimized reconstruction routines.
+ */
+ switch (nbaddata) {
+ case 1:
+ if (parity_valid[VDEV_RAIDZ_P])
+ return (vdev_raidz_reconstruct_p(rr, dt, 1));
+
+ ASSERT(rr->rr_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_q(rr, dt, 1));
+
+ ASSERT(rr->rr_firstdatacol > 2);
+ break;
+
+ case 2:
+ ASSERT(rr->rr_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_P] &&
+ parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_pq(rr, dt, 2));
+
+ ASSERT(rr->rr_firstdatacol > 2);
+
+ break;
+ }
+
+ code = vdev_raidz_reconstruct_general(rr, tgts, ntgts);
+ ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
+ ASSERT(code > 0);
+ return (code);
+}
+
+static int
+vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ uint64_t nparity = vdrz->vd_nparity;
+ int c;
+ int lasterror = 0;
+ int numerrors = 0;
+
+ ASSERT(nparity > 0);
+
+ if (nparity > VDEV_RAIDZ_MAXPARITY ||
+ vd->vdev_children < nparity + 1) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_open_children(vd);
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error != 0) {
+ lasterror = cvd->vdev_open_error;
+ numerrors++;
+ continue;
+ }
+
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+ *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+ *physical_ashift = MAX(*physical_ashift,
+ cvd->vdev_physical_ashift);
+ }
+
+ *asize *= vd->vdev_children;
+ *max_asize *= vd->vdev_children;
+
+ if (numerrors > nparity) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+static void
+vdev_raidz_close(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c] != NULL)
+ vdev_close(vd->vdev_child[c]);
+ }
+}
+
+static uint64_t
+vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ uint64_t asize;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t cols = vdrz->vd_logical_width;
+ uint64_t nparity = vdrz->vd_nparity;
+
+ asize = ((psize - 1) >> ashift) + 1;
+ asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
+ asize = roundup(asize, nparity + 1) << ashift;
+
+ return (asize);
+}
+
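+/*
+ * Worked example (standalone sketch, not used by the vdev ops): the
+ * arithmetic of vdev_raidz_asize() above for a raidz2 vdev with 6
+ * children and ashift = 12 (4 KiB sectors). A 32 KiB psize occupies 8
+ * data sectors, parity adds 2 * ceil(8 / 4) = 4 sectors, and the total
+ * of 12 is already a multiple of nparity + 1, so the result is 48 KiB.
+ */
+static inline uint64_t
+vdev_raidz_asize_sketch(void)
+{
+	const uint64_t ashift = 12, cols = 6, nparity = 2;
+	const uint64_t psize = 32 << 10;
+
+	uint64_t asize = ((psize - 1) >> ashift) + 1;		/* 8 */
+	asize += nparity *
+	    ((asize + cols - nparity - 1) / (cols - nparity));	/* 12 */
+	asize = roundup(asize, nparity + 1) << ashift;		/* 48 KiB */
+
+	return (asize);
+}
+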
+/*
+ * The allocatable space for a raidz vdev is N * sizeof(smallest child)
+ * so each child must provide at least 1/Nth of its asize.
+ */
+static uint64_t
+vdev_raidz_min_asize(vdev_t *vd)
+{
+ return ((vd->vdev_min_asize + vd->vdev_children - 1) /
+ vd->vdev_children);
+}
+
+void
+vdev_raidz_child_done(zio_t *zio)
+{
+ raidz_col_t *rc = zio->io_private;
+
+ rc->rc_error = zio->io_error;
+ rc->rc_tried = 1;
+ rc->rc_skipped = 0;
+}
+
+static void
+vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
+{
+#ifdef ZFS_DEBUG
+ vdev_t *tvd = vd->vdev_top;
+
+ range_seg64_t logical_rs, physical_rs, remain_rs;
+ logical_rs.rs_start = rr->rr_offset;
+ logical_rs.rs_end = logical_rs.rs_start +
+ vdev_raidz_asize(vd, rr->rr_size);
+
+ raidz_col_t *rc = &rr->rr_col[col];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
+ ASSERT(vdev_xlate_is_empty(&remain_rs));
+ ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+ ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+ /*
+ * It would be nice to assert that rs_end is equal
+ * to rc_offset + rc_size but there might be an
+ * optional I/O at the end that is not accounted for
+ * in rc_size.
+ */
+ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
+ rc->rc_size + (1 << tvd->vdev_ashift));
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
+ }
+#endif
+}
+
+static void
+vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
+{
+ vdev_t *vd = zio->io_vd;
+ raidz_map_t *rm = zio->io_vsd;
+ int c, i;
+
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
+
+ /* Verify physical to logical translation */
+ vdev_raidz_io_verify(vd, rr, c);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx], rc->rc_offset,
+ rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
+ 0, vdev_raidz_child_done, rc));
+ }
+
+ /*
+ * Generate optional I/Os for skip sectors to improve aggregation
+ * contiguity.
+ */
+ for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
+ ASSERT(c <= rr->rr_scols);
+ if (c == rr->rr_scols)
+ c = 0;
+
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+}
+
+static void
+vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+
+ /*
+ * Iterate over the columns in reverse order so that we hit the parity
+ * last -- any errors along the way will force us to read the parity.
+ */
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+ if (!vdev_readable(cvd)) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(ENXIO);
+ rc->rc_tried = 1; /* don't even try */
+ rc->rc_skipped = 1;
+ continue;
+ }
+ if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(ESTALE);
+ rc->rc_skipped = 1;
+ continue;
+ }
+ if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
+ (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+ }
+}
+
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity.
+ * 3. If the column skips any sectors for padding, create optional dummy
+ * write zio children for those areas to improve aggregation continuity.
+ * - For read operations:
+ * 1. Create child zio read operations to each data column's vdev to read
+ * the range of data required for zio.
+ * 2. If this is a scrub or resilver operation, or if any of the data
+ * vdevs have had errors, then create zio read operations to the parity
+ * columns' VDevs as well.
+ */
+static void
+vdev_raidz_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ raidz_map_t *rm;
+
+ rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
+ vdrz->vd_logical_width, vdrz->vd_nparity);
+
+ /*
+ * Until raidz expansion is implemented all maps for a raidz vdev
+ * contain a single row.
+ */
+ ASSERT3U(rm->rm_nrows, ==, 1);
+ raidz_row_t *rr = rm->rm_row[0];
+
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ vdev_raidz_io_start_read(zio, rr);
+ }
+
+ zio_execute(zio);
+}
+
+/*
+ * Report a checksum error for a child of a RAID-Z device.
+ */
+static void
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
+{
+ vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
+ zio->io_priority != ZIO_PRIORITY_REBUILD) {
+ zio_bad_cksum_t zbc;
+ raidz_map_t *rm = zio->io_vsd;
+
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected = rm->rm_ecksuminjected;
+
+ int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
+ &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
+ rc->rc_abd, bad_data, &zbc);
+ if (ret != EALREADY) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ }
+}
+
+/*
+ * We keep track of whether or not there were any injected errors, so that
+ * any ereports we generate can note it.
+ */
+static int
+raidz_checksum_verify(zio_t *zio)
+{
+ zio_bad_cksum_t zbc;
+ raidz_map_t *rm = zio->io_vsd;
+
+ bzero(&zbc, sizeof (zio_bad_cksum_t));
+
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret != 0 && zbc.zbc_injected != 0)
+ rm->rm_ecksuminjected = 1;
+
+ return (ret);
+}
+
+/*
+ * Generate the parity from the data columns. If we tried and were able to
+ * read the parity without error, verify that the generated parity matches the
+ * data we read. If it doesn't, we fire off a checksum error. Return the
+ * number of such failures.
+ */
+static int
+raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
+{
+ abd_t *orig[VDEV_RAIDZ_MAXPARITY];
+ int c, ret = 0;
+ raidz_map_t *rm = zio->io_vsd;
+ raidz_col_t *rc;
+
+ blkptr_t *bp = zio->io_bp;
+ enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+ if (checksum == ZIO_CHECKSUM_NOPARITY)
+ return (ret);
+
+ for (c = 0; c < rr->rr_firstdatacol; c++) {
+ rc = &rr->rr_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+
+ orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
+ abd_copy(orig[c], rc->rc_abd, rc->rc_size);
+ }
+
+ /*
+ * Regenerates parity even for !tried||rc_error!=0 columns. This
+ * isn't harmful but it does have the side effect of fixing stuff
+ * we didn't realize was necessary (i.e. even if we return 0).
+ */
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ for (c = 0; c < rr->rr_firstdatacol; c++) {
+ rc = &rr->rr_col[c];
+
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+
+ if (abd_cmp(orig[c], rc->rc_abd) != 0) {
+ raidz_checksum_error(zio, rc, orig[c]);
+ rc->rc_error = SET_ERROR(ECKSUM);
+ ret++;
+ }
+ abd_free(orig[c]);
+ }
+
+ return (ret);
+}
+
+static int
+vdev_raidz_worst_error(raidz_row_t *rr)
+{
+ int error = 0;
+
+ for (int c = 0; c < rr->rr_cols; c++)
+ error = zio_worst_error(error, rr->rr_col[c].rc_error);
+
+ return (error);
+}
+
+static void
+vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
+{
+ int unexpected_errors = 0;
+ int parity_errors = 0;
+ int parity_untried = 0;
+ int data_errors = 0;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error) {
+ if (c < rr->rr_firstdatacol)
+ parity_errors++;
+ else
+ data_errors++;
+
+ if (!rc->rc_skipped)
+ unexpected_errors++;
+ } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
+ parity_untried++;
+ }
+ }
+
+ /*
+ * If we read more parity disks than were used for
+ * reconstruction, confirm that the other parity disks produced
+ * correct data.
+ *
+ * Note that we also regenerate parity when resilvering so we
+ * can write it out to failed devices later.
+ */
+ if (parity_errors + parity_untried <
+ rr->rr_firstdatacol - data_errors ||
+ (zio->io_flags & ZIO_FLAG_RESILVER)) {
+ int n = raidz_parity_verify(zio, rr);
+ unexpected_errors += n;
+ ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol);
+ }
+
+ if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
+ (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ */
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if ((rc->rc_error == 0 || rc->rc_size == 0) &&
+ (rc->rc_repair == 0)) {
+ continue;
+ }
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ ZIO_TYPE_WRITE,
+ zio->io_priority == ZIO_PRIORITY_REBUILD ?
+ ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
+ }
+ }
+}
+
+static void
+raidz_restore_orig_data(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_need_orig_restore) {
+ abd_copy_from_buf(rc->rc_abd,
+ rc->rc_orig_data, rc->rc_size);
+ rc->rc_need_orig_restore = B_FALSE;
+ }
+ }
+ }
+}
+
+/*
+ * returns EINVAL if reconstruction of the block will not be possible
+ * returns ECKSUM if this specific reconstruction failed
+ * returns 0 on successful reconstruction
+ */
+static int
+raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ /* Reconstruct each row */
+ for (int r = 0; r < rm->rm_nrows; r++) {
+ raidz_row_t *rr = rm->rm_row[r];
+ int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
+ int t = 0;
+ int dead = 0;
+ int dead_data = 0;
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ ASSERT0(rc->rc_need_orig_restore);
+ if (rc->rc_error != 0) {
+ dead++;
+ if (c >= nparity)
+ dead_data++;
+ continue;
+ }
+ if (rc->rc_size == 0)
+ continue;
+ for (int lt = 0; lt < ntgts; lt++) {
+ if (rc->rc_devidx == ltgts[lt]) {
+ if (rc->rc_orig_data == NULL) {
+ rc->rc_orig_data =
+ zio_buf_alloc(rc->rc_size);
+ abd_copy_to_buf(
+ rc->rc_orig_data,
+ rc->rc_abd, rc->rc_size);
+ }
+ rc->rc_need_orig_restore = B_TRUE;
+
+ dead++;
+ if (c >= nparity)
+ dead_data++;
+ my_tgts[t++] = c;
+ break;
+ }
+ }
+ }
+ if (dead > nparity) {
+ /* reconstruction not possible */
+ raidz_restore_orig_data(rm);
+ return (EINVAL);
+ }
+ rr->rr_code = 0;
+ if (dead_data > 0)
+ rr->rr_code = vdev_raidz_reconstruct_row(rm, rr,
+ my_tgts, t);
+ }
+
+ /* Check for success */
+ if (raidz_checksum_verify(zio) == 0) {
+
+ /* Reconstruction succeeded - report errors */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_need_orig_restore) {
+ /*
+ * Note: if this is a parity column,
+ * we don't really know if it's wrong.
+ * We need to let
+ * vdev_raidz_io_done_verified() check
+ * it, and if we set rc_error, it will
+ * think that it is a "known" error
+ * that doesn't need to be checked
+ * or corrected.
+ */
+ if (rc->rc_error == 0 &&
+ c >= rr->rr_firstdatacol) {
+ raidz_checksum_error(zio,
+ rc, rc->rc_gdata);
+ rc->rc_error =
+ SET_ERROR(ECKSUM);
+ }
+ rc->rc_need_orig_restore = B_FALSE;
+ }
+ }
+
+ vdev_raidz_io_done_verified(zio, rr);
+ }
+
+ zio_checksum_verified(zio);
+
+ return (0);
+ }
+
+ /* Reconstruction failed - restore original data */
+ raidz_restore_orig_data(rm);
+ return (ECKSUM);
+}
+
+/*
+ * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same if column 4
+ * is targeted as invalid as if columns 1 and 4 are targeted since in both
+ * cases we'd only use parity information in column 0.
+ *
+ * The order that we find the various possible combinations of failed
+ * disks is dictated by these rules:
+ * - Examine each "slot" (the "i" in tgts[i])
+ * - Try to increment this slot (tgts[i] = tgts[i] + 1)
+ * - if we can't increment because it runs into the next slot,
+ * reset our slot to the minimum, and examine the next slot
+ *
+ * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
+ * 3 columns to reconstruct), we will generate the following sequence:
+ *
+ * STATE ACTION
+ * 0 1 2 special case: skip since these are all parity
+ * 0 1 3 first slot: reset to 0; middle slot: increment to 2
+ * 0 2 3 first slot: increment to 1
+ * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
+ * 0 1 4 first: reset to 0; middle: increment to 2
+ * 0 2 4 first: increment to 1
+ * 1 2 4 first: reset to 0; middle: increment to 3
+ * 0 3 4 first: increment to 1
+ * 1 3 4 first: increment to 2
+ * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
+ * 0 1 5 first: reset to 0; middle: increment to 2
+ * 0 2 5 first: increment to 1
+ * 1 2 5 first: reset to 0; middle: increment to 3
+ * 0 3 5 first: increment to 1
+ * 1 3 5 first: increment to 2
+ * 2 3 5 first: reset to 0; middle: increment to 4
+ * 0 4 5 first: increment to 1
+ * 1 4 5 first: increment to 2
+ * 2 4 5 first: increment to 3
+ * 3 4 5 done
+ *
+ * This strategy works for dRAID but is less efficient when there are a large
+ * number of child vdevs and therefore many permutations to check. Furthermore,
+ * since the raidz_map_t rows likely do not overlap, reconstruction would be
+ * possible as long as there are no more than nparity data errors per row.
+ * These additional permutations are not currently checked but could be as
+ * a future improvement.
+ */
+static int
+vdev_raidz_combrec(zio_t *zio)
+{
+ int nparity = vdev_get_nparity(zio->io_vd);
+ raidz_map_t *rm = zio->io_vsd;
+
+ /* Check if there's enough data to attempt reconstruction. */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ int total_errors = 0;
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ if (rr->rr_col[c].rc_error)
+ total_errors++;
+ }
+
+ if (total_errors > nparity)
+ return (vdev_raidz_worst_error(rr));
+ }
+
+ for (int num_failures = 1; num_failures <= nparity; num_failures++) {
+ int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+ int *ltgts = &tstore[1]; /* value is logical child ID */
+
+ /* Determine number of logical children, n */
+ int n = zio->io_vd->vdev_children;
+
+ ASSERT3U(num_failures, <=, nparity);
+ ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
+
+ /* Handle corner cases in combrec logic */
+ ltgts[-1] = -1;
+ for (int i = 0; i < num_failures; i++) {
+ ltgts[i] = i;
+ }
+ ltgts[num_failures] = n;
+
+ for (;;) {
+ int err = raidz_reconstruct(zio, ltgts, num_failures,
+ nparity);
+ if (err == EINVAL) {
+ /*
+ * Reconstruction not possible with this #
+ * failures; try more failures.
+ */
+ break;
+ } else if (err == 0)
+ return (0);
+
+ /* Compute next targets to try */
+ for (int t = 0; ; t++) {
+ ASSERT3U(t, <, num_failures);
+ ltgts[t]++;
+ if (ltgts[t] == n) {
+ /* try more failures */
+ ASSERT3U(t, ==, num_failures - 1);
+ break;
+ }
+
+ ASSERT3U(ltgts[t], <, n);
+ ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
+
+ /*
+ * If that spot is available, we're done here.
+ * Try the next combination.
+ */
+ if (ltgts[t] != ltgts[t + 1])
+ break;
+
+ /*
+ * Otherwise, reset this tgt to the minimum,
+ * and move on to the next tgt.
+ */
+ ltgts[t] = ltgts[t - 1] + 1;
+ ASSERT3U(ltgts[t], ==, t);
+ }
+
+ /* Increase the number of failures and keep trying. */
+ if (ltgts[num_failures - 1] == n)
+ break;
+ }
+ }
+
+ return (ECKSUM);
+}
+
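+/*
+ * Standalone sketch of the slot increment/reset rule described in the
+ * comment above vdev_raidz_combrec(); the function and its visit()
+ * callback are invented for this example and are not part of the I/O
+ * path. For nfail = 3 and n = 6 children it walks the target slots
+ * through the same sequence of states shown in that comment.
+ */
+static inline void
+raidz_combrec_enumerate_sketch(int n, int nfail,
+    void (*visit)(const int *, int))
+{
+	int tgts[VDEV_RAIDZ_MAXPARITY + 1];
+
+	for (int i = 0; i < nfail; i++)
+		tgts[i] = i;
+	tgts[nfail] = n;			/* sentinel, like ltgts[nfail] */
+
+	for (;;) {
+		visit(tgts, nfail);
+
+		/* Advance to the next combination, odometer style. */
+		for (int t = 0; ; t++) {
+			tgts[t]++;
+			if (tgts[t] == n)	/* last slot ran off the end */
+				break;
+			if (tgts[t] != tgts[t + 1])
+				break;		/* found a free spot */
+			/* Collision: reset this slot and carry to the next. */
+			tgts[t] = (t == 0) ? 0 : tgts[t - 1] + 1;
+		}
+		if (tgts[nfail - 1] == n)
+			break;			/* all combinations visited */
+	}
+}
+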
+void
+vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
+{
+ for (uint64_t row = 0; row < rm->rm_nrows; row++) {
+ raidz_row_t *rr = rm->rm_row[row];
+ vdev_raidz_reconstruct_row(rm, rr, t, nt);
+ }
+}
+
+/*
+ * Complete a write IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * 1. Check for errors on the child IOs.
+ * 2. Return, setting an error code if too few child VDevs were written
+ * to reconstruct the data later. Note that partial writes are
+ * considered successful if they can be reconstructed at all.
+ */
+static void
+vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
+{
+ int total_errors = 0;
+
+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error) {
+ ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
+
+ total_errors++;
+ }
+ }
+
+ /*
+ * Treat partial writes as a success. If we couldn't write enough
+ * columns to reconstruct the data, the I/O failed. Otherwise,
+ * good enough.
+ *
+ * Now that we support write reallocation, it would be better
+ * to treat partial failure as real failure unless there are
+ * no non-degraded top-level vdevs left, and not update DTLs
+ * if we intend to reallocate.
+ */
+ if (total_errors > rr->rr_firstdatacol) {
+ zio->io_error = zio_worst_error(zio->io_error,
+ vdev_raidz_worst_error(rr));
+ }
+}
+
+/*
+ * return 0 if no reconstruction occurred, otherwise the "code" from
+ * vdev_raidz_reconstruct().
+ */
+static int
+vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
+ raidz_row_t *rr)
+{
+ int parity_errors = 0;
+ int parity_untried = 0;
+ int data_errors = 0;
+ int total_errors = 0;
+ int code = 0;
+
+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error) {
+ ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
+
+ if (c < rr->rr_firstdatacol)
+ parity_errors++;
+ else
+ data_errors++;
+
+ total_errors++;
+ } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
+ parity_untried++;
+ }
+ }
+
+ /*
+ * If there were data errors and the number of errors we saw was
+ * correctable -- less than or equal to the number of parity disks read
+ * -- reconstruct based on the missing data.
+ */
+ if (data_errors != 0 &&
+ total_errors <= rr->rr_firstdatacol - parity_untried) {
+ /*
+ * We either attempt to read all the parity columns or
+ * none of them. If we didn't try to read parity, we
+ * wouldn't be here in the correctable case. There must
+ * also have been fewer parity errors than parity
+ * columns or, again, we wouldn't be in this code path.
+ */
+ ASSERT(parity_untried == 0);
+ ASSERT(parity_errors < rr->rr_firstdatacol);
+
+ /*
+ * Identify the data columns that reported an error.
+ */
+ int n = 0;
+ int tgts[VDEV_RAIDZ_MAXPARITY];
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_error != 0) {
+ ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+ tgts[n++] = c;
+ }
+ }
+
+ ASSERT(rr->rr_firstdatacol >= n);
+
+ code = vdev_raidz_reconstruct_row(rm, rr, tgts, n);
+ }
+
+ return (code);
+}
+
+/*
+ * Return the number of reads issued.
+ */
+static int
+vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+ int nread = 0;
+
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+
+ /*
+ * If this rows contains empty sectors which are not required
+ * for a normal read then allocate an ABD for them now so they
+ * may be read, verified, and any needed repairs performed.
+ */
+ if (rr->rr_nempty && rr->rr_abd_empty == NULL)
+ vdev_draid_map_alloc_empty(zio, rr);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_tried || rc->rc_size == 0)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx],
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ nread++;
+ }
+ return (nread);
+}
+
+/*
+ * We're here because either there were too many errors to even attempt
+ * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
+ * failed. In either case, there is enough bad data to prevent reconstruction.
+ * Start checksum ereports for all children which haven't failed.
+ */
+static void
+vdev_raidz_io_done_unrecoverable(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error != 0)
+ continue;
+
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected = rm->rm_ecksuminjected;
+
+ int ret = zfs_ereport_start_checksum(zio->io_spa,
+ cvd, &zio->io_bookmark, zio, rc->rc_offset,
+ rc->rc_size, (void *)(uintptr_t)c, &zbc);
+ if (ret != EALREADY) {
+ mutex_enter(&cvd->vdev_stat_lock);
+ cvd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&cvd->vdev_stat_lock);
+ }
+ }
+ }
+}
+
+void
+vdev_raidz_io_done(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
+ }
+ } else {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ rr->rr_code =
+ vdev_raidz_io_done_reconstruct_known_missing(zio,
+ rm, rr);
+ }
+
+ if (raidz_checksum_verify(zio) == 0) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ vdev_raidz_io_done_verified(zio, rr);
+ }
+ zio_checksum_verified(zio);
+ } else {
+ /*
+ * A sequential resilver has no checksum which makes
+ * combinatoral reconstruction impossible. This code
+ * path is unreachable since raidz_checksum_verify()
+ * has no checksum to verify and must succeed.
+ */
+ ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
+
+ /*
+ * This isn't a typical situation -- either we got a
+ * read error or a child silently returned bad data.
+ * Read every block so we can try again with as much
+ * data and parity as we can track down. If we've
+ * already been through once before, all children will
+ * be marked as tried so we'll proceed to combinatorial
+ * reconstruction.
+ */
+ int nread = 0;
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ nread += vdev_raidz_read_all(zio,
+ rm->rm_row[i]);
+ }
+ if (nread != 0) {
+ /*
+ * Normally our stage is VDEV_IO_DONE, but if
+ * we've already called redone(), it will have
+ * changed to VDEV_IO_START, in which case we
+ * don't want to call redone() again.
+ */
+ if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
+ zio_vdev_io_redone(zio);
+ return;
+ }
+
+ zio->io_error = vdev_raidz_combrec(zio);
+ if (zio->io_error == ECKSUM &&
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ vdev_raidz_io_done_unrecoverable(zio);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ if (faulted > vdrz->vd_nparity)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+/*
+ * Determine if any portion of the provided block resides on a child vdev
+ * with a dirty DTL and therefore needs to be resilvered. The function
+ * assumes that at least one DTL is dirty which implies that full stripe
+ * width blocks must be resilvered.
+ */
+static boolean_t
+vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ uint64_t dcols = vd->vdev_children;
+ uint64_t nparity = vdrz->vd_nparity;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
+ uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = ((psize - 1) >> ashift) + 1;
+ /* The first column for this stripe. */
+ uint64_t f = b % dcols;
+
+ /* Unreachable by sequential resilver. */
+ ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
+
+ if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+ return (B_FALSE);
+
+ if (s + nparity >= dcols)
+ return (B_TRUE);
+
+ for (uint64_t c = 0; c < s + nparity; c++) {
+ uint64_t devidx = (f + c) % dcols;
+ vdev_t *cvd = vd->vdev_child[devidx];
+
+ /*
+ * dsl_scan_need_resilver() already checked vd with
+ * vdev_dtl_contains(). So here just check cvd with
+ * vdev_dtl_empty(), cheaper and a good approximation.
+ */
+ if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+static void
+vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+{
+ vdev_t *raidvd = cvd->vdev_parent;
+ ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
+
+ uint64_t width = raidvd->vdev_children;
+ uint64_t tgt_col = cvd->vdev_id;
+ uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+
+ /* make sure the offsets are block-aligned */
+ ASSERT0(logical_rs->rs_start % (1 << ashift));
+ ASSERT0(logical_rs->rs_end % (1 << ashift));
+ uint64_t b_start = logical_rs->rs_start >> ashift;
+ uint64_t b_end = logical_rs->rs_end >> ashift;
+
+ uint64_t start_row = 0;
+ if (b_start > tgt_col) /* avoid underflow */
+ start_row = ((b_start - tgt_col - 1) / width) + 1;
+
+ uint64_t end_row = 0;
+ if (b_end > tgt_col)
+ end_row = ((b_end - tgt_col - 1) / width) + 1;
+
+ physical_rs->rs_start = start_row << ashift;
+ physical_rs->rs_end = end_row << ashift;
+
+ ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
+ ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
+ logical_rs->rs_end - logical_rs->rs_start);
+}
+
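+/*
+ * Worked example (sketch only, not wired into the vdev ops): the row
+ * arithmetic of vdev_raidz_xlate() above for a 4-wide raidz parent with
+ * ashift = 9 and child column 1. The logical range [0, 8192) covers
+ * parent sectors 0..15, which maps to child rows [0, 4), i.e. a physical
+ * range of [0, 2048) on that child.
+ */
+static inline uint64_t
+vdev_raidz_xlate_sketch(void)
+{
+	const uint64_t width = 4, tgt_col = 1, ashift = 9;
+	const uint64_t b_start = 0, b_end = 8192 >> ashift;	/* 0, 16 */
+
+	uint64_t start_row = (b_start > tgt_col) ?
+	    ((b_start - tgt_col - 1) / width) + 1 : 0;		/* 0 */
+	uint64_t end_row = (b_end > tgt_col) ?
+	    ((b_end - tgt_col - 1) / width) + 1 : 0;		/* 4 */
+
+	return ((end_row - start_row) << ashift);		/* 2048 */
+}
+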
+/*
+ * Initialize private RAIDZ specific fields from the nvlist.
+ */
+static int
+vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+ vdev_raidz_t *vdrz;
+ uint64_t nparity;
+
+ uint_t children;
+ nvlist_t **child;
+ int error = nvlist_lookup_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, &child, &children);
+ if (error != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Previous versions could only support 1 or 2 parity
+ * devices.
+ */
+ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
+ return (SET_ERROR(EINVAL));
+ else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
+ return (SET_ERROR(EINVAL));
+ } else {
+ /*
+ * We require the parity to be specified for SPAs that
+ * support multiple parity levels.
+ */
+ if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Otherwise, we default to 1 parity device for RAID-Z.
+ */
+ nparity = 1;
+ }
+
+ vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
+ vdrz->vd_logical_width = children;
+ vdrz->vd_nparity = nparity;
+
+ *tsd = vdrz;
+
+ return (0);
+}
+
+static void
+vdev_raidz_fini(vdev_t *vd)
+{
+ kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
+}
+
+/*
+ * Add RAIDZ specific fields to the config nvlist.
+ */
+static void
+vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+ ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+
+ /*
+ * Make sure someone hasn't managed to sneak a fancy new vdev
+ * into a crufty old storage pool.
+ */
+ ASSERT(vdrz->vd_nparity == 1 ||
+ (vdrz->vd_nparity <= 2 &&
+ spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
+ (vdrz->vd_nparity <= 3 &&
+ spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
+
+ /*
+ * Note that we'll add these even on storage pools where they
+ * aren't strictly required -- older software will just ignore
+ * them.
+ */
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
+}
+
+static uint64_t
+vdev_raidz_nparity(vdev_t *vd)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ return (vdrz->vd_nparity);
+}
+
+static uint64_t
+vdev_raidz_ndisks(vdev_t *vd)
+{
+ return (vd->vdev_children);
+}
+
+vdev_ops_t vdev_raidz_ops = {
+ .vdev_op_init = vdev_raidz_init,
+ .vdev_op_fini = vdev_raidz_fini,
+ .vdev_op_open = vdev_raidz_open,
+ .vdev_op_close = vdev_raidz_close,
+ .vdev_op_asize = vdev_raidz_asize,
+ .vdev_op_min_asize = vdev_raidz_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_raidz_io_start,
+ .vdev_op_io_done = vdev_raidz_io_done,
+ .vdev_op_state_change = vdev_raidz_state_change,
+ .vdev_op_need_resilver = vdev_raidz_need_resilver,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_raidz_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = vdev_raidz_config_generate,
+ .vdev_op_nparity = vdev_raidz_nparity,
+ .vdev_op_ndisks = vdev_raidz_ndisks,
+ .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c
new file mode 100644
index 000000000000..25d76970e99a
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c
@@ -0,0 +1,666 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/zio.h>
+#include <sys/debug.h>
+#include <sys/zfs_debug.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <sys/simd.h>
+
+/* Opaque implementation with NULL methods to represent original methods */
+static const raidz_impl_ops_t vdev_raidz_original_impl = {
+ .name = "original",
+ .is_supported = raidz_will_scalar_work,
+};
+
+/* RAIDZ parity op that contains the fastest methods */
+static raidz_impl_ops_t vdev_raidz_fastest_impl = {
+ .name = "fastest"
+};
+
+/* All compiled in implementations */
+const raidz_impl_ops_t *raidz_all_maths[] = {
+ &vdev_raidz_original_impl,
+ &vdev_raidz_scalar_impl,
+#if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */
+ &vdev_raidz_sse2_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */
+ &vdev_raidz_ssse3_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
+ &vdev_raidz_avx2_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */
+ &vdev_raidz_avx512f_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */
+ &vdev_raidz_avx512bw_impl,
+#endif
+#if defined(__aarch64__) && !defined(__FreeBSD__)
+ &vdev_raidz_aarch64_neon_impl,
+ &vdev_raidz_aarch64_neonx2_impl,
+#endif
+#if defined(__powerpc__) && defined(__altivec__)
+ &vdev_raidz_powerpc_altivec_impl,
+#endif
+};
+
+/* Indicate that benchmark has been completed */
+static boolean_t raidz_math_initialized = B_FALSE;
+
+/* Select raidz implementation */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX - 1)
+#define IMPL_ORIGINAL (0)
+#define IMPL_SCALAR (1)
+
+#define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i))
+
+static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
+static uint32_t user_sel_impl = IMPL_FASTEST;
+
+/* Hold all supported implementations */
+static size_t raidz_supp_impl_cnt = 0;
+static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
+
+#if defined(_KERNEL)
+/*
+ * kstats values for supported implementations
+ * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
+ */
+static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
+
+/* kstat for benchmarked implementations */
+static kstat_t *raidz_math_kstat = NULL;
+#endif
+
+/*
+ * Returns the RAIDZ operations for raidz_map() parity calculations. When
+ * a SIMD implementation is not allowed in the current context, fall back
+ * to the fastest generic implementation.
+ */
+const raidz_impl_ops_t *
+vdev_raidz_math_get_ops(void)
+{
+ if (!kfpu_allowed())
+ return (&vdev_raidz_scalar_impl);
+
+ raidz_impl_ops_t *ops = NULL;
+ const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
+
+ switch (impl) {
+ case IMPL_FASTEST:
+ ASSERT(raidz_math_initialized);
+ ops = &vdev_raidz_fastest_impl;
+ break;
+ case IMPL_CYCLE:
+ /* Cycle through all supported implementations */
+ ASSERT(raidz_math_initialized);
+ ASSERT3U(raidz_supp_impl_cnt, >, 0);
+ static size_t cycle_impl_idx = 0;
+ size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
+ ops = raidz_supp_impl[idx];
+ break;
+ case IMPL_ORIGINAL:
+ ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
+ break;
+ case IMPL_SCALAR:
+ ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl;
+ break;
+ default:
+ ASSERT3U(impl, <, raidz_supp_impl_cnt);
+ ASSERT3U(raidz_supp_impl_cnt, >, 0);
+ if (impl < ARRAY_SIZE(raidz_all_maths))
+ ops = raidz_supp_impl[impl];
+ break;
+ }
+
+ ASSERT3P(ops, !=, NULL);
+
+ return (ops);
+}
+
+/*
+ * Select parity generation method for raidz_map
+ */
+int
+vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr)
+{
+ raidz_gen_f gen_parity = NULL;
+
+ switch (raidz_parity(rm)) {
+ case 1:
+ gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
+ break;
+ case 2:
+ gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
+ break;
+ case 3:
+ gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
+ break;
+ default:
+ gen_parity = NULL;
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
+ raidz_parity(rm));
+ break;
+ }
+
+ /* if method is NULL execute the original implementation */
+ if (gen_parity == NULL)
+ return (RAIDZ_ORIGINAL_IMPL);
+
+ gen_parity(rr);
+
+ return (0);
+}
+
+static raidz_rec_f
+reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
+ const int nbaddata)
+{
+ if (nbaddata == 1 && parity_valid[CODE_P]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_P]);
+ }
+ return ((raidz_rec_f) NULL);
+}
+
+static raidz_rec_f
+reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid,
+ const int nbaddata)
+{
+ if (nbaddata == 1) {
+ if (parity_valid[CODE_P]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_P]);
+ } else if (parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_Q]);
+ }
+ } else if (nbaddata == 2 &&
+ parity_valid[CODE_P] && parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
+ }
+ return ((raidz_rec_f) NULL);
+}
+
+static raidz_rec_f
+reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
+ const int nbaddata)
+{
+ if (nbaddata == 1) {
+ if (parity_valid[CODE_P]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_P]);
+ } else if (parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_Q]);
+ } else if (parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_R]);
+ }
+ } else if (nbaddata == 2) {
+ if (parity_valid[CODE_P] && parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
+ } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PR]);
+ } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_QR]);
+ }
+ } else if (nbaddata == 3 &&
+ parity_valid[CODE_P] && parity_valid[CODE_Q] &&
+ parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
+ }
+ return ((raidz_rec_f) NULL);
+}
+
+/*
+ * Select data reconstruction method for raidz_map
+ * @parity_valid - Parity validity flag
+ * @dt - Failed data index array
+ * @nbaddata - Number of failed data columns
+ */
+int
+vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr,
+ const int *parity_valid, const int *dt, const int nbaddata)
+{
+ raidz_rec_f rec_fn = NULL;
+
+ switch (raidz_parity(rm)) {
+ case PARITY_P:
+ rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
+ break;
+ case PARITY_PQ:
+ rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
+ break;
+ case PARITY_PQR:
+ rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
+ break;
+ default:
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
+ raidz_parity(rm));
+ break;
+ }
+
+ if (rec_fn == NULL)
+ return (RAIDZ_ORIGINAL_IMPL);
+ else
+ return (rec_fn(rr, dt));
+}
+
+const char *raidz_gen_name[] = {
+ "gen_p", "gen_pq", "gen_pqr"
+};
+const char *raidz_rec_name[] = {
+ "rec_p", "rec_q", "rec_r",
+ "rec_pq", "rec_pr", "rec_qr", "rec_pqr"
+};
+
+#if defined(_KERNEL)
+
+#define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1)
+
+static int
+raidz_math_kstat_headers(char *buf, size_t size)
+{
+ int i;
+ ssize_t off;
+
+ ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN);
+
+ off = snprintf(buf, size, "%-17s", "implementation");
+
+ for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++)
+ off += snprintf(buf + off, size - off, "%-16s",
+ raidz_gen_name[i]);
+
+ for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++)
+ off += snprintf(buf + off, size - off, "%-16s",
+ raidz_rec_name[i]);
+
+ (void) snprintf(buf + off, size - off, "\n");
+
+ return (0);
+}
+
+static int
+raidz_math_kstat_data(char *buf, size_t size, void *data)
+{
+ raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
+ raidz_impl_kstat_t *cstat = (raidz_impl_kstat_t *)data;
+ ssize_t off = 0;
+ int i;
+
+ ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN);
+
+ if (cstat == fstat) {
+ off += snprintf(buf + off, size - off, "%-17s", "fastest");
+
+ for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) {
+ int id = fstat->gen[i];
+ off += snprintf(buf + off, size - off, "%-16s",
+ raidz_supp_impl[id]->name);
+ }
+ for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) {
+ int id = fstat->rec[i];
+ off += snprintf(buf + off, size - off, "%-16s",
+ raidz_supp_impl[id]->name);
+ }
+ } else {
+ ptrdiff_t id = cstat - raidz_impl_kstats;
+
+ off += snprintf(buf + off, size - off, "%-17s",
+ raidz_supp_impl[id]->name);
+
+ for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++)
+ off += snprintf(buf + off, size - off, "%-16llu",
+ (u_longlong_t)cstat->gen[i]);
+
+ for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++)
+ off += snprintf(buf + off, size - off, "%-16llu",
+ (u_longlong_t)cstat->rec[i]);
+ }
+
+ (void) snprintf(buf + off, size - off, "\n");
+
+ return (0);
+}
+
+static void *
+raidz_math_kstat_addr(kstat_t *ksp, loff_t n)
+{
+ if (n <= raidz_supp_impl_cnt)
+ ksp->ks_private = (void *) (raidz_impl_kstats + n);
+ else
+ ksp->ks_private = NULL;
+
+ return (ksp->ks_private);
+}
+
+#define BENCH_D_COLS (8ULL)
+#define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)
+#define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */
+#define BENCH_NS MSEC2NSEC(1) /* 1ms */
+
+typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
+
+static void
+benchmark_gen_impl(raidz_map_t *rm, const int fn)
+{
+ (void) fn;
+ vdev_raidz_generate_parity(rm);
+}
+
+static void
+benchmark_rec_impl(raidz_map_t *rm, const int fn)
+{
+ static const int rec_tgt[7][3] = {
+ {1, 2, 3}, /* rec_p: bad QR & D[0] */
+ {0, 2, 3}, /* rec_q: bad PR & D[0] */
+ {0, 1, 3}, /* rec_r: bad PQ & D[0] */
+ {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
+ {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
+ {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
+ {3, 4, 5} /* rec_pqr: bad D[0][1][2] */
+ };
+
+ vdev_raidz_reconstruct(rm, rec_tgt[fn], 3);
+}
+
+/*
+ * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
+ * is performed by setting the rm_ops pointer and calling the top level
+ * generate/reconstruct methods of bench_rm.
+ */
+static void
+benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
+{
+ uint64_t run_cnt, speed, best_speed = 0;
+ hrtime_t t_start, t_diff;
+ raidz_impl_ops_t *curr_impl;
+ raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
+ int impl, i;
+
+ for (impl = 0; impl < raidz_supp_impl_cnt; impl++) {
+ /* set an implementation to benchmark */
+ curr_impl = raidz_supp_impl[impl];
+ bench_rm->rm_ops = curr_impl;
+
+ run_cnt = 0;
+ t_start = gethrtime();
+
+ do {
+ for (i = 0; i < 5; i++, run_cnt++)
+ bench_fn(bench_rm, fn);
+
+ t_diff = gethrtime() - t_start;
+ } while (t_diff < BENCH_NS);
+
+ speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC;
+ speed /= (t_diff * BENCH_COLS);
+
+ if (bench_fn == benchmark_gen_impl)
+ raidz_impl_kstats[impl].gen[fn] = speed;
+ else
+ raidz_impl_kstats[impl].rec[fn] = speed;
+
+ /* Update fastest implementation method */
+ if (speed > best_speed) {
+ best_speed = speed;
+
+ if (bench_fn == benchmark_gen_impl) {
+ fstat->gen[fn] = impl;
+ vdev_raidz_fastest_impl.gen[fn] =
+ curr_impl->gen[fn];
+ } else {
+ fstat->rec[fn] = impl;
+ vdev_raidz_fastest_impl.rec[fn] =
+ curr_impl->rec[fn];
+ }
+ }
+ }
+}
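+
+/*
+ * Worked example of the speed calculation above (illustrative numbers):
+ * with BENCH_ZIO_SIZE = 128 KiB (131072 bytes), BENCH_COLS = 8 + 3 = 11,
+ * run_cnt = 5000 and t_diff = 1,000,000 ns, the recorded value is
+ *
+ *	speed = 5000 * 131072 * NANOSEC / (1,000,000 * 11)
+ *	      = 5000 * 131072 * 1,000,000,000 / 11,000,000
+ *	      ~= 59.6e9
+ *
+ * i.e. a normalized throughput in bytes per second, which is what the
+ * vdev_raidz_bench kstat records for each implementation and method.
+ */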
+#endif
+
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+benchmark_raidz(void)
+{
+ raidz_impl_ops_t *curr_impl;
+ int i, c;
+
+ /* Move supported impl into raidz_supp_impl */
+ for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+ curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
+
+ if (curr_impl->init)
+ curr_impl->init();
+
+ if (curr_impl->is_supported())
+ raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl;
+ }
+ membar_producer(); /* complete raidz_supp_impl[] init */
+ raidz_supp_impl_cnt = c; /* number of supported impl */
+
+#if defined(_KERNEL)
+ zio_t *bench_zio = NULL;
+ raidz_map_t *bench_rm = NULL;
+ uint64_t bench_parity;
+
+ /* Fake a zio and run the benchmark on a warmed up buffer */
+ bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
+ bench_zio->io_offset = 0;
+ bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
+ bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
+ memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
+
+ /* Benchmark parity generation methods */
+ for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+ bench_parity = fn + 1;
+ /* New raidz_map is needed for each generate_p/q/r */
+ bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
+ BENCH_D_COLS + bench_parity, bench_parity);
+
+ benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);
+
+ vdev_raidz_map_free(bench_rm);
+ }
+
+ /* Benchmark data reconstruction methods */
+ bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
+ BENCH_COLS, PARITY_PQR);
+
+ for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
+ benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
+
+ vdev_raidz_map_free(bench_rm);
+
+ /* cleanup the bench zio */
+ abd_free(bench_zio->io_abd);
+ kmem_free(bench_zio, sizeof (zio_t));
+#else
+ /*
+ * Skip the benchmark in user space to avoid impacting libzpool
+ * consumers (zdb, zhack, zinject, ztest). The last implementation
+ * is assumed to be the fastest and is used by default.
+ */
+ memcpy(&vdev_raidz_fastest_impl,
+ raidz_supp_impl[raidz_supp_impl_cnt - 1],
+ sizeof (vdev_raidz_fastest_impl));
+ strcpy(vdev_raidz_fastest_impl.name, "fastest");
+#endif /* _KERNEL */
+}
+
+void
+vdev_raidz_math_init(void)
+{
+ /* Determine the fastest available implementation. */
+ benchmark_raidz();
+
+#if defined(_KERNEL)
+ /* Install kstats for all implementations */
+ raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ if (raidz_math_kstat != NULL) {
+ raidz_math_kstat->ks_data = NULL;
+ raidz_math_kstat->ks_ndata = UINT32_MAX;
+ kstat_set_raw_ops(raidz_math_kstat,
+ raidz_math_kstat_headers,
+ raidz_math_kstat_data,
+ raidz_math_kstat_addr);
+ kstat_install(raidz_math_kstat);
+ }
+#endif
+
+ /* Finish initialization */
+ atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
+ raidz_math_initialized = B_TRUE;
+}
+
+void
+vdev_raidz_math_fini(void)
+{
+ raidz_impl_ops_t const *curr_impl;
+
+#if defined(_KERNEL)
+ if (raidz_math_kstat != NULL) {
+ kstat_delete(raidz_math_kstat);
+ raidz_math_kstat = NULL;
+ }
+#endif
+
+ for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+ curr_impl = raidz_all_maths[i];
+ if (curr_impl->fini)
+ curr_impl->fini();
+ }
+}
+
+static const struct {
+ char *name;
+ uint32_t sel;
+} math_impl_opts[] = {
+ { "cycle", IMPL_CYCLE },
+ { "fastest", IMPL_FASTEST },
+ { "original", IMPL_ORIGINAL },
+ { "scalar", IMPL_SCALAR }
+};
+
+/*
+ * Set the desired raidz implementation.
+ *
+ * If we are called before vdev_raidz_math_init(), the user preference is
+ * saved in user_sel_impl and applied in the later init() call. This happens
+ * when the module parameter is specified on module load. Otherwise,
+ * zfs_vdev_raidz_impl is updated directly.
+ *
+ * @val Name of the raidz implementation to use
+ */
+int
+vdev_raidz_impl_set(const char *val)
+{
+ int err = -EINVAL;
+ char req_name[RAIDZ_IMPL_NAME_MAX];
+ uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
+ size_t i;
+
+ /* sanitize input */
+ i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
+ if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
+ return (err);
+
+ strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
+ while (i > 0 && !!isspace(req_name[i-1]))
+ i--;
+ req_name[i] = '\0';
+
+ /* Check mandatory options */
+ for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
+ if (strcmp(req_name, math_impl_opts[i].name) == 0) {
+ impl = math_impl_opts[i].sel;
+ err = 0;
+ break;
+ }
+ }
+
+ /* check all supported impl if init() was already called */
+ if (err != 0 && raidz_math_initialized) {
+ /* check all supported implementations */
+ for (i = 0; i < raidz_supp_impl_cnt; i++) {
+ if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
+ impl = i;
+ err = 0;
+ break;
+ }
+ }
+ }
+
+ if (err == 0) {
+ if (raidz_math_initialized)
+ atomic_swap_32(&zfs_vdev_raidz_impl, impl);
+ else
+ atomic_swap_32(&user_sel_impl, impl);
+ }
+
+ return (err);
+}
+
+#if defined(_KERNEL) && defined(__linux__)
+
+static int
+zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
+{
+ return (vdev_raidz_impl_set(val));
+}
+
+static int
+zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+ int i, cnt = 0;
+ char *fmt;
+ const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
+
+ ASSERT(raidz_math_initialized);
+
+ /* list mandatory options (skip the last two; they also appear in raidz_supp_impl) */
+ for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
+ fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
+ }
+
+ /* list all supported implementations */
+ for (i = 0; i < raidz_supp_impl_cnt; i++) {
+ fmt = (i == impl) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
+ }
+
+ return (cnt);
+}
+
+module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
+ zfs_vdev_raidz_impl_get, NULL, 0644);
+MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
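+
+/*
+ * Example usage (illustrative; assumes a Linux system with the zfs module
+ * loaded and an implementation such as "avx2" supported by the CPU):
+ *
+ *	# select an implementation at module load time
+ *	modprobe zfs zfs_vdev_raidz_impl=fastest
+ *
+ *	# or change it at runtime through sysfs
+ *	echo avx2 > /sys/module/zfs/parameters/zfs_vdev_raidz_impl
+ */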
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c
new file mode 100644
index 000000000000..0a67ceb84920
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c
@@ -0,0 +1,2279 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+
+#if defined(__aarch64__)
+
+#include "vdev_raidz_math_aarch64_neon_common.h"
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define GEN_P_STRIDE 4
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_Q_STRIDE 4
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_R_STRIDE 4
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQ_STRIDE 4
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQ_STRIDE 2
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PR_STRIDE 4
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PR_STRIDE 2
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_QR_STRIDE 4
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_QR_STRIDE 2
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQR_STRIDE 4
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQR_STRIDE 2
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(aarch64_neon);
+DEFINE_REC_METHODS(aarch64_neon);
+
+static boolean_t
+raidz_will_aarch64_neon_work(void)
+{
+ return (kfpu_allowed());
+}
+
+const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(aarch64_neon),
+ .rec = RAIDZ_REC_METHODS(aarch64_neon),
+ .is_supported = &raidz_will_aarch64_neon_work,
+ .name = "aarch64_neon"
+};
+
+#endif /* defined(__aarch64__) */
+
+
+#if defined(__aarch64__)
+/* BEGIN CSTYLED */
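+/*
+ * Lookup tables for GF(2^8) multiplication by a constant, stored as four
+ * 16-byte rows per multiplier value; presumably consumed by the NEON MUL
+ * routines via vector table lookups (see
+ * vdev_raidz_math_aarch64_neon_common.h).
+ */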
+const uint8_t
+__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = {
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
+ 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09,
+ 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c,
+ 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b,
+ 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12,
+ 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15,
+ 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
+ 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f,
+ 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36,
+ 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31,
+ 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24,
+ 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23,
+ 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a,
+ 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d,
+ 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+ 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,
+ 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79,
+ 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c,
+ 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b,
+ 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62,
+ 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65,
+ 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48,
+ 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f,
+ 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46,
+ 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41,
+ 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54,
+ 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a,
+ 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d,
+ 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7,
+ 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee,
+ 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9,
+ 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc,
+ 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb,
+ 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2,
+ 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8,
+ 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf,
+ 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6,
+ 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1,
+ 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4,
+ 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3,
+ 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca,
+ 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd,
+ 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97,
+ 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e,
+ 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99,
+ 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c,
+ 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b,
+ 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82,
+ 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85,
+ 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8,
+ 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf,
+ 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1,
+ 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4,
+ 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3,
+ 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba,
+ 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd,
+ 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7,
+ 0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce,
+ 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9,
+ 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc,
+ 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb,
+ 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2,
+ 0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5,
+ 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8,
+ 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff,
+ 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6,
+ 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1,
+ 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4,
+ 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3,
+ 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed,
+ 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7,
+ 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe,
+ 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac,
+ 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab,
+ 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2,
+ 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5,
+ 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88,
+ 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f,
+ 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86,
+ 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81,
+ 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94,
+ 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93,
+ 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a,
+ 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d,
+ 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27,
+ 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e,
+ 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29,
+ 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c,
+ 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b,
+ 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32,
+ 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35,
+ 0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18,
+ 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16,
+ 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11,
+ 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04,
+ 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03,
+ 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a,
+ 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d,
+ 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57,
+ 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e,
+ 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59,
+ 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b,
+ 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42,
+ 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45,
+ 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68,
+ 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f,
+ 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66,
+ 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61,
+ 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74,
+ 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73,
+ 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a,
+ 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d,
+ 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e,
+ 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89,
+ 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c,
+ 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b,
+ 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92,
+ 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95,
+ 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8,
+ 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf,
+ 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6,
+ 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1,
+ 0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4,
+ 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3,
+ 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa,
+ 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad,
+ 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7,
+ 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe,
+ 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9,
+ 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec,
+ 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb,
+ 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2,
+ 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5,
+ 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8,
+ 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf,
+ 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6,
+ 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1,
+ 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3,
+ 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda,
+ 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd,
+ 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67,
+ 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e,
+ 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69,
+ 0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c,
+ 0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b,
+ 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75,
+ 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58,
+ 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f,
+ 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56,
+ 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51,
+ 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44,
+ 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43,
+ 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a,
+ 0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d,
+ 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17,
+ 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e,
+ 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19,
+ 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c,
+ 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b,
+ 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02,
+ 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05,
+ 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28,
+ 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f,
+ 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26,
+ 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34,
+ 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33,
+ 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a,
+ 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d,
+ 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47,
+ 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e,
+ 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49,
+ 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c,
+ 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b,
+ 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52,
+ 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55,
+ 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78,
+ 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f,
+ 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76,
+ 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71,
+ 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64,
+ 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63,
+ 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a,
+ 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37,
+ 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39,
+ 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c,
+ 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b,
+ 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22,
+ 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25,
+ 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08,
+ 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f,
+ 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06,
+ 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01,
+ 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14,
+ 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13,
+ 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a,
+ 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d,
+ 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7,
+ 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae,
+ 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9,
+ 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc,
+ 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb,
+ 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2,
+ 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5,
+ 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f,
+ 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96,
+ 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91,
+ 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84,
+ 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83,
+ 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a,
+ 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d,
+ 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7,
+ 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde,
+ 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9,
+ 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc,
+ 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2,
+ 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5,
+ 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8,
+ 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef,
+ 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6,
+ 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1,
+ 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4,
+ 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3,
+ 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa,
+ 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd,
+ 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 }
+};
+/* END CSTYLED */
+#endif /* defined(__aarch64__) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h
new file mode 100644
index 000000000000..e46b2536546c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h
@@ -0,0 +1,684 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "%[w"#REG"]"
+#define VR1_(_1, REG, ...) "%[w"#REG"]"
+#define VR2_(_1, _2, REG, ...) "%[w"#REG"]"
+#define VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]"
+#define VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]"
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]"
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]"
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]"
+
+/*
+ * These macros need operand numbers that are not used anywhere else.
+ * They show up in asm branches that are never executed when fewer
+ * registers than the maximum are passed in, but GCC still checks every
+ * operand constraint, and duplicate operand names are not allowed, so
+ * each unused slot gets its own spare "register" number, which also
+ * serves as the operand name.
+ */
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 36)
+#define VR3(r...) VR3_(r, 36, 35)
+#define VR4(r...) VR4_(r, 36, 35, 34, 33)
+#define VR5(r...) VR5_(r, 36, 35, 34, 33, 32)
+#define VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31)
+#define VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define VR(X) "%[w"#X"]"
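+
+/*
+ * For example (register numbers chosen arbitrarily): with three live
+ * registers, VR2(22, 23, 24) picks its third argument and expands to
+ * "%[w24]", while VR3(22, 23, 24) runs out of real arguments, falls
+ * back on the spare number 36, and expands to "%[w36]", whose only job
+ * is to keep the asm operand list well formed.
+ */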
+
+#define RVR0_(REG, ...) [w##REG] "w" (w##REG)
+#define RVR1_(_1, REG, ...) [w##REG] "w" (w##REG)
+#define RVR2_(_1, _2, REG, ...) [w##REG] "w" (w##REG)
+#define RVR3_(_1, _2, _3, REG, ...) [w##REG] "w" (w##REG)
+#define RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "w" (w##REG)
+#define RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "w" (w##REG)
+#define RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "w" (w##REG)
+#define RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "w" (w##REG)
+
+#define RVR0(r...) RVR0_(r)
+#define RVR1(r...) RVR1_(r)
+#define RVR2(r...) RVR2_(r, 36)
+#define RVR3(r...) RVR3_(r, 36, 35)
+#define RVR4(r...) RVR4_(r, 36, 35, 34, 33)
+#define RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32)
+#define RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31)
+#define RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define RVR(X) [w##X] "w" (w##X)
+
+#define WVR0_(REG, ...) [w##REG] "=w" (w##REG)
+#define WVR1_(_1, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR2_(_1, _2, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR3_(_1, _2, _3, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=w" (w##REG)
+
+#define WVR0(r...) WVR0_(r)
+#define WVR1(r...) WVR1_(r)
+#define WVR2(r...) WVR2_(r, 36)
+#define WVR3(r...) WVR3_(r, 36, 35)
+#define WVR4(r...) WVR4_(r, 36, 35, 34, 33)
+#define WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32)
+#define WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31)
+#define WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define WVR(X) [w##X] "=w" (w##X)
+
+#define UVR0_(REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR1_(_1, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR2_(_1, _2, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG)
+
+#define UVR0(r...) UVR0_(r)
+#define UVR1(r...) UVR1_(r)
+#define UVR2(r...) UVR2_(r, 36)
+#define UVR3(r...) UVR3_(r, 36, 35)
+#define UVR4(r...) UVR4_(r, 36, 35, 34, 33)
+#define UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32)
+#define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31)
+#define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define UVR(X) [w##X] "+&w" (w##X)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+#define	OFFSET(ptr, val)	(((unsigned char *)(ptr)) + (val))
+
+extern const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "ld1 { v21.4s },%[SRC0]\n" \
+ "ld1 { v20.4s },%[SRC1]\n" \
+ "ld1 { v19.4s },%[SRC2]\n" \
+ "ld1 { v18.4s },%[SRC3]\n" \
+ "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \
+ "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \
+ "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \
+ "ld1 { v21.4s },%[SRC4]\n" \
+ "ld1 { v20.4s },%[SRC5]\n" \
+ "ld1 { v19.4s },%[SRC6]\n" \
+ "ld1 { v18.4s },%[SRC7]\n" \
+ "eor " VR4(r) ".16b," VR4(r) ".16b,v21.16b\n" \
+ "eor " VR5(r) ".16b," VR5(r) ".16b,v20.16b\n" \
+ "eor " VR6(r) ".16b," VR6(r) ".16b,v19.16b\n" \
+ "eor " VR7(r) ".16b," VR7(r) ".16b,v18.16b\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r), \
+ UVR4(r), UVR5(r), UVR6(r), UVR7(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16))), \
+ [SRC2] "Q" (*(OFFSET(src, 32))), \
+ [SRC3] "Q" (*(OFFSET(src, 48))), \
+ [SRC4] "Q" (*(OFFSET(src, 64))), \
+ [SRC5] "Q" (*(OFFSET(src, 80))), \
+ [SRC6] "Q" (*(OFFSET(src, 96))), \
+ [SRC7] "Q" (*(OFFSET(src, 112))) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 4: \
+ __asm( \
+ "ld1 { v21.4s },%[SRC0]\n" \
+ "ld1 { v20.4s },%[SRC1]\n" \
+ "ld1 { v19.4s },%[SRC2]\n" \
+ "ld1 { v18.4s },%[SRC3]\n" \
+ "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \
+ "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \
+ "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16))), \
+ [SRC2] "Q" (*(OFFSET(src, 32))), \
+ [SRC3] "Q" (*(OFFSET(src, 48))) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 2: \
+ __asm( \
+ "ld1 { v21.4s },%[SRC0]\n" \
+ "ld1 { v20.4s },%[SRC1]\n" \
+ "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \
+ : UVR0(r), UVR1(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16))) \
+ : "v20", "v21"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "eor " VR4(r) ".16b," VR4(r) ".16b," VR0(r) ".16b\n" \
+ "eor " VR5(r) ".16b," VR5(r) ".16b," VR1(r) ".16b\n" \
+ "eor " VR6(r) ".16b," VR6(r) ".16b," VR2(r) ".16b\n" \
+ "eor " VR7(r) ".16b," VR7(r) ".16b," VR3(r) ".16b\n" \
+ : UVR4(r), UVR5(r), UVR6(r), UVR7(r) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "eor " VR2(r) ".16b," VR2(r) ".16b," VR0(r) ".16b\n" \
+ "eor " VR3(r) ".16b," VR3(r) ".16b," VR1(r) ".16b\n" \
+ : UVR2(r), UVR3(r) \
+ : RVR0(r), RVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \
+ "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \
+ "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \
+ "eor " VR4(r) ".16b," VR4(r) ".16b," VR4(r) ".16b\n" \
+ "eor " VR5(r) ".16b," VR5(r) ".16b," VR5(r) ".16b\n" \
+ "eor " VR6(r) ".16b," VR6(r) ".16b," VR6(r) ".16b\n" \
+ "eor " VR7(r) ".16b," VR7(r) ".16b," VR7(r) ".16b\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \
+ WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \
+ "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \
+ "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \
+ : WVR0(r), WVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "mov " VR4(r) ".16b," VR0(r) ".16b\n" \
+ "mov " VR5(r) ".16b," VR1(r) ".16b\n" \
+ "mov " VR6(r) ".16b," VR2(r) ".16b\n" \
+ "mov " VR7(r) ".16b," VR3(r) ".16b\n" \
+ : WVR4(r), WVR5(r), WVR6(r), WVR7(r) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "mov " VR2(r) ".16b," VR0(r) ".16b\n" \
+ "mov " VR3(r) ".16b," VR1(r) ".16b\n" \
+ : WVR2(r), WVR3(r) \
+ : RVR0(r), RVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "ld1 { " VR0(r) ".4s },%[SRC0]\n" \
+ "ld1 { " VR1(r) ".4s },%[SRC1]\n" \
+ "ld1 { " VR2(r) ".4s },%[SRC2]\n" \
+ "ld1 { " VR3(r) ".4s },%[SRC3]\n" \
+ "ld1 { " VR4(r) ".4s },%[SRC4]\n" \
+ "ld1 { " VR5(r) ".4s },%[SRC5]\n" \
+ "ld1 { " VR6(r) ".4s },%[SRC6]\n" \
+ "ld1 { " VR7(r) ".4s },%[SRC7]\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \
+ WVR4(r), WVR5(r), WVR6(r), WVR7(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16))), \
+ [SRC2] "Q" (*(OFFSET(src, 32))), \
+ [SRC3] "Q" (*(OFFSET(src, 48))), \
+ [SRC4] "Q" (*(OFFSET(src, 64))), \
+ [SRC5] "Q" (*(OFFSET(src, 80))), \
+ [SRC6] "Q" (*(OFFSET(src, 96))), \
+ [SRC7] "Q" (*(OFFSET(src, 112)))); \
+ break; \
+ case 4: \
+ __asm( \
+ "ld1 { " VR0(r) ".4s },%[SRC0]\n" \
+ "ld1 { " VR1(r) ".4s },%[SRC1]\n" \
+ "ld1 { " VR2(r) ".4s },%[SRC2]\n" \
+ "ld1 { " VR3(r) ".4s },%[SRC3]\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16))), \
+ [SRC2] "Q" (*(OFFSET(src, 32))), \
+ [SRC3] "Q" (*(OFFSET(src, 48)))); \
+ break; \
+ case 2: \
+ __asm( \
+ "ld1 { " VR0(r) ".4s },%[SRC0]\n" \
+ "ld1 { " VR1(r) ".4s },%[SRC1]\n" \
+ : WVR0(r), WVR1(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16)))); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "st1 { " VR0(r) ".4s },%[DST0]\n" \
+ "st1 { " VR1(r) ".4s },%[DST1]\n" \
+ "st1 { " VR2(r) ".4s },%[DST2]\n" \
+ "st1 { " VR3(r) ".4s },%[DST3]\n" \
+ "st1 { " VR4(r) ".4s },%[DST4]\n" \
+ "st1 { " VR5(r) ".4s },%[DST5]\n" \
+ "st1 { " VR6(r) ".4s },%[DST6]\n" \
+ "st1 { " VR7(r) ".4s },%[DST7]\n" \
+ : [DST0] "=Q" (*(OFFSET(dst, 0))), \
+ [DST1] "=Q" (*(OFFSET(dst, 16))), \
+ [DST2] "=Q" (*(OFFSET(dst, 32))), \
+ [DST3] "=Q" (*(OFFSET(dst, 48))), \
+ [DST4] "=Q" (*(OFFSET(dst, 64))), \
+ [DST5] "=Q" (*(OFFSET(dst, 80))), \
+ [DST6] "=Q" (*(OFFSET(dst, 96))), \
+ [DST7] "=Q" (*(OFFSET(dst, 112))) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r), \
+ RVR4(r), RVR5(r), RVR6(r), RVR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "st1 { " VR0(r) ".4s },%[DST0]\n" \
+ "st1 { " VR1(r) ".4s },%[DST1]\n" \
+ "st1 { " VR2(r) ".4s },%[DST2]\n" \
+ "st1 { " VR3(r) ".4s },%[DST3]\n" \
+ : [DST0] "=Q" (*(OFFSET(dst, 0))), \
+ [DST1] "=Q" (*(OFFSET(dst, 16))), \
+ [DST2] "=Q" (*(OFFSET(dst, 32))), \
+ [DST3] "=Q" (*(OFFSET(dst, 48))) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "st1 { " VR0(r) ".4s },%[DST0]\n" \
+ "st1 { " VR1(r) ".4s },%[DST1]\n" \
+ : [DST0] "=Q" (*(OFFSET(dst, 0))), \
+ [DST1] "=Q" (*(OFFSET(dst, 16))) \
+ : RVR0(r), RVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+/*
+ * These defines cannot be used in the asm templates below, because GCC
+ * would substitute the macro name instead of its value when resolving
+ * the operands later on.  They are kept only as a reference for what
+ * each numbered variable holds (v17 is the zero constant, v16 the 0x1d
+ * reduction constant, v18/v19 are temporaries).
+ */
+#define _00 "v17"
+#define _1d "v16"
+#define _temp0 "v19"
+#define _temp1 "v18"
+
+#define MUL2_SETUP() \
+{ \
+ __asm( \
+ "eor " VR(17) ".16b," VR(17) ".16b," VR(17) ".16b\n" \
+ "movi " VR(16) ".16b,#0x1d\n" \
+ : WVR(16), WVR(17)); \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \
+ "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \
+ "cmgt v21.16b," VR(17) ".16b," VR2(r) ".16b\n" \
+ "cmgt v20.16b," VR(17) ".16b," VR3(r) ".16b\n" \
+ "and v19.16b,v19.16b," VR(16) ".16b\n" \
+ "and v18.16b,v18.16b," VR(16) ".16b\n" \
+ "and v21.16b,v21.16b," VR(16) ".16b\n" \
+ "and v20.16b,v20.16b," VR(16) ".16b\n" \
+ "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \
+ "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \
+ "shl " VR2(r) ".16b," VR2(r) ".16b,#1\n" \
+ "shl " VR3(r) ".16b," VR3(r) ".16b,#1\n" \
+ "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \
+ "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \
+ "eor " VR2(r) ".16b,v21.16b," VR2(r) ".16b\n" \
+ "eor " VR3(r) ".16b,v20.16b," VR3(r) ".16b\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \
+ : RVR(17), RVR(16) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 2: \
+ __asm( \
+ "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \
+ "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \
+ "and v19.16b,v19.16b," VR(16) ".16b\n" \
+ "and v18.16b,v18.16b," VR(16) ".16b\n" \
+ "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \
+ "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \
+ "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \
+ "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \
+ : UVR0(r), UVR1(r) \
+ : RVR(17), RVR(16) \
+ : "v18", "v19"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
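+
+/*
+ * A minimal scalar sketch of what one byte lane of MUL2() computes,
+ * assuming the RAID-Z GF(2^8) representation whose reduction constant
+ * 0x1d is loaded by MUL2_SETUP() above: shift left by one and, if the
+ * top bit was set, fold the overflow back in by xoring 0x1d.  MUL4()
+ * is simply this doubling applied twice.  The helper name below is
+ * illustrative only and not part of the original interface.
+ */
+static inline uint8_t
+gf_mul2_scalar(uint8_t a)
+{
+	return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
+}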
+
+/*
+ * As above, these defines cannot be used in the asm templates, because
+ * GCC would substitute the macro name instead of its value later on.
+ * They are kept only as a reference for what each register holds in
+ * _MULx2() (here the temporaries are actual clobbered hardware
+ * registers rather than numbered variables).
+ */
+#define _0f "v15"
+#define _a_save "v14"
+#define _b_save "v13"
+#define _lt_mod_a "v12"
+#define _lt_clmul_a "v11"
+#define _lt_mod_b "v10"
+#define _lt_clmul_b "v15"
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ /* lts for upper part */ \
+ "movi v15.16b,#0x0f\n" \
+ "ld1 { v10.4s },%[lt0]\n" \
+ "ld1 { v11.4s },%[lt1]\n" \
+ /* upper part */ \
+ "and v14.16b," VR0(r) ".16b,v15.16b\n" \
+ "and v13.16b," VR1(r) ".16b,v15.16b\n" \
+ "ushr " VR0(r) ".16b," VR0(r) ".16b,#4\n" \
+ "ushr " VR1(r) ".16b," VR1(r) ".16b,#4\n" \
+ \
+ "tbl v12.16b,{v10.16b}," VR0(r) ".16b\n" \
+ "tbl v10.16b,{v10.16b}," VR1(r) ".16b\n" \
+ "tbl v15.16b,{v11.16b}," VR0(r) ".16b\n" \
+ "tbl v11.16b,{v11.16b}," VR1(r) ".16b\n" \
+ \
+ "eor " VR0(r) ".16b,v15.16b,v12.16b\n" \
+ "eor " VR1(r) ".16b,v11.16b,v10.16b\n" \
+ /* lts for lower part */ \
+ "ld1 { v10.4s },%[lt2]\n" \
+ "ld1 { v15.4s },%[lt3]\n" \
+ /* lower part */ \
+ "tbl v12.16b,{v10.16b},v14.16b\n" \
+ "tbl v10.16b,{v10.16b},v13.16b\n" \
+ "tbl v11.16b,{v15.16b},v14.16b\n" \
+ "tbl v15.16b,{v15.16b},v13.16b\n" \
+ \
+ "eor " VR0(r) ".16b," VR0(r) ".16b,v12.16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b,v10.16b\n" \
+ "eor " VR0(r) ".16b," VR0(r) ".16b,v11.16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n" \
+ : UVR0(r), UVR1(r) \
+ : [lt0] "Q" ((gf_clmul_mod_lt[4*(c)+0][0])), \
+ [lt1] "Q" ((gf_clmul_mod_lt[4*(c)+1][0])), \
+ [lt2] "Q" ((gf_clmul_mod_lt[4*(c)+2][0])), \
+ [lt3] "Q" ((gf_clmul_mod_lt[4*(c)+3][0])) \
+ : "v10", "v11", "v12", "v13", "v14", "v15"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_23(r)); \
+ _MULx2(c, R_01(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
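+
+/*
+ * A scalar sketch of the per-byte lookup scheme that _MULx2() above
+ * implements with tbl: multiplication by the constant c is computed as
+ * the xor of four 16-entry table lookups, two indexed by the high
+ * nibble (gf_clmul_mod_lt[4 * c + 0] and [4 * c + 1]) and two by the
+ * low nibble ([4 * c + 2] and [4 * c + 3]).  The table contents are
+ * generated elsewhere; the helper below only mirrors the
+ * lookup-and-xor structure, and its name is illustrative only,
+ * e.g. gf_mul_lookup_scalar(&gf_clmul_mod_lt[4 * c], a).
+ */
+static inline uint8_t
+gf_mul_lookup_scalar(const uint8_t lt[4][16], uint8_t a)
+{
+	return (lt[0][a >> 4] ^ lt[1][a >> 4] ^
+	    lt[2][a & 0x0f] ^ lt[3][a & 0x0f]);
+}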
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+/* Overkill... */
+#if defined(_KERNEL)
+#define GEN_X_DEFINE_0_3() \
+register unsigned char w0 asm("v0") __attribute__((vector_size(16))); \
+register unsigned char w1 asm("v1") __attribute__((vector_size(16))); \
+register unsigned char w2 asm("v2") __attribute__((vector_size(16))); \
+register unsigned char w3 asm("v3") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_4_5() \
+register unsigned char w4 asm("v4") __attribute__((vector_size(16))); \
+register unsigned char w5 asm("v5") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_6_7() \
+register unsigned char w6 asm("v6") __attribute__((vector_size(16))); \
+register unsigned char w7 asm("v7") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_8_9() \
+register unsigned char w8 asm("v8") __attribute__((vector_size(16))); \
+register unsigned char w9 asm("v9") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_10_11() \
+register unsigned char w10 asm("v10") __attribute__((vector_size(16))); \
+register unsigned char w11 asm("v11") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_12_15() \
+register unsigned char w12 asm("v12") __attribute__((vector_size(16))); \
+register unsigned char w13 asm("v13") __attribute__((vector_size(16))); \
+register unsigned char w14 asm("v14") __attribute__((vector_size(16))); \
+register unsigned char w15 asm("v15") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_16() \
+register unsigned char w16 asm("v16") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_17() \
+register unsigned char w17 asm("v17") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_18_21() \
+register unsigned char w18 asm("v18") __attribute__((vector_size(16))); \
+register unsigned char w19 asm("v19") __attribute__((vector_size(16))); \
+register unsigned char w20 asm("v20") __attribute__((vector_size(16))); \
+register unsigned char w21 asm("v21") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_22_23() \
+register unsigned char w22 asm("v22") __attribute__((vector_size(16))); \
+register unsigned char w23 asm("v23") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_24_27() \
+register unsigned char w24 asm("v24") __attribute__((vector_size(16))); \
+register unsigned char w25 asm("v25") __attribute__((vector_size(16))); \
+register unsigned char w26 asm("v26") __attribute__((vector_size(16))); \
+register unsigned char w27 asm("v27") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_28_30() \
+register unsigned char w28 asm("v28") __attribute__((vector_size(16))); \
+register unsigned char w29 asm("v29") __attribute__((vector_size(16))); \
+register unsigned char w30 asm("v30") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_31() \
+register unsigned char w31 asm("v31") __attribute__((vector_size(16)));
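+/*
+ * w32 through w38 below are all pinned to v31; as the comment above the
+ * VR*() macros explains, they only fill asm operand slots that are
+ * never actually referenced, so sharing one hardware register is
+ * harmless.
+ */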
+#define GEN_X_DEFINE_32() \
+register unsigned char w32 asm("v31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_33_36() \
+register unsigned char w33 asm("v31") __attribute__((vector_size(16))); \
+register unsigned char w34 asm("v31") __attribute__((vector_size(16))); \
+register unsigned char w35 asm("v31") __attribute__((vector_size(16))); \
+register unsigned char w36 asm("v31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_37_38() \
+register unsigned char w37 asm("v31") __attribute__((vector_size(16))); \
+register unsigned char w38 asm("v31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_ALL() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_10_11() \
+ GEN_X_DEFINE_12_15() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_18_21() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_24_27() \
+ GEN_X_DEFINE_28_30() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36() \
+ GEN_X_DEFINE_37_38()
+#else
+#define GEN_X_DEFINE_0_3() \
+ unsigned char w0 __attribute__((vector_size(16))); \
+ unsigned char w1 __attribute__((vector_size(16))); \
+ unsigned char w2 __attribute__((vector_size(16))); \
+ unsigned char w3 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_4_5() \
+ unsigned char w4 __attribute__((vector_size(16))); \
+ unsigned char w5 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_6_7() \
+ unsigned char w6 __attribute__((vector_size(16))); \
+ unsigned char w7 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_8_9() \
+ unsigned char w8 __attribute__((vector_size(16))); \
+ unsigned char w9 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_10_11() \
+ unsigned char w10 __attribute__((vector_size(16))); \
+ unsigned char w11 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_12_15() \
+ unsigned char w12 __attribute__((vector_size(16))); \
+ unsigned char w13 __attribute__((vector_size(16))); \
+ unsigned char w14 __attribute__((vector_size(16))); \
+ unsigned char w15 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_16() \
+ unsigned char w16 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_17() \
+ unsigned char w17 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_18_21() \
+ unsigned char w18 __attribute__((vector_size(16))); \
+ unsigned char w19 __attribute__((vector_size(16))); \
+ unsigned char w20 __attribute__((vector_size(16))); \
+ unsigned char w21 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_22_23() \
+ unsigned char w22 __attribute__((vector_size(16))); \
+ unsigned char w23 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_24_27() \
+ unsigned char w24 __attribute__((vector_size(16))); \
+ unsigned char w25 __attribute__((vector_size(16))); \
+ unsigned char w26 __attribute__((vector_size(16))); \
+ unsigned char w27 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_28_30() \
+ unsigned char w28 __attribute__((vector_size(16))); \
+ unsigned char w29 __attribute__((vector_size(16))); \
+ unsigned char w30 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_31() \
+ unsigned char w31 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_32() \
+ unsigned char w32 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_33_36() \
+ unsigned char w33 __attribute__((vector_size(16))); \
+ unsigned char w34 __attribute__((vector_size(16))); \
+ unsigned char w35 __attribute__((vector_size(16))); \
+ unsigned char w36 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_37_38() \
+ unsigned char w37 __attribute__((vector_size(16))); \
+ unsigned char w38 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_ALL() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_10_11() \
+ GEN_X_DEFINE_12_15() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_18_21() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_24_27() \
+ GEN_X_DEFINE_28_30() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36() \
+ GEN_X_DEFINE_37_38()
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c
new file mode 100644
index 000000000000..e072f51cd635
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c
@@ -0,0 +1,232 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__aarch64__)
+
+#include "vdev_raidz_math_aarch64_neon_common.h"
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 8
+#define ZERO_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7()
+#define ZERO_D 0, 1, 2, 3, 4, 5, 6, 7
+
+#define COPY_STRIDE 8
+#define COPY_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7()
+#define COPY_D 0, 1, 2, 3, 4, 5, 6, 7
+
+#define ADD_STRIDE 8
+#define ADD_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7()
+#define ADD_D 0, 1, 2, 3, 4, 5, 6, 7
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define GEN_P_STRIDE 4
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_Q_STRIDE 4
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_R_STRIDE 4
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQ_STRIDE 4
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQ_STRIDE 4
+#define REC_PQ_X 0, 1, 2, 3
+#define REC_PQ_Y 4, 5, 6, 7
+#define REC_PQ_T 8, 9, 22, 23
+
+#define SYN_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PR_STRIDE 4
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_33_36()
+#define REC_PR_STRIDE 4
+#define REC_PR_X 0, 1, 2, 3
+#define REC_PR_Y 4, 5, 6, 7
+#define REC_PR_T 8, 9, 22, 23
+
+#define SYN_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_QR_STRIDE 4
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_33_36()
+#define REC_QR_STRIDE 4
+#define REC_QR_X 0, 1, 2, 3
+#define REC_QR_Y 4, 5, 6, 7
+#define REC_QR_T 8, 9, 22, 23
+
+#define SYN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQR_STRIDE 4
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQR_STRIDE 2
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(aarch64_neonx2);
+/*
+ * If compiled with -O0, gcc doesn't do any stack frame coalescing
+ * and -Wframe-larger-than=1024 is triggered in debug mode.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+DEFINE_REC_METHODS(aarch64_neonx2);
+#pragma GCC diagnostic pop
+
+static boolean_t
+raidz_will_aarch64_neonx2_work(void)
+{
+ return (kfpu_allowed());
+}
+
+const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(aarch64_neonx2),
+ .rec = RAIDZ_REC_METHODS(aarch64_neonx2),
+ .is_supported = &raidz_will_aarch64_neonx2_work,
+ .name = "aarch64_neonx2"
+};
+
+#endif /* defined(__aarch64__) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c
new file mode 100644
index 000000000000..65e4bebce8fa
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c
@@ -0,0 +1,413 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+#include <sys/isa_defs.h>
+
+#if defined(__x86_64) && defined(HAVE_AVX2)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "ymm"#REG
+#define VR1_(_1, REG, ...) "ymm"#REG
+#define VR2_(_1, _2, REG, ...) "ymm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "ymm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "ymm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "ymm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "ymm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "ymm"#REG
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 1)
+#define VR3(r...) VR3_(r, 1, 2)
+#define VR4(r...) VR4_(r, 1, 2)
+#define VR5(r...) VR5_(r, 1, 2, 3)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+extern const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 32
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vpxor 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxor 0x20(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ "vpxor 0x40(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \
+ "vpxor 0x60(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vpxor 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxor 0x20(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vpxor %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \
+ "vpxor %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \
+ "vpxor %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \
+ "vpxor %" VR3(r) ", %" VR7(r)", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vpxor %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \
+ "vpxor %" VR1(r) ", %" VR3(r)", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) XOR(r, r)
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vmovdqa %" VR0(r) ", %" VR4(r) "\n" \
+ "vmovdqa %" VR1(r) ", %" VR5(r) "\n" \
+ "vmovdqa %" VR2(r) ", %" VR6(r) "\n" \
+ "vmovdqa %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vmovdqa %" VR0(r) ", %" VR2(r) "\n" \
+ "vmovdqa %" VR1(r) ", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa 0x20(%[SRC]), %%" VR1(r) "\n" \
+ "vmovdqa 0x40(%[SRC]), %%" VR2(r) "\n" \
+ "vmovdqa 0x60(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vmovdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa 0x20(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa %%" VR1(r) ", 0x20(%[DST])\n" \
+ "vmovdqa %%" VR2(r) ", 0x40(%[DST])\n" \
+ "vmovdqa %%" VR3(r) ", 0x60(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vmovdqa %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa %%" VR1(r) ", 0x20(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define FLUSH() \
+{ \
+ __asm("vzeroupper"); \
+}
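+
+/*
+ * FLUSH() is presumably here so that raidz_math_end() can issue
+ * vzeroupper before kfpu_end(): clearing the upper halves of the ymm
+ * registers avoids AVX/SSE transition penalties in any legacy-SSE code
+ * that runs afterwards.
+ */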
+
+#define MUL2_SETUP() \
+{ \
+ __asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+ __asm("vpbroadcastq %xmm14, %ymm14"); \
+	__asm("vpxor %ymm15, %ymm15, %ymm15");			\
+}
+
+#define _MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpcmpgtb %" VR0(r)", %ymm15, %ymm12\n" \
+ "vpcmpgtb %" VR1(r)", %ymm15, %ymm13\n" \
+ "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \
+ "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \
+ "vpand %ymm14, %ymm12, %ymm12\n" \
+ "vpand %ymm14, %ymm13, %ymm13\n" \
+ "vpxor %ymm12, %" VR0(r)", %" VR0(r) "\n" \
+ "vpxor %ymm13, %" VR1(r)", %" VR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2(R_01(r)); \
+ _MUL2(R_23(r)); \
+ break; \
+ case 2: \
+ _MUL2(r); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+#define _0f "ymm15"
+#define _as "ymm14"
+#define _bs "ymm13"
+#define _ltmod "ymm12"
+#define _ltmul "ymm11"
+#define _ta "ymm10"
+#define _tb "ymm15"
+
+static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpbroadcastb (%[mask]), %%" _0f "\n" \
+ /* upper bits */ \
+ "vbroadcasti128 0x00(%[lt]), %%" _ltmod "\n" \
+ "vbroadcasti128 0x10(%[lt]), %%" _ltmul "\n" \
+ \
+ "vpsraw $0x4, %%" VR0(r) ", %%"_as "\n" \
+ "vpsraw $0x4, %%" VR1(r) ", %%"_bs "\n" \
+ "vpand %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpand %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ "vpand %%" _0f ", %%" _as ", %%" _as "\n" \
+ "vpand %%" _0f ", %%" _bs ", %%" _bs "\n" \
+ \
+ "vpshufb %%" _as ", %%" _ltmod ", %%" _ta "\n" \
+ "vpshufb %%" _bs ", %%" _ltmod ", %%" _tb "\n" \
+ "vpshufb %%" _as ", %%" _ltmul ", %%" _as "\n" \
+ "vpshufb %%" _bs ", %%" _ltmul ", %%" _bs "\n" \
+ /* lower bits */ \
+ "vbroadcasti128 0x20(%[lt]), %%" _ltmod "\n" \
+ "vbroadcasti128 0x30(%[lt]), %%" _ltmul "\n" \
+ \
+ "vpxor %%" _ta ", %%" _as ", %%" _as "\n" \
+ "vpxor %%" _tb ", %%" _bs ", %%" _bs "\n" \
+ \
+ "vpshufb %%" VR0(r) ", %%" _ltmod ", %%" _ta "\n" \
+ "vpshufb %%" VR1(r) ", %%" _ltmod ", %%" _tb "\n" \
+ "vpshufb %%" VR0(r) ", %%" _ltmul ", %%" VR0(r) "\n"\
+ "vpshufb %%" VR1(r) ", %%" _ltmul ", %%" VR1(r) "\n"\
+ \
+ "vpxor %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpxor %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpxor %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ "vpxor %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ : : [mask] "r" (&_mul_mask), \
+ [lt] "r" (gf_clmul_mod_lt[4*(c)])); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_01(r)); \
+ _MULx2(c, R_23(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() \
+{ \
+ FLUSH(); \
+ kfpu_end(); \
+}
+
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() {}
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(avx2);
+DEFINE_REC_METHODS(avx2);
+
+static boolean_t
+raidz_will_avx2_work(void)
+{
+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_avx2_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(avx2),
+ .rec = RAIDZ_REC_METHODS(avx2),
+ .is_supported = &raidz_will_avx2_work,
+ .name = "avx2"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_AVX2) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c
new file mode 100644
index 000000000000..f06b469023eb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c
@@ -0,0 +1,413 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/simd.h>
+
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "zmm"#REG
+#define VR1_(_1, REG, ...) "zmm"#REG
+#define VR2_(_1, _2, REG, ...) "zmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "zmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "zmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "zmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "zmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "zmm"#REG
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 1)
+#define VR3(r...) VR3_(r, 1, 2)
+#define VR4(r...) VR4_(r, 1, 2)
+#define VR5(r...) VR5_(r, 1, 2, 3)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+extern const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 64
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ "vpxorq 0x80(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \
+ "vpxorq 0xc0(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vpxorq %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \
+ "vpxorq %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \
+ "vpxorq %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \
+ "vpxorq %" VR3(r) ", %" VR7(r)", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vpxorq %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \
+ "vpxorq %" VR1(r) ", %" VR3(r)", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) XOR(r, r)
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vmovdqa64 %" VR0(r) ", %" VR4(r) "\n" \
+ "vmovdqa64 %" VR1(r) ", %" VR5(r) "\n" \
+ "vmovdqa64 %" VR2(r) ", %" VR6(r) "\n" \
+ "vmovdqa64 %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vmovdqa64 %" VR0(r) ", %" VR2(r) "\n" \
+ "vmovdqa64 %" VR1(r) ", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \
+ "vmovdqa64 0x80(%[SRC]), %%" VR2(r) "\n" \
+ "vmovdqa64 0xc0(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \
+ "vmovdqa64 %%" VR2(r) ", 0x80(%[DST])\n" \
+ "vmovdqa64 %%" VR3(r) ", 0xc0(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2_SETUP() \
+{ \
+ __asm("vmovq %0, %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+ __asm("vpbroadcastq %xmm22, %zmm22"); \
+	__asm("vpxord %zmm23, %zmm23, %zmm23");			\
+}
+
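+/*
+ * Unlike the AVX2 _MUL2(), which builds the 0x1d correction with vpand
+ * and xors it in unconditionally, this version uses AVX512BW mask
+ * registers: vpcmpb records in k1/k2 which bytes have the top bit set,
+ * the vectors are doubled with vpaddb, the doubled values are xored
+ * with 0x1d into temporaries, and vmovdqu8 merges those reduced bytes
+ * back only where the mask is set.
+ */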
+#define _MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpcmpb $1, %zmm23, %" VR0(r)", %k1\n" \
+ "vpcmpb $1, %zmm23, %" VR1(r)", %k2\n" \
+ "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \
+ "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \
+ "vpxord %zmm22, %" VR0(r)", %zmm12\n" \
+ "vpxord %zmm22, %" VR1(r)", %zmm13\n" \
+ "vmovdqu8 %zmm12, %" VR0(r) "{%k1}\n" \
+ "vmovdqu8 %zmm13, %" VR1(r) "{%k2}"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2(R_01(r)); \
+ _MUL2(R_23(r)); \
+ break; \
+ case 2: \
+ _MUL2(r); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+#define _0f "zmm15"
+#define _as "zmm14"
+#define _bs "zmm13"
+#define _ltmod "zmm12"
+#define _ltmul "zmm11"
+#define _ta "zmm10"
+#define _tb "zmm15"
+
+static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F;
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpbroadcastb (%[mask]), %%" _0f "\n" \
+ /* upper bits */ \
+ "vbroadcasti32x4 0x00(%[lt]), %%" _ltmod "\n" \
+ "vbroadcasti32x4 0x10(%[lt]), %%" _ltmul "\n" \
+ \
+ "vpsraw $0x4, %%" VR0(r) ", %%"_as "\n" \
+ "vpsraw $0x4, %%" VR1(r) ", %%"_bs "\n" \
+ "vpandq %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpandq %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ "vpandq %%" _0f ", %%" _as ", %%" _as "\n" \
+ "vpandq %%" _0f ", %%" _bs ", %%" _bs "\n" \
+ \
+ "vpshufb %%" _as ", %%" _ltmod ", %%" _ta "\n" \
+ "vpshufb %%" _bs ", %%" _ltmod ", %%" _tb "\n" \
+ "vpshufb %%" _as ", %%" _ltmul ", %%" _as "\n" \
+ "vpshufb %%" _bs ", %%" _ltmul ", %%" _bs "\n" \
+ /* lower bits */ \
+ "vbroadcasti32x4 0x20(%[lt]), %%" _ltmod "\n" \
+ "vbroadcasti32x4 0x30(%[lt]), %%" _ltmul "\n" \
+ \
+ "vpxorq %%" _ta ", %%" _as ", %%" _as "\n" \
+ "vpxorq %%" _tb ", %%" _bs ", %%" _bs "\n" \
+ \
+ "vpshufb %%" VR0(r) ", %%" _ltmod ", %%" _ta "\n" \
+ "vpshufb %%" VR1(r) ", %%" _ltmod ", %%" _tb "\n" \
+ "vpshufb %%" VR0(r) ", %%" _ltmul ", %%" VR0(r) "\n"\
+ "vpshufb %%" VR1(r) ", %%" _ltmul ", %%" VR1(r) "\n"\
+ \
+ "vpxorq %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpxorq %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpxorq %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ "vpxorq %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ : : [mask] "r" (&_mul_mask), \
+ [lt] "r" (gf_clmul_mod_lt[4*(c)])); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_01(r)); \
+ _MULx2(c, R_23(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+/*
+ * The ZERO, COPY, and MUL operations are already 2x unrolled, so for
+ * avx512 their stride must not exceed 4: with 64-byte zmm registers a
+ * single step already covers 2 * 4 * 64 B = 512 B, and a larger stride
+ * would exceed the 512 B block size.
+ */
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() {}
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(avx512bw);
+DEFINE_REC_METHODS(avx512bw);
+
+static boolean_t
+raidz_will_avx512bw_work(void)
+{
+ return (kfpu_allowed() && zfs_avx_available() &&
+ zfs_avx512f_available() && zfs_avx512bw_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(avx512bw),
+ .rec = RAIDZ_REC_METHODS(avx512bw),
+ .is_supported = &raidz_will_avx512bw_work,
+ .name = "avx512bw"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_AVX512BW) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c
new file mode 100644
index 000000000000..aab653b77491
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c
@@ -0,0 +1,494 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__x86_64) && defined(HAVE_AVX512F)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+#include <sys/debug.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "zmm"#REG
+#define VR1_(_1, REG, ...) "zmm"#REG
+#define VR2_(_1, _2, REG, ...) "zmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "zmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "zmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "zmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "zmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "zmm"#REG
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 1)
+#define VR3(r...) VR3_(r, 1, 2)
+#define VR4(r...) VR4_(r, 1, 2)
+#define VR5(r...) VR5_(r, 1, 2, 3)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5)
+
+#define VRy0_(REG, ...) "ymm"#REG
+#define VRy1_(_1, REG, ...) "ymm"#REG
+#define VRy2_(_1, _2, REG, ...) "ymm"#REG
+#define VRy3_(_1, _2, _3, REG, ...) "ymm"#REG
+#define VRy4_(_1, _2, _3, _4, REG, ...) "ymm"#REG
+#define VRy5_(_1, _2, _3, _4, _5, REG, ...) "ymm"#REG
+#define VRy6_(_1, _2, _3, _4, _5, _6, REG, ...) "ymm"#REG
+#define VRy7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "ymm"#REG
+
+#define VRy0(r...) VRy0_(r)
+#define VRy1(r...) VRy1_(r)
+#define VRy2(r...) VRy2_(r, 1)
+#define VRy3(r...) VRy3_(r, 1, 2)
+#define VRy4(r...) VRy4_(r, 1, 2)
+#define VRy5(r...) VRy5_(r, 1, 2, 3)
+#define VRy6(r...) VRy6_(r, 1, 2, 3, 4)
+#define VRy7(r...) VRy7_(r, 1, 2, 3, 4, 5)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ELEM_SIZE 64
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ "vpxorq 0x80(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \
+ "vpxorq 0xc0(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vpxorq %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \
+ "vpxorq %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \
+ "vpxorq %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \
+ "vpxorq %" VR3(r) ", %" VR7(r)", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vpxorq %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \
+ "vpxorq %" VR1(r) ", %" VR3(r)", %" VR3(r)); \
+ break; \
+ } \
+}
+
+
+#define ZERO(r...) XOR(r, r)
+
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vmovdqa64 %" VR0(r) ", %" VR4(r) "\n" \
+ "vmovdqa64 %" VR1(r) ", %" VR5(r) "\n" \
+ "vmovdqa64 %" VR2(r) ", %" VR6(r) "\n" \
+ "vmovdqa64 %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vmovdqa64 %" VR0(r) ", %" VR2(r) "\n" \
+ "vmovdqa64 %" VR1(r) ", %" VR3(r)); \
+ break; \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \
+ "vmovdqa64 0x80(%[SRC]), %%" VR2(r) "\n" \
+ "vmovdqa64 0xc0(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \
+ "vmovdqa64 %%" VR2(r) ", 0x80(%[DST])\n" \
+ "vmovdqa64 %%" VR3(r) ", 0xc0(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ } \
+}
+
+#define MUL2_SETUP() \
+{ \
+ __asm("vmovq %0, %%xmm31" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+ __asm("vpbroadcastq %xmm31, %zmm31"); \
+ __asm("vmovq %0, %%xmm30" :: "r"(0x8080808080808080)); \
+ __asm("vpbroadcastq %xmm30, %zmm30"); \
+ __asm("vmovq %0, %%xmm29" :: "r"(0xfefefefefefefefe)); \
+ __asm("vpbroadcastq %xmm29, %zmm29"); \
+}
+
+#define _MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpandq %" VR0(r)", %zmm30, %zmm26\n" \
+ "vpandq %" VR1(r)", %zmm30, %zmm25\n" \
+ "vpsrlq $7, %zmm26, %zmm28\n" \
+ "vpsrlq $7, %zmm25, %zmm27\n" \
+ "vpsllq $1, %zmm26, %zmm26\n" \
+ "vpsllq $1, %zmm25, %zmm25\n" \
+ "vpsubq %zmm28, %zmm26, %zmm26\n" \
+ "vpsubq %zmm27, %zmm25, %zmm25\n" \
+ "vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \
+ "vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \
+ "vpandq %zmm26, %zmm31, %zmm26\n" \
+ "vpandq %zmm25, %zmm31, %zmm25\n" \
+ "vpternlogd $0x6c,%zmm29, %zmm26, %" VR0(r) "\n" \
+ "vpternlogd $0x6c,%zmm29, %zmm25, %" VR1(r)); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2(R_01(r)); \
+ _MUL2(R_23(r)); \
+ break; \
+ case 2: \
+ _MUL2(r); \
+ break; \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+
+/* General multiplication by adding powers of two */
+
+#define _mul_x2_in 21, 22
+#define _mul_x2_acc 23, 24
+
+#define _MUL_PARAM(x, in, acc) \
+{ \
+ if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \
+ if (x & 0xfe) { MUL2(in); } \
+ if (x & 0x02) { XOR(in, acc); } \
+ if (x & 0xfc) { MUL2(in); } \
+ if (x & 0x04) { XOR(in, acc); } \
+ if (x & 0xf8) { MUL2(in); } \
+ if (x & 0x08) { XOR(in, acc); } \
+ if (x & 0xf0) { MUL2(in); } \
+ if (x & 0x10) { XOR(in, acc); } \
+ if (x & 0xe0) { MUL2(in); } \
+ if (x & 0x20) { XOR(in, acc); } \
+ if (x & 0xc0) { MUL2(in); } \
+ if (x & 0x40) { XOR(in, acc); } \
+ if (x & 0x80) { MUL2(in); XOR(in, acc); } \
+}
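+
+/*
+ * For reference, _MUL_PARAM() is an unrolled GF(2^8) shift-and-add multiply:
+ * the accumulator picks up "in" doubled k times for every set bit k of the
+ * constant.  The scalar sketch below shows the same idea; it is illustrative
+ * only (gf_mul2_scalar()/gf_mul_scalar() are names introduced here and are
+ * not used by the SIMD path; 0x1d is the low byte of the RAID-Z generator
+ * polynomial 0x11d, matching the constant loaded in MUL2_SETUP()).
+ */
+static inline uint8_t
+gf_mul2_scalar(uint8_t a)
+{
+	/* double in GF(2^8): shift left and conditionally fold in 0x1d */
+	return ((uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0x00)));
+}
+
+static inline uint8_t
+gf_mul_scalar(uint8_t a, uint8_t c)
+{
+	uint8_t acc = 0;
+
+	while (c != 0) {
+		if (c & 1)
+			acc ^= a;
+		a = gf_mul2_scalar(a);
+		c >>= 1;
+	}
+	return (acc);
+}
+
+/*
+ * MUL_x2_DEFINE() below instantiates one fully unrolled variant of this loop
+ * per constant, so MUL() can dispatch through the gf_x2_mul_fns[] table
+ * instead of branching on the constant at run time.
+ */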
+
+#define MUL_x2_DEFINE(x) \
+static void \
+mul_x2_ ## x(void) { _MUL_PARAM(x, _mul_x2_in, _mul_x2_acc); }
+
+
+MUL_x2_DEFINE(0); MUL_x2_DEFINE(1); MUL_x2_DEFINE(2); MUL_x2_DEFINE(3);
+MUL_x2_DEFINE(4); MUL_x2_DEFINE(5); MUL_x2_DEFINE(6); MUL_x2_DEFINE(7);
+MUL_x2_DEFINE(8); MUL_x2_DEFINE(9); MUL_x2_DEFINE(10); MUL_x2_DEFINE(11);
+MUL_x2_DEFINE(12); MUL_x2_DEFINE(13); MUL_x2_DEFINE(14); MUL_x2_DEFINE(15);
+MUL_x2_DEFINE(16); MUL_x2_DEFINE(17); MUL_x2_DEFINE(18); MUL_x2_DEFINE(19);
+MUL_x2_DEFINE(20); MUL_x2_DEFINE(21); MUL_x2_DEFINE(22); MUL_x2_DEFINE(23);
+MUL_x2_DEFINE(24); MUL_x2_DEFINE(25); MUL_x2_DEFINE(26); MUL_x2_DEFINE(27);
+MUL_x2_DEFINE(28); MUL_x2_DEFINE(29); MUL_x2_DEFINE(30); MUL_x2_DEFINE(31);
+MUL_x2_DEFINE(32); MUL_x2_DEFINE(33); MUL_x2_DEFINE(34); MUL_x2_DEFINE(35);
+MUL_x2_DEFINE(36); MUL_x2_DEFINE(37); MUL_x2_DEFINE(38); MUL_x2_DEFINE(39);
+MUL_x2_DEFINE(40); MUL_x2_DEFINE(41); MUL_x2_DEFINE(42); MUL_x2_DEFINE(43);
+MUL_x2_DEFINE(44); MUL_x2_DEFINE(45); MUL_x2_DEFINE(46); MUL_x2_DEFINE(47);
+MUL_x2_DEFINE(48); MUL_x2_DEFINE(49); MUL_x2_DEFINE(50); MUL_x2_DEFINE(51);
+MUL_x2_DEFINE(52); MUL_x2_DEFINE(53); MUL_x2_DEFINE(54); MUL_x2_DEFINE(55);
+MUL_x2_DEFINE(56); MUL_x2_DEFINE(57); MUL_x2_DEFINE(58); MUL_x2_DEFINE(59);
+MUL_x2_DEFINE(60); MUL_x2_DEFINE(61); MUL_x2_DEFINE(62); MUL_x2_DEFINE(63);
+MUL_x2_DEFINE(64); MUL_x2_DEFINE(65); MUL_x2_DEFINE(66); MUL_x2_DEFINE(67);
+MUL_x2_DEFINE(68); MUL_x2_DEFINE(69); MUL_x2_DEFINE(70); MUL_x2_DEFINE(71);
+MUL_x2_DEFINE(72); MUL_x2_DEFINE(73); MUL_x2_DEFINE(74); MUL_x2_DEFINE(75);
+MUL_x2_DEFINE(76); MUL_x2_DEFINE(77); MUL_x2_DEFINE(78); MUL_x2_DEFINE(79);
+MUL_x2_DEFINE(80); MUL_x2_DEFINE(81); MUL_x2_DEFINE(82); MUL_x2_DEFINE(83);
+MUL_x2_DEFINE(84); MUL_x2_DEFINE(85); MUL_x2_DEFINE(86); MUL_x2_DEFINE(87);
+MUL_x2_DEFINE(88); MUL_x2_DEFINE(89); MUL_x2_DEFINE(90); MUL_x2_DEFINE(91);
+MUL_x2_DEFINE(92); MUL_x2_DEFINE(93); MUL_x2_DEFINE(94); MUL_x2_DEFINE(95);
+MUL_x2_DEFINE(96); MUL_x2_DEFINE(97); MUL_x2_DEFINE(98); MUL_x2_DEFINE(99);
+MUL_x2_DEFINE(100); MUL_x2_DEFINE(101); MUL_x2_DEFINE(102); MUL_x2_DEFINE(103);
+MUL_x2_DEFINE(104); MUL_x2_DEFINE(105); MUL_x2_DEFINE(106); MUL_x2_DEFINE(107);
+MUL_x2_DEFINE(108); MUL_x2_DEFINE(109); MUL_x2_DEFINE(110); MUL_x2_DEFINE(111);
+MUL_x2_DEFINE(112); MUL_x2_DEFINE(113); MUL_x2_DEFINE(114); MUL_x2_DEFINE(115);
+MUL_x2_DEFINE(116); MUL_x2_DEFINE(117); MUL_x2_DEFINE(118); MUL_x2_DEFINE(119);
+MUL_x2_DEFINE(120); MUL_x2_DEFINE(121); MUL_x2_DEFINE(122); MUL_x2_DEFINE(123);
+MUL_x2_DEFINE(124); MUL_x2_DEFINE(125); MUL_x2_DEFINE(126); MUL_x2_DEFINE(127);
+MUL_x2_DEFINE(128); MUL_x2_DEFINE(129); MUL_x2_DEFINE(130); MUL_x2_DEFINE(131);
+MUL_x2_DEFINE(132); MUL_x2_DEFINE(133); MUL_x2_DEFINE(134); MUL_x2_DEFINE(135);
+MUL_x2_DEFINE(136); MUL_x2_DEFINE(137); MUL_x2_DEFINE(138); MUL_x2_DEFINE(139);
+MUL_x2_DEFINE(140); MUL_x2_DEFINE(141); MUL_x2_DEFINE(142); MUL_x2_DEFINE(143);
+MUL_x2_DEFINE(144); MUL_x2_DEFINE(145); MUL_x2_DEFINE(146); MUL_x2_DEFINE(147);
+MUL_x2_DEFINE(148); MUL_x2_DEFINE(149); MUL_x2_DEFINE(150); MUL_x2_DEFINE(151);
+MUL_x2_DEFINE(152); MUL_x2_DEFINE(153); MUL_x2_DEFINE(154); MUL_x2_DEFINE(155);
+MUL_x2_DEFINE(156); MUL_x2_DEFINE(157); MUL_x2_DEFINE(158); MUL_x2_DEFINE(159);
+MUL_x2_DEFINE(160); MUL_x2_DEFINE(161); MUL_x2_DEFINE(162); MUL_x2_DEFINE(163);
+MUL_x2_DEFINE(164); MUL_x2_DEFINE(165); MUL_x2_DEFINE(166); MUL_x2_DEFINE(167);
+MUL_x2_DEFINE(168); MUL_x2_DEFINE(169); MUL_x2_DEFINE(170); MUL_x2_DEFINE(171);
+MUL_x2_DEFINE(172); MUL_x2_DEFINE(173); MUL_x2_DEFINE(174); MUL_x2_DEFINE(175);
+MUL_x2_DEFINE(176); MUL_x2_DEFINE(177); MUL_x2_DEFINE(178); MUL_x2_DEFINE(179);
+MUL_x2_DEFINE(180); MUL_x2_DEFINE(181); MUL_x2_DEFINE(182); MUL_x2_DEFINE(183);
+MUL_x2_DEFINE(184); MUL_x2_DEFINE(185); MUL_x2_DEFINE(186); MUL_x2_DEFINE(187);
+MUL_x2_DEFINE(188); MUL_x2_DEFINE(189); MUL_x2_DEFINE(190); MUL_x2_DEFINE(191);
+MUL_x2_DEFINE(192); MUL_x2_DEFINE(193); MUL_x2_DEFINE(194); MUL_x2_DEFINE(195);
+MUL_x2_DEFINE(196); MUL_x2_DEFINE(197); MUL_x2_DEFINE(198); MUL_x2_DEFINE(199);
+MUL_x2_DEFINE(200); MUL_x2_DEFINE(201); MUL_x2_DEFINE(202); MUL_x2_DEFINE(203);
+MUL_x2_DEFINE(204); MUL_x2_DEFINE(205); MUL_x2_DEFINE(206); MUL_x2_DEFINE(207);
+MUL_x2_DEFINE(208); MUL_x2_DEFINE(209); MUL_x2_DEFINE(210); MUL_x2_DEFINE(211);
+MUL_x2_DEFINE(212); MUL_x2_DEFINE(213); MUL_x2_DEFINE(214); MUL_x2_DEFINE(215);
+MUL_x2_DEFINE(216); MUL_x2_DEFINE(217); MUL_x2_DEFINE(218); MUL_x2_DEFINE(219);
+MUL_x2_DEFINE(220); MUL_x2_DEFINE(221); MUL_x2_DEFINE(222); MUL_x2_DEFINE(223);
+MUL_x2_DEFINE(224); MUL_x2_DEFINE(225); MUL_x2_DEFINE(226); MUL_x2_DEFINE(227);
+MUL_x2_DEFINE(228); MUL_x2_DEFINE(229); MUL_x2_DEFINE(230); MUL_x2_DEFINE(231);
+MUL_x2_DEFINE(232); MUL_x2_DEFINE(233); MUL_x2_DEFINE(234); MUL_x2_DEFINE(235);
+MUL_x2_DEFINE(236); MUL_x2_DEFINE(237); MUL_x2_DEFINE(238); MUL_x2_DEFINE(239);
+MUL_x2_DEFINE(240); MUL_x2_DEFINE(241); MUL_x2_DEFINE(242); MUL_x2_DEFINE(243);
+MUL_x2_DEFINE(244); MUL_x2_DEFINE(245); MUL_x2_DEFINE(246); MUL_x2_DEFINE(247);
+MUL_x2_DEFINE(248); MUL_x2_DEFINE(249); MUL_x2_DEFINE(250); MUL_x2_DEFINE(251);
+MUL_x2_DEFINE(252); MUL_x2_DEFINE(253); MUL_x2_DEFINE(254); MUL_x2_DEFINE(255);
+
+
+typedef void (*mul_fn_ptr_t)(void);
+
+static const mul_fn_ptr_t __attribute__((aligned(256)))
+gf_x2_mul_fns[256] = {
+ mul_x2_0, mul_x2_1, mul_x2_2, mul_x2_3, mul_x2_4, mul_x2_5,
+ mul_x2_6, mul_x2_7, mul_x2_8, mul_x2_9, mul_x2_10, mul_x2_11,
+ mul_x2_12, mul_x2_13, mul_x2_14, mul_x2_15, mul_x2_16, mul_x2_17,
+ mul_x2_18, mul_x2_19, mul_x2_20, mul_x2_21, mul_x2_22, mul_x2_23,
+ mul_x2_24, mul_x2_25, mul_x2_26, mul_x2_27, mul_x2_28, mul_x2_29,
+ mul_x2_30, mul_x2_31, mul_x2_32, mul_x2_33, mul_x2_34, mul_x2_35,
+ mul_x2_36, mul_x2_37, mul_x2_38, mul_x2_39, mul_x2_40, mul_x2_41,
+ mul_x2_42, mul_x2_43, mul_x2_44, mul_x2_45, mul_x2_46, mul_x2_47,
+ mul_x2_48, mul_x2_49, mul_x2_50, mul_x2_51, mul_x2_52, mul_x2_53,
+ mul_x2_54, mul_x2_55, mul_x2_56, mul_x2_57, mul_x2_58, mul_x2_59,
+ mul_x2_60, mul_x2_61, mul_x2_62, mul_x2_63, mul_x2_64, mul_x2_65,
+ mul_x2_66, mul_x2_67, mul_x2_68, mul_x2_69, mul_x2_70, mul_x2_71,
+ mul_x2_72, mul_x2_73, mul_x2_74, mul_x2_75, mul_x2_76, mul_x2_77,
+ mul_x2_78, mul_x2_79, mul_x2_80, mul_x2_81, mul_x2_82, mul_x2_83,
+ mul_x2_84, mul_x2_85, mul_x2_86, mul_x2_87, mul_x2_88, mul_x2_89,
+ mul_x2_90, mul_x2_91, mul_x2_92, mul_x2_93, mul_x2_94, mul_x2_95,
+ mul_x2_96, mul_x2_97, mul_x2_98, mul_x2_99, mul_x2_100, mul_x2_101,
+ mul_x2_102, mul_x2_103, mul_x2_104, mul_x2_105, mul_x2_106, mul_x2_107,
+ mul_x2_108, mul_x2_109, mul_x2_110, mul_x2_111, mul_x2_112, mul_x2_113,
+ mul_x2_114, mul_x2_115, mul_x2_116, mul_x2_117, mul_x2_118, mul_x2_119,
+ mul_x2_120, mul_x2_121, mul_x2_122, mul_x2_123, mul_x2_124, mul_x2_125,
+ mul_x2_126, mul_x2_127, mul_x2_128, mul_x2_129, mul_x2_130, mul_x2_131,
+ mul_x2_132, mul_x2_133, mul_x2_134, mul_x2_135, mul_x2_136, mul_x2_137,
+ mul_x2_138, mul_x2_139, mul_x2_140, mul_x2_141, mul_x2_142, mul_x2_143,
+ mul_x2_144, mul_x2_145, mul_x2_146, mul_x2_147, mul_x2_148, mul_x2_149,
+ mul_x2_150, mul_x2_151, mul_x2_152, mul_x2_153, mul_x2_154, mul_x2_155,
+ mul_x2_156, mul_x2_157, mul_x2_158, mul_x2_159, mul_x2_160, mul_x2_161,
+ mul_x2_162, mul_x2_163, mul_x2_164, mul_x2_165, mul_x2_166, mul_x2_167,
+ mul_x2_168, mul_x2_169, mul_x2_170, mul_x2_171, mul_x2_172, mul_x2_173,
+ mul_x2_174, mul_x2_175, mul_x2_176, mul_x2_177, mul_x2_178, mul_x2_179,
+ mul_x2_180, mul_x2_181, mul_x2_182, mul_x2_183, mul_x2_184, mul_x2_185,
+ mul_x2_186, mul_x2_187, mul_x2_188, mul_x2_189, mul_x2_190, mul_x2_191,
+ mul_x2_192, mul_x2_193, mul_x2_194, mul_x2_195, mul_x2_196, mul_x2_197,
+ mul_x2_198, mul_x2_199, mul_x2_200, mul_x2_201, mul_x2_202, mul_x2_203,
+ mul_x2_204, mul_x2_205, mul_x2_206, mul_x2_207, mul_x2_208, mul_x2_209,
+ mul_x2_210, mul_x2_211, mul_x2_212, mul_x2_213, mul_x2_214, mul_x2_215,
+ mul_x2_216, mul_x2_217, mul_x2_218, mul_x2_219, mul_x2_220, mul_x2_221,
+ mul_x2_222, mul_x2_223, mul_x2_224, mul_x2_225, mul_x2_226, mul_x2_227,
+ mul_x2_228, mul_x2_229, mul_x2_230, mul_x2_231, mul_x2_232, mul_x2_233,
+ mul_x2_234, mul_x2_235, mul_x2_236, mul_x2_237, mul_x2_238, mul_x2_239,
+ mul_x2_240, mul_x2_241, mul_x2_242, mul_x2_243, mul_x2_244, mul_x2_245,
+ mul_x2_246, mul_x2_247, mul_x2_248, mul_x2_249, mul_x2_250, mul_x2_251,
+ mul_x2_252, mul_x2_253, mul_x2_254, mul_x2_255
+};
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ COPY(R_01(r), _mul_x2_in); \
+ gf_x2_mul_fns[c](); \
+ COPY(_mul_x2_acc, R_01(r)); \
+ COPY(R_23(r), _mul_x2_in); \
+ gf_x2_mul_fns[c](); \
+ COPY(_mul_x2_acc, R_23(r)); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() MUL2_SETUP()
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 4
+#define REC_PQ_DEFINE() MUL2_SETUP()
+#define REC_PQ_X 0, 1, 2, 3
+#define REC_PQ_Y 4, 5, 6, 7
+#define REC_PQ_T 8, 9, 10, 11
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 4
+#define REC_PR_DEFINE() MUL2_SETUP()
+#define REC_PR_X 0, 1, 2, 3
+#define REC_PR_Y 4, 5, 6, 7
+#define REC_PR_T 8, 9, 10, 11
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 4
+#define REC_QR_DEFINE() MUL2_SETUP()
+#define REC_QR_X 0, 1, 2, 3
+#define REC_QR_Y 4, 5, 6, 7
+#define REC_QR_T 8, 9, 10, 11
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 4
+#define REC_PQR_DEFINE() MUL2_SETUP()
+#define REC_PQR_X 0, 1, 2, 3
+#define REC_PQR_Y 4, 5, 6, 7
+#define REC_PQR_Z 8, 9, 10, 11
+#define REC_PQR_XS 12, 13, 14, 15
+#define REC_PQR_YS 16, 17, 18, 19
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(avx512f);
+DEFINE_REC_METHODS(avx512f);
+
+static boolean_t
+raidz_will_avx512f_work(void)
+{
+ return (kfpu_allowed() && zfs_avx_available() &&
+ zfs_avx2_available() && zfs_avx512f_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(avx512f),
+ .rec = RAIDZ_REC_METHODS(avx512f),
+ .is_supported = &raidz_will_avx512f_work,
+ .name = "avx512f"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h
new file mode 100644
index 000000000000..35e016fc65a5
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h
@@ -0,0 +1,1502 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#ifndef _VDEV_RAIDZ_MATH_IMPL_H
+#define _VDEV_RAIDZ_MATH_IMPL_H
+
+#include <sys/types.h>
+#include <sys/vdev_raidz_impl.h>
+
+#define raidz_inline inline __attribute__((always_inline))
+#ifndef noinline
+#define noinline __attribute__((noinline))
+#endif
+
+/*
+ * Functions calculate multiplication constants for data reconstruction.
+ * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and
+ * used parity columns for reconstruction.
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ * @coeff output array of coefficients. Array must be provided by
+ * user and must hold minimum MUL_CNT values.
+ */
+static noinline void
+raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+
+ coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1));
+}
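+
+/*
+ * Worked example for the Q coefficient above (a sketch of the reasoning,
+ * assuming gf_exp2(n) evaluates 2^n in GF(2^8)): the data column at index x
+ * is weighted by 2^(ncols - x - 1) in the Q parity, so once the Q syndrome
+ * has cancelled every other column we are left with
+ *
+ *	Qsyn + Q = 2^(ncols - x - 1) * Dx
+ *
+ * and Dx is recovered by multiplying with the inverse weight.  Because the
+ * non-zero elements of GF(2^8) form a multiplicative group of order 255,
+ * that inverse is 2^(255 - (ncols - x - 1)), which is exactly the constant
+ * stored in coeff[MUL_Q_X].
+ */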
+
+static noinline void
+raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+
+ coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1));
+}
+
+static noinline void
+raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+ gf_t a, b, e;
+
+ a = gf_exp2(x + 255 - y);
+ b = gf_exp2(255 - (ncols - x - 1));
+ e = a ^ 0x01;
+
+ coeff[MUL_PQ_X] = gf_div(a, e);
+ coeff[MUL_PQ_Y] = gf_div(b, e);
+}
+
+static noinline void
+raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+
+ gf_t a, b, e;
+
+ a = gf_exp4(x + 255 - y);
+ b = gf_exp4(255 - (ncols - x - 1));
+ e = a ^ 0x01;
+
+ coeff[MUL_PR_X] = gf_div(a, e);
+ coeff[MUL_PR_Y] = gf_div(b, e);
+}
+
+static noinline void
+raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+
+ gf_t nx, ny, nxxy, nxyy, d;
+
+ nx = gf_exp2(ncols - x - 1);
+ ny = gf_exp2(ncols - y - 1);
+ nxxy = gf_mul(gf_mul(nx, nx), ny);
+ nxyy = gf_mul(gf_mul(nx, ny), ny);
+ d = nxxy ^ nxyy;
+
+ coeff[MUL_QR_XQ] = ny;
+ coeff[MUL_QR_X] = gf_div(ny, d);
+ coeff[MUL_QR_YQ] = nx;
+ coeff[MUL_QR_Y] = gf_div(nx, d);
+}
+
+static noinline void
+raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+ const unsigned z = tgtidx[TARGET_Z];
+
+ gf_t nx, ny, nz, nxx, nyy, nzz, nyyz, nyzz, xd, yd;
+
+ nx = gf_exp2(ncols - x - 1);
+ ny = gf_exp2(ncols - y - 1);
+ nz = gf_exp2(ncols - z - 1);
+
+ nxx = gf_exp4(ncols - x - 1);
+ nyy = gf_exp4(ncols - y - 1);
+ nzz = gf_exp4(ncols - z - 1);
+
+ nyyz = gf_mul(gf_mul(ny, nz), ny);
+ nyzz = gf_mul(nzz, ny);
+
+ xd = gf_mul(nxx, ny) ^ gf_mul(nx, nyy) ^ nyyz ^
+ gf_mul(nxx, nz) ^ gf_mul(nzz, nx) ^ nyzz;
+
+ yd = gf_inv(ny ^ nz);
+
+ coeff[MUL_PQR_XP] = gf_div(nyyz ^ nyzz, xd);
+ coeff[MUL_PQR_XQ] = gf_div(nyy ^ nzz, xd);
+ coeff[MUL_PQR_XR] = gf_div(ny ^ nz, xd);
+ coeff[MUL_PQR_YU] = nx;
+ coeff[MUL_PQR_YP] = gf_mul(nz, yd);
+ coeff[MUL_PQR_YQ] = yd;
+}
+
+/*
+ * Method for zeroing a buffer (can be implemented using SIMD).
+ * This method is used by multiple gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @dsize Destination buffer size
+ * @private Unused
+ */
+static int
+raidz_zero_abd_cb(void *dc, size_t dsize, void *private)
+{
+ v_t *dst = (v_t *)dc;
+ size_t i;
+
+ ZERO_DEFINE();
+
+ (void) private; /* unused */
+
+ ZERO(ZERO_D);
+
+ for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) {
+ STORE(dst + i, ZERO_D);
+ STORE(dst + i + ZERO_STRIDE, ZERO_D);
+ }
+
+ return (0);
+}
+
+#define raidz_zero(dabd, size) \
+{ \
+ abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL); \
+}
+
+/*
+ * Method for copying one buffer into another (can be implemented using SIMD).
+ * This method is used by multiple gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @sc Source buffer
+ * @size Size of both buffers
+ * @private Unused
+ */
+static int
+raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private)
+{
+ v_t *dst = (v_t *)dc;
+ const v_t *src = (v_t *)sc;
+ size_t i;
+
+ COPY_DEFINE();
+
+ (void) private; /* unused */
+
+ for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) {
+ LOAD(src + i, COPY_D);
+ STORE(dst + i, COPY_D);
+
+ LOAD(src + i + COPY_STRIDE, COPY_D);
+ STORE(dst + i + COPY_STRIDE, COPY_D);
+ }
+
+ return (0);
+}
+
+
+#define raidz_copy(dabd, sabd, size) \
+{ \
+ abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\
+}
+
+/*
+ * Method for adding (XORing) two buffers.
+ * Source and destination are XORed together and the result is stored in the
+ * destination buffer. This method is used by multiple gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @sc Source buffer
+ * @size Size of both buffers
+ * @private Unused
+ */
+static int
+raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private)
+{
+ v_t *dst = (v_t *)dc;
+ const v_t *src = (v_t *)sc;
+ size_t i;
+
+ ADD_DEFINE();
+
+ (void) private; /* unused */
+
+ for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) {
+ LOAD(dst + i, ADD_D);
+ XOR_ACC(src + i, ADD_D);
+ STORE(dst + i, ADD_D);
+
+ LOAD(dst + i + ADD_STRIDE, ADD_D);
+ XOR_ACC(src + i + ADD_STRIDE, ADD_D);
+ STORE(dst + i + ADD_STRIDE, ADD_D);
+ }
+
+ return (0);
+}
+
+#define raidz_add(dabd, sabd, size) \
+{ \
+ abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\
+}
+
+/*
+ * Method for multiplying a buffer with a constant in GF(2^8).
+ * Symbols from the buffer are multiplied by the constant and the result is
+ * stored back in the same buffer.
+ *
+ * @dc In/Out data buffer.
+ * @size Size of the buffer
+ * @private pointer to the multiplication constant (unsigned)
+ */
+static int
+raidz_mul_abd_cb(void *dc, size_t size, void *private)
+{
+ const unsigned mul = *((unsigned *)private);
+ v_t *d = (v_t *)dc;
+ size_t i;
+
+ MUL_DEFINE();
+
+ for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) {
+ LOAD(d + i, MUL_D);
+ MUL(mul, MUL_D);
+ STORE(d + i, MUL_D);
+
+ LOAD(d + i + MUL_STRIDE, MUL_D);
+ MUL(mul, MUL_D);
+ STORE(d + i + MUL_STRIDE, MUL_D);
+ }
+
+ return (0);
+}
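+
+/*
+ * Usage sketch (an illustration mirroring how the reconstruct functions
+ * below drive this callback): the multiplication constant travels through
+ * the opaque private pointer of abd_iterate_func(), e.g.
+ *
+ *	unsigned coeff[MUL_CNT];
+ *	raidz_rec_q_coeff(rr, tgtidx, coeff);
+ *	abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
+ */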
+
+
+/*
+ * Syndrome generation/update macros
+ *
+ * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros
+ */
+#define P_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define Q_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ MUL2(T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define Q_SYNDROME(T, t) \
+{ \
+ LOAD((t), T); \
+ MUL2(T); \
+ STORE((t), T); \
+}
+
+#define R_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ MUL4(T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define R_SYNDROME(T, t) \
+{ \
+ LOAD((t), T); \
+ MUL4(T); \
+ STORE((t), T); \
+}
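+
+/*
+ * Scalar view of the macros above (an illustrative sketch; p, q, r and d
+ * stand for one byte of the P, Q, R and data columns, and gf_mul() is the
+ * generic GF(2^8) multiply used by the coefficient functions above):
+ *
+ *	p ^= d;				P_D_SYNDROME
+ *	q = gf_mul(q, 2) ^ d;		Q_D_SYNDROME
+ *	q = gf_mul(q, 2);		Q_SYNDROME (past a short data column)
+ *	r = gf_mul(r, 4) ^ d;		R_D_SYNDROME
+ *	r = gf_mul(r, 4);		R_SYNDROME (past a short data column)
+ *
+ * Applying the *_D_SYNDROME update once per data column, left to right,
+ * builds the parities Horner-style, so the column at index c ends up
+ * weighted by 2^(ncols - c - 1) in Q and by 4^(ncols - c - 1) in R.
+ */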
+
+
+/*
+ * PARITY CALCULATION
+ *
+ * Macros *_SYNDROME are used for parity/syndrome calculation.
+ * *_D_SYNDROME() macros calculate the syndrome from offset 0 to the length
+ * of the data column, while *_SYNDROME() macros only keep updating the
+ * parity/syndrome past the end of a shorter data column.
+ *
+ * P parity is calculated using raidz_add_abd().
+ */
+
+/*
+ * Generate P parity (RAIDZ1)
+ *
+ * @rr RAIDZ row
+ */
+static raidz_inline void
+raidz_generate_p_impl(raidz_row_t * const rr)
+{
+ size_t c;
+ const size_t ncols = rr->rr_cols;
+ const size_t psize = rr->rr_col[CODE_P].rc_size;
+ abd_t *pabd = rr->rr_col[CODE_P].rc_abd;
+ size_t size;
+ abd_t *dabd;
+
+ raidz_math_begin();
+
+ /* start with first data column */
+ raidz_copy(pabd, rr->rr_col[1].rc_abd, psize);
+
+ for (c = 2; c < ncols; c++) {
+ dabd = rr->rr_col[c].rc_abd;
+ size = rr->rr_col[c].rc_size;
+
+ /* add data column */
+ raidz_add(pabd, dabd, size);
+ }
+
+ raidz_math_end();
+}
+
+
+/*
+ * Generate PQ parity (RAIDZ2)
+ * The function is called per data column.
+ *
+ * @c array of pointers to parity (code) columns
+ * @dc pointer to data column
+ * @csize size of parity columns
+ * @dsize size of data column
+ */
+static void
+raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
+ const size_t dsize)
+{
+ v_t *p = (v_t *)c[0];
+ v_t *q = (v_t *)c[1];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const qend = q + (csize / sizeof (v_t));
+
+ GEN_PQ_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE,
+ q += GEN_PQ_STRIDE) {
+ LOAD(d, GEN_PQ_D);
+ P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p);
+ Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q);
+ }
+ for (; q < qend; q += GEN_PQ_STRIDE) {
+ Q_SYNDROME(GEN_PQ_C, q);
+ }
+}
+
+
+/*
+ * Generate PQ parity (RAIDZ2)
+ *
+ * @rr RAIDZ row
+ */
+static raidz_inline void
+raidz_generate_pq_impl(raidz_row_t * const rr)
+{
+ size_t c;
+ const size_t ncols = rr->rr_cols;
+ const size_t csize = rr->rr_col[CODE_P].rc_size;
+ size_t dsize;
+ abd_t *dabd;
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd
+ };
+
+ raidz_math_begin();
+
+ raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize);
+ raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize);
+
+ for (c = 3; c < ncols; c++) {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+
+ abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
+ raidz_gen_pq_add);
+ }
+
+ raidz_math_end();
+}
+
+
+/*
+ * Generate PQR parity (RAIDZ3)
+ * The function is called per data column.
+ *
+ * @c array of pointers to parity (code) columns
+ * @dc pointer to data column
+ * @csize size of parity columns
+ * @dsize size of data column
+ */
+static void
+raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
+ const size_t dsize)
+{
+ v_t *p = (v_t *)c[0];
+ v_t *q = (v_t *)c[1];
+ v_t *r = (v_t *)c[CODE_R];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const qend = q + (csize / sizeof (v_t));
+
+ GEN_PQR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE,
+ q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) {
+ LOAD(d, GEN_PQR_D);
+ P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p);
+ Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q);
+ R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r);
+ }
+ for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) {
+ Q_SYNDROME(GEN_PQR_C, q);
+ R_SYNDROME(GEN_PQR_C, r);
+ }
+}
+
+
+/*
+ * Generate PQR parity (RAIDZ3)
+ *
+ * @rr RAIDZ row
+ */
+static raidz_inline void
+raidz_generate_pqr_impl(raidz_row_t * const rr)
+{
+ size_t c;
+ const size_t ncols = rr->rr_cols;
+ const size_t csize = rr->rr_col[CODE_P].rc_size;
+ size_t dsize;
+ abd_t *dabd;
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
+ };
+
+ raidz_math_begin();
+
+ raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize);
+ raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize);
+ raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize);
+
+ for (c = 4; c < ncols; c++) {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+
+ abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
+ raidz_gen_pqr_add);
+ }
+
+ raidz_math_end();
+}
+
+
+/*
+ * DATA RECONSTRUCTION
+ *
+ * Data reconstruction process consists of two phases:
+ * - Syndrome calculation
+ * - Data reconstruction
+ *
+ * The syndrome is calculated by generating parity using the available data
+ * columns and zeros in place of the erasures. The existing parity is then
+ * added to the corresponding [P|Q|R]syn value, per the equations:
+ * P = Psyn + Dx + Dy + Dz
+ * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz
+ * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz
+ *
+ * In the data reconstruction phase, the corresponding equations are solved
+ * for the missing data (Dx, Dy, Dz). This generally involves multiplying
+ * known symbols by a coefficient and adding them together. The
+ * multiplication constants are calculated ahead of the operation in the
+ * raidz_rec_[q|r|pq|pr|qr|pqr]_coeff() functions.
+ *
+ * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big"
+ * and "short" columns.
+ * For this reason, reconstruction is performed in a minimum of
+ * two steps: first from offset 0 to short_size, then from short_size to
+ * big_size. Calculation functions REC_[*]_BLOCK() are implemented to work
+ * over both ranges. The split also enables removal of conditional expressions
+ * from loop bodies, improving throughput of SIMD implementations.
+ * For the best performance, all functions marked with raidz_inline attribute
+ * must be inlined by compiler.
+ *
+ * parity data
+ * columns columns
+ * <----------> <------------------>
+ * x y <----+ missing columns (x, y)
+ * | |
+ * +---+---+---+---+-v-+---+-v-+---+ ^ 0
+ * | | | | | | | | | |
+ * | | | | | | | | | |
+ * | P | Q | R | D | D | D | D | D | |
+ * | | | | 0 | 1 | 2 | 3 | 4 | |
+ * | | | | | | | | | v
+ * | | | | | +---+---+---+ ^ short_size
+ * | | | | | | |
+ * +---+---+---+---+---+ v big_size
+ * <------------------> <---------->
+ * big columns short columns
+ *
+ */
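+
+/*
+ * Scalar sketch of the single-erasure flow described above (illustrative
+ * only: it works on one byte lane instead of abd buffers, ignores the
+ * big/short column split, and the function name and parameters are made up
+ * for this sketch; gf_mul() and gf_exp2() are the helpers used by the
+ * coefficient functions above):
+ */
+static inline void
+raidz_rec_q_scalar_sketch(uint8_t **col, size_t ncols, size_t firstdc,
+    size_t x, const uint8_t *qpar, size_t len)
+{
+	const uint8_t qcoeff = gf_exp2(255 - (ncols - x - 1));
+	size_t c, i;
+
+	for (i = 0; i < len; i++) {
+		uint8_t qsyn = 0;
+
+		/* Q parity over available data, zero at the erasure */
+		for (c = firstdc; c < ncols; c++)
+			qsyn = gf_mul(qsyn, 2) ^ (c == x ? 0 : col[c][i]);
+
+		/* add Q and scale by the precomputed coefficient */
+		col[x][i] = gf_mul(qsyn ^ qpar[i], qcoeff);
+	}
+}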
+
+
+
+
+/*
+ * Reconstruct single data column using P parity
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method not applicable
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ size_t size;
+ abd_t *dabd;
+
+ if (xabd == NULL)
+ return (1 << CODE_P);
+
+ raidz_math_begin();
+
+ /* copy P into target */
+ raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize);
+
+ /* generate p_syndrome */
+ for (c = firstdc; c < ncols; c++) {
+ if (c == x)
+ continue;
+
+ dabd = rr->rr_col[c].rc_abd;
+ size = MIN(rr->rr_col[c].rc_size, xsize);
+
+ raidz_add(xabd, dabd, size);
+ }
+
+ raidz_math_end();
+
+ return (1 << CODE_P);
+}
+
+
+/*
+ * Generate Q syndrome (Qsyn)
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @xsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)xc[TARGET_X];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const xend = x + (xsize / sizeof (v_t));
+
+ SYN_Q_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) {
+ LOAD(d, SYN_Q_D);
+ Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x);
+ }
+ for (; x < xend; x += SYN_STRIDE) {
+ Q_SYNDROME(SYN_Q_X, x);
+ }
+}
+
+
+/*
+ * Reconstruct single data column using Q parity
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method raidz_mul_abd_cb()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ const size_t xsize = rr->rr_col[x].rc_size;
+ abd_t *tabds[] = { xabd };
+
+ if (xabd == NULL)
+ return (1 << CODE_Q);
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_q_coeff(rr, tgtidx, coeff);
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ }
+
+ /* generate q_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
+ raidz_syn_q_abd);
+ }
+
+ /* add Q to the syndrome */
+ raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize);
+
+ /* transform the syndrome */
+ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff);
+
+ raidz_math_end();
+
+ return (1 << CODE_Q);
+}
+
+
+/*
+ * Generate R syndrome (Rsyn)
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)xc[TARGET_X];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+
+ SYN_R_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) {
+ LOAD(d, SYN_R_D);
+ R_D_SYNDROME(SYN_R_D, SYN_R_X, x);
+ }
+ for (; x < xend; x += SYN_STRIDE) {
+ R_SYNDROME(SYN_R_X, x);
+ }
+}
+
+
+/*
+ * Reconstruct single data column using R parity
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method raidz_mul_abd_cb()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *tabds[] = { xabd };
+
+ if (xabd == NULL)
+ return (1 << CODE_R);
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_r_coeff(rr, tgtidx, coeff);
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ }
+
+
+	/* generate r_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
+ raidz_syn_r_abd);
+ }
+
+ /* add R to the syndrome */
+ raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize);
+
+ /* transform the syndrome */
+ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
+
+ raidz_math_end();
+
+ return (1 << CODE_R);
+}
+
+
+/*
+ * Generate P and Q syndromes
+ *
+ * @tc	array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)tc[TARGET_X];
+ v_t *y = (v_t *)tc[TARGET_Y];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const yend = y + (tsize / sizeof (v_t));
+
+ SYN_PQ_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+ LOAD(d, SYN_PQ_D);
+ P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x);
+ Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y);
+ }
+ for (; y < yend; y += SYN_STRIDE) {
+ Q_SYNDROME(SYN_PQ_X, y);
+ }
+}
+
+/*
+ * Reconstruct data using PQ parity and PQ syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_pq_abd(void **tc, const size_t tsize, void **c,
+ const unsigned *mul)
+{
+ v_t *x = (v_t *)tc[TARGET_X];
+ v_t *y = (v_t *)tc[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
+
+ REC_PQ_DEFINE();
+
+ for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE,
+ p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) {
+ LOAD(x, REC_PQ_X);
+ LOAD(y, REC_PQ_Y);
+
+ XOR_ACC(p, REC_PQ_X);
+ XOR_ACC(q, REC_PQ_Y);
+
+ /* Save Pxy */
+ COPY(REC_PQ_X, REC_PQ_T);
+
+ /* Calc X */
+ MUL(mul[MUL_PQ_X], REC_PQ_X);
+ MUL(mul[MUL_PQ_Y], REC_PQ_Y);
+ XOR(REC_PQ_Y, REC_PQ_X);
+ STORE(x, REC_PQ_X);
+
+ /* Calc Y */
+ XOR(REC_PQ_T, REC_PQ_X);
+ STORE(y, REC_PQ_X);
+ }
+}
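+
+/*
+ * Why the two constants recover X and Y (a sketch of the algebra, writing
+ * qx = 2^(ncols - x - 1) and qy = 2^(ncols - y - 1) for the Q weights of
+ * the two missing columns):
+ *
+ *	Pxy = X + Y
+ *	Qxy = qx * X + qy * Y
+ *
+ * Eliminating Y gives X = (qy * Pxy + Qxy) / (qx + qy), i.e.
+ *
+ *	X = Pxy * mul[MUL_PQ_X] + Qxy * mul[MUL_PQ_Y]
+ *
+ * because raidz_rec_pq_coeff() computes a = qy / qx, b = 1 / qx and
+ * e = a + 1, so that a / e = qy / (qx + qy) and b / e = 1 / (qx + qy).
+ * Y then follows as Y = Pxy + X, the final XOR in the loop above.  The PR
+ * case below is identical with the R weights 4^(ncols - c - 1); QR and PQR
+ * eliminate unknowns the same way from their own sets of equations.
+ */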
+
+
+/*
+ * Reconstruct two data columns using PQ parity
+ *
+ * @syn_method raidz_syn_pq_abd()
+ * @rec_method raidz_rec_pq_abd()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd
+ };
+
+ if (xabd == NULL)
+ return ((1 << CODE_P) | (1 << CODE_Q));
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_pq_coeff(rr, tgtidx, coeff);
+
+ /*
+	 * Check if one of the targets is shorter than the other.
+	 * In this case, the shorter target needs to be replaced with a
+	 * new buffer so that the syndrome can be calculated over the
+	 * full length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
+
+	/* generate pq_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_pq_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff);
+
+ /* Copy shorter targets back to the original abd buffer */
+ if (ysize < xsize)
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+
+ return ((1 << CODE_P) | (1 << CODE_Q));
+}
+
+
+/*
+ * Generate P and R syndromes
+ *
+ * @c	array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)c[TARGET_X];
+ v_t *y = (v_t *)c[TARGET_Y];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const yend = y + (tsize / sizeof (v_t));
+
+ SYN_PR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+ LOAD(d, SYN_PR_D);
+ P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x);
+ R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y);
+ }
+ for (; y < yend; y += SYN_STRIDE) {
+ R_SYNDROME(SYN_PR_X, y);
+ }
+}
+
+/*
+ * Reconstruct data using PR parity and PR syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_pr_abd(void **t, const size_t tsize, void **c,
+ const unsigned *mul)
+{
+ v_t *x = (v_t *)t[TARGET_X];
+ v_t *y = (v_t *)t[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
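+	/* Note: the caller passes cabds[] = { P, R }, so q points at R here. */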
+
+ REC_PR_DEFINE();
+
+ for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE,
+ p += REC_PR_STRIDE, q += REC_PR_STRIDE) {
+ LOAD(x, REC_PR_X);
+ LOAD(y, REC_PR_Y);
+ XOR_ACC(p, REC_PR_X);
+ XOR_ACC(q, REC_PR_Y);
+
+ /* Save Pxy */
+ COPY(REC_PR_X, REC_PR_T);
+
+ /* Calc X */
+ MUL(mul[MUL_PR_X], REC_PR_X);
+ MUL(mul[MUL_PR_Y], REC_PR_Y);
+ XOR(REC_PR_Y, REC_PR_X);
+ STORE(x, REC_PR_X);
+
+ /* Calc Y */
+ XOR(REC_PR_T, REC_PR_X);
+ STORE(y, REC_PR_X);
+ }
+}
+
+
+/*
+ * Reconstruct two data columns using PR parity
+ *
+ * @syn_method raidz_syn_pr_abd()
+ * @rec_method raidz_rec_pr_abd()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[0];
+ const size_t y = tgtidx[1];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
+ };
+
+ if (xabd == NULL)
+ return ((1 << CODE_P) | (1 << CODE_R));
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_pr_coeff(rr, tgtidx, coeff);
+
+ /*
+	 * Check if one of the targets is shorter than the other.
+	 * It needs to be replaced with a new buffer so that the syndrome
+	 * can be calculated over the full length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
+
+	/* generate pr_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_pr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff);
+
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+
+ return ((1 << CODE_P) | (1 << CODE_R));
+}
+
+
+/*
+ * Generate Q and R syndromes
+ *
+ * @c	array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)c[TARGET_X];
+ v_t *y = (v_t *)c[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+
+ SYN_QR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+		LOAD(d, SYN_QR_D);
+ Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x);
+ R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y);
+ }
+ for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) {
+ Q_SYNDROME(SYN_QR_X, x);
+ R_SYNDROME(SYN_QR_X, y);
+ }
+}
+
+
+/*
+ * Reconstruct data using QR parity and QR syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_qr_abd(void **t, const size_t tsize, void **c,
+ const unsigned *mul)
+{
+ v_t *x = (v_t *)t[TARGET_X];
+ v_t *y = (v_t *)t[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
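+	/*
+	 * Note: the caller passes cabds[] = { Q, R }, so despite the CODE_P
+	 * and CODE_Q indices, p and q point at the Q and R parity columns.
+	 */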
+
+ REC_QR_DEFINE();
+
+ for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE,
+ p += REC_QR_STRIDE, q += REC_QR_STRIDE) {
+ LOAD(x, REC_QR_X);
+ LOAD(y, REC_QR_Y);
+
+ XOR_ACC(p, REC_QR_X);
+ XOR_ACC(q, REC_QR_Y);
+
+		/* Save Qxy */
+ COPY(REC_QR_X, REC_QR_T);
+
+ /* Calc X */
+ MUL(mul[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */
+ XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */
+ MUL(mul[MUL_QR_X], REC_QR_X); /* X = X * xm */
+ STORE(x, REC_QR_X);
+
+ /* Calc Y */
+ MUL(mul[MUL_QR_YQ], REC_QR_T); /* X = Q * xqm */
+ XOR(REC_QR_Y, REC_QR_T); /* X = R ^ X */
+ MUL(mul[MUL_QR_Y], REC_QR_T); /* X = X * xm */
+ STORE(y, REC_QR_T);
+ }
+}
+
+
+/*
+ * Reconstruct two data columns using QR parity
+ *
+ * @syn_method raidz_syn_qr_abd()
+ * @rec_method raidz_rec_qr_abd()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_Q].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
+ };
+
+ if (xabd == NULL)
+ return ((1 << CODE_Q) | (1 << CODE_R));
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_qr_coeff(rr, tgtidx, coeff);
+
+ /*
+	 * Check if one of the targets is shorter than the other.
+	 * In this case, the shorter target needs to be replaced with a
+	 * new buffer so that the syndrome can be calculated over the
+	 * full length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
+
+	/* generate qr_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_qr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff);
+
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+
+
+ return ((1 << CODE_Q) | (1 << CODE_R));
+}
+
+
+/*
+ * Generate P, Q, and R syndromes
+ *
+ * @c	array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)c[TARGET_X];
+ v_t *y = (v_t *)c[TARGET_Y];
+ v_t *z = (v_t *)c[TARGET_Z];
+ const v_t * const yend = y + (tsize / sizeof (v_t));
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+
+ SYN_PQR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE,
+ z += SYN_STRIDE) {
+ LOAD(d, SYN_PQR_D);
+		P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x);
+ Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y);
+ R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z);
+ }
+ for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) {
+ Q_SYNDROME(SYN_PQR_X, y);
+ R_SYNDROME(SYN_PQR_X, z);
+ }
+}
+
+
+/*
+ * Reconstruct data using PQR parity and PQR syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_pqr_abd(void **t, const size_t tsize, void **c,
+ const unsigned * const mul)
+{
+ v_t *x = (v_t *)t[TARGET_X];
+ v_t *y = (v_t *)t[TARGET_Y];
+ v_t *z = (v_t *)t[TARGET_Z];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
+ const v_t *r = (v_t *)c[CODE_R];
+
+ REC_PQR_DEFINE();
+
+ for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE,
+ z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE,
+ r += REC_PQR_STRIDE) {
+ LOAD(x, REC_PQR_X);
+ LOAD(y, REC_PQR_Y);
+ LOAD(z, REC_PQR_Z);
+
+ XOR_ACC(p, REC_PQR_X);
+ XOR_ACC(q, REC_PQR_Y);
+ XOR_ACC(r, REC_PQR_Z);
+
+ /* Save Pxyz and Qxyz */
+ COPY(REC_PQR_X, REC_PQR_XS);
+ COPY(REC_PQR_Y, REC_PQR_YS);
+
+ /* Calc X */
+ MUL(mul[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */
+ MUL(mul[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */
+ XOR(REC_PQR_Y, REC_PQR_X);
+ MUL(mul[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */
+ XOR(REC_PQR_Z, REC_PQR_X); /* X = Xp + Xq + Xr */
+ STORE(x, REC_PQR_X);
+
+ /* Calc Y */
+ XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */
+ MUL(mul[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */
+ XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */
+ COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */
+ MUL(mul[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */
+ MUL(mul[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */
+ XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */
+ STORE(y, REC_PQR_YS);
+
+ /* Calc Z */
+ XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */
+ STORE(z, REC_PQR_YS);
+ }
+}
+
+
+/*
+ * Reconstruct three data columns using PQR parity
+ *
+ * @syn_method raidz_syn_pqr_abd()
+ * @rec_method raidz_rec_pqr_abd()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t z = tgtidx[TARGET_Z];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ const size_t zsize = rr->rr_col[z].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
+ abd_t *zabd = rr->rr_col[z].rc_abd;
+ abd_t *tabds[] = { xabd, yabd, zabd };
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
+ };
+
+ if (xabd == NULL)
+ return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_pqr_coeff(rr, tgtidx, coeff);
+
+ /*
+	 * Check if some of the targets are shorter than the others.
+	 * In this case, the shorter targets need to be replaced with new
+	 * buffers so that the syndrome can be calculated over the full
+	 * length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+ if (zsize < xsize) {
+ zabd = abd_alloc(xsize, B_FALSE);
+ tabds[2] = zabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ raidz_zero(zabd, xsize);
+ }
+
+	/* generate pqr_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y || c == z) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
+ raidz_syn_pqr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff);
+
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+ if (zsize < xsize)
+ raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+ if (zsize < xsize)
+ abd_free(zabd);
+
+ return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
+}
+
+#endif /* _VDEV_RAIDZ_MATH_IMPL_H */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c
new file mode 100644
index 000000000000..1db2c4cd3a47
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c
@@ -0,0 +1,4337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2019 Romain Dolbeau. All rights reserved.
+ * <romain.dolbeau@european-processor-initiative.eu>
+ */
+
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+
+#if defined(__powerpc__)
+#pragma GCC target("altivec")
+
+#include "vdev_raidz_math_powerpc_altivec_common.h"
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define GEN_P_STRIDE 4
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_Q_STRIDE 4
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_R_STRIDE 4
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQ_STRIDE 4
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQ_STRIDE 2
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PR_STRIDE 4
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PR_STRIDE 2
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_QR_STRIDE 4
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_QR_STRIDE 2
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQR_STRIDE 4
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQR_STRIDE 2
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(powerpc_altivec);
+DEFINE_REC_METHODS(powerpc_altivec);
+
+static boolean_t
+raidz_will_powerpc_altivec_work(void)
+{
+	return (kfpu_allowed() && zfs_altivec_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(powerpc_altivec),
+ .rec = RAIDZ_REC_METHODS(powerpc_altivec),
+ .is_supported = &raidz_will_powerpc_altivec_work,
+ .name = "powerpc_altivec"
+};
+
+#endif /* defined(__powerpc__) */
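[Editor's note, not part of the patch: the block above registers the AltiVec routines as a raidz_impl_ops_t with an is_supported() probe. As a hedged sketch of how such a candidate table could be filtered at runtime, assuming only the types and symbol introduced by this patch; the candidates[] array and pick_raidz_impl() function are hypothetical names, not the selection logic OpenZFS actually ships:]

#include <sys/vdev_raidz_impl.h>	/* raidz_impl_ops_t, from this patch set */

/* Hypothetical candidate list; a real build would list every compiled backend. */
static const raidz_impl_ops_t *const candidates[] = {
	&vdev_raidz_powerpc_altivec_impl,
	/* ... other SIMD and scalar implementations ... */
};

/* Return the first implementation whose runtime probe succeeds. */
static const raidz_impl_ops_t *
pick_raidz_impl(void)
{
	size_t i;

	for (i = 0; i < sizeof (candidates) / sizeof (candidates[0]); i++) {
		if (candidates[i]->is_supported == NULL ||
		    candidates[i]->is_supported())
			return (candidates[i]);
	}
	return (NULL);	/* caller would fall back to a scalar implementation */
}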
+
+
+#if defined(__powerpc__)
+#if defined(_ZFS_LITTLE_ENDIAN) && _LITTLE_ENDIAN
+/* BEGIN CSTYLED */
+const uint8_t
+__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = {
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
+ 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10,
+ 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x11, 0x12, 0x17, 0x14, 0x1d, 0x1e, 0x1b, 0x18,
+ 0x09, 0x0a, 0x0f, 0x0c, 0x05, 0x06, 0x03, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3c, 0x38, 0x34, 0x30, 0x2c, 0x28, 0x24, 0x20,
+ 0x1c, 0x18, 0x14, 0x10, 0x0c, 0x08, 0x04, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x33, 0x36, 0x39, 0x3c, 0x27, 0x22, 0x2d, 0x28,
+ 0x1b, 0x1e, 0x11, 0x14, 0x0f, 0x0a, 0x05, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x22, 0x24, 0x2e, 0x28, 0x3a, 0x3c, 0x36, 0x30,
+ 0x12, 0x14, 0x1e, 0x18, 0x0a, 0x0c, 0x06, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x2d, 0x2a, 0x23, 0x24, 0x31, 0x36, 0x3f, 0x38,
+ 0x15, 0x12, 0x1b, 0x1c, 0x09, 0x0e, 0x07, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
+ 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x77, 0x7e, 0x65, 0x6c, 0x53, 0x5a, 0x41, 0x48,
+ 0x3f, 0x36, 0x2d, 0x24, 0x1b, 0x12, 0x09, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x66, 0x6c, 0x72, 0x78, 0x4e, 0x44, 0x5a, 0x50,
+ 0x36, 0x3c, 0x22, 0x28, 0x1e, 0x14, 0x0a, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x69, 0x62, 0x7f, 0x74, 0x45, 0x4e, 0x53, 0x58,
+ 0x31, 0x3a, 0x27, 0x2c, 0x1d, 0x16, 0x0b, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x44, 0x48, 0x5c, 0x50, 0x74, 0x78, 0x6c, 0x60,
+ 0x24, 0x28, 0x3c, 0x30, 0x14, 0x18, 0x0c, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x4b, 0x46, 0x51, 0x5c, 0x7f, 0x72, 0x65, 0x68,
+ 0x23, 0x2e, 0x39, 0x34, 0x17, 0x1a, 0x0d, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x5a, 0x54, 0x46, 0x48, 0x62, 0x6c, 0x7e, 0x70,
+ 0x2a, 0x24, 0x36, 0x38, 0x12, 0x1c, 0x0e, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x55, 0x5a, 0x4b, 0x44, 0x69, 0x66, 0x77, 0x78,
+ 0x2d, 0x22, 0x33, 0x3c, 0x11, 0x1e, 0x0f, 0x00 },
+ { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8,
+ 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8,
+ 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88,
+ 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11, 0x00 },
+ { 0xa6, 0xbb, 0x9c, 0x81, 0xd2, 0xcf, 0xe8, 0xf5,
+ 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xee, 0xfc, 0xca, 0xd8, 0xa6, 0xb4, 0x82, 0x90,
+ 0x7e, 0x6c, 0x5a, 0x48, 0x36, 0x24, 0x12, 0x00 },
+ { 0xa6, 0xbb, 0x9c, 0x81, 0xd2, 0xcf, 0xe8, 0xf5,
+ 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xe1, 0xf2, 0xc7, 0xd4, 0xad, 0xbe, 0x8b, 0x98,
+ 0x79, 0x6a, 0x5f, 0x4c, 0x35, 0x26, 0x13, 0x00 },
+ { 0x9c, 0x81, 0xa6, 0xbb, 0xf5, 0xe8, 0xcf, 0xd2,
+ 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xcc, 0xd8, 0xe4, 0xf0, 0x9c, 0x88, 0xb4, 0xa0,
+ 0x6c, 0x78, 0x44, 0x50, 0x3c, 0x28, 0x14, 0x00 },
+ { 0x9c, 0x81, 0xa6, 0xbb, 0xf5, 0xe8, 0xcf, 0xd2,
+ 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xc3, 0xd6, 0xe9, 0xfc, 0x97, 0x82, 0xbd, 0xa8,
+ 0x6b, 0x7e, 0x41, 0x54, 0x3f, 0x2a, 0x15, 0x00 },
+ { 0x81, 0x9c, 0xbb, 0xa6, 0xe8, 0xf5, 0xd2, 0xcf,
+ 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xd2, 0xc4, 0xfe, 0xe8, 0x8a, 0x9c, 0xa6, 0xb0,
+ 0x62, 0x74, 0x4e, 0x58, 0x3a, 0x2c, 0x16, 0x00 },
+ { 0x81, 0x9c, 0xbb, 0xa6, 0xe8, 0xf5, 0xd2, 0xcf,
+ 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xdd, 0xca, 0xf3, 0xe4, 0x81, 0x96, 0xaf, 0xb8,
+ 0x65, 0x72, 0x4b, 0x5c, 0x39, 0x2e, 0x17, 0x00 },
+ { 0xe8, 0xf5, 0xcf, 0xd2, 0xa6, 0xbb, 0x81, 0x9c,
+ 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x88, 0x90, 0xb8, 0xa0, 0xe8, 0xf0, 0xd8, 0xc0,
+ 0x48, 0x50, 0x78, 0x60, 0x28, 0x30, 0x18, 0x00 },
+ { 0xe8, 0xf5, 0xcf, 0xd2, 0xa6, 0xbb, 0x81, 0x9c,
+ 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x87, 0x9e, 0xb5, 0xac, 0xe3, 0xfa, 0xd1, 0xc8,
+ 0x4f, 0x56, 0x7d, 0x64, 0x2b, 0x32, 0x19, 0x00 },
+ { 0xf5, 0xe8, 0xd2, 0xcf, 0xbb, 0xa6, 0x9c, 0x81,
+ 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x96, 0x8c, 0xa2, 0xb8, 0xfe, 0xe4, 0xca, 0xd0,
+ 0x46, 0x5c, 0x72, 0x68, 0x2e, 0x34, 0x1a, 0x00 },
+ { 0xf5, 0xe8, 0xd2, 0xcf, 0xbb, 0xa6, 0x9c, 0x81,
+ 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x99, 0x82, 0xaf, 0xb4, 0xf5, 0xee, 0xc3, 0xd8,
+ 0x41, 0x5a, 0x77, 0x6c, 0x2d, 0x36, 0x1b, 0x00 },
+ { 0xcf, 0xd2, 0xe8, 0xf5, 0x9c, 0x81, 0xbb, 0xa6,
+ 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb4, 0xa8, 0x8c, 0x90, 0xc4, 0xd8, 0xfc, 0xe0,
+ 0x54, 0x48, 0x6c, 0x70, 0x24, 0x38, 0x1c, 0x00 },
+ { 0xcf, 0xd2, 0xe8, 0xf5, 0x9c, 0x81, 0xbb, 0xa6,
+ 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8,
+ 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0xd2, 0xcf, 0xf5, 0xe8, 0x81, 0x9c, 0xa6, 0xbb,
+ 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xaa, 0xb4, 0x96, 0x88, 0xd2, 0xcc, 0xee, 0xf0,
+ 0x5a, 0x44, 0x66, 0x78, 0x22, 0x3c, 0x1e, 0x00 },
+ { 0xd2, 0xcf, 0xf5, 0xe8, 0x81, 0x9c, 0xa6, 0xbb,
+ 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa5, 0xba, 0x9b, 0x84, 0xd9, 0xc6, 0xe7, 0xf8,
+ 0x5d, 0x42, 0x63, 0x7c, 0x21, 0x3e, 0x1f, 0x00 },
+ { 0x6b, 0x51, 0x1f, 0x25, 0x83, 0xb9, 0xf7, 0xcd,
+ 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x6b, 0x51, 0x1f, 0x25, 0x83, 0xb9, 0xf7, 0xcd,
+ 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xef, 0xce, 0xad, 0x8c, 0x6b, 0x4a, 0x29, 0x08,
+ 0xe7, 0xc6, 0xa5, 0x84, 0x63, 0x42, 0x21, 0x00 },
+ { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0,
+ 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
+ 0xee, 0xcc, 0xaa, 0x88, 0x66, 0x44, 0x22, 0x00 },
+ { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0,
+ 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xf1, 0xd2, 0xb7, 0x94, 0x7d, 0x5e, 0x3b, 0x18,
+ 0xe9, 0xca, 0xaf, 0x8c, 0x65, 0x46, 0x23, 0x00 },
+ { 0x4c, 0x76, 0x38, 0x02, 0xb9, 0x83, 0xcd, 0xf7,
+ 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xdc, 0xf8, 0x94, 0xb0, 0x4c, 0x68, 0x04, 0x20,
+ 0xfc, 0xd8, 0xb4, 0x90, 0x6c, 0x48, 0x24, 0x00 },
+ { 0x4c, 0x76, 0x38, 0x02, 0xb9, 0x83, 0xcd, 0xf7,
+ 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xd3, 0xf6, 0x99, 0xbc, 0x47, 0x62, 0x0d, 0x28,
+ 0xfb, 0xde, 0xb1, 0x94, 0x6f, 0x4a, 0x25, 0x00 },
+ { 0x51, 0x6b, 0x25, 0x1f, 0xa4, 0x9e, 0xd0, 0xea,
+ 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xc2, 0xe4, 0x8e, 0xa8, 0x5a, 0x7c, 0x16, 0x30,
+ 0xf2, 0xd4, 0xbe, 0x98, 0x6a, 0x4c, 0x26, 0x00 },
+ { 0x51, 0x6b, 0x25, 0x1f, 0xa4, 0x9e, 0xd0, 0xea,
+ 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38,
+ 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 },
+ { 0x38, 0x02, 0x51, 0x6b, 0xea, 0xd0, 0x83, 0xb9,
+ 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x98, 0xb0, 0xc8, 0xe0, 0x38, 0x10, 0x68, 0x40,
+ 0xd8, 0xf0, 0x88, 0xa0, 0x78, 0x50, 0x28, 0x00 },
+ { 0x38, 0x02, 0x51, 0x6b, 0xea, 0xd0, 0x83, 0xb9,
+ 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x97, 0xbe, 0xc5, 0xec, 0x33, 0x1a, 0x61, 0x48,
+ 0xdf, 0xf6, 0x8d, 0xa4, 0x7b, 0x52, 0x29, 0x00 },
+ { 0x25, 0x1f, 0x4c, 0x76, 0xf7, 0xcd, 0x9e, 0xa4,
+ 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x86, 0xac, 0xd2, 0xf8, 0x2e, 0x04, 0x7a, 0x50,
+ 0xd6, 0xfc, 0x82, 0xa8, 0x7e, 0x54, 0x2a, 0x00 },
+ { 0x25, 0x1f, 0x4c, 0x76, 0xf7, 0xcd, 0x9e, 0xa4,
+ 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x89, 0xa2, 0xdf, 0xf4, 0x25, 0x0e, 0x73, 0x58,
+ 0xd1, 0xfa, 0x87, 0xac, 0x7d, 0x56, 0x2b, 0x00 },
+ { 0x1f, 0x25, 0x76, 0x4c, 0xd0, 0xea, 0xb9, 0x83,
+ 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa4, 0x88, 0xfc, 0xd0, 0x14, 0x38, 0x4c, 0x60,
+ 0xc4, 0xe8, 0x9c, 0xb0, 0x74, 0x58, 0x2c, 0x00 },
+ { 0x1f, 0x25, 0x76, 0x4c, 0xd0, 0xea, 0xb9, 0x83,
+ 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xab, 0x86, 0xf1, 0xdc, 0x1f, 0x32, 0x45, 0x68,
+ 0xc3, 0xee, 0x99, 0xb4, 0x77, 0x5a, 0x2d, 0x00 },
+ { 0x02, 0x38, 0x6b, 0x51, 0xcd, 0xf7, 0xa4, 0x9e,
+ 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xba, 0x94, 0xe6, 0xc8, 0x02, 0x2c, 0x5e, 0x70,
+ 0xca, 0xe4, 0x96, 0xb8, 0x72, 0x5c, 0x2e, 0x00 },
+ { 0x02, 0x38, 0x6b, 0x51, 0xcd, 0xf7, 0xa4, 0x9e,
+ 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb5, 0x9a, 0xeb, 0xc4, 0x09, 0x26, 0x57, 0x78,
+ 0xcd, 0xe2, 0x93, 0xbc, 0x71, 0x5e, 0x2f, 0x00 },
+ { 0xd0, 0xf7, 0x9e, 0xb9, 0x4c, 0x6b, 0x02, 0x25,
+ 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0xd0, 0xf7, 0x9e, 0xb9, 0x4c, 0x6b, 0x02, 0x25,
+ 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1f, 0x2e, 0x7d, 0x4c, 0xdb, 0xea, 0xb9, 0x88,
+ 0x97, 0xa6, 0xf5, 0xc4, 0x53, 0x62, 0x31, 0x00 },
+ { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38,
+ 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x0e, 0x3c, 0x6a, 0x58, 0xc6, 0xf4, 0xa2, 0x90,
+ 0x9e, 0xac, 0xfa, 0xc8, 0x56, 0x64, 0x32, 0x00 },
+ { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38,
+ 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x01, 0x32, 0x67, 0x54, 0xcd, 0xfe, 0xab, 0x98,
+ 0x99, 0xaa, 0xff, 0xcc, 0x55, 0x66, 0x33, 0x00 },
+ { 0xf7, 0xd0, 0xb9, 0x9e, 0x76, 0x51, 0x38, 0x1f,
+ 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x2c, 0x18, 0x44, 0x70, 0xfc, 0xc8, 0x94, 0xa0,
+ 0x8c, 0xb8, 0xe4, 0xd0, 0x5c, 0x68, 0x34, 0x00 },
+ { 0xf7, 0xd0, 0xb9, 0x9e, 0x76, 0x51, 0x38, 0x1f,
+ 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x23, 0x16, 0x49, 0x7c, 0xf7, 0xc2, 0x9d, 0xa8,
+ 0x8b, 0xbe, 0xe1, 0xd4, 0x5f, 0x6a, 0x35, 0x00 },
+ { 0xea, 0xcd, 0xa4, 0x83, 0x6b, 0x4c, 0x25, 0x02,
+ 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x32, 0x04, 0x5e, 0x68, 0xea, 0xdc, 0x86, 0xb0,
+ 0x82, 0xb4, 0xee, 0xd8, 0x5a, 0x6c, 0x36, 0x00 },
+ { 0xea, 0xcd, 0xa4, 0x83, 0x6b, 0x4c, 0x25, 0x02,
+ 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3d, 0x0a, 0x53, 0x64, 0xe1, 0xd6, 0x8f, 0xb8,
+ 0x85, 0xb2, 0xeb, 0xdc, 0x59, 0x6e, 0x37, 0x00 },
+ { 0x83, 0xa4, 0xd0, 0xf7, 0x25, 0x02, 0x76, 0x51,
+ 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x68, 0x50, 0x18, 0x20, 0x88, 0xb0, 0xf8, 0xc0,
+ 0xa8, 0x90, 0xd8, 0xe0, 0x48, 0x70, 0x38, 0x00 },
+ { 0x83, 0xa4, 0xd0, 0xf7, 0x25, 0x02, 0x76, 0x51,
+ 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x67, 0x5e, 0x15, 0x2c, 0x83, 0xba, 0xf1, 0xc8,
+ 0xaf, 0x96, 0xdd, 0xe4, 0x4b, 0x72, 0x39, 0x00 },
+ { 0x9e, 0xb9, 0xcd, 0xea, 0x38, 0x1f, 0x6b, 0x4c,
+ 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0,
+ 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0x9e, 0xb9, 0xcd, 0xea, 0x38, 0x1f, 0x6b, 0x4c,
+ 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x79, 0x42, 0x0f, 0x34, 0x95, 0xae, 0xe3, 0xd8,
+ 0xa1, 0x9a, 0xd7, 0xec, 0x4d, 0x76, 0x3b, 0x00 },
+ { 0xa4, 0x83, 0xf7, 0xd0, 0x1f, 0x38, 0x4c, 0x6b,
+ 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x54, 0x68, 0x2c, 0x10, 0xa4, 0x98, 0xdc, 0xe0,
+ 0xb4, 0x88, 0xcc, 0xf0, 0x44, 0x78, 0x3c, 0x00 },
+ { 0xa4, 0x83, 0xf7, 0xd0, 0x1f, 0x38, 0x4c, 0x6b,
+ 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x5b, 0x66, 0x21, 0x1c, 0xaf, 0x92, 0xd5, 0xe8,
+ 0xb3, 0x8e, 0xc9, 0xf4, 0x47, 0x7a, 0x3d, 0x00 },
+ { 0xb9, 0x9e, 0xea, 0xcd, 0x02, 0x25, 0x51, 0x76,
+ 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x4a, 0x74, 0x36, 0x08, 0xb2, 0x8c, 0xce, 0xf0,
+ 0xba, 0x84, 0xc6, 0xf8, 0x42, 0x7c, 0x3e, 0x00 },
+ { 0xb9, 0x9e, 0xea, 0xcd, 0x02, 0x25, 0x51, 0x76,
+ 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x45, 0x7a, 0x3b, 0x04, 0xb9, 0x86, 0xc7, 0xf8,
+ 0xbd, 0x82, 0xc3, 0xfc, 0x41, 0x7e, 0x3f, 0x00 },
+ { 0xd6, 0xa2, 0x3e, 0x4a, 0x1b, 0x6f, 0xf3, 0x87,
+ 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0xd6, 0xa2, 0x3e, 0x4a, 0x1b, 0x6f, 0xf3, 0x87,
+ 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xcf, 0x8e, 0x4d, 0x0c, 0xcb, 0x8a, 0x49, 0x08,
+ 0xc7, 0x86, 0x45, 0x04, 0xc3, 0x82, 0x41, 0x00 },
+ { 0xcb, 0xbf, 0x23, 0x57, 0x06, 0x72, 0xee, 0x9a,
+ 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xde, 0x9c, 0x5a, 0x18, 0xd6, 0x94, 0x52, 0x10,
+ 0xce, 0x8c, 0x4a, 0x08, 0xc6, 0x84, 0x42, 0x00 },
+ { 0xcb, 0xbf, 0x23, 0x57, 0x06, 0x72, 0xee, 0x9a,
+ 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xd1, 0x92, 0x57, 0x14, 0xdd, 0x9e, 0x5b, 0x18,
+ 0xc9, 0x8a, 0x4f, 0x0c, 0xc5, 0x86, 0x43, 0x00 },
+ { 0xf1, 0x85, 0x19, 0x6d, 0x21, 0x55, 0xc9, 0xbd,
+ 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xfc, 0xb8, 0x74, 0x30, 0xec, 0xa8, 0x64, 0x20,
+ 0xdc, 0x98, 0x54, 0x10, 0xcc, 0x88, 0x44, 0x00 },
+ { 0xf1, 0x85, 0x19, 0x6d, 0x21, 0x55, 0xc9, 0xbd,
+ 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xf3, 0xb6, 0x79, 0x3c, 0xe7, 0xa2, 0x6d, 0x28,
+ 0xdb, 0x9e, 0x51, 0x14, 0xcf, 0x8a, 0x45, 0x00 },
+ { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0,
+ 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xe2, 0xa4, 0x6e, 0x28, 0xfa, 0xbc, 0x76, 0x30,
+ 0xd2, 0x94, 0x5e, 0x18, 0xca, 0x8c, 0x46, 0x00 },
+ { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0,
+ 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xed, 0xaa, 0x63, 0x24, 0xf1, 0xb6, 0x7f, 0x38,
+ 0xd5, 0x92, 0x5b, 0x1c, 0xc9, 0x8e, 0x47, 0x00 },
+ { 0x85, 0xf1, 0x70, 0x04, 0x72, 0x06, 0x87, 0xf3,
+ 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb8, 0xf0, 0x28, 0x60, 0x98, 0xd0, 0x08, 0x40,
+ 0xf8, 0xb0, 0x68, 0x20, 0xd8, 0x90, 0x48, 0x00 },
+ { 0x85, 0xf1, 0x70, 0x04, 0x72, 0x06, 0x87, 0xf3,
+ 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb7, 0xfe, 0x25, 0x6c, 0x93, 0xda, 0x01, 0x48,
+ 0xff, 0xb6, 0x6d, 0x24, 0xdb, 0x92, 0x49, 0x00 },
+ { 0x98, 0xec, 0x6d, 0x19, 0x6f, 0x1b, 0x9a, 0xee,
+ 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa6, 0xec, 0x32, 0x78, 0x8e, 0xc4, 0x1a, 0x50,
+ 0xf6, 0xbc, 0x62, 0x28, 0xde, 0x94, 0x4a, 0x00 },
+ { 0x98, 0xec, 0x6d, 0x19, 0x6f, 0x1b, 0x9a, 0xee,
+ 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa9, 0xe2, 0x3f, 0x74, 0x85, 0xce, 0x13, 0x58,
+ 0xf1, 0xba, 0x67, 0x2c, 0xdd, 0x96, 0x4b, 0x00 },
+ { 0xa2, 0xd6, 0x57, 0x23, 0x48, 0x3c, 0xbd, 0xc9,
+ 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x84, 0xc8, 0x1c, 0x50, 0xb4, 0xf8, 0x2c, 0x60,
+ 0xe4, 0xa8, 0x7c, 0x30, 0xd4, 0x98, 0x4c, 0x00 },
+ { 0xa2, 0xd6, 0x57, 0x23, 0x48, 0x3c, 0xbd, 0xc9,
+ 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x8b, 0xc6, 0x11, 0x5c, 0xbf, 0xf2, 0x25, 0x68,
+ 0xe3, 0xae, 0x79, 0x34, 0xd7, 0x9a, 0x4d, 0x00 },
+ { 0xbf, 0xcb, 0x4a, 0x3e, 0x55, 0x21, 0xa0, 0xd4,
+ 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70,
+ 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0xbf, 0xcb, 0x4a, 0x3e, 0x55, 0x21, 0xa0, 0xd4,
+ 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x95, 0xda, 0x0b, 0x44, 0xa9, 0xe6, 0x37, 0x78,
+ 0xed, 0xa2, 0x73, 0x3c, 0xd1, 0x9e, 0x4f, 0x00 },
+ { 0x6d, 0x04, 0xbf, 0xd6, 0xd4, 0xbd, 0x06, 0x6f,
+ 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x6d, 0x04, 0xbf, 0xd6, 0xd4, 0xbd, 0x06, 0x6f,
+ 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3f, 0x6e, 0x9d, 0xcc, 0x7b, 0x2a, 0xd9, 0x88,
+ 0xb7, 0xe6, 0x15, 0x44, 0xf3, 0xa2, 0x51, 0x00 },
+ { 0x70, 0x19, 0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72,
+ 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x2e, 0x7c, 0x8a, 0xd8, 0x66, 0x34, 0xc2, 0x90,
+ 0xbe, 0xec, 0x1a, 0x48, 0xf6, 0xa4, 0x52, 0x00 },
+ { 0x70, 0x19, 0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72,
+ 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 0xcb, 0x98,
+ 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0x4a, 0x23, 0x98, 0xf1, 0xee, 0x87, 0x3c, 0x55,
+ 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x0c, 0x58, 0xa4, 0xf0, 0x5c, 0x08, 0xf4, 0xa0,
+ 0xac, 0xf8, 0x04, 0x50, 0xfc, 0xa8, 0x54, 0x00 },
+ { 0x4a, 0x23, 0x98, 0xf1, 0xee, 0x87, 0x3c, 0x55,
+ 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x03, 0x56, 0xa9, 0xfc, 0x57, 0x02, 0xfd, 0xa8,
+ 0xab, 0xfe, 0x01, 0x54, 0xff, 0xaa, 0x55, 0x00 },
+ { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48,
+ 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x12, 0x44, 0xbe, 0xe8, 0x4a, 0x1c, 0xe6, 0xb0,
+ 0xa2, 0xf4, 0x0e, 0x58, 0xfa, 0xac, 0x56, 0x00 },
+ { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48,
+ 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1d, 0x4a, 0xb3, 0xe4, 0x41, 0x16, 0xef, 0xb8,
+ 0xa5, 0xf2, 0x0b, 0x5c, 0xf9, 0xae, 0x57, 0x00 },
+ { 0x3e, 0x57, 0xf1, 0x98, 0xbd, 0xd4, 0x72, 0x1b,
+ 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x48, 0x10, 0xf8, 0xa0, 0x28, 0x70, 0x98, 0xc0,
+ 0x88, 0xd0, 0x38, 0x60, 0xe8, 0xb0, 0x58, 0x00 },
+ { 0x3e, 0x57, 0xf1, 0x98, 0xbd, 0xd4, 0x72, 0x1b,
+ 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x47, 0x1e, 0xf5, 0xac, 0x23, 0x7a, 0x91, 0xc8,
+ 0x8f, 0xd6, 0x3d, 0x64, 0xeb, 0xb2, 0x59, 0x00 },
+ { 0x23, 0x4a, 0xec, 0x85, 0xa0, 0xc9, 0x6f, 0x06,
+ 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x56, 0x0c, 0xe2, 0xb8, 0x3e, 0x64, 0x8a, 0xd0,
+ 0x86, 0xdc, 0x32, 0x68, 0xee, 0xb4, 0x5a, 0x00 },
+ { 0x23, 0x4a, 0xec, 0x85, 0xa0, 0xc9, 0x6f, 0x06,
+ 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x59, 0x02, 0xef, 0xb4, 0x35, 0x6e, 0x83, 0xd8,
+ 0x81, 0xda, 0x37, 0x6c, 0xed, 0xb6, 0x5b, 0x00 },
+ { 0x19, 0x70, 0xd6, 0xbf, 0x87, 0xee, 0x48, 0x21,
+ 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x74, 0x28, 0xcc, 0x90, 0x04, 0x58, 0xbc, 0xe0,
+ 0x94, 0xc8, 0x2c, 0x70, 0xe4, 0xb8, 0x5c, 0x00 },
+ { 0x19, 0x70, 0xd6, 0xbf, 0x87, 0xee, 0x48, 0x21,
+ 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x7b, 0x26, 0xc1, 0x9c, 0x0f, 0x52, 0xb5, 0xe8,
+ 0x93, 0xce, 0x29, 0x74, 0xe7, 0xba, 0x5d, 0x00 },
+ { 0x04, 0x6d, 0xcb, 0xa2, 0x9a, 0xf3, 0x55, 0x3c,
+ 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x6a, 0x34, 0xd6, 0x88, 0x12, 0x4c, 0xae, 0xf0,
+ 0x9a, 0xc4, 0x26, 0x78, 0xe2, 0xbc, 0x5e, 0x00 },
+ { 0x04, 0x6d, 0xcb, 0xa2, 0x9a, 0xf3, 0x55, 0x3c,
+ 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x65, 0x3a, 0xdb, 0x84, 0x19, 0x46, 0xa7, 0xf8,
+ 0x9d, 0xc2, 0x23, 0x7c, 0xe1, 0xbe, 0x5f, 0x00 },
+ { 0xbd, 0xf3, 0x21, 0x6f, 0x98, 0xd6, 0x04, 0x4a,
+ 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0xbd, 0xf3, 0x21, 0x6f, 0x98, 0xd6, 0x04, 0x4a,
+ 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x2f, 0x4e, 0xed, 0x8c, 0xab, 0xca, 0x69, 0x08,
+ 0x27, 0x46, 0xe5, 0x84, 0xa3, 0xc2, 0x61, 0x00 },
+ { 0xa0, 0xee, 0x3c, 0x72, 0x85, 0xcb, 0x19, 0x57,
+ 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3e, 0x5c, 0xfa, 0x98, 0xb6, 0xd4, 0x72, 0x10,
+ 0x2e, 0x4c, 0xea, 0x88, 0xa6, 0xc4, 0x62, 0x00 },
+ { 0xa0, 0xee, 0x3c, 0x72, 0x85, 0xcb, 0x19, 0x57,
+ 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x31, 0x52, 0xf7, 0x94, 0xbd, 0xde, 0x7b, 0x18,
+ 0x29, 0x4a, 0xef, 0x8c, 0xa5, 0xc6, 0x63, 0x00 },
+ { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70,
+ 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1c, 0x78, 0xd4, 0xb0, 0x8c, 0xe8, 0x44, 0x20,
+ 0x3c, 0x58, 0xf4, 0x90, 0xac, 0xc8, 0x64, 0x00 },
+ { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70,
+ 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x13, 0x76, 0xd9, 0xbc, 0x87, 0xe2, 0x4d, 0x28,
+ 0x3b, 0x5e, 0xf1, 0x94, 0xaf, 0xca, 0x65, 0x00 },
+ { 0x87, 0xc9, 0x1b, 0x55, 0xbf, 0xf1, 0x23, 0x6d,
+ 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x02, 0x64, 0xce, 0xa8, 0x9a, 0xfc, 0x56, 0x30,
+ 0x32, 0x54, 0xfe, 0x98, 0xaa, 0xcc, 0x66, 0x00 },
+ { 0x87, 0xc9, 0x1b, 0x55, 0xbf, 0xf1, 0x23, 0x6d,
+ 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x0d, 0x6a, 0xc3, 0xa4, 0x91, 0xf6, 0x5f, 0x38,
+ 0x35, 0x52, 0xfb, 0x9c, 0xa9, 0xce, 0x67, 0x00 },
+ { 0xee, 0xa0, 0x6f, 0x21, 0xf1, 0xbf, 0x70, 0x3e,
+ 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x58, 0x30, 0x88, 0xe0, 0xf8, 0x90, 0x28, 0x40,
+ 0x18, 0x70, 0xc8, 0xa0, 0xb8, 0xd0, 0x68, 0x00 },
+ { 0xee, 0xa0, 0x6f, 0x21, 0xf1, 0xbf, 0x70, 0x3e,
+ 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48,
+ 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0xf3, 0xbd, 0x72, 0x3c, 0xec, 0xa2, 0x6d, 0x23,
+ 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x46, 0x2c, 0x92, 0xf8, 0xee, 0x84, 0x3a, 0x50,
+ 0x16, 0x7c, 0xc2, 0xa8, 0xbe, 0xd4, 0x6a, 0x00 },
+ { 0xf3, 0xbd, 0x72, 0x3c, 0xec, 0xa2, 0x6d, 0x23,
+ 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x49, 0x22, 0x9f, 0xf4, 0xe5, 0x8e, 0x33, 0x58,
+ 0x11, 0x7a, 0xc7, 0xac, 0xbd, 0xd6, 0x6b, 0x00 },
+ { 0xc9, 0x87, 0x48, 0x06, 0xcb, 0x85, 0x4a, 0x04,
+ 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x64, 0x08, 0xbc, 0xd0, 0xd4, 0xb8, 0x0c, 0x60,
+ 0x04, 0x68, 0xdc, 0xb0, 0xb4, 0xd8, 0x6c, 0x00 },
+ { 0xc9, 0x87, 0x48, 0x06, 0xcb, 0x85, 0x4a, 0x04,
+ 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x6b, 0x06, 0xb1, 0xdc, 0xdf, 0xb2, 0x05, 0x68,
+ 0x03, 0x6e, 0xd9, 0xb4, 0xb7, 0xda, 0x6d, 0x00 },
+ { 0xd4, 0x9a, 0x55, 0x1b, 0xd6, 0x98, 0x57, 0x19,
+ 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x7a, 0x14, 0xa6, 0xc8, 0xc2, 0xac, 0x1e, 0x70,
+ 0x0a, 0x64, 0xd6, 0xb8, 0xb2, 0xdc, 0x6e, 0x00 },
+ { 0xd4, 0x9a, 0x55, 0x1b, 0xd6, 0x98, 0x57, 0x19,
+ 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x75, 0x1a, 0xab, 0xc4, 0xc9, 0xa6, 0x17, 0x78,
+ 0x0d, 0x62, 0xd3, 0xbc, 0xb1, 0xde, 0x6f, 0x00 },
+ { 0x06, 0x55, 0xa0, 0xf3, 0x57, 0x04, 0xf1, 0xa2,
+ 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x06, 0x55, 0xa0, 0xf3, 0x57, 0x04, 0xf1, 0xa2,
+ 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xdf, 0xae, 0x3d, 0x4c, 0x1b, 0x6a, 0xf9, 0x88,
+ 0x57, 0x26, 0xb5, 0xc4, 0x93, 0xe2, 0x71, 0x00 },
+ { 0x1b, 0x48, 0xbd, 0xee, 0x4a, 0x19, 0xec, 0xbf,
+ 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xce, 0xbc, 0x2a, 0x58, 0x06, 0x74, 0xe2, 0x90,
+ 0x5e, 0x2c, 0xba, 0xc8, 0x96, 0xe4, 0x72, 0x00 },
+ { 0x1b, 0x48, 0xbd, 0xee, 0x4a, 0x19, 0xec, 0xbf,
+ 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xc1, 0xb2, 0x27, 0x54, 0x0d, 0x7e, 0xeb, 0x98,
+ 0x59, 0x2a, 0xbf, 0xcc, 0x95, 0xe6, 0x73, 0x00 },
+ { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 0xcb, 0x98,
+ 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0,
+ 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 0xcb, 0x98,
+ 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xe3, 0x96, 0x09, 0x7c, 0x37, 0x42, 0xdd, 0xa8,
+ 0x4b, 0x3e, 0xa1, 0xd4, 0x9f, 0xea, 0x75, 0x00 },
+ { 0x3c, 0x6f, 0x9a, 0xc9, 0x70, 0x23, 0xd6, 0x85,
+ 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xf2, 0x84, 0x1e, 0x68, 0x2a, 0x5c, 0xc6, 0xb0,
+ 0x42, 0x34, 0xae, 0xd8, 0x9a, 0xec, 0x76, 0x00 },
+ { 0x3c, 0x6f, 0x9a, 0xc9, 0x70, 0x23, 0xd6, 0x85,
+ 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xfd, 0x8a, 0x13, 0x64, 0x21, 0x56, 0xcf, 0xb8,
+ 0x45, 0x32, 0xab, 0xdc, 0x99, 0xee, 0x77, 0x00 },
+ { 0x55, 0x06, 0xee, 0xbd, 0x3e, 0x6d, 0x85, 0xd6,
+ 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa8, 0xd0, 0x58, 0x20, 0x48, 0x30, 0xb8, 0xc0,
+ 0x68, 0x10, 0x98, 0xe0, 0x88, 0xf0, 0x78, 0x00 },
+ { 0x55, 0x06, 0xee, 0xbd, 0x3e, 0x6d, 0x85, 0xd6,
+ 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa7, 0xde, 0x55, 0x2c, 0x43, 0x3a, 0xb1, 0xc8,
+ 0x6f, 0x16, 0x9d, 0xe4, 0x8b, 0xf2, 0x79, 0x00 },
+ { 0x48, 0x1b, 0xf3, 0xa0, 0x23, 0x70, 0x98, 0xcb,
+ 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb6, 0xcc, 0x42, 0x38, 0x5e, 0x24, 0xaa, 0xd0,
+ 0x66, 0x1c, 0x92, 0xe8, 0x8e, 0xf4, 0x7a, 0x00 },
+ { 0x48, 0x1b, 0xf3, 0xa0, 0x23, 0x70, 0x98, 0xcb,
+ 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb9, 0xc2, 0x4f, 0x34, 0x55, 0x2e, 0xa3, 0xd8,
+ 0x61, 0x1a, 0x97, 0xec, 0x8d, 0xf6, 0x7b, 0x00 },
+ { 0x72, 0x21, 0xc9, 0x9a, 0x04, 0x57, 0xbf, 0xec,
+ 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x94, 0xe8, 0x6c, 0x10, 0x64, 0x18, 0x9c, 0xe0,
+ 0x74, 0x08, 0x8c, 0xf0, 0x84, 0xf8, 0x7c, 0x00 },
+ { 0x72, 0x21, 0xc9, 0x9a, 0x04, 0x57, 0xbf, 0xec,
+ 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x9b, 0xe6, 0x61, 0x1c, 0x6f, 0x12, 0x95, 0xe8,
+ 0x73, 0x0e, 0x89, 0xf4, 0x87, 0xfa, 0x7d, 0x00 },
+ { 0x6f, 0x3c, 0xd4, 0x87, 0x19, 0x4a, 0xa2, 0xf1,
+ 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x8a, 0xf4, 0x76, 0x08, 0x72, 0x0c, 0x8e, 0xf0,
+ 0x7a, 0x04, 0x86, 0xf8, 0x82, 0xfc, 0x7e, 0x00 },
+ { 0x6f, 0x3c, 0xd4, 0x87, 0x19, 0x4a, 0xa2, 0xf1,
+ 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x85, 0xfa, 0x7b, 0x04, 0x79, 0x06, 0x87, 0xf8,
+ 0x7d, 0x02, 0x83, 0xfc, 0x81, 0xfe, 0x7f, 0x00 },
+ { 0xb1, 0x59, 0x7c, 0x94, 0x36, 0xde, 0xfb, 0x13,
+ 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0xb1, 0x59, 0x7c, 0x94, 0x36, 0xde, 0xfb, 0x13,
+ 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08,
+ 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 },
+ { 0xac, 0x44, 0x61, 0x89, 0x2b, 0xc3, 0xe6, 0x0e,
+ 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x9e, 0x1c, 0x9a, 0x18, 0x96, 0x14, 0x92, 0x10,
+ 0x8e, 0x0c, 0x8a, 0x08, 0x86, 0x04, 0x82, 0x00 },
+ { 0xac, 0x44, 0x61, 0x89, 0x2b, 0xc3, 0xe6, 0x0e,
+ 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x91, 0x12, 0x97, 0x14, 0x9d, 0x1e, 0x9b, 0x18,
+ 0x89, 0x0a, 0x8f, 0x0c, 0x85, 0x06, 0x83, 0x00 },
+ { 0x96, 0x7e, 0x5b, 0xb3, 0x0c, 0xe4, 0xc1, 0x29,
+ 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xbc, 0x38, 0xb4, 0x30, 0xac, 0x28, 0xa4, 0x20,
+ 0x9c, 0x18, 0x94, 0x10, 0x8c, 0x08, 0x84, 0x00 },
+ { 0x96, 0x7e, 0x5b, 0xb3, 0x0c, 0xe4, 0xc1, 0x29,
+ 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xb3, 0x36, 0xb9, 0x3c, 0xa7, 0x22, 0xad, 0x28,
+ 0x9b, 0x1e, 0x91, 0x14, 0x8f, 0x0a, 0x85, 0x00 },
+ { 0x8b, 0x63, 0x46, 0xae, 0x11, 0xf9, 0xdc, 0x34,
+ 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xa2, 0x24, 0xae, 0x28, 0xba, 0x3c, 0xb6, 0x30,
+ 0x92, 0x14, 0x9e, 0x18, 0x8a, 0x0c, 0x86, 0x00 },
+ { 0x8b, 0x63, 0x46, 0xae, 0x11, 0xf9, 0xdc, 0x34,
+ 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xad, 0x2a, 0xa3, 0x24, 0xb1, 0x36, 0xbf, 0x38,
+ 0x95, 0x12, 0x9b, 0x1c, 0x89, 0x0e, 0x87, 0x00 },
+ { 0xe2, 0x0a, 0x32, 0xda, 0x5f, 0xb7, 0x8f, 0x67,
+ 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf8, 0x70, 0xe8, 0x60, 0xd8, 0x50, 0xc8, 0x40,
+ 0xb8, 0x30, 0xa8, 0x20, 0x98, 0x10, 0x88, 0x00 },
+ { 0xe2, 0x0a, 0x32, 0xda, 0x5f, 0xb7, 0x8f, 0x67,
+ 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf7, 0x7e, 0xe5, 0x6c, 0xd3, 0x5a, 0xc1, 0x48,
+ 0xbf, 0x36, 0xad, 0x24, 0x9b, 0x12, 0x89, 0x00 },
+ { 0xff, 0x17, 0x2f, 0xc7, 0x42, 0xaa, 0x92, 0x7a,
+ 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe6, 0x6c, 0xf2, 0x78, 0xce, 0x44, 0xda, 0x50,
+ 0xb6, 0x3c, 0xa2, 0x28, 0x9e, 0x14, 0x8a, 0x00 },
+ { 0xff, 0x17, 0x2f, 0xc7, 0x42, 0xaa, 0x92, 0x7a,
+ 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe9, 0x62, 0xff, 0x74, 0xc5, 0x4e, 0xd3, 0x58,
+ 0xb1, 0x3a, 0xa7, 0x2c, 0x9d, 0x16, 0x8b, 0x00 },
+ { 0xc5, 0x2d, 0x15, 0xfd, 0x65, 0x8d, 0xb5, 0x5d,
+ 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc4, 0x48, 0xdc, 0x50, 0xf4, 0x78, 0xec, 0x60,
+ 0xa4, 0x28, 0xbc, 0x30, 0x94, 0x18, 0x8c, 0x00 },
+ { 0xc5, 0x2d, 0x15, 0xfd, 0x65, 0x8d, 0xb5, 0x5d,
+ 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xcb, 0x46, 0xd1, 0x5c, 0xff, 0x72, 0xe5, 0x68,
+ 0xa3, 0x2e, 0xb9, 0x34, 0x97, 0x1a, 0x8d, 0x00 },
+ { 0xd8, 0x30, 0x08, 0xe0, 0x78, 0x90, 0xa8, 0x40,
+ 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xda, 0x54, 0xc6, 0x48, 0xe2, 0x6c, 0xfe, 0x70,
+ 0xaa, 0x24, 0xb6, 0x38, 0x92, 0x1c, 0x8e, 0x00 },
+ { 0xd8, 0x30, 0x08, 0xe0, 0x78, 0x90, 0xa8, 0x40,
+ 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd5, 0x5a, 0xcb, 0x44, 0xe9, 0x66, 0xf7, 0x78,
+ 0xad, 0x22, 0xb3, 0x3c, 0x91, 0x1e, 0x8f, 0x00 },
+ { 0x0a, 0xff, 0xfd, 0x08, 0xf9, 0x0c, 0x0e, 0xfb,
+ 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x0a, 0xff, 0xfd, 0x08, 0xf9, 0x0c, 0x0e, 0xfb,
+ 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x7f, 0xee, 0x5d, 0xcc, 0x3b, 0xaa, 0x19, 0x88,
+ 0xf7, 0x66, 0xd5, 0x44, 0xb3, 0x22, 0x91, 0x00 },
+ { 0x17, 0xe2, 0xe0, 0x15, 0xe4, 0x11, 0x13, 0xe6,
+ 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x6e, 0xfc, 0x4a, 0xd8, 0x26, 0xb4, 0x02, 0x90,
+ 0xfe, 0x6c, 0xda, 0x48, 0xb6, 0x24, 0x92, 0x00 },
+ { 0x17, 0xe2, 0xe0, 0x15, 0xe4, 0x11, 0x13, 0xe6,
+ 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x61, 0xf2, 0x47, 0xd4, 0x2d, 0xbe, 0x0b, 0x98,
+ 0xf9, 0x6a, 0xdf, 0x4c, 0xb5, 0x26, 0x93, 0x00 },
+ { 0x2d, 0xd8, 0xda, 0x2f, 0xc3, 0x36, 0x34, 0xc1,
+ 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x4c, 0xd8, 0x64, 0xf0, 0x1c, 0x88, 0x34, 0xa0,
+ 0xec, 0x78, 0xc4, 0x50, 0xbc, 0x28, 0x94, 0x00 },
+ { 0x2d, 0xd8, 0xda, 0x2f, 0xc3, 0x36, 0x34, 0xc1,
+ 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x43, 0xd6, 0x69, 0xfc, 0x17, 0x82, 0x3d, 0xa8,
+ 0xeb, 0x7e, 0xc1, 0x54, 0xbf, 0x2a, 0x95, 0x00 },
+ { 0x30, 0xc5, 0xc7, 0x32, 0xde, 0x2b, 0x29, 0xdc,
+ 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x52, 0xc4, 0x7e, 0xe8, 0x0a, 0x9c, 0x26, 0xb0,
+ 0xe2, 0x74, 0xce, 0x58, 0xba, 0x2c, 0x96, 0x00 },
+ { 0x30, 0xc5, 0xc7, 0x32, 0xde, 0x2b, 0x29, 0xdc,
+ 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x5d, 0xca, 0x73, 0xe4, 0x01, 0x96, 0x2f, 0xb8,
+ 0xe5, 0x72, 0xcb, 0x5c, 0xb9, 0x2e, 0x97, 0x00 },
+ { 0x59, 0xac, 0xb3, 0x46, 0x90, 0x65, 0x7a, 0x8f,
+ 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x08, 0x90, 0x38, 0xa0, 0x68, 0xf0, 0x58, 0xc0,
+ 0xc8, 0x50, 0xf8, 0x60, 0xa8, 0x30, 0x98, 0x00 },
+ { 0x59, 0xac, 0xb3, 0x46, 0x90, 0x65, 0x7a, 0x8f,
+ 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x07, 0x9e, 0x35, 0xac, 0x63, 0xfa, 0x51, 0xc8,
+ 0xcf, 0x56, 0xfd, 0x64, 0xab, 0x32, 0x99, 0x00 },
+ { 0x44, 0xb1, 0xae, 0x5b, 0x8d, 0x78, 0x67, 0x92,
+ 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x16, 0x8c, 0x22, 0xb8, 0x7e, 0xe4, 0x4a, 0xd0,
+ 0xc6, 0x5c, 0xf2, 0x68, 0xae, 0x34, 0x9a, 0x00 },
+ { 0x44, 0xb1, 0xae, 0x5b, 0x8d, 0x78, 0x67, 0x92,
+ 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x19, 0x82, 0x2f, 0xb4, 0x75, 0xee, 0x43, 0xd8,
+ 0xc1, 0x5a, 0xf7, 0x6c, 0xad, 0x36, 0x9b, 0x00 },
+ { 0x7e, 0x8b, 0x94, 0x61, 0xaa, 0x5f, 0x40, 0xb5,
+ 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0,
+ 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x7e, 0x8b, 0x94, 0x61, 0xaa, 0x5f, 0x40, 0xb5,
+ 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x3b, 0xa6, 0x01, 0x9c, 0x4f, 0xd2, 0x75, 0xe8,
+ 0xd3, 0x4e, 0xe9, 0x74, 0xa7, 0x3a, 0x9d, 0x00 },
+ { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8,
+ 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x2a, 0xb4, 0x16, 0x88, 0x52, 0xcc, 0x6e, 0xf0,
+ 0xda, 0x44, 0xe6, 0x78, 0xa2, 0x3c, 0x9e, 0x00 },
+ { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8,
+ 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x25, 0xba, 0x1b, 0x84, 0x59, 0xc6, 0x67, 0xf8,
+ 0xdd, 0x42, 0xe3, 0x7c, 0xa1, 0x3e, 0x9f, 0x00 },
+ { 0xda, 0x08, 0x63, 0xb1, 0xb5, 0x67, 0x0c, 0xde,
+ 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0xda, 0x08, 0x63, 0xb1, 0xb5, 0x67, 0x0c, 0xde,
+ 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x6f, 0xce, 0x2d, 0x8c, 0xeb, 0x4a, 0xa9, 0x08,
+ 0x67, 0xc6, 0x25, 0x84, 0xe3, 0x42, 0xa1, 0x00 },
+ { 0xc7, 0x15, 0x7e, 0xac, 0xa8, 0x7a, 0x11, 0xc3,
+ 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x7e, 0xdc, 0x3a, 0x98, 0xf6, 0x54, 0xb2, 0x10,
+ 0x6e, 0xcc, 0x2a, 0x88, 0xe6, 0x44, 0xa2, 0x00 },
+ { 0xc7, 0x15, 0x7e, 0xac, 0xa8, 0x7a, 0x11, 0xc3,
+ 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x71, 0xd2, 0x37, 0x94, 0xfd, 0x5e, 0xbb, 0x18,
+ 0x69, 0xca, 0x2f, 0x8c, 0xe5, 0x46, 0xa3, 0x00 },
+ { 0xfd, 0x2f, 0x44, 0x96, 0x8f, 0x5d, 0x36, 0xe4,
+ 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x5c, 0xf8, 0x14, 0xb0, 0xcc, 0x68, 0x84, 0x20,
+ 0x7c, 0xd8, 0x34, 0x90, 0xec, 0x48, 0xa4, 0x00 },
+ { 0xfd, 0x2f, 0x44, 0x96, 0x8f, 0x5d, 0x36, 0xe4,
+ 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x53, 0xf6, 0x19, 0xbc, 0xc7, 0x62, 0x8d, 0x28,
+ 0x7b, 0xde, 0x31, 0x94, 0xef, 0x4a, 0xa5, 0x00 },
+ { 0xe0, 0x32, 0x59, 0x8b, 0x92, 0x40, 0x2b, 0xf9,
+ 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30,
+ 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0xe0, 0x32, 0x59, 0x8b, 0x92, 0x40, 0x2b, 0xf9,
+ 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x4d, 0xea, 0x03, 0xa4, 0xd1, 0x76, 0x9f, 0x38,
+ 0x75, 0xd2, 0x3b, 0x9c, 0xe9, 0x4e, 0xa7, 0x00 },
+ { 0x89, 0x5b, 0x2d, 0xff, 0xdc, 0x0e, 0x78, 0xaa,
+ 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x18, 0xb0, 0x48, 0xe0, 0xb8, 0x10, 0xe8, 0x40,
+ 0x58, 0xf0, 0x08, 0xa0, 0xf8, 0x50, 0xa8, 0x00 },
+ { 0x89, 0x5b, 0x2d, 0xff, 0xdc, 0x0e, 0x78, 0xaa,
+ 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x17, 0xbe, 0x45, 0xec, 0xb3, 0x1a, 0xe1, 0x48,
+ 0x5f, 0xf6, 0x0d, 0xa4, 0xfb, 0x52, 0xa9, 0x00 },
+ { 0x94, 0x46, 0x30, 0xe2, 0xc1, 0x13, 0x65, 0xb7,
+ 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x06, 0xac, 0x52, 0xf8, 0xae, 0x04, 0xfa, 0x50,
+ 0x56, 0xfc, 0x02, 0xa8, 0xfe, 0x54, 0xaa, 0x00 },
+ { 0x94, 0x46, 0x30, 0xe2, 0xc1, 0x13, 0x65, 0xb7,
+ 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x09, 0xa2, 0x5f, 0xf4, 0xa5, 0x0e, 0xf3, 0x58,
+ 0x51, 0xfa, 0x07, 0xac, 0xfd, 0x56, 0xab, 0x00 },
+ { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90,
+ 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x24, 0x88, 0x7c, 0xd0, 0x94, 0x38, 0xcc, 0x60,
+ 0x44, 0xe8, 0x1c, 0xb0, 0xf4, 0x58, 0xac, 0x00 },
+ { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90,
+ 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x2b, 0x86, 0x71, 0xdc, 0x9f, 0x32, 0xc5, 0x68,
+ 0x43, 0xee, 0x19, 0xb4, 0xf7, 0x5a, 0xad, 0x00 },
+ { 0xb3, 0x61, 0x17, 0xc5, 0xfb, 0x29, 0x5f, 0x8d,
+ 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x3a, 0x94, 0x66, 0xc8, 0x82, 0x2c, 0xde, 0x70,
+ 0x4a, 0xe4, 0x16, 0xb8, 0xf2, 0x5c, 0xae, 0x00 },
+ { 0xb3, 0x61, 0x17, 0xc5, 0xfb, 0x29, 0x5f, 0x8d,
+ 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x35, 0x9a, 0x6b, 0xc4, 0x89, 0x26, 0xd7, 0x78,
+ 0x4d, 0xe2, 0x13, 0xbc, 0xf1, 0x5e, 0xaf, 0x00 },
+ { 0x61, 0xae, 0xe2, 0x2d, 0x7a, 0xb5, 0xf9, 0x36,
+ 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x61, 0xae, 0xe2, 0x2d, 0x7a, 0xb5, 0xf9, 0x36,
+ 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x9f, 0x2e, 0xfd, 0x4c, 0x5b, 0xea, 0x39, 0x88,
+ 0x17, 0xa6, 0x75, 0xc4, 0xd3, 0x62, 0xb1, 0x00 },
+ { 0x7c, 0xb3, 0xff, 0x30, 0x67, 0xa8, 0xe4, 0x2b,
+ 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x8e, 0x3c, 0xea, 0x58, 0x46, 0xf4, 0x22, 0x90,
+ 0x1e, 0xac, 0x7a, 0xc8, 0xd6, 0x64, 0xb2, 0x00 },
+ { 0x7c, 0xb3, 0xff, 0x30, 0x67, 0xa8, 0xe4, 0x2b,
+ 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x81, 0x32, 0xe7, 0x54, 0x4d, 0xfe, 0x2b, 0x98,
+ 0x19, 0xaa, 0x7f, 0xcc, 0xd5, 0x66, 0xb3, 0x00 },
+ { 0x46, 0x89, 0xc5, 0x0a, 0x40, 0x8f, 0xc3, 0x0c,
+ 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xac, 0x18, 0xc4, 0x70, 0x7c, 0xc8, 0x14, 0xa0,
+ 0x0c, 0xb8, 0x64, 0xd0, 0xdc, 0x68, 0xb4, 0x00 },
+ { 0x46, 0x89, 0xc5, 0x0a, 0x40, 0x8f, 0xc3, 0x0c,
+ 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xa3, 0x16, 0xc9, 0x7c, 0x77, 0xc2, 0x1d, 0xa8,
+ 0x0b, 0xbe, 0x61, 0xd4, 0xdf, 0x6a, 0xb5, 0x00 },
+ { 0x5b, 0x94, 0xd8, 0x17, 0x5d, 0x92, 0xde, 0x11,
+ 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xb2, 0x04, 0xde, 0x68, 0x6a, 0xdc, 0x06, 0xb0,
+ 0x02, 0xb4, 0x6e, 0xd8, 0xda, 0x6c, 0xb6, 0x00 },
+ { 0x5b, 0x94, 0xd8, 0x17, 0x5d, 0x92, 0xde, 0x11,
+ 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xbd, 0x0a, 0xd3, 0x64, 0x61, 0xd6, 0x0f, 0xb8,
+ 0x05, 0xb2, 0x6b, 0xdc, 0xd9, 0x6e, 0xb7, 0x00 },
+ { 0x32, 0xfd, 0xac, 0x63, 0x13, 0xdc, 0x8d, 0x42,
+ 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe8, 0x50, 0x98, 0x20, 0x08, 0xb0, 0x78, 0xc0,
+ 0x28, 0x90, 0x58, 0xe0, 0xc8, 0x70, 0xb8, 0x00 },
+ { 0x32, 0xfd, 0xac, 0x63, 0x13, 0xdc, 0x8d, 0x42,
+ 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe7, 0x5e, 0x95, 0x2c, 0x03, 0xba, 0x71, 0xc8,
+ 0x2f, 0x96, 0x5d, 0xe4, 0xcb, 0x72, 0xb9, 0x00 },
+ { 0x2f, 0xe0, 0xb1, 0x7e, 0x0e, 0xc1, 0x90, 0x5f,
+ 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf6, 0x4c, 0x82, 0x38, 0x1e, 0xa4, 0x6a, 0xd0,
+ 0x26, 0x9c, 0x52, 0xe8, 0xce, 0x74, 0xba, 0x00 },
+ { 0x2f, 0xe0, 0xb1, 0x7e, 0x0e, 0xc1, 0x90, 0x5f,
+ 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8,
+ 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78,
+ 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd4, 0x68, 0xac, 0x10, 0x24, 0x98, 0x5c, 0xe0,
+ 0x34, 0x88, 0x4c, 0xf0, 0xc4, 0x78, 0xbc, 0x00 },
+ { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78,
+ 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xdb, 0x66, 0xa1, 0x1c, 0x2f, 0x92, 0x55, 0xe8,
+ 0x33, 0x8e, 0x49, 0xf4, 0xc7, 0x7a, 0xbd, 0x00 },
+ { 0x08, 0xc7, 0x96, 0x59, 0x34, 0xfb, 0xaa, 0x65,
+ 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xca, 0x74, 0xb6, 0x08, 0x32, 0x8c, 0x4e, 0xf0,
+ 0x3a, 0x84, 0x46, 0xf8, 0xc2, 0x7c, 0xbe, 0x00 },
+ { 0x08, 0xc7, 0x96, 0x59, 0x34, 0xfb, 0xaa, 0x65,
+ 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc5, 0x7a, 0xbb, 0x04, 0x39, 0x86, 0x47, 0xf8,
+ 0x3d, 0x82, 0x43, 0xfc, 0xc1, 0x7e, 0xbf, 0x00 },
+ { 0x67, 0xfb, 0x42, 0xde, 0x2d, 0xb1, 0x08, 0x94,
+ 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x67, 0xfb, 0x42, 0xde, 0x2d, 0xb1, 0x08, 0x94,
+ 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x4f, 0x8e, 0xcd, 0x0c, 0x4b, 0x8a, 0xc9, 0x08,
+ 0x47, 0x86, 0xc5, 0x04, 0x43, 0x82, 0xc1, 0x00 },
+ { 0x7a, 0xe6, 0x5f, 0xc3, 0x30, 0xac, 0x15, 0x89,
+ 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x5e, 0x9c, 0xda, 0x18, 0x56, 0x94, 0xd2, 0x10,
+ 0x4e, 0x8c, 0xca, 0x08, 0x46, 0x84, 0xc2, 0x00 },
+ { 0x7a, 0xe6, 0x5f, 0xc3, 0x30, 0xac, 0x15, 0x89,
+ 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x51, 0x92, 0xd7, 0x14, 0x5d, 0x9e, 0xdb, 0x18,
+ 0x49, 0x8a, 0xcf, 0x0c, 0x45, 0x86, 0xc3, 0x00 },
+ { 0x40, 0xdc, 0x65, 0xf9, 0x17, 0x8b, 0x32, 0xae,
+ 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x7c, 0xb8, 0xf4, 0x30, 0x6c, 0xa8, 0xe4, 0x20,
+ 0x5c, 0x98, 0xd4, 0x10, 0x4c, 0x88, 0xc4, 0x00 },
+ { 0x40, 0xdc, 0x65, 0xf9, 0x17, 0x8b, 0x32, 0xae,
+ 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x73, 0xb6, 0xf9, 0x3c, 0x67, 0xa2, 0xed, 0x28,
+ 0x5b, 0x9e, 0xd1, 0x14, 0x4f, 0x8a, 0xc5, 0x00 },
+ { 0x5d, 0xc1, 0x78, 0xe4, 0x0a, 0x96, 0x2f, 0xb3,
+ 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x62, 0xa4, 0xee, 0x28, 0x7a, 0xbc, 0xf6, 0x30,
+ 0x52, 0x94, 0xde, 0x18, 0x4a, 0x8c, 0xc6, 0x00 },
+ { 0x5d, 0xc1, 0x78, 0xe4, 0x0a, 0x96, 0x2f, 0xb3,
+ 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x6d, 0xaa, 0xe3, 0x24, 0x71, 0xb6, 0xff, 0x38,
+ 0x55, 0x92, 0xdb, 0x1c, 0x49, 0x8e, 0xc7, 0x00 },
+ { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0,
+ 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x38, 0xf0, 0xa8, 0x60, 0x18, 0xd0, 0x88, 0x40,
+ 0x78, 0xb0, 0xe8, 0x20, 0x58, 0x90, 0xc8, 0x00 },
+ { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0,
+ 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x37, 0xfe, 0xa5, 0x6c, 0x13, 0xda, 0x81, 0x48,
+ 0x7f, 0xb6, 0xed, 0x24, 0x5b, 0x92, 0xc9, 0x00 },
+ { 0x29, 0xb5, 0x11, 0x8d, 0x59, 0xc5, 0x61, 0xfd,
+ 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x26, 0xec, 0xb2, 0x78, 0x0e, 0xc4, 0x9a, 0x50,
+ 0x76, 0xbc, 0xe2, 0x28, 0x5e, 0x94, 0xca, 0x00 },
+ { 0x29, 0xb5, 0x11, 0x8d, 0x59, 0xc5, 0x61, 0xfd,
+ 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x29, 0xe2, 0xbf, 0x74, 0x05, 0xce, 0x93, 0x58,
+ 0x71, 0xba, 0xe7, 0x2c, 0x5d, 0x96, 0xcb, 0x00 },
+ { 0x13, 0x8f, 0x2b, 0xb7, 0x7e, 0xe2, 0x46, 0xda,
+ 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x04, 0xc8, 0x9c, 0x50, 0x34, 0xf8, 0xac, 0x60,
+ 0x64, 0xa8, 0xfc, 0x30, 0x54, 0x98, 0xcc, 0x00 },
+ { 0x13, 0x8f, 0x2b, 0xb7, 0x7e, 0xe2, 0x46, 0xda,
+ 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x0b, 0xc6, 0x91, 0x5c, 0x3f, 0xf2, 0xa5, 0x68,
+ 0x63, 0xae, 0xf9, 0x34, 0x57, 0x9a, 0xcd, 0x00 },
+ { 0x0e, 0x92, 0x36, 0xaa, 0x63, 0xff, 0x5b, 0xc7,
+ 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x1a, 0xd4, 0x86, 0x48, 0x22, 0xec, 0xbe, 0x70,
+ 0x6a, 0xa4, 0xf6, 0x38, 0x52, 0x9c, 0xce, 0x00 },
+ { 0x0e, 0x92, 0x36, 0xaa, 0x63, 0xff, 0x5b, 0xc7,
+ 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78,
+ 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0xdc, 0x5d, 0xc3, 0x42, 0xe2, 0x63, 0xfd, 0x7c,
+ 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0xdc, 0x5d, 0xc3, 0x42, 0xe2, 0x63, 0xfd, 0x7c,
+ 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xbf, 0x6e, 0x1d, 0xcc, 0xfb, 0x2a, 0x59, 0x88,
+ 0x37, 0xe6, 0x95, 0x44, 0x73, 0xa2, 0xd1, 0x00 },
+ { 0xc1, 0x40, 0xde, 0x5f, 0xff, 0x7e, 0xe0, 0x61,
+ 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90,
+ 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0xc1, 0x40, 0xde, 0x5f, 0xff, 0x7e, 0xe0, 0x61,
+ 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xa1, 0x72, 0x07, 0xd4, 0xed, 0x3e, 0x4b, 0x98,
+ 0x39, 0xea, 0x9f, 0x4c, 0x75, 0xa6, 0xd3, 0x00 },
+ { 0xfb, 0x7a, 0xe4, 0x65, 0xd8, 0x59, 0xc7, 0x46,
+ 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x8c, 0x58, 0x24, 0xf0, 0xdc, 0x08, 0x74, 0xa0,
+ 0x2c, 0xf8, 0x84, 0x50, 0x7c, 0xa8, 0xd4, 0x00 },
+ { 0xfb, 0x7a, 0xe4, 0x65, 0xd8, 0x59, 0xc7, 0x46,
+ 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x83, 0x56, 0x29, 0xfc, 0xd7, 0x02, 0x7d, 0xa8,
+ 0x2b, 0xfe, 0x81, 0x54, 0x7f, 0xaa, 0xd5, 0x00 },
+ { 0xe6, 0x67, 0xf9, 0x78, 0xc5, 0x44, 0xda, 0x5b,
+ 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x92, 0x44, 0x3e, 0xe8, 0xca, 0x1c, 0x66, 0xb0,
+ 0x22, 0xf4, 0x8e, 0x58, 0x7a, 0xac, 0xd6, 0x00 },
+ { 0xe6, 0x67, 0xf9, 0x78, 0xc5, 0x44, 0xda, 0x5b,
+ 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x9d, 0x4a, 0x33, 0xe4, 0xc1, 0x16, 0x6f, 0xb8,
+ 0x25, 0xf2, 0x8b, 0x5c, 0x79, 0xae, 0xd7, 0x00 },
+ { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08,
+ 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc8, 0x10, 0x78, 0xa0, 0xa8, 0x70, 0x18, 0xc0,
+ 0x08, 0xd0, 0xb8, 0x60, 0x68, 0xb0, 0xd8, 0x00 },
+ { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08,
+ 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc7, 0x1e, 0x75, 0xac, 0xa3, 0x7a, 0x11, 0xc8,
+ 0x0f, 0xd6, 0xbd, 0x64, 0x6b, 0xb2, 0xd9, 0x00 },
+ { 0x92, 0x13, 0x90, 0x11, 0x96, 0x17, 0x94, 0x15,
+ 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd6, 0x0c, 0x62, 0xb8, 0xbe, 0x64, 0x0a, 0xd0,
+ 0x06, 0xdc, 0xb2, 0x68, 0x6e, 0xb4, 0xda, 0x00 },
+ { 0x92, 0x13, 0x90, 0x11, 0x96, 0x17, 0x94, 0x15,
+ 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd9, 0x02, 0x6f, 0xb4, 0xb5, 0x6e, 0x03, 0xd8,
+ 0x01, 0xda, 0xb7, 0x6c, 0x6d, 0xb6, 0xdb, 0x00 },
+ { 0xa8, 0x29, 0xaa, 0x2b, 0xb1, 0x30, 0xb3, 0x32,
+ 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf4, 0x28, 0x4c, 0x90, 0x84, 0x58, 0x3c, 0xe0,
+ 0x14, 0xc8, 0xac, 0x70, 0x64, 0xb8, 0xdc, 0x00 },
+ { 0xa8, 0x29, 0xaa, 0x2b, 0xb1, 0x30, 0xb3, 0x32,
+ 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xfb, 0x26, 0x41, 0x9c, 0x8f, 0x52, 0x35, 0xe8,
+ 0x13, 0xce, 0xa9, 0x74, 0x67, 0xba, 0xdd, 0x00 },
+ { 0xb5, 0x34, 0xb7, 0x36, 0xac, 0x2d, 0xae, 0x2f,
+ 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xea, 0x34, 0x56, 0x88, 0x92, 0x4c, 0x2e, 0xf0,
+ 0x1a, 0xc4, 0xa6, 0x78, 0x62, 0xbc, 0xde, 0x00 },
+ { 0xb5, 0x34, 0xb7, 0x36, 0xac, 0x2d, 0xae, 0x2f,
+ 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe5, 0x3a, 0x5b, 0x84, 0x99, 0x46, 0x27, 0xf8,
+ 0x1d, 0xc2, 0xa3, 0x7c, 0x61, 0xbe, 0xdf, 0x00 },
+ { 0x0c, 0xaa, 0x5d, 0xfb, 0xae, 0x08, 0xff, 0x59,
+ 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x0c, 0xaa, 0x5d, 0xfb, 0xae, 0x08, 0xff, 0x59,
+ 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xaf, 0x4e, 0x6d, 0x8c, 0x2b, 0xca, 0xe9, 0x08,
+ 0xa7, 0x46, 0x65, 0x84, 0x23, 0xc2, 0xe1, 0x00 },
+ { 0x11, 0xb7, 0x40, 0xe6, 0xb3, 0x15, 0xe2, 0x44,
+ 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xbe, 0x5c, 0x7a, 0x98, 0x36, 0xd4, 0xf2, 0x10,
+ 0xae, 0x4c, 0x6a, 0x88, 0x26, 0xc4, 0xe2, 0x00 },
+ { 0x11, 0xb7, 0x40, 0xe6, 0xb3, 0x15, 0xe2, 0x44,
+ 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xb1, 0x52, 0x77, 0x94, 0x3d, 0xde, 0xfb, 0x18,
+ 0xa9, 0x4a, 0x6f, 0x8c, 0x25, 0xc6, 0xe3, 0x00 },
+ { 0x2b, 0x8d, 0x7a, 0xdc, 0x94, 0x32, 0xc5, 0x63,
+ 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x9c, 0x78, 0x54, 0xb0, 0x0c, 0xe8, 0xc4, 0x20,
+ 0xbc, 0x58, 0x74, 0x90, 0x2c, 0xc8, 0xe4, 0x00 },
+ { 0x2b, 0x8d, 0x7a, 0xdc, 0x94, 0x32, 0xc5, 0x63,
+ 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x93, 0x76, 0x59, 0xbc, 0x07, 0xe2, 0xcd, 0x28,
+ 0xbb, 0x5e, 0x71, 0x94, 0x2f, 0xca, 0xe5, 0x00 },
+ { 0x36, 0x90, 0x67, 0xc1, 0x89, 0x2f, 0xd8, 0x7e,
+ 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x82, 0x64, 0x4e, 0xa8, 0x1a, 0xfc, 0xd6, 0x30,
+ 0xb2, 0x54, 0x7e, 0x98, 0x2a, 0xcc, 0xe6, 0x00 },
+ { 0x36, 0x90, 0x67, 0xc1, 0x89, 0x2f, 0xd8, 0x7e,
+ 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x8d, 0x6a, 0x43, 0xa4, 0x11, 0xf6, 0xdf, 0x38,
+ 0xb5, 0x52, 0x7b, 0x9c, 0x29, 0xce, 0xe7, 0x00 },
+ { 0x5f, 0xf9, 0x13, 0xb5, 0xc7, 0x61, 0x8b, 0x2d,
+ 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd8, 0x30, 0x08, 0xe0, 0x78, 0x90, 0xa8, 0x40,
+ 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x5f, 0xf9, 0x13, 0xb5, 0xc7, 0x61, 0x8b, 0x2d,
+ 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd7, 0x3e, 0x05, 0xec, 0x73, 0x9a, 0xa1, 0x48,
+ 0x9f, 0x76, 0x4d, 0xa4, 0x3b, 0xd2, 0xe9, 0x00 },
+ { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30,
+ 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc6, 0x2c, 0x12, 0xf8, 0x6e, 0x84, 0xba, 0x50,
+ 0x96, 0x7c, 0x42, 0xa8, 0x3e, 0xd4, 0xea, 0x00 },
+ { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30,
+ 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc9, 0x22, 0x1f, 0xf4, 0x65, 0x8e, 0xb3, 0x58,
+ 0x91, 0x7a, 0x47, 0xac, 0x3d, 0xd6, 0xeb, 0x00 },
+ { 0x78, 0xde, 0x34, 0x92, 0xfd, 0x5b, 0xb1, 0x17,
+ 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe4, 0x08, 0x3c, 0xd0, 0x54, 0xb8, 0x8c, 0x60,
+ 0x84, 0x68, 0x5c, 0xb0, 0x34, 0xd8, 0xec, 0x00 },
+ { 0x78, 0xde, 0x34, 0x92, 0xfd, 0x5b, 0xb1, 0x17,
+ 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xeb, 0x06, 0x31, 0xdc, 0x5f, 0xb2, 0x85, 0x68,
+ 0x83, 0x6e, 0x59, 0xb4, 0x37, 0xda, 0xed, 0x00 },
+ { 0x65, 0xc3, 0x29, 0x8f, 0xe0, 0x46, 0xac, 0x0a,
+ 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xfa, 0x14, 0x26, 0xc8, 0x42, 0xac, 0x9e, 0x70,
+ 0x8a, 0x64, 0x56, 0xb8, 0x32, 0xdc, 0xee, 0x00 },
+ { 0x65, 0xc3, 0x29, 0x8f, 0xe0, 0x46, 0xac, 0x0a,
+ 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf5, 0x1a, 0x2b, 0xc4, 0x49, 0xa6, 0x97, 0x78,
+ 0x8d, 0x62, 0x53, 0xbc, 0x31, 0xde, 0xef, 0x00 },
+ { 0xb7, 0x0c, 0xdc, 0x67, 0x61, 0xda, 0x0a, 0xb1,
+ 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0xb7, 0x0c, 0xdc, 0x67, 0x61, 0xda, 0x0a, 0xb1,
+ 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x5f, 0xae, 0xbd, 0x4c, 0x9b, 0x6a, 0x79, 0x88,
+ 0xd7, 0x26, 0x35, 0xc4, 0x13, 0xe2, 0xf1, 0x00 },
+ { 0xaa, 0x11, 0xc1, 0x7a, 0x7c, 0xc7, 0x17, 0xac,
+ 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x4e, 0xbc, 0xaa, 0x58, 0x86, 0x74, 0x62, 0x90,
+ 0xde, 0x2c, 0x3a, 0xc8, 0x16, 0xe4, 0xf2, 0x00 },
+ { 0xaa, 0x11, 0xc1, 0x7a, 0x7c, 0xc7, 0x17, 0xac,
+ 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x41, 0xb2, 0xa7, 0x54, 0x8d, 0x7e, 0x6b, 0x98,
+ 0xd9, 0x2a, 0x3f, 0xcc, 0x15, 0xe6, 0xf3, 0x00 },
+ { 0x90, 0x2b, 0xfb, 0x40, 0x5b, 0xe0, 0x30, 0x8b,
+ 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x6c, 0x98, 0x84, 0x70, 0xbc, 0x48, 0x54, 0xa0,
+ 0xcc, 0x38, 0x24, 0xd0, 0x1c, 0xe8, 0xf4, 0x00 },
+ { 0x90, 0x2b, 0xfb, 0x40, 0x5b, 0xe0, 0x30, 0x8b,
+ 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8,
+ 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x8d, 0x36, 0xe6, 0x5d, 0x46, 0xfd, 0x2d, 0x96,
+ 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x72, 0x84, 0x9e, 0x68, 0xaa, 0x5c, 0x46, 0xb0,
+ 0xc2, 0x34, 0x2e, 0xd8, 0x1a, 0xec, 0xf6, 0x00 },
+ { 0x8d, 0x36, 0xe6, 0x5d, 0x46, 0xfd, 0x2d, 0x96,
+ 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x7d, 0x8a, 0x93, 0x64, 0xa1, 0x56, 0x4f, 0xb8,
+ 0xc5, 0x32, 0x2b, 0xdc, 0x19, 0xee, 0xf7, 0x00 },
+ { 0xe4, 0x5f, 0x92, 0x29, 0x08, 0xb3, 0x7e, 0xc5,
+ 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x28, 0xd0, 0xd8, 0x20, 0xc8, 0x30, 0x38, 0xc0,
+ 0xe8, 0x10, 0x18, 0xe0, 0x08, 0xf0, 0xf8, 0x00 },
+ { 0xe4, 0x5f, 0x92, 0x29, 0x08, 0xb3, 0x7e, 0xc5,
+ 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x27, 0xde, 0xd5, 0x2c, 0xc3, 0x3a, 0x31, 0xc8,
+ 0xef, 0x16, 0x1d, 0xe4, 0x0b, 0xf2, 0xf9, 0x00 },
+ { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8,
+ 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x36, 0xcc, 0xc2, 0x38, 0xde, 0x24, 0x2a, 0xd0,
+ 0xe6, 0x1c, 0x12, 0xe8, 0x0e, 0xf4, 0xfa, 0x00 },
+ { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8,
+ 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x39, 0xc2, 0xcf, 0x34, 0xd5, 0x2e, 0x23, 0xd8,
+ 0xe1, 0x1a, 0x17, 0xec, 0x0d, 0xf6, 0xfb, 0x00 },
+ { 0xc3, 0x78, 0xb5, 0x0e, 0x32, 0x89, 0x44, 0xff,
+ 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x14, 0xe8, 0xec, 0x10, 0xe4, 0x18, 0x1c, 0xe0,
+ 0xf4, 0x08, 0x0c, 0xf0, 0x04, 0xf8, 0xfc, 0x00 },
+ { 0xc3, 0x78, 0xb5, 0x0e, 0x32, 0x89, 0x44, 0xff,
+ 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x1b, 0xe6, 0xe1, 0x1c, 0xef, 0x12, 0x15, 0xe8,
+ 0xf3, 0x0e, 0x09, 0xf4, 0x07, 0xfa, 0xfd, 0x00 },
+ { 0xde, 0x65, 0xa8, 0x13, 0x2f, 0x94, 0x59, 0xe2,
+ 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x0a, 0xf4, 0xf6, 0x08, 0xf2, 0x0c, 0x0e, 0xf0,
+ 0xfa, 0x04, 0x06, 0xf8, 0x02, 0xfc, 0xfe, 0x00 },
+ { 0xde, 0x65, 0xa8, 0x13, 0x2f, 0x94, 0x59, 0xe2,
+ 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x05, 0xfa, 0xfb, 0x04, 0xf9, 0x06, 0x07, 0xf8,
+ 0xfd, 0x02, 0x03, 0xfc, 0x01, 0xfe, 0xff, 0x00 }
+};
+/* END CSTYLED */
+#else
+/* BEGIN CSTYLED */
+const uint8_t
+__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = {
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
+ 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09,
+ 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c,
+ 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b,
+ 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12,
+ 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15,
+ 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
+ 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f,
+ 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36,
+ 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31,
+ 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24,
+ 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23,
+ 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a,
+ 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d,
+ 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+ 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,
+ 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79,
+ 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c,
+ 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b,
+ 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62,
+ 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65,
+ 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48,
+ 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f,
+ 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46,
+ 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41,
+ 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54,
+ 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a,
+ 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d,
+ 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7,
+ 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee,
+ 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9,
+ 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc,
+ 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb,
+ 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2,
+ 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8,
+ 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf,
+ 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6,
+ 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1,
+ 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4,
+ 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3,
+ 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca,
+ 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd,
+ 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97,
+ 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e,
+ 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99,
+ 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c,
+ 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b,
+ 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82,
+ 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85,
+ 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8,
+ 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf,
+ 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1,
+ 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4,
+ 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3,
+ 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba,
+ 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd,
+ 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7,
+ 0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce,
+ 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9,
+ 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc,
+ 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb,
+ 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2,
+ 0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5,
+ 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8,
+ 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff,
+ 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6,
+ 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1,
+ 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4,
+ 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3,
+ 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed,
+ 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7,
+ 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe,
+ 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac,
+ 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab,
+ 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2,
+ 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5,
+ 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88,
+ 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f,
+ 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86,
+ 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81,
+ 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94,
+ 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93,
+ 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a,
+ 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d,
+ 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27,
+ 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e,
+ 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29,
+ 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c,
+ 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b,
+ 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32,
+ 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35,
+ 0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18,
+ 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16,
+ 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11,
+ 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04,
+ 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03,
+ 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a,
+ 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d,
+ 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57,
+ 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e,
+ 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59,
+ 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b,
+ 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42,
+ 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45,
+ 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68,
+ 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f,
+ 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66,
+ 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61,
+ 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74,
+ 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73,
+ 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a,
+ 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d,
+ 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e,
+ 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89,
+ 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c,
+ 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b,
+ 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92,
+ 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95,
+ 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8,
+ 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf,
+ 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6,
+ 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1,
+ 0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4,
+ 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3,
+ 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa,
+ 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad,
+ 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7,
+ 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe,
+ 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9,
+ 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec,
+ 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb,
+ 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2,
+ 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5,
+ 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8,
+ 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf,
+ 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6,
+ 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1,
+ 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3,
+ 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda,
+ 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd,
+ 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67,
+ 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e,
+ 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69,
+ 0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c,
+ 0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b,
+ 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75,
+ 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58,
+ 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f,
+ 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56,
+ 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51,
+ 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44,
+ 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43,
+ 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a,
+ 0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d,
+ 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17,
+ 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e,
+ 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19,
+ 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c,
+ 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b,
+ 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02,
+ 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05,
+ 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28,
+ 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f,
+ 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26,
+ 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34,
+ 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33,
+ 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a,
+ 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d,
+ 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47,
+ 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e,
+ 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49,
+ 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c,
+ 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b,
+ 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52,
+ 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55,
+ 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78,
+ 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f,
+ 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76,
+ 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71,
+ 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64,
+ 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63,
+ 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a,
+ 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37,
+ 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39,
+ 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c,
+ 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b,
+ 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22,
+ 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25,
+ 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08,
+ 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f,
+ 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06,
+ 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01,
+ 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14,
+ 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13,
+ 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a,
+ 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d,
+ 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7,
+ 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae,
+ 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9,
+ 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc,
+ 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb,
+ 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2,
+ 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5,
+ 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f,
+ 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96,
+ 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91,
+ 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84,
+ 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83,
+ 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a,
+ 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d,
+ 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7,
+ 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde,
+ 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9,
+ 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc,
+ 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2,
+ 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5,
+ 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8,
+ 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef,
+ 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6,
+ 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1,
+ 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4,
+ 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3,
+ 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa,
+ 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd,
+ 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 }
+};
+/* END CSTYLED */
+#endif // ENDIANNESS
+#endif /* defined(__powerpc__) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h
new file mode 100644
index 000000000000..3842f5fd637c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h
@@ -0,0 +1,690 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2019 Romain Dolbeau. All rights reserved.
+ * <romain.dolbeau@european-processor-initiative.eu>
+ */
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "%[w"#REG"]"
+#define VR1_(_1, REG, ...) "%[w"#REG"]"
+#define VR2_(_1, _2, REG, ...) "%[w"#REG"]"
+#define VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]"
+#define VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]"
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]"
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]"
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]"
+
+/*
+ * Here we need registers that are not used otherwise. They will appear in
+ * unused asm for the cases with more registers than required, but GCC
+ * still has to make sure the constraints are correct, and duplicate
+ * constraints are illegal; the "register" number also doubles as the
+ * operand name (see the expansion sketch after the VR macros below).
+ */
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 36)
+#define VR3(r...) VR3_(r, 36, 35)
+#define VR4(r...) VR4_(r, 36, 35, 34, 33)
+#define VR5(r...) VR5_(r, 36, 35, 34, 33, 32)
+#define VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31)
+#define VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define VR(X) "%[w"#X"]"
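+
+/*
+ * As an illustration: with r == (0, 1, 2, 3), REG_CNT(r) evaluates to 4,
+ * VR0(r) expands to "%[w0]" and VR2(r) to "%[w2]", so the case-4 asm
+ * bodies below name operands [w0]..[w3], while VR4(r) falls back to the
+ * spare number 36 and only shows up in switch cases that are not taken
+ * for this register count.
+ */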
+
+#define RVR0_(REG, ...) [w##REG] "v" (w##REG)
+#define RVR1_(_1, REG, ...) [w##REG] "v" (w##REG)
+#define RVR2_(_1, _2, REG, ...) [w##REG] "v" (w##REG)
+#define RVR3_(_1, _2, _3, REG, ...) [w##REG] "v" (w##REG)
+#define RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "v" (w##REG)
+#define RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "v" (w##REG)
+#define RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "v" (w##REG)
+#define RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "v" (w##REG)
+
+#define RVR0(r...) RVR0_(r)
+#define RVR1(r...) RVR1_(r)
+#define RVR2(r...) RVR2_(r, 36)
+#define RVR3(r...) RVR3_(r, 36, 35)
+#define RVR4(r...) RVR4_(r, 36, 35, 34, 33)
+#define RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32)
+#define RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31)
+#define RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define RVR(X) [w##X] "v" (w##X)
+
+#define WVR0_(REG, ...) [w##REG] "=v" (w##REG)
+#define WVR1_(_1, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR2_(_1, _2, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR3_(_1, _2, _3, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=v" (w##REG)
+
+#define WVR0(r...) WVR0_(r)
+#define WVR1(r...) WVR1_(r)
+#define WVR2(r...) WVR2_(r, 36)
+#define WVR3(r...) WVR3_(r, 36, 35)
+#define WVR4(r...) WVR4_(r, 36, 35, 34, 33)
+#define WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32)
+#define WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31)
+#define WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define WVR(X) [w##X] "=v" (w##X)
+
+#define UVR0_(REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR1_(_1, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR2_(_1, _2, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&v" (w##REG)
+
+#define UVR0(r...) UVR0_(r)
+#define UVR1(r...) UVR1_(r)
+#define UVR2(r...) UVR2_(r, 36)
+#define UVR3(r...) UVR3_(r, 36, 35)
+#define UVR4(r...) UVR4_(r, 36, 35, 34, 33)
+#define UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32)
+#define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31)
+#define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define UVR(X) [w##X] "+&v" (w##X)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+#define OFFSET(ptr, val) (((unsigned char *)(ptr))+val)
+
+extern const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "lvx 21,0,%[SRC0]\n" \
+ "lvx 20,0,%[SRC1]\n" \
+ "lvx 19,0,%[SRC2]\n" \
+ "lvx 18,0,%[SRC3]\n" \
+ "vxor " VR0(r) "," VR0(r) ",21\n" \
+ "vxor " VR1(r) "," VR1(r) ",20\n" \
+ "vxor " VR2(r) "," VR2(r) ",19\n" \
+ "vxor " VR3(r) "," VR3(r) ",18\n" \
+ "lvx 21,0,%[SRC4]\n" \
+ "lvx 20,0,%[SRC5]\n" \
+ "lvx 19,0,%[SRC6]\n" \
+ "lvx 18,0,%[SRC7]\n" \
+ "vxor " VR4(r) "," VR4(r) ",21\n" \
+ "vxor " VR5(r) "," VR5(r) ",20\n" \
+ "vxor " VR6(r) "," VR6(r) ",19\n" \
+ "vxor " VR7(r) "," VR7(r) ",18\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r), \
+ UVR4(r), UVR5(r), UVR6(r), UVR7(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16))), \
+ [SRC2] "r" ((OFFSET(src, 32))), \
+ [SRC3] "r" ((OFFSET(src, 48))), \
+ [SRC4] "r" ((OFFSET(src, 64))), \
+ [SRC5] "r" ((OFFSET(src, 80))), \
+ [SRC6] "r" ((OFFSET(src, 96))), \
+ [SRC7] "r" ((OFFSET(src, 112))) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 4: \
+ __asm( \
+ "lvx 21,0,%[SRC0]\n" \
+ "lvx 20,0,%[SRC1]\n" \
+ "lvx 19,0,%[SRC2]\n" \
+ "lvx 18,0,%[SRC3]\n" \
+ "vxor " VR0(r) "," VR0(r) ",21\n" \
+ "vxor " VR1(r) "," VR1(r) ",20\n" \
+ "vxor " VR2(r) "," VR2(r) ",19\n" \
+ "vxor " VR3(r) "," VR3(r) ",18\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16))), \
+ [SRC2] "r" ((OFFSET(src, 32))), \
+ [SRC3] "r" ((OFFSET(src, 48))) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 2: \
+ __asm( \
+ "lvx 21,0,%[SRC0]\n" \
+ "lvx 20,0,%[SRC1]\n" \
+ "vxor " VR0(r) "," VR0(r) ",21\n" \
+ "vxor " VR1(r) "," VR1(r) ",20\n" \
+ : UVR0(r), UVR1(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16))) \
+ : "v20", "v21"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vxor " VR4(r) "," VR4(r) "," VR0(r) "\n" \
+ "vxor " VR5(r) "," VR5(r) "," VR1(r) "\n" \
+ "vxor " VR6(r) "," VR6(r) "," VR2(r) "\n" \
+ "vxor " VR7(r) "," VR7(r) "," VR3(r) "\n" \
+ : UVR4(r), UVR5(r), UVR6(r), UVR7(r) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vxor " VR2(r) "," VR2(r) "," VR0(r) "\n" \
+ "vxor " VR3(r) "," VR3(r) "," VR1(r) "\n" \
+ : UVR2(r), UVR3(r) \
+ : RVR0(r), RVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \
+ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \
+ "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \
+ "vxor " VR3(r) "," VR3(r) "," VR3(r) "\n" \
+ "vxor " VR4(r) "," VR4(r) "," VR4(r) "\n" \
+ "vxor " VR5(r) "," VR5(r) "," VR5(r) "\n" \
+ "vxor " VR6(r) "," VR6(r) "," VR6(r) "\n" \
+ "vxor " VR7(r) "," VR7(r) "," VR7(r) "\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \
+ WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \
+ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \
+ "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \
+ "vxor " VR3(r) "," VR3(r) "," VR3(r) "\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \
+ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \
+ : WVR0(r), WVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vor " VR4(r) "," VR0(r) "," VR0(r) "\n" \
+ "vor " VR5(r) "," VR1(r) "," VR1(r) "\n" \
+ "vor " VR6(r) "," VR2(r) "," VR2(r) "\n" \
+ "vor " VR7(r) "," VR3(r) "," VR3(r) "\n" \
+ : WVR4(r), WVR5(r), WVR6(r), WVR7(r) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vor " VR2(r) "," VR0(r) "," VR0(r) "\n" \
+ "vor " VR3(r) "," VR1(r) "," VR1(r) "\n" \
+ : WVR2(r), WVR3(r) \
+ : RVR0(r), RVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "lvx " VR0(r) " ,0,%[SRC0]\n" \
+ "lvx " VR1(r) " ,0,%[SRC1]\n" \
+ "lvx " VR2(r) " ,0,%[SRC2]\n" \
+ "lvx " VR3(r) " ,0,%[SRC3]\n" \
+ "lvx " VR4(r) " ,0,%[SRC4]\n" \
+ "lvx " VR5(r) " ,0,%[SRC5]\n" \
+ "lvx " VR6(r) " ,0,%[SRC6]\n" \
+ "lvx " VR7(r) " ,0,%[SRC7]\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \
+ WVR4(r), WVR5(r), WVR6(r), WVR7(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16))), \
+ [SRC2] "r" ((OFFSET(src, 32))), \
+ [SRC3] "r" ((OFFSET(src, 48))), \
+ [SRC4] "r" ((OFFSET(src, 64))), \
+ [SRC5] "r" ((OFFSET(src, 80))), \
+ [SRC6] "r" ((OFFSET(src, 96))), \
+ [SRC7] "r" ((OFFSET(src, 112)))); \
+ break; \
+ case 4: \
+ __asm( \
+ "lvx " VR0(r) " ,0,%[SRC0]\n" \
+ "lvx " VR1(r) " ,0,%[SRC1]\n" \
+ "lvx " VR2(r) " ,0,%[SRC2]\n" \
+ "lvx " VR3(r) " ,0,%[SRC3]\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16))), \
+ [SRC2] "r" ((OFFSET(src, 32))), \
+ [SRC3] "r" ((OFFSET(src, 48)))); \
+ break; \
+ case 2: \
+ __asm( \
+ "lvx " VR0(r) " ,0,%[SRC0]\n" \
+ "lvx " VR1(r) " ,0,%[SRC1]\n" \
+ : WVR0(r), WVR1(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16)))); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "stvx " VR0(r) " ,0,%[DST0]\n" \
+ "stvx " VR1(r) " ,0,%[DST1]\n" \
+ "stvx " VR2(r) " ,0,%[DST2]\n" \
+ "stvx " VR3(r) " ,0,%[DST3]\n" \
+ "stvx " VR4(r) " ,0,%[DST4]\n" \
+ "stvx " VR5(r) " ,0,%[DST5]\n" \
+ "stvx " VR6(r) " ,0,%[DST6]\n" \
+ "stvx " VR7(r) " ,0,%[DST7]\n" \
+ : : [DST0] "r" ((OFFSET(dst, 0))), \
+ [DST1] "r" ((OFFSET(dst, 16))), \
+ [DST2] "r" ((OFFSET(dst, 32))), \
+ [DST3] "r" ((OFFSET(dst, 48))), \
+ [DST4] "r" ((OFFSET(dst, 64))), \
+ [DST5] "r" ((OFFSET(dst, 80))), \
+ [DST6] "r" ((OFFSET(dst, 96))), \
+ [DST7] "r" ((OFFSET(dst, 112))), \
+ RVR0(r), RVR1(r), RVR2(r), RVR3(r), \
+ RVR4(r), RVR5(r), RVR6(r), RVR7(r) \
+ : "memory"); \
+ break; \
+ case 4: \
+ __asm( \
+ "stvx " VR0(r) " ,0,%[DST0]\n" \
+ "stvx " VR1(r) " ,0,%[DST1]\n" \
+ "stvx " VR2(r) " ,0,%[DST2]\n" \
+ "stvx " VR3(r) " ,0,%[DST3]\n" \
+ : : [DST0] "r" ((OFFSET(dst, 0))), \
+ [DST1] "r" ((OFFSET(dst, 16))), \
+ [DST2] "r" ((OFFSET(dst, 32))), \
+ [DST3] "r" ((OFFSET(dst, 48))), \
+ RVR0(r), RVR1(r), RVR2(r), RVR3(r) \
+ : "memory"); \
+ break; \
+ case 2: \
+ __asm( \
+ "stvx " VR0(r) " ,0,%[DST0]\n" \
+ "stvx " VR1(r) " ,0,%[DST1]\n" \
+ : : [DST0] "r" ((OFFSET(dst, 0))), \
+ [DST1] "r" ((OFFSET(dst, 16))), \
+ RVR0(r), RVR1(r) : "memory"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+/*
+ * Unfortunately we cannot use a macro for the register number here,
+ * because GCC would substitute the macro name rather than its value
+ * later on.
+ * The defines below are kept only as a reference for what each
+ * numbered register holds.
+ */
+#define _00 "17"
+#define _1d "16"
+#define _temp0 "19"
+#define _temp1 "18"
+
+#define MUL2_SETUP() \
+{ \
+ __asm( \
+ "vspltisb " VR(16) ",14\n" \
+ "vspltisb " VR(17) ",15\n" \
+ "vaddubm " VR(16) "," VR(17) "," VR(16) "\n" \
+ "vxor " VR(17) "," VR(17) "," VR(17) "\n" \
+ : WVR(16), WVR(17)); \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \
+ "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \
+ "vcmpgtsb 21," VR(17) "," VR2(r) "\n" \
+ "vcmpgtsb 20," VR(17) "," VR3(r) "\n" \
+ "vand 19,19," VR(16) "\n" \
+ "vand 18,18," VR(16) "\n" \
+ "vand 21,21," VR(16) "\n" \
+ "vand 20,20," VR(16) "\n" \
+ "vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n" \
+ "vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n" \
+ "vaddubm " VR2(r) "," VR2(r) "," VR2(r) "\n" \
+ "vaddubm " VR3(r) "," VR3(r) "," VR3(r) "\n" \
+ "vxor " VR0(r) ",19," VR0(r) "\n" \
+ "vxor " VR1(r) ",18," VR1(r) "\n" \
+ "vxor " VR2(r) ",21," VR2(r) "\n" \
+ "vxor " VR3(r) ",20," VR3(r) "\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \
+ : RVR(17), RVR(16) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 2: \
+ __asm( \
+ "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \
+ "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \
+ "vand 19,19," VR(16) "\n" \
+ "vand 18,18," VR(16) "\n" \
+ "vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n" \
+ "vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n" \
+ "vxor " VR0(r) ",19," VR0(r) "\n" \
+ "vxor " VR1(r) ",18," VR1(r) "\n" \
+ : UVR0(r), UVR1(r) \
+ : RVR(17), RVR(16) \
+ : "v18", "v19"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+/*
+ * Unfortunately we cannot use a macro for the register number here,
+ * because GCC would substitute the macro name rather than its value
+ * later on.
+ * The defines below are kept only as a reference for what each
+ * register holds (actual registers are used for the clobbered ones).
+ */
+#define _0f "15"
+#define _a_save "14"
+#define _b_save "13"
+#define _lt_mod_a "12"
+#define _lt_clmul_a "11"
+#define _lt_mod_b "10"
+#define _lt_clmul_b "15"
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ /* lts for upper part */ \
+ "vspltisb 15,15\n" \
+ "lvx 10,0,%[lt0]\n" \
+ "lvx 11,0,%[lt1]\n" \
+ /* upper part */ \
+ "vand 14," VR0(r) ",15\n" \
+ "vand 13," VR1(r) ",15\n" \
+ "vspltisb 15,4\n" \
+ "vsrab " VR0(r) "," VR0(r) ",15\n" \
+ "vsrab " VR1(r) "," VR1(r) ",15\n" \
+ \
+ "vperm 12,10,10," VR0(r) "\n" \
+ "vperm 10,10,10," VR1(r) "\n" \
+ "vperm 15,11,11," VR0(r) "\n" \
+ "vperm 11,11,11," VR1(r) "\n" \
+ \
+ "vxor " VR0(r) ",15,12\n" \
+ "vxor " VR1(r) ",11,10\n" \
+ /* lts for lower part */ \
+ "lvx 10,0,%[lt2]\n" \
+ "lvx 15,0,%[lt3]\n" \
+ /* lower part */ \
+ "vperm 12,10,10,14\n" \
+ "vperm 10,10,10,13\n" \
+ "vperm 11,15,15,14\n" \
+ "vperm 15,15,15,13\n" \
+ \
+ "vxor " VR0(r) "," VR0(r) ",12\n" \
+ "vxor " VR1(r) "," VR1(r) ",10\n" \
+ "vxor " VR0(r) "," VR0(r) ",11\n" \
+ "vxor " VR1(r) "," VR1(r) ",15\n" \
+ : UVR0(r), UVR1(r) \
+ : [lt0] "r" (&(gf_clmul_mod_lt[4*(c)+0][0])), \
+ [lt1] "r" (&(gf_clmul_mod_lt[4*(c)+1][0])), \
+ [lt2] "r" (&(gf_clmul_mod_lt[4*(c)+2][0])), \
+ [lt3] "r" (&(gf_clmul_mod_lt[4*(c)+3][0])) \
+ : "v10", "v11", "v12", "v13", "v14", "v15"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_23(r)); \
+ _MULx2(c, R_01(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+/* Overkill... */
+#if 0 // defined(_KERNEL)
+#define GEN_X_DEFINE_0_3() \
+register unsigned char w0 asm("0") __attribute__((vector_size(16))); \
+register unsigned char w1 asm("1") __attribute__((vector_size(16))); \
+register unsigned char w2 asm("2") __attribute__((vector_size(16))); \
+register unsigned char w3 asm("3") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_4_5() \
+register unsigned char w4 asm("4") __attribute__((vector_size(16))); \
+register unsigned char w5 asm("5") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_6_7() \
+register unsigned char w6 asm("6") __attribute__((vector_size(16))); \
+register unsigned char w7 asm("7") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_8_9() \
+register unsigned char w8 asm("8") __attribute__((vector_size(16))); \
+register unsigned char w9 asm("9") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_10_11() \
+register unsigned char w10 asm("10") __attribute__((vector_size(16))); \
+register unsigned char w11 asm("11") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_12_15() \
+register unsigned char w12 asm("12") __attribute__((vector_size(16))); \
+register unsigned char w13 asm("13") __attribute__((vector_size(16))); \
+register unsigned char w14 asm("14") __attribute__((vector_size(16))); \
+register unsigned char w15 asm("15") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_16() \
+register unsigned char w16 asm("16") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_17() \
+register unsigned char w17 asm("17") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_18_21() \
+register unsigned char w18 asm("18") __attribute__((vector_size(16))); \
+register unsigned char w19 asm("19") __attribute__((vector_size(16))); \
+register unsigned char w20 asm("20") __attribute__((vector_size(16))); \
+register unsigned char w21 asm("21") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_22_23() \
+register unsigned char w22 asm("22") __attribute__((vector_size(16))); \
+register unsigned char w23 asm("23") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_24_27() \
+register unsigned char w24 asm("24") __attribute__((vector_size(16))); \
+register unsigned char w25 asm("25") __attribute__((vector_size(16))); \
+register unsigned char w26 asm("26") __attribute__((vector_size(16))); \
+register unsigned char w27 asm("27") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_28_30() \
+register unsigned char w28 asm("28") __attribute__((vector_size(16))); \
+register unsigned char w29 asm("29") __attribute__((vector_size(16))); \
+register unsigned char w30 asm("30") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_31() \
+register unsigned char w31 asm("31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_32() \
+register unsigned char w32 asm("31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_33_36() \
+register unsigned char w33 asm("31") __attribute__((vector_size(16))); \
+register unsigned char w34 asm("31") __attribute__((vector_size(16))); \
+register unsigned char w35 asm("31") __attribute__((vector_size(16))); \
+register unsigned char w36 asm("31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_37_38() \
+register unsigned char w37 asm("31") __attribute__((vector_size(16))); \
+register unsigned char w38 asm("31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_ALL() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_10_11() \
+ GEN_X_DEFINE_12_15() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_18_21() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_24_27() \
+ GEN_X_DEFINE_28_30() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36() \
+ GEN_X_DEFINE_37_38()
+#else
+#define GEN_X_DEFINE_0_3() \
+ unsigned char w0 __attribute__((vector_size(16))); \
+ unsigned char w1 __attribute__((vector_size(16))); \
+ unsigned char w2 __attribute__((vector_size(16))); \
+ unsigned char w3 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_4_5() \
+ unsigned char w4 __attribute__((vector_size(16))); \
+ unsigned char w5 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_6_7() \
+ unsigned char w6 __attribute__((vector_size(16))); \
+ unsigned char w7 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_8_9() \
+ unsigned char w8 __attribute__((vector_size(16))); \
+ unsigned char w9 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_10_11() \
+ unsigned char w10 __attribute__((vector_size(16))); \
+ unsigned char w11 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_12_15() \
+ unsigned char w12 __attribute__((vector_size(16))); \
+ unsigned char w13 __attribute__((vector_size(16))); \
+ unsigned char w14 __attribute__((vector_size(16))); \
+ unsigned char w15 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_16() \
+ unsigned char w16 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_17() \
+ unsigned char w17 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_18_21() \
+ unsigned char w18 __attribute__((vector_size(16))); \
+ unsigned char w19 __attribute__((vector_size(16))); \
+ unsigned char w20 __attribute__((vector_size(16))); \
+ unsigned char w21 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_22_23() \
+ unsigned char w22 __attribute__((vector_size(16))); \
+ unsigned char w23 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_24_27() \
+ unsigned char w24 __attribute__((vector_size(16))); \
+ unsigned char w25 __attribute__((vector_size(16))); \
+ unsigned char w26 __attribute__((vector_size(16))); \
+ unsigned char w27 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_28_30() \
+ unsigned char w28 __attribute__((vector_size(16))); \
+ unsigned char w29 __attribute__((vector_size(16))); \
+ unsigned char w30 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_31() \
+ unsigned char w31 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_32() \
+ unsigned char w32 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_33_36() \
+ unsigned char w33 __attribute__((vector_size(16))); \
+ unsigned char w34 __attribute__((vector_size(16))); \
+ unsigned char w35 __attribute__((vector_size(16))); \
+ unsigned char w36 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_37_38() \
+ unsigned char w37 __attribute__((vector_size(16))); \
+ unsigned char w38 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_ALL() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_10_11() \
+ GEN_X_DEFINE_12_15() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_18_21() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_24_27() \
+ GEN_X_DEFINE_28_30() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36() \
+ GEN_X_DEFINE_37_38()
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c
new file mode 100644
index 000000000000..cd742e146ca6
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/vdev_raidz_impl.h>
+
+/*
+ * Provide native CPU scalar routines.
+ * Support 32bit and 64bit CPUs.
+ */
+#if ((~(0x0ULL)) >> 24) == 0xffULL
+#define ELEM_SIZE 4
+typedef uint32_t iv_t;
+#elif ((~(0x0ULL)) >> 56) == 0xffULL
+#define ELEM_SIZE 8
+typedef uint64_t iv_t;
+#endif
+
+/*
+ * Vector type used in scalar implementation
+ *
+ * The union is expected to be of native CPU register size. Since addition
+ * uses the XOR operation, it can be performed on all byte elements at once.
+ * Multiplication requires per-byte access.
+ */
+typedef union {
+ iv_t e;
+ uint8_t b[ELEM_SIZE];
+} v_t;
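+
+/*
+ * Editor's sketch (not part of the upstream change): because RAID-Z
+ * "addition" is XOR, one word-wide operation on the 'e' member updates
+ * all ELEM_SIZE byte elements at once, exactly as the XOR/XOR_ACC macros
+ * below do; multiplication, by contrast, must go through 'b'.
+ */
+static inline void
+v_add_sketch(v_t *acc, const v_t *src)
+{
+	acc->e ^= src->e;	/* all bytes of the element in one op */
+}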
+
+/*
+ * Precomputed lookup tables for multiplication by a constant
+ *
+ * The reconstruction path requires multiplication by constant factors. Instead
+ * of performing a two-step lookup (log & exp tables), a direct lookup can be
+ * used. Multiplication of element 'a' by a constant 'c' is obtained as:
+ *
+ *	r = vdev_raidz_mul_lt[c_log][a];
+ *
+ * where c_log = vdev_raidz_log2[c]. The log of the coefficient factors is used
+ * because it is faster to obtain while solving the syndrome equations.
+ *
+ * PERFORMANCE NOTE:
+ * Even though the complete lookup table uses 64 KiB, only a relatively small
+ * portion of it is used at any one time. The number of bytes accessed for
+ * different cases is:
+ *	- 1 failed disk: 256 B (1 mul. coefficient)
+ *	- 2 failed disks: 512 B (2 mul. coefficients)
+ *	- 3 failed disks: 1536 B (6 mul. coefficients)
+ *
+ * Compared to the traditional log/exp method, the accessed lookup table region
+ * is larger only when reconstructing 3 failed disks. But since the result is
+ * obtained in a single lookup step, performance is doubled.
+ */
+static uint8_t vdev_raidz_mul_lt[256][256] __attribute__((aligned(256)));
+
+static void
+raidz_init_scalar(void)
+{
+ int c, i;
+ for (c = 0; c < 256; c++)
+ for (i = 0; i < 256; i++)
+ vdev_raidz_mul_lt[c][i] = gf_mul(c, i);
+}
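+
+/*
+ * Editor's sketch (not part of the upstream change): with the table filled
+ * as above (vdev_raidz_mul_lt[c][i] == gf_mul(c, i)), multiplying a vector
+ * element by a constant 'c' is one table lookup per byte, mirroring what
+ * the MUL() macro further below does.
+ */
+static inline void
+v_mul_sketch(v_t *a, unsigned c)
+{
+	const uint8_t *mul_lt = vdev_raidz_mul_lt[c];
+	int i;
+
+	for (i = 0; i < ELEM_SIZE; i++)
+		a->b[i] = mul_lt[a->b[i]];	/* == gf_mul(c, a->b[i]) */
+}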
+
+#define PREFETCHNTA(ptr, offset) {}
+#define PREFETCH(ptr, offset) {}
+
+#define XOR_ACC(src, acc) acc.e ^= ((v_t *)src)[0].e
+#define XOR(src, acc) acc.e ^= src.e
+#define ZERO(acc) acc.e = 0
+#define COPY(src, dst) dst = src
+#define LOAD(src, val) val = ((v_t *)src)[0]
+#define STORE(dst, val) ((v_t *)dst)[0] = val
+
+/*
+ * Constants used for optimized multiplication by 2.
+ */
+static const struct {
+ iv_t mod;
+ iv_t mask;
+ iv_t msb;
+} scalar_mul2_consts = {
+#if ELEM_SIZE == 8
+ .mod = 0x1d1d1d1d1d1d1d1dULL,
+ .mask = 0xfefefefefefefefeULL,
+ .msb = 0x8080808080808080ULL,
+#else
+ .mod = 0x1d1d1d1dULL,
+ .mask = 0xfefefefeULL,
+ .msb = 0x80808080ULL,
+#endif
+};
+
+#define MUL2_SETUP() {}
+
+#define MUL2(a) \
+{ \
+ iv_t _mask; \
+ \
+ _mask = (a).e & scalar_mul2_consts.msb; \
+ _mask = (_mask << 1) - (_mask >> 7); \
+ (a).e = ((a).e << 1) & scalar_mul2_consts.mask; \
+ (a).e = (a).e ^ (_mask & scalar_mul2_consts.mod); \
+}
+
+#define MUL4(a) \
+{ \
+ MUL2(a); \
+ MUL2(a); \
+}
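+
+/*
+ * Editor's sketch (not part of the upstream change): single-byte view of
+ * the MUL2() step above. Each byte's sign bit selects whether the reduction
+ * constant 0x1d (the low byte of the RAID-Z polynomial 0x11d, matching
+ * scalar_mul2_consts.mod) is XORed in after the left shift.
+ */
+static inline uint8_t
+gf_mul2_byte_sketch(uint8_t a)
+{
+	uint8_t mask = (a & 0x80) ? 0xff : 0x00;
+
+	return (((a << 1) & 0xfe) ^ (mask & 0x1d));
+}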
+
+#define MUL(c, a) \
+{ \
+ const uint8_t *mul_lt = vdev_raidz_mul_lt[c]; \
+ switch (ELEM_SIZE) { \
+ case 8: \
+ a.b[7] = mul_lt[a.b[7]]; \
+ a.b[6] = mul_lt[a.b[6]]; \
+ a.b[5] = mul_lt[a.b[5]]; \
+ a.b[4] = mul_lt[a.b[4]]; \
+ /* falls through */ \
+ case 4: \
+ a.b[3] = mul_lt[a.b[3]]; \
+ a.b[2] = mul_lt[a.b[2]]; \
+ a.b[1] = mul_lt[a.b[1]]; \
+ a.b[0] = mul_lt[a.b[0]]; \
+ break; \
+ } \
+}
+
+#define raidz_math_begin() {}
+#define raidz_math_end() {}
+
+#define SYN_STRIDE 1
+
+#define ZERO_DEFINE() v_t d0
+#define ZERO_STRIDE 1
+#define ZERO_D d0
+
+#define COPY_DEFINE() v_t d0
+#define COPY_STRIDE 1
+#define COPY_D d0
+
+#define ADD_DEFINE() v_t d0
+#define ADD_STRIDE 1
+#define ADD_D d0
+
+#define MUL_DEFINE() v_t d0
+#define MUL_STRIDE 1
+#define MUL_D d0
+
+#define GEN_P_STRIDE 1
+#define GEN_P_DEFINE() v_t p0
+#define GEN_P_P p0
+
+#define GEN_PQ_STRIDE 1
+#define GEN_PQ_DEFINE() v_t d0, c0
+#define GEN_PQ_D d0
+#define GEN_PQ_C c0
+
+#define GEN_PQR_STRIDE 1
+#define GEN_PQR_DEFINE() v_t d0, c0
+#define GEN_PQR_D d0
+#define GEN_PQR_C c0
+
+#define SYN_Q_DEFINE() v_t d0, x0
+#define SYN_Q_D d0
+#define SYN_Q_X x0
+
+
+#define SYN_R_DEFINE() v_t d0, x0
+#define SYN_R_D d0
+#define SYN_R_X x0
+
+
+#define SYN_PQ_DEFINE() v_t d0, x0
+#define SYN_PQ_D d0
+#define SYN_PQ_X x0
+
+
+#define REC_PQ_STRIDE 1
+#define REC_PQ_DEFINE() v_t x0, y0, t0
+#define REC_PQ_X x0
+#define REC_PQ_Y y0
+#define REC_PQ_T t0
+
+
+#define SYN_PR_DEFINE() v_t d0, x0
+#define SYN_PR_D d0
+#define SYN_PR_X x0
+
+#define REC_PR_STRIDE 1
+#define REC_PR_DEFINE() v_t x0, y0, t0
+#define REC_PR_X x0
+#define REC_PR_Y y0
+#define REC_PR_T t0
+
+
+#define SYN_QR_DEFINE() v_t d0, x0
+#define SYN_QR_D d0
+#define SYN_QR_X x0
+
+
+#define REC_QR_STRIDE 1
+#define REC_QR_DEFINE() v_t x0, y0, t0
+#define REC_QR_X x0
+#define REC_QR_Y y0
+#define REC_QR_T t0
+
+
+#define SYN_PQR_DEFINE() v_t d0, x0
+#define SYN_PQR_D d0
+#define SYN_PQR_X x0
+
+#define REC_PQR_STRIDE 1
+#define REC_PQR_DEFINE() v_t x0, y0, z0, xs0, ys0
+#define REC_PQR_X x0
+#define REC_PQR_Y y0
+#define REC_PQR_Z z0
+#define REC_PQR_XS xs0
+#define REC_PQR_YS ys0
+
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(scalar);
+DEFINE_REC_METHODS(scalar);
+
+boolean_t
+raidz_will_scalar_work(void)
+{
+ return (B_TRUE); /* always */
+}
+
+const raidz_impl_ops_t vdev_raidz_scalar_impl = {
+ .init = raidz_init_scalar,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(scalar),
+ .rec = RAIDZ_REC_METHODS(scalar),
+ .is_supported = &raidz_will_scalar_work,
+ .name = "scalar"
+};
+
+/* Powers of 2 in the RAID-Z Galois field. */
+const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))) = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
+ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
+ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
+ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
+ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
+ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
+ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
+ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
+ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
+ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
+ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
+ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
+ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
+ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
+ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
+ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
+ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
+ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
+ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
+ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
+ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
+ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
+ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
+ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
+ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
+ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
+ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
+ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
+ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
+ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
+ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
+};
+
+/* Logs of 2 in the RAID-Z Galois field. */
+const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))) = {
+ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
+ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
+ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
+ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
+ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
+ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
+ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
+ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
+ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
+ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
+ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
+ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
+ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
+ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
+ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
+ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
+ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
+ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
+ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
+ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
+ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
+ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
+ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
+ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
+ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
+ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
+ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
+ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
+ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
+ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
+ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
+ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c
new file mode 100644
index 000000000000..56a0b123d952
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c
@@ -0,0 +1,631 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__x86_64) && defined(HAVE_SSE2)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+#include <sys/debug.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "xmm"#REG
+#define VR1_(_1, REG, ...) "xmm"#REG
+#define VR2_(_1, _2, REG, ...) "xmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "xmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "xmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "xmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "xmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "xmm"#REG
+
+#define VR0(r...) VR0_(r, 1, 2, 3, 4, 5, 6)
+#define VR1(r...) VR1_(r, 1, 2, 3, 4, 5, 6)
+#define VR2(r...) VR2_(r, 1, 2, 3, 4, 5, 6)
+#define VR3(r...) VR3_(r, 1, 2, 3, 4, 5, 6)
+#define VR4(r...) VR4_(r, 1, 2, 3, 4, 5, 6)
+#define VR5(r...) VR5_(r, 1, 2, 3, 4, 5, 6)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4, 5, 6)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5, 6)
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "pxor 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "pxor 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 1: \
+ __asm("pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR4(r) "\n" \
+ "pxor %" VR1(r) ", %" VR5(r) "\n" \
+ "pxor %" VR2(r) ", %" VR6(r) "\n" \
+ "pxor %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR2(r) "\n" \
+ "pxor %" VR1(r) ", %" VR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR1(r)); \
+ break; \
+ } \
+}
+
+#define ZERO(r...) XOR(r, r)
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR4(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR5(r) "\n" \
+ "movdqa %" VR2(r) ", %" VR6(r) "\n" \
+ "movdqa %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR2(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR1(r)); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "movdqa 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "movdqa 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 1: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ "movdqa %%" VR2(r)", 0x20(%[DST])\n" \
+ "movdqa %%" VR3(r)", 0x30(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 1: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+#define MUL2_SETUP() \
+{ \
+ __asm( \
+ "movd %[mask], %%xmm15\n" \
+ "pshufd $0x0, %%xmm15, %%xmm15\n" \
+ : : [mask] "r" (0x1d1d1d1d)); \
+}
+
+#define _MUL2_x1(a0) \
+{ \
+ __asm( \
+ "pxor %xmm14, %xmm14\n" \
+ "pcmpgtb %" a0", %xmm14\n" \
+ "pand %xmm15, %xmm14\n" \
+ "paddb %" a0", %" a0 "\n" \
+ "pxor %xmm14, %" a0); \
+}
+
+#define _MUL2_x2(a0, a1) \
+{ \
+ __asm( \
+ "pxor %xmm14, %xmm14\n" \
+ "pxor %xmm13, %xmm13\n" \
+ "pcmpgtb %" a0", %xmm14\n" \
+ "pcmpgtb %" a1", %xmm13\n" \
+ "pand %xmm15, %xmm14\n" \
+ "pand %xmm15, %xmm13\n" \
+ "paddb %" a0", %" a0 "\n" \
+ "paddb %" a1", %" a1 "\n" \
+ "pxor %xmm14, %" a0 "\n" \
+ "pxor %xmm13, %" a1); \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2_x2(VR0(r), VR1(r)); \
+ _MUL2_x2(VR2(r), VR3(r)); \
+ break; \
+ case 2: \
+ _MUL2_x2(VR0(r), VR1(r)); \
+ break; \
+ case 1: \
+ _MUL2_x1(VR0(r)); \
+ break; \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+/* General multiplication by adding powers of two */
+
+#define _MUL_PARAM(x, in, acc) \
+{ \
+ if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \
+ if (x & 0xfe) { MUL2(in); } \
+ if (x & 0x02) { XOR(in, acc); } \
+ if (x & 0xfc) { MUL2(in); } \
+ if (x & 0x04) { XOR(in, acc); } \
+ if (x & 0xf8) { MUL2(in); } \
+ if (x & 0x08) { XOR(in, acc); } \
+ if (x & 0xf0) { MUL2(in); } \
+ if (x & 0x10) { XOR(in, acc); } \
+ if (x & 0xe0) { MUL2(in); } \
+ if (x & 0x20) { XOR(in, acc); } \
+ if (x & 0xc0) { MUL2(in); } \
+ if (x & 0x40) { XOR(in, acc); } \
+ if (x & 0x80) { MUL2(in); XOR(in, acc); } \
+}
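+
+/*
+ * Editor's sketch (not part of the upstream change): _MUL_PARAM() above
+ * unrolls a shift-and-add multiplication over GF(2^8). Walk the bits of the
+ * constant 'x', doubling 'in' once per bit position and XOR-ing it into
+ * 'acc' whenever the corresponding bit of 'x' is set; the (x & 0xfe),
+ * (x & 0xfc), ... guards merely skip doublings once no higher bits remain.
+ * A single-byte equivalent, assuming the 0x1d reduction used throughout
+ * this file:
+ */
+static inline uint8_t
+gf_mul_byte_sketch(unsigned x, uint8_t in)
+{
+	uint8_t acc = 0;
+
+	while (x != 0) {
+		if (x & 1)
+			acc ^= in;
+		/* in *= 2 in GF(2^8) */
+		in = ((in << 1) & 0xfe) ^ ((in & 0x80) ? 0x1d : 0x00);
+		x >>= 1;
+	}
+	return (acc);
+}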
+
+#define _mul_x1_in 11
+#define _mul_x1_acc 12
+
+#define MUL_x1_DEFINE(x) \
+static void \
+mul_x1_ ## x(void) { _MUL_PARAM(x, _mul_x1_in, _mul_x1_acc); }
+
+#define _mul_x2_in 9, 10
+#define _mul_x2_acc 11, 12
+
+#define MUL_x2_DEFINE(x) \
+static void \
+mul_x2_ ## x(void) { _MUL_PARAM(x, _mul_x2_in, _mul_x2_acc); }
+
+MUL_x1_DEFINE(0); MUL_x1_DEFINE(1); MUL_x1_DEFINE(2); MUL_x1_DEFINE(3);
+MUL_x1_DEFINE(4); MUL_x1_DEFINE(5); MUL_x1_DEFINE(6); MUL_x1_DEFINE(7);
+MUL_x1_DEFINE(8); MUL_x1_DEFINE(9); MUL_x1_DEFINE(10); MUL_x1_DEFINE(11);
+MUL_x1_DEFINE(12); MUL_x1_DEFINE(13); MUL_x1_DEFINE(14); MUL_x1_DEFINE(15);
+MUL_x1_DEFINE(16); MUL_x1_DEFINE(17); MUL_x1_DEFINE(18); MUL_x1_DEFINE(19);
+MUL_x1_DEFINE(20); MUL_x1_DEFINE(21); MUL_x1_DEFINE(22); MUL_x1_DEFINE(23);
+MUL_x1_DEFINE(24); MUL_x1_DEFINE(25); MUL_x1_DEFINE(26); MUL_x1_DEFINE(27);
+MUL_x1_DEFINE(28); MUL_x1_DEFINE(29); MUL_x1_DEFINE(30); MUL_x1_DEFINE(31);
+MUL_x1_DEFINE(32); MUL_x1_DEFINE(33); MUL_x1_DEFINE(34); MUL_x1_DEFINE(35);
+MUL_x1_DEFINE(36); MUL_x1_DEFINE(37); MUL_x1_DEFINE(38); MUL_x1_DEFINE(39);
+MUL_x1_DEFINE(40); MUL_x1_DEFINE(41); MUL_x1_DEFINE(42); MUL_x1_DEFINE(43);
+MUL_x1_DEFINE(44); MUL_x1_DEFINE(45); MUL_x1_DEFINE(46); MUL_x1_DEFINE(47);
+MUL_x1_DEFINE(48); MUL_x1_DEFINE(49); MUL_x1_DEFINE(50); MUL_x1_DEFINE(51);
+MUL_x1_DEFINE(52); MUL_x1_DEFINE(53); MUL_x1_DEFINE(54); MUL_x1_DEFINE(55);
+MUL_x1_DEFINE(56); MUL_x1_DEFINE(57); MUL_x1_DEFINE(58); MUL_x1_DEFINE(59);
+MUL_x1_DEFINE(60); MUL_x1_DEFINE(61); MUL_x1_DEFINE(62); MUL_x1_DEFINE(63);
+MUL_x1_DEFINE(64); MUL_x1_DEFINE(65); MUL_x1_DEFINE(66); MUL_x1_DEFINE(67);
+MUL_x1_DEFINE(68); MUL_x1_DEFINE(69); MUL_x1_DEFINE(70); MUL_x1_DEFINE(71);
+MUL_x1_DEFINE(72); MUL_x1_DEFINE(73); MUL_x1_DEFINE(74); MUL_x1_DEFINE(75);
+MUL_x1_DEFINE(76); MUL_x1_DEFINE(77); MUL_x1_DEFINE(78); MUL_x1_DEFINE(79);
+MUL_x1_DEFINE(80); MUL_x1_DEFINE(81); MUL_x1_DEFINE(82); MUL_x1_DEFINE(83);
+MUL_x1_DEFINE(84); MUL_x1_DEFINE(85); MUL_x1_DEFINE(86); MUL_x1_DEFINE(87);
+MUL_x1_DEFINE(88); MUL_x1_DEFINE(89); MUL_x1_DEFINE(90); MUL_x1_DEFINE(91);
+MUL_x1_DEFINE(92); MUL_x1_DEFINE(93); MUL_x1_DEFINE(94); MUL_x1_DEFINE(95);
+MUL_x1_DEFINE(96); MUL_x1_DEFINE(97); MUL_x1_DEFINE(98); MUL_x1_DEFINE(99);
+MUL_x1_DEFINE(100); MUL_x1_DEFINE(101); MUL_x1_DEFINE(102); MUL_x1_DEFINE(103);
+MUL_x1_DEFINE(104); MUL_x1_DEFINE(105); MUL_x1_DEFINE(106); MUL_x1_DEFINE(107);
+MUL_x1_DEFINE(108); MUL_x1_DEFINE(109); MUL_x1_DEFINE(110); MUL_x1_DEFINE(111);
+MUL_x1_DEFINE(112); MUL_x1_DEFINE(113); MUL_x1_DEFINE(114); MUL_x1_DEFINE(115);
+MUL_x1_DEFINE(116); MUL_x1_DEFINE(117); MUL_x1_DEFINE(118); MUL_x1_DEFINE(119);
+MUL_x1_DEFINE(120); MUL_x1_DEFINE(121); MUL_x1_DEFINE(122); MUL_x1_DEFINE(123);
+MUL_x1_DEFINE(124); MUL_x1_DEFINE(125); MUL_x1_DEFINE(126); MUL_x1_DEFINE(127);
+MUL_x1_DEFINE(128); MUL_x1_DEFINE(129); MUL_x1_DEFINE(130); MUL_x1_DEFINE(131);
+MUL_x1_DEFINE(132); MUL_x1_DEFINE(133); MUL_x1_DEFINE(134); MUL_x1_DEFINE(135);
+MUL_x1_DEFINE(136); MUL_x1_DEFINE(137); MUL_x1_DEFINE(138); MUL_x1_DEFINE(139);
+MUL_x1_DEFINE(140); MUL_x1_DEFINE(141); MUL_x1_DEFINE(142); MUL_x1_DEFINE(143);
+MUL_x1_DEFINE(144); MUL_x1_DEFINE(145); MUL_x1_DEFINE(146); MUL_x1_DEFINE(147);
+MUL_x1_DEFINE(148); MUL_x1_DEFINE(149); MUL_x1_DEFINE(150); MUL_x1_DEFINE(151);
+MUL_x1_DEFINE(152); MUL_x1_DEFINE(153); MUL_x1_DEFINE(154); MUL_x1_DEFINE(155);
+MUL_x1_DEFINE(156); MUL_x1_DEFINE(157); MUL_x1_DEFINE(158); MUL_x1_DEFINE(159);
+MUL_x1_DEFINE(160); MUL_x1_DEFINE(161); MUL_x1_DEFINE(162); MUL_x1_DEFINE(163);
+MUL_x1_DEFINE(164); MUL_x1_DEFINE(165); MUL_x1_DEFINE(166); MUL_x1_DEFINE(167);
+MUL_x1_DEFINE(168); MUL_x1_DEFINE(169); MUL_x1_DEFINE(170); MUL_x1_DEFINE(171);
+MUL_x1_DEFINE(172); MUL_x1_DEFINE(173); MUL_x1_DEFINE(174); MUL_x1_DEFINE(175);
+MUL_x1_DEFINE(176); MUL_x1_DEFINE(177); MUL_x1_DEFINE(178); MUL_x1_DEFINE(179);
+MUL_x1_DEFINE(180); MUL_x1_DEFINE(181); MUL_x1_DEFINE(182); MUL_x1_DEFINE(183);
+MUL_x1_DEFINE(184); MUL_x1_DEFINE(185); MUL_x1_DEFINE(186); MUL_x1_DEFINE(187);
+MUL_x1_DEFINE(188); MUL_x1_DEFINE(189); MUL_x1_DEFINE(190); MUL_x1_DEFINE(191);
+MUL_x1_DEFINE(192); MUL_x1_DEFINE(193); MUL_x1_DEFINE(194); MUL_x1_DEFINE(195);
+MUL_x1_DEFINE(196); MUL_x1_DEFINE(197); MUL_x1_DEFINE(198); MUL_x1_DEFINE(199);
+MUL_x1_DEFINE(200); MUL_x1_DEFINE(201); MUL_x1_DEFINE(202); MUL_x1_DEFINE(203);
+MUL_x1_DEFINE(204); MUL_x1_DEFINE(205); MUL_x1_DEFINE(206); MUL_x1_DEFINE(207);
+MUL_x1_DEFINE(208); MUL_x1_DEFINE(209); MUL_x1_DEFINE(210); MUL_x1_DEFINE(211);
+MUL_x1_DEFINE(212); MUL_x1_DEFINE(213); MUL_x1_DEFINE(214); MUL_x1_DEFINE(215);
+MUL_x1_DEFINE(216); MUL_x1_DEFINE(217); MUL_x1_DEFINE(218); MUL_x1_DEFINE(219);
+MUL_x1_DEFINE(220); MUL_x1_DEFINE(221); MUL_x1_DEFINE(222); MUL_x1_DEFINE(223);
+MUL_x1_DEFINE(224); MUL_x1_DEFINE(225); MUL_x1_DEFINE(226); MUL_x1_DEFINE(227);
+MUL_x1_DEFINE(228); MUL_x1_DEFINE(229); MUL_x1_DEFINE(230); MUL_x1_DEFINE(231);
+MUL_x1_DEFINE(232); MUL_x1_DEFINE(233); MUL_x1_DEFINE(234); MUL_x1_DEFINE(235);
+MUL_x1_DEFINE(236); MUL_x1_DEFINE(237); MUL_x1_DEFINE(238); MUL_x1_DEFINE(239);
+MUL_x1_DEFINE(240); MUL_x1_DEFINE(241); MUL_x1_DEFINE(242); MUL_x1_DEFINE(243);
+MUL_x1_DEFINE(244); MUL_x1_DEFINE(245); MUL_x1_DEFINE(246); MUL_x1_DEFINE(247);
+MUL_x1_DEFINE(248); MUL_x1_DEFINE(249); MUL_x1_DEFINE(250); MUL_x1_DEFINE(251);
+MUL_x1_DEFINE(252); MUL_x1_DEFINE(253); MUL_x1_DEFINE(254); MUL_x1_DEFINE(255);
+
+MUL_x2_DEFINE(0); MUL_x2_DEFINE(1); MUL_x2_DEFINE(2); MUL_x2_DEFINE(3);
+MUL_x2_DEFINE(4); MUL_x2_DEFINE(5); MUL_x2_DEFINE(6); MUL_x2_DEFINE(7);
+MUL_x2_DEFINE(8); MUL_x2_DEFINE(9); MUL_x2_DEFINE(10); MUL_x2_DEFINE(11);
+MUL_x2_DEFINE(12); MUL_x2_DEFINE(13); MUL_x2_DEFINE(14); MUL_x2_DEFINE(15);
+MUL_x2_DEFINE(16); MUL_x2_DEFINE(17); MUL_x2_DEFINE(18); MUL_x2_DEFINE(19);
+MUL_x2_DEFINE(20); MUL_x2_DEFINE(21); MUL_x2_DEFINE(22); MUL_x2_DEFINE(23);
+MUL_x2_DEFINE(24); MUL_x2_DEFINE(25); MUL_x2_DEFINE(26); MUL_x2_DEFINE(27);
+MUL_x2_DEFINE(28); MUL_x2_DEFINE(29); MUL_x2_DEFINE(30); MUL_x2_DEFINE(31);
+MUL_x2_DEFINE(32); MUL_x2_DEFINE(33); MUL_x2_DEFINE(34); MUL_x2_DEFINE(35);
+MUL_x2_DEFINE(36); MUL_x2_DEFINE(37); MUL_x2_DEFINE(38); MUL_x2_DEFINE(39);
+MUL_x2_DEFINE(40); MUL_x2_DEFINE(41); MUL_x2_DEFINE(42); MUL_x2_DEFINE(43);
+MUL_x2_DEFINE(44); MUL_x2_DEFINE(45); MUL_x2_DEFINE(46); MUL_x2_DEFINE(47);
+MUL_x2_DEFINE(48); MUL_x2_DEFINE(49); MUL_x2_DEFINE(50); MUL_x2_DEFINE(51);
+MUL_x2_DEFINE(52); MUL_x2_DEFINE(53); MUL_x2_DEFINE(54); MUL_x2_DEFINE(55);
+MUL_x2_DEFINE(56); MUL_x2_DEFINE(57); MUL_x2_DEFINE(58); MUL_x2_DEFINE(59);
+MUL_x2_DEFINE(60); MUL_x2_DEFINE(61); MUL_x2_DEFINE(62); MUL_x2_DEFINE(63);
+MUL_x2_DEFINE(64); MUL_x2_DEFINE(65); MUL_x2_DEFINE(66); MUL_x2_DEFINE(67);
+MUL_x2_DEFINE(68); MUL_x2_DEFINE(69); MUL_x2_DEFINE(70); MUL_x2_DEFINE(71);
+MUL_x2_DEFINE(72); MUL_x2_DEFINE(73); MUL_x2_DEFINE(74); MUL_x2_DEFINE(75);
+MUL_x2_DEFINE(76); MUL_x2_DEFINE(77); MUL_x2_DEFINE(78); MUL_x2_DEFINE(79);
+MUL_x2_DEFINE(80); MUL_x2_DEFINE(81); MUL_x2_DEFINE(82); MUL_x2_DEFINE(83);
+MUL_x2_DEFINE(84); MUL_x2_DEFINE(85); MUL_x2_DEFINE(86); MUL_x2_DEFINE(87);
+MUL_x2_DEFINE(88); MUL_x2_DEFINE(89); MUL_x2_DEFINE(90); MUL_x2_DEFINE(91);
+MUL_x2_DEFINE(92); MUL_x2_DEFINE(93); MUL_x2_DEFINE(94); MUL_x2_DEFINE(95);
+MUL_x2_DEFINE(96); MUL_x2_DEFINE(97); MUL_x2_DEFINE(98); MUL_x2_DEFINE(99);
+MUL_x2_DEFINE(100); MUL_x2_DEFINE(101); MUL_x2_DEFINE(102); MUL_x2_DEFINE(103);
+MUL_x2_DEFINE(104); MUL_x2_DEFINE(105); MUL_x2_DEFINE(106); MUL_x2_DEFINE(107);
+MUL_x2_DEFINE(108); MUL_x2_DEFINE(109); MUL_x2_DEFINE(110); MUL_x2_DEFINE(111);
+MUL_x2_DEFINE(112); MUL_x2_DEFINE(113); MUL_x2_DEFINE(114); MUL_x2_DEFINE(115);
+MUL_x2_DEFINE(116); MUL_x2_DEFINE(117); MUL_x2_DEFINE(118); MUL_x2_DEFINE(119);
+MUL_x2_DEFINE(120); MUL_x2_DEFINE(121); MUL_x2_DEFINE(122); MUL_x2_DEFINE(123);
+MUL_x2_DEFINE(124); MUL_x2_DEFINE(125); MUL_x2_DEFINE(126); MUL_x2_DEFINE(127);
+MUL_x2_DEFINE(128); MUL_x2_DEFINE(129); MUL_x2_DEFINE(130); MUL_x2_DEFINE(131);
+MUL_x2_DEFINE(132); MUL_x2_DEFINE(133); MUL_x2_DEFINE(134); MUL_x2_DEFINE(135);
+MUL_x2_DEFINE(136); MUL_x2_DEFINE(137); MUL_x2_DEFINE(138); MUL_x2_DEFINE(139);
+MUL_x2_DEFINE(140); MUL_x2_DEFINE(141); MUL_x2_DEFINE(142); MUL_x2_DEFINE(143);
+MUL_x2_DEFINE(144); MUL_x2_DEFINE(145); MUL_x2_DEFINE(146); MUL_x2_DEFINE(147);
+MUL_x2_DEFINE(148); MUL_x2_DEFINE(149); MUL_x2_DEFINE(150); MUL_x2_DEFINE(151);
+MUL_x2_DEFINE(152); MUL_x2_DEFINE(153); MUL_x2_DEFINE(154); MUL_x2_DEFINE(155);
+MUL_x2_DEFINE(156); MUL_x2_DEFINE(157); MUL_x2_DEFINE(158); MUL_x2_DEFINE(159);
+MUL_x2_DEFINE(160); MUL_x2_DEFINE(161); MUL_x2_DEFINE(162); MUL_x2_DEFINE(163);
+MUL_x2_DEFINE(164); MUL_x2_DEFINE(165); MUL_x2_DEFINE(166); MUL_x2_DEFINE(167);
+MUL_x2_DEFINE(168); MUL_x2_DEFINE(169); MUL_x2_DEFINE(170); MUL_x2_DEFINE(171);
+MUL_x2_DEFINE(172); MUL_x2_DEFINE(173); MUL_x2_DEFINE(174); MUL_x2_DEFINE(175);
+MUL_x2_DEFINE(176); MUL_x2_DEFINE(177); MUL_x2_DEFINE(178); MUL_x2_DEFINE(179);
+MUL_x2_DEFINE(180); MUL_x2_DEFINE(181); MUL_x2_DEFINE(182); MUL_x2_DEFINE(183);
+MUL_x2_DEFINE(184); MUL_x2_DEFINE(185); MUL_x2_DEFINE(186); MUL_x2_DEFINE(187);
+MUL_x2_DEFINE(188); MUL_x2_DEFINE(189); MUL_x2_DEFINE(190); MUL_x2_DEFINE(191);
+MUL_x2_DEFINE(192); MUL_x2_DEFINE(193); MUL_x2_DEFINE(194); MUL_x2_DEFINE(195);
+MUL_x2_DEFINE(196); MUL_x2_DEFINE(197); MUL_x2_DEFINE(198); MUL_x2_DEFINE(199);
+MUL_x2_DEFINE(200); MUL_x2_DEFINE(201); MUL_x2_DEFINE(202); MUL_x2_DEFINE(203);
+MUL_x2_DEFINE(204); MUL_x2_DEFINE(205); MUL_x2_DEFINE(206); MUL_x2_DEFINE(207);
+MUL_x2_DEFINE(208); MUL_x2_DEFINE(209); MUL_x2_DEFINE(210); MUL_x2_DEFINE(211);
+MUL_x2_DEFINE(212); MUL_x2_DEFINE(213); MUL_x2_DEFINE(214); MUL_x2_DEFINE(215);
+MUL_x2_DEFINE(216); MUL_x2_DEFINE(217); MUL_x2_DEFINE(218); MUL_x2_DEFINE(219);
+MUL_x2_DEFINE(220); MUL_x2_DEFINE(221); MUL_x2_DEFINE(222); MUL_x2_DEFINE(223);
+MUL_x2_DEFINE(224); MUL_x2_DEFINE(225); MUL_x2_DEFINE(226); MUL_x2_DEFINE(227);
+MUL_x2_DEFINE(228); MUL_x2_DEFINE(229); MUL_x2_DEFINE(230); MUL_x2_DEFINE(231);
+MUL_x2_DEFINE(232); MUL_x2_DEFINE(233); MUL_x2_DEFINE(234); MUL_x2_DEFINE(235);
+MUL_x2_DEFINE(236); MUL_x2_DEFINE(237); MUL_x2_DEFINE(238); MUL_x2_DEFINE(239);
+MUL_x2_DEFINE(240); MUL_x2_DEFINE(241); MUL_x2_DEFINE(242); MUL_x2_DEFINE(243);
+MUL_x2_DEFINE(244); MUL_x2_DEFINE(245); MUL_x2_DEFINE(246); MUL_x2_DEFINE(247);
+MUL_x2_DEFINE(248); MUL_x2_DEFINE(249); MUL_x2_DEFINE(250); MUL_x2_DEFINE(251);
+MUL_x2_DEFINE(252); MUL_x2_DEFINE(253); MUL_x2_DEFINE(254); MUL_x2_DEFINE(255);
+
+
+
+typedef void (*mul_fn_ptr_t)(void);
+
+static const mul_fn_ptr_t __attribute__((aligned(256)))
+gf_x1_mul_fns[256] = {
+ mul_x1_0, mul_x1_1, mul_x1_2, mul_x1_3, mul_x1_4, mul_x1_5,
+ mul_x1_6, mul_x1_7, mul_x1_8, mul_x1_9, mul_x1_10, mul_x1_11,
+ mul_x1_12, mul_x1_13, mul_x1_14, mul_x1_15, mul_x1_16, mul_x1_17,
+ mul_x1_18, mul_x1_19, mul_x1_20, mul_x1_21, mul_x1_22, mul_x1_23,
+ mul_x1_24, mul_x1_25, mul_x1_26, mul_x1_27, mul_x1_28, mul_x1_29,
+ mul_x1_30, mul_x1_31, mul_x1_32, mul_x1_33, mul_x1_34, mul_x1_35,
+ mul_x1_36, mul_x1_37, mul_x1_38, mul_x1_39, mul_x1_40, mul_x1_41,
+ mul_x1_42, mul_x1_43, mul_x1_44, mul_x1_45, mul_x1_46, mul_x1_47,
+ mul_x1_48, mul_x1_49, mul_x1_50, mul_x1_51, mul_x1_52, mul_x1_53,
+ mul_x1_54, mul_x1_55, mul_x1_56, mul_x1_57, mul_x1_58, mul_x1_59,
+ mul_x1_60, mul_x1_61, mul_x1_62, mul_x1_63, mul_x1_64, mul_x1_65,
+ mul_x1_66, mul_x1_67, mul_x1_68, mul_x1_69, mul_x1_70, mul_x1_71,
+ mul_x1_72, mul_x1_73, mul_x1_74, mul_x1_75, mul_x1_76, mul_x1_77,
+ mul_x1_78, mul_x1_79, mul_x1_80, mul_x1_81, mul_x1_82, mul_x1_83,
+ mul_x1_84, mul_x1_85, mul_x1_86, mul_x1_87, mul_x1_88, mul_x1_89,
+ mul_x1_90, mul_x1_91, mul_x1_92, mul_x1_93, mul_x1_94, mul_x1_95,
+ mul_x1_96, mul_x1_97, mul_x1_98, mul_x1_99, mul_x1_100, mul_x1_101,
+ mul_x1_102, mul_x1_103, mul_x1_104, mul_x1_105, mul_x1_106, mul_x1_107,
+ mul_x1_108, mul_x1_109, mul_x1_110, mul_x1_111, mul_x1_112, mul_x1_113,
+ mul_x1_114, mul_x1_115, mul_x1_116, mul_x1_117, mul_x1_118, mul_x1_119,
+ mul_x1_120, mul_x1_121, mul_x1_122, mul_x1_123, mul_x1_124, mul_x1_125,
+ mul_x1_126, mul_x1_127, mul_x1_128, mul_x1_129, mul_x1_130, mul_x1_131,
+ mul_x1_132, mul_x1_133, mul_x1_134, mul_x1_135, mul_x1_136, mul_x1_137,
+ mul_x1_138, mul_x1_139, mul_x1_140, mul_x1_141, mul_x1_142, mul_x1_143,
+ mul_x1_144, mul_x1_145, mul_x1_146, mul_x1_147, mul_x1_148, mul_x1_149,
+ mul_x1_150, mul_x1_151, mul_x1_152, mul_x1_153, mul_x1_154, mul_x1_155,
+ mul_x1_156, mul_x1_157, mul_x1_158, mul_x1_159, mul_x1_160, mul_x1_161,
+ mul_x1_162, mul_x1_163, mul_x1_164, mul_x1_165, mul_x1_166, mul_x1_167,
+ mul_x1_168, mul_x1_169, mul_x1_170, mul_x1_171, mul_x1_172, mul_x1_173,
+ mul_x1_174, mul_x1_175, mul_x1_176, mul_x1_177, mul_x1_178, mul_x1_179,
+ mul_x1_180, mul_x1_181, mul_x1_182, mul_x1_183, mul_x1_184, mul_x1_185,
+ mul_x1_186, mul_x1_187, mul_x1_188, mul_x1_189, mul_x1_190, mul_x1_191,
+ mul_x1_192, mul_x1_193, mul_x1_194, mul_x1_195, mul_x1_196, mul_x1_197,
+ mul_x1_198, mul_x1_199, mul_x1_200, mul_x1_201, mul_x1_202, mul_x1_203,
+ mul_x1_204, mul_x1_205, mul_x1_206, mul_x1_207, mul_x1_208, mul_x1_209,
+ mul_x1_210, mul_x1_211, mul_x1_212, mul_x1_213, mul_x1_214, mul_x1_215,
+ mul_x1_216, mul_x1_217, mul_x1_218, mul_x1_219, mul_x1_220, mul_x1_221,
+ mul_x1_222, mul_x1_223, mul_x1_224, mul_x1_225, mul_x1_226, mul_x1_227,
+ mul_x1_228, mul_x1_229, mul_x1_230, mul_x1_231, mul_x1_232, mul_x1_233,
+ mul_x1_234, mul_x1_235, mul_x1_236, mul_x1_237, mul_x1_238, mul_x1_239,
+ mul_x1_240, mul_x1_241, mul_x1_242, mul_x1_243, mul_x1_244, mul_x1_245,
+ mul_x1_246, mul_x1_247, mul_x1_248, mul_x1_249, mul_x1_250, mul_x1_251,
+ mul_x1_252, mul_x1_253, mul_x1_254, mul_x1_255
+};
+
+static const mul_fn_ptr_t __attribute__((aligned(256)))
+gf_x2_mul_fns[256] = {
+ mul_x2_0, mul_x2_1, mul_x2_2, mul_x2_3, mul_x2_4, mul_x2_5,
+ mul_x2_6, mul_x2_7, mul_x2_8, mul_x2_9, mul_x2_10, mul_x2_11,
+ mul_x2_12, mul_x2_13, mul_x2_14, mul_x2_15, mul_x2_16, mul_x2_17,
+ mul_x2_18, mul_x2_19, mul_x2_20, mul_x2_21, mul_x2_22, mul_x2_23,
+ mul_x2_24, mul_x2_25, mul_x2_26, mul_x2_27, mul_x2_28, mul_x2_29,
+ mul_x2_30, mul_x2_31, mul_x2_32, mul_x2_33, mul_x2_34, mul_x2_35,
+ mul_x2_36, mul_x2_37, mul_x2_38, mul_x2_39, mul_x2_40, mul_x2_41,
+ mul_x2_42, mul_x2_43, mul_x2_44, mul_x2_45, mul_x2_46, mul_x2_47,
+ mul_x2_48, mul_x2_49, mul_x2_50, mul_x2_51, mul_x2_52, mul_x2_53,
+ mul_x2_54, mul_x2_55, mul_x2_56, mul_x2_57, mul_x2_58, mul_x2_59,
+ mul_x2_60, mul_x2_61, mul_x2_62, mul_x2_63, mul_x2_64, mul_x2_65,
+ mul_x2_66, mul_x2_67, mul_x2_68, mul_x2_69, mul_x2_70, mul_x2_71,
+ mul_x2_72, mul_x2_73, mul_x2_74, mul_x2_75, mul_x2_76, mul_x2_77,
+ mul_x2_78, mul_x2_79, mul_x2_80, mul_x2_81, mul_x2_82, mul_x2_83,
+ mul_x2_84, mul_x2_85, mul_x2_86, mul_x2_87, mul_x2_88, mul_x2_89,
+ mul_x2_90, mul_x2_91, mul_x2_92, mul_x2_93, mul_x2_94, mul_x2_95,
+ mul_x2_96, mul_x2_97, mul_x2_98, mul_x2_99, mul_x2_100, mul_x2_101,
+ mul_x2_102, mul_x2_103, mul_x2_104, mul_x2_105, mul_x2_106, mul_x2_107,
+ mul_x2_108, mul_x2_109, mul_x2_110, mul_x2_111, mul_x2_112, mul_x2_113,
+ mul_x2_114, mul_x2_115, mul_x2_116, mul_x2_117, mul_x2_118, mul_x2_119,
+ mul_x2_120, mul_x2_121, mul_x2_122, mul_x2_123, mul_x2_124, mul_x2_125,
+ mul_x2_126, mul_x2_127, mul_x2_128, mul_x2_129, mul_x2_130, mul_x2_131,
+ mul_x2_132, mul_x2_133, mul_x2_134, mul_x2_135, mul_x2_136, mul_x2_137,
+ mul_x2_138, mul_x2_139, mul_x2_140, mul_x2_141, mul_x2_142, mul_x2_143,
+ mul_x2_144, mul_x2_145, mul_x2_146, mul_x2_147, mul_x2_148, mul_x2_149,
+ mul_x2_150, mul_x2_151, mul_x2_152, mul_x2_153, mul_x2_154, mul_x2_155,
+ mul_x2_156, mul_x2_157, mul_x2_158, mul_x2_159, mul_x2_160, mul_x2_161,
+ mul_x2_162, mul_x2_163, mul_x2_164, mul_x2_165, mul_x2_166, mul_x2_167,
+ mul_x2_168, mul_x2_169, mul_x2_170, mul_x2_171, mul_x2_172, mul_x2_173,
+ mul_x2_174, mul_x2_175, mul_x2_176, mul_x2_177, mul_x2_178, mul_x2_179,
+ mul_x2_180, mul_x2_181, mul_x2_182, mul_x2_183, mul_x2_184, mul_x2_185,
+ mul_x2_186, mul_x2_187, mul_x2_188, mul_x2_189, mul_x2_190, mul_x2_191,
+ mul_x2_192, mul_x2_193, mul_x2_194, mul_x2_195, mul_x2_196, mul_x2_197,
+ mul_x2_198, mul_x2_199, mul_x2_200, mul_x2_201, mul_x2_202, mul_x2_203,
+ mul_x2_204, mul_x2_205, mul_x2_206, mul_x2_207, mul_x2_208, mul_x2_209,
+ mul_x2_210, mul_x2_211, mul_x2_212, mul_x2_213, mul_x2_214, mul_x2_215,
+ mul_x2_216, mul_x2_217, mul_x2_218, mul_x2_219, mul_x2_220, mul_x2_221,
+ mul_x2_222, mul_x2_223, mul_x2_224, mul_x2_225, mul_x2_226, mul_x2_227,
+ mul_x2_228, mul_x2_229, mul_x2_230, mul_x2_231, mul_x2_232, mul_x2_233,
+ mul_x2_234, mul_x2_235, mul_x2_236, mul_x2_237, mul_x2_238, mul_x2_239,
+ mul_x2_240, mul_x2_241, mul_x2_242, mul_x2_243, mul_x2_244, mul_x2_245,
+ mul_x2_246, mul_x2_247, mul_x2_248, mul_x2_249, mul_x2_250, mul_x2_251,
+ mul_x2_252, mul_x2_253, mul_x2_254, mul_x2_255
+};
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ COPY(r, _mul_x2_in); \
+ gf_x2_mul_fns[c](); \
+ COPY(_mul_x2_acc, r); \
+ break; \
+ case 1: \
+ COPY(r, _mul_x1_in); \
+ gf_x1_mul_fns[c](); \
+ COPY(_mul_x1_acc, r); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 2
+#define MUL_DEFINE() MUL2_SETUP()
+#define MUL_D 0, 1
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() MUL2_SETUP()
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() MUL2_SETUP()
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() MUL2_SETUP()
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 1
+#define REC_PQR_DEFINE() MUL2_SETUP()
+#define REC_PQR_X 0
+#define REC_PQR_Y 1
+#define REC_PQR_Z 2
+#define REC_PQR_XS 3
+#define REC_PQR_YS 4
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(sse2);
+DEFINE_REC_METHODS(sse2);
+
+static boolean_t
+raidz_will_sse2_work(void)
+{
+ return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_sse2_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(sse2),
+ .rec = RAIDZ_REC_METHODS(sse2),
+ .is_supported = &raidz_will_sse2_work,
+ .name = "sse2"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_SSE2) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c
new file mode 100644
index 000000000000..5ddc079a4f5d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c
@@ -0,0 +1,2477 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__x86_64) && defined(HAVE_SSSE3)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "xmm"#REG
+#define VR1_(_1, REG, ...) "xmm"#REG
+#define VR2_(_1, _2, REG, ...) "xmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "xmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "xmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "xmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "xmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "xmm"#REG
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 1)
+#define VR3(r...) VR3_(r, 1, 2)
+#define VR4(r...) VR4_(r, 1, 2)
+#define VR5(r...) VR5_(r, 1, 2, 3)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "pxor 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "pxor 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR4(r) "\n" \
+ "pxor %" VR1(r) ", %" VR5(r) "\n" \
+ "pxor %" VR2(r) ", %" VR6(r) "\n" \
+ "pxor %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR2(r) "\n" \
+ "pxor %" VR1(r) ", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) XOR(r, r)
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR4(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR5(r) "\n" \
+ "movdqa %" VR2(r) ", %" VR6(r) "\n" \
+ "movdqa %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR2(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "movdqa 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "movdqa 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ "movdqa %%" VR2(r)", 0x20(%[DST])\n" \
+ "movdqa %%" VR3(r)", 0x30(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2_SETUP() \
+{ \
+ __asm( \
+ "movd %[mask], %%xmm15\n" \
+ "pshufd $0x0, %%xmm15, %%xmm15\n" \
+ : : [mask] "r" (0x1d1d1d1d)); \
+}
+
+#define _MUL2_x2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "pxor %xmm14, %xmm14\n" \
+ "pxor %xmm13, %xmm13\n" \
+ "pcmpgtb %" VR0(r)", %xmm14\n" \
+ "pcmpgtb %" VR1(r)", %xmm13\n" \
+ "pand %xmm15, %xmm14\n" \
+ "pand %xmm15, %xmm13\n" \
+ "paddb %" VR0(r)", %" VR0(r) "\n" \
+ "paddb %" VR1(r)", %" VR1(r) "\n" \
+ "pxor %xmm14, %" VR0(r) "\n" \
+ "pxor %xmm13, %" VR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2_x2(R_01(r)); \
+ _MUL2_x2(R_23(r)); \
+ break; \
+ case 2: \
+ _MUL2_x2(r); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+#define _0f "xmm15"
+#define _a_save "xmm14"
+#define _b_save "xmm13"
+#define _lt_mod_a "xmm12"
+#define _lt_clmul_a "xmm11"
+#define _lt_mod_b "xmm10"
+#define _lt_clmul_b "xmm15"
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ /* lts for upper part */ \
+ "movd %[mask], %%" _0f "\n" \
+ "pshufd $0x0, %%" _0f ", %%" _0f "\n" \
+ "movdqa 0x00(%[lt]), %%" _lt_mod_a "\n" \
+ "movdqa 0x10(%[lt]), %%" _lt_clmul_a "\n" \
+ /* upper part */ \
+ "movdqa %%" VR0(r) ", %%" _a_save "\n" \
+ "movdqa %%" VR1(r) ", %%" _b_save "\n" \
+ "psraw $0x4, %%" VR0(r) "\n" \
+ "psraw $0x4, %%" VR1(r) "\n" \
+ "pand %%" _0f ", %%" _a_save "\n" \
+ "pand %%" _0f ", %%" _b_save "\n" \
+ "pand %%" _0f ", %%" VR0(r) "\n" \
+ "pand %%" _0f ", %%" VR1(r) "\n" \
+ \
+ "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \
+ "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \
+ \
+ "pshufb %%" VR0(r) ",%%" _lt_mod_a "\n" \
+ "pshufb %%" VR1(r) ",%%" _lt_mod_b "\n" \
+ "pshufb %%" VR0(r) ",%%" _lt_clmul_a "\n" \
+ "pshufb %%" VR1(r) ",%%" _lt_clmul_b "\n" \
+ \
+ "pxor %%" _lt_mod_a ",%%" _lt_clmul_a "\n" \
+ "pxor %%" _lt_mod_b ",%%" _lt_clmul_b "\n" \
+ "movdqa %%" _lt_clmul_a ",%%" VR0(r) "\n" \
+ "movdqa %%" _lt_clmul_b ",%%" VR1(r) "\n" \
+ /* lts for lower part */ \
+ "movdqa 0x20(%[lt]), %%" _lt_mod_a "\n" \
+ "movdqa 0x30(%[lt]), %%" _lt_clmul_a "\n" \
+ "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \
+ "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \
+ /* lower part */ \
+ "pshufb %%" _a_save ",%%" _lt_mod_a "\n" \
+ "pshufb %%" _b_save ",%%" _lt_mod_b "\n" \
+ "pshufb %%" _a_save ",%%" _lt_clmul_a "\n" \
+ "pshufb %%" _b_save ",%%" _lt_clmul_b "\n" \
+ \
+ "pxor %%" _lt_mod_a ",%%" VR0(r) "\n" \
+ "pxor %%" _lt_mod_b ",%%" VR1(r) "\n" \
+ "pxor %%" _lt_clmul_a ",%%" VR0(r) "\n" \
+ "pxor %%" _lt_clmul_b ",%%" VR1(r) "\n" \
+ : : [mask] "r" (0x0f0f0f0f), \
+ [lt] "r" (gf_clmul_mod_lt[4*(c)])); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
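+
+/*
+ * Editor's sketch (not part of the upstream change): a per-byte reading of
+ * the pshufb sequence above, based on the table offsets used in the asm.
+ * Each byte is split into its high and low nibble, and the product with the
+ * constant 'c' is the XOR of four 16-entry lookups from gf_clmul_mod_lt:
+ * rows 4c+0/4c+1 indexed by the high nibble, rows 4c+2/4c+3 by the low one.
+ */
+static inline uint8_t
+gf_mul_byte_lt_sketch(uint8_t a, unsigned c)
+{
+	const uint8_t *t0 = gf_clmul_mod_lt[4 * c + 0];
+	const uint8_t *t1 = gf_clmul_mod_lt[4 * c + 1];
+	const uint8_t *t2 = gf_clmul_mod_lt[4 * c + 2];
+	const uint8_t *t3 = gf_clmul_mod_lt[4 * c + 3];
+	uint8_t hi = a >> 4;
+	uint8_t lo = a & 0x0f;
+
+	return (t0[hi] ^ t1[hi] ^ t2[lo] ^ t3[lo]);
+}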
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_23(r)); \
+ _MULx2(c, R_01(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() {}
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
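+/*
+ * DEFINE_GEN_METHODS()/DEFINE_REC_METHODS() from vdev_raidz_math_impl.h
+ * instantiate the parity generation and reconstruction routines for this
+ * implementation on top of the primitives configured above.
+ */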
+DEFINE_GEN_METHODS(ssse3);
+DEFINE_REC_METHODS(ssse3);
+
+static boolean_t
+raidz_will_ssse3_work(void)
+{
+ return (kfpu_allowed() && zfs_sse_available() &&
+ zfs_sse2_available() && zfs_ssse3_available());
+}
+
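+/*
+ * Ops table exported to the common RAID-Z math code (sys/vdev_raidz_impl.h).
+ * is_supported() gates whether this implementation may be selected at
+ * runtime; gen/rec reference the methods instantiated above.
+ */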
+const raidz_impl_ops_t vdev_raidz_ssse3_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(ssse3),
+ .rec = RAIDZ_REC_METHODS(ssse3),
+ .is_supported = &raidz_will_ssse3_work,
+ .name = "ssse3"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_SSSE3) */
+
+
+#if defined(__x86_64)
+#if defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW)
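+/*
+ * gf_clmul_mod_lt[] holds, for every multiplier constant c, the four
+ * 16-byte pshufb tables used by _MULx2(): rows 4*c+0 and 4*c+1 are looked
+ * up with the high nibble of each source byte, rows 4*c+2 and 4*c+3 with
+ * the low nibble, and the four results are XORed together.
+ *
+ * Illustrative scalar sketch only (not part of the build, and the helper
+ * name gf_mul_byte is purely hypothetical): multiplying a single byte x
+ * by c the same way the vector code does would look like
+ *
+ *	static uint8_t
+ *	gf_mul_byte(uint8_t x, unsigned c)
+ *	{
+ *		const uint8_t hi = x >> 4, lo = x & 0x0f;
+ *
+ *		return (gf_clmul_mod_lt[4 * c + 0][hi] ^
+ *		    gf_clmul_mod_lt[4 * c + 1][hi] ^
+ *		    gf_clmul_mod_lt[4 * c + 2][lo] ^
+ *		    gf_clmul_mod_lt[4 * c + 3][lo]);
+ *	}
+ */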
+/* BEGIN CSTYLED */
+const uint8_t
+__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] =
+{
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
+ 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09,
+ 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c,
+ 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b,
+ 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12,
+ 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15,
+ 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
+ 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f,
+ 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36,
+ 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31,
+ 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24,
+ 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23,
+ 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a,
+ 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d,
+ 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+ 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,
+ 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79,
+ 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c,
+ 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b,
+ 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62,
+ 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65,
+ 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48,
+ 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f,
+ 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46,
+ 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41,
+ 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54,
+ 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a,
+ 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d,
+ 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7,
+ 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee,
+ 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9,
+ 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc,
+ 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb,
+ 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2,
+ 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8,
+ 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf,
+ 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6,
+ 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1,
+ 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4,
+ 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3,
+ 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca,
+ 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd,
+ 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97,
+ 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e,
+ 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99,
+ 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c,
+ 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b,
+ 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82,
+ 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85,
+ 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8,
+ 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf,
+ 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1,
+ 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4,
+ 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3,
+ 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba,
+ 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd,
+ 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7,
+ 0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce,
+ 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9,
+ 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc,
+ 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb,
+ 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2,
+ 0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5,
+ 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8,
+ 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff,
+ 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6,
+ 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1,
+ 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4,
+ 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3,
+ 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed,
+ 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7,
+ 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe,
+ 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac,
+ 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab,
+ 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2,
+ 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5,
+ 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88,
+ 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f,
+ 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86,
+ 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81,
+ 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94,
+ 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93,
+ 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a,
+ 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d,
+ 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27,
+ 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e,
+ 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29,
+ 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c,
+ 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b,
+ 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32,
+ 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35,
+ 0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18,
+ 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16,
+ 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11,
+ 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04,
+ 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03,
+ 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a,
+ 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d,
+ 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57,
+ 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e,
+ 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59,
+ 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b,
+ 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42,
+ 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45,
+ 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68,
+ 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f,
+ 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66,
+ 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61,
+ 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74,
+ 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73,
+ 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a,
+ 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d,
+ 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e,
+ 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89,
+ 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c,
+ 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b,
+ 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92,
+ 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95,
+ 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8,
+ 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf,
+ 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6,
+ 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1,
+ 0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4,
+ 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3,
+ 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa,
+ 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad,
+ 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7,
+ 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe,
+ 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9,
+ 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec,
+ 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb,
+ 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2,
+ 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5,
+ 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8,
+ 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf,
+ 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6,
+ 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1,
+ 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3,
+ 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda,
+ 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd,
+ 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67,
+ 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e,
+ 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69,
+ 0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c,
+ 0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b,
+ 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75,
+ 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58,
+ 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f,
+ 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56,
+ 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51,
+ 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44,
+ 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43,
+ 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a,
+ 0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d,
+ 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17,
+ 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e,
+ 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19,
+ 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c,
+ 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b,
+ 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02,
+ 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05,
+ 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28,
+ 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f,
+ 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26,
+ 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34,
+ 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33,
+ 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a,
+ 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d,
+ 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47,
+ 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e,
+ 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49,
+ 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c,
+ 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b,
+ 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52,
+ 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55,
+ 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78,
+ 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f,
+ 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76,
+ 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71,
+ 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64,
+ 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63,
+ 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a,
+ 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37,
+ 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39,
+ 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c,
+ 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b,
+ 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22,
+ 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25,
+ 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08,
+ 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f,
+ 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06,
+ 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01,
+ 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14,
+ 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13,
+ 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a,
+ 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d,
+ 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7,
+ 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae,
+ 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9,
+ 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc,
+ 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb,
+ 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2,
+ 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5,
+ 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f,
+ 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96,
+ 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91,
+ 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84,
+ 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83,
+ 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a,
+ 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d,
+ 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7,
+ 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde,
+ 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9,
+ 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc,
+ 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2,
+ 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5,
+ 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8,
+ 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef,
+ 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6,
+ 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1,
+ 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4,
+ 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3,
+ 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa,
+ 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd,
+ 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 }
+};
+/* END CSTYLED */
+#endif /* defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW) */
+#endif /* defined(__x86_64) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
new file mode 100644
index 000000000000..784d1af15a81
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
@@ -0,0 +1,1147 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ *
+ * Copyright (c) 2018, Intel Corporation.
+ * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
+#include <sys/dsl_scan.h>
+#include <sys/spa_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/zio.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+
+/*
+ * This file contains the sequential reconstruction implementation for
+ * resilvering. This form of resilvering is internally referred to as device
+ * rebuild to avoid conflating it with the traditional healing reconstruction
+ * performed by the dsl scan code.
+ *
+ * When replacing a device, or scrubbing the pool, ZFS has historically used
+ * a process called resilvering which is a form of healing reconstruction.
+ * This approach has the advantage that as blocks are read from disk their
+ * checksums can be immediately verified and the data repaired. Unfortunately,
+ * it also results in a random IO pattern to the disk even when extra care
+ * is taken to sequentialize the IO as much as possible. This substantially
+ * increases the time required to resilver the pool and restore redundancy.
+ *
+ * For mirrored devices it's possible to implement an alternate sequential
+ * reconstruction strategy when resilvering. Sequential reconstruction
+ * behaves like a traditional RAID rebuild and reconstructs a device in LBA
+ * order without verifying the checksum. After this phase completes a second
+ * scrub phase is started to verify all of the checksums.  This two-phase
+ * process will take longer than the healing reconstruction described above.
+ * However, it has the advantage that redundancy is restored as soon as the
+ * first reconstruction phase completes.  At this point the pool can incur
+ * another device failure without risking data loss.
+ *
+ * There are a few noteworthy limitations, as well as advantages, to
+ * resilvering using sequential reconstruction vs. healing reconstruction.
+ *
+ * Limitations:
+ *
+ * - Sequential reconstruction is not possible on RAIDZ due to its
+ * variable stripe width. Note dRAID uses a fixed stripe width which
+ * avoids this issue, but comes at the expense of some usable capacity.
+ *
+ * - Block checksums are not verified during sequential reconstruction.
+ * Similar to traditional RAID the parity/mirror data is reconstructed
+ * but cannot be immediately double checked. For this reason when the
+ * last active resilver completes the pool is automatically scrubbed
+ * by default.
+ *
+ * - Deferred resilvers using sequential reconstruction are not currently
+ * supported. When adding another vdev to an active top-level resilver
+ * it must be restarted.
+ *
+ * Advantages:
+ *
+ * - Sequential reconstruction is performed in LBA order which may be faster
+ *   than healing reconstruction, particularly when using HDDs (or
+ * especially with SMR devices). Only allocated capacity is resilvered.
+ *
+ * - Sequential reconstruction is not constrained by ZFS block boundaries.
+ *   This allows it to issue larger IOs to disk which span multiple blocks,
+ *   so that all of those logical blocks can be repaired with a single IO.
+ *
+ * - Unlike a healing resilver or scrub, which are pool-wide operations,
+ *   sequential reconstruction is handled by the top-level vdevs.  This
+ *   allows it to be started or canceled on a top-level vdev without
+ * impacting any other top-level vdevs in the pool.
+ *
+ * - Data only referenced by a pool checkpoint will be repaired because
+ *   that space is reflected in the space maps.  This differs from a
+ *   healing resilver or scrub, which will not repair that data.
+ */
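+
+/*
+ * As a user-facing sketch of the above (command syntax per zpool(8), not
+ * defined in this file): a sequential rebuild is typically requested with
+ * the -s flag, e.g. "zpool attach -s <pool> <disk> <new-disk>" or
+ * "zpool replace -s <pool> <disk> <new-disk>", and its progress is shown
+ * by "zpool status".
+ */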
+
+/*
+ * Size of rebuild reads; defaults to 1MiB per data disk and is capped at
+ * SPA_MAXBLOCKSIZE.
+ */
+unsigned long zfs_rebuild_max_segment = 1024 * 1024;
+
+/*
+ * Maximum number of rebuild I/O bytes which may be in flight per leaf vdev
+ * during a sequential resilver.  We attempt to strike a balance here between
+ * keeping the vdev queues full of I/Os at all times and not overflowing the
+ * queues, which would cause long latency and in turn long txg sync times.
+ *
+ * A large default value can be safely used here because the default target
+ * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep
+ * the queue depth short.
+ *
+ * 32MB was selected as the default value to achieve good performance with
+ * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
+ * rebuild was unable to saturate all of the drives using smaller values.
+ * With a value of 32MB the sequential resilver write rate was measured at
+ * 800MB/s sustained while rebuilding to a distributed spare.
+ */
+unsigned long zfs_rebuild_vdev_limit = 32 << 20;
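+
+/*
+ * Roughly speaking, with the defaults above (a 32MB per-vdev limit and 1MB
+ * target segments) on the order of 32 rebuild I/Os may be in flight per
+ * leaf vdev at any time; the figure is only illustrative, since the exact
+ * depth depends on how vdev_rebuild_range() sizes each segment.
+ */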
+
+/*
+ * Automatically start a pool scrub when the last active sequential resilver
+ * completes in order to verify the checksums of all blocks which have been
+ * resilvered. This option is enabled by default and is strongly recommended.
+ */
+int zfs_rebuild_scrub_enabled = 1;
+
+/*
+ * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
+ */
+static void vdev_rebuild_thread(void *arg);
+
+/*
+ * Clear the per-vdev rebuild bytes value for a vdev tree.
+ */
+static void
+clear_rebuild_bytes(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ clear_rebuild_bytes(vd->vdev_child[i]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_rebuild_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Determines whether a vdev_rebuild_thread() should be stopped.
+ */
+static boolean_t
+vdev_rebuild_should_stop(vdev_t *vd)
+{
+ return (!vdev_writeable(vd) || vd->vdev_removing ||
+ vd->vdev_rebuild_exit_wanted ||
+ vd->vdev_rebuild_cancel_wanted ||
+ vd->vdev_rebuild_reset_wanted);
+}
+
+/*
+ * Determine if the rebuild should be canceled. This may happen when all
+ * vdevs with MISSING DTLs are detached.
+ */
+static boolean_t
+vdev_rebuild_should_cancel(vdev_t *vd)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * The sync task for updating the on-disk state of a rebuild. This is
+ * scheduled by vdev_rebuild_range().
+ */
+static void
+vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ if (vr->vr_scan_offset[txg & TXG_MASK] > 0) {
+ vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK];
+ vr->vr_scan_offset[txg & TXG_MASK] = 0;
+ }
+
+ vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms +
+ NSEC2MSEC(gethrtime() - vr->vr_pass_start_time);
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
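+
+/*
+ * Illustrative sketch (not part of the change itself): vr_scan_offset[] is
+ * indexed by (txg & TXG_MASK), giving each of the TXG_SIZE (4) in-flight
+ * txgs its own slot and reusing slots as txgs retire. A minimal standalone
+ * demonstration:
+ *
+ *   #define TXG_SIZE 4
+ *   #define TXG_MASK (TXG_SIZE - 1)
+ *
+ *   for (uint64_t txg = 100; txg < 108; txg++)
+ *           printf("txg %llu -> slot %llu\n", (u_longlong_t)txg,
+ *               (u_longlong_t)(txg & TXG_MASK));
+ *
+ *   // Prints slots 0,1,2,3,0,1,2,3. By the time txg 104 reuses slot 0,
+ *   // txg 100 has synced and vdev_rebuild_update_sync() has cleared it.
+ */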
+
+/*
+ * Initialize the on-disk state for a new rebuild, start the rebuild thread.
+ */
+static void
+vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ ASSERT(vd->vdev_rebuilding);
+
+ spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE;
+ vrp->vrp_min_txg = 0;
+ vrp->vrp_max_txg = dmu_tx_get_txg(tx);
+ vrp->vrp_start_time = gethrestime_sec();
+ vrp->vrp_scan_time_ms = 0;
+ vr->vr_prev_scan_time_ms = 0;
+
+ /*
+ * Rebuilds are currently only used when replacing a device, in which
+ * case there must be DTL_MISSING entries. In the future, we could
+ * allow rebuilds to be used in a way similar to a scrub. This would
+ * be useful because it would allow us to rebuild the space used by
+ * pool checkpoints.
+ */
+ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu started",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+
+ ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
+ vd->vdev_rebuild_thread = thread_create(NULL, 0,
+ vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+static void
+vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name)
+{
+ nvlist_t *aux = fnvlist_alloc();
+
+ fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential");
+ spa_event_notify(spa, vd, aux, name);
+ nvlist_free(aux);
+}
+
+/*
+ * Called to request that a new rebuild be started. The feature will remain
+ * active for the duration of the rebuild, then revert to the enabled state.
+ */
+static void
+vdev_rebuild_initiate(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock));
+ ASSERT(!vd->vdev_rebuilding);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ vd->vdev_rebuilding = B_TRUE;
+
+ dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync,
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ dmu_tx_commit(tx);
+
+ vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START);
+}
+
+/*
+ * Update the on-disk state to completed when a rebuild finishes.
+ */
+static void
+vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE;
+ vrp->vrp_end_time = gethrestime_sec();
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
+ spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu complete",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+ vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);
+
+ /* Handles detaching of spares */
+ spa_async_request(spa, SPA_ASYNC_REBUILD_DONE);
+ vd->vdev_rebuilding = B_FALSE;
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ /*
+ * While we're in syncing context take the opportunity to
+ * setup the scrub when there are no more active rebuilds.
+ */
+ if (!vdev_rebuild_active(spa->spa_root_vdev) &&
+ zfs_rebuild_scrub_enabled) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_setup_sync(&func, tx);
+ }
+
+ cv_broadcast(&vd->vdev_rebuild_cv);
+}
+
+/*
+ * Update the on-disk state to canceled when a rebuild finishes.
+ */
+static void
+vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED;
+ vrp->vrp_end_time = gethrestime_sec();
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu canceled",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+ vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);
+
+ vd->vdev_rebuild_cancel_wanted = B_FALSE;
+ vd->vdev_rebuilding = B_FALSE;
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ spa_notify_waiters(spa);
+ cv_broadcast(&vd->vdev_rebuild_cv);
+}
+
+/*
+ * Resets the progress of a running rebuild. This will occur when a new
+ * device is attached and must participate in the rebuild.
+ */
+static void
+vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+ ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
+
+ vrp->vrp_last_offset = 0;
+ vrp->vrp_min_txg = 0;
+ vrp->vrp_max_txg = dmu_tx_get_txg(tx);
+ vrp->vrp_bytes_scanned = 0;
+ vrp->vrp_bytes_issued = 0;
+ vrp->vrp_bytes_rebuilt = 0;
+ vrp->vrp_bytes_est = 0;
+ vrp->vrp_scan_time_ms = 0;
+ vr->vr_prev_scan_time_ms = 0;
+
+ /* See vdev_rebuild_initiate_sync comment */
+ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu reset",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+
+ vd->vdev_rebuild_reset_wanted = B_FALSE;
+ ASSERT(vd->vdev_rebuilding);
+
+ vd->vdev_rebuild_thread = thread_create(NULL, 0,
+ vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+/*
+ * Clear the last rebuild status.
+ */
+void
+vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ objset_t *mos = spa_meta_objset(spa);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) ||
+ vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) {
+ mutex_exit(&vd->vdev_rebuild_lock);
+ return;
+ }
+
+ clear_rebuild_bytes(vd);
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+
+ if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) {
+ VERIFY0(zap_update(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+ }
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+/*
+ * The zio_done_func_t callback for each rebuild I/O issued. It's responsible
+ * for updating the rebuild stats and limiting the number of in-flight I/Os.
+ */
+static void
+vdev_rebuild_cb(zio_t *zio)
+{
+ vdev_rebuild_t *vr = zio->io_private;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ vdev_t *vd = vr->vr_top_vdev;
+
+ mutex_enter(&vr->vr_io_lock);
+ if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+ /*
+ * The I/O failed because the top-level vdev was unavailable.
+ * Attempt to roll back to the last completed offset, in order to
+ * resume from the correct location if the pool is resumed.
+ * (This works because spa_sync waits on spa_txg_zio before
+ * it runs sync tasks.)
+ */
+ uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK];
+ *off = MIN(*off, zio->io_offset);
+ } else if (zio->io_error) {
+ vrp->vrp_errors++;
+ }
+
+ abd_free(zio->io_abd);
+
+ ASSERT3U(vr->vr_bytes_inflight, >, 0);
+ vr->vr_bytes_inflight -= zio->io_size;
+ cv_broadcast(&vr->vr_io_cv);
+ mutex_exit(&vr->vr_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * Initialize a block pointer that can be used to read the given segment
+ * for sequential rebuild.
+ */
+static void
+vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
+ uint64_t asize)
+{
+ ASSERT(vd->vdev_ops == &vdev_draid_ops ||
+ vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+
+ uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
+ vdev_draid_asize_to_psize(vd, asize) : asize;
+
+ BP_ZERO(bp);
+
+ DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&bp->blk_dva[0], start);
+ DVA_SET_GANG(&bp->blk_dva[0], 0);
+ DVA_SET_ASIZE(&bp->blk_dva[0], asize);
+
+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+ BP_SET_LSIZE(bp, psize);
+ BP_SET_PSIZE(bp, psize);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(bp, DMU_OT_NONE);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+}
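+
+/*
+ * Simplified model (not part of the change itself): for the mirror-like
+ * vdev types above psize simply equals asize, while for dRAID part of each
+ * allocation is parity. Ignoring row rounding, the asize -> psize
+ * conversion behaves roughly like the hypothetical helper below; the real
+ * implementation is vdev_draid_asize_to_psize().
+ *
+ *   static uint64_t
+ *   draid_psize_estimate(uint64_t asize, uint64_t ndata, uint64_t nparity)
+ *   {
+ *           return (asize / (ndata + nparity) * ndata);
+ *   }
+ *
+ *   // e.g. a draid2 group with 8 data disks: 10 MiB allocated maps to
+ *   // roughly 8 MiB of data to read and repair.
+ */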
+
+/*
+ * Issues a rebuild I/O and takes care of rate limiting the number of queued
+ * rebuild I/Os. The provided start and size must be properly aligned for the
+ * top-level vdev type being rebuilt.
+ */
+static int
+vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
+{
+ uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
+ vdev_t *vd = vr->vr_top_vdev;
+ spa_t *spa = vd->vdev_spa;
+ blkptr_t blk;
+
+ ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
+ ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);
+
+ vr->vr_pass_bytes_scanned += size;
+ vr->vr_rebuild_phys.vrp_bytes_scanned += size;
+
+ /*
+ * Rebuild the data in this range by constructing a special block
+ * pointer. It has no relation to any existing blocks in the pool.
+ * However, by disabling checksum verification and issuing a scrub IO
+ * we can reconstruct and repair any children with missing data.
+ */
+ vdev_rebuild_blkptr_init(&blk, vd, start, size);
+ uint64_t psize = BP_GET_PSIZE(&blk);
+
+ if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
+ return (0);
+
+ mutex_enter(&vr->vr_io_lock);
+
+ /* Limit in flight rebuild I/Os */
+ while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max)
+ cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
+
+ vr->vr_bytes_inflight += psize;
+ mutex_exit(&vr->vr_io_lock);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ /* This is the first I/O for this txg. */
+ if (vr->vr_scan_offset[txg & TXG_MASK] == 0) {
+ vr->vr_scan_offset[txg & TXG_MASK] = start;
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_rebuild_update_sync,
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ }
+
+ /* When exiting, write out our progress. */
+ if (vdev_rebuild_should_stop(vd)) {
+ mutex_enter(&vr->vr_io_lock);
+ vr->vr_bytes_inflight -= psize;
+ mutex_exit(&vr->vr_io_lock);
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+ mutex_exit(&vd->vdev_rebuild_lock);
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINTR));
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+ dmu_tx_commit(tx);
+
+ vr->vr_scan_offset[txg & TXG_MASK] = start + size;
+ vr->vr_pass_bytes_issued += size;
+ vr->vr_rebuild_phys.vrp_bytes_issued += size;
+
+ zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk,
+ abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
+ ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_RESILVER, NULL));
+
+ return (0);
+}
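+
+/*
+ * Illustrative sketch (not part of the change itself): the vr_io_lock /
+ * vr_io_cv pair used above is a simple byte-counting throttle. A
+ * standalone userland analogue using pthreads (all names hypothetical):
+ *
+ *   static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+ *   static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+ *   static uint64_t inflight, inflight_max = 32ULL << 20;
+ *
+ *   static void
+ *   throttle_issue(uint64_t bytes)      // before issuing an I/O
+ *   {
+ *           pthread_mutex_lock(&lock);
+ *           while (inflight >= inflight_max)
+ *                   pthread_cond_wait(&cv, &lock);
+ *           inflight += bytes;
+ *           pthread_mutex_unlock(&lock);
+ *   }
+ *
+ *   static void
+ *   throttle_done(uint64_t bytes)       // from the completion callback
+ *   {
+ *           pthread_mutex_lock(&lock);
+ *           inflight -= bytes;
+ *           pthread_cond_broadcast(&cv);
+ *           pthread_mutex_unlock(&lock);
+ *   }
+ */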
+
+/*
+ * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree.
+ */
+static int
+vdev_rebuild_ranges(vdev_rebuild_t *vr)
+{
+ vdev_t *vd = vr->vr_top_vdev;
+ zfs_btree_t *t = &vr->vr_scan_tree->rt_root;
+ zfs_btree_index_t idx;
+ int error;
+
+ for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
+ rs = zfs_btree_next(t, &idx, &idx)) {
+ uint64_t start = rs_get_start(rs, vr->vr_scan_tree);
+ uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start;
+
+ /*
+ * zfs_scan_suspend_progress can be set to disable rebuild
+ * progress for testing. See comment in dsl_scan_sync().
+ */
+ while (zfs_scan_suspend_progress &&
+ !vdev_rebuild_should_stop(vd)) {
+ delay(hz);
+ }
+
+ while (size > 0) {
+ uint64_t chunk_size;
+
+ /*
+ * Split range into legally-sized logical chunks
+ * given the constraints of the top-level vdev
+ * being rebuilt (dRAID or mirror).
+ */
+ ASSERT3P(vd->vdev_ops, !=, NULL);
+ chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd,
+ start, size, zfs_rebuild_max_segment);
+
+ error = vdev_rebuild_range(vr, start, chunk_size);
+ if (error != 0)
+ return (error);
+
+ size -= chunk_size;
+ start += chunk_size;
+ }
+ }
+
+ return (0);
+}
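+
+/*
+ * Illustrative sketch (not part of the change itself): each range is
+ * consumed in chunks no larger than zfs_rebuild_max_segment, with the
+ * vdev-specific vdev_op_rebuild_asize() callback rounding each chunk to a
+ * legal boundary for the mirror or dRAID layout. Conceptually:
+ *
+ *   while (size > 0) {
+ *           uint64_t chunk = MIN(size, zfs_rebuild_max_segment);
+ *           // the real code aligns 'chunk' via vdev_op_rebuild_asize()
+ *           issue_rebuild_io(start, chunk);
+ *           start += chunk;
+ *           size -= chunk;
+ *   }
+ */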
+
+/*
+ * Calculates the estimated capacity which remains to be scanned. Since
+ * we traverse the pool in metaslab order, only allocated capacity beyond
+ * the vrp_last_offset need be considered. All lower offsets must have
+ * already been rebuilt and are thus already included in vrp_bytes_scanned.
+ */
+static void
+vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ uint64_t bytes_est = vrp->vrp_bytes_scanned;
+
+ if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start)
+ return;
+
+ for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_ms[i];
+
+ mutex_enter(&msp->ms_lock);
+ bytes_est += metaslab_allocated_space(msp);
+ mutex_exit(&msp->ms_lock);
+ }
+
+ vrp->vrp_bytes_est = bytes_est;
+}
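+
+/*
+ * Worked example (not part of the change itself): with three metaslabs
+ * holding 10, 20 and 30 GiB of allocated space, and the scan part way
+ * through the second metaslab having scanned 15 GiB so far:
+ *
+ *   uint64_t bytes_est = 15 + (20 + 30);    // GiB, == 65 GiB
+ *
+ * i.e. everything already scanned plus all allocated space from the
+ * current metaslab onward. The few GiB of the current metaslab counted
+ * twice keep the estimate slightly conservative until the scan moves on.
+ */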
+
+/*
+ * Load from disk the top-level vdev's rebuild information.
+ */
+int
+vdev_rebuild_load(vdev_t *vd)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ spa_t *spa = vd->vdev_spa;
+ int err = 0;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ vd->vdev_rebuilding = B_FALSE;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) {
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ mutex_exit(&vd->vdev_rebuild_lock);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ ASSERT(vd->vdev_top == vd);
+
+ err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp);
+
+ /*
+ * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should
+ * not prevent a pool from being imported. Clear the rebuild
+ * status allowing a new resilver/rebuild to be started.
+ */
+ if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) {
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ } else if (err) {
+ mutex_exit(&vd->vdev_rebuild_lock);
+ return (err);
+ }
+
+ vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms;
+ vr->vr_top_vdev = vd;
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ return (0);
+}
+
+/*
+ * Each scan thread is responsible for rebuilding a top-level vdev. The
+ * rebuild progress is tracked on disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS.
+ */
+static void
+vdev_rebuild_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+
+ /*
+ * If there's a scrub in process, request that it be stopped. This
+ * is not required for a correct rebuild, but we do want rebuilds to
+ * emulate the resilver behavior as much as possible.
+ */
+ dsl_pool_t *dsl = spa_get_dsl(spa);
+ if (dsl_scan_scrubbing(dsl))
+ dsl_scan_cancel(dsl);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(vd->vdev_rebuild_thread, !=, NULL);
+ ASSERT(vd->vdev_rebuilding);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD));
+ ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE);
+ ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE);
+
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ vr->vr_top_vdev = vd;
+ vr->vr_scan_msp = NULL;
+ vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);
+
+ vr->vr_pass_start_time = gethrtime();
+ vr->vr_pass_bytes_scanned = 0;
+ vr->vr_pass_bytes_issued = 0;
+
+ vr->vr_bytes_inflight_max = MAX(1ULL << 20,
+ zfs_rebuild_vdev_limit * vd->vdev_children);
+
+ uint64_t update_est_time = gethrtime();
+ vdev_rebuild_update_bytes_est(vd, 0);
+
+ clear_rebuild_bytes(vr->vr_top_vdev);
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ /*
+ * Systematically walk the metaslabs and issue rebuild I/Os for
+ * all ranges in the allocated space map.
+ */
+ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_ms[i];
+ vr->vr_scan_msp = msp;
+
+ /*
+ * Removal of vdevs from the vdev tree may eliminate the need
+ * for the rebuild, in which case it should be canceled. The
+ * vdev_rebuild_cancel_wanted flag is set until the sync task
+ * completes. This may be after the rebuild thread exits.
+ */
+ if (vdev_rebuild_should_cancel(vd)) {
+ vd->vdev_rebuild_cancel_wanted = B_TRUE;
+ error = EINTR;
+ break;
+ }
+
+ ASSERT0(range_tree_space(vr->vr_scan_tree));
+
+ /* Disable any new allocations to this metaslab */
+ metaslab_disable(msp);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * If there are outstanding allocations, wait for them to be
+ * synced. This is needed to ensure all allocated ranges are
+ * on disk and therefore will be rebuilt.
+ */
+ for (int j = 0; j < TXG_SIZE; j++) {
+ if (range_tree_space(msp->ms_allocating[j])) {
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&msp->ms_sync_lock);
+ txg_wait_synced(dsl, 0);
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+ break;
+ }
+ }
+
+ /*
+ * When a metaslab has been allocated from, read its allocated
+ * ranges from the space map object into the vr_scan_tree.
+ * Then add inflight / unflushed ranges and remove inflight /
+ * unflushed frees. This is the minimum range to be rebuilt.
+ */
+ if (msp->ms_sm != NULL) {
+ VERIFY0(space_map_load(msp->ms_sm,
+ vr->vr_scan_tree, SM_ALLOC));
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(range_tree_space(
+ msp->ms_allocating[i]));
+ }
+
+ range_tree_walk(msp->ms_unflushed_allocs,
+ range_tree_add, vr->vr_scan_tree);
+ range_tree_walk(msp->ms_unflushed_frees,
+ range_tree_remove, vr->vr_scan_tree);
+
+ /*
+ * Remove ranges which have already been rebuilt based
+ * on the last offset. This can happen when restarting
+ * a scan after exporting and re-importing the pool.
+ */
+ range_tree_clear(vr->vr_scan_tree, 0,
+ vrp->vrp_last_offset);
+ }
+
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&msp->ms_sync_lock);
+
+ /*
+ * To provide an accurate estimate, re-calculate the estimated
+ * size every 5 minutes to account for recent allocations and
+ * frees made to space maps which have not yet been rebuilt.
+ */
+ if (gethrtime() > update_est_time + SEC2NSEC(300)) {
+ update_est_time = gethrtime();
+ vdev_rebuild_update_bytes_est(vd, i);
+ }
+
+ /*
+ * Walk the allocated space map and issue the rebuild I/O.
+ */
+ error = vdev_rebuild_ranges(vr);
+ range_tree_vacate(vr->vr_scan_tree, NULL, NULL);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+
+ if (error != 0)
+ break;
+ }
+
+ range_tree_destroy(vr->vr_scan_tree);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /* Wait for any remaining rebuild I/O to complete */
+ mutex_enter(&vr->vr_io_lock);
+ while (vr->vr_bytes_inflight > 0)
+ cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
+
+ mutex_exit(&vr->vr_io_lock);
+
+ mutex_destroy(&vr->vr_io_lock);
+ cv_destroy(&vr->vr_io_cv);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (error == 0) {
+ /*
+ * After a successful rebuild clear the DTLs of all ranges
+ * which were missing when the rebuild was started. These
+ * ranges must have been rebuilt as a consequence of rebuilding
+ * all allocated space. Note that unlike a scrub or resilver
+ * the rebuild operation will reconstruct data only referenced
+ * by a pool checkpoint. See the dsl_scan_done() comments.
+ */
+ dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync,
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ } else if (vd->vdev_rebuild_cancel_wanted) {
+ /*
+ * The rebuild operation was canceled. This will occur when
+ * a device participating in the rebuild is detached.
+ */
+ dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync,
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ } else if (vd->vdev_rebuild_reset_wanted) {
+ /*
+ * Reset the running rebuild without canceling and restarting
+ * it. This will occur when a new device is attached and must
+ * participate in the rebuild.
+ */
+ dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync,
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ } else {
+ /*
+ * The rebuild operation should be suspended. This may occur
+ * when detaching a child vdev or when exporting the pool. The
+ * rebuild is left in the active state so it will be resumed.
+ */
+ ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+ vd->vdev_rebuilding = B_FALSE;
+ }
+
+ dmu_tx_commit(tx);
+
+ vd->vdev_rebuild_thread = NULL;
+ mutex_exit(&vd->vdev_rebuild_lock);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ cv_broadcast(&vd->vdev_rebuild_cv);
+
+ thread_exit();
+}
+
+/*
+ * Returns B_TRUE if any top-level vdevs are rebuilding.
+ */
+boolean_t
+vdev_rebuild_active(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ boolean_t ret = B_FALSE;
+
+ if (vd == spa->spa_root_vdev) {
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ ret = vdev_rebuild_active(vd->vdev_child[i]);
+ if (ret)
+ return (ret);
+ }
+ } else if (vd->vdev_top_zap != 0) {
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+ mutex_exit(&vd->vdev_rebuild_lock);
+ }
+
+ return (ret);
+}
+
+/*
+ * Start a rebuild operation. The rebuild may be restarted when the
+ * top-level vdev is currently actively rebuilding.
+ */
+void
+vdev_rebuild(vdev_t *vd)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys;
+
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(!vd->vdev_removing);
+ ASSERT(spa_feature_is_enabled(vd->vdev_spa,
+ SPA_FEATURE_DEVICE_REBUILD));
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (vd->vdev_rebuilding) {
+ ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE);
+
+ /*
+ * Signal a running rebuild operation that it should restart
+ * from the beginning because a new device was attached. The
+ * vdev_rebuild_reset_wanted flag is set until the sync task
+ * completes. This may be after the rebuild thread exits.
+ */
+ if (!vd->vdev_rebuild_reset_wanted)
+ vd->vdev_rebuild_reset_wanted = B_TRUE;
+ } else {
+ vdev_rebuild_initiate(vd);
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+static void
+vdev_rebuild_restart_impl(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd == spa->spa_root_vdev) {
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_rebuild_restart_impl(vd->vdev_child[i]);
+
+ } else if (vd->vdev_top_zap != 0) {
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE &&
+ vdev_writeable(vd) && !vd->vdev_rebuilding) {
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_DEVICE_REBUILD));
+ vd->vdev_rebuilding = B_TRUE;
+ vd->vdev_rebuild_thread = thread_create(NULL, 0,
+ vdev_rebuild_thread, vd, 0, &p0, TS_RUN,
+ maxclsyspri);
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+ }
+}
+
+/*
+ * Conditionally restart all of the vdev_rebuild_thread's for a pool. The
+ * feature flag must be active and the rebuild in the active state. This
+ * cannot be used to start a new rebuild.
+ */
+void
+vdev_rebuild_restart(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ vdev_rebuild_restart_impl(spa->spa_root_vdev);
+}
+
+/*
+ * Stop and wait for all of the vdev_rebuild_thread's associated with the
+ * vdev tree provided to be terminated (canceled or stopped).
+ */
+void
+vdev_rebuild_stop_wait(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (vd == spa->spa_root_vdev) {
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_rebuild_stop_wait(vd->vdev_child[i]);
+
+ } else if (vd->vdev_top_zap != 0) {
+ ASSERT(vd == vd->vdev_top);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (vd->vdev_rebuild_thread != NULL) {
+ vd->vdev_rebuild_exit_wanted = B_TRUE;
+ while (vd->vdev_rebuilding) {
+ cv_wait(&vd->vdev_rebuild_cv,
+ &vd->vdev_rebuild_lock);
+ }
+ vd->vdev_rebuild_exit_wanted = B_FALSE;
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+ }
+}
+
+/*
+ * Stop all rebuild operations but leave them in the active state so they
+ * will be resumed when importing the pool.
+ */
+void
+vdev_rebuild_stop_all(spa_t *spa)
+{
+ vdev_rebuild_stop_wait(spa->spa_root_vdev);
+}
+
+/*
+ * Rebuild statistics reported per top-level vdev.
+ */
+int
+vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
+{
+ spa_t *spa = tvd->vdev_spa;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
+ return (SET_ERROR(ENOTSUP));
+
+ if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0)
+ return (SET_ERROR(EINVAL));
+
+ int error = zap_contains(spa_meta_objset(spa),
+ tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS);
+
+ if (error == ENOENT) {
+ bzero(vrs, sizeof (vdev_rebuild_stat_t));
+ vrs->vrs_state = VDEV_REBUILD_NONE;
+ error = 0;
+ } else if (error == 0) {
+ vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&tvd->vdev_rebuild_lock);
+ vrs->vrs_state = vrp->vrp_rebuild_state;
+ vrs->vrs_start_time = vrp->vrp_start_time;
+ vrs->vrs_end_time = vrp->vrp_end_time;
+ vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms;
+ vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned;
+ vrs->vrs_bytes_issued = vrp->vrp_bytes_issued;
+ vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt;
+ vrs->vrs_bytes_est = vrp->vrp_bytes_est;
+ vrs->vrs_errors = vrp->vrp_errors;
+ vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() -
+ vr->vr_pass_start_time);
+ vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
+ vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
+ mutex_exit(&tvd->vdev_rebuild_lock);
+ }
+
+ return (error);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
+ "Max segment size in bytes of rebuild reads");
+
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW,
+ "Max bytes in flight per leaf vdev for sequential resilvers");
+
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
+ "Automatically scrub after sequential resilver completes");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c
new file mode 100644
index 000000000000..a758fe4fb343
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c
@@ -0,0 +1,2390 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dir.h>
+#include <sys/arc.h>
+#include <sys/zfeature.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
+#include <sys/trace_zfs.h>
+
+/*
+ * This file contains the necessary logic to remove vdevs from a
+ * storage pool. Currently, the only devices that can be removed
+ * are log, cache, and spare devices; and top level vdevs from a pool
+ * w/o raidz or mirrors. (Note that members of a mirror can be removed
+ * by the detach operation.)
+ *
+ * Log vdevs are removed by evacuating them and then turning the vdev
+ * into a hole vdev while holding spa config locks.
+ *
+ * Top level vdevs are removed and converted into an indirect vdev via
+ * a multi-step process:
+ *
+ * - Disable allocations from this device (spa_vdev_remove_top).
+ *
+ * - From a new thread (spa_vdev_remove_thread), copy data from
+ * the removing vdev to a different vdev. The copy happens in open
+ * context (spa_vdev_copy_impl) and issues a sync task
+ * (vdev_mapping_sync) so the sync thread can update the partial
+ * indirect mappings in core and on disk.
+ *
+ * - If a free happens during a removal, it is freed from the
+ * removing vdev, and if it has already been copied, from the new
+ * location as well (free_from_removing_vdev).
+ *
+ * - After the removal is completed, the copy thread converts the vdev
+ * into an indirect vdev (vdev_remove_complete) before instructing
+ * the sync thread to destroy the space maps and finish the removal
+ * (spa_finish_removal).
+ */
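+
+/*
+ * Simplified model (not part of the change itself): conceptually the
+ * indirect mapping is an ordered list of entries translating offsets on
+ * the removed vdev to a location on another vdev. The real structures
+ * live in vdev_indirect_mapping.h; a toy version of the lookup:
+ *
+ *   typedef struct {
+ *           uint64_t old_offset;    // offset on the removed vdev
+ *           uint64_t size;
+ *           uint64_t new_vdev;      // destination top-level vdev id
+ *           uint64_t new_offset;    // offset on the destination vdev
+ *   } mapping_entry_t;
+ *
+ *   // Entries are sorted by old_offset; find the one covering 'off'.
+ *   static const mapping_entry_t *
+ *   mapping_lookup(const mapping_entry_t *m, int n, uint64_t off)
+ *   {
+ *           for (int i = 0; i < n; i++) {
+ *                   if (off >= m[i].old_offset &&
+ *                       off < m[i].old_offset + m[i].size)
+ *                           return (&m[i]);
+ *           }
+ *           return (NULL);
+ *   }
+ */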
+
+typedef struct vdev_copy_arg {
+ metaslab_t *vca_msp;
+ uint64_t vca_outstanding_bytes;
+ uint64_t vca_read_error_bytes;
+ uint64_t vca_write_error_bytes;
+ kcondvar_t vca_cv;
+ kmutex_t vca_lock;
+} vdev_copy_arg_t;
+
+/*
+ * The maximum amount of memory we can use for outstanding i/o while
+ * doing a device removal. This determines how much i/o we can have
+ * in flight concurrently.
+ */
+int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
+
+/*
+ * The largest contiguous segment that we will attempt to allocate when
+ * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If
+ * there is a performance problem with attempting to allocate large blocks,
+ * consider decreasing this.
+ *
+ * See also the accessor function spa_remove_max_segment().
+ */
+int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
+
+/*
+ * Ignore hard IO errors during device removal. When set, if a device
+ * encounters a hard IO error during the removal process, the removal will
+ * not be cancelled. This can result in a normally recoverable block
+ * becoming permanently damaged and is not recommended.
+ */
+int zfs_removal_ignore_errors = 0;
+
+/*
+ * Allow a remap segment to span free chunks of at most this size. The main
+ * impact of a larger span is that we will read and write larger, more
+ * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
+ * for iops. The value here was chosen to align with
+ * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
+ * reads (but there's no reason it has to be the same).
+ *
+ * Additionally, a higher span will have the following relatively minor
+ * effects:
+ * - the mapping will be smaller, since one entry can cover more allocated
+ * segments
+ * - more of the fragmentation in the removing device will be preserved
+ * - we'll do larger allocations, which may fail and fall back on smaller
+ * allocations
+ */
+int vdev_removal_max_span = 32 * 1024;
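+
+/*
+ * Illustrative example (not part of the change itself): two allocated
+ * segments are copied as one contiguous chunk only when the free gap
+ * between them fits within this span, e.g.
+ *
+ *   boolean_t merge = (next_start - prev_end) <= vdev_removal_max_span;
+ *
+ *   // segments [0, 128K) and [144K, 256K): 16K gap -> one 256K copy,
+ *   //   carrying 16K of "unnecessary" data.
+ *   // segments [0, 128K) and [192K, 256K): 64K gap -> two separate copies.
+ */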
+
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a removal.
+ */
+int zfs_removal_suspend_progress = 0;
+
+#define VDEV_REMOVAL_ZAP_OBJS "lzap"
+
+static void spa_vdev_remove_thread(void *arg);
+static int spa_vdev_remove_cancel_impl(spa_t *spa);
+
+static void
+spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
+{
+ VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_REMOVING, sizeof (uint64_t),
+ sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
+ &spa->spa_removing_phys, tx));
+}
+
+static nvlist_t *
+spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
+{
+ for (int i = 0; i < count; i++) {
+ uint64_t guid =
+ fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID);
+
+ if (guid == target_guid)
+ return (nvpp[i]);
+ }
+
+ return (NULL);
+}
+
+static void
+spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
+ nvlist_t *dev_to_remove)
+{
+ nvlist_t **newdev = NULL;
+
+ if (count > 1)
+ newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
+
+ for (int i = 0, j = 0; i < count; i++) {
+ if (dev[i] == dev_to_remove)
+ continue;
+ VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
+ }
+
+ VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
+ VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
+
+ for (int i = 0; i < count - 1; i++)
+ nvlist_free(newdev[i]);
+
+ if (count > 1)
+ kmem_free(newdev, (count - 1) * sizeof (void *));
+}
+
+static spa_vdev_removal_t *
+spa_vdev_removal_create(vdev_t *vd)
+{
+ spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
+ mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
+ svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ svr->svr_vdev_id = vd->vdev_id;
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
+ list_create(&svr->svr_new_segments[i],
+ sizeof (vdev_indirect_mapping_entry_t),
+ offsetof(vdev_indirect_mapping_entry_t, vime_node));
+ }
+
+ return (svr);
+}
+
+void
+spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
+{
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(svr->svr_bytes_done[i]);
+ ASSERT0(svr->svr_max_offset_to_sync[i]);
+ range_tree_destroy(svr->svr_frees[i]);
+ list_destroy(&svr->svr_new_segments[i]);
+ }
+
+ range_tree_destroy(svr->svr_allocd_segs);
+ mutex_destroy(&svr->svr_lock);
+ cv_destroy(&svr->svr_cv);
+ kmem_free(svr, sizeof (*svr));
+}
+
+/*
+ * This is called as a synctask in the txg in which we will mark this vdev
+ * as removing (in the config stored in the MOS).
+ *
+ * It begins the evacuation of a toplevel vdev by:
+ * - initializing the spa_removing_phys which tracks this removal
+ * - computing the amount of space to remove for accounting purposes
+ * - dirtying all dbufs in the spa_config_object
+ * - creating the spa_vdev_removal
+ * - starting the spa_vdev_remove_thread
+ */
+static void
+vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
+ spa_vdev_removal_t *svr = NULL;
+ uint64_t txg __maybe_unused = dmu_tx_get_txg(tx);
+
+ ASSERT0(vdev_get_nparity(vd));
+ svr = spa_vdev_removal_create(vd);
+
+ ASSERT(vd->vdev_removing);
+ ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
+
+ spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ /*
+ * By activating the OBSOLETE_COUNTS feature, we prevent
+ * the pool from being downgraded and ensure that the
+ * refcounts are precise.
+ */
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ uint64_t one = 1;
+ VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
+ &one, tx));
+ boolean_t are_precise __maybe_unused;
+ ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ ASSERT3B(are_precise, ==, B_TRUE);
+ }
+
+ vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
+ vd->vdev_indirect_mapping =
+ vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
+ vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
+ vd->vdev_indirect_births =
+ vdev_indirect_births_open(mos, vic->vic_births_object);
+ spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
+ spa->spa_removing_phys.sr_start_time = gethrestime_sec();
+ spa->spa_removing_phys.sr_end_time = 0;
+ spa->spa_removing_phys.sr_state = DSS_SCANNING;
+ spa->spa_removing_phys.sr_to_copy = 0;
+ spa->spa_removing_phys.sr_copied = 0;
+
+ /*
+ * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
+ * there may be space in the defer tree, which is free, but still
+ * counted in vs_alloc.
+ */
+ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
+ metaslab_t *ms = vd->vdev_ms[i];
+ if (ms->ms_sm == NULL)
+ continue;
+
+ spa->spa_removing_phys.sr_to_copy +=
+ metaslab_allocated_space(ms);
+
+ /*
+ * Space which we are freeing this txg does not need to
+ * be copied.
+ */
+ spa->spa_removing_phys.sr_to_copy -=
+ range_tree_space(ms->ms_freeing);
+
+ ASSERT0(range_tree_space(ms->ms_freed));
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT0(range_tree_space(ms->ms_allocating[t]));
+ }
+
+ /*
+ * Sync tasks are called before metaslab_sync(), so there should
+ * be no already-synced metaslabs in the TXG_CLEAN list.
+ */
+ ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);
+
+ spa_sync_removing_state(spa, tx);
+
+ /*
+ * All blocks that we need to read the most recent mapping must be
+ * stored on concrete vdevs. Therefore, we must dirty anything that
+ * is read before spa_remove_init(). Specifically, the
+ * spa_config_object. (Note that although we already modified the
+ * spa_config_object in spa_sync_removing_state, that may not have
+ * modified all blocks of the object.)
+ */
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));
+ for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
+ dmu_buf_t *dbuf;
+ VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
+ offset, FTAG, &dbuf, 0));
+ dmu_buf_will_dirty(dbuf, tx);
+ offset += dbuf->db_size;
+ dmu_buf_rele(dbuf, FTAG);
+ }
+
+ /*
+ * Now that we've allocated the im_object, dirty the vdev to ensure
+ * that the object gets written to the config on disk.
+ */
+ vdev_config_dirty(vd);
+
+ zfs_dbgmsg("starting removal thread for vdev %llu (%px) in txg %llu "
+ "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx),
+ vic->vic_mapping_object);
+
+ spa_history_log_internal(spa, "vdev remove started", tx,
+ "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id,
+ (vd->vdev_path != NULL) ? vd->vdev_path : "-");
+ /*
+ * Setting spa_vdev_removal causes subsequent frees to call
+ * free_from_removing_vdev(). Note that we don't need any locking
+ * because we are the sync thread, and metaslab_free_impl() is only
+ * called from syncing context (potentially from a zio taskq thread,
+ * but in any case only when there are outstanding free i/os, which
+ * there are not).
+ */
+ ASSERT3P(spa->spa_vdev_removal, ==, NULL);
+ spa->spa_vdev_removal = svr;
+ svr->svr_thread = thread_create(NULL, 0,
+ spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
+}
+
+/*
+ * When we are opening a pool, we must read the mapping for each
+ * indirect vdev in order from most recently removed to least
+ * recently removed. We do this because the blocks for the mapping
+ * of older indirect vdevs may be stored on more recently removed vdevs.
+ * In order to read each indirect mapping object, we must have
+ * initialized all more recently removed vdevs.
+ */
+int
+spa_remove_init(spa_t *spa)
+{
+ int error;
+
+ error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_REMOVING, sizeof (uint64_t),
+ sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
+ &spa->spa_removing_phys);
+
+ if (error == ENOENT) {
+ spa->spa_removing_phys.sr_state = DSS_NONE;
+ spa->spa_removing_phys.sr_removing_vdev = -1;
+ spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
+ spa->spa_indirect_vdevs_loaded = B_TRUE;
+ return (0);
+ } else if (error != 0) {
+ return (error);
+ }
+
+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
+ /*
+ * We are currently removing a vdev. Create and
+ * initialize a spa_vdev_removal_t from the bonus
+ * buffer of the removing vdevs vdev_im_object, and
+ * initialize its partial mapping.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa,
+ spa->spa_removing_phys.sr_removing_vdev);
+
+ if (vd == NULL) {
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ return (EINVAL);
+ }
+
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
+ ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
+ ASSERT(vd->vdev_removing);
+
+ vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
+ spa->spa_meta_objset, vic->vic_mapping_object);
+ vd->vdev_indirect_births = vdev_indirect_births_open(
+ spa->spa_meta_objset, vic->vic_births_object);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ spa->spa_vdev_removal = svr;
+ }
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ uint64_t indirect_vdev_id =
+ spa->spa_removing_phys.sr_prev_indirect_vdev;
+ while (indirect_vdev_id != UINT64_MAX) {
+ vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
+ spa->spa_meta_objset, vic->vic_mapping_object);
+ vd->vdev_indirect_births = vdev_indirect_births_open(
+ spa->spa_meta_objset, vic->vic_births_object);
+
+ indirect_vdev_id = vic->vic_prev_indirect_vdev;
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ /*
+ * Now that we've loaded all the indirect mappings, we can allow
+ * reads from other blocks (e.g. via predictive prefetch).
+ */
+ spa->spa_indirect_vdevs_loaded = B_TRUE;
+ return (0);
+}
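+
+/*
+ * Worked example (not part of the change itself): if top-level vdevs 3, 5
+ * and then 7 were removed, in that order, the on-disk state forms a chain
+ * that the loop above follows from most recently removed to least:
+ *
+ *   spa_removing_phys.sr_prev_indirect_vdev == 7
+ *   vdev 7: vic_prev_indirect_vdev == 5
+ *   vdev 5: vic_prev_indirect_vdev == 3
+ *   vdev 3: vic_prev_indirect_vdev == UINT64_MAX   // end of chain
+ *
+ * so vdev 7's mapping is opened first; the mapping blocks of vdevs 3 and
+ * 5 may have been written to vdev 7 and later remapped when it was
+ * removed, making them unreadable until vdev 7's mapping is loaded.
+ */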
+
+void
+spa_restart_removal(spa_t *spa)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ if (svr == NULL)
+ return;
+
+ /*
+ * In general when this function is called there is no
+ * removal thread running. The only scenario where this
+ * is not true is during spa_import() where this function
+ * is called twice [once from spa_import_impl() and
+ * spa_async_resume()]. Thus, in the scenario where we
+ * import a pool that has an ongoing removal we don't
+ * want to spawn a second thread.
+ */
+ if (svr->svr_thread != NULL)
+ return;
+
+ if (!spa_writeable(spa))
+ return;
+
+ zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
+ svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
+ 0, &p0, TS_RUN, minclsyspri);
+}
+
+/*
+ * Process freeing from a device which is in the middle of being removed.
+ * We must handle this carefully so that we attempt to copy freed data,
+ * and we correctly free already-copied data.
+ */
+void
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
+{
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t txg = spa_syncing_txg(spa);
+ uint64_t max_offset_yet = 0;
+
+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+ ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
+ vdev_indirect_mapping_object(vim));
+ ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
+
+ mutex_enter(&svr->svr_lock);
+
+ /*
+ * Remove the segment from the removing vdev's spacemap. This
+ * ensures that we will not attempt to copy this space (if the
+ * removal thread has not yet visited it), and also ensures
+ * that we know what is actually allocated on the new vdevs
+ * (needed if we cancel the removal).
+ *
+ * Note: we must do the metaslab_free_concrete() with the svr_lock
+ * held, so that the remove_thread can not load this metaslab and then
+ * visit this offset between the time that we metaslab_free_concrete()
+ * and when we check to see if it has been visited.
+ *
+ * Note: The checkpoint flag is set to false as having/taking
+ * a checkpoint and removing a device can't happen at the same
+ * time.
+ */
+ ASSERT(!spa_has_checkpoint(spa));
+ metaslab_free_concrete(vd, offset, size, B_FALSE);
+
+ uint64_t synced_size = 0;
+ uint64_t synced_offset = 0;
+ uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
+ if (offset < max_offset_synced) {
+ /*
+ * The mapping for this offset is already on disk.
+ * Free from the new location.
+ *
+ * Note that we use svr_max_synced_offset because it is
+ * updated atomically with respect to the in-core mapping.
+ * By contrast, vim_max_offset is not.
+ *
+ * This block may be split between a synced entry and an
+ * in-flight or unvisited entry. Only process the synced
+ * portion of it here.
+ */
+ synced_size = MIN(size, max_offset_synced - offset);
+ synced_offset = offset;
+
+ ASSERT3U(max_offset_yet, <=, max_offset_synced);
+ max_offset_yet = max_offset_synced;
+
+ DTRACE_PROBE3(remove__free__synced,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, synced_size);
+
+ size -= synced_size;
+ offset += synced_size;
+ }
+
+ /*
+ * Look at all in-flight txgs starting from the currently syncing one
+ * and see if a section of this free is being copied. By starting from
+ * this txg and iterating forward, we might find that this region
+ * was copied in two different txgs and handle it appropriately.
+ */
+ for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
+ int txgoff = (txg + i) & TXG_MASK;
+ if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
+ /*
+ * The mapping for this offset is in flight, and
+ * will be synced in txg+i.
+ */
+ uint64_t inflight_size = MIN(size,
+ svr->svr_max_offset_to_sync[txgoff] - offset);
+
+ DTRACE_PROBE4(remove__free__inflight,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, inflight_size,
+ uint64_t, txg + i);
+
+ /*
+ * We copy data in order of increasing offset.
+ * Therefore the max_offset_to_sync[] must increase
+ * (or be zero, indicating that nothing is being
+ * copied in that txg).
+ */
+ if (svr->svr_max_offset_to_sync[txgoff] != 0) {
+ ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
+ >=, max_offset_yet);
+ max_offset_yet =
+ svr->svr_max_offset_to_sync[txgoff];
+ }
+
+ /*
+ * We've already committed to copying this segment:
+ * we have allocated space elsewhere in the pool for
+ * it and have an IO outstanding to copy the data. We
+ * cannot free the space before the copy has
+ * completed, or else the copy IO might overwrite any
+ * new data. To free that space, we record the
+ * segment in the appropriate svr_frees tree and free
+ * the mapped space later, in the txg where we have
+ * completed the copy and synced the mapping (see
+ * vdev_mapping_sync).
+ */
+ range_tree_add(svr->svr_frees[txgoff],
+ offset, inflight_size);
+ size -= inflight_size;
+ offset += inflight_size;
+
+ /*
+ * This space is already accounted for as being
+ * done, because it is being copied in txg+i.
+ * However, if i!=0, then it is being copied in
+ * a future txg. If we crash after this txg
+ * syncs but before txg+i syncs, then the space
+ * will be free. Therefore we must account
+ * for the space being done in *this* txg
+ * (when it is freed) rather than the future txg
+ * (when it will be copied).
+ */
+ ASSERT3U(svr->svr_bytes_done[txgoff], >=,
+ inflight_size);
+ svr->svr_bytes_done[txgoff] -= inflight_size;
+ svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
+ }
+ }
+ ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);
+
+ if (size > 0) {
+ /*
+ * The copy thread has not yet visited this offset. Ensure
+ * that it doesn't.
+ */
+
+ DTRACE_PROBE3(remove__free__unvisited,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, size);
+
+ if (svr->svr_allocd_segs != NULL)
+ range_tree_clear(svr->svr_allocd_segs, offset, size);
+
+ /*
+ * Since we now do not need to copy this data, for
+ * accounting purposes we have done our job and can count
+ * it as completed.
+ */
+ svr->svr_bytes_done[txg & TXG_MASK] += size;
+ }
+ mutex_exit(&svr->svr_lock);
+
+ /*
+ * Now that we have dropped svr_lock, process the synced portion
+ * of this free.
+ */
+ if (synced_size > 0) {
+ vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
+
+ /*
+ * Note: this can only be called from syncing context,
+ * and the vdev_indirect_mapping is only changed from the
+ * sync thread, so we don't need svr_lock while doing
+ * metaslab_free_impl_cb.
+ */
+ boolean_t checkpoint = B_FALSE;
+ vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
+ metaslab_free_impl_cb, &checkpoint);
+ }
+}
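+
+/*
+ * Worked example (not part of the change itself): a single free can span
+ * all three states handled above. Suppose the mapping is synced up to
+ * offset 1024M, the syncing txg will extend it to 1280M, and a 512M free
+ * arrives at offset 768M:
+ *
+ *   uint64_t off = 768, size = 512;                           // MiB
+ *   uint64_t synced   = MIN(size, 1024 - off);                // 256M
+ *   uint64_t inflight = MIN(size - synced,
+ *       1280 - (off + synced));                               // 256M
+ *   uint64_t unvisited = size - synced - inflight;            // 0
+ *
+ *   // [768M, 1024M): already remapped on disk -> freed from the new
+ *   //   location via vdev_indirect_mark_obsolete().
+ *   // [1024M, 1280M): being copied in the syncing txg -> recorded in
+ *   //   svr_frees[] and freed once the mapping entry syncs.
+ *   // An unvisited tail would simply be cleared from svr_allocd_segs
+ *   //   and counted as done.
+ */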
+
+/*
+ * Stop an active removal and update the spa_removing phys.
+ */
+static void
+spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));
+
+ /* Ensure the removal thread has completed before we free the svr. */
+ spa_vdev_remove_suspend(spa);
+
+ ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);
+
+ if (state == DSS_FINISHED) {
+ spa_removing_phys_t *srp = &spa->spa_removing_phys;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ if (srp->sr_prev_indirect_vdev != -1) {
+ vdev_t *pvd;
+ pvd = vdev_lookup_top(spa,
+ srp->sr_prev_indirect_vdev);
+ ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops);
+ }
+
+ vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev;
+ srp->sr_prev_indirect_vdev = vd->vdev_id;
+ }
+ spa->spa_removing_phys.sr_state = state;
+ spa->spa_removing_phys.sr_end_time = gethrestime_sec();
+
+ spa->spa_vdev_removal = NULL;
+ spa_vdev_removal_destroy(svr);
+
+ spa_sync_removing_state(spa, tx);
+ spa_notify_waiters(spa);
+
+ vdev_config_dirty(spa->spa_root_vdev);
+}
+
+static void
+free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
+{
+ vdev_t *vd = arg;
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ boolean_t checkpoint = B_FALSE;
+ vdev_indirect_ops.vdev_op_remap(vd, offset, size,
+ metaslab_free_impl_cb, &checkpoint);
+}
+
+/*
+ * On behalf of the removal thread, syncs an incremental bit more of
+ * the indirect mapping to disk and updates the in-memory mapping.
+ * Called as a sync task in every txg that the removal thread makes progress.
+ */
+static void
+vdev_mapping_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ ASSERT(vic->vic_mapping_object != 0);
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+
+ vdev_indirect_mapping_add_entries(vim,
+ &svr->svr_new_segments[txg & TXG_MASK], tx);
+ vdev_indirect_births_add_entry(vd->vdev_indirect_births,
+ vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx);
+
+ /*
+ * Free the copied data for anything that was freed while the
+ * mapping entries were in flight.
+ */
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_frees[txg & TXG_MASK],
+ free_mapped_segment_cb, vd);
+ ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=,
+ vdev_indirect_mapping_max_offset(vim));
+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0;
+ mutex_exit(&svr->svr_lock);
+
+ spa_sync_removing_state(spa, tx);
+}
+
+typedef struct vdev_copy_segment_arg {
+ spa_t *vcsa_spa;
+ dva_t *vcsa_dest_dva;
+ uint64_t vcsa_txg;
+ range_tree_t *vcsa_obsolete_segs;
+} vdev_copy_segment_arg_t;
+
+static void
+unalloc_seg(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_copy_segment_arg_t *vcsa = arg;
+ spa_t *spa = vcsa->vcsa_spa;
+ blkptr_t bp = { { { {0} } } };
+
+ BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
+ BP_SET_LSIZE(&bp, size);
+ BP_SET_PSIZE(&bp, size);
+ BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(&bp, DMU_OT_NONE);
+ BP_SET_LEVEL(&bp, 0);
+ BP_SET_DEDUP(&bp, 0);
+ BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
+
+ DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
+ DVA_SET_OFFSET(&bp.blk_dva[0],
+ DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
+ DVA_SET_ASIZE(&bp.blk_dva[0], size);
+
+ zio_free(spa, vcsa->vcsa_txg, &bp);
+}
+
+/*
+ * All reads and writes associated with a call to spa_vdev_copy_segment()
+ * are done.
+ */
+static void
+spa_vdev_copy_segment_done(zio_t *zio)
+{
+ vdev_copy_segment_arg_t *vcsa = zio->io_private;
+
+ range_tree_vacate(vcsa->vcsa_obsolete_segs,
+ unalloc_seg, vcsa);
+ range_tree_destroy(vcsa->vcsa_obsolete_segs);
+ kmem_free(vcsa, sizeof (*vcsa));
+
+ spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The write of the new location is done.
+ */
+static void
+spa_vdev_copy_segment_write_done(zio_t *zio)
+{
+ vdev_copy_arg_t *vca = zio->io_private;
+
+ abd_free(zio->io_abd);
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes -= zio->io_size;
+
+ if (zio->io_error != 0)
+ vca->vca_write_error_bytes += zio->io_size;
+
+ cv_signal(&vca->vca_cv);
+ mutex_exit(&vca->vca_lock);
+}
+
+/*
+ * The read of the old location is done. The parent zio is the write to
+ * the new location. Allow it to start.
+ */
+static void
+spa_vdev_copy_segment_read_done(zio_t *zio)
+{
+ vdev_copy_arg_t *vca = zio->io_private;
+
+ if (zio->io_error != 0) {
+ mutex_enter(&vca->vca_lock);
+ vca->vca_read_error_bytes += zio->io_size;
+ mutex_exit(&vca->vca_lock);
+ }
+
+ zio_nowait(zio_unique_parent(zio));
+}
+
+/*
+ * If the old and new vdevs are mirrors, we will read both sides of the old
+ * mirror, and write each copy to the corresponding side of the new mirror.
+ * If the old and new vdevs have a different number of children, we will do
+ * this as best we can. Since we aren't verifying checksums, this
+ * ensures that as long as there's a good copy of the data, we'll have a
+ * good copy after the removal, even if there's silent damage to one side
+ * of the mirror. If we're removing a mirror that has some silent damage,
+ * we'll have exactly the same damage in the new location (assuming that
+ * the new location is also a mirror).
+ *
+ * We accomplish this by creating a tree of zio_t's, with as many writes as
+ * there are "children" of the new vdev (a non-redundant vdev counts as one
+ * child, a 2-way mirror has 2 children, etc). Each write has an associated
+ * read from a child of the old vdev. Typically there will be the same
+ * number of children of the old and new vdevs. However, if there are more
+ * children of the new vdev, some child(ren) of the old vdev will be issued
+ * multiple reads. If there are more children of the old vdev, some copies
+ * will be dropped.
+ *
+ * For example, the tree of zio_t's for a 2-way mirror is:
+ *
+ * null
+ * / \
+ * write(new vdev, child 0) write(new vdev, child 1)
+ * | |
+ * read(old vdev, child 0) read(old vdev, child 1)
+ *
+ * Child zio's complete before their parents complete. However, zio's
+ * created with zio_vdev_child_io() may be issued before their children
+ * complete. In this case we need to make sure that the children (reads)
+ * complete before the parents (writes) are *issued*. We do this by not
+ * calling zio_nowait() on each write until its corresponding read has
+ * completed.
+ *
+ * The spa_config_lock must be held while zio's created by
+ * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
+ * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
+ * zio is needed to release the spa_config_lock after all the reads and
+ * writes complete. (Note that we can't grab the config lock for each read,
+ * because it is not reentrant - we could deadlock with a thread waiting
+ * for a write lock.)
+ */
+static void
+spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
+ vdev_t *source_vd, uint64_t source_offset,
+ vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
+{
+ ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+
+ /*
+ * If the destination child is unwritable then there is no point
+ * in issuing the source reads which cannot be written.
+ */
+ if (!vdev_writeable(dest_child_vd))
+ return;
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes += size;
+ mutex_exit(&vca->vca_lock);
+
+ abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+ vdev_t *source_child_vd = NULL;
+ if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
+ /*
+ * Source and dest are both mirrors. Copy from the same
+ * child id as we are copying to (wrapping around if there
+ * are more dest children than source children). If the
+ * preferred source child is unreadable select another.
+ */
+ for (int i = 0; i < source_vd->vdev_children; i++) {
+ source_child_vd = source_vd->vdev_child[
+ (dest_id + i) % source_vd->vdev_children];
+ if (vdev_readable(source_child_vd))
+ break;
+ }
+ } else {
+ source_child_vd = source_vd;
+ }
+
+ /*
+ * There should always be at least one readable source child, or
+ * the pool would be in a suspended state. If we somehow selected an
+ * unreadable child, it would result in IO errors, the removal process
+ * being cancelled, and the pool reverting to its pre-removal state.
+ */
+ ASSERT3P(source_child_vd, !=, NULL);
+
+ zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
+ dest_child_vd, dest_offset, abd, size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_write_done, vca);
+
+ zio_nowait(zio_vdev_child_io(write_zio, NULL,
+ source_child_vd, source_offset, abd, size,
+ ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_read_done, vca));
+}
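+
+/*
+ * Editor's note: the sketch below is illustrative only and is not part of
+ * the original change.  It isolates the read/write gating pattern used by
+ * spa_vdev_copy_one_child() above; the "example_" names are hypothetical,
+ * and the real code also frees the abd and updates byte counts in its
+ * write-done callback.
+ */
+#if 0
+static void
+example_read_done(zio_t *zio)
+{
+ /* Issue the parent write only once its read has completed. */
+ zio_nowait(zio_unique_parent(zio));
+}
+
+static void
+example_copy_one(zio_t *nzio, vdev_t *source_vd, vdev_t *dest_vd,
+ uint64_t source_offset, uint64_t dest_offset, uint64_t size)
+{
+ abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+ /* Create the write as a child of the null zio, but do not issue it. */
+ zio_t *write_zio = zio_vdev_child_io(nzio, NULL, dest_vd, dest_offset,
+ abd, size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
+ NULL, NULL);
+
+ /* Issue the read; example_read_done() then issues the write. */
+ zio_nowait(zio_vdev_child_io(write_zio, NULL, source_vd, source_offset,
+ abd, size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
+ example_read_done, NULL));
+}
+#endif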
+
+/*
+ * Allocate a new location for this segment, and create the zio_t's to
+ * read from the old location and write to the new location.
+ */
+static int
+spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
+ uint64_t maxalloc, uint64_t txg,
+ vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_indirect_mapping_entry_t *entry;
+ dva_t dst = {{ 0 }};
+ uint64_t start = range_tree_min(segs);
+ ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift));
+
+ ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
+ ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift));
+
+ uint64_t size = range_tree_span(segs);
+ if (range_tree_span(segs) > maxalloc) {
+ /*
+ * We can't allocate all the segments. Prefer to end
+ * the allocation at the end of a segment, thus avoiding
+ * additional split blocks.
+ */
+ range_seg_max_t search;
+ zfs_btree_index_t where;
+ rs_set_start(&search, segs, start + maxalloc);
+ rs_set_end(&search, segs, start + maxalloc);
+ (void) zfs_btree_find(&segs->rt_root, &search, &where);
+ range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where,
+ &where);
+ if (rs != NULL) {
+ size = rs_get_end(rs, segs) - start;
+ } else {
+ /*
+ * There are no segments that end before maxalloc.
+ * I.e. the first segment is larger than maxalloc,
+ * so we must split it.
+ */
+ size = maxalloc;
+ }
+ }
+ ASSERT3U(size, <=, maxalloc);
+ ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift));
+
+ /*
+ * An allocation class might not have any remaining vdevs or space.
+ */
+ metaslab_class_t *mc = mg->mg_class;
+ if (mc->mc_groups == 0)
+ mc = spa_normal_class(spa);
+ int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+ zal, 0);
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+ &dst, 0, NULL, txg, 0, zal, 0);
+ }
+ if (error != 0)
+ return (error);
+
+ /*
+ * Determine the ranges that are not actually needed. Offsets are
+ * relative to the start of the range to be copied (i.e. relative to the
+ * local variable "start").
+ */
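+ /*
+ * Illustrative example (editor's note, hypothetical values): if the
+ * allocated range being copied covers segments [0, 96K) and
+ * [100K, 160K), the 4K hole [96K, 100K) is recorded in obsolete_segs
+ * at offset 96K (i.e. relative to "start"), and is later freed from
+ * the new location by unalloc_seg().
+ */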
+ range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
+
+ zfs_btree_index_t where;
+ range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
+ ASSERT3U(rs_get_start(rs, segs), ==, start);
+ uint64_t prev_seg_end = rs_get_end(rs, segs);
+ while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) {
+ if (rs_get_start(rs, segs) >= start + size) {
+ break;
+ } else {
+ range_tree_add(obsolete_segs,
+ prev_seg_end - start,
+ rs_get_start(rs, segs) - prev_seg_end);
+ }
+ prev_seg_end = rs_get_end(rs, segs);
+ }
+ /* We don't end in the middle of an obsolete range */
+ ASSERT3U(start + size, <=, prev_seg_end);
+
+ range_tree_clear(segs, start, size);
+
+ /*
+ * We can't have any padding of the allocated size, otherwise we will
+ * misunderstand what's allocated, and the size of the mapping. We
+ * prevent padding by ensuring that all devices in the pool have the
+ * same ashift, and the allocation size is a multiple of the ashift.
+ */
+ VERIFY3U(DVA_GET_ASIZE(&dst), ==, size);
+
+ entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
+ DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
+ entry->vime_mapping.vimep_dst = dst;
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ entry->vime_obsolete_count = range_tree_space(obsolete_segs);
+ }
+
+ vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
+ vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
+ vcsa->vcsa_obsolete_segs = obsolete_segs;
+ vcsa->vcsa_spa = spa;
+ vcsa->vcsa_txg = txg;
+
+ /*
+ * See comment before spa_vdev_copy_one_child().
+ */
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+ zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
+ spa_vdev_copy_segment_done, vcsa, 0);
+ vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
+ if (dest_vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < dest_vd->vdev_children; i++) {
+ vdev_t *child = dest_vd->vdev_child[i];
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ child, DVA_GET_OFFSET(&dst), i, size);
+ }
+ } else {
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ dest_vd, DVA_GET_OFFSET(&dst), -1, size);
+ }
+ zio_nowait(nzio);
+
+ list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
+ ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
+ vdev_dirty(vd, 0, NULL, txg);
+
+ return (0);
+}
+
+/*
+ * Complete the removal of a toplevel vdev. This is called as a
+ * synctask in the same txg that we will sync out the new config (to the
+ * MOS object) which indicates that this vdev is indirect.
+ */
+static void
+vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(svr->svr_bytes_done[i]);
+ }
+
+ ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
+ spa->spa_removing_phys.sr_to_copy);
+
+ vdev_destroy_spacemaps(vd, tx);
+
+ /* destroy leaf zaps, if any */
+ ASSERT3P(svr->svr_zaplist, !=, NULL);
+ for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) {
+ vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx);
+ }
+ fnvlist_free(svr->svr_zaplist);
+
+ spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx);
+ /* vd->vdev_path is not available here */
+ spa_history_log_internal(spa, "vdev remove completed", tx,
+ "%s vdev %llu", spa_name(spa), (u_longlong_t)vd->vdev_id);
+}
+
+static void
+vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
+{
+ ASSERT3P(zlist, !=, NULL);
+ ASSERT0(vdev_get_nparity(vd));
+
+ if (vd->vdev_leaf_zap != 0) {
+ char zkey[32];
+ (void) snprintf(zkey, sizeof (zkey), "%s-%llu",
+ VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)vd->vdev_leaf_zap);
+ fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap);
+ }
+
+ for (uint64_t id = 0; id < vd->vdev_children; id++) {
+ vdev_remove_enlist_zaps(vd->vdev_child[id], zlist);
+ }
+}
+
+static void
+vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
+{
+ vdev_t *ivd;
+ dmu_tx_t *tx;
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ /*
+ * First, build a list of leaf zaps to be destroyed.
+ * This is passed to the sync context thread,
+ * which does the actual unlinking.
+ */
+ svr->svr_zaplist = fnvlist_alloc();
+ vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
+
+ ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+ ivd->vdev_removing = 0;
+
+ vd->vdev_leaf_zap = 0;
+
+ vdev_remove_child(ivd, vd);
+ vdev_compact_children(ivd);
+
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
+
+ mutex_enter(&svr->svr_lock);
+ svr->svr_thread = NULL;
+ cv_broadcast(&svr->svr_cv);
+ mutex_exit(&svr->svr_lock);
+
+ /* After this, we can not use svr. */
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool,
+ vdev_remove_complete_sync, svr, tx);
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Complete the removal of a toplevel vdev. This is called in open
+ * context by the removal thread after we have copied all vdev's data.
+ */
+static void
+vdev_remove_complete(spa_t *spa)
+{
+ uint64_t txg;
+
+ /*
+ * Wait for any deferred frees to be synced before we call
+ * vdev_metaslab_fini()
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ txg = spa_vdev_enter(spa);
+ vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+
+ zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
+ vd->vdev_id, txg);
+
+ /*
+ * Discard allocation state.
+ */
+ if (vd->vdev_mg != NULL) {
+ vdev_metaslab_fini(vd);
+ metaslab_group_destroy(vd->vdev_mg);
+ vd->vdev_mg = NULL;
+ spa_log_sm_set_blocklimit(spa);
+ }
+ if (vd->vdev_log_mg != NULL) {
+ ASSERT0(vd->vdev_ms_count);
+ metaslab_group_destroy(vd->vdev_log_mg);
+ vd->vdev_log_mg = NULL;
+ }
+ ASSERT0(vd->vdev_stat.vs_space);
+ ASSERT0(vd->vdev_stat.vs_dspace);
+
+ vdev_remove_replace_with_indirect(vd, txg);
+
+ /*
+ * We now release the locks, allowing spa_sync to run and finish the
+ * removal via vdev_remove_complete_sync in syncing context.
+ *
+ * Note that we hold on to the vdev_t that has been replaced. Since
+ * it isn't part of the vdev tree any longer, it can't be concurrently
+ * manipulated, even while we don't have the config lock.
+ */
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ /*
+ * Top ZAP should have been transferred to the indirect vdev in
+ * vdev_remove_replace_with_indirect.
+ */
+ ASSERT0(vd->vdev_top_zap);
+
+ /*
+ * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect.
+ */
+ ASSERT0(vd->vdev_leaf_zap);
+
+ txg = spa_vdev_enter(spa);
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+ /*
+ * Request to update the config and the config cachefile.
+ */
+ vdev_config_dirty(spa->spa_root_vdev);
+ (void) spa_vdev_exit(spa, vd, txg, 0);
+
+ if (ev != NULL)
+ spa_event_post(ev);
+}
+
+/*
+ * Evacuates a segment of size at most max_alloc from the vdev
+ * via repeated calls to spa_vdev_copy_segment. If an allocation
+ * fails, the pool is probably too fragmented to handle such a
+ * large size, so decrease max_alloc so that the caller will not try
+ * this size again this txg.
+ */
+static void
+spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+ uint64_t *max_alloc, dmu_tx_t *tx)
+{
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ mutex_enter(&svr->svr_lock);
+
+ /*
+ * Determine how big of a chunk to copy. We can allocate up
+ * to max_alloc bytes, and we can span up to vdev_removal_max_span
+ * bytes of unallocated space at a time. "segs" will track the
+ * allocated segments that we are copying. We may also be copying
+ * free segments (of up to vdev_removal_max_span bytes).
+ */
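+ /*
+ * Illustrative scenario (editor's note, values hypothetical): with
+ * allocated segments [0, 96K) and [160K, 224K), max_alloc = 1M and
+ * vdev_removal_max_span = 32K, the 64K hole between the segments
+ * exceeds vdev_removal_max_span, so only [0, 96K) is copied now and
+ * the second segment is left for a later chunk.
+ */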
+ range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ for (;;) {
+ range_tree_t *rt = svr->svr_allocd_segs;
+ range_seg_t *rs = range_tree_first(rt);
+
+ if (rs == NULL)
+ break;
+
+ uint64_t seg_length;
+
+ if (range_tree_is_empty(segs)) {
+ /* need to truncate the first seg based on max_alloc */
+ seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs,
+ rt), *max_alloc);
+ } else {
+ if (rs_get_start(rs, rt) - range_tree_max(segs) >
+ vdev_removal_max_span) {
+ /*
+ * Including this segment would cause us to
+ * copy more unneeded free space than is allowed.
+ */
+ break;
+ } else if (rs_get_end(rs, rt) - range_tree_min(segs) >
+ *max_alloc) {
+ /*
+ * This additional segment would extend past
+ * max_alloc. Rather than splitting this
+ * segment, leave it for the next mapping.
+ */
+ break;
+ } else {
+ seg_length = rs_get_end(rs, rt) -
+ rs_get_start(rs, rt);
+ }
+ }
+
+ range_tree_add(segs, rs_get_start(rs, rt), seg_length);
+ range_tree_remove(svr->svr_allocd_segs,
+ rs_get_start(rs, rt), seg_length);
+ }
+
+ if (range_tree_is_empty(segs)) {
+ mutex_exit(&svr->svr_lock);
+ range_tree_destroy(segs);
+ return;
+ }
+
+ if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
+ svr, tx);
+ }
+
+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
+
+ /*
+ * Note: this is the amount of *allocated* space
+ * that we are taking care of each txg.
+ */
+ svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
+
+ mutex_exit(&svr->svr_lock);
+
+ zio_alloc_list_t zal;
+ metaslab_trace_init(&zal);
+ uint64_t thismax = SPA_MAXBLOCKSIZE;
+ while (!range_tree_is_empty(segs)) {
+ int error = spa_vdev_copy_segment(vd,
+ segs, thismax, txg, vca, &zal);
+
+ if (error == ENOSPC) {
+ /*
+ * Cut our segment in half, and don't try this
+ * segment size again this txg. Note that the
+ * allocation size must be aligned to the highest
+ * ashift in the pool, so that the allocation will
+ * not be padded out to a multiple of the ashift,
+ * which could cause us to think that this mapping
+ * is larger than we intended.
+ */
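+ /*
+ * Worked example (editor's note, hypothetical values): with a
+ * 4 KiB ashift and attempted = 1 MiB, the next attempt uses
+ * thismax = P2ROUNDUP(512 KiB, 4 KiB) = 512 KiB, and max_alloc
+ * drops to 1 MiB - 4 KiB, so the full 1 MiB size is not retried
+ * this txg.
+ */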
+ ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
+ uint64_t attempted =
+ MIN(range_tree_span(segs), thismax);
+ thismax = P2ROUNDUP(attempted / 2,
+ 1 << spa->spa_max_ashift);
+ /*
+ * The minimum-size allocation cannot fail.
+ */
+ ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
+ *max_alloc = attempted - (1 << spa->spa_max_ashift);
+ } else {
+ ASSERT0(error);
+
+ /*
+ * We've performed an allocation, so reset the
+ * alloc trace list.
+ */
+ metaslab_trace_fini(&zal);
+ metaslab_trace_init(&zal);
+ }
+ }
+ metaslab_trace_fini(&zal);
+ range_tree_destroy(segs);
+}
+
+/*
+ * The size of each removal mapping is limited by the tunable
+ * zfs_remove_max_segment, but we must adjust this to be a multiple of the
+ * pool's ashift, so that we don't try to split individual sectors regardless
+ * of the tunable value. (Note that device removal requires that all devices
+ * have the same ashift, so there's no difference between spa_min_ashift and
+ * spa_max_ashift.) The raw tunable should not be used elsewhere.
+ */
+uint64_t
+spa_remove_max_segment(spa_t *spa)
+{
+ return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift));
+}
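+
+/*
+ * Editor's worked example (hypothetical tunable value): with a 4 KiB sector
+ * size (ashift 12), zfs_remove_max_segment = 1000000 would be rounded up to
+ * P2ROUNDUP(1000000, 4096) = 1003520 bytes before use.
+ */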
+
+/*
+ * The removal thread operates in open context. It iterates over all
+ * allocated space in the vdev, by loading each metaslab's spacemap.
+ * For each contiguous segment of allocated space (capping the segment
+ * size at SPA_MAXBLOCKSIZE), we:
+ * - Allocate space for it on another vdev.
+ * - Create a new mapping from the old location to the new location
+ * (as a record in svr_new_segments).
+ * - Initiate a physical read zio to get the data off the removing disk.
+ * - In the read zio's done callback, initiate a physical write zio to
+ * write it to the new vdev.
+ * Note that all of this will take effect when a particular TXG syncs.
+ * The sync thread ensures that all the phys reads and writes for the syncing
+ * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
+ * (see vdev_mapping_sync()).
+ */
+static void
+spa_vdev_remove_thread(void *arg)
+{
+ spa_t *spa = arg;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_copy_arg_t vca;
+ uint64_t max_alloc = spa_remove_max_segment(spa);
+ uint64_t last_txg = 0;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
+
+ ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(vd->vdev_removing);
+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+ ASSERT(vim != NULL);
+
+ mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
+ vca.vca_outstanding_bytes = 0;
+ vca.vca_read_error_bytes = 0;
+ vca.vca_write_error_bytes = 0;
+
+ mutex_enter(&svr->svr_lock);
+
+ /*
+ * Start from vim_max_offset so we pick up where we left off
+ * if we are restarting the removal after opening the pool.
+ */
+ uint64_t msi;
+ for (msi = start_offset >> vd->vdev_ms_shift;
+ msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
+ metaslab_t *msp = vd->vdev_ms[msi];
+ ASSERT3U(msi, <=, vd->vdev_ms_count);
+
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Assert nothing in flight -- ms_*tree is empty.
+ */
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
+ }
+
+ /*
+ * If the metaslab has ever been allocated from (ms_sm!=NULL),
+ * read the allocated segments from the space map object
+ * into svr_allocd_segs. Since we do this while holding
+ * svr_lock and ms_sync_lock, concurrent frees (which
+ * would have modified the space map) will wait for us
+ * to finish loading the spacemap, and then take the
+ * appropriate action (see free_from_removing_vdev()).
+ */
+ if (msp->ms_sm != NULL) {
+ VERIFY0(space_map_load(msp->ms_sm,
+ svr->svr_allocd_segs, SM_ALLOC));
+
+ range_tree_walk(msp->ms_unflushed_allocs,
+ range_tree_add, svr->svr_allocd_segs);
+ range_tree_walk(msp->ms_unflushed_frees,
+ range_tree_remove, svr->svr_allocd_segs);
+ range_tree_walk(msp->ms_freeing,
+ range_tree_remove, svr->svr_allocd_segs);
+
+ /*
+ * When we are resuming from a paused removal (i.e.
+ * when importing a pool with a removal in progress),
+ * discard any state that we have already processed.
+ */
+ range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
+ }
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&msp->ms_sync_lock);
+
+ vca.vca_msp = msp;
+ zfs_dbgmsg("copying %llu segments for metaslab %llu",
+ zfs_btree_numnodes(&svr->svr_allocd_segs->rt_root),
+ msp->ms_id);
+
+ while (!svr->svr_thread_exit &&
+ !range_tree_is_empty(svr->svr_allocd_segs)) {
+
+ mutex_exit(&svr->svr_lock);
+
+ /*
+ * We need to periodically drop the config lock so that
+ * writers can get in. Additionally, we can't wait
+ * for a txg to sync while holding a config lock
+ * (since a waiting writer could cause a 3-way deadlock
+ * with the sync thread, which also gets a config
+ * lock for reader). So we can't hold the config lock
+ * while calling dmu_tx_assign().
+ */
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * This delay will pause the removal around the point
+ * specified by zfs_removal_suspend_progress. This is used
+ * solely by the test suite and for debugging.
+ */
+ uint64_t bytes_copied =
+ spa->spa_removing_phys.sr_copied;
+ for (int i = 0; i < TXG_SIZE; i++)
+ bytes_copied += svr->svr_bytes_done[i];
+ while (zfs_removal_suspend_progress &&
+ !svr->svr_thread_exit)
+ delay(hz);
+
+ mutex_enter(&vca.vca_lock);
+ while (vca.vca_outstanding_bytes >
+ zfs_remove_max_copy_bytes) {
+ cv_wait(&vca.vca_cv, &vca.vca_lock);
+ }
+ mutex_exit(&vca.vca_lock);
+
+ dmu_tx_t *tx =
+ dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ /*
+ * Reacquire the vdev_config lock. The vdev_t
+ * that we're removing may have changed, e.g. due
+ * to a vdev_attach or vdev_detach.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
+ if (txg != last_txg)
+ max_alloc = spa_remove_max_segment(spa);
+ last_txg = txg;
+
+ spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
+
+ dmu_tx_commit(tx);
+ mutex_enter(&svr->svr_lock);
+ }
+
+ mutex_enter(&vca.vca_lock);
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ svr->svr_thread_exit = B_TRUE;
+ }
+ mutex_exit(&vca.vca_lock);
+ }
+
+ mutex_exit(&svr->svr_lock);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * Wait for all copies to finish before cleaning up the vca.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ ASSERT0(vca.vca_outstanding_bytes);
+
+ mutex_destroy(&vca.vca_lock);
+ cv_destroy(&vca.vca_cv);
+
+ if (svr->svr_thread_exit) {
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
+ svr->svr_thread = NULL;
+ cv_broadcast(&svr->svr_cv);
+ mutex_exit(&svr->svr_lock);
+
+ /*
+ * During the removal process an unrecoverable read or write
+ * error was encountered. The removal process must be
+ * cancelled or this damage may become permanent.
+ */
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ zfs_dbgmsg("canceling removal due to IO errors: "
+ "[read_error_bytes=%llu] [write_error_bytes=%llu]",
+ vca.vca_read_error_bytes,
+ vca.vca_write_error_bytes);
+ spa_vdev_remove_cancel_impl(spa);
+ }
+ } else {
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+ vdev_remove_complete(spa);
+ }
+
+ thread_exit();
+}
+
+void
+spa_vdev_remove_suspend(spa_t *spa)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ if (svr == NULL)
+ return;
+
+ mutex_enter(&svr->svr_lock);
+ svr->svr_thread_exit = B_TRUE;
+ while (svr->svr_thread != NULL)
+ cv_wait(&svr->svr_cv, &svr->svr_lock);
+ svr->svr_thread_exit = B_FALSE;
+ mutex_exit(&svr->svr_lock);
+}
+
+/* ARGSUSED */
+static int
+spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (spa->spa_vdev_removal == NULL)
+ return (ENOTACTIVE);
+ return (0);
+}
+
+/*
+ * Cancel a removal by freeing all entries from the partial mapping
+ * and marking the vdev as no longer being removing.
+ */
+/* ARGSUSED */
+static void
+spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ objset_t *mos = spa->spa_meta_objset;
+
+ ASSERT3P(svr->svr_thread, ==, NULL);
+
+ spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
+
+ boolean_t are_precise;
+ VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ if (are_precise) {
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
+ }
+
+ uint64_t obsolete_sm_object;
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object != 0) {
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT3U(obsolete_sm_object, ==,
+ space_map_object(vd->vdev_obsolete_sm));
+
+ space_map_free(vd->vdev_obsolete_sm, tx);
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ }
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_is_empty(&svr->svr_new_segments[i]));
+ ASSERT3U(svr->svr_max_offset_to_sync[i], <=,
+ vdev_indirect_mapping_max_offset(vim));
+ }
+
+ for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
+ metaslab_t *msp = vd->vdev_ms[msi];
+
+ if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
+ break;
+
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Assert nothing in flight -- ms_*tree is empty.
+ */
+ for (int i = 0; i < TXG_SIZE; i++)
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
+ for (int i = 0; i < TXG_DEFER_SIZE; i++)
+ ASSERT0(range_tree_space(msp->ms_defer[i]));
+ ASSERT0(range_tree_space(msp->ms_freed));
+
+ if (msp->ms_sm != NULL) {
+ mutex_enter(&svr->svr_lock);
+ VERIFY0(space_map_load(msp->ms_sm,
+ svr->svr_allocd_segs, SM_ALLOC));
+
+ range_tree_walk(msp->ms_unflushed_allocs,
+ range_tree_add, svr->svr_allocd_segs);
+ range_tree_walk(msp->ms_unflushed_frees,
+ range_tree_remove, svr->svr_allocd_segs);
+ range_tree_walk(msp->ms_freeing,
+ range_tree_remove, svr->svr_allocd_segs);
+
+ /*
+ * Clear everything past what has been synced,
+ * because we have not allocated mappings for it yet.
+ */
+ uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
+ uint64_t sm_end = msp->ms_sm->sm_start +
+ msp->ms_sm->sm_size;
+ if (sm_end > syncd)
+ range_tree_clear(svr->svr_allocd_segs,
+ syncd, sm_end - syncd);
+
+ mutex_exit(&svr->svr_lock);
+ }
+ mutex_exit(&msp->ms_lock);
+
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_allocd_segs,
+ free_mapped_segment_cb, vd);
+ mutex_exit(&svr->svr_lock);
+ }
+
+ /*
+ * Note: this must happen after we invoke free_mapped_segment_cb,
+ * because it adds to the obsolete_segments.
+ */
+ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
+
+ ASSERT3U(vic->vic_mapping_object, ==,
+ vdev_indirect_mapping_object(vd->vdev_indirect_mapping));
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vd->vdev_indirect_mapping = NULL;
+ vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
+ vic->vic_mapping_object = 0;
+
+ ASSERT3U(vic->vic_births_object, ==,
+ vdev_indirect_births_object(vd->vdev_indirect_births));
+ vdev_indirect_births_close(vd->vdev_indirect_births);
+ vd->vdev_indirect_births = NULL;
+ vdev_indirect_births_free(mos, vic->vic_births_object, tx);
+ vic->vic_births_object = 0;
+
+ /*
+ * We may have processed some frees from the removing vdev in this
+ * txg, thus increasing svr_bytes_done; discard that here to
+ * satisfy the assertions in spa_vdev_removal_destroy().
+ * Note that future txgs cannot have any bytes_done, because
+ * future txgs are only modified from open context, and we have
+ * already shut down the copying thread.
+ */
+ svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0;
+ spa_finish_removal(spa, DSS_CANCELED, tx);
+
+ vd->vdev_removing = B_FALSE;
+ vdev_config_dirty(vd);
+
+ zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
+ vd->vdev_id, dmu_tx_get_txg(tx));
+ spa_history_log_internal(spa, "vdev remove canceled", tx,
+ "%s vdev %llu %s", spa_name(spa),
+ (u_longlong_t)vd->vdev_id,
+ (vd->vdev_path != NULL) ? vd->vdev_path : "-");
+}
+
+static int
+spa_vdev_remove_cancel_impl(spa_t *spa)
+{
+ uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
+
+ int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
+ spa_vdev_remove_cancel_sync, NULL, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
+
+ if (error == 0) {
+ spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
+ vdev_t *vd = vdev_lookup_top(spa, vdid);
+ metaslab_group_activate(vd->vdev_mg);
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_activate(vd->vdev_log_mg);
+ spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
+ }
+
+ return (error);
+}
+
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+ spa_vdev_remove_suspend(spa);
+
+ if (spa->spa_vdev_removal == NULL)
+ return (ENOTACTIVE);
+
+ return (spa_vdev_remove_cancel_impl(spa));
+}
+
+void
+svr_sync(spa_t *spa, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ if (svr == NULL)
+ return;
+
+ /*
+ * This check is necessary so that we do not dirty the
+ * DIRECTORY_OBJECT via spa_sync_removing_state() when there
+ * is nothing to do. Dirtying it every time would prevent us
+ * from syncing-to-convergence.
+ */
+ if (svr->svr_bytes_done[txgoff] == 0)
+ return;
+
+ /*
+ * Update progress accounting.
+ */
+ spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff];
+ svr->svr_bytes_done[txgoff] = 0;
+
+ spa_sync_removing_state(spa, tx);
+}
+
+static void
+vdev_remove_make_hole_and_free(vdev_t *vd)
+{
+ uint64_t id = vd->vdev_id;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ vdev_free(vd);
+
+ vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
+ vdev_add_child(rvd, vd);
+ vdev_config_dirty(rvd);
+
+ /*
+ * Reassess the health of our root vdev.
+ */
+ vdev_reopen(rvd);
+}
+
+/*
+ * Remove a log device. The config lock is held for the specified TXG.
+ */
+static int
+spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+
+ ASSERT(vd->vdev_islog);
+ ASSERT(vd == vd->vdev_top);
+ ASSERT3P(vd->vdev_log_mg, ==, NULL);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /*
+ * Stop allocating from this vdev.
+ */
+ metaslab_group_passivate(mg);
+
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+ /*
+ * Cancel any initialize or TRIM which was in progress.
+ */
+ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+ vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED);
+ vdev_autotrim_stop_wait(vd);
+
+ /*
+ * Evacuate the device. We don't hold the config lock as
+ * writer since we need to do I/O but we do keep the
+ * spa_namespace_lock held. Once this completes the device
+ * should no longer have any blocks allocated on it.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (vd->vdev_stat.vs_alloc != 0)
+ error = spa_reset_logs(spa);
+
+ *txg = spa_vdev_config_enter(spa);
+
+ if (error != 0) {
+ metaslab_group_activate(mg);
+ ASSERT3P(vd->vdev_log_mg, ==, NULL);
+ return (error);
+ }
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
+ /*
+ * The evacuation succeeded. Remove any remaining MOS metadata
+ * associated with this vdev, and wait for these changes to sync.
+ */
+ vd->vdev_removing = B_TRUE;
+
+ vdev_dirty_leaves(vd, VDD_DTL, *txg);
+ vdev_config_dirty(vd);
+
+ /*
+ * When the log space map feature is enabled we look at
+ * the vdev's top_zap to find the on-disk flush data of
+ * the metaslab we just flushed. Thus, while removing a
+ * log vdev we make sure to call vdev_metaslab_fini()
+ * first, which removes all metaslabs of this vdev from
+ * spa_metaslabs_by_flushed before vdev_remove_empty()
+ * destroys the top_zap of this log vdev.
+ *
+ * This avoids the scenario where we flush a metaslab
+ * from the log vdev being removed that doesn't have a
+ * top_zap and end up failing to lookup its on-disk flush
+ * data.
+ *
+ * We don't call metaslab_group_destroy() right away
+ * though (it will be called in vdev_free() later) as
+ * during metaslab_sync() of metaslabs from other vdevs
+ * we may touch the metaslab group of this vdev through
+ * metaslab_class_histogram_verify()
+ */
+ vdev_metaslab_fini(vd);
+ spa_log_sm_set_blocklimit(spa);
+
+ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
+ *txg = spa_vdev_config_enter(spa);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /* The top ZAP should have been destroyed by vdev_remove_empty. */
+ ASSERT0(vd->vdev_top_zap);
+ /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
+ ASSERT0(vd->vdev_leaf_zap);
+
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ if (list_link_active(&vd->vdev_state_dirty_node))
+ vdev_state_clean(vd);
+ if (list_link_active(&vd->vdev_config_dirty_node))
+ vdev_config_clean(vd);
+
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
+ /*
+ * Clean up the vdev namespace.
+ */
+ vdev_remove_make_hole_and_free(vd);
+
+ if (ev != NULL)
+ spa_event_post(ev);
+
+ return (0);
+}
+
+static int
+spa_vdev_remove_top_check(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd != vd->vdev_top)
+ return (SET_ERROR(ENOTSUP));
+
+ if (!vdev_is_concrete(vd))
+ return (SET_ERROR(ENOTSUP));
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
+ return (SET_ERROR(ENOTSUP));
+
+ metaslab_class_t *mc = vd->vdev_mg->mg_class;
+ metaslab_class_t *normal = spa_normal_class(spa);
+ if (mc != normal) {
+ /*
+ * Space allocated from the special (or dedup) class is
+ * included in the DMU's space usage, but it's not included
+ * in spa_dspace (or dsl_pool_adjustedsize()). Therefore
+ * there is always at least as much free space in the normal
+ * class, as is allocated from the special (and dedup) class.
+ * As a backup check, we will return ENOSPC if this is
+ * violated. See also spa_update_dspace().
+ */
+ uint64_t available = metaslab_class_get_space(normal) -
+ metaslab_class_get_alloc(normal);
+ ASSERT3U(available, >=, vd->vdev_stat.vs_alloc);
+ if (available < vd->vdev_stat.vs_alloc)
+ return (SET_ERROR(ENOSPC));
+ } else {
+ /* available space in the pool's normal class */
+ uint64_t available = dsl_dir_space_available(
+ spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+ if (available <
+ vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+ /*
+ * This is a normal device. There has to be enough free
+ * space to remove the device and leave double the
+ * "slop" space (i.e. we must leave at least 3% of the
+ * pool free, in addition to the normal slop space).
+ */
+ return (SET_ERROR(ENOSPC));
+ }
+ }
+
+ /*
+ * There cannot be a removal in progress.
+ */
+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * The device must have all its data.
+ */
+ if (!vdev_dtl_empty(vd, DTL_MISSING) ||
+ !vdev_dtl_empty(vd, DTL_OUTAGE))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * The device must be healthy.
+ */
+ if (!vdev_readable(vd))
+ return (SET_ERROR(EIO));
+
+ /*
+ * All vdevs in normal class must have the same ashift.
+ */
+ if (spa->spa_max_ashift != spa->spa_min_ashift) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * A removed special/dedup vdev must have same ashift as normal class.
+ */
+ ASSERT(!vd->vdev_islog);
+ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE &&
+ vd->vdev_ashift != spa->spa_max_ashift) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * All vdevs in normal class must have the same ashift
+ * and not be raidz or draid.
+ */
+ vdev_t *rvd = spa->spa_root_vdev;
+ int num_indirect = 0;
+ for (uint64_t id = 0; id < rvd->vdev_children; id++) {
+ vdev_t *cvd = rvd->vdev_child[id];
+
+ /*
+ * A removed special/dedup vdev must have the same ashift
+ * across all vdevs in its class.
+ */
+ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE &&
+ cvd->vdev_alloc_bias == vd->vdev_alloc_bias &&
+ cvd->vdev_ashift != vd->vdev_ashift) {
+ return (SET_ERROR(EINVAL));
+ }
+ if (cvd->vdev_ashift != 0 &&
+ cvd->vdev_alloc_bias == VDEV_BIAS_NONE)
+ ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
+ if (cvd->vdev_ops == &vdev_indirect_ops)
+ num_indirect++;
+ if (!vdev_is_concrete(cvd))
+ continue;
+ if (vdev_get_nparity(cvd) != 0)
+ return (SET_ERROR(EINVAL));
+ /*
+ * Need the mirror to be a mirror of leaf vdevs only.
+ */
+ if (cvd->vdev_ops == &vdev_mirror_ops) {
+ for (uint64_t cid = 0;
+ cid < cvd->vdev_children; cid++) {
+ if (!cvd->vdev_child[cid]->vdev_ops->
+ vdev_op_leaf)
+ return (SET_ERROR(EINVAL));
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Initiate removal of a top-level vdev, reducing the total space in the pool.
+ * The config lock is held for the specified TXG. Once initiated,
+ * evacuation of all allocated space (copying it to other vdevs) happens
+ * in the background (see spa_vdev_remove_thread()), and can be canceled
+ * (see spa_vdev_remove_cancel()). If successful, the vdev will
+ * be transformed to an indirect vdev (see spa_vdev_remove_complete()).
+ */
+static int
+spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ int error;
+
+ /*
+ * Check for errors up-front, so that we don't waste time
+ * passivating the metaslab group and clearing the ZIL if there
+ * are errors.
+ */
+ error = spa_vdev_remove_top_check(vd);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Stop allocating from this vdev. Note that we must check
+ * that this is not the only device in the pool before
+ * passivating, otherwise we will not be able to make
+ * progress because we can't allocate from any vdevs.
+ * The above check for sufficient free space serves this
+ * purpose.
+ */
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_group_passivate(mg);
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_passivate(vd->vdev_log_mg);
+
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+ /*
+ * We must ensure that no "stubby" log blocks are allocated
+ * on the device to be removed. These blocks could be
+ * written at any time, including while we are in the middle
+ * of copying them.
+ */
+ error = spa_reset_logs(spa);
+
+ /*
+ * We stop any initializing and TRIM that is currently in progress
+ * but leave the state as "active". This will allow the process to
+ * resume if the removal is canceled sometime later.
+ */
+ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+ vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE);
+ vdev_autotrim_stop_wait(vd);
+
+ *txg = spa_vdev_config_enter(spa);
+
+ /*
+ * Things might have changed while the config lock was dropped
+ * (e.g. space usage). Check for errors again.
+ */
+ if (error == 0)
+ error = spa_vdev_remove_top_check(vd);
+
+ if (error != 0) {
+ metaslab_group_activate(mg);
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_activate(vd->vdev_log_mg);
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+ spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
+ spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
+ return (error);
+ }
+
+ vd->vdev_removing = B_TRUE;
+
+ vdev_dirty_leaves(vd, VDD_DTL, *txg);
+ vdev_config_dirty(vd);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool,
+ vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx);
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Remove a device from the pool.
+ *
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time. As a result we use
+ * the spa_vdev_config_[enter/exit] functions which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock. During each step the configuration is synced out.
+ */
+int
+spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
+{
+ vdev_t *vd;
+ nvlist_t **spares, **l2cache, *nv;
+ uint64_t txg = 0;
+ uint_t nspares, nl2cache;
+ int error = 0, error_log;
+ boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
+ sysevent_t *ev = NULL;
+ char *vd_type = NULL, *vd_path = NULL;
+
+ ASSERT(spa_writeable(spa));
+
+ if (!locked)
+ txg = spa_vdev_enter(spa);
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+
+ if (!locked)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ return (error);
+ }
+
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ if (spa->spa_spares.sav_vdevs != NULL &&
+ nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
+ (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
+ /*
+ * Only remove the hot spare if it's not currently in use
+ * in this pool.
+ */
+ if (vd == NULL || unspare) {
+ char *type;
+ boolean_t draid_spare = B_FALSE;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type)
+ == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
+ draid_spare = B_TRUE;
+
+ if (vd == NULL && draid_spare) {
+ error = SET_ERROR(ENOTSUP);
+ } else {
+ if (vd == NULL)
+ vd = spa_lookup_by_guid(spa,
+ guid, B_TRUE);
+ ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_AUX);
+
+ vd_type = VDEV_TYPE_SPARE;
+ vd_path = spa_strdup(fnvlist_lookup_string(
+ nv, ZPOOL_CONFIG_PATH));
+ spa_vdev_remove_aux(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares, nv);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+ } else {
+ error = SET_ERROR(EBUSY);
+ }
+ } else if (spa->spa_l2cache.sav_vdevs != NULL &&
+ nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
+ (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
+ vd_type = VDEV_TYPE_L2CACHE;
+ vd_path = spa_strdup(fnvlist_lookup_string(
+ nv, ZPOOL_CONFIG_PATH));
+ /*
+ * Cache devices can always be removed.
+ */
+ vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+
+ /*
+ * Stop trimming the cache device. We need to release the
+ * config lock to allow the syncing of TRIM transactions
+ * without releasing the spa_namespace_lock. The same
+ * strategy is employed in spa_vdev_remove_top().
+ */
+ spa_vdev_config_exit(spa, NULL,
+ txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
+ mutex_exit(&vd->vdev_trim_lock);
+ txg = spa_vdev_config_enter(spa);
+
+ ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
+ spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ } else if (vd != NULL && vd->vdev_islog) {
+ ASSERT(!locked);
+ vd_type = VDEV_TYPE_LOG;
+ vd_path = spa_strdup((vd->vdev_path != NULL) ?
+ vd->vdev_path : "-");
+ error = spa_vdev_remove_log(vd, &txg);
+ } else if (vd != NULL) {
+ ASSERT(!locked);
+ error = spa_vdev_remove_top(vd, &txg);
+ } else {
+ /*
+ * There is no vdev of any kind with the specified guid.
+ */
+ error = SET_ERROR(ENOENT);
+ }
+
+ error_log = error;
+
+ if (!locked)
+ error = spa_vdev_exit(spa, NULL, txg, error);
+
+ /*
+ * Logging must be done outside the spa config lock. Otherwise,
+ * this code path could end up holding the spa config lock while
+ * waiting for a txg_sync so it can write to the internal log.
+ * Doing that would prevent the txg sync from actually happening,
+ * causing a deadlock.
+ */
+ if (error_log == 0 && vd_type != NULL && vd_path != NULL) {
+ spa_history_log_internal(spa, "vdev remove", NULL,
+ "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path);
+ }
+ if (vd_path != NULL)
+ spa_strfree(vd_path);
+
+ if (ev != NULL)
+ spa_event_post(ev);
+
+ return (error);
+}
+
+int
+spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
+{
+ prs->prs_state = spa->spa_removing_phys.sr_state;
+
+ if (prs->prs_state == DSS_NONE)
+ return (SET_ERROR(ENOENT));
+
+ prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev;
+ prs->prs_start_time = spa->spa_removing_phys.sr_start_time;
+ prs->prs_end_time = spa->spa_removing_phys.sr_end_time;
+ prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
+ prs->prs_copied = spa->spa_removing_phys.sr_copied;
+
+ prs->prs_mapping_memory = 0;
+ uint64_t indirect_vdev_id =
+ spa->spa_removing_phys.sr_prev_indirect_vdev;
+ while (indirect_vdev_id != -1) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id];
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ prs->prs_mapping_memory += vdev_indirect_mapping_size(vim);
+ indirect_vdev_id = vic->vic_prev_indirect_vdev;
+ }
+
+ return (0);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW,
+ "Ignore hard IO errors when removing device");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, INT, ZMOD_RW,
+ "Largest contiguous segment to allocate when removing device");
+
+ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, INT, ZMOD_RW,
+ "Largest span of free chunks a remap segment can span");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, INT, ZMOD_RW,
+ "Pause device removal after this many bytes are copied "
+ "(debug use only - causes removal to hang)");
+/* END CSTYLED */
+
+EXPORT_SYMBOL(free_from_removing_vdev);
+EXPORT_SYMBOL(spa_removal_get_stats);
+EXPORT_SYMBOL(spa_remove_init);
+EXPORT_SYMBOL(spa_restart_removal);
+EXPORT_SYMBOL(spa_vdev_removal_destroy);
+EXPORT_SYMBOL(spa_vdev_remove);
+EXPORT_SYMBOL(spa_vdev_remove_cancel);
+EXPORT_SYMBOL(spa_vdev_remove_suspend);
+EXPORT_SYMBOL(svr_sync);
diff --git a/sys/contrib/openzfs/module/zfs/vdev_root.c b/sys/contrib/openzfs/module/zfs/vdev_root.c
new file mode 100644
index 000000000000..45ddc2f71927
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_root.c
@@ -0,0 +1,167 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for the pool's root vdev.
+ */
+
+static uint64_t
+vdev_root_core_tvds(vdev_t *vd)
+{
+ uint64_t tvds = 0;
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!cvd->vdev_ishole && !cvd->vdev_islog &&
+ cvd->vdev_ops != &vdev_indirect_ops) {
+ tvds++;
+ }
+ }
+
+ return (tvds);
+}
+
+/*
+ * We should be able to tolerate one failure with absolutely no damage
+ * to our metadata. Two failures will take out space maps, a bunch of
+ * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
+ * place to live. When we get smarter, we can liberalize this policy.
+ * e.g. If we haven't lost two consecutive top-level vdevs, then we are
+ * probably fine. Adding bean counters during alloc/free can make this
+ * future guesswork more accurate.
+ */
+static boolean_t
+too_many_errors(vdev_t *vd, uint64_t numerrors)
+{
+ uint64_t tvds;
+
+ if (numerrors == 0)
+ return (B_FALSE);
+
+ tvds = vdev_root_core_tvds(vd);
+ ASSERT3U(numerrors, <=, tvds);
+
+ if (numerrors == tvds)
+ return (B_TRUE);
+
+ return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa));
+}
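+
+/*
+ * Editor's worked example (hypothetical counts): with five core top-level
+ * vdevs and spa_missing_tvds_allowed() returning 0, a single failed
+ * top-level vdev gives numerrors = 1, which is neither 0 nor equal to
+ * tvds (5), so too_many_errors() returns (1 > 0) == B_TRUE and
+ * vdev_root_open() fails with VDEV_AUX_NO_REPLICAS.
+ */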
+
+static int
+vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *ashift, uint64_t *pshift)
+{
+ spa_t *spa = vd->vdev_spa;
+ int lasterror = 0;
+ int numerrors = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_open_children(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error && !cvd->vdev_islog &&
+ cvd->vdev_ops != &vdev_indirect_ops) {
+ lasterror = cvd->vdev_open_error;
+ numerrors++;
+ }
+ }
+
+ if (spa_load_state(spa) != SPA_LOAD_NONE)
+ spa_set_missing_tvds(spa, numerrors);
+
+ if (too_many_errors(vd, numerrors)) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ *asize = 0;
+ *max_asize = 0;
+ *ashift = 0;
+ *pshift = 0;
+
+ return (0);
+}
+
+static void
+vdev_root_close(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+static void
+vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (too_many_errors(vd, faulted)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ } else if (degraded || faulted) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ } else {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ }
+}
+
+vdev_ops_t vdev_root_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_root_open,
+ .vdev_op_close = vdev_root_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = NULL, /* not applicable to the root */
+ .vdev_op_io_done = NULL, /* not applicable to the root */
+ .vdev_op_state_change = vdev_root_state_change,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c
new file mode 100644
index 000000000000..895957bda195
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c
@@ -0,0 +1,1719 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc_impl.h>
+
+/*
+ * TRIM is a feature which is used to notify an SSD that some previously
+ * written space is no longer allocated by the pool. This is useful because
+ * writes to an SSD must be performed to blocks which have first been erased.
+ * Ensuring the SSD always has a supply of erased blocks for new writes
+ * helps prevent the performance from deteriorating.
+ *
+ * There are two supported TRIM methods; manual and automatic.
+ *
+ * Manual TRIM:
+ *
+ * A manual TRIM is initiated by running the 'zpool trim' command. A single
+ * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for
+ * managing that vdev's TRIM process. This involves iterating over all the
+ * metaslabs, calculating the unallocated space ranges, and then issuing the
+ * required TRIM I/Os.
+ *
+ * While a metaslab is being actively trimmed it is not eligible to perform
+ * new allocations. After traversing all of the metaslabs the thread is
+ * terminated. Finally, both the requested options and current progress of
+ * the TRIM are regularly written to the pool. This allows the TRIM to be
+ * suspended and resumed as needed.
+ *
+ * Automatic TRIM:
+ *
+ * An automatic TRIM is enabled by setting the 'autotrim' pool property
+ * to 'on'. When enabled, a `vdev_autotrim' thread is created for each
+ * top-level (not leaf) vdev in the pool. These threads perform the same
+ * core TRIM process as a manual TRIM, but with a few key differences.
+ *
+ * 1) Automatic TRIM happens continuously in the background and operates
+ * solely on recently freed blocks (ms_trim not ms_allocatable).
+ *
+ * 2) Each thread is associated with a top-level (not leaf) vdev. This has
+ *    the benefit of simplifying the threading model, makes it easier
+ *    to coordinate administrative commands, and ensures only a single
+ * metaslab is disabled at a time. Unlike manual TRIM, this means each
+ * 'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its
+ * children.
+ *
+ * 3) There is no automatic TRIM progress information stored on disk, nor
+ * is it reported by 'zpool status'.
+ *
+ * While the automatic TRIM process is highly effective it is more likely
+ * than a manual TRIM to encounter tiny ranges. Ranges less than or equal to
+ * 'zfs_trim_extent_bytes_min' (32k) are considered too small to efficiently
+ * TRIM and are skipped. This means small amounts of freed space may not
+ * be automatically trimmed.
+ *
+ * Furthermore, devices with attached hot spares and devices being actively
+ * replaced are skipped. This is done to avoid adding additional stress to
+ * a potentially unhealthy device and to minimize the required rebuild time.
+ *
+ * For this reason it may be beneficial to occasionally manually TRIM a pool
+ * even when automatic TRIM is enabled.
+ */
+
+/*
+ * Maximum size of TRIM I/O, ranges will be chunked into 128 MiB lengths.
+ */
+unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;
+
+/*
+ * Minimum size of TRIM I/O, extents smaller than 32 KiB will be skipped.
+ */
+unsigned int zfs_trim_extent_bytes_min = 32 * 1024;
+
+/*
+ * Skip uninitialized metaslabs during the TRIM process. This option is
+ * useful for pools constructed from large thinly-provisioned devices where
+ * TRIM operations are slow. As a pool ages, an increasing fraction of
+ * the pool's metaslabs will be initialized, progressively degrading the
+ * usefulness of this option. This setting is stored when starting a
+ * manual TRIM and will persist for the duration of the requested TRIM.
+ */
+unsigned int zfs_trim_metaslab_skip = 0;
+
+/*
+ * Maximum number of queued TRIM I/Os per leaf vdev. The number of
+ * concurrent TRIM I/Os issued to the device is controlled by the
+ * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
+ */
+unsigned int zfs_trim_queue_limit = 10;
+
+/*
+ * The minimum number of transaction groups between automatic trims of a
+ * metaslab. This setting represents a trade-off between issuing more
+ * efficient TRIM operations, by allowing them to be aggregated longer,
+ * and issuing them promptly so the trimmed space is available. Note
+ * that this value is a minimum; metaslabs can be trimmed less frequently
+ * when there are a large number of ranges which need to be trimmed.
+ *
+ * Increasing this value will allow frees to be aggregated for a longer
+ * time. This can result in larger TRIM operations, and increased memory
+ * usage in order to track the ranges to be trimmed. Decreasing this value
+ * has the opposite effect. The default value of 32 was determined through
+ * testing to be a reasonable compromise.
+ */
+unsigned int zfs_trim_txg_batch = 32;
+
+/*
+ * The trim_args are a control structure which describe how a leaf vdev
+ * should be trimmed. The core elements are the vdev, the metaslab being
+ * trimmed and a range tree containing the extents to TRIM. All provided
+ * ranges must be within the metaslab.
+ */
+typedef struct trim_args {
+ /*
+ * These fields are set by the caller of vdev_trim_ranges().
+ */
+ vdev_t *trim_vdev; /* Leaf vdev to TRIM */
+ metaslab_t *trim_msp; /* Disabled metaslab */
+ range_tree_t *trim_tree; /* TRIM ranges (in metaslab) */
+ trim_type_t trim_type; /* Manual or auto TRIM */
+ uint64_t trim_extent_bytes_max; /* Maximum TRIM I/O size */
+ uint64_t trim_extent_bytes_min; /* Minimum TRIM I/O size */
+ enum trim_flag trim_flags; /* TRIM flags (secure) */
+
+ /*
+ * These fields are updated by vdev_trim_ranges().
+ */
+ hrtime_t trim_start_time; /* Start time */
+ uint64_t trim_bytes_done; /* Bytes trimmed */
+} trim_args_t;
+
+/*
+ * Determines whether a vdev_trim_thread() should be stopped.
+ */
+static boolean_t
+vdev_trim_should_stop(vdev_t *vd)
+{
+ return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) ||
+ vd->vdev_detached || vd->vdev_top->vdev_removing);
+}
+
+/*
+ * Determines whether a vdev_autotrim_thread() should be stopped.
+ */
+static boolean_t
+vdev_autotrim_should_stop(vdev_t *tvd)
+{
+ return (tvd->vdev_autotrim_exit_wanted ||
+ !vdev_writeable(tvd) || tvd->vdev_removing ||
+ spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
+}
+
+/*
+ * The sync task for updating the on-disk state of a manual TRIM. This
+ * is scheduled by vdev_trim_change_state().
+ */
+static void
+vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+ /*
+ * We pass in the guid instead of the vdev_t since the vdev may
+ * have been freed prior to the sync task being processed. This
+ * happens when a vdev is detached as we call spa_config_vdev_exit(),
+ * stop the trimming thread, schedule the sync task, and free
+ * the vdev. Later when the scheduled sync task is invoked, it would
+ * find that the vdev has been freed.
+ */
+ uint64_t guid = *(uint64_t *)arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ kmem_free(arg, sizeof (uint64_t));
+
+ vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ return;
+
+ uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
+ vd->vdev_trim_offset[txg & TXG_MASK] = 0;
+
+ VERIFY3U(vd->vdev_leaf_zap, !=, 0);
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+
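+	/*
+	 * A last offset of UINT64_MAX is the sentinel stored when a new TRIM
+	 * is activated; treat it as a request to reset the offset to zero.
+	 */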
+ if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) {
+
+ if (vd->vdev_trim_last_offset == UINT64_MAX)
+ last_offset = 0;
+
+ vd->vdev_trim_last_offset = last_offset;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
+ sizeof (last_offset), 1, &last_offset, tx));
+ }
+
+ if (vd->vdev_trim_action_time > 0) {
+ uint64_t val = (uint64_t)vd->vdev_trim_action_time;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val),
+ 1, &val, tx));
+ }
+
+ if (vd->vdev_trim_rate > 0) {
+ uint64_t rate = (uint64_t)vd->vdev_trim_rate;
+
+ if (rate == UINT64_MAX)
+ rate = 0;
+
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx));
+ }
+
+ uint64_t partial = vd->vdev_trim_partial;
+ if (partial == UINT64_MAX)
+ partial = 0;
+
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
+ sizeof (partial), 1, &partial, tx));
+
+ uint64_t secure = vd->vdev_trim_secure;
+ if (secure == UINT64_MAX)
+ secure = 0;
+
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
+ sizeof (secure), 1, &secure, tx));
+
+
+ uint64_t trim_state = vd->vdev_trim_state;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
+ sizeof (trim_state), 1, &trim_state, tx));
+}
+
+/*
+ * Update the on-disk state of a manual TRIM. This is called to request
+ * that a TRIM be started/suspended/canceled, or to change one of the
+ * TRIM options (partial, secure, rate).
+ */
+static void
+vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
+ uint64_t rate, boolean_t partial, boolean_t secure)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+ spa_t *spa = vd->vdev_spa;
+
+ if (new_state == vd->vdev_trim_state)
+ return;
+
+ /*
+ * Copy the vd's guid, this will be freed by the sync task.
+ */
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /*
+ * If we're suspending, then preserve the original start time.
+ */
+ if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
+ vd->vdev_trim_action_time = gethrestime_sec();
+ }
+
+ /*
+ * If we're activating, then preserve the requested rate and trim
+ * method. Setting the last offset and rate to UINT64_MAX is used
+ * as a sentinel to indicate they should be reset to default values.
+ */
+ if (new_state == VDEV_TRIM_ACTIVE) {
+ if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE ||
+ vd->vdev_trim_state == VDEV_TRIM_CANCELED) {
+ vd->vdev_trim_last_offset = UINT64_MAX;
+ vd->vdev_trim_rate = UINT64_MAX;
+ vd->vdev_trim_partial = UINT64_MAX;
+ vd->vdev_trim_secure = UINT64_MAX;
+ }
+
+ if (rate != 0)
+ vd->vdev_trim_rate = rate;
+
+ if (partial != 0)
+ vd->vdev_trim_partial = partial;
+
+ if (secure != 0)
+ vd->vdev_trim_secure = secure;
+ }
+
+ vdev_trim_state_t old_state = vd->vdev_trim_state;
+ boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED);
+ vd->vdev_trim_state = new_state;
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
+ guid, tx);
+
+ switch (new_state) {
+ case VDEV_TRIM_ACTIVE:
+ spa_event_notify(spa, vd, NULL,
+ resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START);
+ spa_history_log_internal(spa, "trim", tx,
+ "vdev=%s activated", vd->vdev_path);
+ break;
+ case VDEV_TRIM_SUSPENDED:
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND);
+ spa_history_log_internal(spa, "trim", tx,
+ "vdev=%s suspended", vd->vdev_path);
+ break;
+ case VDEV_TRIM_CANCELED:
+ if (old_state == VDEV_TRIM_ACTIVE ||
+ old_state == VDEV_TRIM_SUSPENDED) {
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
+ spa_history_log_internal(spa, "trim", tx,
+ "vdev=%s canceled", vd->vdev_path);
+ }
+ break;
+ case VDEV_TRIM_COMPLETE:
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
+ spa_history_log_internal(spa, "trim", tx,
+ "vdev=%s complete", vd->vdev_path);
+ break;
+ default:
+ panic("invalid state %llu", (unsigned long long)new_state);
+ }
+
+ dmu_tx_commit(tx);
+
+ if (new_state != VDEV_TRIM_ACTIVE)
+ spa_notify_waiters(spa);
+}
+
+/*
+ * The zio_done_func_t done callback for each manual TRIM issued. It is
+ * responsible for updating the TRIM stats, reissuing failed TRIM I/Os,
+ * and limiting the number of in flight TRIM I/Os.
+ */
+static void
+vdev_trim_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+ if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+ /*
+ * The I/O failed because the vdev was unavailable; roll the
+ * last offset back. (This works because spa_sync waits on
+ * spa_txg_zio before it runs sync tasks.)
+ */
+ uint64_t *offset =
+ &vd->vdev_trim_offset[zio->io_txg & TXG_MASK];
+ *offset = MIN(*offset, zio->io_offset);
+ } else {
+ if (zio->io_error != 0) {
+ vd->vdev_stat.vs_trim_errors++;
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
+ 0, 0, 0, 0, 1, zio->io_orig_size);
+ } else {
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
+ 1, zio->io_orig_size, 0, 0, 0, 0);
+ }
+
+ vd->vdev_trim_bytes_done += zio->io_orig_size;
+ }
+
+ ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0);
+ vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--;
+ cv_broadcast(&vd->vdev_trim_io_cv);
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * The zio_done_func_t done callback for each automatic TRIM issued. It
+ * is responsible for updating the TRIM stats and limiting the number of
+ * in flight TRIM I/Os. Automatic TRIM I/Os are best effort and are
+ * never reissued on failure.
+ */
+static void
+vdev_autotrim_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+
+ if (zio->io_error != 0) {
+ vd->vdev_stat.vs_trim_errors++;
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
+ 0, 0, 0, 0, 1, zio->io_orig_size);
+ } else {
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
+ 1, zio->io_orig_size, 0, 0, 0, 0);
+ }
+
+ ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0);
+ vd->vdev_trim_inflight[TRIM_TYPE_AUTO]--;
+ cv_broadcast(&vd->vdev_trim_io_cv);
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * The zio_done_func_t done callback for each TRIM issued via
+ * vdev_trim_simple(). It is responsible for updating the TRIM stats and
+ * limiting the number of in flight TRIM I/Os. Simple TRIM I/Os are best
+ * effort and are never reissued on failure.
+ */
+static void
+vdev_trim_simple_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+
+ if (zio->io_error != 0) {
+ vd->vdev_stat.vs_trim_errors++;
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
+ 0, 0, 0, 0, 1, zio->io_orig_size);
+ } else {
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
+ 1, zio->io_orig_size, 0, 0, 0, 0);
+ }
+
+ ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0);
+ vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--;
+ cv_broadcast(&vd->vdev_trim_io_cv);
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * Returns the average trim rate in bytes/sec for the ta->trim_vdev.
+ */
+static uint64_t
+vdev_trim_calculate_rate(trim_args_t *ta)
+{
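+	/*
+	 * bytes * 1000 / elapsed milliseconds yields bytes per second; the
+	 * +1 guards against dividing by zero immediately after the start.
+	 */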
+ return (ta->trim_bytes_done * 1000 /
+ (NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1));
+}
+
+/*
+ * Issues a physical TRIM and takes care of rate limiting (bytes/sec)
+ * and number of concurrent TRIM I/Os.
+ */
+static int
+vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = ta->trim_vdev;
+ spa_t *spa = vd->vdev_spa;
+ void *cb;
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+
+ /*
+ * Limit manual TRIM I/Os to the requested rate. This does not
+ * apply to automatic TRIM since no per vdev rate can be specified.
+ */
+ if (ta->trim_type == TRIM_TYPE_MANUAL) {
+ while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) &&
+ vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) {
+ cv_timedwait_idle(&vd->vdev_trim_io_cv,
+ &vd->vdev_trim_io_lock, ddi_get_lbolt() +
+ MSEC_TO_TICK(10));
+ }
+ }
+ ta->trim_bytes_done += size;
+
+ /* Limit in flight trimming I/Os */
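+	/* (one in-flight counter per TRIM type: manual, automatic, simple) */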
+ while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] +
+ vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ vd->vdev_trim_inflight[ta->trim_type]++;
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+ mutex_enter(&vd->vdev_trim_lock);
+
+ if (ta->trim_type == TRIM_TYPE_MANUAL &&
+ vd->vdev_trim_offset[txg & TXG_MASK] == 0) {
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /* This is the first write of this txg. */
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_trim_zap_update_sync, guid, tx);
+ }
+
+ /*
+ * We know the vdev_t will still be around since all consumers of
+ * vdev_free must stop the trimming first.
+ */
+ if ((ta->trim_type == TRIM_TYPE_MANUAL &&
+ vdev_trim_should_stop(vd)) ||
+ (ta->trim_type == TRIM_TYPE_AUTO &&
+ vdev_autotrim_should_stop(vd->vdev_top))) {
+ mutex_enter(&vd->vdev_trim_io_lock);
+ vd->vdev_trim_inflight[ta->trim_type]--;
+ mutex_exit(&vd->vdev_trim_io_lock);
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+ mutex_exit(&vd->vdev_trim_lock);
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINTR));
+ }
+ mutex_exit(&vd->vdev_trim_lock);
+
+ if (ta->trim_type == TRIM_TYPE_MANUAL)
+ vd->vdev_trim_offset[txg & TXG_MASK] = start + size;
+
+ if (ta->trim_type == TRIM_TYPE_MANUAL) {
+ cb = vdev_trim_cb;
+ } else if (ta->trim_type == TRIM_TYPE_AUTO) {
+ cb = vdev_autotrim_cb;
+ } else {
+ cb = vdev_trim_simple_cb;
+ }
+
+ zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
+ start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL,
+ ta->trim_flags));
+ /* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree.
+ * Additional parameters describing how the TRIM should be performed must
+ * be set in the trim_args structure. See the trim_args definition for
+ * additional information.
+ */
+static int
+vdev_trim_ranges(trim_args_t *ta)
+{
+ vdev_t *vd = ta->trim_vdev;
+ zfs_btree_t *t = &ta->trim_tree->rt_root;
+ zfs_btree_index_t idx;
+ uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
+ uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
+ spa_t *spa = vd->vdev_spa;
+
+ ta->trim_start_time = gethrtime();
+ ta->trim_bytes_done = 0;
+
+ for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
+ rs = zfs_btree_next(t, &idx, &idx)) {
+ uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs,
+ ta->trim_tree);
+
+ if (extent_bytes_min && size < extent_bytes_min) {
+ spa_iostats_trim_add(spa, ta->trim_type,
+ 0, 0, 1, size, 0, 0);
+ continue;
+ }
+
+ /* Split range into legally-sized physical chunks */
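+		/* (i.e. writes_required = ceil(size / extent_bytes_max)) */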
+ uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;
+
+ for (uint64_t w = 0; w < writes_required; w++) {
+ int error;
+
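+			/*
+			 * Range tree offsets exclude the leading vdev labels;
+			 * add VDEV_LABEL_START_SIZE to form the physical
+			 * device offset.
+			 */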
+ error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
+ rs_get_start(rs, ta->trim_tree) +
+			    (w * extent_bytes_max), MIN(size -
+ (w * extent_bytes_max), extent_bytes_max));
+ if (error != 0) {
+ return (error);
+ }
+ }
+ }
+
+ return (0);
+}
+
+static void
+vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+{
+ uint64_t *last_rs_end = (uint64_t *)arg;
+
+ if (physical_rs->rs_end > *last_rs_end)
+ *last_rs_end = physical_rs->rs_end;
+}
+
+static void
+vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs)
+{
+ vdev_t *vd = (vdev_t *)arg;
+
+ uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
+ vd->vdev_trim_bytes_est += size;
+
+ if (vd->vdev_trim_last_offset >= physical_rs->rs_end) {
+ vd->vdev_trim_bytes_done += size;
+ } else if (vd->vdev_trim_last_offset > physical_rs->rs_start &&
+ vd->vdev_trim_last_offset <= physical_rs->rs_end) {
+ vd->vdev_trim_bytes_done +=
+ vd->vdev_trim_last_offset - physical_rs->rs_start;
+ }
+}
+
+/*
+ * Calculates the completion percentage of a manual TRIM.
+ */
+static void
+vdev_trim_calculate_progress(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ vd->vdev_trim_bytes_est = 0;
+ vd->vdev_trim_bytes_done = 0;
+
+ for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+ mutex_enter(&msp->ms_lock);
+
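+		/*
+		 * Estimate this leaf's share of the metaslab's free space by
+		 * dividing it evenly across the top-level vdev's disks.
+		 */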
+ uint64_t ms_free = (msp->ms_size -
+ metaslab_allocated_space(msp)) /
+ vdev_get_ndisks(vd->vdev_top);
+
+ /*
+ * Convert the metaslab range to a physical range
+ * on our vdev. We use this to determine if we are
+ * in the middle of this metaslab range.
+ */
+ range_seg64_t logical_rs, physical_rs, remain_rs;
+ logical_rs.rs_start = msp->ms_start;
+ logical_rs.rs_end = msp->ms_start + msp->ms_size;
+
+ /* Metaslab space after this offset has not been trimmed. */
+ vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
+ if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
+ vd->vdev_trim_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /* Metaslab space before this offset has been trimmed */
+ uint64_t last_rs_end = physical_rs.rs_end;
+ if (!vdev_xlate_is_empty(&remain_rs)) {
+ vdev_xlate_walk(vd, &remain_rs,
+ vdev_trim_xlate_last_rs_end, &last_rs_end);
+ }
+
+ if (vd->vdev_trim_last_offset > last_rs_end) {
+ vd->vdev_trim_bytes_done += ms_free;
+ vd->vdev_trim_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * If we get here, we're in the middle of trimming this
+ * metaslab. Load it and walk the free tree for more
+ * accurate progress estimation.
+ */
+ VERIFY0(metaslab_load(msp));
+
+ range_tree_t *rt = msp->ms_allocatable;
+ zfs_btree_t *bt = &rt->rt_root;
+ zfs_btree_index_t idx;
+ for (range_seg_t *rs = zfs_btree_first(bt, &idx);
+ rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
+ logical_rs.rs_start = rs_get_start(rs, rt);
+ logical_rs.rs_end = rs_get_end(rs, rt);
+
+ vdev_xlate_walk(vd, &logical_rs,
+ vdev_trim_xlate_progress, vd);
+ }
+ mutex_exit(&msp->ms_lock);
+ }
+}
+
+/*
+ * Load from disk the vdev's manual TRIM information. This includes the
+ * state, progress, and options provided when initiating the manual TRIM.
+ */
+static int
+vdev_trim_load(vdev_t *vd)
+{
+ int err = 0;
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE ||
+ vd->vdev_trim_state == VDEV_TRIM_SUSPENDED) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
+ sizeof (vd->vdev_trim_last_offset), 1,
+ &vd->vdev_trim_last_offset);
+ if (err == ENOENT) {
+ vd->vdev_trim_last_offset = 0;
+ err = 0;
+ }
+
+ if (err == 0) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE,
+ sizeof (vd->vdev_trim_rate), 1,
+ &vd->vdev_trim_rate);
+ if (err == ENOENT) {
+ vd->vdev_trim_rate = 0;
+ err = 0;
+ }
+ }
+
+ if (err == 0) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
+ sizeof (vd->vdev_trim_partial), 1,
+ &vd->vdev_trim_partial);
+ if (err == ENOENT) {
+ vd->vdev_trim_partial = 0;
+ err = 0;
+ }
+ }
+
+ if (err == 0) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
+ sizeof (vd->vdev_trim_secure), 1,
+ &vd->vdev_trim_secure);
+ if (err == ENOENT) {
+ vd->vdev_trim_secure = 0;
+ err = 0;
+ }
+ }
+ }
+
+ vdev_trim_calculate_progress(vd);
+
+ return (err);
+}
+
+static void
+vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+{
+ trim_args_t *ta = arg;
+ vdev_t *vd = ta->trim_vdev;
+
+ /*
+ * Only a manual trim will be traversing the vdev sequentially.
+ * For an auto trim all valid ranges should be added.
+ */
+ if (ta->trim_type == TRIM_TYPE_MANUAL) {
+
+ /* Only add segments that we have not visited yet */
+ if (physical_rs->rs_end <= vd->vdev_trim_last_offset)
+ return;
+
+ /* Pick up where we left off mid-range. */
+ if (vd->vdev_trim_last_offset > physical_rs->rs_start) {
+ ASSERT3U(physical_rs->rs_end, >,
+ vd->vdev_trim_last_offset);
+ physical_rs->rs_start = vd->vdev_trim_last_offset;
+ }
+ }
+
+ ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
+
+ range_tree_add(ta->trim_tree, physical_rs->rs_start,
+ physical_rs->rs_end - physical_rs->rs_start);
+}
+
+/*
+ * Convert the logical range into physical ranges and add them to the
+ * range tree passed in the trim_args_t.
+ */
+static void
+vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
+{
+ trim_args_t *ta = arg;
+ vdev_t *vd = ta->trim_vdev;
+ range_seg64_t logical_rs;
+ logical_rs.rs_start = start;
+ logical_rs.rs_end = start + size;
+
+ /*
+ * Every range to be trimmed must be part of ms_allocatable.
+ * When ZFS_DEBUG_TRIM is set load the metaslab to verify this
+ * is always the case.
+ */
+ if (zfs_flags & ZFS_DEBUG_TRIM) {
+ metaslab_t *msp = ta->trim_msp;
+ VERIFY0(metaslab_load(msp));
+ VERIFY3B(msp->ms_loaded, ==, B_TRUE);
+ VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
+ }
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg);
+}
+
+/*
+ * Each manual TRIM thread is responsible for trimming the unallocated
+ * space for each leaf vdev. This is accomplished by sequentially iterating
+ * over its top-level metaslabs and issuing TRIM I/O for the space described
+ * by its ms_allocatable. While a metaslab is undergoing trimming it is
+ * not eligible for new allocations.
+ */
+static void
+vdev_trim_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ trim_args_t ta;
+ int error = 0;
+
+ /*
+ * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by
+ * vdev_trim(). Wait for the updated values to be reflected
+ * in the zap in order to start with the requested settings.
+ */
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ vd->vdev_trim_last_offset = 0;
+ vd->vdev_trim_rate = 0;
+ vd->vdev_trim_partial = 0;
+ vd->vdev_trim_secure = 0;
+
+ VERIFY0(vdev_trim_load(vd));
+
+ ta.trim_vdev = vd;
+ ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+ ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
+ ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ ta.trim_type = TRIM_TYPE_MANUAL;
+ ta.trim_flags = 0;
+
+ /*
+ * When a secure TRIM has been requested infer that the intent
+ * is that everything must be trimmed. Override the default
+ * minimum TRIM size to prevent ranges from being skipped.
+ */
+ if (vd->vdev_trim_secure) {
+ ta.trim_flags |= ZIO_TRIM_SECURE;
+ ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+ }
+
+ uint64_t ms_count = 0;
+ for (uint64_t i = 0; !vd->vdev_detached &&
+ i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+
+ /*
+ * If we've expanded the top-level vdev or it's our
+ * first pass, calculate our progress.
+ */
+ if (vd->vdev_top->vdev_ms_count != ms_count) {
+ vdev_trim_calculate_progress(vd);
+ ms_count = vd->vdev_top->vdev_ms_count;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ metaslab_disable(msp);
+ mutex_enter(&msp->ms_lock);
+ VERIFY0(metaslab_load(msp));
+
+ /*
+ * If a partial TRIM was requested skip metaslabs which have
+ * never been initialized and thus have never been written.
+ */
+ if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
+ mutex_exit(&msp->ms_lock);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_trim_calculate_progress(vd);
+ continue;
+ }
+
+ ta.trim_msp = msp;
+ range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, &ta);
+ range_tree_vacate(msp->ms_trim, NULL, NULL);
+ mutex_exit(&msp->ms_lock);
+
+ error = vdev_trim_ranges(&ta);
+ metaslab_enable(msp, B_TRUE, B_FALSE);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ range_tree_vacate(ta.trim_tree, NULL, NULL);
+ if (error != 0)
+ break;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_enter(&vd->vdev_trim_io_lock);
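+	/* Wait for any remaining manual TRIM I/Os to complete. */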
+ while (vd->vdev_trim_inflight[0] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ range_tree_destroy(ta.trim_tree);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
+ vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
+ vd->vdev_trim_rate, vd->vdev_trim_partial,
+ vd->vdev_trim_secure);
+ }
+ ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0);
+
+ /*
+ * Drop the vdev_trim_lock while we sync out the txg since it's
+ * possible that a device might be trying to come online and must
+ * check to see if it needs to restart a trim. That thread will be
+ * holding the spa_config_lock which would prevent the txg_wait_synced
+ * from completing.
+ */
+ mutex_exit(&vd->vdev_trim_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&vd->vdev_trim_lock);
+
+ vd->vdev_trim_thread = NULL;
+ cv_broadcast(&vd->vdev_trim_cv);
+ mutex_exit(&vd->vdev_trim_lock);
+
+ thread_exit();
+}
+
+/*
+ * Initiates a manual TRIM for the vdev_t. Callers must hold vdev_trim_lock,
+ * the vdev_t must be a leaf and cannot already be manually trimming.
+ */
+void
+vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_trim_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure);
+ vd->vdev_trim_thread = thread_create(NULL, 0,
+ vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+}
+
+/*
+ * Wait for the trimming thread to be terminated (canceled or stopped).
+ */
+static void
+vdev_trim_stop_wait_impl(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+
+ while (vd->vdev_trim_thread != NULL)
+ cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock);
+
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ vd->vdev_trim_exit_wanted = B_FALSE;
+}
+
+/*
+ * Wait for vdev trim threads which were listed to cleanly exit.
+ */
+void
+vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
+{
+ vdev_t *vd;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ while ((vd = list_remove_head(vd_list)) != NULL) {
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_stop_wait_impl(vd);
+ mutex_exit(&vd->vdev_trim_lock);
+ }
+}
+
+/*
+ * Stop trimming a device, with the resultant trimming state being tgt_state.
+ * For blocking behavior pass NULL for vd_list. Otherwise, when a list_t is
+ * provided, the stopping vdev is inserted into the list. Callers are then
+ * required to call vdev_trim_stop_wait() to block for all the trim threads
+ * to exit. The caller must hold vdev_trim_lock and must not be writing to
+ * the spa config, as the trimming thread may try to enter the config as a
+ * reader before exiting.
+ */
+void
+vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
+{
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
+ ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+
+ /*
+ * Allow cancel requests to proceed even if the trim thread has
+ * stopped.
+ */
+ if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED)
+ return;
+
+ vdev_trim_change_state(vd, tgt_state, 0, 0, 0);
+ vd->vdev_trim_exit_wanted = B_TRUE;
+
+ if (vd_list == NULL) {
+ vdev_trim_stop_wait_impl(vd);
+ } else {
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ list_insert_tail(vd_list, vd);
+ }
+}
+
+/*
+ * Requests that all listed vdevs stop trimming.
+ */
+static void
+vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state,
+ list_t *vd_list)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_stop(vd, tgt_state, vd_list);
+ mutex_exit(&vd->vdev_trim_lock);
+ return;
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_trim_stop_all_impl(vd->vdev_child[i], tgt_state,
+ vd_list);
+ }
+}
+
+/*
+ * Convenience function to stop trimming of a vdev tree and set all trim
+ * thread pointers to NULL.
+ */
+void
+vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
+{
+ spa_t *spa = vd->vdev_spa;
+ list_t vd_list;
+ vdev_t *vd_l2cache;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_trim_node));
+
+ vdev_trim_stop_all_impl(vd, tgt_state, &vd_list);
+
+ /*
+ * Iterate over cache devices and request stop trimming the
+ * whole device in case we export the pool or remove the cache
+ * device prematurely.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vd_l2cache = spa->spa_l2cache.sav_vdevs[i];
+ vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list);
+ }
+
+ vdev_trim_stop_wait(spa, &vd_list);
+
+ if (vd->vdev_spa->spa_sync_on) {
+ /* Make sure that our state has been synced to disk */
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ }
+
+ list_destroy(&vd_list);
+}
+
+/*
+ * Conditionally restarts a manual TRIM given its on-disk state.
+ */
+void
+vdev_trim_restart(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_leaf_zap != 0) {
+ mutex_enter(&vd->vdev_trim_lock);
+ uint64_t trim_state = VDEV_TRIM_NONE;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
+ sizeof (trim_state), 1, &trim_state);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_trim_state = trim_state;
+
+ uint64_t timestamp = 0;
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME,
+ sizeof (timestamp), 1, &timestamp);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_trim_action_time = timestamp;
+
+ if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
+ vd->vdev_offline) {
+ /* load progress for reporting, but don't resume */
+ VERIFY0(vdev_trim_load(vd));
+ } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE &&
+ vdev_writeable(vd) && !vd->vdev_top->vdev_removing &&
+ vd->vdev_trim_thread == NULL) {
+ VERIFY0(vdev_trim_load(vd));
+ vdev_trim(vd, vd->vdev_trim_rate,
+ vd->vdev_trim_partial, vd->vdev_trim_secure);
+ }
+
+ mutex_exit(&vd->vdev_trim_lock);
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_trim_restart(vd->vdev_child[i]);
+ }
+}
+
+/*
+ * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that
+ * every TRIM range is contained within ms_allocatable.
+ */
+static void
+vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size)
+{
+ trim_args_t *ta = arg;
+ metaslab_t *msp = ta->trim_msp;
+
+ VERIFY3B(msp->ms_loaded, ==, B_TRUE);
+ VERIFY3U(msp->ms_disabled, >, 0);
+ VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
+}
+
+/*
+ * Each automatic TRIM thread is responsible for managing the trimming of a
+ * top-level vdev in the pool. No automatic TRIM state is maintained on-disk.
+ *
+ * N.B. This behavior is different from a manual TRIM where a thread
+ * is created for each leaf vdev, instead of each top-level vdev.
+ */
+static void
+vdev_autotrim_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ int shift = 0;
+
+ mutex_enter(&vd->vdev_autotrim_lock);
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(vd->vdev_autotrim_thread, !=, NULL);
+ mutex_exit(&vd->vdev_autotrim_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ uint64_t extent_bytes_max = zfs_trim_extent_bytes_max;
+ uint64_t extent_bytes_min = zfs_trim_extent_bytes_min;
+
+ while (!vdev_autotrim_should_stop(vd)) {
+ int txgs_per_trim = MAX(zfs_trim_txg_batch, 1);
+ boolean_t issued_trim = B_FALSE;
+
+ /*
+		 * All of the metaslabs are divided into groups of size
+ * num_metaslabs / zfs_trim_txg_batch. Each of these groups
+ * is composed of metaslabs which are spread evenly over the
+ * device.
+ *
+ * For example, when zfs_trim_txg_batch = 32 (default) then
+ * group 0 will contain metaslabs 0, 32, 64, ...;
+ * group 1 will contain metaslabs 1, 33, 65, ...;
+ * group 2 will contain metaslabs 2, 34, 66, ...; and so on.
+ *
+ * On each pass through the while() loop one of these groups
+ * is selected. This is accomplished by using a shift value
+ * to select the starting metaslab, then striding over the
+ * metaslabs using the zfs_trim_txg_batch size. This is
+ * done to accomplish two things.
+ *
+		 * 1) By dividing the metaslabs into groups and making sure
+		 *    that each group takes a minimum of one txg to process,
+		 *    zfs_trim_txg_batch controls the minimum number of
+		 *    txgs which must occur before a metaslab is revisited.
+ *
+ * 2) Selecting non-consecutive metaslabs distributes the
+ * TRIM commands for a group evenly over the entire device.
+ * This can be advantageous for certain types of devices.
+ */
+ for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count;
+ i += txgs_per_trim) {
+ metaslab_t *msp = vd->vdev_ms[i];
+ range_tree_t *trim_tree;
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ metaslab_disable(msp);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Skip the metaslab when it has never been allocated
+ * or when there are no recent frees to trim.
+ */
+ if (msp->ms_sm == NULL ||
+ range_tree_is_empty(msp->ms_trim)) {
+ mutex_exit(&msp->ms_lock);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+ continue;
+ }
+
+ /*
+ * Skip the metaslab when it has already been disabled.
+ * This may happen when a manual TRIM or initialize
+ * operation is running concurrently. In the case
+ * of a manual TRIM, the ms_trim tree will have been
+ * vacated. Only ranges added after the manual TRIM
+ * disabled the metaslab will be included in the tree.
+ * These will be processed when the automatic TRIM
+ * next revisits this metaslab.
+ */
+ if (msp->ms_disabled > 1) {
+ mutex_exit(&msp->ms_lock);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+ continue;
+ }
+
+ /*
+ * Allocate an empty range tree which is swapped in
+ * for the existing ms_trim tree while it is processed.
+ */
+ trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
+ range_tree_swap(&msp->ms_trim, &trim_tree);
+ ASSERT(range_tree_is_empty(msp->ms_trim));
+
+ /*
+ * There are two cases when constructing the per-vdev
+ * trim trees for a metaslab. If the top-level vdev
+ * has no children then it is also a leaf and should
+ * be trimmed. Otherwise our children are the leaves
+ * and a trim tree should be constructed for each.
+ */
+ trim_args_t *tap;
+ uint64_t children = vd->vdev_children;
+ if (children == 0) {
+ children = 1;
+ tap = kmem_zalloc(sizeof (trim_args_t) *
+ children, KM_SLEEP);
+ tap[0].trim_vdev = vd;
+ } else {
+ tap = kmem_zalloc(sizeof (trim_args_t) *
+ children, KM_SLEEP);
+
+ for (uint64_t c = 0; c < children; c++) {
+ tap[c].trim_vdev = vd->vdev_child[c];
+ }
+ }
+
+ for (uint64_t c = 0; c < children; c++) {
+ trim_args_t *ta = &tap[c];
+ vdev_t *cvd = ta->trim_vdev;
+
+ ta->trim_msp = msp;
+ ta->trim_extent_bytes_max = extent_bytes_max;
+ ta->trim_extent_bytes_min = extent_bytes_min;
+ ta->trim_type = TRIM_TYPE_AUTO;
+ ta->trim_flags = 0;
+
+ if (cvd->vdev_detached ||
+ !vdev_writeable(cvd) ||
+ !cvd->vdev_has_trim ||
+ cvd->vdev_trim_thread != NULL) {
+ continue;
+ }
+
+ /*
+ * When a device has an attached hot spare, or
+ * is being replaced it will not be trimmed.
+ * This is done to avoid adding additional
+ * stress to a potentially unhealthy device,
+ * and to minimize the required rebuild time.
+ */
+ if (!cvd->vdev_ops->vdev_op_leaf)
+ continue;
+
+ ta->trim_tree = range_tree_create(NULL,
+ RANGE_SEG64, NULL, 0, 0);
+ range_tree_walk(trim_tree,
+ vdev_trim_range_add, ta);
+ }
+
+ mutex_exit(&msp->ms_lock);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * Issue the TRIM I/Os for all ranges covered by the
+ * TRIM trees. These ranges are safe to TRIM because
+ * no new allocations will be performed until the call
+			 * to metaslab_enable() below.
+ */
+ for (uint64_t c = 0; c < children; c++) {
+ trim_args_t *ta = &tap[c];
+
+ /*
+ * Always yield to a manual TRIM if one has
+ * been started for the child vdev.
+ */
+ if (ta->trim_tree == NULL ||
+ ta->trim_vdev->vdev_trim_thread != NULL) {
+ continue;
+ }
+
+ /*
+ * After this point metaslab_enable() must be
+ * called with the sync flag set. This is done
+ * here because vdev_trim_ranges() is allowed
+ * to be interrupted (EINTR) before issuing all
+ * of the required TRIM I/Os.
+ */
+ issued_trim = B_TRUE;
+
+ int error = vdev_trim_ranges(ta);
+ if (error)
+ break;
+ }
+
+ /*
+ * Verify every range which was trimmed is still
+ * contained within the ms_allocatable tree.
+ */
+ if (zfs_flags & ZFS_DEBUG_TRIM) {
+ mutex_enter(&msp->ms_lock);
+ VERIFY0(metaslab_load(msp));
+ VERIFY3P(tap[0].trim_msp, ==, msp);
+ range_tree_walk(trim_tree,
+ vdev_trim_range_verify, &tap[0]);
+ mutex_exit(&msp->ms_lock);
+ }
+
+ range_tree_vacate(trim_tree, NULL, NULL);
+ range_tree_destroy(trim_tree);
+
+ metaslab_enable(msp, issued_trim, B_FALSE);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ for (uint64_t c = 0; c < children; c++) {
+ trim_args_t *ta = &tap[c];
+
+ if (ta->trim_tree == NULL)
+ continue;
+
+ range_tree_vacate(ta->trim_tree, NULL, NULL);
+ range_tree_destroy(ta->trim_tree);
+ }
+
+ kmem_free(tap, sizeof (trim_args_t) * children);
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * After completing the group of metaslabs wait for the next
+ * open txg. This is done to make sure that a minimum of
+ * zfs_trim_txg_batch txgs will occur before these metaslabs
+ * are trimmed again.
+ */
+ txg_wait_open(spa_get_dsl(spa), 0, issued_trim);
+
+ shift++;
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ }
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ mutex_enter(&cvd->vdev_trim_io_lock);
+
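+		/* Wait for outstanding automatic TRIM I/Os on this child. */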
+ while (cvd->vdev_trim_inflight[1] > 0) {
+ cv_wait(&cvd->vdev_trim_io_cv,
+ &cvd->vdev_trim_io_lock);
+ }
+ mutex_exit(&cvd->vdev_trim_io_lock);
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * When exiting because the autotrim property was set to off, then
+ * abandon any unprocessed ms_trim ranges to reclaim the memory.
+ */
+ if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) {
+ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_ms[i];
+
+ mutex_enter(&msp->ms_lock);
+ range_tree_vacate(msp->ms_trim, NULL, NULL);
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+
+ mutex_enter(&vd->vdev_autotrim_lock);
+ ASSERT(vd->vdev_autotrim_thread != NULL);
+ vd->vdev_autotrim_thread = NULL;
+ cv_broadcast(&vd->vdev_autotrim_cv);
+ mutex_exit(&vd->vdev_autotrim_lock);
+
+ thread_exit();
+}
+
+/*
+ * Starts an autotrim thread, if needed, for each top-level vdev which can be
+ * trimmed. A top-level vdev which has been evacuated will never be trimmed.
+ */
+void
+vdev_autotrim(spa_t *spa)
+{
+ vdev_t *root_vd = spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < root_vd->vdev_children; i++) {
+ vdev_t *tvd = root_vd->vdev_child[i];
+
+ mutex_enter(&tvd->vdev_autotrim_lock);
+ if (vdev_writeable(tvd) && !tvd->vdev_removing &&
+ tvd->vdev_autotrim_thread == NULL) {
+ ASSERT3P(tvd->vdev_top, ==, tvd);
+
+ tvd->vdev_autotrim_thread = thread_create(NULL, 0,
+ vdev_autotrim_thread, tvd, 0, &p0, TS_RUN,
+ maxclsyspri);
+ ASSERT(tvd->vdev_autotrim_thread != NULL);
+ }
+ mutex_exit(&tvd->vdev_autotrim_lock);
+ }
+}
+
+/*
+ * Wait for the vdev_autotrim_thread associated with the passed top-level
+ * vdev to be terminated (canceled or stopped).
+ */
+void
+vdev_autotrim_stop_wait(vdev_t *tvd)
+{
+ mutex_enter(&tvd->vdev_autotrim_lock);
+ if (tvd->vdev_autotrim_thread != NULL) {
+ tvd->vdev_autotrim_exit_wanted = B_TRUE;
+
+ while (tvd->vdev_autotrim_thread != NULL) {
+ cv_wait(&tvd->vdev_autotrim_cv,
+ &tvd->vdev_autotrim_lock);
+ }
+
+ ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL);
+ tvd->vdev_autotrim_exit_wanted = B_FALSE;
+ }
+ mutex_exit(&tvd->vdev_autotrim_lock);
+}
+
+/*
+ * Wait for all of the vdev_autotrim threads associated with the pool to
+ * be terminated (canceled or stopped).
+ */
+void
+vdev_autotrim_stop_all(spa_t *spa)
+{
+ vdev_t *root_vd = spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < root_vd->vdev_children; i++)
+ vdev_autotrim_stop_wait(root_vd->vdev_child[i]);
+}
+
+/*
+ * Conditionally restart all of the vdev_autotrim threads for the pool.
+ */
+void
+vdev_autotrim_restart(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (spa->spa_autotrim)
+ vdev_autotrim(spa);
+}
+
+static void
+vdev_trim_l2arc_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ l2arc_dev_t *dev = l2arc_vdev_get(vd);
+ trim_args_t ta;
+ range_seg64_t physical_rs;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ vd->vdev_trim_last_offset = 0;
+ vd->vdev_trim_rate = 0;
+ vd->vdev_trim_partial = 0;
+ vd->vdev_trim_secure = 0;
+
+ bzero(&ta, sizeof (ta));
+ ta.trim_vdev = vd;
+ ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ ta.trim_type = TRIM_TYPE_MANUAL;
+ ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+ ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+ ta.trim_flags = 0;
+
+ physical_rs.rs_start = vd->vdev_trim_bytes_done = 0;
+ physical_rs.rs_end = vd->vdev_trim_bytes_est =
+ vdev_get_min_asize(vd);
+
+ range_tree_add(ta.trim_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
+ mutex_exit(&vd->vdev_trim_lock);
+
+ (void) vdev_trim_ranges(&ta);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_enter(&vd->vdev_trim_io_lock);
+ while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ range_tree_vacate(ta.trim_tree, NULL, NULL);
+ range_tree_destroy(ta.trim_tree);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
+ vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
+ vd->vdev_trim_rate, vd->vdev_trim_partial,
+ vd->vdev_trim_secure);
+ }
+ ASSERT(vd->vdev_trim_thread != NULL ||
+ vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0);
+
+ /*
+ * Drop the vdev_trim_lock while we sync out the txg since it's
+ * possible that a device might be trying to come online and
+ * must check to see if it needs to restart a trim. That thread
+ * will be holding the spa_config_lock which would prevent the
+ * txg_wait_synced from completing. Same strategy as in
+ * vdev_trim_thread().
+ */
+ mutex_exit(&vd->vdev_trim_lock);
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ mutex_enter(&vd->vdev_trim_lock);
+
+ /*
+ * Update the header of the cache device here, before
+ * broadcasting vdev_trim_cv which may lead to the removal
+ * of the device. The same applies for setting l2ad_trim_all to
+ * false.
+ */
+ spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd,
+ RW_READER);
+ bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize);
+ l2arc_dev_hdr_update(dev);
+ spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd);
+
+ vd->vdev_trim_thread = NULL;
+ if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE)
+ dev->l2ad_trim_all = B_FALSE;
+
+ cv_broadcast(&vd->vdev_trim_cv);
+ mutex_exit(&vd->vdev_trim_lock);
+
+ thread_exit();
+}
+
+/*
+ * Punches out TRIM threads for the L2ARC devices in a spa and assigns them
+ * to vd->vdev_trim_thread variable. This facilitates the management of
+ * trimming the whole cache device using TRIM_TYPE_MANUAL upon addition
+ * to a pool or pool creation or when the header of the device is invalid.
+ */
+void
+vdev_trim_l2arc(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /*
+ * Locate the spa's l2arc devices and kick off TRIM threads.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vdev_t *vd = spa->spa_l2cache.sav_vdevs[i];
+ l2arc_dev_t *dev = l2arc_vdev_get(vd);
+
+ if (dev == NULL || !dev->l2ad_trim_all) {
+ /*
+ * Don't attempt TRIM if the vdev is UNAVAIL or if the
+ * cache device was not marked for whole device TRIM
+			 * (i.e. l2arc_trim_ahead = 0, or the L2ARC device header
+ * is valid with trim_state = VDEV_TRIM_COMPLETE and
+ * l2ad_log_entries > 0).
+ */
+ continue;
+ }
+
+ mutex_enter(&vd->vdev_trim_lock);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_trim_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+ vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
+ vd->vdev_trim_thread = thread_create(NULL, 0,
+ vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&vd->vdev_trim_lock);
+ }
+}
+
+/*
+ * A wrapper which calls vdev_trim_ranges(). It is intended to be called
+ * on leaf vdevs.
+ */
+int
+vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
+{
+ trim_args_t ta;
+ range_seg64_t physical_rs;
+ int error;
+ physical_rs.rs_start = start;
+ physical_rs.rs_end = start + size;
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ bzero(&ta, sizeof (ta));
+ ta.trim_vdev = vd;
+ ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ ta.trim_type = TRIM_TYPE_SIMPLE;
+ ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+ ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+ ta.trim_flags = 0;
+
+ ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+ if (physical_rs.rs_end > physical_rs.rs_start) {
+ range_tree_add(ta.trim_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+ }
+
+ error = vdev_trim_ranges(&ta);
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+ while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ range_tree_vacate(ta.trim_tree, NULL, NULL);
+ range_tree_destroy(ta.trim_tree);
+
+ return (error);
+}
+
+EXPORT_SYMBOL(vdev_trim);
+EXPORT_SYMBOL(vdev_trim_stop);
+EXPORT_SYMBOL(vdev_trim_stop_all);
+EXPORT_SYMBOL(vdev_trim_stop_wait);
+EXPORT_SYMBOL(vdev_trim_restart);
+EXPORT_SYMBOL(vdev_autotrim);
+EXPORT_SYMBOL(vdev_autotrim_stop_all);
+EXPORT_SYMBOL(vdev_autotrim_stop_wait);
+EXPORT_SYMBOL(vdev_autotrim_restart);
+EXPORT_SYMBOL(vdev_trim_l2arc);
+EXPORT_SYMBOL(vdev_trim_simple);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW,
+ "Max size of TRIM commands, larger will be split");
+
+ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW,
+ "Min size of TRIM commands, smaller will be skipped");
+
+ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW,
+ "Skip metaslabs which have never been initialized");
+
+ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW,
+ "Min number of txgs to aggregate frees before issuing TRIM");
+
+ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW,
+ "Max queued TRIMs outstanding per leaf vdev");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zap.c b/sys/contrib/openzfs/module/zfs/zap.c
new file mode 100644
index 000000000000..c0c280c52076
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zap.c
@@ -0,0 +1,1384 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+/*
+ * This file contains the top half of the zfs directory structure
+ * implementation. The bottom half is in zap_leaf.c.
+ *
+ * The zdir is an extendable hash data structure. There is a table of
+ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
+ * each a constant size and hold a variable number of directory entries.
+ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
+ *
+ * The pointer table holds a power of 2 number of pointers.
+ * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
+ * by the pointer at index i in the table holds entries whose hash value
+ * has a zd_prefix_len-bit prefix.
+ */
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+
+/*
+ * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
+ * (all leaf blocks) when we start iterating over it.
+ *
+ * For zap_cursor_init(), the callers all intend to iterate through all the
+ * entries. There are a few cases where an error (typically i/o error) could
+ * cause it to bail out early.
+ *
+ * For zap_cursor_init_serialized(), there are callers that do the iteration
+ * outside of ZFS. Typically they would iterate over everything, but we
+ * don't have control of that. E.g. zfs_ioc_snapshot_list_next(),
+ * zcp_snapshots_iter(), and other iterators over things in the MOS - these
+ * are called by /sbin/zfs and channel programs. The other example is
+ * zfs_readdir() which iterates over directory entries for the getdents()
+ * syscall. /sbin/ls iterates to the end (unless it receives a signal), but
+ * userland doesn't have to.
+ *
+ * Given that the ZAP entries aren't returned in a specific order, the only
+ * legitimate use cases for partial iteration would be:
+ *
+ * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
+ * get the first 100 and then wait for the user to hit "next page" (which
+ * they may never do).
+ *
+ * 2. You want to know if there are more than X entries, without relying on
+ * the zfs-specific implementation of the directory's st_size (which is
+ * the number of entries).
+ */
+int zap_iterate_prefetch = B_TRUE;
+
+int fzap_default_block_shift = 14; /* 16k blocksize */
+
+extern inline zap_phys_t *zap_f_phys(zap_t *zap);
+
+static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+
+void
+fzap_byteswap(void *vbuf, size_t size)
+{
+ uint64_t block_type = *(uint64_t *)vbuf;
+
+ if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
+ zap_leaf_byteswap(vbuf, size);
+ else {
+ /* it's a ptrtbl block */
+ byteswap_uint64_array(vbuf, size);
+ }
+}
+
+void
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
+{
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ zap->zap_ismicro = FALSE;
+
+ zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
+ zap->zap_dbu.dbu_evict_func_async = NULL;
+
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0);
+ zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
+
+ zap_phys_t *zp = zap_f_phys(zap);
+ /*
+ * explicitly zero it since it might be coming from an
+ * initialized microzap
+ */
+ bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
+ zp->zap_block_type = ZBT_HEADER;
+ zp->zap_magic = ZAP_MAGIC;
+
+ zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
+
+ zp->zap_freeblk = 2; /* block 1 will be the first leaf */
+ zp->zap_num_leafs = 1;
+ zp->zap_num_entries = 0;
+ zp->zap_salt = zap->zap_salt;
+ zp->zap_normflags = zap->zap_normflags;
+ zp->zap_flags = flags;
+
+ /* block 1 will be the first leaf */
+ for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
+ ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
+
+ /*
+ * set up block 1 - the first leaf
+ */
+ dmu_buf_t *db;
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
+ dmu_buf_will_dirty(db, tx);
+
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ l->l_dbuf = db;
+
+ zap_leaf_init(l, zp->zap_normflags != 0);
+
+ kmem_free(l, sizeof (zap_leaf_t));
+ dmu_buf_rele(db, FTAG);
+}
+
+static int
+zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
+{
+ if (RW_WRITE_HELD(&zap->zap_rwlock))
+ return (1);
+ if (rw_tryupgrade(&zap->zap_rwlock)) {
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Generic routines for dealing with the pointer & cookie tables.
+ */
+
+static int
+zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
+ void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
+ dmu_tx_t *tx)
+{
+ uint64_t newblk;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int hepb = 1<<(bs-4);
+ /* hepb = half the number of entries in a block */
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT(tbl->zt_blk != 0);
+ ASSERT(tbl->zt_numblks > 0);
+
+ if (tbl->zt_nextblk != 0) {
+ newblk = tbl->zt_nextblk;
+ } else {
+ newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
+ tbl->zt_nextblk = newblk;
+ ASSERT0(tbl->zt_blks_copied);
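+		/*
+		 * Prefetch the entire old table; it is copied to its new
+		 * location one block per call below.
+		 */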
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+
+ /*
+ * Copy the ptrtbl from the old to new location.
+ */
+
+ uint64_t b = tbl->zt_blks_copied;
+ dmu_buf_t *db_old;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+
+ /* first half of entries in old[b] go to new[2*b+0] */
+ dmu_buf_t *db_new;
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
+ dmu_buf_will_dirty(db_new, tx);
+ transfer_func(db_old->db_data, db_new->db_data, hepb);
+ dmu_buf_rele(db_new, FTAG);
+
+ /* second half of entries in old[b] go to new[2*b+1] */
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
+ dmu_buf_will_dirty(db_new, tx);
+ transfer_func((uint64_t *)db_old->db_data + hepb,
+ db_new->db_data, hepb);
+ dmu_buf_rele(db_new, FTAG);
+
+ dmu_buf_rele(db_old, FTAG);
+
+ tbl->zt_blks_copied++;
+
+ dprintf("copied block %llu of %llu\n",
+ tbl->zt_blks_copied, tbl->zt_numblks);
+
+ if (tbl->zt_blks_copied == tbl->zt_numblks) {
+ (void) dmu_free_range(zap->zap_objset, zap->zap_object,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
+
+ tbl->zt_blk = newblk;
+ tbl->zt_numblks *= 2;
+ tbl->zt_shift++;
+ tbl->zt_nextblk = 0;
+ tbl->zt_blks_copied = 0;
+
+ dprintf("finished; numblocks now %llu (%uk entries)\n",
+ tbl->zt_numblks, 1<<(tbl->zt_shift-10));
+ }
+
+ return (0);
+}
+
+static int
+zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
+ dmu_tx_t *tx)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(tbl->zt_blk != 0);
+
+ dprintf("storing %llx at index %llx\n", val, idx);
+
+ uint64_t blk = idx >> (bs-3);
+ uint64_t off = idx & ((1<<(bs-3))-1);
+
+ dmu_buf_t *db;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+ dmu_buf_will_dirty(db, tx);
+
+ if (tbl->zt_nextblk != 0) {
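+		/*
+		 * A table grow is in progress; also write the value into
+		 * both of the entries which will replace idx in the doubled
+		 * table.
+		 */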
+ uint64_t idx2 = idx * 2;
+ uint64_t blk2 = idx2 >> (bs-3);
+ uint64_t off2 = idx2 & ((1<<(bs-3))-1);
+ dmu_buf_t *db2;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
+ DMU_READ_NO_PREFETCH);
+ if (err != 0) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+ dmu_buf_will_dirty(db2, tx);
+ ((uint64_t *)db2->db_data)[off2] = val;
+ ((uint64_t *)db2->db_data)[off2+1] = val;
+ dmu_buf_rele(db2, FTAG);
+ }
+
+ ((uint64_t *)db->db_data)[off] = val;
+ dmu_buf_rele(db, FTAG);
+
+ return (0);
+}
+
+static int
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ uint64_t blk = idx >> (bs-3);
+ uint64_t off = idx & ((1<<(bs-3))-1);
+
+ /*
+ * Note: this is equivalent to dmu_buf_hold(), but we use the
+ * _dnode_enter / _by_dnode variants because they are faster: they
+ * avoid having to hold the dnode.
+ */
+ dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ dmu_buf_t *db;
+ int err = dmu_buf_hold_by_dnode(dn,
+ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
+ if (err != 0)
+ return (err);
+ *valp = ((uint64_t *)db->db_data)[off];
+ dmu_buf_rele(db, FTAG);
+
+ if (tbl->zt_nextblk != 0) {
+ /*
+ * Read the nextblk for the sake of i/o error checking, so
+ * that zap_table_load() will catch errors on behalf of
+ * zap_table_store().
+ */
+ blk = (idx*2) >> (bs-3);
+
+ dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ err = dmu_buf_hold_by_dnode(dn,
+ (tbl->zt_nextblk + blk) << bs, FTAG, &db,
+ DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
+ if (err == 0)
+ dmu_buf_rele(db, FTAG);
+ }
+ return (err);
+}
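
The blk/off split used here and in zap_table_store() is just fixed-width index arithmetic. A minimal standalone sketch (plain userspace C, not module code), assuming a hypothetical block shift of 14:

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Same index split as zap_table_store()/zap_table_load():
     * each table block holds 2^(bs-3) uint64_t entries.
     */
    static void
    idx_to_blk_off(uint64_t idx, int bs, uint64_t *blk, uint64_t *off)
    {
        *blk = idx >> (bs - 3);
        *off = idx & ((1ULL << (bs - 3)) - 1);
    }

    int
    main(void)
    {
        uint64_t blk, off;
        int bs = 14;    /* assumed FZAP block shift */

        idx_to_blk_off(5000, bs, &blk, &off);
        printf("idx 5000 -> block %llu, offset %llu\n",
            (unsigned long long)blk, (unsigned long long)off);
        return (0);
    }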
+
+/*
+ * Routines for growing the ptrtbl.
+ */
+
+static void
+zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
+{
+ for (int i = 0; i < n; i++) {
+ uint64_t lb = src[i];
+ dst[2 * i + 0] = lb;
+ dst[2 * i + 1] = lb;
+ }
+}
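
A small standalone sketch of what this transfer does, with made-up leaf block numbers: every pointer is duplicated, so once the table has doubled, the two indexes that differ only in the newly added hash bit still point at the same leaf until that leaf itself splits.

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* Made-up old pointer table: index -> leaf block number. */
        uint64_t src[4] = { 10, 10, 11, 12 };
        uint64_t dst[8];

        /* The same duplication zap_ptrtbl_transfer() performs. */
        for (int i = 0; i < 4; i++) {
            dst[2 * i + 0] = src[i];
            dst[2 * i + 1] = src[i];
        }

        /*
         * With one more hash bit in use, indexes 2i and 2i+1 still point
         * at the leaf that index i pointed at, so no leaf has to move
         * while the table grows.
         */
        for (int i = 0; i < 8; i++)
            printf("new[%d] = %llu\n", i, (unsigned long long)dst[i]);
        return (0);
    }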
+
+static int
+zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
+{
+ /*
+ * The pointer table should never use more hash bits than we
+ * have (otherwise we'd be using useless zero bits to index it).
+ * If we are within 2 bits of running out, stop growing, since
+ * this is already an aberrant condition.
+ */
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
+ return (SET_ERROR(ENOSPC));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+ /*
+ * We are outgrowing the "embedded" ptrtbl (the one
+ * stored in the header block). Give it its own entire
+ * block, which will double the size of the ptrtbl.
+ */
+ ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+ ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
+
+ uint64_t newblk = zap_allocate_blocks(zap, 1);
+ dmu_buf_t *db_new;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
+ DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+ dmu_buf_will_dirty(db_new, tx);
+ zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ dmu_buf_rele(db_new, FTAG);
+
+ zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
+
+ ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
+ (FZAP_BLOCK_SHIFT(zap)-3));
+
+ return (0);
+ } else {
+ return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
+ zap_ptrtbl_transfer, tx));
+ }
+}
+
+static void
+zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx);
+ ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
+ zap_f_phys(zap)->zap_num_entries += delta;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+}
+
+static uint64_t
+zap_allocate_blocks(zap_t *zap, int nblocks)
+{
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ uint64_t newblk = zap_f_phys(zap)->zap_freeblk;
+ zap_f_phys(zap)->zap_freeblk += nblocks;
+ return (newblk);
+}
+
+static void
+zap_leaf_evict_sync(void *dbu)
+{
+ zap_leaf_t *l = dbu;
+
+ rw_destroy(&l->l_rwlock);
+ kmem_free(l, sizeof (zap_leaf_t));
+}
+
+static zap_leaf_t *
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
+{
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = zap_allocate_blocks(zap, 1);
+ l->l_dbuf = NULL;
+
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
+ DMU_READ_NO_PREFETCH));
+ dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
+ VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu));
+ dmu_buf_will_dirty(l->l_dbuf, tx);
+
+ zap_leaf_init(l, zap->zap_normflags != 0);
+
+ zap_f_phys(zap)->zap_num_leafs++;
+
+ return (l);
+}
+
+int
+fzap_count(zap_t *zap, uint64_t *count)
+{
+ ASSERT(!zap->zap_ismicro);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
+ *count = zap_f_phys(zap)->zap_num_entries;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+ return (0);
+}
+
+/*
+ * Routines for obtaining zap_leaf_t's
+ */
+
+void
+zap_put_leaf(zap_leaf_t *l)
+{
+ rw_exit(&l->l_rwlock);
+ dmu_buf_rele(l->l_dbuf, NULL);
+}
+
+static zap_leaf_t *
+zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
+{
+ ASSERT(blkid != 0);
+
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = blkid;
+ l->l_bs = highbit64(db->db_size) - 1;
+ l->l_dbuf = db;
+
+ dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
+ zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu);
+
+ rw_exit(&l->l_rwlock);
+ if (winner != NULL) {
+ /* someone else set it first */
+ zap_leaf_evict_sync(&l->l_dbu);
+ l = winner;
+ }
+
+ /*
+ * lh_pad1 was previously used for the next leaf in the leaf
+ * chain. There should be no chained leaves (we have removed
+ * support for them).
+ */
+ ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
+
+ /*
+ * There should be more hash entries than there can be
+ * chunks to put in the hash table
+ */
+ ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
+
+ /* The chunks should begin at the end of the hash table */
+ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *)
+ &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+
+ /* The chunks should end at the end of the block */
+ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
+ (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
+
+ return (l);
+}
+
+static int
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
+ zap_leaf_t **lp)
+{
+ dmu_buf_t *db;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ /*
+ * If the system crashed just after dmu_free_long_range() in
+ * zfs_rmnode(), we would be left with an empty xattr dir in the
+ * delete queue, and blkid=0 would then be passed in when doing
+ * zfs_purgedir(). If that's the case, just return immediately;
+ * the underlying objects should already be freed, so this is
+ * perfectly fine.
+ */
+ if (blkid == 0)
+ return (SET_ERROR(ENOENT));
+
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ int err = dmu_buf_hold_by_dnode(dn,
+ blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
+ if (err != 0)
+ return (err);
+
+ ASSERT3U(db->db_object, ==, zap->zap_object);
+ ASSERT3U(db->db_offset, ==, blkid << bs);
+ ASSERT3U(db->db_size, ==, 1 << bs);
+ ASSERT(blkid != 0);
+
+ zap_leaf_t *l = dmu_buf_get_user(db);
+
+ if (l == NULL)
+ l = zap_open_leaf(blkid, db);
+
+ rw_enter(&l->l_rwlock, lt);
+ /*
+ * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
+ * causing ASSERT below to fail.
+ */
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(l->l_blkid, ==, blkid);
+ ASSERT3P(l->l_dbuf, ==, db);
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ *lp = l;
+ return (0);
+}
+
+static int
+zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
+{
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+ ASSERT3U(idx, <,
+ (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
+ *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
+ return (0);
+ } else {
+ return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
+ idx, valp));
+ }
+}
+
+static int
+zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
+{
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
+ ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
+ return (0);
+ } else {
+ return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
+ idx, blk, tx));
+ }
+}
+
+static int
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
+{
+ uint64_t blk;
+
+ ASSERT(zap->zap_dbuf == NULL ||
+ zap_f_phys(zap) == zap->zap_dbuf->db_data);
+
+ /* Reality check for corrupt zap objects (leaf or header). */
+ if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
+ zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
+ zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
+ return (SET_ERROR(EIO));
+ }
+
+ uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ int err = zap_idx_to_blk(zap, idx, &blk);
+ if (err != 0)
+ return (err);
+ err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
+
+ ASSERT(err ||
+ ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
+ zap_leaf_phys(*lp)->l_hdr.lh_prefix);
+ return (err);
+}
+
+static int
+zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
+ void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
+{
+ zap_t *zap = zn->zn_zap;
+ uint64_t hash = zn->zn_hash;
+ int err;
+ int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+ ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ zap_leaf_phys(l)->l_hdr.lh_prefix);
+
+ if (zap_tryupgradedir(zap, tx) == 0 ||
+ old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
+ /* We failed to upgrade, or need to grow the pointer table */
+ objset_t *os = zap->zap_objset;
+ uint64_t object = zap->zap_object;
+
+ zap_put_leaf(l);
+ zap_unlockdir(zap, tag);
+ err = zap_lockdir(os, object, tx, RW_WRITER,
+ FALSE, FALSE, tag, &zn->zn_zap);
+ zap = zn->zn_zap;
+ if (err != 0)
+ return (err);
+ ASSERT(!zap->zap_ismicro);
+
+ while (old_prefix_len ==
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
+ err = zap_grow_ptrtbl(zap, tx);
+ if (err != 0)
+ return (err);
+ }
+
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+
+ if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
+ /* it split while our locks were down */
+ *lp = l;
+ return (0);
+ }
+ }
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ zap_leaf_phys(l)->l_hdr.lh_prefix);
+
+ int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+ (old_prefix_len + 1);
+ uint64_t sibling =
+ (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
+
+ /* check for i/o errors before doing zap_leaf_split */
+ for (int i = 0; i < (1ULL << prefix_diff); i++) {
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, sibling + i, &blk);
+ if (err != 0)
+ return (err);
+ ASSERT3U(blk, ==, l->l_blkid);
+ }
+
+ zap_leaf_t *nl = zap_create_leaf(zap, tx);
+ zap_leaf_split(l, nl, zap->zap_normflags != 0);
+
+ /* set sibling pointers */
+ for (int i = 0; i < (1ULL << prefix_diff); i++) {
+ err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx);
+ ASSERT0(err); /* we checked for i/o errors above */
+ }
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0);
+
+ if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
+ /* we want the sibling */
+ zap_put_leaf(l);
+ *lp = nl;
+ } else {
+ zap_put_leaf(nl);
+ *lp = l;
+ }
+
+ return (0);
+}
+
+static void
+zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
+ void *tag, dmu_tx_t *tx)
+{
+ zap_t *zap = zn->zn_zap;
+ int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
+ zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+
+ zap_put_leaf(l);
+
+ if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
+ /*
+ * We are in the middle of growing the pointer table, or
+ * this leaf will soon make us grow it.
+ */
+ if (zap_tryupgradedir(zap, tx) == 0) {
+ objset_t *os = zap->zap_objset;
+ uint64_t zapobj = zap->zap_object;
+
+ zap_unlockdir(zap, tag);
+ int err = zap_lockdir(os, zapobj, tx,
+ RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
+ zap = zn->zn_zap;
+ if (err != 0)
+ return;
+ }
+
+ /* could have finished growing while our locks were down */
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
+ (void) zap_grow_ptrtbl(zap, tx);
+ }
+}
+
+static int
+fzap_checkname(zap_name_t *zn)
+{
+ if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ return (0);
+}
+
+static int
+fzap_checksize(uint64_t integer_size, uint64_t num_integers)
+{
+ /* Only integer sizes supported by C */
+ switch (integer_size) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ break;
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (integer_size * num_integers > ZAP_MAXVALUELEN)
+ return (SET_ERROR(E2BIG));
+
+ return (0);
+}
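
A standalone sketch of the same size checks, with a placeholder limit (the real constant is ZAP_MAXVALUELEN; 8192 below is only an assumed value for illustration):

    #include <stdio.h>
    #include <stdint.h>
    #include <errno.h>

    #define MAXVALUELEN 8192    /* placeholder for ZAP_MAXVALUELEN */

    static int
    checksize(uint64_t integer_size, uint64_t num_integers)
    {
        /* Same shape as fzap_checksize(): C integer widths only. */
        switch (integer_size) {
        case 1:
        case 2:
        case 4:
        case 8:
            break;
        default:
            return (EINVAL);
        }

        if (integer_size * num_integers > MAXVALUELEN)
            return (E2BIG);
        return (0);
    }

    int
    main(void)
    {
        /* 3-byte integers are rejected; an oversized value returns E2BIG. */
        printf("%d %d %d\n",
            checksize(3, 1), checksize(8, 2000), checksize(8, 1000));
        return (0);
    }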
+
+static int
+fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
+{
+ int err = fzap_checkname(zn);
+ if (err != 0)
+ return (err);
+ return (fzap_checksize(integer_size, num_integers));
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+int
+fzap_lookup(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ char *realname, int rn_len, boolean_t *ncp)
+{
+ zap_leaf_t *l;
+ zap_entry_handle_t zeh;
+
+ int err = fzap_checkname(zn);
+ if (err != 0)
+ return (err);
+
+ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err == 0) {
+ if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
+ zap_put_leaf(l);
+ return (err);
+ }
+
+ err = zap_entry_read(&zeh, integer_size, num_integers, buf);
+ (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
+ if (ncp) {
+ *ncp = zap_entry_normalization_conflict(&zeh,
+ zn, NULL, zn->zn_zap);
+ }
+ }
+
+ zap_put_leaf(l);
+ return (err);
+}
+
+int
+fzap_add_cd(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ int err;
+ zap_entry_handle_t zeh;
+ zap_t *zap = zn->zn_zap;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(!zap->zap_ismicro);
+ ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
+
+ err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err == 0) {
+ err = SET_ERROR(EEXIST);
+ goto out;
+ }
+ if (err != ENOENT)
+ goto out;
+
+ err = zap_entry_create(l, zn, cd,
+ integer_size, num_integers, val, &zeh);
+
+ if (err == 0) {
+ zap_increment_num_entries(zap, 1, tx);
+ } else if (err == EAGAIN) {
+ err = zap_expand_leaf(zn, l, tag, tx, &l);
+ zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
+ if (err == 0) {
+ goto retry;
+ } else if (err == ENOSPC) {
+ /*
+ * If we failed to expand the leaf, then bail out, as
+ * there is no point in trying
+ * zap_put_leaf_maybe_grow_ptrtbl().
+ */
+ return (err);
+ }
+ }
+
+out:
+ if (zap != NULL)
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+ return (err);
+}
+
+int
+fzap_add(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, void *tag, dmu_tx_t *tx)
+{
+ int err = fzap_check(zn, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ return (fzap_add_cd(zn, integer_size, num_integers,
+ val, ZAP_NEED_CD, tag, tx));
+}
+
+int
+fzap_update(zap_name_t *zn,
+ int integer_size, uint64_t num_integers, const void *val,
+ void *tag, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ int err;
+ boolean_t create;
+ zap_entry_handle_t zeh;
+ zap_t *zap = zn->zn_zap;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ err = fzap_check(zn, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ err = zap_leaf_lookup(l, zn, &zeh);
+ create = (err == ENOENT);
+ ASSERT(err == 0 || err == ENOENT);
+
+ if (create) {
+ err = zap_entry_create(l, zn, ZAP_NEED_CD,
+ integer_size, num_integers, val, &zeh);
+ if (err == 0)
+ zap_increment_num_entries(zap, 1, tx);
+ } else {
+ err = zap_entry_update(&zeh, integer_size, num_integers, val);
+ }
+
+ if (err == EAGAIN) {
+ err = zap_expand_leaf(zn, l, tag, tx, &l);
+ zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
+ if (err == 0)
+ goto retry;
+ }
+
+ if (zap != NULL)
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+ return (err);
+}
+
+int
+fzap_length(zap_name_t *zn,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_leaf_t *l;
+ int err;
+ zap_entry_handle_t zeh;
+
+ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err != 0)
+ goto out;
+
+ if (integer_size != 0)
+ *integer_size = zeh.zeh_integer_size;
+ if (num_integers != 0)
+ *num_integers = zeh.zeh_num_integers;
+out:
+ zap_put_leaf(l);
+ return (err);
+}
+
+int
+fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ int err;
+ zap_entry_handle_t zeh;
+
+ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err == 0) {
+ zap_entry_remove(&zeh);
+ zap_increment_num_entries(zn->zn_zap, -1, tx);
+ }
+ zap_put_leaf(l);
+ return (err);
+}
+
+void
+fzap_prefetch(zap_name_t *zn)
+{
+ uint64_t blk;
+ zap_t *zap = zn->zn_zap;
+
+ uint64_t idx = ZAP_HASH_IDX(zn->zn_hash,
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ if (zap_idx_to_blk(zap, idx, &blk) != 0)
+ return;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+ ZIO_PRIORITY_SYNC_READ);
+}
+
+/*
+ * Helper functions for consumers.
+ */
+
+uint64_t
+zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+ const char *name, dmu_tx_t *tx)
+{
+ return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx));
+}
+
+uint64_t
+zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+ const char *name, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t new_obj;
+
+ new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, dnodesize, tx);
+ VERIFY(new_obj != 0);
+ VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
+ tx));
+
+ return (new_obj);
+}
+
+int
+zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
+ char *name)
+{
+ zap_cursor_t zc;
+ int err;
+
+ if (mask == 0)
+ mask = -1ULL;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, zapobj);
+ (err = zap_cursor_retrieve(&zc, za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if ((za->za_first_integer & mask) == (value & mask)) {
+ (void) strlcpy(name, za->za_name, MAXNAMELEN);
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
+
+int
+zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ int err = 0;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
+ err = zap_add(os, intoobj, za->za_name,
+ 8, 1, &za->za_first_integer, tx);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
+
+int
+zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ uint64_t value, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ int err = 0;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
+ err = zap_add(os, intoobj, za->za_name,
+ 8, 1, &value, tx);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
+
+int
+zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ int err = 0;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ uint64_t delta = 0;
+
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
+
+ err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta);
+ if (err != 0 && err != ENOENT)
+ break;
+ delta += za->za_first_integer;
+ err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
+
+int
+zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+ return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+ return (zap_remove(os, obj, name, tx));
+}
+
+int
+zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+ return (zap_lookup(os, obj, name, 8, 1, &value));
+}
+
+int
+zap_add_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_update_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_update(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_lookup(os, obj, name, 8, 1, valuep));
+}
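
All of the *_int and *_int_key helpers above share one convention: the entry name is simply the key (or value) printed as a hexadecimal string. A tiny standalone sketch of that naming, using plain userspace types rather than the kernel's longlong_t:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        char name[20];
        uint64_t value = 123456789ULL;

        /* The entry's name is the integer printed in hex. */
        (void) snprintf(name, sizeof (name), "%llx",
            (unsigned long long)value);
        printf("zap name for %llu is \"%s\"\n",
            (unsigned long long)value, name);
        return (0);
    }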
+
+int
+zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+ dmu_tx_t *tx)
+{
+ uint64_t value = 0;
+
+ if (delta == 0)
+ return (0);
+
+ int err = zap_lookup(os, obj, name, 8, 1, &value);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ value += delta;
+ if (value == 0)
+ err = zap_remove(os, obj, name, tx);
+ else
+ err = zap_update(os, obj, name, 8, 1, &value, tx);
+ return (err);
+}
+
+int
+zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_increment(os, obj, name, delta, tx));
+}
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+int
+fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err = ENOENT;
+ zap_entry_handle_t zeh;
+ zap_leaf_t *l;
+
+ /* retrieve the next entry at or after zc_hash/zc_cd */
+ /* if no entry, return ENOENT */
+
+ /*
+ * If we are reading from the beginning, we're almost certain to
+ * iterate over the entire ZAP object. If there are multiple leaf
+ * blocks (freeblk > 2), prefetch the whole object (up to
+ * dmu_prefetch_max bytes), so that we read the leaf blocks
+ * concurrently. (Unless noprefetch was requested via
+ * zap_cursor_init_noprefetch()).
+ */
+ if (zc->zc_hash == 0 && zap_iterate_prefetch &&
+ zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
+ dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
+ zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+
+ if (zc->zc_leaf &&
+ (ZAP_HASH_IDX(zc->zc_hash,
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
+
+again:
+ if (zc->zc_leaf == NULL) {
+ err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
+ &zc->zc_leaf);
+ if (err != 0)
+ return (err);
+ } else {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ }
+ l = zc->zc_leaf;
+
+ err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
+
+ if (err == ENOENT) {
+ if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) {
+ zc->zc_hash = -1ULL;
+ zc->zc_cd = 0;
+ } else {
+ uint64_t nocare = (1ULL <<
+ (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
+
+ zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
+ zc->zc_cd = 0;
+
+ if (zc->zc_hash == 0) {
+ zc->zc_hash = -1ULL;
+ } else {
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ goto again;
+ }
+ }
+ }
+
+ if (err == 0) {
+ zc->zc_hash = zeh.zeh_hash;
+ zc->zc_cd = zeh.zeh_cd;
+ za->za_integer_length = zeh.zeh_integer_size;
+ za->za_num_integers = zeh.zeh_num_integers;
+ if (zeh.zeh_num_integers == 0) {
+ za->za_first_integer = 0;
+ } else {
+ err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
+ ASSERT(err == 0 || err == EOVERFLOW);
+ }
+ err = zap_entry_read_name(zap, &zeh,
+ sizeof (za->za_name), za->za_name);
+ ASSERT(err == 0);
+
+ za->za_normalization_conflict =
+ zap_entry_normalization_conflict(&zeh,
+ NULL, za->za_name, zap);
+ }
+ rw_exit(&zc->zc_leaf->l_rwlock);
+ return (err);
+}
+
+static void
+zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
+{
+ uint64_t lastblk = 0;
+
+ /*
+ * NB: if a leaf has more pointers than an entire ptrtbl block
+ * can hold, then it will be accounted for more than once, since
+ * lastblk only suppresses duplicates within a single block.
+ */
+ for (int i = 0; i < len; i++) {
+ zap_leaf_t *l;
+
+ if (tbl[i] == lastblk)
+ continue;
+ lastblk = tbl[i];
+
+ int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
+ if (err == 0) {
+ zap_leaf_stats(zap, l, zs);
+ zap_put_leaf(l);
+ }
+ }
+}
+
+void
+fzap_get_stats(zap_t *zap, zap_stats_t *zs)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ zs->zs_blocksize = 1ULL << bs;
+
+ /*
+ * Set zap_phys_t fields
+ */
+ zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
+ zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
+ zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
+ zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
+ zs->zs_magic = zap_f_phys(zap)->zap_magic;
+ zs->zs_salt = zap_f_phys(zap)->zap_salt;
+
+ /*
+ * Set zap_ptrtbl fields
+ */
+ zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
+ zs->zs_ptrtbl_blks_copied =
+ zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
+ zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
+ zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+ zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+ /* the ptrtbl is entirely in the header block. */
+ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
+ } else {
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
+
+ for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+ b++) {
+ dmu_buf_t *db;
+ int err;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
+ FTAG, &db, DMU_READ_NO_PREFETCH);
+ if (err == 0) {
+ zap_stats_ptrtbl(zap, db->db_data,
+ 1<<(bs-3), zs);
+ dmu_buf_rele(db, FTAG);
+ }
+ }
+ }
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
+ "When iterating ZAP object, prefetch it");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zap_leaf.c b/sys/contrib/openzfs/module/zfs/zap_leaf.c
new file mode 100644
index 000000000000..aa6c298c3b4b
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zap_leaf.c
@@ -0,0 +1,849 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/*
+ * A ZAP leaf occupies a single block and is broken into fixed-size chunks.
+ * Chunk number n means l_chunk[n], even though the header precedes it.
+ * The names are stored null-terminated.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/arc.h>
+
+static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
+
+#define CHAIN_END 0xffff /* end of the chunk chain */
+
+#define LEAF_HASH(l, h) \
+ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
+ ((h) >> \
+ (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
+
+#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
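
A standalone sketch of the bucket selection LEAF_HASH() performs, under the assumption (implied by the macros) that the hash table has 1 << ZAP_LEAF_HASH_SHIFT entries; the table size and prefix length below are made up for illustration:

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Bucket selection as in LEAF_HASH(): skip the lh_prefix_len bits that
     * routed the hash to this leaf, then use the next hash_shift bits.
     */
    static unsigned
    leaf_hash(uint64_t h, int hash_shift, int prefix_len)
    {
        unsigned numentries = 1U << hash_shift;

        return ((numentries - 1) &
            (unsigned)(h >> (64 - hash_shift - prefix_len)));
    }

    int
    main(void)
    {
        uint64_t h = 0xdeadbeefcafef00dULL;

        /* Hypothetical 9-bit hash table in a leaf with a 5-bit prefix. */
        printf("bucket = %u\n", leaf_hash(h, 9, 5));
        return (0);
    }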
+
+extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
+
+static void
+zap_memset(void *a, int c, size_t n)
+{
+ char *cp = a;
+ char *cpend = cp + n;
+
+ while (cp < cpend)
+ *cp++ = c;
+}
+
+static void
+stv(int len, void *addr, uint64_t value)
+{
+ switch (len) {
+ case 1:
+ *(uint8_t *)addr = value;
+ return;
+ case 2:
+ *(uint16_t *)addr = value;
+ return;
+ case 4:
+ *(uint32_t *)addr = value;
+ return;
+ case 8:
+ *(uint64_t *)addr = value;
+ return;
+ default:
+ cmn_err(CE_PANIC, "bad int len %d", len);
+ }
+}
+
+static uint64_t
+ldv(int len, const void *addr)
+{
+ switch (len) {
+ case 1:
+ return (*(uint8_t *)addr);
+ case 2:
+ return (*(uint16_t *)addr);
+ case 4:
+ return (*(uint32_t *)addr);
+ case 8:
+ return (*(uint64_t *)addr);
+ default:
+ cmn_err(CE_PANIC, "bad int len %d", len);
+ }
+ return (0xFEEDFACEDEADBEEFULL);
+}
+
+void
+zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
+{
+ zap_leaf_t l;
+ dmu_buf_t l_dbuf;
+
+ l_dbuf.db_data = buf;
+ l.l_bs = highbit64(size) - 1;
+ l.l_dbuf = &l_dbuf;
+
+ buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
+ buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
+ buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
+ buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
+ buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
+ buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
+ buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+
+ for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
+ buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
+
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
+ zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
+ struct zap_leaf_entry *le;
+
+ switch (lc->l_free.lf_type) {
+ case ZAP_CHUNK_ENTRY:
+ le = &lc->l_entry;
+
+ le->le_type = BSWAP_8(le->le_type);
+ le->le_value_intlen = BSWAP_8(le->le_value_intlen);
+ le->le_next = BSWAP_16(le->le_next);
+ le->le_name_chunk = BSWAP_16(le->le_name_chunk);
+ le->le_name_numints = BSWAP_16(le->le_name_numints);
+ le->le_value_chunk = BSWAP_16(le->le_value_chunk);
+ le->le_value_numints = BSWAP_16(le->le_value_numints);
+ le->le_cd = BSWAP_32(le->le_cd);
+ le->le_hash = BSWAP_64(le->le_hash);
+ break;
+ case ZAP_CHUNK_FREE:
+ lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
+ lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
+ break;
+ case ZAP_CHUNK_ARRAY:
+ lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
+ lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
+ /* la_array doesn't need swapping */
+ break;
+ default:
+ cmn_err(CE_PANIC, "bad leaf type %d",
+ lc->l_free.lf_type);
+ }
+ }
+}
+
+void
+zap_leaf_init(zap_leaf_t *l, boolean_t sort)
+{
+ l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
+ zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
+ sizeof (struct zap_leaf_header));
+ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+ 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
+ ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
+ }
+ ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
+ zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
+ zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+ zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+ if (sort)
+ zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+}
+
+/*
+ * Routines which manipulate leaf chunks (l_chunk[]).
+ */
+
+static uint16_t
+zap_leaf_chunk_alloc(zap_leaf_t *l)
+{
+ ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
+
+ int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
+
+ zap_leaf_phys(l)->l_hdr.lh_freelist =
+ ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
+
+ zap_leaf_phys(l)->l_hdr.lh_nfree--;
+
+ return (chunk);
+}
+
+static void
+zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
+{
+ struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
+
+ zlf->lf_type = ZAP_CHUNK_FREE;
+ zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
+ bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
+ zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
+
+ zap_leaf_phys(l)->l_hdr.lh_nfree++;
+}
+
+/*
+ * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
+ */
+
+static uint16_t
+zap_leaf_array_create(zap_leaf_t *l, const char *buf,
+ int integer_size, int num_integers)
+{
+ uint16_t chunk_head;
+ uint16_t *chunkp = &chunk_head;
+ int byten = 0;
+ uint64_t value = 0;
+ int shift = (integer_size - 1) * 8;
+ int len = num_integers;
+
+ ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN);
+
+ while (len > 0) {
+ uint16_t chunk = zap_leaf_chunk_alloc(l);
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+
+ la->la_type = ZAP_CHUNK_ARRAY;
+ for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
+ if (byten == 0)
+ value = ldv(integer_size, buf);
+ la->la_array[i] = value >> shift;
+ value <<= 8;
+ if (++byten == integer_size) {
+ byten = 0;
+ buf += integer_size;
+ if (--len == 0)
+ break;
+ }
+ }
+
+ *chunkp = chunk;
+ chunkp = &la->la_next;
+ }
+ *chunkp = CHAIN_END;
+
+ return (chunk_head);
+}
+
+static void
+zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
+{
+ uint16_t chunk = *chunkp;
+
+ *chunkp = CHAIN_END;
+
+ while (chunk != CHAIN_END) {
+ int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
+ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
+ ZAP_CHUNK_ARRAY);
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ }
+}
+
+/* array_len and buf_len are in integers, not bytes */
+static void
+zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
+ int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
+ void *buf)
+{
+ int len = MIN(array_len, buf_len);
+ int byten = 0;
+ uint64_t value = 0;
+ char *p = buf;
+
+ ASSERT3U(array_int_len, <=, buf_int_len);
+
+ /* Fast path for one 8-byte integer */
+ if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ uint8_t *ip = la->la_array;
+ uint64_t *buf64 = buf;
+
+ *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
+ (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
+ (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
+ (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
+ return;
+ }
+
+ /* Fast path for an array of 1-byte integers (e.g. the entry name) */
+ if (array_int_len == 1 && buf_int_len == 1 &&
+ buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_array *la =
+ &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
+ p += ZAP_LEAF_ARRAY_BYTES;
+ chunk = la->la_next;
+ }
+ return;
+ }
+
+ while (len > 0) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
+ value = (value << 8) | la->la_array[i];
+ byten++;
+ if (byten == array_int_len) {
+ stv(buf_int_len, p, value);
+ byten = 0;
+ len--;
+ if (len == 0)
+ return;
+ p += buf_int_len;
+ }
+ }
+ chunk = la->la_next;
+ }
+}
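
Array chunks store their integers big-endian, one byte at a time. The following standalone sketch round-trips a 64-bit value the same way zap_leaf_array_create() packs it and the 8-byte fast path above unpacks it:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t value = 0x0102030405060708ULL;
        uint8_t bytes[8];

        /* Pack most-significant byte first, as zap_leaf_array_create() does. */
        for (int i = 0; i < 8; i++)
            bytes[i] = (uint8_t)(value >> (56 - 8 * i));

        /* Unpack, mirroring the 8-byte fast path in zap_leaf_array_read(). */
        uint64_t out = 0;
        for (int i = 0; i < 8; i++)
            out = (out << 8) | bytes[i];

        printf("round trip %s\n", out == value ? "ok" : "BROKEN");
        return (0);
    }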
+
+static boolean_t
+zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
+ int chunk, int array_numints)
+{
+ int bseen = 0;
+
+ if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
+ uint64_t *thiskey =
+ kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP);
+ ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
+
+ zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
+ sizeof (*thiskey), array_numints, thiskey);
+ boolean_t match = bcmp(thiskey, zn->zn_key_orig,
+ array_numints * sizeof (*thiskey)) == 0;
+ kmem_free(thiskey, array_numints * sizeof (*thiskey));
+ return (match);
+ }
+
+ ASSERT(zn->zn_key_intlen == 1);
+ if (zn->zn_matchtype & MT_NORMALIZE) {
+ char *thisname = kmem_alloc(array_numints, KM_SLEEP);
+
+ zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
+ sizeof (char), array_numints, thisname);
+ boolean_t match = zap_match(zn, thisname);
+ kmem_free(thisname, array_numints);
+ return (match);
+ }
+
+ /*
+ * Fast path for exact matching.
+ * First check that the lengths match, so that we don't read
+ * past the end of the zn_key_orig array.
+ */
+ if (array_numints != zn->zn_key_orig_numints)
+ return (B_FALSE);
+ while (bseen < array_numints) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
+ break;
+ chunk = la->la_next;
+ bseen += toread;
+ }
+ return (bseen == array_numints);
+}
+
+/*
+ * Routines which manipulate leaf entries.
+ */
+
+int
+zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
+{
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
+ *chunkp != CHAIN_END; chunkp = &le->le_next) {
+ uint16_t chunk = *chunkp;
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (le->le_hash != zn->zn_hash)
+ continue;
+
+ /*
+ * NB: the entry chain is always sorted by cd on
+ * normalized zap objects, so this will find the
+ * lowest-cd match for MT_NORMALIZE.
+ */
+ ASSERT((zn->zn_matchtype == 0) ||
+ (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
+ if (zap_leaf_array_match(l, zn, le->le_name_chunk,
+ le->le_name_numints)) {
+ zeh->zeh_num_integers = le->le_value_numints;
+ zeh->zeh_integer_size = le->le_value_intlen;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+ zeh->zeh_leaf = l;
+ return (0);
+ }
+ }
+
+ return (SET_ERROR(ENOENT));
+}
+
+/* Return (h1,cd1 >= h2,cd2) */
+#define HCD_GTEQ(h1, cd1, h2, cd2) \
+ ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
+
+int
+zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
+{
+ uint64_t besth = -1ULL;
+ uint32_t bestcd = -1U;
+ uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
+ for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh];
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
+ HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
+ ASSERT3U(bestlh, >=, lh);
+ bestlh = lh;
+ besth = le->le_hash;
+ bestcd = le->le_cd;
+
+ zeh->zeh_num_integers = le->le_value_numints;
+ zeh->zeh_integer_size = le->le_value_intlen;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_fakechunk = chunk;
+ zeh->zeh_chunkp = &zeh->zeh_fakechunk;
+ zeh->zeh_leaf = l;
+ }
+ }
+ }
+
+ return (bestcd == -1U ? SET_ERROR(ENOENT) : 0);
+}
+
+int
+zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf)
+{
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (le->le_value_intlen > integer_size)
+ return (SET_ERROR(EINVAL));
+
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
+ le->le_value_intlen, le->le_value_numints,
+ integer_size, num_integers, buf);
+
+ if (zeh->zeh_num_integers > num_integers)
+ return (SET_ERROR(EOVERFLOW));
+ return (0);
+}
+
+int
+zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
+ char *buf)
+{
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
+ le->le_name_numints, 8, buflen / 8, buf);
+ } else {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
+ le->le_name_numints, 1, buflen, buf);
+ }
+ if (le->le_name_numints > buflen)
+ return (SET_ERROR(EOVERFLOW));
+ return (0);
+}
+
+int
+zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf)
+{
+ zap_leaf_t *l = zeh->zeh_leaf;
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
+
+ int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
+
+ if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
+ return (SET_ERROR(EAGAIN));
+
+ zap_leaf_array_free(l, &le->le_value_chunk);
+ le->le_value_chunk =
+ zap_leaf_array_create(l, buf, integer_size, num_integers);
+ le->le_value_numints = num_integers;
+ le->le_value_intlen = integer_size;
+ return (0);
+}
+
+void
+zap_entry_remove(zap_entry_handle_t *zeh)
+{
+ zap_leaf_t *l = zeh->zeh_leaf;
+
+ ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
+
+ uint16_t entry_chunk = *zeh->zeh_chunkp;
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ zap_leaf_array_free(l, &le->le_name_chunk);
+ zap_leaf_array_free(l, &le->le_value_chunk);
+
+ *zeh->zeh_chunkp = le->le_next;
+ zap_leaf_chunk_free(l, entry_chunk);
+
+ zap_leaf_phys(l)->l_hdr.lh_nentries--;
+}
+
+int
+zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ struct zap_leaf_entry *le;
+ uint64_t h = zn->zn_hash;
+
+ uint64_t valuelen = integer_size * num_integers;
+
+ int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
+ zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
+ if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
+ return (SET_ERROR(E2BIG));
+
+ if (cd == ZAP_NEED_CD) {
+ /* find the lowest unused cd */
+ if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
+ cd = 0;
+
+ for (chunk = *LEAF_HASH_ENTPTR(l, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ if (le->le_cd > cd)
+ break;
+ if (le->le_hash == h) {
+ ASSERT3U(cd, ==, le->le_cd);
+ cd++;
+ }
+ }
+ } else {
+ /* old unsorted format; do it the O(n^2) way */
+ for (cd = 0; ; cd++) {
+ for (chunk = *LEAF_HASH_ENTPTR(l, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ if (le->le_hash == h &&
+ le->le_cd == cd) {
+ break;
+ }
+ }
+ /* If this cd is not in use, we are good. */
+ if (chunk == CHAIN_END)
+ break;
+ }
+ }
+ /*
+ * We would run out of space in a block before we could
+ * store enough entries to run out of CD values.
+ */
+ ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
+ }
+
+ if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
+ return (SET_ERROR(EAGAIN));
+
+ /* make the entry */
+ chunk = zap_leaf_chunk_alloc(l);
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ le->le_type = ZAP_CHUNK_ENTRY;
+ le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
+ zn->zn_key_intlen, zn->zn_key_orig_numints);
+ le->le_name_numints = zn->zn_key_orig_numints;
+ le->le_value_chunk =
+ zap_leaf_array_create(l, buf, integer_size, num_integers);
+ le->le_value_numints = num_integers;
+ le->le_value_intlen = integer_size;
+ le->le_hash = h;
+ le->le_cd = cd;
+
+ /* link it into the hash chain */
+ /* XXX if we did the search above, we could just use that */
+ uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk);
+
+ zap_leaf_phys(l)->l_hdr.lh_nentries++;
+
+ zeh->zeh_leaf = l;
+ zeh->zeh_num_integers = num_integers;
+ zeh->zeh_integer_size = le->le_value_intlen;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+
+ return (0);
+}
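
The cd selection in the ZLF_ENTRIES_CDSORTED branch above amounts to walking a cd-sorted chain and bumping cd while it is taken. A standalone sketch with made-up cd values (all entries assumed to share one hash):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* Hypothetical cd values already in use for this hash. */
        uint32_t used[] = { 0, 1, 3 };
        uint32_t cd = 0;

        /* Walk the cd-sorted chain and bump cd while it is taken. */
        for (int i = 0; i < 3; i++) {
            if (used[i] > cd)
                break;
            if (used[i] == cd)
                cd++;
        }
        printf("lowest unused cd = %u\n", cd);  /* prints 2 */
        return (0);
    }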
+
+/*
+ * Determine if there is another entry with the same normalized form.
+ * For performance purposes, either zn or name must be provided (the
+ * other can be NULL). Note, there usually won't be any hash
+ * conflicts, in which case we don't need the concatenated/normalized
+ * form of the name. But all callers have one of these on hand anyway,
+ * so might as well take advantage. A cleaner but slower interface
+ * would accept neither argument, and compute the normalized name as
+ * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
+ */
+boolean_t
+zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
+ const char *name, zap_t *zap)
+{
+ struct zap_leaf_entry *le;
+ boolean_t allocdzn = B_FALSE;
+
+ if (zap->zap_normflags == 0)
+ return (B_FALSE);
+
+ for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
+ if (le->le_hash != zeh->zeh_hash)
+ continue;
+ if (le->le_cd == zeh->zeh_cd)
+ continue;
+
+ if (zn == NULL) {
+ zn = zap_name_alloc(zap, name, MT_NORMALIZE);
+ allocdzn = B_TRUE;
+ }
+ if (zap_leaf_array_match(zeh->zeh_leaf, zn,
+ le->le_name_chunk, le->le_name_numints)) {
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_TRUE);
+ }
+ }
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_FALSE);
+}
+
+/*
+ * Routines for transferring entries between leaves.
+ */
+
+static uint16_t *
+zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
+{
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
+ struct zap_leaf_entry *le2;
+ uint16_t *chunkp;
+
+ /*
+ * Keep the entry chain sorted by cd.
+ * NB: this will not cause problems for unsorted leaves, though
+ * it is unnecessary there.
+ */
+ for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
+ *chunkp != CHAIN_END; chunkp = &le2->le_next) {
+ le2 = ZAP_LEAF_ENTRY(l, *chunkp);
+ if (le2->le_cd > le->le_cd)
+ break;
+ }
+
+ le->le_next = *chunkp;
+ *chunkp = entry;
+ return (chunkp);
+}
+
+static uint16_t
+zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
+{
+ uint16_t new_chunk;
+ uint16_t *nchunkp = &new_chunk;
+
+ while (chunk != CHAIN_END) {
+ uint16_t nchunk = zap_leaf_chunk_alloc(nl);
+ struct zap_leaf_array *nla =
+ &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
+ struct zap_leaf_array *la =
+ &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int nextchunk = la->la_next;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
+
+ *nla = *la; /* structure assignment */
+
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ *nchunkp = nchunk;
+ nchunkp = &nla->la_next;
+ }
+ *nchunkp = CHAIN_END;
+ return (new_chunk);
+}
+
+static void
+zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
+{
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ uint16_t chunk = zap_leaf_chunk_alloc(nl);
+ struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
+ *nle = *le; /* structure assignment */
+
+ (void) zap_leaf_rehash_entry(nl, chunk);
+
+ nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
+ nle->le_value_chunk =
+ zap_leaf_transfer_array(l, le->le_value_chunk, nl);
+
+ zap_leaf_chunk_free(l, entry);
+
+ zap_leaf_phys(l)->l_hdr.lh_nentries--;
+ zap_leaf_phys(nl)->l_hdr.lh_nentries++;
+}
+
+/*
+ * Transfer the entries whose hash prefix ends in 1 to the new leaf.
+ */
+void
+zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
+{
+ int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+ /* set new prefix and prefix_len */
+ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
+ zap_leaf_phys(nl)->l_hdr.lh_prefix =
+ zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
+ zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+ /* break existing hash chains */
+ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+ 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+
+ if (sort)
+ zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+
+ /*
+ * Transfer entries whose hash bit 'bit' is set to nl; rehash
+ * the remaining entries
+ *
+ * NB: We could find entries via the hashtable instead. That
+ * would be O(hashents+numents) rather than O(numblks+numents),
+ * but this accesses memory more sequentially, and when we're
+ * called, the block is usually pretty full.
+ */
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
+ if (le->le_type != ZAP_CHUNK_ENTRY)
+ continue;
+
+ if (le->le_hash & (1ULL << bit))
+ zap_leaf_transfer_entry(l, i, nl);
+ else
+ (void) zap_leaf_rehash_entry(l, i);
+ }
+}
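
A standalone sketch of how the discriminating bit is chosen, with a made-up prefix length: entries whose hash has that bit set belong to the new (prefix|1) leaf, and the rest stay in the old one.

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        int old_prefix_len = 3;                 /* hypothetical prefix length */
        int bit = 64 - 1 - old_prefix_len;      /* first hash bit below the prefix */
        uint64_t hashes[2] = { 0x2000000000000000ULL, 0x3000000000000000ULL };

        for (int i = 0; i < 2; i++)
            printf("hash %016llx -> %s leaf\n",
                (unsigned long long)hashes[i],
                (hashes[i] & (1ULL << bit)) ? "new" : "old");
        return (0);
    }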
+
+void
+zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
+{
+ int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_leafs_with_2n_pointers[n]++;
+
+ n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_with_n5_entries[n]++;
+
+ n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
+ zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+ (1<<FZAP_BLOCK_SHIFT(zap));
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_n_tenths_full[n]++;
+
+ for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
+ int nentries = 0;
+ int chunk = zap_leaf_phys(l)->l_hash[i];
+
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(l, chunk);
+
+ n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
+ le->le_value_intlen);
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_entries_using_n_chunks[n]++;
+
+ chunk = le->le_next;
+ nentries++;
+ }
+
+ n = nentries;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_buckets_with_n_entries[n]++;
+ }
+}
diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c
new file mode 100644
index 000000000000..5d9bc2076068
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zap_micro.c
@@ -0,0 +1,1697 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/avl.h>
+#include <sys/arc.h>
+#include <sys/dmu_objset.h>
+
+#ifdef _KERNEL
+#include <sys/sunddi.h>
+#endif
+
+extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
+
+static int mzap_upgrade(zap_t **zapp,
+ void *tag, dmu_tx_t *tx, zap_flags_t flags);
+
+uint64_t
+zap_getflags(zap_t *zap)
+{
+ if (zap->zap_ismicro)
+ return (0);
+ return (zap_f_phys(zap)->zap_flags);
+}
+
+int
+zap_hashbits(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return (48);
+ else
+ return (28);
+}
+
+uint32_t
+zap_maxcd(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return ((1<<16)-1);
+ else
+ return (-1U);
+}
+
+static uint64_t
+zap_hash(zap_name_t *zn)
+{
+ zap_t *zap = zn->zn_zap;
+ uint64_t h = 0;
+
+ if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
+ ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
+ h = *(uint64_t *)zn->zn_key_orig;
+ } else {
+ h = zap->zap_salt;
+ ASSERT(h != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ const uint64_t *wp = zn->zn_key_norm;
+
+ ASSERT(zn->zn_key_intlen == 8);
+ for (int i = 0; i < zn->zn_key_norm_numints;
+ wp++, i++) {
+ uint64_t word = *wp;
+
+ for (int j = 0; j < zn->zn_key_intlen; j++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ word) & 0xFF];
+ word >>= NBBY;
+ }
+ }
+ } else {
+ const uint8_t *cp = zn->zn_key_norm;
+
+ /*
+ * We previously stored the terminating null on
+ * disk, but didn't hash it, so we need to
+ * continue to not hash it. (The
+ * zn_key_*_numints includes the terminating
+ * null for non-binary keys.)
+ */
+ int len = zn->zn_key_norm_numints - 1;
+
+ ASSERT(zn->zn_key_intlen == 1);
+ for (int i = 0; i < len; cp++, i++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ *cp) & 0xFF];
+ }
+ }
+ }
+ /*
+ * Don't use all 64 bits, since we need some in the cookie for
+ * the collision differentiator. We MUST use the high bits,
+ * since those are the ones that we first pay attention to when
+ * choosing the bucket.
+ */
+ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
+
+ return (h);
+}
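
A standalone sketch of the final masking step, assuming a non-HASH64 zap (28 hash bits, as zap_hashbits() returns above) and a made-up hash value:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        int bits = 28;                          /* zap_hashbits() for a non-HASH64 zap */
        uint64_t h = 0x0123456789abcdefULL;     /* pretend CRC64 result */

        /* Keep only the top 'bits' bits; the rest is reserved for the cd. */
        h &= ~((1ULL << (64 - bits)) - 1);
        printf("masked hash = %016llx\n", (unsigned long long)h);
        return (0);
    }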
+
+static int
+zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
+{
+ ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
+
+ size_t inlen = strlen(name) + 1;
+ size_t outlen = ZAP_MAXNAMELEN;
+
+ int err = 0;
+ (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
+ normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
+ U8_UNICODE_LATEST, &err);
+
+ return (err);
+}
+
+boolean_t
+zap_match(zap_name_t *zn, const char *matchname)
+{
+ ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
+
+ if (zn->zn_matchtype & MT_NORMALIZE) {
+ char norm[ZAP_MAXNAMELEN];
+
+ if (zap_normalize(zn->zn_zap, matchname, norm,
+ zn->zn_normflags) != 0)
+ return (B_FALSE);
+
+ return (strcmp(zn->zn_key_norm, norm) == 0);
+ } else {
+ return (strcmp(zn->zn_key_orig, matchname) == 0);
+ }
+}
+
+void
+zap_name_free(zap_name_t *zn)
+{
+ kmem_free(zn, sizeof (zap_name_t));
+}
+
+zap_name_t *
+zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
+{
+ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+ zn->zn_zap = zap;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = key;
+ zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
+ zn->zn_matchtype = mt;
+ zn->zn_normflags = zap->zap_normflags;
+
+ /*
+ * If we're dealing with a case-sensitive lookup on a mixed or
+ * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
+ * will fold case to all caps, overriding the lookup request.
+ */
+ if (mt & MT_MATCH_CASE)
+ zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
+
+ if (zap->zap_normflags) {
+ /*
+ * We *must* use zap_normflags because this normalization is
+ * what the hash is computed from.
+ */
+ if (zap_normalize(zap, key, zn->zn_normbuf,
+ zap->zap_normflags) != 0) {
+ zap_name_free(zn);
+ return (NULL);
+ }
+ zn->zn_key_norm = zn->zn_normbuf;
+ zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
+ } else {
+ if (mt != 0) {
+ zap_name_free(zn);
+ return (NULL);
+ }
+ zn->zn_key_norm = zn->zn_key_orig;
+ zn->zn_key_norm_numints = zn->zn_key_orig_numints;
+ }
+
+ zn->zn_hash = zap_hash(zn);
+
+ if (zap->zap_normflags != zn->zn_normflags) {
+ /*
+ * We *must* use zn_normflags because this normalization is
+ * what the matching is based on. (Not the hash!)
+ */
+ if (zap_normalize(zap, key, zn->zn_normbuf,
+ zn->zn_normflags) != 0) {
+ zap_name_free(zn);
+ return (NULL);
+ }
+ zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
+ }
+
+ return (zn);
+}
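+
+/*
+ * Worked example of the two normalizations above, offered as illustration:
+ * on a zap whose zap_normflags include U8_TEXTPREP_TOUPPER, a lookup with
+ * MT_NORMALIZE | MT_MATCH_CASE clears TOUPPER from zn_normflags.  The hash
+ * is still computed from the zap_normflags (case-folded) form, so "Foo" and
+ * "foo" land in the same bucket, but the second normalization pass leaves
+ * zap_match() comparing the zn_normflags form, so the match itself stays
+ * case-sensitive as requested.
+ */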
+
+static zap_name_t *
+zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
+{
+ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+ ASSERT(zap->zap_normflags == 0);
+ zn->zn_zap = zap;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = zn->zn_key_norm = key;
+ zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
+ zn->zn_matchtype = 0;
+
+ zn->zn_hash = zap_hash(zn);
+ return (zn);
+}
+
+static void
+mzap_byteswap(mzap_phys_t *buf, size_t size)
+{
+ buf->mz_block_type = BSWAP_64(buf->mz_block_type);
+ buf->mz_salt = BSWAP_64(buf->mz_salt);
+ buf->mz_normflags = BSWAP_64(buf->mz_normflags);
+ int max = (size / MZAP_ENT_LEN) - 1;
+ for (int i = 0; i < max; i++) {
+ buf->mz_chunk[i].mze_value =
+ BSWAP_64(buf->mz_chunk[i].mze_value);
+ buf->mz_chunk[i].mze_cd =
+ BSWAP_32(buf->mz_chunk[i].mze_cd);
+ }
+}
+
+void
+zap_byteswap(void *buf, size_t size)
+{
+ uint64_t block_type = *(uint64_t *)buf;
+
+ if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
+ /* ASSERT(magic == ZAP_LEAF_MAGIC); */
+ mzap_byteswap(buf, size);
+ } else {
+ fzap_byteswap(buf, size);
+ }
+}
+
+static int
+mze_compare(const void *arg1, const void *arg2)
+{
+ const mzap_ent_t *mze1 = arg1;
+ const mzap_ent_t *mze2 = arg2;
+
+ int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(mze1->mze_cd, mze2->mze_cd));
+}
+
+static void
+mze_insert(zap_t *zap, int chunkid, uint64_t hash)
+{
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
+ mze->mze_chunkid = chunkid;
+ mze->mze_hash = hash;
+ mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
+ ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
+ avl_add(&zap->zap_m.zap_avl, mze);
+}
+
+static mzap_ent_t *
+mze_find(zap_name_t *zn)
+{
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
+
+ ASSERT(zn->zn_zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
+
+ mze_tofind.mze_hash = zn->zn_hash;
+ mze_tofind.mze_cd = 0;
+
+ mze = avl_find(avl, &mze_tofind, &idx);
+ if (mze == NULL)
+ mze = avl_nearest(avl, idx, AVL_AFTER);
+ for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
+ ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
+ if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
+ return (mze);
+ }
+
+ return (NULL);
+}
+
+static uint32_t
+mze_find_unused_cd(zap_t *zap, uint64_t hash)
+{
+ mzap_ent_t mze_tofind;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_cd = 0;
+
+ uint32_t cd = 0;
+ for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ if (mze->mze_cd != cd)
+ break;
+ cd++;
+ }
+
+ return (cd);
+}
+
+/*
+ * Each mzap entry requires at most 4 chunks:
+ * 3 chunks for the name + 1 chunk for the value.
+ */
+#define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
+ ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
+
+/*
+ * Check if the current entry keeps the colliding entries under the fatzap leaf
+ * size.
+ */
+static boolean_t
+mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
+{
+ zap_t *zap = zn->zn_zap;
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+ uint32_t mzap_ents = 0;
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_cd = 0;
+
+ for (mze = avl_find(avl, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ mzap_ents++;
+ }
+
+ /* Include the new entry being added */
+ mzap_ents++;
+
+ return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
+}
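+
+/*
+ * For illustration: with MZAP_ENT_CHUNKS evaluating to 4 (per the comment
+ * above), the test reads (colliding_entries + 1) * 4 < ZAP_LEAF_NUMCHUNKS_DEF,
+ * i.e. a new microzap entry is refused if its hash collision chain could no
+ * longer be packed into one default-sized fatzap leaf after an upgrade.
+ */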
+
+static void
+mze_remove(zap_t *zap, mzap_ent_t *mze)
+{
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ avl_remove(&zap->zap_m.zap_avl, mze);
+ kmem_free(mze, sizeof (mzap_ent_t));
+}
+
+static void
+mze_destroy(zap_t *zap)
+{
+ mzap_ent_t *mze;
+ void *avlcookie = NULL;
+
+ while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)))
+ kmem_free(mze, sizeof (mzap_ent_t));
+ avl_destroy(&zap->zap_m.zap_avl);
+}
+
+static zap_t *
+mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
+{
+ zap_t *winner;
+ uint64_t *zap_hdr = (uint64_t *)db->db_data;
+ uint64_t zap_block_type = zap_hdr[0];
+ uint64_t zap_magic = zap_hdr[1];
+
+ ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
+
+ zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
+ rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ zap->zap_objset = os;
+ zap->zap_object = obj;
+ zap->zap_dbuf = db;
+
+ if (zap_block_type != ZBT_MICRO) {
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
+ 0);
+ zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
+ if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
+ winner = NULL; /* No actual winner here... */
+ goto handle_winner;
+ }
+ } else {
+ zap->zap_ismicro = TRUE;
+ }
+
+ /*
+ * Make sure that zap_ismicro is set before we let others see
+ * it, because zap_lockdir() checks zap_ismicro without the lock
+ * held.
+ */
+ dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
+ winner = dmu_buf_set_user(db, &zap->zap_dbu);
+
+ if (winner != NULL)
+ goto handle_winner;
+
+ if (zap->zap_ismicro) {
+ zap->zap_salt = zap_m_phys(zap)->mz_salt;
+ zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
+ zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
+ avl_create(&zap->zap_m.zap_avl, mze_compare,
+ sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
+
+ for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze =
+ &zap_m_phys(zap)->mz_chunk[i];
+ if (mze->mze_name[0]) {
+ zap_name_t *zn;
+
+ zap->zap_m.zap_num_entries++;
+ zn = zap_name_alloc(zap, mze->mze_name, 0);
+ mze_insert(zap, i, zn->zn_hash);
+ zap_name_free(zn);
+ }
+ }
+ } else {
+ zap->zap_salt = zap_f_phys(zap)->zap_salt;
+ zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
+
+ ASSERT3U(sizeof (struct zap_leaf_header), ==,
+ 2*ZAP_LEAF_CHUNKSIZE);
+
+ /*
+ * The embedded pointer table should not overlap the
+ * other members.
+ */
+ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
+ &zap_f_phys(zap)->zap_salt);
+
+ /*
+ * The embedded pointer table should end at the end of
+ * the block
+ */
+ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
+ 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
+ (uintptr_t)zap_f_phys(zap), ==,
+ zap->zap_dbuf->db_size);
+ }
+ rw_exit(&zap->zap_rwlock);
+ return (zap);
+
+handle_winner:
+ rw_exit(&zap->zap_rwlock);
+ rw_destroy(&zap->zap_rwlock);
+ if (!zap->zap_ismicro)
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+ kmem_free(zap, sizeof (zap_t));
+ return (winner);
+}
+
+/*
+ * This routine "consumes" the caller's hold on the dbuf, which must
+ * have the specified tag.
+ */
+static int
+zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
+{
+ ASSERT0(db->db_offset);
+ objset_t *os = dmu_buf_get_objset(db);
+ uint64_t obj = db->db_object;
+ dmu_object_info_t doi;
+
+ *zapp = NULL;
+
+ dmu_object_info_from_db(db, &doi);
+ if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
+ return (SET_ERROR(EINVAL));
+
+ zap_t *zap = dmu_buf_get_user(db);
+ if (zap == NULL) {
+ zap = mzap_open(os, obj, db);
+ if (zap == NULL) {
+ /*
+ * mzap_open() didn't like what it saw on-disk.
+ * Check for corruption!
+ */
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ /*
+ * We're checking zap_ismicro without the lock held, in order to
+ * tell what type of lock we want. Once we have some sort of
+ * lock, see if it really is the right type. In practice this
+ * can only be different if it was upgraded from micro to fat,
+ * and micro wanted WRITER but fat only needs READER.
+ */
+ krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+ rw_enter(&zap->zap_rwlock, lt);
+ if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
+ /* it was upgraded, now we only need reader */
+ ASSERT(lt == RW_WRITER);
+ ASSERT(RW_READER ==
+ ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
+ rw_downgrade(&zap->zap_rwlock);
+ lt = RW_READER;
+ }
+
+ zap->zap_objset = os;
+
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3P(zap->zap_dbuf, ==, db);
+
+ ASSERT(!zap->zap_ismicro ||
+ zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
+ if (zap->zap_ismicro && tx && adding &&
+ zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
+ uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
+ if (newsz > MZAP_MAX_BLKSZ) {
+ dprintf("upgrading obj %llu: num_entries=%u\n",
+ obj, zap->zap_m.zap_num_entries);
+ *zapp = zap;
+ int err = mzap_upgrade(zapp, tag, tx, 0);
+ if (err != 0)
+ rw_exit(&zap->zap_rwlock);
+ return (err);
+ }
+ VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
+ zap->zap_m.zap_num_chunks =
+ db->db_size / MZAP_ENT_LEN - 1;
+ }
+
+ *zapp = zap;
+ return (0);
+}
+
+static int
+zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+ dmu_buf_t *db;
+
+ int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0) {
+ return (err);
+ }
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
+ }
+#endif
+
+ err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+ if (err != 0) {
+ dmu_buf_rele(db, tag);
+ }
+ return (err);
+}
+
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+ dmu_buf_t *db;
+
+ int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
+ }
+#endif
+ err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+ if (err != 0)
+ dmu_buf_rele(db, tag);
+ return (err);
+}
+
+void
+zap_unlockdir(zap_t *zap, void *tag)
+{
+ rw_exit(&zap->zap_rwlock);
+ dmu_buf_rele(zap->zap_dbuf, tag);
+}
+
+static int
+mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
+{
+ int err = 0;
+ zap_t *zap = *zapp;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ int sz = zap->zap_dbuf->db_size;
+ mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
+ bcopy(zap->zap_dbuf->db_data, mzp, sz);
+ int nchunks = zap->zap_m.zap_num_chunks;
+
+ if (!flags) {
+ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+ 1ULL << fzap_default_block_shift, 0, tx);
+ if (err != 0) {
+ vmem_free(mzp, sz);
+ return (err);
+ }
+ }
+
+ dprintf("upgrading obj=%llu with %u chunks\n",
+ zap->zap_object, nchunks);
+ /* XXX destroy the avl later, so we can use the stored hash value */
+ mze_destroy(zap);
+
+ fzap_upgrade(zap, tx, flags);
+
+ for (int i = 0; i < nchunks; i++) {
+ mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
+ if (mze->mze_name[0] == 0)
+ continue;
+ dprintf("adding %s=%llu\n",
+ mze->mze_name, mze->mze_value);
+ zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0);
+ /* If we fail here, we would end up losing entries */
+ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
+ tag, tx));
+ zap = zn->zn_zap; /* fzap_add_cd() may change zap */
+ zap_name_free(zn);
+ }
+ vmem_free(mzp, sz);
+ *zapp = zap;
+ return (0);
+}
+
+/*
+ * The "normflags" determine the behavior of the matchtype_t which is
+ * passed to zap_lookup_norm(). Names which have the same normalized
+ * version will be stored with the same hash value, and therefore we can
+ * perform normalization-insensitive lookups. We can be Unicode form-
+ * insensitive and/or case-insensitive. The following flags are valid for
+ * "normflags":
+ *
+ * U8_TEXTPREP_NFC
+ * U8_TEXTPREP_NFD
+ * U8_TEXTPREP_NFKC
+ * U8_TEXTPREP_NFKD
+ * U8_TEXTPREP_TOUPPER
+ *
+ * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
+ * of them may be supplied.
+ */
+void
+mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+
+ VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
+
+ dmu_buf_will_dirty(db, tx);
+ mzap_phys_t *zp = db->db_data;
+ zp->mz_block_type = ZBT_MICRO;
+ zp->mz_salt =
+ ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
+ zp->mz_normflags = normflags;
+
+ if (flags != 0) {
+ zap_t *zap;
+ /* Only fat zap supports flags; upgrade immediately. */
+ VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER,
+ B_FALSE, B_FALSE, &zap));
+ VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
+ zap_unlockdir(zap, FTAG);
+ } else {
+ dmu_buf_rele(db, FTAG);
+ }
+}
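+
+/*
+ * Hypothetical usage sketch of the normflags described above, assuming an
+ * open objset and an assigned tx (zap_create_norm() is defined below):
+ *
+ *     uint64_t obj = zap_create_norm(os,
+ *         U8_TEXTPREP_TOUPPER | U8_TEXTPREP_NFC,
+ *         DMU_OT_DIRECTORY_CONTENTS, DMU_OT_NONE, 0, tx);
+ *
+ * This yields a ZAP that can service normalization- and case-insensitive
+ * lookups (via MT_NORMALIZE) as described above.
+ */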
+
+static uint64_t
+zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+ dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+{
+ uint64_t obj;
+
+ ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
+
+ if (allocated_dnode == NULL) {
+ dnode_t *dn;
+ obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
+ indirect_blockshift, bonustype, bonuslen, dnodesize,
+ &dn, FTAG, tx);
+ mzap_create_impl(dn, normflags, flags, tx);
+ dnode_rele(dn, FTAG);
+ } else {
+ obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
+ indirect_blockshift, bonustype, bonuslen, dnodesize,
+ allocated_dnode, tag, tx);
+ mzap_create_impl(*allocated_dnode, normflags, flags, tx);
+ }
+
+ return (obj);
+}
+
+int
+zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+int
+zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_claim_norm_dnsize(os, obj,
+ 0, ot, bonustype, bonuslen, dnodesize, tx));
+}
+
+int
+zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
+ dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
+ dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int error;
+
+ ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
+ error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
+ if (error != 0)
+ return (error);
+
+ error = dnode_hold(os, obj, FTAG, &dn);
+ if (error != 0)
+ return (error);
+
+ mzap_create_impl(dn, normflags, 0, tx);
+
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+uint64_t
+zap_create(objset_t *os, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
+}
+
+uint64_t
+zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
+ dnodesize, tx));
+}
+
+uint64_t
+zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+uint64_t
+zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_impl(os, normflags, 0, ot, 0, 0,
+ bonustype, bonuslen, dnodesize, NULL, NULL, tx));
+}
+
+uint64_t
+zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_flags_dnsize(os, normflags, flags, ot,
+ leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
+}
+
+uint64_t
+zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
+ indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
+ tx));
+}
+
+/*
+ * Create a zap object and return a pointer to the newly allocated dnode via
+ * the allocated_dnode argument. The returned dnode will be held and the
+ * caller is responsible for releasing the hold by calling dnode_rele().
+ */
+uint64_t
+zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+ dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+{
+ return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
+ indirect_blockshift, bonustype, bonuslen, dnodesize,
+ allocated_dnode, tag, tx));
+}
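+
+/*
+ * Hypothetical caller sketch (ot and leaf_shift are placeholders): since the
+ * returned dnode is held, the call must be paired with dnode_rele():
+ *
+ *     dnode_t *dn;
+ *     uint64_t obj = zap_create_hold(os, 0, 0, ot, leaf_shift, 0,
+ *         DMU_OT_NONE, 0, 0, &dn, FTAG, tx);
+ *     ... use obj and the held dn ...
+ *     dnode_rele(dn, FTAG);
+ */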
+
+int
+zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
+{
+ /*
+ * dmu_object_free will free the object number and free the
+ * data. Freeing the data will cause our pageout function to be
+ * called, which will destroy our data (zap_leaf_t's and zap_t).
+ */
+
+ return (dmu_object_free(os, zapobj, tx));
+}
+
+void
+zap_evict_sync(void *dbu)
+{
+ zap_t *zap = dbu;
+
+ rw_destroy(&zap->zap_rwlock);
+
+ if (zap->zap_ismicro)
+ mze_destroy(zap);
+ else
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+
+ kmem_free(zap, sizeof (zap_t));
+}
+
+int
+zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_count(zap, count);
+ } else {
+ *count = zap->zap_m.zap_num_entries;
+ }
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+/*
+ * zn may be NULL; if not specified, it will be computed if needed.
+ * See also the comment above zap_entry_normalization_conflict().
+ */
+static boolean_t
+mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
+{
+ int direction = AVL_BEFORE;
+ boolean_t allocdzn = B_FALSE;
+
+ if (zap->zap_normflags == 0)
+ return (B_FALSE);
+
+again:
+ for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
+ other && other->mze_hash == mze->mze_hash;
+ other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
+
+ if (zn == NULL) {
+ zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
+ MT_NORMALIZE);
+ allocdzn = B_TRUE;
+ }
+ if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_TRUE);
+ }
+ }
+
+ if (direction == AVL_BEFORE) {
+ direction = AVL_AFTER;
+ goto again;
+ }
+
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_FALSE);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+
+int
+zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ return (zap_lookup_norm(os, zapobj, name, integer_size,
+ num_integers, buf, 0, NULL, 0, NULL));
+}
+
+static int
+zap_lookup_impl(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp)
+{
+ int err = 0;
+
+ zap_name_t *zn = zap_name_alloc(zap, name, mt);
+ if (zn == NULL)
+ return (SET_ERROR(ENOTSUP));
+
+ if (!zap->zap_ismicro) {
+ err = fzap_lookup(zn, integer_size, num_integers, buf,
+ realname, rn_len, ncp);
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ if (num_integers < 1) {
+ err = SET_ERROR(EOVERFLOW);
+ } else if (integer_size != 8) {
+ err = SET_ERROR(EINVAL);
+ } else {
+ *(uint64_t *)buf =
+ MZE_PHYS(zap, mze)->mze_value;
+ (void) strlcpy(realname,
+ MZE_PHYS(zap, mze)->mze_name, rn_len);
+ if (ncp) {
+ *ncp = mzap_normalization_conflict(zap,
+ zn, mze);
+ }
+ }
+ }
+ }
+ zap_name_free(zn);
+ return (err);
+}
+
+int
+zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_lookup_impl(zap, name, integer_size,
+ num_integers, buf, mt, realname, rn_len, ncp);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc(zap, name, 0);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ fzap_prefetch(zn);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_lookup_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ return (zap_lookup_norm_by_dnode(dn, name, integer_size,
+ num_integers, buf, 0, NULL, 0, NULL));
+}
+
+int
+zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp)
+{
+ zap_t *zap;
+
+ int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+ FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_lookup_impl(zap, name, integer_size,
+ num_integers, buf, mt, realname, rn_len, ncp);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ fzap_prefetch(zn);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ err = fzap_lookup(zn, integer_size, num_integers, buf,
+ NULL, 0, NULL);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_contains(objset_t *os, uint64_t zapobj, const char *name)
+{
+ int err = zap_lookup_norm(os, zapobj, name, 0,
+ 0, NULL, 0, NULL, 0, NULL);
+ if (err == EOVERFLOW || err == EINVAL)
+ err = 0; /* found, but skipped reading the value */
+ return (err);
+}
+
+int
+zap_length(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc(zap, name, 0);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ if (!zap->zap_ismicro) {
+ err = fzap_length(zn, integer_size, num_integers);
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ if (integer_size)
+ *integer_size = 8;
+ if (num_integers)
+ *num_integers = 1;
+ }
+ }
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_length(zn, integer_size, num_integers);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+static void
+mzap_addent(zap_name_t *zn, uint64_t value)
+{
+ zap_t *zap = zn->zn_zap;
+ int start = zap->zap_m.zap_alloc_next;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+#ifdef ZFS_DEBUG
+ for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
+ ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
+ }
+#endif
+
+ uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
+ /* given the limited size of the microzap, this can't happen */
+ ASSERT(cd < zap_maxcd(zap));
+
+again:
+ for (int i = start; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
+ if (mze->mze_name[0] == 0) {
+ mze->mze_value = value;
+ mze->mze_cd = cd;
+ (void) strlcpy(mze->mze_name, zn->zn_key_orig,
+ sizeof (mze->mze_name));
+ zap->zap_m.zap_num_entries++;
+ zap->zap_m.zap_alloc_next = i+1;
+ if (zap->zap_m.zap_alloc_next ==
+ zap->zap_m.zap_num_chunks)
+ zap->zap_m.zap_alloc_next = 0;
+ mze_insert(zap, i, zn->zn_hash);
+ return;
+ }
+ }
+ if (start != 0) {
+ start = 0;
+ goto again;
+ }
+ cmn_err(CE_PANIC, "out of entries!");
+}
+
+static int
+zap_add_impl(zap_t *zap, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx, void *tag)
+{
+ const uint64_t *intval = val;
+ int err = 0;
+
+ zap_name_t *zn = zap_name_alloc(zap, key, 0);
+ if (zn == NULL) {
+ zap_unlockdir(zap, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+ if (!zap->zap_ismicro) {
+ err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(key) >= MZAP_NAME_LEN ||
+ !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
+ err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
+ if (err == 0) {
+ err = fzap_add(zn, integer_size, num_integers, val,
+ tag, tx);
+ }
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ } else {
+ if (mze_find(zn) != NULL) {
+ err = SET_ERROR(EEXIST);
+ } else {
+ mzap_addent(zn, *intval);
+ }
+ }
+ ASSERT(zap == zn->zn_zap);
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_add() failed */
+ zap_unlockdir(zap, tag);
+ return (err);
+}
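+
+/*
+ * For illustration of the upgrade path above: adding a 32-byte value
+ * (integer_size == 1, num_integers == 32) cannot be stored in a microzap's
+ * single 64-bit slot, so zap_add_impl() calls mzap_upgrade() and retries the
+ * insert through fzap_add(); the same happens for a key of MZAP_NAME_LEN or
+ * more characters, or when mze_canfit_fzap_leaf() reports the collision
+ * chain would overflow a fatzap leaf.
+ */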
+
+int
+zap_add(objset_t *os, uint64_t zapobj, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
+ /* zap_add_impl() calls zap_unlockdir() */
+ return (err);
+}
+
+int
+zap_add_by_dnode(dnode_t *dn, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
+ /* zap_add_impl() calls zap_unlockdir() */
+ return (err);
+}
+
+int
+zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_add() failed */
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_update(objset_t *os, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ const uint64_t *intval = val;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc(zap, name, 0);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ if (!zap->zap_ismicro) {
+ err = fzap_update(zn, integer_size, num_integers, val,
+ FTAG, tx);
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(name) >= MZAP_NAME_LEN) {
+ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+ zapobj, integer_size, num_integers, name);
+ err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+ if (err == 0) {
+ err = fzap_update(zn, integer_size, num_integers,
+ val, FTAG, tx);
+ }
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze != NULL) {
+ MZE_PHYS(zap, mze)->mze_value = *intval;
+ } else {
+ mzap_addent(zn, *intval);
+ }
+ }
+ ASSERT(zap == zn->zn_zap);
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
+{
+ return (zap_remove_norm(os, zapobj, name, 0, tx));
+}
+
+static int
+zap_remove_impl(zap_t *zap, const char *name,
+ matchtype_t mt, dmu_tx_t *tx)
+{
+ int err = 0;
+
+ zap_name_t *zn = zap_name_alloc(zap, name, mt);
+ if (zn == NULL)
+ return (SET_ERROR(ENOTSUP));
+ if (!zap->zap_ismicro) {
+ err = fzap_remove(zn, tx);
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ zap->zap_m.zap_num_entries--;
+ bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
+ sizeof (mzap_ent_phys_t));
+ mze_remove(zap, mze);
+ }
+ }
+ zap_name_free(zn);
+ return (err);
+}
+
+int
+zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
+ matchtype_t mt, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+ if (err)
+ return (err);
+ err = zap_remove_impl(zap, name, mt, tx);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+ if (err)
+ return (err);
+ err = zap_remove_impl(zap, name, 0, tx);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_remove(zn, tx);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+static void
+zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized, boolean_t prefetch)
+{
+ zc->zc_objset = os;
+ zc->zc_zap = NULL;
+ zc->zc_leaf = NULL;
+ zc->zc_zapobj = zapobj;
+ zc->zc_serialized = serialized;
+ zc->zc_hash = 0;
+ zc->zc_cd = 0;
+ zc->zc_prefetch = prefetch;
+}
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized)
+{
+ zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
+}
+
+/*
+ * Initialize a cursor at the beginning of the ZAP object. The entire
+ * ZAP object will be prefetched.
+ */
+void
+zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+ zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
+}
+
+/*
+ * Initialize a cursor at the beginning, but request that we not prefetch
+ * the entire ZAP object.
+ */
+void
+zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+ zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
+}
+
+void
+zap_cursor_fini(zap_cursor_t *zc)
+{
+ if (zc->zc_zap) {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ zap_unlockdir(zc->zc_zap, NULL);
+ zc->zc_zap = NULL;
+ }
+ if (zc->zc_leaf) {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
+ zc->zc_objset = NULL;
+}
+
+uint64_t
+zap_cursor_serialize(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return (-1ULL);
+ if (zc->zc_zap == NULL)
+ return (zc->zc_serialized);
+ ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
+ ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
+
+ /*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this. So usually use a small
+ * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
+ * of the cursor.
+ *
+ * [ collision differentiator | zap_hashbits()-bit hash value ]
+ */
+ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
+ ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
+}
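+
+/*
+ * Worked example of the layout above for a non-HASH64 zap
+ * (zap_hashbits() == 28): a cursor with zc_hash == 0xABCDEF0ULL << 36 and
+ * zc_cd == 3 serializes to 0xABCDEF0 | (3 << 28) == 0x3ABCDEF0, which fits
+ * in 32 bits.  zap_cursor_retrieve() below reverses this: the << (64 - 28)
+ * shift drops the cd bits off the top, restoring zc_hash, and
+ * serialized >> 28 restores the cd of 3.
+ */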
+
+int
+zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err;
+
+ if (zc->zc_hash == -1ULL)
+ return (SET_ERROR(ENOENT));
+
+ if (zc->zc_zap == NULL) {
+ int hb;
+ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+ RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
+ if (err != 0)
+ return (err);
+
+ /*
+ * To support zap_cursor_init_serialized, advance, retrieve,
+ * we must add to the existing zc_cd, which may already
+ * be 1 due to the zap_cursor_advance.
+ */
+ ASSERT(zc->zc_hash == 0);
+ hb = zap_hashbits(zc->zc_zap);
+ zc->zc_hash = zc->zc_serialized << (64 - hb);
+ zc->zc_cd += zc->zc_serialized >> hb;
+ if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
+ zc->zc_cd = 0;
+ } else {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ }
+ if (!zc->zc_zap->zap_ismicro) {
+ err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
+ } else {
+ avl_index_t idx;
+ mzap_ent_t mze_tofind;
+
+ mze_tofind.mze_hash = zc->zc_hash;
+ mze_tofind.mze_cd = zc->zc_cd;
+
+ mzap_ent_t *mze =
+ avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
+ if (mze == NULL) {
+ mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
+ idx, AVL_AFTER);
+ }
+ if (mze) {
+ mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
+ ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
+ za->za_normalization_conflict =
+ mzap_normalization_conflict(zc->zc_zap, NULL, mze);
+ za->za_integer_length = 8;
+ za->za_num_integers = 1;
+ za->za_first_integer = mzep->mze_value;
+ (void) strlcpy(za->za_name, mzep->mze_name,
+ sizeof (za->za_name));
+ zc->zc_hash = mze->mze_hash;
+ zc->zc_cd = mze->mze_cd;
+ err = 0;
+ } else {
+ zc->zc_hash = -1ULL;
+ err = SET_ERROR(ENOENT);
+ }
+ }
+ rw_exit(&zc->zc_zap->zap_rwlock);
+ return (err);
+}
+
+void
+zap_cursor_advance(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return;
+ zc->zc_cd++;
+}
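+
+/*
+ * Canonical iteration pattern built from the cursor routines above (a
+ * sketch; the loop ends when zap_cursor_retrieve() returns ENOENT):
+ *
+ *     zap_cursor_t zc;
+ *     zap_attribute_t za;
+ *
+ *     for (zap_cursor_init(&zc, os, zapobj);
+ *         zap_cursor_retrieve(&zc, &za) == 0;
+ *         zap_cursor_advance(&zc)) {
+ *             ... consume za.za_name / za.za_first_integer ...
+ *     }
+ *     zap_cursor_fini(&zc);
+ */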
+
+int
+zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+
+ bzero(zs, sizeof (zap_stats_t));
+
+ if (zap->zap_ismicro) {
+ zs->zs_blocksize = zap->zap_dbuf->db_size;
+ zs->zs_num_entries = zap->zap_m.zap_num_entries;
+ zs->zs_num_blocks = 1;
+ } else {
+ fzap_get_stats(zap, zs);
+ }
+ zap_unlockdir(zap, FTAG);
+ return (0);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zap_create);
+EXPORT_SYMBOL(zap_create_dnsize);
+EXPORT_SYMBOL(zap_create_norm);
+EXPORT_SYMBOL(zap_create_norm_dnsize);
+EXPORT_SYMBOL(zap_create_flags);
+EXPORT_SYMBOL(zap_create_flags_dnsize);
+EXPORT_SYMBOL(zap_create_claim);
+EXPORT_SYMBOL(zap_create_claim_norm);
+EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
+EXPORT_SYMBOL(zap_create_hold);
+EXPORT_SYMBOL(zap_destroy);
+EXPORT_SYMBOL(zap_lookup);
+EXPORT_SYMBOL(zap_lookup_by_dnode);
+EXPORT_SYMBOL(zap_lookup_norm);
+EXPORT_SYMBOL(zap_lookup_uint64);
+EXPORT_SYMBOL(zap_contains);
+EXPORT_SYMBOL(zap_prefetch);
+EXPORT_SYMBOL(zap_prefetch_uint64);
+EXPORT_SYMBOL(zap_add);
+EXPORT_SYMBOL(zap_add_by_dnode);
+EXPORT_SYMBOL(zap_add_uint64);
+EXPORT_SYMBOL(zap_update);
+EXPORT_SYMBOL(zap_update_uint64);
+EXPORT_SYMBOL(zap_length);
+EXPORT_SYMBOL(zap_length_uint64);
+EXPORT_SYMBOL(zap_remove);
+EXPORT_SYMBOL(zap_remove_by_dnode);
+EXPORT_SYMBOL(zap_remove_norm);
+EXPORT_SYMBOL(zap_remove_uint64);
+EXPORT_SYMBOL(zap_count);
+EXPORT_SYMBOL(zap_value_search);
+EXPORT_SYMBOL(zap_join);
+EXPORT_SYMBOL(zap_join_increment);
+EXPORT_SYMBOL(zap_add_int);
+EXPORT_SYMBOL(zap_remove_int);
+EXPORT_SYMBOL(zap_lookup_int);
+EXPORT_SYMBOL(zap_increment_int);
+EXPORT_SYMBOL(zap_add_int_key);
+EXPORT_SYMBOL(zap_lookup_int_key);
+EXPORT_SYMBOL(zap_increment);
+EXPORT_SYMBOL(zap_cursor_init);
+EXPORT_SYMBOL(zap_cursor_fini);
+EXPORT_SYMBOL(zap_cursor_retrieve);
+EXPORT_SYMBOL(zap_cursor_advance);
+EXPORT_SYMBOL(zap_cursor_serialize);
+EXPORT_SYMBOL(zap_cursor_init_serialized);
+EXPORT_SYMBOL(zap_get_stats);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zcp.c b/sys/contrib/openzfs/module/zfs/zcp.c
new file mode 100644
index 000000000000..1ad53eae1eef
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp.c
@@ -0,0 +1,1451 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2018 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZFS Channel Programs (ZCP)
+ *
+ * The ZCP interface allows various ZFS administrative operations (e.g.
+ * creating and destroying snapshots, typically performed via an ioctl to
+ * /dev/zfs by the zfs(8) command and libzfs/libzfs_core) to be run
+ * programmatically as a Lua script. A ZCP
+ * script is run as a dsl_sync_task and fully executed during one transaction
+ * group sync. This ensures that no other changes can be written concurrently
+ * with a running Lua script. Combining multiple calls to the exposed ZFS
+ * functions into one script gives a number of benefits:
+ *
+ * 1. Atomicity. For some compound or iterative operations, it's useful to be
+ * able to guarantee that the state of a pool has not changed between calls to
+ * ZFS.
+ *
+ * 2. Performance. If a large number of changes need to be made (e.g. deleting
+ * many filesystems), there can be a significant performance penalty as a
+ * result of the need to wait for a transaction group sync to pass for every
+ * single operation. When expressed as a single ZCP script, all these changes
+ * can be performed at once in one txg sync.
+ *
+ * A modified version of the Lua 5.2 interpreter is used to run channel program
+ * scripts. The Lua 5.2 manual can be found at:
+ *
+ * http://www.lua.org/manual/5.2/
+ *
+ * If being run by a user (via an ioctl syscall), executing a ZCP script
+ * requires root privileges in the global zone.
+ *
+ * Scripts are passed to zcp_eval() as a string, then run in a synctask by
+ * zcp_eval_sync(). Arguments can be passed into the Lua script as an nvlist,
+ * which will be converted to a Lua table. Similarly, values returned from
+ * a ZCP script will be converted to an nvlist. See zcp_lua_to_nvlist_impl()
+ * for details on exact allowed types and conversion.
+ *
+ * ZFS functionality is exposed to a ZCP script as a library of function calls.
+ * These calls are sorted into submodules, such as zfs.list and zfs.sync, for
+ * iterators and synctasks, respectively. Each of these submodules resides in
+ * its own source file, with a zcp_*_info structure describing each library
+ * call in the submodule.
+ *
+ * Error handling in ZCP scripts is handled by a number of different methods
+ * based on severity:
+ *
+ * 1. Memory and time limits are in place to prevent a channel program from
+ * consuming excessive system resources or running forever. If one of these
+ * limits is
+ * hit, the channel program will be stopped immediately and return from
+ * zcp_eval() with an error code. No attempt will be made to roll back or undo
+ * any changes made by the channel program before the error occurred.
+ * Consumers invoking zcp_eval() from elsewhere in the kernel may pass a time
+ * limit of 0, disabling the time limit.
+ *
+ * 2. Internal Lua errors can occur as a result of a syntax error, calling a
+ * library function with incorrect arguments, invoking the error() function,
+ * failing an assert(), or other runtime errors. In these cases the channel
+ * program will stop executing and return from zcp_eval() with an error code.
+ * In place of a return value, an error message will also be returned in the
+ * 'result' nvlist containing information about the error. No attempt will be
+ * made to roll back or undo any changes made by the channel program before the
+ * error occurred.
+ *
+ * 3. If an error occurs inside a ZFS library call which returns an error code,
+ * the error is returned to the Lua script to be handled as desired.
+ *
+ * In the first two cases, Lua's error-throwing mechanism is used, which
+ * longjumps out of the script execution with luaL_error() and returns with the
+ * error.
+ *
+ * See zfs-program(8) for more information on high level usage.
+ */
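+
+/*
+ * Hypothetical illustration of the nvlist <-> Lua argument flow described
+ * above, using the fnvlist helpers that appear later in this file (key names
+ * invented):
+ *
+ *     nvlist_t *argnvl = fnvlist_alloc();
+ *     fnvlist_add_string(argnvl, "target", "pool/fs@snap");
+ *     fnvlist_add_int64(argnvl, "limit", 10);
+ *
+ * zcp_nvlist_to_lua() (below) surfaces this to the script as the Lua table
+ * { target = "pool/fs@snap", limit = 10 }, and zcp_lua_to_nvlist() performs
+ * the reverse conversion for the script's return value.
+ */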
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lualib.h>
+#include <sys/lua/lauxlib.h>
+
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/zcp.h>
+#include <sys/zcp_iter.h>
+#include <sys/zcp_prop.h>
+#include <sys/zcp_global.h>
+#include <sys/zvol.h>
+
+#ifndef KM_NORMALPRI
+#define KM_NORMALPRI 0
+#endif
+
+#define ZCP_NVLIST_MAX_DEPTH 20
+
+uint64_t zfs_lua_check_instrlimit_interval = 100;
+unsigned long zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT;
+unsigned long zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT;
+
+/*
+ * Forward declarations for mutually recursive functions
+ */
+static int zcp_nvpair_value_to_lua(lua_State *, nvpair_t *, char *, int);
+static int zcp_lua_to_nvlist_impl(lua_State *, int, nvlist_t *, const char *,
+ int);
+
+/*
+ * The outer-most error callback handler for use with lua_pcall(). On
+ * error Lua will call this callback with a single argument that
+ * represents the error value. In most cases this will be a string
+ * containing an error message, but channel programs can use Lua's
+ * error() function to return arbitrary objects as errors. This callback
+ * returns (on the Lua stack) the original error object along with a traceback.
+ *
+ * Fatal Lua errors can occur while resources are held, so we also call any
+ * registered cleanup function here.
+ */
+static int
+zcp_error_handler(lua_State *state)
+{
+ const char *msg;
+
+ zcp_cleanup(state);
+
+ VERIFY3U(1, ==, lua_gettop(state));
+ msg = lua_tostring(state, 1);
+ luaL_traceback(state, state, msg, 1);
+ return (1);
+}
+
+int
+zcp_argerror(lua_State *state, int narg, const char *msg, ...)
+{
+ va_list alist;
+
+ va_start(alist, msg);
+ const char *buf = lua_pushvfstring(state, msg, alist);
+ va_end(alist);
+
+ return (luaL_argerror(state, narg, buf));
+}
+
+/*
+ * Install a new cleanup function, which will be invoked with the given
+ * opaque argument if a fatal error causes the Lua interpreter to longjump out
+ * of a function call.
+ *
+ * If an error occurs, the cleanup function will be invoked exactly once and
+ * then unregistered.
+ *
+ * Returns the registered cleanup handler so the caller can deregister it
+ * if no error occurs.
+ */
+zcp_cleanup_handler_t *
+zcp_register_cleanup(lua_State *state, zcp_cleanup_t cleanfunc, void *cleanarg)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ zcp_cleanup_handler_t *zch = kmem_alloc(sizeof (*zch), KM_SLEEP);
+ zch->zch_cleanup_func = cleanfunc;
+ zch->zch_cleanup_arg = cleanarg;
+ list_insert_head(&ri->zri_cleanup_handlers, zch);
+
+ return (zch);
+}
+
+void
+zcp_deregister_cleanup(lua_State *state, zcp_cleanup_handler_t *zch)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+ list_remove(&ri->zri_cleanup_handlers, zch);
+ kmem_free(zch, sizeof (*zch));
+}
+
+/*
+ * Execute the currently registered cleanup handlers then free them and
+ * destroy the handler list.
+ */
+void
+zcp_cleanup(lua_State *state)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ for (zcp_cleanup_handler_t *zch =
+ list_remove_head(&ri->zri_cleanup_handlers); zch != NULL;
+ zch = list_remove_head(&ri->zri_cleanup_handlers)) {
+ zch->zch_cleanup_func(zch->zch_cleanup_arg);
+ kmem_free(zch, sizeof (*zch));
+ }
+}
+
+/*
+ * Convert the lua table at the given index on the Lua stack to an nvlist
+ * and return it.
+ *
+ * If the table can not be converted for any reason, NULL is returned and
+ * an error message is pushed onto the Lua stack.
+ */
+static nvlist_t *
+zcp_table_to_nvlist(lua_State *state, int index, int depth)
+{
+ nvlist_t *nvl;
+ /*
+ * Converting a Lua table to an nvlist with key uniqueness checking is
+ * O(n^2) in the number of keys in the nvlist, which can take a long
+ * time when we return a large table from a channel program.
+ * Furthermore, Lua's table interface *almost* guarantees unique keys
+ * on its own (details below). Therefore, we don't use fnvlist_alloc()
+ * here to avoid the built-in uniqueness checking.
+ *
+ * The *almost* is because it's possible to have key collisions between
+ * e.g. the string "1" and the number 1, or the string "true" and the
+ * boolean true, so we explicitly check that when we're looking at a
+ * key which is an integer / boolean or a string that can be parsed as
+ * one of those types. In the worst case this could still devolve into
+ * O(n^2), so we only start doing these checks on boolean/integer keys
+ * once we've seen a string key which fits this weird usage pattern.
+ *
+ * Ultimately, we still want callers to know that the keys in this
+ * nvlist are unique, so before we return this we set the nvlist's
+ * flags to reflect that.
+ */
+ VERIFY0(nvlist_alloc(&nvl, 0, KM_SLEEP));
+
+ /*
+ * Push an empty stack slot where lua_next() will store each
+ * table key.
+ */
+ lua_pushnil(state);
+ boolean_t saw_str_could_collide = B_FALSE;
+ while (lua_next(state, index) != 0) {
+ /*
+ * The next key-value pair from the table at index is
+ * now on the stack, with the key at stack slot -2 and
+ * the value at slot -1.
+ */
+ int err = 0;
+ char buf[32];
+ const char *key = NULL;
+ boolean_t key_could_collide = B_FALSE;
+
+ switch (lua_type(state, -2)) {
+ case LUA_TSTRING:
+ key = lua_tostring(state, -2);
+
+ /* check if this could collide with a number or bool */
+ long long tmp;
+ int parselen;
+ if ((sscanf(key, "%lld%n", &tmp, &parselen) > 0 &&
+ parselen == strlen(key)) ||
+ strcmp(key, "true") == 0 ||
+ strcmp(key, "false") == 0) {
+ key_could_collide = B_TRUE;
+ saw_str_could_collide = B_TRUE;
+ }
+ break;
+ case LUA_TBOOLEAN:
+ key = (lua_toboolean(state, -2) == B_TRUE ?
+ "true" : "false");
+ if (saw_str_could_collide) {
+ key_could_collide = B_TRUE;
+ }
+ break;
+ case LUA_TNUMBER:
+ VERIFY3U(sizeof (buf), >,
+ snprintf(buf, sizeof (buf), "%lld",
+ (longlong_t)lua_tonumber(state, -2)));
+ key = buf;
+ if (saw_str_could_collide) {
+ key_could_collide = B_TRUE;
+ }
+ break;
+ default:
+ fnvlist_free(nvl);
+ (void) lua_pushfstring(state, "Invalid key "
+ "type '%s' in table",
+ lua_typename(state, lua_type(state, -2)));
+ return (NULL);
+ }
+ /*
+ * Check for type-mismatched key collisions, and throw an error.
+ */
+ if (key_could_collide && nvlist_exists(nvl, key)) {
+ fnvlist_free(nvl);
+ (void) lua_pushfstring(state, "Collision of "
+ "key '%s' in table", key);
+ return (NULL);
+ }
+ /*
+ * Recursively convert the table value and insert into
+ * the new nvlist with the parsed key. To prevent
+ * stack overflow on circular or heavily nested tables,
+ * we track the current nvlist depth.
+ */
+ if (depth >= ZCP_NVLIST_MAX_DEPTH) {
+ fnvlist_free(nvl);
+ (void) lua_pushfstring(state, "Maximum table "
+ "depth (%d) exceeded for table",
+ ZCP_NVLIST_MAX_DEPTH);
+ return (NULL);
+ }
+ err = zcp_lua_to_nvlist_impl(state, -1, nvl, key,
+ depth + 1);
+ if (err != 0) {
+ fnvlist_free(nvl);
+ /*
+ * Error message has been pushed to the lua
+ * stack by the recursive call.
+ */
+ return (NULL);
+ }
+ /*
+ * Pop the value pushed by lua_next().
+ */
+ lua_pop(state, 1);
+ }
+
+ /*
+ * Mark the nvlist as having unique keys. This is a little ugly, but we
+ * ensured above that there are no duplicate keys in the nvlist.
+ */
+ nvl->nvl_nvflag |= NV_UNIQUE_NAME;
+
+ return (nvl);
+}
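+
+/*
+ * For illustration of the collision handling above: a table containing both
+ * [1] = "a" and ["1"] = "b" produces two Lua keys that stringify to "1";
+ * whichever is seen second trips the nvlist_exists() check and the table is
+ * rejected with the "Collision of key" error rather than silently storing
+ * duplicate names.
+ */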
+
+/*
+ * Convert a value from the given index into the lua stack to an nvpair, adding
+ * it to an nvlist with the given key.
+ *
+ * Values are converted as follows:
+ *
+ * string -> string
+ * number -> int64
+ * boolean -> boolean
+ * nil -> boolean (no value)
+ *
+ * Lua tables are converted to nvlists and then inserted. The table's keys
+ * are converted to strings then used as keys in the nvlist to store each table
+ * element. Keys are converted as follows:
+ *
+ * string -> no change
+ * number -> "%lld"
+ * boolean -> "true" | "false"
+ * nil -> error
+ *
+ * In the case of a key collision, an error is thrown.
+ *
+ * If an error is encountered, a nonzero error code is returned, and an error
+ * string will be pushed onto the Lua stack.
+ */
+static int
+zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl,
+ const char *key, int depth)
+{
+ /*
+ * Verify that we have enough remaining space in the lua stack to parse
+ * a key-value pair and push an error.
+ */
+ if (!lua_checkstack(state, 3)) {
+ (void) lua_pushstring(state, "Lua stack overflow");
+ return (1);
+ }
+
+ index = lua_absindex(state, index);
+
+ switch (lua_type(state, index)) {
+ case LUA_TNIL:
+ fnvlist_add_boolean(nvl, key);
+ break;
+ case LUA_TBOOLEAN:
+ fnvlist_add_boolean_value(nvl, key,
+ lua_toboolean(state, index));
+ break;
+ case LUA_TNUMBER:
+ fnvlist_add_int64(nvl, key, lua_tonumber(state, index));
+ break;
+ case LUA_TSTRING:
+ fnvlist_add_string(nvl, key, lua_tostring(state, index));
+ break;
+ case LUA_TTABLE: {
+ nvlist_t *value_nvl = zcp_table_to_nvlist(state, index, depth);
+ if (value_nvl == NULL)
+ return (SET_ERROR(EINVAL));
+
+ fnvlist_add_nvlist(nvl, key, value_nvl);
+ fnvlist_free(value_nvl);
+ break;
+ }
+ default:
+ (void) lua_pushfstring(state,
+ "Invalid value type '%s' for key '%s'",
+ lua_typename(state, lua_type(state, index)), key);
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (0);
+}
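+
+/*
+ * Example of the mapping above (hypothetical values): converting the Lua
+ * table {pool = "tank", count = 3, ok = true} yields an nvlist holding the
+ * string "pool" = "tank", the int64 "count" = 3 and the boolean_value
+ * "ok" = true.  A nil value becomes a valueless boolean nvpair, which is
+ * also the form zcp_nvlist_to_lua() treats specially below.
+ */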
+
+/*
+ * Convert a lua value to an nvpair, adding it to an nvlist with the given key.
+ */
+static void
+zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key)
+{
+ /*
+ * On error, zcp_lua_to_nvlist_impl pushes an error string onto the Lua
+ * stack before returning with a nonzero error code. If an error is
+ * returned, throw a fatal lua error with the given string.
+ */
+ if (zcp_lua_to_nvlist_impl(state, index, nvl, key, 0) != 0)
+ (void) lua_error(state);
+}
+
+static int
+zcp_lua_to_nvlist_helper(lua_State *state)
+{
+ nvlist_t *nv = (nvlist_t *)lua_touserdata(state, 2);
+ const char *key = (const char *)lua_touserdata(state, 1);
+ zcp_lua_to_nvlist(state, 3, nv, key);
+ return (0);
+}
+
+static void
+zcp_convert_return_values(lua_State *state, nvlist_t *nvl,
+ const char *key, int *result)
+{
+ int err;
+ VERIFY3U(1, ==, lua_gettop(state));
+ lua_pushcfunction(state, zcp_lua_to_nvlist_helper);
+ lua_pushlightuserdata(state, (char *)key);
+ lua_pushlightuserdata(state, nvl);
+ lua_pushvalue(state, 1);
+ lua_remove(state, 1);
+ err = lua_pcall(state, 3, 0, 0); /* zcp_lua_to_nvlist_helper */
+ if (err != 0) {
+ zcp_lua_to_nvlist(state, 1, nvl, ZCP_RET_ERROR);
+ *result = SET_ERROR(ECHRNG);
+ }
+}
+
+/*
+ * Push a Lua table representing nvl onto the stack. If it can't be
+ * converted, return EINVAL, fill in errbuf, and push nothing. errbuf may
+ * be specified as NULL, in which case no error string will be output.
+ *
+ * Most nvlists are converted as simple key->value Lua tables, but we make
+ * an exception for the case where all nvlist entries are BOOLEANs (a string
+ * key without a value). In Lua, a table key pointing to a value of Nil
+ * (no value) is equivalent to the key not existing, so a BOOLEAN nvlist
+ * entry can't be directly converted to a Lua table entry. Nvlists of entirely
+ * BOOLEAN entries are frequently used to pass around lists of datasets, so for
+ * convenience we check for this case, and convert it to a simple Lua array of
+ * strings.
+ */
+int
+zcp_nvlist_to_lua(lua_State *state, nvlist_t *nvl,
+ char *errbuf, int errbuf_len)
+{
+ nvpair_t *pair;
+ lua_newtable(state);
+ boolean_t has_values = B_FALSE;
+ /*
+ * If the list doesn't have any values, just convert it to a string
+ * array.
+ */
+ for (pair = nvlist_next_nvpair(nvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+ if (nvpair_type(pair) != DATA_TYPE_BOOLEAN) {
+ has_values = B_TRUE;
+ break;
+ }
+ }
+ if (!has_values) {
+ int i = 1;
+ for (pair = nvlist_next_nvpair(nvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+ (void) lua_pushinteger(state, i);
+ (void) lua_pushstring(state, nvpair_name(pair));
+ (void) lua_settable(state, -3);
+ i++;
+ }
+ } else {
+ for (pair = nvlist_next_nvpair(nvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+ int err = zcp_nvpair_value_to_lua(state, pair,
+ errbuf, errbuf_len);
+ if (err != 0) {
+ lua_pop(state, 1);
+ return (err);
+ }
+ (void) lua_setfield(state, -2, nvpair_name(pair));
+ }
+ }
+ return (0);
+}
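+
+/*
+ * For illustration of the special case above: an nvlist whose only entries
+ * are the valueless booleans "rpool/a" and "rpool/b" (a typical dataset
+ * list) is pushed as the Lua array { "rpool/a", "rpool/b" }; adding even one
+ * valued pair, say int64 "count" = 2, switches the conversion back to a
+ * key -> value table.
+ */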
+
+/*
+ * Push a Lua object representing the value of "pair" onto the stack.
+ *
+ * Only understands boolean_value, string, int64, nvlist,
+ * string_array, and int64_array type values. For other
+ * types, returns EINVAL, fills in errbuf, and pushes nothing.
+ */
+static int
+zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair,
+ char *errbuf, int errbuf_len)
+{
+ int err = 0;
+
+ if (pair == NULL) {
+ lua_pushnil(state);
+ return (0);
+ }
+
+ switch (nvpair_type(pair)) {
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) lua_pushboolean(state,
+ fnvpair_value_boolean_value(pair));
+ break;
+ case DATA_TYPE_STRING:
+ (void) lua_pushstring(state, fnvpair_value_string(pair));
+ break;
+ case DATA_TYPE_INT64:
+ (void) lua_pushinteger(state, fnvpair_value_int64(pair));
+ break;
+ case DATA_TYPE_NVLIST:
+ err = zcp_nvlist_to_lua(state,
+ fnvpair_value_nvlist(pair), errbuf, errbuf_len);
+ break;
+ case DATA_TYPE_STRING_ARRAY: {
+ char **strarr;
+ uint_t nelem;
+ (void) nvpair_value_string_array(pair, &strarr, &nelem);
+ lua_newtable(state);
+ for (int i = 0; i < nelem; i++) {
+ (void) lua_pushinteger(state, i + 1);
+ (void) lua_pushstring(state, strarr[i]);
+ (void) lua_settable(state, -3);
+ }
+ break;
+ }
+ case DATA_TYPE_UINT64_ARRAY: {
+ uint64_t *intarr;
+ uint_t nelem;
+ (void) nvpair_value_uint64_array(pair, &intarr, &nelem);
+ lua_newtable(state);
+ for (int i = 0; i < nelem; i++) {
+ (void) lua_pushinteger(state, i + 1);
+ (void) lua_pushinteger(state, intarr[i]);
+ (void) lua_settable(state, -3);
+ }
+ break;
+ }
+ case DATA_TYPE_INT64_ARRAY: {
+ int64_t *intarr;
+ uint_t nelem;
+ (void) nvpair_value_int64_array(pair, &intarr, &nelem);
+ lua_newtable(state);
+ for (int i = 0; i < nelem; i++) {
+ (void) lua_pushinteger(state, i + 1);
+ (void) lua_pushinteger(state, intarr[i]);
+ (void) lua_settable(state, -3);
+ }
+ break;
+ }
+ default: {
+ if (errbuf != NULL) {
+ (void) snprintf(errbuf, errbuf_len,
+ "Unhandled nvpair type %d for key '%s'",
+ nvpair_type(pair), nvpair_name(pair));
+ }
+ return (SET_ERROR(EINVAL));
+ }
+ }
+ return (err);
+}
+
+int
+zcp_dataset_hold_error(lua_State *state, dsl_pool_t *dp, const char *dsname,
+ int error)
+{
+ if (error == ENOENT) {
+ (void) zcp_argerror(state, 1, "no such dataset '%s'", dsname);
+ return (0); /* not reached; zcp_argerror will longjmp */
+ } else if (error == EXDEV) {
+ (void) zcp_argerror(state, 1,
+ "dataset '%s' is not in the target pool '%s'",
+ dsname, spa_name(dp->dp_spa));
+ return (0); /* not reached; zcp_argerror will longjmp */
+ } else if (error == EIO) {
+ (void) luaL_error(state,
+ "I/O error while accessing dataset '%s'", dsname);
+ return (0); /* not reached; luaL_error will longjmp */
+ } else if (error != 0) {
+ (void) luaL_error(state,
+ "unexpected error %d while accessing dataset '%s'",
+ error, dsname);
+ return (0); /* not reached; luaL_error will longjmp */
+ }
+ return (0);
+}
+
+/*
+ * Note: will longjmp (via lua_error()) on error.
+ * Assumes that the dsname is argument #1 (for error reporting purposes).
+ */
+dsl_dataset_t *
+zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname,
+ void *tag)
+{
+ dsl_dataset_t *ds;
+ int error = dsl_dataset_hold(dp, dsname, tag, &ds);
+ (void) zcp_dataset_hold_error(state, dp, dsname, error);
+ return (ds);
+}
+
+static int zcp_debug(lua_State *);
+static zcp_lib_info_t zcp_debug_info = {
+ .name = "debug",
+ .func = zcp_debug,
+ .pargs = {
+ { .za_name = "debug string", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
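+/*
+ * Implementation of the Lua zfs.debug(string) call: write the given message
+ * to the kernel ZFS debug log (zfs_dbgmsg), tagged with the txg of the
+ * currently executing channel program. For example, a script might call
+ * zfs.debug("destroying " .. snap) to leave a trace in the debug log.
+ */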
+static int
+zcp_debug(lua_State *state)
+{
+ const char *dbgstring;
+ zcp_run_info_t *ri = zcp_run_info(state);
+ zcp_lib_info_t *libinfo = &zcp_debug_info;
+
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+
+ dbgstring = lua_tostring(state, 1);
+
+ zfs_dbgmsg("txg %lld ZCP: %s", ri->zri_tx->tx_txg, dbgstring);
+
+ return (0);
+}
+
+static int zcp_exists(lua_State *);
+static zcp_lib_info_t zcp_exists_info = {
+ .name = "exists",
+ .func = zcp_exists,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
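+/*
+ * Implementation of the Lua zfs.exists(dataset) call: returns true if the
+ * named dataset exists in the pool the program is running against, false if
+ * it does not, and raises a Lua error on any other failure, e.g.:
+ *
+ * if zfs.exists("rpool/home") then ... end
+ */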
+static int
+zcp_exists(lua_State *state)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+ dsl_pool_t *dp = ri->zri_pool;
+ zcp_lib_info_t *libinfo = &zcp_exists_info;
+
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+
+ const char *dsname = lua_tostring(state, 1);
+
+ dsl_dataset_t *ds;
+ int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ lua_pushboolean(state, B_TRUE);
+ } else if (error == ENOENT) {
+ lua_pushboolean(state, B_FALSE);
+ } else if (error == EXDEV) {
+ return (luaL_error(state, "dataset '%s' is not in the "
+ "target pool", dsname));
+ } else if (error == EIO) {
+ return (luaL_error(state, "I/O error opening dataset '%s'",
+ dsname));
+ } else if (error != 0) {
+ return (luaL_error(state, "unexpected error %d", error));
+ }
+
+ return (1);
+}
+
+/*
+ * Allocate/realloc/free a buffer for the lua interpreter.
+ *
+ * When nsize is 0, behaves as free() and returns NULL.
+ *
+ * If ptr is NULL, behaves as malloc() and returns an allocated buffer of size
+ * at least nsize.
+ *
+ * Otherwise, behaves as realloc(), changing the allocation from osize to nsize.
+ * Shrinking the buffer size never fails.
+ *
+ * The original allocated buffer size is stored as an int64 at the beginning of
+ * the buffer to avoid actually reallocating when shrinking a buffer, since lua
+ * requires that this operation never fail.
+ */
+static void *
+zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize)
+{
+ zcp_alloc_arg_t *allocargs = ud;
+
+ if (nsize == 0) {
+ if (ptr != NULL) {
+ int64_t *allocbuf = (int64_t *)ptr - 1;
+ int64_t allocsize = *allocbuf;
+ ASSERT3S(allocsize, >, 0);
+ ASSERT3S(allocargs->aa_alloc_remaining + allocsize, <=,
+ allocargs->aa_alloc_limit);
+ allocargs->aa_alloc_remaining += allocsize;
+ vmem_free(allocbuf, allocsize);
+ }
+ return (NULL);
+ } else if (ptr == NULL) {
+ int64_t *allocbuf;
+ int64_t allocsize = nsize + sizeof (int64_t);
+
+ if (!allocargs->aa_must_succeed &&
+ (allocsize <= 0 ||
+ allocsize > allocargs->aa_alloc_remaining)) {
+ return (NULL);
+ }
+
+ allocbuf = vmem_alloc(allocsize, KM_SLEEP);
+ allocargs->aa_alloc_remaining -= allocsize;
+
+ *allocbuf = allocsize;
+ return (allocbuf + 1);
+ } else if (nsize <= osize) {
+ /*
+ * If shrinking the buffer, lua requires that the reallocation
+ * never fail.
+ */
+ return (ptr);
+ } else {
+ ASSERT3U(nsize, >, osize);
+
+ uint64_t *luabuf = zcp_lua_alloc(ud, NULL, 0, nsize);
+ if (luabuf == NULL) {
+ return (NULL);
+ }
+ (void) memcpy(luabuf, ptr, osize);
+ VERIFY3P(zcp_lua_alloc(ud, ptr, osize, 0), ==, NULL);
+ return (luabuf);
+ }
+}
+
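+/*
+ * Lua debug hook, invoked every zfs_lua_check_instrlimit_interval
+ * instructions: raise a Lua error if the channel program has been canceled
+ * or has exceeded its instruction limit.
+ */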
+/* ARGSUSED */
+static void
+zcp_lua_counthook(lua_State *state, lua_Debug *ar)
+{
+ lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
+ zcp_run_info_t *ri = lua_touserdata(state, -1);
+
+ /*
+	 * Check whether we were canceled while waiting for the txg to sync,
+	 * or whether a signal was delivered to our open-context thread.
+ */
+ if (ri->zri_canceled ||
+ (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) {
+ ri->zri_canceled = B_TRUE;
+ (void) lua_pushstring(state, "Channel program was canceled.");
+ (void) lua_error(state);
+ /* Unreachable */
+ }
+
+ /*
+ * Check how many instructions the channel program has
+ * executed so far, and compare against the limit.
+ */
+ ri->zri_curinstrs += zfs_lua_check_instrlimit_interval;
+ if (ri->zri_maxinstrs != 0 && ri->zri_curinstrs > ri->zri_maxinstrs) {
+ ri->zri_timed_out = B_TRUE;
+ (void) lua_pushstring(state,
+ "Channel program timed out.");
+ (void) lua_error(state);
+ /* Unreachable */
+ }
+}
+
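+/*
+ * Lua panic callback. A panic means an error escaped all protected calls,
+ * which should not happen since the program itself runs under lua_pcall();
+ * if it does, crash loudly with the Lua error message.
+ */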
+static int
+zcp_panic_cb(lua_State *state)
+{
+ panic("unprotected error in call to Lua API (%s)\n",
+ lua_tostring(state, -1));
+ return (0);
+}
+
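+/*
+ * Common evaluation path for syncing and open context: install the run info
+ * and the instruction-count hook, invoke the compiled program under
+ * lua_pcall(), and translate its return value or error into zri_result and
+ * zri_outnvl.
+ */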
+static void
+zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri)
+{
+ int err;
+ lua_State *state = ri->zri_state;
+
+ VERIFY3U(3, ==, lua_gettop(state));
+
+ /* finish initializing our runtime state */
+ ri->zri_pool = dmu_tx_pool(tx);
+ ri->zri_tx = tx;
+ list_create(&ri->zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t),
+ offsetof(zcp_cleanup_handler_t, zch_node));
+
+ /*
+ * Store the zcp_run_info_t struct for this run in the Lua registry.
+ * Registry entries are not directly accessible by the Lua scripts but
+ * can be accessed by our callbacks.
+ */
+ lua_pushlightuserdata(state, ri);
+ lua_setfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
+ VERIFY3U(3, ==, lua_gettop(state));
+
+ /*
+	 * Tell the Lua interpreter to call our instruction-count hook every
+	 * zfs_lua_check_instrlimit_interval instructions. Channel programs
+	 * that execute too many instructions should die with ETIME.
+ */
+ (void) lua_sethook(state, zcp_lua_counthook, LUA_MASKCOUNT,
+ zfs_lua_check_instrlimit_interval);
+
+ /*
+ * Tell the Lua memory allocator to stop using KM_SLEEP before handing
+ * off control to the channel program. Channel programs that use too
+ * much memory should die with ENOSPC.
+ */
+ ri->zri_allocargs->aa_must_succeed = B_FALSE;
+
+ /*
+ * Call the Lua function that open-context passed us. This pops the
+ * function and its input from the stack and pushes any return
+ * or error values.
+ */
+ err = lua_pcall(state, 1, LUA_MULTRET, 1);
+
+ /*
+ * Let Lua use KM_SLEEP while we interpret the return values.
+ */
+ ri->zri_allocargs->aa_must_succeed = B_TRUE;
+
+ /*
+ * Remove the error handler callback from the stack. At this point,
+ * there shouldn't be any cleanup handler registered in the handler
+ * list (zri_cleanup_handlers), regardless of whether it ran or not.
+ */
+ list_destroy(&ri->zri_cleanup_handlers);
+ lua_remove(state, 1);
+
+ switch (err) {
+ case LUA_OK: {
+ /*
+ * Lua supports returning multiple values in a single return
+ * statement. Return values will have been pushed onto the
+ * stack:
+ * 1: Return value 1
+ * 2: Return value 2
+ * 3: etc...
+ * To simplify the process of retrieving a return value from a
+ * channel program, we disallow returning more than one value
+ * to ZFS from the Lua script, yielding a singleton return
+ * nvlist of the form { "return": Return value 1 }.
+ */
+ int return_count = lua_gettop(state);
+
+ if (return_count == 1) {
+ ri->zri_result = 0;
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_RETURN, &ri->zri_result);
+ } else if (return_count > 1) {
+ ri->zri_result = SET_ERROR(ECHRNG);
+ lua_settop(state, 0);
+ (void) lua_pushfstring(state, "Multiple return "
+ "values not supported");
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+ }
+ break;
+ }
+ case LUA_ERRRUN:
+ case LUA_ERRGCMM: {
+ /*
+ * The channel program encountered a fatal error within the
+ * script, such as failing an assertion, or calling a function
+ * with incompatible arguments. The error value and the
+ * traceback generated by zcp_error_handler() should be on the
+ * stack.
+ */
+ VERIFY3U(1, ==, lua_gettop(state));
+ if (ri->zri_timed_out) {
+ ri->zri_result = SET_ERROR(ETIME);
+ } else if (ri->zri_canceled) {
+ ri->zri_result = SET_ERROR(EINTR);
+ } else {
+ ri->zri_result = SET_ERROR(ECHRNG);
+ }
+
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+
+ if (ri->zri_result == ETIME && ri->zri_outnvl != NULL) {
+ (void) nvlist_add_uint64(ri->zri_outnvl,
+ ZCP_ARG_INSTRLIMIT, ri->zri_curinstrs);
+ }
+ break;
+ }
+ case LUA_ERRERR: {
+ /*
+ * The channel program encountered a fatal error within the
+ * script, and we encountered another error while trying to
+ * compute the traceback in zcp_error_handler(). We can only
+ * return the error message.
+ */
+ VERIFY3U(1, ==, lua_gettop(state));
+ if (ri->zri_timed_out) {
+ ri->zri_result = SET_ERROR(ETIME);
+ } else if (ri->zri_canceled) {
+ ri->zri_result = SET_ERROR(EINTR);
+ } else {
+ ri->zri_result = SET_ERROR(ECHRNG);
+ }
+
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+ break;
+ }
+ case LUA_ERRMEM:
+ /*
+ * Lua ran out of memory while running the channel program.
+ * There's not much we can do.
+ */
+ ri->zri_result = SET_ERROR(ENOSPC);
+ break;
+ default:
+ VERIFY0(err);
+ }
+}
+
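+/*
+ * Record a failure to open the pool: set the result to ECHRNG and convert
+ * the error message into the output nvlist.
+ */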
+static void
+zcp_pool_error(zcp_run_info_t *ri, const char *poolname)
+{
+ ri->zri_result = SET_ERROR(ECHRNG);
+ lua_settop(ri->zri_state, 0);
+ (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s",
+ poolname);
+ zcp_convert_return_values(ri->zri_state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+}
+
+/*
+ * This callback is called when txg_wait_synced_sig() encounters a signal.
+ * txg_wait_synced_sig() will continue to wait for the txg to complete
+ * after calling this callback.
+ */
+/* ARGSUSED */
+static void
+zcp_eval_sig(void *arg, dmu_tx_t *tx)
+{
+ zcp_run_info_t *ri = arg;
+
+ ri->zri_canceled = B_TRUE;
+}
+
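+/*
+ * Sync-task callback: the channel program runs here, in syncing context,
+ * with the Lua stack already prepared by zcp_eval().
+ */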
+static void
+zcp_eval_sync(void *arg, dmu_tx_t *tx)
+{
+ zcp_run_info_t *ri = arg;
+
+ /*
+ * Open context should have setup the stack to contain:
+ * 1: Error handler callback
+ * 2: Script to run (converted to a Lua function)
+ * 3: nvlist input to function (converted to Lua table or nil)
+ */
+ VERIFY3U(3, ==, lua_gettop(ri->zri_state));
+
+ zcp_eval_impl(tx, ri);
+}
+
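+/*
+ * Open-context evaluation path: run the channel program against a held pool
+ * using a throwaway transaction that is aborted afterwards, so no on-disk
+ * state is modified.
+ */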
+static void
+zcp_eval_open(zcp_run_info_t *ri, const char *poolname)
+{
+ int error;
+ dsl_pool_t *dp;
+ dmu_tx_t *tx;
+
+ /*
+ * See comment from the same assertion in zcp_eval_sync().
+ */
+ VERIFY3U(3, ==, lua_gettop(ri->zri_state));
+
+ error = dsl_pool_hold(poolname, FTAG, &dp);
+ if (error != 0) {
+ zcp_pool_error(ri, poolname);
+ return;
+ }
+
+ /*
+ * As we are running in open-context, we have no transaction associated
+ * with the channel program. At the same time, functions from the
+ * zfs.check submodule need to be associated with a transaction as
+ * they are basically dry-runs of their counterparts in the zfs.sync
+ * submodule. These functions should be able to run in open-context.
+ * Therefore we create a new transaction that we later abort once
+ * the channel program has been evaluated.
+ */
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+
+ zcp_eval_impl(tx, ri);
+
+ dmu_tx_abort(tx);
+
+ dsl_pool_rele(dp, FTAG);
+}
+
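+/*
+ * Top-level entry point for running a channel program. Sets up the Lua
+ * interpreter (allocator, core libraries, the zfs.* module, and the error
+ * handler), loads the program text and its nvlist argument, runs it in
+ * either syncing or open context, and returns the result in outnvl.
+ */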
+int
+zcp_eval(const char *poolname, const char *program, boolean_t sync,
+ uint64_t instrlimit, uint64_t memlimit, nvpair_t *nvarg, nvlist_t *outnvl)
+{
+ int err;
+ lua_State *state;
+ zcp_run_info_t runinfo;
+
+ if (instrlimit > zfs_lua_max_instrlimit)
+ return (SET_ERROR(EINVAL));
+ if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
+ return (SET_ERROR(EINVAL));
+
+ zcp_alloc_arg_t allocargs = {
+ .aa_must_succeed = B_TRUE,
+ .aa_alloc_remaining = (int64_t)memlimit,
+ .aa_alloc_limit = (int64_t)memlimit,
+ };
+
+ /*
+ * Creates a Lua state with a memory allocator that uses KM_SLEEP.
+ * This should never fail.
+ */
+ state = lua_newstate(zcp_lua_alloc, &allocargs);
+ VERIFY(state != NULL);
+ (void) lua_atpanic(state, zcp_panic_cb);
+
+ /*
+ * Load core Lua libraries we want access to.
+ */
+ VERIFY3U(1, ==, luaopen_base(state));
+ lua_pop(state, 1);
+ VERIFY3U(1, ==, luaopen_coroutine(state));
+ lua_setglobal(state, LUA_COLIBNAME);
+ VERIFY0(lua_gettop(state));
+ VERIFY3U(1, ==, luaopen_string(state));
+ lua_setglobal(state, LUA_STRLIBNAME);
+ VERIFY0(lua_gettop(state));
+ VERIFY3U(1, ==, luaopen_table(state));
+ lua_setglobal(state, LUA_TABLIBNAME);
+ VERIFY0(lua_gettop(state));
+
+ /*
+ * Load globally visible variables such as errno aliases.
+ */
+ zcp_load_globals(state);
+ VERIFY0(lua_gettop(state));
+
+ /*
+ * Load ZFS-specific modules.
+ */
+ lua_newtable(state);
+ VERIFY3U(1, ==, zcp_load_list_lib(state));
+ lua_setfield(state, -2, "list");
+ VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_FALSE));
+ lua_setfield(state, -2, "check");
+ VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_TRUE));
+ lua_setfield(state, -2, "sync");
+ VERIFY3U(1, ==, zcp_load_get_lib(state));
+ lua_pushcclosure(state, zcp_debug_info.func, 0);
+ lua_setfield(state, -2, zcp_debug_info.name);
+ lua_pushcclosure(state, zcp_exists_info.func, 0);
+ lua_setfield(state, -2, zcp_exists_info.name);
+ lua_setglobal(state, "zfs");
+ VERIFY0(lua_gettop(state));
+
+ /*
+ * Push the error-callback that calculates Lua stack traces on
+ * unexpected failures.
+ */
+ lua_pushcfunction(state, zcp_error_handler);
+ VERIFY3U(1, ==, lua_gettop(state));
+
+ /*
+ * Load the actual script as a function onto the stack as text ("t").
+ * The only valid error condition is a syntax error in the script.
+ * ERRMEM should not be possible because our allocator is using
+ * KM_SLEEP. ERRGCMM should not be possible because we have not added
+ * any objects with __gc metamethods to the interpreter that could
+ * fail.
+ */
+ err = luaL_loadbufferx(state, program, strlen(program),
+ "channel program", "t");
+ if (err == LUA_ERRSYNTAX) {
+ fnvlist_add_string(outnvl, ZCP_RET_ERROR,
+ lua_tostring(state, -1));
+ lua_close(state);
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY0(err);
+ VERIFY3U(2, ==, lua_gettop(state));
+
+ /*
+ * Convert the input nvlist to a Lua object and put it on top of the
+ * stack.
+ */
+ char errmsg[128];
+ err = zcp_nvpair_value_to_lua(state, nvarg,
+ errmsg, sizeof (errmsg));
+ if (err != 0) {
+ fnvlist_add_string(outnvl, ZCP_RET_ERROR, errmsg);
+ lua_close(state);
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY3U(3, ==, lua_gettop(state));
+
+ runinfo.zri_state = state;
+ runinfo.zri_allocargs = &allocargs;
+ runinfo.zri_outnvl = outnvl;
+ runinfo.zri_result = 0;
+ runinfo.zri_cred = CRED();
+ runinfo.zri_proc = curproc;
+ runinfo.zri_timed_out = B_FALSE;
+ runinfo.zri_canceled = B_FALSE;
+ runinfo.zri_sync = sync;
+ runinfo.zri_space_used = 0;
+ runinfo.zri_curinstrs = 0;
+ runinfo.zri_maxinstrs = instrlimit;
+ runinfo.zri_new_zvols = fnvlist_alloc();
+
+ if (sync) {
+ err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync,
+ zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL);
+ if (err != 0)
+ zcp_pool_error(&runinfo, poolname);
+ } else {
+ zcp_eval_open(&runinfo, poolname);
+ }
+ lua_close(state);
+
+ /*
+ * Create device minor nodes for any new zvols.
+ */
+ for (nvpair_t *pair = nvlist_next_nvpair(runinfo.zri_new_zvols, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(runinfo.zri_new_zvols, pair)) {
+ zvol_create_minor(nvpair_name(pair));
+ }
+ fnvlist_free(runinfo.zri_new_zvols);
+
+ return (runinfo.zri_result);
+}
+
+/*
+ * Retrieve metadata about the currently running channel program.
+ */
+zcp_run_info_t *
+zcp_run_info(lua_State *state)
+{
+ zcp_run_info_t *ri;
+
+ lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
+ ri = lua_touserdata(state, -1);
+ lua_pop(state, 1);
+ return (ri);
+}
+
+/*
+ * Argument Parsing
+ * ================
+ *
+ * The Lua language allows methods to be called with any number
+ * of arguments of any type. When calling back into ZFS we need to sanitize
+ * arguments from channel programs to make sure unexpected arguments or
+ * arguments of the wrong type result in clear error messages. To do this
+ * in a uniform way, all callbacks from channel programs should use the
+ * zcp_parse_args() function to interpret inputs.
+ *
+ * Positional vs Keyword Arguments
+ * ===============================
+ *
+ * Every callback function takes a fixed set of required positional arguments
+ * and optional keyword arguments. For example, the destroy function takes
+ * a single positional string argument (the name of the dataset to destroy)
+ * and an optional "defer" keyword boolean argument. When calling lua functions
+ * with parentheses, only positional arguments can be used:
+ *
+ * zfs.sync.snapshot("rpool@snap")
+ *
+ * To use keyword arguments, functions should be called with a single argument
+ * that is a lua table containing mappings of integer -> positional arguments
+ * and string -> keyword arguments:
+ *
+ * zfs.sync.snapshot({[1]="rpool@snap", defer=true})
+ *
+ * The lua language allows curly braces to be used in place of parentheses as
+ * syntactic sugar for this calling convention:
+ *
+ * zfs.sync.snapshot{"rpool@snap", defer=true}
+ */
+
+/*
+ * Throw a Lua error whose message describes the expected positional and
+ * keyword arguments. If the argument description does not fit in the output
+ * buffer, only the error format string is output.
+ */
+static void
+zcp_args_error(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+ const zcp_arg_t *kwargs, const char *fmt, ...)
+{
+ int i;
+ char errmsg[512];
+ size_t len = sizeof (errmsg);
+ size_t msglen = 0;
+ va_list argp;
+
+ va_start(argp, fmt);
+ VERIFY3U(len, >, vsnprintf(errmsg, len, fmt, argp));
+ va_end(argp);
+
+ /*
+ * Calculate the total length of the final string, including extra
+ * formatting characters. If the argument dump would be too large,
+ * only print the error string.
+ */
+ msglen = strlen(errmsg);
+ msglen += strlen(fname) + 4; /* : + {} + null terminator */
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ msglen += strlen(pargs[i].za_name);
+ msglen += strlen(lua_typename(state, pargs[i].za_lua_type));
+ if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL)
+ msglen += 5; /* < + ( + )> + , */
+ else
+ msglen += 4; /* < + ( + )> */
+ }
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ msglen += strlen(kwargs[i].za_name);
+ msglen += strlen(lua_typename(state, kwargs[i].za_lua_type));
+ if (kwargs[i + 1].za_name != NULL)
+ msglen += 4; /* =( + ) + , */
+ else
+ msglen += 3; /* =( + ) */
+ }
+
+ if (msglen >= len)
+ (void) luaL_error(state, errmsg);
+
+ VERIFY3U(len, >, strlcat(errmsg, ": ", len));
+ VERIFY3U(len, >, strlcat(errmsg, fname, len));
+ VERIFY3U(len, >, strlcat(errmsg, "{", len));
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ VERIFY3U(len, >, strlcat(errmsg, "<", len));
+ VERIFY3U(len, >, strlcat(errmsg, pargs[i].za_name, len));
+ VERIFY3U(len, >, strlcat(errmsg, "(", len));
+ VERIFY3U(len, >, strlcat(errmsg,
+ lua_typename(state, pargs[i].za_lua_type), len));
+ VERIFY3U(len, >, strlcat(errmsg, ")>", len));
+ if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL) {
+ VERIFY3U(len, >, strlcat(errmsg, ", ", len));
+ }
+ }
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ VERIFY3U(len, >, strlcat(errmsg, kwargs[i].za_name, len));
+ VERIFY3U(len, >, strlcat(errmsg, "=(", len));
+ VERIFY3U(len, >, strlcat(errmsg,
+ lua_typename(state, kwargs[i].za_lua_type), len));
+ VERIFY3U(len, >, strlcat(errmsg, ")", len));
+ if (kwargs[i + 1].za_name != NULL) {
+ VERIFY3U(len, >, strlcat(errmsg, ", ", len));
+ }
+ }
+ VERIFY3U(len, >, strlcat(errmsg, "}", len));
+
+ (void) luaL_error(state, errmsg);
+ panic("unreachable code");
+}
+
+static void
+zcp_parse_table_args(lua_State *state, const char *fname,
+ const zcp_arg_t *pargs, const zcp_arg_t *kwargs)
+{
+ int i;
+ int type;
+
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ /*
+ * Check the table for this positional argument, leaving it
+ * on the top of the stack once we finish validating it.
+ */
+ lua_pushinteger(state, i + 1);
+ lua_gettable(state, 1);
+
+ type = lua_type(state, -1);
+ if (type == LUA_TNIL) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too few arguments");
+ panic("unreachable code");
+ } else if (type != pargs[i].za_lua_type) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "arg %d wrong type (is '%s', expected '%s')",
+ i + 1, lua_typename(state, type),
+ lua_typename(state, pargs[i].za_lua_type));
+ panic("unreachable code");
+ }
+
+ /*
+ * Remove the positional argument from the table.
+ */
+ lua_pushinteger(state, i + 1);
+ lua_pushnil(state);
+ lua_settable(state, 1);
+ }
+
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ /*
+ * Check the table for this keyword argument, which may be
+ * nil if it was omitted. Leave the value on the top of
+ * the stack after validating it.
+ */
+ lua_getfield(state, 1, kwargs[i].za_name);
+
+ type = lua_type(state, -1);
+ if (type != LUA_TNIL && type != kwargs[i].za_lua_type) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "kwarg '%s' wrong type (is '%s', expected '%s')",
+ kwargs[i].za_name, lua_typename(state, type),
+ lua_typename(state, kwargs[i].za_lua_type));
+ panic("unreachable code");
+ }
+
+ /*
+ * Remove the keyword argument from the table.
+ */
+ lua_pushnil(state);
+ lua_setfield(state, 1, kwargs[i].za_name);
+ }
+
+ /*
+ * Any entries remaining in the table are invalid inputs, print
+ * an error message based on what the entry is.
+ */
+ lua_pushnil(state);
+ if (lua_next(state, 1)) {
+ if (lua_isnumber(state, -2) && lua_tointeger(state, -2) > 0) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too many positional arguments");
+ } else if (lua_isstring(state, -2)) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "invalid kwarg '%s'", lua_tostring(state, -2));
+ } else {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "kwarg keys must be strings");
+ }
+ panic("unreachable code");
+ }
+
+ lua_remove(state, 1);
+}
+
+static void
+zcp_parse_pos_args(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+ const zcp_arg_t *kwargs)
+{
+ int i;
+ int type;
+
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ type = lua_type(state, i + 1);
+ if (type == LUA_TNONE) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too few arguments");
+ panic("unreachable code");
+ } else if (type != pargs[i].za_lua_type) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "arg %d wrong type (is '%s', expected '%s')",
+ i + 1, lua_typename(state, type),
+ lua_typename(state, pargs[i].za_lua_type));
+ panic("unreachable code");
+ }
+ }
+ if (lua_gettop(state) != i) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too many positional arguments");
+ panic("unreachable code");
+ }
+
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ lua_pushnil(state);
+ }
+}
+
+/*
+ * Checks the current Lua stack against an expected set of positional and
+ * keyword arguments. If the stack does not match the expected arguments, it
+ * aborts the current channel program with a useful error message; otherwise
+ * it rearranges the stack so that it contains the positional arguments
+ * followed by the keyword argument values in declaration order. Any missing
+ * keyword argument will be represented by a nil value on the stack.
+ *
+ * If the stack contains exactly one argument of type LUA_TTABLE the curly
+ * braces calling convention is assumed, otherwise the stack is parsed for
+ * positional arguments only.
+ *
+ * This function should be used by every function callback. It should be called
+ * before the callback manipulates the Lua stack as it assumes the stack
+ * represents the function arguments.
+ */
+void
+zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+ const zcp_arg_t *kwargs)
+{
+ if (lua_gettop(state) == 1 && lua_istable(state, 1)) {
+ zcp_parse_table_args(state, fname, pargs, kwargs);
+ } else {
+ zcp_parse_pos_args(state, fname, pargs, kwargs);
+ }
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, ULONG, ZMOD_RW,
+ "Max instruction limit that can be specified for a channel program");
+
+ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, ULONG, ZMOD_RW,
+ "Max memory limit that can be specified for a channel program");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zcp_get.c b/sys/contrib/openzfs/module/zfs/zcp_get.c
new file mode 100644
index 000000000000..7256e4de1915
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp_get.c
@@ -0,0 +1,813 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lualib.h>
+#include <sys/lua/lauxlib.h>
+
+#include <zfs_prop.h>
+
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_objset.h>
+#include <sys/mntent.h>
+#include <sys/sunddi.h>
+#include <sys/zap.h>
+#include <sys/zcp.h>
+#include <sys/zcp_iter.h>
+#include <sys/zcp_global.h>
+#include <sys/zcp_prop.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/zvol.h>
+
+#ifdef _KERNEL
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#endif
+
+static int
+get_objset_type(dsl_dataset_t *ds, zfs_type_t *type)
+{
+ int error;
+ objset_t *os;
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+ if (ds->ds_is_snapshot) {
+ *type = ZFS_TYPE_SNAPSHOT;
+ } else {
+ switch (os->os_phys->os_type) {
+ case DMU_OST_ZFS:
+ *type = ZFS_TYPE_FILESYSTEM;
+ break;
+ case DMU_OST_ZVOL:
+ *type = ZFS_TYPE_VOLUME;
+ break;
+ default:
+ return (EINVAL);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Returns the string name of ds's type in str (a buffer which should be
+ * at least 12 bytes long).
+ */
+static int
+get_objset_type_name(dsl_dataset_t *ds, char *str)
+{
+ int error;
+ zfs_type_t type;
+ error = get_objset_type(ds, &type);
+ if (error != 0)
+ return (error);
+ switch (type) {
+ case ZFS_TYPE_SNAPSHOT:
+ (void) strlcpy(str, "snapshot", ZAP_MAXVALUELEN);
+ break;
+ case ZFS_TYPE_FILESYSTEM:
+ (void) strlcpy(str, "filesystem", ZAP_MAXVALUELEN);
+ break;
+ case ZFS_TYPE_VOLUME:
+ (void) strlcpy(str, "volume", ZAP_MAXVALUELEN);
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * Determines the source of a property given its setpoint and
+ * property type. It pushes the source to the lua stack.
+ */
+static void
+get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop)
+{
+ if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) {
+ lua_pushnil(state);
+ } else {
+ const char *src;
+ if (strcmp("", setpoint) == 0) {
+ src = "default";
+ } else {
+ src = setpoint;
+ }
+ (void) lua_pushstring(state, src);
+ }
+}
+
+/*
+ * Given an error encountered while getting properties, either longjmp's for
+ * a fatal error or pushes nothing to the stack for a non-fatal one.
+ */
+static int
+zcp_handle_error(lua_State *state, const char *dataset_name,
+ const char *property_name, int error)
+{
+ ASSERT3S(error, !=, 0);
+ if (error == ENOENT) {
+ return (0);
+ } else if (error == EINVAL) {
+ return (luaL_error(state,
+ "property '%s' is not a valid property on dataset '%s'",
+ property_name, dataset_name));
+ } else if (error == EIO) {
+ return (luaL_error(state,
+ "I/O error while retrieving property '%s' on dataset '%s'",
+ property_name, dataset_name));
+ } else {
+ return (luaL_error(state, "unexpected error %d while "
+ "retrieving property '%s' on dataset '%s'",
+ error, property_name, dataset_name));
+ }
+}
+
+/*
+ * Look up a user-defined property in the zap object. If it exists, push it
+ * and the setpoint onto the stack; otherwise don't push anything.
+ */
+static int
+zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
+ const char *property_name)
+{
+ int error;
+ char *buf;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+ /*
+ * zcp_dataset_hold will either successfully return the requested
+ * dataset or throw a lua error and longjmp out of the zfs.get_prop call
+ * without returning.
+ */
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN,
+ buf, setpoint);
+ dsl_dataset_rele(ds, FTAG);
+
+ if (error != 0) {
+ kmem_free(buf, ZAP_MAXVALUELEN);
+ return (zcp_handle_error(state, dataset_name, property_name,
+ error));
+ }
+ (void) lua_pushstring(state, buf);
+ (void) lua_pushstring(state, setpoint);
+ kmem_free(buf, ZAP_MAXVALUELEN);
+ return (2);
+}
+
+/*
+ * Check if the property we're looking for is stored in the ds_dir. If so,
+ * return it in the 'val' argument. Return 0 on success, or ENOENT if
+ * the property is not present.
+ */
+static int
+get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop,
+ uint64_t *val)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_lock);
+ switch (zfs_prop) {
+ case ZFS_PROP_USEDSNAP:
+ *val = dsl_dir_get_usedsnap(dd);
+ break;
+ case ZFS_PROP_USEDCHILD:
+ *val = dsl_dir_get_usedchild(dd);
+ break;
+ case ZFS_PROP_USEDDS:
+ *val = dsl_dir_get_usedds(dd);
+ break;
+ case ZFS_PROP_USEDREFRESERV:
+ *val = dsl_dir_get_usedrefreserv(dd);
+ break;
+ case ZFS_PROP_LOGICALUSED:
+ *val = dsl_dir_get_logicalused(dd);
+ break;
+ default:
+ mutex_exit(&dd->dd_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ mutex_exit(&dd->dd_lock);
+ return (0);
+}
+
+/*
+ * Check if the property we're looking for is stored at the dsl_dataset or
+ * dsl_dir level. If so, push the property value and source onto the lua stack
+ * and return 0. If it is not present or a failure occurs in lookup, return a
+ * non-zero error value.
+ */
+static int
+get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname,
+ zfs_prop_t zfs_prop)
+{
+ int error = 0;
+ objset_t *os;
+ uint64_t numval = 0;
+ char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN] =
+ "Internal error - setpoint not determined";
+ zfs_type_t ds_type;
+ zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
+ (void) get_objset_type(ds, &ds_type);
+
+ switch (zfs_prop) {
+ case ZFS_PROP_REFRATIO:
+ numval = dsl_get_refratio(ds);
+ break;
+ case ZFS_PROP_USED:
+ numval = dsl_get_used(ds);
+ break;
+ case ZFS_PROP_CLONES: {
+ nvlist_t *clones = fnvlist_alloc();
+ error = get_clones_stat_impl(ds, clones);
+ if (error == 0) {
+ /* push list to lua stack */
+ VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0ULL));
+ /* source */
+ (void) lua_pushnil(state);
+ }
+ nvlist_free(clones);
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ return (error);
+ }
+ case ZFS_PROP_COMPRESSRATIO:
+ numval = dsl_get_compressratio(ds);
+ break;
+ case ZFS_PROP_CREATION:
+ numval = dsl_get_creation(ds);
+ break;
+ case ZFS_PROP_REFERENCED:
+ numval = dsl_get_referenced(ds);
+ break;
+ case ZFS_PROP_AVAILABLE:
+ numval = dsl_get_available(ds);
+ break;
+ case ZFS_PROP_LOGICALREFERENCED:
+ numval = dsl_get_logicalreferenced(ds);
+ break;
+ case ZFS_PROP_CREATETXG:
+ numval = dsl_get_creationtxg(ds);
+ break;
+ case ZFS_PROP_GUID:
+ numval = dsl_get_guid(ds);
+ break;
+ case ZFS_PROP_UNIQUE:
+ numval = dsl_get_unique(ds);
+ break;
+ case ZFS_PROP_OBJSETID:
+ numval = dsl_get_objsetid(ds);
+ break;
+ case ZFS_PROP_ORIGIN:
+ dsl_dir_get_origin(ds->ds_dir, strval);
+ break;
+ case ZFS_PROP_USERACCOUNTING:
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0)
+ numval = dmu_objset_userspace_present(os);
+ break;
+ case ZFS_PROP_WRITTEN:
+ error = dsl_get_written(ds, &numval);
+ break;
+ case ZFS_PROP_TYPE:
+ error = get_objset_type_name(ds, strval);
+ break;
+ case ZFS_PROP_PREV_SNAP:
+ error = dsl_get_prev_snap(ds, strval);
+ break;
+ case ZFS_PROP_NAME:
+ dsl_dataset_name(ds, strval);
+ break;
+ case ZFS_PROP_MOUNTPOINT:
+ error = dsl_get_mountpoint(ds, dsname, strval, setpoint);
+ break;
+ case ZFS_PROP_VERSION:
+ /* should be a snapshot or filesystem */
+ ASSERT(ds_type != ZFS_TYPE_VOLUME);
+ error = dmu_objset_from_ds(ds, &os);
+ /* look in the master node for the version */
+ if (error == 0) {
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ sizeof (numval), 1, &numval);
+ }
+ break;
+ case ZFS_PROP_DEFER_DESTROY:
+ numval = dsl_get_defer_destroy(ds);
+ break;
+ case ZFS_PROP_USERREFS:
+ numval = dsl_get_userrefs(ds);
+ break;
+ case ZFS_PROP_FILESYSTEM_COUNT:
+ error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval);
+ (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN);
+ break;
+ case ZFS_PROP_SNAPSHOT_COUNT:
+ error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval);
+ (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN);
+ break;
+ case ZFS_PROP_NUMCLONES:
+ numval = dsl_get_numclones(ds);
+ break;
+ case ZFS_PROP_INCONSISTENT:
+ numval = dsl_get_inconsistent(ds);
+ break;
+ case ZFS_PROP_IVSET_GUID:
+ if (dsl_dataset_is_zapified(ds)) {
+ error = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_IVSET_GUID,
+ sizeof (numval), 1, &numval);
+ } else {
+ error = ENOENT;
+ }
+ break;
+ case ZFS_PROP_RECEIVE_RESUME_TOKEN: {
+ char *token = get_receive_resume_stats_impl(ds);
+
+ (void) strlcpy(strval, token, ZAP_MAXVALUELEN);
+ if (strcmp(strval, "") == 0) {
+ char *childval = get_child_receive_stats(ds);
+
+ (void) strlcpy(strval, childval, ZAP_MAXVALUELEN);
+ if (strcmp(strval, "") == 0)
+ error = ENOENT;
+
+ kmem_strfree(childval);
+ }
+ kmem_strfree(token);
+ break;
+ }
+ case ZFS_PROP_VOLSIZE:
+ ASSERT(ds_type == ZFS_TYPE_VOLUME ||
+ ds_type == ZFS_TYPE_SNAPSHOT);
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0) {
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size",
+ sizeof (numval), 1, &numval);
+ }
+ if (error == 0)
+ (void) strlcpy(setpoint, dsname,
+ ZFS_MAX_DATASET_NAME_LEN);
+
+ break;
+ case ZFS_PROP_VOLBLOCKSIZE: {
+ ASSERT(ds_type == ZFS_TYPE_VOLUME);
+ dmu_object_info_t doi;
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0) {
+ error = dmu_object_info(os, ZVOL_OBJ, &doi);
+ if (error == 0)
+ numval = doi.doi_data_block_size;
+ }
+ break;
+ }
+
+ case ZFS_PROP_KEYSTATUS:
+ case ZFS_PROP_KEYFORMAT: {
+ /* provide defaults in case no crypto obj exists */
+ setpoint[0] = '\0';
+ if (zfs_prop == ZFS_PROP_KEYSTATUS)
+ numval = ZFS_KEYSTATUS_NONE;
+ else
+ numval = ZFS_KEYFORMAT_NONE;
+
+ nvlist_t *nvl, *propval;
+ nvl = fnvlist_alloc();
+ dsl_dataset_crypt_stats(ds, nvl);
+ if (nvlist_lookup_nvlist(nvl, zfs_prop_to_name(zfs_prop),
+ &propval) == 0) {
+ char *source;
+
+ (void) nvlist_lookup_uint64(propval, ZPROP_VALUE,
+ &numval);
+ if (nvlist_lookup_string(propval, ZPROP_SOURCE,
+ &source) == 0)
+ strlcpy(setpoint, source, sizeof (setpoint));
+ }
+ nvlist_free(nvl);
+ break;
+ }
+
+ default:
+ /* Did not match these props, check in the dsl_dir */
+ error = get_dsl_dir_prop(ds, zfs_prop, &numval);
+ }
+ if (error != 0) {
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ return (error);
+ }
+
+ switch (prop_type) {
+ case PROP_TYPE_NUMBER: {
+ (void) lua_pushnumber(state, numval);
+ break;
+ }
+ case PROP_TYPE_STRING: {
+ (void) lua_pushstring(state, strval);
+ break;
+ }
+ case PROP_TYPE_INDEX: {
+ const char *propval;
+ error = zfs_prop_index_to_string(zfs_prop, numval, &propval);
+ if (error != 0) {
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ return (error);
+ }
+ (void) lua_pushstring(state, propval);
+ break;
+ }
+ }
+ kmem_free(strval, ZAP_MAXVALUELEN);
+
+ /* Push the source to the stack */
+ get_prop_src(state, setpoint, zfs_prop);
+ return (0);
+}
+
+/*
+ * Look up a property and its source in the zap object. If the value is
+ * present and successfully retrieved, push the value and source on the
+ * lua stack and return 0. On failure, return a non-zero error value.
+ */
+static int
+get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop)
+{
+ int error = 0;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+ char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ uint64_t numval;
+ const char *prop_name = zfs_prop_to_name(zfs_prop);
+ zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
+
+ if (prop_type == PROP_TYPE_STRING) {
+ /* Push value to lua stack */
+ error = dsl_prop_get_ds(ds, prop_name, 1,
+ ZAP_MAXVALUELEN, strval, setpoint);
+ if (error == 0)
+ (void) lua_pushstring(state, strval);
+ } else {
+ error = dsl_prop_get_ds(ds, prop_name, sizeof (numval),
+ 1, &numval, setpoint);
+
+#ifdef _KERNEL
+ /* Fill in temporary value for prop, if applicable */
+ (void) zfs_get_temporary_prop(ds, zfs_prop, &numval, setpoint);
+#else
+ return (luaL_error(state,
+ "temporary properties only supported in kernel mode",
+ prop_name));
+#endif
+ /* Push value to lua stack */
+ if (prop_type == PROP_TYPE_INDEX) {
+ const char *propval;
+ error = zfs_prop_index_to_string(zfs_prop, numval,
+ &propval);
+ if (error == 0)
+ (void) lua_pushstring(state, propval);
+ } else {
+ if (error == 0)
+ (void) lua_pushnumber(state, numval);
+ }
+ }
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ if (error == 0)
+ get_prop_src(state, setpoint, zfs_prop);
+ return (error);
+}
+
+/*
+ * Determine whether a property is valid for a given dataset.
+ */
+boolean_t
+prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop)
+{
+ int error;
+ zfs_type_t zfs_type;
+
+ /* properties not supported */
+ if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) ||
+ (zfs_prop == ZFS_PROP_MOUNTED))
+ return (B_FALSE);
+
+ /* if we want the origin prop, ds must be a clone */
+ if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir)))
+ return (B_FALSE);
+
+ error = get_objset_type(ds, &zfs_type);
+ if (error != 0)
+ return (B_FALSE);
+ return (zfs_prop_valid_for_type(zfs_prop, zfs_type, B_FALSE));
+}
+
+/*
+ * Look up a given dataset property. On success return 2, the number of
+ * values pushed to the lua stack (property value and source). On a fatal
+ * error, longjmp. On a non-fatal error, push nothing.
+ */
+static int
+zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
+ zfs_prop_t zfs_prop)
+{
+ int error;
+ /*
+ * zcp_dataset_hold will either successfully return the requested
+ * dataset or throw a lua error and longjmp out of the zfs.get_prop call
+ * without returning.
+ */
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ /* Check that the property is valid for the given dataset */
+ const char *prop_name = zfs_prop_to_name(zfs_prop);
+ if (!prop_valid_for_ds(ds, zfs_prop)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ /* Check if the property can be accessed directly */
+ error = get_special_prop(state, ds, dataset_name, zfs_prop);
+ if (error == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ /* The value and source have been pushed by get_special_prop */
+ return (2);
+ }
+ if (error != ENOENT) {
+ dsl_dataset_rele(ds, FTAG);
+ return (zcp_handle_error(state, dataset_name,
+ prop_name, error));
+ }
+
+ /* If we were unable to find it, look in the zap object */
+ error = get_zap_prop(state, ds, zfs_prop);
+ dsl_dataset_rele(ds, FTAG);
+ if (error != 0) {
+ return (zcp_handle_error(state, dataset_name,
+ prop_name, error));
+ }
+ /* The value and source have been pushed by get_zap_prop */
+ return (2);
+}
+
+#ifdef _KERNEL
+static zfs_userquota_prop_t
+get_userquota_prop(const char *prop_name)
+{
+ zfs_userquota_prop_t type;
+ /* Figure out the property type ({user|group}{quota|used}) */
+ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) {
+ if (strncmp(prop_name, zfs_userquota_prop_prefixes[type],
+ strlen(zfs_userquota_prop_prefixes[type])) == 0)
+ break;
+ }
+ return (type);
+}
+
+/*
+ * Given the name of a zfs_userquota_prop, this function determines the
+ * prop type as well as the numeric group/user ids based on the string
+ * following the '@' in the property name. On success, returns 0. On failure,
+ * returns a non-zero error.
+ * 'domain' must be freed by the caller using kmem_strfree().
+ */
+static int
+parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type,
+ char **domain, uint64_t *rid)
+{
+ char *cp, *end, *domain_val;
+
+ *type = get_userquota_prop(prop_name);
+ if (*type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (EINVAL);
+
+ *rid = 0;
+ cp = strchr(prop_name, '@') + 1;
+ if (strncmp(cp, "S-1-", 4) == 0) {
+ /*
+ * It's a numeric SID (eg "S-1-234-567-89") and we want to
+ * separate the domain id and the rid
+ */
+ int domain_len = strrchr(cp, '-') - cp;
+ domain_val = kmem_alloc(domain_len + 1, KM_SLEEP);
+ (void) strncpy(domain_val, cp, domain_len);
+ domain_val[domain_len] = '\0';
+ cp += domain_len + 1;
+
+ (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
+ if (*end != '\0') {
+ kmem_strfree(domain_val);
+ return (EINVAL);
+ }
+ } else {
+ /* It's only a user/group ID (eg "12345"), just get the rid */
+ domain_val = NULL;
+ (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
+ if (*end != '\0')
+ return (EINVAL);
+ }
+ *domain = domain_val;
+ return (0);
+}
+
+/*
+ * Look up a {user|group}{quota|used} property for the given dataset. On
+ * success push the value (quota or used amount) and the setpoint. On a
+ * fatal error throw a lua error; on a non-fatal error push nothing.
+ */
+static int
+zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp,
+ const char *dataset_name, const char *prop_name)
+{
+ zfsvfs_t *zfvp;
+ zfsvfs_t *zfsvfs;
+ int error;
+ zfs_userquota_prop_t type;
+ char *domain;
+ uint64_t rid, value = 0;
+ objset_t *os;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ error = parse_userquota_prop(prop_name, &type, &domain, &rid);
+ if (error == 0) {
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0) {
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+ error = zfsvfs_create_impl(&zfvp, zfsvfs, os);
+ if (error == 0) {
+ error = zfs_userspace_one(zfvp, type, domain,
+ rid, &value);
+ zfsvfs_free(zfvp);
+ }
+ }
+ if (domain != NULL)
+ kmem_strfree(domain);
+ }
+ dsl_dataset_rele(ds, FTAG);
+
+ if ((value == 0) && ((type == ZFS_PROP_USERQUOTA) ||
+ (type == ZFS_PROP_GROUPQUOTA)))
+ error = SET_ERROR(ENOENT);
+ if (error != 0) {
+ return (zcp_handle_error(state, dataset_name,
+ prop_name, error));
+ }
+
+ (void) lua_pushnumber(state, value);
+ (void) lua_pushstring(state, dataset_name);
+ return (2);
+}
+#endif
+
+/*
+ * Determines the name of the snapshot referenced in the written property
+ * name. Returns the snapshot name in snap_name, a buffer that must be at
+ * least ZFS_MAX_DATASET_NAME_LEN bytes long.
+ */
+static void
+parse_written_prop(const char *dataset_name, const char *prop_name,
+ char *snap_name)
+{
+ ASSERT(zfs_prop_written(prop_name));
+ const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN;
+ if (strchr(name, '@') == NULL) {
+ (void) snprintf(snap_name, ZFS_MAX_DATASET_NAME_LEN, "%s@%s",
+ dataset_name, name);
+ } else {
+ (void) strlcpy(snap_name, name, ZFS_MAX_DATASET_NAME_LEN);
+ }
+}
+
+/*
+ * Look up written@ property for given dataset. On success
+ * push the value and the setpoint. If error is fatal, we will
+ * longjmp, otherwise push nothing.
+ */
+static int
+zcp_get_written_prop(lua_State *state, dsl_pool_t *dp,
+ const char *dataset_name, const char *prop_name)
+{
+ char snap_name[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t used, comp, uncomp;
+ dsl_dataset_t *old;
+ int error = 0;
+
+ parse_written_prop(dataset_name, prop_name, snap_name);
+ dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (new == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ error = dsl_dataset_hold(dp, snap_name, FTAG, &old);
+ if (error != 0) {
+ dsl_dataset_rele(new, FTAG);
+ return (zcp_dataset_hold_error(state, dp, snap_name,
+ error));
+ }
+ error = dsl_dataset_space_written(old, new,
+ &used, &comp, &uncomp);
+
+ dsl_dataset_rele(old, FTAG);
+ dsl_dataset_rele(new, FTAG);
+
+ if (error != 0) {
+ return (zcp_handle_error(state, dataset_name,
+ snap_name, error));
+ }
+ (void) lua_pushnumber(state, used);
+ (void) lua_pushstring(state, dataset_name);
+ return (2);
+}
+
+static int zcp_get_prop(lua_State *state);
+static zcp_lib_info_t zcp_get_prop_info = {
+ .name = "get_prop",
+ .func = zcp_get_prop,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ { .za_name = "property", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
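+/*
+ * Implementation of the Lua zfs.get_prop(dataset, property) call: looks up
+ * user, userquota@, written@, and system properties, returning the value
+ * followed by its source (which may be nil), e.g.:
+ *
+ * used, src = zfs.get_prop("rpool/home", "used")
+ */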
+static int
+zcp_get_prop(lua_State *state)
+{
+ const char *dataset_name;
+ const char *property_name;
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ zcp_lib_info_t *libinfo = &zcp_get_prop_info;
+
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+
+ dataset_name = lua_tostring(state, 1);
+ property_name = lua_tostring(state, 2);
+
+ /* User defined property */
+ if (zfs_prop_user(property_name)) {
+ return (zcp_get_user_prop(state, dp,
+ dataset_name, property_name));
+ }
+ /* userspace property */
+ if (zfs_prop_userquota(property_name)) {
+#ifdef _KERNEL
+ return (zcp_get_userquota_prop(state, dp,
+ dataset_name, property_name));
+#else
+ return (luaL_error(state,
+ "user quota properties only supported in kernel mode",
+ property_name));
+#endif
+ }
+ /* written@ property */
+ if (zfs_prop_written(property_name)) {
+ return (zcp_get_written_prop(state, dp,
+ dataset_name, property_name));
+ }
+
+ zfs_prop_t zfs_prop = zfs_name_to_prop(property_name);
+ /* Valid system property */
+ if (zfs_prop != ZPROP_INVAL) {
+ return (zcp_get_system_prop(state, dp, dataset_name,
+ zfs_prop));
+ }
+
+ /* Invalid property name */
+ return (luaL_error(state,
+ "'%s' is not a valid property", property_name));
+}
+
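+/*
+ * Register zfs.get_prop in the module table that the caller has left on top
+ * of the Lua stack.
+ */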
+int
+zcp_load_get_lib(lua_State *state)
+{
+ lua_pushcclosure(state, zcp_get_prop_info.func, 0);
+ lua_setfield(state, -2, zcp_get_prop_info.name);
+
+ return (1);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zcp_global.c b/sys/contrib/openzfs/module/zfs/zcp_global.c
new file mode 100644
index 000000000000..8e166e0736d6
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp_global.c
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zcp_global.h>
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lauxlib.h>
+
+typedef struct zcp_errno_global {
+ const char *zeg_name;
+ int zeg_errno;
+} zcp_errno_global_t;
+
+static const zcp_errno_global_t errno_globals[] = {
+ {"EPERM", EPERM},
+ {"ENOENT", ENOENT},
+ {"ESRCH", ESRCH},
+ {"EINTR", EINTR},
+ {"EIO", EIO},
+ {"ENXIO", ENXIO},
+ {"E2BIG", E2BIG},
+ {"ENOEXEC", ENOEXEC},
+ {"EBADF", EBADF},
+ {"ECHILD", ECHILD},
+ {"EAGAIN", EAGAIN},
+ {"ENOMEM", ENOMEM},
+ {"EACCES", EACCES},
+ {"EFAULT", EFAULT},
+ {"ENOTBLK", ENOTBLK},
+ {"EBUSY", EBUSY},
+ {"EEXIST", EEXIST},
+ {"EXDEV", EXDEV},
+ {"ENODEV", ENODEV},
+ {"ENOTDIR", ENOTDIR},
+ {"EISDIR", EISDIR},
+ {"EINVAL", EINVAL},
+ {"ENFILE", ENFILE},
+ {"EMFILE", EMFILE},
+ {"ENOTTY", ENOTTY},
+ {"ETXTBSY", ETXTBSY},
+ {"EFBIG", EFBIG},
+ {"ENOSPC", ENOSPC},
+ {"ESPIPE", ESPIPE},
+ {"EROFS", EROFS},
+ {"EMLINK", EMLINK},
+ {"EPIPE", EPIPE},
+ {"EDOM", EDOM},
+ {"ERANGE", ERANGE},
+ {"EDEADLK", EDEADLK},
+ {"ENOLCK", ENOLCK},
+ {"ECANCELED", ECANCELED},
+ {"ENOTSUP", ENOTSUP},
+ {"EDQUOT", EDQUOT},
+ {"ENAMETOOLONG", ENAMETOOLONG},
+ {0, 0}
+};
+
+static void
+zcp_load_errno_globals(lua_State *state)
+{
+ const zcp_errno_global_t *global = errno_globals;
+ while (global->zeg_name != NULL) {
+ lua_pushnumber(state, (lua_Number)global->zeg_errno);
+ lua_setglobal(state, global->zeg_name);
+ global++;
+ }
+}
+
+void
+zcp_load_globals(lua_State *state)
+{
+ zcp_load_errno_globals(state);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zcp_iter.c b/sys/contrib/openzfs/module/zfs/zcp_iter.c
new file mode 100644
index 000000000000..f727c56f212d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp_iter.c
@@ -0,0 +1,751 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lauxlib.h>
+
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_pool.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/zap.h>
+#include <sys/dsl_dir.h>
+#include <sys/zcp_prop.h>
+
+#include <sys/zcp.h>
+
+#include "zfs_comutil.h"
+
+typedef int (zcp_list_func_t)(lua_State *);
+typedef struct zcp_list_info {
+ const char *name;
+ zcp_list_func_t *func;
+ zcp_list_func_t *gc;
+ const zcp_arg_t pargs[4];
+ const zcp_arg_t kwargs[2];
+} zcp_list_info_t;
+
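+/*
+ * Iterator function returned by zfs.list.clones(). Upvalue 1 is the
+ * snapshot's object number and upvalue 2 is the serialized ZAP cursor into
+ * its next-clones object; each call returns the name of the next clone, or
+ * nothing once the list is exhausted.
+ */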
+static int
+zcp_clones_iter(lua_State *state)
+{
+ int err;
+ char clonename[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ dsl_dataset_t *ds, *clone;
+ zap_attribute_t za;
+ zap_cursor_t zc;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ zap_cursor_init_serialized(&zc, dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, cursor);
+ dsl_dataset_rele(ds, FTAG);
+
+ err = zap_cursor_retrieve(&zc, &za);
+ if (err != 0) {
+ zap_cursor_fini(&zc);
+ if (err != ENOENT) {
+ return (luaL_error(state,
+ "unexpected error %d from zap_cursor_retrieve()",
+ err));
+ }
+ return (0);
+ }
+ zap_cursor_advance(&zc);
+ cursor = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+
+ err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &clone);
+ if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from "
+ "dsl_dataset_hold_obj(za_first_integer)", err));
+ }
+
+ dsl_dir_name(clone->ds_dir, clonename);
+ dsl_dataset_rele(clone, FTAG);
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, clonename);
+ return (1);
+}
+
+static int zcp_clones_list(lua_State *);
+static zcp_list_info_t zcp_clones_list_info = {
+ .name = "clones",
+ .func = zcp_clones_list,
+ .gc = NULL,
+ .pargs = {
+ { .za_name = "snapshot", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
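+/*
+ * Implementation of zfs.list.clones(snapshot): returns an iterator over the
+ * names of the clones of the given snapshot, e.g.:
+ *
+ * for clone in zfs.list.clones("rpool/fs@snap") do ... end
+ */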
+static int
+zcp_clones_list(lua_State *state)
+{
+ const char *snapname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+
+ /*
+ * zcp_dataset_hold will either successfully return the requested
+ * dataset or throw a lua error and longjmp out of the zfs.list.clones
+ * call without returning.
+ */
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+ boolean_t issnap = ds->ds_is_snapshot;
+ uint64_t cursor = 0;
+ uint64_t dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (!issnap) {
+ return (zcp_argerror(state, 1, "%s is not a snapshot",
+ snapname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, cursor);
+ lua_pushcclosure(state, &zcp_clones_iter, 2);
+ return (1);
+}
+
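+/*
+ * Iterator function returned by zfs.list.snapshots(). Upvalue 1 is the
+ * dataset's object number and upvalue 2 is the snapshot-list cursor; each
+ * call returns the full name of the next snapshot, or nothing once the list
+ * is exhausted.
+ */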
+static int
+zcp_snapshots_iter(lua_State *state)
+{
+ int err;
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ dsl_dataset_t *ds;
+ objset_t *os;
+ char *p;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ dsl_dataset_name(ds, snapname);
+ VERIFY3U(sizeof (snapname), >,
+ strlcat(snapname, "@", sizeof (snapname)));
+
+ p = strchr(snapname, '\0');
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ err = dmu_snapshot_list_next(os,
+ sizeof (snapname) - (p - snapname), p, NULL, &cursor, NULL);
+ dsl_dataset_rele(ds, FTAG);
+
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dmu_snapshot_list_next()", err));
+ }
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, snapname);
+ return (1);
+}
+
+static int zcp_snapshots_list(lua_State *);
+static zcp_list_info_t zcp_snapshots_list_info = {
+ .name = "snapshots",
+ .func = zcp_snapshots_list,
+ .gc = NULL,
+ .pargs = {
+ { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
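+/*
+ * Implementation of zfs.list.snapshots(dataset): returns an iterator over
+ * the snapshots of the given filesystem or volume, e.g.:
+ *
+ * for snap in zfs.list.snapshots("rpool/fs") do ... end
+ */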
+static int
+zcp_snapshots_list(lua_State *state)
+{
+ const char *fsname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ boolean_t issnap;
+ uint64_t dsobj;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+ issnap = ds->ds_is_snapshot;
+ dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (issnap) {
+ return (zcp_argerror(state, 1,
+ "argument %s cannot be a snapshot", fsname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, 0);
+ lua_pushcclosure(state, &zcp_snapshots_iter, 2);
+ return (1);
+}
+
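+/*
+ * Iterator function returned by zfs.list.children(). Upvalue 1 is the
+ * parent's object number and upvalue 2 is the directory cursor; datasets
+ * whose names are hidden (zfs_dataset_name_hidden()) are skipped. Each call
+ * returns the full name of the next child, or nothing once exhausted.
+ */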
+static int
+zcp_children_iter(lua_State *state)
+{
+ int err;
+ char childname[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ zcp_run_info_t *ri = zcp_run_info(state);
+ dsl_pool_t *dp = ri->zri_pool;
+ dsl_dataset_t *ds;
+ objset_t *os;
+ char *p;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ dsl_dataset_name(ds, childname);
+ VERIFY3U(sizeof (childname), >,
+ strlcat(childname, "/", sizeof (childname)));
+ p = strchr(childname, '\0');
+
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ do {
+ err = dmu_dir_list_next(os,
+ sizeof (childname) - (p - childname), p, NULL, &cursor);
+ } while (err == 0 && zfs_dataset_name_hidden(childname));
+ dsl_dataset_rele(ds, FTAG);
+
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dmu_dir_list_next()",
+ err));
+ }
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, childname);
+ return (1);
+}
+
+static int zcp_children_list(lua_State *);
+static zcp_list_info_t zcp_children_list_info = {
+ .name = "children",
+ .func = zcp_children_list,
+ .gc = NULL,
+ .pargs = {
+ { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_children_list(lua_State *state)
+{
+ const char *fsname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ boolean_t issnap;
+ uint64_t dsobj;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ issnap = ds->ds_is_snapshot;
+ dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (issnap) {
+ return (zcp_argerror(state, 1,
+ "argument %s cannot be a snapshot", fsname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, 0);
+ lua_pushcclosure(state, &zcp_children_iter, 2);
+ return (1);
+}
+
+static int
+zcp_user_props_list_gc(lua_State *state)
+{
+ nvlist_t **props = lua_touserdata(state, 1);
+ if (*props != NULL)
+ fnvlist_free(*props);
+ return (0);
+}
+
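+/*
+ * Iterator closure for zfs.list.user_properties(dataset). Upvalue 1 is a
+ * userdata holding the nvlist of all properties for the dataset; upvalue 2 is
+ * the last nvpair visited. Non-user properties are skipped, and each call
+ * returns a (property name, value, source) triple. The nvlist is freed when
+ * iteration completes, or by the __gc handler above if the loop is abandoned
+ * early.
+ */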
+static int
+zcp_user_props_iter(lua_State *state)
+{
+ char *source, *val;
+ nvlist_t *nvprop;
+ nvlist_t **props = lua_touserdata(state, lua_upvalueindex(1));
+ nvpair_t *pair = lua_touserdata(state, lua_upvalueindex(2));
+
+ do {
+ pair = nvlist_next_nvpair(*props, pair);
+ if (pair == NULL) {
+ fnvlist_free(*props);
+ *props = NULL;
+ return (0);
+ }
+ } while (!zfs_prop_user(nvpair_name(pair)));
+
+ lua_pushlightuserdata(state, pair);
+ lua_replace(state, lua_upvalueindex(2));
+
+ nvprop = fnvpair_value_nvlist(pair);
+ val = fnvlist_lookup_string(nvprop, ZPROP_VALUE);
+ source = fnvlist_lookup_string(nvprop, ZPROP_SOURCE);
+
+ (void) lua_pushstring(state, nvpair_name(pair));
+ (void) lua_pushstring(state, val);
+ (void) lua_pushstring(state, source);
+ return (3);
+}
+
+static int zcp_user_props_list(lua_State *);
+static zcp_list_info_t zcp_user_props_list_info = {
+ .name = "user_properties",
+ .func = zcp_user_props_list,
+ .gc = zcp_user_props_list_gc,
+ .pargs = {
+ { .za_name = "filesystem | snapshot | volume",
+ .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+/*
+ * 'properties' was the initial name for 'user_properties' seen
+ * above. 'user_properties' is a better name as it distinguishes
+ * these properties from 'system_properties' which are different.
+ * In order to avoid breaking compatibility between different
+ * versions of ZFS, we declare 'properties' as an alias for
+ * 'user_properties'.
+ */
+static zcp_list_info_t zcp_props_list_info = {
+ .name = "properties",
+ .func = zcp_user_props_list,
+ .gc = zcp_user_props_list_gc,
+ .pargs = {
+ { .za_name = "filesystem | snapshot | volume",
+ .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_user_props_list(lua_State *state)
+{
+ const char *dsname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ objset_t *os;
+ nvlist_t **props = lua_newuserdata(state, sizeof (nvlist_t *));
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ VERIFY0(dsl_prop_get_all(os, props));
+ dsl_dataset_rele(ds, FTAG);
+
+ /*
+ * Set the metatable for the properties list to free it on
+ * completion.
+ */
+ luaL_getmetatable(state, zcp_user_props_list_info.name);
+ (void) lua_setmetatable(state, -2);
+
+ lua_pushlightuserdata(state, NULL);
+ lua_pushcclosure(state, &zcp_user_props_iter, 2);
+ return (1);
+}
+
+
+/*
+ * Populate nv with all valid system properties and their values for the given
+ * dataset.
+ */
+static void
+zcp_dataset_system_props(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ for (int prop = ZFS_PROP_TYPE; prop < ZFS_NUM_PROPS; prop++) {
+ /* Do not display hidden props */
+ if (!zfs_prop_visible(prop))
+ continue;
+ /* Do not display props not valid for this dataset */
+ if (!prop_valid_for_ds(ds, prop))
+ continue;
+ fnvlist_add_boolean(nv, zfs_prop_to_name(prop));
+ }
+}
+
+static int zcp_system_props_list(lua_State *);
+static zcp_list_info_t zcp_system_props_list_info = {
+ .name = "system_properties",
+ .func = zcp_system_props_list,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+/*
+ * Get a list of all visible system properties and their values for a given
+ * dataset. Returned on the stack as a Lua table.
+ */
+static int
+zcp_system_props_list(lua_State *state)
+{
+ int error;
+ char errbuf[128];
+ const char *dataset_name;
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ zcp_list_info_t *libinfo = &zcp_system_props_list_info;
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+ dataset_name = lua_tostring(state, 1);
+ nvlist_t *nv = fnvlist_alloc();
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ /* Get the names of all valid system properties for this dataset */
+ zcp_dataset_system_props(ds, nv);
+ dsl_dataset_rele(ds, FTAG);
+
+ /* push list as lua table */
+ error = zcp_nvlist_to_lua(state, nv, errbuf, sizeof (errbuf));
+ nvlist_free(nv);
+ if (error != 0) {
+ return (luaL_error(state,
+ "Error returning nvlist: %s", errbuf));
+ }
+ return (1);
+}
+
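+/*
+ * Iterator closure for zfs.list.bookmarks(dataset). The upvalues hold the
+ * dataset's object number and a serialized ZAP cursor into its bookmarks ZAP
+ * object. Each call retrieves the next bookmark, re-serializes the cursor,
+ * and returns the full "pool/fs#bookmark" name.
+ */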
+static int
+zcp_bookmarks_iter(lua_State *state)
+{
+ char ds_name[ZFS_MAX_DATASET_NAME_LEN];
+ char bookmark_name[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ dsl_dataset_t *ds;
+ zap_attribute_t za;
+ zap_cursor_t zc;
+
+ int err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ if (!dsl_dataset_is_zapified(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ err = zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_BOOKMARK_NAMES, sizeof (ds->ds_bookmarks_obj), 1,
+ &ds->ds_bookmarks_obj);
+ if (err != 0 && err != ENOENT) {
+ dsl_dataset_rele(ds, FTAG);
+ return (luaL_error(state,
+ "unexpected error %d from zap_lookup()", err));
+ }
+ if (ds->ds_bookmarks_obj == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ /* Store the dataset's name so we can append the bookmark's name */
+ dsl_dataset_name(ds, ds_name);
+
+ zap_cursor_init_serialized(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_bookmarks_obj, cursor);
+ dsl_dataset_rele(ds, FTAG);
+
+ err = zap_cursor_retrieve(&zc, &za);
+ if (err != 0) {
+ zap_cursor_fini(&zc);
+ if (err != ENOENT) {
+ return (luaL_error(state,
+ "unexpected error %d from zap_cursor_retrieve()",
+ err));
+ }
+ return (0);
+ }
+ zap_cursor_advance(&zc);
+ cursor = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+
+ /* Create the full "pool/fs#bookmark" string to return */
+ int n = snprintf(bookmark_name, ZFS_MAX_DATASET_NAME_LEN, "%s#%s",
+ ds_name, za.za_name);
+ if (n >= ZFS_MAX_DATASET_NAME_LEN) {
+ return (luaL_error(state,
+ "unexpected error %d from snprintf()", ENAMETOOLONG));
+ }
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, bookmark_name);
+ return (1);
+}
+
+static int zcp_bookmarks_list(lua_State *);
+static zcp_list_info_t zcp_bookmarks_list_info = {
+ .name = "bookmarks",
+ .func = zcp_bookmarks_list,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_bookmarks_list(lua_State *state)
+{
+ const char *dsname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ boolean_t issnap = ds->ds_is_snapshot;
+ uint64_t dsobj = ds->ds_object;
+ uint64_t cursor = 0;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (issnap) {
+ return (zcp_argerror(state, 1, "%s is a snapshot", dsname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, cursor);
+ lua_pushcclosure(state, &zcp_bookmarks_iter, 2);
+ return (1);
+}
+
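+/*
+ * Iterator closure for zfs.list.holds(snapshot). Walks the snapshot's
+ * user-hold ZAP object (ds_userrefs_obj) using a serialized cursor kept as an
+ * upvalue, returning each hold's tag and creation time.
+ */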
+static int
+zcp_holds_iter(lua_State *state)
+{
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ dsl_dataset_t *ds;
+ zap_attribute_t za;
+ zap_cursor_t zc;
+
+ int err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ zap_cursor_init_serialized(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_userrefs_obj, cursor);
+ dsl_dataset_rele(ds, FTAG);
+
+ err = zap_cursor_retrieve(&zc, &za);
+ if (err != 0) {
+ zap_cursor_fini(&zc);
+ if (err != ENOENT) {
+ return (luaL_error(state,
+ "unexpected error %d from zap_cursor_retrieve()",
+ err));
+ }
+ return (0);
+ }
+ zap_cursor_advance(&zc);
+ cursor = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, za.za_name);
+ (void) lua_pushnumber(state, za.za_first_integer);
+ return (2);
+}
+
+static int zcp_holds_list(lua_State *);
+static zcp_list_info_t zcp_holds_list_info = {
+ .name = "holds",
+ .func = zcp_holds_list,
+ .gc = NULL,
+ .pargs = {
+ { .za_name = "snapshot", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+/*
+ * Iterate over all the holds for a given dataset. Each iteration returns
+ * a hold's tag and its timestamp as an integer.
+ */
+static int
+zcp_holds_list(lua_State *state)
+{
+ const char *snapname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ boolean_t issnap = ds->ds_is_snapshot;
+ uint64_t dsobj = ds->ds_object;
+ uint64_t cursor = 0;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (!issnap) {
+ return (zcp_argerror(state, 1, "%s is not a snapshot",
+ snapname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, cursor);
+ lua_pushcclosure(state, &zcp_holds_iter, 2);
+ return (1);
+}
+
+static int
+zcp_list_func(lua_State *state)
+{
+ zcp_list_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
+
+ zcp_parse_args(state, info->name, info->pargs, info->kwargs);
+
+ return (info->func(state));
+}
+
+int
+zcp_load_list_lib(lua_State *state)
+{
+ int i;
+ zcp_list_info_t *zcp_list_funcs[] = {
+ &zcp_children_list_info,
+ &zcp_snapshots_list_info,
+ &zcp_user_props_list_info,
+ &zcp_props_list_info,
+ &zcp_clones_list_info,
+ &zcp_system_props_list_info,
+ &zcp_bookmarks_list_info,
+ &zcp_holds_list_info,
+ NULL
+ };
+
+ lua_newtable(state);
+
+ for (i = 0; zcp_list_funcs[i] != NULL; i++) {
+ zcp_list_info_t *info = zcp_list_funcs[i];
+
+ if (info->gc != NULL) {
+ /*
+ * If the function requires garbage collection, create
+ * a metatable with its name and register the __gc
+ * function.
+ */
+ (void) luaL_newmetatable(state, info->name);
+ (void) lua_pushstring(state, "__gc");
+ lua_pushcfunction(state, info->gc);
+ lua_settable(state, -3);
+ lua_pop(state, 1);
+ }
+
+ lua_pushlightuserdata(state, info);
+ lua_pushcclosure(state, &zcp_list_func, 1);
+ lua_setfield(state, -2, info->name);
+ info++;
+ }
+
+ return (1);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zcp_set.c b/sys/contrib/openzfs/module/zfs/zcp_set.c
new file mode 100644
index 000000000000..cebb56a5f181
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp_set.c
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2020 Joyent, Inc.
+ */
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lualib.h>
+#include <sys/lua/lauxlib.h>
+
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/zcp.h>
+#include <sys/zcp_set.h>
+#include <sys/zcp_iter.h>
+#include <sys/zcp_global.h>
+#include <sys/zvol.h>
+
+#include <zfs_prop.h>
+
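+/*
+ * Set a single user property on the named dataset in syncing context by
+ * wrapping it in a one-entry nvlist and handing it to
+ * dsl_props_set_sync_impl().
+ */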
+static void
+zcp_set_user_prop(lua_State *state, dsl_pool_t *dp, const char *dsname,
+ const char *prop_name, const char *prop_val, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG);
+ if (ds == NULL)
+ return; /* not reached; zcp_dataset_hold() longjmp'd */
+
+ nvlist_t *nvl = fnvlist_alloc();
+ fnvlist_add_string(nvl, prop_name, prop_val);
+
+ dsl_props_set_sync_impl(ds, ZPROP_SRC_LOCAL, nvl, tx);
+
+ fnvlist_free(nvl);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+zcp_set_prop_check(void *arg, dmu_tx_t *tx)
+{
+ zcp_set_prop_arg_t *args = arg;
+ const char *prop_name = args->prop;
+ dsl_props_set_arg_t dpsa = {
+ .dpsa_dsname = args->dsname,
+ .dpsa_source = ZPROP_SRC_LOCAL,
+ };
+ nvlist_t *nvl = NULL;
+ int ret = 0;
+
+ /*
+ * Only user properties are currently supported. When non-user
+ * properties are supported, we will want to use
+ * zfs_valid_proplist() to verify the properties.
+ */
+ if (!zfs_prop_user(prop_name)) {
+ return (EINVAL);
+ }
+
+ nvl = fnvlist_alloc();
+ fnvlist_add_string(nvl, args->prop, args->val);
+ dpsa.dpsa_props = nvl;
+
+ ret = dsl_props_set_check(&dpsa, tx);
+ nvlist_free(nvl);
+
+ return (ret);
+}
+
+void
+zcp_set_prop_sync(void *arg, dmu_tx_t *tx)
+{
+ zcp_set_prop_arg_t *args = arg;
+ zcp_run_info_t *ri = zcp_run_info(args->state);
+ dsl_pool_t *dp = ri->zri_pool;
+
+ const char *dsname = args->dsname;
+ const char *prop_name = args->prop;
+ const char *prop_val = args->val;
+
+ if (zfs_prop_user(prop_name)) {
+ zcp_set_user_prop(args->state, dp, dsname, prop_name,
+ prop_val, tx);
+ }
+}
diff --git a/sys/contrib/openzfs/module/zfs/zcp_synctask.c b/sys/contrib/openzfs/module/zfs/zcp_synctask.c
new file mode 100644
index 000000000000..4e0fa0d85cbf
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp_synctask.c
@@ -0,0 +1,544 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
+ * Copyright 2020 Joyent, Inc.
+ */
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lauxlib.h>
+
+#include <sys/zcp.h>
+#include <sys/zcp_set.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dmu_objset.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfeature.h>
+#include <sys/metaslab.h>
+
+#define DST_AVG_BLKSHIFT 14
+
+typedef struct zcp_inherit_prop_arg {
+ lua_State *zipa_state;
+ const char *zipa_prop;
+ dsl_props_set_arg_t zipa_dpsa;
+} zcp_inherit_prop_arg_t;
+
+typedef int (zcp_synctask_func_t)(lua_State *, boolean_t, nvlist_t *);
+typedef struct zcp_synctask_info {
+ const char *name;
+ zcp_synctask_func_t *func;
+ const zcp_arg_t pargs[4];
+ const zcp_arg_t kwargs[2];
+ zfs_space_check_t space_check;
+ int blocks_modified;
+} zcp_synctask_info_t;
+
+/*
+ * Generic synctask interface for channel program syncfuncs.
+ *
+ * To perform some action in syncing context, we'd generally call
+ * dsl_sync_task(), but since the Lua script is already running inside a
+ * synctask we need to leave out some actions (such as acquiring the config
+ * rwlock and performing space checks).
+ *
+ * If 'sync' is false, executes a dry run and returns the error code.
+ *
+ * If we are not running in syncing context and we are not doing a dry run
+ * (meaning we are running a zfs.sync function in open-context) then we
+ * return a Lua error.
+ *
+ * This function also handles common fatal error cases for channel program
+ * library functions. If a fatal error occurs, err_dsname will be the dataset
+ * name reported in error messages, if supplied.
+ */
+static int
+zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg, boolean_t sync, const char *err_dsname)
+{
+ int err;
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ err = checkfunc(arg, ri->zri_tx);
+ if (!sync)
+ return (err);
+
+ if (!ri->zri_sync) {
+ return (luaL_error(state, "running functions from the zfs.sync "
+ "submodule requires passing sync=TRUE to "
+ "lzc_channel_program() (i.e. do not specify the \"-n\" "
+ "command line argument)"));
+ }
+
+ if (err == 0) {
+ syncfunc(arg, ri->zri_tx);
+ } else if (err == EIO) {
+ if (err_dsname != NULL) {
+ return (luaL_error(state,
+ "I/O error while accessing dataset '%s'",
+ err_dsname));
+ } else {
+ return (luaL_error(state,
+ "I/O error while accessing dataset."));
+ }
+ }
+
+ return (err);
+}
+
+
+static int zcp_synctask_destroy(lua_State *, boolean_t, nvlist_t *);
+static zcp_synctask_info_t zcp_synctask_destroy_info = {
+ .name = "destroy",
+ .func = zcp_synctask_destroy,
+ .pargs = {
+ {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN},
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_DESTROY,
+ .blocks_modified = 0
+};
+
+/* ARGSUSED */
+static int
+zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ const char *dsname = lua_tostring(state, 1);
+
+ boolean_t issnap = (strchr(dsname, '@') != NULL);
+
+ if (!issnap && !lua_isnil(state, 2)) {
+ return (luaL_error(state,
+ "'deferred' kwarg only supported for snapshots: %s",
+ dsname));
+ }
+
+ if (issnap) {
+ dsl_destroy_snapshot_arg_t ddsa = { 0 };
+ ddsa.ddsa_name = dsname;
+ if (!lua_isnil(state, 2)) {
+ ddsa.ddsa_defer = lua_toboolean(state, 2);
+ } else {
+ ddsa.ddsa_defer = B_FALSE;
+ }
+
+ err = zcp_sync_task(state, dsl_destroy_snapshot_check,
+ dsl_destroy_snapshot_sync, &ddsa, sync, dsname);
+ } else {
+ dsl_destroy_head_arg_t ddha = { 0 };
+ ddha.ddha_name = dsname;
+
+ err = zcp_sync_task(state, dsl_destroy_head_check,
+ dsl_destroy_head_sync, &ddha, sync, dsname);
+ }
+
+ return (err);
+}
+
+static int zcp_synctask_promote(lua_State *, boolean_t, nvlist_t *);
+static zcp_synctask_info_t zcp_synctask_promote_info = {
+ .name = "promote",
+ .func = zcp_synctask_promote,
+ .pargs = {
+ {.za_name = "clone", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_RESERVED,
+ .blocks_modified = 3
+};
+
+static int
+zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ dsl_dataset_promote_arg_t ddpa = { 0 };
+ const char *dsname = lua_tostring(state, 1);
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ ddpa.ddpa_clonename = dsname;
+ ddpa.err_ds = err_details;
+ ddpa.cr = ri->zri_cred;
+ ddpa.proc = ri->zri_proc;
+
+ /*
+ * If there was a snapshot name conflict, then err_ds will be filled
+ * with a list of conflicting snapshot names.
+ */
+ err = zcp_sync_task(state, dsl_dataset_promote_check,
+ dsl_dataset_promote_sync, &ddpa, sync, dsname);
+
+ return (err);
+}
+
+static int zcp_synctask_rollback(lua_State *, boolean_t, nvlist_t *err_details);
+static zcp_synctask_info_t zcp_synctask_rollback_info = {
+ .name = "rollback",
+ .func = zcp_synctask_rollback,
+ .space_check = ZFS_SPACE_CHECK_RESERVED,
+ .blocks_modified = 1,
+ .pargs = {
+ {.za_name = "filesystem", .za_lua_type = LUA_TSTRING},
+ {0, 0}
+ },
+ .kwargs = {
+ {0, 0}
+ }
+};
+
+static int
+zcp_synctask_rollback(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ const char *dsname = lua_tostring(state, 1);
+ dsl_dataset_rollback_arg_t ddra = { 0 };
+
+ ddra.ddra_fsname = dsname;
+ ddra.ddra_result = err_details;
+
+ err = zcp_sync_task(state, dsl_dataset_rollback_check,
+ dsl_dataset_rollback_sync, &ddra, sync, dsname);
+
+ return (err);
+}
+
+static int zcp_synctask_snapshot(lua_State *, boolean_t, nvlist_t *);
+static zcp_synctask_info_t zcp_synctask_snapshot_info = {
+ .name = "snapshot",
+ .func = zcp_synctask_snapshot,
+ .pargs = {
+ {.za_name = "filesystem@snapname | volume@snapname",
+ .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_NORMAL,
+ .blocks_modified = 3
+};
+
+/* ARGSUSED */
+static int
+zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ dsl_dataset_snapshot_arg_t ddsa = { 0 };
+ const char *dsname = lua_tostring(state, 1);
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ /*
+ * On old pools, the ZIL must not be active when a snapshot is created,
+ * but we can't suspend the ZIL because we're already in syncing
+ * context.
+ */
+ if (spa_version(ri->zri_pool->dp_spa) < SPA_VERSION_FAST_SNAP) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ /*
+ * We only allow for a single snapshot rather than a list, so the
+ * error list output is unnecessary.
+ */
+ ddsa.ddsa_errors = NULL;
+ ddsa.ddsa_props = NULL;
+ ddsa.ddsa_cr = ri->zri_cred;
+ ddsa.ddsa_proc = ri->zri_proc;
+ ddsa.ddsa_snaps = fnvlist_alloc();
+ fnvlist_add_boolean(ddsa.ddsa_snaps, dsname);
+
+ zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
+ (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps);
+
+ err = zcp_sync_task(state, dsl_dataset_snapshot_check,
+ dsl_dataset_snapshot_sync, &ddsa, sync, dsname);
+
+ if (err == 0) {
+ /*
+ * We may need to create a new device minor node for this
+ * dataset (if it is a zvol and the "snapdev" property is set).
+ * Save it in the nvlist so that it can be processed in open
+ * context.
+ */
+ fnvlist_add_boolean(ri->zri_new_zvols, dsname);
+ }
+
+ zcp_deregister_cleanup(state, zch);
+ fnvlist_free(ddsa.ddsa_snaps);
+
+ return (err);
+}
+
+static int zcp_synctask_inherit_prop(lua_State *, boolean_t,
+ nvlist_t *err_details);
+static zcp_synctask_info_t zcp_synctask_inherit_prop_info = {
+ .name = "inherit",
+ .func = zcp_synctask_inherit_prop,
+ .space_check = ZFS_SPACE_CHECK_RESERVED,
+ .blocks_modified = 2, /* 2 * numprops */
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING },
+ { .za_name = "property", .za_lua_type = LUA_TSTRING },
+ { NULL, 0 }
+ },
+ .kwargs = {
+ { NULL, 0 }
+ },
+};
+
+static int
+zcp_synctask_inherit_prop_check(void *arg, dmu_tx_t *tx)
+{
+ zcp_inherit_prop_arg_t *args = arg;
+ zfs_prop_t prop = zfs_name_to_prop(args->zipa_prop);
+
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_user(args->zipa_prop))
+ return (0);
+
+ return (EINVAL);
+ }
+
+ if (zfs_prop_readonly(prop))
+ return (EINVAL);
+
+ if (!zfs_prop_inheritable(prop))
+ return (EINVAL);
+
+ return (dsl_props_set_check(&args->zipa_dpsa, tx));
+}
+
+static void
+zcp_synctask_inherit_prop_sync(void *arg, dmu_tx_t *tx)
+{
+ zcp_inherit_prop_arg_t *args = arg;
+ dsl_props_set_arg_t *dpsa = &args->zipa_dpsa;
+
+ dsl_props_set_sync(dpsa, tx);
+}
+
+static int
+zcp_synctask_inherit_prop(lua_State *state, boolean_t sync,
+ nvlist_t *err_details)
+{
+ int err;
+ zcp_inherit_prop_arg_t zipa = { 0 };
+ dsl_props_set_arg_t *dpsa = &zipa.zipa_dpsa;
+
+ const char *dsname = lua_tostring(state, 1);
+ const char *prop = lua_tostring(state, 2);
+
+ zipa.zipa_state = state;
+ zipa.zipa_prop = prop;
+ dpsa->dpsa_dsname = dsname;
+ dpsa->dpsa_source = ZPROP_SRC_INHERITED;
+ dpsa->dpsa_props = fnvlist_alloc();
+ fnvlist_add_boolean(dpsa->dpsa_props, prop);
+
+ zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
+ (zcp_cleanup_t *)&fnvlist_free, dpsa->dpsa_props);
+
+ err = zcp_sync_task(state, zcp_synctask_inherit_prop_check,
+ zcp_synctask_inherit_prop_sync, &zipa, sync, dsname);
+
+ zcp_deregister_cleanup(state, zch);
+ fnvlist_free(dpsa->dpsa_props);
+
+ return (err);
+}
+
+static int zcp_synctask_bookmark(lua_State *, boolean_t, nvlist_t *);
+static zcp_synctask_info_t zcp_synctask_bookmark_info = {
+ .name = "bookmark",
+ .func = zcp_synctask_bookmark,
+ .pargs = {
+ {.za_name = "snapshot | bookmark", .za_lua_type = LUA_TSTRING},
+ {.za_name = "bookmark", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_NORMAL,
+ .blocks_modified = 1,
+};
+
+/* ARGSUSED */
+static int
+zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ const char *source = lua_tostring(state, 1);
+ const char *new = lua_tostring(state, 2);
+
+ nvlist_t *bmarks = fnvlist_alloc();
+ fnvlist_add_string(bmarks, new, source);
+
+ zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
+ (zcp_cleanup_t *)&fnvlist_free, bmarks);
+
+ dsl_bookmark_create_arg_t dbca = {
+ .dbca_bmarks = bmarks,
+ .dbca_errors = NULL,
+ };
+ err = zcp_sync_task(state, dsl_bookmark_create_check,
+ dsl_bookmark_create_sync, &dbca, sync, source);
+
+ zcp_deregister_cleanup(state, zch);
+ fnvlist_free(bmarks);
+
+ return (err);
+}
+
+static int zcp_synctask_set_prop(lua_State *, boolean_t, nvlist_t *err_details);
+static zcp_synctask_info_t zcp_synctask_set_prop_info = {
+ .name = "set_prop",
+ .func = zcp_synctask_set_prop,
+ .space_check = ZFS_SPACE_CHECK_RESERVED,
+ .blocks_modified = 2,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ { .za_name = "property", .za_lua_type = LUA_TSTRING},
+ { .za_name = "value", .za_lua_type = LUA_TSTRING},
+ { NULL, 0 }
+ },
+ .kwargs = {
+ { NULL, 0 }
+ }
+};
+
+static int
+zcp_synctask_set_prop(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ zcp_set_prop_arg_t args = { 0 };
+
+ const char *dsname = lua_tostring(state, 1);
+ const char *prop = lua_tostring(state, 2);
+ const char *val = lua_tostring(state, 3);
+
+ args.state = state;
+ args.dsname = dsname;
+ args.prop = prop;
+ args.val = val;
+
+ err = zcp_sync_task(state, zcp_set_prop_check, zcp_set_prop_sync,
+ &args, sync, dsname);
+
+ return (err);
+}
+
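+/*
+ * Common Lua entry point for all synctask functions. Upvalue 1 is the
+ * zcp_synctask_info_t describing the function; upvalue 2 is the boolean
+ * 'sync' flag passed to zcp_load_synctask_lib(), which distinguishes the
+ * syncing submodule from the dry-run one. The wrapper parses the arguments,
+ * checks the estimated space consumption against the pool's remaining
+ * allowance for this program, invokes the function, and returns the error
+ * code plus (if present) a table of error details.
+ */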
+static int
+zcp_synctask_wrapper(lua_State *state)
+{
+ int err;
+ zcp_cleanup_handler_t *zch;
+ int num_ret = 1;
+ nvlist_t *err_details = fnvlist_alloc();
+
+ /*
+ * Make sure err_details is properly freed, even if a fatal error is
+ * thrown during the synctask.
+ */
+ zch = zcp_register_cleanup(state,
+ (zcp_cleanup_t *)&fnvlist_free, err_details);
+
+ zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
+ boolean_t sync = lua_toboolean(state, lua_upvalueindex(2));
+
+ zcp_run_info_t *ri = zcp_run_info(state);
+ dsl_pool_t *dp = ri->zri_pool;
+
+ /* MOS space is triple-dittoed, so we multiply by 3. */
+ uint64_t funcspace =
+ ((uint64_t)info->blocks_modified << DST_AVG_BLKSHIFT) * 3;
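+ /*
+ * For example, with blocks_modified = 3 the estimate above is
+ * 3 << 14 = 48 KiB, tripled to 144 KiB.
+ */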
+
+ zcp_parse_args(state, info->name, info->pargs, info->kwargs);
+
+ err = 0;
+ if (info->space_check != ZFS_SPACE_CHECK_NONE) {
+ uint64_t quota = dsl_pool_unreserved_space(dp,
+ info->space_check);
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes +
+ ri->zri_space_used;
+
+ if (used + funcspace > quota) {
+ err = SET_ERROR(ENOSPC);
+ }
+ }
+
+ if (err == 0) {
+ err = info->func(state, sync, err_details);
+ }
+
+ if (err == 0) {
+ ri->zri_space_used += funcspace;
+ }
+
+ lua_pushnumber(state, (lua_Number)err);
+ if (fnvlist_num_pairs(err_details) > 0) {
+ (void) zcp_nvlist_to_lua(state, err_details, NULL, 0);
+ num_ret++;
+ }
+
+ zcp_deregister_cleanup(state, zch);
+ fnvlist_free(err_details);
+
+ return (num_ret);
+}
+
+int
+zcp_load_synctask_lib(lua_State *state, boolean_t sync)
+{
+ int i;
+ zcp_synctask_info_t *zcp_synctask_funcs[] = {
+ &zcp_synctask_destroy_info,
+ &zcp_synctask_promote_info,
+ &zcp_synctask_rollback_info,
+ &zcp_synctask_snapshot_info,
+ &zcp_synctask_inherit_prop_info,
+ &zcp_synctask_bookmark_info,
+ &zcp_synctask_set_prop_info,
+ NULL
+ };
+
+ lua_newtable(state);
+
+ for (i = 0; zcp_synctask_funcs[i] != NULL; i++) {
+ zcp_synctask_info_t *info = zcp_synctask_funcs[i];
+ lua_pushlightuserdata(state, info);
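+/*
+ * Return an iterator over the user properties of the given dataset. For
+ * illustration ("pool/fs" is a placeholder), a channel program typically
+ * consumes it as:
+ *
+ *   for name, value, source in zfs.list.user_properties("pool/fs") do
+ *       ...
+ *   end
+ *
+ * zfs.list.properties() behaves identically via the alias above.
+ */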
+ lua_pushboolean(state, sync);
+ lua_pushcclosure(state, &zcp_synctask_wrapper, 2);
+ lua_setfield(state, -2, info->name);
+ info++;
+ }
+
+ return (1);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c
new file mode 100644
index 000000000000..9d16fff81d0a
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfeature.c
@@ -0,0 +1,526 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfeature.h>
+#include <sys/dmu.h>
+#include <sys/nvpair.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include "zfeature_common.h"
+#include <sys/spa_impl.h>
+
+/*
+ * ZFS Feature Flags
+ * -----------------
+ *
+ * ZFS feature flags are used to provide fine-grained versioning to the ZFS
+ * on-disk format. Once enabled on a pool, feature flags replace the old
+ * spa_version() number.
+ *
+ * Each new on-disk format change will be given a uniquely identifying string
+ * GUID rather than a version number. This avoids the problem of different
+ * organizations creating new on-disk formats with the same version number. To
+ * keep feature GUIDs unique they should consist of the reverse dns name of the
+ * organization which implemented the feature and a short name for the feature,
+ * separated by a colon (e.g. com.delphix:async_destroy).
+ *
+ * Reference Counts
+ * ----------------
+ *
+ * Within each pool features can be in one of three states: disabled, enabled,
+ * or active. These states are differentiated by a reference count stored on
+ * disk for each feature:
+ *
+ * 1) If there is no reference count stored on disk, the feature is disabled.
+ * 2) If the reference count is 0, a system administrator has enabled the
+ * feature, but the feature has not been used yet, so no on-disk
+ * format changes have been made.
+ * 3) If the reference count is greater than 0, the feature is active.
+ * The format changes required by the feature are currently on disk.
+ * Note that if the feature's format changes are reversed the feature
+ * may choose to set its reference count back to 0.
+ *
+ * Feature flags make no differentiation between non-zero reference counts
+ * for an active feature (e.g. a reference count of 1 means the same thing as a
+ * reference count of 27834721), but feature implementations may choose to use
+ * the reference count to store meaningful information. For example, a new RAID
+ * implementation might set the reference count to the number of vdevs using
+ * it. If all those disks are removed from the pool the feature goes back to
+ * having a reference count of 0.
+ *
+ * It is the responsibility of the individual features to maintain a non-zero
+ * reference count as long as the feature's format changes are present on disk.
+ *
+ * Dependencies
+ * ------------
+ *
+ * Each feature may depend on other features. The only effect of this
+ * relationship is that when a feature is enabled all of its dependencies are
+ * automatically enabled as well. Any future work to support disabling of
+ * features would need to ensure that features cannot be disabled if other
+ * enabled features depend on them.
+ *
+ * On-disk Format
+ * --------------
+ *
+ * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES
+ * (5000). In order for this to work, the pool is automatically upgraded to
+ * SPA_VERSION_BEFORE_FEATURES (28) first, so that all on-disk format
+ * changes that predate feature flags will be in use.
+ *
+ * Information about features is stored in 3 ZAP objects in the pool's MOS.
+ * These objects are linked to by the following names in the pool directory
+ * object:
+ *
+ * 1) features_for_read: feature GUID -> reference count
+ * Features needed to open the pool for reading.
+ * 2) features_for_write: feature GUID -> reference count
+ * Features needed to open the pool for writing.
+ * 3) feature_descriptions: feature GUID -> descriptive string
+ * A human readable string.
+ *
+ * All enabled features appear in either features_for_read or
+ * features_for_write, but not both.
+ *
+ * To open a pool in read-only mode only the features listed in
+ * features_for_read need to be supported.
+ *
+ * To open the pool in read-write mode features in both features_for_read and
+ * features_for_write need to be supported.
+ *
+ * Some features may be required to read the ZAP objects containing feature
+ * information. To allow software to check for compatibility with these features
+ * before the pool is opened, their names must be stored in the label in a
+ * new "features_for_read" entry (note that features that are only required
+ * to write to a pool never need to be stored in the label since the
+ * features_for_write ZAP object can be read before the pool is written to).
+ * To save space in the label, features must be explicitly marked as needing to
+ * be written to the label. Also, reference counts are not stored in the label;
+ * instead, any feature whose reference count drops to 0 is removed from the
+ * label.
+ *
+ * Adding New Features
+ * -------------------
+ *
+ * Features must be registered in zpool_feature_init() function in
+ * zfeature_common.c using the zfeature_register() function. This function
+ * has arguments to specify if the feature should be stored in the
+ * features_for_read or features_for_write ZAP object and if it needs to be
+ * written to the label when active.
+ *
+ * Once a feature is registered it will appear as a "feature@<feature name>"
+ * property which can be set by an administrator. Feature implementors should
+ * use the spa_feature_is_enabled() and spa_feature_is_active() functions to
+ * query the state of a feature and the spa_feature_incr() and
+ * spa_feature_decr() functions to change an enabled feature's reference count.
+ * Reference counts may only be updated in the syncing context.
+ *
+ * Features may not perform enable-time initialization. Instead, any such
+ * initialization should occur when the feature is first used. This design
+ * enforces that on-disk changes be made only when features are used. Code
+ * should only check if a feature is enabled using spa_feature_is_enabled(),
+ * not by relying on any feature-specific metadata existing. If a feature is
+ * enabled, but the feature's metadata is not on disk yet, then it should be
+ * created as needed.
+ *
+ * As an example, consider the com.delphix:async_destroy feature. This feature
+ * relies on the existence of a bptree in the MOS that stores blocks for
+ * asynchronous freeing. This bptree is not created when async_destroy is
+ * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is
+ * called to check if async_destroy is enabled. If it is and the bptree object
+ * does not exist yet, the bptree object is created as part of the dataset
+ * destroy and async_destroy's reference count is incremented to indicate it
+ * has made an on-disk format change. Later, after the destroyed dataset's
+ * blocks have all been asynchronously freed there is no longer any use for the
+ * bptree object, so it is destroyed and async_destroy's reference count is
+ * decremented back to 0 to indicate that it has undone its on-disk format
+ * changes.
+ */
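+/*
+ * For illustration, the pattern described under "Adding New Features"
+ * typically looks like the following in a feature's syncing-context code
+ * (SPA_FEATURE_FOO, foo_create_metadata() and foo_metadata_is_empty() are
+ * placeholders, not real symbols):
+ *
+ *   if (spa_feature_is_enabled(spa, SPA_FEATURE_FOO) &&
+ *       !spa_feature_is_active(spa, SPA_FEATURE_FOO)) {
+ *           foo_create_metadata(spa, tx);
+ *           spa_feature_incr(spa, SPA_FEATURE_FOO, tx);
+ *   }
+ *   ...
+ *   if (foo_metadata_is_empty(spa))
+ *           spa_feature_decr(spa, SPA_FEATURE_FOO, tx);
+ */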
+
+typedef enum {
+ FEATURE_ACTION_INCR,
+ FEATURE_ACTION_DECR,
+} feature_action_t;
+
+/*
+ * Checks that the active features in the pool are supported by
+ * this software. Adds each unsupported feature (name -> description) to
+ * the supplied nvlist.
+ */
+boolean_t
+spa_features_check(spa_t *spa, boolean_t for_write,
+ nvlist_t *unsup_feat, nvlist_t *enabled_feat)
+{
+ objset_t *os = spa->spa_meta_objset;
+ boolean_t supported;
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+ uint64_t obj = for_write ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+ char *buf;
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ supported = B_TRUE;
+ for (zap_cursor_init(zc, os, obj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ ASSERT(za->za_integer_length == sizeof (uint64_t) &&
+ za->za_num_integers == 1);
+
+ if (NULL != enabled_feat) {
+ fnvlist_add_uint64(enabled_feat, za->za_name,
+ za->za_first_integer);
+ }
+
+ if (za->za_first_integer != 0 &&
+ !zfeature_is_supported(za->za_name)) {
+ supported = B_FALSE;
+
+ if (NULL != unsup_feat) {
+ const char *desc = "";
+
+ if (zap_lookup(os, spa->spa_feat_desc_obj,
+ za->za_name, 1, MAXPATHLEN, buf) == 0)
+ desc = buf;
+
+ VERIFY(nvlist_add_string(unsup_feat,
+ za->za_name, desc) == 0);
+ }
+ }
+ }
+ zap_cursor_fini(zc);
+
+ kmem_free(buf, MAXPATHLEN);
+ kmem_free(za, sizeof (zap_attribute_t));
+ kmem_free(zc, sizeof (zap_cursor_t));
+
+ return (supported);
+}
+
+/*
+ * Use an in-memory cache of feature refcounts for quick retrieval.
+ *
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb, zhack, and spa_add_feature_stats().
+ */
+int
+feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
+{
+ ASSERT(VALID_FEATURE_FID(feature->fi_feature));
+ if (spa->spa_feat_refcount_cache[feature->fi_feature] ==
+ SPA_FEATURE_DISABLED) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ *res = spa->spa_feat_refcount_cache[feature->fi_feature];
+ return (0);
+}
+
+/*
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb and zhack.
+ */
+int
+feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
+ uint64_t *res)
+{
+ int err;
+ uint64_t refcount;
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ /*
+ * If the pool is currently being created, the feature objects may not
+ * have been allocated yet. Act as though all features are disabled.
+ */
+ if (zapobj == 0)
+ return (SET_ERROR(ENOTSUP));
+
+ err = zap_lookup(spa->spa_meta_objset, zapobj,
+ feature->fi_guid, sizeof (uint64_t), 1, &refcount);
+ if (err != 0) {
+ if (err == ENOENT)
+ return (SET_ERROR(ENOTSUP));
+ else
+ return (err);
+ }
+ *res = refcount;
+ return (0);
+}
+
+
+static int
+feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
+{
+ uint64_t enabled_txg_obj __maybe_unused = spa->spa_feat_enabled_txg_obj;
+
+ ASSERT(zfeature_depends_on(feature->fi_feature,
+ SPA_FEATURE_ENABLED_TXG));
+
+ if (!spa_feature_is_enabled(spa, feature->fi_feature)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ ASSERT(enabled_txg_obj != 0);
+
+ VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj,
+ feature->fi_guid, sizeof (uint64_t), 1, res));
+
+ return (0);
+}
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
+ dmu_tx_t *tx)
+{
+ ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature));
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+ VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid,
+ sizeof (uint64_t), 1, &refcount, tx));
+
+ /*
+ * feature_sync is called directly from zhack, allowing the
+ * creation of arbitrary features whose fi_feature field may
+ * be greater than SPA_FEATURES. When called from zhack, the
+ * zfeature_info_t object's fi_feature field will be set to
+ * SPA_FEATURE_NONE.
+ */
+ if (feature->fi_feature != SPA_FEATURE_NONE) {
+ uint64_t *refcount_cache =
+ &spa->spa_feat_refcount_cache[feature->fi_feature];
+ VERIFY3U(*refcount_cache, ==,
+ atomic_swap_64(refcount_cache, refcount));
+ }
+
+ if (refcount == 0)
+ spa_deactivate_mos_feature(spa, feature->fi_guid);
+ else if (feature->fi_flags & ZFEATURE_FLAG_MOS)
+ spa_activate_mos_feature(spa, feature->fi_guid, tx);
+}
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+ uint64_t initial_refcount =
+ (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0;
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ ASSERT(0 != zapobj);
+ ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+ ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+
+ /*
+ * If the feature is already enabled, ignore the request.
+ */
+ if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0)
+ return;
+
+ for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++)
+ spa_feature_enable(spa, feature->fi_depends[i], tx);
+
+ VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj,
+ feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
+ feature->fi_desc, tx));
+
+ feature_sync(spa, feature, initial_refcount, tx);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) {
+ uint64_t enabling_txg = dmu_tx_get_txg(tx);
+
+ if (spa->spa_feat_enabled_txg_obj == 0ULL) {
+ spa->spa_feat_enabled_txg_obj =
+ zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURE_ENABLED_TXG, tx);
+ }
+ spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx);
+
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ spa->spa_feat_enabled_txg_obj, feature->fi_guid,
+ sizeof (uint64_t), 1, &enabling_txg, tx));
+ }
+
+ /*
+ * Errata #4 is mostly a problem with encrypted datasets, but it
+ * is also a problem where the old encryption feature did not
+ * depend on the bookmark_v2 feature. If the pool does not have
+ * any encrypted datasets we can resolve this issue simply by
+ * enabling this dependency.
+ */
+ if (spa->spa_errata == ZPOOL_ERRATA_ZOL_8308_ENCRYPTION &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
+ !spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION) &&
+ feature->fi_feature == SPA_FEATURE_BOOKMARK_V2)
+ spa->spa_errata = 0;
+}
+
+static void
+feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action,
+ dmu_tx_t *tx)
+{
+ uint64_t refcount = 0;
+ zfeature_info_t *feature = &spa_feature_table[fid];
+ uint64_t zapobj __maybe_unused =
+ (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ ASSERT(0 != zapobj);
+ ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+
+ VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP);
+
+ switch (action) {
+ case FEATURE_ACTION_INCR:
+ VERIFY3U(refcount, !=, UINT64_MAX);
+ refcount++;
+ break;
+ case FEATURE_ACTION_DECR:
+ VERIFY3U(refcount, !=, 0);
+ refcount--;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ feature_sync(spa, feature, refcount, tx);
+}
+
+void
+spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
+{
+ /*
+ * We create feature flags ZAP objects in two instances: during pool
+ * creation and during pool upgrade.
+ */
+ ASSERT((!spa->spa_sync_on && tx->tx_txg == TXG_INITIAL) ||
+ dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURES_FOR_READ, tx);
+ spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURES_FOR_WRITE, tx);
+ spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURE_DESCRIPTIONS, tx);
+}
+
+/*
+ * Enable any required dependencies, then enable the requested feature.
+ */
+void
+spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+ ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+ ASSERT(VALID_FEATURE_FID(fid));
+ feature_enable_sync(spa, &spa_feature_table[fid], tx);
+}
+
+void
+spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+ feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx);
+}
+
+void
+spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+ feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx);
+}
+
+boolean_t
+spa_feature_is_enabled(spa_t *spa, spa_feature_t fid)
+{
+ int err;
+ uint64_t refcount = 0;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ if (spa_version(spa) < SPA_VERSION_FEATURES)
+ return (B_FALSE);
+
+ err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
+ ASSERT(err == 0 || err == ENOTSUP);
+ return (err == 0);
+}
+
+boolean_t
+spa_feature_is_active(spa_t *spa, spa_feature_t fid)
+{
+ int err;
+ uint64_t refcount = 0;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ if (spa_version(spa) < SPA_VERSION_FEATURES)
+ return (B_FALSE);
+
+ err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
+ ASSERT(err == 0 || err == ENOTSUP);
+ return (err == 0 && refcount > 0);
+}
+
+/*
+ * For the feature specified by fid (which must depend on
+ * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the
+ * OUT txg argument.
+ *
+ * Returns B_TRUE if the feature is enabled, in which case txg will be filled
+ * with the transaction group in which the specified feature was enabled.
+ * Returns B_FALSE otherwise (i.e. if the feature is not enabled).
+ */
+boolean_t
+spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg)
+{
+ int err;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ if (spa_version(spa) < SPA_VERSION_FEATURES)
+ return (B_FALSE);
+
+ err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg);
+ ASSERT(err == 0 || err == ENOTSUP);
+
+ return (err == 0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zfs_byteswap.c b/sys/contrib/openzfs/module/zfs/zfs_byteswap.c
new file mode 100644
index 000000000000..cd35849c3f37
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_byteswap.c
@@ -0,0 +1,211 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_acl.h>
+
+#ifndef _KERNEL
+static
+#endif
+void
+zfs_oldace_byteswap(ace_t *ace, int ace_cnt)
+{
+ int i;
+
+ for (i = 0; i != ace_cnt; i++, ace++) {
+ ace->a_who = BSWAP_32(ace->a_who);
+ ace->a_access_mask = BSWAP_32(ace->a_access_mask);
+ ace->a_flags = BSWAP_16(ace->a_flags);
+ ace->a_type = BSWAP_16(ace->a_type);
+ }
+}
+
+/*
+ * swap ace_t and ace_object_t
+ */
+#ifndef _KERNEL
+static
+#endif
+void
+zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
+{
+ caddr_t end;
+ caddr_t ptr;
+ zfs_ace_t *zacep = NULL;
+ ace_t *acep;
+ uint16_t entry_type;
+ size_t entry_size;
+ int ace_type;
+
+ end = (caddr_t)buf + size;
+ ptr = buf;
+
+ while (ptr < end) {
+ if (zfs_layout) {
+ /*
+ * Avoid overrun. Embedded aces can have one
+ * of several sizes. We don't know exactly
+ * how many are present, only the size of the
+ * buffer containing them. That size may be
+ * larger than needed to hold the aces
+ * present. As long as we do not do any
+ * swapping beyond the end of our block we are
+ * okay. It is safe to swap any non-ace data
+ * within the block since it is just zeros.
+ */
+ if (ptr + sizeof (zfs_ace_hdr_t) > end) {
+ break;
+ }
+ zacep = (zfs_ace_t *)ptr;
+ zacep->z_hdr.z_access_mask =
+ BSWAP_32(zacep->z_hdr.z_access_mask);
+ zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags);
+ ace_type = zacep->z_hdr.z_type =
+ BSWAP_16(zacep->z_hdr.z_type);
+ entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ } else {
+ /* Overrun avoidance */
+ if (ptr + sizeof (ace_t) > end) {
+ break;
+ }
+ acep = (ace_t *)ptr;
+ acep->a_access_mask = BSWAP_32(acep->a_access_mask);
+ acep->a_flags = BSWAP_16(acep->a_flags);
+ ace_type = acep->a_type = BSWAP_16(acep->a_type);
+ acep->a_who = BSWAP_32(acep->a_who);
+ entry_type = acep->a_flags & ACE_TYPE_FLAGS;
+ }
+ switch (entry_type) {
+ case ACE_OWNER:
+ case ACE_EVERYONE:
+ case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+ entry_size = zfs_layout ?
+ sizeof (zfs_ace_hdr_t) : sizeof (ace_t);
+ break;
+ case ACE_IDENTIFIER_GROUP:
+ default:
+ /* Overrun avoidance */
+ if (zfs_layout) {
+ if (ptr + sizeof (zfs_ace_t) <= end) {
+ zacep->z_fuid = BSWAP_64(zacep->z_fuid);
+ } else {
+ entry_size = sizeof (zfs_ace_t);
+ break;
+ }
+ }
+ switch (ace_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ entry_size = zfs_layout ?
+ sizeof (zfs_object_ace_t) :
+ sizeof (ace_object_t);
+ break;
+ default:
+ entry_size = zfs_layout ? sizeof (zfs_ace_t) :
+ sizeof (ace_t);
+ break;
+ }
+ }
+ ptr = ptr + entry_size;
+ }
+}
+
+/* ARGSUSED */
+void
+zfs_oldacl_byteswap(void *buf, size_t size)
+{
+ int cnt;
+
+ /*
+ * Arggh, since we don't know how many ACEs are in
+ * the array, we have to swap the entire block
+ */
+
+ cnt = size / sizeof (ace_t);
+
+ zfs_oldace_byteswap((ace_t *)buf, cnt);
+}
+
+/* ARGSUSED */
+void
+zfs_acl_byteswap(void *buf, size_t size)
+{
+ zfs_ace_byteswap(buf, size, B_TRUE);
+}
+
+void
+zfs_znode_byteswap(void *buf, size_t size)
+{
+ znode_phys_t *zp = buf;
+
+ ASSERT(size >= sizeof (znode_phys_t));
+
+ zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]);
+ zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]);
+ zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]);
+ zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]);
+ zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]);
+ zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]);
+ zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]);
+ zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]);
+ zp->zp_gen = BSWAP_64(zp->zp_gen);
+ zp->zp_mode = BSWAP_64(zp->zp_mode);
+ zp->zp_size = BSWAP_64(zp->zp_size);
+ zp->zp_parent = BSWAP_64(zp->zp_parent);
+ zp->zp_links = BSWAP_64(zp->zp_links);
+ zp->zp_xattr = BSWAP_64(zp->zp_xattr);
+ zp->zp_rdev = BSWAP_64(zp->zp_rdev);
+ zp->zp_flags = BSWAP_64(zp->zp_flags);
+ zp->zp_uid = BSWAP_64(zp->zp_uid);
+ zp->zp_gid = BSWAP_64(zp->zp_gid);
+ zp->zp_zap = BSWAP_64(zp->zp_zap);
+ zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]);
+ zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]);
+ zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]);
+
+ zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
+ zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size);
+ zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
+ zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count);
+ if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) {
+ zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0],
+ ZFS_ACE_SPACE);
+ } else {
+ zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0],
+ ACE_SLOT_CNT);
+ }
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_oldacl_byteswap);
+EXPORT_SYMBOL(zfs_acl_byteswap);
+EXPORT_SYMBOL(zfs_znode_byteswap);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c
new file mode 100644
index 000000000000..ea71ef325c89
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c
@@ -0,0 +1,1416 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012,2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+#include <sys/fm/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/sysevent.h>
+
+/*
+ * This general routine is responsible for generating all the different ZFS
+ * ereports. The payload is dependent on the class, and which arguments are
+ * supplied to the function:
+ *
+ * EREPORT POOL VDEV IO
+ * block X X X
+ * data X X
+ * device X X
+ * pool X
+ *
+ * If we are in a loading state, all errors are chained together by the same
+ * SPA-wide ENA (Error Numeric Association).
+ *
+ * For isolated I/O requests, we get the ENA from the zio_t. The propagation
+ * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
+ * to chain together all ereports associated with a logical piece of data. For
+ * read I/Os, there are basically three 'types' of I/O, which form a roughly
+ * layered diagram:
+ *
+ * +---------------+
+ * | Aggregate I/O | No associated logical data or device
+ * +---------------+
+ * |
+ * V
+ * +---------------+ Reads associated with a piece of logical data.
+ * | Read I/O | This includes reads on behalf of RAID-Z,
+ * +---------------+ mirrors, gang blocks, retries, etc.
+ * |
+ * V
+ * +---------------+ Reads associated with a particular device, but
+ * | Physical I/O | no logical data. Issued as part of vdev caching
+ * +---------------+ and I/O aggregation.
+ *
+ * Note that 'physical I/O' here is not the same as the terminology used in the
+ * rest of ZIO. Typically, 'physical I/O' simply means that there is no attached
+ * block pointer. But I/O with no associated block pointer can still be related
+ * to a logical piece of data (i.e. RAID-Z requests).
+ *
+ * Purely physical I/Os always have unique ENAs. They are not related to a
+ * particular piece of logical data, and therefore cannot be chained together.
+ * We still generate an ereport, but the DE doesn't correlate it with any
+ * logical piece of data. When such an I/O fails, the delegated I/O requests
+ * will issue a retry, which will trigger the 'real' ereport with the correct
+ * ENA.
+ *
+ * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
+ * When a new logical I/O is issued, we set this to point to itself. Child I/Os
+ * then inherit this pointer, so that once it is first set, subsequent failures
+ * will use the same ENA. For vdev cache fill and queue aggregation I/O,
+ * this pointer is set to NULL, and no ereport will be generated (since it
+ * doesn't actually correspond to any particular device or piece of data,
+ * and the caller will always retry without caching or queueing anyway).
+ *
+ * For checksum errors, we want to include more information about the actual
+ * error which occurs. Accordingly, we build an ereport when the error is
+ * noticed, but instead of sending it in immediately, we hang it off of the
+ * io_cksum_report field of the logical IO. When the logical IO completes
+ * (successfully or not), zfs_ereport_finish_checksum() is called with the
+ * good and bad versions of the buffer (if available), and we annotate the
+ * ereport with information about the differences.
+ */
+
+#ifdef _KERNEL
+/*
+ * Duplicate ereport Detection
+ *
+ * Some ereports are retained momentarily for detecting duplicates. These
+ * are kept in a recent_events_node_t in both a time-ordered list and an AVL
+ * tree of recent unique ereports.
+ *
+ * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
+ * task is used to purge stale entries.
+ */
+static list_t recent_events_list;
+static avl_tree_t recent_events_tree;
+static kmutex_t recent_events_lock;
+static taskqid_t recent_events_cleaner_tqid;
+
+/*
+ * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
+ *
+ * This setting can be changed dynamically and setting it to zero
+ * disables duplicate detection.
+ */
+unsigned int zfs_zevent_retain_max = 2000;
+
+/*
+ * The lifespan for a recent ereport entry. The default of 15 minutes is
+ * intended to outlive the zfs diagnosis engine's threshold of 10 errors
+ * over a period of 10 minutes.
+ */
+unsigned int zfs_zevent_retain_expire_secs = 900;
+
+typedef enum zfs_subclass {
+ ZSC_IO,
+ ZSC_DATA,
+ ZSC_CHECKSUM
+} zfs_subclass_t;
+
+typedef struct {
+ /* common criteria */
+ uint64_t re_pool_guid;
+ uint64_t re_vdev_guid;
+ int re_io_error;
+ uint64_t re_io_size;
+ uint64_t re_io_offset;
+ zfs_subclass_t re_subclass;
+ zio_priority_t re_io_priority;
+
+ /* logical zio criteria (optional) */
+ zbookmark_phys_t re_io_bookmark;
+
+ /* internal state */
+ avl_node_t re_tree_link;
+ list_node_t re_list_link;
+ uint64_t re_timestamp;
+} recent_events_node_t;
+
+static int
+recent_events_compare(const void *a, const void *b)
+{
+ const recent_events_node_t *node1 = a;
+ const recent_events_node_t *node2 = b;
+ int cmp;
+
+ /*
+ * The comparison order here is somewhat arbitrary.
+ * What's important is that if every criterion matches, then it
+ * is a duplicate (i.e. compare returns 0).
+ */
+ if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0)
+ return (cmp);
+
+ const zbookmark_phys_t *zb1 = &node1->re_io_bookmark;
+ const zbookmark_phys_t *zb2 = &node2->re_io_bookmark;
+
+ if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0)
+ return (cmp);
+
+ return (0);
+}
+
+static void zfs_ereport_schedule_cleaner(void);
+
+/*
+ * background task to clean stale recent event nodes.
+ */
+/*ARGSUSED*/
+static void
+zfs_ereport_cleaner(void *arg)
+{
+ recent_events_node_t *entry;
+ uint64_t now = gethrtime();
+
+ /*
+ * purge expired entries
+ */
+ mutex_enter(&recent_events_lock);
+ while ((entry = list_tail(&recent_events_list)) != NULL) {
+ uint64_t age = NSEC2SEC(now - entry->re_timestamp);
+ if (age <= zfs_zevent_retain_expire_secs)
+ break;
+
+ /* remove expired node */
+ avl_remove(&recent_events_tree, entry);
+ list_remove(&recent_events_list, entry);
+ kmem_free(entry, sizeof (*entry));
+ }
+
+ /* Restart the cleaner if more entries remain */
+ recent_events_cleaner_tqid = 0;
+ if (!list_is_empty(&recent_events_list))
+ zfs_ereport_schedule_cleaner();
+
+ mutex_exit(&recent_events_lock);
+}
+
+static void
+zfs_ereport_schedule_cleaner(void)
+{
+ ASSERT(MUTEX_HELD(&recent_events_lock));
+
+ uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
+
+ recent_events_cleaner_tqid = taskq_dispatch_delay(
+ system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP,
+ ddi_get_lbolt() + NSEC_TO_TICK(timeout));
+}
+
+/*
+ * Check if an ereport would be a duplicate of one recently posted.
+ *
+ * An ereport is considered a duplicate if the set of criteria in
+ * recent_events_node_t all match.
+ *
+ * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
+ * are candidates for duplicate checking.
+ */
+static boolean_t
+zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
+ const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
+{
+ recent_events_node_t search = {0}, *entry;
+
+ if (vd == NULL || zio == NULL)
+ return (B_FALSE);
+
+ if (zfs_zevent_retain_max == 0)
+ return (B_FALSE);
+
+ if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
+ search.re_subclass = ZSC_IO;
+ else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
+ search.re_subclass = ZSC_DATA;
+ else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
+ search.re_subclass = ZSC_CHECKSUM;
+ else
+ return (B_FALSE);
+
+ search.re_pool_guid = spa_guid(spa);
+ search.re_vdev_guid = vd->vdev_guid;
+ search.re_io_error = zio->io_error;
+ search.re_io_priority = zio->io_priority;
+ /* if a size is supplied, use it over what's in the zio */
+ if (size) {
+ search.re_io_size = size;
+ search.re_io_offset = offset;
+ } else {
+ search.re_io_size = zio->io_size;
+ search.re_io_offset = zio->io_offset;
+ }
+
+ /* grab optional logical zio criteria */
+ if (zb != NULL) {
+ search.re_io_bookmark.zb_objset = zb->zb_objset;
+ search.re_io_bookmark.zb_object = zb->zb_object;
+ search.re_io_bookmark.zb_level = zb->zb_level;
+ search.re_io_bookmark.zb_blkid = zb->zb_blkid;
+ }
+
+ uint64_t now = gethrtime();
+
+ mutex_enter(&recent_events_lock);
+
+ /* check if we have seen this one recently */
+ entry = avl_find(&recent_events_tree, &search, NULL);
+ if (entry != NULL) {
+ uint64_t age = NSEC2SEC(now - entry->re_timestamp);
+
+ /*
+ * There is still an active cleaner (since we're here).
+ * Reset the last seen time for this duplicate entry
+ * so that its lifespan gets extended.
+ */
+ list_remove(&recent_events_list, entry);
+ list_insert_head(&recent_events_list, entry);
+ entry->re_timestamp = now;
+
+ zfs_zevent_track_duplicate();
+ mutex_exit(&recent_events_lock);
+
+ return (age <= zfs_zevent_retain_expire_secs);
+ }
+
+ if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
+ /* recycle oldest node */
+ entry = list_tail(&recent_events_list);
+ ASSERT(entry != NULL);
+ list_remove(&recent_events_list, entry);
+ avl_remove(&recent_events_tree, entry);
+ } else {
+ entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
+ }
+
+ /* record this as a recent ereport */
+ *entry = search;
+ avl_add(&recent_events_tree, entry);
+ list_insert_head(&recent_events_list, entry);
+ entry->re_timestamp = now;
+
+ /* Start a cleaner if not already scheduled */
+ if (recent_events_cleaner_tqid == 0)
+ zfs_ereport_schedule_cleaner();
+
+ mutex_exit(&recent_events_lock);
+ return (B_FALSE);
+}
+
+void
+zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
+{
+ if (nvl)
+ fm_nvlist_destroy(nvl, FM_NVA_FREE);
+
+ if (detector)
+ fm_nvlist_destroy(detector, FM_NVA_FREE);
+}
+
+/*
+ * We want to rate limit ZIO delay and checksum events so as to not
+ * flood ZED when a disk is acting up.
+ *
+ * Returns 1 if we're ratelimiting, 0 if not.
+ */
+static int
+zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
+{
+ int rc = 0;
+ /*
+ * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
+ * are. Invert it to get our return value.
+ */
+ if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
+ rc = !zfs_ratelimit(&vd->vdev_delay_rl);
+ } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
+ rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
+ }
+
+ if (rc) {
+ /* We're rate limiting */
+ fm_erpt_dropped_increment();
+ }
+
+ return (rc);
+}
+
+/*
+ * Return B_TRUE if the ereport was successfully constructed, B_FALSE if not.
+ */
+static boolean_t
+zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
+ const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
+ zio_t *zio, uint64_t stateoroffset, uint64_t size)
+{
+ nvlist_t *ereport, *detector;
+
+ uint64_t ena;
+ char class[64];
+
+ if ((ereport = fm_nvlist_create(NULL)) == NULL)
+ return (B_FALSE);
+
+ if ((detector = fm_nvlist_create(NULL)) == NULL) {
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ return (B_FALSE);
+ }
+
+ /*
+ * Serialize ereport generation
+ */
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Determine the ENA to use for this event. If we are in a loading
+ * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
+ * a root zio-wide ENA. Otherwise, simply use a unique ENA.
+ */
+ if (spa_load_state(spa) != SPA_LOAD_NONE) {
+ if (spa->spa_ena == 0)
+ spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
+ ena = spa->spa_ena;
+ } else if (zio != NULL && zio->io_logical != NULL) {
+ if (zio->io_logical->io_ena == 0)
+ zio->io_logical->io_ena =
+ fm_ena_generate(0, FM_ENA_FMT1);
+ ena = zio->io_logical->io_ena;
+ } else {
+ ena = fm_ena_generate(0, FM_ENA_FMT1);
+ }
+
+ /*
+ * Construct the full class, detector, and other standard FMA fields.
+ */
+ (void) snprintf(class, sizeof (class), "%s.%s",
+ ZFS_ERROR_CLASS, subclass);
+
+ fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
+ vd != NULL ? vd->vdev_guid : 0);
+
+ fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
+
+ /*
+ * Construct the per-ereport payload, depending on which parameters are
+ * passed in.
+ */
+
+ /*
+ * Generic payload members common to all ereports.
+ */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64,
+ (uint64_t)spa_state(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
+ (int32_t)spa_load_state(spa), NULL);
+
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
+ DATA_TYPE_STRING,
+ spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
+ FM_EREPORT_FAILMODE_WAIT :
+ spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
+ FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
+ NULL);
+
+ if (vd != NULL) {
+ vdev_t *pvd = vd->vdev_parent;
+ vdev_queue_t *vq = &vd->vdev_queue;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ vdev_t *spare_vd;
+ uint64_t *spare_guids;
+ char **spare_paths;
+ int i, spare_count;
+
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ DATA_TYPE_UINT64, vd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
+ if (vd->vdev_path != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
+ DATA_TYPE_STRING, vd->vdev_path, NULL);
+ if (vd->vdev_devid != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
+ DATA_TYPE_STRING, vd->vdev_devid, NULL);
+ if (vd->vdev_fru != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
+ DATA_TYPE_STRING, vd->vdev_fru, NULL);
+ if (vd->vdev_enc_sysfs_path != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
+ DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL);
+ if (vd->vdev_ashift)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
+ DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
+
+ if (vq != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
+ DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
+ DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
+ }
+
+ if (vs != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_read_errors,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_write_errors,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_checksum_errors,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
+ DATA_TYPE_UINT64, vs->vs_slow_ios,
+ NULL);
+ }
+
+ if (pvd != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
+ DATA_TYPE_UINT64, pvd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
+ DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
+ NULL);
+ if (pvd->vdev_path)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
+ DATA_TYPE_STRING, pvd->vdev_path, NULL);
+ if (pvd->vdev_devid)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
+ DATA_TYPE_STRING, pvd->vdev_devid, NULL);
+ }
+
+ spare_count = spa->spa_spares.sav_count;
+ spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
+ KM_SLEEP);
+ spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
+ KM_SLEEP);
+
+ for (i = 0; i < spare_count; i++) {
+ spare_vd = spa->spa_spares.sav_vdevs[i];
+ if (spare_vd) {
+ spare_paths[i] = spare_vd->vdev_path;
+ spare_guids[i] = spare_vd->vdev_guid;
+ }
+ }
+
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
+ DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
+ DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);
+
+ kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
+ kmem_free(spare_paths, sizeof (char *) * spare_count);
+ }
+
+ if (zio != NULL) {
+ /*
+ * Payload common to all I/Os.
+ */
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
+ DATA_TYPE_INT32, zio->io_error, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
+ DATA_TYPE_INT32, zio->io_flags, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
+ DATA_TYPE_UINT32, zio->io_stage, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
+ DATA_TYPE_UINT32, zio->io_pipeline, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
+ DATA_TYPE_UINT64, zio->io_delay, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
+ DATA_TYPE_UINT64, zio->io_timestamp, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
+ DATA_TYPE_UINT64, zio->io_delta, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
+ DATA_TYPE_UINT32, zio->io_priority, NULL);
+
+ /*
+ * If the 'size' parameter is non-zero, it indicates this is a
+ * RAID-Z or other I/O where the physical offset and length are
+ * provided for us, instead of within the zio_t.
+ */
+ if (vd != NULL) {
+ if (size)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, stateoroffset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, size, NULL);
+ else
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, zio->io_offset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, zio->io_size, NULL);
+ }
+ } else if (vd != NULL) {
+ /*
+ * If we have a vdev but no zio, this is a device fault, and the
+ * 'stateoroffset' parameter indicates the previous state of the
+ * vdev.
+ */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
+ DATA_TYPE_UINT64, stateoroffset, NULL);
+ }
+
+ /*
+ * Payload for I/Os with corresponding logical information.
+ */
+ if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
+ DATA_TYPE_UINT64, zb->zb_objset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
+ DATA_TYPE_UINT64, zb->zb_object,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
+ DATA_TYPE_INT64, zb->zb_level,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
+ DATA_TYPE_UINT64, zb->zb_blkid, NULL);
+ }
+
+ mutex_exit(&spa->spa_errlist_lock);
+
+ *ereport_out = ereport;
+ *detector_out = detector;
+ return (B_TRUE);
+}
+
+/* if it's <= 128 bytes, save the corruption directly */
+#define ZFM_MAX_INLINE (128 / sizeof (uint64_t))
+
+#define MAX_RANGES 16
+
+typedef struct zfs_ecksum_info {
+ /* histograms of set and cleared bits by bit number in a 64-bit word */
+ uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY];
+ uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
+
+ /* inline arrays of bits set and cleared. */
+ uint64_t zei_bits_set[ZFM_MAX_INLINE];
+ uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
+
+ /*
+ * for each range, the number of bits set and cleared. The Hamming
+ * distance between the good and bad buffers is the sum of them all.
+ */
+ uint32_t zei_range_sets[MAX_RANGES];
+ uint32_t zei_range_clears[MAX_RANGES];
+
+ struct zei_ranges {
+ uint32_t zr_start;
+ uint32_t zr_end;
+ } zei_ranges[MAX_RANGES];
+
+ size_t zei_range_count;
+ uint32_t zei_mingap;
+ uint32_t zei_allowed_mingap;
+
+} zfs_ecksum_info_t;
+
+static void
+update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
+{
+ size_t i;
+ size_t bits = 0;
+ uint64_t value = BE_64(value_arg);
+
+ /* We store the bits in big-endian (largest-first) order */
+ for (i = 0; i < 64; i++) {
+ if (value & (1ull << i)) {
+ hist[63 - i]++;
+ ++bits;
+ }
+ }
+ /* update the count of bits changed */
+ *count += bits;
+}
+
+/*
+ * We've now filled up the range array, and need to increase "mingap" and
+ * shrink the range list accordingly. zei_mingap is always the smallest
+ * distance between array entries, so we set the new_allowed_gap to be
+ * one greater than that. We then go through the list, joining together
+ * any ranges which are closer than the new_allowed_gap.
+ *
+ * By construction, at least one pair of ranges will be joined. We also update
+ * zei_mingap to the new smallest gap, to prepare for our next invocation.
+ */
+static void
+zei_shrink_ranges(zfs_ecksum_info_t *eip)
+{
+ uint32_t mingap = UINT32_MAX;
+ uint32_t new_allowed_gap = eip->zei_mingap + 1;
+
+ size_t idx, output;
+ size_t max = eip->zei_range_count;
+
+ struct zei_ranges *r = eip->zei_ranges;
+
+ ASSERT3U(eip->zei_range_count, >, 0);
+ ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
+
+ output = idx = 0;
+ while (idx < max - 1) {
+ uint32_t start = r[idx].zr_start;
+ uint32_t end = r[idx].zr_end;
+
+ while (idx < max - 1) {
+ idx++;
+
+ uint32_t nstart = r[idx].zr_start;
+ uint32_t nend = r[idx].zr_end;
+
+ uint32_t gap = nstart - end;
+ if (gap < new_allowed_gap) {
+ end = nend;
+ continue;
+ }
+ if (gap < mingap)
+ mingap = gap;
+ break;
+ }
+ r[output].zr_start = start;
+ r[output].zr_end = end;
+ output++;
+ }
+ ASSERT3U(output, <, eip->zei_range_count);
+ eip->zei_range_count = output;
+ eip->zei_mingap = mingap;
+ eip->zei_allowed_mingap = new_allowed_gap;
+}
+
+static void
+zei_add_range(zfs_ecksum_info_t *eip, int start, int end)
+{
+ struct zei_ranges *r = eip->zei_ranges;
+ size_t count = eip->zei_range_count;
+
+ if (count >= MAX_RANGES) {
+ zei_shrink_ranges(eip);
+ count = eip->zei_range_count;
+ }
+ if (count == 0) {
+ eip->zei_mingap = UINT32_MAX;
+ eip->zei_allowed_mingap = 1;
+ } else {
+ int gap = start - r[count - 1].zr_end;
+
+ if (gap < eip->zei_allowed_mingap) {
+ r[count - 1].zr_end = end;
+ return;
+ }
+ if (gap < eip->zei_mingap)
+ eip->zei_mingap = gap;
+ }
+ r[count].zr_start = start;
+ r[count].zr_end = end;
+ eip->zei_range_count++;
+}
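+
+/*
+ * A small worked example of the merging above (illustrative values only;
+ * units are uint64_t word indexes, as used by annotate_ecksum() below):
+ *
+ *     zei_add_range(eip, 0, 2);   ranges: [0,2)
+ *     zei_add_range(eip, 3, 5);   gap 1 >= allowed_mingap (1): [0,2) [3,5)
+ *     zei_add_range(eip, 5, 7);   gap 0 <  allowed_mingap (1): [0,2) [3,7)
+ *
+ * Once MAX_RANGES entries accumulate, zei_shrink_ranges() raises the allowed
+ * gap to zei_mingap + 1 and joins any ranges that are closer than that.
+ */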
+
+static size_t
+zei_range_total_size(zfs_ecksum_info_t *eip)
+{
+ struct zei_ranges *r = eip->zei_ranges;
+ size_t count = eip->zei_range_count;
+ size_t result = 0;
+ size_t idx;
+
+ for (idx = 0; idx < count; idx++)
+ result += (r[idx].zr_end - r[idx].zr_start);
+
+ return (result);
+}
+
+static zfs_ecksum_info_t *
+annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
+ const abd_t *goodabd, const abd_t *badabd, size_t size,
+ boolean_t drop_if_identical)
+{
+ const uint64_t *good;
+ const uint64_t *bad;
+
+ uint64_t allset = 0;
+ uint64_t allcleared = 0;
+
+ size_t nui64s = size / sizeof (uint64_t);
+
+ size_t inline_size;
+ int no_inline = 0;
+ size_t idx;
+ size_t range;
+
+ size_t offset = 0;
+ ssize_t start = -1;
+
+ zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
+
+ /* don't do any annotation for injected checksum errors */
+ if (info != NULL && info->zbc_injected)
+ return (eip);
+
+ if (info != NULL && info->zbc_has_cksum) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
+ DATA_TYPE_UINT64_ARRAY,
+ sizeof (info->zbc_expected) / sizeof (uint64_t),
+ (uint64_t *)&info->zbc_expected,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
+ DATA_TYPE_UINT64_ARRAY,
+ sizeof (info->zbc_actual) / sizeof (uint64_t),
+ (uint64_t *)&info->zbc_actual,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
+ DATA_TYPE_STRING,
+ info->zbc_checksum_name,
+ NULL);
+
+ if (info->zbc_byteswapped) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
+ DATA_TYPE_BOOLEAN, 1,
+ NULL);
+ }
+ }
+
+ if (badabd == NULL || goodabd == NULL)
+ return (eip);
+
+ ASSERT3U(nui64s, <=, UINT32_MAX);
+ ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, <=, UINT32_MAX);
+
+ good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size);
+ bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size);
+
+ /* build up the range list by comparing the two buffers. */
+ for (idx = 0; idx < nui64s; idx++) {
+ if (good[idx] == bad[idx]) {
+ if (start == -1)
+ continue;
+
+ zei_add_range(eip, start, idx);
+ start = -1;
+ } else {
+ if (start != -1)
+ continue;
+
+ start = idx;
+ }
+ }
+ if (start != -1)
+ zei_add_range(eip, start, idx);
+
+ /* See if it will fit in our inline buffers */
+ inline_size = zei_range_total_size(eip);
+ if (inline_size > ZFM_MAX_INLINE)
+ no_inline = 1;
+
+ /*
+ * If the two buffers are identical and the caller asked us to drop
+ * identical buffers, do so.
+ */
+ if (inline_size == 0 && drop_if_identical) {
+ kmem_free(eip, sizeof (*eip));
+ abd_return_buf((abd_t *)goodabd, (void *)good, size);
+ abd_return_buf((abd_t *)badabd, (void *)bad, size);
+ return (NULL);
+ }
+
+ /*
+ * Now walk through the ranges, filling in the details of the
+ * differences. Also convert our uint64_t-array offsets to byte
+ * offsets.
+ */
+ for (range = 0; range < eip->zei_range_count; range++) {
+ size_t start = eip->zei_ranges[range].zr_start;
+ size_t end = eip->zei_ranges[range].zr_end;
+
+ for (idx = start; idx < end; idx++) {
+ uint64_t set, cleared;
+
+ // bits set in bad, but not in good
+ set = ((~good[idx]) & bad[idx]);
+ // bits set in good, but not in bad
+ cleared = (good[idx] & (~bad[idx]));
+
+ allset |= set;
+ allcleared |= cleared;
+
+ if (!no_inline) {
+ ASSERT3U(offset, <, inline_size);
+ eip->zei_bits_set[offset] = set;
+ eip->zei_bits_cleared[offset] = cleared;
+ offset++;
+ }
+
+ update_histogram(set, eip->zei_histogram_set,
+ &eip->zei_range_sets[range]);
+ update_histogram(cleared, eip->zei_histogram_cleared,
+ &eip->zei_range_clears[range]);
+ }
+
+ /* convert to byte offsets */
+ eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
+ eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
+ }
+
+ abd_return_buf((abd_t *)goodabd, (void *)good, size);
+ abd_return_buf((abd_t *)badabd, (void *)bad, size);
+
+ eip->zei_allowed_mingap *= sizeof (uint64_t);
+ inline_size *= sizeof (uint64_t);
+
+ /* fill in ereport */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
+ DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
+ (uint32_t *)eip->zei_ranges,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
+ DATA_TYPE_UINT32, eip->zei_allowed_mingap,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
+ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
+ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
+ NULL);
+
+ if (!no_inline) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
+ DATA_TYPE_UINT8_ARRAY,
+ inline_size, (uint8_t *)eip->zei_bits_set,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
+ DATA_TYPE_UINT8_ARRAY,
+ inline_size, (uint8_t *)eip->zei_bits_cleared,
+ NULL);
+ } else {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
+ DATA_TYPE_UINT32_ARRAY,
+ NBBY * sizeof (uint64_t), eip->zei_histogram_set,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
+ DATA_TYPE_UINT32_ARRAY,
+ NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
+ NULL);
+ }
+ return (eip);
+}
+#endif
+
+/*
+ * Make sure our event is still valid for the given zio/vdev/pool. For example,
+ * we don't want to keep logging events for a faulted or missing vdev.
+ */
+boolean_t
+zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
+{
+#ifdef _KERNEL
+ /*
+ * If we are doing a spa_tryimport() or in recovery mode,
+ * ignore errors.
+ */
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER)
+ return (B_FALSE);
+
+ /*
+ * If we are in the middle of opening a pool, and the previous attempt
+ * failed, don't bother logging any new ereports - we're just going to
+ * get the same diagnosis anyway.
+ */
+ if (spa_load_state(spa) != SPA_LOAD_NONE &&
+ spa->spa_last_open_failed)
+ return (B_FALSE);
+
+ if (zio != NULL) {
+ /*
+ * If this is not a read or write zio, ignore the error. This
+ * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
+ */
+ if (zio->io_type != ZIO_TYPE_READ &&
+ zio->io_type != ZIO_TYPE_WRITE)
+ return (B_FALSE);
+
+ if (vd != NULL) {
+ /*
+ * If the vdev has already been marked as failing due
+ * to a failed probe, then ignore any subsequent I/O
+ * errors, as the DE will automatically fault the vdev
+ * on the first such failure. This also catches cases
+ * where vdev_remove_wanted is set and the device has
+ * not yet been asynchronously placed into the REMOVED
+ * state.
+ */
+ if (zio->io_vd == vd && !vdev_accessible(vd, zio))
+ return (B_FALSE);
+
+ /*
+ * Ignore checksum errors for reads from DTL regions of
+ * leaf vdevs.
+ */
+ if (zio->io_type == ZIO_TYPE_READ &&
+ zio->io_error == ECKSUM &&
+ vd->vdev_ops->vdev_op_leaf &&
+ vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
+ return (B_FALSE);
+ }
+ }
+
+ /*
+ * For probe failure, we want to avoid posting ereports if we've
+ * already removed the device in the meantime.
+ */
+ if (vd != NULL &&
+ strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
+ (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
+ return (B_FALSE);
+
+ /* Ignore bogus delay events (like from ioctls or unqueued IOs) */
+ if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
+ (zio != NULL) && (!zio->io_timestamp)) {
+ return (B_FALSE);
+ }
+#endif
+ return (B_TRUE);
+}
+
+/*
+ * Post an ereport for the given subclass
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
+int
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
+ const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
+{
+ int rc = 0;
+#ifdef _KERNEL
+ nvlist_t *ereport = NULL;
+ nvlist_t *detector = NULL;
+
+ if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
+ return (EINVAL);
+
+ if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
+ return (SET_ERROR(EALREADY));
+
+ if (zfs_is_ratelimiting_event(subclass, vd))
+ return (SET_ERROR(EBUSY));
+
+ if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
+ zb, zio, state, 0))
+ return (SET_ERROR(EINVAL)); /* couldn't post event */
+
+ if (ereport == NULL)
+ return (SET_ERROR(EINVAL));
+
+ /* Cleanup is handled by the callback function */
+ rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
+#endif
+ return (rc);
+}
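+
+/*
+ * A minimal caller-side sketch (hypothetical caller; the spa, vd and zio
+ * pointers are assumed to come from the surrounding I/O path):
+ *
+ *     int err = zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd,
+ *         &zio->io_bookmark, zio, 0);
+ *     if (err == EALREADY || err == EBUSY)
+ *         return;    (duplicate or rate-limited, nothing more to do)
+ *     if (err == EINVAL)
+ *         return;    (event was invalid or could not be posted)
+ *
+ * err == 0 means the event was queued for consumption by ZED.
+ */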
+
+/*
+ * Prepare a checksum ereport
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
+int
+zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
+ struct zio *zio, uint64_t offset, uint64_t length, void *arg,
+ zio_bad_cksum_t *info)
+{
+ zio_cksum_report_t *report;
+
+#ifdef _KERNEL
+ if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
+ offset, length))
+ return (SET_ERROR(EALREADY));
+
+ if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
+ return (SET_ERROR(EBUSY));
+#endif
+
+ report = kmem_zalloc(sizeof (*report), KM_SLEEP);
+
+ if (zio->io_vsd != NULL)
+ zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
+ else
+ zio_vsd_default_cksum_report(zio, report, arg);
+
+ /* copy the checksum failure information if it was provided */
+ if (info != NULL) {
+ report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
+ bcopy(info, report->zcr_ckinfo, sizeof (*info));
+ }
+
+ report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
+ report->zcr_align =
+ vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
+ report->zcr_length = length;
+
+#ifdef _KERNEL
+ (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
+ FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
+
+ if (report->zcr_ereport == NULL) {
+ zfs_ereport_free_checksum(report);
+ return (0);
+ }
+#endif
+
+ mutex_enter(&spa->spa_errlist_lock);
+ report->zcr_next = zio->io_logical->io_cksum_report;
+ zio->io_logical->io_cksum_report = report;
+ mutex_exit(&spa->spa_errlist_lock);
+ return (0);
+}
+
+void
+zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
+ const abd_t *bad_data, boolean_t drop_if_identical)
+{
+#ifdef _KERNEL
+ zfs_ecksum_info_t *info;
+
+ info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
+ good_data, bad_data, report->zcr_length, drop_if_identical);
+ if (info != NULL)
+ zfs_zevent_post(report->zcr_ereport,
+ report->zcr_detector, zfs_zevent_post_cb);
+ else
+ zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector);
+
+ report->zcr_ereport = report->zcr_detector = NULL;
+ if (info != NULL)
+ kmem_free(info, sizeof (*info));
+#endif
+}
+
+void
+zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
+{
+#ifdef _KERNEL
+ if (rpt->zcr_ereport != NULL) {
+ fm_nvlist_destroy(rpt->zcr_ereport,
+ FM_NVA_FREE);
+ fm_nvlist_destroy(rpt->zcr_detector,
+ FM_NVA_FREE);
+ }
+#endif
+ rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
+
+ if (rpt->zcr_ckinfo != NULL)
+ kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
+
+ kmem_free(rpt, sizeof (*rpt));
+}
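+
+/*
+ * Sketch of the deferred checksum-ereport flow described at the top of this
+ * file (hypothetical caller in a read completion path):
+ *
+ *     1. On a checksum error, zfs_ereport_start_checksum() builds the ereport
+ *        and hangs it off zio->io_logical->io_cksum_report.
+ *     2. When the logical zio completes, zfs_ereport_finish_checksum() is
+ *        called with the good/bad abd_t buffers, annotates the differences
+ *        via annotate_ecksum(), and posts the event.
+ *     3. If the report is never finished, zfs_ereport_free_checksum()
+ *        releases it without posting.
+ */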
+
+/*
+ * Post a checksum ereport
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
+int
+zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
+ struct zio *zio, uint64_t offset, uint64_t length,
+ const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
+{
+ int rc = 0;
+#ifdef _KERNEL
+ nvlist_t *ereport = NULL;
+ nvlist_t *detector = NULL;
+ zfs_ecksum_info_t *info;
+
+ if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
+ offset, length))
+ return (SET_ERROR(EALREADY));
+
+ if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
+ return (SET_ERROR(EBUSY));
+
+ if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
+ spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
+ B_FALSE);
+
+ if (info != NULL) {
+ rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
+ kmem_free(info, sizeof (*info));
+ }
+#endif
+ return (rc);
+}
+
+/*
+ * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of a
+ * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h
+ * and are designed to be consumed by the ZFS Event Daemon (ZED). For
+ * additional details refer to the zed(8) man page.
+ */
+nvlist_t *
+zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
+ nvlist_t *aux)
+{
+ nvlist_t *resource = NULL;
+#ifdef _KERNEL
+ char class[64];
+
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
+ return (NULL);
+
+ if ((resource = fm_nvlist_create(NULL)) == NULL)
+ return (NULL);
+
+ (void) snprintf(class, sizeof (class), "%s.%s.%s", type,
+ ZFS_ERROR_CLASS, name);
+ VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
+ VERIFY0(nvlist_add_string(resource, FM_CLASS, class));
+ VERIFY0(nvlist_add_string(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)));
+ VERIFY0(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
+ VERIFY0(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa)));
+ VERIFY0(nvlist_add_int32(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));
+
+ if (vd) {
+ VERIFY0(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
+ VERIFY0(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
+ if (vd->vdev_path != NULL)
+ VERIFY0(nvlist_add_string(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path));
+ if (vd->vdev_devid != NULL)
+ VERIFY0(nvlist_add_string(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid));
+ if (vd->vdev_fru != NULL)
+ VERIFY0(nvlist_add_string(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru));
+ if (vd->vdev_enc_sysfs_path != NULL)
+ VERIFY0(nvlist_add_string(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
+ vd->vdev_enc_sysfs_path));
+ }
+
+ /* also copy any optional payload data */
+ if (aux) {
+ nvpair_t *elem = NULL;
+
+ while ((elem = nvlist_next_nvpair(aux, elem)) != NULL)
+ (void) nvlist_add_nvpair(resource, elem);
+ }
+
+#endif
+ return (resource);
+}
+
+static void
+zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
+ nvlist_t *aux)
+{
+#ifdef _KERNEL
+ nvlist_t *resource;
+
+ resource = zfs_event_create(spa, vd, type, name, aux);
+ if (resource)
+ zfs_zevent_post(resource, NULL, zfs_zevent_post_cb);
+#endif
+}
+
+/*
+ * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
+ * has been removed from the system. This will cause the DE to ignore any
+ * recent I/O errors, inferring that they are due to the asynchronous device
+ * removal.
+ */
+void
+zfs_post_remove(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL);
+}
+
+/*
+ * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
+ * has the 'autoreplace' property set, and therefore any broken vdevs will be
+ * handled by higher level logic, and no vdev fault should be generated.
+ */
+void
+zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL);
+}
+
+/*
+ * The 'resource.fs.zfs.statechange' event is an internal signal that the
+ * given vdev has transitioned its state to DEGRADED or HEALTHY. This will
+ * cause the retire agent to repair any outstanding fault management cases
+ * open because the device was not found (fault.fs.zfs.device).
+ */
+void
+zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
+{
+#ifdef _KERNEL
+ nvlist_t *aux;
+
+ /*
+ * Add optional supplemental keys to payload
+ */
+ aux = fm_nvlist_create(NULL);
+ if (vd && aux) {
+ if (vd->vdev_physpath) {
+ (void) nvlist_add_string(aux,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
+ vd->vdev_physpath);
+ }
+ if (vd->vdev_enc_sysfs_path) {
+ (void) nvlist_add_string(aux,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
+ vd->vdev_enc_sysfs_path);
+ }
+
+ (void) nvlist_add_uint64(aux,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
+ }
+
+ zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE,
+ aux);
+
+ if (aux)
+ fm_nvlist_destroy(aux, FM_NVA_FREE);
+#endif
+}
+
+#ifdef _KERNEL
+void
+zfs_ereport_init(void)
+{
+ mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&recent_events_list, sizeof (recent_events_node_t),
+ offsetof(recent_events_node_t, re_list_link));
+ avl_create(&recent_events_tree, recent_events_compare,
+ sizeof (recent_events_node_t), offsetof(recent_events_node_t,
+ re_tree_link));
+}
+
+/*
+ * This 'early' fini needs to run before zfs_fini() which on Linux waits
+ * for the system_delay_taskq to drain.
+ */
+void
+zfs_ereport_taskq_fini(void)
+{
+ mutex_enter(&recent_events_lock);
+ if (recent_events_cleaner_tqid != 0) {
+ taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
+ recent_events_cleaner_tqid = 0;
+ }
+ mutex_exit(&recent_events_lock);
+}
+
+void
+zfs_ereport_fini(void)
+{
+ recent_events_node_t *entry;
+
+ while ((entry = list_head(&recent_events_list)) != NULL) {
+ avl_remove(&recent_events_tree, entry);
+ list_remove(&recent_events_list, entry);
+ kmem_free(entry, sizeof (*entry));
+ }
+ avl_destroy(&recent_events_tree);
+ list_destroy(&recent_events_list);
+ mutex_destroy(&recent_events_lock);
+}
+
+EXPORT_SYMBOL(zfs_ereport_post);
+EXPORT_SYMBOL(zfs_ereport_is_valid);
+EXPORT_SYMBOL(zfs_ereport_post_checksum);
+EXPORT_SYMBOL(zfs_post_remove);
+EXPORT_SYMBOL(zfs_post_autoreplace);
+EXPORT_SYMBOL(zfs_post_state_change);
+
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
+ "Maximum recent zevents records to retain for duplicate checking");
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
+ "Expiration time for recent zevents records");
+#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
new file mode 100644
index 000000000000..015dde4811e4
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
@@ -0,0 +1,815 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/avl.h>
+#include <sys/zap.h>
+#include <sys/nvpair.h>
+#ifdef _KERNEL
+#include <sys/sid.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#endif
+#include <sys/zfs_fuid.h>
+
+/*
+ * FUID Domain table(s).
+ *
+ * The FUID table is stored as a packed nvlist of an array
+ * of nvlists which contain an index, domain string and offset
+ *
+ * During file system initialization the nvlist(s) are read and
+ * two AVL trees are created. One tree is keyed by the index number
+ * and the other by the domain string. Nodes are never removed from
+ * trees, but new entries may be added. If a new entry is added then
+ * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then
+ * be responsible for calling zfs_fuid_sync() to sync the changes to disk.
+ *
+ */
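+
+/*
+ * Rough shape of the packed nvlist (illustrative only; see zfs_fuid_sync()
+ * below for the authoritative construction):
+ *
+ *     fuid_nvlist = [
+ *         { fuid_idx = 1, fuid_offset = 0, fuid_domain = "S-1-5-21-..." },
+ *         { fuid_idx = 2, fuid_offset = 0, fuid_domain = "..." },
+ *         ...
+ *     ]
+ */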
+
+#define FUID_IDX "fuid_idx"
+#define FUID_DOMAIN "fuid_domain"
+#define FUID_OFFSET "fuid_offset"
+#define FUID_NVP_ARRAY "fuid_nvlist"
+
+typedef struct fuid_domain {
+ avl_node_t f_domnode;
+ avl_node_t f_idxnode;
+ ksiddomain_t *f_ksid;
+ uint64_t f_idx;
+} fuid_domain_t;
+
+static char *nulldomain = "";
+
+/*
+ * Compare two indexes.
+ */
+static int
+idx_compare(const void *arg1, const void *arg2)
+{
+ const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
+ const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
+
+ return (TREE_CMP(node1->f_idx, node2->f_idx));
+}
+
+/*
+ * Compare two domain strings.
+ */
+static int
+domain_compare(const void *arg1, const void *arg2)
+{
+ const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
+ const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
+ int val;
+
+ val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name);
+
+ return (TREE_ISIGN(val));
+}
+
+void
+zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
+{
+ avl_create(idx_tree, idx_compare,
+ sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
+ avl_create(domain_tree, domain_compare,
+ sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
+}
+
+/*
+ * Load the initial FUID domain and index trees. This function is used by
+ * both the kernel and zdb.
+ */
+uint64_t
+zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
+ avl_tree_t *domain_tree)
+{
+ dmu_buf_t *db;
+ uint64_t fuid_size;
+
+ ASSERT(fuid_obj != 0);
+ VERIFY(0 == dmu_bonus_hold(os, fuid_obj,
+ FTAG, &db));
+ fuid_size = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db, FTAG);
+
+ if (fuid_size) {
+ nvlist_t **fuidnvp;
+ nvlist_t *nvp = NULL;
+ uint_t count;
+ char *packed;
+ int i;
+
+ packed = kmem_alloc(fuid_size, KM_SLEEP);
+ VERIFY(dmu_read(os, fuid_obj, 0,
+ fuid_size, packed, DMU_READ_PREFETCH) == 0);
+ VERIFY(nvlist_unpack(packed, fuid_size,
+ &nvp, 0) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY,
+ &fuidnvp, &count) == 0);
+
+ for (i = 0; i != count; i++) {
+ fuid_domain_t *domnode;
+ char *domain;
+ uint64_t idx;
+
+ VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN,
+ &domain) == 0);
+ VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX,
+ &idx) == 0);
+
+ domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
+
+ domnode->f_idx = idx;
+ domnode->f_ksid = ksid_lookupdomain(domain);
+ avl_add(idx_tree, domnode);
+ avl_add(domain_tree, domnode);
+ }
+ nvlist_free(nvp);
+ kmem_free(packed, fuid_size);
+ }
+ return (fuid_size);
+}
+
+void
+zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
+{
+ fuid_domain_t *domnode;
+ void *cookie;
+
+ cookie = NULL;
+ while ((domnode = avl_destroy_nodes(domain_tree, &cookie)))
+ ksiddomain_rele(domnode->f_ksid);
+
+ avl_destroy(domain_tree);
+ cookie = NULL;
+ while ((domnode = avl_destroy_nodes(idx_tree, &cookie)))
+ kmem_free(domnode, sizeof (fuid_domain_t));
+ avl_destroy(idx_tree);
+}
+
+char *
+zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
+{
+ fuid_domain_t searchnode, *findnode;
+ avl_index_t loc;
+
+ searchnode.f_idx = idx;
+
+ findnode = avl_find(idx_tree, &searchnode, &loc);
+
+ return (findnode ? findnode->f_ksid->kd_name : nulldomain);
+}
+
+#ifdef _KERNEL
+/*
+ * Load the fuid table(s) into memory.
+ */
+static void
+zfs_fuid_init(zfsvfs_t *zfsvfs)
+{
+ rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+
+ if (zfsvfs->z_fuid_loaded) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return;
+ }
+
+ zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
+
+ (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
+ if (zfsvfs->z_fuid_obj != 0) {
+ zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
+ zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx,
+ &zfsvfs->z_fuid_domain);
+ }
+
+ zfsvfs->z_fuid_loaded = B_TRUE;
+ rw_exit(&zfsvfs->z_fuid_lock);
+}
+
+/*
+ * sync out AVL trees to persistent storage.
+ */
+void
+zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ nvlist_t *nvp;
+ nvlist_t **fuids;
+ size_t nvsize = 0;
+ char *packed;
+ dmu_buf_t *db;
+ fuid_domain_t *domnode;
+ int numnodes;
+ int i;
+
+ if (!zfsvfs->z_fuid_dirty) {
+ return;
+ }
+
+ rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+
+ /*
+ * First, see if the table needs to be created.
+ */
+ if (zfsvfs->z_fuid_obj == 0) {
+ zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
+ sizeof (uint64_t), tx);
+ VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_FUID_TABLES, sizeof (uint64_t), 1,
+ &zfsvfs->z_fuid_obj, tx) == 0);
+ }
+
+ VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ numnodes = avl_numnodes(&zfsvfs->z_fuid_idx);
+ fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP);
+ for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++,
+ domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) {
+ VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
+ domnode->f_idx) == 0);
+ VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0);
+ VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN,
+ domnode->f_ksid->kd_name) == 0);
+ }
+ VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
+ fuids, numnodes) == 0);
+ for (i = 0; i != numnodes; i++)
+ nvlist_free(fuids[i]);
+ kmem_free(fuids, numnodes * sizeof (void *));
+ VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+ VERIFY(nvlist_pack(nvp, &packed, &nvsize,
+ NV_ENCODE_XDR, KM_SLEEP) == 0);
+ nvlist_free(nvp);
+ zfsvfs->z_fuid_size = nvsize;
+ dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
+ zfsvfs->z_fuid_size, packed, tx);
+ kmem_free(packed, zfsvfs->z_fuid_size);
+ VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
+ FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
+ dmu_buf_rele(db, FTAG);
+
+ zfsvfs->z_fuid_dirty = B_FALSE;
+ rw_exit(&zfsvfs->z_fuid_lock);
+}
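+
+/*
+ * On-disk summary of the sync above: the FUID object's data blocks hold the
+ * XDR-packed nvlist, and its bonus buffer holds a single uint64_t with the
+ * packed size, which zfs_fuid_table_load() reads back when the table is
+ * (re)loaded.
+ */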
+
+/*
+ * Query domain table for a given domain.
+ *
+ * If the domain isn't found and addok is set, it is added to the AVL trees
+ * and the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be
+ * necessary for the caller or another thread to detect the dirty table
+ * and sync out the changes.
+ */
+int
+zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain,
+ char **retdomain, boolean_t addok)
+{
+ fuid_domain_t searchnode, *findnode;
+ avl_index_t loc;
+ krw_t rw = RW_READER;
+
+ /*
+ * If this is the dummy "nobody" domain, then return an index of 0
+ * to cause the created FUID to be a standard POSIX id
+ * for the user nobody.
+ */
+ if (domain[0] == '\0') {
+ if (retdomain)
+ *retdomain = nulldomain;
+ return (0);
+ }
+
+ searchnode.f_ksid = ksid_lookupdomain(domain);
+ if (retdomain)
+ *retdomain = searchnode.f_ksid->kd_name;
+ if (!zfsvfs->z_fuid_loaded)
+ zfs_fuid_init(zfsvfs);
+
+retry:
+ rw_enter(&zfsvfs->z_fuid_lock, rw);
+ findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
+
+ if (findnode) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ ksiddomain_rele(searchnode.f_ksid);
+ return (findnode->f_idx);
+ } else if (addok) {
+ fuid_domain_t *domnode;
+ uint64_t retidx;
+
+ if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ rw = RW_WRITER;
+ goto retry;
+ }
+
+ domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
+ domnode->f_ksid = searchnode.f_ksid;
+
+ retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1;
+
+ avl_add(&zfsvfs->z_fuid_domain, domnode);
+ avl_add(&zfsvfs->z_fuid_idx, domnode);
+ zfsvfs->z_fuid_dirty = B_TRUE;
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return (retidx);
+ } else {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return (-1);
+ }
+}
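+
+/*
+ * Illustrative use of the lookup above (hypothetical caller):
+ *
+ *     char *kdomain;
+ *     int idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
+ *
+ * idx == 0 means the empty "nobody" domain, idx > 0 is a table index
+ * (starting at 1), and -1 is returned only when addok is B_FALSE and the
+ * domain is not present.
+ */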
+
+/*
+ * Query the domain table by index, returning the domain string.
+ *
+ * Returns a pointer to the domain string held in the AVL node.
+ *
+ */
+const char *
+zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
+{
+ char *domain;
+
+ if (idx == 0 || !zfsvfs->z_use_fuids)
+ return (NULL);
+
+ if (!zfsvfs->z_fuid_loaded)
+ zfs_fuid_init(zfsvfs);
+
+ rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
+
+ if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty)
+ domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
+ else
+ domain = nulldomain;
+ rw_exit(&zfsvfs->z_fuid_lock);
+
+ ASSERT(domain);
+ return (domain);
+}
+
+void
+zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
+{
+ *uidp = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOUID(zp)),
+ cr, ZFS_OWNER);
+ *gidp = zfs_fuid_map_id(ZTOZSB(zp), KGID_TO_SGID(ZTOGID(zp)),
+ cr, ZFS_GROUP);
+}
+
+#ifdef __FreeBSD__
+uid_t
+zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
+ cred_t *cr, zfs_fuid_type_t type)
+{
+ uint32_t index = FUID_INDEX(fuid);
+
+ if (index == 0)
+ return (fuid);
+
+ return (UID_NOBODY);
+}
+#elif defined(__linux__)
+uid_t
+zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
+ cred_t *cr, zfs_fuid_type_t type)
+{
+ /*
+ * The Linux port only supports POSIX IDs, use the passed id.
+ */
+ return (fuid);
+}
+
+#else
+uid_t
+zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
+ cred_t *cr, zfs_fuid_type_t type)
+{
+ uint32_t index = FUID_INDEX(fuid);
+ const char *domain;
+ uid_t id;
+
+ if (index == 0)
+ return (fuid);
+
+ domain = zfs_fuid_find_by_idx(zfsvfs, index);
+ ASSERT(domain != NULL);
+
+ if (type == ZFS_OWNER || type == ZFS_ACE_USER) {
+ (void) kidmap_getuidbysid(crgetzone(cr), domain,
+ FUID_RID(fuid), &id);
+ } else {
+ (void) kidmap_getgidbysid(crgetzone(cr), domain,
+ FUID_RID(fuid), &id);
+ }
+ return (id);
+}
+#endif
+
+/*
+ * Add a FUID node to the list of FUIDs being created for this
+ * ACL.
+ *
+ * If the ACL has multiple domains, then keep only one copy of each unique
+ * domain.
+ */
+void
+zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
+ uint64_t idx, uint64_t id, zfs_fuid_type_t type)
+{
+ zfs_fuid_t *fuid;
+ zfs_fuid_domain_t *fuid_domain;
+ zfs_fuid_info_t *fuidp;
+ uint64_t fuididx;
+ boolean_t found = B_FALSE;
+
+ if (*fuidpp == NULL)
+ *fuidpp = zfs_fuid_info_alloc();
+
+ fuidp = *fuidpp;
+ /*
+ * First, find the FUID domain index in the linked list.
+ *
+ * If one isn't found, then create an entry.
+ */
+
+ for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains);
+ fuid_domain; fuid_domain = list_next(&fuidp->z_domains,
+ fuid_domain), fuididx++) {
+ if (idx == fuid_domain->z_domidx) {
+ found = B_TRUE;
+ break;
+ }
+ }
+
+ if (!found) {
+ fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP);
+ fuid_domain->z_domain = domain;
+ fuid_domain->z_domidx = idx;
+ list_insert_tail(&fuidp->z_domains, fuid_domain);
+ fuidp->z_domain_str_sz += strlen(domain) + 1;
+ fuidp->z_domain_cnt++;
+ }
+
+ if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) {
+
+ /*
+ * Now allocate fuid entry and add it on the end of the list
+ */
+
+ fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
+ fuid->z_id = id;
+ fuid->z_domidx = idx;
+ fuid->z_logfuid = FUID_ENCODE(fuididx, rid);
+
+ list_insert_tail(&fuidp->z_fuids, fuid);
+ fuidp->z_fuid_cnt++;
+ } else {
+ if (type == ZFS_OWNER)
+ fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid);
+ else
+ fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid);
+ }
+}
+
+#ifdef HAVE_KSID
+/*
+ * Create a file system FUID, based on information in the users cred
+ *
+ * If cred contains KSID_OWNER then it should be used to determine
+ * the uid otherwise cred's uid will be used. By default cred's gid
+ * is used unless it's an ephemeral ID in which case KSID_GROUP will
+ * be used if it exists.
+ */
+uint64_t
+zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
+ cred_t *cr, zfs_fuid_info_t **fuidp)
+{
+ uint64_t idx;
+ ksid_t *ksid;
+ uint32_t rid;
+ char *kdomain;
+ const char *domain;
+ uid_t id;
+
+ VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
+
+ ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
+
+ if (!zfsvfs->z_use_fuids || (ksid == NULL)) {
+ id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
+
+ if (IS_EPHEMERAL(id))
+ return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY);
+
+ return ((uint64_t)id);
+ }
+
+ /*
+ * ksid is present and FUID is supported
+ */
+ id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr);
+
+ if (!IS_EPHEMERAL(id))
+ return ((uint64_t)id);
+
+ if (type == ZFS_GROUP)
+ id = ksid_getid(ksid);
+
+ rid = ksid_getrid(ksid);
+ domain = ksid_getdomain(ksid);
+
+ idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
+
+ zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
+
+ return (FUID_ENCODE(idx, rid));
+}
+#endif /* HAVE_KSID */
+
+/*
+ * Create a file system FUID for an ACL ace
+ * or a chown/chgrp of the file.
+ * This is similar to zfs_fuid_create_cred, except that
+ * we can't find the domain + rid information in the
+ * cred. Instead we have to query Winchester for the
+ * domain and rid.
+ *
+ * During replay operations the domain+rid information is
+ * found in the zfs_fuid_info_t that the replay code has
+ * attached to the zfsvfs of the file system.
+ */
+uint64_t
+zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
+ zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp)
+{
+#ifdef HAVE_KSID
+ const char *domain;
+ char *kdomain;
+ uint32_t fuid_idx = FUID_INDEX(id);
+ uint32_t rid = 0;
+ idmap_stat status;
+ uint64_t idx = UID_NOBODY;
+ zfs_fuid_t *zfuid = NULL;
+ zfs_fuid_info_t *fuidp = NULL;
+
+ /*
+ * If POSIX ID, or entry is already a FUID then
+ * just return the id
+ *
+ * We may also be handed an already FUID'ized id via
+ * chmod.
+ */
+
+ if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
+ return (id);
+
+ if (zfsvfs->z_replay) {
+ fuidp = zfsvfs->z_fuid_replay;
+
+ /*
+ * If we are passed an ephemeral id, but no
+ * fuid_info was logged, then return NOBODY.
+ * This is most likely a result of the idmap service
+ * not being available.
+ */
+ if (fuidp == NULL)
+ return (UID_NOBODY);
+
+ VERIFY3U(type, >=, ZFS_OWNER);
+ VERIFY3U(type, <=, ZFS_ACE_GROUP);
+
+ switch (type) {
+ case ZFS_ACE_USER:
+ case ZFS_ACE_GROUP:
+ zfuid = list_head(&fuidp->z_fuids);
+ rid = FUID_RID(zfuid->z_logfuid);
+ idx = FUID_INDEX(zfuid->z_logfuid);
+ break;
+ case ZFS_OWNER:
+ rid = FUID_RID(fuidp->z_fuid_owner);
+ idx = FUID_INDEX(fuidp->z_fuid_owner);
+ break;
+ case ZFS_GROUP:
+ rid = FUID_RID(fuidp->z_fuid_group);
+ idx = FUID_INDEX(fuidp->z_fuid_group);
+ break;
+ };
+ domain = fuidp->z_domain_table[idx - 1];
+ } else {
+ if (type == ZFS_OWNER || type == ZFS_ACE_USER)
+ status = kidmap_getsidbyuid(crgetzone(cr), id,
+ &domain, &rid);
+ else
+ status = kidmap_getsidbygid(crgetzone(cr), id,
+ &domain, &rid);
+
+ if (status != 0) {
+ /*
+ * When returning nobody we will need to
+ * make a dummy fuid table entry for logging
+ * purposes.
+ */
+ rid = UID_NOBODY;
+ domain = nulldomain;
+ }
+ }
+
+ idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
+
+ if (!zfsvfs->z_replay)
+ zfs_fuid_node_add(fuidpp, kdomain,
+ rid, idx, id, type);
+ else if (zfuid != NULL) {
+ list_remove(&fuidp->z_fuids, zfuid);
+ kmem_free(zfuid, sizeof (zfs_fuid_t));
+ }
+ return (FUID_ENCODE(idx, rid));
+#else
+ /*
+ * The Linux port only supports POSIX IDs, use the passed id.
+ */
+ return (id);
+#endif
+}
+
+void
+zfs_fuid_destroy(zfsvfs_t *zfsvfs)
+{
+ rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+ if (!zfsvfs->z_fuid_loaded) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return;
+ }
+ zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
+ rw_exit(&zfsvfs->z_fuid_lock);
+}
+
+/*
+ * Allocate zfs_fuid_info for tracking FUIDs created during
+ * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR()
+ */
+zfs_fuid_info_t *
+zfs_fuid_info_alloc(void)
+{
+ zfs_fuid_info_t *fuidp;
+
+ fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP);
+ list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t),
+ offsetof(zfs_fuid_domain_t, z_next));
+ list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t),
+ offsetof(zfs_fuid_t, z_next));
+ return (fuidp);
+}
+
+/*
+ * Release all memory associated with zfs_fuid_info_t
+ */
+void
+zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
+{
+ zfs_fuid_t *zfuid;
+ zfs_fuid_domain_t *zdomain;
+
+ while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) {
+ list_remove(&fuidp->z_fuids, zfuid);
+ kmem_free(zfuid, sizeof (zfs_fuid_t));
+ }
+
+ if (fuidp->z_domain_table != NULL)
+ kmem_free(fuidp->z_domain_table,
+ (sizeof (char *)) * fuidp->z_domain_cnt);
+
+ while ((zdomain = list_head(&fuidp->z_domains)) != NULL) {
+ list_remove(&fuidp->z_domains, zdomain);
+ kmem_free(zdomain, sizeof (zfs_fuid_domain_t));
+ }
+
+ kmem_free(fuidp, sizeof (zfs_fuid_info_t));
+}
+
+/*
+ * Check to see if id is a groupmember. If the cred
+ * has ksid info, then the sidlist is checked first; if the id is
+ * still not found there, the POSIX groups are checked.
+ *
+ * Will use a straight FUID compare when possible.
+ */
+boolean_t
+zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
+{
+#ifdef HAVE_KSID
+ uid_t gid;
+
+#ifdef illumos
+ ksid_t *ksid = crgetsid(cr, KSID_GROUP);
+ ksidlist_t *ksidlist = crgetsidlist(cr);
+
+ if (ksid && ksidlist) {
+ int i;
+ ksid_t *ksid_groups;
+ uint32_t idx = FUID_INDEX(id);
+ uint32_t rid = FUID_RID(id);
+
+ ksid_groups = ksidlist->ksl_sids;
+
+ for (i = 0; i != ksidlist->ksl_nsid; i++) {
+ if (idx == 0) {
+ if (id != IDMAP_WK_CREATOR_GROUP_GID &&
+ id == ksid_groups[i].ks_id) {
+ return (B_TRUE);
+ }
+ } else {
+ const char *domain;
+
+ domain = zfs_fuid_find_by_idx(zfsvfs, idx);
+ ASSERT(domain != NULL);
+
+ if (strcmp(domain,
+ IDMAP_WK_CREATOR_SID_AUTHORITY) == 0)
+ return (B_FALSE);
+
+ if ((strcmp(domain,
+ ksid_groups[i].ks_domain->kd_name) == 0) &&
+ rid == ksid_groups[i].ks_rid)
+ return (B_TRUE);
+ }
+ }
+ }
+#endif /* illumos */
+
+ /*
+ * Not found in ksidlist, check posix groups
+ */
+ gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
+ return (groupmember(gid, cr));
+#else
+ return (B_TRUE);
+#endif
+}
+
+void
+zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ if (zfsvfs->z_fuid_obj == 0) {
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
+ } else {
+ dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+ dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ }
+}
+
+/*
+ * buf must be big enough (e.g., 32 bytes)
+ */
+int
+zfs_id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
+ char *buf, size_t len, boolean_t addok)
+{
+ uint64_t fuid;
+ int domainid = 0;
+
+ if (domain && domain[0]) {
+ domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
+ if (domainid == -1)
+ return (SET_ERROR(ENOENT));
+ }
+ fuid = FUID_ENCODE(domainid, rid);
+ (void) snprintf(buf, len, "%llx", (longlong_t)fuid);
+ return (0);
+}
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
new file mode 100644
index 000000000000..0e35fd069cbb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -0,0 +1,7688 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 Martin Matuska
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Portions Copyright 2012 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright 2017 RackTop Systems.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+/*
+ * ZFS ioctls.
+ *
+ * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage
+ * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool.
+ *
+ * There are two ways that we handle ioctls: the legacy way where almost
+ * all of the logic is in the ioctl callback, and the new way where most
+ * of the marshalling is handled in the common entry point, zfsdev_ioctl().
+ *
+ * Non-legacy ioctls should be registered by calling
+ * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked
+ * from userland by lzc_ioctl().
+ *
+ * The registration arguments are as follows:
+ *
+ * const char *name
+ * The name of the ioctl. This is used for history logging. If the
+ * ioctl returns successfully (the callback returns 0), and allow_log
+ * is true, then a history log entry will be recorded with the input &
+ * output nvlists. The log entry can be printed with "zpool history -i".
+ *
+ * zfs_ioc_t ioc
+ * The ioctl request number, which userland will pass to ioctl(2).
+ * We want newer versions of libzfs and libzfs_core to run against
+ * existing zfs kernel modules (i.e. a deferred reboot after an update).
+ * Therefore the ioctl numbers cannot change from release to release.
+ *
+ * zfs_secpolicy_func_t *secpolicy
+ * This function will be called before the zfs_ioc_func_t, to
+ * determine if this operation is permitted. It should return EPERM
+ * on failure, and 0 on success. Checks include determining if the
+ * dataset is visible in this zone, and if the user has either all
+ * zfs privileges in the zone (SYS_MOUNT), or has been granted permission
+ * to do this operation on this dataset with "zfs allow".
+ *
+ * zfs_ioc_namecheck_t namecheck
+ * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool
+ * name, a dataset name, or nothing. If the name is not well-formed,
+ * the ioctl will fail and the callback will not be called.
+ * Therefore, the callback can assume that the name is well-formed
+ * (e.g. is null-terminated, doesn't have more than one '@' character,
+ * doesn't have invalid characters).
+ *
+ * zfs_ioc_poolcheck_t pool_check
+ * This specifies requirements on the pool state. If the pool does
+ * not meet them (is suspended or is readonly), the ioctl will fail
+ * and the callback will not be called. If any checks are specified
+ * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME.
+ * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED |
+ * POOL_CHECK_READONLY).
+ *
+ * zfs_ioc_key_t *nvl_keys
+ * The list of expected/allowable innvl input keys. This list is used
+ * to validate the nvlist input to the ioctl.
+ *
+ * boolean_t smush_outnvlist
+ * If smush_outnvlist is true, then the output is presumed to be a
+ * list of errors, and it will be "smushed" down to fit into the
+ * caller's buffer, by removing some entries and replacing them with a
+ * single "N_MORE_ERRORS" entry indicating how many were removed. See
+ * nvlist_smush() for details. If smush_outnvlist is false, and the
+ * outnvlist does not fit into the userland-provided buffer, then the
+ * ioctl will fail with ENOMEM.
+ *
+ * zfs_ioc_func_t *func
+ * The callback function that will perform the operation.
+ *
+ * The callback should return 0 on success, or an error number on
+ * failure. If the function fails, the userland ioctl will return -1,
+ * and errno will be set to the callback's return value. The callback
+ * will be called with the following arguments:
+ *
+ * const char *name
+ * The name of the pool or dataset to operate on, from
+ * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the
+ * expected type (pool, dataset, or none).
+ *
+ * nvlist_t *innvl
+ * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or
+ * NULL if no input nvlist was provided. Changes to this nvlist are
+ * ignored. If the input nvlist could not be deserialized, the
+ * ioctl will fail and the callback will not be called.
+ *
+ * nvlist_t *outnvl
+ * The output nvlist, initially empty. The callback can fill it in,
+ * and it will be returned to userland by serializing it into
+ * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization
+ * fails (e.g. because the caller didn't supply a large enough
+ * buffer), then the overall ioctl will fail. See the
+ * 'smush_outnvlist' argument above for additional behaviors.
+ *
+ * There are two typical uses of the output nvlist:
+ * - To return state, e.g. property values. In this case,
+ * smush_outnvlist should be false. If the buffer was not large
+ * enough, the caller will reallocate a larger buffer and try
+ * the ioctl again.
+ *
+ * - To return multiple errors from an ioctl which makes on-disk
+ * changes. In this case, smush_outnvlist should be true.
+ * Ioctls which make on-disk modifications should generally not
+ * use the outnvl if they succeed, because the caller cannot
+ * distinguish between the operation failing and
+ * deserialization failing.
+ *
+ * IOCTL Interface Errors
+ *
+ * The following ioctl input errors can be returned:
+ * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel
+ * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel
+ * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing
+ * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type
+ */
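+
+/*
+ * Editor's illustration, not part of the upstream source: a sketch of how a
+ * non-legacy ioctl is wired up using the arguments described above. The
+ * exact parameter order shown here is an assumption and is fixed by the
+ * zfs_ioctl_register() definition further down in this file; the snapshot
+ * ioctl is used purely as an example.
+ *
+ *	zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
+ *	    zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
+ *	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY,
+ *	    B_TRUE, B_TRUE, zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot));
+ *
+ * i.e. name, request number, callback, secpolicy, namecheck, pool checks,
+ * smush_outnvlist, allow_log, and the expected innvl key list with its
+ * length.
+ */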
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_redact.h>
+#include <sys/dmu_tx.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/pathname.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/fm/util.h>
+#include <sys/dsl_crypt.h>
+#include <sys/rrwlock.h>
+#include <sys/zfs_file.h>
+
+#include <sys/dmu_recv.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zcp.h>
+#include <sys/zio_checksum.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
+
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_comutil.h"
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lauxlib.h>
+#include <sys/zfs_ioctl_impl.h>
+
+kmutex_t zfsdev_state_lock;
+zfsdev_state_t *zfsdev_state_list;
+
+/*
+ * Limit maximum nvlist size. We don't want users passing in insane values
+ * for zc->zc_nvlist_src_size, since we will need to allocate that much memory.
+ * Defaults to 0=auto which is handled by platform code.
+ */
+unsigned long zfs_max_nvlist_src_size = 0;
+
+/*
+ * When logging the output nvlist of an ioctl in the on-disk history, limit
+ * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS.
+ * This applies primarily to zfs_ioc_channel_program().
+ */
+unsigned long zfs_history_output_max = 1024 * 1024;
+
+uint_t zfs_fsyncer_key;
+uint_t zfs_allow_log_key;
+
+/* DATA_TYPE_ANY is used when zkey_type can vary. */
+#define DATA_TYPE_ANY DATA_TYPE_UNKNOWN
+
+typedef struct zfs_ioc_vec {
+ zfs_ioc_legacy_func_t *zvec_legacy_func;
+ zfs_ioc_func_t *zvec_func;
+ zfs_secpolicy_func_t *zvec_secpolicy;
+ zfs_ioc_namecheck_t zvec_namecheck;
+ boolean_t zvec_allow_log;
+ zfs_ioc_poolcheck_t zvec_pool_check;
+ boolean_t zvec_smush_outnvlist;
+ const char *zvec_name;
+ const zfs_ioc_key_t *zvec_nvl_keys;
+ size_t zvec_nvl_key_count;
+} zfs_ioc_vec_t;
+
+/* This array is indexed by zfs_userquota_prop_t */
+static const char *userquota_perms[] = {
+ ZFS_DELEG_PERM_USERUSED,
+ ZFS_DELEG_PERM_USERQUOTA,
+ ZFS_DELEG_PERM_GROUPUSED,
+ ZFS_DELEG_PERM_GROUPQUOTA,
+ ZFS_DELEG_PERM_USEROBJUSED,
+ ZFS_DELEG_PERM_USEROBJQUOTA,
+ ZFS_DELEG_PERM_GROUPOBJUSED,
+ ZFS_DELEG_PERM_GROUPOBJQUOTA,
+ ZFS_DELEG_PERM_PROJECTUSED,
+ ZFS_DELEG_PERM_PROJECTQUOTA,
+ ZFS_DELEG_PERM_PROJECTOBJUSED,
+ ZFS_DELEG_PERM_PROJECTOBJQUOTA,
+};
+
+static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
+static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc);
+static int zfs_check_settable(const char *name, nvpair_t *property,
+ cred_t *cr);
+static int zfs_check_clearable(const char *dataset, nvlist_t *props,
+ nvlist_t **errors);
+static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
+ boolean_t *);
+int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
+static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
+
+static void
+history_str_free(char *buf)
+{
+ kmem_free(buf, HIS_MAX_RECORD_LEN);
+}
+
+static char *
+history_str_get(zfs_cmd_t *zc)
+{
+ char *buf;
+
+ if (zc->zc_history == 0)
+ return (NULL);
+
+ buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
+ if (copyinstr((void *)(uintptr_t)zc->zc_history,
+ buf, HIS_MAX_RECORD_LEN, NULL) != 0) {
+ history_str_free(buf);
+ return (NULL);
+ }
+
+ buf[HIS_MAX_RECORD_LEN -1] = '\0';
+
+ return (buf);
+}
+
+/*
+ * Return non-zero if the spa version is less than requested version.
+ */
+static int
+zfs_earlier_version(const char *name, int version)
+{
+ spa_t *spa;
+
+ if (spa_open(name, &spa, FTAG) == 0) {
+ if (spa_version(spa) < version) {
+ spa_close(spa, FTAG);
+ return (1);
+ }
+ spa_close(spa, FTAG);
+ }
+ return (0);
+}
+
+/*
+ * Return TRUE if the ZPL version is less than requested version.
+ */
+static boolean_t
+zpl_earlier_version(const char *name, int version)
+{
+ objset_t *os;
+ boolean_t rc = B_TRUE;
+
+ if (dmu_objset_hold(name, FTAG, &os) == 0) {
+ uint64_t zplversion;
+
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (B_TRUE);
+ }
+ /* XXX reading from non-owned objset */
+ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
+ rc = zplversion < version;
+ dmu_objset_rele(os, FTAG);
+ }
+ return (rc);
+}
+
+static void
+zfs_log_history(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *buf;
+
+ if ((buf = history_str_get(zc)) == NULL)
+ return;
+
+ if (spa_open(zc->zc_name, &spa, FTAG) == 0) {
+ if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY)
+ (void) spa_history_log(spa, buf);
+ spa_close(spa, FTAG);
+ }
+ history_str_free(buf);
+}
+
+/*
+ * Policy for top-level read operations (list pools). Requires no privileges,
+ * and can be used in the local zone, as there is no associated dataset.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (0);
+}
+
+/*
+ * Policy for dataset read operations (list children, get statistics). Requires
+ * no privileges, but must be visible in the local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(zc->zc_name, NULL))
+ return (0);
+
+ return (SET_ERROR(ENOENT));
+}
+
+static int
+zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
+{
+ int writable = 1;
+
+ /*
+ * The dataset must be visible by this zone -- check this first
+ * so they don't see EPERM on something they shouldn't know about.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ !zone_dataset_visible(dataset, &writable))
+ return (SET_ERROR(ENOENT));
+
+ if (INGLOBALZONE(curproc)) {
+ /*
+ * If the fs is zoned, only root can access it from the
+ * global zone.
+ */
+ if (secpolicy_zfs(cr) && zoned)
+ return (SET_ERROR(EPERM));
+ } else {
+ /*
+ * If we are in a local zone, the 'zoned' property must be set.
+ */
+ if (!zoned)
+ return (SET_ERROR(EPERM));
+
+ /* must be writable by this zone */
+ if (!writable)
+ return (SET_ERROR(EPERM));
+ }
+ return (0);
+}
+
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+ uint64_t zoned;
+
+ if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED),
+ &zoned, NULL))
+ return (SET_ERROR(ENOENT));
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
+{
+ uint64_t zoned;
+
+ if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned))
+ return (SET_ERROR(ENOENT));
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
+ const char *perm, cred_t *cr)
+{
+ int error;
+
+ error = zfs_dozonecheck_ds(name, ds, cr);
+ if (error == 0) {
+ error = secpolicy_zfs(cr);
+ if (error != 0)
+ error = dsl_deleg_access_impl(ds, perm, cr);
+ }
+ return (error);
+}
+
+static int
+zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
+{
+ int error;
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp;
+
+ /*
+ * First do a quick check for root in the global zone, which
+ * is allowed to do all write_perms. This ensures that zfs_ioc_*
+ * will get to handle nonexistent datasets.
+ */
+ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0)
+ return (0);
+
+ error = dsl_pool_hold(name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, name, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr);
+
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+/*
+ * Policy for setting the security label property.
+ *
+ * Returns 0 for success, non-zero for access and other errors.
+ */
+static int
+zfs_set_slabel_policy(const char *name, const char *strval, cred_t *cr)
+{
+#ifdef HAVE_MLSLABEL
+ char ds_hexsl[MAXNAMELEN];
+ bslabel_t ds_sl, new_sl;
+ boolean_t new_default = FALSE;
+ uint64_t zoned;
+ int needed_priv = -1;
+ int error;
+
+ /* First get the existing dataset label. */
+ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+ if (error != 0)
+ return (SET_ERROR(EPERM));
+
+ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
+ new_default = TRUE;
+
+ /* The label must be translatable */
+ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * In a non-global zone, disallow attempts to set a label that
+ * doesn't match that of the zone; otherwise no other checks
+ * are needed.
+ */
+ if (!INGLOBALZONE(curproc)) {
+ if (new_default || !blequal(&new_sl, CR_SL(CRED())))
+ return (SET_ERROR(EPERM));
+ return (0);
+ }
+
+ /*
+ * For global-zone datasets (i.e., those whose zoned property is
+ * "off", verify that the specified new label is valid for the
+ * global zone.
+ */
+ if (dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+ return (SET_ERROR(EPERM));
+ if (!zoned) {
+ if (zfs_check_global_label(name, strval) != 0)
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * If the existing dataset label is nondefault, check if the
+ * dataset is mounted (label cannot be changed while mounted).
+ * Get the zfsvfs_t; if there isn't one, then the dataset isn't
+ * mounted (or isn't a dataset, doesn't exist, ...).
+ */
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
+ objset_t *os;
+ static const char *setsl_tag = "setsl_tag";
+
+ /*
+ * Try to own the dataset; abort if there is any error,
+ * (e.g., already mounted, in use, or other error).
+ */
+ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE,
+ setsl_tag, &os);
+ if (error != 0)
+ return (SET_ERROR(EPERM));
+
+ dmu_objset_disown(os, B_TRUE, setsl_tag);
+
+ if (new_default) {
+ needed_priv = PRIV_FILE_DOWNGRADE_SL;
+ goto out_check;
+ }
+
+ if (hexstr_to_label(strval, &new_sl) != 0)
+ return (SET_ERROR(EPERM));
+
+ if (blstrictdom(&ds_sl, &new_sl))
+ needed_priv = PRIV_FILE_DOWNGRADE_SL;
+ else if (blstrictdom(&new_sl, &ds_sl))
+ needed_priv = PRIV_FILE_UPGRADE_SL;
+ } else {
+ /* dataset currently has a default label */
+ if (!new_default)
+ needed_priv = PRIV_FILE_UPGRADE_SL;
+ }
+
+out_check:
+ if (needed_priv != -1)
+ return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
+ return (0);
+#else
+ return (SET_ERROR(ENOTSUP));
+#endif /* HAVE_MLSLABEL */
+}
+
+static int
+zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
+ cred_t *cr)
+{
+ char *strval;
+
+ /*
+ * Check permissions for special properties.
+ */
+ switch (prop) {
+ default:
+ break;
+ case ZFS_PROP_ZONED:
+ /*
+ * Disallow setting of 'zoned' from within a local zone.
+ */
+ if (!INGLOBALZONE(curproc))
+ return (SET_ERROR(EPERM));
+ break;
+
+ case ZFS_PROP_QUOTA:
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ case ZFS_PROP_SNAPSHOT_LIMIT:
+ if (!INGLOBALZONE(curproc)) {
+ uint64_t zoned;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+ /*
+ * Unprivileged users are allowed to modify the
+			 * limit on things *under* (i.e. contained by)
+ * the thing they own.
+ */
+ if (dsl_prop_get_integer(dsname,
+ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint))
+ return (SET_ERROR(EPERM));
+ if (!zoned || strlen(dsname) <= strlen(setpoint))
+ return (SET_ERROR(EPERM));
+ }
+ break;
+
+ case ZFS_PROP_MLSLABEL:
+ if (!is_system_labeled())
+ return (SET_ERROR(EPERM));
+
+ if (nvpair_value_string(propval, &strval) == 0) {
+ int err;
+
+ err = zfs_set_slabel_policy(dsname, strval, CRED());
+ if (err != 0)
+ return (err);
+ }
+ break;
+ }
+
+ return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error;
+
+ error = zfs_dozonecheck(zc->zc_name, cr);
+ if (error != 0)
+ return (error);
+
+ /*
+ * permission to set permissions will be evaluated later in
+ * dsl_deleg_can_allow()
+ */
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_ROLLBACK, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ const char *cp;
+ int error;
+
+ /*
+ * Generate the current snapshot name from the given objsetid, then
+ * use that name for the secpolicy/zone checks.
+ */
+ cp = strchr(zc->zc_name, '@');
+ if (cp == NULL)
+ return (SET_ERROR(EINVAL));
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_name(ds, zc->zc_name);
+
+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
+ ZFS_DELEG_PERM_SEND, cr);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_SEND, cr));
+}
+
+static int
+zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+static int
+zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+static int
+zfs_get_parent(const char *datasetname, char *parent, int parentsize)
+{
+ char *cp;
+
+ /*
+ * Remove the @bla or /bla from the end of the name to get the parent.
+ */
+ (void) strncpy(parent, datasetname, parentsize);
+ cp = strrchr(parent, '@');
+ if (cp != NULL) {
+ cp[0] = '\0';
+ } else {
+ cp = strrchr(parent, '/');
+ if (cp == NULL)
+ return (SET_ERROR(ENOENT));
+ cp[0] = '\0';
+ }
+
+ return (0);
+}
+
+int
+zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
+{
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_destroy_perms(zc->zc_name, cr));
+}
+
+/*
+ * Destroying snapshots with delegated permissions requires
+ * descendant mount and destroy permissions.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvlist_t *snaps;
+ nvpair_t *pair, *nextpair;
+ int error = 0;
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nextpair) {
+ nextpair = nvlist_next_nvpair(snaps, pair);
+ error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
+ if (error == ENOENT) {
+ /*
+ * Ignore any snapshots that don't exist (we consider
+ * them "already destroyed"). Remove the name from the
+ * nvl here in case the snapshot is created between
+ * now and when we try to destroy it (in which case
+ * we don't want to destroy it since we haven't
+ * checked for permission).
+ */
+ fnvlist_remove_nvpair(snaps, pair);
+ error = 0;
+ }
+ if (error != 0)
+ break;
+ }
+
+ return (error);
+}
+
+int
+zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
+{
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(from,
+ ZFS_DELEG_PERM_RENAME, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(from,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_get_parent(to, parentname,
+ sizeof (parentname))) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_CREATE, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *clone;
+ int error;
+
+ error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_PROMOTE, cr);
+ if (error != 0)
+ return (error);
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
+
+ if (error == 0) {
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_t *origin = NULL;
+ dsl_dir_t *dd;
+ dd = clone->ds_dir;
+
+ error = dsl_dataset_hold_obj(dd->dd_pool,
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
+ if (error != 0) {
+ dsl_dataset_rele(clone, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
+ ZFS_DELEG_PERM_MOUNT, cr);
+
+ dsl_dataset_name(origin, parentname);
+ if (error == 0) {
+ error = zfs_secpolicy_write_perms_ds(parentname, origin,
+ ZFS_DELEG_PERM_PROMOTE, cr);
+ }
+ dsl_dataset_rele(clone, FTAG);
+ dsl_dataset_rele(origin, FTAG);
+ }
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_RECEIVE, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_CREATE, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_recv_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_recv(zc, innvl, cr));
+}
+
+int
+zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_SNAPSHOT, cr));
+}
+
+/*
+ * Check for permission to create each snapshot in the nvlist.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvlist_t *snaps;
+ int error = 0;
+ nvpair_t *pair;
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ char *name = nvpair_name(pair);
+ char *atp = strchr(name, '@');
+
+ if (atp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ *atp = '\0';
+ error = zfs_secpolicy_snapshot_perms(name, cr);
+ *atp = '@';
+ if (error != 0)
+ break;
+ }
+ return (error);
+}
+
+/*
+ * Check for permission to create each bookmark in the nvlist.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error = 0;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ char *name = nvpair_name(pair);
+ char *hashp = strchr(name, '#');
+
+ if (hashp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ *hashp = '\0';
+ error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_BOOKMARK, cr);
+ *hashp = '#';
+ if (error != 0)
+ break;
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvpair_t *pair, *nextpair;
+ int error = 0;
+
+ for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+ pair = nextpair) {
+ char *name = nvpair_name(pair);
+ char *hashp = strchr(name, '#');
+ nextpair = nvlist_next_nvpair(innvl, pair);
+
+ if (hashp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ *hashp = '\0';
+ error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_DESTROY, cr);
+ *hashp = '#';
+ if (error == ENOENT) {
+ /*
+ * Ignore any filesystems that don't exist (we consider
+ * their bookmarks "already destroyed"). Remove
+ * the name from the nvl here in case the filesystem
+ * is created between now and when we try to destroy
+ * the bookmark (in which case we don't want to
+ * destroy it since we haven't checked for permission).
+ */
+ fnvlist_remove_nvpair(innvl, pair);
+ error = 0;
+ }
+ if (error != 0)
+ break;
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ /*
+ * Even root must have a proper TSD so that we know what pool
+ * to log to.
+ */
+ if (tsd_get(zfs_allow_log_key) == NULL)
+ return (SET_ERROR(EPERM));
+ return (0);
+}
+
+static int
+zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ int error;
+ char *origin;
+
+ if ((error = zfs_get_parent(zc->zc_name, parentname,
+ sizeof (parentname))) != 0)
+ return (error);
+
+ if (nvlist_lookup_string(innvl, "origin", &origin) == 0 &&
+ (error = zfs_secpolicy_write_perms(origin,
+ ZFS_DELEG_PERM_CLONE, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_CREATE, cr)) != 0)
+ return (error);
+
+ return (zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_MOUNT, cr));
+}
+
+/*
+ * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
+ * SYS_CONFIG privilege, which is not available in a local zone.
+ */
+/* ARGSUSED */
+int
+zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ if (secpolicy_sys_config(cr, B_FALSE) != 0)
+ return (SET_ERROR(EPERM));
+
+ return (0);
+}
+
+/*
+ * Policy for object to name lookups.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error;
+
+ if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr);
+ return (error);
+}
+
+/*
+ * Policy for fault injection. Requires all privileges.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (secpolicy_zinject(cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
+
+ if (prop == ZPROP_INVAL) {
+ if (!zfs_prop_user(zc->zc_value))
+ return (SET_ERROR(EINVAL));
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_USERPROP, cr));
+ } else {
+ return (zfs_secpolicy_setprop(zc->zc_name, prop,
+ NULL, cr));
+ }
+}
+
+static int
+zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int err = zfs_secpolicy_read(zc, innvl, cr);
+ if (err)
+ return (err);
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (SET_ERROR(EINVAL));
+
+ if (zc->zc_value[0] == 0) {
+ /*
+		 * They are asking about a POSIX uid/gid. If it's
+		 * their own, allow it.
+ */
+ if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
+ zc->zc_objset_type == ZFS_PROP_USERQUOTA ||
+ zc->zc_objset_type == ZFS_PROP_USEROBJUSED ||
+ zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) {
+ if (zc->zc_guid == crgetuid(cr))
+ return (0);
+ } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED ||
+ zc->zc_objset_type == ZFS_PROP_GROUPQUOTA ||
+ zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED ||
+ zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) {
+ if (groupmember(zc->zc_guid, cr))
+ return (0);
+ }
+ /* else is for project quota/used */
+ }
+
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ userquota_perms[zc->zc_objset_type], cr));
+}
+
+static int
+zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int err = zfs_secpolicy_read(zc, innvl, cr);
+ if (err)
+ return (err);
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (SET_ERROR(EINVAL));
+
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ userquota_perms[zc->zc_objset_type], cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
+ NULL, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvpair_t *pair;
+ nvlist_t *holds;
+ int error;
+
+ holds = fnvlist_lookup_nvlist(innvl, "holds");
+
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ error = dmu_fsname(nvpair_name(pair), fsname);
+ if (error != 0)
+ return (error);
+ error = zfs_secpolicy_write_perms(fsname,
+ ZFS_DELEG_PERM_HOLD, cr);
+ if (error != 0)
+ return (error);
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvpair_t *pair;
+ int error;
+
+ for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(innvl, pair)) {
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ error = dmu_fsname(nvpair_name(pair), fsname);
+ if (error != 0)
+ return (error);
+ error = zfs_secpolicy_write_perms(fsname,
+ ZFS_DELEG_PERM_RELEASE, cr);
+ if (error != 0)
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * Policy for allowing temporary snapshots to be taken or released
+ */
+static int
+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ /*
+ * A temporary snapshot is the same as a snapshot,
+ * hold, destroy and release all rolled into one.
+	 * Delegated 'diff' permission alone is sufficient to allow this.
+ */
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_DIFF, cr)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
+
+ if (innvl != NULL) {
+ if (error == 0)
+ error = zfs_secpolicy_hold(zc, innvl, cr);
+ if (error == 0)
+ error = zfs_secpolicy_release(zc, innvl, cr);
+ if (error == 0)
+ error = zfs_secpolicy_destroy(zc, innvl, cr);
+ }
+ return (error);
+}
+
+static int
+zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_LOAD_KEY, cr));
+}
+
+static int
+zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_CHANGE_KEY, cr));
+}
+
+/*
+ * Returns the nvlist as specified by the user in the zfs_cmd_t.
+ */
+static int
+get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
+{
+ char *packed;
+ int error;
+ nvlist_t *list = NULL;
+
+ /*
+ * Read in and unpack the user-supplied nvlist.
+ */
+ if (size == 0)
+ return (SET_ERROR(EINVAL));
+
+ packed = vmem_alloc(size, KM_SLEEP);
+
+ if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
+ iflag)) != 0) {
+ vmem_free(packed, size);
+ return (SET_ERROR(EFAULT));
+ }
+
+ if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) {
+ vmem_free(packed, size);
+ return (error);
+ }
+
+ vmem_free(packed, size);
+
+ *nvp = list;
+ return (0);
+}
+
+/*
+ * Reduce the size of this nvlist until it can be serialized in 'max' bytes.
+ * Entries will be removed from the end of the nvlist, and one int32 entry
+ * named "N_MORE_ERRORS" will be added indicating how many entries were
+ * removed.
+ */
+static int
+nvlist_smush(nvlist_t *errors, size_t max)
+{
+ size_t size;
+
+ size = fnvlist_size(errors);
+
+ if (size > max) {
+ nvpair_t *more_errors;
+ int n = 0;
+
+ if (max < 1024)
+ return (SET_ERROR(ENOMEM));
+
+ fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0);
+ more_errors = nvlist_prev_nvpair(errors, NULL);
+
+ do {
+ nvpair_t *pair = nvlist_prev_nvpair(errors,
+ more_errors);
+ fnvlist_remove_nvpair(errors, pair);
+ n++;
+ size = fnvlist_size(errors);
+ } while (size > max);
+
+ fnvlist_remove_nvpair(errors, more_errors);
+ fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n);
+ ASSERT3U(fnvlist_size(errors), <=, max);
+ }
+
+ return (0);
+}
+
+static int
+put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
+{
+ char *packed = NULL;
+ int error = 0;
+ size_t size;
+
+ size = fnvlist_size(nvl);
+
+ if (size > zc->zc_nvlist_dst_size) {
+ error = SET_ERROR(ENOMEM);
+ } else {
+ packed = fnvlist_pack(nvl, &size);
+ if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ size, zc->zc_iflags) != 0)
+ error = SET_ERROR(EFAULT);
+ fnvlist_pack_free(packed, size);
+ }
+
+ zc->zc_nvlist_dst_size = size;
+ zc->zc_nvlist_dst_filled = B_TRUE;
+ return (error);
+}
+
+int
+getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp)
+{
+ int error = 0;
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ mutex_enter(&os->os_user_ptr_lock);
+ *zfvp = dmu_objset_get_user(os);
+ /* bump s_active only when non-zero to prevent umount race */
+ error = zfs_vfs_ref(zfvp);
+ mutex_exit(&os->os_user_ptr_lock);
+ return (error);
+}
+
+int
+getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
+{
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_hold(dsname, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ error = getzfsvfs_impl(os, zfvp);
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+/*
+ * Find a zfsvfs_t for a mounted filesystem, or create our own, in which
+ * case its z_sb will be NULL, and it will be opened as the owner.
+ * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER,
+ * which prevents all inode ops from running.
+ */
+static int
+zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
+{
+ int error = 0;
+
+ if (getzfsvfs(name, zfvp) != 0)
+ error = zfsvfs_create(name, B_FALSE, zfvp);
+ if (error == 0) {
+ if (writer)
+ ZFS_TEARDOWN_ENTER_WRITE(*zfvp, tag);
+ else
+ ZFS_TEARDOWN_ENTER_READ(*zfvp, tag);
+ if ((*zfvp)->z_unmounted) {
+ /*
+ * XXX we could probably try again, since the unmounting
+ * thread should be just about to disassociate the
+ * objset from the zfsvfs.
+ */
+ ZFS_TEARDOWN_EXIT(*zfvp, tag);
+ return (SET_ERROR(EBUSY));
+ }
+ }
+ return (error);
+}
+
+static void
+zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
+{
+ ZFS_TEARDOWN_EXIT(zfsvfs, tag);
+
+ if (zfs_vfs_held(zfsvfs)) {
+ zfs_vfs_rele(zfsvfs);
+ } else {
+ dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
+ zfsvfs_free(zfsvfs);
+ }
+}
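+
+/*
+ * Editor's illustration, not part of the upstream source: the usual pairing
+ * of the two helpers above, where "dsname" is a placeholder dataset name.
+ *
+ *	zfsvfs_t *zfsvfs;
+ *	int error = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE);
+ *	if (error == 0) {
+ *		... operate on the filesystem with inode ops quiesced ...
+ *		zfsvfs_rele(zfsvfs, FTAG);
+ *	}
+ *
+ * Passing B_TRUE takes the teardown lock as writer, as described in the
+ * comment above zfsvfs_hold().
+ */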
+
+static int
+zfs_ioc_pool_create(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *config, *props = NULL;
+ nvlist_t *rootprops = NULL;
+ nvlist_t *zplprops = NULL;
+ dsl_crypto_params_t *dcp = NULL;
+ const char *spa_name = zc->zc_name;
+ boolean_t unload_wkey = B_TRUE;
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config)))
+ return (error);
+
+ if (zc->zc_nvlist_src_size != 0 && (error =
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
+ nvlist_free(config);
+ return (error);
+ }
+
+ if (props) {
+ nvlist_t *nvl = NULL;
+ nvlist_t *hidden_args = NULL;
+ uint64_t version = SPA_VERSION;
+ char *tname;
+
+ (void) nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
+ if (!SPA_VERSION_IS_SUPPORTED(version)) {
+ error = SET_ERROR(EINVAL);
+ goto pool_props_bad;
+ }
+ (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl);
+ if (nvl) {
+ error = nvlist_dup(nvl, &rootprops, KM_SLEEP);
+ if (error != 0)
+ goto pool_props_bad;
+ (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS);
+ }
+
+ (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS,
+ &hidden_args);
+ error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
+ rootprops, hidden_args, &dcp);
+ if (error != 0)
+ goto pool_props_bad;
+ (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS);
+
+ VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ error = zfs_fill_zplprops_root(version, rootprops,
+ zplprops, NULL);
+ if (error != 0)
+ goto pool_props_bad;
+
+ if (nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0)
+ spa_name = tname;
+ }
+
+ error = spa_create(zc->zc_name, config, props, zplprops, dcp);
+
+ /*
+ * Set the remaining root properties
+ */
+ if (!error && (error = zfs_set_prop_nvlist(spa_name,
+ ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) {
+ (void) spa_destroy(spa_name);
+ unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */
+ }
+
+pool_props_bad:
+ nvlist_free(rootprops);
+ nvlist_free(zplprops);
+ nvlist_free(config);
+ nvlist_free(props);
+ dsl_crypto_params_free(dcp, unload_wkey && !!error);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_destroy(zfs_cmd_t *zc)
+{
+ int error;
+ zfs_log_history(zc);
+ error = spa_destroy(zc->zc_name);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_import(zfs_cmd_t *zc)
+{
+ nvlist_t *config, *props = NULL;
+ uint64_t guid;
+ int error;
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config)) != 0)
+ return (error);
+
+ if (zc->zc_nvlist_src_size != 0 && (error =
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
+ nvlist_free(config);
+ return (error);
+ }
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
+ guid != zc->zc_guid)
+ error = SET_ERROR(EINVAL);
+ else
+ error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
+
+ if (zc->zc_nvlist_dst != 0) {
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
+ }
+
+ nvlist_free(config);
+ nvlist_free(props);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_export(zfs_cmd_t *zc)
+{
+ int error;
+ boolean_t force = (boolean_t)zc->zc_cookie;
+ boolean_t hardforce = (boolean_t)zc->zc_guid;
+
+ zfs_log_history(zc);
+ error = spa_export(zc->zc_name, NULL, force, hardforce);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_configs(zfs_cmd_t *zc)
+{
+ nvlist_t *configs;
+ int error;
+
+ if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
+ return (SET_ERROR(EEXIST));
+
+ error = put_nvlist(zc, configs);
+
+ nvlist_free(configs);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of the pool
+ *
+ * outputs:
+ * zc_cookie real errno
+ * zc_nvlist_dst config nvlist
+ * zc_nvlist_dst_size size of config nvlist
+ */
+static int
+zfs_ioc_pool_stats(zfs_cmd_t *zc)
+{
+ nvlist_t *config;
+ int error;
+ int ret = 0;
+
+ error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
+ sizeof (zc->zc_value));
+
+ if (config != NULL) {
+ ret = put_nvlist(zc, config);
+ nvlist_free(config);
+
+ /*
+ * The config may be present even if 'error' is non-zero.
+ * In this case we return success, and preserve the real errno
+ * in 'zc_cookie'.
+ */
+ zc->zc_cookie = error;
+ } else {
+ ret = error;
+ }
+
+ return (ret);
+}
+
+/*
+ * Try to import the given pool, returning pool stats as appropriate so that
+ * userland knows which devices are available and overall pool health.
+ */
+static int
+zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
+{
+ nvlist_t *tryconfig, *config = NULL;
+ int error;
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &tryconfig)) != 0)
+ return (error);
+
+ config = spa_tryimport(tryconfig);
+
+ nvlist_free(tryconfig);
+
+ if (config == NULL)
+ return (SET_ERROR(EINVAL));
+
+ error = put_nvlist(zc, config);
+ nvlist_free(config);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_cookie scan func (pool_scan_func_t)
+ * zc_flags scrub pause/resume flag (pool_scrub_cmd_t)
+ */
+static int
+zfs_ioc_pool_scan(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if (zc->zc_flags >= POOL_SCRUB_FLAGS_END)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (zc->zc_flags == POOL_SCRUB_PAUSE)
+ error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE);
+ else if (zc->zc_cookie == POOL_SCAN_NONE)
+ error = spa_scan_stop(spa);
+ else
+ error = spa_scan(spa, zc->zc_cookie);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_freeze(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ spa_freeze(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (zc->zc_cookie < spa_version(spa) ||
+ !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ spa_upgrade(spa, zc->zc_cookie);
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_get_history(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *hist_buf;
+ uint64_t size;
+ int error;
+
+ if ((size = zc->zc_history_len) == 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ hist_buf = vmem_alloc(size, KM_SLEEP);
+ if ((error = spa_history_get(spa, &zc->zc_history_offset,
+ &zc->zc_history_len, hist_buf)) == 0) {
+ error = ddi_copyout(hist_buf,
+ (void *)(uintptr_t)zc->zc_history,
+ zc->zc_history_len, zc->zc_iflags);
+ }
+
+ spa_close(spa, FTAG);
+ vmem_free(hist_buf, size);
+ return (error);
+}
+
+static int
+zfs_ioc_pool_reguid(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ error = spa_change_guid(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
+{
+ return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value));
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_obj object to find
+ *
+ * outputs:
+ * zc_value name of object
+ */
+static int
+zfs_ioc_obj_to_path(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ /* XXX reading from objset not owned */
+ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE,
+ FTAG, &os)) != 0)
+ return (error);
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
+ sizeof (zc->zc_value));
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_obj object to find
+ *
+ * outputs:
+ * zc_stat stats on object
+ * zc_value path to object
+ */
+static int
+zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ /* XXX reading from objset not owned */
+ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE,
+ FTAG, &os)) != 0)
+ return (error);
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
+ sizeof (zc->zc_value));
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_add(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *config;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config);
+ if (error == 0) {
+ error = spa_vdev_add(spa, config);
+ nvlist_free(config);
+ }
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_guid guid of vdev to remove
+ * zc_cookie cancel removal
+ */
+static int
+zfs_ioc_vdev_remove(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ if (zc->zc_cookie != 0) {
+ error = spa_vdev_remove_cancel(spa);
+ } else {
+ error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
+ }
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ vdev_state_t newstate = VDEV_STATE_UNKNOWN;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+ switch (zc->zc_cookie) {
+ case VDEV_STATE_ONLINE:
+ error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
+ break;
+
+ case VDEV_STATE_OFFLINE:
+ error = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
+ break;
+
+ case VDEV_STATE_FAULTED:
+ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST)
+ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+ error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
+ break;
+
+ case VDEV_STATE_DEGRADED:
+ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL)
+ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+ error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
+ break;
+
+ default:
+ error = SET_ERROR(EINVAL);
+ }
+ zc->zc_cookie = newstate;
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_attach(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ nvlist_t *config;
+ int replacing = zc->zc_cookie;
+ int rebuild = zc->zc_simple;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config)) == 0) {
+ error = spa_vdev_attach(spa, zc->zc_guid, config, replacing,
+ rebuild);
+ nvlist_free(config);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_detach(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_split(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ nvlist_t *config, *props = NULL;
+ int error;
+ boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config))) {
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_nvlist_src_size != 0 && (error =
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
+ spa_close(spa, FTAG);
+ nvlist_free(config);
+ return (error);
+ }
+
+ error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);
+
+ spa_close(spa, FTAG);
+
+ nvlist_free(config);
+ nvlist_free(props);
+
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ const char *path = zc->zc_value;
+ uint64_t guid = zc->zc_guid;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = spa_vdev_setpath(spa, guid, path);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ const char *fru = zc->zc_value;
+ uint64_t guid = zc->zc_guid;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = spa_vdev_setfru(spa, guid, fru);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+{
+ int error = 0;
+ nvlist_t *nv;
+
+ dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+ if (zc->zc_nvlist_dst != 0 &&
+ (error = dsl_prop_get_all(os, &nv)) == 0) {
+ dmu_objset_stats(os, nv);
+ /*
+ * NB: zvol_get_stats() will read the objset contents,
+ * which we aren't supposed to do with a
+ * DS_MODE_USER hold, because it could be
+ * inconsistent. So this is a bit of a workaround...
+ * XXX reading without owning
+ */
+ if (!zc->zc_objset_stats.dds_inconsistent &&
+ dmu_objset_type(os) == DMU_OST_ZVOL) {
+ error = zvol_get_stats(os, nv);
+ if (error == EIO) {
+ nvlist_free(nv);
+ return (error);
+ }
+ VERIFY0(error);
+ }
+ if (error == 0)
+ error = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ }
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_objset_stats stats
+ * zc_nvlist_dst property nvlist
+ * zc_nvlist_dst_size size of property nvlist
+ */
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error == 0) {
+ error = zfs_ioc_objset_stats_impl(zc, os);
+ dmu_objset_rele(os, FTAG);
+ }
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_nvlist_dst received property nvlist
+ * zc_nvlist_dst_size size of received property nvlist
+ *
+ * Gets received properties (distinct from local properties on or after
+ * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
+ * local property values.
+ */
+static int
+zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
+{
+ int error = 0;
+ nvlist_t *nv;
+
+ /*
+ * Without this check, we would return local property values if the
+ * caller has not already received properties on or after
+ * SPA_VERSION_RECVD_PROPS.
+ */
+ if (!dsl_prop_get_hasrecvd(zc->zc_name))
+ return (SET_ERROR(ENOTSUP));
+
+ if (zc->zc_nvlist_dst != 0 &&
+ (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) {
+ error = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ }
+
+ return (error);
+}
+
+static int
+nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
+{
+ uint64_t value;
+ int error;
+
+ /*
+ * zfs_get_zplprop() will either find a value or give us
+ * the default value (if there is one).
+ */
+ if ((error = zfs_get_zplprop(os, prop, &value)) != 0)
+ return (error);
+ VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0);
+ return (0);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for zpl property nvlist
+ *
+ * outputs:
+ * zc_nvlist_dst zpl property nvlist
+ * zc_nvlist_dst_size size of zpl property nvlist
+ */
+static int
+zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int err;
+
+ /* XXX reading without owning */
+ if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os)))
+ return (err);
+
+ dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+ /*
+ * NB: nvl_add_zplprop() will read the objset contents,
+ * which we aren't supposed to do with a DS_MODE_USER
+ * hold, because it could be inconsistent.
+ */
+ if (zc->zc_nvlist_dst != 0 &&
+ !zc->zc_objset_stats.dds_inconsistent &&
+ dmu_objset_type(os) == DMU_OST_ZFS) {
+ nvlist_t *nv;
+
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 &&
+ (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 &&
+ (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 &&
+ (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0)
+ err = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ } else {
+ err = SET_ERROR(ENOENT);
+ }
+ dmu_objset_rele(os, FTAG);
+ return (err);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_cookie zap cursor
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_name name of next filesystem
+ * zc_cookie zap cursor
+ * zc_objset_stats stats
+ * zc_nvlist_dst property nvlist
+ * zc_nvlist_dst_size size of property nvlist
+ */
+static int
+zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+ char *p;
+ size_t orig_len = strlen(zc->zc_name);
+
+top:
+ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) {
+ if (error == ENOENT)
+ error = SET_ERROR(ESRCH);
+ return (error);
+ }
+
+ p = strrchr(zc->zc_name, '/');
+ if (p == NULL || p[1] != '\0')
+ (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
+ p = zc->zc_name + strlen(zc->zc_name);
+
+ do {
+ error = dmu_dir_list_next(os,
+ sizeof (zc->zc_name) - (p - zc->zc_name), p,
+ NULL, &zc->zc_cookie);
+ if (error == ENOENT)
+ error = SET_ERROR(ESRCH);
+ } while (error == 0 && zfs_dataset_name_hidden(zc->zc_name));
+ dmu_objset_rele(os, FTAG);
+
+ /*
+	 * If it's an internal dataset (i.e. one with a '$' in its name),
+ * don't try to get stats for it, otherwise we'll return ENOENT.
+ */
+ if (error == 0 && strchr(zc->zc_name, '$') == NULL) {
+ error = zfs_ioc_objset_stats(zc); /* fill in the stats */
+ if (error == ENOENT) {
+ /* We lost a race with destroy, get the next one. */
+ zc->zc_name[orig_len] = '\0';
+ goto top;
+ }
+ }
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_cookie zap cursor
+ * zc_nvlist_src iteration range nvlist
+ * zc_nvlist_src_size size of iteration range nvlist
+ *
+ * outputs:
+ * zc_name name of next snapshot
+ * zc_objset_stats stats
+ * zc_nvlist_dst property nvlist
+ * zc_nvlist_dst_size size of property nvlist
+ */
+static int
+zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
+{
+ int error;
+ objset_t *os, *ossnap;
+ dsl_dataset_t *ds;
+ uint64_t min_txg = 0, max_txg = 0;
+
+ if (zc->zc_nvlist_src_size != 0) {
+ nvlist_t *props = NULL;
+ error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props);
+ if (error != 0)
+ return (error);
+ (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG,
+ &min_txg);
+ (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG,
+ &max_txg);
+ nvlist_free(props);
+ }
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error != 0) {
+ return (error == ENOENT ? SET_ERROR(ESRCH) : error);
+ }
+
+ /*
+ * A dataset name of maximum length cannot have any snapshots,
+ * so exit immediately.
+ */
+ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
+ ZFS_MAX_DATASET_NAME_LEN) {
+ dmu_objset_rele(os, FTAG);
+ return (SET_ERROR(ESRCH));
+ }
+
+ while (error == 0) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ error = SET_ERROR(EINTR);
+ break;
+ }
+
+ error = dmu_snapshot_list_next(os,
+ sizeof (zc->zc_name) - strlen(zc->zc_name),
+ zc->zc_name + strlen(zc->zc_name), &zc->zc_obj,
+ &zc->zc_cookie, NULL);
+ if (error == ENOENT) {
+ error = SET_ERROR(ESRCH);
+ break;
+ } else if (error != 0) {
+ break;
+ }
+
+ error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj,
+ FTAG, &ds);
+ if (error != 0)
+ break;
+
+ if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) ||
+ (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) {
+ dsl_dataset_rele(ds, FTAG);
+ /* undo snapshot name append */
+ *(strchr(zc->zc_name, '@') + 1) = '\0';
+ /* skip snapshot */
+ continue;
+ }
+
+ if (zc->zc_simple) {
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+
+ if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+ if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+
+ dmu_objset_rele(os, FTAG);
+ /* if we failed, undo the @ that we tacked on to zc_name */
+ if (error != 0)
+ *strchr(zc->zc_name, '@') = '\0';
+ return (error);
+}
+
+static int
+zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
+{
+ const char *propname = nvpair_name(pair);
+ uint64_t *valary;
+ unsigned int vallen;
+ const char *dash, *domain;
+ zfs_userquota_prop_t type;
+ uint64_t rid;
+ uint64_t quota;
+ zfsvfs_t *zfsvfs;
+ int err;
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * A correctly constructed propname is encoded as
+ * userquota@<rid>-<domain>.
+ */
+ if ((dash = strchr(propname, '-')) == NULL ||
+ nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
+ vallen != 3)
+ return (SET_ERROR(EINVAL));
+
+ domain = dash + 1;
+ type = valary[0];
+ rid = valary[1];
+ quota = valary[2];
+
+ err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
+ if (err == 0) {
+ err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
+ zfsvfs_rele(zfsvfs, FTAG);
+ }
+
+ return (err);
+}
+
+/*
+ * If the named property is one that has a special function to set its value,
+ * return 0 on success and a positive error code on failure; otherwise, if it
+ * is not one of the special properties handled here, return -1.
+ *
+ * XXX: It would be better for callers of the property interface if we handled
+ * these special cases in dsl_prop.c (in the dsl layer).
+ */
+static int
+zfs_prop_set_special(const char *dsname, zprop_source_t source,
+ nvpair_t *pair)
+{
+ const char *propname = nvpair_name(pair);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ uint64_t intval = 0;
+ const char *strval = NULL;
+ int err = -1;
+
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_userquota(propname))
+ return (zfs_prop_set_userquota(dsname, pair));
+ return (-1);
+ }
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
+
+ /* all special properties are numeric except for keylocation */
+ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
+ strval = fnvpair_value_string(pair);
+ } else {
+ intval = fnvpair_value_uint64(pair);
+ }
+
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ err = dsl_dir_set_quota(dsname, source, intval);
+ break;
+ case ZFS_PROP_REFQUOTA:
+ err = dsl_dataset_set_refquota(dsname, source, intval);
+ break;
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ case ZFS_PROP_SNAPSHOT_LIMIT:
+ if (intval == UINT64_MAX) {
+ /* clearing the limit, just do it */
+ err = 0;
+ } else {
+ err = dsl_dir_activate_fs_ss_limit(dsname);
+ }
+ /*
+ * Set err to -1 to force the zfs_set_prop_nvlist code down the
+ * default path to set the value in the nvlist.
+ */
+ if (err == 0)
+ err = -1;
+ break;
+ case ZFS_PROP_KEYLOCATION:
+ err = dsl_crypto_can_set_keylocation(dsname, strval);
+
+ /*
+ * Set err to -1 to force the zfs_set_prop_nvlist code down the
+ * default path to set the value in the nvlist.
+ */
+ if (err == 0)
+ err = -1;
+ break;
+ case ZFS_PROP_RESERVATION:
+ err = dsl_dir_set_reservation(dsname, source, intval);
+ break;
+ case ZFS_PROP_REFRESERVATION:
+ err = dsl_dataset_set_refreservation(dsname, source, intval);
+ break;
+ case ZFS_PROP_COMPRESSION:
+ err = dsl_dataset_set_compression(dsname, source, intval);
+ /*
+ * Set err to -1 to force the zfs_set_prop_nvlist code down the
+ * default path to set the value in the nvlist.
+ */
+ if (err == 0)
+ err = -1;
+ break;
+ case ZFS_PROP_VOLSIZE:
+ err = zvol_set_volsize(dsname, intval);
+ break;
+ case ZFS_PROP_SNAPDEV:
+ err = zvol_set_snapdev(dsname, source, intval);
+ break;
+ case ZFS_PROP_VOLMODE:
+ err = zvol_set_volmode(dsname, source, intval);
+ break;
+ case ZFS_PROP_VERSION:
+ {
+ zfsvfs_t *zfsvfs;
+
+ if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
+ break;
+
+ err = zfs_set_version(zfsvfs, intval);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
+ zfs_cmd_t *zc;
+
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ (void) strlcpy(zc->zc_name, dsname,
+ sizeof (zc->zc_name));
+ (void) zfs_ioc_userspace_upgrade(zc);
+ (void) zfs_ioc_id_quota_upgrade(zc);
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ }
+ break;
+ }
+ default:
+ err = -1;
+ }
+
+ return (err);
+}
+
+/*
+ * This function is best effort. If it fails to set any of the given properties,
+ * it continues to set as many as it can and returns the last error
+ * encountered. If the caller provides a non-NULL errlist, it will be filled in
+ * with the list of names of all the properties that failed along with the
+ * corresponding error numbers.
+ *
+ * If every property is set successfully, zero is returned and errlist is not
+ * modified.
+ */
+int
+zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
+ nvlist_t *errlist)
+{
+ nvpair_t *pair;
+ nvpair_t *propval;
+ int rv = 0;
+ uint64_t intval;
+ const char *strval;
+
+ nvlist_t *genericnvl = fnvlist_alloc();
+ nvlist_t *retrynvl = fnvlist_alloc();
+retry:
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ int err = 0;
+
+ /* decode the property value */
+ propval = pair;
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ attrs = fnvpair_value_nvlist(pair);
+ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &propval) != 0)
+ err = SET_ERROR(EINVAL);
+ }
+
+ /* Validate value type */
+ if (err == 0 && source == ZPROP_SRC_INHERITED) {
+ /* inherited properties are expected to be booleans */
+ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN)
+ err = SET_ERROR(EINVAL);
+ } else if (err == 0 && prop == ZPROP_INVAL) {
+ if (zfs_prop_user(propname)) {
+ if (nvpair_type(propval) != DATA_TYPE_STRING)
+ err = SET_ERROR(EINVAL);
+ } else if (zfs_prop_userquota(propname)) {
+ if (nvpair_type(propval) !=
+ DATA_TYPE_UINT64_ARRAY)
+ err = SET_ERROR(EINVAL);
+ } else {
+ err = SET_ERROR(EINVAL);
+ }
+ } else if (err == 0) {
+ if (nvpair_type(propval) == DATA_TYPE_STRING) {
+ if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
+ err = SET_ERROR(EINVAL);
+ } else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
+ const char *unused;
+
+ intval = fnvpair_value_uint64(propval);
+
+ switch (zfs_prop_get_type(prop)) {
+ case PROP_TYPE_NUMBER:
+ break;
+ case PROP_TYPE_STRING:
+ err = SET_ERROR(EINVAL);
+ break;
+ case PROP_TYPE_INDEX:
+ if (zfs_prop_index_to_string(prop,
+ intval, &unused) != 0)
+ err =
+ SET_ERROR(ZFS_ERR_BADPROP);
+ break;
+ default:
+ cmn_err(CE_PANIC,
+ "unknown property type");
+ }
+ } else {
+ err = SET_ERROR(EINVAL);
+ }
+ }
+
+ /* Validate permissions */
+ if (err == 0)
+ err = zfs_check_settable(dsname, pair, CRED());
+
+ if (err == 0) {
+ if (source == ZPROP_SRC_INHERITED)
+ err = -1; /* does not need special handling */
+ else
+ err = zfs_prop_set_special(dsname, source,
+ pair);
+ if (err == -1) {
+ /*
+ * For better performance we build up a list of
+ * properties to set in a single transaction.
+ */
+ err = nvlist_add_nvpair(genericnvl, pair);
+ } else if (err != 0 && nvl != retrynvl) {
+ /*
+ * This may be a spurious error caused by
+ * receiving quota and reservation out of order.
+ * Try again in a second pass.
+ */
+ err = nvlist_add_nvpair(retrynvl, pair);
+ }
+ }
+
+ if (err != 0) {
+ if (errlist != NULL)
+ fnvlist_add_int32(errlist, propname, err);
+ rv = err;
+ }
+ }
+
+ if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
+ nvl = retrynvl;
+ goto retry;
+ }
+
+ if (!nvlist_empty(genericnvl) &&
+ dsl_props_set(dsname, source, genericnvl) != 0) {
+ /*
+ * If this fails, we still want to set as many properties as we
+ * can, so try setting them individually.
+ */
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+ int err = 0;
+
+ propval = pair;
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ attrs = fnvpair_value_nvlist(pair);
+ propval = fnvlist_lookup_nvpair(attrs,
+ ZPROP_VALUE);
+ }
+
+ if (nvpair_type(propval) == DATA_TYPE_STRING) {
+ strval = fnvpair_value_string(propval);
+ err = dsl_prop_set_string(dsname, propname,
+ source, strval);
+ } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) {
+ err = dsl_prop_inherit(dsname, propname,
+ source);
+ } else {
+ intval = fnvpair_value_uint64(propval);
+ err = dsl_prop_set_int(dsname, propname, source,
+ intval);
+ }
+
+ if (err != 0) {
+ if (errlist != NULL) {
+ fnvlist_add_int32(errlist, propname,
+ err);
+ }
+ rv = err;
+ }
+ }
+ }
+ nvlist_free(genericnvl);
+ nvlist_free(retrynvl);
+
+ return (rv);
+}
+
+/*
+ * Check that all the properties are valid user properties.
+ */
+static int
+zfs_check_userprops(nvlist_t *nvl)
+{
+ nvpair_t *pair = NULL;
+
+ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+
+ if (!zfs_prop_user(propname) ||
+ nvpair_type(pair) != DATA_TYPE_STRING)
+ return (SET_ERROR(EINVAL));
+
+ if (strlen(propname) >= ZAP_MAXNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
+ return (SET_ERROR(E2BIG));
+ }
+ return (0);
+}
+
+static void
+props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
+{
+ nvpair_t *pair;
+
+ VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(props, pair)) != NULL) {
+ if (nvlist_exists(skipped, nvpair_name(pair)))
+ continue;
+
+ VERIFY(nvlist_add_nvpair(*newprops, pair) == 0);
+ }
+}
+
+static int
+clear_received_props(const char *dsname, nvlist_t *props,
+ nvlist_t *skipped)
+{
+ int err = 0;
+ nvlist_t *cleared_props = NULL;
+ props_skip(props, skipped, &cleared_props);
+ if (!nvlist_empty(cleared_props)) {
+ /*
+ * Acts on local properties until the dataset has received
+ * properties at least once on or after SPA_VERSION_RECVD_PROPS.
+ */
+ zprop_source_t flags = (ZPROP_SRC_NONE |
+ (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0));
+ err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL);
+ }
+ nvlist_free(cleared_props);
+ return (err);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value name of property to set
+ * zc_nvlist_src{_size} nvlist of properties to apply
+ * zc_cookie received properties flag
+ *
+ * outputs:
+ * zc_nvlist_dst{_size} error for each unapplied received property
+ */
+static int
+zfs_ioc_set_prop(zfs_cmd_t *zc)
+{
+ nvlist_t *nvl;
+ boolean_t received = zc->zc_cookie;
+ zprop_source_t source = (received ? ZPROP_SRC_RECEIVED :
+ ZPROP_SRC_LOCAL);
+ nvlist_t *errors;
+ int error;
+
+ if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl)) != 0)
+ return (error);
+
+ if (received) {
+ nvlist_t *origprops;
+
+ if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) {
+ (void) clear_received_props(zc->zc_name,
+ origprops, nvl);
+ nvlist_free(origprops);
+ }
+
+ error = dsl_prop_set_hasrecvd(zc->zc_name);
+ }
+
+ errors = fnvlist_alloc();
+ if (error == 0)
+ error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors);
+
+ if (zc->zc_nvlist_dst != 0 && errors != NULL) {
+ (void) put_nvlist(zc, errors);
+ }
+
+ nvlist_free(errors);
+ nvlist_free(nvl);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value name of property to inherit
+ * zc_cookie revert to received value if TRUE
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_inherit_prop(zfs_cmd_t *zc)
+{
+ const char *propname = zc->zc_value;
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ boolean_t received = zc->zc_cookie;
+ zprop_source_t source = (received
+ ? ZPROP_SRC_NONE /* revert to received value, if any */
+ : ZPROP_SRC_INHERITED); /* explicitly inherit */
+ nvlist_t *dummy;
+ nvpair_t *pair;
+ zprop_type_t type;
+ int err;
+
+ if (!received) {
+ /*
+		 * Only check this in the non-received case: 'inherit -S' must
+		 * be able to revert non-inheritable properties such as quota
+		 * and reservation to their received or default values.
+ */
+ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (prop == ZPROP_INVAL) {
+ if (!zfs_prop_user(propname))
+ return (SET_ERROR(EINVAL));
+
+ type = PROP_TYPE_STRING;
+ } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) {
+ return (SET_ERROR(EINVAL));
+ } else {
+ type = zfs_prop_get_type(prop);
+ }
+
+ /*
+ * zfs_prop_set_special() expects properties in the form of an
+ * nvpair with type info.
+ */
+ dummy = fnvlist_alloc();
+
+ switch (type) {
+ case PROP_TYPE_STRING:
+ VERIFY(0 == nvlist_add_string(dummy, propname, ""));
+ break;
+ case PROP_TYPE_NUMBER:
+ case PROP_TYPE_INDEX:
+ VERIFY(0 == nvlist_add_uint64(dummy, propname, 0));
+ break;
+ default:
+ err = SET_ERROR(EINVAL);
+ goto errout;
+ }
+
+ pair = nvlist_next_nvpair(dummy, NULL);
+ if (pair == NULL) {
+ err = SET_ERROR(EINVAL);
+ } else {
+ err = zfs_prop_set_special(zc->zc_name, source, pair);
+ if (err == -1) /* property is not "special", needs handling */
+ err = dsl_prop_inherit(zc->zc_name, zc->zc_value,
+ source);
+ }
+
+errout:
+ nvlist_free(dummy);
+ return (err);
+}
+
+static int
+zfs_ioc_pool_set_props(zfs_cmd_t *zc)
+{
+ nvlist_t *props;
+ spa_t *spa;
+ int error;
+ nvpair_t *pair;
+
+ if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props)))
+ return (error);
+
+ /*
+	 * If the only property is the cachefile, then just do a spa_lookup()
+ * to handle the faulted case.
+ */
+ pair = nvlist_next_nvpair(props, NULL);
+ if (pair != NULL && strcmp(nvpair_name(pair),
+ zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
+ nvlist_next_nvpair(props, pair) == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(zc->zc_name)) != NULL) {
+ spa_configfile_set(spa, props, B_FALSE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ }
+ mutex_exit(&spa_namespace_lock);
+ if (spa != NULL) {
+ nvlist_free(props);
+ return (0);
+ }
+ }
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
+ nvlist_free(props);
+ return (error);
+ }
+
+ error = spa_prop_set(spa, props);
+
+ nvlist_free(props);
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_get_props(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *nvp = NULL;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
+ /*
+ * If the pool is faulted, there may be properties we can still
+ * get (such as altroot and cachefile), so attempt to get them
+ * anyway.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(zc->zc_name)) != NULL)
+ error = spa_prop_get(spa, &nvp);
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ error = spa_prop_get(spa, &nvp);
+ spa_close(spa, FTAG);
+ }
+
+ if (error == 0 && zc->zc_nvlist_dst != 0)
+ error = put_nvlist(zc, nvp);
+ else
+ error = SET_ERROR(EFAULT);
+
+ nvlist_free(nvp);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_src{_size} nvlist of delegated permissions
+ * zc_perm_action allow/unallow flag
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_set_fsacl(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *fsaclnv = NULL;
+
+ if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &fsaclnv)) != 0)
+ return (error);
+
+ /*
+ * Verify nvlist is constructed correctly
+ */
+ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) {
+ nvlist_free(fsaclnv);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If we don't have PRIV_SYS_MOUNT, then validate
+	 * that the user is allowed to hand out each permission in
+	 * the nvlist(s).
+ */
+
+ error = secpolicy_zfs(CRED());
+ if (error != 0) {
+ if (zc->zc_perm_action == B_FALSE) {
+ error = dsl_deleg_can_allow(zc->zc_name,
+ fsaclnv, CRED());
+ } else {
+ error = dsl_deleg_can_unallow(zc->zc_name,
+ fsaclnv, CRED());
+ }
+ }
+
+ if (error == 0)
+ error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action);
+
+ nvlist_free(fsaclnv);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * zc_nvlist_src{_size} nvlist of delegated permissions
+ */
+static int
+zfs_ioc_get_fsacl(zfs_cmd_t *zc)
+{
+ nvlist_t *nvp;
+ int error;
+
+ if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) {
+ error = put_nvlist(zc, nvp);
+ nvlist_free(nvp);
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static void
+zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+ zfs_creat_t *zct = arg;
+
+ zfs_create_fs(os, cr, zct->zct_zplprops, tx);
+}
+
+#define ZFS_PROP_UNDEFINED ((uint64_t)-1)
+
+/*
+ * inputs:
+ * os parent objset pointer (NULL if root fs)
+ * fuids_ok fuids allowed in this version of the spa?
+ * sa_ok SAs allowed in this version of the spa?
+ * createprops list of properties requested by creator
+ *
+ * outputs:
+ * zplprops values for the zplprops we attach to the master node object
+ * is_ci true if requested file system will be purely case-insensitive
+ *
+ * Determine the settings for utf8only, normalization and
+ * casesensitivity. Specific values may have been requested by the
+ * creator and/or we can inherit values from the parent dataset. If
+ * the file system is of too early a vintage, a creator cannot
+ * request settings for these properties, even if the requested
+ * setting is the default value. We don't actually want to create dsl
+ * properties for these, so remove them from the source nvlist after
+ * processing.
+ */
+static int
+zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
+ boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
+{
+ uint64_t sense = ZFS_PROP_UNDEFINED;
+ uint64_t norm = ZFS_PROP_UNDEFINED;
+ uint64_t u8 = ZFS_PROP_UNDEFINED;
+ int error;
+
+ ASSERT(zplprops != NULL);
+
+ /* parent dataset must be a filesystem */
+ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS)
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+
+ /*
+ * Pull out creator prop choices, if any.
+ */
+ if (createprops) {
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE));
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_CASE), &sense);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_CASE));
+ }
+
+ /*
+ * If the zpl version requested is whacky or the file system
+	 * or pool version is too "young" to support normalization
+ * and the creator tried to set a value for one of the props,
+ * error out.
+ */
+ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
+ (zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
+ (zplver >= ZPL_VERSION_SA && !sa_ok) ||
+ (zplver < ZPL_VERSION_NORMALIZATION &&
+ (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
+ sense != ZFS_PROP_UNDEFINED)))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Put the version in the zplprops
+ */
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
+
+ if (norm == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0)
+ return (error);
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
+
+ /*
+ * If we're normalizing, names must always be valid UTF-8 strings.
+ */
+ if (norm)
+ u8 = 1;
+ if (u8 == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0)
+ return (error);
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
+
+ if (sense == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0)
+ return (error);
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
+
+ if (is_ci)
+ *is_ci = (sense == ZFS_CASE_INSENSITIVE);
+
+ return (0);
+}
+
+static int
+zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
+{
+ boolean_t fuids_ok, sa_ok;
+ uint64_t zplver = ZPL_VERSION;
+ objset_t *os = NULL;
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ spa_t *spa;
+ uint64_t spa_vers;
+ int error;
+
+ zfs_get_parent(dataset, parentname, sizeof (parentname));
+
+ if ((error = spa_open(dataset, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_vers = spa_version(spa);
+ spa_close(spa, FTAG);
+
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
+
+ /*
+ * Open parent object set so we can inherit zplprop values.
+ */
+ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
+ return (error);
+
+ error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
+ zplprops, is_ci);
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+static int
+zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
+{
+ boolean_t fuids_ok;
+ boolean_t sa_ok;
+ uint64_t zplver = ZPL_VERSION;
+ int error;
+
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
+
+ error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
+ createprops, zplprops, is_ci);
+ return (error);
+}
+
+/*
+ * innvl: {
+ * "type" -> dmu_objset_type_t (int32)
+ * (optional) "props" -> { prop -> value }
+ * (optional) "hidden_args" -> { "wkeydata" -> value }
+ * raw uint8_t array of encryption wrapping key data (32 bytes)
+ * }
+ *
+ * outnvl: propname -> error code (int32)
+ */
+
+static const zfs_ioc_key_t zfs_keys_create[] = {
+ {"type", DATA_TYPE_INT32, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
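+/*
+ * Illustrative example (a sketch, not part of the interface definition; the
+ * values are placeholders): a caller creating a 1 GiB zvol might pass an
+ * innvl along the lines of
+ *
+ *     "type"  -> DMU_OST_ZVOL (int32)
+ *     "props" -> { "volsize" -> 1073741824, "volblocksize" -> 8192 }
+ *
+ * Any property that cannot be applied after the dataset is created is
+ * reported in outnvl as propname -> error code.
+ */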
+
+static int
+zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error = 0;
+ zfs_creat_t zct = { 0 };
+ nvlist_t *nvprops = NULL;
+ nvlist_t *hidden_args = NULL;
+ void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
+ dmu_objset_type_t type;
+ boolean_t is_insensitive = B_FALSE;
+ dsl_crypto_params_t *dcp = NULL;
+
+ type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type");
+ (void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
+ (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
+
+ switch (type) {
+ case DMU_OST_ZFS:
+ cbfunc = zfs_create_cb;
+ break;
+
+ case DMU_OST_ZVOL:
+ cbfunc = zvol_create_cb;
+ break;
+
+ default:
+ cbfunc = NULL;
+ break;
+ }
+ if (strchr(fsname, '@') ||
+ strchr(fsname, '%'))
+ return (SET_ERROR(EINVAL));
+
+ zct.zct_props = nvprops;
+
+ if (cbfunc == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (type == DMU_OST_ZVOL) {
+ uint64_t volsize, volblocksize;
+
+ if (nvprops == NULL)
+ return (SET_ERROR(EINVAL));
+ if (nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+ &volblocksize)) != 0 && error != ENOENT)
+ return (SET_ERROR(EINVAL));
+
+ if (error != 0)
+ volblocksize = zfs_prop_default_numeric(
+ ZFS_PROP_VOLBLOCKSIZE);
+
+ if ((error = zvol_check_volblocksize(fsname,
+ volblocksize)) != 0 ||
+ (error = zvol_check_volsize(volsize,
+ volblocksize)) != 0)
+ return (error);
+ } else if (type == DMU_OST_ZFS) {
+ int error;
+
+ /*
+ * We have to have normalization and
+ * case-folding flags correct when we do the
+ * file system creation, so go figure them out
+ * now.
+ */
+ VERIFY(nvlist_alloc(&zct.zct_zplprops,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ error = zfs_fill_zplprops(fsname, nvprops,
+ zct.zct_zplprops, &is_insensitive);
+ if (error != 0) {
+ nvlist_free(zct.zct_zplprops);
+ return (error);
+ }
+ }
+
+ error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops,
+ hidden_args, &dcp);
+ if (error != 0) {
+ nvlist_free(zct.zct_zplprops);
+ return (error);
+ }
+
+ error = dmu_objset_create(fsname, type,
+ is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct);
+
+ nvlist_free(zct.zct_zplprops);
+ dsl_crypto_params_free(dcp, !!error);
+
+ /*
+ * It would be nice to do this atomically.
+ */
+ if (error == 0) {
+ error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
+ nvprops, outnvl);
+ if (error != 0) {
+ spa_t *spa;
+ int error2;
+
+ /*
+ * Volumes will return EBUSY and cannot be destroyed
+ * until all asynchronous minor handling (e.g. from
+ * setting the volmode property) has completed. Wait for
+ * the spa_zvol_taskq to drain then retry.
+ */
+ error2 = dsl_destroy_head(fsname);
+ while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) {
+ error2 = spa_open(fsname, &spa, FTAG);
+ if (error2 == 0) {
+ taskq_wait(spa->spa_zvol_taskq);
+ spa_close(spa, FTAG);
+ }
+ error2 = dsl_destroy_head(fsname);
+ }
+ }
+ }
+ return (error);
+}
+
+/*
+ * innvl: {
+ * "origin" -> name of origin snapshot
+ * (optional) "props" -> { prop -> value }
+ * (optional) "hidden_args" -> { "wkeydata" -> value }
+ * raw uint8_t array of encryption wrapping key data (32 bytes)
+ * }
+ *
+ * outputs:
+ * outnvl: propname -> error code (int32)
+ */
+static const zfs_ioc_key_t zfs_keys_clone[] = {
+ {"origin", DATA_TYPE_STRING, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
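+/*
+ * Illustrative example (a sketch; the dataset and property names are only
+ * placeholders): cloning "pool/fs@snap" into the new dataset given by fsname
+ * might use an innvl such as
+ *
+ *     "origin" -> "pool/fs@snap"
+ *     "props"  -> { "mountpoint" -> "/export/clone" }
+ *
+ * Properties that fail to apply are reported in outnvl as
+ * propname -> error code.
+ */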
+
+static int
+zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error = 0;
+ nvlist_t *nvprops = NULL;
+ const char *origin_name;
+
+ origin_name = fnvlist_lookup_string(innvl, "origin");
+ (void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
+
+ if (strchr(fsname, '@') ||
+ strchr(fsname, '%'))
+ return (SET_ERROR(EINVAL));
+
+ if (dataset_namecheck(origin_name, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ error = dmu_objset_clone(fsname, origin_name);
+
+ /*
+ * It would be nice to do this atomically.
+ */
+ if (error == 0) {
+ error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
+ nvprops, outnvl);
+ if (error != 0)
+ (void) dsl_destroy_head(fsname);
+ }
+ return (error);
+}
+
+static const zfs_ioc_key_t zfs_keys_remap[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ /* This IOCTL is no longer supported. */
+ return (0);
+}
+
+/*
+ * innvl: {
+ * "snaps" -> { snapshot1, snapshot2 }
+ * (optional) "props" -> { prop -> value (string) }
+ * }
+ *
+ * outnvl: snapshot -> error code (int32)
+ */
+static const zfs_ioc_key_t zfs_keys_snapshot[] = {
+ {"snaps", DATA_TYPE_NVLIST, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
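+/*
+ * Illustrative example (a sketch; names are placeholders): snapshotting two
+ * filesystems of the pool in one request, tagging them with a user property,
+ * might use an innvl such as
+ *
+ *     "snaps" -> { "pool/fs@nightly", "pool/home@nightly" }
+ *     "props" -> { "com.example:note" -> "nightly run" }
+ *
+ * Note that "props" may contain user properties only, and each filesystem may
+ * appear at most once in "snaps".
+ */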
+
+static int
+zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ nvlist_t *snaps;
+ nvlist_t *props = NULL;
+ int error, poollen;
+ nvpair_t *pair;
+
+ (void) nvlist_lookup_nvlist(innvl, "props", &props);
+ if (!nvlist_empty(props) &&
+ zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
+ return (SET_ERROR(ENOTSUP));
+ if ((error = zfs_check_userprops(props)) != 0)
+ return (error);
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+ poollen = strlen(poolname);
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ const char *name = nvpair_name(pair);
+ char *cp = strchr(name, '@');
+
+ /*
+ * The snap name must contain an @, and the part after it must
+ * contain only valid characters.
+ */
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The snap must be in the specified pool.
+ */
+ if (strncmp(name, poolname, poollen) != 0 ||
+ (name[poollen] != '/' && name[poollen] != '@'))
+ return (SET_ERROR(EXDEV));
+
+ /*
+ * Check for permission to set the properties on the fs.
+ */
+ if (!nvlist_empty(props)) {
+ *cp = '\0';
+ error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_USERPROP, CRED());
+ *cp = '@';
+ if (error != 0)
+ return (error);
+ }
+
+ /* This must be the only snap of this fs. */
+ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair);
+ pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) {
+ if (strncmp(name, nvpair_name(pair2), cp - name + 1)
+ == 0) {
+ return (SET_ERROR(EXDEV));
+ }
+ }
+ }
+
+ error = dsl_dataset_snapshot(snaps, props, outnvl);
+
+ return (error);
+}
+
+/*
+ * innvl: "message" -> string
+ */
+static const zfs_ioc_key_t zfs_keys_log_history[] = {
+ {"message", DATA_TYPE_STRING, 0},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ const char *message;
+ char *poolname;
+ spa_t *spa;
+ int error;
+
+ /*
+	 * The poolname in the ioctl is not set; we get it from the TSD,
+ * which was set at the end of the last successful ioctl that allows
+ * logging. The secpolicy func already checked that it is set.
+ * Only one log ioctl is allowed after each successful ioctl, so
+ * we clear the TSD here.
+ */
+ poolname = tsd_get(zfs_allow_log_key);
+ if (poolname == NULL)
+ return (SET_ERROR(EINVAL));
+ (void) tsd_set(zfs_allow_log_key, NULL);
+ error = spa_open(poolname, &spa, FTAG);
+ kmem_strfree(poolname);
+ if (error != 0)
+ return (error);
+
+ message = fnvlist_lookup_string(innvl, "message");
+
+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = spa_history_log(spa, message);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/*
+ * This ioctl is used to set the bootenv configuration on the current
+ * pool. This configuration is stored in the second padding area of the label,
+ * and it is used by the bootloader(s) to store bootloader- and/or
+ * system-specific data.
+ * The data is stored as an nvlist data stream and is protected by
+ * an embedded checksum.
+ * The version can have two possible values:
+ * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING.
+ * VB_NVLIST: nvlist with arbitrary <key, value> pairs.
+ */
+static const zfs_ioc_key_t zfs_keys_set_bootenv[] = {
+ {"version", DATA_TYPE_UINT64, 0},
+ {"<keys>", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST},
+};
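+/*
+ * Illustrative example (a sketch; the environment string is a placeholder):
+ * a VB_RAW request might look like
+ *
+ *     "version"   -> VB_RAW (uint64)
+ *     GRUB_ENVMAP -> "<bootloader environment as a single string>"
+ *
+ * whereas a VB_NVLIST request carries arbitrary <key, value> pairs alongside
+ * the "version" key.
+ */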
+
+static int
+zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error;
+ spa_t *spa;
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static const zfs_ioc_key_t zfs_keys_get_bootenv[] = {
+ /* no nvl keys */
+};
+
+static int
+zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/*
+ * The dp_config_rwlock must not be held when calling this, because the
+ * unmount may need to write out data.
+ *
+ * This function is best-effort. Callers must deal gracefully if it
+ * remains mounted (or is remounted after this call).
+ *
+ * The function returns nothing; the result of the underlying unmount attempt
+ * is deliberately ignored, and a name without an '@' is simply a no-op.
+ */
+void
+zfs_unmount_snap(const char *snapname)
+{
+ if (strchr(snapname, '@') == NULL)
+ return;
+
+ (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE);
+}
+
+/* ARGSUSED */
+static int
+zfs_unmount_snap_cb(const char *snapname, void *arg)
+{
+ zfs_unmount_snap(snapname);
+ return (0);
+}
+
+/*
+ * When a clone is destroyed, its origin may also need to be destroyed,
+ * in which case it must be unmounted. This routine will do that unmount
+ * if necessary.
+ */
+void
+zfs_destroy_unmount_origin(const char *fsname)
+{
+ int error;
+ objset_t *os;
+ dsl_dataset_t *ds;
+
+ error = dmu_objset_hold(fsname, FTAG, &os);
+ if (error != 0)
+ return;
+ ds = dmu_objset_ds(os);
+ if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
+ char originname[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(ds->ds_prev, originname);
+ dmu_objset_rele(os, FTAG);
+ zfs_unmount_snap(originname);
+ } else {
+ dmu_objset_rele(os, FTAG);
+ }
+}
+
+/*
+ * innvl: {
+ * "snaps" -> { snapshot1, snapshot2 }
+ * (optional boolean) "defer"
+ * }
+ *
+ * outnvl: snapshot -> error code (int32)
+ */
+static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = {
+ {"snaps", DATA_TYPE_NVLIST, 0},
+ {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+};
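+/*
+ * Illustrative example (a sketch; names are placeholders): destroying two
+ * snapshots with deferred destruction might use an innvl such as
+ *
+ *     "snaps" -> { "pool/fs@old1", "pool/fs@old2" }
+ *     "defer" -> (present as a boolean flag)
+ *
+ * Both snapshots must belong to the pool named by poolname.
+ */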
+
+/* ARGSUSED */
+static int
+zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int poollen;
+ nvlist_t *snaps;
+ nvpair_t *pair;
+ boolean_t defer;
+ spa_t *spa;
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+ defer = nvlist_exists(innvl, "defer");
+
+ poollen = strlen(poolname);
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ const char *name = nvpair_name(pair);
+
+ /*
+ * The snap must be in the specified pool to prevent the
+ * invalid removal of zvol minors below.
+ */
+ if (strncmp(name, poolname, poollen) != 0 ||
+ (name[poollen] != '/' && name[poollen] != '@'))
+ return (SET_ERROR(EXDEV));
+
+ zfs_unmount_snap(nvpair_name(pair));
+ if (spa_open(name, &spa, FTAG) == 0) {
+ zvol_remove_minors(spa, name, B_TRUE);
+ spa_close(spa, FTAG);
+ }
+ }
+
+ return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
+}
+
+/*
+ * Create bookmarks. The bookmark names are of the form <fs>#<bmark>.
+ * All bookmarks and snapshots must be in the same pool.
+ * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail.
+ *
+ * innvl: {
+ * new_bookmark1 -> existing_snapshot,
+ * new_bookmark2 -> existing_bookmark,
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_bookmark[] = {
+ {"<bookmark>...", DATA_TYPE_STRING, ZK_WILDCARDLIST},
+};
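+/*
+ * Illustrative example (a sketch; names are placeholders): creating one
+ * bookmark from a snapshot and a second one by copying an existing bookmark
+ * might use an innvl such as
+ *
+ *     "pool/fs#monday"      -> "pool/fs@monday"
+ *     "pool/fs#monday-copy" -> "pool/fs#monday"
+ */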
+
+/* ARGSUSED */
+static int
+zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (dsl_bookmark_create(innvl, outnvl));
+}
+
+/*
+ * innvl: {
+ * property 1, property 2, ...
+ * }
+ *
+ * outnvl: {
+ * bookmark name 1 -> { property 1, property 2, ... },
+ * bookmark name 2 -> { property 1, property 2, ... }
+ * }
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = {
+ {"<property>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (dsl_get_bookmarks(fsname, innvl, outnvl));
+}
+
+/*
+ * innvl is not used.
+ *
+ * outnvl: {
+ * property 1, property 2, ...
+ * }
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ char *bmname;
+
+ bmname = strchr(bookmark, '#');
+ if (bmname == NULL)
+ return (SET_ERROR(EINVAL));
+ bmname++;
+
+ (void) strlcpy(fsname, bookmark, sizeof (fsname));
+ *(strchr(fsname, '#')) = '\0';
+
+ return (dsl_get_bookmark_props(fsname, bmname, outnvl));
+}
+
+/*
+ * innvl: {
+ * bookmark name 1, bookmark name 2
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = {
+ {"<bookmark>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST},
+};
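+/*
+ * Illustrative example (a sketch; names are placeholders): destroying two
+ * bookmarks in one request might use an innvl such as
+ *
+ *     "pool/fs#monday"      -> (boolean)
+ *     "pool/fs#monday-copy" -> (boolean)
+ *
+ * Both bookmarks must belong to the pool named by poolname.
+ */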
+
+static int
+zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ int error, poollen;
+
+ poollen = strlen(poolname);
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ const char *name = nvpair_name(pair);
+ const char *cp = strchr(name, '#');
+
+ /*
+		 * The bookmark name must contain a '#', and the part after it
+ * must contain only valid characters.
+ */
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The bookmark must be in the specified pool.
+ */
+ if (strncmp(name, poolname, poollen) != 0 ||
+ (name[poollen] != '/' && name[poollen] != '#'))
+ return (SET_ERROR(EXDEV));
+ }
+
+ error = dsl_bookmark_destroy(innvl, outnvl);
+ return (error);
+}
+
+static const zfs_ioc_key_t zfs_keys_channel_program[] = {
+ {"program", DATA_TYPE_STRING, 0},
+ {"arg", DATA_TYPE_ANY, 0},
+ {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
+ {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
+};
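+/*
+ * Illustrative example (a sketch; the Lua source and limits are placeholders):
+ * running a trivial synchronous channel program might use an innvl such as
+ *
+ *     "program"    -> "return {}" (Lua source text)
+ *     "arg"        -> { ... }     (arguments made available to the script)
+ *     "sync"       -> B_TRUE      (optional; this is the default)
+ *     "instrlimit" -> 10000000    (optional; bounded by zfs_lua_max_instrlimit)
+ *     "memlimit"   -> 10485760    (optional; bounded by zfs_lua_max_memlimit)
+ */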
+
+static int
+zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ char *program;
+ uint64_t instrlimit, memlimit;
+ boolean_t sync_flag;
+ nvpair_t *nvarg = NULL;
+
+ program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM);
+ if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) {
+ sync_flag = B_TRUE;
+ }
+ if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) {
+ instrlimit = ZCP_DEFAULT_INSTRLIMIT;
+ }
+ if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) {
+ memlimit = ZCP_DEFAULT_MEMLIMIT;
+ }
+ nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST);
+
+ if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit)
+ return (SET_ERROR(EINVAL));
+ if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
+ return (SET_ERROR(EINVAL));
+
+ return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit,
+ nvarg, outnvl));
+}
+
+/*
+ * innvl: unused
+ * outnvl: empty
+ */
+static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (spa_checkpoint(poolname));
+}
+
+/*
+ * innvl: unused
+ * outnvl: empty
+ */
+static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ return (spa_checkpoint_discard(poolname));
+}
+
+/*
+ * inputs:
+ * zc_name name of dataset to destroy
+ * zc_defer_destroy mark for deferred destroy
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_destroy(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ dmu_objset_type_t ost;
+ int err;
+
+ err = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (err != 0)
+ return (err);
+ ost = dmu_objset_type(os);
+ dmu_objset_rele(os, FTAG);
+
+ if (ost == DMU_OST_ZFS)
+ zfs_unmount_snap(zc->zc_name);
+
+ if (strchr(zc->zc_name, '@')) {
+ err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
+ } else {
+ err = dsl_destroy_head(zc->zc_name);
+ if (err == EEXIST) {
+ /*
+			 * The given dataset may have hidden child (%recv)
+			 * datasets - "leftovers" from a previously
+			 * interrupted 'zfs receive'.
+ *
+ * 6 extra bytes for /%recv
+ */
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ if (snprintf(namebuf, sizeof (namebuf), "%s/%s",
+ zc->zc_name, recv_clone_name) >=
+ sizeof (namebuf))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Try to remove the hidden child (%recv) and after
+ * that try to remove the target dataset.
+			 * If the hidden child (%recv) does not exist,
+			 * the original error (EEXIST) is returned.
+ */
+ err = dsl_destroy_head(namebuf);
+ if (err == 0)
+ err = dsl_destroy_head(zc->zc_name);
+ else if (err == ENOENT)
+ err = SET_ERROR(EEXIST);
+ }
+ }
+
+ return (err);
+}
+
+/*
+ * innvl: {
+ * "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64)
+ * "initialize_vdevs": { -> guids to initialize (nvlist)
+ * "vdev_path_1": vdev_guid_1, (uint64),
+ * "vdev_path_2": vdev_guid_2, (uint64),
+ * ...
+ * },
+ * }
+ *
+ * outnvl: {
+ * "initialize_vdevs": { -> initialization errors (nvlist)
+ * "vdev_path_1": errno, see function body for possible errnos (uint64)
+ * "vdev_path_2": errno, ... (uint64)
+ * ...
+ * }
+ * }
+ *
+ * EINVAL is returned for an unknown command or if any of the provided vdev
+ * guids have been specified with a type other than uint64.
+ */
+static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
+ {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0},
+ {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0}
+};
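+/*
+ * Illustrative example (a sketch; the vdev path and guid are placeholders):
+ * starting initialization on a single vdev might use an innvl such as
+ *
+ *     "initialize_command" -> POOL_INITIALIZE_START (uint64)
+ *     "initialize_vdevs"   -> { "/dev/sda1" -> 0x14d2e0f4ab6b2c7e }
+ *
+ * Per-vdev failures come back in outnvl under "initialize_vdevs".
+ */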
+
+static int
+zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ uint64_t cmd_type;
+ if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
+ &cmd_type) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
+ cmd_type == POOL_INITIALIZE_START ||
+ cmd_type == POOL_INITIALIZE_SUSPEND)) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ nvlist_t *vdev_guids;
+ if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
+ &vdev_guids) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+ uint64_t vdev_guid;
+ if (nvpair_value_uint64(pair, &vdev_guid) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ spa_t *spa;
+ int error = spa_open(poolname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ nvlist_t *vdev_errlist = fnvlist_alloc();
+ int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type,
+ vdev_errlist);
+
+ if (fnvlist_size(vdev_errlist) > 0) {
+ fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
+ vdev_errlist);
+ }
+ fnvlist_free(vdev_errlist);
+
+ spa_close(spa, FTAG);
+ return (total_errors > 0 ? EINVAL : 0);
+}
+
+/*
+ * innvl: {
+ * "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64)
+ * "trim_vdevs": { -> guids to TRIM (nvlist)
+ * "vdev_path_1": vdev_guid_1, (uint64),
+ * "vdev_path_2": vdev_guid_2, (uint64),
+ * ...
+ * },
+ * "trim_rate" -> Target TRIM rate in bytes/sec.
+ * "trim_secure" -> Set to request a secure TRIM.
+ * }
+ *
+ * outnvl: {
+ * "trim_vdevs": { -> TRIM errors (nvlist)
+ * "vdev_path_1": errno, see function body for possible errnos (uint64)
+ * "vdev_path_2": errno, ... (uint64)
+ * ...
+ * }
+ * }
+ *
+ * EINVAL is returned for an unknown command or if any of the provided vdev
+ * guids have been specified with a type other than uint64.
+ */
+static const zfs_ioc_key_t zfs_keys_pool_trim[] = {
+ {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0},
+ {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0},
+ {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
+};
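+/*
+ * Illustrative example (a sketch; the vdev path, guid and rate are
+ * placeholders): starting a rate-limited secure TRIM on one vdev might use
+ *
+ *     "trim_command" -> POOL_TRIM_START (uint64)
+ *     "trim_vdevs"   -> { "/dev/sda1" -> 0x14d2e0f4ab6b2c7e }
+ *     "trim_rate"    -> 104857600 (bytes/sec; optional)
+ *     "trim_secure"  -> B_TRUE (optional)
+ *
+ * Per-vdev failures come back in outnvl under "trim_vdevs".
+ */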
+
+static int
+zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ uint64_t cmd_type;
+ if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (!(cmd_type == POOL_TRIM_CANCEL ||
+ cmd_type == POOL_TRIM_START ||
+ cmd_type == POOL_TRIM_SUSPEND)) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ nvlist_t *vdev_guids;
+ if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0)
+ return (SET_ERROR(EINVAL));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+ uint64_t vdev_guid;
+ if (nvpair_value_uint64(pair, &vdev_guid) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ /* Optional, defaults to maximum rate when not provided */
+ uint64_t rate;
+ if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0)
+ rate = 0;
+
+ /* Optional, defaults to standard TRIM when not provided */
+ boolean_t secure;
+ if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE,
+ &secure) != 0) {
+ secure = B_FALSE;
+ }
+
+ spa_t *spa;
+ int error = spa_open(poolname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ nvlist_t *vdev_errlist = fnvlist_alloc();
+ int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type,
+ rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist);
+
+ if (fnvlist_size(vdev_errlist) > 0)
+ fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist);
+
+ fnvlist_free(vdev_errlist);
+
+ spa_close(spa, FTAG);
+ return (total_errors > 0 ? EINVAL : 0);
+}
+
+/*
+ * This ioctl waits for activity of a particular type to complete. If there is
+ * no activity of that type in progress, it returns immediately, and the
+ * returned value "waited" is false. If there is activity in progress, and no
+ * tag is passed in, the ioctl blocks until all activity of that type is
+ * complete, and then returns with "waited" set to true.
+ *
+ * If a tag is provided, it identifies a particular instance of an activity to
+ * wait for. Currently, this is only valid for use with 'initialize', because
+ * that is the only activity for which there can be multiple instances running
+ * concurrently. In the case of 'initialize', the tag corresponds to the guid of
+ * the vdev on which to wait.
+ *
+ * If a thread waiting in the ioctl receives a signal, the call will return
+ * immediately, and the return value will be EINTR.
+ *
+ * innvl: {
+ * "wait_activity" -> int32_t
+ * (optional) "wait_tag" -> uint64_t
+ * }
+ *
+ * outnvl: "waited" -> boolean_t
+ */
+static const zfs_ioc_key_t zfs_keys_pool_wait[] = {
+ {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0},
+ {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL},
+};
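+/*
+ * Illustrative example (a sketch; the guid is a placeholder): waiting for an
+ * initialization pass on a particular vdev to finish might use an innvl such
+ * as
+ *
+ *     "wait_activity" -> ZPOOL_WAIT_INITIALIZE (int32)
+ *     "wait_tag"      -> 0x14d2e0f4ab6b2c7e (guid of the vdev)
+ *
+ * and outnvl then reports "waited" -> B_TRUE if the call actually blocked.
+ */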
+
+static int
+zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int32_t activity;
+ uint64_t tag;
+ boolean_t waited;
+ int error;
+
+ if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0)
+ return (EINVAL);
+
+ if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0)
+ error = spa_wait_tag(name, activity, tag, &waited);
+ else
+ error = spa_wait(name, activity, &waited);
+
+ if (error == 0)
+ fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited);
+
+ return (error);
+}
+
+/*
+ * This ioctl waits for activity of a particular type to complete. If there is
+ * no activity of that type in progress, it returns immediately, and the
+ * returned value "waited" is false. If there is activity in progress, and no
+ * tag is passed in, the ioctl blocks until all activity of that type is
+ * complete, and then returns with "waited" set to true.
+ *
+ * If a thread waiting in the ioctl receives a signal, the call will return
+ * immediately, and the return value will be EINTR.
+ *
+ * innvl: {
+ * "wait_activity" -> int32_t
+ * }
+ *
+ * outnvl: "waited" -> boolean_t
+ */
+static const zfs_ioc_key_t zfs_keys_fs_wait[] = {
+ {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0},
+};
+
+static int
+zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int32_t activity;
+ boolean_t waited = B_FALSE;
+ int error;
+ dsl_pool_t *dp;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+
+ if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0)
+ return (error);
+
+ if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dd = ds->ds_dir;
+ mutex_enter(&dd->dd_activity_lock);
+ dd->dd_activity_waiters++;
+
+ /*
+ * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t
+ * aren't evicted while we're waiting. Normally this is prevented by
+ * holding the pool, but we can't do that while we're waiting since
+ * that would prevent TXGs from syncing out. Some of the functionality
+ * of long-holds (e.g. preventing deletion) is unnecessary for this
+ * case, since we would cancel the waiters before proceeding with a
+ * deletion. An alternative mechanism for keeping the dataset around
+ * could be developed but this is simpler.
+ */
+ dsl_dataset_long_hold(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ error = dsl_dir_wait(dd, ds, activity, &waited);
+
+ dsl_dataset_long_rele(ds, FTAG);
+ dd->dd_activity_waiters--;
+ if (dd->dd_activity_waiters == 0)
+ cv_signal(&dd->dd_activity_cv);
+ mutex_exit(&dd->dd_activity_lock);
+
+ dsl_dataset_rele(ds, FTAG);
+
+ if (error == 0)
+ fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited);
+
+ return (error);
+}
+
+/*
+ * fsname is the name of the dataset to roll back (to its most recent
+ * snapshot).
+ *
+ * innvl may contain the name of the expected target snapshot.
+ *
+ * outnvl: "target" -> name of most recent snapshot
+ */
+static const zfs_ioc_key_t zfs_keys_rollback[] = {
+ {"target", DATA_TYPE_STRING, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ zfsvfs_t *zfsvfs;
+ zvol_state_handle_t *zv;
+ char *target = NULL;
+ int error;
+
+ (void) nvlist_lookup_string(innvl, "target", &target);
+ if (target != NULL) {
+ const char *cp = strchr(target, '@');
+
+ /*
+ * The snap name must contain an @, and the part after it must
+ * contain only valid characters.
+ */
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (getzfsvfs(fsname, &zfsvfs) == 0) {
+ dsl_dataset_t *ds;
+
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ error = zfs_suspend_fs(zfsvfs);
+ if (error == 0) {
+ int resume_err;
+
+ error = dsl_dataset_rollback(fsname, target, zfsvfs,
+ outnvl);
+ resume_err = zfs_resume_fs(zfsvfs, ds);
+ error = error ? error : resume_err;
+ }
+ zfs_vfs_rele(zfsvfs);
+ } else if ((zv = zvol_suspend(fsname)) != NULL) {
+ error = dsl_dataset_rollback(fsname, target, zvol_tag(zv),
+ outnvl);
+ zvol_resume(zv);
+ } else {
+ error = dsl_dataset_rollback(fsname, target, NULL, outnvl);
+ }
+ return (error);
+}
+
+static int
+recursive_unmount(const char *fsname, void *arg)
+{
+ const char *snapname = arg;
+ char *fullname;
+
+ fullname = kmem_asprintf("%s@%s", fsname, snapname);
+ zfs_unmount_snap(fullname);
+ kmem_strfree(fullname);
+
+ return (0);
+}
+
+/*
+ *
+ * snapname is the snapshot to redact.
+ * innvl: {
+ * "bookname" -> (string)
+ * shortname of the redaction bookmark to generate
+ * "snapnv" -> (nvlist, values ignored)
+ * snapshots to redact snapname with respect to
+ * }
+ *
+ * outnvl is unused
+ */
+
+/* ARGSUSED */
+static const zfs_ioc_key_t zfs_keys_redact[] = {
+ {"bookname", DATA_TYPE_STRING, 0},
+ {"snapnv", DATA_TYPE_NVLIST, 0},
+};
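+/*
+ * Illustrative example (a sketch; names are placeholders): redacting
+ * "pool/fs@snap" with respect to two clone snapshots might use an innvl such
+ * as
+ *
+ *     "bookname" -> "book1"
+ *     "snapnv"   -> { "pool/clone1@snap" -> (ignored),
+ *                     "pool/clone2@snap" -> (ignored) }
+ */
+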
+static int
+zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ nvlist_t *redactnvl = NULL;
+ char *redactbook = NULL;
+
+ if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0)
+ return (SET_ERROR(EINVAL));
+ if (fnvlist_num_pairs(redactnvl) == 0)
+ return (SET_ERROR(ENXIO));
+ if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0)
+ return (SET_ERROR(EINVAL));
+
+ return (dmu_redact_snap(snapname, redactnvl, redactbook));
+}
+
+/*
+ * inputs:
+ * zc_name old name of dataset
+ * zc_value new name of dataset
+ * zc_cookie recursive flag (only valid for snapshots)
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_rename(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ dmu_objset_type_t ost;
+ boolean_t recursive = zc->zc_cookie & 1;
+ boolean_t nounmount = !!(zc->zc_cookie & 2);
+ char *at;
+ int err;
+
+ /* "zfs rename" from and to ...%recv datasets should both fail */
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
+ dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
+ strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%'))
+ return (SET_ERROR(EINVAL));
+
+ err = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (err != 0)
+ return (err);
+ ost = dmu_objset_type(os);
+ dmu_objset_rele(os, FTAG);
+
+ at = strchr(zc->zc_name, '@');
+ if (at != NULL) {
+ /* snaps must be in same fs */
+ int error;
+
+ if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1))
+ return (SET_ERROR(EXDEV));
+ *at = '\0';
+ if (ost == DMU_OST_ZFS && !nounmount) {
+ error = dmu_objset_find(zc->zc_name,
+ recursive_unmount, at + 1,
+ recursive ? DS_FIND_CHILDREN : 0);
+ if (error != 0) {
+ *at = '@';
+ return (error);
+ }
+ }
+ error = dsl_dataset_rename_snapshot(zc->zc_name,
+ at + 1, strchr(zc->zc_value, '@') + 1, recursive);
+ *at = '@';
+
+ return (error);
+ } else {
+ return (dsl_dir_rename(zc->zc_name, zc->zc_value));
+ }
+}
+
+static int
+zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
+{
+ const char *propname = nvpair_name(pair);
+ boolean_t issnap = (strchr(dsname, '@') != NULL);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ uint64_t intval, compval;
+ int err;
+
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_user(propname)) {
+ if ((err = zfs_secpolicy_write_perms(dsname,
+ ZFS_DELEG_PERM_USERPROP, cr)))
+ return (err);
+ return (0);
+ }
+
+ if (!issnap && zfs_prop_userquota(propname)) {
+ const char *perm = NULL;
+ const char *uq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
+ const char *gq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];
+ const char *uiq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA];
+ const char *giq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA];
+ const char *pq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA];
+ const char *piq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA];
+
+ if (strncmp(propname, uq_prefix,
+ strlen(uq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_USERQUOTA;
+ } else if (strncmp(propname, uiq_prefix,
+ strlen(uiq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_USEROBJQUOTA;
+ } else if (strncmp(propname, gq_prefix,
+ strlen(gq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_GROUPQUOTA;
+ } else if (strncmp(propname, giq_prefix,
+ strlen(giq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_GROUPOBJQUOTA;
+ } else if (strncmp(propname, pq_prefix,
+ strlen(pq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_PROJECTQUOTA;
+ } else if (strncmp(propname, piq_prefix,
+ strlen(piq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA;
+ } else {
+ /* {USER|GROUP|PROJECT}USED are read-only */
+ return (SET_ERROR(EINVAL));
+ }
+
+ if ((err = zfs_secpolicy_write_perms(dsname, perm, cr)))
+ return (err);
+ return (0);
+ }
+
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (issnap)
+ return (SET_ERROR(EINVAL));
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ /*
+ * dsl_prop_get_all_impl() returns properties in this
+ * format.
+ */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
+
+ /*
+ * Check that this value is valid for this pool version
+ */
+ switch (prop) {
+ case ZFS_PROP_COMPRESSION:
+ /*
+ * If the user specified gzip compression, make sure
+ * the SPA supports it. We ignore any errors here since
+ * we'll catch them later.
+ */
+ if (nvpair_value_uint64(pair, &intval) == 0) {
+ compval = ZIO_COMPRESS_ALGO(intval);
+ if (compval >= ZIO_COMPRESS_GZIP_1 &&
+ compval <= ZIO_COMPRESS_GZIP_9 &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_GZIP_COMPRESSION)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ if (compval == ZIO_COMPRESS_ZLE &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_ZLE_COMPRESSION))
+ return (SET_ERROR(ENOTSUP));
+
+ if (compval == ZIO_COMPRESS_LZ4) {
+ spa_t *spa;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LZ4_COMPRESS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+
+ if (compval == ZIO_COMPRESS_ZSTD) {
+ spa_t *spa;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_ZSTD_COMPRESS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ }
+ break;
+
+ case ZFS_PROP_COPIES:
+ if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+ break;
+
+ case ZFS_PROP_VOLBLOCKSIZE:
+ case ZFS_PROP_RECORDSIZE:
+ /* Record sizes above 128k need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval > SPA_OLD_MAXBLOCKSIZE) {
+ spa_t *spa;
+
+ /*
+ * We don't allow setting the property above 1MB,
+ * unless the tunable has been changed.
+ */
+ if (intval > zfs_max_recordsize ||
+ intval > SPA_MAXBLOCKSIZE)
+ return (SET_ERROR(ERANGE));
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
+ case ZFS_PROP_DNODESIZE:
+ /* Dnode sizes above 512 need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval != ZFS_DNSIZE_LEGACY) {
+ spa_t *spa;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_DNODE)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
+ case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
+ /*
+ * Setting this property could require the allocation
+ * classes feature to be active; however, we allow it so
+ * that tests of settable properties succeed.
+ * The CLI will issue a warning in this case.
+ */
+ break;
+
+ case ZFS_PROP_SHARESMB:
+ if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
+ return (SET_ERROR(ENOTSUP));
+ break;
+
+ case ZFS_PROP_ACLINHERIT:
+ if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(pair, &intval) == 0) {
+ if (intval == ZFS_ACL_PASSTHROUGH_X &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_PASSTHROUGH_X))
+ return (SET_ERROR(ENOTSUP));
+ }
+ break;
+ case ZFS_PROP_CHECKSUM:
+ case ZFS_PROP_DEDUP:
+ {
+ spa_feature_t feature;
+ spa_t *spa;
+ int err;
+
+ /* dedup feature version checks */
+ if (prop == ZFS_PROP_DEDUP &&
+ zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+ return (SET_ERROR(ENOTSUP));
+
+ if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(pair, &intval) == 0) {
+ /* check prop value is enabled in features */
+ feature = zio_checksum_to_feature(
+ intval & ZIO_CHECKSUM_MASK);
+ if (feature == SPA_FEATURE_NONE)
+ break;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa, feature)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
+}
+
+/*
+ * Removes properties from the given props list that fail permission checks
+ * needed to clear them and to restore them in case of a receive error. For each
+ * property, make sure we have both set and inherit permissions.
+ *
+ * Returns the first error encountered if any permission checks fail. If the
+ * caller provides a non-NULL errlist, it also gives the complete list of names
+ * of all the properties that failed a permission check along with the
+ * corresponding error numbers. The caller is responsible for freeing the
+ * returned errlist.
+ *
+ * If every property checks out successfully, zero is returned and the list
+ * pointed at by errlist is NULL.
+ */
+static int
+zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist)
+{
+ zfs_cmd_t *zc;
+ nvpair_t *pair, *next_pair;
+ nvlist_t *errors;
+ int err, rv = 0;
+
+ if (props == NULL)
+ return (0);
+
+ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name));
+ pair = nvlist_next_nvpair(props, NULL);
+ while (pair != NULL) {
+ next_pair = nvlist_next_nvpair(props, pair);
+
+ (void) strlcpy(zc->zc_value, nvpair_name(pair),
+ sizeof (zc->zc_value));
+ if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
+ (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) {
+ VERIFY(nvlist_remove_nvpair(props, pair) == 0);
+ VERIFY(nvlist_add_int32(errors,
+ zc->zc_value, err) == 0);
+ }
+ pair = next_pair;
+ }
+ kmem_free(zc, sizeof (zfs_cmd_t));
+
+ if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
+ nvlist_free(errors);
+ errors = NULL;
+ } else {
+ VERIFY(nvpair_value_int32(pair, &rv) == 0);
+ }
+
+ if (errlist == NULL)
+ nvlist_free(errors);
+ else
+ *errlist = errors;
+
+ return (rv);
+}
+
+static boolean_t
+propval_equals(nvpair_t *p1, nvpair_t *p2)
+{
+ if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
+ /* dsl_prop_get_all_impl() format */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(p1, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &p1) == 0);
+ }
+
+ if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(p2, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &p2) == 0);
+ }
+
+ if (nvpair_type(p1) != nvpair_type(p2))
+ return (B_FALSE);
+
+ if (nvpair_type(p1) == DATA_TYPE_STRING) {
+ char *valstr1, *valstr2;
+
+ VERIFY(nvpair_value_string(p1, &valstr1) == 0);
+ VERIFY(nvpair_value_string(p2, &valstr2) == 0);
+ return (strcmp(valstr1, valstr2) == 0);
+ } else {
+ uint64_t intval1, intval2;
+
+ VERIFY(nvpair_value_uint64(p1, &intval1) == 0);
+ VERIFY(nvpair_value_uint64(p2, &intval2) == 0);
+ return (intval1 == intval2);
+ }
+}
+
+/*
+ * Remove properties from props if they are not going to change (as determined
+ * by comparison with origprops). Remove them from origprops as well, since we
+ * do not need to clear or restore properties that won't change.
+ */
+static void
+props_reduce(nvlist_t *props, nvlist_t *origprops)
+{
+ nvpair_t *pair, *next_pair;
+
+ if (origprops == NULL)
+ return; /* all props need to be received */
+
+ pair = nvlist_next_nvpair(props, NULL);
+ while (pair != NULL) {
+ const char *propname = nvpair_name(pair);
+ nvpair_t *match;
+
+ next_pair = nvlist_next_nvpair(props, pair);
+
+ if ((nvlist_lookup_nvpair(origprops, propname,
+ &match) != 0) || !propval_equals(pair, match))
+ goto next; /* need to set received value */
+
+ /* don't clear the existing received value */
+ (void) nvlist_remove_nvpair(origprops, match);
+ /* don't bother receiving the property */
+ (void) nvlist_remove_nvpair(props, pair);
+next:
+ pair = next_pair;
+ }
+}
+
+/*
+ * Extract properties that cannot be set PRIOR to the receipt of a dataset.
+ * For example, refquota cannot be set until after the receipt of a dataset,
+ * because in replication streams, an older/earlier snapshot may exceed the
+ * refquota. We want to receive the older/earlier snapshot, but setting
+ * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent
+ * the older/earlier snapshot from being received (with EDQUOT).
+ *
+ * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
+ *
+ * libzfs will need to be judicious in handling errors encountered for
+ * properties extracted by this function.
+ */
+static nvlist_t *
+extract_delay_props(nvlist_t *props)
+{
+ nvlist_t *delayprops;
+ nvpair_t *nvp, *tmp;
+ static const zfs_prop_t delayable[] = {
+ ZFS_PROP_REFQUOTA,
+ ZFS_PROP_KEYLOCATION,
+ 0
+ };
+ int i;
+
+ VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(props, nvp)) {
+ /*
+ * strcmp() is safe because zfs_prop_to_name() always returns
+ * a bounded string.
+ */
+ for (i = 0; delayable[i] != 0; i++) {
+ if (strcmp(zfs_prop_to_name(delayable[i]),
+ nvpair_name(nvp)) == 0) {
+ break;
+ }
+ }
+ if (delayable[i] != 0) {
+ tmp = nvlist_prev_nvpair(props, nvp);
+ VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
+ VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
+ nvp = tmp;
+ }
+ }
+
+ if (nvlist_empty(delayprops)) {
+ nvlist_free(delayprops);
+ delayprops = NULL;
+ }
+ return (delayprops);
+}
+
+static void
+zfs_allow_log_destroy(void *arg)
+{
+ char *poolname = arg;
+
+ if (poolname != NULL)
+ kmem_strfree(poolname);
+}
+
+#ifdef ZFS_DEBUG
+static boolean_t zfs_ioc_recv_inject_err;
+#endif
+
+/*
+ * nvlist 'errors' is always allocated. It will contain descriptions of
+ * encountered errors, if any. It is the caller's responsibility to free.
+ */
+static int
+zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
+ nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force,
+ boolean_t resumable, int input_fd,
+ dmu_replay_record_t *begin_record, uint64_t *read_bytes,
+ uint64_t *errflags, nvlist_t **errors)
+{
+ dmu_recv_cookie_t drc;
+ int error = 0;
+ int props_error = 0;
+ offset_t off, noff;
+ nvlist_t *local_delayprops = NULL;
+ nvlist_t *recv_delayprops = NULL;
+ nvlist_t *origprops = NULL; /* existing properties */
+ nvlist_t *origrecvd = NULL; /* existing received properties */
+ boolean_t first_recvd_props = B_FALSE;
+ boolean_t tofs_was_redacted;
+ zfs_file_t *input_fp;
+
+ *read_bytes = 0;
+ *errflags = 0;
+ *errors = fnvlist_alloc();
+ off = 0;
+
+ if ((error = zfs_file_get(input_fd, &input_fp)))
+ return (error);
+
+ noff = off = zfs_file_off(input_fp);
+ error = dmu_recv_begin(tofs, tosnap, begin_record, force,
+ resumable, localprops, hidden_args, origin, &drc, input_fp,
+ &off);
+ if (error != 0)
+ goto out;
+ tofs_was_redacted = dsl_get_redacted(drc.drc_ds);
+
+ /*
+ * Set properties before we receive the stream so that they are applied
+ * to the new data. Note that we must call dmu_recv_stream() if
+ * dmu_recv_begin() succeeds.
+ */
+ if (recvprops != NULL && !drc.drc_newfs) {
+ if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
+ SPA_VERSION_RECVD_PROPS &&
+ !dsl_prop_get_hasrecvd(tofs))
+ first_recvd_props = B_TRUE;
+
+ /*
+ * If new received properties are supplied, they are to
+ * completely replace the existing received properties,
+ * so stash away the existing ones.
+ */
+ if (dsl_prop_get_received(tofs, &origrecvd) == 0) {
+ nvlist_t *errlist = NULL;
+ /*
+ * Don't bother writing a property if its value won't
+ * change (and avoid the unnecessary security checks).
+ *
+ * The first receive after SPA_VERSION_RECVD_PROPS is a
+ * special case where we blow away all local properties
+ * regardless.
+ */
+ if (!first_recvd_props)
+ props_reduce(recvprops, origrecvd);
+ if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0)
+ (void) nvlist_merge(*errors, errlist, 0);
+ nvlist_free(errlist);
+
+ if (clear_received_props(tofs, origrecvd,
+ first_recvd_props ? NULL : recvprops) != 0)
+ *errflags |= ZPROP_ERR_NOCLEAR;
+ } else {
+ *errflags |= ZPROP_ERR_NOCLEAR;
+ }
+ }
+
+ /*
+ * Stash away existing properties so we can restore them on error unless
+ * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which
+ * case "origrecvd" will take care of that.
+ */
+ if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) {
+ objset_t *os;
+ if (dmu_objset_hold(tofs, FTAG, &os) == 0) {
+ if (dsl_prop_get_all(os, &origprops) != 0) {
+ *errflags |= ZPROP_ERR_NOCLEAR;
+ }
+ dmu_objset_rele(os, FTAG);
+ } else {
+ *errflags |= ZPROP_ERR_NOCLEAR;
+ }
+ }
+
+ if (recvprops != NULL) {
+ props_error = dsl_prop_set_hasrecvd(tofs);
+
+ if (props_error == 0) {
+ recv_delayprops = extract_delay_props(recvprops);
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+ recvprops, *errors);
+ }
+ }
+
+ if (localprops != NULL) {
+ nvlist_t *oprops = fnvlist_alloc();
+ nvlist_t *xprops = fnvlist_alloc();
+ nvpair_t *nvp = NULL;
+
+ while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) {
+ if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) {
+ /* -x property */
+ const char *name = nvpair_name(nvp);
+ zfs_prop_t prop = zfs_name_to_prop(name);
+ if (prop != ZPROP_INVAL) {
+ if (!zfs_prop_inheritable(prop))
+ continue;
+ } else if (!zfs_prop_user(name))
+ continue;
+ fnvlist_add_boolean(xprops, name);
+ } else {
+ /* -o property=value */
+ fnvlist_add_nvpair(oprops, nvp);
+ }
+ }
+
+ local_delayprops = extract_delay_props(oprops);
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL,
+ oprops, *errors);
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED,
+ xprops, *errors);
+
+ nvlist_free(oprops);
+ nvlist_free(xprops);
+ }
+
+ error = dmu_recv_stream(&drc, &off);
+
+ if (error == 0) {
+ zfsvfs_t *zfsvfs = NULL;
+ zvol_state_handle_t *zv = NULL;
+
+ if (getzfsvfs(tofs, &zfsvfs) == 0) {
+ /* online recv */
+ dsl_dataset_t *ds;
+ int end_err;
+ boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS(
+ begin_record->drr_u.drr_begin.
+ drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED;
+
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ error = zfs_suspend_fs(zfsvfs);
+ /*
+ * If the suspend fails, then the recv_end will
+ * likely also fail, and clean up after itself.
+ */
+ end_err = dmu_recv_end(&drc, zfsvfs);
+ /*
+ * If the dataset was not redacted, but we received a
+ * redacted stream onto it, we need to unmount the
+ * dataset. Otherwise, resume the filesystem.
+ */
+ if (error == 0 && !drc.drc_newfs &&
+ stream_is_redacted && !tofs_was_redacted) {
+ error = zfs_end_fs(zfsvfs, ds);
+ } else if (error == 0) {
+ error = zfs_resume_fs(zfsvfs, ds);
+ }
+ error = error ? error : end_err;
+ zfs_vfs_rele(zfsvfs);
+ } else if ((zv = zvol_suspend(tofs)) != NULL) {
+ error = dmu_recv_end(&drc, zvol_tag(zv));
+ zvol_resume(zv);
+ } else {
+ error = dmu_recv_end(&drc, NULL);
+ }
+
+ /* Set delayed properties now, after we're done receiving. */
+ if (recv_delayprops != NULL && error == 0) {
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+ recv_delayprops, *errors);
+ }
+ if (local_delayprops != NULL && error == 0) {
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL,
+ local_delayprops, *errors);
+ }
+ }
+
+ /*
+ * Merge delayed props back in with initial props, in case this is
+ * a DEBUG kernel and zfs_ioc_recv_inject_err is set (which means
+ * we have to make sure clear_received_props() includes
+ * the delayed properties).
+ *
+ * Since zfs_ioc_recv_inject_err only exists in DEBUG kernels,
+ * using ASSERT() here behaves just like a VERIFY.
+ */
+ if (recv_delayprops != NULL) {
+ ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0);
+ nvlist_free(recv_delayprops);
+ }
+ if (local_delayprops != NULL) {
+ ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0);
+ nvlist_free(local_delayprops);
+ }
+ *read_bytes = off - noff;
+
+#ifdef ZFS_DEBUG
+ if (zfs_ioc_recv_inject_err) {
+ zfs_ioc_recv_inject_err = B_FALSE;
+ error = 1;
+ }
+#endif
+
+ /*
+ * On error, restore the original props.
+ */
+ if (error != 0 && recvprops != NULL && !drc.drc_newfs) {
+ if (clear_received_props(tofs, recvprops, NULL) != 0) {
+ /*
+ * We failed to clear the received properties.
+ * Since we may have left a $recvd value on the
+ * system, we can't clear the $hasrecvd flag.
+ */
+ *errflags |= ZPROP_ERR_NORESTORE;
+ } else if (first_recvd_props) {
+ dsl_prop_unset_hasrecvd(tofs);
+ }
+
+ if (origrecvd == NULL && !drc.drc_newfs) {
+ /* We failed to stash the original properties. */
+ *errflags |= ZPROP_ERR_NORESTORE;
+ }
+
+ /*
+ * dsl_props_set() will not convert RECEIVED to LOCAL on or
+ * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
+ * explicitly if we're restoring local properties cleared in the
+ * first new-style receive.
+ */
+ if (origrecvd != NULL &&
+ zfs_set_prop_nvlist(tofs, (first_recvd_props ?
+ ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
+ origrecvd, NULL) != 0) {
+ /*
+ * We stashed the original properties but failed to
+ * restore them.
+ */
+ *errflags |= ZPROP_ERR_NORESTORE;
+ }
+ }
+ if (error != 0 && localprops != NULL && !drc.drc_newfs &&
+ !first_recvd_props) {
+ nvlist_t *setprops;
+ nvlist_t *inheritprops;
+ nvpair_t *nvp;
+
+ if (origprops == NULL) {
+ /* We failed to stash the original properties. */
+ *errflags |= ZPROP_ERR_NORESTORE;
+ goto out;
+ }
+
+ /* Restore original props */
+ setprops = fnvlist_alloc();
+ inheritprops = fnvlist_alloc();
+ nvp = NULL;
+ while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) {
+ const char *name = nvpair_name(nvp);
+ const char *source;
+ nvlist_t *attrs;
+
+ if (!nvlist_exists(origprops, name)) {
+ /*
+ * Property was not present or was explicitly
+ * inherited before the receive, restore this.
+ */
+ fnvlist_add_boolean(inheritprops, name);
+ continue;
+ }
+ attrs = fnvlist_lookup_nvlist(origprops, name);
+ source = fnvlist_lookup_string(attrs, ZPROP_SOURCE);
+
+ /* Skip received properties */
+ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0)
+ continue;
+
+ if (strcmp(source, tofs) == 0) {
+ /* Property was locally set */
+ fnvlist_add_nvlist(setprops, name, attrs);
+ } else {
+ /* Property was implicitly inherited */
+ fnvlist_add_boolean(inheritprops, name);
+ }
+ }
+
+ if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops,
+ NULL) != 0)
+ *errflags |= ZPROP_ERR_NORESTORE;
+ if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops,
+ NULL) != 0)
+ *errflags |= ZPROP_ERR_NORESTORE;
+
+ nvlist_free(setprops);
+ nvlist_free(inheritprops);
+ }
+out:
+ zfs_file_put(input_fd);
+ nvlist_free(origrecvd);
+ nvlist_free(origprops);
+
+ if (error == 0)
+ error = props_error;
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of containing filesystem (unused)
+ * zc_nvlist_src{_size} nvlist of properties to apply
+ * zc_nvlist_conf{_size} nvlist of properties to exclude
+ * (DATA_TYPE_BOOLEAN) and override (everything else)
+ * zc_value name of snapshot to create
+ * zc_string name of clone origin (if DRR_FLAG_CLONE)
+ * zc_cookie file descriptor to recv from
+ * zc_begin_record the BEGIN record of the stream (not byteswapped)
+ * zc_guid force flag
+ *
+ * outputs:
+ * zc_cookie number of bytes read
+ * zc_obj zprop_errflags_t
+ * zc_nvlist_dst{_size} error for each unapplied received property
+ */
+static int
+zfs_ioc_recv(zfs_cmd_t *zc)
+{
+ dmu_replay_record_t begin_record;
+ nvlist_t *errors = NULL;
+ nvlist_t *recvdprops = NULL;
+ nvlist_t *localprops = NULL;
+ char *origin = NULL;
+ char *tosnap;
+ char tofs[ZFS_MAX_DATASET_NAME_LEN];
+ int error = 0;
+
+ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
+ strchr(zc->zc_value, '@') == NULL ||
+ strchr(zc->zc_value, '%'))
+ return (SET_ERROR(EINVAL));
+
+ (void) strlcpy(tofs, zc->zc_value, sizeof (tofs));
+ tosnap = strchr(tofs, '@');
+ *tosnap++ = '\0';
+
+ if (zc->zc_nvlist_src != 0 &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &recvdprops)) != 0)
+ return (error);
+
+ if (zc->zc_nvlist_conf != 0 &&
+ (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &localprops)) != 0)
+ return (error);
+
+ if (zc->zc_string[0])
+ origin = zc->zc_string;
+
+ begin_record.drr_type = DRR_BEGIN;
+ begin_record.drr_payloadlen = 0;
+ begin_record.drr_u.drr_begin = zc->zc_begin_record;
+
+ error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops,
+ NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record,
+ &zc->zc_cookie, &zc->zc_obj, &errors);
+ nvlist_free(recvdprops);
+ nvlist_free(localprops);
+
+ /*
+ * Now that all props, initial and delayed, are set, report the prop
+ * errors to the caller.
+ */
+ if (zc->zc_nvlist_dst_size != 0 && errors != NULL &&
+ (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
+ put_nvlist(zc, errors) != 0)) {
+ /*
+ * Caller made zc->zc_nvlist_dst less than the minimum expected
+ * size or supplied an invalid address.
+ */
+ error = SET_ERROR(EINVAL);
+ }
+
+ nvlist_free(errors);
+
+ return (error);
+}
+
+/*
+ * innvl: {
+ * "snapname" -> full name of the snapshot to create
+ * (optional) "props" -> received properties to set (nvlist)
+ * (optional) "localprops" -> override and exclude properties (nvlist)
+ * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE)
+ * "begin_record" -> non-byteswapped dmu_replay_record_t
+ * "input_fd" -> file descriptor to read stream from (int32)
+ * (optional) "force" -> force flag (value ignored)
+ * (optional) "resumable" -> resumable flag (value ignored)
+ * (optional) "cleanup_fd" -> unused
+ * (optional) "action_handle" -> unused
+ * (optional) "hidden_args" -> { "wkeydata" -> value }
+ * }
+ *
+ * outnvl: {
+ * "read_bytes" -> number of bytes read
+ * "error_flags" -> zprop_errflags_t
+ * "errors" -> error for each unapplied received property (nvlist)
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_recv_new[] = {
+ {"snapname", DATA_TYPE_STRING, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"localprops", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"origin", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"begin_record", DATA_TYPE_BYTE_ARRAY, 0},
+ {"input_fd", DATA_TYPE_INT32, 0},
+ {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL},
+ {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ dmu_replay_record_t *begin_record;
+ uint_t begin_record_size;
+ nvlist_t *errors = NULL;
+ nvlist_t *recvprops = NULL;
+ nvlist_t *localprops = NULL;
+ nvlist_t *hidden_args = NULL;
+ char *snapname;
+ char *origin = NULL;
+ char *tosnap;
+ char tofs[ZFS_MAX_DATASET_NAME_LEN];
+ boolean_t force;
+ boolean_t resumable;
+ uint64_t read_bytes = 0;
+ uint64_t errflags = 0;
+ int input_fd = -1;
+ int error;
+
+ snapname = fnvlist_lookup_string(innvl, "snapname");
+
+ if (dataset_namecheck(snapname, NULL, NULL) != 0 ||
+ strchr(snapname, '@') == NULL ||
+ strchr(snapname, '%'))
+ return (SET_ERROR(EINVAL));
+
+ (void) strlcpy(tofs, snapname, sizeof (tofs));
+ tosnap = strchr(tofs, '@');
+ *tosnap++ = '\0';
+
+ error = nvlist_lookup_string(innvl, "origin", &origin);
+ if (error && error != ENOENT)
+ return (error);
+
+ error = nvlist_lookup_byte_array(innvl, "begin_record",
+ (uchar_t **)&begin_record, &begin_record_size);
+ if (error != 0 || begin_record_size != sizeof (*begin_record))
+ return (SET_ERROR(EINVAL));
+
+ input_fd = fnvlist_lookup_int32(innvl, "input_fd");
+
+ force = nvlist_exists(innvl, "force");
+ resumable = nvlist_exists(innvl, "resumable");
+
+ /* we still use "props" here for backwards compatibility */
+ error = nvlist_lookup_nvlist(innvl, "props", &recvprops);
+ if (error && error != ENOENT)
+ return (error);
+
+ error = nvlist_lookup_nvlist(innvl, "localprops", &localprops);
+ if (error && error != ENOENT)
+ return (error);
+
+ error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
+ if (error && error != ENOENT)
+ return (error);
+
+ error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops,
+ hidden_args, force, resumable, input_fd, begin_record,
+ &read_bytes, &errflags, &errors);
+
+ fnvlist_add_uint64(outnvl, "read_bytes", read_bytes);
+ fnvlist_add_uint64(outnvl, "error_flags", errflags);
+ fnvlist_add_nvlist(outnvl, "errors", errors);
+
+ nvlist_free(errors);
+ nvlist_free(recvprops);
+ nvlist_free(localprops);
+
+ return (error);
+}
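+
+/*
+ * Illustrative sketch (not part of this change): the innvl consumed by
+ * zfs_ioc_recv_new() could be assembled in userland roughly as follows.
+ * "drr" is the BEGIN record already read from the stream; the snapshot
+ * name and the descriptor "fd" are hypothetical, and the boolean flags
+ * are optional.
+ *
+ * nvlist_t *innvl = fnvlist_alloc();
+ * fnvlist_add_string(innvl, "snapname", "pool/fs@snap");
+ * fnvlist_add_byte_array(innvl, "begin_record",
+ *     (uchar_t *)&drr, sizeof (drr));
+ * fnvlist_add_int32(innvl, "input_fd", fd);
+ * fnvlist_add_boolean(innvl, "force");
+ * fnvlist_add_boolean(innvl, "resumable");
+ * fnvlist_free(innvl);
+ *
+ * On success the outnvl carries "read_bytes", "error_flags" and "errors".
+ */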
+
+typedef struct dump_bytes_io {
+ zfs_file_t *dbi_fp;
+ caddr_t dbi_buf;
+ int dbi_len;
+ int dbi_err;
+} dump_bytes_io_t;
+
+static void
+dump_bytes_cb(void *arg)
+{
+ dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
+ zfs_file_t *fp;
+ caddr_t buf;
+
+ fp = dbi->dbi_fp;
+ buf = dbi->dbi_buf;
+
+ dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL);
+}
+
+static int
+dump_bytes(objset_t *os, void *buf, int len, void *arg)
+{
+ dump_bytes_io_t dbi;
+
+ dbi.dbi_fp = arg;
+ dbi.dbi_buf = buf;
+ dbi.dbi_len = len;
+
+#if defined(HAVE_LARGE_STACKS)
+ dump_bytes_cb(&dbi);
+#else
+ /*
+ * The zfs_file_write() call (via dump_bytes_cb()) is performed in a
+ * taskq to ensure that there is always enough stack space to write
+ * safely to the target filesystem.
+ * The ZIO_TYPE_FREE threads are used because there can be a lot of
+ * them and they are used in vdev_file.c for a similar purpose.
+ */
+ spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE,
+ ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
+#endif /* HAVE_LARGE_STACKS */
+
+ return (dbi.dbi_err);
+}
+
+/*
+ * inputs:
+ * zc_name name of snapshot to send
+ * zc_cookie file descriptor to send stream to
+ * zc_obj fromorigin flag (mutually exclusive with zc_fromobj)
+ * zc_sendobj objsetid of snapshot to send
+ * zc_fromobj objsetid of incremental fromsnap (may be zero)
+ * zc_guid if set, estimate size of stream only. zc_cookie is ignored.
+ * output size in zc_objset_type.
+ * zc_flags lzc_send_flags
+ *
+ * outputs:
+ * zc_objset_type estimated size, if zc_guid is set
+ *
+ * NOTE: This is no longer the preferred interface; any new functionality
+ * should be added to zfs_ioc_send_new() instead.
+ */
+static int
+zfs_ioc_send(zfs_cmd_t *zc)
+{
+ int error;
+ offset_t off;
+ boolean_t estimate = (zc->zc_guid != 0);
+ boolean_t embedok = (zc->zc_flags & 0x1);
+ boolean_t large_block_ok = (zc->zc_flags & 0x2);
+ boolean_t compressok = (zc->zc_flags & 0x4);
+ boolean_t rawok = (zc->zc_flags & 0x8);
+ boolean_t savedok = (zc->zc_flags & 0x10);
+
+ if (zc->zc_obj != 0) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (dsl_dir_is_clone(tosnap->ds_dir))
+ zc->zc_fromobj =
+ dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ }
+
+ if (estimate) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+ dsl_dataset_t *fromsnap = NULL;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj,
+ FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_fromobj != 0) {
+ error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
+ FTAG, &fromsnap);
+ if (error != 0) {
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ }
+
+ error = dmu_send_estimate_fast(tosnap, fromsnap, NULL,
+ compressok || rawok, savedok, &zc->zc_objset_type);
+
+ if (fromsnap != NULL)
+ dsl_dataset_rele(fromsnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ } else {
+ zfs_file_t *fp;
+ dmu_send_outparams_t out = {0};
+
+ if ((error = zfs_file_get(zc->zc_cookie, &fp)))
+ return (error);
+
+ off = zfs_file_off(fp);
+ out.dso_outfunc = dump_bytes;
+ out.dso_arg = fp;
+ out.dso_dryrun = B_FALSE;
+ error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
+ zc->zc_fromobj, embedok, large_block_ok, compressok,
+ rawok, savedok, zc->zc_cookie, &off, &out);
+
+ zfs_file_put(zc->zc_cookie);
+ }
+ return (error);
+}
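+
+/*
+ * Illustrative sketch (assumption, not part of this change): the zc_flags
+ * bits decoded at the top of zfs_ioc_send() map to embedok (0x1),
+ * large_block_ok (0x2), compressok (0x4), rawok (0x8) and savedok (0x10).
+ * A caller asking for a large-block, compressed stream would set, roughly:
+ *
+ * zc.zc_flags = 0x2 | 0x4;
+ * zc.zc_guid = 0;
+ * zc.zc_cookie = fd;
+ *
+ * where zc is a zfs_cmd_t, fd is a hypothetical open descriptor to write
+ * the stream to, and a nonzero zc_guid would request an estimate instead.
+ */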
+
+/*
+ * inputs:
+ * zc_name name of snapshot on which to report progress
+ * zc_cookie file descriptor of send stream
+ *
+ * outputs:
+ * zc_cookie number of bytes written in send stream thus far
+ * zc_objset_type logical size of data traversed by send thus far
+ */
+static int
+zfs_ioc_send_progress(zfs_cmd_t *zc)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ dmu_sendstatus_t *dsp = NULL;
+ int error;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ mutex_enter(&ds->ds_sendstream_lock);
+
+ /*
+ * Iterate over all the send streams currently active on this dataset.
+ * If there's one which matches the specified file descriptor _and_ the
+ * stream was started by the current process, return the progress of
+ * that stream.
+ */
+
+ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL;
+ dsp = list_next(&ds->ds_sendstreams, dsp)) {
+ if (dsp->dss_outfd == zc->zc_cookie &&
+ zfs_proc_is_caller(dsp->dss_proc))
+ break;
+ }
+
+ if (dsp != NULL) {
+ zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off,
+ 0, 0);
+ /* This is the closest thing we have to atomic_read_64. */
+ zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0);
+ } else {
+ error = SET_ERROR(ENOENT);
+ }
+
+ mutex_exit(&ds->ds_sendstream_lock);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_inject_fault(zfs_cmd_t *zc)
+{
+ int id, error;
+
+ error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
+ &zc->zc_inject_record);
+
+ if (error == 0)
+ zc->zc_guid = (uint64_t)id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear_fault(zfs_cmd_t *zc)
+{
+ return (zio_clear_fault((int)zc->zc_guid));
+}
+
+static int
+zfs_ioc_inject_list_next(zfs_cmd_t *zc)
+{
+ int id = (int)zc->zc_guid;
+ int error;
+
+ error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
+ &zc->zc_inject_record);
+
+ zc->zc_guid = id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_error_log(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ size_t count = (size_t)zc->zc_nvlist_dst_size;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ &count);
+ if (error == 0)
+ zc->zc_nvlist_dst_size = count;
+ else
+ zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ vdev_t *vd;
+ int error;
+
+ /*
+ * On zpool clear we also fix up missing slogs
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(zc->zc_name);
+ if (spa == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EIO));
+ }
+ if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
+ /* we need to let spa_open/spa_load clear the chains */
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ }
+ spa->spa_last_open_failed = 0;
+ mutex_exit(&spa_namespace_lock);
+
+ if (zc->zc_cookie & ZPOOL_NO_REWIND) {
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ } else {
+ nvlist_t *policy;
+ nvlist_t *config = NULL;
+
+ if (zc->zc_nvlist_src == 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = get_nvlist(zc->zc_nvlist_src,
+ zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
+ error = spa_open_rewind(zc->zc_name, &spa, FTAG,
+ policy, &config);
+ if (config != NULL) {
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
+ nvlist_free(config);
+ }
+ nvlist_free(policy);
+ }
+ }
+
+ if (error != 0)
+ return (error);
+
+ /*
+ * If multihost is enabled, resuming I/O is unsafe as another
+ * host may have imported the pool.
+ */
+ if (spa_multihost(spa) && spa_suspended(spa))
+ return (SET_ERROR(EINVAL));
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if (zc->zc_guid == 0) {
+ vd = NULL;
+ } else {
+ vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE);
+ if (vd == NULL) {
+ error = SET_ERROR(ENODEV);
+ (void) spa_vdev_state_exit(spa, NULL, error);
+ spa_close(spa, FTAG);
+ return (error);
+ }
+ }
+
+ vdev_clear(spa, vd);
+
+ (void) spa_vdev_state_exit(spa, spa_suspended(spa) ?
+ NULL : spa->spa_root_vdev, 0);
+
+ /*
+ * Resume any suspended I/Os.
+ */
+ if (zio_resume(spa) != 0)
+ error = SET_ERROR(EIO);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+/*
+ * Reopen all the vdevs associated with the pool.
+ *
+ * innvl: {
+ * "scrub_restart" -> when true and scrub is running, allow to restart
+ * scrub as the side effect of the reopen (boolean).
+ * }
+ *
+ * outnvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_pool_reopen[] = {
+ {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+ boolean_t rc, scrub_restart = B_TRUE;
+
+ if (innvl) {
+ error = nvlist_lookup_boolean_value(innvl,
+ "scrub_restart", &rc);
+ if (error == 0)
+ scrub_restart = rc;
+ }
+
+ error = spa_open(pool, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ /*
+ * If the scrub_restart flag is B_FALSE and a scrub is already
+ * in progress then set spa_scrub_reopen flag to B_TRUE so that
+ * we don't restart the scrub as a side effect of the reopen.
+ * Otherwise, let vdev_open() decide if a resilver is required.
+ */
+
+ spa->spa_scrub_reopen = (!scrub_restart &&
+ dsl_scan_scrubbing(spa->spa_dsl_pool));
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (0);
+}
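+
+/*
+ * Illustrative sketch (not part of this change): "scrub_restart" is a
+ * DATA_TYPE_BOOLEAN_VALUE, so its value matters; passing B_FALSE keeps an
+ * in-progress scrub from being restarted by the reopen.
+ *
+ * nvlist_t *innvl = fnvlist_alloc();
+ * fnvlist_add_boolean_value(innvl, "scrub_restart", B_FALSE);
+ * fnvlist_free(innvl);
+ */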
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * zc_string name of conflicting snapshot, if there is one
+ */
+static int
+zfs_ioc_promote(zfs_cmd_t *zc)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds, *ods;
+ char origin[ZFS_MAX_DATASET_NAME_LEN];
+ char *cp;
+ int error;
+
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
+ strchr(zc->zc_name, '%'))
+ return (SET_ERROR(EINVAL));
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (!dsl_dir_is_clone(ds->ds_dir)) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_name(ods, origin);
+ dsl_dataset_rele(ods, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ /*
+ * We don't need to unmount *all* the origin fs's snapshots, but
+ * it's easier.
+ */
+ cp = strchr(origin, '@');
+ if (cp)
+ *cp = '\0';
+ (void) dmu_objset_find(origin,
+ zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS);
+ return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
+}
+
+/*
+ * Retrieve a single {user|group|project}{used|quota}@... property.
+ *
+ * inputs:
+ * zc_name name of filesystem
+ * zc_objset_type zfs_userquota_prop_t
+ * zc_value domain name (e.g. "S-1-234-567-89")
+ * zc_guid RID/UID/GID
+ *
+ * outputs:
+ * zc_cookie property value
+ */
+static int
+zfs_ioc_userspace_one(zfs_cmd_t *zc)
+{
+ zfsvfs_t *zfsvfs;
+ int error;
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (SET_ERROR(EINVAL));
+
+ error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
+ if (error != 0)
+ return (error);
+
+ error = zfs_userspace_one(zfsvfs,
+ zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_cookie zap cursor
+ * zc_objset_type zfs_userquota_prop_t
+ * zc_nvlist_dst[_size] buffer to fill (not really an nvlist)
+ *
+ * outputs:
+ * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t)
+ * zc_cookie zap cursor
+ */
+static int
+zfs_ioc_userspace_many(zfs_cmd_t *zc)
+{
+ zfsvfs_t *zfsvfs;
+ int bufsize = zc->zc_nvlist_dst_size;
+
+ if (bufsize <= 0)
+ return (SET_ERROR(ENOMEM));
+
+ int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
+ if (error != 0)
+ return (error);
+
+ void *buf = vmem_alloc(bufsize, KM_SLEEP);
+
+ error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
+ buf, &zc->zc_nvlist_dst_size);
+
+ if (error == 0) {
+ error = xcopyout(buf,
+ (void *)(uintptr_t)zc->zc_nvlist_dst,
+ zc->zc_nvlist_dst_size);
+ }
+ vmem_free(buf, bufsize);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * none
+ */
+static int
+zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
+{
+ int error = 0;
+ zfsvfs_t *zfsvfs;
+
+ if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
+ if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
+ /*
+ * If userused is not enabled, it may be because the
+ * objset needs to be closed & reopened (to grow the
+ * objset_phys_t). Suspending and resuming the fs will do that.
+ */
+ dsl_dataset_t *ds, *newds;
+
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ error = zfs_suspend_fs(zfsvfs);
+ if (error == 0) {
+ dmu_objset_refresh_ownership(ds, &newds,
+ B_TRUE, zfsvfs);
+ error = zfs_resume_fs(zfsvfs, newds);
+ }
+ }
+ if (error == 0) {
+ mutex_enter(&zfsvfs->z_os->os_upgrade_lock);
+ if (zfsvfs->z_os->os_upgrade_id == 0) {
+ /* clear potential error code and retry */
+ zfsvfs->z_os->os_upgrade_status = 0;
+ mutex_exit(&zfsvfs->z_os->os_upgrade_lock);
+
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_userspace_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ } else {
+ mutex_exit(&zfsvfs->z_os->os_upgrade_lock);
+ }
+
+ taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq,
+ zfsvfs->z_os->os_upgrade_id);
+ error = zfsvfs->z_os->os_upgrade_status;
+ }
+ zfs_vfs_rele(zfsvfs);
+ } else {
+ objset_t *os;
+
+ /* XXX kind of reading contents without owning */
+ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ mutex_enter(&os->os_upgrade_lock);
+ if (os->os_upgrade_id == 0) {
+ /* clear potential error code and retry */
+ os->os_upgrade_status = 0;
+ mutex_exit(&os->os_upgrade_lock);
+
+ dmu_objset_userspace_upgrade(os);
+ } else {
+ mutex_exit(&os->os_upgrade_lock);
+ }
+
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
+ taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id);
+ error = os->os_upgrade_status;
+
+ dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT,
+ FTAG);
+ }
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * none
+ */
+static int
+zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ if (dmu_objset_userobjspace_upgradable(os) ||
+ dmu_objset_projectquota_upgradable(os)) {
+ mutex_enter(&os->os_upgrade_lock);
+ if (os->os_upgrade_id == 0) {
+ /* clear potential error code and retry */
+ os->os_upgrade_status = 0;
+ mutex_exit(&os->os_upgrade_lock);
+
+ dmu_objset_id_quota_upgrade(os);
+ } else {
+ mutex_exit(&os->os_upgrade_lock);
+ }
+
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
+ taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id);
+ error = os->os_upgrade_status;
+ } else {
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+ }
+
+ dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_share(zfs_cmd_t *zc)
+{
+ return (SET_ERROR(ENOSYS));
+}
+
+ace_t full_access[] = {
+ {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
+};
+
+/*
+ * inputs:
+ * zc_name name of containing filesystem
+ * zc_obj object # beyond which we want next in-use object #
+ *
+ * outputs:
+ * zc_obj next in-use object #
+ */
+static int
+zfs_ioc_next_obj(zfs_cmd_t *zc)
+{
+ objset_t *os = NULL;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0);
+
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value prefix name for snapshot
+ * zc_cleanup_fd cleanup-on-exit file descriptor for calling process
+ *
+ * outputs:
+ * zc_value short name of new snapshot
+ */
+static int
+zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
+{
+ char *snap_name;
+ char *hold_name;
+ int error;
+ minor_t minor;
+
+ error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
+ if (error != 0)
+ return (error);
+
+ snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
+ (u_longlong_t)ddi_get_lbolt64());
+ hold_name = kmem_asprintf("%%%s", zc->zc_value);
+
+ error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
+ hold_name);
+ if (error == 0)
+ (void) strlcpy(zc->zc_value, snap_name,
+ sizeof (zc->zc_value));
+ kmem_strfree(snap_name);
+ kmem_strfree(hold_name);
+ zfs_onexit_fd_rele(zc->zc_cleanup_fd);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of "to" snapshot
+ * zc_value name of "from" snapshot
+ * zc_cookie file descriptor to write diff data on
+ *
+ * outputs:
+ * dmu_diff_record_t's to the file descriptor
+ */
+static int
+zfs_ioc_diff(zfs_cmd_t *zc)
+{
+ zfs_file_t *fp;
+ offset_t off;
+ int error;
+
+ if ((error = zfs_file_get(zc->zc_cookie, &fp)))
+ return (error);
+
+ off = zfs_file_off(fp);
+ error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off);
+
+ zfs_file_put(zc->zc_cookie);
+
+ return (error);
+}
+
+static int
+zfs_ioc_smb_acl(zfs_cmd_t *zc)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+/*
+ * innvl: {
+ * "holds" -> { snapname -> holdname (string), ... }
+ * (optional) "cleanup_fd" -> fd (int32)
+ * }
+ *
+ * outnvl: {
+ * snapname -> error value (int32)
+ * ...
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_hold[] = {
+ {"holds", DATA_TYPE_NVLIST, 0},
+ {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
+{
+ nvpair_t *pair;
+ nvlist_t *holds;
+ int cleanup_fd = -1;
+ int error;
+ minor_t minor = 0;
+
+ holds = fnvlist_lookup_nvlist(args, "holds");
+
+ /* make sure the user didn't pass us any invalid (empty) tags */
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ char *htag;
+
+ error = nvpair_value_string(pair, &htag);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ if (strlen(htag) == 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
+ error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ error = dsl_dataset_user_hold(holds, minor, errlist);
+ if (minor != 0)
+ zfs_onexit_fd_rele(cleanup_fd);
+ return (SET_ERROR(error));
+}
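+
+/*
+ * Illustrative sketch (not part of this change): the "holds" nvlist checked
+ * above maps full snapshot names to hold tags; the names used here are
+ * hypothetical and "cleanup_fd" is optional.
+ *
+ * nvlist_t *holds = fnvlist_alloc();
+ * nvlist_t *args = fnvlist_alloc();
+ * fnvlist_add_string(holds, "pool/fs@snap", "my-tag");
+ * fnvlist_add_nvlist(args, "holds", holds);
+ * fnvlist_add_int32(args, "cleanup_fd", cleanup_fd);
+ * fnvlist_free(holds);
+ */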
+
+/*
+ * innvl is not used.
+ *
+ * outnvl: {
+ * holdname -> time added (uint64 seconds since epoch)
+ * ...
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_get_holds[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
+{
+ return (dsl_dataset_get_holds(snapname, outnvl));
+}
+
+/*
+ * innvl: {
+ * snapname -> { holdname, ... }
+ * ...
+ * }
+ *
+ * outnvl: {
+ * snapname -> error value (int32)
+ * ...
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_release[] = {
+ {"<snapname>...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
+{
+ return (dsl_dataset_user_release(holds, errlist));
+}
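+
+/*
+ * Illustrative sketch (not part of this change): the release innvl nests
+ * one nvlist of hold tags per snapshot name (all names hypothetical).
+ *
+ * nvlist_t *tags = fnvlist_alloc();
+ * nvlist_t *holds = fnvlist_alloc();
+ * fnvlist_add_boolean(tags, "my-tag");
+ * fnvlist_add_nvlist(holds, "pool/fs@snap", tags);
+ * fnvlist_free(tags);
+ */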
+
+/*
+ * inputs:
+ * zc_guid flags (ZEVENT_NONBLOCK)
+ * zc_cleanup_fd zevent file descriptor
+ *
+ * outputs:
+ * zc_nvlist_dst next nvlist event
+ * zc_cookie dropped events since last get
+ */
+static int
+zfs_ioc_events_next(zfs_cmd_t *zc)
+{
+ zfs_zevent_t *ze;
+ nvlist_t *event = NULL;
+ minor_t minor;
+ uint64_t dropped = 0;
+ int error;
+
+ error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze);
+ if (error != 0)
+ return (error);
+
+ do {
+ error = zfs_zevent_next(ze, &event,
+ &zc->zc_nvlist_dst_size, &dropped);
+ if (event != NULL) {
+ zc->zc_cookie = dropped;
+ error = put_nvlist(zc, event);
+ nvlist_free(event);
+ }
+
+ if (zc->zc_guid & ZEVENT_NONBLOCK)
+ break;
+
+ if ((error == 0) || (error != ENOENT))
+ break;
+
+ error = zfs_zevent_wait(ze);
+ if (error != 0)
+ break;
+ } while (1);
+
+ zfs_zevent_fd_rele(zc->zc_cleanup_fd);
+
+ return (error);
+}
+
+/*
+ * outputs:
+ * zc_cookie cleared events count
+ */
+static int
+zfs_ioc_events_clear(zfs_cmd_t *zc)
+{
+ int count;
+
+ zfs_zevent_drain_all(&count);
+ zc->zc_cookie = count;
+
+ return (0);
+}
+
+/*
+ * inputs:
+ * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END
+ * zc_cleanup_fd zevent file descriptor
+ */
+static int
+zfs_ioc_events_seek(zfs_cmd_t *zc)
+{
+ zfs_zevent_t *ze;
+ minor_t minor;
+ int error;
+
+ error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze);
+ if (error != 0)
+ return (error);
+
+ error = zfs_zevent_seek(ze, zc->zc_guid);
+ zfs_zevent_fd_rele(zc->zc_cleanup_fd);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of later filesystem or snapshot
+ * zc_value full name of old snapshot or bookmark
+ *
+ * outputs:
+ * zc_cookie space in bytes
+ * zc_objset_type compressed space in bytes
+ * zc_perm_action uncompressed space in bytes
+ */
+static int
+zfs_ioc_space_written(zfs_cmd_t *zc)
+{
+ int error;
+ dsl_pool_t *dp;
+ dsl_dataset_t *new;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ if (strchr(zc->zc_value, '#') != NULL) {
+ zfs_bookmark_phys_t bmp;
+ error = dsl_bookmark_lookup(dp, zc->zc_value,
+ new, &bmp);
+ if (error == 0) {
+ error = dsl_dataset_space_written_bookmark(&bmp, new,
+ &zc->zc_cookie,
+ &zc->zc_objset_type, &zc->zc_perm_action);
+ }
+ } else {
+ dsl_dataset_t *old;
+ error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old);
+
+ if (error == 0) {
+ error = dsl_dataset_space_written(old, new,
+ &zc->zc_cookie,
+ &zc->zc_objset_type, &zc->zc_perm_action);
+ dsl_dataset_rele(old, FTAG);
+ }
+ }
+ dsl_dataset_rele(new, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+/*
+ * innvl: {
+ * "firstsnap" -> snapshot name
+ * }
+ *
+ * outnvl: {
+ * "used" -> space in bytes
+ * "compressed" -> compressed space in bytes
+ * "uncompressed" -> uncompressed space in bytes
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_space_snaps[] = {
+ {"firstsnap", DATA_TYPE_STRING, 0},
+};
+
+static int
+zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error;
+ dsl_pool_t *dp;
+ dsl_dataset_t *new, *old;
+ char *firstsnap;
+ uint64_t used, comp, uncomp;
+
+ firstsnap = fnvlist_lookup_string(innvl, "firstsnap");
+
+ error = dsl_pool_hold(lastsnap, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
+ if (error == 0 && !new->ds_is_snapshot) {
+ dsl_dataset_rele(new, FTAG);
+ error = SET_ERROR(EINVAL);
+ }
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
+ if (error == 0 && !old->ds_is_snapshot) {
+ dsl_dataset_rele(old, FTAG);
+ error = SET_ERROR(EINVAL);
+ }
+ if (error != 0) {
+ dsl_dataset_rele(new, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp);
+ dsl_dataset_rele(old, FTAG);
+ dsl_dataset_rele(new, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ fnvlist_add_uint64(outnvl, "used", used);
+ fnvlist_add_uint64(outnvl, "compressed", comp);
+ fnvlist_add_uint64(outnvl, "uncompressed", uncomp);
+ return (error);
+}
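+
+/*
+ * Illustrative sketch (not part of this change): the caller names the last
+ * snapshot in the ioctl itself and passes the first one in innvl; outnvl
+ * then reports, roughly, the space that would be reclaimed by destroying
+ * the snapshots in that range. The snapshot name is hypothetical.
+ *
+ * nvlist_t *innvl = fnvlist_alloc();
+ * fnvlist_add_string(innvl, "firstsnap", "pool/fs@first");
+ * fnvlist_free(innvl);
+ */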
+
+/*
+ * innvl: {
+ * "fd" -> file descriptor to write stream to (int32)
+ * (optional) "fromsnap" -> full snap name to send an incremental from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "compressok" -> (value ignored)
+ * presence indicates compressed DRR_WRITE records are permitted
+ * (optional) "rawok" -> (value ignored)
+ * presence indicates raw encrypted records should be used.
+ * (optional) "savedok" -> (value ignored)
+ * presence indicates we should send a partially received snapshot
+ * (optional) "resume_object" and "resume_offset" -> (uint64)
+ * if present, resume send stream from specified object and offset.
+ * (optional) "redactbook" -> (string)
+ * if present, use this bookmark's redaction list to generate a redacted
+ * send stream
+ * }
+ *
+ * outnvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_send_new[] = {
+ {"fd", DATA_TYPE_INT32, 0},
+ {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error;
+ offset_t off;
+ char *fromname = NULL;
+ int fd;
+ zfs_file_t *fp;
+ boolean_t largeblockok;
+ boolean_t embedok;
+ boolean_t compressok;
+ boolean_t rawok;
+ boolean_t savedok;
+ uint64_t resumeobj = 0;
+ uint64_t resumeoff = 0;
+ char *redactbook = NULL;
+
+ fd = fnvlist_lookup_int32(innvl, "fd");
+
+ (void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
+
+ largeblockok = nvlist_exists(innvl, "largeblockok");
+ embedok = nvlist_exists(innvl, "embedok");
+ compressok = nvlist_exists(innvl, "compressok");
+ rawok = nvlist_exists(innvl, "rawok");
+ savedok = nvlist_exists(innvl, "savedok");
+
+ (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
+ (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
+
+ (void) nvlist_lookup_string(innvl, "redactbook", &redactbook);
+
+ if ((error = zfs_file_get(fd, &fp)))
+ return (error);
+
+ off = zfs_file_off(fp);
+
+ dmu_send_outparams_t out = {0};
+ out.dso_outfunc = dump_bytes;
+ out.dso_arg = fp;
+ out.dso_dryrun = B_FALSE;
+ error = dmu_send(snapname, fromname, embedok, largeblockok,
+ compressok, rawok, savedok, resumeobj, resumeoff,
+ redactbook, fd, &off, &out);
+
+ zfs_file_put(fd);
+ return (error);
+}
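+
+/*
+ * Illustrative sketch (not part of this change): a minimal innvl for the
+ * new-style send ioctl above, assuming "fd" is an open descriptor to write
+ * the stream to; the incremental source name is hypothetical and the
+ * "fromsnap"/"compressok" entries are optional.
+ *
+ * nvlist_t *innvl = fnvlist_alloc();
+ * fnvlist_add_int32(innvl, "fd", fd);
+ * fnvlist_add_string(innvl, "fromsnap", "pool/fs@prev");
+ * fnvlist_add_boolean(innvl, "compressok");
+ * fnvlist_free(innvl);
+ */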
+
+/* ARGSUSED */
+static int
+send_space_sum(objset_t *os, void *buf, int len, void *arg)
+{
+ uint64_t *size = arg;
+ *size += len;
+ return (0);
+}
+
+/*
+ * Determine approximately how large a zfs send stream will be -- the number
+ * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
+ *
+ * innvl: {
+ * (optional) "from" -> full snap or bookmark name to send an incremental
+ * from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "compressok" -> (value ignored)
+ * presence indicates compressed DRR_WRITE records are permitted
+ * (optional) "rawok" -> (value ignored)
+ * presence indicates raw encrypted records should be used.
+ * (optional) "resume_object" and "resume_offset" -> (uint64)
+ * if present, resume send stream from specified object and offset.
+ * (optional) "fd" -> file descriptor to use as a cookie for progress
+ * tracking (int32)
+ * }
+ *
+ * outnvl: {
+ * "space" -> bytes of space (uint64)
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_send_space[] = {
+ {"from", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"fd", DATA_TYPE_INT32, ZK_OPTIONAL},
+ {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+ dsl_dataset_t *fromsnap = NULL;
+ int error;
+ char *fromname = NULL;
+ char *redactlist_book = NULL;
+ boolean_t largeblockok;
+ boolean_t embedok;
+ boolean_t compressok;
+ boolean_t rawok;
+ boolean_t savedok;
+ uint64_t space = 0;
+ boolean_t full_estimate = B_FALSE;
+ uint64_t resumeobj = 0;
+ uint64_t resumeoff = 0;
+ uint64_t resume_bytes = 0;
+ int32_t fd = -1;
+ zfs_bookmark_phys_t zbm = {0};
+
+ error = dsl_pool_hold(snapname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ (void) nvlist_lookup_int32(innvl, "fd", &fd);
+
+ largeblockok = nvlist_exists(innvl, "largeblockok");
+ embedok = nvlist_exists(innvl, "embedok");
+ compressok = nvlist_exists(innvl, "compressok");
+ rawok = nvlist_exists(innvl, "rawok");
+ savedok = nvlist_exists(innvl, "savedok");
+ boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0);
+ boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook",
+ &redactlist_book) == 0);
+
+ (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
+ (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
+ (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes);
+
+ if (altbook) {
+ full_estimate = B_TRUE;
+ } else if (from) {
+ if (strchr(fromname, '#')) {
+ error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm);
+
+ /*
+ * dsl_bookmark_lookup() will fail with EXDEV if
+ * the from-bookmark and tosnap are at the same txg.
+ * However, it's valid to do a send (and therefore,
+ * a send estimate) from and to the same time point,
+ * if the bookmark is redacted (the incremental send
+ * can change what's redacted on the target). In
+ * this case, dsl_bookmark_lookup() fills in zbm
+ * but returns EXDEV. Ignore this error.
+ */
+ if (error == EXDEV && zbm.zbm_redaction_obj != 0 &&
+ zbm.zbm_guid ==
+ dsl_dataset_phys(tosnap)->ds_guid)
+ error = 0;
+
+ if (error != 0) {
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags &
+ ZBM_FLAG_HAS_FBN)) {
+ full_estimate = B_TRUE;
+ }
+ } else if (strchr(fromname, '@')) {
+ error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
+ if (error != 0) {
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) {
+ full_estimate = B_TRUE;
+ dsl_dataset_rele(fromsnap, FTAG);
+ }
+ } else {
+ /*
+ * from is not properly formatted as a snapshot or
+ * bookmark
+ */
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ if (full_estimate) {
+ dmu_send_outparams_t out = {0};
+ offset_t off = 0;
+ out.dso_outfunc = send_space_sum;
+ out.dso_arg = &space;
+ out.dso_dryrun = B_TRUE;
+ /*
+ * We have to release these holds so dmu_send can take them. It
+ * will do all the error checking we need.
+ */
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ error = dmu_send(snapname, fromname, embedok, largeblockok,
+ compressok, rawok, savedok, resumeobj, resumeoff,
+ redactlist_book, fd, &off, &out);
+ } else {
+ error = dmu_send_estimate_fast(tosnap, fromsnap,
+ (from && strchr(fromname, '#') != NULL ? &zbm : NULL),
+ compressok || rawok, savedok, &space);
+ space -= resume_bytes;
+ if (fromsnap != NULL)
+ dsl_dataset_rele(fromsnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ }
+
+ fnvlist_add_uint64(outnvl, "space", space);
+
+ return (error);
+}
+
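+/*
+ * As a rough sketch of how this ioctl is consumed, a userspace caller
+ * normally goes through the libzfs_core wrapper rather than building the
+ * innvl by hand (a sketch only, assuming the lzc_send_space() wrapper and
+ * LZC_SEND_FLAG_COMPRESS; error handling omitted):
+ *
+ *	uint64_t space = 0;
+ *	int err = lzc_send_space("tank/fs@tuesday", "tank/fs@monday",
+ *	    LZC_SEND_FLAG_COMPRESS, &space);
+ *	if (err == 0)
+ *		(void) printf("estimated stream size: %llu bytes\n",
+ *		    (u_longlong_t)space);
+ *
+ * The wrapper fills in keys such as "from" and "compressok" in innvl and
+ * reads back the "space" value that this handler adds to outnvl.
+ */
+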
+/*
+ * Sync the currently open TXG to disk for the specified pool.
+ * This is somewhat similar to 'zfs_sync()'.
+ * For cases that do not result in an error, this ioctl will wait for
+ * the currently open TXG to commit before returning to the caller.
+ *
+ * innvl: {
+ * "force" -> when true, force uberblock update even if there is no dirty data.
+ * In addition, this will cause the vdev configuration to be written
+ * out, including updating the zpool cache file. (boolean_t)
+ * }
+ *
+ * onvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_pool_sync[] = {
+ {"force", DATA_TYPE_BOOLEAN_VALUE, 0},
+};
+
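+/*
+ * A minimal sketch of the expected innvl, as a caller might build it
+ * (the lzc_sync() wrapper in libzfs_core is assumed here to pass the
+ * nvlist straight through to this ioctl):
+ *
+ *	nvlist_t *innvl = fnvlist_alloc();
+ *	fnvlist_add_boolean_value(innvl, "force", B_TRUE);
+ *	error = lzc_sync("tank", innvl, NULL);
+ *	fnvlist_free(innvl);
+ */
+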
+/* ARGSUSED */
+static int
+zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
+{
+ int err;
+ boolean_t rc, force = B_FALSE;
+ spa_t *spa;
+
+ if ((err = spa_open(pool, &spa, FTAG)) != 0)
+ return (err);
+
+ if (innvl) {
+ err = nvlist_lookup_boolean_value(innvl, "force", &rc);
+ if (err == 0)
+ force = rc;
+ }
+
+ if (force) {
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ spa_close(spa, FTAG);
+
+ return (0);
+}
+
+/*
+ * Load a user's wrapping key into the kernel.
+ * innvl: {
+ * "hidden_args" -> { "wkeydata" -> value }
+ * raw uint8_t array of encryption wrapping key data (32 bytes)
+ * (optional) "noop" -> (value ignored)
+ * presence indicates the key should only be verified, not loaded
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_load_key[] = {
+ {"hidden_args", DATA_TYPE_NVLIST, 0},
+ {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+};
+
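+/*
+ * Sketch of the innvl this handler expects, assuming a raw 32-byte
+ * wrapping key already derived in userspace (WRAPPING_KEY_LEN is assumed
+ * to be 32 here):
+ *
+ *	uint8_t wkeydata[WRAPPING_KEY_LEN];
+ *	nvlist_t *hidden_args = fnvlist_alloc();
+ *	nvlist_t *innvl = fnvlist_alloc();
+ *
+ *	fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata,
+ *	    sizeof (wkeydata));
+ *	fnvlist_add_nvlist(innvl, ZPOOL_HIDDEN_ARGS, hidden_args);
+ *	fnvlist_add_boolean(innvl, "noop");	(optional: verify only)
+ */
+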
+/* ARGSUSED */
+static int
+zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int ret;
+ dsl_crypto_params_t *dcp = NULL;
+ nvlist_t *hidden_args;
+ boolean_t noop = nvlist_exists(innvl, "noop");
+
+ if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS);
+
+ ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
+ hidden_args, &dcp);
+ if (ret != 0)
+ goto error;
+
+ ret = spa_keystore_load_wkey(dsname, dcp, noop);
+ if (ret != 0)
+ goto error;
+
+ dsl_crypto_params_free(dcp, noop);
+
+ return (0);
+
+error:
+ dsl_crypto_params_free(dcp, B_TRUE);
+ return (ret);
+}
+
+/*
+ * Unload a user's wrapping key from the kernel.
+ * Both innvl and outnvl are unused.
+ */
+static const zfs_ioc_key_t zfs_keys_unload_key[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int ret = 0;
+
+ if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
+ ret = (SET_ERROR(EINVAL));
+ goto out;
+ }
+
+ ret = spa_keystore_unload_wkey(dsname);
+ if (ret != 0)
+ goto out;
+
+out:
+ return (ret);
+}
+
+/*
+ * Changes a user's wrapping key used to decrypt a dataset. The keyformat,
+ * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified
+ * here to change how the key is derived in userspace.
+ *
+ * innvl: {
+ * "hidden_args" (optional) -> { "wkeydata" -> value }
+ * raw uint8_t array of new encryption wrapping key data (32 bytes)
+ * "props" (optional) -> { prop -> value }
+ * }
+ *
+ * outnvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_change_key[] = {
+ {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
+
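+/*
+ * For example, rewrapping with a new passphrase-derived key might arrive
+ * as an innvl like the following (a sketch; DCP_CMD_NEW_KEY is assumed
+ * to be what userspace passes for "zfs change-key"):
+ *
+ *	fnvlist_add_uint64(innvl, "crypt_cmd", DCP_CMD_NEW_KEY);
+ *	fnvlist_add_nvlist(innvl, "props", props);
+ *	fnvlist_add_nvlist(innvl, ZPOOL_HIDDEN_ARGS, hidden_args);
+ *
+ * where "props" carries keyformat/keylocation/pbkdf2iters and
+ * "hidden_args" carries the new "wkeydata".
+ */
+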
+/* ARGSUSED */
+static int
+zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int ret;
+ uint64_t cmd = DCP_CMD_NONE;
+ dsl_crypto_params_t *dcp = NULL;
+ nvlist_t *args = NULL, *hidden_args = NULL;
+
+ if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
+ ret = (SET_ERROR(EINVAL));
+ goto error;
+ }
+
+ (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd);
+ (void) nvlist_lookup_nvlist(innvl, "props", &args);
+ (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
+
+ ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp);
+ if (ret != 0)
+ goto error;
+
+ ret = spa_keystore_change_key(dsname, dcp);
+ if (ret != 0)
+ goto error;
+
+ dsl_crypto_params_free(dcp, B_FALSE);
+
+ return (0);
+
+error:
+ dsl_crypto_params_free(dcp, B_TRUE);
+ return (ret);
+}
+
+static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
+
+static void
+zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
+ boolean_t log_history, zfs_ioc_poolcheck_t pool_check)
+{
+ zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
+
+ ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
+ ASSERT3U(ioc, <, ZFS_IOC_LAST);
+ ASSERT3P(vec->zvec_legacy_func, ==, NULL);
+ ASSERT3P(vec->zvec_func, ==, NULL);
+
+ vec->zvec_legacy_func = func;
+ vec->zvec_secpolicy = secpolicy;
+ vec->zvec_namecheck = namecheck;
+ vec->zvec_allow_log = log_history;
+ vec->zvec_pool_check = pool_check;
+}
+
+/*
+ * See the block comment at the beginning of this file for details on
+ * each argument to this function.
+ */
+void
+zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
+ zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist,
+ boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys)
+{
+ zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
+
+ ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
+ ASSERT3U(ioc, <, ZFS_IOC_LAST);
+ ASSERT3P(vec->zvec_legacy_func, ==, NULL);
+ ASSERT3P(vec->zvec_func, ==, NULL);
+
+ /* if we are logging, the name must be valid */
+ ASSERT(!allow_log || namecheck != NO_NAME);
+
+ vec->zvec_name = name;
+ vec->zvec_func = func;
+ vec->zvec_secpolicy = secpolicy;
+ vec->zvec_namecheck = namecheck;
+ vec->zvec_pool_check = pool_check;
+ vec->zvec_smush_outnvlist = smush_outnvlist;
+ vec->zvec_allow_log = allow_log;
+ vec->zvec_nvl_keys = nvl_keys;
+ vec->zvec_nvl_key_count = num_keys;
+}
+
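+/*
+ * As an illustration, a hypothetical new ioctl would be wired up from
+ * zfs_ioctl_init() with a key table and a registration call like the
+ * following ("frobnicate", ZFS_IOC_FROBNICATE and zfs_ioc_frobnicate are
+ * made-up names, not part of the real interface):
+ *
+ *	static const zfs_ioc_key_t zfs_keys_frobnicate[] = {
+ *		{"level",	DATA_TYPE_UINT64,	0},
+ *		{"dryrun",	DATA_TYPE_BOOLEAN,	ZK_OPTIONAL},
+ *	};
+ *
+ *	zfs_ioctl_register("frobnicate", ZFS_IOC_FROBNICATE,
+ *	    zfs_ioc_frobnicate, zfs_secpolicy_config, DATASET_NAME,
+ *	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ *	    zfs_keys_frobnicate, ARRAY_SIZE(zfs_keys_frobnicate));
+ */
+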
+static void
+zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, boolean_t log_history,
+ zfs_ioc_poolcheck_t pool_check)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ POOL_NAME, log_history, pool_check);
+}
+
+void
+zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ DATASET_NAME, B_FALSE, pool_check);
+}
+
+static void
+zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
+{
+ zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config,
+ POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+}
+
+static void
+zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ NO_NAME, B_FALSE, POOL_CHECK_NONE);
+}
+
+static void
+zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
+ zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED);
+}
+
+static void
+zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
+{
+ zfs_ioctl_register_dataset_read_secpolicy(ioc, func,
+ zfs_secpolicy_read);
+}
+
+static void
+zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+}
+
+static void
+zfs_ioctl_init(void)
+{
+ zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
+ zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot));
+
+ zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
+ zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history));
+
+ zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
+ zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps));
+
+ zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
+ zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new));
+
+ zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
+ zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space));
+
+ zfs_ioctl_register("create", ZFS_IOC_CREATE,
+ zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_create, ARRAY_SIZE(zfs_keys_create));
+
+ zfs_ioctl_register("clone", ZFS_IOC_CLONE,
+ zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone));
+
+ zfs_ioctl_register("remap", ZFS_IOC_REMAP,
+ zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap));
+
+ zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
+ zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps));
+
+ zfs_ioctl_register("hold", ZFS_IOC_HOLD,
+ zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold));
+ zfs_ioctl_register("release", ZFS_IOC_RELEASE,
+ zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_release, ARRAY_SIZE(zfs_keys_release));
+
+ zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
+ zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds));
+
+ zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
+ zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback));
+
+ zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
+ zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark));
+
+ zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
+ zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks));
+
+ zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS,
+ zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props,
+ ARRAY_SIZE(zfs_keys_get_bookmark_props));
+
+ zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
+ zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
+ POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_destroy_bookmarks,
+ ARRAY_SIZE(zfs_keys_destroy_bookmarks));
+
+ zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW,
+ zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new));
+ zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY,
+ zfs_ioc_load_key, zfs_secpolicy_load_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE,
+ zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key));
+ zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY,
+ zfs_ioc_unload_key, zfs_secpolicy_load_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE,
+ zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key));
+ zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY,
+ zfs_ioc_change_key, zfs_secpolicy_change_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY,
+ B_TRUE, B_TRUE, zfs_keys_change_key,
+ ARRAY_SIZE(zfs_keys_change_key));
+
+ zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC,
+ zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync));
+ zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
+ zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE,
+ B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen));
+
+ zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM,
+ zfs_ioc_channel_program, zfs_secpolicy_config,
+ POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE,
+ B_TRUE, zfs_keys_channel_program,
+ ARRAY_SIZE(zfs_keys_channel_program));
+
+ zfs_ioctl_register("redact", ZFS_IOC_REDACT,
+ zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact));
+
+ zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT,
+ zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint));
+
+ zfs_ioctl_register("zpool_discard_checkpoint",
+ ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint,
+ zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_discard_checkpoint,
+ ARRAY_SIZE(zfs_keys_pool_discard_checkpoint));
+
+ zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE,
+ zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize));
+
+ zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM,
+ zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim));
+
+ zfs_ioctl_register("wait", ZFS_IOC_WAIT,
+ zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait));
+
+ zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS,
+ zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait));
+
+ zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV,
+ zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv));
+
+ zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV,
+ zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE,
+ zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv));
+
+ /* IOCTLS that use the legacy function signature */
+
+ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);
+
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
+ zfs_ioc_pool_scan);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
+ zfs_ioc_pool_upgrade);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
+ zfs_ioc_vdev_add);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
+ zfs_ioc_vdev_remove);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
+ zfs_ioc_vdev_set_state);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
+ zfs_ioc_vdev_attach);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
+ zfs_ioc_vdev_detach);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
+ zfs_ioc_vdev_setpath);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
+ zfs_ioc_vdev_setfru);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
+ zfs_ioc_pool_set_props);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
+ zfs_ioc_vdev_split);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
+ zfs_ioc_pool_reguid);
+
+ zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
+ zfs_ioc_pool_configs, zfs_secpolicy_none);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT,
+ zfs_ioc_pool_tryimport, zfs_secpolicy_config);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT,
+ zfs_ioc_inject_fault, zfs_secpolicy_inject);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT,
+ zfs_ioc_clear_fault, zfs_secpolicy_inject);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT,
+ zfs_ioc_inject_list_next, zfs_secpolicy_inject);
+
+ /*
+ * Pool destroy and export don't log history as part of
+ * zfsdev_ioctl; instead, zfs_ioc_pool_export does the logging
+ * of those commands itself.
+ */
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy,
+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export,
+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
+
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats,
+ zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props,
+ zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
+
+ zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
+ zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED);
+ zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
+ zfs_ioc_dsobj_to_dsname,
+ zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED);
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
+ zfs_ioc_pool_get_history,
+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
+
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import,
+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
+
+ zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY);
+
+ zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
+ zfs_ioc_space_written);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
+ zfs_ioc_objset_recvd_props);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
+ zfs_ioc_next_obj);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL,
+ zfs_ioc_get_fsacl);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS,
+ zfs_ioc_objset_stats);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS,
+ zfs_ioc_objset_zplprops);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT,
+ zfs_ioc_dataset_list_next);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT,
+ zfs_ioc_snapshot_list_next);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS,
+ zfs_ioc_send_progress);
+
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF,
+ zfs_ioc_diff, zfs_secpolicy_diff);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS,
+ zfs_ioc_obj_to_stats, zfs_secpolicy_diff);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH,
+ zfs_ioc_obj_to_path, zfs_secpolicy_diff);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE,
+ zfs_ioc_userspace_one, zfs_secpolicy_userspace_one);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY,
+ zfs_ioc_userspace_many, zfs_secpolicy_userspace_many);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
+ zfs_ioc_send, zfs_secpolicy_send);
+
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
+ zfs_secpolicy_none);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
+ zfs_secpolicy_destroy);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,
+ zfs_secpolicy_rename);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
+ zfs_secpolicy_recv);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
+ zfs_secpolicy_promote);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
+ zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
+ zfs_secpolicy_set_fsacl);
+
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
+ zfs_secpolicy_share, POOL_CHECK_NONE);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl,
+ zfs_secpolicy_smb_acl, POOL_CHECK_NONE);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE,
+ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT,
+ zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+
+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next,
+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE);
+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear,
+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE);
+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek,
+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE);
+
+ zfs_ioctl_init_os();
+}
+
+/*
+ * Verify that for non-legacy ioctls the input nvlist
+ * pairs match against the expected input.
+ *
+ * Possible errors are:
+ * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered
+ * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing
+ * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair
+ */
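+/*
+ * For example, with zfs_keys_pool_sync above, an innvl that carries
+ * "force" as a string fails with ZFS_ERR_IOC_ARG_BADTYPE, an innvl that
+ * omits "force" fails with ZFS_ERR_IOC_ARG_REQUIRED (it is not
+ * ZK_OPTIONAL), and an innvl that adds an unrecognized pair such as
+ * "bogus" fails with ZFS_ERR_IOC_ARG_UNAVAIL.
+ */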
+static int
+zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec)
+{
+ const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys;
+ boolean_t required_keys_found = B_FALSE;
+
+ /*
+ * examine each input pair
+ */
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ char *name = nvpair_name(pair);
+ data_type_t type = nvpair_type(pair);
+ boolean_t identified = B_FALSE;
+
+ /*
+ * check pair against the documented names and type
+ */
+ for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
+ /* if not a wild card name, check for an exact match */
+ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 &&
+ strcmp(nvl_keys[k].zkey_name, name) != 0)
+ continue;
+
+ identified = B_TRUE;
+
+ if (nvl_keys[k].zkey_type != DATA_TYPE_ANY &&
+ nvl_keys[k].zkey_type != type) {
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE));
+ }
+
+ if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
+ continue;
+
+ required_keys_found = B_TRUE;
+ break;
+ }
+
+ /* allow an 'optional' key, everything else is invalid */
+ if (!identified &&
+ (strcmp(name, "optional") != 0 ||
+ type != DATA_TYPE_NVLIST)) {
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL));
+ }
+ }
+
+ /* verify that all required keys were found */
+ for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
+ if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
+ continue;
+
+ if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) {
+ /* at least one non-optional key is expected here */
+ if (!required_keys_found)
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
+ continue;
+ }
+
+ if (!nvlist_exists(innvl, nvl_keys[k].zkey_name))
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
+ }
+
+ return (0);
+}
+
+static int
+pool_status_check(const char *name, zfs_ioc_namecheck_t type,
+ zfs_ioc_poolcheck_t check)
+{
+ spa_t *spa;
+ int error;
+
+ ASSERT(type == POOL_NAME || type == DATASET_NAME ||
+ type == ENTITY_NAME);
+
+ if (check & POOL_CHECK_NONE)
+ return (0);
+
+ error = spa_open(name, &spa, FTAG);
+ if (error == 0) {
+ if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa))
+ error = SET_ERROR(EAGAIN);
+ else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa))
+ error = SET_ERROR(EROFS);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+int
+zfsdev_getminor(int fd, minor_t *minorp)
+{
+ zfsdev_state_t *zs, *fpd;
+ zfs_file_t *fp;
+ int rc;
+
+ ASSERT(!MUTEX_HELD(&zfsdev_state_lock));
+
+ if ((rc = zfs_file_get(fd, &fp)))
+ return (rc);
+
+ fpd = zfs_file_private(fp);
+ if (fpd == NULL)
+ return (SET_ERROR(EBADF));
+
+ mutex_enter(&zfsdev_state_lock);
+
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+
+ if (zs->zs_minor == -1)
+ continue;
+
+ if (fpd == zs) {
+ *minorp = fpd->zs_minor;
+ mutex_exit(&zfsdev_state_lock);
+ return (0);
+ }
+ }
+
+ mutex_exit(&zfsdev_state_lock);
+
+ return (SET_ERROR(EBADF));
+}
+
+static void *
+zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which)
+{
+ zfsdev_state_t *zs;
+
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ if (zs->zs_minor == minor) {
+ smp_rmb();
+ switch (which) {
+ case ZST_ONEXIT:
+ return (zs->zs_onexit);
+ case ZST_ZEVENT:
+ return (zs->zs_zevent);
+ case ZST_ALL:
+ return (zs);
+ }
+ }
+ }
+
+ return (NULL);
+}
+
+void *
+zfsdev_get_state(minor_t minor, enum zfsdev_state_type which)
+{
+ void *ptr;
+
+ ptr = zfsdev_get_state_impl(minor, which);
+
+ return (ptr);
+}
+
+/*
+ * Find a free minor number. The zfsdev_state_list is expected to
+ * be short since it is only a list of currently open file handles.
+ */
+minor_t
+zfsdev_minor_alloc(void)
+{
+ static minor_t last_minor = 0;
+ minor_t m;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ for (m = last_minor + 1; m != last_minor; m++) {
+ if (m > ZFSDEV_MAX_MINOR)
+ m = 1;
+ if (zfsdev_get_state_impl(m, ZST_ALL) == NULL) {
+ last_minor = m;
+ return (m);
+ }
+ }
+
+ return (0);
+}
+
+long
+zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag)
+{
+ int error, cmd;
+ const zfs_ioc_vec_t *vec;
+ char *saved_poolname = NULL;
+ uint64_t max_nvlist_src_size;
+ size_t saved_poolname_len = 0;
+ nvlist_t *innvl = NULL;
+ fstrans_cookie_t cookie;
+ hrtime_t start_time = gethrtime();
+
+ cmd = vecnum;
+ error = 0;
+ if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
+ return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL));
+
+ vec = &zfs_ioc_vec[vecnum];
+
+ /*
+ * The registered ioctl list may be sparse; verify that either
+ * a normal or a legacy handler is registered.
+ */
+ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL)
+ return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL));
+
+ zc->zc_iflags = flag & FKIOCTL;
+ max_nvlist_src_size = zfs_max_nvlist_src_size_os();
+ if (zc->zc_nvlist_src_size > max_nvlist_src_size) {
+ /*
+ * Make sure the user doesn't pass in an insane value for
+ * zc_nvlist_src_size. We have to check, since we will end
+ * up allocating that much memory inside of get_nvlist(). This
+ * prevents a nefarious user from allocating tons of kernel
+ * memory.
+ *
+ * Also, we return EINVAL instead of ENOMEM here. The reason
+ * being that returning ENOMEM from an ioctl() has a special
+ * connotation; that the user's size value is too small and
+ * needs to be expanded to hold the nvlist. See
+ * zcmd_expand_dst_nvlist() for details.
+ */
+ error = SET_ERROR(EINVAL); /* User's size too big */
+
+ } else if (zc->zc_nvlist_src_size != 0) {
+ error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &innvl);
+ if (error != 0)
+ goto out;
+ }
+
+ /*
+ * Ensure that all pool/dataset names are valid before we pass down to
+ * the lower layers.
+ */
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ switch (vec->zvec_namecheck) {
+ case POOL_NAME:
+ if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = SET_ERROR(EINVAL);
+ else
+ error = pool_status_check(zc->zc_name,
+ vec->zvec_namecheck, vec->zvec_pool_check);
+ break;
+
+ case DATASET_NAME:
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = SET_ERROR(EINVAL);
+ else
+ error = pool_status_check(zc->zc_name,
+ vec->zvec_namecheck, vec->zvec_pool_check);
+ break;
+
+ case ENTITY_NAME:
+ if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) {
+ error = SET_ERROR(EINVAL);
+ } else {
+ error = pool_status_check(zc->zc_name,
+ vec->zvec_namecheck, vec->zvec_pool_check);
+ }
+ break;
+
+ case NO_NAME:
+ break;
+ }
+ /*
+ * Ensure that all input pairs are valid before we pass them down
+ * to the lower layers.
+ *
+ * The vectored functions can use fnvlist_lookup_{type} for any
+ * required pairs since zfs_check_input_nvpairs() confirmed that
+ * they exist and are of the correct type.
+ */
+ if (error == 0 && vec->zvec_func != NULL) {
+ error = zfs_check_input_nvpairs(innvl, vec);
+ if (error != 0)
+ goto out;
+ }
+
+ if (error == 0) {
+ cookie = spl_fstrans_mark();
+ error = vec->zvec_secpolicy(zc, innvl, CRED());
+ spl_fstrans_unmark(cookie);
+ }
+
+ if (error != 0)
+ goto out;
+
+ /* legacy ioctls can modify zc_name */
+ /*
+ * Can't use kmem_strdup() as we might truncate the string and
+ * kmem_strfree() would then free with incorrect size.
+ */
+ saved_poolname_len = strlen(zc->zc_name) + 1;
+ saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP);
+
+ strlcpy(saved_poolname, zc->zc_name, saved_poolname_len);
+ saved_poolname[strcspn(saved_poolname, "/@#")] = '\0';
+
+ if (vec->zvec_func != NULL) {
+ nvlist_t *outnvl;
+ int puterror = 0;
+ spa_t *spa;
+ nvlist_t *lognv = NULL;
+
+ ASSERT(vec->zvec_legacy_func == NULL);
+
+ /*
+ * Add the innvl to the lognv before calling the func,
+ * in case the func changes the innvl.
+ */
+ if (vec->zvec_allow_log) {
+ lognv = fnvlist_alloc();
+ fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL,
+ vec->zvec_name);
+ if (!nvlist_empty(innvl)) {
+ fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL,
+ innvl);
+ }
+ }
+
+ outnvl = fnvlist_alloc();
+ cookie = spl_fstrans_mark();
+ error = vec->zvec_func(zc->zc_name, innvl, outnvl);
+ spl_fstrans_unmark(cookie);
+
+ /*
+ * Some commands can partially execute, modify state, and still
+ * return an error. In these cases, attempt to record what
+ * was modified.
+ */
+ if ((error == 0 ||
+ (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) &&
+ vec->zvec_allow_log &&
+ spa_open(zc->zc_name, &spa, FTAG) == 0) {
+ if (!nvlist_empty(outnvl)) {
+ size_t out_size = fnvlist_size(outnvl);
+ if (out_size > zfs_history_output_max) {
+ fnvlist_add_int64(lognv,
+ ZPOOL_HIST_OUTPUT_SIZE, out_size);
+ } else {
+ fnvlist_add_nvlist(lognv,
+ ZPOOL_HIST_OUTPUT_NVL, outnvl);
+ }
+ }
+ if (error != 0) {
+ fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO,
+ error);
+ }
+ fnvlist_add_int64(lognv, ZPOOL_HIST_ELAPSED_NS,
+ gethrtime() - start_time);
+ (void) spa_history_log_nvl(spa, lognv);
+ spa_close(spa, FTAG);
+ }
+ fnvlist_free(lognv);
+
+ if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) {
+ int smusherror = 0;
+ if (vec->zvec_smush_outnvlist) {
+ smusherror = nvlist_smush(outnvl,
+ zc->zc_nvlist_dst_size);
+ }
+ if (smusherror == 0)
+ puterror = put_nvlist(zc, outnvl);
+ }
+
+ if (puterror != 0)
+ error = puterror;
+
+ nvlist_free(outnvl);
+ } else {
+ cookie = spl_fstrans_mark();
+ error = vec->zvec_legacy_func(zc);
+ spl_fstrans_unmark(cookie);
+ }
+
+out:
+ nvlist_free(innvl);
+ if (error == 0 && vec->zvec_allow_log) {
+ char *s = tsd_get(zfs_allow_log_key);
+ if (s != NULL)
+ kmem_strfree(s);
+ (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname));
+ }
+ if (saved_poolname != NULL)
+ kmem_free(saved_poolname, saved_poolname_len);
+
+ return (error);
+}
+
+int
+zfs_kmod_init(void)
+{
+ int error;
+
+ if ((error = zvol_init()) != 0)
+ return (error);
+
+ spa_init(SPA_MODE_READ | SPA_MODE_WRITE);
+ zfs_init();
+
+ zfs_ioctl_init();
+
+ mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
+ zfsdev_state_list->zs_minor = -1;
+
+ if ((error = zfsdev_attach()) != 0)
+ goto out;
+
+ tsd_create(&zfs_fsyncer_key, NULL);
+ tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+ tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
+
+ return (0);
+out:
+ zfs_fini();
+ spa_fini();
+ zvol_fini();
+
+ return (error);
+}
+
+void
+zfs_kmod_fini(void)
+{
+ zfsdev_state_t *zs, *zsnext = NULL;
+
+ zfsdev_detach();
+
+ mutex_destroy(&zfsdev_state_lock);
+
+ for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) {
+ zsnext = zs->zs_next;
+ if (zs->zs_onexit)
+ zfs_onexit_destroy(zs->zs_onexit);
+ if (zs->zs_zevent)
+ zfs_zevent_destroy(zs->zs_zevent);
+ kmem_free(zs, sizeof (zfsdev_state_t));
+ }
+
+ zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */
+ zfs_fini();
+ spa_fini();
+ zvol_fini();
+
+ tsd_destroy(&zfs_fsyncer_key);
+ tsd_destroy(&rrw_tsd_key);
+ tsd_destroy(&zfs_allow_log_key);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW,
+ "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls");
+
+ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW,
+ "Maximum size in bytes of ZFS ioctl output that will be logged");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c
new file mode 100644
index 000000000000..4bb529f78838
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_log.c
@@ -0,0 +1,781 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2018 by Delphix. All rights reserved.
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/byteorder.h>
+#include <sys/policy.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+#include <sys/spa.h>
+#include <sys/zfs_fuid.h>
+#include <sys/dsl_dataset.h>
+
+/*
+ * These zfs_log_* functions must be called within a dmu tx, in one
+ * of 2 contexts depending on zilog->z_replay:
+ *
+ * Non replay mode
+ * ---------------
+ * We need to record the transaction so that if it is committed to
+ * the Intent Log then it can be replayed. An intent log transaction
+ * structure (itx_t) is allocated and all the information necessary to
+ * possibly replay the transaction is saved in it. The itx is then assigned
+ * a sequence number and inserted in the in-memory list anchored in the zilog.
+ *
+ * Replay mode
+ * -----------
+ * We need to mark the intent log record as replayed in the log header.
+ * This is done in the same transaction as the replay so that they
+ * commit atomically.
+ */
+
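+/*
+ * A condensed sketch of the non-replay calling pattern (taken loosely
+ * from the zfs_create() path; locking, SA setup and error handling are
+ * omitted):
+ *
+ *	tx = dmu_tx_create(os);
+ *	... dmu_tx_hold_*() as needed ...
+ *	error = dmu_tx_assign(tx, TXG_WAIT);
+ *	... create the znode, update SAs ...
+ *	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ *	zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, fuidp, vap);
+ *	dmu_tx_commit(tx);
+ *
+ * zil_commit() is only called later, and only if the operation must be
+ * synchronous (e.g. O_SYNC or an explicit fsync).
+ */
+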
+int
+zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
+{
+ int isxvattr = (vap->va_mask & ATTR_XVATTR);
+ switch (type) {
+ case Z_FILE:
+ if (vsecp == NULL && !isxvattr)
+ return (TX_CREATE);
+ if (vsecp && isxvattr)
+ return (TX_CREATE_ACL_ATTR);
+ if (vsecp)
+ return (TX_CREATE_ACL);
+ else
+ return (TX_CREATE_ATTR);
+ /*NOTREACHED*/
+ case Z_DIR:
+ if (vsecp == NULL && !isxvattr)
+ return (TX_MKDIR);
+ if (vsecp && isxvattr)
+ return (TX_MKDIR_ACL_ATTR);
+ if (vsecp)
+ return (TX_MKDIR_ACL);
+ else
+ return (TX_MKDIR_ATTR);
+ case Z_XATTRDIR:
+ return (TX_MKXATTR);
+ }
+ ASSERT(0);
+ return (TX_MAX_TYPE);
+}
+
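+/*
+ * For example, creating a regular file with both an explicit ACL and
+ * xvattr attributes (vsecp != NULL and ATTR_XVATTR set) maps to
+ * TX_CREATE_ACL_ATTR, while a plain mkdir with neither maps to TX_MKDIR.
+ */
+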
+/*
+ * Build up the log data necessary for logging an xvattr_t.
+ * First the lr_attr_t is initialized.  Following the lr_attr_t
+ * are the mapsize and the attribute bitmap copied from the xvattr_t.
+ * Following the bitmap and bitmapsize, two 64-bit words are reserved
+ * for the create time, which may be set.  Following the create time
+ * is a single 64-bit integer which holds the attribute bits to set
+ * on replay for the xvattr.
+ */
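+/*
+ * Resulting record layout (a sketch of the description above):
+ *
+ *	lr_attr_t		masksize plus first bitmap word
+ *	bitmap words		xva_mapsize words from xva_reqattrmap[]
+ *	uint64_t attrs		XAT0_* bits to apply on replay
+ *	uint64_t crtime[2]	create time, if XAT_CREATETIME is set
+ *	scanstamp / projid	AV_SCANSTAMP_SZ bytes, shared space
+ */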
+static void
+zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
+{
+ uint32_t *bitmap;
+ uint64_t *attrs;
+ uint64_t *crtime;
+ xoptattr_t *xoap;
+ void *scanstamp;
+ int i;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ lrattr->lr_attr_masksize = xvap->xva_mapsize;
+ bitmap = &lrattr->lr_attr_bitmap;
+ for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
+ *bitmap = xvap->xva_reqattrmap[i];
+ }
+
+ /* Now pack the attributes up in a single uint64_t */
+ attrs = (uint64_t *)bitmap;
+ crtime = attrs + 1;
+ scanstamp = (caddr_t)(crtime + 2);
+ *attrs = 0;
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY))
+ *attrs |= (xoap->xoa_readonly == 0) ? 0 :
+ XAT0_READONLY;
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
+ *attrs |= (xoap->xoa_hidden == 0) ? 0 :
+ XAT0_HIDDEN;
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
+ *attrs |= (xoap->xoa_system == 0) ? 0 :
+ XAT0_SYSTEM;
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
+ *attrs |= (xoap->xoa_archive == 0) ? 0 :
+ XAT0_ARCHIVE;
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
+ *attrs |= (xoap->xoa_immutable == 0) ? 0 :
+ XAT0_IMMUTABLE;
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
+ *attrs |= (xoap->xoa_nounlink == 0) ? 0 :
+ XAT0_NOUNLINK;
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
+ *attrs |= (xoap->xoa_appendonly == 0) ? 0 :
+ XAT0_APPENDONLY;
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
+ *attrs |= (xoap->xoa_opaque == 0) ? 0 :
+ XAT0_OPAQUE;
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
+ *attrs |= (xoap->xoa_nodump == 0) ? 0 :
+ XAT0_NODUMP;
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
+ *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
+ XAT0_AV_QUARANTINED;
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
+ *attrs |= (xoap->xoa_av_modified == 0) ? 0 :
+ XAT0_AV_MODIFIED;
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID));
+
+ bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
+ } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ /*
+ * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid
+ * at the same time, so we can share the same space.
+ */
+ bcopy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t));
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+ *attrs |= (xoap->xoa_reparse == 0) ? 0 :
+ XAT0_REPARSE;
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ *attrs |= (xoap->xoa_offline == 0) ? 0 :
+ XAT0_OFFLINE;
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ *attrs |= (xoap->xoa_sparse == 0) ? 0 :
+ XAT0_SPARSE;
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT))
+ *attrs |= (xoap->xoa_projinherit == 0) ? 0 :
+ XAT0_PROJINHERIT;
+}
+
+static void *
+zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
+{
+ zfs_fuid_t *zfuid;
+ uint64_t *fuidloc = start;
+
+ /* First copy in the ACE FUIDs */
+ for (zfuid = list_head(&fuidp->z_fuids); zfuid;
+ zfuid = list_next(&fuidp->z_fuids, zfuid)) {
+ *fuidloc++ = zfuid->z_logfuid;
+ }
+ return (fuidloc);
+}
+
+
+static void *
+zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
+{
+ zfs_fuid_domain_t *zdomain;
+
+ /* now copy in the domain info, if any */
+ if (fuidp->z_domain_str_sz != 0) {
+ for (zdomain = list_head(&fuidp->z_domains); zdomain;
+ zdomain = list_next(&fuidp->z_domains, zdomain)) {
+ bcopy((void *)zdomain->z_domain, start,
+ strlen(zdomain->z_domain) + 1);
+ start = (caddr_t)start +
+ strlen(zdomain->z_domain) + 1;
+ }
+ }
+ return (start);
+}
+
+/*
+ * If zp is an xattr node, check whether the xattr owner is unlinked.
+ * We don't want to log anything if the owner is unlinked.
+ */
+static int
+zfs_xattr_owner_unlinked(znode_t *zp)
+{
+ int unlinked = 0;
+ znode_t *dzp;
+#ifdef __FreeBSD__
+ znode_t *tzp = zp;
+
+ /*
+ * zrele drops the vnode lock which violates the VOP locking contract
+ * on FreeBSD. See comment at the top of zfs_replay.c for more detail.
+ */
+ /*
+ * If zp is an XATTR node, keep walking up via z_xattr_parent until
+ * we get the owner.
+ */
+ while (tzp->z_pflags & ZFS_XATTR) {
+ ASSERT3U(zp->z_xattr_parent, !=, 0);
+ if (zfs_zget(ZTOZSB(tzp), tzp->z_xattr_parent, &dzp) != 0) {
+ unlinked = 1;
+ break;
+ }
+
+ if (tzp != zp)
+ zrele(tzp);
+ tzp = dzp;
+ unlinked = tzp->z_unlinked;
+ }
+ if (tzp != zp)
+ zrele(tzp);
+#else
+ zhold(zp);
+ /*
+ * If zp is an XATTR node, keep walking up via z_xattr_parent until
+ * we get the owner.
+ */
+ while (zp->z_pflags & ZFS_XATTR) {
+ ASSERT3U(zp->z_xattr_parent, !=, 0);
+ if (zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, &dzp) != 0) {
+ unlinked = 1;
+ break;
+ }
+
+ zrele(zp);
+ zp = dzp;
+ unlinked = zp->z_unlinked;
+ }
+ zrele(zp);
+#endif
+ return (unlinked);
+}
+
+/*
+ * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
+ * TX_MKXATTR transactions.
+ *
+ * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
+ * domain information appended prior to the name. In this case the
+ * uid/gid in the log record will be a log-centric FUID.
+ *
+ * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
+ * may contain attributes, an ACL and optional FUID information.
+ *
+ * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
+ * an ACL and normal users/groups in the ACEs.
+ *
+ * There may be optional xvattr attribute information, similar
+ * to zfs_log_setattr.
+ *
+ * Also, "domain" strings may be appended after the file name.
+ */
+void
+zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, const char *name, vsecattr_t *vsecp,
+ zfs_fuid_info_t *fuidp, vattr_t *vap)
+{
+ itx_t *itx;
+ lr_create_t *lr;
+ lr_acl_create_t *lracl;
+ size_t aclsize = 0;
+ size_t xvatsize = 0;
+ size_t txsize;
+ xvattr_t *xvap = (xvattr_t *)vap;
+ void *end;
+ size_t lrsize;
+ size_t namesize = strlen(name) + 1;
+ size_t fuidsz = 0;
+
+ if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp))
+ return;
+
+ /*
+ * If FUIDs are present, add in space for the domains and any
+ * ACE FUIDs.
+ */
+ if (fuidp) {
+ fuidsz += fuidp->z_domain_str_sz;
+ fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
+ }
+
+ if (vap->va_mask & ATTR_XVATTR)
+ xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
+
+ if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
+ (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
+ (int)txtype == TX_MKXATTR) {
+ txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
+ lrsize = sizeof (*lr);
+ } else {
+ txsize =
+ sizeof (lr_acl_create_t) + namesize + fuidsz +
+ ZIL_ACE_LENGTH(aclsize) + xvatsize;
+ lrsize = sizeof (lr_acl_create_t);
+ }
+
+ itx = zil_itx_create(txtype, txsize);
+
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_foid = zp->z_id;
+ /* Store dnode slot count in 8 bits above object id. */
+ LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT);
+ lr->lr_mode = zp->z_mode;
+ if (!IS_EPHEMERAL(KUID_TO_SUID(ZTOUID(zp)))) {
+ lr->lr_uid = (uint64_t)KUID_TO_SUID(ZTOUID(zp));
+ } else {
+ lr->lr_uid = fuidp->z_fuid_owner;
+ }
+ if (!IS_EPHEMERAL(KGID_TO_SGID(ZTOGID(zp)))) {
+ lr->lr_gid = (uint64_t)KGID_TO_SGID(ZTOGID(zp));
+ } else {
+ lr->lr_gid = fuidp->z_fuid_group;
+ }
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
+
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(zp)), &lr->lr_rdev,
+ sizeof (lr->lr_rdev)) != 0)
+ lr->lr_rdev = 0;
+
+ /*
+ * Fill in xvattr info if any
+ */
+ if (vap->va_mask & ATTR_XVATTR) {
+ zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
+ end = (caddr_t)lr + lrsize + xvatsize;
+ } else {
+ end = (caddr_t)lr + lrsize;
+ }
+
+ /* Now fill in any ACL info */
+
+ if (vsecp) {
+ lracl = (lr_acl_create_t *)&itx->itx_lr;
+ lracl->lr_aclcnt = vsecp->vsa_aclcnt;
+ lracl->lr_acl_bytes = aclsize;
+ lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
+ lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
+ if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS)
+ lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
+ else
+ lracl->lr_acl_flags = 0;
+
+ bcopy(vsecp->vsa_aclentp, end, aclsize);
+ end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
+ }
+
+ /* drop in FUID info */
+ if (fuidp) {
+ end = zfs_log_fuid_ids(fuidp, end);
+ end = zfs_log_fuid_domains(fuidp, end);
+ }
+ /*
+ * Now place file name in log record
+ */
+ bcopy(name, end, namesize);
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles both TX_REMOVE and TX_RMDIR transactions.
+ */
+void
+zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, const char *name, uint64_t foid, boolean_t unlinked)
+{
+ itx_t *itx;
+ lr_remove_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_remove_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ itx->itx_oid = foid;
+
+ /*
+ * Object ids can be re-instantiated in the next txg so
+ * remove any async transactions to avoid future leaks.
+ * This can happen if an fsync occurs on the re-instantiated
+ * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
+ * the new file data and flushes a write record for the old object.
+ */
+ if (unlinked) {
+ ASSERT((txtype & ~TX_CI) == TX_REMOVE);
+ zil_remove_async(zilog, foid);
+ }
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_LINK transactions.
+ */
+void
+zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, const char *name)
+{
+ itx_t *itx;
+ lr_link_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_link_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_link_obj = zp->z_id;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_SYMLINK transactions.
+ */
+void
+zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, const char *name, const char *link)
+{
+ itx_t *itx;
+ lr_create_t *lr;
+ size_t namesize = strlen(name) + 1;
+ size_t linksize = strlen(link) + 1;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_foid = zp->z_id;
+ lr->lr_uid = KUID_TO_SUID(ZTOUID(zp));
+ lr->lr_gid = KGID_TO_SGID(ZTOGID(zp));
+ lr->lr_mode = zp->z_mode;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
+ bcopy(name, (char *)(lr + 1), namesize);
+ bcopy(link, (char *)(lr + 1) + namesize, linksize);
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_RENAME transactions.
+ */
+void
+zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp,
+ const char *sname, znode_t *tdzp, const char *dname, znode_t *szp)
+{
+ itx_t *itx;
+ lr_rename_t *lr;
+ size_t snamesize = strlen(sname) + 1;
+ size_t dnamesize = strlen(dname) + 1;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
+ lr = (lr_rename_t *)&itx->itx_lr;
+ lr->lr_sdoid = sdzp->z_id;
+ lr->lr_tdoid = tdzp->z_id;
+ bcopy(sname, (char *)(lr + 1), snamesize);
+ bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+ itx->itx_oid = szp->z_id;
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * zfs_log_write() handles TX_WRITE transactions. The specified callback is
+ * called as soon as the write is on stable storage (be it via a DMU sync or a
+ * ZIL commit).
+ */
+long zfs_immediate_write_sz = 32768;
+
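+/*
+ * Roughly, the write record type is chosen as follows (a summary of the
+ * logic below, with the default zfs_immediate_write_sz of 32 KiB):
+ *
+ *	logbias=throughput		-> WR_INDIRECT
+ *	no slog, resid >= 32 KiB	-> WR_INDIRECT (data stays in place;
+ *					   the ZIL points at the block)
+ *	O_SYNC/O_DSYNC write		-> WR_COPIED (data copied into the
+ *					   log record now)
+ *	otherwise			-> WR_NEED_COPY (copied only if and
+ *					   when the itx must be committed)
+ */
+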
+void
+zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t resid, int ioflag,
+ zil_callback_t callback, void *callback_data)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+ uint32_t blocksize = zp->z_blksz;
+ itx_wr_state_t write_state;
+ uintptr_t fsync_cnt;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked ||
+ zfs_xattr_owner_unlinked(zp)) {
+ if (callback != NULL)
+ callback(callback_data);
+ return;
+ }
+
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ write_state = WR_INDIRECT;
+ else if (!spa_has_slogs(zilog->zl_spa) &&
+ resid >= zfs_immediate_write_sz)
+ write_state = WR_INDIRECT;
+ else if (ioflag & (O_SYNC | O_DSYNC))
+ write_state = WR_COPIED;
+ else
+ write_state = WR_NEED_COPY;
+
+ if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
+ (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
+ }
+
+ while (resid) {
+ itx_t *itx;
+ lr_write_t *lr;
+ itx_wr_state_t wr_state = write_state;
+ ssize_t len = resid;
+
+ /*
+ * A WR_COPIED record must fit entirely in one log block.
+ * Large writes can use WR_NEED_COPY, which the ZIL will
+ * split into multiple records across several log blocks
+ * if necessary.
+ */
+ if (wr_state == WR_COPIED &&
+ resid > zil_max_copied_data(zilog))
+ wr_state = WR_NEED_COPY;
+ else if (wr_state == WR_INDIRECT)
+ len = MIN(blocksize - P2PHASE(off, blocksize), resid);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) +
+ (wr_state == WR_COPIED ? len : 0));
+ lr = (lr_write_t *)&itx->itx_lr;
+
+ /*
+ * For WR_COPIED records, copy the data into the lr_write_t.
+ */
+ if (wr_state == WR_COPIED) {
+ int err;
+ DB_DNODE_ENTER(db);
+ err = dmu_read_by_dnode(DB_DNODE(db), off, len, lr + 1,
+ DMU_READ_NO_PREFETCH);
+ if (err != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ wr_state = WR_NEED_COPY;
+ }
+ DB_DNODE_EXIT(db);
+ }
+
+ itx->itx_wr_state = wr_state;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ itx->itx_private = ZTOZSB(zp);
+
+ if (!(ioflag & (O_SYNC | O_DSYNC)) && (zp->z_sync_cnt == 0) &&
+ (fsync_cnt == 0))
+ itx->itx_sync = B_FALSE;
+
+ itx->itx_callback = callback;
+ itx->itx_callback_data = callback_data;
+ zil_itx_assign(zilog, itx, tx);
+
+ off += len;
+ resid -= len;
+ }
+}
+
+/*
+ * Handles TX_TRUNCATE transactions.
+ */
+void
+zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, uint64_t off, uint64_t len)
+{
+ itx_t *itx;
+ lr_truncate_t *lr;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked ||
+ zfs_xattr_owner_unlinked(zp))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_truncate_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_SETATTR transactions.
+ */
+void
+zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
+{
+ itx_t *itx;
+ lr_setattr_t *lr;
+ xvattr_t *xvap = (xvattr_t *)vap;
+ size_t recsize = sizeof (lr_setattr_t);
+ void *start;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
+ return;
+
+ /*
+ * If XVATTR set, then log record size needs to allow
+ * for lr_attr_t + xvattr mask, mapsize and create time
+ * plus actual attribute values
+ */
+ if (vap->va_mask & ATTR_XVATTR)
+ recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize);
+
+ if (fuidp)
+ recsize += fuidp->z_domain_str_sz;
+
+ itx = zil_itx_create(txtype, recsize);
+ lr = (lr_setattr_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_mask = (uint64_t)mask_applied;
+ lr->lr_mode = (uint64_t)vap->va_mode;
+ if ((mask_applied & ATTR_UID) && IS_EPHEMERAL(vap->va_uid))
+ lr->lr_uid = fuidp->z_fuid_owner;
+ else
+ lr->lr_uid = (uint64_t)vap->va_uid;
+
+ if ((mask_applied & ATTR_GID) && IS_EPHEMERAL(vap->va_gid))
+ lr->lr_gid = fuidp->z_fuid_group;
+ else
+ lr->lr_gid = (uint64_t)vap->va_gid;
+
+ lr->lr_size = (uint64_t)vap->va_size;
+ ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
+ ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
+ start = (lr_setattr_t *)(lr + 1);
+ if (vap->va_mask & ATTR_XVATTR) {
+ zfs_log_xvattr((lr_attr_t *)start, xvap);
+ start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize);
+ }
+
+ /*
+ * Now stick on domain information if any on end
+ */
+
+ if (fuidp)
+ (void) zfs_log_fuid_domains(fuidp, start);
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_ACL transactions.
+ */
+void
+zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
+ vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
+{
+ itx_t *itx;
+ lr_acl_v0_t *lrv0;
+ lr_acl_t *lr;
+ int txtype;
+ int lrsize;
+ size_t txsize;
+ size_t aclbytes = vsecp->vsa_aclentsz;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
+ return;
+
+ txtype = (ZTOZSB(zp)->z_version < ZPL_VERSION_FUID) ?
+ TX_ACL_V0 : TX_ACL;
+
+ if (txtype == TX_ACL)
+ lrsize = sizeof (*lr);
+ else
+ lrsize = sizeof (*lrv0);
+
+ txsize = lrsize +
+ ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
+ (fuidp ? fuidp->z_domain_str_sz : 0) +
+ sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0);
+
+ itx = zil_itx_create(txtype, txsize);
+
+ lr = (lr_acl_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ if (txtype == TX_ACL) {
+ lr->lr_acl_bytes = aclbytes;
+ lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
+ lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
+ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
+ lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
+ else
+ lr->lr_acl_flags = 0;
+ }
+ lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;
+
+ if (txtype == TX_ACL_V0) {
+ lrv0 = (lr_acl_v0_t *)lr;
+ bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
+ } else {
+ void *start = (ace_t *)(lr + 1);
+
+ bcopy(vsecp->vsa_aclentp, start, aclbytes);
+
+ start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);
+
+ if (fuidp) {
+ start = zfs_log_fuid_ids(fuidp, start);
+ (void) zfs_log_fuid_domains(fuidp, start);
+ }
+ }
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, LONG, ZMOD_RW,
+ "Largest data block to write to zil");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zfs_onexit.c b/sys/contrib/openzfs/module/zfs/zfs_onexit.c
new file mode 100644
index 000000000000..2a1332e715ee
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_onexit.c
@@ -0,0 +1,173 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+
+/*
+ * ZFS kernel routines may add/delete callback routines to be invoked
+ * upon process exit (triggered via the close operation from the /dev/zfs
+ * driver).
+ *
+ * These cleanup callbacks are intended to allow for the accumulation
+ * of kernel state across multiple ioctls. User processes participate
+ * simply by opening ZFS_DEV. This causes the ZFS driver to create
+ * some private data for the file descriptor and to generate a unique
+ * minor number. The process then passes along that file descriptor to
+ * each ioctl that might have a cleanup operation.
+ *
+ * Consumers of the onexit routines should call zfs_onexit_fd_hold() early
+ * on to validate the given fd and add a reference to its file table entry.
+ * This allows the consumer to do its work and then add a callback, knowing
+ * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers
+ * should call zfs_onexit_fd_rele().
+ *
+ * A simple example is zfs_ioc_recv(), where we might create an AVL tree
+ * with dataset/GUID mappings and then reuse that tree on subsequent
+ * zfs_ioc_recv() calls.
+ *
+ * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc()
+ * the AVL tree and pass it along with a callback function to
+ * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the
+ * callback and return an action handle.
+ *
+ * The action handle is then passed from user space to subsequent
+ * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree
+ * by calling zfs_onexit_cb_data() with the device minor number and
+ * action handle.
+ *
+ * If the user process exits abnormally, the callback is invoked implicitly
+ * as part of the driver close operation. Once the user space process is
+ * finished with the accumulated kernel state, it can also just call close(2)
+ * on the cleanup fd to trigger the cleanup callback.
+ */
+
+void
+zfs_onexit_init(zfs_onexit_t **zop)
+{
+ zfs_onexit_t *zo;
+
+ zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP);
+ mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t),
+ offsetof(zfs_onexit_action_node_t, za_link));
+}
+
+void
+zfs_onexit_destroy(zfs_onexit_t *zo)
+{
+ zfs_onexit_action_node_t *ap;
+
+ mutex_enter(&zo->zo_lock);
+ while ((ap = list_head(&zo->zo_actions)) != NULL) {
+ list_remove(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ ap->za_func(ap->za_data);
+ kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+ mutex_enter(&zo->zo_lock);
+ }
+ mutex_exit(&zo->zo_lock);
+
+ list_destroy(&zo->zo_actions);
+ mutex_destroy(&zo->zo_lock);
+ kmem_free(zo, sizeof (zfs_onexit_t));
+}
+
+/*
+ * Consumers might need to operate by minor number instead of fd, since
+ * they might be running in another thread (e.g. txg_sync_thread). Callers
+ * of this function must call zfs_onexit_fd_rele() when they're finished
+ * using the minor number.
+ */
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+ zfs_onexit_t *zo = NULL;
+ int error;
+
+ error = zfsdev_getminor(fd, minorp);
+ if (error) {
+ zfs_onexit_fd_rele(fd);
+ return (error);
+ }
+
+ zo = zfsdev_get_state(*minorp, ZST_ONEXIT);
+ if (zo == NULL) {
+ zfs_onexit_fd_rele(fd);
+ return (SET_ERROR(EBADF));
+ }
+ return (0);
+}
+
+void
+zfs_onexit_fd_rele(int fd)
+{
+ zfs_file_put(fd);
+}
+
+static int
+zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
+{
+ *zo = zfsdev_get_state(minor, ZST_ONEXIT);
+ if (*zo == NULL)
+ return (SET_ERROR(EBADF));
+
+ return (0);
+}
+
+/*
+ * Add a callback to be invoked when the calling process exits.
+ */
+int
+zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP);
+ list_link_init(&ap->za_link);
+ ap->za_func = func;
+ ap->za_data = data;
+
+ mutex_enter(&zo->zo_lock);
+ list_insert_tail(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ if (action_handle)
+ *action_handle = (uint64_t)(uintptr_t)ap;
+
+ return (0);
+}
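
The contract described in the comment at the top of this file (hold the cleanup fd, register a callback, then release the fd and let the driver close path run the callback) can be sketched as follows. This is a hypothetical in-kernel consumer written only against the zfs_onexit_fd_hold(), zfs_onexit_add_cb(), and zfs_onexit_fd_rele() signatures above; the state type and function names are invented for illustration, it compiles only in-tree, and error handling is trimmed.

/*
 * Hypothetical consumer sketch (not part of this patch): accumulate some
 * per-process kernel state on the first ioctl and let the driver close
 * path clean it up.
 */
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/zfs_onexit.h>

typedef struct my_state { int ms_dummy; } my_state_t;	/* assumed type */

static void
my_state_cleanup(void *arg)
{
	kmem_free(arg, sizeof (my_state_t));
}

static int
my_ioctl_first_call(int cleanup_fd, uint64_t *action_handle)
{
	minor_t minor;
	my_state_t *ms;
	int error;

	/* Validate the fd and take a reference on its file table entry. */
	error = zfs_onexit_fd_hold(cleanup_fd, &minor);
	if (error != 0)
		return (error);

	ms = kmem_zalloc(sizeof (my_state_t), KM_SLEEP);

	/* Run my_state_cleanup(ms) when the process closes ZFS_DEV. */
	error = zfs_onexit_add_cb(minor, my_state_cleanup, ms, action_handle);
	if (error != 0)
		kmem_free(ms, sizeof (my_state_t));

	zfs_onexit_fd_rele(cleanup_fd);
	return (error);
}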
diff --git a/sys/contrib/openzfs/module/zfs/zfs_quota.c b/sys/contrib/openzfs/module/zfs/zfs_quota.c
new file mode 100644
index 000000000000..e61db5c7ab83
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_quota.c
@@ -0,0 +1,476 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ * Copyright (c) 2012, 2015, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/avl.h>
+#include <sys/dmu_objset.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zap.h>
+#include <sys/zfs_project.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_znode.h>
+
+int
+zpl_get_file_info(dmu_object_type_t bonustype, const void *data,
+ zfs_file_info_t *zoi)
+{
+ /*
+ * Is it a valid type of object to track?
+ */
+ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
+ return (SET_ERROR(ENOENT));
+
+ zoi->zfi_project = ZFS_DEFAULT_PROJID;
+
+ /*
+ * If we have a NULL data pointer
+	 * then assume the IDs aren't changing and
+	 * return EEXIST to the dmu to let it know to
+	 * use the same IDs.
+ */
+ if (data == NULL)
+ return (SET_ERROR(EEXIST));
+
+ if (bonustype == DMU_OT_ZNODE) {
+ const znode_phys_t *znp = data;
+ zoi->zfi_user = znp->zp_uid;
+ zoi->zfi_group = znp->zp_gid;
+ zoi->zfi_generation = znp->zp_gen;
+ return (0);
+ }
+
+ const sa_hdr_phys_t *sap = data;
+ if (sap->sa_magic == 0) {
+ /*
+ * This should only happen for newly created files
+ * that haven't had the znode data filled in yet.
+ */
+ zoi->zfi_user = 0;
+ zoi->zfi_group = 0;
+ zoi->zfi_generation = 0;
+ return (0);
+ }
+
+ sa_hdr_phys_t sa = *sap;
+ boolean_t swap = B_FALSE;
+ if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
+ sa.sa_magic = SA_MAGIC;
+ sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
+ swap = B_TRUE;
+ }
+ VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
+
+ int hdrsize = sa_hdrsize(&sa);
+ VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
+
+ uintptr_t data_after_hdr = (uintptr_t)data + hdrsize;
+ zoi->zfi_user = *((uint64_t *)(data_after_hdr + SA_UID_OFFSET));
+ zoi->zfi_group = *((uint64_t *)(data_after_hdr + SA_GID_OFFSET));
+ zoi->zfi_generation = *((uint64_t *)(data_after_hdr + SA_GEN_OFFSET));
+ uint64_t flags = *((uint64_t *)(data_after_hdr + SA_FLAGS_OFFSET));
+ if (swap)
+ flags = BSWAP_64(flags);
+
+ if (flags & ZFS_PROJID) {
+ zoi->zfi_project =
+ *((uint64_t *)(data_after_hdr + SA_PROJID_OFFSET));
+ }
+
+ if (swap) {
+ zoi->zfi_user = BSWAP_64(zoi->zfi_user);
+ zoi->zfi_group = BSWAP_64(zoi->zfi_group);
+ zoi->zfi_project = BSWAP_64(zoi->zfi_project);
+ zoi->zfi_generation = BSWAP_64(zoi->zfi_generation);
+ }
+ return (0);
+}
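
zpl_get_file_info() above detects a byte-swapped SA header by comparing sa_magic against the byte-swapped SA_MAGIC and, if they match, swapping every field it pulls out of the record. The standalone sketch below shows the same magic-number endianness check in isolation; the magic value and the bswap helpers are placeholders, since the real SA_MAGIC and BSWAP_32/BSWAP_64 macros come from the ZFS headers.

/* Standalone illustration of magic-number based endianness detection. */
#include <stdint.h>
#include <stdio.h>

#define	EXAMPLE_MAGIC	0x2F505A55u	/* placeholder, not the real SA_MAGIC */

static uint32_t
bswap32(uint32_t x)
{
	return (((x & 0x000000ffu) << 24) | ((x & 0x0000ff00u) << 8) |
	    ((x & 0x00ff0000u) >> 8) | ((x & 0xff000000u) >> 24));
}

static uint64_t
bswap64(uint64_t x)
{
	return (((uint64_t)bswap32((uint32_t)x) << 32) | bswap32(x >> 32));
}

int
main(void)
{
	/* Simulate a record written with the opposite endianness. */
	struct { uint32_t magic; uint64_t uid; } hdr =
	    { bswap32(EXAMPLE_MAGIC), bswap64(1000) };
	int swap = 0;

	if (hdr.magic == bswap32(EXAMPLE_MAGIC))
		swap = 1;		/* fields must be swapped on extraction */
	else if (hdr.magic != EXAMPLE_MAGIC)
		return (1);		/* neither byte order matches: corrupt */

	uint64_t uid = swap ? bswap64(hdr.uid) : hdr.uid;
	printf("swap=%d uid=%llu\n", swap, (unsigned long long)uid);
	return (0);
}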
+
+static void
+fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
+ char *domainbuf, int buflen, uid_t *ridp)
+{
+ uint64_t fuid;
+ const char *domain;
+
+ fuid = zfs_strtonum(fuidstr, NULL);
+
+ domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
+ if (domain)
+ (void) strlcpy(domainbuf, domain, buflen);
+ else
+ domainbuf[0] = '\0';
+ *ridp = FUID_RID(fuid);
+}
+
+static uint64_t
+zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
+{
+ switch (type) {
+ case ZFS_PROP_USERUSED:
+ case ZFS_PROP_USEROBJUSED:
+ return (DMU_USERUSED_OBJECT);
+ case ZFS_PROP_GROUPUSED:
+ case ZFS_PROP_GROUPOBJUSED:
+ return (DMU_GROUPUSED_OBJECT);
+ case ZFS_PROP_PROJECTUSED:
+ case ZFS_PROP_PROJECTOBJUSED:
+ return (DMU_PROJECTUSED_OBJECT);
+ case ZFS_PROP_USERQUOTA:
+ return (zfsvfs->z_userquota_obj);
+ case ZFS_PROP_GROUPQUOTA:
+ return (zfsvfs->z_groupquota_obj);
+ case ZFS_PROP_USEROBJQUOTA:
+ return (zfsvfs->z_userobjquota_obj);
+ case ZFS_PROP_GROUPOBJQUOTA:
+ return (zfsvfs->z_groupobjquota_obj);
+ case ZFS_PROP_PROJECTQUOTA:
+ return (zfsvfs->z_projectquota_obj);
+ case ZFS_PROP_PROJECTOBJQUOTA:
+ return (zfsvfs->z_projectobjquota_obj);
+ default:
+ return (ZFS_NO_OBJECT);
+ }
+}
+
+int
+zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
+{
+ int error;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zfs_useracct_t *buf = vbuf;
+ uint64_t obj;
+ int offset = 0;
+
+ if (!dmu_objset_userspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED) &&
+ !dmu_objset_projectquota_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA) &&
+ !dmu_objset_userobjspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+ if (obj == ZFS_NO_OBJECT) {
+ *bufsizep = 0;
+ return (0);
+ }
+
+ if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJUSED)
+ offset = DMU_OBJACCT_PREFIX_LEN;
+
+ for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
+ *bufsizep)
+ break;
+
+ /*
+ * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX)
+ * when dealing with block quota and vice versa.
+ */
+ if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX,
+ DMU_OBJACCT_PREFIX_LEN) == 0))
+ continue;
+
+ fuidstr_to_sid(zfsvfs, za.za_name + offset,
+ buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
+
+ buf->zu_space = za.za_first_integer;
+ buf++;
+ }
+ if (error == ENOENT)
+ error = 0;
+
+ ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
+ *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
+ *cookiep = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+ return (error);
+}
+
+int
+zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t *valp)
+{
+ char buf[20 + DMU_OBJACCT_PREFIX_LEN];
+ int offset = 0;
+ int err;
+ uint64_t obj;
+
+ *valp = 0;
+
+ if (!dmu_objset_userspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA) &&
+ !dmu_objset_userobjspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED) {
+ if (!dmu_objset_projectquota_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+ if (!zpl_is_valid_projid(rid))
+ return (SET_ERROR(EINVAL));
+ }
+
+ obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+ if (obj == ZFS_NO_OBJECT)
+ return (0);
+
+ if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJUSED) {
+ strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
+ offset = DMU_OBJACCT_PREFIX_LEN;
+ }
+
+ err = zfs_id_to_fuidstr(zfsvfs, domain, rid, buf + offset,
+ sizeof (buf) - offset, B_FALSE);
+ if (err)
+ return (err);
+
+ err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
+int
+zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t quota)
+{
+ char buf[32];
+ int err;
+ dmu_tx_t *tx;
+ uint64_t *objp;
+ boolean_t fuid_dirtied;
+
+ if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
+ return (SET_ERROR(ENOTSUP));
+
+ switch (type) {
+ case ZFS_PROP_USERQUOTA:
+ objp = &zfsvfs->z_userquota_obj;
+ break;
+ case ZFS_PROP_GROUPQUOTA:
+ objp = &zfsvfs->z_groupquota_obj;
+ break;
+ case ZFS_PROP_USEROBJQUOTA:
+ objp = &zfsvfs->z_userobjquota_obj;
+ break;
+ case ZFS_PROP_GROUPOBJQUOTA:
+ objp = &zfsvfs->z_groupobjquota_obj;
+ break;
+ case ZFS_PROP_PROJECTQUOTA:
+ if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+ if (!zpl_is_valid_projid(rid))
+ return (SET_ERROR(EINVAL));
+
+ objp = &zfsvfs->z_projectquota_obj;
+ break;
+ case ZFS_PROP_PROJECTOBJQUOTA:
+ if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+ if (!zpl_is_valid_projid(rid))
+ return (SET_ERROR(EINVAL));
+
+ objp = &zfsvfs->z_projectobjquota_obj;
+ break;
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+
+ err = zfs_id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof (buf), B_TRUE);
+ if (err)
+ return (err);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
+ if (*objp == 0) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ zfs_userquota_prop_prefixes[type]);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ mutex_enter(&zfsvfs->z_lock);
+ if (*objp == 0) {
+ *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
+ DMU_OT_NONE, 0, tx);
+ VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
+ }
+ mutex_exit(&zfsvfs->z_lock);
+
+ if (quota == 0) {
+ err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
+ if (err == ENOENT)
+ err = 0;
+ } else {
+ err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
+ }
+ ASSERT(err == 0);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+ dmu_tx_commit(tx);
+ return (err);
+}
+
+boolean_t
+zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
+{
+ char buf[20 + DMU_OBJACCT_PREFIX_LEN];
+ uint64_t used, quota, quotaobj;
+ int err;
+
+ if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) {
+ if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) {
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_id_quota_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ }
+ return (B_FALSE);
+ }
+
+ if (usedobj == DMU_PROJECTUSED_OBJECT) {
+ if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
+ if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_id_quota_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ }
+ return (B_FALSE);
+ }
+ quotaobj = zfsvfs->z_projectobjquota_obj;
+ } else if (usedobj == DMU_USERUSED_OBJECT) {
+ quotaobj = zfsvfs->z_userobjquota_obj;
+ } else if (usedobj == DMU_GROUPUSED_OBJECT) {
+ quotaobj = zfsvfs->z_groupobjquota_obj;
+ } else {
+ return (B_FALSE);
+ }
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)id);
+ err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
+ if (err != 0)
+ return (B_FALSE);
+
+ (void) snprintf(buf, sizeof (buf), DMU_OBJACCT_PREFIX "%llx",
+ (longlong_t)id);
+ err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
+ if (err != 0)
+ return (B_FALSE);
+ return (used >= quota);
+}
+
+boolean_t
+zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
+{
+ char buf[20];
+ uint64_t used, quota, quotaobj;
+ int err;
+
+ if (usedobj == DMU_PROJECTUSED_OBJECT) {
+ if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
+ if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_id_quota_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ }
+ return (B_FALSE);
+ }
+ quotaobj = zfsvfs->z_projectquota_obj;
+ } else if (usedobj == DMU_USERUSED_OBJECT) {
+ quotaobj = zfsvfs->z_userquota_obj;
+ } else if (usedobj == DMU_GROUPUSED_OBJECT) {
+ quotaobj = zfsvfs->z_groupquota_obj;
+ } else {
+ return (B_FALSE);
+ }
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)id);
+ err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
+ if (err != 0)
+ return (B_FALSE);
+
+ err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
+ if (err != 0)
+ return (B_FALSE);
+ return (used >= quota);
+}
+
+boolean_t
+zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
+{
+ return (zfs_id_overblockquota(zfsvfs, usedobj, id) ||
+ zfs_id_overobjquota(zfsvfs, usedobj, id));
+}
+
+EXPORT_SYMBOL(zpl_get_file_info);
+EXPORT_SYMBOL(zfs_userspace_one);
+EXPORT_SYMBOL(zfs_userspace_many);
+EXPORT_SYMBOL(zfs_set_userquota);
+EXPORT_SYMBOL(zfs_id_overblockquota);
+EXPORT_SYMBOL(zfs_id_overobjquota);
+EXPORT_SYMBOL(zfs_id_overquota);
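
zfs_id_overblockquota() and zfs_id_overobjquota() above look an ID up in the quota and usage ZAP objects under two key forms: the ID printed as "%llx" for block accounting, and the same string prefixed with DMU_OBJACCT_PREFIX for object-count accounting. The standalone sketch below builds both keys; the prefix string used here is a stand-in, since the real DMU_OBJACCT_PREFIX is defined in the DMU headers rather than in this file.

/* Standalone sketch of the two ZAP key forms used for quota lookups. */
#include <stdio.h>
#include <stdint.h>

/* Placeholder: the real prefix is DMU_OBJACCT_PREFIX from the DMU headers. */
#define	OBJACCT_PREFIX		"obj-"
#define	OBJACCT_PREFIX_LEN	(sizeof (OBJACCT_PREFIX) - 1)

int
main(void)
{
	uint64_t id = 1000;			/* uid, gid, or project id */
	char blockkey[20];			/* as in zfs_id_overblockquota() */
	char objkey[20 + OBJACCT_PREFIX_LEN];	/* as in zfs_id_overobjquota() */

	(void) snprintf(blockkey, sizeof (blockkey), "%llx",
	    (unsigned long long)id);
	(void) snprintf(objkey, sizeof (objkey), OBJACCT_PREFIX "%llx",
	    (unsigned long long)id);

	/* Each key would be looked up in the quota and used ZAP objects. */
	printf("block quota key:  %s\n", blockkey);
	printf("object quota key: %s\n", objkey);
	return (0);
}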
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c b/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c
new file mode 100644
index 000000000000..b18b480ce527
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/zfs_ratelimit.h>
+
+/*
+ * Initialize rate limit struct
+ *
+ * rl: zfs_ratelimit_t struct
+ * burst: Pointer to the number to allow in an interval before rate limiting
+ * interval: Interval time in seconds
+ */
+void
+zfs_ratelimit_init(zfs_ratelimit_t *rl, unsigned int *burst,
+ unsigned int interval)
+{
+ rl->count = 0;
+ rl->start = 0;
+ rl->interval = interval;
+ rl->burst = burst;
+ mutex_init(&rl->lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Finalize rate limit struct
+ *
+ * rl: zfs_ratelimit_t struct
+ */
+void
+zfs_ratelimit_fini(zfs_ratelimit_t *rl)
+{
+ mutex_destroy(&rl->lock);
+}
+
+/*
+ * Re-implementation of the kernel's __ratelimit() function
+ *
+ * We had to write our own rate limiter because the kernel's __ratelimit()
+ * function annoyingly prints out how many times it rate limited to the kernel
+ * logs (and there's no way to turn it off):
+ *
+ * __ratelimit: 59 callbacks suppressed
+ *
+ * If the kernel ever allows us to disable these prints, we should go back to
+ * using __ratelimit() instead.
+ *
+ * Return values are the same as __ratelimit():
+ *
+ * 0: If we're rate limiting
+ * 1: If we're not rate limiting.
+ */
+int
+zfs_ratelimit(zfs_ratelimit_t *rl)
+{
+ hrtime_t now;
+
+ hrtime_t elapsed;
+ int error = 1;
+
+ mutex_enter(&rl->lock);
+
+ now = gethrtime();
+ elapsed = now - rl->start;
+
+ rl->count++;
+ if (NSEC2SEC(elapsed) >= rl->interval) {
+ rl->start = now;
+ rl->count = 0;
+ } else {
+ if (rl->count >= *rl->burst) {
+ error = 0; /* We're ratelimiting */
+ }
+ }
+ mutex_exit(&rl->lock);
+
+ return (error);
+}
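
The limiter above admits up to the burst count of events per interval seconds (the burst value is read through a pointer so it can track a tunable) and tells the caller whether it is currently being rate limited. Below is a minimal userspace analog under the assumption that clock_gettime(CLOCK_MONOTONIC) stands in for gethrtime(); the struct and function names are hypothetical and the reset logic is slightly simplified relative to the kernel version.

/* Userspace analog of the burst/interval limiter above (illustrative only). */
#include <stdio.h>
#include <time.h>

struct ratelimit {
	unsigned int count;	/* events seen in the current interval */
	time_t start;		/* start of the current interval */
	unsigned int interval;	/* interval length in seconds */
	unsigned int burst;	/* events allowed per interval */
};

/* Returns 1 if the event is allowed, 0 if it should be rate limited. */
static int
ratelimit_check(struct ratelimit *rl)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	if ((unsigned int)(ts.tv_sec - rl->start) >= rl->interval) {
		rl->start = ts.tv_sec;	/* new interval: reset the counter */
		rl->count = 0;
	}
	rl->count++;
	return (rl->count <= rl->burst);
}

int
main(void)
{
	struct timespec ts;
	struct ratelimit rl = { .interval = 5, .burst = 3 };

	clock_gettime(CLOCK_MONOTONIC, &ts);
	rl.start = ts.tv_sec;		/* begin the first interval now */

	for (int i = 0; i < 5; i++)
		printf("event %d: %s\n", i,
		    ratelimit_check(&rl) ? "allowed" : "rate limited");
	return (0);
}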
diff --git a/sys/contrib/openzfs/module/zfs/zfs_replay.c b/sys/contrib/openzfs/module/zfs/zfs_replay.c
new file mode 100644
index 000000000000..53c7dbd5df43
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_replay.c
@@ -0,0 +1,997 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 Cyril Plisko. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_vnops.h>
+#include <sys/spa.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+#include <sys/atomic.h>
+#include <sys/cred.h>
+#include <sys/zpl.h>
+
+/*
+ * NB: FreeBSD expects to be able to do vnode locking in lookup and
+ * hold the locks across all subsequent VOPs until vput is called.
+ * This means that its zfs vnops routines can't do any internal locking.
+ * In order to have the same contract as the Linux vnops there would
+ * need to be duplicate locked vnops. If the vnops were used more widely
+ * in common code this would likely be preferable. However, currently
+ * this is the only file where this is the case.
+ */
+
+/*
+ * Functions to replay ZFS intent log (ZIL) records
+ * The functions are called through a function vector (zfs_replay_vector)
+ * which is indexed by the transaction type.
+ */
+
+static void
+zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
+ uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+{
+ bzero(vap, sizeof (*vap));
+ vap->va_mask = (uint_t)mask;
+ vap->va_mode = mode;
+#ifdef __FreeBSD__
+ vap->va_type = IFTOVT(mode);
+#endif
+ vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid;
+ vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid;
+ vap->va_rdev = zfs_cmpldev(rdev);
+ vap->va_nodeid = nodeid;
+}
+
+/* ARGSUSED */
+static int
+zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+static void
+zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
+{
+ xoptattr_t *xoap = NULL;
+ uint64_t *attrs;
+ uint64_t *crtime;
+ uint32_t *bitmap;
+ void *scanstamp;
+ int i;
+
+ xvap->xva_vattr.va_mask |= ATTR_XVATTR;
+ if ((xoap = xva_getxoptattr(xvap)) == NULL) {
+ xvap->xva_vattr.va_mask &= ~ATTR_XVATTR; /* shouldn't happen */
+ return;
+ }
+
+ ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize);
+
+ bitmap = &lrattr->lr_attr_bitmap;
+ for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++)
+ xvap->xva_reqattrmap[i] = *bitmap;
+
+ attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1);
+ crtime = attrs + 1;
+ scanstamp = (caddr_t)(crtime + 2);
+
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
+ xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
+ xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
+ xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY))
+ xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
+ xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
+ xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
+ xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
+ xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
+ xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
+ xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
+ xoap->xoa_av_quarantined =
+ ((*attrs & XAT0_AV_QUARANTINED) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+ ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID));
+
+ bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
+ } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ /*
+ * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid
+ * at the same time, so we can share the same space.
+ */
+ bcopy(scanstamp, &xoap->xoa_projid, sizeof (uint64_t));
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+ xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT))
+ xoap->xoa_projinherit = ((*attrs & XAT0_PROJINHERIT) != 0);
+}
+
+static int
+zfs_replay_domain_cnt(uint64_t uid, uint64_t gid)
+{
+ uint64_t uid_idx;
+ uint64_t gid_idx;
+ int domcnt = 0;
+
+ uid_idx = FUID_INDEX(uid);
+ gid_idx = FUID_INDEX(gid);
+ if (uid_idx)
+ domcnt++;
+ if (gid_idx > 0 && gid_idx != uid_idx)
+ domcnt++;
+
+ return (domcnt);
+}
+
+static void *
+zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start,
+ int domcnt)
+{
+ int i;
+
+ for (i = 0; i != domcnt; i++) {
+ fuid_infop->z_domain_table[i] = start;
+ start = (caddr_t)start + strlen(start) + 1;
+ }
+
+ return (start);
+}
+
+/*
+ * Set the uid/gid in the fuid_info structure.
+ */
+static void
+zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
+{
+ /*
+ * If owner or group are log specific FUIDs then slurp up
+ * domain information and build zfs_fuid_info_t
+ */
+ if (IS_EPHEMERAL(uid))
+ fuid_infop->z_fuid_owner = uid;
+
+ if (IS_EPHEMERAL(gid))
+ fuid_infop->z_fuid_group = gid;
+}
+
+/*
+ * Load fuid domains into fuid_info_t
+ */
+static zfs_fuid_info_t *
+zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
+{
+ int domcnt;
+
+ zfs_fuid_info_t *fuid_infop;
+
+ fuid_infop = zfs_fuid_info_alloc();
+
+ domcnt = zfs_replay_domain_cnt(uid, gid);
+
+ if (domcnt == 0)
+ return (fuid_infop);
+
+ fuid_infop->z_domain_table =
+ kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP);
+
+ zfs_replay_fuid_ugid(fuid_infop, uid, gid);
+
+ fuid_infop->z_domain_cnt = domcnt;
+ *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt);
+ return (fuid_infop);
+}
+
+/*
+ * load zfs_fuid_t's and fuid_domains into fuid_info_t
+ */
+static zfs_fuid_info_t *
+zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
+ uint64_t gid)
+{
+ uint64_t *log_fuid = (uint64_t *)start;
+ zfs_fuid_info_t *fuid_infop;
+ int i;
+
+ fuid_infop = zfs_fuid_info_alloc();
+ fuid_infop->z_domain_cnt = domcnt;
+
+ fuid_infop->z_domain_table =
+ kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP);
+
+ for (i = 0; i != idcnt; i++) {
+ zfs_fuid_t *zfuid;
+
+ zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
+ zfuid->z_logfuid = *log_fuid;
+ zfuid->z_id = -1;
+ zfuid->z_domidx = 0;
+ list_insert_tail(&fuid_infop->z_fuids, zfuid);
+ log_fuid++;
+ }
+
+ zfs_replay_fuid_ugid(fuid_infop, uid, gid);
+
+ *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt);
+ return (fuid_infop);
+}
+
+static void
+zfs_replay_swap_attrs(lr_attr_t *lrattr)
+{
+ /* swap the lr_attr structure */
+ byteswap_uint32_array(lrattr, sizeof (*lrattr));
+ /* swap the bitmap */
+ byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) *
+ sizeof (uint32_t));
+ /* swap the attributes, create time + 64 bit word for attributes */
+ byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) *
+ (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t));
+}
+
+/*
+ * Replay file create with optional ACL, xvattr information as well
+ * as optional FUID information.
+ */
+static int
+zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_acl_create_t *lracl = arg2;
+ char *name = NULL; /* location determined later */
+ lr_create_t *lr = (lr_create_t *)lracl;
+ znode_t *dzp;
+ znode_t *zp;
+ xvattr_t xva;
+ int vflg = 0;
+ vsecattr_t vsec = { 0 };
+ lr_attr_t *lrattr;
+ void *aclstart;
+ void *fuidstart;
+ size_t xvatlen = 0;
+ uint64_t txtype;
+ uint64_t objid;
+ uint64_t dnodesize;
+ int error;
+
+ txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
+ if (byteswap) {
+ byteswap_uint64_array(lracl, sizeof (*lracl));
+ if (txtype == TX_CREATE_ACL_ATTR ||
+ txtype == TX_MKDIR_ACL_ATTR) {
+ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+ zfs_replay_swap_attrs(lrattr);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ }
+
+ aclstart = (caddr_t)(lracl + 1) + xvatlen;
+ zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
+ /* swap fuids */
+ if (lracl->lr_fuidcnt) {
+ byteswap_uint64_array((caddr_t)aclstart +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
+ lracl->lr_fuidcnt * sizeof (uint64_t));
+ }
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
+ xva_init(&xva);
+ zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
+
+ /*
+ * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+ * eventually end up in zfs_mknode(), which assigns the object's
+ * creation time, generation number, and dnode size. The generic
+ * zfs_create() has no concept of these attributes, so we smuggle
+ * the values inside the vattr's otherwise unused va_ctime,
+ * va_nblocks, and va_fsid fields.
+ */
+ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
+ xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
+
+ error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
+ if (error)
+ goto bail;
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+ switch (txtype) {
+ case TX_CREATE_ACL:
+ aclstart = (caddr_t)(lracl + 1);
+ fuidstart = (caddr_t)aclstart +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ /*FALLTHROUGH*/
+ case TX_CREATE_ACL_ATTR:
+ if (name == NULL) {
+ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ xva.xva_vattr.va_mask |= ATTR_XVATTR;
+ zfs_replay_xvattr(lrattr, &xva);
+ }
+ vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
+ vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
+ vsec.vsa_aclcnt = lracl->lr_aclcnt;
+ vsec.vsa_aclentsz = lracl->lr_acl_bytes;
+ vsec.vsa_aclflags = lracl->lr_acl_flags;
+ if (zfsvfs->z_fuid_replay == NULL) {
+ fuidstart = (caddr_t)(lracl + 1) + xvatlen +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ }
+
+ error = zfs_create(dzp, name, &xva.xva_vattr,
+ 0, 0, &zp, kcred, vflg, &vsec);
+ break;
+ case TX_MKDIR_ACL:
+ aclstart = (caddr_t)(lracl + 1);
+ fuidstart = (caddr_t)aclstart +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ /*FALLTHROUGH*/
+ case TX_MKDIR_ACL_ATTR:
+ if (name == NULL) {
+ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ zfs_replay_xvattr(lrattr, &xva);
+ }
+ vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
+ vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
+ vsec.vsa_aclcnt = lracl->lr_aclcnt;
+ vsec.vsa_aclentsz = lracl->lr_acl_bytes;
+ vsec.vsa_aclflags = lracl->lr_acl_flags;
+ if (zfsvfs->z_fuid_replay == NULL) {
+ fuidstart = (caddr_t)(lracl + 1) + xvatlen +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ }
+ error = zfs_mkdir(dzp, name, &xva.xva_vattr,
+ &zp, kcred, vflg, &vsec);
+ break;
+ default:
+ error = SET_ERROR(ENOTSUP);
+ }
+
+bail:
+ if (error == 0 && zp != NULL) {
+#ifdef __FreeBSD__
+ VOP_UNLOCK1(ZTOV(zp));
+#endif
+ zrele(zp);
+ }
+ zrele(dzp);
+
+ if (zfsvfs->z_fuid_replay)
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+ zfsvfs->z_fuid_replay = NULL;
+
+ return (error);
+}
+
+static int
+zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_create_t *lr = arg2;
+ char *name = NULL; /* location determined later */
+ char *link; /* symlink content follows name */
+ znode_t *dzp;
+ znode_t *zp = NULL;
+ xvattr_t xva;
+ int vflg = 0;
+ size_t lrsize = sizeof (lr_create_t);
+ lr_attr_t *lrattr;
+ void *start;
+ size_t xvatlen;
+ uint64_t txtype;
+ uint64_t objid;
+ uint64_t dnodesize;
+ int error;
+
+ txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+ if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
+ zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
+ }
+
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
+ xva_init(&xva);
+ zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
+
+ /*
+ * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+ * eventually end up in zfs_mknode(), which assigns the object's
+ * creation time, generation number, and dnode slot count. The
+ * generic zfs_create() has no concept of these attributes, so
+ * we smuggle the values inside the vattr's otherwise unused
+ * va_ctime, va_nblocks, and va_fsid fields.
+ */
+ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
+ xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
+
+ error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
+ if (error)
+ goto out;
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+
+ /*
+ * Symlinks don't have fuid info, and CIFS never creates
+ * symlinks.
+ *
+ * The _ATTR versions will grab the fuid info in their subcases.
+ */
+ if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
+ (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
+ (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
+ start = (lr + 1);
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ }
+
+ switch (txtype) {
+ case TX_CREATE_ATTR:
+ lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
+ start = (caddr_t)(lr + 1) + xvatlen;
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ name = (char *)start;
+
+ /*FALLTHROUGH*/
+ case TX_CREATE:
+ if (name == NULL)
+ name = (char *)start;
+
+ error = zfs_create(dzp, name, &xva.xva_vattr,
+ 0, 0, &zp, kcred, vflg, NULL);
+ break;
+ case TX_MKDIR_ATTR:
+ lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
+ start = (caddr_t)(lr + 1) + xvatlen;
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ name = (char *)start;
+
+ /*FALLTHROUGH*/
+ case TX_MKDIR:
+ if (name == NULL)
+ name = (char *)(lr + 1);
+
+ error = zfs_mkdir(dzp, name, &xva.xva_vattr,
+ &zp, kcred, vflg, NULL);
+ break;
+ case TX_MKXATTR:
+ error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred);
+ break;
+ case TX_SYMLINK:
+ name = (char *)(lr + 1);
+ link = name + strlen(name) + 1;
+ error = zfs_symlink(dzp, name, &xva.xva_vattr,
+ link, &zp, kcred, vflg);
+ break;
+ default:
+ error = SET_ERROR(ENOTSUP);
+ }
+
+out:
+ if (error == 0 && zp != NULL) {
+#ifdef __FreeBSD__
+ VOP_UNLOCK1(ZTOV(zp));
+#endif
+ zrele(zp);
+ }
+ zrele(dzp);
+
+ if (zfsvfs->z_fuid_replay)
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+ zfsvfs->z_fuid_replay = NULL;
+ return (error);
+}
+
+static int
+zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_remove_t *lr = arg2;
+ char *name = (char *)(lr + 1); /* name follows lr_remove_t */
+ znode_t *dzp;
+ int error;
+ int vflg = 0;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+
+ switch ((int)lr->lr_common.lrc_txtype) {
+ case TX_REMOVE:
+ error = zfs_remove(dzp, name, kcred, vflg);
+ break;
+ case TX_RMDIR:
+ error = zfs_rmdir(dzp, name, NULL, kcred, vflg);
+ break;
+ default:
+ error = SET_ERROR(ENOTSUP);
+ }
+
+ zrele(dzp);
+
+ return (error);
+}
+
+static int
+zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_link_t *lr = arg2;
+ char *name = (char *)(lr + 1); /* name follows lr_link_t */
+ znode_t *dzp, *zp;
+ int error;
+ int vflg = 0;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
+ zrele(dzp);
+ return (error);
+ }
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+
+ error = zfs_link(dzp, zp, name, kcred, vflg);
+ zrele(zp);
+ zrele(dzp);
+
+ return (error);
+}
+
+static int
+zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_rename_t *lr = arg2;
+ char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
+ char *tname = sname + strlen(sname) + 1;
+ znode_t *sdzp, *tdzp;
+ int error;
+ int vflg = 0;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
+ zrele(sdzp);
+ return (error);
+ }
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+
+ error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg);
+
+ zrele(tdzp);
+ zrele(sdzp);
+ return (error);
+}
+
+static int
+zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_write_t *lr = arg2;
+ char *data = (char *)(lr + 1); /* data follows lr_write_t */
+ znode_t *zp;
+ int error;
+ uint64_t eod, offset, length;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * As we can log writes out of order, it's possible the
+ * file has been removed. In this case just drop the write
+ * and return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+ eod = offset + length; /* end of data for this write */
+
+ /*
+ * This may be a write from a dmu_sync() for a whole block,
+ * and may extend beyond the current end of the file.
+ * We can't just replay what was written for this TX_WRITE as
+ * a future TX_WRITE2 may extend the eof and the data for that
+ * write needs to be there. So we write the whole block and
+ * reduce the eof. This needs to be done within the single dmu
+ * transaction created within vn_rdwr -> zfs_write. So a possible
+ * new end of file is passed through in zfsvfs->z_replay_eof
+ */
+
+ zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ if (zp->z_size < eod)
+ zfsvfs->z_replay_eof = eod;
+ }
+ error = zfs_write_simple(zp, data, length, offset, NULL);
+ zrele(zp);
+ zfsvfs->z_replay_eof = 0; /* safety */
+
+ return (error);
+}
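
As the comment inside zfs_replay_write() explains, a record produced by dmu_sync() is replayed as a whole block: the offset is rounded down to a block boundary, the length is widened to the block size, and the real end of data is carried separately through z_replay_eof so the file size can be clamped afterwards. The following standalone sketch shows just that rounding arithmetic, with example numbers.

/* Standalone sketch of rounding a partial write to a whole block. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t blocksize = 131072;		/* 128K record, for example */
	uint64_t offset = 131072 * 3 + 4096;	/* write starts mid-block */
	uint64_t length = 512;
	uint64_t eod = offset + length;		/* logical end of this write */

	if (length < blocksize) {
		offset -= offset % blocksize;	/* back up to the block start */
		length = blocksize;		/* replay the whole block */
	}

	printf("replay offset=%llu length=%llu, file size clamped to %llu\n",
	    (unsigned long long)offset, (unsigned long long)length,
	    (unsigned long long)eod);
	return (0);
}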
+
+/*
+ * TX_WRITE2 records are only generated when dmu_sync() returns EALREADY,
+ * meaning the pool block is already being synced. So now that we always write
+ * out full blocks, all we have to do is expand the eof if
+ * the file is grown.
+ */
+static int
+zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_write_t *lr = arg2;
+ znode_t *zp;
+ int error;
+ uint64_t end;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+top:
+ end = lr->lr_offset + lr->lr_length;
+ if (end > zp->z_size) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ zp->z_size = end;
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zrele(zp);
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+
+ /* Ensure the replayed seq is updated */
+ (void) zil_replaying(zfsvfs->z_log, tx);
+
+ dmu_tx_commit(tx);
+ }
+
+ zrele(zp);
+
+ return (error);
+}
+
+static int
+zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_truncate_t *lr = arg2;
+ znode_t *zp;
+ flock64_t fl;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ bzero(&fl, sizeof (fl));
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = lr->lr_offset;
+ fl.l_len = lr->lr_length;
+
+ error = zfs_space(zp, F_FREESP, &fl, O_RDWR | O_LARGEFILE,
+ lr->lr_offset, kcred);
+
+ zrele(zp);
+
+ return (error);
+}
+
+static int
+zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_setattr_t *lr = arg2;
+ znode_t *zp;
+ xvattr_t xva;
+ vattr_t *vap = &xva.xva_vattr;
+ int error;
+ void *start;
+
+ xva_init(&xva);
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((lr->lr_mask & ATTR_XVATTR) &&
+ zfsvfs->z_version >= ZPL_VERSION_INITIAL)
+ zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
+ lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
+
+ vap->va_size = lr->lr_size;
+ ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime);
+ gethrestime(&vap->va_ctime);
+ vap->va_mask |= ATTR_CTIME;
+
+ /*
+ * Fill in xvattr_t portions if necessary.
+ */
+
+ start = (lr_setattr_t *)(lr + 1);
+ if (vap->va_mask & ATTR_XVATTR) {
+ zfs_replay_xvattr((lr_attr_t *)start, &xva);
+ start = (caddr_t)start +
+ ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize);
+ } else
+ xva.xva_vattr.va_mask &= ~ATTR_XVATTR;
+
+ zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+
+ /*
+ * Satisfy assertions.
+ */
+ vn_seqc_write_begin(ZTOV(zp));
+ error = zfs_setattr(zp, vap, 0, kcred);
+ vn_seqc_write_end(ZTOV(zp));
+
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+ zfsvfs->z_fuid_replay = NULL;
+ zrele(zp);
+
+ return (error);
+}
+
+static int
+zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_acl_v0_t *lr = arg2;
+ ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
+ vsecattr_t vsa;
+ znode_t *zp;
+ int error;
+
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+ zfs_oldace_byteswap(ace, lr->lr_aclcnt);
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ bzero(&vsa, sizeof (vsa));
+ vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
+ vsa.vsa_aclcnt = lr->lr_aclcnt;
+ vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt;
+ vsa.vsa_aclflags = 0;
+ vsa.vsa_aclentp = ace;
+
+ error = zfs_setsecattr(zp, &vsa, 0, kcred);
+
+ zrele(zp);
+
+ return (error);
+}
+
+/*
+ * Replaying ACLs is complicated by FUID support.
+ * The log record may contain some optional data
+ * to be used for replaying FUIDs. These pieces
+ * are the actual FUIDs that were created initially.
+ * The FUID table index may no longer be valid and
+ * during zfs_create() a new index may be assigned.
+ * Because of this the log will contain the original
+ * domain+rid in order to create a new FUID.
+ *
+ * The individual ACEs may contain an ephemeral uid/gid which is no
+ * longer valid and will need to be replaced with an actual FUID.
+ *
+ */
+static int
+zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_acl_t *lr = arg2;
+ ace_t *ace = (ace_t *)(lr + 1);
+ vsecattr_t vsa;
+ znode_t *zp;
+ int error;
+
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+ zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
+ if (lr->lr_fuidcnt) {
+ byteswap_uint64_array((caddr_t)ace +
+ ZIL_ACE_LENGTH(lr->lr_acl_bytes),
+ lr->lr_fuidcnt * sizeof (uint64_t));
+ }
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ bzero(&vsa, sizeof (vsa));
+ vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
+ vsa.vsa_aclcnt = lr->lr_aclcnt;
+ vsa.vsa_aclentp = ace;
+ vsa.vsa_aclentsz = lr->lr_acl_bytes;
+ vsa.vsa_aclflags = lr->lr_acl_flags;
+
+ if (lr->lr_fuidcnt) {
+ void *fuidstart = (caddr_t)ace +
+ ZIL_ACE_LENGTH(lr->lr_acl_bytes);
+
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuids(fuidstart, &fuidstart,
+ lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
+ }
+
+ error = zfs_setsecattr(zp, &vsa, 0, kcred);
+
+ if (zfsvfs->z_fuid_replay)
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+
+ zfsvfs->z_fuid_replay = NULL;
+ zrele(zp);
+
+ return (error);
+}
+
+/*
+ * Callback vectors for replaying records
+ */
+zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+ zfs_replay_error, /* no such type */
+ zfs_replay_create, /* TX_CREATE */
+ zfs_replay_create, /* TX_MKDIR */
+ zfs_replay_create, /* TX_MKXATTR */
+ zfs_replay_create, /* TX_SYMLINK */
+ zfs_replay_remove, /* TX_REMOVE */
+ zfs_replay_remove, /* TX_RMDIR */
+ zfs_replay_link, /* TX_LINK */
+ zfs_replay_rename, /* TX_RENAME */
+ zfs_replay_write, /* TX_WRITE */
+ zfs_replay_truncate, /* TX_TRUNCATE */
+ zfs_replay_setattr, /* TX_SETATTR */
+ zfs_replay_acl_v0, /* TX_ACL_V0 */
+ zfs_replay_acl, /* TX_ACL */
+ zfs_replay_create_acl, /* TX_CREATE_ACL */
+ zfs_replay_create, /* TX_CREATE_ATTR */
+ zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */
+ zfs_replay_create_acl, /* TX_MKDIR_ACL */
+ zfs_replay_create, /* TX_MKDIR_ATTR */
+ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
+ zfs_replay_write2, /* TX_WRITE2 */
+};
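
The vector above dispatches each ZIL record to its handler by indexing with the transaction type. The standalone sketch below shows the same function-vector pattern with a made-up, much smaller set of record types; names such as EX_TX_CREATE are hypothetical and stand in for the real TX_* constants.

/* Standalone sketch of dispatch through a function vector indexed by type. */
#include <stdio.h>

enum { EX_TX_NONE, EX_TX_CREATE, EX_TX_REMOVE, EX_TX_MAX };

typedef int (*replay_func_t)(void *arg, int byteswap);

static int
replay_error(void *arg, int byteswap)
{
	(void) arg; (void) byteswap;
	printf("unknown record type\n");
	return (-1);
}

static int
replay_create(void *arg, int byteswap)
{
	printf("replaying create of \"%s\" (byteswap=%d)\n",
	    (const char *)arg, byteswap);
	return (0);
}

static int
replay_remove(void *arg, int byteswap)
{
	printf("replaying remove of \"%s\" (byteswap=%d)\n",
	    (const char *)arg, byteswap);
	return (0);
}

static replay_func_t replay_vector[EX_TX_MAX] = {
	replay_error,	/* EX_TX_NONE: no such type */
	replay_create,	/* EX_TX_CREATE */
	replay_remove,	/* EX_TX_REMOVE */
};

int
main(void)
{
	int txtype = EX_TX_CREATE;	/* would come from the record's type field */
	return (replay_vector[txtype]("file.txt", 0));
}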
diff --git a/sys/contrib/openzfs/module/zfs/zfs_rlock.c b/sys/contrib/openzfs/module/zfs/zfs_rlock.c
new file mode 100644
index 000000000000..06a5e031a7df
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_rlock.c
@@ -0,0 +1,691 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/*
+ * This file contains the code to implement file range locking in
+ * ZFS, although there isn't much specific to ZFS (all that comes to mind is
+ * support for growing the blocksize).
+ *
+ * Interface
+ * ---------
+ * Defined in zfs_rlock.h but essentially:
+ * lr = rangelock_enter(zp, off, len, lock_type);
+ * rangelock_reduce(lr, off, len); // optional
+ * rangelock_exit(lr);
+ *
+ * Range locking rules
+ * --------------------
+ * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole
+ * file range needs to be locked as RL_WRITER. Only then can the pages be
+ * freed etc and zp_size reset. zp_size must be set within range lock.
+ * 2. For writes and punching holes (zfs_write & zfs_space) just the range
+ * being written or freed needs to be locked as RL_WRITER.
+ * Multiple writes at the end of the file must coordinate zp_size updates
+ * to ensure data isn't lost. A compare and swap loop is currently used
+ * to ensure the file size is at least the offset last written.
+ * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being
+ * read needs to be locked as RL_READER. A check against zp_size can then
+ * be made for reading beyond end of file.
+ *
+ * AVL tree
+ * --------
+ * An AVL tree is used to maintain the state of the existing ranges
+ * that are locked for exclusive (writer) or shared (reader) use.
+ * The starting range offset is used for searching and sorting the tree.
+ *
+ * Common case
+ * -----------
+ * The (hopefully) usual case is of no overlaps or contention for locks. On
+ * entry to rangelock_enter(), a locked_range_t is allocated; the tree is
+ * searched and no overlap is found, so *this* locked_range_t is placed in
+ * the tree.
+ *
+ * Overlaps/Reference counting/Proxy locks
+ * ---------------------------------------
+ * The avl code only allows one node at a particular offset. Also it's very
+ * inefficient to search through all previous entries looking for overlaps
+ * (because the very first in the ordered list might be at offset 0 but
+ * cover the whole file).
+ * So this implementation uses reference counts and proxy range locks.
+ * Firstly, only reader locks use reference counts and proxy locks,
+ * because writer locks are exclusive.
+ * When a reader lock overlaps with another then a proxy lock is created
+ * for that range and replaces the original lock. If the overlap
+ * is exact then the reference count of the proxy is simply incremented.
+ * Otherwise, the proxy lock is split into smaller lock ranges and
+ * new proxy locks created for non overlapping ranges.
+ * The reference counts are adjusted accordingly.
+ * Meanwhile, the original lock is kept around (this is the caller's handle)
+ * and its offset and length are used when releasing the lock.
+ *
+ * Thread coordination
+ * -------------------
+ * In order to make wakeups efficient and to ensure multiple continuous
+ * readers on a range don't starve a writer for the same range lock,
+ * two condition variables are allocated in each rl_t.
+ * If a writer (or reader) can't get a range it initialises the writer
+ * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
+ * and waits on that cv. When a thread unlocks that range it wakes up all
+ * writers then all readers before destroying the lock.
+ *
+ * Append mode writes
+ * ------------------
+ * Append mode writes need to lock a range at the end of a file.
+ * The offset of the end of the file is determined under the
+ * range locking mutex, and the lock type converted from RL_APPEND to
+ * RL_WRITER and the range locked.
+ *
+ * Grow block handling
+ * -------------------
+ * ZFS supports multiple block sizes, up to 16MB. The smallest
+ * block size is used for the file, which is grown as needed. During this
+ * growth all other writers and readers must be excluded.
+ * So if the block size needs to be grown then the whole file is
+ * exclusively locked, then later the caller will reduce the lock
+ * range to just the range to be written using rangelock_reduce().
+ */
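
zfs_rangelock_enter_writer() further down implements the writer conflict test by probing at most three tree entries: one at the same offset, the nearest entry after, and the nearest entry before. The standalone sketch below shows the underlying half-open interval overlap test, using a plain sorted array in place of the AVL tree; it is a simplified analog, not the locking code itself.

/* Standalone sketch: does [off, off+len) overlap any existing locked range? */
#include <stdio.h>
#include <stdint.h>

struct range { uint64_t off, len; };

static int
ranges_overlap(const struct range *a, const struct range *b)
{
	/* Half-open intervals overlap iff each starts before the other ends. */
	return (a->off < b->off + b->len && b->off < a->off + a->len);
}

static int
writer_conflicts(const struct range *locked, int n, const struct range *new)
{
	for (int i = 0; i < n; i++)
		if (ranges_overlap(&locked[i], new))
			return (1);
	return (0);
}

int
main(void)
{
	struct range locked[] = { { 0, 4096 }, { 8192, 4096 } };
	struct range new1 = { 4096, 4096 };	/* fits in the gap */
	struct range new2 = { 6144, 4096 };	/* overlaps the second range */

	printf("new1 conflicts: %d\n", writer_conflicts(locked, 2, &new1));
	printf("new2 conflicts: %d\n", writer_conflicts(locked, 2, &new2));
	return (0);
}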
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_rlock.h>
+
+
+/*
+ * AVL comparison function used to order range locks
+ * Locks are ordered on the start offset of the range.
+ */
+static int
+zfs_rangelock_compare(const void *arg1, const void *arg2)
+{
+ const zfs_locked_range_t *rl1 = (const zfs_locked_range_t *)arg1;
+ const zfs_locked_range_t *rl2 = (const zfs_locked_range_t *)arg2;
+
+ return (TREE_CMP(rl1->lr_offset, rl2->lr_offset));
+}
+
+/*
+ * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock.
+ * It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
+ * and may increase the range that's locked for RL_WRITER.
+ */
+void
+zfs_rangelock_init(zfs_rangelock_t *rl, zfs_rangelock_cb_t *cb, void *arg)
+{
+ mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&rl->rl_tree, zfs_rangelock_compare,
+ sizeof (zfs_locked_range_t), offsetof(zfs_locked_range_t, lr_node));
+ rl->rl_cb = cb;
+ rl->rl_arg = arg;
+}
+
+void
+zfs_rangelock_fini(zfs_rangelock_t *rl)
+{
+ mutex_destroy(&rl->rl_lock);
+ avl_destroy(&rl->rl_tree);
+}
+
+/*
+ * Check if a write lock can be grabbed. If not, fail immediately or sleep and
+ * recheck until available, depending on the value of the "nonblock" parameter.
+ */
+static boolean_t
+zfs_rangelock_enter_writer(zfs_rangelock_t *rl, zfs_locked_range_t *new,
+ boolean_t nonblock)
+{
+ avl_tree_t *tree = &rl->rl_tree;
+ zfs_locked_range_t *lr;
+ avl_index_t where;
+ uint64_t orig_off = new->lr_offset;
+ uint64_t orig_len = new->lr_length;
+ zfs_rangelock_type_t orig_type = new->lr_type;
+
+ for (;;) {
+ /*
+		 * Call the callback, which may modify new's offset, length, and type.
+ * Note, the callback is used by the ZPL to handle appending
+ * and changing blocksizes. It isn't needed for zvols.
+ */
+ if (rl->rl_cb != NULL) {
+ rl->rl_cb(new, rl->rl_arg);
+ }
+
+ /*
+ * If the type was APPEND, the callback must convert it to
+ * WRITER.
+ */
+ ASSERT3U(new->lr_type, ==, RL_WRITER);
+
+ /*
+ * First check for the usual case of no locks
+ */
+ if (avl_numnodes(tree) == 0) {
+ avl_add(tree, new);
+ return (B_TRUE);
+ }
+
+ /*
+ * Look for any locks in the range.
+ */
+ lr = avl_find(tree, new, &where);
+ if (lr != NULL)
+ goto wait; /* already locked at same offset */
+
+ lr = avl_nearest(tree, where, AVL_AFTER);
+ if (lr != NULL &&
+ lr->lr_offset < new->lr_offset + new->lr_length)
+ goto wait;
+
+ lr = avl_nearest(tree, where, AVL_BEFORE);
+ if (lr != NULL &&
+ lr->lr_offset + lr->lr_length > new->lr_offset)
+ goto wait;
+
+ avl_insert(tree, new, where);
+ return (B_TRUE);
+wait:
+ if (nonblock)
+ return (B_FALSE);
+ if (!lr->lr_write_wanted) {
+ cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
+ lr->lr_write_wanted = B_TRUE;
+ }
+ cv_wait(&lr->lr_write_cv, &rl->rl_lock);
+
+ /* reset to original */
+ new->lr_offset = orig_off;
+ new->lr_length = orig_len;
+ new->lr_type = orig_type;
+ }
+}
+
+/*
+ * If this is an original (non-proxy) lock then replace it by
+ * a proxy and return the proxy.
+ */
+static zfs_locked_range_t *
+zfs_rangelock_proxify(avl_tree_t *tree, zfs_locked_range_t *lr)
+{
+ zfs_locked_range_t *proxy;
+
+ if (lr->lr_proxy)
+ return (lr); /* already a proxy */
+
+ ASSERT3U(lr->lr_count, ==, 1);
+ ASSERT(lr->lr_write_wanted == B_FALSE);
+ ASSERT(lr->lr_read_wanted == B_FALSE);
+ avl_remove(tree, lr);
+ lr->lr_count = 0;
+
+ /* create a proxy range lock */
+ proxy = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
+ proxy->lr_offset = lr->lr_offset;
+ proxy->lr_length = lr->lr_length;
+ proxy->lr_count = 1;
+ proxy->lr_type = RL_READER;
+ proxy->lr_proxy = B_TRUE;
+ proxy->lr_write_wanted = B_FALSE;
+ proxy->lr_read_wanted = B_FALSE;
+ avl_add(tree, proxy);
+
+ return (proxy);
+}
+
+/*
+ * Split the range lock at the supplied offset
+ * returning the *front* proxy.
+ */
+static zfs_locked_range_t *
+zfs_rangelock_split(avl_tree_t *tree, zfs_locked_range_t *lr, uint64_t off)
+{
+ zfs_locked_range_t *rear;
+
+ ASSERT3U(lr->lr_length, >, 1);
+ ASSERT3U(off, >, lr->lr_offset);
+ ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
+ ASSERT(lr->lr_write_wanted == B_FALSE);
+ ASSERT(lr->lr_read_wanted == B_FALSE);
+
+ /* create the rear proxy range lock */
+ rear = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
+ rear->lr_offset = off;
+ rear->lr_length = lr->lr_offset + lr->lr_length - off;
+ rear->lr_count = lr->lr_count;
+ rear->lr_type = RL_READER;
+ rear->lr_proxy = B_TRUE;
+ rear->lr_write_wanted = B_FALSE;
+ rear->lr_read_wanted = B_FALSE;
+
+ zfs_locked_range_t *front = zfs_rangelock_proxify(tree, lr);
+ front->lr_length = off - lr->lr_offset;
+
+ avl_insert_here(tree, rear, front, AVL_AFTER);
+ return (front);
+}
+
+/*
+ * Create and add a new proxy range lock for the supplied range.
+ */
+static void
+zfs_rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
+{
+ zfs_locked_range_t *lr;
+
+ ASSERT(len != 0);
+ lr = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_count = 1;
+ lr->lr_type = RL_READER;
+ lr->lr_proxy = B_TRUE;
+ lr->lr_write_wanted = B_FALSE;
+ lr->lr_read_wanted = B_FALSE;
+ avl_add(tree, lr);
+}
+
+static void
+zfs_rangelock_add_reader(avl_tree_t *tree, zfs_locked_range_t *new,
+ zfs_locked_range_t *prev, avl_index_t where)
+{
+ zfs_locked_range_t *next;
+ uint64_t off = new->lr_offset;
+ uint64_t len = new->lr_length;
+
+ /*
+ * prev arrives either:
+ * - pointing to an entry at the same offset
+ * - pointing to the entry with the closest previous offset whose
+ * range may overlap with the new range
+ * - null, if there were no ranges starting before the new one
+ */
+ if (prev != NULL) {
+ if (prev->lr_offset + prev->lr_length <= off) {
+ prev = NULL;
+ } else if (prev->lr_offset != off) {
+ /*
+ * convert to proxy if needed then
+ * split this entry and bump ref count
+ */
+ prev = zfs_rangelock_split(tree, prev, off);
+ prev = AVL_NEXT(tree, prev); /* move to rear range */
+ }
+ }
+ ASSERT((prev == NULL) || (prev->lr_offset == off));
+
+ if (prev != NULL)
+ next = prev;
+ else
+ next = avl_nearest(tree, where, AVL_AFTER);
+
+ if (next == NULL || off + len <= next->lr_offset) {
+ /* no overlaps, use the original new lock in the tree */
+ avl_insert(tree, new, where);
+ return;
+ }
+
+ if (off < next->lr_offset) {
+ /* Add a proxy for initial range before the overlap */
+ zfs_rangelock_new_proxy(tree, off, next->lr_offset - off);
+ }
+
+ new->lr_count = 0; /* will use proxies in tree */
+ /*
+ * We now search forward through the ranges, until we go past the end
+ * of the new range.  For each entry we make it a proxy if it
+ * isn't already, then bump its reference count.  If there are any
+ * gaps between the ranges then we create a new proxy range.
+ */
+ for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->lr_offset)
+ break;
+ if (prev != NULL && prev->lr_offset + prev->lr_length <
+ next->lr_offset) {
+ /* there's a gap */
+ ASSERT3U(next->lr_offset, >,
+ prev->lr_offset + prev->lr_length);
+ zfs_rangelock_new_proxy(tree,
+ prev->lr_offset + prev->lr_length,
+ next->lr_offset -
+ (prev->lr_offset + prev->lr_length));
+ }
+ if (off + len == next->lr_offset + next->lr_length) {
+ /* exact overlap with end */
+ next = zfs_rangelock_proxify(tree, next);
+ next->lr_count++;
+ return;
+ }
+ if (off + len < next->lr_offset + next->lr_length) {
+ /* new range ends in the middle of this block */
+ next = zfs_rangelock_split(tree, next, off + len);
+ next->lr_count++;
+ return;
+ }
+ ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
+ next = zfs_rangelock_proxify(tree, next);
+ next->lr_count++;
+ }
+
+ /* Add the remaining end range. */
+ zfs_rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
+ (off + len) - (prev->lr_offset + prev->lr_length));
+}
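+
+/*
+ * Worked example of the reader bookkeeping above (illustration only):
+ * suppose the tree holds one original reader lock R = [0, 100) with
+ * lr_count == 1 and a new reader N = [50, 150) arrives.  R is split at
+ * offset 50 into a front proxy [0, 50) and a rear proxy [50, 100); the
+ * rear proxy's lr_count is bumped to 2 for the overlap with N, and a new
+ * proxy [100, 150) with lr_count == 1 covers the remainder.  N itself is
+ * kept out of the tree with lr_count == 0, which tells
+ * zfs_rangelock_exit_reader() to walk the proxies rather than remove a
+ * single node.
+ */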
+
+/*
+ * Check if a reader lock can be grabbed. If not, fail immediately or sleep and
+ * recheck until available, depending on the value of the "nonblock" parameter.
+ */
+static boolean_t
+zfs_rangelock_enter_reader(zfs_rangelock_t *rl, zfs_locked_range_t *new,
+ boolean_t nonblock)
+{
+ avl_tree_t *tree = &rl->rl_tree;
+ zfs_locked_range_t *prev, *next;
+ avl_index_t where;
+ uint64_t off = new->lr_offset;
+ uint64_t len = new->lr_length;
+
+ /*
+ * Look for any writer locks in the range.
+ */
+retry:
+ prev = avl_find(tree, new, &where);
+ if (prev == NULL)
+ prev = avl_nearest(tree, where, AVL_BEFORE);
+
+ /*
+ * Check the previous range for a writer lock overlap.
+ */
+ if (prev && (off < prev->lr_offset + prev->lr_length)) {
+ if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
+ if (nonblock)
+ return (B_FALSE);
+ if (!prev->lr_read_wanted) {
+ cv_init(&prev->lr_read_cv,
+ NULL, CV_DEFAULT, NULL);
+ prev->lr_read_wanted = B_TRUE;
+ }
+ cv_wait(&prev->lr_read_cv, &rl->rl_lock);
+ goto retry;
+ }
+ if (off + len < prev->lr_offset + prev->lr_length)
+ goto got_lock;
+ }
+
+ /*
+ * Search through the following ranges to see if there's any
+ * overlapping write lock.
+ */
+ if (prev != NULL)
+ next = AVL_NEXT(tree, prev);
+ else
+ next = avl_nearest(tree, where, AVL_AFTER);
+ for (; next != NULL; next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->lr_offset)
+ goto got_lock;
+ if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
+ if (nonblock)
+ return (B_FALSE);
+ if (!next->lr_read_wanted) {
+ cv_init(&next->lr_read_cv,
+ NULL, CV_DEFAULT, NULL);
+ next->lr_read_wanted = B_TRUE;
+ }
+ cv_wait(&next->lr_read_cv, &rl->rl_lock);
+ goto retry;
+ }
+ if (off + len <= next->lr_offset + next->lr_length)
+ goto got_lock;
+ }
+
+got_lock:
+ /*
+ * Add the read lock, which may involve splitting existing
+ * locks and bumping ref counts (lr_count).
+ */
+ zfs_rangelock_add_reader(tree, new, prev, where);
+ return (B_TRUE);
+}
+
+/*
+ * Lock a range (offset, length) as either shared (RL_READER) or exclusive
+ * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert
+ * it to a RL_WRITER lock (with the offset at the end of the file). Returns
+ * the range lock structure for later unlocking (or reduce range if the
+ * entire file is locked as RL_WRITER), or NULL if nonblock is true and the
+ * lock could not be acquired immediately.
+ */
+static zfs_locked_range_t *
+zfs_rangelock_enter_impl(zfs_rangelock_t *rl, uint64_t off, uint64_t len,
+ zfs_rangelock_type_t type, boolean_t nonblock)
+{
+ zfs_locked_range_t *new;
+
+ ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
+
+ new = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
+ new->lr_rangelock = rl;
+ new->lr_offset = off;
+ if (len + off < off) /* overflow */
+ len = UINT64_MAX - off;
+ new->lr_length = len;
+ new->lr_count = 1; /* assume it's going to be in the tree */
+ new->lr_type = type;
+ new->lr_proxy = B_FALSE;
+ new->lr_write_wanted = B_FALSE;
+ new->lr_read_wanted = B_FALSE;
+
+ mutex_enter(&rl->rl_lock);
+ if (type == RL_READER) {
+ /*
+ * First check for the usual case of no locks
+ */
+ if (avl_numnodes(&rl->rl_tree) == 0) {
+ avl_add(&rl->rl_tree, new);
+ } else if (!zfs_rangelock_enter_reader(rl, new, nonblock)) {
+ kmem_free(new, sizeof (*new));
+ new = NULL;
+ }
+ } else if (!zfs_rangelock_enter_writer(rl, new, nonblock)) {
+ kmem_free(new, sizeof (*new));
+ new = NULL;
+ }
+ mutex_exit(&rl->rl_lock);
+ return (new);
+}
+
+zfs_locked_range_t *
+zfs_rangelock_enter(zfs_rangelock_t *rl, uint64_t off, uint64_t len,
+ zfs_rangelock_type_t type)
+{
+ return (zfs_rangelock_enter_impl(rl, off, len, type, B_FALSE));
+}
+
+zfs_locked_range_t *
+zfs_rangelock_tryenter(zfs_rangelock_t *rl, uint64_t off, uint64_t len,
+ zfs_rangelock_type_t type)
+{
+ return (zfs_rangelock_enter_impl(rl, off, len, type, B_TRUE));
+}
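+
+/*
+ * Minimal usage sketch (illustration only), assuming in-kernel context and
+ * a hypothetical znode-like object "zp" that embeds a zfs_rangelock_t as
+ * z_rangelock, as the ZPL and zvol callers do:
+ *
+ *	zfs_locked_range_t *lr;
+ *
+ *	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_READER);
+ *	... read the byte range [off, off + len) ...
+ *	zfs_rangelock_exit(lr);
+ *
+ *	lr = zfs_rangelock_tryenter(&zp->z_rangelock, off, len, RL_WRITER);
+ *	if (lr == NULL)
+ *		... lock unavailable; caller decides whether to retry ...
+ *	else
+ *		zfs_rangelock_exit(lr);	... after modifying the range ...
+ */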
+
+/*
+ * Safely free the zfs_locked_range_t.
+ */
+static void
+zfs_rangelock_free(zfs_locked_range_t *lr)
+{
+ if (lr->lr_write_wanted)
+ cv_destroy(&lr->lr_write_cv);
+
+ if (lr->lr_read_wanted)
+ cv_destroy(&lr->lr_read_cv);
+
+ kmem_free(lr, sizeof (zfs_locked_range_t));
+}
+
+/*
+ * Unlock a reader lock
+ */
+static void
+zfs_rangelock_exit_reader(zfs_rangelock_t *rl, zfs_locked_range_t *remove,
+ list_t *free_list)
+{
+ avl_tree_t *tree = &rl->rl_tree;
+ uint64_t len;
+
+ /*
+ * The common case is when the entry being removed is still in the
+ * tree (lr_count == 1), meaning there have been no other reader
+ * locks overlapping with this one.  Otherwise the entry will have
+ * been removed from the tree and replaced by proxies (one or
+ * more ranges covering the entire locked range).
+ */
+ if (remove->lr_count == 1) {
+ avl_remove(tree, remove);
+ if (remove->lr_write_wanted)
+ cv_broadcast(&remove->lr_write_cv);
+ if (remove->lr_read_wanted)
+ cv_broadcast(&remove->lr_read_cv);
+ list_insert_tail(free_list, remove);
+ } else {
+ ASSERT0(remove->lr_count);
+ ASSERT0(remove->lr_write_wanted);
+ ASSERT0(remove->lr_read_wanted);
+ /*
+ * Find start proxy representing this reader lock,
+ * then decrement ref count on all proxies
+ * that make up this range, freeing them as needed.
+ */
+ zfs_locked_range_t *lr = avl_find(tree, remove, NULL);
+ ASSERT3P(lr, !=, NULL);
+ ASSERT3U(lr->lr_count, !=, 0);
+ ASSERT3U(lr->lr_type, ==, RL_READER);
+ zfs_locked_range_t *next = NULL;
+ for (len = remove->lr_length; len != 0; lr = next) {
+ len -= lr->lr_length;
+ if (len != 0) {
+ next = AVL_NEXT(tree, lr);
+ ASSERT3P(next, !=, NULL);
+ ASSERT3U(lr->lr_offset + lr->lr_length, ==,
+ next->lr_offset);
+ ASSERT3U(next->lr_count, !=, 0);
+ ASSERT3U(next->lr_type, ==, RL_READER);
+ }
+ lr->lr_count--;
+ if (lr->lr_count == 0) {
+ avl_remove(tree, lr);
+ if (lr->lr_write_wanted)
+ cv_broadcast(&lr->lr_write_cv);
+ if (lr->lr_read_wanted)
+ cv_broadcast(&lr->lr_read_cv);
+ list_insert_tail(free_list, lr);
+ }
+ }
+ kmem_free(remove, sizeof (zfs_locked_range_t));
+ }
+}
+
+/*
+ * Unlock range and destroy range lock structure.
+ */
+void
+zfs_rangelock_exit(zfs_locked_range_t *lr)
+{
+ zfs_rangelock_t *rl = lr->lr_rangelock;
+ list_t free_list;
+ zfs_locked_range_t *free_lr;
+
+ ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
+ ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
+ ASSERT(!lr->lr_proxy);
+
+ /*
+ * The free list is used to defer the cv_destroy() and
+ * subsequent kmem_free until after the mutex is dropped.
+ */
+ list_create(&free_list, sizeof (zfs_locked_range_t),
+ offsetof(zfs_locked_range_t, lr_node));
+
+ mutex_enter(&rl->rl_lock);
+ if (lr->lr_type == RL_WRITER) {
+ /* writer locks can't be shared or split */
+ avl_remove(&rl->rl_tree, lr);
+ if (lr->lr_write_wanted)
+ cv_broadcast(&lr->lr_write_cv);
+ if (lr->lr_read_wanted)
+ cv_broadcast(&lr->lr_read_cv);
+ list_insert_tail(&free_list, lr);
+ } else {
+ /*
+ * lock may be shared, let rangelock_exit_reader()
+ * release the lock and free the zfs_locked_range_t.
+ */
+ zfs_rangelock_exit_reader(rl, lr, &free_list);
+ }
+ mutex_exit(&rl->rl_lock);
+
+ while ((free_lr = list_remove_head(&free_list)) != NULL)
+ zfs_rangelock_free(free_lr);
+
+ list_destroy(&free_list);
+}
+
+/*
+ * Reduce range locked as RL_WRITER from whole file to specified range.
+ * Asserts the whole file is exclusively locked and so there's only one
+ * entry in the tree.
+ */
+void
+zfs_rangelock_reduce(zfs_locked_range_t *lr, uint64_t off, uint64_t len)
+{
+ zfs_rangelock_t *rl = lr->lr_rangelock;
+
+ /* Ensure there are no other locks */
+ ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
+ ASSERT3U(lr->lr_offset, ==, 0);
+ ASSERT3U(lr->lr_type, ==, RL_WRITER);
+ ASSERT(!lr->lr_proxy);
+ ASSERT3U(lr->lr_length, ==, UINT64_MAX);
+ ASSERT3U(lr->lr_count, ==, 1);
+
+ mutex_enter(&rl->rl_lock);
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ mutex_exit(&rl->rl_lock);
+ if (lr->lr_write_wanted)
+ cv_broadcast(&lr->lr_write_cv);
+ if (lr->lr_read_wanted)
+ cv_broadcast(&lr->lr_read_cv);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_rangelock_init);
+EXPORT_SYMBOL(zfs_rangelock_fini);
+EXPORT_SYMBOL(zfs_rangelock_enter);
+EXPORT_SYMBOL(zfs_rangelock_tryenter);
+EXPORT_SYMBOL(zfs_rangelock_exit);
+EXPORT_SYMBOL(zfs_rangelock_reduce);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zfs_sa.c b/sys/contrib/openzfs/module/zfs/zfs_sa.c
new file mode 100644
index 000000000000..67be131da63b
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_sa.c
@@ -0,0 +1,446 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/vnode.h>
+#include <sys/sa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_sa.h>
+#include <sys/dmu_objset.h>
+#include <sys/sa_impl.h>
+
+/*
+ * ZPL attribute registration table.
+ * The order of the attributes doesn't matter; a unique value will be
+ * assigned for each attribute that is file system specific.
+ *
+ * This is just the set of ZPL attributes that this version of ZFS
+ * deals with natively.  The file system could have other attributes
+ * stored in files, but they will be ignored.  The SA framework will
+ * preserve them, but this version of ZFS won't change or delete them.
+ */
+
+sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+ {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0},
+ {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
+ {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
+ {"ZPL_DACL_ACES", 0, SA_ACL, 0},
+ {"ZPL_DXATTR", 0, SA_UINT8_ARRAY, 0},
+ {"ZPL_PROJID", sizeof (uint64_t), SA_UINT64_ARRAY, 0},
+ {NULL, 0, 0, 0}
+};
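+
+/*
+ * Illustrative sketch (not part of this file): the table above is consumed
+ * by sa_setup() at mount time, which maps each registered attribute to a
+ * per-objset sa_attr_type_t handle.  The variables os, sa_obj, and
+ * attr_table below are assumptions made for the example.
+ *
+ *	sa_attr_type_t *attr_table;
+ *	int error;
+ *
+ *	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &attr_table);
+ *	if (error != 0)
+ *		return (error);
+ *	... attr_table[ZPL_SIZE] etc. may now be passed to sa_lookup() ...
+ */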
+
+#ifdef _KERNEL
+int
+zfs_sa_readlink(znode_t *zp, zfs_uio_t *uio)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ size_t bufsz;
+ int error;
+
+ bufsz = zp->z_size;
+ if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) {
+ error = zfs_uiomove((caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE,
+ MIN((size_t)bufsz, zfs_uio_resid(uio)), UIO_READ, uio);
+ } else {
+ dmu_buf_t *dbp;
+ if ((error = dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id,
+ 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) {
+ error = zfs_uiomove(dbp->db_data,
+ MIN((size_t)bufsz, zfs_uio_resid(uio)), UIO_READ,
+ uio);
+ dmu_buf_rele(dbp, FTAG);
+ }
+ }
+ return (error);
+}
+
+void
+zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+
+ if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
+ VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx));
+ if (len) {
+ bcopy(link, (caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE, len);
+ }
+ } else {
+ dmu_buf_t *dbp;
+
+ zfs_grow_blocksize(zp, len, tx);
+ VERIFY0(dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, FTAG, &dbp,
+ DMU_READ_NO_PREFETCH));
+
+ dmu_buf_will_dirty(dbp, tx);
+
+ ASSERT3U(len, <=, dbp->db_size);
+ bcopy(link, dbp->db_data, len);
+ dmu_buf_rele(dbp, FTAG);
+ }
+}
+
+void
+zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ xoptattr_t *xoap;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa) {
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp)) != 0)
+ return;
+ } else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP))
+ return;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ if (len <= doi.doi_bonus_size) {
+ (void) memcpy(xoap->xoa_av_scanstamp,
+ (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ sizeof (xoap->xoa_av_scanstamp));
+ }
+ }
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+}
+
+void
+zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ xoptattr_t *xoap;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa)
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp), tx));
+ else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+ if (len > doi.doi_bonus_size)
+ VERIFY(dmu_set_bonus(db, len, tx) == 0);
+ (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp));
+
+ zp->z_pflags |= ZFS_BONUS_SCANSTAMP;
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ &zp->z_pflags, sizeof (uint64_t), tx));
+ }
+}
+
+int
+zfs_sa_get_xattr(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ char *obj;
+ int size;
+ int error;
+
+ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+ ASSERT(!zp->z_xattr_cached);
+ ASSERT(zp->z_is_sa);
+
+ error = sa_size(zp->z_sa_hdl, SA_ZPL_DXATTR(zfsvfs), &size);
+ if (error) {
+ if (error == ENOENT)
+ return nvlist_alloc(&zp->z_xattr_cached,
+ NV_UNIQUE_NAME, KM_SLEEP);
+ else
+ return (error);
+ }
+
+ obj = vmem_alloc(size, KM_SLEEP);
+
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zfsvfs), obj, size);
+ if (error == 0)
+ error = nvlist_unpack(obj, size, &zp->z_xattr_cached, KM_SLEEP);
+
+ vmem_free(obj, size);
+
+ return (error);
+}
+
+int
+zfs_sa_set_xattr(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_tx_t *tx;
+ char *obj;
+ size_t size;
+ int error;
+
+ ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
+ ASSERT(zp->z_xattr_cached);
+ ASSERT(zp->z_is_sa);
+
+ error = nvlist_size(zp->z_xattr_cached, &size, NV_ENCODE_XDR);
+ if ((error == 0) && (size > SA_ATTR_MAX_LEN))
+ error = SET_ERROR(EFBIG);
+ if (error)
+ goto out;
+
+ obj = vmem_alloc(size, KM_SLEEP);
+
+ error = nvlist_pack(zp->z_xattr_cached, &obj, &size,
+ NV_ENCODE_XDR, KM_SLEEP);
+ if (error)
+ goto out_free;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa_create(tx, size);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ int count = 0;
+ sa_bulk_attr_t bulk[2];
+ uint64_t ctime[2];
+
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DXATTR(zfsvfs),
+ NULL, obj, size);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
+
+ dmu_tx_commit(tx);
+ }
+out_free:
+ vmem_free(obj, size);
+out:
+ return (error);
+}
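+
+/*
+ * Illustrative sketch (not part of this module): the DXATTR attribute used
+ * above is simply an XDR-packed nvlist.  The standalone userspace program
+ * below (link against libnvpair) models the same pack/unpack round trip;
+ * the "key"/"value" pair is arbitrary.
+ */
+#include <libnvpair.h>
+#include <assert.h>
+#include <stdlib.h>
+
+int
+main(void)
+{
+    nvlist_t *nvl, *copy;
+    char *buf;
+    size_t size;
+
+    /* Build an nvlist, much as z_xattr_cached is built in-kernel. */
+    assert(nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) == 0);
+    assert(nvlist_add_string(nvl, "key", "value") == 0);
+
+    /* Pack it with XDR encoding, as zfs_sa_set_xattr() does. */
+    assert(nvlist_size(nvl, &size, NV_ENCODE_XDR) == 0);
+    buf = malloc(size);
+    assert(buf != NULL);
+    assert(nvlist_pack(nvl, &buf, &size, NV_ENCODE_XDR, 0) == 0);
+
+    /* Unpack it again, as zfs_sa_get_xattr() does after sa_lookup(). */
+    assert(nvlist_unpack(buf, size, &copy, 0) == 0);
+
+    free(buf);
+    nvlist_free(nvl);
+    nvlist_free(copy);
+    return (0);
+}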
+
+/*
+ * I'm not convinced we should do any of this upgrade,
+ * since the SA code can read both the old and new znode formats
+ * with probably little to no performance difference.
+ *
+ * All new files will be created with the new format.
+ */
+
+void
+zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(hdl);
+ znode_t *zp = sa_get_userdata(hdl);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int count = 0;
+ sa_bulk_attr_t *bulk, *sa_attrs;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t uid, gid, mode, rdev, xattr, parent, tmp_gen;
+ uint64_t crtime[2], mtime[2], ctime[2], atime[2];
+ uint64_t links;
+ zfs_acl_phys_t znode_acl;
+ char scanstamp[AV_SCANSTAMP_SZ];
+ boolean_t drop_lock = B_FALSE;
+
+ /*
+ * No upgrade if the ACL isn't cached, since we won't know
+ * which locks are held, and reading the ACL would require
+ * special "locked" interfaces that would be messy.
+ */
+ if (zp->z_acl_cached == NULL || Z_ISLNK(ZTOTYPE(zp)))
+ return;
+
+ /*
+ * If z_lock is held and we aren't the owner, then just
+ * return, since we don't want to deadlock trying to update
+ * the status of z_is_sa.  This file can then be upgraded at
+ * a later time.
+ *
+ * Otherwise, we know we are doing the
+ * sa_update() that caused us to enter this function.
+ */
+ if (MUTEX_NOT_HELD(&zp->z_lock)) {
+ if (mutex_tryenter(&zp->z_lock) == 0)
+ return;
+ else
+ drop_lock = B_TRUE;
+ }
+
+ /* First do a bulk query of the attributes that aren't cached */
+ bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &znode_acl, 88);
+
+ if (sa_bulk_lookup_locked(hdl, bulk, count) != 0)
+ goto done;
+
+ if (dmu_objset_projectquota_enabled(hdl->sa_os) &&
+ !(zp->z_pflags & ZFS_PROJID)) {
+ zp->z_pflags |= ZFS_PROJID;
+ zp->z_projid = ZFS_DEFAULT_PROJID;
+ }
+
+ /*
+ * While the order here doesn't matter, it's best to try to organize
+ * it in such a way as to pick up an already existing layout number.
+ */
+ count = 0;
+ sa_attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs),
+ NULL, &tmp_gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ links = ZTONLNK(zp);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &links, 8);
+ if (dmu_objset_projectquota_enabled(hdl->sa_os))
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PROJID(zfsvfs), NULL,
+ &zp->z_projid, 8);
+ if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp)))
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &zp->z_acl_cached->z_acl_count, 8);
+
+ if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
+ zfs_acl_xform(zp, zp->z_acl_cached, CRED());
+
+ locate.cb_aclp = zp->z_acl_cached;
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);
+
+ if (xattr)
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs),
+ NULL, &xattr, 8);
+
+ /* If a scanstamp is stored in the bonus buffer, migrate it. */
+
+ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
+ bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ scanstamp, AV_SCANSTAMP_SZ);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs),
+ NULL, scanstamp, AV_SCANSTAMP_SZ);
+ zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
+ }
+
+ VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
+ VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs,
+ count, tx) == 0);
+ if (znode_acl.z_acl_extern_obj)
+ VERIFY(0 == dmu_object_free(zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, tx));
+
+ zp->z_is_sa = B_TRUE;
+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+done:
+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END);
+ if (drop_lock)
+ mutex_exit(&zp->z_lock);
+}
+
+void
+zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
+{
+ if (!ZTOZSB(zp)->z_use_sa || zp->z_is_sa)
+ return;
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ if (zfs_external_acl(zp)) {
+ dmu_tx_hold_free(tx, zfs_external_acl(zp), 0,
+ DMU_OBJECT_END);
+ }
+}
+
+EXPORT_SYMBOL(zfs_attr_table);
+EXPORT_SYMBOL(zfs_sa_readlink);
+EXPORT_SYMBOL(zfs_sa_symlink);
+EXPORT_SYMBOL(zfs_sa_get_scanstamp);
+EXPORT_SYMBOL(zfs_sa_set_scanstamp);
+EXPORT_SYMBOL(zfs_sa_get_xattr);
+EXPORT_SYMBOL(zfs_sa_set_xattr);
+EXPORT_SYMBOL(zfs_sa_upgrade);
+EXPORT_SYMBOL(zfs_sa_upgrade_txholds);
+
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
new file mode 100644
index 000000000000..61d5f06c6455
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
@@ -0,0 +1,897 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/policy.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+
+static ulong_t zfs_fsync_sync_cnt = 4;
+
+int
+zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+ (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
+
+ if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ }
+ tsd_set(zfs_fsyncer_key, NULL);
+
+ return (0);
+}
+
+
+#if defined(SEEK_HOLE) && defined(SEEK_DATA)
+/*
+ * Lseek support for finding holes (cmd == SEEK_HOLE) and
+ * data (cmd == SEEK_DATA). "off" is an in/out parameter.
+ */
+static int
+zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
+{
+ uint64_t noff = (uint64_t)*off; /* new offset */
+ uint64_t file_sz;
+ int error;
+ boolean_t hole;
+
+ file_sz = zp->z_size;
+ if (noff >= file_sz) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (cmd == F_SEEK_HOLE)
+ hole = B_TRUE;
+ else
+ hole = B_FALSE;
+
+ error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
+
+ if (error == ESRCH)
+ return (SET_ERROR(ENXIO));
+
+ /* file was dirty, so fall back to using generic logic */
+ if (error == EBUSY) {
+ if (hole)
+ *off = file_sz;
+
+ return (0);
+ }
+
+ /*
+ * We could find a hole that begins after the logical end-of-file,
+ * because dmu_offset_next() only works on whole blocks. If the
+ * EOF falls mid-block, then indicate that the "virtual hole"
+ * at the end of the file begins at the logical EOF, rather than
+ * at the end of the last block.
+ */
+ if (noff > file_sz) {
+ ASSERT(hole);
+ noff = file_sz;
+ }
+
+ if (noff < *off)
+ return (error);
+ *off = noff;
+ return (error);
+}
+
+int
+zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ error = zfs_holey_common(zp, cmd, off);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+#endif /* SEEK_HOLE && SEEK_DATA */
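+
+/*
+ * Illustrative sketch (not part of this module): from userspace the
+ * SEEK_HOLE/SEEK_DATA support above is reached through plain lseek(2).
+ * The hypothetical program below prints the first hole at or after offset
+ * 0 of the file named on the command line.
+ */
+#define _GNU_SOURCE		/* for SEEK_HOLE/SEEK_DATA on Linux */
+#include <sys/types.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(int argc, char **argv)
+{
+    off_t hole;
+    int fd;
+
+    if (argc != 2 || (fd = open(argv[1], O_RDONLY)) == -1)
+        return (1);
+
+    hole = lseek(fd, 0, SEEK_HOLE);
+    if (hole == -1)
+        perror("lseek");	/* e.g. ENXIO when the offset is past EOF */
+    else
+        printf("first hole at %lld\n", (long long)hole);
+
+    close(fd);
+    return (0);
+}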
+
+/*ARGSUSED*/
+int
+zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (flag & V_ACE_MASK)
+ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+ else
+ error = zfs_zaccess_rwx(zp, mode, flag, cr);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ * IN: zp - inode of file to be read from.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * ioflag - O_SYNC flags; used to provide FRSYNC semantics.
+ * O_DIRECT flag; used to bypass page cache.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Side Effects:
+ * inode - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+int
+zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
+{
+ int error = 0;
+ boolean_t frsync = B_FALSE;
+
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (zp->z_pflags & ZFS_AV_QUARANTINED) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+
+ /* We don't copy out anything useful for directories. */
+ if (Z_ISDIR(ZTOTYPE(zp))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ /*
+ * Validate file offset
+ */
+ if (zfs_uio_offset(uio) < (offset_t)0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Fasttrack empty reads
+ */
+ if (zfs_uio_resid(uio) == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+#ifdef FRSYNC
+ /*
+ * If we're in FRSYNC mode, sync out this znode before reading it.
+ * Only do this for non-snapshots.
+ *
+ * Some platforms do not support FRSYNC and instead map it
+ * to O_SYNC, which results in unnecessary calls to zil_commit. We
+ * only honor FRSYNC requests on platforms which support it.
+ */
+ frsync = !!(ioflag & FRSYNC);
+#endif
+ if (zfsvfs->z_log &&
+ (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
+ zil_commit(zfsvfs->z_log, zp->z_id);
+
+ /*
+ * Lock the range against changes.
+ */
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
+ zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER);
+
+ /*
+ * If we are reading past end-of-file we can skip
+ * to the end; but we might still need to set atime.
+ */
+ if (zfs_uio_offset(uio) >= zp->z_size) {
+ error = 0;
+ goto out;
+ }
+
+ ASSERT(zfs_uio_offset(uio) < zp->z_size);
+ ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
+ ssize_t start_resid = n;
+
+ while (n > 0) {
+ ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
+ P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size));
+#ifdef UIO_NOCOPY
+ if (zfs_uio_segflg(uio) == UIO_NOCOPY)
+ error = mappedread_sf(zp, nbytes, uio);
+ else
+#endif
+ if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) {
+ error = mappedread(zp, nbytes, uio);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes);
+ }
+
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+
+ n -= nbytes;
+ }
+
+ int64_t nread = start_resid - n;
+ dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
+ task_io_account_read(nread);
+out:
+ zfs_rangelock_exit(lr);
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
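+
+/*
+ * Illustrative sketch (not part of this module): the read loop above slices
+ * the request so each iteration stops at the next chunk boundary, where
+ * P2PHASE(off, chunk) is the offset within the current power-of-two sized
+ * chunk.  A standalone model of that arithmetic:
+ */
+#include <stdint.h>
+#include <stdio.h>
+
+#define CHUNK   (1024 * 1024)   /* mirrors zfs_vnops_read_chunk_size */
+#define MIN(a, b)   ((a) < (b) ? (a) : (b))
+
+int
+main(void)
+{
+    uint64_t off = 1048000;     /* arbitrary starting offset */
+    uint64_t n = 3 * CHUNK;     /* bytes remaining to read */
+
+    while (n > 0) {
+        /* P2PHASE(off, CHUNK) == (off & (CHUNK - 1)) for power-of-two CHUNK */
+        uint64_t nbytes = MIN(n, CHUNK - (off & (CHUNK - 1)));
+
+        printf("read %llu bytes at offset %llu\n",
+            (unsigned long long)nbytes, (unsigned long long)off);
+        off += nbytes;
+        n -= nbytes;
+    }
+    return (0);
+}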
+
+/*
+ * Write the bytes to a file.
+ *
+ * IN: zp - znode of file to be written to.
+ * uio - structure supplying write location, range info,
+ * and data buffer.
+ * ioflag - O_APPEND flag set if in append mode.
+ * O_DIRECT flag; used to bypass page cache.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - ctime|mtime updated if byte count > 0
+ */
+
+/* ARGSUSED */
+int
+zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
+{
+ int error = 0;
+ ssize_t start_resid = zfs_uio_resid(uio);
+
+ /*
+ * Fasttrack empty write
+ */
+ ssize_t n = start_resid;
+ if (n == 0)
+ return (0);
+
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+ uint64_t mtime[2], ctime[2];
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(zfsvfs)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * If the file is immutable or read-only, or is append-only and
+ * this write does not append to the end of the file, return EPERM.
+ */
+ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
+ ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
+ (zfs_uio_offset(uio) < zp->z_size))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * Validate file offset
+ */
+ offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio);
+ if (woff < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ const uint64_t max_blksz = zfsvfs->z_max_blksz;
+
+ /*
+ * Pre-fault the pages to ensure slow (e.g. NFS) pages
+ * don't hold up the txg.
+ * Skip this if the uio contains a loaned arc_buf.
+ */
+ if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFAULT));
+ }
+
+ /*
+ * If in append mode, set the io offset pointer to eof.
+ */
+ zfs_locked_range_t *lr;
+ if (ioflag & O_APPEND) {
+ /*
+ * Obtain an appending range lock to guarantee file append
+ * semantics. We reset the write offset once we have the lock.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+ woff = lr->lr_offset;
+ if (lr->lr_length == UINT64_MAX) {
+ /*
+ * We overlocked the file because this write will cause
+ * the file block size to increase.
+ * Note that zp_size cannot change with this lock held.
+ */
+ woff = zp->z_size;
+ }
+ zfs_uio_setoffset(uio, woff);
+ } else {
+ /*
+ * Note that if the file block size will change as a result of
+ * this write, then this range lock will lock the entire file
+ * so that we can re-write the block safely.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
+ }
+
+ if (zn_rlimit_fsize(zp, uio)) {
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFBIG));
+ }
+
+ const rlim64_t limit = MAXOFFSET_T;
+
+ if (woff >= limit) {
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFBIG));
+ }
+
+ if (n > limit - woff)
+ n = limit - woff;
+
+ uint64_t end_size = MAX(zp->z_size, woff + n);
+ zilog_t *zilog = zfsvfs->z_log;
+
+ const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
+ const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
+ const uint64_t projid = zp->z_projid;
+
+ /*
+ * Write the file in reasonable size chunks. Each chunk is written
+ * in a separate transaction; this keeps the intent log records small
+ * and allows us to do more fine-grained space accounting.
+ */
+ while (n > 0) {
+ woff = zfs_uio_offset(uio);
+
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
+ (projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ projid))) {
+ error = SET_ERROR(EDQUOT);
+ break;
+ }
+
+ arc_buf_t *abuf = NULL;
+ if (n >= max_blksz && woff >= zp->z_size &&
+ P2PHASE(woff, max_blksz) == 0 &&
+ zp->z_blksz == max_blksz) {
+ /*
+ * This write covers a full block. "Borrow" a buffer
+ * from the dmu so that we can fill it before we enter
+ * a transaction. This avoids the possibility of
+ * holding up the transaction if the data copy hangs
+ * up on a pagefault (e.g., from an NFS server mapping).
+ */
+ size_t cbytes;
+
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ max_blksz);
+ ASSERT(abuf != NULL);
+ ASSERT(arc_buf_size(abuf) == max_blksz);
+ if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
+ UIO_WRITE, uio, &cbytes))) {
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+ ASSERT3S(cbytes, ==, max_blksz);
+ }
+
+ /*
+ * Start a transaction.
+ */
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
+ MIN(n, max_blksz));
+ DB_DNODE_EXIT(db);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+
+ /*
+ * If rangelock_enter() over-locked we grow the blocksize
+ * and then reduce the lock range. This will only happen
+ * on the first iteration since rangelock_reduce() will
+ * shrink down lr_length to the appropriate size.
+ */
+ if (lr->lr_length == UINT64_MAX) {
+ uint64_t new_blksz;
+
+ if (zp->z_blksz > max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end_size,
+ 1 << highbit64(zp->z_blksz));
+ } else {
+ new_blksz = MIN(end_size, max_blksz);
+ }
+ zfs_grow_blocksize(zp, new_blksz, tx);
+ zfs_rangelock_reduce(lr, woff, n);
+ }
+
+ /*
+ * XXX - should we really limit each write to z_max_blksz?
+ * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+ */
+ const ssize_t nbytes =
+ MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
+ ssize_t tx_bytes;
+ if (abuf == NULL) {
+ tx_bytes = zfs_uio_resid(uio);
+ zfs_uio_fault_disable(uio, B_TRUE);
+ error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes, tx);
+ zfs_uio_fault_disable(uio, B_FALSE);
+#ifdef __linux__
+ if (error == EFAULT) {
+ dmu_tx_commit(tx);
+ /*
+ * Account for partial writes before
+ * continuing the loop.
+ * Update needs to occur before the next
+ * zfs_uio_prefaultpages, or prefaultpages may
+ * error, and we may break the loop early.
+ */
+ if (tx_bytes != zfs_uio_resid(uio))
+ n -= tx_bytes - zfs_uio_resid(uio);
+ if (zfs_uio_prefaultpages(MIN(n, max_blksz),
+ uio)) {
+ break;
+ }
+ continue;
+ }
+#endif
+ if (error != 0) {
+ dmu_tx_commit(tx);
+ break;
+ }
+ tx_bytes -= zfs_uio_resid(uio);
+ } else {
+ /* Implied by abuf != NULL: */
+ ASSERT3S(n, >=, max_blksz);
+ ASSERT0(P2PHASE(woff, max_blksz));
+ /*
+ * We can simplify nbytes to MIN(n, max_blksz) since
+ * P2PHASE(woff, max_blksz) is 0, and knowing
+ * n >= max_blksz lets us simplify further:
+ */
+ ASSERT3S(nbytes, ==, max_blksz);
+ /*
+ * Thus, we're writing a full block at a block-aligned
+ * offset and extending the file past EOF.
+ *
+ * dmu_assign_arcbuf_by_dbuf() will directly assign the
+ * arc buffer to a dbuf.
+ */
+ error = dmu_assign_arcbuf_by_dbuf(
+ sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
+ if (error != 0) {
+ dmu_return_arcbuf(abuf);
+ dmu_tx_commit(tx);
+ break;
+ }
+ ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
+ zfs_uioskip(uio, nbytes);
+ tx_bytes = nbytes;
+ }
+ if (tx_bytes && zn_has_cached_data(zp) &&
+ !(ioflag & O_DIRECT)) {
+ update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
+ }
+
+ /*
+ * If we made no progress, we're done. If we made even
+ * partial progress, update the znode and ZIL accordingly.
+ */
+ if (tx_bytes == 0) {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+ dmu_tx_commit(tx);
+ ASSERT(error != 0);
+ break;
+ }
+
+ /*
+ * Clear Set-UID/Set-GID bits on successful write if not
+ * privileged and at least one of the execute bits is set.
+ *
+ * It would be nice to do this after all writes have
+ * been done, but that would still expose the ISUID/ISGID
+ * to another app after the partial write is committed.
+ *
+ * Note: we don't call zfs_fuid_map_id() here because
+ * user 0 is not an ephemeral uid.
+ */
+ mutex_enter(&zp->z_acl_lock);
+ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ (S_IXUSR >> 6))) != 0 &&
+ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(zp, cr,
+ ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
+ uint64_t newmode;
+ zp->z_mode &= ~(S_ISUID | S_ISGID);
+ newmode = zp->z_mode;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+ (void *)&newmode, sizeof (uint64_t), tx);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+ /*
+ * Update the file size (zp_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((end_size = zp->z_size) < zfs_uio_offset(uio)) {
+ (void) atomic_cas_64(&zp->z_size, end_size,
+ zfs_uio_offset(uio));
+ ASSERT(error == 0);
+ }
+ /*
+ * If we are replaying and eof is non-zero then force
+ * the file size to the specified eof. Note, there's no
+ * concurrency during replay.
+ */
+ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+ zp->z_size = zfsvfs->z_replay_eof;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
+ NULL, NULL);
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+ ASSERT3S(tx_bytes, ==, nbytes);
+ n -= nbytes;
+
+ if (n > 0) {
+ if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ error = SET_ERROR(EFAULT);
+ break;
+ }
+ }
+ }
+
+ zfs_znode_update_vfs(zp);
+ zfs_rangelock_exit(lr);
+
+ /*
+ * If we're in replay mode, or we made no progress, or the
+ * uio data is inaccessible, return an error.  Otherwise, it's
+ * at least a partial write, so it's successful.
+ */
+ if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid ||
+ error == EFAULT) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (ioflag & (O_SYNC | O_DSYNC) ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, zp->z_id);
+
+ const int64_t nwritten = start_resid - zfs_uio_resid(uio);
+ dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
+ task_io_account_write(nwritten);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
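+
+/*
+ * Illustrative sketch (not part of this module): the block-size growth rule
+ * used in the write loop above.  When the file's block size already exceeds
+ * the dataset's recordsize it may only grow to the next power of two (and
+ * no further than the new end of file); otherwise it grows toward the
+ * recordsize.  A standalone model, with a local highbit64() standing in for
+ * the kernel's:
+ */
+#include <stdint.h>
+#include <stdio.h>
+
+#define MIN(a, b)   ((a) < (b) ? (a) : (b))
+
+static int
+highbit64(uint64_t i)   /* index of the highest set bit, 1-based */
+{
+    int h = 0;
+
+    while (i != 0) {
+        h++;
+        i >>= 1;
+    }
+    return (h);
+}
+
+static uint64_t
+new_blocksize(uint64_t blksz, uint64_t max_blksz, uint64_t end_size)
+{
+    if (blksz > max_blksz)  /* already beyond recordsize: next power of 2 */
+        return (MIN(end_size, 1ULL << highbit64(blksz)));
+    return (MIN(end_size, max_blksz));
+}
+
+int
+main(void)
+{
+    /* 192K block, 128K recordsize, file growing to 1M: grows to 256K. */
+    printf("%llu\n", (unsigned long long)new_blocksize(196608, 131072,
+        1048576));
+    /* Small file growing to 4K with 128K recordsize: grows to 4K. */
+    printf("%llu\n", (unsigned long long)new_blocksize(512, 131072, 4096));
+    return (0);
+}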
+
+/*ARGSUSED*/
+int
+zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ error = zfs_getacl(zp, vsecp, skipaclchk, cr);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ zilog_t *zilog = zfsvfs->z_log;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifdef ZFS_DEBUG
+static int zil_fault_io = 0;
+#endif
+
+static void zfs_get_done(zgd_t *zgd, int error);
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+ zfsvfs_t *zfsvfs = arg;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *zp;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error = 0;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ /*
+ * Nothing to do if the file has been removed
+ */
+ if (zfs_zget(zfsvfs, object, &zp) != 0)
+ return (SET_ERROR(ENOENT));
+ if (zp->z_unlinked) {
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ zfs_zrele_async(zp);
+ return (SET_ERROR(ENOENT));
+ }
+
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_lwb = lwb;
+ zgd->zgd_private = zp;
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ /* test for truncation needs to be done while range locked */
+ if (offset >= zp->z_size) {
+ error = SET_ERROR(ENOENT);
+ } else {
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ }
+ ASSERT(error == 0 || error == ENOENT);
+ } else { /* indirect write */
+ /*
+ * We have to lock the whole block to ensure that no one can
+ * change the data while it's being written out and its
+ * checksum is being calculated.  We need to re-check the
+ * blocksize after we get the lock, in case it has changed!
+ */
+ for (;;) {
+ uint64_t blkoff;
+ size = zp->z_blksz;
+ blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+ offset -= blkoff;
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ if (zp->z_blksz == size)
+ break;
+ offset += blkoff;
+ zfs_rangelock_exit(zgd->zgd_lr);
+ }
+ /* test for truncation needs to be done while range locked */
+ if (lr->lr_offset >= zp->z_size)
+ error = SET_ERROR(ENOENT);
+#ifdef ZFS_DEBUG
+ if (zil_fault_io) {
+ error = SET_ERROR(EIO);
+ zil_fault_io = 0;
+ }
+#endif
+ if (error == 0)
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zfs_get_done, zgd);
+ ASSERT(error || lr->lr_length <= size);
+
+ /*
+ * On success, we need to wait for the write I/O
+ * initiated by dmu_sync() to complete before we can
+ * release this dbuf. We will finish everything up
+ * in the zfs_get_done() callback.
+ */
+ if (error == 0)
+ return (0);
+
+ if (error == EALREADY) {
+ lr->lr_common.lrc_txtype = TX_WRITE2;
+ /*
+ * TX_WRITE2 relies on the data previously
+ * written by the TX_WRITE that caused
+ * EALREADY. We zero out the BP because
+ * it is the old, currently-on-disk BP.
+ */
+ zgd->zgd_bp = NULL;
+ BP_ZERO(bp);
+ error = 0;
+ }
+ }
+ }
+
+ zfs_get_done(zgd, error);
+
+ return (error);
+}
+
+
+/* ARGSUSED */
+static void
+zfs_get_done(zgd_t *zgd, int error)
+{
+ znode_t *zp = zgd->zgd_private;
+
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_rangelock_exit(zgd->zgd_lr);
+
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ zfs_zrele_async(zp);
+
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+EXPORT_SYMBOL(zfs_access);
+EXPORT_SYMBOL(zfs_fsync);
+EXPORT_SYMBOL(zfs_holey);
+EXPORT_SYMBOL(zfs_read);
+EXPORT_SYMBOL(zfs_write);
+EXPORT_SYMBOL(zfs_getsecattr);
+EXPORT_SYMBOL(zfs_setsecattr);
+
+ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW,
+ "Bytes to read per chunk");
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
new file mode 100644
index 000000000000..7b52f9249298
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -0,0 +1,3695 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2018 Datto Inc.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/arc.h>
+#include <sys/stat.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/metaslab.h>
+#include <sys/trace_zfs.h>
+#include <sys/abd.h>
+
+/*
+ * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
+ * calls that change the file system. Each itx has enough information to
+ * be able to replay them after a system crash, power loss, or
+ * equivalent failure mode. These are stored in memory until either:
+ *
+ * 1. they are committed to the pool by the DMU transaction group
+ * (txg), at which point they can be discarded; or
+ * 2. they are committed to the on-disk ZIL for the dataset being
+ * modified (e.g. due to an fsync, O_DSYNC, or other synchronous
+ * requirement).
+ *
+ * In the event of a crash or power loss, the itxs contained by each
+ * dataset's on-disk ZIL will be replayed when that dataset is first
+ * instantiated (e.g. if the dataset is a normal filesystem, when it is
+ * first mounted).
+ *
+ * As hinted at above, there is one ZIL per dataset (both the in-memory
+ * representation, and the on-disk representation). The on-disk format
+ * consists of 3 parts:
+ *
+ * - a single, per-dataset, ZIL header; which points to a chain of
+ * - zero or more ZIL blocks; each of which contains
+ * - zero or more ZIL records
+ *
+ * A ZIL record holds the information necessary to replay a single
+ * system call transaction. A ZIL block can hold many ZIL records, and
+ * the blocks are chained together, similarly to a singly linked list.
+ *
+ * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
+ * block in the chain, and the ZIL header points to the first block in
+ * the chain.
+ *
+ * Note, there is not a fixed place in the pool to hold these ZIL
+ * blocks; they are dynamically allocated and freed as needed from the
+ * blocks available on the pool, though they can be preferentially
+ * allocated from a dedicated "log" vdev.
+ */
+
+/*
+ * This controls the amount of time that a ZIL block (lwb) will remain
+ * "open" when it isn't "full", and it has a thread waiting for it to be
+ * committed to stable storage. Please refer to the zil_commit_waiter()
+ * function (and the comments within it) for more details.
+ */
+int zfs_commit_timeout_pct = 5;
+
+/*
+ * See zil.h for more information about these fields.
+ */
+zil_stats_t zil_stats = {
+ { "zil_commit_count", KSTAT_DATA_UINT64 },
+ { "zil_commit_writer_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_indirect_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_copied_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *zil_ksp;
+
+/*
+ * Disable intent logging replay. This global ZIL switch affects all pools.
+ */
+int zil_replay_disable = 0;
+
+/*
+ * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
+ * the disk(s) by the ZIL after an LWB write has completed. Setting this
+ * will cause ZIL corruption on power loss if a volatile out-of-order
+ * write cache is enabled.
+ */
+int zil_nocacheflush = 0;
+
+/*
+ * Limit SLOG write size per commit executed with synchronous priority.
+ * Any writes above that will be executed with lower (asynchronous) priority
+ * to limit potential SLOG device abuse by a single active ZIL writer.
+ */
+unsigned long zil_slog_bulk = 768 * 1024;
+
+static kmem_cache_t *zil_lwb_cache;
+static kmem_cache_t *zil_zcw_cache;
+
+#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
+ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
+
+static int
+zil_bp_compare(const void *x1, const void *x2)
+{
+ const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
+ const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
+
+ int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
+}
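+
+/*
+ * Illustrative sketch (not part of this module): zil_bp_compare() above is
+ * a standard two-key AVL comparator, ordering first by vdev and then by
+ * offset, and returning strictly -1, 0, or 1.  A standalone model with
+ * TREE_CMP written out directly:
+ */
+#include <stdint.h>
+#include <stdio.h>
+
+#define CMP(a, b)   (((a) > (b)) - ((a) < (b)))     /* models TREE_CMP */
+
+struct key {
+    uint64_t vdev;
+    uint64_t offset;
+};
+
+static int
+key_compare(const struct key *k1, const struct key *k2)
+{
+    int cmp = CMP(k1->vdev, k2->vdev);
+
+    if (cmp != 0)       /* the primary key decides */
+        return (cmp);
+    return (CMP(k1->offset, k2->offset));   /* tie-break on offset */
+}
+
+int
+main(void)
+{
+    struct key a = { 1, 4096 }, b = { 1, 8192 }, c = { 0, 8192 };
+
+    /* prints "-1 0 1" */
+    printf("%d %d %d\n", key_compare(&a, &b), key_compare(&a, &a),
+        key_compare(&a, &c));
+    return (0);
+}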
+
+static void
+zil_bp_tree_init(zilog_t *zilog)
+{
+ avl_create(&zilog->zl_bp_tree, zil_bp_compare,
+ sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
+}
+
+static void
+zil_bp_tree_fini(zilog_t *zilog)
+{
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ zil_bp_node_t *zn;
+ void *cookie = NULL;
+
+ while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(zn, sizeof (zil_bp_node_t));
+
+ avl_destroy(t);
+}
+
+int
+zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
+{
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ const dva_t *dva;
+ zil_bp_node_t *zn;
+ avl_index_t where;
+
+ if (BP_IS_EMBEDDED(bp))
+ return (0);
+
+ dva = BP_IDENTITY(bp);
+
+ if (avl_find(t, dva, &where) != NULL)
+ return (SET_ERROR(EEXIST));
+
+ zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
+ zn->zn_dva = *dva;
+ avl_insert(t, zn, where);
+
+ return (0);
+}
+
+static zil_header_t *
+zil_header_in_syncing_context(zilog_t *zilog)
+{
+ return ((zil_header_t *)zilog->zl_header);
+}
+
+static void
+zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
+{
+ zio_cksum_t *zc = &bp->blk_cksum;
+
+ zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
+ zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
+ zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
+ zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
+}
+
+/*
+ * Read a log block and make sure it's valid.
+ */
+static int
+zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
+ blkptr_t *nbp, void *dst, char **end)
+{
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ arc_buf_t *abuf = NULL;
+ zbookmark_phys_t zb;
+ int error;
+
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+
+ if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+ if (!decrypt)
+ zio_flags |= ZIO_FLAG_RAW;
+
+ SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
+ &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+
+ if (error == 0) {
+ zio_cksum_t cksum = bp->blk_cksum;
+
+ /*
+ * Validate the checksummed log block.
+ *
+ * Sequence numbers should be... sequential. The checksum
+ * verifier for the next block should be bp's checksum plus 1.
+ *
+ * Also check the log chain linkage and size used.
+ */
+ cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = abuf->b_data;
+ char *lr = (char *)(zilc + 1);
+ uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
+
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
+ error = SET_ERROR(ECKSUM);
+ } else {
+ ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
+ bcopy(lr, dst, len);
+ *end = (char *)dst + len;
+ *nbp = zilc->zc_next_blk;
+ }
+ } else {
+ char *lr = abuf->b_data;
+ uint64_t size = BP_GET_LSIZE(bp);
+ zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
+
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
+ (zilc->zc_nused > (size - sizeof (*zilc)))) {
+ error = SET_ERROR(ECKSUM);
+ } else {
+ ASSERT3U(zilc->zc_nused, <=,
+ SPA_OLD_MAXBLOCKSIZE);
+ bcopy(lr, dst, zilc->zc_nused);
+ *end = (char *)dst + zilc->zc_nused;
+ *nbp = zilc->zc_next_blk;
+ }
+ }
+
+ arc_buf_destroy(abuf, &abuf);
+ }
+
+ return (error);
+}
+
+/*
+ * Read a TX_WRITE log data block.
+ */
+static int
+zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
+{
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ const blkptr_t *bp = &lr->lr_blkptr;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ arc_buf_t *abuf = NULL;
+ zbookmark_phys_t zb;
+ int error;
+
+ if (BP_IS_HOLE(bp)) {
+ if (wbuf != NULL)
+ bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
+ return (0);
+ }
+
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+
+ /*
+	 * If we are not using the resulting data (we are just checking that
+	 * it hasn't been corrupted), we don't need to waste CPU time
+	 * decompressing and decrypting it.
+ */
+ if (wbuf == NULL)
+ zio_flags |= ZIO_FLAG_RAW;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+ error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+
+ if (error == 0) {
+ if (wbuf != NULL)
+ bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
+ arc_buf_destroy(abuf, &abuf);
+ }
+
+ return (error);
+}
+
+/*
+ * Parse the intent log, and call parse_func for each valid record within.
+ */
+int
+zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
+ boolean_t decrypt)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ boolean_t claimed = !!zh->zh_claim_txg;
+ uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
+ uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
+ uint64_t max_blk_seq = 0;
+ uint64_t max_lr_seq = 0;
+ uint64_t blk_count = 0;
+ uint64_t lr_count = 0;
+ blkptr_t blk, next_blk;
+ char *lrbuf, *lrp;
+ int error = 0;
+
+ bzero(&next_blk, sizeof (blkptr_t));
+
+ /*
+ * Old logs didn't record the maximum zh_claim_lr_seq.
+ */
+ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ claim_lr_seq = UINT64_MAX;
+
+ /*
+ * Starting at the block pointed to by zh_log we read the log chain.
+ * For each block in the chain we strongly check that block to
+ * ensure its validity. We stop when an invalid block is found.
+ * For each block pointer in the chain we call parse_blk_func().
+ * For each record in each valid block we call parse_lr_func().
+ * If the log has been claimed, stop if we encounter a sequence
+ * number greater than the highest claimed sequence number.
+ */
+ lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
+ zil_bp_tree_init(zilog);
+
+ for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
+ uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ int reclen;
+ char *end = NULL;
+
+ if (blk_seq > claim_blk_seq)
+ break;
+
+ error = parse_blk_func(zilog, &blk, arg, txg);
+ if (error != 0)
+ break;
+ ASSERT3U(max_blk_seq, <, blk_seq);
+ max_blk_seq = blk_seq;
+ blk_count++;
+
+ if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
+ break;
+
+ error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
+ lrbuf, &end);
+ if (error != 0)
+ break;
+
+ for (lrp = lrbuf; lrp < end; lrp += reclen) {
+ lr_t *lr = (lr_t *)lrp;
+ reclen = lr->lrc_reclen;
+ ASSERT3U(reclen, >=, sizeof (lr_t));
+ if (lr->lrc_seq > claim_lr_seq)
+ goto done;
+
+ error = parse_lr_func(zilog, lr, arg, txg);
+ if (error != 0)
+ goto done;
+ ASSERT3U(max_lr_seq, <, lr->lrc_seq);
+ max_lr_seq = lr->lrc_seq;
+ lr_count++;
+ }
+ }
+done:
+ zilog->zl_parse_error = error;
+ zilog->zl_parse_blk_seq = max_blk_seq;
+ zilog->zl_parse_lr_seq = max_lr_seq;
+ zilog->zl_parse_blk_count = blk_count;
+ zilog->zl_parse_lr_count = lr_count;
+
+ ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
+ (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) ||
+ (decrypt && error == EIO));
+
+ zil_bp_tree_fini(zilog);
+ zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
+ uint64_t first_txg)
+{
+ ASSERT(!BP_IS_HOLE(bp));
+
+ /*
+ * As we call this function from the context of a rewind to a
+ * checkpoint, each ZIL block whose txg is later than the txg
+ * that we rewind to is invalid. Thus, we return -1 so
+ * zil_parse() doesn't attempt to read it.
+ */
+ if (bp->blk_birth >= first_txg)
+ return (-1);
+
+ if (zil_bp_tree_add(zilog, bp) != 0)
+ return (0);
+
+ zio_free(zilog->zl_spa, first_txg, bp);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+ uint64_t first_txg)
+{
+ return (0);
+}
+
+static int
+zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
+ uint64_t first_txg)
+{
+ /*
+ * Claim log block if not already committed and not already claimed.
+ * If tx == NULL, just verify that the block is claimable.
+ */
+ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
+ zil_bp_tree_add(zilog, bp) != 0)
+ return (0);
+
+ return (zio_wait(zio_claim(NULL, zilog->zl_spa,
+ tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
+}
+
+static int
+zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+ uint64_t first_txg)
+{
+ lr_write_t *lr = (lr_write_t *)lrc;
+ int error;
+
+ if (lrc->lrc_txtype != TX_WRITE)
+ return (0);
+
+ /*
+ * If the block is not readable, don't claim it. This can happen
+ * in normal operation when a log block is written to disk before
+ * some of the dmu_sync() blocks it points to. In this case, the
+ * transaction cannot have been committed to anyone (we would have
+ * waited for all writes to be stable first), so it is semantically
+ * correct to declare this the end of the log.
+ */
+ if (lr->lr_blkptr.blk_birth >= first_txg) {
+ error = zil_read_log_data(zilog, lr, NULL);
+ if (error != 0)
+ return (error);
+ }
+
+ return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
+}
+
+/* ARGSUSED */
+static int
+zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
+ uint64_t claim_txg)
+{
+ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+ return (0);
+}
+
+static int
+zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+ uint64_t claim_txg)
+{
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ /*
+ * If we previously claimed it, we need to free it.
+ */
+ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
+ bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
+ !BP_IS_HOLE(bp))
+ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+ return (0);
+}
+
+static int
+zil_lwb_vdev_compare(const void *x1, const void *x2)
+{
+ const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
+ const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
+
+ return (TREE_CMP(v1, v2));
+}
+
+static lwb_t *
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
+ boolean_t fastwrite)
+{
+ lwb_t *lwb;
+
+ lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ lwb->lwb_zilog = zilog;
+ lwb->lwb_blk = *bp;
+ lwb->lwb_fastwrite = fastwrite;
+ lwb->lwb_slog = slog;
+ lwb->lwb_state = LWB_STATE_CLOSED;
+ lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
+ lwb->lwb_max_txg = txg;
+ lwb->lwb_write_zio = NULL;
+ lwb->lwb_root_zio = NULL;
+ lwb->lwb_tx = NULL;
+ lwb->lwb_issued_timestamp = 0;
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ lwb->lwb_nused = sizeof (zil_chain_t);
+ lwb->lwb_sz = BP_GET_LSIZE(bp);
+ } else {
+ lwb->lwb_nused = 0;
+ lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
+ }
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+
+ ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
+ ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ VERIFY(list_is_empty(&lwb->lwb_waiters));
+ VERIFY(list_is_empty(&lwb->lwb_itxs));
+
+ return (lwb);
+}
+
+static void
+zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
+{
+ ASSERT(MUTEX_HELD(&zilog->zl_lock));
+ ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
+ VERIFY(list_is_empty(&lwb->lwb_waiters));
+ VERIFY(list_is_empty(&lwb->lwb_itxs));
+ ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ ASSERT3P(lwb->lwb_write_zio, ==, NULL);
+ ASSERT3P(lwb->lwb_root_zio, ==, NULL);
+ ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
+ ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+
+ /*
+ * Clear the zilog's field to indicate this lwb is no longer
+ * valid, and prevent use-after-free errors.
+ */
+ if (zilog->zl_last_lwb_opened == lwb)
+ zilog->zl_last_lwb_opened = NULL;
+
+ kmem_cache_free(zil_lwb_cache, lwb);
+}
+
+/*
+ * Called when we create in-memory log transactions so that we know
+ * to clean up the itxs at the end of spa_sync().
+ */
+static void
+zilog_dirty(zilog_t *zilog, uint64_t txg)
+{
+ dsl_pool_t *dp = zilog->zl_dmu_pool;
+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+
+ ASSERT(spa_writeable(zilog->zl_spa));
+
+ if (ds->ds_is_snapshot)
+ panic("dirtying snapshot!");
+
+ if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(ds->ds_dbuf, zilog);
+
+ zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
+ }
+}
+
+/*
+ * Determine if the zil is dirty in the specified txg. Callers wanting to
+ * ensure that the dirty state does not change must hold the itxg_lock for
+ * the specified txg. Holding the lock will ensure that the zil cannot be
+ * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
+ * state.
+ */
+static boolean_t __maybe_unused
+zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
+{
+ dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+ if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/*
+ * Determine if the zil is dirty. The zil is considered dirty if it has
+ * any pending itx records that have not been cleaned by zil_clean().
+ */
+static boolean_t
+zilog_is_dirty(zilog_t *zilog)
+{
+ dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Create an on-disk intent log.
+ */
+static lwb_t *
+zil_create(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb = NULL;
+ uint64_t txg = 0;
+ dmu_tx_t *tx = NULL;
+ blkptr_t blk;
+ int error = 0;
+ boolean_t fastwrite = FALSE;
+ boolean_t slog = FALSE;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ ASSERT(zh->zh_claim_txg == 0);
+ ASSERT(zh->zh_replay_seq == 0);
+
+ blk = zh->zh_log;
+
+ /*
+ * Allocate an initial log block if:
+ * - there isn't one already
+ * - the existing block is the wrong endianness
+ */
+ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
+ tx = dmu_tx_create(zilog->zl_os);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ if (!BP_IS_HOLE(&blk)) {
+ zio_free(zilog->zl_spa, txg, &blk);
+ BP_ZERO(&blk);
+ }
+
+ error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
+ ZIL_MIN_BLKSZ, &slog);
+ fastwrite = TRUE;
+
+ if (error == 0)
+ zil_init_log_chain(zilog, &blk);
+ }
+
+ /*
+ * Allocate a log write block (lwb) for the first log block.
+ */
+ if (error == 0)
+ lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite);
+
+ /*
+ * If we just allocated the first log block, commit our transaction
+ * and wait for zil_sync() to stuff the block pointer into zh_log.
+ * (zh is part of the MOS, so we cannot modify it in open context.)
+ */
+ if (tx != NULL) {
+ dmu_tx_commit(tx);
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ }
+
+ ASSERT(error != 0 || bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+ IMPLY(error == 0, lwb != NULL);
+
+ return (lwb);
+}
+
+/*
+ * In one tx, free all log blocks and clear the log header. If keep_first
+ * is set, then we're replaying a log with no content. We want to keep the
+ * first block, however, so that the first synchronous transaction doesn't
+ * require a txg_wait_synced() in zil_create(). We don't need to
+ * txg_wait_synced() here either when keep_first is set, because both
+ * zil_create() and zil_destroy() will wait for any in-progress destroys
+ * to complete.
+ */
+void
+zil_destroy(zilog_t *zilog, boolean_t keep_first)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb;
+ dmu_tx_t *tx;
+ uint64_t txg;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ zilog->zl_old_header = *zh; /* debugging aid */
+
+ if (BP_IS_HOLE(&zh->zh_log))
+ return;
+
+ tx = dmu_tx_create(zilog->zl_os);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&zilog->zl_lock);
+
+ ASSERT3U(zilog->zl_destroy_txg, <, txg);
+ zilog->zl_destroy_txg = txg;
+ zilog->zl_keep_first = keep_first;
+
+ if (!list_is_empty(&zilog->zl_lwb_list)) {
+ ASSERT(zh->zh_claim_txg == 0);
+ VERIFY(!keep_first);
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+ if (lwb->lwb_fastwrite)
+ metaslab_fastwrite_unmark(zilog->zl_spa,
+ &lwb->lwb_blk);
+
+ list_remove(&zilog->zl_lwb_list, lwb);
+ if (lwb->lwb_buf != NULL)
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
+ zil_free_lwb(zilog, lwb);
+ }
+ } else if (!keep_first) {
+ zil_destroy_sync(zilog, tx);
+ }
+ mutex_exit(&zilog->zl_lock);
+
+ dmu_tx_commit(tx);
+}
+
+void
+zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+ (void) zil_parse(zilog, zil_free_log_block,
+ zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE);
+}
+
+int
+zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
+{
+ dmu_tx_t *tx = txarg;
+ zilog_t *zilog;
+ uint64_t first_txg;
+ zil_header_t *zh;
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_own_obj(dp, ds->ds_object,
+ DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os);
+ if (error != 0) {
+ /*
+ * EBUSY indicates that the objset is inconsistent, in which
+		 * case it cannot have a ZIL.
+ */
+ if (error != EBUSY) {
+ cmn_err(CE_WARN, "can't open objset for %llu, error %u",
+ (unsigned long long)ds->ds_object, error);
+ }
+
+ return (0);
+ }
+
+ zilog = dmu_objset_zil(os);
+ zh = zil_header_in_syncing_context(zilog);
+ ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
+ first_txg = spa_min_claim_txg(zilog->zl_spa);
+
+ /*
+ * If the spa_log_state is not set to be cleared, check whether
+ * the current uberblock is a checkpoint one and if the current
+ * header has been claimed before moving on.
+ *
+ * If the current uberblock is a checkpointed uberblock then
+ * one of the following scenarios took place:
+ *
+ * 1] We are currently rewinding to the checkpoint of the pool.
+ * 2] We crashed in the middle of a checkpoint rewind but we
+ * did manage to write the checkpointed uberblock to the
+ * vdev labels, so when we tried to import the pool again
+ * the checkpointed uberblock was selected from the import
+ * procedure.
+ *
+ * In both cases we want to zero out all the ZIL blocks, except
+ * the ones that have been claimed at the time of the checkpoint
+ * (their zh_claim_txg != 0). The reason is that these blocks
+ * may be corrupted since we may have reused their locations on
+ * disk after we took the checkpoint.
+ *
+ * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
+ * when we first figure out whether the current uberblock is
+ * checkpointed or not. Unfortunately, that would discard all
+ * the logs, including the ones that are claimed, and we would
+ * leak space.
+ */
+ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
+ (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)) {
+ if (!BP_IS_HOLE(&zh->zh_log)) {
+ (void) zil_parse(zilog, zil_clear_log_block,
+ zil_noop_log_record, tx, first_txg, B_FALSE);
+ }
+ BP_ZERO(&zh->zh_log);
+ if (os->os_encrypted)
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
+ dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ dmu_objset_disown(os, B_FALSE, FTAG);
+ return (0);
+ }
+
+ /*
+ * If we are not rewinding and opening the pool normally, then
+ * the min_claim_txg should be equal to the first txg of the pool.
+ */
+ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
+
+ /*
+ * Claim all log blocks if we haven't already done so, and remember
+ * the highest claimed sequence number. This ensures that if we can
+ * read only part of the log now (e.g. due to a missing device),
+ * but we can read the entire log later, we will not try to replay
+ * or destroy beyond the last block we successfully claimed.
+ */
+ ASSERT3U(zh->zh_claim_txg, <=, first_txg);
+ if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
+ (void) zil_parse(zilog, zil_claim_log_block,
+ zil_claim_log_record, tx, first_txg, B_FALSE);
+ zh->zh_claim_txg = first_txg;
+ zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
+ zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
+ if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
+ zh->zh_flags |= ZIL_REPLAY_NEEDED;
+ zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
+ if (os->os_encrypted)
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
+ dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ }
+
+ ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
+ dmu_objset_disown(os, B_FALSE, FTAG);
+ return (0);
+}
+
+/*
+ * Check the log by walking the log chain.
+ * Checksum errors are ok as they indicate the end of the chain.
+ * Any other error (no device or read failure) returns an error.
+ */
+/* ARGSUSED */
+int
+zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
+{
+ zilog_t *zilog;
+ objset_t *os;
+ blkptr_t *bp;
+ int error;
+
+ ASSERT(tx == NULL);
+
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0) {
+ cmn_err(CE_WARN, "can't open objset %llu, error %d",
+ (unsigned long long)ds->ds_object, error);
+ return (0);
+ }
+
+ zilog = dmu_objset_zil(os);
+ bp = (blkptr_t *)&zilog->zl_header->zh_log;
+
+ if (!BP_IS_HOLE(bp)) {
+ vdev_t *vd;
+ boolean_t valid = B_TRUE;
+
+ /*
+ * Check the first block and determine if it's on a log device
+ * which may have been removed or faulted prior to loading this
+ * pool. If so, there's no point in checking the rest of the
+ * log as its content should have already been synced to the
+ * pool.
+ */
+ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
+ vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+ if (vd->vdev_islog && vdev_is_dead(vd))
+ valid = vdev_log_state_valid(vd);
+ spa_config_exit(os->os_spa, SCL_STATE, FTAG);
+
+ if (!valid)
+ return (0);
+
+ /*
+ * Check whether the current uberblock is checkpointed (e.g.
+ * we are rewinding) and whether the current header has been
+ * claimed or not. If it hasn't then skip verifying it. We
+ * do this because its ZIL blocks may be part of the pool's
+ * state before the rewind, which is no longer valid.
+ */
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)
+ return (0);
+ }
+
+ /*
+ * Because tx == NULL, zil_claim_log_block() will not actually claim
+ * any blocks, but just determine whether it is possible to do so.
+ * In addition to checking the log chain, zil_claim_log_block()
+ * will invoke zio_claim() with a done func of spa_claim_notify(),
+ * which will update spa_max_claim_txg. See spa_load() for details.
+ */
+ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
+ zilog->zl_header->zh_claim_txg ? -1ULL :
+ spa_min_claim_txg(os->os_spa), B_FALSE);
+
+ return ((error == ECKSUM || error == ENOENT) ? 0 : error);
+}
+
+/*
+ * When an itx is "skipped", this function is used to properly mark the
+ * waiter as "done", and signal any thread(s) waiting on it. An itx can
+ * be skipped (and not committed to an lwb) for a variety of reasons,
+ * one of them being that the itx was committed via spa_sync(), prior to
+ * it being committed to an lwb; this can happen if a thread calling
+ * zil_commit() is racing with spa_sync().
+ */
+static void
+zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
+{
+ mutex_enter(&zcw->zcw_lock);
+ ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+ zcw->zcw_done = B_TRUE;
+ cv_broadcast(&zcw->zcw_cv);
+ mutex_exit(&zcw->zcw_lock);
+}
+
+/*
+ * This function is used when the given waiter is to be linked into an
+ * lwb's "lwb_waiters" list; i.e. when the itx is committed to the lwb.
+ * At this point, the waiter will no longer be referenced by the itx,
+ * and instead, will be referenced by the lwb.
+ */
+static void
+zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
+{
+ /*
+ * The lwb_waiters field of the lwb is protected by the zilog's
+ * zl_lock, thus it must be held when calling this function.
+ */
+ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
+
+ mutex_enter(&zcw->zcw_lock);
+ ASSERT(!list_link_active(&zcw->zcw_node));
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
+ lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE);
+
+ list_insert_tail(&lwb->lwb_waiters, zcw);
+ zcw->zcw_lwb = lwb;
+ mutex_exit(&zcw->zcw_lock);
+}
+
+/*
+ * This function is used when zio_alloc_zil() fails to allocate a ZIL
+ * block, and the given waiter must be linked to the "nolwb waiters"
+ * list inside of zil_process_commit_list().
+ */
+static void
+zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
+{
+ mutex_enter(&zcw->zcw_lock);
+ ASSERT(!list_link_active(&zcw->zcw_node));
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ list_insert_tail(nolwb, zcw);
+ mutex_exit(&zcw->zcw_lock);
+}
+
+void
+zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
+{
+ avl_tree_t *t = &lwb->lwb_vdev_tree;
+ avl_index_t where;
+ zil_vdev_node_t *zv, zvsearch;
+ int ndvas = BP_GET_NDVAS(bp);
+ int i;
+
+ if (zil_nocacheflush)
+ return;
+
+ mutex_enter(&lwb->lwb_vdev_lock);
+ for (i = 0; i < ndvas; i++) {
+ zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
+ if (avl_find(t, &zvsearch, &where) == NULL) {
+ zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
+ zv->zv_vdev = zvsearch.zv_vdev;
+ avl_insert(t, zv, where);
+ }
+ }
+ mutex_exit(&lwb->lwb_vdev_lock);
+}
+
+static void
+zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
+{
+ avl_tree_t *src = &lwb->lwb_vdev_tree;
+ avl_tree_t *dst = &nlwb->lwb_vdev_tree;
+ void *cookie = NULL;
+ zil_vdev_node_t *zv;
+
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+ ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+ ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
+ /*
+ * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
+ * not need the protection of lwb_vdev_lock (it will only be modified
+ * while holding zilog->zl_lock) as its writes and those of its
+ * children have all completed. The younger 'nlwb' may be waiting on
+ * future writes to additional vdevs.
+ */
+ mutex_enter(&nlwb->lwb_vdev_lock);
+ /*
+ * Tear down the 'lwb' vdev tree, ensuring that entries which do not
+ * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
+ */
+ while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
+ avl_index_t where;
+
+ if (avl_find(dst, zv, &where) == NULL) {
+ avl_insert(dst, zv, where);
+ } else {
+ kmem_free(zv, sizeof (*zv));
+ }
+ }
+ mutex_exit(&nlwb->lwb_vdev_lock);
+}
+
+void
+zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
+{
+ lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
+}
+
+/*
+ * This function is called after all vdevs associated with a given lwb
+ * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
+ * as the lwb write completes, if "zil_nocacheflush" is set. Further,
+ * all "previous" lwb's will have completed before this function is
+ * called; i.e. this function is called for all previous lwbs before
+ * it's called for "this" lwb (enforced via the zio dependencies
+ * configured in zil_lwb_set_zio_dependency()).
+ *
+ * The intention is for this function to be called as soon as the
+ * contents of an lwb are considered "stable" on disk, and will survive
+ * any sudden loss of power. At this point, any threads waiting for the
+ * lwb to reach this state are signalled, and the "waiter" structures
+ * are marked "done".
+ */
+static void
+zil_lwb_flush_vdevs_done(zio_t *zio)
+{
+ lwb_t *lwb = zio->io_private;
+ zilog_t *zilog = lwb->lwb_zilog;
+ dmu_tx_t *tx = lwb->lwb_tx;
+ zil_commit_waiter_t *zcw;
+ itx_t *itx;
+
+ spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
+
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+
+ mutex_enter(&zilog->zl_lock);
+
+ /*
+ * Ensure the lwb buffer pointer is cleared before releasing the
+ * txg. If we have had an allocation failure and the txg is
+ * waiting to sync then we want zil_sync() to remove the lwb so
+ * that it's not picked up as the next new one in
+ * zil_process_commit_list(). zil_sync() will only remove the
+ * lwb if lwb_buf is null.
+ */
+ lwb->lwb_buf = NULL;
+ lwb->lwb_tx = NULL;
+
+ ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
+ zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
+
+ lwb->lwb_root_zio = NULL;
+
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+ lwb->lwb_state = LWB_STATE_FLUSH_DONE;
+
+ if (zilog->zl_last_lwb_opened == lwb) {
+ /*
+ * Remember the highest committed log sequence number
+ * for ztest. We only update this value when all the log
+ * writes succeeded, because ztest wants to ASSERT that
+ * it got the whole log chain.
+ */
+ zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
+ }
+
+ while ((itx = list_head(&lwb->lwb_itxs)) != NULL) {
+ list_remove(&lwb->lwb_itxs, itx);
+ zil_itx_destroy(itx);
+ }
+
+ while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
+ mutex_enter(&zcw->zcw_lock);
+
+ ASSERT(list_link_active(&zcw->zcw_node));
+ list_remove(&lwb->lwb_waiters, zcw);
+
+ ASSERT3P(zcw->zcw_lwb, ==, lwb);
+ zcw->zcw_lwb = NULL;
+
+ zcw->zcw_zio_error = zio->io_error;
+
+ ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+ zcw->zcw_done = B_TRUE;
+ cv_broadcast(&zcw->zcw_cv);
+
+ mutex_exit(&zcw->zcw_lock);
+ }
+
+ mutex_exit(&zilog->zl_lock);
+
+ /*
+ * Now that we've written this log block, we have a stable pointer
+ * to the next block in the chain, so it's OK to let the txg in
+ * which we allocated the next block sync.
+ */
+ dmu_tx_commit(tx);
+}
+
+/*
+ * This is called when an lwb's write zio completes. The callback's
+ * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
+ * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
+ * in writing out this specific lwb's data, and in the case that cache
+ * flushes have been deferred, vdevs involved in writing the data for
+ * previous lwbs. The writes corresponding to all the vdevs in the
+ * lwb_vdev_tree will have completed by the time this is called, due to
+ * the zio dependencies configured in zil_lwb_set_zio_dependency(),
+ * which takes deferred flushes into account. The lwb will be "done"
+ * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
+ * completion callback for the lwb's root zio.
+ */
+static void
+zil_lwb_write_done(zio_t *zio)
+{
+ lwb_t *lwb = zio->io_private;
+ spa_t *spa = zio->io_spa;
+ zilog_t *zilog = lwb->lwb_zilog;
+ avl_tree_t *t = &lwb->lwb_vdev_tree;
+ void *cookie = NULL;
+ zil_vdev_node_t *zv;
+ lwb_t *nlwb;
+
+ ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
+
+ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+ ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
+ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+ ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
+ ASSERT(!BP_IS_GANG(zio->io_bp));
+ ASSERT(!BP_IS_HOLE(zio->io_bp));
+ ASSERT(BP_GET_FILL(zio->io_bp) == 0);
+
+ abd_free(zio->io_abd);
+
+ mutex_enter(&zilog->zl_lock);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
+ lwb->lwb_state = LWB_STATE_WRITE_DONE;
+ lwb->lwb_write_zio = NULL;
+ lwb->lwb_fastwrite = FALSE;
+ nlwb = list_next(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+
+ if (avl_numnodes(t) == 0)
+ return;
+
+ /*
+ * If there was an IO error, we're not going to call zio_flush()
+ * on these vdevs, so we simply empty the tree and free the
+ * nodes. We avoid calling zio_flush() since there isn't any
+ * good reason for doing so, after the lwb block failed to be
+ * written out.
+ */
+ if (zio->io_error != 0) {
+ while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(zv, sizeof (*zv));
+ return;
+ }
+
+ /*
+ * If this lwb does not have any threads waiting for it to
+ * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
+ * command to the vdevs written to by "this" lwb, and instead
+ * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
+ * command for those vdevs. Thus, we merge the vdev tree of
+ * "this" lwb with the vdev tree of the "next" lwb in the list,
+ * and assume the "next" lwb will handle flushing the vdevs (or
+	 * deferring the flush(es) again).
+ *
+ * This is a useful performance optimization, especially for
+ * workloads with lots of async write activity and few sync
+ * write and/or fsync activity, as it has the potential to
+ * coalesce multiple flush commands to a vdev into one.
+ */
+ if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
+ zil_lwb_flush_defer(lwb, nlwb);
+ ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ return;
+ }
+
+ while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
+ vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
+ if (vd != NULL)
+ zio_flush(lwb->lwb_root_zio, vd);
+ kmem_free(zv, sizeof (*zv));
+ }
+}
+
+static void
+zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
+{
+ lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT(MUTEX_HELD(&zilog->zl_lock));
+
+ /*
+ * The zilog's "zl_last_lwb_opened" field is used to build the
+ * lwb/zio dependency chain, which is used to preserve the
+ * ordering of lwb completions that is required by the semantics
+ * of the ZIL. Each new lwb zio becomes a parent of the
+ * "previous" lwb zio, such that the new lwb's zio cannot
+ * complete until the "previous" lwb's zio completes.
+ *
+ * This is required by the semantics of zil_commit(); the commit
+ * waiters attached to the lwbs will be woken in the lwb zio's
+ * completion callback, so this zio dependency graph ensures the
+ * waiters are woken in the correct order (the same order the
+ * lwbs were created).
+ */
+ if (last_lwb_opened != NULL &&
+ last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
+ ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+ last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
+ last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
+
+ ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
+ zio_add_child(lwb->lwb_root_zio,
+ last_lwb_opened->lwb_root_zio);
+
+ /*
+ * If the previous lwb's write hasn't already completed,
+ * we also want to order the completion of the lwb write
+ * zios (above, we only order the completion of the lwb
+ * root zios). This is required because of how we can
+ * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
+ *
+ * When the DKIOCFLUSHWRITECACHE commands are deferred,
+ * the previous lwb will rely on this lwb to flush the
+ * vdevs written to by that previous lwb. Thus, we need
+ * to ensure this lwb doesn't issue the flush until
+ * after the previous lwb's write completes. We ensure
+ * this ordering by setting the zio parent/child
+ * relationship here.
+ *
+ * Without this relationship on the lwb's write zio,
+ * it's possible for this lwb's write to complete prior
+ * to the previous lwb's write completing; and thus, the
+ * vdevs for the previous lwb would be flushed prior to
+ * that lwb's data being written to those vdevs (the
+ * vdevs are flushed in the lwb write zio's completion
+ * handler, zil_lwb_write_done()).
+ */
+ if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) {
+ ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+ last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
+
+ ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL);
+ zio_add_child(lwb->lwb_write_zio,
+ last_lwb_opened->lwb_write_zio);
+ }
+ }
+}
+
+
+/*
+ * This function's purpose is to "open" an lwb such that it is ready to
+ * accept new itxs being committed to it. To do this, the lwb's zio
+ * structures are created, and linked to the lwb. This function is
+ * idempotent; if the passed in lwb has already been opened, this
+ * function is essentially a no-op.
+ */
+static void
+zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
+{
+ zbookmark_phys_t zb;
+ zio_priority_t prio;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT3P(lwb, !=, NULL);
+ EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
+ EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
+
+ SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
+ mutex_enter(&zilog->zl_lock);
+ if (lwb->lwb_root_zio == NULL) {
+ abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
+ BP_GET_LSIZE(&lwb->lwb_blk));
+
+ if (!lwb->lwb_fastwrite) {
+ metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
+ lwb->lwb_fastwrite = 1;
+ }
+
+ if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
+ prio = ZIO_PRIORITY_SYNC_WRITE;
+ else
+ prio = ZIO_PRIORITY_ASYNC_WRITE;
+
+ lwb->lwb_root_zio = zio_root(zilog->zl_spa,
+ zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
+ ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+
+ lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
+ zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
+ BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
+ prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_FASTWRITE, &zb);
+ ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+
+ lwb->lwb_state = LWB_STATE_OPENED;
+
+ zil_lwb_set_zio_dependency(zilog, lwb);
+ zilog->zl_last_lwb_opened = lwb;
+ }
+ mutex_exit(&zilog->zl_lock);
+
+ ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+ ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+}
+
+/*
+ * Define a limited set of intent log block sizes.
+ *
+ * These must be a multiple of 4KB. Note that only the amount used (again
+ * aligned to 4KB) actually gets written. However, we can't always just
+ * allocate SPA_OLD_MAXBLOCKSIZE, as the slog space could be exhausted.
+ */
+struct {
+ uint64_t limit;
+ uint64_t blksz;
+} zil_block_buckets[] = {
+ { 4096, 4096 }, /* non TX_WRITE */
+ { 8192 + 4096, 8192 + 4096 }, /* database */
+ { 32768 + 4096, 32768 + 4096 }, /* NFS writes */
+ { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */
+ { 131072, 131072 }, /* < 128KB writes */
+	{ 131072 + 4096, 65536 + 4096 },	/* 128KB writes */
+	{ UINT64_MAX, SPA_OLD_MAXBLOCKSIZE },	/* > 128KB writes */
+};
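+
+/*
+ * Illustrative example (not an exhaustive description): if zl_cur_used is
+ * roughly 20KB, zil_lwb_write_issue() computes zil_blksz as that amount
+ * plus sizeof (zil_chain_t) and walks the table until the limit is no
+ * longer exceeded, landing on the 32768 + 4096 bucket; the next log block
+ * is then allocated at 36KB, subject to zl_max_block_size and the recent
+ * sizes remembered in zl_prev_blks.
+ */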
+
+/*
+ * Maximum block size used by the ZIL. This is picked up when the ZIL is
+ * initialized. Otherwise this should not be used directly; see
+ * zl_max_block_size instead.
+ */
+int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
+
+/*
+ * Start a log block write and advance to the next log block.
+ * Calls are serialized.
+ */
+static lwb_t *
+zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
+{
+ lwb_t *nlwb = NULL;
+ zil_chain_t *zilc;
+ spa_t *spa = zilog->zl_spa;
+ blkptr_t *bp;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ uint64_t zil_blksz, wsz;
+ int i, error;
+ boolean_t slog;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+ ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ zilc = (zil_chain_t *)lwb->lwb_buf;
+ bp = &zilc->zc_next_blk;
+ } else {
+ zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
+ bp = &zilc->zc_next_blk;
+ }
+
+ ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
+
+ /*
+ * Allocate the next block and save its address in this block
+ * before writing it in order to establish the log chain.
+ * Note that if the allocation of nlwb synced before we wrote
+ * the block that points at it (lwb), we'd leak it if we crashed.
+ * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
+ * We dirty the dataset to ensure that zil_sync() will be called
+ * to clean up in the event of allocation failure or I/O failure.
+ */
+
+ tx = dmu_tx_create(zilog->zl_os);
+
+ /*
+ * Since we are not going to create any new dirty data, and we
+ * can even help with clearing the existing dirty data, we
+ * should not be subject to the dirty data based delays. We
+ * use TXG_NOTHROTTLE to bypass the delay mechanism.
+ */
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ lwb->lwb_tx = tx;
+
+ /*
+ * Log blocks are pre-allocated. Here we select the size of the next
+ * block, based on size used in the last block.
+ * - first find the smallest bucket that will fit the block from a
+ * limited set of block sizes. This is because it's faster to write
+ * blocks allocated from the same metaslab as they are adjacent or
+ * close.
+ * - next find the maximum from the new suggested size and an array of
+ * previous sizes. This lessens a picket fence effect of wrongly
+ * guessing the size if we have a stream of say 2k, 64k, 2k, 64k
+ * requests.
+ *
+ * Note we only write what is used, but we can't just allocate
+ * the maximum block size because we can exhaust the available
+ * pool log space.
+ */
+ zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
+ for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++)
+ continue;
+ zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size);
+ zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
+ for (i = 0; i < ZIL_PREV_BLKS; i++)
+ zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
+ zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
+
+ BP_ZERO(bp);
+ error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog);
+ if (slog) {
+ ZIL_STAT_BUMP(zil_itx_metaslab_slog_count);
+ ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused);
+ } else {
+ ZIL_STAT_BUMP(zil_itx_metaslab_normal_count);
+ ZIL_STAT_INCR(zil_itx_metaslab_normal_bytes, lwb->lwb_nused);
+ }
+ if (error == 0) {
+ ASSERT3U(bp->blk_birth, ==, txg);
+ bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ /*
+ * Allocate a new log write block (lwb).
+ */
+ nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
+ }
+
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ /* For Slim ZIL only write what is used. */
+ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
+ ASSERT3U(wsz, <=, lwb->lwb_sz);
+ zio_shrink(lwb->lwb_write_zio, wsz);
+
+ } else {
+ wsz = lwb->lwb_sz;
+ }
+
+ zilc->zc_pad = 0;
+ zilc->zc_nused = lwb->lwb_nused;
+ zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
+
+ /*
+ * clear unused data for security
+ */
+ bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
+
+ spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
+
+ zil_lwb_add_block(lwb, &lwb->lwb_blk);
+ lwb->lwb_issued_timestamp = gethrtime();
+ lwb->lwb_state = LWB_STATE_ISSUED;
+
+ zio_nowait(lwb->lwb_root_zio);
+ zio_nowait(lwb->lwb_write_zio);
+
+ /*
+ * If there was an allocation failure then nlwb will be null which
+ * forces a txg_wait_synced().
+ */
+ return (nlwb);
+}
+
+/*
+ * Maximum amount of write data that can be put into single log block.
+ */
+uint64_t
+zil_max_log_data(zilog_t *zilog)
+{
+ return (zilog->zl_max_block_size -
+ sizeof (zil_chain_t) - sizeof (lr_write_t));
+}
+
+/*
+ * Maximum amount of log space we agree to waste in order to reduce the
+ * number of WR_NEED_COPY chunks and thereby the zl_get_data() overhead (~12%).
+ */
+static inline uint64_t
+zil_max_waste_space(zilog_t *zilog)
+{
+ return (zil_max_log_data(zilog) / 8);
+}
+
+/*
+ * Maximum amount of write data for WR_COPIED. For correctness, consumers
+ * must fall back to WR_NEED_COPY if we can't fit the entire record into one
+ * maximum sized log block, because each WR_COPIED record must fit in a
+ * single log block. For space efficiency, we want to fit two records into a
+ * max-sized log block.
+ */
+uint64_t
+zil_max_copied_data(zilog_t *zilog)
+{
+ return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 -
+ sizeof (lr_write_t));
+}
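+
+/*
+ * A rough sketch of how these limits relate (symbolic, not exact byte
+ * counts): with maximum block size B, zil_max_log_data() is
+ * B - sizeof (zil_chain_t) - sizeof (lr_write_t), the largest write payload
+ * a single log block can hold; zil_max_waste_space() is one eighth of that;
+ * and zil_max_copied_data() is sized so that two complete WR_COPIED records
+ * (header plus payload) fit in one maximum-sized block.
+ */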
+
+static lwb_t *
+zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
+{
+ lr_t *lrcb, *lrc;
+ lr_write_t *lrwb, *lrw;
+ char *lr_buf;
+ uint64_t dlen, dnow, lwb_sp, reclen, txg, max_log_data;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(lwb->lwb_buf, !=, NULL);
+
+ zil_lwb_write_open(zilog, lwb);
+
+ lrc = &itx->itx_lr;
+ lrw = (lr_write_t *)lrc;
+
+ /*
+ * A commit itx doesn't represent any on-disk state; instead
+	 * it's simply used as a placeholder on the commit list, and
+ * provides a mechanism for attaching a "commit waiter" onto the
+ * correct lwb (such that the waiter can be signalled upon
+ * completion of that lwb). Thus, we don't process this itx's
+ * log record if it's a commit itx (these itx's don't have log
+ * records), and instead link the itx's waiter onto the lwb's
+ * list of waiters.
+ *
+ * For more details, see the comment above zil_commit().
+ */
+ if (lrc->lrc_txtype == TX_COMMIT) {
+ mutex_enter(&zilog->zl_lock);
+ zil_commit_waiter_link_lwb(itx->itx_private, lwb);
+ itx->itx_private = NULL;
+ mutex_exit(&zilog->zl_lock);
+ return (lwb);
+ }
+
+ if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
+ dlen = P2ROUNDUP_TYPED(
+ lrw->lr_length, sizeof (uint64_t), uint64_t);
+ } else {
+ dlen = 0;
+ }
+ reclen = lrc->lrc_reclen;
+ zilog->zl_cur_used += (reclen + dlen);
+ txg = lrc->lrc_txg;
+
+ ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen));
+
+cont:
+ /*
+ * If this record won't fit in the current log block, start a new one.
+ * For WR_NEED_COPY optimize layout for minimal number of chunks.
+ */
+ lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+ max_log_data = zil_max_log_data(zilog);
+ if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
+ lwb_sp < zil_max_waste_space(zilog) &&
+ (dlen % max_log_data == 0 ||
+ lwb_sp < reclen + dlen % max_log_data))) {
+ lwb = zil_lwb_write_issue(zilog, lwb);
+ if (lwb == NULL)
+ return (NULL);
+ zil_lwb_write_open(zilog, lwb);
+ ASSERT(LWB_EMPTY(lwb));
+ lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+
+ /*
+ * There must be enough space in the new, empty log block to
+ * hold reclen. For WR_COPIED, we need to fit the whole
+ * record in one block, and reclen is the header size + the
+ * data size. For WR_NEED_COPY, we can create multiple
+ * records, splitting the data into multiple blocks, so we
+ * only need to fit one word of data per block; in this case
+ * reclen is just the header size (no data).
+ */
+ ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
+ }
+
+ dnow = MIN(dlen, lwb_sp - reclen);
+ lr_buf = lwb->lwb_buf + lwb->lwb_nused;
+ bcopy(lrc, lr_buf, reclen);
+ lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */
+ lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */
+
+ ZIL_STAT_BUMP(zil_itx_count);
+
+ /*
+ * If it's a write, fetch the data or get its blkptr as appropriate.
+ */
+ if (lrc->lrc_txtype == TX_WRITE) {
+ if (txg > spa_freeze_txg(zilog->zl_spa))
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ if (itx->itx_wr_state == WR_COPIED) {
+ ZIL_STAT_BUMP(zil_itx_copied_count);
+ ZIL_STAT_INCR(zil_itx_copied_bytes, lrw->lr_length);
+ } else {
+ char *dbuf;
+ int error;
+
+ if (itx->itx_wr_state == WR_NEED_COPY) {
+ dbuf = lr_buf + reclen;
+ lrcb->lrc_reclen += dnow;
+ if (lrwb->lr_length > dnow)
+ lrwb->lr_length = dnow;
+ lrw->lr_offset += dnow;
+ lrw->lr_length -= dnow;
+ ZIL_STAT_BUMP(zil_itx_needcopy_count);
+ ZIL_STAT_INCR(zil_itx_needcopy_bytes, dnow);
+ } else {
+ ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
+ dbuf = NULL;
+ ZIL_STAT_BUMP(zil_itx_indirect_count);
+ ZIL_STAT_INCR(zil_itx_indirect_bytes,
+ lrw->lr_length);
+ }
+
+ /*
+ * We pass in the "lwb_write_zio" rather than
+ * "lwb_root_zio" so that the "lwb_write_zio"
+ * becomes the parent of any zio's created by
+ * the "zl_get_data" callback. The vdevs are
+ * flushed after the "lwb_write_zio" completes,
+ * so we want to make sure that completion
+ * callback waits for these additional zio's,
+ * such that the vdevs used by those zio's will
+ * be included in the lwb's vdev tree, and those
+ * vdevs will be properly flushed. If we passed
+ * in "lwb_root_zio" here, then these additional
+ * vdevs may not be flushed; e.g. if these zio's
+ * completed after "lwb_write_zio" completed.
+ */
+ error = zilog->zl_get_data(itx->itx_private,
+ lrwb, dbuf, lwb, lwb->lwb_write_zio);
+
+ if (error == EIO) {
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ return (lwb);
+ }
+ if (error != 0) {
+ ASSERT(error == ENOENT || error == EEXIST ||
+ error == EALREADY);
+ return (lwb);
+ }
+ }
+ }
+
+ /*
+ * We're actually making an entry, so update lrc_seq to be the
+ * log record sequence number. Note that this is generally not
+ * equal to the itx sequence number because not all transactions
+ * are synchronous, and sometimes spa_sync() gets there first.
+ */
+ lrcb->lrc_seq = ++zilog->zl_lr_seq;
+ lwb->lwb_nused += reclen + dnow;
+
+ zil_lwb_add_txg(lwb, txg);
+
+ ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
+ ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
+
+ dlen -= dnow;
+ if (dlen > 0) {
+ zilog->zl_cur_used += reclen;
+ goto cont;
+ }
+
+ return (lwb);
+}
+
+itx_t *
+zil_itx_create(uint64_t txtype, size_t lrsize)
+{
+ size_t itxsize;
+ itx_t *itx;
+
+ lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
+ itxsize = offsetof(itx_t, itx_lr) + lrsize;
+
+ itx = zio_data_buf_alloc(itxsize);
+ itx->itx_lr.lrc_txtype = txtype;
+ itx->itx_lr.lrc_reclen = lrsize;
+ itx->itx_lr.lrc_seq = 0; /* defensive */
+ itx->itx_sync = B_TRUE; /* default is synchronous */
+ itx->itx_callback = NULL;
+ itx->itx_callback_data = NULL;
+ itx->itx_size = itxsize;
+
+ return (itx);
+}
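+
+/*
+ * A minimal usage sketch (the caller shown is illustrative, modeled on the
+ * ZPL logging code rather than defined in this file): a consumer allocates
+ * an itx sized for its log record, fills in the type-specific fields that
+ * follow the common lr_t header, and assigns it to the zilog under the
+ * same transaction:
+ *
+ *	itx = zil_itx_create(txtype, sizeof (lr_create_t) + namesize);
+ *	lr = (lr_create_t *)&itx->itx_lr;
+ *	... fill in lr and the name that follows it ...
+ *	zil_itx_assign(zilog, itx, tx);
+ */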
+
+void
+zil_itx_destroy(itx_t *itx)
+{
+ IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL);
+ IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
+
+ if (itx->itx_callback != NULL)
+ itx->itx_callback(itx->itx_callback_data);
+
+ zio_data_buf_free(itx, itx->itx_size);
+}
+
+/*
+ * Free up the sync and async itxs. The itxs_t has already been detached
+ * so no locks are needed.
+ */
+static void
+zil_itxg_clean(itxs_t *itxs)
+{
+ itx_t *itx;
+ list_t *list;
+ avl_tree_t *t;
+ void *cookie;
+ itx_async_node_t *ian;
+
+ list = &itxs->i_sync_list;
+ while ((itx = list_head(list)) != NULL) {
+ /*
+ * In the general case, commit itxs will not be found
+ * here, as they'll be committed to an lwb via
+ * zil_lwb_commit(), and free'd in that function. Having
+ * said that, it is still possible for commit itxs to be
+ * found here, due to the following race:
+ *
+ * - a thread calls zil_commit() which assigns the
+ * commit itx to a per-txg i_sync_list
+ * - zil_itxg_clean() is called (e.g. via spa_sync())
+ * while the waiter is still on the i_sync_list
+ *
+ * There's nothing to prevent syncing the txg while the
+ * waiter is on the i_sync_list. This normally doesn't
+ * happen because spa_sync() is slower than zil_commit(),
+ * but if zil_commit() calls txg_wait_synced() (e.g.
+ * because zil_create() or zil_commit_writer_stall() is
+ * called) we will hit this case.
+ */
+ if (itx->itx_lr.lrc_txtype == TX_COMMIT)
+ zil_commit_waiter_skip(itx->itx_private);
+
+ list_remove(list, itx);
+ zil_itx_destroy(itx);
+ }
+
+ cookie = NULL;
+ t = &itxs->i_async_tree;
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list = &ian->ia_list;
+ while ((itx = list_head(list)) != NULL) {
+ list_remove(list, itx);
+ /* commit itxs should never be on the async lists. */
+ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
+ zil_itx_destroy(itx);
+ }
+ list_destroy(list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ avl_destroy(t);
+
+ kmem_free(itxs, sizeof (itxs_t));
+}
+
+static int
+zil_aitx_compare(const void *x1, const void *x2)
+{
+ const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
+ const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
+
+ return (TREE_CMP(o1, o2));
+}
+
+/*
+ * Remove all async itxs with the given oid.
+ */
+void
+zil_remove_async(zilog_t *zilog, uint64_t oid)
+{
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
+ list_t clean_list;
+ itx_t *itx;
+
+ ASSERT(oid != 0);
+ list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * Locate the object node and append its list.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ ian = avl_find(t, &oid, &where);
+ if (ian != NULL)
+ list_move_tail(&clean_list, &ian->ia_list);
+ mutex_exit(&itxg->itxg_lock);
+ }
+ while ((itx = list_head(&clean_list)) != NULL) {
+ list_remove(&clean_list, itx);
+ /* commit itxs should never be on the async lists. */
+ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
+ zil_itx_destroy(itx);
+ }
+ list_destroy(&clean_list);
+}
+
+void
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+ uint64_t txg;
+ itxg_t *itxg;
+ itxs_t *itxs, *clean = NULL;
+
+ /*
+ * Ensure the data of a renamed file is committed before the rename.
+ */
+ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
+ zil_async_to_sync(zilog, itx->itx_oid);
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
+ txg = ZILTEST_TXG;
+ else
+ txg = dmu_tx_get_txg(tx);
+
+ itxg = &zilog->zl_itxg[txg & TXG_MASK];
+ mutex_enter(&itxg->itxg_lock);
+ itxs = itxg->itxg_itxs;
+ if (itxg->itxg_txg != txg) {
+ if (itxs != NULL) {
+ /*
+ * The zil_clean callback hasn't got around to cleaning
+ * this itxg. Save the itxs for release below.
+ * This should be rare.
+ */
+ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
+ "txg %llu", itxg->itxg_txg);
+ clean = itxg->itxg_itxs;
+ }
+ itxg->itxg_txg = txg;
+ itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t),
+ KM_SLEEP);
+
+ list_create(&itxs->i_sync_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ avl_create(&itxs->i_async_tree, zil_aitx_compare,
+ sizeof (itx_async_node_t),
+ offsetof(itx_async_node_t, ia_node));
+ }
+ if (itx->itx_sync) {
+ list_insert_tail(&itxs->i_sync_list, itx);
+ } else {
+ avl_tree_t *t = &itxs->i_async_tree;
+ uint64_t foid =
+ LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
+ itx_async_node_t *ian;
+ avl_index_t where;
+
+ ian = avl_find(t, &foid, &where);
+ if (ian == NULL) {
+ ian = kmem_alloc(sizeof (itx_async_node_t),
+ KM_SLEEP);
+ list_create(&ian->ia_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ ian->ia_foid = foid;
+ avl_insert(t, ian, where);
+ }
+ list_insert_tail(&ian->ia_list, itx);
+ }
+
+ itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+
+ /*
+ * We don't want to dirty the ZIL using ZILTEST_TXG, because
+ * zil_clean() will never be called using ZILTEST_TXG. Thus, we
+ * need to be careful to always dirty the ZIL using the "real"
+ * TXG (not itxg_txg) even when the SPA is frozen.
+ */
+ zilog_dirty(zilog, dmu_tx_get_txg(tx));
+ mutex_exit(&itxg->itxg_lock);
+
+ /* Release the old itxs now we've dropped the lock */
+ if (clean != NULL)
+ zil_itxg_clean(clean);
+}
+
+/*
+ * If there are any in-memory intent log transactions which have now been
+ * synced then start up a taskq to free them. We should only do this after we
+ * have written out the uberblocks (i.e. txg has been committed) so that
+ * don't inadvertently clean out in-memory log records that would be required
+ * by zil_commit().
+ */
+void
+zil_clean(zilog_t *zilog, uint64_t synced_txg)
+{
+ itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
+ itxs_t *clean_me;
+
+ ASSERT3U(synced_txg, <, ZILTEST_TXG);
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
+ mutex_exit(&itxg->itxg_lock);
+ return;
+ }
+ ASSERT3U(itxg->itxg_txg, <=, synced_txg);
+ ASSERT3U(itxg->itxg_txg, !=, 0);
+ clean_me = itxg->itxg_itxs;
+ itxg->itxg_itxs = NULL;
+ itxg->itxg_txg = 0;
+ mutex_exit(&itxg->itxg_lock);
+ /*
+	 * Preferably start a task queue to free up the old itxs, but
+	 * if taskq_dispatch can't allocate resources to do that then
+	 * free them in-line. This should be rare. Note that using TQ_SLEEP
+	 * created a bad performance problem.
+ */
+ ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
+ ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
+ taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
+ (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP);
+ if (id == TASKQID_INVALID)
+ zil_itxg_clean(clean_me);
+}
+
+/*
+ * This function will traverse the queue of itxs that need to be
+ * committed, and move them onto the ZIL's zl_itx_commit_list.
+ */
+static void
+zil_get_commit_list(zilog_t *zilog)
+{
+ uint64_t otxg, txg;
+ list_t *commit_list = &zilog->zl_itx_commit_list;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ /*
+ * This is inherently racy, since there is nothing to prevent
+ * the last synced txg from changing. That's okay since we'll
+ * only commit things in the future.
+ */
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * If we're adding itx records to the zl_itx_commit_list,
+ * then the zil better be dirty in this "txg". We can assert
+ * that here since we're holding the itxg_lock which will
+ * prevent spa_sync from cleaning it. Once we add the itxs
+ * to the zl_itx_commit_list we must commit it to disk even
+ * if it's unnecessary (i.e. the txg was synced).
+ */
+ ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
+ spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
+ list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
+
+ mutex_exit(&itxg->itxg_lock);
+ }
+}
+
+/*
+ * Move the async itxs for a specified object to commit into sync lists.
+ */
+void
+zil_async_to_sync(zilog_t *zilog, uint64_t foid)
+{
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ /*
+ * This is inherently racy, since there is nothing to prevent
+ * the last synced txg from changing.
+ */
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * If a foid is specified then find that node and append its
+ * list. Otherwise walk the tree appending all the lists
+ * to the sync list. We add to the end rather than the
+ * beginning to ensure the create has happened.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ if (foid != 0) {
+ ian = avl_find(t, &foid, &where);
+ if (ian != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ }
+ } else {
+ void *cookie = NULL;
+
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ list_destroy(&ian->ia_list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ }
+ mutex_exit(&itxg->itxg_lock);
+ }
+}
+
+/*
+ * This function will prune commit itxs that are at the head of the
+ * commit list (it won't prune past the first non-commit itx), and
+ * either: a) attach them to the last lwb that's still pending
+ * completion, or b) skip them altogether.
+ *
+ * This is used as a performance optimization to prevent commit itxs
+ * from generating new lwbs when it's unnecessary to do so.
+ */
+static void
+zil_prune_commit_list(zilog_t *zilog)
+{
+ itx_t *itx;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+ while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+ lr_t *lrc = &itx->itx_lr;
+ if (lrc->lrc_txtype != TX_COMMIT)
+ break;
+
+ mutex_enter(&zilog->zl_lock);
+
+ lwb_t *last_lwb = zilog->zl_last_lwb_opened;
+ if (last_lwb == NULL ||
+ last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
+ /*
+ * All of the itxs this waiter was waiting on
+ * must have already completed (or there were
+ * never any itx's for it to wait on), so it's
+ * safe to skip this waiter and mark it done.
+ */
+ zil_commit_waiter_skip(itx->itx_private);
+ } else {
+ zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
+ itx->itx_private = NULL;
+ }
+
+ mutex_exit(&zilog->zl_lock);
+
+ list_remove(&zilog->zl_itx_commit_list, itx);
+ zil_itx_destroy(itx);
+ }
+
+ IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
+}
+
+static void
+zil_commit_writer_stall(zilog_t *zilog)
+{
+ /*
+ * When zio_alloc_zil() fails to allocate the next lwb block on
+ * disk, we must call txg_wait_synced() to ensure all of the
+ * lwbs in the zilog's zl_lwb_list are synced and then freed (in
+ * zil_sync()), such that any subsequent ZIL writer (i.e. a call
+ * to zil_process_commit_list()) will have to call zil_create(),
+ * and start a new ZIL chain.
+ *
+ * Since zio_alloc_zil() failed, the lwb that was previously
+ * issued does not have a pointer to the "next" lwb on disk.
+ * Thus, if another ZIL writer thread were to allocate the "next"
+ * on-disk lwb, that block could be leaked in the event of a
+ * crash (because the previous lwb on-disk would not point to
+ * it).
+ *
+ * We must hold the zilog's zl_issuer_lock while we do this, to
+ * ensure no new threads enter zil_process_commit_list() until
+ * all lwb's in the zl_lwb_list have been synced and freed
+ * (which is achieved via the txg_wait_synced() call).
+ */
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+}
+
+/*
+ * This function will traverse the commit list, creating new lwbs as
+ * needed, and committing the itxs from the commit list to these newly
+ * created lwbs. Additionally, as a new lwb is created, the previous
+ * lwb will be issued to the zio layer to be written to disk.
+ */
+static void
+zil_process_commit_list(zilog_t *zilog)
+{
+ spa_t *spa = zilog->zl_spa;
+ list_t nolwb_itxs;
+ list_t nolwb_waiters;
+ lwb_t *lwb;
+ itx_t *itx;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+ /*
+ * Return if there's nothing to commit before we dirty the fs by
+ * calling zil_create().
+ */
+ if (list_head(&zilog->zl_itx_commit_list) == NULL)
+ return;
+
+ list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
+ list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
+ offsetof(zil_commit_waiter_t, zcw_node));
+
+ lwb = list_tail(&zilog->zl_lwb_list);
+ if (lwb == NULL) {
+ lwb = zil_create(zilog);
+ } else {
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+ }
+
+ while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+ lr_t *lrc = &itx->itx_lr;
+ uint64_t txg = lrc->lrc_txg;
+
+ ASSERT3U(txg, !=, 0);
+
+ if (lrc->lrc_txtype == TX_COMMIT) {
+ DTRACE_PROBE2(zil__process__commit__itx,
+ zilog_t *, zilog, itx_t *, itx);
+ } else {
+ DTRACE_PROBE2(zil__process__normal__itx,
+ zilog_t *, zilog, itx_t *, itx);
+ }
+
+ list_remove(&zilog->zl_itx_commit_list, itx);
+
+ boolean_t synced = txg <= spa_last_synced_txg(spa);
+ boolean_t frozen = txg > spa_freeze_txg(spa);
+
+ /*
+ * If the txg of this itx has already been synced out, then
+ * we don't need to commit this itx to an lwb. This is
+ * because the data of this itx will have already been
+ * written to the main pool. This is inherently racy, and
+ * it's still ok to commit an itx whose txg has already
+ * been synced; this will result in a write that's
+ * unnecessary, but will do no harm.
+ *
+ * With that said, we always want to commit TX_COMMIT itxs
+ * to an lwb, regardless of whether or not that itx's txg
+ * has been synced out. We do this to ensure any OPENED lwb
+ * will always have at least one zil_commit_waiter_t linked
+ * to the lwb.
+ *
+ * As a counter-example, if we skipped TX_COMMIT itx's
+ * whose txg had already been synced, the following
+ * situation could occur if we happened to be racing with
+ * spa_sync:
+ *
+ * 1. We commit a non-TX_COMMIT itx to an lwb, where the
+ * itx's txg is 10 and the last synced txg is 9.
+ * 2. spa_sync finishes syncing out txg 10.
+ * 3. We move to the next itx in the list, it's a TX_COMMIT
+ * whose txg is 10, so we skip it rather than committing
+ * it to the lwb used in (1).
+ *
+ * If the itx that is skipped in (3) is the last TX_COMMIT
+ * itx in the commit list, then it's possible for the lwb
+ * used in (1) to remain in the OPENED state indefinitely.
+ *
+ * To prevent the above scenario from occurring, and to ensure
+ * that once an lwb is OPENED it will transition to ISSUED and
+ * eventually DONE, we always commit TX_COMMIT itx's to an lwb
+ * here, even if that itx's txg has already been synced.
+ *
+ * Finally, if the pool is frozen, we _always_ commit the
+ * itx. The point of freezing the pool is to prevent data
+ * from being written to the main pool via spa_sync, and
+ * instead rely solely on the ZIL to persistently store the
+ * data; i.e. when the pool is frozen, the last synced txg
+ * value can't be trusted.
+ */
+ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
+ if (lwb != NULL) {
+ lwb = zil_lwb_commit(zilog, itx, lwb);
+
+ if (lwb == NULL)
+ list_insert_tail(&nolwb_itxs, itx);
+ else
+ list_insert_tail(&lwb->lwb_itxs, itx);
+ } else {
+ if (lrc->lrc_txtype == TX_COMMIT) {
+ zil_commit_waiter_link_nolwb(
+ itx->itx_private, &nolwb_waiters);
+ }
+
+ list_insert_tail(&nolwb_itxs, itx);
+ }
+ } else {
+ ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
+ zil_itx_destroy(itx);
+ }
+ }
+
+ if (lwb == NULL) {
+ /*
+ * This indicates zio_alloc_zil() failed to allocate the
+ * "next" lwb on-disk. When this happens, we must stall
+ * the ZIL write pipeline; see the comment within
+ * zil_commit_writer_stall() for more details.
+ */
+ zil_commit_writer_stall(zilog);
+
+ /*
+ * Additionally, we have to signal and mark the "nolwb"
+ * waiters as "done" here, since without an lwb, we
+ * can't do this via zil_lwb_flush_vdevs_done() like
+ * normal.
+ */
+ zil_commit_waiter_t *zcw;
+ while ((zcw = list_head(&nolwb_waiters)) != NULL) {
+ zil_commit_waiter_skip(zcw);
+ list_remove(&nolwb_waiters, zcw);
+ }
+
+ /*
+ * And finally, we have to destroy the itx's that
+ * couldn't be committed to an lwb; this will also call
+ * the itx's callback if one exists for the itx.
+ */
+ while ((itx = list_head(&nolwb_itxs)) != NULL) {
+ list_remove(&nolwb_itxs, itx);
+ zil_itx_destroy(itx);
+ }
+ } else {
+ ASSERT(list_is_empty(&nolwb_waiters));
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
+ /*
+ * At this point, the ZIL block pointed at by the "lwb"
+ * variable is in one of the following states: "closed"
+ * or "open".
+ *
+ * If it's "closed", then no itxs have been committed to
+ * it, so there's no point in issuing its zio (i.e. it's
+ * "empty").
+ *
+ * If it's "open", then it contains one or more itxs that
+ * eventually need to be committed to stable storage. In
+ * this case we intentionally do not issue the lwb's zio
+ * to disk yet, and instead rely on one of the following
+ * two mechanisms for issuing the zio:
+ *
+ * 1. Ideally, there will be more ZIL activity occurring
+ * on the system, such that this function will be
+ * immediately called again (not necessarily by the same
+ * thread) and this lwb's zio will be issued via
+ * zil_lwb_commit(). This way, the lwb is guaranteed to
+ * be "full" when it is issued to disk, and we'll make
+ * use of the lwb's size the best we can.
+ *
+ * 2. If there isn't sufficient ZIL activity occurring on
+ * the system, such that this lwb's zio isn't issued via
+ * zil_lwb_commit(), zil_commit_waiter() will issue the
+ * lwb's zio. If this occurs, the lwb is not guaranteed
+ * to be "full" by the time its zio is issued, which means
+ * the size of the lwb was "too large" given the amount
+ * of ZIL activity occurring on the system at that time.
+ *
+ * We do this for a couple of reasons:
+ *
+ * 1. To try and reduce the number of IOPs needed to
+ * write the same number of itxs. If an lwb has space
+ * available in its buffer for more itxs, and more itxs
+ * will be committed relatively soon (relative to the
+ * latency of performing a write), then it's beneficial
+ * to wait for these "next" itxs. This way, more itxs
+ * can be committed to stable storage with fewer writes.
+ *
+ * 2. To try and use the largest lwb block size that the
+ * incoming rate of itxs can support. Again, this is to
+ * try and pack as many itxs into as few lwbs as
+ * possible, without significantly impacting the latency
+ * of each individual itx.
+ */
+ }
+}
+
+/*
+ * This function is responsible for ensuring the passed in commit waiter
+ * (and associated commit itx) is committed to an lwb. If the waiter is
+ * not already committed to an lwb, all itxs in the zilog's queue of
+ * itxs will be processed. The assumption is the passed in waiter's
+ * commit itx will found in the queue just like the other non-commit
+ * itxs, such that when the entire queue is processed, the waiter will
+ * have been committed to an lwb.
+ *
+ * The lwb associated with the passed in waiter is not guaranteed to
+ * have been issued by the time this function completes. If the lwb is
+ * not issued, we rely on future calls to zil_commit_writer() to issue
+ * the lwb, or the timeout mechanism found in zil_commit_waiter().
+ */
+static void
+zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+ ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+ ASSERT(spa_writeable(zilog->zl_spa));
+
+ mutex_enter(&zilog->zl_issuer_lock);
+
+ if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
+ /*
+ * It's possible that, while we were waiting to acquire
+ * the "zl_issuer_lock", another thread committed this
+ * waiter to an lwb. If that occurs, we bail out early,
+ * without processing any of the zilog's queue of itxs.
+ *
+ * On certain workloads and system configurations, the
+ * "zl_issuer_lock" can become highly contended. In an
+ * attempt to reduce this contention, we immediately drop
+ * the lock if the waiter has already been processed.
+ *
+ * We've measured this optimization to reduce CPU spent
+ * contending on this lock by up to 5%, using a system
+ * with 32 CPUs, low latency storage (~50 usec writes),
+ * and 1024 threads performing sync writes.
+ */
+ goto out;
+ }
+
+ ZIL_STAT_BUMP(zil_commit_writer_count);
+
+ zil_get_commit_list(zilog);
+ zil_prune_commit_list(zilog);
+ zil_process_commit_list(zilog);
+
+out:
+ mutex_exit(&zilog->zl_issuer_lock);
+}
+
+static void
+zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+ ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+ ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+
+ lwb_t *lwb = zcw->zcw_lwb;
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
+
+ /*
+ * If the lwb has already been issued by another thread, we can
+ * immediately return since there's no work to be done (the
+ * point of this function is to issue the lwb). Additionally, we
+ * do this prior to acquiring the zl_issuer_lock, to avoid
+ * acquiring it when it's not necessary to do so.
+ */
+ if (lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+ return;
+
+ /*
+ * In order to call zil_lwb_write_issue() we must hold the
+ * zilog's "zl_issuer_lock". We can't simply acquire that lock,
+ * since we're already holding the commit waiter's "zcw_lock",
+ * and those two locks are acquired in the opposite order
+ * elsewhere.
+ */
+ mutex_exit(&zcw->zcw_lock);
+ mutex_enter(&zilog->zl_issuer_lock);
+ mutex_enter(&zcw->zcw_lock);
+
+ /*
+ * Since we just dropped and re-acquired the commit waiter's
+ * lock, we have to re-check to see if the waiter was marked
+ * "done" during that process. If the waiter was marked "done",
+ * the "lwb" pointer is no longer valid (it can be free'd after
+ * the waiter is marked "done"), so without this check we could
+ * wind up with a use-after-free error below.
+ */
+ if (zcw->zcw_done)
+ goto out;
+
+ ASSERT3P(lwb, ==, zcw->zcw_lwb);
+
+ /*
+ * We've already checked this above, but since we hadn't acquired
+ * the zilog's zl_issuer_lock, we have to perform this check a
+ * second time while holding the lock.
+ *
+ * We don't need to hold the zl_lock since the lwb cannot transition
+ * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
+ * _can_ transition from ISSUED to DONE, but it's OK to race with
+ * that transition since we treat the lwb the same, whether it's in
+ * the ISSUED or DONE states.
+ *
+ * The important thing is that we treat the lwb differently depending on
+ * whether it's ISSUED or OPENED, and block any other threads that might
+ * attempt to issue this lwb. For that reason we hold the
+ * zl_issuer_lock when checking the lwb_state; we must not call
+ * zil_lwb_write_issue() if the lwb had already been issued.
+ *
+ * See the comment above the lwb_state_t structure definition for
+ * more details on the lwb states, and locking requirements.
+ */
+ if (lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+ goto out;
+
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+
+ /*
+ * As described in the comments above zil_commit_waiter() and
+ * zil_process_commit_list(), we need to issue this lwb's zio
+ * since we've reached the commit waiter's timeout and it still
+ * hasn't been issued.
+ */
+ lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
+
+ IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
+
+ /*
+ * Since the lwb's zio hadn't been issued by the time this thread
+ * reached its timeout, we reset the zilog's "zl_cur_used" field
+ * to influence the zil block size selection algorithm.
+ *
+ * By having to issue the lwb's zio here, it means the size of the
+ * lwb was too large, given the incoming throughput of itxs. By
+ * setting "zl_cur_used" to zero, we communicate this fact to the
+ * block size selection algorithm, so it can take this information
+ * into account, and potentially select a smaller size for the
+ * next lwb block that is allocated.
+ */
+ zilog->zl_cur_used = 0;
+
+ if (nlwb == NULL) {
+ /*
+ * When zil_lwb_write_issue() returns NULL, this
+ * indicates zio_alloc_zil() failed to allocate the
+ * "next" lwb on-disk. When this occurs, the ZIL write
+ * pipeline must be stalled; see the comment within the
+ * zil_commit_writer_stall() function for more details.
+ *
+ * We must drop the commit waiter's lock prior to
+ * calling zil_commit_writer_stall() or else we can wind
+ * up with the following deadlock:
+ *
+ * - This thread is waiting for the txg to sync while
+ * holding the waiter's lock; txg_wait_synced() is
+ * used within zil_commit_writer_stall().
+ *
+ * - The txg can't sync because it is waiting for this
+ * lwb's zio callback to call dmu_tx_commit().
+ *
+ * - The lwb's zio callback can't call dmu_tx_commit()
+ * because it's blocked trying to acquire the waiter's
+ * lock, which occurs prior to calling dmu_tx_commit().
+ */
+ mutex_exit(&zcw->zcw_lock);
+ zil_commit_writer_stall(zilog);
+ mutex_enter(&zcw->zcw_lock);
+ }
+
+out:
+ mutex_exit(&zilog->zl_issuer_lock);
+ ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+}
+
+/*
+ * This function is responsible for performing the following two tasks:
+ *
+ * 1. its primary responsibility is to block until the given "commit
+ * waiter" is considered "done".
+ *
+ * 2. its secondary responsibility is to issue the zio for the lwb that
+ * the given "commit waiter" is waiting on, if this function has
+ * waited "long enough" and the lwb is still in the "open" state.
+ *
+ * Given a sufficient amount of itxs being generated and written using
+ * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
+ * function. If this does not occur, this secondary responsibility will
+ * ensure the lwb is issued even if there is no other synchronous
+ * activity on the system.
+ *
+ * For more details, see zil_process_commit_list(); more specifically,
+ * the comment at the bottom of that function.
+ */
+static void
+zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+ ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+ ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT(spa_writeable(zilog->zl_spa));
+
+ mutex_enter(&zcw->zcw_lock);
+
+ /*
+ * The timeout is scaled based on the lwb latency to avoid
+ * significantly impacting the latency of each individual itx.
+ * For more details, see the comment at the bottom of the
+ * zil_process_commit_list() function.
+ */
+ int pct = MAX(zfs_commit_timeout_pct, 1);
+ hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
+ hrtime_t wakeup = gethrtime() + sleep;
+ boolean_t timedout = B_FALSE;
+
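+	/*
+	 * Note: "wakeup" is an absolute deadline (CALLOUT_FLAG_ABSOLUTE is
+	 * passed to cv_timedwait_hires() below), so re-entering the wait
+	 * after a spurious wakeup does not extend the overall timeout.
+	 */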
+ while (!zcw->zcw_done) {
+ ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+
+ lwb_t *lwb = zcw->zcw_lwb;
+
+ /*
+ * Usually, the waiter will have a non-NULL lwb field here,
+ * but it's possible for it to be NULL as a result of
+ * zil_commit() racing with spa_sync().
+ *
+ * When zil_clean() is called, it's possible for the itxg
+ * list (which may be cleaned via a taskq) to contain
+ * commit itxs. When this occurs, the commit waiters linked
+ * off of these commit itxs will not be committed to an
+ * lwb. Additionally, these commit waiters will not be
+ * marked done until zil_commit_waiter_skip() is called via
+ * zil_itxg_clean().
+ *
+ * Thus, it's possible for this commit waiter (i.e. the
+ * "zcw" variable) to be found in this "in between" state;
+ * where its "zcw_lwb" field is NULL, and it hasn't yet
+ * been skipped, so its "zcw_done" field is still B_FALSE.
+ */
+ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
+
+ if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
+ ASSERT3B(timedout, ==, B_FALSE);
+
+ /*
+ * If the lwb hasn't been issued yet, then we
+ * need to wait with a timeout, in case this
+ * function needs to issue the lwb after the
+ * timeout is reached; responsibility (2) from
+ * the comment above this function.
+ */
+ int rc = cv_timedwait_hires(&zcw->zcw_cv,
+ &zcw->zcw_lock, wakeup, USEC2NSEC(1),
+ CALLOUT_FLAG_ABSOLUTE);
+
+ if (rc != -1 || zcw->zcw_done)
+ continue;
+
+ timedout = B_TRUE;
+ zil_commit_waiter_timeout(zilog, zcw);
+
+ if (!zcw->zcw_done) {
+ /*
+ * If the commit waiter has already been
+ * marked "done", it's possible for the
+ * waiter's lwb structure to have already
+ * been freed. Thus, we can only reliably
+ * make these assertions if the waiter
+ * isn't done.
+ */
+ ASSERT3P(lwb, ==, zcw->zcw_lwb);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
+ }
+ } else {
+ /*
+ * If the lwb isn't open, then it must have already
+ * been issued. In that case, there's no need to
+ * use a timeout when waiting for the lwb to
+ * complete.
+ *
+ * Additionally, if the lwb is NULL, the waiter
+ * will soon be signaled and marked done via
+ * zil_clean() and zil_itxg_clean(), so no timeout
+ * is required.
+ */
+
+ IMPLY(lwb != NULL,
+ lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+ cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
+ }
+ }
+
+ mutex_exit(&zcw->zcw_lock);
+}
+
+static zil_commit_waiter_t *
+zil_alloc_commit_waiter(void)
+{
+ zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
+
+ cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_link_init(&zcw->zcw_node);
+ zcw->zcw_lwb = NULL;
+ zcw->zcw_done = B_FALSE;
+ zcw->zcw_zio_error = 0;
+
+ return (zcw);
+}
+
+static void
+zil_free_commit_waiter(zil_commit_waiter_t *zcw)
+{
+ ASSERT(!list_link_active(&zcw->zcw_node));
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ ASSERT3B(zcw->zcw_done, ==, B_TRUE);
+ mutex_destroy(&zcw->zcw_lock);
+ cv_destroy(&zcw->zcw_cv);
+ kmem_cache_free(zil_zcw_cache, zcw);
+}
+
+/*
+ * This function is used to create a TX_COMMIT itx and assign it. This
+ * way, it will be linked into the ZIL's list of synchronous itxs, and
+ * then later committed to an lwb (or skipped) when
+ * zil_process_commit_list() is called.
+ */
+static void
+zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
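+	/*
+	 * The transaction created here carries no changes; it is assigned
+	 * (TXG_WAIT) to obtain the currently open txg, and zil_itx_assign()
+	 * uses that txg to file the commit itx on the matching in-memory
+	 * itxg list.
+	 */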
+ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
+ itx->itx_sync = B_TRUE;
+ itx->itx_private = zcw;
+
+ zil_itx_assign(zilog, itx, tx);
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Commit ZFS Intent Log transactions (itxs) to stable storage.
+ *
+ * When writing ZIL transactions to the on-disk representation of the
+ * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
+ * itxs can be committed to a single lwb. Once a lwb is written and
+ * committed to stable storage (i.e. the lwb is written, and vdevs have
+ * been flushed), each itx that was committed to that lwb is also
+ * considered to be committed to stable storage.
+ *
+ * When an itx is committed to an lwb, the log record (lr_t) contained
+ * by the itx is copied into the lwb's zio buffer, and once this buffer
+ * is written to disk, it becomes an on-disk ZIL block.
+ *
+ * As itxs are generated, they're inserted into the ZIL's queue of
+ * uncommitted itxs. The semantics of zil_commit() are such that it will
+ * block until all itxs that were in the queue when it was called are
+ * committed to stable storage.
+ *
+ * If "foid" is zero, this means all "synchronous" and "asynchronous"
+ * itxs, for all objects in the dataset, will be committed to stable
+ * storage prior to zil_commit() returning. If "foid" is non-zero, all
+ * "synchronous" itxs for all objects, but only "asynchronous" itxs
+ * that correspond to the foid passed in, will be committed to stable
+ * storage prior to zil_commit() returning.
+ *
+ * Generally speaking, when zil_commit() is called, the consumer doesn't
+ * actually care about _all_ of the uncommitted itxs. Instead, they're
+ * simply trying to wait for a specific itx to be committed to disk,
+ * but the interface(s) for interacting with the ZIL don't allow such
+ * fine-grained communication. A better interface would allow a consumer
+ * to create and assign an itx, and then pass a reference to this itx to
+ * zil_commit(); such that zil_commit() would return as soon as that
+ * specific itx was committed to disk (instead of waiting for _all_
+ * itxs to be committed).
+ *
+ * When a thread calls zil_commit() a special "commit itx" will be
+ * generated, along with a corresponding "waiter" for this commit itx.
+ * zil_commit() will wait on this waiter's CV, such that when the waiter
+ * is marked done, and signaled, zil_commit() will return.
+ *
+ * This commit itx is inserted into the queue of uncommitted itxs. This
+ * provides an easy mechanism for determining which itxs were in the
+ * queue prior to zil_commit() having been called, and which itxs were
+ * added after zil_commit() was called.
+ *
+ * The commit itx is special; it doesn't have any on-disk representation.
+ * When a commit itx is "committed" to an lwb, the waiter associated
+ * with it is linked onto the lwb's list of waiters. Then, when that lwb
+ * completes, each waiter on the lwb's list is marked done and signaled
+ * -- allowing the thread waiting on the waiter to return from zil_commit().
+ *
+ * It's important to point out a few critical factors that allow us
+ * to make use of the commit itxs, commit waiters, per-lwb lists of
+ * commit waiters, and zio completion callbacks like we're doing:
+ *
+ * 1. The list of waiters for each lwb is traversed, and each commit
+ * waiter is marked "done" and signaled, in the zio completion
+ * callback of the lwb's zio[*].
+ *
+ * * Actually, the waiters are signaled in the zio completion
+ * callback of the root zio for the DKIOCFLUSHWRITECACHE commands
+ * that are sent to the vdevs upon completion of the lwb zio.
+ *
+ * 2. When the itxs are inserted into the ZIL's queue of uncommitted
+ * itxs, the order in which they are inserted is preserved[*]; as
+ * itxs are added to the queue, they are added to the tail of
+ * in-memory linked lists.
+ *
+ * When committing the itxs to lwbs (to be written to disk), they
+ * are committed in the same order in which the itxs were added to
+ * the uncommitted queue's linked list(s); i.e. the linked list of
+ * itxs to commit is traversed from head to tail, and each itx is
+ * committed to an lwb in that order.
+ *
+ * * To clarify:
+ *
+ * - the order of "sync" itxs is preserved w.r.t. other
+ * "sync" itxs, regardless of the corresponding objects.
+ * - the order of "async" itxs is preserved w.r.t. other
+ * "async" itxs corresponding to the same object.
+ * - the order of "async" itxs is *not* preserved w.r.t. other
+ * "async" itxs corresponding to different objects.
+ * - the order of "sync" itxs w.r.t. "async" itxs (or vice
+ * versa) is *not* preserved, even for itxs that correspond
+ * to the same object.
+ *
+ * For more details, see: zil_itx_assign(), zil_async_to_sync(),
+ * zil_get_commit_list(), and zil_process_commit_list().
+ *
+ * 3. The lwbs represent a linked list of blocks on disk. Thus, an
+ * lwb cannot be considered committed to stable storage until its
+ * "previous" lwb is also committed to stable storage. This fact,
+ * coupled with the fact described above, means that itxs are
+ * committed in (roughly) the order in which they were generated.
+ * This is essential because itxs are dependent on prior itxs.
+ * Thus, we *must not* deem an itx as being committed to stable
+ * storage, until *all* prior itxs have also been committed to
+ * stable storage.
+ *
+ * To enforce this ordering of lwb zio's, while still leveraging as
+ * much of the underlying storage performance as possible, we rely
+ * on two fundamental concepts:
+ *
+ * 1. The creation and issuance of lwb zio's is protected by
+ * the zilog's "zl_issuer_lock", which ensures only a single
+ * thread is creating and/or issuing lwb's at a time
+ * 2. The "previous" lwb is a child of the "current" lwb
+ * (leveraging the zio parent-child dependency graph)
+ *
+ * By relying on this parent-child zio relationship, we can have
+ * many lwb zio's concurrently issued to the underlying storage,
+ * but the order in which they complete will be the same order in
+ * which they were created.
+ */
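+/*
+ * Illustrative call pattern (see, e.g., zfs_fsync()): after logging the
+ * itxs for a synchronous operation, a consumer forces them out with
+ *
+ *	zil_commit(zilog, object_id);
+ *
+ * where "object_id" (the foid) restricts which async itxs must also be
+ * written out, while all sync itxs are always committed.
+ */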
+void
+zil_commit(zilog_t *zilog, uint64_t foid)
+{
+ /*
+ * We should never attempt to call zil_commit on a snapshot for
+ * a couple of reasons:
+ *
+ * 1. A snapshot may never be modified, thus it cannot have any
+ * in-flight itxs that would have modified the dataset.
+ *
+ * 2. By design, when zil_commit() is called, a commit itx will
+ * be assigned to this zilog; as a result, the zilog will be
+ * dirtied. We must not dirty the zilog of a snapshot; there are
+ * checks in the code that enforce this invariant and will
+ * cause a panic if it's not upheld.
+ */
+ ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
+
+ if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+ return;
+
+ if (!spa_writeable(zilog->zl_spa)) {
+ /*
+ * If the SPA is not writable, there should never be any
+ * pending itxs waiting to be committed to disk. If that
+ * weren't true, we'd skip writing those itxs out, and
+ * would break the semantics of zil_commit(); thus, we're
+ * verifying that truth before we return to the caller.
+ */
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+ ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
+ for (int i = 0; i < TXG_SIZE; i++)
+ ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
+ return;
+ }
+
+ /*
+ * If the ZIL is suspended, we don't want to dirty it by calling
+ * zil_commit_itx_assign() below, nor can we write out
+ * lwbs as would be done in zil_process_commit_list(). Thus, we
+ * simply rely on txg_wait_synced() to maintain the necessary
+ * semantics, and avoid calling those functions altogether.
+ */
+ if (zilog->zl_suspend > 0) {
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ return;
+ }
+
+ zil_commit_impl(zilog, foid);
+}
+
+void
+zil_commit_impl(zilog_t *zilog, uint64_t foid)
+{
+ ZIL_STAT_BUMP(zil_commit_count);
+
+ /*
+ * Move the "async" itxs for the specified foid to the "sync"
+ * queues, such that they will later be committed to an lwb
+ * (or skipped) when zil_process_commit_list() is called.
+ *
+ * Since these "async" itxs must be committed prior to this
+ * call to zil_commit() returning, we must perform this operation
+ * before we call zil_commit_itx_assign().
+ */
+ zil_async_to_sync(zilog, foid);
+
+ /*
+ * We allocate a new "waiter" structure which will initially be
+ * linked to the commit itx using the itx's "itx_private" field.
+ * Since the commit itx doesn't represent any on-disk state,
+ * when it's committed to an lwb, rather than copying its
+ * lr_t into the lwb's buffer, the commit itx's "waiter" will be
+ * added to the lwb's list of waiters. Then, when the lwb is
+ * committed to stable storage, each waiter in the lwb's list of
+ * waiters will be marked "done" and signaled.
+ *
+ * We must create the waiter and assign the commit itx prior to
+ * calling zil_commit_writer(), or else our specific commit itx
+ * is not guaranteed to be committed to an lwb prior to calling
+ * zil_commit_waiter().
+ */
+ zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
+ zil_commit_itx_assign(zilog, zcw);
+
+ zil_commit_writer(zilog, zcw);
+ zil_commit_waiter(zilog, zcw);
+
+ if (zcw->zcw_zio_error != 0) {
+ /*
+ * If there was an error writing out the ZIL blocks that
+ * this thread is waiting on, then we fall back to
+ * relying on spa_sync() to write out the data this
+ * thread is waiting on. Obviously this has performance
+ * implications, but the expectation is for this to be
+ * an exceptional case, and shouldn't occur often.
+ */
+ DTRACE_PROBE2(zil__commit__io__error,
+ zilog_t *, zilog, zil_commit_waiter_t *, zcw);
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ }
+
+ zil_free_commit_waiter(zcw);
+}
+
+/*
+ * Called in syncing context to free committed log blocks and update log header.
+ */
+void
+zil_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = zilog->zl_spa;
+ uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
+ lwb_t *lwb;
+
+ /*
+ * We don't zero out zl_destroy_txg, so make sure we don't try
+ * to destroy it twice.
+ */
+ if (spa_sync_pass(spa) != 1)
+ return;
+
+ mutex_enter(&zilog->zl_lock);
+
+ ASSERT(zilog->zl_stop_sync == 0);
+
+ if (*replayed_seq != 0) {
+ ASSERT(zh->zh_replay_seq < *replayed_seq);
+ zh->zh_replay_seq = *replayed_seq;
+ *replayed_seq = 0;
+ }
+
+ if (zilog->zl_destroy_txg == txg) {
+ blkptr_t blk = zh->zh_log;
+
+ ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+
+ bzero(zh, sizeof (zil_header_t));
+ bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
+
+ if (zilog->zl_keep_first) {
+ /*
+ * If this block was part of a log chain that couldn't
+ * be claimed because a device was missing during
+ * zil_claim(), but that device later returns,
+ * then this block could erroneously appear valid.
+ * To guard against this, assign a new GUID to the new
+ * log chain so it doesn't matter what blk points to.
+ */
+ zil_init_log_chain(zilog, &blk);
+ zh->zh_log = blk;
+ }
+ }
+
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+ zh->zh_log = lwb->lwb_blk;
+ if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
+ break;
+ list_remove(&zilog->zl_lwb_list, lwb);
+ zio_free(spa, txg, &lwb->lwb_blk);
+ zil_free_lwb(zilog, lwb);
+
+ /*
+ * If we don't have anything left in the lwb list then
+ * we've had an allocation failure and we need to zero
+ * out the zil_header blkptr so that we don't end
+ * up freeing the same block twice.
+ */
+ if (list_head(&zilog->zl_lwb_list) == NULL)
+ BP_ZERO(&zh->zh_log);
+ }
+
+ /*
+ * Remove fastwrite on any blocks that have been pre-allocated for
+ * the next commit. This prevents fastwrite counter pollution by
+ * unused, long-lived LWBs.
+ */
+ for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) {
+ if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) {
+ metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
+ lwb->lwb_fastwrite = 0;
+ }
+ }
+
+ mutex_exit(&zilog->zl_lock);
+}
+
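+/*
+ * kmem cache constructor/destructor for lwb_t: initialize and tear down
+ * the embedded itx and waiter lists, the per-lwb vdev AVL tree, and the
+ * vdev lock, so lwbs allocated from zil_lwb_cache are ready for use.
+ */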
+/* ARGSUSED */
+static int
+zil_lwb_cons(void *vbuf, void *unused, int kmflag)
+{
+ lwb_t *lwb = vbuf;
+ list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
+ list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
+ offsetof(zil_commit_waiter_t, zcw_node));
+ avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
+ sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
+ mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+zil_lwb_dest(void *vbuf, void *unused)
+{
+ lwb_t *lwb = vbuf;
+ mutex_destroy(&lwb->lwb_vdev_lock);
+ avl_destroy(&lwb->lwb_vdev_tree);
+ list_destroy(&lwb->lwb_waiters);
+ list_destroy(&lwb->lwb_itxs);
+}
+
+void
+zil_init(void)
+{
+ zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
+ sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
+
+ zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
+ sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ zil_ksp = kstat_create("zfs", 0, "zil", "misc",
+ KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (zil_ksp != NULL) {
+ zil_ksp->ks_data = &zil_stats;
+ kstat_install(zil_ksp);
+ }
+}
+
+void
+zil_fini(void)
+{
+ kmem_cache_destroy(zil_zcw_cache);
+ kmem_cache_destroy(zil_lwb_cache);
+
+ if (zil_ksp != NULL) {
+ kstat_delete(zil_ksp);
+ zil_ksp = NULL;
+ }
+}
+
+void
+zil_set_sync(zilog_t *zilog, uint64_t sync)
+{
+ zilog->zl_sync = sync;
+}
+
+void
+zil_set_logbias(zilog_t *zilog, uint64_t logbias)
+{
+ zilog->zl_logbias = logbias;
+}
+
+zilog_t *
+zil_alloc(objset_t *os, zil_header_t *zh_phys)
+{
+ zilog_t *zilog;
+
+ zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
+
+ zilog->zl_header = zh_phys;
+ zilog->zl_os = os;
+ zilog->zl_spa = dmu_objset_spa(os);
+ zilog->zl_dmu_pool = dmu_objset_pool(os);
+ zilog->zl_destroy_txg = TXG_INITIAL - 1;
+ zilog->zl_logbias = dmu_objset_logbias(os);
+ zilog->zl_sync = dmu_objset_syncprop(os);
+ zilog->zl_dirty_max_txg = 0;
+ zilog->zl_last_lwb_opened = NULL;
+ zilog->zl_last_lwb_latency = 0;
+ zilog->zl_max_block_size = zil_maxblocksize;
+
+ mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
+
+ list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
+ offsetof(lwb_t, lwb_node));
+
+ list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+
+ cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+
+ return (zilog);
+}
+
+void
+zil_free(zilog_t *zilog)
+{
+ int i;
+
+ zilog->zl_stop_sync = 1;
+
+ ASSERT0(zilog->zl_suspend);
+ ASSERT0(zilog->zl_suspending);
+
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+ list_destroy(&zilog->zl_lwb_list);
+
+ ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
+ list_destroy(&zilog->zl_itx_commit_list);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ /*
+ * It's possible for an itx to be generated that doesn't dirty
+ * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
+ * callback to remove the entry. We remove those here.
+ *
+ * Also free up the ziltest itxs.
+ */
+ if (zilog->zl_itxg[i].itxg_itxs)
+ zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
+ mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
+ }
+
+ mutex_destroy(&zilog->zl_issuer_lock);
+ mutex_destroy(&zilog->zl_lock);
+
+ cv_destroy(&zilog->zl_cv_suspend);
+
+ kmem_free(zilog, sizeof (zilog_t));
+}
+
+/*
+ * Open an intent log.
+ */
+zilog_t *
+zil_open(objset_t *os, zil_get_data_t *get_data)
+{
+ zilog_t *zilog = dmu_objset_zil(os);
+
+ ASSERT3P(zilog->zl_get_data, ==, NULL);
+ ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+
+ zilog->zl_get_data = get_data;
+
+ return (zilog);
+}
+
+/*
+ * Close an intent log.
+ */
+void
+zil_close(zilog_t *zilog)
+{
+ lwb_t *lwb;
+ uint64_t txg;
+
+ if (!dmu_objset_is_snapshot(zilog->zl_os)) {
+ zil_commit(zilog, 0);
+ } else {
+ ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+ ASSERT0(zilog->zl_dirty_max_txg);
+ ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
+ }
+
+ mutex_enter(&zilog->zl_lock);
+ lwb = list_tail(&zilog->zl_lwb_list);
+ if (lwb == NULL)
+ txg = zilog->zl_dirty_max_txg;
+ else
+ txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
+ mutex_exit(&zilog->zl_lock);
+
+ /*
+ * We need to use txg_wait_synced() to wait long enough for the
+ * ZIL to be clean, and to wait for all pending lwbs to be
+ * written out.
+ */
+ if (txg != 0)
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+
+ if (zilog_is_dirty(zilog))
+ zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, txg);
+ if (txg < spa_freeze_txg(zilog->zl_spa))
+ VERIFY(!zilog_is_dirty(zilog));
+
+ zilog->zl_get_data = NULL;
+
+ /*
+ * We should have only one lwb left on the list; remove it now.
+ */
+ mutex_enter(&zilog->zl_lock);
+ lwb = list_head(&zilog->zl_lwb_list);
+ if (lwb != NULL) {
+ ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+
+ if (lwb->lwb_fastwrite)
+ metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
+
+ list_remove(&zilog->zl_lwb_list, lwb);
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ zil_free_lwb(zilog, lwb);
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+static char *suspend_tag = "zil suspending";
+
+/*
+ * Suspend an intent log. While in suspended mode, we still honor
+ * synchronous semantics, but we rely on txg_wait_synced() to do it.
+ * On old version pools, we suspend the log briefly when taking a
+ * snapshot so that it will have an empty intent log.
+ *
+ * Long holds are not really intended to be used the way we do here --
+ * held for such a short time. A concurrent caller of dsl_dataset_long_held()
+ * could fail. Therefore we take pains to only put a long hold if it is
+ * actually necessary. Fortunately, it will only be necessary if the
+ * objset is currently mounted (or the ZVOL equivalent). In that case it
+ * will already have a long hold, so we are not really making things any worse.
+ *
+ * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
+ * zvol_state_t), and use their mechanism to prevent their hold from being
+ * dropped (e.g. VFS_HOLD()). However, that would be even more pain for
+ * very little gain.
+ *
+ * If cookiep == NULL, this does both the suspend and resume.
+ * Otherwise, it returns with the dataset "long held", and the cookie
+ * should be passed into zil_resume().
+ */
+int
+zil_suspend(const char *osname, void **cookiep)
+{
+ objset_t *os;
+ zilog_t *zilog;
+ const zil_header_t *zh;
+ int error;
+
+ error = dmu_objset_hold(osname, suspend_tag, &os);
+ if (error != 0)
+ return (error);
+ zilog = dmu_objset_zil(os);
+
+ mutex_enter(&zilog->zl_lock);
+ zh = zilog->zl_header;
+
+ if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
+ mutex_exit(&zilog->zl_lock);
+ dmu_objset_rele(os, suspend_tag);
+ return (SET_ERROR(EBUSY));
+ }
+
+ /*
+ * Don't put a long hold in the cases where we can avoid it. This
+ * is when there is no cookie so we are doing a suspend & resume
+ * (i.e. called from zil_vdev_offline()), and there's nothing to do
+ * for the suspend because it's already suspended, or there's no ZIL.
+ */
+ if (cookiep == NULL && !zilog->zl_suspending &&
+ (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
+ mutex_exit(&zilog->zl_lock);
+ dmu_objset_rele(os, suspend_tag);
+ return (0);
+ }
+
+ dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
+ dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
+
+ zilog->zl_suspend++;
+
+ if (zilog->zl_suspend > 1) {
+ /*
+ * Someone else is already suspending it.
+ * Just wait for them to finish.
+ */
+
+ while (zilog->zl_suspending)
+ cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
+ mutex_exit(&zilog->zl_lock);
+
+ if (cookiep == NULL)
+ zil_resume(os);
+ else
+ *cookiep = os;
+ return (0);
+ }
+
+ /*
+ * If there is no pointer to an on-disk block, this ZIL must not
+ * be active (e.g. filesystem not mounted), so there's nothing
+ * to clean up.
+ */
+ if (BP_IS_HOLE(&zh->zh_log)) {
+ ASSERT(cookiep != NULL); /* fast path already handled */
+
+ *cookiep = os;
+ mutex_exit(&zilog->zl_lock);
+ return (0);
+ }
+
+ /*
+ * The ZIL has work to do. Ensure that the associated encryption
+ * key will remain mapped while we are committing the log by
+ * grabbing a reference to it. If the key isn't loaded we have no
+ * choice but to return an error until the wrapping key is loaded.
+ */
+ if (os->os_encrypted &&
+ dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) {
+ zilog->zl_suspend--;
+ mutex_exit(&zilog->zl_lock);
+ dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
+ dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
+ return (SET_ERROR(EACCES));
+ }
+
+ zilog->zl_suspending = B_TRUE;
+ mutex_exit(&zilog->zl_lock);
+
+ /*
+ * We need to use zil_commit_impl to ensure we wait for all
+ * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed
+ * to disk before proceeding. If we used zil_commit instead, it
+ * would just call txg_wait_synced(), because zl_suspend is set.
+ * txg_wait_synced() doesn't wait for these lwb's to be
+ * LWB_STATE_FLUSH_DONE before returning.
+ */
+ zil_commit_impl(zilog, 0);
+
+ /*
+ * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
+ * use txg_wait_synced() to ensure the data from the zilog has
+ * migrated to the main pool before calling zil_destroy().
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+
+ zil_destroy(zilog, B_FALSE);
+
+ mutex_enter(&zilog->zl_lock);
+ zilog->zl_suspending = B_FALSE;
+ cv_broadcast(&zilog->zl_cv_suspend);
+ mutex_exit(&zilog->zl_lock);
+
+ if (os->os_encrypted)
+ dsl_dataset_remove_key_mapping(dmu_objset_ds(os));
+
+ if (cookiep == NULL)
+ zil_resume(os);
+ else
+ *cookiep = os;
+ return (0);
+}
+
+void
+zil_resume(void *cookie)
+{
+ objset_t *os = cookie;
+ zilog_t *zilog = dmu_objset_zil(os);
+
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(zilog->zl_suspend != 0);
+ zilog->zl_suspend--;
+ mutex_exit(&zilog->zl_lock);
+ dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
+ dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
+}
+
+typedef struct zil_replay_arg {
+ zil_replay_func_t **zr_replay;
+ void *zr_arg;
+ boolean_t zr_byteswap;
+ char *zr_lr;
+} zil_replay_arg_t;
+
+static int
+zil_replay_error(zilog_t *zilog, const lr_t *lr, int error)
+{
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+
+ zilog->zl_replaying_seq--; /* didn't actually replay this one */
+
+ dmu_objset_name(zilog->zl_os, name);
+
+ cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+ "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
+ (u_longlong_t)lr->lrc_seq,
+ (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
+ (lr->lrc_txtype & TX_CI) ? "CI" : "");
+
+ return (error);
+}
+
+static int
+zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra,
+ uint64_t claim_txg)
+{
+ zil_replay_arg_t *zr = zra;
+ const zil_header_t *zh = zilog->zl_header;
+ uint64_t reclen = lr->lrc_reclen;
+ uint64_t txtype = lr->lrc_txtype;
+ int error = 0;
+
+ zilog->zl_replaying_seq = lr->lrc_seq;
+
+ if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
+ return (0);
+
+ if (lr->lrc_txg < claim_txg) /* already committed */
+ return (0);
+
+ /* Strip case-insensitive bit, still present in log record */
+ txtype &= ~TX_CI;
+
+ if (txtype == 0 || txtype >= TX_MAX_TYPE)
+ return (zil_replay_error(zilog, lr, EINVAL));
+
+ /*
+ * If this record type can be logged out of order, the object
+ * (lr_foid) may no longer exist. That's legitimate, not an error.
+ */
+ if (TX_OOO(txtype)) {
+ error = dmu_object_info(zilog->zl_os,
+ LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
+ if (error == ENOENT || error == EEXIST)
+ return (0);
+ }
+
+ /*
+ * Make a copy of the data so we can revise and extend it.
+ */
+ bcopy(lr, zr->zr_lr, reclen);
+
+ /*
+ * If this is a TX_WRITE with a blkptr, suck in the data.
+ */
+ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+ error = zil_read_log_data(zilog, (lr_write_t *)lr,
+ zr->zr_lr + reclen);
+ if (error != 0)
+ return (zil_replay_error(zilog, lr, error));
+ }
+
+ /*
+ * The log block containing this lr may have been byteswapped
+ * so that we can easily examine common fields like lrc_txtype.
+ * However, the log is a mix of different record types, and only the
+ * replay vectors know how to byteswap their records. Therefore, if
+ * the lr was byteswapped, undo it before invoking the replay vector.
+ */
+ if (zr->zr_byteswap)
+ byteswap_uint64_array(zr->zr_lr, reclen);
+
+ /*
+ * We must now do two things atomically: replay this log record,
+ * and update the log header sequence number to reflect the fact that
+ * we did so. At the end of each replay function the sequence number
+ * is updated if we are in replay mode.
+ */
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
+ if (error != 0) {
+ /*
+ * The DMU's dnode layer doesn't see removes until the txg
+ * commits, so a subsequent claim can spuriously fail with
+ * EEXIST. So if we receive any error we try syncing out
+ * any removes, then retry the transaction. Note that we
+ * specify B_FALSE for byteswap now, so we don't do it twice.
+ */
+ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
+ if (error != 0)
+ return (zil_replay_error(zilog, lr, error));
+ }
+ return (0);
+}
+
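+/* zil_parse() block callback used during replay; it just counts blocks. */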
+/* ARGSUSED */
+static int
+zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ zilog->zl_replay_blks++;
+
+ return (0);
+}
+
+/*
+ * If this dataset has a non-empty intent log, replay it and destroy it.
+ */
+void
+zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
+{
+ zilog_t *zilog = dmu_objset_zil(os);
+ const zil_header_t *zh = zilog->zl_header;
+ zil_replay_arg_t zr;
+
+ if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
+ zil_destroy(zilog, B_TRUE);
+ return;
+ }
+
+ zr.zr_replay = replay_func;
+ zr.zr_arg = arg;
+ zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
+ zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+
+ /*
+ * Wait for in-progress removes to sync before starting replay.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+
+ zilog->zl_replay = B_TRUE;
+ zilog->zl_replay_time = ddi_get_lbolt();
+ ASSERT(zilog->zl_replay_blks == 0);
+ (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
+ zh->zh_claim_txg, B_TRUE);
+ vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
+
+ zil_destroy(zilog, B_FALSE);
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+ zilog->zl_replay = B_FALSE;
+}
+
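+/*
+ * Returns B_TRUE when the caller should not write new log records:
+ * either sync is disabled for this dataset, or we are in the middle of
+ * replay, in which case the sequence number being replayed is recorded
+ * so zil_sync() can update the on-disk replay sequence.
+ */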
+boolean_t
+zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
+{
+ if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+ return (B_TRUE);
+
+ if (zilog->zl_replay) {
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
+ zilog->zl_replaying_seq;
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
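+/*
+ * Empty out the ZIL for "osname" via a paired suspend/resume (cookiep ==
+ * NULL). EACCES (crypto key not loaded) and EBUSY are passed through;
+ * any other failure is reported as EEXIST.
+ */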
+/* ARGSUSED */
+int
+zil_reset(const char *osname, void *arg)
+{
+ int error;
+
+ error = zil_suspend(osname, NULL);
+ /* EACCES means crypto key not loaded */
+ if ((error == EACCES) || (error == EBUSY))
+ return (SET_ERROR(error));
+ if (error != 0)
+ return (SET_ERROR(EEXIST));
+ return (0);
+}
+
+EXPORT_SYMBOL(zil_alloc);
+EXPORT_SYMBOL(zil_free);
+EXPORT_SYMBOL(zil_open);
+EXPORT_SYMBOL(zil_close);
+EXPORT_SYMBOL(zil_replay);
+EXPORT_SYMBOL(zil_replaying);
+EXPORT_SYMBOL(zil_destroy);
+EXPORT_SYMBOL(zil_destroy_sync);
+EXPORT_SYMBOL(zil_itx_create);
+EXPORT_SYMBOL(zil_itx_destroy);
+EXPORT_SYMBOL(zil_itx_assign);
+EXPORT_SYMBOL(zil_commit);
+EXPORT_SYMBOL(zil_claim);
+EXPORT_SYMBOL(zil_check_log_chain);
+EXPORT_SYMBOL(zil_sync);
+EXPORT_SYMBOL(zil_clean);
+EXPORT_SYMBOL(zil_suspend);
+EXPORT_SYMBOL(zil_resume);
+EXPORT_SYMBOL(zil_lwb_add_block);
+EXPORT_SYMBOL(zil_bp_tree_add);
+EXPORT_SYMBOL(zil_set_sync);
+EXPORT_SYMBOL(zil_set_logbias);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, INT, ZMOD_RW,
+ "ZIL block open timeout percentage");
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
+ "Disable intent logging replay");
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW,
+ "Disable ZIL cache flushes");
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, ULONG, ZMOD_RW,
+ "Limit in bytes slog sync writes per commit");
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, INT, ZMOD_RW,
+ "Limit in bytes of ZIL log block size");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
new file mode 100644
index 000000000000..7f3cb19d46db
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -0,0 +1,5039 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
+#include <sys/blkptr.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_scan.h>
+#include <sys/metaslab_impl.h>
+#include <sys/time.h>
+#include <sys/trace_zfs.h>
+#include <sys/abd.h>
+#include <sys/dsl_crypt.h>
+#include <cityhash.h>
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+const char *zio_type_name[ZIO_TYPES] = {
+ /*
+ * Note: Linux kernel thread name length is limited
+ * so these names will differ from upstream open zfs.
+ */
+ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
+};
+
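+/*
+ * zio_dva_throttle_enabled gates the block allocation throttle applied
+ * during the DVA allocation stage; zio_deadman_log_all makes the zio
+ * deadman report every slow zio rather than only those bound to a vdev.
+ */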
+int zio_dva_throttle_enabled = B_TRUE;
+int zio_deadman_log_all = B_FALSE;
+
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_cache;
+kmem_cache_t *zio_link_cache;
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#endif
+
+/* Mark IOs as "slow" if they take longer than 30 seconds */
+int zio_slow_io_ms = (30 * MILLISEC);
+
+#define BP_SPANB(indblkshift, level) \
+ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define COMPARE_META_LEVEL 0x80000000ul
+/*
+ * The following actions directly affect the spa's sync-to-convergence logic.
+ * The values below define the sync pass when we start performing the action.
+ * Care should be taken when changing these values as they directly impact
+ * spa_sync() performance. Tuning these values may introduce subtle performance
+ * pathologies and should only be done in the context of performance analysis.
+ * These tunables will eventually be removed and replaced with #defines once
+ * enough analysis has been done to determine optimal values.
+ *
+ * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
+ * regular blocks are not deferred.
+ *
+ * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable
+ * compression (including of metadata). In practice, we don't have this
+ * many sync passes, so this has no effect.
+ *
+ * The original intent was that disabling compression would help the sync
+ * passes to converge. However, in practice disabling compression increases
+ * the average number of sync passes, because when we turn compression off, a
+ * lot of blocks' sizes will change and thus we have to re-allocate (not
+ * overwrite) them. It also increases the number of 128KB allocations (e.g.
+ * for indirect blocks and spacemaps) because these will not be compressed.
+ * The 128K allocations are especially detrimental to performance on highly
+ * fragmented systems, which may have very few free segments of this size,
+ * and may need to load new metaslabs to satisfy 128K allocations.
+ */
+int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
+int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */
+int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
+
+/*
+ * An allocating zio is one that either currently has the DVA allocate
+ * stage set or will have it later in its lifetime.
+ */
+#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
+
+/*
+ * Keep kernel crash dumps ("cores") smaller by excluding metadata
+ * allocations from them as well.
+ */
+int zio_exclude_metadata = 0;
+int zio_requeue_io_start_cut_in_line = 1;
+
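+/*
+ * In debug builds, zio buffer caches at or below this size keep full kmem
+ * debugging enabled (unless zio_exclude_metadata is set); larger caches
+ * are created with KMC_NODEBUG. See the cflags computation in zio_init().
+ */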
+#ifdef ZFS_DEBUG
+int zio_buf_debug_limit = 16384;
+#else
+int zio_buf_debug_limit = 0;
+#endif
+
+static inline void __zio_execute(zio_t *zio);
+
+static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
+
+void
+zio_init(void)
+{
+ size_t c;
+
+ zio_cache = kmem_cache_create("zio_cache",
+ sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ zio_link_cache = kmem_cache_create("zio_link_cache",
+ sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ /*
+ * For small buffers, we want a cache for each multiple of
+ * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
+ * for each quarter-power of 2.
+ */
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
+ size_t p2 = size;
+ size_t align = 0;
+ size_t data_cflags, cflags;
+
+ data_cflags = KMC_NODEBUG;
+ cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
+ KMC_NODEBUG : 0;
+
+#if defined(_ILP32) && defined(_KERNEL)
+ /*
+ * Cache size limited to 1M on 32-bit platforms until ARC
+ * buffers no longer require virtual address space.
+ */
+ if (size > zfs_max_recordsize)
+ break;
+#endif
+
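+		/* Round p2 down to the largest power of two <= size. */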
+ while (!ISP2(p2))
+ p2 &= p2 - 1;
+
+#ifndef _KERNEL
+ /*
+ * If we are using watchpoints, put each buffer on its own page,
+ * to eliminate the performance overhead of trapping to the
+ * kernel when modifying a non-watched buffer that shares the
+ * page with a watched buffer.
+ */
+ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
+ continue;
+ /*
+ * Here's the problem - on 4K native devices in userland on
+ * Linux using O_DIRECT, buffers must be 4K aligned or I/O
+ * will fail with EINVAL, causing zdb (and others) to coredump.
+ * Since userland probably doesn't need optimized buffer caches,
+ * we just force 4K alignment on everything.
+ */
+ align = 8 * SPA_MINBLOCKSIZE;
+#else
+ if (size < PAGESIZE) {
+ align = SPA_MINBLOCKSIZE;
+ } else if (IS_P2ALIGNED(size, p2 >> 2)) {
+ align = PAGESIZE;
+ }
+#endif
+
+ if (align != 0) {
+ char name[36];
+ if (cflags == data_cflags) {
+ /*
+ * Resulting kmem caches would be identical.
+ * Save memory by creating only one.
+ */
+ (void) snprintf(name, sizeof (name),
+ "zio_buf_comb_%lu", (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name,
+ size, align, NULL, NULL, NULL, NULL, NULL,
+ cflags);
+ zio_data_buf_cache[c] = zio_buf_cache[c];
+ continue;
+ }
+ (void) snprintf(name, sizeof (name), "zio_buf_%lu",
+ (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, cflags);
+
+ (void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
+ (ulong_t)size);
+ zio_data_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, data_cflags);
+ }
+ }
+
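+ /*
+ * Any size class that did not get its own cache above shares
+ * the next larger cache that was created.
+ */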
+ while (--c != 0) {
+ ASSERT(zio_buf_cache[c] != NULL);
+ if (zio_buf_cache[c - 1] == NULL)
+ zio_buf_cache[c - 1] = zio_buf_cache[c];
+
+ ASSERT(zio_data_buf_cache[c] != NULL);
+ if (zio_data_buf_cache[c - 1] == NULL)
+ zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
+ }
+
+ zio_inject_init();
+
+ lz4_init();
+}
+
+void
+zio_fini(void)
+{
+ size_t i, j, n;
+ kmem_cache_t *cache;
+
+ n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
+
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ for (i = 0; i < n; i++) {
+ if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i])
+ (void) printf("zio_fini: [%d] %llu != %llu\n",
+ (int)((i + 1) << SPA_MINBLOCKSHIFT),
+ (long long unsigned)zio_buf_cache_allocs[i],
+ (long long unsigned)zio_buf_cache_frees[i]);
+ }
+#endif
+
+ /*
+ * The same kmem cache can show up multiple times in both zio_buf_cache
+ * and zio_data_buf_cache. Do a wasteful but trivially correct scan to
+ * sort it out.
+ */
+ for (i = 0; i < n; i++) {
+ cache = zio_buf_cache[i];
+ if (cache == NULL)
+ continue;
+ for (j = i; j < n; j++) {
+ if (cache == zio_buf_cache[j])
+ zio_buf_cache[j] = NULL;
+ if (cache == zio_data_buf_cache[j])
+ zio_data_buf_cache[j] = NULL;
+ }
+ kmem_cache_destroy(cache);
+ }
+
+ for (i = 0; i < n; i++) {
+ cache = zio_data_buf_cache[i];
+ if (cache == NULL)
+ continue;
+ for (j = i; j < n; j++) {
+ if (cache == zio_data_buf_cache[j])
+ zio_data_buf_cache[j] = NULL;
+ }
+ kmem_cache_destroy(cache);
+ }
+
+ for (i = 0; i < n; i++) {
+ if (zio_buf_cache[i] != NULL)
+ panic("zio_fini: zio_buf_cache[%d] != NULL", (int)i);
+ if (zio_data_buf_cache[i] != NULL)
+ panic("zio_fini: zio_data_buf_cache[%d] != NULL", (int)i);
+ }
+
+ kmem_cache_destroy(zio_link_cache);
+ kmem_cache_destroy(zio_cache);
+
+ zio_inject_fini();
+
+ lz4_fini();
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free I/O buffers
+ * ==========================================================================
+ */
+
+/*
+ * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
+ * crashdump if the kernel panics, so use it judiciously. Obviously, it's
+ * useful to inspect ZFS metadata, but if possible, we should avoid keeping
+ * excess / transient data in-core during a crashdump.
+ */
+void *
+zio_buf_alloc(size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ atomic_add_64(&zio_buf_cache_allocs[c], 1);
+#endif
+
+ return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
+}
+
+/*
+ * Use zio_data_buf_alloc to allocate data. The data will not appear in a
+ * crashdump if the kernel panics. This exists so that we limit the amount
+ * of ZFS data that shows up in a kernel crashdump, thus reducing the amount
+ * of kernel heap dumped to disk when the kernel panics.
+ */
+void *
+zio_data_buf_alloc(size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
+}
+
+void
+zio_buf_free(void *buf, size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ atomic_add_64(&zio_buf_cache_frees[c], 1);
+#endif
+
+ kmem_cache_free(zio_buf_cache[c], buf);
+}
+
+void
+zio_data_buf_free(void *buf, size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ kmem_cache_free(zio_data_buf_cache[c], buf);
+}
+
+static void
+zio_abd_free(void *abd, size_t size)
+{
+ abd_free((abd_t *)abd);
+}
+
+/*
+ * ==========================================================================
+ * Push and pop I/O transform buffers
+ * ==========================================================================
+ */
+void
+zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
+ zio_transform_func_t *transform)
+{
+ zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+
+ zt->zt_orig_abd = zio->io_abd;
+ zt->zt_orig_size = zio->io_size;
+ zt->zt_bufsize = bufsize;
+ zt->zt_transform = transform;
+
+ zt->zt_next = zio->io_transform_stack;
+ zio->io_transform_stack = zt;
+
+ zio->io_abd = data;
+ zio->io_size = size;
+}
+
+void
+zio_pop_transforms(zio_t *zio)
+{
+ zio_transform_t *zt;
+
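+ /*
+ * Unwind the transform stack in LIFO order: run each transform's
+ * callback against the original buffer, free the pushed abd when
+ * zt_bufsize is non-zero, and restore the original abd and size.
+ */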
+ while ((zt = zio->io_transform_stack) != NULL) {
+ if (zt->zt_transform != NULL)
+ zt->zt_transform(zio,
+ zt->zt_orig_abd, zt->zt_orig_size);
+
+ if (zt->zt_bufsize != 0)
+ abd_free(zio->io_abd);
+
+ zio->io_abd = zt->zt_orig_abd;
+ zio->io_size = zt->zt_orig_size;
+ zio->io_transform_stack = zt->zt_next;
+
+ kmem_free(zt, sizeof (zio_transform_t));
+ }
+}
+
+/*
+ * ==========================================================================
+ * I/O transform callbacks for subblocks, decompression, and decryption
+ * ==========================================================================
+ */
+static void
+zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
+{
+ ASSERT(zio->io_size > size);
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_copy(data, zio->io_abd, size);
+}
+
+static void
+zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
+{
+ if (zio->io_error == 0) {
+ void *tmp = abd_borrow_buf(data, size);
+ int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
+ zio->io_abd, tmp, zio->io_size, size,
+ &zio->io_prop.zp_complevel);
+ abd_return_buf_copy(data, tmp, size);
+
+ if (zio_injection_enabled && ret == 0)
+ ret = zio_handle_fault_injection(zio, EINVAL);
+
+ if (ret != 0)
+ zio->io_error = SET_ERROR(EIO);
+ }
+}
+
+static void
+zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
+{
+ int ret;
+ void *tmp;
+ blkptr_t *bp = zio->io_bp;
+ spa_t *spa = zio->io_spa;
+ uint64_t dsobj = zio->io_bookmark.zb_objset;
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ dmu_object_type_t ot = BP_GET_TYPE(bp);
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ boolean_t no_crypt = B_FALSE;
+
+ ASSERT(BP_USES_CRYPT(bp));
+ ASSERT3U(size, !=, 0);
+
+ if (zio->io_error != 0)
+ return;
+
+ /*
+ * Verify the cksum of MACs stored in an indirect bp. It will always
+ * be possible to verify this since it does not require an encryption
+ * key.
+ */
+ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
+ zio_crypt_decode_mac_bp(bp, mac);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ /*
+ * We haven't decompressed the data yet, but
+ * zio_crypt_do_indirect_mac_checksum() requires
+ * decompressed data to be able to parse out the MACs
+ * from the indirect block. We decompress it now and
+ * throw away the result after we are finished.
+ */
+ tmp = zio_buf_alloc(lsize);
+ ret = zio_decompress_data(BP_GET_COMPRESS(bp),
+ zio->io_abd, tmp, zio->io_size, lsize,
+ &zio->io_prop.zp_complevel);
+ if (ret != 0) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+ ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
+ tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
+ zio_buf_free(tmp, lsize);
+ } else {
+ ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
+ zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
+ }
+ abd_copy(data, zio->io_abd, size);
+
+ if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) {
+ ret = zio_handle_decrypt_injection(spa,
+ &zio->io_bookmark, ot, ECKSUM);
+ }
+ if (ret != 0)
+ goto error;
+
+ return;
+ }
+
+ /*
+ * If this is an authenticated block, just check the MAC. It would be
+ * nice to separate this out into its own flag, but for the moment
+ * enum zio_flag is out of bits.
+ */
+ if (BP_IS_AUTHENTICATED(bp)) {
+ if (ot == DMU_OT_OBJSET) {
+ ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
+ dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
+ } else {
+ zio_crypt_decode_mac_bp(bp, mac);
+ ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
+ zio->io_abd, size, mac);
+ if (zio_injection_enabled && ret == 0) {
+ ret = zio_handle_decrypt_injection(spa,
+ &zio->io_bookmark, ot, ECKSUM);
+ }
+ }
+ abd_copy(data, zio->io_abd, size);
+
+ if (ret != 0)
+ goto error;
+
+ return;
+ }
+
+ zio_crypt_decode_params_bp(bp, salt, iv);
+
+ if (ot == DMU_OT_INTENT_LOG) {
+ tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
+ zio_crypt_decode_mac_zil(tmp, mac);
+ abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
+ } else {
+ zio_crypt_decode_mac_bp(bp, mac);
+ }
+
+ ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
+ BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data,
+ zio->io_abd, &no_crypt);
+ if (no_crypt)
+ abd_copy(data, zio->io_abd, size);
+
+ if (ret != 0)
+ goto error;
+
+ return;
+
+error:
+ /* assert that the key was found unless this was speculative */
+ ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
+
+ /*
+ * If there was a decryption / authentication error return EIO as
+ * the io_error. If this was not a speculative zio, create an ereport.
+ */
+ if (ret == ECKSUM) {
+ zio->io_error = SET_ERROR(EIO);
+ if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
+ spa_log_error(spa, &zio->io_bookmark);
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
+ spa, NULL, &zio->io_bookmark, zio, 0);
+ }
+ } else {
+ zio->io_error = ret;
+ }
+}
+
+/*
+ * ==========================================================================
+ * I/O parent/child relationships and pipeline interlocks
+ * ==========================================================================
+ */
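+/*
+ * A minimal usage sketch for the iterators below: start with a NULL
+ * cursor and call repeatedly until NULL is returned, e.g.
+ *
+ *	zio_link_t *zl = NULL;
+ *	zio_t *pio;
+ *	while ((pio = zio_walk_parents(cio, &zl)) != NULL)
+ *		handle(pio);
+ *
+ * where handle() is a placeholder for the caller's per-parent work.
+ */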
+zio_t *
+zio_walk_parents(zio_t *cio, zio_link_t **zl)
+{
+ list_t *pl = &cio->io_parent_list;
+
+ *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
+ if (*zl == NULL)
+ return (NULL);
+
+ ASSERT((*zl)->zl_child == cio);
+ return ((*zl)->zl_parent);
+}
+
+zio_t *
+zio_walk_children(zio_t *pio, zio_link_t **zl)
+{
+ list_t *cl = &pio->io_child_list;
+
+ ASSERT(MUTEX_HELD(&pio->io_lock));
+
+ *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
+ if (*zl == NULL)
+ return (NULL);
+
+ ASSERT((*zl)->zl_parent == pio);
+ return ((*zl)->zl_child);
+}
+
+zio_t *
+zio_unique_parent(zio_t *cio)
+{
+ zio_link_t *zl = NULL;
+ zio_t *pio = zio_walk_parents(cio, &zl);
+
+ VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
+ return (pio);
+}
+
+void
+zio_add_child(zio_t *pio, zio_t *cio)
+{
+ zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+
+ /*
+ * Logical I/Os can have logical, gang, or vdev children.
+ * Gang I/Os can have gang or vdev children.
+ * Vdev I/Os can only have vdev children.
+ * The following ASSERT captures all of these constraints.
+ */
+ ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
+
+ zl->zl_parent = pio;
+ zl->zl_child = cio;
+
+ mutex_enter(&pio->io_lock);
+ mutex_enter(&cio->io_lock);
+
+ ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
+
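+ /*
+ * For each wait type the child has not yet passed, count it
+ * as an outstanding child of the parent.
+ */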
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
+
+ list_insert_head(&pio->io_child_list, zl);
+ list_insert_head(&cio->io_parent_list, zl);
+
+ pio->io_child_count++;
+ cio->io_parent_count++;
+
+ mutex_exit(&cio->io_lock);
+ mutex_exit(&pio->io_lock);
+}
+
+static void
+zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
+{
+ ASSERT(zl->zl_parent == pio);
+ ASSERT(zl->zl_child == cio);
+
+ mutex_enter(&pio->io_lock);
+ mutex_enter(&cio->io_lock);
+
+ list_remove(&pio->io_child_list, zl);
+ list_remove(&cio->io_parent_list, zl);
+
+ pio->io_child_count--;
+ cio->io_parent_count--;
+
+ mutex_exit(&cio->io_lock);
+ mutex_exit(&pio->io_lock);
+ kmem_cache_free(zio_link_cache, zl);
+}
+
+static boolean_t
+zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
+{
+ boolean_t waiting = B_FALSE;
+
+ mutex_enter(&zio->io_lock);
+ ASSERT(zio->io_stall == NULL);
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
+ if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
+ continue;
+
+ uint64_t *countp = &zio->io_children[c][wait];
+ if (*countp != 0) {
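+ /*
+ * Back the stage up one bit so that the pipeline
+ * resumes at this same stage once the children we
+ * are waiting on have completed.
+ */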
+ zio->io_stage >>= 1;
+ ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
+ zio->io_stall = countp;
+ waiting = B_TRUE;
+ break;
+ }
+ }
+ mutex_exit(&zio->io_lock);
+ return (waiting);
+}
+
+__attribute__((always_inline))
+static inline void
+zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
+ zio_t **next_to_executep)
+{
+ uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
+ int *errorp = &pio->io_child_error[zio->io_child_type];
+
+ mutex_enter(&pio->io_lock);
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ *errorp = zio_worst_error(*errorp, zio->io_error);
+ pio->io_reexecute |= zio->io_reexecute;
+ ASSERT3U(*countp, >, 0);
+
+ (*countp)--;
+
+ if (*countp == 0 && pio->io_stall == countp) {
+ zio_taskq_type_t type =
+ pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
+ ZIO_TASKQ_INTERRUPT;
+ pio->io_stall = NULL;
+ mutex_exit(&pio->io_lock);
+
+ /*
+ * If we can tell the caller to execute this parent next, do
+ * so. Otherwise dispatch the parent zio as its own task.
+ *
+ * Having the caller execute the parent when possible reduces
+ * locking on the zio taskqs, reduces context switch
+ * overhead, and has no recursion penalty. Note that one
+ * read from disk typically causes at least 3 zios: a
+ * zio_null(), the logical zio_read(), and then a physical
+ * zio. When the physical zio completes, we are able to call
+ * zio_done() on all 3 of these zios from one invocation of
+ * zio_execute() by returning the parent back to
+ * zio_execute(). Since the parent isn't executed until this
+ * thread returns back to zio_execute(), the caller should do
+ * so promptly.
+ *
+ * In other cases, dispatching the parent prevents
+ * overflowing the stack when we have deeply nested
+ * parent-child relationships, as we do with the "mega zio"
+ * of writes for spa_sync(), and the chain of ZIL blocks.
+ */
+ if (next_to_executep != NULL && *next_to_executep == NULL) {
+ *next_to_executep = pio;
+ } else {
+ zio_taskq_dispatch(pio, type, B_FALSE);
+ }
+ } else {
+ mutex_exit(&pio->io_lock);
+ }
+}
+
+static void
+zio_inherit_child_errors(zio_t *zio, enum zio_child c)
+{
+ if (zio->io_child_error[c] != 0 && zio->io_error == 0)
+ zio->io_error = zio->io_child_error[c];
+}
+
+int
+zio_bookmark_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
+ return (-1);
+ if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
+ return (1);
+
+ if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
+ return (-1);
+ if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
+ return (1);
+
+ if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
+ return (-1);
+ if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
+ return (1);
+
+ if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
+ return (-1);
+ if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
+ return (1);
+
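+ /* Equal bookmarks: fall back to the zio addresses for a total order. */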
+ if (z1 < z2)
+ return (-1);
+ if (z1 > z2)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * ==========================================================================
+ * Create the various types of I/O (read, write, free, etc)
+ * ==========================================================================
+ */
+static zio_t *
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
+ void *private, zio_type_t type, zio_priority_t priority,
+ enum zio_flag flags, vdev_t *vd, uint64_t offset,
+ const zbookmark_phys_t *zb, enum zio_stage stage,
+ enum zio_stage pipeline)
+{
+ zio_t *zio;
+
+ IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
+ ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
+
+ ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
+ ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
+ ASSERT(vd || stage == ZIO_STAGE_OPEN);
+
+ IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
+
+ zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
+ bzero(zio, sizeof (zio_t));
+
+ mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
+ cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
+
+ list_create(&zio->io_parent_list, sizeof (zio_link_t),
+ offsetof(zio_link_t, zl_parent_node));
+ list_create(&zio->io_child_list, sizeof (zio_link_t),
+ offsetof(zio_link_t, zl_child_node));
+ metaslab_trace_init(&zio->io_alloc_list);
+
+ if (vd != NULL)
+ zio->io_child_type = ZIO_CHILD_VDEV;
+ else if (flags & ZIO_FLAG_GANG_CHILD)
+ zio->io_child_type = ZIO_CHILD_GANG;
+ else if (flags & ZIO_FLAG_DDT_CHILD)
+ zio->io_child_type = ZIO_CHILD_DDT;
+ else
+ zio->io_child_type = ZIO_CHILD_LOGICAL;
+
+ if (bp != NULL) {
+ zio->io_bp = (blkptr_t *)bp;
+ zio->io_bp_copy = *bp;
+ zio->io_bp_orig = *bp;
+ if (type != ZIO_TYPE_WRITE ||
+ zio->io_child_type == ZIO_CHILD_DDT)
+ zio->io_bp = &zio->io_bp_copy; /* so caller can free */
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL)
+ zio->io_logical = zio;
+ if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
+ pipeline |= ZIO_GANG_STAGES;
+ }
+
+ zio->io_spa = spa;
+ zio->io_txg = txg;
+ zio->io_done = done;
+ zio->io_private = private;
+ zio->io_type = type;
+ zio->io_priority = priority;
+ zio->io_vd = vd;
+ zio->io_offset = offset;
+ zio->io_orig_abd = zio->io_abd = data;
+ zio->io_orig_size = zio->io_size = psize;
+ zio->io_lsize = lsize;
+ zio->io_orig_flags = zio->io_flags = flags;
+ zio->io_orig_stage = zio->io_stage = stage;
+ zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+ zio->io_pipeline_trace = ZIO_STAGE_OPEN;
+
+ zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
+ zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
+
+ if (zb != NULL)
+ zio->io_bookmark = *zb;
+
+ if (pio != NULL) {
+ if (zio->io_metaslab_class == NULL)
+ zio->io_metaslab_class = pio->io_metaslab_class;
+ if (zio->io_logical == NULL)
+ zio->io_logical = pio->io_logical;
+ if (zio->io_child_type == ZIO_CHILD_GANG)
+ zio->io_gang_leader = pio->io_gang_leader;
+ zio_add_child(pio, zio);
+ }
+
+ taskq_init_ent(&zio->io_tqent);
+
+ return (zio);
+}
+
+static void
+zio_destroy(zio_t *zio)
+{
+ metaslab_trace_fini(&zio->io_alloc_list);
+ list_destroy(&zio->io_parent_list);
+ list_destroy(&zio->io_child_list);
+ mutex_destroy(&zio->io_lock);
+ cv_destroy(&zio->io_cv);
+ kmem_cache_free(zio_cache, zio);
+}
+
+zio_t *
+zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
+ void *private, enum zio_flag flags)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
+ ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
+ ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
+
+ return (zio);
+}
+
+zio_t *
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
+{
+ return (zio_null(NULL, spa, NULL, done, private, flags));
+}
+
+static int
+zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
+ enum blk_verify_flag blk_verify, const char *fmt, ...)
+{
+ va_list adx;
+ char buf[256];
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ switch (blk_verify) {
+ case BLK_VERIFY_HALT:
+ dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
+ zfs_panic_recover("%s: %s", spa_name(spa), buf);
+ break;
+ case BLK_VERIFY_LOG:
+ zfs_dbgmsg("%s: %s", spa_name(spa), buf);
+ break;
+ case BLK_VERIFY_ONLY:
+ break;
+ }
+
+ return (1);
+}
+
+/*
+ * Verify that the block pointer fields contain reasonable values. This
+ * means the bp only contains known object types, checksum/compression
+ * identifiers, block sizes within the maximum allowed limits, valid DVAs,
+ * etc.
+ *
+ * If everything checks out B_TRUE is returned. The blk_verify argument
+ * controls the behavior when an invalid field is detected.
+ *
+ * Modes for zfs_blkptr_verify:
+ * 1) BLK_VERIFY_ONLY (evaluate the block)
+ * 2) BLK_VERIFY_LOG (evaluate the block and log problems)
+ * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error)
+ */
+boolean_t
+zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
+ enum blk_verify_flag blk_verify)
+{
+ int errors = 0;
+
+ if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid TYPE %llu",
+ bp, (longlong_t)BP_GET_TYPE(bp));
+ }
+ if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
+ BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid CHECKSUM %llu",
+ bp, (longlong_t)BP_GET_CHECKSUM(bp));
+ }
+ if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
+ BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid COMPRESS %llu",
+ bp, (longlong_t)BP_GET_COMPRESS(bp));
+ }
+ if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid LSIZE %llu",
+ bp, (longlong_t)BP_GET_LSIZE(bp));
+ }
+ if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid PSIZE %llu",
+ bp, (longlong_t)BP_GET_PSIZE(bp));
+ }
+
+ if (BP_IS_EMBEDDED(bp)) {
+ if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid ETYPE %llu",
+ bp, (longlong_t)BPE_GET_ETYPE(bp));
+ }
+ }
+
+ /*
+ * Do not verify individual DVAs if the config is not trusted. This
+ * will be done once the zio is executed in vdev_mirror_map_alloc.
+ */
+ if (!spa->spa_trust_config)
+ return (B_TRUE);
+
+ if (!config_held)
+ spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
+ else
+ ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER));
+ /*
+ * Pool-specific checks.
+ *
+ * Note: it would be nice to verify that the blk_birth and
+ * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
+ * allows the birth time of log blocks (and dmu_sync()-ed blocks
+ * that are in the log) to be arbitrarily large.
+ */
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+ uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
+
+ if (vdevid >= spa->spa_root_vdev->vdev_children) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p DVA %u has invalid VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
+ if (vd == NULL) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p DVA %u has invalid VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ if (vd->vdev_ops == &vdev_hole_ops) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p DVA %u has hole VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ if (vd->vdev_ops == &vdev_missing_ops) {
+ /*
+ * "missing" vdevs are valid during import, but we
+ * don't have their detailed info (e.g. asize), so
+ * we can't perform any more checks on them.
+ */
+ continue;
+ }
+ uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+ uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
+ if (BP_IS_GANG(bp))
+ asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ if (offset + asize > vd->vdev_asize) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p DVA %u has invalid OFFSET %llu",
+ bp, i, (longlong_t)offset);
+ }
+ }
+ if (errors > 0)
+ dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
+ if (!config_held)
+ spa_config_exit(spa, SCL_VDEV, bp);
+
+ return (errors == 0);
+}
+
+boolean_t
+zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
+{
+ uint64_t vdevid = DVA_GET_VDEV(dva);
+
+ if (vdevid >= spa->spa_root_vdev->vdev_children)
+ return (B_FALSE);
+
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
+ if (vd == NULL)
+ return (B_FALSE);
+
+ if (vd->vdev_ops == &vdev_hole_ops)
+ return (B_FALSE);
+
+ if (vd->vdev_ops == &vdev_missing_ops) {
+ return (B_FALSE);
+ }
+
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t asize = DVA_GET_ASIZE(dva);
+
+ if (BP_IS_GANG(bp))
+ asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ if (offset + asize > vd->vdev_asize)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+zio_t *
+zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
+{
+ zio_t *zio;
+
+ (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER,
+ BLK_VERIFY_HALT);
+
+ zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
+ data, size, size, done, private,
+ ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
+
+ return (zio);
+}
+
+zio_t *
+zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
+ zio_done_func_t *ready, zio_done_func_t *children_ready,
+ zio_done_func_t *physdone, zio_done_func_t *done,
+ void *private, zio_priority_t priority, enum zio_flag flags,
+ const zbookmark_phys_t *zb)
+{
+ zio_t *zio;
+
+ ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
+ zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
+ zp->zp_compress >= ZIO_COMPRESS_OFF &&
+ zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
+ DMU_OT_IS_VALID(zp->zp_type) &&
+ zp->zp_level < 32 &&
+ zp->zp_copies > 0 &&
+ zp->zp_copies <= spa_max_replication(spa));
+
+ zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
+ ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
+
+ zio->io_ready = ready;
+ zio->io_children_ready = children_ready;
+ zio->io_physdone = physdone;
+ zio->io_prop = *zp;
+
+ /*
+ * Data can be NULL if we are going to call zio_write_override() to
+ * provide the already-allocated BP. But we may need the data to
+ * verify a dedup hit (if requested). In this case, don't try to
+ * dedup (just take the already-allocated BP verbatim). Encrypted
+ * dedup blocks need data as well so we also disable dedup in this
+ * case.
+ */
+ if (data == NULL &&
+ (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
+ zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
+ ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+
+ return (zio);
+}
+
+void
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
+{
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+ ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+
+ /*
+ * We must reset the io_prop to match the values that existed
+ * when the bp was first written by dmu_sync(), keeping in mind
+ * that nopwrite and dedup are mutually exclusive.
+ */
+ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
+ zio->io_prop.zp_nopwrite = nopwrite;
+ zio->io_prop.zp_copies = copies;
+ zio->io_bp_override = bp;
+}
+
+void
+zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
+{
+
+ (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT);
+
+ /*
+ * The check for EMBEDDED is a performance optimization. We
+ * process the free here (by ignoring it) rather than
+ * putting it on the list and then processing it in zio_free_sync().
+ */
+ if (BP_IS_EMBEDDED(bp))
+ return;
+ metaslab_check_free(spa, bp);
+
+ /*
+ * Frees that are for the currently-syncing txg, are not going to be
+ * deferred, and will not need to do a read (i.e. not GANG or
+ * DEDUP), can be processed immediately. Otherwise, put them on the
+ * in-memory list for later processing.
+ *
+ * Note that we only defer frees after zfs_sync_pass_deferred_free
+ * when the log space map feature is disabled. [see relevant comment
+ * in spa_sync_iterate_to_convergence()]
+ */
+ if (BP_IS_GANG(bp) ||
+ BP_GET_DEDUP(bp) ||
+ txg != spa->spa_syncing_txg ||
+ (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
+ !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) {
+ bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+ } else {
+ VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL);
+ }
+}
+
+/*
+ * To improve performance, this function may return NULL if we were able
+ * to do the free immediately. This avoids the cost of creating a zio
+ * (and linking it to the parent, etc).
+ */
+zio_t *
+zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ enum zio_flag flags)
+{
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ if (BP_IS_EMBEDDED(bp))
+ return (NULL);
+
+ metaslab_check_free(spa, bp);
+ arc_freed(spa, bp);
+ dsl_scan_freed(spa, bp);
+
+ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) {
+ /*
+ * GANG and DEDUP blocks can induce a read (for the gang block
+ * header, or the DDT), so issue them asynchronously so that
+ * this thread is not tied up.
+ */
+ enum zio_stage stage =
+ ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC;
+
+ return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
+ BP_GET_PSIZE(bp), NULL, NULL,
+ ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
+ flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage));
+ } else {
+ metaslab_free(spa, bp, txg, B_FALSE);
+ return (NULL);
+ }
+}
+
+zio_t *
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ zio_done_func_t *done, void *private, enum zio_flag flags)
+{
+ zio_t *zio;
+
+ (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER,
+ BLK_VERIFY_HALT);
+
+ if (BP_IS_EMBEDDED(bp))
+ return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
+ /*
+ * A claim is an allocation of a specific block. Claims are needed
+ * to support immediate writes in the intent log. The issue is that
+ * immediate writes contain committed data, but in a txg that was
+ * *not* committed. Upon opening the pool after an unclean shutdown,
+ * the intent log claims all blocks that contain immediate write data
+ * so that the SPA knows they're in use.
+ *
+ * All claims *must* be resolved in the first txg -- before the SPA
+ * starts allocating blocks -- so that nothing is allocated twice.
+ * If txg == 0 we just verify that the block is claimable.
+ */
+ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
+ spa_min_claim_txg(spa));
+ ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
+ ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */
+
+ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
+ BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
+ flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+ ASSERT0(zio->io_queued_timestamp);
+
+ return (zio);
+}
+
+zio_t *
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ zio_done_func_t *done, void *private, enum zio_flag flags)
+{
+ zio_t *zio;
+ int c;
+
+ if (vd->vdev_children == 0) {
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
+ ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
+ ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
+
+ zio->io_cmd = cmd;
+ } else {
+ zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
+ done, private, flags));
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ zio_done_func_t *done, void *private, zio_priority_t priority,
+ enum zio_flag flags, enum trim_flag trim_flags)
+{
+ zio_t *zio;
+
+ ASSERT0(vd->vdev_children);
+ ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ ASSERT3U(size, !=, 0);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done,
+ private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL,
+ vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
+ zio->io_trim_flags = trim_flags;
+
+ return (zio);
+}
+
+zio_t *
+zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ abd_t *data, int checksum, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, boolean_t labels)
+{
+ zio_t *zio;
+
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
+ private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
+ offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+
+ zio->io_prop.zp_checksum = checksum;
+
+ return (zio);
+}
+
+zio_t *
+zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ abd_t *data, int checksum, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, boolean_t labels)
+{
+ zio_t *zio;
+
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
+ private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
+ offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+
+ zio->io_prop.zp_checksum = checksum;
+
+ if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
+ /*
+ * zec checksums are necessarily destructive -- they modify
+ * the end of the write buffer to hold the verifier/checksum.
+ * Therefore, we must make a local copy in case the data is
+ * being written to multiple places in parallel.
+ */
+ abd_t *wbuf = abd_alloc_sametype(data, size);
+ abd_copy(wbuf, data, size);
+
+ zio_push_transform(zio, wbuf, size, size, NULL);
+ }
+
+ return (zio);
+}
+
+/*
+ * Create a child I/O to do some work for us.
+ */
+zio_t *
+zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
+ abd_t *data, uint64_t size, int type, zio_priority_t priority,
+ enum zio_flag flags, zio_done_func_t *done, void *private)
+{
+ enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
+ zio_t *zio;
+
+ /*
+ * vdev child I/Os do not propagate their error to the parent.
+ * Therefore, for correct operation the caller *must* check for
+ * and handle the error in the child i/o's done callback.
+ * The only exceptions are i/os that we don't care about
+ * (OPTIONAL or REPAIR).
+ */
+ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
+ done != NULL);
+
+ if (type == ZIO_TYPE_READ && bp != NULL) {
+ /*
+ * If we have the bp, then the child should perform the
+ * checksum and the parent need not. This pushes error
+ * detection as close to the leaves as possible and
+ * eliminates redundant checksums in the interior nodes.
+ */
+ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
+ pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ ASSERT0(vd->vdev_children);
+ offset += VDEV_LABEL_START_SIZE;
+ }
+
+ flags |= ZIO_VDEV_CHILD_FLAGS(pio);
+
+ /*
+ * If we've decided to do a repair, the write is not speculative --
+ * even if the original read was.
+ */
+ if (flags & ZIO_FLAG_IO_REPAIR)
+ flags &= ~ZIO_FLAG_SPECULATIVE;
+
+ /*
+ * If we're creating a child I/O that is not associated with a
+ * top-level vdev, then the child zio is not an allocating I/O.
+ * If this is a retried I/O then we ignore it since we will
+ * have already processed the original allocating I/O.
+ */
+ if (flags & ZIO_FLAG_IO_ALLOCATING &&
+ (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
+ ASSERT(pio->io_metaslab_class != NULL);
+ ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
+ ASSERT(type == ZIO_TYPE_WRITE);
+ ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
+ pio->io_child_type == ZIO_CHILD_GANG);
+
+ flags &= ~ZIO_FLAG_IO_ALLOCATING;
+ }
+
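+ /*
+ * Start the child one stage before VDEV_IO_START so that the
+ * first pipeline stage it executes is the vdev I/O start stage.
+ */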
+ zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
+ done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
+ ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+
+ zio->io_physdone = pio->io_physdone;
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
+ zio->io_logical->io_phys_children++;
+
+ return (zio);
+}
+
+zio_t *
+zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
+ zio_type_t type, zio_priority_t priority, enum zio_flag flags,
+ zio_done_func_t *done, void *private)
+{
+ zio_t *zio;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
+ data, size, size, done, private, type, priority,
+ flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
+ vd, offset, NULL,
+ ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
+
+ return (zio);
+}
+
+void
+zio_flush(zio_t *zio, vdev_t *vd)
+{
+ zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
+}
+
+void
+zio_shrink(zio_t *zio, uint64_t size)
+{
+ ASSERT3P(zio->io_executor, ==, NULL);
+ ASSERT3U(zio->io_orig_size, ==, zio->io_size);
+ ASSERT3U(size, <=, zio->io_size);
+
+ /*
+ * We don't shrink for raidz because of problems with the
+ * reconstruction when reading back less than the block size.
+ * Note, BP_IS_RAIDZ() assumes no compression.
+ */
+ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+ if (!BP_IS_RAIDZ(zio->io_bp)) {
+ /* we are not doing a raw write */
+ ASSERT3U(zio->io_size, ==, zio->io_lsize);
+ zio->io_orig_size = zio->io_size = zio->io_lsize = size;
+ }
+}
+
+/*
+ * ==========================================================================
+ * Prepare to read and write logical blocks
+ * ==========================================================================
+ */
+
+static zio_t *
+zio_read_bp_init(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint64_t psize =
+ BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
+
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
+ zio->io_child_type == ZIO_CHILD_LOGICAL &&
+ !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
+ zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
+ psize, psize, zio_decompress);
+ }
+
+ if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
+ BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
+ zio->io_child_type == ZIO_CHILD_LOGICAL) {
+ zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
+ psize, psize, zio_decrypt);
+ }
+
+ if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
+ int psize = BPE_GET_PSIZE(bp);
+ void *data = abd_borrow_buf(zio->io_abd, psize);
+
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ decode_embedded_bp_compressed(bp, data);
+ abd_return_buf_copy(zio->io_abd, data, psize);
+ } else {
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+ }
+
+ if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
+ zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
+
+ return (zio);
+}
+
+static zio_t *
+zio_write_bp_init(zio_t *zio)
+{
+ if (!IO_IS_ALLOCATING(zio))
+ return (zio);
+
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+
+ if (zio->io_bp_override) {
+ blkptr_t *bp = zio->io_bp;
+ zio_prop_t *zp = &zio->io_prop;
+
+ ASSERT(bp->blk_birth != zio->io_txg);
+ ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
+
+ *bp = *zio->io_bp_override;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (BP_IS_EMBEDDED(bp))
+ return (zio);
+
+ /*
+ * If we've been overridden and nopwrite is set then
+ * set the flag accordingly to indicate that a nopwrite
+ * has already occurred.
+ */
+ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
+ ASSERT(!zp->zp_dedup);
+ ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ return (zio);
+ }
+
+ ASSERT(!zp->zp_nopwrite);
+
+ if (BP_IS_HOLE(bp) || !zp->zp_dedup)
+ return (zio);
+
+ ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
+
+ if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
+ !zp->zp_encrypt) {
+ BP_SET_DEDUP(bp, 1);
+ zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
+ return (zio);
+ }
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
+ zio->io_bp_override = NULL;
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
+ }
+
+ return (zio);
+}
+
+static zio_t *
+zio_write_compress(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_prop_t *zp = &zio->io_prop;
+ enum zio_compress compress = zp->zp_compress;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t lsize = zio->io_lsize;
+ uint64_t psize = zio->io_size;
+ int pass = 1;
+
+ /*
+ * If our children haven't all reached the ready stage,
+ * wait for them and then repeat this pipeline stage.
+ */
+ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
+ ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
+ return (NULL);
+ }
+
+ if (!IO_IS_ALLOCATING(zio))
+ return (zio);
+
+ if (zio->io_children_ready != NULL) {
+ /*
+ * Now that all our children are ready, run the callback
+ * associated with this zio in case it wants to modify the
+ * data to be written.
+ */
+ ASSERT3U(zp->zp_level, >, 0);
+ zio->io_children_ready(zio);
+ }
+
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+ ASSERT(zio->io_bp_override == NULL);
+
+ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
+ /*
+ * We're rewriting an existing block, which means we're
+ * working on behalf of spa_sync(). For spa_sync() to
+ * converge, it must eventually be the case that we don't
+ * have to allocate new blocks. But compression changes
+ * the blocksize, which forces a reallocate, and makes
+ * convergence take longer. Therefore, after the first
+ * few passes, stop compressing to ensure convergence.
+ */
+ pass = spa_sync_pass(spa);
+
+ ASSERT(zio->io_txg == spa_syncing_txg(spa));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!BP_GET_DEDUP(bp));
+
+ if (pass >= zfs_sync_pass_dont_compress)
+ compress = ZIO_COMPRESS_OFF;
+
+ /* Make sure someone doesn't change their mind on overwrites */
+ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
+ spa_max_replication(spa)) == BP_GET_NDVAS(bp));
+ }
+
+ /* If it's a compressed write that is not raw, compress the buffer. */
+ if (compress != ZIO_COMPRESS_OFF &&
+ !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
+ void *cbuf = zio_buf_alloc(lsize);
+ psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize,
+ zp->zp_complevel);
+ if (psize == 0 || psize >= lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ zio_buf_free(cbuf, lsize);
+ } else if (!zp->zp_dedup && !zp->zp_encrypt &&
+ psize <= BPE_PAYLOAD_SIZE &&
+ zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
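+ /*
+ * The compressed payload is small enough to be stored
+ * directly in the block pointer (embedded_data).
+ */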
+ encode_embedded_bp_compressed(bp,
+ cbuf, compress, lsize, psize);
+ BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
+ BP_SET_TYPE(bp, zio->io_prop.zp_type);
+ BP_SET_LEVEL(bp, zio->io_prop.zp_level);
+ zio_buf_free(cbuf, lsize);
+ bp->blk_birth = zio->io_txg;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_EMBEDDED_DATA));
+ return (zio);
+ } else {
+ /*
+ * Round compressed size up to the minimum allocation
+ * size of the smallest-ashift device, and zero the
+ * tail. This ensures that the compressed size of the
+ * BP (and thus compressratio property) are correct,
+ * in that we charge for the padding used to fill out
+ * the last sector.
+ */
+ ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT);
+ size_t rounded = (size_t)roundup(psize,
+ spa->spa_min_alloc);
+ if (rounded >= lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ zio_buf_free(cbuf, lsize);
+ psize = lsize;
+ } else {
+ abd_t *cdata = abd_get_from_buf(cbuf, lsize);
+ abd_take_ownership_of_buf(cdata, B_TRUE);
+ abd_zero_off(cdata, psize, rounded - psize);
+ psize = rounded;
+ zio_push_transform(zio, cdata,
+ psize, lsize, NULL);
+ }
+ }
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
+ zio->io_bp_override = NULL;
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
+
+ } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 &&
+ zp->zp_type == DMU_OT_DNODE) {
+ /*
+ * The DMU actually relies on the zio layer's compression
+ * to free metadnode blocks that have had all contained
+ * dnodes freed. As a result, even when doing a raw
+ * receive, we must check whether the block can be compressed
+ * to a hole.
+ */
+ psize = zio_compress_data(ZIO_COMPRESS_EMPTY,
+ zio->io_abd, NULL, lsize, zp->zp_complevel);
+ if (psize == 0 || psize >= lsize)
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT3U(psize, !=, 0);
+ }
+
+ /*
+ * The final pass of spa_sync() must be all rewrites, but the first
+ * few passes offer a trade-off: allocating blocks defers convergence,
+ * but newly allocated blocks are sequential, so they can be written
+ * to disk faster. Therefore, we allow the first few passes of
+ * spa_sync() to allocate new blocks, but force rewrites after that.
+ * There should only be a handful of blocks after pass 1 in any case.
+ */
+ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
+ BP_GET_PSIZE(bp) == psize &&
+ pass >= zfs_sync_pass_rewrite) {
+ VERIFY3U(psize, !=, 0);
+ enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+
+ zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
+ zio->io_flags |= ZIO_FLAG_IO_REWRITE;
+ } else {
+ BP_ZERO(bp);
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ }
+
+ if (psize == 0) {
+ if (zio->io_bp_orig.blk_birth != 0 &&
+ spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, zp->zp_type);
+ BP_SET_LEVEL(bp, zp->zp_level);
+ BP_SET_BIRTH(bp, zio->io_txg, 0);
+ }
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ } else {
+ ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, zp->zp_type);
+ BP_SET_LEVEL(bp, zp->zp_level);
+ BP_SET_PSIZE(bp, psize);
+ BP_SET_COMPRESS(bp, compress);
+ BP_SET_CHECKSUM(bp, zp->zp_checksum);
+ BP_SET_DEDUP(bp, zp->zp_dedup);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ if (zp->zp_dedup) {
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(!zp->zp_encrypt ||
+ DMU_OT_IS_ENCRYPTED(zp->zp_type));
+ zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
+ }
+ if (zp->zp_nopwrite) {
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
+ }
+ }
+ return (zio);
+}
+
+static zio_t *
+zio_free_bp_init(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
+ if (BP_GET_DEDUP(bp))
+ zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
+ }
+
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+
+ return (zio);
+}
+
+/*
+ * ==========================================================================
+ * Execute the I/O pipeline
+ * ==========================================================================
+ */
+
+static void
+zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
+{
+ spa_t *spa = zio->io_spa;
+ zio_type_t t = zio->io_type;
+ int flags = (cutinline ? TQ_FRONT : 0);
+
+ /*
+ * If we're a config writer or a probe, the normal issue and
+ * interrupt threads may all be blocked waiting for the config lock.
+ * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
+ */
+ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
+ t = ZIO_TYPE_NULL;
+
+ /*
+ * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
+ */
+ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
+ t = ZIO_TYPE_NULL;
+
+ /*
+ * If this is a high priority I/O, then use the high priority taskq if
+ * available.
+ */
+ if ((zio->io_priority == ZIO_PRIORITY_NOW ||
+ zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
+ spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
+ q++;
+
+ ASSERT3U(q, <, ZIO_TASKQ_TYPES);
+
+ /*
+ * NB: We are assuming that the zio can only be dispatched
+ * to a single taskq at a time. It would be a grievous error
+ * to dispatch the zio to another taskq at the same time.
+ */
+ ASSERT(taskq_empty_ent(&zio->io_tqent));
+ spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
+ flags, &zio->io_tqent);
+}
+
+static boolean_t
+zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
+{
+ spa_t *spa = zio->io_spa;
+
+ taskq_t *tq = taskq_of_curthread();
+
+ for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ uint_t i;
+ for (i = 0; i < tqs->stqs_count; i++) {
+ if (tqs->stqs_taskq[i] == tq)
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static zio_t *
+zio_issue_async(zio_t *zio)
+{
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+
+ return (NULL);
+}
+
+void
+zio_interrupt(zio_t *zio)
+{
+ zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
+}
+
+void
+zio_delay_interrupt(zio_t *zio)
+{
+ /*
+ * The timeout_generic() function isn't defined in userspace, so
+ * rather than trying to implement the function, the zio delay
+ * functionality has been disabled for userspace builds.
+ */
+
+#ifdef _KERNEL
+ /*
+ * If io_target_timestamp is zero, then no delay has been registered
+ * for this IO, so jump to the end of this function and "skip" the
+ * delay, issuing it directly to the zio layer.
+ */
+ if (zio->io_target_timestamp != 0) {
+ hrtime_t now = gethrtime();
+
+ if (now >= zio->io_target_timestamp) {
+ /*
+ * This IO has already taken longer than the target
+ * delay to complete, so we don't want to delay it
+ * any longer; we "miss" the delay and issue it
+ * directly to the zio layer. This is likely due to
+ * the target latency being set to a value less than
+ * the underlying hardware can satisfy (e.g. delay
+ * set to 1ms, but the disks take 10ms to complete an
+ * IO request).
+ */
+
+ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
+ hrtime_t, now);
+
+ zio_interrupt(zio);
+ } else {
+ taskqid_t tid;
+ hrtime_t diff = zio->io_target_timestamp - now;
+ clock_t expire_at_tick = ddi_get_lbolt() +
+ NSEC_TO_TICK(diff);
+
+ DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
+ hrtime_t, now, hrtime_t, diff);
+
+ if (NSEC_TO_TICK(diff) == 0) {
+ /* Our delay is less than a jiffy - just spin */
+ zfs_sleep_until(zio->io_target_timestamp);
+ zio_interrupt(zio);
+ } else {
+ /*
+ * Use taskq_dispatch_delay() in place of
+ * OpenZFS's timeout_generic().
+ */
+ tid = taskq_dispatch_delay(system_taskq,
+ (task_func_t *)zio_interrupt,
+ zio, TQ_NOSLEEP, expire_at_tick);
+ if (tid == TASKQID_INVALID) {
+ /*
+ * Couldn't allocate a task. Just
+ * finish the zio without a delay.
+ */
+ zio_interrupt(zio);
+ }
+ }
+ }
+ return;
+ }
+#endif
+ DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
+ zio_interrupt(zio);
+}
+
+static void
+zio_deadman_impl(zio_t *pio, int ziodepth)
+{
+ zio_t *cio, *cio_next;
+ zio_link_t *zl = NULL;
+ vdev_t *vd = pio->io_vd;
+
+ if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) {
+ vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL;
+ zbookmark_phys_t *zb = &pio->io_bookmark;
+ uint64_t delta = gethrtime() - pio->io_timestamp;
+ uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
+
+ zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu "
+ "delta=%llu queued=%llu io=%llu "
+ "path=%s last=%llu "
+ "type=%d priority=%d flags=0x%x "
+ "stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
+ "objset=%llu object=%llu level=%llu blkid=%llu "
+ "offset=%llu size=%llu error=%d",
+ ziodepth, pio, pio->io_timestamp,
+ delta, pio->io_delta, pio->io_delay,
+ vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0,
+ pio->io_type, pio->io_priority, pio->io_flags,
+ pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+ pio->io_offset, pio->io_size, pio->io_error);
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
+ pio->io_spa, vd, zb, pio, 0);
+
+ if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
+ taskq_empty_ent(&pio->io_tqent)) {
+ zio_interrupt(pio);
+ }
+ }
+
+ mutex_enter(&pio->io_lock);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
+ zio_deadman_impl(cio, ziodepth + 1);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+/*
+ * Log the critical information describing this zio and all of its children
+ * using the zfs_dbgmsg() interface, then post a deadman event for the ZED.
+ */
+void
+zio_deadman(zio_t *pio, char *tag)
+{
+ spa_t *spa = pio->io_spa;
+ char *name = spa_name(spa);
+
+ if (!zfs_deadman_enabled || spa_suspended(spa))
+ return;
+
+ zio_deadman_impl(pio, 0);
+
+ switch (spa_get_deadman_failmode(spa)) {
+ case ZIO_FAILURE_MODE_WAIT:
+ zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name);
+ break;
+
+ case ZIO_FAILURE_MODE_CONTINUE:
+ zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name);
+ break;
+
+ case ZIO_FAILURE_MODE_PANIC:
+ fm_panic("%s determined I/O to pool '%s' is hung.", tag, name);
+ break;
+ }
+}
+
+/*
+ * Execute the I/O pipeline until one of the following occurs:
+ * (1) the I/O completes; (2) the pipeline stalls waiting for
+ * dependent child I/Os; (3) the I/O issues, so we're waiting
+ * for an I/O completion interrupt; (4) the I/O is delegated by
+ * vdev-level caching or aggregation; (5) the I/O is deferred
+ * due to vdev-level queueing; (6) the I/O is handed off to
+ * another thread. In all cases, the pipeline stops whenever
+ * there's no CPU work; it never burns a thread in cv_wait_io().
+ *
+ * There's no locking on io_stage because there's no legitimate way
+ * for multiple threads to be attempting to process the same I/O.
+ */
+static zio_pipe_stage_t *zio_pipeline[];
+
+/*
+ * zio_execute() is a wrapper around the static function
+ * __zio_execute() so that we can force __zio_execute() to be
+ * inlined. This reduces stack overhead which is important
+ * because __zio_execute() is called recursively in several zio
+ * code paths. zio_execute() itself cannot be inlined because
+ * it is externally visible.
+ */
+void
+zio_execute(zio_t *zio)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ __zio_execute(zio);
+ spl_fstrans_unmark(cookie);
+}
+
+/*
+ * Used to determine whether the stack in the current context is large
+ * enough to allow zio_execute() to be called recursively. A minimum
+ * stack size of 16K is required to avoid needing to re-dispatch the zio.
+ */
+static boolean_t
+zio_execute_stack_check(zio_t *zio)
+{
+#if !defined(HAVE_LARGE_STACKS)
+ dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
+
+ /* Executing in txg_sync_thread() context. */
+ if (dp && curthread == dp->dp_tx.tx_sync_thread)
+ return (B_TRUE);
+
+ /* Pool initialization outside of zio_taskq context. */
+ if (dp && spa_is_initializing(dp->dp_spa) &&
+ !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
+ !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
+ return (B_TRUE);
+#endif /* HAVE_LARGE_STACKS */
+
+ return (B_FALSE);
+}
+
+__attribute__((always_inline))
+static inline void
+__zio_execute(zio_t *zio)
+{
+ ASSERT3U(zio->io_queued_timestamp, >, 0);
+
+ while (zio->io_stage < ZIO_STAGE_DONE) {
+ enum zio_stage pipeline = zio->io_pipeline;
+ enum zio_stage stage = zio->io_stage;
+
+ zio->io_executor = curthread;
+
+ ASSERT(!MUTEX_HELD(&zio->io_lock));
+ ASSERT(ISP2(stage));
+ ASSERT(zio->io_stall == NULL);
+
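+ /* Advance to the next stage that is present in this zio's pipeline. */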
+ do {
+ stage <<= 1;
+ } while ((stage & pipeline) == 0);
+
+ ASSERT(stage <= ZIO_STAGE_DONE);
+
+ /*
+ * If we are in interrupt context and this pipeline stage
+ * will grab a config lock that is held across I/O,
+ * or may wait for an I/O that needs an interrupt thread
+ * to complete, issue async to avoid deadlock.
+ *
+ * For VDEV_IO_START, we cut in line so that the io will
+ * be sent to disk promptly.
+ */
+ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
+ zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
+ boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+ zio_requeue_io_start_cut_in_line : B_FALSE;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
+ return;
+ }
+
+ /*
+ * If the current context doesn't have a large enough stack, the zio
+ * must be issued asynchronously to prevent overflow.
+ */
+ if (zio_execute_stack_check(zio)) {
+ boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+ zio_requeue_io_start_cut_in_line : B_FALSE;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
+ return;
+ }
+
+ zio->io_stage = stage;
+ zio->io_pipeline_trace |= zio->io_stage;
+
+ /*
+ * The zio pipeline stage returns the next zio to execute
+ * (typically the same as this one), or NULL if we should
+ * stop.
+ */
+ zio = zio_pipeline[highbit64(stage) - 1](zio);
+
+ if (zio == NULL)
+ return;
+ }
+}
+
+
+/*
+ * ==========================================================================
+ * Initiate I/O, either sync or async
+ * ==========================================================================
+ */
+int
+zio_wait(zio_t *zio)
+{
+ /*
+ * Some routines, like zio_free_sync(), may return a NULL zio
+ * to avoid the performance overhead of creating and then destroying
+ * an unneeded zio. For the callers' simplicity, we accept a NULL
+ * zio and ignore it.
+ */
+ if (zio == NULL)
+ return (0);
+
+ long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
+ int error;
+
+ ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
+ ASSERT3P(zio->io_executor, ==, NULL);
+
+ zio->io_waiter = curthread;
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
+
+ __zio_execute(zio);
+
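+ /*
+ * Wait for the zio to complete. If the deadman is enabled and the
+ * timed wait expires after the zio has been outstanding longer than
+ * the deadman threshold, log it via zio_deadman() and keep waiting,
+ * rechecking at the deadman check interval.
+ */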
+ mutex_enter(&zio->io_lock);
+ while (zio->io_executor != NULL) {
+ error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
+ ddi_get_lbolt() + timeout);
+
+ if (zfs_deadman_enabled && error == -1 &&
+ gethrtime() - zio->io_queued_timestamp >
+ spa_deadman_ziotime(zio->io_spa)) {
+ mutex_exit(&zio->io_lock);
+ timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
+ zio_deadman(zio, FTAG);
+ mutex_enter(&zio->io_lock);
+ }
+ }
+ mutex_exit(&zio->io_lock);
+
+ error = zio->io_error;
+ zio_destroy(zio);
+
+ return (error);
+}
+
+void
+zio_nowait(zio_t *zio)
+{
+ /*
+ * See comment in zio_wait().
+ */
+ if (zio == NULL)
+ return;
+
+ ASSERT3P(zio->io_executor, ==, NULL);
+
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
+ zio_unique_parent(zio) == NULL) {
+ zio_t *pio;
+
+ /*
+ * This is a logical async I/O with no parent to wait for it.
+ * We add it to the spa_async_root_zio "Godfather" I/O, which
+ * will ensure it completes prior to unloading the pool.
+ */
+ spa_t *spa = zio->io_spa;
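+ /* There is one "Godfather" root zio per CPU; spread the load across them. */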
+ pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE];
+
+ zio_add_child(pio, zio);
+ }
+
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
+ __zio_execute(zio);
+}
+
+/*
+ * ==========================================================================
+ * Reexecute, cancel, or suspend/resume failed I/O
+ * ==========================================================================
+ */
+
+static void
+zio_reexecute(zio_t *pio)
+{
+ zio_t *cio, *cio_next;
+
+ ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
+ ASSERT(pio->io_gang_leader == NULL);
+ ASSERT(pio->io_gang_tree == NULL);
+
+ pio->io_flags = pio->io_orig_flags;
+ pio->io_stage = pio->io_orig_stage;
+ pio->io_pipeline = pio->io_orig_pipeline;
+ pio->io_reexecute = 0;
+ pio->io_flags |= ZIO_FLAG_REEXECUTED;
+ pio->io_pipeline_trace = 0;
+ pio->io_error = 0;
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ pio->io_state[w] = 0;
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++)
+ pio->io_child_error[c] = 0;
+
+ if (IO_IS_ALLOCATING(pio))
+ BP_ZERO(pio->io_bp);
+
+ /*
+ * As we reexecute pio's children, new children could be created.
+ * New children go to the head of pio's io_child_list, however,
+ * so we will (correctly) not reexecute them. The key is that
+ * the remainder of pio's io_child_list, from 'cio_next' onward,
+ * cannot be affected by any side effects of reexecuting 'cio'.
+ */
+ zio_link_t *zl = NULL;
+ mutex_enter(&pio->io_lock);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ pio->io_children[cio->io_child_type][w]++;
+ mutex_exit(&pio->io_lock);
+ zio_reexecute(cio);
+ mutex_enter(&pio->io_lock);
+ }
+ mutex_exit(&pio->io_lock);
+
+ /*
+ * Now that all children have been reexecuted, execute the parent.
+ * We don't reexecute "The Godfather" I/O here as it's the
+ * responsibility of the caller to wait on it.
+ */
+ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
+ pio->io_queued_timestamp = gethrtime();
+ __zio_execute(pio);
+ }
+}
+
+void
+zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
+{
+ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
+ fm_panic("Pool '%s' has encountered an uncorrectable I/O "
+ "failure and the failure mode property for this pool "
+ "is set to panic.", spa_name(spa));
+
+ cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
+ "failure and has been suspended.\n", spa_name(spa));
+
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
+ NULL, NULL, 0);
+
+ mutex_enter(&spa->spa_suspend_lock);
+
+ if (spa->spa_suspend_zio_root == NULL)
+ spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+
+ spa->spa_suspended = reason;
+
+ if (zio != NULL) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
+ ASSERT(zio != spa->spa_suspend_zio_root);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(zio_unique_parent(zio) == NULL);
+ ASSERT(zio->io_stage == ZIO_STAGE_DONE);
+ zio_add_child(spa->spa_suspend_zio_root, zio);
+ }
+
+ mutex_exit(&spa->spa_suspend_lock);
+}
+
+int
+zio_resume(spa_t *spa)
+{
+ zio_t *pio;
+
+ /*
+ * Reexecute all previously suspended i/o.
+ */
+ mutex_enter(&spa->spa_suspend_lock);
+ spa->spa_suspended = ZIO_SUSPEND_NONE;
+ cv_broadcast(&spa->spa_suspend_cv);
+ pio = spa->spa_suspend_zio_root;
+ spa->spa_suspend_zio_root = NULL;
+ mutex_exit(&spa->spa_suspend_lock);
+
+ if (pio == NULL)
+ return (0);
+
+ zio_reexecute(pio);
+ return (zio_wait(pio));
+}
+
+void
+zio_resume_wait(spa_t *spa)
+{
+ mutex_enter(&spa->spa_suspend_lock);
+ while (spa_suspended(spa))
+ cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
+ mutex_exit(&spa->spa_suspend_lock);
+}
+
+/*
+ * ==========================================================================
+ * Gang blocks.
+ *
+ * A gang block is a collection of small blocks that looks to the DMU
+ * like one large block. When zio_dva_allocate() cannot find a block
+ * of the requested size, due to either severe fragmentation or the pool
+ * being nearly full, it calls zio_write_gang_block() to construct the
+ * block from smaller fragments.
+ *
+ * A gang block consists of a gang header (zio_gbh_phys_t) and up to
+ * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
+ * an indirect block: it's an array of block pointers. It consumes
+ * only one sector and hence is allocatable regardless of fragmentation.
+ * The gang header's bps point to its gang members, which hold the data.
+ *
+ * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
+ * as the verifier to ensure uniqueness of the SHA256 checksum.
+ * Critically, the gang block bp's blk_cksum is the checksum of the data,
+ * not the gang header. This ensures that data block signatures (needed for
+ * deduplication) are independent of how the block is physically stored.
+ *
+ * Gang blocks can be nested: a gang member may itself be a gang block.
+ * Thus every gang block is a tree in which the root and all interior nodes
+ * are
+ * gang headers, and the leaves are normal blocks that contain user data.
+ * The root of the gang tree is called the gang leader.
+ *
+ * To perform any operation (read, rewrite, free, claim) on a gang block,
+ * zio_gang_assemble() first assembles the gang tree (minus data leaves)
+ * in the io_gang_tree field of the original logical i/o by recursively
+ * reading the gang leader and all gang headers below it. This yields
+ * an in-core tree containing the contents of every gang header and the
+ * bps for every constituent of the gang block.
+ *
+ * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
+ * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
+ * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
+ * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
+ * zio_read_gang() is a wrapper around zio_read() that omits reading gang
+ * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
+ * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
+ * of the gang header plus zio_checksum_compute() of the data to update the
+ * gang header's blk_cksum as described above.
+ *
+ * The two-phase assemble/issue model solves the problem of partial failure --
+ * what if you'd freed part of a gang block but then couldn't read the
+ * gang header for another part? Assembling the entire gang tree first
+ * ensures that all the necessary gang header I/O has succeeded before
+ * starting the actual work of free, claim, or write. Once the gang tree
+ * is assembled, free and claim are in-memory operations that cannot fail.
+ *
+ * In the event that a gang write fails, zio_dva_unallocate() walks the
+ * gang tree to immediately free (i.e. insert back into the space map)
+ * everything we've allocated. This ensures that we don't get ENOSPC
+ * errors during repeated suspend/resume cycles due to a flaky device.
+ *
+ * Gang rewrites only happen during sync-to-convergence. If we can't assemble
+ * the gang tree, we won't modify the block, so we can safely defer the free
+ * (knowing that the block is still intact). If we *can* assemble the gang
+ * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
+ * each constituent bp and we can allocate a new block on the next sync pass.
+ *
+ * In all cases, the gang tree allows complete recovery from partial failure.
+ * ==========================================================================
+ */
+
+static void
+zio_gang_issue_func_done(zio_t *zio)
+{
+ abd_free(zio->io_abd);
+}
+
+static zio_t *
+zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ if (gn != NULL)
+ return (pio);
+
+ return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
+ BP_GET_PSIZE(bp), zio_gang_issue_func_done,
+ NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ &pio->io_bookmark));
+}
+
+static zio_t *
+zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ zio_t *zio;
+
+ if (gn != NULL) {
+ abd_t *gbh_abd =
+ abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
+ gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
+ pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ &pio->io_bookmark);
+ /*
+ * As we rewrite each gang header, the pipeline will compute
+ * a new gang block header checksum for it; but no one will
+ * compute a new data checksum, so we do that here. The one
+ * exception is the gang leader: the pipeline already computed
+ * its data checksum because that stage precedes gang assembly.
+ * (Presently, nothing actually uses interior data checksums;
+ * this is just good hygiene.)
+ */
+ if (gn != pio->io_gang_leader->io_gang_tree) {
+ abd_t *buf = abd_get_offset(data, offset);
+
+ zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
+ buf, BP_GET_PSIZE(bp));
+
+ abd_free(buf);
+ }
+ /*
+ * If we are here to damage data for testing purposes,
+ * leave the GBH alone so that we can detect the damage.
+ */
+ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ } else {
+ zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
+ abd_get_offset(data, offset), BP_GET_PSIZE(bp),
+ zio_gang_issue_func_done, NULL, pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+ }
+
+ return (zio);
+}
+
+/* ARGSUSED */
+static zio_t *
+zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+ ZIO_GANG_CHILD_FLAGS(pio));
+ if (zio == NULL) {
+ zio = zio_null(pio, pio->io_spa,
+ NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio));
+ }
+ return (zio);
+}
+
+/* ARGSUSED */
+static zio_t *
+zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
+ NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
+}
+
+static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
+ NULL,
+ zio_read_gang,
+ zio_rewrite_gang,
+ zio_free_gang,
+ zio_claim_gang,
+ NULL
+};
+
+static void zio_gang_tree_assemble_done(zio_t *zio);
+
+static zio_gang_node_t *
+zio_gang_node_alloc(zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn;
+
+ ASSERT(*gnpp == NULL);
+
+ gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
+ gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
+ *gnpp = gn;
+
+ return (gn);
+}
+
+static void
+zio_gang_node_free(zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn = *gnpp;
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ ASSERT(gn->gn_child[g] == NULL);
+
+ zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ kmem_free(gn, sizeof (*gn));
+ *gnpp = NULL;
+}
+
+static void
+zio_gang_tree_free(zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn = *gnpp;
+
+ if (gn == NULL)
+ return;
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ zio_gang_tree_free(&gn->gn_child[g]);
+
+ zio_gang_node_free(gnpp);
+}
+
+static void
+zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
+ abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+
+ ASSERT(gio->io_gang_leader == gio);
+ ASSERT(BP_IS_GANG(bp));
+
+ zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_gang_tree_assemble_done, gn, gio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
+}
+
+static void
+zio_gang_tree_assemble_done(zio_t *zio)
+{
+ zio_t *gio = zio->io_gang_leader;
+ zio_gang_node_t *gn = zio->io_private;
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(gio == zio_unique_parent(zio));
+ ASSERT(zio->io_child_count == 0);
+
+ if (zio->io_error)
+ return;
+
+ /* this ABD was created from a linear buf in zio_gang_tree_assemble */
+ if (BP_SHOULD_BYTESWAP(bp))
+ byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
+
+ ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
+ ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+
+ abd_free(zio->io_abd);
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ if (!BP_IS_GANG(gbp))
+ continue;
+ zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
+ }
+}
+
+static void
+zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
+ uint64_t offset)
+{
+ zio_t *gio = pio->io_gang_leader;
+ zio_t *zio;
+
+ ASSERT(BP_IS_GANG(bp) == !!gn);
+ ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
+ ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
+
+ /*
+ * If you're a gang header, your data is in gn->gn_gbh.
+ * If you're a gang member, your data is in 'data' and gn == NULL.
+ */
+ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
+
+ if (gn != NULL) {
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ if (BP_IS_HOLE(gbp))
+ continue;
+ zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
+ offset);
+ offset += BP_GET_PSIZE(gbp);
+ }
+ }
+
+ if (gn == gio->io_gang_tree)
+ ASSERT3U(gio->io_size, ==, offset);
+
+ if (zio != pio)
+ zio_nowait(zio);
+}
+
+static zio_t *
+zio_gang_assemble(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ zio->io_gang_leader = zio;
+
+ zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
+
+ return (zio);
+}
+
+static zio_t *
+zio_gang_issue(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
+ zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
+ 0);
+ else
+ zio_gang_tree_free(&zio->io_gang_tree);
+
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ return (zio);
+}
+
+static void
+zio_write_gang_member_ready(zio_t *zio)
+{
+ zio_t *pio = zio_unique_parent(zio);
+ dva_t *cdva = zio->io_bp->blk_dva;
+ dva_t *pdva = pio->io_bp->blk_dva;
+ uint64_t asize;
+ zio_t *gio __maybe_unused = zio->io_gang_leader;
+
+ if (BP_IS_HOLE(zio->io_bp))
+ return;
+
+ ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
+
+ ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
+ ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
+ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+ ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
+ ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+
+ mutex_enter(&pio->io_lock);
+ for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
+ ASSERT(DVA_GET_GANG(&pdva[d]));
+ asize = DVA_GET_ASIZE(&pdva[d]);
+ asize += DVA_GET_ASIZE(&cdva[d]);
+ DVA_SET_ASIZE(&pdva[d], asize);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+static void
+zio_write_gang_done(zio_t *zio)
+{
+ /*
+ * The io_abd field will be NULL for a zio with no data. The io_flags
+ * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't
+ * check for it here as it is cleared in zio_ready.
+ */
+ if (zio->io_abd != NULL)
+ abd_free(zio->io_abd);
+}
+
+static zio_t *
+zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
+{
+ spa_t *spa = pio->io_spa;
+ blkptr_t *bp = pio->io_bp;
+ zio_t *gio = pio->io_gang_leader;
+ zio_t *zio;
+ zio_gang_node_t *gn, **gnpp;
+ zio_gbh_phys_t *gbh;
+ abd_t *gbh_abd;
+ uint64_t txg = pio->io_txg;
+ uint64_t resid = pio->io_size;
+ uint64_t lsize;
+ int copies = gio->io_prop.zp_copies;
+ int gbh_copies;
+ zio_prop_t zp;
+ int error;
+ boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
+
+ /*
+ * Encrypted blocks need DVA[2] free to hold their encryption parameters,
+ * so encrypted gang headers can't have a third copy.
+ */
+ gbh_copies = MIN(copies + 1, spa_max_replication(spa));
+ if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP)
+ gbh_copies = SPA_DVAS_PER_BP - 1;
+
+ int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(has_data);
+
+ flags |= METASLAB_ASYNC_ALLOC;
+ VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator].
+ mca_alloc_slots, pio));
+
+ /*
+ * The logical zio has already placed a reservation for
+ * 'copies' allocation slots but gang blocks may require
+ * additional copies. These additional copies
+ * (i.e. gbh_copies - copies) are guaranteed to succeed
+ * since metaslab_class_throttle_reserve() always allows
+ * additional reservations for gang blocks.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
+ pio->io_allocator, pio, flags));
+ }
+
+ error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
+ &pio->io_alloc_list, pio, pio->io_allocator);
+ if (error) {
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(has_data);
+
+ /*
+ * If we failed to allocate the gang block header then
+ * we remove any additional allocation reservations that
+ * we placed here. The original reservation will
+ * be removed when the logical I/O goes to the ready
+ * stage.
+ */
+ metaslab_class_throttle_unreserve(mc,
+ gbh_copies - copies, pio->io_allocator, pio);
+ }
+
+ pio->io_error = error;
+ return (pio);
+ }
+
+ if (pio == gio) {
+ gnpp = &gio->io_gang_tree;
+ } else {
+ gnpp = pio->io_private;
+ ASSERT(pio->io_ready == zio_write_gang_member_ready);
+ }
+
+ gn = zio_gang_node_alloc(gnpp);
+ gbh = gn->gn_gbh;
+ bzero(gbh, SPA_GANGBLOCKSIZE);
+ gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
+
+ /*
+ * Create the gang header.
+ */
+ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_write_gang_done, NULL, pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+ /*
+ * Create and nowait the gang children.
+ */
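+ /*
+ * Each child gets an even share of the remaining size,
+ * rounded up to SPA_MINBLOCKSIZE.
+ */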
+ for (int g = 0; resid != 0; resid -= lsize, g++) {
+ lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
+ SPA_MINBLOCKSIZE);
+ ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
+
+ zp.zp_checksum = gio->io_prop.zp_checksum;
+ zp.zp_compress = ZIO_COMPRESS_OFF;
+ zp.zp_complevel = gio->io_prop.zp_complevel;
+ zp.zp_type = DMU_OT_NONE;
+ zp.zp_level = 0;
+ zp.zp_copies = gio->io_prop.zp_copies;
+ zp.zp_dedup = B_FALSE;
+ zp.zp_dedup_verify = B_FALSE;
+ zp.zp_nopwrite = B_FALSE;
+ zp.zp_encrypt = gio->io_prop.zp_encrypt;
+ zp.zp_byteorder = gio->io_prop.zp_byteorder;
+ bzero(zp.zp_salt, ZIO_DATA_SALT_LEN);
+ bzero(zp.zp_iv, ZIO_DATA_IV_LEN);
+ bzero(zp.zp_mac, ZIO_DATA_MAC_LEN);
+
+ zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
+ has_data ? abd_get_offset(pio->io_abd, pio->io_size -
+ resid) : NULL, lsize, lsize, &zp,
+ zio_write_gang_member_ready, NULL, NULL,
+ zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(has_data);
+
+ /*
+ * Gang children won't throttle but we should
+ * account for their work, so reserve an allocation
+ * slot for them here.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc,
+ zp.zp_copies, cio->io_allocator, cio, flags));
+ }
+ zio_nowait(cio);
+ }
+
+ /*
+ * Set pio's pipeline to just wait for zio to finish.
+ */
+ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ /*
+ * We didn't allocate this bp, so make sure it doesn't get unmarked.
+ */
+ pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
+
+ zio_nowait(zio);
+
+ return (pio);
+}
+
+/*
+ * The zio_nop_write stage in the pipeline determines if allocating a
+ * new bp is necessary. The nopwrite feature can handle writes in
+ * either syncing or open context (i.e. zil writes) and as a result is
+ * mutually exclusive with dedup.
+ *
+ * By leveraging a cryptographically secure checksum, such as SHA256, we
+ * can compare the checksums of the new data and the old to determine if
+ * allocating a new block is required. Note that our requirements for
+ * cryptographic strength are fairly weak: there can't be any accidental
+ * hash collisions, but we don't need to be secure against intentional
+ * (malicious) collisions. To trigger a nopwrite, you have to be able
+ * to write the file to begin with, and triggering an incorrect (hash
+ * collision) nopwrite is no worse than simply writing to the file.
+ * That said, there are no known attacks against the checksum algorithms
+ * used for nopwrite, assuming that the salt and the checksums
+ * themselves remain secret.
+ */
+static zio_t *
+zio_nop_write(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ zio_prop_t *zp = &zio->io_prop;
+
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(zp->zp_nopwrite);
+ ASSERT(!zp->zp_dedup);
+ ASSERT(zio->io_bp_override == NULL);
+ ASSERT(IO_IS_ALLOCATING(zio));
+
+ /*
+ * Check to see if the original bp and the new bp have matching
+ * characteristics (i.e. same checksum, compression algorithms, etc).
+ * If they don't then just continue with the pipeline which will
+ * allocate a new bp.
+ */
+ if (BP_IS_HOLE(bp_orig) ||
+ !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE) ||
+ BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
+ BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
+ BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
+ BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
+ zp->zp_copies != BP_GET_NDVAS(bp_orig))
+ return (zio);
+
+ /*
+ * If the checksums match then reset the pipeline so that we
+ * avoid allocating a new bp and issuing any I/O.
+ */
+ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE);
+ ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
+ ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
+ ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
+ ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
+ sizeof (uint64_t)) == 0);
+
+ /*
+ * If we're overwriting a block that is currently on an
+ * indirect vdev, then ignore the nopwrite request and
+ * allow a new block to be allocated on a concrete vdev.
+ */
+ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER);
+ vdev_t *tvd = vdev_lookup_top(zio->io_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]));
+ if (tvd->vdev_ops == &vdev_indirect_ops) {
+ spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
+ return (zio);
+ }
+ spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
+
+ *bp = *bp_orig;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ }
+
+ return (zio);
+}
+
+/*
+ * ==========================================================================
+ * Dedup
+ * ==========================================================================
+ */
+static void
+zio_ddt_child_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp;
+ zio_t *pio = zio_unique_parent(zio);
+
+ mutex_enter(&pio->io_lock);
+ ddp = ddt_phys_select(dde, bp);
+ if (zio->io_error == 0)
+ ddt_phys_clear(ddp); /* this ddp doesn't need repair */
+
+ if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
+ dde->dde_repair_abd = zio->io_abd;
+ else
+ abd_free(zio->io_abd);
+ mutex_exit(&pio->io_lock);
+}
+
+static zio_t *
+zio_ddt_read_start(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+ blkptr_t blk;
+
+ ASSERT(zio->io_vsd == NULL);
+ zio->io_vsd = dde;
+
+ if (ddp_self == NULL)
+ return (zio);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+ &blk);
+ zio_nowait(zio_read(zio, zio->io_spa, &blk,
+ abd_alloc_for_io(zio->io_size, B_TRUE),
+ zio->io_size, zio_ddt_child_read_done, dde,
+ zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
+ ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
+ }
+ return (zio);
+ }
+
+ zio_nowait(zio_read(zio, zio->io_spa, bp,
+ zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+
+ return (zio);
+}
+
+static zio_t *
+zio_ddt_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_vsd;
+ if (ddt == NULL) {
+ ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
+ return (zio);
+ }
+ if (dde == NULL) {
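+ /* Re-run the DDT_READ_START stage so it can set up a repair read. */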
+ zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+ return (NULL);
+ }
+ if (dde->dde_repair_abd != NULL) {
+ abd_copy(zio->io_abd, dde->dde_repair_abd,
+ zio->io_size);
+ zio->io_child_error[ZIO_CHILD_DDT] = 0;
+ }
+ ddt_repair_done(ddt, dde);
+ zio->io_vsd = NULL;
+ }
+
+ ASSERT(zio->io_vsd == NULL);
+
+ return (zio);
+}
+
+static boolean_t
+zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+{
+ spa_t *spa = zio->io_spa;
+ boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW);
+
+ ASSERT(!(zio->io_bp_override && do_raw));
+
+ /*
+ * Note: we compare the original data, not the transformed data,
+ * because when zio->io_bp is an override bp, we will not have
+ * pushed the I/O transforms. That's an important optimization
+ * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+ * However, we should never get a raw, override zio so in these
+ * cases we can compare the io_abd directly. This is useful because
+ * it allows us to do dedup verification even if we don't have access
+ * to the original data (for instance, if the encryption keys aren't
+ * loaded).
+ */
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ zio_t *lio = dde->dde_lead_zio[p];
+
+ if (lio != NULL && do_raw) {
+ return (lio->io_size != zio->io_size ||
+ abd_cmp(zio->io_abd, lio->io_abd) != 0);
+ } else if (lio != NULL) {
+ return (lio->io_orig_size != zio->io_orig_size ||
+ abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
+ }
+ }
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ if (ddp->ddp_phys_birth != 0 && do_raw) {
+ blkptr_t blk = *zio->io_bp;
+ uint64_t psize;
+ abd_t *tmpabd;
+ int error;
+
+ ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+ psize = BP_GET_PSIZE(&blk);
+
+ if (psize != zio->io_size)
+ return (B_TRUE);
+
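+ /* Don't hold the DDT lock across the verification read. */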
+ ddt_exit(ddt);
+
+ tmpabd = abd_alloc_for_io(psize, B_TRUE);
+
+ error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
+ psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_RAW, &zio->io_bookmark));
+
+ if (error == 0) {
+ if (abd_cmp(tmpabd, zio->io_abd) != 0)
+ error = SET_ERROR(ENOENT);
+ }
+
+ abd_free(tmpabd);
+ ddt_enter(ddt);
+ return (error != 0);
+ } else if (ddp->ddp_phys_birth != 0) {
+ arc_buf_t *abuf = NULL;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ blkptr_t blk = *zio->io_bp;
+ int error;
+
+ ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+
+ if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
+ return (B_TRUE);
+
+ ddt_exit(ddt);
+
+ error = arc_read(NULL, spa, &blk,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &zio->io_bookmark);
+
+ if (error == 0) {
+ if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
+ zio->io_orig_size) != 0)
+ error = SET_ERROR(ENOENT);
+ arc_buf_destroy(abuf, &abuf);
+ }
+
+ ddt_enter(ddt);
+ return (error != 0);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static void
+zio_ddt_child_write_ready(zio_t *zio)
+{
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *pio;
+
+ if (zio->io_error)
+ return;
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_lead_zio[p] == zio);
+
+ ddt_phys_fill(ddp, zio->io_bp);
+
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL)
+ ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+
+ ddt_exit(ddt);
+}
+
+static void
+zio_ddt_child_write_done(zio_t *zio)
+{
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ zio_link_t *zl = NULL;
+ while (zio_walk_parents(zio, &zl) != NULL)
+ ddt_phys_addref(ddp);
+ } else {
+ ddt_phys_clear(ddp);
+ }
+
+ ddt_exit(ddt);
+}
+
+static zio_t *
+zio_ddt_write(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t txg = zio->io_txg;
+ zio_prop_t *zp = &zio->io_prop;
+ int p = zp->zp_copies;
+ zio_t *cio = NULL;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+ ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+ ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
+
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_TRUE);
+ ddp = &dde->dde_phys[p];
+
+ if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+ /*
+ * If we're using a weak checksum, upgrade to a strong checksum
+ * and try again. If we're already using a strong checksum,
+ * we can't resolve it, so just convert to an ordinary write.
+ * (And automatically e-mail a paper to Nature?)
+ */
+ if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP)) {
+ zp->zp_checksum = spa_dedup_checksum(spa);
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ BP_ZERO(bp);
+ } else {
+ zp->zp_dedup = B_FALSE;
+ BP_SET_DEDUP(bp, B_FALSE);
+ }
+ ASSERT(!BP_GET_DEDUP(bp));
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ ddt_exit(ddt);
+ return (zio);
+ }
+
+ if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+ if (ddp->ddp_phys_birth != 0)
+ ddt_bp_fill(ddp, bp, txg);
+ if (dde->dde_lead_zio[p] != NULL)
+ zio_add_child(zio, dde->dde_lead_zio[p]);
+ else
+ ddt_phys_addref(ddp);
+ } else if (zio->io_bp_override) {
+ ASSERT(bp->blk_birth == txg);
+ ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+ ddt_phys_fill(ddp, bp);
+ ddt_phys_addref(ddp);
+ } else {
+ cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
+ zio->io_orig_size, zio->io_orig_size, zp,
+ zio_ddt_child_write_ready, NULL, NULL,
+ zio_ddt_child_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[p] = cio;
+ }
+
+ ddt_exit(ddt);
+
+ zio_nowait(cio);
+
+ return (zio);
+}
+
+ddt_entry_t *freedde; /* for debugging */
+
+static zio_t *
+zio_ddt_free(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ ddt_enter(ddt);
+ freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
+ if (dde) {
+ ddp = ddt_phys_select(dde, bp);
+ if (ddp)
+ ddt_phys_decref(ddp);
+ }
+ ddt_exit(ddt);
+
+ return (zio);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+
+static zio_t *
+zio_io_to_allocate(spa_t *spa, int allocator)
+{
+ zio_t *zio;
+
+ ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
+
+ zio = avl_first(&spa->spa_alloc_trees[allocator]);
+ if (zio == NULL)
+ return (NULL);
+
+ ASSERT(IO_IS_ALLOCATING(zio));
+
+ /*
+ * Try to place a reservation for this zio. If we're unable to
+ * reserve then we throttle.
+ */
+ ASSERT3U(zio->io_allocator, ==, allocator);
+ if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
+ zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
+ return (NULL);
+ }
+
+ avl_remove(&spa->spa_alloc_trees[allocator], zio);
+ ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
+
+ return (zio);
+}
+
+static zio_t *
+zio_dva_throttle(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_t *nio;
+ metaslab_class_t *mc;
+
+ /* locate an appropriate allocation class */
+ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
+ zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
+
+ if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
+ !mc->mc_alloc_throttle_enabled ||
+ zio->io_child_type == ZIO_CHILD_GANG ||
+ zio->io_flags & ZIO_FLAG_NODATA) {
+ return (zio);
+ }
+
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ ASSERT3U(zio->io_queued_timestamp, >, 0);
+ ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+
+ zbookmark_phys_t *bm = &zio->io_bookmark;
+ /*
+ * We want to try to use as many allocators as possible to help improve
+ * performance, but we also want logically adjacent IOs to be physically
+ * adjacent to improve sequential read performance. We chunk each object
+ * into 2^20 block regions, and then hash based on the objset, object,
+ * level, and region to accomplish both of these goals.
+ */
+ zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
+ bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
+ mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_metaslab_class = mc;
+ avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
+ nio = zio_io_to_allocate(spa, zio->io_allocator);
+ mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
+ return (nio);
+}
+
+static void
+zio_allocate_dispatch(spa_t *spa, int allocator)
+{
+ zio_t *zio;
+
+ mutex_enter(&spa->spa_alloc_locks[allocator]);
+ zio = zio_io_to_allocate(spa, allocator);
+ mutex_exit(&spa->spa_alloc_locks[allocator]);
+ if (zio == NULL)
+ return;
+
+ ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
+ ASSERT0(zio->io_error);
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
+}
+
+static zio_t *
+zio_dva_allocate(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ metaslab_class_t *mc;
+ blkptr_t *bp = zio->io_bp;
+ int error;
+ int flags = 0;
+
+ if (zio->io_gang_leader == NULL) {
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+ zio->io_gang_leader = zio;
+ }
+
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT0(BP_GET_NDVAS(bp));
+ ASSERT3U(zio->io_prop.zp_copies, >, 0);
+ ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
+ ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+ flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
+ if (zio->io_flags & ZIO_FLAG_NODATA)
+ flags |= METASLAB_DONT_THROTTLE;
+ if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
+ flags |= METASLAB_GANG_CHILD;
+ if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
+ flags |= METASLAB_ASYNC_ALLOC;
+
+ /*
+ * if not already chosen, locate an appropriate allocation class
+ */
+ mc = zio->io_metaslab_class;
+ if (mc == NULL) {
+ mc = spa_preferred_class(spa, zio->io_size,
+ zio->io_prop.zp_type, zio->io_prop.zp_level,
+ zio->io_prop.zp_zpl_smallblk);
+ zio->io_metaslab_class = mc;
+ }
+
+ /*
+ * Try allocating the block in the usual metaslab class.
+ * If that's full, allocate it in the normal class.
+ * If that's full, allocate as a gang block,
+ * and if all are full, the allocation fails (which shouldn't happen).
+ *
+ * Note that we do not fall back on embedded slog (ZIL) space, to
+ * preserve unfragmented slog space, which is critical for decent
+ * sync write performance. If a log allocation fails, we will fall
+ * back to spa_sync() which is abysmal for performance.
+ */
+ error = metaslab_alloc(spa, mc, zio->io_size, bp,
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
+ &zio->io_alloc_list, zio, zio->io_allocator);
+
+ /*
+ * Fall back to the normal class when an alloc class is full
+ */
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ /*
+ * If throttling, transfer reservation over to normal class.
+ * The io_allocator slot can remain the same even though we
+ * are switching classes.
+ */
+ if (mc->mc_alloc_throttle_enabled &&
+ (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
+ metaslab_class_throttle_unreserve(mc,
+ zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
+
+ VERIFY(metaslab_class_throttle_reserve(
+ spa_normal_class(spa),
+ zio->io_prop.zp_copies, zio->io_allocator, zio,
+ flags | METASLAB_MUST_RESERVE));
+ }
+ zio->io_metaslab_class = mc = spa_normal_class(spa);
+ if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
+ zfs_dbgmsg("%s: metaslab allocation failure, "
+ "trying normal class: zio %px, size %llu, error %d",
+ spa_name(spa), zio, zio->io_size, error);
+ }
+
+ error = metaslab_alloc(spa, mc, zio->io_size, bp,
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
+ &zio->io_alloc_list, zio, zio->io_allocator);
+ }
+
+ if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
+ if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
+ zfs_dbgmsg("%s: metaslab allocation failure, "
+ "trying ganging: zio %px, size %llu, error %d",
+ spa_name(spa), zio, zio->io_size, error);
+ }
+ return (zio_write_gang_block(zio, mc));
+ }
+ if (error != 0) {
+ if (error != ENOSPC ||
+ (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) {
+ zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
+ "size %llu, error %d",
+ spa_name(spa), zio, zio->io_size, error);
+ }
+ zio->io_error = error;
+ }
+
+ return (zio);
+}
+
+static zio_t *
+zio_dva_free(zio_t *zio)
+{
+ metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
+
+ return (zio);
+}
+
+static zio_t *
+zio_dva_claim(zio_t *zio)
+{
+ int error;
+
+ error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
+ if (error)
+ zio->io_error = error;
+
+ return (zio);
+}
+
+/*
+ * Undo an allocation. This is used by zio_done() when an I/O fails
+ * and we want to give back the block we just allocated.
+ * This handles both normal blocks and gang blocks.
+ */
+static void
+zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
+{
+ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+ ASSERT(zio->io_bp_override == NULL);
+
+ if (!BP_IS_HOLE(bp))
+ metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
+
+ if (gn != NULL) {
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ zio_dva_unallocate(zio, gn->gn_child[g],
+ &gn->gn_gbh->zg_blkptr[g]);
+ }
+ }
+}
+
+/*
+ * Try to allocate an intent log block. Return 0 on success, errno on failure.
+ */
+int
+zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
+ uint64_t size, boolean_t *slog)
+{
+ int error = 1;
+ zio_alloc_list_t io_alloc_list;
+
+ ASSERT(txg > spa_syncing_txg(spa));
+
+ metaslab_trace_init(&io_alloc_list);
+
+ /*
+ * Block pointer fields are useful to metaslabs for stats and debugging.
+ * Fill in the obvious ones before calling into metaslab_alloc().
+ */
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_LEVEL(new_bp, 0);
+
+ /*
+ * When allocating a zil block, we don't have information about
+ * the final destination of the block except the objset it's part
+ * of, so we just hash the objset ID to pick the allocator to get
+ * some parallelism.
+ */
+ int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
+ int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
+ spa->spa_alloc_count;
+ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
+ txg, NULL, flags, &io_alloc_list, NULL, allocator);
+ *slog = (error == 0);
+ if (error != 0) {
+ error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
+ new_bp, 1, txg, NULL, flags,
+ &io_alloc_list, NULL, allocator);
+ }
+ if (error != 0) {
+ error = metaslab_alloc(spa, spa_normal_class(spa), size,
+ new_bp, 1, txg, NULL, flags,
+ &io_alloc_list, NULL, allocator);
+ }
+ metaslab_trace_fini(&io_alloc_list);
+
+ if (error == 0) {
+ BP_SET_LSIZE(new_bp, size);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(new_bp,
+ spa_version(spa) >= SPA_VERSION_SLIM_ZIL
+ ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_LEVEL(new_bp, 0);
+ BP_SET_DEDUP(new_bp, 0);
+ BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
+
+ /*
+ * encrypted blocks will require an IV and salt. We generate
+ * these now since we will not be rewriting the bp at
+ * rewrite time.
+ */
+ if (os->os_encrypted) {
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+
+ BP_SET_CRYPT(new_bp, B_TRUE);
+ VERIFY0(spa_crypt_get_salt(spa,
+ dmu_objset_id(os), salt));
+ VERIFY0(zio_crypt_generate_iv(iv));
+
+ zio_crypt_encode_params_bp(new_bp, salt, iv);
+ }
+ } else {
+ zfs_dbgmsg("%s: zil block allocation failure: "
+ "size %llu, error %d", spa_name(spa), size, error);
+ }
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * Read and write to physical devices
+ * ==========================================================================
+ */
+
+/*
+ * Issue an I/O to the underlying vdev. Typically the issue pipeline
+ * stops after this stage and will resume upon I/O completion.
+ * However, there are instances where the vdev layer may need to
+ * continue the pipeline when an I/O was not issued. Since the I/O
+ * that was sent to the vdev layer might be different from the one
+ * currently active in the pipeline (see vdev_queue_io()), we explicitly
+ * force the underlying vdev layers to call either zio_execute() or
+ * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
+ */
+static zio_t *
+zio_vdev_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ uint64_t align;
+ spa_t *spa = zio->io_spa;
+
+ zio->io_delay = 0;
+
+ ASSERT(zio->io_error == 0);
+ ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
+
+ if (vd == NULL) {
+ if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
+ spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
+
+ /*
+ * The mirror_ops handle multiple DVAs in a single BP.
+ */
+ vdev_mirror_ops.vdev_op_io_start(zio);
+ return (NULL);
+ }
+
+ ASSERT3P(zio->io_logical, !=, zio);
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ ASSERT(spa->spa_trust_config);
+
+ /*
+ * Note: the code can handle other kinds of writes,
+ * but we don't expect them.
+ */
+ if (zio->io_vd->vdev_removing) {
+ ASSERT(zio->io_flags &
+ (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
+ }
+ }
+
+ align = 1ULL << vd->vdev_top->vdev_ashift;
+
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+ P2PHASE(zio->io_size, align) != 0) {
+ /* Transform logical writes to be a full physical block size. */
+ uint64_t asize = P2ROUNDUP(zio->io_size, align);
+ abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
+ ASSERT(vd == vd->vdev_top);
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ abd_copy(abuf, zio->io_abd, zio->io_size);
+ abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
+ }
+ zio_push_transform(zio, abuf, asize, asize, zio_subblock);
+ }
+
+ /*
+ * If this is not a physical io, make sure that it is properly aligned
+ * before proceeding.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
+ ASSERT0(P2PHASE(zio->io_offset, align));
+ ASSERT0(P2PHASE(zio->io_size, align));
+ } else {
+ /*
+ * For physical writes, we allow 512b aligned writes and assume
+ * the device will perform a read-modify-write as necessary.
+ */
+ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
+ ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
+ }
+
+ VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+
+ /*
+ * If this is a repair I/O, and there's no self-healing involved --
+ * that is, we're just resilvering what we expect to resilver --
+ * then don't do the I/O unless zio's txg is actually in vd's DTL.
+ * This prevents spurious resilvering.
+ *
+ * There are a few ways that we can end up creating these spurious
+ * resilver i/os:
+ *
+ * 1. A resilver i/o will be issued if any DVA in the BP has a
+ * dirty DTL. The mirror code will issue resilver writes to
+ * each DVA, including the one(s) that are not on vdevs with dirty
+ * DTLs.
+ *
+ * 2. With nested replication, which happens when we have a
+ * "replacing" or "spare" vdev that's a child of a mirror or raidz.
+ * For example, given mirror(replacing(A+B), C), it's likely that
+ * only A is out of date (it's the new device). In this case, we'll
+ * read from C, then use the data to resilver A+B -- but we don't
+ * actually want to resilver B, just A. The top-level mirror has no
+ * way to know this, so instead we just discard unnecessary repairs
+ * as we work our way down the vdev tree.
+ *
+ * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
+ * The same logic applies to any form of nested replication: ditto
+ * + mirror, RAID-Z + replacing, etc.
+ *
+ * However, indirect vdevs point off to other vdevs which may have
+ * DTL's, so we never bypass them. The child i/os on concrete vdevs
+ * will be properly bypassed instead.
+ *
+ * Leaf DTL_PARTIAL can be empty when a legitimate write comes from
+ * a dRAID spare vdev. For example, when a dRAID spare is first
+ * used, its spare blocks need to be written to, but the leaf vdevs
+ * of such blocks can have an empty DTL_PARTIAL.
+ *
+ * There seemed to be no clean way to allow such writes while bypassing
+ * spurious ones. At this point, just avoid all bypassing for dRAID
+ * for correctness.
+ */
+ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+ !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
+ zio->io_txg != 0 && /* not a delegated i/o */
+ vd->vdev_ops != &vdev_indirect_ops &&
+ vd->vdev_top->vdev_ops != &vdev_draid_ops &&
+ !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ zio_vdev_io_bypass(zio);
+ return (zio);
+ }
+
+ /*
+ * Select the next best leaf I/O to process. Distributed spares are
+ * excluded since they dispatch the I/O directly to a leaf vdev after
+ * applying the dRAID mapping.
+ */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_ops != &vdev_draid_spare_ops &&
+ (zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_TRIM)) {
+
+ if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
+ return (zio);
+
+ if ((zio = vdev_queue_io(zio)) == NULL)
+ return (NULL);
+
+ if (!vdev_accessible(vd, zio)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return (NULL);
+ }
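+ /* Record the issue time; zio_vdev_io_done() converts it to a latency. */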
+ zio->io_delay = gethrtime();
+ }
+
+ vd->vdev_ops->vdev_op_io_start(zio);
+ return (NULL);
+}
+
+static zio_t *
+zio_vdev_io_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
+ boolean_t unexpected_error = B_FALSE;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
+
+ if (zio->io_delay)
+ zio->io_delay = gethrtime() - zio->io_delay;
+
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_ops != &vdev_draid_spare_ops) {
+ vdev_queue_io_done(zio);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ vdev_cache_write(zio);
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injections(vd, zio,
+ EIO, EILSEQ);
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_label_injection(zio, EIO);
+
+ if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
+ if (!vdev_accessible(vd, zio)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ unexpected_error = B_TRUE;
+ }
+ }
+ }
+
+ ops->vdev_op_io_done(zio);
+
+ if (unexpected_error)
+ VERIFY(vdev_probe(vd, zio) == NULL);
+
+ return (zio);
+}
+
+/*
+ * This function is used to change the priority of an existing zio that is
+ * currently in-flight. This is used by the arc to upgrade priority in the
+ * event that a demand read is made for a block that is currently queued
+ * as a scrub or async read IO. Otherwise, the high priority read request
+ * would end up having to wait for the lower priority IO.
+ */
+void
+zio_change_priority(zio_t *pio, zio_priority_t priority)
+{
+ zio_t *cio, *cio_next;
+ zio_link_t *zl = NULL;
+
+ ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
+ vdev_queue_change_io_priority(pio, priority);
+ } else {
+ pio->io_priority = priority;
+ }
+
+ mutex_enter(&pio->io_lock);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
+ zio_change_priority(cio, priority);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+/*
+ * For non-raidz ZIOs, we can just copy aside the bad data read from the
+ * disk, and use that to finish the checksum ereport later.
+ */
+static void
+zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
+ const abd_t *good_buf)
+{
+ /* no processing needed */
+ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
+}
+
+/*ARGSUSED*/
+void
+zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
+{
+ void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size);
+
+ abd_copy(abd, zio->io_abd, zio->io_size);
+
+ zcr->zcr_cbinfo = zio->io_size;
+ zcr->zcr_cbdata = abd;
+ zcr->zcr_finish = zio_vsd_default_cksum_finish;
+ zcr->zcr_free = zio_abd_free;
+}
+
+static zio_t *
+zio_vdev_io_assess(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
+ spa_config_exit(zio->io_spa, SCL_ZIO, zio);
+
+ if (zio->io_vsd != NULL) {
+ zio->io_vsd_ops->vsd_free(zio);
+ zio->io_vsd = NULL;
+ }
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_fault_injection(zio, EIO);
+
+ /*
+ * If the I/O failed, determine whether we should attempt to retry it.
+ *
+ * On retry, we cut in line in the issue queue, since we don't want
+ * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
+ */
+ if (zio->io_error && vd == NULL &&
+ !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
+ zio->io_error = 0;
+ zio->io_flags |= ZIO_FLAG_IO_RETRY |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
+ zio_requeue_io_start_cut_in_line);
+ return (NULL);
+ }
+
+ /*
+ * If we got an error on a leaf device, convert it to ENXIO
+ * if the device is not accessible at all.
+ */
+ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ !vdev_accessible(vd, zio))
+ zio->io_error = SET_ERROR(ENXIO);
+
+ /*
+ * If we can't write to an interior vdev (mirror or RAID-Z),
+ * set vdev_cant_write so that we stop trying to allocate from it.
+ */
+ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
+ vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
+ vd->vdev_cant_write = B_TRUE;
+ }
+
+ /*
+ * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
+ * attempts will ever succeed. In this case we set a persistent
+ * boolean flag so that we don't bother with it in the future.
+ */
+ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
+ zio->io_type == ZIO_TYPE_IOCTL &&
+ zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
+ vd->vdev_nowritecache = B_TRUE;
+
+ if (zio->io_error)
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ zio->io_physdone != NULL) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+ ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+ zio->io_physdone(zio->io_logical);
+ }
+
+ return (zio);
+}
+
+void
+zio_vdev_io_reissue(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
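+ /* Back up one stage so the pipeline re-executes VDEV_IO_START. */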
+ zio->io_stage >>= 1;
+}
+
+void
+zio_vdev_io_redone(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
+
+ zio->io_stage >>= 1;
+}
+
+void
+zio_vdev_io_bypass(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_flags |= ZIO_FLAG_IO_BYPASS;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
+}
+
+/*
+ * ==========================================================================
+ * Encrypt and store encryption parameters
+ * ==========================================================================
+ */
+
+
+/*
+ * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
+ * managing the storage of encryption parameters and passing them to the
+ * lower-level encryption functions.
+ */
+static zio_t *
+zio_encrypt(zio_t *zio)
+{
+ zio_prop_t *zp = &zio->io_prop;
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t psize = BP_GET_PSIZE(bp);
+ uint64_t dsobj = zio->io_bookmark.zb_objset;
+ dmu_object_type_t ot = BP_GET_TYPE(bp);
+ void *enc_buf = NULL;
+ abd_t *eabd = NULL;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ boolean_t no_crypt = B_FALSE;
+
+ /* the root zio already encrypted the data */
+ if (zio->io_child_type == ZIO_CHILD_GANG)
+ return (zio);
+
+ /* only ZIL blocks are re-encrypted on rewrite */
+ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
+ return (zio);
+
+ if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
+ BP_SET_CRYPT(bp, B_FALSE);
+ return (zio);
+ }
+
+ /* if we are doing raw encryption set the provided encryption params */
+ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
+ ASSERT0(BP_GET_LEVEL(bp));
+ BP_SET_CRYPT(bp, B_TRUE);
+ BP_SET_BYTEORDER(bp, zp->zp_byteorder);
+ if (ot != DMU_OT_OBJSET)
+ zio_crypt_encode_mac_bp(bp, zp->zp_mac);
+
+ /* dnode blocks must be written out in the provided byteorder */
+ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER &&
+ ot == DMU_OT_DNODE) {
+ void *bswap_buf = zio_buf_alloc(psize);
+ abd_t *babd = abd_get_from_buf(bswap_buf, psize);
+
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ abd_copy_to_buf(bswap_buf, zio->io_abd, psize);
+ dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf,
+ psize);
+
+ abd_take_ownership_of_buf(babd, B_TRUE);
+ zio_push_transform(zio, babd, psize, psize, NULL);
+ }
+
+ if (DMU_OT_IS_ENCRYPTED(ot))
+ zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
+ return (zio);
+ }
+
+ /* indirect blocks only maintain a cksum of the lower level MACs */
+ if (BP_GET_LEVEL(bp) > 0) {
+ BP_SET_CRYPT(bp, B_TRUE);
+ VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
+ zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
+ mac));
+ zio_crypt_encode_mac_bp(bp, mac);
+ return (zio);
+ }
+
+ /*
+	 * Objset blocks are a special case since they have two 256-bit MACs
+ * embedded within them.
+ */
+ if (ot == DMU_OT_OBJSET) {
+ ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ BP_SET_CRYPT(bp, B_TRUE);
+ VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj,
+ zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp)));
+ return (zio);
+ }
+
+ /* unencrypted object types are only authenticated with a MAC */
+ if (!DMU_OT_IS_ENCRYPTED(ot)) {
+ BP_SET_CRYPT(bp, B_TRUE);
+ VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj,
+ zio->io_abd, psize, mac));
+ zio_crypt_encode_mac_bp(bp, mac);
+ return (zio);
+ }
+
+ /*
+ * Later passes of sync-to-convergence may decide to rewrite data
+ * in place to avoid more disk reallocations. This presents a problem
+ * for encryption because this constitutes rewriting the new data with
+ * the same encryption key and IV. However, this only applies to blocks
+ * in the MOS (particularly the spacemaps) and we do not encrypt the
+ * MOS. We assert that the zio is allocating or an intent log write
+ * to enforce this.
+ */
+ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
+ ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
+ ASSERT3U(psize, !=, 0);
+
+ enc_buf = zio_buf_alloc(psize);
+ eabd = abd_get_from_buf(enc_buf, psize);
+ abd_take_ownership_of_buf(eabd, B_TRUE);
+
+ /*
+ * For an explanation of what encryption parameters are stored
+ * where, see the block comment in zio_crypt.c.
+ */
+ if (ot == DMU_OT_INTENT_LOG) {
+ zio_crypt_decode_params_bp(bp, salt, iv);
+ } else {
+ BP_SET_CRYPT(bp, B_TRUE);
+ }
+
+ /* Perform the encryption. This should not fail */
+ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark,
+ BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
+ salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt));
+
+ /* encode encryption metadata into the bp */
+ if (ot == DMU_OT_INTENT_LOG) {
+ /*
+ * ZIL blocks store the MAC in the embedded checksum, so the
+ * transform must always be applied.
+ */
+ zio_crypt_encode_mac_zil(enc_buf, mac);
+ zio_push_transform(zio, eabd, psize, psize, NULL);
+ } else {
+ BP_SET_CRYPT(bp, B_TRUE);
+ zio_crypt_encode_params_bp(bp, salt, iv);
+ zio_crypt_encode_mac_bp(bp, mac);
+
+ if (no_crypt) {
+ ASSERT3U(ot, ==, DMU_OT_DNODE);
+ abd_free(eabd);
+ } else {
+ zio_push_transform(zio, eabd, psize, psize, NULL);
+ }
+ }
+
+ return (zio);
+}
+
+/*
+ * ==========================================================================
+ * Generate and verify checksums
+ * ==========================================================================
+ */
+static zio_t *
+zio_checksum_generate(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ enum zio_checksum checksum;
+
+ if (bp == NULL) {
+ /*
+ * This is zio_write_phys().
+ * We're either generating a label checksum, or none at all.
+ */
+ checksum = zio->io_prop.zp_checksum;
+
+ if (checksum == ZIO_CHECKSUM_OFF)
+ return (zio);
+
+ ASSERT(checksum == ZIO_CHECKSUM_LABEL);
+ } else {
+ if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
+ ASSERT(!IO_IS_ALLOCATING(zio));
+ checksum = ZIO_CHECKSUM_GANG_HEADER;
+ } else {
+ checksum = BP_GET_CHECKSUM(bp);
+ }
+ }
+
+ zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
+
+ return (zio);
+}
+
+static zio_t *
+zio_checksum_verify(zio_t *zio)
+{
+ zio_bad_cksum_t info;
+ blkptr_t *bp = zio->io_bp;
+ int error;
+
+ ASSERT(zio->io_vd != NULL);
+
+ if (bp == NULL) {
+ /*
+ * This is zio_read_phys().
+ * We're either verifying a label checksum, or nothing at all.
+ */
+ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
+ return (zio);
+
+ ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
+ }
+
+ if ((error = zio_checksum_error(zio, &info)) != 0) {
+ zio->io_error = error;
+ if (error == ECKSUM &&
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ int ret = zfs_ereport_start_checksum(zio->io_spa,
+ zio->io_vd, &zio->io_bookmark, zio,
+ zio->io_offset, zio->io_size, NULL, &info);
+
+ if (ret != EALREADY) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+ }
+ }
+ }
+
+ return (zio);
+}
+
+/*
+ * Called by RAID-Z to ensure we don't compute the checksum twice.
+ */
+void
+zio_checksum_verified(zio_t *zio)
+{
+ zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+}
+
+/*
+ * ==========================================================================
+ * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
+ * An error of 0 indicates success. ENXIO indicates whole-device failure,
+ * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
+ * indicate errors that are specific to one I/O, and most likely permanent.
+ * Any other error is presumed to be worse because we weren't expecting it.
+ * ==========================================================================
+ */
+int
+zio_worst_error(int e1, int e2)
+{
+ static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
+ int r1, r2;
+
+ for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
+ if (e1 == zio_error_rank[r1])
+ break;
+
+ for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
+ if (e2 == zio_error_rank[r2])
+ break;
+
+ return (r1 > r2 ? e1 : e2);
+}
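+
+/*
+ * Worked example (illustrative only), using the rank table
+ * { 0, ENXIO, ECKSUM, EIO } above:
+ *
+ *	zio_worst_error(ENXIO, ECKSUM) returns ECKSUM  (rank 2 beats rank 1)
+ *	zio_worst_error(EIO, 0)        returns EIO     (rank 3 beats rank 0)
+ *	zio_worst_error(EINVAL, EIO)   returns EINVAL  (an error missing from
+ *	    the table falls off the end with the highest rank, so it is
+ *	    presumed worst)
+ */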
+
+/*
+ * ==========================================================================
+ * I/O completion
+ * ==========================================================================
+ */
+static zio_t *
+zio_ready(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ zio_t *pio, *pio_next;
+ zio_link_t *zl = NULL;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
+ ZIO_WAIT_READY)) {
+ return (NULL);
+ }
+
+ if (zio->io_ready) {
+ ASSERT(IO_IS_ALLOCATING(zio));
+ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
+ (zio->io_flags & ZIO_FLAG_NOPWRITE));
+ ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
+
+ zio->io_ready(zio);
+ }
+
+ if (bp != NULL && bp != &zio->io_bp_copy)
+ zio->io_bp_copy = *bp;
+
+ if (zio->io_error != 0) {
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(IO_IS_ALLOCATING(zio));
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
+
+ /*
+			 * We were unable to allocate anything, so unreserve and
+ * issue the next I/O to allocate.
+ */
+ metaslab_class_throttle_unreserve(
+ zio->io_metaslab_class, zio->io_prop.zp_copies,
+ zio->io_allocator, zio);
+ zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
+ }
+ }
+
+ mutex_enter(&zio->io_lock);
+ zio->io_state[ZIO_WAIT_READY] = 1;
+ pio = zio_walk_parents(zio, &zl);
+ mutex_exit(&zio->io_lock);
+
+ /*
+ * As we notify zio's parents, new parents could be added.
+ * New parents go to the head of zio's io_parent_list, however,
+ * so we will (correctly) not notify them. The remainder of zio's
+ * io_parent_list, from 'pio_next' onward, cannot change because
+ * all parents must wait for us to be done before they can be done.
+ */
+ for (; pio != NULL; pio = pio_next) {
+ pio_next = zio_walk_parents(zio, &zl);
+ zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL);
+ }
+
+ if (zio->io_flags & ZIO_FLAG_NODATA) {
+ if (BP_IS_GANG(bp)) {
+ zio->io_flags &= ~ZIO_FLAG_NODATA;
+ } else {
+ ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ }
+ }
+
+ if (zio_injection_enabled &&
+ zio->io_spa->spa_syncing_txg == zio->io_txg)
+ zio_handle_ignored_writes(zio);
+
+ return (zio);
+}
+
+/*
+ * Update the allocation throttle accounting.
+ */
+static void
+zio_dva_throttle_done(zio_t *zio)
+{
+ zio_t *lio __maybe_unused = zio->io_logical;
+ zio_t *pio = zio_unique_parent(zio);
+ vdev_t *vd = zio->io_vd;
+ int flags = METASLAB_ASYNC_ALLOC;
+
+ ASSERT3P(zio->io_bp, !=, NULL);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ ASSERT(vd != NULL);
+ ASSERT3P(vd, ==, vd->vdev_top);
+ ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+ ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * Parents of gang children can have two flavors -- ones that
+ * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
+ * and ones that allocated the constituent blocks. The allocation
+ * throttle needs to know the allocating parent zio so we must find
+ * it here.
+ */
+ if (pio->io_child_type == ZIO_CHILD_GANG) {
+ /*
+ * If our parent is a rewrite gang child then our grandparent
+ * would have been the one that performed the allocation.
+ */
+ if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
+ pio = zio_unique_parent(pio);
+ flags |= METASLAB_GANG_CHILD;
+ }
+
+ ASSERT(IO_IS_ALLOCATING(pio));
+ ASSERT3P(zio, !=, zio->io_logical);
+ ASSERT(zio->io_logical != NULL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
+
+ mutex_enter(&pio->io_lock);
+ metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
+ pio->io_allocator, B_TRUE);
+ mutex_exit(&pio->io_lock);
+
+ metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
+ pio->io_allocator, pio);
+
+ /*
+ * Call into the pipeline to see if there is more work that
+ * needs to be done. If there is work to be done it will be
+ * dispatched to another taskq thread.
+ */
+ zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
+}
+
+static zio_t *
+zio_done(zio_t *zio)
+{
+ /*
+ * Always attempt to keep stack usage minimal here since
+ * we can be called recursively up to 19 levels deep.
+ */
+ const uint64_t psize = zio->io_size;
+ zio_t *pio, *pio_next;
+ zio_link_t *zl = NULL;
+
+ /*
+ * If our children haven't all completed,
+ * wait for them and then repeat this pipeline stage.
+ */
+ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ /*
+ * If the allocation throttle is enabled, then update the accounting.
+ * We only track child I/Os that are part of an allocating async
+ * write. We must do this since the allocation is performed
+ * by the logical I/O but the actual write is done by child I/Os.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ ASSERT(zio->io_metaslab_class != NULL);
+ ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
+ zio_dva_throttle_done(zio);
+ }
+
+ /*
+ * If the allocation throttle is enabled, verify that
+ * we have decremented the refcounts for every I/O that was throttled.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(zio->io_bp != NULL);
+
+ metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
+ zio->io_allocator);
+ VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class->
+ mc_allocator[zio->io_allocator].mca_alloc_slots, zio));
+ }
+
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++)
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ ASSERT(zio->io_children[c][w] == 0);
+
+ if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
+ ASSERT(zio->io_bp->blk_pad[0] == 0);
+ ASSERT(zio->io_bp->blk_pad[1] == 0);
+ ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
+ sizeof (blkptr_t)) == 0 ||
+ (zio->io_bp == zio_unique_parent(zio)->io_bp));
+ if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
+ zio->io_bp_override == NULL &&
+ !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
+ ASSERT3U(zio->io_prop.zp_copies, <=,
+ BP_GET_NDVAS(zio->io_bp));
+ ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
+ (BP_COUNT_GANG(zio->io_bp) ==
+ BP_GET_NDVAS(zio->io_bp)));
+ }
+ if (zio->io_flags & ZIO_FLAG_NOPWRITE)
+ VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
+ }
+
+ /*
+ * If there were child vdev/gang/ddt errors, they apply to us now.
+ */
+ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
+ zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
+ zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
+
+ /*
+ * If the I/O on the transformed data was successful, generate any
+ * checksum reports now while we still have the transformed data.
+ */
+ if (zio->io_error == 0) {
+ while (zio->io_cksum_report != NULL) {
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ uint64_t align = zcr->zcr_align;
+ uint64_t asize = P2ROUNDUP(psize, align);
+ abd_t *adata = zio->io_abd;
+
+ if (asize != psize) {
+ adata = abd_alloc(asize, B_TRUE);
+ abd_copy(adata, zio->io_abd, psize);
+ abd_zero_off(adata, psize, asize - psize);
+ }
+
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, adata);
+ zfs_ereport_free_checksum(zcr);
+
+ if (asize != psize)
+ abd_free(adata);
+ }
+ }
+
+ zio_pop_transforms(zio); /* note: may set zio->io_error */
+
+ vdev_stat_update(zio, psize);
+
+ /*
+	 * If this I/O is attached to a particular vdev and was slow to
+	 * complete, exceeding zio_slow_io_ms milliseconds, post an error
+	 * describing the I/O delay.
+ * We ignore these errors if the device is currently unavailable.
+ */
+ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
+ if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
+ /*
+ * We want to only increment our slow IO counters if
+ * the IO is valid (i.e. not if the drive is removed).
+ *
+ * zfs_ereport_post() will also do these checks, but
+ * it can also ratelimit and have other failures, so we
+ * need to increment the slow_io counters independent
+ * of it.
+ */
+ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
+ zio->io_spa, zio->io_vd, zio)) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_slow_ios++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark,
+ zio, 0);
+ }
+ }
+ }
+
+ if (zio->io_error) {
+ /*
+ * If this I/O is attached to a particular vdev,
+ * generate an error message describing the I/O failure
+ * at the block level. We ignore these errors if the
+ * device is currently unavailable.
+ */
+ if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
+ !vdev_is_dead(zio->io_vd)) {
+ int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
+ if (ret != EALREADY) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ if (zio->io_type == ZIO_TYPE_READ)
+ zio->io_vd->vdev_stat.vs_read_errors++;
+ else if (zio->io_type == ZIO_TYPE_WRITE)
+ zio->io_vd->vdev_stat.vs_write_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+ }
+ }
+
+ if ((zio->io_error == EIO || !(zio->io_flags &
+ (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+ zio == zio->io_logical) {
+ /*
+ * For logical I/O requests, tell the SPA to log the
+ * error and generate a logical data ereport.
+ */
+ spa_log_error(zio->io_spa, &zio->io_bookmark);
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
+ zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
+ }
+ }
+
+ if (zio->io_error && zio == zio->io_logical) {
+ /*
+ * Determine whether zio should be reexecuted. This will
+ * propagate all the way to the root via zio_notify_parent().
+ */
+ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (IO_IS_ALLOCATING(zio) &&
+ !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ if (zio->io_error != ENOSPC)
+ zio->io_reexecute |= ZIO_REEXECUTE_NOW;
+ else
+ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ }
+
+ if ((zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_FREE) &&
+ !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
+ zio->io_error == ENXIO &&
+ spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
+ spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
+ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+
+ if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
+ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+
+ /*
+ * Here is a possibly good place to attempt to do
+ * either combinatorial reconstruction or error correction
+ * based on checksums. It also might be a good place
+ * to send out preliminary ereports before we suspend
+ * processing.
+ */
+ }
+
+ /*
+ * If there were logical child errors, they apply to us now.
+ * We defer this until now to avoid conflating logical child
+ * errors with errors that happened to the zio itself when
+ * updating vdev stats and reporting FMA events above.
+ */
+ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
+
+ if ((zio->io_error || zio->io_reexecute) &&
+ IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
+ !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
+ zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
+
+ zio_gang_tree_free(&zio->io_gang_tree);
+
+ /*
+ * Godfather I/Os should never suspend.
+ */
+ if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
+ (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
+ zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
+
+ if (zio->io_reexecute) {
+ /*
+ * This is a logical I/O that wants to reexecute.
+ *
+ * Reexecute is top-down. When an i/o fails, if it's not
+ * the root, it simply notifies its parent and sticks around.
+ * The parent, seeing that it still has children in zio_done(),
+ * does the same. This percolates all the way up to the root.
+ * The root i/o will reexecute or suspend the entire tree.
+ *
+ * This approach ensures that zio_reexecute() honors
+ * all the original i/o dependency relationships, e.g.
+ * parents not executing until children are ready.
+ */
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ zio->io_gang_leader = NULL;
+
+ mutex_enter(&zio->io_lock);
+ zio->io_state[ZIO_WAIT_DONE] = 1;
+ mutex_exit(&zio->io_lock);
+
+ /*
+ * "The Godfather" I/O monitors its children but is
+ * not a true parent to them. It will track them through
+ * the pipeline but severs its ties whenever they get into
+ * trouble (e.g. suspended). This allows "The Godfather"
+ * I/O to return status without blocking.
+ */
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL;
+ pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
+
+ if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
+ (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
+ zio_remove_child(pio, zio, remove_zl);
+ /*
+ * This is a rare code path, so we don't
+ * bother with "next_to_execute".
+ */
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
+ NULL);
+ }
+ }
+
+ if ((pio = zio_unique_parent(zio)) != NULL) {
+ /*
+ * We're not a root i/o, so there's nothing to do
+ * but notify our parent. Don't propagate errors
+ * upward since we haven't permanently failed yet.
+ */
+ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
+ zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
+ /*
+ * This is a rare code path, so we don't bother with
+ * "next_to_execute".
+ */
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
+ } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
+ /*
+ * We'd fail again if we reexecuted now, so suspend
+ * until conditions improve (e.g. device comes online).
+ */
+ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
+ } else {
+ /*
+ * Reexecution is potentially a huge amount of work.
+ * Hand it off to the otherwise-unused claim taskq.
+ */
+ ASSERT(taskq_empty_ent(&zio->io_tqent));
+ spa_taskq_dispatch_ent(zio->io_spa,
+ ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
+ (task_func_t *)zio_reexecute, zio, 0,
+ &zio->io_tqent);
+ }
+ return (NULL);
+ }
+
+ ASSERT(zio->io_child_count == 0);
+ ASSERT(zio->io_reexecute == 0);
+ ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
+
+ /*
+ * Report any checksum errors, since the I/O is complete.
+ */
+ while (zio->io_cksum_report != NULL) {
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, NULL);
+ zfs_ereport_free_checksum(zcr);
+ }
+
+ if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
+ !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
+ !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
+ metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
+ }
+
+ /*
+ * It is the responsibility of the done callback to ensure that this
+ * particular zio is no longer discoverable for adoption, and as
+ * such, cannot acquire any new parents.
+ */
+ if (zio->io_done)
+ zio->io_done(zio);
+
+ mutex_enter(&zio->io_lock);
+ zio->io_state[ZIO_WAIT_DONE] = 1;
+ mutex_exit(&zio->io_lock);
+
+ /*
+ * We are done executing this zio. We may want to execute a parent
+ * next. See the comment in zio_notify_parent().
+ */
+ zio_t *next_to_execute = NULL;
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
+ zio_remove_child(pio, zio, remove_zl);
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
+ }
+
+ if (zio->io_waiter != NULL) {
+ mutex_enter(&zio->io_lock);
+ zio->io_executor = NULL;
+ cv_broadcast(&zio->io_cv);
+ mutex_exit(&zio->io_lock);
+ } else {
+ zio_destroy(zio);
+ }
+
+ return (next_to_execute);
+}
+
+/*
+ * ==========================================================================
+ * I/O pipeline definition
+ * ==========================================================================
+ */
+static zio_pipe_stage_t *zio_pipeline[] = {
+ NULL,
+ zio_read_bp_init,
+ zio_write_bp_init,
+ zio_free_bp_init,
+ zio_issue_async,
+ zio_write_compress,
+ zio_encrypt,
+ zio_checksum_generate,
+ zio_nop_write,
+ zio_ddt_read_start,
+ zio_ddt_read_done,
+ zio_ddt_write,
+ zio_ddt_free,
+ zio_gang_assemble,
+ zio_gang_issue,
+ zio_dva_throttle,
+ zio_dva_allocate,
+ zio_dva_free,
+ zio_dva_claim,
+ zio_ready,
+ zio_vdev_io_start,
+ zio_vdev_io_done,
+ zio_vdev_io_assess,
+ zio_checksum_verify,
+ zio_done
+};
+
+
+/*
+ * Compare two zbookmark_phys_t's to see which we would reach first in a
+ * pre-order traversal of the object tree.
+ *
+ * This is simple in every case aside from the meta-dnode object. For all other
+ * objects, we traverse them in order (object 1 before object 2, and so on).
+ * However, all of these objects are traversed while traversing object 0, since
+ * the data it points to is the list of objects. Thus, we need to convert to a
+ * canonical representation so we can compare meta-dnode bookmarks to
+ * non-meta-dnode bookmarks.
+ *
+ * We do this by calculating "equivalents" for each field of the zbookmark.
+ * zbookmarks outside of the meta-dnode use their own object and level, and
+ * calculate the level 0 equivalent (the first L0 blkid that is contained in the
+ * blocks this bookmark refers to) by multiplying their blkid by their span
+ * (the number of L0 blocks contained within one block at their level).
+ * zbookmarks inside the meta-dnode calculate their object equivalent
+ * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
+ * level + 1<<31 (a value larger than any level could ever be) for their level.
+ * This causes them to always compare before a bookmark in their object
+ * equivalent, compare appropriately to bookmarks in other objects, and to
+ * compare appropriately to other bookmarks in the meta-dnode.
+ */
+int
+zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
+ const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
+{
+ /*
+ * These variables represent the "equivalent" values for the zbookmark,
+ * after converting zbookmarks inside the meta dnode to their
+ * normal-object equivalents.
+ */
+ uint64_t zb1obj, zb2obj;
+ uint64_t zb1L0, zb2L0;
+ uint64_t zb1level, zb2level;
+
+ if (zb1->zb_object == zb2->zb_object &&
+ zb1->zb_level == zb2->zb_level &&
+ zb1->zb_blkid == zb2->zb_blkid)
+ return (0);
+
+ IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT);
+ IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT);
+
+ /*
+ * BP_SPANB calculates the span in blocks.
+ */
+ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
+ zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
+
+ if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+ zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb1L0 = 0;
+ zb1level = zb1->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb1obj = zb1->zb_object;
+ zb1level = zb1->zb_level;
+ }
+
+ if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
+ zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb2L0 = 0;
+ zb2level = zb2->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb2obj = zb2->zb_object;
+ zb2level = zb2->zb_level;
+ }
+
+ /* Now that we have a canonical representation, do the comparison. */
+ if (zb1obj != zb2obj)
+ return (zb1obj < zb2obj ? -1 : 1);
+ else if (zb1L0 != zb2L0)
+ return (zb1L0 < zb2L0 ? -1 : 1);
+ else if (zb1level != zb2level)
+ return (zb1level > zb2level ? -1 : 1);
+ /*
+ * This can (theoretically) happen if the bookmarks have the same object
+	 * and level but different blkids, which is only possible if the block
+	 * sizes are not the same. There is presently no way to change the
+	 * indirect block sizes.
+ */
+ return (0);
+}
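+
+/*
+ * Worked example (illustrative; the block sizes below are assumptions, not
+ * taken from the code): compare a meta-dnode bookmark
+ * zb1 = { DMU_META_DNODE_OBJECT, level 0, blkid 4 } with an ordinary bookmark
+ * zb2 = { object 100, level 0, blkid 7 }, assuming 16K dnode blocks
+ * (dbss1 == 32 sectors). zb1 canonicalizes to zb1obj = 4 * 32 = 128,
+ * zb1L0 = 0, zb1level = COMPARE_META_LEVEL; zb2 is already canonical with
+ * zb2obj = 100, zb2L0 = 7, zb2level = 0. Since 128 > 100 the function
+ * returns 1: object 100 is reached before the dnode block covering objects
+ * 128-159, exactly as a pre-order traversal would visit them.
+ */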
+
+/*
+ * This function checks the following: given that last_block is the place that
+ * our traversal stopped last time, does that guarantee that we've visited
+ * every node under subtree_root? Answering that takes more than the raw
+ * output of zbookmark_compare: we have to pass in a modified version of
+ * subtree_root; by incrementing the block id, and then checking whether
+ * last_block is before or equal to that, we can tell whether or not having
+ * visited last_block implies that all of subtree_root's children have been
+ * visited.
+ */
+boolean_t
+zbookmark_subtree_completed(const dnode_phys_t *dnp,
+ const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
+{
+ zbookmark_phys_t mod_zb = *subtree_root;
+ mod_zb.zb_blkid++;
+ ASSERT(last_block->zb_level == 0);
+
+ /* The objset_phys_t isn't before anything. */
+ if (dnp == NULL)
+ return (B_FALSE);
+
+ /*
+ * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
+ * data block size in sectors, because that variable is only used if
+ * the bookmark refers to a block in the meta-dnode. Since we don't
+ * know without examining it what object it refers to, and there's no
+ * harm in passing in this value in other cases, we always pass it in.
+ *
+ * We pass in 0 for the indirect block size shift because zb2 must be
+ * level 0. The indirect block size is only used to calculate the span
+ * of the bookmark, but since the bookmark must be level 0, the span is
+ * always 1, so the math works out.
+ *
+	 * If you make changes to how the zbookmark_compare code works, be sure
+	 * that this code still works afterwards.
+ */
+ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+ 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
+ last_block) <= 0);
+}
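+
+/*
+ * Worked example (illustrative; the block sizes below are assumptions): let
+ * subtree_root be { object 5, level 1, blkid 3 } with 128K indirect blocks
+ * (dn_indblkshift == 17, i.e. 1024 block pointers per indirect block), so
+ * the subtree covers L0 blkids 3072-4095. mod_zb bumps the blkid to 4, whose
+ * L0 equivalent is 4096. If last_block is { object 5, level 0, blkid 4096 }
+ * or anything visited later, zbookmark_compare() returns <= 0 and the
+ * subtree counts as fully visited; with last_block at blkid 4000 it returns
+ * > 0 and the subtree is not yet complete.
+ */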
+
+EXPORT_SYMBOL(zio_type_name);
+EXPORT_SYMBOL(zio_buf_alloc);
+EXPORT_SYMBOL(zio_data_buf_alloc);
+EXPORT_SYMBOL(zio_buf_free);
+EXPORT_SYMBOL(zio_data_buf_free);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW,
+ "Max I/O completion time (milliseconds) before marking it as slow");
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW,
+ "Prioritize requeued I/O");
+
+ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, INT, ZMOD_RW,
+ "Defer frees starting in this pass");
+
+ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, INT, ZMOD_RW,
+ "Don't compress starting in this pass");
+
+ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, INT, ZMOD_RW,
+ "Rewrite new bps starting in this pass");
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW,
+ "Throttle block allocations in the ZIO pipeline");
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW,
+ "Log all slow ZIOs, not just those with vdevs");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c
new file mode 100644
index 000000000000..f8fee78c6068
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zio_checksum.c
@@ -0,0 +1,570 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zil.h>
+#include <sys/abd.h>
+#include <zfs_fletcher.h>
+
+/*
+ * Checksum vectors.
+ *
+ * In the SPA, everything is checksummed. We support checksum vectors
+ * for three distinct reasons:
+ *
+ * 1. Different kinds of data need different levels of protection.
+ * For SPA metadata, we always want a very strong checksum.
+ * For user data, we let users make the trade-off between speed
+ * and checksum strength.
+ *
+ * 2. Cryptographic hash and MAC algorithms are an area of active research.
+ *    It is likely that future hash functions will be at least as strong
+ * as current best-of-breed, and may be substantially faster as well.
+ * We want the ability to take advantage of these new hashes as soon as
+ * they become available.
+ *
+ * 3. If someone develops hardware that can compute a strong hash quickly,
+ * we want the ability to take advantage of that hardware.
+ *
+ * Of course, we don't want a checksum upgrade to invalidate existing
+ * data, so we store the checksum *function* in eight bits of the bp.
+ * This gives us room for up to 256 different checksum functions.
+ *
+ * When writing a block, we always checksum it with the latest-and-greatest
+ * checksum function of the appropriate strength. When reading a block,
+ * we compare the expected checksum against the actual checksum, which we
+ * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
+ *
+ * SALTED CHECKSUMS
+ *
+ * To enable the use of less secure hash algorithms with dedup, we
+ * introduce the notion of salted checksums (MACs, really). A salted
+ * checksum is fed both a random 256-bit value (the salt) and the data
+ * to be checksummed. This salt is kept secret (stored on the pool, but
+ * never shown to the user). Thus even if an attacker knew of collision
+ * weaknesses in the hash algorithm, they won't be able to mount a
+ * known-plaintext attack on the DDT, since the actual hash value cannot be
+ * known ahead of time. How the salt is used is algorithm-specific
+ * (some might simply prefix it to the data block, others might need to
+ * utilize a full-blown HMAC). On disk the salt is stored in a ZAP
+ * object in the MOS (DMU_POOL_CHECKSUM_SALT).
+ *
+ * CONTEXT TEMPLATES
+ *
+ * Some hashing algorithms need to perform a substantial amount of
+ * initialization work (e.g. salted checksums above may need to pre-hash
+ * the salt) before being able to process data. Performing this
+ * redundant work for each block would be wasteful, so we instead allow
+ * a checksum algorithm to do the work once (the first time it's used)
+ * and then keep this pre-initialized context as a template inside the
+ * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains
+ * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
+ * construct and destruct the pre-initialized checksum context. The
+ * pre-initialized context is then reused during each checksum
+ * invocation and passed to the checksum function.
+ */
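+
+/*
+ * For example (an illustrative summary of the code below): the "skein" entry
+ * in zio_checksum_table[] supplies abd_checksum_skein_tmpl_init() and
+ * abd_checksum_skein_tmpl_free(). The first time that checksum is used,
+ * zio_checksum_template_init() builds a context from the pool's
+ * spa_cksum_salt and caches it in spa_cksum_tmpls[ZIO_CHECKSUM_SKEIN];
+ * every later zio_checksum_compute() or zio_checksum_error_impl() call then
+ * passes that cached template as ctx_template instead of redoing the salt
+ * setup for each block.
+ */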
+
+/*ARGSUSED*/
+static void
+abd_checksum_off(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+/*ARGSUSED*/
+static void
+abd_fletcher_2_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_2_incremental_native, zcp);
+}
+
+/*ARGSUSED*/
+static void
+abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_2_incremental_byteswap, zcp);
+}
+
+static inline void
+abd_fletcher_4_impl(abd_t *abd, uint64_t size, zio_abd_checksum_data_t *acdp)
+{
+ fletcher_4_abd_ops.acf_init(acdp);
+ abd_iterate_func(abd, 0, size, fletcher_4_abd_ops.acf_iter, acdp);
+ fletcher_4_abd_ops.acf_fini(acdp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_4_ctx_t ctx;
+
+ zio_abd_checksum_data_t acd = {
+ .acd_byteorder = ZIO_CHECKSUM_NATIVE,
+ .acd_zcp = zcp,
+ .acd_ctx = &ctx
+ };
+
+ abd_fletcher_4_impl(abd, size, &acd);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_4_ctx_t ctx;
+
+ zio_abd_checksum_data_t acd = {
+ .acd_byteorder = ZIO_CHECKSUM_BYTESWAP,
+ .acd_zcp = zcp,
+ .acd_ctx = &ctx
+ };
+
+ abd_fletcher_4_impl(abd, size, &acd);
+}
+
+zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
+ {{NULL, NULL}, NULL, NULL, 0, "inherit"},
+ {{NULL, NULL}, NULL, NULL, 0, "on"},
+ {{abd_checksum_off, abd_checksum_off},
+ NULL, NULL, 0, "off"},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+ "label"},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+ "gang_header"},
+ {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
+ {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
+ NULL, NULL, 0, "fletcher2"},
+ {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
+ {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
+ {{abd_checksum_off, abd_checksum_off},
+ NULL, NULL, 0, "noparity"},
+ {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
+ {{abd_checksum_skein_native, abd_checksum_skein_byteswap},
+ abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
+#if !defined(__FreeBSD__)
+ {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap},
+ abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
+ ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
+#endif
+};
+
+/*
+ * The flag corresponding to the "verify" in dedup=[checksum,]verify
+ * must be cleared first, so callers should use ZIO_CHECKSUM_MASK.
+ */
+spa_feature_t
+zio_checksum_to_feature(enum zio_checksum cksum)
+{
+ VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
+
+ switch (cksum) {
+ case ZIO_CHECKSUM_SHA512:
+ return (SPA_FEATURE_SHA512);
+ case ZIO_CHECKSUM_SKEIN:
+ return (SPA_FEATURE_SKEIN);
+#if !defined(__FreeBSD__)
+ case ZIO_CHECKSUM_EDONR:
+ return (SPA_FEATURE_EDONR);
+#endif
+ default:
+ return (SPA_FEATURE_NONE);
+ }
+}
+
+enum zio_checksum
+zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
+{
+ ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+ if (child == ZIO_CHECKSUM_INHERIT)
+ return (parent);
+
+ if (child == ZIO_CHECKSUM_ON)
+ return (ZIO_CHECKSUM_ON_VALUE);
+
+ return (child);
+}
+
+enum zio_checksum
+zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
+ enum zio_checksum parent)
+{
+ ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+ if (child == ZIO_CHECKSUM_INHERIT)
+ return (parent);
+
+ if (child == ZIO_CHECKSUM_ON)
+ return (spa_dedup_checksum(spa));
+
+ if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
+ return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
+
+ ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP) ||
+ (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
+
+ return (child);
+}
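+
+/*
+ * For example (illustrative): a child value of ZIO_CHECKSUM_ON resolves to
+ * spa_dedup_checksum(spa), the pool's default dedup checksum, while
+ * (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY) resolves to that same default with
+ * the verify bit preserved.
+ */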
+
+/*
+ * Set the external verifier for a gang block based on <vdev, offset, txg>,
+ * a tuple which is guaranteed to be unique for the life of the pool.
+ */
+static void
+zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
+{
+ const dva_t *dva = BP_IDENTITY(bp);
+ uint64_t txg = BP_PHYSICAL_BIRTH(bp);
+
+ ASSERT(BP_IS_GANG(bp));
+
+ ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
+}
+
+/*
+ * Set the external verifier for a label block based on its offset.
+ * The vdev is implicit, and the txg is unknowable at pool open time --
+ * hence the logic in vdev_uberblock_load() to find the most recent copy.
+ */
+static void
+zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
+{
+ ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
+}
+
+/*
+ * Calls the template init function of a checksum which supports context
+ * templates and installs the template into the spa_t.
+ */
+static void
+zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
+{
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+ if (ci->ci_tmpl_init == NULL)
+ return;
+ if (spa->spa_cksum_tmpls[checksum] != NULL)
+ return;
+
+ VERIFY(ci->ci_tmpl_free != NULL);
+ mutex_enter(&spa->spa_cksum_tmpls_lock);
+ if (spa->spa_cksum_tmpls[checksum] == NULL) {
+ spa->spa_cksum_tmpls[checksum] =
+ ci->ci_tmpl_init(&spa->spa_cksum_salt);
+ VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
+ }
+ mutex_exit(&spa->spa_cksum_tmpls_lock);
+}
+
+/* convenience function to update a checksum to accommodate an encryption MAC */
+static void
+zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor)
+{
+ /*
+ * Weak checksums do not have their entropy spread evenly
+ * across the bits of the checksum. Therefore, when truncating
+ * a weak checksum we XOR the first 2 words with the last 2 so
+ * that we don't "lose" any entropy unnecessarily.
+ */
+ if (xor) {
+ cksum->zc_word[0] ^= cksum->zc_word[2];
+ cksum->zc_word[1] ^= cksum->zc_word[3];
+ }
+
+ cksum->zc_word[2] = saved->zc_word[2];
+ cksum->zc_word[3] = saved->zc_word[3];
+}
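+
+/*
+ * Worked example (illustrative): when truncating a fletcher_4 checksum
+ * { w0, w1, w2, w3 } to make room for the saved MAC halves { m2, m3 }, the
+ * result is { w0 ^ w2, w1 ^ w3, m2, m3 }, because fletcher_4 lacks
+ * ZCHECKSUM_FLAG_DEDUP and therefore gets the XOR treatment; a sha256
+ * checksum would simply become { w0, w1, m2, m3 }.
+ */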
+
+/*
+ * Generate the checksum.
+ */
+void
+zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
+ abd_t *abd, uint64_t size)
+{
+ static const uint64_t zec_magic = ZEC_MAGIC;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t offset = zio->io_offset;
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+ zio_cksum_t cksum, saved;
+ spa_t *spa = zio->io_spa;
+ boolean_t insecure = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) == 0;
+
+ ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(ci->ci_func[0] != NULL);
+
+ zio_checksum_template_init(checksum, spa);
+
+ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
+ zio_eck_t eck;
+ size_t eck_offset;
+
+ bzero(&saved, sizeof (zio_cksum_t));
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t zilc;
+ abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
+
+ size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ,
+ uint64_t);
+ eck = zilc.zc_eck;
+ eck_offset = offsetof(zil_chain_t, zc_eck);
+ } else {
+ eck_offset = size - sizeof (zio_eck_t);
+ abd_copy_to_buf_off(&eck, abd, eck_offset,
+ sizeof (zio_eck_t));
+ }
+
+ if (checksum == ZIO_CHECKSUM_GANG_HEADER) {
+ zio_checksum_gang_verifier(&eck.zec_cksum, bp);
+ } else if (checksum == ZIO_CHECKSUM_LABEL) {
+ zio_checksum_label_verifier(&eck.zec_cksum, offset);
+ } else {
+ saved = eck.zec_cksum;
+ eck.zec_cksum = bp->blk_cksum;
+ }
+
+ abd_copy_from_buf_off(abd, &zec_magic,
+ eck_offset + offsetof(zio_eck_t, zec_magic),
+ sizeof (zec_magic));
+ abd_copy_from_buf_off(abd, &eck.zec_cksum,
+ eck_offset + offsetof(zio_eck_t, zec_cksum),
+ sizeof (zio_cksum_t));
+
+ ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
+ &cksum);
+ if (bp != NULL && BP_USES_CRYPT(bp) &&
+ BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ zio_checksum_handle_crypt(&cksum, &saved, insecure);
+
+ abd_copy_from_buf_off(abd, &cksum,
+ eck_offset + offsetof(zio_eck_t, zec_cksum),
+ sizeof (zio_cksum_t));
+ } else {
+ saved = bp->blk_cksum;
+ ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
+ &cksum);
+ if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ zio_checksum_handle_crypt(&cksum, &saved, insecure);
+ bp->blk_cksum = cksum;
+ }
+}
+
+int
+zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
+ enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset,
+ zio_bad_cksum_t *info)
+{
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+ zio_cksum_t actual_cksum, expected_cksum;
+ zio_eck_t eck;
+ int byteswap;
+
+ if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
+ return (SET_ERROR(EINVAL));
+
+ zio_checksum_template_init(checksum, spa);
+
+ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
+ zio_cksum_t verifier;
+ size_t eck_offset;
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t zilc;
+ uint64_t nused;
+
+ abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
+
+ eck = zilc.zc_eck;
+ eck_offset = offsetof(zil_chain_t, zc_eck) +
+ offsetof(zio_eck_t, zec_cksum);
+
+ if (eck.zec_magic == ZEC_MAGIC) {
+ nused = zilc.zc_nused;
+ } else if (eck.zec_magic == BSWAP_64(ZEC_MAGIC)) {
+ nused = BSWAP_64(zilc.zc_nused);
+ } else {
+ return (SET_ERROR(ECKSUM));
+ }
+
+ if (nused > size) {
+ return (SET_ERROR(ECKSUM));
+ }
+
+ size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+ } else {
+ eck_offset = size - sizeof (zio_eck_t);
+ abd_copy_to_buf_off(&eck, abd, eck_offset,
+ sizeof (zio_eck_t));
+ eck_offset += offsetof(zio_eck_t, zec_cksum);
+ }
+
+ if (checksum == ZIO_CHECKSUM_GANG_HEADER)
+ zio_checksum_gang_verifier(&verifier, bp);
+ else if (checksum == ZIO_CHECKSUM_LABEL)
+ zio_checksum_label_verifier(&verifier, offset);
+ else
+ verifier = bp->blk_cksum;
+
+ byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC));
+
+ if (byteswap)
+ byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
+
+ expected_cksum = eck.zec_cksum;
+
+ abd_copy_from_buf_off(abd, &verifier, eck_offset,
+ sizeof (zio_cksum_t));
+
+ ci->ci_func[byteswap](abd, size,
+ spa->spa_cksum_tmpls[checksum], &actual_cksum);
+
+ abd_copy_from_buf_off(abd, &expected_cksum, eck_offset,
+ sizeof (zio_cksum_t));
+
+ if (byteswap) {
+ byteswap_uint64_array(&expected_cksum,
+ sizeof (zio_cksum_t));
+ }
+ } else {
+ byteswap = BP_SHOULD_BYTESWAP(bp);
+ expected_cksum = bp->blk_cksum;
+ ci->ci_func[byteswap](abd, size,
+ spa->spa_cksum_tmpls[checksum], &actual_cksum);
+ }
+
+ /*
+ * MAC checksums are a special case since half of this checksum will
+ * actually be the encryption MAC. This will be verified by the
+ * decryption process, so we just check the truncated checksum now.
+ * Objset blocks use embedded MACs so we don't truncate the checksum
+ * for them.
+ */
+ if (bp != NULL && BP_USES_CRYPT(bp) &&
+ BP_GET_TYPE(bp) != DMU_OT_OBJSET) {
+ if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) {
+ actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2];
+ actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3];
+ }
+
+ actual_cksum.zc_word[2] = 0;
+ actual_cksum.zc_word[3] = 0;
+ expected_cksum.zc_word[2] = 0;
+ expected_cksum.zc_word[3] = 0;
+ }
+
+ if (info != NULL) {
+ info->zbc_expected = expected_cksum;
+ info->zbc_actual = actual_cksum;
+ info->zbc_checksum_name = ci->ci_name;
+ info->zbc_byteswapped = byteswap;
+ info->zbc_injected = 0;
+ info->zbc_has_cksum = 1;
+ }
+
+ if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
+ return (SET_ERROR(ECKSUM));
+
+ return (0);
+}
+
+int
+zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+ int error;
+ uint64_t size = (bp == NULL ? zio->io_size :
+ (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+ uint64_t offset = zio->io_offset;
+ abd_t *data = zio->io_abd;
+ spa_t *spa = zio->io_spa;
+
+ error = zio_checksum_error_impl(spa, bp, checksum, data, size,
+ offset, info);
+
+ if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
+ error = zio_handle_fault_injection(zio, ECKSUM);
+ if (error != 0)
+ info->zbc_injected = 1;
+ }
+
+ return (error);
+}
+
+/*
+ * Called by a spa_t that's about to be deallocated. This steps through
+ * all of the checksum context templates and deallocates any that were
+ * initialized using the algorithm-specific template init function.
+ */
+void
+zio_checksum_templates_free(spa_t *spa)
+{
+ for (enum zio_checksum checksum = 0;
+ checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
+ if (spa->spa_cksum_tmpls[checksum] != NULL) {
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+ VERIFY(ci->ci_tmpl_free != NULL);
+ ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
+ spa->spa_cksum_tmpls[checksum] = NULL;
+ }
+ }
+}
diff --git a/sys/contrib/openzfs/module/zfs/zio_compress.c b/sys/contrib/openzfs/module/zfs/zio_compress.c
new file mode 100644
index 000000000000..2db3cec35d5d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zio_compress.c
@@ -0,0 +1,220 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zfeature.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+#include <sys/zstd/zstd.h>
+
+/*
+ * If nonzero, roughly one in every X decompression attempts will fail,
+ * simulating an undetected memory error.
+ */
+unsigned long zio_decompress_fail_fraction = 0;
+
+/*
+ * Compression vectors.
+ */
+zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
+ {"inherit", 0, NULL, NULL, NULL},
+ {"on", 0, NULL, NULL, NULL},
+ {"uncompressed", 0, NULL, NULL, NULL},
+ {"lzjb", 0, lzjb_compress, lzjb_decompress, NULL},
+ {"empty", 0, NULL, NULL, NULL},
+ {"gzip-1", 1, gzip_compress, gzip_decompress, NULL},
+ {"gzip-2", 2, gzip_compress, gzip_decompress, NULL},
+ {"gzip-3", 3, gzip_compress, gzip_decompress, NULL},
+ {"gzip-4", 4, gzip_compress, gzip_decompress, NULL},
+ {"gzip-5", 5, gzip_compress, gzip_decompress, NULL},
+ {"gzip-6", 6, gzip_compress, gzip_decompress, NULL},
+ {"gzip-7", 7, gzip_compress, gzip_decompress, NULL},
+ {"gzip-8", 8, gzip_compress, gzip_decompress, NULL},
+ {"gzip-9", 9, gzip_compress, gzip_decompress, NULL},
+ {"zle", 64, zle_compress, zle_decompress, NULL},
+ {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL},
+ {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress,
+ zfs_zstd_decompress, zfs_zstd_decompress_level},
+};
+
+uint8_t
+zio_complevel_select(spa_t *spa, enum zio_compress compress, uint8_t child,
+ uint8_t parent)
+{
+ uint8_t result;
+
+ if (!ZIO_COMPRESS_HASLEVEL(compress))
+ return (0);
+
+ result = child;
+ if (result == ZIO_COMPLEVEL_INHERIT)
+ result = parent;
+
+ return (result);
+}
+
+enum zio_compress
+zio_compress_select(spa_t *spa, enum zio_compress child,
+ enum zio_compress parent)
+{
+ enum zio_compress result;
+
+ ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT(parent != ZIO_COMPRESS_INHERIT);
+
+ result = child;
+ if (result == ZIO_COMPRESS_INHERIT)
+ result = parent;
+
+ if (result == ZIO_COMPRESS_ON) {
+ if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS))
+ result = ZIO_COMPRESS_LZ4_ON_VALUE;
+ else
+ result = ZIO_COMPRESS_LEGACY_ON_VALUE;
+ }
+
+ return (result);
+}
+
+/*ARGSUSED*/
+static int
+zio_compress_zeroed_cb(void *data, size_t len, void *private)
+{
+ uint64_t *end = (uint64_t *)((char *)data + len);
+ for (uint64_t *word = (uint64_t *)data; word < end; word++)
+ if (*word != 0)
+ return (1);
+
+ return (0);
+}
+
+size_t
+zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len,
+ uint8_t level)
+{
+ size_t c_len, d_len;
+ uint8_t complevel;
+ zio_compress_info_t *ci = &zio_compress_table[c];
+
+ ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+
+ /*
+ * If the data is all zeroes, we don't even need to allocate
+ * a block for it. We indicate this by returning zero size.
+ */
+ if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0)
+ return (0);
+
+ if (c == ZIO_COMPRESS_EMPTY)
+ return (s_len);
+
+ /* Compress at least 12.5% */
+ d_len = s_len - (s_len >> 3);
+
+ complevel = ci->ci_level;
+
+ if (c == ZIO_COMPRESS_ZSTD) {
+ /* If we don't know the level, we can't compress it */
+ if (level == ZIO_COMPLEVEL_INHERIT)
+ return (s_len);
+
+ if (level == ZIO_COMPLEVEL_DEFAULT)
+ complevel = ZIO_ZSTD_LEVEL_DEFAULT;
+ else
+ complevel = level;
+
+ ASSERT3U(complevel, !=, ZIO_COMPLEVEL_INHERIT);
+ }
+
+ /* No compression algorithms can read from ABDs directly */
+ void *tmp = abd_borrow_buf_copy(src, s_len);
+ c_len = ci->ci_compress(tmp, dst, s_len, d_len, complevel);
+ abd_return_buf(src, tmp, s_len);
+
+ if (c_len > d_len)
+ return (s_len);
+
+ ASSERT3U(c_len, <=, d_len);
+ return (c_len);
+}
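+
+/*
+ * Worked example (illustrative): for a 128K (131072-byte) source buffer,
+ * d_len = 131072 - (131072 >> 3) = 114688, so the compressed result must be
+ * 114688 bytes or smaller (a saving of at least 12.5%) or the caller gets
+ * s_len back and writes the block uncompressed. An all-zero buffer
+ * short-circuits to a return value of 0 and no block is allocated at all.
+ */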
+
+int
+zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len, uint8_t *level)
+{
+ zio_compress_info_t *ci = &zio_compress_table[c];
+ if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (ci->ci_decompress_level != NULL && level != NULL)
+ return (ci->ci_decompress_level(src, dst, s_len, d_len, level));
+
+ return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
+}
+
+int
+zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+ size_t s_len, size_t d_len, uint8_t *level)
+{
+ void *tmp = abd_borrow_buf_copy(src, s_len);
+ int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len, level);
+ abd_return_buf(src, tmp, s_len);
+
+ /*
+ * Decompression shouldn't fail, because we've already verified
+ * the checksum. However, for extra protection (e.g. against bitflips
+ * in non-ECC RAM), we handle this error (and test it).
+ */
+ if (zio_decompress_fail_fraction != 0 &&
+ spa_get_random(zio_decompress_fail_fraction) == 0)
+ ret = SET_ERROR(EINVAL);
+
+ return (ret);
+}
+
+int
+zio_compress_to_feature(enum zio_compress comp)
+{
+ switch (comp) {
+ case ZIO_COMPRESS_ZSTD:
+ return (SPA_FEATURE_ZSTD_COMPRESS);
+ default:
+ /* fallthru */;
+ }
+ return (SPA_FEATURE_NONE);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c
new file mode 100644
index 000000000000..e56ea88682ff
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zio_inject.c
@@ -0,0 +1,972 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+/*
+ * ZFS fault injection
+ *
+ * To handle fault injection, we keep track of a series of zinject_record_t
+ * structures which describe which logical block(s) should be injected with a
+ * fault. These are kept in a global list. Each record corresponds to a given
+ * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
+ * or exported while the injection record exists.
+ *
+ * Device level injection is done using the 'zi_guid' field. If this is set, it
+ * means that the error is destined for a particular device, not a piece of
+ * data.
+ *
+ * This is a rather poor data structure and algorithm, but we don't expect more
+ * than a few faults at any one time, so it should be sufficient for our needs.
+ */
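+
+/*
+ * For example (illustrative): a record with zi_guid set targets a particular
+ * device rather than specific data, while a record that leaves zi_guid clear
+ * and fills in zi_objset, zi_object, zi_level and the zi_start..zi_end blkid
+ * range (plus zi_error, e.g. EIO, and an optional zi_freq) fires only for
+ * matching logical blocks, as implemented by zio_match_handler() below.
+ */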
+
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/fs/zfs.h>
+
+uint32_t zio_injection_enabled = 0;
+
+/*
+ * Data describing each zinject handler registered on the system, and
+ * contains the list node linking the handler in the global zinject
+ * handler list.
+ */
+typedef struct inject_handler {
+ int zi_id;
+ spa_t *zi_spa;
+ zinject_record_t zi_record;
+ uint64_t *zi_lanes;
+ int zi_next_lane;
+ list_node_t zi_link;
+} inject_handler_t;
+
+/*
+ * List of all zinject handlers registered on the system, protected by
+ * the inject_lock defined below.
+ */
+static list_t inject_handlers;
+
+/*
+ * This protects insertion into, and traversal of, the inject handler
+ * list defined above, as well as the inject_delay_count. Any time a
+ * handler is inserted or removed from the list, this lock should be
+ * taken as a RW_WRITER; and any time traversal is done over the list
+ * (without modification to it) this lock should be taken as a RW_READER.
+ */
+static krwlock_t inject_lock;
+
+/*
+ * This holds the number of zinject delay handlers that have been
+ * registered on the system. It is protected by the inject_lock defined
+ * above. Thus modifications to this count must be a RW_WRITER of the
+ * inject_lock, and reads of this count must be (at least) a RW_READER
+ * of the lock.
+ */
+static int inject_delay_count = 0;
+
+/*
+ * This lock is used only in zio_handle_io_delay(), refer to the comment
+ * in that function for more details.
+ */
+static kmutex_t inject_delay_mtx;
+
+/*
+ * Used to assign unique identifying numbers to each new zinject handler.
+ */
+static int inject_next_id = 1;
+
+/*
+ * Test if the requested frequency was triggered
+ */
+static boolean_t
+freq_triggered(uint32_t frequency)
+{
+ /*
+ * zero implies always (100%)
+ */
+ if (frequency == 0)
+ return (B_TRUE);
+
+ /*
+ * Note: we still handle legacy (unscaled) frequency values
+ */
+ uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;
+
+ return (spa_get_random(maximum) < frequency);
+}
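
To make the frequency semantics above concrete, here is a minimal standalone C model (illustrative only, not from the OpenZFS sources): values of 100 or less are treated as legacy whole-number percentages, larger values are scaled against a maximum. SCALE_MAX and rand() stand in for ZI_PERCENTAGE_MAX and spa_get_random(), whose exact value and behavior are assumptions here.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>

#define SCALE_MAX 1000000u	/* stand-in for ZI_PERCENTAGE_MAX */

static bool
freq_triggered_model(uint32_t frequency)
{
	if (frequency == 0)	/* zero means always trigger */
		return (true);
	/* values <= 100 are legacy percentages; larger values are scaled */
	uint32_t maximum = (frequency <= 100) ? 100 : SCALE_MAX;
	return ((uint32_t)(rand() % maximum) < frequency);
}

int
main(void)
{
	int hits = 0;
	for (int i = 0; i < 100000; i++)
		hits += freq_triggered_model(25);	/* legacy 25% */
	printf("triggered ~%d%% of the time\n", hits / 1000);
	return (0);
}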
+
+/*
+ * Returns true if the given record matches the I/O in progress.
+ */
+static boolean_t
+zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
+ zinject_record_t *record, int error)
+{
+ /*
+ * Check for a match against the MOS, which is based on type
+ */
+ if (zb->zb_objset == DMU_META_OBJSET &&
+ record->zi_objset == DMU_META_OBJSET &&
+ record->zi_object == DMU_META_DNODE_OBJECT) {
+ if (record->zi_type == DMU_OT_NONE ||
+ type == record->zi_type)
+ return (freq_triggered(record->zi_freq));
+ else
+ return (B_FALSE);
+ }
+
+ /*
+ * Check for an exact match.
+ */
+ if (zb->zb_objset == record->zi_objset &&
+ zb->zb_object == record->zi_object &&
+ zb->zb_level == record->zi_level &&
+ zb->zb_blkid >= record->zi_start &&
+ zb->zb_blkid <= record->zi_end &&
+ (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
+ error == record->zi_error) {
+ return (freq_triggered(record->zi_freq));
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Panic the system when a config change happens in the function
+ * specified by tag.
+ */
+void
+zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa)
+ continue;
+
+ if (handler->zi_record.zi_type == type &&
+ strcmp(tag, handler->zi_record.zi_func) == 0)
+ panic("Panic requested in function %s\n", tag);
+ }
+
+ rw_exit(&inject_lock);
+}
+
+/*
+ * Inject a decryption failure. Decryption failures can occur in
+ * both the ARC and the ZIO layers.
+ */
+int
+zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
+ uint64_t type, int error)
+{
+ int ret = 0;
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
+ continue;
+
+ if (zio_match_handler(zb, type, ZI_NO_DVA,
+ &handler->zi_record, error)) {
+ ret = error;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+ return (ret);
+}
+
+/*
+ * If this is a physical I/O for a vdev child, determine which DVA it is
+ * for. We iterate backwards through the DVAs matching on the offset so
+ * that we end up with ZI_NO_DVA (-1) if we don't find a match.
+ */
+static int
+zio_match_dva(zio_t *zio)
+{
+ int i = ZI_NO_DVA;
+
+ if (zio->io_bp != NULL && zio->io_vd != NULL &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
+ dva_t *dva = &zio->io_bp->blk_dva[i];
+ uint64_t off = DVA_GET_OFFSET(dva);
+ vdev_t *vd = vdev_lookup_top(zio->io_spa,
+ DVA_GET_VDEV(dva));
+
+ /* Compensate for vdev label added to leaves */
+ if (zio->io_vd->vdev_ops->vdev_op_leaf)
+ off += VDEV_LABEL_START_SIZE;
+
+ if (zio->io_vd == vd && zio->io_offset == off)
+ break;
+ }
+ }
+
+ return (i);
+}
+
+
+/*
+ * Determine if the I/O in question should return failure. Returns the errno
+ * to be returned to the caller.
+ */
+int
+zio_handle_fault_injection(zio_t *zio, int error)
+{
+ int ret = 0;
+ inject_handler_t *handler;
+
+ /*
+ * Ignore I/O not associated with any logical data.
+ */
+ if (zio->io_logical == NULL)
+ return (0);
+
+ /*
+ * Currently, we only support fault injection on reads.
+ */
+ if (zio->io_type != ZIO_TYPE_READ)
+ return (0);
+
+ /*
+ * A rebuild I/O has no checksum to verify.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+ if (zio->io_spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
+ continue;
+
+ /* If this handler matches, return the specified error */
+ if (zio_match_handler(&zio->io_logical->io_bookmark,
+ zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+ zio_match_dva(zio), &handler->zi_record, error)) {
+ ret = error;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+/*
+ * Determine if the zio is part of a label update and has an injection
+ * handler associated with that portion of the label. Currently, we
+ * allow error injection in either the nvlist or the uberblock region
+ * of the vdev label.
+ */
+int
+zio_handle_label_injection(zio_t *zio, int error)
+{
+ inject_handler_t *handler;
+ vdev_t *vd = zio->io_vd;
+ uint64_t offset = zio->io_offset;
+ int label;
+ int ret = 0;
+
+ if (offset >= VDEV_LABEL_START_SIZE &&
+ offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+ uint64_t start = handler->zi_record.zi_start;
+ uint64_t end = handler->zi_record.zi_end;
+
+ if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
+ continue;
+
+ /*
+ * The injection region is the relative offsets within a
+ * vdev label. We must determine the label which is being
+ * updated and adjust our region accordingly.
+ */
+ label = vdev_label_number(vd->vdev_psize, offset);
+ start = vdev_label_offset(vd->vdev_psize, label, start);
+ end = vdev_label_offset(vd->vdev_psize, label, end);
+
+ if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
+ (offset >= start && offset <= end)) {
+ ret = error;
+ break;
+ }
+ }
+ rw_exit(&inject_lock);
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+zio_inject_bitflip_cb(void *data, size_t len, void *private)
+{
+ zio_t *zio __maybe_unused = private;
+ uint8_t *buffer = data;
+ uint_t byte = spa_get_random(len);
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /* flip a single random bit in an abd data buffer */
+ buffer[byte] ^= 1 << spa_get_random(8);
+
+ return (1); /* stop after first flip */
+}
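
A standalone model of the bit-flip callback, for illustration only: rand() stands in for spa_get_random() and the buffer is a plain array rather than an abd.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Flip a single randomly chosen bit in a randomly chosen byte. */
static void
flip_one_bit(unsigned char *buf, size_t len)
{
	size_t byte = (size_t)rand() % len;
	buf[byte] ^= (unsigned char)(1 << (rand() % 8));
}

int
main(void)
{
	unsigned char buf[8] = { 0 };

	srand((unsigned)time(NULL));
	flip_one_bit(buf, sizeof (buf));
	for (size_t i = 0; i < sizeof (buf); i++)
		printf("%02x ", buf[i]);
	printf("\n");	/* exactly one bit is set */
	return (0);
}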
+
+static int
+zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
+{
+ inject_handler_t *handler;
+ int ret = 0;
+
+ /*
+ * We skip over faults in the labels unless it's during
+ * device open (i.e. zio == NULL).
+ */
+ if (zio != NULL) {
+ uint64_t offset = zio->io_offset;
+
+ if (offset < VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
+ return (0);
+ }
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
+ continue;
+
+ if (vd->vdev_guid == handler->zi_record.zi_guid) {
+ if (handler->zi_record.zi_failfast &&
+ (zio == NULL || (zio->io_flags &
+ (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
+ continue;
+ }
+
+ /* Handle type specific I/O failures */
+ if (zio != NULL &&
+ handler->zi_record.zi_iotype != ZIO_TYPES &&
+ handler->zi_record.zi_iotype != zio->io_type)
+ continue;
+
+ if (handler->zi_record.zi_error == err1 ||
+ handler->zi_record.zi_error == err2) {
+ /*
+ * limit error injection if requested
+ */
+ if (!freq_triggered(handler->zi_record.zi_freq))
+ continue;
+
+ /*
+ * For a failed open, pretend like the device
+ * has gone away.
+ */
+ if (err1 == ENXIO)
+ vd->vdev_stat.vs_aux =
+ VDEV_AUX_OPEN_FAILED;
+
+ /*
+ * Treat these errors as if they had been
+ * retried so that all the appropriate stats
+ * and FMA events are generated.
+ */
+ if (!handler->zi_record.zi_failfast &&
+ zio != NULL)
+ zio->io_flags |= ZIO_FLAG_IO_RETRY;
+
+ /*
+ * EILSEQ means flip a bit after a read
+ */
+ if (handler->zi_record.zi_error == EILSEQ) {
+ if (zio == NULL)
+ break;
+
+ /* locate buffer data and flip a bit */
+ (void) abd_iterate_func(zio->io_abd, 0,
+ zio->io_size, zio_inject_bitflip_cb,
+ zio);
+ break;
+ }
+
+ ret = handler->zi_record.zi_error;
+ break;
+ }
+ if (handler->zi_record.zi_error == ENXIO) {
+ ret = SET_ERROR(EIO);
+ break;
+ }
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+int
+zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
+{
+ return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
+}
+
+int
+zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
+{
+ return (zio_handle_device_injection_impl(vd, zio, err1, err2));
+}
+
+/*
+ * Simulate hardware that ignores cache flushes. For the requested number
+ * of seconds, skip the actual writing to disk.
+ */
+void
+zio_handle_ignored_writes(zio_t *zio)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ /* Ignore errors not destined for this pool */
+ if (zio->io_spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
+ continue;
+
+ /*
+ * Positive duration implies # of seconds, negative
+ * a number of txgs
+ */
+ if (handler->zi_record.zi_timer == 0) {
+ if (handler->zi_record.zi_duration > 0)
+ handler->zi_record.zi_timer = ddi_get_lbolt64();
+ else
+ handler->zi_record.zi_timer = zio->io_txg;
+ }
+
+ /* Have a "problem" writing 60% of the time */
+ if (spa_get_random(100) < 60)
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ break;
+ }
+
+ rw_exit(&inject_lock);
+}
+
+void
+spa_handle_ignored_writes(spa_t *spa)
+{
+ inject_handler_t *handler;
+
+ if (zio_injection_enabled == 0)
+ return;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
+ continue;
+
+ if (handler->zi_record.zi_duration > 0) {
+ VERIFY(handler->zi_record.zi_timer == 0 ||
+ ddi_time_after64(
+ (int64_t)handler->zi_record.zi_timer +
+ handler->zi_record.zi_duration * hz,
+ ddi_get_lbolt64()));
+ } else {
+ /* duration is negative so the subtraction here adds */
+ VERIFY(handler->zi_record.zi_timer == 0 ||
+ handler->zi_record.zi_timer -
+ handler->zi_record.zi_duration >=
+ spa_syncing_txg(spa));
+ }
+ }
+
+ rw_exit(&inject_lock);
+}
+
+hrtime_t
+zio_handle_io_delay(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ inject_handler_t *min_handler = NULL;
+ hrtime_t min_target = 0;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ /*
+ * inject_delay_count is a subset of zio_injection_enabled that
+ * is only incremented for delay handlers. These checks are
+ * mainly added to remind the reader why we're not explicitly
+ * checking zio_injection_enabled like the other functions.
+ */
+ IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
+ IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
+
+ /*
+ * If there aren't any inject delay handlers registered, then we
+ * can short circuit and simply return 0 here. A value of zero
+ * informs zio_delay_interrupt() that this request should not be
+ * delayed. This short circuit keeps us from acquiring the
+	 * inject_delay_mtx unnecessarily.
+ */
+ if (inject_delay_count == 0) {
+ rw_exit(&inject_lock);
+ return (0);
+ }
+
+ /*
+ * Each inject handler has a number of "lanes" associated with
+ * it. Each lane is able to handle requests independently of one
+ * another, and at a latency defined by the inject handler
+	 * record's zi_timer field. Thus if a handler is configured with
+ * a single lane with a 10ms latency, it will delay requests
+ * such that only a single request is completed every 10ms. So,
+ * if more than one request is attempted per each 10ms interval,
+ * the average latency of the requests will be greater than
+ * 10ms; but if only a single request is submitted each 10ms
+ * interval the average latency will be 10ms.
+ *
+ * We need to acquire this mutex to prevent multiple concurrent
+ * threads being assigned to the same lane of a given inject
+ * handler. The mutex allows us to perform the following two
+ * operations atomically:
+ *
+ * 1. determine the minimum handler and minimum target
+ * value of all the possible handlers
+ * 2. update that minimum handler's lane array
+ *
+ * Without atomicity, two (or more) threads could pick the same
+ * lane in step (1), and then conflict with each other in step
+ * (2). This could allow a single lane handler to process
+ * multiple requests simultaneously, which shouldn't be possible.
+ */
+ mutex_enter(&inject_delay_mtx);
+
+ for (inject_handler_t *handler = list_head(&inject_handlers);
+ handler != NULL; handler = list_next(&inject_handlers, handler)) {
+ if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
+ continue;
+
+ if (!freq_triggered(handler->zi_record.zi_freq))
+ continue;
+
+ if (vd->vdev_guid != handler->zi_record.zi_guid)
+ continue;
+
+ /*
+ * Defensive; should never happen as the array allocation
+ * occurs prior to inserting this handler on the list.
+ */
+ ASSERT3P(handler->zi_lanes, !=, NULL);
+
+ /*
+ * This should never happen, the zinject command should
+ * prevent a user from setting an IO delay with zero lanes.
+ */
+ ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
+
+ ASSERT3U(handler->zi_record.zi_nlanes, >,
+ handler->zi_next_lane);
+
+ /*
+ * We want to issue this IO to the lane that will become
+ * idle the soonest, so we compare the soonest this
+ * specific handler can complete the IO with all other
+ * handlers, to find the lowest value of all possible
+ * lanes. We then use this lane to submit the request.
+ *
+ * Since each handler has a constant value for its
+ * delay, we can just use the "next" lane for that
+ * handler; as it will always be the lane with the
+ * lowest value for that particular handler (i.e. the
+ * lane that will become idle the soonest). This saves a
+ * scan of each handler's lanes array.
+ *
+ * There's two cases to consider when determining when
+ * this specific IO request should complete. If this
+ * lane is idle, we want to "submit" the request now so
+ * it will complete after zi_timer milliseconds. Thus,
+ * we set the target to now + zi_timer.
+ *
+ * If the lane is busy, we want this request to complete
+ * zi_timer milliseconds after the lane becomes idle.
+ * Since the 'zi_lanes' array holds the time at which
+ * each lane will become idle, we use that value to
+ * determine when this request should complete.
+ */
+ hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
+ hrtime_t busy = handler->zi_record.zi_timer +
+ handler->zi_lanes[handler->zi_next_lane];
+ hrtime_t target = MAX(idle, busy);
+
+ if (min_handler == NULL) {
+ min_handler = handler;
+ min_target = target;
+ continue;
+ }
+
+ ASSERT3P(min_handler, !=, NULL);
+ ASSERT3U(min_target, !=, 0);
+
+ /*
+ * We don't yet increment the "next lane" variable since
+ * we still might find a lower value lane in another
+ * handler during any remaining iterations. Once we're
+ * sure we've selected the absolute minimum, we'll claim
+ * the lane and increment the handler's "next lane"
+ * field below.
+ */
+
+ if (target < min_target) {
+ min_handler = handler;
+ min_target = target;
+ }
+ }
+
+ /*
+ * 'min_handler' will be NULL if no IO delays are registered for
+ * this vdev, otherwise it will point to the handler containing
+ * the lane that will become idle the soonest.
+ */
+ if (min_handler != NULL) {
+ ASSERT3U(min_target, !=, 0);
+ min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
+
+ /*
+ * If we've used all possible lanes for this handler,
+ * loop back and start using the first lane again;
+ * otherwise, just increment the lane index.
+ */
+ min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
+ min_handler->zi_record.zi_nlanes;
+ }
+
+ mutex_exit(&inject_delay_mtx);
+ rw_exit(&inject_lock);
+
+ return (min_target);
+}
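
The lane bookkeeping described in the comments above can be modeled in a few lines of standalone C. This sketch covers a single handler only (the kernel code additionally picks the minimum target across all matching handlers); handler_t and the plain integer timestamps are simplifications for the demo, not ZFS types.

#include <stdio.h>
#include <stdint.h>

#define NLANES 2

typedef struct {
	int64_t delay;		/* per-request latency (zi_timer analogue) */
	int64_t lanes[NLANES];	/* time at which each lane becomes idle */
	int next_lane;
} handler_t;

/* Pick the completion target for one request and claim the lane. */
static int64_t
submit(handler_t *h, int64_t now)
{
	int64_t idle = now + h->delay;
	int64_t busy = h->lanes[h->next_lane] + h->delay;
	int64_t target = (idle > busy) ? idle : busy;

	h->lanes[h->next_lane] = target;
	h->next_lane = (h->next_lane + 1) % NLANES;
	return (target);
}

int
main(void)
{
	handler_t h = { .delay = 10, .lanes = { 0, 0 }, .next_lane = 0 };

	/*
	 * Three back-to-back requests at t=0: two lanes finish at t=10,
	 * the third must wait for a lane and finishes at t=20.
	 */
	for (int i = 0; i < 3; i++)
		printf("request %d completes at t=%lld\n", i,
		    (long long)submit(&h, 0));
	return (0);
}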
+
+static int
+zio_calculate_range(const char *pool, zinject_record_t *record)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ objset_t *os = NULL;
+ dnode_t *dn = NULL;
+ int error;
+
+ /*
+ * Obtain the dnode for object using pool, objset, and object
+ */
+ error = dsl_pool_hold(pool, FTAG, &dp);
+ if (error)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
+ dsl_pool_rele(dp, FTAG);
+ if (error)
+ return (error);
+
+ error = dmu_objset_from_ds(ds, &os);
+ dsl_dataset_rele(ds, FTAG);
+ if (error)
+ return (error);
+
+ error = dnode_hold(os, record->zi_object, FTAG, &dn);
+ if (error)
+ return (error);
+
+ /*
+ * Translate the range into block IDs
+ */
+ if (record->zi_start != 0 || record->zi_end != -1ULL) {
+ record->zi_start >>= dn->dn_datablkshift;
+ record->zi_end >>= dn->dn_datablkshift;
+ }
+ if (record->zi_level > 0) {
+ if (record->zi_level >= dn->dn_nlevels) {
+ dnode_rele(dn, FTAG);
+ return (SET_ERROR(EDOM));
+ }
+
+ if (record->zi_start != 0 || record->zi_end != 0) {
+ int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ for (int level = record->zi_level; level > 0; level--) {
+ record->zi_start >>= shift;
+ record->zi_end >>= shift;
+ }
+ }
+ }
+
+ dnode_rele(dn, FTAG);
+ return (0);
+}
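
A standalone model of the byte-range translation above, for illustration only. The shift values (17 for a 128K data block, and 17 - 7 = 10 block pointers per 128K indirect block, assuming 128-byte block pointers) are example assumptions rather than values read from any pool.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t start = 1 << 20, end = 4 << 20;	/* byte range 1M-4M */
	int datablkshift = 17;		/* 128K data blocks (assumed) */
	int indshift = 17 - 7;		/* blkptrs per indirect block (assumed) */
	int level = 1;			/* inject at L1 indirect blocks */

	/* bytes -> L0 block IDs */
	start >>= datablkshift;
	end >>= datablkshift;

	/* L0 block IDs -> block IDs at the requested indirect level */
	for (int l = level; l > 0; l--) {
		start >>= indshift;
		end >>= indshift;
	}
	/* the whole 1M-4M range falls under L1 block 0 in this example */
	printf("L%d blkid range: %llu-%llu\n", level,
	    (unsigned long long)start, (unsigned long long)end);
	return (0);
}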
+
+/*
+ * Create a new handler for the given record. We add it to the list, adding
+ * a reference to the spa_t in the process. We increment zio_injection_enabled,
+ * which is the switch to trigger all fault injection.
+ */
+int
+zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int error;
+ spa_t *spa;
+
+ /*
+ * If this is pool-wide metadata, make sure we unload the corresponding
+ * spa_t, so that the next attempt to load it will trigger the fault.
+ * We call spa_reset() to unload the pool appropriately.
+ */
+ if (flags & ZINJECT_UNLOAD_SPA)
+ if ((error = spa_reset(name)) != 0)
+ return (error);
+
+ if (record->zi_cmd == ZINJECT_DELAY_IO) {
+ /*
+ * A value of zero for the number of lanes or for the
+ * delay time doesn't make sense.
+ */
+ if (record->zi_timer == 0 || record->zi_nlanes == 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The number of lanes is directly mapped to the size of
+ * an array used by the handler. Thus, to ensure the
+ * user doesn't trigger an allocation that's "too large"
+ * we cap the number of lanes here.
+ */
+ if (record->zi_nlanes >= UINT16_MAX)
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If the supplied range was in bytes -- calculate the actual blkid
+ */
+ if (flags & ZINJECT_CALC_RANGE) {
+ error = zio_calculate_range(name, record);
+ if (error != 0)
+ return (error);
+ }
+
+ if (!(flags & ZINJECT_NULL)) {
+ /*
+		 * spa_inject_addref() will add an injection reference, which will
+ * prevent the pool from being removed from the namespace while
+ * still allowing it to be unloaded.
+ */
+ if ((spa = spa_inject_addref(name)) == NULL)
+ return (SET_ERROR(ENOENT));
+
+ handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
+
+ handler->zi_spa = spa;
+ handler->zi_record = *record;
+
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ handler->zi_lanes = kmem_zalloc(
+ sizeof (*handler->zi_lanes) *
+ handler->zi_record.zi_nlanes, KM_SLEEP);
+ handler->zi_next_lane = 0;
+ } else {
+ handler->zi_lanes = NULL;
+ handler->zi_next_lane = 0;
+ }
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ /*
+ * We can't move this increment into the conditional
+ * above because we need to hold the RW_WRITER lock of
+ * inject_lock, and we don't want to hold that while
+ * allocating the handler's zi_lanes array.
+ */
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3S(inject_delay_count, >=, 0);
+ inject_delay_count++;
+ ASSERT3S(inject_delay_count, >, 0);
+ }
+
+ *id = handler->zi_id = inject_next_id++;
+ list_insert_tail(&inject_handlers, handler);
+ atomic_inc_32(&zio_injection_enabled);
+
+ rw_exit(&inject_lock);
+ }
+
+ /*
+ * Flush the ARC, so that any attempts to read this data will end up
+ * going to the ZIO layer. Note that this is a little overkill, but
+ * we don't have the necessary ARC interfaces to do anything else, and
+ * fault injection isn't a performance critical path.
+ */
+ if (flags & ZINJECT_FLUSH_ARC)
+ /*
+ * We must use FALSE to ensure arc_flush returns, since
+ * we're not preventing concurrent ARC insertions.
+ */
+ arc_flush(NULL, FALSE);
+
+ return (0);
+}
+
+/*
+ * Returns the next record with an ID greater than that supplied to the
+ * function. Used to iterate over all handlers in the system.
+ */
+int
+zio_inject_list_next(int *id, char *name, size_t buflen,
+ zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int ret;
+
+ mutex_enter(&spa_namespace_lock);
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id > *id)
+ break;
+
+ if (handler) {
+ *record = handler->zi_record;
+ *id = handler->zi_id;
+ (void) strncpy(name, spa_name(handler->zi_spa), buflen);
+ ret = 0;
+ } else {
+ ret = SET_ERROR(ENOENT);
+ }
+
+ rw_exit(&inject_lock);
+ mutex_exit(&spa_namespace_lock);
+
+ return (ret);
+}
+
+/*
+ * Clear the fault handler with the given identifier, or return ENOENT if none
+ * exists.
+ */
+int
+zio_clear_fault(int id)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id == id)
+ break;
+
+ if (handler == NULL) {
+ rw_exit(&inject_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3S(inject_delay_count, >, 0);
+ inject_delay_count--;
+ ASSERT3S(inject_delay_count, >=, 0);
+ }
+
+ list_remove(&inject_handlers, handler);
+ rw_exit(&inject_lock);
+
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3P(handler->zi_lanes, !=, NULL);
+ kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
+ handler->zi_record.zi_nlanes);
+ } else {
+ ASSERT3P(handler->zi_lanes, ==, NULL);
+ }
+
+ spa_inject_delref(handler->zi_spa);
+ kmem_free(handler, sizeof (inject_handler_t));
+ atomic_dec_32(&zio_injection_enabled);
+
+ return (0);
+}
+
+void
+zio_inject_init(void)
+{
+ rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&inject_handlers, sizeof (inject_handler_t),
+ offsetof(inject_handler_t, zi_link));
+}
+
+void
+zio_inject_fini(void)
+{
+ list_destroy(&inject_handlers);
+ mutex_destroy(&inject_delay_mtx);
+ rw_destroy(&inject_lock);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zio_injection_enabled);
+EXPORT_SYMBOL(zio_inject_fault);
+EXPORT_SYMBOL(zio_inject_list_next);
+EXPORT_SYMBOL(zio_clear_fault);
+EXPORT_SYMBOL(zio_handle_fault_injection);
+EXPORT_SYMBOL(zio_handle_device_injection);
+EXPORT_SYMBOL(zio_handle_label_injection);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zle.c b/sys/contrib/openzfs/module/zfs/zle.c
new file mode 100644
index 000000000000..0decebb13ca7
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zle.c
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Zero-length encoding. This is a fast and simple algorithm to eliminate
+ * runs of zeroes. Each chunk of compressed data begins with a length byte, b.
+ * If b < n (where n is the compression parameter) then the next b + 1 bytes
+ * are literal values. If b >= n then the next (b - n + 1) bytes are zero.
+ */
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/zio_compress.h>
+
+size_t
+zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end - 1) {
+ uchar_t *first = src;
+ uchar_t *len = dst++;
+ if (src[0] == 0) {
+ uchar_t *last = src + (256 - n);
+ while (src < MIN(last, s_end) && src[0] == 0)
+ src++;
+ *len = src - first - 1 + n;
+ } else {
+ uchar_t *last = src + n;
+ if (d_end - dst < n)
+ break;
+ while (src < MIN(last, s_end) - 1 && (src[0] | src[1]))
+ *dst++ = *src++;
+ if (src[0])
+ *dst++ = *src++;
+ *len = src - first - 1;
+ }
+ }
+ return (src == s_end ? dst - (uchar_t *)d_start : s_len);
+}
+
+int
+zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end) {
+ int len = 1 + *src++;
+ if (len <= n) {
+ if (src + len > s_end || dst + len > d_end)
+ return (-1);
+ while (len-- != 0)
+ *dst++ = *src++;
+ } else {
+ len -= n;
+ if (dst + len > d_end)
+ return (-1);
+ while (len-- != 0)
+ *dst++ = 0;
+ }
+ }
+ return (dst == d_end ? 0 : -1);
+}
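
To illustrate the length-byte scheme, here is a standalone decoder fed a hand-built buffer. The parameter n = 64 matches the level ZFS is believed to pass for ZLE elsewhere, but that value is an assumption here; the kernel functions above remain the authoritative implementation.

#include <stdio.h>
#include <string.h>

#define N 64

/* Decode: b < N means "b + 1 literals follow"; b >= N means "b - N + 1 zeros". */
static size_t
zle_decode(const unsigned char *src, size_t s_len, unsigned char *dst)
{
	size_t di = 0;
	for (size_t si = 0; si < s_len; ) {
		int len = 1 + src[si++];
		if (len <= N) {			/* literal run */
			memcpy(&dst[di], &src[si], len);
			si += len;
			di += len;
		} else {			/* zero run */
			memset(&dst[di], 0, len - N);
			di += len - N;
		}
	}
	return (di);
}

int
main(void)
{
	/* "ab", then 5 zeros (length byte 64 + 5 - 1 = 68), then "c" */
	const unsigned char comp[] = { 1, 'a', 'b', 68, 0, 'c' };
	unsigned char out[32];
	size_t n = zle_decode(comp, sizeof (comp), out);

	for (size_t i = 0; i < n; i++)
		printf("%02x ", out[i]);
	printf("\n");	/* expect: 61 62 00 00 00 00 00 63 */
	return (0);
}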
diff --git a/sys/contrib/openzfs/module/zfs/zrlock.c b/sys/contrib/openzfs/module/zfs/zrlock.c
new file mode 100644
index 000000000000..a4def6053622
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zrlock.c
@@ -0,0 +1,188 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 The MathWorks, Inc. All rights reserved.
+ */
+
+/*
+ * A Zero Reference Lock (ZRL) is a reference count that can lock out new
+ * references only when the count is zero and only without waiting if the count
+ * is not already zero. It is similar to a read-write lock in that it allows
+ * multiple readers and only a single writer, but it does not allow a writer to
+ * block while waiting for readers to exit, and therefore the question of
+ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
+ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
+ * is perfectly safe for the same reader to acquire the same lock multiple
+ * times. The fact that a ZRL is reentrant for readers (through multiple calls
+ * to zrl_add()) makes it convenient for determining whether something is
+ * actively referenced without the fuss of flagging lock ownership across
+ * function calls.
+ */
+#include <sys/zrlock.h>
+#include <sys/trace_zfs.h>
+
+/*
+ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
+ * treated as zero references.
+ */
+#define ZRL_LOCKED -1
+#define ZRL_DESTROYED -2
+
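A minimal userland model of the zero-reference-lock semantics using C11 atomics, for illustration only. It mirrors the add/remove/tryenter behavior described above but omits the mutex/condvar sleep path that zrl_add_impl() uses when the lock is held; all model_* names are hypothetical.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MODEL_LOCKED -1

typedef struct { atomic_int refcount; } model_zrl_t;

/* Add a reference; returns false where the kernel version would sleep. */
static bool
model_zrl_add(model_zrl_t *z)
{
	int n = atomic_load(&z->refcount);
	while (n != MODEL_LOCKED) {
		if (atomic_compare_exchange_weak(&z->refcount, &n, n + 1))
			return (true);
	}
	return (false);
}

static void
model_zrl_remove(model_zrl_t *z)
{
	atomic_fetch_sub(&z->refcount, 1);
}

/* Lock out new references, but only if the count is exactly zero. */
static bool
model_zrl_tryenter(model_zrl_t *z)
{
	int expected = 0;
	return (atomic_compare_exchange_strong(&z->refcount, &expected,
	    MODEL_LOCKED));
}

int
main(void)
{
	model_zrl_t z = { 0 };

	(void) model_zrl_add(&z);
	printf("tryenter with 1 ref: %d\n", model_zrl_tryenter(&z));	/* 0 */
	model_zrl_remove(&z);
	printf("tryenter with 0 refs: %d\n", model_zrl_tryenter(&z));	/* 1 */
	printf("add while locked: %d\n", model_zrl_add(&z));		/* 0 */
	return (0);
}
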
+void
+zrl_init(zrlock_t *zrl)
+{
+ mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ zrl->zr_refcount = 0;
+ cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL);
+#ifdef ZFS_DEBUG
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+#endif
+}
+
+void
+zrl_destroy(zrlock_t *zrl)
+{
+ ASSERT0(zrl->zr_refcount);
+
+ mutex_destroy(&zrl->zr_mtx);
+ zrl->zr_refcount = ZRL_DESTROYED;
+ cv_destroy(&zrl->zr_cv);
+}
+
+void
+zrl_add_impl(zrlock_t *zrl, const char *zc)
+{
+ for (;;) {
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+ while (n != ZRL_LOCKED) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, n, n + 1);
+ if (cas == n) {
+ ASSERT3S((int32_t)n, >=, 0);
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ DTRACE_PROBE3(zrlock__reentry,
+ zrlock_t *, zrl,
+ kthread_t *, curthread,
+ uint32_t, n);
+ }
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
+#endif
+ return;
+ }
+ n = cas;
+ }
+
+ mutex_enter(&zrl->zr_mtx);
+ while (zrl->zr_refcount == ZRL_LOCKED) {
+ cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+ }
+ mutex_exit(&zrl->zr_mtx);
+ }
+}
+
+void
+zrl_remove(zrlock_t *zrl)
+{
+ uint32_t n;
+
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+ }
+#endif
+ n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+ ASSERT3S((int32_t)n, >=, 0);
+}
+
+int
+zrl_tryenter(zrlock_t *zrl)
+{
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+
+ if (n == 0) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
+ if (cas == 0) {
+#ifdef ZFS_DEBUG
+ ASSERT3P(zrl->zr_owner, ==, NULL);
+ zrl->zr_owner = curthread;
+#endif
+ return (1);
+ }
+ }
+
+ ASSERT3S((int32_t)n, >, ZRL_DESTROYED);
+
+ return (0);
+}
+
+void
+zrl_exit(zrlock_t *zrl)
+{
+ ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED);
+
+ mutex_enter(&zrl->zr_mtx);
+#ifdef ZFS_DEBUG
+ ASSERT3P(zrl->zr_owner, ==, curthread);
+ zrl->zr_owner = NULL;
+ membar_producer(); /* make sure the owner store happens first */
+#endif
+ zrl->zr_refcount = 0;
+ cv_broadcast(&zrl->zr_cv);
+ mutex_exit(&zrl->zr_mtx);
+}
+
+int
+zrl_is_zero(zrlock_t *zrl)
+{
+ ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
+
+ return (zrl->zr_refcount <= 0);
+}
+
+int
+zrl_is_locked(zrlock_t *zrl)
+{
+ ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
+
+ return (zrl->zr_refcount == ZRL_LOCKED);
+}
+
+#ifdef ZFS_DEBUG
+kthread_t *
+zrl_owner(zrlock_t *zrl)
+{
+ return (zrl->zr_owner);
+}
+#endif
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(zrl_add_impl);
+EXPORT_SYMBOL(zrl_remove);
+
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zthr.c b/sys/contrib/openzfs/module/zfs/zthr.c
new file mode 100644
index 000000000000..5ac2e30467e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zthr.c
@@ -0,0 +1,536 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, 2020 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZTHR Infrastructure
+ * ===================
+ *
+ * ZTHR threads are used for isolated operations that span multiple txgs
+ * within a SPA. They generally exist from SPA creation/loading and until
+ * the SPA is exported/destroyed. The ideal requirements for an operation
+ * to be modeled with a zthr are the following:
+ *
+ * 1] The operation needs to run over multiple txgs.
+ * 2] There is a single point of reference in memory or on disk that
+ * indicates whether the operation should run/is running or has
+ * stopped.
+ *
+ * If the operation satisfies the above then the following rules guarantee
+ * a certain level of correctness:
+ *
+ * 1] Any thread EXCEPT the zthr changes the work indicator from stopped
+ * to running but not the opposite.
+ * 2] Only the zthr can change the work indicator from running to stopped
+ * (e.g. when it is done) but not the opposite.
+ *
+ * This way a normal zthr cycle should go like this:
+ *
+ * 1] An external thread changes the work indicator from stopped to
+ * running and wakes up the zthr.
+ * 2] The zthr wakes up, checks the indicator and starts working.
+ * 3] When the zthr is done, it changes the indicator to stopped, allowing
+ * a new cycle to start.
+ *
+ * Besides being awakened by other threads, a zthr can be configured
+ * during creation to wake up on its own after a specified interval
+ * [see zthr_create_timer()].
+ *
+ * Note: ZTHR threads are NOT a replacement for generic threads! Please
+ * ensure that they fit your use-case well before using them.
+ *
+ * == ZTHR creation
+ *
+ * Every zthr needs four inputs to start running:
+ *
+ * 1] A user-defined checker function (checkfunc) that decides whether
+ * the zthr should start working or go to sleep. The function should
+ * return TRUE when the zthr needs to work or FALSE to let it sleep,
+ * and should adhere to the following signature:
+ * boolean_t checkfunc_name(void *args, zthr_t *t);
+ *
+ * 2] A user-defined ZTHR function (func) which the zthr executes when
+ * it is not sleeping. The function should adhere to the following
+ * signature type:
+ * void func_name(void *args, zthr_t *t);
+ *
+ * 3] A void args pointer that will be passed to checkfunc and func
+ * implicitly by the infrastructure.
+ *
+ * 4] A name for the thread. This string must be valid for the lifetime
+ * of the zthr.
+ *
+ * The reason why the above API needs two different functions,
+ * instead of one that both checks and does the work, has to do with
+ * the zthr's internal state lock (zthr_state_lock) and the allowed
+ * cancellation windows. We want to hold the zthr_state_lock while
+ * running checkfunc but not while running func. This way the zthr
+ * can be cancelled while doing work and not while checking for work.
+ *
+ * To start a zthr:
+ * zthr_t *zthr_pointer = zthr_create(checkfunc, func, args);
+ * or
+ * zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func,
+ * args, max_sleep);
+ *
+ * After that you should be able to wakeup, cancel, and resume the
+ * zthr from another thread using the zthr_pointer.
+ *
+ * NOTE: ZTHR threads could potentially wake up spuriously and the
+ * user should take this into account when writing a checkfunc.
+ * [see ZTHR state transitions]
+ *
+ * == ZTHR wakeup
+ *
+ * ZTHR wakeup should be used when new work is added for the zthr. The
+ * sleeping zthr will wakeup, see that it has more work to complete
+ * and proceed. This can be invoked from open or syncing context.
+ *
+ * To wakeup a zthr:
+ * zthr_wakeup(zthr_t *t)
+ *
+ * == ZTHR cancellation and resumption
+ *
+ * ZTHR threads must be cancelled when their SPA is being exported
+ * or when they need to be paused so they don't interfere with other
+ * operations.
+ *
+ * To cancel a zthr:
+ * zthr_cancel(zthr_pointer);
+ *
+ * To resume it:
+ * zthr_resume(zthr_pointer);
+ *
+ * ZTHR cancel and resume should be invoked in open context during the
+ * lifecycle of the pool as it is imported, exported or destroyed.
+ *
+ * A zthr will implicitly check if it has received a cancellation
+ * signal every time func returns and every time it wakes up [see
+ * ZTHR state transitions below].
+ *
+ * At times, waiting for the zthr's func to finish its job may take
+ * time. This may be very time-consuming for some operations that
+ * need to cancel the SPA's zthrs (e.g spa_export). For this scenario
+ * the user can explicitly make their ZTHR function aware of incoming
+ * cancellation signals using zthr_iscancelled(). A common pattern for
+ * that looks like this:
+ *
+ * int
+ * func_name(void *args, zthr_t *t)
+ * {
+ * ... <unpack args> ...
+ * while (!work_done && !zthr_iscancelled(t)) {
+ * ... <do more work> ...
+ * }
+ * }
+ *
+ * == ZTHR cleanup
+ *
+ * Cancelling a zthr doesn't clean up its metadata (internal locks,
+ * function pointers to func and checkfunc, etc..). This is because
+ * we want to keep them around in case we want to resume the execution
+ * of the zthr later. Similarly for zthrs that exit themselves.
+ *
+ * To completely cleanup a zthr, cancel it first to ensure that it
+ * is not running and then use zthr_destroy().
+ *
+ * == ZTHR state transitions
+ *
+ * zthr creation
+ * +
+ * |
+ * | woke up
+ * | +--------------+ sleep
+ * | | ^
+ * | | |
+ * | | | FALSE
+ * | | |
+ * v v FALSE +
+ * cancelled? +---------> checkfunc?
+ * + ^ +
+ * | | |
+ * | | | TRUE
+ * | | |
+ * | | func returned v
+ * | +---------------+ func
+ * |
+ * | TRUE
+ * |
+ * v
+ * zthr stopped running
+ *
+ * == Implementation of ZTHR requests
+ *
+ * ZTHR cancel and resume are requests on a zthr to change its
+ * internal state. These requests are serialized using the
+ * zthr_request_lock, while changes in its internal state are
+ * protected by the zthr_state_lock. A request will first acquire
+ * the zthr_request_lock and then immediately acquire the
+ * zthr_state_lock. We do this so that incoming requests are
+ * serialized using the request lock, while still allowing us
+ * to use the state lock for thread communication via zthr_cv.
+ *
+ * ZTHR wakeup broadcasts to zthr_cv, causing sleeping threads
+ * to wakeup. It acquires the zthr_state_lock but not the
+ * zthr_request_lock, so that a wakeup on a zthr in the middle
+ * of being cancelled will not block.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zthr.h>
+
+struct zthr {
+ /* running thread doing the work */
+ kthread_t *zthr_thread;
+
+ /* lock protecting internal data & invariants */
+ kmutex_t zthr_state_lock;
+
+ /* mutex that serializes external requests */
+ kmutex_t zthr_request_lock;
+
+ /* notification mechanism for requests */
+ kcondvar_t zthr_cv;
+
+ /* flag set to true if we are canceling the zthr */
+ boolean_t zthr_cancel;
+
+ /* flag set to true if we are waiting for the zthr to finish */
+ boolean_t zthr_haswaiters;
+ kcondvar_t zthr_wait_cv;
+ /*
+	 * maximum amount of time that the zthr spends sleeping;
+ * if this is 0, the thread doesn't wake up until it gets
+ * signaled.
+ */
+ hrtime_t zthr_sleep_timeout;
+
+ /* consumer-provided callbacks & data */
+ zthr_checkfunc_t *zthr_checkfunc;
+ zthr_func_t *zthr_func;
+ void *zthr_arg;
+ const char *zthr_name;
+};
+
+static void
+zthr_procedure(void *arg)
+{
+ zthr_t *t = arg;
+
+ mutex_enter(&t->zthr_state_lock);
+ ASSERT3P(t->zthr_thread, ==, curthread);
+
+ while (!t->zthr_cancel) {
+ if (t->zthr_checkfunc(t->zthr_arg, t)) {
+ mutex_exit(&t->zthr_state_lock);
+ t->zthr_func(t->zthr_arg, t);
+ mutex_enter(&t->zthr_state_lock);
+ } else {
+ if (t->zthr_sleep_timeout == 0) {
+ cv_wait_idle(&t->zthr_cv, &t->zthr_state_lock);
+ } else {
+ (void) cv_timedwait_idle_hires(&t->zthr_cv,
+ &t->zthr_state_lock, t->zthr_sleep_timeout,
+ MSEC2NSEC(1), 0);
+ }
+ }
+ if (t->zthr_haswaiters) {
+ t->zthr_haswaiters = B_FALSE;
+ cv_broadcast(&t->zthr_wait_cv);
+ }
+ }
+
+ /*
+ * Clear out the kernel thread metadata and notify the
+ * zthr_cancel() thread that we've stopped running.
+ */
+ t->zthr_thread = NULL;
+ t->zthr_cancel = B_FALSE;
+ cv_broadcast(&t->zthr_cv);
+
+ mutex_exit(&t->zthr_state_lock);
+ thread_exit();
+}
+
+zthr_t *
+zthr_create(const char *zthr_name, zthr_checkfunc_t *checkfunc,
+ zthr_func_t *func, void *arg)
+{
+ return (zthr_create_timer(zthr_name, checkfunc,
+ func, arg, (hrtime_t)0));
+}
+
+/*
+ * Create a zthr with a specified maximum sleep time. If the time spent
+ * sleeping exceeds max_sleep, a wakeup is triggered (the check is done and
+ * work starts if required).
+ */
+zthr_t *
+zthr_create_timer(const char *zthr_name, zthr_checkfunc_t *checkfunc,
+ zthr_func_t *func, void *arg, hrtime_t max_sleep)
+{
+ zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP);
+ mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&t->zthr_wait_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_enter(&t->zthr_state_lock);
+ t->zthr_checkfunc = checkfunc;
+ t->zthr_func = func;
+ t->zthr_arg = arg;
+ t->zthr_sleep_timeout = max_sleep;
+ t->zthr_name = zthr_name;
+
+ t->zthr_thread = thread_create_named(zthr_name, NULL, 0,
+ zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri);
+
+ mutex_exit(&t->zthr_state_lock);
+
+ return (t);
+}
+
+void
+zthr_destroy(zthr_t *t)
+{
+ ASSERT(!MUTEX_HELD(&t->zthr_state_lock));
+ ASSERT(!MUTEX_HELD(&t->zthr_request_lock));
+ VERIFY3P(t->zthr_thread, ==, NULL);
+ mutex_destroy(&t->zthr_request_lock);
+ mutex_destroy(&t->zthr_state_lock);
+ cv_destroy(&t->zthr_cv);
+ cv_destroy(&t->zthr_wait_cv);
+ kmem_free(t, sizeof (*t));
+}
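
A hedged sketch of how a consumer might wire up the API above. Only the zthr_* calls and the callback signatures come from this file; the example_* names, the work list, and the locking (omitted) are illustrative assumptions, and list_is_empty() is assumed to be available via the usual ZFS list API.

#include <sys/zfs_context.h>
#include <sys/zthr.h>

typedef struct example_state {
	list_t	es_work;	/* work indicator: non-empty => keep running */
	zthr_t	*es_zthr;
} example_state_t;

/* checkfunc: returning TRUE makes zthr_procedure() call example_func() */
static boolean_t
example_check(void *arg, zthr_t *t)
{
	example_state_t *es = arg;
	return (!list_is_empty(&es->es_work));
}

/* func: drain the work list, bailing out early if a cancel is pending */
static void
example_func(void *arg, zthr_t *t)
{
	example_state_t *es = arg;

	while (!list_is_empty(&es->es_work) && !zthr_iscancelled(t)) {
		/* ... remove one item and process it (locking omitted) ... */
	}
}

/* typically called at pool load */
static void
example_start(example_state_t *es)
{
	es->es_zthr = zthr_create("z_example", example_check,
	    example_func, es);
}

/* typically called at pool export: stop the thread, then free its metadata */
static void
example_stop(example_state_t *es)
{
	zthr_cancel(es->es_zthr);
	zthr_destroy(es->es_zthr);
	es->es_zthr = NULL;
}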
+
+/*
+ * Wake up the zthr if it is sleeping. If the thread has been cancelled
+ * or is in the process of being cancelled, this is a no-op.
+ */
+void
+zthr_wakeup(zthr_t *t)
+{
+ mutex_enter(&t->zthr_state_lock);
+
+ /*
+ * There are 5 states that we can find the zthr when issuing
+ * this broadcast:
+ *
+ * [1] The common case of the thread being asleep, at which
+ * point the broadcast will wake it up.
+ * [2] The thread has been cancelled. Waking up a cancelled
+ * thread is a no-op. Any work that is still left to be
+ * done should be handled the next time the thread is
+ * resumed.
+ * [3] The thread is doing work and is already up, so this
+ * is basically a no-op.
+ * [4] The thread was just created/resumed, in which case the
+ * behavior is similar to [3].
+ * [5] The thread is in the middle of being cancelled, which
+ * will be a no-op.
+ */
+ cv_broadcast(&t->zthr_cv);
+
+ mutex_exit(&t->zthr_state_lock);
+}
+
+/*
+ * Sends a cancel request to the zthr and blocks until the zthr is
+ * cancelled. If the zthr is not running (e.g. has been cancelled
+ * already), this is a no-op. Note that this function should not be
+ * called from syncing context as it could deadlock with the zthr_func.
+ */
+void
+zthr_cancel(zthr_t *t)
+{
+ mutex_enter(&t->zthr_request_lock);
+ mutex_enter(&t->zthr_state_lock);
+
+ /*
+ * Since we are holding the zthr_state_lock at this point
+ * we can find the state in one of the following 4 states:
+ *
+ * [1] The thread has already been cancelled, therefore
+ * there is nothing for us to do.
+ * [2] The thread is sleeping so we set the flag, broadcast
+ * the CV and wait for it to exit.
+ * [3] The thread is doing work, in which case we just set
+ * the flag and wait for it to finish.
+ * [4] The thread was just created/resumed, in which case
+ * the behavior is similar to [3].
+ *
+ * Since requests are serialized, by the time that we get
+ * control back we expect that the zthr is cancelled and
+ * not running anymore.
+ */
+ if (t->zthr_thread != NULL) {
+ t->zthr_cancel = B_TRUE;
+
+ /* broadcast in case the zthr is sleeping */
+ cv_broadcast(&t->zthr_cv);
+
+ while (t->zthr_thread != NULL)
+ cv_wait(&t->zthr_cv, &t->zthr_state_lock);
+
+ ASSERT(!t->zthr_cancel);
+ }
+
+ mutex_exit(&t->zthr_state_lock);
+ mutex_exit(&t->zthr_request_lock);
+}
+
+/*
+ * Sends a resume request to the supplied zthr. If the zthr is already
+ * running this is a no-op. Note that this function should not be
+ * called from syncing context as it could deadlock with the zthr_func.
+ */
+void
+zthr_resume(zthr_t *t)
+{
+ mutex_enter(&t->zthr_request_lock);
+ mutex_enter(&t->zthr_state_lock);
+
+ ASSERT3P(&t->zthr_checkfunc, !=, NULL);
+ ASSERT3P(&t->zthr_func, !=, NULL);
+ ASSERT(!t->zthr_cancel);
+ ASSERT(!t->zthr_haswaiters);
+
+ /*
+ * There are 4 states that we find the zthr in at this point
+ * given the locks that we hold:
+ *
+ * [1] The zthr was cancelled, so we spawn a new thread for
+ * the zthr (common case).
+ * [2] The zthr is running at which point this is a no-op.
+ * [3] The zthr is sleeping at which point this is a no-op.
+ * [4] The zthr was just spawned at which point this is a
+ * no-op.
+ */
+ if (t->zthr_thread == NULL) {
+ t->zthr_thread = thread_create_named(t->zthr_name, NULL, 0,
+ zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri);
+ }
+
+ mutex_exit(&t->zthr_state_lock);
+ mutex_exit(&t->zthr_request_lock);
+}
+
+/*
+ * This function is intended to be used by the zthr itself
+ * (specifically the zthr_func callback provided) to check
+ * if another thread has signaled it to stop running before
+ * doing some expensive operation.
+ *
+ * returns TRUE if we are in the middle of trying to cancel
+ * this thread.
+ *
+ * returns FALSE otherwise.
+ */
+boolean_t
+zthr_iscancelled(zthr_t *t)
+{
+ ASSERT3P(t->zthr_thread, ==, curthread);
+
+ /*
+ * The majority of the functions here grab zthr_request_lock
+ * first and then zthr_state_lock. This function only grabs
+ * the zthr_state_lock. That is because this function should
+ * only be called from the zthr_func to check if someone has
+ * issued a zthr_cancel() on the thread. If there is a zthr_cancel()
+ * happening concurrently, attempting to grab the request lock
+ * here would result in a deadlock.
+ *
+ * By grabbing only the zthr_state_lock this function is allowed
+ * to run concurrently with a zthr_cancel() request.
+ */
+ mutex_enter(&t->zthr_state_lock);
+ boolean_t cancelled = t->zthr_cancel;
+ mutex_exit(&t->zthr_state_lock);
+ return (cancelled);
+}
+
+/*
+ * Wait for the zthr to finish its current function. Similar to
+ * zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end
+ * early. Unlike zthr_cancel, the thread is not destroyed. If the zthr was
+ * sleeping or cancelled, return immediately.
+ */
+void
+zthr_wait_cycle_done(zthr_t *t)
+{
+ mutex_enter(&t->zthr_state_lock);
+
+ /*
+ * Since we are holding the zthr_state_lock at this point
+ * we can find the state in one of the following 5 states:
+ *
+	 * [1] The thread has already been cancelled, therefore
+ * there is nothing for us to do.
+ * [2] The thread is sleeping so we set the flag, broadcast
+ * the CV and wait for it to exit.
+ * [3] The thread is doing work, in which case we just set
+ * the flag and wait for it to finish.
+ * [4] The thread was just created/resumed, in which case
+ * the behavior is similar to [3].
+	 * [5] The thread is in the middle of being cancelled, which is
+ * similar to [3]. We'll wait for the cancel, which is
+ * waiting for the zthr func.
+ *
+ * Since requests are serialized, by the time that we get
+	 * control back we expect that the zthr has completed its
+ * zthr_func.
+ */
+ if (t->zthr_thread != NULL) {
+ t->zthr_haswaiters = B_TRUE;
+
+ /* broadcast in case the zthr is sleeping */
+ cv_broadcast(&t->zthr_cv);
+
+ while ((t->zthr_haswaiters) && (t->zthr_thread != NULL))
+ cv_wait(&t->zthr_wait_cv, &t->zthr_state_lock);
+
+ ASSERT(!t->zthr_haswaiters);
+ }
+
+ mutex_exit(&t->zthr_state_lock);
+}
+
+/*
+ * This function is intended to be used by the zthr itself
+ * to check if another thread is waiting on it to finish
+ *
+ * returns TRUE if we have been asked to finish.
+ *
+ * returns FALSE otherwise.
+ */
+boolean_t
+zthr_has_waiters(zthr_t *t)
+{
+ ASSERT3P(t->zthr_thread, ==, curthread);
+
+ mutex_enter(&t->zthr_state_lock);
+
+ /*
+ * Similarly to zthr_iscancelled(), we only grab the
+ * zthr_state_lock so that the zthr itself can use this
+ * to check for the request.
+ */
+ boolean_t has_waiters = t->zthr_haswaiters;
+ mutex_exit(&t->zthr_state_lock);
+ return (has_waiters);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
new file mode 100644
index 000000000000..7c6dae8650c7
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -0,0 +1,1739 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * LLNL-CODE-403049.
+ *
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/<pool_name>/<dataset_name>
+ *
+ * Volumes are persistent through reboot and module load. No user command
+ * needs to be run before opening and using a device.
+ *
+ * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ */
+
+/*
+ * Note on locking of zvol state structures.
+ *
+ * These structures are used to maintain internal state used to emulate block
+ * devices on top of zvols. In particular, management of device minor number
+ * operations - create, remove, rename, and set_snapdev - involves access to
+ * these structures. The zvol_state_lock is primarily used to protect the
+ * zvol_state_list. The zv->zv_state_lock is used to protect the contents
+ * of the zvol_state_t structures, as well as to make sure that when the
+ * time comes to remove the structure from the list, it is not in use, and
+ * therefore, it can be taken off zvol_state_list and freed.
+ *
+ * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
+ * e.g. for the duration of receive and rollback operations. This lock can be
+ * held for significant periods of time. Given that it is undesirable to hold
+ * mutexes for long periods of time, the following lock ordering applies:
+ * - take zvol_state_lock if necessary, to protect zvol_state_list
+ * - take zv_suspend_lock if necessary, by the code path in question
+ * - take zv_state_lock to protect zvol_state_t
+ *
+ * The minor operations are issued to spa->spa_zvol_taskq queues, which are
+ * single-threaded (to preserve the order of minor operations), and are
+ * executed through the zvol_task_cb that dispatches the specific operations.
+ * Therefore, these operations are serialized per pool. Consequently, we can
+ * be certain that for a given zvol there is only one operation at a time in
+ * progress. That is why one can be sure that the zvol_state_t for a given
+ * zvol is first allocated and placed on zvol_state_list, and that subsequent
+ * minor operations for this zvol proceed in the order they were issued.
+ *
+ */
+
+#include <sys/dataset_kstats.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio.h>
+#include <sys/zfs_rlock.h>
+#include <sys/spa_impl.h>
+#include <sys/zvol.h>
+
+#include <sys/zvol_impl.h>
+
+
+unsigned int zvol_inhibit_dev = 0;
+unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
+
+struct hlist_head *zvol_htable;
+list_t zvol_state_list;
+krwlock_t zvol_state_lock;
+const zvol_platform_ops_t *ops;
+
+typedef enum {
+ ZVOL_ASYNC_REMOVE_MINORS,
+ ZVOL_ASYNC_RENAME_MINORS,
+ ZVOL_ASYNC_SET_SNAPDEV,
+ ZVOL_ASYNC_SET_VOLMODE,
+ ZVOL_ASYNC_MAX
+} zvol_async_op_t;
+
+typedef struct {
+ zvol_async_op_t op;
+ char pool[MAXNAMELEN];
+ char name1[MAXNAMELEN];
+ char name2[MAXNAMELEN];
+ zprop_source_t source;
+ uint64_t value;
+} zvol_task_t;
+
+uint64_t
+zvol_name_hash(const char *name)
+{
+ int i;
+ uint64_t crc = -1ULL;
+ const uint8_t *p = (const uint8_t *)name;
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) {
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF];
+ }
+ return (crc);
+}
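
A standalone model of the name hash, for illustration only. The table construction and polynomial mirror what the kernel is assumed to set up for zfs_crc64_table (reflected ECMA-182, 0xC96C5795D7870F42); treat those exact values as assumptions, since they are not shown in this diff.

#include <stdio.h>
#include <stdint.h>

#define POLY 0xC96C5795D7870F42ULL	/* assumed ZFS_CRC64_POLY value */

static uint64_t crc64_table[256];

/* Build the byte-at-a-time table the same way spa_init() is assumed to. */
static void
crc64_init(void)
{
	for (int i = 0; i < 256; i++) {
		uint64_t ct = i;
		for (int j = 0; j < 8; j++)
			ct = (ct >> 1) ^ (-(ct & 1) & POLY);
		crc64_table[i] = ct;
	}
}

static uint64_t
name_hash(const char *name)
{
	uint64_t crc = -1ULL;	/* same seed as zvol_name_hash() */
	for (const uint8_t *p = (const uint8_t *)name; *p != '\0'; p++)
		crc = (crc >> 8) ^ crc64_table[(crc ^ *p) & 0xFF];
	return (crc);
}

int
main(void)
{
	crc64_init();
	printf("hash(tank/vol) = %016llx\n",
	    (unsigned long long)name_hash("tank/vol"));
	return (0);
}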
+
+/*
+ * Find a zvol_state_t given the name and hash generated by zvol_name_hash.
+ * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
+ * return (NULL) without taking the locks. The zv_suspend_lock is always taken
+ * before zv_state_lock. The mode argument indicates the mode (including none)
+ * for zv_suspend_lock to be taken.
+ */
+zvol_state_t *
+zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
+{
+ zvol_state_t *zv;
+ struct hlist_node *p = NULL;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+ hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
+ zv = hlist_entry(p, zvol_state_t, zv_hlink);
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_hash == hash &&
+ strncmp(zv->zv_name, name, MAXNAMELEN) == 0) {
+ /*
+ * this is the right zvol, take the locks in the
+ * right order
+ */
+ if (mode != RW_NONE &&
+ !rw_tryenter(&zv->zv_suspend_lock, mode)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, mode);
+ mutex_enter(&zv->zv_state_lock);
+ /*
+ * zvol cannot be renamed as we continue
+ * to hold zvol_state_lock
+ */
+ ASSERT(zv->zv_hash == hash &&
+ strncmp(zv->zv_name, name, MAXNAMELEN)
+ == 0);
+ }
+ rw_exit(&zvol_state_lock);
+ return (zv);
+ }
+ mutex_exit(&zv->zv_state_lock);
+ }
+ rw_exit(&zvol_state_lock);
+
+ return (NULL);
+}
+
+/*
+ * Find a zvol_state_t given the name.
+ * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
+ * return (NULL) without taking the locks. The zv_suspend_lock is always taken
+ * before zv_state_lock. The mode argument indicates the mode (including none)
+ * for zv_suspend_lock to be taken.
+ */
+static zvol_state_t *
+zvol_find_by_name(const char *name, int mode)
+{
+ return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
+}
+
+/*
+ * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
+ */
+void
+zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+ zfs_creat_t *zct = arg;
+ nvlist_t *nvprops = zct->zct_props;
+ int error;
+ uint64_t volblocksize, volsize;
+
+ VERIFY(nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
+ if (nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
+ volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
+
+ /*
+ * These properties must be removed from the list so the generic
+ * property setting step won't apply to them.
+ */
+ VERIFY(nvlist_remove_all(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
+ (void) nvlist_remove_all(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
+
+ error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
+ ASSERT(error == 0);
+}
+
+/*
+ * ZFS_IOC_OBJSET_STATS entry point.
+ */
+int
+zvol_get_stats(objset_t *os, nvlist_t *nv)
+{
+ int error;
+ dmu_object_info_t *doi;
+ uint64_t val;
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
+ if (error)
+ return (SET_ERROR(error));
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+ error = dmu_object_info(os, ZVOL_OBJ, doi);
+
+ if (error == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
+ doi->doi_data_block_size);
+ }
+
+ kmem_free(doi, sizeof (dmu_object_info_t));
+
+ return (SET_ERROR(error));
+}
+
+/*
+ * Sanity check volume size.
+ */
+int
+zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
+{
+ if (volsize == 0)
+ return (SET_ERROR(EINVAL));
+
+ if (volsize % blocksize != 0)
+ return (SET_ERROR(EINVAL));
+
+#ifdef _ILP32
+ if (volsize - 1 > SPEC_MAXOFFSET_T)
+ return (SET_ERROR(EOVERFLOW));
+#endif
+ return (0);
+}
+
+/*
+ * Ensure the zap is flushed then inform the VFS of the capacity change.
+ */
+static int
+zvol_update_volsize(uint64_t volsize, objset_t *os)
+{
+ dmu_tx_t *tx;
+ int error;
+ uint64_t txg;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (SET_ERROR(error));
+ }
+ txg = dmu_tx_get_txg(tx);
+
+ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
+ &volsize, tx);
+ dmu_tx_commit(tx);
+
+ txg_wait_synced(dmu_objset_pool(os), txg);
+
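+ /* Free any blocks beyond the new end of the volume (when shrinking). */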
+ if (error == 0)
+ error = dmu_free_long_range(os,
+ ZVOL_OBJ, volsize, DMU_OBJECT_END);
+
+ return (error);
+}
+
+/*
+ * Entry point for setting ZFS_PROP_VOLSIZE. Note that modifying the volume
+ * size will result in a udev "change" event being generated.
+ */
+int
+zvol_set_volsize(const char *name, uint64_t volsize)
+{
+ objset_t *os = NULL;
+ uint64_t readonly;
+ int error;
+ boolean_t owned = B_FALSE;
+
+ error = dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
+ if (error != 0)
+ return (SET_ERROR(error));
+ if (readonly)
+ return (SET_ERROR(EROFS));
+
+ zvol_state_t *zv = zvol_find_by_name(name, RW_READER);
+
+ ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
+ RW_READ_HELD(&zv->zv_suspend_lock)));
+
+ if (zv == NULL || zv->zv_objset == NULL) {
+ if (zv != NULL)
+ rw_exit(&zv->zv_suspend_lock);
+ if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
+ FTAG, &os)) != 0) {
+ if (zv != NULL)
+ mutex_exit(&zv->zv_state_lock);
+ return (SET_ERROR(error));
+ }
+ owned = B_TRUE;
+ if (zv != NULL)
+ zv->zv_objset = os;
+ } else {
+ os = zv->zv_objset;
+ }
+
+ dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP);
+
+ if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
+ (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
+ goto out;
+
+ error = zvol_update_volsize(volsize, os);
+ if (error == 0 && zv != NULL) {
+ zv->zv_volsize = volsize;
+ zv->zv_changed = 1;
+ }
+out:
+ kmem_free(doi, sizeof (dmu_object_info_t));
+
+ if (owned) {
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ if (zv != NULL)
+ zv->zv_objset = NULL;
+ } else {
+ rw_exit(&zv->zv_suspend_lock);
+ }
+
+ if (zv != NULL)
+ mutex_exit(&zv->zv_state_lock);
+
+ if (error == 0 && zv != NULL)
+ ops->zv_update_volsize(zv, volsize);
+
+ return (SET_ERROR(error));
+}
+
+/*
+ * Sanity check volume block size.
+ */
+int
+zvol_check_volblocksize(const char *name, uint64_t volblocksize)
+{
+ /* Record sizes above 128k need the feature to be enabled */
+ if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ /*
+ * We don't allow setting the property above 1MB,
+ * unless the tunable has been changed.
+ */
+ if (volblocksize > zfs_max_recordsize) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EDOM));
+ }
+
+ spa_close(spa, FTAG);
+ }
+
+ if (volblocksize < SPA_MINBLOCKSIZE ||
+ volblocksize > SPA_MAXBLOCKSIZE ||
+ !ISP2(volblocksize))
+ return (SET_ERROR(EDOM));
+
+ return (0);
+}
+
+/*
+ * Entry point for setting ZFS_PROP_VOLBLOCKSIZE.
+ */
+int
+zvol_set_volblocksize(const char *name, uint64_t volblocksize)
+{
+ zvol_state_t *zv;
+ dmu_tx_t *tx;
+ int error;
+
+ zv = zvol_find_by_name(name, RW_READER);
+
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+
+ if (zv->zv_flags & ZVOL_RDONLY) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
+ return (SET_ERROR(EROFS));
+ }
+
+ tx = dmu_tx_create(zv->zv_objset);
+ dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
+ volblocksize, 0, tx);
+ if (error == ENOTSUP)
+ error = SET_ERROR(EBUSY);
+ dmu_tx_commit(tx);
+ if (error == 0)
+ zv->zv_volblocksize = volblocksize;
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
+
+ return (SET_ERROR(error));
+}
+
+/*
+ * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we
+ * implement DKIOCFREE/free-long-range.
+ */
+static int
+zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zvol_state_t *zv = arg1;
+ lr_truncate_t *lr = arg2;
+ uint64_t offset, length;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
+}
+
+/*
+ * Replay a TX_WRITE ZIL transaction that didn't get committed
+ * after a system failure
+ */
+static int
+zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zvol_state_t *zv = arg1;
+ lr_write_t *lr = arg2;
+ objset_t *os = zv->zv_objset;
+ char *data = (char *)(lr + 1); /* data follows lr_write_t */
+ uint64_t offset, length;
+ dmu_tx_t *tx;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
+ dmu_tx_commit(tx);
+ }
+
+ return (error);
+}
+
+static int
+zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+/*
+ * Callback vectors for replaying records.
+ * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
+ */
+zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
+ zvol_replay_err, /* no such transaction type */
+ zvol_replay_err, /* TX_CREATE */
+ zvol_replay_err, /* TX_MKDIR */
+ zvol_replay_err, /* TX_MKXATTR */
+ zvol_replay_err, /* TX_SYMLINK */
+ zvol_replay_err, /* TX_REMOVE */
+ zvol_replay_err, /* TX_RMDIR */
+ zvol_replay_err, /* TX_LINK */
+ zvol_replay_err, /* TX_RENAME */
+ zvol_replay_write, /* TX_WRITE */
+ zvol_replay_truncate, /* TX_TRUNCATE */
+ zvol_replay_err, /* TX_SETATTR */
+ zvol_replay_err, /* TX_ACL */
+ zvol_replay_err, /* TX_CREATE_ATTR */
+ zvol_replay_err, /* TX_CREATE_ACL_ATTR */
+ zvol_replay_err, /* TX_MKDIR_ACL */
+ zvol_replay_err, /* TX_MKDIR_ATTR */
+ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */
+ zvol_replay_err, /* TX_WRITE2 */
+};
+
+/*
+ * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
+ *
+ * We store data in the log buffers if it's small enough.
+ * Otherwise we will later flush the data out via dmu_sync().
+ */
+ssize_t zvol_immediate_write_sz = 32768;
+
+void
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
+ uint64_t size, int sync)
+{
+ uint32_t blocksize = zv->zv_volblocksize;
+ zilog_t *zilog = zv->zv_zilog;
+ itx_wr_state_t write_state;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
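+ /*
+ * Choose how the data reaches the ZIL: WR_INDIRECT syncs the data
+ * via dmu_sync() and logs a block pointer, WR_COPIED embeds the
+ * data in the log record immediately, and WR_NEED_COPY defers the
+ * copy until the log record is committed.
+ */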
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ write_state = WR_INDIRECT;
+ else if (!spa_has_slogs(zilog->zl_spa) &&
+ size >= blocksize && blocksize > zvol_immediate_write_sz)
+ write_state = WR_INDIRECT;
+ else if (sync)
+ write_state = WR_COPIED;
+ else
+ write_state = WR_NEED_COPY;
+
+ while (size) {
+ itx_t *itx;
+ lr_write_t *lr;
+ itx_wr_state_t wr_state = write_state;
+ ssize_t len = size;
+
+ if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog))
+ wr_state = WR_NEED_COPY;
+ else if (wr_state == WR_INDIRECT)
+ len = MIN(blocksize - P2PHASE(offset, blocksize), size);
+
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
+ (wr_state == WR_COPIED ? len : 0));
+ lr = (lr_write_t *)&itx->itx_lr;
+ if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
+ offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ wr_state = WR_NEED_COPY;
+ }
+
+ itx->itx_wr_state = wr_state;
+ lr->lr_foid = ZVOL_OBJ;
+ lr->lr_offset = offset;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ itx->itx_private = zv;
+ itx->itx_sync = sync;
+
+ (void) zil_itx_assign(zilog, itx, tx);
+
+ offset += len;
+ size -= len;
+ }
+}
+
+/*
+ * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
+ */
+void
+zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
+ boolean_t sync)
+{
+ itx_t *itx;
+ lr_truncate_t *lr;
+ zilog_t *zilog = zv->zv_zilog;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+ lr = (lr_truncate_t *)&itx->itx_lr;
+ lr->lr_foid = ZVOL_OBJ;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+
+ itx->itx_sync = sync;
+ zil_itx_assign(zilog, itx, tx);
+}
+
+
+/* ARGSUSED */
+static void
+zvol_get_done(zgd_t *zgd, int error)
+{
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_rangelock_exit(zgd->zgd_lr);
+
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+ zvol_state_t *zv = arg;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_lwb = lwb;
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
+ size, RL_READER);
+ error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ } else { /* indirect write */
+ /*
+ * We have to lock the whole block to ensure that no one can change
+ * the data while it is written out and its checksum is calculated.
+ * Unlike zfs_get_data(), we need not re-check the blocksize after we
+ * get the lock because it cannot be changed.
+ */
+ size = zv->zv_volblocksize;
+ offset = P2ALIGN_TYPED(offset, size, uint64_t);
+ zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
+ size, RL_READER);
+ error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db != NULL);
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zvol_get_done, zgd);
+
+ if (error == 0)
+ return (0);
+ }
+ }
+
+ zvol_get_done(zgd, error);
+
+ return (SET_ERROR(error));
+}
+
+/*
+ * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
+ */
+
+void
+zvol_insert(zvol_state_t *zv)
+{
+ ASSERT(RW_WRITE_HELD(&zvol_state_lock));
+ list_insert_head(&zvol_state_list, zv);
+ hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+}
+
+/*
+ * Simply remove the zvol from the list of zvols.
+ */
+static void
+zvol_remove(zvol_state_t *zv)
+{
+ ASSERT(RW_WRITE_HELD(&zvol_state_lock));
+ list_remove(&zvol_state_list, zv);
+ hlist_del(&zv->zv_hlink);
+}
+
+/*
+ * Set up zv after we have just taken ownership of zv->zv_objset.
+ */
+static int
+zvol_setup_zv(zvol_state_t *zv)
+{
+ uint64_t volsize;
+ int error;
+ uint64_t ro;
+ objset_t *os = zv->zv_objset;
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
+
+ zv->zv_zilog = NULL;
+ zv->zv_flags &= ~ZVOL_WRITTEN_TO;
+
+ error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
+ if (error)
+ return (SET_ERROR(error));
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ if (error)
+ return (SET_ERROR(error));
+
+ error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
+ if (error)
+ return (SET_ERROR(error));
+
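+ /* The platform capacity callback expects the size in 512-byte sectors. */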
+ ops->zv_set_capacity(zv, volsize >> 9);
+ zv->zv_volsize = volsize;
+
+ if (ro || dmu_objset_is_snapshot(os) ||
+ !spa_writeable(dmu_objset_spa(os))) {
+ ops->zv_set_disk_ro(zv, 1);
+ zv->zv_flags |= ZVOL_RDONLY;
+ } else {
+ ops->zv_set_disk_ro(zv, 0);
+ zv->zv_flags &= ~ZVOL_RDONLY;
+ }
+ return (0);
+}
+
+/*
+ * Shut down everything related to zv_objset except zv_objset itself.
+ * This is the reverse of zvol_setup_zv.
+ */
+static void
+zvol_shutdown_zv(zvol_state_t *zv)
+{
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+ RW_LOCK_HELD(&zv->zv_suspend_lock));
+
+ if (zv->zv_flags & ZVOL_WRITTEN_TO) {
+ ASSERT(zv->zv_zilog != NULL);
+ zil_close(zv->zv_zilog);
+ }
+
+ zv->zv_zilog = NULL;
+
+ dnode_rele(zv->zv_dn, zv);
+ zv->zv_dn = NULL;
+
+ /*
+ * Evict cached data. We must write out any dirty data before
+ * disowning the dataset.
+ */
+ if (zv->zv_flags & ZVOL_WRITTEN_TO)
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+ (void) dmu_objset_evict_dbufs(zv->zv_objset);
+}
+
+/*
+ * return the proper tag for rollback and recv
+ */
+void *
+zvol_tag(zvol_state_t *zv)
+{
+ ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+ return (zv->zv_open_count > 0 ? zv : NULL);
+}
+
+/*
+ * Suspend the zvol for recv and rollback.
+ */
+zvol_state_t *
+zvol_suspend(const char *name)
+{
+ zvol_state_t *zv;
+
+ zv = zvol_find_by_name(name, RW_WRITER);
+
+ if (zv == NULL)
+ return (NULL);
+
+ /* block all I/O, release in zvol_resume. */
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+
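+ /* Take a suspend reference so the zvol is not freed while suspended. */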
+ atomic_inc(&zv->zv_suspend_ref);
+
+ if (zv->zv_open_count > 0)
+ zvol_shutdown_zv(zv);
+
+ /*
+ * do not hold zv_state_lock across suspend/resume to
+ * avoid locking up zvol lookups
+ */
+ mutex_exit(&zv->zv_state_lock);
+
+ /* zv_suspend_lock is released in zvol_resume() */
+ return (zv);
+}
+
+int
+zvol_resume(zvol_state_t *zv)
+{
+ int error = 0;
+
+ ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+
+ mutex_enter(&zv->zv_state_lock);
+
+ if (zv->zv_open_count > 0) {
+ VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
+ VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
+ VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset));
+ dmu_objset_rele(zv->zv_objset, zv);
+
+ error = zvol_setup_zv(zv);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+
+ rw_exit(&zv->zv_suspend_lock);
+ /*
+ * We need this because we don't hold zvol_state_lock while releasing
+ * zv_suspend_lock. zvol_remove_minors_impl thus cannot check
+ * zv_suspend_lock to determine whether it is safe to free the zvol,
+ * because the rwlock is not inherently atomic.
+ */
+ atomic_dec(&zv->zv_suspend_ref);
+
+ return (SET_ERROR(error));
+}
+
+int
+zvol_first_open(zvol_state_t *zv, boolean_t readonly)
+{
+ objset_t *os;
+ int error, locked = 0;
+ boolean_t ro;
+
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /*
+ * In all other cases the spa_namespace_lock is taken before the
+ * bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
+ * function calls fops->open() with the bdev->bd_mutex lock held.
+ * This deadlock can be easily observed with zvols used as vdevs.
+ *
+ * To avoid a potential lock inversion deadlock we preemptively
+ * try to take the spa_namespace_lock(). Normally it will not
+ * be contended and this is safe because spa_open_common() handles
+ * the case where the caller already holds the spa_namespace_lock.
+ *
+ * When it is contended we risk a lock inversion if we were to
+ * block waiting for the lock. Luckily, the __blkdev_get()
+ * function allows us to return -ERESTARTSYS which will result in
+ * bdev->bd_mutex being dropped, reacquired, and fops->open() being
+ * called again. This process can be repeated safely until both
+ * locks are acquired.
+ */
+ if (!mutex_owned(&spa_namespace_lock)) {
+ locked = mutex_tryenter(&spa_namespace_lock);
+ if (!locked)
+ return (SET_ERROR(EINTR));
+ }
+
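+ /* Snapshots (names containing '@') are always opened read-only. */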
+ ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
+ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
+ if (error)
+ goto out_mutex;
+
+ zv->zv_objset = os;
+
+ error = zvol_setup_zv(zv);
+
+ if (error) {
+ dmu_objset_disown(os, 1, zv);
+ zv->zv_objset = NULL;
+ }
+
+out_mutex:
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(error));
+}
+
+void
+zvol_last_close(zvol_state_t *zv)
+{
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ zvol_shutdown_zv(zv);
+
+ dmu_objset_disown(zv->zv_objset, 1, zv);
+ zv->zv_objset = NULL;
+}
+
+typedef struct minors_job {
+ list_t *list;
+ list_node_t link;
+ /* input */
+ char *name;
+ /* output */
+ int error;
+} minors_job_t;
+
+/*
+ * Prefetch zvol dnodes for the minors_job
+ */
+static void
+zvol_prefetch_minors_impl(void *arg)
+{
+ minors_job_t *job = arg;
+ char *dsname = job->name;
+ objset_t *os = NULL;
+
+ job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
+ FTAG, &os);
+ if (job->error == 0) {
+ dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ }
+}
+
+/*
+ * Mask errors to continue dmu_objset_find() traversal
+ */
+static int
+zvol_create_snap_minor_cb(const char *dsname, void *arg)
+{
+ minors_job_t *j = arg;
+ list_t *minors_list = j->list;
+ const char *name = j->name;
+
+ ASSERT0(MUTEX_HELD(&spa_namespace_lock));
+
+ /* skip the designated dataset */
+ if (name && strcmp(dsname, name) == 0)
+ return (0);
+
+ /* at this point, the dsname should name a snapshot */
+ if (strchr(dsname, '@') == 0) {
+ dprintf("zvol_create_snap_minor_cb(): "
+ "%s is not a snapshot name\n", dsname);
+ } else {
+ minors_job_t *job;
+ char *n = kmem_strdup(dsname);
+ if (n == NULL)
+ return (0);
+
+ job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
+ job->name = n;
+ job->list = minors_list;
+ job->error = 0;
+ list_insert_tail(minors_list, job);
+ /* don't care if dispatch fails, because job->error is 0 */
+ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
+ TQ_SLEEP);
+ }
+
+ return (0);
+}
+
+/*
+ * Mask errors to continue dmu_objset_find() traversal
+ */
+static int
+zvol_create_minors_cb(const char *dsname, void *arg)
+{
+ uint64_t snapdev;
+ int error;
+ list_t *minors_list = arg;
+
+ ASSERT0(MUTEX_HELD(&spa_namespace_lock));
+
+ error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
+ if (error)
+ return (0);
+
+ /*
+ * Given the name and the 'snapdev' property, create device minor nodes
+ * with the linkages to zvols/snapshots as needed.
+ * If the name represents a zvol, create a minor node for the zvol, then
+ * check if its snapshots are 'visible', and if so, iterate over the
+ * snapshots and create device minor nodes for those.
+ */
+ if (strchr(dsname, '@') == 0) {
+ minors_job_t *job;
+ char *n = kmem_strdup(dsname);
+ if (n == NULL)
+ return (0);
+
+ job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
+ job->name = n;
+ job->list = minors_list;
+ job->error = 0;
+ list_insert_tail(minors_list, job);
+ /* don't care if dispatch fails, because job->error is 0 */
+ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
+ TQ_SLEEP);
+
+ if (snapdev == ZFS_SNAPDEV_VISIBLE) {
+ /*
+ * traverse snapshots only, do not traverse children,
+ * and skip the 'dsname'
+ */
+ error = dmu_objset_find(dsname,
+ zvol_create_snap_minor_cb, (void *)job,
+ DS_FIND_SNAPSHOTS);
+ }
+ } else {
+ dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
+ dsname);
+ }
+
+ return (0);
+}
+
+/*
+ * Create minors for the specified dataset, including children and snapshots.
+ * Pay attention to the 'snapdev' property and iterate over the snapshots
+ * only if they are 'visible'. This approach ensures that the snapshot
+ * metadata is read from disk only when it is needed.
+ *
+ * The name can represent a dataset to be recursively scanned for zvols and
+ * their snapshots, or a single zvol snapshot. If the name represents a
+ * dataset, the scan is performed in two nested stages:
+ * - scan the dataset for zvols, and
+ * - for each zvol, create a minor node, then check if the zvol's snapshots
+ * are 'visible', and only then iterate over the snapshots if needed
+ *
+ * If the name represents a snapshot, a check is performed if the snapshot is
+ * 'visible' (which also verifies that the parent is a zvol), and if so,
+ * a minor node for that snapshot is created.
+ */
+void
+zvol_create_minors_recursive(const char *name)
+{
+ list_t minors_list;
+ minors_job_t *job;
+
+ if (zvol_inhibit_dev)
+ return;
+
+ /*
+ * This is the list of prefetch jobs. Whenever we find a match during
+ * dmu_objset_find(), we insert a minors_job into the list and dispatch
+ * a taskq job to prefetch the zvol dnodes in parallel. Note that we
+ * don't need any lock because all list operations are done on the
+ * current thread.
+ *
+ * We will use this list to do zvol_create_minor_impl after the prefetch
+ * completes, so we don't have to traverse using dmu_objset_find() again.
+ */
+ list_create(&minors_list, sizeof (minors_job_t),
+ offsetof(minors_job_t, link));
+
+
+ if (strchr(name, '@') != NULL) {
+ uint64_t snapdev;
+
+ int error = dsl_prop_get_integer(name, "snapdev",
+ &snapdev, NULL);
+
+ if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
+ (void) ops->zv_create_minor(name);
+ } else {
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ (void) dmu_objset_find(name, zvol_create_minors_cb,
+ &minors_list, DS_FIND_CHILDREN);
+ spl_fstrans_unmark(cookie);
+ }
+
+ taskq_wait_outstanding(system_taskq, 0);
+
+ /*
+ * Prefetch is completed, so we can now create the minor
+ * nodes sequentially.
+ */
+ while ((job = list_head(&minors_list)) != NULL) {
+ list_remove(&minors_list, job);
+ if (!job->error)
+ (void) ops->zv_create_minor(job->name);
+ kmem_strfree(job->name);
+ kmem_free(job, sizeof (minors_job_t));
+ }
+
+ list_destroy(&minors_list);
+}
+
+void
+zvol_create_minor(const char *name)
+{
+ /*
+ * Note: the dsl_pool_config_lock must not be held.
+ * Minor node creation needs to obtain the zvol_state_lock.
+ * zvol_open() obtains the zvol_state_lock and then the dsl pool
+ * config lock. Therefore, we can't have the config lock now if
+ * we are going to wait for the zvol_state_lock, because it
+ * would be a lock order inversion which could lead to deadlock.
+ */
+
+ if (zvol_inhibit_dev)
+ return;
+
+ if (strchr(name, '@') != NULL) {
+ uint64_t snapdev;
+
+ int error = dsl_prop_get_integer(name,
+ "snapdev", &snapdev, NULL);
+
+ if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
+ (void) ops->zv_create_minor(name);
+ } else {
+ (void) ops->zv_create_minor(name);
+ }
+}
+
+/*
+ * Remove minors for the specified dataset, including children and snapshots.
+ */
+
+void
+zvol_remove_minors_impl(const char *name)
+{
+ zvol_state_t *zv, *zv_next;
+ int namelen = ((name) ? strlen(name) : 0);
+ taskqid_t t;
+ list_t free_list;
+
+ if (zvol_inhibit_dev)
+ return;
+
+ list_create(&free_list, sizeof (zvol_state_t),
+ offsetof(zvol_state_t, zv_next));
+
+ rw_enter(&zvol_state_lock, RW_WRITER);
+
+ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+ zv_next = list_next(&zvol_state_list, zv);
+
+ mutex_enter(&zv->zv_state_lock);
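+ /*
+ * A NULL name matches every zvol; otherwise match the dataset
+ * itself or any child ('/') or snapshot ('@') beneath it.
+ */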
+ if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
+ (strncmp(zv->zv_name, name, namelen) == 0 &&
+ (zv->zv_name[namelen] == '/' ||
+ zv->zv_name[namelen] == '@'))) {
+ /*
+ * By holding zv_state_lock here, we guarantee that no
+ * one is currently using this zv
+ */
+
+ /* If in use, leave alone */
+ if (zv->zv_open_count > 0 ||
+ atomic_read(&zv->zv_suspend_ref)) {
+ mutex_exit(&zv->zv_state_lock);
+ continue;
+ }
+
+ zvol_remove(zv);
+
+ /*
+ * Cleared while holding zvol_state_lock as a writer
+ * which will prevent zvol_open() from opening it.
+ */
+ ops->zv_clear_private(zv);
+
+ /* Drop zv_state_lock before zvol_free() */
+ mutex_exit(&zv->zv_state_lock);
+
+ /* Try parallel zv_free; if that fails, do it in place */
+ t = taskq_dispatch(system_taskq,
+ (task_func_t *)ops->zv_free, zv, TQ_SLEEP);
+ if (t == TASKQID_INVALID)
+ list_insert_head(&free_list, zv);
+ } else {
+ mutex_exit(&zv->zv_state_lock);
+ }
+ }
+ rw_exit(&zvol_state_lock);
+
+ /* Drop zvol_state_lock before calling zvol_free() */
+ while ((zv = list_head(&free_list)) != NULL) {
+ list_remove(&free_list, zv);
+ ops->zv_free(zv);
+ }
+}
+
+/* Remove minor for this specific volume only */
+static void
+zvol_remove_minor_impl(const char *name)
+{
+ zvol_state_t *zv = NULL, *zv_next;
+
+ if (zvol_inhibit_dev)
+ return;
+
+ rw_enter(&zvol_state_lock, RW_WRITER);
+
+ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+ zv_next = list_next(&zvol_state_list, zv);
+
+ mutex_enter(&zv->zv_state_lock);
+ if (strcmp(zv->zv_name, name) == 0) {
+ /*
+ * By holding zv_state_lock here, we guarantee that no
+ * one is currently using this zv
+ */
+
+ /* If in use, leave alone */
+ if (zv->zv_open_count > 0 ||
+ atomic_read(&zv->zv_suspend_ref)) {
+ mutex_exit(&zv->zv_state_lock);
+ continue;
+ }
+ zvol_remove(zv);
+
+ ops->zv_clear_private(zv);
+ mutex_exit(&zv->zv_state_lock);
+ break;
+ } else {
+ mutex_exit(&zv->zv_state_lock);
+ }
+ }
+
+ /* Drop zvol_state_lock before calling zvol_free() */
+ rw_exit(&zvol_state_lock);
+
+ if (zv != NULL)
+ ops->zv_free(zv);
+}
+
+/*
+ * Rename minors for the specified dataset, including children and snapshots.
+ */
+static void
+zvol_rename_minors_impl(const char *oldname, const char *newname)
+{
+ zvol_state_t *zv, *zv_next;
+ int oldnamelen, newnamelen;
+
+ if (zvol_inhibit_dev)
+ return;
+
+ oldnamelen = strlen(oldname);
+ newnamelen = strlen(newname);
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+ zv_next = list_next(&zvol_state_list, zv);
+
+ mutex_enter(&zv->zv_state_lock);
+
+ if (strcmp(zv->zv_name, oldname) == 0) {
+ ops->zv_rename_minor(zv, newname);
+ } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
+ (zv->zv_name[oldnamelen] == '/' ||
+ zv->zv_name[oldnamelen] == '@')) {
+ char *name = kmem_asprintf("%s%c%s", newname,
+ zv->zv_name[oldnamelen],
+ zv->zv_name + oldnamelen + 1);
+ ops->zv_rename_minor(zv, name);
+ kmem_strfree(name);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+}
+
+typedef struct zvol_snapdev_cb_arg {
+ uint64_t snapdev;
+} zvol_snapdev_cb_arg_t;
+
+static int
+zvol_set_snapdev_cb(const char *dsname, void *param)
+{
+ zvol_snapdev_cb_arg_t *arg = param;
+
+ if (strchr(dsname, '@') == NULL)
+ return (0);
+
+ switch (arg->snapdev) {
+ case ZFS_SNAPDEV_VISIBLE:
+ (void) ops->zv_create_minor(dsname);
+ break;
+ case ZFS_SNAPDEV_HIDDEN:
+ (void) zvol_remove_minor_impl(dsname);
+ break;
+ }
+
+ return (0);
+}
+
+static void
+zvol_set_snapdev_impl(char *name, uint64_t snapdev)
+{
+ zvol_snapdev_cb_arg_t arg = {snapdev};
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ /*
+ * The zvol_set_snapdev_sync() sets snapdev appropriately
+ * in the dataset hierarchy. Here, we only scan snapshots.
+ */
+ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
+ spl_fstrans_unmark(cookie);
+}
+
+typedef struct zvol_volmode_cb_arg {
+ uint64_t volmode;
+} zvol_volmode_cb_arg_t;
+
+static void
+zvol_set_volmode_impl(char *name, uint64_t volmode)
+{
+ fstrans_cookie_t cookie;
+ uint64_t old_volmode;
+ zvol_state_t *zv;
+
+ if (strchr(name, '@') != NULL)
+ return;
+
+ /*
+ * It's unfortunate we need to remove minors before we create new ones:
+ * this is necessary because our backing gendisk (zvol_state->zv_disk)
+ * could be different when we set, for instance, volmode from "geom"
+ * to "dev" (or vice versa).
+ */
+ zv = zvol_find_by_name(name, RW_NONE);
+ if (zv == NULL && volmode == ZFS_VOLMODE_NONE)
+ return;
+ if (zv != NULL) {
+ old_volmode = zv->zv_volmode;
+ mutex_exit(&zv->zv_state_lock);
+ if (old_volmode == volmode)
+ return;
+ zvol_wait_close(zv);
+ }
+ cookie = spl_fstrans_mark();
+ switch (volmode) {
+ case ZFS_VOLMODE_NONE:
+ (void) zvol_remove_minor_impl(name);
+ break;
+ case ZFS_VOLMODE_GEOM:
+ case ZFS_VOLMODE_DEV:
+ (void) zvol_remove_minor_impl(name);
+ (void) ops->zv_create_minor(name);
+ break;
+ case ZFS_VOLMODE_DEFAULT:
+ (void) zvol_remove_minor_impl(name);
+ if (zvol_volmode == ZFS_VOLMODE_NONE)
+ break;
+ else /* if zvol_volmode is invalid defaults to "geom" */
+ (void) ops->zv_create_minor(name);
+ break;
+ }
+ spl_fstrans_unmark(cookie);
+}
+
+static zvol_task_t *
+zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
+ uint64_t value)
+{
+ zvol_task_t *task;
+ char *delim;
+
+ /* Never allow tasks on hidden names. */
+ if (name1[0] == '$')
+ return (NULL);
+
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ task->op = op;
+ task->value = value;
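+ /* The pool name is the portion of name1 before the first '/'. */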
+ delim = strchr(name1, '/');
+ strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
+
+ strlcpy(task->name1, name1, MAXNAMELEN);
+ if (name2 != NULL)
+ strlcpy(task->name2, name2, MAXNAMELEN);
+
+ return (task);
+}
+
+static void
+zvol_task_free(zvol_task_t *task)
+{
+ kmem_free(task, sizeof (zvol_task_t));
+}
+
+/*
+ * The worker function, executed asynchronously on the spa_zvol_taskq.
+ */
+static void
+zvol_task_cb(void *arg)
+{
+ zvol_task_t *task = arg;
+
+ switch (task->op) {
+ case ZVOL_ASYNC_REMOVE_MINORS:
+ zvol_remove_minors_impl(task->name1);
+ break;
+ case ZVOL_ASYNC_RENAME_MINORS:
+ zvol_rename_minors_impl(task->name1, task->name2);
+ break;
+ case ZVOL_ASYNC_SET_SNAPDEV:
+ zvol_set_snapdev_impl(task->name1, task->value);
+ break;
+ case ZVOL_ASYNC_SET_VOLMODE:
+ zvol_set_volmode_impl(task->name1, task->value);
+ break;
+ default:
+ VERIFY(0);
+ break;
+ }
+
+ zvol_task_free(task);
+}
+
+typedef struct zvol_set_prop_int_arg {
+ const char *zsda_name;
+ uint64_t zsda_value;
+ zprop_source_t zsda_source;
+ dmu_tx_t *zsda_tx;
+} zvol_set_prop_int_arg_t;
+
+/*
+ * Sanity check the dataset for safe use by the sync task. No additional
+ * conditions are imposed.
+ */
+static int
+zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
+{
+ zvol_set_prop_int_arg_t *zsda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd;
+ int error;
+
+ error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
+ if (error != 0)
+ return (error);
+
+ dsl_dir_rele(dd, FTAG);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ char dsname[MAXNAMELEN];
+ zvol_task_t *task;
+ uint64_t snapdev;
+
+ dsl_dataset_name(ds, dsname);
+ if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0)
+ return (0);
+ task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev);
+ if (task == NULL)
+ return (0);
+
+ (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
+ task, TQ_SLEEP);
+ return (0);
+}
+
+/*
+ * Traverse all child datasets and apply snapdev appropriately.
+ * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
+ * dataset and read the effective "snapdev" on every child in the callback
+ * function: this is because the value is not guaranteed to be the same in the
+ * whole dataset hierarchy.
+ */
+static void
+zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
+{
+ zvol_set_prop_int_arg_t *zsda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ int error;
+
+ VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
+ zsda->zsda_tx = tx;
+
+ error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
+ if (error == 0) {
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
+ zsda->zsda_source, sizeof (zsda->zsda_value), 1,
+ &zsda->zsda_value, zsda->zsda_tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
+ zsda, DS_FIND_CHILDREN);
+
+ dsl_dir_rele(dd, FTAG);
+}
+
+int
+zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
+{
+ zvol_set_prop_int_arg_t zsda;
+
+ zsda.zsda_name = ddname;
+ zsda.zsda_source = source;
+ zsda.zsda_value = snapdev;
+
+ return (dsl_sync_task(ddname, zvol_set_snapdev_check,
+ zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
+}
+
+/*
+ * Sanity check the dataset for safe use by the sync task. No additional
+ * conditions are imposed.
+ */
+static int
+zvol_set_volmode_check(void *arg, dmu_tx_t *tx)
+{
+ zvol_set_prop_int_arg_t *zsda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd;
+ int error;
+
+ error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
+ if (error != 0)
+ return (error);
+
+ dsl_dir_rele(dd, FTAG);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ char dsname[MAXNAMELEN];
+ zvol_task_t *task;
+ uint64_t volmode;
+
+ dsl_dataset_name(ds, dsname);
+ if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0)
+ return (0);
+ task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode);
+ if (task == NULL)
+ return (0);
+
+ (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
+ task, TQ_SLEEP);
+ return (0);
+}
+
+/*
+ * Traverse all child datasets and apply volmode appropriately.
+ * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
+ * dataset and read the effective "volmode" on every child in the callback
+ * function: this is because the value is not guaranteed to be the same in the
+ * whole dataset hierarchy.
+ */
+static void
+zvol_set_volmode_sync(void *arg, dmu_tx_t *tx)
+{
+ zvol_set_prop_int_arg_t *zsda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ int error;
+
+ VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
+ zsda->zsda_tx = tx;
+
+ error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
+ if (error == 0) {
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE),
+ zsda->zsda_source, sizeof (zsda->zsda_value), 1,
+ &zsda->zsda_value, zsda->zsda_tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb,
+ zsda, DS_FIND_CHILDREN);
+
+ dsl_dir_rele(dd, FTAG);
+}
+
+int
+zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode)
+{
+ zvol_set_prop_int_arg_t zsda;
+
+ zsda.zsda_name = ddname;
+ zsda.zsda_source = source;
+ zsda.zsda_value = volmode;
+
+ return (dsl_sync_task(ddname, zvol_set_volmode_check,
+ zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
+}
+
+void
+zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
+{
+ zvol_task_t *task;
+ taskqid_t id;
+
+ task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
+ if (task == NULL)
+ return;
+
+ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
+ if ((async == B_FALSE) && (id != TASKQID_INVALID))
+ taskq_wait_id(spa->spa_zvol_taskq, id);
+}
+
+void
+zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
+ boolean_t async)
+{
+ zvol_task_t *task;
+ taskqid_t id;
+
+ task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
+ if (task == NULL)
+ return;
+
+ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
+ if ((async == B_FALSE) && (id != TASKQID_INVALID))
+ taskq_wait_id(spa->spa_zvol_taskq, id);
+}
+
+boolean_t
+zvol_is_zvol(const char *name)
+{
+ return (ops->zv_is_zvol(name));
+}
+
+void
+zvol_register_ops(const zvol_platform_ops_t *zvol_ops)
+{
+ ops = zvol_ops;
+}
+
+int
+zvol_init_impl(void)
+{
+ int i;
+
+ list_create(&zvol_state_list, sizeof (zvol_state_t),
+ offsetof(zvol_state_t, zv_next));
+ rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
+
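+ /* Hash table used to look up zvols by name (see zvol_find_by_name_hash). */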
+ zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
+ KM_SLEEP);
+ for (i = 0; i < ZVOL_HT_SIZE; i++)
+ INIT_HLIST_HEAD(&zvol_htable[i]);
+
+ return (0);
+}
+
+void
+zvol_fini_impl(void)
+{
+ zvol_remove_minors_impl(NULL);
+
+ /*
+ * The call to "zvol_remove_minors_impl" may dispatch entries to
+ * the system_taskq, but it doesn't wait for those entries to
+ * complete before it returns. Thus, we must wait for all of the
+ * removals to finish before we can continue.
+ */
+ taskq_wait_outstanding(system_taskq, 0);
+
+ kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
+ list_destroy(&zvol_state_list);
+ rw_destroy(&zvol_state_lock);
+}
diff --git a/sys/contrib/openzfs/module/zstd/Makefile.in b/sys/contrib/openzfs/module/zstd/Makefile.in
new file mode 100644
index 000000000000..f67db710f097
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/Makefile.in
@@ -0,0 +1,38 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+zstd_include = $(src)/include
+else
+zstd_include = $(srctree)/$(src)/include
+endif
+
+MODULE := zzstd
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+asflags-y := -I$(zstd_include)
+ccflags-y := -I$(zstd_include)
+
+# Zstd uses -O3 by default, so we should follow
+ccflags-y += -O3
+
+# -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h
+# Set it for other compilers, too.
+$(obj)/lib/zstd.o: c_flags += -fno-tree-vectorize
+
+# SSE register return with SSE disabled if -march=znverX is passed
+$(obj)/lib/zstd.o: c_flags += -U__BMI__
+
+# Quiet warnings about frame size due to unused code in unmodified zstd lib
+$(obj)/lib/zstd.o: c_flags += -Wframe-larger-than=20480
+
+# Disable aarch64 neon SIMD instructions for kernel mode
+$(obj)/lib/zstd.o: c_flags += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w
+
+$(obj)/zfs_zstd.o: c_flags += -include $(zstd_include)/zstd_compat_wrapper.h
+
+$(MODULE)-objs += zfs_zstd.o
+$(MODULE)-objs += lib/zstd.o
+
+all:
+ mkdir -p lib
diff --git a/sys/contrib/openzfs/module/zstd/README.md b/sys/contrib/openzfs/module/zstd/README.md
new file mode 100644
index 000000000000..f8e127736aac
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/README.md
@@ -0,0 +1,65 @@
+# ZSTD-On-ZFS Library Manual
+
+## Introduction
+
+This subtree contains the ZSTD library used in ZFS. It is heavily cut down by
+dropping any unneeded files, and combined into a single file, but otherwise is
+intentionally unmodified. Please do not alter the file containing the zstd
+library, besides upgrading to a newer ZSTD release.
+
+Tree structure:
+
+* `zfs_zstd.c` is the actual `zzstd` kernel module.
+* `lib/` contains the unmodified, [_"amalgamated"_](https://github.com/facebook/zstd/blob/dev/contrib/single_file_libs/README.md)
+ version of the `Zstandard` library, generated from our template file.
+* `zstd-in.c` is our template file for generating the library.
+* `include/`: This directory contains supplemental includes for platform
+ compatibility, which are not expected to be used by ZFS elsewhere in the
+ future. Thus we keep them private to ZSTD.
+
+## Updating ZSTD
+
+To update ZSTD the following steps need to be taken:
+
+1. Grab the latest release of [ZSTD](https://github.com/facebook/zstd/releases).
+2. Update `module/zstd/zstd-in.c` if required. (see
+ `zstd/contrib/single_file_libs/zstd-in.c` in the zstd repository)
+3. Generate the "single-file-library" and put it in `module/zstd/lib/`.
+4. Copy the following files to `module/zstd/lib/`:
+ - `zstd/lib/zstd.h`
+ - `zstd/lib/common/zstd_errors.h`
+
+This can be done using a few shell commands from inside the zfs repo:
+
+~~~sh
+cd PATH/TO/ZFS
+
+url="https://github.com/facebook/zstd"
+release="$(curl -s "${url}"/releases/latest | grep -oP '(?<=v)[\d\.]+')"
+zstd="/tmp/zstd-${release}/"
+
+wget -O /tmp/zstd.tar.gz \
+ "${url}/releases/download/v${release}/zstd-${release}.tar.gz"
+tar -C /tmp -xzf /tmp/zstd.tar.gz
+
+cp ${zstd}/lib/zstd.h module/zstd/lib/
+cp ${zstd}/lib/common/zstd_errors.h module/zstd/lib/
+${zstd}/contrib/single_file_libs/combine.sh \
+ -r ${zstd}/lib -o module/zstd/lib/zstd.c module/zstd/zstd-in.c
+~~~
+
+Note: if the zstd library for zfs is updated to a newer version, the macro
+list in `include/zstd_compat_wrapper.h` usually needs to be updated. This can
+be done with some hand crafting of the output of the following script:
+`nm zstd.o | awk '{print "#define "$3 " zfs_" $3}' > macrotable`
+
+
+## Altering ZSTD and breaking changes
+
+If ZSTD makes changes that break compatibility, or you need to make breaking
+changes to the way we handle ZSTD, backwards compatibility must be
+maintained.
+
+We already save the ZSTD version number within the block header so that it
+can be used for future compatibility checks and/or fixes. However, it is not
+currently used in such a way.
diff --git a/sys/contrib/openzfs/module/zstd/include/aarch64_compat.h b/sys/contrib/openzfs/module/zstd/include/aarch64_compat.h
new file mode 100644
index 000000000000..088517d3d23b
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/aarch64_compat.h
@@ -0,0 +1,37 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2018-2020, Sebastian Gottschall
+ */
+
+#ifdef _KERNEL
+#undef __aarch64__
+#endif
diff --git a/sys/contrib/openzfs/module/zstd/include/limits.h b/sys/contrib/openzfs/module/zstd/include/limits.h
new file mode 100644
index 000000000000..3bf5b67765ae
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/limits.h
@@ -0,0 +1,63 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_LIMITS_H
+#define _ZSTD_LIMITS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#if defined(__FreeBSD__)
+#include <sys/limits.h>
+#elif defined(__linux__)
+#include <linux/limits.h>
+#include <linux/kernel.h>
+#else
+#error "Unsupported platform"
+#endif
+
+#else /* !_KERNEL */
+#include_next <limits.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_LIMITS_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/stddef.h b/sys/contrib/openzfs/module/zstd/include/stddef.h
new file mode 100644
index 000000000000..3f46fb8b033e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/stddef.h
@@ -0,0 +1,62 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_STDDEF_H
+#define _ZSTD_STDDEF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#if defined(__FreeBSD__)
+#include <sys/types.h>
+#elif defined(__linux__)
+#include <linux/types.h>
+#else
+#error "Unsupported platform"
+#endif
+
+#else /* !_KERNEL */
+#include_next <stddef.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_STDDEF_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/stdint.h b/sys/contrib/openzfs/module/zstd/include/stdint.h
new file mode 100644
index 000000000000..2d98a556c23e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/stdint.h
@@ -0,0 +1,62 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_STDINT_H
+#define _ZSTD_STDINT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#if defined(__FreeBSD__)
+#include <sys/stdint.h>
+#elif defined(__linux__)
+#include <linux/types.h>
+#else
+#error "Unsupported platform"
+#endif
+
+#else /* !_KERNEL */
+#include_next <stdint.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_STDINT_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/stdio.h b/sys/contrib/openzfs/module/zstd/include/stdio.h
new file mode 100644
index 000000000000..5a7c6ec69916
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/stdio.h
@@ -0,0 +1,54 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_STDIO_H
+#define _ZSTD_STDIO_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _KERNEL
+
+#include_next <stdio.h>
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_STDIO_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/stdlib.h b/sys/contrib/openzfs/module/zstd/include/stdlib.h
new file mode 100644
index 000000000000..c341a0c84884
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/stdlib.h
@@ -0,0 +1,58 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_STDLIB_H
+#define _ZSTD_STDLIB_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#undef GCC_VERSION
+
+/*
+ * Define calloc, malloc, free to make building work. They are never really used
+ * in zstdlib.c since allocation is done in zstd.c.
+ */
+#define calloc(n, sz) NULL
+#define malloc(sz) NULL
+#define free(ptr)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_STDLIB_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/string.h b/sys/contrib/openzfs/module/zstd/include/string.h
new file mode 100644
index 000000000000..78998d3c4655
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/string.h
@@ -0,0 +1,62 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_STRING_H
+#define _ZSTD_STRING_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#if defined(__FreeBSD__)
+#include <sys/systm.h> /* memcpy, memset */
+#elif defined(__linux__)
+#include <linux/string.h> /* memcpy, memset */
+#else
+#error "Unsupported platform"
+#endif
+
+#else /* !_KERNEL */
+#include_next <string.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_STRING_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h b/sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h
new file mode 100644
index 000000000000..5cca517b5508
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h
@@ -0,0 +1,460 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2020, Sebastian Gottschall
+ */
+
+/*
+ * This wrapper works around a problem that arises when the ZFS filesystem
+ * driver is compiled statically into the kernel: the build would otherwise
+ * collide with the symbols of the older in-kernel zstd library. The macros
+ * below simply rename all local zstd symbols and references with a zfs_
+ * prefix (a brief usage sketch follows the macro table below).
+ *
+ * Note: whenever the zstd library bundled with zfs is updated to a newer
+ * version, this macro list usually needs to be regenerated. That can be done
+ * with some hand-crafting of the output of the following script:
+ * nm zstd.o | awk '{print "#define "$3 " zfs_" $3}' > macrotable
+ */
+
+#define BIT_initDStream zfs_BIT_initDStream
+#define BIT_mask zfs_BIT_mask
+#define BIT_reloadDStream zfs_BIT_reloadDStream
+#define ERR_getErrorString zfs_ERR_getErrorString
+#define FSE_NCountWriteBound zfs_FSE_NCountWriteBound
+#define FSE_buildCTable zfs_FSE_buildCTable
+#define FSE_buildCTable_raw zfs_FSE_buildCTable_raw
+#define FSE_buildCTable_rle zfs_FSE_buildCTable_rle
+#define FSE_buildCTable_wksp zfs_FSE_buildCTable_wksp
+#define FSE_buildDTable zfs_FSE_buildDTable
+#define FSE_buildDTable_raw zfs_FSE_buildDTable_raw
+#define FSE_buildDTable_rle zfs_FSE_buildDTable_rle
+#define FSE_compress zfs_FSE_compress
+#define FSE_compress2 zfs_FSE_compress2
+#define FSE_compressBound zfs_FSE_compressBound
+#define FSE_compress_usingCTable zfs_FSE_compress_usingCTable
+#define FSE_compress_usingCTable_generic zfs_FSE_compress_usingCTable_generic
+#define FSE_compress_wksp zfs_FSE_compress_wksp
+#define FSE_createCTable zfs_FSE_createCTable
+#define FSE_createDTable zfs_FSE_createDTable
+#define FSE_decompress zfs_FSE_decompress
+#define FSE_decompress_usingDTable zfs_FSE_decompress_usingDTable
+#define FSE_decompress_wksp zfs_FSE_decompress_wksp
+#define FSE_freeCTable zfs_FSE_freeCTable
+#define FSE_freeDTable zfs_FSE_freeDTable
+#define FSE_getErrorName zfs_FSE_getErrorName
+#define FSE_normalizeCount zfs_FSE_normalizeCount
+#define FSE_optimalTableLog zfs_FSE_optimalTableLog
+#define FSE_optimalTableLog_internal zfs_FSE_optimalTableLog_internal
+#define FSE_readNCount zfs_FSE_readNCount
+#define FSE_versionNumber zfs_FSE_versionNumber
+#define FSE_writeNCount zfs_FSE_writeNCount
+#define HIST_count zfs_HIST_count
+#define HIST_countFast zfs_HIST_countFast
+#define HIST_countFast_wksp zfs_HIST_countFast_wksp
+#define HIST_count_parallel_wksp zfs_HIST_count_parallel_wksp
+#define HIST_count_simple zfs_HIST_count_simple
+#define HIST_count_wksp zfs_HIST_count_wksp
+#define HUF_buildCTable zfs_HUF_buildCTable
+#define HUF_buildCTable_wksp zfs_HUF_buildCTable_wksp
+#define HUF_compress zfs_HUF_compress
+#define HUF_compress1X zfs_HUF_compress1X
+#define HUF_compress1X_repeat zfs_HUF_compress1X_repeat
+#define HUF_compress1X_usingCTable zfs_HUF_compress1X_usingCTable
+#define HUF_compress1X_wksp zfs_HUF_compress1X_wksp
+#define HUF_compress2 zfs_HUF_compress2
+#define HUF_compress4X_repeat zfs_HUF_compress4X_repeat
+#define HUF_compress4X_usingCTable zfs_HUF_compress4X_usingCTable
+#define HUF_compress4X_wksp zfs_HUF_compress4X_wksp
+#define HUF_compressBound zfs_HUF_compressBound
+#define HUF_compressWeights zfs_HUF_compressWeights
+#define HUF_decompress zfs_HUF_decompress
+#define HUF_decompress1X1 zfs_HUF_decompress1X1
+#define HUF_decompress1X1_DCtx zfs_HUF_decompress1X1_DCtx
+#define HUF_decompress1X1_DCtx_wksp zfs_HUF_decompress1X1_DCtx_wksp
+#define HUF_decompress1X1_DCtx_wksp_bmi2 zfs_HUF_decompress1X1_DCtx_wksp_bmi2
+#define HUF_decompress1X1_usingDTable zfs_HUF_decompress1X1_usingDTable
+#define HUF_decompress1X2 zfs_HUF_decompress1X2
+#define HUF_decompress1X2_DCtx zfs_HUF_decompress1X2_DCtx
+#define HUF_decompress1X2_DCtx_wksp zfs_HUF_decompress1X2_DCtx_wksp
+#define HUF_decompress1X2_usingDTable zfs_HUF_decompress1X2_usingDTable
+#define HUF_decompress1X_DCtx zfs_HUF_decompress1X_DCtx
+#define HUF_decompress1X_DCtx_wksp zfs_HUF_decompress1X_DCtx_wksp
+#define HUF_decompress1X_usingDTable zfs_HUF_decompress1X_usingDTable
+#define HUF_decompress1X_usingDTable_bmi2 zfs_HUF_decompress1X_usingDTable_bmi2
+#define HUF_decompress4X1 zfs_HUF_decompress4X1
+#define HUF_decompress4X1_DCtx zfs_HUF_decompress4X1_DCtx
+#define HUF_decompress4X1_DCtx_wksp zfs_HUF_decompress4X1_DCtx_wksp
+#define HUF_decompress4X1_usingDTable zfs_HUF_decompress4X1_usingDTable
+#define HUF_decompress4X2 zfs_HUF_decompress4X2
+#define HUF_decompress4X2_DCtx zfs_HUF_decompress4X2_DCtx
+#define HUF_decompress4X2_DCtx_wksp zfs_HUF_decompress4X2_DCtx_wksp
+#define HUF_decompress4X2_usingDTable zfs_HUF_decompress4X2_usingDTable
+#define HUF_decompress4X_DCtx zfs_HUF_decompress4X_DCtx
+#define HUF_decompress4X_hufOnly zfs_HUF_decompress4X_hufOnly
+#define HUF_decompress4X_hufOnly_wksp zfs_HUF_decompress4X_hufOnly_wksp
+#define HUF_decompress4X_hufOnly_wksp_bmi2 \
+ zfs_HUF_decompress4X_hufOnly_wksp_bmi2
+#define HUF_decompress4X_usingDTable zfs_HUF_decompress4X_usingDTable
+#define HUF_decompress4X_usingDTable_bmi2 zfs_HUF_decompress4X_usingDTable_bmi2
+#define HUF_estimateCompressedSize zfs_HUF_estimateCompressedSize
+#define HUF_fillDTableX2Level2 zfs_HUF_fillDTableX2Level2
+#define HUF_getErrorName zfs_HUF_getErrorName
+#define HUF_getNbBits zfs_HUF_getNbBits
+#define HUF_optimalTableLog zfs_HUF_optimalTableLog
+#define HUF_readCTable zfs_HUF_readCTable
+#define HUF_readDTableX1 zfs_HUF_readDTableX1
+#define HUF_readDTableX1_wksp zfs_HUF_readDTableX1_wksp
+#define HUF_readDTableX2 zfs_HUF_readDTableX2
+#define HUF_readDTableX2_wksp zfs_HUF_readDTableX2_wksp
+#define HUF_readStats zfs_HUF_readStats
+#define HUF_selectDecoder zfs_HUF_selectDecoder
+#define HUF_setMaxHeight zfs_HUF_setMaxHeight
+#define HUF_validateCTable zfs_HUF_validateCTable
+#define HUF_writeCTable zfs_HUF_writeCTable
+#define LL_base zfs_LL_base
+#define LL_bits zfs_LL_bits
+#define LL_defaultDTable zfs_LL_defaultDTable
+#define LL_defaultNorm zfs_LL_defaultNorm
+#define ML_base zfs_ML_base
+#define ML_bits zfs_ML_bits
+#define ML_defaultDTable zfs_ML_defaultDTable
+#define ML_defaultNorm zfs_ML_defaultNorm
+#define OF_base zfs_OF_base
+#define OF_bits zfs_OF_bits
+#define OF_defaultDTable zfs_OF_defaultDTable
+#define OF_defaultNorm zfs_OF_defaultNorm
+#define POOL_add zfs_POOL_add
+#define POOL_create zfs_POOL_create
+#define POOL_create_advanced zfs_POOL_create_advanced
+#define POOL_free zfs_POOL_free
+#define POOL_resize zfs_POOL_resize
+#define POOL_sizeof zfs_POOL_sizeof
+#define POOL_tryAdd zfs_POOL_tryAdd
+#define ZSTD_CCtxParams_getParameter zfs_ZSTD_CCtxParams_getParameter
+#define ZSTD_CCtxParams_init zfs_ZSTD_CCtxParams_init
+#define ZSTD_CCtxParams_init_advanced zfs_ZSTD_CCtxParams_init_advanced
+#define ZSTD_CCtxParams_reset zfs_ZSTD_CCtxParams_reset
+#define ZSTD_CCtxParams_setParameter zfs_ZSTD_CCtxParams_setParameter
+#define ZSTD_CCtx_getParameter zfs_ZSTD_CCtx_getParameter
+#define ZSTD_CCtx_loadDictionary zfs_ZSTD_CCtx_loadDictionary
+#define ZSTD_CCtx_loadDictionary_advanced zfs_ZSTD_CCtx_loadDictionary_advanced
+#define ZSTD_CCtx_loadDictionary_byReference \
+ zfs_ZSTD_CCtx_loadDictionary_byReference
+#define ZSTD_CCtx_refCDict zfs_ZSTD_CCtx_refCDict
+#define ZSTD_CCtx_refPrefix zfs_ZSTD_CCtx_refPrefix
+#define ZSTD_CCtx_refPrefix_advanced zfs_ZSTD_CCtx_refPrefix_advanced
+#define ZSTD_CCtx_reset zfs_ZSTD_CCtx_reset
+#define ZSTD_CCtx_setParameter zfs_ZSTD_CCtx_setParameter
+#define ZSTD_CCtx_setParametersUsingCCtxParams \
+ zfs_ZSTD_CCtx_setParametersUsingCCtxParams
+#define ZSTD_CCtx_setPledgedSrcSize zfs_ZSTD_CCtx_setPledgedSrcSize
+#define ZSTD_CStreamInSize zfs_ZSTD_CStreamInSize
+#define ZSTD_CStreamOutSize zfs_ZSTD_CStreamOutSize
+#define ZSTD_DCtx_loadDictionary zfs_ZSTD_DCtx_loadDictionary
+#define ZSTD_DCtx_loadDictionary_advanced zfs_ZSTD_DCtx_loadDictionary_advanced
+#define ZSTD_DCtx_loadDictionary_byReference \
+ zfs_ZSTD_DCtx_loadDictionary_byReference
+#define ZSTD_DCtx_refDDict zfs_ZSTD_DCtx_refDDict
+#define ZSTD_DCtx_refPrefix zfs_ZSTD_DCtx_refPrefix
+#define ZSTD_DCtx_refPrefix_advanced zfs_ZSTD_DCtx_refPrefix_advanced
+#define ZSTD_DCtx_reset zfs_ZSTD_DCtx_reset
+#define ZSTD_DCtx_setFormat zfs_ZSTD_DCtx_setFormat
+#define ZSTD_DCtx_setMaxWindowSize zfs_ZSTD_DCtx_setMaxWindowSize
+#define ZSTD_DCtx_setParameter zfs_ZSTD_DCtx_setParameter
+#define ZSTD_DDict_dictContent zfs_ZSTD_DDict_dictContent
+#define ZSTD_DDict_dictSize zfs_ZSTD_DDict_dictSize
+#define ZSTD_DStreamInSize zfs_ZSTD_DStreamInSize
+#define ZSTD_DStreamOutSize zfs_ZSTD_DStreamOutSize
+#define ZSTD_DUBT_findBestMatch zfs_ZSTD_DUBT_findBestMatch
+#define ZSTD_NCountCost zfs_ZSTD_NCountCost
+#define ZSTD_XXH64_digest zfs_ZSTD_XXH64_digest
+#define ZSTD_adjustCParams zfs_ZSTD_adjustCParams
+#define ZSTD_assignParamsToCCtxParams zfs_ZSTD_assignParamsToCCtxParams
+#define ZSTD_buildCTable zfs_ZSTD_buildCTable
+#define ZSTD_buildFSETable zfs_ZSTD_buildFSETable
+#define ZSTD_buildSeqStore zfs_ZSTD_buildSeqStore
+#define ZSTD_buildSeqTable zfs_ZSTD_buildSeqTable
+#define ZSTD_cParam_getBounds zfs_ZSTD_cParam_getBounds
+#define ZSTD_cParam_withinBounds zfs_ZSTD_cParam_withinBounds
+#define ZSTD_calloc zfs_ZSTD_calloc
+#define ZSTD_checkCParams zfs_ZSTD_checkCParams
+#define ZSTD_checkContinuity zfs_ZSTD_checkContinuity
+#define ZSTD_compress zfs_ZSTD_compress
+#define ZSTD_compress2 zfs_ZSTD_compress2
+#define ZSTD_compressBegin zfs_ZSTD_compressBegin
+#define ZSTD_compressBegin_advanced zfs_ZSTD_compressBegin_advanced
+#define ZSTD_compressBegin_advanced_internal \
+ zfs_ZSTD_compressBegin_advanced_internal
+#define ZSTD_compressBegin_usingCDict zfs_ZSTD_compressBegin_usingCDict
+#define ZSTD_compressBegin_usingCDict_advanced \
+ zfs_ZSTD_compressBegin_usingCDict_advanced
+#define ZSTD_compressBegin_usingDict zfs_ZSTD_compressBegin_usingDict
+#define ZSTD_compressBlock zfs_ZSTD_compressBlock
+#define ZSTD_compressBlock_btlazy2 zfs_ZSTD_compressBlock_btlazy2
+#define ZSTD_compressBlock_btlazy2_dictMatchState \
+ zfs_ZSTD_compressBlock_btlazy2_dictMatchState
+#define ZSTD_compressBlock_btlazy2_extDict \
+ zfs_ZSTD_compressBlock_btlazy2_extDict
+#define ZSTD_compressBlock_btopt zfs_ZSTD_compressBlock_btopt
+#define ZSTD_compressBlock_btopt_dictMatchState \
+ zfs_ZSTD_compressBlock_btopt_dictMatchState
+#define ZSTD_compressBlock_btopt_extDict zfs_ZSTD_compressBlock_btopt_extDict
+#define ZSTD_compressBlock_btultra zfs_ZSTD_compressBlock_btultra
+#define ZSTD_compressBlock_btultra2 zfs_ZSTD_compressBlock_btultra2
+#define ZSTD_compressBlock_btultra_dictMatchState \
+ zfs_ZSTD_compressBlock_btultra_dictMatchState
+#define ZSTD_compressBlock_btultra_extDict \
+ zfs_ZSTD_compressBlock_btultra_extDict
+#define ZSTD_compressBlock_doubleFast zfs_ZSTD_compressBlock_doubleFast
+#define ZSTD_compressBlock_doubleFast_dictMatchState \
+ zfs_ZSTD_compressBlock_doubleFast_dictMatchState
+#define ZSTD_compressBlock_doubleFast_extDict \
+ zfs_ZSTD_compressBlock_doubleFast_extDict
+#define ZSTD_compressBlock_doubleFast_extDict_generic \
+ zfs_ZSTD_compressBlock_doubleFast_extDict_generic
+#define ZSTD_compressBlock_fast zfs_ZSTD_compressBlock_fast
+#define ZSTD_compressBlock_fast_dictMatchState \
+ zfs_ZSTD_compressBlock_fast_dictMatchState
+#define ZSTD_compressBlock_fast_extDict zfs_ZSTD_compressBlock_fast_extDict
+#define ZSTD_compressBlock_fast_extDict_generic \
+ zfs_ZSTD_compressBlock_fast_extDict_generic
+#define ZSTD_compressBlock_greedy zfs_ZSTD_compressBlock_greedy
+#define ZSTD_compressBlock_greedy_dictMatchState \
+ zfs_ZSTD_compressBlock_greedy_dictMatchState
+#define ZSTD_compressBlock_greedy_extDict zfs_ZSTD_compressBlock_greedy_extDict
+#define ZSTD_compressBlock_internal zfs_ZSTD_compressBlock_internal
+#define ZSTD_compressBlock_lazy zfs_ZSTD_compressBlock_lazy
+#define ZSTD_compressBlock_lazy2 zfs_ZSTD_compressBlock_lazy2
+#define ZSTD_compressBlock_lazy2_dictMatchState \
+ zfs_ZSTD_compressBlock_lazy2_dictMatchState
+#define ZSTD_compressBlock_lazy2_extDict zfs_ZSTD_compressBlock_lazy2_extDict
+#define ZSTD_compressBlock_lazy_dictMatchState \
+ zfs_ZSTD_compressBlock_lazy_dictMatchState
+#define ZSTD_compressBlock_lazy_extDict zfs_ZSTD_compressBlock_lazy_extDict
+#define ZSTD_compressBound zfs_ZSTD_compressBound
+#define ZSTD_compressCCtx zfs_ZSTD_compressCCtx
+#define ZSTD_compressContinue zfs_ZSTD_compressContinue
+#define ZSTD_compressContinue_internal zfs_ZSTD_compressContinue_internal
+#define ZSTD_compressEnd zfs_ZSTD_compressEnd
+#define ZSTD_compressLiterals zfs_ZSTD_compressLiterals
+#define ZSTD_compressRleLiteralsBlock zfs_ZSTD_compressRleLiteralsBlock
+#define ZSTD_compressStream zfs_ZSTD_compressStream
+#define ZSTD_compressStream2 zfs_ZSTD_compressStream2
+#define ZSTD_compressStream2_simpleArgs zfs_ZSTD_compressStream2_simpleArgs
+#define ZSTD_compressSuperBlock zfs_ZSTD_compressSuperBlock
+#define ZSTD_compress_advanced zfs_ZSTD_compress_advanced
+#define ZSTD_compress_advanced_internal zfs_ZSTD_compress_advanced_internal
+#define ZSTD_compress_internal zfs_ZSTD_compress_internal
+#define ZSTD_compress_usingCDict zfs_ZSTD_compress_usingCDict
+#define ZSTD_compress_usingCDict_advanced zfs_ZSTD_compress_usingCDict_advanced
+#define ZSTD_compress_usingDict zfs_ZSTD_compress_usingDict
+#define ZSTD_copyCCtx zfs_ZSTD_copyCCtx
+#define ZSTD_copyDCtx zfs_ZSTD_copyDCtx
+#define ZSTD_copyDDictParameters zfs_ZSTD_copyDDictParameters
+#define ZSTD_count zfs_ZSTD_count
+#define ZSTD_count_2segments zfs_ZSTD_count_2segments
+#define ZSTD_createCCtx zfs_ZSTD_createCCtx
+#define ZSTD_createCCtxParams zfs_ZSTD_createCCtxParams
+#define ZSTD_createCCtx_advanced zfs_ZSTD_createCCtx_advanced
+#define ZSTD_createCDict zfs_ZSTD_createCDict
+#define ZSTD_createCDict_advanced zfs_ZSTD_createCDict_advanced
+#define ZSTD_createCDict_byReference zfs_ZSTD_createCDict_byReference
+#define ZSTD_createCStream zfs_ZSTD_createCStream
+#define ZSTD_createCStream_advanced zfs_ZSTD_createCStream_advanced
+#define ZSTD_createDCtx zfs_ZSTD_createDCtx
+#define ZSTD_createDCtx_advanced zfs_ZSTD_createDCtx_advanced
+#define ZSTD_createDDict zfs_ZSTD_createDDict
+#define ZSTD_createDDict_advanced zfs_ZSTD_createDDict_advanced
+#define ZSTD_createDDict_byReference zfs_ZSTD_createDDict_byReference
+#define ZSTD_createDStream zfs_ZSTD_createDStream
+#define ZSTD_createDStream_advanced zfs_ZSTD_createDStream_advanced
+#define ZSTD_crossEntropyCost zfs_ZSTD_crossEntropyCost
+#define ZSTD_cycleLog zfs_ZSTD_cycleLog
+#define ZSTD_dParam_getBounds zfs_ZSTD_dParam_getBounds
+#define ZSTD_decodeLiteralsBlock zfs_ZSTD_decodeLiteralsBlock
+#define ZSTD_decodeSeqHeaders zfs_ZSTD_decodeSeqHeaders
+#define ZSTD_decodingBufferSize_min zfs_ZSTD_decodingBufferSize_min
+#define ZSTD_decompress zfs_ZSTD_decompress
+#define ZSTD_decompressBegin zfs_ZSTD_decompressBegin
+#define ZSTD_decompressBegin_usingDDict zfs_ZSTD_decompressBegin_usingDDict
+#define ZSTD_decompressBegin_usingDict zfs_ZSTD_decompressBegin_usingDict
+#define ZSTD_decompressBlock zfs_ZSTD_decompressBlock
+#define ZSTD_decompressBlock_internal zfs_ZSTD_decompressBlock_internal
+#define ZSTD_decompressBound zfs_ZSTD_decompressBound
+#define ZSTD_decompressContinue zfs_ZSTD_decompressContinue
+#define ZSTD_decompressContinueStream zfs_ZSTD_decompressContinueStream
+#define ZSTD_decompressDCtx zfs_ZSTD_decompressDCtx
+#define ZSTD_decompressMultiFrame zfs_ZSTD_decompressMultiFrame
+#define ZSTD_decompressStream zfs_ZSTD_decompressStream
+#define ZSTD_decompressStream_simpleArgs zfs_ZSTD_decompressStream_simpleArgs
+#define ZSTD_decompress_usingDDict zfs_ZSTD_decompress_usingDDict
+#define ZSTD_decompress_usingDict zfs_ZSTD_decompress_usingDict
+#define ZSTD_defaultCParameters zfs_ZSTD_defaultCParameters
+#define ZSTD_did_fieldSize zfs_ZSTD_did_fieldSize
+#define ZSTD_encodeSequences zfs_ZSTD_encodeSequences
+#define ZSTD_encodeSequences_default zfs_ZSTD_encodeSequences_default
+#define ZSTD_endStream zfs_ZSTD_endStream
+#define ZSTD_estimateCCtxSize zfs_ZSTD_estimateCCtxSize
+#define ZSTD_estimateCCtxSize_usingCCtxParams \
+ zfs_ZSTD_estimateCCtxSize_usingCCtxParams
+#define ZSTD_estimateCCtxSize_usingCParams \
+ zfs_ZSTD_estimateCCtxSize_usingCParams
+#define ZSTD_estimateCDictSize zfs_ZSTD_estimateCDictSize
+#define ZSTD_estimateCDictSize_advanced zfs_ZSTD_estimateCDictSize_advanced
+#define ZSTD_estimateCStreamSize zfs_ZSTD_estimateCStreamSize
+#define ZSTD_estimateCStreamSize_usingCCtxParams \
+ zfs_ZSTD_estimateCStreamSize_usingCCtxParams
+#define ZSTD_estimateCStreamSize_usingCParams \
+ zfs_ZSTD_estimateCStreamSize_usingCParams
+#define ZSTD_estimateDCtxSize zfs_ZSTD_estimateDCtxSize
+#define ZSTD_estimateDDictSize zfs_ZSTD_estimateDDictSize
+#define ZSTD_estimateDStreamSize zfs_ZSTD_estimateDStreamSize
+#define ZSTD_estimateDStreamSize_fromFrame \
+ zfs_ZSTD_estimateDStreamSize_fromFrame
+#define ZSTD_fcs_fieldSize zfs_ZSTD_fcs_fieldSize
+#define ZSTD_fillDoubleHashTable zfs_ZSTD_fillDoubleHashTable
+#define ZSTD_fillHashTable zfs_ZSTD_fillHashTable
+#define ZSTD_findDecompressedSize zfs_ZSTD_findDecompressedSize
+#define ZSTD_findFrameCompressedSize zfs_ZSTD_findFrameCompressedSize
+#define ZSTD_findFrameSizeInfo zfs_ZSTD_findFrameSizeInfo
+#define ZSTD_flushStream zfs_ZSTD_flushStream
+#define ZSTD_frameHeaderSize zfs_ZSTD_frameHeaderSize
+#define ZSTD_free zfs_ZSTD_free
+#define ZSTD_freeCCtx zfs_ZSTD_freeCCtx
+#define ZSTD_freeCCtxParams zfs_ZSTD_freeCCtxParams
+#define ZSTD_freeCDict zfs_ZSTD_freeCDict
+#define ZSTD_freeCStream zfs_ZSTD_freeCStream
+#define ZSTD_freeDCtx zfs_ZSTD_freeDCtx
+#define ZSTD_freeDDict zfs_ZSTD_freeDDict
+#define ZSTD_freeDStream zfs_ZSTD_freeDStream
+#define ZSTD_fseBitCost zfs_ZSTD_fseBitCost
+#define ZSTD_getBlockSize zfs_ZSTD_getBlockSize
+#define ZSTD_getCParams zfs_ZSTD_getCParams
+#define ZSTD_getCParamsFromCCtxParams zfs_ZSTD_getCParamsFromCCtxParams
+#define ZSTD_getCParamsFromCDict zfs_ZSTD_getCParamsFromCDict
+#define ZSTD_getCParams_internal zfs_ZSTD_getCParams_internal
+#define ZSTD_getDDict zfs_ZSTD_getDDict
+#define ZSTD_getDecompressedSize zfs_ZSTD_getDecompressedSize
+#define ZSTD_getDictID_fromDDict zfs_ZSTD_getDictID_fromDDict
+#define ZSTD_getDictID_fromDict zfs_ZSTD_getDictID_fromDict
+#define ZSTD_getDictID_fromFrame zfs_ZSTD_getDictID_fromFrame
+#define ZSTD_getErrorCode zfs_ZSTD_getErrorCode
+#define ZSTD_getErrorName zfs_ZSTD_getErrorName
+#define ZSTD_getErrorString zfs_ZSTD_getErrorString
+#define ZSTD_getFrameContentSize zfs_ZSTD_getFrameContentSize
+#define ZSTD_getFrameHeader zfs_ZSTD_getFrameHeader
+#define ZSTD_getFrameHeader_advanced zfs_ZSTD_getFrameHeader_advanced
+#define ZSTD_getFrameProgression zfs_ZSTD_getFrameProgression
+#define ZSTD_getParams zfs_ZSTD_getParams
+#define ZSTD_getSeqStore zfs_ZSTD_getSeqStore
+#define ZSTD_getSequences zfs_ZSTD_getSequences
+#define ZSTD_getcBlockSize zfs_ZSTD_getcBlockSize
+#define ZSTD_hashPtr zfs_ZSTD_hashPtr
+#define ZSTD_initCDict_internal zfs_ZSTD_initCDict_internal
+#define ZSTD_initCStream zfs_ZSTD_initCStream
+#define ZSTD_initCStream_advanced zfs_ZSTD_initCStream_advanced
+#define ZSTD_initCStream_internal zfs_ZSTD_initCStream_internal
+#define ZSTD_initCStream_srcSize zfs_ZSTD_initCStream_srcSize
+#define ZSTD_initCStream_usingCDict zfs_ZSTD_initCStream_usingCDict
+#define ZSTD_initCStream_usingCDict_advanced \
+ zfs_ZSTD_initCStream_usingCDict_advanced
+#define ZSTD_initCStream_usingDict zfs_ZSTD_initCStream_usingDict
+#define ZSTD_initDDict_internal zfs_ZSTD_initDDict_internal
+#define ZSTD_initDStream zfs_ZSTD_initDStream
+#define ZSTD_initDStream_usingDDict zfs_ZSTD_initDStream_usingDDict
+#define ZSTD_initDStream_usingDict zfs_ZSTD_initDStream_usingDict
+#define ZSTD_initFseState zfs_ZSTD_initFseState
+#define ZSTD_initStaticCCtx zfs_ZSTD_initStaticCCtx
+#define ZSTD_initStaticCDict zfs_ZSTD_initStaticCDict
+#define ZSTD_initStaticCStream zfs_ZSTD_initStaticCStream
+#define ZSTD_initStaticDCtx zfs_ZSTD_initStaticDCtx
+#define ZSTD_initStaticDDict zfs_ZSTD_initStaticDDict
+#define ZSTD_initStaticDStream zfs_ZSTD_initStaticDStream
+#define ZSTD_initStats_ultra zfs_ZSTD_initStats_ultra
+#define ZSTD_insertAndFindFirstIndex zfs_ZSTD_insertAndFindFirstIndex
+#define ZSTD_insertAndFindFirstIndexHash3 zfs_ZSTD_insertAndFindFirstIndexHash3
+#define ZSTD_insertAndFindFirstIndex_internal \
+ zfs_ZSTD_insertAndFindFirstIndex_internal
+#define ZSTD_insertBlock zfs_ZSTD_insertBlock
+#define ZSTD_invalidateRepCodes zfs_ZSTD_invalidateRepCodes
+#define ZSTD_isFrame zfs_ZSTD_isFrame
+#define ZSTD_ldm_adjustParameters zfs_ZSTD_ldm_adjustParameters
+#define ZSTD_ldm_blockCompress zfs_ZSTD_ldm_blockCompress
+#define ZSTD_ldm_fillHashTable zfs_ZSTD_ldm_fillHashTable
+#define ZSTD_ldm_generateSequences zfs_ZSTD_ldm_generateSequences
+#define ZSTD_ldm_getMaxNbSeq zfs_ZSTD_ldm_getMaxNbSeq
+#define ZSTD_ldm_getTableSize zfs_ZSTD_ldm_getTableSize
+#define ZSTD_ldm_skipSequences zfs_ZSTD_ldm_skipSequences
+#define ZSTD_loadCEntropy zfs_ZSTD_loadCEntropy
+#define ZSTD_loadDEntropy zfs_ZSTD_loadDEntropy
+#define ZSTD_loadDictionaryContent zfs_ZSTD_loadDictionaryContent
+#define ZSTD_makeCCtxParamsFromCParams zfs_ZSTD_makeCCtxParamsFromCParams
+#define ZSTD_malloc zfs_ZSTD_malloc
+#define ZSTD_maxCLevel zfs_ZSTD_maxCLevel
+#define ZSTD_minCLevel zfs_ZSTD_minCLevel
+#define ZSTD_nextInputType zfs_ZSTD_nextInputType
+#define ZSTD_nextSrcSizeToDecompress zfs_ZSTD_nextSrcSizeToDecompress
+#define ZSTD_noCompressLiterals zfs_ZSTD_noCompressLiterals
+#define ZSTD_referenceExternalSequences zfs_ZSTD_referenceExternalSequences
+#define ZSTD_rescaleFreqs zfs_ZSTD_rescaleFreqs
+#define ZSTD_resetCCtx_internal zfs_ZSTD_resetCCtx_internal
+#define ZSTD_resetCCtx_usingCDict zfs_ZSTD_resetCCtx_usingCDict
+#define ZSTD_resetCStream zfs_ZSTD_resetCStream
+#define ZSTD_resetDStream zfs_ZSTD_resetDStream
+#define ZSTD_resetSeqStore zfs_ZSTD_resetSeqStore
+#define ZSTD_reset_compressedBlockState zfs_ZSTD_reset_compressedBlockState
+#define ZSTD_safecopy zfs_ZSTD_safecopy
+#define ZSTD_selectBlockCompressor zfs_ZSTD_selectBlockCompressor
+#define ZSTD_selectEncodingType zfs_ZSTD_selectEncodingType
+#define ZSTD_seqToCodes zfs_ZSTD_seqToCodes
+#define ZSTD_sizeof_CCtx zfs_ZSTD_sizeof_CCtx
+#define ZSTD_sizeof_CDict zfs_ZSTD_sizeof_CDict
+#define ZSTD_sizeof_CStream zfs_ZSTD_sizeof_CStream
+#define ZSTD_sizeof_DCtx zfs_ZSTD_sizeof_DCtx
+#define ZSTD_sizeof_DDict zfs_ZSTD_sizeof_DDict
+#define ZSTD_sizeof_DStream zfs_ZSTD_sizeof_DStream
+#define ZSTD_toFlushNow zfs_ZSTD_toFlushNow
+#define ZSTD_updateRep zfs_ZSTD_updateRep
+#define ZSTD_updateStats zfs_ZSTD_updateStats
+#define ZSTD_updateTree zfs_ZSTD_updateTree
+#define ZSTD_versionNumber zfs_ZSTD_versionNumber
+#define ZSTD_versionString zfs_ZSTD_versionString
+#define ZSTD_writeFrameHeader zfs_ZSTD_writeFrameHeader
+#define ZSTD_writeLastEmptyBlock zfs_ZSTD_writeLastEmptyBlock
+#define algoTime zfs_algoTime
+#define attachDictSizeCutoffs zfs_attachDictSizeCutoffs
+#define g_ctx zfs_g_ctx
+#define g_debuglevel zfs_g_debuglevel
+#define kInverseProbabilityLog256 zfs_kInverseProbabilityLog256
+#define repStartValue zfs_repStartValue
+#define FSE_isError zfs_FSE_isError
+#define HUF_isError zfs_HUF_isError
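
Editor's note: as the header comment above explains, the wrapper works purely at the preprocessor level. Any translation unit that includes zstd_compat_wrapper.h before the zstd headers ends up referencing only zfs_-prefixed symbols, so a statically linked kernel that already contains its own zstd cannot collide with this copy. A minimal consumer sketch (illustrative only, not taken from the tree):

/* Illustrative consumer of the renamed symbols. */
#include "zstd_compat_wrapper.h"   /* must come before any zstd header */
#include "zstd.h"

unsigned
zfs_zstd_lib_version(void)
{
	/*
	 * The preprocessor rewrites this call to zfs_ZSTD_versionNumber(),
	 * so it links against the bundled library even when the kernel
	 * already exports an (older) ZSTD_versionNumber().
	 */
	return (ZSTD_versionNumber());
}
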
diff --git a/sys/contrib/openzfs/module/zstd/lib/zstd.c b/sys/contrib/openzfs/module/zstd/lib/zstd.c
new file mode 100644
index 000000000000..acdd4d9dac9d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/lib/zstd.c
@@ -0,0 +1,27826 @@
+/*
+ * BSD 3-Clause Clear License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. All rights reserved.
+ * Copyright (c) 2019-2020, Michael Niewöhner. All rights reserved.
+ */
+
+#define MEM_MODULE
+#define XXH_NAMESPACE ZSTD_
+#define XXH_PRIVATE_API
+#define XXH_INLINE_ALL
+#define ZSTD_LEGACY_SUPPORT 0
+#define ZSTD_LIB_DICTBUILDER 0
+#define ZSTD_LIB_DEPRECATED 0
+#define ZSTD_NOBENCH
+
+/**** start inlining common/debug.c ****/
+/* ******************************************************************
+ * debug
+ * Part of FSE library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/*
+ * This module only hosts one global variable
+ * which can be used to dynamically influence the verbosity of traces,
+ * such as DEBUGLOG and RAWLOG
+ */
+
+/**** start inlining debug.h ****/
+/* ******************************************************************
+ * debug
+ * Part of FSE library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/*
+ * The purpose of this header is to enable debug functions.
+ * They regroup assert(), DEBUGLOG() and RAWLOG() for run-time,
+ * and DEBUG_STATIC_ASSERT() for compile-time.
+ *
+ * By default, DEBUGLEVEL==0, which means run-time debug is disabled.
+ *
+ * Level 1 enables assert() only.
+ * Starting level 2, traces can be generated and pushed to stderr.
+ * The higher the level, the more verbose the traces.
+ *
+ * It's possible to dynamically adjust the level using the variable
+ * g_debuglevel, which is only declared if DEBUGLEVEL>=2. It is a global
+ * variable and is not multi-thread protected (use with care).
+ */
+
+#ifndef DEBUG_H_12987983217
+#define DEBUG_H_12987983217
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* static assert is triggered at compile time, leaving no runtime artefact.
+ * static assert only works with compile-time constants.
+ * Also, this variant can only be used inside a function. */
+#define DEBUG_STATIC_ASSERT(c) (void)sizeof(char[(c) ? 1 : -1])
+
+
+/* DEBUGLEVEL is expected to be defined externally,
+ * typically through compiler command line.
+ * Value must be a number. */
+#ifndef DEBUGLEVEL
+# define DEBUGLEVEL 0
+#endif
+
+
+/* DEBUGFILE can be defined externally,
+ * typically through compiler command line.
+ * note : currently useless.
+ * Value must be stderr or stdout */
+#ifndef DEBUGFILE
+# define DEBUGFILE stderr
+#endif
+
+
+/* recommended values for DEBUGLEVEL :
+ * 0 : release mode, no debug, all run-time checks disabled
+ * 1 : enables assert() only, no display
+ * 2 : reserved, for currently active debug path
+ * 3 : events once per object lifetime (CCtx, CDict, etc.)
+ * 4 : events once per frame
+ * 5 : events once per block
+ * 6 : events once per sequence (verbose)
+ * 7+: events at every position (*very* verbose)
+ *
+ * It's generally inconvenient to output traces > 5.
+ * In that case, high verbosity levels can be triggered selectively
+ * by modifying g_debuglevel.
+ */
+
+#if (DEBUGLEVEL>=1)
+# include <assert.h>
+#else
+# ifndef assert /* assert may be already defined, due to prior #include <assert.h> */
+# define assert(condition) ((void)0) /* disable assert (default) */
+# endif
+#endif
+
+#if (DEBUGLEVEL>=2)
+# include <stdio.h>
+extern int g_debuglevel; /* the variable is only declared,
+ it actually lives in debug.c,
+ and is shared by the whole process.
+ It's not thread-safe.
+ It's useful when enabling very verbose levels
+ on selective conditions (such as position in src) */
+
+# define RAWLOG(l, ...) { \
+ if (l<=g_debuglevel) { \
+ fprintf(stderr, __VA_ARGS__); \
+ } }
+# define DEBUGLOG(l, ...) { \
+ if (l<=g_debuglevel) { \
+ fprintf(stderr, __FILE__ ": " __VA_ARGS__); \
+ fprintf(stderr, " \n"); \
+ } }
+#else
+# define RAWLOG(l, ...) {} /* disabled */
+# define DEBUGLOG(l, ...) {} /* disabled */
+#endif
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* DEBUG_H_12987983217 */
+/**** ended inlining debug.h ****/
+
+int g_debuglevel = DEBUGLEVEL;
+/**** ended inlining common/debug.c ****/
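
Editor's note: the debug machinery above is entirely compile-time gated. With the default DEBUGLEVEL of 0 both assert() and the log macros compile away; builds made with e.g. -DDEBUGLEVEL=2 or higher can also raise g_debuglevel at run time to make traces more verbose. A hedged example of typical usage, assuming the debug.h definitions above are in scope (the function itself is hypothetical):

/* Hypothetical example; compile with -DDEBUGLEVEL=5 or higher to see traces. */
#include <string.h>   /* memcpy */

static size_t
copy_block(void *dst, const void *src, size_t size)
{
	DEBUG_STATIC_ASSERT(sizeof (size_t) >= 4);      /* compile-time check, no runtime cost */
	assert(dst != NULL && src != NULL);             /* active only when DEBUGLEVEL >= 1 */
	DEBUGLOG(5, "copy_block: %u bytes", (unsigned)size);  /* level 5: once per block */
	memcpy(dst, src, size);
	return (size);
}

/* g_debuglevel = 7;  -- optionally raise verbosity at run time (DEBUGLEVEL >= 2 builds only) */
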
+/**** start inlining common/entropy_common.c ****/
+/* ******************************************************************
+ * Common functions of New Generation Entropy library
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* *************************************
+* Dependencies
+***************************************/
+/**** start inlining mem.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+* Dependencies
+******************************************/
+#include <stddef.h> /* size_t, ptrdiff_t */
+#include <string.h> /* memcpy */
+
+
+/*-****************************************
+* Compiler specifics
+******************************************/
+#if defined(_MSC_VER) /* Visual Studio */
+# include <stdlib.h> /* _byteswap_ulong */
+# include <intrin.h> /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+# define MEM_STATIC static __inline __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+# define MEM_STATIC static __inline
+#else
+# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+#ifndef __has_builtin
+# define __has_builtin(x) 0 /* compat. with non-clang compilers */
+#endif
+
+/* code only tested on 32 and 64 bits systems */
+#define MEM_STATIC_ASSERT(c) { enum { MEM_static_assert = 1/(int)(!!(c)) }; }
+MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); }
+
+/* detects whether we are being compiled under msan */
+#if defined (__has_feature)
+# if __has_feature(memory_sanitizer)
+# define MEMORY_SANITIZER 1
+# endif
+#endif
+
+#if defined (MEMORY_SANITIZER)
+/* Not all platforms that support msan provide sanitizers/msan_interface.h.
+ * We therefore declare the functions we need ourselves, rather than trying to
+ * include the header file... */
+
+#include <stdint.h> /* intptr_t */
+
+/* Make memory region fully initialized (without changing its contents). */
+void __msan_unpoison(const volatile void *a, size_t size);
+
+/* Make memory region fully uninitialized (without changing its contents).
+ This is a legacy interface that does not update origin information. Use
+ __msan_allocated_memory() instead. */
+void __msan_poison(const volatile void *a, size_t size);
+
+/* Returns the offset of the first (at least partially) poisoned byte in the
+ memory range, or -1 if the whole range is good. */
+intptr_t __msan_test_shadow(const volatile void *x, size_t size);
+#endif
+
+/* detects whether we are being compiled under asan */
+#if defined (__has_feature)
+# if __has_feature(address_sanitizer)
+# define ADDRESS_SANITIZER 1
+# endif
+#elif defined(__SANITIZE_ADDRESS__)
+# define ADDRESS_SANITIZER 1
+#endif
+
+#if defined (ADDRESS_SANITIZER)
+/* Not all platforms that support asan provide sanitizers/asan_interface.h.
+ * We therefore declare the functions we need ourselves, rather than trying to
+ * include the header file... */
+
+/**
+ * Marks a memory region (<c>[addr, addr+size)</c>) as unaddressable.
+ *
+ * This memory must be previously allocated by your program. Instrumented
+ * code is forbidden from accessing addresses in this region until it is
+ * unpoisoned. This function is not guaranteed to poison the entire region -
+ * it could poison only a subregion of <c>[addr, addr+size)</c> due to ASan
+ * alignment restrictions.
+ *
+ * \note This function is not thread-safe because no two threads can poison or
+ * unpoison memory in the same memory region simultaneously.
+ *
+ * \param addr Start of memory region.
+ * \param size Size of memory region. */
+void __asan_poison_memory_region(void const volatile *addr, size_t size);
+
+/**
+ * Marks a memory region (<c>[addr, addr+size)</c>) as addressable.
+ *
+ * This memory must be previously allocated by your program. Accessing
+ * addresses in this region is allowed until this region is poisoned again.
+ * This function could unpoison a super-region of <c>[addr, addr+size)</c> due
+ * to ASan alignment restrictions.
+ *
+ * \note This function is not thread-safe because no two threads can
+ * poison or unpoison memory in the same memory region simultaneously.
+ *
+ * \param addr Start of memory region.
+ * \param size Size of memory region. */
+void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
+#endif
+
+
+/*-**************************************************************
+* Basic Types
+*****************************************************************/
+#if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint8_t BYTE;
+ typedef uint16_t U16;
+ typedef int16_t S16;
+ typedef uint32_t U32;
+ typedef int32_t S32;
+ typedef uint64_t U64;
+ typedef int64_t S64;
+#else
+# include <limits.h>
+#if CHAR_BIT != 8
+# error "this implementation requires char to be exactly 8-bit type"
+#endif
+ typedef unsigned char BYTE;
+#if USHRT_MAX != 65535
+# error "this implementation requires short to be exactly 16-bit type"
+#endif
+ typedef unsigned short U16;
+ typedef signed short S16;
+#if UINT_MAX != 4294967295
+# error "this implementation requires int to be exactly 32-bit type"
+#endif
+ typedef unsigned int U32;
+ typedef signed int S32;
+/* note : there are no limits defined for long long type in C90.
+ * limits exist in C99, however, in such case, <stdint.h> is preferred */
+ typedef unsigned long long U64;
+ typedef signed long long S64;
+#endif
+
+
+/*-**************************************************************
+* Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It relies on a compiler extension (i.e., not portable).
+ * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ * It can generate buggy code on targets that depend on alignment.
+ * In some circumstances, it's the only known way to get the best performance (e.g. GCC + ARMv6).
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
+# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+# define MEM_FORCE_MEMORY_ACCESS 2
+# elif defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
+# define MEM_FORCE_MEMORY_ACCESS 1
+# endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+ const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */
+ return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* Violates the C standard by lying about structure alignment.
+Only use this if there is no other way to achieve best performance on the target platform. */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
+ __pragma( pack(push, 1) )
+ typedef struct { U16 v; } unalign16;
+ typedef struct { U32 v; } unalign32;
+ typedef struct { U64 v; } unalign64;
+ typedef struct { size_t v; } unalignArch;
+ __pragma( pack(pop) )
+#else
+ typedef struct { U16 v; } __attribute__((packed)) unalign16;
+ typedef struct { U32 v; } __attribute__((packed)) unalign32;
+ typedef struct { U64 v; } __attribute__((packed)) unalign64;
+ typedef struct { size_t v; } __attribute__((packed)) unalignArch;
+#endif
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; }
+MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = value; }
+
+#else
+
+/* default method, safe and standard.
+ can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+ U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+ U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+ U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC size_t MEM_readST(const void* memPtr)
+{
+ size_t val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+ memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write32(void* memPtr, U32 value)
+{
+ memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write64(void* memPtr, U64 value)
+{
+ memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32(U32 in)
+{
+#if defined(_MSC_VER) /* Visual Studio */
+ return _byteswap_ulong(in);
+#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \
+ || (defined(__clang__) && __has_builtin(__builtin_bswap32))
+ return __builtin_bswap32(in);
+#else
+ return ((in << 24) & 0xff000000 ) |
+ ((in << 8) & 0x00ff0000 ) |
+ ((in >> 8) & 0x0000ff00 ) |
+ ((in >> 24) & 0x000000ff );
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)
+{
+#if defined(_MSC_VER) /* Visual Studio */
+ return _byteswap_uint64(in);
+#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \
+ || (defined(__clang__) && __has_builtin(__builtin_bswap64))
+ return __builtin_bswap64(in);
+#else
+ return ((in << 56) & 0xff00000000000000ULL) |
+ ((in << 40) & 0x00ff000000000000ULL) |
+ ((in << 24) & 0x0000ff0000000000ULL) |
+ ((in << 8) & 0x000000ff00000000ULL) |
+ ((in >> 8) & 0x00000000ff000000ULL) |
+ ((in >> 24) & 0x0000000000ff0000ULL) |
+ ((in >> 40) & 0x000000000000ff00ULL) |
+ ((in >> 56) & 0x00000000000000ffULL);
+#endif
+}
+
+MEM_STATIC size_t MEM_swapST(size_t in)
+{
+ if (MEM_32bits())
+ return (size_t)MEM_swap32((U32)in);
+ else
+ return (size_t)MEM_swap64((U64)in);
+}
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+ if (MEM_isLittleEndian())
+ return MEM_read16(memPtr);
+ else {
+ const BYTE* p = (const BYTE*)memPtr;
+ return (U16)(p[0] + (p[1]<<8));
+ }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+ if (MEM_isLittleEndian()) {
+ MEM_write16(memPtr, val);
+ } else {
+ BYTE* p = (BYTE*)memPtr;
+ p[0] = (BYTE)val;
+ p[1] = (BYTE)(val>>8);
+ }
+}
+
+MEM_STATIC U32 MEM_readLE24(const void* memPtr)
+{
+ return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16);
+}
+
+MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val)
+{
+ MEM_writeLE16(memPtr, (U16)val);
+ ((BYTE*)memPtr)[2] = (BYTE)(val>>16);
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+ if (MEM_isLittleEndian())
+ return MEM_read32(memPtr);
+ else
+ return MEM_swap32(MEM_read32(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32)
+{
+ if (MEM_isLittleEndian())
+ MEM_write32(memPtr, val32);
+ else
+ MEM_write32(memPtr, MEM_swap32(val32));
+}
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+ if (MEM_isLittleEndian())
+ return MEM_read64(memPtr);
+ else
+ return MEM_swap64(MEM_read64(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64)
+{
+ if (MEM_isLittleEndian())
+ MEM_write64(memPtr, val64);
+ else
+ MEM_write64(memPtr, MEM_swap64(val64));
+}
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+ if (MEM_32bits())
+ return (size_t)MEM_readLE32(memPtr);
+ else
+ return (size_t)MEM_readLE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val)
+{
+ if (MEM_32bits())
+ MEM_writeLE32(memPtr, (U32)val);
+ else
+ MEM_writeLE64(memPtr, (U64)val);
+}
+
+/*=== Big endian r/w ===*/
+
+MEM_STATIC U32 MEM_readBE32(const void* memPtr)
+{
+ if (MEM_isLittleEndian())
+ return MEM_swap32(MEM_read32(memPtr));
+ else
+ return MEM_read32(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32)
+{
+ if (MEM_isLittleEndian())
+ MEM_write32(memPtr, MEM_swap32(val32));
+ else
+ MEM_write32(memPtr, val32);
+}
+
+MEM_STATIC U64 MEM_readBE64(const void* memPtr)
+{
+ if (MEM_isLittleEndian())
+ return MEM_swap64(MEM_read64(memPtr));
+ else
+ return MEM_read64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64)
+{
+ if (MEM_isLittleEndian())
+ MEM_write64(memPtr, MEM_swap64(val64));
+ else
+ MEM_write64(memPtr, val64);
+}
+
+MEM_STATIC size_t MEM_readBEST(const void* memPtr)
+{
+ if (MEM_32bits())
+ return (size_t)MEM_readBE32(memPtr);
+ else
+ return (size_t)MEM_readBE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val)
+{
+ if (MEM_32bits())
+ MEM_writeBE32(memPtr, (U32)val);
+ else
+ MEM_writeBE64(memPtr, (U64)val);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+/**** ended inlining mem.h ****/
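
Editor's note: the helpers above give zstd alignment- and endianness-safe access to raw buffers. MEM_read*/MEM_write* hide the unaligned-access strategy selected by MEM_FORCE_MEMORY_ACCESS, and the LE/BE variants byte-swap only when the host byte order requires it. A short illustration, assuming the mem.h definitions above are in scope (the function is hypothetical):

/* Hypothetical illustration of the mem.h accessors defined above. */
static void
demo_mem_accessors(void)
{
	BYTE buf[8];

	/* Store 0x11223344 in little-endian order, whatever the host endianness. */
	MEM_writeLE32(buf, 0x11223344U);

	/* Reads back the same value on both little- and big-endian hosts. */
	U32 v = MEM_readLE32(buf);
	(void) v;

	/* An unaligned access at an odd offset is safe with any MEM_FORCE_MEMORY_ACCESS method. */
	MEM_writeLE32(buf + 1, 0xAABBCCDDU);
	U32 w = MEM_readLE32(buf + 1);
	(void) w;
}
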
+/**** start inlining error_private.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+* Dependencies
+******************************************/
+#include <stddef.h> /* size_t */
+/**** start inlining zstd_errors.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*===== dependency =====*/
+#include <stddef.h> /* size_t */
+
+
+/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */
+#ifndef ZSTDERRORLIB_VISIBILITY
+# if defined(__GNUC__) && (__GNUC__ >= 4)
+# define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default")))
+# else
+# define ZSTDERRORLIB_VISIBILITY
+# endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+# define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+# define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows generating better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+# define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#endif
+
+/*-*********************************************
+ * Error codes list
+ *-*********************************************
+ * Error codes _values_ are pinned down since v1.3.1 only.
+ * Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ * Only values < 100 are considered stable.
+ *
+ * note 1 : this API shall be used with static linking only.
+ * dynamic linking is not yet officially supported.
+ * note 2 : Prefer relying on the enum rather than on its value whenever possible.
+ * This is the only supported way to use the error list < v1.3.1.
+ * note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/
+typedef enum {
+ ZSTD_error_no_error = 0,
+ ZSTD_error_GENERIC = 1,
+ ZSTD_error_prefix_unknown = 10,
+ ZSTD_error_version_unsupported = 12,
+ ZSTD_error_frameParameter_unsupported = 14,
+ ZSTD_error_frameParameter_windowTooLarge = 16,
+ ZSTD_error_corruption_detected = 20,
+ ZSTD_error_checksum_wrong = 22,
+ ZSTD_error_dictionary_corrupted = 30,
+ ZSTD_error_dictionary_wrong = 32,
+ ZSTD_error_dictionaryCreation_failed = 34,
+ ZSTD_error_parameter_unsupported = 40,
+ ZSTD_error_parameter_outOfBound = 42,
+ ZSTD_error_tableLog_tooLarge = 44,
+ ZSTD_error_maxSymbolValue_tooLarge = 46,
+ ZSTD_error_maxSymbolValue_tooSmall = 48,
+ ZSTD_error_stage_wrong = 60,
+ ZSTD_error_init_missing = 62,
+ ZSTD_error_memory_allocation = 64,
+ ZSTD_error_workSpace_tooSmall= 66,
+ ZSTD_error_dstSize_tooSmall = 70,
+ ZSTD_error_srcSize_wrong = 72,
+ ZSTD_error_dstBuffer_null = 74,
+ /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+ ZSTD_error_frameIndex_tooLarge = 100,
+ ZSTD_error_seekableIO = 102,
+ ZSTD_error_dstBuffer_wrong = 104,
+ ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+ convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+ which can be used to compare with enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_ERRORS_H_398273423 */
+/**** ended inlining zstd_errors.h ****/
+
+
+/* ****************************************
+* Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+# define ERR_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+# define ERR_STATIC static __inline
+#else
+# define ERR_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-****************************************
+* Customization (error_public.h)
+******************************************/
+typedef ZSTD_ErrorCode ERR_enum;
+#define PREFIX(name) ZSTD_error_##name
+
+
+/*-****************************************
+* Error codes handling
+******************************************/
+#undef ERROR /* already defined on Visual Studio */
+#define ERROR(name) ZSTD_ERROR(name)
+#define ZSTD_ERROR(name) ((size_t)-PREFIX(name))
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); }
+
+/* check and forward error code */
+#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
+#define CHECK_F(f) { CHECK_V_F(_var_err__, f); }
+
+
+/*-****************************************
+* Error Strings
+******************************************/
+
+const char* ERR_getErrorString(ERR_enum code); /* error_private.c */
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{
+ return ERR_getErrorString(ERR_getErrorCode(code));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
+/**** ended inlining error_private.h ****/
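
Editor's note: internally, zstd reports failures by returning a size_t whose value is the negated error code; that is what ERROR(), ERR_isError() and CHECK_V_F()/CHECK_F() above manipulate, and the public ZSTD_isError()/ZSTD_getErrorCode() wrappers expose the same encoding. A hedged sketch of the calling convention (the function names are hypothetical):

/* Hypothetical sketch of the error convention used throughout this file. */
static size_t
write_header(void *dst, size_t dstCapacity)
{
	(void) dst;
	if (dstCapacity < 4)
		return (ERROR(dstSize_tooSmall));   /* (size_t)-ZSTD_error_dstSize_tooSmall */
	/* ... write 4 header bytes ... */
	return (4);                                 /* success: number of bytes written */
}

static size_t
caller(void *dst, size_t dstCapacity)
{
	CHECK_V_F(hSize, write_header(dst, dstCapacity));   /* forwards the error code, if any */
	return (hSize);
}

/* Outside the library, a user would test the same result with ZSTD_isError()
 * and map it back to an enum via ZSTD_getErrorCode(). */
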
+#define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */
+/**** start inlining fse.h ****/
+/* ******************************************************************
+ * FSE : Finite State Entropy codec
+ * Public Prototypes declaration
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef FSE_H
+#define FSE_H
+
+
+/*-*****************************************
+* Dependencies
+******************************************/
+#include <stddef.h> /* size_t, ptrdiff_t */
+
+
+/*-*****************************************
+* FSE_PUBLIC_API : control library symbols visibility
+******************************************/
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+# define FSE_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */
+# define FSE_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+# define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows generating better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+# define FSE_PUBLIC_API
+#endif
+
+/*------ Version ------*/
+#define FSE_VERSION_MAJOR 0
+#define FSE_VERSION_MINOR 9
+#define FSE_VERSION_RELEASE 0
+
+#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE
+#define FSE_QUOTE(str) #str
+#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str)
+#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION)
+
+#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE)
+FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */
+
+
+/*-****************************************
+* FSE simple functions
+******************************************/
+/*! FSE_compress() :
+ Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
+ 'dst' buffer must be already allocated. Compression runs faster if dstCapacity >= FSE_compressBound(srcSize).
+ @return : size of compressed data (<= dstCapacity).
+ Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+ if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
+ if FSE_isError(return), compression failed (more details using FSE_getErrorName())
+*/
+FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
+
+/*! FSE_decompress():
+ Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+ into already allocated destination buffer 'dst', of size 'dstCapacity'.
+ @return : size of regenerated data (<= dstCapacity),
+ or an error code, which can be tested using FSE_isError().
+
+ ** Important ** : FSE_decompress() does not decompress non-compressible or RLE data !!!
+ Why ? : making this distinction requires a header.
+ Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity,
+ const void* cSrc, size_t cSrcSize);
+
+
+/*-*****************************************
+* Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size); /* maximum compressed size */
+
+/* Error Management */
+FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return value is an error code */
+FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */
+
+
+/*-*****************************************
+* FSE advanced functions
+******************************************/
+/*! FSE_compress2() :
+ Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
+ Both parameters can be defined as '0' to mean : use default value
+ @return : size of compressed data
+ Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+ if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
+ if FSE_isError(return), it's an error code.
+*/
+FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+
+
+/*-*****************************************
+* FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrences from source[] into table count[] (see hist.h)
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+/* *** COMPRESSION *** */
+
+/*! FSE_optimalTableLog():
+ dynamically downsize 'tableLog' when conditions are met.
+ It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
+ @return : recommended tableLog (necessarily <= 'maxTableLog') */
+FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_normalizeCount():
+ normalize counts so that sum(count[]) == Power_of_2 (2^tableLog)
+ 'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+ @return : tableLog,
+ or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog,
+ const unsigned* count, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_NCountWriteBound():
+ Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
+ Typically useful for allocation purposes. */
+FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_writeNCount():
+ Compactly save 'normalizedCounter' into 'buffer'.
+ @return : size of the compressed table,
+ or an errorCode, which can be tested using FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+ const short* normalizedCounter,
+ unsigned maxSymbolValue, unsigned tableLog);
+
+/*! Constructor and Destructor of FSE_CTable.
+ Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */
+FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog);
+FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct);
+
+/*! FSE_buildCTable():
+ Builds `ct`, which must be already allocated, using FSE_createCTable().
+ @return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_compress_usingCTable():
+ Compress `src` using `ct` into `dst` which must be already allocated.
+ @return : size of compressed data (<= `dstCapacity`),
+ or 0 if compressed data could not fit into `dst`,
+ or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct);
+
+/*!
+Tutorial :
+----------
+The first step is to count all symbols. HIST_count(), declared in "hist.h", does this job very fast.
+Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells.
+'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0].
+maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value).
+HIST_count() will return the number of occurrences of the most frequent symbol.
+This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+The next step is to normalize the frequencies.
+FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
+It also guarantees a minimum of 1 to any Symbol with frequency >= 1.
+You can use 'tableLog'==0 to mean "use default tableLog value".
+If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(),
+which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
+
+The result of FSE_normalizeCount() will be saved into a table,
+called 'normalizedCounter', which is a table of signed short.
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
+The return value is tableLog if everything proceeded as expected.
+It is 0 if there is a single symbol within the distribution.
+If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount().
+'buffer' must be already allocated.
+For guaranteed success, buffer size must be at least FSE_NCountWriteBound().
+The result of the function is the number of bytes written into 'buffer'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small).
+
+'normalizedCounter' can then be used to create the compression table 'CTable'.
+The space required by 'CTable' must be already allocated, using FSE_createCTable().
+You can then use FSE_buildCTable() to fill 'CTable'.
+If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()).
+
+'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
+Similar to HIST_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'.
+The function returns the size of compressed data (without header), necessarily <= `dstCapacity`.
+If it returns '0', compressed data could not fit into 'dst'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+*/
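+
+/* A sketch of the compression flow described in the tutorial above (illustration
+ * only). The histogram is computed with a plain loop rather than the helpers in
+ * hist.h, buffer sizes assume byte symbols, and error handling is reduced to
+ * early returns. */
+#if 0
+static size_t FSE_example_detailed_compress(void* dst, size_t dstCapacity,
+                                            const unsigned char* src, size_t srcSize)
+{
+    unsigned count[256] = { 0 };
+    short    norm[256];
+    unsigned maxSymbolValue = 255;
+    unsigned tableLog;
+    size_t   hSize, cSize, i;
+    FSE_CTable* ct;
+
+    if (srcSize < 2) return 0;                                /* too small : treat as not compressible */
+    for (i = 0; i < srcSize; i++) count[src[i]]++;            /* 1. count symbol occurrences */
+    while (maxSymbolValue && count[maxSymbolValue] == 0) maxSymbolValue--;
+
+    tableLog = FSE_optimalTableLog(0 /* default max */, srcSize, maxSymbolValue);
+    {   size_t const nc = FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue);
+        if (FSE_isError(nc)) return nc;                       /* 2. normalize counters */
+        if (nc == 0) return 1;                                /* single-symbol input : RLE case, per the convention above */
+        tableLog = (unsigned)nc;
+    }
+
+    hSize = FSE_writeNCount(dst, dstCapacity, norm, maxSymbolValue, tableLog);
+    if (FSE_isError(hSize)) return hSize;                     /* 3. save normalized counters */
+
+    ct = FSE_createCTable(maxSymbolValue, tableLog);          /* 4. build encoding table */
+    if (ct == NULL) return (size_t)-1;                        /* allocation failure, reported as a generic failure in this sketch */
+    {   size_t const e = FSE_buildCTable(ct, norm, maxSymbolValue, tableLog);
+        if (FSE_isError(e)) { FSE_freeCTable(ct); return e; }
+    }
+
+    cSize = FSE_compress_usingCTable((char*)dst + hSize, dstCapacity - hSize,
+                                     src, srcSize, ct);       /* 5. encode the stream */
+    FSE_freeCTable(ct);
+    if (FSE_isError(cSize) || cSize == 0) return cSize;       /* error, or did not fit into dst */
+    return hSize + cSize;
+}
+#endif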
+
+
+/* *** DECOMPRESSION *** */
+
+/*! FSE_readNCount():
+ Read compactly saved 'normalizedCounter' from 'rBuffer'.
+ @return : size read from 'rBuffer',
+ or an errorCode, which can be tested using FSE_isError().
+ maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter,
+ unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
+ const void* rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSE_DTable.
+ Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */
+FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog);
+FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt);
+
+/*! FSE_buildDTable():
+ Builds 'dt', which must be already allocated, using FSE_createDTable().
+ @return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_decompress_usingDTable():
+ Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+ into `dst` which must be already allocated.
+ @return : size of regenerated data (necessarily <= `dstCapacity`),
+ or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBuffSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
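+
+/* A sketch of the decompression flow described in the tutorial above (illustration
+ * only). The worst-case 256-symbol counter table is assumed, and 'originalSize'
+ * must be known by the caller since FSE itself stores no header. */
+#if 0
+static size_t FSE_example_detailed_decompress(void* dst, size_t originalSize,
+                                              const void* cSrc, size_t cSrcSize)
+{
+    short    norm[256];
+    unsigned maxSymbolValue = 255;
+    unsigned tableLog;
+    size_t   hSize, dSize;
+    FSE_DTable* dt;
+
+    hSize = FSE_readNCount(norm, &maxSymbolValue, &tableLog, cSrc, cSrcSize);
+    if (FSE_isError(hSize)) return hSize;                     /* 1. read normalized counters */
+
+    dt = FSE_createDTable(tableLog);                          /* 2. build decoding table */
+    if (dt == NULL) return (size_t)-1;                        /* allocation failure */
+    {   size_t const e = FSE_buildDTable(dt, norm, maxSymbolValue, tableLog);
+        if (FSE_isError(e)) { FSE_freeDTable(dt); return e; }
+    }
+
+    dSize = FSE_decompress_usingDTable(dst, originalSize,
+                                       (const char*)cSrc + hSize,
+                                       cSrcSize - hSize, dt); /* 3. decode the stream */
+    FSE_freeDTable(dt);
+    return dSize;
+}
+#endif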
+
+#endif /* FSE_H */
+
+#if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY)
+#define FSE_H_FSE_STATIC_LINKING_ONLY
+
+/* *** Dependency *** */
+/**** start inlining bitstream.h ****/
+/* ******************************************************************
+ * bitstream
+ * Part of FSE library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*
+* This API consists of small unitary functions, which must be inlined for best performance.
+* Since link-time-optimization is not available for all compilers,
+* these functions are defined into a .h to be included.
+*/
+
+/*-****************************************
+* Dependencies
+******************************************/
+/**** skipping file: mem.h ****/
+/**** start inlining compiler.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPILER_H
+#define ZSTD_COMPILER_H
+
+/*-*******************************************************
+* Compiler specifics
+*********************************************************/
+/* force inlining */
+
+#if !defined(ZSTD_NO_INLINE)
+#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
+# define INLINE_KEYWORD inline
+#else
+# define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__) || defined(__ICCARM__)
+# define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+# define FORCE_INLINE_ATTR __forceinline
+#else
+# define FORCE_INLINE_ATTR
+#endif
+
+#else
+
+#define INLINE_KEYWORD
+#define FORCE_INLINE_ATTR
+
+#endif
+
+/**
+ * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
+ * parameters. They must be inlined for the compiler to eliminate the constant
+ * branches.
+ */
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
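+
+/* A minimal sketch of the "template" idiom described above (illustration only,
+ * names are arbitrary) : the constant 'useFast' parameter disappears once the
+ * body is force-inlined into each thin wrapper. */
+#if 0
+FORCE_INLINE_TEMPLATE int example_sum_body(const int* data, size_t n, int useFast)
+{
+    int acc = 0;
+    size_t i;
+    for (i = 0; i < n; i++) {
+        if (useFast) acc += data[i];        /* constant branch : eliminated after inlining */
+        else         acc += data[i] & 0xFF;
+    }
+    return acc;
+}
+static int example_sum_fast(const int* data, size_t n) { return example_sum_body(data, n, 1); }
+static int example_sum_safe(const int* data, size_t n) { return example_sum_body(data, n, 0); }
+#endif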
+/**
+ * HINT_INLINE is used to help the compiler generate better code. It is *not*
+ * used for "templates", so it can be tweaked based on the compiler's
+ * performance.
+ *
+ * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the
+ * always_inline attribute.
+ *
+ * clang up to 5.0.0 (trunk) benefits tremendously from the always_inline
+ * attribute.
+ */
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
+# define HINT_INLINE static INLINE_KEYWORD
+#else
+# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
+#endif
+
+/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
+#if defined(__GNUC__)
+# define UNUSED_ATTR __attribute__((unused))
+#else
+# define UNUSED_ATTR
+#endif
+
+/* force no inlining */
+#ifdef _MSC_VER
+# define FORCE_NOINLINE static __declspec(noinline)
+#else
+# if defined(__GNUC__) || defined(__ICCARM__)
+# define FORCE_NOINLINE static __attribute__((__noinline__))
+# else
+# define FORCE_NOINLINE static
+# endif
+#endif
+
+/* target attribute */
+#ifndef __has_attribute
+ #define __has_attribute(x) 0 /* Compatibility with non-clang compilers. */
+#endif
+#if defined(__GNUC__) || defined(__ICCARM__)
+# define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
+#else
+# define TARGET_ATTRIBUTE(target)
+#endif
+
+/* Enable runtime BMI2 dispatch based on the CPU.
+ * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
+ */
+#ifndef DYNAMIC_BMI2
+ #if ((defined(__clang__) && __has_attribute(__target__)) \
+ || (defined(__GNUC__) \
+ && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
+ && (defined(__x86_64__) || defined(_M_X86)) \
+ && !defined(__BMI2__)
+ # define DYNAMIC_BMI2 1
+ #else
+ # define DYNAMIC_BMI2 0
+ #endif
+#endif
+
+/* prefetch
+ * can be disabled, by declaring NO_PREFETCH build macro */
+#if defined(NO_PREFETCH)
+# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */
+# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */
+#else
+# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
+# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+# define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+# define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+# elif defined(__aarch64__)
+# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr)))
+# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr)))
+# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+# define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+# define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+# else
+# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */
+# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */
+# endif
+#endif /* NO_PREFETCH */
+
+#define CACHELINE_SIZE 64
+
+#define PREFETCH_AREA(p, s) { \
+ const char* const _ptr = (const char*)(p); \
+ size_t const _size = (size_t)(s); \
+ size_t _pos; \
+ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \
+ PREFETCH_L2(_ptr + _pos); \
+ } \
+}
+
+/* vectorization
+ * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax */
+#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__)
+# if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5)
+# define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
+# else
+# define DONT_VECTORIZE _Pragma("GCC optimize(\"no-tree-vectorize\")")
+# endif
+#else
+# define DONT_VECTORIZE
+#endif
+
+/* Tell the compiler that a branch is likely or unlikely.
+ * Only use these macros if it causes the compiler to generate better code.
+ * If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc
+ * and clang, please do.
+ */
+#if defined(__GNUC__)
+#define LIKELY(x) (__builtin_expect((x), 1))
+#define UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+/* disable warnings */
+#ifdef _MSC_VER /* Visual Studio */
+# include <intrin.h> /* For Visual 2005 */
+# pragma warning(disable : 4100) /* disable: C4100: unreferenced formal parameter */
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */
+# pragma warning(disable : 4214) /* disable: C4214: non-int bitfields */
+# pragma warning(disable : 4324) /* disable: C4324: padded structure */
+#endif
+
+#endif /* ZSTD_COMPILER_H */
+/**** ended inlining compiler.h ****/
+/**** skipping file: debug.h ****/
+/**** skipping file: error_private.h ****/
+
+
+/*=========================================
+* Target specific
+=========================================*/
+#if defined(__BMI__) && defined(__GNUC__)
+# include <immintrin.h> /* support for bextr (experimental) */
+#elif defined(__ICCARM__)
+# include <intrinsics.h>
+#endif
+
+#define STREAM_ACCUMULATOR_MIN_32 25
+#define STREAM_ACCUMULATOR_MIN_64 57
+#define STREAM_ACCUMULATOR_MIN ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+
+/*-******************************************
+* bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+ * A critical property of these streams is that they encode and decode in **reverse** direction.
+ * So the first bit sequence you add will be the last to be read, like a LIFO stack.
+ */
+typedef struct {
+ size_t bitContainer;
+ unsigned bitPos;
+ char* startPtr;
+ char* ptr;
+ char* endPtr;
+} BIT_CStream_t;
+
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
+MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC);
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
+
+/* Start with initCStream, providing the size of buffer to write into.
+* bitStream will never write outside of this buffer.
+* `dstCapacity` must be > sizeof(bitC->bitContainer), otherwise @return will be an error code.
+*
+* bits are first added to a local register.
+* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
+* Writing data into memory is an explicit operation, performed by the flushBits function.
+* Hence keep track of how many bits are potentially stored into the local register, to avoid register overflow.
+* After a flushBits, a maximum of 7 bits might still be stored into local register.
+*
+* Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
+*
+* Last operation is to close the bitStream.
+* The function returns the final size of CStream in bytes.
+* If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
+*/
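+
+/* A small write-path sketch following the description above (illustration only,
+ * field values and widths are arbitrary) : init, add a few bit fields, flush,
+ * then close and return the byte size. */
+#if 0
+MEM_STATIC size_t BIT_example_write(void* dstBuffer, size_t dstCapacity)
+{
+    BIT_CStream_t bitC;
+    size_t const initError = BIT_initCStream(&bitC, dstBuffer, dstCapacity);
+    if (ERR_isError(initError)) return initError;   /* dstCapacity too small */
+    BIT_addBits(&bitC, 5, 3);        /* value 5, stored on 3 bits */
+    BIT_addBits(&bitC, 17, 6);       /* value 17, stored on 6 bits */
+    BIT_flushBits(&bitC);            /* commit the local register to memory */
+    BIT_addBits(&bitC, 1, 1);        /* one more 1-bit field */
+    BIT_flushBits(&bitC);
+    return BIT_closeCStream(&bitC);  /* final size in bytes, or 0 if it did not fit */
+}
+#endif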
+
+
+/*-********************************************
+* bitStream decoding API (read backward)
+**********************************************/
+typedef struct {
+ size_t bitContainer;
+ unsigned bitsConsumed;
+ const char* ptr;
+ const char* start;
+ const char* limitPtr;
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,
+ BIT_DStream_endOfBuffer = 1,
+ BIT_DStream_completed = 2,
+ BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */
+ /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+/* Start by invoking BIT_initDStream().
+* A chunk of the bitStream is then stored into a local register.
+* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+* You can then retrieve bitFields stored into the local register, **in reverse order**.
+* Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+* A reload guarantees a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+* Otherwise, it can be less than that, so proceed accordingly.
+* Checking if DStream has reached its end can be performed with BIT_endOfDStream().
+*/
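+
+/* The matching read-path sketch (illustration only) : fields are retrieved in
+ * the reverse of the order they were added by the write sketch above. */
+#if 0
+MEM_STATIC unsigned BIT_example_read(const void* srcBuffer, size_t srcSize)
+{
+    BIT_DStream_t bitD;
+    size_t last1, mid6, first3;
+    size_t const initError = BIT_initDStream(&bitD, srcBuffer, srcSize);
+    if (ERR_isError(initError)) return 0;  /* init failed */
+    last1  = BIT_readBits(&bitD, 1);   /* the last field written is read first */
+    mid6   = BIT_readBits(&bitD, 6);
+    first3 = BIT_readBits(&bitD, 3);
+    (void)last1; (void)mid6; (void)first3;
+    BIT_reloadDStream(&bitD);          /* refill the local register / query stream status */
+    return BIT_endOfDStream(&bitD);    /* 1 if every bit has been consumed */
+}
+#endif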
+
+
+/*-****************************************
+* unsafe API
+******************************************/
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
+
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
+/* unsafe version; does not check buffer overflow */
+
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+* Internal functions
+****************************************************************/
+MEM_STATIC unsigned BIT_highbit32 (U32 val)
+{
+ assert(val != 0);
+ {
+# if defined(_MSC_VER) /* Visual */
+ unsigned long r=0;
+ return _BitScanReverse ( &r, val ) ? (unsigned)r : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */
+ return __builtin_clz (val) ^ 31;
+# elif defined(__ICCARM__) /* IAR Intrinsic */
+ return 31 - __CLZ(val);
+# else /* Software version */
+ static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29,
+ 11, 14, 16, 18, 22, 25, 3, 30,
+ 8, 12, 20, 28, 15, 17, 24, 7,
+ 19, 27, 23, 6, 26, 5, 4, 31 };
+ U32 v = val;
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+# endif
+ }
+}
+
+/*===== Local Constants =====*/
+static const unsigned BIT_mask[] = {
+ 0, 1, 3, 7, 0xF, 0x1F,
+ 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
+ 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF,
+ 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,
+ 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
+ 0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */
+#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0]))
+
+/*-**************************************************************
+* bitStream encoding
+****************************************************************/
+/*! BIT_initCStream() :
+ * `dstCapacity` must be > sizeof(size_t)
+ * @return : 0 if success,
+ * otherwise an error code (can be tested using ERR_isError()) */
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
+ void* startPtr, size_t dstCapacity)
+{
+ bitC->bitContainer = 0;
+ bitC->bitPos = 0;
+ bitC->startPtr = (char*)startPtr;
+ bitC->ptr = bitC->startPtr;
+ bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer);
+ if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall);
+ return 0;
+}
+
+/*! BIT_addBits() :
+ * can add up to 31 bits into `bitC`.
+ * Note : does not check for register overflow ! */
+MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
+ size_t value, unsigned nbBits)
+{
+ MEM_STATIC_ASSERT(BIT_MASK_SIZE == 32);
+ assert(nbBits < BIT_MASK_SIZE);
+ assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
+ bitC->bitPos += nbBits;
+}
+
+/*! BIT_addBitsFast() :
+ * works only if `value` is _clean_,
+ * meaning all high bits above nbBits are 0 */
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC,
+ size_t value, unsigned nbBits)
+{
+ assert((value>>nbBits) == 0);
+ assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ bitC->bitContainer |= value << bitC->bitPos;
+ bitC->bitPos += nbBits;
+}
+
+/*! BIT_flushBitsFast() :
+ * assumption : bitContainer has not overflowed
+ * unsafe version; does not check buffer overflow */
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC)
+{
+ size_t const nbBytes = bitC->bitPos >> 3;
+ assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ assert(bitC->ptr <= bitC->endPtr);
+ MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+ bitC->ptr += nbBytes;
+ bitC->bitPos &= 7;
+ bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_flushBits() :
+ * assumption : bitContainer has not overflowed
+ * safe version; checks for buffer overflow, and prevents it.
+ * note : does not signal buffer overflow.
+ * overflow will be revealed later on using BIT_closeCStream() */
+MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC)
+{
+ size_t const nbBytes = bitC->bitPos >> 3;
+ assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ assert(bitC->ptr <= bitC->endPtr);
+ MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+ bitC->ptr += nbBytes;
+ if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+ bitC->bitPos &= 7;
+ bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_closeCStream() :
+ * @return : size of CStream, in bytes,
+ * or 0 if it could not fit into dstBuffer */
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC)
+{
+ BIT_addBitsFast(bitC, 1, 1); /* endMark */
+ BIT_flushBits(bitC);
+ if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+ return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*! BIT_initDStream() :
+ * Initialize a BIT_DStream_t.
+ * `bitD` : a pointer to an already allocated BIT_DStream_t structure.
+ * `srcSize` must be the *exact* size of the bitStream, in bytes.
+ * @return : size of stream (== srcSize), or an errorCode if a problem is detected
+ */
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+ if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+ bitD->start = (const char*)srcBuffer;
+ bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer);
+
+ if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */
+ bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+ bitD->bitContainer = MEM_readLEST(bitD->ptr);
+ { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+ bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */
+ if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+ } else {
+ bitD->ptr = bitD->start;
+ bitD->bitContainer = *(const BYTE*)(bitD->start);
+ switch(srcSize)
+ {
+ case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
+ /* fall-through */
+
+ case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
+ /* fall-through */
+
+ case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
+ /* fall-through */
+
+ case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24;
+ /* fall-through */
+
+ case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16;
+ /* fall-through */
+
+ case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8;
+ /* fall-through */
+
+ default: break;
+ }
+ { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+ bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+ if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */
+ }
+ bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
+ }
+
+ return srcSize;
+}
+
+MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
+{
+ return bitContainer >> start;
+}
+
+MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
+{
+ U32 const regMask = sizeof(bitContainer)*8 - 1;
+ /* if start > regMask, bitstream is corrupted, and result is undefined */
+ assert(nbBits < BIT_MASK_SIZE);
+ return (bitContainer >> (start & regMask)) & BIT_mask[nbBits];
+}
+
+MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+{
+ assert(nbBits < BIT_MASK_SIZE);
+ return bitContainer & BIT_mask[nbBits];
+}
+
+/*! BIT_lookBits() :
+ * Provides next n bits from local register.
+ * local register is not modified.
+ * On 32-bits, maxNbBits==24.
+ * On 64-bits, maxNbBits==56.
+ * @return : value extracted */
+MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits)
+{
+ /* arbitrate between double-shift and shift+mask */
+#if 1
+ /* if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8,
+ * bitstream is likely corrupted, and result is undefined */
+ return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits);
+#else
+ /* this code path is slower on my os-x laptop */
+ U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+ return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask);
+#endif
+}
+
+/*! BIT_lookBitsFast() :
+ * unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
+{
+ U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+ assert(nbBits >= 1);
+ return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+ bitD->bitsConsumed += nbBits;
+}
+
+/*! BIT_readBits() :
+ * Read (consume) next n bits from local register and update.
+ * Pay attention not to read more bits than the local register currently holds.
+ * @return : extracted value. */
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
+{
+ size_t const value = BIT_lookBits(bitD, nbBits);
+ BIT_skipBits(bitD, nbBits);
+ return value;
+}
+
+/*! BIT_readBitsFast() :
+ * unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
+{
+ size_t const value = BIT_lookBitsFast(bitD, nbBits);
+ assert(nbBits >= 1);
+ BIT_skipBits(bitD, nbBits);
+ return value;
+}
+
+/*! BIT_reloadDStreamFast() :
+ * Similar to BIT_reloadDStream(), but with two differences:
+ * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold!
+ * 2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this
+ * point you must use BIT_reloadDStream() to reload.
+ */
+MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
+{
+ if (UNLIKELY(bitD->ptr < bitD->limitPtr))
+ return BIT_DStream_overflow;
+ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
+ bitD->ptr -= bitD->bitsConsumed >> 3;
+ bitD->bitsConsumed &= 7;
+ bitD->bitContainer = MEM_readLEST(bitD->ptr);
+ return BIT_DStream_unfinished;
+}
+
+/*! BIT_reloadDStream() :
+ * Refill `bitD` from the buffer previously set in BIT_initDStream().
+ * This function is safe : it guarantees it will not read beyond the src buffer.
+ * @return : status of `BIT_DStream_t` internal register.
+ * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+ if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */
+ return BIT_DStream_overflow;
+
+ if (bitD->ptr >= bitD->limitPtr) {
+ return BIT_reloadDStreamFast(bitD);
+ }
+ if (bitD->ptr == bitD->start) {
+ if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+ return BIT_DStream_completed;
+ }
+ /* start < ptr < limitPtr */
+ { U32 nbBytes = bitD->bitsConsumed >> 3;
+ BIT_DStream_status result = BIT_DStream_unfinished;
+ if (bitD->ptr - nbBytes < bitD->start) {
+ nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */
+ result = BIT_DStream_endOfBuffer;
+ }
+ bitD->ptr -= nbBytes;
+ bitD->bitsConsumed -= nbBytes*8;
+ bitD->bitContainer = MEM_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */
+ return result;
+ }
+}
+
+/*! BIT_endOfDStream() :
+ * @return : 1 if DStream has _exactly_ reached its end (all bits consumed).
+ */
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+ return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+/**** ended inlining bitstream.h ****/
+
+
+/* *****************************************
+* Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7) + 4 /* fse states */ + sizeof(size_t) /* bitContainer */)
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate an FSE CTable/DTable as a table of FSE_CTable/FSE_DTable, using the macros below */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1<<maxTableLog))
+
+/* or use the size to malloc() space directly. Pay attention to alignment restrictions though */
+#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue) (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable))
+#define FSE_DTABLE_SIZE(maxTableLog) (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable))
+
+
+/* *****************************************
+ * FSE advanced API
+ ***************************************** */
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSE_optimalTableLog(), which uses `minus==2` */
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
+ */
+#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
+/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
+
+size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
+/**< build a fake FSE_CTable, designed to compress always the same symbolValue */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `(1<<tableLog)`.
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
+
+size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog);
+/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
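+
+/* A sketch of heap-free decompression (illustration only) : the workspace is a
+ * stack table sized with FSE_DTABLE_SIZE_U32(), for an accepted tableLog of up
+ * to 12 in this example. */
+#if 0
+static size_t FSE_example_decompress_noalloc(void* dst, size_t dstCapacity,
+                                             const void* cSrc, size_t cSrcSize)
+{
+    FSE_DTable wksp[FSE_DTABLE_SIZE_U32(12)];
+    return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, wksp, 12 /* maxLog */);
+}
+#endif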
+
+typedef enum {
+ FSE_repeat_none, /**< Cannot use the previous table */
+ FSE_repeat_check, /**< Can use the previous table but it must be checked */
+ FSE_repeat_valid /**< Can use the previous table and it is assumed to be valid */
+ } FSE_repeat;
+
+/* *****************************************
+* FSE symbol compression API
+*******************************************/
+/*!
+ This API consists of small unitary functions, which highly benefit from being inlined.
+ Hence their bodies are included in the next section.
+*/
+typedef struct {
+ ptrdiff_t value;
+ const void* stateTable;
+ const void* symbolTT;
+ unsigned stateLog;
+} FSE_CState_t;
+
+static void FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct);
+
+static void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned symbol);
+
+static void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* CStatePtr);
+
+/**<
+These functions are inner components of FSE_compress_usingCTable().
+They allow the creation of custom streams, mixing multiple tables and bit sources.
+
+A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
+So the first symbol you will encode is the last you will decode, like a LIFO stack.
+
+You will need a few variables to track your CStream. They are :
+
+FSE_CTable ct; // Provided by FSE_buildCTable()
+BIT_CStream_t bitStream; // bitStream tracking structure
+FSE_CState_t state; // State tracking structure (can have several)
+
+
+The first thing to do is to init bitStream and state.
+ size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize);
+ FSE_initCState(&state, ct);
+
+Note that BIT_initCStream() can produce an error code, so its result should be tested, using FSE_isError();
+You can then encode your input data, byte after byte.
+FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time.
+Remember decoding will be done in reverse direction.
+    FSE_encodeSymbol(&bitStream, &state, symbol);
+
+At any time, you can also add any bit sequence.
+Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
+ BIT_addBits(&bitStream, bitField, nbBits);
+
+The above methods don't commit data to memory, they just store it into local register, for speed.
+Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+Writing data to memory is a manual operation, performed by the flushBits function.
+ BIT_flushBits(&bitStream);
+
+Your last FSE encoding operation shall be to flush your last state value(s).
+    FSE_flushCState(&bitStream, &state);
+
+Finally, you must close the bitStream.
+The function returns the size of CStream in bytes.
+If data couldn't fit into dstBuffer, it will return a 0 ( == not compressible)
+If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
+ size_t size = BIT_closeCStream(&bitStream);
+*/
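+
+/* A sketch of the encoding flow above (illustration only) : a single state,
+ * symbols fed from last to first so that the matching decoder regenerates them
+ * in forward order, and a flush after every symbol for simplicity.
+ * 'ct' is assumed to come from FSE_buildCTable(). */
+#if 0
+static size_t FSE_example_encode_symbols(void* dstBuffer, size_t dstCapacity,
+                                         const unsigned char* src, size_t srcSize,
+                                         const FSE_CTable* ct)
+{
+    BIT_CStream_t bitStream;
+    FSE_CState_t  state;
+    size_t i;
+    size_t const initError = BIT_initCStream(&bitStream, dstBuffer, dstCapacity);
+    if (FSE_isError(initError)) return initError;
+    FSE_initCState(&state, ct);
+    for (i = srcSize; i > 0; i--) {
+        FSE_encodeSymbol(&bitStream, &state, src[i-1]);
+        BIT_flushBits(&bitStream);        /* real code flushes only every few symbols */
+    }
+    FSE_flushCState(&bitStream, &state);  /* last : flush the final state value */
+    return BIT_closeCStream(&bitStream);  /* size in bytes, or 0 if it did not fit */
+}
+#endif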
+
+
+/* *****************************************
+* FSE symbol decompression API
+*******************************************/
+typedef struct {
+ size_t state;
+ const void* table; /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+/**<
+Let's now decompose FSE_decompress_usingDTable() into its unitary components.
+You will decode FSE-encoded symbols from the bitStream,
+and also any other bitFields you put in, **in reverse order**.
+
+You will need a few variables to track your bitStream. They are :
+
+BIT_DStream_t DStream; // Stream context
+FSE_DState_t DState; // State context. Multiple ones are possible
+FSE_DTable* DTablePtr; // Decoding table, provided by FSE_buildDTable()
+
+The first thing to do is to init the bitStream.
+ errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize);
+
+You should then retrieve your initial state(s)
+(in reverse flushing order if you have several ones) :
+ errorCode = FSE_initDState(&DState, &DStream, DTablePtr);
+
+You can then decode your data, symbol after symbol.
+For information, the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
+Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out).
+ unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
+
+You can retrieve any bitfield you may have stored into the bitStream (in reverse order).
+Note : maximum allowed nbBits is 25, for 32-bits compatibility
+ size_t bitField = BIT_readBits(&DStream, nbBits);
+
+All above operations only read from local register (which size depends on size_t).
+Refueling the register from memory is manually performed by the reload method.
+    endSignal = BIT_reloadDStream(&DStream);
+
+BIT_reloadDStream() result tells if there is still some more data to read from DStream.
+BIT_DStream_unfinished : there is still some data left in the DStream.
+BIT_DStream_endOfBuffer : DStream reached the end of its buffer. Its container may no longer be completely filled.
+BIT_DStream_completed : DStream reached its exact end, corresponding in general to decompression completed.
+BIT_DStream_overflow : DStream went too far. Decompression result is corrupted.
+
+When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop,
+to properly detect the exact end of stream.
+After each decoded symbol, check if DStream is fully consumed using this simple test :
+ BIT_reloadDStream(&DStream) >= BIT_DStream_completed
+
+When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
+Checking if DStream has reached its end is performed by :
+ BIT_endOfDStream(&DStream);
+Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible.
+ FSE_endOfDState(&DState);
+*/
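+
+/* The matching decoding sketch (illustration only) : one state, 'dstSize'
+ * symbols regenerated, register reloaded after each symbol for simplicity,
+ * then the end-of-stream check described above.
+ * 'dt' is assumed to come from FSE_buildDTable(). */
+#if 0
+static size_t FSE_example_decode_symbols(unsigned char* dst, size_t dstSize,
+                                         const void* cSrc, size_t cSrcSize,
+                                         const FSE_DTable* dt)
+{
+    BIT_DStream_t DStream;
+    FSE_DState_t  DState;
+    size_t i;
+    size_t const initError = BIT_initDStream(&DStream, cSrc, cSrcSize);
+    if (FSE_isError(initError)) return initError;
+    FSE_initDState(&DState, &DStream, dt);
+    for (i = 0; i < dstSize; i++) {
+        dst[i] = FSE_decodeSymbol(&DState, &DStream);
+        BIT_reloadDStream(&DStream);
+    }
+    /* FSE_endOfDState(&DState) can be checked as well, as noted in the tutorial above */
+    if (!BIT_endOfDStream(&DStream)) return (size_t)-1;   /* stream not fully consumed */
+    return dstSize;
+}
+#endif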
+
+
+/* *****************************************
+* FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+* Implementation of inlined functions
+*******************************************/
+typedef struct {
+ int deltaFindState;
+ U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{
+ const void* ptr = ct;
+ const U16* u16ptr = (const U16*) ptr;
+ const U32 tableLog = MEM_read16(ptr);
+ statePtr->value = (ptrdiff_t)1<<tableLog;
+ statePtr->stateTable = u16ptr+2;
+ statePtr->symbolTT = ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1);
+ statePtr->stateLog = tableLog;
+}
+
+
+/*! FSE_initCState2() :
+* Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+* uses the smallest state value possible, saving the cost of this symbol */
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+ FSE_initCState(statePtr, ct);
+ { const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+ const U16* stateTable = (const U16*)(statePtr->stateTable);
+ U32 nbBitsOut = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+ statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+ statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+ }
+}
+
+MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, unsigned symbol)
+{
+ FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+ const U16* const stateTable = (const U16*)(statePtr->stateTable);
+ U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+ BIT_addBits(bitC, statePtr->value, nbBitsOut);
+ statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{
+ BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+ BIT_flushBits(bitC);
+}
+
+
+/* FSE_getMaxNbBits() :
+ * Approximate maximum cost of a symbol, in bits.
+ * Fractional values get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
+ * note 1 : assume symbolValue is valid (<= maxSymbolValue)
+ * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
+MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
+{
+ const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr;
+ return (symbolTT[symbolValue].deltaNbBits + ((1<<16)-1)) >> 16;
+}
+
+/* FSE_bitCost() :
+ * Approximate symbol cost, as fractional value, using fixed-point format (accuracyLog fractional bits)
+ * note 1 : assume symbolValue is valid (<= maxSymbolValue)
+ * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
+MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog)
+{
+ const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr;
+ U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16;
+ U32 const threshold = (minNbBits+1) << 16;
+ assert(tableLog < 16);
+ assert(accuracyLog < 31-tableLog); /* ensure enough room for renormalization double shift */
+ { U32 const tableSize = 1 << tableLog;
+ U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize);
+ U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog; /* linear interpolation (very approximate) */
+ U32 const bitMultiplier = 1 << accuracyLog;
+ assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold);
+ assert(normalizedDeltaFromThreshold <= bitMultiplier);
+ return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold;
+ }
+}
+
+
+/* ====== Decompression ====== */
+
+typedef struct {
+ U16 tableLog;
+ U16 fastMode;
+} FSE_DTableHeader; /* sizeof U32 */
+
+typedef struct
+{
+ unsigned short newState;
+ unsigned char symbol;
+ unsigned char nbBits;
+} FSE_decode_t; /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+ const void* ptr = dt;
+ const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr;
+ DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+ BIT_reloadDStream(bitD);
+ DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+ return DInfo.symbol;
+}
+
+MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+ U32 const nbBits = DInfo.nbBits;
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
+ DStatePtr->state = DInfo.newState + lowBits;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+ U32 const nbBits = DInfo.nbBits;
+ BYTE const symbol = DInfo.symbol;
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
+
+ DStatePtr->state = DInfo.newState + lowBits;
+ return symbol;
+}
+
+/*! FSE_decodeSymbolFast() :
+ unsafe, only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+ U32 const nbBits = DInfo.nbBits;
+ BYTE const symbol = DInfo.symbol;
+ size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
+
+ DStatePtr->state = DInfo.newState + lowBits;
+ return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+ return DStatePtr->state == 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/* **************************************************************
+* Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+* Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+* Increasing memory usage improves compression ratio
+* Reduced memory usage can improve speed, due to cache effect
+* Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE
+# define FSE_MAX_MEMORY_USAGE 14
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+# define FSE_DEFAULT_MEMORY_USAGE 13
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+* Maximum symbol value authorized.
+* Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+# define FSE_MAX_SYMBOL_VALUE 255
+#endif
+
+/* **************************************************************
+* template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+
+#endif /* !FSE_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+* Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+# error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) ((tableSize>>1) + (tableSize>>3) + 3)
+
+
+#endif /* FSE_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
+/**** ended inlining fse.h ****/
+#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */
+/**** start inlining huf.h ****/
+/* ******************************************************************
+ * huff0 huffman codec,
+ * part of Finite State Entropy library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef HUF_H_298734234
+#define HUF_H_298734234
+
+/* *** Dependencies *** */
+#include <stddef.h> /* size_t */
+
+
+/* *** library symbols visibility *** */
+/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
+ * HUF symbols remain "private" (internal symbols for library only).
+ * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+# define HUF_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */
+# define HUF_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */
+#else
+# define HUF_PUBLIC_API
+#endif
+
+
+/* ========================== */
+/* *** simple functions *** */
+/* ========================== */
+
+/** HUF_compress() :
+ * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
+ * 'dst' buffer must be already allocated.
+ * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
+ * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
+ * @return : size of compressed data (<= `dstCapacity`).
+ * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+ * if HUF_isError(return), compression failed (more details using HUF_getErrorName())
+ */
+HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
+
+/** HUF_decompress() :
+ * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+ * into already allocated buffer 'dst', of minimum size 'originalSize'.
+ * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
+ * Note : in contrast with FSE, HUF_decompress can regenerate
+ * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+ * because it knows size to regenerate (originalSize).
+ * @return : size of regenerated data (== originalSize),
+ * or an error code, which can be tested using HUF_isError()
+ */
+HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize,
+ const void* cSrc, size_t cSrcSize);
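+
+/* A minimal HUF round-trip sketch (illustration only; the fixed buffer sizes are
+ * arbitrary and srcSize is assumed <= 1024). A 0 return from HUF_compress()
+ * still has to be handled by storing the input raw, but an RLE block
+ * (cSize == 1) can be regenerated by HUF_decompress(), as noted above. */
+#if 0
+static int HUF_example_roundtrip(const unsigned char* src, size_t srcSize)
+{
+    unsigned char compressed[2048];     /* comfortably above HUF_compressBound(1024) */
+    unsigned char regenerated[1024];
+    size_t const cSize = HUF_compress(compressed, sizeof(compressed), src, srcSize);
+    if (HUF_isError(cSize)) return -1;
+    if (cSize == 0) return 1;           /* not compressible : caller stores the input raw */
+    {   size_t const rSize = HUF_decompress(regenerated, srcSize, compressed, cSize);
+        if (HUF_isError(rSize) || rSize != srcSize) return -1;
+    }
+    return 0;
+}
+#endif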
+
+
+/* *** Tool functions *** */
+#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */
+HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */
+
+/* Error Management */
+HUF_PUBLIC_API unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */
+HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /**< provides error code string (useful for debugging) */
+
+
+/* *** Advanced function *** */
+
+/** HUF_compress2() :
+ * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
+ * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
+ * `tableLog` must be `<= HUF_TABLELOG_MAX` . */
+HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned tableLog);
+
+/** HUF_compress4X_wksp() :
+ * Same as HUF_compress2(), but uses externally allocated `workSpace`.
+ * `workSpace` must have a minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */
+#define HUF_WORKSPACE_SIZE ((6 << 10) + 256)
+#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
+HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned tableLog,
+ void* workSpace, size_t wkspSize);
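+
+/* A sketch of workspace-based compression (illustration only) : the scratch
+ * space is a stack table of HUF_WORKSPACE_SIZE_U32 words of U32, which also
+ * satisfies the alignment requirement. The literal parameters 255 and 11
+ * correspond to HUF_SYMBOLVALUE_MAX and HUF_TABLELOG_DEFAULT from the static
+ * section below; srcSize is assumed <= HUF_BLOCKSIZE_MAX. */
+#if 0
+static size_t HUF_example_compress_wksp(void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize)
+{
+    U32 workSpace[HUF_WORKSPACE_SIZE_U32];
+    return HUF_compress4X_wksp(dst, dstCapacity, src, srcSize,
+                               255 /* maxSymbolValue */, 11 /* tableLog */,
+                               workSpace, sizeof(workSpace));
+}
+#endif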
+
+#endif /* HUF_H_298734234 */
+
+/* ******************************************************************
+ * WARNING !!
+ * The following section contains advanced and experimental definitions
+ * which shall never be used in the context of a dynamic library,
+ * because they are not guaranteed to remain stable in the future.
+ * Only consider them in association with static linking.
+ * *****************************************************************/
+#if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY)
+#define HUF_H_HUF_STATIC_LINKING_ONLY
+
+/* *** Dependencies *** */
+/**** skipping file: mem.h ****/
+
+
+/* *** Constants *** */
+#define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
+#define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none specified */
+#define HUF_SYMBOLVALUE_MAX 255
+
+#define HUF_TABLELOG_ABSOLUTEMAX  15   /* absolute limit of HUF_TABLELOG_MAX. Beyond that value, code does not work */
+#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
+# error "HUF_TABLELOG_MAX is too large !"
+#endif
+
+
+/* ****************************************
+* Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8) /* only true when incompressible is pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's Compression Table */
+#define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Use tables of U32, for proper alignment */
+#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32))
+#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
+ U32 name##hb[HUF_CTABLE_SIZE_U32(maxSymbolValue)]; \
+ void* name##hv = &(name##hb); \
+ HUF_CElt* name = (HUF_CElt*)(name##hv) /* no final ; */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUF_DTable;
+#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1<<(maxTableLog)))
+#define HUF_CREATE_STATIC_DTABLEX1(DTable, maxTableLog) \
+ HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) }
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+ HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) }
+
+
+/* ****************************************
+* Advanced decompression functions
+******************************************/
+size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */
+#endif
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< decodes RLE and uncompressed */
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */
+size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */
+#endif
+
+
+/* ****************************************
+ * HUF detailed API
+ * ****************************************/
+
+/*! HUF_compress() does the following:
+ * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
+ * 2. (optional) refine tableLog using HUF_optimalTableLog()
+ * 3. build Huffman table from count using HUF_buildCTable()
+ * 4. save Huffman table to memory buffer using HUF_writeCTable()
+ * 5. encode the data stream using HUF_compress4X_usingCTable()
+ *
+ * The following API allows targeting specific sub-functions for advanced tasks.
+ * For example, it's possible to compress several blocks using the same 'CTable',
+ * or to save and regenerate 'CTable' using external methods.
+ */
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */
+size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
+int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
+
+typedef enum {
+ HUF_repeat_none, /**< Cannot use the previous table */
+ HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
+ HUF_repeat_valid /**< Can use the previous table and it is assumed to be valid */
+ } HUF_repeat;
+/** HUF_compress4X_repeat() :
+ * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ * If it uses hufTable it does not modify hufTable or repeat.
+ * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ * If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned tableLog,
+ void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
+
+/** HUF_buildCTable_wksp() :
+ * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
+ */
+#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
+#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
+size_t HUF_buildCTable_wksp (HUF_CElt* tree,
+ const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
+ void* workSpace, size_t wkspSize);
+
+/*! HUF_readStats() :
+ * Read compact Huffman tree, saved by HUF_writeCTable().
+ * `huffWeight` is destination buffer.
+ * @return : size read from `src` , or an error Code .
+ * Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
+ U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
+ const void* src, size_t srcSize);
+
+/** HUF_readCTable() :
+ * Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights);
+
+/** HUF_getNbBits() :
+ * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
+ * Note 1 : is not inlined, as HUF_CElt definition is private
+ * Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */
+U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue);
+
+/*
+ * HUF_decompress() does the following:
+ * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics
+ * 2. build Huffman table from save, using HUF_readDTableX?()
+ * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable()
+ */
+
+/** HUF_selectDecoder() :
+ * Tells which decoder is likely to decode faster,
+ * based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
+ * Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
+
+/**
+ * The minimum workspace size for the `workSpace` used in
+ * HUF_readDTableX1_wksp() and HUF_readDTableX2_wksp().
+ *
+ * The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
+ * HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15.
+ * Buffer overflow errors may potentially occur if code modifications result in
+ * a required workspace size greater than that specified in the following
+ * macro.
+ */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10)
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+#endif
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+
+
+/* ====================== */
+/* single stream variants */
+/* ====================== */
+
+size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+/** HUF_compress1X_repeat() :
+ * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ * If it uses hufTable it does not modify hufTable or repeat.
+ * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ * If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned tableLog,
+ void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
+
+size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */
+#endif
+
+size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */
+#endif
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /**< automatic selection of sing or double symbol decoder, based on DTable */
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+
+/* BMI2 variants.
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
+ */
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+#endif
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+
+#endif /* HUF_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
+/**** ended inlining huf.h ****/
+
+
+/*=== Version ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
+
+
+/*=== Error Management ===*/
+unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+* FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+ const void* headerBuffer, size_t hbSize)
+{
+ const BYTE* const istart = (const BYTE*) headerBuffer;
+ const BYTE* const iend = istart + hbSize;
+ const BYTE* ip = istart;
+ int nbBits;
+ int remaining;
+ int threshold;
+ U32 bitStream;
+ int bitCount;
+ unsigned charnum = 0;
+ int previous0 = 0;
+
+ if (hbSize < 4) {
+ /* This function only works when hbSize >= 4 */
+ char buffer[4];
+ memset(buffer, 0, sizeof(buffer));
+ memcpy(buffer, headerBuffer, hbSize);
+ { size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr,
+ buffer, sizeof(buffer));
+ if (FSE_isError(countSize)) return countSize;
+ if (countSize > hbSize) return ERROR(corruption_detected);
+ return countSize;
+ } }
+ assert(hbSize >= 4);
+
+ /* init */
+ memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0])); /* all symbols not present in NCount have a frequency of 0 */
+ bitStream = MEM_readLE32(ip);
+ nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */
+ if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+ bitStream >>= 4;
+ bitCount = 4;
+ *tableLogPtr = nbBits;
+ remaining = (1<<nbBits)+1;
+ threshold = 1<<nbBits;
+ nbBits++;
+
+ while ((remaining>1) & (charnum<=*maxSVPtr)) {
+ if (previous0) {
+ unsigned n0 = charnum;
+ while ((bitStream & 0xFFFF) == 0xFFFF) {
+ n0 += 24;
+ if (ip < iend-5) {
+ ip += 2;
+ bitStream = MEM_readLE32(ip) >> bitCount;
+ } else {
+ bitStream >>= 16;
+ bitCount += 16;
+ } }
+ while ((bitStream & 3) == 3) {
+ n0 += 3;
+ bitStream >>= 2;
+ bitCount += 2;
+ }
+ n0 += bitStream & 3;
+ bitCount += 2;
+ if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+ while (charnum < n0) normalizedCounter[charnum++] = 0;
+ if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+ assert((bitCount >> 3) <= 3); /* For first condition to work */
+ ip += bitCount>>3;
+ bitCount &= 7;
+ bitStream = MEM_readLE32(ip) >> bitCount;
+ } else {
+ bitStream >>= 2;
+ } }
+ { int const max = (2*threshold-1) - remaining;
+ int count;
+
+ if ((bitStream & (threshold-1)) < (U32)max) {
+ count = bitStream & (threshold-1);
+ bitCount += nbBits-1;
+ } else {
+ count = bitStream & (2*threshold-1);
+ if (count >= threshold) count -= max;
+ bitCount += nbBits;
+ }
+
+ count--; /* extra accuracy */
+ remaining -= count < 0 ? -count : count; /* -1 means +1 */
+ normalizedCounter[charnum++] = (short)count;
+ previous0 = !count;
+ while (remaining < threshold) {
+ nbBits--;
+ threshold >>= 1;
+ }
+
+ if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+ ip += bitCount>>3;
+ bitCount &= 7;
+ } else {
+ bitCount -= (int)(8 * (iend - 4 - ip));
+ ip = iend - 4;
+ }
+ bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+ } } /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
+ if (remaining != 1) return ERROR(corruption_detected);
+ if (bitCount > 32) return ERROR(corruption_detected);
+ *maxSVPtr = charnum-1;
+
+ ip += (bitCount+7)>>3;
+ return ip-istart;
+}
+
+
+/*! HUF_readStats() :
+ Read compact Huffman tree, saved by HUF_writeCTable().
+ `huffWeight` is destination buffer.
+ `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
+ @return : size read from `src` , or an error Code .
+ Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
+*/
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+ U32* nbSymbolsPtr, U32* tableLogPtr,
+ const void* src, size_t srcSize)
+{
+ U32 weightTotal;
+ const BYTE* ip = (const BYTE*) src;
+ size_t iSize;
+ size_t oSize;
+
+ if (!srcSize) return ERROR(srcSize_wrong);
+ iSize = ip[0];
+ /* memset(huffWeight, 0, hwSize); *//* is not necessary, even though some analyzer complain ... */
+
+ if (iSize >= 128) { /* special header */
+ oSize = iSize - 127;
+ iSize = ((oSize+1)/2);
+ if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+ if (oSize >= hwSize) return ERROR(corruption_detected);
+ ip += 1;
+ { U32 n;
+ for (n=0; n<oSize; n+=2) {
+ huffWeight[n] = ip[n/2] >> 4;
+ huffWeight[n+1] = ip[n/2] & 15;
+ } } }
+ else { /* header compressed with FSE (normal case) */
+ FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)]; /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
+ if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+ oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6); /* max (hwSize-1) values decoded, as last one is implied */
+ if (FSE_isError(oSize)) return oSize;
+ }
+
+ /* collect weight stats */
+ memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+ weightTotal = 0;
+ { U32 n; for (n=0; n<oSize; n++) {
+ if (huffWeight[n] >= HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+ rankStats[huffWeight[n]]++;
+ weightTotal += (1 << huffWeight[n]) >> 1;
+ } }
+ if (weightTotal == 0) return ERROR(corruption_detected);
+
+ /* get last non-null symbol weight (implied, total must be 2^n) */
+ { U32 const tableLog = BIT_highbit32(weightTotal) + 1;
+ if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+ *tableLogPtr = tableLog;
+ /* determine last weight */
+ { U32 const total = 1 << tableLog;
+ U32 const rest = total - weightTotal;
+ U32 const verif = 1 << BIT_highbit32(rest);
+ U32 const lastWeight = BIT_highbit32(rest) + 1;
+ if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */
+ huffWeight[oSize] = (BYTE)lastWeight;
+ rankStats[lastWeight]++;
+ } }
+
+ /* check tree construction validity */
+ if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected); /* by construction : at least 2 elts of rank 1, must be even */
+
+ /* results */
+ *nbSymbolsPtr = (U32)(oSize+1);
+ return iSize+1;
+}
+/**** ended inlining common/entropy_common.c ****/
+/**** start inlining common/error_private.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* The purpose of this file is to have a single list of error strings embedded in binary */
+
+/**** skipping file: error_private.h ****/
+
+const char* ERR_getErrorString(ERR_enum code)
+{
+#ifdef ZSTD_STRIP_ERROR_STRINGS
+ (void)code;
+ return "Error strings stripped";
+#else
+ static const char* const notErrorCode = "Unspecified error code";
+ switch( code )
+ {
+ case PREFIX(no_error): return "No error detected";
+ case PREFIX(GENERIC): return "Error (generic)";
+ case PREFIX(prefix_unknown): return "Unknown frame descriptor";
+ case PREFIX(version_unsupported): return "Version not supported";
+ case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
+ case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
+ case PREFIX(corruption_detected): return "Corrupted block detected";
+ case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+ case PREFIX(parameter_unsupported): return "Unsupported parameter";
+ case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
+ case PREFIX(init_missing): return "Context should be init first";
+ case PREFIX(memory_allocation): return "Allocation error : not enough memory";
+ case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough";
+ case PREFIX(stage_wrong): return "Operation not authorized at current processing stage";
+ case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
+ case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
+ case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+ case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
+ case PREFIX(dictionary_wrong): return "Dictionary mismatch";
+ case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
+ case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
+ case PREFIX(srcSize_wrong): return "Src size is incorrect";
+ case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer";
+ /* following error codes are not stable and may be removed or changed in a future version */
+ case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
+ case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
+ case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
+ case PREFIX(maxCode):
+ default: return notErrorCode;
+ }
+#endif
+}
+/**** ended inlining common/error_private.c ****/
+/**** start inlining common/fse_decompress.c ****/
+/* ******************************************************************
+ * FSE : Finite State Entropy decoder
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memcpy, memset */
+/**** skipping file: bitstream.h ****/
+/**** skipping file: compiler.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: fse.h ****/
+/**** skipping file: error_private.h ****/
+
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */
+
+
+/* **************************************************************
+* Templates
+****************************************************************/
+/*
+ designed to be included
+ for type-specific functions (template emulation in C)
+ Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+# error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+# error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+FSE_DTable* FSE_createDTable (unsigned tableLog)
+{
+ if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+ return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
+}
+
+void FSE_freeDTable (FSE_DTable* dt)
+{
+ free(dt);
+}
+
+size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+ void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
+ FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+ U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
+
+ U32 const maxSV1 = maxSymbolValue + 1;
+ U32 const tableSize = 1 << tableLog;
+ U32 highThreshold = tableSize-1;
+
+ /* Sanity Checks */
+ if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+ if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+ /* Init, lay down lowprob symbols */
+ { FSE_DTableHeader DTableH;
+ DTableH.tableLog = (U16)tableLog;
+ DTableH.fastMode = 1;
+ { S16 const largeLimit= (S16)(1 << (tableLog-1));
+ U32 s;
+ for (s=0; s<maxSV1; s++) {
+ if (normalizedCounter[s]==-1) {
+ tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+ symbolNext[s] = 1;
+ } else {
+ if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+ symbolNext[s] = normalizedCounter[s];
+ } } }
+ memcpy(dt, &DTableH, sizeof(DTableH));
+ }
+
+ /* Spread symbols */
+ { U32 const tableMask = tableSize-1;
+ U32 const step = FSE_TABLESTEP(tableSize);
+ U32 s, position = 0;
+ for (s=0; s<maxSV1; s++) {
+ int i;
+ for (i=0; i<normalizedCounter[s]; i++) {
+ tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+ position = (position + step) & tableMask;
+ while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
+ } }
+ if (position!=0) return ERROR(GENERIC); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+ }
+
+ /* Build Decoding table */
+ { U32 u;
+ for (u=0; u<tableSize; u++) {
+ FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+ U32 const nextState = symbolNext[symbol]++;
+ tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+ tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+ } }
+
+ return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-*******************************************************
+* Decompression (Byte symbols)
+*********************************************************/
+size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+ void* ptr = dt;
+ FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+ void* dPtr = dt + 1;
+ FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
+
+ DTableH->tableLog = 0;
+ DTableH->fastMode = 0;
+
+ cell->newState = 0;
+ cell->symbol = symbolValue;
+ cell->nbBits = 0;
+
+ return 0;
+}
+
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+ void* ptr = dt;
+ FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+ void* dPtr = dt + 1;
+ FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
+ const unsigned tableSize = 1 << nbBits;
+ const unsigned tableMask = tableSize - 1;
+ const unsigned maxSV1 = tableMask+1;
+ unsigned s;
+
+ /* Sanity checks */
+ if (nbBits < 1) return ERROR(GENERIC); /* min size */
+
+ /* Build Decoding Table */
+ DTableH->tableLog = (U16)nbBits;
+ DTableH->fastMode = 1;
+ for (s=0; s<maxSV1; s++) {
+ dinfo[s].newState = 0;
+ dinfo[s].symbol = (BYTE)s;
+ dinfo[s].nbBits = (BYTE)nbBits;
+ }
+
+ return 0;
+}
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+ void* dst, size_t maxDstSize,
+ const void* cSrc, size_t cSrcSize,
+ const FSE_DTable* dt, const unsigned fast)
+{
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* op = ostart;
+ BYTE* const omax = op + maxDstSize;
+ BYTE* const olimit = omax-3;
+
+ BIT_DStream_t bitD;
+ FSE_DState_t state1;
+ FSE_DState_t state2;
+
+ /* Init */
+ CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+ FSE_initDState(&state1, &bitD, dt);
+ FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+ /* 4 symbols per loop */
+ for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) & (op<olimit) ; op+=4) {
+ op[0] = FSE_GETSYMBOL(&state1);
+
+ if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */
+ BIT_reloadDStream(&bitD);
+
+ op[1] = FSE_GETSYMBOL(&state2);
+
+ if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */
+ { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+ op[2] = FSE_GETSYMBOL(&state1);
+
+ if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */
+ BIT_reloadDStream(&bitD);
+
+ op[3] = FSE_GETSYMBOL(&state2);
+ }
+
+ /* tail */
+ /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+ while (1) {
+ if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+ *op++ = FSE_GETSYMBOL(&state1);
+ if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+ *op++ = FSE_GETSYMBOL(&state2);
+ break;
+ }
+
+ if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+ *op++ = FSE_GETSYMBOL(&state2);
+ if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+ *op++ = FSE_GETSYMBOL(&state1);
+ break;
+ } }
+
+ return op-ostart;
+}
+
+
+size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+ const void* cSrc, size_t cSrcSize,
+ const FSE_DTable* dt)
+{
+ const void* ptr = dt;
+ const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
+ const U32 fastMode = DTableH->fastMode;
+
+ /* select fast mode (static) */
+ if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+ return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog)
+{
+ const BYTE* const istart = (const BYTE*)cSrc;
+ const BYTE* ip = istart;
+ short counting[FSE_MAX_SYMBOL_VALUE+1];
+ unsigned tableLog;
+ unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+
+ /* normal FSE decoding mode */
+ size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+ if (FSE_isError(NCountLength)) return NCountLength;
+ /* if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong); */ /* too small input size; supposed to be already checked in NCountLength, only remaining case : NCountLength==cSrcSize */
+ if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
+ ip += NCountLength;
+ cSrcSize -= NCountLength;
+
+ CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) );
+
+ return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace); /* always return, even if it is an error code */
+}
+
+
+typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize)
+{
+ DTable_max_t dt; /* Static analyzer seems unable to understand this table will be properly initialized later */
+ return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG);
+}
+
+
+
+#endif /* FSE_COMMONDEFS_ONLY */
+/**** ended inlining common/fse_decompress.c ****/
+/**** start inlining common/pool.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ====== Dependencies ======= */
+#include <stddef.h> /* size_t */
+/**** skipping file: debug.h ****/
+/**** start inlining zstd_internal.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/* this module contains definitions which must be identical
+ * across compression, decompression and dictBuilder.
+ * It also contains a few functions useful to at least 2 of them
+ * and which benefit from being inlined */
+
+/*-*************************************
+* Dependencies
+***************************************/
+#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
+/**** skipping file: compiler.h ****/
+/**** skipping file: mem.h ****/
+/**** skipping file: debug.h ****/
+/**** skipping file: error_private.h ****/
+#define ZSTD_STATIC_LINKING_ONLY
+/**** start inlining ../zstd.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_H_235446
+#define ZSTD_H_235446
+
+/* ====== Dependency ======*/
+#include <limits.h> /* INT_MAX */
+#include <stddef.h> /* size_t */
+
+
+/* ===== ZSTDLIB_API : control library symbols visibility ===== */
+#ifndef ZSTDLIB_VISIBILITY
+# if defined(__GNUC__) && (__GNUC__ >= 4)
+# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default")))
+# else
+# define ZSTDLIB_VISIBILITY
+# endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+# define ZSTDLIB_API ZSTDLIB_VISIBILITY
+#endif
+
+
+/*******************************************************************************
+ Introduction
+
+ zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+ real-time compression scenarios at zlib-level and better compression ratios.
+ The zstd compression library provides in-memory compression and decompression
+ functions.
+
+ The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+ which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+ caution, as they require more memory. The library also offers negative
+ compression levels, which extend the range of speed vs. ratio preferences.
+ The lower the level, the faster the speed (at the cost of compression).
+
+ Compression can be done in:
+ - a single step (described as Simple API)
+ - a single step, reusing a context (described as Explicit context)
+ - unbounded multiple steps (described as Streaming compression)
+
+ The compression ratio achievable on small data can be highly improved using
+ a dictionary. Dictionary compression can be performed in:
+ - a single step (described as Simple dictionary API)
+ - a single step, reusing a dictionary (described as Bulk-processing
+ dictionary API)
+
+ Advanced experimental functions can be accessed using
+ `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+ Advanced experimental APIs should never be used with a dynamically-linked
+ library. They are not "stable"; their definitions or signatures may change in
+ the future. Only static linking is allowed.
+*******************************************************************************/
+
+/*------ Version ------*/
+#define ZSTD_VERSION_MAJOR 1
+#define ZSTD_VERSION_MINOR 4
+#define ZSTD_VERSION_RELEASE 5
+
+#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+ZSTDLIB_API unsigned ZSTD_versionNumber(void); /**< to check runtime library version */
+
+#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
+#define ZSTD_QUOTE(str) #str
+#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
+#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
+ZSTDLIB_API const char* ZSTD_versionString(void); /* requires v1.3.0+ */
+
+/* *************************************
+ * Default constant
+ ***************************************/
+#ifndef ZSTD_CLEVEL_DEFAULT
+# define ZSTD_CLEVEL_DEFAULT 3
+#endif
+
+/* *************************************
+ * Constants
+ ***************************************/
+
+/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */
+#define ZSTD_MAGICNUMBER 0xFD2FB528 /* valid since v0.8.0 */
+#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* valid since v0.7.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */
+#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0
+
+#define ZSTD_BLOCKSIZELOG_MAX 17
+#define ZSTD_BLOCKSIZE_MAX (1<<ZSTD_BLOCKSIZELOG_MAX)
+
+
+
+/***************************************
+* Simple API
+***************************************/
+/*! ZSTD_compress() :
+ * Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+ * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ * @return : compressed size written into `dst` (<= `dstCapacity),
+ * or an error code if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel);
+
+/*! ZSTD_decompress() :
+ * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+ * `dstCapacity` is an upper bound of originalSize to regenerate.
+ * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
+ * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ * or an errorCode if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize);
+
+/*! ZSTD_getFrameContentSize() : requires v1.3.0+
+ * `src` should point to the start of a ZSTD encoded frame.
+ * `srcSize` must be at least as large as the frame header.
+ * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough.
+ * @return : - decompressed size of `src` frame content, if known
+ * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small)
+ * note 1 : a 0 return value means the frame is valid but "empty".
+ * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode.
+ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * Optionally, application can rely on some implicit limit,
+ * as ZSTD_decompress() only needs an upper bound of decompressed size.
+ * (For example, data could be necessarily cut into blocks <= 16 KB).
+ * note 3 : decompressed size is always present when compression is completed using single-pass functions,
+ * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict().
+ * note 4 : decompressed size can be very large (64-bits value),
+ * potentially larger than what local system can handle as a single memory segment.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ * Always ensure return value fits within application's authorized limits.
+ * Each application can set its own limits.
+ * note 6 : This function replaces ZSTD_getDecompressedSize() */
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2)
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/*! ZSTD_getDecompressedSize() :
+ * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
+ * Both functions work the same way, but ZSTD_getDecompressedSize() blends
+ * "empty", "unknown" and "error" results to the same return value (0),
+ * while ZSTD_getFrameContentSize() gives them separate return values.
+ * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
+ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_findFrameCompressedSize() :
+ * `src` should point to the start of a ZSTD frame or skippable frame.
+ * `srcSize` must be >= first frame size
+ * @return : the compressed size of the first frame starting at `src`,
+ * suitable to pass as `srcSize` to `ZSTD_decompress` or similar,
+ * or an error code if input is invalid */
+ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
+
+
+/*====== Helper functions ======*/
+#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */
+ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */
+ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */
+
+
+/***************************************
+* Explicit context
+***************************************/
+/*= Compression context
+ * When compressing many times,
+ * it is recommended to allocate a context just once,
+ * and re-use it for each successive compression operation.
+ * This will make workload friendlier for system's memory.
+ * Note : re-using context is just a speed / resource optimization.
+ * It doesn't change the compression ratio, which remains identical.
+ * Note 2 : In multi-threaded environments,
+ * use one different context per thread for parallel execution.
+ */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
+ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx);
+
+/*! ZSTD_compressCCtx() :
+ * Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
+ * Important : in order to behave similarly to `ZSTD_compress()`,
+ * this function compresses at requested compression level,
+ * __ignoring any other parameter__ .
+ * If any advanced parameter was set using the advanced API,
+ * they will all be reset. Only `compressionLevel` remains.
+ */
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel);
+
+/*= Decompression context
+ * When decompressing many times,
+ * it is recommended to allocate a context only once,
+ * and re-use it for each successive compression operation.
+ * This will make workload friendlier for system's memory.
+ * Use one context per thread for parallel execution. */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
+ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx);
+
+/*! ZSTD_decompressDCtx() :
+ * Same as ZSTD_decompress(),
+ * requires an allocated ZSTD_DCtx.
+ * Compatible with sticky parameters.
+ */
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
+
+
+/***************************************
+* Advanced compression API
+***************************************/
+
+/* API design :
+ * Parameters are pushed one by one into an existing context,
+ * using ZSTD_CCtx_set*() functions.
+ * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
+ * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
+ * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
+ *
+ * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
+ *
+ * This API supercedes all other "advanced" API entry points in the experimental section.
+ * In the future, we expect to remove from experimental API entry points which are redundant with this API.
+ */
+
+
+/* Compression strategies, listed from fastest to strongest */
+typedef enum { ZSTD_fast=1,
+ ZSTD_dfast=2,
+ ZSTD_greedy=3,
+ ZSTD_lazy=4,
+ ZSTD_lazy2=5,
+ ZSTD_btlazy2=6,
+ ZSTD_btopt=7,
+ ZSTD_btultra=8,
+ ZSTD_btultra2=9
+ /* note : new strategies _might_ be added in the future.
+ Only the order (from fast to strong) is guaranteed */
+} ZSTD_strategy;
+
+
+typedef enum {
+
+ /* compression parameters
+ * Note: When compressing with a ZSTD_CDict these parameters are superseded
+ * by the parameters used to construct the ZSTD_CDict.
+ * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */
+ ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table.
+ * Note that exact compression parameters are dynamically determined,
+ * depending on both compression level and srcSize (when known).
+ * Default level is ZSTD_CLEVEL_DEFAULT==3.
+ * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT.
+ * Note 1 : it's possible to pass a negative compression level.
+ * Note 2 : setting a level does not automatically set all other compression parameters
+ * to default. Setting this will however eventually dynamically impact the compression
+ * parameters which have not been manually set. The manually set
+ * ones will 'stick'. */
+ /* Advanced compression parameters :
+ * It's possible to pin down compression parameters to some specific values.
+ * In which case, these values are no longer dynamically selected by the compressor */
+ ZSTD_c_windowLog=101, /* Maximum allowed back-reference distance, expressed as power of 2.
+ * This will set a memory budget for streaming decompression,
+ * with larger values requiring more memory
+ * and typically compressing more.
+ * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
+ * Special: value 0 means "use default windowLog".
+ * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT
+ * requires explicitly allowing such size at streaming decompression stage. */
+ ZSTD_c_hashLog=102, /* Size of the initial probe table, as a power of 2.
+ * Resulting memory usage is (1 << (hashLog+2)).
+ * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
+ * Larger tables improve compression ratio of strategies <= dFast,
+ * and improve speed of strategies > dFast.
+ * Special: value 0 means "use default hashLog". */
+ ZSTD_c_chainLog=103, /* Size of the multi-probe search table, as a power of 2.
+ * Resulting memory usage is (1 << (chainLog+2)).
+ * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX.
+ * Larger tables result in better and slower compression.
+ * This parameter is useless for "fast" strategy.
+ * It's still useful when using "dfast" strategy,
+ * in which case it defines a secondary probe table.
+ * Special: value 0 means "use default chainLog". */
+ ZSTD_c_searchLog=104, /* Number of search attempts, as a power of 2.
+ * More attempts result in better and slower compression.
+ * This parameter is useless for "fast" and "dFast" strategies.
+ * Special: value 0 means "use default searchLog". */
+ ZSTD_c_minMatch=105, /* Minimum size of searched matches.
+ * Note that Zstandard can still find matches of smaller size,
+ * it just tweaks its search algorithm to look for this size and larger.
+ * Larger values increase compression and decompression speed, but decrease ratio.
+ * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX.
+ * Note that currently, for all strategies < btopt, effective minimum is 4.
+ * , for all strategies > fast, effective maximum is 6.
+ * Special: value 0 means "use default minMatchLength". */
+ ZSTD_c_targetLength=106, /* Impact of this field depends on strategy.
+ * For strategies btopt, btultra & btultra2:
+ * Length of Match considered "good enough" to stop search.
+ * Larger values make compression stronger, and slower.
+ * For strategy fast:
+ * Distance between match sampling.
+ * Larger values make compression faster, and weaker.
+ * Special: value 0 means "use default targetLength". */
+ ZSTD_c_strategy=107, /* See ZSTD_strategy enum definition.
+ * The higher the value of selected strategy, the more complex it is,
+ * resulting in stronger and slower compression.
+ * Special: value 0 means "use default strategy". */
+
+ /* LDM mode parameters */
+ ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
+ * This parameter is designed to improve compression ratio
+ * for large inputs, by finding large matches at long distance.
+ * It increases memory usage and window size.
+ * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB
+ * except when expressly set to a different value. */
+ ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2.
+ * Larger values increase memory usage and compression ratio,
+ * but decrease compression speed.
+ * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
+ * default: windowlog - 7.
+ * Special: value 0 means "automatically determine hashlog". */
+ ZSTD_c_ldmMinMatch=162, /* Minimum match size for long distance matcher.
+ * Larger/too small values usually decrease compression ratio.
+ * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
+ * Special: value 0 means "use default value" (default: 64). */
+ ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution.
+ * Larger values improve collision resolution but decrease compression speed.
+ * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX.
+ * Special: value 0 means "use default value" (default: 3). */
+ ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table.
+ * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
+ * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
+ * Larger values improve compression speed.
+ * Deviating far from default value will likely result in a compression ratio decrease.
+ * Special: value 0 means "automatically determine hashRateLog". */
+
+ /* frame parameters */
+ ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1)
+ * Content size must be known at the beginning of compression.
+ * This is automatically the case when using ZSTD_compress2(),
+ * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */
+ ZSTD_c_checksumFlag=201, /* A 32-bits checksum of content is written at end of frame (default:0) */
+ ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */
+
+ /* multi-threading parameters */
+ /* These parameters are only useful if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD).
+ * They return an error otherwise. */
+ ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel.
+ * When nbWorkers >= 1, triggers asynchronous mode when used with ZSTD_compressStream*() :
+ * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller,
+ * while compression work is performed in parallel, within worker threads.
+ * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end :
+ * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call).
+ * More workers improve speed, but also increase memory usage.
+ * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */
+ ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1.
+ * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads.
+ * 0 means default, which is dynamically determined based on compression parameters.
+ * Job size must be a minimum of overlap size, or 1 MB, whichever is largest.
+ * The minimum size is automatically and transparently enforced. */
+ ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size.
+ * The overlap size is an amount of data reloaded from previous job at the beginning of a new job.
+ * It helps preserve compression ratio, while each job is compressed in parallel.
+ * This value is enforced only when nbWorkers >= 1.
+ * Larger values increase compression ratio, but decrease speed.
+ * Possible values range from 0 to 9 :
+ * - 0 means "default" : value will be determined by the library, depending on strategy
+ * - 1 means "no overlap"
+ * - 9 means "full overlap", using a full window size.
+ * Each intermediate rank increases/decreases load size by a factor 2 :
+ * 9: full window; 8: w/2; 7: w/4; 6: w/8; 5:w/16; 4: w/32; 3:w/64; 2:w/128; 1:no overlap; 0:default
+ * default value varies between 6 and 9, depending on strategy */
+
+ /* note : additional experimental parameters are also available
+ * within the experimental section of the API.
+ * At the time of this writing, they include :
+ * ZSTD_c_rsyncable
+ * ZSTD_c_format
+ * ZSTD_c_forceMaxWindow
+ * ZSTD_c_forceAttachDict
+ * ZSTD_c_literalCompressionMode
+ * ZSTD_c_targetCBlockSize
+ * ZSTD_c_srcSizeHint
+ * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+ * note : never ever use experimentalParam? names directly;
+ * also, the enums values themselves are unstable and can still change.
+ */
+ ZSTD_c_experimentalParam1=500,
+ ZSTD_c_experimentalParam2=10,
+ ZSTD_c_experimentalParam3=1000,
+ ZSTD_c_experimentalParam4=1001,
+ ZSTD_c_experimentalParam5=1002,
+ ZSTD_c_experimentalParam6=1003,
+ ZSTD_c_experimentalParam7=1004
+} ZSTD_cParameter;
+
+typedef struct {
+ size_t error;
+ int lowerBound;
+ int upperBound;
+} ZSTD_bounds;
+
+/*! ZSTD_cParam_getBounds() :
+ * All parameters must belong to an interval with lower and upper bounds,
+ * otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ * - an error status field, which must be tested using ZSTD_isError()
+ * - lower and upper bounds, both inclusive
+ */
+ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam);
+
+/*! ZSTD_CCtx_setParameter() :
+ * Set one compression parameter, selected by enum ZSTD_cParameter.
+ * All parameters have valid bounds. Bounds can be queried using ZSTD_cParam_getBounds().
+ * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ * Setting a parameter is generally only possible during frame initialization (before starting compression).
+ * Exception : when using multi-threading mode (nbWorkers >= 1),
+ * the following parameters can be updated _during_ compression (within same frame):
+ * => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
+ * new parameters will be active for next job only (after a flush()).
+ * @return : an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtx_setPledgedSrcSize() :
+ * Total input data size to be compressed as a single frame.
+ * Value will be written in frame header, unless explicitly forbidden using ZSTD_c_contentSizeFlag.
+ * This value will also be controlled at end of frame, and trigger an error if not respected.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame.
+ * In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN.
+ * ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame.
+ * Note 2 : pledgedSrcSize is only valid once, for the next frame.
+ * It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note 3 : Whenever all input data is provided and consumed in a single round,
+ * for example with ZSTD_compress2(),
+ * or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end),
+ * this value is automatically overridden by srcSize instead.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
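+
+/* Illustrative sketch : announce the total size of a streamed frame up-front,
+ * so it gets recorded in the frame header. `cctx` and `totalSize` are hypothetical names.
+ *
+ *   ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+ *   ZSTD_CCtx_setPledgedSrcSize(cctx, (unsigned long long)totalSize);
+ *   // then feed exactly totalSize bytes through ZSTD_compressStream2(),
+ *   // finishing with ZSTD_e_end; a mismatch triggers an error at end of frame.
+ */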
+
+typedef enum {
+ ZSTD_reset_session_only = 1,
+ ZSTD_reset_parameters = 2,
+ ZSTD_reset_session_and_parameters = 3
+} ZSTD_ResetDirective;
+
+/*! ZSTD_CCtx_reset() :
+ * There are 2 different things that can be reset, independently or jointly :
+ * - The session : will stop compressing current frame, and make CCtx ready to start a new one.
+ * Useful after an error, or to interrupt any ongoing compression.
+ * Any internal data not yet flushed is cancelled.
+ * Compression parameters and dictionary remain unchanged.
+ * They will be used to compress next frame.
+ * Resetting session never fails.
+ * - The parameters : changes all parameters back to "default".
+ * This removes any reference to any dictionary too.
+ * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing)
+ * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError())
+ * - Both : similar to resetting the session, followed by resetting parameters.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
+
+/*! ZSTD_compress2() :
+ * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API.
+ * ZSTD_compress2() always starts a new frame.
+ * Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ * - The function is always blocking, returns when compression is completed.
+ * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ * @return : compressed size written into `dst` (<= `dstCapacity`),
+ * or an error code if it fails (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
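+
+/* Illustrative sketch : one-shot compression through the advanced API.
+ * Allocation failures are ignored for brevity; buffer names are hypothetical.
+ *
+ *   size_t compress_once(void* dst, size_t dstCapacity,
+ *                        const void* src, size_t srcSize)
+ *   {
+ *       size_t result;
+ *       ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *       // sticky parameters : they remain set for every following frame on this cctx
+ *       ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
+ *       ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
+ *       result = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *       ZSTD_freeCCtx(cctx);
+ *       return result;   // compressed size, or an error code (test with ZSTD_isError())
+ *   }
+ */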
+
+
+/***************************************
+* Advanced decompression API
+***************************************/
+
+/* The advanced API pushes parameters one by one into an existing DCtx context.
+ * Parameters are sticky, and remain valid for all following frames
+ * using the same DCtx context.
+ * It's possible to reset parameters to default values using ZSTD_DCtx_reset().
+ * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream().
+ * Therefore, no new decompression function is necessary.
+ */
+
+typedef enum {
+
+ ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which
+ * the streaming API will refuse to allocate memory buffer
+ * in order to protect the host from unreasonable memory requirements.
+ * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT).
+ * Special: value 0 means "use default maximum windowLog". */
+
+ /* note : additional experimental parameters are also available
+ * within the experimental section of the API.
+ * At the time of this writing, they include :
+ * ZSTD_d_format
+ * ZSTD_d_stableOutBuffer
+ * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+ * note : never ever use experimentalParam? names directly
+ */
+ ZSTD_d_experimentalParam1=1000,
+ ZSTD_d_experimentalParam2=1001
+
+} ZSTD_dParameter;
+
+/*! ZSTD_dParam_getBounds() :
+ * All parameters must belong to an interval with lower and upper bounds,
+ * otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ * - an error status field, which must be tested using ZSTD_isError()
+ * - both lower and upper bounds, inclusive
+ */
+ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam);
+
+/*! ZSTD_DCtx_setParameter() :
+ * Set one compression parameter, selected by enum ZSTD_dParameter.
+ * All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds().
+ * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ * Setting a parameter is only possible during frame initialization (before starting decompression).
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value);
+
+/*! ZSTD_DCtx_reset() :
+ * Return a DCtx to clean state.
+ * Session and parameters can be reset jointly or separately.
+ * Parameters can only be reset when no active frame is being decompressed.
+ * @return : 0, or an error code, which can be tested with ZSTD_isError()
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset);
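+
+/* Illustrative sketch : cap the window size a streaming decoder will accept,
+ * then return the context to defaults. Error handling is omitted for brevity.
+ *
+ *   ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *   ZSTD_DCtx_setParameter(dctx, ZSTD_d_windowLogMax, 27);   // refuse frames needing > 128 MB windows
+ *   // ... decompress frames with ZSTD_decompressStream() or ZSTD_decompressDCtx() ...
+ *   ZSTD_DCtx_reset(dctx, ZSTD_reset_session_and_parameters); // back to default parameters
+ *   ZSTD_freeDCtx(dctx);
+ */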
+
+
+/****************************
+* Streaming
+****************************/
+
+typedef struct ZSTD_inBuffer_s {
+ const void* src; /**< start of input buffer */
+ size_t size; /**< size of input buffer */
+ size_t pos; /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_inBuffer;
+
+typedef struct ZSTD_outBuffer_s {
+ void* dst; /**< start of output buffer */
+ size_t size; /**< size of output buffer */
+ size_t pos; /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_outBuffer;
+
+
+
+/*-***********************************************************************
+* Streaming compression - HowTo
+*
+* A ZSTD_CStream object is required to track streaming operation.
+* Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+* ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
+*
+* For parallel execution, use one separate ZSTD_CStream per thread.
+*
+* note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
+*
+* Parameters are sticky : when starting a new compression on the same context,
+* it will re-use the same sticky parameters as previous compression session.
+* When in doubt, it's recommended to fully initialize the context before usage.
+* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
+* ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
+* set more specific parameters, the pledged source size, or load a dictionary.
+*
+* Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to
+* consume input stream. The function will automatically update both `pos`
+* fields within `input` and `output`.
+* Note that the function may not consume the entire input, for example, because
+* the output buffer is already full, in which case `input.pos < input.size`.
+* The caller must check if input has been entirely consumed.
+* If not, the caller must make some room to receive more compressed data,
+* and then present again remaining input data.
+* note: ZSTD_e_continue is guaranteed to make some forward progress when called,
+* but doesn't guarantee maximal forward progress. This is especially relevant
+* when compressing with multiple threads. The call won't block if it can
+* consume some input, but if it can't it will wait for some, but not all,
+* output to be flushed.
+* @return : provides a minimum amount of data remaining to be flushed from internal buffers
+* or an error code, which can be tested using ZSTD_isError().
+*
+* At any moment, it's possible to flush whatever data might remain stuck within internal buffer,
+* using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated.
+* Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0).
+* In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush.
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the
+* operation.
+* note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will
+* block until the flush is complete or the output buffer is full.
+* @return : 0 if internal buffers are entirely flushed,
+* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+* or an error code, which can be tested using ZSTD_isError().
+*
+* Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame.
+* It will perform a flush and write frame epilogue.
+* The epilogue is required for decoders to consider a frame completed.
+* flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush.
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to
+* start a new frame.
+* note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will
+* block until the flush is complete or the output buffer is full.
+* @return : 0 if frame fully completed and fully flushed,
+* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+* or an error code, which can be tested using ZSTD_isError().
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream; /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+ /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+/*===== Streaming compression functions =====*/
+typedef enum {
+ ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+ ZSTD_e_flush=1, /* flush any data provided so far,
+ * it creates (at least) one new block, that can be decoded immediately on reception;
+ * frame will continue: any future data can still reference previously compressed data, improving compression.
+ * note : multithreaded compression will block to flush as much output as possible. */
+ ZSTD_e_end=2 /* flush any remaining data _and_ close current frame.
+ * note that frame is only closed after compressed data is fully flushed (return value == 0).
+ * After that point, any additional data starts a new frame.
+ * note : each frame is independent (does not reference any content from previous frame).
+ * note : multithreaded compression will block to flush as much output as possible. */
+} ZSTD_EndDirective;
+
+/*! ZSTD_compressStream2() :
+ * Behaves about the same as ZSTD_compressStream, with additional control on end directive.
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ * - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode)
+ * - output->pos must be <= dstCapacity, input->pos must be <= srcSize
+ * - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ * - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller.
+ * - When nbWorkers>=1, function is non-blocking : it just acquires a copy of input, distributes jobs to internal worker threads, flushes whatever output is available,
+ * and then immediately returns, just indicating that there is some data remaining to be flushed.
+ * The function nonetheless guarantees forward progress : it will return only after it reads or writes at least one byte.
+ * - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking.
+ * - @return provides a minimum amount of data remaining to be flushed from internal buffers
+ * or an error code, which can be tested using ZSTD_isError().
+ * if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
+ * This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ * For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
+ * - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
+ * only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+ * Before starting a new compression job, or changing compression parameters,
+ * it is required to fully flush internal buffers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective endOp);
+
+
+/* These buffer sizes are softly recommended.
+ * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output.
+ * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(),
+ * reducing the amount of memory shuffling and buffering, resulting in minor performance savings.
+ *
+ * However, note that these recommendations are from the perspective of a C caller program.
+ * If the streaming interface is invoked from some other language,
+ * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo,
+ * a major performance rule is to reduce crossing such interface to an absolute minimum.
+ * It's not rare for more time to end up spent crossing the interface than in compression itself.
+ * In such cases, prefer using buffers as large as practical,
+ * for both input and output, to reduce the number of round trips.
+ */
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void); /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block. */
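+
+/* Illustrative sketch : file-to-file streaming compression using the recommended
+ * buffer sizes. Assumes <stdio.h>, <stdlib.h> and <zstd.h>; allocation and I/O
+ * error checks are omitted for brevity.
+ *
+ *   static void compress_file(FILE* fin, FILE* fout, int level)
+ *   {
+ *       size_t const inSize  = ZSTD_CStreamInSize();
+ *       size_t const outSize = ZSTD_CStreamOutSize();
+ *       void* const inBuf  = malloc(inSize);
+ *       void* const outBuf = malloc(outSize);
+ *       ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *       ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);
+ *       for (;;) {
+ *           size_t const read = fread(inBuf, 1, inSize, fin);
+ *           int const lastChunk = (read < inSize);
+ *           ZSTD_EndDirective const mode = lastChunk ? ZSTD_e_end : ZSTD_e_continue;
+ *           ZSTD_inBuffer input = { inBuf, read, 0 };
+ *           int finished;
+ *           do {
+ *               ZSTD_outBuffer output = { outBuf, outSize, 0 };
+ *               size_t const remaining = ZSTD_compressStream2(cctx, &output, &input, mode);
+ *               fwrite(outBuf, 1, output.pos, fout);
+ *               // keep calling until input is consumed (ZSTD_e_continue)
+ *               // or the frame is fully flushed (ZSTD_e_end returns 0)
+ *               finished = lastChunk ? (remaining == 0) : (input.pos == input.size);
+ *           } while (!finished);
+ *           if (lastChunk) break;
+ *       }
+ *       ZSTD_freeCCtx(cctx);
+ *       free(inBuf); free(outBuf);
+ *   }
+ */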
+
+
+/* *****************************************************************************
+ * This following is a legacy streaming API.
+ * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
+ * It is redundant, but remains fully supported.
+ * Advanced parameters and dictionary compression can only be used through the
+ * new API.
+ ******************************************************************************/
+
+/*!
+ * Equivalent to:
+ *
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ */
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+/*!
+ * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue).
+ * NOTE: The return value is different. ZSTD_compressStream() returns a hint for
+ * the next read size (if non-zero and not an error). ZSTD_compressStream2()
+ * returns the minimum nb of bytes left to flush (if non-zero and not an error).
+ */
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+
+/*-***************************************************************************
+* Streaming decompression - HowTo
+*
+* A ZSTD_DStream object is required to track streaming operations.
+* Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+* ZSTD_DStream objects can be re-used multiple times.
+*
+* Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+* Alternatively, use advanced API to set specific properties.
+*
+* Use ZSTD_decompressStream() repetitively to consume your input.
+* The function will update both `pos` fields.
+* If `input.pos < input.size`, some input has not been consumed.
+* It's up to the caller to present again remaining data.
+* The function tries to flush all data decoded immediately, respecting output buffer size.
+* If `output.pos < output.size`, decoder has flushed everything it could.
+* But if `output.pos == output.size`, there might be some data left within internal buffers.
+* In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+* Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed,
+* or an error code, which can be tested using ZSTD_isError(),
+* or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
+* the return value is a suggested next input size (just a hint for better latency)
+* that will never request more than the remaining frame size.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
+ /* For compatibility with versions <= v1.2.0, prefer differentiating them. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
+
+/*===== Streaming decompression functions =====*/
+
+/* This function is redundant with the advanced API and equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_refDDict(zds, NULL);
+ */
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
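+
+/* Illustrative sketch : file-to-file streaming decompression using the recommended
+ * buffer sizes. Assumes <stdio.h>, <stdlib.h> and <zstd.h>; error reporting is
+ * reduced to an early exit for brevity.
+ *
+ *   static void decompress_file(FILE* fin, FILE* fout)
+ *   {
+ *       size_t const inSize  = ZSTD_DStreamInSize();
+ *       size_t const outSize = ZSTD_DStreamOutSize();
+ *       void* const inBuf  = malloc(inSize);
+ *       void* const outBuf = malloc(outSize);
+ *       ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *       size_t read;
+ *       while ( (read = fread(inBuf, 1, inSize, fin)) > 0 ) {
+ *           ZSTD_inBuffer input = { inBuf, read, 0 };
+ *           while (input.pos < input.size) {
+ *               ZSTD_outBuffer output = { outBuf, outSize, 0 };
+ *               size_t const ret = ZSTD_decompressStream(dctx, &output, &input);
+ *               if (ZSTD_isError(ret)) goto cleanup;      // report the error in real code
+ *               fwrite(outBuf, 1, output.pos, fout);
+ *           }
+ *       }
+ *   cleanup:
+ *       ZSTD_freeDCtx(dctx);
+ *       free(inBuf); free(outBuf);
+ *   }
+ */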
+
+
+/**************************
+* Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+ * Compression at an explicit compression level using a Dictionary.
+ * A dictionary can be any arbitrary data segment (also called a prefix),
+ * or a buffer with specified information (see dictBuilder/zdict.h).
+ * Note : This function loads the dictionary, resulting in significant startup delay.
+ * It's intended for a dictionary used only once.
+ * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+ * Decompression using a known Dictionary.
+ * Dictionary must be identical to the one used during compression.
+ * Note : This function loads the dictionary, resulting in significant startup delay.
+ * It's intended for a dictionary used only once.
+ * Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize);
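+
+/* Illustrative sketch : round trip with the simple dictionary API. The same `dict`
+ * buffer must be provided on both sides. Buffer names are hypothetical and error
+ * checks are omitted for brevity.
+ *
+ *   size_t const cSize = ZSTD_compress_usingDict(cctx, dst, dstCapacity,
+ *                                                src, srcSize,
+ *                                                dict, dictSize, 3);
+ *   size_t const dSize = ZSTD_decompress_usingDict(dctx, out, outCapacity,
+ *                                                  dst, cSize,
+ *                                                  dict, dictSize);
+ */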
+
+
+/***********************************
+ * Bulk processing dictionary API
+ **********************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/*! ZSTD_createCDict() :
+ * When compressing multiple messages or blocks using the same dictionary,
+ * it's recommended to digest the dictionary only once, since it's a costly operation.
+ * ZSTD_createCDict() will create a state from digesting a dictionary.
+ * The resulting state can be used for future compression operations with very limited startup cost.
+ * ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict.
+ * Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content.
+ * Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer,
+ * in which case the only thing that it transports is the @compressionLevel.
+ * This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively,
+ * expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
+ int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+ * Function frees memory allocated by ZSTD_createCDict(). */
+ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+ * Compression using a digested Dictionary.
+ * Recommended when same dictionary is used multiple times.
+ * Note : compression level is _decided at dictionary creation time_,
+ * and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict);
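+
+/* Illustrative sketch : digest the dictionary once, then reuse it for many
+ * compression calls. Names are hypothetical; in real code the CDict would be
+ * kept alive for as long as the dictionary is needed.
+ *
+ *   ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictSize, 3);
+ *   size_t const c1 = ZSTD_compress_usingCDict(cctx, dst1, cap1, src1, srcSize1, cdict);
+ *   size_t const c2 = ZSTD_compress_usingCDict(cctx, dst2, cap2, src2, srcSize2, cdict);
+ *   ZSTD_freeCDict(cdict);
+ */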
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+ * Create a digested dictionary, ready to start decompression operation without startup delay.
+ * dictBuffer can be released after DDict creation, as its content is copied inside DDict. */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+ * Function frees memory allocated with ZSTD_createDDict() */
+ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ * Decompression using a digested Dictionary.
+ * Recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_DDict* ddict);
+
+
+/********************************
+ * Dictionary helper functions
+ *******************************/
+
+/*! ZSTD_getDictID_fromDict() :
+ * Provides the dictID stored within dictionary.
+ * if @return == 0, the dictionary is not conformant with Zstandard specification.
+ * It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromDDict() :
+ * Provides the dictID of the dictionary loaded into `ddict`.
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ * Provides the dictID required to decompress the frame stored within `src`.
+ * If @return == 0, the dictID could not be decoded.
+ * This could be for one of the following reasons :
+ * - The frame does not require a dictionary to be decoded (most common case).
+ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information.
+ * Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ * - This is not a Zstandard frame.
+ * When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
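+
+/* Illustrative sketch : pick the right digested dictionary based on the dictID
+ * announced by the frame. `lookup_ddict_by_id()` is a hypothetical application
+ * function returning a previously created ZSTD_DDict*.
+ *
+ *   unsigned const id = ZSTD_getDictID_fromFrame(src, srcSize);
+ *   size_t dSize;
+ *   if (id != 0) {
+ *       const ZSTD_DDict* const ddict = lookup_ddict_by_id(id);
+ *       dSize = ZSTD_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ddict);
+ *   } else {
+ *       dSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
+ *   }
+ */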
+
+
+/*******************************************************************************
+ * Advanced dictionary and prefix API
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+ * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and
+ * only reset when the context is reset with ZSTD_reset_parameters or
+ * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/
+
+
+/*! ZSTD_CCtx_loadDictionary() :
+ * Create an internal CDict from `dict` buffer.
+ * Decompression will have to use same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
+ * meaning "return to no-dictionary mode".
+ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
+ * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ * Note 2 : Loading a dictionary involves building tables.
+ * It's also a CPU consuming operation, with non-negligible impact on latency.
+ * Tables are dependent on compression parameters, and for this reason,
+ * compression parameters can no longer be changed after loading a dictionary.
+ * Note 3 :`dict` content will be copied internally.
+ * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
+ * In such a case, dictionary buffer must outlive its users.
+ * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ * to precisely select how dictionary content must be interpreted. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
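+
+/* Illustrative sketch : load a dictionary once, then compress several frames with it.
+ * Parameters are set before loading, since they can no longer change afterwards.
+ * Buffer names are hypothetical and error checks are omitted.
+ *
+ *   ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);
+ *   ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 5);
+ *   ZSTD_CCtx_loadDictionary(cctx, dictBuf, dictSize);   // sticky : reused for every following frame
+ *   size_t const c1 = ZSTD_compress2(cctx, dst1, cap1, src1, srcSize1);
+ *   size_t const c2 = ZSTD_compress2(cctx, dst2, cap2, src2, srcSize2);  // same dictionary, no reload
+ */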
+
+/*! ZSTD_CCtx_refCDict() :
+ * Reference a prepared dictionary, to be used for all next compressed frames.
+ * Note that compression parameters are enforced from within CDict,
+ * and supersede any compression parameter previously set within CCtx.
+ * The parameters ignored are labeled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
+ * The ignored parameters will be used again if the CCtx is returned to no-dictionary mode.
+ * The dictionary will remain valid for future compressed frames using same CCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special : Referencing a NULL CDict means "return to no-dictionary mode".
+ * Note 1 : Currently, only one dictionary can be managed.
+ * Referencing a new dictionary effectively "discards" any previous one.
+ * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */
+ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+/*! ZSTD_CCtx_refPrefix() :
+ * Reference a prefix (single-usage dictionary) for next compressed frame.
+ * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end).
+ * Decompression will need same prefix to properly regenerate data.
+ * Compressing with a prefix is similar in outcome as performing a diff and compressing it,
+ * but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
+ * Note 1 : Prefix buffer is referenced. It **must** outlive compression.
+ * Its content must remain unmodified during compression.
+ * Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ * ensure that the window size is large enough to contain the entire source.
+ * See ZSTD_c_windowLog.
+ * Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ * It's a CPU consuming operation, with non-negligible impact on latency.
+ * If there is a need to use the same prefix multiple times, consider loadDictionary instead.
+ * Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent).
+ * Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+ const void* prefix, size_t prefixSize);
+
+/*! ZSTD_DCtx_loadDictionary() :
+ * Create an internal DDict from dict buffer,
+ * to be used to decompress next frames.
+ * The dictionary remains valid for all future frames, until explicitly invalidated.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
+ * meaning "return to no-dictionary mode".
+ * Note 1 : Loading a dictionary involves building tables,
+ * which has a non-negligible impact on CPU usage and latency.
+ * It's recommended to "load once, use many times", to amortize the cost
+ * Note 2 :`dict` content will be copied internally, so `dict` can be released after loading.
+ * Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead.
+ * Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of
+ * how dictionary content is loaded and interpreted.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_refDDict() :
+ * Reference a prepared dictionary, to be used to decompress next frames.
+ * The dictionary remains active for decompression of future frames using same DCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : Currently, only one dictionary can be managed.
+ * Referencing a new dictionary effectively "discards" any previous one.
+ * Special: referencing a NULL DDict means "return to no-dictionary mode".
+ * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+/*! ZSTD_DCtx_refPrefix() :
+ * Reference a prefix (single-usage dictionary) to decompress next frame.
+ * This is the reverse operation of ZSTD_CCtx_refPrefix(),
+ * and must use the same prefix as the one used during compression.
+ * Prefix is **only used once**. Reference is discarded at end of frame.
+ * End of frame is reached when ZSTD_decompressStream() returns 0.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary
+ * Note 2 : Prefix buffer is referenced. It **must** outlive decompression.
+ * Prefix buffer must remain unmodified up to the end of frame,
+ * reached when ZSTD_decompressStream() returns 0.
+ * Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent).
+ * Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode (Experimental section)
+ * Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost.
+ * A full dictionary is more costly, as it requires building tables.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx,
+ const void* prefix, size_t prefixSize);
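+
+/* Illustrative sketch : use a previous version of a document as a single-use prefix,
+ * on both the compression and decompression sides. Buffer names are hypothetical;
+ * both prefix references must stay valid and unmodified until their frame completes.
+ *
+ *   // compression side
+ *   ZSTD_CCtx_refPrefix(cctx, prevVersion, prevSize);
+ *   size_t const cSize = ZSTD_compress2(cctx, dst, dstCapacity, newVersion, newSize);
+ *   // decompression side : must reference the exact same prefix
+ *   ZSTD_DCtx_refPrefix(dctx, prevVersion, prevSize);
+ *   size_t const rSize = ZSTD_decompressDCtx(dctx, out, outCapacity, dst, cSize);
+ */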
+
+/* === Memory management === */
+
+/*! ZSTD_sizeof_*() :
+ * These functions give the _current_ memory usage of selected object.
+ * Note that object memory usage can evolve (increase or decrease) over time. */
+ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
+ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+
+#endif /* ZSTD_H_235446 */
+
+
+/* **************************************************************************************
+ * ADVANCED AND EXPERIMENTAL FUNCTIONS
+ ****************************************************************************************
+ * The definitions in the following section are considered experimental.
+ * They are provided for advanced scenarios.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/****************************************************************************************
+ * experimental API (static linking only)
+ ****************************************************************************************
+ * The following symbols and constants
+ * are not planned to join "stable API" status in the near future.
+ * They can still change in future versions.
+ * Some of them are planned to remain in the static_only section indefinitely.
+ * Some of them might be removed in the future (especially when redundant with existing stable functions)
+ * ***************************************************************************************/
+
+#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1) /* minimum input size required to query frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN(format) ((format) == ZSTD_f_zstd1 ? 6 : 2)
+#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* can be useful for static allocation */
+#define ZSTD_SKIPPABLEHEADERSIZE 8
+
+/* compression parameter bounds */
+#define ZSTD_WINDOWLOG_MAX_32 30
+#define ZSTD_WINDOWLOG_MAX_64 31
+#define ZSTD_WINDOWLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN 10
+#define ZSTD_HASHLOG_MAX ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN 6
+#define ZSTD_CHAINLOG_MAX_32 29
+#define ZSTD_CHAINLOG_MAX_64 30
+#define ZSTD_CHAINLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN
+#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN 1
+#define ZSTD_MINMATCH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_MINMATCH_MIN 3 /* only for ZSTD_btopt+, faster strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MAX ZSTD_BLOCKSIZE_MAX
+#define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */
+#define ZSTD_STRATEGY_MIN ZSTD_fast
+#define ZSTD_STRATEGY_MAX ZSTD_btultra2
+
+
+#define ZSTD_OVERLAPLOG_MIN 0
+#define ZSTD_OVERLAPLOG_MAX 9
+
+#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27 /* by default, the streaming decoder will refuse any frame
+ * requiring larger than (1<<ZSTD_WINDOWLOG_LIMIT_DEFAULT) window size,
+ * to preserve host's memory from unreasonable requirements.
+ * This limit can be overridden using ZSTD_DCtx_setParameter(,ZSTD_d_windowLogMax,).
+ * The limit does not apply for one-pass decoders (such as ZSTD_decompress()), since no additional memory is allocated */
+
+
+/* LDM parameter bounds */
+#define ZSTD_LDM_HASHLOG_MIN ZSTD_HASHLOG_MIN
+#define ZSTD_LDM_HASHLOG_MAX ZSTD_HASHLOG_MAX
+#define ZSTD_LDM_MINMATCH_MIN 4
+#define ZSTD_LDM_MINMATCH_MAX 4096
+#define ZSTD_LDM_BUCKETSIZELOG_MIN 1
+#define ZSTD_LDM_BUCKETSIZELOG_MAX 8
+#define ZSTD_LDM_HASHRATELOG_MIN 0
+#define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
+
+/* Advanced parameter bounds */
+#define ZSTD_TARGETCBLOCKSIZE_MIN 64
+#define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX
+#define ZSTD_SRCSIZEHINT_MIN 0
+#define ZSTD_SRCSIZEHINT_MAX INT_MAX
+
+/* internal */
+#define ZSTD_HASHLOG3_MAX 17
+
+
+/* --- Advanced types --- */
+
+typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
+
+typedef struct {
+ unsigned int matchPos; /* Match pos in dst */
+ /* If seqDef.offset > 3, then this is seqDef.offset - 3
+ * If seqDef.offset < 3, then this is the corresponding repeat offset
+ * But if seqDef.offset < 3 and litLength == 0, this is the
+ * repeat offset before the corresponding repeat offset
+ * And if seqDef.offset == 3 and litLength == 0, this is the
+ * most recent repeat offset - 1
+ */
+ unsigned int offset;
+ unsigned int litLength; /* Literal length */
+ unsigned int matchLength; /* Match length */
+ /* 0 when seq not rep and seqDef.offset otherwise
+ * when litLength == 0 this will be <= 4, otherwise <= 3 like normal
+ */
+ unsigned int rep;
+} ZSTD_Sequence;
+
+typedef struct {
+ unsigned windowLog; /**< largest match distance : larger == more compression, more memory needed during decompression */
+ unsigned chainLog; /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+ unsigned hashLog; /**< dispatch table : larger == faster, more memory */
+ unsigned searchLog; /**< nb of searches : larger == more compression, slower */
+ unsigned minMatch; /**< match length searched : larger == faster decompression, sometimes less compression */
+ unsigned targetLength; /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+ ZSTD_strategy strategy; /**< see ZSTD_strategy definition above */
+} ZSTD_compressionParameters;
+
+typedef struct {
+ int contentSizeFlag; /**< 1: content size will be in frame header (when known) */
+ int checksumFlag; /**< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */
+ int noDictIDFlag; /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */
+} ZSTD_frameParameters;
+
+typedef struct {
+ ZSTD_compressionParameters cParams;
+ ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+typedef enum {
+ ZSTD_dct_auto = 0, /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+ ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+ ZSTD_dct_fullDict = 2 /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+ ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */
+ ZSTD_dlm_byRef = 1 /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;
+
+typedef enum {
+ ZSTD_f_zstd1 = 0, /* zstd frame format, specified in zstd_compression_format.md (default) */
+ ZSTD_f_zstd1_magicless = 1 /* Variant of zstd frame format, without initial 4-bytes magic number.
+ * Useful to save 4 bytes per generated frame.
+ * Decoder cannot automatically recognise this format; it must be explicitly instructed to expect it. */
+} ZSTD_format_e;
+
+typedef enum {
+ /* Note: this enum and the behavior it controls are effectively internal
+ * implementation details of the compressor. They are expected to continue
+ * to evolve and should be considered only in the context of extremely
+ * advanced performance tuning.
+ *
+ * Zstd currently supports the use of a CDict in three ways:
+ *
+ * - The contents of the CDict can be copied into the working context. This
+ * means that the compression can search both the dictionary and input
+ * while operating on a single set of internal tables. This makes
+ * the compression faster per-byte of input. However, the initial copy of
+ * the CDict's tables incurs a fixed cost at the beginning of the
+ * compression. For small compressions (< 8 KB), that copy can dominate
+ * the cost of the compression.
+ *
+ * - The CDict's tables can be used in-place. In this model, compression is
+ * slower per input byte, because the compressor has to search two sets of
+ * tables. However, this model incurs no start-up cost (as long as the
+ * working context's tables can be reused). For small inputs, this can be
+ * faster than copying the CDict's tables.
+ *
+ * - The CDict's tables are not used at all, and instead we use the working
+ * context alone to reload the dictionary and use params based on the source
+ * size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict().
+ * This method is effective when the dictionary sizes are very small relative
+ * to the input size, and the input size is fairly large to begin with.
+ *
+ * Zstd has a simple internal heuristic that selects which strategy to use
+ * at the beginning of a compression. However, if experimentation shows that
+ * Zstd is making poor choices, it is possible to override that choice with
+ * this enum.
+ */
+ ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */
+ ZSTD_dictForceAttach = 1, /* Never copy the dictionary. */
+ ZSTD_dictForceCopy = 2, /* Always copy the dictionary. */
+ ZSTD_dictForceLoad = 3 /* Always reload the dictionary */
+} ZSTD_dictAttachPref_e;
+
+typedef enum {
+ ZSTD_lcm_auto = 0, /**< Automatically determine the compression mode based on the compression level.
+ * Negative compression levels will be uncompressed, and positive compression
+ * levels will be compressed. */
+ ZSTD_lcm_huffman = 1, /**< Always attempt Huffman compression. Uncompressed literals will still be
+ * emitted if Huffman compression is not profitable. */
+ ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */
+} ZSTD_literalCompressionMode_e;
+
+
+/***************************************
+* Frame size functions
+***************************************/
+
+/*! ZSTD_findDecompressedSize() :
+ * `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ * `srcSize` must be the _exact_ size of this series
+ * (i.e. there should be a frame boundary at `src + srcSize`)
+ * @return : - decompressed size of all data in all successive frames
+ * - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN
+ * - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ * note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
+ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 2 : decompressed size is always present when compression is done with ZSTD_compress()
+ * note 3 : decompressed size can be very large (64-bits value),
+ * potentially larger than what local system can handle as a single memory segment.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ * Always ensure result fits within application's authorized limits.
+ * Each application can set its own limits.
+ * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to
+ * read each contained frame header. This is fast as most of the data is skipped,
+ * however it does mean that all frame data must be present and valid. */
+ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_decompressBound() :
+ * `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ * `srcSize` must be the _exact_ size of this series
+ * (i.e. there should be a frame boundary at `src + srcSize`)
+ * @return : - upper-bound for the decompressed size of all data in all successive frames
+ * - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame.
+ * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`.
+ * in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value.
+ * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by:
+ * upper-bound = # blocks * min(128 KB, Window_Size)
+ */
+ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize);
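+
+/* Illustrative sketch : size a destination buffer from the decompression upper bound,
+ * while still enforcing an application-defined limit. `MY_MAX_OUTPUT` is a hypothetical
+ * application constant; allocation checks are omitted.
+ *
+ *   unsigned long long const bound = ZSTD_decompressBound(src, srcSize);
+ *   if (bound != ZSTD_CONTENTSIZE_ERROR && bound <= MY_MAX_OUTPUT) {
+ *       void* const dst = malloc((size_t)bound);
+ *       size_t const dSize = ZSTD_decompress(dst, (size_t)bound, src, srcSize);
+ *       // use dst[0..dSize) if !ZSTD_isError(dSize)
+ *       free(dst);
+ *   }
+ */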
+
+/*! ZSTD_frameHeaderSize() :
+ * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX.
+ * @return : size of the Frame Header,
+ * or an error code (if srcSize is too small) */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+/*! ZSTD_getSequences() :
+ * Extract sequences from the sequence store
+ * zc can be used to insert custom compression params.
+ * This function invokes ZSTD_compress2
+ * @return : number of sequences extracted
+ */
+ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+ size_t outSeqsSize, const void* src, size_t srcSize);
+
+
+/***************************************
+* Memory management
+***************************************/
+
+/*! ZSTD_estimate*() :
+ * These functions make it possible to estimate memory usage
+ * of a future {D,C}Ctx, before its creation.
+ *
+ * ZSTD_estimateCCtxSize() will provide a memory budget large enough
+ * for any compression level up to selected one.
+ * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
+ * does not include space for a window buffer.
+ * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
+ * The estimate will assume the input may be arbitrarily large,
+ * which is the worst case.
+ *
+ * When srcSize can be bound by a known and rather "small" value,
+ * this fact can be used to provide a tighter estimation
+ * because the CCtx compression context will need less memory.
+ * This tighter estimation can be provided by more advanced functions
+ * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
+ * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
+ * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
+ *
+ * Note 2 : only single-threaded compression is supported.
+ * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
+
+/*! ZSTD_estimateCStreamSize() :
+ * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+ * It will also consider src size to be arbitrarily "large", which is worst case.
+ * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+ * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter().
+ * Only single-threaded compression is supported; this function will return an error code if ZSTD_c_nbWorkers is >= 1.
+ * Note : CStream size estimation is only correct for single-threaded compression.
+ * ZSTD_DStream memory budget depends on window Size.
+ * This information can be passed manually, using ZSTD_estimateDStreamSize,
+ * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ * Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
+ * an internal ?Dict will be created, whose additional size is not estimated here.
+ * In this case, get total size by adding ZSTD_estimate?DictSize */
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+
+/*! ZSTD_estimate?DictSize() :
+ * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
+ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
+
+/*! ZSTD_initStatic*() :
+ * Initialize an object using a pre-allocated fixed-size buffer.
+ * workspace: The memory area to emplace the object into.
+ * Provided pointer *must be 8-bytes aligned*.
+ * Buffer must outlive object.
+ * workspaceSize: Use ZSTD_estimate*Size() to determine
+ * how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ * or NULL if error (size too small, incorrect alignment, etc.)
+ * Note : zstd will never resize nor malloc() when using a static buffer.
+ * If the object requires more memory than available,
+ * zstd will just error out (typically ZSTD_error_memory_allocation).
+ * Note 2 : there is no corresponding "free" function.
+ * Since workspace is allocated externally, it must be freed externally too.
+ * Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ * into its associated cParams.
+ * Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ * ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ * Limitation 2 : static cctx currently not compatible with multi-threading.
+ * Limitation 3 : static dctx is incompatible with legacy support.
+ */
+ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams);
+
+ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType);
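+
+/* Illustrative sketch : run a compression context inside a caller-provided workspace.
+ * `malloc()` stands in for any suitably aligned allocator; buffer names are
+ * hypothetical and error checks are omitted.
+ *
+ *   size_t const wkspSize = ZSTD_estimateCCtxSize(3);        // budget for level 3, single-shot
+ *   void* const wksp = malloc(wkspSize);                     // must be 8-bytes aligned
+ *   ZSTD_CCtx* const cctx = ZSTD_initStaticCCtx(wksp, wkspSize);
+ *   if (cctx != NULL) {
+ *       size_t const cSize = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, 3);
+ *       // no ZSTD_freeCCtx() : the caller owns the workspace
+ *   }
+ *   free(wksp);
+ */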
+
+
+/*! Custom memory allocation :
+ * These prototypes make it possible to pass your own allocation/free functions.
+ * ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ * All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */
+
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams,
+ ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_customMem customMem);
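+
+/* Illustrative sketch : route all internal allocations through custom functions.
+ * The wrappers below simply forward to stdlib, but could target an arena or pool.
+ *
+ *   static void* my_alloc(void* opaque, size_t size) { (void)opaque; return malloc(size); }
+ *   static void  my_free (void* opaque, void* addr)  { (void)opaque; free(addr); }
+ *
+ *   ZSTD_customMem const cmem = { my_alloc, my_free, NULL };
+ *   ZSTD_CCtx* const cctx = ZSTD_createCCtx_advanced(cmem);
+ *   // ... use cctx as usual ...
+ *   ZSTD_freeCCtx(cctx);   // releases memory through my_free()
+ */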
+
+
+
+/***************************************
+* Advanced compression functions
+***************************************/
+
+/*! ZSTD_createCDict_byReference() :
+ * Create a digested dictionary for compression
+ * Dictionary content is just referenced, not duplicated.
+ * As a consequence, `dictBuffer` **must** outlive CDict,
+ * and its content must remain unmodified throughout the lifetime of CDict.
+ * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
+ * `estimatedSrcSize` value is optional, select 0 if not known */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_getParams() :
+ * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`.
+ * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */
+ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_checkCParams() :
+ * Ensure param values remain within authorized range.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+
+/*! ZSTD_adjustCParams() :
+ * optimize params for a given `srcSize` and `dictSize`.
+ * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN.
+ * `dictSize` must be `0` when there is no dictionary.
+ * cPar can be invalid : all parameters will be clamped within valid range in the @return struct.
+ * This function never fails (wide contract) */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
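+
+/* Illustrative sketch : derive compression parameters for a known input profile,
+ * clamp them, and build a CDict with them. `srcSizeEstimate`, `dictBuf` and
+ * `dictSize` are hypothetical caller-provided values.
+ *
+ *   ZSTD_compressionParameters cParams = ZSTD_getCParams(3, srcSizeEstimate, dictSize);
+ *   cParams = ZSTD_adjustCParams(cParams, srcSizeEstimate, dictSize);
+ *   ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictBuf, dictSize,
+ *                                                       ZSTD_dlm_byCopy, ZSTD_dct_auto,
+ *                                                       cParams, ZSTD_defaultCMem);
+ */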
+
+/*! ZSTD_compress_advanced() :
+ * Note : this function is now DEPRECATED.
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
+ * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */
+ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ ZSTD_parameters params);
+
+/*! ZSTD_compress_usingCDict_advanced() :
+ * Note : this function is now REDUNDANT.
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
+ * This prototype will be marked as deprecated and generate compilation warning in some future version */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fParams);
+
+
+/*! ZSTD_CCtx_loadDictionary_byReference() :
+ * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx.
+ * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_loadDictionary_advanced() :
+ * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over
+ * how to load the dictionary (by copy ? by reference ?)
+ * and how to interpret it (automatic ? force raw mode ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_CCtx_refPrefix_advanced() :
+ * Same as ZSTD_CCtx_refPrefix(), but gives finer control over
+ * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/* === experimental parameters === */
+/* these parameters can be used with ZSTD_setParameter()
+ * they are not guaranteed to remain supported in the future */
+
+ /* Enables rsyncable mode,
+ * which makes compressed files more rsync friendly
+ * by adding periodic synchronization points to the compressed data.
+ * The target average block size is ZSTD_c_jobSize / 2.
+ * It's possible to modify the job size to increase or decrease
+ * the granularity of the synchronization point.
+ * Once the jobSize is smaller than the window size,
+ * it will result in compression ratio degradation.
+ * NOTE 1: rsyncable mode only works when multithreading is enabled.
+ * NOTE 2: rsyncable performs poorly in combination with long range mode,
+ * since it will decrease the effectiveness of synchronization points,
+ * though mileage may vary.
+ * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s.
+ * If the selected compression level is already running significantly slower,
+ * the overall speed won't be significantly impacted.
+ */
+ #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1
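+/* Illustrative sketch (not part of the upstream documentation) :
+ * enabling rsyncable mode. nbWorkers must be >= 1 (see NOTE 1 above); the value 4 is arbitrary.
+ *
+ *   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *   ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, 4);   // multithreading required
+ *   ZSTD_CCtx_setParameter(cctx, ZSTD_c_rsyncable, 1);
+ *   // ... compress with ZSTD_compressStream2() as usual, then ZSTD_freeCCtx(cctx) ...
+ */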
+
+/* Select a compression format.
+ * The value must be of type ZSTD_format_e.
+ * See ZSTD_format_e enum definition for details */
+#define ZSTD_c_format ZSTD_c_experimentalParam2
+
+/* Force back-reference distances to remain < windowSize,
+ * even when referencing into Dictionary content (default:0) */
+#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3
+
+/* Controls whether the contents of a CDict
+ * are used in place, or copied into the working context.
+ * Accepts values from the ZSTD_dictAttachPref_e enum.
+ * See the comments on that enum for an explanation of the feature. */
+#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4
+
+/* Controls how the literals are compressed (default is auto).
+ * The value must be of type ZSTD_literalCompressionMode_e.
+ * See ZSTD_literalCompressionMode_e enum definition for details.
+ */
+#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
+
+/* Tries to fit compressed block size to be around targetCBlockSize.
+ * No target when targetCBlockSize == 0.
+ * There is no guarantee on compressed block size (default:0) */
+#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
+
+/* User's best guess of source size.
+ * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that hint is close to actual source size,
+ * but compression ratio may regress significantly if the guess considerably underestimates the actual source size */
+#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
+
+/*! ZSTD_CCtx_getParameter() :
+ * Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ * and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value);
+
+
+/*! ZSTD_CCtx_params :
+ * Quick howto :
+ * - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ * - ZSTD_CCtxParams_setParameter() : Push parameters one by one into
+ * an existing ZSTD_CCtx_params structure.
+ * This is similar to
+ * ZSTD_CCtx_setParameter().
+ * - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ * an existing CCtx.
+ * These parameters will be applied to
+ * all subsequent frames.
+ * - ZSTD_compressStream2() : Do compression using the CCtx.
+ * - ZSTD_freeCCtxParams() : Free the memory.
+ *
+ * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams()
+ * for static allocation of CCtx for single-threaded compression.
+ */
+ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);
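+/* Illustrative sketch (not part of the upstream documentation) of the quick howto above ;
+ * `cctx` is assumed to be an existing ZSTD_CCtx*, and error checks are omitted.
+ *
+ *   ZSTD_CCtx_params* const cctxParams = ZSTD_createCCtxParams();
+ *   ZSTD_CCtxParams_setParameter(cctxParams, ZSTD_c_compressionLevel, 19);
+ *   ZSTD_CCtxParams_setParameter(cctxParams, ZSTD_c_checksumFlag, 1);
+ *   ZSTD_CCtx_setParametersUsingCCtxParams(cctx, cctxParams);
+ *   // ... drive compression with ZSTD_compressStream2(cctx, ...) ...
+ *   ZSTD_freeCCtxParams(cctxParams);
+ */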
+
+/*! ZSTD_CCtxParams_reset() :
+ * Reset params to default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_init() :
+ * Initializes the compression parameters of cctxParams according to
+ * compression level. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);
+
+/*! ZSTD_CCtxParams_init_advanced() :
+ * Initializes the compression and frame parameters of cctxParams according to
+ * params. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+
+/*! ZSTD_CCtxParams_setParameter() :
+ * Similar to ZSTD_CCtx_setParameter.
+ * Set one compression parameter, selected by enum ZSTD_cParameter.
+ * Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams().
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtxParams_getParameter() :
+ * Similar to ZSTD_CCtx_getParameter.
+ * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value);
+
+/*! ZSTD_CCtx_setParametersUsingCCtxParams() :
+ * Apply a set of ZSTD_CCtx_params to the compression context.
+ * This can be done even after compression is started.
+ * If nbWorkers==0, this will have no impact until a new compression is started.
+ * If nbWorkers>=1, new parameters will be picked up at next job,
+ * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+ ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
+
+/*! ZSTD_compressStream2_simpleArgs() :
+ * Same as ZSTD_compressStream2(),
+ * but using only integral types as arguments.
+ * This variant might be helpful for binders from dynamic languages
+ * which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs (
+ ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos,
+ ZSTD_EndDirective endOp);
+
+
+/***************************************
+* Advanced decompression functions
+***************************************/
+
+/*! ZSTD_isFrame() :
+ * Tells if the content of `buffer` starts with a valid Frame Identifier.
+ * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ * Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_createDDict_byReference() :
+ * Create a digested dictionary, ready to start decompression operation without startup delay.
+ * Dictionary content is referenced, and therefore stays in dictBuffer.
+ * It is important that dictBuffer outlives DDict,
+ * it must remain read accessible throughout the lifetime of DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_byReference() :
+ * Same as ZSTD_DCtx_loadDictionary(),
+ * but references `dict` content instead of copying it into `dctx`.
+ * This saves memory if `dict` remains around.
+ * However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_advanced() :
+ * Same as ZSTD_DCtx_loadDictionary(),
+ * but gives direct control over
+ * how to load the dictionary (by copy ? by reference ?)
+ * and how to interpret it (automatic ? force raw mode ? full mode only ?). */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_refPrefix_advanced() :
+ * Same as ZSTD_DCtx_refPrefix(), but gives finer control over
+ * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ * Refuses allocating internal buffers for frames requiring a window size larger than provided limit.
+ * This protects a decoder context from reserving too much memory for itself (potential attack scenario).
+ * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
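+/* Illustrative sketch (not part of the upstream documentation) :
+ * capping decoder memory when streaming untrusted input; the 128 MB limit is an arbitrary example.
+ *
+ *   ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *   size_t const err = ZSTD_DCtx_setMaxWindowSize(dctx, (size_t)128 << 20);
+ *   if (ZSTD_isError(err)) { ... }
+ *   // frames requiring a larger window now fail cleanly instead of allocating huge buffers
+ */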
+
+/* ZSTD_d_format
+ * experimental parameter,
+ * allowing selection between ZSTD_format_e input compression formats
+ */
+#define ZSTD_d_format ZSTD_d_experimentalParam1
+/* ZSTD_d_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the decompressor, and
+ * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer
+ * MUST be large enough to fit the entire decompressed frame. This will be
+ * checked when the frame content size is known. The data in the ZSTD_outBuffer
+ * in the range [dst, dst + pos) MUST not be modified during decompression
+ * or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+ * If you need to avoid the input buffer allocation use the buffer-less
+ * streaming API.
+ *
+ * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, decompression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST
+ * not be modified during decompression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate
+ * matches. Normally zstd maintains its own buffer for this purpose, but passing
+ * this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2
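+/* Illustrative sketch (not part of the upstream documentation) :
+ * streaming decompression into a single caller-owned buffer with ZSTD_d_stableOutBuffer.
+ * `dst`/`dstCapacity` must hold the entire decompressed frame, and `out` must stay identical
+ * across calls (only `pos` moves). Error handling is abbreviated.
+ *
+ *   ZSTD_DCtx_setParameter(dctx, ZSTD_d_stableOutBuffer, 1);
+ *   ZSTD_outBuffer out = { dst, dstCapacity, 0 };
+ *   ZSTD_inBuffer in = { src, srcSize, 0 };
+ *   size_t ret;
+ *   do {
+ *       ret = ZSTD_decompressStream(dctx, &out, &in);
+ *       if (ZSTD_isError(ret)) { ... }
+ *   } while (ret != 0);   // 0 means the frame is fully decoded and flushed
+ */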
+
+/*! ZSTD_DCtx_setFormat() :
+ * Instruct the decoder context about what kind of data to decode next.
+ * This instruction is mandatory to decode data without a fully-formed header,
+ * such as ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
+
+/*! ZSTD_decompressStream_simpleArgs() :
+ * Same as ZSTD_decompressStream(),
+ * but using only integral types as arguments.
+ * This can be helpful for binders from dynamic languages
+ * which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs (
+ ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos);
+
+
+/********************************************************************
+* Advanced streaming functions
+* Warning : most of these functions are now redundant with the Advanced API.
+* Once Advanced API reaches "stable" status,
+* redundant functions will be deprecated, and then at some point removed.
+********************************************************************/
+
+/*===== Advanced Streaming compression functions =====*/
+/**! ZSTD_initCStream_srcSize() :
+ * This function is deprecated, and equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * pledgedSrcSize must be correct. If it is not known at init time, use
+ * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs,
+ * "0" also disables frame content size field. It may be enabled in the future.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+ int compressionLevel,
+ unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingDict() :
+ * This function is deprecated, and is equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * Creates an internal CDict (incompatible with static CCtx), except if
+ * dict == NULL or dictSize < 8, in which case no dict is used.
+ * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if
+ * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ int compressionLevel);
+
+/**! ZSTD_initCStream_advanced() :
+ * This function is deprecated, and is approximately equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * // Pseudocode: Set each zstd parameter and leave the rest as-is.
+ * for ((param, value) : params) {
+ * ZSTD_CCtx_setParameter(zcs, param, value);
+ * }
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy.
+ * pledgedSrcSize must be correct.
+ * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ ZSTD_parameters params,
+ unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingCDict() :
+ * This function is deprecated, and equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * note : cdict will just be referenced, and must outlive compression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
+
+/**! ZSTD_initCStream_usingCDict_advanced() :
+ * This function is DEPRECATED, and is approximately equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
+ * for ((fParam, value) : fParams) {
+ * ZSTD_CCtx_setParameter(zcs, fParam, value);
+ * }
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ * ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters.
+ * pledgedSrcSize must be correct. If srcSize is not known at init time, use
+ * value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fParams,
+ unsigned long long pledgedSrcSize);
+
+/*! ZSTD_resetCStream() :
+ * This function is deprecated, and is equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * start a new frame, using the same parameters as the previous frame.
+ * This is typically useful to skip the dictionary loading stage, since it will re-use the dictionary in place.
+ * Note that zcs must be initialized at least once before using ZSTD_resetCStream().
+ * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
+ * If pledgedSrcSize > 0, its value must be correct, as it will be written in the header, and checked at the end.
+ * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
+ * but it will change to mean "empty" in a future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError())
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
+
+
+typedef struct {
+ unsigned long long ingested; /* nb input bytes read and buffered */
+ unsigned long long consumed; /* nb input bytes actually compressed */
+ unsigned long long produced; /* nb of compressed bytes generated and buffered */
+ unsigned long long flushed; /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
+ unsigned currentJobID; /* MT only : latest started job nb */
+ unsigned nbActiveWorkers; /* MT only : nb of workers actively compressing at probe time */
+} ZSTD_frameProgression;
+
+/* ZSTD_getFrameProgression() :
+ * tells how much data has been ingested (read from input),
+ * consumed (input actually compressed) and produced (output) for current frame.
+ * Note : (ingested - consumed) is the amount of input data buffered internally, not yet compressed.
+ * Aggregates progression inside active worker threads.
+ */
+ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+/*! ZSTD_toFlushNow() :
+ * Tell how many bytes are ready to be flushed immediately.
+ * Useful for multithreading scenarios (nbWorkers >= 1).
+ * Probe the oldest active job, defined as oldest job not yet entirely flushed,
+ * and check its output buffer.
+ * @return : amount of data stored in oldest job and ready to be flushed immediately.
+ * if @return == 0, it means either :
+ * + there is no active job (could be checked with ZSTD_frameProgression()), or
+ * + oldest job is still actively compressing data,
+ * but everything it has produced has also been flushed so far,
+ * therefore flush speed is limited by production speed of oldest job
+ * irrespective of the speed of concurrent (and newer) jobs.
+ */
+ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
+
+
+/*===== Advanced Streaming decompression functions =====*/
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
+ *
+ * note: no dictionary will be used if dict == NULL or dictSize < 8
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_refDDict(zds, ddict);
+ *
+ * note : ddict is referenced, it must outlive decompression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *
+ * re-use decompression parameters from previous init; saves dictionary loading
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+
+
+/*********************************************************************
+* Buffer-less and synchronous inner streaming functions
+*
+* This is an advanced API, giving full control over buffer management, for users who need direct control over memory.
+* But it's also a complex one, with several restrictions, documented below.
+* Prefer normal streaming API for an easier experience.
+********************************************************************* */
+
+/**
+ Buffer-less streaming compression (synchronous mode)
+
+ A ZSTD_CCtx object is required to track streaming operations.
+ Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
+ ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+
+ Start by initializing a context.
+ Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression,
+ or ZSTD_compressBegin_advanced(), for finer parameter control.
+ It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
+
+ Then, consume your input using ZSTD_compressContinue().
+ There are some important considerations to keep in mind when using this advanced function :
+ - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only.
+ - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks.
+ - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario.
+ Worst case evaluation is provided by ZSTD_compressBound().
+ ZSTD_compressContinue() doesn't guarantee recovery after a failed compression.
+ - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog).
+ It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consist of multiple contiguous blocks)
+ - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps,
+ in which case it will "discard" the relevant memory section from its history.
+
+ Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum.
+ It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
+ Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
+
+ `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+*/
+
+/*===== Buffer-less streaming compression functions =====*/
+ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
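+/* Illustrative sketch (not part of the upstream documentation) of the buffer-less
+ * compression sequence described above; the chunk iteration is pseudocode and error
+ * handling is abbreviated.
+ *
+ *   ZSTD_compressBegin(cctx, 3);   // or _usingDict() / _advanced()
+ *   size_t dstPos = 0;
+ *   for (each input chunk `chunk` of size `chunkSize`) {   // pseudocode
+ *       size_t const ret = ZSTD_compressContinue(cctx, dst + dstPos, dstCapacity - dstPos, chunk, chunkSize);
+ *       if (ZSTD_isError(ret)) { ... }
+ *       dstPos += ret;
+ *   }
+ *   dstPos += ZSTD_compressEnd(cctx, dst + dstPos, dstCapacity - dstPos, NULL, 0);   // last block + optional checksum
+ */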
+
+
+/*-
+ Buffer-less streaming decompression (synchronous mode)
+
+ A ZSTD_DCtx object is required to track streaming operations.
+ Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+ A ZSTD_DCtx object can be re-used multiple times.
+
+ First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
+ Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+ Data fragment must be large enough to ensure successful decoding.
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+ @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+ >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+ errorCode, which can be tested using ZSTD_isError().
+
+ It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+ such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`).
+ Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information.
+ As a consequence, check that values remain within valid application range.
+ For example, do not allocate memory blindly, check that `windowSize` is within expectation.
+ Each application can set its own limits, depending on local restrictions.
+ For extended interoperability, it is recommended to support `windowSize` of at least 8 MB.
+
+ ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes.
+ ZSTD_decompressContinue() is very sensitive to contiguity,
+ if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+ or that previous contiguous segment is large enough to properly handle maximum back-reference distance.
+ There are multiple ways to guarantee this condition.
+
+ The most memory efficient way is to use a round buffer of sufficient size.
+ Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
+ which can @return an error code if required value is too large for current system (in 32-bits mode).
+ In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
+ up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
+ whose maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
+ At that point, decoding can resume from the beginning of the buffer.
+ Note that already decoded data stored in the buffer should be flushed before being overwritten.
+
+ There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory.
+
+ Finally, if you control the compression process, you can also ignore all buffer size rules,
+ as long as the encoder and decoder progress in "lock-step",
+ aka use exactly the same buffer sizes, break contiguity at the same place, etc.
+
+ Once buffers are set up, start decompression with ZSTD_decompressBegin().
+ If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict().
+
+ Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
+
+ @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+ It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
+ It can also be an error code, which can be tested with ZSTD_isError().
+
+ A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+ Context can then be reset to start a new decompression.
+
+ Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType().
+ This information is not required to properly decode a frame.
+
+ == Special case : skippable frames ==
+
+ Skippable frames allow integration of user-defined data into a flow of concatenated frames.
+ Skippable frames will be ignored (skipped) by decompressor.
+ The format of skippable frames is as follows :
+ a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+ b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+ c) Frame Content - any content (User Data) of length equal to Frame Size
+ For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame.
+ For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content.
+*/
+
+/*===== Buffer-less streaming decompression functions =====*/
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */
+ unsigned blockSizeMax;
+ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+ unsigned headerSize;
+ unsigned dictID;
+ unsigned checksumFlag;
+} ZSTD_frameHeader;
+
+/*! ZSTD_getFrameHeader() :
+ * decode Frame Header, or requires larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ * or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */
+/*! ZSTD_getFrameHeader_advanced() :
+ * same as ZSTD_getFrameHeader(),
+ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
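+/* Illustrative sketch (not part of the upstream documentation) of the buffer-less
+ * decompression loop described above, using one output buffer large enough for the
+ * whole frame (round-buffer management and most error handling omitted).
+ *
+ *   ZSTD_frameHeader zfh;
+ *   if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) { ... }   // needs more input, or error
+ *   // inspect zfh.windowSize / zfh.frameContentSize before sizing dst
+ *   ZSTD_decompressBegin(dctx);                                  // or _usingDict() / _usingDDict()
+ *   const char* ip = (const char*)src;
+ *   char* op = (char*)dst;
+ *   size_t next;
+ *   while ((next = ZSTD_nextSrcSizeToDecompress(dctx)) != 0) {
+ *       size_t const ret = ZSTD_decompressContinue(dctx, op, dstCapacity - (size_t)(op - (char*)dst), ip, next);
+ *       if (ZSTD_isError(ret)) { ... }
+ *       ip += next; op += ret;   // ret can be 0 when only metadata was decoded
+ *   }
+ */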
+
+/* misc */
+ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
+ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
+
+
+
+
+/* ============================ */
+/** Block level API */
+/* ============================ */
+
+/*!
+ Block functions produce and decode raw zstd blocks, without frame metadata.
+ Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+ But users will have to manage the metadata needed to regenerate the data themselves, such as compressed and content sizes.
+
+ A few rules to respect :
+ - Compressing and decompressing require a context structure
+ + Use ZSTD_createCCtx() and ZSTD_createDCtx()
+ - It is necessary to init context before starting
+ + compression : any ZSTD_compressBegin*() variant, including with dictionary
+ + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
+ + copyCCtx() and copyDCtx() can be used too
+ - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
+ + If input is larger than a block size, it's necessary to split input data into multiple blocks
+ + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
+ Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block.
+ - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) !
+ ===> In which case, nothing is produced into `dst` !
+ + User __must__ test for such outcome and deal directly with uncompressed data
+ + A block cannot be declared incompressible if ZSTD_compressBlock() return value was != 0.
+ Doing so would mess up the statistics history, leading to potential data corruption.
+ + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !!
+ + In case of multiple successive blocks, should some of them be uncompressed,
+ decoder must be informed of their existence in order to follow proper history.
+ Use ZSTD_insertBlock() for such a case.
+*/
+
+/*===== Raw zstd block functions =====*/
+ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
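+/* Illustrative sketch (not part of the upstream documentation) :
+ * compressing a single block and handling the "not compressible" case described above.
+ *
+ *   size_t const cSize = ZSTD_compressBlock(cctx, dst, dstCapacity, src, srcSize);
+ *   if (ZSTD_isError(cSize)) { ... }
+ *   if (cSize == 0) {
+ *       // nothing was written into dst : store/transmit the raw block instead,
+ *       // and have the decoder call ZSTD_insertBlock(dctx, src, srcSize)
+ *       // so its history stays in sync.
+ *   }
+ */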
+
+
+#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
+/**** ended inlining ../zstd.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: huf.h ****/
+#ifndef XXH_STATIC_LINKING_ONLY
+# define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */
+#endif
+/**** start inlining xxhash.h ****/
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - xxHash source repository : https://github.com/Cyan4973/xxHash
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name Speed Q.Score Author
+xxHash 5.4 GB/s 10
+CrapWow 3.2 GB/s 2 Andrew
+MurmurHash 3a 2.7 GB/s 10 Austin Appleby
+SpookyHash 2.0 GB/s 10 Bob Jenkins
+SBox 1.4 GB/s 9 Bret Mulvey
+Lookup3 1.2 GB/s 9 Bob Jenkins
+SuperFastHash 1.2 GB/s 1 Paul Hsieh
+CityHash64 1.05 GB/s 10 Pike & Alakuijala
+FNV 0.55 GB/s 5 Fowler, Noll, Vo
+CRC32 0.43 GB/s 9
+MD5-32 0.33 GB/s 10 Ronald L. Rivest
+SHA1-32 0.28 GB/s 10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+A 64-bits version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bits applications only.
+Name Speed on 64 bits Speed on 32 bits
+XXH64 13.8 GB/s 1.9 GB/s
+XXH32 6.8 GB/s 6.0 GB/s
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+
+/* ****************************
+* Definitions
+******************************/
+#include <stddef.h> /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/* ****************************
+* API modifier
+******************************/
+/** XXH_PRIVATE_API
+* This is useful if you want to include xxhash functions in `static` mode
+* in order to inline them, and remove their symbol from the public list.
+* Methodology :
+* #define XXH_PRIVATE_API
+* #include "xxhash.h"
+* `xxhash.c` is automatically included.
+* It's not useful to compile and link it as a separate module anymore.
+*/
+#ifdef XXH_PRIVATE_API
+# ifndef XXH_STATIC_LINKING_ONLY
+# define XXH_STATIC_LINKING_ONLY
+# endif
+# if defined(__GNUC__)
+# define XXH_PUBLIC_API static __inline __attribute__((unused))
+# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# define XXH_PUBLIC_API static inline
+# elif defined(_MSC_VER)
+# define XXH_PUBLIC_API static __inline
+# else
+# define XXH_PUBLIC_API static /* this version may generate warnings for unused static functions; disable the relevant warning */
+# endif
+#else
+# define XXH_PUBLIC_API /* do nothing */
+#endif /* XXH_PRIVATE_API */
+
+/*!XXH_NAMESPACE, aka Namespace Emulation :
+
+If you want to include _and expose_ xxHash functions from within your own library,
+but also want to avoid symbol collisions with another library which also includes xxHash,
+
+you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+with the value of XXH_NAMESPACE (so avoid keeping it NULL, and avoid numeric values).
+
+Note that no change is required within the calling program as long as it includes `xxhash.h` :
+regular symbol name will be automatically translated by this header.
+*/
+#ifdef XXH_NAMESPACE
+# define XXH_CAT(A,B) A##B
+# define XXH_NAME2(A,B) XXH_CAT(A,B)
+# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+* Version
+***************************************/
+#define XXH_VERSION_MAJOR 0
+#define XXH_VERSION_MINOR 6
+#define XXH_VERSION_RELEASE 2
+#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+* Simple Hash Functions
+******************************/
+typedef unsigned int XXH32_hash_t;
+typedef unsigned long long XXH64_hash_t;
+
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed);
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
+
+/*!
+XXH32() :
+ Calculate the 32-bits hash of a sequence of "length" bytes stored at memory address "input".
+ The memory between input & input+length must be valid (allocated and read-accessible).
+ "seed" can be used to alter the result predictably.
+ Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+XXH64() :
+ Calculate the 64-bits hash of a sequence of "length" bytes stored at memory address "input".
+ "seed" can be used to alter the result predictably.
+ This function runs 2x faster on 64-bits systems, but slower on 32-bits systems (see benchmark).
+*/
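+/* Illustrative sketch (not part of the original notice) : one-shot hashing of a buffer,
+ * where `buffer` and `bufferSize` are placeholders.
+ *
+ *   unsigned long long const h64 = XXH64(buffer, bufferSize, 0);   // seed = 0
+ *   unsigned int const h32 = XXH32(buffer, bufferSize, 0);
+ */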
+
+
+/* ****************************
+* Streaming Hash Functions
+******************************/
+typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */
+typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
+
+/*! State allocation, compatible with dynamic libraries */
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
+
+
+/* hash streaming */
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
+
+/*
+These functions generate the xxHash of an input provided in multiple segments.
+Note that, for small input, they are slower than single-call functions, due to state management.
+For small input, prefer `XXH32()` and `XXH64()` .
+
+XXH state must first be allocated, using XXH*_createState() .
+
+Start a new hash by initializing state with a seed, using XXH*_reset().
+
+Then, feed the hash state by calling XXH*_update() as many times as necessary.
+Obviously, input must be allocated and read accessible.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+
+Finally, a hash value can be produced anytime, by using XXH*_digest().
+This function returns the nn-bits hash as an int or long long.
+
+It's still possible to continue inserting input into the hash state after a digest,
+and generate some new hashes later on, by calling XXH*_digest() again.
+
+When done, free XXH state space if it was allocated dynamically.
+*/
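+/* Illustrative sketch (not part of the original notice) of the streaming sequence above,
+ * hashing two segments with seed 0; error codes from reset/update are ignored for brevity.
+ *
+ *   XXH64_state_t* const state = XXH64_createState();
+ *   XXH64_reset(state, 0);
+ *   XXH64_update(state, seg1, seg1Size);
+ *   XXH64_update(state, seg2, seg2Size);
+ *   unsigned long long const hash = XXH64_digest(state);
+ *   XXH64_freeState(state);
+ */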
+
+
+/* **************************
+* Utils
+****************************/
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* ! C99 */
+# define restrict /* disable restrict */
+#endif
+
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dst_state, const XXH32_state_t* restrict src_state);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dst_state, const XXH64_state_t* restrict src_state);
+
+
+/* **************************
+* Canonical representation
+****************************/
+/* Default result type for XXH functions are primitive unsigned 32 and 64 bits.
+* The canonical representation uses human-readable write convention, aka big-endian (large digits first).
+* These functions allow transformation of hash result into and from its canonical format.
+* This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
+*/
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+/* ================================================================================================
+ This section contains definitions which are not guaranteed to remain stable.
+ They may change in future versions, becoming incompatible with a different version of the library.
+ They shall only be used with static linking.
+ Never use these definitions in association with dynamic linking !
+=================================================================================================== */
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXH_STATIC_H_3543687687345)
+#define XXH_STATIC_H_3543687687345
+
+/* These definitions are only meant to allow allocation of XXH state
+ statically, on stack, or in a struct for example.
+ Do not use members directly. */
+
+ struct XXH32_state_s {
+ unsigned total_len_32;
+ unsigned large_len;
+ unsigned v1;
+ unsigned v2;
+ unsigned v3;
+ unsigned v4;
+ unsigned mem32[4]; /* buffer defined as U32 for alignment */
+ unsigned memsize;
+ unsigned reserved; /* never read nor write, will be removed in a future version */
+ }; /* typedef'd to XXH32_state_t */
+
+ struct XXH64_state_s {
+ unsigned long long total_len;
+ unsigned long long v1;
+ unsigned long long v2;
+ unsigned long long v3;
+ unsigned long long v4;
+ unsigned long long mem64[4]; /* buffer defined as U64 for alignment */
+ unsigned memsize;
+ unsigned reserved[2]; /* never read nor write, will be removed in a future version */
+ }; /* typedef'd to XXH64_state_t */
+
+
+# ifdef XXH_PRIVATE_API
+/**** start inlining xxhash.c ****/
+/*
+ * xxHash - Fast Hash algorithm
+ * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - xxHash homepage: http://www.xxhash.com
+ * - xxHash source repository : https://github.com/Cyan4973/xxHash
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+*/
+
+
+/* *************************************
+* Tuning parameters
+***************************************/
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on compiler but violates the C standard.
+ * It can generate buggy code on targets which do not support unaligned memory accesses.
+ * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
+# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+# define XXH_FORCE_MEMORY_ACCESS 2
+# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+ (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) || \
+ defined(__ICCARM__)
+# define XXH_FORCE_MEMORY_ACCESS 1
+# endif
+#endif
+
+/*!XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
+ * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+ * By default, this option is disabled. To enable it, uncomment the define below :
+ */
+/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
+
+/*!XXH_FORCE_NATIVE_FORMAT :
+ * By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
+ * Results are therefore identical for little-endian and big-endian CPU.
+ * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+ * Should endian-independence be of no importance for your application, you may set the #define below to 1,
+ * to improve speed for Big-endian CPU.
+ * This option has no impact on Little_Endian CPU.
+ */
+#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */
+# define XXH_FORCE_NATIVE_FORMAT 0
+#endif
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash; set to 0 when the input data
+ * is guaranteed to be aligned.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+# define XXH_FORCE_ALIGN_CHECK 0
+# else
+# define XXH_FORCE_ALIGN_CHECK 1
+# endif
+#endif
+
+
+/* *************************************
+* Includes & Memory related functions
+***************************************/
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for malloc(), free() */
+#include <stdlib.h>
+#include <stddef.h> /* size_t */
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void XXH_free (void* p) { free(p); }
+/* for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+#ifndef XXH_STATIC_LINKING_ONLY
+# define XXH_STATIC_LINKING_ONLY
+#endif
+/**** skipping file: xxhash.h ****/
+
+
+/* *************************************
+* Compiler Specific Options
+***************************************/
+#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
+# define INLINE_KEYWORD inline
+#else
+# define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__) || defined(__ICCARM__)
+# define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+# define FORCE_INLINE_ATTR __forceinline
+#else
+# define FORCE_INLINE_ATTR
+#endif
+
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
+
+
+#ifdef _MSC_VER
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* *************************************
+* Basic Types
+***************************************/
+#ifndef MEM_MODULE
+# define MEM_MODULE
+# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint8_t BYTE;
+ typedef uint16_t U16;
+ typedef uint32_t U32;
+ typedef int32_t S32;
+ typedef uint64_t U64;
+# else
+ typedef unsigned char BYTE;
+ typedef unsigned short U16;
+ typedef unsigned int U32;
+ typedef signed int S32;
+ typedef unsigned long long U64; /* if your compiler doesn't support unsigned long long, replace by another 64-bit type here. Note that xxhash.h will also need to be updated. */
+# endif
+#endif
+
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
+static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
+static U32 XXH_read32(const void* memPtr)
+{
+ U32 val;
+ memcpy(&val, memPtr, sizeof(val));
+ return val;
+}
+
+static U64 XXH_read64(const void* memPtr)
+{
+ U64 val;
+ memcpy(&val, memPtr, sizeof(val));
+ return val;
+}
+
+#endif /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ****************************************
+* Compiler-specific Functions and Macros
+******************************************/
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#if defined(_MSC_VER)
+# define XXH_rotl32(x,r) _rotl(x,r)
+# define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#if defined(__ICCARM__)
+# include <intrinsics.h>
+# define XXH_rotl32(x,r) __ROR(x,(32 - r))
+#else
+# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#endif
+# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
+#endif
+
+#if defined(_MSC_VER) /* Visual Studio */
+# define XXH_swap32 _byteswap_ulong
+# define XXH_swap64 _byteswap_uint64
+#elif GCC_VERSION >= 403
+# define XXH_swap32 __builtin_bswap32
+# define XXH_swap64 __builtin_bswap64
+#else
+static U32 XXH_swap32 (U32 x)
+{
+ return ((x << 24) & 0xff000000 ) |
+ ((x << 8) & 0x00ff0000 ) |
+ ((x >> 8) & 0x0000ff00 ) |
+ ((x >> 24) & 0x000000ff );
+}
+static U64 XXH_swap64 (U64 x)
+{
+ return ((x << 56) & 0xff00000000000000ULL) |
+ ((x << 40) & 0x00ff000000000000ULL) |
+ ((x << 24) & 0x0000ff0000000000ULL) |
+ ((x << 8) & 0x000000ff00000000ULL) |
+ ((x >> 8) & 0x00000000ff000000ULL) |
+ ((x >> 24) & 0x0000000000ff0000ULL) |
+ ((x >> 40) & 0x000000000000ff00ULL) |
+ ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* *************************************
+* Architecture Macros
+***************************************/
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+ static const int g_one = 1;
+# define XXH_CPU_LITTLE_ENDIAN (*(const char*)(&g_one))
+#endif
+
+
+/* ***************************
+* Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+ if (align==XXH_unaligned)
+ return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+ else
+ return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+{
+ return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+static U32 XXH_readBE32(const void* ptr)
+{
+ return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+ if (align==XXH_unaligned)
+ return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+ else
+ return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+{
+ return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+}
+
+static U64 XXH_readBE64(const void* ptr)
+{
+ return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+
+
+/* *************************************
+* Macros
+***************************************/
+#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */
+
+
+/* *************************************
+* Constants
+***************************************/
+static const U32 PRIME32_1 = 2654435761U;
+static const U32 PRIME32_2 = 2246822519U;
+static const U32 PRIME32_3 = 3266489917U;
+static const U32 PRIME32_4 = 668265263U;
+static const U32 PRIME32_5 = 374761393U;
+
+static const U64 PRIME64_1 = 11400714785074694791ULL;
+static const U64 PRIME64_2 = 14029467366897019727ULL;
+static const U64 PRIME64_3 = 1609587929392839161ULL;
+static const U64 PRIME64_4 = 9650029242287828579ULL;
+static const U64 PRIME64_5 = 2870177450012600261ULL;
+
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* **************************
+* Utils
+****************************/
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dstState, const XXH32_state_t* restrict srcState)
+{
+ memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dstState, const XXH64_state_t* restrict srcState)
+{
+ memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+
+/* ***************************
+* Simple Hash Functions
+*****************************/
+
+static U32 XXH32_round(U32 seed, U32 input)
+{
+ seed += input * PRIME32_2;
+ seed = XXH_rotl32(seed, 13);
+ seed *= PRIME32_1;
+ return seed;
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+ const BYTE* p = (const BYTE*)input;
+ const BYTE* bEnd = p + len;
+ U32 h32;
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+ if (p==NULL) {
+ len=0;
+ bEnd=p=(const BYTE*)(size_t)16;
+ }
+#endif
+
+ if (len>=16) {
+ const BYTE* const limit = bEnd - 16;
+ U32 v1 = seed + PRIME32_1 + PRIME32_2;
+ U32 v2 = seed + PRIME32_2;
+ U32 v3 = seed + 0;
+ U32 v4 = seed - PRIME32_1;
+
+ do {
+ v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4;
+ v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;
+ v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4;
+ v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4;
+ } while (p<=limit);
+
+ h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+ } else {
+ h32 = seed + PRIME32_5;
+ }
+
+ h32 += (U32) len;
+
+ while (p+4<=bEnd) {
+ h32 += XXH_get32bits(p) * PRIME32_3;
+ h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
+ p+=4;
+ }
+
+ while (p<bEnd) {
+ h32 += (*p) * PRIME32_5;
+ h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+ p++;
+ }
+
+ h32 ^= h32 >> 15;
+ h32 *= PRIME32_2;
+ h32 ^= h32 >> 13;
+ h32 *= PRIME32_3;
+ h32 ^= h32 >> 16;
+
+ return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
+{
+#if 0
+ /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+ XXH32_CREATESTATE_STATIC(state);
+ XXH32_reset(state, seed);
+ XXH32_update(state, input, len);
+ return XXH32_digest(state);
+#else
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if (XXH_FORCE_ALIGN_CHECK) {
+ if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+ else
+ return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+ } }
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+ else
+ return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+static U64 XXH64_round(U64 acc, U64 input)
+{
+ acc += input * PRIME64_2;
+ acc = XXH_rotl64(acc, 31);
+ acc *= PRIME64_1;
+ return acc;
+}
+
+static U64 XXH64_mergeRound(U64 acc, U64 val)
+{
+ val = XXH64_round(0, val);
+ acc ^= val;
+ acc = acc * PRIME64_1 + PRIME64_4;
+ return acc;
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+{
+ const BYTE* p = (const BYTE*)input;
+ const BYTE* const bEnd = p + len;
+ U64 h64;
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+ if (p==NULL) {
+ len=0;
+ bEnd=p=(const BYTE*)(size_t)32;
+ }
+#endif
+
+ if (len>=32) {
+ const BYTE* const limit = bEnd - 32;
+ U64 v1 = seed + PRIME64_1 + PRIME64_2;
+ U64 v2 = seed + PRIME64_2;
+ U64 v3 = seed + 0;
+ U64 v4 = seed - PRIME64_1;
+
+ do {
+ v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8;
+ v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8;
+ v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8;
+ v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8;
+ } while (p<=limit);
+
+ h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+ h64 = XXH64_mergeRound(h64, v1);
+ h64 = XXH64_mergeRound(h64, v2);
+ h64 = XXH64_mergeRound(h64, v3);
+ h64 = XXH64_mergeRound(h64, v4);
+
+ } else {
+ h64 = seed + PRIME64_5;
+ }
+
+ h64 += (U64) len;
+
+ while (p+8<=bEnd) {
+ U64 const k1 = XXH64_round(0, XXH_get64bits(p));
+ h64 ^= k1;
+ h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+ p+=8;
+ }
+
+ if (p+4<=bEnd) {
+ h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
+ h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+ p+=4;
+ }
+
+ while (p<bEnd) {
+ h64 ^= (*p) * PRIME64_5;
+ h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+ p++;
+ }
+
+ h64 ^= h64 >> 33;
+ h64 *= PRIME64_2;
+ h64 ^= h64 >> 29;
+ h64 *= PRIME64_3;
+ h64 ^= h64 >> 32;
+
+ return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
+{
+#if 0
+ /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+ XXH64_CREATESTATE_STATIC(state);
+ XXH64_reset(state, seed);
+ XXH64_update(state, input, len);
+ return XXH64_digest(state);
+#else
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if (XXH_FORCE_ALIGN_CHECK) {
+ if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+ else
+ return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+ } }
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+ else
+ return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+/* **************************************************
+* Advanced Hash Functions
+****************************************************/
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+ return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+ XXH_free(statePtr);
+ return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+ return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+ XXH_free(statePtr);
+ return XXH_OK;
+}
+
+
+/*** Hash feed ***/
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed)
+{
+ XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+ memset(&state, 0, sizeof(state)-4); /* do not write into reserved, for future removal */
+ state.v1 = seed + PRIME32_1 + PRIME32_2;
+ state.v2 = seed + PRIME32_2;
+ state.v3 = seed + 0;
+ state.v4 = seed - PRIME32_1;
+ memcpy(statePtr, &state, sizeof(state));
+ return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed)
+{
+ XXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+ memset(&state, 0, sizeof(state)-8); /* do not write into reserved, for future removal */
+ state.v1 = seed + PRIME64_1 + PRIME64_2;
+ state.v2 = seed + PRIME64_2;
+ state.v3 = seed + 0;
+ state.v4 = seed - PRIME64_1;
+ memcpy(statePtr, &state, sizeof(state));
+ return XXH_OK;
+}
+
+
+FORCE_INLINE_TEMPLATE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{
+ const BYTE* p = (const BYTE*)input;
+ const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+ if (input==NULL) return XXH_ERROR;
+#endif
+
+ state->total_len_32 += (unsigned)len;
+ state->large_len |= (len>=16) | (state->total_len_32>=16);
+
+ if (state->memsize + len < 16) { /* fill in tmp buffer */
+ XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+ state->memsize += (unsigned)len;
+ return XXH_OK;
+ }
+
+ if (state->memsize) { /* some data left from previous update */
+ XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+ { const U32* p32 = state->mem32;
+ state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++;
+ state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++;
+ state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++;
+ state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++;
+ }
+ p += 16-state->memsize;
+ state->memsize = 0;
+ }
+
+ if (p <= bEnd-16) {
+ const BYTE* const limit = bEnd - 16;
+ U32 v1 = state->v1;
+ U32 v2 = state->v2;
+ U32 v3 = state->v3;
+ U32 v4 = state->v4;
+
+ do {
+ v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4;
+ v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4;
+ v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4;
+ v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4;
+ } while (p<=limit);
+
+ state->v1 = v1;
+ state->v2 = v2;
+ state->v3 = v3;
+ state->v4 = v4;
+ }
+
+ if (p < bEnd) {
+ XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+ state->memsize = (unsigned)(bEnd-p);
+ }
+
+ return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
+{
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+ else
+ return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE_TEMPLATE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
+{
+ const BYTE * p = (const BYTE*)state->mem32;
+ const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize;
+ U32 h32;
+
+ if (state->large_len) {
+ h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+ } else {
+ h32 = state->v3 /* == seed */ + PRIME32_5;
+ }
+
+ h32 += state->total_len_32;
+
+ while (p+4<=bEnd) {
+ h32 += XXH_readLE32(p, endian) * PRIME32_3;
+ h32 = XXH_rotl32(h32, 17) * PRIME32_4;
+ p+=4;
+ }
+
+ while (p<bEnd) {
+ h32 += (*p) * PRIME32_5;
+ h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+ p++;
+ }
+
+ h32 ^= h32 >> 15;
+ h32 *= PRIME32_2;
+ h32 ^= h32 >> 13;
+ h32 *= PRIME32_3;
+ h32 ^= h32 >> 16;
+
+ return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in)
+{
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH32_digest_endian(state_in, XXH_littleEndian);
+ else
+ return XXH32_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+
+/* **** XXH64 **** */
+
+FORCE_INLINE_TEMPLATE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{
+ const BYTE* p = (const BYTE*)input;
+ const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+ if (input==NULL) return XXH_ERROR;
+#endif
+
+ state->total_len += len;
+
+ if (state->memsize + len < 32) { /* fill in tmp buffer */
+ if (input != NULL) {
+ XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+ }
+ state->memsize += (U32)len;
+ return XXH_OK;
+ }
+
+ if (state->memsize) { /* tmp buffer is full */
+ XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+ state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian));
+ state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian));
+ state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian));
+ state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian));
+ p += 32-state->memsize;
+ state->memsize = 0;
+ }
+
+ if (p+32 <= bEnd) {
+ const BYTE* const limit = bEnd - 32;
+ U64 v1 = state->v1;
+ U64 v2 = state->v2;
+ U64 v3 = state->v3;
+ U64 v4 = state->v4;
+
+ do {
+ v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8;
+ v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8;
+ v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8;
+ v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8;
+ } while (p<=limit);
+
+ state->v1 = v1;
+ state->v2 = v2;
+ state->v3 = v3;
+ state->v4 = v4;
+ }
+
+ if (p < bEnd) {
+ XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+ state->memsize = (unsigned)(bEnd-p);
+ }
+
+ return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
+{
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
+ else
+ return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE_TEMPLATE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
+{
+ const BYTE * p = (const BYTE*)state->mem64;
+ const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize;
+ U64 h64;
+
+ if (state->total_len >= 32) {
+ U64 const v1 = state->v1;
+ U64 const v2 = state->v2;
+ U64 const v3 = state->v3;
+ U64 const v4 = state->v4;
+
+ h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+ h64 = XXH64_mergeRound(h64, v1);
+ h64 = XXH64_mergeRound(h64, v2);
+ h64 = XXH64_mergeRound(h64, v3);
+ h64 = XXH64_mergeRound(h64, v4);
+ } else {
+ h64 = state->v3 + PRIME64_5;
+ }
+
+ h64 += (U64) state->total_len;
+
+ while (p+8<=bEnd) {
+ U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian));
+ h64 ^= k1;
+ h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+ p+=8;
+ }
+
+ if (p+4<=bEnd) {
+ h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
+ h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+ p+=4;
+ }
+
+ while (p<bEnd) {
+ h64 ^= (*p) * PRIME64_5;
+ h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+ p++;
+ }
+
+ h64 ^= h64 >> 33;
+ h64 *= PRIME64_2;
+ h64 ^= h64 >> 29;
+ h64 *= PRIME64_3;
+ h64 ^= h64 >> 32;
+
+ return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in)
+{
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH64_digest_endian(state_in, XXH_littleEndian);
+ else
+ return XXH64_digest_endian(state_in, XXH_bigEndian);
+}
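+
+/* Usage sketch (illustrative only, not part of upstream xxHash; the function
+ * name and the chunk parameters are hypothetical): hashing data delivered in
+ * several pieces with the streaming state API defined above. */
+#if 0
+static unsigned long long xxh64_streaming_example(const void* chunk1, size_t len1,
+                                                  const void* chunk2, size_t len2)
+{
+    unsigned long long h;
+    XXH64_state_t* const st = XXH64_createState();
+    XXH64_reset(st, 0 /* seed */);
+    XXH64_update(st, chunk1, len1);
+    XXH64_update(st, chunk2, len2);
+    h = XXH64_digest(st);    /* equals one-shot XXH64() over the concatenated input */
+    XXH64_freeState(st);
+    return h;
+}
+#endif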
+
+
+/* **************************
+* Canonical representation
+****************************/
+
+/*! Default XXH result types are basic unsigned 32 and 64 bits.
+* The canonical representation follows the human-readable write convention, i.e. big-endian (most significant byte first).
+* These functions allow transformation of hash result into and from its canonical format.
+* This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs.
+*/
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+ XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+ if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+ memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+ XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+ if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+ memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+ return XXH_readBE32(src);
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+ return XXH_readBE64(src);
+}
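+
+/* Usage sketch (illustrative only, not part of upstream xxHash; the function
+ * name is hypothetical and <assert.h> is assumed): writing a hash in its
+ * canonical big-endian form and reading it back, which round-trips identically
+ * on any platform. */
+#if 0
+static void xxh64_canonical_example(void)
+{
+    XXH64_canonical_t canon;
+    unsigned long long const h = XXH64("abc", 3, 0);
+    XXH64_canonicalFromHash(&canon, h);            /* canon holds big-endian bytes */
+    assert(XXH64_hashFromCanonical(&canon) == h);  /* same value everywhere */
+}
+#endif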
+/**** ended inlining xxhash.c ****/
+# endif
+
+#endif /* XXH_STATIC_LINKING_ONLY && XXH_STATIC_H_3543687687345 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+/**** ended inlining xxhash.h ****/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ---- static assert (debug) --- */
+#define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)
+#define ZSTD_isError ERR_isError /* for inlining */
+#define FSE_isError ERR_isError
+#define HUF_isError ERR_isError
+
+
+/*-*************************************
+* shared macros
+***************************************/
+#undef MIN
+#undef MAX
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+/**
+ * Ignore: this is an internal helper.
+ *
+ * This is a helper function to help force C99-correctness during compilation.
+ * Under strict compilation modes, variadic macro arguments can't be empty.
+ * However, variadic function arguments can be. Using a function therefore lets
+ * us statically check that at least one (string) argument was passed,
+ * independent of the compilation flags.
+ */
+static INLINE_KEYWORD UNUSED_ATTR
+void _force_has_format_string(const char *format, ...) {
+ (void)format;
+}
+
+/**
+ * Ignore: this is an internal helper.
+ *
+ * We want to force this function invocation to be syntactically correct, but
+ * we don't want to force runtime evaluation of its arguments.
+ */
+#define _FORCE_HAS_FORMAT_STRING(...) \
+ if (0) { \
+ _force_has_format_string(__VA_ARGS__); \
+ }
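+
+/* Illustrative note (sketch, not part of the zstd sources): if a caller of the
+ * error macros below omitted the format string, __VA_ARGS__ would expand to
+ * nothing and _FORCE_HAS_FORMAT_STRING(__VA_ARGS__) would become the call
+ * _force_has_format_string(), which fails to compile (too few arguments).
+ * Every caller is therefore forced to pass at least a format string, while the
+ * `if (0)` wrapper guarantees the call never executes at runtime. */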
+
+/**
+ * Return the specified error if the condition evaluates to true.
+ *
+ * In debug modes, prints additional information.
+ * In order to do that (particularly, printing the conditional that failed),
+ * this can't just wrap RETURN_ERROR().
+ */
+#define RETURN_ERROR_IF(cond, err, ...) \
+ if (cond) { \
+ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \
+ __FILE__, __LINE__, ZSTD_QUOTE(cond), ZSTD_QUOTE(ERROR(err))); \
+ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+ RAWLOG(3, ": " __VA_ARGS__); \
+ RAWLOG(3, "\n"); \
+ return ERROR(err); \
+ }
+
+/**
+ * Unconditionally return the specified error.
+ *
+ * In debug modes, prints additional information.
+ */
+#define RETURN_ERROR(err, ...) \
+ do { \
+ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
+ __FILE__, __LINE__, ZSTD_QUOTE(ERROR(err))); \
+ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+ RAWLOG(3, ": " __VA_ARGS__); \
+ RAWLOG(3, "\n"); \
+ return ERROR(err); \
+ } while(0);
+
+/**
+ * If the provided expression evaluates to an error code, returns that error code.
+ *
+ * In debug modes, prints additional information.
+ */
+#define FORWARD_IF_ERROR(err, ...) \
+ do { \
+ size_t const err_code = (err); \
+ if (ERR_isError(err_code)) { \
+ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \
+ __FILE__, __LINE__, ZSTD_QUOTE(err), ERR_getErrorName(err_code)); \
+ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+ RAWLOG(3, ": " __VA_ARGS__); \
+ RAWLOG(3, "\n"); \
+ return err_code; \
+ } \
+ } while(0);
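+
+/* Usage sketch (illustrative only, not from the zstd sources; exampleCopy and
+ * exampleCaller are hypothetical helpers): how RETURN_ERROR_IF() and
+ * FORWARD_IF_ERROR() are typically combined in size_t-returning functions. */
+#if 0
+static size_t exampleCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall,
+                    "need %u bytes, have %u", (unsigned)srcSize, (unsigned)dstCapacity);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+static size_t exampleCaller(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    FORWARD_IF_ERROR(exampleCopy(dst, dstCapacity, src, srcSize), "copy failed");
+    return 0;
+}
+#endif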
+
+
+/*-*************************************
+* Common constants
+***************************************/
+#define ZSTD_OPT_NUM (1<<12)
+
+#define ZSTD_REP_NUM 3 /* number of repcodes */
+#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1)
+static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BIT7 128
+#define BIT6 64
+#define BIT5 32
+#define BIT4 16
+#define BIT1 2
+#define BIT0 1
+
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
+static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
+static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };
+
+#define ZSTD_FRAMEIDSIZE 4 /* magic number size */
+
+#define ZSTD_BLOCKHEADERSIZE 3   /* the C standard doesn't allow a `static const` variable to be initialized from another `static const` variable */
+static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
+typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+
+#define ZSTD_FRAMECHECKSUMSIZE 4
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */
+
+#define HufLog 12
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+
+#define Litbits 8
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML 52
+#define MaxLL 35
+#define DefaultMaxOff 28
+#define MaxOff 31
+#define MaxSeq MAX(MaxLL, MaxML) /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog 9
+#define LLFSELog 9
+#define OffFSELog 8
+#define MaxFSELog MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+
+static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 2, 2, 3, 3,
+ 4, 6, 7, 8, 9,10,11,12,
+ 13,14,15,16 };
+static const S16 LL_defaultNorm[MaxLL+1] = { 4, 3, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 3, 2, 1, 1, 1, 1, 1,
+ -1,-1,-1,-1 };
+#define LL_DEFAULTNORMLOG 6 /* for static allocation */
+static const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;
+
+static const U32 ML_bits[MaxML+1] = { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 2, 2, 3, 3,
+ 4, 4, 5, 7, 8, 9,10,11,
+ 12,13,14,15,16 };
+static const S16 ML_defaultNorm[MaxML+1] = { 1, 4, 3, 2, 2, 2, 2, 2,
+ 2, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1,-1,-1,
+ -1,-1,-1,-1,-1 };
+#define ML_DEFAULTNORMLOG 6 /* for static allocation */
+static const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;
+
+static const S16 OF_defaultNorm[DefaultMaxOff+1] = { 1, 1, 1, 1, 1, 1, 2, 2,
+ 2, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ -1,-1,-1,-1,-1 };
+#define OF_DEFAULTNORMLOG 5 /* for static allocation */
+static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
+
+
+/*-*******************************************
+* Shared functions to include for inlining
+*********************************************/
+static void ZSTD_copy8(void* dst, const void* src) {
+#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON)
+ vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
+#else
+ memcpy(dst, src, 8);
+#endif
+}
+
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+static void ZSTD_copy16(void* dst, const void* src) {
+#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON)
+ vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
+#else
+ memcpy(dst, src, 16);
+#endif
+}
+#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
+
+#define WILDCOPY_OVERLENGTH 32
+#define WILDCOPY_VECLEN 16
+
+typedef enum {
+ ZSTD_no_overlap,
+ ZSTD_overlap_src_before_dst
+ /* ZSTD_overlap_dst_before_src, */
+} ZSTD_overlap_e;
+
+/*! ZSTD_wildcopy() :
+ * Custom version of memcpy(); may over-read/over-write up to WILDCOPY_OVERLENGTH bytes beyond the requested length (it copies at least one chunk even when length==0)
+ * @param ovtype controls the overlap detection
+ * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
+ * - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart.
+ * The src buffer must be before the dst buffer.
+ */
+MEM_STATIC FORCE_INLINE_ATTR
+void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e const ovtype)
+{
+ ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
+ const BYTE* ip = (const BYTE*)src;
+ BYTE* op = (BYTE*)dst;
+ BYTE* const oend = op + length;
+
+ assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff <= -WILDCOPY_VECLEN));
+
+ if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
+ /* Handle short offset copies. */
+ do {
+ COPY8(op, ip)
+ } while (op < oend);
+ } else {
+ assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
+ /* Separate out the first COPY16() call because the copy length is
+ * almost certain to be short, so the branches have different
+ * probabilities. Since it is almost certain to be short, only do
+ * one COPY16() in the first call. Then, do two calls per loop since
+ * at that point it is more likely to have a high trip count.
+ */
+#ifndef __aarch64__
+ do {
+ COPY16(op, ip);
+ }
+ while (op < oend);
+#else
+ COPY16(op, ip);
+ if (op >= oend) return;
+ do {
+ COPY16(op, ip);
+ COPY16(op, ip);
+ }
+ while (op < oend);
+#endif
+ }
+}
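+
+/* Usage sketch (illustrative only, not part of the zstd sources; the function
+ * name and buffers are hypothetical): the destination is assumed to provide
+ * WILDCOPY_OVERLENGTH bytes of slack past `litLength`, because the copy above
+ * proceeds in 16-byte steps and may run past the requested length. */
+#if 0
+static void wildcopy_example(BYTE* dst, const BYTE* lit, size_t litLength)
+{
+    /* dst must have litLength + WILDCOPY_OVERLENGTH writable bytes,
+     * and lit must sit at least WILDCOPY_VECLEN bytes away from dst. */
+    ZSTD_wildcopy(dst, lit, (ptrdiff_t)litLength, ZSTD_no_overlap);
+}
+#endif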
+
+MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ size_t const length = MIN(dstCapacity, srcSize);
+ if (length > 0) {
+ memcpy(dst, src, length);
+ }
+ return length;
+}
+
+/* define "workspace is too large" as this number of times larger than needed */
+#define ZSTD_WORKSPACETOOLARGE_FACTOR 3
+
+/* when the workspace has been too large
+ * for at least this many consecutive uses,
+ * the context's memory usage is considered wasteful,
+ * because it is sized for a worst-case scenario that rarely happens;
+ * in that case, resize it down to free some memory */
+#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128
+
+
+/*-*******************************************
+* Private declarations
+*********************************************/
+typedef struct seqDef_s {
+ U32 offset;
+ U16 litLength;
+ U16 matchLength;
+} seqDef;
+
+typedef struct {
+ seqDef* sequencesStart;
+ seqDef* sequences;
+ BYTE* litStart;
+ BYTE* lit;
+ BYTE* llCode;
+ BYTE* mlCode;
+ BYTE* ofCode;
+ size_t maxNbSeq;
+ size_t maxNbLit;
+ U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
+ U32 longLengthPos;
+} seqStore_t;
+
+typedef struct {
+ U32 litLength;
+ U32 matchLength;
+} ZSTD_sequenceLength;
+
+/**
+ * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences
+ * indicated by longLengthPos and longLengthID, and adds MINMATCH back to matchLength.
+ */
+MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq)
+{
+ ZSTD_sequenceLength seqLen;
+ seqLen.litLength = seq->litLength;
+ seqLen.matchLength = seq->matchLength + MINMATCH;
+ if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {
+ if (seqStore->longLengthID == 1) {
+ seqLen.litLength += 0xFFFF;
+ }
+ if (seqStore->longLengthID == 2) {
+ seqLen.matchLength += 0xFFFF;
+ }
+ }
+ return seqLen;
+}
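+
+/* Worked example (illustrative only): if seq->matchLength == 120 and this is
+ * the sequence flagged by longLengthPos with longLengthID == 2, the decoded
+ * match length is 120 + MINMATCH + 0xFFFF = 120 + 3 + 65535 = 65658. */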
+
+/**
+ * Contains the compressed frame size and an upper-bound for the decompressed frame size.
+ * Note: before using `compressedSize`, check for errors using ZSTD_isError().
+ *       Similarly, before using `decompressedBound`, check that
+ *       `decompressedBound != ZSTD_CONTENTSIZE_ERROR`.
+ */
+typedef struct {
+ size_t compressedSize;
+ unsigned long long decompressedBound;
+} ZSTD_frameSizeInfo; /* decompress & legacy */
+
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
+
+/* custom memory allocation functions */
+void* ZSTD_malloc(size_t size, ZSTD_customMem customMem);
+void* ZSTD_calloc(size_t size, ZSTD_customMem customMem);
+void ZSTD_free(void* ptr, ZSTD_customMem customMem);
+
+
+MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */
+{
+ assert(val != 0);
+ {
+# if defined(_MSC_VER) /* Visual */
+ unsigned long r=0;
+ return _BitScanReverse(&r, val) ? (unsigned)r : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC Intrinsic */
+ return __builtin_clz (val) ^ 31;
+# elif defined(__ICCARM__) /* IAR Intrinsic */
+ return 31 - __CLZ(val);
+# else /* Software version */
+ static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+ U32 v = val;
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ return DeBruijnClz[(v * 0x07C4ACDDU) >> 27];
+# endif
+ }
+}
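+
+/* Worked example (illustrative only): ZSTD_highbit32(1) == 0,
+ * ZSTD_highbit32(32) == 5, ZSTD_highbit32(0xFFFFFFFF) == 31,
+ * i.e. the zero-based position of the highest set bit. */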
+
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ * do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx); /* zstdmt, adaptive_compression (shouldn't get this definition from here) */
+
+
+typedef struct {
+ blockType_e blockType;
+ U32 lastBlock;
+ U32 origSize;
+} blockProperties_t; /* declared here for decompress and fullbench */
+
+/*! ZSTD_getcBlockSize() :
+ * Provides the size of compressed block from block header `src` */
+/* Used by: decompress, fullbench (does not get its definition from here) */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+ blockProperties_t* bpPtr);
+
+/*! ZSTD_decodeSeqHeaders() :
+ * decode sequence header from src */
+/* Used by: decompress, fullbench (does not get its definition from here) */
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+ const void* src, size_t srcSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_CCOMMON_H_MODULE */
+/**** ended inlining zstd_internal.h ****/
+/**** start inlining pool.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef POOL_H
+#define POOL_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+#include <stddef.h> /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_customMem */
+/**** skipping file: ../zstd.h ****/
+
+typedef struct POOL_ctx_s POOL_ctx;
+
+/*! POOL_create() :
+ * Create a thread pool with at most `numThreads` threads.
+ * `numThreads` must be at least 1.
+ * The maximum number of queued jobs before blocking is `queueSize`.
+ * @return : POOL_ctx pointer on success, else NULL.
+*/
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize);
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+ ZSTD_customMem customMem);
+
+/*! POOL_free() :
+ * Free a thread pool returned by POOL_create().
+ */
+void POOL_free(POOL_ctx* ctx);
+
+/*! POOL_resize() :
+ * Expands or shrinks pool's number of threads.
+ * This is more efficient than releasing + creating a new context,
+ * since it tries to preserve and re-use existing threads.
+ * `numThreads` must be at least 1.
+ * @return : 0 when resize was successful,
+ * !0 (typically 1) if there is an error.
+ * note : only numThreads can be resized, queueSize remains unchanged.
+ */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads);
+
+/*! POOL_sizeof() :
+ * @return threadpool memory usage
+ * note : compatible with NULL (returns 0 in this case)
+ */
+size_t POOL_sizeof(POOL_ctx* ctx);
+
+/*! POOL_function :
+ * The function type that can be added to a thread pool.
+ */
+typedef void (*POOL_function)(void*);
+
+/*! POOL_add() :
+ * Add the job `function(opaque)` to the thread pool. `ctx` must be valid.
+ * Possibly blocks until there is room in the queue.
+ * Note : The function may be executed asynchronously,
+ *        therefore `opaque` must remain valid until the function has completed.
+ */
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+/*! POOL_tryAdd() :
+ * Add the job `function(opaque)` to thread pool _if_ a worker is available.
+ * Returns immediately even if not (does not block).
+ * @return : 1 if successful, 0 if not.
+ */
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque);
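+
+/* Usage sketch (illustrative only, not part of the zstd sources; pool_job and
+ * pool_example are hypothetical): creating a pool, queuing work through the
+ * blocking and non-blocking entry points, then tearing the pool down. */
+#if 0
+static void pool_job(void* opaque) { (void)opaque; /* do some work */ }
+
+static int pool_example(void)
+{
+    POOL_ctx* const pool = POOL_create(4 /* threads */, 8 /* max queued jobs */);
+    if (pool == NULL) return 1;
+    POOL_add(pool, pool_job, NULL);            /* may block while the queue is full */
+    (void)POOL_tryAdd(pool, pool_job, NULL);   /* returns 0 immediately if no room */
+    POOL_free(pool);                           /* joins the worker threads */
+    return 0;
+}
+#endif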
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
+/**** ended inlining pool.h ****/
+
+/* ====== Compiler specifics ====== */
+#if defined(_MSC_VER)
+# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+#ifdef ZSTD_MULTITHREAD
+
+/**** start inlining threading.h ****/
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef THREADING_H_938743
+#define THREADING_H_938743
+
+/**** skipping file: debug.h ****/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+#ifdef WINVER
+# undef WINVER
+#endif
+#define WINVER 0x0600
+
+#ifdef _WIN32_WINNT
+# undef _WIN32_WINNT
+#endif
+#define _WIN32_WINNT 0x0600
+
+#ifndef WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN
+#endif
+
+#undef ERROR /* reported already defined on VS 2015 (Rich Geldreich) */
+#include <windows.h>
+#undef ERROR
+#define ERROR(name) ZSTD_ERROR(name)
+
+
+/* mutex */
+#define ZSTD_pthread_mutex_t CRITICAL_SECTION
+#define ZSTD_pthread_mutex_init(a, b) ((void)(b), InitializeCriticalSection((a)), 0)
+#define ZSTD_pthread_mutex_destroy(a) DeleteCriticalSection((a))
+#define ZSTD_pthread_mutex_lock(a) EnterCriticalSection((a))
+#define ZSTD_pthread_mutex_unlock(a) LeaveCriticalSection((a))
+
+/* condition variable */
+#define ZSTD_pthread_cond_t CONDITION_VARIABLE
+#define ZSTD_pthread_cond_init(a, b) ((void)(b), InitializeConditionVariable((a)), 0)
+#define ZSTD_pthread_cond_destroy(a) ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b) SleepConditionVariableCS((a), (b), INFINITE)
+#define ZSTD_pthread_cond_signal(a) WakeConditionVariable((a))
+#define ZSTD_pthread_cond_broadcast(a) WakeAllConditionVariable((a))
+
+/* ZSTD_pthread_create() and ZSTD_pthread_join() */
+typedef struct {
+ HANDLE handle;
+ void* (*start_routine)(void*);
+ void* arg;
+} ZSTD_pthread_t;
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+ void* (*start_routine) (void*), void* arg);
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr);
+
+/**
+ * add here more wrappers as required
+ */
+
+
+#elif defined(ZSTD_MULTITHREAD) /* posix assumed ; need a better detection method */
+/* === POSIX Systems === */
+# include <pthread.h>
+
+#if DEBUGLEVEL < 1
+
+#define ZSTD_pthread_mutex_t pthread_mutex_t
+#define ZSTD_pthread_mutex_init(a, b) pthread_mutex_init((a), (b))
+#define ZSTD_pthread_mutex_destroy(a) pthread_mutex_destroy((a))
+#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock((a))
+#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock((a))
+
+#define ZSTD_pthread_cond_t pthread_cond_t
+#define ZSTD_pthread_cond_init(a, b) pthread_cond_init((a), (b))
+#define ZSTD_pthread_cond_destroy(a) pthread_cond_destroy((a))
+#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait((a), (b))
+#define ZSTD_pthread_cond_signal(a) pthread_cond_signal((a))
+#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast((a))
+
+#define ZSTD_pthread_t pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a, b) pthread_join((a),(b))
+
+#else /* DEBUGLEVEL >= 1 */
+
+/* Debug implementation of threading.
+ * In this implementation we use pointers for mutexes and condition variables.
+ * This way, if we forget to init/destroy them the program will crash or ASAN
+ * will report leaks.
+ */
+
+#define ZSTD_pthread_mutex_t pthread_mutex_t*
+int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr);
+int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex);
+#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock(*(a))
+#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock(*(a))
+
+#define ZSTD_pthread_cond_t pthread_cond_t*
+int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr);
+int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond);
+#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait(*(a), *(b))
+#define ZSTD_pthread_cond_signal(a) pthread_cond_signal(*(a))
+#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast(*(a))
+
+#define ZSTD_pthread_t pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a, b) pthread_join((a),(b))
+
+#endif
+
+#else /* ZSTD_MULTITHREAD not defined */
+/* No multithreading support */
+
+typedef int ZSTD_pthread_mutex_t;
+#define ZSTD_pthread_mutex_init(a, b) ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_mutex_destroy(a) ((void)(a))
+#define ZSTD_pthread_mutex_lock(a) ((void)(a))
+#define ZSTD_pthread_mutex_unlock(a) ((void)(a))
+
+typedef int ZSTD_pthread_cond_t;
+#define ZSTD_pthread_cond_init(a, b) ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_cond_destroy(a) ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b) ((void)(a), (void)(b))
+#define ZSTD_pthread_cond_signal(a) ((void)(a))
+#define ZSTD_pthread_cond_broadcast(a) ((void)(a))
+
+/* do not use ZSTD_pthread_t */
+
+#endif /* ZSTD_MULTITHREAD */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* THREADING_H_938743 */
+/**** ended inlining threading.h ****/
+
+/* A job is a function and an opaque argument */
+typedef struct POOL_job_s {
+ POOL_function function;
+ void *opaque;
+} POOL_job;
+
+struct POOL_ctx_s {
+ ZSTD_customMem customMem;
+ /* Keep track of the threads */
+ ZSTD_pthread_t* threads;
+ size_t threadCapacity;
+ size_t threadLimit;
+
+ /* The queue is a circular buffer */
+ POOL_job *queue;
+ size_t queueHead;
+ size_t queueTail;
+ size_t queueSize;
+
+ /* The number of threads working on jobs */
+ size_t numThreadsBusy;
+ /* Indicates if the queue is empty */
+ int queueEmpty;
+
+ /* The mutex protects the queue */
+ ZSTD_pthread_mutex_t queueMutex;
+ /* Condition variable for pushers to wait on when the queue is full */
+ ZSTD_pthread_cond_t queuePushCond;
+ /* Condition variables for poppers to wait on when the queue is empty */
+ ZSTD_pthread_cond_t queuePopCond;
+ /* Indicates if the queue is shutting down */
+ int shutdown;
+};
+
+/* POOL_thread() :
+ * Work thread for the thread pool.
+ * Waits for jobs and executes them.
+ * @returns : NULL on failure else non-null.
+ */
+static void* POOL_thread(void* opaque) {
+ POOL_ctx* const ctx = (POOL_ctx*)opaque;
+ if (!ctx) { return NULL; }
+ for (;;) {
+ /* Lock the mutex and wait for a non-empty queue or until shutdown */
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+
+ while ( ctx->queueEmpty
+ || (ctx->numThreadsBusy >= ctx->threadLimit) ) {
+ if (ctx->shutdown) {
+            /* even if !queueEmpty (possible when numThreadsBusy >= threadLimit),
+             * a few threads will shut down while the queue is still non-empty,
+             * but enough threads will remain active to finish the queue */
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ return opaque;
+ }
+ ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex);
+ }
+ /* Pop a job off the queue */
+ { POOL_job const job = ctx->queue[ctx->queueHead];
+ ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize;
+ ctx->numThreadsBusy++;
+ ctx->queueEmpty = ctx->queueHead == ctx->queueTail;
+ /* Unlock the mutex, signal a pusher, and run the job */
+ ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+
+ job.function(job.opaque);
+
+ /* If the intended queue size was 0, signal after finishing job */
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+ ctx->numThreadsBusy--;
+ if (ctx->queueSize == 1) {
+ ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+ }
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ }
+ } /* for (;;) */
+ assert(0); /* Unreachable */
+}
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
+ return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+ ZSTD_customMem customMem) {
+ POOL_ctx* ctx;
+ /* Check parameters */
+ if (!numThreads) { return NULL; }
+ /* Allocate the context and zero initialize */
+ ctx = (POOL_ctx*)ZSTD_calloc(sizeof(POOL_ctx), customMem);
+ if (!ctx) { return NULL; }
+ /* Initialize the job queue.
+ * It needs one extra space since one space is wasted to differentiate
+ * empty and full queues.
+ */
+ ctx->queueSize = queueSize + 1;
+ ctx->queue = (POOL_job*)ZSTD_malloc(ctx->queueSize * sizeof(POOL_job), customMem);
+ ctx->queueHead = 0;
+ ctx->queueTail = 0;
+ ctx->numThreadsBusy = 0;
+ ctx->queueEmpty = 1;
+ {
+ int error = 0;
+ error |= ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL);
+ error |= ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL);
+ error |= ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL);
+ if (error) { POOL_free(ctx); return NULL; }
+ }
+ ctx->shutdown = 0;
+ /* Allocate space for the thread handles */
+ ctx->threads = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), customMem);
+ ctx->threadCapacity = 0;
+ ctx->customMem = customMem;
+ /* Check for errors */
+ if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; }
+ /* Initialize the threads */
+ { size_t i;
+ for (i = 0; i < numThreads; ++i) {
+ if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) {
+ ctx->threadCapacity = i;
+ POOL_free(ctx);
+ return NULL;
+ } }
+ ctx->threadCapacity = numThreads;
+ ctx->threadLimit = numThreads;
+ }
+ return ctx;
+}
+
+/*! POOL_join() :
+ Shutdown the queue, wake any sleeping threads, and join all of the threads.
+*/
+static void POOL_join(POOL_ctx* ctx) {
+ /* Shut down the queue */
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+ ctx->shutdown = 1;
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ /* Wake up sleeping threads */
+ ZSTD_pthread_cond_broadcast(&ctx->queuePushCond);
+ ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
+ /* Join all of the threads */
+ { size_t i;
+ for (i = 0; i < ctx->threadCapacity; ++i) {
+ ZSTD_pthread_join(ctx->threads[i], NULL); /* note : could fail */
+ } }
+}
+
+void POOL_free(POOL_ctx *ctx) {
+ if (!ctx) { return; }
+ POOL_join(ctx);
+ ZSTD_pthread_mutex_destroy(&ctx->queueMutex);
+ ZSTD_pthread_cond_destroy(&ctx->queuePushCond);
+ ZSTD_pthread_cond_destroy(&ctx->queuePopCond);
+ ZSTD_free(ctx->queue, ctx->customMem);
+ ZSTD_free(ctx->threads, ctx->customMem);
+ ZSTD_free(ctx, ctx->customMem);
+}
+
+
+
+size_t POOL_sizeof(POOL_ctx *ctx) {
+ if (ctx==NULL) return 0; /* supports sizeof NULL */
+ return sizeof(*ctx)
+ + ctx->queueSize * sizeof(POOL_job)
+ + ctx->threadCapacity * sizeof(ZSTD_pthread_t);
+}
+
+
+/* @return : 0 on success, 1 on error */
+static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads)
+{
+ if (numThreads <= ctx->threadCapacity) {
+ if (!numThreads) return 1;
+ ctx->threadLimit = numThreads;
+ return 0;
+ }
+ /* numThreads > threadCapacity */
+ { ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem);
+ if (!threadPool) return 1;
+ /* replace existing thread pool */
+ memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool));
+ ZSTD_free(ctx->threads, ctx->customMem);
+ ctx->threads = threadPool;
+ /* Initialize additional threads */
+ { size_t threadId;
+ for (threadId = ctx->threadCapacity; threadId < numThreads; ++threadId) {
+ if (ZSTD_pthread_create(&threadPool[threadId], NULL, &POOL_thread, ctx)) {
+ ctx->threadCapacity = threadId;
+ return 1;
+ } }
+ } }
+ /* successfully expanded */
+ ctx->threadCapacity = numThreads;
+ ctx->threadLimit = numThreads;
+ return 0;
+}
+
+/* @return : 0 on success, 1 on error */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads)
+{
+ int result;
+ if (ctx==NULL) return 1;
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+ result = POOL_resize_internal(ctx, numThreads);
+ ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ return result;
+}
+
+/**
+ * Returns 1 if the queue is full and 0 otherwise.
+ *
+ * When queueSize is 1 (pool was created with an intended queueSize of 0),
+ * then a queue is empty if there is a thread free _and_ no job is waiting.
+ */
+static int isQueueFull(POOL_ctx const* ctx) {
+ if (ctx->queueSize > 1) {
+ return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize);
+ } else {
+ return (ctx->numThreadsBusy == ctx->threadLimit) ||
+ !ctx->queueEmpty;
+ }
+}
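+
+/* Worked example (illustrative only): with queueSize == 4 (created with an
+ * intended queueSize of 3), queueHead == 0 and queueTail == 3 give
+ * (queueTail + 1) % queueSize == 0 == queueHead, so the queue reports full
+ * while holding 3 jobs; the extra slot is what distinguishes a full queue
+ * from an empty one (queueHead == queueTail). */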
+
+
+static void POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque)
+{
+ POOL_job const job = {function, opaque};
+ assert(ctx != NULL);
+ if (ctx->shutdown) return;
+
+ ctx->queueEmpty = 0;
+ ctx->queue[ctx->queueTail] = job;
+ ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize;
+ ZSTD_pthread_cond_signal(&ctx->queuePopCond);
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+ assert(ctx != NULL);
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+ /* Wait until there is space in the queue for the new job */
+ while (isQueueFull(ctx) && (!ctx->shutdown)) {
+ ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
+ }
+ POOL_add_internal(ctx, function, opaque);
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+}
+
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+ assert(ctx != NULL);
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+ if (isQueueFull(ctx)) {
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ return 0;
+ }
+ POOL_add_internal(ctx, function, opaque);
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ return 1;
+}
+
+
+#else /* ZSTD_MULTITHREAD not defined */
+
+/* ========================== */
+/* No multi-threading support */
+/* ========================== */
+
+
+/* We don't need any data, but if it is empty, malloc() might return NULL. */
+struct POOL_ctx_s {
+ int dummy;
+};
+static POOL_ctx g_ctx;
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
+ return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) {
+ (void)numThreads;
+ (void)queueSize;
+ (void)customMem;
+ return &g_ctx;
+}
+
+void POOL_free(POOL_ctx* ctx) {
+ assert(!ctx || ctx == &g_ctx);
+ (void)ctx;
+}
+
+int POOL_resize(POOL_ctx* ctx, size_t numThreads) {
+ (void)ctx; (void)numThreads;
+ return 0;
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) {
+ (void)ctx;
+ function(opaque);
+}
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) {
+ (void)ctx;
+ function(opaque);
+ return 1;
+}
+
+size_t POOL_sizeof(POOL_ctx* ctx) {
+ if (ctx==NULL) return 0; /* supports sizeof NULL */
+ assert(ctx == &g_ctx);
+ return sizeof(*ctx);
+}
+
+#endif /* ZSTD_MULTITHREAD */
+/**** ended inlining common/pool.c ****/
+/**** start inlining common/zstd_common.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include <stdlib.h> /* malloc, calloc, free */
+#include <string.h> /* memset */
+/**** skipping file: error_private.h ****/
+/**** skipping file: zstd_internal.h ****/
+
+
+/*-****************************************
+* Version
+******************************************/
+unsigned ZSTD_versionNumber(void) { return ZSTD_VERSION_NUMBER; }
+
+const char* ZSTD_versionString(void) { return ZSTD_VERSION_STRING; }
+
+
+/*-****************************************
+* ZSTD Error Management
+******************************************/
+#undef ZSTD_isError /* defined within zstd_internal.h */
+/*! ZSTD_isError() :
+ * tells if a return value is an error code
+ * symbol is required for external callers */
+unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+/*! ZSTD_getErrorName() :
+ * provides error code string from function result (useful for debugging) */
+const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+/*! ZSTD_getError() :
+ * convert a `size_t` function result into a proper ZSTD_errorCode enum */
+ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
+
+/*! ZSTD_getErrorString() :
+ * provides error code string from enum */
+const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
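+
+/* Usage sketch (illustrative only, not from the zstd sources; the function name
+ * is hypothetical and <stdio.h> is assumed): the standard pattern for checking
+ * any size_t-returning zstd API with the helpers above. */
+#if 0
+static void zstd_error_example(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t const ret = ZSTD_compress(dst, dstCapacity, src, srcSize, 3 /* level */);
+    if (ZSTD_isError(ret))
+        printf("zstd error: %s\n", ZSTD_getErrorName(ret));
+}
+#endif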
+
+
+
+/*=**************************************************************
+* Custom allocator
+****************************************************************/
+void* ZSTD_malloc(size_t size, ZSTD_customMem customMem)
+{
+ if (customMem.customAlloc)
+ return customMem.customAlloc(customMem.opaque, size);
+ return malloc(size);
+}
+
+void* ZSTD_calloc(size_t size, ZSTD_customMem customMem)
+{
+ if (customMem.customAlloc) {
+ /* calloc implemented as malloc+memset;
+ * not as efficient as calloc, but next best guess for custom malloc */
+ void* const ptr = customMem.customAlloc(customMem.opaque, size);
+ memset(ptr, 0, size);
+ return ptr;
+ }
+ return calloc(1, size);
+}
+
+void ZSTD_free(void* ptr, ZSTD_customMem customMem)
+{
+ if (ptr!=NULL) {
+ if (customMem.customFree)
+ customMem.customFree(customMem.opaque, ptr);
+ else
+ free(ptr);
+ }
+}
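+
+/* Usage sketch (illustrative only, not part of the zstd sources; myAlloc, myFree
+ * and myMem are hypothetical): routing allocations through a caller-supplied
+ * allocator by filling in a ZSTD_customMem. */
+#if 0
+static void* myAlloc(void* opaque, size_t size) { (void)opaque; return malloc(size); }
+static void  myFree (void* opaque, void* ptr)   { (void)opaque; free(ptr); }
+static const ZSTD_customMem myMem = { myAlloc, myFree, NULL };
+/* ZSTD_malloc(64, myMem) then calls myAlloc(NULL, 64) instead of malloc(64). */
+#endif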
+/**** ended inlining common/zstd_common.c ****/
+
+/**** start inlining compress/fse_compress.c ****/
+/* ******************************************************************
+ * FSE : Finite State Entropy encoder
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memcpy, memset */
+/**** skipping file: ../common/compiler.h ****/
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: ../common/debug.h ****/
+/**** start inlining hist.h ****/
+/* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* --- dependencies --- */
+#include <stddef.h> /* size_t */
+
+
+/* --- simple histogram functions --- */
+
+/*! HIST_count():
+ * Provides the precise count of each byte within a table 'count'.
+ * 'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
+ * Updates *maxSymbolValuePtr with actual largest symbol value detected.
+ * @return : count of the most frequent symbol (which isn't identified).
+ * or an error code, which can be tested using HIST_isError().
+ * note : if return == srcSize, there is only one symbol.
+ */
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize);
+
+unsigned HIST_isError(size_t code); /**< tells if a return value is an error code */
+
+
+/* --- advanced histogram functions --- */
+
+#define HIST_WKSP_SIZE_U32 1024
+#define HIST_WKSP_SIZE (HIST_WKSP_SIZE_U32 * sizeof(unsigned))
+/** HIST_count_wksp() :
+ * Same as HIST_count(), but using an externally provided scratch buffer.
+ * Benefit is this function will use very little stack space.
+ * `workSpace` is a writable buffer which must be 4-bytes aligned,
+ * `workSpaceSize` must be >= HIST_WKSP_SIZE
+ */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize,
+ void* workSpace, size_t workSpaceSize);
+
+/** HIST_countFast() :
+ * same as HIST_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr.
+ * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr`
+ */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize);
+
+/** HIST_countFast_wksp() :
+ * Same as HIST_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` is a writable buffer which must be 4-bytes aligned,
+ * `workSpaceSize` must be >= HIST_WKSP_SIZE
+ */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize,
+ void* workSpace, size_t workSpaceSize);
+
+/*! HIST_count_simple() :
+ * Same as HIST_countFast(), this function is unsafe,
+ * and will segfault if any value within `src` is `> *maxSymbolValuePtr`.
+ * It is also a bit slower for large inputs.
+ * However, it does not need any additional memory (not even on stack).
+ * @return : count of the most frequent symbol.
+ * Note this function doesn't produce any error (i.e. it must succeed).
+ */
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize);
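+
+/* Usage sketch (illustrative only, not from the zstd sources; the function name
+ * and sample buffer are hypothetical): counting byte frequencies with
+ * HIST_count() and checking the result. */
+#if 0
+static void hist_example(const void* sample, size_t sampleSize)
+{
+    unsigned count[256];
+    unsigned maxSymbolValue = 255;  /* in: upper bound; out: largest symbol actually seen */
+    size_t const largest = HIST_count(count, &maxSymbolValue, sample, sampleSize);
+    if (HIST_isError(largest)) { /* handle error */ }
+}
+#endif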
+/**** ended inlining hist.h ****/
+/**** skipping file: ../common/bitstream.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: ../common/fse.h ****/
+/**** skipping file: ../common/error_private.h ****/
+
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+
+
+/* **************************************************************
+* Templates
+****************************************************************/
+/*
+ designed to be included
+ for type-specific functions (template emulation in C)
+ Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+# error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+# error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * wkspSize should be sized to handle the worst case, which is `(1<<max_tableLog) * sizeof(FSE_FUNCTION_TYPE)`
+ * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct,
+ const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+ void* workSpace, size_t wkspSize)
+{
+ U32 const tableSize = 1 << tableLog;
+ U32 const tableMask = tableSize - 1;
+ void* const ptr = ct;
+ U16* const tableU16 = ( (U16*) ptr) + 2;
+ void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
+ FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+ U32 const step = FSE_TABLESTEP(tableSize);
+ U32 cumul[FSE_MAX_SYMBOL_VALUE+2];
+
+ FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)workSpace;
+ U32 highThreshold = tableSize-1;
+
+ /* CTable header */
+ if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge);
+ tableU16[-2] = (U16) tableLog;
+ tableU16[-1] = (U16) maxSymbolValue;
+ assert(tableLog < 16); /* required for threshold strategy to work */
+
+ /* For explanations on how to distribute symbol values over the table :
+ * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+ #ifdef __clang_analyzer__
+ memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */
+ #endif
+
+ /* symbol start positions */
+ { U32 u;
+ cumul[0] = 0;
+ for (u=1; u <= maxSymbolValue+1; u++) {
+ if (normalizedCounter[u-1]==-1) { /* Low proba symbol */
+ cumul[u] = cumul[u-1] + 1;
+ tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
+ } else {
+ cumul[u] = cumul[u-1] + normalizedCounter[u-1];
+ } }
+ cumul[maxSymbolValue+1] = tableSize+1;
+ }
+
+ /* Spread symbols */
+ { U32 position = 0;
+ U32 symbol;
+ for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+ int nbOccurrences;
+ int const freq = normalizedCounter[symbol];
+ for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) {
+ tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+ position = (position + step) & tableMask;
+ while (position > highThreshold)
+ position = (position + step) & tableMask; /* Low proba area */
+ } }
+
+ assert(position==0); /* Must have initialized all positions */
+ }
+
+ /* Build table */
+ { U32 u; for (u=0; u<tableSize; u++) {
+ FSE_FUNCTION_TYPE s = tableSymbol[u]; /* note : static analyzer may not understand tableSymbol is properly initialized */
+ tableU16[cumul[s]++] = (U16) (tableSize+u); /* TableU16 : sorted by symbol order; gives next state value */
+ } }
+
+ /* Build Symbol Transformation Table */
+ { unsigned total = 0;
+ unsigned s;
+ for (s=0; s<=maxSymbolValue; s++) {
+ switch (normalizedCounter[s])
+ {
+ case 0:
+ /* filling nonetheless, for compatibility with FSE_getMaxNbBits() */
+ symbolTT[s].deltaNbBits = ((tableLog+1) << 16) - (1<<tableLog);
+ break;
+
+ case -1:
+ case 1:
+ symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
+ symbolTT[s].deltaFindState = total - 1;
+ total ++;
+ break;
+ default :
+ {
+ U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1);
+ U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
+ symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+ symbolTT[s].deltaFindState = total - normalizedCounter[s];
+ total += normalizedCounter[s];
+ } } } }
+
+#if 0 /* debug : symbol costs */
+ DEBUGLOG(5, "\n --- table statistics : ");
+ { U32 symbol;
+ for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+ DEBUGLOG(5, "%3u: w=%3i, maxBits=%u, fracBits=%.2f",
+ symbol, normalizedCounter[symbol],
+ FSE_getMaxNbBits(symbolTT, symbol),
+ (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256);
+ }
+ }
+#endif
+
+ return 0;
+}
+
+
+size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE];   /* memset() is not necessary, even if the static analyzer complains about it */
+ return FSE_buildCTable_wksp(ct, normalizedCounter, maxSymbolValue, tableLog, tableSymbol, sizeof(tableSymbol));
+}
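+
+/* Illustrative sketch (kept out of the build, like the debug blocks in this
+ * file) : using the workspace variant with statically-sized storage, so no
+ * heap allocation is needed. The example function name is hypothetical. */
+#if 0
+static size_t example_buildCTable_noMalloc(const short* norm,
+                                           unsigned maxSymbolValue, unsigned tableLog)
+{
+    /* CTable and scratch dimensioned for the worst case */
+    FSE_CTable ct[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
+    FSE_FUNCTION_TYPE scratch[FSE_MAX_TABLESIZE];   /* >= (1<<tableLog) entries, suitably aligned */
+    size_t const err = FSE_buildCTable_wksp(ct, norm, maxSymbolValue, tableLog,
+                                            scratch, sizeof(scratch));
+    if (FSE_isError(err)) return err;
+    /* ct is now ready for FSE_compress_usingCTable() */
+    return 0;
+}
+#endif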
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+
+/*-**************************************************************
+* FSE NCount encoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+ size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3;
+ return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */
+}
+
+static size_t
+FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+ const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+ unsigned writeIsSafe)
+{
+ BYTE* const ostart = (BYTE*) header;
+ BYTE* out = ostart;
+ BYTE* const oend = ostart + headerBufferSize;
+ int nbBits;
+ const int tableSize = 1 << tableLog;
+ int remaining;
+ int threshold;
+ U32 bitStream = 0;
+ int bitCount = 0;
+ unsigned symbol = 0;
+ unsigned const alphabetSize = maxSymbolValue + 1;
+ int previousIs0 = 0;
+
+ /* Table Size */
+ bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
+ bitCount += 4;
+
+ /* Init */
+ remaining = tableSize+1; /* +1 for extra accuracy */
+ threshold = tableSize;
+ nbBits = tableLog+1;
+
+ while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */
+ if (previousIs0) {
+ unsigned start = symbol;
+ while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++;
+ if (symbol == alphabetSize) break; /* incorrect distribution */
+ while (symbol >= start+24) {
+ start+=24;
+ bitStream += 0xFFFFU << bitCount;
+ if ((!writeIsSafe) && (out > oend-2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE) bitStream;
+ out[1] = (BYTE)(bitStream>>8);
+ out+=2;
+ bitStream>>=16;
+ }
+ while (symbol >= start+3) {
+ start+=3;
+ bitStream += 3 << bitCount;
+ bitCount += 2;
+ }
+ bitStream += (symbol-start) << bitCount;
+ bitCount += 2;
+ if (bitCount>16) {
+ if ((!writeIsSafe) && (out > oend - 2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE)bitStream;
+ out[1] = (BYTE)(bitStream>>8);
+ out += 2;
+ bitStream >>= 16;
+ bitCount -= 16;
+ } }
+ { int count = normalizedCounter[symbol++];
+ int const max = (2*threshold-1) - remaining;
+ remaining -= count < 0 ? -count : count;
+ count++; /* +1 for extra accuracy */
+ if (count>=threshold)
+ count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+ bitStream += count << bitCount;
+ bitCount += nbBits;
+ bitCount -= (count<max);
+ previousIs0 = (count==1);
+ if (remaining<1) return ERROR(GENERIC);
+ while (remaining<threshold) { nbBits--; threshold>>=1; }
+ }
+ if (bitCount>16) {
+ if ((!writeIsSafe) && (out > oend - 2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE)bitStream;
+ out[1] = (BYTE)(bitStream>>8);
+ out += 2;
+ bitStream >>= 16;
+ bitCount -= 16;
+ } }
+
+ if (remaining != 1)
+ return ERROR(GENERIC); /* incorrect normalized distribution */
+ assert(symbol <= alphabetSize);
+
+ /* flush remaining bitStream */
+ if ((!writeIsSafe) && (out > oend - 2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE)bitStream;
+ out[1] = (BYTE)(bitStream>>8);
+ out+= (bitCount+7) /8;
+
+ return (out-ostart);
+}
+
+
+size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+ const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+ if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported */
+ if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC); /* Unsupported */
+
+ if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
+ return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+
+ return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */);
+}
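+
+/* Illustrative sketch (not compiled) : emitting the normalized-count header
+ * in front of the compressed payload. Sizing the destination with
+ * FSE_NCountWriteBound() guarantees success and lets FSE_writeNCount() take
+ * its unchecked (writeIsSafe) path. Names are hypothetical. */
+#if 0
+static size_t example_writeHeader(void* dst, size_t dstCapacity,
+                                  const short* norm,
+                                  unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t const hSize = FSE_writeNCount(dst, dstCapacity, norm, maxSymbolValue, tableLog);
+    if (FSE_isError(hSize)) return hSize;   /* e.g. dstSize_tooSmall */
+    /* the FSE-compressed payload is then appended at (BYTE*)dst + hSize,
+     * so the decoder can rebuild the same table before decoding */
+    return hSize;
+}
+#endif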
+
+
+/*-**************************************************************
+* FSE Compression Code
+****************************************************************/
+
+FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+ size_t size;
+ if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+ size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+ return (FSE_CTable*)malloc(size);
+}
+
+void FSE_freeCTable (FSE_CTable* ct) { free(ct); }
+
+/* provides the minimum logSize to safely represent a distribution */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+ U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1;
+ U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+ U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+ assert(srcSize > 1); /* Not supported, RLE should be used instead */
+ return minBits;
+}
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+ U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+ U32 tableLog = maxTableLog;
+ U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+ assert(srcSize > 1); /* Not supported, RLE should be used instead */
+ if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+ if (maxBitsSrc < tableLog) tableLog = maxBitsSrc; /* Accuracy can be reduced */
+ if (minBits > tableLog) tableLog = minBits; /* Need a minimum to safely represent all symbol values */
+ if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG;
+ if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG;
+ return tableLog;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
+}
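+
+/* Worked example (illustrative, not from the original source) :
+ * srcSize=1000, maxSymbolValue=255, maxTableLog=12.
+ * maxBitsSrc = highbit32(999) - 2 = 7 ;
+ * minBits    = min(highbit32(1000)+1, highbit32(255)+2) = min(10,9) = 9.
+ * tableLog starts at 12, drops to 7 (accuracy reduced for a small input),
+ * is raised back to 9 so all symbol values stay representable, and is
+ * finally clamped into [FSE_MIN_TABLELOG, FSE_MAX_TABLELOG] => 9. */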
+
+
+/* Secondary normalization method.
+ To be used when primary method fails. */
+
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue)
+{
+ short const NOT_YET_ASSIGNED = -2;
+ U32 s;
+ U32 distributed = 0;
+ U32 ToDistribute;
+
+ /* Init */
+ U32 const lowThreshold = (U32)(total >> tableLog);
+ U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+ for (s=0; s<=maxSymbolValue; s++) {
+ if (count[s] == 0) {
+ norm[s]=0;
+ continue;
+ }
+ if (count[s] <= lowThreshold) {
+ norm[s] = -1;
+ distributed++;
+ total -= count[s];
+ continue;
+ }
+ if (count[s] <= lowOne) {
+ norm[s] = 1;
+ distributed++;
+ total -= count[s];
+ continue;
+ }
+
+ norm[s]=NOT_YET_ASSIGNED;
+ }
+ ToDistribute = (1 << tableLog) - distributed;
+
+ if (ToDistribute == 0)
+ return 0;
+
+ if ((total / ToDistribute) > lowOne) {
+ /* risk of rounding to zero */
+ lowOne = (U32)((total * 3) / (ToDistribute * 2));
+ for (s=0; s<=maxSymbolValue; s++) {
+ if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+ norm[s] = 1;
+ distributed++;
+ total -= count[s];
+ continue;
+ } }
+ ToDistribute = (1 << tableLog) - distributed;
+ }
+
+ if (distributed == maxSymbolValue+1) {
+ /* all values are pretty poor;
+ probably incompressible data (should have already been detected);
+ find max, then give all remaining points to max */
+ U32 maxV = 0, maxC = 0;
+ for (s=0; s<=maxSymbolValue; s++)
+ if (count[s] > maxC) { maxV=s; maxC=count[s]; }
+ norm[maxV] += (short)ToDistribute;
+ return 0;
+ }
+
+ if (total == 0) {
+ /* all of the symbols were low enough for the lowOne or lowThreshold */
+ for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))
+ if (norm[s] > 0) { ToDistribute--; norm[s]++; }
+ return 0;
+ }
+
+ { U64 const vStepLog = 62 - tableLog;
+ U64 const mid = (1ULL << (vStepLog-1)) - 1;
+ U64 const rStep = ((((U64)1<<vStepLog) * ToDistribute) + mid) / total; /* scale on remaining */
+ U64 tmpTotal = mid;
+ for (s=0; s<=maxSymbolValue; s++) {
+ if (norm[s]==NOT_YET_ASSIGNED) {
+ U64 const end = tmpTotal + (count[s] * rStep);
+ U32 const sStart = (U32)(tmpTotal >> vStepLog);
+ U32 const sEnd = (U32)(end >> vStepLog);
+ U32 const weight = sEnd - sStart;
+ if (weight < 1)
+ return ERROR(GENERIC);
+ norm[s] = (short)weight;
+ tmpTotal = end;
+ } } }
+
+ return 0;
+}
+
+
+size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+ const unsigned* count, size_t total,
+ unsigned maxSymbolValue)
+{
+ /* Sanity checks */
+ if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+ if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC); /* Unsupported size */
+ if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported size */
+ if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */
+
+ { static U32 const rtbTable[] = { 0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
+ U64 const scale = 62 - tableLog;
+ U64 const step = ((U64)1<<62) / total; /* <== here, one division ! */
+ U64 const vStep = 1ULL<<(scale-20);
+ int stillToDistribute = 1<<tableLog;
+ unsigned s;
+ unsigned largest=0;
+ short largestP=0;
+ U32 lowThreshold = (U32)(total >> tableLog);
+
+ for (s=0; s<=maxSymbolValue; s++) {
+ if (count[s] == total) return 0; /* rle special case */
+ if (count[s] == 0) { normalizedCounter[s]=0; continue; }
+ if (count[s] <= lowThreshold) {
+ normalizedCounter[s] = -1;
+ stillToDistribute--;
+ } else {
+ short proba = (short)((count[s]*step) >> scale);
+ if (proba<8) {
+ U64 restToBeat = vStep * rtbTable[proba];
+ proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
+ }
+ if (proba > largestP) { largestP=proba; largest=s; }
+ normalizedCounter[s] = proba;
+ stillToDistribute -= proba;
+ } }
+ if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+ /* corner case, need another normalization method */
+ size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
+ if (FSE_isError(errorCode)) return errorCode;
+ }
+ else normalizedCounter[largest] += (short)stillToDistribute;
+ }
+
+#if 0
+ { /* Print Table (debug) */
+ U32 s;
+ U32 nTotal = 0;
+ for (s=0; s<=maxSymbolValue; s++)
+ RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]);
+ for (s=0; s<=maxSymbolValue; s++)
+ nTotal += abs(normalizedCounter[s]);
+ if (nTotal != (1U<<tableLog))
+ RAWLOG(2, "Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+ getchar();
+ }
+#endif
+
+ return tableLog;
+}
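+
+/* Illustrative sketch (not compiled) : the usual steps from histogram to a
+ * normalized distribution, as also performed by FSE_compress_wksp() below.
+ * The example function name is hypothetical. */
+#if 0
+static size_t example_normalize(short* norm, unsigned* tableLogPtr,
+                                const void* src, size_t srcSize)
+{
+    unsigned count[FSE_MAX_SYMBOL_VALUE+1];
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    size_t const maxCount = HIST_count(count, &maxSymbolValue, src, srcSize);
+    if (HIST_isError(maxCount)) return maxCount;
+    /* requires srcSize > 1 ; single-symbol (RLE) inputs are handled separately */
+    *tableLogPtr = FSE_optimalTableLog(0 /* default */, srcSize, maxSymbolValue);
+    /* on success, norm[] sums to 1<<tableLog, each -1 entry counting as 1 */
+    return FSE_normalizeCount(norm, *tableLogPtr, count, srcSize, maxSymbolValue);
+}
+#endif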
+
+
+/* fake FSE_CTable, for raw (uncompressed) input */
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
+{
+ const unsigned tableSize = 1 << nbBits;
+ const unsigned tableMask = tableSize - 1;
+ const unsigned maxSymbolValue = tableMask;
+ void* const ptr = ct;
+ U16* const tableU16 = ( (U16*) ptr) + 2;
+ void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */
+ FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+ unsigned s;
+
+ /* Sanity checks */
+ if (nbBits < 1) return ERROR(GENERIC); /* min size */
+
+ /* header */
+ tableU16[-2] = (U16) nbBits;
+ tableU16[-1] = (U16) maxSymbolValue;
+
+ /* Build table */
+ for (s=0; s<tableSize; s++)
+ tableU16[s] = (U16)(tableSize + s);
+
+ /* Build Symbol Transformation Table */
+ { const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
+ for (s=0; s<=maxSymbolValue; s++) {
+ symbolTT[s].deltaNbBits = deltaNbBits;
+ symbolTT[s].deltaFindState = s-1;
+ } }
+
+ return 0;
+}
+
+/* fake FSE_CTable, for rle input (always same symbol) */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+{
+ void* ptr = ct;
+ U16* tableU16 = ( (U16*) ptr) + 2;
+ void* FSCTptr = (U32*)ptr + 2;
+ FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) FSCTptr;
+
+ /* header */
+ tableU16[-2] = (U16) 0;
+ tableU16[-1] = (U16) symbolValue;
+
+ /* Build table */
+ tableU16[0] = 0;
+ tableU16[1] = 0; /* just in case */
+
+ /* Build Symbol Transformation Table */
+ symbolTT[symbolValue].deltaNbBits = 0;
+ symbolTT[symbolValue].deltaFindState = 0;
+
+ return 0;
+}
+
+
+static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const FSE_CTable* ct, const unsigned fast)
+{
+ const BYTE* const istart = (const BYTE*) src;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* ip=iend;
+
+ BIT_CStream_t bitC;
+ FSE_CState_t CState1, CState2;
+
+ /* init */
+ if (srcSize <= 2) return 0;
+ { size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+ if (FSE_isError(initError)) return 0; /* not enough space available to write a bitstream */ }
+
+#define FSE_FLUSHBITS(s) (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+ if (srcSize & 1) {
+ FSE_initCState2(&CState1, ct, *--ip);
+ FSE_initCState2(&CState2, ct, *--ip);
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ FSE_FLUSHBITS(&bitC);
+ } else {
+ FSE_initCState2(&CState2, ct, *--ip);
+ FSE_initCState2(&CState1, ct, *--ip);
+ }
+
+ /* join to mod 4 */
+ srcSize -= 2;
+ if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) { /* test bit 2 */
+ FSE_encodeSymbol(&bitC, &CState2, *--ip);
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ FSE_FLUSHBITS(&bitC);
+ }
+
+ /* 2 or 4 encoding per loop */
+ while ( ip>istart ) {
+
+ FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+ if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 ) /* this test must be static */
+ FSE_FLUSHBITS(&bitC);
+
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+ if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) { /* this test must be static */
+ FSE_encodeSymbol(&bitC, &CState2, *--ip);
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ }
+
+ FSE_FLUSHBITS(&bitC);
+ }
+
+ FSE_flushCState(&bitC, &CState2);
+ FSE_flushCState(&bitC, &CState1);
+ return BIT_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const FSE_CTable* ct)
+{
+ unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
+
+ if (fast)
+ return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+ else
+ return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+}
+
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)` (see the check below).
+ */
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* op = ostart;
+ BYTE* const oend = ostart + dstSize;
+
+ unsigned count[FSE_MAX_SYMBOL_VALUE+1];
+ S16 norm[FSE_MAX_SYMBOL_VALUE+1];
+ FSE_CTable* CTable = (FSE_CTable*)workSpace;
+ size_t const CTableSize = FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue);
+ void* scratchBuffer = (void*)(CTable + CTableSize);
+ size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
+
+ /* init conditions */
+ if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
+ if (srcSize <= 1) return 0; /* Not compressible */
+ if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+ if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
+
+ /* Scan input and build symbol stats */
+ { CHECK_V_F(maxCount, HIST_count_wksp(count, &maxSymbolValue, src, srcSize, scratchBuffer, scratchBufferSize) );
+ if (maxCount == srcSize) return 1; /* only a single symbol in src : rle */
+ if (maxCount == 1) return 0; /* each symbol present maximum once => not compressible */
+ if (maxCount < (srcSize >> 7)) return 0; /* Heuristic : not compressible enough */
+ }
+
+ tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
+ CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue) );
+
+ /* Write table description header */
+ { CHECK_V_F(nc_err, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+ op += nc_err;
+ }
+
+ /* Compress */
+ CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize) );
+ { CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable) );
+ if (cSize == 0) return 0; /* not enough space for compressed data */
+ op += cSize;
+ }
+
+ /* check compressibility */
+ if ( (size_t)(op-ostart) >= srcSize-1 ) return 0;
+
+ return op-ostart;
+}
+
+typedef struct {
+ FSE_CTable CTable_max[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
+ BYTE scratchBuffer[1 << FSE_MAX_TABLELOG];
+} fseWkspMax_t;
+
+size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
+{
+ fseWkspMax_t scratchBuffer;
+ DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */
+ if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+ return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer));
+}
+
+size_t FSE_compress (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ return FSE_compress2(dst, dstCapacity, src, srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG);
+}
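+
+/* Illustrative sketch (not compiled) : one-shot FSE compression of a block.
+ * Return-value conventions : 0 means "not compressible", 1 means a single
+ * repeated symbol (RLE); the caller then stores the block raw or as RLE.
+ * The example function name is hypothetical. */
+#if 0
+static size_t example_fse_compress(void* dst, size_t dstCapacity,
+                                   const void* src, size_t srcSize)
+{
+    size_t const cSize = FSE_compress(dst, dstCapacity, src, srcSize);
+    if (FSE_isError(cSize)) return cSize;
+    if (cSize <= 1) { /* store src uncompressed (0) or as RLE (1) */ }
+    /* dstCapacity >= FSE_compressBound(srcSize) guarantees the destination
+     * is never the limiting factor */
+    return cSize;
+}
+#endif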
+
+
+#endif /* FSE_COMMONDEFS_ONLY */
+/**** ended inlining compress/fse_compress.c ****/
+/**** start inlining compress/hist.c ****/
+/* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* --- dependencies --- */
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: ../common/debug.h ****/
+/**** skipping file: ../common/error_private.h ****/
+/**** skipping file: hist.h ****/
+
+
+/* --- Error management --- */
+unsigned HIST_isError(size_t code) { return ERR_isError(code); }
+
+/*-**************************************************************
+ * Histogram functions
+ ****************************************************************/
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize)
+{
+ const BYTE* ip = (const BYTE*)src;
+ const BYTE* const end = ip + srcSize;
+ unsigned maxSymbolValue = *maxSymbolValuePtr;
+ unsigned largestCount=0;
+
+ memset(count, 0, (maxSymbolValue+1) * sizeof(*count));
+ if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+ while (ip<end) {
+ assert(*ip <= maxSymbolValue);
+ count[*ip++]++;
+ }
+
+ while (!count[maxSymbolValue]) maxSymbolValue--;
+ *maxSymbolValuePtr = maxSymbolValue;
+
+ { U32 s;
+ for (s=0; s<=maxSymbolValue; s++)
+ if (count[s] > largestCount) largestCount = count[s];
+ }
+
+ return largestCount;
+}
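+
+/* Illustrative sketch (not compiled) : counting byte frequencies with the
+ * simple variant. *maxSymbolValuePtr is in/out : on input, the largest
+ * value the caller guarantees can appear (255 for arbitrary bytes); on
+ * output, the largest value actually present. Names are hypothetical. */
+#if 0
+static unsigned example_histogram(unsigned count[256],
+                                  const void* src, size_t srcSize)
+{
+    unsigned maxSymbolValue = 255;
+    unsigned const largest = HIST_count_simple(count, &maxSymbolValue, src, srcSize);
+    /* largest == srcSize : src is a single repeated byte (RLE candidate)
+     * largest == 1       : every present byte value appears exactly once  */
+    return largest;
+}
+#endif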
+
+typedef enum { trustInput, checkMaxSymbolValue } HIST_checkInput_e;
+
+/* HIST_count_parallel_wksp() :
+ * store histogram into 4 intermediate tables, recombined at the end.
+ * this design makes better use of OoO cpus,
+ * and is noticeably faster when some values are heavily repeated.
+ * But it needs some additional workspace for intermediate tables.
+ * `workSpace` must be a table of at least HIST_WKSP_SIZE_U32 unsigned.
+ * @return : largest histogram frequency,
+ * or an error code (notably when a symbol value larger than *maxSymbolValuePtr is present). */
+static size_t HIST_count_parallel_wksp(
+ unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* source, size_t sourceSize,
+ HIST_checkInput_e check,
+ U32* const workSpace)
+{
+ const BYTE* ip = (const BYTE*)source;
+ const BYTE* const iend = ip+sourceSize;
+ unsigned maxSymbolValue = *maxSymbolValuePtr;
+ unsigned max=0;
+ U32* const Counting1 = workSpace;
+ U32* const Counting2 = Counting1 + 256;
+ U32* const Counting3 = Counting2 + 256;
+ U32* const Counting4 = Counting3 + 256;
+
+ memset(workSpace, 0, 4*256*sizeof(unsigned));
+
+ /* safety checks */
+ if (!sourceSize) {
+ memset(count, 0, maxSymbolValue + 1);
+ *maxSymbolValuePtr = 0;
+ return 0;
+ }
+ if (!maxSymbolValue) maxSymbolValue = 255; /* 0 == default */
+
+ /* by stripes of 16 bytes */
+ { U32 cached = MEM_read32(ip); ip += 4;
+ while (ip < iend-15) {
+ U32 c = cached; cached = MEM_read32(ip); ip += 4;
+ Counting1[(BYTE) c ]++;
+ Counting2[(BYTE)(c>>8) ]++;
+ Counting3[(BYTE)(c>>16)]++;
+ Counting4[ c>>24 ]++;
+ c = cached; cached = MEM_read32(ip); ip += 4;
+ Counting1[(BYTE) c ]++;
+ Counting2[(BYTE)(c>>8) ]++;
+ Counting3[(BYTE)(c>>16)]++;
+ Counting4[ c>>24 ]++;
+ c = cached; cached = MEM_read32(ip); ip += 4;
+ Counting1[(BYTE) c ]++;
+ Counting2[(BYTE)(c>>8) ]++;
+ Counting3[(BYTE)(c>>16)]++;
+ Counting4[ c>>24 ]++;
+ c = cached; cached = MEM_read32(ip); ip += 4;
+ Counting1[(BYTE) c ]++;
+ Counting2[(BYTE)(c>>8) ]++;
+ Counting3[(BYTE)(c>>16)]++;
+ Counting4[ c>>24 ]++;
+ }
+ ip-=4;
+ }
+
+ /* finish last symbols */
+ while (ip<iend) Counting1[*ip++]++;
+
+ if (check) { /* verify stats will fit into destination table */
+ U32 s; for (s=255; s>maxSymbolValue; s--) {
+ Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+ if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
+ } }
+
+ { U32 s;
+ if (maxSymbolValue > 255) maxSymbolValue = 255;
+ for (s=0; s<=maxSymbolValue; s++) {
+ count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
+ if (count[s] > max) max = count[s];
+ } }
+
+ while (!count[maxSymbolValue]) maxSymbolValue--;
+ *maxSymbolValuePtr = maxSymbolValue;
+ return (size_t)max;
+}
+
+/* HIST_countFast_wksp() :
+ * Same as HIST_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` is a writable buffer which must be 4-bytes aligned,
+ * `workSpaceSize` must be >= HIST_WKSP_SIZE
+ */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* source, size_t sourceSize,
+ void* workSpace, size_t workSpaceSize)
+{
+ if (sourceSize < 1500) /* heuristic threshold */
+ return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+ if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
+ if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
+ return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace);
+}
+
+/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* source, size_t sourceSize)
+{
+ unsigned tmpCounters[HIST_WKSP_SIZE_U32];
+ return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters, sizeof(tmpCounters));
+}
+
+/* HIST_count_wksp() :
+ * Same as HIST_count(), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of >= HIST_WKSP_SIZE_U32 unsigned */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* source, size_t sourceSize,
+ void* workSpace, size_t workSpaceSize)
+{
+ if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
+ if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
+ if (*maxSymbolValuePtr < 255)
+ return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue, (U32*)workSpace);
+ *maxSymbolValuePtr = 255;
+ return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize);
+}
+
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize)
+{
+ unsigned tmpCounters[HIST_WKSP_SIZE_U32];
+ return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters, sizeof(tmpCounters));
+}
+/**** ended inlining compress/hist.c ****/
+/**** start inlining compress/huf_compress.c ****/
+/* ******************************************************************
+ * Huffman encoder, part of New Generation Entropy library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* **************************************************************
+* Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER /* Visual Studio */
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include <string.h> /* memcpy, memset */
+#include <stdio.h> /* printf (debug) */
+/**** skipping file: ../common/compiler.h ****/
+/**** skipping file: ../common/bitstream.h ****/
+/**** skipping file: hist.h ****/
+#define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** skipping file: ../common/error_private.h ****/
+
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+#define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */
+
+
+/* **************************************************************
+* Utils
+****************************************************************/
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+}
+
+
+/* *******************************************************
+* HUF : Huffman block compression
+*********************************************************/
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression.
+ * The use case needs much less stack memory.
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
+ */
+#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
+static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
+{
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* op = ostart;
+ BYTE* const oend = ostart + dstSize;
+
+ unsigned maxSymbolValue = HUF_TABLELOG_MAX;
+ U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
+
+ FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
+ BYTE scratchBuffer[1<<MAX_FSE_TABLELOG_FOR_HUFF_HEADER];
+
+ unsigned count[HUF_TABLELOG_MAX+1];
+ S16 norm[HUF_TABLELOG_MAX+1];
+
+ /* init conditions */
+ if (wtSize <= 1) return 0; /* Not compressible */
+
+ /* Scan input and build symbol stats */
+ { unsigned const maxCount = HIST_count_simple(count, &maxSymbolValue, weightTable, wtSize); /* never fails */
+ if (maxCount == wtSize) return 1; /* only a single symbol in src : rle */
+ if (maxCount == 1) return 0; /* each symbol present maximum once => not compressible */
+ }
+
+ tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
+ CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue) );
+
+ /* Write table description header */
+ { CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), norm, maxSymbolValue, tableLog) );
+ op += hSize;
+ }
+
+ /* Compress */
+ CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)) );
+ { CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, CTable) );
+ if (cSize == 0) return 0; /* not enough space for compressed data */
+ op += cSize;
+ }
+
+ return (size_t)(op-ostart);
+}
+
+
+struct HUF_CElt_s {
+ U16 val;
+ BYTE nbBits;
+}; /* typedef'd to HUF_CElt within "huf.h" */
+
+/*! HUF_writeCTable() :
+ `CTable` : Huffman tree to save, using huf representation.
+ @return : size of saved CTable */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize,
+ const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog)
+{
+ BYTE bitsToWeight[HUF_TABLELOG_MAX + 1]; /* precomputed conversion table */
+ BYTE huffWeight[HUF_SYMBOLVALUE_MAX];
+ BYTE* op = (BYTE*)dst;
+ U32 n;
+
+ /* check conditions */
+ if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+
+ /* convert to weight */
+ bitsToWeight[0] = 0;
+ for (n=1; n<huffLog+1; n++)
+ bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
+ for (n=0; n<maxSymbolValue; n++)
+ huffWeight[n] = bitsToWeight[CTable[n].nbBits];
+
+ /* attempt weights compression by FSE */
+ { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, huffWeight, maxSymbolValue) );
+ if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */
+ op[0] = (BYTE)hSize;
+ return hSize+1;
+ } }
+
+ /* write raw values as 4-bits (max : 15) */
+ if (maxSymbolValue > (256-128)) return ERROR(GENERIC); /* should not happen : likely means source cannot be compressed */
+ if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall); /* not enough space within dst buffer */
+ op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1));
+ huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */
+ for (n=0; n<maxSymbolValue; n+=2)
+ op[(n/2)+1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n+1]);
+ return ((maxSymbolValue+1)/2) + 1;
+}
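+
+/* Format note (derived from the code above, informal) : the saved CTable
+ * starts with one header byte. A value < 128 is the size of an
+ * FSE-compressed weight stream that follows; a value >= 128 means the
+ * weights are stored raw, two 4-bit weights per byte, with
+ * (headerByte - 127) weights present and the last symbol's weight left
+ * implicit for the decoder to deduce. */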
+
+
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights)
+{
+ BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; /* init not required, even though some static analyzer may complain */
+ U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
+ U32 tableLog = 0;
+ U32 nbSymbols = 0;
+
+ /* get symbol weights */
+ CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
+
+ /* check result */
+ if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+ if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
+
+ /* Prepare base value per rank */
+ { U32 n, nextRankStart = 0;
+ for (n=1; n<=tableLog; n++) {
+ U32 current = nextRankStart;
+ nextRankStart += (rankVal[n] << (n-1));
+ rankVal[n] = current;
+ } }
+
+ /* fill nbBits */
+ *hasZeroWeights = 0;
+ { U32 n; for (n=0; n<nbSymbols; n++) {
+ const U32 w = huffWeight[n];
+ *hasZeroWeights |= (w == 0);
+ CTable[n].nbBits = (BYTE)(tableLog + 1 - w) & -(w != 0);
+ } }
+
+ /* fill val */
+ { U16 nbPerRank[HUF_TABLELOG_MAX+2] = {0}; /* support w=0=>n=tableLog+1 */
+ U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
+ { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
+    /* determine starting value per rank */
+ valPerRank[tableLog+1] = 0; /* for w==0 */
+ { U16 min = 0;
+ U32 n; for (n=tableLog; n>0; n--) { /* start at n=tablelog <-> w=1 */
+ valPerRank[n] = min; /* get starting value within each rank */
+ min += nbPerRank[n];
+ min >>= 1;
+ } }
+ /* assign value within rank, symbol order */
+ { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
+ }
+
+ *maxSymbolValuePtr = nbSymbols - 1;
+ return readSize;
+}
+
+U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue)
+{
+ const HUF_CElt* table = (const HUF_CElt*)symbolTable;
+ assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
+ return table[symbolValue].nbBits;
+}
+
+
+typedef struct nodeElt_s {
+ U32 count;
+ U16 parent;
+ BYTE byte;
+ BYTE nbBits;
+} nodeElt;
+
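+/* Informal note (not from the original source) on the accounting in
+ * HUF_setMaxHeight() below : measured in units of 2^-maxNbBits, a code of
+ * length L contributes 2^(maxNbBits-L) to the Kraft sum. Truncating every
+ * over-long code to maxNbBits makes that sum exceed its budget by
+ * `totalCost` units; lengthening some other symbol from (maxNbBits-k) to
+ * (maxNbBits-k+1) gives back 2^(k-1) units, which is exactly what the
+ * rankLast[] loop repays until totalCost reaches 0. */
+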
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+{
+ const U32 largestBits = huffNode[lastNonNull].nbBits;
+ if (largestBits <= maxNbBits) return largestBits; /* early exit : no elt > maxNbBits */
+
+ /* there are several too large elements (at least >= 2) */
+ { int totalCost = 0;
+ const U32 baseCost = 1 << (largestBits - maxNbBits);
+ int n = (int)lastNonNull;
+
+ while (huffNode[n].nbBits > maxNbBits) {
+ totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
+ huffNode[n].nbBits = (BYTE)maxNbBits;
+ n --;
+ } /* n stops at huffNode[n].nbBits <= maxNbBits */
+        while (huffNode[n].nbBits == maxNbBits) n--;  /* n ends at the index of the smallest symbol using < maxNbBits */
+
+ /* renorm totalCost */
+ totalCost >>= (largestBits - maxNbBits); /* note : totalCost is necessarily a multiple of baseCost */
+
+ /* repay normalized cost */
+ { U32 const noSymbol = 0xF0F0F0F0;
+ U32 rankLast[HUF_TABLELOG_MAX+2];
+
+ /* Get pos of last (smallest) symbol per rank */
+ memset(rankLast, 0xF0, sizeof(rankLast));
+ { U32 currentNbBits = maxNbBits;
+ int pos;
+ for (pos=n ; pos >= 0; pos--) {
+ if (huffNode[pos].nbBits >= currentNbBits) continue;
+ currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */
+ rankLast[maxNbBits-currentNbBits] = (U32)pos;
+ } }
+
+ while (totalCost > 0) {
+ U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1;
+ for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
+ U32 const highPos = rankLast[nBitsToDecrease];
+ U32 const lowPos = rankLast[nBitsToDecrease-1];
+ if (highPos == noSymbol) continue;
+ if (lowPos == noSymbol) break;
+ { U32 const highTotal = huffNode[highPos].count;
+ U32 const lowTotal = 2 * huffNode[lowPos].count;
+ if (highTotal <= lowTotal) break;
+ } }
+ /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */
+ /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */
+ while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol))
+ nBitsToDecrease ++;
+ totalCost -= 1 << (nBitsToDecrease-1);
+ if (rankLast[nBitsToDecrease-1] == noSymbol)
+ rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]; /* this rank is no longer empty */
+ huffNode[rankLast[nBitsToDecrease]].nbBits ++;
+ if (rankLast[nBitsToDecrease] == 0) /* special case, reached largest symbol */
+ rankLast[nBitsToDecrease] = noSymbol;
+ else {
+ rankLast[nBitsToDecrease]--;
+ if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
+ rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */
+ } } /* while (totalCost > 0) */
+
+ while (totalCost < 0) { /* Sometimes, cost correction overshoot */
+ if (rankLast[1] == noSymbol) { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
+ while (huffNode[n].nbBits == maxNbBits) n--;
+ huffNode[n+1].nbBits--;
+ assert(n >= 0);
+ rankLast[1] = (U32)(n+1);
+ totalCost++;
+ continue;
+ }
+ huffNode[ rankLast[1] + 1 ].nbBits--;
+ rankLast[1]++;
+ totalCost ++;
+ } } } /* there are several too large elements (at least >= 2) */
+
+ return maxNbBits;
+}
+
+typedef struct {
+ U32 base;
+ U32 current;
+} rankPos;
+
+typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
+
+#define RANK_POSITION_TABLE_SIZE 32
+
+typedef struct {
+ huffNodeTable huffNodeTbl;
+ rankPos rankPosition[RANK_POSITION_TABLE_SIZE];
+} HUF_buildCTable_wksp_tables;
+
+static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition)
+{
+ U32 n;
+
+ memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE);
+ for (n=0; n<=maxSymbolValue; n++) {
+ U32 r = BIT_highbit32(count[n] + 1);
+ rankPosition[r].base ++;
+ }
+ for (n=30; n>0; n--) rankPosition[n-1].base += rankPosition[n].base;
+ for (n=0; n<32; n++) rankPosition[n].current = rankPosition[n].base;
+ for (n=0; n<=maxSymbolValue; n++) {
+ U32 const c = count[n];
+ U32 const r = BIT_highbit32(c+1) + 1;
+ U32 pos = rankPosition[r].current++;
+ while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) {
+ huffNode[pos] = huffNode[pos-1];
+ pos--;
+ }
+ huffNode[pos].count = c;
+ huffNode[pos].byte = (BYTE)n;
+ }
+}
+
+
+/** HUF_buildCTable_wksp() :
+ * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables).
+ */
+#define STARTNODE (HUF_SYMBOLVALUE_MAX+1)
+
+size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+{
+ HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace;
+ nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
+ nodeElt* const huffNode = huffNode0+1;
+ int nonNullRank;
+ int lowS, lowN;
+ int nodeNb = STARTNODE;
+ int n, nodeRoot;
+
+ /* safety checks */
+ if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
+ if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
+ return ERROR(workSpace_tooSmall);
+ if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
+ if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
+ return ERROR(maxSymbolValue_tooLarge);
+ memset(huffNode0, 0, sizeof(huffNodeTable));
+
+ /* sort, decreasing order */
+ HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
+
+ /* init for parents */
+ nonNullRank = (int)maxSymbolValue;
+ while(huffNode[nonNullRank].count == 0) nonNullRank--;
+ lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb;
+ huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count;
+ huffNode[lowS].parent = huffNode[lowS-1].parent = (U16)nodeNb;
+ nodeNb++; lowS-=2;
+ for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30);
+ huffNode0[0].count = (U32)(1U<<31); /* fake entry, strong barrier */
+
+ /* create parents */
+ while (nodeNb <= nodeRoot) {
+ int const n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+ int const n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+ huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
+ huffNode[n1].parent = huffNode[n2].parent = (U16)nodeNb;
+ nodeNb++;
+ }
+
+ /* distribute weights (unlimited tree height) */
+ huffNode[nodeRoot].nbBits = 0;
+ for (n=nodeRoot-1; n>=STARTNODE; n--)
+ huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+ for (n=0; n<=nonNullRank; n++)
+ huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+
+ /* enforce maxTableLog */
+ maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
+
+ /* fill result into tree (val, nbBits) */
+ { U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
+ U16 valPerRank[HUF_TABLELOG_MAX+1] = {0};
+ int const alphabetSize = (int)(maxSymbolValue + 1);
+ if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */
+ for (n=0; n<=nonNullRank; n++)
+ nbPerRank[huffNode[n].nbBits]++;
+        /* determine starting value per rank */
+ { U16 min = 0;
+ for (n=(int)maxNbBits; n>0; n--) {
+ valPerRank[n] = min; /* get starting value within each rank */
+ min += nbPerRank[n];
+ min >>= 1;
+ } }
+ for (n=0; n<alphabetSize; n++)
+ tree[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */
+ for (n=0; n<alphabetSize; n++)
+ tree[n].val = valPerRank[tree[n].nbBits]++; /* assign value within rank, symbol order */
+ }
+
+ return maxNbBits;
+}
+
+/** HUF_buildCTable() :
+ * @return : maxNbBits
+ * Note : count is used before tree is written, so they can safely overlap
+ */
+size_t HUF_buildCTable (HUF_CElt* tree, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits)
+{
+ HUF_buildCTable_wksp_tables workspace;
+ return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, &workspace, sizeof(workspace));
+}
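+
+/* Illustrative sketch (not compiled) : building a Huffman table from a
+ * histogram and using it to encode a single stream. In a real frame the
+ * table would also be serialized with HUF_writeCTable(). The example
+ * function name is hypothetical. */
+#if 0
+static size_t example_huf_single_stream(void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize)
+{
+    unsigned count[HUF_SYMBOLVALUE_MAX+1];
+    HUF_CElt cTable[HUF_SYMBOLVALUE_MAX+1];
+    unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+    size_t maxNbBits;
+    HIST_count_simple(count, &maxSymbolValue, src, srcSize);
+    maxNbBits = HUF_buildCTable(cTable, count, maxSymbolValue, 0 /* default limit */);
+    if (HUF_isError(maxNbBits)) return maxNbBits;
+    /* a return of 0 below means dst was too small for this stream */
+    return HUF_compress1X_usingCTable(dst, dstCapacity, src, srcSize, cTable);
+}
+#endif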
+
+size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
+{
+ size_t nbBits = 0;
+ int s;
+ for (s = 0; s <= (int)maxSymbolValue; ++s) {
+ nbBits += CTable[s].nbBits * count[s];
+ }
+ return nbBits >> 3;
+}
+
+int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+ int bad = 0;
+ int s;
+ for (s = 0; s <= (int)maxSymbolValue; ++s) {
+ bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
+ }
+ return !bad;
+}
+
+size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
+
+FORCE_INLINE_TEMPLATE void
+HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+{
+ BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
+}
+
+#define HUF_FLUSHBITS(s) BIT_flushBits(s)
+
+#define HUF_FLUSHBITS_1(stream) \
+ if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
+
+#define HUF_FLUSHBITS_2(stream) \
+ if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable)
+{
+ const BYTE* ip = (const BYTE*) src;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* op = ostart;
+ size_t n;
+ BIT_CStream_t bitC;
+
+ /* init */
+ if (dstSize < 8) return 0; /* not enough space to compress */
+ { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op));
+ if (HUF_isError(initErr)) return 0; }
+
+ n = srcSize & ~3; /* join to mod 4 */
+ switch (srcSize & 3)
+ {
+ case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
+ HUF_FLUSHBITS_2(&bitC);
+ /* fall-through */
+ case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
+ HUF_FLUSHBITS_1(&bitC);
+ /* fall-through */
+ case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
+ HUF_FLUSHBITS(&bitC);
+ /* fall-through */
+ case 0 : /* fall-through */
+ default: break;
+ }
+
+ for (; n>0; n-=4) { /* note : n&3==0 at this stage */
+ HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
+ HUF_FLUSHBITS_1(&bitC);
+ HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
+ HUF_FLUSHBITS_2(&bitC);
+ HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
+ HUF_FLUSHBITS_1(&bitC);
+ HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
+ HUF_FLUSHBITS(&bitC);
+ }
+
+ return BIT_closeCStream(&bitC);
+}
+
+#if DYNAMIC_BMI2
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable)
+{
+ return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+static size_t
+HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable)
+{
+ return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable, const int bmi2)
+{
+ if (bmi2) {
+ return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
+ }
+ return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
+}
+
+#else
+
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable, const int bmi2)
+{
+ (void)bmi2;
+ return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+#endif
+
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
+
+
+static size_t
+HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable, int bmi2)
+{
+ size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */
+ const BYTE* ip = (const BYTE*) src;
+ const BYTE* const iend = ip + srcSize;
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* op = ostart;
+
+ if (dstSize < 6 + 1 + 1 + 1 + 8) return 0; /* minimum space to compress successfully */
+ if (srcSize < 12) return 0; /* no saving possible : too small input */
+ op += 6; /* jumpTable */
+
+ assert(op <= oend);
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+ if (cSize==0) return 0;
+ assert(cSize <= 65535);
+ MEM_writeLE16(ostart, (U16)cSize);
+ op += cSize;
+ }
+
+ ip += segmentSize;
+ assert(op <= oend);
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+ if (cSize==0) return 0;
+ assert(cSize <= 65535);
+ MEM_writeLE16(ostart+2, (U16)cSize);
+ op += cSize;
+ }
+
+ ip += segmentSize;
+ assert(op <= oend);
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+ if (cSize==0) return 0;
+ assert(cSize <= 65535);
+ MEM_writeLE16(ostart+4, (U16)cSize);
+ op += cSize;
+ }
+
+ ip += segmentSize;
+ assert(op <= oend);
+ assert(ip <= iend);
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
+ if (cSize==0) return 0;
+ op += cSize;
+ }
+
+ return (size_t)(op-ostart);
+}
+
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
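+
+/* Layout note (derived from the code above, informal) : the 4-stream output
+ * begins with a 6-byte jump table holding the little-endian 16-bit
+ * compressed sizes of the first three streams; the fourth size is implied
+ * by the total. Each stream encodes (srcSize+3)/4 input bytes, the last
+ * one taking the remainder. */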
+
+typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
+
+static size_t HUF_compressCTable_internal(
+ BYTE* const ostart, BYTE* op, BYTE* const oend,
+ const void* src, size_t srcSize,
+ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2)
+{
+ size_t const cSize = (nbStreams==HUF_singleStream) ?
+ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) :
+ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2);
+ if (HUF_isError(cSize)) { return cSize; }
+    if (cSize==0) { return 0; }  /* not compressible */
+ op += cSize;
+ /* check compressibility */
+ assert(op >= ostart);
+ if ((size_t)(op-ostart) >= srcSize-1) { return 0; }
+ return (size_t)(op-ostart);
+}
+
+typedef struct {
+ unsigned count[HUF_SYMBOLVALUE_MAX + 1];
+ HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];
+ HUF_buildCTable_wksp_tables buildCTable_wksp;
+} HUF_compress_tables_t;
+
+/* HUF_compress_internal() :
+ * `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+static size_t
+HUF_compress_internal (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ HUF_nbStreams_e nbStreams,
+ void* workSpace, size_t wkspSize,
+ HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
+ const int bmi2)
+{
+ HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* op = ostart;
+
+ HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE);
+
+ /* checks & inits */
+ if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
+ if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall);
+ if (!srcSize) return 0; /* Uncompressed */
+ if (!dstSize) return 0; /* cannot fit anything within dst budget */
+ if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */
+ if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+ if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+ if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+ if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
+
+ /* Heuristic : If old table is valid, use it for small inputs */
+ if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+ nbStreams, oldHufTable, bmi2);
+ }
+
+ /* Scan input and build symbol stats */
+ { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace, wkspSize) );
+ if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
+ if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */
+ }
+
+ /* Check validity of previous table */
+ if ( repeat
+ && *repeat == HUF_repeat_check
+ && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) {
+ *repeat = HUF_repeat_none;
+ }
+ /* Heuristic : use existing table for small inputs */
+ if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+ nbStreams, oldHufTable, bmi2);
+ }
+
+ /* Build Huffman Tree */
+ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+ { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count,
+ maxSymbolValue, huffLog,
+ &table->buildCTable_wksp, sizeof(table->buildCTable_wksp));
+ CHECK_F(maxBits);
+ huffLog = (U32)maxBits;
+ /* Zero unused symbols in CTable, so we can check it for validity */
+ memset(table->CTable + (maxSymbolValue + 1), 0,
+ sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
+ }
+
+ /* Write table description header */
+ { CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) );
+ /* Check if using previous huffman table is beneficial */
+ if (repeat && *repeat != HUF_repeat_none) {
+ size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue);
+ size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue);
+ if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+ nbStreams, oldHufTable, bmi2);
+ } }
+
+ /* Use the new huffman table */
+ if (hSize + 12ul >= srcSize) { return 0; }
+ op += hSize;
+ if (repeat) { *repeat = HUF_repeat_none; }
+ if (oldHufTable)
+ memcpy(oldHufTable, table->CTable, sizeof(table->CTable)); /* Save new table */
+ }
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+ nbStreams, table->CTable, bmi2);
+}
+
+
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ void* workSpace, size_t wkspSize)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize,
+ maxSymbolValue, huffLog, HUF_singleStream,
+ workSpace, wkspSize,
+ NULL, NULL, 0, 0 /*bmi2*/);
+}
+
+size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ void* workSpace, size_t wkspSize,
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize,
+ maxSymbolValue, huffLog, HUF_singleStream,
+ workSpace, wkspSize, hufTable,
+ repeat, preferRepeat, bmi2);
+}
+
+size_t HUF_compress1X (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog)
+{
+ unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+ return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+/* HUF_compress4X_wksp():
+ * compress input using 4 streams.
+ * provide workspace to generate compression tables */
+size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ void* workSpace, size_t wkspSize)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize,
+ maxSymbolValue, huffLog, HUF_fourStreams,
+ workSpace, wkspSize,
+ NULL, NULL, 0, 0 /*bmi2*/);
+}
+
+/* HUF_compress4X_repeat():
+ * compress input using 4 streams.
+ * re-use an existing huffman compression table */
+size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ void* workSpace, size_t wkspSize,
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize,
+ maxSymbolValue, huffLog, HUF_fourStreams,
+ workSpace, wkspSize,
+ hufTable, repeat, preferRepeat, bmi2);
+}
+
+size_t HUF_compress2 (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog)
+{
+ unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+ return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+ return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT);
+}
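+
+/* Illustrative sketch (not compiled) : one-shot Huffman compression of a
+ * block, mirroring the FSE example earlier in this file. Return-value
+ * conventions match FSE_compress() : 0 = not compressible, 1 = single
+ * repeated symbol. The example function name is hypothetical. */
+#if 0
+static size_t example_huf_compress(void* dst, size_t dstCapacity,
+                                   const void* src, size_t srcSize)
+{
+    size_t const cSize = HUF_compress(dst, dstCapacity, src, srcSize);
+    if (HUF_isError(cSize)) return cSize;   /* e.g. srcSize_wrong if srcSize > HUF_BLOCKSIZE_MAX */
+    /* dstCapacity >= HUF_compressBound(srcSize) avoids "not compressible"
+     * results that are only due to lack of room in dst */
+    return cSize;
+}
+#endif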
+/**** ended inlining compress/huf_compress.c ****/
+/**** start inlining compress/zstd_compress_literals.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ * Dependencies
+ ***************************************/
+/**** start inlining zstd_compress_literals.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPRESS_LITERALS_H
+#define ZSTD_COMPRESS_LITERALS_H
+
+/**** start inlining zstd_compress_internal.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This header contains definitions
+ * that shall **only** be used by modules within lib/compress.
+ */
+
+#ifndef ZSTD_COMPRESS_H
+#define ZSTD_COMPRESS_H
+
+/*-*************************************
+* Dependencies
+***************************************/
+/**** skipping file: ../common/zstd_internal.h ****/
+/**** start inlining zstd_cwksp.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CWKSP_H
+#define ZSTD_CWKSP_H
+
+/*-*************************************
+* Dependencies
+***************************************/
+/**** skipping file: ../common/zstd_internal.h ****/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+* Constants
+***************************************/
+
+/* Since the workspace is effectively its own little malloc implementation /
+ * arena, when we run under ASAN, we should similarly insert redzones between
+ * each internal element of the workspace, so ASAN will catch overruns that
+ * reach outside an object but that stay inside the workspace.
+ *
+ * This defines the size of that redzone.
+ */
+#ifndef ZSTD_CWKSP_ASAN_REDZONE_SIZE
+#define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128
+#endif
+
+/*-*************************************
+* Structures
+***************************************/
+typedef enum {
+ ZSTD_cwksp_alloc_objects,
+ ZSTD_cwksp_alloc_buffers,
+ ZSTD_cwksp_alloc_aligned
+} ZSTD_cwksp_alloc_phase_e;
+
+/**
+ * Zstd fits all its internal datastructures into a single contiguous buffer,
+ * so that it only needs to perform a single OS allocation (or so that a buffer
+ * can be provided to it and it can perform no allocations at all). This buffer
+ * is called the workspace.
+ *
+ * Several optimizations complicate that process of allocating memory ranges
+ * from this workspace for each internal datastructure:
+ *
+ * - These different internal datastructures have different setup requirements:
+ *
+ * - The static objects need to be cleared once and can then be trivially
+ * reused for each compression.
+ *
+ * - Various buffers don't need to be initialized at all--they are always
+ * written into before they're read.
+ *
+ * - The matchstate tables have a unique requirement that they don't need
+ * their memory to be totally cleared, but they do need the memory to have
+ * some bound, i.e., a guarantee that all values in the memory they've been
+ * allocated are less than some maximum value (which is the starting value
+ * for the indices that they will then use for compression). When this
+ * guarantee is provided to them, they can use the memory without any setup
+ * work. When it can't, they have to clear the area.
+ *
+ * - These buffers also have different alignment requirements.
+ *
+ * - We would like to reuse the objects in the workspace for multiple
+ * compressions without having to perform any expensive reallocation or
+ * reinitialization work.
+ *
+ * - We would like to be able to efficiently reuse the workspace across
+ * multiple compressions **even when the compression parameters change** and
+ * we need to resize some of the objects (where possible).
+ *
+ * To attempt to manage this buffer, given these constraints, the ZSTD_cwksp
+ * abstraction was created. It works as follows:
+ *
+ * Workspace Layout:
+ *
+ * [ ... workspace ... ]
+ * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers]
+ *
+ * The various objects that live in the workspace are divided into the
+ * following categories, and are allocated separately:
+ *
+ * - Static objects: this is optionally the enclosing ZSTD_CCtx or ZSTD_CDict,
+ * so that literally everything fits in a single buffer. Note: if present,
+ * this must be the first object in the workspace, since ZSTD_free{CCtx,
+ * CDict}() rely on a pointer comparison to see whether one or two frees are
+ * required.
+ *
+ * - Fixed size objects: these are fixed-size, fixed-count objects that are
+ * nonetheless "dynamically" allocated in the workspace so that we can
+ * control how they're initialized separately from the broader ZSTD_CCtx.
+ * Examples:
+ * - Entropy Workspace
+ * - 2 x ZSTD_compressedBlockState_t
+ * - CDict dictionary contents
+ *
+ * - Tables: these are any of several different datastructures (hash tables,
+ * chain tables, binary trees) that all respect a common format: they are
+ * uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
+ * Their sizes depend on the cparams.
+ *
+ * - Aligned: these buffers are used for various purposes that require 4 byte
+ * alignment, but don't require any initialization before they're used.
+ *
+ * - Buffers: these buffers are used for various purposes that don't require
+ * any alignment or initialization before they're used. This means they can
+ * be moved around at no cost for a new compression.
+ *
+ * Allocating Memory:
+ *
+ * The various types of objects must be allocated in order, so they can be
+ * correctly packed into the workspace buffer. That order is:
+ *
+ * 1. Objects
+ * 2. Buffers
+ * 3. Aligned
+ * 4. Tables
+ *
+ * Attempts to reserve objects of different types out of order will fail.
+ */
+typedef struct {
+ void* workspace;
+ void* workspaceEnd;
+
+ void* objectEnd;
+ void* tableEnd;
+ void* tableValidEnd;
+ void* allocStart;
+
+ int allocFailed;
+ int workspaceOversizedDuration;
+ ZSTD_cwksp_alloc_phase_e phase;
+} ZSTD_cwksp;
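+
+/* Illustrative usage sketch (not part of the library), assuming the caller
+ * owns the backing buffer: reservations must follow the documented phase
+ * order (objects, then buffers, then aligned/tables), and sizes must respect
+ * the alignment asserted by each reserve function. All names below other
+ * than the ZSTD_cwksp API are hypothetical.
+ *
+ *   ZSTD_cwksp ws;
+ *   size_t const wkspSize = 1 << 20;
+ *   void* const mem = malloc(wkspSize);                          // caller-owned memory
+ *   ZSTD_cwksp_init(&ws, mem, wkspSize);
+ *   void* const obj   = ZSTD_cwksp_reserve_object(&ws, 64);      // multiple of sizeof(void*)
+ *   BYTE* const buf   = ZSTD_cwksp_reserve_buffer(&ws, 1000);    // no alignment requirement
+ *   void* const algn  = ZSTD_cwksp_reserve_aligned(&ws, 256);    // multiple of sizeof(U32)
+ *   void* const table = ZSTD_cwksp_reserve_table(&ws, 1024);     // multiple of sizeof(U32)
+ *   if (ZSTD_cwksp_reserve_failed(&ws)) { ... }                  // some reservation did not fit
+ *
+ *   // ZSTD_cwksp_clear(&ws) later invalidates buffers/aligned/tables for the
+ *   // next compression; objects stay valid. The caller still frees `mem`.
+ */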
+
+/*-*************************************
+* Functions
+***************************************/
+
+MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws);
+
+MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
+ (void)ws;
+ assert(ws->workspace <= ws->objectEnd);
+ assert(ws->objectEnd <= ws->tableEnd);
+ assert(ws->objectEnd <= ws->tableValidEnd);
+ assert(ws->tableEnd <= ws->allocStart);
+ assert(ws->tableValidEnd <= ws->allocStart);
+ assert(ws->allocStart <= ws->workspaceEnd);
+}
+
+/**
+ * Align must be a power of 2.
+ */
+MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) {
+ size_t const mask = align - 1;
+ assert((align & mask) == 0);
+ return (size + mask) & ~mask;
+}
+
+/**
+ * Use this to determine how much space in the workspace we will consume to
+ * allocate this object. (Normally it should be exactly the size of the object,
+ * but under special conditions, like ASAN, where we pad each object, it might
+ * be larger.)
+ *
+ * Since tables aren't currently redzoned, you don't need to call through this
+ * to figure out how much space you need for the matchState tables. Everything
+ * else is though.
+ */
+MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) {
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ return size + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#else
+ return size;
+#endif
+}
+
+MEM_STATIC void ZSTD_cwksp_internal_advance_phase(
+ ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) {
+ assert(phase >= ws->phase);
+ if (phase > ws->phase) {
+ if (ws->phase < ZSTD_cwksp_alloc_buffers &&
+ phase >= ZSTD_cwksp_alloc_buffers) {
+ ws->tableValidEnd = ws->objectEnd;
+ }
+ if (ws->phase < ZSTD_cwksp_alloc_aligned &&
+ phase >= ZSTD_cwksp_alloc_aligned) {
+ /* If unaligned allocations down from a too-large top have left us
+ * unaligned, we need to realign our alloc ptr. Technically, this
+ * can consume space that is unaccounted for in the neededSpace
+ * calculation. However, I believe this can only happen when the
+ * workspace is too large, and specifically when it is too large
+ * by a larger margin than the space that will be consumed. */
+ /* TODO: cleaner, compiler warning friendly way to do this??? */
+ ws->allocStart = (BYTE*)ws->allocStart - ((size_t)ws->allocStart & (sizeof(U32)-1));
+ if (ws->allocStart < ws->tableValidEnd) {
+ ws->tableValidEnd = ws->allocStart;
+ }
+ }
+ ws->phase = phase;
+ }
+}
+
+/**
+ * Returns whether this object/buffer/etc was allocated in this workspace.
+ */
+MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) {
+ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd);
+}
+
+/**
+ * Internal function. Do not use directly.
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_internal(
+ ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) {
+ void* alloc;
+ void* bottom = ws->tableEnd;
+ ZSTD_cwksp_internal_advance_phase(ws, phase);
+ alloc = (BYTE *)ws->allocStart - bytes;
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ /* over-reserve space */
+ alloc = (BYTE *)alloc - 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#endif
+
+ DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining",
+ alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+ ZSTD_cwksp_assert_internal_consistency(ws);
+ assert(alloc >= bottom);
+ if (alloc < bottom) {
+ DEBUGLOG(4, "cwksp: alloc failed!");
+ ws->allocFailed = 1;
+ return NULL;
+ }
+ if (alloc < ws->tableValidEnd) {
+ ws->tableValidEnd = alloc;
+ }
+ ws->allocStart = alloc;
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
+ * either side. */
+ alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+ __asan_unpoison_memory_region(alloc, bytes);
+#endif
+
+ return alloc;
+}
+
+/**
+ * Reserves and returns unaligned memory.
+ */
+MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) {
+ return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
+}
+
+/**
+ * Reserves and returns memory sized and aligned on sizeof(unsigned).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) {
+ assert((bytes & (sizeof(U32)-1)) == 0);
+ return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(U32)), ZSTD_cwksp_alloc_aligned);
+}
+
+/**
+ * Aligned on sizeof(unsigned). These buffers have the special property that
+ * their values remain constrained, allowing us to re-use them without
+ * memset()-ing them.
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) {
+ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
+ void* alloc = ws->tableEnd;
+ void* end = (BYTE *)alloc + bytes;
+ void* top = ws->allocStart;
+
+ DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining",
+ alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+ assert((bytes & (sizeof(U32)-1)) == 0);
+ ZSTD_cwksp_internal_advance_phase(ws, phase);
+ ZSTD_cwksp_assert_internal_consistency(ws);
+ assert(end <= top);
+ if (end > top) {
+ DEBUGLOG(4, "cwksp: table alloc failed!");
+ ws->allocFailed = 1;
+ return NULL;
+ }
+ ws->tableEnd = end;
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ __asan_unpoison_memory_region(alloc, bytes);
+#endif
+
+ return alloc;
+}
+
+/**
+ * Aligned on sizeof(void*).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) {
+ size_t roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*));
+ void* alloc = ws->objectEnd;
+ void* end = (BYTE*)alloc + roundedBytes;
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ /* over-reserve space */
+ end = (BYTE *)end + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#endif
+
+ DEBUGLOG(5,
+ "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining",
+ alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes);
+ assert(((size_t)alloc & (sizeof(void*)-1)) == 0);
+ assert((bytes & (sizeof(void*)-1)) == 0);
+ ZSTD_cwksp_assert_internal_consistency(ws);
+ /* we must be in the first phase, no advance is possible */
+ if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) {
+ DEBUGLOG(4, "cwksp: object alloc failed!");
+ ws->allocFailed = 1;
+ return NULL;
+ }
+ ws->objectEnd = end;
+ ws->tableEnd = end;
+ ws->tableValidEnd = end;
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
+ * either side. */
+ alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+ __asan_unpoison_memory_region(alloc, bytes);
+#endif
+
+ return alloc;
+}
+
+MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) {
+ DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty");
+
+#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+ /* To validate that the table re-use logic is sound, and that we don't
+ * access table space that we haven't cleaned, we re-"poison" the table
+ * space every time we mark it dirty. */
+ {
+ size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd;
+ assert(__msan_test_shadow(ws->objectEnd, size) == -1);
+ __msan_poison(ws->objectEnd, size);
+ }
+#endif
+
+ assert(ws->tableValidEnd >= ws->objectEnd);
+ assert(ws->tableValidEnd <= ws->allocStart);
+ ws->tableValidEnd = ws->objectEnd;
+ ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+MEM_STATIC void ZSTD_cwksp_mark_tables_clean(ZSTD_cwksp* ws) {
+ DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_clean");
+ assert(ws->tableValidEnd >= ws->objectEnd);
+ assert(ws->tableValidEnd <= ws->allocStart);
+ if (ws->tableValidEnd < ws->tableEnd) {
+ ws->tableValidEnd = ws->tableEnd;
+ }
+ ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * Zero the part of the allocated tables not already marked clean.
+ */
+MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) {
+ DEBUGLOG(4, "cwksp: ZSTD_cwksp_clean_tables");
+ assert(ws->tableValidEnd >= ws->objectEnd);
+ assert(ws->tableValidEnd <= ws->allocStart);
+ if (ws->tableValidEnd < ws->tableEnd) {
+ memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd);
+ }
+ ZSTD_cwksp_mark_tables_clean(ws);
+}
+
+/**
+ * Invalidates table allocations.
+ * All other allocations remain valid.
+ */
+MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) {
+ DEBUGLOG(4, "cwksp: clearing tables!");
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ {
+ size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd;
+ __asan_poison_memory_region(ws->objectEnd, size);
+ }
+#endif
+
+ ws->tableEnd = ws->objectEnd;
+ ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * Invalidates all buffer, aligned, and table allocations.
+ * Object allocations remain valid.
+ */
+MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
+ DEBUGLOG(4, "cwksp: clearing!");
+
+#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+ /* To validate that the context re-use logic is sound, and that we don't
+ * access stuff that this compression hasn't initialized, we re-"poison"
+ * the workspace (or at least the non-static, non-table parts of it)
+ * every time we start a new compression. */
+ {
+ size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->tableValidEnd;
+ __msan_poison(ws->tableValidEnd, size);
+ }
+#endif
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ {
+ size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->objectEnd;
+ __asan_poison_memory_region(ws->objectEnd, size);
+ }
+#endif
+
+ ws->tableEnd = ws->objectEnd;
+ ws->allocStart = ws->workspaceEnd;
+ ws->allocFailed = 0;
+ if (ws->phase > ZSTD_cwksp_alloc_buffers) {
+ ws->phase = ZSTD_cwksp_alloc_buffers;
+ }
+ ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * The provided workspace takes ownership of the buffer [start, start+size).
+ * Any existing values in the workspace are ignored (the previously managed
+ * buffer, if present, must be separately freed).
+ */
+MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size) {
+ DEBUGLOG(4, "cwksp: init'ing workspace with %zd bytes", size);
+ assert(((size_t)start & (sizeof(void*)-1)) == 0); /* ensure correct alignment */
+ ws->workspace = start;
+ ws->workspaceEnd = (BYTE*)start + size;
+ ws->objectEnd = ws->workspace;
+ ws->tableValidEnd = ws->objectEnd;
+ ws->phase = ZSTD_cwksp_alloc_objects;
+ ZSTD_cwksp_clear(ws);
+ ws->workspaceOversizedDuration = 0;
+ ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+MEM_STATIC size_t ZSTD_cwksp_create(ZSTD_cwksp* ws, size_t size, ZSTD_customMem customMem) {
+ void* workspace = ZSTD_malloc(size, customMem);
+ DEBUGLOG(4, "cwksp: creating new workspace with %zd bytes", size);
+ RETURN_ERROR_IF(workspace == NULL, memory_allocation, "NULL pointer!");
+ ZSTD_cwksp_init(ws, workspace, size);
+ return 0;
+}
+
+MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) {
+ void *ptr = ws->workspace;
+ DEBUGLOG(4, "cwksp: freeing workspace");
+ memset(ws, 0, sizeof(ZSTD_cwksp));
+ ZSTD_free(ptr, customMem);
+}
+
+/**
+ * Moves the management of a workspace from one cwksp to another. The src cwksp
+ * is left in an invalid state (src must be re-init()'ed before it's used again).
+ */
+MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) {
+ *dst = *src;
+ memset(src, 0, sizeof(ZSTD_cwksp));
+}
+
+MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
+ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
+}
+
+MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
+ return ws->allocFailed;
+}
+
+/*-*************************************
+* Functions Checking Free Space
+***************************************/
+
+MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) {
+ return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_available(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+ return ZSTD_cwksp_available_space(ws) >= additionalNeededSpace;
+}
+
+MEM_STATIC int ZSTD_cwksp_check_too_large(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+ return ZSTD_cwksp_check_available(
+ ws, additionalNeededSpace * ZSTD_WORKSPACETOOLARGE_FACTOR);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_wasteful(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+ return ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)
+ && ws->workspaceOversizedDuration > ZSTD_WORKSPACETOOLARGE_MAXDURATION;
+}
+
+MEM_STATIC void ZSTD_cwksp_bump_oversized_duration(
+ ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+ if (ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)) {
+ ws->workspaceOversizedDuration++;
+ } else {
+ ws->workspaceOversizedDuration = 0;
+ }
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_CWKSP_H */
+/**** ended inlining zstd_cwksp.h ****/
+#ifdef ZSTD_MULTITHREAD
+/**** start inlining zstdmt_compress.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ #ifndef ZSTDMT_COMPRESS_H
+ #define ZSTDMT_COMPRESS_H
+
+ #if defined (__cplusplus)
+ extern "C" {
+ #endif
+
+
+/* Note : This is an internal API.
+ * These APIs used to be exposed with ZSTDLIB_API,
+ * because they used to be the only way to invoke MT compression.
+ * Now, it's recommended to use ZSTD_compress2 and ZSTD_compressStream2()
+ * instead.
+ *
+ * If you depend on these APIs and can't switch, then define
+ * ZSTD_LEGACY_MULTITHREADED_API when making the dynamic library.
+ * However, we may completely remove these functions in a future
+ * release, so please switch soon.
+ *
+ * This API requires ZSTD_MULTITHREAD to be defined during compilation,
+ * otherwise ZSTDMT_createCCtx*() will fail.
+ */
+
+#ifdef ZSTD_LEGACY_MULTITHREADED_API
+# define ZSTDMT_API ZSTDLIB_API
+#else
+# define ZSTDMT_API
+#endif
+
+/* === Dependencies === */
+#include <stddef.h> /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters */
+/**** skipping file: ../zstd.h ****/
+
+
+/* === Constants === */
+#ifndef ZSTDMT_NBWORKERS_MAX
+# define ZSTDMT_NBWORKERS_MAX 200
+#endif
+#ifndef ZSTDMT_JOBSIZE_MIN
+# define ZSTDMT_JOBSIZE_MIN (1 MB)
+#endif
+#define ZSTDMT_JOBLOG_MAX (MEM_32bits() ? 29 : 30)
+#define ZSTDMT_JOBSIZE_MAX (MEM_32bits() ? (512 MB) : (1024 MB))
+
+
+/* === Memory management === */
+typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx;
+/* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. */
+ZSTDMT_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers);
+/* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. */
+ZSTDMT_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers,
+ ZSTD_customMem cMem);
+ZSTDMT_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx);
+
+ZSTDMT_API size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx);
+
+
+/* === Simple one-pass compression function === */
+
+ZSTDMT_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel);
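+
+/* Illustrative usage sketch (not part of the library); as noted above, this
+ * internal API is superseded by ZSTD_compress2(). Buffer names are
+ * hypothetical; ZSTD_compressBound() comes from the public zstd.h:
+ *
+ *   ZSTDMT_CCtx* const mtctx = ZSTDMT_createCCtx(4);        // 4 worker threads
+ *   if (mtctx != NULL) {                                    // NULL when !ZSTD_MULTITHREAD
+ *       size_t const cSize = ZSTDMT_compressCCtx(mtctx,
+ *                                  dstBuf, ZSTD_compressBound(srcSize),
+ *                                  srcBuf, srcSize,
+ *                                  3);                      // compression level 3
+ *       ZSTDMT_freeCCtx(mtctx);                             // cSize may be an error code
+ *   }
+ */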
+
+
+
+/* === Streaming functions === */
+
+ZSTDMT_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel);
+ZSTDMT_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it will change in the future to mean "empty" */
+
+ZSTDMT_API size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx);
+ZSTDMT_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDMT_API size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */
+ZSTDMT_API size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */
+
+
+/* === Advanced functions and parameters === */
+
+ZSTDMT_API size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict,
+ ZSTD_parameters params,
+ int overlapLog);
+
+ZSTDMT_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx,
+ const void* dict, size_t dictSize, /* dict can be released after init, a local copy is preserved within zcs */
+ ZSTD_parameters params,
+ unsigned long long pledgedSrcSize); /* pledgedSrcSize is optional and can be zero == unknown */
+
+ZSTDMT_API size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fparams,
+ unsigned long long pledgedSrcSize); /* note : zero means empty */
+
+/* ZSTDMT_parameter :
+ * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */
+typedef enum {
+ ZSTDMT_p_jobSize, /* Each job is compressed in parallel. By default, this value is dynamically determined depending on compression parameters. Can be set explicitly here. */
+ ZSTDMT_p_overlapLog, /* Each job may reload a part of previous job to enhance compression ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window. This is a "sticky" parameter : its value will be re-used on next compression job */
+ ZSTDMT_p_rsyncable /* Enables rsyncable mode. */
+} ZSTDMT_parameter;
+
+/* ZSTDMT_setMTCtxParameter() :
+ * allow setting individual parameters, one at a time, among a list of enums defined in ZSTDMT_parameter.
+ * The function must be called typically after ZSTD_createCCtx() but __before ZSTDMT_init*() !__
+ * Parameters not explicitly reset by ZSTDMT_init*() remain the same in consecutive compression sessions.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDMT_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int value);
+
+/* ZSTDMT_getMTCtxParameter() :
+ * Query the ZSTDMT_CCtx for a parameter value.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDMT_API size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int* value);
+
+
+/*! ZSTDMT_compressStream_generic() :
+ * Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream()
+ * depending on flush directive.
+ * @return : minimum amount of data still to be flushed
+ * 0 if fully flushed
+ * or an error code
+ * note : needs to be initialized using any ZSTDMT_initCStream*() variant */
+ZSTDMT_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective endOp);
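+
+/* Illustrative streaming sketch (not part of the library): compress a whole
+ * buffer in one session by driving ZSTDMT_compressStream_generic() with
+ * ZSTD_e_end until nothing remains to be flushed. Buffer names are
+ * hypothetical; ZSTD_isError(), ZSTD_e_end and the buffer structs come from
+ * the public zstd.h, and dstCapacity is assumed large enough (e.g.
+ * ZSTD_compressBound(srcSize)):
+ *
+ *   ZSTD_inBuffer  input  = { srcBuf, srcSize, 0 };
+ *   ZSTD_outBuffer output = { dstBuf, dstCapacity, 0 };
+ *   size_t remaining = ZSTDMT_initCStream(mtctx, 3);
+ *   if (!ZSTD_isError(remaining)) {
+ *       do {
+ *           remaining = ZSTDMT_compressStream_generic(mtctx, &output, &input, ZSTD_e_end);
+ *       } while (remaining != 0 && !ZSTD_isError(remaining));
+ *   }
+ *   // on success, output.pos holds the size of the compressed frame
+ */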
+
+
+/* ========================================================
+ * === Private interface, for use by ZSTD_compress.c ===
+ * === Not exposed in libzstd. Never invoke directly ===
+ * ======================================================== */
+
+ /*! ZSTDMT_toFlushNow()
+ * Tell how many bytes are ready to be flushed immediately.
+ * Probe the oldest active job (not yet entirely flushed) and check its output buffer.
+ * If it returns 0, either there is no active job,
+ * or the oldest job is still active but everything it has produced has already been flushed,
+ * in which case flushing is limited by the speed of the oldest job. */
+size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx);
+
+/*! ZSTDMT_CCtxParam_setMTCtxParameter()
+ * like ZSTDMT_setMTCtxParameter(), but into a ZSTD_CCtx_Params */
+size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, int value);
+
+/*! ZSTDMT_CCtxParam_setNbWorkers()
+ * Set nbWorkers, and clamp it.
+ * Also reset jobSize and overlapLog */
+size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers);
+
+/*! ZSTDMT_updateCParams_whileCompressing() :
+ * Updates only a selected set of compression parameters, to remain compatible with current frame.
+ * New parameters will be applied to next compression job. */
+void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams);
+
+/*! ZSTDMT_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for current frame.
+ * able to count progression inside worker threads.
+ */
+ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx);
+
+
+/*! ZSTDMT_initCStream_internal() :
+ * Private use only. Init streaming operation.
+ * expects params to be valid.
+ * must receive dict, or cdict, or none, but not both.
+ * @return : 0, or an error code */
+size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs,
+ const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
+ const ZSTD_CDict* cdict,
+ ZSTD_CCtx_params params, unsigned long long pledgedSrcSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTDMT_COMPRESS_H */
+/**** ended inlining zstdmt_compress.h ****/
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*-*************************************
+* Constants
+***************************************/
+#define kSearchStrength 8
+#define HASH_READ_SIZE 8
+#define ZSTD_DUBT_UNSORTED_MARK 1 /* For btlazy2 strategy, index ZSTD_DUBT_UNSORTED_MARK==1 means "unsorted".
+ It could be confused for a real successor at index "1", if sorted as larger than its predecessor.
+ It's not a big deal though : candidate will just be sorted again.
+ Additionally, candidate position 1 will be lost.
+ But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
+ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy.
+ This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */
+
+
+/*-*************************************
+* Context memory management
+***************************************/
+typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e;
+typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage;
+
+typedef struct ZSTD_prefixDict_s {
+ const void* dict;
+ size_t dictSize;
+ ZSTD_dictContentType_e dictContentType;
+} ZSTD_prefixDict;
+
+typedef struct {
+ void* dictBuffer;
+ void const* dict;
+ size_t dictSize;
+ ZSTD_dictContentType_e dictContentType;
+ ZSTD_CDict* cdict;
+} ZSTD_localDict;
+
+typedef struct {
+ U32 CTable[HUF_CTABLE_SIZE_U32(255)];
+ HUF_repeat repeatMode;
+} ZSTD_hufCTables_t;
+
+typedef struct {
+ FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
+ FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
+ FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
+ FSE_repeat offcode_repeatMode;
+ FSE_repeat matchlength_repeatMode;
+ FSE_repeat litlength_repeatMode;
+} ZSTD_fseCTables_t;
+
+typedef struct {
+ ZSTD_hufCTables_t huf;
+ ZSTD_fseCTables_t fse;
+} ZSTD_entropyCTables_t;
+
+typedef struct {
+ U32 off;
+ U32 len;
+} ZSTD_match_t;
+
+typedef struct {
+ int price;
+ U32 off;
+ U32 mlen;
+ U32 litlen;
+ U32 rep[ZSTD_REP_NUM];
+} ZSTD_optimal_t;
+
+typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e;
+
+typedef struct {
+ /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
+ unsigned* litFreq; /* table of literals statistics, of size 256 */
+ unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */
+ unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */
+ unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */
+ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */
+ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
+
+ U32 litSum; /* nb of literals */
+ U32 litLengthSum; /* nb of litLength codes */
+ U32 matchLengthSum; /* nb of matchLength codes */
+ U32 offCodeSum; /* nb of offset codes */
+ U32 litSumBasePrice; /* to compare to log2(litfreq) */
+ U32 litLengthSumBasePrice; /* to compare to log2(llfreq) */
+ U32 matchLengthSumBasePrice;/* to compare to log2(mlfreq) */
+ U32 offCodeSumBasePrice; /* to compare to log2(offreq) */
+ ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */
+ const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */
+ ZSTD_literalCompressionMode_e literalCompressionMode;
+} optState_t;
+
+typedef struct {
+ ZSTD_entropyCTables_t entropy;
+ U32 rep[ZSTD_REP_NUM];
+} ZSTD_compressedBlockState_t;
+
+typedef struct {
+ BYTE const* nextSrc; /* next block here to continue on current prefix */
+ BYTE const* base; /* All regular indexes relative to this position */
+ BYTE const* dictBase; /* extDict indexes relative to this position */
+ U32 dictLimit; /* below that point, need extDict */
+ U32 lowLimit; /* below that point, no more valid data */
+} ZSTD_window_t;
+
+typedef struct ZSTD_matchState_t ZSTD_matchState_t;
+struct ZSTD_matchState_t {
+ ZSTD_window_t window; /* State for window round buffer management */
+ U32 loadedDictEnd; /* index of end of dictionary, within context's referential.
+ * When loadedDictEnd != 0, a dictionary is in use, and still valid.
+ * This relies on a mechanism to set loadedDictEnd=0 when dictionary is no longer within distance.
+ * Such mechanism is provided within ZSTD_window_enforceMaxDist() and ZSTD_checkDictValidity().
+ * When dict referential is copied into active context (i.e. not attached),
+ * loadedDictEnd == dictSize, since referential starts from zero.
+ */
+ U32 nextToUpdate; /* index from which to continue table update */
+ U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */
+ U32* hashTable;
+ U32* hashTable3;
+ U32* chainTable;
+ optState_t opt; /* optimal parser state */
+ const ZSTD_matchState_t* dictMatchState;
+ ZSTD_compressionParameters cParams;
+};
+
+typedef struct {
+ ZSTD_compressedBlockState_t* prevCBlock;
+ ZSTD_compressedBlockState_t* nextCBlock;
+ ZSTD_matchState_t matchState;
+} ZSTD_blockState_t;
+
+typedef struct {
+ U32 offset;
+ U32 checksum;
+} ldmEntry_t;
+
+typedef struct {
+ ZSTD_window_t window; /* State for the window round buffer management */
+ ldmEntry_t* hashTable;
+ U32 loadedDictEnd;
+ BYTE* bucketOffsets; /* Next position in bucket to insert entry */
+ U64 hashPower; /* Used to compute the rolling hash.
+ * Depends on ldmParams.minMatchLength */
+} ldmState_t;
+
+typedef struct {
+ U32 enableLdm; /* 1 if enable long distance matching */
+ U32 hashLog; /* Log size of hashTable */
+ U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */
+ U32 minMatchLength; /* Minimum match length */
+ U32 hashRateLog; /* Log number of entries to skip */
+ U32 windowLog; /* Window log for the LDM */
+} ldmParams_t;
+
+typedef struct {
+ U32 offset;
+ U32 litLength;
+ U32 matchLength;
+} rawSeq;
+
+typedef struct {
+ rawSeq* seq; /* The start of the sequences */
+ size_t pos; /* The position where reading stopped. <= size. */
+ size_t size; /* The number of sequences. <= capacity. */
+ size_t capacity; /* The capacity starting from `seq` pointer */
+} rawSeqStore_t;
+
+typedef struct {
+ int collectSequences;
+ ZSTD_Sequence* seqStart;
+ size_t seqIndex;
+ size_t maxSequences;
+} SeqCollector;
+
+struct ZSTD_CCtx_params_s {
+ ZSTD_format_e format;
+ ZSTD_compressionParameters cParams;
+ ZSTD_frameParameters fParams;
+
+ int compressionLevel;
+ int forceWindow; /* force back-references to respect limit of
+ * 1<<wLog, even for dictionary */
+ size_t targetCBlockSize; /* Tries to fit compressed block size to be around targetCBlockSize.
+ * No target when targetCBlockSize == 0.
+ * There is no guarantee on compressed block size */
+ int srcSizeHint; /* User's best guess of source size.
+ * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that hint is close to actual source size */
+
+ ZSTD_dictAttachPref_e attachDictPref;
+ ZSTD_literalCompressionMode_e literalCompressionMode;
+
+ /* Multithreading: used to pass parameters to mtctx */
+ int nbWorkers;
+ size_t jobSize;
+ int overlapLog;
+ int rsyncable;
+
+ /* Long distance matching parameters */
+ ldmParams_t ldmParams;
+
+ /* Internal use, for createCCtxParams() and freeCCtxParams() only */
+ ZSTD_customMem customMem;
+}; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
+
+struct ZSTD_CCtx_s {
+ ZSTD_compressionStage_e stage;
+ int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
+ int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+ ZSTD_CCtx_params requestedParams;
+ ZSTD_CCtx_params appliedParams;
+ U32 dictID;
+
+ ZSTD_cwksp workspace; /* manages buffer for dynamic allocations */
+ size_t blockSize;
+ unsigned long long pledgedSrcSizePlusOne; /* this way, 0 (default) == unknown */
+ unsigned long long consumedSrcSize;
+ unsigned long long producedCSize;
+ XXH64_state_t xxhState;
+ ZSTD_customMem customMem;
+ size_t staticSize;
+ SeqCollector seqCollector;
+ int isFirstBlock;
+ int initialized;
+
+ seqStore_t seqStore; /* sequences storage ptrs */
+ ldmState_t ldmState; /* long distance matching state */
+ rawSeq* ldmSequences; /* Storage for the ldm output sequences */
+ size_t maxNbLdmSequences;
+ rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */
+ ZSTD_blockState_t blockState;
+ U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */
+
+ /* streaming */
+ char* inBuff;
+ size_t inBuffSize;
+ size_t inToCompress;
+ size_t inBuffPos;
+ size_t inBuffTarget;
+ char* outBuff;
+ size_t outBuffSize;
+ size_t outBuffContentSize;
+ size_t outBuffFlushedSize;
+ ZSTD_cStreamStage streamStage;
+ U32 frameEnded;
+
+ /* Dictionary */
+ ZSTD_localDict localDict;
+ const ZSTD_CDict* cdict;
+ ZSTD_prefixDict prefixDict; /* single-usage dictionary */
+
+ /* Multi-threading */
+#ifdef ZSTD_MULTITHREAD
+ ZSTDMT_CCtx* mtctx;
+#endif
+};
+
+typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
+
+typedef enum { ZSTD_noDict = 0, ZSTD_extDict = 1, ZSTD_dictMatchState = 2 } ZSTD_dictMode_e;
+
+
+typedef size_t (*ZSTD_blockCompressor) (
+ ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode);
+
+
+MEM_STATIC U32 ZSTD_LLcode(U32 litLength)
+{
+ static const BYTE LL_Code[64] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 16, 17, 17, 18, 18, 19, 19,
+ 20, 20, 20, 20, 21, 21, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 23,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24 };
+ static const U32 LL_deltaCode = 19;
+ return (litLength > 63) ? ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength];
+}
+
+/* ZSTD_MLcode() :
+ * note : mlBase = matchLength - MINMATCH;
+ * because that's the format in which it's stored in seqStore->sequences */
+MEM_STATIC U32 ZSTD_MLcode(U32 mlBase)
+{
+ static const BYTE ML_Code[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37,
+ 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39,
+ 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+ 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 };
+ static const U32 ML_deltaCode = 36;
+ return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase];
+}
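+
+/* Worked examples (illustrative): small lengths map directly through the
+ * tables above, large lengths fall into logarithmic buckets.
+ *   ZSTD_LLcode(7)   == 7    (direct entry)
+ *   ZSTD_LLcode(32)  == 22   (LL_Code[32])
+ *   ZSTD_LLcode(100) == 25   (100 > 63 : ZSTD_highbit32(100) = 6, plus LL_deltaCode = 19)
+ *   ZSTD_MLcode(50)  == 38   (ML_Code[50])
+ *   ZSTD_MLcode(200) == 43   (200 > 127 : ZSTD_highbit32(200) = 7, plus ML_deltaCode = 36)
+ */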
+
+typedef struct repcodes_s {
+ U32 rep[3];
+} repcodes_t;
+
+MEM_STATIC repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
+{
+ repcodes_t newReps;
+ if (offset >= ZSTD_REP_NUM) { /* full offset */
+ newReps.rep[2] = rep[1];
+ newReps.rep[1] = rep[0];
+ newReps.rep[0] = offset - ZSTD_REP_MOVE;
+ } else { /* repcode */
+ U32 const repCode = offset + ll0;
+ if (repCode > 0) { /* note : if repCode==0, no change */
+ U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+ newReps.rep[2] = (repCode >= 2) ? rep[1] : rep[2];
+ newReps.rep[1] = rep[0];
+ newReps.rep[0] = currentOffset;
+ } else { /* repCode == 0 */
+ memcpy(&newReps, rep, sizeof(newReps));
+ }
+ }
+ return newReps;
+}
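+
+/* Worked example (illustrative), with rep = {r0, r1, r2} and ZSTD_REP_NUM == 3:
+ *   full offset o (o >= ZSTD_REP_NUM), any ll0 : new rep = { o - ZSTD_REP_MOVE, r0, r1 }
+ *   offset 0, ll0 == 0 (repCode 0)             : rep unchanged
+ *   offset 1, ll0 == 0 (repCode 1)             : new rep = { r1, r0, r2 }
+ *   offset 2, ll0 == 1 (repCode 3, "rep0 - 1") : new rep = { r0 - 1, r0, r1 }
+ */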
+
+/* ZSTD_cParam_withinBounds:
+ * @return 1 if value is within cParam bounds,
+ * 0 otherwise */
+MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
+{
+ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);
+ if (ZSTD_isError(bounds.error)) return 0;
+ if (value < bounds.lowerBound) return 0;
+ if (value > bounds.upperBound) return 0;
+ return 1;
+}
+
+/* ZSTD_noCompressBlock() :
+ * Writes uncompressed block to dst buffer from given src.
+ * Returns the size of the block */
+MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
+{
+ U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
+ RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
+ dstSize_tooSmall, "dst buf too small for uncompressed block");
+ MEM_writeLE24(dst, cBlockHeader24);
+ memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize);
+ return ZSTD_blockHeaderSize + srcSize;
+}
+
+MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
+{
+ BYTE* const op = (BYTE*)dst;
+ U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3);
+ RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, "");
+ MEM_writeLE24(op, cBlockHeader);
+ op[3] = src;
+ return 4;
+}
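+
+/* Worked example (illustrative), assuming bt_raw == 0 and bt_rle == 1 as
+ * defined in zstd_internal.h: an RLE block of 100 bytes of 0xAA that is also
+ * the last block produces
+ *   cBlockHeader = 1 + (1 << 1) + (100 << 3) = 803 = 0x000323
+ * written little-endian as 23 03 00, followed by the single byte AA,
+ * for a total of 4 bytes (the value returned by ZSTD_rleCompressBlock()).
+ */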
+
+
+/* ZSTD_minGain() :
+ * minimum compression required
+ * to generate a compressed block or a compressed literals section.
+ * note : use same formula for both situations */
+MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
+{
+ U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6;
+ ZSTD_STATIC_ASSERT(ZSTD_btultra == 8);
+ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
+ return (srcSize >> minlog) + 2;
+}
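+
+/* Worked example (illustrative): for srcSize == 4096,
+ *   any strategy below ZSTD_btultra : minlog = 6, minGain = (4096 >> 6) + 2 = 66 bytes
+ *   ZSTD_btultra2 (== 9)            : minlog = 8, minGain = (4096 >> 8) + 2 = 18 bytes
+ * i.e. stronger strategies accept a smaller saving before emitting a compressed
+ * block or compressed literals.
+ */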
+
+MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams)
+{
+ switch (cctxParams->literalCompressionMode) {
+ case ZSTD_lcm_huffman:
+ return 0;
+ case ZSTD_lcm_uncompressed:
+ return 1;
+ default:
+ assert(0 /* impossible: pre-validated */);
+ /* fall-through */
+ case ZSTD_lcm_auto:
+ return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0);
+ }
+}
+
+/*! ZSTD_safecopyLiterals() :
+ * memcpy() variant that won't read more than WILDCOPY_OVERLENGTH bytes past ilimit_w.
+ * Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single
+ * large copies.
+ */
+static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) {
+ assert(iend > ilimit_w);
+ if (ip <= ilimit_w) {
+ ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap);
+ op += ilimit_w - ip;
+ ip = ilimit_w;
+ }
+ while (ip < iend) *op++ = *ip++;
+}
+
+/*! ZSTD_storeSeq() :
+ * Store a sequence (litlen, litPtr, offCode and mlBase) into seqStore_t.
+ * `offCode` : distance to match + ZSTD_REP_MOVE (values <= ZSTD_REP_MOVE are repCodes).
+ * `mlBase` : matchLength - MINMATCH
+ * Allowed to overread literals up to litLimit.
+*/
+HINT_INLINE UNUSED_ATTR
+void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, const BYTE* litLimit, U32 offCode, size_t mlBase)
+{
+ BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;
+ BYTE const* const litEnd = literals + litLength;
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL >= 6)
+ static const BYTE* g_start = NULL;
+ if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */
+ { U32 const pos = (U32)((const BYTE*)literals - g_start);
+ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u",
+ pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offCode);
+ }
+#endif
+ assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
+ /* copy Literals */
+ assert(seqStorePtr->maxNbLit <= 128 KB);
+ assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit);
+ assert(literals + litLength <= litLimit);
+ if (litEnd <= litLimit_w) {
+ /* Common case we can use wildcopy.
+ * First copy 16 bytes, because literals are likely short.
+ */
+ assert(WILDCOPY_OVERLENGTH >= 16);
+ ZSTD_copy16(seqStorePtr->lit, literals);
+ if (litLength > 16) {
+ ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap);
+ }
+ } else {
+ ZSTD_safecopyLiterals(seqStorePtr->lit, literals, litEnd, litLimit_w);
+ }
+ seqStorePtr->lit += litLength;
+
+ /* literal Length */
+ if (litLength>0xFFFF) {
+ assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */
+ seqStorePtr->longLengthID = 1;
+ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ }
+ seqStorePtr->sequences[0].litLength = (U16)litLength;
+
+ /* match offset */
+ seqStorePtr->sequences[0].offset = offCode + 1;
+
+ /* match Length */
+ if (mlBase>0xFFFF) {
+ assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */
+ seqStorePtr->longLengthID = 2;
+ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ }
+ seqStorePtr->sequences[0].matchLength = (U16)mlBase;
+
+ seqStorePtr->sequences++;
+}
+
+
+/*-*************************************
+* Match length counter
+***************************************/
+static unsigned ZSTD_NbCommonBytes (size_t val)
+{
+ if (MEM_isLittleEndian()) {
+ if (MEM_64bits()) {
+# if defined(_MSC_VER) && defined(_WIN64)
+ unsigned long r = 0;
+ return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 4)
+ return (__builtin_ctzll((U64)val) >> 3);
+# else
+ static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
+ 0, 3, 1, 3, 1, 4, 2, 7,
+ 0, 2, 3, 6, 1, 5, 3, 5,
+ 1, 3, 4, 4, 2, 5, 6, 7,
+ 7, 0, 1, 2, 3, 3, 4, 6,
+ 2, 6, 5, 5, 3, 4, 5, 6,
+ 7, 1, 2, 4, 6, 4, 4, 5,
+ 7, 2, 6, 5, 7, 6, 7, 7 };
+ return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+# endif
+ } else { /* 32 bits */
+# if defined(_MSC_VER)
+ unsigned long r=0;
+ return _BitScanForward( &r, (U32)val ) ? (unsigned)(r >> 3) : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 3)
+ return (__builtin_ctz((U32)val) >> 3);
+# else
+ static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
+ 3, 2, 2, 1, 3, 2, 0, 1,
+ 3, 3, 1, 2, 2, 2, 2, 0,
+ 3, 1, 2, 0, 1, 0, 1, 1 };
+ return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+# endif
+ }
+ } else { /* Big Endian CPU */
+ if (MEM_64bits()) {
+# if defined(_MSC_VER) && defined(_WIN64)
+ unsigned long r = 0;
+ return _BitScanReverse64( &r, val ) ? (unsigned)(r >> 3) : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 4)
+ return (__builtin_clzll(val) >> 3);
+# else
+ unsigned r;
+ const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
+ if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
+ if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+ r += (!val);
+ return r;
+# endif
+ } else { /* 32 bits */
+# if defined(_MSC_VER)
+ unsigned long r = 0;
+ return _BitScanReverse( &r, (unsigned long)val ) ? (unsigned)(r >> 3) : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 3)
+ return (__builtin_clz((U32)val) >> 3);
+# else
+ unsigned r;
+ if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+ r += (!val);
+ return r;
+# endif
+ } }
+}
+
+
+MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
+{
+ const BYTE* const pStart = pIn;
+ const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1);
+
+ if (pIn < pInLoopLimit) {
+ { size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+ if (diff) return ZSTD_NbCommonBytes(diff); }
+ pIn+=sizeof(size_t); pMatch+=sizeof(size_t);
+ while (pIn < pInLoopLimit) {
+ size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+ if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
+ pIn += ZSTD_NbCommonBytes(diff);
+ return (size_t)(pIn - pStart);
+ } }
+ if (MEM_64bits() && (pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
+ if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
+ if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
+ return (size_t)(pIn - pStart);
+}
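+
+/* Illustrative example (not part of the library): ZSTD_count() returns the
+ * length of the common prefix of two buffers, never reading at or beyond pInLimit.
+ *
+ *   const BYTE a[] = "abcdefgh12345678XYZ";
+ *   const BYTE b[] = "abcdefgh12345678ABC";
+ *   size_t const len = ZSTD_count(a, b, a + 19);   // == 16
+ *
+ * The XOR-then-ZSTD_NbCommonBytes() trick compares sizeof(size_t) bytes per
+ * iteration; the first non-zero XOR word pinpoints the mismatching byte.
+ */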
+
+/** ZSTD_count_2segments() :
+ * can count match length with `ip` & `match` in 2 different segments.
+ * convention : on reaching mEnd, match count continues starting from iStart
+ */
+MEM_STATIC size_t
+ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
+ const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
+{
+ const BYTE* const vEnd = MIN( ip + (mEnd - match), iEnd);
+ size_t const matchLength = ZSTD_count(ip, match, vEnd);
+ if (match + matchLength != mEnd) return matchLength;
+ DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength);
+ DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match);
+ DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip);
+ DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart);
+ DEBUGLOG(7, "final match length = %zu", matchLength + ZSTD_count(ip+matchLength, iStart, iEnd));
+ return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd);
+}
+
+
+/*-*************************************
+ * Hashes
+ ***************************************/
+static const U32 prime3bytes = 506832829U;
+static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
+
+static const U32 prime4bytes = 2654435761U;
+static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
+
+static const U64 prime5bytes = 889523592379ULL;
+static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
+
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+
+static const U64 prime7bytes = 58295818150454627ULL;
+static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
+MEM_STATIC size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
+{
+ switch(mls)
+ {
+ default:
+ case 4: return ZSTD_hash4Ptr(p, hBits);
+ case 5: return ZSTD_hash5Ptr(p, hBits);
+ case 6: return ZSTD_hash6Ptr(p, hBits);
+ case 7: return ZSTD_hash7Ptr(p, hBits);
+ case 8: return ZSTD_hash8Ptr(p, hBits);
+ }
+}
+
+/** ZSTD_ipow() :
+ * Return base^exponent.
+ */
+static U64 ZSTD_ipow(U64 base, U64 exponent)
+{
+ U64 power = 1;
+ while (exponent) {
+ if (exponent & 1) power *= base;
+ exponent >>= 1;
+ base *= base;
+ }
+ return power;
+}
+
+#define ZSTD_ROLL_HASH_CHAR_OFFSET 10
+
+/** ZSTD_rollingHash_append() :
+ * Add the buffer to the hash value.
+ */
+static U64 ZSTD_rollingHash_append(U64 hash, void const* buf, size_t size)
+{
+ BYTE const* istart = (BYTE const*)buf;
+ size_t pos;
+ for (pos = 0; pos < size; ++pos) {
+ hash *= prime8bytes;
+ hash += istart[pos] + ZSTD_ROLL_HASH_CHAR_OFFSET;
+ }
+ return hash;
+}
+
+/** ZSTD_rollingHash_compute() :
+ * Compute the rolling hash value of the buffer.
+ */
+MEM_STATIC U64 ZSTD_rollingHash_compute(void const* buf, size_t size)
+{
+ return ZSTD_rollingHash_append(0, buf, size);
+}
+
+/** ZSTD_rollingHash_primePower() :
+ * Compute the primePower to be passed to ZSTD_rollingHash_rotate() for a hash
+ * over a window of length bytes.
+ */
+MEM_STATIC U64 ZSTD_rollingHash_primePower(U32 length)
+{
+ return ZSTD_ipow(prime8bytes, length - 1);
+}
+
+/** ZSTD_rollingHash_rotate() :
+ * Rotate the rolling hash by one byte.
+ */
+MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 primePower)
+{
+ hash -= (toRemove + ZSTD_ROLL_HASH_CHAR_OFFSET) * primePower;
+ hash *= prime8bytes;
+ hash += toAdd + ZSTD_ROLL_HASH_CHAR_OFFSET;
+ return hash;
+}
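+
+/* Illustrative example (not part of the library): maintaining a rolling hash
+ * over a 16-byte window of a hypothetical buffer `buf`.
+ *
+ *   U64 const primePower = ZSTD_rollingHash_primePower(16);
+ *   U64 hash = ZSTD_rollingHash_compute(buf, 16);                    // hash of buf[0..15]
+ *   hash = ZSTD_rollingHash_rotate(hash, buf[0], buf[16], primePower);
+ *   // hash now equals ZSTD_rollingHash_compute(buf + 1, 16),
+ *   // without rehashing the whole window.
+ */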
+
+/*-*************************************
+* Round buffer management
+***************************************/
+#if (ZSTD_WINDOWLOG_MAX_64 > 31)
+# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX"
+#endif
+/* Max current allowed */
+#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX))
+/* Maximum chunk size before overflow correction needs to be called again */
+#define ZSTD_CHUNKSIZE_MAX \
+ ( ((U32)-1) /* Maximum ending current index */ \
+ - ZSTD_CURRENT_MAX) /* Maximum beginning lowLimit */
+
+/**
+ * ZSTD_window_clear():
+ * Clears the window containing the history by simply setting it to empty.
+ */
+MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window)
+{
+ size_t const endT = (size_t)(window->nextSrc - window->base);
+ U32 const end = (U32)endT;
+
+ window->lowLimit = end;
+ window->dictLimit = end;
+}
+
+/**
+ * ZSTD_window_hasExtDict():
+ * Returns non-zero if the window has a non-empty extDict.
+ */
+MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window)
+{
+ return window.lowLimit < window.dictLimit;
+}
+
+/**
+ * ZSTD_matchState_dictMode():
+ * Inspects the provided matchState and figures out what dictMode should be
+ * passed to the compressor.
+ */
+MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms)
+{
+ return ZSTD_window_hasExtDict(ms->window) ?
+ ZSTD_extDict :
+ ms->dictMatchState != NULL ?
+ ZSTD_dictMatchState :
+ ZSTD_noDict;
+}
+
+/**
+ * ZSTD_window_needOverflowCorrection():
+ * Returns non-zero if the indices are getting too large and need overflow
+ * protection.
+ */
+MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
+ void const* srcEnd)
+{
+ U32 const current = (U32)((BYTE const*)srcEnd - window.base);
+ return current > ZSTD_CURRENT_MAX;
+}
+
+/**
+ * ZSTD_window_correctOverflow():
+ * Reduces the indices to protect from index overflow.
+ * Returns the correction made to the indices, which must be applied to every
+ * stored index.
+ *
+ * The least significant cycleLog bits of the indices must remain the same,
+ * which may be 0. Every index up to maxDist in the past must be valid.
+ * NOTE: (maxDist & cycleMask) must be zero.
+ */
+MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
+ U32 maxDist, void const* src)
+{
+ /* preemptive overflow correction:
+ * 1. correction is large enough:
+ * lowLimit > (3<<29) ==> current > 3<<29 + 1<<windowLog
+ * 1<<windowLog <= newCurrent < 1<<chainLog + 1<<windowLog
+ *
+ * current - newCurrent
+ * > (3<<29 + 1<<windowLog) - (1<<windowLog + 1<<chainLog)
+ * > (3<<29) - (1<<chainLog)
+ * > (3<<29) - (1<<30) (NOTE: chainLog <= 30)
+ * > 1<<29
+ *
+ * 2. (ip+ZSTD_CHUNKSIZE_MAX - cctx->base) doesn't overflow:
+ * After correction, current is less than (1<<chainLog + 1<<windowLog).
+ * In 64-bit mode we are safe, because we have 64-bit ptrdiff_t.
+ * In 32-bit mode we are safe, because (chainLog <= 29), so
+ * ip+ZSTD_CHUNKSIZE_MAX - cctx->base < 1<<32.
+ * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32:
+ * windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32.
+ */
+ U32 const cycleMask = (1U << cycleLog) - 1;
+ U32 const current = (U32)((BYTE const*)src - window->base);
+ U32 const currentCycle0 = current & cycleMask;
+ /* Exclude zero so that newCurrent - maxDist >= 1. */
+ U32 const currentCycle1 = currentCycle0 == 0 ? (1U << cycleLog) : currentCycle0;
+ U32 const newCurrent = currentCycle1 + maxDist;
+ U32 const correction = current - newCurrent;
+ assert((maxDist & cycleMask) == 0);
+ assert(current > newCurrent);
+ /* Loose bound, should be around 1<<29 (see above) */
+ assert(correction > 1<<28);
+
+ window->base += correction;
+ window->dictBase += correction;
+ if (window->lowLimit <= correction) window->lowLimit = 1;
+ else window->lowLimit -= correction;
+ if (window->dictLimit <= correction) window->dictLimit = 1;
+ else window->dictLimit -= correction;
+
+ /* Ensure we can still reference the full window. */
+ assert(newCurrent >= maxDist);
+ assert(newCurrent - maxDist >= 1);
+ /* Ensure that lowLimit and dictLimit didn't underflow. */
+ assert(window->lowLimit <= newCurrent);
+ assert(window->dictLimit <= newCurrent);
+
+ DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction,
+ window->lowLimit);
+ return correction;
+}
+
+/**
+ * ZSTD_window_enforceMaxDist():
+ * Updates lowLimit so that:
+ * (srcEnd - base) - lowLimit == maxDist + loadedDictEnd
+ *
+ * It ensures index is valid as long as index >= lowLimit.
+ * This must be called before a block compression call.
+ *
+ * loadedDictEnd is only defined if a dictionary is in use for current compression.
+ * As the name implies, loadedDictEnd represents the index at end of dictionary.
+ * The value lies within context's referential, it can be directly compared to blockEndIdx.
+ *
+ * If loadedDictEndPtr is NULL, no dictionary is in use, and we use loadedDictEnd == 0.
+ * If loadedDictEndPtr is not NULL, we set it to zero after updating lowLimit.
+ * This is because dictionaries are allowed to be referenced fully
+ * as long as the last byte of the dictionary is in the window.
+ * Once input has progressed beyond window size, dictionary cannot be referenced anymore.
+ *
+ * In normal dict mode, the dictionary lies between lowLimit and dictLimit.
+ * In dictMatchState mode, lowLimit and dictLimit are the same,
+ * and the dictionary is below them.
+ * forceWindow and dictMatchState are therefore incompatible.
+ */
+MEM_STATIC void
+ZSTD_window_enforceMaxDist(ZSTD_window_t* window,
+ const void* blockEnd,
+ U32 maxDist,
+ U32* loadedDictEndPtr,
+ const ZSTD_matchState_t** dictMatchStatePtr)
+{
+ U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base);
+ U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? *loadedDictEndPtr : 0;
+ DEBUGLOG(5, "ZSTD_window_enforceMaxDist: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u",
+ (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
+
+ /* - When there is no dictionary : loadedDictEnd == 0.
+ In which case, the test (blockEndIdx > maxDist) is merely to avoid
+         overflowing the next operation `newLowLimit = blockEndIdx - maxDist`.
+ - When there is a standard dictionary :
+ Index referential is copied from the dictionary,
+ which means it starts from 0.
+ In which case, loadedDictEnd == dictSize,
+ and it makes sense to compare `blockEndIdx > maxDist + dictSize`
+ since `blockEndIdx` also starts from zero.
+ - When there is an attached dictionary :
+ loadedDictEnd is expressed within the referential of the context,
+ so it can be directly compared against blockEndIdx.
+ */
+ if (blockEndIdx > maxDist + loadedDictEnd) {
+ U32 const newLowLimit = blockEndIdx - maxDist;
+ if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit;
+ if (window->dictLimit < window->lowLimit) {
+ DEBUGLOG(5, "Update dictLimit to match lowLimit, from %u to %u",
+ (unsigned)window->dictLimit, (unsigned)window->lowLimit);
+ window->dictLimit = window->lowLimit;
+ }
+ /* On reaching window size, dictionaries are invalidated */
+ if (loadedDictEndPtr) *loadedDictEndPtr = 0;
+ if (dictMatchStatePtr) *dictMatchStatePtr = NULL;
+ }
+}
+
+/* Similar to ZSTD_window_enforceMaxDist(),
+ * but only invalidates dictionary
+ * when input progresses beyond window size.
+ * assumption : loadedDictEndPtr and dictMatchStatePtr are valid (non-NULL)
+ * loadedDictEnd uses same referential as window->base
+ * maxDist is the window size */
+MEM_STATIC void
+ZSTD_checkDictValidity(const ZSTD_window_t* window,
+ const void* blockEnd,
+ U32 maxDist,
+ U32* loadedDictEndPtr,
+ const ZSTD_matchState_t** dictMatchStatePtr)
+{
+ assert(loadedDictEndPtr != NULL);
+ assert(dictMatchStatePtr != NULL);
+ { U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base);
+ U32 const loadedDictEnd = *loadedDictEndPtr;
+ DEBUGLOG(5, "ZSTD_checkDictValidity: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u",
+ (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
+ assert(blockEndIdx >= loadedDictEnd);
+
+ if (blockEndIdx > loadedDictEnd + maxDist) {
+ /* On reaching window size, dictionaries are invalidated.
+             * For simplification, if the window size is reached anywhere within the next block,
+ * the dictionary is invalidated for the full block.
+ */
+ DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)");
+ *loadedDictEndPtr = 0;
+ *dictMatchStatePtr = NULL;
+ } else {
+ if (*loadedDictEndPtr != 0) {
+ DEBUGLOG(6, "dictionary considered valid for current block");
+ } } }
+}
+
+MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {
+ memset(window, 0, sizeof(*window));
+ window->base = (BYTE const*)"";
+ window->dictBase = (BYTE const*)"";
+ window->dictLimit = 1; /* start from 1, so that 1st position is valid */
+ window->lowLimit = 1; /* it ensures first and later CCtx usages compress the same */
+ window->nextSrc = window->base + 1; /* see issue #1241 */
+}
+
+/**
+ * ZSTD_window_update():
+ * Updates the window by appending [src, src + srcSize) to the window.
+ * If it is not contiguous, the current prefix becomes the extDict, and we
+ * forget about the old extDict. Handles overlap of the prefix and extDict.
+ * Returns non-zero if the segment is contiguous.
+ */
+MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
+ void const* src, size_t srcSize)
+{
+ BYTE const* const ip = (BYTE const*)src;
+ U32 contiguous = 1;
+ DEBUGLOG(5, "ZSTD_window_update");
+ if (srcSize == 0)
+ return contiguous;
+ assert(window->base != NULL);
+ assert(window->dictBase != NULL);
+ /* Check if blocks follow each other */
+ if (src != window->nextSrc) {
+ /* not contiguous */
+ size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);
+ DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit);
+ window->lowLimit = window->dictLimit;
+ assert(distanceFromBase == (size_t)(U32)distanceFromBase); /* should never overflow */
+ window->dictLimit = (U32)distanceFromBase;
+ window->dictBase = window->base;
+ window->base = ip - distanceFromBase;
+ /* ms->nextToUpdate = window->dictLimit; */
+ if (window->dictLimit - window->lowLimit < HASH_READ_SIZE) window->lowLimit = window->dictLimit; /* too small extDict */
+ contiguous = 0;
+ }
+ window->nextSrc = ip + srcSize;
+ /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */
+ if ( (ip+srcSize > window->dictBase + window->lowLimit)
+ & (ip < window->dictBase + window->dictLimit)) {
+ ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase;
+ U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx;
+ window->lowLimit = lowLimitMax;
+ DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit);
+ }
+ return contiguous;
+}
+
+/**
+ * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix.
+ */
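+/* Illustrative note (added, not upstream): with windowLog=20, current=5000000,
+ * lowLimit=1 and no loaded dictionary, current - lowestValid exceeds 1<<20, so the
+ * lowest allowed match index is current - (1<<20) = 3951424. When a dictionary is
+ * loaded (loadedDictEnd != 0), the full range down to lowLimit stays referenceable. */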
+MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 current, unsigned windowLog)
+{
+ U32 const maxDistance = 1U << windowLog;
+ U32 const lowestValid = ms->window.lowLimit;
+ U32 const withinWindow = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
+ U32 const isDictionary = (ms->loadedDictEnd != 0);
+ U32 const matchLowest = isDictionary ? lowestValid : withinWindow;
+ return matchLowest;
+}
+
+/**
+ * Returns the lowest allowed match index in the prefix.
+ */
+MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 current, unsigned windowLog)
+{
+ U32 const maxDistance = 1U << windowLog;
+ U32 const lowestValid = ms->window.dictLimit;
+ U32 const withinWindow = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
+ U32 const isDictionary = (ms->loadedDictEnd != 0);
+ U32 const matchLowest = isDictionary ? lowestValid : withinWindow;
+ return matchLowest;
+}
+
+
+
+/* debug functions */
+#if (DEBUGLEVEL>=2)
+
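+/* Added explanatory note (not upstream): ZSTD_fWeight(x) approximates log2(x+1),
+ * up to a constant offset, in 1/256 steps: the high bit gives the integer part and a
+ * linear interpolation gives the fraction. ZSTD_debugTable() below uses the difference
+ * ZSTD_fWeight(sum) - ZSTD_fWeight(count) as an estimate of the per-symbol bit cost
+ * -log2(count/sum). */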
+MEM_STATIC double ZSTD_fWeight(U32 rawStat)
+{
+ U32 const fp_accuracy = 8;
+ U32 const fp_multiplier = (1 << fp_accuracy);
+ U32 const newStat = rawStat + 1;
+ U32 const hb = ZSTD_highbit32(newStat);
+ U32 const BWeight = hb * fp_multiplier;
+ U32 const FWeight = (newStat << fp_accuracy) >> hb;
+ U32 const weight = BWeight + FWeight;
+ assert(hb + fp_accuracy < 31);
+ return (double)weight / fp_multiplier;
+}
+
+/* display a table content,
+ * listing each element, its frequency, and its predicted bit cost */
+MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
+{
+ unsigned u, sum;
+ for (u=0, sum=0; u<=max; u++) sum += table[u];
+ DEBUGLOG(2, "total nb elts: %u", sum);
+ for (u=0; u<=max; u++) {
+ DEBUGLOG(2, "%2u: %5u (%.2f)",
+ u, table[u], ZSTD_fWeight(sum) - ZSTD_fWeight(table[u]) );
+ }
+}
+
+#endif
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+/* ===============================================================
+ * Shared internal declarations
+ * These prototypes may be called from sources not in lib/compress
+ * =============================================================== */
+
+/* ZSTD_loadCEntropy() :
+ * dict : must point at the beginning of a valid zstd dictionary.
+ * return : size of dictionary header (size of magic number + dict ID + entropy tables)
+ * assumptions : the magic number is assumed to have been checked already
+ * and dictSize >= 8 */
+size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
+ short* offcodeNCount, unsigned* offcodeMaxValue,
+ const void* const dict, size_t dictSize);
+
+void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs);
+
+/* ==============================================================
+ * Private declarations
+ * These prototypes shall only be called from within lib/compress
+ * ============================================================== */
+
+/* ZSTD_getCParamsFromCCtxParams() :
+ * cParams are built depending on compressionLevel, src size hints,
+ * LDM and manually set compression parameters.
+ * Note: srcSizeHint == 0 means a source size of exactly 0 (it is not interpreted as "unknown")!
+ */
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize);
+
+/*! ZSTD_initCStream_internal() :
+ * Private use only. Init streaming operation.
+ * expects params to be valid.
+ * must receive dict, or cdict, or none, but not both.
+ * @return : 0, or an error code */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize);
+
+void ZSTD_resetSeqStore(seqStore_t* ssPtr);
+
+/*! ZSTD_getCParamsFromCDict() :
+ * as the name implies */
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict);
+
+/* ZSTD_compressBegin_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
+ const void* dict, size_t dictSize,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params,
+ unsigned long long pledgedSrcSize);
+
+/* ZSTD_compress_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compress_advanced_internal(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ const ZSTD_CCtx_params* params);
+
+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ * or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
+
+
+/* ZSTD_referenceExternalSequences() :
+ * Must be called before starting a compression operation.
+ * seqs must parse a prefix of the source.
+ * This cannot be used when long range matching is enabled.
+ * Zstd will use these sequences, and pass the literals to a secondary block
+ * compressor.
+ * @return : An error code on failure.
+ * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
+ * access and data corruption.
+ */
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
+
+/** ZSTD_cycleLog() :
+ * condition for correct operation : hashLog > 1 */
+U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
+
+#endif /* ZSTD_COMPRESS_H */
+/**** ended inlining zstd_compress_internal.h ****/
+
+
+size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+ ZSTD_hufCTables_t* nextHuf,
+ ZSTD_strategy strategy, int disableLiteralCompression,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ void* entropyWorkspace, size_t entropyWorkspaceSize,
+ const int bmi2);
+
+#endif /* ZSTD_COMPRESS_LITERALS_H */
+/**** ended inlining zstd_compress_literals.h ****/
+
+size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ BYTE* const ostart = (BYTE* const)dst;
+ U32 const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+ RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");
+
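+    /* Added note (not upstream): the "2 - 1 - 5" style comments below give the header
+     * bit layout: 2 bits for the literals block type, 1 or 2 bits for the size format,
+     * and the remaining 5/12/20 bits for the regenerated size, written little-endian. */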
+ switch(flSize)
+ {
+ case 1: /* 2 - 1 - 5 */
+ ostart[0] = (BYTE)((U32)set_basic + (srcSize<<3));
+ break;
+ case 2: /* 2 - 2 - 12 */
+ MEM_writeLE16(ostart, (U16)((U32)set_basic + (1<<2) + (srcSize<<4)));
+ break;
+ case 3: /* 2 - 2 - 20 */
+ MEM_writeLE32(ostart, (U32)((U32)set_basic + (3<<2) + (srcSize<<4)));
+ break;
+ default: /* not necessary : flSize is {1,2,3} */
+ assert(0);
+ }
+
+ memcpy(ostart + flSize, src, srcSize);
+ DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+ return srcSize + flSize;
+}
+
+size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ BYTE* const ostart = (BYTE* const)dst;
+ U32 const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+ (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */
+
+ switch(flSize)
+ {
+ case 1: /* 2 - 1 - 5 */
+ ostart[0] = (BYTE)((U32)set_rle + (srcSize<<3));
+ break;
+ case 2: /* 2 - 2 - 12 */
+ MEM_writeLE16(ostart, (U16)((U32)set_rle + (1<<2) + (srcSize<<4)));
+ break;
+ case 3: /* 2 - 2 - 20 */
+ MEM_writeLE32(ostart, (U32)((U32)set_rle + (3<<2) + (srcSize<<4)));
+ break;
+ default: /* not necessary : flSize is {1,2,3} */
+ assert(0);
+ }
+
+ ostart[flSize] = *(const BYTE*)src;
+ DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1);
+ return flSize+1;
+}
+
+size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+ ZSTD_hufCTables_t* nextHuf,
+ ZSTD_strategy strategy, int disableLiteralCompression,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ void* entropyWorkspace, size_t entropyWorkspaceSize,
+ const int bmi2)
+{
+ size_t const minGain = ZSTD_minGain(srcSize, strategy);
+ size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
+ BYTE* const ostart = (BYTE*)dst;
+ U32 singleStream = srcSize < 256;
+ symbolEncodingType_e hType = set_compressed;
+ size_t cLitSize;
+
+ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)",
+ disableLiteralCompression, (U32)srcSize);
+
+ /* Prepare nextEntropy assuming reusing the existing table */
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+
+ if (disableLiteralCompression)
+ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+
+ /* small ? don't even attempt compression (speed opt) */
+# define COMPRESS_LITERALS_SIZE_MIN 63
+ { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
+ if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+ }
+
+ RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression");
+ { HUF_repeat repeat = prevHuf->repeatMode;
+ int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
+ if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
+ cLitSize = singleStream ?
+ HUF_compress1X_repeat(
+ ostart+lhSize, dstCapacity-lhSize, src, srcSize,
+ HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
+ (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) :
+ HUF_compress4X_repeat(
+ ostart+lhSize, dstCapacity-lhSize, src, srcSize,
+ HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
+ (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2);
+ if (repeat != HUF_repeat_none) {
+ /* reused the existing table */
+ DEBUGLOG(5, "Reusing previous huffman table");
+ hType = set_repeat;
+ }
+ }
+
+ if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) {
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+ }
+ if (cLitSize==1) {
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+ }
+
+ if (hType == set_compressed) {
+ /* using a newly constructed table */
+ nextHuf->repeatMode = HUF_repeat_check;
+ }
+
+ /* Build header */
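+    /* Added note (not upstream): for compressed literals the "2 - 2 - 10 - 10" style
+     * comments give the header layout: 2 bits block type (hType), 2 bits size format,
+     * then the regenerated size and the compressed size, each in the stated number of
+     * bits, packed little-endian (3, 4 or 5 bytes total). */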
+ switch(lhSize)
+ {
+ case 3: /* 2 - 2 - 10 - 10 */
+ { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
+ MEM_writeLE24(ostart, lhc);
+ break;
+ }
+ case 4: /* 2 - 2 - 14 - 14 */
+ { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
+ MEM_writeLE32(ostart, lhc);
+ break;
+ }
+ case 5: /* 2 - 2 - 18 - 18 */
+ { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
+ MEM_writeLE32(ostart, lhc);
+ ostart[4] = (BYTE)(cLitSize >> 10);
+ break;
+ }
+ default: /* not possible : lhSize is {3,4,5} */
+ assert(0);
+ }
+ DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)srcSize, (U32)(lhSize+cLitSize));
+ return lhSize+cLitSize;
+}
+/**** ended inlining compress/zstd_compress_literals.c ****/
+/**** start inlining compress/zstd_compress_sequences.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ * Dependencies
+ ***************************************/
+/**** start inlining zstd_compress_sequences.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPRESS_SEQUENCES_H
+#define ZSTD_COMPRESS_SEQUENCES_H
+
+/**** skipping file: ../common/fse.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+
+typedef enum {
+ ZSTD_defaultDisallowed = 0,
+ ZSTD_defaultAllowed = 1
+} ZSTD_defaultPolicy_e;
+
+symbolEncodingType_e
+ZSTD_selectEncodingType(
+ FSE_repeat* repeatMode, unsigned const* count, unsigned const max,
+ size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
+ FSE_CTable const* prevCTable,
+ short const* defaultNorm, U32 defaultNormLog,
+ ZSTD_defaultPolicy_e const isDefaultAllowed,
+ ZSTD_strategy const strategy);
+
+size_t
+ZSTD_buildCTable(void* dst, size_t dstCapacity,
+ FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+ unsigned* count, U32 max,
+ const BYTE* codeTable, size_t nbSeq,
+ const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+ const FSE_CTable* prevCTable, size_t prevCTableSize,
+ void* entropyWorkspace, size_t entropyWorkspaceSize);
+
+size_t ZSTD_encodeSequences(
+ void* dst, size_t dstCapacity,
+ FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+ FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+ FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+ seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2);
+
+size_t ZSTD_fseBitCost(
+ FSE_CTable const* ctable,
+ unsigned const* count,
+ unsigned const max);
+
+size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+ unsigned const* count, unsigned const max);
+#endif /* ZSTD_COMPRESS_SEQUENCES_H */
+/**** ended inlining zstd_compress_sequences.h ****/
+
+/**
+ * -log2(x / 256) lookup table for x in [0, 256).
+ * If x == 0: Return 0
+ * Else: Return floor(-log2(x / 256) * 256)
+ */
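+/* Worked examples (added note, not upstream): x = 128 -> floor(-log2(0.5) * 256) = 256,
+ * and x = 64 -> floor(-log2(0.25) * 256) = 512, matching entries [128] and [64] below. */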
+static unsigned const kInverseProbabilityLog256[256] = {
+ 0, 2048, 1792, 1642, 1536, 1453, 1386, 1329, 1280, 1236, 1197, 1162,
+ 1130, 1100, 1073, 1047, 1024, 1001, 980, 960, 941, 923, 906, 889,
+ 874, 859, 844, 830, 817, 804, 791, 779, 768, 756, 745, 734,
+ 724, 714, 704, 694, 685, 676, 667, 658, 650, 642, 633, 626,
+ 618, 610, 603, 595, 588, 581, 574, 567, 561, 554, 548, 542,
+ 535, 529, 523, 517, 512, 506, 500, 495, 489, 484, 478, 473,
+ 468, 463, 458, 453, 448, 443, 438, 434, 429, 424, 420, 415,
+ 411, 407, 402, 398, 394, 390, 386, 382, 377, 373, 370, 366,
+ 362, 358, 354, 350, 347, 343, 339, 336, 332, 329, 325, 322,
+ 318, 315, 311, 308, 305, 302, 298, 295, 292, 289, 286, 282,
+ 279, 276, 273, 270, 267, 264, 261, 258, 256, 253, 250, 247,
+ 244, 241, 239, 236, 233, 230, 228, 225, 222, 220, 217, 215,
+ 212, 209, 207, 204, 202, 199, 197, 194, 192, 190, 187, 185,
+ 182, 180, 178, 175, 173, 171, 168, 166, 164, 162, 159, 157,
+ 155, 153, 151, 149, 146, 144, 142, 140, 138, 136, 134, 132,
+ 130, 128, 126, 123, 121, 119, 117, 115, 114, 112, 110, 108,
+ 106, 104, 102, 100, 98, 96, 94, 93, 91, 89, 87, 85,
+ 83, 82, 80, 78, 76, 74, 73, 71, 69, 67, 66, 64,
+ 62, 61, 59, 57, 55, 54, 52, 50, 49, 47, 46, 44,
+ 42, 41, 39, 37, 36, 34, 33, 31, 30, 28, 26, 25,
+ 23, 22, 20, 19, 17, 16, 14, 13, 11, 10, 8, 7,
+ 5, 4, 2, 1,
+};
+
+static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) {
+ void const* ptr = ctable;
+ U16 const* u16ptr = (U16 const*)ptr;
+ U32 const maxSymbolValue = MEM_read16(u16ptr + 1);
+ return maxSymbolValue;
+}
+
+/**
+ * Returns the cost in bytes of encoding the normalized count header.
+ * Returns an error if any of the helper functions return an error.
+ */
+static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max,
+ size_t const nbSeq, unsigned const FSELog)
+{
+ BYTE wksp[FSE_NCOUNTBOUND];
+ S16 norm[MaxSeq + 1];
+ const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+ FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max), "");
+ return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog);
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution described by count
+ * using the entropy bound.
+ */
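+/* Added note (not upstream): this implements the entropy bound
+ * sum_s count[s] * -log2(count[s]/total), using the 8-bit fixed-point table above
+ * (norm is count[s] rescaled to /256, clamped to at least 1), hence the final >> 8. */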
+static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t const total)
+{
+ unsigned cost = 0;
+ unsigned s;
+ for (s = 0; s <= max; ++s) {
+ unsigned norm = (unsigned)((256 * count[s]) / total);
+ if (count[s] != 0 && norm == 0)
+ norm = 1;
+ assert(count[s] < total);
+ cost += count[s] * kInverseProbabilityLog256[norm];
+ }
+ return cost >> 8;
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution in count using ctable.
+ * Returns an error if ctable cannot represent all the symbols in count.
+ */
+size_t ZSTD_fseBitCost(
+ FSE_CTable const* ctable,
+ unsigned const* count,
+ unsigned const max)
+{
+ unsigned const kAccuracyLog = 8;
+ size_t cost = 0;
+ unsigned s;
+ FSE_CState_t cstate;
+ FSE_initCState(&cstate, ctable);
+ if (ZSTD_getFSEMaxSymbolValue(ctable) < max) {
+ DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u",
+ ZSTD_getFSEMaxSymbolValue(ctable), max);
+ return ERROR(GENERIC);
+ }
+ for (s = 0; s <= max; ++s) {
+ unsigned const tableLog = cstate.stateLog;
+ unsigned const badCost = (tableLog + 1) << kAccuracyLog;
+ unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog);
+ if (count[s] == 0)
+ continue;
+ if (bitCost >= badCost) {
+ DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s);
+ return ERROR(GENERIC);
+ }
+ cost += (size_t)count[s] * bitCost;
+ }
+ return cost >> kAccuracyLog;
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution in count using the
+ * table described by norm. The max symbol supported by norm is assumed >= max.
+ * norm must be valid for every symbol with non-zero probability in count.
+ */
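+/* Added note (not upstream): this computes sum_s count[s] * -log2(norm[s] / 2^accuracyLog),
+ * treating the "less than 1" probability marker norm[s] == -1 as a probability of
+ * 1 / 2^accuracyLog, again in 1/256-bit fixed point (hence the final >> 8). */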
+size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+ unsigned const* count, unsigned const max)
+{
+ unsigned const shift = 8 - accuracyLog;
+ size_t cost = 0;
+ unsigned s;
+ assert(accuracyLog <= 8);
+ for (s = 0; s <= max; ++s) {
+ unsigned const normAcc = (norm[s] != -1) ? (unsigned)norm[s] : 1;
+ unsigned const norm256 = normAcc << shift;
+ assert(norm256 > 0);
+ assert(norm256 < 256);
+ cost += count[s] * kInverseProbabilityLog256[norm256];
+ }
+ return cost >> 8;
+}
+
+symbolEncodingType_e
+ZSTD_selectEncodingType(
+ FSE_repeat* repeatMode, unsigned const* count, unsigned const max,
+ size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
+ FSE_CTable const* prevCTable,
+ short const* defaultNorm, U32 defaultNormLog,
+ ZSTD_defaultPolicy_e const isDefaultAllowed,
+ ZSTD_strategy const strategy)
+{
+ ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0);
+ if (mostFrequent == nbSeq) {
+ *repeatMode = FSE_repeat_none;
+ if (isDefaultAllowed && nbSeq <= 2) {
+            /* Prefer set_basic over set_rle when there are 2 or fewer symbols,
+ * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
+ * If basic encoding isn't possible, always choose RLE.
+ */
+ DEBUGLOG(5, "Selected set_basic");
+ return set_basic;
+ }
+ DEBUGLOG(5, "Selected set_rle");
+ return set_rle;
+ }
+ if (strategy < ZSTD_lazy) {
+ if (isDefaultAllowed) {
+ size_t const staticFse_nbSeq_max = 1000;
+ size_t const mult = 10 - strategy;
+ size_t const baseLog = 3;
+ size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog; /* 28-36 for offset, 56-72 for lengths */
+ assert(defaultNormLog >= 5 && defaultNormLog <= 6); /* xx_DEFAULTNORMLOG */
+ assert(mult <= 9 && mult >= 7);
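+            /* Added sanity note (not upstream): with defaultNormLog=5 and mult=7..9,
+             * (1<<5)*mult >> 3 = 28..36; with defaultNormLog=6, it is 56..72,
+             * matching the ranges quoted above. */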
+ if ( (*repeatMode == FSE_repeat_valid)
+ && (nbSeq < staticFse_nbSeq_max) ) {
+ DEBUGLOG(5, "Selected set_repeat");
+ return set_repeat;
+ }
+ if ( (nbSeq < dynamicFse_nbSeq_min)
+ || (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) {
+ DEBUGLOG(5, "Selected set_basic");
+ /* The format allows default tables to be repeated, but it isn't useful.
+ * When using simple heuristics to select encoding type, we don't want
+ * to confuse these tables with dictionaries. When running more careful
+ * analysis, we don't need to waste time checking both repeating tables
+ * and default tables.
+ */
+ *repeatMode = FSE_repeat_none;
+ return set_basic;
+ }
+ }
+ } else {
+ size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC);
+ size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC);
+ size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog);
+ size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq);
+
+ if (isDefaultAllowed) {
+ assert(!ZSTD_isError(basicCost));
+ assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost)));
+ }
+ assert(!ZSTD_isError(NCountCost));
+ assert(compressedCost < ERROR(maxCode));
+ DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u",
+ (unsigned)basicCost, (unsigned)repeatCost, (unsigned)compressedCost);
+ if (basicCost <= repeatCost && basicCost <= compressedCost) {
+ DEBUGLOG(5, "Selected set_basic");
+ assert(isDefaultAllowed);
+ *repeatMode = FSE_repeat_none;
+ return set_basic;
+ }
+ if (repeatCost <= compressedCost) {
+ DEBUGLOG(5, "Selected set_repeat");
+ assert(!ZSTD_isError(repeatCost));
+ return set_repeat;
+ }
+ assert(compressedCost < basicCost && compressedCost < repeatCost);
+ }
+ DEBUGLOG(5, "Selected set_compressed");
+ *repeatMode = FSE_repeat_check;
+ return set_compressed;
+}
+
+size_t
+ZSTD_buildCTable(void* dst, size_t dstCapacity,
+ FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+ unsigned* count, U32 max,
+ const BYTE* codeTable, size_t nbSeq,
+ const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+ const FSE_CTable* prevCTable, size_t prevCTableSize,
+ void* entropyWorkspace, size_t entropyWorkspaceSize)
+{
+ BYTE* op = (BYTE*)dst;
+ const BYTE* const oend = op + dstCapacity;
+ DEBUGLOG(6, "ZSTD_buildCTable (dstCapacity=%u)", (unsigned)dstCapacity);
+
+ switch (type) {
+ case set_rle:
+ FORWARD_IF_ERROR(FSE_buildCTable_rle(nextCTable, (BYTE)max), "");
+ RETURN_ERROR_IF(dstCapacity==0, dstSize_tooSmall, "not enough space");
+ *op = codeTable[0];
+ return 1;
+ case set_repeat:
+ memcpy(nextCTable, prevCTable, prevCTableSize);
+ return 0;
+ case set_basic:
+ FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize), ""); /* note : could be pre-calculated */
+ return 0;
+ case set_compressed: {
+ S16 norm[MaxSeq + 1];
+ size_t nbSeq_1 = nbSeq;
+ const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+ if (count[codeTable[nbSeq-1]] > 1) {
+ count[codeTable[nbSeq-1]]--;
+ nbSeq_1--;
+ }
+ assert(nbSeq_1 > 1);
+ FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max), "");
+ { size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */
+ FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed");
+ FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, norm, max, tableLog, entropyWorkspace, entropyWorkspaceSize), "");
+ return NCountSize;
+ }
+ }
+ default: assert(0); RETURN_ERROR(GENERIC, "impossible to reach");
+ }
+}
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_encodeSequences_body(
+ void* dst, size_t dstCapacity,
+ FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+ FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+ FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+ seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+ BIT_CStream_t blockStream;
+ FSE_CState_t stateMatchLength;
+ FSE_CState_t stateOffsetBits;
+ FSE_CState_t stateLitLength;
+
+ RETURN_ERROR_IF(
+ ERR_isError(BIT_initCStream(&blockStream, dst, dstCapacity)),
+ dstSize_tooSmall, "not enough space remaining");
+ DEBUGLOG(6, "available space for bitstream : %i (dstCapacity=%u)",
+ (int)(blockStream.endPtr - blockStream.startPtr),
+ (unsigned)dstCapacity);
+
+ /* first symbols */
+ FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
+ FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]);
+ FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]);
+ BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
+ if (MEM_32bits()) BIT_flushBits(&blockStream);
+ BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]);
+ if (MEM_32bits()) BIT_flushBits(&blockStream);
+ if (longOffsets) {
+ U32 const ofBits = ofCodeTable[nbSeq-1];
+ unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+ if (extraBits) {
+ BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits);
+ BIT_flushBits(&blockStream);
+ }
+ BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits,
+ ofBits - extraBits);
+ } else {
+ BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]);
+ }
+ BIT_flushBits(&blockStream);
+
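+    /* Added note (not upstream): the numeric trailing comments in the loop below
+     * (7, 15, 24, 33, 31, ...) track the worst-case number of bits accumulated in the
+     * bit container between flushes, for 32-bit and 64-bit targets respectively.
+     * The test against 64-7-(LLFSELog+MLFSELog+OffFSELog) flushes early enough that the
+     * 64-bit accumulator, which may carry up to 7 bits across a flush, never overflows. */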
+ { size_t n;
+ for (n=nbSeq-2 ; n<nbSeq ; n--) { /* intentional underflow */
+ BYTE const llCode = llCodeTable[n];
+ BYTE const ofCode = ofCodeTable[n];
+ BYTE const mlCode = mlCodeTable[n];
+ U32 const llBits = LL_bits[llCode];
+ U32 const ofBits = ofCode;
+ U32 const mlBits = ML_bits[mlCode];
+ DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u",
+ (unsigned)sequences[n].litLength,
+ (unsigned)sequences[n].matchLength + MINMATCH,
+ (unsigned)sequences[n].offset);
+ /* 32b*/ /* 64b*/
+ /* (7)*/ /* (7)*/
+ FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */
+ FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode); /* 24 */ /* 24 */
+ if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/
+ FSE_encodeSymbol(&blockStream, &stateLitLength, llCode); /* 16 */ /* 33 */
+ if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
+ BIT_flushBits(&blockStream); /* (7)*/
+ BIT_addBits(&blockStream, sequences[n].litLength, llBits);
+ if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
+ BIT_addBits(&blockStream, sequences[n].matchLength, mlBits);
+ if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream);
+ if (longOffsets) {
+ unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+ if (extraBits) {
+ BIT_addBits(&blockStream, sequences[n].offset, extraBits);
+ BIT_flushBits(&blockStream); /* (7)*/
+ }
+ BIT_addBits(&blockStream, sequences[n].offset >> extraBits,
+ ofBits - extraBits); /* 31 */
+ } else {
+ BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */
+ }
+ BIT_flushBits(&blockStream); /* (7)*/
+ DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr));
+ } }
+
+ DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog);
+ FSE_flushCState(&blockStream, &stateMatchLength);
+ DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog);
+ FSE_flushCState(&blockStream, &stateOffsetBits);
+ DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog);
+ FSE_flushCState(&blockStream, &stateLitLength);
+
+ { size_t const streamSize = BIT_closeCStream(&blockStream);
+ RETURN_ERROR_IF(streamSize==0, dstSize_tooSmall, "not enough space");
+ return streamSize;
+ }
+}
+
+static size_t
+ZSTD_encodeSequences_default(
+ void* dst, size_t dstCapacity,
+ FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+ FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+ FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+ seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+ return ZSTD_encodeSequences_body(dst, dstCapacity,
+ CTable_MatchLength, mlCodeTable,
+ CTable_OffsetBits, ofCodeTable,
+ CTable_LitLength, llCodeTable,
+ sequences, nbSeq, longOffsets);
+}
+
+
+#if DYNAMIC_BMI2
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_encodeSequences_bmi2(
+ void* dst, size_t dstCapacity,
+ FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+ FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+ FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+ seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+ return ZSTD_encodeSequences_body(dst, dstCapacity,
+ CTable_MatchLength, mlCodeTable,
+ CTable_OffsetBits, ofCodeTable,
+ CTable_LitLength, llCodeTable,
+ sequences, nbSeq, longOffsets);
+}
+
+#endif
+
+size_t ZSTD_encodeSequences(
+ void* dst, size_t dstCapacity,
+ FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+ FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+ FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+ seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2)
+{
+ DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity);
+#if DYNAMIC_BMI2
+ if (bmi2) {
+ return ZSTD_encodeSequences_bmi2(dst, dstCapacity,
+ CTable_MatchLength, mlCodeTable,
+ CTable_OffsetBits, ofCodeTable,
+ CTable_LitLength, llCodeTable,
+ sequences, nbSeq, longOffsets);
+ }
+#endif
+ (void)bmi2;
+ return ZSTD_encodeSequences_default(dst, dstCapacity,
+ CTable_MatchLength, mlCodeTable,
+ CTable_OffsetBits, ofCodeTable,
+ CTable_LitLength, llCodeTable,
+ sequences, nbSeq, longOffsets);
+}
+/**** ended inlining compress/zstd_compress_sequences.c ****/
+/**** start inlining compress/zstd_compress_superblock.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ * Dependencies
+ ***************************************/
+/**** start inlining zstd_compress_superblock.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPRESS_ADVANCED_H
+#define ZSTD_COMPRESS_ADVANCED_H
+
+/*-*************************************
+* Dependencies
+***************************************/
+
+/**** skipping file: ../zstd.h ****/
+
+/*-*************************************
+* Target Compressed Block Size
+***************************************/
+
+/* ZSTD_compressSuperBlock() :
+ * Used to compress a super block when targetCBlockSize is being used.
+ * The given block will be compressed into multiple sub blocks that are around targetCBlockSize. */
+size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ void const* src, size_t srcSize,
+ unsigned lastBlock);
+
+#endif /* ZSTD_COMPRESS_ADVANCED_H */
+/**** ended inlining zstd_compress_superblock.h ****/
+
+/**** skipping file: ../common/zstd_internal.h ****/
+/**** skipping file: hist.h ****/
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: zstd_compress_sequences.h ****/
+/**** skipping file: zstd_compress_literals.h ****/
+
+/*-*************************************
+* Superblock entropy buffer structs
+***************************************/
+/** ZSTD_hufCTablesMetadata_t :
+ * Stores Literals Block Type for a super-block in hType, and
+ * the huffman tree description in hufDesBuffer.
+ * hufDesSize refers to the size of the huffman tree description in bytes.
+ * This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */
+typedef struct {
+ symbolEncodingType_e hType;
+ BYTE hufDesBuffer[500]; /* TODO give name to this value */
+ size_t hufDesSize;
+} ZSTD_hufCTablesMetadata_t;
+
+/** ZSTD_fseCTablesMetadata_t :
+ * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and
+ * fse tables in fseTablesBuffer.
+ * fseTablesSize refers to the size of fse tables in bytes.
+ * This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() */
+typedef struct {
+ symbolEncodingType_e llType;
+ symbolEncodingType_e ofType;
+ symbolEncodingType_e mlType;
+ BYTE fseTablesBuffer[500]; /* TODO give name to this value */
+ size_t fseTablesSize;
+ size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_compressSubBlock_sequences() */
+} ZSTD_fseCTablesMetadata_t;
+
+typedef struct {
+ ZSTD_hufCTablesMetadata_t hufMetadata;
+ ZSTD_fseCTablesMetadata_t fseMetadata;
+} ZSTD_entropyCTablesMetadata_t;
+
+
+/** ZSTD_buildSuperBlockEntropy_literal() :
+ * Builds entropy for the super-block literals.
+ * Stores literals block type (raw, rle, compressed, repeat) and
+ * huffman description table to hufMetadata.
+ * @return : size of huffman description table or error code */
+static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSize,
+ const ZSTD_hufCTables_t* prevHuf,
+ ZSTD_hufCTables_t* nextHuf,
+ ZSTD_hufCTablesMetadata_t* hufMetadata,
+ const int disableLiteralsCompression,
+ void* workspace, size_t wkspSize)
+{
+ BYTE* const wkspStart = (BYTE*)workspace;
+ BYTE* const wkspEnd = wkspStart + wkspSize;
+ BYTE* const countWkspStart = wkspStart;
+ unsigned* const countWksp = (unsigned*)workspace;
+ const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned);
+ BYTE* const nodeWksp = countWkspStart + countWkspSize;
+ const size_t nodeWkspSize = wkspEnd-nodeWksp;
+ unsigned maxSymbolValue = 255;
+ unsigned huffLog = HUF_TABLELOG_DEFAULT;
+ HUF_repeat repeat = prevHuf->repeatMode;
+
+ DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=%zu)", srcSize);
+
+ /* Prepare nextEntropy assuming reusing the existing table */
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+
+ if (disableLiteralsCompression) {
+ DEBUGLOG(5, "set_basic - disabled");
+ hufMetadata->hType = set_basic;
+ return 0;
+ }
+
+ /* small ? don't even attempt compression (speed opt) */
+# define COMPRESS_LITERALS_SIZE_MIN 63
+ { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
+ if (srcSize <= minLitSize) {
+ DEBUGLOG(5, "set_basic - too small");
+ hufMetadata->hType = set_basic;
+ return 0;
+ }
+ }
+
+ /* Scan input and build symbol stats */
+ { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize);
+ FORWARD_IF_ERROR(largest, "HIST_count_wksp failed");
+ if (largest == srcSize) {
+ DEBUGLOG(5, "set_rle");
+ hufMetadata->hType = set_rle;
+ return 0;
+ }
+ if (largest <= (srcSize >> 7)+4) {
+ DEBUGLOG(5, "set_basic - no gain");
+ hufMetadata->hType = set_basic;
+ return 0;
+ }
+ }
+
+ /* Validate the previous Huffman table */
+ if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
+ repeat = HUF_repeat_none;
+ }
+
+ /* Build Huffman Tree */
+ memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable));
+ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+ { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp,
+ maxSymbolValue, huffLog,
+ nodeWksp, nodeWkspSize);
+ FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp");
+ huffLog = (U32)maxBits;
+ { /* Build and write the CTable */
+ size_t const newCSize = HUF_estimateCompressedSize(
+ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
+ size_t const hSize = HUF_writeCTable(
+ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
+ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog);
+ /* Check against repeating the previous CTable */
+ if (repeat != HUF_repeat_none) {
+ size_t const oldCSize = HUF_estimateCompressedSize(
+ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
+ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
+ DEBUGLOG(5, "set_repeat - smaller");
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+ hufMetadata->hType = set_repeat;
+ return 0;
+ }
+ }
+ if (newCSize + hSize >= srcSize) {
+ DEBUGLOG(5, "set_basic - no gains");
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+ hufMetadata->hType = set_basic;
+ return 0;
+ }
+ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
+ hufMetadata->hType = set_compressed;
+ nextHuf->repeatMode = HUF_repeat_check;
+ return hSize;
+ }
+ }
+}
+
+/** ZSTD_buildSuperBlockEntropy_sequences() :
+ * Builds entropy for the super-block sequences.
+ * Stores symbol compression modes and fse table to fseMetadata.
+ * @return : size of fse tables or error code */
+static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t* seqStorePtr,
+ const ZSTD_fseCTables_t* prevEntropy,
+ ZSTD_fseCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ ZSTD_fseCTablesMetadata_t* fseMetadata,
+ void* workspace, size_t wkspSize)
+{
+ BYTE* const wkspStart = (BYTE*)workspace;
+ BYTE* const wkspEnd = wkspStart + wkspSize;
+ BYTE* const countWkspStart = wkspStart;
+ unsigned* const countWksp = (unsigned*)workspace;
+ const size_t countWkspSize = (MaxSeq + 1) * sizeof(unsigned);
+ BYTE* const cTableWksp = countWkspStart + countWkspSize;
+ const size_t cTableWkspSize = wkspEnd-cTableWksp;
+ ZSTD_strategy const strategy = cctxParams->cParams.strategy;
+ FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable;
+ FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable;
+ FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable;
+ const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+ const BYTE* const llCodeTable = seqStorePtr->llCode;
+ const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+ size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+ BYTE* const ostart = fseMetadata->fseTablesBuffer;
+ BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer);
+ BYTE* op = ostart;
+
+ assert(cTableWkspSize >= (1 << MaxFSELog) * sizeof(FSE_FUNCTION_TYPE));
+ DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_sequences (nbSeq=%zu)", nbSeq);
+ memset(workspace, 0, wkspSize);
+
+ fseMetadata->lastCountSize = 0;
+ /* convert length/distances into codes */
+ ZSTD_seqToCodes(seqStorePtr);
+ /* build CTable for Literal Lengths */
+ { U32 LLtype;
+ unsigned max = MaxLL;
+ size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, llCodeTable, nbSeq, workspace, wkspSize); /* can't fail */
+ DEBUGLOG(5, "Building LL table");
+ nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode;
+ LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode,
+ countWksp, max, mostFrequent, nbSeq,
+ LLFSELog, prevEntropy->litlengthCTable,
+ LL_defaultNorm, LL_defaultNormLog,
+ ZSTD_defaultAllowed, strategy);
+ assert(set_basic < set_compressed && set_rle < set_compressed);
+ assert(!(LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype,
+ countWksp, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL,
+ prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable),
+ cTableWksp, cTableWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed");
+ if (LLtype == set_compressed)
+ fseMetadata->lastCountSize = countSize;
+ op += countSize;
+ fseMetadata->llType = (symbolEncodingType_e) LLtype;
+ } }
+ /* build CTable for Offsets */
+ { U32 Offtype;
+ unsigned max = MaxOff;
+ size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, ofCodeTable, nbSeq, workspace, wkspSize); /* can't fail */
+ /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
+ ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
+ DEBUGLOG(5, "Building OF table");
+ nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode;
+ Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode,
+ countWksp, max, mostFrequent, nbSeq,
+ OffFSELog, prevEntropy->offcodeCTable,
+ OF_defaultNorm, OF_defaultNormLog,
+ defaultPolicy, strategy);
+ assert(!(Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype,
+ countWksp, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+ prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable),
+ cTableWksp, cTableWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed");
+ if (Offtype == set_compressed)
+ fseMetadata->lastCountSize = countSize;
+ op += countSize;
+ fseMetadata->ofType = (symbolEncodingType_e) Offtype;
+ } }
+ /* build CTable for MatchLengths */
+ { U32 MLtype;
+ unsigned max = MaxML;
+ size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, mlCodeTable, nbSeq, workspace, wkspSize); /* can't fail */
+ DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op));
+ nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode;
+ MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode,
+ countWksp, max, mostFrequent, nbSeq,
+ MLFSELog, prevEntropy->matchlengthCTable,
+ ML_defaultNorm, ML_defaultNormLog,
+ ZSTD_defaultAllowed, strategy);
+ assert(!(MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype,
+ countWksp, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML,
+ prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable),
+ cTableWksp, cTableWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed");
+ if (MLtype == set_compressed)
+ fseMetadata->lastCountSize = countSize;
+ op += countSize;
+ fseMetadata->mlType = (symbolEncodingType_e) MLtype;
+ } }
+ assert((size_t) (op-ostart) <= sizeof(fseMetadata->fseTablesBuffer));
+ return op-ostart;
+}
+
+
+/** ZSTD_buildSuperBlockEntropy() :
+ * Builds entropy for the super-block.
+ * @return : 0 on success or error code */
+static size_t
+ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr,
+ const ZSTD_entropyCTables_t* prevEntropy,
+ ZSTD_entropyCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ void* workspace, size_t wkspSize)
+{
+ size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart;
+ DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy");
+ entropyMetadata->hufMetadata.hufDesSize =
+ ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize,
+ &prevEntropy->huf, &nextEntropy->huf,
+ &entropyMetadata->hufMetadata,
+ ZSTD_disableLiteralsCompression(cctxParams),
+ workspace, wkspSize);
+ FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildSuperBlockEntropy_literal failed");
+ entropyMetadata->fseMetadata.fseTablesSize =
+ ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr,
+ &prevEntropy->fse, &nextEntropy->fse,
+ cctxParams,
+ &entropyMetadata->fseMetadata,
+ workspace, wkspSize);
+ FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildSuperBlockEntropy_sequences failed");
+ return 0;
+}
+
+/** ZSTD_compressSubBlock_literal() :
+ * Compresses literals section for a sub-block.
+ * When we have to write the Huffman table we will sometimes choose a header
+ * size larger than necessary. This is because we have to pick the header size
+ * before we know the table size + compressed size, so we have a bound on the
+ * table size. If we guessed incorrectly, we fall back to uncompressed literals.
+ *
+ * We write the header when writeEntropy=1 and set entropyWritten=1 when we succeed
+ * in writing the header; otherwise it is set to 0.
+ *
+ * hufMetadata->hType has literals block type info.
+ * If it is set_basic, all sub-blocks' literals sections will be Raw_Literals_Block.
+ * If it is set_rle, all sub-blocks' literals sections will be RLE_Literals_Block.
+ * If it is set_compressed, the first sub-block's literals section will be Compressed_Literals_Block,
+ * and the following sub-blocks' literals sections will be Treeless_Literals_Block.
+ * @return : compressed size of literals section of a sub-block
+ * Or 0 if it is unable to compress.
+ * Or error code */
+static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+ const ZSTD_hufCTablesMetadata_t* hufMetadata,
+ const BYTE* literals, size_t litSize,
+ void* dst, size_t dstSize,
+ const int bmi2, int writeEntropy, int* entropyWritten)
+{
+ size_t const header = writeEntropy ? 200 : 0;
+ size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* op = ostart + lhSize;
+ U32 const singleStream = lhSize == 3;
+ symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;
+ size_t cLitSize = 0;
+
+ (void)bmi2; /* TODO bmi2... */
+
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);
+
+ *entropyWritten = 0;
+ if (litSize == 0 || hufMetadata->hType == set_basic) {
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal");
+ return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+ } else if (hufMetadata->hType == set_rle) {
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal using rle literal");
+ return ZSTD_compressRleLiteralsBlock(dst, dstSize, literals, litSize);
+ }
+
+ assert(litSize > 0);
+ assert(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat);
+
+ if (writeEntropy && hufMetadata->hType == set_compressed) {
+ memcpy(op, hufMetadata->hufDesBuffer, hufMetadata->hufDesSize);
+ op += hufMetadata->hufDesSize;
+ cLitSize += hufMetadata->hufDesSize;
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize);
+ }
+
+ /* TODO bmi2 */
+ { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable)
+ : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable);
+ op += cSize;
+ cLitSize += cSize;
+ if (cSize == 0 || ERR_isError(cSize)) {
+ DEBUGLOG(5, "Failed to write entropy tables %s", ZSTD_getErrorName(cSize));
+ return 0;
+ }
+ /* If we expand and we aren't writing a header then emit uncompressed */
+ if (!writeEntropy && cLitSize >= litSize) {
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal because uncompressible");
+ return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+ }
+ /* If we are writing headers then allow expansion that doesn't change our header size. */
+ if (lhSize < (size_t)(3 + (cLitSize >= 1 KB) + (cLitSize >= 16 KB))) {
+ assert(cLitSize > litSize);
+ DEBUGLOG(5, "Literals expanded beyond allowed header size");
+ return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+ }
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize);
+ }
+
+ /* Build header */
+ switch(lhSize)
+ {
+ case 3: /* 2 - 2 - 10 - 10 */
+ { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14);
+ MEM_writeLE24(ostart, lhc);
+ break;
+ }
+ case 4: /* 2 - 2 - 14 - 14 */
+ { U32 const lhc = hType + (2 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<18);
+ MEM_writeLE32(ostart, lhc);
+ break;
+ }
+ case 5: /* 2 - 2 - 18 - 18 */
+ { U32 const lhc = hType + (3 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<22);
+ MEM_writeLE32(ostart, lhc);
+ ostart[4] = (BYTE)(cLitSize >> 10);
+ break;
+ }
+ default: /* not possible : lhSize is {3,4,5} */
+ assert(0);
+ }
+ *entropyWritten = 1;
+ DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart));
+ return op-ostart;
+}
+
+static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) {
+ const seqDef* const sstart = sequences;
+ const seqDef* const send = sequences + nbSeq;
+ const seqDef* sp = sstart;
+ size_t matchLengthSum = 0;
+ size_t litLengthSum = 0;
+ while (send-sp > 0) {
+ ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp);
+ litLengthSum += seqLen.litLength;
+ matchLengthSum += seqLen.matchLength;
+ sp++;
+ }
+ assert(litLengthSum <= litSize);
+ if (!lastSequence) {
+ assert(litLengthSum == litSize);
+ }
+ return matchLengthSum + litSize;
+}
+
+/** ZSTD_compressSubBlock_sequences() :
+ * Compresses sequences section for a sub-block.
+ * fseMetadata->llType, fseMetadata->ofType, and fseMetadata->mlType have
+ * symbol compression modes for the super-block.
+ * The first successfully compressed block will have these in its header.
+ * We set entropyWritten=1 when we succeed in compressing the sequences.
+ * The following sub-blocks will always have repeat mode.
+ * @return : compressed size of sequences section of a sub-block
+ * Or 0 if it is unable to compress
+ * Or error code. */
+static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
+ const ZSTD_fseCTablesMetadata_t* fseMetadata,
+ const seqDef* sequences, size_t nbSeq,
+ const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ const int bmi2, int writeEntropy, int* entropyWritten)
+{
+ const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstCapacity;
+ BYTE* op = ostart;
+ BYTE* seqHead;
+
+ DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq, writeEntropy, longOffsets);
+
+ *entropyWritten = 0;
+ /* Sequences Header */
+ RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
+ dstSize_tooSmall, "");
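+    /* Added note (not upstream): Number_of_Sequences is written in 1 byte when
+     * nbSeq < 0x7F, in 2 bytes (first byte has its high bit set) when nbSeq < LONGNBSEQ,
+     * and otherwise as the marker byte 0xFF followed by LE16(nbSeq - LONGNBSEQ). */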
+ if (nbSeq < 0x7F)
+ *op++ = (BYTE)nbSeq;
+ else if (nbSeq < LONGNBSEQ)
+ op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
+ else
+ op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
+ if (nbSeq==0) {
+ return op - ostart;
+ }
+
+ /* seqHead : flags for FSE encoding type */
+ seqHead = op++;
+
+ DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (seqHeadSize=%u)", (unsigned)(op-ostart));
+
+ if (writeEntropy) {
+ const U32 LLtype = fseMetadata->llType;
+ const U32 Offtype = fseMetadata->ofType;
+ const U32 MLtype = fseMetadata->mlType;
+ DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (fseTablesSize=%zu)", fseMetadata->fseTablesSize);
+ *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
+ memcpy(op, fseMetadata->fseTablesBuffer, fseMetadata->fseTablesSize);
+ op += fseMetadata->fseTablesSize;
+ } else {
+ const U32 repeat = set_repeat;
+ *seqHead = (BYTE)((repeat<<6) + (repeat<<4) + (repeat<<2));
+ }
+
+ { size_t const bitstreamSize = ZSTD_encodeSequences(
+ op, oend - op,
+ fseTables->matchlengthCTable, mlCode,
+ fseTables->offcodeCTable, ofCode,
+ fseTables->litlengthCTable, llCode,
+ sequences, nbSeq,
+ longOffsets, bmi2);
+ FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed");
+ op += bitstreamSize;
+ /* zstd versions <= 1.3.4 mistakenly report corruption when
+ * FSE_readNCount() receives a buffer < 4 bytes.
+ * Fixed by https://github.com/facebook/zstd/pull/1146.
+ * This can happen when the last set_compressed table present is 2
+ * bytes and the bitstream is only one byte.
+ * In this exceedingly rare case, we will simply emit an uncompressed
+ * block, since it isn't worth optimizing.
+ */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ if (writeEntropy && fseMetadata->lastCountSize && fseMetadata->lastCountSize + bitstreamSize < 4) {
+ /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
+ assert(fseMetadata->lastCountSize + bitstreamSize == 3);
+ DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
+ "emitting an uncompressed block.");
+ return 0;
+ }
+#endif
+ DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize);
+ }
+
+ /* zstd versions <= 1.4.0 mistakenly report error when
+ * sequences section body size is less than 3 bytes.
+ * Fixed by https://github.com/facebook/zstd/pull/1664.
+ * This can happen when the previous sequences section block is compressed
+ * with rle mode and the current block's sequences section is compressed
+ * with repeat mode where sequences section body size can be 1 byte.
+ */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ if (op-seqHead < 4) {
+ DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.4.0 by emitting "
+ "an uncompressed block when sequences are < 4 bytes");
+ return 0;
+ }
+#endif
+
+ *entropyWritten = 1;
+ return op - ostart;
+}
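+
+/* Illustration (not from the upstream sources): the sequence-count header
+ * written at the top of ZSTD_compressSubBlock_sequences() above uses a 1-, 2-
+ * or 3-byte layout. The guarded sketch below restates that layout as a
+ * standalone helper; the name example_writeNbSeqHeader is hypothetical and
+ * 0x7F00 stands in for LONGNBSEQ. */
+#if 0 /* illustrative sketch only, never compiled */
+#include <stddef.h>
+#include <stdint.h>
+
+static size_t example_writeNbSeqHeader(uint8_t* dst, size_t dstCapacity, size_t nbSeq)
+{
+ if (nbSeq < 0x7F) { /* 1 byte : value stored directly */
+ if (dstCapacity < 1) return 0;
+ dst[0] = (uint8_t)nbSeq;
+ return 1;
+ }
+ if (nbSeq < 0x7F00) { /* 2 bytes : high byte carries the 0x80 marker */
+ if (dstCapacity < 2) return 0;
+ dst[0] = (uint8_t)((nbSeq >> 8) + 0x80);
+ dst[1] = (uint8_t)nbSeq;
+ return 2;
+ }
+ if (dstCapacity < 3) return 0; /* 3 bytes : 0xFF then LE16(nbSeq - 0x7F00) */
+ dst[0] = 0xFF;
+ dst[1] = (uint8_t)((nbSeq - 0x7F00) & 0xFF);
+ dst[2] = (uint8_t)(((nbSeq - 0x7F00) >> 8) & 0xFF);
+ return 3;
+}
+#endif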
+
+/** ZSTD_compressSubBlock() :
+ * Compresses a single sub-block.
+ * @return : compressed size of the sub-block
+ * Or 0 if it failed to compress. */
+static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
+ const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ const seqDef* sequences, size_t nbSeq,
+ const BYTE* literals, size_t litSize,
+ const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ const int bmi2,
+ int writeLitEntropy, int writeSeqEntropy,
+ int* litEntropyWritten, int* seqEntropyWritten,
+ U32 lastBlock)
+{
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstCapacity;
+ BYTE* op = ostart + ZSTD_blockHeaderSize;
+ DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeLitEntropy=%d, writeSeqEntropy=%d, lastBlock=%d)",
+ litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock);
+ { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable,
+ &entropyMetadata->hufMetadata, literals, litSize,
+ op, oend-op, bmi2, writeLitEntropy, litEntropyWritten);
+ FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed");
+ if (cLitSize == 0) return 0;
+ op += cLitSize;
+ }
+ { size_t cSeqSize = ZSTD_compressSubBlock_sequences(&entropy->fse,
+ &entropyMetadata->fseMetadata,
+ sequences, nbSeq,
+ llCode, mlCode, ofCode,
+ cctxParams,
+ op, oend-op,
+ bmi2, writeSeqEntropy, seqEntropyWritten);
+ FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed");
+ if (cSeqSize == 0) return 0;
+ op += cSeqSize;
+ }
+ /* Write block header */
+ { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
+ U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+ MEM_writeLE24(ostart, cBlockHeader24);
+ }
+ return op-ostart;
+}
+
+static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
+ const ZSTD_hufCTables_t* huf,
+ const ZSTD_hufCTablesMetadata_t* hufMetadata,
+ void* workspace, size_t wkspSize,
+ int writeEntropy)
+{
+ unsigned* const countWksp = (unsigned*)workspace;
+ unsigned maxSymbolValue = 255;
+ size_t literalSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
+
+ if (hufMetadata->hType == set_basic) return litSize;
+ else if (hufMetadata->hType == set_rle) return 1;
+ else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) {
+ size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize);
+ if (ZSTD_isError(largest)) return litSize;
+ { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue);
+ if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize;
+ return cLitSizeEstimate + literalSectionHeaderSize;
+ } }
+ assert(0); /* impossible */
+ return 0;
+}
+
+static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type,
+ const BYTE* codeTable, unsigned maxCode,
+ size_t nbSeq, const FSE_CTable* fseCTable,
+ const U32* additionalBits,
+ short const* defaultNorm, U32 defaultNormLog,
+ void* workspace, size_t wkspSize)
+{
+ unsigned* const countWksp = (unsigned*)workspace;
+ const BYTE* ctp = codeTable;
+ const BYTE* const ctStart = ctp;
+ const BYTE* const ctEnd = ctStart + nbSeq;
+ size_t cSymbolTypeSizeEstimateInBits = 0;
+ unsigned max = maxCode;
+
+ HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */
+ if (type == set_basic) {
+ cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max);
+ } else if (type == set_rle) {
+ cSymbolTypeSizeEstimateInBits = 0;
+ } else if (type == set_compressed || type == set_repeat) {
+ cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max);
+ }
+ if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) return nbSeq * 10;
+ while (ctp < ctEnd) {
+ if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp];
+ else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */
+ ctp++;
+ }
+ return cSymbolTypeSizeEstimateInBits / 8;
+}
+
+static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,
+ const BYTE* llCodeTable,
+ const BYTE* mlCodeTable,
+ size_t nbSeq,
+ const ZSTD_fseCTables_t* fseTables,
+ const ZSTD_fseCTablesMetadata_t* fseMetadata,
+ void* workspace, size_t wkspSize,
+ int writeEntropy)
+{
+ size_t sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
+ size_t cSeqSizeEstimate = 0;
+ cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff,
+ nbSeq, fseTables->offcodeCTable, NULL,
+ OF_defaultNorm, OF_defaultNormLog,
+ workspace, wkspSize);
+ cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->llType, llCodeTable, MaxLL,
+ nbSeq, fseTables->litlengthCTable, LL_bits,
+ LL_defaultNorm, LL_defaultNormLog,
+ workspace, wkspSize);
+ cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, MaxML,
+ nbSeq, fseTables->matchlengthCTable, ML_bits,
+ ML_defaultNorm, ML_defaultNormLog,
+ workspace, wkspSize);
+ if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize;
+ return cSeqSizeEstimate + sequencesSectionHeaderSize;
+}
+
+static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+ const BYTE* ofCodeTable,
+ const BYTE* llCodeTable,
+ const BYTE* mlCodeTable,
+ size_t nbSeq,
+ const ZSTD_entropyCTables_t* entropy,
+ const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ void* workspace, size_t wkspSize,
+ int writeLitEntropy, int writeSeqEntropy) {
+ size_t cSizeEstimate = 0;
+ cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize,
+ &entropy->huf, &entropyMetadata->hufMetadata,
+ workspace, wkspSize, writeLitEntropy);
+ cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
+ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+ workspace, wkspSize, writeSeqEntropy);
+ return cSizeEstimate + ZSTD_blockHeaderSize;
+}
+
+static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata)
+{
+ if (fseMetadata->llType == set_compressed || fseMetadata->llType == set_rle)
+ return 1;
+ if (fseMetadata->mlType == set_compressed || fseMetadata->mlType == set_rle)
+ return 1;
+ if (fseMetadata->ofType == set_compressed || fseMetadata->ofType == set_rle)
+ return 1;
+ return 0;
+}
+
+/** ZSTD_compressSubBlock_multi() :
+ * Breaks super-block into multiple sub-blocks and compresses them.
+ * Entropy will be written to the first block.
+ * The following blocks will use repeat mode to compress.
+ * All sub-blocks are compressed blocks (no raw or rle blocks).
+ * @return : compressed size of the super block (which is multiple ZSTD blocks)
+ * Or 0 if it failed to compress. */
+static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+ const ZSTD_compressedBlockState_t* prevCBlock,
+ ZSTD_compressedBlockState_t* nextCBlock,
+ const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const int bmi2, U32 lastBlock,
+ void* workspace, size_t wkspSize)
+{
+ const seqDef* const sstart = seqStorePtr->sequencesStart;
+ const seqDef* const send = seqStorePtr->sequences;
+ const seqDef* sp = sstart;
+ const BYTE* const lstart = seqStorePtr->litStart;
+ const BYTE* const lend = seqStorePtr->lit;
+ const BYTE* lp = lstart;
+ BYTE const* ip = (BYTE const*)src;
+ BYTE const* const iend = ip + srcSize;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstCapacity;
+ BYTE* op = ostart;
+ const BYTE* llCodePtr = seqStorePtr->llCode;
+ const BYTE* mlCodePtr = seqStorePtr->mlCode;
+ const BYTE* ofCodePtr = seqStorePtr->ofCode;
+ size_t targetCBlockSize = cctxParams->targetCBlockSize;
+ size_t litSize, seqCount;
+ int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
+ int writeSeqEntropy = 1;
+ int lastSequence = 0;
+
+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
+ (unsigned)(lend-lp), (unsigned)(send-sstart));
+
+ litSize = 0;
+ seqCount = 0;
+ do {
+ size_t cBlockSizeEstimate = 0;
+ if (sstart == send) {
+ lastSequence = 1;
+ } else {
+ const seqDef* const sequence = sp + seqCount;
+ lastSequence = sequence == send - 1;
+ litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
+ seqCount++;
+ }
+ if (lastSequence) {
+ assert(lp <= lend);
+ assert(litSize <= (size_t)(lend - lp));
+ litSize = (size_t)(lend - lp);
+ }
+ /* I think there is an optimization opportunity here.
+ * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful,
+ * since it recalculates the estimate from scratch.
+ * For example, it would recount the literal distribution and symbol codes every time.
+ */
+ cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
+ &nextCBlock->entropy, entropyMetadata,
+ workspace, wkspSize, writeLitEntropy, writeSeqEntropy);
+ if (cBlockSizeEstimate > targetCBlockSize || lastSequence) {
+ int litEntropyWritten = 0;
+ int seqEntropyWritten = 0;
+ const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence);
+ const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
+ sp, seqCount,
+ lp, litSize,
+ llCodePtr, mlCodePtr, ofCodePtr,
+ cctxParams,
+ op, oend-op,
+ bmi2, writeLitEntropy, writeSeqEntropy,
+ &litEntropyWritten, &seqEntropyWritten,
+ lastBlock && lastSequence);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
+ if (cSize > 0 && cSize < decompressedSize) {
+ DEBUGLOG(5, "Committed the sub-block");
+ assert(ip + decompressedSize <= iend);
+ ip += decompressedSize;
+ sp += seqCount;
+ lp += litSize;
+ op += cSize;
+ llCodePtr += seqCount;
+ mlCodePtr += seqCount;
+ ofCodePtr += seqCount;
+ litSize = 0;
+ seqCount = 0;
+ /* Entropy only needs to be written once */
+ if (litEntropyWritten) {
+ writeLitEntropy = 0;
+ }
+ if (seqEntropyWritten) {
+ writeSeqEntropy = 0;
+ }
+ }
+ }
+ } while (!lastSequence);
+ if (writeLitEntropy) {
+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten");
+ memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf));
+ }
+ if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) {
+ /* If we haven't written our entropy tables, then we've violated our contract and
+ * must emit an uncompressed block.
+ */
+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten");
+ return 0;
+ }
+ if (ip < iend) {
+ size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock);
+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip));
+ FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
+ assert(cSize != 0);
+ op += cSize;
+ /* We have to regenerate the repcodes because we've skipped some sequences */
+ if (sp < send) {
+ seqDef const* seq;
+ repcodes_t rep;
+ memcpy(&rep, prevCBlock->rep, sizeof(rep));
+ for (seq = sstart; seq < sp; ++seq) {
+ rep = ZSTD_updateRep(rep.rep, seq->offset - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
+ }
+ memcpy(nextCBlock->rep, &rep, sizeof(rep));
+ }
+ }
+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
+ return op-ostart;
+}
+
+size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ void const* src, size_t srcSize,
+ unsigned lastBlock) {
+ ZSTD_entropyCTablesMetadata_t entropyMetadata;
+
+ FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc->seqStore,
+ &zc->blockState.prevCBlock->entropy,
+ &zc->blockState.nextCBlock->entropy,
+ &zc->appliedParams,
+ &entropyMetadata,
+ zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */), "");
+
+ return ZSTD_compressSubBlock_multi(&zc->seqStore,
+ zc->blockState.prevCBlock,
+ zc->blockState.nextCBlock,
+ &entropyMetadata,
+ &zc->appliedParams,
+ dst, dstCapacity,
+ src, srcSize,
+ zc->bmi2, lastBlock,
+ zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */);
+}
+/**** ended inlining compress/zstd_compress_superblock.c ****/
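+
+/* Illustration (not from the upstream sources): the super-block path above is
+ * normally reached by asking for small compressed blocks through the public
+ * parameter API. This is a hedged sketch: ZSTD_c_targetCBlockSize lives in the
+ * advanced/experimental part of zstd.h for this version, so treat its exact
+ * availability as an assumption; example_compressWithSmallBlocks is a made-up name. */
+#if 0 /* illustrative sketch only, never compiled */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <zstd.h>
+
+static size_t example_compressWithSmallBlocks(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ size_t cSize = 0;
+ if (cctx == NULL) return 0; /* treat as failure */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 3);
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 4096); /* aim for ~4 KB compressed blocks */
+ cSize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ ZSTD_freeCCtx(cctx);
+ return cSize; /* check with ZSTD_isError() */
+}
+#endif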
+/**** start inlining compress/zstd_compress.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include <limits.h> /* INT_MAX */
+#include <string.h> /* memset */
+/**** start inlining ../common/cpu.h ****/
+/*
+ * Copyright (c) 2018-2020, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMMON_CPU_H
+#define ZSTD_COMMON_CPU_H
+
+/**
+ * Implementation taken from folly/CpuId.h
+ * https://github.com/facebook/folly/blob/master/folly/CpuId.h
+ */
+
+#include <string.h>
+
+/**** skipping file: mem.h ****/
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+typedef struct {
+ U32 f1c;
+ U32 f1d;
+ U32 f7b;
+ U32 f7c;
+} ZSTD_cpuid_t;
+
+MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
+ U32 f1c = 0;
+ U32 f1d = 0;
+ U32 f7b = 0;
+ U32 f7c = 0;
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+ int reg[4];
+ __cpuid((int*)reg, 0);
+ {
+ int const n = reg[0];
+ if (n >= 1) {
+ __cpuid((int*)reg, 1);
+ f1c = (U32)reg[2];
+ f1d = (U32)reg[3];
+ }
+ if (n >= 7) {
+ __cpuidex((int*)reg, 7, 0);
+ f7b = (U32)reg[1];
+ f7c = (U32)reg[2];
+ }
+ }
+#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
+ /* The following block works like the normal cpuid branch below, but gcc
+ * reserves ebx for use as its PIC register, so we must specially
+ * handle the save and restore to avoid clobbering the register.
+ */
+ U32 n;
+ __asm__(
+ "pushl %%ebx\n\t"
+ "cpuid\n\t"
+ "popl %%ebx\n\t"
+ : "=a"(n)
+ : "a"(0)
+ : "ecx", "edx");
+ if (n >= 1) {
+ U32 f1a;
+ __asm__(
+ "pushl %%ebx\n\t"
+ "cpuid\n\t"
+ "popl %%ebx\n\t"
+ : "=a"(f1a), "=c"(f1c), "=d"(f1d)
+ : "a"(1));
+ }
+ if (n >= 7) {
+ __asm__(
+ "pushl %%ebx\n\t"
+ "cpuid\n\t"
+ "movl %%ebx, %%eax\n\t"
+ "popl %%ebx"
+ : "=a"(f7b), "=c"(f7c)
+ : "a"(7), "c"(0)
+ : "edx");
+ }
+#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
+ U32 n;
+ __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx");
+ if (n >= 1) {
+ U32 f1a;
+ __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx");
+ }
+ if (n >= 7) {
+ U32 f7a;
+ __asm__("cpuid"
+ : "=a"(f7a), "=b"(f7b), "=c"(f7c)
+ : "a"(7), "c"(0)
+ : "edx");
+ }
+#endif
+ {
+ ZSTD_cpuid_t cpuid;
+ cpuid.f1c = f1c;
+ cpuid.f1d = f1d;
+ cpuid.f7b = f7b;
+ cpuid.f7c = f7c;
+ return cpuid;
+ }
+}
+
+#define X(name, r, bit) \
+ MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) { \
+ return ((cpuid.r) & (1U << bit)) != 0; \
+ }
+
+/* cpuid(1): Processor Info and Feature Bits. */
+#define C(name, bit) X(name, f1c, bit)
+ C(sse3, 0)
+ C(pclmuldq, 1)
+ C(dtes64, 2)
+ C(monitor, 3)
+ C(dscpl, 4)
+ C(vmx, 5)
+ C(smx, 6)
+ C(eist, 7)
+ C(tm2, 8)
+ C(ssse3, 9)
+ C(cnxtid, 10)
+ C(fma, 12)
+ C(cx16, 13)
+ C(xtpr, 14)
+ C(pdcm, 15)
+ C(pcid, 17)
+ C(dca, 18)
+ C(sse41, 19)
+ C(sse42, 20)
+ C(x2apic, 21)
+ C(movbe, 22)
+ C(popcnt, 23)
+ C(tscdeadline, 24)
+ C(aes, 25)
+ C(xsave, 26)
+ C(osxsave, 27)
+ C(avx, 28)
+ C(f16c, 29)
+ C(rdrand, 30)
+#undef C
+#define D(name, bit) X(name, f1d, bit)
+ D(fpu, 0)
+ D(vme, 1)
+ D(de, 2)
+ D(pse, 3)
+ D(tsc, 4)
+ D(msr, 5)
+ D(pae, 6)
+ D(mce, 7)
+ D(cx8, 8)
+ D(apic, 9)
+ D(sep, 11)
+ D(mtrr, 12)
+ D(pge, 13)
+ D(mca, 14)
+ D(cmov, 15)
+ D(pat, 16)
+ D(pse36, 17)
+ D(psn, 18)
+ D(clfsh, 19)
+ D(ds, 21)
+ D(acpi, 22)
+ D(mmx, 23)
+ D(fxsr, 24)
+ D(sse, 25)
+ D(sse2, 26)
+ D(ss, 27)
+ D(htt, 28)
+ D(tm, 29)
+ D(pbe, 31)
+#undef D
+
+/* cpuid(7): Extended Features. */
+#define B(name, bit) X(name, f7b, bit)
+ B(bmi1, 3)
+ B(hle, 4)
+ B(avx2, 5)
+ B(smep, 7)
+ B(bmi2, 8)
+ B(erms, 9)
+ B(invpcid, 10)
+ B(rtm, 11)
+ B(mpx, 14)
+ B(avx512f, 16)
+ B(avx512dq, 17)
+ B(rdseed, 18)
+ B(adx, 19)
+ B(smap, 20)
+ B(avx512ifma, 21)
+ B(pcommit, 22)
+ B(clflushopt, 23)
+ B(clwb, 24)
+ B(avx512pf, 26)
+ B(avx512er, 27)
+ B(avx512cd, 28)
+ B(sha, 29)
+ B(avx512bw, 30)
+ B(avx512vl, 31)
+#undef B
+#define C(name, bit) X(name, f7c, bit)
+ C(prefetchwt1, 0)
+ C(avx512vbmi, 1)
+#undef C
+
+#undef X
+
+#endif /* ZSTD_COMMON_CPU_H */
+/**** ended inlining ../common/cpu.h ****/
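+
+/* Illustration (not from the upstream sources): the X()/B()/C()/D() macros above
+ * expand to tiny predicates such as ZSTD_cpuid_bmi2() and ZSTD_cpuid_avx2().
+ * A minimal dispatch sketch (example_useFastPath is a hypothetical name): */
+#if 0 /* illustrative sketch only, never compiled */
+static int example_useFastPath(void)
+{
+ ZSTD_cpuid_t const cpuid = ZSTD_cpuid();
+ /* query the cached feature bits instead of re-running CPUID at every call site */
+ return ZSTD_cpuid_bmi2(cpuid) && ZSTD_cpuid_avx2(cpuid);
+}
+#endif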
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: hist.h ****/
+#define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: zstd_compress_sequences.h ****/
+/**** skipping file: zstd_compress_literals.h ****/
+/**** start inlining zstd_fast.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_FAST_H
+#define ZSTD_FAST_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: zstd_compress_internal.h ****/
+
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+ void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+size_t ZSTD_compressBlock_fast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_fast_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_fast_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_FAST_H */
+/**** ended inlining zstd_fast.h ****/
+/**** start inlining zstd_double_fast.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_DOUBLE_FAST_H
+#define ZSTD_DOUBLE_FAST_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: zstd_compress_internal.h ****/
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+ void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+size_t ZSTD_compressBlock_doubleFast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_DOUBLE_FAST_H */
+/**** ended inlining zstd_double_fast.h ****/
+/**** start inlining zstd_lazy.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LAZY_H
+#define ZSTD_LAZY_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/**** skipping file: zstd_compress_internal.h ****/
+
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
+
+void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(): pre-emptively increases the value of ZSTD_DUBT_UNSORTED_MARK */
+
+size_t ZSTD_compressBlock_btlazy2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_greedy_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btlazy2_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_LAZY_H */
+/**** ended inlining zstd_lazy.h ****/
+/**** start inlining zstd_opt.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_OPT_H
+#define ZSTD_OPT_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/**** skipping file: zstd_compress_internal.h ****/
+
+/* used in ZSTD_loadDictionaryContent() */
+void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend);
+
+size_t ZSTD_compressBlock_btopt(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+
+size_t ZSTD_compressBlock_btopt_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_btopt_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+ /* note : no btultra2 variant for extDict nor dictMatchState,
+ * because btultra2 is not meant to work with dictionaries
+ * and is specific to the first block (which has no prefix) */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_OPT_H */
+/**** ended inlining zstd_opt.h ****/
+/**** start inlining zstd_ldm.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LDM_H
+#define ZSTD_LDM_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: ../zstd.h ****/
+
+/*-*************************************
+* Long distance matching
+***************************************/
+
+#define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_LIMIT_DEFAULT
+
+void ZSTD_ldm_fillHashTable(
+ ldmState_t* state, const BYTE* ip,
+ const BYTE* iend, ldmParams_t const* params);
+
+/**
+ * ZSTD_ldm_generateSequences():
+ *
+ * Generates the sequences using the long distance match finder.
+ * Generates long range matching sequences in `sequences`, which parse a prefix
+ * of the source. `sequences` must be large enough to store every sequence,
+ * which can be checked with `ZSTD_ldm_getMaxNbSeq()`.
+ * @returns 0 or an error code.
+ *
+ * NOTE: The user must have called ZSTD_window_update() for all of the input
+ * they have, even if they pass it to ZSTD_ldm_generateSequences() in chunks.
+ * NOTE: This function returns an error if it runs out of space to store
+ * sequences.
+ */
+size_t ZSTD_ldm_generateSequences(
+ ldmState_t* ldms, rawSeqStore_t* sequences,
+ ldmParams_t const* params, void const* src, size_t srcSize);
+
+/**
+ * ZSTD_ldm_blockCompress():
+ *
+ * Compresses a block using the predefined sequences, along with a secondary
+ * block compressor. The literals section of every sequence is passed to the
+ * secondary block compressor, and those sequences are interspersed with the
+ * predefined sequences. Returns the length of the last literals.
+ * Updates `rawSeqStore.pos` to indicate how many sequences have been consumed.
+ * `rawSeqStore.seq` may also be updated to split the last sequence between two
+ * blocks.
+ * @return The length of the last literals.
+ *
+ * NOTE: The source must be at most the maximum block size, but the predefined
+ * sequences can be any size, and may be longer than the block. In the case that
+ * they are longer than the block, the last sequences may need to be split into
+ * two. We handle that case correctly, and update `rawSeqStore` appropriately.
+ * NOTE: This function does not return any errors.
+ */
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+/**
+ * ZSTD_ldm_skipSequences():
+ *
+ * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`.
+ * Avoids emitting matches less than `minMatch` bytes.
+ * Must be called for data which is not passed to ZSTD_ldm_blockCompress().
+ */
+void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize,
+ U32 const minMatch);
+
+
+/** ZSTD_ldm_getTableSize() :
+ * Estimate the space needed for long distance matching tables or 0 if LDM is
+ * disabled.
+ */
+size_t ZSTD_ldm_getTableSize(ldmParams_t params);
+
+/** ZSTD_ldm_getMaxNbSeq() :
+ * Return an upper bound on the number of sequences that can be produced by
+ * the long distance matcher, or 0 if LDM is disabled.
+ */
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize);
+
+/** ZSTD_ldm_adjustParameters() :
+ * If the params->hashRateLog is not set, set it to its default value based on
+ * windowLog and params->hashLog.
+ *
+ * Ensures that params->bucketSizeLog is <= params->hashLog (setting it to
+ * params->hashLog if it is not).
+ *
+ * Ensures that the minMatchLength >= targetLength during optimal parsing.
+ */
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+ ZSTD_compressionParameters const* cParams);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_LDM_H */
+/**** ended inlining zstd_ldm.h ****/
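+
+/* Illustration (not from the upstream sources): long-distance matching is
+ * normally switched on through the public parameter API rather than by calling
+ * the functions declared above directly. A hedged sketch; the parameter names
+ * are taken from zstd.h and example_enableLdm is a made-up helper: */
+#if 0 /* illustrative sketch only, never compiled */
+#include <zstd.h>
+
+static void example_enableLdm(ZSTD_CCtx* cctx)
+{
+ /* enable the long-distance matcher and widen the window so far matches exist */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, 1);
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, 27); /* 128 MB window (example value) */
+}
+#endif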
+/**** skipping file: zstd_compress_superblock.h ****/
+
+
+/*-*************************************
+* Helper functions
+***************************************/
+/* ZSTD_compressBound()
+ * Note that the result from this function is only compatible with the "normal"
+ * full-block strategy.
+ * When there are a lot of small blocks due to frequent flushes in streaming mode,
+ * the overhead of block headers can make the compressed data larger than the
+ * return value of ZSTD_compressBound().
+ */
+size_t ZSTD_compressBound(size_t srcSize) {
+ return ZSTD_COMPRESSBOUND(srcSize);
+}
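+
+/* Illustration (not from the upstream sources): the usual caller-side pattern is
+ * to size the destination buffer with ZSTD_compressBound() so a single-frame
+ * ZSTD_compress() cannot fail for lack of space (the streaming caveat above still
+ * applies). example_compressAlloc is a hypothetical helper name. */
+#if 0 /* illustrative sketch only, never compiled */
+#include <stdlib.h>
+#include <zstd.h>
+
+static void* example_compressAlloc(const void* src, size_t srcSize, size_t* cSizePtr)
+{
+ size_t const dstCapacity = ZSTD_compressBound(srcSize);
+ void* const dst = malloc(dstCapacity);
+ if (dst == NULL) return NULL;
+ { size_t const cSize = ZSTD_compress(dst, dstCapacity, src, srcSize, 3 /* level */);
+ if (ZSTD_isError(cSize)) { free(dst); return NULL; }
+ *cSizePtr = cSize; /* actual compressed size, <= dstCapacity */
+ }
+ return dst;
+}
+#endif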
+
+
+/*-*************************************
+* Context memory management
+***************************************/
+struct ZSTD_CDict_s {
+ const void* dictContent;
+ size_t dictContentSize;
+ U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */
+ ZSTD_cwksp workspace;
+ ZSTD_matchState_t matchState;
+ ZSTD_compressedBlockState_t cBlockState;
+ ZSTD_customMem customMem;
+ U32 dictID;
+ int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */
+}; /* typedef'd to ZSTD_CDict within "zstd.h" */
+
+ZSTD_CCtx* ZSTD_createCCtx(void)
+{
+ return ZSTD_createCCtx_advanced(ZSTD_defaultCMem);
+}
+
+static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager)
+{
+ assert(cctx != NULL);
+ memset(cctx, 0, sizeof(*cctx));
+ cctx->customMem = memManager;
+ cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+ { size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters);
+ assert(!ZSTD_isError(err));
+ (void)err;
+ }
+}
+
+ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem)
+{
+ ZSTD_STATIC_ASSERT(zcss_init==0);
+ ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1));
+ if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+ { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_malloc(sizeof(ZSTD_CCtx), customMem);
+ if (!cctx) return NULL;
+ ZSTD_initCCtx(cctx, customMem);
+ return cctx;
+ }
+}
+
+ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize)
+{
+ ZSTD_cwksp ws;
+ ZSTD_CCtx* cctx;
+ if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL; /* minimum size */
+ if ((size_t)workspace & 7) return NULL; /* must be 8-aligned */
+ ZSTD_cwksp_init(&ws, workspace, workspaceSize);
+
+ cctx = (ZSTD_CCtx*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CCtx));
+ if (cctx == NULL) return NULL;
+
+ memset(cctx, 0, sizeof(ZSTD_CCtx));
+ ZSTD_cwksp_move(&cctx->workspace, &ws);
+ cctx->staticSize = workspaceSize;
+
+ /* statically sized space. entropyWorkspace never moves (but prev/next block swap places) */
+ if (!ZSTD_cwksp_check_available(&cctx->workspace, HUF_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL;
+ cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t));
+ cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t));
+ cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, HUF_WORKSPACE_SIZE);
+ cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+ return cctx;
+}
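+
+/* Illustration (not from the upstream sources): ZSTD_initStaticCCtx() expects a
+ * caller-provided, 8-byte-aligned workspace that outlives the context. This
+ * hedged sketch sizes it with ZSTD_estimateCCtxSize() (a static-linking-only
+ * API); example_staticCCtxCompress is a made-up name. */
+#if 0 /* illustrative sketch only, never compiled */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <stdlib.h>
+#include <zstd.h>
+
+static size_t example_staticCCtxCompress(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ size_t const wkspSize = ZSTD_estimateCCtxSize(3 /* compression level */);
+ void* const wksp = malloc(wkspSize); /* malloc() is at least 8-byte aligned */
+ ZSTD_CCtx* const cctx = wksp ? ZSTD_initStaticCCtx(wksp, wkspSize) : NULL;
+ size_t cSize = 0;
+ if (cctx != NULL)
+ cSize = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, 3);
+ free(wksp); /* a static cctx is never passed to ZSTD_freeCCtx() */
+ return cSize; /* 0 or ZSTD_isError() means failure */
+}
+#endif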
+
+/**
+ * Clears and frees all of the dictionaries in the CCtx.
+ */
+static void ZSTD_clearAllDicts(ZSTD_CCtx* cctx)
+{
+ ZSTD_free(cctx->localDict.dictBuffer, cctx->customMem);
+ ZSTD_freeCDict(cctx->localDict.cdict);
+ memset(&cctx->localDict, 0, sizeof(cctx->localDict));
+ memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict));
+ cctx->cdict = NULL;
+}
+
+static size_t ZSTD_sizeof_localDict(ZSTD_localDict dict)
+{
+ size_t const bufferSize = dict.dictBuffer != NULL ? dict.dictSize : 0;
+ size_t const cdictSize = ZSTD_sizeof_CDict(dict.cdict);
+ return bufferSize + cdictSize;
+}
+
+static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx)
+{
+ assert(cctx != NULL);
+ assert(cctx->staticSize == 0);
+ ZSTD_clearAllDicts(cctx);
+#ifdef ZSTD_MULTITHREAD
+ ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = NULL;
+#endif
+ ZSTD_cwksp_free(&cctx->workspace, cctx->customMem);
+}
+
+size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
+{
+ if (cctx==NULL) return 0; /* support free on NULL */
+ RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+ "not compatible with static CCtx");
+ {
+ int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
+ ZSTD_freeCCtxContent(cctx);
+ if (!cctxInWorkspace) {
+ ZSTD_free(cctx, cctx->customMem);
+ }
+ }
+ return 0;
+}
+
+
+static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+ return ZSTDMT_sizeof_CCtx(cctx->mtctx);
+#else
+ (void)cctx;
+ return 0;
+#endif
+}
+
+
+size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx)
+{
+ if (cctx==NULL) return 0; /* support sizeof on NULL */
+ /* cctx may be in the workspace */
+ return (cctx->workspace.workspace == cctx ? 0 : sizeof(*cctx))
+ + ZSTD_cwksp_sizeof(&cctx->workspace)
+ + ZSTD_sizeof_localDict(cctx->localDict)
+ + ZSTD_sizeof_mtctx(cctx);
+}
+
+size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs)
+{
+ return ZSTD_sizeof_CCtx(zcs); /* same object */
+}
+
+/* private API call, for dictBuilder only */
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); }
+
+static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
+ ZSTD_compressionParameters cParams)
+{
+ ZSTD_CCtx_params cctxParams;
+ memset(&cctxParams, 0, sizeof(cctxParams));
+ cctxParams.cParams = cParams;
+ cctxParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */
+ assert(!ZSTD_checkCParams(cParams));
+ cctxParams.fParams.contentSizeFlag = 1;
+ return cctxParams;
+}
+
+static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced(
+ ZSTD_customMem customMem)
+{
+ ZSTD_CCtx_params* params;
+ if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+ params = (ZSTD_CCtx_params*)ZSTD_calloc(
+ sizeof(ZSTD_CCtx_params), customMem);
+ if (!params) { return NULL; }
+ params->customMem = customMem;
+ params->compressionLevel = ZSTD_CLEVEL_DEFAULT;
+ params->fParams.contentSizeFlag = 1;
+ return params;
+}
+
+ZSTD_CCtx_params* ZSTD_createCCtxParams(void)
+{
+ return ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem);
+}
+
+size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params)
+{
+ if (params == NULL) { return 0; }
+ ZSTD_free(params, params->customMem);
+ return 0;
+}
+
+size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params)
+{
+ return ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT);
+}
+
+size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) {
+ RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!");
+ memset(cctxParams, 0, sizeof(*cctxParams));
+ cctxParams->compressionLevel = compressionLevel;
+ cctxParams->fParams.contentSizeFlag = 1;
+ return 0;
+}
+
+size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
+{
+ RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!");
+ FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , "");
+ memset(cctxParams, 0, sizeof(*cctxParams));
+ assert(!ZSTD_checkCParams(params.cParams));
+ cctxParams->cParams = params.cParams;
+ cctxParams->fParams = params.fParams;
+ cctxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */
+ return 0;
+}
+
+/* ZSTD_assignParamsToCCtxParams() :
+ * params is presumed valid at this stage */
+static ZSTD_CCtx_params ZSTD_assignParamsToCCtxParams(
+ const ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params)
+{
+ ZSTD_CCtx_params ret = *cctxParams;
+ assert(!ZSTD_checkCParams(params->cParams));
+ ret.cParams = params->cParams;
+ ret.fParams = params->fParams;
+ ret.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */
+ return ret;
+}
+
+ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
+{
+ ZSTD_bounds bounds = { 0, 0, 0 };
+
+ switch(param)
+ {
+ case ZSTD_c_compressionLevel:
+ bounds.lowerBound = ZSTD_minCLevel();
+ bounds.upperBound = ZSTD_maxCLevel();
+ return bounds;
+
+ case ZSTD_c_windowLog:
+ bounds.lowerBound = ZSTD_WINDOWLOG_MIN;
+ bounds.upperBound = ZSTD_WINDOWLOG_MAX;
+ return bounds;
+
+ case ZSTD_c_hashLog:
+ bounds.lowerBound = ZSTD_HASHLOG_MIN;
+ bounds.upperBound = ZSTD_HASHLOG_MAX;
+ return bounds;
+
+ case ZSTD_c_chainLog:
+ bounds.lowerBound = ZSTD_CHAINLOG_MIN;
+ bounds.upperBound = ZSTD_CHAINLOG_MAX;
+ return bounds;
+
+ case ZSTD_c_searchLog:
+ bounds.lowerBound = ZSTD_SEARCHLOG_MIN;
+ bounds.upperBound = ZSTD_SEARCHLOG_MAX;
+ return bounds;
+
+ case ZSTD_c_minMatch:
+ bounds.lowerBound = ZSTD_MINMATCH_MIN;
+ bounds.upperBound = ZSTD_MINMATCH_MAX;
+ return bounds;
+
+ case ZSTD_c_targetLength:
+ bounds.lowerBound = ZSTD_TARGETLENGTH_MIN;
+ bounds.upperBound = ZSTD_TARGETLENGTH_MAX;
+ return bounds;
+
+ case ZSTD_c_strategy:
+ bounds.lowerBound = ZSTD_STRATEGY_MIN;
+ bounds.upperBound = ZSTD_STRATEGY_MAX;
+ return bounds;
+
+ case ZSTD_c_contentSizeFlag:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_checksumFlag:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_dictIDFlag:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_nbWorkers:
+ bounds.lowerBound = 0;
+#ifdef ZSTD_MULTITHREAD
+ bounds.upperBound = ZSTDMT_NBWORKERS_MAX;
+#else
+ bounds.upperBound = 0;
+#endif
+ return bounds;
+
+ case ZSTD_c_jobSize:
+ bounds.lowerBound = 0;
+#ifdef ZSTD_MULTITHREAD
+ bounds.upperBound = ZSTDMT_JOBSIZE_MAX;
+#else
+ bounds.upperBound = 0;
+#endif
+ return bounds;
+
+ case ZSTD_c_overlapLog:
+#ifdef ZSTD_MULTITHREAD
+ bounds.lowerBound = ZSTD_OVERLAPLOG_MIN;
+ bounds.upperBound = ZSTD_OVERLAPLOG_MAX;
+#else
+ bounds.lowerBound = 0;
+ bounds.upperBound = 0;
+#endif
+ return bounds;
+
+ case ZSTD_c_enableLongDistanceMatching:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_ldmHashLog:
+ bounds.lowerBound = ZSTD_LDM_HASHLOG_MIN;
+ bounds.upperBound = ZSTD_LDM_HASHLOG_MAX;
+ return bounds;
+
+ case ZSTD_c_ldmMinMatch:
+ bounds.lowerBound = ZSTD_LDM_MINMATCH_MIN;
+ bounds.upperBound = ZSTD_LDM_MINMATCH_MAX;
+ return bounds;
+
+ case ZSTD_c_ldmBucketSizeLog:
+ bounds.lowerBound = ZSTD_LDM_BUCKETSIZELOG_MIN;
+ bounds.upperBound = ZSTD_LDM_BUCKETSIZELOG_MAX;
+ return bounds;
+
+ case ZSTD_c_ldmHashRateLog:
+ bounds.lowerBound = ZSTD_LDM_HASHRATELOG_MIN;
+ bounds.upperBound = ZSTD_LDM_HASHRATELOG_MAX;
+ return bounds;
+
+ /* experimental parameters */
+ case ZSTD_c_rsyncable:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_forceMaxWindow :
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_format:
+ ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless);
+ bounds.lowerBound = ZSTD_f_zstd1;
+ bounds.upperBound = ZSTD_f_zstd1_magicless; /* note : how to ensure at compile time that this is the highest value enum ? */
+ return bounds;
+
+ case ZSTD_c_forceAttachDict:
+ ZSTD_STATIC_ASSERT(ZSTD_dictDefaultAttach < ZSTD_dictForceCopy);
+ bounds.lowerBound = ZSTD_dictDefaultAttach;
+ bounds.upperBound = ZSTD_dictForceLoad; /* note : how to ensure at compile time that this is the highest value enum ? */
+ return bounds;
+
+ case ZSTD_c_literalCompressionMode:
+ ZSTD_STATIC_ASSERT(ZSTD_lcm_auto < ZSTD_lcm_huffman && ZSTD_lcm_huffman < ZSTD_lcm_uncompressed);
+ bounds.lowerBound = ZSTD_lcm_auto;
+ bounds.upperBound = ZSTD_lcm_uncompressed;
+ return bounds;
+
+ case ZSTD_c_targetCBlockSize:
+ bounds.lowerBound = ZSTD_TARGETCBLOCKSIZE_MIN;
+ bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX;
+ return bounds;
+
+ case ZSTD_c_srcSizeHint:
+ bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN;
+ bounds.upperBound = ZSTD_SRCSIZEHINT_MAX;
+ return bounds;
+
+ default:
+ bounds.error = ERROR(parameter_unsupported);
+ return bounds;
+ }
+}
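+
+/* Illustration (not from the upstream sources): ZSTD_cParam_getBounds() is also
+ * part of the public API, so callers can validate a value before handing it to
+ * ZSTD_CCtx_setParameter(). example_levelInBounds is a hypothetical helper. */
+#if 0 /* illustrative sketch only, never compiled */
+#include <zstd.h>
+
+static int example_levelInBounds(int level)
+{
+ ZSTD_bounds const b = ZSTD_cParam_getBounds(ZSTD_c_compressionLevel);
+ return !ZSTD_isError(b.error) && level >= b.lowerBound && level <= b.upperBound;
+}
+#endif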
+
+/* ZSTD_cParam_clampBounds:
+ * Clamps the value into the bounded range.
+ */
+static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value)
+{
+ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);
+ if (ZSTD_isError(bounds.error)) return bounds.error;
+ if (*value < bounds.lowerBound) *value = bounds.lowerBound;
+ if (*value > bounds.upperBound) *value = bounds.upperBound;
+ return 0;
+}
+
+#define BOUNDCHECK(cParam, val) { \
+ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \
+ parameter_outOfBound, "Param out of bounds"); \
+}
+
+
+static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
+{
+ switch(param)
+ {
+ case ZSTD_c_compressionLevel:
+ case ZSTD_c_hashLog:
+ case ZSTD_c_chainLog:
+ case ZSTD_c_searchLog:
+ case ZSTD_c_minMatch:
+ case ZSTD_c_targetLength:
+ case ZSTD_c_strategy:
+ return 1;
+
+ case ZSTD_c_format:
+ case ZSTD_c_windowLog:
+ case ZSTD_c_contentSizeFlag:
+ case ZSTD_c_checksumFlag:
+ case ZSTD_c_dictIDFlag:
+ case ZSTD_c_forceMaxWindow :
+ case ZSTD_c_nbWorkers:
+ case ZSTD_c_jobSize:
+ case ZSTD_c_overlapLog:
+ case ZSTD_c_rsyncable:
+ case ZSTD_c_enableLongDistanceMatching:
+ case ZSTD_c_ldmHashLog:
+ case ZSTD_c_ldmMinMatch:
+ case ZSTD_c_ldmBucketSizeLog:
+ case ZSTD_c_ldmHashRateLog:
+ case ZSTD_c_forceAttachDict:
+ case ZSTD_c_literalCompressionMode:
+ case ZSTD_c_targetCBlockSize:
+ case ZSTD_c_srcSizeHint:
+ default:
+ return 0;
+ }
+}
+
+size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
+{
+ DEBUGLOG(4, "ZSTD_CCtx_setParameter (%i, %i)", (int)param, value);
+ if (cctx->streamStage != zcss_init) {
+ if (ZSTD_isUpdateAuthorized(param)) {
+ cctx->cParamsChanged = 1;
+ } else {
+ RETURN_ERROR(stage_wrong, "can only set params in ctx init stage");
+ } }
+
+ switch(param)
+ {
+ case ZSTD_c_nbWorkers:
+ RETURN_ERROR_IF((value!=0) && cctx->staticSize, parameter_unsupported,
+ "MT not compatible with static alloc");
+ break;
+
+ case ZSTD_c_compressionLevel:
+ case ZSTD_c_windowLog:
+ case ZSTD_c_hashLog:
+ case ZSTD_c_chainLog:
+ case ZSTD_c_searchLog:
+ case ZSTD_c_minMatch:
+ case ZSTD_c_targetLength:
+ case ZSTD_c_strategy:
+ case ZSTD_c_ldmHashRateLog:
+ case ZSTD_c_format:
+ case ZSTD_c_contentSizeFlag:
+ case ZSTD_c_checksumFlag:
+ case ZSTD_c_dictIDFlag:
+ case ZSTD_c_forceMaxWindow:
+ case ZSTD_c_forceAttachDict:
+ case ZSTD_c_literalCompressionMode:
+ case ZSTD_c_jobSize:
+ case ZSTD_c_overlapLog:
+ case ZSTD_c_rsyncable:
+ case ZSTD_c_enableLongDistanceMatching:
+ case ZSTD_c_ldmHashLog:
+ case ZSTD_c_ldmMinMatch:
+ case ZSTD_c_ldmBucketSizeLog:
+ case ZSTD_c_targetCBlockSize:
+ case ZSTD_c_srcSizeHint:
+ break;
+
+ default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+ }
+ return ZSTD_CCtxParams_setParameter(&cctx->requestedParams, param, value);
+}
+
+size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+ ZSTD_cParameter param, int value)
+{
+ DEBUGLOG(4, "ZSTD_CCtxParams_setParameter (%i, %i)", (int)param, value);
+ switch(param)
+ {
+ case ZSTD_c_format :
+ BOUNDCHECK(ZSTD_c_format, value);
+ CCtxParams->format = (ZSTD_format_e)value;
+ return (size_t)CCtxParams->format;
+
+ case ZSTD_c_compressionLevel : {
+ FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
+ if (value) { /* 0 : does not change current level */
+ CCtxParams->compressionLevel = value;
+ }
+ if (CCtxParams->compressionLevel >= 0) return (size_t)CCtxParams->compressionLevel;
+ return 0; /* return type (size_t) cannot represent negative values */
+ }
+
+ case ZSTD_c_windowLog :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_windowLog, value);
+ CCtxParams->cParams.windowLog = (U32)value;
+ return CCtxParams->cParams.windowLog;
+
+ case ZSTD_c_hashLog :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_hashLog, value);
+ CCtxParams->cParams.hashLog = (U32)value;
+ return CCtxParams->cParams.hashLog;
+
+ case ZSTD_c_chainLog :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_chainLog, value);
+ CCtxParams->cParams.chainLog = (U32)value;
+ return CCtxParams->cParams.chainLog;
+
+ case ZSTD_c_searchLog :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_searchLog, value);
+ CCtxParams->cParams.searchLog = (U32)value;
+ return (size_t)value;
+
+ case ZSTD_c_minMatch :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_minMatch, value);
+ CCtxParams->cParams.minMatch = value;
+ return CCtxParams->cParams.minMatch;
+
+ case ZSTD_c_targetLength :
+ BOUNDCHECK(ZSTD_c_targetLength, value);
+ CCtxParams->cParams.targetLength = value;
+ return CCtxParams->cParams.targetLength;
+
+ case ZSTD_c_strategy :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_strategy, value);
+ CCtxParams->cParams.strategy = (ZSTD_strategy)value;
+ return (size_t)CCtxParams->cParams.strategy;
+
+ case ZSTD_c_contentSizeFlag :
+ /* Content size written in frame header _when known_ (default:1) */
+ DEBUGLOG(4, "set content size flag = %u", (value!=0));
+ CCtxParams->fParams.contentSizeFlag = value != 0;
+ return CCtxParams->fParams.contentSizeFlag;
+
+ case ZSTD_c_checksumFlag :
+ /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
+ CCtxParams->fParams.checksumFlag = value != 0;
+ return CCtxParams->fParams.checksumFlag;
+
+ case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
+ DEBUGLOG(4, "set dictIDFlag = %u", (value!=0));
+ CCtxParams->fParams.noDictIDFlag = !value;
+ return !CCtxParams->fParams.noDictIDFlag;
+
+ case ZSTD_c_forceMaxWindow :
+ CCtxParams->forceWindow = (value != 0);
+ return CCtxParams->forceWindow;
+
+ case ZSTD_c_forceAttachDict : {
+ const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value;
+ BOUNDCHECK(ZSTD_c_forceAttachDict, pref);
+ CCtxParams->attachDictPref = pref;
+ return CCtxParams->attachDictPref;
+ }
+
+ case ZSTD_c_literalCompressionMode : {
+ const ZSTD_literalCompressionMode_e lcm = (ZSTD_literalCompressionMode_e)value;
+ BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm);
+ CCtxParams->literalCompressionMode = lcm;
+ return CCtxParams->literalCompressionMode;
+ }
+
+ case ZSTD_c_nbWorkers :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+ return 0;
+#else
+ FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
+ CCtxParams->nbWorkers = value;
+ return CCtxParams->nbWorkers;
+#endif
+
+ case ZSTD_c_jobSize :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+ return 0;
+#else
+ /* Adjust to the minimum non-default value. */
+ if (value != 0 && value < ZSTDMT_JOBSIZE_MIN)
+ value = ZSTDMT_JOBSIZE_MIN;
+ FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
+ assert(value >= 0);
+ CCtxParams->jobSize = value;
+ return CCtxParams->jobSize;
+#endif
+
+ case ZSTD_c_overlapLog :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+ return 0;
+#else
+ FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), "");
+ CCtxParams->overlapLog = value;
+ return CCtxParams->overlapLog;
+#endif
+
+ case ZSTD_c_rsyncable :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+ return 0;
+#else
+ FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), "");
+ CCtxParams->rsyncable = value;
+ return CCtxParams->rsyncable;
+#endif
+
+ case ZSTD_c_enableLongDistanceMatching :
+ CCtxParams->ldmParams.enableLdm = (value!=0);
+ return CCtxParams->ldmParams.enableLdm;
+
+ case ZSTD_c_ldmHashLog :
+ if (value!=0) /* 0 ==> auto */
+ BOUNDCHECK(ZSTD_c_ldmHashLog, value);
+ CCtxParams->ldmParams.hashLog = value;
+ return CCtxParams->ldmParams.hashLog;
+
+ case ZSTD_c_ldmMinMatch :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_ldmMinMatch, value);
+ CCtxParams->ldmParams.minMatchLength = value;
+ return CCtxParams->ldmParams.minMatchLength;
+
+ case ZSTD_c_ldmBucketSizeLog :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value);
+ CCtxParams->ldmParams.bucketSizeLog = value;
+ return CCtxParams->ldmParams.bucketSizeLog;
+
+ case ZSTD_c_ldmHashRateLog :
+ RETURN_ERROR_IF(value > ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN,
+ parameter_outOfBound, "Param out of bounds!");
+ CCtxParams->ldmParams.hashRateLog = value;
+ return CCtxParams->ldmParams.hashRateLog;
+
+ case ZSTD_c_targetCBlockSize :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_targetCBlockSize, value);
+ CCtxParams->targetCBlockSize = value;
+ return CCtxParams->targetCBlockSize;
+
+ case ZSTD_c_srcSizeHint :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_srcSizeHint, value);
+ CCtxParams->srcSizeHint = value;
+ return CCtxParams->srcSizeHint;
+
+ default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+ }
+}
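+
+/* Illustration (not from the upstream sources): the setter above backs the public
+ * ZSTD_CCtx_setParameter() entry point. A minimal end-to-end use of the advanced
+ * one-shot API; example_compressChecksummed is a made-up name. */
+#if 0 /* illustrative sketch only, never compiled */
+#include <zstd.h>
+
+static size_t example_compressChecksummed(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ size_t cSize = 0;
+ if (cctx == NULL) return 0; /* treat as failure */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1); /* append a content checksum */
+ cSize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ ZSTD_freeCCtx(cctx);
+ return cSize; /* check with ZSTD_isError() */
+}
+#endif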
+
+size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value)
+{
+ return ZSTD_CCtxParams_getParameter(&cctx->requestedParams, param, value);
+}
+
+size_t ZSTD_CCtxParams_getParameter(
+ ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, int* value)
+{
+ switch(param)
+ {
+ case ZSTD_c_format :
+ *value = CCtxParams->format;
+ break;
+ case ZSTD_c_compressionLevel :
+ *value = CCtxParams->compressionLevel;
+ break;
+ case ZSTD_c_windowLog :
+ *value = (int)CCtxParams->cParams.windowLog;
+ break;
+ case ZSTD_c_hashLog :
+ *value = (int)CCtxParams->cParams.hashLog;
+ break;
+ case ZSTD_c_chainLog :
+ *value = (int)CCtxParams->cParams.chainLog;
+ break;
+ case ZSTD_c_searchLog :
+ *value = CCtxParams->cParams.searchLog;
+ break;
+ case ZSTD_c_minMatch :
+ *value = CCtxParams->cParams.minMatch;
+ break;
+ case ZSTD_c_targetLength :
+ *value = CCtxParams->cParams.targetLength;
+ break;
+ case ZSTD_c_strategy :
+ *value = (unsigned)CCtxParams->cParams.strategy;
+ break;
+ case ZSTD_c_contentSizeFlag :
+ *value = CCtxParams->fParams.contentSizeFlag;
+ break;
+ case ZSTD_c_checksumFlag :
+ *value = CCtxParams->fParams.checksumFlag;
+ break;
+ case ZSTD_c_dictIDFlag :
+ *value = !CCtxParams->fParams.noDictIDFlag;
+ break;
+ case ZSTD_c_forceMaxWindow :
+ *value = CCtxParams->forceWindow;
+ break;
+ case ZSTD_c_forceAttachDict :
+ *value = CCtxParams->attachDictPref;
+ break;
+ case ZSTD_c_literalCompressionMode :
+ *value = CCtxParams->literalCompressionMode;
+ break;
+ case ZSTD_c_nbWorkers :
+#ifndef ZSTD_MULTITHREAD
+ assert(CCtxParams->nbWorkers == 0);
+#endif
+ *value = CCtxParams->nbWorkers;
+ break;
+ case ZSTD_c_jobSize :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
+#else
+ assert(CCtxParams->jobSize <= INT_MAX);
+ *value = (int)CCtxParams->jobSize;
+ break;
+#endif
+ case ZSTD_c_overlapLog :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
+#else
+ *value = CCtxParams->overlapLog;
+ break;
+#endif
+ case ZSTD_c_rsyncable :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
+#else
+ *value = CCtxParams->rsyncable;
+ break;
+#endif
+ case ZSTD_c_enableLongDistanceMatching :
+ *value = CCtxParams->ldmParams.enableLdm;
+ break;
+ case ZSTD_c_ldmHashLog :
+ *value = CCtxParams->ldmParams.hashLog;
+ break;
+ case ZSTD_c_ldmMinMatch :
+ *value = CCtxParams->ldmParams.minMatchLength;
+ break;
+ case ZSTD_c_ldmBucketSizeLog :
+ *value = CCtxParams->ldmParams.bucketSizeLog;
+ break;
+ case ZSTD_c_ldmHashRateLog :
+ *value = CCtxParams->ldmParams.hashRateLog;
+ break;
+ case ZSTD_c_targetCBlockSize :
+ *value = (int)CCtxParams->targetCBlockSize;
+ break;
+ case ZSTD_c_srcSizeHint :
+ *value = (int)CCtxParams->srcSizeHint;
+ break;
+ default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+ }
+ return 0;
+}
+
+/** ZSTD_CCtx_setParametersUsingCCtxParams() :
+ * just copies `params` into `cctx`;
+ * no action is performed, parameters are merely stored.
+ * If ZSTDMT is enabled, parameters are pushed to cctx->mtctx.
+ * This is possible even if a compression is ongoing,
+ * in which case the new parameters will be applied on the fly, starting with the next compression job.
+ */
+size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+ ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params)
+{
+ DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams");
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "The context is in the wrong stage!");
+ RETURN_ERROR_IF(cctx->cdict, stage_wrong,
+ "Can't override parameters with cdict attached (some must "
+ "be inherited from the cdict).");
+
+ cctx->requestedParams = *params;
+ return 0;
+}
+
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize);
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't set pledgedSrcSize when not in init stage.");
+ cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+ return 0;
+}
+
+/**
+ * Initializes the local dict using the requested parameters.
+ * NOTE: This does not use the pledged src size, because it may be used for more
+ * than one compression.
+ */
+static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+{
+ ZSTD_localDict* const dl = &cctx->localDict;
+ ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams(
+ &cctx->requestedParams, ZSTD_CONTENTSIZE_UNKNOWN, dl->dictSize);
+ if (dl->dict == NULL) {
+ /* No local dictionary. */
+ assert(dl->dictBuffer == NULL);
+ assert(dl->cdict == NULL);
+ assert(dl->dictSize == 0);
+ return 0;
+ }
+ if (dl->cdict != NULL) {
+ assert(cctx->cdict == dl->cdict);
+ /* Local dictionary already initialized. */
+ return 0;
+ }
+ assert(dl->dictSize > 0);
+ assert(cctx->cdict == NULL);
+ assert(cctx->prefixDict.dict == NULL);
+
+ dl->cdict = ZSTD_createCDict_advanced(
+ dl->dict,
+ dl->dictSize,
+ ZSTD_dlm_byRef,
+ dl->dictContentType,
+ cParams,
+ cctx->customMem);
+ RETURN_ERROR_IF(!dl->cdict, memory_allocation, "ZSTD_createCDict_advanced failed");
+ cctx->cdict = dl->cdict;
+ return 0;
+}
+
+size_t ZSTD_CCtx_loadDictionary_advanced(
+ ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+{
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't load a dictionary when ctx is not in init stage.");
+ RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+ "no malloc for static CCtx");
+ DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
+ ZSTD_clearAllDicts(cctx); /* in case one already exists */
+ if (dict == NULL || dictSize == 0) /* no dictionary mode */
+ return 0;
+ if (dictLoadMethod == ZSTD_dlm_byRef) {
+ cctx->localDict.dict = dict;
+ } else {
+ void* dictBuffer = ZSTD_malloc(dictSize, cctx->customMem);
+ RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
+ memcpy(dictBuffer, dict, dictSize);
+ cctx->localDict.dictBuffer = dictBuffer;
+ cctx->localDict.dict = dictBuffer;
+ }
+ cctx->localDict.dictSize = dictSize;
+ cctx->localDict.dictContentType = dictContentType;
+ return 0;
+}
+
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(
+ ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+{
+ return ZSTD_CCtx_loadDictionary_advanced(
+ cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
+}
+
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+{
+ return ZSTD_CCtx_loadDictionary_advanced(
+ cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
+}
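+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * the two load methods trade a copy for a lifetime requirement :
+ * ZSTD_dlm_byCopy duplicates `dict` into the cctx, while ZSTD_dlm_byRef only
+ * keeps a pointer, so the caller must keep `dict` valid while it is in use :
+ *
+ *     ZSTD_CCtx_loadDictionary_byReference(cctx, dictBuf, dictBufSize);
+ *     // dictBuf must outlive every compression that may use it
+ */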
+
+
+size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't ref a dict when ctx not in init stage.");
+ /* Free the existing local cdict (if any) to save memory. */
+ ZSTD_clearAllDicts(cctx);
+ cctx->cdict = cdict;
+ return 0;
+}
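+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * a CDict is typically digested once, then referenced for many compressions :
+ *
+ *     ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictBufSize, level);
+ *     ZSTD_CCtx_refCDict(cctx, cdict);
+ *     ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *     // cdict stays referenced for subsequent compressions until cleared
+ */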
+
+size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize)
+{
+ return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent);
+}
+
+size_t ZSTD_CCtx_refPrefix_advanced(
+ ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType)
+{
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't ref a prefix when ctx not in init stage.");
+ ZSTD_clearAllDicts(cctx);
+ if (prefix != NULL && prefixSize > 0) {
+ cctx->prefixDict.dict = prefix;
+ cctx->prefixDict.dictSize = prefixSize;
+ cctx->prefixDict.dictContentType = dictContentType;
+ }
+ return 0;
+}
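+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * a prefix is a single-use dictionary : it only applies to the next frame and
+ * must be referenced again (and remain valid in memory) for each compression :
+ *
+ *     ZSTD_CCtx_refPrefix(cctx, prevChunk, prevChunkSize);
+ *     ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *     // decompression must reference the same prefix to regenerate the data
+ */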
+
+/*! ZSTD_CCtx_reset() :
+ *  Also discards any loaded dictionary (when parameters are reset) */
+size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
+{
+ if ( (reset == ZSTD_reset_session_only)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ cctx->streamStage = zcss_init;
+ cctx->pledgedSrcSizePlusOne = 0;
+ }
+ if ( (reset == ZSTD_reset_parameters)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't reset parameters only when not in init stage.");
+ ZSTD_clearAllDicts(cctx);
+ return ZSTD_CCtxParams_reset(&cctx->requestedParams);
+ }
+ return 0;
+}
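+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * typical use when recycling a context between unrelated compressions :
+ *
+ *     ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 3);
+ *     ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *
+ * ZSTD_reset_session_only keeps parameters and any referenced dictionary,
+ * and only aborts the current frame.
+ */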
+
+
+/** ZSTD_checkCParams() :
+    checks that CParam values remain within the authorized range.
+ @return : 0, or an error code if one value is beyond authorized range */
+size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+{
+ BOUNDCHECK(ZSTD_c_windowLog, (int)cParams.windowLog);
+ BOUNDCHECK(ZSTD_c_chainLog, (int)cParams.chainLog);
+ BOUNDCHECK(ZSTD_c_hashLog, (int)cParams.hashLog);
+ BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog);
+ BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch);
+ BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength);
+ BOUNDCHECK(ZSTD_c_strategy, cParams.strategy);
+ return 0;
+}
+
+/** ZSTD_clampCParams() :
+ *  clamp CParam values into the valid range.
+ * @return : valid CParams */
+static ZSTD_compressionParameters
+ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+{
+# define CLAMP_TYPE(cParam, val, type) { \
+ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \
+ if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \
+ else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
+ }
+# define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
+ CLAMP(ZSTD_c_windowLog, cParams.windowLog);
+ CLAMP(ZSTD_c_chainLog, cParams.chainLog);
+ CLAMP(ZSTD_c_hashLog, cParams.hashLog);
+ CLAMP(ZSTD_c_searchLog, cParams.searchLog);
+ CLAMP(ZSTD_c_minMatch, cParams.minMatch);
+ CLAMP(ZSTD_c_targetLength,cParams.targetLength);
+ CLAMP_TYPE(ZSTD_c_strategy,cParams.strategy, ZSTD_strategy);
+ return cParams;
+}
+
+/** ZSTD_cycleLog() :
+ * condition for correct operation : hashLog > 1 */
+U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
+{
+ U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2);
+ return hashLog - btScale;
+}
+
+/** ZSTD_adjustCParams_internal() :
+ * optimize `cPar` for a specified input (`srcSize` and `dictSize`).
+ * mostly downsize to reduce memory consumption and initialization latency.
+ * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known.
+ * note : `srcSize==0` means 0!
+ * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */
+static ZSTD_compressionParameters
+ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+ unsigned long long srcSize,
+ size_t dictSize)
+{
+ static const U64 minSrcSize = 513; /* (1<<9) + 1 */
+ static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
+ assert(ZSTD_checkCParams(cPar)==0);
+
+ if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN)
+ srcSize = minSrcSize;
+
+ /* resize windowLog if input is small enough, to use less memory */
+ if ( (srcSize < maxWindowResize)
+ && (dictSize < maxWindowResize) ) {
+ U32 const tSize = (U32)(srcSize + dictSize);
+ static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
+ U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN :
+ ZSTD_highbit32(tSize-1) + 1;
+ if (cPar.windowLog > srcLog) cPar.windowLog = srcLog;
+ }
+ if (cPar.hashLog > cPar.windowLog+1) cPar.hashLog = cPar.windowLog+1;
+ { U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy);
+ if (cycleLog > cPar.windowLog)
+ cPar.chainLog -= (cycleLog - cPar.windowLog);
+ }
+
+ if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
+ cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */
+
+ return cPar;
+}
+
+ZSTD_compressionParameters
+ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
+ unsigned long long srcSize,
+ size_t dictSize)
+{
+ cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */
+ if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN;
+ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize);
+}
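+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * this pairs naturally with ZSTD_getCParams(), e.g. to downsize generic
+ * parameters when the input is known to be small :
+ *
+ *     ZSTD_compressionParameters cp = ZSTD_getCParams(level, srcSize, dictSize);
+ *     cp = ZSTD_adjustCParams(cp, srcSize, dictSize);
+ */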
+
+static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize);
+static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize);
+
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize)
+{
+ ZSTD_compressionParameters cParams;
+ if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) {
+ srcSizeHint = CCtxParams->srcSizeHint;
+ }
+ cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize);
+ if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
+ if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog;
+ if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog;
+ if (CCtxParams->cParams.chainLog) cParams.chainLog = CCtxParams->cParams.chainLog;
+ if (CCtxParams->cParams.searchLog) cParams.searchLog = CCtxParams->cParams.searchLog;
+ if (CCtxParams->cParams.minMatch) cParams.minMatch = CCtxParams->cParams.minMatch;
+ if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength;
+ if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy;
+ assert(!ZSTD_checkCParams(cParams));
+ /* srcSizeHint == 0 means 0 */
+ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize);
+}
+
+static size_t
+ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+ const U32 forCCtx)
+{
+ size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
+ size_t const hSize = ((size_t)1) << cParams->hashLog;
+ U32 const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
+ size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
+ /* We don't use ZSTD_cwksp_alloc_size() here because the tables aren't
+ * surrounded by redzones in ASAN. */
+ size_t const tableSpace = chainSize * sizeof(U32)
+ + hSize * sizeof(U32)
+ + h3Size * sizeof(U32);
+ size_t const optPotentialSpace =
+ ZSTD_cwksp_alloc_size((MaxML+1) * sizeof(U32))
+ + ZSTD_cwksp_alloc_size((MaxLL+1) * sizeof(U32))
+ + ZSTD_cwksp_alloc_size((MaxOff+1) * sizeof(U32))
+ + ZSTD_cwksp_alloc_size((1<<Litbits) * sizeof(U32))
+ + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
+ + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+ size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
+ ? optPotentialSpace
+ : 0;
+ DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u",
+ (U32)chainSize, (U32)hSize, (U32)h3Size);
+ return tableSpace + optSpace;
+}
+
+size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+ RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+ { ZSTD_compressionParameters const cParams =
+ ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+ size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
+ U32 const divider = (cParams.minMatch==3) ? 3 : 4;
+ size_t const maxNbSeq = blockSize / divider;
+ size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
+ + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef))
+ + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
+ size_t const entropySpace = ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE);
+ size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t));
+ size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1);
+
+ size_t const ldmSpace = ZSTD_ldm_getTableSize(params->ldmParams);
+ size_t const ldmSeqSpace = ZSTD_cwksp_alloc_size(ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize) * sizeof(rawSeq));
+
+ /* estimateCCtxSize is for one-shot compression. So no buffers should
+ * be needed. However, we still allocate two 0-sized buffers, which can
+ * take space under ASAN. */
+ size_t const bufferSpace = ZSTD_cwksp_alloc_size(0)
+ + ZSTD_cwksp_alloc_size(0);
+
+ size_t const cctxSpace = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx));
+
+ size_t const neededSpace =
+ cctxSpace +
+ entropySpace +
+ blockStateSpace +
+ ldmSpace +
+ ldmSeqSpace +
+ matchStateSize +
+ tokenSpace +
+ bufferSpace;
+
+ DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
+ return neededSpace;
+ }
+}
+
+size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+ ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams);
+ return ZSTD_estimateCCtxSize_usingCCtxParams(&params);
+}
+
+static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel)
+{
+ ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+ return ZSTD_estimateCCtxSize_usingCParams(cParams);
+}
+
+size_t ZSTD_estimateCCtxSize(int compressionLevel)
+{
+ int level;
+ size_t memBudget = 0;
+ for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) {
+ size_t const newMB = ZSTD_estimateCCtxSize_internal(level);
+ if (newMB > memBudget) memBudget = newMB;
+ }
+ return memBudget;
+}
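+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * the estimate is the size the static-allocation API expects, e.g. :
+ *
+ *     size_t const wkspSize = ZSTD_estimateCCtxSize(level);
+ *     void* const wksp = malloc(wkspSize);
+ *     ZSTD_CCtx* const cctx = ZSTD_initStaticCCtx(wksp, wkspSize);
+ *
+ * The loop above scans all levels up to `compressionLevel` because memory
+ * usage is not strictly monotonic with level.
+ */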
+
+size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+ RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+ { ZSTD_compressionParameters const cParams =
+ ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+ size_t const CCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(params);
+ size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
+ size_t const inBuffSize = ((size_t)1 << cParams.windowLog) + blockSize;
+ size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1;
+ size_t const streamingSize = ZSTD_cwksp_alloc_size(inBuffSize)
+ + ZSTD_cwksp_alloc_size(outBuffSize);
+
+ return CCtxSize + streamingSize;
+ }
+}
+
+size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+ ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams);
+ return ZSTD_estimateCStreamSize_usingCCtxParams(&params);
+}
+
+static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel)
+{
+ ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+ return ZSTD_estimateCStreamSize_usingCParams(cParams);
+}
+
+size_t ZSTD_estimateCStreamSize(int compressionLevel)
+{
+ int level;
+ size_t memBudget = 0;
+ for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) {
+ size_t const newMB = ZSTD_estimateCStreamSize_internal(level);
+ if (newMB > memBudget) memBudget = newMB;
+ }
+ return memBudget;
+}
+
+/* ZSTD_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for the current frame.
+ * It is able to count progression inside worker threads (non-blocking mode).
+ */
+ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+ if (cctx->appliedParams.nbWorkers > 0) {
+ return ZSTDMT_getFrameProgression(cctx->mtctx);
+ }
+#endif
+ { ZSTD_frameProgression fp;
+ size_t const buffered = (cctx->inBuff == NULL) ? 0 :
+ cctx->inBuffPos - cctx->inToCompress;
+ if (buffered) assert(cctx->inBuffPos >= cctx->inToCompress);
+ assert(buffered <= ZSTD_BLOCKSIZE_MAX);
+ fp.ingested = cctx->consumedSrcSize + buffered;
+ fp.consumed = cctx->consumedSrcSize;
+ fp.produced = cctx->producedCSize;
+ fp.flushed = cctx->producedCSize; /* simplified; some data might still be left within streaming output buffer */
+ fp.currentJobID = 0;
+ fp.nbActiveWorkers = 0;
+ return fp;
+} }
+
+/*! ZSTD_toFlushNow()
+ * Only useful for multithreading scenarios currently (nbWorkers >= 1).
+ */
+size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+ if (cctx->appliedParams.nbWorkers > 0) {
+ return ZSTDMT_toFlushNow(cctx->mtctx);
+ }
+#endif
+ (void)cctx;
+    return 0;   /* over-simplification; could also check whether the context is currently running in streaming mode and, if so, report how many bytes are left to be flushed within the output buffer */
+}
+
+static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1,
+ ZSTD_compressionParameters cParams2)
+{
+ (void)cParams1;
+ (void)cParams2;
+ assert(cParams1.windowLog == cParams2.windowLog);
+ assert(cParams1.chainLog == cParams2.chainLog);
+ assert(cParams1.hashLog == cParams2.hashLog);
+ assert(cParams1.searchLog == cParams2.searchLog);
+ assert(cParams1.minMatch == cParams2.minMatch);
+ assert(cParams1.targetLength == cParams2.targetLength);
+ assert(cParams1.strategy == cParams2.strategy);
+}
+
+void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
+{
+ int i;
+ for (i = 0; i < ZSTD_REP_NUM; ++i)
+ bs->rep[i] = repStartValue[i];
+ bs->entropy.huf.repeatMode = HUF_repeat_none;
+ bs->entropy.fse.offcode_repeatMode = FSE_repeat_none;
+ bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none;
+ bs->entropy.fse.litlength_repeatMode = FSE_repeat_none;
+}
+
+/*! ZSTD_invalidateMatchState()
+ * Invalidate all the matches in the match finder tables.
+ * Requires nextSrc and base to be set (can be NULL).
+ */
+static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms)
+{
+ ZSTD_window_clear(&ms->window);
+
+ ms->nextToUpdate = ms->window.dictLimit;
+ ms->loadedDictEnd = 0;
+ ms->opt.litLengthSum = 0; /* force reset of btopt stats */
+ ms->dictMatchState = NULL;
+}
+
+/**
+ * Indicates whether this compression proceeds directly from user-provided
+ * source buffer to user-provided destination buffer (ZSTDb_not_buffered), or
+ * whether the context needs to buffer the input/output (ZSTDb_buffered).
+ */
+typedef enum {
+ ZSTDb_not_buffered,
+ ZSTDb_buffered
+} ZSTD_buffered_policy_e;
+
+/**
+ * Controls, for this matchState reset, whether the tables need to be cleared /
+ * prepared for the coming compression (ZSTDcrp_makeClean), or whether the
+ * tables can be left unclean (ZSTDcrp_leaveDirty), because we know that a
+ * subsequent operation will overwrite the table space anyways (e.g., copying
+ * the matchState contents in from a CDict).
+ */
+typedef enum {
+ ZSTDcrp_makeClean,
+ ZSTDcrp_leaveDirty
+} ZSTD_compResetPolicy_e;
+
+/**
+ * Controls, for this matchState reset, whether indexing can continue where it
+ * left off (ZSTDirp_continue), or whether it needs to be restarted from zero
+ * (ZSTDirp_reset).
+ */
+typedef enum {
+ ZSTDirp_continue,
+ ZSTDirp_reset
+} ZSTD_indexResetPolicy_e;
+
+typedef enum {
+ ZSTD_resetTarget_CDict,
+ ZSTD_resetTarget_CCtx
+} ZSTD_resetTarget_e;
+
+static size_t
+ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ ZSTD_cwksp* ws,
+ const ZSTD_compressionParameters* cParams,
+ const ZSTD_compResetPolicy_e crp,
+ const ZSTD_indexResetPolicy_e forceResetIndex,
+ const ZSTD_resetTarget_e forWho)
+{
+ size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
+ size_t const hSize = ((size_t)1) << cParams->hashLog;
+ U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
+ size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
+
+ DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset);
+ if (forceResetIndex == ZSTDirp_reset) {
+ ZSTD_window_init(&ms->window);
+ ZSTD_cwksp_mark_tables_dirty(ws);
+ }
+
+ ms->hashLog3 = hashLog3;
+
+ ZSTD_invalidateMatchState(ms);
+
+ assert(!ZSTD_cwksp_reserve_failed(ws)); /* check that allocation hasn't already failed */
+
+ ZSTD_cwksp_clear_tables(ws);
+
+ DEBUGLOG(5, "reserving table space");
+ /* table Space */
+ ms->hashTable = (U32*)ZSTD_cwksp_reserve_table(ws, hSize * sizeof(U32));
+ ms->chainTable = (U32*)ZSTD_cwksp_reserve_table(ws, chainSize * sizeof(U32));
+ ms->hashTable3 = (U32*)ZSTD_cwksp_reserve_table(ws, h3Size * sizeof(U32));
+ RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
+ "failed a workspace allocation in ZSTD_reset_matchState");
+
+ DEBUGLOG(4, "reset table : %u", crp!=ZSTDcrp_leaveDirty);
+ if (crp!=ZSTDcrp_leaveDirty) {
+ /* reset tables only */
+ ZSTD_cwksp_clean_tables(ws);
+ }
+
+ /* opt parser space */
+ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
+ DEBUGLOG(4, "reserving optimal parser space");
+ ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
+ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
+ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
+ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
+ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t));
+ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+ }
+
+ ms->cParams = *cParams;
+
+ RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
+ "failed a workspace allocation in ZSTD_reset_matchState");
+
+ return 0;
+}
+
+/* ZSTD_indexTooCloseToMax() :
+ * minor optimization : prefer memset() rather than reduceIndex()
+ * which is measurably slow in some circumstances (reported for Visual Studio).
+ * Works when re-using a context for a lot of smallish inputs :
+ * if all inputs are smaller than ZSTD_INDEXOVERFLOW_MARGIN,
+ * memset() will be triggered before reduceIndex().
+ */
+#define ZSTD_INDEXOVERFLOW_MARGIN (16 MB)
+static int ZSTD_indexTooCloseToMax(ZSTD_window_t w)
+{
+ return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN);
+}
+
+/*! ZSTD_resetCCtx_internal() :
+ note : `params` are assumed fully validated at this stage */
+static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ ZSTD_CCtx_params params,
+ U64 const pledgedSrcSize,
+ ZSTD_compResetPolicy_e const crp,
+ ZSTD_buffered_policy_e const zbuff)
+{
+ ZSTD_cwksp* const ws = &zc->workspace;
+ DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u",
+ (U32)pledgedSrcSize, params.cParams.windowLog);
+ assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+
+ zc->isFirstBlock = 1;
+
+ if (params.ldmParams.enableLdm) {
+ /* Adjust long distance matching parameters */
+ ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
+ assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
+ assert(params.ldmParams.hashRateLog < 32);
+ zc->ldmState.hashPower = ZSTD_rollingHash_primePower(params.ldmParams.minMatchLength);
+ }
+
+ { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize));
+ size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+ U32 const divider = (params.cParams.minMatch==3) ? 3 : 4;
+ size_t const maxNbSeq = blockSize / divider;
+ size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
+ + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef))
+ + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
+ size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0;
+ size_t const buffInSize = (zbuff==ZSTDb_buffered) ? windowSize + blockSize : 0;
+ size_t const matchStateSize = ZSTD_sizeof_matchState(&params.cParams, /* forCCtx */ 1);
+ size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize);
+
+ ZSTD_indexResetPolicy_e needsIndexReset = zc->initialized ? ZSTDirp_continue : ZSTDirp_reset;
+
+ if (ZSTD_indexTooCloseToMax(zc->blockState.matchState.window)) {
+ needsIndexReset = ZSTDirp_reset;
+ }
+
+ if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0);
+
+ /* Check if workspace is large enough, alloc a new one if needed */
+ { size_t const cctxSpace = zc->staticSize ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
+ size_t const entropySpace = ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE);
+ size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t));
+ size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) + ZSTD_cwksp_alloc_size(buffOutSize);
+ size_t const ldmSpace = ZSTD_ldm_getTableSize(params.ldmParams);
+ size_t const ldmSeqSpace = ZSTD_cwksp_alloc_size(maxNbLdmSeq * sizeof(rawSeq));
+
+ size_t const neededSpace =
+ cctxSpace +
+ entropySpace +
+ blockStateSpace +
+ ldmSpace +
+ ldmSeqSpace +
+ matchStateSize +
+ tokenSpace +
+ bufferSpace;
+
+ int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace;
+ int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace);
+
+ DEBUGLOG(4, "Need %zuKB workspace, including %zuKB for match state, and %zuKB for buffers",
+ neededSpace>>10, matchStateSize>>10, bufferSpace>>10);
+ DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
+
+ if (workspaceTooSmall || workspaceWasteful) {
+ DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB",
+ ZSTD_cwksp_sizeof(ws) >> 10,
+ neededSpace >> 10);
+
+ RETURN_ERROR_IF(zc->staticSize, memory_allocation, "static cctx : no resize");
+
+ needsIndexReset = ZSTDirp_reset;
+
+ ZSTD_cwksp_free(ws, zc->customMem);
+ FORWARD_IF_ERROR(ZSTD_cwksp_create(ws, neededSpace, zc->customMem), "");
+
+ DEBUGLOG(5, "reserving object space");
+ /* Statically sized space.
+ * entropyWorkspace never moves,
+ * though prev/next block swap places */
+ assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t)));
+ zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+ RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock");
+ zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+ RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock");
+ zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, HUF_WORKSPACE_SIZE);
+            RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace");
+ } }
+
+ ZSTD_cwksp_clear(ws);
+
+ /* init params */
+ zc->appliedParams = params;
+ zc->blockState.matchState.cParams = params.cParams;
+ zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+ zc->consumedSrcSize = 0;
+ zc->producedCSize = 0;
+ if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)
+ zc->appliedParams.fParams.contentSizeFlag = 0;
+ DEBUGLOG(4, "pledged content size : %u ; flag : %u",
+ (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag);
+ zc->blockSize = blockSize;
+
+ XXH64_reset(&zc->xxhState, 0);
+ zc->stage = ZSTDcs_init;
+ zc->dictID = 0;
+
+ ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
+
+ /* ZSTD_wildcopy() is used to copy into the literals buffer,
+ * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+ */
+ zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
+ zc->seqStore.maxNbLit = blockSize;
+
+ /* buffers */
+ zc->inBuffSize = buffInSize;
+ zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
+ zc->outBuffSize = buffOutSize;
+ zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize);
+
+ /* ldm bucketOffsets table */
+ if (params.ldmParams.enableLdm) {
+ /* TODO: avoid memset? */
+ size_t const ldmBucketSize =
+ ((size_t)1) << (params.ldmParams.hashLog -
+ params.ldmParams.bucketSizeLog);
+ zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, ldmBucketSize);
+ memset(zc->ldmState.bucketOffsets, 0, ldmBucketSize);
+ }
+
+ /* sequences storage */
+ ZSTD_referenceExternalSequences(zc, NULL, 0);
+ zc->seqStore.maxNbSeq = maxNbSeq;
+ zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
+
+ FORWARD_IF_ERROR(ZSTD_reset_matchState(
+ &zc->blockState.matchState,
+ ws,
+ &params.cParams,
+ crp,
+ needsIndexReset,
+ ZSTD_resetTarget_CCtx), "");
+
+ /* ldm hash table */
+ if (params.ldmParams.enableLdm) {
+ /* TODO: avoid memset? */
+ size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog;
+ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
+ memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
+ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
+ zc->maxNbLdmSequences = maxNbLdmSeq;
+
+ ZSTD_window_init(&zc->ldmState.window);
+ ZSTD_window_clear(&zc->ldmState.window);
+ zc->ldmState.loadedDictEnd = 0;
+ }
+
+ DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
+ zc->initialized = 1;
+
+ return 0;
+ }
+}
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ * do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) {
+ int i;
+ for (i=0; i<ZSTD_REP_NUM; i++) cctx->blockState.prevCBlock->rep[i] = 0;
+ assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
+}
+
+/* These are the approximate sizes for each strategy past which copying the
+ * dictionary tables into the working context is faster than using them
+ * in-place.
+ */
+static const size_t attachDictSizeCutoffs[ZSTD_STRATEGY_MAX+1] = {
+ 8 KB, /* unused */
+ 8 KB, /* ZSTD_fast */
+ 16 KB, /* ZSTD_dfast */
+ 32 KB, /* ZSTD_greedy */
+ 32 KB, /* ZSTD_lazy */
+ 32 KB, /* ZSTD_lazy2 */
+ 32 KB, /* ZSTD_btlazy2 */
+ 32 KB, /* ZSTD_btopt */
+ 8 KB, /* ZSTD_btultra */
+ 8 KB /* ZSTD_btultra2 */
+};
+
+static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params,
+ U64 pledgedSrcSize)
+{
+ size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy];
+ return ( pledgedSrcSize <= cutoff
+ || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+ || params->attachDictPref == ZSTD_dictForceAttach )
+ && params->attachDictPref != ZSTD_dictForceCopy
+ && !params->forceWindow; /* dictMatchState isn't correctly
+ * handled in _enforceMaxDist */
+}
+
+static size_t
+ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+ const ZSTD_CDict* cdict,
+ ZSTD_CCtx_params params,
+ U64 pledgedSrcSize,
+ ZSTD_buffered_policy_e zbuff)
+{
+ { const ZSTD_compressionParameters* const cdict_cParams = &cdict->matchState.cParams;
+ unsigned const windowLog = params.cParams.windowLog;
+ assert(windowLog != 0);
+ /* Resize working context table params for input only, since the dict
+ * has its own tables. */
+ /* pledgeSrcSize == 0 means 0! */
+ params.cParams = ZSTD_adjustCParams_internal(*cdict_cParams, pledgedSrcSize, 0);
+ params.cParams.windowLog = windowLog;
+ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+ ZSTDcrp_makeClean, zbuff), "");
+ assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
+ }
+
+ { const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc
+ - cdict->matchState.window.base);
+ const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit;
+ if (cdictLen == 0) {
+ /* don't even attach dictionaries with no contents */
+ DEBUGLOG(4, "skipping attaching empty dictionary");
+ } else {
+ DEBUGLOG(4, "attaching dictionary into context");
+ cctx->blockState.matchState.dictMatchState = &cdict->matchState;
+
+ /* prep working match state so dict matches never have negative indices
+ * when they are translated to the working context's index space. */
+ if (cctx->blockState.matchState.window.dictLimit < cdictEnd) {
+ cctx->blockState.matchState.window.nextSrc =
+ cctx->blockState.matchState.window.base + cdictEnd;
+ ZSTD_window_clear(&cctx->blockState.matchState.window);
+ }
+ /* loadedDictEnd is expressed within the referential of the active context */
+ cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit;
+ } }
+
+ cctx->dictID = cdict->dictID;
+
+ /* copy block state */
+ memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+
+ return 0;
+}
+
+static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
+ const ZSTD_CDict* cdict,
+ ZSTD_CCtx_params params,
+ U64 pledgedSrcSize,
+ ZSTD_buffered_policy_e zbuff)
+{
+ const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams;
+
+ DEBUGLOG(4, "copying dictionary into context");
+
+ { unsigned const windowLog = params.cParams.windowLog;
+ assert(windowLog != 0);
+ /* Copy only compression parameters related to tables. */
+ params.cParams = *cdict_cParams;
+ params.cParams.windowLog = windowLog;
+ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+ ZSTDcrp_leaveDirty, zbuff), "");
+ assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
+ assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog);
+ assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog);
+ }
+
+ ZSTD_cwksp_mark_tables_dirty(&cctx->workspace);
+
+ /* copy tables */
+ { size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog);
+ size_t const hSize = (size_t)1 << cdict_cParams->hashLog;
+
+ memcpy(cctx->blockState.matchState.hashTable,
+ cdict->matchState.hashTable,
+ hSize * sizeof(U32));
+ memcpy(cctx->blockState.matchState.chainTable,
+ cdict->matchState.chainTable,
+ chainSize * sizeof(U32));
+ }
+
+ /* Zero the hashTable3, since the cdict never fills it */
+ { int const h3log = cctx->blockState.matchState.hashLog3;
+ size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0;
+ assert(cdict->matchState.hashLog3 == 0);
+ memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32));
+ }
+
+ ZSTD_cwksp_mark_tables_clean(&cctx->workspace);
+
+ /* copy dictionary offsets */
+ { ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
+ ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
+ dstMatchState->window = srcMatchState->window;
+ dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+ dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+ }
+
+ cctx->dictID = cdict->dictID;
+
+ /* copy block state */
+ memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+
+ return 0;
+}
+
+/* We have a choice between copying the dictionary context into the working
+ * context, or referencing the dictionary context from the working context
+ * in-place. We decide here which strategy to use. */
+static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
+ const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params,
+ U64 pledgedSrcSize,
+ ZSTD_buffered_policy_e zbuff)
+{
+
+ DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)",
+ (unsigned)pledgedSrcSize);
+
+ if (ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) {
+ return ZSTD_resetCCtx_byAttachingCDict(
+ cctx, cdict, *params, pledgedSrcSize, zbuff);
+ } else {
+ return ZSTD_resetCCtx_byCopyingCDict(
+ cctx, cdict, *params, pledgedSrcSize, zbuff);
+ }
+}
+
+/*! ZSTD_copyCCtx_internal() :
+ * Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+ * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+ * The "context", in this case, refers to the hash and chain tables,
+ * entropy tables, and dictionary references.
+ * `windowLog` value is enforced if != 0, otherwise value is copied from srcCCtx.
+ * @return : 0, or an error code */
+static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
+ const ZSTD_CCtx* srcCCtx,
+ ZSTD_frameParameters fParams,
+ U64 pledgedSrcSize,
+ ZSTD_buffered_policy_e zbuff)
+{
+ DEBUGLOG(5, "ZSTD_copyCCtx_internal");
+ RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong,
+ "Can't copy a ctx that's not in init stage.");
+
+ memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem));
+ { ZSTD_CCtx_params params = dstCCtx->requestedParams;
+ /* Copy only compression parameters related to tables. */
+ params.cParams = srcCCtx->appliedParams.cParams;
+ params.fParams = fParams;
+ ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize,
+ ZSTDcrp_leaveDirty, zbuff);
+ assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog);
+ assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy);
+ assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog);
+ assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog);
+ assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3);
+ }
+
+ ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace);
+
+ /* copy tables */
+ { size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog);
+ size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog;
+ int const h3log = srcCCtx->blockState.matchState.hashLog3;
+ size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0;
+
+ memcpy(dstCCtx->blockState.matchState.hashTable,
+ srcCCtx->blockState.matchState.hashTable,
+ hSize * sizeof(U32));
+ memcpy(dstCCtx->blockState.matchState.chainTable,
+ srcCCtx->blockState.matchState.chainTable,
+ chainSize * sizeof(U32));
+ memcpy(dstCCtx->blockState.matchState.hashTable3,
+ srcCCtx->blockState.matchState.hashTable3,
+ h3Size * sizeof(U32));
+ }
+
+ ZSTD_cwksp_mark_tables_clean(&dstCCtx->workspace);
+
+ /* copy dictionary offsets */
+ {
+ const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState;
+ ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState;
+ dstMatchState->window = srcMatchState->window;
+ dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+ dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+ }
+ dstCCtx->dictID = srcCCtx->dictID;
+
+ /* copy block state */
+ memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock));
+
+ return 0;
+}
+
+/*! ZSTD_copyCCtx() :
+ * Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+ * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+ * pledgedSrcSize==0 means "unknown".
+ *  @return : 0, or an error code */
+size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize)
+{
+ ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+ ZSTD_buffered_policy_e const zbuff = (ZSTD_buffered_policy_e)(srcCCtx->inBuffSize>0);
+ ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1);
+ if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;
+ fParams.contentSizeFlag = (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN);
+
+ return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx,
+ fParams, pledgedSrcSize,
+ zbuff);
+}
+
+
+#define ZSTD_ROWSIZE 16
+/*! ZSTD_reduceTable() :
+ * reduce table indexes by `reducerValue`, or squash to zero.
+ * PreserveMark preserves "unsorted mark" for btlazy2 strategy.
+ *  It must be set to a clear 0/1 value, to remove the branch during inlining.
+ *  The table size is presumed to be a multiple of ZSTD_ROWSIZE,
+ *  to help auto-vectorization */
+FORCE_INLINE_TEMPLATE void
+ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark)
+{
+ int const nbRows = (int)size / ZSTD_ROWSIZE;
+ int cellNb = 0;
+ int rowNb;
+ assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */
+    assert(size < (1U<<31));   /* can be cast to int */
+
+#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+ /* To validate that the table re-use logic is sound, and that we don't
+ * access table space that we haven't cleaned, we re-"poison" the table
+ * space every time we mark it dirty.
+ *
+ * This function however is intended to operate on those dirty tables and
+ * re-clean them. So when this function is used correctly, we can unpoison
+ * the memory it operated on. This introduces a blind spot though, since
+ * if we now try to operate on __actually__ poisoned memory, we will not
+ * detect that. */
+ __msan_unpoison(table, size * sizeof(U32));
+#endif
+
+ for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
+ int column;
+ for (column=0; column<ZSTD_ROWSIZE; column++) {
+ if (preserveMark) {
+ U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;
+ table[cellNb] += adder;
+ }
+ if (table[cellNb] < reducerValue) table[cellNb] = 0;
+ else table[cellNb] -= reducerValue;
+ cellNb++;
+ } }
+}
+
+static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue)
+{
+ ZSTD_reduceTable_internal(table, size, reducerValue, 0);
+}
+
+static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const reducerValue)
+{
+ ZSTD_reduceTable_internal(table, size, reducerValue, 1);
+}
+
+/*! ZSTD_reduceIndex() :
+ *  rescale all indexes to avoid future overflow (indexes are U32) */
+static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue)
+{
+ { U32 const hSize = (U32)1 << params->cParams.hashLog;
+ ZSTD_reduceTable(ms->hashTable, hSize, reducerValue);
+ }
+
+ if (params->cParams.strategy != ZSTD_fast) {
+ U32 const chainSize = (U32)1 << params->cParams.chainLog;
+ if (params->cParams.strategy == ZSTD_btlazy2)
+ ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue);
+ else
+ ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
+ }
+
+ if (ms->hashLog3) {
+ U32 const h3Size = (U32)1 << ms->hashLog3;
+ ZSTD_reduceTable(ms->hashTable3, h3Size, reducerValue);
+ }
+}
+
+
+/*-*******************************************************
+* Block entropic compression
+*********************************************************/
+
+/* See doc/zstd_compression_format.md for detailed format description */
+
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+{
+ const seqDef* const sequences = seqStorePtr->sequencesStart;
+ BYTE* const llCodeTable = seqStorePtr->llCode;
+ BYTE* const ofCodeTable = seqStorePtr->ofCode;
+ BYTE* const mlCodeTable = seqStorePtr->mlCode;
+ U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ U32 u;
+ assert(nbSeq <= seqStorePtr->maxNbSeq);
+ for (u=0; u<nbSeq; u++) {
+ U32 const llv = sequences[u].litLength;
+ U32 const mlv = sequences[u].matchLength;
+ llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
+ ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset);
+ mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
+ }
+ if (seqStorePtr->longLengthID==1)
+ llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
+ if (seqStorePtr->longLengthID==2)
+ mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
+}
+
+/* ZSTD_useTargetCBlockSize():
+ * Returns whether the target compressed block size parameter is being used.
+ * If used, compression makes a best effort to produce compressed blocks of around targetCBlockSize bytes.
+ * Returns 1 if true, 0 otherwise. */
+static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams)
+{
+ DEBUGLOG(5, "ZSTD_useTargetCBlockSize (targetCBlockSize=%zu)", cctxParams->targetCBlockSize);
+ return (cctxParams->targetCBlockSize != 0);
+}
+
+/* ZSTD_compressSequences_internal():
+ * actually compresses both literals and sequences */
+MEM_STATIC size_t
+ZSTD_compressSequences_internal(seqStore_t* seqStorePtr,
+ const ZSTD_entropyCTables_t* prevEntropy,
+ ZSTD_entropyCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ void* entropyWorkspace, size_t entropyWkspSize,
+ const int bmi2)
+{
+ const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+ ZSTD_strategy const strategy = cctxParams->cParams.strategy;
+ unsigned count[MaxSeq+1];
+ FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
+ FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
+ FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
+ U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */
+ const seqDef* const sequences = seqStorePtr->sequencesStart;
+ const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+ const BYTE* const llCodeTable = seqStorePtr->llCode;
+ const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstCapacity;
+ BYTE* op = ostart;
+ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ BYTE* seqHead;
+ BYTE* lastNCount = NULL;
+
+ DEBUGLOG(5, "ZSTD_compressSequences_internal (nbSeq=%zu)", nbSeq);
+ ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+
+ /* Compress literals */
+ { const BYTE* const literals = seqStorePtr->litStart;
+ size_t const litSize = (size_t)(seqStorePtr->lit - literals);
+ size_t const cSize = ZSTD_compressLiterals(
+ &prevEntropy->huf, &nextEntropy->huf,
+ cctxParams->cParams.strategy,
+ ZSTD_disableLiteralsCompression(cctxParams),
+ op, dstCapacity,
+ literals, litSize,
+ entropyWorkspace, entropyWkspSize,
+ bmi2);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
+ assert(cSize <= dstCapacity);
+ op += cSize;
+ }
+
+ /* Sequences Header */
+ RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
+ dstSize_tooSmall, "Can't fit seq hdr in output buf!");
+ if (nbSeq < 128) {
+ *op++ = (BYTE)nbSeq;
+ } else if (nbSeq < LONGNBSEQ) {
+ op[0] = (BYTE)((nbSeq>>8) + 0x80);
+ op[1] = (BYTE)nbSeq;
+ op+=2;
+ } else {
+ op[0]=0xFF;
+ MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ));
+ op+=3;
+ }
+ assert(op <= oend);
+ if (nbSeq==0) {
+ /* Copy the old tables over as if we repeated them */
+ memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
+ return (size_t)(op - ostart);
+ }
+
+ /* seqHead : flags for FSE encoding type */
+ seqHead = op++;
+ assert(op <= oend);
+
+ /* convert length/distances into codes */
+ ZSTD_seqToCodes(seqStorePtr);
+ /* build CTable for Literal Lengths */
+ { unsigned max = MaxLL;
+ size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */
+ DEBUGLOG(5, "Building LL table");
+ nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode;
+ LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode,
+ count, max, mostFrequent, nbSeq,
+ LLFSELog, prevEntropy->fse.litlengthCTable,
+ LL_defaultNorm, LL_defaultNormLog,
+ ZSTD_defaultAllowed, strategy);
+ assert(set_basic < set_compressed && set_rle < set_compressed);
+ assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(
+ op, (size_t)(oend - op),
+ CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype,
+ count, max, llCodeTable, nbSeq,
+ LL_defaultNorm, LL_defaultNormLog, MaxLL,
+ prevEntropy->fse.litlengthCTable,
+ sizeof(prevEntropy->fse.litlengthCTable),
+ entropyWorkspace, entropyWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed");
+ if (LLtype == set_compressed)
+ lastNCount = op;
+ op += countSize;
+ assert(op <= oend);
+ } }
+ /* build CTable for Offsets */
+ { unsigned max = MaxOff;
+ size_t const mostFrequent = HIST_countFast_wksp(
+ count, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */
+ /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
+ ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
+ DEBUGLOG(5, "Building OF table");
+ nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode;
+ Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode,
+ count, max, mostFrequent, nbSeq,
+ OffFSELog, prevEntropy->fse.offcodeCTable,
+ OF_defaultNorm, OF_defaultNormLog,
+ defaultPolicy, strategy);
+ assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(
+ op, (size_t)(oend - op),
+ CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype,
+ count, max, ofCodeTable, nbSeq,
+ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+ prevEntropy->fse.offcodeCTable,
+ sizeof(prevEntropy->fse.offcodeCTable),
+ entropyWorkspace, entropyWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed");
+ if (Offtype == set_compressed)
+ lastNCount = op;
+ op += countSize;
+ assert(op <= oend);
+ } }
+ /* build CTable for MatchLengths */
+ { unsigned max = MaxML;
+ size_t const mostFrequent = HIST_countFast_wksp(
+ count, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */
+ DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op));
+ nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode;
+ MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode,
+ count, max, mostFrequent, nbSeq,
+ MLFSELog, prevEntropy->fse.matchlengthCTable,
+ ML_defaultNorm, ML_defaultNormLog,
+ ZSTD_defaultAllowed, strategy);
+ assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(
+ op, (size_t)(oend - op),
+ CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype,
+ count, max, mlCodeTable, nbSeq,
+ ML_defaultNorm, ML_defaultNormLog, MaxML,
+ prevEntropy->fse.matchlengthCTable,
+ sizeof(prevEntropy->fse.matchlengthCTable),
+ entropyWorkspace, entropyWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed");
+ if (MLtype == set_compressed)
+ lastNCount = op;
+ op += countSize;
+ assert(op <= oend);
+ } }
+
+ *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
+
+ { size_t const bitstreamSize = ZSTD_encodeSequences(
+ op, (size_t)(oend - op),
+ CTable_MatchLength, mlCodeTable,
+ CTable_OffsetBits, ofCodeTable,
+ CTable_LitLength, llCodeTable,
+ sequences, nbSeq,
+ longOffsets, bmi2);
+ FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed");
+ op += bitstreamSize;
+ assert(op <= oend);
+ /* zstd versions <= 1.3.4 mistakenly report corruption when
+ * FSE_readNCount() receives a buffer < 4 bytes.
+ * Fixed by https://github.com/facebook/zstd/pull/1146.
+ * This can happen when the last set_compressed table present is 2
+ * bytes and the bitstream is only one byte.
+ * In this exceedingly rare case, we will simply emit an uncompressed
+ * block, since it isn't worth optimizing.
+ */
+ if (lastNCount && (op - lastNCount) < 4) {
+ /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
+ assert(op - lastNCount == 3);
+ DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
+ "emitting an uncompressed block.");
+ return 0;
+ }
+ }
+
+ DEBUGLOG(5, "compressed block size : %u", (unsigned)(op - ostart));
+ return (size_t)(op - ostart);
+}
+
+MEM_STATIC size_t
+ZSTD_compressSequences(seqStore_t* seqStorePtr,
+ const ZSTD_entropyCTables_t* prevEntropy,
+ ZSTD_entropyCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ size_t srcSize,
+ void* entropyWorkspace, size_t entropyWkspSize,
+ int bmi2)
+{
+ size_t const cSize = ZSTD_compressSequences_internal(
+ seqStorePtr, prevEntropy, nextEntropy, cctxParams,
+ dst, dstCapacity,
+ entropyWorkspace, entropyWkspSize, bmi2);
+ if (cSize == 0) return 0;
+ /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
+     * Since we ran out of space, the block must not be compressible, so fall back to a raw uncompressed block.
+ */
+ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
+ return 0; /* block not compressed */
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressSequences_internal failed");
+
+ /* Check compressibility */
+ { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
+ if (cSize >= maxCSize) return 0; /* block not compressed */
+ }
+
+ return cSize;
+}
+
+/* ZSTD_selectBlockCompressor() :
+ * Not static, but internal use only (used by long distance matcher)
+ * assumption : strat is a valid strategy */
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode)
+{
+ static const ZSTD_blockCompressor blockCompressor[3][ZSTD_STRATEGY_MAX+1] = {
+ { ZSTD_compressBlock_fast /* default for 0 */,
+ ZSTD_compressBlock_fast,
+ ZSTD_compressBlock_doubleFast,
+ ZSTD_compressBlock_greedy,
+ ZSTD_compressBlock_lazy,
+ ZSTD_compressBlock_lazy2,
+ ZSTD_compressBlock_btlazy2,
+ ZSTD_compressBlock_btopt,
+ ZSTD_compressBlock_btultra,
+ ZSTD_compressBlock_btultra2 },
+ { ZSTD_compressBlock_fast_extDict /* default for 0 */,
+ ZSTD_compressBlock_fast_extDict,
+ ZSTD_compressBlock_doubleFast_extDict,
+ ZSTD_compressBlock_greedy_extDict,
+ ZSTD_compressBlock_lazy_extDict,
+ ZSTD_compressBlock_lazy2_extDict,
+ ZSTD_compressBlock_btlazy2_extDict,
+ ZSTD_compressBlock_btopt_extDict,
+ ZSTD_compressBlock_btultra_extDict,
+ ZSTD_compressBlock_btultra_extDict },
+ { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */,
+ ZSTD_compressBlock_fast_dictMatchState,
+ ZSTD_compressBlock_doubleFast_dictMatchState,
+ ZSTD_compressBlock_greedy_dictMatchState,
+ ZSTD_compressBlock_lazy_dictMatchState,
+ ZSTD_compressBlock_lazy2_dictMatchState,
+ ZSTD_compressBlock_btlazy2_dictMatchState,
+ ZSTD_compressBlock_btopt_dictMatchState,
+ ZSTD_compressBlock_btultra_dictMatchState,
+ ZSTD_compressBlock_btultra_dictMatchState }
+ };
+ ZSTD_blockCompressor selectedCompressor;
+ ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1);
+
+ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
+ selectedCompressor = blockCompressor[(int)dictMode][(int)strat];
+ assert(selectedCompressor != NULL);
+ return selectedCompressor;
+}
+
+static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr,
+ const BYTE* anchor, size_t lastLLSize)
+{
+ memcpy(seqStorePtr->lit, anchor, lastLLSize);
+ seqStorePtr->lit += lastLLSize;
+}
+
+void ZSTD_resetSeqStore(seqStore_t* ssPtr)
+{
+ ssPtr->lit = ssPtr->litStart;
+ ssPtr->sequences = ssPtr->sequencesStart;
+ ssPtr->longLengthID = 0;
+}
+
+typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e;
+
+static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+{
+ ZSTD_matchState_t* const ms = &zc->blockState.matchState;
+ DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize);
+ assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+ /* Assert that we have correctly flushed the ctx params into the ms's copy */
+ ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
+ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+ ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.minMatch);
+ return ZSTDbss_noCompress; /* don't even attempt compression below a certain srcSize */
+ }
+ ZSTD_resetSeqStore(&(zc->seqStore));
+ /* required for optimal parser to read stats from dictionary */
+ ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy;
+ /* tell the optimal parser how we expect to compress literals */
+ ms->opt.literalCompressionMode = zc->appliedParams.literalCompressionMode;
+ /* a gap between an attached dict and the current window is not safe,
+ * they must remain adjacent,
+ * and when that stops being the case, the dict must be unset */
+ assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit);
+
+ /* limited update after a very long match */
+ { const BYTE* const base = ms->window.base;
+ const BYTE* const istart = (const BYTE*)src;
+ const U32 current = (U32)(istart-base);
+ if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1)); /* ensure no overflow */
+ if (current > ms->nextToUpdate + 384)
+ ms->nextToUpdate = current - MIN(192, (U32)(current - ms->nextToUpdate - 384));
+ }
+
+ /* select and store sequences */
+ { ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms);
+ size_t lastLLSize;
+ { int i;
+ for (i = 0; i < ZSTD_REP_NUM; ++i)
+ zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i];
+ }
+ if (zc->externSeqStore.pos < zc->externSeqStore.size) {
+ assert(!zc->appliedParams.ldmParams.enableLdm);
+ /* Updates ldmSeqStore.pos */
+ lastLLSize =
+ ZSTD_ldm_blockCompress(&zc->externSeqStore,
+ ms, &zc->seqStore,
+ zc->blockState.nextCBlock->rep,
+ src, srcSize);
+ assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
+ } else if (zc->appliedParams.ldmParams.enableLdm) {
+ rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0};
+
+ ldmSeqStore.seq = zc->ldmSequences;
+ ldmSeqStore.capacity = zc->maxNbLdmSequences;
+ /* Updates ldmSeqStore.size */
+ FORWARD_IF_ERROR(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore,
+ &zc->appliedParams.ldmParams,
+ src, srcSize), "");
+ /* Updates ldmSeqStore.pos */
+ lastLLSize =
+ ZSTD_ldm_blockCompress(&ldmSeqStore,
+ ms, &zc->seqStore,
+ zc->blockState.nextCBlock->rep,
+ src, srcSize);
+ assert(ldmSeqStore.pos == ldmSeqStore.size);
+ } else { /* not long range mode */
+ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode);
+ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
+ }
+ { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize;
+ ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize);
+ } }
+ return ZSTDbss_compress;
+}
+
+static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
+{
+ const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
+ const seqDef* seqs = seqStore->sequencesStart;
+ size_t seqsSize = seqStore->sequences - seqs;
+
+ ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
+ size_t i; size_t position; int repIdx;
+
+ assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
+ for (i = 0, position = 0; i < seqsSize; ++i) {
+ outSeqs[i].offset = seqs[i].offset;
+ outSeqs[i].litLength = seqs[i].litLength;
+ outSeqs[i].matchLength = seqs[i].matchLength + MINMATCH;
+
+ if (i == seqStore->longLengthPos) {
+ if (seqStore->longLengthID == 1) {
+ outSeqs[i].litLength += 0x10000;
+ } else if (seqStore->longLengthID == 2) {
+ outSeqs[i].matchLength += 0x10000;
+ }
+ }
+
+ if (outSeqs[i].offset <= ZSTD_REP_NUM) {
+ outSeqs[i].rep = outSeqs[i].offset;
+ repIdx = (unsigned int)i - outSeqs[i].offset;
+
+ if (outSeqs[i].litLength == 0) {
+ if (outSeqs[i].offset < 3) {
+ --repIdx;
+ } else {
+ repIdx = (unsigned int)i - 1;
+ }
+ ++outSeqs[i].rep;
+ }
+ assert(repIdx >= -3);
+ outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1];
+ if (outSeqs[i].rep == 4) {
+ --outSeqs[i].offset;
+ }
+ } else {
+ outSeqs[i].offset -= ZSTD_REP_NUM;
+ }
+
+ position += outSeqs[i].litLength;
+ outSeqs[i].matchPos = (unsigned int)position;
+ position += outSeqs[i].matchLength;
+ }
+ zc->seqCollector.seqIndex += seqsSize;
+}
+
+size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+ size_t outSeqsSize, const void* src, size_t srcSize)
+{
+ const size_t dstCapacity = ZSTD_compressBound(srcSize);
+ void* dst = ZSTD_malloc(dstCapacity, ZSTD_defaultCMem);
+ SeqCollector seqCollector;
+
+ RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!");
+
+ seqCollector.collectSequences = 1;
+ seqCollector.seqStart = outSeqs;
+ seqCollector.seqIndex = 0;
+ seqCollector.maxSequences = outSeqsSize;
+ zc->seqCollector = seqCollector;
+
+ ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
+ ZSTD_free(dst, ZSTD_defaultCMem);
+ return zc->seqCollector.seqIndex;
+}
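+
+/* Illustrative usage sketch for ZSTD_getSequences() (error handling and
+ * includes omitted; buffer names are placeholders). Since every emitted
+ * sequence consumes at least MINMATCH input bytes, an outSeqs array of
+ * srcSize entries is more than enough :
+ *
+ *   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *   ZSTD_Sequence* const seqs = malloc(srcSize * sizeof(ZSTD_Sequence));
+ *   size_t const nbSeqs = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize);
+ *   // inspect seqs[0 .. nbSeqs-1], then release resources
+ *   free(seqs);
+ *   ZSTD_freeCCtx(cctx);
+ */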
+
+/* Returns true if the given block is an RLE block */
+static int ZSTD_isRLE(const BYTE *ip, size_t length) {
+ size_t i;
+ if (length < 2) return 1;
+ for (i = 1; i < length; ++i) {
+ if (ip[0] != ip[i]) return 0;
+ }
+ return 1;
+}
+
+/* Returns true if the given block may be RLE.
+ * This is just a heuristic based on the compressibility.
+ * It may return both false positives and false negatives.
+ */
+static int ZSTD_maybeRLE(seqStore_t const* seqStore)
+{
+ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
+ size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart);
+
+ return nbSeqs < 4 && nbLits < 10;
+}
+
+static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc)
+{
+ ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
+ zc->blockState.prevCBlock = zc->blockState.nextCBlock;
+ zc->blockState.nextCBlock = tmp;
+}
+
+static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, U32 frame)
+{
+ /* This is the upper bound used for the length of an RLE block.
+ * It isn't the actual upper bound; finding the real threshold
+ * needs further investigation.
+ */
+ const U32 rleMaxLength = 25;
+ size_t cSize;
+ const BYTE* ip = (const BYTE*)src;
+ BYTE* op = (BYTE*)dst;
+ DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
+ (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
+ (unsigned)zc->blockState.matchState.nextToUpdate);
+
+ { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
+ FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
+ if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; }
+ }
+
+ if (zc->seqCollector.collectSequences) {
+ ZSTD_copyBlockSequences(zc);
+ return 0;
+ }
+
+ /* encode sequences and literals */
+ cSize = ZSTD_compressSequences(&zc->seqStore,
+ &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
+ &zc->appliedParams,
+ dst, dstCapacity,
+ srcSize,
+ zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
+ zc->bmi2);
+
+ if (frame &&
+ /* We don't want to emit our first block as an RLE block even if it qualifies, because
+ * doing so will cause the decoder (cli only) to throw a "should consume all input" error.
+ * This is only an issue for zstd <= v1.4.3.
+ */
+ !zc->isFirstBlock &&
+ cSize < rleMaxLength &&
+ ZSTD_isRLE(ip, srcSize))
+ {
+ cSize = 1;
+ op[0] = ip[0];
+ }
+
+out:
+ if (!ZSTD_isError(cSize) && cSize > 1) {
+ ZSTD_confirmRepcodesAndEntropyTables(zc);
+ }
+ /* We check that dictionaries have offset codes available for the first
+ * block. After the first block, the offcode table might not have large
+ * enough codes to represent the offsets in the data.
+ */
+ if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+ zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
+ return cSize;
+}
+
+static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const size_t bss, U32 lastBlock)
+{
+ DEBUGLOG(6, "Attempting ZSTD_compressSuperBlock()");
+ if (bss == ZSTDbss_compress) {
+ if (/* We don't want to emit our first block as an RLE block even if it qualifies, because
+ * doing so will cause the decoder (cli only) to throw a "should consume all input" error.
+ * This is only an issue for zstd <= v1.4.3.
+ */
+ !zc->isFirstBlock &&
+ ZSTD_maybeRLE(&zc->seqStore) &&
+ ZSTD_isRLE((BYTE const*)src, srcSize))
+ {
+ return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const*)src, srcSize, lastBlock);
+ }
+ /* Attempt superblock compression.
+ *
+ * Note that compressed size of ZSTD_compressSuperBlock() is not bound by the
+ * standard ZSTD_compressBound(). This is a problem, because even if we have
+ * space now, taking an extra byte now could cause us to run out of space later
+ * and violate ZSTD_compressBound().
+ *
+ * Define blockBound(blockSize) = blockSize + ZSTD_blockHeaderSize.
+ *
+ * In order to respect ZSTD_compressBound() we must attempt to emit a raw
+ * uncompressed block in these cases:
+ * * cSize == 0: Return code for an uncompressed block.
+ * * cSize == dstSize_tooSmall: We may have expanded beyond blockBound(srcSize).
+ * ZSTD_noCompressBlock() will return dstSize_tooSmall if we are really out of
+ * output space.
+ * * cSize >= blockBound(srcSize): We have expanded the block too much so
+ * emit an uncompressed block.
+ */
+ {
+ size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
+ if (cSize != ERROR(dstSize_tooSmall)) {
+ size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");
+ if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
+ ZSTD_confirmRepcodesAndEntropyTables(zc);
+ return cSize;
+ }
+ }
+ }
+ }
+
+ DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()");
+ /* Superblock compression failed; attempt to emit a single uncompressed (no-compress) block.
+ * The decoder will be able to stream this block since it is uncompressed.
+ */
+ return ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
+}
+
+static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ U32 lastBlock)
+{
+ size_t cSize = 0;
+ const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
+ DEBUGLOG(5, "ZSTD_compressBlock_targetCBlockSize (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u, srcSize=%zu)",
+ (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate, srcSize);
+ FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
+
+ cSize = ZSTD_compressBlock_targetCBlockSize_body(zc, dst, dstCapacity, src, srcSize, bss, lastBlock);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize_body failed");
+
+ if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+ zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
+ return cSize;
+}
+
+static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
+ ZSTD_cwksp* ws,
+ ZSTD_CCtx_params const* params,
+ void const* ip,
+ void const* iend)
+{
+ if (ZSTD_window_needOverflowCorrection(ms->window, iend)) {
+ U32 const maxDist = (U32)1 << params->cParams.windowLog;
+ U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy);
+ U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip);
+ ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+ ZSTD_cwksp_mark_tables_dirty(ws);
+ ZSTD_reduceIndex(ms, params, correction);
+ ZSTD_cwksp_mark_tables_clean(ws);
+ if (ms->nextToUpdate < correction) ms->nextToUpdate = 0;
+ else ms->nextToUpdate -= correction;
+ /* invalidate dictionaries on overflow correction */
+ ms->loadedDictEnd = 0;
+ ms->dictMatchState = NULL;
+ }
+}
+
+/*! ZSTD_compress_frameChunk() :
+* Compress a chunk of data into one or multiple blocks.
+* All blocks will be terminated, all input will be consumed.
+* The function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
+* The frame is supposed to be already started (header already produced).
+* @return : compressed size, or an error code
+*/
+static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ U32 lastFrameChunk)
+{
+ size_t blockSize = cctx->blockSize;
+ size_t remaining = srcSize;
+ const BYTE* ip = (const BYTE*)src;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* op = ostart;
+ U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog;
+
+ assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX);
+
+ DEBUGLOG(5, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize);
+ if (cctx->appliedParams.fParams.checksumFlag && srcSize)
+ XXH64_update(&cctx->xxhState, src, srcSize);
+
+ while (remaining) {
+ ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
+ U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
+
+ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE,
+ dstSize_tooSmall,
+ "not enough space to store compressed block");
+ if (remaining < blockSize) blockSize = remaining;
+
+ ZSTD_overflowCorrectIfNeeded(
+ ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize);
+ ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);
+
+ /* Ensure hash/chain table insertion resumes no sooner than lowLimit */
+ if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit;
+
+ { size_t cSize;
+ if (ZSTD_useTargetCBlockSize(&cctx->appliedParams)) {
+ cSize = ZSTD_compressBlock_targetCBlockSize(cctx, op, dstCapacity, ip, blockSize, lastBlock);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed");
+ assert(cSize > 0);
+ assert(cSize <= blockSize + ZSTD_blockHeaderSize);
+ } else {
+ cSize = ZSTD_compressBlock_internal(cctx,
+ op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize,
+ ip, blockSize, 1 /* frame */);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_internal failed");
+
+ if (cSize == 0) { /* block is not compressible */
+ cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
+ FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
+ } else {
+ U32 const cBlockHeader = cSize == 1 ?
+ lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) :
+ lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+ MEM_writeLE24(op, cBlockHeader);
+ cSize += ZSTD_blockHeaderSize;
+ }
+ }
+
+
+ ip += blockSize;
+ assert(remaining >= blockSize);
+ remaining -= blockSize;
+ op += cSize;
+ assert(dstCapacity >= cSize);
+ dstCapacity -= cSize;
+ cctx->isFirstBlock = 0;
+ DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u",
+ (unsigned)cSize);
+ } }
+
+ if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending;
+ return (size_t)(op-ostart);
+}
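+
+/* Block header layout, as assembled above :
+ *   cBlockHeader = lastBlock + (blockType << 1) + (blockSize_or_cSize << 3)
+ * written as 3 little-endian bytes via MEM_writeLE24().
+ * Worked example (non-last compressed block, cSize = 100) :
+ *   0 + (bt_compressed<<1) + (100<<3) = 804 = 0x000324  =>  bytes 24 03 00
+ */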
+
+
+static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
+ const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID)
+{ BYTE* const op = (BYTE*)dst;
+ U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */
+ U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */
+ U32 const checksumFlag = params->fParams.checksumFlag>0;
+ U32 const windowSize = (U32)1 << params->cParams.windowLog;
+ U32 const singleSegment = params->fParams.contentSizeFlag && (windowSize >= pledgedSrcSize);
+ BYTE const windowLogByte = (BYTE)((params->cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3);
+ U32 const fcsCode = params->fParams.contentSizeFlag ?
+ (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : 0; /* 0-3 */
+ BYTE const frameHeaderDescriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) );
+ size_t pos=0;
+
+ assert(!(params->fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN));
+ RETURN_ERROR_IF(dstCapacity < ZSTD_FRAMEHEADERSIZE_MAX, dstSize_tooSmall,
+ "dst buf is too small to fit worst-case frame header size.");
+ DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u",
+ !params->fParams.noDictIDFlag, (unsigned)dictID, (unsigned)dictIDSizeCode);
+
+ if (params->format == ZSTD_f_zstd1) {
+ MEM_writeLE32(dst, ZSTD_MAGICNUMBER);
+ pos = 4;
+ }
+ op[pos++] = frameHeaderDescriptionByte;
+ if (!singleSegment) op[pos++] = windowLogByte;
+ switch(dictIDSizeCode)
+ {
+ default: assert(0); /* impossible */
+ case 0 : break;
+ case 1 : op[pos] = (BYTE)(dictID); pos++; break;
+ case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break;
+ case 3 : MEM_writeLE32(op+pos, dictID); pos+=4; break;
+ }
+ switch(fcsCode)
+ {
+ default: assert(0); /* impossible */
+ case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break;
+ case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break;
+ case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break;
+ case 3 : MEM_writeLE64(op+pos, (U64)(pledgedSrcSize)); pos+=8; break;
+ }
+ return pos;
+}
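+
+/* Frame_Header_Descriptor byte, as assembled above :
+ *   bits 0-1 : dictIDSizeCode   (0-3, size of the dictID field)
+ *   bit  2   : checksumFlag
+ *   bit  5   : singleSegment    (no separate windowLog byte when set)
+ *   bits 6-7 : fcsCode          (0-3, size of the frame content size field)
+ * e.g. checksum enabled, no dictID, not single-segment, fcsCode==2 :
+ *   0 + (1<<2) + (0<<5) + (2<<6) = 0x84
+ */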
+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ * or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity)
+{
+ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall,
+ "dst buf is too small to write frame trailer empty block.");
+ { U32 const cBlockHeader24 = 1 /*lastBlock*/ + (((U32)bt_raw)<<1); /* 0 size */
+ MEM_writeLE24(dst, cBlockHeader24);
+ return ZSTD_blockHeaderSize;
+ }
+}
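+
+/* Worked example : with bt_raw==0 and a zero size field, cBlockHeader24 == 1,
+ * so the three bytes emitted by MEM_writeLE24() are 01 00 00
+ * (last-block bit set, raw block type, zero length). */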
+
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
+{
+ RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong,
+ "wrong cctx stage");
+ RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm,
+ parameter_unsupported,
+ "incompatible with ldm");
+ cctx->externSeqStore.seq = seq;
+ cctx->externSeqStore.size = nbSeq;
+ cctx->externSeqStore.capacity = nbSeq;
+ cctx->externSeqStore.pos = 0;
+ return 0;
+}
+
+
+static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ U32 frame, U32 lastFrameChunk)
+{
+ ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
+ size_t fhSize = 0;
+
+ DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u",
+ cctx->stage, (unsigned)srcSize);
+ RETURN_ERROR_IF(cctx->stage==ZSTDcs_created, stage_wrong,
+ "missing init (ZSTD_compressBegin)");
+
+ if (frame && (cctx->stage==ZSTDcs_init)) {
+ fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams,
+ cctx->pledgedSrcSizePlusOne-1, cctx->dictID);
+ FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
+ assert(fhSize <= dstCapacity);
+ dstCapacity -= fhSize;
+ dst = (char*)dst + fhSize;
+ cctx->stage = ZSTDcs_ongoing;
+ }
+
+ if (!srcSize) return fhSize; /* do not generate an empty block if no input */
+
+ if (!ZSTD_window_update(&ms->window, src, srcSize)) {
+ ms->nextToUpdate = ms->window.dictLimit;
+ }
+ if (cctx->appliedParams.ldmParams.enableLdm) {
+ ZSTD_window_update(&cctx->ldmState.window, src, srcSize);
+ }
+
+ if (!frame) {
+ /* overflow check and correction for block mode */
+ ZSTD_overflowCorrectIfNeeded(
+ ms, &cctx->workspace, &cctx->appliedParams,
+ src, (BYTE const*)src + srcSize);
+ }
+
+ DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize);
+ { size_t const cSize = frame ?
+ ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) :
+ ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */);
+ FORWARD_IF_ERROR(cSize, "%s", frame ? "ZSTD_compress_frameChunk failed" : "ZSTD_compressBlock_internal failed");
+ cctx->consumedSrcSize += srcSize;
+ cctx->producedCSize += (cSize + fhSize);
+ assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
+ if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */
+ ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
+ RETURN_ERROR_IF(
+ cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne,
+ srcSize_wrong,
+ "error : pledgedSrcSize = %u, while realSrcSize >= %u",
+ (unsigned)cctx->pledgedSrcSizePlusOne-1,
+ (unsigned)cctx->consumedSrcSize);
+ }
+ return cSize + fhSize;
+ }
+}
+
+size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize);
+ return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */);
+}
+
+
+size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+{
+ ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams;
+ assert(!ZSTD_checkCParams(cParams));
+ return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog);
+}
+
+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize);
+ { size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
+ RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); }
+
+ return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
+}
+
+/*! ZSTD_loadDictionaryContent() :
+ * @return : 0, or an error code
+ */
+static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+ ldmState_t* ls,
+ ZSTD_cwksp* ws,
+ ZSTD_CCtx_params const* params,
+ const void* src, size_t srcSize,
+ ZSTD_dictTableLoadMethod_e dtlm)
+{
+ const BYTE* ip = (const BYTE*) src;
+ const BYTE* const iend = ip + srcSize;
+
+ ZSTD_window_update(&ms->window, src, srcSize);
+ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
+
+ if (params->ldmParams.enableLdm && ls != NULL) {
+ ZSTD_window_update(&ls->window, src, srcSize);
+ ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
+ }
+
+ /* Assert that the ms params match the params we're being given */
+ ZSTD_assertEqualCParams(params->cParams, ms->cParams);
+
+ if (srcSize <= HASH_READ_SIZE) return 0;
+
+ while (iend - ip > HASH_READ_SIZE) {
+ size_t const remaining = (size_t)(iend - ip);
+ size_t const chunk = MIN(remaining, ZSTD_CHUNKSIZE_MAX);
+ const BYTE* const ichunk = ip + chunk;
+
+ ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk);
+
+ if (params->ldmParams.enableLdm && ls != NULL)
+ ZSTD_ldm_fillHashTable(ls, (const BYTE*)src, (const BYTE*)src + srcSize, &params->ldmParams);
+
+ switch(params->cParams.strategy)
+ {
+ case ZSTD_fast:
+ ZSTD_fillHashTable(ms, ichunk, dtlm);
+ break;
+ case ZSTD_dfast:
+ ZSTD_fillDoubleHashTable(ms, ichunk, dtlm);
+ break;
+
+ case ZSTD_greedy:
+ case ZSTD_lazy:
+ case ZSTD_lazy2:
+ if (chunk >= HASH_READ_SIZE)
+ ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE);
+ break;
+
+ case ZSTD_btlazy2: /* we want the dictionary table fully sorted */
+ case ZSTD_btopt:
+ case ZSTD_btultra:
+ case ZSTD_btultra2:
+ if (chunk >= HASH_READ_SIZE)
+ ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk);
+ break;
+
+ default:
+ assert(0); /* not possible : not a valid strategy id */
+ }
+
+ ip = ichunk;
+ }
+
+ ms->nextToUpdate = (U32)(iend - ms->window.base);
+ return 0;
+}
+
+
+/* Dictionaries that assign zero probability to symbols that show up cause problems
+ during FSE encoding. Refuse dictionaries that assign zero probability to symbols
+ that we may encounter during compression.
+ NOTE: This behavior is not standard and could be improved in the future. */
+static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) {
+ U32 s;
+ RETURN_ERROR_IF(dictMaxSymbolValue < maxSymbolValue, dictionary_corrupted, "dict fse tables don't have all symbols");
+ for (s = 0; s <= maxSymbolValue; ++s) {
+ RETURN_ERROR_IF(normalizedCounter[s] == 0, dictionary_corrupted, "dict fse tables don't have all symbols");
+ }
+ return 0;
+}
+
+size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
+ short* offcodeNCount, unsigned* offcodeMaxValue,
+ const void* const dict, size_t dictSize)
+{
+ const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */
+ const BYTE* const dictEnd = dictPtr + dictSize;
+ dictPtr += 8;
+ bs->entropy.huf.repeatMode = HUF_repeat_check;
+
+ { unsigned maxSymbolValue = 255;
+ unsigned hasZeroWeights = 1;
+ size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr,
+ dictEnd-dictPtr, &hasZeroWeights);
+
+ /* We only set the loaded table as valid if it contains all non-zero
+ * weights. Otherwise, we leave it in "check" mode. */
+ if (!hasZeroWeights)
+ bs->entropy.huf.repeatMode = HUF_repeat_valid;
+
+ RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, "");
+ dictPtr += hufHeaderSize;
+ }
+
+ { unsigned offcodeLog;
+ size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
+ /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
+ /* fill all offset symbols to avoid garbage at end of table */
+ RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
+ bs->entropy.fse.offcodeCTable,
+ offcodeNCount, MaxOff, offcodeLog,
+ workspace, HUF_WORKSPACE_SIZE)),
+ dictionary_corrupted, "");
+ dictPtr += offcodeHeaderSize;
+ }
+
+ { short matchlengthNCount[MaxML+1];
+ unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+ size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
+ /* Every match length code must have non-zero probability */
+ FORWARD_IF_ERROR( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML), "");
+ RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
+ bs->entropy.fse.matchlengthCTable,
+ matchlengthNCount, matchlengthMaxValue, matchlengthLog,
+ workspace, HUF_WORKSPACE_SIZE)),
+ dictionary_corrupted, "");
+ dictPtr += matchlengthHeaderSize;
+ }
+
+ { short litlengthNCount[MaxLL+1];
+ unsigned litlengthMaxValue = MaxLL, litlengthLog;
+ size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
+ /* Every literal length code must have non-zero probability */
+ FORWARD_IF_ERROR( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL), "");
+ RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
+ bs->entropy.fse.litlengthCTable,
+ litlengthNCount, litlengthMaxValue, litlengthLog,
+ workspace, HUF_WORKSPACE_SIZE)),
+ dictionary_corrupted, "");
+ dictPtr += litlengthHeaderSize;
+ }
+
+ RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, "");
+ bs->rep[0] = MEM_readLE32(dictPtr+0);
+ bs->rep[1] = MEM_readLE32(dictPtr+4);
+ bs->rep[2] = MEM_readLE32(dictPtr+8);
+ dictPtr += 12;
+
+ return dictPtr - (const BYTE*)dict;
+}
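+
+/* Layout consumed by ZSTD_loadCEntropy() above, after the 8 bytes of magic
+ * number + dictID are skipped :
+ *   [ Huffman literals table (HUF_readCTable) ]
+ *   [ FSE offcode table header ]
+ *   [ FSE matchlength table header ]
+ *   [ FSE litlength table header ]
+ *   [ 3 x 4-byte little-endian repcodes ]
+ * Whatever follows is the raw dictionary content, loaded separately by
+ * ZSTD_loadZstdDictionary() below. */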
+
+/* Dictionary format :
+ * See :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+ */
+/*! ZSTD_loadZstdDictionary() :
+ * @return : dictID, or an error code
+ * assumptions : magic number is supposed to be already checked
+ * dictSize is supposed to be >= 8
+ */
+static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
+ ZSTD_matchState_t* ms,
+ ZSTD_cwksp* ws,
+ ZSTD_CCtx_params const* params,
+ const void* dict, size_t dictSize,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ void* workspace)
+{
+ const BYTE* dictPtr = (const BYTE*)dict;
+ const BYTE* const dictEnd = dictPtr + dictSize;
+ short offcodeNCount[MaxOff+1];
+ unsigned offcodeMaxValue = MaxOff;
+ size_t dictID;
+ size_t eSize;
+
+ ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+ assert(dictSize >= 8);
+ assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
+
+ dictID = params->fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr + 4 /* skip magic number */ );
+ eSize = ZSTD_loadCEntropy(bs, workspace, offcodeNCount, &offcodeMaxValue, dict, dictSize);
+ FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed");
+ dictPtr += eSize;
+
+ { size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
+ U32 offcodeMax = MaxOff;
+ if (dictContentSize <= ((U32)-1) - 128 KB) {
+ U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
+ offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
+ }
+ /* All offset values <= dictContentSize + 128 KB must be representable */
+ FORWARD_IF_ERROR(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)), "");
+ /* All repCodes must be <= dictContentSize and != 0*/
+ { U32 u;
+ for (u=0; u<3; u++) {
+ RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
+ RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
+ } }
+
+ bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid;
+ bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid;
+ bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid;
+ FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
+ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
+ return dictID;
+ }
+}
+
+/** ZSTD_compress_insertDictionary() :
+* @return : dictID, or an error code */
+static size_t
+ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+ ZSTD_matchState_t* ms,
+ ldmState_t* ls,
+ ZSTD_cwksp* ws,
+ const ZSTD_CCtx_params* params,
+ const void* dict, size_t dictSize,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ void* workspace)
+{
+ DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
+ if ((dict==NULL) || (dictSize<8)) {
+ RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
+ return 0;
+ }
+
+ ZSTD_reset_compressedBlockState(bs);
+
+ /* dict restricted modes */
+ if (dictContentType == ZSTD_dct_rawContent)
+ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm);
+
+ if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
+ if (dictContentType == ZSTD_dct_auto) {
+ DEBUGLOG(4, "raw content dictionary detected");
+ return ZSTD_loadDictionaryContent(
+ ms, ls, ws, params, dict, dictSize, dtlm);
+ }
+ RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
+ assert(0); /* impossible */
+ }
+
+ /* dict as full zstd dictionary */
+ return ZSTD_loadZstdDictionary(
+ bs, ms, ws, params, dict, dictSize, dtlm, workspace);
+}
+
+#define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)
+#define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6)
+
+/*! ZSTD_compressBegin_internal() :
+ * @return : 0, or an error code */
+static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+ const void* dict, size_t dictSize,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params, U64 pledgedSrcSize,
+ ZSTD_buffered_policy_e zbuff)
+{
+ DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog);
+ /* params are supposed to be fully validated at this point */
+ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
+ assert(!((dict) && (cdict))); /* either dict or cdict, not both */
+ if ( (cdict)
+ && (cdict->dictContentSize > 0)
+ && ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF
+ || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER
+ || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+ || cdict->compressionLevel == 0)
+ && (params->attachDictPref != ZSTD_dictForceLoad) ) {
+ return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff);
+ }
+
+ FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, *params, pledgedSrcSize,
+ ZSTDcrp_makeClean, zbuff) , "");
+ { size_t const dictID = cdict ?
+ ZSTD_compress_insertDictionary(
+ cctx->blockState.prevCBlock, &cctx->blockState.matchState,
+ &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
+ cdict->dictContentSize, dictContentType, dtlm,
+ cctx->entropyWorkspace)
+ : ZSTD_compress_insertDictionary(
+ cctx->blockState.prevCBlock, &cctx->blockState.matchState,
+ &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
+ dictContentType, dtlm, cctx->entropyWorkspace);
+ FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
+ assert(dictID <= UINT_MAX);
+ cctx->dictID = (U32)dictID;
+ }
+ return 0;
+}
+
+size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
+ const void* dict, size_t dictSize,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params,
+ unsigned long long pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params->cParams.windowLog);
+ /* compression parameters verification and optimization */
+ FORWARD_IF_ERROR( ZSTD_checkCParams(params->cParams) , "");
+ return ZSTD_compressBegin_internal(cctx,
+ dict, dictSize, dictContentType, dtlm,
+ cdict,
+ params, pledgedSrcSize,
+ ZSTDb_not_buffered);
+}
+
+/*! ZSTD_compressBegin_advanced() :
+* @return : 0, or an error code */
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
+ const void* dict, size_t dictSize,
+ ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+ ZSTD_CCtx_params const cctxParams =
+ ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params);
+ return ZSTD_compressBegin_advanced_internal(cctx,
+ dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast,
+ NULL /*cdict*/,
+ &cctxParams, pledgedSrcSize);
+}
+
+size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+{
+ ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
+ ZSTD_CCtx_params const cctxParams =
+ ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params);
+ DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize);
+ return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
+ &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
+}
+
+size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel)
+{
+ return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel);
+}
+
+
+/*! ZSTD_writeEpilogue() :
+* Ends a frame.
+* @return : nb of bytes written into dst (or an error code) */
+static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
+{
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* op = ostart;
+ size_t fhSize = 0;
+
+ DEBUGLOG(4, "ZSTD_writeEpilogue");
+ RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing");
+
+ /* special case : empty frame */
+ if (cctx->stage == ZSTDcs_init) {
+ fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0);
+ FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
+ dstCapacity -= fhSize;
+ op += fhSize;
+ cctx->stage = ZSTDcs_ongoing;
+ }
+
+ if (cctx->stage != ZSTDcs_ending) {
+ /* write one last empty block, make it the "last" block */
+ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0;
+ RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue");
+ MEM_writeLE32(op, cBlockHeader24);
+ op += ZSTD_blockHeaderSize;
+ dstCapacity -= ZSTD_blockHeaderSize;
+ }
+
+ if (cctx->appliedParams.fParams.checksumFlag) {
+ U32 const checksum = (U32) XXH64_digest(&cctx->xxhState);
+ RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum");
+ DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", (unsigned)checksum);
+ MEM_writeLE32(op, checksum);
+ op += 4;
+ }
+
+ cctx->stage = ZSTDcs_created; /* return to "created but no init" status */
+ return op-ostart;
+}
+
+size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ size_t endResult;
+ size_t const cSize = ZSTD_compressContinue_internal(cctx,
+ dst, dstCapacity, src, srcSize,
+ 1 /* frame mode */, 1 /* last chunk */);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressContinue_internal failed");
+ endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize);
+ FORWARD_IF_ERROR(endResult, "ZSTD_writeEpilogue failed");
+ assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
+ if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */
+ ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
+ DEBUGLOG(4, "end of frame : controlling src size");
+ RETURN_ERROR_IF(
+ cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1,
+ srcSize_wrong,
+ "error : pledgedSrcSize = %u, while realSrcSize = %u",
+ (unsigned)cctx->pledgedSrcSizePlusOne-1,
+ (unsigned)cctx->consumedSrcSize);
+ }
+ return cSize + endResult;
+}
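+
+/* Minimal sketch of a buffer-less compression round built from the functions
+ * above (error checks and the exact buffer-size contract from zstd.h are
+ * omitted; `chunk`/`chunkSize` are placeholder names) :
+ *
+ *   ZSTD_compressBegin(cctx, compressionLevel);
+ *   while (has_more_chunks)
+ *       dstPos += ZSTD_compressContinue(cctx, dst+dstPos, dstCap-dstPos, chunk, chunkSize);
+ *   dstPos += ZSTD_compressEnd(cctx, dst+dstPos, dstCap-dstPos, lastChunk, lastChunkSize);
+ */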
+
+
+static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ const ZSTD_parameters* params)
+{
+ ZSTD_CCtx_params const cctxParams =
+ ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params);
+ DEBUGLOG(4, "ZSTD_compress_internal");
+ return ZSTD_compress_advanced_internal(cctx,
+ dst, dstCapacity,
+ src, srcSize,
+ dict, dictSize,
+ &cctxParams);
+}
+
+size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ ZSTD_parameters params)
+{
+ DEBUGLOG(4, "ZSTD_compress_advanced");
+ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
+ return ZSTD_compress_internal(cctx,
+ dst, dstCapacity,
+ src, srcSize,
+ dict, dictSize,
+ &params);
+}
+
+/* Internal */
+size_t ZSTD_compress_advanced_internal(
+ ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ const ZSTD_CCtx_params* params)
+{
+ DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (unsigned)srcSize);
+ FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
+ dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
+ params, srcSize, ZSTDb_not_buffered) , "");
+ return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
+size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict, size_t dictSize,
+ int compressionLevel)
+{
+ ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0);
+ ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params);
+ DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize);
+ assert(params.fParams.contentSizeFlag == 1);
+ return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctxParams);
+}
+
+size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel)
+{
+ DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (unsigned)srcSize);
+ assert(cctx != NULL);
+ return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel);
+}
+
+size_t ZSTD_compress(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel)
+{
+ size_t result;
+ ZSTD_CCtx ctxBody;
+ ZSTD_initCCtx(&ctxBody, ZSTD_defaultCMem);
+ result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel);
+ ZSTD_freeCCtxContent(&ctxBody); /* can't free ctxBody itself, as it's on stack; free only heap content */
+ return result;
+}
+
+
+/* ===== Dictionary API ===== */
+
+/*! ZSTD_estimateCDictSize_advanced() :
+ * Estimate the amount of memory that will be needed to create a dictionary with the following arguments */
+size_t ZSTD_estimateCDictSize_advanced(
+ size_t dictSize, ZSTD_compressionParameters cParams,
+ ZSTD_dictLoadMethod_e dictLoadMethod)
+{
+ DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict));
+ return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict))
+ + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE)
+ + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0)
+ + (dictLoadMethod == ZSTD_dlm_byRef ? 0
+ : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *))));
+}
+
+size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel)
+{
+ ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
+ return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy);
+}
+
+size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict)
+{
+ if (cdict==NULL) return 0; /* support sizeof on NULL */
+ DEBUGLOG(5, "sizeof(*cdict) : %u", (unsigned)sizeof(*cdict));
+ /* cdict may be in the workspace */
+ return (cdict->workspace.workspace == cdict ? 0 : sizeof(*cdict))
+ + ZSTD_cwksp_sizeof(&cdict->workspace);
+}
+
+static size_t ZSTD_initCDict_internal(
+ ZSTD_CDict* cdict,
+ const void* dictBuffer, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams)
+{
+ DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (unsigned)dictContentType);
+ assert(!ZSTD_checkCParams(cParams));
+ cdict->matchState.cParams = cParams;
+ if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) {
+ cdict->dictContent = dictBuffer;
+ } else {
+ void *internalBuffer = ZSTD_cwksp_reserve_object(&cdict->workspace, ZSTD_cwksp_align(dictSize, sizeof(void*)));
+ RETURN_ERROR_IF(!internalBuffer, memory_allocation, "NULL pointer!");
+ cdict->dictContent = internalBuffer;
+ memcpy(internalBuffer, dictBuffer, dictSize);
+ }
+ cdict->dictContentSize = dictSize;
+
+ cdict->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cdict->workspace, HUF_WORKSPACE_SIZE);
+
+
+ /* Reset the state to no dictionary */
+ ZSTD_reset_compressedBlockState(&cdict->cBlockState);
+ FORWARD_IF_ERROR(ZSTD_reset_matchState(
+ &cdict->matchState,
+ &cdict->workspace,
+ &cParams,
+ ZSTDcrp_makeClean,
+ ZSTDirp_reset,
+ ZSTD_resetTarget_CDict), "");
+ /* (Maybe) load the dictionary
+ * Skips loading the dictionary if it is < 8 bytes.
+ */
+ { ZSTD_CCtx_params params;
+ memset(&params, 0, sizeof(params));
+ params.compressionLevel = ZSTD_CLEVEL_DEFAULT;
+ params.fParams.contentSizeFlag = 1;
+ params.cParams = cParams;
+ { size_t const dictID = ZSTD_compress_insertDictionary(
+ &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
+ &params, cdict->dictContent, cdict->dictContentSize,
+ dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace);
+ FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
+ assert(dictID <= (size_t)(U32)-1);
+ cdict->dictID = (U32)dictID;
+ }
+ }
+
+ return 0;
+}
+
+ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams, ZSTD_customMem customMem)
+{
+ DEBUGLOG(3, "ZSTD_createCDict_advanced, mode %u", (unsigned)dictContentType);
+ if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+
+ { size_t const workspaceSize =
+ ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) +
+ ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) +
+ ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) +
+ (dictLoadMethod == ZSTD_dlm_byRef ? 0
+ : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*))));
+ void* const workspace = ZSTD_malloc(workspaceSize, customMem);
+ ZSTD_cwksp ws;
+ ZSTD_CDict* cdict;
+
+ if (!workspace) {
+ ZSTD_free(workspace, customMem);
+ return NULL;
+ }
+
+ ZSTD_cwksp_init(&ws, workspace, workspaceSize);
+
+ cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict));
+ assert(cdict != NULL);
+ ZSTD_cwksp_move(&cdict->workspace, &ws);
+ cdict->customMem = customMem;
+ cdict->compressionLevel = 0; /* signals advanced API usage */
+
+ if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
+ dictBuffer, dictSize,
+ dictLoadMethod, dictContentType,
+ cParams) )) {
+ ZSTD_freeCDict(cdict);
+ return NULL;
+ }
+
+ return cdict;
+ }
+}
+
+ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel)
+{
+ ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
+ ZSTD_CDict* cdict = ZSTD_createCDict_advanced(dict, dictSize,
+ ZSTD_dlm_byCopy, ZSTD_dct_auto,
+ cParams, ZSTD_defaultCMem);
+ if (cdict)
+ cdict->compressionLevel = compressionLevel == 0 ? ZSTD_CLEVEL_DEFAULT : compressionLevel;
+ return cdict;
+}
+
+ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel)
+{
+ ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
+ return ZSTD_createCDict_advanced(dict, dictSize,
+ ZSTD_dlm_byRef, ZSTD_dct_auto,
+ cParams, ZSTD_defaultCMem);
+}
+
+size_t ZSTD_freeCDict(ZSTD_CDict* cdict)
+{
+ if (cdict==NULL) return 0; /* support free on NULL */
+ { ZSTD_customMem const cMem = cdict->customMem;
+ int cdictInWorkspace = ZSTD_cwksp_owns_buffer(&cdict->workspace, cdict);
+ ZSTD_cwksp_free(&cdict->workspace, cMem);
+ if (!cdictInWorkspace) {
+ ZSTD_free(cdict, cMem);
+ }
+ return 0;
+ }
+}
+
+/*! ZSTD_initStaticCDict() :
+ * Generate a digested dictionary in the provided memory area.
+ * workspace: The memory area to emplace the dictionary into.
+ * The provided pointer must be 8-byte aligned.
+ * It must outlive dictionary usage.
+ * workspaceSize: Use ZSTD_estimateCDictSize()
+ * to determine how large the workspace must be.
+ * cParams : use ZSTD_getCParams() to transform a compression level
+ * into its relevant cParams.
+ * @return : pointer to the ZSTD_CDict, or NULL if error (size too small)
+ * Note : there is no corresponding "free" function.
+ * Since the workspace was allocated externally, it must be freed externally.
+ */
+const ZSTD_CDict* ZSTD_initStaticCDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams)
+{
+ size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0);
+ size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict))
+ + (dictLoadMethod == ZSTD_dlm_byRef ? 0
+ : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*))))
+ + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE)
+ + matchStateSize;
+ ZSTD_CDict* cdict;
+
+ if ((size_t)workspace & 7) return NULL; /* 8-aligned */
+
+ {
+ ZSTD_cwksp ws;
+ ZSTD_cwksp_init(&ws, workspace, workspaceSize);
+ cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict));
+ if (cdict == NULL) return NULL;
+ ZSTD_cwksp_move(&cdict->workspace, &ws);
+ }
+
+ DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u",
+ (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize));
+ if (workspaceSize < neededSize) return NULL;
+
+ if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
+ dict, dictSize,
+ dictLoadMethod, dictContentType,
+ cParams) ))
+ return NULL;
+
+ return cdict;
+}
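+
+/* Illustrative static-CDict setup sketch (assumes a large-enough, 8-byte
+ * aligned `wksp` buffer; names and sizes are placeholders, error checks
+ * omitted) :
+ *
+ *   size_t const wkspSize = ZSTD_estimateCDictSize(dictLen, compressionLevel);
+ *   const ZSTD_CDict* const cdict = ZSTD_initStaticCDict(wksp, wkspSize,
+ *                         dictBuf, dictLen, ZSTD_dlm_byCopy, ZSTD_dct_auto,
+ *                         ZSTD_getCParams(compressionLevel, 0, dictLen));
+ *   // no free needed : the caller owns `wksp`
+ */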
+
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict)
+{
+ assert(cdict != NULL);
+ return cdict->matchState.cParams;
+}
+
+/* ZSTD_compressBegin_usingCDict_advanced() :
+ * cdict must be != NULL */
+size_t ZSTD_compressBegin_usingCDict_advanced(
+ ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict,
+ ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced");
+ RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!");
+ { ZSTD_CCtx_params params = cctx->requestedParams;
+ params.cParams = ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF
+ || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER
+ || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+ || cdict->compressionLevel == 0 )
+ && (params.attachDictPref != ZSTD_dictForceLoad) ?
+ ZSTD_getCParamsFromCDict(cdict)
+ : ZSTD_getCParams(cdict->compressionLevel,
+ pledgedSrcSize,
+ cdict->dictContentSize);
+ /* Increase window log to fit the entire dictionary and source if the
+ * source size is known. Limit the increase to 19, which is the
+ * window log for compression level 1 with the largest source size.
+ */
+ if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+ U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19);
+ U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1;
+ params.cParams.windowLog = MAX(params.cParams.windowLog, limitedSrcLog);
+ }
+ params.fParams = fParams;
+ return ZSTD_compressBegin_internal(cctx,
+ NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast,
+ cdict,
+ &params, pledgedSrcSize,
+ ZSTDb_not_buffered);
+ }
+}
+
+/* ZSTD_compressBegin_usingCDict() :
+ * pledgedSrcSize=0 means "unknown"
+ * if pledgedSrcSize>0, it will enable contentSizeFlag */
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+ ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+ DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag);
+ return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
+{
+ FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */
+ return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
+/*! ZSTD_compress_usingCDict() :
+ * Compression using a digested Dictionary.
+ * Faster startup than ZSTD_compress_usingDict(), recommended when the same dictionary is used multiple times.
+ * Note that compression parameters are decided at CDict creation time
+ * while frame parameters are hardcoded */
+size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict)
+{
+ ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+ return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams);
+}
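+
+/* Minimal CDict compression sketch using the functions above (error handling
+ * omitted; `dictBuf`/`dictLen` are placeholder names) :
+ *
+ *   ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictLen, compressionLevel);
+ *   size_t const cSize = ZSTD_compress_usingCDict(cctx, dst, dstCap, src, srcSize, cdict);
+ *   ZSTD_freeCDict(cdict);
+ */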
+
+
+
+/* ******************************************************************
+* Streaming
+********************************************************************/
+
+ZSTD_CStream* ZSTD_createCStream(void)
+{
+ DEBUGLOG(3, "ZSTD_createCStream");
+ return ZSTD_createCStream_advanced(ZSTD_defaultCMem);
+}
+
+ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize)
+{
+ return ZSTD_initStaticCCtx(workspace, workspaceSize);
+}
+
+ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem)
+{ /* CStream and CCtx are now the same object */
+ return ZSTD_createCCtx_advanced(customMem);
+}
+
+size_t ZSTD_freeCStream(ZSTD_CStream* zcs)
+{
+ return ZSTD_freeCCtx(zcs); /* same object */
+}
+
+
+
+/*====== Initialization ======*/
+
+size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX; }
+
+size_t ZSTD_CStreamOutSize(void)
+{
+ return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ;
+}
+
+static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx,
+ const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType,
+ const ZSTD_CDict* const cdict,
+ ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_resetCStream_internal");
+ /* Finalize the compression parameters */
+ params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, dictSize);
+ /* params are supposed to be fully validated at this point */
+ assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+ assert(!((dict) && (cdict))); /* either dict or cdict, not both */
+
+ FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
+ dict, dictSize, dictContentType, ZSTD_dtlm_fast,
+ cdict,
+ &params, pledgedSrcSize,
+ ZSTDb_buffered) , "");
+
+ cctx->inToCompress = 0;
+ cctx->inBuffPos = 0;
+ cctx->inBuffTarget = cctx->blockSize
+ + (cctx->blockSize == pledgedSrcSize); /* for small input: avoid automatic flush on reaching end of block, since it would require adding a 3-byte null block to end the frame */
+ cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0;
+ cctx->streamStage = zcss_load;
+ cctx->frameEnded = 0;
+ return 0; /* ready to go */
+}
+
+/* ZSTD_resetCStream():
+ * pledgedSrcSize == 0 means "unknown" */
+size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss)
+{
+ /* temporary : 0 interpreted as "unknown" during transition period.
+ * Users wishing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN.
+ * 0 will be interpreted as "empty" in the future.
+ */
+ U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;
+ DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (unsigned)pledgedSrcSize);
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+ return 0;
+}
+
+/*! ZSTD_initCStream_internal() :
+ * Note : for lib/compress only. Used by zstdmt_compress.c.
+ * Assumption 1 : params are valid
+ * Assumption 2 : either dict, or cdict, is defined, not both */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize, const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params,
+ unsigned long long pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_initCStream_internal");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
+ zcs->requestedParams = *params;
+ assert(!((dict) && (cdict))); /* either dict or cdict, not both */
+ if (dict) {
+ FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , "");
+ } else {
+ /* Dictionary is cleared if !cdict */
+ FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , "");
+ }
+ return 0;
+}
+
+/* ZSTD_initCStream_usingCDict_advanced() :
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters */
+size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fParams,
+ unsigned long long pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+ zcs->requestedParams.fParams = fParams;
+ FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , "");
+ return 0;
+}
+
+/* note : cdict must outlive compression session */
+size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict)
+{
+ DEBUGLOG(4, "ZSTD_initCStream_usingCDict");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , "");
+ return 0;
+}
+
+
+/* ZSTD_initCStream_advanced() :
+ * pledgedSrcSize must be exact.
+ * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * dict is loaded with default parameters ZSTD_dct_auto and ZSTD_dlm_byCopy. */
+size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ ZSTD_parameters params, unsigned long long pss)
+{
+ /* for compatibility with older programs relying on this behavior.
+ * Users should now specify ZSTD_CONTENTSIZE_UNKNOWN.
+ * This line will be removed in the future.
+ */
+ U64 const pledgedSrcSize = (pss==0 && params.fParams.contentSizeFlag==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;
+ DEBUGLOG(4, "ZSTD_initCStream_advanced");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+ FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , "");
+ zcs->requestedParams = ZSTD_assignParamsToCCtxParams(&zcs->requestedParams, &params);
+ FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , "");
+ return 0;
+}
+
+size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel)
+{
+ DEBUGLOG(4, "ZSTD_initCStream_usingDict");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , "");
+ return 0;
+}
+
+size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss)
+{
+ /* temporary : 0 interpreted as "unknown" during transition period.
+ * Users wishing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN.
+ * 0 will be interpreted as "empty" in the future.
+ */
+ U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;
+ DEBUGLOG(4, "ZSTD_initCStream_srcSize");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+ return 0;
+}
+
+size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
+{
+ DEBUGLOG(4, "ZSTD_initCStream");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , "");
+ return 0;
+}
+
+/*====== Compression ======*/
+
+static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx)
+{
+ size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
+ if (hintInSize==0) hintInSize = cctx->blockSize;
+ return hintInSize;
+}
+
+/** ZSTD_compressStream_generic():
+ * internal function for all *compressStream*() variants
+ * declared static : only reachable through the *compressStream*() entry points in this file
+ * @return : hint size for next input */
+static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective const flushMode)
+{
+ const char* const istart = (const char*)input->src;
+ const char* const iend = input->size != 0 ? istart + input->size : istart;
+ const char* ip = input->pos != 0 ? istart + input->pos : istart;
+ char* const ostart = (char*)output->dst;
+ char* const oend = output->size != 0 ? ostart + output->size : ostart;
+ char* op = output->pos != 0 ? ostart + output->pos : ostart;
+ U32 someMoreWork = 1;
+
+ /* check expectations */
+ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode);
+ assert(zcs->inBuff != NULL);
+ assert(zcs->inBuffSize > 0);
+ assert(zcs->outBuff != NULL);
+ assert(zcs->outBuffSize > 0);
+ assert(output->pos <= output->size);
+ assert(input->pos <= input->size);
+
+ while (someMoreWork) {
+ switch(zcs->streamStage)
+ {
+ case zcss_init:
+ RETURN_ERROR(init_missing, "call ZSTD_initCStream() first!");
+
+ case zcss_load:
+ if ( (flushMode == ZSTD_e_end)
+ && ((size_t)(oend-op) >= ZSTD_compressBound(iend-ip)) /* enough dstCapacity */
+ && (zcs->inBuffPos == 0) ) {
+ /* shortcut to compression pass directly into output buffer */
+ size_t const cSize = ZSTD_compressEnd(zcs,
+ op, oend-op, ip, iend-ip);
+ DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed");
+ ip = iend;
+ op += cSize;
+ zcs->frameEnded = 1;
+ ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ someMoreWork = 0; break;
+ }
+ /* complete loading into inBuffer */
+ { size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos;
+ size_t const loaded = ZSTD_limitCopy(
+ zcs->inBuff + zcs->inBuffPos, toLoad,
+ ip, iend-ip);
+ zcs->inBuffPos += loaded;
+ if (loaded != 0)
+ ip += loaded;
+ if ( (flushMode == ZSTD_e_continue)
+ && (zcs->inBuffPos < zcs->inBuffTarget) ) {
+ /* not enough input to fill full block : stop here */
+ someMoreWork = 0; break;
+ }
+ if ( (flushMode == ZSTD_e_flush)
+ && (zcs->inBuffPos == zcs->inToCompress) ) {
+ /* empty */
+ someMoreWork = 0; break;
+ }
+ }
+ /* compress current block (note : this stage cannot be stopped in the middle) */
+ DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
+ { void* cDst;
+ size_t cSize;
+ size_t const iSize = zcs->inBuffPos - zcs->inToCompress;
+ size_t oSize = oend-op;
+ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend);
+ if (oSize >= ZSTD_compressBound(iSize))
+ cDst = op; /* compress into output buffer, to skip flush stage */
+ else
+ cDst = zcs->outBuff, oSize = zcs->outBuffSize;
+ cSize = lastBlock ?
+ ZSTD_compressEnd(zcs, cDst, oSize,
+ zcs->inBuff + zcs->inToCompress, iSize) :
+ ZSTD_compressContinue(zcs, cDst, oSize,
+ zcs->inBuff + zcs->inToCompress, iSize);
+ FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
+ zcs->frameEnded = lastBlock;
+ /* prepare next block */
+ zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize;
+ if (zcs->inBuffTarget > zcs->inBuffSize)
+ zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize;
+ DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u",
+ (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize);
+ if (!lastBlock)
+ assert(zcs->inBuffTarget <= zcs->inBuffSize);
+ zcs->inToCompress = zcs->inBuffPos;
+ if (cDst == op) { /* no need to flush */
+ op += cSize;
+ if (zcs->frameEnded) {
+ DEBUGLOG(5, "Frame completed directly in outBuffer");
+ someMoreWork = 0;
+ ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ }
+ break;
+ }
+ zcs->outBuffContentSize = cSize;
+ zcs->outBuffFlushedSize = 0;
+ zcs->streamStage = zcss_flush; /* pass-through to flush stage */
+ }
+ /* fall-through */
+ case zcss_flush:
+ DEBUGLOG(5, "flush stage");
+ { size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
+ size_t const flushed = ZSTD_limitCopy(op, (size_t)(oend-op),
+ zcs->outBuff + zcs->outBuffFlushedSize, toFlush);
+ DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u",
+ (unsigned)toFlush, (unsigned)(oend-op), (unsigned)flushed);
+ if (flushed)
+ op += flushed;
+ zcs->outBuffFlushedSize += flushed;
+ if (toFlush!=flushed) {
+ /* flush not fully completed, presumably because dst is too small */
+ assert(op==oend);
+ someMoreWork = 0;
+ break;
+ }
+ zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0;
+ if (zcs->frameEnded) {
+ DEBUGLOG(5, "Frame completed on flush");
+ someMoreWork = 0;
+ ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ break;
+ }
+ zcs->streamStage = zcss_load;
+ break;
+ }
+
+ default: /* impossible */
+ assert(0);
+ }
+ }
+
+ input->pos = ip - istart;
+ output->pos = op - ostart;
+ if (zcs->frameEnded) return 0;
+ return ZSTD_nextInputSizeHint(zcs);
+}
+
+static size_t ZSTD_nextInputSizeHint_MTorST(const ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+ if (cctx->appliedParams.nbWorkers >= 1) {
+ assert(cctx->mtctx != NULL);
+ return ZSTDMT_nextInputSizeHint(cctx->mtctx);
+ }
+#endif
+ return ZSTD_nextInputSizeHint(cctx);
+
+}
+
+size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+ FORWARD_IF_ERROR( ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue) , "");
+ return ZSTD_nextInputSizeHint_MTorST(zcs);
+}
+
+
+size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective endOp)
+{
+ DEBUGLOG(5, "ZSTD_compressStream2, endOp=%u ", (unsigned)endOp);
+ /* check conditions */
+ RETURN_ERROR_IF(output->pos > output->size, GENERIC, "invalid buffer");
+ RETURN_ERROR_IF(input->pos > input->size, GENERIC, "invalid buffer");
+ assert(cctx!=NULL);
+
+ /* transparent initialization stage */
+ if (cctx->streamStage == zcss_init) {
+ ZSTD_CCtx_params params = cctx->requestedParams;
+ ZSTD_prefixDict const prefixDict = cctx->prefixDict;
+ FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */
+ memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */
+ assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */
+ DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage");
+ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = input->size + 1; /* auto-fix pledgedSrcSize */
+ params.cParams = ZSTD_getCParamsFromCCtxParams(
+ &cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/);
+
+
+#ifdef ZSTD_MULTITHREAD
+ if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) {
+ params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */
+ }
+ if (params.nbWorkers > 0) {
+ /* mt context creation */
+ if (cctx->mtctx == NULL) {
+ DEBUGLOG(4, "ZSTD_compressStream2: creating new mtctx for nbWorkers=%u",
+ params.nbWorkers);
+ cctx->mtctx = ZSTDMT_createCCtx_advanced((U32)params.nbWorkers, cctx->customMem);
+ RETURN_ERROR_IF(cctx->mtctx == NULL, memory_allocation, "NULL pointer!");
+ }
+ /* mt compression */
+ DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers);
+ FORWARD_IF_ERROR( ZSTDMT_initCStream_internal(
+ cctx->mtctx,
+ prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType,
+ cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) , "");
+ cctx->streamStage = zcss_load;
+ cctx->appliedParams.nbWorkers = params.nbWorkers;
+ } else
+#endif
+ { FORWARD_IF_ERROR( ZSTD_resetCStream_internal(cctx,
+ prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType,
+ cctx->cdict,
+ params, cctx->pledgedSrcSizePlusOne-1) , "");
+ assert(cctx->streamStage == zcss_load);
+ assert(cctx->appliedParams.nbWorkers == 0);
+ } }
+ /* end of transparent initialization stage */
+
+ /* compression stage */
+#ifdef ZSTD_MULTITHREAD
+ if (cctx->appliedParams.nbWorkers > 0) {
+ int const forceMaxProgress = (endOp == ZSTD_e_flush || endOp == ZSTD_e_end);
+ size_t flushMin;
+ assert(forceMaxProgress || endOp == ZSTD_e_continue /* Protection for a new flush type */);
+ if (cctx->cParamsChanged) {
+ ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams);
+ cctx->cParamsChanged = 0;
+ }
+ do {
+ flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp);
+ if ( ZSTD_isError(flushMin)
+ || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */
+ ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+ }
+ FORWARD_IF_ERROR(flushMin, "ZSTDMT_compressStream_generic failed");
+ } while (forceMaxProgress && flushMin != 0 && output->pos < output->size);
+ DEBUGLOG(5, "completed ZSTD_compressStream2 delegating to ZSTDMT_compressStream_generic");
+ /* Either we don't require maximum forward progress, we've finished the
+ * flush, or we are out of output space.
+ */
+ assert(!forceMaxProgress || flushMin == 0 || output->pos == output->size);
+ return flushMin;
+ }
+#endif
+ FORWARD_IF_ERROR( ZSTD_compressStream_generic(cctx, output, input, endOp) , "");
+ DEBUGLOG(5, "completed ZSTD_compressStream2");
+ return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */
+}
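+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0) of the
+ * caller-side contract implemented by ZSTD_compressStream2() above: feed each
+ * chunk with ZSTD_e_continue, finish with ZSTD_e_end, and keep calling while
+ * the return value (bytes still buffered internally) is non-zero.
+ * The writeFn callback and all example_* names are hypothetical. */
+#if 0
+typedef size_t (*example_write_fn)(const void* buf, size_t len, void* opaque);
+
+static size_t example_stream_chunk(ZSTD_CCtx* cctx,
+                                   const void* chunk, size_t chunkSize, int isLastChunk,
+                                   void* outBuf, size_t outCapacity,
+                                   example_write_fn writeFn, void* opaque)
+{
+    ZSTD_EndDirective const mode = isLastChunk ? ZSTD_e_end : ZSTD_e_continue;
+    ZSTD_inBuffer in = { chunk, chunkSize, 0 };
+    int finished;
+    do {
+        ZSTD_outBuffer out = { outBuf, outCapacity, 0 };
+        size_t const remaining = ZSTD_compressStream2(cctx, &out, &in, mode);
+        if (ZSTD_isError(remaining)) return remaining;
+        if (out.pos) writeFn(outBuf, out.pos, opaque);
+        /* done when the chunk is consumed and, on the last chunk, the frame is fully flushed */
+        finished = isLastChunk ? (remaining == 0) : (in.pos == in.size);
+    } while (!finished);
+    return 0;
+}
+#endif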
+
+size_t ZSTD_compressStream2_simpleArgs (
+ ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos,
+ ZSTD_EndDirective endOp)
+{
+ ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+ ZSTD_inBuffer input = { src, srcSize, *srcPos };
+ /* ZSTD_compressStream2() will check validity of dstPos and srcPos */
+ size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
+ *dstPos = output.pos;
+ *srcPos = input.pos;
+ return cErr;
+}
+
+size_t ZSTD_compress2(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(4, "ZSTD_compress2 (srcSize=%u)", (unsigned)srcSize);
+ ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+ { size_t oPos = 0;
+ size_t iPos = 0;
+ size_t const result = ZSTD_compressStream2_simpleArgs(cctx,
+ dst, dstCapacity, &oPos,
+ src, srcSize, &iPos,
+ ZSTD_e_end);
+ FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
+ if (result != 0) { /* compression not completed, due to lack of output space */
+ assert(oPos == dstCapacity);
+ RETURN_ERROR(dstSize_tooSmall, "");
+ }
+ assert(iPos == srcSize); /* all input is expected to be consumed */
+ return oPos;
+ }
+}
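+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0): since
+ * ZSTD_compress2() above reports dstSize_tooSmall when the frame does not fit,
+ * sizing dst with ZSTD_compressBound(srcSize) guarantees the single call never
+ * fails for lack of output space. The example_* name is hypothetical. */
+#if 0
+/* dst must provide at least ZSTD_compressBound(srcSize) bytes */
+static size_t example_compress2_bound(ZSTD_CCtx* cctx,
+                                      void* dst, const void* src, size_t srcSize, int level)
+{
+    size_t const dstCapacity = ZSTD_compressBound(srcSize);   /* worst-case compressed size */
+    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);
+    /* with a bound-sized dst, failures can only come from parameters or allocation */
+    return ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+}
+#endif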
+
+/*====== Finalize ======*/
+
+/*! ZSTD_flushStream() :
+ * @return : amount of data remaining to flush */
+size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+ ZSTD_inBuffer input = { NULL, 0, 0 };
+ return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush);
+}
+
+
+size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+ ZSTD_inBuffer input = { NULL, 0, 0 };
+ size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end);
+ FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed");
+ if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */
+ /* single thread mode : attempt to calculate remaining to flush more precisely */
+ { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;
+ size_t const checksumSize = (size_t)(zcs->frameEnded ? 0 : zcs->appliedParams.fParams.checksumFlag * 4);
+ size_t const toFlush = remainingToFlush + lastBlockSize + checksumSize;
+ DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (unsigned)toFlush);
+ return toFlush;
+ }
+}
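+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0): both
+ * ZSTD_flushStream() and ZSTD_endStream() above return the number of bytes
+ * still held in internal buffers, so callers drain in a loop, providing fresh
+ * output space each round, until the return value reaches 0. */
+#if 0
+static size_t example_finish_frame(ZSTD_CStream* zcs, void* outBuf, size_t outCapacity)
+{
+    size_t totalFlushed = 0;
+    size_t remaining;
+    do {
+        ZSTD_outBuffer out = { outBuf, outCapacity, 0 };
+        remaining = ZSTD_endStream(zcs, &out);   /* 0 => frame epilogue fully written */
+        if (ZSTD_isError(remaining)) return remaining;
+        /* out.pos bytes are now ready in outBuf; a real caller would write them out here */
+        totalFlushed += out.pos;
+    } while (remaining != 0);
+    return totalFlushed;
+}
+#endif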
+
+
+/*-===== Pre-defined compression levels =====-*/
+
+#define ZSTD_MAX_CLEVEL 22
+int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
+int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; }
+
+static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
+{ /* "default" - for any srcSize > 256 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */
+ { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */
+ { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */
+ { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */
+ { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */
+ { 21, 18, 19, 2, 5, 2, ZSTD_greedy }, /* level 5 */
+ { 21, 19, 19, 3, 5, 4, ZSTD_greedy }, /* level 6 */
+ { 21, 19, 19, 3, 5, 8, ZSTD_lazy }, /* level 7 */
+ { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */
+ { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */
+ { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */
+ { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */
+ { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */
+ { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 13 */
+ { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */
+ { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */
+ { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */
+ { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */
+ { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */
+ { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */
+ { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */
+ { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */
+ { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */
+},
+{ /* for srcSize <= 256 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */
+ { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */
+ { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */
+ { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/
+ { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/
+ { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/
+ { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */
+ { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */
+ { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */
+ { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */
+ { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/
+ { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/
+ { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */
+ { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/
+ { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/
+ { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/
+ { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/
+ { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/
+ { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+{ /* for srcSize <= 128 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */
+ { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */
+ { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */
+ { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */
+ { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */
+ { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */
+ { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */
+ { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */
+ { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */
+ { 17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */
+ { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */
+ { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */
+ { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/
+ { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/
+ { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/
+ { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/
+ { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/
+ { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/
+ { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+{ /* for srcSize <= 16 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */
+ { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */
+ { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */
+ { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */
+ { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/
+ { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */
+ { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */
+ { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/
+ { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/
+ { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/
+ { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/
+ { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/
+ { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/
+ { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/
+ { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/
+ { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/
+ { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/
+ { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/
+ { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+};
+
+/*! ZSTD_getCParams_internal() :
+ * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
+ * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown.
+ * Use dictSize == 0 for unknown or unused. */
+static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+{
+ int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN;
+ size_t const addedSize = unknown && dictSize > 0 ? 500 : 0;
+ U64 const rSize = unknown && dictSize == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint+dictSize+addedSize;
+ U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB);
+ int row = compressionLevel;
+ DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel);
+ if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT; /* 0 == default */
+ if (compressionLevel < 0) row = 0; /* entry 0 is baseline for fast mode */
+ if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL;
+ { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row];
+ if (compressionLevel < 0) cp.targetLength = (unsigned)(-compressionLevel); /* acceleration factor */
+ /* refine parameters based on srcSize & dictSize */
+ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize);
+ }
+}
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
+ * Size values are optional, provide 0 if not known or unused */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+{
+ if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
+ return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize);
+}
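+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0) of how the
+ * tables above are selected: the srcSize hint picks one of the four tables,
+ * the level picks the row, and negative levels reuse row 0 with targetLength
+ * carrying the acceleration factor. Exact parameter values may change between
+ * versions; the comments only state the expected trend. */
+#if 0
+static void example_inspect_cparams(void)
+{
+    /* same level, different srcSize hints => different tables, hence different windowLog */
+    ZSTD_compressionParameters const big   = ZSTD_getCParams(3, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+    ZSTD_compressionParameters const small = ZSTD_getCParams(3, 16 * 1024, 0);
+    /* negative level : row 0 of the table, with targetLength set to the acceleration amount */
+    ZSTD_compressionParameters const fast  = ZSTD_getCParams(-5, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+    (void)big; (void)small; (void)fast;
+    /* expected: big.windowLog >= small.windowLog, and fast.targetLength == 5 */
+}
+#endif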
+
+/*! ZSTD_getParams_internal() :
+ * same idea as ZSTD_getCParams_internal()
+ * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
+ * Fields of `ZSTD_frameParameters` are set to default values */
+static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
+ ZSTD_parameters params;
+ ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize);
+ DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel);
+ memset(&params, 0, sizeof(params));
+ params.cParams = cParams;
+ params.fParams.contentSizeFlag = 1;
+ return params;
+}
+
+/*! ZSTD_getParams() :
+ * same idea as ZSTD_getCParams()
+ * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
+ * Fields of `ZSTD_frameParameters` are set to default values */
+ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
+ if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
+ return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize);
+}
+/**** ended inlining compress/zstd_compress.c ****/
+/**** start inlining compress/zstd_double_fast.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: zstd_double_fast.h ****/
+
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+ void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashLarge = ms->hashTable;
+ U32 const hBitsL = cParams->hashLog;
+ U32 const mls = cParams->minMatch;
+ U32* const hashSmall = ms->chainTable;
+ U32 const hBitsS = cParams->chainLog;
+ const BYTE* const base = ms->window.base;
+ const BYTE* ip = base + ms->nextToUpdate;
+ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+ const U32 fastHashFillStep = 3;
+
+ /* Always insert every fastHashFillStep-th position into both hash tables.
+ * Insert the intermediate positions into the large hash table only if
+ * their entry is empty.
+ */
+ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+ U32 const current = (U32)(ip - base);
+ U32 i;
+ for (i = 0; i < fastHashFillStep; ++i) {
+ size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls);
+ size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8);
+ if (i == 0)
+ hashSmall[smHash] = current + i;
+ if (i == 0 || hashLarge[lgHash] == 0)
+ hashLarge[lgHash] = current + i;
+ /* Only load extra positions for ZSTD_dtlm_full */
+ if (dtlm == ZSTD_dtlm_fast)
+ break;
+ } }
+}
+
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_doubleFast_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize,
+ U32 const mls /* template */, ZSTD_dictMode_e const dictMode)
+{
+ ZSTD_compressionParameters const* cParams = &ms->cParams;
+ U32* const hashLong = ms->hashTable;
+ const U32 hBitsL = cParams->hashLog;
+ U32* const hashSmall = ms->chainTable;
+ const U32 hBitsS = cParams->chainLog;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ /* presumes that, if there is a dictionary, it must be using Attach mode */
+ const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
+ const BYTE* const prefixLowest = base + prefixLowestIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1=rep[0], offset_2=rep[1];
+ U32 offsetSaved = 0;
+
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const ZSTD_compressionParameters* const dictCParams =
+ dictMode == ZSTD_dictMatchState ?
+ &dms->cParams : NULL;
+ const U32* const dictHashLong = dictMode == ZSTD_dictMatchState ?
+ dms->hashTable : NULL;
+ const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ?
+ dms->chainTable : NULL;
+ const U32 dictStartIndex = dictMode == ZSTD_dictMatchState ?
+ dms->window.dictLimit : 0;
+ const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ?
+ dms->window.base : NULL;
+ const BYTE* const dictStart = dictMode == ZSTD_dictMatchState ?
+ dictBase + dictStartIndex : NULL;
+ const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ?
+ dms->window.nextSrc : NULL;
+ const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ?
+ prefixLowestIndex - (U32)(dictEnd - dictBase) :
+ 0;
+ const U32 dictHBitsL = dictMode == ZSTD_dictMatchState ?
+ dictCParams->hashLog : hBitsL;
+ const U32 dictHBitsS = dictMode == ZSTD_dictMatchState ?
+ dictCParams->chainLog : hBitsS;
+ const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
+
+ DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic");
+
+ assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState);
+
+ /* if a dictionary is attached, it must be within window range */
+ if (dictMode == ZSTD_dictMatchState) {
+ assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
+ }
+
+ /* init */
+ ip += (dictAndPrefixLength == 0);
+ if (dictMode == ZSTD_noDict) {
+ U32 const current = (U32)(ip - base);
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
+ U32 const maxRep = current - windowLow;
+ if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+ }
+ if (dictMode == ZSTD_dictMatchState) {
+ /* dictMatchState repCode checks don't currently handle repCode == 0
+ * disabling. */
+ assert(offset_1 <= dictAndPrefixLength);
+ assert(offset_2 <= dictAndPrefixLength);
+ }
+
+ /* Main Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ size_t mLength;
+ U32 offset;
+ size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
+ size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+ size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+ size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+ U32 const current = (U32)(ip-base);
+ U32 const matchIndexL = hashLong[h2];
+ U32 matchIndexS = hashSmall[h];
+ const BYTE* matchLong = base + matchIndexL;
+ const BYTE* match = base + matchIndexS;
+ const U32 repIndex = current + 1 - offset_1;
+ const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+ && repIndex < prefixLowestIndex) ?
+ dictBase + (repIndex - dictIndexDelta) :
+ base + repIndex;
+ hashLong[h2] = hashSmall[h] = current; /* update hash tables */
+
+ /* check dictMatchState repcode */
+ if (dictMode == ZSTD_dictMatchState
+ && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+ mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
+ goto _match_stored;
+ }
+
+ /* check noDict repcode */
+ if ( dictMode == ZSTD_noDict
+ && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
+ mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
+ goto _match_stored;
+ }
+
+ if (matchIndexL > prefixLowestIndex) {
+ /* check prefix long match */
+ if (MEM_read64(matchLong) == MEM_read64(ip)) {
+ mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8;
+ offset = (U32)(ip-matchLong);
+ while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+ goto _match_found;
+ }
+ } else if (dictMode == ZSTD_dictMatchState) {
+ /* check dictMatchState long match */
+ U32 const dictMatchIndexL = dictHashLong[dictHL];
+ const BYTE* dictMatchL = dictBase + dictMatchIndexL;
+ assert(dictMatchL < dictEnd);
+
+ if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) {
+ mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8;
+ offset = (U32)(current - dictMatchIndexL - dictIndexDelta);
+ while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */
+ goto _match_found;
+ } }
+
+ if (matchIndexS > prefixLowestIndex) {
+ /* check prefix short match */
+ if (MEM_read32(match) == MEM_read32(ip)) {
+ goto _search_next_long;
+ }
+ } else if (dictMode == ZSTD_dictMatchState) {
+ /* check dictMatchState short match */
+ U32 const dictMatchIndexS = dictHashSmall[dictHS];
+ match = dictBase + dictMatchIndexS;
+ matchIndexS = dictMatchIndexS + dictIndexDelta;
+
+ if (match > dictStart && MEM_read32(match) == MEM_read32(ip)) {
+ goto _search_next_long;
+ } }
+
+ ip += ((ip-anchor) >> kSearchStrength) + 1;
+#if defined(__aarch64__)
+ PREFETCH_L1(ip+256);
+#endif
+ continue;
+
+_search_next_long:
+
+ { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+ size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
+ U32 const matchIndexL3 = hashLong[hl3];
+ const BYTE* matchL3 = base + matchIndexL3;
+ hashLong[hl3] = current + 1;
+
+ /* check prefix long +1 match */
+ if (matchIndexL3 > prefixLowestIndex) {
+ if (MEM_read64(matchL3) == MEM_read64(ip+1)) {
+ mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8;
+ ip++;
+ offset = (U32)(ip-matchL3);
+ while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
+ goto _match_found;
+ }
+ } else if (dictMode == ZSTD_dictMatchState) {
+ /* check dict long +1 match */
+ U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];
+ const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
+ assert(dictMatchL3 < dictEnd);
+ if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
+ mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8;
+ ip++;
+ offset = (U32)(current + 1 - dictMatchIndexL3 - dictIndexDelta);
+ while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */
+ goto _match_found;
+ } } }
+
+ /* if no long +1 match, explore the short match we found */
+ if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) {
+ mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4;
+ offset = (U32)(current - matchIndexS);
+ while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+ } else {
+ mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+ offset = (U32)(ip - match);
+ while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+ }
+
+ /* fall-through */
+
+_match_found:
+ offset_2 = offset_1;
+ offset_1 = offset;
+
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+_match_stored:
+ /* match found */
+ ip += mLength;
+ anchor = ip;
+
+ if (ip <= ilimit) {
+ /* Complementary insertion */
+ /* done after iLimit test, as candidates could be > iend-8 */
+ { U32 const indexToInsert = current+2;
+ hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+ hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+ hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+ hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+ }
+
+ /* check immediate repcode */
+ if (dictMode == ZSTD_dictMatchState) {
+ while (ip <= ilimit) {
+ U32 const current2 = (U32)(ip-base);
+ U32 const repIndex2 = current2 - offset_2;
+ const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState
+ && repIndex2 < prefixLowestIndex ?
+ dictBase + repIndex2 - dictIndexDelta :
+ base + repIndex2;
+ if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
+ U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH);
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+ ip += repLength2;
+ anchor = ip;
+ continue;
+ }
+ break;
+ } }
+
+ if (dictMode == ZSTD_noDict) {
+ while ( (ip <= ilimit)
+ && ( (offset_2>0)
+ & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+ /* store sequence */
+ size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+ U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, rLength-MINMATCH);
+ ip += rLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ } } }
+ } /* while (ip < ilimit) */
+
+ /* save reps for next block */
+ rep[0] = offset_1 ? offset_1 : offsetSaved;
+ rep[1] = offset_2 ? offset_2 : offsetSaved;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
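+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0) of the
+ * two-slot repeat-offset history manipulated above: storing a freshly found
+ * offset pushes the previous offset_1 into offset_2, while reusing the second
+ * repeat offset simply swaps the two slots. The example_* names are hypothetical. */
+#if 0
+typedef struct { U32 offset_1, offset_2; } example_rep_history;
+
+static void example_push_new_offset(example_rep_history* h, U32 newOffset)
+{
+    h->offset_2 = h->offset_1;   /* previous most-recent offset becomes the second choice */
+    h->offset_1 = newOffset;
+}
+
+static void example_use_second_rep(example_rep_history* h)
+{
+    U32 const tmp = h->offset_2; /* the offset just reused becomes the most recent one */
+    h->offset_2 = h->offset_1;
+    h->offset_1 = tmp;
+}
+#endif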
+
+
+size_t ZSTD_compressBlock_doubleFast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ const U32 mls = ms->cParams.minMatch;
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict);
+ case 5 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict);
+ case 6 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict);
+ case 7 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict);
+ }
+}
+
+
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ const U32 mls = ms->cParams.minMatch;
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState);
+ case 5 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState);
+ case 6 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState);
+ case 7 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState);
+ }
+}
+
+
+static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize,
+ U32 const mls /* template */)
+{
+ ZSTD_compressionParameters const* cParams = &ms->cParams;
+ U32* const hashLong = ms->hashTable;
+ U32 const hBitsL = cParams->hashLog;
+ U32* const hashSmall = ms->chainTable;
+ U32 const hBitsS = cParams->chainLog;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ const BYTE* const base = ms->window.base;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
+ const U32 dictStartIndex = lowLimit;
+ const U32 dictLimit = ms->window.dictLimit;
+ const U32 prefixStartIndex = (dictLimit > lowLimit) ? dictLimit : lowLimit;
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const BYTE* const dictEnd = dictBase + prefixStartIndex;
+ U32 offset_1=rep[0], offset_2=rep[1];
+
+ DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize);
+
+ /* if extDict is invalidated due to maxDistance, switch to "regular" variant */
+ if (prefixStartIndex == dictStartIndex)
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, mls, ZSTD_noDict);
+
+ /* Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
+ const U32 matchIndex = hashSmall[hSmall];
+ const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* match = matchBase + matchIndex;
+
+ const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
+ const U32 matchLongIndex = hashLong[hLong];
+ const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* matchLong = matchLongBase + matchLongIndex;
+
+ const U32 current = (U32)(ip-base);
+ const U32 repIndex = current + 1 - offset_1; /* offset_1 expected <= current +1 */
+ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ size_t mLength;
+ hashSmall[hSmall] = hashLong[hLong] = current; /* update hash table */
+
+ if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */
+ & (repIndex > dictStartIndex))
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+ const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+ mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
+ } else {
+ if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
+ const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
+ const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? dictStart : prefixStart;
+ U32 offset;
+ mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8;
+ offset = current - matchLongIndex;
+ while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+ offset_2 = offset_1;
+ offset_1 = offset;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+ } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
+ size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+ U32 const matchIndex3 = hashLong[h3];
+ const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base;
+ const BYTE* match3 = match3Base + matchIndex3;
+ U32 offset;
+ hashLong[h3] = current + 1;
+ if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) {
+ const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend;
+ const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? dictStart : prefixStart;
+ mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8;
+ ip++;
+ offset = current+1 - matchIndex3;
+ while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */
+ } else {
+ const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
+ const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
+ mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
+ offset = current - matchIndex;
+ while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+ }
+ offset_2 = offset_1;
+ offset_1 = offset;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+ } else {
+ ip += ((ip-anchor) >> kSearchStrength) + 1;
+ continue;
+ } }
+
+ /* move to next sequence start */
+ ip += mLength;
+ anchor = ip;
+
+ if (ip <= ilimit) {
+ /* Complementary insertion */
+ /* done after iLimit test, as candidates could be > iend-8 */
+ { U32 const indexToInsert = current+2;
+ hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+ hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+ hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+ hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+ }
+
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+ U32 const current2 = (U32)(ip-base);
+ U32 const repIndex2 = current2 - offset_2;
+ const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */
+ & (repIndex2 > dictStartIndex))
+ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+ U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH);
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+ ip += repLength2;
+ anchor = ip;
+ continue;
+ }
+ break;
+ } } }
+
+ /* save reps for next block */
+ rep[0] = offset_1;
+ rep[1] = offset_2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_doubleFast_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ U32 const mls = ms->cParams.minMatch;
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
+ case 5 :
+ return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
+ case 6 :
+ return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
+ case 7 :
+ return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
+ }
+}
+/**** ended inlining compress/zstd_double_fast.c ****/
+/**** start inlining compress/zstd_fast.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: zstd_fast.h ****/
+
+
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+ const void* const end,
+ ZSTD_dictTableLoadMethod_e dtlm)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hBits = cParams->hashLog;
+ U32 const mls = cParams->minMatch;
+ const BYTE* const base = ms->window.base;
+ const BYTE* ip = base + ms->nextToUpdate;
+ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+ const U32 fastHashFillStep = 3;
+
+ /* Always insert every fastHashFillStep-th position into the hash table.
+ * Insert the intermediate positions only if their hash entry is empty.
+ */
+ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
+ U32 const current = (U32)(ip - base);
+ size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
+ hashTable[hash0] = current;
+ if (dtlm == ZSTD_dtlm_fast) continue;
+ /* Only load extra positions for ZSTD_dtlm_full */
+ { U32 p;
+ for (p = 1; p < fastHashFillStep; ++p) {
+ size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
+ if (hashTable[hash] == 0) { /* not yet filled */
+ hashTable[hash] = current + p;
+ } } } }
+}
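+
+/* Illustrative toy sketch (not part of upstream zstd; kept under #if 0) of the
+ * fill pattern used by ZSTD_fillHashTable() above, on a plain array with a toy
+ * hash: every fastHashFillStep-th position is always inserted, the positions
+ * in between only when their slot is still empty and a full dictionary load
+ * was requested. All names and the hash function are hypothetical. */
+#if 0
+static void example_fill_pattern(U32* table, size_t tableSize,
+                                 const BYTE* buf, size_t bufSize, int loadFull)
+{
+    size_t const step = 3;   /* mirrors fastHashFillStep */
+    size_t pos;
+    for (pos = 0; pos + step <= bufSize; pos += step) {
+        size_t i;
+        for (i = 0; i < step; i++) {
+            size_t const slot = (buf[pos + i] * 2654435761u) % tableSize;   /* toy hash */
+            if (i == 0)
+                table[slot] = (U32)(pos + i);        /* anchor positions : always inserted */
+            else if (loadFull && table[slot] == 0)
+                table[slot] = (U32)(pos + i);        /* extra positions : only if slot empty */
+        }
+    }
+}
+#endif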
+
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_compressBlock_fast_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize,
+ U32 const mls)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hlog = cParams->hashLog;
+ /* support stepSize of 0 */
+ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const istart = (const BYTE*)src;
+ /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */
+ const BYTE* ip0 = istart;
+ const BYTE* ip1;
+ const BYTE* anchor = istart;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1=rep[0], offset_2=rep[1];
+ U32 offsetSaved = 0;
+
+ /* init */
+ DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
+ ip0 += (ip0 == prefixStart);
+ ip1 = ip0 + 1;
+ { U32 const current = (U32)(ip0 - base);
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
+ U32 const maxRep = current - windowLow;
+ if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+ }
+
+ /* Main Search Loop */
+#ifdef __INTEL_COMPILER
+ /* From Intel: 'The vector pragma indicates that the loop should be
+ * vectorized if it is legal to do so'. It can be used together with
+ * #pragma ivdep, but we have opted to exclude that because Intel
+ * warns against using it. */
+ #pragma vector always
+#endif
+ while (ip1 < ilimit) { /* < instead of <=, because check at ip0+2 */
+ size_t mLength;
+ BYTE const* ip2 = ip0 + 2;
+ size_t const h0 = ZSTD_hashPtr(ip0, hlog, mls);
+ U32 const val0 = MEM_read32(ip0);
+ size_t const h1 = ZSTD_hashPtr(ip1, hlog, mls);
+ U32 const val1 = MEM_read32(ip1);
+ U32 const current0 = (U32)(ip0-base);
+ U32 const current1 = (U32)(ip1-base);
+ U32 const matchIndex0 = hashTable[h0];
+ U32 const matchIndex1 = hashTable[h1];
+ BYTE const* repMatch = ip2 - offset_1;
+ const BYTE* match0 = base + matchIndex0;
+ const BYTE* match1 = base + matchIndex1;
+ U32 offcode;
+
+#if defined(__aarch64__)
+ PREFETCH_L1(ip0+256);
+#endif
+
+ hashTable[h0] = current0; /* update hash table */
+ hashTable[h1] = current1; /* update hash table */
+
+ assert(ip0 + 1 == ip1);
+
+ if ((offset_1 > 0) & (MEM_read32(repMatch) == MEM_read32(ip2))) {
+ mLength = (ip2[-1] == repMatch[-1]) ? 1 : 0;
+ ip0 = ip2 - mLength;
+ match0 = repMatch - mLength;
+ mLength += 4;
+ offcode = 0;
+ goto _match;
+ }
+ if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) == val0) {
+ /* found a regular match */
+ goto _offset;
+ }
+ if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) == val1) {
+ /* found a regular match after one literal */
+ ip0 = ip1;
+ match0 = match1;
+ goto _offset;
+ }
+ { size_t const step = ((size_t)(ip0-anchor) >> (kSearchStrength - 1)) + stepSize;
+ assert(step >= 2);
+ ip0 += step;
+ ip1 += step;
+ continue;
+ }
+_offset: /* Requires: ip0, match0 */
+ /* Compute the offset code */
+ offset_2 = offset_1;
+ offset_1 = (U32)(ip0-match0);
+ offcode = offset_1 + ZSTD_REP_MOVE;
+ mLength = 4;
+ /* Count the backwards match length */
+ while (((ip0>anchor) & (match0>prefixStart))
+ && (ip0[-1] == match0[-1])) { ip0--; match0--; mLength++; } /* catch up */
+
+_match: /* Requires: ip0, match0, offcode */
+ /* Count the forward length */
+ mLength += ZSTD_count(ip0+mLength, match0+mLength, iend);
+ ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcode, mLength-MINMATCH);
+ /* match found */
+ ip0 += mLength;
+ anchor = ip0;
+
+ if (ip0 <= ilimit) {
+ /* Fill Table */
+ assert(base+current0+2 > istart); /* check base overflow */
+ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
+ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+ if (offset_2 > 0) { /* offset_2==0 means offset_2 is invalidated */
+ while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - offset_2)) ) {
+ /* store sequence */
+ size_t const rLength = ZSTD_count(ip0+4, ip0+4-offset_2, iend) + 4;
+ { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */
+ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+ ip0 += rLength;
+ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 /*offCode*/, rLength-MINMATCH);
+ anchor = ip0;
+ continue; /* faster when present (confirmed on gcc-8) ... (?) */
+ } } }
+ ip1 = ip0 + 1;
+ }
+
+ /* save reps for next block */
+ rep[0] = offset_1 ? offset_1 : offsetSaved;
+ rep[1] = offset_2 ? offset_2 : offsetSaved;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
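+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0) of the
+ * skip-ahead heuristic in the search loop above: the step grows with the
+ * distance already scanned since the last match (anchor), so long literal runs
+ * are skimmed increasingly fast. The worked values assume a search strength of
+ * 8 and the minimum stepSize of 2 computed above. */
+#if 0
+static size_t example_search_step(size_t distanceFromAnchor, size_t searchStrength, size_t stepSize)
+{
+    /* mirrors: ((ip0 - anchor) >> (kSearchStrength - 1)) + stepSize */
+    return (distanceFromAnchor >> (searchStrength - 1)) + stepSize;
+}
+/* e.g. with searchStrength == 8 and stepSize == 2 :
+ *   distance   0  ->  step 2
+ *   distance 128  ->  step 3
+ *   distance 512  ->  step 6
+ */
+#endif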
+
+
+size_t ZSTD_compressBlock_fast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ U32 const mls = ms->cParams.minMatch;
+ assert(ms->dictMatchState == NULL);
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4);
+ case 5 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5);
+ case 6 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6);
+ case 7 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7);
+ }
+}
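+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0) of the
+ * template-by-switch idiom used by the dispatchers above: the generic worker
+ * is force-inlined with its `mls`-like parameter as a compile-time constant,
+ * and the entry point switches on the runtime value so each case instantiates
+ * a specialised copy. The example_* names are hypothetical. */
+#if 0
+FORCE_INLINE_TEMPLATE size_t example_generic(const BYTE* p, size_t n, U32 const width)
+{
+    size_t acc = 0, i;
+    for (i = 0; i + width <= n; i += width) acc += p[i];   /* body specialised per width */
+    return acc;
+}
+
+static size_t example_dispatch(const BYTE* p, size_t n, U32 width)
+{
+    switch(width)
+    {
+    default: /* includes out-of-range values */
+    case 4 : return example_generic(p, n, 4);
+    case 5 : return example_generic(p, n, 5);
+    case 6 : return example_generic(p, n, 6);
+    case 7 : return example_generic(p, n, 7);
+    }
+}
+#endif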
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_fast_dictMatchState_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize, U32 const mls)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hlog = cParams->hashLog;
+ /* support stepSize of 0 */
+ U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+ const BYTE* const base = ms->window.base;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const U32 prefixStartIndex = ms->window.dictLimit;
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1=rep[0], offset_2=rep[1];
+ U32 offsetSaved = 0;
+
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
+ const U32* const dictHashTable = dms->hashTable;
+ const U32 dictStartIndex = dms->window.dictLimit;
+ const BYTE* const dictBase = dms->window.base;
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const BYTE* const dictEnd = dms->window.nextSrc;
+ const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase);
+ const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart);
+ const U32 dictHLog = dictCParams->hashLog;
+
+ /* if a dictionary is still attached, it necessarily means that
+ * it is within the window size, so we simply assert it below. */
+ const U32 maxDistance = 1U << cParams->windowLog;
+ const U32 endIndex = (U32)((size_t)(ip - base) + srcSize);
+ assert(endIndex - prefixStartIndex <= maxDistance);
+ (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */
+
+ /* ensure there will be no underflow
+ * when translating a dict index into a local index */
+ assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
+
+ /* init */
+ DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
+ ip += (dictAndPrefixLength == 0);
+ /* dictMatchState repCode checks don't currently handle repCode == 0
+ * disabling. */
+ assert(offset_1 <= dictAndPrefixLength);
+ assert(offset_2 <= dictAndPrefixLength);
+
+ /* Main Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ size_t mLength;
+ size_t const h = ZSTD_hashPtr(ip, hlog, mls);
+ U32 const current = (U32)(ip-base);
+ U32 const matchIndex = hashTable[h];
+ const BYTE* match = base + matchIndex;
+ const U32 repIndex = current + 1 - offset_1;
+ const BYTE* repMatch = (repIndex < prefixStartIndex) ?
+ dictBase + (repIndex - dictIndexDelta) :
+ base + repIndex;
+ hashTable[h] = current; /* update hash table */
+
+ if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+ mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
+ } else if ( (matchIndex <= prefixStartIndex) ) {
+ size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls);
+ U32 const dictMatchIndex = dictHashTable[dictHash];
+ const BYTE* dictMatch = dictBase + dictMatchIndex;
+ if (dictMatchIndex <= dictStartIndex ||
+ MEM_read32(dictMatch) != MEM_read32(ip)) {
+ assert(stepSize >= 1);
+ ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+ continue;
+ } else {
+ /* found a dict match */
+ U32 const offset = (U32)(current-dictMatchIndex-dictIndexDelta);
+ mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4;
+ while (((ip>anchor) & (dictMatch>dictStart))
+ && (ip[-1] == dictMatch[-1])) {
+ ip--; dictMatch--; mLength++;
+ } /* catch up */
+ offset_2 = offset_1;
+ offset_1 = offset;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ }
+ } else if (MEM_read32(match) != MEM_read32(ip)) {
+ /* it's not a match, and we're not going to check the dictionary */
+ assert(stepSize >= 1);
+ ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+ continue;
+ } else {
+ /* found a regular match */
+ U32 const offset = (U32)(ip-match);
+ mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+ while (((ip>anchor) & (match>prefixStart))
+ && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+ offset_2 = offset_1;
+ offset_1 = offset;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ }
+
+ /* match found */
+ ip += mLength;
+ anchor = ip;
+
+ if (ip <= ilimit) {
+ /* Fill Table */
+ assert(base+current+2 > istart); /* check base overflow */
+ hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2; /* here because current+2 could be > iend-8 */
+ hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+ U32 const current2 = (U32)(ip-base);
+ U32 const repIndex2 = current2 - offset_2;
+ const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
+ dictBase - dictIndexDelta + repIndex2 :
+ base + repIndex2;
+ if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+ U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH);
+ hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
+ ip += repLength2;
+ anchor = ip;
+ continue;
+ }
+ break;
+ }
+ }
+ }
+
+ /* save reps for next block */
+ rep[0] = offset_1 ? offset_1 : offsetSaved;
+ rep[1] = offset_2 ? offset_2 : offsetSaved;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+size_t ZSTD_compressBlock_fast_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ U32 const mls = ms->cParams.minMatch;
+ assert(ms->dictMatchState != NULL);
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4);
+ case 5 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5);
+ case 6 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6);
+ case 7 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7);
+ }
+}
+
+
+static size_t ZSTD_compressBlock_fast_extDict_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize, U32 const mls)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hlog = cParams->hashLog;
+ /* support stepSize of 0 */
+ U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
+ const U32 dictStartIndex = lowLimit;
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const U32 dictLimit = ms->window.dictLimit;
+ const U32 prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const dictEnd = dictBase + prefixStartIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ U32 offset_1=rep[0], offset_2=rep[1];
+
+ DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);
+
+ /* switch to "regular" variant if extDict is invalidated due to maxDistance */
+ if (prefixStartIndex == dictStartIndex)
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls);
+
+ /* Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ const size_t h = ZSTD_hashPtr(ip, hlog, mls);
+ const U32 matchIndex = hashTable[h];
+ const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* match = matchBase + matchIndex;
+ const U32 current = (U32)(ip-base);
+ const U32 repIndex = current + 1 - offset_1;
+ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ hashTable[h] = current; /* update hash table */
+ DEBUGLOG(7, "offset_1 = %u , current = %u", offset_1, current);
+ assert(offset_1 <= current +1); /* check repIndex */
+
+ if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex))
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+ size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, rLength-MINMATCH);
+ ip += rLength;
+ anchor = ip;
+ } else {
+ if ( (matchIndex < dictStartIndex) ||
+ (MEM_read32(match) != MEM_read32(ip)) ) {
+ assert(stepSize >= 1);
+ ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+ continue;
+ }
+ { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
+ const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
+ U32 const offset = current - matchIndex;
+ size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
+ while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+ offset_2 = offset_1; offset_1 = offset; /* update offset history */
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ ip += mLength;
+ anchor = ip;
+ } }
+
+ if (ip <= ilimit) {
+ /* Fill Table */
+ hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2;
+ hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+ U32 const current2 = (U32)(ip-base);
+ U32 const repIndex2 = current2 - offset_2;
+ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */
+ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, 0 /*offcode*/, repLength2-MINMATCH);
+ hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
+ ip += repLength2;
+ anchor = ip;
+ continue;
+ }
+ break;
+ } } }
+
+ /* save reps for next block */
+ rep[0] = offset_1;
+ rep[1] = offset_2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_fast_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ U32 const mls = ms->cParams.minMatch;
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
+ case 5 :
+ return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
+ case 6 :
+ return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
+ case 7 :
+ return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
+ }
+}
+/**** ended inlining compress/zstd_fast.c ****/
+/**** start inlining compress/zstd_lazy.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: zstd_lazy.h ****/
+
+
+/*-*************************************
+* Binary Tree search
+***************************************/
+
+static void
+ZSTD_updateDUBT(ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* iend,
+ U32 mls)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hashLog = cParams->hashLog;
+
+ U32* const bt = ms->chainTable;
+ U32 const btLog = cParams->chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+
+ const BYTE* const base = ms->window.base;
+ U32 const target = (U32)(ip - base);
+ U32 idx = ms->nextToUpdate;
+
+ if (idx != target)
+ DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
+ idx, target, ms->window.dictLimit);
+ assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */
+ (void)iend;
+
+ assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */
+ for ( ; idx < target ; idx++) {
+ size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */
+ U32 const matchIndex = hashTable[h];
+
+ U32* const nextCandidatePtr = bt + 2*(idx&btMask);
+ U32* const sortMarkPtr = nextCandidatePtr + 1;
+
+ DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
+ hashTable[h] = idx; /* Update Hash Table */
+ *nextCandidatePtr = matchIndex; /* update BT like a chain */
+ *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
+ }
+ ms->nextToUpdate = target;
+}
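+
+/* Layout note (for illustration): the DUBT stores, for each position idx, a pair of U32s at
+ * bt[2*(idx&btMask)] and bt[2*(idx&btMask)+1]. Right after the insertion above, slot 0 holds
+ * the previous hash-table candidate (a plain chain link) and slot 1 holds
+ * ZSTD_DUBT_UNSORTED_MARK; once ZSTD_insertDUBT1() sorts the position, the two slots become
+ * the indices of its smaller and larger children in the binary tree. */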
+
+
+/** ZSTD_insertDUBT1() :
+ * sort one already inserted but unsorted position
+ * assumption : current >= btLow == (current - btMask)
+ * doesn't fail */
+static void
+ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
+ U32 current, const BYTE* inputEnd,
+ U32 nbCompares, U32 btLow,
+ const ZSTD_dictMode_e dictMode)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const bt = ms->chainTable;
+ U32 const btLog = cParams->chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+ size_t commonLengthSmaller=0, commonLengthLarger=0;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current;
+ const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ const BYTE* match;
+ U32* smallerPtr = bt + 2*(current&btMask);
+ U32* largerPtr = smallerPtr + 1;
+ U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
+ U32 dummy32; /* to be nullified at the end */
+ U32 const windowValid = ms->window.lowLimit;
+ U32 const maxDistance = 1U << cParams->windowLog;
+ U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid;
+
+
+ DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
+ current, dictLimit, windowLow);
+ assert(current >= btLow);
+ assert(ip < iend); /* condition for ZSTD_count */
+
+ while (nbCompares-- && (matchIndex > windowLow)) {
+ U32* const nextPtr = bt + 2*(matchIndex & btMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ assert(matchIndex < current);
+ /* note : all candidates are now presumed sorted,
+ * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
+ * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
+
+ if ( (dictMode != ZSTD_extDict)
+ || (matchIndex+matchLength >= dictLimit) /* both in current segment*/
+ || (current < dictLimit) /* both in extDict */) {
+ const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
+ || (matchIndex+matchLength >= dictLimit)) ?
+ base : dictBase;
+ assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
+ || (current < dictLimit) );
+ match = mBase + matchIndex;
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+ } else {
+ match = dictBase + matchIndex;
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+ if (matchIndex+matchLength >= dictLimit)
+ match = base + matchIndex; /* preparation for next read of match[matchLength] */
+ }
+
+ DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
+ current, matchIndex, (U32)matchLength);
+
+ if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
+ break; /* drop, to guarantee consistency; misses a bit of compression, but other solutions can corrupt the tree */
+ }
+
+ if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */
+ /* match is smaller than current */
+ *smallerPtr = matchIndex; /* update smaller idx */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */
+ DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
+ matchIndex, btLow, nextPtr[1]);
+ smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */
+ matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */
+ } else {
+ /* match is larger than current */
+ *largerPtr = matchIndex;
+ commonLengthLarger = matchLength;
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */
+ DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
+ matchIndex, btLow, nextPtr[0]);
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ } }
+
+ *smallerPtr = *largerPtr = 0;
+}
+
+
+static size_t
+ZSTD_DUBT_findBetterDictMatch (
+ ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iend,
+ size_t* offsetPtr,
+ size_t bestLength,
+ U32 nbCompares,
+ U32 const mls,
+ const ZSTD_dictMode_e dictMode)
+{
+ const ZSTD_matchState_t * const dms = ms->dictMatchState;
+ const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
+ const U32 * const dictHashTable = dms->hashTable;
+ U32 const hashLog = dmsCParams->hashLog;
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+ U32 dictMatchIndex = dictHashTable[h];
+
+ const BYTE* const base = ms->window.base;
+ const BYTE* const prefixStart = base + ms->window.dictLimit;
+ U32 const current = (U32)(ip-base);
+ const BYTE* const dictBase = dms->window.base;
+ const BYTE* const dictEnd = dms->window.nextSrc;
+ U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
+ U32 const dictLowLimit = dms->window.lowLimit;
+ U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit;
+
+ U32* const dictBt = dms->chainTable;
+ U32 const btLog = dmsCParams->chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+ U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
+
+ size_t commonLengthSmaller=0, commonLengthLarger=0;
+
+ (void)dictMode;
+ assert(dictMode == ZSTD_dictMatchState);
+
+ while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
+ U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ const BYTE* match = dictBase + dictMatchIndex;
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+ if (dictMatchIndex+matchLength >= dictHighLimit)
+ match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */
+
+ if (matchLength > bestLength) {
+ U32 matchIndex = dictMatchIndex + dictIndexDelta;
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
+ DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
+ current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex);
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+ }
+ if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
+ break; /* drop, to guarantee consistency (miss a little bit of compression) */
+ }
+ }
+
+ if (match[matchLength] < ip[matchLength]) {
+ if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
+ } else {
+ /* match is larger than current */
+ if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */
+ commonLengthLarger = matchLength;
+ dictMatchIndex = nextPtr[0];
+ }
+ }
+
+ if (bestLength >= MINMATCH) {
+ U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+ current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ }
+ return bestLength;
+
+}
+
+
+static size_t
+ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iend,
+ size_t* offsetPtr,
+ U32 const mls,
+ const ZSTD_dictMode_e dictMode)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hashLog = cParams->hashLog;
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+ U32 matchIndex = hashTable[h];
+
+ const BYTE* const base = ms->window.base;
+ U32 const current = (U32)(ip-base);
+ U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog);
+
+ U32* const bt = ms->chainTable;
+ U32 const btLog = cParams->chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+ U32 const btLow = (btMask >= current) ? 0 : current - btMask;
+ U32 const unsortLimit = MAX(btLow, windowLow);
+
+ U32* nextCandidate = bt + 2*(matchIndex&btMask);
+ U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1;
+ U32 nbCompares = 1U << cParams->searchLog;
+ U32 nbCandidates = nbCompares;
+ U32 previousCandidate = 0;
+
+ DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current);
+ assert(ip <= iend-8); /* required for h calculation */
+
+ /* reach end of unsorted candidates list */
+ while ( (matchIndex > unsortLimit)
+ && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
+ && (nbCandidates > 1) ) {
+ DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
+ matchIndex);
+ *unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to walk back up to the original position */
+ previousCandidate = matchIndex;
+ matchIndex = *nextCandidate;
+ nextCandidate = bt + 2*(matchIndex&btMask);
+ unsortedMark = bt + 2*(matchIndex&btMask) + 1;
+ nbCandidates --;
+ }
+
+ /* nullify last candidate if it's still unsorted
+ * simplification, detrimental to compression ratio, beneficial for speed */
+ if ( (matchIndex > unsortLimit)
+ && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
+ DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
+ matchIndex);
+ *nextCandidate = *unsortedMark = 0;
+ }
+
+ /* batch sort stacked candidates */
+ matchIndex = previousCandidate;
+ while (matchIndex) { /* will end on matchIndex == 0 */
+ U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
+ U32 const nextCandidateIdx = *nextCandidateIdxPtr;
+ ZSTD_insertDUBT1(ms, matchIndex, iend,
+ nbCandidates, unsortLimit, dictMode);
+ matchIndex = nextCandidateIdx;
+ nbCandidates++;
+ }
+
+ /* find longest match */
+ { size_t commonLengthSmaller = 0, commonLengthLarger = 0;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ U32* smallerPtr = bt + 2*(current&btMask);
+ U32* largerPtr = bt + 2*(current&btMask) + 1;
+ U32 matchEndIdx = current + 8 + 1;
+ U32 dummy32; /* to be nullified at the end */
+ size_t bestLength = 0;
+
+ matchIndex = hashTable[h];
+ hashTable[h] = current; /* Update Hash Table */
+
+ while (nbCompares-- && (matchIndex > windowLow)) {
+ U32* const nextPtr = bt + 2*(matchIndex & btMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ const BYTE* match;
+
+ if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
+ match = base + matchIndex;
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+ } else {
+ match = dictBase + matchIndex;
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+ if (matchIndex+matchLength >= dictLimit)
+ match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
+ }
+
+ if (matchLength > bestLength) {
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+ if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
+ if (dictMode == ZSTD_dictMatchState) {
+ nbCompares = 0; /* in addition to avoiding checking any
+ * further in this loop, make sure we
+ * skip checking in the dictionary. */
+ }
+ break; /* drop, to guarantee consistency (miss a little bit of compression) */
+ }
+ }
+
+ if (match[matchLength] < ip[matchLength]) {
+ /* match is smaller than current */
+ *smallerPtr = matchIndex; /* update smaller idx */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ smallerPtr = nextPtr+1; /* new candidate => larger than match, which was smaller than current */
+ matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
+ } else {
+ /* match is larger than current */
+ *largerPtr = matchIndex;
+ commonLengthLarger = matchLength;
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ } }
+
+ *smallerPtr = *largerPtr = 0;
+
+ if (dictMode == ZSTD_dictMatchState && nbCompares) {
+ bestLength = ZSTD_DUBT_findBetterDictMatch(
+ ms, ip, iend,
+ offsetPtr, bestLength, nbCompares,
+ mls, dictMode);
+ }
+
+ assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
+ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
+ if (bestLength >= MINMATCH) {
+ U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+ current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ }
+ return bestLength;
+ }
+}
+
+
+/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iLimit,
+ size_t* offsetPtr,
+ const U32 mls /* template */,
+ const ZSTD_dictMode_e dictMode)
+{
+ DEBUGLOG(7, "ZSTD_BtFindBestMatch");
+ if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
+ ZSTD_updateDUBT(ms, ip, iLimit, mls);
+ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
+}
+
+
+static size_t
+ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
+ case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
+ case 7 :
+ case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
+ }
+}
+
+
+static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
+ case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
+ case 7 :
+ case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+ }
+}
+
+
+static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
+ case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
+ case 7 :
+ case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
+ }
+}
+
+
+
+/* *********************************
+* Hash Chain
+***********************************/
+#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
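+/* For illustration: with chainLog==16 (mask 0xFFFF), position 0x12345 maps to chainTable[0x2345];
+ * the chain table acts as a circular buffer over the most recent (1<<chainLog) positions. */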
+
+/* Update chains up to ip (excluded)
+ Assumption : always within prefix (i.e. not within extDict) */
+static U32 ZSTD_insertAndFindFirstIndex_internal(
+ ZSTD_matchState_t* ms,
+ const ZSTD_compressionParameters* const cParams,
+ const BYTE* ip, U32 const mls)
+{
+ U32* const hashTable = ms->hashTable;
+ const U32 hashLog = cParams->hashLog;
+ U32* const chainTable = ms->chainTable;
+ const U32 chainMask = (1 << cParams->chainLog) - 1;
+ const BYTE* const base = ms->window.base;
+ const U32 target = (U32)(ip - base);
+ U32 idx = ms->nextToUpdate;
+
+ while(idx < target) { /* catch up */
+ size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
+ NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
+ hashTable[h] = idx;
+ idx++;
+ }
+
+ ms->nextToUpdate = target;
+ return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+}
+
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+}
+
+
+/* inlining is important to hardwire a hot branch (template emulation) */
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_HcFindBestMatch_generic (
+ ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iLimit,
+ size_t* offsetPtr,
+ const U32 mls, const ZSTD_dictMode_e dictMode)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const chainTable = ms->chainTable;
+ const U32 chainSize = (1 << cParams->chainLog);
+ const U32 chainMask = chainSize-1;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const U32 current = (U32)(ip-base);
+ const U32 maxDistance = 1U << cParams->windowLog;
+ const U32 lowestValid = ms->window.lowLimit;
+ const U32 withinMaxDistance = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
+ const U32 minChain = current > chainSize ? current - chainSize : 0;
+ U32 nbAttempts = 1U << cParams->searchLog;
+ size_t ml=4-1;
+
+ /* HC4 match finder */
+ U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+
+ for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
+ size_t currentMl=0;
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ const BYTE* const match = base + matchIndex;
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
+ if (match[ml] == ip[ml]) /* potentially better */
+ currentMl = ZSTD_count(ip, match, iLimit);
+ } else {
+ const BYTE* const match = dictBase + matchIndex;
+ assert(match+4 <= dictEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+ }
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = current - matchIndex + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+
+ if (matchIndex <= minChain) break;
+ matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
+ }
+
+ if (dictMode == ZSTD_dictMatchState) {
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const U32* const dmsChainTable = dms->chainTable;
+ const U32 dmsChainSize = (1 << dms->cParams.chainLog);
+ const U32 dmsChainMask = dmsChainSize - 1;
+ const U32 dmsLowestIndex = dms->window.dictLimit;
+ const BYTE* const dmsBase = dms->window.base;
+ const BYTE* const dmsEnd = dms->window.nextSrc;
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
+ const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;
+
+ matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
+
+ for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
+ size_t currentMl=0;
+ const BYTE* const match = dmsBase + matchIndex;
+ assert(match+4 <= dmsEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+
+ if (matchIndex <= dmsMinChain) break;
+ matchIndex = dmsChainTable[matchIndex & dmsChainMask];
+ }
+ }
+
+ return ml;
+}
+
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
+ case 7 :
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
+ }
+}
+
+
+static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
+ case 7 :
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+ }
+}
+
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
+ case 7 :
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
+ }
+}
+
+
+/* *******************************
+* Common parser - lazy strategy
+*********************************/
+typedef enum { search_hashChain, search_binaryTree } searchMethod_e;
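+
+/* Note: the depth parameter below selects the parser aggressiveness: depth 0 is used by the
+ * greedy block compressors, 1 by lazy, and 2 by lazy2/btlazy2 (see the ZSTD_compressBlock_*
+ * wrappers further down in this file). */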
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_compressBlock_lazy_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore,
+ U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize,
+ const searchMethod_e searchMethod, const U32 depth,
+ ZSTD_dictMode_e const dictMode)
+{
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ const BYTE* const base = ms->window.base;
+ const U32 prefixLowestIndex = ms->window.dictLimit;
+ const BYTE* const prefixLowest = base + prefixLowestIndex;
+
+ typedef size_t (*searchMax_f)(
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
+ searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
+ (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS
+ : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
+ (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_selectMLS
+ : ZSTD_HcFindBestMatch_selectMLS);
+ U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const U32 dictLowestIndex = dictMode == ZSTD_dictMatchState ?
+ dms->window.dictLimit : 0;
+ const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ?
+ dms->window.base : NULL;
+ const BYTE* const dictLowest = dictMode == ZSTD_dictMatchState ?
+ dictBase + dictLowestIndex : NULL;
+ const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ?
+ dms->window.nextSrc : NULL;
+ const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ?
+ prefixLowestIndex - (U32)(dictEnd - dictBase) :
+ 0;
+ const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
+
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode);
+
+ /* init */
+ ip += (dictAndPrefixLength == 0);
+ if (dictMode == ZSTD_noDict) {
+ U32 const current = (U32)(ip - base);
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, ms->cParams.windowLog);
+ U32 const maxRep = current - windowLow;
+ if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+ }
+ if (dictMode == ZSTD_dictMatchState) {
+ /* dictMatchState repCode checks don't currently handle repCode == 0
+ * disabling. */
+ assert(offset_1 <= dictAndPrefixLength);
+ assert(offset_2 <= dictAndPrefixLength);
+ }
+
+ /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+ /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+ * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
+ */
+ __asm__(".p2align 5");
+#endif
+ while (ip < ilimit) {
+ size_t matchLength=0;
+ size_t offset=0;
+ const BYTE* start=ip+1;
+
+ /* check repCode */
+ if (dictMode == ZSTD_dictMatchState) {
+ const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
+ const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+ && repIndex < prefixLowestIndex) ?
+ dictBase + (repIndex - dictIndexDelta) :
+ base + repIndex;
+ if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+ matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+ if (depth==0) goto _storeSequence;
+ }
+ }
+ if ( dictMode == ZSTD_noDict
+ && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
+ matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+ if (depth==0) goto _storeSequence;
+ }
+
+ /* first search (depth 0) */
+ { size_t offsetFound = 999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+ if (ml2 > matchLength)
+ matchLength = ml2, start = ip, offset=offsetFound;
+ }
+
+ if (matchLength < 4) {
+ ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
+ continue;
+ }
+
+ /* let's try to find a better solution */
+ if (depth>=1)
+ while (ip<ilimit) {
+ ip ++;
+ if ( (dictMode == ZSTD_noDict)
+ && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+ int const gain2 = (int)(mlRep * 3);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+ matchLength = mlRep, offset = 0, start = ip;
+ }
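+ /* For illustration (hypothetical values) of the gain1/gain2 comparison used here:
+ * matchLength==6 with offset code 1000 gives gain1 == 6*3 - ZSTD_highbit32(1001) + 1 == 10,
+ * while a 5-byte repeat match gives gain2 == 15, so the cheaper-to-encode repeat wins
+ * despite being shorter. */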
+ if (dictMode == ZSTD_dictMatchState) {
+ const U32 repIndex = (U32)(ip - base) - offset_1;
+ const BYTE* repMatch = repIndex < prefixLowestIndex ?
+ dictBase + (repIndex - dictIndexDelta) :
+ base + repIndex;
+ if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+ && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+ size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+ int const gain2 = (int)(mlRep * 3);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+ matchLength = mlRep, offset = 0, start = ip;
+ }
+ }
+ { size_t offset2=999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue; /* search a better one */
+ } }
+
+ /* let's find an even better one */
+ if ((depth==2) && (ip<ilimit)) {
+ ip ++;
+ if ( (dictMode == ZSTD_noDict)
+ && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+ int const gain2 = (int)(mlRep * 4);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+ matchLength = mlRep, offset = 0, start = ip;
+ }
+ if (dictMode == ZSTD_dictMatchState) {
+ const U32 repIndex = (U32)(ip - base) - offset_1;
+ const BYTE* repMatch = repIndex < prefixLowestIndex ?
+ dictBase + (repIndex - dictIndexDelta) :
+ base + repIndex;
+ if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+ && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+ size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+ int const gain2 = (int)(mlRep * 4);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+ matchLength = mlRep, offset = 0, start = ip;
+ }
+ }
+ { size_t offset2=999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue;
+ } } }
+ break; /* nothing found : store previous solution */
+ }
+
+ /* NOTE:
+ * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
+ * (-offset+ZSTD_REP_MOVE-1) is unsigned; adding it to start overflows
+ * the pointer, which is undefined behavior.
+ */
+ /* catch up */
+ if (offset) {
+ if (dictMode == ZSTD_noDict) {
+ while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest))
+ && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
+ { start--; matchLength++; }
+ }
+ if (dictMode == ZSTD_dictMatchState) {
+ U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+ const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
+ const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
+ while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
+ }
+ offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ }
+ /* store sequence */
+_storeSequence:
+ { size_t const litLength = start - anchor;
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
+ anchor = ip = start + matchLength;
+ }
+
+ /* check immediate repcode */
+ if (dictMode == ZSTD_dictMatchState) {
+ while (ip <= ilimit) {
+ U32 const current2 = (U32)(ip-base);
+ U32 const repIndex = current2 - offset_2;
+ const BYTE* repMatch = dictMode == ZSTD_dictMatchState
+ && repIndex < prefixLowestIndex ?
+ dictBase - dictIndexDelta + repIndex :
+ base + repIndex;
+ if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
+ && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
+ matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
+ offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ ip += matchLength;
+ anchor = ip;
+ continue;
+ }
+ break;
+ }
+ }
+
+ if (dictMode == ZSTD_noDict) {
+ while ( ((ip <= ilimit) & (offset_2>0))
+ && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
+ /* store sequence */
+ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+ offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ ip += matchLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ } } }
+
+ /* Save reps for next block */
+ rep[0] = offset_1 ? offset_1 : savedOffset;
+ rep[1] = offset_2 ? offset_2 : savedOffset;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_btlazy2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_greedy(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_greedy_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
+}
+
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_lazy_extDict_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore,
+ U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize,
+ const searchMethod_e searchMethod, const U32 depth)
+{
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ const BYTE* const base = ms->window.base;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const BYTE* const dictStart = dictBase + ms->window.lowLimit;
+ const U32 windowLog = ms->cParams.windowLog;
+
+ typedef size_t (*searchMax_f)(
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
+ searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
+
+ U32 offset_1 = rep[0], offset_2 = rep[1];
+
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic");
+
+ /* init */
+ ip += (ip == prefixStart);
+
+ /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+ /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+ * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
+ */
+ __asm__(".p2align 5");
+#endif
+ while (ip < ilimit) {
+ size_t matchLength=0;
+ size_t offset=0;
+ const BYTE* start=ip+1;
+ U32 current = (U32)(ip-base);
+
+ /* check repCode */
+ { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current+1, windowLog);
+ const U32 repIndex = (U32)(current+1 - offset_1);
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
+ /* repcode detected, we should take it */
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+ if (depth==0) goto _storeSequence;
+ } }
+
+ /* first search (depth 0) */
+ { size_t offsetFound = 999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+ if (ml2 > matchLength)
+ matchLength = ml2, start = ip, offset=offsetFound;
+ }
+
+ if (matchLength < 4) {
+ ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
+ continue;
+ }
+
+ /* let's try to find a better solution */
+ if (depth>=1)
+ while (ip<ilimit) {
+ ip ++;
+ current++;
+ /* check repCode */
+ if (offset) {
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current, windowLog);
+ const U32 repIndex = (U32)(current - offset_1);
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if (MEM_read32(ip) == MEM_read32(repMatch)) {
+ /* repcode detected */
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+ int const gain2 = (int)(repLength * 3);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((repLength >= 4) && (gain2 > gain1))
+ matchLength = repLength, offset = 0, start = ip;
+ } }
+
+ /* search match, depth 1 */
+ { size_t offset2=999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue; /* search a better one */
+ } }
+
+ /* let's find an even better one */
+ if ((depth==2) && (ip<ilimit)) {
+ ip ++;
+ current++;
+ /* check repCode */
+ if (offset) {
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current, windowLog);
+ const U32 repIndex = (U32)(current - offset_1);
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if (MEM_read32(ip) == MEM_read32(repMatch)) {
+ /* repcode detected */
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+ int const gain2 = (int)(repLength * 4);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((repLength >= 4) && (gain2 > gain1))
+ matchLength = repLength, offset = 0, start = ip;
+ } }
+
+ /* search match, depth 2 */
+ { size_t offset2=999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue;
+ } } }
+ break; /* nothing found : store previous solution */
+ }
+
+ /* catch up */
+ if (offset) {
+ U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+ const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
+ const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
+ while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
+ offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ }
+
+ /* store sequence */
+_storeSequence:
+ { size_t const litLength = start - anchor;
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
+ anchor = ip = start + matchLength;
+ }
+
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+ const U32 repCurrent = (U32)(ip-base);
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
+ const U32 repIndex = repCurrent - offset_2;
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if (MEM_read32(ip) == MEM_read32(repMatch)) {
+ /* repcode detected, we should take it */
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+ offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ ip += matchLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ }
+ break;
+ } }
+
+ /* Save reps for next block */
+ rep[0] = offset_1;
+ rep[1] = offset_2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_greedy_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
+}
+
+size_t ZSTD_compressBlock_lazy_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+
+{
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
+}
+
+size_t ZSTD_compressBlock_lazy2_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+
+{
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
+}
+
+size_t ZSTD_compressBlock_btlazy2_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+
+{
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
+}
+/**** ended inlining compress/zstd_lazy.c ****/
+/**** start inlining compress/zstd_ldm.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**** skipping file: zstd_ldm.h ****/
+
+/**** skipping file: ../common/debug.h ****/
+/**** skipping file: zstd_fast.h ****/
+/**** skipping file: zstd_double_fast.h ****/
+
+#define LDM_BUCKET_SIZE_LOG 3
+#define LDM_MIN_MATCH_LENGTH 64
+#define LDM_HASH_RLOG 7
+#define LDM_HASH_CHAR_OFFSET 10
+
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+ ZSTD_compressionParameters const* cParams)
+{
+ params->windowLog = cParams->windowLog;
+ ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
+ DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
+ if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
+ if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH;
+ if (cParams->strategy >= ZSTD_btopt) {
+ /* Get out of the way of the optimal parser */
+ U32 const minMatch = MAX(cParams->targetLength, params->minMatchLength);
+ assert(minMatch >= ZSTD_LDM_MINMATCH_MIN);
+ assert(minMatch <= ZSTD_LDM_MINMATCH_MAX);
+ params->minMatchLength = minMatch;
+ }
+ if (params->hashLog == 0) {
+ params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG);
+ assert(params->hashLog <= ZSTD_HASHLOG_MAX);
+ }
+ if (params->hashRateLog == 0) {
+ params->hashRateLog = params->windowLog < params->hashLog
+ ? 0
+ : params->windowLog - params->hashLog;
+ }
+ params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog);
+}
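+
+/* Worked example (hypothetical inputs): with windowLog==27 and all ldmParams fields left at 0,
+ * the defaults above give bucketSizeLog==3, minMatchLength==64, hashLog = 27 - 7 = 20 and
+ * hashRateLog = 27 - 20 = 7 (assuming ZSTD_HASHLOG_MIN <= 20). */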
+
+size_t ZSTD_ldm_getTableSize(ldmParams_t params)
+{
+ size_t const ldmHSize = ((size_t)1) << params.hashLog;
+ size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog);
+ size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog);
+ size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize)
+ + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t));
+ return params.enableLdm ? totalSize : 0;
+}
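+
+/* For illustration: with hashLog==20 and bucketSizeLog==3, ldmHSize == (1<<20) entries and
+ * ldmBucketSize == (1<<17), before the ZSTD_cwksp_alloc_size() rounding applied above. */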
+
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize)
+{
+ return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0;
+}
+
+/** ZSTD_ldm_getSmallHash() :
+ * numBits should be <= 32
+ * If numBits==0, returns 0.
+ * @return : the most significant numBits of value. */
+static U32 ZSTD_ldm_getSmallHash(U64 value, U32 numBits)
+{
+ assert(numBits <= 32);
+ return numBits == 0 ? 0 : (U32)(value >> (64 - numBits));
+}
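+
+/* For illustration: ZSTD_ldm_getSmallHash(0xF123456789ABCDEFULL, 8) == 0xF1,
+ * i.e. the top 8 bits of the 64-bit rolling hash. */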
+
+/** ZSTD_ldm_getChecksum() :
+ * numBitsToDiscard should be <= 32
+ * @return : the next most significant 32 bits after numBitsToDiscard */
+static U32 ZSTD_ldm_getChecksum(U64 hash, U32 numBitsToDiscard)
+{
+ assert(numBitsToDiscard <= 32);
+ return (hash >> (64 - 32 - numBitsToDiscard)) & 0xFFFFFFFF;
+}
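+
+/* For illustration: ZSTD_ldm_getChecksum(0xF123456789ABCDEFULL, 8) == 0x23456789,
+ * i.e. the 32 bits immediately below the 8 discarded top bits. */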
+
+/** ZSTD_ldm_getTag() :
+ * Given the hash, returns the most significant numTagBits bits
+ * after (32 + hbits) bits.
+ *
+ * If there are not enough bits remaining, return the last
+ * numTagBits bits. */
+static U32 ZSTD_ldm_getTag(U64 hash, U32 hbits, U32 numTagBits)
+{
+ assert(numTagBits < 32 && hbits <= 32);
+ if (32 - hbits < numTagBits) {
+ return hash & (((U32)1 << numTagBits) - 1);
+ } else {
+ return (hash >> (32 - hbits - numTagBits)) & (((U32)1 << numTagBits) - 1);
+ }
+}
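+
+/* For illustration: with hbits==8 and numTagBits==7, the tag is bits [23:17] of the 64-bit
+ * rolling hash, i.e. the bits immediately below the 32-bit checksum extracted above. */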
+
+/** ZSTD_ldm_getBucket() :
+ * Returns a pointer to the start of the bucket associated with hash. */
+static ldmEntry_t* ZSTD_ldm_getBucket(
+ ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams)
+{
+ return ldmState->hashTable + (hash << ldmParams.bucketSizeLog);
+}
+
+/** ZSTD_ldm_insertEntry() :
+ * Insert the entry with corresponding hash into the hash table */
+static void ZSTD_ldm_insertEntry(ldmState_t* ldmState,
+ size_t const hash, const ldmEntry_t entry,
+ ldmParams_t const ldmParams)
+{
+ BYTE* const bucketOffsets = ldmState->bucketOffsets;
+ *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + bucketOffsets[hash]) = entry;
+ bucketOffsets[hash]++;
+ bucketOffsets[hash] &= ((U32)1 << ldmParams.bucketSizeLog) - 1;
+}
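+
+/* Note: bucketOffsets[hash] above cycles through 0..(1<<bucketSizeLog)-1, so each bucket is
+ * filled round-robin and, once full, the oldest entry in the bucket is overwritten first. */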
+
+/** ZSTD_ldm_makeEntryAndInsertByTag() :
+ *
+ * Gets the small hash, checksum, and tag from the rollingHash.
+ *
+ * If the tag matches (1 << ldmParams.hashRateLog)-1, then
+ * creates an ldmEntry from the offset, and inserts it into the hash table.
+ *
+ * hBits is the length of the small hash, which is the most significant hBits
+ * of rollingHash. The checksum is the next 32 most significant bits, followed
+ * by ldmParams.hashRateLog bits that make up the tag. */
+static void ZSTD_ldm_makeEntryAndInsertByTag(ldmState_t* ldmState,
+ U64 const rollingHash,
+ U32 const hBits,
+ U32 const offset,
+ ldmParams_t const ldmParams)
+{
+ U32 const tag = ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashRateLog);
+ U32 const tagMask = ((U32)1 << ldmParams.hashRateLog) - 1;
+ if (tag == tagMask) {
+ U32 const hash = ZSTD_ldm_getSmallHash(rollingHash, hBits);
+ U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);
+ ldmEntry_t entry;
+ entry.offset = offset;
+ entry.checksum = checksum;
+ ZSTD_ldm_insertEntry(ldmState, hash, entry, ldmParams);
+ }
+}
+
+/** ZSTD_ldm_countBackwardsMatch() :
+ * Returns the number of bytes that match backwards before pIn and pMatch.
+ *
+ * We count only bytes where pMatch >= pBase and pIn >= pAnchor. */
+static size_t ZSTD_ldm_countBackwardsMatch(
+ const BYTE* pIn, const BYTE* pAnchor,
+ const BYTE* pMatch, const BYTE* pBase)
+{
+ size_t matchLength = 0;
+ while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) {
+ pIn--;
+ pMatch--;
+ matchLength++;
+ }
+ return matchLength;
+}
+
+/** ZSTD_ldm_fillFastTables() :
+ *
+ * Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies.
+ * This is similar to ZSTD_loadDictionaryContent.
+ *
+ * The tables for the other strategies are filled within their
+ * block compressors. */
+static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
+ void const* end)
+{
+ const BYTE* const iend = (const BYTE*)end;
+
+ switch(ms->cParams.strategy)
+ {
+ case ZSTD_fast:
+ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast);
+ break;
+
+ case ZSTD_dfast:
+ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
+ break;
+
+ case ZSTD_greedy:
+ case ZSTD_lazy:
+ case ZSTD_lazy2:
+ case ZSTD_btlazy2:
+ case ZSTD_btopt:
+ case ZSTD_btultra:
+ case ZSTD_btultra2:
+ break;
+ default:
+ assert(0); /* not possible : not a valid strategy id */
+ }
+
+ return 0;
+}
+
+/** ZSTD_ldm_fillLdmHashTable() :
+ *
+ * Fills hashTable from (lastHashed + 1) to iend (non-inclusive).
+ * lastHash is the rolling hash that corresponds to lastHashed.
+ *
+ * Returns the rolling hash corresponding to position iend-1. */
+static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state,
+ U64 lastHash, const BYTE* lastHashed,
+ const BYTE* iend, const BYTE* base,
+ U32 hBits, ldmParams_t const ldmParams)
+{
+ U64 rollingHash = lastHash;
+ const BYTE* cur = lastHashed + 1;
+
+ while (cur < iend) {
+ rollingHash = ZSTD_rollingHash_rotate(rollingHash, cur[-1],
+ cur[ldmParams.minMatchLength-1],
+ state->hashPower);
+ ZSTD_ldm_makeEntryAndInsertByTag(state,
+ rollingHash, hBits,
+ (U32)(cur - base), ldmParams);
+ ++cur;
+ }
+ return rollingHash;
+}
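+
+/* Invariant (for illustration): on each iteration above, after the rotate the rolling hash
+ * covers the minMatchLength bytes starting at cur, matching the entry inserted for offset
+ * (cur - base). */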
+
+void ZSTD_ldm_fillHashTable(
+ ldmState_t* state, const BYTE* ip,
+ const BYTE* iend, ldmParams_t const* params)
+{
+ DEBUGLOG(5, "ZSTD_ldm_fillHashTable");
+ if ((size_t)(iend - ip) >= params->minMatchLength) {
+ U64 startingHash = ZSTD_rollingHash_compute(ip, params->minMatchLength);
+ ZSTD_ldm_fillLdmHashTable(
+ state, startingHash, ip, iend - params->minMatchLength, state->window.base,
+ params->hashLog - params->bucketSizeLog,
+ *params);
+ }
+}
+
+
+/** ZSTD_ldm_limitTableUpdate() :
+ *
+ * Sets ms->nextToUpdate to a position closer to anchor
+ * if it is far away
+ * (after a long match, only update tables a limited amount). */
+static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
+{
+ U32 const current = (U32)(anchor - ms->window.base);
+ if (current > ms->nextToUpdate + 1024) {
+ ms->nextToUpdate =
+ current - MIN(512, current - ms->nextToUpdate - 1024);
+ }
+}
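+
+/* Worked example (hypothetical values): with ms->nextToUpdate==0 and anchor at index 10000,
+ * the gap (10000 - 0 - 1024) exceeds 512, so nextToUpdate becomes 10000 - 512 = 9488,
+ * i.e. at most 512 positions behind anchor are left for the next table update. */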
+
+static size_t ZSTD_ldm_generateSequences_internal(
+ ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
+ ldmParams_t const* params, void const* src, size_t srcSize)
+{
+ /* LDM parameters */
+ int const extDict = ZSTD_window_hasExtDict(ldmState->window);
+ U32 const minMatchLength = params->minMatchLength;
+ U64 const hashPower = ldmState->hashPower;
+ U32 const hBits = params->hashLog - params->bucketSizeLog;
+ U32 const ldmBucketSize = 1U << params->bucketSizeLog;
+ U32 const hashRateLog = params->hashRateLog;
+ U32 const ldmTagMask = (1U << params->hashRateLog) - 1;
+ /* Prefix and extDict parameters */
+ U32 const dictLimit = ldmState->window.dictLimit;
+ U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit;
+ BYTE const* const base = ldmState->window.base;
+ BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL;
+ BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL;
+ BYTE const* const dictEnd = extDict ? dictBase + dictLimit : NULL;
+ BYTE const* const lowPrefixPtr = base + dictLimit;
+ /* Input bounds */
+ BYTE const* const istart = (BYTE const*)src;
+ BYTE const* const iend = istart + srcSize;
+ BYTE const* const ilimit = iend - MAX(minMatchLength, HASH_READ_SIZE);
+ /* Input positions */
+ BYTE const* anchor = istart;
+ BYTE const* ip = istart;
+ /* Rolling hash */
+ BYTE const* lastHashed = NULL;
+ U64 rollingHash = 0;
+
+ while (ip <= ilimit) {
+ size_t mLength;
+ U32 const current = (U32)(ip - base);
+ size_t forwardMatchLength = 0, backwardMatchLength = 0;
+ ldmEntry_t* bestEntry = NULL;
+ if (ip != istart) {
+ rollingHash = ZSTD_rollingHash_rotate(rollingHash, lastHashed[0],
+ lastHashed[minMatchLength],
+ hashPower);
+ } else {
+ rollingHash = ZSTD_rollingHash_compute(ip, minMatchLength);
+ }
+ lastHashed = ip;
+
+ /* Do not insert and do not look for a match */
+ if (ZSTD_ldm_getTag(rollingHash, hBits, hashRateLog) != ldmTagMask) {
+ ip++;
+ continue;
+ }
+
+ /* Get the best entry and compute the match lengths */
+ {
+ ldmEntry_t* const bucket =
+ ZSTD_ldm_getBucket(ldmState,
+ ZSTD_ldm_getSmallHash(rollingHash, hBits),
+ *params);
+ ldmEntry_t* cur;
+ size_t bestMatchLength = 0;
+ U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);
+
+ for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) {
+ size_t curForwardMatchLength, curBackwardMatchLength,
+ curTotalMatchLength;
+ if (cur->checksum != checksum || cur->offset <= lowestIndex) {
+ continue;
+ }
+ if (extDict) {
+ BYTE const* const curMatchBase =
+ cur->offset < dictLimit ? dictBase : base;
+ BYTE const* const pMatch = curMatchBase + cur->offset;
+ BYTE const* const matchEnd =
+ cur->offset < dictLimit ? dictEnd : iend;
+ BYTE const* const lowMatchPtr =
+ cur->offset < dictLimit ? dictStart : lowPrefixPtr;
+
+ curForwardMatchLength = ZSTD_count_2segments(
+ ip, pMatch, iend,
+ matchEnd, lowPrefixPtr);
+ if (curForwardMatchLength < minMatchLength) {
+ continue;
+ }
+ curBackwardMatchLength =
+ ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
+ lowMatchPtr);
+ curTotalMatchLength = curForwardMatchLength +
+ curBackwardMatchLength;
+ } else { /* !extDict */
+ BYTE const* const pMatch = base + cur->offset;
+ curForwardMatchLength = ZSTD_count(ip, pMatch, iend);
+ if (curForwardMatchLength < minMatchLength) {
+ continue;
+ }
+ curBackwardMatchLength =
+ ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
+ lowPrefixPtr);
+ curTotalMatchLength = curForwardMatchLength +
+ curBackwardMatchLength;
+ }
+
+ if (curTotalMatchLength > bestMatchLength) {
+ bestMatchLength = curTotalMatchLength;
+ forwardMatchLength = curForwardMatchLength;
+ backwardMatchLength = curBackwardMatchLength;
+ bestEntry = cur;
+ }
+ }
+ }
+
+ /* No match found -- continue searching */
+ if (bestEntry == NULL) {
+ ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash,
+ hBits, current,
+ *params);
+ ip++;
+ continue;
+ }
+
+ /* Match found */
+ mLength = forwardMatchLength + backwardMatchLength;
+ ip -= backwardMatchLength;
+
+ {
+ /* Store the sequence:
+ * ip = current - backwardMatchLength
+ * The match is at (bestEntry->offset - backwardMatchLength)
+ */
+ U32 const matchIndex = bestEntry->offset;
+ U32 const offset = current - matchIndex;
+ rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;
+
+ /* Out of space in the sequence store */
+ if (rawSeqStore->size == rawSeqStore->capacity)
+ return ERROR(dstSize_tooSmall);
+ seq->litLength = (U32)(ip - anchor);
+ seq->matchLength = (U32)mLength;
+ seq->offset = offset;
+ rawSeqStore->size++;
+ }
+
+ /* Insert the current entry into the hash table */
+ ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits,
+ (U32)(lastHashed - base),
+ *params);
+
+ assert(ip + backwardMatchLength == lastHashed);
+
+ /* Fill the hash table from lastHashed+1 to ip+mLength */
+ /* Heuristic: don't need to fill the entire table at end of block */
+ if (ip + mLength <= ilimit) {
+ rollingHash = ZSTD_ldm_fillLdmHashTable(
+ ldmState, rollingHash, lastHashed,
+ ip + mLength, base, hBits, *params);
+ lastHashed = ip + mLength - 1;
+ }
+ ip += mLength;
+ anchor = ip;
+ }
+ return iend - anchor;
+}
+
+/*! ZSTD_ldm_reduceTable() :
+ * reduce table indexes by `reducerValue` */
+static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size,
+ U32 const reducerValue)
+{
+ U32 u;
+ for (u = 0; u < size; u++) {
+ if (table[u].offset < reducerValue) table[u].offset = 0;
+ else table[u].offset -= reducerValue;
+ }
+}
+
+size_t ZSTD_ldm_generateSequences(
+ ldmState_t* ldmState, rawSeqStore_t* sequences,
+ ldmParams_t const* params, void const* src, size_t srcSize)
+{
+ U32 const maxDist = 1U << params->windowLog;
+ BYTE const* const istart = (BYTE const*)src;
+ BYTE const* const iend = istart + srcSize;
+ size_t const kMaxChunkSize = 1 << 20;
+ size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0);
+ size_t chunk;
+ size_t leftoverSize = 0;
+
+ assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize);
+ /* Check that ZSTD_window_update() has been called for this chunk prior
+ * to passing it to this function.
+ */
+ assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize);
+ /* The input could be very large (in zstdmt), so it must be broken up into
+ * chunks to enforce the maximum distance and handle overflow correction.
+ */
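+ /* e.g. (illustrative) : srcSize == 2.5 MiB gives nbChunks == 3 :
+ * two full 1 MiB (kMaxChunkSize) chunks plus a 0.5 MiB tail. */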
+ assert(sequences->pos <= sequences->size);
+ assert(sequences->size <= sequences->capacity);
+ for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) {
+ BYTE const* const chunkStart = istart + chunk * kMaxChunkSize;
+ size_t const remaining = (size_t)(iend - chunkStart);
+ BYTE const *const chunkEnd =
+ (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize;
+ size_t const chunkSize = chunkEnd - chunkStart;
+ size_t newLeftoverSize;
+ size_t const prevSize = sequences->size;
+
+ assert(chunkStart < iend);
+ /* 1. Perform overflow correction if necessary. */
+ if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) {
+ U32 const ldmHSize = 1U << params->hashLog;
+ U32 const correction = ZSTD_window_correctOverflow(
+ &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart);
+ ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction);
+ /* invalidate dictionaries on overflow correction */
+ ldmState->loadedDictEnd = 0;
+ }
+ /* 2. We enforce the maximum offset allowed.
+ *
+ * kMaxChunkSize should be small enough that we don't lose too much of
+ * the window through early invalidation.
+ * TODO: * Test the chunk size.
+ * * Try invalidation after the sequence generation and test
+ * the offset against maxDist directly.
+ *
+ * NOTE: Because of dictionaries + sequence splitting we MUST make sure
+ * that any offset used is valid at the END of the sequence, since it may
+ * be split into two sequences. This condition holds when using
+ * ZSTD_window_enforceMaxDist(), but if we move to checking offsets
+ * against maxDist directly, we'll have to carefully handle that case.
+ */
+ ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL);
+ /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */
+ newLeftoverSize = ZSTD_ldm_generateSequences_internal(
+ ldmState, sequences, params, chunkStart, chunkSize);
+ if (ZSTD_isError(newLeftoverSize))
+ return newLeftoverSize;
+ /* 4. We add the leftover literals from previous iterations to the first
+ * newly generated sequence, or add the `newLeftoverSize` if none are
+ * generated.
+ */
+ /* Prepend the leftover literals from the last call */
+ if (prevSize < sequences->size) {
+ sequences->seq[prevSize].litLength += (U32)leftoverSize;
+ leftoverSize = newLeftoverSize;
+ } else {
+ assert(newLeftoverSize == chunkSize);
+ leftoverSize += chunkSize;
+ }
+ }
+ return 0;
+}
+
+void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) {
+ while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) {
+ rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos;
+ if (srcSize <= seq->litLength) {
+ /* Skip past srcSize literals */
+ seq->litLength -= (U32)srcSize;
+ return;
+ }
+ srcSize -= seq->litLength;
+ seq->litLength = 0;
+ if (srcSize < seq->matchLength) {
+ /* Skip past the first srcSize of the match */
+ seq->matchLength -= (U32)srcSize;
+ if (seq->matchLength < minMatch) {
+ /* The match is too short, omit it */
+ if (rawSeqStore->pos + 1 < rawSeqStore->size) {
+ seq[1].litLength += seq[0].matchLength;
+ }
+ rawSeqStore->pos++;
+ }
+ return;
+ }
+ srcSize -= seq->matchLength;
+ seq->matchLength = 0;
+ rawSeqStore->pos++;
+ }
+}
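+
+/* Worked example for ZSTD_ldm_skipSequences() (illustrative) : skipping
+ * srcSize == 8 over a stored sequence {litLength:5, matchLength:10} with
+ * minMatch == 4 first consumes the 5 literals, then trims the match by the
+ * remaining 3 bytes to matchLength == 7 ; since 7 >= minMatch the (shortened)
+ * sequence stays in the store. */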
+
+/**
+ * If the sequence length is longer than remaining then the sequence is split
+ * between this block and the next.
+ *
+ * Returns the current sequence to handle, or if the rest of the block should
+ * be literals, it returns a sequence with offset == 0.
+ */
+static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
+ U32 const remaining, U32 const minMatch)
+{
+ rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos];
+ assert(sequence.offset > 0);
+ /* Likely: No partial sequence */
+ if (remaining >= sequence.litLength + sequence.matchLength) {
+ rawSeqStore->pos++;
+ return sequence;
+ }
+ /* Cut the sequence short (offset == 0 ==> rest is literals). */
+ if (remaining <= sequence.litLength) {
+ sequence.offset = 0;
+ } else if (remaining < sequence.litLength + sequence.matchLength) {
+ sequence.matchLength = remaining - sequence.litLength;
+ if (sequence.matchLength < minMatch) {
+ sequence.offset = 0;
+ }
+ }
+ /* Skip past `remaining` bytes for the future sequences. */
+ ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch);
+ return sequence;
+}
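+
+/* Worked example for maybeSplitSequence() (illustrative) : with remaining == 12,
+ * a stored sequence {litLength:5, matchLength:10} and minMatch == 4, the
+ * returned sequence is truncated to matchLength == 7 (12 - 5) for this block ;
+ * the skip call then leaves {litLength:0, matchLength:3} behind, and since
+ * 3 < minMatch those 3 bytes are folded into the next sequence's literals
+ * (when one exists). */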
+
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ unsigned const minMatch = cParams->minMatch;
+ ZSTD_blockCompressor const blockCompressor =
+ ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms));
+ /* Input bounds */
+ BYTE const* const istart = (BYTE const*)src;
+ BYTE const* const iend = istart + srcSize;
+ /* Input positions */
+ BYTE const* ip = istart;
+
+ DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize);
+ assert(rawSeqStore->pos <= rawSeqStore->size);
+ assert(rawSeqStore->size <= rawSeqStore->capacity);
+ /* Loop through each sequence and apply the block compressor to the lits */
+ while (rawSeqStore->pos < rawSeqStore->size && ip < iend) {
+ /* maybeSplitSequence updates rawSeqStore->pos */
+ rawSeq const sequence = maybeSplitSequence(rawSeqStore,
+ (U32)(iend - ip), minMatch);
+ int i;
+ /* End signal */
+ if (sequence.offset == 0)
+ break;
+
+ assert(ip + sequence.litLength + sequence.matchLength <= iend);
+
+ /* Fill tables for block compressor */
+ ZSTD_ldm_limitTableUpdate(ms, ip);
+ ZSTD_ldm_fillFastTables(ms, ip);
+ /* Run the block compressor */
+ DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength);
+ {
+ size_t const newLitLength =
+ blockCompressor(ms, seqStore, rep, ip, sequence.litLength);
+ ip += sequence.litLength;
+ /* Update the repcodes */
+ for (i = ZSTD_REP_NUM - 1; i > 0; i--)
+ rep[i] = rep[i-1];
+ rep[0] = sequence.offset;
+ /* Store the sequence */
+ ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
+ sequence.offset + ZSTD_REP_MOVE,
+ sequence.matchLength - MINMATCH);
+ ip += sequence.matchLength;
+ }
+ }
+ /* Fill the tables for the block compressor */
+ ZSTD_ldm_limitTableUpdate(ms, ip);
+ ZSTD_ldm_fillFastTables(ms, ip);
+ /* Compress the last literals */
+ return blockCompressor(ms, seqStore, rep, ip, iend - ip);
+}
+/**** ended inlining compress/zstd_ldm.c ****/
+/**** start inlining compress/zstd_opt.c ****/
+/*
+ * Copyright (c) 2016-2020, Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: hist.h ****/
+/**** skipping file: zstd_opt.h ****/
+
+
+#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
+#define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to init next stats */
+#define ZSTD_MAX_PRICE (1<<30)
+
+#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
+
+
+/*-*************************************
+* Price functions for optimal parser
+***************************************/
+
+#if 0 /* approximation at bit level */
+# define BITCOST_ACCURACY 0
+# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+# define WEIGHT(stat,opt) ((void)opt, ZSTD_bitWeight(stat))
+#elif 0 /* fractional bit accuracy */
+# define BITCOST_ACCURACY 8
+# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
+#else /* opt==approx, ultra==accurate */
+# define BITCOST_ACCURACY 8
+# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+#endif
+
+MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
+{
+ return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
+}
+
+MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
+{
+ U32 const stat = rawStat + 1;
+ U32 const hb = ZSTD_highbit32(stat);
+ U32 const BWeight = hb * BITCOST_MULTIPLIER;
+ U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
+ U32 const weight = BWeight + FWeight;
+ assert(hb + BITCOST_ACCURACY < 31);
+ return weight;
+}
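+
+/* Worked example for ZSTD_fracWeight() (illustrative, BITCOST_ACCURACY == 8) :
+ * rawStat == 10 gives stat == 11, hb == 3, BWeight == 3*256 == 768,
+ * FWeight == (11<<8)>>3 == 352, so the fractional weight is 1120,
+ * i.e. 1120/256 == 4.375 "bits", whereas ZSTD_bitWeight(10) would simply
+ * return 768 (3.0). */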
+
+#if (DEBUGLEVEL>=2)
+/* debugging function,
+ * @return price in bytes as fractional value
+ * for debug messages only */
+MEM_STATIC double ZSTD_fCost(U32 price)
+{
+ return (double)price / (BITCOST_MULTIPLIER*8);
+}
+#endif
+
+static int ZSTD_compressedLiterals(optState_t const* const optPtr)
+{
+ return optPtr->literalCompressionMode != ZSTD_lcm_uncompressed;
+}
+
+static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel)
+{
+ if (ZSTD_compressedLiterals(optPtr))
+ optPtr->litSumBasePrice = WEIGHT(optPtr->litSum, optLevel);
+ optPtr->litLengthSumBasePrice = WEIGHT(optPtr->litLengthSum, optLevel);
+ optPtr->matchLengthSumBasePrice = WEIGHT(optPtr->matchLengthSum, optLevel);
+ optPtr->offCodeSumBasePrice = WEIGHT(optPtr->offCodeSum, optLevel);
+}
+
+
+/* ZSTD_downscaleStat() :
+ * reduce all elements in table by a factor 2^(ZSTD_FREQ_DIV+malus)
+ * return the resulting sum of elements */
+static U32 ZSTD_downscaleStat(unsigned* table, U32 lastEltIndex, int malus)
+{
+ U32 s, sum=0;
+ DEBUGLOG(5, "ZSTD_downscaleStat (nbElts=%u)", (unsigned)lastEltIndex+1);
+ assert(ZSTD_FREQ_DIV+malus > 0 && ZSTD_FREQ_DIV+malus < 31);
+ for (s=0; s<lastEltIndex+1; s++) {
+ table[s] = 1 + (table[s] >> (ZSTD_FREQ_DIV+malus));
+ sum += table[s];
+ }
+ return sum;
+}
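+
+/* Worked example for ZSTD_downscaleStat() (illustrative) : with
+ * ZSTD_FREQ_DIV == 4 and malus == 0, a count of 100 becomes 1 + (100 >> 4) == 7
+ * and a count of 3 becomes 1, so no symbol ever drops to a frequency of zero. */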
+
+/* ZSTD_rescaleFreqs() :
+ * if first block (detected by optPtr->litLengthSum == 0) : init statistics,
+ * taking hints from the dictionary if there is one,
+ * or init from scratch, using src for literal stats and flat 1 for match symbols ;
+ * otherwise downscale existing stats, to be used as seed for the next block.
+ */
+static void
+ZSTD_rescaleFreqs(optState_t* const optPtr,
+ const BYTE* const src, size_t const srcSize,
+ int const optLevel)
+{
+ int const compressedLiterals = ZSTD_compressedLiterals(optPtr);
+ DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
+ optPtr->priceType = zop_dynamic;
+
+ if (optPtr->litLengthSum == 0) { /* first block : init */
+ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */
+ DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef");
+ optPtr->priceType = zop_predef;
+ }
+
+ assert(optPtr->symbolCosts != NULL);
+ if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) {
+ /* huffman table presumed generated by dictionary */
+ optPtr->priceType = zop_dynamic;
+
+ if (compressedLiterals) {
+ unsigned lit;
+ assert(optPtr->litFreq != NULL);
+ optPtr->litSum = 0;
+ for (lit=0; lit<=MaxLit; lit++) {
+ U32 const scaleLog = 11; /* scale to 2K */
+ U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit);
+ assert(bitCost <= scaleLog);
+ optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+ optPtr->litSum += optPtr->litFreq[lit];
+ } }
+
+ { unsigned ll;
+ FSE_CState_t llstate;
+ FSE_initCState(&llstate, optPtr->symbolCosts->fse.litlengthCTable);
+ optPtr->litLengthSum = 0;
+ for (ll=0; ll<=MaxLL; ll++) {
+ U32 const scaleLog = 10; /* scale to 1K */
+ U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll);
+ assert(bitCost < scaleLog);
+ optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+ optPtr->litLengthSum += optPtr->litLengthFreq[ll];
+ } }
+
+ { unsigned ml;
+ FSE_CState_t mlstate;
+ FSE_initCState(&mlstate, optPtr->symbolCosts->fse.matchlengthCTable);
+ optPtr->matchLengthSum = 0;
+ for (ml=0; ml<=MaxML; ml++) {
+ U32 const scaleLog = 10;
+ U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml);
+ assert(bitCost < scaleLog);
+ optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+ optPtr->matchLengthSum += optPtr->matchLengthFreq[ml];
+ } }
+
+ { unsigned of;
+ FSE_CState_t ofstate;
+ FSE_initCState(&ofstate, optPtr->symbolCosts->fse.offcodeCTable);
+ optPtr->offCodeSum = 0;
+ for (of=0; of<=MaxOff; of++) {
+ U32 const scaleLog = 10;
+ U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of);
+ assert(bitCost < scaleLog);
+ optPtr->offCodeFreq[of] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+ optPtr->offCodeSum += optPtr->offCodeFreq[of];
+ } }
+
+ } else { /* not a dictionary */
+
+ assert(optPtr->litFreq != NULL);
+ if (compressedLiterals) {
+ unsigned lit = MaxLit;
+ HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */
+ optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);
+ }
+
+ { unsigned ll;
+ for (ll=0; ll<=MaxLL; ll++)
+ optPtr->litLengthFreq[ll] = 1;
+ }
+ optPtr->litLengthSum = MaxLL+1;
+
+ { unsigned ml;
+ for (ml=0; ml<=MaxML; ml++)
+ optPtr->matchLengthFreq[ml] = 1;
+ }
+ optPtr->matchLengthSum = MaxML+1;
+
+ { unsigned of;
+ for (of=0; of<=MaxOff; of++)
+ optPtr->offCodeFreq[of] = 1;
+ }
+ optPtr->offCodeSum = MaxOff+1;
+
+ }
+
+ } else { /* new block : re-use previous statistics, scaled down */
+
+ if (compressedLiterals)
+ optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);
+ optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0);
+ optPtr->matchLengthSum = ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0);
+ optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0);
+ }
+
+ ZSTD_setBasePrices(optPtr, optLevel);
+}
+
+/* ZSTD_rawLiteralsCost() :
+ * price of literals (only) in the specified segment (whose length can be 0).
+ * does not include the price of the literalLength symbol */
+static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
+ const optState_t* const optPtr,
+ int optLevel)
+{
+ if (litLength == 0) return 0;
+
+ if (!ZSTD_compressedLiterals(optPtr))
+ return (litLength << 3) * BITCOST_MULTIPLIER; /* Uncompressed - 8 bits per literal. */
+
+ if (optPtr->priceType == zop_predef)
+ return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bits per literal - no statistics used */
+
+ /* dynamic statistics */
+ { U32 price = litLength * optPtr->litSumBasePrice;
+ U32 u;
+ for (u=0; u < litLength; u++) {
+ assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */
+ price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+ }
+ return price;
+ }
+}
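+
+/* Rough intuition for ZSTD_rawLiteralsCost() (illustrative) : in the dynamic
+ * branch each literal costs about WEIGHT(litSum) - WEIGHT(litFreq[lit])
+ * ~= log2(litSum / litFreq[lit]) ; e.g. with litSum == 2048 and
+ * litFreq[lit] == 64 that is roughly 5 "bits" (times BITCOST_MULTIPLIER). */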
+
+/* ZSTD_litLengthPrice() :
+ * cost of literalLength symbol */
+static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel)
+{
+ if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel);
+
+ /* dynamic statistics */
+ { U32 const llCode = ZSTD_LLcode(litLength);
+ return (LL_bits[llCode] * BITCOST_MULTIPLIER)
+ + optPtr->litLengthSumBasePrice
+ - WEIGHT(optPtr->litLengthFreq[llCode], optLevel);
+ }
+}
+
+/* ZSTD_getMatchPrice() :
+ * Provides the cost of the match part (offset + matchLength) of a sequence.
+ * Must be combined with the literals cost (see ZSTD_rawLiteralsCost() and
+ * ZSTD_litLengthPrice()) to get the full cost of a sequence.
+ * optLevel: when <2, favors small offsets for decompression speed (improved cache efficiency) */
+FORCE_INLINE_TEMPLATE U32
+ZSTD_getMatchPrice(U32 const offset,
+ U32 const matchLength,
+ const optState_t* const optPtr,
+ int const optLevel)
+{
+ U32 price;
+ U32 const offCode = ZSTD_highbit32(offset+1);
+ U32 const mlBase = matchLength - MINMATCH;
+ assert(matchLength >= MINMATCH);
+
+ if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */
+ return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER);
+
+ /* dynamic statistics */
+ price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
+ if ((optLevel<2) /*static*/ && offCode >= 20)
+ price += (offCode-19)*2 * BITCOST_MULTIPLIER; /* handicap for long distance offsets, favor decompression speed */
+
+ /* match Length */
+ { U32 const mlCode = ZSTD_MLcode(mlBase);
+ price += (ML_bits[mlCode] * BITCOST_MULTIPLIER) + (optPtr->matchLengthSumBasePrice - WEIGHT(optPtr->matchLengthFreq[mlCode], optLevel));
+ }
+
+ price += BITCOST_MULTIPLIER / 5; /* heuristic : make matches a bit more costly to favor fewer sequences -> faster decompression speed */
+
+ DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price);
+ return price;
+}
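+
+/* Worked example for ZSTD_getMatchPrice() (illustrative, zop_predef path,
+ * optLevel 0) : offset == 2048 gives offCode == ZSTD_highbit32(2049) == 11 ;
+ * matchLength == 18 gives mlBase == 15, so the price is
+ * WEIGHT(15) + (16+11)*BITCOST_MULTIPLIER == 4*256 + 27*256 == 7936,
+ * i.e. about 31 "bits". */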
+
+/* ZSTD_updateStats() :
+ * assumption : literals + litLength <= iend */
+static void ZSTD_updateStats(optState_t* const optPtr,
+ U32 litLength, const BYTE* literals,
+ U32 offsetCode, U32 matchLength)
+{
+ /* literals */
+ if (ZSTD_compressedLiterals(optPtr)) {
+ U32 u;
+ for (u=0; u < litLength; u++)
+ optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+ optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
+ }
+
+ /* literal Length */
+ { U32 const llCode = ZSTD_LLcode(litLength);
+ optPtr->litLengthFreq[llCode]++;
+ optPtr->litLengthSum++;
+ }
+
+ /* match offset code (0-2=>repCode; 3+=>offset+2) */
+ { U32 const offCode = ZSTD_highbit32(offsetCode+1);
+ assert(offCode <= MaxOff);
+ optPtr->offCodeFreq[offCode]++;
+ optPtr->offCodeSum++;
+ }
+
+ /* match Length */
+ { U32 const mlBase = matchLength - MINMATCH;
+ U32 const mlCode = ZSTD_MLcode(mlBase);
+ optPtr->matchLengthFreq[mlCode]++;
+ optPtr->matchLengthSum++;
+ }
+}
+
+
+/* ZSTD_readMINMATCH() :
+ * function safe only for comparisons
+ * assumption : memPtr must be at least 4 bytes before end of buffer */
+MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)
+{
+ switch (length)
+ {
+ default :
+ case 4 : return MEM_read32(memPtr);
+ case 3 : if (MEM_isLittleEndian())
+ return MEM_read32(memPtr)<<8;
+ else
+ return MEM_read32(memPtr)>>8;
+ }
+}
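+
+/* Worked example for ZSTD_readMINMATCH() (illustrative) : on a little-endian
+ * machine, bytes {0x11,0x22,0x33,0x44} read as 0x44332211 ; for length == 3
+ * the shift <<8 yields 0x33221100, which depends only on the first three
+ * bytes, so two pointers compare equal iff their 3-byte prefixes match. */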
+
+
+/* Update hashTable3 up to ip (excluded)
+ Assumption : always within prefix (i.e. not within extDict) */
+static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms,
+ U32* nextToUpdate3,
+ const BYTE* const ip)
+{
+ U32* const hashTable3 = ms->hashTable3;
+ U32 const hashLog3 = ms->hashLog3;
+ const BYTE* const base = ms->window.base;
+ U32 idx = *nextToUpdate3;
+ U32 const target = (U32)(ip - base);
+ size_t const hash3 = ZSTD_hash3Ptr(ip, hashLog3);
+ assert(hashLog3 > 0);
+
+ while(idx < target) {
+ hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx;
+ idx++;
+ }
+
+ *nextToUpdate3 = target;
+ return hashTable3[hash3];
+}
+
+
+/*-*************************************
+* Binary Tree search
+***************************************/
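+/* Note (informal sketch of the layout) : the "binary tree" lives in
+ * ms->chainTable : each position keeps two links, bt[2*(pos & btMask)]
+ * pointing to the best lexicographically-smaller candidate and
+ * bt[2*(pos & btMask) + 1] to the best larger one, so candidates are visited
+ * in sorted order while commonLengthSmaller / commonLengthLarger track prefix
+ * lengths already known to match. */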
+/** ZSTD_insertBt1() : add one or multiple positions to tree.
+ * ip : assumed <= iend-8 .
+ * @return : nb of positions added */
+static U32 ZSTD_insertBt1(
+ ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iend,
+ U32 const mls, const int extDict)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hashLog = cParams->hashLog;
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+ U32* const bt = ms->chainTable;
+ U32 const btLog = cParams->chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+ U32 matchIndex = hashTable[h];
+ size_t commonLengthSmaller=0, commonLengthLarger=0;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ const BYTE* match;
+ const U32 current = (U32)(ip-base);
+ const U32 btLow = btMask >= current ? 0 : current - btMask;
+ U32* smallerPtr = bt + 2*(current&btMask);
+ U32* largerPtr = smallerPtr + 1;
+ U32 dummy32; /* to be nullified at the end */
+ U32 const windowLow = ms->window.lowLimit;
+ U32 matchEndIdx = current+8+1;
+ size_t bestLength = 8;
+ U32 nbCompares = 1U << cParams->searchLog;
+#ifdef ZSTD_C_PREDICT
+ U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0);
+ U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1);
+ predictedSmall += (predictedSmall>0);
+ predictedLarge += (predictedLarge>0);
+#endif /* ZSTD_C_PREDICT */
+
+ DEBUGLOG(8, "ZSTD_insertBt1 (%u)", current);
+
+ assert(ip <= iend-8); /* required for h calculation */
+ hashTable[h] = current; /* Update Hash Table */
+
+ assert(windowLow > 0);
+ while (nbCompares-- && (matchIndex >= windowLow)) {
+ U32* const nextPtr = bt + 2*(matchIndex & btMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ assert(matchIndex < current);
+
+#ifdef ZSTD_C_PREDICT /* note : can create issues when hashLog is small (<= 11) */
+ const U32* predictPtr = bt + 2*((matchIndex-1) & btMask); /* written this way, as bt is a roll buffer */
+ if (matchIndex == predictedSmall) {
+ /* no need to check length, result known */
+ *smallerPtr = matchIndex;
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ smallerPtr = nextPtr+1; /* new "smaller" => larger of match */
+ matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
+ predictedSmall = predictPtr[1] + (predictPtr[1]>0);
+ continue;
+ }
+ if (matchIndex == predictedLarge) {
+ *largerPtr = matchIndex;
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ predictedLarge = predictPtr[0] + (predictPtr[0]>0);
+ continue;
+ }
+#endif
+
+ if (!extDict || (matchIndex+matchLength >= dictLimit)) {
+ assert(matchIndex+matchLength >= dictLimit); /* might be wrong if actually extDict */
+ match = base + matchIndex;
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+ } else {
+ match = dictBase + matchIndex;
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+ if (matchIndex+matchLength >= dictLimit)
+ match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
+ }
+
+ if (matchLength > bestLength) {
+ bestLength = matchLength;
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ }
+
+ if (ip+matchLength == iend) { /* equal : no way to know whether the match is smaller or larger */
+ break; /* drop, to guarantee consistency ; misses a bit of compression, but other solutions could corrupt the tree */
+ }
+
+ if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */
+ /* match is smaller than current */
+ *smallerPtr = matchIndex; /* update smaller idx */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */
+ smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */
+ matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */
+ } else {
+ /* match is larger than current */
+ *largerPtr = matchIndex;
+ commonLengthLarger = matchLength;
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ } }
+
+ *smallerPtr = *largerPtr = 0;
+ { U32 positions = 0;
+ if (bestLength > 384) positions = MIN(192, (U32)(bestLength - 384)); /* speed optimization */
+ assert(matchEndIdx > current + 8);
+ return MAX(positions, matchEndIdx - (current + 8));
+ }
+}
+
+FORCE_INLINE_TEMPLATE
+void ZSTD_updateTree_internal(
+ ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iend,
+ const U32 mls, const ZSTD_dictMode_e dictMode)
+{
+ const BYTE* const base = ms->window.base;
+ U32 const target = (U32)(ip - base);
+ U32 idx = ms->nextToUpdate;
+ DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)",
+ idx, target, dictMode);
+
+ while(idx < target) {
+ U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict);
+ assert(idx < (U32)(idx + forward));
+ idx += forward;
+ }
+ assert((size_t)(ip - base) <= (size_t)(U32)(-1));
+ assert((size_t)(iend - base) <= (size_t)(U32)(-1));
+ ms->nextToUpdate = target;
+}
+
+void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
+ ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict);
+}
+
+FORCE_INLINE_TEMPLATE
+U32 ZSTD_insertBtAndGetAllMatches (
+ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */
+ ZSTD_matchState_t* ms,
+ U32* nextToUpdate3,
+ const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode,
+ const U32 rep[ZSTD_REP_NUM],
+ U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
+ const U32 lengthToBeat,
+ U32 const mls /* template */)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+ const BYTE* const base = ms->window.base;
+ U32 const current = (U32)(ip-base);
+ U32 const hashLog = cParams->hashLog;
+ U32 const minMatch = (mls==3) ? 3 : 4;
+ U32* const hashTable = ms->hashTable;
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+ U32 matchIndex = hashTable[h];
+ U32* const bt = ms->chainTable;
+ U32 const btLog = cParams->chainLog - 1;
+ U32 const btMask= (1U << btLog) - 1;
+ size_t commonLengthSmaller=0, commonLengthLarger=0;
+ const BYTE* const dictBase = ms->window.dictBase;
+ U32 const dictLimit = ms->window.dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ U32 const btLow = (btMask >= current) ? 0 : current - btMask;
+ U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog);
+ U32 const matchLow = windowLow ? windowLow : 1;
+ U32* smallerPtr = bt + 2*(current&btMask);
+ U32* largerPtr = bt + 2*(current&btMask) + 1;
+ U32 matchEndIdx = current+8+1; /* farthest referenced position of any match => detects repetitive patterns */
+ U32 dummy32; /* to be nullified at the end */
+ U32 mnum = 0;
+ U32 nbCompares = 1U << cParams->searchLog;
+
+ const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL;
+ const ZSTD_compressionParameters* const dmsCParams =
+ dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL;
+ const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL;
+ const BYTE* const dmsEnd = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL;
+ U32 const dmsHighLimit = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0;
+ U32 const dmsLowLimit = dictMode == ZSTD_dictMatchState ? dms->window.lowLimit : 0;
+ U32 const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? windowLow - dmsHighLimit : 0;
+ U32 const dmsHashLog = dictMode == ZSTD_dictMatchState ? dmsCParams->hashLog : hashLog;
+ U32 const dmsBtLog = dictMode == ZSTD_dictMatchState ? dmsCParams->chainLog - 1 : btLog;
+ U32 const dmsBtMask = dictMode == ZSTD_dictMatchState ? (1U << dmsBtLog) - 1 : 0;
+ U32 const dmsBtLow = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit;
+
+ size_t bestLength = lengthToBeat-1;
+ DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", current);
+
+ /* check repCode */
+ assert(ll0 <= 1); /* necessarily 1 or 0 */
+ { U32 const lastR = ZSTD_REP_NUM + ll0;
+ U32 repCode;
+ for (repCode = ll0; repCode < lastR; repCode++) {
+ U32 const repOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+ U32 const repIndex = current - repOffset;
+ U32 repLen = 0;
+ assert(current >= dictLimit);
+ if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < current-dictLimit) { /* equivalent to `current > repIndex >= dictLimit` */
+ /* We must validate the repcode offset because when we're using a dictionary the
+ * valid offset range shrinks when the dictionary goes out of bounds.
+ */
+ if ((repIndex >= windowLow) & (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch))) {
+ repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch;
+ }
+ } else { /* repIndex < dictLimit || repIndex >= current */
+ const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ?
+ dmsBase + repIndex - dmsIndexDelta :
+ dictBase + repIndex;
+ assert(current >= windowLow);
+ if ( dictMode == ZSTD_extDict
+ && ( ((repOffset-1) /*intentional overflow*/ < current - windowLow) /* equivalent to `current > repIndex >= windowLow` */
+ & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */)
+ && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
+ repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch;
+ }
+ if (dictMode == ZSTD_dictMatchState
+ && ( ((repOffset-1) /*intentional overflow*/ < current - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `current > repIndex >= dmsLowLimit` */
+ & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
+ repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch;
+ } }
+ /* save longer solution */
+ if (repLen > bestLength) {
+ DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
+ repCode, ll0, repOffset, repLen);
+ bestLength = repLen;
+ matches[mnum].off = repCode - ll0;
+ matches[mnum].len = (U32)repLen;
+ mnum++;
+ if ( (repLen > sufficient_len)
+ | (ip+repLen == iLimit) ) { /* best possible */
+ return mnum;
+ } } } }
+
+ /* HC3 match finder */
+ if ((mls == 3) /*static*/ && (bestLength < mls)) {
+ U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, nextToUpdate3, ip);
+ if ((matchIndex3 >= matchLow)
+ & (current - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) {
+ size_t mlen;
+ if ((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) {
+ const BYTE* const match = base + matchIndex3;
+ mlen = ZSTD_count(ip, match, iLimit);
+ } else {
+ const BYTE* const match = dictBase + matchIndex3;
+ mlen = ZSTD_count_2segments(ip, match, iLimit, dictEnd, prefixStart);
+ }
+
+ /* save best solution */
+ if (mlen >= mls /* == 3 > bestLength */) {
+ DEBUGLOG(8, "found small match with hlog3, of length %u",
+ (U32)mlen);
+ bestLength = mlen;
+ assert(current > matchIndex3);
+ assert(mnum==0); /* no prior solution */
+ matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE;
+ matches[0].len = (U32)mlen;
+ mnum = 1;
+ if ( (mlen > sufficient_len) |
+ (ip+mlen == iLimit) ) { /* best possible length */
+ ms->nextToUpdate = current+1; /* skip insertion */
+ return 1;
+ } } }
+ /* no dictMatchState lookup: dicts don't have a populated HC3 table */
+ }
+
+ hashTable[h] = current; /* Update Hash Table */
+
+ while (nbCompares-- && (matchIndex >= matchLow)) {
+ U32* const nextPtr = bt + 2*(matchIndex & btMask);
+ const BYTE* match;
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ assert(current > matchIndex);
+
+ if ((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex+matchLength >= dictLimit)) {
+ assert(matchIndex+matchLength >= dictLimit); /* ensure the condition is correct when !extDict */
+ match = base + matchIndex;
+ if (matchIndex >= dictLimit) assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit);
+ } else {
+ match = dictBase + matchIndex;
+ assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart);
+ if (matchIndex+matchLength >= dictLimit)
+ match = base + matchIndex; /* prepare for match[matchLength] read */
+ }
+
+ if (matchLength > bestLength) {
+ DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)",
+ (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE);
+ assert(matchEndIdx > matchIndex);
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ bestLength = matchLength;
+ matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE;
+ matches[mnum].len = (U32)matchLength;
+ mnum++;
+ if ( (matchLength > ZSTD_OPT_NUM)
+ | (ip+matchLength == iLimit) /* equal : no way to know whether smaller or larger */) {
+ if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */
+ break; /* drop, to preserve bt consistency (miss a little bit of compression) */
+ }
+ }
+
+ if (match[matchLength] < ip[matchLength]) {
+ /* match smaller than current */
+ *smallerPtr = matchIndex; /* update smaller idx */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ smallerPtr = nextPtr+1; /* new candidate => larger than match, which was smaller than current */
+ matchIndex = nextPtr[1]; /* new matchIndex, larger than previous, closer to current */
+ } else {
+ *largerPtr = matchIndex;
+ commonLengthLarger = matchLength;
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ } }
+
+ *smallerPtr = *largerPtr = 0;
+
+ if (dictMode == ZSTD_dictMatchState && nbCompares) {
+ size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls);
+ U32 dictMatchIndex = dms->hashTable[dmsH];
+ const U32* const dmsBt = dms->chainTable;
+ commonLengthSmaller = commonLengthLarger = 0;
+ while (nbCompares-- && (dictMatchIndex > dmsLowLimit)) {
+ const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ const BYTE* match = dmsBase + dictMatchIndex;
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dmsEnd, prefixStart);
+ if (dictMatchIndex+matchLength >= dmsHighLimit)
+ match = base + dictMatchIndex + dmsIndexDelta; /* to prepare for next usage of match[matchLength] */
+
+ if (matchLength > bestLength) {
+ matchIndex = dictMatchIndex + dmsIndexDelta;
+ DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)",
+ (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE);
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ bestLength = matchLength;
+ matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE;
+ matches[mnum].len = (U32)matchLength;
+ mnum++;
+ if ( (matchLength > ZSTD_OPT_NUM)
+ | (ip+matchLength == iLimit) /* equal : no way to know whether smaller or larger */) {
+ break; /* drop, to guarantee consistency (miss a little bit of compression) */
+ }
+ }
+
+ if (dictMatchIndex <= dmsBtLow) { break; } /* beyond tree size, stop the search */
+ if (match[matchLength] < ip[matchLength]) {
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
+ } else {
+ /* match is larger than current */
+ commonLengthLarger = matchLength;
+ dictMatchIndex = nextPtr[0];
+ }
+ }
+ }
+
+ assert(matchEndIdx > current+8);
+ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
+ return mnum;
+}
+
+
+FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
+ ZSTD_match_t* matches, /* store result (match found, increasing size) in this table */
+ ZSTD_matchState_t* ms,
+ U32* nextToUpdate3,
+ const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode,
+ const U32 rep[ZSTD_REP_NUM],
+ U32 const ll0,
+ U32 const lengthToBeat)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32 const matchLengthSearch = cParams->minMatch;
+ DEBUGLOG(8, "ZSTD_BtGetAllMatches");
+ if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
+ ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode);
+ switch(matchLengthSearch)
+ {
+ case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3);
+ default :
+ case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4);
+ case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5);
+ case 7 :
+ case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6);
+ }
+}
+
+
+/*-*******************************
+* Optimal parser
+*********************************/
+
+
+static U32 ZSTD_totalLen(ZSTD_optimal_t sol)
+{
+ return sol.litlen + sol.mlen;
+}
+
+#if 0 /* debug */
+
+static void
+listStats(const U32* table, int lastEltID)
+{
+ int const nbElts = lastEltID + 1;
+ int enb;
+ for (enb=0; enb < nbElts; enb++) {
+ (void)table;
+ /* RAWLOG(2, "%3i:%3i, ", enb, table[enb]); */
+ RAWLOG(2, "%4i,", table[enb]);
+ }
+ RAWLOG(2, " \n");
+}
+
+#endif
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+ seqStore_t* seqStore,
+ U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize,
+ const int optLevel,
+ const ZSTD_dictMode_e dictMode)
+{
+ optState_t* const optStatePtr = &ms->opt;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const prefixStart = base + ms->window.dictLimit;
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+
+ U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+ U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4;
+ U32 nextToUpdate3 = ms->nextToUpdate;
+
+ ZSTD_optimal_t* const opt = optStatePtr->priceTable;
+ ZSTD_match_t* const matches = optStatePtr->matchTable;
+ ZSTD_optimal_t lastSequence;
+
+ /* init */
+ DEBUGLOG(5, "ZSTD_compressBlock_opt_generic: current=%u, prefix=%u, nextToUpdate=%u",
+ (U32)(ip - base), ms->window.dictLimit, ms->nextToUpdate);
+ assert(optLevel <= 2);
+ ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel);
+ ip += (ip==prefixStart);
+
+ /* Match Loop */
+ while (ip < ilimit) {
+ U32 cur, last_pos = 0;
+
+ /* find first match */
+ { U32 const litlen = (U32)(ip - anchor);
+ U32 const ll0 = !litlen;
+ U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch);
+ if (!nbMatches) { ip++; continue; }
+
+ /* initialize opt[0] */
+ { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; }
+ opt[0].mlen = 0; /* means is_a_literal */
+ opt[0].litlen = litlen;
+ /* We don't need to include the actual price of the literals because
+ * it is static for the duration of the forward pass, and is included
+ * in every price. We include the literal length to avoid negative
+ * prices when we subtract the previous literal length.
+ */
+ opt[0].price = ZSTD_litLengthPrice(litlen, optStatePtr, optLevel);
+
+ /* large match -> immediate encoding */
+ { U32 const maxML = matches[nbMatches-1].len;
+ U32 const maxOffset = matches[nbMatches-1].off;
+ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series",
+ nbMatches, maxML, maxOffset, (U32)(ip-prefixStart));
+
+ if (maxML > sufficient_len) {
+ lastSequence.litlen = litlen;
+ lastSequence.mlen = maxML;
+ lastSequence.off = maxOffset;
+ DEBUGLOG(6, "large match (%u>%u), immediate encoding",
+ maxML, sufficient_len);
+ cur = 0;
+ last_pos = ZSTD_totalLen(lastSequence);
+ goto _shortestPath;
+ } }
+
+ /* set prices for first matches starting position == 0 */
+ { U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+ U32 pos;
+ U32 matchNb;
+ for (pos = 1; pos < minMatch; pos++) {
+ opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */
+ }
+ for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+ U32 const offset = matches[matchNb].off;
+ U32 const end = matches[matchNb].len;
+ for ( ; pos <= end ; pos++ ) {
+ U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel);
+ U32 const sequencePrice = literalsPrice + matchPrice;
+ DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
+ pos, ZSTD_fCost(sequencePrice));
+ opt[pos].mlen = pos;
+ opt[pos].off = offset;
+ opt[pos].litlen = litlen;
+ opt[pos].price = sequencePrice;
+ } }
+ last_pos = pos-1;
+ }
+ }
+
+ /* check further positions */
+ for (cur = 1; cur <= last_pos; cur++) {
+ const BYTE* const inr = ip + cur;
+ assert(cur < ZSTD_OPT_NUM);
+ DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur)
+
+ /* Fix current position with one literal if cheaper */
+ { U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1;
+ int const price = opt[cur-1].price
+ + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
+ + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
+ - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
+ assert(price < 1000000000); /* overflow check */
+ if (price <= opt[cur].price) {
+ DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
+ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen,
+ opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]);
+ opt[cur].mlen = 0;
+ opt[cur].off = 0;
+ opt[cur].litlen = litlen;
+ opt[cur].price = price;
+ } else {
+ DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)",
+ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price),
+ opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]);
+ }
+ }
+
+ /* Set the repcodes of the current position. We must do it here
+ * because we rely on the repcodes of the 2nd to last sequence being
+ * correct to set the next chunks repcodes during the backward
+ * traversal.
+ */
+ ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t));
+ assert(cur >= opt[cur].mlen);
+ if (opt[cur].mlen != 0) {
+ U32 const prev = cur - opt[cur].mlen;
+ repcodes_t newReps = ZSTD_updateRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0);
+ memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t));
+ } else {
+ memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t));
+ }
+
+ /* last match must start at a minimum distance of 8 from oend */
+ if (inr > ilimit) continue;
+
+ if (cur == last_pos) break;
+
+ if ( (optLevel==0) /*static_test*/
+ && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) {
+ DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1);
+ continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
+ }
+
+ { U32 const ll0 = (opt[cur].mlen != 0);
+ U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
+ U32 const previousPrice = opt[cur].price;
+ U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+ U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch);
+ U32 matchNb;
+ if (!nbMatches) {
+ DEBUGLOG(7, "rPos:%u : no match found", cur);
+ continue;
+ }
+
+ { U32 const maxML = matches[nbMatches-1].len;
+ DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u",
+ inr-istart, cur, nbMatches, maxML);
+
+ if ( (maxML > sufficient_len)
+ || (cur + maxML >= ZSTD_OPT_NUM) ) {
+ lastSequence.mlen = maxML;
+ lastSequence.off = matches[nbMatches-1].off;
+ lastSequence.litlen = litlen;
+ cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */
+ last_pos = cur + ZSTD_totalLen(lastSequence);
+ if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */
+ goto _shortestPath;
+ } }
+
+ /* set prices using matches found at position == cur */
+ for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+ U32 const offset = matches[matchNb].off;
+ U32 const lastML = matches[matchNb].len;
+ U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
+ U32 mlen;
+
+ DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u",
+ matchNb, matches[matchNb].off, lastML, litlen);
+
+ for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */
+ U32 const pos = cur + mlen;
+ int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
+
+ if ((pos > last_pos) || (price < opt[pos].price)) {
+ DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
+ pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
+ while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */
+ opt[pos].mlen = mlen;
+ opt[pos].off = offset;
+ opt[pos].litlen = litlen;
+ opt[pos].price = price;
+ } else {
+ DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)",
+ pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
+ if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */
+ }
+ } } }
+ } /* for (cur = 1; cur <= last_pos; cur++) */
+
+ lastSequence = opt[last_pos];
+ cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */
+ assert(cur < ZSTD_OPT_NUM); /* control overflow*/
+
+_shortestPath: /* cur, last_pos and lastSequence have to be set */
+ assert(opt[0].mlen == 0);
+
+ /* Set the next chunk's repcodes based on the repcodes of the beginning
+ * of the last match, and the last sequence. This avoids us having to
+ * update them while traversing the sequences.
+ */
+ if (lastSequence.mlen != 0) {
+ repcodes_t reps = ZSTD_updateRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0);
+ memcpy(rep, &reps, sizeof(reps));
+ } else {
+ memcpy(rep, opt[cur].rep, sizeof(repcodes_t));
+ }
+
+ { U32 const storeEnd = cur + 1;
+ U32 storeStart = storeEnd;
+ U32 seqPos = cur;
+
+ DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
+ last_pos, cur); (void)last_pos;
+ assert(storeEnd < ZSTD_OPT_NUM);
+ DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+ storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
+ opt[storeEnd] = lastSequence;
+ while (seqPos > 0) {
+ U32 const backDist = ZSTD_totalLen(opt[seqPos]);
+ storeStart--;
+ DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+ seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off);
+ opt[storeStart] = opt[seqPos];
+ seqPos = (seqPos > backDist) ? seqPos - backDist : 0;
+ }
+
+ /* save sequences */
+ DEBUGLOG(6, "sending selected sequences into seqStore")
+ { U32 storePos;
+ for (storePos=storeStart; storePos <= storeEnd; storePos++) {
+ U32 const llen = opt[storePos].litlen;
+ U32 const mlen = opt[storePos].mlen;
+ U32 const offCode = opt[storePos].off;
+ U32 const advance = llen + mlen;
+ DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
+ anchor - istart, (unsigned)llen, (unsigned)mlen);
+
+ if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */
+ assert(storePos == storeEnd); /* must be last sequence */
+ ip = anchor + llen; /* last "sequence" is a bunch of literals => don't progress anchor */
+ continue; /* will finish */
+ }
+
+ assert(anchor + llen <= iend);
+ ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
+ ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen-MINMATCH);
+ anchor += advance;
+ ip = anchor;
+ } }
+ ZSTD_setBasePrices(optStatePtr, optLevel);
+ }
+ } /* while (ip < ilimit) */
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_btopt(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_compressBlock_btopt");
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict);
+}
+
+
+/* used in 2-pass strategy */
+static U32 ZSTD_upscaleStat(unsigned* table, U32 lastEltIndex, int bonus)
+{
+ U32 s, sum=0;
+ assert(ZSTD_FREQ_DIV+bonus >= 0);
+ for (s=0; s<lastEltIndex+1; s++) {
+ table[s] <<= ZSTD_FREQ_DIV+bonus;
+ table[s]--;
+ sum += table[s];
+ }
+ return sum;
+}
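+
+/* Worked example for ZSTD_upscaleStat() (illustrative) : with
+ * ZSTD_FREQ_DIV == 4 and bonus == 0, a count of 7 becomes (7 << 4) - 1 == 111,
+ * roughly undoing the downscale in ZSTD_downscaleStat() (where 100 had become 7). */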
+
+/* used in 2-pass strategy */
+MEM_STATIC void ZSTD_upscaleStats(optState_t* optPtr)
+{
+ if (ZSTD_compressedLiterals(optPtr))
+ optPtr->litSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0);
+ optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 0);
+ optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 0);
+ optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 0);
+}
+
+/* ZSTD_initStats_ultra():
+ * make a first compression pass, just to seed stats with more accurate starting values.
+ * only works on first block, with no dictionary and no ldm.
+ * this function cannot error, hence its contract must be respected.
+ */
+static void
+ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
+ seqStore_t* seqStore,
+ U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */
+ memcpy(tmpRep, rep, sizeof(tmpRep));
+
+ DEBUGLOG(4, "ZSTD_initStats_ultra (srcSize=%zu)", srcSize);
+ assert(ms->opt.litLengthSum == 0); /* first block */
+ assert(seqStore->sequences == seqStore->sequencesStart); /* no ldm */
+ assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */
+ assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined in two's complement) */
+
+ ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/
+
+ /* invalidate first scan from history */
+ ZSTD_resetSeqStore(seqStore);
+ ms->window.base -= srcSize;
+ ms->window.dictLimit += (U32)srcSize;
+ ms->window.lowLimit = ms->window.dictLimit;
+ ms->nextToUpdate = ms->window.dictLimit;
+
+ /* reinforce the weight of collected statistics */
+ ZSTD_upscaleStats(&ms->opt);
+}
+
+size_t ZSTD_compressBlock_btultra(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btultra2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ U32 const current = (U32)((const BYTE*)src - ms->window.base);
+ DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize);
+
+ /* 2-pass strategy:
+ * this strategy makes a first pass over the first block to collect statistics
+ * and seed the next round's statistics with them.
+ * After the 1st pass, the function forgets everything and starts a new block.
+ * Consequently, this can only work if no data has been previously loaded into the tables,
+ * i.e. no dictionary, no prefix, no ldm preprocessing.
+ * The compression ratio gain is generally small (~0.5% on the first block),
+ * at a cost of 2x cpu time on the first block. */
+ assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+ if ( (ms->opt.litLengthSum==0) /* first block */
+ && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */
+ && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */
+ && (current == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */
+ && (srcSize > ZSTD_PREDEF_THRESHOLD)
+ ) {
+ ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
+ }
+
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btopt_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_btultra_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_btopt_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict);
+}
+
+size_t ZSTD_compressBlock_btultra_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict);
+}
+
+/* note : there is no btultra2 variant for extDict or dictMatchState,
+ * because btultra2 is not meant to work with dictionaries
+ * and only applies to the first block (no prefix) */
+/**** ended inlining compress/zstd_opt.c ****/
+
+/**** start inlining decompress/huf_decompress.c ****/
+/* ******************************************************************
+ * huff0 huffman decoder,
+ * part of Finite State Entropy library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* **************************************************************
+* Dependencies
+****************************************************************/
+#include <string.h> /* memcpy, memset */
+/**** skipping file: ../common/compiler.h ****/
+/**** skipping file: ../common/bitstream.h ****/
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** skipping file: ../common/error_private.h ****/
+
+/* **************************************************************
+* Macros
+****************************************************************/
+
+/* These two optional macros force the use of one or the other of the two
+ * Huffman decompression implementations. They cannot both be defined
+ * at the same time.
+ */
+#if defined(HUF_FORCE_DECOMPRESS_X1) && \
+ defined(HUF_FORCE_DECOMPRESS_X2)
+#error "Cannot force the use of the X1 and X2 decoders at the same time!"
+#endif
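+/* Illustrative note (not part of upstream huf_decompress.c): a build selects a
+ * single decoder by defining exactly one of these macros on the compiler
+ * command line, e.g. -DHUF_FORCE_DECOMPRESS_X1 to keep only the single-symbol
+ * (X1) tables, or -DHUF_FORCE_DECOMPRESS_X2 to keep only the double-symbol
+ * (X2) tables; HUF_selectDecoder() below then returns a constant accordingly. */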
+
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+
+
+/* **************************************************************
+* Byte alignment for workSpace management
+****************************************************************/
+#define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1)
+#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
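+/* Worked example (illustrative, not part of the original source): with a
+ * power-of-2 alignment `a`, HUF_ALIGN rounds x up to the next multiple of a,
+ *     HUF_ALIGN(13, sizeof(U32)) = (13 + 3) & ~3 = 16
+ *     HUF_ALIGN(16, sizeof(U32)) = (16 + 3) & ~3 = 16
+ * which is how the workSpace carving below keeps each sub-array U32-aligned. */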
+
+
+/* **************************************************************
+* BMI2 Variant Wrappers
+****************************************************************/
+#if DYNAMIC_BMI2
+
+#define HUF_DGEN(fn) \
+ \
+ static size_t fn##_default( \
+ void* dst, size_t dstSize, \
+ const void* cSrc, size_t cSrcSize, \
+ const HUF_DTable* DTable) \
+ { \
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
+ } \
+ \
+ static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \
+ void* dst, size_t dstSize, \
+ const void* cSrc, size_t cSrcSize, \
+ const HUF_DTable* DTable) \
+ { \
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
+ } \
+ \
+ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
+ { \
+ if (bmi2) { \
+ return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
+ } \
+ return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
+ }
+
+#else
+
+#define HUF_DGEN(fn) \
+ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
+ { \
+ (void)bmi2; \
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
+ }
+
+#endif
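+/* Usage sketch (illustrative, not part of upstream huf_decompress.c): given a
+ * FORCE_INLINE_TEMPLATE body named fn##_body, an instantiation such as
+ *
+ *     HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
+ *
+ * used further below emits HUF_decompress1X1_usingDTable_internal(dst, dstSize,
+ * cSrc, cSrcSize, DTable, bmi2) : on DYNAMIC_BMI2 builds it dispatches at run
+ * time between a plain and a TARGET_ATTRIBUTE("bmi2") copy of the body,
+ * otherwise it simply ignores the bmi2 flag. */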
+
+
+/*-***************************/
+/* generic DTableDesc */
+/*-***************************/
+typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;
+
+static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
+{
+ DTableDesc dtd;
+ memcpy(&dtd, table, sizeof(dtd));
+ return dtd;
+}
+
+
+#ifndef HUF_FORCE_DECOMPRESS_X2
+
+/*-***************************/
+/* single-symbol decoding */
+/*-***************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */
+
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
+{
+ U32 tableLog = 0;
+ U32 nbSymbols = 0;
+ size_t iSize;
+ void* const dtPtr = DTable + 1;
+ HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
+
+ U32* rankVal;
+ BYTE* huffWeight;
+ size_t spaceUsed32 = 0;
+
+ rankVal = (U32 *)workSpace + spaceUsed32;
+ spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+ huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
+ spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
+
+ DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+ /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzers complain ... */
+
+ iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+ if (HUF_isError(iSize)) return iSize;
+
+ /* Table header */
+ { DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable is too small; the Huffman tree cannot fit in it */
+ dtd.tableType = 0;
+ dtd.tableLog = (BYTE)tableLog;
+ memcpy(DTable, &dtd, sizeof(dtd));
+ }
+
+ /* Calculate starting value for each rank */
+ { U32 n, nextRankStart = 0;
+ for (n=1; n<tableLog+1; n++) {
+ U32 const current = nextRankStart;
+ nextRankStart += (rankVal[n] << (n-1));
+ rankVal[n] = current;
+ } }
+
+ /* fill DTable */
+ { U32 n;
+ size_t const nEnd = nbSymbols;
+ for (n=0; n<nEnd; n++) {
+ size_t const w = huffWeight[n];
+ size_t const length = (1 << w) >> 1;
+ size_t const uStart = rankVal[w];
+ size_t const uEnd = uStart + length;
+ size_t u;
+ HUF_DEltX1 D;
+ D.byte = (BYTE)n;
+ D.nbBits = (BYTE)(tableLog + 1 - w);
+ rankVal[w] = (U32)uEnd;
+ if (length < 4) {
+ /* Use length in the loop bound so the compiler knows it is short. */
+ for (u = 0; u < length; ++u)
+ dt[uStart + u] = D;
+ } else {
+ /* Unroll the loop 4 times; the length is a power of 2 (and >= 4), so it is a multiple of 4. */
+ for (u = uStart; u < uEnd; u += 4) {
+ dt[u + 0] = D;
+ dt[u + 1] = D;
+ dt[u + 2] = D;
+ dt[u + 3] = D;
+ } } } }
+ return iSize;
+}
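+/* Worked example (illustrative): for a table built with tableLog == 3, a
+ * symbol of weight w occupies (1 << w) >> 1 consecutive DTable cells and is
+ * decoded in (tableLog + 1 - w) bits :
+ *     w == 3  ->  4 cells, 1 bit        w == 1  ->  1 cell, 3 bits
+ * so more frequent (heavier) symbols cover more of the 2^tableLog table and
+ * cost fewer bits per lookup, exactly as the fill loop above arranges. */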
+
+size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_readDTableX1_wksp(DTable, src, srcSize,
+ workSpace, sizeof(workSpace));
+}
+
+FORCE_INLINE_TEMPLATE BYTE
+HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog)
+{
+ size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+ BYTE const c = dt[val].byte;
+ BIT_skipBits(Dstream, dt[val].nbBits);
+ return c;
+}
+
+#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
+ *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
+ if (MEM_64bits()) \
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
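+/* Cadence sketch (illustrative commentary, not from upstream): between two
+ * bitstream reloads the bit container must hold enough bits for every lookup
+ * performed. The _2 variant therefore decodes only on 64-bit targets, the _1
+ * variant also on 32-bit targets when HUF_TABLELOG_MAX <= 12 (two lookups of
+ * at most 12 bits still fit in what a reload guarantees), and _0 always
+ * decodes. The loops below interleave them as _2,_1,_2,_0, so 64-bit builds
+ * get up to 4 symbols per reload while 32-bit builds still progress safely. */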
+
+HINT_INLINE size_t
+HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
+{
+ BYTE* const pStart = p;
+
+ /* up to 4 symbols at a time */
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+ }
+
+ /* [0-3] symbols remaining */
+ if (MEM_32bits())
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+
+ /* no more data to retrieve from bitstream, no need to reload */
+ while (p < pEnd)
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+
+ return pEnd-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X1_usingDTable_internal_body(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ BYTE* op = (BYTE*)dst;
+ BYTE* const oend = op + dstSize;
+ const void* dtPtr = DTable + 1;
+ const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
+ BIT_DStream_t bitD;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ U32 const dtLog = dtd.tableLog;
+
+ CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+ HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog);
+
+ if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+ return dstSize;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X1_usingDTable_internal_body(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ /* Check */
+ if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
+
+ { const BYTE* const istart = (const BYTE*) cSrc;
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* const olimit = oend - 3;
+ const void* const dtPtr = DTable + 1;
+ const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
+
+ /* Init */
+ BIT_DStream_t bitD1;
+ BIT_DStream_t bitD2;
+ BIT_DStream_t bitD3;
+ BIT_DStream_t bitD4;
+ size_t const length1 = MEM_readLE16(istart);
+ size_t const length2 = MEM_readLE16(istart+2);
+ size_t const length3 = MEM_readLE16(istart+4);
+ size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+ const BYTE* const istart1 = istart + 6; /* jumpTable */
+ const BYTE* const istart2 = istart1 + length1;
+ const BYTE* const istart3 = istart2 + length2;
+ const BYTE* const istart4 = istart3 + length3;
+ const size_t segmentSize = (dstSize+3) / 4;
+ BYTE* const opStart2 = ostart + segmentSize;
+ BYTE* const opStart3 = opStart2 + segmentSize;
+ BYTE* const opStart4 = opStart3 + segmentSize;
+ BYTE* op1 = ostart;
+ BYTE* op2 = opStart2;
+ BYTE* op3 = opStart3;
+ BYTE* op4 = opStart4;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ U32 const dtLog = dtd.tableLog;
+ U32 endSignal = 1;
+
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
+ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+ CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+ CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+ CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+ /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
+ for ( ; (endSignal) & (op4 < olimit) ; ) {
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
+ HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
+ HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
+ HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
+ HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
+ HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
+ HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
+ }
+
+ /* check corruption */
+ /* note : these checks should not be necessary : the op# pointers advance in lock step, and op4 is controlled by the loop.
+ *        Curiously though, binaries generated by gcc 7.2 & 7.3 with -mbmi2 run faster when at least one test is present */
+ if (op1 > opStart2) return ERROR(corruption_detected);
+ if (op2 > opStart3) return ERROR(corruption_detected);
+ if (op3 > opStart4) return ERROR(corruption_detected);
+ /* note : op4 is assumed to have been verified within the main loop already */
+
+ /* finish bitStreams one by one */
+ HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog);
+ HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog);
+ HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog);
+ HUF_decodeStreamX1(op4, &bitD4, oend, dt, dtLog);
+
+ /* check */
+ { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+ if (!endCheck) return ERROR(corruption_detected); }
+
+ /* decoded size */
+ return dstSize;
+ }
+}
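+/* Framing sketch for the 4-stream layout decoded above (illustrative, derived
+ * from the code rather than quoted from a normative spec):
+ *
+ *     bytes 0-1 : length1 (little-endian U16)
+ *     bytes 2-3 : length2 (little-endian U16)
+ *     bytes 4-5 : length3 (little-endian U16)
+ *     bytes 6-  : stream1 | stream2 | stream3 | stream4
+ *
+ * length4 is implicit (cSrcSize - 6 - length1 - length2 - length3), each stream
+ * regenerates one quarter of dst (segmentSize = (dstSize+3)/4), and the strict
+ * minimum input of 10 bytes is the 6-byte jump table plus 1 byte per stream. */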
+
+
+typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
+ const void *cSrc,
+ size_t cSrcSize,
+ const HUF_DTable *DTable);
+
+HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
+HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
+
+
+
+size_t HUF_decompress1X1_usingDTable(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 0) return ERROR(GENERIC);
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize)
+{
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
+ return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
+}
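+#if 0
+/* Usage sketch (illustrative only, compiled out, not part of upstream code):
+ * one-shot single-stream decoding with a caller-provided workspace, mirroring
+ * the wrappers above. dstSize must be the exact regenerated size; huff0 cannot
+ * recover it from the compressed data alone. */
+static size_t example_decompress1X1(void* dst, size_t dstSize,
+                                    const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X1_DCtx_wksp(DTable, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+#endif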
+
+size_t HUF_decompress4X1_usingDTable(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 0) return ERROR(GENERIC);
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize, int bmi2)
+{
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t const hSize = HUF_readDTableX1_wksp (dctx, cSrc, cSrcSize,
+ workSpace, wkspSize);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize)
+{
+ return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
+}
+
+
+size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
+ return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+#endif /* HUF_FORCE_DECOMPRESS_X2 */
+
+
+#ifndef HUF_FORCE_DECOMPRESS_X1
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
+typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
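+/* Cell layout sketch (illustrative): each HUF_DEltX2 packs up to two decoded
+ * symbols little-endian in `sequence`, the total number of bits the pair
+ * consumes in `nbBits`, and the number of bytes to emit (1 or 2) in `length`.
+ * HUF_decodeSymbolX2() below memcpy's the 2-byte `sequence` to the output and
+ * advances by `length`, which is what makes the double-symbol variant fast. */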
+
+
+/* HUF_fillDTableX2Level2() :
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
+static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed,
+ const U32* rankValOrigin, const int minWeight,
+ const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+ U32 nbBitsBaseline, U16 baseSeq)
+{
+ HUF_DEltX2 DElt;
+ U32 rankVal[HUF_TABLELOG_MAX + 1];
+
+ /* get pre-calculated rankVal */
+ memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+ /* fill skipped values */
+ if (minWeight>1) {
+ U32 i, skipSize = rankVal[minWeight];
+ MEM_writeLE16(&(DElt.sequence), baseSeq);
+ DElt.nbBits = (BYTE)(consumed);
+ DElt.length = 1;
+ for (i = 0; i < skipSize; i++)
+ DTable[i] = DElt;
+ }
+
+ /* fill DTable */
+ { U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */
+ const U32 symbol = sortedSymbols[s].symbol;
+ const U32 weight = sortedSymbols[s].weight;
+ const U32 nbBits = nbBitsBaseline - weight;
+ const U32 length = 1 << (sizeLog-nbBits);
+ const U32 start = rankVal[weight];
+ U32 i = start;
+ const U32 end = start + length;
+
+ MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+ DElt.nbBits = (BYTE)(nbBits + consumed);
+ DElt.length = 2;
+ do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */
+
+ rankVal[weight] += length;
+ } }
+}
+
+
+static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
+ const sortedSymbol_t* sortedList, const U32 sortedListSize,
+ const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+ const U32 nbBitsBaseline)
+{
+ U32 rankVal[HUF_TABLELOG_MAX + 1];
+ const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+ const U32 minBits = nbBitsBaseline - maxWeight;
+ U32 s;
+
+ memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+ /* fill DTable */
+ for (s=0; s<sortedListSize; s++) {
+ const U16 symbol = sortedList[s].symbol;
+ const U32 weight = sortedList[s].weight;
+ const U32 nbBits = nbBitsBaseline - weight;
+ const U32 start = rankVal[weight];
+ const U32 length = 1 << (targetLog-nbBits);
+
+ if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */
+ U32 sortedRank;
+ int minWeight = nbBits + scaleLog;
+ if (minWeight < 1) minWeight = 1;
+ sortedRank = rankStart[minWeight];
+ HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits,
+ rankValOrigin[nbBits], minWeight,
+ sortedList+sortedRank, sortedListSize-sortedRank,
+ nbBitsBaseline, symbol);
+ } else {
+ HUF_DEltX2 DElt;
+ MEM_writeLE16(&(DElt.sequence), symbol);
+ DElt.nbBits = (BYTE)(nbBits);
+ DElt.length = 1;
+ { U32 const end = start + length;
+ U32 u;
+ for (u = start; u < end; u++) DTable[u] = DElt;
+ } }
+ rankVal[weight] += length;
+ }
+}
+
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
+ const void* src, size_t srcSize,
+ void* workSpace, size_t wkspSize)
+{
+ U32 tableLog, maxW, sizeOfSort, nbSymbols;
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ U32 const maxTableLog = dtd.maxTableLog;
+ size_t iSize;
+ void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */
+ HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
+ U32 *rankStart;
+
+ rankValCol_t* rankVal;
+ U32* rankStats;
+ U32* rankStart0;
+ sortedSymbol_t* sortedSymbol;
+ BYTE* weightList;
+ size_t spaceUsed32 = 0;
+
+ rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32);
+ spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
+ rankStats = (U32 *)workSpace + spaceUsed32;
+ spaceUsed32 += HUF_TABLELOG_MAX + 1;
+ rankStart0 = (U32 *)workSpace + spaceUsed32;
+ spaceUsed32 += HUF_TABLELOG_MAX + 2;
+ sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t);
+ spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
+ weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
+ spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
+
+ rankStart = rankStart0 + 1;
+ memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
+
+ DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
+ if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+ /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzers complain ... */
+
+ iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+ if (HUF_isError(iSize)) return iSize;
+
+ /* check result */
+ if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
+
+ /* find maxWeight */
+ for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
+
+ /* Get start index of each weight */
+ { U32 w, nextRankStart = 0;
+ for (w=1; w<maxW+1; w++) {
+ U32 current = nextRankStart;
+ nextRankStart += rankStats[w];
+ rankStart[w] = current;
+ }
+ rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
+ sizeOfSort = nextRankStart;
+ }
+
+ /* sort symbols by weight */
+ { U32 s;
+ for (s=0; s<nbSymbols; s++) {
+ U32 const w = weightList[s];
+ U32 const r = rankStart[w]++;
+ sortedSymbol[r].symbol = (BYTE)s;
+ sortedSymbol[r].weight = (BYTE)w;
+ }
+ rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
+ }
+
+ /* Build rankVal */
+ { U32* const rankVal0 = rankVal[0];
+ { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */
+ U32 nextRankVal = 0;
+ U32 w;
+ for (w=1; w<maxW+1; w++) {
+ U32 current = nextRankVal;
+ nextRankVal += rankStats[w] << (w+rescale);
+ rankVal0[w] = current;
+ } }
+ { U32 const minBits = tableLog+1 - maxW;
+ U32 consumed;
+ for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+ U32* const rankValPtr = rankVal[consumed];
+ U32 w;
+ for (w = 1; w < maxW+1; w++) {
+ rankValPtr[w] = rankVal0[w] >> consumed;
+ } } } }
+
+ HUF_fillDTableX2(dt, maxTableLog,
+ sortedSymbol, sizeOfSort,
+ rankStart0, rankVal, maxW,
+ tableLog+1);
+
+ dtd.tableLog = (BYTE)maxTableLog;
+ dtd.tableType = 1;
+ memcpy(DTable, &dtd, sizeof(dtd));
+ return iSize;
+}
+
+size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_readDTableX2_wksp(DTable, src, srcSize,
+ workSpace, sizeof(workSpace));
+}
+
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+ size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
+ memcpy(op, dt+val, 2);
+ BIT_skipBits(DStream, dt[val].nbBits);
+ return dt[val].length;
+}
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+ size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
+ memcpy(op, dt+val, 1);
+ if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
+ else {
+ if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+ BIT_skipBits(DStream, dt[val].nbBits);
+ if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+ /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+ DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
+ } }
+ return 1;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+ if (MEM_64bits()) \
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+
+HINT_INLINE size_t
+HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
+ const HUF_DEltX2* const dt, const U32 dtLog)
+{
+ BYTE* const pStart = p;
+
+ /* up to 8 symbols at a time */
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+ }
+
+ /* closer to end : up to 2 symbols at a time */
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+ while (p <= pEnd-2)
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
+
+ if (p < pEnd)
+ p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
+
+ return p-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X2_usingDTable_internal_body(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ BIT_DStream_t bitD;
+
+ /* Init */
+ CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+ /* decode */
+ { BYTE* const ostart = (BYTE*) dst;
+ BYTE* const oend = ostart + dstSize;
+ const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */
+ const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog);
+ }
+
+ /* check */
+ if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+ /* decoded size */
+ return dstSize;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X2_usingDTable_internal_body(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
+
+ { const BYTE* const istart = (const BYTE*) cSrc;
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* const olimit = oend - (sizeof(size_t)-1);
+ const void* const dtPtr = DTable+1;
+ const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+
+ /* Init */
+ BIT_DStream_t bitD1;
+ BIT_DStream_t bitD2;
+ BIT_DStream_t bitD3;
+ BIT_DStream_t bitD4;
+ size_t const length1 = MEM_readLE16(istart);
+ size_t const length2 = MEM_readLE16(istart+2);
+ size_t const length3 = MEM_readLE16(istart+4);
+ size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+ const BYTE* const istart1 = istart + 6; /* jumpTable */
+ const BYTE* const istart2 = istart1 + length1;
+ const BYTE* const istart3 = istart2 + length2;
+ const BYTE* const istart4 = istart3 + length3;
+ size_t const segmentSize = (dstSize+3) / 4;
+ BYTE* const opStart2 = ostart + segmentSize;
+ BYTE* const opStart3 = opStart2 + segmentSize;
+ BYTE* const opStart4 = opStart3 + segmentSize;
+ BYTE* op1 = ostart;
+ BYTE* op2 = opStart2;
+ BYTE* op3 = opStart3;
+ BYTE* op4 = opStart4;
+ U32 endSignal = 1;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ U32 const dtLog = dtd.tableLog;
+
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
+ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+ CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+ CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+ CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+ /* 16-32 symbols per loop (4-8 symbols per stream) */
+ for ( ; (endSignal) & (op4 < olimit); ) {
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
+#else
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+ endSignal = (U32)LIKELY(
+ (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
+ & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
+ & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
+ & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
+#endif
+ }
+
+ /* check corruption */
+ if (op1 > opStart2) return ERROR(corruption_detected);
+ if (op2 > opStart3) return ERROR(corruption_detected);
+ if (op3 > opStart4) return ERROR(corruption_detected);
+ /* note : op4 already verified within main loop */
+
+ /* finish bitStreams one by one */
+ HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+ HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+ HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+ HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog);
+
+ /* check */
+ { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+ if (!endCheck) return ERROR(corruption_detected); }
+
+ /* decoded size */
+ return dstSize;
+ }
+}
+
+HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
+HUF_DGEN(HUF_decompress4X2_usingDTable_internal)
+
+size_t HUF_decompress1X2_usingDTable(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 1) return ERROR(GENERIC);
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize)
+{
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
+ workSpace, wkspSize);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+ return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+size_t HUF_decompress4X2_usingDTable(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 1) return ERROR(GENERIC);
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize, int bmi2)
+{
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
+ workSpace, wkspSize);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize)
+{
+ return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+ return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+#endif /* HUF_FORCE_DECOMPRESS_X1 */
+
+
+/* ***********************************/
+/* Universal decompression selectors */
+/* ***********************************/
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dtd;
+ assert(dtd.tableType == 0);
+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dtd;
+ assert(dtd.tableType == 1);
+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#else
+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#endif
+}
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dtd;
+ assert(dtd.tableType == 0);
+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dtd;
+ assert(dtd.tableType == 1);
+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#else
+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#endif
+}
+
+
+#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+ /* single, double, quad */
+ {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */
+ {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */
+ {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */
+ {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */
+ {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */
+ {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */
+ {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */
+ {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */
+ {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */
+ {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */
+ {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */
+ {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */
+ {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */
+ {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */
+ {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */
+ {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */
+};
+#endif
+
+/** HUF_selectDecoder() :
+ * Tells which decoder is likely to decode faster,
+ * based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
+ * Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
+{
+ assert(dstSize > 0);
+ assert(dstSize <= 128*1024);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dstSize;
+ (void)cSrcSize;
+ return 0;
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dstSize;
+ (void)cSrcSize;
+ return 1;
+#else
+ /* decoder timing evaluation */
+ { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */
+ U32 const D256 = (U32)(dstSize >> 8);
+ U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
+ U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
+ DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */
+ return DTime1 < DTime0;
+ }
+#endif
+}
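+/* Worked example (illustrative): dstSize = 64 KB and cSrcSize = 32 KB give
+ * Q = 32768*16/65536 = 8 and D256 = 256, so
+ *     DTime0 = 926  + 128*256 = 33694              (single-symbol, X1)
+ *     DTime1 = 1613 +  75*256 = 20813, +1/8 -> 23414   (double-symbol, X2)
+ * and the double-symbol decoder (return value 1) is selected. */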
+
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
+ static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
+#endif
+
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+ if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
+ if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
+ if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
+
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)algoNb;
+ assert(algoNb == 0);
+ return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+ return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);
+#else
+ return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+#endif
+ }
+}
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+ if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
+ if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
+ if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
+
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)algoNb;
+ assert(algoNb == 0);
+ return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+ return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+#else
+ return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
+ HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+#endif
+ }
+}
+
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+
+
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
+ size_t dstSize, const void* cSrc,
+ size_t cSrcSize, void* workSpace,
+ size_t wkspSize)
+{
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+ if (cSrcSize == 0) return ERROR(corruption_detected);
+
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)algoNb;
+ assert(algoNb == 0);
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+#else
+ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+ cSrcSize, workSpace, wkspSize):
+ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+#endif
+ }
+}
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize)
+{
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+ if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
+ if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
+ if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
+
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)algoNb;
+ assert(algoNb == 0);
+ return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
+ cSrcSize, workSpace, wkspSize);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+ return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+ cSrcSize, workSpace, wkspSize);
+#else
+ return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+ cSrcSize, workSpace, wkspSize):
+ HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
+ cSrcSize, workSpace, wkspSize);
+#endif
+ }
+}
+
+size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+
+
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dtd;
+ assert(dtd.tableType == 0);
+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dtd;
+ assert(dtd.tableType == 1);
+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#else
+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#endif
+}
+
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+#endif
+
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dtd;
+ assert(dtd.tableType == 0);
+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dtd;
+ assert(dtd.tableType == 1);
+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#else
+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#endif
+}
+
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+ if (cSrcSize == 0) return ERROR(corruption_detected);
+
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)algoNb;
+ assert(algoNb == 0);
+ return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+ return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+#else
+ return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
+ HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+#endif
+ }
+}
+/**** ended inlining decompress/huf_decompress.c ****/
+/**** start inlining decompress/zstd_ddict.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* zstd_ddict.c :
+ * concentrates all logic that needs to know the internals of ZSTD_DDict object */
+
+/*-*******************************************************
+* Dependencies
+*********************************************************/
+#include <string.h> /* memcpy, memmove, memset */
+/**** skipping file: ../common/cpu.h ****/
+/**** skipping file: ../common/mem.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** start inlining zstd_decompress_internal.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* zstd_decompress_internal:
+ * objects and definitions shared within lib/decompress modules */
+
+#ifndef ZSTD_DECOMPRESS_INTERNAL_H
+#define ZSTD_DECOMPRESS_INTERNAL_H
+
+
+/*-*******************************************************
+ * Dependencies
+ *********************************************************/
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+
+
+
+/*-*******************************************************
+ * Constants
+ *********************************************************/
+static const U32 LL_base[MaxLL+1] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 18, 20, 22, 24, 28, 32, 40,
+ 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+ 0x2000, 0x4000, 0x8000, 0x10000 };
+
+static const U32 OF_base[MaxOff+1] = {
+ 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D,
+ 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD,
+ 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+ 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD };
+
+static const U32 OF_bits[MaxOff+1] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31 };
+
+static const U32 ML_base[MaxML+1] = {
+ 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 37, 39, 41, 43, 47, 51, 59,
+ 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+ 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+
+/*-*******************************************************
+ * Decompression types
+ *********************************************************/
+typedef struct {
+    U32 fastMode;
+    U32 tableLog;
+} ZSTD_seqSymbol_header;
+
+typedef struct {
+    U16  nextState;
+    BYTE nbAdditionalBits;
+    BYTE nbBits;
+    U32  baseValue;
+} ZSTD_seqSymbol;
+
+#define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))
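+/* Worked example (illustrative, assuming the usual zstd values LLFSELog == 9,
+ * OffFSELog == 8 and MLFSELog == 9): SEQSYMBOL_TABLE_SIZE(9) = 1 + (1 << 9)
+ * = 513, so LLTable and MLTable below each hold 513 ZSTD_seqSymbol entries
+ * (one header cell plus the 2^tableLog decoding cells) and OFTable holds 257. */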
+
+typedef struct {
+ ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */
+ ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */
+ ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
+ HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
+ U32 rep[ZSTD_REP_NUM];
+} ZSTD_entropyDTables_t;
+
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+ ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+ ZSTDds_decompressLastBlock, ZSTDds_checkChecksum,
+ ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage;
+
+typedef enum { zdss_init=0, zdss_loadHeader,
+ zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
+
+typedef enum {
+ ZSTD_use_indefinitely = -1, /* Use the dictionary indefinitely */
+ ZSTD_dont_use = 0, /* Do not use the dictionary (if one exists free it) */
+ ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */
+} ZSTD_dictUses_e;
+
+typedef enum {
+ ZSTD_obm_buffered = 0, /* Buffer the output */
+ ZSTD_obm_stable = 1 /* ZSTD_outBuffer is stable */
+} ZSTD_outBufferMode_e;
+
+struct ZSTD_DCtx_s
+{
+ const ZSTD_seqSymbol* LLTptr;
+ const ZSTD_seqSymbol* MLTptr;
+ const ZSTD_seqSymbol* OFTptr;
+ const HUF_DTable* HUFptr;
+ ZSTD_entropyDTables_t entropy;
+ U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; /* space needed when building huffman tables */
+ const void* previousDstEnd; /* detect continuity */
+ const void* prefixStart; /* start of current segment */
+ const void* virtualStart; /* virtual start of previous segment if it was just before current one */
+ const void* dictEnd; /* end of previous segment */
+ size_t expected;
+ ZSTD_frameHeader fParams;
+ U64 decodedSize;
+ blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */
+ ZSTD_dStage stage;
+ U32 litEntropy;
+ U32 fseEntropy;
+ XXH64_state_t xxhState;
+ size_t headerSize;
+ ZSTD_format_e format;
+ const BYTE* litPtr;
+ ZSTD_customMem customMem;
+ size_t litSize;
+ size_t rleSize;
+ size_t staticSize;
+ int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+
+ /* dictionary */
+ ZSTD_DDict* ddictLocal;
+ const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
+ U32 dictID;
+ int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+ ZSTD_dictUses_e dictUses;
+
+ /* streaming */
+ ZSTD_dStreamStage streamStage;
+ char* inBuff;
+ size_t inBuffSize;
+ size_t inPos;
+ size_t maxWindowSize;
+ char* outBuff;
+ size_t outBuffSize;
+ size_t outStart;
+ size_t outEnd;
+ size_t lhSize;
+ void* legacyContext;
+ U32 previousLegacyVersion;
+ U32 legacyVersion;
+ U32 hostageByte;
+ int noForwardProgress;
+ ZSTD_outBufferMode_e outBufferMode;
+ ZSTD_outBuffer expectedOutBuffer;
+
+ /* workspace */
+ BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
+ BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+
+ size_t oversizedDuration;
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ void const* dictContentBeginForFuzzing;
+ void const* dictContentEndForFuzzing;
+#endif
+}; /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+
+/*-*******************************************************
+ * Shared internal functions
+ *********************************************************/
+
+/*! ZSTD_loadDEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary.
+ * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */
+size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+ const void* const dict, size_t const dictSize);
+
+/*! ZSTD_checkContinuity() :
+ * check if next `dst` follows previous position, where decompression ended.
+ * If yes, do nothing (continue on current segment).
+ * If not, classify previous segment as "external dictionary", and start a new segment.
+ * This function cannot fail. */
+void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst);
+
+
+#endif /* ZSTD_DECOMPRESS_INTERNAL_H */
+/**** ended inlining zstd_decompress_internal.h ****/
+/**** start inlining zstd_ddict.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#ifndef ZSTD_DDICT_H
+#define ZSTD_DDICT_H
+
+/*-*******************************************************
+ * Dependencies
+ *********************************************************/
+#include <stddef.h> /* size_t */
+/**** skipping file: ../zstd.h ****/
+
+
+/*-*******************************************************
+ * Interface
+ *********************************************************/
+
+/* note: several prototypes are already published in `zstd.h` :
+ * ZSTD_createDDict()
+ * ZSTD_createDDict_byReference()
+ * ZSTD_createDDict_advanced()
+ * ZSTD_freeDDict()
+ * ZSTD_initStaticDDict()
+ * ZSTD_sizeof_DDict()
+ * ZSTD_estimateDDictSize()
+ * ZSTD_getDictID_fromDict()
+ */
+
+const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict);
+size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict);
+
+void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+
+
+#endif /* ZSTD_DDICT_H */
+/**** ended inlining zstd_ddict.h ****/
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+/**** start inlining ../legacy/zstd_legacy.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LEGACY_H
+#define ZSTD_LEGACY_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+* Includes
+***************************************/
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: ../common/error_private.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+
+#if !defined (ZSTD_LEGACY_SUPPORT) || (ZSTD_LEGACY_SUPPORT == 0)
+# undef ZSTD_LEGACY_SUPPORT
+# define ZSTD_LEGACY_SUPPORT 8
+#endif
+
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+/**** start inlining zstd_v01.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V01_H_28739879432
+#define ZSTD_V01_H_28739879432
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+* Includes
+***************************************/
+#include <stddef.h> /* size_t */
+
+
+/* *************************************
+* Simple one-step function
+***************************************/
+/**
+ZSTDv01_decompress() : decompress ZSTD frames compliant with v0.1.x format
+ compressedSize : is the exact source size
+ maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+ It must be equal or larger than originalSize, otherwise decompression will fail.
+ return : the number of bytes decompressed into destination buffer (originalSize)
+ or an errorCode if it fails (which can be tested using ZSTDv01_isError())
+*/
+size_t ZSTDv01_decompress( void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+ /**
+ ZSTDv01_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.1.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv01_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+ */
+void ZSTDv01_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/**
+ZSTDv01_isError() : tells if the result of ZSTDv01_decompress() is an error
+*/
+unsigned ZSTDv01_isError(size_t code);
+
+
+/* *************************************
+* Advanced functions
+***************************************/
+typedef struct ZSTDv01_Dctx_s ZSTDv01_Dctx;
+ZSTDv01_Dctx* ZSTDv01_createDCtx(void);
+size_t ZSTDv01_freeDCtx(ZSTDv01_Dctx* dctx);
+
+size_t ZSTDv01_decompressDCtx(void* ctx,
+ void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+/* *************************************
+* Streaming functions
+***************************************/
+size_t ZSTDv01_resetDCtx(ZSTDv01_Dctx* dctx);
+
+size_t ZSTDv01_nextSrcSizeToDecompress(ZSTDv01_Dctx* dctx);
+size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+ Use the above functions in alternation.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previously decoded data blocks when they are located just prior to the current block.
+ Result is the number of bytes regenerated within 'dst'.
+ It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+* Prefix - version detection
+***************************************/
+#define ZSTDv01_magicNumber 0xFD2FB51E /* Big Endian version */
+#define ZSTDv01_magicNumberLE 0x1EB52FFD /* Little Endian version */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V01_H_28739879432 */
+/**** ended inlining zstd_v01.h ****/
+#endif
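+
+/* Illustrative sketch (not upstream zstd code) of the alternating
+ * ZSTDv01_nextSrcSizeToDecompress() / ZSTDv01_decompressContinue() loop documented in
+ * the streaming section of zstd_v01.h above. The helper name and local variables are
+ * assumptions; the block is guarded by `#if 0` so it is never compiled. */
+#if 0
+static size_t example_v01_streamDecompress(void* dst, size_t dstCapacity,
+                                           const char* src, size_t srcSize)
+{
+    ZSTDv01_Dctx* const dctx = ZSTDv01_createDCtx();
+    size_t srcPos = 0, dstPos = 0;
+    if (dctx == NULL) return (size_t)(-1);             /* allocation failure; not a zstd error code */
+    ZSTDv01_resetDCtx(dctx);
+    while (1) {
+        size_t produced;
+        size_t const toRead = ZSTDv01_nextSrcSizeToDecompress(dctx);  /* exact nb of bytes to feed next */
+        if (toRead == 0) break;                        /* frame fully decoded */
+        if (srcPos + toRead > srcSize) break;          /* truncated input */
+        produced = ZSTDv01_decompressContinue(dctx, (char*)dst + dstPos, dstCapacity - dstPos,
+                                              src + srcPos, toRead);
+        if (ZSTDv01_isError(produced)) { ZSTDv01_freeDCtx(dctx); return produced; }
+        srcPos += toRead;
+        dstPos += produced;                            /* may be 0 when only a header was decoded */
+    }
+    ZSTDv01_freeDCtx(dctx);
+    return dstPos;                                     /* total bytes regenerated into dst */
+}
+#endif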
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+/**** start inlining zstd_v02.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V02_H_4174539423
+#define ZSTD_V02_H_4174539423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+* Includes
+***************************************/
+#include <stddef.h> /* size_t */
+
+
+/* *************************************
+* Simple one-step function
+***************************************/
+/**
+ZSTDv02_decompress() : decompress ZSTD frames compliant with v0.2.x format
+ compressedSize : is the exact source size
+ maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+ It must be equal or larger than originalSize, otherwise decompression will fail.
+ return : the number of bytes decompressed into destination buffer (originalSize)
+ or an errorCode if it fails (which can be tested using ZSTDv02_isError())
+*/
+size_t ZSTDv02_decompress( void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+ /**
+ ZSTDv02_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.2.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv02_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+ */
+void ZSTDv02_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/**
+ZSTDv02_isError() : tells if the result of ZSTDv02_decompress() is an error
+*/
+unsigned ZSTDv02_isError(size_t code);
+
+
+/* *************************************
+* Advanced functions
+***************************************/
+typedef struct ZSTDv02_Dctx_s ZSTDv02_Dctx;
+ZSTDv02_Dctx* ZSTDv02_createDCtx(void);
+size_t ZSTDv02_freeDCtx(ZSTDv02_Dctx* dctx);
+
+size_t ZSTDv02_decompressDCtx(void* ctx,
+ void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+/* *************************************
+* Streaming functions
+***************************************/
+size_t ZSTDv02_resetDCtx(ZSTDv02_Dctx* dctx);
+
+size_t ZSTDv02_nextSrcSizeToDecompress(ZSTDv02_Dctx* dctx);
+size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+ Use the above functions in alternation.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previously decoded data blocks when they are located just prior to the current block.
+ Result is the number of bytes regenerated within 'dst'.
+ It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+* Prefix - version detection
+***************************************/
+#define ZSTDv02_magicNumber 0xFD2FB522 /* v0.2 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V02_H_4174539423 */
+/**** ended inlining zstd_v02.h ****/
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+/**** start inlining zstd_v03.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V03_H_298734209782
+#define ZSTD_V03_H_298734209782
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+* Includes
+***************************************/
+#include <stddef.h> /* size_t */
+
+
+/* *************************************
+* Simple one-step function
+***************************************/
+/**
+ZSTDv03_decompress() : decompress ZSTD frames compliant with v0.3.x format
+ compressedSize : is the exact source size
+ maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+ It must be equal or larger than originalSize, otherwise decompression will fail.
+ return : the number of bytes decompressed into destination buffer (originalSize)
+ or an errorCode if it fails (which can be tested using ZSTDv03_isError())
+*/
+size_t ZSTDv03_decompress( void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+ /**
+ ZSTDv03_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.3.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv03_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+ */
+ void ZSTDv03_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+ /**
+ZSTDv03_isError() : tells if the result of ZSTDv03_decompress() is an error
+*/
+unsigned ZSTDv03_isError(size_t code);
+
+
+/* *************************************
+* Advanced functions
+***************************************/
+typedef struct ZSTDv03_Dctx_s ZSTDv03_Dctx;
+ZSTDv03_Dctx* ZSTDv03_createDCtx(void);
+size_t ZSTDv03_freeDCtx(ZSTDv03_Dctx* dctx);
+
+size_t ZSTDv03_decompressDCtx(void* ctx,
+ void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+/* *************************************
+* Streaming functions
+***************************************/
+size_t ZSTDv03_resetDCtx(ZSTDv03_Dctx* dctx);
+
+size_t ZSTDv03_nextSrcSizeToDecompress(ZSTDv03_Dctx* dctx);
+size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+ Use the above functions in alternation.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previously decoded data blocks when they are located just prior to the current block.
+ Result is the number of bytes regenerated within 'dst'.
+ It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+* Prefix - version detection
+***************************************/
+#define ZSTDv03_magicNumber 0xFD2FB523 /* v0.3 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V03_H_298734209782 */
+/**** ended inlining zstd_v03.h ****/
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+/**** start inlining zstd_v04.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V04_H_91868324769238
+#define ZSTD_V04_H_91868324769238
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+* Includes
+***************************************/
+#include <stddef.h> /* size_t */
+
+
+/* *************************************
+* Simple one-step function
+***************************************/
+/**
+ZSTDv04_decompress() : decompress ZSTD frames compliant with v0.4.x format
+ compressedSize : is the exact source size
+ maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+ It must be equal or larger than originalSize, otherwise decompression will fail.
+ return : the number of bytes decompressed into destination buffer (originalSize)
+ or an errorCode if it fails (which can be tested using ZSTDv04_isError())
+*/
+size_t ZSTDv04_decompress( void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+ /**
+ ZSTDv04_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.4.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv04_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+ */
+ void ZSTDv04_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/**
+ZSTDv04_isError() : tells if the result of ZSTDv04_decompress() is an error
+*/
+unsigned ZSTDv04_isError(size_t code);
+
+
+/* *************************************
+* Advanced functions
+***************************************/
+typedef struct ZSTDv04_Dctx_s ZSTDv04_Dctx;
+ZSTDv04_Dctx* ZSTDv04_createDCtx(void);
+size_t ZSTDv04_freeDCtx(ZSTDv04_Dctx* dctx);
+
+size_t ZSTDv04_decompressDCtx(ZSTDv04_Dctx* dctx,
+ void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+
+/* *************************************
+* Direct Streaming
+***************************************/
+size_t ZSTDv04_resetDCtx(ZSTDv04_Dctx* dctx);
+
+size_t ZSTDv04_nextSrcSizeToDecompress(ZSTDv04_Dctx* dctx);
+size_t ZSTDv04_decompressContinue(ZSTDv04_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+ Use the above functions in alternation.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previously decoded data blocks when they are located just prior to the current block.
+ Result is the number of bytes regenerated within 'dst'.
+ It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+
+/* *************************************
+* Buffered Streaming
+***************************************/
+typedef struct ZBUFFv04_DCtx_s ZBUFFv04_DCtx;
+ZBUFFv04_DCtx* ZBUFFv04_createDCtx(void);
+size_t ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx);
+
+size_t ZBUFFv04_decompressInit(ZBUFFv04_DCtx* dctx);
+size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* dict, size_t dictSize);
+
+size_t ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr);
+
+/** ************************************************
+* Streaming decompression
+*
+* A ZBUFF_DCtx object is required to track streaming operation.
+* Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
+* Use ZBUFF_decompressInit() to start a new decompression operation.
+* ZBUFF_DCtx objects can be reused multiple times.
+*
+* Optionally, a reference to a static dictionary can be set, using ZBUFF_decompressWithDictionary()
+* It must be the same content as the one set during compression phase.
+* Dictionary content must remain accessible during the decompression process.
+*
+* Use ZBUFF_decompressContinue() repetitively to consume your input.
+* *srcSizePtr and *maxDstSizePtr can be any size.
+* The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
+* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+* The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst.
+* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+* or 0 when a frame is completely decoded
+* or an error code, which can be tested using ZBUFF_isError().
+*
+* Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize / ZBUFF_recommendedDOutSize
+* output : ZBUFF_recommendedDOutSize==128 KB; the block size is the internal unit, so it is always possible to write a full block once it is decoded.
+* input : ZBUFF_recommendedDInSize==128 KB + 3; just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3.
+* **************************************************/
+unsigned ZBUFFv04_isError(size_t errorCode);
+const char* ZBUFFv04_getErrorName(size_t errorCode);
+
+
+/** The below functions provide recommended buffer sizes for Compression or Decompression operations.
+* These sizes are not compulsory; they just tend to offer better latency */
+size_t ZBUFFv04_recommendedDInSize(void);
+size_t ZBUFFv04_recommendedDOutSize(void);
+
+
+/* *************************************
+* Prefix - version detection
+***************************************/
+#define ZSTDv04_magicNumber 0xFD2FB524 /* v0.4 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V04_H_91868324769238 */
+/**** ended inlining zstd_v04.h ****/
+#endif
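+
+/* Illustrative sketch (not upstream zstd code) of the ZBUFFv04 buffered-streaming loop
+ * documented in zstd_v04.h above: feed chunks of any size, let the context report how
+ * much it actually read and wrote, and stop once the hint reaches 0. Names are
+ * illustrative only; guarded by `#if 0` so it is never compiled. */
+#if 0
+static size_t example_v04_bufferedDecompress(void* dst, size_t dstCapacity,
+                                             const char* src, size_t srcSize)
+{
+    ZBUFFv04_DCtx* const zbd = ZBUFFv04_createDCtx();
+    size_t srcPos = 0, dstPos = 0;
+    if (zbd == NULL) return (size_t)(-1);              /* allocation failure */
+    ZBUFFv04_decompressInit(zbd);
+    while (srcPos < srcSize) {
+        size_t readSize = srcSize - srcPos;            /* any chunk size is accepted */
+        size_t writeSize = dstCapacity - dstPos;
+        size_t const hint = ZBUFFv04_decompressContinue(zbd, (char*)dst + dstPos, &writeSize,
+                                                        src + srcPos, &readSize);
+        if (ZBUFFv04_isError(hint)) { ZBUFFv04_freeDCtx(zbd); return hint; }
+        srcPos += readSize;                            /* bytes actually consumed */
+        dstPos += writeSize;                           /* bytes actually flushed */
+        if (hint == 0) break;                          /* frame completely decoded */
+    }
+    ZBUFFv04_freeDCtx(zbd);
+    return dstPos;
+}
+#endif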
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+/**** start inlining zstd_v05.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv05_H
+#define ZSTDv05_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include <stddef.h> /* size_t */
+/**** skipping file: ../common/mem.h ****/
+
+
+/* *************************************
+* Simple functions
+***************************************/
+/*! ZSTDv05_decompress() :
+ `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+ `dstCapacity` must be large enough, equal or larger than originalSize.
+ @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ or an errorCode if it fails (which can be tested using ZSTDv05_isError()) */
+size_t ZSTDv05_decompress( void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize);
+
+ /**
+ ZSTDv05_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.5.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv05_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+ */
+void ZSTDv05_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/* *************************************
+* Helper functions
+***************************************/
+/* Error Management */
+unsigned ZSTDv05_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+const char* ZSTDv05_getErrorName(size_t code); /*!< provides readable string for an error code */
+
+
+/* *************************************
+* Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv05_DCtx_s ZSTDv05_DCtx;
+ZSTDv05_DCtx* ZSTDv05_createDCtx(void);
+size_t ZSTDv05_freeDCtx(ZSTDv05_DCtx* dctx); /*!< @return : errorCode */
+
+/** ZSTDv05_decompressDCtx() :
+* Same as ZSTDv05_decompress(), but requires an already allocated ZSTDv05_DCtx (see ZSTDv05_createDCtx()) */
+size_t ZSTDv05_decompressDCtx(ZSTDv05_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+* Simple Dictionary API
+*************************/
+/*! ZSTDv05_decompress_usingDict() :
+* Decompression using a pre-defined Dictionary content (see dictBuilder).
+* Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted.
+* Note : dict can be NULL, in which case, it's equivalent to ZSTDv05_decompressDCtx() */
+size_t ZSTDv05_decompress_usingDict(ZSTDv05_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize);
+
+/*-************************
+* Advanced Streaming API
+***************************/
+typedef enum { ZSTDv05_fast, ZSTDv05_greedy, ZSTDv05_lazy, ZSTDv05_lazy2, ZSTDv05_btlazy2, ZSTDv05_opt, ZSTDv05_btopt } ZSTDv05_strategy;
+typedef struct {
+ U64 srcSize;
+ U32 windowLog; /* the only useful information to retrieve */
+ U32 contentLog; U32 hashLog; U32 searchLog; U32 searchLength; U32 targetLength; ZSTDv05_strategy strategy;
+} ZSTDv05_parameters;
+size_t ZSTDv05_getFrameParams(ZSTDv05_parameters* params, const void* src, size_t srcSize);
+
+size_t ZSTDv05_decompressBegin_usingDict(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize);
+void ZSTDv05_copyDCtx(ZSTDv05_DCtx* dstDCtx, const ZSTDv05_DCtx* srcDCtx);
+size_t ZSTDv05_nextSrcSizeToDecompress(ZSTDv05_DCtx* dctx);
+size_t ZSTDv05_decompressContinue(ZSTDv05_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+* ZBUFF API
+*************************/
+typedef struct ZBUFFv05_DCtx_s ZBUFFv05_DCtx;
+ZBUFFv05_DCtx* ZBUFFv05_createDCtx(void);
+size_t ZBUFFv05_freeDCtx(ZBUFFv05_DCtx* dctx);
+
+size_t ZBUFFv05_decompressInit(ZBUFFv05_DCtx* dctx);
+size_t ZBUFFv05_decompressInitDictionary(ZBUFFv05_DCtx* dctx, const void* dict, size_t dictSize);
+
+size_t ZBUFFv05_decompressContinue(ZBUFFv05_DCtx* dctx,
+ void* dst, size_t* dstCapacityPtr,
+ const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+* Streaming decompression
+*
+* A ZBUFFv05_DCtx object is required to track streaming operations.
+* Use ZBUFFv05_createDCtx() and ZBUFFv05_freeDCtx() to create/release resources.
+* Use ZBUFFv05_decompressInit() to start a new decompression operation,
+* or ZBUFFv05_decompressInitDictionary() if decompression requires a dictionary.
+* Note that ZBUFFv05_DCtx objects can be reused multiple times.
+*
+* Use ZBUFFv05_decompressContinue() repetitively to consume your input.
+* *srcSizePtr and *dstCapacityPtr can be any size.
+* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+* The content of @dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change @dst.
+* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency)
+* or 0 when a frame is completely decoded
+* or an error code, which can be tested using ZBUFFv05_isError().
+*
+* Hint : recommended buffer sizes (not compulsory) : ZBUFFv05_recommendedDInSize() / ZBUFFv05_recommendedDOutSize()
+* output : ZBUFFv05_recommendedDOutSize==128 KB; the block size is the internal unit, so it is always possible to write a full block once decoded.
+* input : ZBUFFv05_recommendedDInSize==128 KB + 3; just follow indications from ZBUFFv05_decompressContinue() to minimize latency. It should always be <= 128 KB + 3.
+* *******************************************************************************/
+
+
+/* *************************************
+* Tool functions
+***************************************/
+unsigned ZBUFFv05_isError(size_t errorCode);
+const char* ZBUFFv05_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+* These sizes are just hints, and tend to offer better latency */
+size_t ZBUFFv05_recommendedDInSize(void);
+size_t ZBUFFv05_recommendedDOutSize(void);
+
+
+
+/*-*************************************
+* Constants
+***************************************/
+#define ZSTDv05_MAGICNUMBER 0xFD2FB525 /* v0.5 */
+
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTDv05_H */
+/**** ended inlining zstd_v05.h ****/
+#endif
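+
+/* Illustrative sketch (not upstream zstd code) of the one-shot v0.5 dictionary API
+ * documented above; a NULL dict makes it equivalent to ZSTDv05_decompressDCtx(). This is
+ * the same pattern ZSTD_decompressLegacy() uses for case 5 further below. Guarded by
+ * `#if 0` so it is never compiled. */
+#if 0
+static size_t example_v05_decompressWithDict(void* dst, size_t dstCapacity,
+                                             const void* src, size_t srcSize,
+                                             const void* dict, size_t dictSize)
+{
+    ZSTDv05_DCtx* const dctx = ZSTDv05_createDCtx();
+    size_t result;
+    if (dctx == NULL) return (size_t)(-1);             /* allocation failure */
+    result = ZSTDv05_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, dict, dictSize);
+    ZSTDv05_freeDCtx(dctx);
+    return result;                                     /* test with ZSTDv05_isError() */
+}
+#endif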
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+/**** start inlining zstd_v06.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv06_H
+#define ZSTDv06_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*====== Dependency ======*/
+#include <stddef.h> /* size_t */
+
+
+/*====== Export for Windows ======*/
+/*!
+* ZSTDv06_DLL_EXPORT :
+* Enable exporting of functions when building a Windows DLL
+*/
+#if defined(_WIN32) && defined(ZSTDv06_DLL_EXPORT) && (ZSTDv06_DLL_EXPORT==1)
+# define ZSTDLIBv06_API __declspec(dllexport)
+#else
+# define ZSTDLIBv06_API
+#endif
+
+
+/* *************************************
+* Simple functions
+***************************************/
+/*! ZSTDv06_decompress() :
+ `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+ `dstCapacity` must be large enough, equal or larger than originalSize.
+ @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ or an errorCode if it fails (which can be tested using ZSTDv06_isError()) */
+ZSTDLIBv06_API size_t ZSTDv06_decompress( void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize);
+
+/**
+ZSTDv06_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.6.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv06_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+*/
+void ZSTDv06_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/* *************************************
+* Helper functions
+***************************************/
+ZSTDLIBv06_API size_t ZSTDv06_compressBound(size_t srcSize); /*!< maximum compressed size (worst case scenario) */
+
+/* Error Management */
+ZSTDLIBv06_API unsigned ZSTDv06_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+ZSTDLIBv06_API const char* ZSTDv06_getErrorName(size_t code); /*!< provides readable string for an error code */
+
+
+/* *************************************
+* Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv06_DCtx_s ZSTDv06_DCtx;
+ZSTDLIBv06_API ZSTDv06_DCtx* ZSTDv06_createDCtx(void);
+ZSTDLIBv06_API size_t ZSTDv06_freeDCtx(ZSTDv06_DCtx* dctx); /*!< @return : errorCode */
+
+/** ZSTDv06_decompressDCtx() :
+* Same as ZSTDv06_decompress(), but requires an already allocated ZSTDv06_DCtx (see ZSTDv06_createDCtx()) */
+ZSTDLIBv06_API size_t ZSTDv06_decompressDCtx(ZSTDv06_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+* Dictionary API
+*************************/
+/*! ZSTDv06_decompress_usingDict() :
+* Decompression using a pre-defined Dictionary content (see dictBuilder).
+* Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted.
+* Note : dict can be NULL, in which case, it's equivalent to ZSTDv06_decompressDCtx() */
+ZSTDLIBv06_API size_t ZSTDv06_decompress_usingDict(ZSTDv06_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize);
+
+
+/*-************************
+* Advanced Streaming API
+***************************/
+struct ZSTDv06_frameParams_s { unsigned long long frameContentSize; unsigned windowLog; };
+typedef struct ZSTDv06_frameParams_s ZSTDv06_frameParams;
+
+ZSTDLIBv06_API size_t ZSTDv06_getFrameParams(ZSTDv06_frameParams* fparamsPtr, const void* src, size_t srcSize); /**< doesn't consume input */
+ZSTDLIBv06_API size_t ZSTDv06_decompressBegin_usingDict(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIBv06_API void ZSTDv06_copyDCtx(ZSTDv06_DCtx* dctx, const ZSTDv06_DCtx* preparedDCtx);
+
+ZSTDLIBv06_API size_t ZSTDv06_nextSrcSizeToDecompress(ZSTDv06_DCtx* dctx);
+ZSTDLIBv06_API size_t ZSTDv06_decompressContinue(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+
+/* *************************************
+* ZBUFF API
+***************************************/
+
+typedef struct ZBUFFv06_DCtx_s ZBUFFv06_DCtx;
+ZSTDLIBv06_API ZBUFFv06_DCtx* ZBUFFv06_createDCtx(void);
+ZSTDLIBv06_API size_t ZBUFFv06_freeDCtx(ZBUFFv06_DCtx* dctx);
+
+ZSTDLIBv06_API size_t ZBUFFv06_decompressInit(ZBUFFv06_DCtx* dctx);
+ZSTDLIBv06_API size_t ZBUFFv06_decompressInitDictionary(ZBUFFv06_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZSTDLIBv06_API size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* dctx,
+ void* dst, size_t* dstCapacityPtr,
+ const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+* Streaming decompression howto
+*
+* A ZBUFFv06_DCtx object is required to track streaming operations.
+* Use ZBUFFv06_createDCtx() and ZBUFFv06_freeDCtx() to create/release resources.
+* Use ZBUFFv06_decompressInit() to start a new decompression operation,
+* or ZBUFFv06_decompressInitDictionary() if decompression requires a dictionary.
+* Note that ZBUFFv06_DCtx objects can be re-init multiple times.
+*
+* Use ZBUFFv06_decompressContinue() repetitively to consume your input.
+* *srcSizePtr and *dstCapacityPtr can be any size.
+* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+* or 0 when a frame is completely decoded,
+* or an error code, which can be tested using ZBUFFv06_isError().
+*
+* Hint : recommended buffer sizes (not compulsory) : ZBUFFv06_recommendedDInSize() and ZBUFFv06_recommendedDOutSize()
+* output : ZBUFFv06_recommendedDOutSize== 128 KB; the block size is the internal unit, so it is always possible to write a full block once decoded.
+* input : ZBUFFv06_recommendedDInSize == 128KB + 3;
+* just follow indications from ZBUFFv06_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+* Tool functions
+***************************************/
+ZSTDLIBv06_API unsigned ZBUFFv06_isError(size_t errorCode);
+ZSTDLIBv06_API const char* ZBUFFv06_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+* These sizes are just hints; they tend to offer better latency */
+ZSTDLIBv06_API size_t ZBUFFv06_recommendedDInSize(void);
+ZSTDLIBv06_API size_t ZBUFFv06_recommendedDOutSize(void);
+
+
+/*-*************************************
+* Constants
+***************************************/
+#define ZSTDv06_MAGICNUMBER 0xFD2FB526 /* v0.6 */
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTDv06_H */
+/**** ended inlining zstd_v06.h ****/
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+/**** start inlining zstd_v07.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv07_H_235446
+#define ZSTDv07_H_235446
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*====== Dependency ======*/
+#include <stddef.h> /* size_t */
+
+
+/*====== Export for Windows ======*/
+/*!
+* ZSTDv07_DLL_EXPORT :
+* Enable exporting of functions when building a Windows DLL
+*/
+#if defined(_WIN32) && defined(ZSTDv07_DLL_EXPORT) && (ZSTDv07_DLL_EXPORT==1)
+# define ZSTDLIBv07_API __declspec(dllexport)
+#else
+# define ZSTDLIBv07_API
+#endif
+
+
+/* *************************************
+* Simple API
+***************************************/
+/*! ZSTDv07_getDecompressedSize() :
+* @return : decompressed size if known, 0 otherwise.
+ note 1 : if `0`, follow up with ZSTDv07_getFrameParams() to know precise failure cause.
+ note 2 : decompressed size could be wrong or intentionally modified !
+ always ensure results fit within application's authorized limits */
+unsigned long long ZSTDv07_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTDv07_decompress() :
+ `compressedSize` : must be _exact_ size of compressed input, otherwise decompression will fail.
+ `dstCapacity` must be equal or larger than originalSize.
+ @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ or an errorCode if it fails (which can be tested using ZSTDv07_isError()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompress( void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize);
+
+/**
+ZSTDv07_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.7.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv07_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+*/
+void ZSTDv07_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/*====== Helper functions ======*/
+ZSTDLIBv07_API unsigned ZSTDv07_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+ZSTDLIBv07_API const char* ZSTDv07_getErrorName(size_t code); /*!< provides readable string from an error code */
+
+
+/*-*************************************
+* Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv07_DCtx_s ZSTDv07_DCtx;
+ZSTDLIBv07_API ZSTDv07_DCtx* ZSTDv07_createDCtx(void);
+ZSTDLIBv07_API size_t ZSTDv07_freeDCtx(ZSTDv07_DCtx* dctx); /*!< @return : errorCode */
+
+/** ZSTDv07_decompressDCtx() :
+* Same as ZSTDv07_decompress(), requires an allocated ZSTDv07_DCtx (see ZSTDv07_createDCtx()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompressDCtx(ZSTDv07_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-************************
+* Simple dictionary API
+***************************/
+/*! ZSTDv07_decompress_usingDict() :
+* Decompression using a pre-defined Dictionary content (see dictBuilder).
+* Dictionary must be identical to the one used during compression.
+* Note : This function loads the dictionary, resulting in a significant startup time */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDict(ZSTDv07_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize);
+
+
+/*-**************************
+* Advanced Dictionary API
+****************************/
+/*! ZSTDv07_createDDict() :
+* Create a digested dictionary, ready to start decompression operation without startup delay.
+* `dict` can be released after creation */
+typedef struct ZSTDv07_DDict_s ZSTDv07_DDict;
+ZSTDLIBv07_API ZSTDv07_DDict* ZSTDv07_createDDict(const void* dict, size_t dictSize);
+ZSTDLIBv07_API size_t ZSTDv07_freeDDict(ZSTDv07_DDict* ddict);
+
+/*! ZSTDv07_decompress_usingDDict() :
+* Decompression using a pre-digested Dictionary
+* Faster startup than ZSTDv07_decompress_usingDict(), recommended when same dictionary is used multiple times. */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDDict(ZSTDv07_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTDv07_DDict* ddict);
+
+typedef struct {
+ unsigned long long frameContentSize;
+ unsigned windowSize;
+ unsigned dictID;
+ unsigned checksumFlag;
+} ZSTDv07_frameParams;
+
+ZSTDLIBv07_API size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src, size_t srcSize); /**< doesn't consume input */
+
+
+
+
+/* *************************************
+* Streaming functions
+***************************************/
+typedef struct ZBUFFv07_DCtx_s ZBUFFv07_DCtx;
+ZSTDLIBv07_API ZBUFFv07_DCtx* ZBUFFv07_createDCtx(void);
+ZSTDLIBv07_API size_t ZBUFFv07_freeDCtx(ZBUFFv07_DCtx* dctx);
+
+ZSTDLIBv07_API size_t ZBUFFv07_decompressInit(ZBUFFv07_DCtx* dctx);
+ZSTDLIBv07_API size_t ZBUFFv07_decompressInitDictionary(ZBUFFv07_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZSTDLIBv07_API size_t ZBUFFv07_decompressContinue(ZBUFFv07_DCtx* dctx,
+ void* dst, size_t* dstCapacityPtr,
+ const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+* Streaming decompression howto
+*
+* A ZBUFFv07_DCtx object is required to track streaming operations.
+* Use ZBUFFv07_createDCtx() and ZBUFFv07_freeDCtx() to create/release resources.
+* Use ZBUFFv07_decompressInit() to start a new decompression operation,
+* or ZBUFFv07_decompressInitDictionary() if decompression requires a dictionary.
+* Note that ZBUFFv07_DCtx objects can be re-init multiple times.
+*
+* Use ZBUFFv07_decompressContinue() repetitively to consume your input.
+* *srcSizePtr and *dstCapacityPtr can be any size.
+* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+* or 0 when a frame is completely decoded,
+* or an error code, which can be tested using ZBUFFv07_isError().
+*
+* Hint : recommended buffer sizes (not compulsory) : ZBUFFv07_recommendedDInSize() and ZBUFFv07_recommendedDOutSize()
+* output : ZBUFFv07_recommendedDOutSize== 128 KB; the block size is the internal unit, so it is always possible to write a full block once decoded.
+* input : ZBUFFv07_recommendedDInSize == 128KB + 3;
+* just follow indications from ZBUFFv07_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+* Tool functions
+***************************************/
+ZSTDLIBv07_API unsigned ZBUFFv07_isError(size_t errorCode);
+ZSTDLIBv07_API const char* ZBUFFv07_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+* These sizes are just hints; they tend to offer better latency */
+ZSTDLIBv07_API size_t ZBUFFv07_recommendedDInSize(void);
+ZSTDLIBv07_API size_t ZBUFFv07_recommendedDOutSize(void);
+
+
+/*-*************************************
+* Constants
+***************************************/
+#define ZSTDv07_MAGICNUMBER 0xFD2FB527 /* v0.7 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTDv07_H_235446 */
+/**** ended inlining zstd_v07.h ****/
+#endif
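+
+/* Illustrative sketch (not upstream zstd code) of the v0.7 pre-digested dictionary flow
+ * documented above: digest the dictionary once into a ZSTDv07_DDict, reuse it for as many
+ * frames as needed, then release both objects. Names are illustrative only; guarded by
+ * `#if 0` so it is never compiled. */
+#if 0
+static size_t example_v07_decompressWithDDict(void* dst, size_t dstCapacity,
+                                              const void* src, size_t srcSize,
+                                              const void* dict, size_t dictSize)
+{
+    ZSTDv07_DCtx* const dctx = ZSTDv07_createDCtx();
+    ZSTDv07_DDict* const ddict = ZSTDv07_createDDict(dict, dictSize);   /* digest once, reuse many times */
+    size_t result = (size_t)(-1);                      /* allocation failure by default */
+    if (dctx != NULL && ddict != NULL)
+        result = ZSTDv07_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ddict);
+    if (ddict != NULL) ZSTDv07_freeDDict(ddict);
+    if (dctx != NULL) ZSTDv07_freeDCtx(dctx);
+    return result;                                     /* test with ZSTDv07_isError() */
+}
+#endif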
+
+/** ZSTD_isLegacy() :
+   @return : the legacy format version (> 0) if the frame uses a supported legacy format,
+             0 otherwise.
+*/
+MEM_STATIC unsigned ZSTD_isLegacy(const void* src, size_t srcSize)
+{
+ U32 magicNumberLE;
+ if (srcSize<4) return 0;
+ magicNumberLE = MEM_readLE32(src);
+ switch(magicNumberLE)
+ {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+ case ZSTDv01_magicNumberLE:return 1;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+ case ZSTDv02_magicNumber : return 2;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+ case ZSTDv03_magicNumber : return 3;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case ZSTDv04_magicNumber : return 4;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case ZSTDv05_MAGICNUMBER : return 5;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case ZSTDv06_MAGICNUMBER : return 6;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case ZSTDv07_MAGICNUMBER : return 7;
+#endif
+ default : return 0;
+ }
+}
+
+
+MEM_STATIC unsigned long long ZSTD_getDecompressedSize_legacy(const void* src, size_t srcSize)
+{
+ U32 const version = ZSTD_isLegacy(src, srcSize);
+ if (version < 5) return 0; /* no decompressed size in frame header, or not a legacy format */
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ if (version==5) {
+ ZSTDv05_parameters fParams;
+ size_t const frResult = ZSTDv05_getFrameParams(&fParams, src, srcSize);
+ if (frResult != 0) return 0;
+ return fParams.srcSize;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ if (version==6) {
+ ZSTDv06_frameParams fParams;
+ size_t const frResult = ZSTDv06_getFrameParams(&fParams, src, srcSize);
+ if (frResult != 0) return 0;
+ return fParams.frameContentSize;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ if (version==7) {
+ ZSTDv07_frameParams fParams;
+ size_t const frResult = ZSTDv07_getFrameParams(&fParams, src, srcSize);
+ if (frResult != 0) return 0;
+ return fParams.frameContentSize;
+ }
+#endif
+ return 0; /* should not be possible */
+}
+
+
+MEM_STATIC size_t ZSTD_decompressLegacy(
+ void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize,
+ const void* dict,size_t dictSize)
+{
+ U32 const version = ZSTD_isLegacy(src, compressedSize);
+ (void)dst; (void)dstCapacity; (void)dict; (void)dictSize; /* unused when ZSTD_LEGACY_SUPPORT >= 8 */
+ switch(version)
+ {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+ case 1 :
+ return ZSTDv01_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+ case 2 :
+ return ZSTDv02_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+ case 3 :
+ return ZSTDv03_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case 4 :
+ return ZSTDv04_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case 5 :
+ { size_t result;
+ ZSTDv05_DCtx* const zd = ZSTDv05_createDCtx();
+ if (zd==NULL) return ERROR(memory_allocation);
+ result = ZSTDv05_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+ ZSTDv05_freeDCtx(zd);
+ return result;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case 6 :
+ { size_t result;
+ ZSTDv06_DCtx* const zd = ZSTDv06_createDCtx();
+ if (zd==NULL) return ERROR(memory_allocation);
+ result = ZSTDv06_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+ ZSTDv06_freeDCtx(zd);
+ return result;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case 7 :
+ { size_t result;
+ ZSTDv07_DCtx* const zd = ZSTDv07_createDCtx();
+ if (zd==NULL) return ERROR(memory_allocation);
+ result = ZSTDv07_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+ ZSTDv07_freeDCtx(zd);
+ return result;
+ }
+#endif
+ default :
+ return ERROR(prefix_unknown);
+ }
+}
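+
+/* Illustrative sketch (not upstream zstd code) tying the two helpers above together:
+ * probe the magic number with ZSTD_isLegacy(), send legacy frames to
+ * ZSTD_decompressLegacy(), and hand everything else to a caller-supplied modern path.
+ * `decompressModern` is a hypothetical callback; guarded by `#if 0` so it is never
+ * compiled. */
+#if 0
+static size_t example_dispatchLegacy(void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     size_t (*decompressModern)(void*, size_t, const void*, size_t))
+{
+    unsigned const legacyVersion = ZSTD_isLegacy(src, srcSize);    /* 0, or 1..7 */
+    if (legacyVersion)
+        return ZSTD_decompressLegacy(dst, dstCapacity, src, srcSize, NULL, 0);  /* no dictionary */
+    return decompressModern(dst, dstCapacity, src, srcSize);
+}
+#endif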
+
+MEM_STATIC ZSTD_frameSizeInfo ZSTD_findFrameSizeInfoLegacy(const void *src, size_t srcSize)
+{
+ ZSTD_frameSizeInfo frameSizeInfo;
+ U32 const version = ZSTD_isLegacy(src, srcSize);
+ switch(version)
+ {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+ case 1 :
+ ZSTDv01_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+ case 2 :
+ ZSTDv02_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+ case 3 :
+ ZSTDv03_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case 4 :
+ ZSTDv04_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case 5 :
+ ZSTDv05_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case 6 :
+ ZSTDv06_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case 7 :
+ ZSTDv07_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+ default :
+ frameSizeInfo.compressedSize = ERROR(prefix_unknown);
+ frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR;
+ break;
+ }
+ if (!ZSTD_isError(frameSizeInfo.compressedSize) && frameSizeInfo.compressedSize > srcSize) {
+ frameSizeInfo.compressedSize = ERROR(srcSize_wrong);
+ frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR;
+ }
+ return frameSizeInfo;
+}
+
+MEM_STATIC size_t ZSTD_findFrameCompressedSizeLegacy(const void *src, size_t srcSize)
+{
+ ZSTD_frameSizeInfo frameSizeInfo = ZSTD_findFrameSizeInfoLegacy(src, srcSize);
+ return frameSizeInfo.compressedSize;
+}
+
+MEM_STATIC size_t ZSTD_freeLegacyStreamContext(void* legacyContext, U32 version)
+{
+ switch(version)
+ {
+ default :
+ case 1 :
+ case 2 :
+ case 3 :
+ (void)legacyContext;
+ return ERROR(version_unsupported);
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case 4 : return ZBUFFv04_freeDCtx((ZBUFFv04_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case 5 : return ZBUFFv05_freeDCtx((ZBUFFv05_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case 6 : return ZBUFFv06_freeDCtx((ZBUFFv06_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case 7 : return ZBUFFv07_freeDCtx((ZBUFFv07_DCtx*)legacyContext);
+#endif
+ }
+}
+
+
+MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U32 newVersion,
+ const void* dict, size_t dictSize)
+{
+ DEBUGLOG(5, "ZSTD_initLegacyStream for v0.%u", newVersion);
+ if (prevVersion != newVersion) ZSTD_freeLegacyStreamContext(*legacyContext, prevVersion);
+ switch(newVersion)
+ {
+ default :
+ case 1 :
+ case 2 :
+ case 3 :
+ (void)dict; (void)dictSize;
+ return 0;
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case 4 :
+ {
+ ZBUFFv04_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv04_createDCtx() : (ZBUFFv04_DCtx*)*legacyContext;
+ if (dctx==NULL) return ERROR(memory_allocation);
+ ZBUFFv04_decompressInit(dctx);
+ ZBUFFv04_decompressWithDictionary(dctx, dict, dictSize);
+ *legacyContext = dctx;
+ return 0;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case 5 :
+ {
+ ZBUFFv05_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv05_createDCtx() : (ZBUFFv05_DCtx*)*legacyContext;
+ if (dctx==NULL) return ERROR(memory_allocation);
+ ZBUFFv05_decompressInitDictionary(dctx, dict, dictSize);
+ *legacyContext = dctx;
+ return 0;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case 6 :
+ {
+ ZBUFFv06_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv06_createDCtx() : (ZBUFFv06_DCtx*)*legacyContext;
+ if (dctx==NULL) return ERROR(memory_allocation);
+ ZBUFFv06_decompressInitDictionary(dctx, dict, dictSize);
+ *legacyContext = dctx;
+ return 0;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case 7 :
+ {
+ ZBUFFv07_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv07_createDCtx() : (ZBUFFv07_DCtx*)*legacyContext;
+ if (dctx==NULL) return ERROR(memory_allocation);
+ ZBUFFv07_decompressInitDictionary(dctx, dict, dictSize);
+ *legacyContext = dctx;
+ return 0;
+ }
+#endif
+ }
+}
+
+
+
+MEM_STATIC size_t ZSTD_decompressLegacyStream(void* legacyContext, U32 version,
+ ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+ DEBUGLOG(5, "ZSTD_decompressLegacyStream for v0.%u", version);
+ switch(version)
+ {
+ default :
+ case 1 :
+ case 2 :
+ case 3 :
+ (void)legacyContext; (void)output; (void)input;
+ return ERROR(version_unsupported);
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case 4 :
+ {
+ ZBUFFv04_DCtx* dctx = (ZBUFFv04_DCtx*) legacyContext;
+ const void* src = (const char*)input->src + input->pos;
+ size_t readSize = input->size - input->pos;
+ void* dst = (char*)output->dst + output->pos;
+ size_t decodedSize = output->size - output->pos;
+ size_t const hintSize = ZBUFFv04_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+ output->pos += decodedSize;
+ input->pos += readSize;
+ return hintSize;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case 5 :
+ {
+ ZBUFFv05_DCtx* dctx = (ZBUFFv05_DCtx*) legacyContext;
+ const void* src = (const char*)input->src + input->pos;
+ size_t readSize = input->size - input->pos;
+ void* dst = (char*)output->dst + output->pos;
+ size_t decodedSize = output->size - output->pos;
+ size_t const hintSize = ZBUFFv05_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+ output->pos += decodedSize;
+ input->pos += readSize;
+ return hintSize;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case 6 :
+ {
+ ZBUFFv06_DCtx* dctx = (ZBUFFv06_DCtx*) legacyContext;
+ const void* src = (const char*)input->src + input->pos;
+ size_t readSize = input->size - input->pos;
+ void* dst = (char*)output->dst + output->pos;
+ size_t decodedSize = output->size - output->pos;
+ size_t const hintSize = ZBUFFv06_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+ output->pos += decodedSize;
+ input->pos += readSize;
+ return hintSize;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case 7 :
+ {
+ ZBUFFv07_DCtx* dctx = (ZBUFFv07_DCtx*) legacyContext;
+ const void* src = (const char*)input->src + input->pos;
+ size_t readSize = input->size - input->pos;
+ void* dst = (char*)output->dst + output->pos;
+ size_t decodedSize = output->size - output->pos;
+ size_t const hintSize = ZBUFFv07_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+ output->pos += decodedSize;
+ input->pos += readSize;
+ return hintSize;
+ }
+#endif
+ }
+}
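+
+/* Illustrative caller-side loop (not upstream zstd code) for the two stream helpers above:
+ * initialize a legacy context for the detected version, then pump ZSTD_inBuffer /
+ * ZSTD_outBuffer pairs through ZSTD_decompressLegacyStream() until the hint reaches 0.
+ * Error handling and no-progress protection are intentionally minimal; guarded by `#if 0`
+ * so it is never compiled. */
+#if 0
+static size_t example_legacyStreamDecompress(void* dst, size_t dstCapacity,
+                                             const void* src, size_t srcSize)
+{
+    void* legacyContext = NULL;
+    U32 const version = ZSTD_isLegacy(src, srcSize);
+    ZSTD_inBuffer input = { src, srcSize, 0 };
+    ZSTD_outBuffer output = { dst, dstCapacity, 0 };
+    if (version < 4) return ERROR(version_unsupported);             /* streaming needs v0.4+ */
+    {   size_t const initResult = ZSTD_initLegacyStream(&legacyContext, 0, version, NULL, 0);
+        if (ZSTD_isError(initResult)) return initResult;
+    }
+    while (input.pos < input.size) {
+        size_t const hint = ZSTD_decompressLegacyStream(legacyContext, version, &output, &input);
+        if (ZSTD_isError(hint)) { ZSTD_freeLegacyStreamContext(legacyContext, version); return hint; }
+        if (hint == 0) break;                                       /* frame completely decoded */
+    }
+    {   size_t const written = output.pos;
+        ZSTD_freeLegacyStreamContext(legacyContext, version);
+        return written;
+    }
+}
+#endif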
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_LEGACY_H */
+/**** ended inlining ../legacy/zstd_legacy.h ****/
+#endif
+
+
+
+/*-*******************************************************
+* Types
+*********************************************************/
+struct ZSTD_DDict_s {
+ void* dictBuffer;
+ const void* dictContent;
+ size_t dictSize;
+ ZSTD_entropyDTables_t entropy;
+ U32 dictID;
+ U32 entropyPresent;
+ ZSTD_customMem cMem;
+}; /* typedef'd to ZSTD_DDict within "zstd.h" */
+
+const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict)
+{
+ assert(ddict != NULL);
+ return ddict->dictContent;
+}
+
+size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict)
+{
+ assert(ddict != NULL);
+ return ddict->dictSize;
+}
+
+void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+ DEBUGLOG(4, "ZSTD_copyDDictParameters");
+ assert(dctx != NULL);
+ assert(ddict != NULL);
+ dctx->dictID = ddict->dictID;
+ dctx->prefixStart = ddict->dictContent;
+ dctx->virtualStart = ddict->dictContent;
+ dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
+ dctx->previousDstEnd = dctx->dictEnd;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ dctx->dictContentBeginForFuzzing = dctx->prefixStart;
+ dctx->dictContentEndForFuzzing = dctx->previousDstEnd;
+#endif
+ if (ddict->entropyPresent) {
+ dctx->litEntropy = 1;
+ dctx->fseEntropy = 1;
+ dctx->LLTptr = ddict->entropy.LLTable;
+ dctx->MLTptr = ddict->entropy.MLTable;
+ dctx->OFTptr = ddict->entropy.OFTable;
+ dctx->HUFptr = ddict->entropy.hufTable;
+ dctx->entropy.rep[0] = ddict->entropy.rep[0];
+ dctx->entropy.rep[1] = ddict->entropy.rep[1];
+ dctx->entropy.rep[2] = ddict->entropy.rep[2];
+ } else {
+ dctx->litEntropy = 0;
+ dctx->fseEntropy = 0;
+ }
+}
+
+
+static size_t
+ZSTD_loadEntropy_intoDDict(ZSTD_DDict* ddict,
+ ZSTD_dictContentType_e dictContentType)
+{
+ ddict->dictID = 0;
+ ddict->entropyPresent = 0;
+ if (dictContentType == ZSTD_dct_rawContent) return 0;
+
+ if (ddict->dictSize < 8) {
+ if (dictContentType == ZSTD_dct_fullDict)
+ return ERROR(dictionary_corrupted); /* only accept specified dictionaries */
+ return 0; /* pure content mode */
+ }
+ { U32 const magic = MEM_readLE32(ddict->dictContent);
+ if (magic != ZSTD_MAGIC_DICTIONARY) {
+ if (dictContentType == ZSTD_dct_fullDict)
+ return ERROR(dictionary_corrupted); /* only accept specified dictionaries */
+ return 0; /* pure content mode */
+ }
+ }
+ ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE);
+
+ /* load entropy tables */
+ RETURN_ERROR_IF(ZSTD_isError(ZSTD_loadDEntropy(
+ &ddict->entropy, ddict->dictContent, ddict->dictSize)),
+ dictionary_corrupted, "");
+ ddict->entropyPresent = 1;
+ return 0;
+}
+
+
+static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType)
+{
+ if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) {
+ ddict->dictBuffer = NULL;
+ ddict->dictContent = dict;
+ if (!dict) dictSize = 0;
+ } else {
+ void* const internalBuffer = ZSTD_malloc(dictSize, ddict->cMem);
+ ddict->dictBuffer = internalBuffer;
+ ddict->dictContent = internalBuffer;
+ if (!internalBuffer) return ERROR(memory_allocation);
+ memcpy(internalBuffer, dict, dictSize);
+ }
+ ddict->dictSize = dictSize;
+ ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
+
+ /* parse dictionary content */
+ FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
+
+ return 0;
+}
+
+ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_customMem customMem)
+{
+ if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+
+ { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem);
+ if (ddict == NULL) return NULL;
+ ddict->cMem = customMem;
+ { size_t const initResult = ZSTD_initDDict_internal(ddict,
+ dict, dictSize,
+ dictLoadMethod, dictContentType);
+ if (ZSTD_isError(initResult)) {
+ ZSTD_freeDDict(ddict);
+ return NULL;
+ } }
+ return ddict;
+ }
+}
+
+/*! ZSTD_createDDict() :
+* Create a digested dictionary, to start decompression without startup delay.
+* `dict` content is copied inside DDict.
+* Consequently, `dict` can be released after `ZSTD_DDict` creation */
+ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize)
+{
+ ZSTD_customMem const allocator = { NULL, NULL, NULL };
+ return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator);
+}
+
+/*! ZSTD_createDDict_byReference() :
+ * Create a digested dictionary, to start decompression without startup delay.
+ * Dictionary content is simply referenced; it will be accessed during decompression.
+ * Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */
+ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize)
+{
+ ZSTD_customMem const allocator = { NULL, NULL, NULL };
+ return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator);
+}
+
+
+const ZSTD_DDict* ZSTD_initStaticDDict(
+ void* sBuffer, size_t sBufferSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType)
+{
+ size_t const neededSpace = sizeof(ZSTD_DDict)
+ + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+ ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer;
+ assert(sBuffer != NULL);
+ assert(dict != NULL);
+ if ((size_t)sBuffer & 7) return NULL; /* 8-aligned */
+ if (sBufferSize < neededSpace) return NULL;
+ if (dictLoadMethod == ZSTD_dlm_byCopy) {
+ memcpy(ddict+1, dict, dictSize); /* local copy */
+ dict = ddict+1;
+ }
+ if (ZSTD_isError( ZSTD_initDDict_internal(ddict,
+ dict, dictSize,
+ ZSTD_dlm_byRef, dictContentType) ))
+ return NULL;
+ return ddict;
+}
+
+
+size_t ZSTD_freeDDict(ZSTD_DDict* ddict)
+{
+ if (ddict==NULL) return 0; /* support free on NULL */
+ { ZSTD_customMem const cMem = ddict->cMem;
+ ZSTD_free(ddict->dictBuffer, cMem);
+ ZSTD_free(ddict, cMem);
+ return 0;
+ }
+}
+
+/*! ZSTD_estimateDDictSize() :
+ * Estimate amount of memory that will be needed to create a dictionary for decompression.
+ * Note : dictionaries created by reference using ZSTD_dlm_byRef are smaller */
+size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod)
+{
+ return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+}
+
+size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
+{
+ if (ddict==NULL) return 0; /* support sizeof on NULL */
+ return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ;
+}
+
+/*! ZSTD_getDictID_fromDDict() :
+ * Provides the dictID of the dictionary loaded into `ddict`.
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
+{
+ if (ddict==NULL) return 0;
+ return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
+}
+/**** ended inlining decompress/zstd_ddict.c ****/
+/**** start inlining decompress/zstd_decompress.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ***************************************************************
+* Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTD_decompress() allocates its context,
+ * on stack (0), or into heap (1, default; requires malloc()).
+ * Note that functions with explicit context such as ZSTD_decompressDCtx() are unaffected.
+ */
+#ifndef ZSTD_HEAPMODE
+# define ZSTD_HEAPMODE 1
+#endif
+
+/*!
+* LEGACY_SUPPORT :
+* if set to 1+, ZSTD_decompress() can decode older formats (v0.1+)
+*/
+#ifndef ZSTD_LEGACY_SUPPORT
+# define ZSTD_LEGACY_SUPPORT 0
+#endif
+
+/*!
+ * MAXWINDOWSIZE_DEFAULT :
+ * maximum window size accepted by DStream __by default__.
+ * Frames requiring more memory will be rejected.
+ * It's possible to set a different limit using ZSTD_DCtx_setMaxWindowSize().
+ */
+#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT
+# define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) + 1)
+#endif
+
+/*!
+ * NO_FORWARD_PROGRESS_MAX :
+ * maximum allowed nb of calls to ZSTD_decompressStream()
+ * without any forward progress
+ * (defined as: no byte read from input, and no byte flushed to output)
+ * before triggering an error.
+ */
+#ifndef ZSTD_NO_FORWARD_PROGRESS_MAX
+# define ZSTD_NO_FORWARD_PROGRESS_MAX 16
+#endif
+
+
+/*-*******************************************************
+* Dependencies
+*********************************************************/
+#include <string.h> /* memcpy, memmove, memset */
+/**** skipping file: ../common/cpu.h ****/
+/**** skipping file: ../common/mem.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+/**** skipping file: zstd_decompress_internal.h ****/
+/**** skipping file: zstd_ddict.h ****/
+/**** start inlining zstd_decompress_block.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#ifndef ZSTD_DEC_BLOCK_H
+#define ZSTD_DEC_BLOCK_H
+
+/*-*******************************************************
+ * Dependencies
+ *********************************************************/
+#include <stddef.h> /* size_t */
+/**** skipping file: ../zstd.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+/**** skipping file: zstd_decompress_internal.h ****/
+
+
+/* === Prototypes === */
+
+/* note: prototypes already published within `zstd.h` :
+ * ZSTD_decompressBlock()
+ */
+
+/* note: prototypes already published within `zstd_internal.h` :
+ * ZSTD_getcBlockSize()
+ * ZSTD_decodeSeqHeaders()
+ */
+
+
+/* ZSTD_decompressBlock_internal() :
+ * decompress block, starting at `src`,
+ * into destination buffer `dst`.
+ * @return : decompressed block size,
+ * or an error code (which can be tested using ZSTD_isError())
+ */
+size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, const int frame);
+
+/* ZSTD_buildFSETable() :
+ * generate FSE decoding table for one symbol (ll, ml or off)
+ * this function must be called with valid parameters only
+ * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.)
+ * in which case it cannot fail.
+ * Internal use only.
+ */
+void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U32* nbAdditionalBits,
+ unsigned tableLog);
+
+
+#endif /* ZSTD_DEC_BLOCK_H */
+/**** ended inlining zstd_decompress_block.h ****/
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+/**** skipping file: ../legacy/zstd_legacy.h ****/
+#endif
+
+
+/*-*************************************************************
+* Context management
+***************************************************************/
+size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx)
+{
+ if (dctx==NULL) return 0; /* support sizeof NULL */
+ return sizeof(*dctx)
+ + ZSTD_sizeof_DDict(dctx->ddictLocal)
+ + dctx->inBuffSize + dctx->outBuffSize;
+}
+
+size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); }
+
+
+static size_t ZSTD_startingInputLength(ZSTD_format_e format)
+{
+ size_t const startingInputLength = ZSTD_FRAMEHEADERSIZE_PREFIX(format);
+ /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */
+ assert( (format == ZSTD_f_zstd1) || (format == ZSTD_f_zstd1_magicless) );
+ return startingInputLength;
+}
+
+static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
+{
+ dctx->format = ZSTD_f_zstd1; /* ZSTD_decompressBegin() invokes ZSTD_startingInputLength() with argument dctx->format */
+ dctx->staticSize = 0;
+ dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
+ dctx->ddict = NULL;
+ dctx->ddictLocal = NULL;
+ dctx->dictEnd = NULL;
+ dctx->ddictIsCold = 0;
+ dctx->dictUses = ZSTD_dont_use;
+ dctx->inBuff = NULL;
+ dctx->inBuffSize = 0;
+ dctx->outBuffSize = 0;
+ dctx->streamStage = zdss_init;
+ dctx->legacyContext = NULL;
+ dctx->previousLegacyVersion = 0;
+ dctx->noForwardProgress = 0;
+ dctx->oversizedDuration = 0;
+ dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+ dctx->outBufferMode = ZSTD_obm_buffered;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ dctx->dictContentEndForFuzzing = NULL;
+#endif
+}
+
+ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize)
+{
+ ZSTD_DCtx* const dctx = (ZSTD_DCtx*) workspace;
+
+ if ((size_t)workspace & 7) return NULL; /* 8-aligned */
+ if (workspaceSize < sizeof(ZSTD_DCtx)) return NULL; /* minimum size */
+
+ ZSTD_initDCtx_internal(dctx);
+ dctx->staticSize = workspaceSize;
+ dctx->inBuff = (char*)(dctx+1);
+ return dctx;
+}
+
+ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem)
+{
+ if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+
+ { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_malloc(sizeof(*dctx), customMem);
+ if (!dctx) return NULL;
+ dctx->customMem = customMem;
+ ZSTD_initDCtx_internal(dctx);
+ return dctx;
+ }
+}
+
+ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+ DEBUGLOG(3, "ZSTD_createDCtx");
+ return ZSTD_createDCtx_advanced(ZSTD_defaultCMem);
+}
+
+static void ZSTD_clearDict(ZSTD_DCtx* dctx)
+{
+ ZSTD_freeDDict(dctx->ddictLocal);
+ dctx->ddictLocal = NULL;
+ dctx->ddict = NULL;
+ dctx->dictUses = ZSTD_dont_use;
+}
+
+size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+ if (dctx==NULL) return 0; /* support free on NULL */
+ RETURN_ERROR_IF(dctx->staticSize, memory_allocation, "not compatible with static DCtx");
+ { ZSTD_customMem const cMem = dctx->customMem;
+ ZSTD_clearDict(dctx);
+ ZSTD_free(dctx->inBuff, cMem);
+ dctx->inBuff = NULL;
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+ if (dctx->legacyContext)
+ ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion);
+#endif
+ ZSTD_free(dctx, cMem);
+ return 0;
+ }
+}
+
+/* no longer useful */
+void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
+{
+ size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx);
+ memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */
+}
+
+
+/*-*************************************************************
+ * Frame header decoding
+ ***************************************************************/
+
+/*! ZSTD_isFrame() :
+ * Tells if the content of `buffer` starts with a valid Frame Identifier.
+ * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ * Note 3 : Skippable Frame Identifiers are considered valid. */
+unsigned ZSTD_isFrame(const void* buffer, size_t size)
+{
+ if (size < ZSTD_FRAMEIDSIZE) return 0;
+ { U32 const magic = MEM_readLE32(buffer);
+ if (magic == ZSTD_MAGICNUMBER) return 1;
+ if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1;
+ }
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+ if (ZSTD_isLegacy(buffer, size)) return 1;
+#endif
+ return 0;
+}
+
+/** ZSTD_frameHeaderSize_internal() :
+ * srcSize must be large enough to reach header size fields.
+ * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless.
+ * @return : size of the Frame Header
+ * or an error code, which can be tested with ZSTD_isError() */
+static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format)
+{
+ size_t const minInputSize = ZSTD_startingInputLength(format);
+ RETURN_ERROR_IF(srcSize < minInputSize, srcSize_wrong, "");
+
+ { BYTE const fhd = ((const BYTE*)src)[minInputSize-1];
+ U32 const dictID= fhd & 3;
+ U32 const singleSegment = (fhd >> 5) & 1;
+ U32 const fcsId = fhd >> 6;
+ return minInputSize + !singleSegment
+ + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId]
+ + (singleSegment && !fcsId);
+ }
+}
+
+/** ZSTD_frameHeaderSize() :
+ * srcSize must be >= ZSTD_frameHeaderSize_prefix.
+ * @return : size of the Frame Header,
+ * or an error code (if srcSize is too small) */
+size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
+{
+ return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1);
+}
+
+
+/** ZSTD_getFrameHeader_advanced() :
+ * decode Frame Header, or require larger `srcSize`.
+ * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
+ * @return : 0, `zfhPtr` is correctly filled,
+ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ * or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
+{
+ const BYTE* ip = (const BYTE*)src;
+ size_t const minInputSize = ZSTD_startingInputLength(format);
+
+    memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzers do not understand that zfhPtr will only be read if the return value is zero, since these are 2 different signals */
+ if (srcSize < minInputSize) return minInputSize;
+ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter");
+
+ if ( (format != ZSTD_f_zstd1_magicless)
+ && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
+ if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+ /* skippable frame */
+ if (srcSize < ZSTD_SKIPPABLEHEADERSIZE)
+ return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */
+ memset(zfhPtr, 0, sizeof(*zfhPtr));
+ zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE);
+ zfhPtr->frameType = ZSTD_skippableFrame;
+ return 0;
+ }
+ RETURN_ERROR(prefix_unknown, "");
+ }
+
+ /* ensure there is enough `srcSize` to fully read/decode frame header */
+ { size_t const fhsize = ZSTD_frameHeaderSize_internal(src, srcSize, format);
+ if (srcSize < fhsize) return fhsize;
+ zfhPtr->headerSize = (U32)fhsize;
+ }
+
+ { BYTE const fhdByte = ip[minInputSize-1];
+ size_t pos = minInputSize;
+ U32 const dictIDSizeCode = fhdByte&3;
+ U32 const checksumFlag = (fhdByte>>2)&1;
+ U32 const singleSegment = (fhdByte>>5)&1;
+ U32 const fcsID = fhdByte>>6;
+ U64 windowSize = 0;
+ U32 dictID = 0;
+ U64 frameContentSize = ZSTD_CONTENTSIZE_UNKNOWN;
+ RETURN_ERROR_IF((fhdByte & 0x08) != 0, frameParameter_unsupported,
+ "reserved bits, must be zero");
+
+ if (!singleSegment) {
+ BYTE const wlByte = ip[pos++];
+ U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN;
+ RETURN_ERROR_IF(windowLog > ZSTD_WINDOWLOG_MAX, frameParameter_windowTooLarge, "");
+ windowSize = (1ULL << windowLog);
+ windowSize += (windowSize >> 3) * (wlByte&7);
+ }
+ switch(dictIDSizeCode)
+ {
+ default: assert(0); /* impossible */
+ case 0 : break;
+ case 1 : dictID = ip[pos]; pos++; break;
+ case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break;
+ case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break;
+ }
+ switch(fcsID)
+ {
+ default: assert(0); /* impossible */
+ case 0 : if (singleSegment) frameContentSize = ip[pos]; break;
+ case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break;
+ case 2 : frameContentSize = MEM_readLE32(ip+pos); break;
+ case 3 : frameContentSize = MEM_readLE64(ip+pos); break;
+ }
+ if (singleSegment) windowSize = frameContentSize;
+
+ zfhPtr->frameType = ZSTD_frame;
+ zfhPtr->frameContentSize = frameContentSize;
+ zfhPtr->windowSize = windowSize;
+ zfhPtr->blockSizeMax = (unsigned) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+ zfhPtr->dictID = dictID;
+ zfhPtr->checksumFlag = checksumFlag;
+ }
+ return 0;
+}
+
+/** ZSTD_getFrameHeader() :
+ * decode Frame Header, or require larger `srcSize`.
+ * note : this function does not consume input, it only reads it.
+ * @return : 0, `zfhPtr` is correctly filled,
+ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ * or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize)
+{
+ return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1);
+}
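+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): the intended calling
+ * pattern for ZSTD_getFrameHeader(), separating "need more input" from real errors.
+ * Assumes the advanced (ZSTD_STATIC_LINKING_ONLY) userland declarations.
+ */
+#if 0   /* example only, not compiled */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <zstd.h>
+
+/* Returns 0 on success, the number of additional input bytes needed, or (size_t)-1 on error. */
+static size_t probe_frame_header(ZSTD_frameHeader* zfh, const void* src, size_t srcSize)
+{
+    size_t const ret = ZSTD_getFrameHeader(zfh, src, srcSize);
+    if (ZSTD_isError(ret)) return (size_t)-1;  /* not a zstd frame, or corrupted header */
+    if (ret > 0) return ret - srcSize;         /* header incomplete: ret is the wanted total srcSize */
+    return 0;                                  /* zfh now holds windowSize, frameContentSize, dictID, ... */
+}
+#endif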
+
+
+/** ZSTD_getFrameContentSize() :
+ * compatible with legacy mode
+ *  @return : decompressed size of the single frame pointed to by `src` if known, otherwise
+ * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */
+unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize)
+{
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+ if (ZSTD_isLegacy(src, srcSize)) {
+ unsigned long long const ret = ZSTD_getDecompressedSize_legacy(src, srcSize);
+ return ret == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : ret;
+ }
+#endif
+ { ZSTD_frameHeader zfh;
+ if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0)
+ return ZSTD_CONTENTSIZE_ERROR;
+ if (zfh.frameType == ZSTD_skippableFrame) {
+ return 0;
+ } else {
+ return zfh.frameContentSize;
+ } }
+}
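+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): using
+ * ZSTD_getFrameContentSize() to size the destination buffer for a one-shot
+ * decompression. Assumes the public userland <zstd.h> API and a single-frame input.
+ */
+#if 0   /* example only, not compiled */
+#include <stdlib.h>
+#include <zstd.h>
+
+static void* decompress_whole_frame(const void* src, size_t srcSize, size_t* dstSizeOut)
+{
+    unsigned long long const contentSize = ZSTD_getFrameContentSize(src, srcSize);
+    void* dst;
+    if (contentSize == ZSTD_CONTENTSIZE_ERROR) return NULL;        /* not a valid frame */
+    if (contentSize == ZSTD_CONTENTSIZE_UNKNOWN) return NULL;      /* size not stored: use streaming instead */
+    if (contentSize > (unsigned long long)(size_t)-1) return NULL; /* would not fit in a size_t */
+    dst = malloc(contentSize ? (size_t)contentSize : 1);
+    if (dst == NULL) return NULL;
+    { size_t const dSize = ZSTD_decompress(dst, (size_t)contentSize, src, srcSize);
+      if (ZSTD_isError(dSize)) { free(dst); return NULL; }
+      *dstSizeOut = dSize;
+    }
+    return dst;
+}
+#endif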
+
+static size_t readSkippableFrameSize(void const* src, size_t srcSize)
+{
+ size_t const skippableHeaderSize = ZSTD_SKIPPABLEHEADERSIZE;
+ U32 sizeU32;
+
+ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, "");
+
+ sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE);
+ RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32,
+ frameParameter_unsupported, "");
+ {
+ size_t const skippableSize = skippableHeaderSize + sizeU32;
+ RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, "");
+ return skippableSize;
+ }
+}
+
+/** ZSTD_findDecompressedSize() :
+ * compatible with legacy mode
+ * `srcSize` must be the exact length of some number of ZSTD compressed and/or
+ * skippable frames
+ * @return : decompressed size of the frames contained */
+unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
+{
+ unsigned long long totalDstSize = 0;
+
+ while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) {
+ U32 const magicNumber = MEM_readLE32(src);
+
+ if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+ size_t const skippableSize = readSkippableFrameSize(src, srcSize);
+ if (ZSTD_isError(skippableSize)) {
+ return ZSTD_CONTENTSIZE_ERROR;
+ }
+ assert(skippableSize <= srcSize);
+
+ src = (const BYTE *)src + skippableSize;
+ srcSize -= skippableSize;
+ continue;
+ }
+
+ { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
+ if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret;
+
+ /* check for overflow */
+ if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR;
+ totalDstSize += ret;
+ }
+ { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
+ if (ZSTD_isError(frameSrcSize)) {
+ return ZSTD_CONTENTSIZE_ERROR;
+ }
+
+ src = (const BYTE *)src + frameSrcSize;
+ srcSize -= frameSrcSize;
+ }
+ } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */
+
+ if (srcSize) return ZSTD_CONTENTSIZE_ERROR;
+
+ return totalDstSize;
+}
+
+/** ZSTD_getDecompressedSize() :
+ * compatible with legacy mode
+ * @return : decompressed size if known, 0 otherwise
+ note : 0 can mean any of the following :
+ - frame content is empty
+ - decompressed size field is not present in frame header
+ - frame header unknown / not supported
+ - frame header not complete (`srcSize` too small) */
+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize)
+{
+ unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
+ ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_ERROR < ZSTD_CONTENTSIZE_UNKNOWN);
+ return (ret >= ZSTD_CONTENTSIZE_ERROR) ? 0 : ret;
+}
+
+
+/** ZSTD_decodeFrameHeader() :
+ * `headerSize` must be the size provided by ZSTD_frameHeaderSize().
+ * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */
+static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize)
+{
+ size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format);
+ if (ZSTD_isError(result)) return result; /* invalid header */
+ RETURN_ERROR_IF(result>0, srcSize_wrong, "headerSize too small");
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ /* Skip the dictID check in fuzzing mode, because it makes the search
+ * harder.
+ */
+ RETURN_ERROR_IF(dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID),
+ dictionary_wrong, "");
+#endif
+ if (dctx->fParams.checksumFlag) XXH64_reset(&dctx->xxhState, 0);
+ return 0;
+}
+
+static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret)
+{
+ ZSTD_frameSizeInfo frameSizeInfo;
+ frameSizeInfo.compressedSize = ret;
+ frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR;
+ return frameSizeInfo;
+}
+
+static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize)
+{
+ ZSTD_frameSizeInfo frameSizeInfo;
+ memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo));
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+ if (ZSTD_isLegacy(src, srcSize))
+ return ZSTD_findFrameSizeInfoLegacy(src, srcSize);
+#endif
+
+ if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE)
+ && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+ frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize);
+ assert(ZSTD_isError(frameSizeInfo.compressedSize) ||
+ frameSizeInfo.compressedSize <= srcSize);
+ return frameSizeInfo;
+ } else {
+ const BYTE* ip = (const BYTE*)src;
+ const BYTE* const ipstart = ip;
+ size_t remainingSize = srcSize;
+ size_t nbBlocks = 0;
+ ZSTD_frameHeader zfh;
+
+ /* Extract Frame Header */
+ { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize);
+ if (ZSTD_isError(ret))
+ return ZSTD_errorFrameSizeInfo(ret);
+ if (ret > 0)
+ return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));
+ }
+
+ ip += zfh.headerSize;
+ remainingSize -= zfh.headerSize;
+
+ /* Iterate over each block */
+ while (1) {
+ blockProperties_t blockProperties;
+ size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+ if (ZSTD_isError(cBlockSize))
+ return ZSTD_errorFrameSizeInfo(cBlockSize);
+
+ if (ZSTD_blockHeaderSize + cBlockSize > remainingSize)
+ return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));
+
+ ip += ZSTD_blockHeaderSize + cBlockSize;
+ remainingSize -= ZSTD_blockHeaderSize + cBlockSize;
+ nbBlocks++;
+
+ if (blockProperties.lastBlock) break;
+ }
+
+ /* Final frame content checksum */
+ if (zfh.checksumFlag) {
+ if (remainingSize < 4)
+ return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));
+ ip += 4;
+ }
+
+ frameSizeInfo.compressedSize = ip - ipstart;
+ frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN)
+ ? zfh.frameContentSize
+ : nbBlocks * zfh.blockSizeMax;
+ return frameSizeInfo;
+ }
+}
+
+/** ZSTD_findFrameCompressedSize() :
+ * compatible with legacy mode
+ * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame
+ * `srcSize` must be at least as large as the frame contained
+ * @return : the compressed size of the frame starting at `src` */
+size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
+{
+ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+ return frameSizeInfo.compressedSize;
+}
+
+/** ZSTD_decompressBound() :
+ * compatible with legacy mode
+ *  `src` must point to the start of a ZSTD frame or a skippable frame
+ * `srcSize` must be at least as large as the frame contained
+ * @return : the maximum decompressed size of the compressed source
+ */
+unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
+{
+ unsigned long long bound = 0;
+ /* Iterate over each frame */
+ while (srcSize > 0) {
+ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+ size_t const compressedSize = frameSizeInfo.compressedSize;
+ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
+ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
+ return ZSTD_CONTENTSIZE_ERROR;
+ assert(srcSize >= compressedSize);
+ src = (const BYTE*)src + compressedSize;
+ srcSize -= compressedSize;
+ bound += decompressedBound;
+ }
+ return bound;
+}
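+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): ZSTD_decompressBound()
+ * covers inputs that concatenate several frames and frames without a stored content
+ * size, at the cost of possibly over-allocating (the bound is >= the exact size).
+ * Assumes the advanced (ZSTD_STATIC_LINKING_ONLY) userland declarations.
+ */
+#if 0   /* example only, not compiled */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <stdlib.h>
+#include <zstd.h>
+
+static size_t decompress_concatenated(void** dstOut, const void* src, size_t srcSize)
+{
+    unsigned long long const bound = ZSTD_decompressBound(src, srcSize);
+    void* dst;
+    if (bound == ZSTD_CONTENTSIZE_ERROR) return (size_t)-1;
+    if (bound > (unsigned long long)(size_t)-1) return (size_t)-1;  /* bound must fit in a size_t */
+    dst = malloc(bound ? (size_t)bound : 1);
+    if (dst == NULL) return (size_t)-1;
+    { size_t const dSize = ZSTD_decompress(dst, (size_t)bound, src, srcSize); /* handles back-to-back frames */
+      if (ZSTD_isError(dSize)) { free(dst); return dSize; }
+      *dstOut = dst;
+      return dSize;
+    }
+}
+#endif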
+
+
+/*-*************************************************************
+ * Frame decoding
+ ***************************************************************/
+
+/** ZSTD_insertBlock() :
+ * insert `src` block into `dctx` history. Useful to track uncompressed blocks. */
+size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize)
+{
+ DEBUGLOG(5, "ZSTD_insertBlock: %u bytes", (unsigned)blockSize);
+ ZSTD_checkContinuity(dctx, blockStart);
+ dctx->previousDstEnd = (const char*)blockStart + blockSize;
+ return blockSize;
+}
+
+
+static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_copyRawBlock");
+ if (dst == NULL) {
+ if (srcSize == 0) return 0;
+ RETURN_ERROR(dstBuffer_null, "");
+ }
+ RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall, "");
+ memcpy(dst, src, srcSize);
+ return srcSize;
+}
+
+static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity,
+ BYTE b,
+ size_t regenSize)
+{
+ if (dst == NULL) {
+ if (regenSize == 0) return 0;
+ RETURN_ERROR(dstBuffer_null, "");
+ }
+ RETURN_ERROR_IF(regenSize > dstCapacity, dstSize_tooSmall, "");
+ memset(dst, b, regenSize);
+ return regenSize;
+}
+
+
+/*! ZSTD_decompressFrame() :
+ * @dctx must be properly initialized
+ * will update *srcPtr and *srcSizePtr,
+ * to make *srcPtr progress by one frame. */
+static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void** srcPtr, size_t *srcSizePtr)
+{
+ const BYTE* ip = (const BYTE*)(*srcPtr);
+ BYTE* const ostart = (BYTE* const)dst;
+ BYTE* const oend = dstCapacity != 0 ? ostart + dstCapacity : ostart;
+ BYTE* op = ostart;
+ size_t remainingSrcSize = *srcSizePtr;
+
+ DEBUGLOG(4, "ZSTD_decompressFrame (srcSize:%i)", (int)*srcSizePtr);
+
+ /* check */
+ RETURN_ERROR_IF(
+ remainingSrcSize < ZSTD_FRAMEHEADERSIZE_MIN(dctx->format)+ZSTD_blockHeaderSize,
+ srcSize_wrong, "");
+
+ /* Frame Header */
+ { size_t const frameHeaderSize = ZSTD_frameHeaderSize_internal(
+ ip, ZSTD_FRAMEHEADERSIZE_PREFIX(dctx->format), dctx->format);
+ if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+ RETURN_ERROR_IF(remainingSrcSize < frameHeaderSize+ZSTD_blockHeaderSize,
+ srcSize_wrong, "");
+ FORWARD_IF_ERROR( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) , "");
+ ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize;
+ }
+
+ /* Loop on each block */
+ while (1) {
+ size_t decodedSize;
+ blockProperties_t blockProperties;
+ size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties);
+ if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+ ip += ZSTD_blockHeaderSize;
+ remainingSrcSize -= ZSTD_blockHeaderSize;
+ RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, "");
+
+ switch(blockProperties.blockType)
+ {
+ case bt_compressed:
+ decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize, /* frame */ 1);
+ break;
+ case bt_raw :
+ decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize);
+ break;
+ case bt_rle :
+ decodedSize = ZSTD_setRleBlock(op, oend-op, *ip, blockProperties.origSize);
+ break;
+ case bt_reserved :
+ default:
+ RETURN_ERROR(corruption_detected, "invalid block type");
+ }
+
+ if (ZSTD_isError(decodedSize)) return decodedSize;
+ if (dctx->fParams.checksumFlag)
+ XXH64_update(&dctx->xxhState, op, decodedSize);
+ if (decodedSize != 0)
+ op += decodedSize;
+ assert(ip != NULL);
+ ip += cBlockSize;
+ remainingSrcSize -= cBlockSize;
+ if (blockProperties.lastBlock) break;
+ }
+
+ if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+ RETURN_ERROR_IF((U64)(op-ostart) != dctx->fParams.frameContentSize,
+ corruption_detected, "");
+ }
+ if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */
+ U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState);
+ U32 checkRead;
+ RETURN_ERROR_IF(remainingSrcSize<4, checksum_wrong, "");
+ checkRead = MEM_readLE32(ip);
+ RETURN_ERROR_IF(checkRead != checkCalc, checksum_wrong, "");
+ ip += 4;
+ remainingSrcSize -= 4;
+ }
+
+ /* Allow caller to get size read */
+ *srcPtr = ip;
+ *srcSizePtr = remainingSrcSize;
+ return op-ostart;
+}
+
+static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict, size_t dictSize,
+ const ZSTD_DDict* ddict)
+{
+ void* const dststart = dst;
+ int moreThan1Frame = 0;
+
+ DEBUGLOG(5, "ZSTD_decompressMultiFrame");
+ assert(dict==NULL || ddict==NULL); /* either dict or ddict set, not both */
+
+ if (ddict) {
+ dict = ZSTD_DDict_dictContent(ddict);
+ dictSize = ZSTD_DDict_dictSize(ddict);
+ }
+
+ while (srcSize >= ZSTD_startingInputLength(dctx->format)) {
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+ if (ZSTD_isLegacy(src, srcSize)) {
+ size_t decodedSize;
+ size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
+ if (ZSTD_isError(frameSize)) return frameSize;
+ RETURN_ERROR_IF(dctx->staticSize, memory_allocation,
+ "legacy support is not compatible with static dctx");
+
+ decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize);
+ if (ZSTD_isError(decodedSize)) return decodedSize;
+
+            assert(decodedSize <= dstCapacity);
+ dst = (BYTE*)dst + decodedSize;
+ dstCapacity -= decodedSize;
+
+ src = (const BYTE*)src + frameSize;
+ srcSize -= frameSize;
+
+ continue;
+ }
+#endif
+
+ { U32 const magicNumber = MEM_readLE32(src);
+ DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
+ (unsigned)magicNumber, ZSTD_MAGICNUMBER);
+ if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+ size_t const skippableSize = readSkippableFrameSize(src, srcSize);
+ FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed");
+ assert(skippableSize <= srcSize);
+
+ src = (const BYTE *)src + skippableSize;
+ srcSize -= skippableSize;
+ continue;
+ } }
+
+ if (ddict) {
+ /* we were called from ZSTD_decompress_usingDDict */
+ FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(dctx, ddict), "");
+ } else {
+ /* this will initialize correctly with no dict if dict == NULL, so
+ * use this in all cases but ddict */
+ FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize), "");
+ }
+ ZSTD_checkContinuity(dctx, dst);
+
+ { const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity,
+ &src, &srcSize);
+ RETURN_ERROR_IF(
+ (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown)
+ && (moreThan1Frame==1),
+ srcSize_wrong,
+ "at least one frame successfully completed, but following "
+ "bytes are garbage: it's more likely to be a srcSize error, "
+ "specifying more bytes than compressed size of frame(s). This "
+ "error message replaces ERROR(prefix_unknown), which would be "
+ "confusing, as the first header is actually correct. Note that "
+ "one could be unlucky, it might be a corruption error instead, "
+ "happening right at the place where we expect zstd magic "
+ "bytes. But this is _much_ less likely than a srcSize field "
+ "error.");
+ if (ZSTD_isError(res)) return res;
+ assert(res <= dstCapacity);
+ if (res != 0)
+ dst = (BYTE*)dst + res;
+ dstCapacity -= res;
+ }
+ moreThan1Frame = 1;
+ } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */
+
+ RETURN_ERROR_IF(srcSize, srcSize_wrong, "input not entirely consumed");
+
+ return (BYTE*)dst - (BYTE*)dststart;
+}
+
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict, size_t dictSize)
+{
+ return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL);
+}
+
+
+static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx)
+{
+ switch (dctx->dictUses) {
+ default:
+ assert(0 /* Impossible */);
+ /* fall-through */
+ case ZSTD_dont_use:
+ ZSTD_clearDict(dctx);
+ return NULL;
+ case ZSTD_use_indefinitely:
+ return dctx->ddict;
+ case ZSTD_use_once:
+ dctx->dictUses = ZSTD_dont_use;
+ return dctx->ddict;
+ }
+}
+
+size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ return ZSTD_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ZSTD_getDDict(dctx));
+}
+
+
+size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1)
+ size_t regenSize;
+ ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!");
+ regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
+ ZSTD_freeDCtx(dctx);
+ return regenSize;
+#else /* stack mode */
+ ZSTD_DCtx dctx;
+ ZSTD_initDCtx_internal(&dctx);
+ return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize);
+#endif
+}
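+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): when many buffers are
+ * decompressed, reusing one ZSTD_DCtx with ZSTD_decompressDCtx() avoids the per-call
+ * context allocation that ZSTD_decompress() performs in heap mode. Public userland
+ * <zstd.h> API assumed; the helper and its parameters are made up for the example.
+ */
+#if 0   /* example only, not compiled */
+#include <zstd.h>
+
+static size_t decompress_many(ZSTD_DCtx* dctx,                 /* created once with ZSTD_createDCtx() */
+                              void* dst, size_t dstCapacity,
+                              const void* const* srcs, const size_t* srcSizes, size_t n)
+{
+    size_t i, total = 0;
+    for (i = 0; i < n; i++) {
+        size_t const dSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, srcs[i], srcSizes[i]);
+        if (ZSTD_isError(dSize)) return dSize;  /* the context can still be reused afterwards */
+        total += dSize;                         /* this example only accumulates output sizes */
+    }
+    return total;
+}
+#endif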
+
+
+/*-**************************************
+* Advanced Streaming Decompression API
+* Bufferless and synchronous
+****************************************/
+size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }
+
+/**
+ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed,
+ * we allow taking a partial block as the input. Currently only raw uncompressed blocks can
+ * be streamed.
+ *
+ * For blocks that can be streamed, this allows us to reduce the latency until we produce
+ * output, and avoid copying the input.
+ *
+ * @param inputSize - The total amount of input that the caller currently has.
+ */
+static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t inputSize) {
+ if (!(dctx->stage == ZSTDds_decompressBlock || dctx->stage == ZSTDds_decompressLastBlock))
+ return dctx->expected;
+ if (dctx->bType != bt_raw)
+ return dctx->expected;
+ return MIN(MAX(inputSize, 1), dctx->expected);
+}
+
+ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) {
+ switch(dctx->stage)
+ {
+ default: /* should not happen */
+ assert(0);
+ case ZSTDds_getFrameHeaderSize:
+ case ZSTDds_decodeFrameHeader:
+ return ZSTDnit_frameHeader;
+ case ZSTDds_decodeBlockHeader:
+ return ZSTDnit_blockHeader;
+ case ZSTDds_decompressBlock:
+ return ZSTDnit_block;
+ case ZSTDds_decompressLastBlock:
+ return ZSTDnit_lastBlock;
+ case ZSTDds_checkChecksum:
+ return ZSTDnit_checksum;
+ case ZSTDds_decodeSkippableHeader:
+ case ZSTDds_skipFrame:
+ return ZSTDnit_skippableFrame;
+ }
+}
+
+static int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; }
+
+/** ZSTD_decompressContinue() :
+ * srcSize : must be the exact nb of bytes expected (see ZSTD_nextSrcSizeToDecompress())
+ * @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity`)
+ * or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (unsigned)srcSize);
+ /* Sanity check */
+ RETURN_ERROR_IF(srcSize != ZSTD_nextSrcSizeToDecompressWithInputSize(dctx, srcSize), srcSize_wrong, "not allowed");
+ if (dstCapacity) ZSTD_checkContinuity(dctx, dst);
+
+ switch (dctx->stage)
+ {
+ case ZSTDds_getFrameHeaderSize :
+ assert(src != NULL);
+ if (dctx->format == ZSTD_f_zstd1) { /* allows header */
+ assert(srcSize >= ZSTD_FRAMEIDSIZE); /* to read skippable magic number */
+ if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */
+ memcpy(dctx->headerBuffer, src, srcSize);
+ dctx->expected = ZSTD_SKIPPABLEHEADERSIZE - srcSize; /* remaining to load to get full skippable frame header */
+ dctx->stage = ZSTDds_decodeSkippableHeader;
+ return 0;
+ } }
+ dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format);
+ if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize;
+ memcpy(dctx->headerBuffer, src, srcSize);
+ dctx->expected = dctx->headerSize - srcSize;
+ dctx->stage = ZSTDds_decodeFrameHeader;
+ return 0;
+
+ case ZSTDds_decodeFrameHeader:
+ assert(src != NULL);
+ memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize);
+ FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize), "");
+ dctx->expected = ZSTD_blockHeaderSize;
+ dctx->stage = ZSTDds_decodeBlockHeader;
+ return 0;
+
+ case ZSTDds_decodeBlockHeader:
+ { blockProperties_t bp;
+ size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+ if (ZSTD_isError(cBlockSize)) return cBlockSize;
+ RETURN_ERROR_IF(cBlockSize > dctx->fParams.blockSizeMax, corruption_detected, "Block Size Exceeds Maximum");
+ dctx->expected = cBlockSize;
+ dctx->bType = bp.blockType;
+ dctx->rleSize = bp.origSize;
+ if (cBlockSize) {
+ dctx->stage = bp.lastBlock ? ZSTDds_decompressLastBlock : ZSTDds_decompressBlock;
+ return 0;
+ }
+ /* empty block */
+ if (bp.lastBlock) {
+ if (dctx->fParams.checksumFlag) {
+ dctx->expected = 4;
+ dctx->stage = ZSTDds_checkChecksum;
+ } else {
+ dctx->expected = 0; /* end of frame */
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ }
+ } else {
+ dctx->expected = ZSTD_blockHeaderSize; /* jump to next header */
+ dctx->stage = ZSTDds_decodeBlockHeader;
+ }
+ return 0;
+ }
+
+ case ZSTDds_decompressLastBlock:
+ case ZSTDds_decompressBlock:
+ DEBUGLOG(5, "ZSTD_decompressContinue: case ZSTDds_decompressBlock");
+ { size_t rSize;
+ switch(dctx->bType)
+ {
+ case bt_compressed:
+ DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed");
+ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1);
+ dctx->expected = 0; /* Streaming not supported */
+ break;
+ case bt_raw :
+ assert(srcSize <= dctx->expected);
+ rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize);
+ FORWARD_IF_ERROR(rSize, "ZSTD_copyRawBlock failed");
+ assert(rSize == srcSize);
+ dctx->expected -= rSize;
+ break;
+ case bt_rle :
+ rSize = ZSTD_setRleBlock(dst, dstCapacity, *(const BYTE*)src, dctx->rleSize);
+ dctx->expected = 0; /* Streaming not supported */
+ break;
+ case bt_reserved : /* should never happen */
+ default:
+ RETURN_ERROR(corruption_detected, "invalid block type");
+ }
+ FORWARD_IF_ERROR(rSize, "");
+ RETURN_ERROR_IF(rSize > dctx->fParams.blockSizeMax, corruption_detected, "Decompressed Block Size Exceeds Maximum");
+ DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (unsigned)rSize);
+ dctx->decodedSize += rSize;
+ if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize);
+ dctx->previousDstEnd = (char*)dst + rSize;
+
+ /* Stay on the same stage until we are finished streaming the block. */
+ if (dctx->expected > 0) {
+ return rSize;
+ }
+
+ if (dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */
+ DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (unsigned)dctx->decodedSize);
+ RETURN_ERROR_IF(
+ dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
+ && dctx->decodedSize != dctx->fParams.frameContentSize,
+ corruption_detected, "");
+ if (dctx->fParams.checksumFlag) { /* another round for frame checksum */
+ dctx->expected = 4;
+ dctx->stage = ZSTDds_checkChecksum;
+ } else {
+ dctx->expected = 0; /* ends here */
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ }
+ } else {
+ dctx->stage = ZSTDds_decodeBlockHeader;
+ dctx->expected = ZSTD_blockHeaderSize;
+ }
+ return rSize;
+ }
+
+ case ZSTDds_checkChecksum:
+ assert(srcSize == 4); /* guaranteed by dctx->expected */
+ { U32 const h32 = (U32)XXH64_digest(&dctx->xxhState);
+ U32 const check32 = MEM_readLE32(src);
+ DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32);
+ RETURN_ERROR_IF(check32 != h32, checksum_wrong, "");
+ dctx->expected = 0;
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ return 0;
+ }
+
+ case ZSTDds_decodeSkippableHeader:
+ assert(src != NULL);
+ assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE);
+ memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */
+ dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */
+ dctx->stage = ZSTDds_skipFrame;
+ return 0;
+
+ case ZSTDds_skipFrame:
+ dctx->expected = 0;
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ return 0;
+
+ default:
+ assert(0); /* impossible */
+        RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
+ }
+}
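+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): the bufferless calling
+ * loop that the state machine above expects -- feed exactly
+ * ZSTD_nextSrcSizeToDecompress() bytes per call, writing output contiguously.
+ * Assumes the advanced (ZSTD_STATIC_LINKING_ONLY) userland declarations.
+ */
+#if 0   /* example only, not compiled */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <zstd.h>
+
+static size_t bufferless_decompress(ZSTD_DCtx* dctx,
+                                    void* dst, size_t dstCapacity,
+                                    const void* src, size_t srcSize)
+{
+    const char* ip = (const char*)src;
+    char* op = (char*)dst;
+    size_t toRead;
+    { size_t const b = ZSTD_decompressBegin(dctx);
+      if (ZSTD_isError(b)) return b; }
+    while ((toRead = ZSTD_nextSrcSizeToDecompress(dctx)) != 0) {
+        size_t produced;
+        if (toRead > srcSize - (size_t)(ip - (const char*)src)) return (size_t)-1;  /* truncated input */
+        produced = ZSTD_decompressContinue(dctx, op,
+                                           dstCapacity - (size_t)(op - (char*)dst), ip, toRead);
+        if (ZSTD_isError(produced)) return produced;
+        ip += toRead;    /* each call consumes exactly toRead bytes */
+        op += produced;  /* blocks are written contiguously into dst */
+    }
+    return (size_t)(op - (char*)dst);  /* toRead == 0 : end of frame reached */
+}
+#endif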
+
+
+static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ dctx->dictEnd = dctx->previousDstEnd;
+ dctx->virtualStart = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
+ dctx->prefixStart = dict;
+ dctx->previousDstEnd = (const char*)dict + dictSize;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ dctx->dictContentBeginForFuzzing = dctx->prefixStart;
+ dctx->dictContentEndForFuzzing = dctx->previousDstEnd;
+#endif
+ return 0;
+}
+
+/*! ZSTD_loadDEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary.
+ * @return : size of entropy tables read */
+size_t
+ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+ const void* const dict, size_t const dictSize)
+{
+ const BYTE* dictPtr = (const BYTE*)dict;
+ const BYTE* const dictEnd = dictPtr + dictSize;
+
+ RETURN_ERROR_IF(dictSize <= 8, dictionary_corrupted, "dict is too small");
+ assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY); /* dict must be valid */
+ dictPtr += 8; /* skip header = magic + dictID */
+
+ ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable));
+ ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable));
+ ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE);
+ { void* const workspace = &entropy->LLTable; /* use fse tables as temporary workspace; implies fse tables are grouped together */
+ size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable);
+#ifdef HUF_FORCE_DECOMPRESS_X1
+ /* in minimal huffman, we always use X1 variants */
+ size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable,
+ dictPtr, dictEnd - dictPtr,
+ workspace, workspaceSize);
+#else
+ size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
+ dictPtr, dictEnd - dictPtr,
+ workspace, workspaceSize);
+#endif
+ RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, "");
+ dictPtr += hSize;
+ }
+
+ { short offcodeNCount[MaxOff+1];
+ unsigned offcodeMaxValue = MaxOff, offcodeLog;
+ size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(offcodeMaxValue > MaxOff, dictionary_corrupted, "");
+ RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
+ ZSTD_buildFSETable( entropy->OFTable,
+ offcodeNCount, offcodeMaxValue,
+ OF_base, OF_bits,
+ offcodeLog);
+ dictPtr += offcodeHeaderSize;
+ }
+
+ { short matchlengthNCount[MaxML+1];
+ unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+ size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(matchlengthMaxValue > MaxML, dictionary_corrupted, "");
+ RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
+ ZSTD_buildFSETable( entropy->MLTable,
+ matchlengthNCount, matchlengthMaxValue,
+ ML_base, ML_bits,
+ matchlengthLog);
+ dictPtr += matchlengthHeaderSize;
+ }
+
+ { short litlengthNCount[MaxLL+1];
+ unsigned litlengthMaxValue = MaxLL, litlengthLog;
+ size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(litlengthMaxValue > MaxLL, dictionary_corrupted, "");
+ RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
+ ZSTD_buildFSETable( entropy->LLTable,
+ litlengthNCount, litlengthMaxValue,
+ LL_base, LL_bits,
+ litlengthLog);
+ dictPtr += litlengthHeaderSize;
+ }
+
+ RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, "");
+ { int i;
+ size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12));
+ for (i=0; i<3; i++) {
+ U32 const rep = MEM_readLE32(dictPtr); dictPtr += 4;
+ RETURN_ERROR_IF(rep==0 || rep > dictContentSize,
+ dictionary_corrupted, "");
+ entropy->rep[i] = rep;
+ } }
+
+ return dictPtr - (const BYTE*)dict;
+}
+
+static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ if (dictSize < 8) return ZSTD_refDictContent(dctx, dict, dictSize);
+ { U32 const magic = MEM_readLE32(dict);
+ if (magic != ZSTD_MAGIC_DICTIONARY) {
+ return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */
+ } }
+ dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
+
+ /* load entropy tables */
+ { size_t const eSize = ZSTD_loadDEntropy(&dctx->entropy, dict, dictSize);
+ RETURN_ERROR_IF(ZSTD_isError(eSize), dictionary_corrupted, "");
+ dict = (const char*)dict + eSize;
+ dictSize -= eSize;
+ }
+ dctx->litEntropy = dctx->fseEntropy = 1;
+
+ /* reference dictionary content */
+ return ZSTD_refDictContent(dctx, dict, dictSize);
+}
+
+size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
+{
+ assert(dctx != NULL);
+ dctx->expected = ZSTD_startingInputLength(dctx->format); /* dctx->format must be properly set */
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ dctx->decodedSize = 0;
+ dctx->previousDstEnd = NULL;
+ dctx->prefixStart = NULL;
+ dctx->virtualStart = NULL;
+ dctx->dictEnd = NULL;
+ dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
+ dctx->litEntropy = dctx->fseEntropy = 0;
+ dctx->dictID = 0;
+ dctx->bType = bt_reserved;
+ ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
+ memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */
+ dctx->LLTptr = dctx->entropy.LLTable;
+ dctx->MLTptr = dctx->entropy.MLTable;
+ dctx->OFTptr = dctx->entropy.OFTable;
+ dctx->HUFptr = dctx->entropy.hufTable;
+ return 0;
+}
+
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , "");
+ if (dict && dictSize)
+ RETURN_ERROR_IF(
+ ZSTD_isError(ZSTD_decompress_insertDictionary(dctx, dict, dictSize)),
+ dictionary_corrupted, "");
+ return 0;
+}
+
+
+/* ====== ZSTD_DDict ====== */
+
+size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+ DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict");
+ assert(dctx != NULL);
+ if (ddict) {
+ const char* const dictStart = (const char*)ZSTD_DDict_dictContent(ddict);
+ size_t const dictSize = ZSTD_DDict_dictSize(ddict);
+ const void* const dictEnd = dictStart + dictSize;
+ dctx->ddictIsCold = (dctx->dictEnd != dictEnd);
+ DEBUGLOG(4, "DDict is %s",
+ dctx->ddictIsCold ? "~cold~" : "hot!");
+ }
+ FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , "");
+ if (ddict) { /* NULL ddict is equivalent to no dictionary */
+ ZSTD_copyDDictParameters(dctx, ddict);
+ }
+ return 0;
+}
+
+/*! ZSTD_getDictID_fromDict() :
+ * Provides the dictID stored within dictionary.
+ * if @return == 0, the dictionary is not conformant with Zstandard specification.
+ * It can still be loaded, but as a content-only dictionary. */
+unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
+{
+ if (dictSize < 8) return 0;
+ if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0;
+ return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
+}
+
+/*! ZSTD_getDictID_fromFrame() :
+ * Provides the dictID required to decompress frame stored within `src`.
+ * If @return == 0, the dictID could not be decoded.
+ *  This could be for one of the following reasons :
+ * - The frame does not require a dictionary (most common case).
+ * - The frame was built with dictID intentionally removed.
+ *    The needed dictionary is hidden information.
+ * Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, frame header could not be decoded.
+ * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
+ * - This is not a Zstandard frame.
+ * When identifying the exact failure cause, it's possible to use
+ * ZSTD_getFrameHeader(), which will provide a more precise error code. */
+unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
+{
+ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 };
+ size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
+ if (ZSTD_isError(hError)) return 0;
+ return zfp.dictID;
+}
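+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): using the two dictID
+ * helpers above to check, before decompressing, whether a candidate dictionary can
+ * match a frame. A dictID of 0 on either side means "unknown", not a mismatch.
+ */
+#if 0   /* example only, not compiled */
+#include <zstd.h>
+
+static int dictionary_may_match(const void* frame, size_t frameSize,
+                                const void* dict, size_t dictSize)
+{
+    unsigned const frameDictID = ZSTD_getDictID_fromFrame(frame, frameSize);
+    unsigned const dictDictID  = ZSTD_getDictID_fromDict(dict, dictSize);
+    if (frameDictID == 0 || dictDictID == 0) return 1;  /* cannot tell: attempt decompression anyway */
+    return frameDictID == dictDictID;
+}
+#endif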
+
+
+/*! ZSTD_decompress_usingDDict() :
+* Decompression using a pre-digested Dictionary
+* Use dictionary without significant overhead. */
+size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_DDict* ddict)
+{
+ /* pass content and size in case legacy frames are encountered */
+ return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize,
+ NULL, 0,
+ ddict);
+}
+
+
+/*=====================================
+* Streaming decompression
+*====================================*/
+
+ZSTD_DStream* ZSTD_createDStream(void)
+{
+ DEBUGLOG(3, "ZSTD_createDStream");
+ return ZSTD_createDStream_advanced(ZSTD_defaultCMem);
+}
+
+ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize)
+{
+ return ZSTD_initStaticDCtx(workspace, workspaceSize);
+}
+
+ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem)
+{
+ return ZSTD_createDCtx_advanced(customMem);
+}
+
+size_t ZSTD_freeDStream(ZSTD_DStream* zds)
+{
+ return ZSTD_freeDCtx(zds);
+}
+
+
+/* *** Initialization *** */
+
+size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; }
+size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; }
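+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): the standard
+ * ZSTD_decompressStream() loop using the recommended buffer sizes above. Assumes the
+ * public userland streaming API; read_input() and write_output() are hypothetical
+ * I/O callbacks supplied by the caller.
+ */
+#if 0   /* example only, not compiled */
+#include <stdlib.h>
+#include <zstd.h>
+
+static int stream_decompress(ZSTD_DStream* zds,
+                             size_t (*read_input)(void* buf, size_t cap),       /* hypothetical */
+                             int (*write_output)(const void* buf, size_t len))  /* hypothetical */
+{
+    size_t const inCap  = ZSTD_DStreamInSize();   /* one block + block header */
+    size_t const outCap = ZSTD_DStreamOutSize();  /* one full block */
+    void* const inBuf   = malloc(inCap);
+    void* const outBuf  = malloc(outCap);
+    size_t readBytes, ret = 0;
+    int ok = (inBuf != NULL && outBuf != NULL);
+    if (ok) (void)ZSTD_initDStream(zds);
+    while (ok && (readBytes = read_input(inBuf, inCap)) != 0) {
+        ZSTD_inBuffer input = { inBuf, readBytes, 0 };
+        while (ok && input.pos < input.size) {
+            ZSTD_outBuffer output = { outBuf, outCap, 0 };
+            ret = ZSTD_decompressStream(zds, &output, &input);  /* 0 means a frame just completed */
+            ok = !ZSTD_isError(ret) && write_output(outBuf, output.pos);
+        }
+    }
+    free(inBuf); free(outBuf);
+    return (ok && ret == 0) ? 0 : -1;  /* ret != 0 : input ended inside an unfinished frame */
+}
+#endif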
+
+size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType)
+{
+ RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+ ZSTD_clearDict(dctx);
+ if (dict && dictSize != 0) {
+ dctx->ddictLocal = ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem);
+ RETURN_ERROR_IF(dctx->ddictLocal == NULL, memory_allocation, "NULL pointer!");
+ dctx->ddict = dctx->ddictLocal;
+ dctx->dictUses = ZSTD_use_indefinitely;
+ }
+ return 0;
+}
+
+size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
+}
+
+size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
+}
+
+size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType)
+{
+ FORWARD_IF_ERROR(ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType), "");
+ dctx->dictUses = ZSTD_use_once;
+ return 0;
+}
+
+size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize)
+{
+ return ZSTD_DCtx_refPrefix_advanced(dctx, prefix, prefixSize, ZSTD_dct_rawContent);
+}
+
+
+/* ZSTD_initDStream_usingDict() :
+ * return : expected size, aka ZSTD_startingInputLength().
+ * this function cannot fail */
+size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize)
+{
+ DEBUGLOG(4, "ZSTD_initDStream_usingDict");
+ FORWARD_IF_ERROR( ZSTD_DCtx_reset(zds, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) , "");
+ return ZSTD_startingInputLength(zds->format);
+}
+
+/* note : this variant can't fail */
+size_t ZSTD_initDStream(ZSTD_DStream* zds)
+{
+ DEBUGLOG(4, "ZSTD_initDStream");
+ return ZSTD_initDStream_usingDDict(zds, NULL);
+}
+
+/* ZSTD_initDStream_usingDDict() :
+ * ddict will just be referenced, and must outlive the decompression session
+ * this function cannot fail */
+size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
+{
+ FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , "");
+ return ZSTD_startingInputLength(dctx->format);
+}
+
+/* ZSTD_resetDStream() :
+ * return : expected size, aka ZSTD_startingInputLength().
+ * this function cannot fail */
+size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
+{
+ FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");
+ return ZSTD_startingInputLength(dctx->format);
+}
+
+
+size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+ RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+ ZSTD_clearDict(dctx);
+ if (ddict) {
+ dctx->ddict = ddict;
+ dctx->dictUses = ZSTD_use_indefinitely;
+ }
+ return 0;
+}
+
+/* ZSTD_DCtx_setMaxWindowSize() :
+ * note : no direct equivalence in ZSTD_DCtx_setParameter,
+ * since this version sets windowSize, and the other sets windowLog */
+size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize)
+{
+ ZSTD_bounds const bounds = ZSTD_dParam_getBounds(ZSTD_d_windowLogMax);
+ size_t const min = (size_t)1 << bounds.lowerBound;
+ size_t const max = (size_t)1 << bounds.upperBound;
+ RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+ RETURN_ERROR_IF(maxWindowSize < min, parameter_outOfBound, "");
+ RETURN_ERROR_IF(maxWindowSize > max, parameter_outOfBound, "");
+ dctx->maxWindowSize = maxWindowSize;
+ return 0;
+}
+
+size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format)
+{
+ return ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, format);
+}
+
+ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
+{
+ ZSTD_bounds bounds = { 0, 0, 0 };
+ switch(dParam) {
+ case ZSTD_d_windowLogMax:
+ bounds.lowerBound = ZSTD_WINDOWLOG_ABSOLUTEMIN;
+ bounds.upperBound = ZSTD_WINDOWLOG_MAX;
+ return bounds;
+ case ZSTD_d_format:
+ bounds.lowerBound = (int)ZSTD_f_zstd1;
+ bounds.upperBound = (int)ZSTD_f_zstd1_magicless;
+ ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless);
+ return bounds;
+ case ZSTD_d_stableOutBuffer:
+ bounds.lowerBound = (int)ZSTD_obm_buffered;
+ bounds.upperBound = (int)ZSTD_obm_stable;
+ return bounds;
+ default:;
+ }
+ bounds.error = ERROR(parameter_unsupported);
+ return bounds;
+}
+
+/* ZSTD_dParam_withinBounds:
+ * @return 1 if value is within dParam bounds,
+ * 0 otherwise */
+static int ZSTD_dParam_withinBounds(ZSTD_dParameter dParam, int value)
+{
+ ZSTD_bounds const bounds = ZSTD_dParam_getBounds(dParam);
+ if (ZSTD_isError(bounds.error)) return 0;
+ if (value < bounds.lowerBound) return 0;
+ if (value > bounds.upperBound) return 0;
+ return 1;
+}
+
+#define CHECK_DBOUNDS(p,v) { \
+ RETURN_ERROR_IF(!ZSTD_dParam_withinBounds(p, v), parameter_outOfBound, ""); \
+}
+
+size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value)
+{
+ RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+ switch(dParam) {
+ case ZSTD_d_windowLogMax:
+ if (value == 0) value = ZSTD_WINDOWLOG_LIMIT_DEFAULT;
+ CHECK_DBOUNDS(ZSTD_d_windowLogMax, value);
+ dctx->maxWindowSize = ((size_t)1) << value;
+ return 0;
+ case ZSTD_d_format:
+ CHECK_DBOUNDS(ZSTD_d_format, value);
+ dctx->format = (ZSTD_format_e)value;
+ return 0;
+ case ZSTD_d_stableOutBuffer:
+ CHECK_DBOUNDS(ZSTD_d_stableOutBuffer, value);
+ dctx->outBufferMode = (ZSTD_outBufferMode_e)value;
+ return 0;
+ default:;
+ }
+ RETURN_ERROR(parameter_unsupported, "");
+}
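+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): raising the window
+ * limit for trusted input via ZSTD_DCtx_setParameter(); this is the log2 counterpart
+ * of ZSTD_DCtx_setMaxWindowSize() above. Public userland <zstd.h> API assumed.
+ */
+#if 0   /* example only, not compiled */
+#include <zstd.h>
+
+static size_t allow_large_windows(ZSTD_DCtx* dctx)
+{
+    /* Accept frames requiring up to a 1 GiB window (2^30); the default limit is
+     * ZSTD_WINDOWLOG_LIMIT_DEFAULT (27, i.e. 128 MiB). Larger windows cost memory,
+     * so only raise this for trusted inputs. */
+    return ZSTD_DCtx_setParameter(dctx, ZSTD_d_windowLogMax, 30);  /* check with ZSTD_isError() */
+}
+#endif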
+
+size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset)
+{
+ if ( (reset == ZSTD_reset_session_only)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ dctx->streamStage = zdss_init;
+ dctx->noForwardProgress = 0;
+ }
+ if ( (reset == ZSTD_reset_parameters)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+ ZSTD_clearDict(dctx);
+ dctx->format = ZSTD_f_zstd1;
+ dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
+ }
+ return 0;
+}
+
+
+size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx)
+{
+ return ZSTD_sizeof_DCtx(dctx);
+}
+
+size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
+{
+ size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+ unsigned long long const neededRBSize = windowSize + blockSize + (WILDCOPY_OVERLENGTH * 2);
+ unsigned long long const neededSize = MIN(frameContentSize, neededRBSize);
+ size_t const minRBSize = (size_t) neededSize;
+ RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize,
+ frameParameter_windowTooLarge, "");
+ return minRBSize;
+}
+
+size_t ZSTD_estimateDStreamSize(size_t windowSize)
+{
+ size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+ size_t const inBuffSize = blockSize; /* no block can be larger */
+ size_t const outBuffSize = ZSTD_decodingBufferSize_min(windowSize, ZSTD_CONTENTSIZE_UNKNOWN);
+ return ZSTD_estimateDCtxSize() + inBuffSize + outBuffSize;
+}
+
+size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize)
+{
+ U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */
+ ZSTD_frameHeader zfh;
+ size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize);
+ if (ZSTD_isError(err)) return err;
+ RETURN_ERROR_IF(err>0, srcSize_wrong, "");
+ RETURN_ERROR_IF(zfh.windowSize > windowSizeMax,
+ frameParameter_windowTooLarge, "");
+ return ZSTD_estimateDStreamSize((size_t)zfh.windowSize);
+}
+
+
+/* ***** Decompression ***** */
+
+static int ZSTD_DCtx_isOverflow(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize)
+{
+ return (zds->inBuffSize + zds->outBuffSize) >= (neededInBuffSize + neededOutBuffSize) * ZSTD_WORKSPACETOOLARGE_FACTOR;
+}
+
+static void ZSTD_DCtx_updateOversizedDuration(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize)
+{
+ if (ZSTD_DCtx_isOverflow(zds, neededInBuffSize, neededOutBuffSize))
+ zds->oversizedDuration++;
+ else
+ zds->oversizedDuration = 0;
+}
+
+static int ZSTD_DCtx_isOversizedTooLong(ZSTD_DStream* zds)
+{
+ return zds->oversizedDuration >= ZSTD_WORKSPACETOOLARGE_MAXDURATION;
+}
+
+/* Checks that the output buffer hasn't changed if ZSTD_obm_stable is used. */
+static size_t ZSTD_checkOutBuffer(ZSTD_DStream const* zds, ZSTD_outBuffer const* output)
+{
+ ZSTD_outBuffer const expect = zds->expectedOutBuffer;
+ /* No requirement when ZSTD_obm_stable is not enabled. */
+ if (zds->outBufferMode != ZSTD_obm_stable)
+ return 0;
+    /* Any buffer is allowed in zdss_init; after that, the same buffer must be
+     * provided for every call until the context is reset.
+     */
+ if (zds->streamStage == zdss_init)
+ return 0;
+ /* The buffer must match our expectation exactly. */
+ if (expect.dst == output->dst && expect.pos == output->pos && expect.size == output->size)
+ return 0;
+ RETURN_ERROR(dstBuffer_wrong, "ZSTD_obm_stable enabled but output differs!");
+}
+
+/* Calls ZSTD_decompressContinue() with the right parameters for ZSTD_decompressStream()
+ * and updates the stage and the output buffer state. This call is extracted so it can be
+ * used both when reading directly from the ZSTD_inBuffer, and in buffered input mode.
+ * NOTE: You must break after calling this function since the streamStage is modified.
+ */
+static size_t ZSTD_decompressContinueStream(
+ ZSTD_DStream* zds, char** op, char* oend,
+ void const* src, size_t srcSize) {
+ int const isSkipFrame = ZSTD_isSkipFrame(zds);
+ if (zds->outBufferMode == ZSTD_obm_buffered) {
+ size_t const dstSize = isSkipFrame ? 0 : zds->outBuffSize - zds->outStart;
+ size_t const decodedSize = ZSTD_decompressContinue(zds,
+ zds->outBuff + zds->outStart, dstSize, src, srcSize);
+ FORWARD_IF_ERROR(decodedSize, "");
+ if (!decodedSize && !isSkipFrame) {
+ zds->streamStage = zdss_read;
+ } else {
+ zds->outEnd = zds->outStart + decodedSize;
+ zds->streamStage = zdss_flush;
+ }
+ } else {
+ /* Write directly into the output buffer */
+ size_t const dstSize = isSkipFrame ? 0 : oend - *op;
+ size_t const decodedSize = ZSTD_decompressContinue(zds, *op, dstSize, src, srcSize);
+ FORWARD_IF_ERROR(decodedSize, "");
+ *op += decodedSize;
+ /* Flushing is not needed. */
+ zds->streamStage = zdss_read;
+ assert(*op <= oend);
+ assert(zds->outBufferMode == ZSTD_obm_stable);
+ }
+ return 0;
+}
+
+size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+ const char* const src = (const char*)input->src;
+ const char* const istart = input->pos != 0 ? src + input->pos : src;
+ const char* const iend = input->size != 0 ? src + input->size : src;
+ const char* ip = istart;
+ char* const dst = (char*)output->dst;
+ char* const ostart = output->pos != 0 ? dst + output->pos : dst;
+ char* const oend = output->size != 0 ? dst + output->size : dst;
+ char* op = ostart;
+ U32 someMoreWork = 1;
+
+ DEBUGLOG(5, "ZSTD_decompressStream");
+ RETURN_ERROR_IF(
+ input->pos > input->size,
+ srcSize_wrong,
+ "forbidden. in: pos: %u vs size: %u",
+ (U32)input->pos, (U32)input->size);
+ RETURN_ERROR_IF(
+ output->pos > output->size,
+ dstSize_tooSmall,
+ "forbidden. out: pos: %u vs size: %u",
+ (U32)output->pos, (U32)output->size);
+ DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos));
+ FORWARD_IF_ERROR(ZSTD_checkOutBuffer(zds, output), "");
+
+ while (someMoreWork) {
+ switch(zds->streamStage)
+ {
+ case zdss_init :
+ DEBUGLOG(5, "stage zdss_init => transparent reset ");
+ zds->streamStage = zdss_loadHeader;
+ zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0;
+ zds->legacyVersion = 0;
+ zds->hostageByte = 0;
+ zds->expectedOutBuffer = *output;
+ /* fall-through */
+
+ case zdss_loadHeader :
+ DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip));
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+ if (zds->legacyVersion) {
+ RETURN_ERROR_IF(zds->staticSize, memory_allocation,
+ "legacy support is incompatible with static dctx");
+ { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input);
+ if (hint==0) zds->streamStage = zdss_init;
+ return hint;
+ } }
+#endif
+ { size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format);
+ DEBUGLOG(5, "header size : %u", (U32)hSize);
+ if (ZSTD_isError(hSize)) {
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+ U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart);
+ if (legacyVersion) {
+ ZSTD_DDict const* const ddict = ZSTD_getDDict(zds);
+ const void* const dict = ddict ? ZSTD_DDict_dictContent(ddict) : NULL;
+ size_t const dictSize = ddict ? ZSTD_DDict_dictSize(ddict) : 0;
+ DEBUGLOG(5, "ZSTD_decompressStream: detected legacy version v0.%u", legacyVersion);
+ RETURN_ERROR_IF(zds->staticSize, memory_allocation,
+ "legacy support is incompatible with static dctx");
+ FORWARD_IF_ERROR(ZSTD_initLegacyStream(&zds->legacyContext,
+ zds->previousLegacyVersion, legacyVersion,
+ dict, dictSize), "");
+ zds->legacyVersion = zds->previousLegacyVersion = legacyVersion;
+ { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input);
+ if (hint==0) zds->streamStage = zdss_init; /* or stay in stage zdss_loadHeader */
+ return hint;
+ } }
+#endif
+ return hSize; /* error */
+ }
+ if (hSize != 0) { /* need more input */
+ size_t const toLoad = hSize - zds->lhSize; /* if hSize!=0, hSize > zds->lhSize */
+ size_t const remainingInput = (size_t)(iend-ip);
+ assert(iend >= ip);
+ if (toLoad > remainingInput) { /* not enough input to load full header */
+ if (remainingInput > 0) {
+ memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput);
+ zds->lhSize += remainingInput;
+ }
+ input->pos = input->size;
+ return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */
+ }
+ assert(ip != NULL);
+ memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad;
+ break;
+ } }
+
+ /* check for single-pass mode opportunity */
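+            /* If the entire frame content fits in the remaining output buffer and the
+             * complete compressed frame is already present in the input, decompress it
+             * in one shot, bypassing the internal streaming buffers. */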
+ if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
+ && zds->fParams.frameType != ZSTD_skippableFrame
+ && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) {
+ size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend-istart);
+ if (cSize <= (size_t)(iend-istart)) {
+ /* shortcut : using single-pass mode */
+ size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, oend-op, istart, cSize, ZSTD_getDDict(zds));
+ if (ZSTD_isError(decompressedSize)) return decompressedSize;
+ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
+ ip = istart + cSize;
+ op += decompressedSize;
+ zds->expected = 0;
+ zds->streamStage = zdss_init;
+ someMoreWork = 0;
+ break;
+ } }
+
+            /* Check that the output buffer is large enough for ZSTD_obm_stable. */
+ if (zds->outBufferMode == ZSTD_obm_stable
+ && zds->fParams.frameType != ZSTD_skippableFrame
+ && zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
+ && (U64)(size_t)(oend-op) < zds->fParams.frameContentSize) {
+ RETURN_ERROR(dstSize_tooSmall, "ZSTD_obm_stable passed but ZSTD_outBuffer is too small");
+ }
+
+ /* Consume header (see ZSTDds_decodeFrameHeader) */
+ DEBUGLOG(4, "Consume header");
+ FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), "");
+
+ if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */
+ zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE);
+ zds->stage = ZSTDds_skipFrame;
+ } else {
+ FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize), "");
+ zds->expected = ZSTD_blockHeaderSize;
+ zds->stage = ZSTDds_decodeBlockHeader;
+ }
+
+ /* control buffer memory usage */
+ DEBUGLOG(4, "Control max memory usage (%u KB <= max %u KB)",
+ (U32)(zds->fParams.windowSize >>10),
+ (U32)(zds->maxWindowSize >> 10) );
+ zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);
+ RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize,
+ frameParameter_windowTooLarge, "");
+
+ /* Adapt buffer sizes to frame header instructions */
+ { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */);
+ size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_obm_buffered
+ ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize)
+ : 0;
+
+ ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize);
+
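+                /* Reallocate when the current buffers are too small for this frame,
+                 * or when they have stayed oversized for too many consecutive frames,
+                 * so that a single unusually large frame does not pin memory forever. */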
+ { int const tooSmall = (zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize);
+ int const tooLarge = ZSTD_DCtx_isOversizedTooLong(zds);
+
+ if (tooSmall || tooLarge) {
+ size_t const bufferSize = neededInBuffSize + neededOutBuffSize;
+ DEBUGLOG(4, "inBuff : from %u to %u",
+ (U32)zds->inBuffSize, (U32)neededInBuffSize);
+ DEBUGLOG(4, "outBuff : from %u to %u",
+ (U32)zds->outBuffSize, (U32)neededOutBuffSize);
+ if (zds->staticSize) { /* static DCtx */
+ DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize);
+ assert(zds->staticSize >= sizeof(ZSTD_DCtx)); /* controlled at init */
+ RETURN_ERROR_IF(
+ bufferSize > zds->staticSize - sizeof(ZSTD_DCtx),
+ memory_allocation, "");
+ } else {
+ ZSTD_free(zds->inBuff, zds->customMem);
+ zds->inBuffSize = 0;
+ zds->outBuffSize = 0;
+ zds->inBuff = (char*)ZSTD_malloc(bufferSize, zds->customMem);
+ RETURN_ERROR_IF(zds->inBuff == NULL, memory_allocation, "");
+ }
+ zds->inBuffSize = neededInBuffSize;
+ zds->outBuff = zds->inBuff + zds->inBuffSize;
+ zds->outBuffSize = neededOutBuffSize;
+ } } }
+ zds->streamStage = zdss_read;
+ /* fall-through */
+
+ case zdss_read:
+ DEBUGLOG(5, "stage zdss_read");
+ { size_t const neededInSize = ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip);
+ DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize);
+ if (neededInSize==0) { /* end of frame */
+ zds->streamStage = zdss_init;
+ someMoreWork = 0;
+ break;
+ }
+ if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */
+ FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), "");
+ ip += neededInSize;
+ /* Function modifies the stage so we must break */
+ break;
+ } }
+ if (ip==iend) { someMoreWork = 0; break; } /* no more input */
+ zds->streamStage = zdss_load;
+ /* fall-through */
+
+ case zdss_load:
+ { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds);
+ size_t const toLoad = neededInSize - zds->inPos;
+ int const isSkipFrame = ZSTD_isSkipFrame(zds);
+ size_t loadedSize;
+ /* At this point we shouldn't be decompressing a block that we can stream. */
+ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip));
+ if (isSkipFrame) {
+ loadedSize = MIN(toLoad, (size_t)(iend-ip));
+ } else {
+ RETURN_ERROR_IF(toLoad > zds->inBuffSize - zds->inPos,
+ corruption_detected,
+ "should never happen");
+ loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend-ip);
+ }
+ ip += loadedSize;
+ zds->inPos += loadedSize;
+ if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */
+
+ /* decode loaded input */
+ zds->inPos = 0; /* input is consumed */
+ FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, zds->inBuff, neededInSize), "");
+ /* Function modifies the stage so we must break */
+ break;
+ }
+ case zdss_flush:
+ { size_t const toFlushSize = zds->outEnd - zds->outStart;
+ size_t const flushedSize = ZSTD_limitCopy(op, oend-op, zds->outBuff + zds->outStart, toFlushSize);
+ op += flushedSize;
+ zds->outStart += flushedSize;
+ if (flushedSize == toFlushSize) { /* flush completed */
+ zds->streamStage = zdss_read;
+ if ( (zds->outBuffSize < zds->fParams.frameContentSize)
+ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
+ DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
+ (int)(zds->outBuffSize - zds->outStart),
+ (U32)zds->fParams.blockSizeMax);
+ zds->outStart = zds->outEnd = 0;
+ }
+ break;
+ } }
+ /* cannot complete flush */
+ someMoreWork = 0;
+ break;
+
+ default:
+ assert(0); /* impossible */
+            RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
+ } }
+
+ /* result */
+ input->pos = (size_t)(ip - (const char*)(input->src));
+ output->pos = (size_t)(op - (char*)(output->dst));
+
+ /* Update the expected output buffer for ZSTD_obm_stable. */
+ zds->expectedOutBuffer = *output;
+
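+    /* Guard against callers looping without providing more input or more output room :
+     * after too many consecutive calls with no forward progress, return an explicit error. */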
+ if ((ip==istart) && (op==ostart)) { /* no forward progress */
+ zds->noForwardProgress ++;
+ if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
+ RETURN_ERROR_IF(op==oend, dstSize_tooSmall, "");
+ RETURN_ERROR_IF(ip==iend, srcSize_wrong, "");
+ assert(0);
+ }
+ } else {
+ zds->noForwardProgress = 0;
+ }
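+    /* Return a hint for the next source size. When the frame is fully decoded but some
+     * buffered output has not been flushed yet, the last input byte is kept "hostage"
+     * (input->pos is decremented) so that the caller keeps invoking
+     * ZSTD_decompressStream() until the remaining output has been delivered. */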
+ { size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds);
+ if (!nextSrcSizeHint) { /* frame fully decoded */
+ if (zds->outEnd == zds->outStart) { /* output fully flushed */
+ if (zds->hostageByte) {
+ if (input->pos >= input->size) {
+ /* can't release hostage (not present) */
+ zds->streamStage = zdss_read;
+ return 1;
+ }
+ input->pos++; /* release hostage */
+ } /* zds->hostageByte */
+ return 0;
+ } /* zds->outEnd == zds->outStart */
+ if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */
+ input->pos--; /* note : pos > 0, otherwise, impossible to finish reading last block */
+ zds->hostageByte=1;
+ }
+ return 1;
+ } /* nextSrcSizeHint==0 */
+ nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds) == ZSTDnit_block); /* preload header of next block */
+ assert(zds->inPos <= nextSrcSizeHint);
+ nextSrcSizeHint -= zds->inPos; /* part already loaded*/
+ return nextSrcSizeHint;
+ }
+}
+
+size_t ZSTD_decompressStream_simpleArgs (
+ ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos)
+{
+ ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+ ZSTD_inBuffer input = { src, srcSize, *srcPos };
+ /* ZSTD_compress_generic() will check validity of dstPos and srcPos */
+ size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
+ *dstPos = output.pos;
+ *srcPos = input.pos;
+ return cErr;
+}
+/**** ended inlining decompress/zstd_decompress.c ****/
+/**** start inlining decompress/zstd_decompress_block.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* zstd_decompress_block :
+ * this module takes care of decompressing _compressed_ blocks */
+
+/*-*******************************************************
+* Dependencies
+*********************************************************/
+#include <string.h> /* memcpy, memmove, memset */
+/**** skipping file: ../common/compiler.h ****/
+/**** skipping file: ../common/cpu.h ****/
+/**** skipping file: ../common/mem.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+/**** skipping file: zstd_decompress_internal.h ****/
+/**** skipping file: zstd_ddict.h ****/
+/**** skipping file: zstd_decompress_block.h ****/
+
+/*_*******************************************************
+* Macros
+**********************************************************/
+
+/* These two optional macros each force the use of one of the two
+ * ZSTD_decompressSequences implementations. They cannot both be defined
+ * at the same time.
+ */
+#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
+#endif
+
+
+/*_*******************************************************
+* Memory operations
+**********************************************************/
+static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+
+
+/*-*************************************************************
+ * Block decoding
+ ***************************************************************/
+
+/*! ZSTD_getcBlockSize() :
+ * Provides the size of compressed block from block header `src` */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+ blockProperties_t* bpPtr)
+{
+ RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");
+
+ { U32 const cBlockHeader = MEM_readLE24(src);
+ U32 const cSize = cBlockHeader >> 3;
+ bpPtr->lastBlock = cBlockHeader & 1;
+ bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
+ bpPtr->origSize = cSize; /* only useful for RLE */
+ if (bpPtr->blockType == bt_rle) return 1;
+ RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
+ return cSize;
+ }
+}
+
+
+/* Hidden declaration for fullbench */
+size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+ const void* src, size_t srcSize);
+/*! ZSTD_decodeLiteralsBlock() :
+ * @return : nb of bytes read from src (< srcSize )
+ * note : symbol not declared but exposed for fullbench */
+size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+ const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
+{
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
+ RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
+
+ { const BYTE* const istart = (const BYTE*) src;
+ symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
+
+ switch(litEncType)
+ {
+ case set_repeat:
+ DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
+ RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
+ /* fall-through */
+
+ case set_compressed:
+ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
+ { size_t lhSize, litSize, litCSize;
+ U32 singleStream=0;
+ U32 const lhlCode = (istart[0] >> 2) & 3;
+ U32 const lhc = MEM_readLE32(istart);
+ size_t hufSuccess;
+ switch(lhlCode)
+ {
+            case 0: case 1: default:   /* note : default is impossible, since lhlCode is in [0..3] */
+ /* 2 - 2 - 10 - 10 */
+ singleStream = !lhlCode;
+ lhSize = 3;
+ litSize = (lhc >> 4) & 0x3FF;
+ litCSize = (lhc >> 14) & 0x3FF;
+ break;
+ case 2:
+ /* 2 - 2 - 14 - 14 */
+ lhSize = 4;
+ litSize = (lhc >> 4) & 0x3FFF;
+ litCSize = lhc >> 18;
+ break;
+ case 3:
+ /* 2 - 2 - 18 - 18 */
+ lhSize = 5;
+ litSize = (lhc >> 4) & 0x3FFFF;
+ litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
+ break;
+ }
+ RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+ RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
+
+ /* prefetch huffman table if cold */
+ if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
+ PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
+ }
+
+ if (litEncType==set_repeat) {
+ if (singleStream) {
+ hufSuccess = HUF_decompress1X_usingDTable_bmi2(
+ dctx->litBuffer, litSize, istart+lhSize, litCSize,
+ dctx->HUFptr, dctx->bmi2);
+ } else {
+ hufSuccess = HUF_decompress4X_usingDTable_bmi2(
+ dctx->litBuffer, litSize, istart+lhSize, litCSize,
+ dctx->HUFptr, dctx->bmi2);
+ }
+ } else {
+ if (singleStream) {
+#if defined(HUF_FORCE_DECOMPRESS_X2)
+ hufSuccess = HUF_decompress1X_DCtx_wksp(
+ dctx->entropy.hufTable, dctx->litBuffer, litSize,
+ istart+lhSize, litCSize, dctx->workspace,
+ sizeof(dctx->workspace));
+#else
+ hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
+ dctx->entropy.hufTable, dctx->litBuffer, litSize,
+ istart+lhSize, litCSize, dctx->workspace,
+ sizeof(dctx->workspace), dctx->bmi2);
+#endif
+ } else {
+ hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
+ dctx->entropy.hufTable, dctx->litBuffer, litSize,
+ istart+lhSize, litCSize, dctx->workspace,
+ sizeof(dctx->workspace), dctx->bmi2);
+ }
+ }
+
+ RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");
+
+ dctx->litPtr = dctx->litBuffer;
+ dctx->litSize = litSize;
+ dctx->litEntropy = 1;
+ if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
+ memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+ return litCSize + lhSize;
+ }
+
+ case set_basic:
+ { size_t litSize, lhSize;
+ U32 const lhlCode = ((istart[0]) >> 2) & 3;
+ switch(lhlCode)
+ {
+            case 0: case 2: default:   /* note : default is impossible, since lhlCode is in [0..3] */
+ lhSize = 1;
+ litSize = istart[0] >> 3;
+ break;
+ case 1:
+ lhSize = 2;
+ litSize = MEM_readLE16(istart) >> 4;
+ break;
+ case 3:
+ lhSize = 3;
+ litSize = MEM_readLE24(istart) >> 4;
+ break;
+ }
+
+ if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
+ RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
+ memcpy(dctx->litBuffer, istart+lhSize, litSize);
+ dctx->litPtr = dctx->litBuffer;
+ dctx->litSize = litSize;
+ memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+ return lhSize+litSize;
+ }
+ /* direct reference into compressed stream */
+ dctx->litPtr = istart+lhSize;
+ dctx->litSize = litSize;
+ return lhSize+litSize;
+ }
+
+ case set_rle:
+ { U32 const lhlCode = ((istart[0]) >> 2) & 3;
+ size_t litSize, lhSize;
+ switch(lhlCode)
+ {
+            case 0: case 2: default:   /* note : default is impossible, since lhlCode is in [0..3] */
+ lhSize = 1;
+ litSize = istart[0] >> 3;
+ break;
+ case 1:
+ lhSize = 2;
+ litSize = MEM_readLE16(istart) >> 4;
+ break;
+ case 3:
+ lhSize = 3;
+ litSize = MEM_readLE24(istart) >> 4;
+ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
+ break;
+ }
+ RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+ memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+ dctx->litPtr = dctx->litBuffer;
+ dctx->litSize = litSize;
+ return lhSize+1;
+ }
+ default:
+ RETURN_ERROR(corruption_detected, "impossible");
+ }
+ }
+}
+
+/* Default FSE distribution tables.
+ * These are pre-calculated FSE decoding tables, using the default distributions as defined in the specification :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
+ * They were generated programmatically with the following method :
+ * - start from the default distributions, present in /lib/common/zstd_internal.h
+ * - generate the tables normally, using ZSTD_buildFSETable()
+ * - print out the content of the tables
+ * - prettify the output, reported below, and test with a fuzzer to ensure correctness */
+
+/* Default FSE distribution table for Literal Lengths */
+static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
+ { 1, 1, 1, LL_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
+ /* nextState, nbAddBits, nbBits, baseVal */
+ { 0, 0, 4, 0}, { 16, 0, 4, 0},
+ { 32, 0, 5, 1}, { 0, 0, 5, 3},
+ { 0, 0, 5, 4}, { 0, 0, 5, 6},
+ { 0, 0, 5, 7}, { 0, 0, 5, 9},
+ { 0, 0, 5, 10}, { 0, 0, 5, 12},
+ { 0, 0, 6, 14}, { 0, 1, 5, 16},
+ { 0, 1, 5, 20}, { 0, 1, 5, 22},
+ { 0, 2, 5, 28}, { 0, 3, 5, 32},
+ { 0, 4, 5, 48}, { 32, 6, 5, 64},
+ { 0, 7, 5, 128}, { 0, 8, 6, 256},
+ { 0, 10, 6, 1024}, { 0, 12, 6, 4096},
+ { 32, 0, 4, 0}, { 0, 0, 4, 1},
+ { 0, 0, 5, 2}, { 32, 0, 5, 4},
+ { 0, 0, 5, 5}, { 32, 0, 5, 7},
+ { 0, 0, 5, 8}, { 32, 0, 5, 10},
+ { 0, 0, 5, 11}, { 0, 0, 6, 13},
+ { 32, 1, 5, 16}, { 0, 1, 5, 18},
+ { 32, 1, 5, 22}, { 0, 2, 5, 24},
+ { 32, 3, 5, 32}, { 0, 3, 5, 40},
+ { 0, 6, 4, 64}, { 16, 6, 4, 64},
+ { 32, 7, 5, 128}, { 0, 9, 6, 512},
+ { 0, 11, 6, 2048}, { 48, 0, 4, 0},
+ { 16, 0, 4, 1}, { 32, 0, 5, 2},
+ { 32, 0, 5, 3}, { 32, 0, 5, 5},
+ { 32, 0, 5, 6}, { 32, 0, 5, 8},
+ { 32, 0, 5, 9}, { 32, 0, 5, 11},
+ { 32, 0, 5, 12}, { 0, 0, 6, 15},
+ { 32, 1, 5, 18}, { 32, 1, 5, 20},
+ { 32, 2, 5, 24}, { 32, 2, 5, 28},
+ { 32, 3, 5, 40}, { 32, 4, 5, 48},
+ { 0, 16, 6,65536}, { 0, 15, 6,32768},
+ { 0, 14, 6,16384}, { 0, 13, 6, 8192},
+}; /* LL_defaultDTable */
+
+/* Default FSE distribution table for Offset Codes */
+static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
+ { 1, 1, 1, OF_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
+ /* nextState, nbAddBits, nbBits, baseVal */
+ { 0, 0, 5, 0}, { 0, 6, 4, 61},
+ { 0, 9, 5, 509}, { 0, 15, 5,32765},
+ { 0, 21, 5,2097149}, { 0, 3, 5, 5},
+ { 0, 7, 4, 125}, { 0, 12, 5, 4093},
+ { 0, 18, 5,262141}, { 0, 23, 5,8388605},
+ { 0, 5, 5, 29}, { 0, 8, 4, 253},
+ { 0, 14, 5,16381}, { 0, 20, 5,1048573},
+ { 0, 2, 5, 1}, { 16, 7, 4, 125},
+ { 0, 11, 5, 2045}, { 0, 17, 5,131069},
+ { 0, 22, 5,4194301}, { 0, 4, 5, 13},
+ { 16, 8, 4, 253}, { 0, 13, 5, 8189},
+ { 0, 19, 5,524285}, { 0, 1, 5, 1},
+ { 16, 6, 4, 61}, { 0, 10, 5, 1021},
+ { 0, 16, 5,65533}, { 0, 28, 5,268435453},
+ { 0, 27, 5,134217725}, { 0, 26, 5,67108861},
+ { 0, 25, 5,33554429}, { 0, 24, 5,16777213},
+}; /* OF_defaultDTable */
+
+
+/* Default FSE distribution table for Match Lengths */
+static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
+ { 1, 1, 1, ML_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
+ /* nextState, nbAddBits, nbBits, baseVal */
+ { 0, 0, 6, 3}, { 0, 0, 4, 4},
+ { 32, 0, 5, 5}, { 0, 0, 5, 6},
+ { 0, 0, 5, 8}, { 0, 0, 5, 9},
+ { 0, 0, 5, 11}, { 0, 0, 6, 13},
+ { 0, 0, 6, 16}, { 0, 0, 6, 19},
+ { 0, 0, 6, 22}, { 0, 0, 6, 25},
+ { 0, 0, 6, 28}, { 0, 0, 6, 31},
+ { 0, 0, 6, 34}, { 0, 1, 6, 37},
+ { 0, 1, 6, 41}, { 0, 2, 6, 47},
+ { 0, 3, 6, 59}, { 0, 4, 6, 83},
+ { 0, 7, 6, 131}, { 0, 9, 6, 515},
+ { 16, 0, 4, 4}, { 0, 0, 4, 5},
+ { 32, 0, 5, 6}, { 0, 0, 5, 7},
+ { 32, 0, 5, 9}, { 0, 0, 5, 10},
+ { 0, 0, 6, 12}, { 0, 0, 6, 15},
+ { 0, 0, 6, 18}, { 0, 0, 6, 21},
+ { 0, 0, 6, 24}, { 0, 0, 6, 27},
+ { 0, 0, 6, 30}, { 0, 0, 6, 33},
+ { 0, 1, 6, 35}, { 0, 1, 6, 39},
+ { 0, 2, 6, 43}, { 0, 3, 6, 51},
+ { 0, 4, 6, 67}, { 0, 5, 6, 99},
+ { 0, 8, 6, 259}, { 32, 0, 4, 4},
+ { 48, 0, 4, 4}, { 16, 0, 4, 5},
+ { 32, 0, 5, 7}, { 32, 0, 5, 8},
+ { 32, 0, 5, 10}, { 32, 0, 5, 11},
+ { 0, 0, 6, 14}, { 0, 0, 6, 17},
+ { 0, 0, 6, 20}, { 0, 0, 6, 23},
+ { 0, 0, 6, 26}, { 0, 0, 6, 29},
+ { 0, 0, 6, 32}, { 0, 16, 6,65539},
+ { 0, 15, 6,32771}, { 0, 14, 6,16387},
+ { 0, 13, 6, 8195}, { 0, 12, 6, 4099},
+ { 0, 11, 6, 2051}, { 0, 10, 6, 1027},
+}; /* ML_defaultDTable */
+
+
+static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
+{
+ void* ptr = dt;
+ ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
+ ZSTD_seqSymbol* const cell = dt + 1;
+
+ DTableH->tableLog = 0;
+ DTableH->fastMode = 0;
+
+ cell->nbBits = 0;
+ cell->nextState = 0;
+ assert(nbAddBits < 255);
+ cell->nbAdditionalBits = (BYTE)nbAddBits;
+ cell->baseValue = baseValue;
+}
+
+
+/* ZSTD_buildFSETable() :
+ * generate FSE decoding table for one symbol (ll, ml or off)
+ * cannot fail if input is valid =>
+ * all inputs are presumed validated at this stage */
+void
+ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U32* nbAdditionalBits,
+ unsigned tableLog)
+{
+ ZSTD_seqSymbol* const tableDecode = dt+1;
+ U16 symbolNext[MaxSeq+1];
+
+ U32 const maxSV1 = maxSymbolValue + 1;
+ U32 const tableSize = 1 << tableLog;
+ U32 highThreshold = tableSize-1;
+
+ /* Sanity Checks */
+ assert(maxSymbolValue <= MaxSeq);
+ assert(tableLog <= MaxFSELog);
+
+ /* Init, lay down lowprob symbols */
+ { ZSTD_seqSymbol_header DTableH;
+ DTableH.tableLog = tableLog;
+ DTableH.fastMode = 1;
+ { S16 const largeLimit= (S16)(1 << (tableLog-1));
+ U32 s;
+ for (s=0; s<maxSV1; s++) {
+ if (normalizedCounter[s]==-1) {
+ tableDecode[highThreshold--].baseValue = s;
+ symbolNext[s] = 1;
+ } else {
+ if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+ assert(normalizedCounter[s]>=0);
+ symbolNext[s] = (U16)normalizedCounter[s];
+ } } }
+ memcpy(dt, &DTableH, sizeof(DTableH));
+ }
+
+ /* Spread symbols */
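+    /* Each symbol is placed normalizedCounter[s] times, stepping through the table with
+     * a fixed stride (FSE_TABLESTEP) ; cells above highThreshold were already assigned
+     * to low-probability symbols during init and are skipped here. */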
+ { U32 const tableMask = tableSize-1;
+ U32 const step = FSE_TABLESTEP(tableSize);
+ U32 s, position = 0;
+ for (s=0; s<maxSV1; s++) {
+ int i;
+ for (i=0; i<normalizedCounter[s]; i++) {
+ tableDecode[position].baseValue = s;
+ position = (position + step) & tableMask;
+ while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
+ } }
+ assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+ }
+
+ /* Build Decoding table */
+ { U32 u;
+ for (u=0; u<tableSize; u++) {
+ U32 const symbol = tableDecode[u].baseValue;
+ U32 const nextState = symbolNext[symbol]++;
+ tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+ tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+ assert(nbAdditionalBits[symbol] < 255);
+ tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
+ tableDecode[u].baseValue = baseValue[symbol];
+ } }
+}
+
+
+/*! ZSTD_buildSeqTable() :
+ * @return : nb bytes read from src,
+ * or an error code if it fails */
+static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
+ symbolEncodingType_e type, unsigned max, U32 maxLog,
+ const void* src, size_t srcSize,
+ const U32* baseValue, const U32* nbAdditionalBits,
+ const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
+ int ddictIsCold, int nbSeq)
+{
+ switch(type)
+ {
+ case set_rle :
+ RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
+ RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
+ { U32 const symbol = *(const BYTE*)src;
+ U32 const baseline = baseValue[symbol];
+ U32 const nbBits = nbAdditionalBits[symbol];
+ ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
+ }
+ *DTablePtr = DTableSpace;
+ return 1;
+ case set_basic :
+ *DTablePtr = defaultTable;
+ return 0;
+ case set_repeat:
+ RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
+ /* prefetch FSE table if used */
+ if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
+ const void* const pStart = *DTablePtr;
+ size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
+ PREFETCH_AREA(pStart, pSize);
+ }
+ return 0;
+ case set_compressed :
+ { unsigned tableLog;
+ S16 norm[MaxSeq+1];
+ size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
+ RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
+ RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
+ ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
+ *DTablePtr = DTableSpace;
+ return headerSize;
+ }
+ default :
+ assert(0);
+ RETURN_ERROR(GENERIC, "impossible");
+ }
+}
+
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+ const void* src, size_t srcSize)
+{
+ const BYTE* const istart = (const BYTE* const)src;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* ip = istart;
+ int nbSeq;
+ DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
+
+ /* check */
+ RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");
+
+ /* SeqHead */
+ nbSeq = *ip++;
+ if (!nbSeq) {
+ *nbSeqPtr=0;
+ RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
+ return 1;
+ }
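+    /* nbSeq is stored on 1, 2 or 3 bytes : a first byte < 0x80 is the count itself,
+     * 0x80-0xFE introduces a 2-byte form, and the escape value 0xFF is followed by a
+     * little-endian 16-bit count offset by LONGNBSEQ. */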
+ if (nbSeq > 0x7F) {
+ if (nbSeq == 0xFF) {
+ RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
+ nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+ } else {
+ RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
+ nbSeq = ((nbSeq-0x80)<<8) + *ip++;
+ }
+ }
+ *nbSeqPtr = nbSeq;
+
+ /* FSE table descriptors */
+ RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
+ { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
+ symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
+ symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
+ ip++;
+
+ /* Build DTables */
+ { size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
+ LLtype, MaxLL, LLFSELog,
+ ip, iend-ip,
+ LL_base, LL_bits,
+ LL_defaultDTable, dctx->fseEntropy,
+ dctx->ddictIsCold, nbSeq);
+ RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+ ip += llhSize;
+ }
+
+ { size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
+ OFtype, MaxOff, OffFSELog,
+ ip, iend-ip,
+ OF_base, OF_bits,
+ OF_defaultDTable, dctx->fseEntropy,
+ dctx->ddictIsCold, nbSeq);
+ RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+ ip += ofhSize;
+ }
+
+ { size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
+ MLtype, MaxML, MLFSELog,
+ ip, iend-ip,
+ ML_base, ML_bits,
+ ML_defaultDTable, dctx->fseEntropy,
+ dctx->ddictIsCold, nbSeq);
+ RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+ ip += mlhSize;
+ }
+ }
+
+ return ip-istart;
+}
+
+
+typedef struct {
+ size_t litLength;
+ size_t matchLength;
+ size_t offset;
+ const BYTE* match;
+} seq_t;
+
+typedef struct {
+ size_t state;
+ const ZSTD_seqSymbol* table;
+} ZSTD_fseState;
+
+typedef struct {
+ BIT_DStream_t DStream;
+ ZSTD_fseState stateLL;
+ ZSTD_fseState stateOffb;
+ ZSTD_fseState stateML;
+ size_t prevOffset[ZSTD_REP_NUM];
+ const BYTE* prefixStart;
+ const BYTE* dictEnd;
+ size_t pos;
+} seqState_t;
+
+/*! ZSTD_overlapCopy8() :
+ * Copies 8 bytes from ip to op and updates op and ip where ip <= op.
+ * If the offset is < 8 then the offset is spread to at least 8 bytes.
+ *
+ * Precondition: *ip <= *op
+ * Postcondition: *op - *ip >= 8
+ */
+HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
+ assert(*ip <= *op);
+ if (offset < 8) {
+ /* close range match, overlap */
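+        /* For offsets < 8 the match pattern repeats with a short period : copy the first
+         * 4 bytes one at a time, copy 4 more after advancing ip (dec32table), then rewind
+         * ip (dec64table) so the distance op - ip becomes >= 8 and later copies can
+         * proceed 8 bytes at a time. */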
+ static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */
+ static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */
+ int const sub2 = dec64table[offset];
+ (*op)[0] = (*ip)[0];
+ (*op)[1] = (*ip)[1];
+ (*op)[2] = (*ip)[2];
+ (*op)[3] = (*ip)[3];
+ *ip += dec32table[offset];
+ ZSTD_copy4(*op+4, *ip);
+ *ip -= sub2;
+ } else {
+ ZSTD_copy8(*op, *ip);
+ }
+ *ip += 8;
+ *op += 8;
+ assert(*op - *ip >= 8);
+}
+
+/*! ZSTD_safecopy() :
+ * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
+ * and write up to 16 bytes past oend_w (op >= oend_w is allowed).
+ * This function is only called in the uncommon case where the sequence is near the end of the block. It
+ * should be fast for a single long sequence, but can be slow for several short sequences.
+ *
+ * @param ovtype controls the overlap detection
+ * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
+ * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
+ * The src buffer must be before the dst buffer.
+ */
+static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
+ ptrdiff_t const diff = op - ip;
+ BYTE* const oend = op + length;
+
+ assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
+ (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));
+
+ if (length < 8) {
+ /* Handle short lengths. */
+ while (op < oend) *op++ = *ip++;
+ return;
+ }
+ if (ovtype == ZSTD_overlap_src_before_dst) {
+ /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
+ assert(length >= 8);
+ ZSTD_overlapCopy8(&op, &ip, diff);
+ assert(op - ip >= 8);
+ assert(op <= oend);
+ }
+
+ if (oend <= oend_w) {
+ /* No risk of overwrite. */
+ ZSTD_wildcopy(op, ip, length, ovtype);
+ return;
+ }
+ if (op <= oend_w) {
+ /* Wildcopy until we get close to the end. */
+ assert(oend > oend_w);
+ ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
+ ip += oend_w - op;
+ op = oend_w;
+ }
+ /* Handle the leftovers. */
+ while (op < oend) *op++ = *ip++;
+}
+
+/* ZSTD_execSequenceEnd():
+ * This version handles cases that are near the end of the output buffer. It requires
+ * more careful checks to make sure there is no overflow. By separating out these hard
+ * and unlikely cases, we can speed up the common cases.
+ *
+ * NOTE: This function needs to be fast for a single long sequence, but doesn't need
+ * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
+ */
+FORCE_NOINLINE
+size_t ZSTD_execSequenceEnd(BYTE* op,
+ BYTE* const oend, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+{
+ BYTE* const oLitEnd = op + sequence.litLength;
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+ const BYTE* match = oLitEnd - sequence.offset;
+ BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+
+ /* bounds checks : careful of address space overflow in 32-bit mode */
+ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
+ RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
+ assert(op < op + sequenceLength);
+ assert(oLitEnd < op + sequenceLength);
+
+ /* copy literals */
+ ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
+ op = oLitEnd;
+ *litPtr = iLitEnd;
+
+ /* copy Match */
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+ /* offset beyond prefix */
+ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
+ match = dictEnd - (prefixStart-match);
+ if (match + sequence.matchLength <= dictEnd) {
+ memmove(oLitEnd, match, sequence.matchLength);
+ return sequenceLength;
+ }
+ /* span extDict & currentPrefixSegment */
+ { size_t const length1 = dictEnd - match;
+ memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = prefixStart;
+ } }
+ ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
+ return sequenceLength;
+}
+
+HINT_INLINE
+size_t ZSTD_execSequence(BYTE* op,
+ BYTE* const oend, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+{
+ BYTE* const oLitEnd = op + sequence.litLength;
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+ BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
+ BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+ const BYTE* match = oLitEnd - sequence.offset;
+
+ assert(op != NULL /* Precondition */);
+ assert(oend_w < oend /* No underflow */);
+ /* Handle edge cases in a slow path:
+ * - Read beyond end of literals
+     * - Match end is within WILDCOPY_OVERLENGTH of oend
+ * - 32-bit mode and the match length overflows
+ */
+ if (UNLIKELY(
+ iLitEnd > litLimit ||
+ oMatchEnd > oend_w ||
+ (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
+ return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+
+ /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
+ assert(op <= oLitEnd /* No overflow */);
+ assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
+ assert(oMatchEnd <= oend /* No underflow */);
+ assert(iLitEnd <= litLimit /* Literal length is in bounds */);
+ assert(oLitEnd <= oend_w /* Can wildcopy literals */);
+ assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
+
+ /* Copy Literals:
+ * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
+ * We likely don't need the full 32-byte wildcopy.
+ */
+ assert(WILDCOPY_OVERLENGTH >= 16);
+ ZSTD_copy16(op, (*litPtr));
+ if (UNLIKELY(sequence.litLength > 16)) {
+ ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
+ }
+ op = oLitEnd;
+ *litPtr = iLitEnd; /* update for next sequence */
+
+ /* Copy Match */
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+ /* offset beyond prefix -> go into extDict */
+ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
+ match = dictEnd + (match - prefixStart);
+ if (match + sequence.matchLength <= dictEnd) {
+ memmove(oLitEnd, match, sequence.matchLength);
+ return sequenceLength;
+ }
+ /* span extDict & currentPrefixSegment */
+ { size_t const length1 = dictEnd - match;
+ memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = prefixStart;
+ } }
+ /* Match within prefix of 1 or more bytes */
+ assert(op <= oMatchEnd);
+ assert(oMatchEnd <= oend_w);
+ assert(match >= prefixStart);
+ assert(sequence.matchLength >= 1);
+
+ /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
+ * without overlap checking.
+ */
+ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
+ /* We bet on a full wildcopy for matches, since we expect matches to be
+ * longer than literals (in general). In silesia, ~10% of matches are longer
+ * than 16 bytes.
+ */
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
+ return sequenceLength;
+ }
+ assert(sequence.offset < WILDCOPY_VECLEN);
+
+ /* Copy 8 bytes and spread the offset to be >= 8. */
+ ZSTD_overlapCopy8(&op, &match, sequence.offset);
+
+ /* If the match length is > 8 bytes, then continue with the wildcopy. */
+ if (sequence.matchLength > 8) {
+ assert(op < oMatchEnd);
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
+ }
+ return sequenceLength;
+}
+
+static void
+ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
+{
+ const void* ptr = dt;
+ const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr;
+ DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+ DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits",
+ (U32)DStatePtr->state, DTableH->tableLog);
+ BIT_reloadDStream(bitD);
+ DStatePtr->table = dt + 1;
+}
+
+FORCE_INLINE_TEMPLATE void
+ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
+{
+ ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
+ U32 const nbBits = DInfo.nbBits;
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
+ DStatePtr->state = DInfo.nextState + lowBits;
+}
+
+FORCE_INLINE_TEMPLATE void
+ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
+{
+ U32 const nbBits = DInfo.nbBits;
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
+ DStatePtr->state = DInfo.nextState + lowBits;
+}
+
+/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
+ * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+ * bits before reloading. This value is the maximum number of bits we read
+ * after reloading when we are decoding long offsets.
+ */
+#define LONG_OFFSETS_MAX_EXTRA_BITS_32 \
+ (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \
+ ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \
+ : 0)
+
+typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
+typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e;
+
+FORCE_INLINE_TEMPLATE seq_t
+ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch)
+{
+ seq_t seq;
+ ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
+ ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
+ ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
+ U32 const llBase = llDInfo.baseValue;
+ U32 const mlBase = mlDInfo.baseValue;
+ U32 const ofBase = ofDInfo.baseValue;
+ BYTE const llBits = llDInfo.nbAdditionalBits;
+ BYTE const mlBits = mlDInfo.nbAdditionalBits;
+ BYTE const ofBits = ofDInfo.nbAdditionalBits;
+ BYTE const totalBits = llBits+mlBits+ofBits;
+
+ /* sequence */
+ { size_t offset;
+ if (ofBits > 1) {
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+ assert(ofBits <= MaxOff);
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+ U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+ BIT_reloadDStream(&seqState->DStream);
+ if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+ assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
+ } else {
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+ }
+ seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset;
+ } else {
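+            /* Small offset codes reference the repeat-offset history (prevOffset[]) rather
+             * than an explicit distance : a literal length of 0 (ll0) shifts which previous
+             * offset is selected, and the value 3 maps to prevOffset[0] - 1. */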
+ U32 const ll0 = (llBase == 0);
+ if (LIKELY((ofBits == 0))) {
+ if (LIKELY(!ll0))
+ offset = seqState->prevOffset[0];
+ else {
+ offset = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset;
+ }
+ } else {
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
+ { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+ temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
+ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset = temp;
+ } } }
+ seq.offset = offset;
+ }
+
+ seq.matchLength = mlBase;
+ if (mlBits > 0)
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
+ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+ BIT_reloadDStream(&seqState->DStream);
+ if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+ BIT_reloadDStream(&seqState->DStream);
+ /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+ seq.litLength = llBase;
+ if (llBits > 0)
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
+ if (MEM_32bits())
+ BIT_reloadDStream(&seqState->DStream);
+
+ DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+
+ if (prefetch == ZSTD_p_prefetch) {
+ size_t const pos = seqState->pos + seq.litLength;
+ const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
+ seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+ * No consequence though : no memory access will occur, offset is only used for prefetching */
+ seqState->pos = pos + seq.matchLength;
+ }
+
+ /* ANS state update
+ * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
+ * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
+ * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
+ * better option, so it is the default for other compilers. But, if you
+ * measure that it is worse, please put up a pull request.
+ */
+ {
+#if defined(__GNUC__) && !defined(__clang__)
+ const int kUseUpdateFseState = 1;
+#else
+ const int kUseUpdateFseState = 0;
+#endif
+ if (kUseUpdateFseState) {
+ ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
+ ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
+ ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
+ } else {
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */
+ }
+ }
+
+ return seq;
+}
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
+{
+ size_t const windowSize = dctx->fParams.windowSize;
+ /* No dictionary used. */
+ if (dctx->dictContentEndForFuzzing == NULL) return 0;
+ /* Dictionary is our prefix. */
+ if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;
+ /* Dictionary is not our ext-dict. */
+ if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;
+ /* Dictionary is not within our window size. */
+ if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;
+ /* Dictionary is active. */
+ return 1;
+}
+
+MEM_STATIC void ZSTD_assertValidSequence(
+ ZSTD_DCtx const* dctx,
+ BYTE const* op, BYTE const* oend,
+ seq_t const seq,
+ BYTE const* prefixStart, BYTE const* virtualStart)
+{
+ size_t const windowSize = dctx->fParams.windowSize;
+ size_t const sequenceSize = seq.litLength + seq.matchLength;
+ BYTE const* const oLitEnd = op + seq.litLength;
+ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+ assert(op <= oend);
+ assert((size_t)(oend - op) >= sequenceSize);
+ assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
+ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
+ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
+ /* Offset must be within the dictionary. */
+ assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
+ assert(seq.offset <= windowSize + dictSize);
+ } else {
+ /* Offset must be within our window. */
+ assert(seq.offset <= windowSize);
+ }
+}
+#endif
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+FORCE_INLINE_TEMPLATE size_t
+DONT_VECTORIZE
+ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ const BYTE* ip = (const BYTE*)seqStart;
+ const BYTE* const iend = ip + seqSize;
+ BYTE* const ostart = (BYTE* const)dst;
+ BYTE* const oend = ostart + maxDstSize;
+ BYTE* op = ostart;
+ const BYTE* litPtr = dctx->litPtr;
+ const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+ const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
+ const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+ DEBUGLOG(5, "ZSTD_decompressSequences_body");
+ (void)frame;
+
+ /* Regen sequences */
+ if (nbSeq) {
+ seqState_t seqState;
+ size_t error = 0;
+ dctx->fseEntropy = 1;
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+ RETURN_ERROR_IF(
+ ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+ corruption_detected, "");
+ ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+ ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+ ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+ assert(dst != NULL);
+
+ ZSTD_STATIC_ASSERT(
+ BIT_DStream_unfinished < BIT_DStream_completed &&
+ BIT_DStream_endOfBuffer < BIT_DStream_completed &&
+ BIT_DStream_completed < BIT_DStream_overflow);
+
+#if defined(__GNUC__) && defined(__x86_64__)
+ /* Align the decompression loop to 32 + 16 bytes.
+ *
+ * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
+ * speed swings based on the alignment of the decompression loop. This
+ * performance swing is caused by parts of the decompression loop falling
+ * out of the DSB. The entire decompression loop should fit in the DSB,
+ * when it can't we get much worse performance. You can measure if you've
+ * hit the good case or the bad case with this perf command for some
+ * compressed file test.zst:
+ *
+ * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
+ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
+ *
+ * If you see most cycles served out of the MITE you've hit the bad case.
+ * If you see most cycles served out of the DSB you've hit the good case.
+ * If it is pretty even then you may be in an okay case.
+ *
+ * I've been able to reproduce this issue on the following CPUs:
+ * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
+ * Use Instruments->Counters to get DSB/MITE cycles.
+ * I never got performance swings, but I was able to
+ * go from the good case of mostly DSB to half of the
+ * cycles served from MITE.
+ * - Coffeelake: Intel i9-9900k
+ *
+ * I haven't been able to reproduce the instability or DSB misses on any
+             * of the following CPUs:
+             * - Haswell
+             * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
+ * - Skylake
+ *
+             * If you are seeing performance instability, this script can help test.
+ * It tests on 4 commits in zstd where I saw performance change.
+ *
+ * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
+ */
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 4");
+#endif
+ for ( ; ; ) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+#endif
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ BIT_reloadDStream(&(seqState.DStream));
+ /* gcc and clang both don't like early returns in this loop.
+ * gcc doesn't like early breaks either.
+ * Instead save an error and report it at the end.
+ * When there is an error, don't increment op, so we don't
+ * overwrite.
+ */
+ if (UNLIKELY(ZSTD_isError(oneSeqSize))) error = oneSeqSize;
+ else op += oneSeqSize;
+ if (UNLIKELY(!--nbSeq)) break;
+ }
+
+ /* check if reached exact end */
+ DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
+ if (ZSTD_isError(error)) return error;
+ RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+ RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
+ /* save reps for next block */
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+ }
+
+ /* last literal segment */
+ { size_t const lastLLSize = litEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ }
+
+ return op-ostart;
+}
+
+static size_t
+ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_decompressSequencesLong_body(
+ ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ const BYTE* ip = (const BYTE*)seqStart;
+ const BYTE* const iend = ip + seqSize;
+ BYTE* const ostart = (BYTE* const)dst;
+ BYTE* const oend = ostart + maxDstSize;
+ BYTE* op = ostart;
+ const BYTE* litPtr = dctx->litPtr;
+ const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+ const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
+ const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+ (void)frame;
+
+ /* Regen sequences */
+ if (nbSeq) {
+#define STORED_SEQS 4
+#define STORED_SEQS_MASK (STORED_SEQS-1)
+#define ADVANCED_SEQS 4
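+        /* Sequences are decoded ADVANCED_SEQS ahead of their execution and kept in a small
+         * ring buffer, so that each match source can be prefetched into cache before it is
+         * actually copied. */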
+ seq_t sequences[STORED_SEQS];
+ int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
+ seqState_t seqState;
+ int seqNb;
+ dctx->fseEntropy = 1;
+ { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+ seqState.prefixStart = prefixStart;
+ seqState.pos = (size_t)(op-prefixStart);
+ seqState.dictEnd = dictEnd;
+ assert(dst != NULL);
+ assert(iend >= ip);
+ RETURN_ERROR_IF(
+ ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+ corruption_detected, "");
+ ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+ ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+ ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+ /* prepare in advance */
+ for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
+ sequences[seqNb] = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
+ PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+ }
+ RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
+
+ /* decode and decompress */
+ for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
+ op += oneSeqSize;
+ }
+ RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
+
+ /* finish queue */
+ seqNb -= seqAdvance;
+ for ( ; seqNb<nbSeq ; seqNb++) {
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ op += oneSeqSize;
+ }
+
+ /* save reps for next block */
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+ }
+
+ /* last literal segment */
+ { size_t const lastLLSize = litEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ }
+
+ return op-ostart;
+}
+
+static size_t
+ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+
+
+
+#if DYNAMIC_BMI2
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+static TARGET_ATTRIBUTE("bmi2") size_t
+DONT_VECTORIZE
+ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+
+#endif /* DYNAMIC_BMI2 */
+
+typedef size_t (*ZSTD_decompressSequences_t)(
+ ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame);
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+static size_t
+ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ DEBUGLOG(5, "ZSTD_decompressSequences");
+#if DYNAMIC_BMI2
+ if (dctx->bmi2) {
+ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+#endif
+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+/* ZSTD_decompressSequencesLong() :
+ * decompression function triggered when a minimum share of offsets is considered "long",
+ * aka out of cache.
+ * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
+ * This function will try to mitigate main memory latency through the use of prefetching */
+static size_t
+ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ DEBUGLOG(5, "ZSTD_decompressSequencesLong");
+#if DYNAMIC_BMI2
+ if (dctx->bmi2) {
+ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+#endif
+ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+
+
+
+#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+/* ZSTD_getLongOffsetsShare() :
+ * condition : offTable must be valid
+ * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
+ * compared to maximum possible of (1<<OffFSELog) */
+static unsigned
+ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+{
+ const void* ptr = offTable;
+ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+ const ZSTD_seqSymbol* table = offTable + 1;
+ U32 const max = 1 << tableLog;
+ U32 u, total = 0;
+ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+ assert(max <= (1 << OffFSELog)); /* max not too large */
+ for (u=0; u<max; u++) {
+ if (table[u].nbAdditionalBits > 22) total += 1;
+ }
+
+ assert(tableLog <= OffFSELog);
+ total <<= (OffFSELog - tableLog); /* scale to OffFSELog */
+
+ return total;
+}
+#endif
+
+size_t
+ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, const int frame)
+{ /* blockType == blockCompressed */
+ const BYTE* ip = (const BYTE*)src;
+ /* isLongOffset must be true if there are long offsets.
+ * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
+ * We don't expect that to be the case in 64-bit mode.
+ * In block mode, window size is not known, so we have to be conservative.
+ * (note: but it could be evaluated from current-lowLimit)
+ */
+ ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
+ DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
+
+ RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
+
+ /* Decode literals section */
+ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+ if (ZSTD_isError(litCSize)) return litCSize;
+ ip += litCSize;
+ srcSize -= litCSize;
+ }
+
+ /* Build Decoding Tables */
+ {
+ /* These macros control at build-time which decompressor implementation
+ * we use. If neither is defined, we do some inspection and dispatch at
+ * runtime.
+ */
+#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+ int usePrefetchDecoder = dctx->ddictIsCold;
+#endif
+ int nbSeq;
+ size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
+ if (ZSTD_isError(seqHSize)) return seqHSize;
+ ip += seqHSize;
+ srcSize -= seqHSize;
+
+ RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+
+#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+ if ( !usePrefetchDecoder
+ && (!frame || (dctx->fParams.windowSize > (1<<24)))
+ && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
+ U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
+ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+ usePrefetchDecoder = (shareLongOffsets >= minShare);
+ }
+#endif
+
+ dctx->ddictIsCold = 0;
+
+#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+ if (usePrefetchDecoder)
+#endif
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+#endif
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+ /* else */
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+#endif
+ }
+}
+
+
+void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
+{
+ if (dst != dctx->previousDstEnd) { /* not contiguous */
+ dctx->dictEnd = dctx->previousDstEnd;
+ dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
+ dctx->prefixStart = dst;
+ dctx->previousDstEnd = dst;
+ }
+}
+
+
+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ size_t dSize;
+ ZSTD_checkContinuity(dctx, dst);
+ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
+ dctx->previousDstEnd = (char*)dst + dSize;
+ return dSize;
+}
+/**** ended inlining decompress/zstd_decompress_block.c ****/
diff --git a/sys/contrib/openzfs/module/zstd/lib/zstd.h b/sys/contrib/openzfs/module/zstd/lib/zstd.h
new file mode 100644
index 000000000000..b6772f8818a7
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/lib/zstd.h
@@ -0,0 +1,2115 @@
+/*
+ * BSD 3-Clause Clear License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. All rights reserved.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_H_235446
+#define ZSTD_H_235446
+
+/* ====== Dependency ======*/
+#include <limits.h> /* INT_MAX */
+#include <stddef.h> /* size_t */
+
+
+/* ===== ZSTDLIB_API : control library symbols visibility ===== */
+#ifndef ZSTDLIB_VISIBILITY
+# if defined(__GNUC__) && (__GNUC__ >= 4)
+# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default")))
+# else
+# define ZSTDLIB_VISIBILITY
+# endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required, but it allows the compiler to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+# define ZSTDLIB_API ZSTDLIB_VISIBILITY
+#endif
+
+
+/*******************************************************************************
+ Introduction
+
+ zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+ real-time compression scenarios at zlib-level and better compression ratios.
+ The zstd compression library provides in-memory compression and decompression
+ functions.
+
+ The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+ which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+ caution, as they require more memory. The library also offers negative
+ compression levels, which extend the range of speed vs. ratio preferences.
+ The lower the level, the faster the speed (at the cost of compression).
+
+ Compression can be done in:
+ - a single step (described as Simple API)
+ - a single step, reusing a context (described as Explicit context)
+ - unbounded multiple steps (described as Streaming compression)
+
+ The compression ratio achievable on small data can be greatly improved by using
+ a dictionary. Dictionary compression can be performed in:
+ - a single step (described as Simple dictionary API)
+ - a single step, reusing a dictionary (described as Bulk-processing
+ dictionary API)
+
+ Advanced experimental functions can be accessed using
+ `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+ Advanced experimental APIs should never be used with a dynamically-linked
+ library. They are not "stable"; their definitions or signatures may change in
+ the future. Only static linking is allowed.
+*******************************************************************************/
+
+/*------ Version ------*/
+#define ZSTD_VERSION_MAJOR 1
+#define ZSTD_VERSION_MINOR 4
+#define ZSTD_VERSION_RELEASE 5
+
+#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+ZSTDLIB_API unsigned ZSTD_versionNumber(void); /**< to check runtime library version */
+
+#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
+#define ZSTD_QUOTE(str) #str
+#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
+#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
+ZSTDLIB_API const char* ZSTD_versionString(void); /* requires v1.3.0+ */
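+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * comparing the compile-time version macros above with the version of the
+ * library actually linked. The printf-based reporting is an illustrative choice. */
+#if 0
+#include <stdio.h>
+#include <zstd.h>
+
+static void report_zstd_version(void)
+{
+    /* the runtime value may differ from ZSTD_VERSION_NUMBER when a different
+     * library build is loaded than the one this unit was compiled against */
+    unsigned const runtime = ZSTD_versionNumber();
+    printf("compiled against zstd %s (%u), running with %u\n",
+           ZSTD_VERSION_STRING, (unsigned)ZSTD_VERSION_NUMBER, runtime);
+}
+#endif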
+
+/* *************************************
+ * Default constant
+ ***************************************/
+#ifndef ZSTD_CLEVEL_DEFAULT
+# define ZSTD_CLEVEL_DEFAULT 3
+#endif
+
+/* *************************************
+ * Constants
+ ***************************************/
+
+/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */
+#define ZSTD_MAGICNUMBER 0xFD2FB528 /* valid since v0.8.0 */
+#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* valid since v0.7.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */
+#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0
+
+#define ZSTD_BLOCKSIZELOG_MAX 17
+#define ZSTD_BLOCKSIZE_MAX (1<<ZSTD_BLOCKSIZELOG_MAX)
+
+
+
+/***************************************
+* Simple API
+***************************************/
+/*! ZSTD_compress() :
+ * Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+ * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ * @return : compressed size written into `dst` (<= `dstCapacity`),
+ * or an error code if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel);
+
+/*! ZSTD_decompress() :
+ * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+ * `dstCapacity` is an upper bound of the original size to regenerate.
+ * If the user cannot determine such an upper bound, it's better to use streaming mode to decompress the data.
+ * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ * or an errorCode if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize);
+
+/*! ZSTD_getFrameContentSize() : requires v1.3.0+
+ * `src` should point to the start of a ZSTD encoded frame.
+ * `srcSize` must be at least as large as the frame header.
+ * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough.
+ * @return : - decompressed size of `src` frame content, if known
+ * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small)
+ * note 1 : a 0 return value means the frame is valid but "empty".
+ * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode.
+ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * Optionally, application can rely on some implicit limit,
+ * as ZSTD_decompress() only needs an upper bound of decompressed size.
+ * (For example, the data may be known to be cut into blocks <= 16 KB.)
+ * note 3 : decompressed size is always present when compression is completed using single-pass functions,
+ * such as ZSTD_compress(), ZSTD_compressCCtx(), ZSTD_compress_usingDict() or ZSTD_compress_usingCDict().
+ * note 4 : decompressed size can be very large (64-bits value),
+ * potentially larger than what local system can handle as a single memory segment.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ * Always ensure return value fits within application's authorized limits.
+ * Each application can set its own limits.
+ * note 6 : This function replaces ZSTD_getDecompressedSize() */
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2)
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/*! ZSTD_getDecompressedSize() :
+ * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
+ * Both functions work the same way, but ZSTD_getDecompressedSize() blends
+ * "empty", "unknown" and "error" results to the same return value (0),
+ * while ZSTD_getFrameContentSize() gives them separate return values.
+ * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
+ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_findFrameCompressedSize() :
+ * `src` should point to the start of a ZSTD frame or skippable frame.
+ * `srcSize` must be >= first frame size
+ * @return : the compressed size of the first frame starting at `src`,
+ * suitable to pass as `srcSize` to `ZSTD_decompress` or similar,
+ * or an error code if input is invalid */
+ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
+
+
+/*====== Helper functions ======*/
+#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */
+ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */
+ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */
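+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * a minimal one-shot roundtrip with the simple API and helpers above.
+ * Allocation-failure handling is omitted, and the abort-on-error policy is an
+ * illustrative choice. */
+#if 0
+#include <stdlib.h>   /* malloc, free, abort */
+#include <string.h>   /* memcmp */
+#include <zstd.h>
+
+static void simple_roundtrip(const void* src, size_t srcSize)
+{
+    /* ZSTD_compressBound() gives the worst-case size for a single-pass compression */
+    size_t const dstCapacity = ZSTD_compressBound(srcSize);
+    void* const dst = malloc(dstCapacity);
+    void* regen;
+    size_t cSize, dSize;
+    unsigned long long rSize;
+
+    cSize = ZSTD_compress(dst, dstCapacity, src, srcSize, ZSTD_CLEVEL_DEFAULT);
+    if (ZSTD_isError(cSize)) abort();   /* see ZSTD_getErrorName(cSize) for details */
+
+    /* single-pass compression always records the content size in the frame header */
+    rSize = ZSTD_getFrameContentSize(dst, cSize);
+    if (rSize != (unsigned long long)srcSize) abort();
+
+    regen = malloc(srcSize);
+    dSize = ZSTD_decompress(regen, srcSize, dst, cSize);
+    if (ZSTD_isError(dSize) || dSize != srcSize || memcmp(src, regen, srcSize)) abort();
+
+    free(regen);
+    free(dst);
+}
+#endif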
+
+
+/***************************************
+* Explicit context
+***************************************/
+/*= Compression context
+ * When compressing many times,
+ * it is recommended to allocate a context just once,
+ * and re-use it for each successive compression operation.
+ * This makes the workload friendlier for the system's memory.
+ * Note : re-using context is just a speed / resource optimization.
+ * It doesn't change the compression ratio, which remains identical.
+ * Note 2 : In multi-threaded environments,
+ * use one different context per thread for parallel execution.
+ */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
+ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx);
+
+/*! ZSTD_compressCCtx() :
+ * Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
+ * Important : in order to behave similarly to `ZSTD_compress()`,
+ * this function compresses at requested compression level,
+ * __ignoring any other parameter__ .
+ * If any advanced parameter was set using the advanced API,
+ * they will all be reset. Only `compressionLevel` remains.
+ */
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel);
+
+/*= Decompression context
+ * When decompressing many times,
+ * it is recommended to allocate a context only once,
+ * and re-use it for each successive decompression operation.
+ * This makes the workload friendlier for the system's memory.
+ * Use one context per thread for parallel execution. */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
+ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx);
+
+/*! ZSTD_decompressDCtx() :
+ * Same as ZSTD_decompress(),
+ * requires an allocated ZSTD_DCtx.
+ * Compatible with sticky parameters.
+ */
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
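+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * allocating one explicit ZSTD_CCtx and re-using it for several independent
+ * buffers, as recommended above. The batch-of-buffers representation is an
+ * illustrative assumption. */
+#if 0
+#include <stdlib.h>
+#include <zstd.h>
+
+static void compress_batch(const void* const* srcs, const size_t* srcSizes,
+                           void* const* dsts, const size_t* dstCapacities,
+                           size_t* cSizes, size_t nbBuffers)
+{
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();   /* allocated once ... */
+    size_t n;
+    if (cctx == NULL) abort();
+    for (n = 0; n < nbBuffers; n++) {
+        /* ... and re-used for each frame; only the requested level applies here */
+        cSizes[n] = ZSTD_compressCCtx(cctx, dsts[n], dstCapacities[n],
+                                      srcs[n], srcSizes[n], ZSTD_CLEVEL_DEFAULT);
+        if (ZSTD_isError(cSizes[n])) abort();
+    }
+    ZSTD_freeCCtx(cctx);                         /* released once at the end */
+}
+#endif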
+
+
+/***************************************
+* Advanced compression API
+***************************************/
+
+/* API design :
+ * Parameters are pushed one by one into an existing context,
+ * using ZSTD_CCtx_set*() functions.
+ * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
+ * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
+ * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
+ *
+ * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
+ *
+ * This API supersedes all other "advanced" API entry points in the experimental section.
+ * In the future, we expect to remove from the experimental section those API entry points which are redundant with this one.
+ */
+
+
+/* Compression strategies, listed from fastest to strongest */
+typedef enum { ZSTD_fast=1,
+ ZSTD_dfast=2,
+ ZSTD_greedy=3,
+ ZSTD_lazy=4,
+ ZSTD_lazy2=5,
+ ZSTD_btlazy2=6,
+ ZSTD_btopt=7,
+ ZSTD_btultra=8,
+ ZSTD_btultra2=9
+ /* note : new strategies _might_ be added in the future.
+ Only the order (from fast to strong) is guaranteed */
+} ZSTD_strategy;
+
+
+typedef enum {
+
+ /* compression parameters
+ * Note: When compressing with a ZSTD_CDict these parameters are superseded
+ * by the parameters used to construct the ZSTD_CDict.
+ * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */
+ ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table.
+ * Note that exact compression parameters are dynamically determined,
+ * depending on both compression level and srcSize (when known).
+ * Default level is ZSTD_CLEVEL_DEFAULT==3.
+ * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT.
+ * Note 1 : it's possible to pass a negative compression level.
+ * Note 2 : setting a level does not automatically set all other compression parameters
+ * to default. Setting this will, however, dynamically influence the compression
+ * parameters which have not been manually set; the manually set
+ * ones will 'stick'. */
+ /* Advanced compression parameters :
+ * It's possible to pin down compression parameters to some specific values.
+ * In which case, these values are no longer dynamically selected by the compressor */
+ ZSTD_c_windowLog=101, /* Maximum allowed back-reference distance, expressed as power of 2.
+ * This will set a memory budget for streaming decompression,
+ * with larger values requiring more memory
+ * and typically compressing more.
+ * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
+ * Special: value 0 means "use default windowLog".
+ * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT
+ * requires explicitly allowing such size at streaming decompression stage. */
+ ZSTD_c_hashLog=102, /* Size of the initial probe table, as a power of 2.
+ * Resulting memory usage is (1 << (hashLog+2)).
+ * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
+ * Larger tables improve compression ratio of strategies <= dFast,
+ * and improve speed of strategies > dFast.
+ * Special: value 0 means "use default hashLog". */
+ ZSTD_c_chainLog=103, /* Size of the multi-probe search table, as a power of 2.
+ * Resulting memory usage is (1 << (chainLog+2)).
+ * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX.
+ * Larger tables result in better and slower compression.
+ * This parameter is useless for "fast" strategy.
+ * It's still useful when using "dfast" strategy,
+ * in which case it defines a secondary probe table.
+ * Special: value 0 means "use default chainLog". */
+ ZSTD_c_searchLog=104, /* Number of search attempts, as a power of 2.
+ * More attempts result in better and slower compression.
+ * This parameter is useless for "fast" and "dFast" strategies.
+ * Special: value 0 means "use default searchLog". */
+ ZSTD_c_minMatch=105, /* Minimum size of searched matches.
+ * Note that Zstandard can still find matches of smaller size,
+ * it just tweaks its search algorithm to look for this size and larger.
+ * Larger values increase compression and decompression speed, but decrease ratio.
+ * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX.
+ * Note that currently, for all strategies < btopt, the effective minimum is 4,
+ * and for all strategies > fast, the effective maximum is 6.
+ * Special: value 0 means "use default minMatchLength". */
+ ZSTD_c_targetLength=106, /* Impact of this field depends on strategy.
+ * For strategies btopt, btultra & btultra2:
+ * Length of Match considered "good enough" to stop search.
+ * Larger values make compression stronger, and slower.
+ * For strategy fast:
+ * Distance between match sampling.
+ * Larger values make compression faster, and weaker.
+ * Special: value 0 means "use default targetLength". */
+ ZSTD_c_strategy=107, /* See ZSTD_strategy enum definition.
+ * The higher the value of selected strategy, the more complex it is,
+ * resulting in stronger and slower compression.
+ * Special: value 0 means "use default strategy". */
+
+ /* LDM mode parameters */
+ ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
+ * This parameter is designed to improve compression ratio
+ * for large inputs, by finding large matches at long distance.
+ * It increases memory usage and window size.
+ * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB
+ * except when expressly set to a different value. */
+ ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2.
+ * Larger values increase memory usage and compression ratio,
+ * but decrease compression speed.
+ * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
+ * default: windowlog - 7.
+ * Special: value 0 means "automatically determine hashlog". */
+ ZSTD_c_ldmMinMatch=162, /* Minimum match size for long distance matcher.
+ * Values that are too large or too small usually decrease compression ratio.
+ * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
+ * Special: value 0 means "use default value" (default: 64). */
+ ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution.
+ * Larger values improve collision resolution but decrease compression speed.
+ * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX.
+ * Special: value 0 means "use default value" (default: 3). */
+ ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table.
+ * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
+ * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
+ * Larger values improve compression speed.
+ * Deviating far from default value will likely result in a compression ratio decrease.
+ * Special: value 0 means "automatically determine hashRateLog". */
+
+ /* frame parameters */
+ ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1)
+ * Content size must be known at the beginning of compression.
+ * This is automatically the case when using ZSTD_compress2(),
+ * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */
+ ZSTD_c_checksumFlag=201, /* A 32-bit checksum of content is written at end of frame (default:0) */
+ ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */
+
+ /* multi-threading parameters */
+ /* These parameters are only useful if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD).
+ * They return an error otherwise. */
+ ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel.
+ * When nbWorkers >= 1, triggers asynchronous mode when used with ZSTD_compressStream*() :
+ * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller,
+ * while compression work is performed in parallel, within worker threads.
+ * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end :
+ * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call).
+ * More workers improve speed, but also increase memory usage.
+ * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */
+ ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1.
+ * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads.
+ * 0 means default, which is dynamically determined based on compression parameters.
+ * Job size must be a minimum of overlap size, or 1 MB, whichever is largest.
+ * The minimum size is automatically and transparently enforced. */
+ ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size.
+ * The overlap size is an amount of data reloaded from previous job at the beginning of a new job.
+ * It helps preserve compression ratio, while each job is compressed in parallel.
+ * This value is enforced only when nbWorkers >= 1.
+ * Larger values increase compression ratio, but decrease speed.
+ * Possible values range from 0 to 9 :
+ * - 0 means "default" : value will be determined by the library, depending on strategy
+ * - 1 means "no overlap"
+ * - 9 means "full overlap", using a full window size.
+ * Each intermediate rank increases/decreases load size by a factor 2 :
+ * 9: full window; 8: w/2; 7: w/4; 6: w/8; 5:w/16; 4: w/32; 3:w/64; 2:w/128; 1:no overlap; 0:default
+ * default value varies between 6 and 9, depending on strategy */
+
+ /* note : additional experimental parameters are also available
+ * within the experimental section of the API.
+ * At the time of this writing, they include :
+ * ZSTD_c_rsyncable
+ * ZSTD_c_format
+ * ZSTD_c_forceMaxWindow
+ * ZSTD_c_forceAttachDict
+ * ZSTD_c_literalCompressionMode
+ * ZSTD_c_targetCBlockSize
+ * ZSTD_c_srcSizeHint
+ * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+ * note : never ever use experimentalParam? names directly;
+ * also, the enum values themselves are unstable and can still change.
+ */
+ ZSTD_c_experimentalParam1=500,
+ ZSTD_c_experimentalParam2=10,
+ ZSTD_c_experimentalParam3=1000,
+ ZSTD_c_experimentalParam4=1001,
+ ZSTD_c_experimentalParam5=1002,
+ ZSTD_c_experimentalParam6=1003,
+ ZSTD_c_experimentalParam7=1004
+} ZSTD_cParameter;
+
+typedef struct {
+ size_t error;
+ int lowerBound;
+ int upperBound;
+} ZSTD_bounds;
+
+/*! ZSTD_cParam_getBounds() :
+ * All parameters must belong to an interval with lower and upper bounds,
+ * otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ * - an error status field, which must be tested using ZSTD_isError()
+ * - lower and upper bounds, both inclusive
+ */
+ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam);
+
+/*! ZSTD_CCtx_setParameter() :
+ * Set one compression parameter, selected by enum ZSTD_cParameter.
+ * All parameters have valid bounds. Bounds can be queried using ZSTD_cParam_getBounds().
+ * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ * Setting a parameter is generally only possible during frame initialization (before starting compression).
+ * Exception : when using multi-threading mode (nbWorkers >= 1),
+ * the following parameters can be updated _during_ compression (within same frame):
+ * => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
+ * new parameters will be active for next job only (after a flush()).
+ * @return : an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value);
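+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * querying a parameter's bounds before setting it, as described above.
+ * Clamping to the bound is an illustrative policy; letting
+ * ZSTD_CCtx_setParameter() report the error is equally valid. */
+#if 0
+#include <zstd.h>
+
+static size_t set_cparam_clamped(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
+{
+    ZSTD_bounds const bounds = ZSTD_cParam_getBounds(param);
+    if (ZSTD_isError(bounds.error)) return bounds.error;
+    if (value < bounds.lowerBound) value = bounds.lowerBound;
+    if (value > bounds.upperBound) value = bounds.upperBound;
+    return ZSTD_CCtx_setParameter(cctx, param, value);
+}
+#endif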
+
+/*! ZSTD_CCtx_setPledgedSrcSize() :
+ * Total input data size to be compressed as a single frame.
+ * Value will be written into the frame header, unless explicitly forbidden using ZSTD_c_contentSizeFlag.
+ * This value will also be checked at end of frame, and will trigger an error if not respected.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame.
+ * In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN.
+ * ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame.
+ * Note 2 : pledgedSrcSize is only valid once, for the next frame.
+ * It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note 3 : Whenever all input data is provided and consumed in a single round,
+ * for example with ZSTD_compress2(),
+ * or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end),
+ * this value is automatically overridden by srcSize instead.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
+
+typedef enum {
+ ZSTD_reset_session_only = 1,
+ ZSTD_reset_parameters = 2,
+ ZSTD_reset_session_and_parameters = 3
+} ZSTD_ResetDirective;
+
+/*! ZSTD_CCtx_reset() :
+ * There are 2 different things that can be reset, independently or jointly :
+ * - The session : will stop compressing current frame, and make CCtx ready to start a new one.
+ * Useful after an error, or to interrupt any ongoing compression.
+ * Any internal data not yet flushed is cancelled.
+ * Compression parameters and dictionary remain unchanged.
+ * They will be used to compress next frame.
+ * Resetting session never fails.
+ * - The parameters : changes all parameters back to "default".
+ * This removes any reference to any dictionary too.
+ * Parameters can only be changed between 2 sessions (i.e. when no compression is currently ongoing);
+ * otherwise the reset fails, and the function returns an error value (which can be tested using ZSTD_isError()).
+ * - Both : similar to resetting the session, followed by resetting parameters.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
+
+/*! ZSTD_compress2() :
+ * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API.
+ * ZSTD_compress2() always starts a new frame.
+ * Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ * - The function is always blocking, returns when compression is completed.
+ * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ * @return : compressed size written into `dst` (<= `dstCapacity`),
+ * or an error code if it fails (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
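+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * the advanced one-shot path : sticky parameters pushed with ZSTD_CCtx_set*(),
+ * then ZSTD_compress2(). The selected parameter values are illustrative. */
+#if 0
+#include <zstd.h>
+
+static size_t compress_advanced(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize)
+{
+    /* sticky : these settings also apply to any later frame on this cctx,
+     * until ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters) */
+    size_t err;
+    err = ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
+    if (ZSTD_isError(err)) return err;
+    err = ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
+    if (ZSTD_isError(err)) return err;
+    err = ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, 1);
+    if (ZSTD_isError(err)) return err;
+    /* ZSTD_compress2() always starts a new frame and is always blocking */
+    return ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+}
+#endif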
+
+
+/***************************************
+* Advanced decompression API
+***************************************/
+
+/* The advanced API pushes parameters one by one into an existing DCtx context.
+ * Parameters are sticky, and remain valid for all following frames
+ * using the same DCtx context.
+ * It's possible to reset parameters to default values using ZSTD_DCtx_reset().
+ * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream().
+ * Therefore, no new decompression function is necessary.
+ */
+
+typedef enum {
+
+ ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which
+ * the streaming API will refuse to allocate memory buffer
+ * in order to protect the host from unreasonable memory requirements.
+ * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT).
+ * Special: value 0 means "use default maximum windowLog". */
+
+ /* note : additional experimental parameters are also available
+ * within the experimental section of the API.
+ * At the time of this writing, they include :
+ * ZSTD_d_format
+ * ZSTD_d_stableOutBuffer
+ * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+ * note : never ever use experimentalParam? names directly
+ */
+ ZSTD_d_experimentalParam1=1000,
+ ZSTD_d_experimentalParam2=1001
+
+} ZSTD_dParameter;
+
+/*! ZSTD_dParam_getBounds() :
+ * All parameters must belong to an interval with lower and upper bounds,
+ * otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ * - an error status field, which must be tested using ZSTD_isError()
+ * - both lower and upper bounds, inclusive
+ */
+ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam);
+
+/*! ZSTD_DCtx_setParameter() :
+ * Set one decompression parameter, selected by enum ZSTD_dParameter.
+ * All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds().
+ * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ * Setting a parameter is only possible during frame initialization (before starting decompression).
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value);
+
+/*! ZSTD_DCtx_reset() :
+ * Return a DCtx to a clean state.
+ * Session and parameters can be reset jointly or separately.
+ * Parameters can only be reset when no active frame is being decompressed.
+ * @return : 0, or an error code, which can be tested with ZSTD_isError()
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset);
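+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * setting a sticky decompression parameter and resetting it later.
+ * The 2^27 (128 MB) window limit is an illustrative choice; it mainly matters
+ * for streaming decompression, which allocates the window buffer internally. */
+#if 0
+#include <stdlib.h>
+#include <zstd.h>
+
+static void cap_window(ZSTD_DCtx* dctx)
+{
+    /* sticky for every following frame decoded with this dctx */
+    size_t const err = ZSTD_DCtx_setParameter(dctx, ZSTD_d_windowLogMax, 27);
+    if (ZSTD_isError(err)) abort();
+}
+
+static void forget_parameters(ZSTD_DCtx* dctx)
+{
+    /* drop sticky parameters (and any referenced dictionary) back to defaults */
+    ZSTD_DCtx_reset(dctx, ZSTD_reset_session_and_parameters);
+}
+#endif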
+
+
+/****************************
+* Streaming
+****************************/
+
+typedef struct ZSTD_inBuffer_s {
+ const void* src; /**< start of input buffer */
+ size_t size; /**< size of input buffer */
+ size_t pos; /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_inBuffer;
+
+typedef struct ZSTD_outBuffer_s {
+ void* dst; /**< start of output buffer */
+ size_t size; /**< size of output buffer */
+ size_t pos; /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_outBuffer;
+
+
+
+/*-***********************************************************************
+* Streaming compression - HowTo
+*
+* A ZSTD_CStream object is required to track streaming operation.
+* Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+* ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+* It is recommended to re-use a ZSTD_CStream, since it plays nicer with the system's memory by re-using already allocated memory.
+*
+* For parallel execution, use one separate ZSTD_CStream per thread.
+*
+* note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
+*
+* Parameters are sticky : when starting a new compression on the same context,
+* it will re-use the same sticky parameters as previous compression session.
+* When in doubt, it's recommended to fully initialize the context before usage.
+* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
+* ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
+* set more specific parameters, the pledged source size, or load a dictionary.
+*
+* Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to
+* consume input stream. The function will automatically update both `pos`
+* fields within `input` and `output`.
+* Note that the function may not consume the entire input, for example, because
+* the output buffer is already full, in which case `input.pos < input.size`.
+* The caller must check whether the input has been entirely consumed.
+* If not, the caller must make some room to receive more compressed data,
+* and then present the remaining input data again.
+* note: ZSTD_e_continue is guaranteed to make some forward progress when called,
+* but doesn't guarantee maximal forward progress. This is especially relevant
+* when compressing with multiple threads. The call won't block if it can
+* consume some input, but if it can't it will wait for some, but not all,
+* output to be flushed.
+* @return : provides a minimum amount of data remaining to be flushed from internal buffers
+* or an error code, which can be tested using ZSTD_isError().
+*
+* At any moment, it's possible to flush whatever data might remain stuck within internal buffer,
+* using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated.
+* Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0).
+* In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush.
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the
+* operation.
+* note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will
+* block until the flush is complete or the output buffer is full.
+* @return : 0 if internal buffers are entirely flushed,
+* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+* or an error code, which can be tested using ZSTD_isError().
+*
+* Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame.
+* It will perform a flush and write frame epilogue.
+* The epilogue is required for decoders to consider a frame completed.
+* flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush.
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to
+* start a new frame.
+* note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will
+* block until the flush is complete or the output buffer is full.
+* @return : 0 if frame fully completed and fully flushed,
+* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+* or an error code, which can be tested using ZSTD_isError().
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream; /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+ /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+/*===== Streaming compression functions =====*/
+typedef enum {
+ ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+ ZSTD_e_flush=1, /* flush any data provided so far,
+ * it creates (at least) one new block, that can be decoded immediately on reception;
+ * frame will continue: any future data can still reference previously compressed data, improving compression.
+ * note : multithreaded compression will block to flush as much output as possible. */
+ ZSTD_e_end=2 /* flush any remaining data _and_ close current frame.
+ * note that frame is only closed after compressed data is fully flushed (return value == 0).
+ * After that point, any additional data starts a new frame.
+ * note : each frame is independent (does not reference any content from previous frame).
+ * note : multithreaded compression will block to flush as much output as possible. */
+} ZSTD_EndDirective;
+
+/*! ZSTD_compressStream2() :
+ * Behaves about the same as ZSTD_compressStream, with additional control on end directive.
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ * - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode)
+ * - output->pos must be <= dstCapacity, input->pos must be <= srcSize
+ * - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ * - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller.
+ * - When nbWorkers>=1, function is non-blocking : it just acquires a copy of input, distributes jobs to internal worker threads, flushes whatever is available,
+ * and then immediately returns, just indicating that there is some data remaining to be flushed.
+ * The function nonetheless guarantees forward progress : it will return only after it has read or written at least 1 byte.
+ * - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking.
+ * - @return provides a minimum amount of data remaining to be flushed from internal buffers
+ * or an error code, which can be tested using ZSTD_isError().
+ * if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
+ * This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ * For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
+ * - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
+ * only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+ * Before starting a new compression job, or changing compression parameters,
+ * it is required to fully flush internal buffers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective endOp);
+
+
+/* These buffer sizes are softly recommended.
+ * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output.
+ * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(),
+ * reducing the amount of memory shuffling and buffering, resulting in minor performance savings.
+ *
+ * However, note that these recommendations are from the perspective of a C caller program.
+ * If the streaming interface is invoked from some other language,
+ * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo,
+ * a major performance rule is to reduce crossing such interface to an absolute minimum.
+ * It's not rare for more time to end up being spent crossing the interface than on compression itself.
+ * In such cases, prefer using large buffers, as large as practical,
+ * for both input and output, to reduce the number of roundtrips.
+ */
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void); /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block. */
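+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * streaming compression of one FILE* into another, following the HowTo above.
+ * The FILE*-based I/O, the chosen parameters and the abort-on-error policy are
+ * illustrative assumptions. */
+#if 0
+#include <stdio.h>
+#include <stdlib.h>
+#include <zstd.h>
+
+static void stream_compress_file(FILE* fin, FILE* fout, int level)
+{
+    size_t const inSize  = ZSTD_CStreamInSize();   /* recommended input buffer size */
+    size_t const outSize = ZSTD_CStreamOutSize();  /* can always flush at least one full block */
+    void* const inBuff  = malloc(inSize);
+    void* const outBuff = malloc(outSize);
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    if (inBuff == NULL || outBuff == NULL || cctx == NULL) abort();
+
+    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);
+    ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
+
+    for (;;) {
+        size_t const read = fread(inBuff, 1, inSize, fin);
+        int const lastChunk = (read < inSize);     /* short read => end of input */
+        ZSTD_EndDirective const mode = lastChunk ? ZSTD_e_end : ZSTD_e_continue;
+        ZSTD_inBuffer input = { inBuff, read, 0 };
+        int finished;
+        do {
+            ZSTD_outBuffer output = { outBuff, outSize, 0 };
+            size_t const remaining = ZSTD_compressStream2(cctx, &output, &input, mode);
+            if (ZSTD_isError(remaining)) abort();
+            fwrite(outBuff, 1, output.pos, fout);
+            /* with ZSTD_e_end, loop until the frame epilogue is fully flushed (remaining == 0);
+             * with ZSTD_e_continue, loop until all of this chunk has been consumed */
+            finished = lastChunk ? (remaining == 0) : (input.pos == input.size);
+        } while (!finished);
+        if (lastChunk) break;
+    }
+
+    ZSTD_freeCCtx(cctx);
+    free(inBuff);
+    free(outBuff);
+}
+#endif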
+
+
+/* *****************************************************************************
+ * This following is a legacy streaming API.
+ * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
+ * It is redundant, but remains fully supported.
+ * Advanced parameters and dictionary compression can only be used through the
+ * new API.
+ ******************************************************************************/
+
+/*!
+ * Equivalent to:
+ *
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ */
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+/*!
+ * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue).
+ * NOTE: The return value is different. ZSTD_compressStream() returns a hint for
+ * the next read size (if non-zero and not an error). ZSTD_compressStream2()
+ * returns the minimum nb of bytes left to flush (if non-zero and not an error).
+ */
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+
+/*-***************************************************************************
+* Streaming decompression - HowTo
+*
+* A ZSTD_DStream object is required to track streaming operations.
+* Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+* ZSTD_DStream objects can be re-used multiple times.
+*
+* Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+* Alternatively, use advanced API to set specific properties.
+*
+* Use ZSTD_decompressStream() repetitively to consume your input.
+* The function will update both `pos` fields.
+* If `input.pos < input.size`, some input has not been consumed.
+* It's up to the caller to present again remaining data.
+* The function tries to flush all data decoded immediately, respecting output buffer size.
+* If `output.pos < output.size`, decoder has flushed everything it could.
+* But if `output.pos == output.size`, there might be some data left within internal buffers.
+* In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+* Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed,
+* or an error code, which can be tested using ZSTD_isError(),
+* or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
+* the return value is a suggested next input size (just a hint for better latency)
+* that will never request more than the remaining frame size.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
+ /* For compatibility with versions <= v1.2.0, prefer differentiating them. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
+
+/*===== Streaming decompression functions =====*/
+
+/* This function is redundant with the advanced API and equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_refDDict(zds, NULL);
+ */
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
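+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * streaming decompression of one FILE* into another, following the HowTo above.
+ * The FILE*-based I/O and the abort-on-error policy are illustrative assumptions. */
+#if 0
+#include <stdio.h>
+#include <stdlib.h>
+#include <zstd.h>
+
+static void stream_decompress_file(FILE* fin, FILE* fout)
+{
+    size_t const inSize  = ZSTD_DStreamInSize();
+    size_t const outSize = ZSTD_DStreamOutSize();
+    void* const inBuff  = malloc(inSize);
+    void* const outBuff = malloc(outSize);
+    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+    size_t lastRet = 0;    /* return value of the last ZSTD_decompressStream() call */
+    size_t read;
+    if (inBuff == NULL || outBuff == NULL || dctx == NULL) abort();
+
+    while ((read = fread(inBuff, 1, inSize, fin)) != 0) {
+        ZSTD_inBuffer input = { inBuff, read, 0 };
+        while (input.pos < input.size) {
+            ZSTD_outBuffer output = { outBuff, outSize, 0 };
+            size_t const ret = ZSTD_decompressStream(dctx, &output, &input);
+            if (ZSTD_isError(ret)) abort();
+            fwrite(outBuff, 1, output.pos, fout);
+            lastRet = ret;
+        }
+    }
+    /* lastRet != 0 means the input ended inside a frame : truncated or corrupted data */
+    if (lastRet != 0) abort();
+
+    ZSTD_freeDCtx(dctx);
+    free(inBuff);
+    free(outBuff);
+}
+#endif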
+
+
+/**************************
+* Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+ * Compression at an explicit compression level using a Dictionary.
+ * A dictionary can be any arbitrary data segment (also called a prefix),
+ * or a buffer with specified information (see dictBuilder/zdict.h).
+ * Note : This function loads the dictionary, resulting in significant startup delay.
+ * It's intended for a dictionary used only once.
+ * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+ * Decompression using a known Dictionary.
+ * Dictionary must be identical to the one used during compression.
+ * Note : This function loads the dictionary, resulting in significant startup delay.
+ * It's intended for a dictionary used only once.
+ * Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize);
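+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * one-shot compression and decompression with the same dictionary buffer.
+ * The dictionary is assumed to already exist in memory (e.g. built with the
+ * zdict.h trainer); its origin is outside the scope of this sketch. */
+#if 0
+#include <stdlib.h>
+#include <zstd.h>
+
+static size_t roundtrip_with_dict(void* dst, size_t dstCapacity,
+                                  void* regen, size_t regenCapacity,
+                                  const void* src, size_t srcSize,
+                                  const void* dict, size_t dictSize)
+{
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+    size_t cSize, dSize;
+    if (cctx == NULL || dctx == NULL) abort();
+    cSize = ZSTD_compress_usingDict(cctx, dst, dstCapacity,
+                                    src, srcSize, dict, dictSize, ZSTD_CLEVEL_DEFAULT);
+    dSize = cSize;
+    if (!ZSTD_isError(cSize)) {
+        /* decompression must use the exact same dictionary */
+        dSize = ZSTD_decompress_usingDict(dctx, regen, regenCapacity,
+                                          dst, cSize, dict, dictSize);
+    }
+    ZSTD_freeCCtx(cctx);
+    ZSTD_freeDCtx(dctx);
+    return dSize;   /* regenerated size, or an error code */
+}
+#endif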
+
+
+/***********************************
+ * Bulk processing dictionary API
+ **********************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/*! ZSTD_createCDict() :
+ * When compressing multiple messages or blocks using the same dictionary,
+ * it's recommended to digest the dictionary only once, since it's a costly operation.
+ * ZSTD_createCDict() will create a state from digesting a dictionary.
+ * The resulting state can be used for future compression operations with very limited startup cost.
+ * ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict.
+ * Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content.
+ * Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer,
+ * in which case the only thing that it transports is the @compressionLevel.
+ * This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively,
+ * expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
+ int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+ * Function frees memory allocated by ZSTD_createCDict(). */
+ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+ * Compression using a digested Dictionary.
+ * Recommended when same dictionary is used multiple times.
+ * Note : compression level is _decided at dictionary creation time_,
+ * and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict);
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+ * Create a digested dictionary, ready to start decompression operation without startup delay.
+ * dictBuffer can be released after DDict creation, as its content is copied inside DDict. */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+ * Function frees memory allocated with ZSTD_createDDict() */
+ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ * Decompression using a digested Dictionary.
+ * Recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_DDict* ddict);
+
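+/* Example (illustrative sketch) : digest the dictionary once, then reuse it
+ * for many messages. `msgs[i]`/`msgSizes[i]` and the output buffers are
+ * application-provided placeholders.
+ *
+ *     ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictLen, 3);
+ *     ZSTD_DDict* const ddict = ZSTD_createDDict(dictBuf, dictLen);
+ *     ZSTD_CCtx*  const cctx  = ZSTD_createCCtx();
+ *     ZSTD_DCtx*  const dctx  = ZSTD_createDCtx();
+ *     for (size_t i = 0; i < nbMsgs; i++) {
+ *         size_t const cSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity,
+ *                                                       msgs[i], msgSizes[i], cdict);
+ *         size_t const rSize = ZSTD_decompress_usingDDict(dctx, rdst, rdstCapacity,
+ *                                                         dst, cSize, ddict);
+ *     }
+ *     ZSTD_freeCDict(cdict); ZSTD_freeDDict(ddict);
+ *     ZSTD_freeCCtx(cctx);   ZSTD_freeDCtx(dctx);
+ */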
+
+/********************************
+ * Dictionary helper functions
+ *******************************/
+
+/*! ZSTD_getDictID_fromDict() :
+ * Provides the dictID stored within dictionary.
+ * If @return == 0, the dictionary is not conformant with Zstandard specification.
+ * It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromDDict() :
+ * Provides the dictID of the dictionary loaded into `ddict`.
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ * Provides the dictID required to decompress the frame stored within `src`.
+ * If @return == 0, the dictID could not be decoded.
+ * This could be for one of the following reasons :
+ * - The frame does not require a dictionary to be decoded (most common case).
+ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is hidden information.
+ * Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ * - This is not a Zstandard frame.
+ * When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/*******************************************************************************
+ * Advanced dictionary and prefix API
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+ * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and
+ * are only cleared when the context is reset with ZSTD_reset_parameters or
+ * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/
+
+
+/*! ZSTD_CCtx_loadDictionary() :
+ * Create an internal CDict from `dict` buffer.
+ * Decompression will have to use the same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
+ * meaning "return to no-dictionary mode".
+ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
+ * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ * Note 2 : Loading a dictionary involves building tables.
+ * It's also a CPU consuming operation, with non-negligible impact on latency.
+ * Tables are dependent on compression parameters, and for this reason,
+ * compression parameters can no longer be changed after loading a dictionary.
+ * Note 3 :`dict` content will be copied internally.
+ * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
+ * In such a case, dictionary buffer must outlive its users.
+ * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ * to precisely select how dictionary content must be interpreted. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
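+/* Example (illustrative sketch) : load a dictionary into the context once,
+ * then compress several frames with ZSTD_compress2(). Buffer names are
+ * placeholders; note that parameters are set before loading the dictionary,
+ * since they can no longer change afterwards.
+ *
+ *     ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 5);
+ *     ZSTD_CCtx_loadDictionary(cctx, dictBuf, dictLen);       // sticky
+ *     size_t const cSize1 = ZSTD_compress2(cctx, dst1, dst1Cap, src1, src1Size);
+ *     size_t const cSize2 = ZSTD_compress2(cctx, dst2, dst2Cap, src2, src2Size);
+ *     // decompression must load the same dictionary, e.g. via ZSTD_DCtx_loadDictionary()
+ */
+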
+/*! ZSTD_CCtx_refCDict() :
+ * Reference a prepared dictionary, to be used for all next compressed frames.
+ * Note that compression parameters are enforced from within CDict,
+ * and supersede any compression parameter previously set within CCtx.
+ * The parameters ignored are labeled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
+ * The ignored parameters will be used again if the CCtx is returned to no-dictionary mode.
+ * The dictionary will remain valid for future compressed frames using same CCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special : Referencing a NULL CDict means "return to no-dictionary mode".
+ * Note 1 : Currently, only one dictionary can be managed.
+ * Referencing a new dictionary effectively "discards" any previous one.
+ * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */
+ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+/*! ZSTD_CCtx_refPrefix() :
+ * Reference a prefix (single-usage dictionary) for next compressed frame.
+ * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end).
+ * Decompression will need same prefix to properly regenerate data.
+ * Compressing with a prefix is similar in outcome as performing a diff and compressing it,
+ * but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
+ * Note 1 : Prefix buffer is referenced. It **must** outlive compression.
+ * Its content must remain unmodified during compression.
+ * Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ * ensure that the window size is large enough to contain the entire source.
+ * See ZSTD_c_windowLog.
+ * Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ * It's a CPU consuming operation, with non-negligible impact on latency.
+ * If there is a need to use the same prefix multiple times, consider loadDictionary instead.
+ * Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent).
+ * Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+ const void* prefix, size_t prefixSize);
+
+/*! ZSTD_DCtx_loadDictionary() :
+ * Create an internal DDict from dict buffer,
+ * to be used to decompress next frames.
+ * The dictionary remains valid for all future frames, until explicitly invalidated.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
+ * meaning "return to no-dictionary mode".
+ * Note 1 : Loading a dictionary involves building tables,
+ * which has a non-negligible impact on CPU usage and latency.
+ * It's recommended to "load once, use many times", to amortize the cost.
+ * Note 2 :`dict` content will be copied internally, so `dict` can be released after loading.
+ * Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead.
+ * Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of
+ * how dictionary content is loaded and interpreted.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_refDDict() :
+ * Reference a prepared dictionary, to be used to decompress next frames.
+ * The dictionary remains active for decompression of future frames using same DCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : Currently, only one dictionary can be managed.
+ * Referencing a new dictionary effectively "discards" any previous one.
+ * Special: referencing a NULL DDict means "return to no-dictionary mode".
+ * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+/*! ZSTD_DCtx_refPrefix() :
+ * Reference a prefix (single-usage dictionary) to decompress next frame.
+ * This is the reverse operation of ZSTD_CCtx_refPrefix(),
+ * and must use the same prefix as the one used during compression.
+ * Prefix is **only used once**. Reference is discarded at end of frame.
+ * End of frame is reached when ZSTD_decompressStream() returns 0.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary
+ * Note 2 : Prefix buffer is referenced. It **must** outlive decompression.
+ * Prefix buffer must remain unmodified up to the end of frame,
+ * reached when ZSTD_decompressStream() returns 0.
+ * Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent).
+ * Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode (Experimental section)
+ * Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost.
+ * A full dictionary is more costly, as it requires building tables.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx,
+ const void* prefix, size_t prefixSize);
+
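+/* Example (illustrative sketch) : diff-like compression against a previous
+ * version of the data. `prevBuf`/`prevSize` is the prior version, referenced
+ * as a prefix on both sides; it must stay valid and unmodified for the
+ * duration of each call.
+ *
+ *     ZSTD_CCtx_refPrefix(cctx, prevBuf, prevSize);
+ *     size_t const cSize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *
+ *     ZSTD_DCtx_refPrefix(dctx, prevBuf, prevSize);
+ *     size_t const rSize = ZSTD_decompressDCtx(dctx, rdst, rdstCapacity, dst, cSize);
+ *     // the prefix reference is consumed by each frame; re-reference it for the next one
+ */
+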
+/* === Memory management === */
+
+/*! ZSTD_sizeof_*() :
+ * These functions give the _current_ memory usage of selected object.
+ * Note that object memory usage can evolve (increase or decrease) over time. */
+ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
+ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+
+#endif /* ZSTD_H_235446 */
+
+
+/* **************************************************************************************
+ * ADVANCED AND EXPERIMENTAL FUNCTIONS
+ ****************************************************************************************
+ * The definitions in the following section are considered experimental.
+ * They are provided for advanced scenarios.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/****************************************************************************************
+ * experimental API (static linking only)
+ ****************************************************************************************
+ * The following symbols and constants
+ * are not planned to join "stable API" status in the near future.
+ * They can still change in future versions.
+ * Some of them are planned to remain in the static_only section indefinitely.
+ * Some of them might be removed in the future (especially when redundant with existing stable functions)
+ * ***************************************************************************************/
+
+#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1) /* minimum input size required to query frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN(format) ((format) == ZSTD_f_zstd1 ? 6 : 2)
+#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* can be useful for static allocation */
+#define ZSTD_SKIPPABLEHEADERSIZE 8
+
+/* compression parameter bounds */
+#define ZSTD_WINDOWLOG_MAX_32 30
+#define ZSTD_WINDOWLOG_MAX_64 31
+#define ZSTD_WINDOWLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN 10
+#define ZSTD_HASHLOG_MAX ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN 6
+#define ZSTD_CHAINLOG_MAX_32 29
+#define ZSTD_CHAINLOG_MAX_64 30
+#define ZSTD_CHAINLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN
+#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN 1
+#define ZSTD_MINMATCH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_MINMATCH_MIN 3 /* only for ZSTD_btopt+, faster strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MAX ZSTD_BLOCKSIZE_MAX
+#define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */
+#define ZSTD_STRATEGY_MIN ZSTD_fast
+#define ZSTD_STRATEGY_MAX ZSTD_btultra2
+
+
+#define ZSTD_OVERLAPLOG_MIN 0
+#define ZSTD_OVERLAPLOG_MAX 9
+
+#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27 /* by default, the streaming decoder will refuse any frame
+ * requiring larger than (1<<ZSTD_WINDOWLOG_LIMIT_DEFAULT) window size,
+ * to preserve host's memory from unreasonable requirements.
+ * This limit can be overridden using ZSTD_DCtx_setParameter(,ZSTD_d_windowLogMax,).
+ * The limit does not apply for one-pass decoders (such as ZSTD_decompress()), since no additional memory is allocated */
+
+
+/* LDM parameter bounds */
+#define ZSTD_LDM_HASHLOG_MIN ZSTD_HASHLOG_MIN
+#define ZSTD_LDM_HASHLOG_MAX ZSTD_HASHLOG_MAX
+#define ZSTD_LDM_MINMATCH_MIN 4
+#define ZSTD_LDM_MINMATCH_MAX 4096
+#define ZSTD_LDM_BUCKETSIZELOG_MIN 1
+#define ZSTD_LDM_BUCKETSIZELOG_MAX 8
+#define ZSTD_LDM_HASHRATELOG_MIN 0
+#define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
+
+/* Advanced parameter bounds */
+#define ZSTD_TARGETCBLOCKSIZE_MIN 64
+#define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX
+#define ZSTD_SRCSIZEHINT_MIN 0
+#define ZSTD_SRCSIZEHINT_MAX INT_MAX
+
+/* internal */
+#define ZSTD_HASHLOG3_MAX 17
+
+
+/* --- Advanced types --- */
+
+typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
+
+typedef struct {
+ unsigned int matchPos; /* Match pos in dst */
+ /* If seqDef.offset > 3, then this is seqDef.offset - 3
+ * If seqDef.offset < 3, then this is the corresponding repeat offset
+ * But if seqDef.offset < 3 and litLength == 0, this is the
+ * repeat offset before the corresponding repeat offset
+ * And if seqDef.offset == 3 and litLength == 0, this is the
+ * most recent repeat offset - 1
+ */
+ unsigned int offset;
+ unsigned int litLength; /* Literal length */
+ unsigned int matchLength; /* Match length */
+ /* 0 when seq not rep and seqDef.offset otherwise
+ * when litLength == 0 this will be <= 4, otherwise <= 3 like normal
+ */
+ unsigned int rep;
+} ZSTD_Sequence;
+
+typedef struct {
+ unsigned windowLog; /**< largest match distance : larger == more compression, more memory needed during decompression */
+ unsigned chainLog; /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+ unsigned hashLog; /**< dispatch table : larger == faster, more memory */
+ unsigned searchLog; /**< nb of searches : larger == more compression, slower */
+ unsigned minMatch; /**< match length searched : larger == faster decompression, sometimes less compression */
+ unsigned targetLength; /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+ ZSTD_strategy strategy; /**< see ZSTD_strategy definition above */
+} ZSTD_compressionParameters;
+
+typedef struct {
+ int contentSizeFlag; /**< 1: content size will be in frame header (when known) */
+ int checksumFlag; /**< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */
+ int noDictIDFlag; /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */
+} ZSTD_frameParameters;
+
+typedef struct {
+ ZSTD_compressionParameters cParams;
+ ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+typedef enum {
+ ZSTD_dct_auto = 0, /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+ ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+ ZSTD_dct_fullDict = 2 /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+ ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */
+ ZSTD_dlm_byRef = 1 /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;
+
+typedef enum {
+ ZSTD_f_zstd1 = 0, /* zstd frame format, specified in zstd_compression_format.md (default) */
+ ZSTD_f_zstd1_magicless = 1 /* Variant of zstd frame format, without initial 4-bytes magic number.
+ * Useful to save 4 bytes per generated frame.
+ * Decoder cannot recognise automatically this format, requiring this instruction. */
+} ZSTD_format_e;
+
+typedef enum {
+ /* Note: this enum and the behavior it controls are effectively internal
+ * implementation details of the compressor. They are expected to continue
+ * to evolve and should be considered only in the context of extremely
+ * advanced performance tuning.
+ *
+ * Zstd currently supports the use of a CDict in three ways:
+ *
+ * - The contents of the CDict can be copied into the working context. This
+ * means that the compression can search both the dictionary and input
+ * while operating on a single set of internal tables. This makes
+ * the compression faster per-byte of input. However, the initial copy of
+ * the CDict's tables incurs a fixed cost at the beginning of the
+ * compression. For small compressions (< 8 KB), that copy can dominate
+ * the cost of the compression.
+ *
+ * - The CDict's tables can be used in-place. In this model, compression is
+ * slower per input byte, because the compressor has to search two sets of
+ * tables. However, this model incurs no start-up cost (as long as the
+ * working context's tables can be reused). For small inputs, this can be
+ * faster than copying the CDict's tables.
+ *
+ * - The CDict's tables are not used at all, and instead we use the working
+ * context alone to reload the dictionary and use params based on the source
+ * size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict().
+ * This method is effective when the dictionary sizes are very small relative
+ * to the input size, and the input size is fairly large to begin with.
+ *
+ * Zstd has a simple internal heuristic that selects which strategy to use
+ * at the beginning of a compression. However, if experimentation shows that
+ * Zstd is making poor choices, it is possible to override that choice with
+ * this enum.
+ */
+ ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */
+ ZSTD_dictForceAttach = 1, /* Never copy the dictionary. */
+ ZSTD_dictForceCopy = 2, /* Always copy the dictionary. */
+ ZSTD_dictForceLoad = 3 /* Always reload the dictionary */
+} ZSTD_dictAttachPref_e;
+
+typedef enum {
+ ZSTD_lcm_auto = 0, /**< Automatically determine the compression mode based on the compression level.
+ * Negative compression levels will be uncompressed, and positive compression
+ * levels will be compressed. */
+ ZSTD_lcm_huffman = 1, /**< Always attempt Huffman compression. Uncompressed literals will still be
+ * emitted if Huffman compression is not profitable. */
+ ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */
+} ZSTD_literalCompressionMode_e;
+
+
+/***************************************
+* Frame size functions
+***************************************/
+
+/*! ZSTD_findDecompressedSize() :
+ * `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ * `srcSize` must be the _exact_ size of this series
+ * (i.e. there should be a frame boundary at `src + srcSize`)
+ * @return : - decompressed size of all data in all successive frames
+ * - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN
+ * - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ * note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
+ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 2 : decompressed size is always present when compression is done with ZSTD_compress()
+ * note 3 : decompressed size can be very large (64-bits value),
+ * potentially larger than what local system can handle as a single memory segment.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ * Always ensure result fits within application's authorized limits.
+ * Each application can set its own limits.
+ * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to
+ * read each contained frame header. This is fast as most of the data is skipped,
+ * however it does mean that all frame data must be present and valid. */
+ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_decompressBound() :
+ * `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ * `srcSize` must be the _exact_ size of this series
+ * (i.e. there should be a frame boundary at `src + srcSize`)
+ * @return : - upper-bound for the decompressed size of all data in all successive frames
+ *           - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame.
+ * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`.
+ * in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value.
+ * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by:
+ * upper-bound = # blocks * min(128 KB, Window_Size)
+ */
+ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize);
+
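+/* Example (illustrative sketch) : size a destination buffer from the bound
+ * before one-shot decompression. `MY_MAX_ALLOC` is an application-defined
+ * limit; the bound can be much larger than the real content size.
+ *
+ *     unsigned long long const bound = ZSTD_decompressBound(cSrc, cSrcSize);
+ *     if (bound == ZSTD_CONTENTSIZE_ERROR) { ... }   // invalid frame(s)
+ *     if (bound > MY_MAX_ALLOC) { ... }              // refuse unreasonable sizes
+ *     void* const dst = malloc((size_t)bound);
+ *     size_t const rSize = ZSTD_decompress(dst, (size_t)bound, cSrc, cSrcSize);
+ */
+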
+/*! ZSTD_frameHeaderSize() :
+ * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX.
+ * @return : size of the Frame Header,
+ * or an error code (if srcSize is too small) */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+/*! ZSTD_getSequences() :
+ * Extract sequences from the sequence store
+ * zc can be used to insert custom compression params.
+ * This function invokes ZSTD_compress2
+ * @return : number of sequences extracted
+ */
+ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+ size_t outSeqsSize, const void* src, size_t srcSize);
+
+
+/***************************************
+* Memory management
+***************************************/
+
+/*! ZSTD_estimate*() :
+ * These functions make it possible to estimate memory usage
+ * of a future {D,C}Ctx, before its creation.
+ *
+ * ZSTD_estimateCCtxSize() will provide a memory budget large enough
+ * for any compression level up to selected one.
+ * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
+ * does not include space for a window buffer.
+ * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
+ * The estimate will assume the input may be arbitrarily large,
+ * which is the worst case.
+ *
+ * When srcSize can be bound by a known and rather "small" value,
+ * this fact can be used to provide a tighter estimation
+ * because the CCtx compression context will need less memory.
+ * This tighter estimation can be provided by more advanced functions
+ * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
+ * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
+ * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
+ *
+ * Note 2 : only single-threaded compression is supported.
+ * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
+
+/*! ZSTD_estimateCStreamSize() :
+ * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+ * It will also consider src size to be arbitrarily "large", which is worst case.
+ * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+ * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter().
+ * Only single-threaded compression is supported; this function will return an error code if ZSTD_c_nbWorkers is >= 1.
+ * Note : CStream size estimation is only correct for single-threaded compression.
+ * ZSTD_DStream memory budget depends on window Size.
+ * This information can be passed manually, using ZSTD_estimateDStreamSize,
+ * or deduced from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ * Note : if streaming is initialized with function ZSTD_init?Stream_usingDict(),
+ * an internal ?Dict will be created, whose additional size is not estimated here.
+ * In this case, get total size by adding ZSTD_estimate?DictSize */
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+
+/*! ZSTD_estimate?DictSize() :
+ * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
+ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
+
+/*! ZSTD_initStatic*() :
+ * Initialize an object using a pre-allocated fixed-size buffer.
+ * workspace: The memory area to emplace the object into.
+ * Provided pointer *must be 8-bytes aligned*.
+ * Buffer must outlive object.
+ * workspaceSize: Use ZSTD_estimate*Size() to determine
+ * how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ * or NULL if error (size too small, incorrect alignment, etc.)
+ * Note : zstd will never resize nor malloc() when using a static buffer.
+ * If the object requires more memory than available,
+ * zstd will just error out (typically ZSTD_error_memory_allocation).
+ * Note 2 : there is no corresponding "free" function.
+ * Since workspace is allocated externally, it must be freed externally too.
+ * Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ * into its associated cParams.
+ * Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ * ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ * Limitation 2 : static cctx currently not compatible with multi-threading.
+ * Limitation 3 : static dctx is incompatible with legacy support.
+ */
+ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams);
+
+ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType);
+
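+/* Example (illustrative sketch) : place a CCtx into a caller-provided
+ * workspace. `level` is the target compression level; error handling is
+ * abbreviated.
+ *
+ *     size_t const wkspSize = ZSTD_estimateCCtxSize(level);
+ *     void* const wksp = malloc(wkspSize);                   // must be 8-bytes aligned
+ *     ZSTD_CCtx* const cctx = ZSTD_initStaticCCtx(wksp, wkspSize);
+ *     if (cctx == NULL) { ... }                              // size too small, bad alignment, ...
+ *     size_t const cSize = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, level);
+ *     // no ZSTD_freeCCtx() : just release the workspace when done
+ *     free(wksp);
+ */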
+
+/*! Custom memory allocation :
+ * These prototypes make it possible to pass your own allocation/free functions.
+ * ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ * All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */
+
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
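+/* Example (illustrative sketch) : route all allocations through custom
+ * functions. `poolAlloc()`/`poolFree()` and `myState` are application-provided;
+ * the `opaque` pointer is passed back verbatim on every call.
+ *
+ *     static void* myAlloc(void* opaque, size_t size) { return poolAlloc(opaque, size); }
+ *     static void  myFree (void* opaque, void* addr)  { poolFree(opaque, addr); }
+ *
+ *     ZSTD_customMem const cmem = { myAlloc, myFree, myState };
+ *     ZSTD_CCtx* const cctx = ZSTD_createCCtx_advanced(cmem);
+ *     // ... use cctx as usual ...
+ *     ZSTD_freeCCtx(cctx);   // releases memory through myFree()
+ */
+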
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams,
+ ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_customMem customMem);
+
+
+
+/***************************************
+* Advanced compression functions
+***************************************/
+
+/*! ZSTD_createCDict_byReference() :
+ * Create a digested dictionary for compression
+ * Dictionary content is just referenced, not duplicated.
+ * As a consequence, `dictBuffer` **must** outlive CDict,
+ * and its content must remain unmodified throughout the lifetime of CDict.
+ * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
+ * `estimatedSrcSize` value is optional, select 0 if not known */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_getParams() :
+ * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`.
+ * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */
+ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_checkCParams() :
+ * Ensure param values remain within authorized range.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+
+/*! ZSTD_adjustCParams() :
+ * optimize params for a given `srcSize` and `dictSize`.
+ * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN.
+ * `dictSize` must be `0` when there is no dictionary.
+ * cPar can be invalid : all parameters will be clamped within valid range in the @return struct.
+ * This function never fails (wide contract) */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
+
+/*! ZSTD_compress_advanced() :
+ * Note : this function is now DEPRECATED.
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
+ * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */
+ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ ZSTD_parameters params);
+
+/*! ZSTD_compress_usingCDict_advanced() :
+ * Note : this function is now REDUNDANT.
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
+ * This prototype will be marked as deprecated and generate compilation warning in some future version */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fParams);
+
+
+/*! ZSTD_CCtx_loadDictionary_byReference() :
+ * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx.
+ * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_loadDictionary_advanced() :
+ * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over
+ * how to load the dictionary (by copy ? by reference ?)
+ * and how to interpret it (automatic ? force raw mode ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_CCtx_refPrefix_advanced() :
+ * Same as ZSTD_CCtx_refPrefix(), but gives finer control over
+ * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/* === experimental parameters === */
+/* these parameters can be used with ZSTD_CCtx_setParameter()
+ * they are not guaranteed to remain supported in the future */
+
+ /* Enables rsyncable mode,
+ * which makes compressed files more rsync friendly
+ * by adding periodic synchronization points to the compressed data.
+ * The target average block size is ZSTD_c_jobSize / 2.
+ * It's possible to modify the job size to increase or decrease
+ * the granularity of the synchronization point.
+ * Once the jobSize is smaller than the window size,
+ * it will result in compression ratio degradation.
+ * NOTE 1: rsyncable mode only works when multithreading is enabled.
+ * NOTE 2: rsyncable performs poorly in combination with long range mode,
+ * since it will decrease the effectiveness of synchronization points,
+ * though mileage may vary.
+ * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s.
+ * If the selected compression level is already running significantly slower,
+ * the overall speed won't be significantly impacted.
+ */
+ #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1
+
+/* Select a compression format.
+ * The value must be of type ZSTD_format_e.
+ * See ZSTD_format_e enum definition for details */
+#define ZSTD_c_format ZSTD_c_experimentalParam2
+
+/* Force back-reference distances to remain < windowSize,
+ * even when referencing into Dictionary content (default:0) */
+#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3
+
+/* Controls whether the contents of a CDict
+ * are used in place, or copied into the working context.
+ * Accepts values from the ZSTD_dictAttachPref_e enum.
+ * See the comments on that enum for an explanation of the feature. */
+#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4
+
+/* Controls how the literals are compressed (default is auto).
+ * The value must be of type ZSTD_literalCompressionMode_e.
+ * See ZSTD_literalCompressionMode_e enum definition for details.
+ */
+#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
+
+/* Tries to fit compressed block size to be around targetCBlockSize.
+ * No target when targetCBlockSize == 0.
+ * There is no guarantee on compressed block size (default:0) */
+#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
+
+/* User's best guess of source size.
+ * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that hint is close to actual source size,
+ * but compression ratio may regress significantly if the guess considerably underestimates the actual source size */
+#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
+
+/*! ZSTD_CCtx_getParameter() :
+ * Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ * and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value);
+
+
+/*! ZSTD_CCtx_params :
+ * Quick howto :
+ * - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ * - ZSTD_CCtxParams_setParameter() : Push parameters one by one into
+ * an existing ZSTD_CCtx_params structure.
+ * This is similar to
+ * ZSTD_CCtx_setParameter().
+ * - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ * an existing CCtx.
+ * These parameters will be applied to
+ * all subsequent frames.
+ * - ZSTD_compressStream2() : Do compression using the CCtx.
+ * - ZSTD_freeCCtxParams() : Free the memory.
+ *
+ * This can be used with ZSTD_estimateCCtxSize_usingCCtxParams()
+ * for static allocation of CCtx for single-threaded compression.
+ */
+ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_reset() :
+ * Reset params to default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_init() :
+ * Initializes the compression parameters of cctxParams according to
+ * compression level. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);
+
+/*! ZSTD_CCtxParams_init_advanced() :
+ * Initializes the compression and frame parameters of cctxParams according to
+ * params. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+
+/*! ZSTD_CCtxParams_setParameter() :
+ * Similar to ZSTD_CCtx_setParameter.
+ * Set one compression parameter, selected by enum ZSTD_cParameter.
+ * Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams().
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtxParams_getParameter() :
+ * Similar to ZSTD_CCtx_getParameter.
+ * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value);
+
+/*! ZSTD_CCtx_setParametersUsingCCtxParams() :
+ * Apply a set of ZSTD_CCtx_params to the compression context.
+ * This can be done even after compression is started.
+ * If nbWorkers==0, this will have no impact until a new compression is started.
+ * If nbWorkers>=1, new parameters will be picked up at next job,
+ * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+ ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
+
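+/* Example (illustrative sketch) : build a reusable parameter set and apply it
+ * to an existing `cctx`.
+ *
+ *     ZSTD_CCtx_params* const params = ZSTD_createCCtxParams();
+ *     ZSTD_CCtxParams_init(params, 3);                        // start from level 3 defaults
+ *     ZSTD_CCtxParams_setParameter(params, ZSTD_c_checksumFlag, 1);
+ *     ZSTD_CCtx_setParametersUsingCCtxParams(cctx, params);   // applies to subsequent frames
+ *     ZSTD_freeCCtxParams(params);
+ */
+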
+/*! ZSTD_compressStream2_simpleArgs() :
+ * Same as ZSTD_compressStream2(),
+ * but using only integral types as arguments.
+ * This variant might be helpful for binders from dynamic languages
+ * which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs (
+ ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos,
+ ZSTD_EndDirective endOp);
+
+
+/***************************************
+* Advanced decompression functions
+***************************************/
+
+/*! ZSTD_isFrame() :
+ * Tells if the content of `buffer` starts with a valid Frame Identifier.
+ * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ * Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_createDDict_byReference() :
+ * Create a digested dictionary, ready to start decompression operation without startup delay.
+ * Dictionary content is referenced, and therefore stays in dictBuffer.
+ * It is important that dictBuffer outlives DDict,
+ * it must remain read accessible throughout the lifetime of DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_byReference() :
+ * Same as ZSTD_DCtx_loadDictionary(),
+ * but references `dict` content instead of copying it into `dctx`.
+ * This saves memory if `dict` remains around.
+ * However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_advanced() :
+ * Same as ZSTD_DCtx_loadDictionary(),
+ * but gives direct control over
+ * how to load the dictionary (by copy ? by reference ?)
+ * and how to interpret it (automatic ? force raw mode ? full mode only ?). */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_refPrefix_advanced() :
+ * Same as ZSTD_DCtx_refPrefix(), but gives finer control over
+ * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ * Refuses to allocate internal buffers for frames requiring a window size larger than the provided limit.
+ * This protects a decoder context from reserving too much memory for itself (potential attack scenario).
+ * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
+
+/* ZSTD_d_format
+ * experimental parameter,
+ * allowing selection between ZSTD_format_e input compression formats
+ */
+#define ZSTD_d_format ZSTD_d_experimentalParam1
+/* ZSTD_d_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the decompressor, and
+ * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer
+ * MUST be large enough to fit the entire decompressed frame. This will be
+ * checked when the frame content size is known. The data in the ZSTD_outBuffer
+ * in the range [dst, dst + pos) MUST not be modified during decompression
+ * or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+ * If you need to avoid the input buffer allocation use the buffer-less
+ * streaming API.
+ *
+ * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, decompression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST
+ * not be modified during decompression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate
+ * matches. Normally zstd maintains its own buffer for this purpose, but passing
+ * this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2
+
+/*! ZSTD_DCtx_setFormat() :
+ * Instruct the decoder context about what kind of data to decode next.
+ * This instruction is mandatory to decode data without a fully-formed header,
+ * such as ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
+
+/*! ZSTD_decompressStream_simpleArgs() :
+ * Same as ZSTD_decompressStream(),
+ * but using only integral types as arguments.
+ * This can be helpful for binders from dynamic languages
+ * which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs (
+ ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos);
+
+
+/********************************************************************
+* Advanced streaming functions
+* Warning : most of these functions are now redundant with the Advanced API.
+* Once Advanced API reaches "stable" status,
+* redundant functions will be deprecated, and then at some point removed.
+********************************************************************/
+
+/*===== Advanced Streaming compression functions =====*/
+/**! ZSTD_initCStream_srcSize() :
+ * This function is deprecated, and equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * pledgedSrcSize must be correct. If it is not known at init time, use
+ * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs,
+ * "0" also disables frame content size field. It may be enabled in the future.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+ int compressionLevel,
+ unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingDict() :
+ * This function is deprecated, and is equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * Creates an internal CDict (incompatible with static CCtx), except if
+ * dict == NULL or dictSize < 8, in which case no dict is used.
+ * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if
+ * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ int compressionLevel);
+
+/**! ZSTD_initCStream_advanced() :
+ * This function is deprecated, and is approximately equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * // Pseudocode: Set each zstd parameter and leave the rest as-is.
+ * for ((param, value) : params) {
+ * ZSTD_CCtx_setParameter(zcs, param, value);
+ * }
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy.
+ * pledgedSrcSize must be correct.
+ * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ ZSTD_parameters params,
+ unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingCDict() :
+ * This function is deprecated, and equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * note : cdict will just be referenced, and must outlive the compression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
+
+/**! ZSTD_initCStream_usingCDict_advanced() :
+ * This function is DEPRECATED, and is approximately equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
+ * for ((fParam, value) : fParams) {
+ * ZSTD_CCtx_setParameter(zcs, fParam, value);
+ * }
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ * ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters.
+ * pledgedSrcSize must be correct. If srcSize is not known at init time, use
+ * value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fParams,
+ unsigned long long pledgedSrcSize);
+
+/*! ZSTD_resetCStream() :
+ * This function is deprecated, and is equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * start a new frame, using same parameters from previous frame.
+ * This is typically useful to skip dictionary loading stage, since it will re-use it in-place.
+ * Note that zcs must be init at least once before using ZSTD_resetCStream().
+ * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
+ * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
+ * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
+ * but it will change to mean "empty" in a future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError())
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
+
+
+typedef struct {
+ unsigned long long ingested; /* nb input bytes read and buffered */
+ unsigned long long consumed; /* nb input bytes actually compressed */
+ unsigned long long produced; /* nb of compressed bytes generated and buffered */
+ unsigned long long flushed; /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
+ unsigned currentJobID; /* MT only : latest started job nb */
+ unsigned nbActiveWorkers; /* MT only : nb of workers actively compressing at probe time */
+} ZSTD_frameProgression;
+
+/* ZSTD_getFrameProgression() :
+ * tells how much data has been ingested (read from input),
+ * consumed (input actually compressed) and produced (output) for the current frame.
+ * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed.
+ * Aggregates progression inside active worker threads.
+ */
+ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+/*! ZSTD_toFlushNow() :
+ * Tell how many bytes are ready to be flushed immediately.
+ * Useful for multithreading scenarios (nbWorkers >= 1).
+ * Probe the oldest active job, defined as oldest job not yet entirely flushed,
+ * and check its output buffer.
+ * @return : amount of data stored in oldest job and ready to be flushed immediately.
+ * if @return == 0, it means either :
+ * + there is no active job (could be checked with ZSTD_frameProgression()), or
+ * + oldest job is still actively compressing data,
+ * but everything it has produced has also been flushed so far,
+ * therefore flush speed is limited by production speed of oldest job
+ * irrespective of the speed of concurrent (and newer) jobs.
+ */
+ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
+
+
+/*===== Advanced Streaming decompression functions =====*/
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
+ *
+ * note: no dictionary will be used if dict == NULL or dictSize < 8
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_refDDict(zds, ddict);
+ *
+ * note : ddict is referenced, it must outlive the decompression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *
+ * re-use decompression parameters from previous init; saves dictionary loading
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+
+
+/*********************************************************************
+* Buffer-less and synchronous inner streaming functions
+*
+* This is an advanced API, giving full control over buffer management, for users who need direct control over memory.
+* But it's also a complex one, with several restrictions, documented below.
+* Prefer normal streaming API for an easier experience.
+********************************************************************* */
+
+/**
+ Buffer-less streaming compression (synchronous mode)
+
+ A ZSTD_CCtx object is required to track streaming operations.
+ Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
+ ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+
+ Start by initializing a context.
+ Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression,
+ or ZSTD_compressBegin_advanced(), for finer parameter control.
+  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx().
+
+ Then, consume your input using ZSTD_compressContinue().
+ There are some important considerations to keep in mind when using this advanced function :
+ - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only.
+ - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks.
+ - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario.
+ Worst case evaluation is provided by ZSTD_compressBound().
+     ZSTD_compressContinue() doesn't guarantee recovery after a failed compression.
+ - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog).
+    It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consist of multiple contiguous blocks).
+  - ZSTD_compressContinue() detects that prior input has been overwritten when the `src` buffer overlaps,
+    in which case it will "discard" the relevant memory section from its history.
+
+ Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum.
+ It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
+ Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
+
+ `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+*/
+
+/*===== Buffer-less streaming compression functions =====*/
+ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
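+/* Illustrative sketch (not part of the upstream header) : compressing two
+ * chunks with the buffer-less API. `dst` must be large enough for the worst
+ * case of each call (ZSTD_compressBound() of the chunk size); ZSTD_isError()
+ * checks are elided for brevity.
+ *
+ *     ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *     size_t pos = 0;
+ *     ZSTD_compressBegin(cctx, 3);
+ *     pos += ZSTD_compressContinue(cctx, dst + pos, dstCapacity - pos, chunk1, chunk1Size);
+ *     pos += ZSTD_compressContinue(cctx, dst + pos, dstCapacity - pos, chunk2, chunk2Size);
+ *     pos += ZSTD_compressEnd(cctx, dst + pos, dstCapacity - pos, NULL, 0);   // writes the final empty block, ending the frame
+ *     ZSTD_freeCCtx(cctx);
+ */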
+
+
+/*-
+ Buffer-less streaming decompression (synchronous mode)
+
+ A ZSTD_DCtx object is required to track streaming operations.
+ Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+ A ZSTD_DCtx object can be re-used multiple times.
+
+ First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
+ Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+ Data fragment must be large enough to ensure successful decoding.
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+ @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+ >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+           or an error code, which can be tested using ZSTD_isError().
+
+ It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+ such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`).
+ Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information.
+ As a consequence, check that values remain within valid application range.
+ For example, do not allocate memory blindly, check that `windowSize` is within expectation.
+ Each application can set its own limits, depending on local restrictions.
+ For extended interoperability, it is recommended to support `windowSize` of at least 8 MB.
+
+ ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes.
+  ZSTD_decompressContinue() is very sensitive to contiguity:
+ if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+ or that previous contiguous segment is large enough to properly handle maximum back-reference distance.
+ There are multiple ways to guarantee this condition.
+
+ The most memory efficient way is to use a round buffer of sufficient size.
+ Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
+ which can @return an error code if required value is too large for current system (in 32-bits mode).
+ In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
+ up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
+  whose maximum size is provided in the `ZSTD_frameHeader` structure, field `blockSizeMax`.
+ At which point, decoding can resume from the beginning of the buffer.
+ Note that already decoded data stored in the buffer should be flushed before being overwritten.
+
+ There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory.
+
+ Finally, if you control the compression process, you can also ignore all buffer size rules,
+ as long as the encoder and decoder progress in "lock-step",
+ aka use exactly the same buffer sizes, break contiguity at the same place, etc.
+
+  Once buffers are set up, start decompression with ZSTD_decompressBegin().
+ If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict().
+
+  Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
+
+ @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+ It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
+ It can also be an error code, which can be tested with ZSTD_isError().
+
+ A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+ Context can then be reset to start a new decompression.
+
+ Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType().
+ This information is not required to properly decode a frame.
+
+ == Special case : skippable frames ==
+
+ Skippable frames allow integration of user-defined data into a flow of concatenated frames.
+ Skippable frames will be ignored (skipped) by decompressor.
+ The format of skippable frames is as follows :
+ a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+ b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+ c) Frame Content - any content (User Data) of length equal to Frame Size
+ For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame.
+ For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content.
+*/
+
+/*===== Buffer-less streaming decompression functions =====*/
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */
+ unsigned blockSizeMax;
+ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+ unsigned headerSize;
+ unsigned dictID;
+ unsigned checksumFlag;
+} ZSTD_frameHeader;
+
+/*! ZSTD_getFrameHeader() :
+ * decode Frame Header, or requires larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ * or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */
+/*! ZSTD_getFrameHeader_advanced() :
+ * same as ZSTD_getFrameHeader(),
+ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
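+/* Illustrative sketch (not part of the upstream header) : probing the frame
+ * header and sizing a round buffer, per the discussion above. `src`/`srcSize`
+ * hold at least the beginning of the compressed frame.
+ *
+ *     ZSTD_frameHeader zfh;
+ *     size_t const hret = ZSTD_getFrameHeader(&zfh, src, srcSize);
+ *     if (hret != 0) { }   // >0 : need hret bytes ; error : ZSTD_isError(hret)
+ *     size_t const bufSize = ZSTD_decodingBufferSize_min(zfh.windowSize, zfh.frameContentSize);
+ *     if (ZSTD_isError(bufSize)) { }   // window too large for this system
+ */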
+
+ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+/* misc */
+ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
+ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
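+/* Illustrative sketch (not part of the upstream header) : minimal decode loop
+ * for one frame, after the header has been probed as sketched above, assuming
+ * the whole compressed frame is in memory and `dst` can hold the full output.
+ * ZSTD_isError() checks are elided for brevity.
+ *
+ *     ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *     ZSTD_decompressBegin(dctx);
+ *     size_t srcPos = 0, dstPos = 0, next;
+ *     while ((next = ZSTD_nextSrcSizeToDecompress(dctx)) != 0) {
+ *         size_t const out = ZSTD_decompressContinue(dctx,
+ *             (char*)dst + dstPos, dstCapacity - dstPos,
+ *             (const char*)src + srcPos, next);
+ *         srcPos += next;
+ *         dstPos += out;   // out may be 0 when a metadata item was decoded
+ *     }
+ *     ZSTD_freeDCtx(dctx);
+ */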
+
+
+
+
+/* ============================ */
+/** Block level API */
+/* ============================ */
+
+/*!
+ Block functions produce and decode raw zstd blocks, without frame metadata.
+ Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+  But users will have to take charge of the metadata needed to regenerate data, such as compressed and content sizes.
+
+ A few rules to respect :
+ - Compressing and decompressing require a context structure
+ + Use ZSTD_createCCtx() and ZSTD_createDCtx()
+ - It is necessary to init context before starting
+ + compression : any ZSTD_compressBegin*() variant, including with dictionary
+ + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
+ + copyCCtx() and copyDCtx() can be used too
+ - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
+ + If input is larger than a block size, it's necessary to split input data into multiple blocks
+ + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
+ Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block.
+ - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) !
+ ===> In which case, nothing is produced into `dst` !
+ + User __must__ test for such outcome and deal directly with uncompressed data
+      + A block cannot be declared incompressible if the ZSTD_compressBlock() return value was != 0.
+        Doing so would mess up the statistics history, leading to potential data corruption.
+ + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !!
+ + In case of multiple successive blocks, should some of them be uncompressed,
+ decoder must be informed of their existence in order to follow proper history.
+ Use ZSTD_insertBlock() for such a case.
+*/
+
+/*===== Raw zstd block functions =====*/
+ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
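+/* Illustrative sketch (not part of the upstream header) : compressing a single
+ * block and handling the incompressible case, which the raw block format
+ * cannot describe on its own (the caller must store and signal it itself).
+ *
+ *     ZSTD_compressBegin(cctx, 3);   // context init is mandatory, see above
+ *     size_t const cSize = ZSTD_compressBlock(cctx, dst, dstCapacity, src, srcSize);   // srcSize <= ZSTD_getBlockSize(cctx)
+ *     if (cSize == 0) {
+ *         // store `src` uncompressed; on the decoding side, pass it to
+ *         // ZSTD_insertBlock(dctx, src, srcSize) so history stays in sync
+ *     }
+ */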
+
+
+#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/sys/contrib/openzfs/module/zstd/lib/zstd_errors.h b/sys/contrib/openzfs/module/zstd/lib/zstd_errors.h
new file mode 100644
index 000000000000..998398e7e57f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/lib/zstd_errors.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*===== dependency =====*/
+#include <stddef.h> /* size_t */
+
+
+/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */
+#ifndef ZSTDERRORLIB_VISIBILITY
+# if defined(__GNUC__) && (__GNUC__ >= 4)
+# define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default")))
+# else
+# define ZSTDERRORLIB_VISIBILITY
+# endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+# define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows generating better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+# define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#endif
+
+/*-*********************************************
+ * Error codes list
+ *-*********************************************
+ * Error codes _values_ are pinned down since v1.3.1 only.
+ * Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ * Only values < 100 are considered stable.
+ *
+ * note 1 : this API shall be used with static linking only.
+ * dynamic linking is not yet officially supported.
+ * note 2 : Prefer relying on the enum rather than on its value whenever possible
+ * This is the only supported way to use the error list < v1.3.1
+ * note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/
+typedef enum {
+ ZSTD_error_no_error = 0,
+ ZSTD_error_GENERIC = 1,
+ ZSTD_error_prefix_unknown = 10,
+ ZSTD_error_version_unsupported = 12,
+ ZSTD_error_frameParameter_unsupported = 14,
+ ZSTD_error_frameParameter_windowTooLarge = 16,
+ ZSTD_error_corruption_detected = 20,
+ ZSTD_error_checksum_wrong = 22,
+ ZSTD_error_dictionary_corrupted = 30,
+ ZSTD_error_dictionary_wrong = 32,
+ ZSTD_error_dictionaryCreation_failed = 34,
+ ZSTD_error_parameter_unsupported = 40,
+ ZSTD_error_parameter_outOfBound = 42,
+ ZSTD_error_tableLog_tooLarge = 44,
+ ZSTD_error_maxSymbolValue_tooLarge = 46,
+ ZSTD_error_maxSymbolValue_tooSmall = 48,
+ ZSTD_error_stage_wrong = 60,
+ ZSTD_error_init_missing = 62,
+ ZSTD_error_memory_allocation = 64,
+ ZSTD_error_workSpace_tooSmall= 66,
+ ZSTD_error_dstSize_tooSmall = 70,
+ ZSTD_error_srcSize_wrong = 72,
+ ZSTD_error_dstBuffer_null = 74,
+ /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+ ZSTD_error_frameIndex_tooLarge = 100,
+ ZSTD_error_seekableIO = 102,
+ ZSTD_error_dstBuffer_wrong = 104,
+ ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+ convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+ which can be used to compare with enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
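+/* Illustrative sketch (not part of this header) : translating a failed call's
+ * return value into the stable enum and a printable message. Assumes zstd.h
+ * is also included, for ZSTD_compress() and ZSTD_isError().
+ *
+ *     size_t const r = ZSTD_compress(dst, dstCapacity, src, srcSize, 3);
+ *     if (ZSTD_isError(r)) {
+ *         ZSTD_ErrorCode const ec = ZSTD_getErrorCode(r);
+ *         if (ec == ZSTD_error_dstSize_tooSmall) { }   // e.g. retry with ZSTD_compressBound(srcSize)
+ *         printf("zstd error: %s\n", ZSTD_getErrorString(ec));
+ *     }
+ */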
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_ERRORS_H_398273423 */
diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c
new file mode 100644
index 000000000000..69ebf252d1ba
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c
@@ -0,0 +1,780 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2016-2018, Klara Inc.
+ * Copyright (c) 2016-2018, Allan Jude
+ * Copyright (c) 2018-2020, Sebastian Gottschall
+ * Copyright (c) 2019-2020, Michael Niewöhner
+ * Copyright (c) 2020, The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/zfs_context.h>
+#include <sys/zio_compress.h>
+#include <sys/spa.h>
+#include <sys/zstd/zstd.h>
+
+#define ZSTD_STATIC_LINKING_ONLY
+#include "lib/zstd.h"
+#include "lib/zstd_errors.h"
+
+kstat_t *zstd_ksp = NULL;
+
+typedef struct zstd_stats {
+ kstat_named_t zstd_stat_alloc_fail;
+ kstat_named_t zstd_stat_alloc_fallback;
+ kstat_named_t zstd_stat_com_alloc_fail;
+ kstat_named_t zstd_stat_dec_alloc_fail;
+ kstat_named_t zstd_stat_com_inval;
+ kstat_named_t zstd_stat_dec_inval;
+ kstat_named_t zstd_stat_dec_header_inval;
+ kstat_named_t zstd_stat_com_fail;
+ kstat_named_t zstd_stat_dec_fail;
+ kstat_named_t zstd_stat_buffers;
+ kstat_named_t zstd_stat_size;
+} zstd_stats_t;
+
+static zstd_stats_t zstd_stats = {
+ { "alloc_fail", KSTAT_DATA_UINT64 },
+ { "alloc_fallback", KSTAT_DATA_UINT64 },
+ { "compress_alloc_fail", KSTAT_DATA_UINT64 },
+ { "decompress_alloc_fail", KSTAT_DATA_UINT64 },
+ { "compress_level_invalid", KSTAT_DATA_UINT64 },
+ { "decompress_level_invalid", KSTAT_DATA_UINT64 },
+ { "decompress_header_invalid", KSTAT_DATA_UINT64 },
+ { "compress_failed", KSTAT_DATA_UINT64 },
+ { "decompress_failed", KSTAT_DATA_UINT64 },
+ { "buffers", KSTAT_DATA_UINT64 },
+ { "size", KSTAT_DATA_UINT64 },
+};
+
+/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
+enum zstd_kmem_type {
+ ZSTD_KMEM_UNKNOWN = 0,
+ /* Allocation type using kmem_vmalloc */
+ ZSTD_KMEM_DEFAULT,
+ /* Pool based allocation using mempool_alloc */
+ ZSTD_KMEM_POOL,
+ /* Reserved fallback memory for decompression only */
+ ZSTD_KMEM_DCTX,
+ ZSTD_KMEM_COUNT,
+};
+
+/* Structure for pooled memory objects */
+struct zstd_pool {
+ void *mem;
+ size_t size;
+ kmutex_t barrier;
+ hrtime_t timeout;
+};
+
+/* Global structure for handling memory allocations */
+struct zstd_kmem {
+ enum zstd_kmem_type kmem_type;
+ size_t kmem_size;
+ struct zstd_pool *pool;
+};
+
+/* Fallback memory structure used for decompression only if memory runs out */
+struct zstd_fallback_mem {
+ size_t mem_size;
+ void *mem;
+ kmutex_t barrier;
+};
+
+struct zstd_levelmap {
+ int16_t zstd_level;
+ enum zio_zstd_levels level;
+};
+
+/*
+ * ZSTD memory handlers
+ *
+ * For decompression we use a different handler which also provides fallback
+ * memory allocation in case memory runs out.
+ *
+ * The ZSTD handlers were split up for the most simplified implementation.
+ */
+static void *zstd_alloc(void *opaque, size_t size);
+static void *zstd_dctx_alloc(void *opaque, size_t size);
+static void zstd_free(void *opaque, void *ptr);
+
+/* Compression memory handler */
+static const ZSTD_customMem zstd_malloc = {
+ zstd_alloc,
+ zstd_free,
+ NULL,
+};
+
+/* Decompression memory handler */
+static const ZSTD_customMem zstd_dctx_malloc = {
+ zstd_dctx_alloc,
+ zstd_free,
+ NULL,
+};
+
+/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
+static struct zstd_levelmap zstd_levels[] = {
+ {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
+ {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
+ {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
+ {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
+ {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
+ {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
+ {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
+ {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
+ {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
+ {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
+ {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
+ {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
+ {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
+ {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
+ {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
+ {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
+ {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
+ {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
+ {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
+ {-1, ZIO_ZSTD_LEVEL_FAST_1},
+ {-2, ZIO_ZSTD_LEVEL_FAST_2},
+ {-3, ZIO_ZSTD_LEVEL_FAST_3},
+ {-4, ZIO_ZSTD_LEVEL_FAST_4},
+ {-5, ZIO_ZSTD_LEVEL_FAST_5},
+ {-6, ZIO_ZSTD_LEVEL_FAST_6},
+ {-7, ZIO_ZSTD_LEVEL_FAST_7},
+ {-8, ZIO_ZSTD_LEVEL_FAST_8},
+ {-9, ZIO_ZSTD_LEVEL_FAST_9},
+ {-10, ZIO_ZSTD_LEVEL_FAST_10},
+ {-20, ZIO_ZSTD_LEVEL_FAST_20},
+ {-30, ZIO_ZSTD_LEVEL_FAST_30},
+ {-40, ZIO_ZSTD_LEVEL_FAST_40},
+ {-50, ZIO_ZSTD_LEVEL_FAST_50},
+ {-60, ZIO_ZSTD_LEVEL_FAST_60},
+ {-70, ZIO_ZSTD_LEVEL_FAST_70},
+ {-80, ZIO_ZSTD_LEVEL_FAST_80},
+ {-90, ZIO_ZSTD_LEVEL_FAST_90},
+ {-100, ZIO_ZSTD_LEVEL_FAST_100},
+ {-500, ZIO_ZSTD_LEVEL_FAST_500},
+ {-1000, ZIO_ZSTD_LEVEL_FAST_1000},
+};
+
+/*
+ * This variable represents the maximum number of pool slots, based on the
+ * number of CPUs plus some headroom. We default to cpu count * 4; see zstd_init().
+ */
+static int pool_count = 16;
+
+#define ZSTD_POOL_MAX pool_count
+#define ZSTD_POOL_TIMEOUT 60 * 2
+
+static struct zstd_fallback_mem zstd_dctx_fallback;
+static struct zstd_pool *zstd_mempool_cctx;
+static struct zstd_pool *zstd_mempool_dctx;
+
+
+static void
+zstd_mempool_reap(struct zstd_pool *zstd_mempool)
+{
+ struct zstd_pool *pool;
+
+ if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
+ return;
+ }
+
+ /* free obsolete slots */
+ for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+ pool = &zstd_mempool[i];
+ if (pool->mem && mutex_tryenter(&pool->barrier)) {
+ /* Free memory if unused object older than 2 minutes */
+ if (pool->mem && gethrestime_sec() > pool->timeout) {
+ vmem_free(pool->mem, pool->size);
+ ZSTDSTAT_SUB(zstd_stat_buffers, 1);
+ ZSTDSTAT_SUB(zstd_stat_size, pool->size);
+ pool->mem = NULL;
+ pool->size = 0;
+ pool->timeout = 0;
+ }
+ mutex_exit(&pool->barrier);
+ }
+ }
+}
+
+/*
+ * Try to get a cached allocated buffer from memory pool or allocate a new one
+ * if necessary. If an object is older than 2 minutes and does not fit the
+ * requested size, it will be released and a new cached entry will be allocated.
+ * If other pooled objects are detected without being used for 2 minutes, they
+ * will be released, too.
+ *
+ * The concept is that high frequency memory allocations of bigger objects are
+ * expensive. So if a lot of work is going on, allocations will be kept for a
+ * while and can be reused in that time frame.
+ *
+ * The scheduled release will be updated every time an object is reused.
+ */
+
+static void *
+zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
+{
+ struct zstd_pool *pool;
+ struct zstd_kmem *mem = NULL;
+
+ if (!zstd_mempool) {
+ return (NULL);
+ }
+
+ /* Seek for preallocated memory slot and free obsolete slots */
+ for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+ pool = &zstd_mempool[i];
+ /*
+		 * This lock is simply a marker for a pool object being in use.
+		 * If it's already held, it will be skipped.
+ *
+ * We need to create it before checking it to avoid race
+ * conditions caused by running in a threaded context.
+ *
+ * The lock is later released by zstd_mempool_free.
+ */
+ if (mutex_tryenter(&pool->barrier)) {
+ /*
+			 * Check if the object fits the size; if so, we take it and
+ * update the timestamp.
+ */
+ if (pool->mem && size <= pool->size) {
+ pool->timeout = gethrestime_sec() +
+ ZSTD_POOL_TIMEOUT;
+ mem = pool->mem;
+ return (mem);
+ }
+ mutex_exit(&pool->barrier);
+ }
+ }
+
+ /*
+ * If no preallocated slot was found, try to fill in a new one.
+ *
+ * We run a similar algorithm twice here to avoid pool fragmentation.
+ * The first one may generate holes in the list if objects get released.
+ * We always make sure that these holes get filled instead of adding new
+ * allocations constantly at the end.
+ */
+ for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+ pool = &zstd_mempool[i];
+ if (mutex_tryenter(&pool->barrier)) {
+ /* Object is free, try to allocate new one */
+ if (!pool->mem) {
+ mem = vmem_alloc(size, KM_SLEEP);
+ if (mem) {
+ ZSTDSTAT_ADD(zstd_stat_buffers, 1);
+ ZSTDSTAT_ADD(zstd_stat_size, size);
+ pool->mem = mem;
+ pool->size = size;
+ /* Keep track for later release */
+ mem->pool = pool;
+ mem->kmem_type = ZSTD_KMEM_POOL;
+ mem->kmem_size = size;
+ }
+ }
+
+ if (size <= pool->size) {
+ /* Update timestamp */
+ pool->timeout = gethrestime_sec() +
+ ZSTD_POOL_TIMEOUT;
+
+ return (pool->mem);
+ }
+
+ mutex_exit(&pool->barrier);
+ }
+ }
+
+ /*
+ * If the pool is full or the allocation failed, try lazy allocation
+ * instead.
+ */
+ if (!mem) {
+ mem = vmem_alloc(size, KM_NOSLEEP);
+ if (mem) {
+ mem->pool = NULL;
+ mem->kmem_type = ZSTD_KMEM_DEFAULT;
+ mem->kmem_size = size;
+ }
+ }
+
+ return (mem);
+}
+
+/* Mark object as released by releasing the barrier mutex */
+static void
+zstd_mempool_free(struct zstd_kmem *z)
+{
+ mutex_exit(&z->pool->barrier);
+}
+
+/* Convert ZFS internal enum to ZSTD level */
+static int
+zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
+{
+ if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
+ *zstd_level = zstd_levels[level - 1].zstd_level;
+ return (0);
+ }
+ if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
+ level <= ZIO_ZSTD_LEVEL_FAST_1000) {
+ *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
+ + ZIO_ZSTD_LEVEL_19].zstd_level;
+ return (0);
+ }
+
+ /* Invalid/unknown zfs compression enum - this should never happen. */
+ return (1);
+}
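+
+/*
+ * Example (illustration only, assuming ZIO_ZSTD_LEVEL_1..ZIO_ZSTD_LEVEL_19
+ * are numbered 1..19, which the `level - 1` indexing above relies on):
+ * zstd_enum_to_level(ZIO_ZSTD_LEVEL_7, &l) sets l = 7 and returns 0,
+ * zstd_enum_to_level(ZIO_ZSTD_LEVEL_FAST_100, &l) sets l = -100 and returns 0,
+ * and any other enum value returns 1 and leaves *zstd_level untouched.
+ */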
+
+/* Compress block using zstd */
+size_t
+zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
+ int level)
+{
+ size_t c_len;
+ int16_t zstd_level;
+ zfs_zstdhdr_t *hdr;
+ ZSTD_CCtx *cctx;
+
+ hdr = (zfs_zstdhdr_t *)d_start;
+
+ /* Skip compression if the specified level is invalid */
+ if (zstd_enum_to_level(level, &zstd_level)) {
+ ZSTDSTAT_BUMP(zstd_stat_com_inval);
+ return (s_len);
+ }
+
+ ASSERT3U(d_len, >=, sizeof (*hdr));
+ ASSERT3U(d_len, <=, s_len);
+ ASSERT3U(zstd_level, !=, 0);
+
+ cctx = ZSTD_createCCtx_advanced(zstd_malloc);
+
+ /*
+ * Out of kernel memory, gently fall through - this will disable
+ * compression in zio_compress_data
+ */
+ if (!cctx) {
+ ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
+ return (s_len);
+ }
+
+ /* Set the compression level */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);
+
+ /* Use the "magicless" zstd header which saves us 4 header bytes */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
+
+ /*
+ * Disable redundant checksum calculation and content size storage since
+ * this is already done by ZFS itself.
+ */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
+
+ c_len = ZSTD_compress2(cctx,
+ hdr->data,
+ d_len - sizeof (*hdr),
+ s_start, s_len);
+
+ ZSTD_freeCCtx(cctx);
+
+ /* Error in the compression routine, disable compression. */
+ if (ZSTD_isError(c_len)) {
+ /*
+		 * If we are aborting the compression because the savings are
+ * too small, that is not a failure. Everything else is a
+ * failure, so increment the compression failure counter.
+ */
+ if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) {
+ ZSTDSTAT_BUMP(zstd_stat_com_fail);
+ }
+ return (s_len);
+ }
+
+ /*
+ * Encode the compressed buffer size at the start. We'll need this in
+ * decompression to counter the effects of padding which might be added
+ * to the compressed buffer and which, if unhandled, would confuse the
+ * hell out of our decompression function.
+ */
+ hdr->c_len = BE_32(c_len);
+
+ /*
+ * Check version for overflow.
+ * The limit of 24 bits must not be exceeded. This allows a maximum
+	 * version 1677.72.15, which we don't expect to ever be reached.
+ */
+ ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);
+
+ /*
+ * Encode the compression level as well. We may need to know the
+ * original compression level if compressed_arc is disabled, to match
+ * the compression settings to write this block to the L2ARC.
+ *
+ * Encode the actual level, so if the enum changes in the future, we
+ * will be compatible.
+ *
+ * The upper 24 bits store the ZSTD version to be able to provide
+ * future compatibility, since new versions might enhance the
+ * compression algorithm in a way, where the compressed data will
+ * change.
+ *
+ * As soon as such incompatibility occurs, handling code needs to be
+ * added, differentiating between the versions.
+ */
+ hdr->version = ZSTD_VERSION_NUMBER;
+ hdr->level = level;
+ hdr->raw_version_level = BE_32(hdr->raw_version_level);
+
+ return (c_len + sizeof (*hdr));
+}
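+
+/*
+ * Illustrative sketch (hypothetical caller, not part of this file): a return
+ * value equal to s_len means compression was skipped or failed and the block
+ * should be stored uncompressed; anything smaller is header plus payload.
+ *
+ *	size_t psize = zfs_zstd_compress(src, dst, size, size, ZIO_ZSTD_LEVEL_3);
+ *	boolean_t stored_compressed = (psize < size);
+ */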
+
+/* Decompress block using zstd and return its stored level */
+int
+zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
+ size_t d_len, uint8_t *level)
+{
+ ZSTD_DCtx *dctx;
+ size_t result;
+ int16_t zstd_level;
+ uint32_t c_len;
+ const zfs_zstdhdr_t *hdr;
+ zfs_zstdhdr_t hdr_copy;
+
+ hdr = (const zfs_zstdhdr_t *)s_start;
+ c_len = BE_32(hdr->c_len);
+
+ /*
+ * Make a copy instead of directly converting the header, since we must
+ * not modify the original data that may be used again later.
+ */
+ hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
+
+ /*
+ * NOTE: We ignore the ZSTD version for now. As soon as any
+	 * incompatibility occurs, it has to be handled accordingly.
+ * The version can be accessed via `hdr_copy.version`.
+ */
+
+ /*
+ * Convert and check the level
+ * An invalid level is a strong indicator for data corruption! In such
+ * case return an error so the upper layers can try to fix it.
+ */
+ if (zstd_enum_to_level(hdr_copy.level, &zstd_level)) {
+ ZSTDSTAT_BUMP(zstd_stat_dec_inval);
+ return (1);
+ }
+
+ ASSERT3U(d_len, >=, s_len);
+ ASSERT3U(hdr_copy.level, !=, ZIO_COMPLEVEL_INHERIT);
+
+ /* Invalid compressed buffer size encoded at start */
+ if (c_len + sizeof (*hdr) > s_len) {
+ ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
+ return (1);
+ }
+
+ dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
+ if (!dctx) {
+ ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
+ return (1);
+ }
+
+ /* Set header type to "magicless" */
+ ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
+
+ /* Decompress the data and release the context */
+ result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
+ ZSTD_freeDCtx(dctx);
+
+ /*
+	 * Return 0 on success (the decompression succeeded) and non-zero
+	 * on failure (ZSTD_isError() reported an error).
+ */
+ if (ZSTD_isError(result)) {
+ ZSTDSTAT_BUMP(zstd_stat_dec_fail);
+ return (1);
+ }
+
+ if (level) {
+ *level = hdr_copy.level;
+ }
+
+ return (0);
+}
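+
+/*
+ * Illustrative sketch (hypothetical caller, not part of this file): recovering
+ * the level a block was compressed with, e.g. to match its settings when
+ * rewriting it:
+ *
+ *	uint8_t level;
+ *	if (zfs_zstd_decompress_level(cbuf, dbuf, c_size, d_size, &level) == 0)
+ *		; /* `level` now holds the original zio_zstd_levels value */
+ */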
+
+/* Decompress datablock using zstd */
+int
+zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
+ int level __maybe_unused)
+{
+
+ return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
+ NULL));
+}
+
+/* Allocator for zstd compression context using mempool_allocator */
+static void *
+zstd_alloc(void *opaque __maybe_unused, size_t size)
+{
+ size_t nbytes = sizeof (struct zstd_kmem) + size;
+ struct zstd_kmem *z = NULL;
+
+ z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);
+
+ if (!z) {
+ ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
+ return (NULL);
+ }
+
+ return ((void*)z + (sizeof (struct zstd_kmem)));
+}
+
+/*
+ * Allocator for zstd decompression context using mempool_allocator with
+ * fallback to reserved memory if allocation fails
+ */
+static void *
+zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
+{
+ size_t nbytes = sizeof (struct zstd_kmem) + size;
+ struct zstd_kmem *z = NULL;
+ enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;
+
+ z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
+ if (!z) {
+ /* Try harder, decompression shall not fail */
+ z = vmem_alloc(nbytes, KM_SLEEP);
+ if (z) {
+ z->pool = NULL;
+ }
+ ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
+ } else {
+ return ((void*)z + (sizeof (struct zstd_kmem)));
+ }
+
+ /* Fallback if everything fails */
+ if (!z) {
+ /*
+		 * Barrier since we can only handle it in a single thread. All
+ * other following threads need to wait here until decompression
+ * is completed. zstd_free will release this barrier later.
+ */
+ mutex_enter(&zstd_dctx_fallback.barrier);
+
+ z = zstd_dctx_fallback.mem;
+ type = ZSTD_KMEM_DCTX;
+ ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
+ }
+
+ /* Allocation should always be successful */
+ if (!z) {
+ return (NULL);
+ }
+
+ z->kmem_type = type;
+ z->kmem_size = nbytes;
+
+ return ((void*)z + (sizeof (struct zstd_kmem)));
+}
+
+/* Free allocated memory by its specific type */
+static void
+zstd_free(void *opaque __maybe_unused, void *ptr)
+{
+ struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
+ enum zstd_kmem_type type;
+
+ ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
+ ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);
+
+ type = z->kmem_type;
+ switch (type) {
+ case ZSTD_KMEM_DEFAULT:
+ vmem_free(z, z->kmem_size);
+ break;
+ case ZSTD_KMEM_POOL:
+ zstd_mempool_free(z);
+ break;
+ case ZSTD_KMEM_DCTX:
+ mutex_exit(&zstd_dctx_fallback.barrier);
+ break;
+ default:
+ break;
+ }
+}
+
+/* Allocate fallback memory to ensure safe decompression */
+static void __init
+create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
+{
+ mem->mem_size = size;
+ mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
+ mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/* Initialize memory pool barrier mutexes */
+static void __init
+zstd_mempool_init(void)
+{
+ zstd_mempool_cctx = (struct zstd_pool *)
+ kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
+ zstd_mempool_dctx = (struct zstd_pool *)
+ kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
+
+ for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+ mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
+ MUTEX_DEFAULT, NULL);
+ mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
+}
+
+/* Initialize zstd-related memory handling */
+static int __init
+zstd_meminit(void)
+{
+ zstd_mempool_init();
+
+ /*
+ * Estimate the size of the fallback decompression context.
+ * The expected size on x64 with current ZSTD should be about 160 KB.
+ */
+ create_fallback_mem(&zstd_dctx_fallback,
+ P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
+ PAGESIZE));
+
+ return (0);
+}
+
+/* Release object from pool and free memory */
+static void __exit
+release_pool(struct zstd_pool *pool)
+{
+ mutex_destroy(&pool->barrier);
+ vmem_free(pool->mem, pool->size);
+ pool->mem = NULL;
+ pool->size = 0;
+}
+
+/* Release memory pool objects */
+static void __exit
+zstd_mempool_deinit(void)
+{
+ for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+ release_pool(&zstd_mempool_cctx[i]);
+ release_pool(&zstd_mempool_dctx[i]);
+ }
+
+ kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
+ kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
+ zstd_mempool_dctx = NULL;
+ zstd_mempool_cctx = NULL;
+}
+
+/* Release unused memory from the pools */
+
+void
+zfs_zstd_cache_reap_now(void)
+{
+
+ /*
+ * Short-circuit if there are no buffers to begin with.
+ */
+ if (ZSTDSTAT(zstd_stat_buffers) == 0)
+ return;
+
+ /*
+	 * zstd_mempool_reap() scans each pool and releases objects
+	 * that have sat unused past their timeout
+ */
+ zstd_mempool_reap(zstd_mempool_cctx);
+ zstd_mempool_reap(zstd_mempool_dctx);
+}
+
+extern int __init
+zstd_init(void)
+{
+ /* Set pool size by using maximum sane thread count * 4 */
+ pool_count = (boot_ncpus * 4);
+ zstd_meminit();
+
+ /* Initialize kstat */
+ zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
+ KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (zstd_ksp != NULL) {
+ zstd_ksp->ks_data = &zstd_stats;
+ kstat_install(zstd_ksp);
+ }
+
+ return (0);
+}
+
+extern void __exit
+zstd_fini(void)
+{
+ /* Deinitialize kstat */
+ if (zstd_ksp != NULL) {
+ kstat_delete(zstd_ksp);
+ zstd_ksp = NULL;
+ }
+
+ /* Release fallback memory */
+ vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
+ mutex_destroy(&zstd_dctx_fallback.barrier);
+
+ /* Deinit memory pool */
+ zstd_mempool_deinit();
+}
+
+#if defined(_KERNEL)
+module_init(zstd_init);
+module_exit(zstd_fini);
+
+ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS");
+ZFS_MODULE_LICENSE("Dual BSD/GPL");
+ZFS_MODULE_VERSION(ZSTD_VERSION_STRING);
+
+EXPORT_SYMBOL(zfs_zstd_compress);
+EXPORT_SYMBOL(zfs_zstd_decompress_level);
+EXPORT_SYMBOL(zfs_zstd_decompress);
+EXPORT_SYMBOL(zfs_zstd_cache_reap_now);
+#endif
diff --git a/sys/contrib/openzfs/module/zstd/zstd-in.c b/sys/contrib/openzfs/module/zstd/zstd-in.c
new file mode 100644
index 000000000000..121f375e5515
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/zstd-in.c
@@ -0,0 +1,68 @@
+/*
+ * BSD 3-Clause Clear License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) 2019-2020, Michael Niewöhner
+ */
+
+#define MEM_MODULE
+#define XXH_NAMESPACE ZSTD_
+#define XXH_PRIVATE_API
+#define XXH_INLINE_ALL
+#define ZSTD_LEGACY_SUPPORT 0
+#define ZSTD_LIB_DICTBUILDER 0
+#define ZSTD_LIB_DEPRECATED 0
+#define ZSTD_NOBENCH
+
+#include "common/debug.c"
+#include "common/entropy_common.c"
+#include "common/error_private.c"
+#include "common/fse_decompress.c"
+#include "common/pool.c"
+#include "common/zstd_common.c"
+
+#include "compress/fse_compress.c"
+#include "compress/hist.c"
+#include "compress/huf_compress.c"
+#include "compress/zstd_compress_literals.c"
+#include "compress/zstd_compress_sequences.c"
+#include "compress/zstd_compress_superblock.c"
+#include "compress/zstd_compress.c"
+#include "compress/zstd_double_fast.c"
+#include "compress/zstd_fast.c"
+#include "compress/zstd_lazy.c"
+#include "compress/zstd_ldm.c"
+#include "compress/zstd_opt.c"
+
+#include "decompress/huf_decompress.c"
+#include "decompress/zstd_ddict.c"
+#include "decompress/zstd_decompress.c"
+#include "decompress/zstd_decompress_block.c"